diff --git "a/checkpoint-9000/trainer_state.json" "b/checkpoint-9000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-9000/trainer_state.json" @@ -0,0 +1,63033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5016722408026756, + "eval_steps": 500, + "global_step": 9000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.574136008918618e-05, + "grad_norm": 0.8008168339729309, + "learning_rate": 8e-05, + "loss": 2.3659, + "step": 1 + }, + { + "epoch": 0.00011148272017837236, + "grad_norm": 0.4908035099506378, + "learning_rate": 8e-05, + "loss": 1.7515, + "step": 2 + }, + { + "epoch": 0.00016722408026755852, + "grad_norm": 0.5122831463813782, + "learning_rate": 8e-05, + "loss": 1.998, + "step": 3 + }, + { + "epoch": 0.0002229654403567447, + "grad_norm": 0.47656959295272827, + "learning_rate": 8e-05, + "loss": 1.7709, + "step": 4 + }, + { + "epoch": 0.0002787068004459309, + "grad_norm": 0.47274479269981384, + "learning_rate": 8e-05, + "loss": 1.9645, + "step": 5 + }, + { + "epoch": 0.00033444816053511704, + "grad_norm": 0.42502275109291077, + "learning_rate": 8e-05, + "loss": 1.8408, + "step": 6 + }, + { + "epoch": 0.00039018952062430326, + "grad_norm": 0.4569794535636902, + "learning_rate": 8e-05, + "loss": 1.9836, + "step": 7 + }, + { + "epoch": 0.0004459308807134894, + "grad_norm": 0.43709850311279297, + "learning_rate": 8e-05, + "loss": 1.6727, + "step": 8 + }, + { + "epoch": 0.0005016722408026755, + "grad_norm": 0.5758728384971619, + "learning_rate": 8e-05, + "loss": 2.1397, + "step": 9 + }, + { + "epoch": 0.0005574136008918618, + "grad_norm": 0.5145262479782104, + "learning_rate": 8e-05, + "loss": 1.8755, + "step": 10 + }, + { + "epoch": 0.000613154960981048, + "grad_norm": 0.46036529541015625, + "learning_rate": 8e-05, + "loss": 1.8653, + "step": 11 + }, + { + "epoch": 0.0006688963210702341, + "grad_norm": 0.45718926191329956, + 
"learning_rate": 8e-05, + "loss": 1.7391, + "step": 12 + }, + { + "epoch": 0.0007246376811594203, + "grad_norm": 0.45198720693588257, + "learning_rate": 8e-05, + "loss": 1.8477, + "step": 13 + }, + { + "epoch": 0.0007803790412486065, + "grad_norm": 0.4350980222225189, + "learning_rate": 8e-05, + "loss": 1.8566, + "step": 14 + }, + { + "epoch": 0.0008361204013377926, + "grad_norm": 0.5104784369468689, + "learning_rate": 8e-05, + "loss": 1.8545, + "step": 15 + }, + { + "epoch": 0.0008918617614269788, + "grad_norm": 0.4408394992351532, + "learning_rate": 8e-05, + "loss": 1.9149, + "step": 16 + }, + { + "epoch": 0.000947603121516165, + "grad_norm": 0.45479220151901245, + "learning_rate": 8e-05, + "loss": 2.0529, + "step": 17 + }, + { + "epoch": 0.001003344481605351, + "grad_norm": 0.5046110153198242, + "learning_rate": 8e-05, + "loss": 1.8707, + "step": 18 + }, + { + "epoch": 0.0010590858416945374, + "grad_norm": 0.44725045561790466, + "learning_rate": 8e-05, + "loss": 1.7574, + "step": 19 + }, + { + "epoch": 0.0011148272017837235, + "grad_norm": 0.4346023499965668, + "learning_rate": 8e-05, + "loss": 1.8512, + "step": 20 + }, + { + "epoch": 0.0011705685618729096, + "grad_norm": 0.40324667096138, + "learning_rate": 8e-05, + "loss": 1.7462, + "step": 21 + }, + { + "epoch": 0.001226309921962096, + "grad_norm": 0.47196584939956665, + "learning_rate": 8e-05, + "loss": 1.9381, + "step": 22 + }, + { + "epoch": 0.001282051282051282, + "grad_norm": 0.4341256022453308, + "learning_rate": 8e-05, + "loss": 1.7059, + "step": 23 + }, + { + "epoch": 0.0013377926421404682, + "grad_norm": 0.4710449278354645, + "learning_rate": 8e-05, + "loss": 2.167, + "step": 24 + }, + { + "epoch": 0.0013935340022296545, + "grad_norm": 0.4591739773750305, + "learning_rate": 8e-05, + "loss": 2.0049, + "step": 25 + }, + { + "epoch": 0.0014492753623188406, + "grad_norm": 0.4278070330619812, + "learning_rate": 8e-05, + "loss": 1.6577, + "step": 26 + }, + { + "epoch": 0.0015050167224080267, + "grad_norm": 
0.4664648771286011, + "learning_rate": 8e-05, + "loss": 2.1976, + "step": 27 + }, + { + "epoch": 0.001560758082497213, + "grad_norm": 0.44260603189468384, + "learning_rate": 8e-05, + "loss": 1.9631, + "step": 28 + }, + { + "epoch": 0.0016164994425863991, + "grad_norm": 0.4564704895019531, + "learning_rate": 8e-05, + "loss": 2.19, + "step": 29 + }, + { + "epoch": 0.0016722408026755853, + "grad_norm": 0.4133101999759674, + "learning_rate": 8e-05, + "loss": 1.6704, + "step": 30 + }, + { + "epoch": 0.0017279821627647714, + "grad_norm": 0.46999362111091614, + "learning_rate": 8e-05, + "loss": 1.8143, + "step": 31 + }, + { + "epoch": 0.0017837235228539577, + "grad_norm": 0.4743380546569824, + "learning_rate": 8e-05, + "loss": 1.9599, + "step": 32 + }, + { + "epoch": 0.0018394648829431438, + "grad_norm": 0.4114800691604614, + "learning_rate": 8e-05, + "loss": 1.7434, + "step": 33 + }, + { + "epoch": 0.00189520624303233, + "grad_norm": 0.4391994774341583, + "learning_rate": 8e-05, + "loss": 1.6625, + "step": 34 + }, + { + "epoch": 0.0019509476031215162, + "grad_norm": 0.41881778836250305, + "learning_rate": 8e-05, + "loss": 1.8337, + "step": 35 + }, + { + "epoch": 0.002006688963210702, + "grad_norm": 0.40023884177207947, + "learning_rate": 8e-05, + "loss": 1.8276, + "step": 36 + }, + { + "epoch": 0.0020624303232998887, + "grad_norm": 0.4486073851585388, + "learning_rate": 8e-05, + "loss": 2.1585, + "step": 37 + }, + { + "epoch": 0.002118171683389075, + "grad_norm": 0.442481130361557, + "learning_rate": 8e-05, + "loss": 1.8827, + "step": 38 + }, + { + "epoch": 0.002173913043478261, + "grad_norm": 0.41534623503685, + "learning_rate": 8e-05, + "loss": 2.1137, + "step": 39 + }, + { + "epoch": 0.002229654403567447, + "grad_norm": 0.40957891941070557, + "learning_rate": 8e-05, + "loss": 1.8755, + "step": 40 + }, + { + "epoch": 0.002285395763656633, + "grad_norm": 0.42762520909309387, + "learning_rate": 8e-05, + "loss": 1.9187, + "step": 41 + }, + { + "epoch": 
0.0023411371237458192, + "grad_norm": 0.43732979893684387, + "learning_rate": 8e-05, + "loss": 1.8745, + "step": 42 + }, + { + "epoch": 0.0023968784838350058, + "grad_norm": 0.42588138580322266, + "learning_rate": 8e-05, + "loss": 1.79, + "step": 43 + }, + { + "epoch": 0.002452619843924192, + "grad_norm": 0.4093983769416809, + "learning_rate": 8e-05, + "loss": 1.7689, + "step": 44 + }, + { + "epoch": 0.002508361204013378, + "grad_norm": 0.39260199666023254, + "learning_rate": 8e-05, + "loss": 1.8451, + "step": 45 + }, + { + "epoch": 0.002564102564102564, + "grad_norm": 0.3898474872112274, + "learning_rate": 8e-05, + "loss": 1.6764, + "step": 46 + }, + { + "epoch": 0.00261984392419175, + "grad_norm": 0.4310983121395111, + "learning_rate": 8e-05, + "loss": 1.8254, + "step": 47 + }, + { + "epoch": 0.0026755852842809363, + "grad_norm": 0.45289134979248047, + "learning_rate": 8e-05, + "loss": 2.0347, + "step": 48 + }, + { + "epoch": 0.0027313266443701224, + "grad_norm": 0.3983175754547119, + "learning_rate": 8e-05, + "loss": 1.5513, + "step": 49 + }, + { + "epoch": 0.002787068004459309, + "grad_norm": 0.3877411186695099, + "learning_rate": 8e-05, + "loss": 1.7061, + "step": 50 + }, + { + "epoch": 0.002842809364548495, + "grad_norm": 0.4502890706062317, + "learning_rate": 8e-05, + "loss": 2.0879, + "step": 51 + }, + { + "epoch": 0.002898550724637681, + "grad_norm": 0.3804650902748108, + "learning_rate": 8e-05, + "loss": 1.5597, + "step": 52 + }, + { + "epoch": 0.0029542920847268673, + "grad_norm": 0.4189783036708832, + "learning_rate": 8e-05, + "loss": 2.0595, + "step": 53 + }, + { + "epoch": 0.0030100334448160534, + "grad_norm": 0.38596734404563904, + "learning_rate": 8e-05, + "loss": 1.8491, + "step": 54 + }, + { + "epoch": 0.0030657748049052395, + "grad_norm": 0.45324382185935974, + "learning_rate": 8e-05, + "loss": 2.12, + "step": 55 + }, + { + "epoch": 0.003121516164994426, + "grad_norm": 0.42319026589393616, + "learning_rate": 8e-05, + "loss": 1.9806, + "step": 56 
+ }, + { + "epoch": 0.003177257525083612, + "grad_norm": 0.3817268908023834, + "learning_rate": 8e-05, + "loss": 1.7646, + "step": 57 + }, + { + "epoch": 0.0032329988851727983, + "grad_norm": 0.4332798421382904, + "learning_rate": 8e-05, + "loss": 1.9969, + "step": 58 + }, + { + "epoch": 0.0032887402452619844, + "grad_norm": 0.43772000074386597, + "learning_rate": 8e-05, + "loss": 1.8521, + "step": 59 + }, + { + "epoch": 0.0033444816053511705, + "grad_norm": 0.4649338722229004, + "learning_rate": 8e-05, + "loss": 1.832, + "step": 60 + }, + { + "epoch": 0.0034002229654403566, + "grad_norm": 0.3952140808105469, + "learning_rate": 8e-05, + "loss": 1.8608, + "step": 61 + }, + { + "epoch": 0.0034559643255295427, + "grad_norm": 0.4187444746494293, + "learning_rate": 8e-05, + "loss": 1.8068, + "step": 62 + }, + { + "epoch": 0.0035117056856187293, + "grad_norm": 0.39462578296661377, + "learning_rate": 8e-05, + "loss": 1.7132, + "step": 63 + }, + { + "epoch": 0.0035674470457079154, + "grad_norm": 0.4504048526287079, + "learning_rate": 8e-05, + "loss": 2.0016, + "step": 64 + }, + { + "epoch": 0.0036231884057971015, + "grad_norm": 0.40093812346458435, + "learning_rate": 8e-05, + "loss": 1.7959, + "step": 65 + }, + { + "epoch": 0.0036789297658862876, + "grad_norm": 0.454746812582016, + "learning_rate": 8e-05, + "loss": 1.7046, + "step": 66 + }, + { + "epoch": 0.0037346711259754737, + "grad_norm": 0.3936002254486084, + "learning_rate": 8e-05, + "loss": 1.7267, + "step": 67 + }, + { + "epoch": 0.00379041248606466, + "grad_norm": 0.3876648247241974, + "learning_rate": 8e-05, + "loss": 1.7422, + "step": 68 + }, + { + "epoch": 0.0038461538461538464, + "grad_norm": 0.44103604555130005, + "learning_rate": 8e-05, + "loss": 1.8784, + "step": 69 + }, + { + "epoch": 0.0039018952062430325, + "grad_norm": 0.4045323431491852, + "learning_rate": 8e-05, + "loss": 1.7219, + "step": 70 + }, + { + "epoch": 0.003957636566332219, + "grad_norm": 0.45244699716567993, + "learning_rate": 8e-05, + 
"loss": 2.0911, + "step": 71 + }, + { + "epoch": 0.004013377926421404, + "grad_norm": 0.4084925055503845, + "learning_rate": 8e-05, + "loss": 1.8906, + "step": 72 + }, + { + "epoch": 0.004069119286510591, + "grad_norm": 0.3983491063117981, + "learning_rate": 8e-05, + "loss": 1.5774, + "step": 73 + }, + { + "epoch": 0.004124860646599777, + "grad_norm": 0.4227916896343231, + "learning_rate": 8e-05, + "loss": 1.7904, + "step": 74 + }, + { + "epoch": 0.004180602006688963, + "grad_norm": 0.41621050238609314, + "learning_rate": 8e-05, + "loss": 1.7614, + "step": 75 + }, + { + "epoch": 0.00423634336677815, + "grad_norm": 0.4000743329524994, + "learning_rate": 8e-05, + "loss": 1.8071, + "step": 76 + }, + { + "epoch": 0.004292084726867335, + "grad_norm": 0.41884997487068176, + "learning_rate": 8e-05, + "loss": 1.8346, + "step": 77 + }, + { + "epoch": 0.004347826086956522, + "grad_norm": 0.38804078102111816, + "learning_rate": 8e-05, + "loss": 1.8449, + "step": 78 + }, + { + "epoch": 0.004403567447045708, + "grad_norm": 0.46406039595603943, + "learning_rate": 8e-05, + "loss": 2.236, + "step": 79 + }, + { + "epoch": 0.004459308807134894, + "grad_norm": 0.5488149523735046, + "learning_rate": 8e-05, + "loss": 1.9053, + "step": 80 + }, + { + "epoch": 0.0045150501672240806, + "grad_norm": 0.41493290662765503, + "learning_rate": 8e-05, + "loss": 1.9203, + "step": 81 + }, + { + "epoch": 0.004570791527313266, + "grad_norm": 0.44460850954055786, + "learning_rate": 8e-05, + "loss": 1.9612, + "step": 82 + }, + { + "epoch": 0.004626532887402453, + "grad_norm": 0.42977744340896606, + "learning_rate": 8e-05, + "loss": 1.9116, + "step": 83 + }, + { + "epoch": 0.0046822742474916385, + "grad_norm": 0.39066213369369507, + "learning_rate": 8e-05, + "loss": 1.881, + "step": 84 + }, + { + "epoch": 0.004738015607580825, + "grad_norm": 0.39784467220306396, + "learning_rate": 8e-05, + "loss": 1.5992, + "step": 85 + }, + { + "epoch": 0.0047937569676700115, + "grad_norm": 0.4012242555618286, + 
"learning_rate": 8e-05, + "loss": 1.8036, + "step": 86 + }, + { + "epoch": 0.004849498327759197, + "grad_norm": 0.3906935751438141, + "learning_rate": 8e-05, + "loss": 1.6839, + "step": 87 + }, + { + "epoch": 0.004905239687848384, + "grad_norm": 0.47963079810142517, + "learning_rate": 8e-05, + "loss": 1.9194, + "step": 88 + }, + { + "epoch": 0.0049609810479375694, + "grad_norm": 0.4412635862827301, + "learning_rate": 8e-05, + "loss": 1.7194, + "step": 89 + }, + { + "epoch": 0.005016722408026756, + "grad_norm": 0.40161287784576416, + "learning_rate": 8e-05, + "loss": 1.9614, + "step": 90 + }, + { + "epoch": 0.005072463768115942, + "grad_norm": 0.37398388981819153, + "learning_rate": 8e-05, + "loss": 1.5069, + "step": 91 + }, + { + "epoch": 0.005128205128205128, + "grad_norm": 0.4409233629703522, + "learning_rate": 8e-05, + "loss": 1.9873, + "step": 92 + }, + { + "epoch": 0.005183946488294315, + "grad_norm": 0.3823505938053131, + "learning_rate": 8e-05, + "loss": 1.8509, + "step": 93 + }, + { + "epoch": 0.0052396878483835, + "grad_norm": 0.41670355200767517, + "learning_rate": 8e-05, + "loss": 2.0128, + "step": 94 + }, + { + "epoch": 0.005295429208472687, + "grad_norm": 0.4004131257534027, + "learning_rate": 8e-05, + "loss": 1.9371, + "step": 95 + }, + { + "epoch": 0.005351170568561873, + "grad_norm": 0.40070030093193054, + "learning_rate": 8e-05, + "loss": 1.8621, + "step": 96 + }, + { + "epoch": 0.005406911928651059, + "grad_norm": 0.4367406964302063, + "learning_rate": 8e-05, + "loss": 1.9458, + "step": 97 + }, + { + "epoch": 0.005462653288740245, + "grad_norm": 0.4184412658214569, + "learning_rate": 8e-05, + "loss": 1.9502, + "step": 98 + }, + { + "epoch": 0.005518394648829431, + "grad_norm": 0.41309577226638794, + "learning_rate": 8e-05, + "loss": 1.885, + "step": 99 + }, + { + "epoch": 0.005574136008918618, + "grad_norm": 0.4838528633117676, + "learning_rate": 8e-05, + "loss": 1.995, + "step": 100 + }, + { + "epoch": 0.005629877369007804, + "grad_norm": 
0.39496132731437683, + "learning_rate": 8e-05, + "loss": 1.7633, + "step": 101 + }, + { + "epoch": 0.00568561872909699, + "grad_norm": 0.42587682604789734, + "learning_rate": 8e-05, + "loss": 1.9289, + "step": 102 + }, + { + "epoch": 0.005741360089186176, + "grad_norm": 0.4074772596359253, + "learning_rate": 8e-05, + "loss": 1.8369, + "step": 103 + }, + { + "epoch": 0.005797101449275362, + "grad_norm": 0.3786422610282898, + "learning_rate": 8e-05, + "loss": 1.6762, + "step": 104 + }, + { + "epoch": 0.005852842809364548, + "grad_norm": 0.41452324390411377, + "learning_rate": 8e-05, + "loss": 1.7729, + "step": 105 + }, + { + "epoch": 0.005908584169453735, + "grad_norm": 0.39360037446022034, + "learning_rate": 8e-05, + "loss": 1.8084, + "step": 106 + }, + { + "epoch": 0.005964325529542921, + "grad_norm": 0.44575709104537964, + "learning_rate": 8e-05, + "loss": 1.9992, + "step": 107 + }, + { + "epoch": 0.006020066889632107, + "grad_norm": 0.378961980342865, + "learning_rate": 8e-05, + "loss": 1.6555, + "step": 108 + }, + { + "epoch": 0.006075808249721293, + "grad_norm": 0.3773297667503357, + "learning_rate": 8e-05, + "loss": 1.6608, + "step": 109 + }, + { + "epoch": 0.006131549609810479, + "grad_norm": 0.3853168189525604, + "learning_rate": 8e-05, + "loss": 1.4821, + "step": 110 + }, + { + "epoch": 0.006187290969899666, + "grad_norm": 0.48528149724006653, + "learning_rate": 8e-05, + "loss": 1.9857, + "step": 111 + }, + { + "epoch": 0.006243032329988852, + "grad_norm": 0.435882031917572, + "learning_rate": 8e-05, + "loss": 1.7617, + "step": 112 + }, + { + "epoch": 0.006298773690078038, + "grad_norm": 0.39531388878822327, + "learning_rate": 8e-05, + "loss": 1.5861, + "step": 113 + }, + { + "epoch": 0.006354515050167224, + "grad_norm": 0.43698954582214355, + "learning_rate": 8e-05, + "loss": 1.9298, + "step": 114 + }, + { + "epoch": 0.00641025641025641, + "grad_norm": 0.403531938791275, + "learning_rate": 8e-05, + "loss": 1.8521, + "step": 115 + }, + { + "epoch": 
0.006465997770345597, + "grad_norm": 0.4131438434123993, + "learning_rate": 8e-05, + "loss": 1.8334, + "step": 116 + }, + { + "epoch": 0.006521739130434782, + "grad_norm": 0.37669649720191956, + "learning_rate": 8e-05, + "loss": 1.7758, + "step": 117 + }, + { + "epoch": 0.006577480490523969, + "grad_norm": 0.4128376543521881, + "learning_rate": 8e-05, + "loss": 1.9285, + "step": 118 + }, + { + "epoch": 0.006633221850613155, + "grad_norm": 0.40930864214897156, + "learning_rate": 8e-05, + "loss": 1.9049, + "step": 119 + }, + { + "epoch": 0.006688963210702341, + "grad_norm": 0.42722001671791077, + "learning_rate": 8e-05, + "loss": 1.9176, + "step": 120 + }, + { + "epoch": 0.0067447045707915276, + "grad_norm": 0.4142158031463623, + "learning_rate": 8e-05, + "loss": 1.9328, + "step": 121 + }, + { + "epoch": 0.006800445930880713, + "grad_norm": 0.4357644319534302, + "learning_rate": 8e-05, + "loss": 1.7735, + "step": 122 + }, + { + "epoch": 0.0068561872909699, + "grad_norm": 0.39733514189720154, + "learning_rate": 8e-05, + "loss": 1.7758, + "step": 123 + }, + { + "epoch": 0.0069119286510590855, + "grad_norm": 0.3832526206970215, + "learning_rate": 8e-05, + "loss": 1.7294, + "step": 124 + }, + { + "epoch": 0.006967670011148272, + "grad_norm": 0.4044865369796753, + "learning_rate": 8e-05, + "loss": 1.7418, + "step": 125 + }, + { + "epoch": 0.0070234113712374585, + "grad_norm": 0.41378268599510193, + "learning_rate": 8e-05, + "loss": 1.7275, + "step": 126 + }, + { + "epoch": 0.007079152731326644, + "grad_norm": 0.4132898151874542, + "learning_rate": 8e-05, + "loss": 1.8666, + "step": 127 + }, + { + "epoch": 0.007134894091415831, + "grad_norm": 0.4443804919719696, + "learning_rate": 8e-05, + "loss": 1.9086, + "step": 128 + }, + { + "epoch": 0.0071906354515050164, + "grad_norm": 0.41749894618988037, + "learning_rate": 8e-05, + "loss": 1.8343, + "step": 129 + }, + { + "epoch": 0.007246376811594203, + "grad_norm": 0.38238924741744995, + "learning_rate": 8e-05, + "loss": 1.4543, 
+ "step": 130 + }, + { + "epoch": 0.007302118171683389, + "grad_norm": 0.47451648116111755, + "learning_rate": 8e-05, + "loss": 1.8353, + "step": 131 + }, + { + "epoch": 0.007357859531772575, + "grad_norm": 0.4221440851688385, + "learning_rate": 8e-05, + "loss": 1.8808, + "step": 132 + }, + { + "epoch": 0.007413600891861762, + "grad_norm": 0.4281880557537079, + "learning_rate": 8e-05, + "loss": 1.9372, + "step": 133 + }, + { + "epoch": 0.007469342251950947, + "grad_norm": 0.4605168104171753, + "learning_rate": 8e-05, + "loss": 2.0274, + "step": 134 + }, + { + "epoch": 0.007525083612040134, + "grad_norm": 0.38289088010787964, + "learning_rate": 8e-05, + "loss": 1.7703, + "step": 135 + }, + { + "epoch": 0.00758082497212932, + "grad_norm": 0.40322351455688477, + "learning_rate": 8e-05, + "loss": 1.8378, + "step": 136 + }, + { + "epoch": 0.007636566332218506, + "grad_norm": 0.40321865677833557, + "learning_rate": 8e-05, + "loss": 1.6512, + "step": 137 + }, + { + "epoch": 0.007692307692307693, + "grad_norm": 0.3676503598690033, + "learning_rate": 8e-05, + "loss": 1.6513, + "step": 138 + }, + { + "epoch": 0.007748049052396878, + "grad_norm": 0.3847725987434387, + "learning_rate": 8e-05, + "loss": 1.8038, + "step": 139 + }, + { + "epoch": 0.007803790412486065, + "grad_norm": 0.4042004942893982, + "learning_rate": 8e-05, + "loss": 1.9278, + "step": 140 + }, + { + "epoch": 0.007859531772575251, + "grad_norm": 0.4066566824913025, + "learning_rate": 8e-05, + "loss": 1.7756, + "step": 141 + }, + { + "epoch": 0.007915273132664437, + "grad_norm": 0.39882540702819824, + "learning_rate": 8e-05, + "loss": 1.8299, + "step": 142 + }, + { + "epoch": 0.007971014492753623, + "grad_norm": 0.3843311071395874, + "learning_rate": 8e-05, + "loss": 1.7093, + "step": 143 + }, + { + "epoch": 0.008026755852842809, + "grad_norm": 0.39015519618988037, + "learning_rate": 8e-05, + "loss": 1.7497, + "step": 144 + }, + { + "epoch": 0.008082497212931996, + "grad_norm": 0.40187525749206543, + 
"learning_rate": 8e-05, + "loss": 1.9149, + "step": 145 + }, + { + "epoch": 0.008138238573021182, + "grad_norm": 0.46802017092704773, + "learning_rate": 8e-05, + "loss": 2.0147, + "step": 146 + }, + { + "epoch": 0.008193979933110367, + "grad_norm": 0.40548357367515564, + "learning_rate": 8e-05, + "loss": 1.7083, + "step": 147 + }, + { + "epoch": 0.008249721293199555, + "grad_norm": 0.4108128547668457, + "learning_rate": 8e-05, + "loss": 1.7361, + "step": 148 + }, + { + "epoch": 0.00830546265328874, + "grad_norm": 0.4061950445175171, + "learning_rate": 8e-05, + "loss": 2.0837, + "step": 149 + }, + { + "epoch": 0.008361204013377926, + "grad_norm": 0.4076997935771942, + "learning_rate": 8e-05, + "loss": 1.6702, + "step": 150 + }, + { + "epoch": 0.008416945373467112, + "grad_norm": 0.40252676606178284, + "learning_rate": 8e-05, + "loss": 1.8599, + "step": 151 + }, + { + "epoch": 0.0084726867335563, + "grad_norm": 0.41289693117141724, + "learning_rate": 8e-05, + "loss": 1.9507, + "step": 152 + }, + { + "epoch": 0.008528428093645485, + "grad_norm": 0.406189501285553, + "learning_rate": 8e-05, + "loss": 1.7856, + "step": 153 + }, + { + "epoch": 0.00858416945373467, + "grad_norm": 0.40082770586013794, + "learning_rate": 8e-05, + "loss": 1.8779, + "step": 154 + }, + { + "epoch": 0.008639910813823858, + "grad_norm": 0.4297981560230255, + "learning_rate": 8e-05, + "loss": 1.6413, + "step": 155 + }, + { + "epoch": 0.008695652173913044, + "grad_norm": 0.42399170994758606, + "learning_rate": 8e-05, + "loss": 1.6762, + "step": 156 + }, + { + "epoch": 0.00875139353400223, + "grad_norm": 0.4469023644924164, + "learning_rate": 8e-05, + "loss": 2.2177, + "step": 157 + }, + { + "epoch": 0.008807134894091417, + "grad_norm": 0.41254499554634094, + "learning_rate": 8e-05, + "loss": 1.9285, + "step": 158 + }, + { + "epoch": 0.008862876254180602, + "grad_norm": 0.37810757756233215, + "learning_rate": 8e-05, + "loss": 1.6574, + "step": 159 + }, + { + "epoch": 0.008918617614269788, + 
"grad_norm": 0.5044556260108948, + "learning_rate": 8e-05, + "loss": 1.4282, + "step": 160 + }, + { + "epoch": 0.008974358974358974, + "grad_norm": 0.4354773759841919, + "learning_rate": 8e-05, + "loss": 1.963, + "step": 161 + }, + { + "epoch": 0.009030100334448161, + "grad_norm": 0.4584670662879944, + "learning_rate": 8e-05, + "loss": 2.0064, + "step": 162 + }, + { + "epoch": 0.009085841694537347, + "grad_norm": 0.4361465871334076, + "learning_rate": 8e-05, + "loss": 1.8227, + "step": 163 + }, + { + "epoch": 0.009141583054626532, + "grad_norm": 0.3891424834728241, + "learning_rate": 8e-05, + "loss": 1.5972, + "step": 164 + }, + { + "epoch": 0.00919732441471572, + "grad_norm": 0.44024306535720825, + "learning_rate": 8e-05, + "loss": 1.8614, + "step": 165 + }, + { + "epoch": 0.009253065774804906, + "grad_norm": 0.38277462124824524, + "learning_rate": 8e-05, + "loss": 1.8031, + "step": 166 + }, + { + "epoch": 0.009308807134894091, + "grad_norm": 0.46869274973869324, + "learning_rate": 8e-05, + "loss": 2.156, + "step": 167 + }, + { + "epoch": 0.009364548494983277, + "grad_norm": 0.4273754358291626, + "learning_rate": 8e-05, + "loss": 1.7214, + "step": 168 + }, + { + "epoch": 0.009420289855072464, + "grad_norm": 0.4267232418060303, + "learning_rate": 8e-05, + "loss": 1.89, + "step": 169 + }, + { + "epoch": 0.00947603121516165, + "grad_norm": 0.4204559028148651, + "learning_rate": 8e-05, + "loss": 1.937, + "step": 170 + }, + { + "epoch": 0.009531772575250836, + "grad_norm": 0.4121902585029602, + "learning_rate": 8e-05, + "loss": 1.9587, + "step": 171 + }, + { + "epoch": 0.009587513935340023, + "grad_norm": 0.4139564633369446, + "learning_rate": 8e-05, + "loss": 1.7723, + "step": 172 + }, + { + "epoch": 0.009643255295429209, + "grad_norm": 0.441736102104187, + "learning_rate": 8e-05, + "loss": 1.889, + "step": 173 + }, + { + "epoch": 0.009698996655518394, + "grad_norm": 0.4170135259628296, + "learning_rate": 8e-05, + "loss": 1.9719, + "step": 174 + }, + { + "epoch": 
0.00975473801560758, + "grad_norm": 0.5091986060142517, + "learning_rate": 8e-05, + "loss": 1.8272, + "step": 175 + }, + { + "epoch": 0.009810479375696768, + "grad_norm": 0.391644150018692, + "learning_rate": 8e-05, + "loss": 1.6708, + "step": 176 + }, + { + "epoch": 0.009866220735785953, + "grad_norm": 0.4127022624015808, + "learning_rate": 8e-05, + "loss": 1.7926, + "step": 177 + }, + { + "epoch": 0.009921962095875139, + "grad_norm": 0.43958815932273865, + "learning_rate": 8e-05, + "loss": 1.8704, + "step": 178 + }, + { + "epoch": 0.009977703455964326, + "grad_norm": 0.46184685826301575, + "learning_rate": 8e-05, + "loss": 1.9766, + "step": 179 + }, + { + "epoch": 0.010033444816053512, + "grad_norm": 0.4079630374908447, + "learning_rate": 8e-05, + "loss": 1.6926, + "step": 180 + }, + { + "epoch": 0.010089186176142698, + "grad_norm": 0.3960077166557312, + "learning_rate": 8e-05, + "loss": 1.599, + "step": 181 + }, + { + "epoch": 0.010144927536231883, + "grad_norm": 0.37671923637390137, + "learning_rate": 8e-05, + "loss": 1.6455, + "step": 182 + }, + { + "epoch": 0.01020066889632107, + "grad_norm": 0.4062810242176056, + "learning_rate": 8e-05, + "loss": 1.9651, + "step": 183 + }, + { + "epoch": 0.010256410256410256, + "grad_norm": 0.4247036278247833, + "learning_rate": 8e-05, + "loss": 2.0219, + "step": 184 + }, + { + "epoch": 0.010312151616499442, + "grad_norm": 0.40789371728897095, + "learning_rate": 8e-05, + "loss": 1.9062, + "step": 185 + }, + { + "epoch": 0.01036789297658863, + "grad_norm": 0.4061046540737152, + "learning_rate": 8e-05, + "loss": 1.9277, + "step": 186 + }, + { + "epoch": 0.010423634336677815, + "grad_norm": 0.396852046251297, + "learning_rate": 8e-05, + "loss": 1.8621, + "step": 187 + }, + { + "epoch": 0.010479375696767, + "grad_norm": 0.4642435610294342, + "learning_rate": 8e-05, + "loss": 1.9323, + "step": 188 + }, + { + "epoch": 0.010535117056856187, + "grad_norm": 0.43151143193244934, + "learning_rate": 8e-05, + "loss": 1.8768, + "step": 
189 + }, + { + "epoch": 0.010590858416945374, + "grad_norm": 0.4810677766799927, + "learning_rate": 8e-05, + "loss": 2.1525, + "step": 190 + }, + { + "epoch": 0.01064659977703456, + "grad_norm": 0.40719544887542725, + "learning_rate": 8e-05, + "loss": 1.8136, + "step": 191 + }, + { + "epoch": 0.010702341137123745, + "grad_norm": 0.4264082908630371, + "learning_rate": 8e-05, + "loss": 2.1189, + "step": 192 + }, + { + "epoch": 0.010758082497212933, + "grad_norm": 0.40869855880737305, + "learning_rate": 8e-05, + "loss": 1.8806, + "step": 193 + }, + { + "epoch": 0.010813823857302118, + "grad_norm": 0.45739123225212097, + "learning_rate": 8e-05, + "loss": 2.0878, + "step": 194 + }, + { + "epoch": 0.010869565217391304, + "grad_norm": 0.43634286522865295, + "learning_rate": 8e-05, + "loss": 1.8352, + "step": 195 + }, + { + "epoch": 0.01092530657748049, + "grad_norm": 0.41823098063468933, + "learning_rate": 8e-05, + "loss": 1.8458, + "step": 196 + }, + { + "epoch": 0.010981047937569677, + "grad_norm": 0.4044710695743561, + "learning_rate": 8e-05, + "loss": 1.9065, + "step": 197 + }, + { + "epoch": 0.011036789297658863, + "grad_norm": 0.4063010513782501, + "learning_rate": 8e-05, + "loss": 1.7212, + "step": 198 + }, + { + "epoch": 0.011092530657748048, + "grad_norm": 0.43073466420173645, + "learning_rate": 8e-05, + "loss": 1.9087, + "step": 199 + }, + { + "epoch": 0.011148272017837236, + "grad_norm": 0.4032673239707947, + "learning_rate": 8e-05, + "loss": 1.7571, + "step": 200 + }, + { + "epoch": 0.011204013377926422, + "grad_norm": 0.39450713992118835, + "learning_rate": 8e-05, + "loss": 1.7264, + "step": 201 + }, + { + "epoch": 0.011259754738015607, + "grad_norm": 0.40192824602127075, + "learning_rate": 8e-05, + "loss": 1.8697, + "step": 202 + }, + { + "epoch": 0.011315496098104793, + "grad_norm": 0.4325474798679352, + "learning_rate": 8e-05, + "loss": 1.8927, + "step": 203 + }, + { + "epoch": 0.01137123745819398, + "grad_norm": 0.38713356852531433, + "learning_rate": 
8e-05, + "loss": 1.6918, + "step": 204 + }, + { + "epoch": 0.011426978818283166, + "grad_norm": 0.39463481307029724, + "learning_rate": 8e-05, + "loss": 1.8829, + "step": 205 + }, + { + "epoch": 0.011482720178372352, + "grad_norm": 0.41305360198020935, + "learning_rate": 8e-05, + "loss": 1.7638, + "step": 206 + }, + { + "epoch": 0.011538461538461539, + "grad_norm": 0.4194389879703522, + "learning_rate": 8e-05, + "loss": 1.8702, + "step": 207 + }, + { + "epoch": 0.011594202898550725, + "grad_norm": 0.4725489020347595, + "learning_rate": 8e-05, + "loss": 2.0307, + "step": 208 + }, + { + "epoch": 0.01164994425863991, + "grad_norm": 0.45651867985725403, + "learning_rate": 8e-05, + "loss": 1.9788, + "step": 209 + }, + { + "epoch": 0.011705685618729096, + "grad_norm": 0.41888317465782166, + "learning_rate": 8e-05, + "loss": 1.9555, + "step": 210 + }, + { + "epoch": 0.011761426978818284, + "grad_norm": 0.4058757722377777, + "learning_rate": 8e-05, + "loss": 1.8253, + "step": 211 + }, + { + "epoch": 0.01181716833890747, + "grad_norm": 0.3691366910934448, + "learning_rate": 8e-05, + "loss": 1.596, + "step": 212 + }, + { + "epoch": 0.011872909698996655, + "grad_norm": 0.37606245279312134, + "learning_rate": 8e-05, + "loss": 1.587, + "step": 213 + }, + { + "epoch": 0.011928651059085842, + "grad_norm": 0.42671895027160645, + "learning_rate": 8e-05, + "loss": 1.7991, + "step": 214 + }, + { + "epoch": 0.011984392419175028, + "grad_norm": 0.40445569157600403, + "learning_rate": 8e-05, + "loss": 1.7706, + "step": 215 + }, + { + "epoch": 0.012040133779264214, + "grad_norm": 0.4408814013004303, + "learning_rate": 8e-05, + "loss": 1.8041, + "step": 216 + }, + { + "epoch": 0.012095875139353401, + "grad_norm": 0.43463078141212463, + "learning_rate": 8e-05, + "loss": 1.9364, + "step": 217 + }, + { + "epoch": 0.012151616499442587, + "grad_norm": 0.4390909671783447, + "learning_rate": 8e-05, + "loss": 1.8111, + "step": 218 + }, + { + "epoch": 0.012207357859531772, + "grad_norm": 
0.4165789484977722, + "learning_rate": 8e-05, + "loss": 1.8843, + "step": 219 + }, + { + "epoch": 0.012263099219620958, + "grad_norm": 0.4711140990257263, + "learning_rate": 8e-05, + "loss": 2.1673, + "step": 220 + }, + { + "epoch": 0.012318840579710146, + "grad_norm": 0.4233494997024536, + "learning_rate": 8e-05, + "loss": 1.8285, + "step": 221 + }, + { + "epoch": 0.012374581939799331, + "grad_norm": 0.41040298342704773, + "learning_rate": 8e-05, + "loss": 1.7355, + "step": 222 + }, + { + "epoch": 0.012430323299888517, + "grad_norm": 0.4037279784679413, + "learning_rate": 8e-05, + "loss": 1.8224, + "step": 223 + }, + { + "epoch": 0.012486064659977704, + "grad_norm": 0.45015132427215576, + "learning_rate": 8e-05, + "loss": 2.0457, + "step": 224 + }, + { + "epoch": 0.01254180602006689, + "grad_norm": 0.4033258855342865, + "learning_rate": 8e-05, + "loss": 1.8827, + "step": 225 + }, + { + "epoch": 0.012597547380156076, + "grad_norm": 0.38210955262184143, + "learning_rate": 8e-05, + "loss": 1.7697, + "step": 226 + }, + { + "epoch": 0.012653288740245261, + "grad_norm": 0.39336487650871277, + "learning_rate": 8e-05, + "loss": 1.7893, + "step": 227 + }, + { + "epoch": 0.012709030100334449, + "grad_norm": 0.38320380449295044, + "learning_rate": 8e-05, + "loss": 1.7757, + "step": 228 + }, + { + "epoch": 0.012764771460423634, + "grad_norm": 0.4049006700515747, + "learning_rate": 8e-05, + "loss": 1.8953, + "step": 229 + }, + { + "epoch": 0.01282051282051282, + "grad_norm": 0.412638783454895, + "learning_rate": 8e-05, + "loss": 1.7646, + "step": 230 + }, + { + "epoch": 0.012876254180602007, + "grad_norm": 0.3987637460231781, + "learning_rate": 8e-05, + "loss": 1.7276, + "step": 231 + }, + { + "epoch": 0.012931995540691193, + "grad_norm": 0.40791159868240356, + "learning_rate": 8e-05, + "loss": 1.6983, + "step": 232 + }, + { + "epoch": 0.012987736900780379, + "grad_norm": 0.41750428080558777, + "learning_rate": 8e-05, + "loss": 2.0136, + "step": 233 + }, + { + "epoch": 
0.013043478260869565, + "grad_norm": 0.4128837585449219, + "learning_rate": 8e-05, + "loss": 1.8859, + "step": 234 + }, + { + "epoch": 0.013099219620958752, + "grad_norm": 0.40263018012046814, + "learning_rate": 8e-05, + "loss": 1.8077, + "step": 235 + }, + { + "epoch": 0.013154960981047938, + "grad_norm": 0.40996524691581726, + "learning_rate": 8e-05, + "loss": 1.6886, + "step": 236 + }, + { + "epoch": 0.013210702341137123, + "grad_norm": 0.47232288122177124, + "learning_rate": 8e-05, + "loss": 1.9667, + "step": 237 + }, + { + "epoch": 0.01326644370122631, + "grad_norm": 0.4224655032157898, + "learning_rate": 8e-05, + "loss": 1.7665, + "step": 238 + }, + { + "epoch": 0.013322185061315496, + "grad_norm": 0.3976142406463623, + "learning_rate": 8e-05, + "loss": 1.7982, + "step": 239 + }, + { + "epoch": 0.013377926421404682, + "grad_norm": 0.4137797951698303, + "learning_rate": 8e-05, + "loss": 1.8543, + "step": 240 + }, + { + "epoch": 0.013433667781493868, + "grad_norm": 0.4323852062225342, + "learning_rate": 8e-05, + "loss": 1.9242, + "step": 241 + }, + { + "epoch": 0.013489409141583055, + "grad_norm": 0.4583011269569397, + "learning_rate": 8e-05, + "loss": 2.0125, + "step": 242 + }, + { + "epoch": 0.01354515050167224, + "grad_norm": 0.4027802646160126, + "learning_rate": 8e-05, + "loss": 1.8284, + "step": 243 + }, + { + "epoch": 0.013600891861761426, + "grad_norm": 0.45875123143196106, + "learning_rate": 8e-05, + "loss": 2.0507, + "step": 244 + }, + { + "epoch": 0.013656633221850614, + "grad_norm": 0.4033229351043701, + "learning_rate": 8e-05, + "loss": 1.9382, + "step": 245 + }, + { + "epoch": 0.0137123745819398, + "grad_norm": 0.3958418667316437, + "learning_rate": 8e-05, + "loss": 1.8293, + "step": 246 + }, + { + "epoch": 0.013768115942028985, + "grad_norm": 0.4176151752471924, + "learning_rate": 8e-05, + "loss": 1.8388, + "step": 247 + }, + { + "epoch": 0.013823857302118171, + "grad_norm": 0.3736697733402252, + "learning_rate": 8e-05, + "loss": 1.681, + "step": 
248 + }, + { + "epoch": 0.013879598662207358, + "grad_norm": 0.40974247455596924, + "learning_rate": 8e-05, + "loss": 1.7743, + "step": 249 + }, + { + "epoch": 0.013935340022296544, + "grad_norm": 0.41332730650901794, + "learning_rate": 8e-05, + "loss": 1.6346, + "step": 250 + }, + { + "epoch": 0.01399108138238573, + "grad_norm": 0.42761102318763733, + "learning_rate": 8e-05, + "loss": 1.8236, + "step": 251 + }, + { + "epoch": 0.014046822742474917, + "grad_norm": 0.4135681986808777, + "learning_rate": 8e-05, + "loss": 1.8745, + "step": 252 + }, + { + "epoch": 0.014102564102564103, + "grad_norm": 0.38827580213546753, + "learning_rate": 8e-05, + "loss": 1.633, + "step": 253 + }, + { + "epoch": 0.014158305462653288, + "grad_norm": 0.42247524857521057, + "learning_rate": 8e-05, + "loss": 1.8305, + "step": 254 + }, + { + "epoch": 0.014214046822742474, + "grad_norm": 0.3817632794380188, + "learning_rate": 8e-05, + "loss": 1.6589, + "step": 255 + }, + { + "epoch": 0.014269788182831662, + "grad_norm": 0.41619542241096497, + "learning_rate": 8e-05, + "loss": 1.6908, + "step": 256 + }, + { + "epoch": 0.014325529542920847, + "grad_norm": 0.40807268023490906, + "learning_rate": 8e-05, + "loss": 1.8188, + "step": 257 + }, + { + "epoch": 0.014381270903010033, + "grad_norm": 0.3903752565383911, + "learning_rate": 8e-05, + "loss": 1.5068, + "step": 258 + }, + { + "epoch": 0.01443701226309922, + "grad_norm": 0.43174487352371216, + "learning_rate": 8e-05, + "loss": 2.0029, + "step": 259 + }, + { + "epoch": 0.014492753623188406, + "grad_norm": 0.40700212121009827, + "learning_rate": 8e-05, + "loss": 1.7942, + "step": 260 + }, + { + "epoch": 0.014548494983277592, + "grad_norm": 0.4008272588253021, + "learning_rate": 8e-05, + "loss": 1.8252, + "step": 261 + }, + { + "epoch": 0.014604236343366777, + "grad_norm": 0.441720187664032, + "learning_rate": 8e-05, + "loss": 1.7293, + "step": 262 + }, + { + "epoch": 0.014659977703455965, + "grad_norm": 0.37992534041404724, + "learning_rate": 
8e-05, + "loss": 1.7273, + "step": 263 + }, + { + "epoch": 0.01471571906354515, + "grad_norm": 0.38580581545829773, + "learning_rate": 8e-05, + "loss": 1.7454, + "step": 264 + }, + { + "epoch": 0.014771460423634336, + "grad_norm": 0.4266432225704193, + "learning_rate": 8e-05, + "loss": 1.9173, + "step": 265 + }, + { + "epoch": 0.014827201783723524, + "grad_norm": 0.4495110809803009, + "learning_rate": 8e-05, + "loss": 1.9769, + "step": 266 + }, + { + "epoch": 0.01488294314381271, + "grad_norm": 0.4119555354118347, + "learning_rate": 8e-05, + "loss": 1.7148, + "step": 267 + }, + { + "epoch": 0.014938684503901895, + "grad_norm": 0.41930001974105835, + "learning_rate": 8e-05, + "loss": 1.9008, + "step": 268 + }, + { + "epoch": 0.01499442586399108, + "grad_norm": 0.41471660137176514, + "learning_rate": 8e-05, + "loss": 1.825, + "step": 269 + }, + { + "epoch": 0.015050167224080268, + "grad_norm": 0.4505252540111542, + "learning_rate": 8e-05, + "loss": 1.9394, + "step": 270 + }, + { + "epoch": 0.015105908584169454, + "grad_norm": 0.45706626772880554, + "learning_rate": 8e-05, + "loss": 2.0057, + "step": 271 + }, + { + "epoch": 0.01516164994425864, + "grad_norm": 0.41290906071662903, + "learning_rate": 8e-05, + "loss": 2.0293, + "step": 272 + }, + { + "epoch": 0.015217391304347827, + "grad_norm": 0.41640666127204895, + "learning_rate": 8e-05, + "loss": 1.7892, + "step": 273 + }, + { + "epoch": 0.015273132664437012, + "grad_norm": 0.4064461588859558, + "learning_rate": 8e-05, + "loss": 1.8026, + "step": 274 + }, + { + "epoch": 0.015328874024526198, + "grad_norm": 0.46918439865112305, + "learning_rate": 8e-05, + "loss": 1.8843, + "step": 275 + }, + { + "epoch": 0.015384615384615385, + "grad_norm": 0.39473360776901245, + "learning_rate": 8e-05, + "loss": 1.77, + "step": 276 + }, + { + "epoch": 0.015440356744704571, + "grad_norm": 0.36619892716407776, + "learning_rate": 8e-05, + "loss": 1.5864, + "step": 277 + }, + { + "epoch": 0.015496098104793757, + "grad_norm": 
0.4101516306400299, + "learning_rate": 8e-05, + "loss": 1.7618, + "step": 278 + }, + { + "epoch": 0.015551839464882942, + "grad_norm": 0.41335687041282654, + "learning_rate": 8e-05, + "loss": 1.6014, + "step": 279 + }, + { + "epoch": 0.01560758082497213, + "grad_norm": 0.44227463006973267, + "learning_rate": 8e-05, + "loss": 2.0184, + "step": 280 + }, + { + "epoch": 0.015663322185061314, + "grad_norm": 0.40748894214630127, + "learning_rate": 8e-05, + "loss": 1.7469, + "step": 281 + }, + { + "epoch": 0.015719063545150503, + "grad_norm": 0.41847971081733704, + "learning_rate": 8e-05, + "loss": 1.7594, + "step": 282 + }, + { + "epoch": 0.01577480490523969, + "grad_norm": 0.41384992003440857, + "learning_rate": 8e-05, + "loss": 1.4822, + "step": 283 + }, + { + "epoch": 0.015830546265328874, + "grad_norm": 0.40941479802131653, + "learning_rate": 8e-05, + "loss": 1.8697, + "step": 284 + }, + { + "epoch": 0.01588628762541806, + "grad_norm": 0.42511463165283203, + "learning_rate": 8e-05, + "loss": 1.8659, + "step": 285 + }, + { + "epoch": 0.015942028985507246, + "grad_norm": 0.41972148418426514, + "learning_rate": 8e-05, + "loss": 1.7126, + "step": 286 + }, + { + "epoch": 0.01599777034559643, + "grad_norm": 0.42691537737846375, + "learning_rate": 8e-05, + "loss": 1.8377, + "step": 287 + }, + { + "epoch": 0.016053511705685617, + "grad_norm": 0.4027348756790161, + "learning_rate": 8e-05, + "loss": 1.6883, + "step": 288 + }, + { + "epoch": 0.016109253065774806, + "grad_norm": 0.41511085629463196, + "learning_rate": 8e-05, + "loss": 1.9085, + "step": 289 + }, + { + "epoch": 0.016164994425863992, + "grad_norm": 0.4326029419898987, + "learning_rate": 8e-05, + "loss": 1.8441, + "step": 290 + }, + { + "epoch": 0.016220735785953178, + "grad_norm": 0.40246617794036865, + "learning_rate": 8e-05, + "loss": 1.7686, + "step": 291 + }, + { + "epoch": 0.016276477146042363, + "grad_norm": 0.38311776518821716, + "learning_rate": 8e-05, + "loss": 1.7145, + "step": 292 + }, + { + "epoch": 
0.01633221850613155, + "grad_norm": 0.3850622773170471, + "learning_rate": 8e-05, + "loss": 1.7244, + "step": 293 + }, + { + "epoch": 0.016387959866220735, + "grad_norm": 0.4168466627597809, + "learning_rate": 8e-05, + "loss": 1.8325, + "step": 294 + }, + { + "epoch": 0.01644370122630992, + "grad_norm": 0.3952409029006958, + "learning_rate": 8e-05, + "loss": 1.9131, + "step": 295 + }, + { + "epoch": 0.01649944258639911, + "grad_norm": 0.42862653732299805, + "learning_rate": 8e-05, + "loss": 1.8013, + "step": 296 + }, + { + "epoch": 0.016555183946488295, + "grad_norm": 0.42462411522865295, + "learning_rate": 8e-05, + "loss": 1.7125, + "step": 297 + }, + { + "epoch": 0.01661092530657748, + "grad_norm": 0.3978937268257141, + "learning_rate": 8e-05, + "loss": 1.7819, + "step": 298 + }, + { + "epoch": 0.016666666666666666, + "grad_norm": 0.3845030963420868, + "learning_rate": 8e-05, + "loss": 1.6577, + "step": 299 + }, + { + "epoch": 0.016722408026755852, + "grad_norm": 0.4124613404273987, + "learning_rate": 8e-05, + "loss": 1.7366, + "step": 300 + }, + { + "epoch": 0.016778149386845038, + "grad_norm": 0.45144006609916687, + "learning_rate": 8e-05, + "loss": 1.8667, + "step": 301 + }, + { + "epoch": 0.016833890746934223, + "grad_norm": 0.37532711029052734, + "learning_rate": 8e-05, + "loss": 1.5467, + "step": 302 + }, + { + "epoch": 0.016889632107023413, + "grad_norm": 0.41644760966300964, + "learning_rate": 8e-05, + "loss": 1.8618, + "step": 303 + }, + { + "epoch": 0.0169453734671126, + "grad_norm": 0.4005732834339142, + "learning_rate": 8e-05, + "loss": 1.7652, + "step": 304 + }, + { + "epoch": 0.017001114827201784, + "grad_norm": 0.384604275226593, + "learning_rate": 8e-05, + "loss": 1.7026, + "step": 305 + }, + { + "epoch": 0.01705685618729097, + "grad_norm": 0.3821338415145874, + "learning_rate": 8e-05, + "loss": 1.6649, + "step": 306 + }, + { + "epoch": 0.017112597547380155, + "grad_norm": 0.415862500667572, + "learning_rate": 8e-05, + "loss": 1.9326, + "step": 
307 + }, + { + "epoch": 0.01716833890746934, + "grad_norm": 0.4307110905647278, + "learning_rate": 8e-05, + "loss": 1.7675, + "step": 308 + }, + { + "epoch": 0.01722408026755853, + "grad_norm": 0.49622923135757446, + "learning_rate": 8e-05, + "loss": 1.8899, + "step": 309 + }, + { + "epoch": 0.017279821627647716, + "grad_norm": 0.4144655466079712, + "learning_rate": 8e-05, + "loss": 1.9463, + "step": 310 + }, + { + "epoch": 0.0173355629877369, + "grad_norm": 0.42982232570648193, + "learning_rate": 8e-05, + "loss": 1.7616, + "step": 311 + }, + { + "epoch": 0.017391304347826087, + "grad_norm": 0.419792115688324, + "learning_rate": 8e-05, + "loss": 1.7261, + "step": 312 + }, + { + "epoch": 0.017447045707915273, + "grad_norm": 0.4065159261226654, + "learning_rate": 8e-05, + "loss": 1.4601, + "step": 313 + }, + { + "epoch": 0.01750278706800446, + "grad_norm": 0.4134047329425812, + "learning_rate": 8e-05, + "loss": 1.8337, + "step": 314 + }, + { + "epoch": 0.017558528428093644, + "grad_norm": 0.4063039720058441, + "learning_rate": 8e-05, + "loss": 1.8483, + "step": 315 + }, + { + "epoch": 0.017614269788182833, + "grad_norm": 0.41646286845207214, + "learning_rate": 8e-05, + "loss": 1.6276, + "step": 316 + }, + { + "epoch": 0.01767001114827202, + "grad_norm": 0.4397350549697876, + "learning_rate": 8e-05, + "loss": 1.9969, + "step": 317 + }, + { + "epoch": 0.017725752508361205, + "grad_norm": 0.40330687165260315, + "learning_rate": 8e-05, + "loss": 1.6762, + "step": 318 + }, + { + "epoch": 0.01778149386845039, + "grad_norm": 0.38782450556755066, + "learning_rate": 8e-05, + "loss": 1.8607, + "step": 319 + }, + { + "epoch": 0.017837235228539576, + "grad_norm": 0.41678085923194885, + "learning_rate": 8e-05, + "loss": 1.8923, + "step": 320 + }, + { + "epoch": 0.01789297658862876, + "grad_norm": 0.43278276920318604, + "learning_rate": 8e-05, + "loss": 1.7742, + "step": 321 + }, + { + "epoch": 0.017948717948717947, + "grad_norm": 0.43438470363616943, + "learning_rate": 8e-05, + 
"loss": 2.0248, + "step": 322 + }, + { + "epoch": 0.018004459308807137, + "grad_norm": 0.41625747084617615, + "learning_rate": 8e-05, + "loss": 1.8575, + "step": 323 + }, + { + "epoch": 0.018060200668896322, + "grad_norm": 0.3881988227367401, + "learning_rate": 8e-05, + "loss": 1.815, + "step": 324 + }, + { + "epoch": 0.018115942028985508, + "grad_norm": 0.462026447057724, + "learning_rate": 8e-05, + "loss": 2.1086, + "step": 325 + }, + { + "epoch": 0.018171683389074694, + "grad_norm": 0.39492854475975037, + "learning_rate": 8e-05, + "loss": 1.7288, + "step": 326 + }, + { + "epoch": 0.01822742474916388, + "grad_norm": 0.4187218248844147, + "learning_rate": 8e-05, + "loss": 1.6914, + "step": 327 + }, + { + "epoch": 0.018283166109253065, + "grad_norm": 0.37417539954185486, + "learning_rate": 8e-05, + "loss": 1.6756, + "step": 328 + }, + { + "epoch": 0.01833890746934225, + "grad_norm": 0.379098504781723, + "learning_rate": 8e-05, + "loss": 1.5272, + "step": 329 + }, + { + "epoch": 0.01839464882943144, + "grad_norm": 0.4593086838722229, + "learning_rate": 8e-05, + "loss": 1.9489, + "step": 330 + }, + { + "epoch": 0.018450390189520625, + "grad_norm": 0.4017282724380493, + "learning_rate": 8e-05, + "loss": 1.6141, + "step": 331 + }, + { + "epoch": 0.01850613154960981, + "grad_norm": 0.41741320490837097, + "learning_rate": 8e-05, + "loss": 1.9041, + "step": 332 + }, + { + "epoch": 0.018561872909698997, + "grad_norm": 0.7527666687965393, + "learning_rate": 8e-05, + "loss": 1.9661, + "step": 333 + }, + { + "epoch": 0.018617614269788182, + "grad_norm": 0.3948970437049866, + "learning_rate": 8e-05, + "loss": 1.7473, + "step": 334 + }, + { + "epoch": 0.018673355629877368, + "grad_norm": 0.46345531940460205, + "learning_rate": 8e-05, + "loss": 2.0197, + "step": 335 + }, + { + "epoch": 0.018729096989966554, + "grad_norm": 0.41160738468170166, + "learning_rate": 8e-05, + "loss": 1.8306, + "step": 336 + }, + { + "epoch": 0.018784838350055743, + "grad_norm": 0.4199187457561493, + 
"learning_rate": 8e-05, + "loss": 1.7609, + "step": 337 + }, + { + "epoch": 0.01884057971014493, + "grad_norm": 0.43813398480415344, + "learning_rate": 8e-05, + "loss": 1.9953, + "step": 338 + }, + { + "epoch": 0.018896321070234114, + "grad_norm": 0.3779383897781372, + "learning_rate": 8e-05, + "loss": 1.5923, + "step": 339 + }, + { + "epoch": 0.0189520624303233, + "grad_norm": 0.4910745918750763, + "learning_rate": 8e-05, + "loss": 2.1573, + "step": 340 + }, + { + "epoch": 0.019007803790412486, + "grad_norm": 0.44113677740097046, + "learning_rate": 8e-05, + "loss": 1.9559, + "step": 341 + }, + { + "epoch": 0.01906354515050167, + "grad_norm": 0.41205549240112305, + "learning_rate": 8e-05, + "loss": 1.6418, + "step": 342 + }, + { + "epoch": 0.019119286510590857, + "grad_norm": 0.41746771335601807, + "learning_rate": 8e-05, + "loss": 1.8842, + "step": 343 + }, + { + "epoch": 0.019175027870680046, + "grad_norm": 0.42498695850372314, + "learning_rate": 8e-05, + "loss": 1.8814, + "step": 344 + }, + { + "epoch": 0.019230769230769232, + "grad_norm": 0.4002072811126709, + "learning_rate": 8e-05, + "loss": 1.7794, + "step": 345 + }, + { + "epoch": 0.019286510590858418, + "grad_norm": 0.4046269953250885, + "learning_rate": 8e-05, + "loss": 1.9115, + "step": 346 + }, + { + "epoch": 0.019342251950947603, + "grad_norm": 0.38134604692459106, + "learning_rate": 8e-05, + "loss": 1.6016, + "step": 347 + }, + { + "epoch": 0.01939799331103679, + "grad_norm": 0.44585302472114563, + "learning_rate": 8e-05, + "loss": 1.7686, + "step": 348 + }, + { + "epoch": 0.019453734671125975, + "grad_norm": 0.42694950103759766, + "learning_rate": 8e-05, + "loss": 1.746, + "step": 349 + }, + { + "epoch": 0.01950947603121516, + "grad_norm": 0.3884536623954773, + "learning_rate": 8e-05, + "loss": 1.6251, + "step": 350 + }, + { + "epoch": 0.01956521739130435, + "grad_norm": 0.4125804901123047, + "learning_rate": 8e-05, + "loss": 1.6578, + "step": 351 + }, + { + "epoch": 0.019620958751393535, + 
"grad_norm": 0.3952336609363556, + "learning_rate": 8e-05, + "loss": 1.475, + "step": 352 + }, + { + "epoch": 0.01967670011148272, + "grad_norm": 0.40272772312164307, + "learning_rate": 8e-05, + "loss": 1.7858, + "step": 353 + }, + { + "epoch": 0.019732441471571906, + "grad_norm": 0.398934930562973, + "learning_rate": 8e-05, + "loss": 1.6765, + "step": 354 + }, + { + "epoch": 0.019788182831661092, + "grad_norm": 0.4227500557899475, + "learning_rate": 8e-05, + "loss": 1.6917, + "step": 355 + }, + { + "epoch": 0.019843924191750278, + "grad_norm": 0.44395002722740173, + "learning_rate": 8e-05, + "loss": 2.0649, + "step": 356 + }, + { + "epoch": 0.019899665551839463, + "grad_norm": 0.4013097286224365, + "learning_rate": 8e-05, + "loss": 1.7733, + "step": 357 + }, + { + "epoch": 0.019955406911928653, + "grad_norm": 0.3915567100048065, + "learning_rate": 8e-05, + "loss": 1.8218, + "step": 358 + }, + { + "epoch": 0.020011148272017838, + "grad_norm": 0.4370361268520355, + "learning_rate": 8e-05, + "loss": 1.8071, + "step": 359 + }, + { + "epoch": 0.020066889632107024, + "grad_norm": 0.4509705901145935, + "learning_rate": 8e-05, + "loss": 1.7563, + "step": 360 + }, + { + "epoch": 0.02012263099219621, + "grad_norm": 0.3994988799095154, + "learning_rate": 8e-05, + "loss": 1.7667, + "step": 361 + }, + { + "epoch": 0.020178372352285395, + "grad_norm": 0.41288429498672485, + "learning_rate": 8e-05, + "loss": 1.8398, + "step": 362 + }, + { + "epoch": 0.02023411371237458, + "grad_norm": 0.4281528890132904, + "learning_rate": 8e-05, + "loss": 1.8315, + "step": 363 + }, + { + "epoch": 0.020289855072463767, + "grad_norm": 0.5056607127189636, + "learning_rate": 8e-05, + "loss": 1.9024, + "step": 364 + }, + { + "epoch": 0.020345596432552956, + "grad_norm": 0.379406601190567, + "learning_rate": 8e-05, + "loss": 1.6976, + "step": 365 + }, + { + "epoch": 0.02040133779264214, + "grad_norm": 0.3984786868095398, + "learning_rate": 8e-05, + "loss": 1.9396, + "step": 366 + }, + { + "epoch": 
0.020457079152731327, + "grad_norm": 0.4233458936214447, + "learning_rate": 8e-05, + "loss": 1.8615, + "step": 367 + }, + { + "epoch": 0.020512820512820513, + "grad_norm": 0.4357461929321289, + "learning_rate": 8e-05, + "loss": 1.8852, + "step": 368 + }, + { + "epoch": 0.0205685618729097, + "grad_norm": 0.3997776210308075, + "learning_rate": 8e-05, + "loss": 1.6763, + "step": 369 + }, + { + "epoch": 0.020624303232998884, + "grad_norm": 0.4437631070613861, + "learning_rate": 8e-05, + "loss": 1.9861, + "step": 370 + }, + { + "epoch": 0.02068004459308807, + "grad_norm": 0.41594505310058594, + "learning_rate": 8e-05, + "loss": 1.7889, + "step": 371 + }, + { + "epoch": 0.02073578595317726, + "grad_norm": 0.4146479368209839, + "learning_rate": 8e-05, + "loss": 1.8944, + "step": 372 + }, + { + "epoch": 0.020791527313266445, + "grad_norm": 0.43242311477661133, + "learning_rate": 8e-05, + "loss": 1.8777, + "step": 373 + }, + { + "epoch": 0.02084726867335563, + "grad_norm": 0.427500456571579, + "learning_rate": 8e-05, + "loss": 1.7193, + "step": 374 + }, + { + "epoch": 0.020903010033444816, + "grad_norm": 0.4136715531349182, + "learning_rate": 8e-05, + "loss": 1.8498, + "step": 375 + }, + { + "epoch": 0.020958751393534, + "grad_norm": 0.39340317249298096, + "learning_rate": 8e-05, + "loss": 1.7887, + "step": 376 + }, + { + "epoch": 0.021014492753623187, + "grad_norm": 0.46348172426223755, + "learning_rate": 8e-05, + "loss": 1.9388, + "step": 377 + }, + { + "epoch": 0.021070234113712373, + "grad_norm": 0.3817267119884491, + "learning_rate": 8e-05, + "loss": 1.5686, + "step": 378 + }, + { + "epoch": 0.021125975473801562, + "grad_norm": 0.4563877284526825, + "learning_rate": 8e-05, + "loss": 1.8118, + "step": 379 + }, + { + "epoch": 0.021181716833890748, + "grad_norm": 0.41754311323165894, + "learning_rate": 8e-05, + "loss": 1.7371, + "step": 380 + }, + { + "epoch": 0.021237458193979934, + "grad_norm": 0.3904457688331604, + "learning_rate": 8e-05, + "loss": 1.5807, + "step": 
381 + }, + { + "epoch": 0.02129319955406912, + "grad_norm": 0.4270111322402954, + "learning_rate": 8e-05, + "loss": 1.7304, + "step": 382 + }, + { + "epoch": 0.021348940914158305, + "grad_norm": 0.425065279006958, + "learning_rate": 8e-05, + "loss": 1.8769, + "step": 383 + }, + { + "epoch": 0.02140468227424749, + "grad_norm": 0.4404243528842926, + "learning_rate": 8e-05, + "loss": 1.7372, + "step": 384 + }, + { + "epoch": 0.021460423634336676, + "grad_norm": 0.4347771108150482, + "learning_rate": 8e-05, + "loss": 1.9571, + "step": 385 + }, + { + "epoch": 0.021516164994425865, + "grad_norm": 0.4173373579978943, + "learning_rate": 8e-05, + "loss": 1.7546, + "step": 386 + }, + { + "epoch": 0.02157190635451505, + "grad_norm": 0.4261404275894165, + "learning_rate": 8e-05, + "loss": 1.9542, + "step": 387 + }, + { + "epoch": 0.021627647714604237, + "grad_norm": 0.4324272871017456, + "learning_rate": 8e-05, + "loss": 1.7786, + "step": 388 + }, + { + "epoch": 0.021683389074693422, + "grad_norm": 0.39340800046920776, + "learning_rate": 8e-05, + "loss": 1.6615, + "step": 389 + }, + { + "epoch": 0.021739130434782608, + "grad_norm": 0.41590434312820435, + "learning_rate": 8e-05, + "loss": 1.8427, + "step": 390 + }, + { + "epoch": 0.021794871794871794, + "grad_norm": 0.41448861360549927, + "learning_rate": 8e-05, + "loss": 1.7105, + "step": 391 + }, + { + "epoch": 0.02185061315496098, + "grad_norm": 0.42208924889564514, + "learning_rate": 8e-05, + "loss": 2.0502, + "step": 392 + }, + { + "epoch": 0.02190635451505017, + "grad_norm": 0.37751758098602295, + "learning_rate": 8e-05, + "loss": 1.65, + "step": 393 + }, + { + "epoch": 0.021962095875139354, + "grad_norm": 0.4493613839149475, + "learning_rate": 8e-05, + "loss": 2.0241, + "step": 394 + }, + { + "epoch": 0.02201783723522854, + "grad_norm": 0.3963421583175659, + "learning_rate": 8e-05, + "loss": 1.6982, + "step": 395 + }, + { + "epoch": 0.022073578595317726, + "grad_norm": 0.41097620129585266, + "learning_rate": 8e-05, + 
"loss": 1.894, + "step": 396 + }, + { + "epoch": 0.02212931995540691, + "grad_norm": 0.4165438711643219, + "learning_rate": 8e-05, + "loss": 1.9027, + "step": 397 + }, + { + "epoch": 0.022185061315496097, + "grad_norm": 0.42101776599884033, + "learning_rate": 8e-05, + "loss": 1.9453, + "step": 398 + }, + { + "epoch": 0.022240802675585283, + "grad_norm": 0.47843456268310547, + "learning_rate": 8e-05, + "loss": 2.0013, + "step": 399 + }, + { + "epoch": 0.022296544035674472, + "grad_norm": 0.44337159395217896, + "learning_rate": 8e-05, + "loss": 1.9636, + "step": 400 + }, + { + "epoch": 0.022352285395763657, + "grad_norm": 0.39196833968162537, + "learning_rate": 8e-05, + "loss": 1.7866, + "step": 401 + }, + { + "epoch": 0.022408026755852843, + "grad_norm": 0.45299163460731506, + "learning_rate": 8e-05, + "loss": 1.8547, + "step": 402 + }, + { + "epoch": 0.02246376811594203, + "grad_norm": 0.37776532769203186, + "learning_rate": 8e-05, + "loss": 1.6039, + "step": 403 + }, + { + "epoch": 0.022519509476031215, + "grad_norm": 0.4386959969997406, + "learning_rate": 8e-05, + "loss": 1.765, + "step": 404 + }, + { + "epoch": 0.0225752508361204, + "grad_norm": 0.4808010756969452, + "learning_rate": 8e-05, + "loss": 1.5659, + "step": 405 + }, + { + "epoch": 0.022630992196209586, + "grad_norm": 0.4042125344276428, + "learning_rate": 8e-05, + "loss": 1.7711, + "step": 406 + }, + { + "epoch": 0.022686733556298775, + "grad_norm": 0.4260920584201813, + "learning_rate": 8e-05, + "loss": 1.6221, + "step": 407 + }, + { + "epoch": 0.02274247491638796, + "grad_norm": 0.4069453477859497, + "learning_rate": 8e-05, + "loss": 2.023, + "step": 408 + }, + { + "epoch": 0.022798216276477146, + "grad_norm": 0.38290277123451233, + "learning_rate": 8e-05, + "loss": 1.7505, + "step": 409 + }, + { + "epoch": 0.022853957636566332, + "grad_norm": 0.3907026946544647, + "learning_rate": 8e-05, + "loss": 1.8634, + "step": 410 + }, + { + "epoch": 0.022909698996655518, + "grad_norm": 0.4567570686340332, + 
"learning_rate": 8e-05, + "loss": 1.7813, + "step": 411 + }, + { + "epoch": 0.022965440356744703, + "grad_norm": 0.4327886998653412, + "learning_rate": 8e-05, + "loss": 1.8774, + "step": 412 + }, + { + "epoch": 0.02302118171683389, + "grad_norm": 0.4102731943130493, + "learning_rate": 8e-05, + "loss": 1.6824, + "step": 413 + }, + { + "epoch": 0.023076923076923078, + "grad_norm": 0.42088595032691956, + "learning_rate": 8e-05, + "loss": 1.8037, + "step": 414 + }, + { + "epoch": 0.023132664437012264, + "grad_norm": 0.40860041975975037, + "learning_rate": 8e-05, + "loss": 1.9645, + "step": 415 + }, + { + "epoch": 0.02318840579710145, + "grad_norm": 0.4047831594944, + "learning_rate": 8e-05, + "loss": 1.6969, + "step": 416 + }, + { + "epoch": 0.023244147157190635, + "grad_norm": 0.38738128542900085, + "learning_rate": 8e-05, + "loss": 1.7507, + "step": 417 + }, + { + "epoch": 0.02329988851727982, + "grad_norm": 0.4307592809200287, + "learning_rate": 8e-05, + "loss": 1.6861, + "step": 418 + }, + { + "epoch": 0.023355629877369007, + "grad_norm": 0.44377589225769043, + "learning_rate": 8e-05, + "loss": 1.5974, + "step": 419 + }, + { + "epoch": 0.023411371237458192, + "grad_norm": 0.42999017238616943, + "learning_rate": 8e-05, + "loss": 1.8469, + "step": 420 + }, + { + "epoch": 0.02346711259754738, + "grad_norm": 0.3810324966907501, + "learning_rate": 8e-05, + "loss": 1.5995, + "step": 421 + }, + { + "epoch": 0.023522853957636567, + "grad_norm": 0.42189478874206543, + "learning_rate": 8e-05, + "loss": 1.5233, + "step": 422 + }, + { + "epoch": 0.023578595317725753, + "grad_norm": 0.40228262543678284, + "learning_rate": 8e-05, + "loss": 1.5722, + "step": 423 + }, + { + "epoch": 0.02363433667781494, + "grad_norm": 0.44540834426879883, + "learning_rate": 8e-05, + "loss": 1.8335, + "step": 424 + }, + { + "epoch": 0.023690078037904124, + "grad_norm": 0.40963396430015564, + "learning_rate": 8e-05, + "loss": 1.6149, + "step": 425 + }, + { + "epoch": 0.02374581939799331, + 
"grad_norm": 0.4283663332462311, + "learning_rate": 8e-05, + "loss": 1.9118, + "step": 426 + }, + { + "epoch": 0.0238015607580825, + "grad_norm": 0.41021615266799927, + "learning_rate": 8e-05, + "loss": 1.5939, + "step": 427 + }, + { + "epoch": 0.023857302118171685, + "grad_norm": 0.40412041544914246, + "learning_rate": 8e-05, + "loss": 1.8285, + "step": 428 + }, + { + "epoch": 0.02391304347826087, + "grad_norm": 0.4041658639907837, + "learning_rate": 8e-05, + "loss": 1.8908, + "step": 429 + }, + { + "epoch": 0.023968784838350056, + "grad_norm": 0.44249221682548523, + "learning_rate": 8e-05, + "loss": 1.9286, + "step": 430 + }, + { + "epoch": 0.02402452619843924, + "grad_norm": 0.40249529480934143, + "learning_rate": 8e-05, + "loss": 1.7416, + "step": 431 + }, + { + "epoch": 0.024080267558528427, + "grad_norm": 0.4116725027561188, + "learning_rate": 8e-05, + "loss": 1.8227, + "step": 432 + }, + { + "epoch": 0.024136008918617613, + "grad_norm": 0.41705870628356934, + "learning_rate": 8e-05, + "loss": 1.7292, + "step": 433 + }, + { + "epoch": 0.024191750278706802, + "grad_norm": 0.428973525762558, + "learning_rate": 8e-05, + "loss": 1.8535, + "step": 434 + }, + { + "epoch": 0.024247491638795988, + "grad_norm": 0.39946043491363525, + "learning_rate": 8e-05, + "loss": 1.6571, + "step": 435 + }, + { + "epoch": 0.024303232998885173, + "grad_norm": 0.374835729598999, + "learning_rate": 8e-05, + "loss": 1.4186, + "step": 436 + }, + { + "epoch": 0.02435897435897436, + "grad_norm": 0.4034141004085541, + "learning_rate": 8e-05, + "loss": 1.5538, + "step": 437 + }, + { + "epoch": 0.024414715719063545, + "grad_norm": 0.5284610986709595, + "learning_rate": 8e-05, + "loss": 2.1798, + "step": 438 + }, + { + "epoch": 0.02447045707915273, + "grad_norm": 0.43948811292648315, + "learning_rate": 8e-05, + "loss": 1.8571, + "step": 439 + }, + { + "epoch": 0.024526198439241916, + "grad_norm": 0.4280640780925751, + "learning_rate": 8e-05, + "loss": 1.8725, + "step": 440 + }, + { + "epoch": 
0.024581939799331105, + "grad_norm": 0.45238423347473145, + "learning_rate": 8e-05, + "loss": 1.8458, + "step": 441 + }, + { + "epoch": 0.02463768115942029, + "grad_norm": 0.4630393981933594, + "learning_rate": 8e-05, + "loss": 2.1301, + "step": 442 + }, + { + "epoch": 0.024693422519509477, + "grad_norm": 0.38115811347961426, + "learning_rate": 8e-05, + "loss": 1.5022, + "step": 443 + }, + { + "epoch": 0.024749163879598662, + "grad_norm": 0.41422200202941895, + "learning_rate": 8e-05, + "loss": 1.6895, + "step": 444 + }, + { + "epoch": 0.024804905239687848, + "grad_norm": 0.398720383644104, + "learning_rate": 8e-05, + "loss": 1.7783, + "step": 445 + }, + { + "epoch": 0.024860646599777034, + "grad_norm": 0.41158318519592285, + "learning_rate": 8e-05, + "loss": 1.7808, + "step": 446 + }, + { + "epoch": 0.02491638795986622, + "grad_norm": 0.39195117354393005, + "learning_rate": 8e-05, + "loss": 1.669, + "step": 447 + }, + { + "epoch": 0.02497212931995541, + "grad_norm": 0.45583266019821167, + "learning_rate": 8e-05, + "loss": 1.8407, + "step": 448 + }, + { + "epoch": 0.025027870680044594, + "grad_norm": 0.40098410844802856, + "learning_rate": 8e-05, + "loss": 1.7442, + "step": 449 + }, + { + "epoch": 0.02508361204013378, + "grad_norm": 0.4209321439266205, + "learning_rate": 8e-05, + "loss": 1.9494, + "step": 450 + }, + { + "epoch": 0.025139353400222966, + "grad_norm": 0.42204686999320984, + "learning_rate": 8e-05, + "loss": 1.7992, + "step": 451 + }, + { + "epoch": 0.02519509476031215, + "grad_norm": 0.4665510654449463, + "learning_rate": 8e-05, + "loss": 2.1564, + "step": 452 + }, + { + "epoch": 0.025250836120401337, + "grad_norm": 0.40209999680519104, + "learning_rate": 8e-05, + "loss": 1.7804, + "step": 453 + }, + { + "epoch": 0.025306577480490523, + "grad_norm": 0.4011898636817932, + "learning_rate": 8e-05, + "loss": 1.7567, + "step": 454 + }, + { + "epoch": 0.025362318840579712, + "grad_norm": 0.41723349690437317, + "learning_rate": 8e-05, + "loss": 1.899, + 
"step": 455 + }, + { + "epoch": 0.025418060200668897, + "grad_norm": 0.4524818956851959, + "learning_rate": 8e-05, + "loss": 2.0513, + "step": 456 + }, + { + "epoch": 0.025473801560758083, + "grad_norm": 0.4193991422653198, + "learning_rate": 8e-05, + "loss": 1.8053, + "step": 457 + }, + { + "epoch": 0.02552954292084727, + "grad_norm": 0.5020811557769775, + "learning_rate": 8e-05, + "loss": 1.993, + "step": 458 + }, + { + "epoch": 0.025585284280936454, + "grad_norm": 0.39976030588150024, + "learning_rate": 8e-05, + "loss": 1.7527, + "step": 459 + }, + { + "epoch": 0.02564102564102564, + "grad_norm": 0.43179136514663696, + "learning_rate": 8e-05, + "loss": 1.6838, + "step": 460 + }, + { + "epoch": 0.025696767001114826, + "grad_norm": 0.4227728545665741, + "learning_rate": 8e-05, + "loss": 1.8846, + "step": 461 + }, + { + "epoch": 0.025752508361204015, + "grad_norm": 0.40155425667762756, + "learning_rate": 8e-05, + "loss": 1.662, + "step": 462 + }, + { + "epoch": 0.0258082497212932, + "grad_norm": 0.4173693358898163, + "learning_rate": 8e-05, + "loss": 1.854, + "step": 463 + }, + { + "epoch": 0.025863991081382386, + "grad_norm": 0.41311848163604736, + "learning_rate": 8e-05, + "loss": 1.7414, + "step": 464 + }, + { + "epoch": 0.025919732441471572, + "grad_norm": 0.4107280671596527, + "learning_rate": 8e-05, + "loss": 1.6538, + "step": 465 + }, + { + "epoch": 0.025975473801560758, + "grad_norm": 0.4078384339809418, + "learning_rate": 8e-05, + "loss": 1.8702, + "step": 466 + }, + { + "epoch": 0.026031215161649943, + "grad_norm": 0.41355952620506287, + "learning_rate": 8e-05, + "loss": 1.8909, + "step": 467 + }, + { + "epoch": 0.02608695652173913, + "grad_norm": 0.4284036159515381, + "learning_rate": 8e-05, + "loss": 1.8563, + "step": 468 + }, + { + "epoch": 0.026142697881828318, + "grad_norm": 0.41841962933540344, + "learning_rate": 8e-05, + "loss": 1.7837, + "step": 469 + }, + { + "epoch": 0.026198439241917504, + "grad_norm": 0.4622490406036377, + "learning_rate": 
8e-05, + "loss": 1.8756, + "step": 470 + }, + { + "epoch": 0.02625418060200669, + "grad_norm": 0.4452904760837555, + "learning_rate": 8e-05, + "loss": 2.0227, + "step": 471 + }, + { + "epoch": 0.026309921962095875, + "grad_norm": 0.47404196858406067, + "learning_rate": 8e-05, + "loss": 2.0884, + "step": 472 + }, + { + "epoch": 0.02636566332218506, + "grad_norm": 0.4138879179954529, + "learning_rate": 8e-05, + "loss": 1.8301, + "step": 473 + }, + { + "epoch": 0.026421404682274247, + "grad_norm": 0.3863980174064636, + "learning_rate": 8e-05, + "loss": 1.6119, + "step": 474 + }, + { + "epoch": 0.026477146042363432, + "grad_norm": 0.42295849323272705, + "learning_rate": 8e-05, + "loss": 1.752, + "step": 475 + }, + { + "epoch": 0.02653288740245262, + "grad_norm": 0.40239834785461426, + "learning_rate": 8e-05, + "loss": 2.001, + "step": 476 + }, + { + "epoch": 0.026588628762541807, + "grad_norm": 0.4178367555141449, + "learning_rate": 8e-05, + "loss": 1.8134, + "step": 477 + }, + { + "epoch": 0.026644370122630993, + "grad_norm": 0.41569623351097107, + "learning_rate": 8e-05, + "loss": 1.7635, + "step": 478 + }, + { + "epoch": 0.02670011148272018, + "grad_norm": 0.40798434615135193, + "learning_rate": 8e-05, + "loss": 1.787, + "step": 479 + }, + { + "epoch": 0.026755852842809364, + "grad_norm": 0.3902462124824524, + "learning_rate": 8e-05, + "loss": 1.6576, + "step": 480 + }, + { + "epoch": 0.02681159420289855, + "grad_norm": 0.3853597044944763, + "learning_rate": 8e-05, + "loss": 1.5487, + "step": 481 + }, + { + "epoch": 0.026867335562987735, + "grad_norm": 0.43139299750328064, + "learning_rate": 8e-05, + "loss": 1.9277, + "step": 482 + }, + { + "epoch": 0.026923076923076925, + "grad_norm": 0.41244861483573914, + "learning_rate": 8e-05, + "loss": 1.7304, + "step": 483 + }, + { + "epoch": 0.02697881828316611, + "grad_norm": 0.4112268090248108, + "learning_rate": 8e-05, + "loss": 1.7498, + "step": 484 + }, + { + "epoch": 0.027034559643255296, + "grad_norm": 
0.40720945596694946, + "learning_rate": 8e-05, + "loss": 1.7, + "step": 485 + }, + { + "epoch": 0.02709030100334448, + "grad_norm": 0.45372486114501953, + "learning_rate": 8e-05, + "loss": 2.0107, + "step": 486 + }, + { + "epoch": 0.027146042363433667, + "grad_norm": 0.429392546415329, + "learning_rate": 8e-05, + "loss": 1.7736, + "step": 487 + }, + { + "epoch": 0.027201783723522853, + "grad_norm": 0.38353031873703003, + "learning_rate": 8e-05, + "loss": 1.6865, + "step": 488 + }, + { + "epoch": 0.02725752508361204, + "grad_norm": 0.41096165776252747, + "learning_rate": 8e-05, + "loss": 1.6531, + "step": 489 + }, + { + "epoch": 0.027313266443701228, + "grad_norm": 0.395088255405426, + "learning_rate": 8e-05, + "loss": 1.6689, + "step": 490 + }, + { + "epoch": 0.027369007803790413, + "grad_norm": 0.4200167655944824, + "learning_rate": 8e-05, + "loss": 1.8258, + "step": 491 + }, + { + "epoch": 0.0274247491638796, + "grad_norm": 0.3914308249950409, + "learning_rate": 8e-05, + "loss": 1.7873, + "step": 492 + }, + { + "epoch": 0.027480490523968785, + "grad_norm": 0.4021241068840027, + "learning_rate": 8e-05, + "loss": 1.7569, + "step": 493 + }, + { + "epoch": 0.02753623188405797, + "grad_norm": 0.42652398347854614, + "learning_rate": 8e-05, + "loss": 1.913, + "step": 494 + }, + { + "epoch": 0.027591973244147156, + "grad_norm": 0.43813228607177734, + "learning_rate": 8e-05, + "loss": 2.0001, + "step": 495 + }, + { + "epoch": 0.027647714604236342, + "grad_norm": 0.42736557126045227, + "learning_rate": 8e-05, + "loss": 1.751, + "step": 496 + }, + { + "epoch": 0.02770345596432553, + "grad_norm": 0.44482421875, + "learning_rate": 8e-05, + "loss": 1.9259, + "step": 497 + }, + { + "epoch": 0.027759197324414717, + "grad_norm": 0.4331215023994446, + "learning_rate": 8e-05, + "loss": 1.9296, + "step": 498 + }, + { + "epoch": 0.027814938684503902, + "grad_norm": 0.42177051305770874, + "learning_rate": 8e-05, + "loss": 1.9159, + "step": 499 + }, + { + "epoch": 0.027870680044593088, 
+ "grad_norm": 0.4275566339492798, + "learning_rate": 8e-05, + "loss": 1.7328, + "step": 500 + }, + { + "epoch": 0.027926421404682274, + "grad_norm": 0.4206371605396271, + "learning_rate": 8e-05, + "loss": 1.8639, + "step": 501 + }, + { + "epoch": 0.02798216276477146, + "grad_norm": 0.4210817813873291, + "learning_rate": 8e-05, + "loss": 1.772, + "step": 502 + }, + { + "epoch": 0.028037904124860645, + "grad_norm": 0.41412290930747986, + "learning_rate": 8e-05, + "loss": 1.9401, + "step": 503 + }, + { + "epoch": 0.028093645484949834, + "grad_norm": 0.44537246227264404, + "learning_rate": 8e-05, + "loss": 2.1035, + "step": 504 + }, + { + "epoch": 0.02814938684503902, + "grad_norm": 0.3925379812717438, + "learning_rate": 8e-05, + "loss": 1.7115, + "step": 505 + }, + { + "epoch": 0.028205128205128206, + "grad_norm": 0.4123174846172333, + "learning_rate": 8e-05, + "loss": 1.7238, + "step": 506 + }, + { + "epoch": 0.02826086956521739, + "grad_norm": 0.4027958810329437, + "learning_rate": 8e-05, + "loss": 1.7405, + "step": 507 + }, + { + "epoch": 0.028316610925306577, + "grad_norm": 0.3883973956108093, + "learning_rate": 8e-05, + "loss": 1.7822, + "step": 508 + }, + { + "epoch": 0.028372352285395763, + "grad_norm": 0.42848825454711914, + "learning_rate": 8e-05, + "loss": 1.7291, + "step": 509 + }, + { + "epoch": 0.028428093645484948, + "grad_norm": 0.41434234380722046, + "learning_rate": 8e-05, + "loss": 1.7974, + "step": 510 + }, + { + "epoch": 0.028483835005574137, + "grad_norm": 0.44031184911727905, + "learning_rate": 8e-05, + "loss": 1.7749, + "step": 511 + }, + { + "epoch": 0.028539576365663323, + "grad_norm": 0.40377092361450195, + "learning_rate": 8e-05, + "loss": 1.5409, + "step": 512 + }, + { + "epoch": 0.02859531772575251, + "grad_norm": 0.4525328278541565, + "learning_rate": 8e-05, + "loss": 1.9709, + "step": 513 + }, + { + "epoch": 0.028651059085841694, + "grad_norm": 0.417842298746109, + "learning_rate": 8e-05, + "loss": 1.8349, + "step": 514 + }, + { + 
"epoch": 0.02870680044593088, + "grad_norm": 0.40975886583328247, + "learning_rate": 8e-05, + "loss": 1.7792, + "step": 515 + }, + { + "epoch": 0.028762541806020066, + "grad_norm": 0.40695229172706604, + "learning_rate": 8e-05, + "loss": 1.7144, + "step": 516 + }, + { + "epoch": 0.02881828316610925, + "grad_norm": 0.4242783784866333, + "learning_rate": 8e-05, + "loss": 1.8216, + "step": 517 + }, + { + "epoch": 0.02887402452619844, + "grad_norm": 0.40845513343811035, + "learning_rate": 8e-05, + "loss": 1.6328, + "step": 518 + }, + { + "epoch": 0.028929765886287626, + "grad_norm": 0.444774329662323, + "learning_rate": 8e-05, + "loss": 1.9723, + "step": 519 + }, + { + "epoch": 0.028985507246376812, + "grad_norm": 0.43584856390953064, + "learning_rate": 8e-05, + "loss": 1.7096, + "step": 520 + }, + { + "epoch": 0.029041248606465998, + "grad_norm": 0.39967840909957886, + "learning_rate": 8e-05, + "loss": 1.7449, + "step": 521 + }, + { + "epoch": 0.029096989966555183, + "grad_norm": 0.46941077709198, + "learning_rate": 8e-05, + "loss": 1.8926, + "step": 522 + }, + { + "epoch": 0.02915273132664437, + "grad_norm": 0.43449297547340393, + "learning_rate": 8e-05, + "loss": 1.7825, + "step": 523 + }, + { + "epoch": 0.029208472686733555, + "grad_norm": 0.3803488314151764, + "learning_rate": 8e-05, + "loss": 1.6198, + "step": 524 + }, + { + "epoch": 0.029264214046822744, + "grad_norm": 0.4477572739124298, + "learning_rate": 8e-05, + "loss": 1.7894, + "step": 525 + }, + { + "epoch": 0.02931995540691193, + "grad_norm": 0.3807078003883362, + "learning_rate": 8e-05, + "loss": 1.576, + "step": 526 + }, + { + "epoch": 0.029375696767001115, + "grad_norm": 0.45292210578918457, + "learning_rate": 8e-05, + "loss": 1.9879, + "step": 527 + }, + { + "epoch": 0.0294314381270903, + "grad_norm": 0.460365355014801, + "learning_rate": 8e-05, + "loss": 1.924, + "step": 528 + }, + { + "epoch": 0.029487179487179487, + "grad_norm": 0.4228498339653015, + "learning_rate": 8e-05, + "loss": 1.8188, + 
"step": 529 + }, + { + "epoch": 0.029542920847268672, + "grad_norm": 0.3947596251964569, + "learning_rate": 8e-05, + "loss": 1.6433, + "step": 530 + }, + { + "epoch": 0.029598662207357858, + "grad_norm": 0.3465898334980011, + "learning_rate": 8e-05, + "loss": 1.1918, + "step": 531 + }, + { + "epoch": 0.029654403567447047, + "grad_norm": 0.3989419639110565, + "learning_rate": 8e-05, + "loss": 1.7479, + "step": 532 + }, + { + "epoch": 0.029710144927536233, + "grad_norm": 0.4237585663795471, + "learning_rate": 8e-05, + "loss": 1.8069, + "step": 533 + }, + { + "epoch": 0.02976588628762542, + "grad_norm": 0.39906057715415955, + "learning_rate": 8e-05, + "loss": 1.6856, + "step": 534 + }, + { + "epoch": 0.029821627647714604, + "grad_norm": 0.4093904197216034, + "learning_rate": 8e-05, + "loss": 1.7184, + "step": 535 + }, + { + "epoch": 0.02987736900780379, + "grad_norm": 0.4156496524810791, + "learning_rate": 8e-05, + "loss": 1.753, + "step": 536 + }, + { + "epoch": 0.029933110367892975, + "grad_norm": 0.4160612225532532, + "learning_rate": 8e-05, + "loss": 1.8341, + "step": 537 + }, + { + "epoch": 0.02998885172798216, + "grad_norm": 0.3792247474193573, + "learning_rate": 8e-05, + "loss": 1.6885, + "step": 538 + }, + { + "epoch": 0.03004459308807135, + "grad_norm": 0.4268282353878021, + "learning_rate": 8e-05, + "loss": 1.8001, + "step": 539 + }, + { + "epoch": 0.030100334448160536, + "grad_norm": 0.411487340927124, + "learning_rate": 8e-05, + "loss": 1.7674, + "step": 540 + }, + { + "epoch": 0.03015607580824972, + "grad_norm": 0.37929436564445496, + "learning_rate": 8e-05, + "loss": 1.5027, + "step": 541 + }, + { + "epoch": 0.030211817168338907, + "grad_norm": 0.4008893072605133, + "learning_rate": 8e-05, + "loss": 1.6987, + "step": 542 + }, + { + "epoch": 0.030267558528428093, + "grad_norm": 0.4297763407230377, + "learning_rate": 8e-05, + "loss": 1.8859, + "step": 543 + }, + { + "epoch": 0.03032329988851728, + "grad_norm": 0.3972001373767853, + "learning_rate": 8e-05, 
+ "loss": 1.626, + "step": 544 + }, + { + "epoch": 0.030379041248606464, + "grad_norm": 0.44114306569099426, + "learning_rate": 8e-05, + "loss": 1.7328, + "step": 545 + }, + { + "epoch": 0.030434782608695653, + "grad_norm": 0.38401421904563904, + "learning_rate": 8e-05, + "loss": 1.6671, + "step": 546 + }, + { + "epoch": 0.03049052396878484, + "grad_norm": 0.4116717576980591, + "learning_rate": 8e-05, + "loss": 1.8202, + "step": 547 + }, + { + "epoch": 0.030546265328874025, + "grad_norm": 0.4649137854576111, + "learning_rate": 8e-05, + "loss": 2.0108, + "step": 548 + }, + { + "epoch": 0.03060200668896321, + "grad_norm": 0.3979105055332184, + "learning_rate": 8e-05, + "loss": 1.7386, + "step": 549 + }, + { + "epoch": 0.030657748049052396, + "grad_norm": 0.39438769221305847, + "learning_rate": 8e-05, + "loss": 1.6171, + "step": 550 + }, + { + "epoch": 0.030713489409141582, + "grad_norm": 0.41145819425582886, + "learning_rate": 8e-05, + "loss": 1.7747, + "step": 551 + }, + { + "epoch": 0.03076923076923077, + "grad_norm": 0.37144097685813904, + "learning_rate": 8e-05, + "loss": 1.6617, + "step": 552 + }, + { + "epoch": 0.030824972129319957, + "grad_norm": 0.40567418932914734, + "learning_rate": 8e-05, + "loss": 1.5851, + "step": 553 + }, + { + "epoch": 0.030880713489409142, + "grad_norm": 0.3914574980735779, + "learning_rate": 8e-05, + "loss": 1.838, + "step": 554 + }, + { + "epoch": 0.030936454849498328, + "grad_norm": 0.4121246337890625, + "learning_rate": 8e-05, + "loss": 1.8852, + "step": 555 + }, + { + "epoch": 0.030992196209587514, + "grad_norm": 0.4243144989013672, + "learning_rate": 8e-05, + "loss": 1.8316, + "step": 556 + }, + { + "epoch": 0.0310479375696767, + "grad_norm": 0.4198228716850281, + "learning_rate": 8e-05, + "loss": 1.7642, + "step": 557 + }, + { + "epoch": 0.031103678929765885, + "grad_norm": 0.41224730014801025, + "learning_rate": 8e-05, + "loss": 1.9605, + "step": 558 + }, + { + "epoch": 0.031159420289855074, + "grad_norm": 0.4147040545940399, 
+ "learning_rate": 8e-05, + "loss": 1.7033, + "step": 559 + }, + { + "epoch": 0.03121516164994426, + "grad_norm": 0.4140629768371582, + "learning_rate": 8e-05, + "loss": 1.9389, + "step": 560 + }, + { + "epoch": 0.03127090301003344, + "grad_norm": 0.42293891310691833, + "learning_rate": 8e-05, + "loss": 1.7608, + "step": 561 + }, + { + "epoch": 0.03132664437012263, + "grad_norm": 0.398647665977478, + "learning_rate": 8e-05, + "loss": 1.7532, + "step": 562 + }, + { + "epoch": 0.03138238573021182, + "grad_norm": 0.4059430956840515, + "learning_rate": 8e-05, + "loss": 1.7975, + "step": 563 + }, + { + "epoch": 0.031438127090301006, + "grad_norm": 0.45980408787727356, + "learning_rate": 8e-05, + "loss": 1.8965, + "step": 564 + }, + { + "epoch": 0.03149386845039019, + "grad_norm": 0.4658120274543762, + "learning_rate": 8e-05, + "loss": 2.0726, + "step": 565 + }, + { + "epoch": 0.03154960981047938, + "grad_norm": 0.46347862482070923, + "learning_rate": 8e-05, + "loss": 1.8033, + "step": 566 + }, + { + "epoch": 0.03160535117056856, + "grad_norm": 0.40268826484680176, + "learning_rate": 8e-05, + "loss": 1.7909, + "step": 567 + }, + { + "epoch": 0.03166109253065775, + "grad_norm": 0.4346879720687866, + "learning_rate": 8e-05, + "loss": 1.7894, + "step": 568 + }, + { + "epoch": 0.031716833890746934, + "grad_norm": 0.423518568277359, + "learning_rate": 8e-05, + "loss": 1.8991, + "step": 569 + }, + { + "epoch": 0.03177257525083612, + "grad_norm": 0.42770105600357056, + "learning_rate": 8e-05, + "loss": 1.8731, + "step": 570 + }, + { + "epoch": 0.031828316610925306, + "grad_norm": 0.4165306091308594, + "learning_rate": 8e-05, + "loss": 1.7888, + "step": 571 + }, + { + "epoch": 0.03188405797101449, + "grad_norm": 0.40096932649612427, + "learning_rate": 8e-05, + "loss": 1.947, + "step": 572 + }, + { + "epoch": 0.03193979933110368, + "grad_norm": 0.3981882631778717, + "learning_rate": 8e-05, + "loss": 1.715, + "step": 573 + }, + { + "epoch": 0.03199554069119286, + "grad_norm": 
0.4129897654056549, + "learning_rate": 8e-05, + "loss": 1.8301, + "step": 574 + }, + { + "epoch": 0.03205128205128205, + "grad_norm": 0.40707018971443176, + "learning_rate": 8e-05, + "loss": 1.4342, + "step": 575 + }, + { + "epoch": 0.032107023411371234, + "grad_norm": 0.4106016457080841, + "learning_rate": 8e-05, + "loss": 1.6461, + "step": 576 + }, + { + "epoch": 0.03216276477146043, + "grad_norm": 0.4073600769042969, + "learning_rate": 8e-05, + "loss": 1.5317, + "step": 577 + }, + { + "epoch": 0.03221850613154961, + "grad_norm": 0.4160962402820587, + "learning_rate": 8e-05, + "loss": 1.8114, + "step": 578 + }, + { + "epoch": 0.0322742474916388, + "grad_norm": 0.4713279902935028, + "learning_rate": 8e-05, + "loss": 2.0019, + "step": 579 + }, + { + "epoch": 0.032329988851727984, + "grad_norm": 0.40895307064056396, + "learning_rate": 8e-05, + "loss": 1.8948, + "step": 580 + }, + { + "epoch": 0.03238573021181717, + "grad_norm": 0.37472397089004517, + "learning_rate": 8e-05, + "loss": 1.7124, + "step": 581 + }, + { + "epoch": 0.032441471571906355, + "grad_norm": 0.3960634171962738, + "learning_rate": 8e-05, + "loss": 1.7363, + "step": 582 + }, + { + "epoch": 0.03249721293199554, + "grad_norm": 0.3730868101119995, + "learning_rate": 8e-05, + "loss": 1.7037, + "step": 583 + }, + { + "epoch": 0.032552954292084726, + "grad_norm": 0.36749905347824097, + "learning_rate": 8e-05, + "loss": 1.5133, + "step": 584 + }, + { + "epoch": 0.03260869565217391, + "grad_norm": 0.40374237298965454, + "learning_rate": 8e-05, + "loss": 1.7778, + "step": 585 + }, + { + "epoch": 0.0326644370122631, + "grad_norm": 0.43943536281585693, + "learning_rate": 8e-05, + "loss": 1.957, + "step": 586 + }, + { + "epoch": 0.032720178372352283, + "grad_norm": 0.4169772267341614, + "learning_rate": 8e-05, + "loss": 1.7456, + "step": 587 + }, + { + "epoch": 0.03277591973244147, + "grad_norm": 0.4045918583869934, + "learning_rate": 8e-05, + "loss": 1.6216, + "step": 588 + }, + { + "epoch": 
0.032831661092530655, + "grad_norm": 0.41666683554649353, + "learning_rate": 8e-05, + "loss": 1.9929, + "step": 589 + }, + { + "epoch": 0.03288740245261984, + "grad_norm": 0.38340914249420166, + "learning_rate": 8e-05, + "loss": 1.6984, + "step": 590 + }, + { + "epoch": 0.03294314381270903, + "grad_norm": 0.41442227363586426, + "learning_rate": 8e-05, + "loss": 1.8554, + "step": 591 + }, + { + "epoch": 0.03299888517279822, + "grad_norm": 0.4207206666469574, + "learning_rate": 8e-05, + "loss": 1.8215, + "step": 592 + }, + { + "epoch": 0.033054626532887404, + "grad_norm": 0.4293190538883209, + "learning_rate": 8e-05, + "loss": 1.9728, + "step": 593 + }, + { + "epoch": 0.03311036789297659, + "grad_norm": 0.4169306755065918, + "learning_rate": 8e-05, + "loss": 1.8503, + "step": 594 + }, + { + "epoch": 0.033166109253065776, + "grad_norm": 0.4055793583393097, + "learning_rate": 8e-05, + "loss": 1.7387, + "step": 595 + }, + { + "epoch": 0.03322185061315496, + "grad_norm": 0.4573780596256256, + "learning_rate": 8e-05, + "loss": 1.911, + "step": 596 + }, + { + "epoch": 0.03327759197324415, + "grad_norm": 0.3761114180088043, + "learning_rate": 8e-05, + "loss": 1.6057, + "step": 597 + }, + { + "epoch": 0.03333333333333333, + "grad_norm": 0.4141940772533417, + "learning_rate": 8e-05, + "loss": 1.7041, + "step": 598 + }, + { + "epoch": 0.03338907469342252, + "grad_norm": 0.4186020791530609, + "learning_rate": 8e-05, + "loss": 1.8301, + "step": 599 + }, + { + "epoch": 0.033444816053511704, + "grad_norm": 0.42576029896736145, + "learning_rate": 8e-05, + "loss": 1.8344, + "step": 600 + }, + { + "epoch": 0.03350055741360089, + "grad_norm": 0.41014382243156433, + "learning_rate": 8e-05, + "loss": 1.8637, + "step": 601 + }, + { + "epoch": 0.033556298773690076, + "grad_norm": 0.4340376555919647, + "learning_rate": 8e-05, + "loss": 1.7651, + "step": 602 + }, + { + "epoch": 0.03361204013377926, + "grad_norm": 0.4143203794956207, + "learning_rate": 8e-05, + "loss": 1.7698, + "step": 603 
+ }, + { + "epoch": 0.03366778149386845, + "grad_norm": 0.42128315567970276, + "learning_rate": 8e-05, + "loss": 1.7218, + "step": 604 + }, + { + "epoch": 0.03372352285395764, + "grad_norm": 0.4313509166240692, + "learning_rate": 8e-05, + "loss": 1.8213, + "step": 605 + }, + { + "epoch": 0.033779264214046825, + "grad_norm": 0.3960542678833008, + "learning_rate": 8e-05, + "loss": 1.4856, + "step": 606 + }, + { + "epoch": 0.03383500557413601, + "grad_norm": 0.4187854528427124, + "learning_rate": 8e-05, + "loss": 1.7748, + "step": 607 + }, + { + "epoch": 0.0338907469342252, + "grad_norm": 0.4670550227165222, + "learning_rate": 8e-05, + "loss": 1.9125, + "step": 608 + }, + { + "epoch": 0.03394648829431438, + "grad_norm": 0.4087151885032654, + "learning_rate": 8e-05, + "loss": 1.6741, + "step": 609 + }, + { + "epoch": 0.03400222965440357, + "grad_norm": 0.4307301640510559, + "learning_rate": 8e-05, + "loss": 1.8006, + "step": 610 + }, + { + "epoch": 0.034057971014492754, + "grad_norm": 0.46052247285842896, + "learning_rate": 8e-05, + "loss": 1.8099, + "step": 611 + }, + { + "epoch": 0.03411371237458194, + "grad_norm": 0.41967251896858215, + "learning_rate": 8e-05, + "loss": 1.7435, + "step": 612 + }, + { + "epoch": 0.034169453734671125, + "grad_norm": 0.42859208583831787, + "learning_rate": 8e-05, + "loss": 1.8204, + "step": 613 + }, + { + "epoch": 0.03422519509476031, + "grad_norm": 0.4440454840660095, + "learning_rate": 8e-05, + "loss": 1.8071, + "step": 614 + }, + { + "epoch": 0.034280936454849496, + "grad_norm": 0.4726400673389435, + "learning_rate": 8e-05, + "loss": 2.0625, + "step": 615 + }, + { + "epoch": 0.03433667781493868, + "grad_norm": 0.43250757455825806, + "learning_rate": 8e-05, + "loss": 1.8348, + "step": 616 + }, + { + "epoch": 0.03439241917502787, + "grad_norm": 0.4797033369541168, + "learning_rate": 8e-05, + "loss": 1.9095, + "step": 617 + }, + { + "epoch": 0.03444816053511706, + "grad_norm": 0.421512633562088, + "learning_rate": 8e-05, + "loss": 
1.717, + "step": 618 + }, + { + "epoch": 0.034503901895206246, + "grad_norm": 0.37292420864105225, + "learning_rate": 8e-05, + "loss": 1.651, + "step": 619 + }, + { + "epoch": 0.03455964325529543, + "grad_norm": 0.5055415630340576, + "learning_rate": 8e-05, + "loss": 1.6273, + "step": 620 + }, + { + "epoch": 0.03461538461538462, + "grad_norm": 0.4223185181617737, + "learning_rate": 8e-05, + "loss": 1.7644, + "step": 621 + }, + { + "epoch": 0.0346711259754738, + "grad_norm": 0.42007625102996826, + "learning_rate": 8e-05, + "loss": 1.7028, + "step": 622 + }, + { + "epoch": 0.03472686733556299, + "grad_norm": 0.4167858958244324, + "learning_rate": 8e-05, + "loss": 1.8177, + "step": 623 + }, + { + "epoch": 0.034782608695652174, + "grad_norm": 0.47208330035209656, + "learning_rate": 8e-05, + "loss": 1.7619, + "step": 624 + }, + { + "epoch": 0.03483835005574136, + "grad_norm": 0.3962777853012085, + "learning_rate": 8e-05, + "loss": 1.7748, + "step": 625 + }, + { + "epoch": 0.034894091415830546, + "grad_norm": 0.3653423488140106, + "learning_rate": 8e-05, + "loss": 1.5153, + "step": 626 + }, + { + "epoch": 0.03494983277591973, + "grad_norm": 0.38869357109069824, + "learning_rate": 8e-05, + "loss": 1.623, + "step": 627 + }, + { + "epoch": 0.03500557413600892, + "grad_norm": 0.43486788868904114, + "learning_rate": 8e-05, + "loss": 1.8753, + "step": 628 + }, + { + "epoch": 0.0350613154960981, + "grad_norm": 0.38648611307144165, + "learning_rate": 8e-05, + "loss": 1.6221, + "step": 629 + }, + { + "epoch": 0.03511705685618729, + "grad_norm": 0.41406360268592834, + "learning_rate": 8e-05, + "loss": 1.9074, + "step": 630 + }, + { + "epoch": 0.035172798216276474, + "grad_norm": 0.45213326811790466, + "learning_rate": 8e-05, + "loss": 1.9577, + "step": 631 + }, + { + "epoch": 0.03522853957636567, + "grad_norm": 0.4049549698829651, + "learning_rate": 8e-05, + "loss": 1.8268, + "step": 632 + }, + { + "epoch": 0.03528428093645485, + "grad_norm": 0.39177536964416504, + 
"learning_rate": 8e-05, + "loss": 1.3487, + "step": 633 + }, + { + "epoch": 0.03534002229654404, + "grad_norm": 0.42474237084388733, + "learning_rate": 8e-05, + "loss": 1.8953, + "step": 634 + }, + { + "epoch": 0.035395763656633224, + "grad_norm": 0.42674046754837036, + "learning_rate": 8e-05, + "loss": 1.8642, + "step": 635 + }, + { + "epoch": 0.03545150501672241, + "grad_norm": 0.4271129071712494, + "learning_rate": 8e-05, + "loss": 1.8885, + "step": 636 + }, + { + "epoch": 0.035507246376811595, + "grad_norm": 0.43485212326049805, + "learning_rate": 8e-05, + "loss": 1.812, + "step": 637 + }, + { + "epoch": 0.03556298773690078, + "grad_norm": 0.4561612606048584, + "learning_rate": 8e-05, + "loss": 1.8138, + "step": 638 + }, + { + "epoch": 0.035618729096989966, + "grad_norm": 0.4725930094718933, + "learning_rate": 8e-05, + "loss": 1.9059, + "step": 639 + }, + { + "epoch": 0.03567447045707915, + "grad_norm": 0.468065083026886, + "learning_rate": 8e-05, + "loss": 1.7419, + "step": 640 + }, + { + "epoch": 0.03573021181716834, + "grad_norm": 0.43373408913612366, + "learning_rate": 8e-05, + "loss": 1.9229, + "step": 641 + }, + { + "epoch": 0.03578595317725752, + "grad_norm": 0.4082108736038208, + "learning_rate": 8e-05, + "loss": 1.8043, + "step": 642 + }, + { + "epoch": 0.03584169453734671, + "grad_norm": 0.3930438756942749, + "learning_rate": 8e-05, + "loss": 1.6384, + "step": 643 + }, + { + "epoch": 0.035897435897435895, + "grad_norm": 0.39798983931541443, + "learning_rate": 8e-05, + "loss": 1.7297, + "step": 644 + }, + { + "epoch": 0.03595317725752508, + "grad_norm": 0.4810659885406494, + "learning_rate": 8e-05, + "loss": 1.8956, + "step": 645 + }, + { + "epoch": 0.03600891861761427, + "grad_norm": 0.4323861300945282, + "learning_rate": 8e-05, + "loss": 1.8167, + "step": 646 + }, + { + "epoch": 0.03606465997770346, + "grad_norm": 0.42571496963500977, + "learning_rate": 8e-05, + "loss": 1.9223, + "step": 647 + }, + { + "epoch": 0.036120401337792644, + "grad_norm": 
0.4450274109840393, + "learning_rate": 8e-05, + "loss": 1.8361, + "step": 648 + }, + { + "epoch": 0.03617614269788183, + "grad_norm": 0.4110467731952667, + "learning_rate": 8e-05, + "loss": 1.6817, + "step": 649 + }, + { + "epoch": 0.036231884057971016, + "grad_norm": 0.39012429118156433, + "learning_rate": 8e-05, + "loss": 1.6442, + "step": 650 + }, + { + "epoch": 0.0362876254180602, + "grad_norm": 0.39485234022140503, + "learning_rate": 8e-05, + "loss": 1.7741, + "step": 651 + }, + { + "epoch": 0.03634336677814939, + "grad_norm": 0.4552437961101532, + "learning_rate": 8e-05, + "loss": 1.8181, + "step": 652 + }, + { + "epoch": 0.03639910813823857, + "grad_norm": 0.4294509291648865, + "learning_rate": 8e-05, + "loss": 1.6509, + "step": 653 + }, + { + "epoch": 0.03645484949832776, + "grad_norm": 0.3852769136428833, + "learning_rate": 8e-05, + "loss": 1.6274, + "step": 654 + }, + { + "epoch": 0.036510590858416944, + "grad_norm": 0.4140886962413788, + "learning_rate": 8e-05, + "loss": 1.6733, + "step": 655 + }, + { + "epoch": 0.03656633221850613, + "grad_norm": 0.4174540638923645, + "learning_rate": 8e-05, + "loss": 1.7846, + "step": 656 + }, + { + "epoch": 0.036622073578595316, + "grad_norm": 0.41302043199539185, + "learning_rate": 8e-05, + "loss": 1.6438, + "step": 657 + }, + { + "epoch": 0.0366778149386845, + "grad_norm": 0.44165873527526855, + "learning_rate": 8e-05, + "loss": 1.5783, + "step": 658 + }, + { + "epoch": 0.03673355629877369, + "grad_norm": 0.4951744079589844, + "learning_rate": 8e-05, + "loss": 1.9361, + "step": 659 + }, + { + "epoch": 0.03678929765886288, + "grad_norm": 0.4248286485671997, + "learning_rate": 8e-05, + "loss": 1.7216, + "step": 660 + }, + { + "epoch": 0.036845039018952065, + "grad_norm": 0.4633992314338684, + "learning_rate": 8e-05, + "loss": 1.7314, + "step": 661 + }, + { + "epoch": 0.03690078037904125, + "grad_norm": 0.3990238606929779, + "learning_rate": 8e-05, + "loss": 1.6677, + "step": 662 + }, + { + "epoch": 
0.03695652173913044, + "grad_norm": 0.40574949979782104, + "learning_rate": 8e-05, + "loss": 1.6418, + "step": 663 + }, + { + "epoch": 0.03701226309921962, + "grad_norm": 0.37986770272254944, + "learning_rate": 8e-05, + "loss": 1.5045, + "step": 664 + }, + { + "epoch": 0.03706800445930881, + "grad_norm": 0.4152768552303314, + "learning_rate": 8e-05, + "loss": 1.6917, + "step": 665 + }, + { + "epoch": 0.037123745819397994, + "grad_norm": 0.4041266441345215, + "learning_rate": 8e-05, + "loss": 1.7146, + "step": 666 + }, + { + "epoch": 0.03717948717948718, + "grad_norm": 0.46567487716674805, + "learning_rate": 8e-05, + "loss": 1.8633, + "step": 667 + }, + { + "epoch": 0.037235228539576365, + "grad_norm": 0.4492206275463104, + "learning_rate": 8e-05, + "loss": 1.8854, + "step": 668 + }, + { + "epoch": 0.03729096989966555, + "grad_norm": 0.4595526456832886, + "learning_rate": 8e-05, + "loss": 1.458, + "step": 669 + }, + { + "epoch": 0.037346711259754736, + "grad_norm": 0.43089577555656433, + "learning_rate": 8e-05, + "loss": 1.9997, + "step": 670 + }, + { + "epoch": 0.03740245261984392, + "grad_norm": 0.4291883409023285, + "learning_rate": 8e-05, + "loss": 1.7632, + "step": 671 + }, + { + "epoch": 0.03745819397993311, + "grad_norm": 0.42062389850616455, + "learning_rate": 8e-05, + "loss": 1.7392, + "step": 672 + }, + { + "epoch": 0.03751393534002229, + "grad_norm": 0.4016992449760437, + "learning_rate": 8e-05, + "loss": 1.7487, + "step": 673 + }, + { + "epoch": 0.037569676700111486, + "grad_norm": 0.40747544169425964, + "learning_rate": 8e-05, + "loss": 1.6961, + "step": 674 + }, + { + "epoch": 0.03762541806020067, + "grad_norm": 0.40056928992271423, + "learning_rate": 8e-05, + "loss": 1.6378, + "step": 675 + }, + { + "epoch": 0.03768115942028986, + "grad_norm": 0.4546568989753723, + "learning_rate": 8e-05, + "loss": 1.928, + "step": 676 + }, + { + "epoch": 0.03773690078037904, + "grad_norm": 0.41371530294418335, + "learning_rate": 8e-05, + "loss": 1.7306, + "step": 677 
+ }, + { + "epoch": 0.03779264214046823, + "grad_norm": 0.4063621163368225, + "learning_rate": 8e-05, + "loss": 1.8541, + "step": 678 + }, + { + "epoch": 0.037848383500557414, + "grad_norm": 0.45284903049468994, + "learning_rate": 8e-05, + "loss": 1.7621, + "step": 679 + }, + { + "epoch": 0.0379041248606466, + "grad_norm": 0.3936507999897003, + "learning_rate": 8e-05, + "loss": 1.6523, + "step": 680 + }, + { + "epoch": 0.037959866220735786, + "grad_norm": 0.4094870984554291, + "learning_rate": 8e-05, + "loss": 1.8397, + "step": 681 + }, + { + "epoch": 0.03801560758082497, + "grad_norm": 0.4652598798274994, + "learning_rate": 8e-05, + "loss": 1.956, + "step": 682 + }, + { + "epoch": 0.03807134894091416, + "grad_norm": 0.4646802842617035, + "learning_rate": 8e-05, + "loss": 2.0511, + "step": 683 + }, + { + "epoch": 0.03812709030100334, + "grad_norm": 0.4306330680847168, + "learning_rate": 8e-05, + "loss": 1.7777, + "step": 684 + }, + { + "epoch": 0.03818283166109253, + "grad_norm": 0.4153509736061096, + "learning_rate": 8e-05, + "loss": 1.7838, + "step": 685 + }, + { + "epoch": 0.038238573021181714, + "grad_norm": 0.4056133031845093, + "learning_rate": 8e-05, + "loss": 1.6948, + "step": 686 + }, + { + "epoch": 0.0382943143812709, + "grad_norm": 0.42202329635620117, + "learning_rate": 8e-05, + "loss": 1.574, + "step": 687 + }, + { + "epoch": 0.03835005574136009, + "grad_norm": 0.4424411952495575, + "learning_rate": 8e-05, + "loss": 1.7239, + "step": 688 + }, + { + "epoch": 0.03840579710144928, + "grad_norm": 0.404994934797287, + "learning_rate": 8e-05, + "loss": 1.6564, + "step": 689 + }, + { + "epoch": 0.038461538461538464, + "grad_norm": 0.44672891497612, + "learning_rate": 8e-05, + "loss": 2.0652, + "step": 690 + }, + { + "epoch": 0.03851727982162765, + "grad_norm": 0.357619971036911, + "learning_rate": 8e-05, + "loss": 1.5158, + "step": 691 + }, + { + "epoch": 0.038573021181716835, + "grad_norm": 0.4439055621623993, + "learning_rate": 8e-05, + "loss": 1.9357, + 
"step": 692 + }, + { + "epoch": 0.03862876254180602, + "grad_norm": 0.4590149521827698, + "learning_rate": 8e-05, + "loss": 1.8221, + "step": 693 + }, + { + "epoch": 0.038684503901895206, + "grad_norm": 0.4029330909252167, + "learning_rate": 8e-05, + "loss": 1.5921, + "step": 694 + }, + { + "epoch": 0.03874024526198439, + "grad_norm": 0.4233595132827759, + "learning_rate": 8e-05, + "loss": 1.4612, + "step": 695 + }, + { + "epoch": 0.03879598662207358, + "grad_norm": 0.47652146220207214, + "learning_rate": 8e-05, + "loss": 1.6849, + "step": 696 + }, + { + "epoch": 0.03885172798216276, + "grad_norm": 0.45156988501548767, + "learning_rate": 8e-05, + "loss": 1.8079, + "step": 697 + }, + { + "epoch": 0.03890746934225195, + "grad_norm": 0.44426658749580383, + "learning_rate": 8e-05, + "loss": 1.7389, + "step": 698 + }, + { + "epoch": 0.038963210702341135, + "grad_norm": 0.4575849771499634, + "learning_rate": 8e-05, + "loss": 1.902, + "step": 699 + }, + { + "epoch": 0.03901895206243032, + "grad_norm": 0.4535796046257019, + "learning_rate": 8e-05, + "loss": 1.5685, + "step": 700 + }, + { + "epoch": 0.039074693422519506, + "grad_norm": 0.4648195207118988, + "learning_rate": 8e-05, + "loss": 1.8874, + "step": 701 + }, + { + "epoch": 0.0391304347826087, + "grad_norm": 0.44183456897735596, + "learning_rate": 8e-05, + "loss": 1.8103, + "step": 702 + }, + { + "epoch": 0.039186176142697884, + "grad_norm": 0.37236881256103516, + "learning_rate": 8e-05, + "loss": 1.6319, + "step": 703 + }, + { + "epoch": 0.03924191750278707, + "grad_norm": 0.4322548508644104, + "learning_rate": 8e-05, + "loss": 1.7023, + "step": 704 + }, + { + "epoch": 0.039297658862876256, + "grad_norm": 0.4289608597755432, + "learning_rate": 8e-05, + "loss": 1.9537, + "step": 705 + }, + { + "epoch": 0.03935340022296544, + "grad_norm": 0.4339289367198944, + "learning_rate": 8e-05, + "loss": 1.8794, + "step": 706 + }, + { + "epoch": 0.03940914158305463, + "grad_norm": 0.451537549495697, + "learning_rate": 8e-05, + 
"loss": 1.8869, + "step": 707 + }, + { + "epoch": 0.03946488294314381, + "grad_norm": 0.4093172550201416, + "learning_rate": 8e-05, + "loss": 1.491, + "step": 708 + }, + { + "epoch": 0.039520624303233, + "grad_norm": 0.5952094793319702, + "learning_rate": 8e-05, + "loss": 1.6896, + "step": 709 + }, + { + "epoch": 0.039576365663322184, + "grad_norm": 0.4200764000415802, + "learning_rate": 8e-05, + "loss": 1.6626, + "step": 710 + }, + { + "epoch": 0.03963210702341137, + "grad_norm": 0.39256078004837036, + "learning_rate": 8e-05, + "loss": 1.5719, + "step": 711 + }, + { + "epoch": 0.039687848383500555, + "grad_norm": 0.4018345773220062, + "learning_rate": 8e-05, + "loss": 1.6568, + "step": 712 + }, + { + "epoch": 0.03974358974358974, + "grad_norm": 0.39940693974494934, + "learning_rate": 8e-05, + "loss": 1.6682, + "step": 713 + }, + { + "epoch": 0.03979933110367893, + "grad_norm": 0.421329528093338, + "learning_rate": 8e-05, + "loss": 1.6526, + "step": 714 + }, + { + "epoch": 0.03985507246376811, + "grad_norm": 0.47900378704071045, + "learning_rate": 8e-05, + "loss": 2.1028, + "step": 715 + }, + { + "epoch": 0.039910813823857305, + "grad_norm": 0.4266323447227478, + "learning_rate": 8e-05, + "loss": 1.9051, + "step": 716 + }, + { + "epoch": 0.03996655518394649, + "grad_norm": 0.4186037480831146, + "learning_rate": 8e-05, + "loss": 1.6456, + "step": 717 + }, + { + "epoch": 0.040022296544035676, + "grad_norm": 0.459991991519928, + "learning_rate": 8e-05, + "loss": 1.7576, + "step": 718 + }, + { + "epoch": 0.04007803790412486, + "grad_norm": 0.5402688980102539, + "learning_rate": 8e-05, + "loss": 1.8946, + "step": 719 + }, + { + "epoch": 0.04013377926421405, + "grad_norm": 0.4141964316368103, + "learning_rate": 8e-05, + "loss": 1.7124, + "step": 720 + }, + { + "epoch": 0.040189520624303234, + "grad_norm": 0.4429500997066498, + "learning_rate": 8e-05, + "loss": 2.0645, + "step": 721 + }, + { + "epoch": 0.04024526198439242, + "grad_norm": 0.4050302803516388, + 
"learning_rate": 8e-05, + "loss": 1.7279, + "step": 722 + }, + { + "epoch": 0.040301003344481605, + "grad_norm": 0.42763057351112366, + "learning_rate": 8e-05, + "loss": 1.9281, + "step": 723 + }, + { + "epoch": 0.04035674470457079, + "grad_norm": 0.4697290360927582, + "learning_rate": 8e-05, + "loss": 1.9958, + "step": 724 + }, + { + "epoch": 0.040412486064659976, + "grad_norm": 0.3971403241157532, + "learning_rate": 8e-05, + "loss": 1.6868, + "step": 725 + }, + { + "epoch": 0.04046822742474916, + "grad_norm": 0.405602365732193, + "learning_rate": 8e-05, + "loss": 1.7869, + "step": 726 + }, + { + "epoch": 0.04052396878483835, + "grad_norm": 0.41436219215393066, + "learning_rate": 8e-05, + "loss": 1.8323, + "step": 727 + }, + { + "epoch": 0.04057971014492753, + "grad_norm": 0.41769540309906006, + "learning_rate": 8e-05, + "loss": 1.7949, + "step": 728 + }, + { + "epoch": 0.040635451505016726, + "grad_norm": 0.40197986364364624, + "learning_rate": 8e-05, + "loss": 1.8478, + "step": 729 + }, + { + "epoch": 0.04069119286510591, + "grad_norm": 0.3910923898220062, + "learning_rate": 8e-05, + "loss": 1.5842, + "step": 730 + }, + { + "epoch": 0.0407469342251951, + "grad_norm": 0.41100266575813293, + "learning_rate": 8e-05, + "loss": 1.8538, + "step": 731 + }, + { + "epoch": 0.04080267558528428, + "grad_norm": 0.4086783826351166, + "learning_rate": 8e-05, + "loss": 1.6857, + "step": 732 + }, + { + "epoch": 0.04085841694537347, + "grad_norm": 0.41415607929229736, + "learning_rate": 8e-05, + "loss": 1.7507, + "step": 733 + }, + { + "epoch": 0.040914158305462654, + "grad_norm": 0.3935626149177551, + "learning_rate": 8e-05, + "loss": 1.7366, + "step": 734 + }, + { + "epoch": 0.04096989966555184, + "grad_norm": 0.41125422716140747, + "learning_rate": 8e-05, + "loss": 1.9468, + "step": 735 + }, + { + "epoch": 0.041025641025641026, + "grad_norm": 0.4091278314590454, + "learning_rate": 8e-05, + "loss": 1.7788, + "step": 736 + }, + { + "epoch": 0.04108138238573021, + "grad_norm": 
0.43207159638404846, + "learning_rate": 8e-05, + "loss": 2.0038, + "step": 737 + }, + { + "epoch": 0.0411371237458194, + "grad_norm": 0.43338799476623535, + "learning_rate": 8e-05, + "loss": 1.7264, + "step": 738 + }, + { + "epoch": 0.04119286510590858, + "grad_norm": 0.40661856532096863, + "learning_rate": 8e-05, + "loss": 1.7491, + "step": 739 + }, + { + "epoch": 0.04124860646599777, + "grad_norm": 0.3823534846305847, + "learning_rate": 8e-05, + "loss": 1.5044, + "step": 740 + }, + { + "epoch": 0.041304347826086954, + "grad_norm": 0.43768197298049927, + "learning_rate": 8e-05, + "loss": 1.8421, + "step": 741 + }, + { + "epoch": 0.04136008918617614, + "grad_norm": 0.43626633286476135, + "learning_rate": 8e-05, + "loss": 1.8624, + "step": 742 + }, + { + "epoch": 0.04141583054626533, + "grad_norm": 0.44751766324043274, + "learning_rate": 8e-05, + "loss": 1.8846, + "step": 743 + }, + { + "epoch": 0.04147157190635452, + "grad_norm": 0.3814850151538849, + "learning_rate": 8e-05, + "loss": 1.5948, + "step": 744 + }, + { + "epoch": 0.041527313266443704, + "grad_norm": 0.4221818447113037, + "learning_rate": 8e-05, + "loss": 1.6793, + "step": 745 + }, + { + "epoch": 0.04158305462653289, + "grad_norm": 0.4538663625717163, + "learning_rate": 8e-05, + "loss": 1.9088, + "step": 746 + }, + { + "epoch": 0.041638795986622075, + "grad_norm": 0.4213586449623108, + "learning_rate": 8e-05, + "loss": 1.7283, + "step": 747 + }, + { + "epoch": 0.04169453734671126, + "grad_norm": 0.458469033241272, + "learning_rate": 8e-05, + "loss": 1.7802, + "step": 748 + }, + { + "epoch": 0.041750278706800446, + "grad_norm": 0.43127307295799255, + "learning_rate": 8e-05, + "loss": 1.6724, + "step": 749 + }, + { + "epoch": 0.04180602006688963, + "grad_norm": 0.45649224519729614, + "learning_rate": 8e-05, + "loss": 1.9681, + "step": 750 + }, + { + "epoch": 0.04186176142697882, + "grad_norm": 0.4290968179702759, + "learning_rate": 8e-05, + "loss": 1.8333, + "step": 751 + }, + { + "epoch": 
0.041917502787068, + "grad_norm": 0.4485679566860199, + "learning_rate": 8e-05, + "loss": 1.8084, + "step": 752 + }, + { + "epoch": 0.04197324414715719, + "grad_norm": 0.4158881902694702, + "learning_rate": 8e-05, + "loss": 1.7163, + "step": 753 + }, + { + "epoch": 0.042028985507246375, + "grad_norm": 0.4581759572029114, + "learning_rate": 8e-05, + "loss": 1.5889, + "step": 754 + }, + { + "epoch": 0.04208472686733556, + "grad_norm": 0.4187484085559845, + "learning_rate": 8e-05, + "loss": 1.8103, + "step": 755 + }, + { + "epoch": 0.042140468227424746, + "grad_norm": 0.41237813234329224, + "learning_rate": 8e-05, + "loss": 1.6807, + "step": 756 + }, + { + "epoch": 0.04219620958751394, + "grad_norm": 0.46197548508644104, + "learning_rate": 8e-05, + "loss": 1.9533, + "step": 757 + }, + { + "epoch": 0.042251950947603124, + "grad_norm": 0.44682711362838745, + "learning_rate": 8e-05, + "loss": 1.6977, + "step": 758 + }, + { + "epoch": 0.04230769230769231, + "grad_norm": 0.39638906717300415, + "learning_rate": 8e-05, + "loss": 1.5057, + "step": 759 + }, + { + "epoch": 0.042363433667781496, + "grad_norm": 0.45771071314811707, + "learning_rate": 8e-05, + "loss": 2.0477, + "step": 760 + }, + { + "epoch": 0.04241917502787068, + "grad_norm": 0.42211708426475525, + "learning_rate": 8e-05, + "loss": 1.8795, + "step": 761 + }, + { + "epoch": 0.04247491638795987, + "grad_norm": 0.4084051549434662, + "learning_rate": 8e-05, + "loss": 1.6568, + "step": 762 + }, + { + "epoch": 0.04253065774804905, + "grad_norm": 0.4379608631134033, + "learning_rate": 8e-05, + "loss": 1.7923, + "step": 763 + }, + { + "epoch": 0.04258639910813824, + "grad_norm": 0.40212878584861755, + "learning_rate": 8e-05, + "loss": 1.508, + "step": 764 + }, + { + "epoch": 0.042642140468227424, + "grad_norm": 0.40446737408638, + "learning_rate": 8e-05, + "loss": 1.8271, + "step": 765 + }, + { + "epoch": 0.04269788182831661, + "grad_norm": 0.41475871205329895, + "learning_rate": 8e-05, + "loss": 1.9338, + "step": 766 + 
}, + { + "epoch": 0.042753623188405795, + "grad_norm": 0.4280930757522583, + "learning_rate": 8e-05, + "loss": 1.9265, + "step": 767 + }, + { + "epoch": 0.04280936454849498, + "grad_norm": 0.3947325050830841, + "learning_rate": 8e-05, + "loss": 1.7675, + "step": 768 + }, + { + "epoch": 0.04286510590858417, + "grad_norm": 0.4449823796749115, + "learning_rate": 8e-05, + "loss": 1.6442, + "step": 769 + }, + { + "epoch": 0.04292084726867335, + "grad_norm": 0.40130770206451416, + "learning_rate": 8e-05, + "loss": 1.6473, + "step": 770 + }, + { + "epoch": 0.042976588628762545, + "grad_norm": 0.4342806041240692, + "learning_rate": 8e-05, + "loss": 1.852, + "step": 771 + }, + { + "epoch": 0.04303232998885173, + "grad_norm": 0.4064382016658783, + "learning_rate": 8e-05, + "loss": 1.8133, + "step": 772 + }, + { + "epoch": 0.043088071348940916, + "grad_norm": 0.3967300057411194, + "learning_rate": 8e-05, + "loss": 1.7411, + "step": 773 + }, + { + "epoch": 0.0431438127090301, + "grad_norm": 0.4405178427696228, + "learning_rate": 8e-05, + "loss": 1.9962, + "step": 774 + }, + { + "epoch": 0.04319955406911929, + "grad_norm": 0.45430299639701843, + "learning_rate": 8e-05, + "loss": 1.7633, + "step": 775 + }, + { + "epoch": 0.043255295429208473, + "grad_norm": 0.4152611494064331, + "learning_rate": 8e-05, + "loss": 1.4639, + "step": 776 + }, + { + "epoch": 0.04331103678929766, + "grad_norm": 0.4138352572917938, + "learning_rate": 8e-05, + "loss": 1.8169, + "step": 777 + }, + { + "epoch": 0.043366778149386845, + "grad_norm": 0.39472225308418274, + "learning_rate": 8e-05, + "loss": 1.5993, + "step": 778 + }, + { + "epoch": 0.04342251950947603, + "grad_norm": 0.4525393545627594, + "learning_rate": 8e-05, + "loss": 1.8683, + "step": 779 + }, + { + "epoch": 0.043478260869565216, + "grad_norm": 0.4362929165363312, + "learning_rate": 8e-05, + "loss": 1.8648, + "step": 780 + }, + { + "epoch": 0.0435340022296544, + "grad_norm": 0.40343475341796875, + "learning_rate": 8e-05, + "loss": 
1.6745, + "step": 781 + }, + { + "epoch": 0.04358974358974359, + "grad_norm": 0.4135652184486389, + "learning_rate": 8e-05, + "loss": 1.753, + "step": 782 + }, + { + "epoch": 0.04364548494983277, + "grad_norm": 0.45906737446784973, + "learning_rate": 8e-05, + "loss": 1.9136, + "step": 783 + }, + { + "epoch": 0.04370122630992196, + "grad_norm": 0.42325344681739807, + "learning_rate": 8e-05, + "loss": 1.7591, + "step": 784 + }, + { + "epoch": 0.04375696767001115, + "grad_norm": 0.4207473397254944, + "learning_rate": 8e-05, + "loss": 1.7254, + "step": 785 + }, + { + "epoch": 0.04381270903010034, + "grad_norm": 0.41280874609947205, + "learning_rate": 8e-05, + "loss": 1.8155, + "step": 786 + }, + { + "epoch": 0.04386845039018952, + "grad_norm": 0.423045814037323, + "learning_rate": 8e-05, + "loss": 1.8083, + "step": 787 + }, + { + "epoch": 0.04392419175027871, + "grad_norm": 0.44817253947257996, + "learning_rate": 8e-05, + "loss": 1.9363, + "step": 788 + }, + { + "epoch": 0.043979933110367894, + "grad_norm": 0.43137675523757935, + "learning_rate": 8e-05, + "loss": 1.993, + "step": 789 + }, + { + "epoch": 0.04403567447045708, + "grad_norm": 0.4204290509223938, + "learning_rate": 8e-05, + "loss": 1.8462, + "step": 790 + }, + { + "epoch": 0.044091415830546266, + "grad_norm": 0.40246155858039856, + "learning_rate": 8e-05, + "loss": 1.5627, + "step": 791 + }, + { + "epoch": 0.04414715719063545, + "grad_norm": 0.5489731431007385, + "learning_rate": 8e-05, + "loss": 2.0595, + "step": 792 + }, + { + "epoch": 0.04420289855072464, + "grad_norm": 0.46178895235061646, + "learning_rate": 8e-05, + "loss": 1.952, + "step": 793 + }, + { + "epoch": 0.04425863991081382, + "grad_norm": 0.41195032000541687, + "learning_rate": 8e-05, + "loss": 1.6914, + "step": 794 + }, + { + "epoch": 0.04431438127090301, + "grad_norm": 0.4108409881591797, + "learning_rate": 8e-05, + "loss": 1.7034, + "step": 795 + }, + { + "epoch": 0.044370122630992194, + "grad_norm": 0.48466190695762634, + 
"learning_rate": 8e-05, + "loss": 2.1322, + "step": 796 + }, + { + "epoch": 0.04442586399108138, + "grad_norm": 0.4071280360221863, + "learning_rate": 8e-05, + "loss": 1.6725, + "step": 797 + }, + { + "epoch": 0.044481605351170565, + "grad_norm": 0.4744816720485687, + "learning_rate": 8e-05, + "loss": 2.0929, + "step": 798 + }, + { + "epoch": 0.04453734671125976, + "grad_norm": 0.42697998881340027, + "learning_rate": 8e-05, + "loss": 1.8765, + "step": 799 + }, + { + "epoch": 0.044593088071348944, + "grad_norm": 0.4529907703399658, + "learning_rate": 8e-05, + "loss": 1.8202, + "step": 800 + }, + { + "epoch": 0.04464882943143813, + "grad_norm": 0.4429434537887573, + "learning_rate": 8e-05, + "loss": 1.8094, + "step": 801 + }, + { + "epoch": 0.044704570791527315, + "grad_norm": 0.4543124735355377, + "learning_rate": 8e-05, + "loss": 1.7702, + "step": 802 + }, + { + "epoch": 0.0447603121516165, + "grad_norm": 0.4180499315261841, + "learning_rate": 8e-05, + "loss": 1.8529, + "step": 803 + }, + { + "epoch": 0.044816053511705686, + "grad_norm": 0.44464290142059326, + "learning_rate": 8e-05, + "loss": 1.8953, + "step": 804 + }, + { + "epoch": 0.04487179487179487, + "grad_norm": 0.399472177028656, + "learning_rate": 8e-05, + "loss": 1.643, + "step": 805 + }, + { + "epoch": 0.04492753623188406, + "grad_norm": 0.44110196828842163, + "learning_rate": 8e-05, + "loss": 2.1205, + "step": 806 + }, + { + "epoch": 0.04498327759197324, + "grad_norm": 0.42552313208580017, + "learning_rate": 8e-05, + "loss": 1.8727, + "step": 807 + }, + { + "epoch": 0.04503901895206243, + "grad_norm": 0.4377604126930237, + "learning_rate": 8e-05, + "loss": 1.8279, + "step": 808 + }, + { + "epoch": 0.045094760312151615, + "grad_norm": 0.3928232789039612, + "learning_rate": 8e-05, + "loss": 1.5861, + "step": 809 + }, + { + "epoch": 0.0451505016722408, + "grad_norm": 0.43836626410484314, + "learning_rate": 8e-05, + "loss": 1.9002, + "step": 810 + }, + { + "epoch": 0.045206243032329986, + "grad_norm": 
0.423073410987854, + "learning_rate": 8e-05, + "loss": 1.6317, + "step": 811 + }, + { + "epoch": 0.04526198439241917, + "grad_norm": 0.46345505118370056, + "learning_rate": 8e-05, + "loss": 1.9068, + "step": 812 + }, + { + "epoch": 0.045317725752508364, + "grad_norm": 0.47184428572654724, + "learning_rate": 8e-05, + "loss": 1.9154, + "step": 813 + }, + { + "epoch": 0.04537346711259755, + "grad_norm": 0.43765196204185486, + "learning_rate": 8e-05, + "loss": 2.0261, + "step": 814 + }, + { + "epoch": 0.045429208472686736, + "grad_norm": 0.4310826063156128, + "learning_rate": 8e-05, + "loss": 1.8355, + "step": 815 + }, + { + "epoch": 0.04548494983277592, + "grad_norm": 0.47284650802612305, + "learning_rate": 8e-05, + "loss": 1.8096, + "step": 816 + }, + { + "epoch": 0.04554069119286511, + "grad_norm": 0.42523178458213806, + "learning_rate": 8e-05, + "loss": 1.7155, + "step": 817 + }, + { + "epoch": 0.04559643255295429, + "grad_norm": 0.3846169114112854, + "learning_rate": 8e-05, + "loss": 1.616, + "step": 818 + }, + { + "epoch": 0.04565217391304348, + "grad_norm": 0.42457297444343567, + "learning_rate": 8e-05, + "loss": 1.9693, + "step": 819 + }, + { + "epoch": 0.045707915273132664, + "grad_norm": 0.43201375007629395, + "learning_rate": 8e-05, + "loss": 1.7744, + "step": 820 + }, + { + "epoch": 0.04576365663322185, + "grad_norm": 0.39910250902175903, + "learning_rate": 8e-05, + "loss": 1.6146, + "step": 821 + }, + { + "epoch": 0.045819397993311035, + "grad_norm": 0.45629212260246277, + "learning_rate": 8e-05, + "loss": 1.8616, + "step": 822 + }, + { + "epoch": 0.04587513935340022, + "grad_norm": 0.4624870717525482, + "learning_rate": 8e-05, + "loss": 1.8472, + "step": 823 + }, + { + "epoch": 0.04593088071348941, + "grad_norm": 0.41313448548316956, + "learning_rate": 8e-05, + "loss": 1.5226, + "step": 824 + }, + { + "epoch": 0.04598662207357859, + "grad_norm": 0.3996032178401947, + "learning_rate": 8e-05, + "loss": 1.6795, + "step": 825 + }, + { + "epoch": 
0.04604236343366778, + "grad_norm": 0.4157262146472931, + "learning_rate": 8e-05, + "loss": 1.7918, + "step": 826 + }, + { + "epoch": 0.04609810479375697, + "grad_norm": 0.46568799018859863, + "learning_rate": 8e-05, + "loss": 2.0148, + "step": 827 + }, + { + "epoch": 0.046153846153846156, + "grad_norm": 0.3986344337463379, + "learning_rate": 8e-05, + "loss": 1.7419, + "step": 828 + }, + { + "epoch": 0.04620958751393534, + "grad_norm": 0.38107046484947205, + "learning_rate": 8e-05, + "loss": 1.6143, + "step": 829 + }, + { + "epoch": 0.04626532887402453, + "grad_norm": 0.4358171820640564, + "learning_rate": 8e-05, + "loss": 1.7574, + "step": 830 + }, + { + "epoch": 0.04632107023411371, + "grad_norm": 0.38718435168266296, + "learning_rate": 8e-05, + "loss": 1.4672, + "step": 831 + }, + { + "epoch": 0.0463768115942029, + "grad_norm": 0.3949934244155884, + "learning_rate": 8e-05, + "loss": 1.7404, + "step": 832 + }, + { + "epoch": 0.046432552954292085, + "grad_norm": 0.39293748140335083, + "learning_rate": 8e-05, + "loss": 1.7409, + "step": 833 + }, + { + "epoch": 0.04648829431438127, + "grad_norm": 0.3997895121574402, + "learning_rate": 8e-05, + "loss": 1.68, + "step": 834 + }, + { + "epoch": 0.046544035674470456, + "grad_norm": 0.4256502091884613, + "learning_rate": 8e-05, + "loss": 1.8699, + "step": 835 + }, + { + "epoch": 0.04659977703455964, + "grad_norm": 0.4751846492290497, + "learning_rate": 8e-05, + "loss": 1.8223, + "step": 836 + }, + { + "epoch": 0.04665551839464883, + "grad_norm": 0.4109809398651123, + "learning_rate": 8e-05, + "loss": 1.6245, + "step": 837 + }, + { + "epoch": 0.04671125975473801, + "grad_norm": 0.41468480229377747, + "learning_rate": 8e-05, + "loss": 1.7512, + "step": 838 + }, + { + "epoch": 0.0467670011148272, + "grad_norm": 0.43772780895233154, + "learning_rate": 8e-05, + "loss": 1.8532, + "step": 839 + }, + { + "epoch": 0.046822742474916385, + "grad_norm": 0.4629352390766144, + "learning_rate": 8e-05, + "loss": 1.8597, + "step": 840 + 
}, + { + "epoch": 0.04687848383500558, + "grad_norm": 0.4625055193901062, + "learning_rate": 8e-05, + "loss": 1.9523, + "step": 841 + }, + { + "epoch": 0.04693422519509476, + "grad_norm": 0.42157262563705444, + "learning_rate": 8e-05, + "loss": 1.7857, + "step": 842 + }, + { + "epoch": 0.04698996655518395, + "grad_norm": 0.40957966446876526, + "learning_rate": 8e-05, + "loss": 1.636, + "step": 843 + }, + { + "epoch": 0.047045707915273134, + "grad_norm": 0.41058987379074097, + "learning_rate": 8e-05, + "loss": 1.8647, + "step": 844 + }, + { + "epoch": 0.04710144927536232, + "grad_norm": 0.4620298147201538, + "learning_rate": 8e-05, + "loss": 1.7832, + "step": 845 + }, + { + "epoch": 0.047157190635451506, + "grad_norm": 0.4322960674762726, + "learning_rate": 8e-05, + "loss": 1.7886, + "step": 846 + }, + { + "epoch": 0.04721293199554069, + "grad_norm": 0.45721161365509033, + "learning_rate": 8e-05, + "loss": 2.1069, + "step": 847 + }, + { + "epoch": 0.04726867335562988, + "grad_norm": 0.44434359669685364, + "learning_rate": 8e-05, + "loss": 1.881, + "step": 848 + }, + { + "epoch": 0.04732441471571906, + "grad_norm": 0.47906211018562317, + "learning_rate": 8e-05, + "loss": 1.9895, + "step": 849 + }, + { + "epoch": 0.04738015607580825, + "grad_norm": 0.4150347411632538, + "learning_rate": 8e-05, + "loss": 1.8052, + "step": 850 + }, + { + "epoch": 0.047435897435897434, + "grad_norm": 0.4207112491130829, + "learning_rate": 8e-05, + "loss": 1.8485, + "step": 851 + }, + { + "epoch": 0.04749163879598662, + "grad_norm": 0.6500937342643738, + "learning_rate": 8e-05, + "loss": 1.6529, + "step": 852 + }, + { + "epoch": 0.047547380156075805, + "grad_norm": 0.43128669261932373, + "learning_rate": 8e-05, + "loss": 1.9591, + "step": 853 + }, + { + "epoch": 0.047603121516165, + "grad_norm": 0.43956103920936584, + "learning_rate": 8e-05, + "loss": 1.7868, + "step": 854 + }, + { + "epoch": 0.047658862876254184, + "grad_norm": 0.41977569460868835, + "learning_rate": 8e-05, + "loss": 
1.8002, + "step": 855 + }, + { + "epoch": 0.04771460423634337, + "grad_norm": 0.4412843883037567, + "learning_rate": 8e-05, + "loss": 1.7019, + "step": 856 + }, + { + "epoch": 0.047770345596432555, + "grad_norm": 0.4043019115924835, + "learning_rate": 8e-05, + "loss": 1.6508, + "step": 857 + }, + { + "epoch": 0.04782608695652174, + "grad_norm": 0.3964078724384308, + "learning_rate": 8e-05, + "loss": 1.7001, + "step": 858 + }, + { + "epoch": 0.047881828316610926, + "grad_norm": 0.41778528690338135, + "learning_rate": 8e-05, + "loss": 1.6894, + "step": 859 + }, + { + "epoch": 0.04793756967670011, + "grad_norm": 0.40355780720710754, + "learning_rate": 8e-05, + "loss": 1.5954, + "step": 860 + }, + { + "epoch": 0.0479933110367893, + "grad_norm": 0.4723891317844391, + "learning_rate": 8e-05, + "loss": 1.9092, + "step": 861 + }, + { + "epoch": 0.04804905239687848, + "grad_norm": 0.42808517813682556, + "learning_rate": 8e-05, + "loss": 1.8493, + "step": 862 + }, + { + "epoch": 0.04810479375696767, + "grad_norm": 0.4569825232028961, + "learning_rate": 8e-05, + "loss": 1.7595, + "step": 863 + }, + { + "epoch": 0.048160535117056855, + "grad_norm": 0.40674179792404175, + "learning_rate": 8e-05, + "loss": 1.5641, + "step": 864 + }, + { + "epoch": 0.04821627647714604, + "grad_norm": 0.41264376044273376, + "learning_rate": 8e-05, + "loss": 1.6743, + "step": 865 + }, + { + "epoch": 0.048272017837235226, + "grad_norm": 0.4213387966156006, + "learning_rate": 8e-05, + "loss": 1.5963, + "step": 866 + }, + { + "epoch": 0.04832775919732441, + "grad_norm": 0.4398549199104309, + "learning_rate": 8e-05, + "loss": 1.901, + "step": 867 + }, + { + "epoch": 0.048383500557413604, + "grad_norm": 0.4367522597312927, + "learning_rate": 8e-05, + "loss": 1.8161, + "step": 868 + }, + { + "epoch": 0.04843924191750279, + "grad_norm": 0.46793094277381897, + "learning_rate": 8e-05, + "loss": 2.0695, + "step": 869 + }, + { + "epoch": 0.048494983277591976, + "grad_norm": 0.4361442029476166, + 
"learning_rate": 8e-05, + "loss": 1.8895, + "step": 870 + }, + { + "epoch": 0.04855072463768116, + "grad_norm": 0.3976099193096161, + "learning_rate": 8e-05, + "loss": 1.7172, + "step": 871 + }, + { + "epoch": 0.04860646599777035, + "grad_norm": 0.4153869152069092, + "learning_rate": 8e-05, + "loss": 1.7037, + "step": 872 + }, + { + "epoch": 0.04866220735785953, + "grad_norm": 0.4478858709335327, + "learning_rate": 8e-05, + "loss": 1.7821, + "step": 873 + }, + { + "epoch": 0.04871794871794872, + "grad_norm": 0.44808489084243774, + "learning_rate": 8e-05, + "loss": 1.8322, + "step": 874 + }, + { + "epoch": 0.048773690078037904, + "grad_norm": 0.44651350378990173, + "learning_rate": 8e-05, + "loss": 1.8766, + "step": 875 + }, + { + "epoch": 0.04882943143812709, + "grad_norm": 0.4439350962638855, + "learning_rate": 8e-05, + "loss": 1.7531, + "step": 876 + }, + { + "epoch": 0.048885172798216275, + "grad_norm": 0.4228324890136719, + "learning_rate": 8e-05, + "loss": 1.7428, + "step": 877 + }, + { + "epoch": 0.04894091415830546, + "grad_norm": 0.42650389671325684, + "learning_rate": 8e-05, + "loss": 1.8161, + "step": 878 + }, + { + "epoch": 0.04899665551839465, + "grad_norm": 0.38016974925994873, + "learning_rate": 8e-05, + "loss": 1.6092, + "step": 879 + }, + { + "epoch": 0.04905239687848383, + "grad_norm": 0.4425662159919739, + "learning_rate": 8e-05, + "loss": 1.6695, + "step": 880 + }, + { + "epoch": 0.04910813823857302, + "grad_norm": 0.4162566363811493, + "learning_rate": 8e-05, + "loss": 1.7926, + "step": 881 + }, + { + "epoch": 0.04916387959866221, + "grad_norm": 0.4386153221130371, + "learning_rate": 8e-05, + "loss": 1.7272, + "step": 882 + }, + { + "epoch": 0.049219620958751396, + "grad_norm": 0.41146156191825867, + "learning_rate": 8e-05, + "loss": 1.7901, + "step": 883 + }, + { + "epoch": 0.04927536231884058, + "grad_norm": 0.38946112990379333, + "learning_rate": 8e-05, + "loss": 1.7411, + "step": 884 + }, + { + "epoch": 0.04933110367892977, + "grad_norm": 
0.4616527557373047, + "learning_rate": 8e-05, + "loss": 2.0916, + "step": 885 + }, + { + "epoch": 0.04938684503901895, + "grad_norm": 0.4302521049976349, + "learning_rate": 8e-05, + "loss": 1.7878, + "step": 886 + }, + { + "epoch": 0.04944258639910814, + "grad_norm": 0.4233269691467285, + "learning_rate": 8e-05, + "loss": 1.6362, + "step": 887 + }, + { + "epoch": 0.049498327759197325, + "grad_norm": 0.4491744935512543, + "learning_rate": 8e-05, + "loss": 1.8209, + "step": 888 + }, + { + "epoch": 0.04955406911928651, + "grad_norm": 0.4061885178089142, + "learning_rate": 8e-05, + "loss": 1.7068, + "step": 889 + }, + { + "epoch": 0.049609810479375696, + "grad_norm": 0.42203137278556824, + "learning_rate": 8e-05, + "loss": 1.6899, + "step": 890 + }, + { + "epoch": 0.04966555183946488, + "grad_norm": 0.4250293970108032, + "learning_rate": 8e-05, + "loss": 1.7895, + "step": 891 + }, + { + "epoch": 0.04972129319955407, + "grad_norm": 0.43000224232673645, + "learning_rate": 8e-05, + "loss": 1.7183, + "step": 892 + }, + { + "epoch": 0.04977703455964325, + "grad_norm": 0.49140119552612305, + "learning_rate": 8e-05, + "loss": 2.0823, + "step": 893 + }, + { + "epoch": 0.04983277591973244, + "grad_norm": 0.4494474530220032, + "learning_rate": 8e-05, + "loss": 1.854, + "step": 894 + }, + { + "epoch": 0.049888517279821624, + "grad_norm": 0.45056265592575073, + "learning_rate": 8e-05, + "loss": 1.754, + "step": 895 + }, + { + "epoch": 0.04994425863991082, + "grad_norm": 0.45480015873908997, + "learning_rate": 8e-05, + "loss": 1.7431, + "step": 896 + }, + { + "epoch": 0.05, + "grad_norm": 0.4143247902393341, + "learning_rate": 8e-05, + "loss": 1.6383, + "step": 897 + }, + { + "epoch": 0.05005574136008919, + "grad_norm": 0.4520198702812195, + "learning_rate": 8e-05, + "loss": 1.7012, + "step": 898 + }, + { + "epoch": 0.050111482720178374, + "grad_norm": 0.4883075952529907, + "learning_rate": 8e-05, + "loss": 1.926, + "step": 899 + }, + { + "epoch": 0.05016722408026756, + 
"grad_norm": 0.46189042925834656, + "learning_rate": 8e-05, + "loss": 1.6825, + "step": 900 + }, + { + "epoch": 0.050222965440356745, + "grad_norm": 0.40424585342407227, + "learning_rate": 8e-05, + "loss": 1.6505, + "step": 901 + }, + { + "epoch": 0.05027870680044593, + "grad_norm": 0.3945220112800598, + "learning_rate": 8e-05, + "loss": 1.5049, + "step": 902 + }, + { + "epoch": 0.05033444816053512, + "grad_norm": 0.4237712025642395, + "learning_rate": 8e-05, + "loss": 1.74, + "step": 903 + }, + { + "epoch": 0.0503901895206243, + "grad_norm": 0.434099018573761, + "learning_rate": 8e-05, + "loss": 1.8923, + "step": 904 + }, + { + "epoch": 0.05044593088071349, + "grad_norm": 0.4102526903152466, + "learning_rate": 8e-05, + "loss": 1.7801, + "step": 905 + }, + { + "epoch": 0.050501672240802674, + "grad_norm": 0.4492480158805847, + "learning_rate": 8e-05, + "loss": 1.8818, + "step": 906 + }, + { + "epoch": 0.05055741360089186, + "grad_norm": 0.42090365290641785, + "learning_rate": 8e-05, + "loss": 1.6556, + "step": 907 + }, + { + "epoch": 0.050613154960981045, + "grad_norm": 0.37998881936073303, + "learning_rate": 8e-05, + "loss": 1.6473, + "step": 908 + }, + { + "epoch": 0.05066889632107023, + "grad_norm": 0.44362199306488037, + "learning_rate": 8e-05, + "loss": 1.9331, + "step": 909 + }, + { + "epoch": 0.050724637681159424, + "grad_norm": 0.42187029123306274, + "learning_rate": 8e-05, + "loss": 1.7093, + "step": 910 + }, + { + "epoch": 0.05078037904124861, + "grad_norm": 0.4547136425971985, + "learning_rate": 8e-05, + "loss": 1.8013, + "step": 911 + }, + { + "epoch": 0.050836120401337795, + "grad_norm": 0.6749727129936218, + "learning_rate": 8e-05, + "loss": 1.6621, + "step": 912 + }, + { + "epoch": 0.05089186176142698, + "grad_norm": 0.39881759881973267, + "learning_rate": 8e-05, + "loss": 1.6822, + "step": 913 + }, + { + "epoch": 0.050947603121516166, + "grad_norm": 0.40810444951057434, + "learning_rate": 8e-05, + "loss": 1.5836, + "step": 914 + }, + { + "epoch": 
0.05100334448160535, + "grad_norm": 0.4395993947982788, + "learning_rate": 8e-05, + "loss": 1.8861, + "step": 915 + }, + { + "epoch": 0.05105908584169454, + "grad_norm": 0.4386335015296936, + "learning_rate": 8e-05, + "loss": 1.824, + "step": 916 + }, + { + "epoch": 0.05111482720178372, + "grad_norm": 0.3997780978679657, + "learning_rate": 8e-05, + "loss": 1.733, + "step": 917 + }, + { + "epoch": 0.05117056856187291, + "grad_norm": 0.4373658299446106, + "learning_rate": 8e-05, + "loss": 1.8932, + "step": 918 + }, + { + "epoch": 0.051226309921962095, + "grad_norm": 0.42019960284233093, + "learning_rate": 8e-05, + "loss": 1.6854, + "step": 919 + }, + { + "epoch": 0.05128205128205128, + "grad_norm": 0.4224587380886078, + "learning_rate": 8e-05, + "loss": 1.6851, + "step": 920 + }, + { + "epoch": 0.051337792642140466, + "grad_norm": 0.4506530463695526, + "learning_rate": 8e-05, + "loss": 2.0913, + "step": 921 + }, + { + "epoch": 0.05139353400222965, + "grad_norm": 0.5099965333938599, + "learning_rate": 8e-05, + "loss": 2.0435, + "step": 922 + }, + { + "epoch": 0.05144927536231884, + "grad_norm": 0.4297906756401062, + "learning_rate": 8e-05, + "loss": 1.761, + "step": 923 + }, + { + "epoch": 0.05150501672240803, + "grad_norm": 0.44647520780563354, + "learning_rate": 8e-05, + "loss": 1.8595, + "step": 924 + }, + { + "epoch": 0.051560758082497216, + "grad_norm": 0.42764410376548767, + "learning_rate": 8e-05, + "loss": 1.9272, + "step": 925 + }, + { + "epoch": 0.0516164994425864, + "grad_norm": 0.42151692509651184, + "learning_rate": 8e-05, + "loss": 1.9704, + "step": 926 + }, + { + "epoch": 0.05167224080267559, + "grad_norm": 0.432170569896698, + "learning_rate": 8e-05, + "loss": 1.7059, + "step": 927 + }, + { + "epoch": 0.05172798216276477, + "grad_norm": 0.49925047159194946, + "learning_rate": 8e-05, + "loss": 1.8333, + "step": 928 + }, + { + "epoch": 0.05178372352285396, + "grad_norm": 0.4397178292274475, + "learning_rate": 8e-05, + "loss": 1.8248, + "step": 929 + }, + 
{ + "epoch": 0.051839464882943144, + "grad_norm": 0.390194833278656, + "learning_rate": 8e-05, + "loss": 1.7337, + "step": 930 + }, + { + "epoch": 0.05189520624303233, + "grad_norm": 0.4256782531738281, + "learning_rate": 8e-05, + "loss": 1.6734, + "step": 931 + }, + { + "epoch": 0.051950947603121515, + "grad_norm": 0.3956703245639801, + "learning_rate": 8e-05, + "loss": 1.665, + "step": 932 + }, + { + "epoch": 0.0520066889632107, + "grad_norm": 0.4501950740814209, + "learning_rate": 8e-05, + "loss": 1.8167, + "step": 933 + }, + { + "epoch": 0.05206243032329989, + "grad_norm": 0.41400691866874695, + "learning_rate": 8e-05, + "loss": 1.5718, + "step": 934 + }, + { + "epoch": 0.05211817168338907, + "grad_norm": 0.4194335639476776, + "learning_rate": 8e-05, + "loss": 1.6497, + "step": 935 + }, + { + "epoch": 0.05217391304347826, + "grad_norm": 0.41398027539253235, + "learning_rate": 8e-05, + "loss": 1.5243, + "step": 936 + }, + { + "epoch": 0.052229654403567444, + "grad_norm": 0.42045271396636963, + "learning_rate": 8e-05, + "loss": 1.7342, + "step": 937 + }, + { + "epoch": 0.052285395763656636, + "grad_norm": 0.36227548122406006, + "learning_rate": 8e-05, + "loss": 1.3389, + "step": 938 + }, + { + "epoch": 0.05234113712374582, + "grad_norm": 0.4060024321079254, + "learning_rate": 8e-05, + "loss": 1.6522, + "step": 939 + }, + { + "epoch": 0.05239687848383501, + "grad_norm": 0.3965223431587219, + "learning_rate": 8e-05, + "loss": 1.7567, + "step": 940 + }, + { + "epoch": 0.05245261984392419, + "grad_norm": 0.4708893299102783, + "learning_rate": 8e-05, + "loss": 1.7974, + "step": 941 + }, + { + "epoch": 0.05250836120401338, + "grad_norm": 0.3749980628490448, + "learning_rate": 8e-05, + "loss": 1.595, + "step": 942 + }, + { + "epoch": 0.052564102564102565, + "grad_norm": 0.4108208119869232, + "learning_rate": 8e-05, + "loss": 1.6256, + "step": 943 + }, + { + "epoch": 0.05261984392419175, + "grad_norm": 0.4239984452724457, + "learning_rate": 8e-05, + "loss": 1.7197, + 
"step": 944 + }, + { + "epoch": 0.052675585284280936, + "grad_norm": 0.4631035029888153, + "learning_rate": 8e-05, + "loss": 1.8541, + "step": 945 + }, + { + "epoch": 0.05273132664437012, + "grad_norm": 0.4573301374912262, + "learning_rate": 8e-05, + "loss": 1.8671, + "step": 946 + }, + { + "epoch": 0.05278706800445931, + "grad_norm": 0.4194883108139038, + "learning_rate": 8e-05, + "loss": 1.7707, + "step": 947 + }, + { + "epoch": 0.05284280936454849, + "grad_norm": 0.412008672952652, + "learning_rate": 8e-05, + "loss": 1.5866, + "step": 948 + }, + { + "epoch": 0.05289855072463768, + "grad_norm": 0.4511415660381317, + "learning_rate": 8e-05, + "loss": 1.7598, + "step": 949 + }, + { + "epoch": 0.052954292084726864, + "grad_norm": 0.4199249744415283, + "learning_rate": 8e-05, + "loss": 1.6622, + "step": 950 + }, + { + "epoch": 0.05301003344481605, + "grad_norm": 0.4607292711734772, + "learning_rate": 8e-05, + "loss": 1.7429, + "step": 951 + }, + { + "epoch": 0.05306577480490524, + "grad_norm": 0.47078394889831543, + "learning_rate": 8e-05, + "loss": 1.997, + "step": 952 + }, + { + "epoch": 0.05312151616499443, + "grad_norm": 0.4439176023006439, + "learning_rate": 8e-05, + "loss": 1.7236, + "step": 953 + }, + { + "epoch": 0.053177257525083614, + "grad_norm": 0.4908515214920044, + "learning_rate": 8e-05, + "loss": 1.5529, + "step": 954 + }, + { + "epoch": 0.0532329988851728, + "grad_norm": 0.48976606130599976, + "learning_rate": 8e-05, + "loss": 1.9534, + "step": 955 + }, + { + "epoch": 0.053288740245261985, + "grad_norm": 0.4504796266555786, + "learning_rate": 8e-05, + "loss": 1.8611, + "step": 956 + }, + { + "epoch": 0.05334448160535117, + "grad_norm": 0.4429522156715393, + "learning_rate": 8e-05, + "loss": 1.8884, + "step": 957 + }, + { + "epoch": 0.05340022296544036, + "grad_norm": 0.4269380271434784, + "learning_rate": 8e-05, + "loss": 1.6737, + "step": 958 + }, + { + "epoch": 0.05345596432552954, + "grad_norm": 0.4351010322570801, + "learning_rate": 8e-05, + 
"loss": 1.7113, + "step": 959 + }, + { + "epoch": 0.05351170568561873, + "grad_norm": 0.4203762114048004, + "learning_rate": 8e-05, + "loss": 1.7045, + "step": 960 + }, + { + "epoch": 0.053567447045707914, + "grad_norm": 0.4742576479911804, + "learning_rate": 8e-05, + "loss": 1.7861, + "step": 961 + }, + { + "epoch": 0.0536231884057971, + "grad_norm": 0.40854305028915405, + "learning_rate": 8e-05, + "loss": 1.7741, + "step": 962 + }, + { + "epoch": 0.053678929765886285, + "grad_norm": 0.4558371901512146, + "learning_rate": 8e-05, + "loss": 1.9133, + "step": 963 + }, + { + "epoch": 0.05373467112597547, + "grad_norm": 0.44087305665016174, + "learning_rate": 8e-05, + "loss": 1.6143, + "step": 964 + }, + { + "epoch": 0.053790412486064657, + "grad_norm": 0.4590452313423157, + "learning_rate": 8e-05, + "loss": 1.8756, + "step": 965 + }, + { + "epoch": 0.05384615384615385, + "grad_norm": 0.43718910217285156, + "learning_rate": 8e-05, + "loss": 1.8377, + "step": 966 + }, + { + "epoch": 0.053901895206243035, + "grad_norm": 0.39862263202667236, + "learning_rate": 8e-05, + "loss": 1.6154, + "step": 967 + }, + { + "epoch": 0.05395763656633222, + "grad_norm": 0.4717220366001129, + "learning_rate": 8e-05, + "loss": 1.8788, + "step": 968 + }, + { + "epoch": 0.054013377926421406, + "grad_norm": 0.43801164627075195, + "learning_rate": 8e-05, + "loss": 1.8031, + "step": 969 + }, + { + "epoch": 0.05406911928651059, + "grad_norm": 0.43170639872550964, + "learning_rate": 8e-05, + "loss": 1.8745, + "step": 970 + }, + { + "epoch": 0.05412486064659978, + "grad_norm": 0.4330897033214569, + "learning_rate": 8e-05, + "loss": 1.8022, + "step": 971 + }, + { + "epoch": 0.05418060200668896, + "grad_norm": 0.43624183535575867, + "learning_rate": 8e-05, + "loss": 1.874, + "step": 972 + }, + { + "epoch": 0.05423634336677815, + "grad_norm": 0.4237411916255951, + "learning_rate": 8e-05, + "loss": 1.7464, + "step": 973 + }, + { + "epoch": 0.054292084726867335, + "grad_norm": 0.4099571406841278, + 
"learning_rate": 8e-05, + "loss": 1.5987, + "step": 974 + }, + { + "epoch": 0.05434782608695652, + "grad_norm": 0.44440948963165283, + "learning_rate": 8e-05, + "loss": 1.7165, + "step": 975 + }, + { + "epoch": 0.054403567447045706, + "grad_norm": 0.43227434158325195, + "learning_rate": 8e-05, + "loss": 1.9345, + "step": 976 + }, + { + "epoch": 0.05445930880713489, + "grad_norm": 0.43842023611068726, + "learning_rate": 8e-05, + "loss": 1.6778, + "step": 977 + }, + { + "epoch": 0.05451505016722408, + "grad_norm": 0.4868525564670563, + "learning_rate": 8e-05, + "loss": 1.8353, + "step": 978 + }, + { + "epoch": 0.05457079152731327, + "grad_norm": 0.43103519082069397, + "learning_rate": 8e-05, + "loss": 1.6692, + "step": 979 + }, + { + "epoch": 0.054626532887402456, + "grad_norm": 0.465599924325943, + "learning_rate": 8e-05, + "loss": 2.117, + "step": 980 + }, + { + "epoch": 0.05468227424749164, + "grad_norm": 0.46371522545814514, + "learning_rate": 8e-05, + "loss": 1.8647, + "step": 981 + }, + { + "epoch": 0.05473801560758083, + "grad_norm": 0.4412715435028076, + "learning_rate": 8e-05, + "loss": 1.7827, + "step": 982 + }, + { + "epoch": 0.05479375696767001, + "grad_norm": 0.5162610411643982, + "learning_rate": 8e-05, + "loss": 1.969, + "step": 983 + }, + { + "epoch": 0.0548494983277592, + "grad_norm": 0.4213210940361023, + "learning_rate": 8e-05, + "loss": 1.6282, + "step": 984 + }, + { + "epoch": 0.054905239687848384, + "grad_norm": 0.4084482491016388, + "learning_rate": 8e-05, + "loss": 1.6544, + "step": 985 + }, + { + "epoch": 0.05496098104793757, + "grad_norm": 0.4904555678367615, + "learning_rate": 8e-05, + "loss": 1.9587, + "step": 986 + }, + { + "epoch": 0.055016722408026755, + "grad_norm": 0.4115452170372009, + "learning_rate": 8e-05, + "loss": 1.8093, + "step": 987 + }, + { + "epoch": 0.05507246376811594, + "grad_norm": 0.4041500389575958, + "learning_rate": 8e-05, + "loss": 1.7468, + "step": 988 + }, + { + "epoch": 0.05512820512820513, + "grad_norm": 
0.42626655101776123, + "learning_rate": 8e-05, + "loss": 1.8658, + "step": 989 + }, + { + "epoch": 0.05518394648829431, + "grad_norm": 0.4490128457546234, + "learning_rate": 8e-05, + "loss": 1.9277, + "step": 990 + }, + { + "epoch": 0.0552396878483835, + "grad_norm": 0.4793890118598938, + "learning_rate": 8e-05, + "loss": 2.067, + "step": 991 + }, + { + "epoch": 0.055295429208472684, + "grad_norm": 0.4292072653770447, + "learning_rate": 8e-05, + "loss": 1.5705, + "step": 992 + }, + { + "epoch": 0.055351170568561876, + "grad_norm": 0.4330323040485382, + "learning_rate": 8e-05, + "loss": 1.9699, + "step": 993 + }, + { + "epoch": 0.05540691192865106, + "grad_norm": 0.5033552646636963, + "learning_rate": 8e-05, + "loss": 1.7123, + "step": 994 + }, + { + "epoch": 0.05546265328874025, + "grad_norm": 0.4264853298664093, + "learning_rate": 8e-05, + "loss": 1.9333, + "step": 995 + }, + { + "epoch": 0.05551839464882943, + "grad_norm": 0.43361344933509827, + "learning_rate": 8e-05, + "loss": 1.7706, + "step": 996 + }, + { + "epoch": 0.05557413600891862, + "grad_norm": 0.4839682877063751, + "learning_rate": 8e-05, + "loss": 1.9086, + "step": 997 + }, + { + "epoch": 0.055629877369007805, + "grad_norm": 0.46842366456985474, + "learning_rate": 8e-05, + "loss": 2.0759, + "step": 998 + }, + { + "epoch": 0.05568561872909699, + "grad_norm": 0.4213140308856964, + "learning_rate": 8e-05, + "loss": 1.7904, + "step": 999 + }, + { + "epoch": 0.055741360089186176, + "grad_norm": 0.4456956684589386, + "learning_rate": 8e-05, + "loss": 1.8378, + "step": 1000 + }, + { + "epoch": 0.05579710144927536, + "grad_norm": 0.4476751685142517, + "learning_rate": 8e-05, + "loss": 1.7923, + "step": 1001 + }, + { + "epoch": 0.05585284280936455, + "grad_norm": 0.4022519588470459, + "learning_rate": 8e-05, + "loss": 1.6922, + "step": 1002 + }, + { + "epoch": 0.05590858416945373, + "grad_norm": 0.4169318377971649, + "learning_rate": 8e-05, + "loss": 1.867, + "step": 1003 + }, + { + "epoch": 
0.05596432552954292, + "grad_norm": 0.5523567795753479, + "learning_rate": 8e-05, + "loss": 2.3807, + "step": 1004 + }, + { + "epoch": 0.056020066889632104, + "grad_norm": 0.40831258893013, + "learning_rate": 8e-05, + "loss": 1.617, + "step": 1005 + }, + { + "epoch": 0.05607580824972129, + "grad_norm": 0.4216151535511017, + "learning_rate": 8e-05, + "loss": 1.7542, + "step": 1006 + }, + { + "epoch": 0.05613154960981048, + "grad_norm": 0.4263494610786438, + "learning_rate": 8e-05, + "loss": 1.6239, + "step": 1007 + }, + { + "epoch": 0.05618729096989967, + "grad_norm": 0.4082014858722687, + "learning_rate": 8e-05, + "loss": 1.5427, + "step": 1008 + }, + { + "epoch": 0.056243032329988854, + "grad_norm": 0.4632360339164734, + "learning_rate": 8e-05, + "loss": 1.8907, + "step": 1009 + }, + { + "epoch": 0.05629877369007804, + "grad_norm": 0.4561009109020233, + "learning_rate": 8e-05, + "loss": 1.7918, + "step": 1010 + }, + { + "epoch": 0.056354515050167225, + "grad_norm": 0.4626276195049286, + "learning_rate": 8e-05, + "loss": 1.7108, + "step": 1011 + }, + { + "epoch": 0.05641025641025641, + "grad_norm": 0.4755576252937317, + "learning_rate": 8e-05, + "loss": 2.021, + "step": 1012 + }, + { + "epoch": 0.0564659977703456, + "grad_norm": 0.4339366555213928, + "learning_rate": 8e-05, + "loss": 1.8268, + "step": 1013 + }, + { + "epoch": 0.05652173913043478, + "grad_norm": 0.4644366502761841, + "learning_rate": 8e-05, + "loss": 1.8686, + "step": 1014 + }, + { + "epoch": 0.05657748049052397, + "grad_norm": 0.47668248414993286, + "learning_rate": 8e-05, + "loss": 2.1487, + "step": 1015 + }, + { + "epoch": 0.056633221850613154, + "grad_norm": 0.46674248576164246, + "learning_rate": 8e-05, + "loss": 2.0188, + "step": 1016 + }, + { + "epoch": 0.05668896321070234, + "grad_norm": 0.45375779271125793, + "learning_rate": 8e-05, + "loss": 1.8338, + "step": 1017 + }, + { + "epoch": 0.056744704570791525, + "grad_norm": 0.40222418308258057, + "learning_rate": 8e-05, + "loss": 1.7165, + 
"step": 1018 + }, + { + "epoch": 0.05680044593088071, + "grad_norm": 0.5088784694671631, + "learning_rate": 8e-05, + "loss": 2.2179, + "step": 1019 + }, + { + "epoch": 0.056856187290969896, + "grad_norm": 0.4156004786491394, + "learning_rate": 8e-05, + "loss": 1.7308, + "step": 1020 + }, + { + "epoch": 0.05691192865105909, + "grad_norm": 0.43296828866004944, + "learning_rate": 8e-05, + "loss": 1.9068, + "step": 1021 + }, + { + "epoch": 0.056967670011148275, + "grad_norm": 0.42278701066970825, + "learning_rate": 8e-05, + "loss": 1.8354, + "step": 1022 + }, + { + "epoch": 0.05702341137123746, + "grad_norm": 0.44968274235725403, + "learning_rate": 8e-05, + "loss": 1.8766, + "step": 1023 + }, + { + "epoch": 0.057079152731326646, + "grad_norm": 0.4208853244781494, + "learning_rate": 8e-05, + "loss": 1.7541, + "step": 1024 + }, + { + "epoch": 0.05713489409141583, + "grad_norm": 0.42695286870002747, + "learning_rate": 8e-05, + "loss": 1.852, + "step": 1025 + }, + { + "epoch": 0.05719063545150502, + "grad_norm": 0.4679296910762787, + "learning_rate": 8e-05, + "loss": 1.9165, + "step": 1026 + }, + { + "epoch": 0.0572463768115942, + "grad_norm": 0.4364148676395416, + "learning_rate": 8e-05, + "loss": 1.7477, + "step": 1027 + }, + { + "epoch": 0.05730211817168339, + "grad_norm": 0.41188475489616394, + "learning_rate": 8e-05, + "loss": 1.5547, + "step": 1028 + }, + { + "epoch": 0.057357859531772575, + "grad_norm": 0.3878803849220276, + "learning_rate": 8e-05, + "loss": 1.6174, + "step": 1029 + }, + { + "epoch": 0.05741360089186176, + "grad_norm": 0.4230212867259979, + "learning_rate": 8e-05, + "loss": 1.7003, + "step": 1030 + }, + { + "epoch": 0.057469342251950946, + "grad_norm": 0.5208142995834351, + "learning_rate": 8e-05, + "loss": 2.1672, + "step": 1031 + }, + { + "epoch": 0.05752508361204013, + "grad_norm": 0.43669408559799194, + "learning_rate": 8e-05, + "loss": 1.7353, + "step": 1032 + }, + { + "epoch": 0.05758082497212932, + "grad_norm": 0.4216177761554718, + 
"learning_rate": 8e-05, + "loss": 1.6684, + "step": 1033 + }, + { + "epoch": 0.0576365663322185, + "grad_norm": 0.42164376378059387, + "learning_rate": 8e-05, + "loss": 1.6215, + "step": 1034 + }, + { + "epoch": 0.057692307692307696, + "grad_norm": 0.4818020462989807, + "learning_rate": 8e-05, + "loss": 2.0377, + "step": 1035 + }, + { + "epoch": 0.05774804905239688, + "grad_norm": 0.462981641292572, + "learning_rate": 8e-05, + "loss": 1.8978, + "step": 1036 + }, + { + "epoch": 0.05780379041248607, + "grad_norm": 0.4435645043849945, + "learning_rate": 8e-05, + "loss": 1.6334, + "step": 1037 + }, + { + "epoch": 0.05785953177257525, + "grad_norm": 0.3870157301425934, + "learning_rate": 8e-05, + "loss": 1.6838, + "step": 1038 + }, + { + "epoch": 0.05791527313266444, + "grad_norm": 0.43893805146217346, + "learning_rate": 8e-05, + "loss": 1.7563, + "step": 1039 + }, + { + "epoch": 0.057971014492753624, + "grad_norm": 0.46928489208221436, + "learning_rate": 8e-05, + "loss": 1.9366, + "step": 1040 + }, + { + "epoch": 0.05802675585284281, + "grad_norm": 0.5174750685691833, + "learning_rate": 8e-05, + "loss": 1.9242, + "step": 1041 + }, + { + "epoch": 0.058082497212931995, + "grad_norm": 0.41075706481933594, + "learning_rate": 8e-05, + "loss": 1.5632, + "step": 1042 + }, + { + "epoch": 0.05813823857302118, + "grad_norm": 0.39559102058410645, + "learning_rate": 8e-05, + "loss": 1.6224, + "step": 1043 + }, + { + "epoch": 0.05819397993311037, + "grad_norm": 0.43666520714759827, + "learning_rate": 8e-05, + "loss": 1.737, + "step": 1044 + }, + { + "epoch": 0.05824972129319955, + "grad_norm": 0.44067472219467163, + "learning_rate": 8e-05, + "loss": 1.915, + "step": 1045 + }, + { + "epoch": 0.05830546265328874, + "grad_norm": 0.5051589012145996, + "learning_rate": 8e-05, + "loss": 1.9358, + "step": 1046 + }, + { + "epoch": 0.058361204013377924, + "grad_norm": 0.4161045253276825, + "learning_rate": 8e-05, + "loss": 1.707, + "step": 1047 + }, + { + "epoch": 0.05841694537346711, + 
"grad_norm": 0.44676727056503296, + "learning_rate": 8e-05, + "loss": 1.8439, + "step": 1048 + }, + { + "epoch": 0.0584726867335563, + "grad_norm": 0.4336719512939453, + "learning_rate": 8e-05, + "loss": 1.8815, + "step": 1049 + }, + { + "epoch": 0.05852842809364549, + "grad_norm": 0.4447490870952606, + "learning_rate": 8e-05, + "loss": 1.6256, + "step": 1050 + }, + { + "epoch": 0.05858416945373467, + "grad_norm": 0.4244912564754486, + "learning_rate": 8e-05, + "loss": 1.7549, + "step": 1051 + }, + { + "epoch": 0.05863991081382386, + "grad_norm": 0.42827799916267395, + "learning_rate": 8e-05, + "loss": 1.9004, + "step": 1052 + }, + { + "epoch": 0.058695652173913045, + "grad_norm": 0.4283365309238434, + "learning_rate": 8e-05, + "loss": 1.8683, + "step": 1053 + }, + { + "epoch": 0.05875139353400223, + "grad_norm": 0.4388998746871948, + "learning_rate": 8e-05, + "loss": 1.9283, + "step": 1054 + }, + { + "epoch": 0.058807134894091416, + "grad_norm": 0.4022662937641144, + "learning_rate": 8e-05, + "loss": 1.4931, + "step": 1055 + }, + { + "epoch": 0.0588628762541806, + "grad_norm": 0.4228089153766632, + "learning_rate": 8e-05, + "loss": 1.7865, + "step": 1056 + }, + { + "epoch": 0.05891861761426979, + "grad_norm": 0.4698995053768158, + "learning_rate": 8e-05, + "loss": 1.8428, + "step": 1057 + }, + { + "epoch": 0.05897435897435897, + "grad_norm": 0.40131834149360657, + "learning_rate": 8e-05, + "loss": 1.5503, + "step": 1058 + }, + { + "epoch": 0.05903010033444816, + "grad_norm": 0.46206822991371155, + "learning_rate": 8e-05, + "loss": 1.8623, + "step": 1059 + }, + { + "epoch": 0.059085841694537344, + "grad_norm": 0.4545944929122925, + "learning_rate": 8e-05, + "loss": 1.688, + "step": 1060 + }, + { + "epoch": 0.05914158305462653, + "grad_norm": 0.42100989818573, + "learning_rate": 8e-05, + "loss": 1.6139, + "step": 1061 + }, + { + "epoch": 0.059197324414715716, + "grad_norm": 0.46316903829574585, + "learning_rate": 8e-05, + "loss": 1.7782, + "step": 1062 + }, + { + 
"epoch": 0.05925306577480491, + "grad_norm": 0.4210160970687866, + "learning_rate": 8e-05, + "loss": 1.6899, + "step": 1063 + }, + { + "epoch": 0.059308807134894094, + "grad_norm": 0.4561643898487091, + "learning_rate": 8e-05, + "loss": 1.9143, + "step": 1064 + }, + { + "epoch": 0.05936454849498328, + "grad_norm": 0.4355778098106384, + "learning_rate": 8e-05, + "loss": 1.7637, + "step": 1065 + }, + { + "epoch": 0.059420289855072465, + "grad_norm": 0.4599842429161072, + "learning_rate": 8e-05, + "loss": 1.8062, + "step": 1066 + }, + { + "epoch": 0.05947603121516165, + "grad_norm": 0.4494960904121399, + "learning_rate": 8e-05, + "loss": 1.6854, + "step": 1067 + }, + { + "epoch": 0.05953177257525084, + "grad_norm": 0.4336021840572357, + "learning_rate": 8e-05, + "loss": 1.8961, + "step": 1068 + }, + { + "epoch": 0.05958751393534002, + "grad_norm": 0.42015063762664795, + "learning_rate": 8e-05, + "loss": 1.7362, + "step": 1069 + }, + { + "epoch": 0.05964325529542921, + "grad_norm": 0.44604378938674927, + "learning_rate": 8e-05, + "loss": 1.9194, + "step": 1070 + }, + { + "epoch": 0.059698996655518394, + "grad_norm": 0.4459189474582672, + "learning_rate": 8e-05, + "loss": 1.8687, + "step": 1071 + }, + { + "epoch": 0.05975473801560758, + "grad_norm": 0.4442739486694336, + "learning_rate": 8e-05, + "loss": 1.8116, + "step": 1072 + }, + { + "epoch": 0.059810479375696765, + "grad_norm": 0.45290660858154297, + "learning_rate": 8e-05, + "loss": 1.8709, + "step": 1073 + }, + { + "epoch": 0.05986622073578595, + "grad_norm": 0.41268160939216614, + "learning_rate": 8e-05, + "loss": 1.6687, + "step": 1074 + }, + { + "epoch": 0.059921962095875136, + "grad_norm": 0.47134310007095337, + "learning_rate": 8e-05, + "loss": 2.1694, + "step": 1075 + }, + { + "epoch": 0.05997770345596432, + "grad_norm": 0.45093834400177, + "learning_rate": 8e-05, + "loss": 1.8108, + "step": 1076 + }, + { + "epoch": 0.060033444816053515, + "grad_norm": 0.40651044249534607, + "learning_rate": 8e-05, + 
"loss": 1.5763, + "step": 1077 + }, + { + "epoch": 0.0600891861761427, + "grad_norm": 0.42038488388061523, + "learning_rate": 8e-05, + "loss": 1.5817, + "step": 1078 + }, + { + "epoch": 0.060144927536231886, + "grad_norm": 0.4239666163921356, + "learning_rate": 8e-05, + "loss": 1.7996, + "step": 1079 + }, + { + "epoch": 0.06020066889632107, + "grad_norm": 0.43619927763938904, + "learning_rate": 8e-05, + "loss": 1.9681, + "step": 1080 + }, + { + "epoch": 0.06025641025641026, + "grad_norm": 0.41996511816978455, + "learning_rate": 8e-05, + "loss": 1.8411, + "step": 1081 + }, + { + "epoch": 0.06031215161649944, + "grad_norm": 0.4244367182254791, + "learning_rate": 8e-05, + "loss": 1.7147, + "step": 1082 + }, + { + "epoch": 0.06036789297658863, + "grad_norm": 0.44703248143196106, + "learning_rate": 8e-05, + "loss": 1.8158, + "step": 1083 + }, + { + "epoch": 0.060423634336677814, + "grad_norm": 0.45021986961364746, + "learning_rate": 8e-05, + "loss": 1.9023, + "step": 1084 + }, + { + "epoch": 0.060479375696767, + "grad_norm": 0.4354102909564972, + "learning_rate": 8e-05, + "loss": 1.9024, + "step": 1085 + }, + { + "epoch": 0.060535117056856186, + "grad_norm": 0.4334500730037689, + "learning_rate": 8e-05, + "loss": 1.8172, + "step": 1086 + }, + { + "epoch": 0.06059085841694537, + "grad_norm": 0.4097336530685425, + "learning_rate": 8e-05, + "loss": 1.6711, + "step": 1087 + }, + { + "epoch": 0.06064659977703456, + "grad_norm": 0.4830532968044281, + "learning_rate": 8e-05, + "loss": 1.824, + "step": 1088 + }, + { + "epoch": 0.06070234113712374, + "grad_norm": 0.447437047958374, + "learning_rate": 8e-05, + "loss": 1.9229, + "step": 1089 + }, + { + "epoch": 0.06075808249721293, + "grad_norm": 0.4807756543159485, + "learning_rate": 8e-05, + "loss": 1.8219, + "step": 1090 + }, + { + "epoch": 0.06081382385730212, + "grad_norm": 0.4496588408946991, + "learning_rate": 8e-05, + "loss": 2.0536, + "step": 1091 + }, + { + "epoch": 0.06086956521739131, + "grad_norm": 
0.41599252820014954, + "learning_rate": 8e-05, + "loss": 1.6985, + "step": 1092 + }, + { + "epoch": 0.06092530657748049, + "grad_norm": 0.4328576922416687, + "learning_rate": 8e-05, + "loss": 1.8231, + "step": 1093 + }, + { + "epoch": 0.06098104793756968, + "grad_norm": 0.4406525790691376, + "learning_rate": 8e-05, + "loss": 1.8673, + "step": 1094 + }, + { + "epoch": 0.061036789297658864, + "grad_norm": 0.401132732629776, + "learning_rate": 8e-05, + "loss": 1.5546, + "step": 1095 + }, + { + "epoch": 0.06109253065774805, + "grad_norm": 0.42298394441604614, + "learning_rate": 8e-05, + "loss": 1.568, + "step": 1096 + }, + { + "epoch": 0.061148272017837235, + "grad_norm": 0.4252376854419708, + "learning_rate": 8e-05, + "loss": 1.752, + "step": 1097 + }, + { + "epoch": 0.06120401337792642, + "grad_norm": 0.4755648374557495, + "learning_rate": 8e-05, + "loss": 1.832, + "step": 1098 + }, + { + "epoch": 0.06125975473801561, + "grad_norm": 0.5341011881828308, + "learning_rate": 8e-05, + "loss": 2.1064, + "step": 1099 + }, + { + "epoch": 0.06131549609810479, + "grad_norm": 0.43493762612342834, + "learning_rate": 8e-05, + "loss": 1.8192, + "step": 1100 + }, + { + "epoch": 0.06137123745819398, + "grad_norm": 0.4518592059612274, + "learning_rate": 8e-05, + "loss": 1.7659, + "step": 1101 + }, + { + "epoch": 0.061426978818283164, + "grad_norm": 0.4443103075027466, + "learning_rate": 8e-05, + "loss": 1.8146, + "step": 1102 + }, + { + "epoch": 0.06148272017837235, + "grad_norm": 0.4577937424182892, + "learning_rate": 8e-05, + "loss": 1.7822, + "step": 1103 + }, + { + "epoch": 0.06153846153846154, + "grad_norm": 0.48581668734550476, + "learning_rate": 8e-05, + "loss": 1.9258, + "step": 1104 + }, + { + "epoch": 0.06159420289855073, + "grad_norm": 0.4341087341308594, + "learning_rate": 8e-05, + "loss": 1.8588, + "step": 1105 + }, + { + "epoch": 0.06164994425863991, + "grad_norm": 0.43398237228393555, + "learning_rate": 8e-05, + "loss": 1.778, + "step": 1106 + }, + { + "epoch": 
0.0617056856187291, + "grad_norm": 0.40834975242614746, + "learning_rate": 8e-05, + "loss": 1.7681, + "step": 1107 + }, + { + "epoch": 0.061761426978818285, + "grad_norm": 0.41185036301612854, + "learning_rate": 8e-05, + "loss": 1.638, + "step": 1108 + }, + { + "epoch": 0.06181716833890747, + "grad_norm": 0.4246727526187897, + "learning_rate": 8e-05, + "loss": 1.7239, + "step": 1109 + }, + { + "epoch": 0.061872909698996656, + "grad_norm": 0.4513862431049347, + "learning_rate": 8e-05, + "loss": 1.8033, + "step": 1110 + }, + { + "epoch": 0.06192865105908584, + "grad_norm": 0.4458586573600769, + "learning_rate": 8e-05, + "loss": 1.9101, + "step": 1111 + }, + { + "epoch": 0.06198439241917503, + "grad_norm": 0.46575620770454407, + "learning_rate": 8e-05, + "loss": 1.8896, + "step": 1112 + }, + { + "epoch": 0.06204013377926421, + "grad_norm": 0.4389342963695526, + "learning_rate": 8e-05, + "loss": 1.7504, + "step": 1113 + }, + { + "epoch": 0.0620958751393534, + "grad_norm": 0.49950799345970154, + "learning_rate": 8e-05, + "loss": 1.9905, + "step": 1114 + }, + { + "epoch": 0.062151616499442584, + "grad_norm": 0.4360363781452179, + "learning_rate": 8e-05, + "loss": 1.2877, + "step": 1115 + }, + { + "epoch": 0.06220735785953177, + "grad_norm": 0.4367810785770416, + "learning_rate": 8e-05, + "loss": 1.7995, + "step": 1116 + }, + { + "epoch": 0.062263099219620956, + "grad_norm": 0.45174649357795715, + "learning_rate": 8e-05, + "loss": 2.0174, + "step": 1117 + }, + { + "epoch": 0.06231884057971015, + "grad_norm": 0.4332228899002075, + "learning_rate": 8e-05, + "loss": 1.7798, + "step": 1118 + }, + { + "epoch": 0.062374581939799334, + "grad_norm": 0.42212432622909546, + "learning_rate": 8e-05, + "loss": 1.6598, + "step": 1119 + }, + { + "epoch": 0.06243032329988852, + "grad_norm": 0.3963571786880493, + "learning_rate": 8e-05, + "loss": 1.5384, + "step": 1120 + }, + { + "epoch": 0.062486064659977705, + "grad_norm": 0.42810705304145813, + "learning_rate": 8e-05, + "loss": 1.7214, 
+ "step": 1121 + }, + { + "epoch": 0.06254180602006688, + "grad_norm": 0.4167286455631256, + "learning_rate": 8e-05, + "loss": 1.7947, + "step": 1122 + }, + { + "epoch": 0.06259754738015608, + "grad_norm": 0.43644922971725464, + "learning_rate": 8e-05, + "loss": 1.8355, + "step": 1123 + }, + { + "epoch": 0.06265328874024526, + "grad_norm": 0.4328288435935974, + "learning_rate": 8e-05, + "loss": 1.6814, + "step": 1124 + }, + { + "epoch": 0.06270903010033445, + "grad_norm": 0.4443534314632416, + "learning_rate": 8e-05, + "loss": 1.7836, + "step": 1125 + }, + { + "epoch": 0.06276477146042364, + "grad_norm": 0.4291650652885437, + "learning_rate": 8e-05, + "loss": 1.654, + "step": 1126 + }, + { + "epoch": 0.06282051282051282, + "grad_norm": 0.44335997104644775, + "learning_rate": 8e-05, + "loss": 1.9186, + "step": 1127 + }, + { + "epoch": 0.06287625418060201, + "grad_norm": 0.4430725872516632, + "learning_rate": 8e-05, + "loss": 1.7812, + "step": 1128 + }, + { + "epoch": 0.06293199554069119, + "grad_norm": 0.4567205607891083, + "learning_rate": 8e-05, + "loss": 1.9849, + "step": 1129 + }, + { + "epoch": 0.06298773690078038, + "grad_norm": 0.45621874928474426, + "learning_rate": 8e-05, + "loss": 1.9115, + "step": 1130 + }, + { + "epoch": 0.06304347826086956, + "grad_norm": 0.45525816082954407, + "learning_rate": 8e-05, + "loss": 1.8807, + "step": 1131 + }, + { + "epoch": 0.06309921962095875, + "grad_norm": 0.4256766736507416, + "learning_rate": 8e-05, + "loss": 1.7896, + "step": 1132 + }, + { + "epoch": 0.06315496098104793, + "grad_norm": 0.4518592059612274, + "learning_rate": 8e-05, + "loss": 1.8965, + "step": 1133 + }, + { + "epoch": 0.06321070234113713, + "grad_norm": 0.44870826601982117, + "learning_rate": 8e-05, + "loss": 1.7877, + "step": 1134 + }, + { + "epoch": 0.0632664437012263, + "grad_norm": 0.39587363600730896, + "learning_rate": 8e-05, + "loss": 1.498, + "step": 1135 + }, + { + "epoch": 0.0633221850613155, + "grad_norm": 0.42017364501953125, + 
"learning_rate": 8e-05, + "loss": 1.7699, + "step": 1136 + }, + { + "epoch": 0.06337792642140468, + "grad_norm": 0.430050790309906, + "learning_rate": 8e-05, + "loss": 1.7077, + "step": 1137 + }, + { + "epoch": 0.06343366778149387, + "grad_norm": 0.4952334463596344, + "learning_rate": 8e-05, + "loss": 1.5378, + "step": 1138 + }, + { + "epoch": 0.06348940914158306, + "grad_norm": 0.39611610770225525, + "learning_rate": 8e-05, + "loss": 1.3078, + "step": 1139 + }, + { + "epoch": 0.06354515050167224, + "grad_norm": 0.4411247968673706, + "learning_rate": 8e-05, + "loss": 1.7183, + "step": 1140 + }, + { + "epoch": 0.06360089186176143, + "grad_norm": 0.441955029964447, + "learning_rate": 8e-05, + "loss": 1.943, + "step": 1141 + }, + { + "epoch": 0.06365663322185061, + "grad_norm": 0.4392283856868744, + "learning_rate": 8e-05, + "loss": 1.6887, + "step": 1142 + }, + { + "epoch": 0.0637123745819398, + "grad_norm": 0.44392138719558716, + "learning_rate": 8e-05, + "loss": 1.7069, + "step": 1143 + }, + { + "epoch": 0.06376811594202898, + "grad_norm": 0.40321075916290283, + "learning_rate": 8e-05, + "loss": 1.8176, + "step": 1144 + }, + { + "epoch": 0.06382385730211818, + "grad_norm": 0.4580495059490204, + "learning_rate": 8e-05, + "loss": 1.9481, + "step": 1145 + }, + { + "epoch": 0.06387959866220735, + "grad_norm": 0.4451245069503784, + "learning_rate": 8e-05, + "loss": 1.7982, + "step": 1146 + }, + { + "epoch": 0.06393534002229655, + "grad_norm": 0.4405924081802368, + "learning_rate": 8e-05, + "loss": 1.863, + "step": 1147 + }, + { + "epoch": 0.06399108138238573, + "grad_norm": 0.44056808948516846, + "learning_rate": 8e-05, + "loss": 1.8536, + "step": 1148 + }, + { + "epoch": 0.06404682274247492, + "grad_norm": 0.44250497221946716, + "learning_rate": 8e-05, + "loss": 1.7549, + "step": 1149 + }, + { + "epoch": 0.0641025641025641, + "grad_norm": 0.42131105065345764, + "learning_rate": 8e-05, + "loss": 1.7281, + "step": 1150 + }, + { + "epoch": 0.06415830546265329, + 
"grad_norm": 0.4144631624221802, + "learning_rate": 8e-05, + "loss": 1.8841, + "step": 1151 + }, + { + "epoch": 0.06421404682274247, + "grad_norm": 0.4111699163913727, + "learning_rate": 8e-05, + "loss": 1.5023, + "step": 1152 + }, + { + "epoch": 0.06426978818283166, + "grad_norm": 0.4193449020385742, + "learning_rate": 8e-05, + "loss": 1.584, + "step": 1153 + }, + { + "epoch": 0.06432552954292085, + "grad_norm": 0.44446641206741333, + "learning_rate": 8e-05, + "loss": 1.8562, + "step": 1154 + }, + { + "epoch": 0.06438127090301003, + "grad_norm": 0.4735074043273926, + "learning_rate": 8e-05, + "loss": 1.6038, + "step": 1155 + }, + { + "epoch": 0.06443701226309922, + "grad_norm": 0.4791753888130188, + "learning_rate": 8e-05, + "loss": 2.0993, + "step": 1156 + }, + { + "epoch": 0.0644927536231884, + "grad_norm": 0.42762860655784607, + "learning_rate": 8e-05, + "loss": 1.8449, + "step": 1157 + }, + { + "epoch": 0.0645484949832776, + "grad_norm": 0.428559273481369, + "learning_rate": 8e-05, + "loss": 1.6434, + "step": 1158 + }, + { + "epoch": 0.06460423634336677, + "grad_norm": 0.41851234436035156, + "learning_rate": 8e-05, + "loss": 1.7801, + "step": 1159 + }, + { + "epoch": 0.06465997770345597, + "grad_norm": 0.4329962730407715, + "learning_rate": 8e-05, + "loss": 1.7616, + "step": 1160 + }, + { + "epoch": 0.06471571906354515, + "grad_norm": 0.42422378063201904, + "learning_rate": 8e-05, + "loss": 1.7998, + "step": 1161 + }, + { + "epoch": 0.06477146042363434, + "grad_norm": 0.4383358955383301, + "learning_rate": 8e-05, + "loss": 1.9044, + "step": 1162 + }, + { + "epoch": 0.06482720178372352, + "grad_norm": 0.4317667484283447, + "learning_rate": 8e-05, + "loss": 1.8342, + "step": 1163 + }, + { + "epoch": 0.06488294314381271, + "grad_norm": 0.4675438404083252, + "learning_rate": 8e-05, + "loss": 1.7965, + "step": 1164 + }, + { + "epoch": 0.06493868450390189, + "grad_norm": 0.43177488446235657, + "learning_rate": 8e-05, + "loss": 1.8793, + "step": 1165 + }, + { + 
"epoch": 0.06499442586399108, + "grad_norm": 0.3976529836654663, + "learning_rate": 8e-05, + "loss": 1.8037, + "step": 1166 + }, + { + "epoch": 0.06505016722408027, + "grad_norm": 0.49942541122436523, + "learning_rate": 8e-05, + "loss": 1.9051, + "step": 1167 + }, + { + "epoch": 0.06510590858416945, + "grad_norm": 0.46481525897979736, + "learning_rate": 8e-05, + "loss": 1.8427, + "step": 1168 + }, + { + "epoch": 0.06516164994425865, + "grad_norm": 0.44631510972976685, + "learning_rate": 8e-05, + "loss": 1.8011, + "step": 1169 + }, + { + "epoch": 0.06521739130434782, + "grad_norm": 0.4932478368282318, + "learning_rate": 8e-05, + "loss": 1.7985, + "step": 1170 + }, + { + "epoch": 0.06527313266443702, + "grad_norm": 0.4396071135997772, + "learning_rate": 8e-05, + "loss": 1.7359, + "step": 1171 + }, + { + "epoch": 0.0653288740245262, + "grad_norm": 0.45359596610069275, + "learning_rate": 8e-05, + "loss": 1.8659, + "step": 1172 + }, + { + "epoch": 0.06538461538461539, + "grad_norm": 0.40468963980674744, + "learning_rate": 8e-05, + "loss": 1.4637, + "step": 1173 + }, + { + "epoch": 0.06544035674470457, + "grad_norm": 0.4339541792869568, + "learning_rate": 8e-05, + "loss": 1.8278, + "step": 1174 + }, + { + "epoch": 0.06549609810479376, + "grad_norm": 0.3990840017795563, + "learning_rate": 8e-05, + "loss": 1.6032, + "step": 1175 + }, + { + "epoch": 0.06555183946488294, + "grad_norm": 0.4356841742992401, + "learning_rate": 8e-05, + "loss": 1.7532, + "step": 1176 + }, + { + "epoch": 0.06560758082497213, + "grad_norm": 0.4568274915218353, + "learning_rate": 8e-05, + "loss": 1.9382, + "step": 1177 + }, + { + "epoch": 0.06566332218506131, + "grad_norm": 0.43120601773262024, + "learning_rate": 8e-05, + "loss": 1.8981, + "step": 1178 + }, + { + "epoch": 0.0657190635451505, + "grad_norm": 0.4385588765144348, + "learning_rate": 8e-05, + "loss": 1.7199, + "step": 1179 + }, + { + "epoch": 0.06577480490523968, + "grad_norm": 0.4216311275959015, + "learning_rate": 8e-05, + "loss": 
1.7709, + "step": 1180 + }, + { + "epoch": 0.06583054626532887, + "grad_norm": 0.433501273393631, + "learning_rate": 8e-05, + "loss": 1.5969, + "step": 1181 + }, + { + "epoch": 0.06588628762541807, + "grad_norm": 0.44115445017814636, + "learning_rate": 8e-05, + "loss": 1.8182, + "step": 1182 + }, + { + "epoch": 0.06594202898550725, + "grad_norm": 0.4317324161529541, + "learning_rate": 8e-05, + "loss": 1.7448, + "step": 1183 + }, + { + "epoch": 0.06599777034559644, + "grad_norm": 0.43778157234191895, + "learning_rate": 8e-05, + "loss": 1.759, + "step": 1184 + }, + { + "epoch": 0.06605351170568562, + "grad_norm": 0.4287802278995514, + "learning_rate": 8e-05, + "loss": 1.7302, + "step": 1185 + }, + { + "epoch": 0.06610925306577481, + "grad_norm": 0.4651936888694763, + "learning_rate": 8e-05, + "loss": 1.6471, + "step": 1186 + }, + { + "epoch": 0.06616499442586399, + "grad_norm": 0.4521205425262451, + "learning_rate": 8e-05, + "loss": 1.807, + "step": 1187 + }, + { + "epoch": 0.06622073578595318, + "grad_norm": 0.43945908546447754, + "learning_rate": 8e-05, + "loss": 1.7347, + "step": 1188 + }, + { + "epoch": 0.06627647714604236, + "grad_norm": 0.4200635850429535, + "learning_rate": 8e-05, + "loss": 1.6566, + "step": 1189 + }, + { + "epoch": 0.06633221850613155, + "grad_norm": 0.43169304728507996, + "learning_rate": 8e-05, + "loss": 1.7691, + "step": 1190 + }, + { + "epoch": 0.06638795986622073, + "grad_norm": 0.4562690556049347, + "learning_rate": 8e-05, + "loss": 1.8396, + "step": 1191 + }, + { + "epoch": 0.06644370122630992, + "grad_norm": 0.43797406554222107, + "learning_rate": 8e-05, + "loss": 1.6736, + "step": 1192 + }, + { + "epoch": 0.0664994425863991, + "grad_norm": 0.4583410322666168, + "learning_rate": 8e-05, + "loss": 2.0178, + "step": 1193 + }, + { + "epoch": 0.0665551839464883, + "grad_norm": 0.49902230501174927, + "learning_rate": 8e-05, + "loss": 1.8571, + "step": 1194 + }, + { + "epoch": 0.06661092530657749, + "grad_norm": 0.43419089913368225, + 
"learning_rate": 8e-05, + "loss": 1.6779, + "step": 1195 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.4792231023311615, + "learning_rate": 8e-05, + "loss": 1.8727, + "step": 1196 + }, + { + "epoch": 0.06672240802675586, + "grad_norm": 0.43299925327301025, + "learning_rate": 8e-05, + "loss": 1.7666, + "step": 1197 + }, + { + "epoch": 0.06677814938684504, + "grad_norm": 0.4165794253349304, + "learning_rate": 8e-05, + "loss": 1.7137, + "step": 1198 + }, + { + "epoch": 0.06683389074693423, + "grad_norm": 0.4535585939884186, + "learning_rate": 8e-05, + "loss": 1.7655, + "step": 1199 + }, + { + "epoch": 0.06688963210702341, + "grad_norm": 0.4509890377521515, + "learning_rate": 8e-05, + "loss": 1.8209, + "step": 1200 + }, + { + "epoch": 0.0669453734671126, + "grad_norm": 0.4380910098552704, + "learning_rate": 8e-05, + "loss": 1.7428, + "step": 1201 + }, + { + "epoch": 0.06700111482720178, + "grad_norm": 0.4500320553779602, + "learning_rate": 8e-05, + "loss": 1.8026, + "step": 1202 + }, + { + "epoch": 0.06705685618729097, + "grad_norm": 0.47676727175712585, + "learning_rate": 8e-05, + "loss": 1.8406, + "step": 1203 + }, + { + "epoch": 0.06711259754738015, + "grad_norm": 0.43615782260894775, + "learning_rate": 8e-05, + "loss": 1.7105, + "step": 1204 + }, + { + "epoch": 0.06716833890746934, + "grad_norm": 0.4056113660335541, + "learning_rate": 8e-05, + "loss": 1.7475, + "step": 1205 + }, + { + "epoch": 0.06722408026755852, + "grad_norm": 0.43659117817878723, + "learning_rate": 8e-05, + "loss": 1.7423, + "step": 1206 + }, + { + "epoch": 0.06727982162764772, + "grad_norm": 0.4657900631427765, + "learning_rate": 8e-05, + "loss": 1.7744, + "step": 1207 + }, + { + "epoch": 0.0673355629877369, + "grad_norm": 0.47949662804603577, + "learning_rate": 8e-05, + "loss": 1.9269, + "step": 1208 + }, + { + "epoch": 0.06739130434782609, + "grad_norm": 0.45301929116249084, + "learning_rate": 8e-05, + "loss": 1.7783, + "step": 1209 + }, + { + "epoch": 0.06744704570791528, + 
"grad_norm": 0.48128190636634827, + "learning_rate": 8e-05, + "loss": 1.8949, + "step": 1210 + }, + { + "epoch": 0.06750278706800446, + "grad_norm": 0.4121861159801483, + "learning_rate": 8e-05, + "loss": 1.6111, + "step": 1211 + }, + { + "epoch": 0.06755852842809365, + "grad_norm": 0.5082221627235413, + "learning_rate": 8e-05, + "loss": 1.8098, + "step": 1212 + }, + { + "epoch": 0.06761426978818283, + "grad_norm": 0.45201507210731506, + "learning_rate": 8e-05, + "loss": 1.7695, + "step": 1213 + }, + { + "epoch": 0.06767001114827202, + "grad_norm": 0.40513336658477783, + "learning_rate": 8e-05, + "loss": 1.6133, + "step": 1214 + }, + { + "epoch": 0.0677257525083612, + "grad_norm": 0.45347344875335693, + "learning_rate": 8e-05, + "loss": 1.9623, + "step": 1215 + }, + { + "epoch": 0.0677814938684504, + "grad_norm": 0.42242351174354553, + "learning_rate": 8e-05, + "loss": 1.7273, + "step": 1216 + }, + { + "epoch": 0.06783723522853957, + "grad_norm": 0.42420676350593567, + "learning_rate": 8e-05, + "loss": 1.6187, + "step": 1217 + }, + { + "epoch": 0.06789297658862876, + "grad_norm": 0.45483478903770447, + "learning_rate": 8e-05, + "loss": 1.844, + "step": 1218 + }, + { + "epoch": 0.06794871794871794, + "grad_norm": 0.440166175365448, + "learning_rate": 8e-05, + "loss": 1.7197, + "step": 1219 + }, + { + "epoch": 0.06800445930880714, + "grad_norm": 0.4046355187892914, + "learning_rate": 8e-05, + "loss": 1.6218, + "step": 1220 + }, + { + "epoch": 0.06806020066889631, + "grad_norm": 0.44956326484680176, + "learning_rate": 8e-05, + "loss": 1.6526, + "step": 1221 + }, + { + "epoch": 0.06811594202898551, + "grad_norm": 0.4798918664455414, + "learning_rate": 8e-05, + "loss": 1.977, + "step": 1222 + }, + { + "epoch": 0.0681716833890747, + "grad_norm": 0.45986703038215637, + "learning_rate": 8e-05, + "loss": 1.8512, + "step": 1223 + }, + { + "epoch": 0.06822742474916388, + "grad_norm": 0.4693866968154907, + "learning_rate": 8e-05, + "loss": 1.6778, + "step": 1224 + }, + { + 
"epoch": 0.06828316610925307, + "grad_norm": 0.42940130829811096, + "learning_rate": 8e-05, + "loss": 1.6264, + "step": 1225 + }, + { + "epoch": 0.06833890746934225, + "grad_norm": 0.4441980719566345, + "learning_rate": 8e-05, + "loss": 1.7485, + "step": 1226 + }, + { + "epoch": 0.06839464882943144, + "grad_norm": 0.45005375146865845, + "learning_rate": 8e-05, + "loss": 2.0098, + "step": 1227 + }, + { + "epoch": 0.06845039018952062, + "grad_norm": 0.4309133291244507, + "learning_rate": 8e-05, + "loss": 1.7733, + "step": 1228 + }, + { + "epoch": 0.06850613154960981, + "grad_norm": 0.43084168434143066, + "learning_rate": 8e-05, + "loss": 1.7545, + "step": 1229 + }, + { + "epoch": 0.06856187290969899, + "grad_norm": 0.45081162452697754, + "learning_rate": 8e-05, + "loss": 1.5478, + "step": 1230 + }, + { + "epoch": 0.06861761426978819, + "grad_norm": 0.5341338515281677, + "learning_rate": 8e-05, + "loss": 1.6129, + "step": 1231 + }, + { + "epoch": 0.06867335562987736, + "grad_norm": 0.4506458640098572, + "learning_rate": 8e-05, + "loss": 1.8137, + "step": 1232 + }, + { + "epoch": 0.06872909698996656, + "grad_norm": 0.47452664375305176, + "learning_rate": 8e-05, + "loss": 1.8111, + "step": 1233 + }, + { + "epoch": 0.06878483835005574, + "grad_norm": 0.46135878562927246, + "learning_rate": 8e-05, + "loss": 1.6382, + "step": 1234 + }, + { + "epoch": 0.06884057971014493, + "grad_norm": 0.4212229549884796, + "learning_rate": 8e-05, + "loss": 1.6441, + "step": 1235 + }, + { + "epoch": 0.06889632107023412, + "grad_norm": 0.4409344494342804, + "learning_rate": 8e-05, + "loss": 1.5812, + "step": 1236 + }, + { + "epoch": 0.0689520624303233, + "grad_norm": 0.45152005553245544, + "learning_rate": 8e-05, + "loss": 1.5605, + "step": 1237 + }, + { + "epoch": 0.06900780379041249, + "grad_norm": 0.45905745029449463, + "learning_rate": 8e-05, + "loss": 1.7057, + "step": 1238 + }, + { + "epoch": 0.06906354515050167, + "grad_norm": 0.45360052585601807, + "learning_rate": 8e-05, + "loss": 
1.7103, + "step": 1239 + }, + { + "epoch": 0.06911928651059086, + "grad_norm": 0.43128153681755066, + "learning_rate": 8e-05, + "loss": 1.6429, + "step": 1240 + }, + { + "epoch": 0.06917502787068004, + "grad_norm": 0.5309453010559082, + "learning_rate": 8e-05, + "loss": 1.9669, + "step": 1241 + }, + { + "epoch": 0.06923076923076923, + "grad_norm": 0.4460867941379547, + "learning_rate": 8e-05, + "loss": 1.6892, + "step": 1242 + }, + { + "epoch": 0.06928651059085841, + "grad_norm": 0.44082680344581604, + "learning_rate": 8e-05, + "loss": 1.8816, + "step": 1243 + }, + { + "epoch": 0.0693422519509476, + "grad_norm": 0.43136364221572876, + "learning_rate": 8e-05, + "loss": 1.6994, + "step": 1244 + }, + { + "epoch": 0.06939799331103678, + "grad_norm": 0.40499478578567505, + "learning_rate": 8e-05, + "loss": 1.5882, + "step": 1245 + }, + { + "epoch": 0.06945373467112598, + "grad_norm": 0.4175102710723877, + "learning_rate": 8e-05, + "loss": 1.5226, + "step": 1246 + }, + { + "epoch": 0.06950947603121516, + "grad_norm": 0.4481530487537384, + "learning_rate": 8e-05, + "loss": 1.7025, + "step": 1247 + }, + { + "epoch": 0.06956521739130435, + "grad_norm": 0.42439576983451843, + "learning_rate": 8e-05, + "loss": 1.6737, + "step": 1248 + }, + { + "epoch": 0.06962095875139353, + "grad_norm": 0.43070897459983826, + "learning_rate": 8e-05, + "loss": 1.6205, + "step": 1249 + }, + { + "epoch": 0.06967670011148272, + "grad_norm": 0.4086393415927887, + "learning_rate": 8e-05, + "loss": 1.7679, + "step": 1250 + }, + { + "epoch": 0.06973244147157191, + "grad_norm": 0.4932767450809479, + "learning_rate": 8e-05, + "loss": 1.5989, + "step": 1251 + }, + { + "epoch": 0.06978818283166109, + "grad_norm": 0.5050959587097168, + "learning_rate": 8e-05, + "loss": 2.0871, + "step": 1252 + }, + { + "epoch": 0.06984392419175028, + "grad_norm": 0.4637925326824188, + "learning_rate": 8e-05, + "loss": 1.868, + "step": 1253 + }, + { + "epoch": 0.06989966555183946, + "grad_norm": 0.41985011100769043, + 
"learning_rate": 8e-05, + "loss": 1.5779, + "step": 1254 + }, + { + "epoch": 0.06995540691192866, + "grad_norm": 0.4753486216068268, + "learning_rate": 8e-05, + "loss": 1.9415, + "step": 1255 + }, + { + "epoch": 0.07001114827201783, + "grad_norm": 0.43326473236083984, + "learning_rate": 8e-05, + "loss": 1.7659, + "step": 1256 + }, + { + "epoch": 0.07006688963210703, + "grad_norm": 0.41782644391059875, + "learning_rate": 8e-05, + "loss": 1.6882, + "step": 1257 + }, + { + "epoch": 0.0701226309921962, + "grad_norm": 0.44368311762809753, + "learning_rate": 8e-05, + "loss": 1.541, + "step": 1258 + }, + { + "epoch": 0.0701783723522854, + "grad_norm": 0.4449438750743866, + "learning_rate": 8e-05, + "loss": 1.6659, + "step": 1259 + }, + { + "epoch": 0.07023411371237458, + "grad_norm": 0.4272336959838867, + "learning_rate": 8e-05, + "loss": 1.4967, + "step": 1260 + }, + { + "epoch": 0.07028985507246377, + "grad_norm": 0.4614323377609253, + "learning_rate": 8e-05, + "loss": 1.8735, + "step": 1261 + }, + { + "epoch": 0.07034559643255295, + "grad_norm": 0.42471393942832947, + "learning_rate": 8e-05, + "loss": 1.946, + "step": 1262 + }, + { + "epoch": 0.07040133779264214, + "grad_norm": 0.4473448097705841, + "learning_rate": 8e-05, + "loss": 1.8145, + "step": 1263 + }, + { + "epoch": 0.07045707915273133, + "grad_norm": 0.582961916923523, + "learning_rate": 8e-05, + "loss": 1.7832, + "step": 1264 + }, + { + "epoch": 0.07051282051282051, + "grad_norm": 0.4432711601257324, + "learning_rate": 8e-05, + "loss": 1.9099, + "step": 1265 + }, + { + "epoch": 0.0705685618729097, + "grad_norm": 0.4061930775642395, + "learning_rate": 8e-05, + "loss": 1.6089, + "step": 1266 + }, + { + "epoch": 0.07062430323299888, + "grad_norm": 0.43295302987098694, + "learning_rate": 8e-05, + "loss": 1.9276, + "step": 1267 + }, + { + "epoch": 0.07068004459308808, + "grad_norm": 0.4516142010688782, + "learning_rate": 8e-05, + "loss": 1.7385, + "step": 1268 + }, + { + "epoch": 0.07073578595317725, + 
"grad_norm": 0.5671238899230957, + "learning_rate": 8e-05, + "loss": 1.9751, + "step": 1269 + }, + { + "epoch": 0.07079152731326645, + "grad_norm": 0.4582408666610718, + "learning_rate": 8e-05, + "loss": 1.9635, + "step": 1270 + }, + { + "epoch": 0.07084726867335563, + "grad_norm": 0.45998480916023254, + "learning_rate": 8e-05, + "loss": 1.7577, + "step": 1271 + }, + { + "epoch": 0.07090301003344482, + "grad_norm": 0.44361039996147156, + "learning_rate": 8e-05, + "loss": 1.7662, + "step": 1272 + }, + { + "epoch": 0.070958751393534, + "grad_norm": 0.45488402247428894, + "learning_rate": 8e-05, + "loss": 1.8307, + "step": 1273 + }, + { + "epoch": 0.07101449275362319, + "grad_norm": 0.4565194845199585, + "learning_rate": 8e-05, + "loss": 1.6827, + "step": 1274 + }, + { + "epoch": 0.07107023411371237, + "grad_norm": 0.44688448309898376, + "learning_rate": 8e-05, + "loss": 1.6676, + "step": 1275 + }, + { + "epoch": 0.07112597547380156, + "grad_norm": 0.4568803012371063, + "learning_rate": 8e-05, + "loss": 1.7335, + "step": 1276 + }, + { + "epoch": 0.07118171683389074, + "grad_norm": 0.43606460094451904, + "learning_rate": 8e-05, + "loss": 1.8572, + "step": 1277 + }, + { + "epoch": 0.07123745819397993, + "grad_norm": 0.4564160406589508, + "learning_rate": 8e-05, + "loss": 1.8938, + "step": 1278 + }, + { + "epoch": 0.07129319955406913, + "grad_norm": 0.4709673821926117, + "learning_rate": 8e-05, + "loss": 2.0716, + "step": 1279 + }, + { + "epoch": 0.0713489409141583, + "grad_norm": 0.4627903401851654, + "learning_rate": 8e-05, + "loss": 1.9952, + "step": 1280 + }, + { + "epoch": 0.0714046822742475, + "grad_norm": 0.45227548480033875, + "learning_rate": 8e-05, + "loss": 1.7264, + "step": 1281 + }, + { + "epoch": 0.07146042363433668, + "grad_norm": 0.4532218277454376, + "learning_rate": 8e-05, + "loss": 1.7318, + "step": 1282 + }, + { + "epoch": 0.07151616499442587, + "grad_norm": 0.43891414999961853, + "learning_rate": 8e-05, + "loss": 1.7138, + "step": 1283 + }, + { + 
"epoch": 0.07157190635451505, + "grad_norm": 0.41745471954345703, + "learning_rate": 8e-05, + "loss": 1.6738, + "step": 1284 + }, + { + "epoch": 0.07162764771460424, + "grad_norm": 0.43092063069343567, + "learning_rate": 8e-05, + "loss": 1.7968, + "step": 1285 + }, + { + "epoch": 0.07168338907469342, + "grad_norm": 0.3872428834438324, + "learning_rate": 8e-05, + "loss": 1.4228, + "step": 1286 + }, + { + "epoch": 0.07173913043478261, + "grad_norm": 0.4488355815410614, + "learning_rate": 8e-05, + "loss": 1.8599, + "step": 1287 + }, + { + "epoch": 0.07179487179487179, + "grad_norm": 0.4347779154777527, + "learning_rate": 8e-05, + "loss": 1.7125, + "step": 1288 + }, + { + "epoch": 0.07185061315496098, + "grad_norm": 0.46295249462127686, + "learning_rate": 8e-05, + "loss": 1.9316, + "step": 1289 + }, + { + "epoch": 0.07190635451505016, + "grad_norm": 0.47468453645706177, + "learning_rate": 8e-05, + "loss": 2.0974, + "step": 1290 + }, + { + "epoch": 0.07196209587513935, + "grad_norm": 0.44178271293640137, + "learning_rate": 8e-05, + "loss": 1.7739, + "step": 1291 + }, + { + "epoch": 0.07201783723522855, + "grad_norm": 0.4681906998157501, + "learning_rate": 8e-05, + "loss": 1.6988, + "step": 1292 + }, + { + "epoch": 0.07207357859531772, + "grad_norm": 0.43681320548057556, + "learning_rate": 8e-05, + "loss": 1.7113, + "step": 1293 + }, + { + "epoch": 0.07212931995540692, + "grad_norm": 0.45803403854370117, + "learning_rate": 8e-05, + "loss": 1.8703, + "step": 1294 + }, + { + "epoch": 0.0721850613154961, + "grad_norm": 0.48975130915641785, + "learning_rate": 8e-05, + "loss": 1.5916, + "step": 1295 + }, + { + "epoch": 0.07224080267558529, + "grad_norm": 0.4666009545326233, + "learning_rate": 8e-05, + "loss": 1.7433, + "step": 1296 + }, + { + "epoch": 0.07229654403567447, + "grad_norm": 0.43131640553474426, + "learning_rate": 8e-05, + "loss": 1.7401, + "step": 1297 + }, + { + "epoch": 0.07235228539576366, + "grad_norm": 0.44986674189567566, + "learning_rate": 8e-05, + "loss": 
1.6756, + "step": 1298 + }, + { + "epoch": 0.07240802675585284, + "grad_norm": 0.4445556104183197, + "learning_rate": 8e-05, + "loss": 1.6817, + "step": 1299 + }, + { + "epoch": 0.07246376811594203, + "grad_norm": 0.4518899917602539, + "learning_rate": 8e-05, + "loss": 1.6919, + "step": 1300 + }, + { + "epoch": 0.07251950947603121, + "grad_norm": 0.44288474321365356, + "learning_rate": 8e-05, + "loss": 1.6887, + "step": 1301 + }, + { + "epoch": 0.0725752508361204, + "grad_norm": 0.4361318051815033, + "learning_rate": 8e-05, + "loss": 1.9064, + "step": 1302 + }, + { + "epoch": 0.07263099219620958, + "grad_norm": 0.4643789529800415, + "learning_rate": 8e-05, + "loss": 1.6757, + "step": 1303 + }, + { + "epoch": 0.07268673355629877, + "grad_norm": 0.44260480999946594, + "learning_rate": 8e-05, + "loss": 1.6959, + "step": 1304 + }, + { + "epoch": 0.07274247491638795, + "grad_norm": 0.4395084083080292, + "learning_rate": 8e-05, + "loss": 1.7706, + "step": 1305 + }, + { + "epoch": 0.07279821627647715, + "grad_norm": 0.4949529469013214, + "learning_rate": 8e-05, + "loss": 2.1344, + "step": 1306 + }, + { + "epoch": 0.07285395763656634, + "grad_norm": 0.4440799653530121, + "learning_rate": 8e-05, + "loss": 1.9293, + "step": 1307 + }, + { + "epoch": 0.07290969899665552, + "grad_norm": 0.42284902930259705, + "learning_rate": 8e-05, + "loss": 1.5556, + "step": 1308 + }, + { + "epoch": 0.07296544035674471, + "grad_norm": 0.395102322101593, + "learning_rate": 8e-05, + "loss": 1.7397, + "step": 1309 + }, + { + "epoch": 0.07302118171683389, + "grad_norm": 0.42531904578208923, + "learning_rate": 8e-05, + "loss": 1.7416, + "step": 1310 + }, + { + "epoch": 0.07307692307692308, + "grad_norm": 0.42302653193473816, + "learning_rate": 8e-05, + "loss": 1.6757, + "step": 1311 + }, + { + "epoch": 0.07313266443701226, + "grad_norm": 0.4515868127346039, + "learning_rate": 8e-05, + "loss": 1.9239, + "step": 1312 + }, + { + "epoch": 0.07318840579710145, + "grad_norm": 0.42175641655921936, + 
"learning_rate": 8e-05, + "loss": 1.6537, + "step": 1313 + }, + { + "epoch": 0.07324414715719063, + "grad_norm": 0.428892582654953, + "learning_rate": 8e-05, + "loss": 1.6068, + "step": 1314 + }, + { + "epoch": 0.07329988851727982, + "grad_norm": 0.4598456919193268, + "learning_rate": 8e-05, + "loss": 1.9036, + "step": 1315 + }, + { + "epoch": 0.073355629877369, + "grad_norm": 0.4194553792476654, + "learning_rate": 8e-05, + "loss": 1.7527, + "step": 1316 + }, + { + "epoch": 0.0734113712374582, + "grad_norm": 0.40796273946762085, + "learning_rate": 8e-05, + "loss": 1.7015, + "step": 1317 + }, + { + "epoch": 0.07346711259754737, + "grad_norm": 0.4502258002758026, + "learning_rate": 8e-05, + "loss": 1.5502, + "step": 1318 + }, + { + "epoch": 0.07352285395763657, + "grad_norm": 0.4401506185531616, + "learning_rate": 8e-05, + "loss": 1.7199, + "step": 1319 + }, + { + "epoch": 0.07357859531772576, + "grad_norm": 0.41750505566596985, + "learning_rate": 8e-05, + "loss": 1.8776, + "step": 1320 + }, + { + "epoch": 0.07363433667781494, + "grad_norm": 0.4231717586517334, + "learning_rate": 8e-05, + "loss": 1.6411, + "step": 1321 + }, + { + "epoch": 0.07369007803790413, + "grad_norm": 0.42720434069633484, + "learning_rate": 8e-05, + "loss": 1.536, + "step": 1322 + }, + { + "epoch": 0.07374581939799331, + "grad_norm": 0.47691041231155396, + "learning_rate": 8e-05, + "loss": 1.9963, + "step": 1323 + }, + { + "epoch": 0.0738015607580825, + "grad_norm": 0.4454549252986908, + "learning_rate": 8e-05, + "loss": 1.6888, + "step": 1324 + }, + { + "epoch": 0.07385730211817168, + "grad_norm": 0.4553287923336029, + "learning_rate": 8e-05, + "loss": 1.5597, + "step": 1325 + }, + { + "epoch": 0.07391304347826087, + "grad_norm": 0.4337655305862427, + "learning_rate": 8e-05, + "loss": 1.7658, + "step": 1326 + }, + { + "epoch": 0.07396878483835005, + "grad_norm": 0.46036162972450256, + "learning_rate": 8e-05, + "loss": 1.7755, + "step": 1327 + }, + { + "epoch": 0.07402452619843924, + 
"grad_norm": 0.45223167538642883, + "learning_rate": 8e-05, + "loss": 1.8664, + "step": 1328 + }, + { + "epoch": 0.07408026755852842, + "grad_norm": 0.44723761081695557, + "learning_rate": 8e-05, + "loss": 1.7667, + "step": 1329 + }, + { + "epoch": 0.07413600891861762, + "grad_norm": 0.4148390591144562, + "learning_rate": 8e-05, + "loss": 1.6549, + "step": 1330 + }, + { + "epoch": 0.0741917502787068, + "grad_norm": 0.4551730155944824, + "learning_rate": 8e-05, + "loss": 1.7353, + "step": 1331 + }, + { + "epoch": 0.07424749163879599, + "grad_norm": 0.4615274965763092, + "learning_rate": 8e-05, + "loss": 1.8675, + "step": 1332 + }, + { + "epoch": 0.07430323299888517, + "grad_norm": 0.49332278966903687, + "learning_rate": 8e-05, + "loss": 1.7342, + "step": 1333 + }, + { + "epoch": 0.07435897435897436, + "grad_norm": 0.44719934463500977, + "learning_rate": 8e-05, + "loss": 1.8286, + "step": 1334 + }, + { + "epoch": 0.07441471571906355, + "grad_norm": 0.42437857389450073, + "learning_rate": 8e-05, + "loss": 1.7739, + "step": 1335 + }, + { + "epoch": 0.07447045707915273, + "grad_norm": 0.4635609984397888, + "learning_rate": 8e-05, + "loss": 1.9338, + "step": 1336 + }, + { + "epoch": 0.07452619843924192, + "grad_norm": 0.438240110874176, + "learning_rate": 8e-05, + "loss": 1.6367, + "step": 1337 + }, + { + "epoch": 0.0745819397993311, + "grad_norm": 0.4692282974720001, + "learning_rate": 8e-05, + "loss": 1.8855, + "step": 1338 + }, + { + "epoch": 0.0746376811594203, + "grad_norm": 0.4450299143791199, + "learning_rate": 8e-05, + "loss": 1.9291, + "step": 1339 + }, + { + "epoch": 0.07469342251950947, + "grad_norm": 0.4614245593547821, + "learning_rate": 8e-05, + "loss": 1.8017, + "step": 1340 + }, + { + "epoch": 0.07474916387959867, + "grad_norm": 0.4570753574371338, + "learning_rate": 8e-05, + "loss": 1.656, + "step": 1341 + }, + { + "epoch": 0.07480490523968784, + "grad_norm": 0.47076496481895447, + "learning_rate": 8e-05, + "loss": 1.7928, + "step": 1342 + }, + { + 
"epoch": 0.07486064659977704, + "grad_norm": 0.47231245040893555, + "learning_rate": 8e-05, + "loss": 1.8747, + "step": 1343 + }, + { + "epoch": 0.07491638795986622, + "grad_norm": 0.4231669008731842, + "learning_rate": 8e-05, + "loss": 1.7642, + "step": 1344 + }, + { + "epoch": 0.07497212931995541, + "grad_norm": 0.45035478472709656, + "learning_rate": 8e-05, + "loss": 1.8285, + "step": 1345 + }, + { + "epoch": 0.07502787068004459, + "grad_norm": 0.4413871467113495, + "learning_rate": 8e-05, + "loss": 1.8548, + "step": 1346 + }, + { + "epoch": 0.07508361204013378, + "grad_norm": 0.4781755208969116, + "learning_rate": 8e-05, + "loss": 1.8532, + "step": 1347 + }, + { + "epoch": 0.07513935340022297, + "grad_norm": 0.41016334295272827, + "learning_rate": 8e-05, + "loss": 1.5955, + "step": 1348 + }, + { + "epoch": 0.07519509476031215, + "grad_norm": 0.4543410837650299, + "learning_rate": 8e-05, + "loss": 1.9111, + "step": 1349 + }, + { + "epoch": 0.07525083612040134, + "grad_norm": 0.43364307284355164, + "learning_rate": 8e-05, + "loss": 1.7241, + "step": 1350 + }, + { + "epoch": 0.07530657748049052, + "grad_norm": 0.44212257862091064, + "learning_rate": 8e-05, + "loss": 1.6172, + "step": 1351 + }, + { + "epoch": 0.07536231884057971, + "grad_norm": 0.777114748954773, + "learning_rate": 8e-05, + "loss": 1.8149, + "step": 1352 + }, + { + "epoch": 0.0754180602006689, + "grad_norm": 0.4357220232486725, + "learning_rate": 8e-05, + "loss": 1.7667, + "step": 1353 + }, + { + "epoch": 0.07547380156075809, + "grad_norm": 0.4471208155155182, + "learning_rate": 8e-05, + "loss": 1.5841, + "step": 1354 + }, + { + "epoch": 0.07552954292084726, + "grad_norm": 0.43049171566963196, + "learning_rate": 8e-05, + "loss": 1.7802, + "step": 1355 + }, + { + "epoch": 0.07558528428093646, + "grad_norm": 0.4427327513694763, + "learning_rate": 8e-05, + "loss": 1.897, + "step": 1356 + }, + { + "epoch": 0.07564102564102564, + "grad_norm": 0.45292583107948303, + "learning_rate": 8e-05, + "loss": 
1.8114, + "step": 1357 + }, + { + "epoch": 0.07569676700111483, + "grad_norm": 0.45449328422546387, + "learning_rate": 8e-05, + "loss": 1.8064, + "step": 1358 + }, + { + "epoch": 0.07575250836120401, + "grad_norm": 0.4421929717063904, + "learning_rate": 8e-05, + "loss": 1.9081, + "step": 1359 + }, + { + "epoch": 0.0758082497212932, + "grad_norm": 0.4415033459663391, + "learning_rate": 8e-05, + "loss": 1.7954, + "step": 1360 + }, + { + "epoch": 0.07586399108138239, + "grad_norm": 0.494498074054718, + "learning_rate": 8e-05, + "loss": 1.5815, + "step": 1361 + }, + { + "epoch": 0.07591973244147157, + "grad_norm": 0.5168023109436035, + "learning_rate": 8e-05, + "loss": 1.9749, + "step": 1362 + }, + { + "epoch": 0.07597547380156076, + "grad_norm": 0.45021355152130127, + "learning_rate": 8e-05, + "loss": 1.8096, + "step": 1363 + }, + { + "epoch": 0.07603121516164994, + "grad_norm": 0.4276469647884369, + "learning_rate": 8e-05, + "loss": 1.7189, + "step": 1364 + }, + { + "epoch": 0.07608695652173914, + "grad_norm": 0.4566865861415863, + "learning_rate": 8e-05, + "loss": 1.807, + "step": 1365 + }, + { + "epoch": 0.07614269788182831, + "grad_norm": 0.445454478263855, + "learning_rate": 8e-05, + "loss": 1.8407, + "step": 1366 + }, + { + "epoch": 0.0761984392419175, + "grad_norm": 0.48000916838645935, + "learning_rate": 8e-05, + "loss": 1.8314, + "step": 1367 + }, + { + "epoch": 0.07625418060200669, + "grad_norm": 0.43749409914016724, + "learning_rate": 8e-05, + "loss": 1.6374, + "step": 1368 + }, + { + "epoch": 0.07630992196209588, + "grad_norm": 0.46065568923950195, + "learning_rate": 8e-05, + "loss": 1.7961, + "step": 1369 + }, + { + "epoch": 0.07636566332218506, + "grad_norm": 0.5014117956161499, + "learning_rate": 8e-05, + "loss": 1.9533, + "step": 1370 + }, + { + "epoch": 0.07642140468227425, + "grad_norm": 0.4392300546169281, + "learning_rate": 8e-05, + "loss": 1.7795, + "step": 1371 + }, + { + "epoch": 0.07647714604236343, + "grad_norm": 0.4479599595069885, + 
"learning_rate": 8e-05, + "loss": 1.6262, + "step": 1372 + }, + { + "epoch": 0.07653288740245262, + "grad_norm": 0.4235548675060272, + "learning_rate": 8e-05, + "loss": 1.6683, + "step": 1373 + }, + { + "epoch": 0.0765886287625418, + "grad_norm": 0.4663847088813782, + "learning_rate": 8e-05, + "loss": 1.8703, + "step": 1374 + }, + { + "epoch": 0.07664437012263099, + "grad_norm": 0.5052254796028137, + "learning_rate": 8e-05, + "loss": 1.7682, + "step": 1375 + }, + { + "epoch": 0.07670011148272018, + "grad_norm": 0.4608347713947296, + "learning_rate": 8e-05, + "loss": 1.8169, + "step": 1376 + }, + { + "epoch": 0.07675585284280936, + "grad_norm": 0.4845850467681885, + "learning_rate": 8e-05, + "loss": 1.9304, + "step": 1377 + }, + { + "epoch": 0.07681159420289856, + "grad_norm": 0.4543549120426178, + "learning_rate": 8e-05, + "loss": 1.9527, + "step": 1378 + }, + { + "epoch": 0.07686733556298773, + "grad_norm": 0.4503234922885895, + "learning_rate": 8e-05, + "loss": 1.7021, + "step": 1379 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 0.48853322863578796, + "learning_rate": 8e-05, + "loss": 1.7907, + "step": 1380 + }, + { + "epoch": 0.0769788182831661, + "grad_norm": 0.4230073094367981, + "learning_rate": 8e-05, + "loss": 1.7625, + "step": 1381 + }, + { + "epoch": 0.0770345596432553, + "grad_norm": 0.4668857753276825, + "learning_rate": 8e-05, + "loss": 1.7012, + "step": 1382 + }, + { + "epoch": 0.07709030100334448, + "grad_norm": 0.43255892395973206, + "learning_rate": 8e-05, + "loss": 1.7422, + "step": 1383 + }, + { + "epoch": 0.07714604236343367, + "grad_norm": 0.44348350167274475, + "learning_rate": 8e-05, + "loss": 1.6422, + "step": 1384 + }, + { + "epoch": 0.07720178372352285, + "grad_norm": 0.45581987500190735, + "learning_rate": 8e-05, + "loss": 1.8175, + "step": 1385 + }, + { + "epoch": 0.07725752508361204, + "grad_norm": 0.518310010433197, + "learning_rate": 8e-05, + "loss": 1.9449, + "step": 1386 + }, + { + "epoch": 0.07731326644370122, + 
"grad_norm": 0.4802246391773224, + "learning_rate": 8e-05, + "loss": 1.9455, + "step": 1387 + }, + { + "epoch": 0.07736900780379041, + "grad_norm": 0.4345766603946686, + "learning_rate": 8e-05, + "loss": 1.6838, + "step": 1388 + }, + { + "epoch": 0.0774247491638796, + "grad_norm": 0.4461798071861267, + "learning_rate": 8e-05, + "loss": 1.8256, + "step": 1389 + }, + { + "epoch": 0.07748049052396878, + "grad_norm": 0.45585009455680847, + "learning_rate": 8e-05, + "loss": 1.7255, + "step": 1390 + }, + { + "epoch": 0.07753623188405798, + "grad_norm": 0.4471147954463959, + "learning_rate": 8e-05, + "loss": 1.7223, + "step": 1391 + }, + { + "epoch": 0.07759197324414716, + "grad_norm": 0.4584360420703888, + "learning_rate": 8e-05, + "loss": 1.8044, + "step": 1392 + }, + { + "epoch": 0.07764771460423635, + "grad_norm": 0.46038952469825745, + "learning_rate": 8e-05, + "loss": 1.6906, + "step": 1393 + }, + { + "epoch": 0.07770345596432553, + "grad_norm": 0.45119255781173706, + "learning_rate": 8e-05, + "loss": 1.8722, + "step": 1394 + }, + { + "epoch": 0.07775919732441472, + "grad_norm": 0.4340799152851105, + "learning_rate": 8e-05, + "loss": 1.8374, + "step": 1395 + }, + { + "epoch": 0.0778149386845039, + "grad_norm": 0.44295990467071533, + "learning_rate": 8e-05, + "loss": 1.7464, + "step": 1396 + }, + { + "epoch": 0.07787068004459309, + "grad_norm": 0.4677167236804962, + "learning_rate": 8e-05, + "loss": 1.7631, + "step": 1397 + }, + { + "epoch": 0.07792642140468227, + "grad_norm": 0.4594052731990814, + "learning_rate": 8e-05, + "loss": 1.7957, + "step": 1398 + }, + { + "epoch": 0.07798216276477146, + "grad_norm": 0.4324091076850891, + "learning_rate": 8e-05, + "loss": 1.7465, + "step": 1399 + }, + { + "epoch": 0.07803790412486064, + "grad_norm": 0.45841190218925476, + "learning_rate": 8e-05, + "loss": 1.7334, + "step": 1400 + }, + { + "epoch": 0.07809364548494983, + "grad_norm": 0.5322265625, + "learning_rate": 8e-05, + "loss": 2.1824, + "step": 1401 + }, + { + "epoch": 
0.07814938684503901, + "grad_norm": 0.36952677369117737, + "learning_rate": 8e-05, + "loss": 1.4223, + "step": 1402 + }, + { + "epoch": 0.0782051282051282, + "grad_norm": 0.486471027135849, + "learning_rate": 8e-05, + "loss": 1.8555, + "step": 1403 + }, + { + "epoch": 0.0782608695652174, + "grad_norm": 0.43990740180015564, + "learning_rate": 8e-05, + "loss": 1.8663, + "step": 1404 + }, + { + "epoch": 0.07831661092530658, + "grad_norm": 0.4258549213409424, + "learning_rate": 8e-05, + "loss": 1.8713, + "step": 1405 + }, + { + "epoch": 0.07837235228539577, + "grad_norm": 0.510369598865509, + "learning_rate": 8e-05, + "loss": 1.9984, + "step": 1406 + }, + { + "epoch": 0.07842809364548495, + "grad_norm": 0.42820146679878235, + "learning_rate": 8e-05, + "loss": 1.7277, + "step": 1407 + }, + { + "epoch": 0.07848383500557414, + "grad_norm": 0.4371456801891327, + "learning_rate": 8e-05, + "loss": 1.5951, + "step": 1408 + }, + { + "epoch": 0.07853957636566332, + "grad_norm": 0.5079208016395569, + "learning_rate": 8e-05, + "loss": 1.6506, + "step": 1409 + }, + { + "epoch": 0.07859531772575251, + "grad_norm": 0.4610532224178314, + "learning_rate": 8e-05, + "loss": 1.9321, + "step": 1410 + }, + { + "epoch": 0.07865105908584169, + "grad_norm": 0.4237179756164551, + "learning_rate": 8e-05, + "loss": 1.7152, + "step": 1411 + }, + { + "epoch": 0.07870680044593088, + "grad_norm": 0.4514097571372986, + "learning_rate": 8e-05, + "loss": 1.659, + "step": 1412 + }, + { + "epoch": 0.07876254180602006, + "grad_norm": 0.4557318389415741, + "learning_rate": 8e-05, + "loss": 1.8183, + "step": 1413 + }, + { + "epoch": 0.07881828316610925, + "grad_norm": 0.42246413230895996, + "learning_rate": 8e-05, + "loss": 1.8077, + "step": 1414 + }, + { + "epoch": 0.07887402452619843, + "grad_norm": 0.4116936922073364, + "learning_rate": 8e-05, + "loss": 1.5519, + "step": 1415 + }, + { + "epoch": 0.07892976588628763, + "grad_norm": 0.4407467842102051, + "learning_rate": 8e-05, + "loss": 1.8434, + "step": 
1416 + }, + { + "epoch": 0.07898550724637682, + "grad_norm": 0.5225424766540527, + "learning_rate": 8e-05, + "loss": 1.9881, + "step": 1417 + }, + { + "epoch": 0.079041248606466, + "grad_norm": 0.4358792006969452, + "learning_rate": 8e-05, + "loss": 1.7754, + "step": 1418 + }, + { + "epoch": 0.07909698996655519, + "grad_norm": 0.4558788537979126, + "learning_rate": 8e-05, + "loss": 1.7594, + "step": 1419 + }, + { + "epoch": 0.07915273132664437, + "grad_norm": 0.532288134098053, + "learning_rate": 8e-05, + "loss": 2.1258, + "step": 1420 + }, + { + "epoch": 0.07920847268673356, + "grad_norm": 0.5065916180610657, + "learning_rate": 8e-05, + "loss": 1.8183, + "step": 1421 + }, + { + "epoch": 0.07926421404682274, + "grad_norm": 0.44754672050476074, + "learning_rate": 8e-05, + "loss": 1.6664, + "step": 1422 + }, + { + "epoch": 0.07931995540691193, + "grad_norm": 0.45572909712791443, + "learning_rate": 8e-05, + "loss": 1.915, + "step": 1423 + }, + { + "epoch": 0.07937569676700111, + "grad_norm": 0.43835529685020447, + "learning_rate": 8e-05, + "loss": 1.7239, + "step": 1424 + }, + { + "epoch": 0.0794314381270903, + "grad_norm": 0.43413209915161133, + "learning_rate": 8e-05, + "loss": 1.8262, + "step": 1425 + }, + { + "epoch": 0.07948717948717948, + "grad_norm": 0.4330446124076843, + "learning_rate": 8e-05, + "loss": 1.4287, + "step": 1426 + }, + { + "epoch": 0.07954292084726867, + "grad_norm": 0.4555690884590149, + "learning_rate": 8e-05, + "loss": 1.638, + "step": 1427 + }, + { + "epoch": 0.07959866220735785, + "grad_norm": 0.4385247826576233, + "learning_rate": 8e-05, + "loss": 1.6261, + "step": 1428 + }, + { + "epoch": 0.07965440356744705, + "grad_norm": 0.4520176649093628, + "learning_rate": 8e-05, + "loss": 1.8925, + "step": 1429 + }, + { + "epoch": 0.07971014492753623, + "grad_norm": 0.43680456280708313, + "learning_rate": 8e-05, + "loss": 1.9335, + "step": 1430 + }, + { + "epoch": 0.07976588628762542, + "grad_norm": 0.4729453921318054, + "learning_rate": 8e-05, + 
"loss": 1.8992, + "step": 1431 + }, + { + "epoch": 0.07982162764771461, + "grad_norm": 0.4304512143135071, + "learning_rate": 8e-05, + "loss": 1.6413, + "step": 1432 + }, + { + "epoch": 0.07987736900780379, + "grad_norm": 0.41261231899261475, + "learning_rate": 8e-05, + "loss": 1.8193, + "step": 1433 + }, + { + "epoch": 0.07993311036789298, + "grad_norm": 0.47836706042289734, + "learning_rate": 8e-05, + "loss": 1.8577, + "step": 1434 + }, + { + "epoch": 0.07998885172798216, + "grad_norm": 0.5016499161720276, + "learning_rate": 8e-05, + "loss": 1.9283, + "step": 1435 + }, + { + "epoch": 0.08004459308807135, + "grad_norm": 0.4799736440181732, + "learning_rate": 8e-05, + "loss": 1.9349, + "step": 1436 + }, + { + "epoch": 0.08010033444816053, + "grad_norm": 0.45294785499572754, + "learning_rate": 8e-05, + "loss": 1.9936, + "step": 1437 + }, + { + "epoch": 0.08015607580824972, + "grad_norm": 0.3943483233451843, + "learning_rate": 8e-05, + "loss": 1.5038, + "step": 1438 + }, + { + "epoch": 0.0802118171683389, + "grad_norm": 0.44112369418144226, + "learning_rate": 8e-05, + "loss": 1.6879, + "step": 1439 + }, + { + "epoch": 0.0802675585284281, + "grad_norm": 0.48437753319740295, + "learning_rate": 8e-05, + "loss": 2.0316, + "step": 1440 + }, + { + "epoch": 0.08032329988851727, + "grad_norm": 0.4124515652656555, + "learning_rate": 8e-05, + "loss": 1.4986, + "step": 1441 + }, + { + "epoch": 0.08037904124860647, + "grad_norm": 0.46731048822402954, + "learning_rate": 8e-05, + "loss": 1.7208, + "step": 1442 + }, + { + "epoch": 0.08043478260869565, + "grad_norm": 0.47354432940483093, + "learning_rate": 8e-05, + "loss": 1.9603, + "step": 1443 + }, + { + "epoch": 0.08049052396878484, + "grad_norm": 0.4480750262737274, + "learning_rate": 8e-05, + "loss": 1.7549, + "step": 1444 + }, + { + "epoch": 0.08054626532887403, + "grad_norm": 0.4697958528995514, + "learning_rate": 8e-05, + "loss": 1.4256, + "step": 1445 + }, + { + "epoch": 0.08060200668896321, + "grad_norm": 
0.42544737458229065, + "learning_rate": 8e-05, + "loss": 1.7851, + "step": 1446 + }, + { + "epoch": 0.0806577480490524, + "grad_norm": 0.44435450434684753, + "learning_rate": 8e-05, + "loss": 1.6728, + "step": 1447 + }, + { + "epoch": 0.08071348940914158, + "grad_norm": 0.4432964026927948, + "learning_rate": 8e-05, + "loss": 1.755, + "step": 1448 + }, + { + "epoch": 0.08076923076923077, + "grad_norm": 0.4708921015262604, + "learning_rate": 8e-05, + "loss": 1.6021, + "step": 1449 + }, + { + "epoch": 0.08082497212931995, + "grad_norm": 0.4536772072315216, + "learning_rate": 8e-05, + "loss": 1.76, + "step": 1450 + }, + { + "epoch": 0.08088071348940915, + "grad_norm": 0.4945363700389862, + "learning_rate": 8e-05, + "loss": 1.9236, + "step": 1451 + }, + { + "epoch": 0.08093645484949832, + "grad_norm": 0.48008787631988525, + "learning_rate": 8e-05, + "loss": 1.9332, + "step": 1452 + }, + { + "epoch": 0.08099219620958752, + "grad_norm": 0.446678102016449, + "learning_rate": 8e-05, + "loss": 1.7992, + "step": 1453 + }, + { + "epoch": 0.0810479375696767, + "grad_norm": 0.4077160358428955, + "learning_rate": 8e-05, + "loss": 1.5497, + "step": 1454 + }, + { + "epoch": 0.08110367892976589, + "grad_norm": 0.4312504529953003, + "learning_rate": 8e-05, + "loss": 1.8776, + "step": 1455 + }, + { + "epoch": 0.08115942028985507, + "grad_norm": 0.42562606930732727, + "learning_rate": 8e-05, + "loss": 1.6336, + "step": 1456 + }, + { + "epoch": 0.08121516164994426, + "grad_norm": 0.44738060235977173, + "learning_rate": 8e-05, + "loss": 1.7712, + "step": 1457 + }, + { + "epoch": 0.08127090301003345, + "grad_norm": 0.4285328686237335, + "learning_rate": 8e-05, + "loss": 1.8588, + "step": 1458 + }, + { + "epoch": 0.08132664437012263, + "grad_norm": 0.5084940195083618, + "learning_rate": 8e-05, + "loss": 2.0677, + "step": 1459 + }, + { + "epoch": 0.08138238573021182, + "grad_norm": 0.4501677453517914, + "learning_rate": 8e-05, + "loss": 1.7812, + "step": 1460 + }, + { + "epoch": 
0.081438127090301, + "grad_norm": 0.4214770793914795, + "learning_rate": 8e-05, + "loss": 1.578, + "step": 1461 + }, + { + "epoch": 0.0814938684503902, + "grad_norm": 0.4178116023540497, + "learning_rate": 8e-05, + "loss": 1.6665, + "step": 1462 + }, + { + "epoch": 0.08154960981047937, + "grad_norm": 0.46865931153297424, + "learning_rate": 8e-05, + "loss": 1.8594, + "step": 1463 + }, + { + "epoch": 0.08160535117056857, + "grad_norm": 0.4274550974369049, + "learning_rate": 8e-05, + "loss": 1.7344, + "step": 1464 + }, + { + "epoch": 0.08166109253065774, + "grad_norm": 0.47668880224227905, + "learning_rate": 8e-05, + "loss": 1.7793, + "step": 1465 + }, + { + "epoch": 0.08171683389074694, + "grad_norm": 0.45111218094825745, + "learning_rate": 8e-05, + "loss": 1.5396, + "step": 1466 + }, + { + "epoch": 0.08177257525083612, + "grad_norm": 0.46514469385147095, + "learning_rate": 8e-05, + "loss": 1.6373, + "step": 1467 + }, + { + "epoch": 0.08182831661092531, + "grad_norm": 0.4513457119464874, + "learning_rate": 8e-05, + "loss": 1.9016, + "step": 1468 + }, + { + "epoch": 0.08188405797101449, + "grad_norm": 0.4727977216243744, + "learning_rate": 8e-05, + "loss": 1.7992, + "step": 1469 + }, + { + "epoch": 0.08193979933110368, + "grad_norm": 0.4027753174304962, + "learning_rate": 8e-05, + "loss": 1.5857, + "step": 1470 + }, + { + "epoch": 0.08199554069119286, + "grad_norm": 0.4126647114753723, + "learning_rate": 8e-05, + "loss": 1.5366, + "step": 1471 + }, + { + "epoch": 0.08205128205128205, + "grad_norm": 0.40489810705184937, + "learning_rate": 8e-05, + "loss": 1.781, + "step": 1472 + }, + { + "epoch": 0.08210702341137124, + "grad_norm": 0.4544224143028259, + "learning_rate": 8e-05, + "loss": 1.7395, + "step": 1473 + }, + { + "epoch": 0.08216276477146042, + "grad_norm": 0.4425218999385834, + "learning_rate": 8e-05, + "loss": 1.8251, + "step": 1474 + }, + { + "epoch": 0.08221850613154962, + "grad_norm": 0.4314938187599182, + "learning_rate": 8e-05, + "loss": 1.6134, + "step": 
1475 + }, + { + "epoch": 0.0822742474916388, + "grad_norm": 0.4418397545814514, + "learning_rate": 8e-05, + "loss": 1.7419, + "step": 1476 + }, + { + "epoch": 0.08232998885172799, + "grad_norm": 0.4508616626262665, + "learning_rate": 8e-05, + "loss": 1.5632, + "step": 1477 + }, + { + "epoch": 0.08238573021181717, + "grad_norm": 0.44871529936790466, + "learning_rate": 8e-05, + "loss": 1.5814, + "step": 1478 + }, + { + "epoch": 0.08244147157190636, + "grad_norm": 0.4989759027957916, + "learning_rate": 8e-05, + "loss": 1.8102, + "step": 1479 + }, + { + "epoch": 0.08249721293199554, + "grad_norm": 0.4352324903011322, + "learning_rate": 8e-05, + "loss": 1.8375, + "step": 1480 + }, + { + "epoch": 0.08255295429208473, + "grad_norm": 0.452763170003891, + "learning_rate": 8e-05, + "loss": 1.9005, + "step": 1481 + }, + { + "epoch": 0.08260869565217391, + "grad_norm": 0.5133951306343079, + "learning_rate": 8e-05, + "loss": 1.9717, + "step": 1482 + }, + { + "epoch": 0.0826644370122631, + "grad_norm": 0.45723623037338257, + "learning_rate": 8e-05, + "loss": 1.7423, + "step": 1483 + }, + { + "epoch": 0.08272017837235228, + "grad_norm": 0.4535520672798157, + "learning_rate": 8e-05, + "loss": 1.9049, + "step": 1484 + }, + { + "epoch": 0.08277591973244147, + "grad_norm": 0.4558051824569702, + "learning_rate": 8e-05, + "loss": 1.8305, + "step": 1485 + }, + { + "epoch": 0.08283166109253066, + "grad_norm": 0.43357986211776733, + "learning_rate": 8e-05, + "loss": 1.7504, + "step": 1486 + }, + { + "epoch": 0.08288740245261984, + "grad_norm": 0.4610099196434021, + "learning_rate": 8e-05, + "loss": 1.8672, + "step": 1487 + }, + { + "epoch": 0.08294314381270904, + "grad_norm": 0.48371803760528564, + "learning_rate": 8e-05, + "loss": 1.9017, + "step": 1488 + }, + { + "epoch": 0.08299888517279821, + "grad_norm": 0.4555537700653076, + "learning_rate": 8e-05, + "loss": 1.7175, + "step": 1489 + }, + { + "epoch": 0.08305462653288741, + "grad_norm": 0.463793009519577, + "learning_rate": 8e-05, + 
"loss": 1.6915, + "step": 1490 + }, + { + "epoch": 0.08311036789297659, + "grad_norm": 0.46372777223587036, + "learning_rate": 8e-05, + "loss": 1.9179, + "step": 1491 + }, + { + "epoch": 0.08316610925306578, + "grad_norm": 0.4361550509929657, + "learning_rate": 8e-05, + "loss": 1.7615, + "step": 1492 + }, + { + "epoch": 0.08322185061315496, + "grad_norm": 0.4673331677913666, + "learning_rate": 8e-05, + "loss": 1.937, + "step": 1493 + }, + { + "epoch": 0.08327759197324415, + "grad_norm": 0.41564467549324036, + "learning_rate": 8e-05, + "loss": 1.5847, + "step": 1494 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 0.4272595942020416, + "learning_rate": 8e-05, + "loss": 1.7388, + "step": 1495 + }, + { + "epoch": 0.08338907469342252, + "grad_norm": 0.44510963559150696, + "learning_rate": 8e-05, + "loss": 1.7609, + "step": 1496 + }, + { + "epoch": 0.0834448160535117, + "grad_norm": 0.4720876216888428, + "learning_rate": 8e-05, + "loss": 1.9245, + "step": 1497 + }, + { + "epoch": 0.08350055741360089, + "grad_norm": 0.4473607540130615, + "learning_rate": 8e-05, + "loss": 1.6843, + "step": 1498 + }, + { + "epoch": 0.08355629877369007, + "grad_norm": 0.4595058858394623, + "learning_rate": 8e-05, + "loss": 1.6747, + "step": 1499 + }, + { + "epoch": 0.08361204013377926, + "grad_norm": 0.43724626302719116, + "learning_rate": 8e-05, + "loss": 1.6508, + "step": 1500 + }, + { + "epoch": 0.08366778149386846, + "grad_norm": 0.4269401729106903, + "learning_rate": 8e-05, + "loss": 1.6373, + "step": 1501 + }, + { + "epoch": 0.08372352285395764, + "grad_norm": 0.4769815504550934, + "learning_rate": 8e-05, + "loss": 1.9236, + "step": 1502 + }, + { + "epoch": 0.08377926421404683, + "grad_norm": 0.4663258194923401, + "learning_rate": 8e-05, + "loss": 1.8029, + "step": 1503 + }, + { + "epoch": 0.083835005574136, + "grad_norm": 0.44873517751693726, + "learning_rate": 8e-05, + "loss": 1.965, + "step": 1504 + }, + { + "epoch": 0.0838907469342252, + "grad_norm": 0.4724174737930298, + 
"learning_rate": 8e-05, + "loss": 1.9627, + "step": 1505 + }, + { + "epoch": 0.08394648829431438, + "grad_norm": 0.43560653924942017, + "learning_rate": 8e-05, + "loss": 1.7095, + "step": 1506 + }, + { + "epoch": 0.08400222965440357, + "grad_norm": 0.45047271251678467, + "learning_rate": 8e-05, + "loss": 1.8491, + "step": 1507 + }, + { + "epoch": 0.08405797101449275, + "grad_norm": 0.4241717457771301, + "learning_rate": 8e-05, + "loss": 1.6695, + "step": 1508 + }, + { + "epoch": 0.08411371237458194, + "grad_norm": 0.4313092529773712, + "learning_rate": 8e-05, + "loss": 1.6094, + "step": 1509 + }, + { + "epoch": 0.08416945373467112, + "grad_norm": 0.4118594527244568, + "learning_rate": 8e-05, + "loss": 1.659, + "step": 1510 + }, + { + "epoch": 0.08422519509476031, + "grad_norm": 0.43712878227233887, + "learning_rate": 8e-05, + "loss": 1.5832, + "step": 1511 + }, + { + "epoch": 0.08428093645484949, + "grad_norm": 0.515166699886322, + "learning_rate": 8e-05, + "loss": 1.6133, + "step": 1512 + }, + { + "epoch": 0.08433667781493868, + "grad_norm": 0.4808172285556793, + "learning_rate": 8e-05, + "loss": 1.8906, + "step": 1513 + }, + { + "epoch": 0.08439241917502788, + "grad_norm": 0.451366126537323, + "learning_rate": 8e-05, + "loss": 1.8264, + "step": 1514 + }, + { + "epoch": 0.08444816053511706, + "grad_norm": 0.47188040614128113, + "learning_rate": 8e-05, + "loss": 1.863, + "step": 1515 + }, + { + "epoch": 0.08450390189520625, + "grad_norm": 0.4248305857181549, + "learning_rate": 8e-05, + "loss": 1.6737, + "step": 1516 + }, + { + "epoch": 0.08455964325529543, + "grad_norm": 0.44855618476867676, + "learning_rate": 8e-05, + "loss": 1.7684, + "step": 1517 + }, + { + "epoch": 0.08461538461538462, + "grad_norm": 0.4507581293582916, + "learning_rate": 8e-05, + "loss": 1.9199, + "step": 1518 + }, + { + "epoch": 0.0846711259754738, + "grad_norm": 0.4774366617202759, + "learning_rate": 8e-05, + "loss": 2.0712, + "step": 1519 + }, + { + "epoch": 0.08472686733556299, + 
"grad_norm": 0.45017415285110474, + "learning_rate": 8e-05, + "loss": 1.7392, + "step": 1520 + }, + { + "epoch": 0.08478260869565217, + "grad_norm": 0.5057975649833679, + "learning_rate": 8e-05, + "loss": 1.8693, + "step": 1521 + }, + { + "epoch": 0.08483835005574136, + "grad_norm": 0.4550042748451233, + "learning_rate": 8e-05, + "loss": 1.6986, + "step": 1522 + }, + { + "epoch": 0.08489409141583054, + "grad_norm": 0.48192504048347473, + "learning_rate": 8e-05, + "loss": 1.868, + "step": 1523 + }, + { + "epoch": 0.08494983277591973, + "grad_norm": 0.49184849858283997, + "learning_rate": 8e-05, + "loss": 1.7318, + "step": 1524 + }, + { + "epoch": 0.08500557413600891, + "grad_norm": 0.46011582016944885, + "learning_rate": 8e-05, + "loss": 1.7582, + "step": 1525 + }, + { + "epoch": 0.0850613154960981, + "grad_norm": 0.49036216735839844, + "learning_rate": 8e-05, + "loss": 1.8992, + "step": 1526 + }, + { + "epoch": 0.08511705685618728, + "grad_norm": 0.4744269549846649, + "learning_rate": 8e-05, + "loss": 1.8252, + "step": 1527 + }, + { + "epoch": 0.08517279821627648, + "grad_norm": 0.4782600998878479, + "learning_rate": 8e-05, + "loss": 1.6982, + "step": 1528 + }, + { + "epoch": 0.08522853957636567, + "grad_norm": 0.473105251789093, + "learning_rate": 8e-05, + "loss": 1.9843, + "step": 1529 + }, + { + "epoch": 0.08528428093645485, + "grad_norm": 0.4600432515144348, + "learning_rate": 8e-05, + "loss": 1.8821, + "step": 1530 + }, + { + "epoch": 0.08534002229654404, + "grad_norm": 0.4404701590538025, + "learning_rate": 8e-05, + "loss": 1.6798, + "step": 1531 + }, + { + "epoch": 0.08539576365663322, + "grad_norm": 0.4240405559539795, + "learning_rate": 8e-05, + "loss": 1.7695, + "step": 1532 + }, + { + "epoch": 0.08545150501672241, + "grad_norm": 0.4390221834182739, + "learning_rate": 8e-05, + "loss": 1.4667, + "step": 1533 + }, + { + "epoch": 0.08550724637681159, + "grad_norm": 0.5003925561904907, + "learning_rate": 8e-05, + "loss": 1.82, + "step": 1534 + }, + { + 
"epoch": 0.08556298773690078, + "grad_norm": 0.4676069915294647, + "learning_rate": 8e-05, + "loss": 1.9581, + "step": 1535 + }, + { + "epoch": 0.08561872909698996, + "grad_norm": 0.45873093605041504, + "learning_rate": 8e-05, + "loss": 1.9127, + "step": 1536 + }, + { + "epoch": 0.08567447045707915, + "grad_norm": 0.44906124472618103, + "learning_rate": 8e-05, + "loss": 1.6728, + "step": 1537 + }, + { + "epoch": 0.08573021181716833, + "grad_norm": 0.4520525634288788, + "learning_rate": 8e-05, + "loss": 1.6776, + "step": 1538 + }, + { + "epoch": 0.08578595317725753, + "grad_norm": 0.4146431088447571, + "learning_rate": 8e-05, + "loss": 1.6065, + "step": 1539 + }, + { + "epoch": 0.0858416945373467, + "grad_norm": 0.4170554578304291, + "learning_rate": 8e-05, + "loss": 1.6201, + "step": 1540 + }, + { + "epoch": 0.0858974358974359, + "grad_norm": 0.43560776114463806, + "learning_rate": 8e-05, + "loss": 1.4419, + "step": 1541 + }, + { + "epoch": 0.08595317725752509, + "grad_norm": 0.4761258661746979, + "learning_rate": 8e-05, + "loss": 1.8902, + "step": 1542 + }, + { + "epoch": 0.08600891861761427, + "grad_norm": 0.456225723028183, + "learning_rate": 8e-05, + "loss": 1.84, + "step": 1543 + }, + { + "epoch": 0.08606465997770346, + "grad_norm": 0.39672479033470154, + "learning_rate": 8e-05, + "loss": 1.5196, + "step": 1544 + }, + { + "epoch": 0.08612040133779264, + "grad_norm": 0.3968527019023895, + "learning_rate": 8e-05, + "loss": 1.5071, + "step": 1545 + }, + { + "epoch": 0.08617614269788183, + "grad_norm": 0.46591445803642273, + "learning_rate": 8e-05, + "loss": 1.8287, + "step": 1546 + }, + { + "epoch": 0.08623188405797101, + "grad_norm": 0.41506892442703247, + "learning_rate": 8e-05, + "loss": 1.4293, + "step": 1547 + }, + { + "epoch": 0.0862876254180602, + "grad_norm": 0.4221274256706238, + "learning_rate": 8e-05, + "loss": 1.7141, + "step": 1548 + }, + { + "epoch": 0.08634336677814938, + "grad_norm": 0.42595311999320984, + "learning_rate": 8e-05, + "loss": 1.7241, 
+ "step": 1549 + }, + { + "epoch": 0.08639910813823858, + "grad_norm": 0.4534299373626709, + "learning_rate": 8e-05, + "loss": 1.8307, + "step": 1550 + }, + { + "epoch": 0.08645484949832775, + "grad_norm": 0.4355649948120117, + "learning_rate": 8e-05, + "loss": 1.717, + "step": 1551 + }, + { + "epoch": 0.08651059085841695, + "grad_norm": 0.4492719769477844, + "learning_rate": 8e-05, + "loss": 1.7433, + "step": 1552 + }, + { + "epoch": 0.08656633221850613, + "grad_norm": 0.4739130437374115, + "learning_rate": 8e-05, + "loss": 1.929, + "step": 1553 + }, + { + "epoch": 0.08662207357859532, + "grad_norm": 0.4500783681869507, + "learning_rate": 8e-05, + "loss": 1.7929, + "step": 1554 + }, + { + "epoch": 0.0866778149386845, + "grad_norm": 0.46833619475364685, + "learning_rate": 8e-05, + "loss": 1.7378, + "step": 1555 + }, + { + "epoch": 0.08673355629877369, + "grad_norm": 0.43054115772247314, + "learning_rate": 8e-05, + "loss": 1.6093, + "step": 1556 + }, + { + "epoch": 0.08678929765886288, + "grad_norm": 0.43833228945732117, + "learning_rate": 8e-05, + "loss": 1.7848, + "step": 1557 + }, + { + "epoch": 0.08684503901895206, + "grad_norm": 0.4455707371234894, + "learning_rate": 8e-05, + "loss": 1.8429, + "step": 1558 + }, + { + "epoch": 0.08690078037904125, + "grad_norm": 0.42907917499542236, + "learning_rate": 8e-05, + "loss": 1.6299, + "step": 1559 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 0.4292406141757965, + "learning_rate": 8e-05, + "loss": 1.7396, + "step": 1560 + }, + { + "epoch": 0.08701226309921962, + "grad_norm": 0.45911526679992676, + "learning_rate": 8e-05, + "loss": 1.8727, + "step": 1561 + }, + { + "epoch": 0.0870680044593088, + "grad_norm": 0.47614502906799316, + "learning_rate": 8e-05, + "loss": 1.7738, + "step": 1562 + }, + { + "epoch": 0.087123745819398, + "grad_norm": 0.4749417006969452, + "learning_rate": 8e-05, + "loss": 1.9041, + "step": 1563 + }, + { + "epoch": 0.08717948717948718, + "grad_norm": 0.4312790036201477, + "learning_rate": 
8e-05, + "loss": 1.7095, + "step": 1564 + }, + { + "epoch": 0.08723522853957637, + "grad_norm": 0.4703930616378784, + "learning_rate": 8e-05, + "loss": 1.6906, + "step": 1565 + }, + { + "epoch": 0.08729096989966555, + "grad_norm": 0.480101615190506, + "learning_rate": 8e-05, + "loss": 1.7722, + "step": 1566 + }, + { + "epoch": 0.08734671125975474, + "grad_norm": 0.42672696709632874, + "learning_rate": 8e-05, + "loss": 1.6706, + "step": 1567 + }, + { + "epoch": 0.08740245261984392, + "grad_norm": 0.42830973863601685, + "learning_rate": 8e-05, + "loss": 1.6612, + "step": 1568 + }, + { + "epoch": 0.08745819397993311, + "grad_norm": 0.44229260087013245, + "learning_rate": 8e-05, + "loss": 1.7886, + "step": 1569 + }, + { + "epoch": 0.0875139353400223, + "grad_norm": 0.445111483335495, + "learning_rate": 8e-05, + "loss": 1.3896, + "step": 1570 + }, + { + "epoch": 0.08756967670011148, + "grad_norm": 0.47544875741004944, + "learning_rate": 8e-05, + "loss": 1.9077, + "step": 1571 + }, + { + "epoch": 0.08762541806020067, + "grad_norm": 0.4429614543914795, + "learning_rate": 8e-05, + "loss": 1.9828, + "step": 1572 + }, + { + "epoch": 0.08768115942028985, + "grad_norm": 0.4450688064098358, + "learning_rate": 8e-05, + "loss": 1.801, + "step": 1573 + }, + { + "epoch": 0.08773690078037905, + "grad_norm": 0.4217309355735779, + "learning_rate": 8e-05, + "loss": 1.543, + "step": 1574 + }, + { + "epoch": 0.08779264214046822, + "grad_norm": 0.43825316429138184, + "learning_rate": 8e-05, + "loss": 1.8347, + "step": 1575 + }, + { + "epoch": 0.08784838350055742, + "grad_norm": 0.44539615511894226, + "learning_rate": 8e-05, + "loss": 1.8869, + "step": 1576 + }, + { + "epoch": 0.0879041248606466, + "grad_norm": 0.426485151052475, + "learning_rate": 8e-05, + "loss": 1.6711, + "step": 1577 + }, + { + "epoch": 0.08795986622073579, + "grad_norm": 0.451470285654068, + "learning_rate": 8e-05, + "loss": 1.6693, + "step": 1578 + }, + { + "epoch": 0.08801560758082497, + "grad_norm": 
0.43202123045921326, + "learning_rate": 8e-05, + "loss": 1.4994, + "step": 1579 + }, + { + "epoch": 0.08807134894091416, + "grad_norm": 0.4591750502586365, + "learning_rate": 8e-05, + "loss": 1.9642, + "step": 1580 + }, + { + "epoch": 0.08812709030100334, + "grad_norm": 0.43216124176979065, + "learning_rate": 8e-05, + "loss": 1.8251, + "step": 1581 + }, + { + "epoch": 0.08818283166109253, + "grad_norm": 0.4440372586250305, + "learning_rate": 8e-05, + "loss": 1.7201, + "step": 1582 + }, + { + "epoch": 0.08823857302118172, + "grad_norm": 0.46146702766418457, + "learning_rate": 8e-05, + "loss": 1.845, + "step": 1583 + }, + { + "epoch": 0.0882943143812709, + "grad_norm": 0.44598087668418884, + "learning_rate": 8e-05, + "loss": 1.5421, + "step": 1584 + }, + { + "epoch": 0.0883500557413601, + "grad_norm": 0.4281664788722992, + "learning_rate": 8e-05, + "loss": 1.7447, + "step": 1585 + }, + { + "epoch": 0.08840579710144927, + "grad_norm": 0.47506773471832275, + "learning_rate": 8e-05, + "loss": 1.7401, + "step": 1586 + }, + { + "epoch": 0.08846153846153847, + "grad_norm": 0.4095071256160736, + "learning_rate": 8e-05, + "loss": 1.5696, + "step": 1587 + }, + { + "epoch": 0.08851727982162765, + "grad_norm": 0.46955305337905884, + "learning_rate": 8e-05, + "loss": 2.009, + "step": 1588 + }, + { + "epoch": 0.08857302118171684, + "grad_norm": 0.47986504435539246, + "learning_rate": 8e-05, + "loss": 1.8714, + "step": 1589 + }, + { + "epoch": 0.08862876254180602, + "grad_norm": 0.5251606702804565, + "learning_rate": 8e-05, + "loss": 1.9946, + "step": 1590 + }, + { + "epoch": 0.08868450390189521, + "grad_norm": 0.46084538102149963, + "learning_rate": 8e-05, + "loss": 1.7821, + "step": 1591 + }, + { + "epoch": 0.08874024526198439, + "grad_norm": 0.4135066866874695, + "learning_rate": 8e-05, + "loss": 1.6247, + "step": 1592 + }, + { + "epoch": 0.08879598662207358, + "grad_norm": 0.49193426966667175, + "learning_rate": 8e-05, + "loss": 2.0588, + "step": 1593 + }, + { + "epoch": 
0.08885172798216276, + "grad_norm": 0.4328749179840088, + "learning_rate": 8e-05, + "loss": 1.7467, + "step": 1594 + }, + { + "epoch": 0.08890746934225195, + "grad_norm": 0.43832674622535706, + "learning_rate": 8e-05, + "loss": 1.7046, + "step": 1595 + }, + { + "epoch": 0.08896321070234113, + "grad_norm": 0.4430064558982849, + "learning_rate": 8e-05, + "loss": 1.7493, + "step": 1596 + }, + { + "epoch": 0.08901895206243032, + "grad_norm": 0.42061346769332886, + "learning_rate": 8e-05, + "loss": 1.6184, + "step": 1597 + }, + { + "epoch": 0.08907469342251952, + "grad_norm": 0.4191155731678009, + "learning_rate": 8e-05, + "loss": 1.6722, + "step": 1598 + }, + { + "epoch": 0.0891304347826087, + "grad_norm": 0.46061015129089355, + "learning_rate": 8e-05, + "loss": 1.886, + "step": 1599 + }, + { + "epoch": 0.08918617614269789, + "grad_norm": 0.42885342240333557, + "learning_rate": 8e-05, + "loss": 1.6767, + "step": 1600 + }, + { + "epoch": 0.08924191750278707, + "grad_norm": 0.4492940902709961, + "learning_rate": 8e-05, + "loss": 1.905, + "step": 1601 + }, + { + "epoch": 0.08929765886287626, + "grad_norm": 0.4751074016094208, + "learning_rate": 8e-05, + "loss": 1.7845, + "step": 1602 + }, + { + "epoch": 0.08935340022296544, + "grad_norm": 0.4468177855014801, + "learning_rate": 8e-05, + "loss": 1.906, + "step": 1603 + }, + { + "epoch": 0.08940914158305463, + "grad_norm": 0.43063390254974365, + "learning_rate": 8e-05, + "loss": 1.8162, + "step": 1604 + }, + { + "epoch": 0.08946488294314381, + "grad_norm": 0.43314266204833984, + "learning_rate": 8e-05, + "loss": 1.6541, + "step": 1605 + }, + { + "epoch": 0.089520624303233, + "grad_norm": 0.4799504578113556, + "learning_rate": 8e-05, + "loss": 1.7615, + "step": 1606 + }, + { + "epoch": 0.08957636566332218, + "grad_norm": 0.45094433426856995, + "learning_rate": 8e-05, + "loss": 1.7955, + "step": 1607 + }, + { + "epoch": 0.08963210702341137, + "grad_norm": 0.41243717074394226, + "learning_rate": 8e-05, + "loss": 1.7514, + 
"step": 1608 + }, + { + "epoch": 0.08968784838350055, + "grad_norm": 0.45868125557899475, + "learning_rate": 8e-05, + "loss": 1.7971, + "step": 1609 + }, + { + "epoch": 0.08974358974358974, + "grad_norm": 0.4831272065639496, + "learning_rate": 8e-05, + "loss": 1.8472, + "step": 1610 + }, + { + "epoch": 0.08979933110367894, + "grad_norm": 0.43193748593330383, + "learning_rate": 8e-05, + "loss": 1.589, + "step": 1611 + }, + { + "epoch": 0.08985507246376812, + "grad_norm": 0.4636029303073883, + "learning_rate": 8e-05, + "loss": 1.7388, + "step": 1612 + }, + { + "epoch": 0.08991081382385731, + "grad_norm": 0.40288928151130676, + "learning_rate": 8e-05, + "loss": 1.4132, + "step": 1613 + }, + { + "epoch": 0.08996655518394649, + "grad_norm": 0.44026824831962585, + "learning_rate": 8e-05, + "loss": 1.684, + "step": 1614 + }, + { + "epoch": 0.09002229654403568, + "grad_norm": 0.4749715328216553, + "learning_rate": 8e-05, + "loss": 1.8288, + "step": 1615 + }, + { + "epoch": 0.09007803790412486, + "grad_norm": 0.4861137568950653, + "learning_rate": 8e-05, + "loss": 1.7672, + "step": 1616 + }, + { + "epoch": 0.09013377926421405, + "grad_norm": 0.4515342712402344, + "learning_rate": 8e-05, + "loss": 1.6005, + "step": 1617 + }, + { + "epoch": 0.09018952062430323, + "grad_norm": 0.4273092448711395, + "learning_rate": 8e-05, + "loss": 1.5918, + "step": 1618 + }, + { + "epoch": 0.09024526198439242, + "grad_norm": 0.43101832270622253, + "learning_rate": 8e-05, + "loss": 1.8811, + "step": 1619 + }, + { + "epoch": 0.0903010033444816, + "grad_norm": 0.4761602580547333, + "learning_rate": 8e-05, + "loss": 1.8548, + "step": 1620 + }, + { + "epoch": 0.0903567447045708, + "grad_norm": 0.4239153563976288, + "learning_rate": 8e-05, + "loss": 1.7455, + "step": 1621 + }, + { + "epoch": 0.09041248606465997, + "grad_norm": 0.44429951906204224, + "learning_rate": 8e-05, + "loss": 1.8431, + "step": 1622 + }, + { + "epoch": 0.09046822742474916, + "grad_norm": 0.4681923985481262, + "learning_rate": 
8e-05, + "loss": 1.7589, + "step": 1623 + }, + { + "epoch": 0.09052396878483834, + "grad_norm": 0.41767820715904236, + "learning_rate": 8e-05, + "loss": 1.7157, + "step": 1624 + }, + { + "epoch": 0.09057971014492754, + "grad_norm": 0.6465441584587097, + "learning_rate": 8e-05, + "loss": 1.6272, + "step": 1625 + }, + { + "epoch": 0.09063545150501673, + "grad_norm": 0.48799723386764526, + "learning_rate": 8e-05, + "loss": 1.7972, + "step": 1626 + }, + { + "epoch": 0.09069119286510591, + "grad_norm": 0.45866358280181885, + "learning_rate": 8e-05, + "loss": 1.9811, + "step": 1627 + }, + { + "epoch": 0.0907469342251951, + "grad_norm": 0.4481167197227478, + "learning_rate": 8e-05, + "loss": 1.7685, + "step": 1628 + }, + { + "epoch": 0.09080267558528428, + "grad_norm": 0.4584820568561554, + "learning_rate": 8e-05, + "loss": 1.7461, + "step": 1629 + }, + { + "epoch": 0.09085841694537347, + "grad_norm": 0.45503467321395874, + "learning_rate": 8e-05, + "loss": 1.6809, + "step": 1630 + }, + { + "epoch": 0.09091415830546265, + "grad_norm": 0.4234899878501892, + "learning_rate": 8e-05, + "loss": 1.5796, + "step": 1631 + }, + { + "epoch": 0.09096989966555184, + "grad_norm": 0.47477665543556213, + "learning_rate": 8e-05, + "loss": 1.8594, + "step": 1632 + }, + { + "epoch": 0.09102564102564102, + "grad_norm": 0.44016775488853455, + "learning_rate": 8e-05, + "loss": 1.6832, + "step": 1633 + }, + { + "epoch": 0.09108138238573021, + "grad_norm": 0.4387587606906891, + "learning_rate": 8e-05, + "loss": 1.7215, + "step": 1634 + }, + { + "epoch": 0.09113712374581939, + "grad_norm": 0.4861997067928314, + "learning_rate": 8e-05, + "loss": 1.8869, + "step": 1635 + }, + { + "epoch": 0.09119286510590859, + "grad_norm": 0.42896974086761475, + "learning_rate": 8e-05, + "loss": 1.5212, + "step": 1636 + }, + { + "epoch": 0.09124860646599776, + "grad_norm": 0.43611395359039307, + "learning_rate": 8e-05, + "loss": 1.7206, + "step": 1637 + }, + { + "epoch": 0.09130434782608696, + "grad_norm": 
0.45591825246810913, + "learning_rate": 8e-05, + "loss": 1.8414, + "step": 1638 + }, + { + "epoch": 0.09136008918617615, + "grad_norm": 0.40458470582962036, + "learning_rate": 8e-05, + "loss": 1.5752, + "step": 1639 + }, + { + "epoch": 0.09141583054626533, + "grad_norm": 0.43076464533805847, + "learning_rate": 8e-05, + "loss": 1.7437, + "step": 1640 + }, + { + "epoch": 0.09147157190635452, + "grad_norm": 0.4555833637714386, + "learning_rate": 8e-05, + "loss": 1.6524, + "step": 1641 + }, + { + "epoch": 0.0915273132664437, + "grad_norm": 0.4160289168357849, + "learning_rate": 8e-05, + "loss": 1.4798, + "step": 1642 + }, + { + "epoch": 0.09158305462653289, + "grad_norm": 0.4631211459636688, + "learning_rate": 8e-05, + "loss": 1.779, + "step": 1643 + }, + { + "epoch": 0.09163879598662207, + "grad_norm": 0.44678810238838196, + "learning_rate": 8e-05, + "loss": 1.6644, + "step": 1644 + }, + { + "epoch": 0.09169453734671126, + "grad_norm": 0.45467978715896606, + "learning_rate": 8e-05, + "loss": 1.7205, + "step": 1645 + }, + { + "epoch": 0.09175027870680044, + "grad_norm": 0.4594002068042755, + "learning_rate": 8e-05, + "loss": 1.7903, + "step": 1646 + }, + { + "epoch": 0.09180602006688963, + "grad_norm": 0.4318181872367859, + "learning_rate": 8e-05, + "loss": 1.5232, + "step": 1647 + }, + { + "epoch": 0.09186176142697881, + "grad_norm": 0.4873585104942322, + "learning_rate": 8e-05, + "loss": 2.0432, + "step": 1648 + }, + { + "epoch": 0.091917502787068, + "grad_norm": 0.4591080844402313, + "learning_rate": 8e-05, + "loss": 1.8974, + "step": 1649 + }, + { + "epoch": 0.09197324414715718, + "grad_norm": 0.45263177156448364, + "learning_rate": 8e-05, + "loss": 1.8263, + "step": 1650 + }, + { + "epoch": 0.09202898550724638, + "grad_norm": 0.6010552644729614, + "learning_rate": 8e-05, + "loss": 1.7563, + "step": 1651 + }, + { + "epoch": 0.09208472686733556, + "grad_norm": 0.49455901980400085, + "learning_rate": 8e-05, + "loss": 1.7556, + "step": 1652 + }, + { + "epoch": 
0.09214046822742475, + "grad_norm": 0.4340011179447174, + "learning_rate": 8e-05, + "loss": 1.6308, + "step": 1653 + }, + { + "epoch": 0.09219620958751394, + "grad_norm": 0.4920664429664612, + "learning_rate": 8e-05, + "loss": 1.8942, + "step": 1654 + }, + { + "epoch": 0.09225195094760312, + "grad_norm": 0.4639732241630554, + "learning_rate": 8e-05, + "loss": 1.9255, + "step": 1655 + }, + { + "epoch": 0.09230769230769231, + "grad_norm": 0.4229355752468109, + "learning_rate": 8e-05, + "loss": 1.7046, + "step": 1656 + }, + { + "epoch": 0.09236343366778149, + "grad_norm": 0.42051416635513306, + "learning_rate": 8e-05, + "loss": 1.7474, + "step": 1657 + }, + { + "epoch": 0.09241917502787068, + "grad_norm": 0.409633070230484, + "learning_rate": 8e-05, + "loss": 1.9147, + "step": 1658 + }, + { + "epoch": 0.09247491638795986, + "grad_norm": 0.4497150480747223, + "learning_rate": 8e-05, + "loss": 1.6605, + "step": 1659 + }, + { + "epoch": 0.09253065774804906, + "grad_norm": 0.42622387409210205, + "learning_rate": 8e-05, + "loss": 1.6316, + "step": 1660 + }, + { + "epoch": 0.09258639910813823, + "grad_norm": 0.5611870288848877, + "learning_rate": 8e-05, + "loss": 1.8821, + "step": 1661 + }, + { + "epoch": 0.09264214046822743, + "grad_norm": 0.5077276229858398, + "learning_rate": 8e-05, + "loss": 1.6538, + "step": 1662 + }, + { + "epoch": 0.0926978818283166, + "grad_norm": 0.47354525327682495, + "learning_rate": 8e-05, + "loss": 1.7993, + "step": 1663 + }, + { + "epoch": 0.0927536231884058, + "grad_norm": 0.4800141751766205, + "learning_rate": 8e-05, + "loss": 1.8998, + "step": 1664 + }, + { + "epoch": 0.09280936454849498, + "grad_norm": 0.47717806696891785, + "learning_rate": 8e-05, + "loss": 1.8321, + "step": 1665 + }, + { + "epoch": 0.09286510590858417, + "grad_norm": 0.44892174005508423, + "learning_rate": 8e-05, + "loss": 1.4733, + "step": 1666 + }, + { + "epoch": 0.09292084726867336, + "grad_norm": 0.48279869556427, + "learning_rate": 8e-05, + "loss": 1.7329, + "step": 
1667 + }, + { + "epoch": 0.09297658862876254, + "grad_norm": 0.4728907346725464, + "learning_rate": 8e-05, + "loss": 1.803, + "step": 1668 + }, + { + "epoch": 0.09303232998885173, + "grad_norm": 0.4132857918739319, + "learning_rate": 8e-05, + "loss": 1.6814, + "step": 1669 + }, + { + "epoch": 0.09308807134894091, + "grad_norm": 0.4486645460128784, + "learning_rate": 8e-05, + "loss": 1.5777, + "step": 1670 + }, + { + "epoch": 0.0931438127090301, + "grad_norm": 0.47105464339256287, + "learning_rate": 8e-05, + "loss": 1.7778, + "step": 1671 + }, + { + "epoch": 0.09319955406911928, + "grad_norm": 0.4360397160053253, + "learning_rate": 8e-05, + "loss": 1.7292, + "step": 1672 + }, + { + "epoch": 0.09325529542920848, + "grad_norm": 0.3917115330696106, + "learning_rate": 8e-05, + "loss": 1.728, + "step": 1673 + }, + { + "epoch": 0.09331103678929765, + "grad_norm": 0.40846002101898193, + "learning_rate": 8e-05, + "loss": 1.6767, + "step": 1674 + }, + { + "epoch": 0.09336677814938685, + "grad_norm": 0.43595507740974426, + "learning_rate": 8e-05, + "loss": 1.6926, + "step": 1675 + }, + { + "epoch": 0.09342251950947603, + "grad_norm": 0.4381343424320221, + "learning_rate": 8e-05, + "loss": 1.6388, + "step": 1676 + }, + { + "epoch": 0.09347826086956522, + "grad_norm": 0.4457002878189087, + "learning_rate": 8e-05, + "loss": 1.7086, + "step": 1677 + }, + { + "epoch": 0.0935340022296544, + "grad_norm": 0.44507187604904175, + "learning_rate": 8e-05, + "loss": 1.553, + "step": 1678 + }, + { + "epoch": 0.09358974358974359, + "grad_norm": 0.46890974044799805, + "learning_rate": 8e-05, + "loss": 1.8152, + "step": 1679 + }, + { + "epoch": 0.09364548494983277, + "grad_norm": 0.5010303258895874, + "learning_rate": 8e-05, + "loss": 1.9007, + "step": 1680 + }, + { + "epoch": 0.09370122630992196, + "grad_norm": 0.44895729422569275, + "learning_rate": 8e-05, + "loss": 1.6811, + "step": 1681 + }, + { + "epoch": 0.09375696767001115, + "grad_norm": 0.44381895661354065, + "learning_rate": 8e-05, 
+ "loss": 1.7254, + "step": 1682 + }, + { + "epoch": 0.09381270903010033, + "grad_norm": 0.42607060074806213, + "learning_rate": 8e-05, + "loss": 1.6745, + "step": 1683 + }, + { + "epoch": 0.09386845039018953, + "grad_norm": 0.4664064347743988, + "learning_rate": 8e-05, + "loss": 1.8605, + "step": 1684 + }, + { + "epoch": 0.0939241917502787, + "grad_norm": 0.4705875813961029, + "learning_rate": 8e-05, + "loss": 1.9974, + "step": 1685 + }, + { + "epoch": 0.0939799331103679, + "grad_norm": 0.4623195230960846, + "learning_rate": 8e-05, + "loss": 1.6875, + "step": 1686 + }, + { + "epoch": 0.09403567447045708, + "grad_norm": 0.4435681998729706, + "learning_rate": 8e-05, + "loss": 1.7876, + "step": 1687 + }, + { + "epoch": 0.09409141583054627, + "grad_norm": 0.4290086627006531, + "learning_rate": 8e-05, + "loss": 1.7866, + "step": 1688 + }, + { + "epoch": 0.09414715719063545, + "grad_norm": 0.47279369831085205, + "learning_rate": 8e-05, + "loss": 1.8753, + "step": 1689 + }, + { + "epoch": 0.09420289855072464, + "grad_norm": 0.43134960532188416, + "learning_rate": 8e-05, + "loss": 1.6947, + "step": 1690 + }, + { + "epoch": 0.09425863991081382, + "grad_norm": 0.588997483253479, + "learning_rate": 8e-05, + "loss": 2.0056, + "step": 1691 + }, + { + "epoch": 0.09431438127090301, + "grad_norm": 0.48162195086479187, + "learning_rate": 8e-05, + "loss": 1.9277, + "step": 1692 + }, + { + "epoch": 0.09437012263099219, + "grad_norm": 0.42257192730903625, + "learning_rate": 8e-05, + "loss": 1.6565, + "step": 1693 + }, + { + "epoch": 0.09442586399108138, + "grad_norm": 0.41762158274650574, + "learning_rate": 8e-05, + "loss": 1.7561, + "step": 1694 + }, + { + "epoch": 0.09448160535117058, + "grad_norm": 0.44371649622917175, + "learning_rate": 8e-05, + "loss": 1.5394, + "step": 1695 + }, + { + "epoch": 0.09453734671125975, + "grad_norm": 0.4283082187175751, + "learning_rate": 8e-05, + "loss": 1.586, + "step": 1696 + }, + { + "epoch": 0.09459308807134895, + "grad_norm": 
0.4454692304134369, + "learning_rate": 8e-05, + "loss": 1.6537, + "step": 1697 + }, + { + "epoch": 0.09464882943143813, + "grad_norm": 0.463825523853302, + "learning_rate": 8e-05, + "loss": 1.8293, + "step": 1698 + }, + { + "epoch": 0.09470457079152732, + "grad_norm": 0.41332781314849854, + "learning_rate": 8e-05, + "loss": 1.7457, + "step": 1699 + }, + { + "epoch": 0.0947603121516165, + "grad_norm": 0.42744752764701843, + "learning_rate": 8e-05, + "loss": 1.616, + "step": 1700 + }, + { + "epoch": 0.09481605351170569, + "grad_norm": 0.4430151581764221, + "learning_rate": 8e-05, + "loss": 1.7341, + "step": 1701 + }, + { + "epoch": 0.09487179487179487, + "grad_norm": 0.48058566451072693, + "learning_rate": 8e-05, + "loss": 1.8291, + "step": 1702 + }, + { + "epoch": 0.09492753623188406, + "grad_norm": 0.4263574481010437, + "learning_rate": 8e-05, + "loss": 1.6344, + "step": 1703 + }, + { + "epoch": 0.09498327759197324, + "grad_norm": 0.4660191237926483, + "learning_rate": 8e-05, + "loss": 1.7992, + "step": 1704 + }, + { + "epoch": 0.09503901895206243, + "grad_norm": 0.44191208481788635, + "learning_rate": 8e-05, + "loss": 1.7902, + "step": 1705 + }, + { + "epoch": 0.09509476031215161, + "grad_norm": 0.4195825159549713, + "learning_rate": 8e-05, + "loss": 1.5295, + "step": 1706 + }, + { + "epoch": 0.0951505016722408, + "grad_norm": 0.4643992483615875, + "learning_rate": 8e-05, + "loss": 1.8399, + "step": 1707 + }, + { + "epoch": 0.09520624303233, + "grad_norm": 0.4567433297634125, + "learning_rate": 8e-05, + "loss": 1.7278, + "step": 1708 + }, + { + "epoch": 0.09526198439241917, + "grad_norm": 0.4396064579486847, + "learning_rate": 8e-05, + "loss": 1.5918, + "step": 1709 + }, + { + "epoch": 0.09531772575250837, + "grad_norm": 0.47985053062438965, + "learning_rate": 8e-05, + "loss": 1.8334, + "step": 1710 + }, + { + "epoch": 0.09537346711259755, + "grad_norm": 0.46934187412261963, + "learning_rate": 8e-05, + "loss": 1.8587, + "step": 1711 + }, + { + "epoch": 
0.09542920847268674, + "grad_norm": 0.4805198311805725, + "learning_rate": 8e-05, + "loss": 1.9085, + "step": 1712 + }, + { + "epoch": 0.09548494983277592, + "grad_norm": 0.47350266575813293, + "learning_rate": 8e-05, + "loss": 1.8195, + "step": 1713 + }, + { + "epoch": 0.09554069119286511, + "grad_norm": 0.46012142300605774, + "learning_rate": 8e-05, + "loss": 1.784, + "step": 1714 + }, + { + "epoch": 0.09559643255295429, + "grad_norm": 0.49588704109191895, + "learning_rate": 8e-05, + "loss": 1.9803, + "step": 1715 + }, + { + "epoch": 0.09565217391304348, + "grad_norm": 0.4539101719856262, + "learning_rate": 8e-05, + "loss": 1.8789, + "step": 1716 + }, + { + "epoch": 0.09570791527313266, + "grad_norm": 0.432472288608551, + "learning_rate": 8e-05, + "loss": 1.6145, + "step": 1717 + }, + { + "epoch": 0.09576365663322185, + "grad_norm": 0.47179466485977173, + "learning_rate": 8e-05, + "loss": 1.7848, + "step": 1718 + }, + { + "epoch": 0.09581939799331103, + "grad_norm": 0.42406386137008667, + "learning_rate": 8e-05, + "loss": 1.4585, + "step": 1719 + }, + { + "epoch": 0.09587513935340022, + "grad_norm": 0.442560613155365, + "learning_rate": 8e-05, + "loss": 1.7944, + "step": 1720 + }, + { + "epoch": 0.0959308807134894, + "grad_norm": 0.4633161425590515, + "learning_rate": 8e-05, + "loss": 1.7961, + "step": 1721 + }, + { + "epoch": 0.0959866220735786, + "grad_norm": 0.44542253017425537, + "learning_rate": 8e-05, + "loss": 1.6882, + "step": 1722 + }, + { + "epoch": 0.09604236343366779, + "grad_norm": 0.6311537623405457, + "learning_rate": 8e-05, + "loss": 1.7164, + "step": 1723 + }, + { + "epoch": 0.09609810479375697, + "grad_norm": 0.40386149287223816, + "learning_rate": 8e-05, + "loss": 1.6877, + "step": 1724 + }, + { + "epoch": 0.09615384615384616, + "grad_norm": 0.4860149621963501, + "learning_rate": 8e-05, + "loss": 1.7702, + "step": 1725 + }, + { + "epoch": 0.09620958751393534, + "grad_norm": 0.4561831057071686, + "learning_rate": 8e-05, + "loss": 1.7912, + 
"step": 1726 + }, + { + "epoch": 0.09626532887402453, + "grad_norm": 0.48729828000068665, + "learning_rate": 8e-05, + "loss": 1.816, + "step": 1727 + }, + { + "epoch": 0.09632107023411371, + "grad_norm": 0.4261293411254883, + "learning_rate": 8e-05, + "loss": 1.585, + "step": 1728 + }, + { + "epoch": 0.0963768115942029, + "grad_norm": 0.4322323799133301, + "learning_rate": 8e-05, + "loss": 1.5954, + "step": 1729 + }, + { + "epoch": 0.09643255295429208, + "grad_norm": 0.4254898130893707, + "learning_rate": 8e-05, + "loss": 1.5349, + "step": 1730 + }, + { + "epoch": 0.09648829431438127, + "grad_norm": 0.4253242611885071, + "learning_rate": 8e-05, + "loss": 1.8648, + "step": 1731 + }, + { + "epoch": 0.09654403567447045, + "grad_norm": 0.4364851117134094, + "learning_rate": 8e-05, + "loss": 1.5873, + "step": 1732 + }, + { + "epoch": 0.09659977703455964, + "grad_norm": 0.4467128813266754, + "learning_rate": 8e-05, + "loss": 1.8267, + "step": 1733 + }, + { + "epoch": 0.09665551839464882, + "grad_norm": 0.45185184478759766, + "learning_rate": 8e-05, + "loss": 1.604, + "step": 1734 + }, + { + "epoch": 0.09671125975473802, + "grad_norm": 0.4952625632286072, + "learning_rate": 8e-05, + "loss": 1.8941, + "step": 1735 + }, + { + "epoch": 0.09676700111482721, + "grad_norm": 0.4527432918548584, + "learning_rate": 8e-05, + "loss": 1.5787, + "step": 1736 + }, + { + "epoch": 0.09682274247491639, + "grad_norm": 0.5307126045227051, + "learning_rate": 8e-05, + "loss": 2.1902, + "step": 1737 + }, + { + "epoch": 0.09687848383500558, + "grad_norm": 0.45969170331954956, + "learning_rate": 8e-05, + "loss": 1.8296, + "step": 1738 + }, + { + "epoch": 0.09693422519509476, + "grad_norm": 0.4471434950828552, + "learning_rate": 8e-05, + "loss": 1.711, + "step": 1739 + }, + { + "epoch": 0.09698996655518395, + "grad_norm": 0.45372387766838074, + "learning_rate": 8e-05, + "loss": 1.8327, + "step": 1740 + }, + { + "epoch": 0.09704570791527313, + "grad_norm": 0.4237399995326996, + "learning_rate": 
8e-05, + "loss": 1.7176, + "step": 1741 + }, + { + "epoch": 0.09710144927536232, + "grad_norm": 0.4647175967693329, + "learning_rate": 8e-05, + "loss": 1.577, + "step": 1742 + }, + { + "epoch": 0.0971571906354515, + "grad_norm": 0.46522289514541626, + "learning_rate": 8e-05, + "loss": 1.6859, + "step": 1743 + }, + { + "epoch": 0.0972129319955407, + "grad_norm": 0.4213174879550934, + "learning_rate": 8e-05, + "loss": 1.6958, + "step": 1744 + }, + { + "epoch": 0.09726867335562987, + "grad_norm": 0.4828932583332062, + "learning_rate": 8e-05, + "loss": 1.7898, + "step": 1745 + }, + { + "epoch": 0.09732441471571907, + "grad_norm": 0.4179130494594574, + "learning_rate": 8e-05, + "loss": 1.4819, + "step": 1746 + }, + { + "epoch": 0.09738015607580824, + "grad_norm": 0.4496394097805023, + "learning_rate": 8e-05, + "loss": 1.7487, + "step": 1747 + }, + { + "epoch": 0.09743589743589744, + "grad_norm": 0.4822089374065399, + "learning_rate": 8e-05, + "loss": 1.7319, + "step": 1748 + }, + { + "epoch": 0.09749163879598662, + "grad_norm": 0.454473614692688, + "learning_rate": 8e-05, + "loss": 1.6915, + "step": 1749 + }, + { + "epoch": 0.09754738015607581, + "grad_norm": 0.46822261810302734, + "learning_rate": 8e-05, + "loss": 1.9073, + "step": 1750 + }, + { + "epoch": 0.097603121516165, + "grad_norm": 0.44990289211273193, + "learning_rate": 8e-05, + "loss": 1.7303, + "step": 1751 + }, + { + "epoch": 0.09765886287625418, + "grad_norm": 0.5409207344055176, + "learning_rate": 8e-05, + "loss": 1.7165, + "step": 1752 + }, + { + "epoch": 0.09771460423634337, + "grad_norm": 0.43510881066322327, + "learning_rate": 8e-05, + "loss": 1.6771, + "step": 1753 + }, + { + "epoch": 0.09777034559643255, + "grad_norm": 0.5424200892448425, + "learning_rate": 8e-05, + "loss": 2.2344, + "step": 1754 + }, + { + "epoch": 0.09782608695652174, + "grad_norm": 0.4397199749946594, + "learning_rate": 8e-05, + "loss": 1.701, + "step": 1755 + }, + { + "epoch": 0.09788182831661092, + "grad_norm": 
0.40220770239830017, + "learning_rate": 8e-05, + "loss": 1.5287, + "step": 1756 + }, + { + "epoch": 0.09793756967670011, + "grad_norm": 0.4556962847709656, + "learning_rate": 8e-05, + "loss": 1.7051, + "step": 1757 + }, + { + "epoch": 0.0979933110367893, + "grad_norm": 0.4355781376361847, + "learning_rate": 8e-05, + "loss": 1.7602, + "step": 1758 + }, + { + "epoch": 0.09804905239687849, + "grad_norm": 0.41649800539016724, + "learning_rate": 8e-05, + "loss": 1.44, + "step": 1759 + }, + { + "epoch": 0.09810479375696766, + "grad_norm": 0.4674367606639862, + "learning_rate": 8e-05, + "loss": 1.7817, + "step": 1760 + }, + { + "epoch": 0.09816053511705686, + "grad_norm": 0.45049193501472473, + "learning_rate": 8e-05, + "loss": 1.6204, + "step": 1761 + }, + { + "epoch": 0.09821627647714604, + "grad_norm": 0.45521673560142517, + "learning_rate": 8e-05, + "loss": 1.7195, + "step": 1762 + }, + { + "epoch": 0.09827201783723523, + "grad_norm": 0.4201885759830475, + "learning_rate": 8e-05, + "loss": 1.3177, + "step": 1763 + }, + { + "epoch": 0.09832775919732442, + "grad_norm": 0.4933217763900757, + "learning_rate": 8e-05, + "loss": 1.8507, + "step": 1764 + }, + { + "epoch": 0.0983835005574136, + "grad_norm": 0.4303235411643982, + "learning_rate": 8e-05, + "loss": 1.7071, + "step": 1765 + }, + { + "epoch": 0.09843924191750279, + "grad_norm": 0.4998074769973755, + "learning_rate": 8e-05, + "loss": 1.5192, + "step": 1766 + }, + { + "epoch": 0.09849498327759197, + "grad_norm": 0.4623011648654938, + "learning_rate": 8e-05, + "loss": 1.8838, + "step": 1767 + }, + { + "epoch": 0.09855072463768116, + "grad_norm": 0.486713171005249, + "learning_rate": 8e-05, + "loss": 1.9141, + "step": 1768 + }, + { + "epoch": 0.09860646599777034, + "grad_norm": 0.5125125050544739, + "learning_rate": 8e-05, + "loss": 1.9007, + "step": 1769 + }, + { + "epoch": 0.09866220735785954, + "grad_norm": 0.4394649565219879, + "learning_rate": 8e-05, + "loss": 1.669, + "step": 1770 + }, + { + "epoch": 
0.09871794871794871, + "grad_norm": 0.4292559325695038, + "learning_rate": 8e-05, + "loss": 1.6525, + "step": 1771 + }, + { + "epoch": 0.0987736900780379, + "grad_norm": 0.47850099205970764, + "learning_rate": 8e-05, + "loss": 1.5841, + "step": 1772 + }, + { + "epoch": 0.09882943143812709, + "grad_norm": 0.4640037715435028, + "learning_rate": 8e-05, + "loss": 1.8342, + "step": 1773 + }, + { + "epoch": 0.09888517279821628, + "grad_norm": 0.47437646985054016, + "learning_rate": 8e-05, + "loss": 1.9839, + "step": 1774 + }, + { + "epoch": 0.09894091415830546, + "grad_norm": 0.511099636554718, + "learning_rate": 8e-05, + "loss": 1.9201, + "step": 1775 + }, + { + "epoch": 0.09899665551839465, + "grad_norm": 0.42483294010162354, + "learning_rate": 8e-05, + "loss": 1.6935, + "step": 1776 + }, + { + "epoch": 0.09905239687848383, + "grad_norm": 0.4893593490123749, + "learning_rate": 8e-05, + "loss": 1.9715, + "step": 1777 + }, + { + "epoch": 0.09910813823857302, + "grad_norm": 0.47139880061149597, + "learning_rate": 8e-05, + "loss": 1.6563, + "step": 1778 + }, + { + "epoch": 0.09916387959866221, + "grad_norm": 0.4339061379432678, + "learning_rate": 8e-05, + "loss": 1.5654, + "step": 1779 + }, + { + "epoch": 0.09921962095875139, + "grad_norm": 0.4311170279979706, + "learning_rate": 8e-05, + "loss": 1.5791, + "step": 1780 + }, + { + "epoch": 0.09927536231884058, + "grad_norm": 0.42284849286079407, + "learning_rate": 8e-05, + "loss": 1.6309, + "step": 1781 + }, + { + "epoch": 0.09933110367892976, + "grad_norm": 0.442691832780838, + "learning_rate": 8e-05, + "loss": 1.8009, + "step": 1782 + }, + { + "epoch": 0.09938684503901896, + "grad_norm": 0.4467600882053375, + "learning_rate": 8e-05, + "loss": 1.6598, + "step": 1783 + }, + { + "epoch": 0.09944258639910813, + "grad_norm": 0.48540544509887695, + "learning_rate": 8e-05, + "loss": 1.7799, + "step": 1784 + }, + { + "epoch": 0.09949832775919733, + "grad_norm": 0.48900890350341797, + "learning_rate": 8e-05, + "loss": 1.8178, + 
"step": 1785 + }, + { + "epoch": 0.0995540691192865, + "grad_norm": 0.4849342107772827, + "learning_rate": 8e-05, + "loss": 1.5124, + "step": 1786 + }, + { + "epoch": 0.0996098104793757, + "grad_norm": 0.43086501955986023, + "learning_rate": 8e-05, + "loss": 1.4617, + "step": 1787 + }, + { + "epoch": 0.09966555183946488, + "grad_norm": 0.4691721200942993, + "learning_rate": 8e-05, + "loss": 1.8552, + "step": 1788 + }, + { + "epoch": 0.09972129319955407, + "grad_norm": 0.5271967649459839, + "learning_rate": 8e-05, + "loss": 1.8783, + "step": 1789 + }, + { + "epoch": 0.09977703455964325, + "grad_norm": 0.4975190758705139, + "learning_rate": 8e-05, + "loss": 1.7687, + "step": 1790 + }, + { + "epoch": 0.09983277591973244, + "grad_norm": 0.4852193593978882, + "learning_rate": 8e-05, + "loss": 1.9312, + "step": 1791 + }, + { + "epoch": 0.09988851727982163, + "grad_norm": 0.4707431495189667, + "learning_rate": 8e-05, + "loss": 1.7425, + "step": 1792 + }, + { + "epoch": 0.09994425863991081, + "grad_norm": 0.4759067893028259, + "learning_rate": 8e-05, + "loss": 1.6955, + "step": 1793 + }, + { + "epoch": 0.1, + "grad_norm": 0.43870437145233154, + "learning_rate": 8e-05, + "loss": 1.4649, + "step": 1794 + }, + { + "epoch": 0.10005574136008918, + "grad_norm": 0.5789974331855774, + "learning_rate": 8e-05, + "loss": 1.6798, + "step": 1795 + }, + { + "epoch": 0.10011148272017838, + "grad_norm": 0.45335498452186584, + "learning_rate": 8e-05, + "loss": 1.6491, + "step": 1796 + }, + { + "epoch": 0.10016722408026756, + "grad_norm": 0.5454590320587158, + "learning_rate": 8e-05, + "loss": 1.7319, + "step": 1797 + }, + { + "epoch": 0.10022296544035675, + "grad_norm": 0.6508112549781799, + "learning_rate": 8e-05, + "loss": 1.9191, + "step": 1798 + }, + { + "epoch": 0.10027870680044593, + "grad_norm": 0.4346283972263336, + "learning_rate": 8e-05, + "loss": 1.5611, + "step": 1799 + }, + { + "epoch": 0.10033444816053512, + "grad_norm": 0.46036475896835327, + "learning_rate": 8e-05, + 
"loss": 1.8125, + "step": 1800 + }, + { + "epoch": 0.1003901895206243, + "grad_norm": 0.44356054067611694, + "learning_rate": 8e-05, + "loss": 1.6516, + "step": 1801 + }, + { + "epoch": 0.10044593088071349, + "grad_norm": 0.4230916500091553, + "learning_rate": 8e-05, + "loss": 1.6221, + "step": 1802 + }, + { + "epoch": 0.10050167224080267, + "grad_norm": 0.4426247775554657, + "learning_rate": 8e-05, + "loss": 1.7226, + "step": 1803 + }, + { + "epoch": 0.10055741360089186, + "grad_norm": 0.42851877212524414, + "learning_rate": 8e-05, + "loss": 1.6643, + "step": 1804 + }, + { + "epoch": 0.10061315496098104, + "grad_norm": 0.47624823451042175, + "learning_rate": 8e-05, + "loss": 1.8385, + "step": 1805 + }, + { + "epoch": 0.10066889632107023, + "grad_norm": 0.4784403145313263, + "learning_rate": 8e-05, + "loss": 1.7394, + "step": 1806 + }, + { + "epoch": 0.10072463768115943, + "grad_norm": 0.45466241240501404, + "learning_rate": 8e-05, + "loss": 1.6348, + "step": 1807 + }, + { + "epoch": 0.1007803790412486, + "grad_norm": 0.44868579506874084, + "learning_rate": 8e-05, + "loss": 1.834, + "step": 1808 + }, + { + "epoch": 0.1008361204013378, + "grad_norm": 0.4959905445575714, + "learning_rate": 8e-05, + "loss": 1.7847, + "step": 1809 + }, + { + "epoch": 0.10089186176142698, + "grad_norm": 0.4354304373264313, + "learning_rate": 8e-05, + "loss": 1.7289, + "step": 1810 + }, + { + "epoch": 0.10094760312151617, + "grad_norm": 0.4733140766620636, + "learning_rate": 8e-05, + "loss": 1.8648, + "step": 1811 + }, + { + "epoch": 0.10100334448160535, + "grad_norm": 0.4682197868824005, + "learning_rate": 8e-05, + "loss": 1.8402, + "step": 1812 + }, + { + "epoch": 0.10105908584169454, + "grad_norm": 0.4327491521835327, + "learning_rate": 8e-05, + "loss": 1.8034, + "step": 1813 + }, + { + "epoch": 0.10111482720178372, + "grad_norm": 0.4196842610836029, + "learning_rate": 8e-05, + "loss": 1.5267, + "step": 1814 + }, + { + "epoch": 0.10117056856187291, + "grad_norm": 0.4950825870037079, + 
"learning_rate": 8e-05, + "loss": 1.7586, + "step": 1815 + }, + { + "epoch": 0.10122630992196209, + "grad_norm": 0.47089883685112, + "learning_rate": 8e-05, + "loss": 1.6667, + "step": 1816 + }, + { + "epoch": 0.10128205128205128, + "grad_norm": 0.418155312538147, + "learning_rate": 8e-05, + "loss": 1.6951, + "step": 1817 + }, + { + "epoch": 0.10133779264214046, + "grad_norm": 0.4778137803077698, + "learning_rate": 8e-05, + "loss": 1.7777, + "step": 1818 + }, + { + "epoch": 0.10139353400222965, + "grad_norm": 0.44255295395851135, + "learning_rate": 8e-05, + "loss": 1.7593, + "step": 1819 + }, + { + "epoch": 0.10144927536231885, + "grad_norm": 0.43998026847839355, + "learning_rate": 8e-05, + "loss": 1.8034, + "step": 1820 + }, + { + "epoch": 0.10150501672240803, + "grad_norm": 0.4900084435939789, + "learning_rate": 8e-05, + "loss": 1.835, + "step": 1821 + }, + { + "epoch": 0.10156075808249722, + "grad_norm": 0.44791969656944275, + "learning_rate": 8e-05, + "loss": 1.6511, + "step": 1822 + }, + { + "epoch": 0.1016164994425864, + "grad_norm": 0.4674259126186371, + "learning_rate": 8e-05, + "loss": 1.7348, + "step": 1823 + }, + { + "epoch": 0.10167224080267559, + "grad_norm": 0.44596970081329346, + "learning_rate": 8e-05, + "loss": 1.5885, + "step": 1824 + }, + { + "epoch": 0.10172798216276477, + "grad_norm": 0.4487878084182739, + "learning_rate": 8e-05, + "loss": 1.7307, + "step": 1825 + }, + { + "epoch": 0.10178372352285396, + "grad_norm": 0.4508936107158661, + "learning_rate": 8e-05, + "loss": 1.8602, + "step": 1826 + }, + { + "epoch": 0.10183946488294314, + "grad_norm": 0.44692227244377136, + "learning_rate": 8e-05, + "loss": 1.6913, + "step": 1827 + }, + { + "epoch": 0.10189520624303233, + "grad_norm": 0.47166091203689575, + "learning_rate": 8e-05, + "loss": 1.641, + "step": 1828 + }, + { + "epoch": 0.10195094760312151, + "grad_norm": 0.43342047929763794, + "learning_rate": 8e-05, + "loss": 1.6667, + "step": 1829 + }, + { + "epoch": 0.1020066889632107, + 
"grad_norm": 0.514275848865509, + "learning_rate": 8e-05, + "loss": 1.9541, + "step": 1830 + }, + { + "epoch": 0.10206243032329988, + "grad_norm": 0.45009151101112366, + "learning_rate": 8e-05, + "loss": 1.7608, + "step": 1831 + }, + { + "epoch": 0.10211817168338908, + "grad_norm": 0.43719324469566345, + "learning_rate": 8e-05, + "loss": 1.7147, + "step": 1832 + }, + { + "epoch": 0.10217391304347827, + "grad_norm": 0.4204999804496765, + "learning_rate": 8e-05, + "loss": 1.6842, + "step": 1833 + }, + { + "epoch": 0.10222965440356745, + "grad_norm": 0.4463106095790863, + "learning_rate": 8e-05, + "loss": 1.6066, + "step": 1834 + }, + { + "epoch": 0.10228539576365664, + "grad_norm": 0.4450172781944275, + "learning_rate": 8e-05, + "loss": 1.6528, + "step": 1835 + }, + { + "epoch": 0.10234113712374582, + "grad_norm": 0.5039206743240356, + "learning_rate": 8e-05, + "loss": 1.8365, + "step": 1836 + }, + { + "epoch": 0.10239687848383501, + "grad_norm": 0.45737943053245544, + "learning_rate": 8e-05, + "loss": 1.6945, + "step": 1837 + }, + { + "epoch": 0.10245261984392419, + "grad_norm": 0.48224934935569763, + "learning_rate": 8e-05, + "loss": 1.5769, + "step": 1838 + }, + { + "epoch": 0.10250836120401338, + "grad_norm": 0.4501650333404541, + "learning_rate": 8e-05, + "loss": 1.732, + "step": 1839 + }, + { + "epoch": 0.10256410256410256, + "grad_norm": 0.45120200514793396, + "learning_rate": 8e-05, + "loss": 1.7743, + "step": 1840 + }, + { + "epoch": 0.10261984392419175, + "grad_norm": 0.5067091584205627, + "learning_rate": 8e-05, + "loss": 1.8807, + "step": 1841 + }, + { + "epoch": 0.10267558528428093, + "grad_norm": 0.4903699457645416, + "learning_rate": 8e-05, + "loss": 1.7831, + "step": 1842 + }, + { + "epoch": 0.10273132664437012, + "grad_norm": 0.4485124945640564, + "learning_rate": 8e-05, + "loss": 1.7676, + "step": 1843 + }, + { + "epoch": 0.1027870680044593, + "grad_norm": 0.49188125133514404, + "learning_rate": 8e-05, + "loss": 1.9028, + "step": 1844 + }, + { + 
"epoch": 0.1028428093645485, + "grad_norm": 0.45281144976615906, + "learning_rate": 8e-05, + "loss": 1.7229, + "step": 1845 + }, + { + "epoch": 0.10289855072463767, + "grad_norm": 0.48761826753616333, + "learning_rate": 8e-05, + "loss": 1.8395, + "step": 1846 + }, + { + "epoch": 0.10295429208472687, + "grad_norm": 0.47400587797164917, + "learning_rate": 8e-05, + "loss": 1.8974, + "step": 1847 + }, + { + "epoch": 0.10301003344481606, + "grad_norm": 0.4568255543708801, + "learning_rate": 8e-05, + "loss": 1.8252, + "step": 1848 + }, + { + "epoch": 0.10306577480490524, + "grad_norm": 0.4409621059894562, + "learning_rate": 8e-05, + "loss": 1.6955, + "step": 1849 + }, + { + "epoch": 0.10312151616499443, + "grad_norm": 0.48078006505966187, + "learning_rate": 8e-05, + "loss": 1.795, + "step": 1850 + }, + { + "epoch": 0.10317725752508361, + "grad_norm": 0.48587360978126526, + "learning_rate": 8e-05, + "loss": 1.864, + "step": 1851 + }, + { + "epoch": 0.1032329988851728, + "grad_norm": 0.5020497441291809, + "learning_rate": 8e-05, + "loss": 1.8202, + "step": 1852 + }, + { + "epoch": 0.10328874024526198, + "grad_norm": 0.4517962634563446, + "learning_rate": 8e-05, + "loss": 1.5516, + "step": 1853 + }, + { + "epoch": 0.10334448160535117, + "grad_norm": 0.4889071583747864, + "learning_rate": 8e-05, + "loss": 1.9975, + "step": 1854 + }, + { + "epoch": 0.10340022296544035, + "grad_norm": 0.4774690568447113, + "learning_rate": 8e-05, + "loss": 1.8562, + "step": 1855 + }, + { + "epoch": 0.10345596432552955, + "grad_norm": 0.427127867937088, + "learning_rate": 8e-05, + "loss": 1.7057, + "step": 1856 + }, + { + "epoch": 0.10351170568561872, + "grad_norm": 0.4165712594985962, + "learning_rate": 8e-05, + "loss": 1.5408, + "step": 1857 + }, + { + "epoch": 0.10356744704570792, + "grad_norm": 0.4988797605037689, + "learning_rate": 8e-05, + "loss": 1.8549, + "step": 1858 + }, + { + "epoch": 0.1036231884057971, + "grad_norm": 0.45366349816322327, + "learning_rate": 8e-05, + "loss": 1.7291, 
+ "step": 1859 + }, + { + "epoch": 0.10367892976588629, + "grad_norm": 0.48266640305519104, + "learning_rate": 8e-05, + "loss": 1.8506, + "step": 1860 + }, + { + "epoch": 0.10373467112597548, + "grad_norm": 0.5486199855804443, + "learning_rate": 8e-05, + "loss": 1.9328, + "step": 1861 + }, + { + "epoch": 0.10379041248606466, + "grad_norm": 0.4452139735221863, + "learning_rate": 8e-05, + "loss": 1.6458, + "step": 1862 + }, + { + "epoch": 0.10384615384615385, + "grad_norm": 0.49109047651290894, + "learning_rate": 8e-05, + "loss": 1.9918, + "step": 1863 + }, + { + "epoch": 0.10390189520624303, + "grad_norm": 0.4817899763584137, + "learning_rate": 8e-05, + "loss": 1.8768, + "step": 1864 + }, + { + "epoch": 0.10395763656633222, + "grad_norm": 0.456006795167923, + "learning_rate": 8e-05, + "loss": 1.7827, + "step": 1865 + }, + { + "epoch": 0.1040133779264214, + "grad_norm": 0.4890350103378296, + "learning_rate": 8e-05, + "loss": 1.8561, + "step": 1866 + }, + { + "epoch": 0.1040691192865106, + "grad_norm": 0.46255168318748474, + "learning_rate": 8e-05, + "loss": 1.8764, + "step": 1867 + }, + { + "epoch": 0.10412486064659977, + "grad_norm": 0.4551663398742676, + "learning_rate": 8e-05, + "loss": 1.773, + "step": 1868 + }, + { + "epoch": 0.10418060200668897, + "grad_norm": 0.4570346772670746, + "learning_rate": 8e-05, + "loss": 1.7073, + "step": 1869 + }, + { + "epoch": 0.10423634336677814, + "grad_norm": 0.4652957618236542, + "learning_rate": 8e-05, + "loss": 1.871, + "step": 1870 + }, + { + "epoch": 0.10429208472686734, + "grad_norm": 0.45328280329704285, + "learning_rate": 8e-05, + "loss": 1.6733, + "step": 1871 + }, + { + "epoch": 0.10434782608695652, + "grad_norm": 0.4582420289516449, + "learning_rate": 8e-05, + "loss": 2.0648, + "step": 1872 + }, + { + "epoch": 0.10440356744704571, + "grad_norm": 0.4341772794723511, + "learning_rate": 8e-05, + "loss": 1.7146, + "step": 1873 + }, + { + "epoch": 0.10445930880713489, + "grad_norm": 0.4439215660095215, + "learning_rate": 
8e-05, + "loss": 1.7852, + "step": 1874 + }, + { + "epoch": 0.10451505016722408, + "grad_norm": 0.46234145760536194, + "learning_rate": 8e-05, + "loss": 1.7784, + "step": 1875 + }, + { + "epoch": 0.10457079152731327, + "grad_norm": 0.5071830749511719, + "learning_rate": 8e-05, + "loss": 1.9705, + "step": 1876 + }, + { + "epoch": 0.10462653288740245, + "grad_norm": 0.4649686813354492, + "learning_rate": 8e-05, + "loss": 1.6822, + "step": 1877 + }, + { + "epoch": 0.10468227424749164, + "grad_norm": 0.4438864588737488, + "learning_rate": 8e-05, + "loss": 1.8218, + "step": 1878 + }, + { + "epoch": 0.10473801560758082, + "grad_norm": 0.5034699440002441, + "learning_rate": 8e-05, + "loss": 1.8693, + "step": 1879 + }, + { + "epoch": 0.10479375696767002, + "grad_norm": 0.43608057498931885, + "learning_rate": 8e-05, + "loss": 1.7494, + "step": 1880 + }, + { + "epoch": 0.1048494983277592, + "grad_norm": 0.4907682538032532, + "learning_rate": 8e-05, + "loss": 1.9842, + "step": 1881 + }, + { + "epoch": 0.10490523968784839, + "grad_norm": 0.44603654742240906, + "learning_rate": 8e-05, + "loss": 1.6714, + "step": 1882 + }, + { + "epoch": 0.10496098104793757, + "grad_norm": 0.440072238445282, + "learning_rate": 8e-05, + "loss": 1.7894, + "step": 1883 + }, + { + "epoch": 0.10501672240802676, + "grad_norm": 0.501599133014679, + "learning_rate": 8e-05, + "loss": 1.9509, + "step": 1884 + }, + { + "epoch": 0.10507246376811594, + "grad_norm": 0.47615668177604675, + "learning_rate": 8e-05, + "loss": 1.6011, + "step": 1885 + }, + { + "epoch": 0.10512820512820513, + "grad_norm": 0.4609353840351105, + "learning_rate": 8e-05, + "loss": 1.799, + "step": 1886 + }, + { + "epoch": 0.10518394648829431, + "grad_norm": 0.4454839825630188, + "learning_rate": 8e-05, + "loss": 1.6604, + "step": 1887 + }, + { + "epoch": 0.1052396878483835, + "grad_norm": 0.44142767786979675, + "learning_rate": 8e-05, + "loss": 1.8667, + "step": 1888 + }, + { + "epoch": 0.1052954292084727, + "grad_norm": 
0.4566827416419983, + "learning_rate": 8e-05, + "loss": 1.3807, + "step": 1889 + }, + { + "epoch": 0.10535117056856187, + "grad_norm": 0.4766084551811218, + "learning_rate": 8e-05, + "loss": 1.7556, + "step": 1890 + }, + { + "epoch": 0.10540691192865106, + "grad_norm": 0.44653797149658203, + "learning_rate": 8e-05, + "loss": 1.721, + "step": 1891 + }, + { + "epoch": 0.10546265328874024, + "grad_norm": 0.4749918580055237, + "learning_rate": 8e-05, + "loss": 2.0207, + "step": 1892 + }, + { + "epoch": 0.10551839464882944, + "grad_norm": 0.42253586649894714, + "learning_rate": 8e-05, + "loss": 1.4832, + "step": 1893 + }, + { + "epoch": 0.10557413600891861, + "grad_norm": 0.46958550810813904, + "learning_rate": 8e-05, + "loss": 1.5483, + "step": 1894 + }, + { + "epoch": 0.10562987736900781, + "grad_norm": 0.4402949810028076, + "learning_rate": 8e-05, + "loss": 1.6613, + "step": 1895 + }, + { + "epoch": 0.10568561872909699, + "grad_norm": 0.49999094009399414, + "learning_rate": 8e-05, + "loss": 1.6266, + "step": 1896 + }, + { + "epoch": 0.10574136008918618, + "grad_norm": 0.47692495584487915, + "learning_rate": 8e-05, + "loss": 1.8586, + "step": 1897 + }, + { + "epoch": 0.10579710144927536, + "grad_norm": 0.48545780777931213, + "learning_rate": 8e-05, + "loss": 1.8865, + "step": 1898 + }, + { + "epoch": 0.10585284280936455, + "grad_norm": 0.45592087507247925, + "learning_rate": 8e-05, + "loss": 1.8979, + "step": 1899 + }, + { + "epoch": 0.10590858416945373, + "grad_norm": 0.5070605278015137, + "learning_rate": 8e-05, + "loss": 1.7738, + "step": 1900 + }, + { + "epoch": 0.10596432552954292, + "grad_norm": 0.5075011253356934, + "learning_rate": 8e-05, + "loss": 1.7382, + "step": 1901 + }, + { + "epoch": 0.1060200668896321, + "grad_norm": 0.4672543406486511, + "learning_rate": 8e-05, + "loss": 1.7201, + "step": 1902 + }, + { + "epoch": 0.10607580824972129, + "grad_norm": 0.4875164031982422, + "learning_rate": 8e-05, + "loss": 1.8483, + "step": 1903 + }, + { + "epoch": 
0.10613154960981049, + "grad_norm": 0.4881870448589325, + "learning_rate": 8e-05, + "loss": 1.6607, + "step": 1904 + }, + { + "epoch": 0.10618729096989966, + "grad_norm": 0.5196061730384827, + "learning_rate": 8e-05, + "loss": 1.8877, + "step": 1905 + }, + { + "epoch": 0.10624303232998886, + "grad_norm": 0.45339685678482056, + "learning_rate": 8e-05, + "loss": 1.8615, + "step": 1906 + }, + { + "epoch": 0.10629877369007804, + "grad_norm": 0.4748283624649048, + "learning_rate": 8e-05, + "loss": 1.6601, + "step": 1907 + }, + { + "epoch": 0.10635451505016723, + "grad_norm": 0.41848766803741455, + "learning_rate": 8e-05, + "loss": 1.6985, + "step": 1908 + }, + { + "epoch": 0.1064102564102564, + "grad_norm": 0.4328663945198059, + "learning_rate": 8e-05, + "loss": 1.6866, + "step": 1909 + }, + { + "epoch": 0.1064659977703456, + "grad_norm": 0.4594240188598633, + "learning_rate": 8e-05, + "loss": 1.831, + "step": 1910 + }, + { + "epoch": 0.10652173913043478, + "grad_norm": 0.48182612657546997, + "learning_rate": 8e-05, + "loss": 1.6941, + "step": 1911 + }, + { + "epoch": 0.10657748049052397, + "grad_norm": 0.4915633797645569, + "learning_rate": 8e-05, + "loss": 1.739, + "step": 1912 + }, + { + "epoch": 0.10663322185061315, + "grad_norm": 0.44918638467788696, + "learning_rate": 8e-05, + "loss": 1.6886, + "step": 1913 + }, + { + "epoch": 0.10668896321070234, + "grad_norm": 0.45250630378723145, + "learning_rate": 8e-05, + "loss": 1.652, + "step": 1914 + }, + { + "epoch": 0.10674470457079152, + "grad_norm": 0.4650265574455261, + "learning_rate": 8e-05, + "loss": 1.5937, + "step": 1915 + }, + { + "epoch": 0.10680044593088071, + "grad_norm": 0.4698179364204407, + "learning_rate": 8e-05, + "loss": 1.7066, + "step": 1916 + }, + { + "epoch": 0.1068561872909699, + "grad_norm": 0.46050387620925903, + "learning_rate": 8e-05, + "loss": 1.6285, + "step": 1917 + }, + { + "epoch": 0.10691192865105908, + "grad_norm": 0.4757847785949707, + "learning_rate": 8e-05, + "loss": 1.9062, + "step": 
1918 + }, + { + "epoch": 0.10696767001114828, + "grad_norm": 0.48355957865715027, + "learning_rate": 8e-05, + "loss": 1.8416, + "step": 1919 + }, + { + "epoch": 0.10702341137123746, + "grad_norm": 0.4324290156364441, + "learning_rate": 8e-05, + "loss": 1.7597, + "step": 1920 + }, + { + "epoch": 0.10707915273132665, + "grad_norm": 0.48721253871917725, + "learning_rate": 8e-05, + "loss": 1.9141, + "step": 1921 + }, + { + "epoch": 0.10713489409141583, + "grad_norm": 0.5150349736213684, + "learning_rate": 8e-05, + "loss": 1.7725, + "step": 1922 + }, + { + "epoch": 0.10719063545150502, + "grad_norm": 0.4498967230319977, + "learning_rate": 8e-05, + "loss": 1.4825, + "step": 1923 + }, + { + "epoch": 0.1072463768115942, + "grad_norm": 0.7026382088661194, + "learning_rate": 8e-05, + "loss": 1.7359, + "step": 1924 + }, + { + "epoch": 0.10730211817168339, + "grad_norm": 0.42979228496551514, + "learning_rate": 8e-05, + "loss": 1.2196, + "step": 1925 + }, + { + "epoch": 0.10735785953177257, + "grad_norm": 0.4411061704158783, + "learning_rate": 8e-05, + "loss": 1.5994, + "step": 1926 + }, + { + "epoch": 0.10741360089186176, + "grad_norm": 0.4753207862377167, + "learning_rate": 8e-05, + "loss": 1.7305, + "step": 1927 + }, + { + "epoch": 0.10746934225195094, + "grad_norm": 0.4685523509979248, + "learning_rate": 8e-05, + "loss": 1.7912, + "step": 1928 + }, + { + "epoch": 0.10752508361204013, + "grad_norm": 0.4643222987651825, + "learning_rate": 8e-05, + "loss": 1.8235, + "step": 1929 + }, + { + "epoch": 0.10758082497212931, + "grad_norm": 0.44670549035072327, + "learning_rate": 8e-05, + "loss": 1.7247, + "step": 1930 + }, + { + "epoch": 0.1076365663322185, + "grad_norm": 0.4861903488636017, + "learning_rate": 8e-05, + "loss": 1.8318, + "step": 1931 + }, + { + "epoch": 0.1076923076923077, + "grad_norm": 0.5127159357070923, + "learning_rate": 8e-05, + "loss": 1.8395, + "step": 1932 + }, + { + "epoch": 0.10774804905239688, + "grad_norm": 0.46058979630470276, + "learning_rate": 8e-05, 
+ "loss": 1.7455, + "step": 1933 + }, + { + "epoch": 0.10780379041248607, + "grad_norm": 0.5476949214935303, + "learning_rate": 8e-05, + "loss": 1.9165, + "step": 1934 + }, + { + "epoch": 0.10785953177257525, + "grad_norm": 0.4545874297618866, + "learning_rate": 8e-05, + "loss": 1.6108, + "step": 1935 + }, + { + "epoch": 0.10791527313266444, + "grad_norm": 0.4586825668811798, + "learning_rate": 8e-05, + "loss": 1.743, + "step": 1936 + }, + { + "epoch": 0.10797101449275362, + "grad_norm": 0.48396754264831543, + "learning_rate": 8e-05, + "loss": 1.513, + "step": 1937 + }, + { + "epoch": 0.10802675585284281, + "grad_norm": 0.4474707543849945, + "learning_rate": 8e-05, + "loss": 1.7162, + "step": 1938 + }, + { + "epoch": 0.10808249721293199, + "grad_norm": 0.42385002970695496, + "learning_rate": 8e-05, + "loss": 1.5032, + "step": 1939 + }, + { + "epoch": 0.10813823857302118, + "grad_norm": 0.5376535654067993, + "learning_rate": 8e-05, + "loss": 1.5726, + "step": 1940 + }, + { + "epoch": 0.10819397993311036, + "grad_norm": 0.4649655818939209, + "learning_rate": 8e-05, + "loss": 1.7542, + "step": 1941 + }, + { + "epoch": 0.10824972129319956, + "grad_norm": 0.4395988881587982, + "learning_rate": 8e-05, + "loss": 1.5953, + "step": 1942 + }, + { + "epoch": 0.10830546265328873, + "grad_norm": 0.5266790390014648, + "learning_rate": 8e-05, + "loss": 2.0656, + "step": 1943 + }, + { + "epoch": 0.10836120401337793, + "grad_norm": 0.44322502613067627, + "learning_rate": 8e-05, + "loss": 1.8313, + "step": 1944 + }, + { + "epoch": 0.10841694537346712, + "grad_norm": 0.4520188868045807, + "learning_rate": 8e-05, + "loss": 1.7102, + "step": 1945 + }, + { + "epoch": 0.1084726867335563, + "grad_norm": 0.4372086822986603, + "learning_rate": 8e-05, + "loss": 1.6014, + "step": 1946 + }, + { + "epoch": 0.10852842809364549, + "grad_norm": 0.46225085854530334, + "learning_rate": 8e-05, + "loss": 1.8784, + "step": 1947 + }, + { + "epoch": 0.10858416945373467, + "grad_norm": 
0.49719974398612976, + "learning_rate": 8e-05, + "loss": 1.8832, + "step": 1948 + }, + { + "epoch": 0.10863991081382386, + "grad_norm": 0.4694206714630127, + "learning_rate": 8e-05, + "loss": 1.7833, + "step": 1949 + }, + { + "epoch": 0.10869565217391304, + "grad_norm": 0.5506940484046936, + "learning_rate": 8e-05, + "loss": 2.2823, + "step": 1950 + }, + { + "epoch": 0.10875139353400223, + "grad_norm": 0.5728739500045776, + "learning_rate": 8e-05, + "loss": 2.0483, + "step": 1951 + }, + { + "epoch": 0.10880713489409141, + "grad_norm": 0.44349315762519836, + "learning_rate": 8e-05, + "loss": 1.6299, + "step": 1952 + }, + { + "epoch": 0.1088628762541806, + "grad_norm": 0.49954578280448914, + "learning_rate": 8e-05, + "loss": 1.7425, + "step": 1953 + }, + { + "epoch": 0.10891861761426978, + "grad_norm": 0.5076021552085876, + "learning_rate": 8e-05, + "loss": 1.9896, + "step": 1954 + }, + { + "epoch": 0.10897435897435898, + "grad_norm": 0.48354780673980713, + "learning_rate": 8e-05, + "loss": 1.7661, + "step": 1955 + }, + { + "epoch": 0.10903010033444815, + "grad_norm": 0.47348281741142273, + "learning_rate": 8e-05, + "loss": 1.7341, + "step": 1956 + }, + { + "epoch": 0.10908584169453735, + "grad_norm": 0.45801085233688354, + "learning_rate": 8e-05, + "loss": 1.6604, + "step": 1957 + }, + { + "epoch": 0.10914158305462654, + "grad_norm": 0.4829603433609009, + "learning_rate": 8e-05, + "loss": 1.8312, + "step": 1958 + }, + { + "epoch": 0.10919732441471572, + "grad_norm": 0.4740946590900421, + "learning_rate": 8e-05, + "loss": 1.7841, + "step": 1959 + }, + { + "epoch": 0.10925306577480491, + "grad_norm": 0.46358588337898254, + "learning_rate": 8e-05, + "loss": 2.0598, + "step": 1960 + }, + { + "epoch": 0.10930880713489409, + "grad_norm": 0.44745883345603943, + "learning_rate": 8e-05, + "loss": 1.7173, + "step": 1961 + }, + { + "epoch": 0.10936454849498328, + "grad_norm": 0.45677393674850464, + "learning_rate": 8e-05, + "loss": 1.8382, + "step": 1962 + }, + { + "epoch": 
0.10942028985507246, + "grad_norm": 0.4677621126174927, + "learning_rate": 8e-05, + "loss": 1.7491, + "step": 1963 + }, + { + "epoch": 0.10947603121516165, + "grad_norm": 0.4459719955921173, + "learning_rate": 8e-05, + "loss": 1.8215, + "step": 1964 + }, + { + "epoch": 0.10953177257525083, + "grad_norm": 0.44011497497558594, + "learning_rate": 8e-05, + "loss": 1.4185, + "step": 1965 + }, + { + "epoch": 0.10958751393534003, + "grad_norm": 0.5114625692367554, + "learning_rate": 8e-05, + "loss": 2.0072, + "step": 1966 + }, + { + "epoch": 0.1096432552954292, + "grad_norm": 0.4603794813156128, + "learning_rate": 8e-05, + "loss": 1.8177, + "step": 1967 + }, + { + "epoch": 0.1096989966555184, + "grad_norm": 0.44069552421569824, + "learning_rate": 8e-05, + "loss": 1.6253, + "step": 1968 + }, + { + "epoch": 0.10975473801560758, + "grad_norm": 0.4534465968608856, + "learning_rate": 8e-05, + "loss": 1.5732, + "step": 1969 + }, + { + "epoch": 0.10981047937569677, + "grad_norm": 0.5088835954666138, + "learning_rate": 8e-05, + "loss": 1.549, + "step": 1970 + }, + { + "epoch": 0.10986622073578595, + "grad_norm": 0.472575843334198, + "learning_rate": 8e-05, + "loss": 1.7316, + "step": 1971 + }, + { + "epoch": 0.10992196209587514, + "grad_norm": 0.4615132510662079, + "learning_rate": 8e-05, + "loss": 1.6679, + "step": 1972 + }, + { + "epoch": 0.10997770345596433, + "grad_norm": 0.5491868853569031, + "learning_rate": 8e-05, + "loss": 1.7941, + "step": 1973 + }, + { + "epoch": 0.11003344481605351, + "grad_norm": 0.45354369282722473, + "learning_rate": 8e-05, + "loss": 1.7325, + "step": 1974 + }, + { + "epoch": 0.1100891861761427, + "grad_norm": 0.4760855436325073, + "learning_rate": 8e-05, + "loss": 1.7824, + "step": 1975 + }, + { + "epoch": 0.11014492753623188, + "grad_norm": 0.4518475830554962, + "learning_rate": 8e-05, + "loss": 1.6424, + "step": 1976 + }, + { + "epoch": 0.11020066889632107, + "grad_norm": 0.44643738865852356, + "learning_rate": 8e-05, + "loss": 1.7556, + "step": 
1977 + }, + { + "epoch": 0.11025641025641025, + "grad_norm": 0.46472617983818054, + "learning_rate": 8e-05, + "loss": 1.6799, + "step": 1978 + }, + { + "epoch": 0.11031215161649945, + "grad_norm": 0.46941208839416504, + "learning_rate": 8e-05, + "loss": 1.8121, + "step": 1979 + }, + { + "epoch": 0.11036789297658862, + "grad_norm": 0.47246110439300537, + "learning_rate": 8e-05, + "loss": 1.8077, + "step": 1980 + }, + { + "epoch": 0.11042363433667782, + "grad_norm": 0.4268743693828583, + "learning_rate": 8e-05, + "loss": 1.5715, + "step": 1981 + }, + { + "epoch": 0.110479375696767, + "grad_norm": 0.4658483564853668, + "learning_rate": 8e-05, + "loss": 1.9498, + "step": 1982 + }, + { + "epoch": 0.11053511705685619, + "grad_norm": 0.4351171851158142, + "learning_rate": 8e-05, + "loss": 1.6103, + "step": 1983 + }, + { + "epoch": 0.11059085841694537, + "grad_norm": 0.48690593242645264, + "learning_rate": 8e-05, + "loss": 1.8043, + "step": 1984 + }, + { + "epoch": 0.11064659977703456, + "grad_norm": 0.4481874108314514, + "learning_rate": 8e-05, + "loss": 1.7241, + "step": 1985 + }, + { + "epoch": 0.11070234113712375, + "grad_norm": 0.43319857120513916, + "learning_rate": 8e-05, + "loss": 1.7234, + "step": 1986 + }, + { + "epoch": 0.11075808249721293, + "grad_norm": 0.48339614272117615, + "learning_rate": 8e-05, + "loss": 1.7305, + "step": 1987 + }, + { + "epoch": 0.11081382385730212, + "grad_norm": 0.4544183313846588, + "learning_rate": 8e-05, + "loss": 1.8293, + "step": 1988 + }, + { + "epoch": 0.1108695652173913, + "grad_norm": 0.4810698330402374, + "learning_rate": 8e-05, + "loss": 1.7221, + "step": 1989 + }, + { + "epoch": 0.1109253065774805, + "grad_norm": 0.4192270040512085, + "learning_rate": 8e-05, + "loss": 1.6379, + "step": 1990 + }, + { + "epoch": 0.11098104793756967, + "grad_norm": 0.4315672516822815, + "learning_rate": 8e-05, + "loss": 1.543, + "step": 1991 + }, + { + "epoch": 0.11103678929765887, + "grad_norm": 0.4906083643436432, + "learning_rate": 8e-05, + 
"loss": 1.8446, + "step": 1992 + }, + { + "epoch": 0.11109253065774805, + "grad_norm": 0.42365047335624695, + "learning_rate": 8e-05, + "loss": 1.6205, + "step": 1993 + }, + { + "epoch": 0.11114827201783724, + "grad_norm": 0.51629239320755, + "learning_rate": 8e-05, + "loss": 1.9564, + "step": 1994 + }, + { + "epoch": 0.11120401337792642, + "grad_norm": 0.43064284324645996, + "learning_rate": 8e-05, + "loss": 1.5131, + "step": 1995 + }, + { + "epoch": 0.11125975473801561, + "grad_norm": 0.5297256112098694, + "learning_rate": 8e-05, + "loss": 2.0655, + "step": 1996 + }, + { + "epoch": 0.11131549609810479, + "grad_norm": 0.4883597493171692, + "learning_rate": 8e-05, + "loss": 1.7461, + "step": 1997 + }, + { + "epoch": 0.11137123745819398, + "grad_norm": 0.46033623814582825, + "learning_rate": 8e-05, + "loss": 1.7602, + "step": 1998 + }, + { + "epoch": 0.11142697881828316, + "grad_norm": 0.4744260311126709, + "learning_rate": 8e-05, + "loss": 1.9014, + "step": 1999 + }, + { + "epoch": 0.11148272017837235, + "grad_norm": 0.49645110964775085, + "learning_rate": 8e-05, + "loss": 1.8827, + "step": 2000 + }, + { + "epoch": 0.11153846153846154, + "grad_norm": 0.4630310833454132, + "learning_rate": 8e-05, + "loss": 1.7795, + "step": 2001 + }, + { + "epoch": 0.11159420289855072, + "grad_norm": 0.48973262310028076, + "learning_rate": 8e-05, + "loss": 1.8411, + "step": 2002 + }, + { + "epoch": 0.11164994425863992, + "grad_norm": 0.4827178716659546, + "learning_rate": 8e-05, + "loss": 1.7959, + "step": 2003 + }, + { + "epoch": 0.1117056856187291, + "grad_norm": 0.4800148904323578, + "learning_rate": 8e-05, + "loss": 1.8899, + "step": 2004 + }, + { + "epoch": 0.11176142697881829, + "grad_norm": 0.4850090444087982, + "learning_rate": 8e-05, + "loss": 1.7319, + "step": 2005 + }, + { + "epoch": 0.11181716833890747, + "grad_norm": 0.47121182084083557, + "learning_rate": 8e-05, + "loss": 1.6407, + "step": 2006 + }, + { + "epoch": 0.11187290969899666, + "grad_norm": 0.5369400382041931, 
+ "learning_rate": 8e-05, + "loss": 1.9465, + "step": 2007 + }, + { + "epoch": 0.11192865105908584, + "grad_norm": 0.4796384871006012, + "learning_rate": 8e-05, + "loss": 1.9416, + "step": 2008 + }, + { + "epoch": 0.11198439241917503, + "grad_norm": 0.4600585699081421, + "learning_rate": 8e-05, + "loss": 1.759, + "step": 2009 + }, + { + "epoch": 0.11204013377926421, + "grad_norm": 0.4528631567955017, + "learning_rate": 8e-05, + "loss": 1.6321, + "step": 2010 + }, + { + "epoch": 0.1120958751393534, + "grad_norm": 0.48515555262565613, + "learning_rate": 8e-05, + "loss": 1.7965, + "step": 2011 + }, + { + "epoch": 0.11215161649944258, + "grad_norm": 0.46082037687301636, + "learning_rate": 8e-05, + "loss": 1.7435, + "step": 2012 + }, + { + "epoch": 0.11220735785953177, + "grad_norm": 0.4584730863571167, + "learning_rate": 8e-05, + "loss": 1.7968, + "step": 2013 + }, + { + "epoch": 0.11226309921962097, + "grad_norm": 0.47898101806640625, + "learning_rate": 8e-05, + "loss": 1.7914, + "step": 2014 + }, + { + "epoch": 0.11231884057971014, + "grad_norm": 0.4609237313270569, + "learning_rate": 8e-05, + "loss": 1.7662, + "step": 2015 + }, + { + "epoch": 0.11237458193979934, + "grad_norm": 0.4741979241371155, + "learning_rate": 8e-05, + "loss": 1.682, + "step": 2016 + }, + { + "epoch": 0.11243032329988852, + "grad_norm": 0.41993820667266846, + "learning_rate": 8e-05, + "loss": 1.7958, + "step": 2017 + }, + { + "epoch": 0.11248606465997771, + "grad_norm": 0.45935675501823425, + "learning_rate": 8e-05, + "loss": 1.7562, + "step": 2018 + }, + { + "epoch": 0.11254180602006689, + "grad_norm": 0.46503686904907227, + "learning_rate": 8e-05, + "loss": 1.7462, + "step": 2019 + }, + { + "epoch": 0.11259754738015608, + "grad_norm": 0.445574551820755, + "learning_rate": 8e-05, + "loss": 1.6958, + "step": 2020 + }, + { + "epoch": 0.11265328874024526, + "grad_norm": 0.51012122631073, + "learning_rate": 8e-05, + "loss": 1.984, + "step": 2021 + }, + { + "epoch": 0.11270903010033445, + 
"grad_norm": 0.4418213367462158, + "learning_rate": 8e-05, + "loss": 1.7274, + "step": 2022 + }, + { + "epoch": 0.11276477146042363, + "grad_norm": 0.4851004481315613, + "learning_rate": 8e-05, + "loss": 1.8347, + "step": 2023 + }, + { + "epoch": 0.11282051282051282, + "grad_norm": 0.44825586676597595, + "learning_rate": 8e-05, + "loss": 1.6569, + "step": 2024 + }, + { + "epoch": 0.112876254180602, + "grad_norm": 0.45392194390296936, + "learning_rate": 8e-05, + "loss": 1.7268, + "step": 2025 + }, + { + "epoch": 0.1129319955406912, + "grad_norm": 0.4423568546772003, + "learning_rate": 8e-05, + "loss": 1.5794, + "step": 2026 + }, + { + "epoch": 0.11298773690078037, + "grad_norm": 0.49717170000076294, + "learning_rate": 8e-05, + "loss": 1.8071, + "step": 2027 + }, + { + "epoch": 0.11304347826086956, + "grad_norm": 0.4636692404747009, + "learning_rate": 8e-05, + "loss": 1.9055, + "step": 2028 + }, + { + "epoch": 0.11309921962095876, + "grad_norm": 0.4840092062950134, + "learning_rate": 8e-05, + "loss": 1.8431, + "step": 2029 + }, + { + "epoch": 0.11315496098104794, + "grad_norm": 0.44109421968460083, + "learning_rate": 8e-05, + "loss": 1.8299, + "step": 2030 + }, + { + "epoch": 0.11321070234113713, + "grad_norm": 0.47095662355422974, + "learning_rate": 8e-05, + "loss": 1.6746, + "step": 2031 + }, + { + "epoch": 0.11326644370122631, + "grad_norm": 0.4372450113296509, + "learning_rate": 8e-05, + "loss": 1.8052, + "step": 2032 + }, + { + "epoch": 0.1133221850613155, + "grad_norm": 0.5086802244186401, + "learning_rate": 8e-05, + "loss": 1.9591, + "step": 2033 + }, + { + "epoch": 0.11337792642140468, + "grad_norm": 0.4743632376194, + "learning_rate": 8e-05, + "loss": 1.888, + "step": 2034 + }, + { + "epoch": 0.11343366778149387, + "grad_norm": 0.4422663152217865, + "learning_rate": 8e-05, + "loss": 1.6306, + "step": 2035 + }, + { + "epoch": 0.11348940914158305, + "grad_norm": 0.44120973348617554, + "learning_rate": 8e-05, + "loss": 1.7567, + "step": 2036 + }, + { + "epoch": 
0.11354515050167224, + "grad_norm": 0.5340293049812317, + "learning_rate": 8e-05, + "loss": 1.8003, + "step": 2037 + }, + { + "epoch": 0.11360089186176142, + "grad_norm": 0.4521288573741913, + "learning_rate": 8e-05, + "loss": 1.6788, + "step": 2038 + }, + { + "epoch": 0.11365663322185061, + "grad_norm": 0.46381545066833496, + "learning_rate": 8e-05, + "loss": 1.8585, + "step": 2039 + }, + { + "epoch": 0.11371237458193979, + "grad_norm": 0.4723742604255676, + "learning_rate": 8e-05, + "loss": 1.624, + "step": 2040 + }, + { + "epoch": 0.11376811594202899, + "grad_norm": 0.48812541365623474, + "learning_rate": 8e-05, + "loss": 1.799, + "step": 2041 + }, + { + "epoch": 0.11382385730211818, + "grad_norm": 0.42873966693878174, + "learning_rate": 8e-05, + "loss": 1.4369, + "step": 2042 + }, + { + "epoch": 0.11387959866220736, + "grad_norm": 0.4686322510242462, + "learning_rate": 8e-05, + "loss": 1.7953, + "step": 2043 + }, + { + "epoch": 0.11393534002229655, + "grad_norm": 0.4819076657295227, + "learning_rate": 8e-05, + "loss": 1.9851, + "step": 2044 + }, + { + "epoch": 0.11399108138238573, + "grad_norm": 0.44066208600997925, + "learning_rate": 8e-05, + "loss": 1.8221, + "step": 2045 + }, + { + "epoch": 0.11404682274247492, + "grad_norm": 0.4927108585834503, + "learning_rate": 8e-05, + "loss": 1.8009, + "step": 2046 + }, + { + "epoch": 0.1141025641025641, + "grad_norm": 0.4504014849662781, + "learning_rate": 8e-05, + "loss": 1.5526, + "step": 2047 + }, + { + "epoch": 0.11415830546265329, + "grad_norm": 0.44212785363197327, + "learning_rate": 8e-05, + "loss": 1.7123, + "step": 2048 + }, + { + "epoch": 0.11421404682274247, + "grad_norm": 0.46705907583236694, + "learning_rate": 8e-05, + "loss": 1.7985, + "step": 2049 + }, + { + "epoch": 0.11426978818283166, + "grad_norm": 0.43998247385025024, + "learning_rate": 8e-05, + "loss": 1.4829, + "step": 2050 + }, + { + "epoch": 0.11432552954292084, + "grad_norm": 0.48790040612220764, + "learning_rate": 8e-05, + "loss": 1.9512, + 
"step": 2051 + }, + { + "epoch": 0.11438127090301003, + "grad_norm": 0.4267139434814453, + "learning_rate": 8e-05, + "loss": 1.5513, + "step": 2052 + }, + { + "epoch": 0.11443701226309921, + "grad_norm": 0.428985059261322, + "learning_rate": 8e-05, + "loss": 1.6513, + "step": 2053 + }, + { + "epoch": 0.1144927536231884, + "grad_norm": 0.47211799025535583, + "learning_rate": 8e-05, + "loss": 1.7737, + "step": 2054 + }, + { + "epoch": 0.11454849498327759, + "grad_norm": 0.4569706618785858, + "learning_rate": 8e-05, + "loss": 1.7369, + "step": 2055 + }, + { + "epoch": 0.11460423634336678, + "grad_norm": 0.44445621967315674, + "learning_rate": 8e-05, + "loss": 1.8191, + "step": 2056 + }, + { + "epoch": 0.11465997770345597, + "grad_norm": 0.497089147567749, + "learning_rate": 8e-05, + "loss": 1.79, + "step": 2057 + }, + { + "epoch": 0.11471571906354515, + "grad_norm": 0.46985799074172974, + "learning_rate": 8e-05, + "loss": 1.799, + "step": 2058 + }, + { + "epoch": 0.11477146042363434, + "grad_norm": 0.4886013865470886, + "learning_rate": 8e-05, + "loss": 1.9005, + "step": 2059 + }, + { + "epoch": 0.11482720178372352, + "grad_norm": 0.45066893100738525, + "learning_rate": 8e-05, + "loss": 1.3898, + "step": 2060 + }, + { + "epoch": 0.11488294314381271, + "grad_norm": 0.4732798933982849, + "learning_rate": 8e-05, + "loss": 1.7565, + "step": 2061 + }, + { + "epoch": 0.11493868450390189, + "grad_norm": 0.469097375869751, + "learning_rate": 8e-05, + "loss": 1.6626, + "step": 2062 + }, + { + "epoch": 0.11499442586399108, + "grad_norm": 0.475689560174942, + "learning_rate": 8e-05, + "loss": 1.7464, + "step": 2063 + }, + { + "epoch": 0.11505016722408026, + "grad_norm": 0.4490555226802826, + "learning_rate": 8e-05, + "loss": 1.7317, + "step": 2064 + }, + { + "epoch": 0.11510590858416946, + "grad_norm": 0.4971592426300049, + "learning_rate": 8e-05, + "loss": 1.7497, + "step": 2065 + }, + { + "epoch": 0.11516164994425863, + "grad_norm": 0.4859224855899811, + "learning_rate": 
8e-05, + "loss": 1.7051, + "step": 2066 + }, + { + "epoch": 0.11521739130434783, + "grad_norm": 0.5182693600654602, + "learning_rate": 8e-05, + "loss": 1.6759, + "step": 2067 + }, + { + "epoch": 0.115273132664437, + "grad_norm": 0.5141357183456421, + "learning_rate": 8e-05, + "loss": 1.968, + "step": 2068 + }, + { + "epoch": 0.1153288740245262, + "grad_norm": 0.4426954686641693, + "learning_rate": 8e-05, + "loss": 1.6516, + "step": 2069 + }, + { + "epoch": 0.11538461538461539, + "grad_norm": 0.47207707166671753, + "learning_rate": 8e-05, + "loss": 2.1095, + "step": 2070 + }, + { + "epoch": 0.11544035674470457, + "grad_norm": 0.48400771617889404, + "learning_rate": 8e-05, + "loss": 1.6553, + "step": 2071 + }, + { + "epoch": 0.11549609810479376, + "grad_norm": 0.5170599818229675, + "learning_rate": 8e-05, + "loss": 1.9569, + "step": 2072 + }, + { + "epoch": 0.11555183946488294, + "grad_norm": 0.4554281234741211, + "learning_rate": 8e-05, + "loss": 1.7292, + "step": 2073 + }, + { + "epoch": 0.11560758082497213, + "grad_norm": 0.443513423204422, + "learning_rate": 8e-05, + "loss": 1.5986, + "step": 2074 + }, + { + "epoch": 0.11566332218506131, + "grad_norm": 0.44155359268188477, + "learning_rate": 8e-05, + "loss": 1.698, + "step": 2075 + }, + { + "epoch": 0.1157190635451505, + "grad_norm": 0.48619598150253296, + "learning_rate": 8e-05, + "loss": 1.863, + "step": 2076 + }, + { + "epoch": 0.11577480490523968, + "grad_norm": 0.42969754338264465, + "learning_rate": 8e-05, + "loss": 1.4224, + "step": 2077 + }, + { + "epoch": 0.11583054626532888, + "grad_norm": 0.44721049070358276, + "learning_rate": 8e-05, + "loss": 1.6163, + "step": 2078 + }, + { + "epoch": 0.11588628762541806, + "grad_norm": 0.474103718996048, + "learning_rate": 8e-05, + "loss": 1.8101, + "step": 2079 + }, + { + "epoch": 0.11594202898550725, + "grad_norm": 0.40180113911628723, + "learning_rate": 8e-05, + "loss": 1.2825, + "step": 2080 + }, + { + "epoch": 0.11599777034559643, + "grad_norm": 
0.4595925211906433, + "learning_rate": 8e-05, + "loss": 1.5944, + "step": 2081 + }, + { + "epoch": 0.11605351170568562, + "grad_norm": 0.4501091539859772, + "learning_rate": 8e-05, + "loss": 1.6415, + "step": 2082 + }, + { + "epoch": 0.11610925306577481, + "grad_norm": 0.4637793004512787, + "learning_rate": 8e-05, + "loss": 1.9328, + "step": 2083 + }, + { + "epoch": 0.11616499442586399, + "grad_norm": 0.486345112323761, + "learning_rate": 8e-05, + "loss": 1.8168, + "step": 2084 + }, + { + "epoch": 0.11622073578595318, + "grad_norm": 0.4314326047897339, + "learning_rate": 8e-05, + "loss": 1.7082, + "step": 2085 + }, + { + "epoch": 0.11627647714604236, + "grad_norm": 0.4588695466518402, + "learning_rate": 8e-05, + "loss": 1.801, + "step": 2086 + }, + { + "epoch": 0.11633221850613155, + "grad_norm": 0.4157659709453583, + "learning_rate": 8e-05, + "loss": 1.5326, + "step": 2087 + }, + { + "epoch": 0.11638795986622073, + "grad_norm": 0.4738490879535675, + "learning_rate": 8e-05, + "loss": 1.8684, + "step": 2088 + }, + { + "epoch": 0.11644370122630993, + "grad_norm": 0.47257229685783386, + "learning_rate": 8e-05, + "loss": 1.8574, + "step": 2089 + }, + { + "epoch": 0.1164994425863991, + "grad_norm": 0.4345060884952545, + "learning_rate": 8e-05, + "loss": 1.6715, + "step": 2090 + }, + { + "epoch": 0.1165551839464883, + "grad_norm": 0.4843539893627167, + "learning_rate": 8e-05, + "loss": 1.8623, + "step": 2091 + }, + { + "epoch": 0.11661092530657748, + "grad_norm": 0.4963710904121399, + "learning_rate": 8e-05, + "loss": 2.03, + "step": 2092 + }, + { + "epoch": 0.11666666666666667, + "grad_norm": 0.4864402413368225, + "learning_rate": 8e-05, + "loss": 1.6748, + "step": 2093 + }, + { + "epoch": 0.11672240802675585, + "grad_norm": 0.4512304663658142, + "learning_rate": 8e-05, + "loss": 1.725, + "step": 2094 + }, + { + "epoch": 0.11677814938684504, + "grad_norm": 0.5019859671592712, + "learning_rate": 8e-05, + "loss": 1.8322, + "step": 2095 + }, + { + "epoch": 
0.11683389074693422, + "grad_norm": 0.44922488927841187, + "learning_rate": 8e-05, + "loss": 1.8584, + "step": 2096 + }, + { + "epoch": 0.11688963210702341, + "grad_norm": 0.5823623538017273, + "learning_rate": 8e-05, + "loss": 1.9786, + "step": 2097 + }, + { + "epoch": 0.1169453734671126, + "grad_norm": 0.5128968954086304, + "learning_rate": 8e-05, + "loss": 1.3316, + "step": 2098 + }, + { + "epoch": 0.11700111482720178, + "grad_norm": 0.4815669357776642, + "learning_rate": 8e-05, + "loss": 1.7753, + "step": 2099 + }, + { + "epoch": 0.11705685618729098, + "grad_norm": 0.44617798924446106, + "learning_rate": 8e-05, + "loss": 1.4031, + "step": 2100 + }, + { + "epoch": 0.11711259754738015, + "grad_norm": 0.4319689869880676, + "learning_rate": 8e-05, + "loss": 1.6541, + "step": 2101 + }, + { + "epoch": 0.11716833890746935, + "grad_norm": 0.47626733779907227, + "learning_rate": 8e-05, + "loss": 1.8199, + "step": 2102 + }, + { + "epoch": 0.11722408026755853, + "grad_norm": 0.5096200108528137, + "learning_rate": 8e-05, + "loss": 1.9839, + "step": 2103 + }, + { + "epoch": 0.11727982162764772, + "grad_norm": 0.4460195302963257, + "learning_rate": 8e-05, + "loss": 1.6491, + "step": 2104 + }, + { + "epoch": 0.1173355629877369, + "grad_norm": 0.4559648036956787, + "learning_rate": 8e-05, + "loss": 1.6841, + "step": 2105 + }, + { + "epoch": 0.11739130434782609, + "grad_norm": 0.46103864908218384, + "learning_rate": 8e-05, + "loss": 1.7696, + "step": 2106 + }, + { + "epoch": 0.11744704570791527, + "grad_norm": 0.46135443449020386, + "learning_rate": 8e-05, + "loss": 1.7862, + "step": 2107 + }, + { + "epoch": 0.11750278706800446, + "grad_norm": 0.4671906530857086, + "learning_rate": 8e-05, + "loss": 1.8382, + "step": 2108 + }, + { + "epoch": 0.11755852842809364, + "grad_norm": 0.39926496148109436, + "learning_rate": 8e-05, + "loss": 1.4544, + "step": 2109 + }, + { + "epoch": 0.11761426978818283, + "grad_norm": 0.4120591878890991, + "learning_rate": 8e-05, + "loss": 1.5544, + 
"step": 2110 + }, + { + "epoch": 0.11767001114827202, + "grad_norm": 0.4811738431453705, + "learning_rate": 8e-05, + "loss": 1.8538, + "step": 2111 + }, + { + "epoch": 0.1177257525083612, + "grad_norm": 0.4601971507072449, + "learning_rate": 8e-05, + "loss": 1.8839, + "step": 2112 + }, + { + "epoch": 0.1177814938684504, + "grad_norm": 0.48387205600738525, + "learning_rate": 8e-05, + "loss": 1.9743, + "step": 2113 + }, + { + "epoch": 0.11783723522853957, + "grad_norm": 0.4420911371707916, + "learning_rate": 8e-05, + "loss": 1.5835, + "step": 2114 + }, + { + "epoch": 0.11789297658862877, + "grad_norm": 0.4618477523326874, + "learning_rate": 8e-05, + "loss": 1.7418, + "step": 2115 + }, + { + "epoch": 0.11794871794871795, + "grad_norm": 0.44071194529533386, + "learning_rate": 8e-05, + "loss": 1.6977, + "step": 2116 + }, + { + "epoch": 0.11800445930880714, + "grad_norm": 0.5029857754707336, + "learning_rate": 8e-05, + "loss": 1.6455, + "step": 2117 + }, + { + "epoch": 0.11806020066889632, + "grad_norm": 0.5218876004219055, + "learning_rate": 8e-05, + "loss": 1.7321, + "step": 2118 + }, + { + "epoch": 0.11811594202898551, + "grad_norm": 0.4479510486125946, + "learning_rate": 8e-05, + "loss": 1.772, + "step": 2119 + }, + { + "epoch": 0.11817168338907469, + "grad_norm": 0.45699721574783325, + "learning_rate": 8e-05, + "loss": 1.6361, + "step": 2120 + }, + { + "epoch": 0.11822742474916388, + "grad_norm": 0.45344969630241394, + "learning_rate": 8e-05, + "loss": 1.6322, + "step": 2121 + }, + { + "epoch": 0.11828316610925306, + "grad_norm": 0.48492100834846497, + "learning_rate": 8e-05, + "loss": 1.7463, + "step": 2122 + }, + { + "epoch": 0.11833890746934225, + "grad_norm": 0.4809948205947876, + "learning_rate": 8e-05, + "loss": 1.6205, + "step": 2123 + }, + { + "epoch": 0.11839464882943143, + "grad_norm": 0.4564463198184967, + "learning_rate": 8e-05, + "loss": 1.6662, + "step": 2124 + }, + { + "epoch": 0.11845039018952062, + "grad_norm": 0.46469205617904663, + 
"learning_rate": 8e-05, + "loss": 1.6511, + "step": 2125 + }, + { + "epoch": 0.11850613154960982, + "grad_norm": 0.4757656157016754, + "learning_rate": 8e-05, + "loss": 1.8471, + "step": 2126 + }, + { + "epoch": 0.118561872909699, + "grad_norm": 0.424377977848053, + "learning_rate": 8e-05, + "loss": 1.5451, + "step": 2127 + }, + { + "epoch": 0.11861761426978819, + "grad_norm": 0.46142521500587463, + "learning_rate": 8e-05, + "loss": 1.7637, + "step": 2128 + }, + { + "epoch": 0.11867335562987737, + "grad_norm": 0.4726645052433014, + "learning_rate": 8e-05, + "loss": 1.8552, + "step": 2129 + }, + { + "epoch": 0.11872909698996656, + "grad_norm": 0.4805576801300049, + "learning_rate": 8e-05, + "loss": 1.9007, + "step": 2130 + }, + { + "epoch": 0.11878483835005574, + "grad_norm": 0.4529826045036316, + "learning_rate": 8e-05, + "loss": 1.5354, + "step": 2131 + }, + { + "epoch": 0.11884057971014493, + "grad_norm": 0.4534931182861328, + "learning_rate": 8e-05, + "loss": 1.6667, + "step": 2132 + }, + { + "epoch": 0.11889632107023411, + "grad_norm": 0.4818662106990814, + "learning_rate": 8e-05, + "loss": 1.6025, + "step": 2133 + }, + { + "epoch": 0.1189520624303233, + "grad_norm": 0.4961552917957306, + "learning_rate": 8e-05, + "loss": 1.8231, + "step": 2134 + }, + { + "epoch": 0.11900780379041248, + "grad_norm": 0.5023543834686279, + "learning_rate": 8e-05, + "loss": 1.9326, + "step": 2135 + }, + { + "epoch": 0.11906354515050167, + "grad_norm": 0.4256194829940796, + "learning_rate": 8e-05, + "loss": 1.6839, + "step": 2136 + }, + { + "epoch": 0.11911928651059085, + "grad_norm": 0.4191765785217285, + "learning_rate": 8e-05, + "loss": 1.5835, + "step": 2137 + }, + { + "epoch": 0.11917502787068004, + "grad_norm": 0.44517984986305237, + "learning_rate": 8e-05, + "loss": 1.5365, + "step": 2138 + }, + { + "epoch": 0.11923076923076924, + "grad_norm": 0.4512975811958313, + "learning_rate": 8e-05, + "loss": 1.681, + "step": 2139 + }, + { + "epoch": 0.11928651059085842, + "grad_norm": 
0.4347441792488098, + "learning_rate": 8e-05, + "loss": 1.7185, + "step": 2140 + }, + { + "epoch": 0.11934225195094761, + "grad_norm": 0.46845176815986633, + "learning_rate": 8e-05, + "loss": 1.663, + "step": 2141 + }, + { + "epoch": 0.11939799331103679, + "grad_norm": 0.47879666090011597, + "learning_rate": 8e-05, + "loss": 1.7167, + "step": 2142 + }, + { + "epoch": 0.11945373467112598, + "grad_norm": 0.49749916791915894, + "learning_rate": 8e-05, + "loss": 1.8381, + "step": 2143 + }, + { + "epoch": 0.11950947603121516, + "grad_norm": 0.4949910640716553, + "learning_rate": 8e-05, + "loss": 1.9321, + "step": 2144 + }, + { + "epoch": 0.11956521739130435, + "grad_norm": 0.4539514482021332, + "learning_rate": 8e-05, + "loss": 1.7162, + "step": 2145 + }, + { + "epoch": 0.11962095875139353, + "grad_norm": 0.5238786339759827, + "learning_rate": 8e-05, + "loss": 1.6846, + "step": 2146 + }, + { + "epoch": 0.11967670011148272, + "grad_norm": 0.44461992383003235, + "learning_rate": 8e-05, + "loss": 1.6656, + "step": 2147 + }, + { + "epoch": 0.1197324414715719, + "grad_norm": 0.4791272282600403, + "learning_rate": 8e-05, + "loss": 1.9857, + "step": 2148 + }, + { + "epoch": 0.1197881828316611, + "grad_norm": 0.4683381915092468, + "learning_rate": 8e-05, + "loss": 1.6514, + "step": 2149 + }, + { + "epoch": 0.11984392419175027, + "grad_norm": 0.5213828682899475, + "learning_rate": 8e-05, + "loss": 1.8802, + "step": 2150 + }, + { + "epoch": 0.11989966555183947, + "grad_norm": 0.48594242334365845, + "learning_rate": 8e-05, + "loss": 1.8658, + "step": 2151 + }, + { + "epoch": 0.11995540691192864, + "grad_norm": 0.4764782190322876, + "learning_rate": 8e-05, + "loss": 1.7479, + "step": 2152 + }, + { + "epoch": 0.12001114827201784, + "grad_norm": 0.45884522795677185, + "learning_rate": 8e-05, + "loss": 1.8825, + "step": 2153 + }, + { + "epoch": 0.12006688963210703, + "grad_norm": 0.5118260979652405, + "learning_rate": 8e-05, + "loss": 2.047, + "step": 2154 + }, + { + "epoch": 
0.12012263099219621, + "grad_norm": 0.5420330166816711, + "learning_rate": 8e-05, + "loss": 2.1218, + "step": 2155 + }, + { + "epoch": 0.1201783723522854, + "grad_norm": 0.4370868504047394, + "learning_rate": 8e-05, + "loss": 1.5695, + "step": 2156 + }, + { + "epoch": 0.12023411371237458, + "grad_norm": 0.4741508960723877, + "learning_rate": 8e-05, + "loss": 1.654, + "step": 2157 + }, + { + "epoch": 0.12028985507246377, + "grad_norm": 0.4782915711402893, + "learning_rate": 8e-05, + "loss": 1.7332, + "step": 2158 + }, + { + "epoch": 0.12034559643255295, + "grad_norm": 0.5331648588180542, + "learning_rate": 8e-05, + "loss": 1.9393, + "step": 2159 + }, + { + "epoch": 0.12040133779264214, + "grad_norm": 0.4637068212032318, + "learning_rate": 8e-05, + "loss": 1.6524, + "step": 2160 + }, + { + "epoch": 0.12045707915273132, + "grad_norm": 0.4589218497276306, + "learning_rate": 8e-05, + "loss": 1.8953, + "step": 2161 + }, + { + "epoch": 0.12051282051282051, + "grad_norm": 0.4819149672985077, + "learning_rate": 8e-05, + "loss": 2.0129, + "step": 2162 + }, + { + "epoch": 0.1205685618729097, + "grad_norm": 0.437137246131897, + "learning_rate": 8e-05, + "loss": 1.5711, + "step": 2163 + }, + { + "epoch": 0.12062430323299889, + "grad_norm": 0.47422948479652405, + "learning_rate": 8e-05, + "loss": 1.7699, + "step": 2164 + }, + { + "epoch": 0.12068004459308806, + "grad_norm": 0.474104642868042, + "learning_rate": 8e-05, + "loss": 1.7519, + "step": 2165 + }, + { + "epoch": 0.12073578595317726, + "grad_norm": 0.4602581262588501, + "learning_rate": 8e-05, + "loss": 1.6553, + "step": 2166 + }, + { + "epoch": 0.12079152731326645, + "grad_norm": 0.5225976705551147, + "learning_rate": 8e-05, + "loss": 2.3767, + "step": 2167 + }, + { + "epoch": 0.12084726867335563, + "grad_norm": 0.45720773935317993, + "learning_rate": 8e-05, + "loss": 1.6674, + "step": 2168 + }, + { + "epoch": 0.12090301003344482, + "grad_norm": 0.4820588529109955, + "learning_rate": 8e-05, + "loss": 1.842, + "step": 
2169 + }, + { + "epoch": 0.120958751393534, + "grad_norm": 0.4850405752658844, + "learning_rate": 8e-05, + "loss": 1.8699, + "step": 2170 + }, + { + "epoch": 0.12101449275362319, + "grad_norm": 0.4590352177619934, + "learning_rate": 8e-05, + "loss": 1.7275, + "step": 2171 + }, + { + "epoch": 0.12107023411371237, + "grad_norm": 0.4691894054412842, + "learning_rate": 8e-05, + "loss": 1.8761, + "step": 2172 + }, + { + "epoch": 0.12112597547380156, + "grad_norm": 0.467676043510437, + "learning_rate": 8e-05, + "loss": 1.7993, + "step": 2173 + }, + { + "epoch": 0.12118171683389074, + "grad_norm": 0.4780999720096588, + "learning_rate": 8e-05, + "loss": 1.7737, + "step": 2174 + }, + { + "epoch": 0.12123745819397994, + "grad_norm": 0.46493837237358093, + "learning_rate": 8e-05, + "loss": 1.7838, + "step": 2175 + }, + { + "epoch": 0.12129319955406911, + "grad_norm": 0.4486934542655945, + "learning_rate": 8e-05, + "loss": 1.5086, + "step": 2176 + }, + { + "epoch": 0.12134894091415831, + "grad_norm": 0.4756641983985901, + "learning_rate": 8e-05, + "loss": 1.6553, + "step": 2177 + }, + { + "epoch": 0.12140468227424749, + "grad_norm": 0.47148486971855164, + "learning_rate": 8e-05, + "loss": 2.0012, + "step": 2178 + }, + { + "epoch": 0.12146042363433668, + "grad_norm": 0.45895448327064514, + "learning_rate": 8e-05, + "loss": 1.7609, + "step": 2179 + }, + { + "epoch": 0.12151616499442586, + "grad_norm": 0.4958876371383667, + "learning_rate": 8e-05, + "loss": 1.8134, + "step": 2180 + }, + { + "epoch": 0.12157190635451505, + "grad_norm": 0.4782322645187378, + "learning_rate": 8e-05, + "loss": 1.7053, + "step": 2181 + }, + { + "epoch": 0.12162764771460424, + "grad_norm": 0.4557296931743622, + "learning_rate": 8e-05, + "loss": 1.6757, + "step": 2182 + }, + { + "epoch": 0.12168338907469342, + "grad_norm": 0.4677014648914337, + "learning_rate": 8e-05, + "loss": 1.7492, + "step": 2183 + }, + { + "epoch": 0.12173913043478261, + "grad_norm": 0.47127747535705566, + "learning_rate": 8e-05, + 
"loss": 1.8121, + "step": 2184 + }, + { + "epoch": 0.12179487179487179, + "grad_norm": 0.44277313351631165, + "learning_rate": 8e-05, + "loss": 1.7282, + "step": 2185 + }, + { + "epoch": 0.12185061315496098, + "grad_norm": 0.4506374001502991, + "learning_rate": 8e-05, + "loss": 1.874, + "step": 2186 + }, + { + "epoch": 0.12190635451505016, + "grad_norm": 0.4785115122795105, + "learning_rate": 8e-05, + "loss": 1.8468, + "step": 2187 + }, + { + "epoch": 0.12196209587513936, + "grad_norm": 0.4615698456764221, + "learning_rate": 8e-05, + "loss": 1.7637, + "step": 2188 + }, + { + "epoch": 0.12201783723522854, + "grad_norm": 0.5574464797973633, + "learning_rate": 8e-05, + "loss": 1.7617, + "step": 2189 + }, + { + "epoch": 0.12207357859531773, + "grad_norm": 0.47425100207328796, + "learning_rate": 8e-05, + "loss": 1.7472, + "step": 2190 + }, + { + "epoch": 0.1221293199554069, + "grad_norm": 0.4637100100517273, + "learning_rate": 8e-05, + "loss": 1.7061, + "step": 2191 + }, + { + "epoch": 0.1221850613154961, + "grad_norm": 0.4725492000579834, + "learning_rate": 8e-05, + "loss": 1.7959, + "step": 2192 + }, + { + "epoch": 0.12224080267558528, + "grad_norm": 0.47741201519966125, + "learning_rate": 8e-05, + "loss": 1.854, + "step": 2193 + }, + { + "epoch": 0.12229654403567447, + "grad_norm": 0.4578995108604431, + "learning_rate": 8e-05, + "loss": 1.569, + "step": 2194 + }, + { + "epoch": 0.12235228539576366, + "grad_norm": 0.4601028263568878, + "learning_rate": 8e-05, + "loss": 1.6284, + "step": 2195 + }, + { + "epoch": 0.12240802675585284, + "grad_norm": 0.45650744438171387, + "learning_rate": 8e-05, + "loss": 1.6563, + "step": 2196 + }, + { + "epoch": 0.12246376811594203, + "grad_norm": 0.4996529221534729, + "learning_rate": 8e-05, + "loss": 1.8408, + "step": 2197 + }, + { + "epoch": 0.12251950947603121, + "grad_norm": 0.4483855068683624, + "learning_rate": 8e-05, + "loss": 1.5627, + "step": 2198 + }, + { + "epoch": 0.1225752508361204, + "grad_norm": 0.489122599363327, + 
"learning_rate": 8e-05, + "loss": 1.7472, + "step": 2199 + }, + { + "epoch": 0.12263099219620958, + "grad_norm": 0.4574604332447052, + "learning_rate": 8e-05, + "loss": 1.801, + "step": 2200 + }, + { + "epoch": 0.12268673355629878, + "grad_norm": 0.5230749249458313, + "learning_rate": 8e-05, + "loss": 2.0084, + "step": 2201 + }, + { + "epoch": 0.12274247491638796, + "grad_norm": 0.4647735357284546, + "learning_rate": 8e-05, + "loss": 1.6136, + "step": 2202 + }, + { + "epoch": 0.12279821627647715, + "grad_norm": 0.45680564641952515, + "learning_rate": 8e-05, + "loss": 1.5722, + "step": 2203 + }, + { + "epoch": 0.12285395763656633, + "grad_norm": 0.4625045955181122, + "learning_rate": 8e-05, + "loss": 1.7592, + "step": 2204 + }, + { + "epoch": 0.12290969899665552, + "grad_norm": 0.46245506405830383, + "learning_rate": 8e-05, + "loss": 1.5329, + "step": 2205 + }, + { + "epoch": 0.1229654403567447, + "grad_norm": 0.4651133418083191, + "learning_rate": 8e-05, + "loss": 1.7119, + "step": 2206 + }, + { + "epoch": 0.12302118171683389, + "grad_norm": 0.45406287908554077, + "learning_rate": 8e-05, + "loss": 1.4837, + "step": 2207 + }, + { + "epoch": 0.12307692307692308, + "grad_norm": 0.5053935050964355, + "learning_rate": 8e-05, + "loss": 1.8821, + "step": 2208 + }, + { + "epoch": 0.12313266443701226, + "grad_norm": 0.5143939256668091, + "learning_rate": 8e-05, + "loss": 1.8583, + "step": 2209 + }, + { + "epoch": 0.12318840579710146, + "grad_norm": 0.4740615487098694, + "learning_rate": 8e-05, + "loss": 1.8297, + "step": 2210 + }, + { + "epoch": 0.12324414715719063, + "grad_norm": 0.5244475603103638, + "learning_rate": 8e-05, + "loss": 1.7763, + "step": 2211 + }, + { + "epoch": 0.12329988851727983, + "grad_norm": 0.5088525414466858, + "learning_rate": 8e-05, + "loss": 2.1075, + "step": 2212 + }, + { + "epoch": 0.123355629877369, + "grad_norm": 0.48931679129600525, + "learning_rate": 8e-05, + "loss": 1.9784, + "step": 2213 + }, + { + "epoch": 0.1234113712374582, + 
"grad_norm": 0.4884590804576874, + "learning_rate": 8e-05, + "loss": 1.8409, + "step": 2214 + }, + { + "epoch": 0.12346711259754738, + "grad_norm": 0.4486399292945862, + "learning_rate": 8e-05, + "loss": 1.8856, + "step": 2215 + }, + { + "epoch": 0.12352285395763657, + "grad_norm": 0.4579851031303406, + "learning_rate": 8e-05, + "loss": 1.7836, + "step": 2216 + }, + { + "epoch": 0.12357859531772575, + "grad_norm": 0.40383824706077576, + "learning_rate": 8e-05, + "loss": 1.4822, + "step": 2217 + }, + { + "epoch": 0.12363433667781494, + "grad_norm": 0.4631842374801636, + "learning_rate": 8e-05, + "loss": 1.6552, + "step": 2218 + }, + { + "epoch": 0.12369007803790412, + "grad_norm": 0.47191739082336426, + "learning_rate": 8e-05, + "loss": 1.7517, + "step": 2219 + }, + { + "epoch": 0.12374581939799331, + "grad_norm": 0.47322702407836914, + "learning_rate": 8e-05, + "loss": 1.8699, + "step": 2220 + }, + { + "epoch": 0.12380156075808249, + "grad_norm": 0.48753586411476135, + "learning_rate": 8e-05, + "loss": 1.6901, + "step": 2221 + }, + { + "epoch": 0.12385730211817168, + "grad_norm": 0.489136278629303, + "learning_rate": 8e-05, + "loss": 1.9412, + "step": 2222 + }, + { + "epoch": 0.12391304347826088, + "grad_norm": 0.45695024728775024, + "learning_rate": 8e-05, + "loss": 1.8031, + "step": 2223 + }, + { + "epoch": 0.12396878483835005, + "grad_norm": 0.4342595636844635, + "learning_rate": 8e-05, + "loss": 1.733, + "step": 2224 + }, + { + "epoch": 0.12402452619843925, + "grad_norm": 0.5584686994552612, + "learning_rate": 8e-05, + "loss": 1.6239, + "step": 2225 + }, + { + "epoch": 0.12408026755852843, + "grad_norm": 0.5097849369049072, + "learning_rate": 8e-05, + "loss": 1.7744, + "step": 2226 + }, + { + "epoch": 0.12413600891861762, + "grad_norm": 0.43732327222824097, + "learning_rate": 8e-05, + "loss": 1.5259, + "step": 2227 + }, + { + "epoch": 0.1241917502787068, + "grad_norm": 0.46310287714004517, + "learning_rate": 8e-05, + "loss": 1.6754, + "step": 2228 + }, + { + 
"epoch": 0.12424749163879599, + "grad_norm": 0.4878222942352295, + "learning_rate": 8e-05, + "loss": 1.8352, + "step": 2229 + }, + { + "epoch": 0.12430323299888517, + "grad_norm": 0.4622375965118408, + "learning_rate": 8e-05, + "loss": 1.7948, + "step": 2230 + }, + { + "epoch": 0.12435897435897436, + "grad_norm": 0.4793466329574585, + "learning_rate": 8e-05, + "loss": 2.0367, + "step": 2231 + }, + { + "epoch": 0.12441471571906354, + "grad_norm": 0.45333993434906006, + "learning_rate": 8e-05, + "loss": 1.5381, + "step": 2232 + }, + { + "epoch": 0.12447045707915273, + "grad_norm": 0.4307631850242615, + "learning_rate": 8e-05, + "loss": 1.5617, + "step": 2233 + }, + { + "epoch": 0.12452619843924191, + "grad_norm": 0.4964076578617096, + "learning_rate": 8e-05, + "loss": 1.9645, + "step": 2234 + }, + { + "epoch": 0.1245819397993311, + "grad_norm": 0.4543098509311676, + "learning_rate": 8e-05, + "loss": 1.4873, + "step": 2235 + }, + { + "epoch": 0.1246376811594203, + "grad_norm": 0.4989856779575348, + "learning_rate": 8e-05, + "loss": 1.8919, + "step": 2236 + }, + { + "epoch": 0.12469342251950948, + "grad_norm": 0.46211791038513184, + "learning_rate": 8e-05, + "loss": 1.745, + "step": 2237 + }, + { + "epoch": 0.12474916387959867, + "grad_norm": 0.525151789188385, + "learning_rate": 8e-05, + "loss": 2.0378, + "step": 2238 + }, + { + "epoch": 0.12480490523968785, + "grad_norm": 0.4855180084705353, + "learning_rate": 8e-05, + "loss": 1.8417, + "step": 2239 + }, + { + "epoch": 0.12486064659977704, + "grad_norm": 0.46944236755371094, + "learning_rate": 8e-05, + "loss": 1.6507, + "step": 2240 + }, + { + "epoch": 0.12491638795986622, + "grad_norm": 0.4638369679450989, + "learning_rate": 8e-05, + "loss": 1.7477, + "step": 2241 + }, + { + "epoch": 0.12497212931995541, + "grad_norm": 0.49965327978134155, + "learning_rate": 8e-05, + "loss": 1.8928, + "step": 2242 + }, + { + "epoch": 0.1250278706800446, + "grad_norm": 0.4624139964580536, + "learning_rate": 8e-05, + "loss": 1.8775, + 
"step": 2243 + }, + { + "epoch": 0.12508361204013377, + "grad_norm": 0.4745078384876251, + "learning_rate": 8e-05, + "loss": 2.0191, + "step": 2244 + }, + { + "epoch": 0.12513935340022297, + "grad_norm": 0.4567002058029175, + "learning_rate": 8e-05, + "loss": 1.7058, + "step": 2245 + }, + { + "epoch": 0.12519509476031215, + "grad_norm": 0.4978659749031067, + "learning_rate": 8e-05, + "loss": 1.8425, + "step": 2246 + }, + { + "epoch": 0.12525083612040133, + "grad_norm": 0.5292001366615295, + "learning_rate": 8e-05, + "loss": 1.9908, + "step": 2247 + }, + { + "epoch": 0.1253065774804905, + "grad_norm": 0.4715481698513031, + "learning_rate": 8e-05, + "loss": 1.7982, + "step": 2248 + }, + { + "epoch": 0.12536231884057972, + "grad_norm": 0.4420139491558075, + "learning_rate": 8e-05, + "loss": 1.2473, + "step": 2249 + }, + { + "epoch": 0.1254180602006689, + "grad_norm": 0.5093467235565186, + "learning_rate": 8e-05, + "loss": 1.9668, + "step": 2250 + }, + { + "epoch": 0.12547380156075807, + "grad_norm": 0.449186772108078, + "learning_rate": 8e-05, + "loss": 1.5954, + "step": 2251 + }, + { + "epoch": 0.12552954292084728, + "grad_norm": 0.45465993881225586, + "learning_rate": 8e-05, + "loss": 1.7156, + "step": 2252 + }, + { + "epoch": 0.12558528428093646, + "grad_norm": 0.46614545583724976, + "learning_rate": 8e-05, + "loss": 1.647, + "step": 2253 + }, + { + "epoch": 0.12564102564102564, + "grad_norm": 0.4224128723144531, + "learning_rate": 8e-05, + "loss": 1.6038, + "step": 2254 + }, + { + "epoch": 0.12569676700111482, + "grad_norm": 0.4402707517147064, + "learning_rate": 8e-05, + "loss": 1.748, + "step": 2255 + }, + { + "epoch": 0.12575250836120402, + "grad_norm": 0.4561031758785248, + "learning_rate": 8e-05, + "loss": 1.8052, + "step": 2256 + }, + { + "epoch": 0.1258082497212932, + "grad_norm": 0.44972988963127136, + "learning_rate": 8e-05, + "loss": 1.7521, + "step": 2257 + }, + { + "epoch": 0.12586399108138238, + "grad_norm": 0.5131308436393738, + "learning_rate": 
8e-05, + "loss": 1.7484, + "step": 2258 + }, + { + "epoch": 0.12591973244147156, + "grad_norm": 0.45187079906463623, + "learning_rate": 8e-05, + "loss": 1.8119, + "step": 2259 + }, + { + "epoch": 0.12597547380156077, + "grad_norm": 0.46400967240333557, + "learning_rate": 8e-05, + "loss": 1.689, + "step": 2260 + }, + { + "epoch": 0.12603121516164995, + "grad_norm": 0.44966134428977966, + "learning_rate": 8e-05, + "loss": 1.4424, + "step": 2261 + }, + { + "epoch": 0.12608695652173912, + "grad_norm": 0.468363881111145, + "learning_rate": 8e-05, + "loss": 1.5864, + "step": 2262 + }, + { + "epoch": 0.1261426978818283, + "grad_norm": 0.4988058805465698, + "learning_rate": 8e-05, + "loss": 1.8812, + "step": 2263 + }, + { + "epoch": 0.1261984392419175, + "grad_norm": 0.5007922649383545, + "learning_rate": 8e-05, + "loss": 1.9379, + "step": 2264 + }, + { + "epoch": 0.1262541806020067, + "grad_norm": 0.4592812955379486, + "learning_rate": 8e-05, + "loss": 1.6016, + "step": 2265 + }, + { + "epoch": 0.12630992196209587, + "grad_norm": 0.5309374332427979, + "learning_rate": 8e-05, + "loss": 2.0361, + "step": 2266 + }, + { + "epoch": 0.12636566332218507, + "grad_norm": 0.49097344279289246, + "learning_rate": 8e-05, + "loss": 1.8266, + "step": 2267 + }, + { + "epoch": 0.12642140468227425, + "grad_norm": 0.48501044511795044, + "learning_rate": 8e-05, + "loss": 1.5567, + "step": 2268 + }, + { + "epoch": 0.12647714604236343, + "grad_norm": 0.4972813129425049, + "learning_rate": 8e-05, + "loss": 1.6461, + "step": 2269 + }, + { + "epoch": 0.1265328874024526, + "grad_norm": 0.544996440410614, + "learning_rate": 8e-05, + "loss": 1.8706, + "step": 2270 + }, + { + "epoch": 0.12658862876254182, + "grad_norm": 0.45850032567977905, + "learning_rate": 8e-05, + "loss": 1.3768, + "step": 2271 + }, + { + "epoch": 0.126644370122631, + "grad_norm": 0.46759000420570374, + "learning_rate": 8e-05, + "loss": 1.6794, + "step": 2272 + }, + { + "epoch": 0.12670011148272017, + "grad_norm": 
0.47211045026779175, + "learning_rate": 8e-05, + "loss": 1.7303, + "step": 2273 + }, + { + "epoch": 0.12675585284280935, + "grad_norm": 0.4648776054382324, + "learning_rate": 8e-05, + "loss": 1.6901, + "step": 2274 + }, + { + "epoch": 0.12681159420289856, + "grad_norm": 0.5143617987632751, + "learning_rate": 8e-05, + "loss": 1.9032, + "step": 2275 + }, + { + "epoch": 0.12686733556298774, + "grad_norm": 0.49569982290267944, + "learning_rate": 8e-05, + "loss": 1.7533, + "step": 2276 + }, + { + "epoch": 0.12692307692307692, + "grad_norm": 0.4662966728210449, + "learning_rate": 8e-05, + "loss": 1.4075, + "step": 2277 + }, + { + "epoch": 0.12697881828316612, + "grad_norm": 0.5277073979377747, + "learning_rate": 8e-05, + "loss": 1.8739, + "step": 2278 + }, + { + "epoch": 0.1270345596432553, + "grad_norm": 0.45093291997909546, + "learning_rate": 8e-05, + "loss": 1.6588, + "step": 2279 + }, + { + "epoch": 0.12709030100334448, + "grad_norm": 0.43871331214904785, + "learning_rate": 8e-05, + "loss": 1.514, + "step": 2280 + }, + { + "epoch": 0.12714604236343366, + "grad_norm": 0.46460381150245667, + "learning_rate": 8e-05, + "loss": 1.7531, + "step": 2281 + }, + { + "epoch": 0.12720178372352287, + "grad_norm": 0.44319167733192444, + "learning_rate": 8e-05, + "loss": 1.8248, + "step": 2282 + }, + { + "epoch": 0.12725752508361204, + "grad_norm": 0.4465970993041992, + "learning_rate": 8e-05, + "loss": 1.659, + "step": 2283 + }, + { + "epoch": 0.12731326644370122, + "grad_norm": 0.466734915971756, + "learning_rate": 8e-05, + "loss": 1.9343, + "step": 2284 + }, + { + "epoch": 0.1273690078037904, + "grad_norm": 0.42454588413238525, + "learning_rate": 8e-05, + "loss": 1.6708, + "step": 2285 + }, + { + "epoch": 0.1274247491638796, + "grad_norm": 0.4505043625831604, + "learning_rate": 8e-05, + "loss": 1.6236, + "step": 2286 + }, + { + "epoch": 0.1274804905239688, + "grad_norm": 0.437162846326828, + "learning_rate": 8e-05, + "loss": 1.6453, + "step": 2287 + }, + { + "epoch": 
0.12753623188405797, + "grad_norm": 0.48901355266571045, + "learning_rate": 8e-05, + "loss": 2.0439, + "step": 2288 + }, + { + "epoch": 0.12759197324414714, + "grad_norm": 0.44122645258903503, + "learning_rate": 8e-05, + "loss": 1.4767, + "step": 2289 + }, + { + "epoch": 0.12764771460423635, + "grad_norm": 0.4454699158668518, + "learning_rate": 8e-05, + "loss": 1.6566, + "step": 2290 + }, + { + "epoch": 0.12770345596432553, + "grad_norm": 0.444238543510437, + "learning_rate": 8e-05, + "loss": 1.6981, + "step": 2291 + }, + { + "epoch": 0.1277591973244147, + "grad_norm": 0.47556188702583313, + "learning_rate": 8e-05, + "loss": 1.7683, + "step": 2292 + }, + { + "epoch": 0.12781493868450391, + "grad_norm": 0.45707184076309204, + "learning_rate": 8e-05, + "loss": 1.6876, + "step": 2293 + }, + { + "epoch": 0.1278706800445931, + "grad_norm": 0.5256324410438538, + "learning_rate": 8e-05, + "loss": 1.8879, + "step": 2294 + }, + { + "epoch": 0.12792642140468227, + "grad_norm": 0.4542886018753052, + "learning_rate": 8e-05, + "loss": 1.6595, + "step": 2295 + }, + { + "epoch": 0.12798216276477145, + "grad_norm": 0.4788365662097931, + "learning_rate": 8e-05, + "loss": 1.5935, + "step": 2296 + }, + { + "epoch": 0.12803790412486066, + "grad_norm": 0.4697909653186798, + "learning_rate": 8e-05, + "loss": 1.8354, + "step": 2297 + }, + { + "epoch": 0.12809364548494984, + "grad_norm": 0.4855862557888031, + "learning_rate": 8e-05, + "loss": 1.5338, + "step": 2298 + }, + { + "epoch": 0.12814938684503902, + "grad_norm": 0.4692869484424591, + "learning_rate": 8e-05, + "loss": 1.7148, + "step": 2299 + }, + { + "epoch": 0.1282051282051282, + "grad_norm": 0.4679480493068695, + "learning_rate": 8e-05, + "loss": 1.6737, + "step": 2300 + }, + { + "epoch": 0.1282608695652174, + "grad_norm": 0.4384745955467224, + "learning_rate": 8e-05, + "loss": 1.5909, + "step": 2301 + }, + { + "epoch": 0.12831661092530658, + "grad_norm": 0.46700745820999146, + "learning_rate": 8e-05, + "loss": 1.8678, + "step": 
2302 + }, + { + "epoch": 0.12837235228539576, + "grad_norm": 0.42763611674308777, + "learning_rate": 8e-05, + "loss": 1.6465, + "step": 2303 + }, + { + "epoch": 0.12842809364548494, + "grad_norm": 0.6245459318161011, + "learning_rate": 8e-05, + "loss": 2.2235, + "step": 2304 + }, + { + "epoch": 0.12848383500557414, + "grad_norm": 0.5192124247550964, + "learning_rate": 8e-05, + "loss": 1.802, + "step": 2305 + }, + { + "epoch": 0.12853957636566332, + "grad_norm": 0.45005589723587036, + "learning_rate": 8e-05, + "loss": 1.6594, + "step": 2306 + }, + { + "epoch": 0.1285953177257525, + "grad_norm": 0.4243832230567932, + "learning_rate": 8e-05, + "loss": 1.677, + "step": 2307 + }, + { + "epoch": 0.1286510590858417, + "grad_norm": 0.44305911660194397, + "learning_rate": 8e-05, + "loss": 1.5636, + "step": 2308 + }, + { + "epoch": 0.12870680044593089, + "grad_norm": 0.4739604592323303, + "learning_rate": 8e-05, + "loss": 1.8299, + "step": 2309 + }, + { + "epoch": 0.12876254180602006, + "grad_norm": 0.46584707498550415, + "learning_rate": 8e-05, + "loss": 1.6265, + "step": 2310 + }, + { + "epoch": 0.12881828316610924, + "grad_norm": 0.46699920296669006, + "learning_rate": 8e-05, + "loss": 1.7646, + "step": 2311 + }, + { + "epoch": 0.12887402452619845, + "grad_norm": 0.44824424386024475, + "learning_rate": 8e-05, + "loss": 1.5775, + "step": 2312 + }, + { + "epoch": 0.12892976588628763, + "grad_norm": 0.48826146125793457, + "learning_rate": 8e-05, + "loss": 1.8624, + "step": 2313 + }, + { + "epoch": 0.1289855072463768, + "grad_norm": 0.44969749450683594, + "learning_rate": 8e-05, + "loss": 1.5919, + "step": 2314 + }, + { + "epoch": 0.12904124860646599, + "grad_norm": 0.4631759822368622, + "learning_rate": 8e-05, + "loss": 1.6601, + "step": 2315 + }, + { + "epoch": 0.1290969899665552, + "grad_norm": 0.48021945357322693, + "learning_rate": 8e-05, + "loss": 1.6901, + "step": 2316 + }, + { + "epoch": 0.12915273132664437, + "grad_norm": 0.46702641248703003, + "learning_rate": 
8e-05, + "loss": 1.6362, + "step": 2317 + }, + { + "epoch": 0.12920847268673355, + "grad_norm": 0.451192170381546, + "learning_rate": 8e-05, + "loss": 1.724, + "step": 2318 + }, + { + "epoch": 0.12926421404682276, + "grad_norm": 0.5693803429603577, + "learning_rate": 8e-05, + "loss": 2.0528, + "step": 2319 + }, + { + "epoch": 0.12931995540691194, + "grad_norm": 0.46649542450904846, + "learning_rate": 8e-05, + "loss": 1.5328, + "step": 2320 + }, + { + "epoch": 0.1293756967670011, + "grad_norm": 0.45501086115837097, + "learning_rate": 8e-05, + "loss": 1.6744, + "step": 2321 + }, + { + "epoch": 0.1294314381270903, + "grad_norm": 0.4884272813796997, + "learning_rate": 8e-05, + "loss": 1.7933, + "step": 2322 + }, + { + "epoch": 0.1294871794871795, + "grad_norm": 0.45968663692474365, + "learning_rate": 8e-05, + "loss": 1.8927, + "step": 2323 + }, + { + "epoch": 0.12954292084726868, + "grad_norm": 0.4407302141189575, + "learning_rate": 8e-05, + "loss": 1.7062, + "step": 2324 + }, + { + "epoch": 0.12959866220735786, + "grad_norm": 0.47550809383392334, + "learning_rate": 8e-05, + "loss": 1.8064, + "step": 2325 + }, + { + "epoch": 0.12965440356744704, + "grad_norm": 0.4188564121723175, + "learning_rate": 8e-05, + "loss": 1.5301, + "step": 2326 + }, + { + "epoch": 0.12971014492753624, + "grad_norm": 0.4824972450733185, + "learning_rate": 8e-05, + "loss": 1.6146, + "step": 2327 + }, + { + "epoch": 0.12976588628762542, + "grad_norm": 0.5340510606765747, + "learning_rate": 8e-05, + "loss": 1.9268, + "step": 2328 + }, + { + "epoch": 0.1298216276477146, + "grad_norm": 0.46250587701797485, + "learning_rate": 8e-05, + "loss": 1.7372, + "step": 2329 + }, + { + "epoch": 0.12987736900780378, + "grad_norm": 0.46774420142173767, + "learning_rate": 8e-05, + "loss": 1.7521, + "step": 2330 + }, + { + "epoch": 0.12993311036789298, + "grad_norm": 0.47270211577415466, + "learning_rate": 8e-05, + "loss": 1.7448, + "step": 2331 + }, + { + "epoch": 0.12998885172798216, + "grad_norm": 
0.46113133430480957, + "learning_rate": 8e-05, + "loss": 1.7026, + "step": 2332 + }, + { + "epoch": 0.13004459308807134, + "grad_norm": 0.46038854122161865, + "learning_rate": 8e-05, + "loss": 1.8473, + "step": 2333 + }, + { + "epoch": 0.13010033444816055, + "grad_norm": 0.5171126127243042, + "learning_rate": 8e-05, + "loss": 1.9117, + "step": 2334 + }, + { + "epoch": 0.13015607580824973, + "grad_norm": 0.462720662355423, + "learning_rate": 8e-05, + "loss": 2.0309, + "step": 2335 + }, + { + "epoch": 0.1302118171683389, + "grad_norm": 0.45984727144241333, + "learning_rate": 8e-05, + "loss": 1.618, + "step": 2336 + }, + { + "epoch": 0.13026755852842808, + "grad_norm": 0.476755827665329, + "learning_rate": 8e-05, + "loss": 1.8149, + "step": 2337 + }, + { + "epoch": 0.1303232998885173, + "grad_norm": 0.44505324959754944, + "learning_rate": 8e-05, + "loss": 1.7122, + "step": 2338 + }, + { + "epoch": 0.13037904124860647, + "grad_norm": 0.4444805681705475, + "learning_rate": 8e-05, + "loss": 1.5229, + "step": 2339 + }, + { + "epoch": 0.13043478260869565, + "grad_norm": 0.4566769599914551, + "learning_rate": 8e-05, + "loss": 1.6963, + "step": 2340 + }, + { + "epoch": 0.13049052396878483, + "grad_norm": 0.48411083221435547, + "learning_rate": 8e-05, + "loss": 1.7126, + "step": 2341 + }, + { + "epoch": 0.13054626532887403, + "grad_norm": 0.485125333070755, + "learning_rate": 8e-05, + "loss": 1.6481, + "step": 2342 + }, + { + "epoch": 0.1306020066889632, + "grad_norm": 0.4920346140861511, + "learning_rate": 8e-05, + "loss": 1.8729, + "step": 2343 + }, + { + "epoch": 0.1306577480490524, + "grad_norm": 0.5254184603691101, + "learning_rate": 8e-05, + "loss": 1.9184, + "step": 2344 + }, + { + "epoch": 0.13071348940914157, + "grad_norm": 0.475009560585022, + "learning_rate": 8e-05, + "loss": 1.6471, + "step": 2345 + }, + { + "epoch": 0.13076923076923078, + "grad_norm": 0.41936373710632324, + "learning_rate": 8e-05, + "loss": 1.2888, + "step": 2346 + }, + { + "epoch": 
0.13082497212931996, + "grad_norm": 0.49300825595855713, + "learning_rate": 8e-05, + "loss": 1.5936, + "step": 2347 + }, + { + "epoch": 0.13088071348940913, + "grad_norm": 0.4508037865161896, + "learning_rate": 8e-05, + "loss": 1.6086, + "step": 2348 + }, + { + "epoch": 0.13093645484949834, + "grad_norm": 0.4737650156021118, + "learning_rate": 8e-05, + "loss": 1.7373, + "step": 2349 + }, + { + "epoch": 0.13099219620958752, + "grad_norm": 0.48412951827049255, + "learning_rate": 8e-05, + "loss": 1.5208, + "step": 2350 + }, + { + "epoch": 0.1310479375696767, + "grad_norm": 0.5189991593360901, + "learning_rate": 8e-05, + "loss": 2.0046, + "step": 2351 + }, + { + "epoch": 0.13110367892976588, + "grad_norm": 0.49417969584465027, + "learning_rate": 8e-05, + "loss": 1.7563, + "step": 2352 + }, + { + "epoch": 0.13115942028985508, + "grad_norm": 0.5066606998443604, + "learning_rate": 8e-05, + "loss": 1.8102, + "step": 2353 + }, + { + "epoch": 0.13121516164994426, + "grad_norm": 0.47830361127853394, + "learning_rate": 8e-05, + "loss": 1.6611, + "step": 2354 + }, + { + "epoch": 0.13127090301003344, + "grad_norm": 0.4630042016506195, + "learning_rate": 8e-05, + "loss": 1.9327, + "step": 2355 + }, + { + "epoch": 0.13132664437012262, + "grad_norm": 0.4288950562477112, + "learning_rate": 8e-05, + "loss": 1.461, + "step": 2356 + }, + { + "epoch": 0.13138238573021183, + "grad_norm": 0.47212231159210205, + "learning_rate": 8e-05, + "loss": 1.831, + "step": 2357 + }, + { + "epoch": 0.131438127090301, + "grad_norm": 0.5282948017120361, + "learning_rate": 8e-05, + "loss": 2.0361, + "step": 2358 + }, + { + "epoch": 0.13149386845039018, + "grad_norm": 0.4551253616809845, + "learning_rate": 8e-05, + "loss": 1.7594, + "step": 2359 + }, + { + "epoch": 0.13154960981047936, + "grad_norm": 0.47356507182121277, + "learning_rate": 8e-05, + "loss": 1.7755, + "step": 2360 + }, + { + "epoch": 0.13160535117056857, + "grad_norm": 0.45470499992370605, + "learning_rate": 8e-05, + "loss": 1.8588, + 
"step": 2361 + }, + { + "epoch": 0.13166109253065775, + "grad_norm": 0.4819483160972595, + "learning_rate": 8e-05, + "loss": 1.6541, + "step": 2362 + }, + { + "epoch": 0.13171683389074693, + "grad_norm": 0.43687912821769714, + "learning_rate": 8e-05, + "loss": 1.658, + "step": 2363 + }, + { + "epoch": 0.13177257525083613, + "grad_norm": 0.48129919171333313, + "learning_rate": 8e-05, + "loss": 1.886, + "step": 2364 + }, + { + "epoch": 0.1318283166109253, + "grad_norm": 0.49660441279411316, + "learning_rate": 8e-05, + "loss": 1.7364, + "step": 2365 + }, + { + "epoch": 0.1318840579710145, + "grad_norm": 0.4926661550998688, + "learning_rate": 8e-05, + "loss": 2.0036, + "step": 2366 + }, + { + "epoch": 0.13193979933110367, + "grad_norm": 0.46969401836395264, + "learning_rate": 8e-05, + "loss": 1.6531, + "step": 2367 + }, + { + "epoch": 0.13199554069119288, + "grad_norm": 0.46234428882598877, + "learning_rate": 8e-05, + "loss": 1.7915, + "step": 2368 + }, + { + "epoch": 0.13205128205128205, + "grad_norm": 0.42720118165016174, + "learning_rate": 8e-05, + "loss": 1.6161, + "step": 2369 + }, + { + "epoch": 0.13210702341137123, + "grad_norm": 0.48308509588241577, + "learning_rate": 8e-05, + "loss": 1.7814, + "step": 2370 + }, + { + "epoch": 0.1321627647714604, + "grad_norm": 0.4851537048816681, + "learning_rate": 8e-05, + "loss": 1.7679, + "step": 2371 + }, + { + "epoch": 0.13221850613154962, + "grad_norm": 0.4548836052417755, + "learning_rate": 8e-05, + "loss": 1.5165, + "step": 2372 + }, + { + "epoch": 0.1322742474916388, + "grad_norm": 0.47316473722457886, + "learning_rate": 8e-05, + "loss": 1.7382, + "step": 2373 + }, + { + "epoch": 0.13232998885172798, + "grad_norm": 0.5082695484161377, + "learning_rate": 8e-05, + "loss": 1.7757, + "step": 2374 + }, + { + "epoch": 0.13238573021181718, + "grad_norm": 0.42562705278396606, + "learning_rate": 8e-05, + "loss": 1.3878, + "step": 2375 + }, + { + "epoch": 0.13244147157190636, + "grad_norm": 0.4784887135028839, + 
"learning_rate": 8e-05, + "loss": 1.5597, + "step": 2376 + }, + { + "epoch": 0.13249721293199554, + "grad_norm": 0.45655760169029236, + "learning_rate": 8e-05, + "loss": 1.3887, + "step": 2377 + }, + { + "epoch": 0.13255295429208472, + "grad_norm": 0.4548302888870239, + "learning_rate": 8e-05, + "loss": 1.7282, + "step": 2378 + }, + { + "epoch": 0.13260869565217392, + "grad_norm": 0.5463017821311951, + "learning_rate": 8e-05, + "loss": 1.93, + "step": 2379 + }, + { + "epoch": 0.1326644370122631, + "grad_norm": 0.5709219574928284, + "learning_rate": 8e-05, + "loss": 1.7358, + "step": 2380 + }, + { + "epoch": 0.13272017837235228, + "grad_norm": 0.4988589882850647, + "learning_rate": 8e-05, + "loss": 1.9047, + "step": 2381 + }, + { + "epoch": 0.13277591973244146, + "grad_norm": 0.47514599561691284, + "learning_rate": 8e-05, + "loss": 1.7282, + "step": 2382 + }, + { + "epoch": 0.13283166109253067, + "grad_norm": 0.47735369205474854, + "learning_rate": 8e-05, + "loss": 1.7754, + "step": 2383 + }, + { + "epoch": 0.13288740245261985, + "grad_norm": 0.4829070568084717, + "learning_rate": 8e-05, + "loss": 1.804, + "step": 2384 + }, + { + "epoch": 0.13294314381270902, + "grad_norm": 0.4622078537940979, + "learning_rate": 8e-05, + "loss": 1.6201, + "step": 2385 + }, + { + "epoch": 0.1329988851727982, + "grad_norm": 0.4724544286727905, + "learning_rate": 8e-05, + "loss": 1.6616, + "step": 2386 + }, + { + "epoch": 0.1330546265328874, + "grad_norm": 0.4968656301498413, + "learning_rate": 8e-05, + "loss": 1.8219, + "step": 2387 + }, + { + "epoch": 0.1331103678929766, + "grad_norm": 0.4678482413291931, + "learning_rate": 8e-05, + "loss": 1.7585, + "step": 2388 + }, + { + "epoch": 0.13316610925306577, + "grad_norm": 0.42720043659210205, + "learning_rate": 8e-05, + "loss": 1.6677, + "step": 2389 + }, + { + "epoch": 0.13322185061315497, + "grad_norm": 0.4516279995441437, + "learning_rate": 8e-05, + "loss": 1.8366, + "step": 2390 + }, + { + "epoch": 0.13327759197324415, + "grad_norm": 
0.44496116042137146, + "learning_rate": 8e-05, + "loss": 1.603, + "step": 2391 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.4435427188873291, + "learning_rate": 8e-05, + "loss": 1.7424, + "step": 2392 + }, + { + "epoch": 0.1333890746934225, + "grad_norm": 0.492079496383667, + "learning_rate": 8e-05, + "loss": 1.8003, + "step": 2393 + }, + { + "epoch": 0.13344481605351172, + "grad_norm": 0.47527799010276794, + "learning_rate": 8e-05, + "loss": 1.8065, + "step": 2394 + }, + { + "epoch": 0.1335005574136009, + "grad_norm": 0.48369163274765015, + "learning_rate": 8e-05, + "loss": 1.7151, + "step": 2395 + }, + { + "epoch": 0.13355629877369007, + "grad_norm": 0.47162288427352905, + "learning_rate": 8e-05, + "loss": 1.7446, + "step": 2396 + }, + { + "epoch": 0.13361204013377925, + "grad_norm": 0.45104944705963135, + "learning_rate": 8e-05, + "loss": 1.8041, + "step": 2397 + }, + { + "epoch": 0.13366778149386846, + "grad_norm": 0.48598307371139526, + "learning_rate": 8e-05, + "loss": 1.6556, + "step": 2398 + }, + { + "epoch": 0.13372352285395764, + "grad_norm": 0.4734358787536621, + "learning_rate": 8e-05, + "loss": 1.7203, + "step": 2399 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 0.46318376064300537, + "learning_rate": 8e-05, + "loss": 1.5928, + "step": 2400 + }, + { + "epoch": 0.133835005574136, + "grad_norm": 0.48137640953063965, + "learning_rate": 8e-05, + "loss": 1.8884, + "step": 2401 + }, + { + "epoch": 0.1338907469342252, + "grad_norm": 0.4602339565753937, + "learning_rate": 8e-05, + "loss": 1.9138, + "step": 2402 + }, + { + "epoch": 0.13394648829431438, + "grad_norm": 0.4798427224159241, + "learning_rate": 8e-05, + "loss": 1.7582, + "step": 2403 + }, + { + "epoch": 0.13400222965440356, + "grad_norm": 0.464205265045166, + "learning_rate": 8e-05, + "loss": 1.5574, + "step": 2404 + }, + { + "epoch": 0.13405797101449277, + "grad_norm": 0.4638040363788605, + "learning_rate": 8e-05, + "loss": 1.9085, + "step": 2405 + }, + { + "epoch": 
0.13411371237458194, + "grad_norm": 0.4874891936779022, + "learning_rate": 8e-05, + "loss": 1.8563, + "step": 2406 + }, + { + "epoch": 0.13416945373467112, + "grad_norm": 0.4475204646587372, + "learning_rate": 8e-05, + "loss": 1.6447, + "step": 2407 + }, + { + "epoch": 0.1342251950947603, + "grad_norm": 0.4701690673828125, + "learning_rate": 8e-05, + "loss": 1.7522, + "step": 2408 + }, + { + "epoch": 0.1342809364548495, + "grad_norm": 0.44603896141052246, + "learning_rate": 8e-05, + "loss": 1.6628, + "step": 2409 + }, + { + "epoch": 0.1343366778149387, + "grad_norm": 0.41180136799812317, + "learning_rate": 8e-05, + "loss": 1.5771, + "step": 2410 + }, + { + "epoch": 0.13439241917502787, + "grad_norm": 0.4283235967159271, + "learning_rate": 8e-05, + "loss": 1.5256, + "step": 2411 + }, + { + "epoch": 0.13444816053511705, + "grad_norm": 0.44074901938438416, + "learning_rate": 8e-05, + "loss": 1.5656, + "step": 2412 + }, + { + "epoch": 0.13450390189520625, + "grad_norm": 0.4715913236141205, + "learning_rate": 8e-05, + "loss": 1.7263, + "step": 2413 + }, + { + "epoch": 0.13455964325529543, + "grad_norm": 0.4731295704841614, + "learning_rate": 8e-05, + "loss": 1.6281, + "step": 2414 + }, + { + "epoch": 0.1346153846153846, + "grad_norm": 0.49616292119026184, + "learning_rate": 8e-05, + "loss": 1.7264, + "step": 2415 + }, + { + "epoch": 0.1346711259754738, + "grad_norm": 0.4744231700897217, + "learning_rate": 8e-05, + "loss": 1.5563, + "step": 2416 + }, + { + "epoch": 0.134726867335563, + "grad_norm": 0.5391608476638794, + "learning_rate": 8e-05, + "loss": 1.9143, + "step": 2417 + }, + { + "epoch": 0.13478260869565217, + "grad_norm": 0.47722572088241577, + "learning_rate": 8e-05, + "loss": 1.7501, + "step": 2418 + }, + { + "epoch": 0.13483835005574135, + "grad_norm": 0.45435523986816406, + "learning_rate": 8e-05, + "loss": 1.5783, + "step": 2419 + }, + { + "epoch": 0.13489409141583056, + "grad_norm": 0.49873173236846924, + "learning_rate": 8e-05, + "loss": 1.4969, + "step": 
2420 + }, + { + "epoch": 0.13494983277591974, + "grad_norm": 0.4588122069835663, + "learning_rate": 8e-05, + "loss": 1.6821, + "step": 2421 + }, + { + "epoch": 0.13500557413600892, + "grad_norm": 0.5086076259613037, + "learning_rate": 8e-05, + "loss": 1.6913, + "step": 2422 + }, + { + "epoch": 0.1350613154960981, + "grad_norm": 0.49438467621803284, + "learning_rate": 8e-05, + "loss": 1.7362, + "step": 2423 + }, + { + "epoch": 0.1351170568561873, + "grad_norm": 0.44089025259017944, + "learning_rate": 8e-05, + "loss": 1.5156, + "step": 2424 + }, + { + "epoch": 0.13517279821627648, + "grad_norm": 0.4743707776069641, + "learning_rate": 8e-05, + "loss": 1.725, + "step": 2425 + }, + { + "epoch": 0.13522853957636566, + "grad_norm": 0.47016942501068115, + "learning_rate": 8e-05, + "loss": 1.6863, + "step": 2426 + }, + { + "epoch": 0.13528428093645484, + "grad_norm": 0.46018049120903015, + "learning_rate": 8e-05, + "loss": 1.822, + "step": 2427 + }, + { + "epoch": 0.13534002229654404, + "grad_norm": 0.49280330538749695, + "learning_rate": 8e-05, + "loss": 1.7777, + "step": 2428 + }, + { + "epoch": 0.13539576365663322, + "grad_norm": 0.4748382270336151, + "learning_rate": 8e-05, + "loss": 1.7763, + "step": 2429 + }, + { + "epoch": 0.1354515050167224, + "grad_norm": 0.49057191610336304, + "learning_rate": 8e-05, + "loss": 1.7156, + "step": 2430 + }, + { + "epoch": 0.1355072463768116, + "grad_norm": 0.4892786145210266, + "learning_rate": 8e-05, + "loss": 1.7613, + "step": 2431 + }, + { + "epoch": 0.1355629877369008, + "grad_norm": 0.48499733209609985, + "learning_rate": 8e-05, + "loss": 1.9075, + "step": 2432 + }, + { + "epoch": 0.13561872909698997, + "grad_norm": 0.49811092019081116, + "learning_rate": 8e-05, + "loss": 1.8027, + "step": 2433 + }, + { + "epoch": 0.13567447045707914, + "grad_norm": 0.46384453773498535, + "learning_rate": 8e-05, + "loss": 1.7716, + "step": 2434 + }, + { + "epoch": 0.13573021181716835, + "grad_norm": 0.4836145341396332, + "learning_rate": 8e-05, 
+ "loss": 1.7956, + "step": 2435 + }, + { + "epoch": 0.13578595317725753, + "grad_norm": 0.5062441229820251, + "learning_rate": 8e-05, + "loss": 2.0621, + "step": 2436 + }, + { + "epoch": 0.1358416945373467, + "grad_norm": 0.4342644512653351, + "learning_rate": 8e-05, + "loss": 1.5002, + "step": 2437 + }, + { + "epoch": 0.1358974358974359, + "grad_norm": 0.45354142785072327, + "learning_rate": 8e-05, + "loss": 1.6726, + "step": 2438 + }, + { + "epoch": 0.1359531772575251, + "grad_norm": 0.4888480007648468, + "learning_rate": 8e-05, + "loss": 2.021, + "step": 2439 + }, + { + "epoch": 0.13600891861761427, + "grad_norm": 0.4443151652812958, + "learning_rate": 8e-05, + "loss": 1.6286, + "step": 2440 + }, + { + "epoch": 0.13606465997770345, + "grad_norm": 0.4630712866783142, + "learning_rate": 8e-05, + "loss": 1.8708, + "step": 2441 + }, + { + "epoch": 0.13612040133779263, + "grad_norm": 0.4247455298900604, + "learning_rate": 8e-05, + "loss": 1.5506, + "step": 2442 + }, + { + "epoch": 0.13617614269788184, + "grad_norm": 0.47813916206359863, + "learning_rate": 8e-05, + "loss": 1.6775, + "step": 2443 + }, + { + "epoch": 0.13623188405797101, + "grad_norm": 0.5136623382568359, + "learning_rate": 8e-05, + "loss": 1.6883, + "step": 2444 + }, + { + "epoch": 0.1362876254180602, + "grad_norm": 0.448414146900177, + "learning_rate": 8e-05, + "loss": 1.7714, + "step": 2445 + }, + { + "epoch": 0.1363433667781494, + "grad_norm": 0.4655492603778839, + "learning_rate": 8e-05, + "loss": 1.7398, + "step": 2446 + }, + { + "epoch": 0.13639910813823858, + "grad_norm": 0.4698059856891632, + "learning_rate": 8e-05, + "loss": 1.6876, + "step": 2447 + }, + { + "epoch": 0.13645484949832776, + "grad_norm": 0.4656384587287903, + "learning_rate": 8e-05, + "loss": 1.7179, + "step": 2448 + }, + { + "epoch": 0.13651059085841694, + "grad_norm": 0.4606095552444458, + "learning_rate": 8e-05, + "loss": 1.6952, + "step": 2449 + }, + { + "epoch": 0.13656633221850614, + "grad_norm": 0.4883759617805481, + 
"learning_rate": 8e-05, + "loss": 1.7862, + "step": 2450 + }, + { + "epoch": 0.13662207357859532, + "grad_norm": 0.4531194567680359, + "learning_rate": 8e-05, + "loss": 1.6778, + "step": 2451 + }, + { + "epoch": 0.1366778149386845, + "grad_norm": 0.4543571472167969, + "learning_rate": 8e-05, + "loss": 1.5263, + "step": 2452 + }, + { + "epoch": 0.13673355629877368, + "grad_norm": 0.49247556924819946, + "learning_rate": 8e-05, + "loss": 1.6838, + "step": 2453 + }, + { + "epoch": 0.13678929765886289, + "grad_norm": 0.46396002173423767, + "learning_rate": 8e-05, + "loss": 1.6609, + "step": 2454 + }, + { + "epoch": 0.13684503901895206, + "grad_norm": 0.46788784861564636, + "learning_rate": 8e-05, + "loss": 1.7977, + "step": 2455 + }, + { + "epoch": 0.13690078037904124, + "grad_norm": 0.4540233910083771, + "learning_rate": 8e-05, + "loss": 1.4826, + "step": 2456 + }, + { + "epoch": 0.13695652173913042, + "grad_norm": 0.4384143650531769, + "learning_rate": 8e-05, + "loss": 1.5633, + "step": 2457 + }, + { + "epoch": 0.13701226309921963, + "grad_norm": 0.4648273289203644, + "learning_rate": 8e-05, + "loss": 1.7519, + "step": 2458 + }, + { + "epoch": 0.1370680044593088, + "grad_norm": 0.4895458221435547, + "learning_rate": 8e-05, + "loss": 1.7789, + "step": 2459 + }, + { + "epoch": 0.13712374581939799, + "grad_norm": 0.4567234516143799, + "learning_rate": 8e-05, + "loss": 1.7469, + "step": 2460 + }, + { + "epoch": 0.1371794871794872, + "grad_norm": 0.48818498849868774, + "learning_rate": 8e-05, + "loss": 1.8405, + "step": 2461 + }, + { + "epoch": 0.13723522853957637, + "grad_norm": 0.4511561393737793, + "learning_rate": 8e-05, + "loss": 1.7594, + "step": 2462 + }, + { + "epoch": 0.13729096989966555, + "grad_norm": 0.4609898626804352, + "learning_rate": 8e-05, + "loss": 1.5258, + "step": 2463 + }, + { + "epoch": 0.13734671125975473, + "grad_norm": 0.47602224349975586, + "learning_rate": 8e-05, + "loss": 1.7372, + "step": 2464 + }, + { + "epoch": 0.13740245261984393, + 
"grad_norm": 0.49551478028297424, + "learning_rate": 8e-05, + "loss": 1.8406, + "step": 2465 + }, + { + "epoch": 0.1374581939799331, + "grad_norm": 0.49049773812294006, + "learning_rate": 8e-05, + "loss": 1.7249, + "step": 2466 + }, + { + "epoch": 0.1375139353400223, + "grad_norm": 0.4555240869522095, + "learning_rate": 8e-05, + "loss": 1.6711, + "step": 2467 + }, + { + "epoch": 0.13756967670011147, + "grad_norm": 0.5260162949562073, + "learning_rate": 8e-05, + "loss": 1.6192, + "step": 2468 + }, + { + "epoch": 0.13762541806020068, + "grad_norm": 0.44580748677253723, + "learning_rate": 8e-05, + "loss": 1.7445, + "step": 2469 + }, + { + "epoch": 0.13768115942028986, + "grad_norm": 0.5073334574699402, + "learning_rate": 8e-05, + "loss": 1.7509, + "step": 2470 + }, + { + "epoch": 0.13773690078037903, + "grad_norm": 0.44060927629470825, + "learning_rate": 8e-05, + "loss": 1.7038, + "step": 2471 + }, + { + "epoch": 0.13779264214046824, + "grad_norm": 0.4801987409591675, + "learning_rate": 8e-05, + "loss": 1.8005, + "step": 2472 + }, + { + "epoch": 0.13784838350055742, + "grad_norm": 0.47730058431625366, + "learning_rate": 8e-05, + "loss": 1.7899, + "step": 2473 + }, + { + "epoch": 0.1379041248606466, + "grad_norm": 0.4480297863483429, + "learning_rate": 8e-05, + "loss": 1.6313, + "step": 2474 + }, + { + "epoch": 0.13795986622073578, + "grad_norm": 0.4790990948677063, + "learning_rate": 8e-05, + "loss": 1.7961, + "step": 2475 + }, + { + "epoch": 0.13801560758082498, + "grad_norm": 0.4796399176120758, + "learning_rate": 8e-05, + "loss": 1.9536, + "step": 2476 + }, + { + "epoch": 0.13807134894091416, + "grad_norm": 0.43935978412628174, + "learning_rate": 8e-05, + "loss": 1.7373, + "step": 2477 + }, + { + "epoch": 0.13812709030100334, + "grad_norm": 0.48501768708229065, + "learning_rate": 8e-05, + "loss": 1.93, + "step": 2478 + }, + { + "epoch": 0.13818283166109252, + "grad_norm": 0.43100371956825256, + "learning_rate": 8e-05, + "loss": 1.6818, + "step": 2479 + }, + { + 
"epoch": 0.13823857302118173, + "grad_norm": 0.4330509901046753, + "learning_rate": 8e-05, + "loss": 1.4093, + "step": 2480 + }, + { + "epoch": 0.1382943143812709, + "grad_norm": 0.4759175181388855, + "learning_rate": 8e-05, + "loss": 1.9259, + "step": 2481 + }, + { + "epoch": 0.13835005574136008, + "grad_norm": 0.45059260725975037, + "learning_rate": 8e-05, + "loss": 1.7486, + "step": 2482 + }, + { + "epoch": 0.13840579710144926, + "grad_norm": 0.4803008735179901, + "learning_rate": 8e-05, + "loss": 1.7893, + "step": 2483 + }, + { + "epoch": 0.13846153846153847, + "grad_norm": 0.46275773644447327, + "learning_rate": 8e-05, + "loss": 1.7574, + "step": 2484 + }, + { + "epoch": 0.13851727982162765, + "grad_norm": 0.5310719609260559, + "learning_rate": 8e-05, + "loss": 1.8372, + "step": 2485 + }, + { + "epoch": 0.13857302118171683, + "grad_norm": 0.4447019398212433, + "learning_rate": 8e-05, + "loss": 1.6102, + "step": 2486 + }, + { + "epoch": 0.13862876254180603, + "grad_norm": 0.48079508543014526, + "learning_rate": 8e-05, + "loss": 1.745, + "step": 2487 + }, + { + "epoch": 0.1386845039018952, + "grad_norm": 0.4743659496307373, + "learning_rate": 8e-05, + "loss": 1.6541, + "step": 2488 + }, + { + "epoch": 0.1387402452619844, + "grad_norm": 0.49217358231544495, + "learning_rate": 8e-05, + "loss": 1.6408, + "step": 2489 + }, + { + "epoch": 0.13879598662207357, + "grad_norm": 0.4995599091053009, + "learning_rate": 8e-05, + "loss": 1.7086, + "step": 2490 + }, + { + "epoch": 0.13885172798216278, + "grad_norm": 0.4510442912578583, + "learning_rate": 8e-05, + "loss": 1.5491, + "step": 2491 + }, + { + "epoch": 0.13890746934225195, + "grad_norm": 0.461445152759552, + "learning_rate": 8e-05, + "loss": 1.7203, + "step": 2492 + }, + { + "epoch": 0.13896321070234113, + "grad_norm": 0.4649452269077301, + "learning_rate": 8e-05, + "loss": 1.7949, + "step": 2493 + }, + { + "epoch": 0.1390189520624303, + "grad_norm": 0.48574158549308777, + "learning_rate": 8e-05, + "loss": 1.8664, + 
"step": 2494 + }, + { + "epoch": 0.13907469342251952, + "grad_norm": 0.47030866146087646, + "learning_rate": 8e-05, + "loss": 1.4127, + "step": 2495 + }, + { + "epoch": 0.1391304347826087, + "grad_norm": 0.44855526089668274, + "learning_rate": 8e-05, + "loss": 1.7595, + "step": 2496 + }, + { + "epoch": 0.13918617614269788, + "grad_norm": 0.46104398369789124, + "learning_rate": 8e-05, + "loss": 1.8975, + "step": 2497 + }, + { + "epoch": 0.13924191750278705, + "grad_norm": 0.4790503978729248, + "learning_rate": 8e-05, + "loss": 1.7316, + "step": 2498 + }, + { + "epoch": 0.13929765886287626, + "grad_norm": 0.4176745116710663, + "learning_rate": 8e-05, + "loss": 1.7592, + "step": 2499 + }, + { + "epoch": 0.13935340022296544, + "grad_norm": 0.5315077900886536, + "learning_rate": 8e-05, + "loss": 2.0945, + "step": 2500 + }, + { + "epoch": 0.13940914158305462, + "grad_norm": 0.47703084349632263, + "learning_rate": 8e-05, + "loss": 1.6344, + "step": 2501 + }, + { + "epoch": 0.13946488294314383, + "grad_norm": 0.42754313349723816, + "learning_rate": 8e-05, + "loss": 1.5776, + "step": 2502 + }, + { + "epoch": 0.139520624303233, + "grad_norm": 0.4698762893676758, + "learning_rate": 8e-05, + "loss": 1.7629, + "step": 2503 + }, + { + "epoch": 0.13957636566332218, + "grad_norm": 0.4266977906227112, + "learning_rate": 8e-05, + "loss": 1.5624, + "step": 2504 + }, + { + "epoch": 0.13963210702341136, + "grad_norm": 0.48394453525543213, + "learning_rate": 8e-05, + "loss": 1.9536, + "step": 2505 + }, + { + "epoch": 0.13968784838350057, + "grad_norm": 0.4214484691619873, + "learning_rate": 8e-05, + "loss": 1.5147, + "step": 2506 + }, + { + "epoch": 0.13974358974358975, + "grad_norm": 0.4360334575176239, + "learning_rate": 8e-05, + "loss": 1.5634, + "step": 2507 + }, + { + "epoch": 0.13979933110367893, + "grad_norm": 0.4772351086139679, + "learning_rate": 8e-05, + "loss": 1.8834, + "step": 2508 + }, + { + "epoch": 0.1398550724637681, + "grad_norm": 0.42568275332450867, + 
"learning_rate": 8e-05, + "loss": 1.4775, + "step": 2509 + }, + { + "epoch": 0.1399108138238573, + "grad_norm": 0.4546456038951874, + "learning_rate": 8e-05, + "loss": 1.6071, + "step": 2510 + }, + { + "epoch": 0.1399665551839465, + "grad_norm": 0.4755224883556366, + "learning_rate": 8e-05, + "loss": 1.7269, + "step": 2511 + }, + { + "epoch": 0.14002229654403567, + "grad_norm": 0.4712219834327698, + "learning_rate": 8e-05, + "loss": 1.6659, + "step": 2512 + }, + { + "epoch": 0.14007803790412485, + "grad_norm": 0.4919564127922058, + "learning_rate": 8e-05, + "loss": 1.702, + "step": 2513 + }, + { + "epoch": 0.14013377926421405, + "grad_norm": 0.5029149055480957, + "learning_rate": 8e-05, + "loss": 1.8241, + "step": 2514 + }, + { + "epoch": 0.14018952062430323, + "grad_norm": 0.45856478810310364, + "learning_rate": 8e-05, + "loss": 1.7345, + "step": 2515 + }, + { + "epoch": 0.1402452619843924, + "grad_norm": 0.44365623593330383, + "learning_rate": 8e-05, + "loss": 1.5347, + "step": 2516 + }, + { + "epoch": 0.14030100334448162, + "grad_norm": 0.4528595507144928, + "learning_rate": 8e-05, + "loss": 1.7813, + "step": 2517 + }, + { + "epoch": 0.1403567447045708, + "grad_norm": 0.44543299078941345, + "learning_rate": 8e-05, + "loss": 1.4169, + "step": 2518 + }, + { + "epoch": 0.14041248606465997, + "grad_norm": 0.5014076828956604, + "learning_rate": 8e-05, + "loss": 1.9818, + "step": 2519 + }, + { + "epoch": 0.14046822742474915, + "grad_norm": 0.4682339131832123, + "learning_rate": 8e-05, + "loss": 1.4848, + "step": 2520 + }, + { + "epoch": 0.14052396878483836, + "grad_norm": 0.4805692434310913, + "learning_rate": 8e-05, + "loss": 1.7424, + "step": 2521 + }, + { + "epoch": 0.14057971014492754, + "grad_norm": 0.4191701114177704, + "learning_rate": 8e-05, + "loss": 1.6192, + "step": 2522 + }, + { + "epoch": 0.14063545150501672, + "grad_norm": 0.45228466391563416, + "learning_rate": 8e-05, + "loss": 1.5186, + "step": 2523 + }, + { + "epoch": 0.1406911928651059, + 
"grad_norm": 0.4798497259616852, + "learning_rate": 8e-05, + "loss": 1.7834, + "step": 2524 + }, + { + "epoch": 0.1407469342251951, + "grad_norm": 0.4433005452156067, + "learning_rate": 8e-05, + "loss": 1.7957, + "step": 2525 + }, + { + "epoch": 0.14080267558528428, + "grad_norm": 0.4283502995967865, + "learning_rate": 8e-05, + "loss": 1.5706, + "step": 2526 + }, + { + "epoch": 0.14085841694537346, + "grad_norm": 0.41895848512649536, + "learning_rate": 8e-05, + "loss": 1.6454, + "step": 2527 + }, + { + "epoch": 0.14091415830546267, + "grad_norm": 0.48576539754867554, + "learning_rate": 8e-05, + "loss": 1.7753, + "step": 2528 + }, + { + "epoch": 0.14096989966555185, + "grad_norm": 0.44927990436553955, + "learning_rate": 8e-05, + "loss": 1.5163, + "step": 2529 + }, + { + "epoch": 0.14102564102564102, + "grad_norm": 0.48466259241104126, + "learning_rate": 8e-05, + "loss": 1.8365, + "step": 2530 + }, + { + "epoch": 0.1410813823857302, + "grad_norm": 0.45357945561408997, + "learning_rate": 8e-05, + "loss": 1.7663, + "step": 2531 + }, + { + "epoch": 0.1411371237458194, + "grad_norm": 0.49716877937316895, + "learning_rate": 8e-05, + "loss": 1.7604, + "step": 2532 + }, + { + "epoch": 0.1411928651059086, + "grad_norm": 0.49428150057792664, + "learning_rate": 8e-05, + "loss": 1.996, + "step": 2533 + }, + { + "epoch": 0.14124860646599777, + "grad_norm": 0.4324924945831299, + "learning_rate": 8e-05, + "loss": 1.5958, + "step": 2534 + }, + { + "epoch": 0.14130434782608695, + "grad_norm": 0.4904332160949707, + "learning_rate": 8e-05, + "loss": 1.8865, + "step": 2535 + }, + { + "epoch": 0.14136008918617615, + "grad_norm": 0.436411052942276, + "learning_rate": 8e-05, + "loss": 1.6264, + "step": 2536 + }, + { + "epoch": 0.14141583054626533, + "grad_norm": 0.5399770140647888, + "learning_rate": 8e-05, + "loss": 1.897, + "step": 2537 + }, + { + "epoch": 0.1414715719063545, + "grad_norm": 0.4226623475551605, + "learning_rate": 8e-05, + "loss": 1.5677, + "step": 2538 + }, + { + 
"epoch": 0.1415273132664437, + "grad_norm": 0.48872801661491394, + "learning_rate": 8e-05, + "loss": 1.9442, + "step": 2539 + }, + { + "epoch": 0.1415830546265329, + "grad_norm": 0.4366551637649536, + "learning_rate": 8e-05, + "loss": 1.6084, + "step": 2540 + }, + { + "epoch": 0.14163879598662207, + "grad_norm": 0.4629233479499817, + "learning_rate": 8e-05, + "loss": 1.6951, + "step": 2541 + }, + { + "epoch": 0.14169453734671125, + "grad_norm": 0.43385395407676697, + "learning_rate": 8e-05, + "loss": 1.548, + "step": 2542 + }, + { + "epoch": 0.14175027870680046, + "grad_norm": 0.4846147298812866, + "learning_rate": 8e-05, + "loss": 1.8151, + "step": 2543 + }, + { + "epoch": 0.14180602006688964, + "grad_norm": 0.4602271616458893, + "learning_rate": 8e-05, + "loss": 1.5667, + "step": 2544 + }, + { + "epoch": 0.14186176142697882, + "grad_norm": 0.5012955069541931, + "learning_rate": 8e-05, + "loss": 1.9807, + "step": 2545 + }, + { + "epoch": 0.141917502787068, + "grad_norm": 0.45967939496040344, + "learning_rate": 8e-05, + "loss": 1.7745, + "step": 2546 + }, + { + "epoch": 0.1419732441471572, + "grad_norm": 0.46281909942626953, + "learning_rate": 8e-05, + "loss": 1.7764, + "step": 2547 + }, + { + "epoch": 0.14202898550724638, + "grad_norm": 0.4586803913116455, + "learning_rate": 8e-05, + "loss": 1.822, + "step": 2548 + }, + { + "epoch": 0.14208472686733556, + "grad_norm": 0.4685911536216736, + "learning_rate": 8e-05, + "loss": 1.6216, + "step": 2549 + }, + { + "epoch": 0.14214046822742474, + "grad_norm": 0.4778120517730713, + "learning_rate": 8e-05, + "loss": 1.8606, + "step": 2550 + }, + { + "epoch": 0.14219620958751394, + "grad_norm": 0.4755409359931946, + "learning_rate": 8e-05, + "loss": 1.7405, + "step": 2551 + }, + { + "epoch": 0.14225195094760312, + "grad_norm": 0.5101373791694641, + "learning_rate": 8e-05, + "loss": 1.9016, + "step": 2552 + }, + { + "epoch": 0.1423076923076923, + "grad_norm": 0.4830615520477295, + "learning_rate": 8e-05, + "loss": 1.8319, + 
"step": 2553 + }, + { + "epoch": 0.14236343366778148, + "grad_norm": 0.4818636476993561, + "learning_rate": 8e-05, + "loss": 1.7987, + "step": 2554 + }, + { + "epoch": 0.1424191750278707, + "grad_norm": 0.5467934012413025, + "learning_rate": 8e-05, + "loss": 2.1335, + "step": 2555 + }, + { + "epoch": 0.14247491638795987, + "grad_norm": 0.48569151759147644, + "learning_rate": 8e-05, + "loss": 1.7904, + "step": 2556 + }, + { + "epoch": 0.14253065774804904, + "grad_norm": 0.5114737749099731, + "learning_rate": 8e-05, + "loss": 1.8073, + "step": 2557 + }, + { + "epoch": 0.14258639910813825, + "grad_norm": 0.47948169708251953, + "learning_rate": 8e-05, + "loss": 1.8756, + "step": 2558 + }, + { + "epoch": 0.14264214046822743, + "grad_norm": 0.4516530930995941, + "learning_rate": 8e-05, + "loss": 1.8219, + "step": 2559 + }, + { + "epoch": 0.1426978818283166, + "grad_norm": 0.485946923494339, + "learning_rate": 8e-05, + "loss": 1.703, + "step": 2560 + }, + { + "epoch": 0.1427536231884058, + "grad_norm": 0.4625237286090851, + "learning_rate": 8e-05, + "loss": 1.7076, + "step": 2561 + }, + { + "epoch": 0.142809364548495, + "grad_norm": 0.4589962065219879, + "learning_rate": 8e-05, + "loss": 1.754, + "step": 2562 + }, + { + "epoch": 0.14286510590858417, + "grad_norm": 0.45477667450904846, + "learning_rate": 8e-05, + "loss": 1.6463, + "step": 2563 + }, + { + "epoch": 0.14292084726867335, + "grad_norm": 0.46879589557647705, + "learning_rate": 8e-05, + "loss": 1.6575, + "step": 2564 + }, + { + "epoch": 0.14297658862876253, + "grad_norm": 0.4632941484451294, + "learning_rate": 8e-05, + "loss": 1.9237, + "step": 2565 + }, + { + "epoch": 0.14303232998885174, + "grad_norm": 0.5111912488937378, + "learning_rate": 8e-05, + "loss": 1.7429, + "step": 2566 + }, + { + "epoch": 0.14308807134894092, + "grad_norm": 0.5094893574714661, + "learning_rate": 8e-05, + "loss": 1.8616, + "step": 2567 + }, + { + "epoch": 0.1431438127090301, + "grad_norm": 0.5143493413925171, + "learning_rate": 8e-05, 
+ "loss": 1.4942, + "step": 2568 + }, + { + "epoch": 0.1431995540691193, + "grad_norm": 0.44312921166419983, + "learning_rate": 8e-05, + "loss": 1.6771, + "step": 2569 + }, + { + "epoch": 0.14325529542920848, + "grad_norm": 0.46118608117103577, + "learning_rate": 8e-05, + "loss": 1.6036, + "step": 2570 + }, + { + "epoch": 0.14331103678929766, + "grad_norm": 0.44793501496315, + "learning_rate": 8e-05, + "loss": 1.5554, + "step": 2571 + }, + { + "epoch": 0.14336677814938684, + "grad_norm": 0.5015727281570435, + "learning_rate": 8e-05, + "loss": 1.7773, + "step": 2572 + }, + { + "epoch": 0.14342251950947604, + "grad_norm": 0.4987427890300751, + "learning_rate": 8e-05, + "loss": 2.0529, + "step": 2573 + }, + { + "epoch": 0.14347826086956522, + "grad_norm": 0.4836692810058594, + "learning_rate": 8e-05, + "loss": 1.6583, + "step": 2574 + }, + { + "epoch": 0.1435340022296544, + "grad_norm": 0.4915635585784912, + "learning_rate": 8e-05, + "loss": 1.5852, + "step": 2575 + }, + { + "epoch": 0.14358974358974358, + "grad_norm": 0.4204218089580536, + "learning_rate": 8e-05, + "loss": 1.4971, + "step": 2576 + }, + { + "epoch": 0.14364548494983279, + "grad_norm": 0.5908351540565491, + "learning_rate": 8e-05, + "loss": 1.9646, + "step": 2577 + }, + { + "epoch": 0.14370122630992196, + "grad_norm": 0.4868652820587158, + "learning_rate": 8e-05, + "loss": 1.9946, + "step": 2578 + }, + { + "epoch": 0.14375696767001114, + "grad_norm": 0.5083109736442566, + "learning_rate": 8e-05, + "loss": 1.9526, + "step": 2579 + }, + { + "epoch": 0.14381270903010032, + "grad_norm": 0.45809340476989746, + "learning_rate": 8e-05, + "loss": 1.6082, + "step": 2580 + }, + { + "epoch": 0.14386845039018953, + "grad_norm": 0.4436138868331909, + "learning_rate": 8e-05, + "loss": 1.5737, + "step": 2581 + }, + { + "epoch": 0.1439241917502787, + "grad_norm": 0.5178206562995911, + "learning_rate": 8e-05, + "loss": 1.7271, + "step": 2582 + }, + { + "epoch": 0.14397993311036789, + "grad_norm": 0.4951595366001129, + 
"learning_rate": 8e-05, + "loss": 1.8733, + "step": 2583 + }, + { + "epoch": 0.1440356744704571, + "grad_norm": 0.47114184498786926, + "learning_rate": 8e-05, + "loss": 1.8224, + "step": 2584 + }, + { + "epoch": 0.14409141583054627, + "grad_norm": 0.4795887768268585, + "learning_rate": 8e-05, + "loss": 1.7847, + "step": 2585 + }, + { + "epoch": 0.14414715719063545, + "grad_norm": 0.43819573521614075, + "learning_rate": 8e-05, + "loss": 1.5203, + "step": 2586 + }, + { + "epoch": 0.14420289855072463, + "grad_norm": 0.4707789123058319, + "learning_rate": 8e-05, + "loss": 1.622, + "step": 2587 + }, + { + "epoch": 0.14425863991081384, + "grad_norm": 0.5049995183944702, + "learning_rate": 8e-05, + "loss": 1.7843, + "step": 2588 + }, + { + "epoch": 0.144314381270903, + "grad_norm": 0.5321909785270691, + "learning_rate": 8e-05, + "loss": 1.9627, + "step": 2589 + }, + { + "epoch": 0.1443701226309922, + "grad_norm": 0.49330127239227295, + "learning_rate": 8e-05, + "loss": 1.8805, + "step": 2590 + }, + { + "epoch": 0.14442586399108137, + "grad_norm": 0.5383540391921997, + "learning_rate": 8e-05, + "loss": 1.8618, + "step": 2591 + }, + { + "epoch": 0.14448160535117058, + "grad_norm": 0.45850417017936707, + "learning_rate": 8e-05, + "loss": 1.7167, + "step": 2592 + }, + { + "epoch": 0.14453734671125976, + "grad_norm": 0.475832998752594, + "learning_rate": 8e-05, + "loss": 1.7077, + "step": 2593 + }, + { + "epoch": 0.14459308807134894, + "grad_norm": 0.5012633800506592, + "learning_rate": 8e-05, + "loss": 1.7972, + "step": 2594 + }, + { + "epoch": 0.14464882943143811, + "grad_norm": 0.491800457239151, + "learning_rate": 8e-05, + "loss": 1.7677, + "step": 2595 + }, + { + "epoch": 0.14470457079152732, + "grad_norm": 0.4740968346595764, + "learning_rate": 8e-05, + "loss": 1.7764, + "step": 2596 + }, + { + "epoch": 0.1447603121516165, + "grad_norm": 0.44466421008110046, + "learning_rate": 8e-05, + "loss": 1.7242, + "step": 2597 + }, + { + "epoch": 0.14481605351170568, + "grad_norm": 
0.4517992436885834, + "learning_rate": 8e-05, + "loss": 1.6083, + "step": 2598 + }, + { + "epoch": 0.14487179487179488, + "grad_norm": 0.48363086581230164, + "learning_rate": 8e-05, + "loss": 1.8892, + "step": 2599 + }, + { + "epoch": 0.14492753623188406, + "grad_norm": 0.4761711359024048, + "learning_rate": 8e-05, + "loss": 1.8833, + "step": 2600 + }, + { + "epoch": 0.14498327759197324, + "grad_norm": 0.47965916991233826, + "learning_rate": 8e-05, + "loss": 1.9019, + "step": 2601 + }, + { + "epoch": 0.14503901895206242, + "grad_norm": 0.4785793125629425, + "learning_rate": 8e-05, + "loss": 1.7075, + "step": 2602 + }, + { + "epoch": 0.14509476031215163, + "grad_norm": 0.522576093673706, + "learning_rate": 8e-05, + "loss": 2.017, + "step": 2603 + }, + { + "epoch": 0.1451505016722408, + "grad_norm": 0.5296246409416199, + "learning_rate": 8e-05, + "loss": 1.6347, + "step": 2604 + }, + { + "epoch": 0.14520624303232998, + "grad_norm": 0.40535768866539, + "learning_rate": 8e-05, + "loss": 1.4272, + "step": 2605 + }, + { + "epoch": 0.14526198439241916, + "grad_norm": 0.44816961884498596, + "learning_rate": 8e-05, + "loss": 1.5196, + "step": 2606 + }, + { + "epoch": 0.14531772575250837, + "grad_norm": 0.5010104179382324, + "learning_rate": 8e-05, + "loss": 1.6105, + "step": 2607 + }, + { + "epoch": 0.14537346711259755, + "grad_norm": 0.5008467435836792, + "learning_rate": 8e-05, + "loss": 2.0679, + "step": 2608 + }, + { + "epoch": 0.14542920847268673, + "grad_norm": 0.562042772769928, + "learning_rate": 8e-05, + "loss": 2.1236, + "step": 2609 + }, + { + "epoch": 0.1454849498327759, + "grad_norm": 0.5288259387016296, + "learning_rate": 8e-05, + "loss": 1.8061, + "step": 2610 + }, + { + "epoch": 0.1455406911928651, + "grad_norm": 0.5157101154327393, + "learning_rate": 8e-05, + "loss": 1.8109, + "step": 2611 + }, + { + "epoch": 0.1455964325529543, + "grad_norm": 0.5012960433959961, + "learning_rate": 8e-05, + "loss": 1.7315, + "step": 2612 + }, + { + "epoch": 
0.14565217391304347, + "grad_norm": 0.5298734903335571, + "learning_rate": 8e-05, + "loss": 1.8804, + "step": 2613 + }, + { + "epoch": 0.14570791527313268, + "grad_norm": 0.49280276894569397, + "learning_rate": 8e-05, + "loss": 1.809, + "step": 2614 + }, + { + "epoch": 0.14576365663322186, + "grad_norm": 0.4701242744922638, + "learning_rate": 8e-05, + "loss": 1.6592, + "step": 2615 + }, + { + "epoch": 0.14581939799331103, + "grad_norm": 0.4502270817756653, + "learning_rate": 8e-05, + "loss": 1.5762, + "step": 2616 + }, + { + "epoch": 0.1458751393534002, + "grad_norm": 0.4606007933616638, + "learning_rate": 8e-05, + "loss": 1.6542, + "step": 2617 + }, + { + "epoch": 0.14593088071348942, + "grad_norm": 0.4372333586215973, + "learning_rate": 8e-05, + "loss": 1.4818, + "step": 2618 + }, + { + "epoch": 0.1459866220735786, + "grad_norm": 0.5123947858810425, + "learning_rate": 8e-05, + "loss": 1.7565, + "step": 2619 + }, + { + "epoch": 0.14604236343366778, + "grad_norm": 0.46434301137924194, + "learning_rate": 8e-05, + "loss": 1.6585, + "step": 2620 + }, + { + "epoch": 0.14609810479375696, + "grad_norm": 0.48808830976486206, + "learning_rate": 8e-05, + "loss": 1.6373, + "step": 2621 + }, + { + "epoch": 0.14615384615384616, + "grad_norm": 0.5246288180351257, + "learning_rate": 8e-05, + "loss": 1.9852, + "step": 2622 + }, + { + "epoch": 0.14620958751393534, + "grad_norm": 0.45331263542175293, + "learning_rate": 8e-05, + "loss": 1.739, + "step": 2623 + }, + { + "epoch": 0.14626532887402452, + "grad_norm": 0.4503915011882782, + "learning_rate": 8e-05, + "loss": 1.7201, + "step": 2624 + }, + { + "epoch": 0.14632107023411373, + "grad_norm": 0.456082820892334, + "learning_rate": 8e-05, + "loss": 1.5943, + "step": 2625 + }, + { + "epoch": 0.1463768115942029, + "grad_norm": 0.43923071026802063, + "learning_rate": 8e-05, + "loss": 1.5383, + "step": 2626 + }, + { + "epoch": 0.14643255295429208, + "grad_norm": 0.5226138830184937, + "learning_rate": 8e-05, + "loss": 1.822, + "step": 
2627 + }, + { + "epoch": 0.14648829431438126, + "grad_norm": 0.4432237148284912, + "learning_rate": 8e-05, + "loss": 1.6528, + "step": 2628 + }, + { + "epoch": 0.14654403567447047, + "grad_norm": 0.44176673889160156, + "learning_rate": 8e-05, + "loss": 1.4834, + "step": 2629 + }, + { + "epoch": 0.14659977703455965, + "grad_norm": 0.4683099389076233, + "learning_rate": 8e-05, + "loss": 1.6887, + "step": 2630 + }, + { + "epoch": 0.14665551839464883, + "grad_norm": 0.44806182384490967, + "learning_rate": 8e-05, + "loss": 1.6024, + "step": 2631 + }, + { + "epoch": 0.146711259754738, + "grad_norm": 0.488511323928833, + "learning_rate": 8e-05, + "loss": 1.9064, + "step": 2632 + }, + { + "epoch": 0.1467670011148272, + "grad_norm": 0.4772742986679077, + "learning_rate": 8e-05, + "loss": 1.8026, + "step": 2633 + }, + { + "epoch": 0.1468227424749164, + "grad_norm": 0.4587347209453583, + "learning_rate": 8e-05, + "loss": 1.6626, + "step": 2634 + }, + { + "epoch": 0.14687848383500557, + "grad_norm": 0.5513268113136292, + "learning_rate": 8e-05, + "loss": 2.0832, + "step": 2635 + }, + { + "epoch": 0.14693422519509475, + "grad_norm": 0.44140949845314026, + "learning_rate": 8e-05, + "loss": 1.6923, + "step": 2636 + }, + { + "epoch": 0.14698996655518395, + "grad_norm": 0.4342259168624878, + "learning_rate": 8e-05, + "loss": 1.5283, + "step": 2637 + }, + { + "epoch": 0.14704570791527313, + "grad_norm": 0.5041188597679138, + "learning_rate": 8e-05, + "loss": 1.8199, + "step": 2638 + }, + { + "epoch": 0.1471014492753623, + "grad_norm": 0.4816203713417053, + "learning_rate": 8e-05, + "loss": 1.8403, + "step": 2639 + }, + { + "epoch": 0.14715719063545152, + "grad_norm": 0.5168691873550415, + "learning_rate": 8e-05, + "loss": 1.9105, + "step": 2640 + }, + { + "epoch": 0.1472129319955407, + "grad_norm": 0.47315770387649536, + "learning_rate": 8e-05, + "loss": 1.7493, + "step": 2641 + }, + { + "epoch": 0.14726867335562988, + "grad_norm": 0.4396923780441284, + "learning_rate": 8e-05, + 
"loss": 1.4694, + "step": 2642 + }, + { + "epoch": 0.14732441471571905, + "grad_norm": 0.4882901906967163, + "learning_rate": 8e-05, + "loss": 1.6426, + "step": 2643 + }, + { + "epoch": 0.14738015607580826, + "grad_norm": 0.5237675905227661, + "learning_rate": 8e-05, + "loss": 1.9971, + "step": 2644 + }, + { + "epoch": 0.14743589743589744, + "grad_norm": 0.4680856466293335, + "learning_rate": 8e-05, + "loss": 1.6813, + "step": 2645 + }, + { + "epoch": 0.14749163879598662, + "grad_norm": 0.4802532494068146, + "learning_rate": 8e-05, + "loss": 1.6923, + "step": 2646 + }, + { + "epoch": 0.1475473801560758, + "grad_norm": 0.4531373381614685, + "learning_rate": 8e-05, + "loss": 1.6865, + "step": 2647 + }, + { + "epoch": 0.147603121516165, + "grad_norm": 0.49141162633895874, + "learning_rate": 8e-05, + "loss": 1.8942, + "step": 2648 + }, + { + "epoch": 0.14765886287625418, + "grad_norm": 0.40621107816696167, + "learning_rate": 8e-05, + "loss": 1.5562, + "step": 2649 + }, + { + "epoch": 0.14771460423634336, + "grad_norm": 0.48760560154914856, + "learning_rate": 8e-05, + "loss": 1.8392, + "step": 2650 + }, + { + "epoch": 0.14777034559643254, + "grad_norm": 0.4867985248565674, + "learning_rate": 8e-05, + "loss": 1.8491, + "step": 2651 + }, + { + "epoch": 0.14782608695652175, + "grad_norm": 0.5120669603347778, + "learning_rate": 8e-05, + "loss": 1.8304, + "step": 2652 + }, + { + "epoch": 0.14788182831661092, + "grad_norm": 0.4798111915588379, + "learning_rate": 8e-05, + "loss": 1.8607, + "step": 2653 + }, + { + "epoch": 0.1479375696767001, + "grad_norm": 0.47370168566703796, + "learning_rate": 8e-05, + "loss": 1.7137, + "step": 2654 + }, + { + "epoch": 0.1479933110367893, + "grad_norm": 0.6196955442428589, + "learning_rate": 8e-05, + "loss": 1.6156, + "step": 2655 + }, + { + "epoch": 0.1480490523968785, + "grad_norm": 0.4761423170566559, + "learning_rate": 8e-05, + "loss": 1.7644, + "step": 2656 + }, + { + "epoch": 0.14810479375696767, + "grad_norm": 0.49045848846435547, + 
"learning_rate": 8e-05, + "loss": 1.5832, + "step": 2657 + }, + { + "epoch": 0.14816053511705685, + "grad_norm": 0.45145368576049805, + "learning_rate": 8e-05, + "loss": 1.4705, + "step": 2658 + }, + { + "epoch": 0.14821627647714605, + "grad_norm": 0.49962612986564636, + "learning_rate": 8e-05, + "loss": 1.7623, + "step": 2659 + }, + { + "epoch": 0.14827201783723523, + "grad_norm": 0.4893467426300049, + "learning_rate": 8e-05, + "loss": 1.9686, + "step": 2660 + }, + { + "epoch": 0.1483277591973244, + "grad_norm": 0.48408836126327515, + "learning_rate": 8e-05, + "loss": 1.7615, + "step": 2661 + }, + { + "epoch": 0.1483835005574136, + "grad_norm": 0.48030319809913635, + "learning_rate": 8e-05, + "loss": 2.0596, + "step": 2662 + }, + { + "epoch": 0.1484392419175028, + "grad_norm": 0.46270525455474854, + "learning_rate": 8e-05, + "loss": 1.7686, + "step": 2663 + }, + { + "epoch": 0.14849498327759197, + "grad_norm": 0.41890811920166016, + "learning_rate": 8e-05, + "loss": 1.6958, + "step": 2664 + }, + { + "epoch": 0.14855072463768115, + "grad_norm": 0.49203094840049744, + "learning_rate": 8e-05, + "loss": 1.8678, + "step": 2665 + }, + { + "epoch": 0.14860646599777033, + "grad_norm": 0.4484955072402954, + "learning_rate": 8e-05, + "loss": 1.8809, + "step": 2666 + }, + { + "epoch": 0.14866220735785954, + "grad_norm": 0.5041595697402954, + "learning_rate": 8e-05, + "loss": 1.9027, + "step": 2667 + }, + { + "epoch": 0.14871794871794872, + "grad_norm": 0.4210891127586365, + "learning_rate": 8e-05, + "loss": 1.4913, + "step": 2668 + }, + { + "epoch": 0.1487736900780379, + "grad_norm": 0.47749096155166626, + "learning_rate": 8e-05, + "loss": 1.5191, + "step": 2669 + }, + { + "epoch": 0.1488294314381271, + "grad_norm": 0.45108339190483093, + "learning_rate": 8e-05, + "loss": 1.695, + "step": 2670 + }, + { + "epoch": 0.14888517279821628, + "grad_norm": 0.5282843112945557, + "learning_rate": 8e-05, + "loss": 1.4526, + "step": 2671 + }, + { + "epoch": 0.14894091415830546, + 
"grad_norm": 0.4593731164932251, + "learning_rate": 8e-05, + "loss": 1.7042, + "step": 2672 + }, + { + "epoch": 0.14899665551839464, + "grad_norm": 0.5506393313407898, + "learning_rate": 8e-05, + "loss": 1.935, + "step": 2673 + }, + { + "epoch": 0.14905239687848384, + "grad_norm": 0.5012826919555664, + "learning_rate": 8e-05, + "loss": 1.45, + "step": 2674 + }, + { + "epoch": 0.14910813823857302, + "grad_norm": 0.5560808777809143, + "learning_rate": 8e-05, + "loss": 2.025, + "step": 2675 + }, + { + "epoch": 0.1491638795986622, + "grad_norm": 0.5152798891067505, + "learning_rate": 8e-05, + "loss": 1.8135, + "step": 2676 + }, + { + "epoch": 0.14921962095875138, + "grad_norm": 0.5184229016304016, + "learning_rate": 8e-05, + "loss": 1.7617, + "step": 2677 + }, + { + "epoch": 0.1492753623188406, + "grad_norm": 0.49748098850250244, + "learning_rate": 8e-05, + "loss": 1.6876, + "step": 2678 + }, + { + "epoch": 0.14933110367892977, + "grad_norm": 0.5202342867851257, + "learning_rate": 8e-05, + "loss": 1.7849, + "step": 2679 + }, + { + "epoch": 0.14938684503901895, + "grad_norm": 0.47590065002441406, + "learning_rate": 8e-05, + "loss": 1.7537, + "step": 2680 + }, + { + "epoch": 0.14944258639910815, + "grad_norm": 0.5148934125900269, + "learning_rate": 8e-05, + "loss": 1.8217, + "step": 2681 + }, + { + "epoch": 0.14949832775919733, + "grad_norm": 0.487512469291687, + "learning_rate": 8e-05, + "loss": 1.8185, + "step": 2682 + }, + { + "epoch": 0.1495540691192865, + "grad_norm": 0.505423367023468, + "learning_rate": 8e-05, + "loss": 1.8296, + "step": 2683 + }, + { + "epoch": 0.1496098104793757, + "grad_norm": 0.4880646765232086, + "learning_rate": 8e-05, + "loss": 1.797, + "step": 2684 + }, + { + "epoch": 0.1496655518394649, + "grad_norm": 0.45995259284973145, + "learning_rate": 8e-05, + "loss": 1.548, + "step": 2685 + }, + { + "epoch": 0.14972129319955407, + "grad_norm": 0.48819291591644287, + "learning_rate": 8e-05, + "loss": 1.6364, + "step": 2686 + }, + { + "epoch": 
0.14977703455964325, + "grad_norm": 0.47882404923439026, + "learning_rate": 8e-05, + "loss": 1.5055, + "step": 2687 + }, + { + "epoch": 0.14983277591973243, + "grad_norm": 0.437224805355072, + "learning_rate": 8e-05, + "loss": 1.6657, + "step": 2688 + }, + { + "epoch": 0.14988851727982164, + "grad_norm": 0.49094241857528687, + "learning_rate": 8e-05, + "loss": 1.8073, + "step": 2689 + }, + { + "epoch": 0.14994425863991082, + "grad_norm": 0.45183655619621277, + "learning_rate": 8e-05, + "loss": 1.5465, + "step": 2690 + }, + { + "epoch": 0.15, + "grad_norm": 0.48996084928512573, + "learning_rate": 8e-05, + "loss": 1.901, + "step": 2691 + }, + { + "epoch": 0.15005574136008917, + "grad_norm": 0.43467774987220764, + "learning_rate": 8e-05, + "loss": 1.615, + "step": 2692 + }, + { + "epoch": 0.15011148272017838, + "grad_norm": 0.460771769285202, + "learning_rate": 8e-05, + "loss": 1.8058, + "step": 2693 + }, + { + "epoch": 0.15016722408026756, + "grad_norm": 0.49444660544395447, + "learning_rate": 8e-05, + "loss": 1.6476, + "step": 2694 + }, + { + "epoch": 0.15022296544035674, + "grad_norm": 0.4313269257545471, + "learning_rate": 8e-05, + "loss": 1.4598, + "step": 2695 + }, + { + "epoch": 0.15027870680044594, + "grad_norm": 0.5513227581977844, + "learning_rate": 8e-05, + "loss": 2.0741, + "step": 2696 + }, + { + "epoch": 0.15033444816053512, + "grad_norm": 0.46913573145866394, + "learning_rate": 8e-05, + "loss": 1.6384, + "step": 2697 + }, + { + "epoch": 0.1503901895206243, + "grad_norm": 0.4985828399658203, + "learning_rate": 8e-05, + "loss": 1.4213, + "step": 2698 + }, + { + "epoch": 0.15044593088071348, + "grad_norm": 0.4544661045074463, + "learning_rate": 8e-05, + "loss": 1.7351, + "step": 2699 + }, + { + "epoch": 0.1505016722408027, + "grad_norm": 0.5336171984672546, + "learning_rate": 8e-05, + "loss": 1.9037, + "step": 2700 + }, + { + "epoch": 0.15055741360089187, + "grad_norm": 0.4664371609687805, + "learning_rate": 8e-05, + "loss": 1.6724, + "step": 2701 + }, + { 
+ "epoch": 0.15061315496098104, + "grad_norm": 0.4967934489250183, + "learning_rate": 8e-05, + "loss": 1.9133, + "step": 2702 + }, + { + "epoch": 0.15066889632107022, + "grad_norm": 0.4723069667816162, + "learning_rate": 8e-05, + "loss": 1.6718, + "step": 2703 + }, + { + "epoch": 0.15072463768115943, + "grad_norm": 0.5051380395889282, + "learning_rate": 8e-05, + "loss": 1.6417, + "step": 2704 + }, + { + "epoch": 0.1507803790412486, + "grad_norm": 0.49244359135627747, + "learning_rate": 8e-05, + "loss": 1.6002, + "step": 2705 + }, + { + "epoch": 0.1508361204013378, + "grad_norm": 0.4505351483821869, + "learning_rate": 8e-05, + "loss": 1.5397, + "step": 2706 + }, + { + "epoch": 0.15089186176142697, + "grad_norm": 0.5007824897766113, + "learning_rate": 8e-05, + "loss": 1.6409, + "step": 2707 + }, + { + "epoch": 0.15094760312151617, + "grad_norm": 0.5030344724655151, + "learning_rate": 8e-05, + "loss": 1.7374, + "step": 2708 + }, + { + "epoch": 0.15100334448160535, + "grad_norm": 0.5079347491264343, + "learning_rate": 8e-05, + "loss": 1.9303, + "step": 2709 + }, + { + "epoch": 0.15105908584169453, + "grad_norm": 0.5016282200813293, + "learning_rate": 8e-05, + "loss": 2.0209, + "step": 2710 + }, + { + "epoch": 0.15111482720178374, + "grad_norm": 0.4660756289958954, + "learning_rate": 8e-05, + "loss": 1.5367, + "step": 2711 + }, + { + "epoch": 0.15117056856187291, + "grad_norm": 0.536495566368103, + "learning_rate": 8e-05, + "loss": 1.8843, + "step": 2712 + }, + { + "epoch": 0.1512263099219621, + "grad_norm": 0.49942296743392944, + "learning_rate": 8e-05, + "loss": 1.7927, + "step": 2713 + }, + { + "epoch": 0.15128205128205127, + "grad_norm": 0.5121089220046997, + "learning_rate": 8e-05, + "loss": 1.9474, + "step": 2714 + }, + { + "epoch": 0.15133779264214048, + "grad_norm": 0.49932828545570374, + "learning_rate": 8e-05, + "loss": 1.9303, + "step": 2715 + }, + { + "epoch": 0.15139353400222966, + "grad_norm": 0.43620043992996216, + "learning_rate": 8e-05, + "loss": 1.638, 
+ "step": 2716 + }, + { + "epoch": 0.15144927536231884, + "grad_norm": 0.491070032119751, + "learning_rate": 8e-05, + "loss": 1.8854, + "step": 2717 + }, + { + "epoch": 0.15150501672240801, + "grad_norm": 0.47080928087234497, + "learning_rate": 8e-05, + "loss": 1.8339, + "step": 2718 + }, + { + "epoch": 0.15156075808249722, + "grad_norm": 0.4873843193054199, + "learning_rate": 8e-05, + "loss": 1.9661, + "step": 2719 + }, + { + "epoch": 0.1516164994425864, + "grad_norm": 0.5007215738296509, + "learning_rate": 8e-05, + "loss": 1.8296, + "step": 2720 + }, + { + "epoch": 0.15167224080267558, + "grad_norm": 0.47283855080604553, + "learning_rate": 8e-05, + "loss": 1.7406, + "step": 2721 + }, + { + "epoch": 0.15172798216276479, + "grad_norm": 0.4671013653278351, + "learning_rate": 8e-05, + "loss": 1.6189, + "step": 2722 + }, + { + "epoch": 0.15178372352285396, + "grad_norm": 0.45656901597976685, + "learning_rate": 8e-05, + "loss": 1.6288, + "step": 2723 + }, + { + "epoch": 0.15183946488294314, + "grad_norm": 0.45888885855674744, + "learning_rate": 8e-05, + "loss": 1.6675, + "step": 2724 + }, + { + "epoch": 0.15189520624303232, + "grad_norm": 0.4929250180721283, + "learning_rate": 8e-05, + "loss": 1.6704, + "step": 2725 + }, + { + "epoch": 0.15195094760312153, + "grad_norm": 0.5203201770782471, + "learning_rate": 8e-05, + "loss": 1.8276, + "step": 2726 + }, + { + "epoch": 0.1520066889632107, + "grad_norm": 0.45683154463768005, + "learning_rate": 8e-05, + "loss": 1.5656, + "step": 2727 + }, + { + "epoch": 0.15206243032329989, + "grad_norm": 0.4747931659221649, + "learning_rate": 8e-05, + "loss": 1.6466, + "step": 2728 + }, + { + "epoch": 0.15211817168338906, + "grad_norm": 0.49119818210601807, + "learning_rate": 8e-05, + "loss": 1.7029, + "step": 2729 + }, + { + "epoch": 0.15217391304347827, + "grad_norm": 0.47732195258140564, + "learning_rate": 8e-05, + "loss": 1.6918, + "step": 2730 + }, + { + "epoch": 0.15222965440356745, + "grad_norm": 0.490499347448349, + 
"learning_rate": 8e-05, + "loss": 1.7927, + "step": 2731 + }, + { + "epoch": 0.15228539576365663, + "grad_norm": 0.5020274519920349, + "learning_rate": 8e-05, + "loss": 1.893, + "step": 2732 + }, + { + "epoch": 0.1523411371237458, + "grad_norm": 0.46516138315200806, + "learning_rate": 8e-05, + "loss": 1.6328, + "step": 2733 + }, + { + "epoch": 0.152396878483835, + "grad_norm": 0.5290771722793579, + "learning_rate": 8e-05, + "loss": 2.1387, + "step": 2734 + }, + { + "epoch": 0.1524526198439242, + "grad_norm": 0.4662680923938751, + "learning_rate": 8e-05, + "loss": 1.7573, + "step": 2735 + }, + { + "epoch": 0.15250836120401337, + "grad_norm": 0.4540351331233978, + "learning_rate": 8e-05, + "loss": 1.7422, + "step": 2736 + }, + { + "epoch": 0.15256410256410258, + "grad_norm": 0.4451354444026947, + "learning_rate": 8e-05, + "loss": 1.4756, + "step": 2737 + }, + { + "epoch": 0.15261984392419176, + "grad_norm": 0.4653637409210205, + "learning_rate": 8e-05, + "loss": 1.782, + "step": 2738 + }, + { + "epoch": 0.15267558528428093, + "grad_norm": 0.48152562975883484, + "learning_rate": 8e-05, + "loss": 1.7043, + "step": 2739 + }, + { + "epoch": 0.1527313266443701, + "grad_norm": 0.5343277454376221, + "learning_rate": 8e-05, + "loss": 1.834, + "step": 2740 + }, + { + "epoch": 0.15278706800445932, + "grad_norm": 0.48790308833122253, + "learning_rate": 8e-05, + "loss": 1.7217, + "step": 2741 + }, + { + "epoch": 0.1528428093645485, + "grad_norm": 0.4877580404281616, + "learning_rate": 8e-05, + "loss": 1.771, + "step": 2742 + }, + { + "epoch": 0.15289855072463768, + "grad_norm": 0.4496322274208069, + "learning_rate": 8e-05, + "loss": 1.7451, + "step": 2743 + }, + { + "epoch": 0.15295429208472686, + "grad_norm": 0.4950348734855652, + "learning_rate": 8e-05, + "loss": 1.7627, + "step": 2744 + }, + { + "epoch": 0.15301003344481606, + "grad_norm": 0.4993313252925873, + "learning_rate": 8e-05, + "loss": 1.7715, + "step": 2745 + }, + { + "epoch": 0.15306577480490524, + "grad_norm": 
0.45644307136535645, + "learning_rate": 8e-05, + "loss": 1.7968, + "step": 2746 + }, + { + "epoch": 0.15312151616499442, + "grad_norm": 0.4572172164916992, + "learning_rate": 8e-05, + "loss": 1.6494, + "step": 2747 + }, + { + "epoch": 0.1531772575250836, + "grad_norm": 0.48790818452835083, + "learning_rate": 8e-05, + "loss": 1.8427, + "step": 2748 + }, + { + "epoch": 0.1532329988851728, + "grad_norm": 0.4811917841434479, + "learning_rate": 8e-05, + "loss": 1.7571, + "step": 2749 + }, + { + "epoch": 0.15328874024526198, + "grad_norm": 0.4466029107570648, + "learning_rate": 8e-05, + "loss": 1.6303, + "step": 2750 + }, + { + "epoch": 0.15334448160535116, + "grad_norm": 0.47539398074150085, + "learning_rate": 8e-05, + "loss": 1.8565, + "step": 2751 + }, + { + "epoch": 0.15340022296544037, + "grad_norm": 0.45929116010665894, + "learning_rate": 8e-05, + "loss": 1.9393, + "step": 2752 + }, + { + "epoch": 0.15345596432552955, + "grad_norm": 0.5932511687278748, + "learning_rate": 8e-05, + "loss": 1.7947, + "step": 2753 + }, + { + "epoch": 0.15351170568561873, + "grad_norm": 0.4963116943836212, + "learning_rate": 8e-05, + "loss": 1.84, + "step": 2754 + }, + { + "epoch": 0.1535674470457079, + "grad_norm": 0.4386138617992401, + "learning_rate": 8e-05, + "loss": 1.5813, + "step": 2755 + }, + { + "epoch": 0.1536231884057971, + "grad_norm": 0.5233179330825806, + "learning_rate": 8e-05, + "loss": 1.7951, + "step": 2756 + }, + { + "epoch": 0.1536789297658863, + "grad_norm": 0.42530587315559387, + "learning_rate": 8e-05, + "loss": 1.4469, + "step": 2757 + }, + { + "epoch": 0.15373467112597547, + "grad_norm": 0.45138615369796753, + "learning_rate": 8e-05, + "loss": 1.6971, + "step": 2758 + }, + { + "epoch": 0.15379041248606465, + "grad_norm": 0.4747151732444763, + "learning_rate": 8e-05, + "loss": 1.6825, + "step": 2759 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 0.4871062934398651, + "learning_rate": 8e-05, + "loss": 1.8835, + "step": 2760 + }, + { + "epoch": 
0.15390189520624303, + "grad_norm": 0.4672834277153015, + "learning_rate": 8e-05, + "loss": 1.5864, + "step": 2761 + }, + { + "epoch": 0.1539576365663322, + "grad_norm": 0.5292240381240845, + "learning_rate": 8e-05, + "loss": 2.1529, + "step": 2762 + }, + { + "epoch": 0.1540133779264214, + "grad_norm": 0.5465215444564819, + "learning_rate": 8e-05, + "loss": 1.8391, + "step": 2763 + }, + { + "epoch": 0.1540691192865106, + "grad_norm": 0.4289812445640564, + "learning_rate": 8e-05, + "loss": 1.3537, + "step": 2764 + }, + { + "epoch": 0.15412486064659978, + "grad_norm": 0.4715195596218109, + "learning_rate": 8e-05, + "loss": 1.7668, + "step": 2765 + }, + { + "epoch": 0.15418060200668895, + "grad_norm": 0.5369305610656738, + "learning_rate": 8e-05, + "loss": 1.9698, + "step": 2766 + }, + { + "epoch": 0.15423634336677816, + "grad_norm": 0.4952063262462616, + "learning_rate": 8e-05, + "loss": 1.8069, + "step": 2767 + }, + { + "epoch": 0.15429208472686734, + "grad_norm": 0.46554696559906006, + "learning_rate": 8e-05, + "loss": 1.7292, + "step": 2768 + }, + { + "epoch": 0.15434782608695652, + "grad_norm": 0.48321375250816345, + "learning_rate": 8e-05, + "loss": 1.7208, + "step": 2769 + }, + { + "epoch": 0.1544035674470457, + "grad_norm": 0.45590123534202576, + "learning_rate": 8e-05, + "loss": 1.7738, + "step": 2770 + }, + { + "epoch": 0.1544593088071349, + "grad_norm": 0.558228075504303, + "learning_rate": 8e-05, + "loss": 2.0212, + "step": 2771 + }, + { + "epoch": 0.15451505016722408, + "grad_norm": 0.44301968812942505, + "learning_rate": 8e-05, + "loss": 1.6274, + "step": 2772 + }, + { + "epoch": 0.15457079152731326, + "grad_norm": 0.5060457587242126, + "learning_rate": 8e-05, + "loss": 1.9058, + "step": 2773 + }, + { + "epoch": 0.15462653288740244, + "grad_norm": 0.4313315153121948, + "learning_rate": 8e-05, + "loss": 1.68, + "step": 2774 + }, + { + "epoch": 0.15468227424749165, + "grad_norm": 0.47403407096862793, + "learning_rate": 8e-05, + "loss": 1.7216, + "step": 
2775 + }, + { + "epoch": 0.15473801560758083, + "grad_norm": 0.48227450251579285, + "learning_rate": 8e-05, + "loss": 1.8895, + "step": 2776 + }, + { + "epoch": 0.15479375696767, + "grad_norm": 0.5059760212898254, + "learning_rate": 8e-05, + "loss": 1.9685, + "step": 2777 + }, + { + "epoch": 0.1548494983277592, + "grad_norm": 0.5228263139724731, + "learning_rate": 8e-05, + "loss": 2.211, + "step": 2778 + }, + { + "epoch": 0.1549052396878484, + "grad_norm": 0.46741005778312683, + "learning_rate": 8e-05, + "loss": 1.6471, + "step": 2779 + }, + { + "epoch": 0.15496098104793757, + "grad_norm": 0.48850780725479126, + "learning_rate": 8e-05, + "loss": 1.8356, + "step": 2780 + }, + { + "epoch": 0.15501672240802675, + "grad_norm": 0.5071626305580139, + "learning_rate": 8e-05, + "loss": 1.6518, + "step": 2781 + }, + { + "epoch": 0.15507246376811595, + "grad_norm": 0.47851163148880005, + "learning_rate": 8e-05, + "loss": 1.7904, + "step": 2782 + }, + { + "epoch": 0.15512820512820513, + "grad_norm": 0.45001041889190674, + "learning_rate": 8e-05, + "loss": 1.672, + "step": 2783 + }, + { + "epoch": 0.1551839464882943, + "grad_norm": 0.4666180908679962, + "learning_rate": 8e-05, + "loss": 1.6842, + "step": 2784 + }, + { + "epoch": 0.1552396878483835, + "grad_norm": 0.4250207841396332, + "learning_rate": 8e-05, + "loss": 1.4794, + "step": 2785 + }, + { + "epoch": 0.1552954292084727, + "grad_norm": 0.5495896935462952, + "learning_rate": 8e-05, + "loss": 1.9165, + "step": 2786 + }, + { + "epoch": 0.15535117056856187, + "grad_norm": 0.48266035318374634, + "learning_rate": 8e-05, + "loss": 1.8742, + "step": 2787 + }, + { + "epoch": 0.15540691192865105, + "grad_norm": 0.47781747579574585, + "learning_rate": 8e-05, + "loss": 1.8515, + "step": 2788 + }, + { + "epoch": 0.15546265328874023, + "grad_norm": 0.4679318368434906, + "learning_rate": 8e-05, + "loss": 1.7413, + "step": 2789 + }, + { + "epoch": 0.15551839464882944, + "grad_norm": 0.4929085969924927, + "learning_rate": 8e-05, + 
"loss": 1.8921, + "step": 2790 + }, + { + "epoch": 0.15557413600891862, + "grad_norm": 0.4672272205352783, + "learning_rate": 8e-05, + "loss": 1.6709, + "step": 2791 + }, + { + "epoch": 0.1556298773690078, + "grad_norm": 0.4650382399559021, + "learning_rate": 8e-05, + "loss": 1.504, + "step": 2792 + }, + { + "epoch": 0.155685618729097, + "grad_norm": 0.43125632405281067, + "learning_rate": 8e-05, + "loss": 1.4325, + "step": 2793 + }, + { + "epoch": 0.15574136008918618, + "grad_norm": 0.446275919675827, + "learning_rate": 8e-05, + "loss": 1.6414, + "step": 2794 + }, + { + "epoch": 0.15579710144927536, + "grad_norm": 0.449980229139328, + "learning_rate": 8e-05, + "loss": 1.6934, + "step": 2795 + }, + { + "epoch": 0.15585284280936454, + "grad_norm": 0.4828239977359772, + "learning_rate": 8e-05, + "loss": 1.7457, + "step": 2796 + }, + { + "epoch": 0.15590858416945375, + "grad_norm": 0.5396363735198975, + "learning_rate": 8e-05, + "loss": 1.8686, + "step": 2797 + }, + { + "epoch": 0.15596432552954292, + "grad_norm": 0.4906916320323944, + "learning_rate": 8e-05, + "loss": 1.7499, + "step": 2798 + }, + { + "epoch": 0.1560200668896321, + "grad_norm": 0.502632200717926, + "learning_rate": 8e-05, + "loss": 1.9716, + "step": 2799 + }, + { + "epoch": 0.15607580824972128, + "grad_norm": 0.5308877229690552, + "learning_rate": 8e-05, + "loss": 1.694, + "step": 2800 + }, + { + "epoch": 0.1561315496098105, + "grad_norm": 0.45435258746147156, + "learning_rate": 8e-05, + "loss": 1.573, + "step": 2801 + }, + { + "epoch": 0.15618729096989967, + "grad_norm": 0.4956980049610138, + "learning_rate": 8e-05, + "loss": 1.8578, + "step": 2802 + }, + { + "epoch": 0.15624303232998885, + "grad_norm": 0.5421011447906494, + "learning_rate": 8e-05, + "loss": 1.6132, + "step": 2803 + }, + { + "epoch": 0.15629877369007802, + "grad_norm": 0.4681341350078583, + "learning_rate": 8e-05, + "loss": 1.6352, + "step": 2804 + }, + { + "epoch": 0.15635451505016723, + "grad_norm": 0.44525671005249023, + 
"learning_rate": 8e-05, + "loss": 1.6762, + "step": 2805 + }, + { + "epoch": 0.1564102564102564, + "grad_norm": 0.4492572247982025, + "learning_rate": 8e-05, + "loss": 1.8298, + "step": 2806 + }, + { + "epoch": 0.1564659977703456, + "grad_norm": 0.45993149280548096, + "learning_rate": 8e-05, + "loss": 1.6447, + "step": 2807 + }, + { + "epoch": 0.1565217391304348, + "grad_norm": 0.4706052541732788, + "learning_rate": 8e-05, + "loss": 1.5694, + "step": 2808 + }, + { + "epoch": 0.15657748049052397, + "grad_norm": 0.4705091118812561, + "learning_rate": 8e-05, + "loss": 1.6959, + "step": 2809 + }, + { + "epoch": 0.15663322185061315, + "grad_norm": 0.47114166617393494, + "learning_rate": 8e-05, + "loss": 1.6371, + "step": 2810 + }, + { + "epoch": 0.15668896321070233, + "grad_norm": 0.5016292333602905, + "learning_rate": 8e-05, + "loss": 1.7122, + "step": 2811 + }, + { + "epoch": 0.15674470457079154, + "grad_norm": 0.48519521951675415, + "learning_rate": 8e-05, + "loss": 1.7304, + "step": 2812 + }, + { + "epoch": 0.15680044593088072, + "grad_norm": 0.48272377252578735, + "learning_rate": 8e-05, + "loss": 1.7092, + "step": 2813 + }, + { + "epoch": 0.1568561872909699, + "grad_norm": 0.505794882774353, + "learning_rate": 8e-05, + "loss": 1.701, + "step": 2814 + }, + { + "epoch": 0.15691192865105907, + "grad_norm": 0.44441741704940796, + "learning_rate": 8e-05, + "loss": 1.702, + "step": 2815 + }, + { + "epoch": 0.15696767001114828, + "grad_norm": 0.47186487913131714, + "learning_rate": 8e-05, + "loss": 1.7099, + "step": 2816 + }, + { + "epoch": 0.15702341137123746, + "grad_norm": 0.5894055366516113, + "learning_rate": 8e-05, + "loss": 1.6043, + "step": 2817 + }, + { + "epoch": 0.15707915273132664, + "grad_norm": 0.5004922151565552, + "learning_rate": 8e-05, + "loss": 1.7105, + "step": 2818 + }, + { + "epoch": 0.15713489409141584, + "grad_norm": 0.46560198068618774, + "learning_rate": 8e-05, + "loss": 1.7297, + "step": 2819 + }, + { + "epoch": 0.15719063545150502, + 
"grad_norm": 0.4759940505027771, + "learning_rate": 8e-05, + "loss": 1.7775, + "step": 2820 + }, + { + "epoch": 0.1572463768115942, + "grad_norm": 0.5159291625022888, + "learning_rate": 8e-05, + "loss": 1.9294, + "step": 2821 + }, + { + "epoch": 0.15730211817168338, + "grad_norm": 0.5078474879264832, + "learning_rate": 8e-05, + "loss": 1.9248, + "step": 2822 + }, + { + "epoch": 0.1573578595317726, + "grad_norm": 0.4777558445930481, + "learning_rate": 8e-05, + "loss": 1.7751, + "step": 2823 + }, + { + "epoch": 0.15741360089186177, + "grad_norm": 0.4959481656551361, + "learning_rate": 8e-05, + "loss": 1.6357, + "step": 2824 + }, + { + "epoch": 0.15746934225195094, + "grad_norm": 0.4931829273700714, + "learning_rate": 8e-05, + "loss": 1.5699, + "step": 2825 + }, + { + "epoch": 0.15752508361204012, + "grad_norm": 0.4662152826786041, + "learning_rate": 8e-05, + "loss": 1.6673, + "step": 2826 + }, + { + "epoch": 0.15758082497212933, + "grad_norm": 0.4971119165420532, + "learning_rate": 8e-05, + "loss": 1.6575, + "step": 2827 + }, + { + "epoch": 0.1576365663322185, + "grad_norm": 0.478135347366333, + "learning_rate": 8e-05, + "loss": 1.7929, + "step": 2828 + }, + { + "epoch": 0.1576923076923077, + "grad_norm": 0.5057351589202881, + "learning_rate": 8e-05, + "loss": 1.8841, + "step": 2829 + }, + { + "epoch": 0.15774804905239687, + "grad_norm": 0.5114669799804688, + "learning_rate": 8e-05, + "loss": 1.6789, + "step": 2830 + }, + { + "epoch": 0.15780379041248607, + "grad_norm": 0.4946887791156769, + "learning_rate": 8e-05, + "loss": 1.9107, + "step": 2831 + }, + { + "epoch": 0.15785953177257525, + "grad_norm": 0.4401973485946655, + "learning_rate": 8e-05, + "loss": 1.4577, + "step": 2832 + }, + { + "epoch": 0.15791527313266443, + "grad_norm": 0.46898627281188965, + "learning_rate": 8e-05, + "loss": 1.7578, + "step": 2833 + }, + { + "epoch": 0.15797101449275364, + "grad_norm": 0.5002691745758057, + "learning_rate": 8e-05, + "loss": 1.5825, + "step": 2834 + }, + { + "epoch": 
0.15802675585284282, + "grad_norm": 0.5099799633026123, + "learning_rate": 8e-05, + "loss": 1.7851, + "step": 2835 + }, + { + "epoch": 0.158082497212932, + "grad_norm": 0.49243247509002686, + "learning_rate": 8e-05, + "loss": 1.8366, + "step": 2836 + }, + { + "epoch": 0.15813823857302117, + "grad_norm": 0.547187864780426, + "learning_rate": 8e-05, + "loss": 1.792, + "step": 2837 + }, + { + "epoch": 0.15819397993311038, + "grad_norm": 0.4610172212123871, + "learning_rate": 8e-05, + "loss": 1.6167, + "step": 2838 + }, + { + "epoch": 0.15824972129319956, + "grad_norm": 0.5065680742263794, + "learning_rate": 8e-05, + "loss": 2.005, + "step": 2839 + }, + { + "epoch": 0.15830546265328874, + "grad_norm": 0.5296475887298584, + "learning_rate": 8e-05, + "loss": 1.9063, + "step": 2840 + }, + { + "epoch": 0.15836120401337792, + "grad_norm": 0.47584494948387146, + "learning_rate": 8e-05, + "loss": 1.6319, + "step": 2841 + }, + { + "epoch": 0.15841694537346712, + "grad_norm": 0.482562780380249, + "learning_rate": 8e-05, + "loss": 1.6442, + "step": 2842 + }, + { + "epoch": 0.1584726867335563, + "grad_norm": 0.5090284943580627, + "learning_rate": 8e-05, + "loss": 1.7997, + "step": 2843 + }, + { + "epoch": 0.15852842809364548, + "grad_norm": 0.4627036452293396, + "learning_rate": 8e-05, + "loss": 1.7175, + "step": 2844 + }, + { + "epoch": 0.15858416945373466, + "grad_norm": 0.48607441782951355, + "learning_rate": 8e-05, + "loss": 1.7816, + "step": 2845 + }, + { + "epoch": 0.15863991081382386, + "grad_norm": 0.4424927532672882, + "learning_rate": 8e-05, + "loss": 1.4956, + "step": 2846 + }, + { + "epoch": 0.15869565217391304, + "grad_norm": 0.5370016098022461, + "learning_rate": 8e-05, + "loss": 1.8462, + "step": 2847 + }, + { + "epoch": 0.15875139353400222, + "grad_norm": 0.5331100225448608, + "learning_rate": 8e-05, + "loss": 1.8603, + "step": 2848 + }, + { + "epoch": 0.15880713489409143, + "grad_norm": 0.4674780070781708, + "learning_rate": 8e-05, + "loss": 1.8825, + "step": 
2849 + }, + { + "epoch": 0.1588628762541806, + "grad_norm": 0.4758144021034241, + "learning_rate": 8e-05, + "loss": 1.7723, + "step": 2850 + }, + { + "epoch": 0.15891861761426979, + "grad_norm": 0.4641381502151489, + "learning_rate": 8e-05, + "loss": 1.8323, + "step": 2851 + }, + { + "epoch": 0.15897435897435896, + "grad_norm": 0.4850490689277649, + "learning_rate": 8e-05, + "loss": 1.5911, + "step": 2852 + }, + { + "epoch": 0.15903010033444817, + "grad_norm": 0.48367488384246826, + "learning_rate": 8e-05, + "loss": 1.8259, + "step": 2853 + }, + { + "epoch": 0.15908584169453735, + "grad_norm": 0.49388548731803894, + "learning_rate": 8e-05, + "loss": 1.6125, + "step": 2854 + }, + { + "epoch": 0.15914158305462653, + "grad_norm": 0.6949816346168518, + "learning_rate": 8e-05, + "loss": 1.8107, + "step": 2855 + }, + { + "epoch": 0.1591973244147157, + "grad_norm": 0.4608065187931061, + "learning_rate": 8e-05, + "loss": 1.6638, + "step": 2856 + }, + { + "epoch": 0.15925306577480491, + "grad_norm": 0.5375053882598877, + "learning_rate": 8e-05, + "loss": 1.8655, + "step": 2857 + }, + { + "epoch": 0.1593088071348941, + "grad_norm": 0.4775127172470093, + "learning_rate": 8e-05, + "loss": 1.6442, + "step": 2858 + }, + { + "epoch": 0.15936454849498327, + "grad_norm": 0.5275262594223022, + "learning_rate": 8e-05, + "loss": 1.823, + "step": 2859 + }, + { + "epoch": 0.15942028985507245, + "grad_norm": 0.4744328558444977, + "learning_rate": 8e-05, + "loss": 1.7652, + "step": 2860 + }, + { + "epoch": 0.15947603121516166, + "grad_norm": 0.4801606237888336, + "learning_rate": 8e-05, + "loss": 1.8581, + "step": 2861 + }, + { + "epoch": 0.15953177257525084, + "grad_norm": 0.49865031242370605, + "learning_rate": 8e-05, + "loss": 1.5966, + "step": 2862 + }, + { + "epoch": 0.15958751393534001, + "grad_norm": 0.5179030895233154, + "learning_rate": 8e-05, + "loss": 1.917, + "step": 2863 + }, + { + "epoch": 0.15964325529542922, + "grad_norm": 0.45887720584869385, + "learning_rate": 8e-05, + 
"loss": 1.7588, + "step": 2864 + }, + { + "epoch": 0.1596989966555184, + "grad_norm": 0.4781455099582672, + "learning_rate": 8e-05, + "loss": 1.5995, + "step": 2865 + }, + { + "epoch": 0.15975473801560758, + "grad_norm": 0.5103084444999695, + "learning_rate": 8e-05, + "loss": 1.7247, + "step": 2866 + }, + { + "epoch": 0.15981047937569676, + "grad_norm": 0.4783124029636383, + "learning_rate": 8e-05, + "loss": 1.8298, + "step": 2867 + }, + { + "epoch": 0.15986622073578596, + "grad_norm": 0.46342748403549194, + "learning_rate": 8e-05, + "loss": 1.7312, + "step": 2868 + }, + { + "epoch": 0.15992196209587514, + "grad_norm": 0.496513694524765, + "learning_rate": 8e-05, + "loss": 1.4232, + "step": 2869 + }, + { + "epoch": 0.15997770345596432, + "grad_norm": 0.4710693955421448, + "learning_rate": 8e-05, + "loss": 1.6533, + "step": 2870 + }, + { + "epoch": 0.1600334448160535, + "grad_norm": 0.4609511196613312, + "learning_rate": 8e-05, + "loss": 1.5186, + "step": 2871 + }, + { + "epoch": 0.1600891861761427, + "grad_norm": 0.5193403959274292, + "learning_rate": 8e-05, + "loss": 1.7667, + "step": 2872 + }, + { + "epoch": 0.16014492753623188, + "grad_norm": 0.4479955732822418, + "learning_rate": 8e-05, + "loss": 1.6617, + "step": 2873 + }, + { + "epoch": 0.16020066889632106, + "grad_norm": 0.47863709926605225, + "learning_rate": 8e-05, + "loss": 1.6544, + "step": 2874 + }, + { + "epoch": 0.16025641025641027, + "grad_norm": 0.5270398259162903, + "learning_rate": 8e-05, + "loss": 1.9415, + "step": 2875 + }, + { + "epoch": 0.16031215161649945, + "grad_norm": 0.46953439712524414, + "learning_rate": 8e-05, + "loss": 1.8319, + "step": 2876 + }, + { + "epoch": 0.16036789297658863, + "grad_norm": 0.46271249651908875, + "learning_rate": 8e-05, + "loss": 1.6766, + "step": 2877 + }, + { + "epoch": 0.1604236343366778, + "grad_norm": 0.4576559066772461, + "learning_rate": 8e-05, + "loss": 1.857, + "step": 2878 + }, + { + "epoch": 0.160479375696767, + "grad_norm": 0.4797700047492981, + 
"learning_rate": 8e-05, + "loss": 1.6108, + "step": 2879 + }, + { + "epoch": 0.1605351170568562, + "grad_norm": 0.47959211468696594, + "learning_rate": 8e-05, + "loss": 1.9271, + "step": 2880 + }, + { + "epoch": 0.16059085841694537, + "grad_norm": 0.4791256785392761, + "learning_rate": 8e-05, + "loss": 1.9178, + "step": 2881 + }, + { + "epoch": 0.16064659977703455, + "grad_norm": 0.47225552797317505, + "learning_rate": 8e-05, + "loss": 1.598, + "step": 2882 + }, + { + "epoch": 0.16070234113712376, + "grad_norm": 0.4455716609954834, + "learning_rate": 8e-05, + "loss": 1.5567, + "step": 2883 + }, + { + "epoch": 0.16075808249721293, + "grad_norm": 0.48891139030456543, + "learning_rate": 8e-05, + "loss": 1.9012, + "step": 2884 + }, + { + "epoch": 0.1608138238573021, + "grad_norm": 0.5016549825668335, + "learning_rate": 8e-05, + "loss": 1.816, + "step": 2885 + }, + { + "epoch": 0.1608695652173913, + "grad_norm": 0.4449464976787567, + "learning_rate": 8e-05, + "loss": 1.5622, + "step": 2886 + }, + { + "epoch": 0.1609253065774805, + "grad_norm": 0.5120397210121155, + "learning_rate": 8e-05, + "loss": 1.5982, + "step": 2887 + }, + { + "epoch": 0.16098104793756968, + "grad_norm": 0.5218576788902283, + "learning_rate": 8e-05, + "loss": 1.7747, + "step": 2888 + }, + { + "epoch": 0.16103678929765886, + "grad_norm": 0.5918441414833069, + "learning_rate": 8e-05, + "loss": 2.2052, + "step": 2889 + }, + { + "epoch": 0.16109253065774806, + "grad_norm": 0.4566386044025421, + "learning_rate": 8e-05, + "loss": 1.701, + "step": 2890 + }, + { + "epoch": 0.16114827201783724, + "grad_norm": 0.45735636353492737, + "learning_rate": 8e-05, + "loss": 1.7771, + "step": 2891 + }, + { + "epoch": 0.16120401337792642, + "grad_norm": 0.45574647188186646, + "learning_rate": 8e-05, + "loss": 1.5037, + "step": 2892 + }, + { + "epoch": 0.1612597547380156, + "grad_norm": 0.4675073027610779, + "learning_rate": 8e-05, + "loss": 1.5138, + "step": 2893 + }, + { + "epoch": 0.1613154960981048, + "grad_norm": 
0.4805678427219391, + "learning_rate": 8e-05, + "loss": 1.7286, + "step": 2894 + }, + { + "epoch": 0.16137123745819398, + "grad_norm": 0.46076643466949463, + "learning_rate": 8e-05, + "loss": 1.7129, + "step": 2895 + }, + { + "epoch": 0.16142697881828316, + "grad_norm": 0.4653468132019043, + "learning_rate": 8e-05, + "loss": 1.7953, + "step": 2896 + }, + { + "epoch": 0.16148272017837234, + "grad_norm": 0.6508491635322571, + "learning_rate": 8e-05, + "loss": 1.808, + "step": 2897 + }, + { + "epoch": 0.16153846153846155, + "grad_norm": 0.4713476598262787, + "learning_rate": 8e-05, + "loss": 1.6684, + "step": 2898 + }, + { + "epoch": 0.16159420289855073, + "grad_norm": 0.511515200138092, + "learning_rate": 8e-05, + "loss": 1.8001, + "step": 2899 + }, + { + "epoch": 0.1616499442586399, + "grad_norm": 0.4759061932563782, + "learning_rate": 8e-05, + "loss": 1.6275, + "step": 2900 + }, + { + "epoch": 0.16170568561872908, + "grad_norm": 0.47947847843170166, + "learning_rate": 8e-05, + "loss": 1.9198, + "step": 2901 + }, + { + "epoch": 0.1617614269788183, + "grad_norm": 0.4760971665382385, + "learning_rate": 8e-05, + "loss": 1.7911, + "step": 2902 + }, + { + "epoch": 0.16181716833890747, + "grad_norm": 0.48154783248901367, + "learning_rate": 8e-05, + "loss": 1.6896, + "step": 2903 + }, + { + "epoch": 0.16187290969899665, + "grad_norm": 0.4559423625469208, + "learning_rate": 8e-05, + "loss": 1.6635, + "step": 2904 + }, + { + "epoch": 0.16192865105908585, + "grad_norm": 0.47385692596435547, + "learning_rate": 8e-05, + "loss": 1.7711, + "step": 2905 + }, + { + "epoch": 0.16198439241917503, + "grad_norm": 0.4514434337615967, + "learning_rate": 8e-05, + "loss": 1.7176, + "step": 2906 + }, + { + "epoch": 0.1620401337792642, + "grad_norm": 0.441662460565567, + "learning_rate": 8e-05, + "loss": 1.655, + "step": 2907 + }, + { + "epoch": 0.1620958751393534, + "grad_norm": 0.4862743020057678, + "learning_rate": 8e-05, + "loss": 1.9833, + "step": 2908 + }, + { + "epoch": 
0.1621516164994426, + "grad_norm": 0.4469553232192993, + "learning_rate": 8e-05, + "loss": 1.4799, + "step": 2909 + }, + { + "epoch": 0.16220735785953178, + "grad_norm": 0.4639607071876526, + "learning_rate": 8e-05, + "loss": 1.7082, + "step": 2910 + }, + { + "epoch": 0.16226309921962095, + "grad_norm": 0.5053742527961731, + "learning_rate": 8e-05, + "loss": 1.8838, + "step": 2911 + }, + { + "epoch": 0.16231884057971013, + "grad_norm": 0.4670344293117523, + "learning_rate": 8e-05, + "loss": 1.812, + "step": 2912 + }, + { + "epoch": 0.16237458193979934, + "grad_norm": 0.47207584977149963, + "learning_rate": 8e-05, + "loss": 1.6914, + "step": 2913 + }, + { + "epoch": 0.16243032329988852, + "grad_norm": 0.4806153476238251, + "learning_rate": 8e-05, + "loss": 1.7662, + "step": 2914 + }, + { + "epoch": 0.1624860646599777, + "grad_norm": 0.4803650379180908, + "learning_rate": 8e-05, + "loss": 1.7332, + "step": 2915 + }, + { + "epoch": 0.1625418060200669, + "grad_norm": 0.40704938769340515, + "learning_rate": 8e-05, + "loss": 1.3805, + "step": 2916 + }, + { + "epoch": 0.16259754738015608, + "grad_norm": 0.48974987864494324, + "learning_rate": 8e-05, + "loss": 1.7292, + "step": 2917 + }, + { + "epoch": 0.16265328874024526, + "grad_norm": 0.45540478825569153, + "learning_rate": 8e-05, + "loss": 1.6662, + "step": 2918 + }, + { + "epoch": 0.16270903010033444, + "grad_norm": 0.4781588912010193, + "learning_rate": 8e-05, + "loss": 1.6925, + "step": 2919 + }, + { + "epoch": 0.16276477146042365, + "grad_norm": 0.5591298341751099, + "learning_rate": 8e-05, + "loss": 1.3078, + "step": 2920 + }, + { + "epoch": 0.16282051282051282, + "grad_norm": 0.49242156744003296, + "learning_rate": 8e-05, + "loss": 1.6533, + "step": 2921 + }, + { + "epoch": 0.162876254180602, + "grad_norm": 0.5097176432609558, + "learning_rate": 8e-05, + "loss": 1.8765, + "step": 2922 + }, + { + "epoch": 0.16293199554069118, + "grad_norm": 0.5155987739562988, + "learning_rate": 8e-05, + "loss": 1.743, + "step": 
2923 + }, + { + "epoch": 0.1629877369007804, + "grad_norm": 0.5098856687545776, + "learning_rate": 8e-05, + "loss": 1.7513, + "step": 2924 + }, + { + "epoch": 0.16304347826086957, + "grad_norm": 0.5157544612884521, + "learning_rate": 8e-05, + "loss": 1.8728, + "step": 2925 + }, + { + "epoch": 0.16309921962095875, + "grad_norm": 0.4856857359409332, + "learning_rate": 8e-05, + "loss": 1.4589, + "step": 2926 + }, + { + "epoch": 0.16315496098104793, + "grad_norm": 0.4530739188194275, + "learning_rate": 8e-05, + "loss": 1.7018, + "step": 2927 + }, + { + "epoch": 0.16321070234113713, + "grad_norm": 0.4573028087615967, + "learning_rate": 8e-05, + "loss": 1.6762, + "step": 2928 + }, + { + "epoch": 0.1632664437012263, + "grad_norm": 0.4893021583557129, + "learning_rate": 8e-05, + "loss": 1.8043, + "step": 2929 + }, + { + "epoch": 0.1633221850613155, + "grad_norm": 0.557016909122467, + "learning_rate": 8e-05, + "loss": 2.0862, + "step": 2930 + }, + { + "epoch": 0.1633779264214047, + "grad_norm": 0.5262312889099121, + "learning_rate": 8e-05, + "loss": 1.9518, + "step": 2931 + }, + { + "epoch": 0.16343366778149387, + "grad_norm": 0.422815203666687, + "learning_rate": 8e-05, + "loss": 1.5857, + "step": 2932 + }, + { + "epoch": 0.16348940914158305, + "grad_norm": 0.4872015416622162, + "learning_rate": 8e-05, + "loss": 1.8183, + "step": 2933 + }, + { + "epoch": 0.16354515050167223, + "grad_norm": 0.45688596367836, + "learning_rate": 8e-05, + "loss": 1.6444, + "step": 2934 + }, + { + "epoch": 0.16360089186176144, + "grad_norm": 0.4974451959133148, + "learning_rate": 8e-05, + "loss": 1.8459, + "step": 2935 + }, + { + "epoch": 0.16365663322185062, + "grad_norm": 0.49110689759254456, + "learning_rate": 8e-05, + "loss": 1.9244, + "step": 2936 + }, + { + "epoch": 0.1637123745819398, + "grad_norm": 0.49117761850357056, + "learning_rate": 8e-05, + "loss": 1.6758, + "step": 2937 + }, + { + "epoch": 0.16376811594202897, + "grad_norm": 0.4526088833808899, + "learning_rate": 8e-05, + "loss": 
1.7698, + "step": 2938 + }, + { + "epoch": 0.16382385730211818, + "grad_norm": 0.49112263321876526, + "learning_rate": 8e-05, + "loss": 1.9145, + "step": 2939 + }, + { + "epoch": 0.16387959866220736, + "grad_norm": 0.5060984492301941, + "learning_rate": 8e-05, + "loss": 1.734, + "step": 2940 + }, + { + "epoch": 0.16393534002229654, + "grad_norm": 0.45045602321624756, + "learning_rate": 8e-05, + "loss": 1.6977, + "step": 2941 + }, + { + "epoch": 0.16399108138238572, + "grad_norm": 0.4900680482387543, + "learning_rate": 8e-05, + "loss": 1.7354, + "step": 2942 + }, + { + "epoch": 0.16404682274247492, + "grad_norm": 0.4850306808948517, + "learning_rate": 8e-05, + "loss": 1.8396, + "step": 2943 + }, + { + "epoch": 0.1641025641025641, + "grad_norm": 0.4529063403606415, + "learning_rate": 8e-05, + "loss": 1.744, + "step": 2944 + }, + { + "epoch": 0.16415830546265328, + "grad_norm": 0.4494536817073822, + "learning_rate": 8e-05, + "loss": 1.8157, + "step": 2945 + }, + { + "epoch": 0.1642140468227425, + "grad_norm": 0.4689137935638428, + "learning_rate": 8e-05, + "loss": 1.6594, + "step": 2946 + }, + { + "epoch": 0.16426978818283167, + "grad_norm": 0.480871319770813, + "learning_rate": 8e-05, + "loss": 1.8693, + "step": 2947 + }, + { + "epoch": 0.16432552954292085, + "grad_norm": 0.49745580554008484, + "learning_rate": 8e-05, + "loss": 1.8757, + "step": 2948 + }, + { + "epoch": 0.16438127090301002, + "grad_norm": 0.49291250109672546, + "learning_rate": 8e-05, + "loss": 1.7641, + "step": 2949 + }, + { + "epoch": 0.16443701226309923, + "grad_norm": 0.47803646326065063, + "learning_rate": 8e-05, + "loss": 1.7651, + "step": 2950 + }, + { + "epoch": 0.1644927536231884, + "grad_norm": 0.44731417298316956, + "learning_rate": 8e-05, + "loss": 1.6245, + "step": 2951 + }, + { + "epoch": 0.1645484949832776, + "grad_norm": 0.46639564633369446, + "learning_rate": 8e-05, + "loss": 1.6166, + "step": 2952 + }, + { + "epoch": 0.16460423634336677, + "grad_norm": 0.460929274559021, + 
"learning_rate": 8e-05, + "loss": 1.9163, + "step": 2953 + }, + { + "epoch": 0.16465997770345597, + "grad_norm": 0.44395554065704346, + "learning_rate": 8e-05, + "loss": 1.6165, + "step": 2954 + }, + { + "epoch": 0.16471571906354515, + "grad_norm": 0.5062477588653564, + "learning_rate": 8e-05, + "loss": 1.8948, + "step": 2955 + }, + { + "epoch": 0.16477146042363433, + "grad_norm": 0.5431268215179443, + "learning_rate": 8e-05, + "loss": 1.5934, + "step": 2956 + }, + { + "epoch": 0.1648272017837235, + "grad_norm": 0.4659595489501953, + "learning_rate": 8e-05, + "loss": 1.7985, + "step": 2957 + }, + { + "epoch": 0.16488294314381272, + "grad_norm": 0.5105186104774475, + "learning_rate": 8e-05, + "loss": 1.7925, + "step": 2958 + }, + { + "epoch": 0.1649386845039019, + "grad_norm": 0.5099925994873047, + "learning_rate": 8e-05, + "loss": 1.8497, + "step": 2959 + }, + { + "epoch": 0.16499442586399107, + "grad_norm": 0.5125576853752136, + "learning_rate": 8e-05, + "loss": 2.023, + "step": 2960 + }, + { + "epoch": 0.16505016722408028, + "grad_norm": 0.47445541620254517, + "learning_rate": 8e-05, + "loss": 1.7916, + "step": 2961 + }, + { + "epoch": 0.16510590858416946, + "grad_norm": 0.5191357135772705, + "learning_rate": 8e-05, + "loss": 1.8498, + "step": 2962 + }, + { + "epoch": 0.16516164994425864, + "grad_norm": 0.48971402645111084, + "learning_rate": 8e-05, + "loss": 1.6435, + "step": 2963 + }, + { + "epoch": 0.16521739130434782, + "grad_norm": 0.5164715051651001, + "learning_rate": 8e-05, + "loss": 1.7424, + "step": 2964 + }, + { + "epoch": 0.16527313266443702, + "grad_norm": 0.5232493281364441, + "learning_rate": 8e-05, + "loss": 2.0409, + "step": 2965 + }, + { + "epoch": 0.1653288740245262, + "grad_norm": 0.46612119674682617, + "learning_rate": 8e-05, + "loss": 1.7034, + "step": 2966 + }, + { + "epoch": 0.16538461538461538, + "grad_norm": 0.45249107480049133, + "learning_rate": 8e-05, + "loss": 1.8216, + "step": 2967 + }, + { + "epoch": 0.16544035674470456, + 
"grad_norm": 0.5156910419464111, + "learning_rate": 8e-05, + "loss": 1.7517, + "step": 2968 + }, + { + "epoch": 0.16549609810479377, + "grad_norm": 0.4460727274417877, + "learning_rate": 8e-05, + "loss": 1.6204, + "step": 2969 + }, + { + "epoch": 0.16555183946488294, + "grad_norm": 0.47430551052093506, + "learning_rate": 8e-05, + "loss": 1.6935, + "step": 2970 + }, + { + "epoch": 0.16560758082497212, + "grad_norm": 0.5011522173881531, + "learning_rate": 8e-05, + "loss": 2.0306, + "step": 2971 + }, + { + "epoch": 0.16566332218506133, + "grad_norm": 0.47904592752456665, + "learning_rate": 8e-05, + "loss": 1.7152, + "step": 2972 + }, + { + "epoch": 0.1657190635451505, + "grad_norm": 0.5341163277626038, + "learning_rate": 8e-05, + "loss": 1.8604, + "step": 2973 + }, + { + "epoch": 0.1657748049052397, + "grad_norm": 0.46354594826698303, + "learning_rate": 8e-05, + "loss": 1.7609, + "step": 2974 + }, + { + "epoch": 0.16583054626532887, + "grad_norm": 0.4791519045829773, + "learning_rate": 8e-05, + "loss": 1.6751, + "step": 2975 + }, + { + "epoch": 0.16588628762541807, + "grad_norm": 0.470883309841156, + "learning_rate": 8e-05, + "loss": 1.7161, + "step": 2976 + }, + { + "epoch": 0.16594202898550725, + "grad_norm": 0.5448764562606812, + "learning_rate": 8e-05, + "loss": 1.8716, + "step": 2977 + }, + { + "epoch": 0.16599777034559643, + "grad_norm": 0.44938236474990845, + "learning_rate": 8e-05, + "loss": 1.5895, + "step": 2978 + }, + { + "epoch": 0.1660535117056856, + "grad_norm": 0.5080894231796265, + "learning_rate": 8e-05, + "loss": 1.9543, + "step": 2979 + }, + { + "epoch": 0.16610925306577481, + "grad_norm": 0.5100005865097046, + "learning_rate": 8e-05, + "loss": 2.0521, + "step": 2980 + }, + { + "epoch": 0.166164994425864, + "grad_norm": 0.49559685587882996, + "learning_rate": 8e-05, + "loss": 1.7063, + "step": 2981 + }, + { + "epoch": 0.16622073578595317, + "grad_norm": 0.4607832431793213, + "learning_rate": 8e-05, + "loss": 1.6359, + "step": 2982 + }, + { + 
"epoch": 0.16627647714604235, + "grad_norm": 0.4844558835029602, + "learning_rate": 8e-05, + "loss": 1.5046, + "step": 2983 + }, + { + "epoch": 0.16633221850613156, + "grad_norm": 0.4657902717590332, + "learning_rate": 8e-05, + "loss": 1.7483, + "step": 2984 + }, + { + "epoch": 0.16638795986622074, + "grad_norm": 0.47202810645103455, + "learning_rate": 8e-05, + "loss": 1.5051, + "step": 2985 + }, + { + "epoch": 0.16644370122630991, + "grad_norm": 0.45511099696159363, + "learning_rate": 8e-05, + "loss": 1.8077, + "step": 2986 + }, + { + "epoch": 0.16649944258639912, + "grad_norm": 0.462038516998291, + "learning_rate": 8e-05, + "loss": 1.6827, + "step": 2987 + }, + { + "epoch": 0.1665551839464883, + "grad_norm": 0.4840214252471924, + "learning_rate": 8e-05, + "loss": 1.5835, + "step": 2988 + }, + { + "epoch": 0.16661092530657748, + "grad_norm": 0.5105730295181274, + "learning_rate": 8e-05, + "loss": 1.9062, + "step": 2989 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.4960428476333618, + "learning_rate": 8e-05, + "loss": 1.8725, + "step": 2990 + }, + { + "epoch": 0.16672240802675586, + "grad_norm": 0.479893296957016, + "learning_rate": 8e-05, + "loss": 1.5747, + "step": 2991 + }, + { + "epoch": 0.16677814938684504, + "grad_norm": 0.4770605266094208, + "learning_rate": 8e-05, + "loss": 1.7569, + "step": 2992 + }, + { + "epoch": 0.16683389074693422, + "grad_norm": 0.44722259044647217, + "learning_rate": 8e-05, + "loss": 1.6787, + "step": 2993 + }, + { + "epoch": 0.1668896321070234, + "grad_norm": 0.5155320763587952, + "learning_rate": 8e-05, + "loss": 1.6401, + "step": 2994 + }, + { + "epoch": 0.1669453734671126, + "grad_norm": 0.5102251172065735, + "learning_rate": 8e-05, + "loss": 1.7707, + "step": 2995 + }, + { + "epoch": 0.16700111482720179, + "grad_norm": 0.5326622724533081, + "learning_rate": 8e-05, + "loss": 1.9579, + "step": 2996 + }, + { + "epoch": 0.16705685618729096, + "grad_norm": 0.4790048897266388, + "learning_rate": 8e-05, + "loss": 1.5006, + 
"step": 2997 + }, + { + "epoch": 0.16711259754738014, + "grad_norm": 0.4893651604652405, + "learning_rate": 8e-05, + "loss": 1.7729, + "step": 2998 + }, + { + "epoch": 0.16716833890746935, + "grad_norm": 0.50043785572052, + "learning_rate": 8e-05, + "loss": 1.7786, + "step": 2999 + }, + { + "epoch": 0.16722408026755853, + "grad_norm": 0.48809880018234253, + "learning_rate": 8e-05, + "loss": 1.8332, + "step": 3000 + }, + { + "epoch": 0.1672798216276477, + "grad_norm": 0.4811587333679199, + "learning_rate": 8e-05, + "loss": 1.6124, + "step": 3001 + }, + { + "epoch": 0.1673355629877369, + "grad_norm": 0.5150866508483887, + "learning_rate": 8e-05, + "loss": 1.6237, + "step": 3002 + }, + { + "epoch": 0.1673913043478261, + "grad_norm": 0.5085856914520264, + "learning_rate": 8e-05, + "loss": 1.8273, + "step": 3003 + }, + { + "epoch": 0.16744704570791527, + "grad_norm": 0.5587477684020996, + "learning_rate": 8e-05, + "loss": 1.8274, + "step": 3004 + }, + { + "epoch": 0.16750278706800445, + "grad_norm": 0.4674791693687439, + "learning_rate": 8e-05, + "loss": 1.74, + "step": 3005 + }, + { + "epoch": 0.16755852842809366, + "grad_norm": 0.4883214235305786, + "learning_rate": 8e-05, + "loss": 1.6489, + "step": 3006 + }, + { + "epoch": 0.16761426978818283, + "grad_norm": 0.5089623928070068, + "learning_rate": 8e-05, + "loss": 1.7817, + "step": 3007 + }, + { + "epoch": 0.167670011148272, + "grad_norm": 0.5148787498474121, + "learning_rate": 8e-05, + "loss": 1.6104, + "step": 3008 + }, + { + "epoch": 0.1677257525083612, + "grad_norm": 0.4540792405605316, + "learning_rate": 8e-05, + "loss": 1.6607, + "step": 3009 + }, + { + "epoch": 0.1677814938684504, + "grad_norm": 0.4619336724281311, + "learning_rate": 8e-05, + "loss": 1.6425, + "step": 3010 + }, + { + "epoch": 0.16783723522853958, + "grad_norm": 0.44761553406715393, + "learning_rate": 8e-05, + "loss": 1.5614, + "step": 3011 + }, + { + "epoch": 0.16789297658862876, + "grad_norm": 0.471306174993515, + "learning_rate": 8e-05, + 
"loss": 1.5586, + "step": 3012 + }, + { + "epoch": 0.16794871794871793, + "grad_norm": 0.5210175514221191, + "learning_rate": 8e-05, + "loss": 1.8424, + "step": 3013 + }, + { + "epoch": 0.16800445930880714, + "grad_norm": 0.4540490508079529, + "learning_rate": 8e-05, + "loss": 1.5963, + "step": 3014 + }, + { + "epoch": 0.16806020066889632, + "grad_norm": 0.47507572174072266, + "learning_rate": 8e-05, + "loss": 1.632, + "step": 3015 + }, + { + "epoch": 0.1681159420289855, + "grad_norm": 0.47896212339401245, + "learning_rate": 8e-05, + "loss": 1.7958, + "step": 3016 + }, + { + "epoch": 0.1681716833890747, + "grad_norm": 0.48261168599128723, + "learning_rate": 8e-05, + "loss": 1.5484, + "step": 3017 + }, + { + "epoch": 0.16822742474916388, + "grad_norm": 0.47131311893463135, + "learning_rate": 8e-05, + "loss": 1.6909, + "step": 3018 + }, + { + "epoch": 0.16828316610925306, + "grad_norm": 0.47304922342300415, + "learning_rate": 8e-05, + "loss": 1.7071, + "step": 3019 + }, + { + "epoch": 0.16833890746934224, + "grad_norm": 0.4762814939022064, + "learning_rate": 8e-05, + "loss": 1.7035, + "step": 3020 + }, + { + "epoch": 0.16839464882943145, + "grad_norm": 0.5088526606559753, + "learning_rate": 8e-05, + "loss": 1.8491, + "step": 3021 + }, + { + "epoch": 0.16845039018952063, + "grad_norm": 0.48544204235076904, + "learning_rate": 8e-05, + "loss": 1.8434, + "step": 3022 + }, + { + "epoch": 0.1685061315496098, + "grad_norm": 0.49382224678993225, + "learning_rate": 8e-05, + "loss": 1.6825, + "step": 3023 + }, + { + "epoch": 0.16856187290969898, + "grad_norm": 0.4782942235469818, + "learning_rate": 8e-05, + "loss": 1.7259, + "step": 3024 + }, + { + "epoch": 0.1686176142697882, + "grad_norm": 0.555604875087738, + "learning_rate": 8e-05, + "loss": 1.9323, + "step": 3025 + }, + { + "epoch": 0.16867335562987737, + "grad_norm": 0.5326486825942993, + "learning_rate": 8e-05, + "loss": 1.612, + "step": 3026 + }, + { + "epoch": 0.16872909698996655, + "grad_norm": 0.5532233715057373, + 
"learning_rate": 8e-05, + "loss": 1.9169, + "step": 3027 + }, + { + "epoch": 0.16878483835005575, + "grad_norm": 0.8552114367485046, + "learning_rate": 8e-05, + "loss": 1.8408, + "step": 3028 + }, + { + "epoch": 0.16884057971014493, + "grad_norm": 0.544897198677063, + "learning_rate": 8e-05, + "loss": 2.0083, + "step": 3029 + }, + { + "epoch": 0.1688963210702341, + "grad_norm": 0.4543430805206299, + "learning_rate": 8e-05, + "loss": 1.5772, + "step": 3030 + }, + { + "epoch": 0.1689520624303233, + "grad_norm": 0.49418988823890686, + "learning_rate": 8e-05, + "loss": 1.7765, + "step": 3031 + }, + { + "epoch": 0.1690078037904125, + "grad_norm": 0.4817911386489868, + "learning_rate": 8e-05, + "loss": 1.6769, + "step": 3032 + }, + { + "epoch": 0.16906354515050168, + "grad_norm": 0.5358490943908691, + "learning_rate": 8e-05, + "loss": 1.6488, + "step": 3033 + }, + { + "epoch": 0.16911928651059085, + "grad_norm": 0.4769594669342041, + "learning_rate": 8e-05, + "loss": 1.7918, + "step": 3034 + }, + { + "epoch": 0.16917502787068003, + "grad_norm": 0.48201489448547363, + "learning_rate": 8e-05, + "loss": 1.8778, + "step": 3035 + }, + { + "epoch": 0.16923076923076924, + "grad_norm": 0.470387727022171, + "learning_rate": 8e-05, + "loss": 1.7893, + "step": 3036 + }, + { + "epoch": 0.16928651059085842, + "grad_norm": 0.49865806102752686, + "learning_rate": 8e-05, + "loss": 1.739, + "step": 3037 + }, + { + "epoch": 0.1693422519509476, + "grad_norm": 0.5022916793823242, + "learning_rate": 8e-05, + "loss": 1.6438, + "step": 3038 + }, + { + "epoch": 0.16939799331103678, + "grad_norm": 0.4783816337585449, + "learning_rate": 8e-05, + "loss": 1.6419, + "step": 3039 + }, + { + "epoch": 0.16945373467112598, + "grad_norm": 0.4603171646595001, + "learning_rate": 8e-05, + "loss": 1.5285, + "step": 3040 + }, + { + "epoch": 0.16950947603121516, + "grad_norm": 0.5255218148231506, + "learning_rate": 8e-05, + "loss": 2.0596, + "step": 3041 + }, + { + "epoch": 0.16956521739130434, + "grad_norm": 
0.4812007546424866, + "learning_rate": 8e-05, + "loss": 1.892, + "step": 3042 + }, + { + "epoch": 0.16962095875139355, + "grad_norm": 0.47994357347488403, + "learning_rate": 8e-05, + "loss": 1.8251, + "step": 3043 + }, + { + "epoch": 0.16967670011148273, + "grad_norm": 0.43760350346565247, + "learning_rate": 8e-05, + "loss": 1.4635, + "step": 3044 + }, + { + "epoch": 0.1697324414715719, + "grad_norm": 0.465116411447525, + "learning_rate": 8e-05, + "loss": 1.8058, + "step": 3045 + }, + { + "epoch": 0.16978818283166108, + "grad_norm": 0.4570809006690979, + "learning_rate": 8e-05, + "loss": 1.5542, + "step": 3046 + }, + { + "epoch": 0.1698439241917503, + "grad_norm": 0.4452064335346222, + "learning_rate": 8e-05, + "loss": 1.6527, + "step": 3047 + }, + { + "epoch": 0.16989966555183947, + "grad_norm": 0.4564603269100189, + "learning_rate": 8e-05, + "loss": 1.5997, + "step": 3048 + }, + { + "epoch": 0.16995540691192865, + "grad_norm": 0.4724256694316864, + "learning_rate": 8e-05, + "loss": 1.6461, + "step": 3049 + }, + { + "epoch": 0.17001114827201783, + "grad_norm": 0.4666452705860138, + "learning_rate": 8e-05, + "loss": 1.8152, + "step": 3050 + }, + { + "epoch": 0.17006688963210703, + "grad_norm": 0.46496039628982544, + "learning_rate": 8e-05, + "loss": 1.6681, + "step": 3051 + }, + { + "epoch": 0.1701226309921962, + "grad_norm": 0.4887557923793793, + "learning_rate": 8e-05, + "loss": 1.6887, + "step": 3052 + }, + { + "epoch": 0.1701783723522854, + "grad_norm": 0.4998195767402649, + "learning_rate": 8e-05, + "loss": 1.8402, + "step": 3053 + }, + { + "epoch": 0.17023411371237457, + "grad_norm": 0.5062663555145264, + "learning_rate": 8e-05, + "loss": 1.8232, + "step": 3054 + }, + { + "epoch": 0.17028985507246377, + "grad_norm": 0.5044751167297363, + "learning_rate": 8e-05, + "loss": 1.506, + "step": 3055 + }, + { + "epoch": 0.17034559643255295, + "grad_norm": 0.49747434258461, + "learning_rate": 8e-05, + "loss": 1.7674, + "step": 3056 + }, + { + "epoch": 
0.17040133779264213, + "grad_norm": 0.463445246219635, + "learning_rate": 8e-05, + "loss": 1.5947, + "step": 3057 + }, + { + "epoch": 0.17045707915273134, + "grad_norm": 0.46119794249534607, + "learning_rate": 8e-05, + "loss": 1.6294, + "step": 3058 + }, + { + "epoch": 0.17051282051282052, + "grad_norm": 0.44738781452178955, + "learning_rate": 8e-05, + "loss": 1.7148, + "step": 3059 + }, + { + "epoch": 0.1705685618729097, + "grad_norm": 0.47132405638694763, + "learning_rate": 8e-05, + "loss": 1.7297, + "step": 3060 + }, + { + "epoch": 0.17062430323299888, + "grad_norm": 0.523993968963623, + "learning_rate": 8e-05, + "loss": 1.8783, + "step": 3061 + }, + { + "epoch": 0.17068004459308808, + "grad_norm": 0.5431890487670898, + "learning_rate": 8e-05, + "loss": 1.8269, + "step": 3062 + }, + { + "epoch": 0.17073578595317726, + "grad_norm": 0.4824149012565613, + "learning_rate": 8e-05, + "loss": 1.4909, + "step": 3063 + }, + { + "epoch": 0.17079152731326644, + "grad_norm": 0.47434085607528687, + "learning_rate": 8e-05, + "loss": 1.7974, + "step": 3064 + }, + { + "epoch": 0.17084726867335562, + "grad_norm": 0.5017184019088745, + "learning_rate": 8e-05, + "loss": 1.6937, + "step": 3065 + }, + { + "epoch": 0.17090301003344482, + "grad_norm": 0.4829297363758087, + "learning_rate": 8e-05, + "loss": 1.656, + "step": 3066 + }, + { + "epoch": 0.170958751393534, + "grad_norm": 0.4362059235572815, + "learning_rate": 8e-05, + "loss": 1.3688, + "step": 3067 + }, + { + "epoch": 0.17101449275362318, + "grad_norm": 0.4372815489768982, + "learning_rate": 8e-05, + "loss": 1.4497, + "step": 3068 + }, + { + "epoch": 0.1710702341137124, + "grad_norm": 0.48382410407066345, + "learning_rate": 8e-05, + "loss": 1.8213, + "step": 3069 + }, + { + "epoch": 0.17112597547380157, + "grad_norm": 0.49431711435317993, + "learning_rate": 8e-05, + "loss": 1.7322, + "step": 3070 + }, + { + "epoch": 0.17118171683389075, + "grad_norm": 0.5378329157829285, + "learning_rate": 8e-05, + "loss": 1.7495, + "step": 
3071 + }, + { + "epoch": 0.17123745819397992, + "grad_norm": 0.4819210469722748, + "learning_rate": 8e-05, + "loss": 1.4963, + "step": 3072 + }, + { + "epoch": 0.17129319955406913, + "grad_norm": 0.4515661597251892, + "learning_rate": 8e-05, + "loss": 1.6581, + "step": 3073 + }, + { + "epoch": 0.1713489409141583, + "grad_norm": 0.40329015254974365, + "learning_rate": 8e-05, + "loss": 1.5052, + "step": 3074 + }, + { + "epoch": 0.1714046822742475, + "grad_norm": 0.48003634810447693, + "learning_rate": 8e-05, + "loss": 1.8071, + "step": 3075 + }, + { + "epoch": 0.17146042363433667, + "grad_norm": 0.4498198628425598, + "learning_rate": 8e-05, + "loss": 1.6493, + "step": 3076 + }, + { + "epoch": 0.17151616499442587, + "grad_norm": 0.4563552737236023, + "learning_rate": 8e-05, + "loss": 1.5818, + "step": 3077 + }, + { + "epoch": 0.17157190635451505, + "grad_norm": 0.5010289549827576, + "learning_rate": 8e-05, + "loss": 1.8007, + "step": 3078 + }, + { + "epoch": 0.17162764771460423, + "grad_norm": 0.4739150106906891, + "learning_rate": 8e-05, + "loss": 1.6223, + "step": 3079 + }, + { + "epoch": 0.1716833890746934, + "grad_norm": 0.48382750153541565, + "learning_rate": 8e-05, + "loss": 1.635, + "step": 3080 + }, + { + "epoch": 0.17173913043478262, + "grad_norm": 0.49976804852485657, + "learning_rate": 8e-05, + "loss": 1.6488, + "step": 3081 + }, + { + "epoch": 0.1717948717948718, + "grad_norm": 0.4661661386489868, + "learning_rate": 8e-05, + "loss": 1.6358, + "step": 3082 + }, + { + "epoch": 0.17185061315496097, + "grad_norm": 0.4675019085407257, + "learning_rate": 8e-05, + "loss": 1.7324, + "step": 3083 + }, + { + "epoch": 0.17190635451505018, + "grad_norm": 0.497050940990448, + "learning_rate": 8e-05, + "loss": 1.8559, + "step": 3084 + }, + { + "epoch": 0.17196209587513936, + "grad_norm": 0.4740678071975708, + "learning_rate": 8e-05, + "loss": 1.7249, + "step": 3085 + }, + { + "epoch": 0.17201783723522854, + "grad_norm": 0.5225610136985779, + "learning_rate": 8e-05, + 
"loss": 1.8596, + "step": 3086 + }, + { + "epoch": 0.17207357859531772, + "grad_norm": 0.4995535612106323, + "learning_rate": 8e-05, + "loss": 1.7586, + "step": 3087 + }, + { + "epoch": 0.17212931995540692, + "grad_norm": 0.5729968547821045, + "learning_rate": 8e-05, + "loss": 1.6392, + "step": 3088 + }, + { + "epoch": 0.1721850613154961, + "grad_norm": 0.4812772870063782, + "learning_rate": 8e-05, + "loss": 1.8687, + "step": 3089 + }, + { + "epoch": 0.17224080267558528, + "grad_norm": 0.4795670211315155, + "learning_rate": 8e-05, + "loss": 1.7171, + "step": 3090 + }, + { + "epoch": 0.17229654403567446, + "grad_norm": 0.49664998054504395, + "learning_rate": 8e-05, + "loss": 1.553, + "step": 3091 + }, + { + "epoch": 0.17235228539576367, + "grad_norm": 0.4796065092086792, + "learning_rate": 8e-05, + "loss": 1.9998, + "step": 3092 + }, + { + "epoch": 0.17240802675585284, + "grad_norm": 0.5262255072593689, + "learning_rate": 8e-05, + "loss": 1.8668, + "step": 3093 + }, + { + "epoch": 0.17246376811594202, + "grad_norm": 0.46140360832214355, + "learning_rate": 8e-05, + "loss": 1.5494, + "step": 3094 + }, + { + "epoch": 0.1725195094760312, + "grad_norm": 0.4980575144290924, + "learning_rate": 8e-05, + "loss": 1.7239, + "step": 3095 + }, + { + "epoch": 0.1725752508361204, + "grad_norm": 0.49190661311149597, + "learning_rate": 8e-05, + "loss": 1.7526, + "step": 3096 + }, + { + "epoch": 0.1726309921962096, + "grad_norm": 0.4727287292480469, + "learning_rate": 8e-05, + "loss": 1.9271, + "step": 3097 + }, + { + "epoch": 0.17268673355629877, + "grad_norm": 0.4625808894634247, + "learning_rate": 8e-05, + "loss": 1.6226, + "step": 3098 + }, + { + "epoch": 0.17274247491638797, + "grad_norm": 0.534032940864563, + "learning_rate": 8e-05, + "loss": 1.8686, + "step": 3099 + }, + { + "epoch": 0.17279821627647715, + "grad_norm": 0.47960618138313293, + "learning_rate": 8e-05, + "loss": 1.6062, + "step": 3100 + }, + { + "epoch": 0.17285395763656633, + "grad_norm": 0.47547629475593567, + 
"learning_rate": 8e-05, + "loss": 1.7415, + "step": 3101 + }, + { + "epoch": 0.1729096989966555, + "grad_norm": 0.5057584643363953, + "learning_rate": 8e-05, + "loss": 1.8651, + "step": 3102 + }, + { + "epoch": 0.17296544035674472, + "grad_norm": 0.4372791647911072, + "learning_rate": 8e-05, + "loss": 1.6305, + "step": 3103 + }, + { + "epoch": 0.1730211817168339, + "grad_norm": 0.4549502432346344, + "learning_rate": 8e-05, + "loss": 1.5812, + "step": 3104 + }, + { + "epoch": 0.17307692307692307, + "grad_norm": 0.5240883827209473, + "learning_rate": 8e-05, + "loss": 1.8658, + "step": 3105 + }, + { + "epoch": 0.17313266443701225, + "grad_norm": 0.5285409092903137, + "learning_rate": 8e-05, + "loss": 1.7676, + "step": 3106 + }, + { + "epoch": 0.17318840579710146, + "grad_norm": 0.5472353100776672, + "learning_rate": 8e-05, + "loss": 1.8128, + "step": 3107 + }, + { + "epoch": 0.17324414715719064, + "grad_norm": 0.48296090960502625, + "learning_rate": 8e-05, + "loss": 1.644, + "step": 3108 + }, + { + "epoch": 0.17329988851727982, + "grad_norm": 0.5111985802650452, + "learning_rate": 8e-05, + "loss": 1.7872, + "step": 3109 + }, + { + "epoch": 0.173355629877369, + "grad_norm": 0.5271138548851013, + "learning_rate": 8e-05, + "loss": 1.9658, + "step": 3110 + }, + { + "epoch": 0.1734113712374582, + "grad_norm": 0.4623909294605255, + "learning_rate": 8e-05, + "loss": 1.7587, + "step": 3111 + }, + { + "epoch": 0.17346711259754738, + "grad_norm": 0.5246699452400208, + "learning_rate": 8e-05, + "loss": 1.8177, + "step": 3112 + }, + { + "epoch": 0.17352285395763656, + "grad_norm": 0.4383607506752014, + "learning_rate": 8e-05, + "loss": 1.6104, + "step": 3113 + }, + { + "epoch": 0.17357859531772576, + "grad_norm": 0.5097667574882507, + "learning_rate": 8e-05, + "loss": 1.5114, + "step": 3114 + }, + { + "epoch": 0.17363433667781494, + "grad_norm": 0.4617764353752136, + "learning_rate": 8e-05, + "loss": 1.6502, + "step": 3115 + }, + { + "epoch": 0.17369007803790412, + "grad_norm": 
0.501473605632782, + "learning_rate": 8e-05, + "loss": 1.9709, + "step": 3116 + }, + { + "epoch": 0.1737458193979933, + "grad_norm": 0.4450080990791321, + "learning_rate": 8e-05, + "loss": 1.6331, + "step": 3117 + }, + { + "epoch": 0.1738015607580825, + "grad_norm": 0.4507881999015808, + "learning_rate": 8e-05, + "loss": 1.6415, + "step": 3118 + }, + { + "epoch": 0.17385730211817169, + "grad_norm": 0.48858508467674255, + "learning_rate": 8e-05, + "loss": 1.7731, + "step": 3119 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 0.46448734402656555, + "learning_rate": 8e-05, + "loss": 1.5604, + "step": 3120 + }, + { + "epoch": 0.17396878483835004, + "grad_norm": 0.4925326108932495, + "learning_rate": 8e-05, + "loss": 1.7302, + "step": 3121 + }, + { + "epoch": 0.17402452619843925, + "grad_norm": 0.45742926001548767, + "learning_rate": 8e-05, + "loss": 1.6341, + "step": 3122 + }, + { + "epoch": 0.17408026755852843, + "grad_norm": 0.4373356103897095, + "learning_rate": 8e-05, + "loss": 1.5657, + "step": 3123 + }, + { + "epoch": 0.1741360089186176, + "grad_norm": 0.4854992628097534, + "learning_rate": 8e-05, + "loss": 1.5725, + "step": 3124 + }, + { + "epoch": 0.17419175027870681, + "grad_norm": 0.5004236102104187, + "learning_rate": 8e-05, + "loss": 1.7323, + "step": 3125 + }, + { + "epoch": 0.174247491638796, + "grad_norm": 0.47592586278915405, + "learning_rate": 8e-05, + "loss": 1.7518, + "step": 3126 + }, + { + "epoch": 0.17430323299888517, + "grad_norm": 0.4757702350616455, + "learning_rate": 8e-05, + "loss": 1.6715, + "step": 3127 + }, + { + "epoch": 0.17435897435897435, + "grad_norm": 0.5010035634040833, + "learning_rate": 8e-05, + "loss": 1.7594, + "step": 3128 + }, + { + "epoch": 0.17441471571906356, + "grad_norm": 0.48768213391304016, + "learning_rate": 8e-05, + "loss": 1.7032, + "step": 3129 + }, + { + "epoch": 0.17447045707915274, + "grad_norm": 0.4932835102081299, + "learning_rate": 8e-05, + "loss": 1.8724, + "step": 3130 + }, + { + "epoch": 
0.17452619843924191, + "grad_norm": 0.5223470330238342, + "learning_rate": 8e-05, + "loss": 1.8051, + "step": 3131 + }, + { + "epoch": 0.1745819397993311, + "grad_norm": 0.5153441429138184, + "learning_rate": 8e-05, + "loss": 1.9186, + "step": 3132 + }, + { + "epoch": 0.1746376811594203, + "grad_norm": 0.4976043701171875, + "learning_rate": 8e-05, + "loss": 1.7093, + "step": 3133 + }, + { + "epoch": 0.17469342251950948, + "grad_norm": 0.5194041132926941, + "learning_rate": 8e-05, + "loss": 1.8921, + "step": 3134 + }, + { + "epoch": 0.17474916387959866, + "grad_norm": 0.48766300082206726, + "learning_rate": 8e-05, + "loss": 1.8425, + "step": 3135 + }, + { + "epoch": 0.17480490523968784, + "grad_norm": 0.4757583439350128, + "learning_rate": 8e-05, + "loss": 1.7665, + "step": 3136 + }, + { + "epoch": 0.17486064659977704, + "grad_norm": 0.4470495879650116, + "learning_rate": 8e-05, + "loss": 1.5407, + "step": 3137 + }, + { + "epoch": 0.17491638795986622, + "grad_norm": 0.5015825033187866, + "learning_rate": 8e-05, + "loss": 1.5779, + "step": 3138 + }, + { + "epoch": 0.1749721293199554, + "grad_norm": 0.49691611528396606, + "learning_rate": 8e-05, + "loss": 1.9104, + "step": 3139 + }, + { + "epoch": 0.1750278706800446, + "grad_norm": 0.46983975172042847, + "learning_rate": 8e-05, + "loss": 1.6565, + "step": 3140 + }, + { + "epoch": 0.17508361204013378, + "grad_norm": 0.4764535129070282, + "learning_rate": 8e-05, + "loss": 1.8129, + "step": 3141 + }, + { + "epoch": 0.17513935340022296, + "grad_norm": 0.5874860286712646, + "learning_rate": 8e-05, + "loss": 2.2321, + "step": 3142 + }, + { + "epoch": 0.17519509476031214, + "grad_norm": 0.48502081632614136, + "learning_rate": 8e-05, + "loss": 1.7636, + "step": 3143 + }, + { + "epoch": 0.17525083612040135, + "grad_norm": 0.46429380774497986, + "learning_rate": 8e-05, + "loss": 1.7481, + "step": 3144 + }, + { + "epoch": 0.17530657748049053, + "grad_norm": 0.4960067570209503, + "learning_rate": 8e-05, + "loss": 1.7872, + 
"step": 3145 + }, + { + "epoch": 0.1753623188405797, + "grad_norm": 0.46121087670326233, + "learning_rate": 8e-05, + "loss": 1.6306, + "step": 3146 + }, + { + "epoch": 0.17541806020066888, + "grad_norm": 0.497938871383667, + "learning_rate": 8e-05, + "loss": 1.76, + "step": 3147 + }, + { + "epoch": 0.1754738015607581, + "grad_norm": 0.45689964294433594, + "learning_rate": 8e-05, + "loss": 1.5589, + "step": 3148 + }, + { + "epoch": 0.17552954292084727, + "grad_norm": 0.4511463940143585, + "learning_rate": 8e-05, + "loss": 1.7215, + "step": 3149 + }, + { + "epoch": 0.17558528428093645, + "grad_norm": 0.48650410771369934, + "learning_rate": 8e-05, + "loss": 1.6654, + "step": 3150 + }, + { + "epoch": 0.17564102564102563, + "grad_norm": 0.5025672316551208, + "learning_rate": 8e-05, + "loss": 1.638, + "step": 3151 + }, + { + "epoch": 0.17569676700111483, + "grad_norm": 0.4942224621772766, + "learning_rate": 8e-05, + "loss": 1.6517, + "step": 3152 + }, + { + "epoch": 0.175752508361204, + "grad_norm": 0.471215158700943, + "learning_rate": 8e-05, + "loss": 1.7378, + "step": 3153 + }, + { + "epoch": 0.1758082497212932, + "grad_norm": 0.5158399343490601, + "learning_rate": 8e-05, + "loss": 1.6319, + "step": 3154 + }, + { + "epoch": 0.1758639910813824, + "grad_norm": 0.47845038771629333, + "learning_rate": 8e-05, + "loss": 1.7183, + "step": 3155 + }, + { + "epoch": 0.17591973244147158, + "grad_norm": 0.5168014764785767, + "learning_rate": 8e-05, + "loss": 1.7709, + "step": 3156 + }, + { + "epoch": 0.17597547380156076, + "grad_norm": 0.4960615336894989, + "learning_rate": 8e-05, + "loss": 1.7438, + "step": 3157 + }, + { + "epoch": 0.17603121516164993, + "grad_norm": 0.4402819871902466, + "learning_rate": 8e-05, + "loss": 1.3315, + "step": 3158 + }, + { + "epoch": 0.17608695652173914, + "grad_norm": 0.4752297103404999, + "learning_rate": 8e-05, + "loss": 1.7649, + "step": 3159 + }, + { + "epoch": 0.17614269788182832, + "grad_norm": 0.4856758713722229, + "learning_rate": 8e-05, + 
"loss": 1.678, + "step": 3160 + }, + { + "epoch": 0.1761984392419175, + "grad_norm": 0.4814453721046448, + "learning_rate": 8e-05, + "loss": 1.7371, + "step": 3161 + }, + { + "epoch": 0.17625418060200668, + "grad_norm": 0.4642793536186218, + "learning_rate": 8e-05, + "loss": 1.6614, + "step": 3162 + }, + { + "epoch": 0.17630992196209588, + "grad_norm": 0.4925251305103302, + "learning_rate": 8e-05, + "loss": 1.5645, + "step": 3163 + }, + { + "epoch": 0.17636566332218506, + "grad_norm": 0.52099609375, + "learning_rate": 8e-05, + "loss": 1.6317, + "step": 3164 + }, + { + "epoch": 0.17642140468227424, + "grad_norm": 0.4923543930053711, + "learning_rate": 8e-05, + "loss": 1.6023, + "step": 3165 + }, + { + "epoch": 0.17647714604236345, + "grad_norm": 0.522320568561554, + "learning_rate": 8e-05, + "loss": 1.7948, + "step": 3166 + }, + { + "epoch": 0.17653288740245263, + "grad_norm": 0.48389363288879395, + "learning_rate": 8e-05, + "loss": 1.7495, + "step": 3167 + }, + { + "epoch": 0.1765886287625418, + "grad_norm": 0.5073885321617126, + "learning_rate": 8e-05, + "loss": 1.9295, + "step": 3168 + }, + { + "epoch": 0.17664437012263098, + "grad_norm": 0.45694851875305176, + "learning_rate": 8e-05, + "loss": 1.6847, + "step": 3169 + }, + { + "epoch": 0.1767001114827202, + "grad_norm": 0.48219072818756104, + "learning_rate": 8e-05, + "loss": 1.8583, + "step": 3170 + }, + { + "epoch": 0.17675585284280937, + "grad_norm": 0.4569888710975647, + "learning_rate": 8e-05, + "loss": 1.8423, + "step": 3171 + }, + { + "epoch": 0.17681159420289855, + "grad_norm": 0.49059024453163147, + "learning_rate": 8e-05, + "loss": 1.798, + "step": 3172 + }, + { + "epoch": 0.17686733556298773, + "grad_norm": 0.5194540619850159, + "learning_rate": 8e-05, + "loss": 1.5778, + "step": 3173 + }, + { + "epoch": 0.17692307692307693, + "grad_norm": 0.5097144842147827, + "learning_rate": 8e-05, + "loss": 1.5033, + "step": 3174 + }, + { + "epoch": 0.1769788182831661, + "grad_norm": 0.48274803161621094, + 
"learning_rate": 8e-05, + "loss": 1.7761, + "step": 3175 + }, + { + "epoch": 0.1770345596432553, + "grad_norm": 0.5038658976554871, + "learning_rate": 8e-05, + "loss": 1.7308, + "step": 3176 + }, + { + "epoch": 0.17709030100334447, + "grad_norm": 0.46520641446113586, + "learning_rate": 8e-05, + "loss": 1.6843, + "step": 3177 + }, + { + "epoch": 0.17714604236343368, + "grad_norm": 0.47753003239631653, + "learning_rate": 8e-05, + "loss": 1.7283, + "step": 3178 + }, + { + "epoch": 0.17720178372352285, + "grad_norm": 0.5261279940605164, + "learning_rate": 8e-05, + "loss": 1.86, + "step": 3179 + }, + { + "epoch": 0.17725752508361203, + "grad_norm": 0.497805655002594, + "learning_rate": 8e-05, + "loss": 1.6267, + "step": 3180 + }, + { + "epoch": 0.17731326644370124, + "grad_norm": 0.464225172996521, + "learning_rate": 8e-05, + "loss": 1.7415, + "step": 3181 + }, + { + "epoch": 0.17736900780379042, + "grad_norm": 0.4999517798423767, + "learning_rate": 8e-05, + "loss": 1.6564, + "step": 3182 + }, + { + "epoch": 0.1774247491638796, + "grad_norm": 0.5199729800224304, + "learning_rate": 8e-05, + "loss": 1.9543, + "step": 3183 + }, + { + "epoch": 0.17748049052396878, + "grad_norm": 0.48327144980430603, + "learning_rate": 8e-05, + "loss": 1.5824, + "step": 3184 + }, + { + "epoch": 0.17753623188405798, + "grad_norm": 0.5581362843513489, + "learning_rate": 8e-05, + "loss": 1.9749, + "step": 3185 + }, + { + "epoch": 0.17759197324414716, + "grad_norm": 0.4988551437854767, + "learning_rate": 8e-05, + "loss": 1.9346, + "step": 3186 + }, + { + "epoch": 0.17764771460423634, + "grad_norm": 0.5109734535217285, + "learning_rate": 8e-05, + "loss": 1.7523, + "step": 3187 + }, + { + "epoch": 0.17770345596432552, + "grad_norm": 0.4862396717071533, + "learning_rate": 8e-05, + "loss": 1.5092, + "step": 3188 + }, + { + "epoch": 0.17775919732441472, + "grad_norm": 0.44461333751678467, + "learning_rate": 8e-05, + "loss": 1.6452, + "step": 3189 + }, + { + "epoch": 0.1778149386845039, + "grad_norm": 
0.4702376425266266, + "learning_rate": 8e-05, + "loss": 1.6332, + "step": 3190 + }, + { + "epoch": 0.17787068004459308, + "grad_norm": 0.48019805550575256, + "learning_rate": 8e-05, + "loss": 1.5162, + "step": 3191 + }, + { + "epoch": 0.17792642140468226, + "grad_norm": 0.49448859691619873, + "learning_rate": 8e-05, + "loss": 1.6892, + "step": 3192 + }, + { + "epoch": 0.17798216276477147, + "grad_norm": 0.4584326446056366, + "learning_rate": 8e-05, + "loss": 1.6477, + "step": 3193 + }, + { + "epoch": 0.17803790412486065, + "grad_norm": 0.49770230054855347, + "learning_rate": 8e-05, + "loss": 1.7618, + "step": 3194 + }, + { + "epoch": 0.17809364548494983, + "grad_norm": 0.4516306221485138, + "learning_rate": 8e-05, + "loss": 1.7175, + "step": 3195 + }, + { + "epoch": 0.17814938684503903, + "grad_norm": 0.4722249507904053, + "learning_rate": 8e-05, + "loss": 1.4028, + "step": 3196 + }, + { + "epoch": 0.1782051282051282, + "grad_norm": 0.5114902853965759, + "learning_rate": 8e-05, + "loss": 1.7978, + "step": 3197 + }, + { + "epoch": 0.1782608695652174, + "grad_norm": 0.5099138021469116, + "learning_rate": 8e-05, + "loss": 2.0185, + "step": 3198 + }, + { + "epoch": 0.17831661092530657, + "grad_norm": 0.5226367712020874, + "learning_rate": 8e-05, + "loss": 1.8694, + "step": 3199 + }, + { + "epoch": 0.17837235228539577, + "grad_norm": 0.5212682485580444, + "learning_rate": 8e-05, + "loss": 1.9197, + "step": 3200 + }, + { + "epoch": 0.17842809364548495, + "grad_norm": 0.5298042893409729, + "learning_rate": 8e-05, + "loss": 1.9656, + "step": 3201 + }, + { + "epoch": 0.17848383500557413, + "grad_norm": 0.5426698327064514, + "learning_rate": 8e-05, + "loss": 1.6831, + "step": 3202 + }, + { + "epoch": 0.1785395763656633, + "grad_norm": 0.47143349051475525, + "learning_rate": 8e-05, + "loss": 1.6756, + "step": 3203 + }, + { + "epoch": 0.17859531772575252, + "grad_norm": 0.4910406470298767, + "learning_rate": 8e-05, + "loss": 1.9391, + "step": 3204 + }, + { + "epoch": 
0.1786510590858417, + "grad_norm": 0.5471988320350647, + "learning_rate": 8e-05, + "loss": 2.0599, + "step": 3205 + }, + { + "epoch": 0.17870680044593087, + "grad_norm": 0.4846929907798767, + "learning_rate": 8e-05, + "loss": 1.6376, + "step": 3206 + }, + { + "epoch": 0.17876254180602005, + "grad_norm": 0.5019040703773499, + "learning_rate": 8e-05, + "loss": 1.8373, + "step": 3207 + }, + { + "epoch": 0.17881828316610926, + "grad_norm": 0.5033964514732361, + "learning_rate": 8e-05, + "loss": 1.8035, + "step": 3208 + }, + { + "epoch": 0.17887402452619844, + "grad_norm": 0.5568781495094299, + "learning_rate": 8e-05, + "loss": 1.7519, + "step": 3209 + }, + { + "epoch": 0.17892976588628762, + "grad_norm": 0.5149691700935364, + "learning_rate": 8e-05, + "loss": 1.7481, + "step": 3210 + }, + { + "epoch": 0.17898550724637682, + "grad_norm": 0.47844332456588745, + "learning_rate": 8e-05, + "loss": 1.8602, + "step": 3211 + }, + { + "epoch": 0.179041248606466, + "grad_norm": 0.5395461916923523, + "learning_rate": 8e-05, + "loss": 1.764, + "step": 3212 + }, + { + "epoch": 0.17909698996655518, + "grad_norm": 0.4720330238342285, + "learning_rate": 8e-05, + "loss": 1.7078, + "step": 3213 + }, + { + "epoch": 0.17915273132664436, + "grad_norm": 0.458877295255661, + "learning_rate": 8e-05, + "loss": 1.7795, + "step": 3214 + }, + { + "epoch": 0.17920847268673357, + "grad_norm": 0.500705361366272, + "learning_rate": 8e-05, + "loss": 1.7592, + "step": 3215 + }, + { + "epoch": 0.17926421404682275, + "grad_norm": 0.5020824074745178, + "learning_rate": 8e-05, + "loss": 1.7152, + "step": 3216 + }, + { + "epoch": 0.17931995540691192, + "grad_norm": 0.4906066060066223, + "learning_rate": 8e-05, + "loss": 1.8397, + "step": 3217 + }, + { + "epoch": 0.1793756967670011, + "grad_norm": 0.4836278557777405, + "learning_rate": 8e-05, + "loss": 1.6074, + "step": 3218 + }, + { + "epoch": 0.1794314381270903, + "grad_norm": 0.49199914932250977, + "learning_rate": 8e-05, + "loss": 1.732, + "step": 3219 + 
}, + { + "epoch": 0.1794871794871795, + "grad_norm": 0.5196998715400696, + "learning_rate": 8e-05, + "loss": 1.7605, + "step": 3220 + }, + { + "epoch": 0.17954292084726867, + "grad_norm": 0.453242689371109, + "learning_rate": 8e-05, + "loss": 1.5641, + "step": 3221 + }, + { + "epoch": 0.17959866220735787, + "grad_norm": 0.43356654047966003, + "learning_rate": 8e-05, + "loss": 1.5325, + "step": 3222 + }, + { + "epoch": 0.17965440356744705, + "grad_norm": 0.4988141655921936, + "learning_rate": 8e-05, + "loss": 1.5707, + "step": 3223 + }, + { + "epoch": 0.17971014492753623, + "grad_norm": 0.49998071789741516, + "learning_rate": 8e-05, + "loss": 1.849, + "step": 3224 + }, + { + "epoch": 0.1797658862876254, + "grad_norm": 0.5347229838371277, + "learning_rate": 8e-05, + "loss": 1.9297, + "step": 3225 + }, + { + "epoch": 0.17982162764771462, + "grad_norm": 0.49117156863212585, + "learning_rate": 8e-05, + "loss": 1.6969, + "step": 3226 + }, + { + "epoch": 0.1798773690078038, + "grad_norm": 0.5386723875999451, + "learning_rate": 8e-05, + "loss": 1.8024, + "step": 3227 + }, + { + "epoch": 0.17993311036789297, + "grad_norm": 0.48068922758102417, + "learning_rate": 8e-05, + "loss": 1.6902, + "step": 3228 + }, + { + "epoch": 0.17998885172798215, + "grad_norm": 0.46101266145706177, + "learning_rate": 8e-05, + "loss": 1.5828, + "step": 3229 + }, + { + "epoch": 0.18004459308807136, + "grad_norm": 0.4808231294155121, + "learning_rate": 8e-05, + "loss": 1.6905, + "step": 3230 + }, + { + "epoch": 0.18010033444816054, + "grad_norm": 0.5057591795921326, + "learning_rate": 8e-05, + "loss": 1.8851, + "step": 3231 + }, + { + "epoch": 0.18015607580824972, + "grad_norm": 0.47574618458747864, + "learning_rate": 8e-05, + "loss": 1.6438, + "step": 3232 + }, + { + "epoch": 0.1802118171683389, + "grad_norm": 0.4971584677696228, + "learning_rate": 8e-05, + "loss": 1.8116, + "step": 3233 + }, + { + "epoch": 0.1802675585284281, + "grad_norm": 0.5194235444068909, + "learning_rate": 8e-05, + "loss": 
1.8604, + "step": 3234 + }, + { + "epoch": 0.18032329988851728, + "grad_norm": 0.4879535138607025, + "learning_rate": 8e-05, + "loss": 1.7534, + "step": 3235 + }, + { + "epoch": 0.18037904124860646, + "grad_norm": 0.5174830555915833, + "learning_rate": 8e-05, + "loss": 1.775, + "step": 3236 + }, + { + "epoch": 0.18043478260869567, + "grad_norm": 0.5025219917297363, + "learning_rate": 8e-05, + "loss": 1.911, + "step": 3237 + }, + { + "epoch": 0.18049052396878484, + "grad_norm": 0.5307998061180115, + "learning_rate": 8e-05, + "loss": 1.7317, + "step": 3238 + }, + { + "epoch": 0.18054626532887402, + "grad_norm": 0.47522616386413574, + "learning_rate": 8e-05, + "loss": 1.7512, + "step": 3239 + }, + { + "epoch": 0.1806020066889632, + "grad_norm": 0.4941982626914978, + "learning_rate": 8e-05, + "loss": 1.8178, + "step": 3240 + }, + { + "epoch": 0.1806577480490524, + "grad_norm": 0.5120488405227661, + "learning_rate": 8e-05, + "loss": 1.7547, + "step": 3241 + }, + { + "epoch": 0.1807134894091416, + "grad_norm": 0.5394561290740967, + "learning_rate": 8e-05, + "loss": 2.0231, + "step": 3242 + }, + { + "epoch": 0.18076923076923077, + "grad_norm": 0.45192497968673706, + "learning_rate": 8e-05, + "loss": 1.4677, + "step": 3243 + }, + { + "epoch": 0.18082497212931994, + "grad_norm": 0.4537535011768341, + "learning_rate": 8e-05, + "loss": 1.6961, + "step": 3244 + }, + { + "epoch": 0.18088071348940915, + "grad_norm": 0.5035440325737, + "learning_rate": 8e-05, + "loss": 1.7087, + "step": 3245 + }, + { + "epoch": 0.18093645484949833, + "grad_norm": 0.4700638949871063, + "learning_rate": 8e-05, + "loss": 1.7097, + "step": 3246 + }, + { + "epoch": 0.1809921962095875, + "grad_norm": 0.47896021604537964, + "learning_rate": 8e-05, + "loss": 1.724, + "step": 3247 + }, + { + "epoch": 0.1810479375696767, + "grad_norm": 0.5172485113143921, + "learning_rate": 8e-05, + "loss": 1.8554, + "step": 3248 + }, + { + "epoch": 0.1811036789297659, + "grad_norm": 0.4391520023345947, + "learning_rate": 
8e-05, + "loss": 1.6714, + "step": 3249 + }, + { + "epoch": 0.18115942028985507, + "grad_norm": 0.4664669632911682, + "learning_rate": 8e-05, + "loss": 1.5275, + "step": 3250 + }, + { + "epoch": 0.18121516164994425, + "grad_norm": 0.44695574045181274, + "learning_rate": 8e-05, + "loss": 1.5913, + "step": 3251 + }, + { + "epoch": 0.18127090301003346, + "grad_norm": 0.52314692735672, + "learning_rate": 8e-05, + "loss": 1.8208, + "step": 3252 + }, + { + "epoch": 0.18132664437012264, + "grad_norm": 0.4656141698360443, + "learning_rate": 8e-05, + "loss": 1.6378, + "step": 3253 + }, + { + "epoch": 0.18138238573021181, + "grad_norm": 0.4591492712497711, + "learning_rate": 8e-05, + "loss": 1.481, + "step": 3254 + }, + { + "epoch": 0.181438127090301, + "grad_norm": 0.4969112277030945, + "learning_rate": 8e-05, + "loss": 1.6094, + "step": 3255 + }, + { + "epoch": 0.1814938684503902, + "grad_norm": 0.4783157408237457, + "learning_rate": 8e-05, + "loss": 1.5105, + "step": 3256 + }, + { + "epoch": 0.18154960981047938, + "grad_norm": 0.5113158226013184, + "learning_rate": 8e-05, + "loss": 1.7022, + "step": 3257 + }, + { + "epoch": 0.18160535117056856, + "grad_norm": 0.48434895277023315, + "learning_rate": 8e-05, + "loss": 1.6306, + "step": 3258 + }, + { + "epoch": 0.18166109253065774, + "grad_norm": 0.6748455762863159, + "learning_rate": 8e-05, + "loss": 1.6323, + "step": 3259 + }, + { + "epoch": 0.18171683389074694, + "grad_norm": 0.47873955965042114, + "learning_rate": 8e-05, + "loss": 1.6777, + "step": 3260 + }, + { + "epoch": 0.18177257525083612, + "grad_norm": 0.4789445102214813, + "learning_rate": 8e-05, + "loss": 1.8353, + "step": 3261 + }, + { + "epoch": 0.1818283166109253, + "grad_norm": 0.538105309009552, + "learning_rate": 8e-05, + "loss": 1.8148, + "step": 3262 + }, + { + "epoch": 0.18188405797101448, + "grad_norm": 0.4994378685951233, + "learning_rate": 8e-05, + "loss": 1.7498, + "step": 3263 + }, + { + "epoch": 0.18193979933110369, + "grad_norm": 
0.4745805561542511, + "learning_rate": 8e-05, + "loss": 1.7025, + "step": 3264 + }, + { + "epoch": 0.18199554069119286, + "grad_norm": 0.46566084027290344, + "learning_rate": 8e-05, + "loss": 1.7204, + "step": 3265 + }, + { + "epoch": 0.18205128205128204, + "grad_norm": 0.475907564163208, + "learning_rate": 8e-05, + "loss": 1.6802, + "step": 3266 + }, + { + "epoch": 0.18210702341137125, + "grad_norm": 0.475202739238739, + "learning_rate": 8e-05, + "loss": 1.649, + "step": 3267 + }, + { + "epoch": 0.18216276477146043, + "grad_norm": 0.5017547011375427, + "learning_rate": 8e-05, + "loss": 1.7254, + "step": 3268 + }, + { + "epoch": 0.1822185061315496, + "grad_norm": 0.5284735560417175, + "learning_rate": 8e-05, + "loss": 1.9484, + "step": 3269 + }, + { + "epoch": 0.18227424749163879, + "grad_norm": 0.4836098253726959, + "learning_rate": 8e-05, + "loss": 1.7622, + "step": 3270 + }, + { + "epoch": 0.182329988851728, + "grad_norm": 0.5028160810470581, + "learning_rate": 8e-05, + "loss": 1.8658, + "step": 3271 + }, + { + "epoch": 0.18238573021181717, + "grad_norm": 0.5013949275016785, + "learning_rate": 8e-05, + "loss": 1.6311, + "step": 3272 + }, + { + "epoch": 0.18244147157190635, + "grad_norm": 0.5350913405418396, + "learning_rate": 8e-05, + "loss": 1.9957, + "step": 3273 + }, + { + "epoch": 0.18249721293199553, + "grad_norm": 0.49767985939979553, + "learning_rate": 8e-05, + "loss": 1.6772, + "step": 3274 + }, + { + "epoch": 0.18255295429208473, + "grad_norm": 0.5184692740440369, + "learning_rate": 8e-05, + "loss": 2.0062, + "step": 3275 + }, + { + "epoch": 0.1826086956521739, + "grad_norm": 0.4783124029636383, + "learning_rate": 8e-05, + "loss": 1.7281, + "step": 3276 + }, + { + "epoch": 0.1826644370122631, + "grad_norm": 0.4819895923137665, + "learning_rate": 8e-05, + "loss": 1.5692, + "step": 3277 + }, + { + "epoch": 0.1827201783723523, + "grad_norm": 0.48155200481414795, + "learning_rate": 8e-05, + "loss": 1.6602, + "step": 3278 + }, + { + "epoch": 
0.18277591973244148, + "grad_norm": 0.492043673992157, + "learning_rate": 8e-05, + "loss": 1.7576, + "step": 3279 + }, + { + "epoch": 0.18283166109253066, + "grad_norm": 0.4913448095321655, + "learning_rate": 8e-05, + "loss": 1.8854, + "step": 3280 + }, + { + "epoch": 0.18288740245261983, + "grad_norm": 0.4953262507915497, + "learning_rate": 8e-05, + "loss": 1.7527, + "step": 3281 + }, + { + "epoch": 0.18294314381270904, + "grad_norm": 0.482516884803772, + "learning_rate": 8e-05, + "loss": 1.8599, + "step": 3282 + }, + { + "epoch": 0.18299888517279822, + "grad_norm": 0.4654371440410614, + "learning_rate": 8e-05, + "loss": 1.8008, + "step": 3283 + }, + { + "epoch": 0.1830546265328874, + "grad_norm": 0.44700589776039124, + "learning_rate": 8e-05, + "loss": 1.7483, + "step": 3284 + }, + { + "epoch": 0.18311036789297658, + "grad_norm": 0.49507349729537964, + "learning_rate": 8e-05, + "loss": 1.9131, + "step": 3285 + }, + { + "epoch": 0.18316610925306578, + "grad_norm": 0.46997302770614624, + "learning_rate": 8e-05, + "loss": 1.6543, + "step": 3286 + }, + { + "epoch": 0.18322185061315496, + "grad_norm": 0.5000645518302917, + "learning_rate": 8e-05, + "loss": 1.868, + "step": 3287 + }, + { + "epoch": 0.18327759197324414, + "grad_norm": 0.5379271507263184, + "learning_rate": 8e-05, + "loss": 1.9267, + "step": 3288 + }, + { + "epoch": 0.18333333333333332, + "grad_norm": 0.4435328543186188, + "learning_rate": 8e-05, + "loss": 1.5935, + "step": 3289 + }, + { + "epoch": 0.18338907469342253, + "grad_norm": 0.4761490523815155, + "learning_rate": 8e-05, + "loss": 1.828, + "step": 3290 + }, + { + "epoch": 0.1834448160535117, + "grad_norm": 0.5158578753471375, + "learning_rate": 8e-05, + "loss": 1.6859, + "step": 3291 + }, + { + "epoch": 0.18350055741360088, + "grad_norm": 0.45683667063713074, + "learning_rate": 8e-05, + "loss": 1.5538, + "step": 3292 + }, + { + "epoch": 0.1835562987736901, + "grad_norm": 0.460464209318161, + "learning_rate": 8e-05, + "loss": 1.775, + "step": 3293 
+ }, + { + "epoch": 0.18361204013377927, + "grad_norm": 0.4554464817047119, + "learning_rate": 8e-05, + "loss": 1.723, + "step": 3294 + }, + { + "epoch": 0.18366778149386845, + "grad_norm": 0.49409928917884827, + "learning_rate": 8e-05, + "loss": 1.8673, + "step": 3295 + }, + { + "epoch": 0.18372352285395763, + "grad_norm": 0.5223574638366699, + "learning_rate": 8e-05, + "loss": 1.7798, + "step": 3296 + }, + { + "epoch": 0.18377926421404683, + "grad_norm": 0.4630155563354492, + "learning_rate": 8e-05, + "loss": 1.6178, + "step": 3297 + }, + { + "epoch": 0.183835005574136, + "grad_norm": 0.5209290981292725, + "learning_rate": 8e-05, + "loss": 1.9866, + "step": 3298 + }, + { + "epoch": 0.1838907469342252, + "grad_norm": 0.48574021458625793, + "learning_rate": 8e-05, + "loss": 1.7712, + "step": 3299 + }, + { + "epoch": 0.18394648829431437, + "grad_norm": 0.5119464993476868, + "learning_rate": 8e-05, + "loss": 1.9065, + "step": 3300 + }, + { + "epoch": 0.18400222965440358, + "grad_norm": 0.4706062078475952, + "learning_rate": 8e-05, + "loss": 1.7841, + "step": 3301 + }, + { + "epoch": 0.18405797101449275, + "grad_norm": 0.47743088006973267, + "learning_rate": 8e-05, + "loss": 1.722, + "step": 3302 + }, + { + "epoch": 0.18411371237458193, + "grad_norm": 0.4631895422935486, + "learning_rate": 8e-05, + "loss": 1.7439, + "step": 3303 + }, + { + "epoch": 0.1841694537346711, + "grad_norm": 0.4625717103481293, + "learning_rate": 8e-05, + "loss": 1.62, + "step": 3304 + }, + { + "epoch": 0.18422519509476032, + "grad_norm": 0.5006458759307861, + "learning_rate": 8e-05, + "loss": 1.9197, + "step": 3305 + }, + { + "epoch": 0.1842809364548495, + "grad_norm": 0.5063039064407349, + "learning_rate": 8e-05, + "loss": 2.0728, + "step": 3306 + }, + { + "epoch": 0.18433667781493868, + "grad_norm": 0.4803706109523773, + "learning_rate": 8e-05, + "loss": 1.7504, + "step": 3307 + }, + { + "epoch": 0.18439241917502788, + "grad_norm": 0.47429147362709045, + "learning_rate": 8e-05, + "loss": 
1.8056, + "step": 3308 + }, + { + "epoch": 0.18444816053511706, + "grad_norm": 0.4567185342311859, + "learning_rate": 8e-05, + "loss": 1.721, + "step": 3309 + }, + { + "epoch": 0.18450390189520624, + "grad_norm": 0.4384846091270447, + "learning_rate": 8e-05, + "loss": 1.7278, + "step": 3310 + }, + { + "epoch": 0.18455964325529542, + "grad_norm": 0.4725017845630646, + "learning_rate": 8e-05, + "loss": 1.5995, + "step": 3311 + }, + { + "epoch": 0.18461538461538463, + "grad_norm": 0.46991947293281555, + "learning_rate": 8e-05, + "loss": 1.3783, + "step": 3312 + }, + { + "epoch": 0.1846711259754738, + "grad_norm": 0.5544723868370056, + "learning_rate": 8e-05, + "loss": 1.9191, + "step": 3313 + }, + { + "epoch": 0.18472686733556298, + "grad_norm": 0.46554556488990784, + "learning_rate": 8e-05, + "loss": 1.7393, + "step": 3314 + }, + { + "epoch": 0.18478260869565216, + "grad_norm": 0.4672147035598755, + "learning_rate": 8e-05, + "loss": 1.7015, + "step": 3315 + }, + { + "epoch": 0.18483835005574137, + "grad_norm": 0.47809523344039917, + "learning_rate": 8e-05, + "loss": 1.7469, + "step": 3316 + }, + { + "epoch": 0.18489409141583055, + "grad_norm": 0.4963299334049225, + "learning_rate": 8e-05, + "loss": 1.9078, + "step": 3317 + }, + { + "epoch": 0.18494983277591973, + "grad_norm": 0.48992517590522766, + "learning_rate": 8e-05, + "loss": 1.5713, + "step": 3318 + }, + { + "epoch": 0.18500557413600893, + "grad_norm": 0.49141886830329895, + "learning_rate": 8e-05, + "loss": 1.847, + "step": 3319 + }, + { + "epoch": 0.1850613154960981, + "grad_norm": 0.4902811646461487, + "learning_rate": 8e-05, + "loss": 1.675, + "step": 3320 + }, + { + "epoch": 0.1851170568561873, + "grad_norm": 0.5069705843925476, + "learning_rate": 8e-05, + "loss": 1.9477, + "step": 3321 + }, + { + "epoch": 0.18517279821627647, + "grad_norm": 0.5962790846824646, + "learning_rate": 8e-05, + "loss": 1.8071, + "step": 3322 + }, + { + "epoch": 0.18522853957636568, + "grad_norm": 0.4733704626560211, + 
"learning_rate": 8e-05, + "loss": 1.4901, + "step": 3323 + }, + { + "epoch": 0.18528428093645485, + "grad_norm": 0.514161229133606, + "learning_rate": 8e-05, + "loss": 1.7563, + "step": 3324 + }, + { + "epoch": 0.18534002229654403, + "grad_norm": 0.5042579770088196, + "learning_rate": 8e-05, + "loss": 1.7477, + "step": 3325 + }, + { + "epoch": 0.1853957636566332, + "grad_norm": 0.503969132900238, + "learning_rate": 8e-05, + "loss": 1.7513, + "step": 3326 + }, + { + "epoch": 0.18545150501672242, + "grad_norm": 0.503426730632782, + "learning_rate": 8e-05, + "loss": 1.6288, + "step": 3327 + }, + { + "epoch": 0.1855072463768116, + "grad_norm": 0.48891016840934753, + "learning_rate": 8e-05, + "loss": 1.7253, + "step": 3328 + }, + { + "epoch": 0.18556298773690078, + "grad_norm": 0.5385537147521973, + "learning_rate": 8e-05, + "loss": 1.6348, + "step": 3329 + }, + { + "epoch": 0.18561872909698995, + "grad_norm": 0.4645160138607025, + "learning_rate": 8e-05, + "loss": 1.7895, + "step": 3330 + }, + { + "epoch": 0.18567447045707916, + "grad_norm": 0.49372631311416626, + "learning_rate": 8e-05, + "loss": 1.5114, + "step": 3331 + }, + { + "epoch": 0.18573021181716834, + "grad_norm": 0.46734169125556946, + "learning_rate": 8e-05, + "loss": 1.5832, + "step": 3332 + }, + { + "epoch": 0.18578595317725752, + "grad_norm": 0.5161205530166626, + "learning_rate": 8e-05, + "loss": 1.863, + "step": 3333 + }, + { + "epoch": 0.18584169453734672, + "grad_norm": 0.47789087891578674, + "learning_rate": 8e-05, + "loss": 1.5687, + "step": 3334 + }, + { + "epoch": 0.1858974358974359, + "grad_norm": 0.5009278655052185, + "learning_rate": 8e-05, + "loss": 1.891, + "step": 3335 + }, + { + "epoch": 0.18595317725752508, + "grad_norm": 0.44556164741516113, + "learning_rate": 8e-05, + "loss": 1.7084, + "step": 3336 + }, + { + "epoch": 0.18600891861761426, + "grad_norm": 0.45805823802948, + "learning_rate": 8e-05, + "loss": 1.5131, + "step": 3337 + }, + { + "epoch": 0.18606465997770347, + "grad_norm": 
0.48302045464515686, + "learning_rate": 8e-05, + "loss": 1.7633, + "step": 3338 + }, + { + "epoch": 0.18612040133779265, + "grad_norm": 0.4615001380443573, + "learning_rate": 8e-05, + "loss": 1.4623, + "step": 3339 + }, + { + "epoch": 0.18617614269788182, + "grad_norm": 0.49533554911613464, + "learning_rate": 8e-05, + "loss": 1.6608, + "step": 3340 + }, + { + "epoch": 0.186231884057971, + "grad_norm": 0.43167346715927124, + "learning_rate": 8e-05, + "loss": 1.4441, + "step": 3341 + }, + { + "epoch": 0.1862876254180602, + "grad_norm": 0.527940571308136, + "learning_rate": 8e-05, + "loss": 1.5926, + "step": 3342 + }, + { + "epoch": 0.1863433667781494, + "grad_norm": 0.5032770037651062, + "learning_rate": 8e-05, + "loss": 1.6924, + "step": 3343 + }, + { + "epoch": 0.18639910813823857, + "grad_norm": 0.5254105925559998, + "learning_rate": 8e-05, + "loss": 1.9019, + "step": 3344 + }, + { + "epoch": 0.18645484949832775, + "grad_norm": 0.5116928815841675, + "learning_rate": 8e-05, + "loss": 1.8028, + "step": 3345 + }, + { + "epoch": 0.18651059085841695, + "grad_norm": 0.45237305760383606, + "learning_rate": 8e-05, + "loss": 1.5661, + "step": 3346 + }, + { + "epoch": 0.18656633221850613, + "grad_norm": 0.4751761853694916, + "learning_rate": 8e-05, + "loss": 1.6603, + "step": 3347 + }, + { + "epoch": 0.1866220735785953, + "grad_norm": 0.5808463096618652, + "learning_rate": 8e-05, + "loss": 1.9472, + "step": 3348 + }, + { + "epoch": 0.18667781493868452, + "grad_norm": 0.48247790336608887, + "learning_rate": 8e-05, + "loss": 1.6478, + "step": 3349 + }, + { + "epoch": 0.1867335562987737, + "grad_norm": 0.49884429574012756, + "learning_rate": 8e-05, + "loss": 1.9385, + "step": 3350 + }, + { + "epoch": 0.18678929765886287, + "grad_norm": 0.49466973543167114, + "learning_rate": 8e-05, + "loss": 1.7743, + "step": 3351 + }, + { + "epoch": 0.18684503901895205, + "grad_norm": 0.47581541538238525, + "learning_rate": 8e-05, + "loss": 1.552, + "step": 3352 + }, + { + "epoch": 
0.18690078037904126, + "grad_norm": 0.448135107755661, + "learning_rate": 8e-05, + "loss": 1.5884, + "step": 3353 + }, + { + "epoch": 0.18695652173913044, + "grad_norm": 0.560992956161499, + "learning_rate": 8e-05, + "loss": 1.9356, + "step": 3354 + }, + { + "epoch": 0.18701226309921962, + "grad_norm": 0.5413427352905273, + "learning_rate": 8e-05, + "loss": 1.7125, + "step": 3355 + }, + { + "epoch": 0.1870680044593088, + "grad_norm": 0.48547688126564026, + "learning_rate": 8e-05, + "loss": 1.7948, + "step": 3356 + }, + { + "epoch": 0.187123745819398, + "grad_norm": 0.4625580906867981, + "learning_rate": 8e-05, + "loss": 1.4295, + "step": 3357 + }, + { + "epoch": 0.18717948717948718, + "grad_norm": 0.49883168935775757, + "learning_rate": 8e-05, + "loss": 1.6727, + "step": 3358 + }, + { + "epoch": 0.18723522853957636, + "grad_norm": 0.4742003381252289, + "learning_rate": 8e-05, + "loss": 1.52, + "step": 3359 + }, + { + "epoch": 0.18729096989966554, + "grad_norm": 0.5026939511299133, + "learning_rate": 8e-05, + "loss": 1.6116, + "step": 3360 + }, + { + "epoch": 0.18734671125975474, + "grad_norm": 0.48718181252479553, + "learning_rate": 8e-05, + "loss": 1.7806, + "step": 3361 + }, + { + "epoch": 0.18740245261984392, + "grad_norm": 0.5140118598937988, + "learning_rate": 8e-05, + "loss": 1.7943, + "step": 3362 + }, + { + "epoch": 0.1874581939799331, + "grad_norm": 0.506673276424408, + "learning_rate": 8e-05, + "loss": 1.7472, + "step": 3363 + }, + { + "epoch": 0.1875139353400223, + "grad_norm": 0.5099788904190063, + "learning_rate": 8e-05, + "loss": 1.7816, + "step": 3364 + }, + { + "epoch": 0.1875696767001115, + "grad_norm": 0.5400450229644775, + "learning_rate": 8e-05, + "loss": 1.7322, + "step": 3365 + }, + { + "epoch": 0.18762541806020067, + "grad_norm": 0.48514536023139954, + "learning_rate": 8e-05, + "loss": 1.7965, + "step": 3366 + }, + { + "epoch": 0.18768115942028984, + "grad_norm": 0.47433406114578247, + "learning_rate": 8e-05, + "loss": 1.6121, + "step": 3367 
+ }, + { + "epoch": 0.18773690078037905, + "grad_norm": 0.46342793107032776, + "learning_rate": 8e-05, + "loss": 1.7103, + "step": 3368 + }, + { + "epoch": 0.18779264214046823, + "grad_norm": 0.44723591208457947, + "learning_rate": 8e-05, + "loss": 1.4898, + "step": 3369 + }, + { + "epoch": 0.1878483835005574, + "grad_norm": 0.4659675657749176, + "learning_rate": 8e-05, + "loss": 1.5409, + "step": 3370 + }, + { + "epoch": 0.1879041248606466, + "grad_norm": 0.5222867131233215, + "learning_rate": 8e-05, + "loss": 1.7159, + "step": 3371 + }, + { + "epoch": 0.1879598662207358, + "grad_norm": 0.49539661407470703, + "learning_rate": 8e-05, + "loss": 1.8474, + "step": 3372 + }, + { + "epoch": 0.18801560758082497, + "grad_norm": 0.5112997889518738, + "learning_rate": 8e-05, + "loss": 1.9201, + "step": 3373 + }, + { + "epoch": 0.18807134894091415, + "grad_norm": 0.44712772965431213, + "learning_rate": 8e-05, + "loss": 1.4429, + "step": 3374 + }, + { + "epoch": 0.18812709030100336, + "grad_norm": 0.48310035467147827, + "learning_rate": 8e-05, + "loss": 1.689, + "step": 3375 + }, + { + "epoch": 0.18818283166109254, + "grad_norm": 0.4607887864112854, + "learning_rate": 8e-05, + "loss": 1.3945, + "step": 3376 + }, + { + "epoch": 0.18823857302118172, + "grad_norm": 0.5434450507164001, + "learning_rate": 8e-05, + "loss": 1.9745, + "step": 3377 + }, + { + "epoch": 0.1882943143812709, + "grad_norm": 0.48917487263679504, + "learning_rate": 8e-05, + "loss": 1.6444, + "step": 3378 + }, + { + "epoch": 0.1883500557413601, + "grad_norm": 0.5110101103782654, + "learning_rate": 8e-05, + "loss": 2.0023, + "step": 3379 + }, + { + "epoch": 0.18840579710144928, + "grad_norm": 0.489247590303421, + "learning_rate": 8e-05, + "loss": 1.7046, + "step": 3380 + }, + { + "epoch": 0.18846153846153846, + "grad_norm": 0.5191060304641724, + "learning_rate": 8e-05, + "loss": 1.8036, + "step": 3381 + }, + { + "epoch": 0.18851727982162764, + "grad_norm": 0.5540537238121033, + "learning_rate": 8e-05, + 
"loss": 1.7418, + "step": 3382 + }, + { + "epoch": 0.18857302118171684, + "grad_norm": 0.575836718082428, + "learning_rate": 8e-05, + "loss": 2.0404, + "step": 3383 + }, + { + "epoch": 0.18862876254180602, + "grad_norm": 0.5042715072631836, + "learning_rate": 8e-05, + "loss": 1.6793, + "step": 3384 + }, + { + "epoch": 0.1886845039018952, + "grad_norm": 0.45747798681259155, + "learning_rate": 8e-05, + "loss": 1.6696, + "step": 3385 + }, + { + "epoch": 0.18874024526198438, + "grad_norm": 0.4825889468193054, + "learning_rate": 8e-05, + "loss": 1.8729, + "step": 3386 + }, + { + "epoch": 0.18879598662207359, + "grad_norm": 0.5231027603149414, + "learning_rate": 8e-05, + "loss": 1.7968, + "step": 3387 + }, + { + "epoch": 0.18885172798216276, + "grad_norm": 0.4612344205379486, + "learning_rate": 8e-05, + "loss": 1.6421, + "step": 3388 + }, + { + "epoch": 0.18890746934225194, + "grad_norm": 0.44895699620246887, + "learning_rate": 8e-05, + "loss": 1.6622, + "step": 3389 + }, + { + "epoch": 0.18896321070234115, + "grad_norm": 0.44689661264419556, + "learning_rate": 8e-05, + "loss": 1.3054, + "step": 3390 + }, + { + "epoch": 0.18901895206243033, + "grad_norm": 0.5180425047874451, + "learning_rate": 8e-05, + "loss": 2.0898, + "step": 3391 + }, + { + "epoch": 0.1890746934225195, + "grad_norm": 0.44576889276504517, + "learning_rate": 8e-05, + "loss": 1.6609, + "step": 3392 + }, + { + "epoch": 0.1891304347826087, + "grad_norm": 0.47959277033805847, + "learning_rate": 8e-05, + "loss": 1.7771, + "step": 3393 + }, + { + "epoch": 0.1891861761426979, + "grad_norm": 0.4864841401576996, + "learning_rate": 8e-05, + "loss": 1.8195, + "step": 3394 + }, + { + "epoch": 0.18924191750278707, + "grad_norm": 0.4945772588253021, + "learning_rate": 8e-05, + "loss": 1.7601, + "step": 3395 + }, + { + "epoch": 0.18929765886287625, + "grad_norm": 0.4611192047595978, + "learning_rate": 8e-05, + "loss": 1.6278, + "step": 3396 + }, + { + "epoch": 0.18935340022296543, + "grad_norm": 0.5064223408699036, + 
"learning_rate": 8e-05, + "loss": 1.9821, + "step": 3397 + }, + { + "epoch": 0.18940914158305464, + "grad_norm": 0.49804046750068665, + "learning_rate": 8e-05, + "loss": 1.8064, + "step": 3398 + }, + { + "epoch": 0.18946488294314381, + "grad_norm": 0.4759809076786041, + "learning_rate": 8e-05, + "loss": 1.8248, + "step": 3399 + }, + { + "epoch": 0.189520624303233, + "grad_norm": 0.462550550699234, + "learning_rate": 8e-05, + "loss": 1.6176, + "step": 3400 + }, + { + "epoch": 0.18957636566332217, + "grad_norm": 0.4693813920021057, + "learning_rate": 8e-05, + "loss": 1.6927, + "step": 3401 + }, + { + "epoch": 0.18963210702341138, + "grad_norm": 0.48910364508628845, + "learning_rate": 8e-05, + "loss": 1.6771, + "step": 3402 + }, + { + "epoch": 0.18968784838350056, + "grad_norm": 0.4354751706123352, + "learning_rate": 8e-05, + "loss": 1.5138, + "step": 3403 + }, + { + "epoch": 0.18974358974358974, + "grad_norm": 0.4465082883834839, + "learning_rate": 8e-05, + "loss": 1.6123, + "step": 3404 + }, + { + "epoch": 0.18979933110367894, + "grad_norm": 0.5002762079238892, + "learning_rate": 8e-05, + "loss": 1.8652, + "step": 3405 + }, + { + "epoch": 0.18985507246376812, + "grad_norm": 0.46198657155036926, + "learning_rate": 8e-05, + "loss": 1.4936, + "step": 3406 + }, + { + "epoch": 0.1899108138238573, + "grad_norm": 0.5062534213066101, + "learning_rate": 8e-05, + "loss": 1.8373, + "step": 3407 + }, + { + "epoch": 0.18996655518394648, + "grad_norm": 0.4888454079627991, + "learning_rate": 8e-05, + "loss": 1.6433, + "step": 3408 + }, + { + "epoch": 0.19002229654403568, + "grad_norm": 0.5151571035385132, + "learning_rate": 8e-05, + "loss": 1.7548, + "step": 3409 + }, + { + "epoch": 0.19007803790412486, + "grad_norm": 0.5547520518302917, + "learning_rate": 8e-05, + "loss": 2.0664, + "step": 3410 + }, + { + "epoch": 0.19013377926421404, + "grad_norm": 0.4662136733531952, + "learning_rate": 8e-05, + "loss": 1.6069, + "step": 3411 + }, + { + "epoch": 0.19018952062430322, + 
"grad_norm": 0.48590052127838135, + "learning_rate": 8e-05, + "loss": 1.776, + "step": 3412 + }, + { + "epoch": 0.19024526198439243, + "grad_norm": 0.4569084048271179, + "learning_rate": 8e-05, + "loss": 1.6613, + "step": 3413 + }, + { + "epoch": 0.1903010033444816, + "grad_norm": 0.5148277282714844, + "learning_rate": 8e-05, + "loss": 1.5761, + "step": 3414 + }, + { + "epoch": 0.19035674470457078, + "grad_norm": 0.4618729054927826, + "learning_rate": 8e-05, + "loss": 1.7345, + "step": 3415 + }, + { + "epoch": 0.19041248606466, + "grad_norm": 0.44613951444625854, + "learning_rate": 8e-05, + "loss": 1.6473, + "step": 3416 + }, + { + "epoch": 0.19046822742474917, + "grad_norm": 0.49018537998199463, + "learning_rate": 8e-05, + "loss": 1.8905, + "step": 3417 + }, + { + "epoch": 0.19052396878483835, + "grad_norm": 0.47118470072746277, + "learning_rate": 8e-05, + "loss": 1.7407, + "step": 3418 + }, + { + "epoch": 0.19057971014492753, + "grad_norm": 0.46336379647254944, + "learning_rate": 8e-05, + "loss": 1.5438, + "step": 3419 + }, + { + "epoch": 0.19063545150501673, + "grad_norm": 0.49951037764549255, + "learning_rate": 8e-05, + "loss": 1.7482, + "step": 3420 + }, + { + "epoch": 0.1906911928651059, + "grad_norm": 0.5517299771308899, + "learning_rate": 8e-05, + "loss": 2.0162, + "step": 3421 + }, + { + "epoch": 0.1907469342251951, + "grad_norm": 0.4926646053791046, + "learning_rate": 8e-05, + "loss": 1.6469, + "step": 3422 + }, + { + "epoch": 0.19080267558528427, + "grad_norm": 0.524966299533844, + "learning_rate": 8e-05, + "loss": 1.6917, + "step": 3423 + }, + { + "epoch": 0.19085841694537348, + "grad_norm": 0.469455361366272, + "learning_rate": 8e-05, + "loss": 1.6371, + "step": 3424 + }, + { + "epoch": 0.19091415830546266, + "grad_norm": 0.4847599267959595, + "learning_rate": 8e-05, + "loss": 1.6893, + "step": 3425 + }, + { + "epoch": 0.19096989966555183, + "grad_norm": 0.5023056268692017, + "learning_rate": 8e-05, + "loss": 1.7794, + "step": 3426 + }, + { + "epoch": 
0.191025641025641, + "grad_norm": 0.48208606243133545, + "learning_rate": 8e-05, + "loss": 1.8087, + "step": 3427 + }, + { + "epoch": 0.19108138238573022, + "grad_norm": 0.4761766493320465, + "learning_rate": 8e-05, + "loss": 1.7857, + "step": 3428 + }, + { + "epoch": 0.1911371237458194, + "grad_norm": 0.43917298316955566, + "learning_rate": 8e-05, + "loss": 1.4967, + "step": 3429 + }, + { + "epoch": 0.19119286510590858, + "grad_norm": 0.5421159267425537, + "learning_rate": 8e-05, + "loss": 1.8842, + "step": 3430 + }, + { + "epoch": 0.19124860646599778, + "grad_norm": 0.4713871479034424, + "learning_rate": 8e-05, + "loss": 1.6097, + "step": 3431 + }, + { + "epoch": 0.19130434782608696, + "grad_norm": 0.5096777081489563, + "learning_rate": 8e-05, + "loss": 1.7582, + "step": 3432 + }, + { + "epoch": 0.19136008918617614, + "grad_norm": 0.5059357285499573, + "learning_rate": 8e-05, + "loss": 1.8719, + "step": 3433 + }, + { + "epoch": 0.19141583054626532, + "grad_norm": 0.4401608109474182, + "learning_rate": 8e-05, + "loss": 1.6274, + "step": 3434 + }, + { + "epoch": 0.19147157190635453, + "grad_norm": 0.4493738114833832, + "learning_rate": 8e-05, + "loss": 1.6411, + "step": 3435 + }, + { + "epoch": 0.1915273132664437, + "grad_norm": 0.47014740109443665, + "learning_rate": 8e-05, + "loss": 1.7378, + "step": 3436 + }, + { + "epoch": 0.19158305462653288, + "grad_norm": 0.4577510952949524, + "learning_rate": 8e-05, + "loss": 1.6463, + "step": 3437 + }, + { + "epoch": 0.19163879598662206, + "grad_norm": 0.44429123401641846, + "learning_rate": 8e-05, + "loss": 1.6286, + "step": 3438 + }, + { + "epoch": 0.19169453734671127, + "grad_norm": 0.557929515838623, + "learning_rate": 8e-05, + "loss": 1.4677, + "step": 3439 + }, + { + "epoch": 0.19175027870680045, + "grad_norm": 0.5293607711791992, + "learning_rate": 8e-05, + "loss": 1.7674, + "step": 3440 + }, + { + "epoch": 0.19180602006688963, + "grad_norm": 0.5156054496765137, + "learning_rate": 8e-05, + "loss": 1.9326, + "step": 
3441 + }, + { + "epoch": 0.1918617614269788, + "grad_norm": 0.576087474822998, + "learning_rate": 8e-05, + "loss": 1.8974, + "step": 3442 + }, + { + "epoch": 0.191917502787068, + "grad_norm": 0.435739129781723, + "learning_rate": 8e-05, + "loss": 1.5585, + "step": 3443 + }, + { + "epoch": 0.1919732441471572, + "grad_norm": 0.4967978894710541, + "learning_rate": 8e-05, + "loss": 2.0174, + "step": 3444 + }, + { + "epoch": 0.19202898550724637, + "grad_norm": 0.4740888178348541, + "learning_rate": 8e-05, + "loss": 1.7055, + "step": 3445 + }, + { + "epoch": 0.19208472686733558, + "grad_norm": 0.5190867185592651, + "learning_rate": 8e-05, + "loss": 1.8603, + "step": 3446 + }, + { + "epoch": 0.19214046822742475, + "grad_norm": 0.5093804001808167, + "learning_rate": 8e-05, + "loss": 1.692, + "step": 3447 + }, + { + "epoch": 0.19219620958751393, + "grad_norm": 0.6346522569656372, + "learning_rate": 8e-05, + "loss": 1.709, + "step": 3448 + }, + { + "epoch": 0.1922519509476031, + "grad_norm": 0.5031291842460632, + "learning_rate": 8e-05, + "loss": 1.7797, + "step": 3449 + }, + { + "epoch": 0.19230769230769232, + "grad_norm": 0.5159427523612976, + "learning_rate": 8e-05, + "loss": 1.6472, + "step": 3450 + }, + { + "epoch": 0.1923634336677815, + "grad_norm": 0.5027660727500916, + "learning_rate": 8e-05, + "loss": 1.7745, + "step": 3451 + }, + { + "epoch": 0.19241917502787068, + "grad_norm": 0.5181394219398499, + "learning_rate": 8e-05, + "loss": 1.8142, + "step": 3452 + }, + { + "epoch": 0.19247491638795985, + "grad_norm": 0.47508713603019714, + "learning_rate": 8e-05, + "loss": 1.723, + "step": 3453 + }, + { + "epoch": 0.19253065774804906, + "grad_norm": 0.49145597219467163, + "learning_rate": 8e-05, + "loss": 1.8007, + "step": 3454 + }, + { + "epoch": 0.19258639910813824, + "grad_norm": 0.4852198660373688, + "learning_rate": 8e-05, + "loss": 1.7551, + "step": 3455 + }, + { + "epoch": 0.19264214046822742, + "grad_norm": 0.502672553062439, + "learning_rate": 8e-05, + "loss": 
1.81, + "step": 3456 + }, + { + "epoch": 0.1926978818283166, + "grad_norm": 0.47326627373695374, + "learning_rate": 8e-05, + "loss": 1.6708, + "step": 3457 + }, + { + "epoch": 0.1927536231884058, + "grad_norm": 0.4441857933998108, + "learning_rate": 8e-05, + "loss": 1.5861, + "step": 3458 + }, + { + "epoch": 0.19280936454849498, + "grad_norm": 0.4696533977985382, + "learning_rate": 8e-05, + "loss": 1.6572, + "step": 3459 + }, + { + "epoch": 0.19286510590858416, + "grad_norm": 0.444561243057251, + "learning_rate": 8e-05, + "loss": 1.5593, + "step": 3460 + }, + { + "epoch": 0.19292084726867337, + "grad_norm": 0.44416651129722595, + "learning_rate": 8e-05, + "loss": 1.4879, + "step": 3461 + }, + { + "epoch": 0.19297658862876255, + "grad_norm": 0.5351315140724182, + "learning_rate": 8e-05, + "loss": 1.8234, + "step": 3462 + }, + { + "epoch": 0.19303232998885173, + "grad_norm": 0.5529216527938843, + "learning_rate": 8e-05, + "loss": 1.623, + "step": 3463 + }, + { + "epoch": 0.1930880713489409, + "grad_norm": 0.4560251235961914, + "learning_rate": 8e-05, + "loss": 1.5238, + "step": 3464 + }, + { + "epoch": 0.1931438127090301, + "grad_norm": 0.4910484254360199, + "learning_rate": 8e-05, + "loss": 1.8426, + "step": 3465 + }, + { + "epoch": 0.1931995540691193, + "grad_norm": 0.5156372785568237, + "learning_rate": 8e-05, + "loss": 1.8463, + "step": 3466 + }, + { + "epoch": 0.19325529542920847, + "grad_norm": 0.5058777332305908, + "learning_rate": 8e-05, + "loss": 1.7118, + "step": 3467 + }, + { + "epoch": 0.19331103678929765, + "grad_norm": 0.5257939100265503, + "learning_rate": 8e-05, + "loss": 1.8323, + "step": 3468 + }, + { + "epoch": 0.19336677814938685, + "grad_norm": 0.5075682401657104, + "learning_rate": 8e-05, + "loss": 1.884, + "step": 3469 + }, + { + "epoch": 0.19342251950947603, + "grad_norm": 0.5267577171325684, + "learning_rate": 8e-05, + "loss": 1.7791, + "step": 3470 + }, + { + "epoch": 0.1934782608695652, + "grad_norm": 0.44919127225875854, + "learning_rate": 
8e-05, + "loss": 1.6597, + "step": 3471 + }, + { + "epoch": 0.19353400222965442, + "grad_norm": 0.4594336450099945, + "learning_rate": 8e-05, + "loss": 1.7039, + "step": 3472 + }, + { + "epoch": 0.1935897435897436, + "grad_norm": 0.4646263122558594, + "learning_rate": 8e-05, + "loss": 1.5838, + "step": 3473 + }, + { + "epoch": 0.19364548494983277, + "grad_norm": 0.5099480748176575, + "learning_rate": 8e-05, + "loss": 1.6858, + "step": 3474 + }, + { + "epoch": 0.19370122630992195, + "grad_norm": 0.5205021500587463, + "learning_rate": 8e-05, + "loss": 1.853, + "step": 3475 + }, + { + "epoch": 0.19375696767001116, + "grad_norm": 0.450195848941803, + "learning_rate": 8e-05, + "loss": 1.5248, + "step": 3476 + }, + { + "epoch": 0.19381270903010034, + "grad_norm": 0.49337515234947205, + "learning_rate": 8e-05, + "loss": 1.662, + "step": 3477 + }, + { + "epoch": 0.19386845039018952, + "grad_norm": 0.5189906358718872, + "learning_rate": 8e-05, + "loss": 1.8305, + "step": 3478 + }, + { + "epoch": 0.1939241917502787, + "grad_norm": 0.5012702345848083, + "learning_rate": 8e-05, + "loss": 1.656, + "step": 3479 + }, + { + "epoch": 0.1939799331103679, + "grad_norm": 0.4817003011703491, + "learning_rate": 8e-05, + "loss": 1.7924, + "step": 3480 + }, + { + "epoch": 0.19403567447045708, + "grad_norm": 0.45084530115127563, + "learning_rate": 8e-05, + "loss": 1.5779, + "step": 3481 + }, + { + "epoch": 0.19409141583054626, + "grad_norm": 0.5064246654510498, + "learning_rate": 8e-05, + "loss": 1.6494, + "step": 3482 + }, + { + "epoch": 0.19414715719063544, + "grad_norm": 0.515352725982666, + "learning_rate": 8e-05, + "loss": 1.7608, + "step": 3483 + }, + { + "epoch": 0.19420289855072465, + "grad_norm": 0.4773847162723541, + "learning_rate": 8e-05, + "loss": 1.9209, + "step": 3484 + }, + { + "epoch": 0.19425863991081382, + "grad_norm": 0.4580041766166687, + "learning_rate": 8e-05, + "loss": 1.6939, + "step": 3485 + }, + { + "epoch": 0.194314381270903, + "grad_norm": 0.5036474466323853, + 
"learning_rate": 8e-05, + "loss": 1.7424, + "step": 3486 + }, + { + "epoch": 0.1943701226309922, + "grad_norm": 0.5263992547988892, + "learning_rate": 8e-05, + "loss": 1.929, + "step": 3487 + }, + { + "epoch": 0.1944258639910814, + "grad_norm": 0.5149288177490234, + "learning_rate": 8e-05, + "loss": 1.7964, + "step": 3488 + }, + { + "epoch": 0.19448160535117057, + "grad_norm": 0.4722621738910675, + "learning_rate": 8e-05, + "loss": 1.6756, + "step": 3489 + }, + { + "epoch": 0.19453734671125975, + "grad_norm": 0.46639785170555115, + "learning_rate": 8e-05, + "loss": 1.6602, + "step": 3490 + }, + { + "epoch": 0.19459308807134895, + "grad_norm": 0.5215376019477844, + "learning_rate": 8e-05, + "loss": 1.8744, + "step": 3491 + }, + { + "epoch": 0.19464882943143813, + "grad_norm": 0.4881868362426758, + "learning_rate": 8e-05, + "loss": 1.8877, + "step": 3492 + }, + { + "epoch": 0.1947045707915273, + "grad_norm": 0.5139914751052856, + "learning_rate": 8e-05, + "loss": 1.8469, + "step": 3493 + }, + { + "epoch": 0.1947603121516165, + "grad_norm": 0.44681939482688904, + "learning_rate": 8e-05, + "loss": 1.5248, + "step": 3494 + }, + { + "epoch": 0.1948160535117057, + "grad_norm": 0.44363757967948914, + "learning_rate": 8e-05, + "loss": 1.6206, + "step": 3495 + }, + { + "epoch": 0.19487179487179487, + "grad_norm": 0.461874783039093, + "learning_rate": 8e-05, + "loss": 1.645, + "step": 3496 + }, + { + "epoch": 0.19492753623188405, + "grad_norm": 0.46725091338157654, + "learning_rate": 8e-05, + "loss": 1.7982, + "step": 3497 + }, + { + "epoch": 0.19498327759197323, + "grad_norm": 0.4530015289783478, + "learning_rate": 8e-05, + "loss": 1.5392, + "step": 3498 + }, + { + "epoch": 0.19503901895206244, + "grad_norm": 0.5092223882675171, + "learning_rate": 8e-05, + "loss": 1.7695, + "step": 3499 + }, + { + "epoch": 0.19509476031215162, + "grad_norm": 0.4746079444885254, + "learning_rate": 8e-05, + "loss": 1.7414, + "step": 3500 + }, + { + "epoch": 0.1951505016722408, + "grad_norm": 
0.5035505294799805, + "learning_rate": 8e-05, + "loss": 1.8174, + "step": 3501 + }, + { + "epoch": 0.19520624303233, + "grad_norm": 0.49879980087280273, + "learning_rate": 8e-05, + "loss": 1.7143, + "step": 3502 + }, + { + "epoch": 0.19526198439241918, + "grad_norm": 0.4672737419605255, + "learning_rate": 8e-05, + "loss": 1.5076, + "step": 3503 + }, + { + "epoch": 0.19531772575250836, + "grad_norm": 0.4909713864326477, + "learning_rate": 8e-05, + "loss": 1.9685, + "step": 3504 + }, + { + "epoch": 0.19537346711259754, + "grad_norm": 0.5099307298660278, + "learning_rate": 8e-05, + "loss": 1.6727, + "step": 3505 + }, + { + "epoch": 0.19542920847268674, + "grad_norm": 0.49376577138900757, + "learning_rate": 8e-05, + "loss": 1.8202, + "step": 3506 + }, + { + "epoch": 0.19548494983277592, + "grad_norm": 0.4841322898864746, + "learning_rate": 8e-05, + "loss": 1.9417, + "step": 3507 + }, + { + "epoch": 0.1955406911928651, + "grad_norm": 0.5126500725746155, + "learning_rate": 8e-05, + "loss": 1.7245, + "step": 3508 + }, + { + "epoch": 0.19559643255295428, + "grad_norm": 0.5002952814102173, + "learning_rate": 8e-05, + "loss": 1.8467, + "step": 3509 + }, + { + "epoch": 0.1956521739130435, + "grad_norm": 0.5159156322479248, + "learning_rate": 8e-05, + "loss": 1.6416, + "step": 3510 + }, + { + "epoch": 0.19570791527313267, + "grad_norm": 0.5152454972267151, + "learning_rate": 8e-05, + "loss": 1.5334, + "step": 3511 + }, + { + "epoch": 0.19576365663322184, + "grad_norm": 0.47474539279937744, + "learning_rate": 8e-05, + "loss": 1.6742, + "step": 3512 + }, + { + "epoch": 0.19581939799331102, + "grad_norm": 0.4961521327495575, + "learning_rate": 8e-05, + "loss": 1.6075, + "step": 3513 + }, + { + "epoch": 0.19587513935340023, + "grad_norm": 0.4723234176635742, + "learning_rate": 8e-05, + "loss": 1.6282, + "step": 3514 + }, + { + "epoch": 0.1959308807134894, + "grad_norm": 0.48595571517944336, + "learning_rate": 8e-05, + "loss": 1.7194, + "step": 3515 + }, + { + "epoch": 
0.1959866220735786, + "grad_norm": 0.4401930868625641, + "learning_rate": 8e-05, + "loss": 1.5628, + "step": 3516 + }, + { + "epoch": 0.1960423634336678, + "grad_norm": 0.493459016084671, + "learning_rate": 8e-05, + "loss": 1.6753, + "step": 3517 + }, + { + "epoch": 0.19609810479375697, + "grad_norm": 0.4308880567550659, + "learning_rate": 8e-05, + "loss": 1.4897, + "step": 3518 + }, + { + "epoch": 0.19615384615384615, + "grad_norm": 0.4599229097366333, + "learning_rate": 8e-05, + "loss": 1.4711, + "step": 3519 + }, + { + "epoch": 0.19620958751393533, + "grad_norm": 0.47723254561424255, + "learning_rate": 8e-05, + "loss": 1.6434, + "step": 3520 + }, + { + "epoch": 0.19626532887402454, + "grad_norm": 0.4884144365787506, + "learning_rate": 8e-05, + "loss": 1.6371, + "step": 3521 + }, + { + "epoch": 0.19632107023411371, + "grad_norm": 0.5237398147583008, + "learning_rate": 8e-05, + "loss": 1.8574, + "step": 3522 + }, + { + "epoch": 0.1963768115942029, + "grad_norm": 0.476162850856781, + "learning_rate": 8e-05, + "loss": 1.4318, + "step": 3523 + }, + { + "epoch": 0.19643255295429207, + "grad_norm": 0.48270082473754883, + "learning_rate": 8e-05, + "loss": 1.6951, + "step": 3524 + }, + { + "epoch": 0.19648829431438128, + "grad_norm": 0.4689558446407318, + "learning_rate": 8e-05, + "loss": 1.5397, + "step": 3525 + }, + { + "epoch": 0.19654403567447046, + "grad_norm": 0.4758528470993042, + "learning_rate": 8e-05, + "loss": 1.4821, + "step": 3526 + }, + { + "epoch": 0.19659977703455964, + "grad_norm": 0.5153961777687073, + "learning_rate": 8e-05, + "loss": 1.918, + "step": 3527 + }, + { + "epoch": 0.19665551839464884, + "grad_norm": 0.5305852890014648, + "learning_rate": 8e-05, + "loss": 1.8169, + "step": 3528 + }, + { + "epoch": 0.19671125975473802, + "grad_norm": 0.5135340094566345, + "learning_rate": 8e-05, + "loss": 1.8728, + "step": 3529 + }, + { + "epoch": 0.1967670011148272, + "grad_norm": 0.48879870772361755, + "learning_rate": 8e-05, + "loss": 1.7221, + "step": 
3530 + }, + { + "epoch": 0.19682274247491638, + "grad_norm": 0.4915723204612732, + "learning_rate": 8e-05, + "loss": 1.7132, + "step": 3531 + }, + { + "epoch": 0.19687848383500559, + "grad_norm": 0.49147656559944153, + "learning_rate": 8e-05, + "loss": 1.6383, + "step": 3532 + }, + { + "epoch": 0.19693422519509476, + "grad_norm": 0.5483487248420715, + "learning_rate": 8e-05, + "loss": 1.8147, + "step": 3533 + }, + { + "epoch": 0.19698996655518394, + "grad_norm": 0.4494275748729706, + "learning_rate": 8e-05, + "loss": 1.4576, + "step": 3534 + }, + { + "epoch": 0.19704570791527312, + "grad_norm": 0.5068910121917725, + "learning_rate": 8e-05, + "loss": 1.7459, + "step": 3535 + }, + { + "epoch": 0.19710144927536233, + "grad_norm": 0.4981747269630432, + "learning_rate": 8e-05, + "loss": 1.5986, + "step": 3536 + }, + { + "epoch": 0.1971571906354515, + "grad_norm": 0.47614431381225586, + "learning_rate": 8e-05, + "loss": 1.3079, + "step": 3537 + }, + { + "epoch": 0.19721293199554069, + "grad_norm": 0.48696333169937134, + "learning_rate": 8e-05, + "loss": 1.6743, + "step": 3538 + }, + { + "epoch": 0.19726867335562986, + "grad_norm": 0.4971964359283447, + "learning_rate": 8e-05, + "loss": 1.8352, + "step": 3539 + }, + { + "epoch": 0.19732441471571907, + "grad_norm": 0.5399619936943054, + "learning_rate": 8e-05, + "loss": 1.8265, + "step": 3540 + }, + { + "epoch": 0.19738015607580825, + "grad_norm": 0.50235515832901, + "learning_rate": 8e-05, + "loss": 1.7268, + "step": 3541 + }, + { + "epoch": 0.19743589743589743, + "grad_norm": 0.5028350353240967, + "learning_rate": 8e-05, + "loss": 1.6568, + "step": 3542 + }, + { + "epoch": 0.19749163879598663, + "grad_norm": 0.5096933841705322, + "learning_rate": 8e-05, + "loss": 1.6951, + "step": 3543 + }, + { + "epoch": 0.1975473801560758, + "grad_norm": 0.5013551115989685, + "learning_rate": 8e-05, + "loss": 1.5317, + "step": 3544 + }, + { + "epoch": 0.197603121516165, + "grad_norm": 0.5122877955436707, + "learning_rate": 8e-05, + 
"loss": 1.8456, + "step": 3545 + }, + { + "epoch": 0.19765886287625417, + "grad_norm": 0.5154286026954651, + "learning_rate": 8e-05, + "loss": 1.7878, + "step": 3546 + }, + { + "epoch": 0.19771460423634338, + "grad_norm": 0.5381590127944946, + "learning_rate": 8e-05, + "loss": 1.8982, + "step": 3547 + }, + { + "epoch": 0.19777034559643256, + "grad_norm": 0.5874319672584534, + "learning_rate": 8e-05, + "loss": 1.6731, + "step": 3548 + }, + { + "epoch": 0.19782608695652174, + "grad_norm": 0.5069901347160339, + "learning_rate": 8e-05, + "loss": 1.7125, + "step": 3549 + }, + { + "epoch": 0.1978818283166109, + "grad_norm": 0.4645526111125946, + "learning_rate": 8e-05, + "loss": 1.3551, + "step": 3550 + }, + { + "epoch": 0.19793756967670012, + "grad_norm": 0.4883759021759033, + "learning_rate": 8e-05, + "loss": 1.6016, + "step": 3551 + }, + { + "epoch": 0.1979933110367893, + "grad_norm": 0.49408966302871704, + "learning_rate": 8e-05, + "loss": 1.8289, + "step": 3552 + }, + { + "epoch": 0.19804905239687848, + "grad_norm": 0.5287270545959473, + "learning_rate": 8e-05, + "loss": 1.6608, + "step": 3553 + }, + { + "epoch": 0.19810479375696766, + "grad_norm": 0.48724809288978577, + "learning_rate": 8e-05, + "loss": 1.7525, + "step": 3554 + }, + { + "epoch": 0.19816053511705686, + "grad_norm": 0.48583024740219116, + "learning_rate": 8e-05, + "loss": 1.8145, + "step": 3555 + }, + { + "epoch": 0.19821627647714604, + "grad_norm": 0.5329705476760864, + "learning_rate": 8e-05, + "loss": 1.7537, + "step": 3556 + }, + { + "epoch": 0.19827201783723522, + "grad_norm": 0.5065861940383911, + "learning_rate": 8e-05, + "loss": 1.7054, + "step": 3557 + }, + { + "epoch": 0.19832775919732443, + "grad_norm": 0.48774999380111694, + "learning_rate": 8e-05, + "loss": 1.8509, + "step": 3558 + }, + { + "epoch": 0.1983835005574136, + "grad_norm": 0.5659329295158386, + "learning_rate": 8e-05, + "loss": 2.0576, + "step": 3559 + }, + { + "epoch": 0.19843924191750278, + "grad_norm": 0.4959155023097992, + 
"learning_rate": 8e-05, + "loss": 1.6683, + "step": 3560 + }, + { + "epoch": 0.19849498327759196, + "grad_norm": 0.5472777485847473, + "learning_rate": 8e-05, + "loss": 1.5919, + "step": 3561 + }, + { + "epoch": 0.19855072463768117, + "grad_norm": 0.49425390362739563, + "learning_rate": 8e-05, + "loss": 1.8153, + "step": 3562 + }, + { + "epoch": 0.19860646599777035, + "grad_norm": 0.5090674161911011, + "learning_rate": 8e-05, + "loss": 1.8033, + "step": 3563 + }, + { + "epoch": 0.19866220735785953, + "grad_norm": 0.45376265048980713, + "learning_rate": 8e-05, + "loss": 1.8647, + "step": 3564 + }, + { + "epoch": 0.1987179487179487, + "grad_norm": 0.5247412323951721, + "learning_rate": 8e-05, + "loss": 2.0173, + "step": 3565 + }, + { + "epoch": 0.1987736900780379, + "grad_norm": 0.5044318437576294, + "learning_rate": 8e-05, + "loss": 1.8077, + "step": 3566 + }, + { + "epoch": 0.1988294314381271, + "grad_norm": 0.49488723278045654, + "learning_rate": 8e-05, + "loss": 1.7759, + "step": 3567 + }, + { + "epoch": 0.19888517279821627, + "grad_norm": 0.46972551941871643, + "learning_rate": 8e-05, + "loss": 1.6848, + "step": 3568 + }, + { + "epoch": 0.19894091415830548, + "grad_norm": 0.49496331810951233, + "learning_rate": 8e-05, + "loss": 1.5929, + "step": 3569 + }, + { + "epoch": 0.19899665551839466, + "grad_norm": 0.5036976337432861, + "learning_rate": 8e-05, + "loss": 1.9322, + "step": 3570 + }, + { + "epoch": 0.19905239687848383, + "grad_norm": 0.5012423992156982, + "learning_rate": 8e-05, + "loss": 1.7887, + "step": 3571 + }, + { + "epoch": 0.199108138238573, + "grad_norm": 0.47637394070625305, + "learning_rate": 8e-05, + "loss": 1.5631, + "step": 3572 + }, + { + "epoch": 0.19916387959866222, + "grad_norm": 0.47292307019233704, + "learning_rate": 8e-05, + "loss": 1.6106, + "step": 3573 + }, + { + "epoch": 0.1992196209587514, + "grad_norm": 0.48646533489227295, + "learning_rate": 8e-05, + "loss": 1.7756, + "step": 3574 + }, + { + "epoch": 0.19927536231884058, + 
"grad_norm": 0.4798130691051483, + "learning_rate": 8e-05, + "loss": 1.5798, + "step": 3575 + }, + { + "epoch": 0.19933110367892976, + "grad_norm": 0.4687424600124359, + "learning_rate": 8e-05, + "loss": 1.6849, + "step": 3576 + }, + { + "epoch": 0.19938684503901896, + "grad_norm": 0.49993613362312317, + "learning_rate": 8e-05, + "loss": 1.7479, + "step": 3577 + }, + { + "epoch": 0.19944258639910814, + "grad_norm": 0.5133943557739258, + "learning_rate": 8e-05, + "loss": 1.7845, + "step": 3578 + }, + { + "epoch": 0.19949832775919732, + "grad_norm": 0.4577353596687317, + "learning_rate": 8e-05, + "loss": 1.6277, + "step": 3579 + }, + { + "epoch": 0.1995540691192865, + "grad_norm": 0.44345855712890625, + "learning_rate": 8e-05, + "loss": 1.5042, + "step": 3580 + }, + { + "epoch": 0.1996098104793757, + "grad_norm": 0.5210535526275635, + "learning_rate": 8e-05, + "loss": 1.8266, + "step": 3581 + }, + { + "epoch": 0.19966555183946488, + "grad_norm": 0.534353494644165, + "learning_rate": 8e-05, + "loss": 1.7317, + "step": 3582 + }, + { + "epoch": 0.19972129319955406, + "grad_norm": 0.531521737575531, + "learning_rate": 8e-05, + "loss": 1.6291, + "step": 3583 + }, + { + "epoch": 0.19977703455964327, + "grad_norm": 0.4943453073501587, + "learning_rate": 8e-05, + "loss": 1.5845, + "step": 3584 + }, + { + "epoch": 0.19983277591973245, + "grad_norm": 0.4547666907310486, + "learning_rate": 8e-05, + "loss": 1.7167, + "step": 3585 + }, + { + "epoch": 0.19988851727982163, + "grad_norm": 0.45870059728622437, + "learning_rate": 8e-05, + "loss": 1.646, + "step": 3586 + }, + { + "epoch": 0.1999442586399108, + "grad_norm": 0.5779637098312378, + "learning_rate": 8e-05, + "loss": 1.5718, + "step": 3587 + }, + { + "epoch": 0.2, + "grad_norm": 0.5157275795936584, + "learning_rate": 8e-05, + "loss": 1.7512, + "step": 3588 + }, + { + "epoch": 0.2000557413600892, + "grad_norm": 0.4938097596168518, + "learning_rate": 8e-05, + "loss": 1.701, + "step": 3589 + }, + { + "epoch": 
0.20011148272017837, + "grad_norm": 0.5136789083480835, + "learning_rate": 8e-05, + "loss": 1.7808, + "step": 3590 + }, + { + "epoch": 0.20016722408026755, + "grad_norm": 0.49631932377815247, + "learning_rate": 8e-05, + "loss": 1.6969, + "step": 3591 + }, + { + "epoch": 0.20022296544035675, + "grad_norm": 0.5018367767333984, + "learning_rate": 8e-05, + "loss": 1.7466, + "step": 3592 + }, + { + "epoch": 0.20027870680044593, + "grad_norm": 0.5183612704277039, + "learning_rate": 8e-05, + "loss": 1.8397, + "step": 3593 + }, + { + "epoch": 0.2003344481605351, + "grad_norm": 0.48466330766677856, + "learning_rate": 8e-05, + "loss": 1.7564, + "step": 3594 + }, + { + "epoch": 0.2003901895206243, + "grad_norm": 0.448240727186203, + "learning_rate": 8e-05, + "loss": 1.4317, + "step": 3595 + }, + { + "epoch": 0.2004459308807135, + "grad_norm": 0.51080721616745, + "learning_rate": 8e-05, + "loss": 1.743, + "step": 3596 + }, + { + "epoch": 0.20050167224080268, + "grad_norm": 0.5205346941947937, + "learning_rate": 8e-05, + "loss": 1.6999, + "step": 3597 + }, + { + "epoch": 0.20055741360089185, + "grad_norm": 0.4996013641357422, + "learning_rate": 8e-05, + "loss": 1.5763, + "step": 3598 + }, + { + "epoch": 0.20061315496098106, + "grad_norm": 0.465975284576416, + "learning_rate": 8e-05, + "loss": 1.7167, + "step": 3599 + }, + { + "epoch": 0.20066889632107024, + "grad_norm": 0.5416645407676697, + "learning_rate": 8e-05, + "loss": 1.8032, + "step": 3600 + }, + { + "epoch": 0.20072463768115942, + "grad_norm": 0.49942782521247864, + "learning_rate": 8e-05, + "loss": 1.4552, + "step": 3601 + }, + { + "epoch": 0.2007803790412486, + "grad_norm": 0.4765155017375946, + "learning_rate": 8e-05, + "loss": 1.5083, + "step": 3602 + }, + { + "epoch": 0.2008361204013378, + "grad_norm": 0.5089762806892395, + "learning_rate": 8e-05, + "loss": 1.8018, + "step": 3603 + }, + { + "epoch": 0.20089186176142698, + "grad_norm": 0.5311180949211121, + "learning_rate": 8e-05, + "loss": 1.3827, + "step": 3604 + 
}, + { + "epoch": 0.20094760312151616, + "grad_norm": 0.4367247521877289, + "learning_rate": 8e-05, + "loss": 1.433, + "step": 3605 + }, + { + "epoch": 0.20100334448160534, + "grad_norm": 0.5144723653793335, + "learning_rate": 8e-05, + "loss": 1.4288, + "step": 3606 + }, + { + "epoch": 0.20105908584169455, + "grad_norm": 0.5270749926567078, + "learning_rate": 8e-05, + "loss": 1.7999, + "step": 3607 + }, + { + "epoch": 0.20111482720178372, + "grad_norm": 0.4606470763683319, + "learning_rate": 8e-05, + "loss": 1.5438, + "step": 3608 + }, + { + "epoch": 0.2011705685618729, + "grad_norm": 0.5114845037460327, + "learning_rate": 8e-05, + "loss": 1.7967, + "step": 3609 + }, + { + "epoch": 0.20122630992196208, + "grad_norm": 0.4875860810279846, + "learning_rate": 8e-05, + "loss": 1.8787, + "step": 3610 + }, + { + "epoch": 0.2012820512820513, + "grad_norm": 0.5535544157028198, + "learning_rate": 8e-05, + "loss": 1.5494, + "step": 3611 + }, + { + "epoch": 0.20133779264214047, + "grad_norm": 0.47383052110671997, + "learning_rate": 8e-05, + "loss": 1.6689, + "step": 3612 + }, + { + "epoch": 0.20139353400222965, + "grad_norm": 0.4879821836948395, + "learning_rate": 8e-05, + "loss": 1.5114, + "step": 3613 + }, + { + "epoch": 0.20144927536231885, + "grad_norm": 0.5040041208267212, + "learning_rate": 8e-05, + "loss": 1.6116, + "step": 3614 + }, + { + "epoch": 0.20150501672240803, + "grad_norm": 0.4656704068183899, + "learning_rate": 8e-05, + "loss": 1.6972, + "step": 3615 + }, + { + "epoch": 0.2015607580824972, + "grad_norm": 0.47616857290267944, + "learning_rate": 8e-05, + "loss": 1.8055, + "step": 3616 + }, + { + "epoch": 0.2016164994425864, + "grad_norm": 0.4548301696777344, + "learning_rate": 8e-05, + "loss": 1.742, + "step": 3617 + }, + { + "epoch": 0.2016722408026756, + "grad_norm": 0.4619283676147461, + "learning_rate": 8e-05, + "loss": 1.6631, + "step": 3618 + }, + { + "epoch": 0.20172798216276477, + "grad_norm": 0.4462347626686096, + "learning_rate": 8e-05, + "loss": 
1.5333, + "step": 3619 + }, + { + "epoch": 0.20178372352285395, + "grad_norm": 0.523777961730957, + "learning_rate": 8e-05, + "loss": 1.734, + "step": 3620 + }, + { + "epoch": 0.20183946488294313, + "grad_norm": 0.4466857612133026, + "learning_rate": 8e-05, + "loss": 1.4318, + "step": 3621 + }, + { + "epoch": 0.20189520624303234, + "grad_norm": 0.5048567056655884, + "learning_rate": 8e-05, + "loss": 1.7637, + "step": 3622 + }, + { + "epoch": 0.20195094760312152, + "grad_norm": 0.5019554495811462, + "learning_rate": 8e-05, + "loss": 1.6489, + "step": 3623 + }, + { + "epoch": 0.2020066889632107, + "grad_norm": 0.5542216897010803, + "learning_rate": 8e-05, + "loss": 1.9606, + "step": 3624 + }, + { + "epoch": 0.2020624303232999, + "grad_norm": 0.4789694547653198, + "learning_rate": 8e-05, + "loss": 1.786, + "step": 3625 + }, + { + "epoch": 0.20211817168338908, + "grad_norm": 0.5140082240104675, + "learning_rate": 8e-05, + "loss": 1.8251, + "step": 3626 + }, + { + "epoch": 0.20217391304347826, + "grad_norm": 0.4956878125667572, + "learning_rate": 8e-05, + "loss": 1.8947, + "step": 3627 + }, + { + "epoch": 0.20222965440356744, + "grad_norm": 0.541315495967865, + "learning_rate": 8e-05, + "loss": 1.9393, + "step": 3628 + }, + { + "epoch": 0.20228539576365664, + "grad_norm": 0.4940366744995117, + "learning_rate": 8e-05, + "loss": 1.8449, + "step": 3629 + }, + { + "epoch": 0.20234113712374582, + "grad_norm": 0.5367388725280762, + "learning_rate": 8e-05, + "loss": 1.9762, + "step": 3630 + }, + { + "epoch": 0.202396878483835, + "grad_norm": 0.4210890829563141, + "learning_rate": 8e-05, + "loss": 1.4644, + "step": 3631 + }, + { + "epoch": 0.20245261984392418, + "grad_norm": 0.4935843348503113, + "learning_rate": 8e-05, + "loss": 1.8778, + "step": 3632 + }, + { + "epoch": 0.2025083612040134, + "grad_norm": 0.4714401960372925, + "learning_rate": 8e-05, + "loss": 1.6896, + "step": 3633 + }, + { + "epoch": 0.20256410256410257, + "grad_norm": 0.5045357346534729, + "learning_rate": 
8e-05, + "loss": 1.8094, + "step": 3634 + }, + { + "epoch": 0.20261984392419174, + "grad_norm": 0.5472859740257263, + "learning_rate": 8e-05, + "loss": 1.8484, + "step": 3635 + }, + { + "epoch": 0.20267558528428092, + "grad_norm": 0.4495568573474884, + "learning_rate": 8e-05, + "loss": 1.3515, + "step": 3636 + }, + { + "epoch": 0.20273132664437013, + "grad_norm": 0.478530615568161, + "learning_rate": 8e-05, + "loss": 1.716, + "step": 3637 + }, + { + "epoch": 0.2027870680044593, + "grad_norm": 0.4960003197193146, + "learning_rate": 8e-05, + "loss": 1.7212, + "step": 3638 + }, + { + "epoch": 0.2028428093645485, + "grad_norm": 0.44264334440231323, + "learning_rate": 8e-05, + "loss": 1.5679, + "step": 3639 + }, + { + "epoch": 0.2028985507246377, + "grad_norm": 0.5412372350692749, + "learning_rate": 8e-05, + "loss": 2.0983, + "step": 3640 + }, + { + "epoch": 0.20295429208472687, + "grad_norm": 0.5256844758987427, + "learning_rate": 8e-05, + "loss": 1.5816, + "step": 3641 + }, + { + "epoch": 0.20301003344481605, + "grad_norm": 0.47482818365097046, + "learning_rate": 8e-05, + "loss": 1.4953, + "step": 3642 + }, + { + "epoch": 0.20306577480490523, + "grad_norm": 0.4723987281322479, + "learning_rate": 8e-05, + "loss": 1.7102, + "step": 3643 + }, + { + "epoch": 0.20312151616499444, + "grad_norm": 0.454328328371048, + "learning_rate": 8e-05, + "loss": 1.6628, + "step": 3644 + }, + { + "epoch": 0.20317725752508362, + "grad_norm": 0.46709755063056946, + "learning_rate": 8e-05, + "loss": 1.6382, + "step": 3645 + }, + { + "epoch": 0.2032329988851728, + "grad_norm": 0.48052144050598145, + "learning_rate": 8e-05, + "loss": 1.4057, + "step": 3646 + }, + { + "epoch": 0.20328874024526197, + "grad_norm": 0.5479748845100403, + "learning_rate": 8e-05, + "loss": 1.8707, + "step": 3647 + }, + { + "epoch": 0.20334448160535118, + "grad_norm": 0.4681054651737213, + "learning_rate": 8e-05, + "loss": 1.4069, + "step": 3648 + }, + { + "epoch": 0.20340022296544036, + "grad_norm": 
0.4966769516468048, + "learning_rate": 8e-05, + "loss": 1.6872, + "step": 3649 + }, + { + "epoch": 0.20345596432552954, + "grad_norm": 0.5099707245826721, + "learning_rate": 8e-05, + "loss": 1.5968, + "step": 3650 + }, + { + "epoch": 0.20351170568561872, + "grad_norm": 0.5672361254692078, + "learning_rate": 8e-05, + "loss": 1.8859, + "step": 3651 + }, + { + "epoch": 0.20356744704570792, + "grad_norm": 0.5006271004676819, + "learning_rate": 8e-05, + "loss": 1.6707, + "step": 3652 + }, + { + "epoch": 0.2036231884057971, + "grad_norm": 0.5286397337913513, + "learning_rate": 8e-05, + "loss": 1.7566, + "step": 3653 + }, + { + "epoch": 0.20367892976588628, + "grad_norm": 0.5863698720932007, + "learning_rate": 8e-05, + "loss": 1.883, + "step": 3654 + }, + { + "epoch": 0.20373467112597549, + "grad_norm": 0.47385674715042114, + "learning_rate": 8e-05, + "loss": 1.6727, + "step": 3655 + }, + { + "epoch": 0.20379041248606466, + "grad_norm": 0.490665465593338, + "learning_rate": 8e-05, + "loss": 1.7301, + "step": 3656 + }, + { + "epoch": 0.20384615384615384, + "grad_norm": 0.49912935495376587, + "learning_rate": 8e-05, + "loss": 1.6501, + "step": 3657 + }, + { + "epoch": 0.20390189520624302, + "grad_norm": 0.4729326367378235, + "learning_rate": 8e-05, + "loss": 1.5658, + "step": 3658 + }, + { + "epoch": 0.20395763656633223, + "grad_norm": 0.4842578172683716, + "learning_rate": 8e-05, + "loss": 1.5118, + "step": 3659 + }, + { + "epoch": 0.2040133779264214, + "grad_norm": 0.494857519865036, + "learning_rate": 8e-05, + "loss": 1.7177, + "step": 3660 + }, + { + "epoch": 0.2040691192865106, + "grad_norm": 0.4974959194660187, + "learning_rate": 8e-05, + "loss": 1.668, + "step": 3661 + }, + { + "epoch": 0.20412486064659977, + "grad_norm": 0.5402908325195312, + "learning_rate": 8e-05, + "loss": 1.9019, + "step": 3662 + }, + { + "epoch": 0.20418060200668897, + "grad_norm": 0.5406147241592407, + "learning_rate": 8e-05, + "loss": 1.7114, + "step": 3663 + }, + { + "epoch": 
0.20423634336677815, + "grad_norm": 0.47465139627456665, + "learning_rate": 8e-05, + "loss": 1.707, + "step": 3664 + }, + { + "epoch": 0.20429208472686733, + "grad_norm": 0.5029194355010986, + "learning_rate": 8e-05, + "loss": 1.6362, + "step": 3665 + }, + { + "epoch": 0.20434782608695654, + "grad_norm": 0.49449896812438965, + "learning_rate": 8e-05, + "loss": 1.6479, + "step": 3666 + }, + { + "epoch": 0.20440356744704571, + "grad_norm": 0.5273813009262085, + "learning_rate": 8e-05, + "loss": 1.8518, + "step": 3667 + }, + { + "epoch": 0.2044593088071349, + "grad_norm": 0.4844883382320404, + "learning_rate": 8e-05, + "loss": 1.8226, + "step": 3668 + }, + { + "epoch": 0.20451505016722407, + "grad_norm": 0.5077101588249207, + "learning_rate": 8e-05, + "loss": 1.8068, + "step": 3669 + }, + { + "epoch": 0.20457079152731328, + "grad_norm": 0.48901060223579407, + "learning_rate": 8e-05, + "loss": 1.5398, + "step": 3670 + }, + { + "epoch": 0.20462653288740246, + "grad_norm": 0.43883004784584045, + "learning_rate": 8e-05, + "loss": 1.3666, + "step": 3671 + }, + { + "epoch": 0.20468227424749164, + "grad_norm": 0.4607550799846649, + "learning_rate": 8e-05, + "loss": 1.4984, + "step": 3672 + }, + { + "epoch": 0.20473801560758081, + "grad_norm": 0.4580022096633911, + "learning_rate": 8e-05, + "loss": 1.725, + "step": 3673 + }, + { + "epoch": 0.20479375696767002, + "grad_norm": 0.47610732913017273, + "learning_rate": 8e-05, + "loss": 1.7055, + "step": 3674 + }, + { + "epoch": 0.2048494983277592, + "grad_norm": 0.48338666558265686, + "learning_rate": 8e-05, + "loss": 1.8632, + "step": 3675 + }, + { + "epoch": 0.20490523968784838, + "grad_norm": 0.4903329312801361, + "learning_rate": 8e-05, + "loss": 1.4769, + "step": 3676 + }, + { + "epoch": 0.20496098104793756, + "grad_norm": 0.5103690028190613, + "learning_rate": 8e-05, + "loss": 1.6833, + "step": 3677 + }, + { + "epoch": 0.20501672240802676, + "grad_norm": 0.5390543937683105, + "learning_rate": 8e-05, + "loss": 1.8047, + 
"step": 3678 + }, + { + "epoch": 0.20507246376811594, + "grad_norm": 0.4767552614212036, + "learning_rate": 8e-05, + "loss": 1.7662, + "step": 3679 + }, + { + "epoch": 0.20512820512820512, + "grad_norm": 0.5675135254859924, + "learning_rate": 8e-05, + "loss": 1.7962, + "step": 3680 + }, + { + "epoch": 0.20518394648829433, + "grad_norm": 0.521496593952179, + "learning_rate": 8e-05, + "loss": 1.7376, + "step": 3681 + }, + { + "epoch": 0.2052396878483835, + "grad_norm": 0.5281379818916321, + "learning_rate": 8e-05, + "loss": 1.6355, + "step": 3682 + }, + { + "epoch": 0.20529542920847269, + "grad_norm": 0.47564199566841125, + "learning_rate": 8e-05, + "loss": 1.5621, + "step": 3683 + }, + { + "epoch": 0.20535117056856186, + "grad_norm": 0.4812081754207611, + "learning_rate": 8e-05, + "loss": 1.6206, + "step": 3684 + }, + { + "epoch": 0.20540691192865107, + "grad_norm": 0.49556106328964233, + "learning_rate": 8e-05, + "loss": 1.7597, + "step": 3685 + }, + { + "epoch": 0.20546265328874025, + "grad_norm": 0.5186765789985657, + "learning_rate": 8e-05, + "loss": 1.6219, + "step": 3686 + }, + { + "epoch": 0.20551839464882943, + "grad_norm": 0.5270774960517883, + "learning_rate": 8e-05, + "loss": 1.7674, + "step": 3687 + }, + { + "epoch": 0.2055741360089186, + "grad_norm": 0.5229092240333557, + "learning_rate": 8e-05, + "loss": 1.9134, + "step": 3688 + }, + { + "epoch": 0.2056298773690078, + "grad_norm": 0.5199306607246399, + "learning_rate": 8e-05, + "loss": 1.9065, + "step": 3689 + }, + { + "epoch": 0.205685618729097, + "grad_norm": 0.5622035264968872, + "learning_rate": 8e-05, + "loss": 1.7641, + "step": 3690 + }, + { + "epoch": 0.20574136008918617, + "grad_norm": 0.5008182525634766, + "learning_rate": 8e-05, + "loss": 1.6597, + "step": 3691 + }, + { + "epoch": 0.20579710144927535, + "grad_norm": 0.46879804134368896, + "learning_rate": 8e-05, + "loss": 1.4784, + "step": 3692 + }, + { + "epoch": 0.20585284280936456, + "grad_norm": 0.4561975300312042, + "learning_rate": 
8e-05, + "loss": 1.5148, + "step": 3693 + }, + { + "epoch": 0.20590858416945373, + "grad_norm": 0.5250406265258789, + "learning_rate": 8e-05, + "loss": 1.8827, + "step": 3694 + }, + { + "epoch": 0.2059643255295429, + "grad_norm": 0.48637449741363525, + "learning_rate": 8e-05, + "loss": 1.6882, + "step": 3695 + }, + { + "epoch": 0.20602006688963212, + "grad_norm": 0.5028102397918701, + "learning_rate": 8e-05, + "loss": 1.9182, + "step": 3696 + }, + { + "epoch": 0.2060758082497213, + "grad_norm": 0.48546063899993896, + "learning_rate": 8e-05, + "loss": 1.7412, + "step": 3697 + }, + { + "epoch": 0.20613154960981048, + "grad_norm": 0.46001705527305603, + "learning_rate": 8e-05, + "loss": 1.765, + "step": 3698 + }, + { + "epoch": 0.20618729096989966, + "grad_norm": 0.4724488854408264, + "learning_rate": 8e-05, + "loss": 1.6078, + "step": 3699 + }, + { + "epoch": 0.20624303232998886, + "grad_norm": 0.51436448097229, + "learning_rate": 8e-05, + "loss": 1.7974, + "step": 3700 + }, + { + "epoch": 0.20629877369007804, + "grad_norm": 0.5569310784339905, + "learning_rate": 8e-05, + "loss": 1.9332, + "step": 3701 + }, + { + "epoch": 0.20635451505016722, + "grad_norm": 0.5193493366241455, + "learning_rate": 8e-05, + "loss": 1.7944, + "step": 3702 + }, + { + "epoch": 0.2064102564102564, + "grad_norm": 0.5052384734153748, + "learning_rate": 8e-05, + "loss": 1.84, + "step": 3703 + }, + { + "epoch": 0.2064659977703456, + "grad_norm": 0.4906817674636841, + "learning_rate": 8e-05, + "loss": 1.6263, + "step": 3704 + }, + { + "epoch": 0.20652173913043478, + "grad_norm": 0.48959335684776306, + "learning_rate": 8e-05, + "loss": 1.6381, + "step": 3705 + }, + { + "epoch": 0.20657748049052396, + "grad_norm": 0.4935573935508728, + "learning_rate": 8e-05, + "loss": 1.7671, + "step": 3706 + }, + { + "epoch": 0.20663322185061314, + "grad_norm": 0.5062634348869324, + "learning_rate": 8e-05, + "loss": 1.8406, + "step": 3707 + }, + { + "epoch": 0.20668896321070235, + "grad_norm": 
0.5000908970832825, + "learning_rate": 8e-05, + "loss": 1.7604, + "step": 3708 + }, + { + "epoch": 0.20674470457079153, + "grad_norm": 0.5079050660133362, + "learning_rate": 8e-05, + "loss": 1.8167, + "step": 3709 + }, + { + "epoch": 0.2068004459308807, + "grad_norm": 0.48654088377952576, + "learning_rate": 8e-05, + "loss": 1.7648, + "step": 3710 + }, + { + "epoch": 0.2068561872909699, + "grad_norm": 0.5441576838493347, + "learning_rate": 8e-05, + "loss": 2.0396, + "step": 3711 + }, + { + "epoch": 0.2069119286510591, + "grad_norm": 0.48717156052589417, + "learning_rate": 8e-05, + "loss": 1.8172, + "step": 3712 + }, + { + "epoch": 0.20696767001114827, + "grad_norm": 0.4929221272468567, + "learning_rate": 8e-05, + "loss": 1.5783, + "step": 3713 + }, + { + "epoch": 0.20702341137123745, + "grad_norm": 0.5209141969680786, + "learning_rate": 8e-05, + "loss": 1.9343, + "step": 3714 + }, + { + "epoch": 0.20707915273132665, + "grad_norm": 0.4781465530395508, + "learning_rate": 8e-05, + "loss": 1.8023, + "step": 3715 + }, + { + "epoch": 0.20713489409141583, + "grad_norm": 0.475308358669281, + "learning_rate": 8e-05, + "loss": 1.7513, + "step": 3716 + }, + { + "epoch": 0.207190635451505, + "grad_norm": 0.48245689272880554, + "learning_rate": 8e-05, + "loss": 1.6194, + "step": 3717 + }, + { + "epoch": 0.2072463768115942, + "grad_norm": 0.48839378356933594, + "learning_rate": 8e-05, + "loss": 1.6131, + "step": 3718 + }, + { + "epoch": 0.2073021181716834, + "grad_norm": 0.49294933676719666, + "learning_rate": 8e-05, + "loss": 1.8494, + "step": 3719 + }, + { + "epoch": 0.20735785953177258, + "grad_norm": 0.46077826619148254, + "learning_rate": 8e-05, + "loss": 1.6807, + "step": 3720 + }, + { + "epoch": 0.20741360089186175, + "grad_norm": 0.49710530042648315, + "learning_rate": 8e-05, + "loss": 1.8692, + "step": 3721 + }, + { + "epoch": 0.20746934225195096, + "grad_norm": 0.5080289244651794, + "learning_rate": 8e-05, + "loss": 1.8348, + "step": 3722 + }, + { + "epoch": 
0.20752508361204014, + "grad_norm": 0.47977977991104126, + "learning_rate": 8e-05, + "loss": 1.8032, + "step": 3723 + }, + { + "epoch": 0.20758082497212932, + "grad_norm": 0.5408175587654114, + "learning_rate": 8e-05, + "loss": 1.8521, + "step": 3724 + }, + { + "epoch": 0.2076365663322185, + "grad_norm": 0.45105478167533875, + "learning_rate": 8e-05, + "loss": 1.5664, + "step": 3725 + }, + { + "epoch": 0.2076923076923077, + "grad_norm": 0.5026746988296509, + "learning_rate": 8e-05, + "loss": 1.9063, + "step": 3726 + }, + { + "epoch": 0.20774804905239688, + "grad_norm": 0.493769109249115, + "learning_rate": 8e-05, + "loss": 1.7472, + "step": 3727 + }, + { + "epoch": 0.20780379041248606, + "grad_norm": 0.4682243764400482, + "learning_rate": 8e-05, + "loss": 1.5133, + "step": 3728 + }, + { + "epoch": 0.20785953177257524, + "grad_norm": 0.5349406599998474, + "learning_rate": 8e-05, + "loss": 1.7283, + "step": 3729 + }, + { + "epoch": 0.20791527313266445, + "grad_norm": 0.46482259035110474, + "learning_rate": 8e-05, + "loss": 1.3779, + "step": 3730 + }, + { + "epoch": 0.20797101449275363, + "grad_norm": 0.5020154714584351, + "learning_rate": 8e-05, + "loss": 1.8065, + "step": 3731 + }, + { + "epoch": 0.2080267558528428, + "grad_norm": 0.5125617980957031, + "learning_rate": 8e-05, + "loss": 1.818, + "step": 3732 + }, + { + "epoch": 0.20808249721293198, + "grad_norm": 0.4770820736885071, + "learning_rate": 8e-05, + "loss": 1.4732, + "step": 3733 + }, + { + "epoch": 0.2081382385730212, + "grad_norm": 0.5883281230926514, + "learning_rate": 8e-05, + "loss": 2.0237, + "step": 3734 + }, + { + "epoch": 0.20819397993311037, + "grad_norm": 0.5591292977333069, + "learning_rate": 8e-05, + "loss": 1.7542, + "step": 3735 + }, + { + "epoch": 0.20824972129319955, + "grad_norm": 0.5060874223709106, + "learning_rate": 8e-05, + "loss": 1.866, + "step": 3736 + }, + { + "epoch": 0.20830546265328875, + "grad_norm": 0.5743696093559265, + "learning_rate": 8e-05, + "loss": 1.8753, + "step": 
3737 + }, + { + "epoch": 0.20836120401337793, + "grad_norm": 0.48902419209480286, + "learning_rate": 8e-05, + "loss": 1.6463, + "step": 3738 + }, + { + "epoch": 0.2084169453734671, + "grad_norm": 0.5021654367446899, + "learning_rate": 8e-05, + "loss": 1.5628, + "step": 3739 + }, + { + "epoch": 0.2084726867335563, + "grad_norm": 0.5163289308547974, + "learning_rate": 8e-05, + "loss": 1.8731, + "step": 3740 + }, + { + "epoch": 0.2085284280936455, + "grad_norm": 0.4969714283943176, + "learning_rate": 8e-05, + "loss": 1.5473, + "step": 3741 + }, + { + "epoch": 0.20858416945373467, + "grad_norm": 0.4648710787296295, + "learning_rate": 8e-05, + "loss": 1.4971, + "step": 3742 + }, + { + "epoch": 0.20863991081382385, + "grad_norm": 0.5347151160240173, + "learning_rate": 8e-05, + "loss": 1.7845, + "step": 3743 + }, + { + "epoch": 0.20869565217391303, + "grad_norm": 0.4412599503993988, + "learning_rate": 8e-05, + "loss": 1.4139, + "step": 3744 + }, + { + "epoch": 0.20875139353400224, + "grad_norm": 0.49814075231552124, + "learning_rate": 8e-05, + "loss": 1.7027, + "step": 3745 + }, + { + "epoch": 0.20880713489409142, + "grad_norm": 0.5593386888504028, + "learning_rate": 8e-05, + "loss": 1.7623, + "step": 3746 + }, + { + "epoch": 0.2088628762541806, + "grad_norm": 0.5205774307250977, + "learning_rate": 8e-05, + "loss": 1.8267, + "step": 3747 + }, + { + "epoch": 0.20891861761426977, + "grad_norm": 0.5357809662818909, + "learning_rate": 8e-05, + "loss": 1.8368, + "step": 3748 + }, + { + "epoch": 0.20897435897435898, + "grad_norm": 0.530596911907196, + "learning_rate": 8e-05, + "loss": 1.5877, + "step": 3749 + }, + { + "epoch": 0.20903010033444816, + "grad_norm": 0.5513470768928528, + "learning_rate": 8e-05, + "loss": 1.9045, + "step": 3750 + }, + { + "epoch": 0.20908584169453734, + "grad_norm": 0.5307799577713013, + "learning_rate": 8e-05, + "loss": 1.8905, + "step": 3751 + }, + { + "epoch": 0.20914158305462655, + "grad_norm": 0.45953071117401123, + "learning_rate": 8e-05, + 
"loss": 1.6001, + "step": 3752 + }, + { + "epoch": 0.20919732441471572, + "grad_norm": 0.5116471648216248, + "learning_rate": 8e-05, + "loss": 1.4961, + "step": 3753 + }, + { + "epoch": 0.2092530657748049, + "grad_norm": 0.4981320798397064, + "learning_rate": 8e-05, + "loss": 1.6046, + "step": 3754 + }, + { + "epoch": 0.20930880713489408, + "grad_norm": 0.5089046955108643, + "learning_rate": 8e-05, + "loss": 1.7513, + "step": 3755 + }, + { + "epoch": 0.2093645484949833, + "grad_norm": 0.6006644368171692, + "learning_rate": 8e-05, + "loss": 1.914, + "step": 3756 + }, + { + "epoch": 0.20942028985507247, + "grad_norm": 0.534820020198822, + "learning_rate": 8e-05, + "loss": 1.6961, + "step": 3757 + }, + { + "epoch": 0.20947603121516165, + "grad_norm": 0.499206006526947, + "learning_rate": 8e-05, + "loss": 1.7807, + "step": 3758 + }, + { + "epoch": 0.20953177257525082, + "grad_norm": 0.47656241059303284, + "learning_rate": 8e-05, + "loss": 1.6846, + "step": 3759 + }, + { + "epoch": 0.20958751393534003, + "grad_norm": 0.4986822307109833, + "learning_rate": 8e-05, + "loss": 1.685, + "step": 3760 + }, + { + "epoch": 0.2096432552954292, + "grad_norm": 0.5661086440086365, + "learning_rate": 8e-05, + "loss": 1.8754, + "step": 3761 + }, + { + "epoch": 0.2096989966555184, + "grad_norm": 0.47916561365127563, + "learning_rate": 8e-05, + "loss": 1.6673, + "step": 3762 + }, + { + "epoch": 0.2097547380156076, + "grad_norm": 0.5194079875946045, + "learning_rate": 8e-05, + "loss": 1.8266, + "step": 3763 + }, + { + "epoch": 0.20981047937569677, + "grad_norm": 0.5287598967552185, + "learning_rate": 8e-05, + "loss": 1.8464, + "step": 3764 + }, + { + "epoch": 0.20986622073578595, + "grad_norm": 0.5045037269592285, + "learning_rate": 8e-05, + "loss": 1.8192, + "step": 3765 + }, + { + "epoch": 0.20992196209587513, + "grad_norm": 0.4872746169567108, + "learning_rate": 8e-05, + "loss": 1.6454, + "step": 3766 + }, + { + "epoch": 0.20997770345596434, + "grad_norm": 0.5383753180503845, + 
"learning_rate": 8e-05, + "loss": 2.1398, + "step": 3767 + }, + { + "epoch": 0.21003344481605352, + "grad_norm": 0.5127830505371094, + "learning_rate": 8e-05, + "loss": 1.6806, + "step": 3768 + }, + { + "epoch": 0.2100891861761427, + "grad_norm": 0.4865786135196686, + "learning_rate": 8e-05, + "loss": 1.8012, + "step": 3769 + }, + { + "epoch": 0.21014492753623187, + "grad_norm": 0.5016859173774719, + "learning_rate": 8e-05, + "loss": 1.7015, + "step": 3770 + }, + { + "epoch": 0.21020066889632108, + "grad_norm": 0.5375359654426575, + "learning_rate": 8e-05, + "loss": 2.0587, + "step": 3771 + }, + { + "epoch": 0.21025641025641026, + "grad_norm": 0.5184040665626526, + "learning_rate": 8e-05, + "loss": 1.7752, + "step": 3772 + }, + { + "epoch": 0.21031215161649944, + "grad_norm": 0.5094031095504761, + "learning_rate": 8e-05, + "loss": 1.7054, + "step": 3773 + }, + { + "epoch": 0.21036789297658862, + "grad_norm": 0.4853542745113373, + "learning_rate": 8e-05, + "loss": 1.7475, + "step": 3774 + }, + { + "epoch": 0.21042363433667782, + "grad_norm": 0.45781707763671875, + "learning_rate": 8e-05, + "loss": 1.7281, + "step": 3775 + }, + { + "epoch": 0.210479375696767, + "grad_norm": 0.48402369022369385, + "learning_rate": 8e-05, + "loss": 1.7368, + "step": 3776 + }, + { + "epoch": 0.21053511705685618, + "grad_norm": 0.46384137868881226, + "learning_rate": 8e-05, + "loss": 1.6851, + "step": 3777 + }, + { + "epoch": 0.2105908584169454, + "grad_norm": 0.5177857875823975, + "learning_rate": 8e-05, + "loss": 1.7237, + "step": 3778 + }, + { + "epoch": 0.21064659977703457, + "grad_norm": 0.5443496108055115, + "learning_rate": 8e-05, + "loss": 1.7854, + "step": 3779 + }, + { + "epoch": 0.21070234113712374, + "grad_norm": 0.5036930441856384, + "learning_rate": 8e-05, + "loss": 1.7195, + "step": 3780 + }, + { + "epoch": 0.21075808249721292, + "grad_norm": 0.5116427540779114, + "learning_rate": 8e-05, + "loss": 1.6763, + "step": 3781 + }, + { + "epoch": 0.21081382385730213, + 
"grad_norm": 0.5227110981941223, + "learning_rate": 8e-05, + "loss": 1.8153, + "step": 3782 + }, + { + "epoch": 0.2108695652173913, + "grad_norm": 0.49027928709983826, + "learning_rate": 8e-05, + "loss": 1.7977, + "step": 3783 + }, + { + "epoch": 0.2109253065774805, + "grad_norm": 0.4768216609954834, + "learning_rate": 8e-05, + "loss": 1.6186, + "step": 3784 + }, + { + "epoch": 0.21098104793756967, + "grad_norm": 0.47161638736724854, + "learning_rate": 8e-05, + "loss": 1.7054, + "step": 3785 + }, + { + "epoch": 0.21103678929765887, + "grad_norm": 0.46826526522636414, + "learning_rate": 8e-05, + "loss": 1.5809, + "step": 3786 + }, + { + "epoch": 0.21109253065774805, + "grad_norm": 0.5086841583251953, + "learning_rate": 8e-05, + "loss": 1.7624, + "step": 3787 + }, + { + "epoch": 0.21114827201783723, + "grad_norm": 0.5066002607345581, + "learning_rate": 8e-05, + "loss": 2.0431, + "step": 3788 + }, + { + "epoch": 0.2112040133779264, + "grad_norm": 0.49868473410606384, + "learning_rate": 8e-05, + "loss": 1.7682, + "step": 3789 + }, + { + "epoch": 0.21125975473801561, + "grad_norm": 0.5334035158157349, + "learning_rate": 8e-05, + "loss": 1.8199, + "step": 3790 + }, + { + "epoch": 0.2113154960981048, + "grad_norm": 0.5027373433113098, + "learning_rate": 8e-05, + "loss": 1.7154, + "step": 3791 + }, + { + "epoch": 0.21137123745819397, + "grad_norm": 0.48657071590423584, + "learning_rate": 8e-05, + "loss": 1.6964, + "step": 3792 + }, + { + "epoch": 0.21142697881828318, + "grad_norm": 0.4769059717655182, + "learning_rate": 8e-05, + "loss": 1.4626, + "step": 3793 + }, + { + "epoch": 0.21148272017837236, + "grad_norm": 0.47266313433647156, + "learning_rate": 8e-05, + "loss": 1.7482, + "step": 3794 + }, + { + "epoch": 0.21153846153846154, + "grad_norm": 0.5105955004692078, + "learning_rate": 8e-05, + "loss": 1.7192, + "step": 3795 + }, + { + "epoch": 0.21159420289855072, + "grad_norm": 0.474439799785614, + "learning_rate": 8e-05, + "loss": 1.6764, + "step": 3796 + }, + { + 
"epoch": 0.21164994425863992, + "grad_norm": 0.5234792232513428, + "learning_rate": 8e-05, + "loss": 1.9227, + "step": 3797 + }, + { + "epoch": 0.2117056856187291, + "grad_norm": 0.47840628027915955, + "learning_rate": 8e-05, + "loss": 1.7192, + "step": 3798 + }, + { + "epoch": 0.21176142697881828, + "grad_norm": 0.5176959037780762, + "learning_rate": 8e-05, + "loss": 1.8396, + "step": 3799 + }, + { + "epoch": 0.21181716833890746, + "grad_norm": 0.5095927119255066, + "learning_rate": 8e-05, + "loss": 1.7745, + "step": 3800 + }, + { + "epoch": 0.21187290969899666, + "grad_norm": 0.47420892119407654, + "learning_rate": 8e-05, + "loss": 1.5153, + "step": 3801 + }, + { + "epoch": 0.21192865105908584, + "grad_norm": 0.5116045475006104, + "learning_rate": 8e-05, + "loss": 1.7009, + "step": 3802 + }, + { + "epoch": 0.21198439241917502, + "grad_norm": 0.4922579228878021, + "learning_rate": 8e-05, + "loss": 1.8566, + "step": 3803 + }, + { + "epoch": 0.2120401337792642, + "grad_norm": 0.46212583780288696, + "learning_rate": 8e-05, + "loss": 1.5053, + "step": 3804 + }, + { + "epoch": 0.2120958751393534, + "grad_norm": 0.45981431007385254, + "learning_rate": 8e-05, + "loss": 1.6856, + "step": 3805 + }, + { + "epoch": 0.21215161649944259, + "grad_norm": 0.48287737369537354, + "learning_rate": 8e-05, + "loss": 1.7064, + "step": 3806 + }, + { + "epoch": 0.21220735785953176, + "grad_norm": 0.5647776126861572, + "learning_rate": 8e-05, + "loss": 2.1445, + "step": 3807 + }, + { + "epoch": 0.21226309921962097, + "grad_norm": 0.5176706314086914, + "learning_rate": 8e-05, + "loss": 1.7832, + "step": 3808 + }, + { + "epoch": 0.21231884057971015, + "grad_norm": 0.4690020978450775, + "learning_rate": 8e-05, + "loss": 1.7234, + "step": 3809 + }, + { + "epoch": 0.21237458193979933, + "grad_norm": 0.5125473141670227, + "learning_rate": 8e-05, + "loss": 1.616, + "step": 3810 + }, + { + "epoch": 0.2124303232998885, + "grad_norm": 0.5069548487663269, + "learning_rate": 8e-05, + "loss": 1.7209, 
+ "step": 3811 + }, + { + "epoch": 0.2124860646599777, + "grad_norm": 0.4834950268268585, + "learning_rate": 8e-05, + "loss": 1.7798, + "step": 3812 + }, + { + "epoch": 0.2125418060200669, + "grad_norm": 0.477280855178833, + "learning_rate": 8e-05, + "loss": 1.7212, + "step": 3813 + }, + { + "epoch": 0.21259754738015607, + "grad_norm": 0.47901955246925354, + "learning_rate": 8e-05, + "loss": 1.8597, + "step": 3814 + }, + { + "epoch": 0.21265328874024525, + "grad_norm": 0.4886859953403473, + "learning_rate": 8e-05, + "loss": 1.6653, + "step": 3815 + }, + { + "epoch": 0.21270903010033446, + "grad_norm": 0.5218357443809509, + "learning_rate": 8e-05, + "loss": 1.5118, + "step": 3816 + }, + { + "epoch": 0.21276477146042364, + "grad_norm": 0.4620465934276581, + "learning_rate": 8e-05, + "loss": 1.6675, + "step": 3817 + }, + { + "epoch": 0.2128205128205128, + "grad_norm": 0.4797252416610718, + "learning_rate": 8e-05, + "loss": 1.8343, + "step": 3818 + }, + { + "epoch": 0.21287625418060202, + "grad_norm": 0.4706561863422394, + "learning_rate": 8e-05, + "loss": 1.6569, + "step": 3819 + }, + { + "epoch": 0.2129319955406912, + "grad_norm": 1.0401971340179443, + "learning_rate": 8e-05, + "loss": 1.8073, + "step": 3820 + }, + { + "epoch": 0.21298773690078038, + "grad_norm": 0.4873477816581726, + "learning_rate": 8e-05, + "loss": 1.7037, + "step": 3821 + }, + { + "epoch": 0.21304347826086956, + "grad_norm": 0.49408429861068726, + "learning_rate": 8e-05, + "loss": 1.7433, + "step": 3822 + }, + { + "epoch": 0.21309921962095876, + "grad_norm": 0.5126897096633911, + "learning_rate": 8e-05, + "loss": 1.7609, + "step": 3823 + }, + { + "epoch": 0.21315496098104794, + "grad_norm": 0.5350395441055298, + "learning_rate": 8e-05, + "loss": 1.8235, + "step": 3824 + }, + { + "epoch": 0.21321070234113712, + "grad_norm": 0.5210526585578918, + "learning_rate": 8e-05, + "loss": 1.8589, + "step": 3825 + }, + { + "epoch": 0.2132664437012263, + "grad_norm": 0.5119580030441284, + "learning_rate": 
8e-05, + "loss": 1.9103, + "step": 3826 + }, + { + "epoch": 0.2133221850613155, + "grad_norm": 0.5356841087341309, + "learning_rate": 8e-05, + "loss": 1.9764, + "step": 3827 + }, + { + "epoch": 0.21337792642140468, + "grad_norm": 0.5229792594909668, + "learning_rate": 8e-05, + "loss": 1.9064, + "step": 3828 + }, + { + "epoch": 0.21343366778149386, + "grad_norm": 0.5162675380706787, + "learning_rate": 8e-05, + "loss": 1.8165, + "step": 3829 + }, + { + "epoch": 0.21348940914158304, + "grad_norm": 0.49807944893836975, + "learning_rate": 8e-05, + "loss": 1.7083, + "step": 3830 + }, + { + "epoch": 0.21354515050167225, + "grad_norm": 0.5183794498443604, + "learning_rate": 8e-05, + "loss": 1.5318, + "step": 3831 + }, + { + "epoch": 0.21360089186176143, + "grad_norm": 0.4966142773628235, + "learning_rate": 8e-05, + "loss": 1.7479, + "step": 3832 + }, + { + "epoch": 0.2136566332218506, + "grad_norm": 0.5268084406852722, + "learning_rate": 8e-05, + "loss": 1.8522, + "step": 3833 + }, + { + "epoch": 0.2137123745819398, + "grad_norm": 0.5724077820777893, + "learning_rate": 8e-05, + "loss": 2.0082, + "step": 3834 + }, + { + "epoch": 0.213768115942029, + "grad_norm": 0.4624224603176117, + "learning_rate": 8e-05, + "loss": 1.6463, + "step": 3835 + }, + { + "epoch": 0.21382385730211817, + "grad_norm": 0.4890481233596802, + "learning_rate": 8e-05, + "loss": 1.6399, + "step": 3836 + }, + { + "epoch": 0.21387959866220735, + "grad_norm": 0.5407193899154663, + "learning_rate": 8e-05, + "loss": 1.8587, + "step": 3837 + }, + { + "epoch": 0.21393534002229656, + "grad_norm": 0.48641809821128845, + "learning_rate": 8e-05, + "loss": 1.729, + "step": 3838 + }, + { + "epoch": 0.21399108138238573, + "grad_norm": 0.4703226387500763, + "learning_rate": 8e-05, + "loss": 1.6493, + "step": 3839 + }, + { + "epoch": 0.2140468227424749, + "grad_norm": 0.52630615234375, + "learning_rate": 8e-05, + "loss": 1.864, + "step": 3840 + }, + { + "epoch": 0.2141025641025641, + "grad_norm": 0.4883647859096527, + 
"learning_rate": 8e-05, + "loss": 1.8115, + "step": 3841 + }, + { + "epoch": 0.2141583054626533, + "grad_norm": 0.4749646782875061, + "learning_rate": 8e-05, + "loss": 1.5468, + "step": 3842 + }, + { + "epoch": 0.21421404682274248, + "grad_norm": 0.47765839099884033, + "learning_rate": 8e-05, + "loss": 1.7855, + "step": 3843 + }, + { + "epoch": 0.21426978818283166, + "grad_norm": 0.5097349286079407, + "learning_rate": 8e-05, + "loss": 1.8423, + "step": 3844 + }, + { + "epoch": 0.21432552954292083, + "grad_norm": 0.49306657910346985, + "learning_rate": 8e-05, + "loss": 1.785, + "step": 3845 + }, + { + "epoch": 0.21438127090301004, + "grad_norm": 0.47584104537963867, + "learning_rate": 8e-05, + "loss": 1.6606, + "step": 3846 + }, + { + "epoch": 0.21443701226309922, + "grad_norm": 0.5335642099380493, + "learning_rate": 8e-05, + "loss": 1.7793, + "step": 3847 + }, + { + "epoch": 0.2144927536231884, + "grad_norm": 0.5167551636695862, + "learning_rate": 8e-05, + "loss": 1.7556, + "step": 3848 + }, + { + "epoch": 0.2145484949832776, + "grad_norm": 0.528070867061615, + "learning_rate": 8e-05, + "loss": 1.943, + "step": 3849 + }, + { + "epoch": 0.21460423634336678, + "grad_norm": 0.5179917812347412, + "learning_rate": 8e-05, + "loss": 1.752, + "step": 3850 + }, + { + "epoch": 0.21465997770345596, + "grad_norm": 0.45794737339019775, + "learning_rate": 8e-05, + "loss": 1.6796, + "step": 3851 + }, + { + "epoch": 0.21471571906354514, + "grad_norm": 0.486577570438385, + "learning_rate": 8e-05, + "loss": 1.7582, + "step": 3852 + }, + { + "epoch": 0.21477146042363435, + "grad_norm": 0.5513952374458313, + "learning_rate": 8e-05, + "loss": 1.7578, + "step": 3853 + }, + { + "epoch": 0.21482720178372353, + "grad_norm": 0.4476759135723114, + "learning_rate": 8e-05, + "loss": 1.5636, + "step": 3854 + }, + { + "epoch": 0.2148829431438127, + "grad_norm": 0.5062263607978821, + "learning_rate": 8e-05, + "loss": 1.7727, + "step": 3855 + }, + { + "epoch": 0.21493868450390188, + "grad_norm": 
0.47806316614151, + "learning_rate": 8e-05, + "loss": 1.6939, + "step": 3856 + }, + { + "epoch": 0.2149944258639911, + "grad_norm": 0.5024604201316833, + "learning_rate": 8e-05, + "loss": 1.7159, + "step": 3857 + }, + { + "epoch": 0.21505016722408027, + "grad_norm": 0.47938990592956543, + "learning_rate": 8e-05, + "loss": 1.723, + "step": 3858 + }, + { + "epoch": 0.21510590858416945, + "grad_norm": 0.5239027142524719, + "learning_rate": 8e-05, + "loss": 1.4392, + "step": 3859 + }, + { + "epoch": 0.21516164994425863, + "grad_norm": 0.4923347234725952, + "learning_rate": 8e-05, + "loss": 1.5475, + "step": 3860 + }, + { + "epoch": 0.21521739130434783, + "grad_norm": 0.5067053437232971, + "learning_rate": 8e-05, + "loss": 1.5485, + "step": 3861 + }, + { + "epoch": 0.215273132664437, + "grad_norm": 0.5009576082229614, + "learning_rate": 8e-05, + "loss": 1.6084, + "step": 3862 + }, + { + "epoch": 0.2153288740245262, + "grad_norm": 0.5134173631668091, + "learning_rate": 8e-05, + "loss": 1.8511, + "step": 3863 + }, + { + "epoch": 0.2153846153846154, + "grad_norm": 0.4962582290172577, + "learning_rate": 8e-05, + "loss": 1.7865, + "step": 3864 + }, + { + "epoch": 0.21544035674470458, + "grad_norm": 0.4840092957019806, + "learning_rate": 8e-05, + "loss": 1.6366, + "step": 3865 + }, + { + "epoch": 0.21549609810479375, + "grad_norm": 0.4821156859397888, + "learning_rate": 8e-05, + "loss": 1.6677, + "step": 3866 + }, + { + "epoch": 0.21555183946488293, + "grad_norm": 0.5027958750724792, + "learning_rate": 8e-05, + "loss": 1.6693, + "step": 3867 + }, + { + "epoch": 0.21560758082497214, + "grad_norm": 0.4980306625366211, + "learning_rate": 8e-05, + "loss": 1.8091, + "step": 3868 + }, + { + "epoch": 0.21566332218506132, + "grad_norm": 0.5521800518035889, + "learning_rate": 8e-05, + "loss": 2.0795, + "step": 3869 + }, + { + "epoch": 0.2157190635451505, + "grad_norm": 0.48513853549957275, + "learning_rate": 8e-05, + "loss": 1.7454, + "step": 3870 + }, + { + "epoch": 
0.21577480490523968, + "grad_norm": 0.5031861662864685, + "learning_rate": 8e-05, + "loss": 1.7551, + "step": 3871 + }, + { + "epoch": 0.21583054626532888, + "grad_norm": 0.4629993140697479, + "learning_rate": 8e-05, + "loss": 1.7578, + "step": 3872 + }, + { + "epoch": 0.21588628762541806, + "grad_norm": 0.5550334453582764, + "learning_rate": 8e-05, + "loss": 1.7309, + "step": 3873 + }, + { + "epoch": 0.21594202898550724, + "grad_norm": 0.5278251767158508, + "learning_rate": 8e-05, + "loss": 1.8432, + "step": 3874 + }, + { + "epoch": 0.21599777034559645, + "grad_norm": 0.5503281950950623, + "learning_rate": 8e-05, + "loss": 1.9233, + "step": 3875 + }, + { + "epoch": 0.21605351170568562, + "grad_norm": 0.5020831823348999, + "learning_rate": 8e-05, + "loss": 1.9928, + "step": 3876 + }, + { + "epoch": 0.2161092530657748, + "grad_norm": 0.4613478481769562, + "learning_rate": 8e-05, + "loss": 1.4694, + "step": 3877 + }, + { + "epoch": 0.21616499442586398, + "grad_norm": 0.5243741869926453, + "learning_rate": 8e-05, + "loss": 1.8008, + "step": 3878 + }, + { + "epoch": 0.2162207357859532, + "grad_norm": 0.467386931180954, + "learning_rate": 8e-05, + "loss": 1.5784, + "step": 3879 + }, + { + "epoch": 0.21627647714604237, + "grad_norm": 0.4884064197540283, + "learning_rate": 8e-05, + "loss": 1.644, + "step": 3880 + }, + { + "epoch": 0.21633221850613155, + "grad_norm": 0.5259974002838135, + "learning_rate": 8e-05, + "loss": 1.8779, + "step": 3881 + }, + { + "epoch": 0.21638795986622072, + "grad_norm": 0.5110856294631958, + "learning_rate": 8e-05, + "loss": 1.8159, + "step": 3882 + }, + { + "epoch": 0.21644370122630993, + "grad_norm": 0.5513606071472168, + "learning_rate": 8e-05, + "loss": 2.0017, + "step": 3883 + }, + { + "epoch": 0.2164994425863991, + "grad_norm": 0.49897506833076477, + "learning_rate": 8e-05, + "loss": 1.9742, + "step": 3884 + }, + { + "epoch": 0.2165551839464883, + "grad_norm": 0.512783408164978, + "learning_rate": 8e-05, + "loss": 1.4606, + "step": 3885 
+ }, + { + "epoch": 0.21661092530657747, + "grad_norm": 0.47280240058898926, + "learning_rate": 8e-05, + "loss": 1.7028, + "step": 3886 + }, + { + "epoch": 0.21666666666666667, + "grad_norm": 0.5042930841445923, + "learning_rate": 8e-05, + "loss": 1.738, + "step": 3887 + }, + { + "epoch": 0.21672240802675585, + "grad_norm": 0.49630722403526306, + "learning_rate": 8e-05, + "loss": 1.7362, + "step": 3888 + }, + { + "epoch": 0.21677814938684503, + "grad_norm": 0.4888682961463928, + "learning_rate": 8e-05, + "loss": 1.735, + "step": 3889 + }, + { + "epoch": 0.21683389074693424, + "grad_norm": 0.4728870689868927, + "learning_rate": 8e-05, + "loss": 1.8203, + "step": 3890 + }, + { + "epoch": 0.21688963210702342, + "grad_norm": 0.5488735437393188, + "learning_rate": 8e-05, + "loss": 1.7903, + "step": 3891 + }, + { + "epoch": 0.2169453734671126, + "grad_norm": 0.48417264223098755, + "learning_rate": 8e-05, + "loss": 1.7028, + "step": 3892 + }, + { + "epoch": 0.21700111482720177, + "grad_norm": 0.5011885762214661, + "learning_rate": 8e-05, + "loss": 1.5675, + "step": 3893 + }, + { + "epoch": 0.21705685618729098, + "grad_norm": 0.503288745880127, + "learning_rate": 8e-05, + "loss": 1.5926, + "step": 3894 + }, + { + "epoch": 0.21711259754738016, + "grad_norm": 0.5019866824150085, + "learning_rate": 8e-05, + "loss": 1.7281, + "step": 3895 + }, + { + "epoch": 0.21716833890746934, + "grad_norm": 0.4990195631980896, + "learning_rate": 8e-05, + "loss": 1.8422, + "step": 3896 + }, + { + "epoch": 0.21722408026755852, + "grad_norm": 0.49024635553359985, + "learning_rate": 8e-05, + "loss": 2.0611, + "step": 3897 + }, + { + "epoch": 0.21727982162764772, + "grad_norm": 0.44876179099082947, + "learning_rate": 8e-05, + "loss": 1.3315, + "step": 3898 + }, + { + "epoch": 0.2173355629877369, + "grad_norm": 0.45446836948394775, + "learning_rate": 8e-05, + "loss": 1.6923, + "step": 3899 + }, + { + "epoch": 0.21739130434782608, + "grad_norm": 0.4820328950881958, + "learning_rate": 8e-05, + 
"loss": 1.5184, + "step": 3900 + }, + { + "epoch": 0.21744704570791526, + "grad_norm": 0.46806085109710693, + "learning_rate": 8e-05, + "loss": 1.5763, + "step": 3901 + }, + { + "epoch": 0.21750278706800447, + "grad_norm": 0.49191901087760925, + "learning_rate": 8e-05, + "loss": 1.7162, + "step": 3902 + }, + { + "epoch": 0.21755852842809364, + "grad_norm": 0.4780477285385132, + "learning_rate": 8e-05, + "loss": 1.6722, + "step": 3903 + }, + { + "epoch": 0.21761426978818282, + "grad_norm": 0.5140283107757568, + "learning_rate": 8e-05, + "loss": 1.7773, + "step": 3904 + }, + { + "epoch": 0.21767001114827203, + "grad_norm": 0.5415761470794678, + "learning_rate": 8e-05, + "loss": 1.8011, + "step": 3905 + }, + { + "epoch": 0.2177257525083612, + "grad_norm": 0.49452102184295654, + "learning_rate": 8e-05, + "loss": 1.6914, + "step": 3906 + }, + { + "epoch": 0.2177814938684504, + "grad_norm": 0.4950968623161316, + "learning_rate": 8e-05, + "loss": 1.6263, + "step": 3907 + }, + { + "epoch": 0.21783723522853957, + "grad_norm": 0.4706718325614929, + "learning_rate": 8e-05, + "loss": 1.6071, + "step": 3908 + }, + { + "epoch": 0.21789297658862877, + "grad_norm": 0.5482810139656067, + "learning_rate": 8e-05, + "loss": 1.9042, + "step": 3909 + }, + { + "epoch": 0.21794871794871795, + "grad_norm": 0.5073991417884827, + "learning_rate": 8e-05, + "loss": 1.7979, + "step": 3910 + }, + { + "epoch": 0.21800445930880713, + "grad_norm": 0.5709139108657837, + "learning_rate": 8e-05, + "loss": 1.3266, + "step": 3911 + }, + { + "epoch": 0.2180602006688963, + "grad_norm": 0.5192441344261169, + "learning_rate": 8e-05, + "loss": 1.7538, + "step": 3912 + }, + { + "epoch": 0.21811594202898552, + "grad_norm": 0.44952622056007385, + "learning_rate": 8e-05, + "loss": 1.5676, + "step": 3913 + }, + { + "epoch": 0.2181716833890747, + "grad_norm": 0.48135310411453247, + "learning_rate": 8e-05, + "loss": 1.6884, + "step": 3914 + }, + { + "epoch": 0.21822742474916387, + "grad_norm": 0.45865342020988464, 
+ "learning_rate": 8e-05, + "loss": 1.5379, + "step": 3915 + }, + { + "epoch": 0.21828316610925308, + "grad_norm": 0.47076812386512756, + "learning_rate": 8e-05, + "loss": 1.7358, + "step": 3916 + }, + { + "epoch": 0.21833890746934226, + "grad_norm": 0.45950251817703247, + "learning_rate": 8e-05, + "loss": 1.7064, + "step": 3917 + }, + { + "epoch": 0.21839464882943144, + "grad_norm": 0.4762863516807556, + "learning_rate": 8e-05, + "loss": 1.6946, + "step": 3918 + }, + { + "epoch": 0.21845039018952062, + "grad_norm": 0.5255429148674011, + "learning_rate": 8e-05, + "loss": 1.6414, + "step": 3919 + }, + { + "epoch": 0.21850613154960982, + "grad_norm": 0.4682084321975708, + "learning_rate": 8e-05, + "loss": 1.8241, + "step": 3920 + }, + { + "epoch": 0.218561872909699, + "grad_norm": 0.4832845628261566, + "learning_rate": 8e-05, + "loss": 1.7665, + "step": 3921 + }, + { + "epoch": 0.21861761426978818, + "grad_norm": 0.4966776669025421, + "learning_rate": 8e-05, + "loss": 1.7103, + "step": 3922 + }, + { + "epoch": 0.21867335562987736, + "grad_norm": 0.5141316652297974, + "learning_rate": 8e-05, + "loss": 1.8795, + "step": 3923 + }, + { + "epoch": 0.21872909698996656, + "grad_norm": 0.493316650390625, + "learning_rate": 8e-05, + "loss": 1.665, + "step": 3924 + }, + { + "epoch": 0.21878483835005574, + "grad_norm": 0.49819833040237427, + "learning_rate": 8e-05, + "loss": 1.9585, + "step": 3925 + }, + { + "epoch": 0.21884057971014492, + "grad_norm": 0.4498457610607147, + "learning_rate": 8e-05, + "loss": 1.4625, + "step": 3926 + }, + { + "epoch": 0.2188963210702341, + "grad_norm": 0.5001556277275085, + "learning_rate": 8e-05, + "loss": 1.8738, + "step": 3927 + }, + { + "epoch": 0.2189520624303233, + "grad_norm": 0.49394869804382324, + "learning_rate": 8e-05, + "loss": 1.771, + "step": 3928 + }, + { + "epoch": 0.2190078037904125, + "grad_norm": 0.4962061941623688, + "learning_rate": 8e-05, + "loss": 1.6487, + "step": 3929 + }, + { + "epoch": 0.21906354515050167, + 
"grad_norm": 0.5592943429946899, + "learning_rate": 8e-05, + "loss": 1.9334, + "step": 3930 + }, + { + "epoch": 0.21911928651059087, + "grad_norm": 0.5389150381088257, + "learning_rate": 8e-05, + "loss": 1.9751, + "step": 3931 + }, + { + "epoch": 0.21917502787068005, + "grad_norm": 0.5096388459205627, + "learning_rate": 8e-05, + "loss": 1.9468, + "step": 3932 + }, + { + "epoch": 0.21923076923076923, + "grad_norm": 0.561657726764679, + "learning_rate": 8e-05, + "loss": 1.84, + "step": 3933 + }, + { + "epoch": 0.2192865105908584, + "grad_norm": 0.48407647013664246, + "learning_rate": 8e-05, + "loss": 1.5551, + "step": 3934 + }, + { + "epoch": 0.21934225195094761, + "grad_norm": 0.4679587185382843, + "learning_rate": 8e-05, + "loss": 1.6836, + "step": 3935 + }, + { + "epoch": 0.2193979933110368, + "grad_norm": 0.5219367742538452, + "learning_rate": 8e-05, + "loss": 1.7788, + "step": 3936 + }, + { + "epoch": 0.21945373467112597, + "grad_norm": 0.4916120171546936, + "learning_rate": 8e-05, + "loss": 1.6, + "step": 3937 + }, + { + "epoch": 0.21950947603121515, + "grad_norm": 0.4261520206928253, + "learning_rate": 8e-05, + "loss": 1.3857, + "step": 3938 + }, + { + "epoch": 0.21956521739130436, + "grad_norm": 0.48919540643692017, + "learning_rate": 8e-05, + "loss": 1.7356, + "step": 3939 + }, + { + "epoch": 0.21962095875139354, + "grad_norm": 0.49579474329948425, + "learning_rate": 8e-05, + "loss": 1.7985, + "step": 3940 + }, + { + "epoch": 0.21967670011148271, + "grad_norm": 0.47610533237457275, + "learning_rate": 8e-05, + "loss": 1.6159, + "step": 3941 + }, + { + "epoch": 0.2197324414715719, + "grad_norm": 0.4687567949295044, + "learning_rate": 8e-05, + "loss": 1.2385, + "step": 3942 + }, + { + "epoch": 0.2197881828316611, + "grad_norm": 0.4921078085899353, + "learning_rate": 8e-05, + "loss": 1.8218, + "step": 3943 + }, + { + "epoch": 0.21984392419175028, + "grad_norm": 0.45393243432044983, + "learning_rate": 8e-05, + "loss": 1.5897, + "step": 3944 + }, + { + "epoch": 
0.21989966555183946, + "grad_norm": 0.503121018409729, + "learning_rate": 8e-05, + "loss": 1.9286, + "step": 3945 + }, + { + "epoch": 0.21995540691192866, + "grad_norm": 0.4986448585987091, + "learning_rate": 8e-05, + "loss": 1.8007, + "step": 3946 + }, + { + "epoch": 0.22001114827201784, + "grad_norm": 0.4901581406593323, + "learning_rate": 8e-05, + "loss": 1.6872, + "step": 3947 + }, + { + "epoch": 0.22006688963210702, + "grad_norm": 0.48810702562332153, + "learning_rate": 8e-05, + "loss": 1.5888, + "step": 3948 + }, + { + "epoch": 0.2201226309921962, + "grad_norm": 0.5303023457527161, + "learning_rate": 8e-05, + "loss": 1.5916, + "step": 3949 + }, + { + "epoch": 0.2201783723522854, + "grad_norm": 0.4808427095413208, + "learning_rate": 8e-05, + "loss": 1.6419, + "step": 3950 + }, + { + "epoch": 0.22023411371237459, + "grad_norm": 0.462200790643692, + "learning_rate": 8e-05, + "loss": 1.5292, + "step": 3951 + }, + { + "epoch": 0.22028985507246376, + "grad_norm": 0.5043500661849976, + "learning_rate": 8e-05, + "loss": 1.5486, + "step": 3952 + }, + { + "epoch": 0.22034559643255294, + "grad_norm": 0.5242111086845398, + "learning_rate": 8e-05, + "loss": 1.7288, + "step": 3953 + }, + { + "epoch": 0.22040133779264215, + "grad_norm": 0.47073057293891907, + "learning_rate": 8e-05, + "loss": 1.5865, + "step": 3954 + }, + { + "epoch": 0.22045707915273133, + "grad_norm": 0.4670935869216919, + "learning_rate": 8e-05, + "loss": 1.6749, + "step": 3955 + }, + { + "epoch": 0.2205128205128205, + "grad_norm": 0.49966269731521606, + "learning_rate": 8e-05, + "loss": 1.6227, + "step": 3956 + }, + { + "epoch": 0.22056856187290969, + "grad_norm": 0.5078837275505066, + "learning_rate": 8e-05, + "loss": 1.6791, + "step": 3957 + }, + { + "epoch": 0.2206243032329989, + "grad_norm": 0.4879036545753479, + "learning_rate": 8e-05, + "loss": 1.6912, + "step": 3958 + }, + { + "epoch": 0.22068004459308807, + "grad_norm": 0.4623623490333557, + "learning_rate": 8e-05, + "loss": 1.5699, + "step": 
3959 + }, + { + "epoch": 0.22073578595317725, + "grad_norm": 0.42267879843711853, + "learning_rate": 8e-05, + "loss": 1.5417, + "step": 3960 + }, + { + "epoch": 0.22079152731326646, + "grad_norm": 0.5437389612197876, + "learning_rate": 8e-05, + "loss": 2.0238, + "step": 3961 + }, + { + "epoch": 0.22084726867335563, + "grad_norm": 0.5264094471931458, + "learning_rate": 8e-05, + "loss": 1.8531, + "step": 3962 + }, + { + "epoch": 0.2209030100334448, + "grad_norm": 0.531224250793457, + "learning_rate": 8e-05, + "loss": 1.9189, + "step": 3963 + }, + { + "epoch": 0.220958751393534, + "grad_norm": 0.4769847095012665, + "learning_rate": 8e-05, + "loss": 1.6585, + "step": 3964 + }, + { + "epoch": 0.2210144927536232, + "grad_norm": 0.4706733822822571, + "learning_rate": 8e-05, + "loss": 1.5896, + "step": 3965 + }, + { + "epoch": 0.22107023411371238, + "grad_norm": 0.46875301003456116, + "learning_rate": 8e-05, + "loss": 1.7306, + "step": 3966 + }, + { + "epoch": 0.22112597547380156, + "grad_norm": 0.5377287268638611, + "learning_rate": 8e-05, + "loss": 1.7412, + "step": 3967 + }, + { + "epoch": 0.22118171683389073, + "grad_norm": 0.5149115920066833, + "learning_rate": 8e-05, + "loss": 1.7949, + "step": 3968 + }, + { + "epoch": 0.22123745819397994, + "grad_norm": 0.47132450342178345, + "learning_rate": 8e-05, + "loss": 1.4499, + "step": 3969 + }, + { + "epoch": 0.22129319955406912, + "grad_norm": 0.46263396739959717, + "learning_rate": 8e-05, + "loss": 1.5182, + "step": 3970 + }, + { + "epoch": 0.2213489409141583, + "grad_norm": 0.49939507246017456, + "learning_rate": 8e-05, + "loss": 1.7148, + "step": 3971 + }, + { + "epoch": 0.2214046822742475, + "grad_norm": 0.4451838433742523, + "learning_rate": 8e-05, + "loss": 1.5621, + "step": 3972 + }, + { + "epoch": 0.22146042363433668, + "grad_norm": 0.5174611210823059, + "learning_rate": 8e-05, + "loss": 1.6537, + "step": 3973 + }, + { + "epoch": 0.22151616499442586, + "grad_norm": 0.5026767253875732, + "learning_rate": 8e-05, + 
"loss": 1.7238, + "step": 3974 + }, + { + "epoch": 0.22157190635451504, + "grad_norm": 0.5344337821006775, + "learning_rate": 8e-05, + "loss": 1.7551, + "step": 3975 + }, + { + "epoch": 0.22162764771460425, + "grad_norm": 0.48642006516456604, + "learning_rate": 8e-05, + "loss": 1.6268, + "step": 3976 + }, + { + "epoch": 0.22168338907469343, + "grad_norm": 0.4529288113117218, + "learning_rate": 8e-05, + "loss": 1.6872, + "step": 3977 + }, + { + "epoch": 0.2217391304347826, + "grad_norm": 0.46826764941215515, + "learning_rate": 8e-05, + "loss": 1.5043, + "step": 3978 + }, + { + "epoch": 0.22179487179487178, + "grad_norm": 0.467683881521225, + "learning_rate": 8e-05, + "loss": 1.6435, + "step": 3979 + }, + { + "epoch": 0.221850613154961, + "grad_norm": 0.4558468461036682, + "learning_rate": 8e-05, + "loss": 1.4534, + "step": 3980 + }, + { + "epoch": 0.22190635451505017, + "grad_norm": 0.46639105677604675, + "learning_rate": 8e-05, + "loss": 1.8479, + "step": 3981 + }, + { + "epoch": 0.22196209587513935, + "grad_norm": 0.45973077416419983, + "learning_rate": 8e-05, + "loss": 1.7484, + "step": 3982 + }, + { + "epoch": 0.22201783723522853, + "grad_norm": 0.5095018148422241, + "learning_rate": 8e-05, + "loss": 1.7441, + "step": 3983 + }, + { + "epoch": 0.22207357859531773, + "grad_norm": 0.4564839005470276, + "learning_rate": 8e-05, + "loss": 1.5841, + "step": 3984 + }, + { + "epoch": 0.2221293199554069, + "grad_norm": 0.5045386552810669, + "learning_rate": 8e-05, + "loss": 1.9361, + "step": 3985 + }, + { + "epoch": 0.2221850613154961, + "grad_norm": 0.512732982635498, + "learning_rate": 8e-05, + "loss": 1.8255, + "step": 3986 + }, + { + "epoch": 0.2222408026755853, + "grad_norm": 0.47748759388923645, + "learning_rate": 8e-05, + "loss": 1.6961, + "step": 3987 + }, + { + "epoch": 0.22229654403567448, + "grad_norm": 0.4907570481300354, + "learning_rate": 8e-05, + "loss": 1.6522, + "step": 3988 + }, + { + "epoch": 0.22235228539576365, + "grad_norm": 0.49349287152290344, + 
"learning_rate": 8e-05, + "loss": 1.5232, + "step": 3989 + }, + { + "epoch": 0.22240802675585283, + "grad_norm": 0.5386398434638977, + "learning_rate": 8e-05, + "loss": 1.7628, + "step": 3990 + }, + { + "epoch": 0.22246376811594204, + "grad_norm": 0.5197376012802124, + "learning_rate": 8e-05, + "loss": 1.742, + "step": 3991 + }, + { + "epoch": 0.22251950947603122, + "grad_norm": 0.45318424701690674, + "learning_rate": 8e-05, + "loss": 1.52, + "step": 3992 + }, + { + "epoch": 0.2225752508361204, + "grad_norm": 0.4926500618457794, + "learning_rate": 8e-05, + "loss": 1.7327, + "step": 3993 + }, + { + "epoch": 0.22263099219620958, + "grad_norm": 0.4656613767147064, + "learning_rate": 8e-05, + "loss": 1.7519, + "step": 3994 + }, + { + "epoch": 0.22268673355629878, + "grad_norm": 0.48848897218704224, + "learning_rate": 8e-05, + "loss": 1.7931, + "step": 3995 + }, + { + "epoch": 0.22274247491638796, + "grad_norm": 0.4615493416786194, + "learning_rate": 8e-05, + "loss": 1.6964, + "step": 3996 + }, + { + "epoch": 0.22279821627647714, + "grad_norm": 0.48456570506095886, + "learning_rate": 8e-05, + "loss": 1.8696, + "step": 3997 + }, + { + "epoch": 0.22285395763656632, + "grad_norm": 0.492690771818161, + "learning_rate": 8e-05, + "loss": 1.8145, + "step": 3998 + }, + { + "epoch": 0.22290969899665553, + "grad_norm": 0.48878180980682373, + "learning_rate": 8e-05, + "loss": 1.666, + "step": 3999 + }, + { + "epoch": 0.2229654403567447, + "grad_norm": 0.46159079670906067, + "learning_rate": 8e-05, + "loss": 1.4359, + "step": 4000 + }, + { + "epoch": 0.22302118171683388, + "grad_norm": 0.5181668996810913, + "learning_rate": 8e-05, + "loss": 1.9858, + "step": 4001 + }, + { + "epoch": 0.2230769230769231, + "grad_norm": 0.5265150666236877, + "learning_rate": 8e-05, + "loss": 1.7475, + "step": 4002 + }, + { + "epoch": 0.22313266443701227, + "grad_norm": 0.4892853796482086, + "learning_rate": 8e-05, + "loss": 1.6459, + "step": 4003 + }, + { + "epoch": 0.22318840579710145, + "grad_norm": 
0.5201447606086731, + "learning_rate": 8e-05, + "loss": 1.8574, + "step": 4004 + }, + { + "epoch": 0.22324414715719063, + "grad_norm": 0.49057894945144653, + "learning_rate": 8e-05, + "loss": 1.8647, + "step": 4005 + }, + { + "epoch": 0.22329988851727983, + "grad_norm": 0.4769148528575897, + "learning_rate": 8e-05, + "loss": 1.7238, + "step": 4006 + }, + { + "epoch": 0.223355629877369, + "grad_norm": 0.4808587431907654, + "learning_rate": 8e-05, + "loss": 1.8717, + "step": 4007 + }, + { + "epoch": 0.2234113712374582, + "grad_norm": 0.48524975776672363, + "learning_rate": 8e-05, + "loss": 1.7951, + "step": 4008 + }, + { + "epoch": 0.22346711259754737, + "grad_norm": 0.4786055088043213, + "learning_rate": 8e-05, + "loss": 1.5853, + "step": 4009 + }, + { + "epoch": 0.22352285395763657, + "grad_norm": 0.4943832755088806, + "learning_rate": 8e-05, + "loss": 1.8111, + "step": 4010 + }, + { + "epoch": 0.22357859531772575, + "grad_norm": 0.5508502125740051, + "learning_rate": 8e-05, + "loss": 1.9332, + "step": 4011 + }, + { + "epoch": 0.22363433667781493, + "grad_norm": 0.5170157551765442, + "learning_rate": 8e-05, + "loss": 1.9951, + "step": 4012 + }, + { + "epoch": 0.22369007803790414, + "grad_norm": 0.4975753128528595, + "learning_rate": 8e-05, + "loss": 1.7051, + "step": 4013 + }, + { + "epoch": 0.22374581939799332, + "grad_norm": 0.5136903524398804, + "learning_rate": 8e-05, + "loss": 1.7638, + "step": 4014 + }, + { + "epoch": 0.2238015607580825, + "grad_norm": 0.5532751083374023, + "learning_rate": 8e-05, + "loss": 1.8038, + "step": 4015 + }, + { + "epoch": 0.22385730211817167, + "grad_norm": 0.525316059589386, + "learning_rate": 8e-05, + "loss": 1.877, + "step": 4016 + }, + { + "epoch": 0.22391304347826088, + "grad_norm": 0.5056881308555603, + "learning_rate": 8e-05, + "loss": 1.6853, + "step": 4017 + }, + { + "epoch": 0.22396878483835006, + "grad_norm": 0.48879149556159973, + "learning_rate": 8e-05, + "loss": 1.7952, + "step": 4018 + }, + { + "epoch": 
0.22402452619843924, + "grad_norm": 0.5147798657417297, + "learning_rate": 8e-05, + "loss": 1.945, + "step": 4019 + }, + { + "epoch": 0.22408026755852842, + "grad_norm": 0.45114102959632874, + "learning_rate": 8e-05, + "loss": 1.4624, + "step": 4020 + }, + { + "epoch": 0.22413600891861762, + "grad_norm": 0.4589650630950928, + "learning_rate": 8e-05, + "loss": 1.6146, + "step": 4021 + }, + { + "epoch": 0.2241917502787068, + "grad_norm": 0.5016673803329468, + "learning_rate": 8e-05, + "loss": 1.7754, + "step": 4022 + }, + { + "epoch": 0.22424749163879598, + "grad_norm": 0.47358280420303345, + "learning_rate": 8e-05, + "loss": 1.3906, + "step": 4023 + }, + { + "epoch": 0.22430323299888516, + "grad_norm": 0.4887877404689789, + "learning_rate": 8e-05, + "loss": 1.744, + "step": 4024 + }, + { + "epoch": 0.22435897435897437, + "grad_norm": 0.5506033301353455, + "learning_rate": 8e-05, + "loss": 1.7464, + "step": 4025 + }, + { + "epoch": 0.22441471571906355, + "grad_norm": 0.5490965247154236, + "learning_rate": 8e-05, + "loss": 1.965, + "step": 4026 + }, + { + "epoch": 0.22447045707915272, + "grad_norm": 0.4694077670574188, + "learning_rate": 8e-05, + "loss": 1.638, + "step": 4027 + }, + { + "epoch": 0.22452619843924193, + "grad_norm": 0.4980950951576233, + "learning_rate": 8e-05, + "loss": 1.7362, + "step": 4028 + }, + { + "epoch": 0.2245819397993311, + "grad_norm": 0.47829583287239075, + "learning_rate": 8e-05, + "loss": 1.4633, + "step": 4029 + }, + { + "epoch": 0.2246376811594203, + "grad_norm": 0.47786465287208557, + "learning_rate": 8e-05, + "loss": 1.7009, + "step": 4030 + }, + { + "epoch": 0.22469342251950947, + "grad_norm": 0.5337238907814026, + "learning_rate": 8e-05, + "loss": 1.9198, + "step": 4031 + }, + { + "epoch": 0.22474916387959867, + "grad_norm": 0.4835289418697357, + "learning_rate": 8e-05, + "loss": 1.7093, + "step": 4032 + }, + { + "epoch": 0.22480490523968785, + "grad_norm": 0.4613661766052246, + "learning_rate": 8e-05, + "loss": 1.5062, + "step": 
4033 + }, + { + "epoch": 0.22486064659977703, + "grad_norm": 0.47875353693962097, + "learning_rate": 8e-05, + "loss": 1.5384, + "step": 4034 + }, + { + "epoch": 0.2249163879598662, + "grad_norm": 0.5129967927932739, + "learning_rate": 8e-05, + "loss": 1.8924, + "step": 4035 + }, + { + "epoch": 0.22497212931995542, + "grad_norm": 0.5353673100471497, + "learning_rate": 8e-05, + "loss": 1.6436, + "step": 4036 + }, + { + "epoch": 0.2250278706800446, + "grad_norm": 0.5585724115371704, + "learning_rate": 8e-05, + "loss": 1.9538, + "step": 4037 + }, + { + "epoch": 0.22508361204013377, + "grad_norm": 0.5115454792976379, + "learning_rate": 8e-05, + "loss": 1.7637, + "step": 4038 + }, + { + "epoch": 0.22513935340022295, + "grad_norm": 0.5760135650634766, + "learning_rate": 8e-05, + "loss": 2.0288, + "step": 4039 + }, + { + "epoch": 0.22519509476031216, + "grad_norm": 0.5313804149627686, + "learning_rate": 8e-05, + "loss": 1.6336, + "step": 4040 + }, + { + "epoch": 0.22525083612040134, + "grad_norm": 0.4843834638595581, + "learning_rate": 8e-05, + "loss": 1.7583, + "step": 4041 + }, + { + "epoch": 0.22530657748049052, + "grad_norm": 0.45977699756622314, + "learning_rate": 8e-05, + "loss": 1.6708, + "step": 4042 + }, + { + "epoch": 0.22536231884057972, + "grad_norm": 0.44780343770980835, + "learning_rate": 8e-05, + "loss": 1.5733, + "step": 4043 + }, + { + "epoch": 0.2254180602006689, + "grad_norm": 0.5017595887184143, + "learning_rate": 8e-05, + "loss": 1.7078, + "step": 4044 + }, + { + "epoch": 0.22547380156075808, + "grad_norm": 0.46054017543792725, + "learning_rate": 8e-05, + "loss": 1.8015, + "step": 4045 + }, + { + "epoch": 0.22552954292084726, + "grad_norm": 0.5294150710105896, + "learning_rate": 8e-05, + "loss": 1.8796, + "step": 4046 + }, + { + "epoch": 0.22558528428093647, + "grad_norm": 0.4661663770675659, + "learning_rate": 8e-05, + "loss": 1.8345, + "step": 4047 + }, + { + "epoch": 0.22564102564102564, + "grad_norm": 0.5371278524398804, + "learning_rate": 8e-05, + 
"loss": 1.7545, + "step": 4048 + }, + { + "epoch": 0.22569676700111482, + "grad_norm": 0.4482397139072418, + "learning_rate": 8e-05, + "loss": 1.5857, + "step": 4049 + }, + { + "epoch": 0.225752508361204, + "grad_norm": 0.4550190567970276, + "learning_rate": 8e-05, + "loss": 1.6947, + "step": 4050 + }, + { + "epoch": 0.2258082497212932, + "grad_norm": 0.4869641661643982, + "learning_rate": 8e-05, + "loss": 1.6888, + "step": 4051 + }, + { + "epoch": 0.2258639910813824, + "grad_norm": 0.5037266612052917, + "learning_rate": 8e-05, + "loss": 1.8179, + "step": 4052 + }, + { + "epoch": 0.22591973244147157, + "grad_norm": 0.5212584137916565, + "learning_rate": 8e-05, + "loss": 1.8177, + "step": 4053 + }, + { + "epoch": 0.22597547380156074, + "grad_norm": 0.4695870876312256, + "learning_rate": 8e-05, + "loss": 1.6418, + "step": 4054 + }, + { + "epoch": 0.22603121516164995, + "grad_norm": 0.5005946159362793, + "learning_rate": 8e-05, + "loss": 1.8869, + "step": 4055 + }, + { + "epoch": 0.22608695652173913, + "grad_norm": 0.5524263381958008, + "learning_rate": 8e-05, + "loss": 1.7914, + "step": 4056 + }, + { + "epoch": 0.2261426978818283, + "grad_norm": 0.5313774347305298, + "learning_rate": 8e-05, + "loss": 1.9922, + "step": 4057 + }, + { + "epoch": 0.22619843924191751, + "grad_norm": 0.5394706726074219, + "learning_rate": 8e-05, + "loss": 1.5808, + "step": 4058 + }, + { + "epoch": 0.2262541806020067, + "grad_norm": 0.4736047089099884, + "learning_rate": 8e-05, + "loss": 1.7312, + "step": 4059 + }, + { + "epoch": 0.22630992196209587, + "grad_norm": 0.4683007299900055, + "learning_rate": 8e-05, + "loss": 1.7285, + "step": 4060 + }, + { + "epoch": 0.22636566332218505, + "grad_norm": 0.4946795701980591, + "learning_rate": 8e-05, + "loss": 1.7556, + "step": 4061 + }, + { + "epoch": 0.22642140468227426, + "grad_norm": 0.5438491702079773, + "learning_rate": 8e-05, + "loss": 2.0871, + "step": 4062 + }, + { + "epoch": 0.22647714604236344, + "grad_norm": 0.4584484100341797, + 
"learning_rate": 8e-05, + "loss": 1.5073, + "step": 4063 + }, + { + "epoch": 0.22653288740245262, + "grad_norm": 0.4815159738063812, + "learning_rate": 8e-05, + "loss": 1.6304, + "step": 4064 + }, + { + "epoch": 0.2265886287625418, + "grad_norm": 0.5199421048164368, + "learning_rate": 8e-05, + "loss": 1.8117, + "step": 4065 + }, + { + "epoch": 0.226644370122631, + "grad_norm": 0.5211115479469299, + "learning_rate": 8e-05, + "loss": 1.6965, + "step": 4066 + }, + { + "epoch": 0.22670011148272018, + "grad_norm": 0.5173338651657104, + "learning_rate": 8e-05, + "loss": 1.695, + "step": 4067 + }, + { + "epoch": 0.22675585284280936, + "grad_norm": 0.49385011196136475, + "learning_rate": 8e-05, + "loss": 1.5888, + "step": 4068 + }, + { + "epoch": 0.22681159420289856, + "grad_norm": 0.5200061202049255, + "learning_rate": 8e-05, + "loss": 1.9021, + "step": 4069 + }, + { + "epoch": 0.22686733556298774, + "grad_norm": 0.5064250826835632, + "learning_rate": 8e-05, + "loss": 1.8157, + "step": 4070 + }, + { + "epoch": 0.22692307692307692, + "grad_norm": 0.49232232570648193, + "learning_rate": 8e-05, + "loss": 1.8202, + "step": 4071 + }, + { + "epoch": 0.2269788182831661, + "grad_norm": 0.5013061761856079, + "learning_rate": 8e-05, + "loss": 1.7978, + "step": 4072 + }, + { + "epoch": 0.2270345596432553, + "grad_norm": 0.4811973571777344, + "learning_rate": 8e-05, + "loss": 1.7333, + "step": 4073 + }, + { + "epoch": 0.22709030100334449, + "grad_norm": 0.5344292521476746, + "learning_rate": 8e-05, + "loss": 1.8552, + "step": 4074 + }, + { + "epoch": 0.22714604236343366, + "grad_norm": 0.49985915422439575, + "learning_rate": 8e-05, + "loss": 1.5754, + "step": 4075 + }, + { + "epoch": 0.22720178372352284, + "grad_norm": 0.5089527368545532, + "learning_rate": 8e-05, + "loss": 1.7264, + "step": 4076 + }, + { + "epoch": 0.22725752508361205, + "grad_norm": 0.49516212940216064, + "learning_rate": 8e-05, + "loss": 1.6891, + "step": 4077 + }, + { + "epoch": 0.22731326644370123, + 
"grad_norm": 0.49753841757774353, + "learning_rate": 8e-05, + "loss": 1.6529, + "step": 4078 + }, + { + "epoch": 0.2273690078037904, + "grad_norm": 0.5481187105178833, + "learning_rate": 8e-05, + "loss": 1.8607, + "step": 4079 + }, + { + "epoch": 0.22742474916387959, + "grad_norm": 0.47371983528137207, + "learning_rate": 8e-05, + "loss": 1.4895, + "step": 4080 + }, + { + "epoch": 0.2274804905239688, + "grad_norm": 0.5114725232124329, + "learning_rate": 8e-05, + "loss": 1.8095, + "step": 4081 + }, + { + "epoch": 0.22753623188405797, + "grad_norm": 0.4610219895839691, + "learning_rate": 8e-05, + "loss": 1.6146, + "step": 4082 + }, + { + "epoch": 0.22759197324414715, + "grad_norm": 0.5723327398300171, + "learning_rate": 8e-05, + "loss": 1.8573, + "step": 4083 + }, + { + "epoch": 0.22764771460423636, + "grad_norm": 0.4931768476963043, + "learning_rate": 8e-05, + "loss": 1.8444, + "step": 4084 + }, + { + "epoch": 0.22770345596432554, + "grad_norm": 0.504690945148468, + "learning_rate": 8e-05, + "loss": 1.6546, + "step": 4085 + }, + { + "epoch": 0.22775919732441471, + "grad_norm": 0.49038875102996826, + "learning_rate": 8e-05, + "loss": 1.6053, + "step": 4086 + }, + { + "epoch": 0.2278149386845039, + "grad_norm": 0.5072624087333679, + "learning_rate": 8e-05, + "loss": 1.6051, + "step": 4087 + }, + { + "epoch": 0.2278706800445931, + "grad_norm": 0.5469437837600708, + "learning_rate": 8e-05, + "loss": 1.826, + "step": 4088 + }, + { + "epoch": 0.22792642140468228, + "grad_norm": 0.4576040208339691, + "learning_rate": 8e-05, + "loss": 1.5462, + "step": 4089 + }, + { + "epoch": 0.22798216276477146, + "grad_norm": 0.5090271830558777, + "learning_rate": 8e-05, + "loss": 1.649, + "step": 4090 + }, + { + "epoch": 0.22803790412486064, + "grad_norm": 0.5030748248100281, + "learning_rate": 8e-05, + "loss": 1.6206, + "step": 4091 + }, + { + "epoch": 0.22809364548494984, + "grad_norm": 0.5506917238235474, + "learning_rate": 8e-05, + "loss": 1.7736, + "step": 4092 + }, + { + "epoch": 
0.22814938684503902, + "grad_norm": 0.5064511299133301, + "learning_rate": 8e-05, + "loss": 1.7259, + "step": 4093 + }, + { + "epoch": 0.2282051282051282, + "grad_norm": 0.5426530241966248, + "learning_rate": 8e-05, + "loss": 1.6866, + "step": 4094 + }, + { + "epoch": 0.22826086956521738, + "grad_norm": 0.5241387486457825, + "learning_rate": 8e-05, + "loss": 1.5815, + "step": 4095 + }, + { + "epoch": 0.22831661092530658, + "grad_norm": 0.458842933177948, + "learning_rate": 8e-05, + "loss": 1.5426, + "step": 4096 + }, + { + "epoch": 0.22837235228539576, + "grad_norm": 0.4742090106010437, + "learning_rate": 8e-05, + "loss": 1.6251, + "step": 4097 + }, + { + "epoch": 0.22842809364548494, + "grad_norm": 0.47541600465774536, + "learning_rate": 8e-05, + "loss": 1.5276, + "step": 4098 + }, + { + "epoch": 0.22848383500557415, + "grad_norm": 0.4711078405380249, + "learning_rate": 8e-05, + "loss": 1.8478, + "step": 4099 + }, + { + "epoch": 0.22853957636566333, + "grad_norm": 0.474153071641922, + "learning_rate": 8e-05, + "loss": 1.7794, + "step": 4100 + }, + { + "epoch": 0.2285953177257525, + "grad_norm": 0.46833422780036926, + "learning_rate": 8e-05, + "loss": 1.6416, + "step": 4101 + }, + { + "epoch": 0.22865105908584168, + "grad_norm": 0.5075449347496033, + "learning_rate": 8e-05, + "loss": 1.4074, + "step": 4102 + }, + { + "epoch": 0.2287068004459309, + "grad_norm": 0.4985544979572296, + "learning_rate": 8e-05, + "loss": 1.491, + "step": 4103 + }, + { + "epoch": 0.22876254180602007, + "grad_norm": 0.4538790285587311, + "learning_rate": 8e-05, + "loss": 1.444, + "step": 4104 + }, + { + "epoch": 0.22881828316610925, + "grad_norm": 0.5139552354812622, + "learning_rate": 8e-05, + "loss": 1.9494, + "step": 4105 + }, + { + "epoch": 0.22887402452619843, + "grad_norm": 0.507438063621521, + "learning_rate": 8e-05, + "loss": 1.8153, + "step": 4106 + }, + { + "epoch": 0.22892976588628763, + "grad_norm": 0.5059213638305664, + "learning_rate": 8e-05, + "loss": 1.52, + "step": 4107 + 
}, + { + "epoch": 0.2289855072463768, + "grad_norm": 0.49112826585769653, + "learning_rate": 8e-05, + "loss": 1.8568, + "step": 4108 + }, + { + "epoch": 0.229041248606466, + "grad_norm": 0.5132395029067993, + "learning_rate": 8e-05, + "loss": 1.8098, + "step": 4109 + }, + { + "epoch": 0.22909698996655517, + "grad_norm": 0.47251254320144653, + "learning_rate": 8e-05, + "loss": 1.7584, + "step": 4110 + }, + { + "epoch": 0.22915273132664438, + "grad_norm": 0.4928188920021057, + "learning_rate": 8e-05, + "loss": 1.5482, + "step": 4111 + }, + { + "epoch": 0.22920847268673356, + "grad_norm": 0.5108476877212524, + "learning_rate": 8e-05, + "loss": 1.876, + "step": 4112 + }, + { + "epoch": 0.22926421404682273, + "grad_norm": 0.4915235936641693, + "learning_rate": 8e-05, + "loss": 1.852, + "step": 4113 + }, + { + "epoch": 0.22931995540691194, + "grad_norm": 0.5093597173690796, + "learning_rate": 8e-05, + "loss": 1.6885, + "step": 4114 + }, + { + "epoch": 0.22937569676700112, + "grad_norm": 0.4863458573818207, + "learning_rate": 8e-05, + "loss": 1.6503, + "step": 4115 + }, + { + "epoch": 0.2294314381270903, + "grad_norm": 0.5055260062217712, + "learning_rate": 8e-05, + "loss": 1.8135, + "step": 4116 + }, + { + "epoch": 0.22948717948717948, + "grad_norm": 0.48014530539512634, + "learning_rate": 8e-05, + "loss": 1.6573, + "step": 4117 + }, + { + "epoch": 0.22954292084726868, + "grad_norm": 0.5218930244445801, + "learning_rate": 8e-05, + "loss": 1.8912, + "step": 4118 + }, + { + "epoch": 0.22959866220735786, + "grad_norm": 0.5085465908050537, + "learning_rate": 8e-05, + "loss": 1.6765, + "step": 4119 + }, + { + "epoch": 0.22965440356744704, + "grad_norm": 0.46710649132728577, + "learning_rate": 8e-05, + "loss": 1.6694, + "step": 4120 + }, + { + "epoch": 0.22971014492753622, + "grad_norm": 0.4992454946041107, + "learning_rate": 8e-05, + "loss": 1.6592, + "step": 4121 + }, + { + "epoch": 0.22976588628762543, + "grad_norm": 0.526770293712616, + "learning_rate": 8e-05, + "loss": 
1.738, + "step": 4122 + }, + { + "epoch": 0.2298216276477146, + "grad_norm": 0.4905931055545807, + "learning_rate": 8e-05, + "loss": 1.7881, + "step": 4123 + }, + { + "epoch": 0.22987736900780378, + "grad_norm": 0.5293172597885132, + "learning_rate": 8e-05, + "loss": 1.9267, + "step": 4124 + }, + { + "epoch": 0.229933110367893, + "grad_norm": 0.5025931000709534, + "learning_rate": 8e-05, + "loss": 1.7261, + "step": 4125 + }, + { + "epoch": 0.22998885172798217, + "grad_norm": 0.5392381548881531, + "learning_rate": 8e-05, + "loss": 1.7726, + "step": 4126 + }, + { + "epoch": 0.23004459308807135, + "grad_norm": 0.5503458976745605, + "learning_rate": 8e-05, + "loss": 1.9244, + "step": 4127 + }, + { + "epoch": 0.23010033444816053, + "grad_norm": 0.5248586535453796, + "learning_rate": 8e-05, + "loss": 1.6884, + "step": 4128 + }, + { + "epoch": 0.23015607580824973, + "grad_norm": 0.5418709516525269, + "learning_rate": 8e-05, + "loss": 1.7191, + "step": 4129 + }, + { + "epoch": 0.2302118171683389, + "grad_norm": 0.476408451795578, + "learning_rate": 8e-05, + "loss": 1.433, + "step": 4130 + }, + { + "epoch": 0.2302675585284281, + "grad_norm": 0.4656904935836792, + "learning_rate": 8e-05, + "loss": 1.6336, + "step": 4131 + }, + { + "epoch": 0.23032329988851727, + "grad_norm": 0.5517549514770508, + "learning_rate": 8e-05, + "loss": 1.6893, + "step": 4132 + }, + { + "epoch": 0.23037904124860648, + "grad_norm": 0.473292738199234, + "learning_rate": 8e-05, + "loss": 1.6653, + "step": 4133 + }, + { + "epoch": 0.23043478260869565, + "grad_norm": 0.48487353324890137, + "learning_rate": 8e-05, + "loss": 1.5773, + "step": 4134 + }, + { + "epoch": 0.23049052396878483, + "grad_norm": 0.4657806158065796, + "learning_rate": 8e-05, + "loss": 1.7153, + "step": 4135 + }, + { + "epoch": 0.230546265328874, + "grad_norm": 0.4734620153903961, + "learning_rate": 8e-05, + "loss": 1.6555, + "step": 4136 + }, + { + "epoch": 0.23060200668896322, + "grad_norm": 0.5308153629302979, + "learning_rate": 
8e-05, + "loss": 1.7279, + "step": 4137 + }, + { + "epoch": 0.2306577480490524, + "grad_norm": 0.5166023969650269, + "learning_rate": 8e-05, + "loss": 1.9491, + "step": 4138 + }, + { + "epoch": 0.23071348940914158, + "grad_norm": 0.46642354130744934, + "learning_rate": 8e-05, + "loss": 1.6976, + "step": 4139 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 0.5172951817512512, + "learning_rate": 8e-05, + "loss": 1.889, + "step": 4140 + }, + { + "epoch": 0.23082497212931996, + "grad_norm": 0.48462292551994324, + "learning_rate": 8e-05, + "loss": 1.6684, + "step": 4141 + }, + { + "epoch": 0.23088071348940914, + "grad_norm": 0.5029516816139221, + "learning_rate": 8e-05, + "loss": 1.7295, + "step": 4142 + }, + { + "epoch": 0.23093645484949832, + "grad_norm": 0.4953778386116028, + "learning_rate": 8e-05, + "loss": 1.7833, + "step": 4143 + }, + { + "epoch": 0.23099219620958752, + "grad_norm": 0.5369850397109985, + "learning_rate": 8e-05, + "loss": 1.6728, + "step": 4144 + }, + { + "epoch": 0.2310479375696767, + "grad_norm": 0.45318469405174255, + "learning_rate": 8e-05, + "loss": 1.3945, + "step": 4145 + }, + { + "epoch": 0.23110367892976588, + "grad_norm": 0.4903089106082916, + "learning_rate": 8e-05, + "loss": 1.8122, + "step": 4146 + }, + { + "epoch": 0.23115942028985506, + "grad_norm": 0.48141953349113464, + "learning_rate": 8e-05, + "loss": 1.869, + "step": 4147 + }, + { + "epoch": 0.23121516164994427, + "grad_norm": 0.5402095913887024, + "learning_rate": 8e-05, + "loss": 1.9276, + "step": 4148 + }, + { + "epoch": 0.23127090301003345, + "grad_norm": 0.5884537100791931, + "learning_rate": 8e-05, + "loss": 2.0484, + "step": 4149 + }, + { + "epoch": 0.23132664437012262, + "grad_norm": 0.47551897168159485, + "learning_rate": 8e-05, + "loss": 1.5622, + "step": 4150 + }, + { + "epoch": 0.2313823857302118, + "grad_norm": 0.5315403342247009, + "learning_rate": 8e-05, + "loss": 1.9436, + "step": 4151 + }, + { + "epoch": 0.231438127090301, + "grad_norm": 
0.4857591390609741, + "learning_rate": 8e-05, + "loss": 1.6029, + "step": 4152 + }, + { + "epoch": 0.2314938684503902, + "grad_norm": 0.46490103006362915, + "learning_rate": 8e-05, + "loss": 1.6518, + "step": 4153 + }, + { + "epoch": 0.23154960981047937, + "grad_norm": 0.4752138555049896, + "learning_rate": 8e-05, + "loss": 1.761, + "step": 4154 + }, + { + "epoch": 0.23160535117056857, + "grad_norm": 0.4420930743217468, + "learning_rate": 8e-05, + "loss": 1.5482, + "step": 4155 + }, + { + "epoch": 0.23166109253065775, + "grad_norm": 0.44526705145835876, + "learning_rate": 8e-05, + "loss": 1.4529, + "step": 4156 + }, + { + "epoch": 0.23171683389074693, + "grad_norm": 0.5382043719291687, + "learning_rate": 8e-05, + "loss": 1.729, + "step": 4157 + }, + { + "epoch": 0.2317725752508361, + "grad_norm": 0.5094907283782959, + "learning_rate": 8e-05, + "loss": 1.8522, + "step": 4158 + }, + { + "epoch": 0.23182831661092532, + "grad_norm": 0.48272833228111267, + "learning_rate": 8e-05, + "loss": 1.8283, + "step": 4159 + }, + { + "epoch": 0.2318840579710145, + "grad_norm": 0.47672557830810547, + "learning_rate": 8e-05, + "loss": 1.6897, + "step": 4160 + }, + { + "epoch": 0.23193979933110367, + "grad_norm": 0.5173724889755249, + "learning_rate": 8e-05, + "loss": 1.8552, + "step": 4161 + }, + { + "epoch": 0.23199554069119285, + "grad_norm": 0.5262453556060791, + "learning_rate": 8e-05, + "loss": 1.6888, + "step": 4162 + }, + { + "epoch": 0.23205128205128206, + "grad_norm": 0.5105573534965515, + "learning_rate": 8e-05, + "loss": 1.7991, + "step": 4163 + }, + { + "epoch": 0.23210702341137124, + "grad_norm": 0.5028561949729919, + "learning_rate": 8e-05, + "loss": 1.6346, + "step": 4164 + }, + { + "epoch": 0.23216276477146042, + "grad_norm": 0.4426646828651428, + "learning_rate": 8e-05, + "loss": 1.6081, + "step": 4165 + }, + { + "epoch": 0.23221850613154962, + "grad_norm": 0.5555811524391174, + "learning_rate": 8e-05, + "loss": 1.8258, + "step": 4166 + }, + { + "epoch": 
0.2322742474916388, + "grad_norm": 0.47089752554893494, + "learning_rate": 8e-05, + "loss": 1.7385, + "step": 4167 + }, + { + "epoch": 0.23232998885172798, + "grad_norm": 0.48612332344055176, + "learning_rate": 8e-05, + "loss": 1.7468, + "step": 4168 + }, + { + "epoch": 0.23238573021181716, + "grad_norm": 0.4978078305721283, + "learning_rate": 8e-05, + "loss": 1.74, + "step": 4169 + }, + { + "epoch": 0.23244147157190637, + "grad_norm": 1.1125239133834839, + "learning_rate": 8e-05, + "loss": 1.83, + "step": 4170 + }, + { + "epoch": 0.23249721293199554, + "grad_norm": 0.5372199416160583, + "learning_rate": 8e-05, + "loss": 1.8458, + "step": 4171 + }, + { + "epoch": 0.23255295429208472, + "grad_norm": 0.5097193717956543, + "learning_rate": 8e-05, + "loss": 1.7344, + "step": 4172 + }, + { + "epoch": 0.2326086956521739, + "grad_norm": 0.48618727922439575, + "learning_rate": 8e-05, + "loss": 1.692, + "step": 4173 + }, + { + "epoch": 0.2326644370122631, + "grad_norm": 0.45250874757766724, + "learning_rate": 8e-05, + "loss": 1.719, + "step": 4174 + }, + { + "epoch": 0.2327201783723523, + "grad_norm": 0.4872897267341614, + "learning_rate": 8e-05, + "loss": 1.6807, + "step": 4175 + }, + { + "epoch": 0.23277591973244147, + "grad_norm": 0.4631511867046356, + "learning_rate": 8e-05, + "loss": 1.6109, + "step": 4176 + }, + { + "epoch": 0.23283166109253065, + "grad_norm": 0.5219497084617615, + "learning_rate": 8e-05, + "loss": 1.7462, + "step": 4177 + }, + { + "epoch": 0.23288740245261985, + "grad_norm": 0.4476414620876312, + "learning_rate": 8e-05, + "loss": 1.422, + "step": 4178 + }, + { + "epoch": 0.23294314381270903, + "grad_norm": 0.4813176095485687, + "learning_rate": 8e-05, + "loss": 1.6695, + "step": 4179 + }, + { + "epoch": 0.2329988851727982, + "grad_norm": 0.4781123101711273, + "learning_rate": 8e-05, + "loss": 1.4277, + "step": 4180 + }, + { + "epoch": 0.23305462653288742, + "grad_norm": 0.5305860638618469, + "learning_rate": 8e-05, + "loss": 1.9108, + "step": 4181 + 
}, + { + "epoch": 0.2331103678929766, + "grad_norm": 0.4862423241138458, + "learning_rate": 8e-05, + "loss": 1.5683, + "step": 4182 + }, + { + "epoch": 0.23316610925306577, + "grad_norm": 0.5205289721488953, + "learning_rate": 8e-05, + "loss": 1.8538, + "step": 4183 + }, + { + "epoch": 0.23322185061315495, + "grad_norm": 0.512668251991272, + "learning_rate": 8e-05, + "loss": 1.9319, + "step": 4184 + }, + { + "epoch": 0.23327759197324416, + "grad_norm": 0.4808613061904907, + "learning_rate": 8e-05, + "loss": 1.7602, + "step": 4185 + }, + { + "epoch": 0.23333333333333334, + "grad_norm": 0.506522536277771, + "learning_rate": 8e-05, + "loss": 1.686, + "step": 4186 + }, + { + "epoch": 0.23338907469342252, + "grad_norm": 0.4858236014842987, + "learning_rate": 8e-05, + "loss": 1.5494, + "step": 4187 + }, + { + "epoch": 0.2334448160535117, + "grad_norm": 0.5407388210296631, + "learning_rate": 8e-05, + "loss": 2.1236, + "step": 4188 + }, + { + "epoch": 0.2335005574136009, + "grad_norm": 0.47253403067588806, + "learning_rate": 8e-05, + "loss": 1.6083, + "step": 4189 + }, + { + "epoch": 0.23355629877369008, + "grad_norm": 0.4397057592868805, + "learning_rate": 8e-05, + "loss": 1.3396, + "step": 4190 + }, + { + "epoch": 0.23361204013377926, + "grad_norm": 0.4820716977119446, + "learning_rate": 8e-05, + "loss": 1.6484, + "step": 4191 + }, + { + "epoch": 0.23366778149386844, + "grad_norm": 0.4903406500816345, + "learning_rate": 8e-05, + "loss": 1.5289, + "step": 4192 + }, + { + "epoch": 0.23372352285395764, + "grad_norm": 0.49805960059165955, + "learning_rate": 8e-05, + "loss": 1.7073, + "step": 4193 + }, + { + "epoch": 0.23377926421404682, + "grad_norm": 0.44631659984588623, + "learning_rate": 8e-05, + "loss": 1.3544, + "step": 4194 + }, + { + "epoch": 0.233835005574136, + "grad_norm": 0.5055618286132812, + "learning_rate": 8e-05, + "loss": 1.8731, + "step": 4195 + }, + { + "epoch": 0.2338907469342252, + "grad_norm": 0.4882362186908722, + "learning_rate": 8e-05, + "loss": 
1.6237, + "step": 4196 + }, + { + "epoch": 0.2339464882943144, + "grad_norm": 0.6776293516159058, + "learning_rate": 8e-05, + "loss": 1.8226, + "step": 4197 + }, + { + "epoch": 0.23400222965440357, + "grad_norm": 0.4881233274936676, + "learning_rate": 8e-05, + "loss": 1.7181, + "step": 4198 + }, + { + "epoch": 0.23405797101449274, + "grad_norm": 0.4813898801803589, + "learning_rate": 8e-05, + "loss": 1.5247, + "step": 4199 + }, + { + "epoch": 0.23411371237458195, + "grad_norm": 0.5439554452896118, + "learning_rate": 8e-05, + "loss": 1.8405, + "step": 4200 + }, + { + "epoch": 0.23416945373467113, + "grad_norm": 0.5290339589118958, + "learning_rate": 8e-05, + "loss": 1.6565, + "step": 4201 + }, + { + "epoch": 0.2342251950947603, + "grad_norm": 0.5204188823699951, + "learning_rate": 8e-05, + "loss": 1.8186, + "step": 4202 + }, + { + "epoch": 0.2342809364548495, + "grad_norm": 0.5180361866950989, + "learning_rate": 8e-05, + "loss": 1.7259, + "step": 4203 + }, + { + "epoch": 0.2343366778149387, + "grad_norm": 0.49083536863327026, + "learning_rate": 8e-05, + "loss": 1.5448, + "step": 4204 + }, + { + "epoch": 0.23439241917502787, + "grad_norm": 0.5805219411849976, + "learning_rate": 8e-05, + "loss": 1.8847, + "step": 4205 + }, + { + "epoch": 0.23444816053511705, + "grad_norm": 0.5516480207443237, + "learning_rate": 8e-05, + "loss": 1.9711, + "step": 4206 + }, + { + "epoch": 0.23450390189520623, + "grad_norm": 0.5432198643684387, + "learning_rate": 8e-05, + "loss": 1.7182, + "step": 4207 + }, + { + "epoch": 0.23455964325529544, + "grad_norm": 0.4973669648170471, + "learning_rate": 8e-05, + "loss": 1.7876, + "step": 4208 + }, + { + "epoch": 0.23461538461538461, + "grad_norm": 0.5608261227607727, + "learning_rate": 8e-05, + "loss": 1.932, + "step": 4209 + }, + { + "epoch": 0.2346711259754738, + "grad_norm": 0.49797818064689636, + "learning_rate": 8e-05, + "loss": 1.6952, + "step": 4210 + }, + { + "epoch": 0.234726867335563, + "grad_norm": 0.4905753433704376, + 
"learning_rate": 8e-05, + "loss": 1.6302, + "step": 4211 + }, + { + "epoch": 0.23478260869565218, + "grad_norm": 0.542640209197998, + "learning_rate": 8e-05, + "loss": 1.8917, + "step": 4212 + }, + { + "epoch": 0.23483835005574136, + "grad_norm": 0.5524635314941406, + "learning_rate": 8e-05, + "loss": 1.9545, + "step": 4213 + }, + { + "epoch": 0.23489409141583054, + "grad_norm": 0.47688010334968567, + "learning_rate": 8e-05, + "loss": 1.8556, + "step": 4214 + }, + { + "epoch": 0.23494983277591974, + "grad_norm": 0.5069547891616821, + "learning_rate": 8e-05, + "loss": 1.7025, + "step": 4215 + }, + { + "epoch": 0.23500557413600892, + "grad_norm": 0.5091939568519592, + "learning_rate": 8e-05, + "loss": 1.8823, + "step": 4216 + }, + { + "epoch": 0.2350613154960981, + "grad_norm": 0.46288204193115234, + "learning_rate": 8e-05, + "loss": 1.6151, + "step": 4217 + }, + { + "epoch": 0.23511705685618728, + "grad_norm": 0.502398669719696, + "learning_rate": 8e-05, + "loss": 1.9325, + "step": 4218 + }, + { + "epoch": 0.23517279821627649, + "grad_norm": 0.5230593085289001, + "learning_rate": 8e-05, + "loss": 1.684, + "step": 4219 + }, + { + "epoch": 0.23522853957636566, + "grad_norm": 0.4896896481513977, + "learning_rate": 8e-05, + "loss": 1.5547, + "step": 4220 + }, + { + "epoch": 0.23528428093645484, + "grad_norm": 0.4928077459335327, + "learning_rate": 8e-05, + "loss": 1.6465, + "step": 4221 + }, + { + "epoch": 0.23534002229654405, + "grad_norm": 0.48584556579589844, + "learning_rate": 8e-05, + "loss": 1.7758, + "step": 4222 + }, + { + "epoch": 0.23539576365663323, + "grad_norm": 0.48859500885009766, + "learning_rate": 8e-05, + "loss": 1.5394, + "step": 4223 + }, + { + "epoch": 0.2354515050167224, + "grad_norm": 0.5136126279830933, + "learning_rate": 8e-05, + "loss": 1.7458, + "step": 4224 + }, + { + "epoch": 0.23550724637681159, + "grad_norm": 0.5393681526184082, + "learning_rate": 8e-05, + "loss": 1.7832, + "step": 4225 + }, + { + "epoch": 0.2355629877369008, + 
"grad_norm": 0.5175178050994873, + "learning_rate": 8e-05, + "loss": 1.8735, + "step": 4226 + }, + { + "epoch": 0.23561872909698997, + "grad_norm": 0.4961029887199402, + "learning_rate": 8e-05, + "loss": 1.5048, + "step": 4227 + }, + { + "epoch": 0.23567447045707915, + "grad_norm": 0.5005576610565186, + "learning_rate": 8e-05, + "loss": 1.8192, + "step": 4228 + }, + { + "epoch": 0.23573021181716833, + "grad_norm": 0.49185097217559814, + "learning_rate": 8e-05, + "loss": 1.5498, + "step": 4229 + }, + { + "epoch": 0.23578595317725753, + "grad_norm": 0.48157772421836853, + "learning_rate": 8e-05, + "loss": 1.8148, + "step": 4230 + }, + { + "epoch": 0.2358416945373467, + "grad_norm": 0.5232310891151428, + "learning_rate": 8e-05, + "loss": 2.0586, + "step": 4231 + }, + { + "epoch": 0.2358974358974359, + "grad_norm": 0.5013419389724731, + "learning_rate": 8e-05, + "loss": 1.7316, + "step": 4232 + }, + { + "epoch": 0.23595317725752507, + "grad_norm": 0.5057625770568848, + "learning_rate": 8e-05, + "loss": 1.7509, + "step": 4233 + }, + { + "epoch": 0.23600891861761428, + "grad_norm": 0.45670923590660095, + "learning_rate": 8e-05, + "loss": 1.7861, + "step": 4234 + }, + { + "epoch": 0.23606465997770346, + "grad_norm": 0.497193306684494, + "learning_rate": 8e-05, + "loss": 1.7191, + "step": 4235 + }, + { + "epoch": 0.23612040133779263, + "grad_norm": 0.4746546745300293, + "learning_rate": 8e-05, + "loss": 1.8867, + "step": 4236 + }, + { + "epoch": 0.23617614269788184, + "grad_norm": 0.48446911573410034, + "learning_rate": 8e-05, + "loss": 1.6605, + "step": 4237 + }, + { + "epoch": 0.23623188405797102, + "grad_norm": 0.49240419268608093, + "learning_rate": 8e-05, + "loss": 1.7534, + "step": 4238 + }, + { + "epoch": 0.2362876254180602, + "grad_norm": 0.512628436088562, + "learning_rate": 8e-05, + "loss": 1.51, + "step": 4239 + }, + { + "epoch": 0.23634336677814938, + "grad_norm": 0.4205702841281891, + "learning_rate": 8e-05, + "loss": 1.3157, + "step": 4240 + }, + { + "epoch": 
0.23639910813823858, + "grad_norm": 0.5130642652511597, + "learning_rate": 8e-05, + "loss": 1.7798, + "step": 4241 + }, + { + "epoch": 0.23645484949832776, + "grad_norm": 0.5004068613052368, + "learning_rate": 8e-05, + "loss": 1.694, + "step": 4242 + }, + { + "epoch": 0.23651059085841694, + "grad_norm": 0.5145587921142578, + "learning_rate": 8e-05, + "loss": 1.742, + "step": 4243 + }, + { + "epoch": 0.23656633221850612, + "grad_norm": 0.4868265688419342, + "learning_rate": 8e-05, + "loss": 1.7886, + "step": 4244 + }, + { + "epoch": 0.23662207357859533, + "grad_norm": 0.5196471214294434, + "learning_rate": 8e-05, + "loss": 1.7996, + "step": 4245 + }, + { + "epoch": 0.2366778149386845, + "grad_norm": 0.47576624155044556, + "learning_rate": 8e-05, + "loss": 1.6135, + "step": 4246 + }, + { + "epoch": 0.23673355629877368, + "grad_norm": 0.5461819767951965, + "learning_rate": 8e-05, + "loss": 1.8768, + "step": 4247 + }, + { + "epoch": 0.23678929765886286, + "grad_norm": 0.522162914276123, + "learning_rate": 8e-05, + "loss": 1.8317, + "step": 4248 + }, + { + "epoch": 0.23684503901895207, + "grad_norm": 0.5273149013519287, + "learning_rate": 8e-05, + "loss": 1.8237, + "step": 4249 + }, + { + "epoch": 0.23690078037904125, + "grad_norm": 0.4362734258174896, + "learning_rate": 8e-05, + "loss": 1.4484, + "step": 4250 + }, + { + "epoch": 0.23695652173913043, + "grad_norm": 0.46066081523895264, + "learning_rate": 8e-05, + "loss": 1.5555, + "step": 4251 + }, + { + "epoch": 0.23701226309921963, + "grad_norm": 0.5295606255531311, + "learning_rate": 8e-05, + "loss": 1.7134, + "step": 4252 + }, + { + "epoch": 0.2370680044593088, + "grad_norm": 0.5264022946357727, + "learning_rate": 8e-05, + "loss": 1.8476, + "step": 4253 + }, + { + "epoch": 0.237123745819398, + "grad_norm": 0.4991813600063324, + "learning_rate": 8e-05, + "loss": 1.8835, + "step": 4254 + }, + { + "epoch": 0.23717948717948717, + "grad_norm": 0.48692673444747925, + "learning_rate": 8e-05, + "loss": 1.6272, + "step": 
4255 + }, + { + "epoch": 0.23723522853957638, + "grad_norm": 0.5100888013839722, + "learning_rate": 8e-05, + "loss": 1.8114, + "step": 4256 + }, + { + "epoch": 0.23729096989966555, + "grad_norm": 0.5449693202972412, + "learning_rate": 8e-05, + "loss": 1.89, + "step": 4257 + }, + { + "epoch": 0.23734671125975473, + "grad_norm": 0.479178786277771, + "learning_rate": 8e-05, + "loss": 1.5277, + "step": 4258 + }, + { + "epoch": 0.2374024526198439, + "grad_norm": 0.4832587242126465, + "learning_rate": 8e-05, + "loss": 1.6632, + "step": 4259 + }, + { + "epoch": 0.23745819397993312, + "grad_norm": 0.45674172043800354, + "learning_rate": 8e-05, + "loss": 1.6384, + "step": 4260 + }, + { + "epoch": 0.2375139353400223, + "grad_norm": 0.4978010654449463, + "learning_rate": 8e-05, + "loss": 1.6647, + "step": 4261 + }, + { + "epoch": 0.23756967670011148, + "grad_norm": 0.5177193880081177, + "learning_rate": 8e-05, + "loss": 1.8929, + "step": 4262 + }, + { + "epoch": 0.23762541806020068, + "grad_norm": 0.477351576089859, + "learning_rate": 8e-05, + "loss": 1.7562, + "step": 4263 + }, + { + "epoch": 0.23768115942028986, + "grad_norm": 0.5086522698402405, + "learning_rate": 8e-05, + "loss": 1.751, + "step": 4264 + }, + { + "epoch": 0.23773690078037904, + "grad_norm": 0.4827437102794647, + "learning_rate": 8e-05, + "loss": 1.6273, + "step": 4265 + }, + { + "epoch": 0.23779264214046822, + "grad_norm": 0.5645084977149963, + "learning_rate": 8e-05, + "loss": 1.8631, + "step": 4266 + }, + { + "epoch": 0.23784838350055743, + "grad_norm": 0.48042717576026917, + "learning_rate": 8e-05, + "loss": 1.5488, + "step": 4267 + }, + { + "epoch": 0.2379041248606466, + "grad_norm": 0.5094911456108093, + "learning_rate": 8e-05, + "loss": 1.8086, + "step": 4268 + }, + { + "epoch": 0.23795986622073578, + "grad_norm": 0.533108651638031, + "learning_rate": 8e-05, + "loss": 1.8615, + "step": 4269 + }, + { + "epoch": 0.23801560758082496, + "grad_norm": 0.4914109706878662, + "learning_rate": 8e-05, + "loss": 
1.6501, + "step": 4270 + }, + { + "epoch": 0.23807134894091417, + "grad_norm": 0.45172661542892456, + "learning_rate": 8e-05, + "loss": 1.6222, + "step": 4271 + }, + { + "epoch": 0.23812709030100335, + "grad_norm": 0.5230309963226318, + "learning_rate": 8e-05, + "loss": 1.9301, + "step": 4272 + }, + { + "epoch": 0.23818283166109253, + "grad_norm": 0.5283030867576599, + "learning_rate": 8e-05, + "loss": 1.7618, + "step": 4273 + }, + { + "epoch": 0.2382385730211817, + "grad_norm": 0.529082179069519, + "learning_rate": 8e-05, + "loss": 1.7466, + "step": 4274 + }, + { + "epoch": 0.2382943143812709, + "grad_norm": 0.5190643072128296, + "learning_rate": 8e-05, + "loss": 1.9178, + "step": 4275 + }, + { + "epoch": 0.2383500557413601, + "grad_norm": 0.5080832839012146, + "learning_rate": 8e-05, + "loss": 1.6613, + "step": 4276 + }, + { + "epoch": 0.23840579710144927, + "grad_norm": 0.574957549571991, + "learning_rate": 8e-05, + "loss": 1.8489, + "step": 4277 + }, + { + "epoch": 0.23846153846153847, + "grad_norm": 0.5026995539665222, + "learning_rate": 8e-05, + "loss": 1.5338, + "step": 4278 + }, + { + "epoch": 0.23851727982162765, + "grad_norm": 0.5039991140365601, + "learning_rate": 8e-05, + "loss": 1.7046, + "step": 4279 + }, + { + "epoch": 0.23857302118171683, + "grad_norm": 0.44856736063957214, + "learning_rate": 8e-05, + "loss": 1.551, + "step": 4280 + }, + { + "epoch": 0.238628762541806, + "grad_norm": 0.5068076252937317, + "learning_rate": 8e-05, + "loss": 1.7688, + "step": 4281 + }, + { + "epoch": 0.23868450390189522, + "grad_norm": 0.5426245331764221, + "learning_rate": 8e-05, + "loss": 1.8139, + "step": 4282 + }, + { + "epoch": 0.2387402452619844, + "grad_norm": 0.4990077316761017, + "learning_rate": 8e-05, + "loss": 1.8424, + "step": 4283 + }, + { + "epoch": 0.23879598662207357, + "grad_norm": 0.48569685220718384, + "learning_rate": 8e-05, + "loss": 1.7252, + "step": 4284 + }, + { + "epoch": 0.23885172798216275, + "grad_norm": 0.4463791847229004, + 
"learning_rate": 8e-05, + "loss": 1.5896, + "step": 4285 + }, + { + "epoch": 0.23890746934225196, + "grad_norm": 0.4885939657688141, + "learning_rate": 8e-05, + "loss": 1.6246, + "step": 4286 + }, + { + "epoch": 0.23896321070234114, + "grad_norm": 0.5006809234619141, + "learning_rate": 8e-05, + "loss": 1.5473, + "step": 4287 + }, + { + "epoch": 0.23901895206243032, + "grad_norm": 0.4943895637989044, + "learning_rate": 8e-05, + "loss": 1.7773, + "step": 4288 + }, + { + "epoch": 0.2390746934225195, + "grad_norm": 0.4957701861858368, + "learning_rate": 8e-05, + "loss": 1.8107, + "step": 4289 + }, + { + "epoch": 0.2391304347826087, + "grad_norm": 0.4890899360179901, + "learning_rate": 8e-05, + "loss": 1.6751, + "step": 4290 + }, + { + "epoch": 0.23918617614269788, + "grad_norm": 0.512084424495697, + "learning_rate": 8e-05, + "loss": 1.6399, + "step": 4291 + }, + { + "epoch": 0.23924191750278706, + "grad_norm": 0.5295706391334534, + "learning_rate": 8e-05, + "loss": 1.8503, + "step": 4292 + }, + { + "epoch": 0.23929765886287627, + "grad_norm": 0.49469539523124695, + "learning_rate": 8e-05, + "loss": 1.5238, + "step": 4293 + }, + { + "epoch": 0.23935340022296545, + "grad_norm": 0.48040327429771423, + "learning_rate": 8e-05, + "loss": 1.7763, + "step": 4294 + }, + { + "epoch": 0.23940914158305462, + "grad_norm": 0.4731602668762207, + "learning_rate": 8e-05, + "loss": 1.7902, + "step": 4295 + }, + { + "epoch": 0.2394648829431438, + "grad_norm": 0.5192638635635376, + "learning_rate": 8e-05, + "loss": 1.8651, + "step": 4296 + }, + { + "epoch": 0.239520624303233, + "grad_norm": 0.5231795310974121, + "learning_rate": 8e-05, + "loss": 1.8744, + "step": 4297 + }, + { + "epoch": 0.2395763656633222, + "grad_norm": 0.5153510570526123, + "learning_rate": 8e-05, + "loss": 1.9705, + "step": 4298 + }, + { + "epoch": 0.23963210702341137, + "grad_norm": 0.48195499181747437, + "learning_rate": 8e-05, + "loss": 1.7613, + "step": 4299 + }, + { + "epoch": 0.23968784838350055, + "grad_norm": 
0.5296260118484497, + "learning_rate": 8e-05, + "loss": 1.9392, + "step": 4300 + }, + { + "epoch": 0.23974358974358975, + "grad_norm": 0.4738166928291321, + "learning_rate": 8e-05, + "loss": 1.6396, + "step": 4301 + }, + { + "epoch": 0.23979933110367893, + "grad_norm": 0.4884643256664276, + "learning_rate": 8e-05, + "loss": 1.8827, + "step": 4302 + }, + { + "epoch": 0.2398550724637681, + "grad_norm": 0.4728654623031616, + "learning_rate": 8e-05, + "loss": 1.7101, + "step": 4303 + }, + { + "epoch": 0.2399108138238573, + "grad_norm": 0.4706076383590698, + "learning_rate": 8e-05, + "loss": 1.7652, + "step": 4304 + }, + { + "epoch": 0.2399665551839465, + "grad_norm": 0.4969691336154938, + "learning_rate": 8e-05, + "loss": 1.942, + "step": 4305 + }, + { + "epoch": 0.24002229654403567, + "grad_norm": 0.47131776809692383, + "learning_rate": 8e-05, + "loss": 1.725, + "step": 4306 + }, + { + "epoch": 0.24007803790412485, + "grad_norm": 0.5014018416404724, + "learning_rate": 8e-05, + "loss": 1.737, + "step": 4307 + }, + { + "epoch": 0.24013377926421406, + "grad_norm": 0.54756098985672, + "learning_rate": 8e-05, + "loss": 1.7052, + "step": 4308 + }, + { + "epoch": 0.24018952062430324, + "grad_norm": 0.45123088359832764, + "learning_rate": 8e-05, + "loss": 1.4678, + "step": 4309 + }, + { + "epoch": 0.24024526198439242, + "grad_norm": 0.4705629050731659, + "learning_rate": 8e-05, + "loss": 1.6367, + "step": 4310 + }, + { + "epoch": 0.2403010033444816, + "grad_norm": 0.4728585481643677, + "learning_rate": 8e-05, + "loss": 1.4548, + "step": 4311 + }, + { + "epoch": 0.2403567447045708, + "grad_norm": 0.4710177481174469, + "learning_rate": 8e-05, + "loss": 1.6581, + "step": 4312 + }, + { + "epoch": 0.24041248606465998, + "grad_norm": 0.5417190790176392, + "learning_rate": 8e-05, + "loss": 1.796, + "step": 4313 + }, + { + "epoch": 0.24046822742474916, + "grad_norm": 0.47374585270881653, + "learning_rate": 8e-05, + "loss": 1.6996, + "step": 4314 + }, + { + "epoch": 
0.24052396878483834, + "grad_norm": 0.5024921298027039, + "learning_rate": 8e-05, + "loss": 1.4732, + "step": 4315 + }, + { + "epoch": 0.24057971014492754, + "grad_norm": 0.5105860829353333, + "learning_rate": 8e-05, + "loss": 1.8446, + "step": 4316 + }, + { + "epoch": 0.24063545150501672, + "grad_norm": 0.4570033550262451, + "learning_rate": 8e-05, + "loss": 1.614, + "step": 4317 + }, + { + "epoch": 0.2406911928651059, + "grad_norm": 0.4843344986438751, + "learning_rate": 8e-05, + "loss": 1.6607, + "step": 4318 + }, + { + "epoch": 0.2407469342251951, + "grad_norm": 0.5121851563453674, + "learning_rate": 8e-05, + "loss": 1.848, + "step": 4319 + }, + { + "epoch": 0.2408026755852843, + "grad_norm": 0.5036677122116089, + "learning_rate": 8e-05, + "loss": 1.633, + "step": 4320 + }, + { + "epoch": 0.24085841694537347, + "grad_norm": 0.497364342212677, + "learning_rate": 8e-05, + "loss": 1.778, + "step": 4321 + }, + { + "epoch": 0.24091415830546264, + "grad_norm": 0.48959243297576904, + "learning_rate": 8e-05, + "loss": 1.7664, + "step": 4322 + }, + { + "epoch": 0.24096989966555185, + "grad_norm": 0.5679601430892944, + "learning_rate": 8e-05, + "loss": 2.0067, + "step": 4323 + }, + { + "epoch": 0.24102564102564103, + "grad_norm": 0.48870909214019775, + "learning_rate": 8e-05, + "loss": 1.6651, + "step": 4324 + }, + { + "epoch": 0.2410813823857302, + "grad_norm": 0.5037427544593811, + "learning_rate": 8e-05, + "loss": 1.6614, + "step": 4325 + }, + { + "epoch": 0.2411371237458194, + "grad_norm": 0.4996836185455322, + "learning_rate": 8e-05, + "loss": 1.5968, + "step": 4326 + }, + { + "epoch": 0.2411928651059086, + "grad_norm": 0.47124508023262024, + "learning_rate": 8e-05, + "loss": 1.7196, + "step": 4327 + }, + { + "epoch": 0.24124860646599777, + "grad_norm": 0.45888665318489075, + "learning_rate": 8e-05, + "loss": 1.5474, + "step": 4328 + }, + { + "epoch": 0.24130434782608695, + "grad_norm": 0.48495161533355713, + "learning_rate": 8e-05, + "loss": 1.6149, + "step": 4329 
+ }, + { + "epoch": 0.24136008918617613, + "grad_norm": 0.5664768218994141, + "learning_rate": 8e-05, + "loss": 1.5396, + "step": 4330 + }, + { + "epoch": 0.24141583054626534, + "grad_norm": 0.4835517704486847, + "learning_rate": 8e-05, + "loss": 1.6441, + "step": 4331 + }, + { + "epoch": 0.24147157190635452, + "grad_norm": 0.4654442071914673, + "learning_rate": 8e-05, + "loss": 1.5344, + "step": 4332 + }, + { + "epoch": 0.2415273132664437, + "grad_norm": 0.5004253387451172, + "learning_rate": 8e-05, + "loss": 1.7656, + "step": 4333 + }, + { + "epoch": 0.2415830546265329, + "grad_norm": 0.4810848832130432, + "learning_rate": 8e-05, + "loss": 1.7206, + "step": 4334 + }, + { + "epoch": 0.24163879598662208, + "grad_norm": 0.4527301490306854, + "learning_rate": 8e-05, + "loss": 1.5351, + "step": 4335 + }, + { + "epoch": 0.24169453734671126, + "grad_norm": 0.504363477230072, + "learning_rate": 8e-05, + "loss": 1.8366, + "step": 4336 + }, + { + "epoch": 0.24175027870680044, + "grad_norm": 0.49054351449012756, + "learning_rate": 8e-05, + "loss": 1.5619, + "step": 4337 + }, + { + "epoch": 0.24180602006688964, + "grad_norm": 0.5057128071784973, + "learning_rate": 8e-05, + "loss": 1.7013, + "step": 4338 + }, + { + "epoch": 0.24186176142697882, + "grad_norm": 0.5352655649185181, + "learning_rate": 8e-05, + "loss": 1.9699, + "step": 4339 + }, + { + "epoch": 0.241917502787068, + "grad_norm": 0.5110335946083069, + "learning_rate": 8e-05, + "loss": 1.8174, + "step": 4340 + }, + { + "epoch": 0.24197324414715718, + "grad_norm": 0.4975016415119171, + "learning_rate": 8e-05, + "loss": 1.6685, + "step": 4341 + }, + { + "epoch": 0.24202898550724639, + "grad_norm": 0.4963131248950958, + "learning_rate": 8e-05, + "loss": 1.5433, + "step": 4342 + }, + { + "epoch": 0.24208472686733556, + "grad_norm": 0.5176040530204773, + "learning_rate": 8e-05, + "loss": 1.7958, + "step": 4343 + }, + { + "epoch": 0.24214046822742474, + "grad_norm": 0.49720054864883423, + "learning_rate": 8e-05, + "loss": 
1.7171, + "step": 4344 + }, + { + "epoch": 0.24219620958751392, + "grad_norm": 0.4979418218135834, + "learning_rate": 8e-05, + "loss": 1.8272, + "step": 4345 + }, + { + "epoch": 0.24225195094760313, + "grad_norm": 0.4736308753490448, + "learning_rate": 8e-05, + "loss": 1.4816, + "step": 4346 + }, + { + "epoch": 0.2423076923076923, + "grad_norm": 0.48848211765289307, + "learning_rate": 8e-05, + "loss": 1.7337, + "step": 4347 + }, + { + "epoch": 0.24236343366778149, + "grad_norm": 0.4813627302646637, + "learning_rate": 8e-05, + "loss": 1.6464, + "step": 4348 + }, + { + "epoch": 0.2424191750278707, + "grad_norm": 0.5645063519477844, + "learning_rate": 8e-05, + "loss": 1.801, + "step": 4349 + }, + { + "epoch": 0.24247491638795987, + "grad_norm": 0.5068780779838562, + "learning_rate": 8e-05, + "loss": 1.6917, + "step": 4350 + }, + { + "epoch": 0.24253065774804905, + "grad_norm": 0.5032528042793274, + "learning_rate": 8e-05, + "loss": 1.8905, + "step": 4351 + }, + { + "epoch": 0.24258639910813823, + "grad_norm": 0.4853366017341614, + "learning_rate": 8e-05, + "loss": 1.6199, + "step": 4352 + }, + { + "epoch": 0.24264214046822744, + "grad_norm": 0.49789735674858093, + "learning_rate": 8e-05, + "loss": 1.6751, + "step": 4353 + }, + { + "epoch": 0.24269788182831661, + "grad_norm": 0.48519742488861084, + "learning_rate": 8e-05, + "loss": 1.6459, + "step": 4354 + }, + { + "epoch": 0.2427536231884058, + "grad_norm": 0.48001378774642944, + "learning_rate": 8e-05, + "loss": 1.5651, + "step": 4355 + }, + { + "epoch": 0.24280936454849497, + "grad_norm": 0.48040971159935, + "learning_rate": 8e-05, + "loss": 1.7485, + "step": 4356 + }, + { + "epoch": 0.24286510590858418, + "grad_norm": 0.48989880084991455, + "learning_rate": 8e-05, + "loss": 1.6495, + "step": 4357 + }, + { + "epoch": 0.24292084726867336, + "grad_norm": 0.4950062334537506, + "learning_rate": 8e-05, + "loss": 1.5592, + "step": 4358 + }, + { + "epoch": 0.24297658862876254, + "grad_norm": 0.5052546262741089, + 
"learning_rate": 8e-05, + "loss": 1.7054, + "step": 4359 + }, + { + "epoch": 0.24303232998885171, + "grad_norm": 0.49012768268585205, + "learning_rate": 8e-05, + "loss": 1.8105, + "step": 4360 + }, + { + "epoch": 0.24308807134894092, + "grad_norm": 0.5038552284240723, + "learning_rate": 8e-05, + "loss": 1.7451, + "step": 4361 + }, + { + "epoch": 0.2431438127090301, + "grad_norm": 0.5328361392021179, + "learning_rate": 8e-05, + "loss": 1.8975, + "step": 4362 + }, + { + "epoch": 0.24319955406911928, + "grad_norm": 0.5039812326431274, + "learning_rate": 8e-05, + "loss": 1.6608, + "step": 4363 + }, + { + "epoch": 0.24325529542920848, + "grad_norm": 0.497497022151947, + "learning_rate": 8e-05, + "loss": 1.6421, + "step": 4364 + }, + { + "epoch": 0.24331103678929766, + "grad_norm": 0.5225549340248108, + "learning_rate": 8e-05, + "loss": 1.6948, + "step": 4365 + }, + { + "epoch": 0.24336677814938684, + "grad_norm": 0.4995681643486023, + "learning_rate": 8e-05, + "loss": 1.4985, + "step": 4366 + }, + { + "epoch": 0.24342251950947602, + "grad_norm": 0.48537102341651917, + "learning_rate": 8e-05, + "loss": 1.5165, + "step": 4367 + }, + { + "epoch": 0.24347826086956523, + "grad_norm": 0.511480987071991, + "learning_rate": 8e-05, + "loss": 1.7659, + "step": 4368 + }, + { + "epoch": 0.2435340022296544, + "grad_norm": 0.4678894281387329, + "learning_rate": 8e-05, + "loss": 1.6623, + "step": 4369 + }, + { + "epoch": 0.24358974358974358, + "grad_norm": 0.4894356429576874, + "learning_rate": 8e-05, + "loss": 1.7462, + "step": 4370 + }, + { + "epoch": 0.24364548494983276, + "grad_norm": 0.5064272880554199, + "learning_rate": 8e-05, + "loss": 1.7033, + "step": 4371 + }, + { + "epoch": 0.24370122630992197, + "grad_norm": 0.5376918911933899, + "learning_rate": 8e-05, + "loss": 1.7259, + "step": 4372 + }, + { + "epoch": 0.24375696767001115, + "grad_norm": 0.5779525637626648, + "learning_rate": 8e-05, + "loss": 1.819, + "step": 4373 + }, + { + "epoch": 0.24381270903010033, + "grad_norm": 
0.4810136556625366, + "learning_rate": 8e-05, + "loss": 1.5904, + "step": 4374 + }, + { + "epoch": 0.24386845039018953, + "grad_norm": 0.5844541192054749, + "learning_rate": 8e-05, + "loss": 1.7991, + "step": 4375 + }, + { + "epoch": 0.2439241917502787, + "grad_norm": 0.5359873175621033, + "learning_rate": 8e-05, + "loss": 1.8402, + "step": 4376 + }, + { + "epoch": 0.2439799331103679, + "grad_norm": 0.5614349842071533, + "learning_rate": 8e-05, + "loss": 1.6488, + "step": 4377 + }, + { + "epoch": 0.24403567447045707, + "grad_norm": 0.5228754281997681, + "learning_rate": 8e-05, + "loss": 1.7735, + "step": 4378 + }, + { + "epoch": 0.24409141583054628, + "grad_norm": 0.5935977697372437, + "learning_rate": 8e-05, + "loss": 2.014, + "step": 4379 + }, + { + "epoch": 0.24414715719063546, + "grad_norm": 0.510930597782135, + "learning_rate": 8e-05, + "loss": 1.843, + "step": 4380 + }, + { + "epoch": 0.24420289855072463, + "grad_norm": 0.4812546670436859, + "learning_rate": 8e-05, + "loss": 1.6445, + "step": 4381 + }, + { + "epoch": 0.2442586399108138, + "grad_norm": 0.49154365062713623, + "learning_rate": 8e-05, + "loss": 1.6832, + "step": 4382 + }, + { + "epoch": 0.24431438127090302, + "grad_norm": 0.4897598624229431, + "learning_rate": 8e-05, + "loss": 1.709, + "step": 4383 + }, + { + "epoch": 0.2443701226309922, + "grad_norm": 0.51563960313797, + "learning_rate": 8e-05, + "loss": 1.7739, + "step": 4384 + }, + { + "epoch": 0.24442586399108138, + "grad_norm": 0.5068482756614685, + "learning_rate": 8e-05, + "loss": 1.8145, + "step": 4385 + }, + { + "epoch": 0.24448160535117056, + "grad_norm": 0.47429415583610535, + "learning_rate": 8e-05, + "loss": 1.5825, + "step": 4386 + }, + { + "epoch": 0.24453734671125976, + "grad_norm": 0.5053471922874451, + "learning_rate": 8e-05, + "loss": 1.8622, + "step": 4387 + }, + { + "epoch": 0.24459308807134894, + "grad_norm": 0.479079008102417, + "learning_rate": 8e-05, + "loss": 1.6088, + "step": 4388 + }, + { + "epoch": 
0.24464882943143812, + "grad_norm": 0.4736523926258087, + "learning_rate": 8e-05, + "loss": 1.6683, + "step": 4389 + }, + { + "epoch": 0.24470457079152733, + "grad_norm": 0.5143937468528748, + "learning_rate": 8e-05, + "loss": 1.7335, + "step": 4390 + }, + { + "epoch": 0.2447603121516165, + "grad_norm": 0.5011566877365112, + "learning_rate": 8e-05, + "loss": 1.8322, + "step": 4391 + }, + { + "epoch": 0.24481605351170568, + "grad_norm": 0.4936258792877197, + "learning_rate": 8e-05, + "loss": 1.5957, + "step": 4392 + }, + { + "epoch": 0.24487179487179486, + "grad_norm": 0.49569499492645264, + "learning_rate": 8e-05, + "loss": 1.7342, + "step": 4393 + }, + { + "epoch": 0.24492753623188407, + "grad_norm": 0.5097708702087402, + "learning_rate": 8e-05, + "loss": 1.7159, + "step": 4394 + }, + { + "epoch": 0.24498327759197325, + "grad_norm": 0.500952959060669, + "learning_rate": 8e-05, + "loss": 1.7111, + "step": 4395 + }, + { + "epoch": 0.24503901895206243, + "grad_norm": 0.5259371995925903, + "learning_rate": 8e-05, + "loss": 1.7677, + "step": 4396 + }, + { + "epoch": 0.2450947603121516, + "grad_norm": 0.47353237867355347, + "learning_rate": 8e-05, + "loss": 1.6109, + "step": 4397 + }, + { + "epoch": 0.2451505016722408, + "grad_norm": 0.5153634548187256, + "learning_rate": 8e-05, + "loss": 1.8373, + "step": 4398 + }, + { + "epoch": 0.24520624303233, + "grad_norm": 0.5812398791313171, + "learning_rate": 8e-05, + "loss": 1.977, + "step": 4399 + }, + { + "epoch": 0.24526198439241917, + "grad_norm": 0.48786085844039917, + "learning_rate": 8e-05, + "loss": 1.4989, + "step": 4400 + }, + { + "epoch": 0.24531772575250835, + "grad_norm": 0.46562907099723816, + "learning_rate": 8e-05, + "loss": 1.5978, + "step": 4401 + }, + { + "epoch": 0.24537346711259755, + "grad_norm": 0.5388596653938293, + "learning_rate": 8e-05, + "loss": 1.8173, + "step": 4402 + }, + { + "epoch": 0.24542920847268673, + "grad_norm": 0.4896782636642456, + "learning_rate": 8e-05, + "loss": 1.5591, + "step": 
4403 + }, + { + "epoch": 0.2454849498327759, + "grad_norm": 0.5358275771141052, + "learning_rate": 8e-05, + "loss": 1.979, + "step": 4404 + }, + { + "epoch": 0.24554069119286512, + "grad_norm": 0.4867069125175476, + "learning_rate": 8e-05, + "loss": 1.7874, + "step": 4405 + }, + { + "epoch": 0.2455964325529543, + "grad_norm": 0.523019015789032, + "learning_rate": 8e-05, + "loss": 1.7394, + "step": 4406 + }, + { + "epoch": 0.24565217391304348, + "grad_norm": 0.5033044219017029, + "learning_rate": 8e-05, + "loss": 1.7044, + "step": 4407 + }, + { + "epoch": 0.24570791527313265, + "grad_norm": 0.5341840982437134, + "learning_rate": 8e-05, + "loss": 1.7647, + "step": 4408 + }, + { + "epoch": 0.24576365663322186, + "grad_norm": 0.52601158618927, + "learning_rate": 8e-05, + "loss": 1.8909, + "step": 4409 + }, + { + "epoch": 0.24581939799331104, + "grad_norm": 0.46190014481544495, + "learning_rate": 8e-05, + "loss": 1.5464, + "step": 4410 + }, + { + "epoch": 0.24587513935340022, + "grad_norm": 0.4728126525878906, + "learning_rate": 8e-05, + "loss": 1.5892, + "step": 4411 + }, + { + "epoch": 0.2459308807134894, + "grad_norm": 0.5146960616111755, + "learning_rate": 8e-05, + "loss": 1.8503, + "step": 4412 + }, + { + "epoch": 0.2459866220735786, + "grad_norm": 0.4900292754173279, + "learning_rate": 8e-05, + "loss": 1.6651, + "step": 4413 + }, + { + "epoch": 0.24604236343366778, + "grad_norm": 0.5353087782859802, + "learning_rate": 8e-05, + "loss": 1.9005, + "step": 4414 + }, + { + "epoch": 0.24609810479375696, + "grad_norm": 0.5150728821754456, + "learning_rate": 8e-05, + "loss": 1.8399, + "step": 4415 + }, + { + "epoch": 0.24615384615384617, + "grad_norm": 0.46863245964050293, + "learning_rate": 8e-05, + "loss": 1.639, + "step": 4416 + }, + { + "epoch": 0.24620958751393535, + "grad_norm": 0.47680985927581787, + "learning_rate": 8e-05, + "loss": 1.7233, + "step": 4417 + }, + { + "epoch": 0.24626532887402452, + "grad_norm": 0.5293520092964172, + "learning_rate": 8e-05, + 
"loss": 1.6824, + "step": 4418 + }, + { + "epoch": 0.2463210702341137, + "grad_norm": 0.4874856770038605, + "learning_rate": 8e-05, + "loss": 1.5342, + "step": 4419 + }, + { + "epoch": 0.2463768115942029, + "grad_norm": 0.5101845860481262, + "learning_rate": 8e-05, + "loss": 1.4651, + "step": 4420 + }, + { + "epoch": 0.2464325529542921, + "grad_norm": 0.502173900604248, + "learning_rate": 8e-05, + "loss": 1.6227, + "step": 4421 + }, + { + "epoch": 0.24648829431438127, + "grad_norm": 0.49891185760498047, + "learning_rate": 8e-05, + "loss": 1.7795, + "step": 4422 + }, + { + "epoch": 0.24654403567447045, + "grad_norm": 0.5041410326957703, + "learning_rate": 8e-05, + "loss": 1.728, + "step": 4423 + }, + { + "epoch": 0.24659977703455965, + "grad_norm": 0.5070465803146362, + "learning_rate": 8e-05, + "loss": 1.5741, + "step": 4424 + }, + { + "epoch": 0.24665551839464883, + "grad_norm": 0.5117592215538025, + "learning_rate": 8e-05, + "loss": 1.6389, + "step": 4425 + }, + { + "epoch": 0.246711259754738, + "grad_norm": 0.486873060464859, + "learning_rate": 8e-05, + "loss": 1.5909, + "step": 4426 + }, + { + "epoch": 0.2467670011148272, + "grad_norm": 0.5160625576972961, + "learning_rate": 8e-05, + "loss": 1.9141, + "step": 4427 + }, + { + "epoch": 0.2468227424749164, + "grad_norm": 0.4876449704170227, + "learning_rate": 8e-05, + "loss": 1.7166, + "step": 4428 + }, + { + "epoch": 0.24687848383500557, + "grad_norm": 0.4946858882904053, + "learning_rate": 8e-05, + "loss": 1.6845, + "step": 4429 + }, + { + "epoch": 0.24693422519509475, + "grad_norm": 0.4787217080593109, + "learning_rate": 8e-05, + "loss": 1.5846, + "step": 4430 + }, + { + "epoch": 0.24698996655518396, + "grad_norm": 0.4708745777606964, + "learning_rate": 8e-05, + "loss": 1.4559, + "step": 4431 + }, + { + "epoch": 0.24704570791527314, + "grad_norm": 0.5231176614761353, + "learning_rate": 8e-05, + "loss": 1.7047, + "step": 4432 + }, + { + "epoch": 0.24710144927536232, + "grad_norm": 0.4752837121486664, + 
"learning_rate": 8e-05, + "loss": 1.7061, + "step": 4433 + }, + { + "epoch": 0.2471571906354515, + "grad_norm": 0.5410003662109375, + "learning_rate": 8e-05, + "loss": 1.8634, + "step": 4434 + }, + { + "epoch": 0.2472129319955407, + "grad_norm": 0.48589983582496643, + "learning_rate": 8e-05, + "loss": 1.6513, + "step": 4435 + }, + { + "epoch": 0.24726867335562988, + "grad_norm": 0.561657190322876, + "learning_rate": 8e-05, + "loss": 1.9844, + "step": 4436 + }, + { + "epoch": 0.24732441471571906, + "grad_norm": 0.4805437922477722, + "learning_rate": 8e-05, + "loss": 1.5661, + "step": 4437 + }, + { + "epoch": 0.24738015607580824, + "grad_norm": 0.5096524953842163, + "learning_rate": 8e-05, + "loss": 1.809, + "step": 4438 + }, + { + "epoch": 0.24743589743589745, + "grad_norm": 0.5464960336685181, + "learning_rate": 8e-05, + "loss": 1.9204, + "step": 4439 + }, + { + "epoch": 0.24749163879598662, + "grad_norm": 0.4835813641548157, + "learning_rate": 8e-05, + "loss": 1.6065, + "step": 4440 + }, + { + "epoch": 0.2475473801560758, + "grad_norm": 0.4568433165550232, + "learning_rate": 8e-05, + "loss": 1.2585, + "step": 4441 + }, + { + "epoch": 0.24760312151616498, + "grad_norm": 0.5256950855255127, + "learning_rate": 8e-05, + "loss": 1.6795, + "step": 4442 + }, + { + "epoch": 0.2476588628762542, + "grad_norm": 0.4812221825122833, + "learning_rate": 8e-05, + "loss": 2.0234, + "step": 4443 + }, + { + "epoch": 0.24771460423634337, + "grad_norm": 0.49150028824806213, + "learning_rate": 8e-05, + "loss": 1.7599, + "step": 4444 + }, + { + "epoch": 0.24777034559643255, + "grad_norm": 0.5113860368728638, + "learning_rate": 8e-05, + "loss": 1.5278, + "step": 4445 + }, + { + "epoch": 0.24782608695652175, + "grad_norm": 0.497530460357666, + "learning_rate": 8e-05, + "loss": 1.6159, + "step": 4446 + }, + { + "epoch": 0.24788182831661093, + "grad_norm": 0.5291724801063538, + "learning_rate": 8e-05, + "loss": 1.4563, + "step": 4447 + }, + { + "epoch": 0.2479375696767001, + "grad_norm": 
0.52456134557724, + "learning_rate": 8e-05, + "loss": 1.8435, + "step": 4448 + }, + { + "epoch": 0.2479933110367893, + "grad_norm": 0.4777376353740692, + "learning_rate": 8e-05, + "loss": 1.7825, + "step": 4449 + }, + { + "epoch": 0.2480490523968785, + "grad_norm": 0.4933377504348755, + "learning_rate": 8e-05, + "loss": 1.8244, + "step": 4450 + }, + { + "epoch": 0.24810479375696767, + "grad_norm": 0.4991428256034851, + "learning_rate": 8e-05, + "loss": 1.6007, + "step": 4451 + }, + { + "epoch": 0.24816053511705685, + "grad_norm": 0.4872720241546631, + "learning_rate": 8e-05, + "loss": 1.7075, + "step": 4452 + }, + { + "epoch": 0.24821627647714603, + "grad_norm": 0.511841893196106, + "learning_rate": 8e-05, + "loss": 1.9252, + "step": 4453 + }, + { + "epoch": 0.24827201783723524, + "grad_norm": 0.51175457239151, + "learning_rate": 8e-05, + "loss": 1.6001, + "step": 4454 + }, + { + "epoch": 0.24832775919732442, + "grad_norm": 0.48516520857810974, + "learning_rate": 8e-05, + "loss": 1.6602, + "step": 4455 + }, + { + "epoch": 0.2483835005574136, + "grad_norm": 0.4751116931438446, + "learning_rate": 8e-05, + "loss": 1.6331, + "step": 4456 + }, + { + "epoch": 0.24843924191750277, + "grad_norm": 0.4790758490562439, + "learning_rate": 8e-05, + "loss": 1.6961, + "step": 4457 + }, + { + "epoch": 0.24849498327759198, + "grad_norm": 0.536362886428833, + "learning_rate": 8e-05, + "loss": 1.8841, + "step": 4458 + }, + { + "epoch": 0.24855072463768116, + "grad_norm": 0.4993929862976074, + "learning_rate": 8e-05, + "loss": 1.6816, + "step": 4459 + }, + { + "epoch": 0.24860646599777034, + "grad_norm": 0.5037083029747009, + "learning_rate": 8e-05, + "loss": 1.877, + "step": 4460 + }, + { + "epoch": 0.24866220735785954, + "grad_norm": 0.5280500650405884, + "learning_rate": 8e-05, + "loss": 1.884, + "step": 4461 + }, + { + "epoch": 0.24871794871794872, + "grad_norm": 0.5553090572357178, + "learning_rate": 8e-05, + "loss": 1.8299, + "step": 4462 + }, + { + "epoch": 0.2487736900780379, 
+ "grad_norm": 0.49785253405570984, + "learning_rate": 8e-05, + "loss": 1.7691, + "step": 4463 + }, + { + "epoch": 0.24882943143812708, + "grad_norm": 0.5308324694633484, + "learning_rate": 8e-05, + "loss": 1.8467, + "step": 4464 + }, + { + "epoch": 0.2488851727982163, + "grad_norm": 0.5013288259506226, + "learning_rate": 8e-05, + "loss": 1.714, + "step": 4465 + }, + { + "epoch": 0.24894091415830547, + "grad_norm": 0.49870118498802185, + "learning_rate": 8e-05, + "loss": 1.8183, + "step": 4466 + }, + { + "epoch": 0.24899665551839464, + "grad_norm": 0.49616575241088867, + "learning_rate": 8e-05, + "loss": 1.5885, + "step": 4467 + }, + { + "epoch": 0.24905239687848382, + "grad_norm": 0.5420321226119995, + "learning_rate": 8e-05, + "loss": 1.8196, + "step": 4468 + }, + { + "epoch": 0.24910813823857303, + "grad_norm": 0.5438189506530762, + "learning_rate": 8e-05, + "loss": 1.8031, + "step": 4469 + }, + { + "epoch": 0.2491638795986622, + "grad_norm": 0.5136116743087769, + "learning_rate": 8e-05, + "loss": 1.7156, + "step": 4470 + }, + { + "epoch": 0.2492196209587514, + "grad_norm": 0.4942511320114136, + "learning_rate": 8e-05, + "loss": 1.6699, + "step": 4471 + }, + { + "epoch": 0.2492753623188406, + "grad_norm": 0.4945601224899292, + "learning_rate": 8e-05, + "loss": 1.6535, + "step": 4472 + }, + { + "epoch": 0.24933110367892977, + "grad_norm": 0.48275116086006165, + "learning_rate": 8e-05, + "loss": 1.583, + "step": 4473 + }, + { + "epoch": 0.24938684503901895, + "grad_norm": 0.4992966651916504, + "learning_rate": 8e-05, + "loss": 1.6477, + "step": 4474 + }, + { + "epoch": 0.24944258639910813, + "grad_norm": 0.4911842942237854, + "learning_rate": 8e-05, + "loss": 1.5971, + "step": 4475 + }, + { + "epoch": 0.24949832775919734, + "grad_norm": 0.5159841179847717, + "learning_rate": 8e-05, + "loss": 1.782, + "step": 4476 + }, + { + "epoch": 0.24955406911928651, + "grad_norm": 0.5172543525695801, + "learning_rate": 8e-05, + "loss": 1.8104, + "step": 4477 + }, + { + 
"epoch": 0.2496098104793757, + "grad_norm": 0.5436558723449707, + "learning_rate": 8e-05, + "loss": 1.6937, + "step": 4478 + }, + { + "epoch": 0.24966555183946487, + "grad_norm": 0.5378687977790833, + "learning_rate": 8e-05, + "loss": 1.9014, + "step": 4479 + }, + { + "epoch": 0.24972129319955408, + "grad_norm": 0.5014615058898926, + "learning_rate": 8e-05, + "loss": 1.5676, + "step": 4480 + }, + { + "epoch": 0.24977703455964326, + "grad_norm": 0.47929060459136963, + "learning_rate": 8e-05, + "loss": 1.8378, + "step": 4481 + }, + { + "epoch": 0.24983277591973244, + "grad_norm": 0.47527262568473816, + "learning_rate": 8e-05, + "loss": 1.6668, + "step": 4482 + }, + { + "epoch": 0.24988851727982161, + "grad_norm": 0.49721941351890564, + "learning_rate": 8e-05, + "loss": 1.6998, + "step": 4483 + }, + { + "epoch": 0.24994425863991082, + "grad_norm": 0.4377490282058716, + "learning_rate": 8e-05, + "loss": 1.4564, + "step": 4484 + }, + { + "epoch": 0.25, + "grad_norm": 0.531048059463501, + "learning_rate": 8e-05, + "loss": 1.9123, + "step": 4485 + }, + { + "epoch": 0.2500557413600892, + "grad_norm": 0.5143203139305115, + "learning_rate": 8e-05, + "loss": 1.8096, + "step": 4486 + }, + { + "epoch": 0.25011148272017836, + "grad_norm": 0.44039544463157654, + "learning_rate": 8e-05, + "loss": 1.6269, + "step": 4487 + }, + { + "epoch": 0.25016722408026754, + "grad_norm": 0.493327796459198, + "learning_rate": 8e-05, + "loss": 1.6879, + "step": 4488 + }, + { + "epoch": 0.25022296544035677, + "grad_norm": 0.47744813561439514, + "learning_rate": 8e-05, + "loss": 1.8025, + "step": 4489 + }, + { + "epoch": 0.25027870680044595, + "grad_norm": 0.5044304132461548, + "learning_rate": 8e-05, + "loss": 1.7285, + "step": 4490 + }, + { + "epoch": 0.25033444816053513, + "grad_norm": 0.5493786931037903, + "learning_rate": 8e-05, + "loss": 1.889, + "step": 4491 + }, + { + "epoch": 0.2503901895206243, + "grad_norm": 0.4919975996017456, + "learning_rate": 8e-05, + "loss": 1.6406, + "step": 4492 + 
}, + { + "epoch": 0.2504459308807135, + "grad_norm": 0.5749701857566833, + "learning_rate": 8e-05, + "loss": 1.8115, + "step": 4493 + }, + { + "epoch": 0.25050167224080266, + "grad_norm": 0.5213183760643005, + "learning_rate": 8e-05, + "loss": 1.8957, + "step": 4494 + }, + { + "epoch": 0.25055741360089184, + "grad_norm": 0.5226160287857056, + "learning_rate": 8e-05, + "loss": 1.6662, + "step": 4495 + }, + { + "epoch": 0.250613154960981, + "grad_norm": 0.5784220695495605, + "learning_rate": 8e-05, + "loss": 2.0499, + "step": 4496 + }, + { + "epoch": 0.25066889632107026, + "grad_norm": 0.435458242893219, + "learning_rate": 8e-05, + "loss": 1.4048, + "step": 4497 + }, + { + "epoch": 0.25072463768115943, + "grad_norm": 0.5050778388977051, + "learning_rate": 8e-05, + "loss": 1.6978, + "step": 4498 + }, + { + "epoch": 0.2507803790412486, + "grad_norm": 0.504496157169342, + "learning_rate": 8e-05, + "loss": 1.6795, + "step": 4499 + }, + { + "epoch": 0.2508361204013378, + "grad_norm": 0.5178218483924866, + "learning_rate": 8e-05, + "loss": 1.8451, + "step": 4500 + }, + { + "epoch": 0.25089186176142697, + "grad_norm": 0.4855165183544159, + "learning_rate": 8e-05, + "loss": 1.6222, + "step": 4501 + }, + { + "epoch": 0.25094760312151615, + "grad_norm": 0.5280212759971619, + "learning_rate": 8e-05, + "loss": 1.9271, + "step": 4502 + }, + { + "epoch": 0.25100334448160533, + "grad_norm": 0.5055220723152161, + "learning_rate": 8e-05, + "loss": 1.7128, + "step": 4503 + }, + { + "epoch": 0.25105908584169456, + "grad_norm": 0.5127941370010376, + "learning_rate": 8e-05, + "loss": 1.8969, + "step": 4504 + }, + { + "epoch": 0.25111482720178374, + "grad_norm": 0.47817113995552063, + "learning_rate": 8e-05, + "loss": 1.5779, + "step": 4505 + }, + { + "epoch": 0.2511705685618729, + "grad_norm": 0.5066494345664978, + "learning_rate": 8e-05, + "loss": 1.6365, + "step": 4506 + }, + { + "epoch": 0.2512263099219621, + "grad_norm": 0.5416378974914551, + "learning_rate": 8e-05, + "loss": 1.875, 
+ "step": 4507 + }, + { + "epoch": 0.2512820512820513, + "grad_norm": 0.48432135581970215, + "learning_rate": 8e-05, + "loss": 1.5935, + "step": 4508 + }, + { + "epoch": 0.25133779264214046, + "grad_norm": 0.45147547125816345, + "learning_rate": 8e-05, + "loss": 1.5384, + "step": 4509 + }, + { + "epoch": 0.25139353400222963, + "grad_norm": 0.5376551747322083, + "learning_rate": 8e-05, + "loss": 1.9072, + "step": 4510 + }, + { + "epoch": 0.2514492753623188, + "grad_norm": 0.504919171333313, + "learning_rate": 8e-05, + "loss": 1.7057, + "step": 4511 + }, + { + "epoch": 0.25150501672240805, + "grad_norm": 0.5230275988578796, + "learning_rate": 8e-05, + "loss": 1.7698, + "step": 4512 + }, + { + "epoch": 0.2515607580824972, + "grad_norm": 0.4875522553920746, + "learning_rate": 8e-05, + "loss": 1.6571, + "step": 4513 + }, + { + "epoch": 0.2516164994425864, + "grad_norm": 0.5083442330360413, + "learning_rate": 8e-05, + "loss": 1.7291, + "step": 4514 + }, + { + "epoch": 0.2516722408026756, + "grad_norm": 0.46802598237991333, + "learning_rate": 8e-05, + "loss": 1.6689, + "step": 4515 + }, + { + "epoch": 0.25172798216276476, + "grad_norm": 0.4877402186393738, + "learning_rate": 8e-05, + "loss": 1.7509, + "step": 4516 + }, + { + "epoch": 0.25178372352285394, + "grad_norm": 0.5294947624206543, + "learning_rate": 8e-05, + "loss": 1.7165, + "step": 4517 + }, + { + "epoch": 0.2518394648829431, + "grad_norm": 0.47198450565338135, + "learning_rate": 8e-05, + "loss": 1.4383, + "step": 4518 + }, + { + "epoch": 0.25189520624303235, + "grad_norm": 0.55704665184021, + "learning_rate": 8e-05, + "loss": 2.1062, + "step": 4519 + }, + { + "epoch": 0.25195094760312153, + "grad_norm": 0.5346017479896545, + "learning_rate": 8e-05, + "loss": 1.6289, + "step": 4520 + }, + { + "epoch": 0.2520066889632107, + "grad_norm": 0.5310443639755249, + "learning_rate": 8e-05, + "loss": 1.7001, + "step": 4521 + }, + { + "epoch": 0.2520624303232999, + "grad_norm": 0.48790326714515686, + "learning_rate": 
8e-05, + "loss": 1.6094, + "step": 4522 + }, + { + "epoch": 0.25211817168338907, + "grad_norm": 0.55752032995224, + "learning_rate": 8e-05, + "loss": 1.9714, + "step": 4523 + }, + { + "epoch": 0.25217391304347825, + "grad_norm": 0.48758000135421753, + "learning_rate": 8e-05, + "loss": 1.7153, + "step": 4524 + }, + { + "epoch": 0.2522296544035674, + "grad_norm": 0.4836397171020508, + "learning_rate": 8e-05, + "loss": 1.7017, + "step": 4525 + }, + { + "epoch": 0.2522853957636566, + "grad_norm": 0.48506078124046326, + "learning_rate": 8e-05, + "loss": 1.7446, + "step": 4526 + }, + { + "epoch": 0.25234113712374584, + "grad_norm": 0.4948675334453583, + "learning_rate": 8e-05, + "loss": 1.5827, + "step": 4527 + }, + { + "epoch": 0.252396878483835, + "grad_norm": 0.5153377652168274, + "learning_rate": 8e-05, + "loss": 1.8249, + "step": 4528 + }, + { + "epoch": 0.2524526198439242, + "grad_norm": 0.46015942096710205, + "learning_rate": 8e-05, + "loss": 1.4809, + "step": 4529 + }, + { + "epoch": 0.2525083612040134, + "grad_norm": 0.556275486946106, + "learning_rate": 8e-05, + "loss": 1.8923, + "step": 4530 + }, + { + "epoch": 0.25256410256410255, + "grad_norm": 0.5005241632461548, + "learning_rate": 8e-05, + "loss": 1.7709, + "step": 4531 + }, + { + "epoch": 0.25261984392419173, + "grad_norm": 0.4980381429195404, + "learning_rate": 8e-05, + "loss": 1.7674, + "step": 4532 + }, + { + "epoch": 0.2526755852842809, + "grad_norm": 0.48650914430618286, + "learning_rate": 8e-05, + "loss": 1.8241, + "step": 4533 + }, + { + "epoch": 0.25273132664437015, + "grad_norm": 0.4514947235584259, + "learning_rate": 8e-05, + "loss": 1.5061, + "step": 4534 + }, + { + "epoch": 0.2527870680044593, + "grad_norm": 0.49436303973197937, + "learning_rate": 8e-05, + "loss": 1.5731, + "step": 4535 + }, + { + "epoch": 0.2528428093645485, + "grad_norm": 0.5584789514541626, + "learning_rate": 8e-05, + "loss": 1.9422, + "step": 4536 + }, + { + "epoch": 0.2528985507246377, + "grad_norm": 0.4882524609565735, + 
"learning_rate": 8e-05, + "loss": 1.5859, + "step": 4537 + }, + { + "epoch": 0.25295429208472686, + "grad_norm": 0.478646844625473, + "learning_rate": 8e-05, + "loss": 1.6171, + "step": 4538 + }, + { + "epoch": 0.25301003344481604, + "grad_norm": 0.48294344544410706, + "learning_rate": 8e-05, + "loss": 1.652, + "step": 4539 + }, + { + "epoch": 0.2530657748049052, + "grad_norm": 0.5388214588165283, + "learning_rate": 8e-05, + "loss": 1.7317, + "step": 4540 + }, + { + "epoch": 0.25312151616499445, + "grad_norm": 0.5226413011550903, + "learning_rate": 8e-05, + "loss": 1.7449, + "step": 4541 + }, + { + "epoch": 0.25317725752508363, + "grad_norm": 0.5220893621444702, + "learning_rate": 8e-05, + "loss": 1.718, + "step": 4542 + }, + { + "epoch": 0.2532329988851728, + "grad_norm": 0.5089812874794006, + "learning_rate": 8e-05, + "loss": 1.8204, + "step": 4543 + }, + { + "epoch": 0.253288740245262, + "grad_norm": 0.4798874855041504, + "learning_rate": 8e-05, + "loss": 1.6559, + "step": 4544 + }, + { + "epoch": 0.25334448160535117, + "grad_norm": 0.543845534324646, + "learning_rate": 8e-05, + "loss": 1.7922, + "step": 4545 + }, + { + "epoch": 0.25340022296544035, + "grad_norm": 0.5123974680900574, + "learning_rate": 8e-05, + "loss": 1.7801, + "step": 4546 + }, + { + "epoch": 0.2534559643255295, + "grad_norm": 0.5205566883087158, + "learning_rate": 8e-05, + "loss": 1.7462, + "step": 4547 + }, + { + "epoch": 0.2535117056856187, + "grad_norm": 0.49157705903053284, + "learning_rate": 8e-05, + "loss": 1.6683, + "step": 4548 + }, + { + "epoch": 0.25356744704570794, + "grad_norm": 0.5303407907485962, + "learning_rate": 8e-05, + "loss": 1.7813, + "step": 4549 + }, + { + "epoch": 0.2536231884057971, + "grad_norm": 0.520340085029602, + "learning_rate": 8e-05, + "loss": 1.6946, + "step": 4550 + }, + { + "epoch": 0.2536789297658863, + "grad_norm": 0.528924286365509, + "learning_rate": 8e-05, + "loss": 1.3523, + "step": 4551 + }, + { + "epoch": 0.2537346711259755, + "grad_norm": 
0.4791257083415985, + "learning_rate": 8e-05, + "loss": 1.5945, + "step": 4552 + }, + { + "epoch": 0.25379041248606465, + "grad_norm": 0.48630478978157043, + "learning_rate": 8e-05, + "loss": 1.8585, + "step": 4553 + }, + { + "epoch": 0.25384615384615383, + "grad_norm": 0.4839245676994324, + "learning_rate": 8e-05, + "loss": 1.6635, + "step": 4554 + }, + { + "epoch": 0.253901895206243, + "grad_norm": 0.5476188659667969, + "learning_rate": 8e-05, + "loss": 2.004, + "step": 4555 + }, + { + "epoch": 0.25395763656633225, + "grad_norm": 0.5112416744232178, + "learning_rate": 8e-05, + "loss": 1.6964, + "step": 4556 + }, + { + "epoch": 0.2540133779264214, + "grad_norm": 0.4688325822353363, + "learning_rate": 8e-05, + "loss": 1.6512, + "step": 4557 + }, + { + "epoch": 0.2540691192865106, + "grad_norm": 0.49445050954818726, + "learning_rate": 8e-05, + "loss": 1.8496, + "step": 4558 + }, + { + "epoch": 0.2541248606465998, + "grad_norm": 0.5357611775398254, + "learning_rate": 8e-05, + "loss": 1.7822, + "step": 4559 + }, + { + "epoch": 0.25418060200668896, + "grad_norm": 0.5030912756919861, + "learning_rate": 8e-05, + "loss": 1.5574, + "step": 4560 + }, + { + "epoch": 0.25423634336677814, + "grad_norm": 0.5398586988449097, + "learning_rate": 8e-05, + "loss": 1.9175, + "step": 4561 + }, + { + "epoch": 0.2542920847268673, + "grad_norm": 0.49171751737594604, + "learning_rate": 8e-05, + "loss": 1.814, + "step": 4562 + }, + { + "epoch": 0.2543478260869565, + "grad_norm": 0.4866318106651306, + "learning_rate": 8e-05, + "loss": 1.6577, + "step": 4563 + }, + { + "epoch": 0.25440356744704573, + "grad_norm": 0.4502681791782379, + "learning_rate": 8e-05, + "loss": 1.5948, + "step": 4564 + }, + { + "epoch": 0.2544593088071349, + "grad_norm": 0.5153110027313232, + "learning_rate": 8e-05, + "loss": 1.6973, + "step": 4565 + }, + { + "epoch": 0.2545150501672241, + "grad_norm": 0.4543468654155731, + "learning_rate": 8e-05, + "loss": 1.4745, + "step": 4566 + }, + { + "epoch": 
0.25457079152731327, + "grad_norm": 0.4921804368495941, + "learning_rate": 8e-05, + "loss": 1.7234, + "step": 4567 + }, + { + "epoch": 0.25462653288740245, + "grad_norm": 0.5395398139953613, + "learning_rate": 8e-05, + "loss": 1.6885, + "step": 4568 + }, + { + "epoch": 0.2546822742474916, + "grad_norm": 0.49862754344940186, + "learning_rate": 8e-05, + "loss": 1.6302, + "step": 4569 + }, + { + "epoch": 0.2547380156075808, + "grad_norm": 0.5175421833992004, + "learning_rate": 8e-05, + "loss": 1.8645, + "step": 4570 + }, + { + "epoch": 0.25479375696767004, + "grad_norm": 0.5347442626953125, + "learning_rate": 8e-05, + "loss": 1.6279, + "step": 4571 + }, + { + "epoch": 0.2548494983277592, + "grad_norm": 0.5138886570930481, + "learning_rate": 8e-05, + "loss": 1.4774, + "step": 4572 + }, + { + "epoch": 0.2549052396878484, + "grad_norm": 0.450295090675354, + "learning_rate": 8e-05, + "loss": 1.0234, + "step": 4573 + }, + { + "epoch": 0.2549609810479376, + "grad_norm": 0.5415732860565186, + "learning_rate": 8e-05, + "loss": 1.7688, + "step": 4574 + }, + { + "epoch": 0.25501672240802675, + "grad_norm": 0.5672006607055664, + "learning_rate": 8e-05, + "loss": 1.8689, + "step": 4575 + }, + { + "epoch": 0.25507246376811593, + "grad_norm": 0.5203934907913208, + "learning_rate": 8e-05, + "loss": 1.6057, + "step": 4576 + }, + { + "epoch": 0.2551282051282051, + "grad_norm": 0.5594499111175537, + "learning_rate": 8e-05, + "loss": 1.7061, + "step": 4577 + }, + { + "epoch": 0.2551839464882943, + "grad_norm": 0.5058487057685852, + "learning_rate": 8e-05, + "loss": 1.7657, + "step": 4578 + }, + { + "epoch": 0.2552396878483835, + "grad_norm": 0.4886641204357147, + "learning_rate": 8e-05, + "loss": 1.615, + "step": 4579 + }, + { + "epoch": 0.2552954292084727, + "grad_norm": 0.5026295781135559, + "learning_rate": 8e-05, + "loss": 1.6597, + "step": 4580 + }, + { + "epoch": 0.2553511705685619, + "grad_norm": 0.4680199921131134, + "learning_rate": 8e-05, + "loss": 1.7476, + "step": 4581 + }, 
+ { + "epoch": 0.25540691192865106, + "grad_norm": 0.5621483325958252, + "learning_rate": 8e-05, + "loss": 1.8504, + "step": 4582 + }, + { + "epoch": 0.25546265328874024, + "grad_norm": 0.5055748820304871, + "learning_rate": 8e-05, + "loss": 1.9231, + "step": 4583 + }, + { + "epoch": 0.2555183946488294, + "grad_norm": 0.5216691493988037, + "learning_rate": 8e-05, + "loss": 1.6375, + "step": 4584 + }, + { + "epoch": 0.2555741360089186, + "grad_norm": 0.4793626070022583, + "learning_rate": 8e-05, + "loss": 1.5565, + "step": 4585 + }, + { + "epoch": 0.25562987736900783, + "grad_norm": 0.5235403776168823, + "learning_rate": 8e-05, + "loss": 1.9795, + "step": 4586 + }, + { + "epoch": 0.255685618729097, + "grad_norm": 0.5455905199050903, + "learning_rate": 8e-05, + "loss": 2.0298, + "step": 4587 + }, + { + "epoch": 0.2557413600891862, + "grad_norm": 0.5437529683113098, + "learning_rate": 8e-05, + "loss": 1.7714, + "step": 4588 + }, + { + "epoch": 0.25579710144927537, + "grad_norm": 0.4438577890396118, + "learning_rate": 8e-05, + "loss": 1.5776, + "step": 4589 + }, + { + "epoch": 0.25585284280936454, + "grad_norm": 0.4744518995285034, + "learning_rate": 8e-05, + "loss": 1.5394, + "step": 4590 + }, + { + "epoch": 0.2559085841694537, + "grad_norm": 0.49256959557533264, + "learning_rate": 8e-05, + "loss": 1.72, + "step": 4591 + }, + { + "epoch": 0.2559643255295429, + "grad_norm": 0.4950970411300659, + "learning_rate": 8e-05, + "loss": 1.6956, + "step": 4592 + }, + { + "epoch": 0.2560200668896321, + "grad_norm": 0.5208807587623596, + "learning_rate": 8e-05, + "loss": 1.7964, + "step": 4593 + }, + { + "epoch": 0.2560758082497213, + "grad_norm": 0.47823184728622437, + "learning_rate": 8e-05, + "loss": 1.6011, + "step": 4594 + }, + { + "epoch": 0.2561315496098105, + "grad_norm": 0.5411269664764404, + "learning_rate": 8e-05, + "loss": 1.7741, + "step": 4595 + }, + { + "epoch": 0.2561872909698997, + "grad_norm": 0.4891355633735657, + "learning_rate": 8e-05, + "loss": 1.5656, + 
"step": 4596 + }, + { + "epoch": 0.25624303232998885, + "grad_norm": 0.5012332201004028, + "learning_rate": 8e-05, + "loss": 1.6128, + "step": 4597 + }, + { + "epoch": 0.25629877369007803, + "grad_norm": 0.49547815322875977, + "learning_rate": 8e-05, + "loss": 1.7545, + "step": 4598 + }, + { + "epoch": 0.2563545150501672, + "grad_norm": 0.5473471283912659, + "learning_rate": 8e-05, + "loss": 1.9162, + "step": 4599 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 0.5002238750457764, + "learning_rate": 8e-05, + "loss": 1.6341, + "step": 4600 + }, + { + "epoch": 0.2564659977703456, + "grad_norm": 0.5576561689376831, + "learning_rate": 8e-05, + "loss": 1.78, + "step": 4601 + }, + { + "epoch": 0.2565217391304348, + "grad_norm": 0.5200320482254028, + "learning_rate": 8e-05, + "loss": 1.8138, + "step": 4602 + }, + { + "epoch": 0.256577480490524, + "grad_norm": 0.5285021662712097, + "learning_rate": 8e-05, + "loss": 1.8028, + "step": 4603 + }, + { + "epoch": 0.25663322185061316, + "grad_norm": 0.46985504031181335, + "learning_rate": 8e-05, + "loss": 1.5906, + "step": 4604 + }, + { + "epoch": 0.25668896321070234, + "grad_norm": 0.5084950923919678, + "learning_rate": 8e-05, + "loss": 1.8134, + "step": 4605 + }, + { + "epoch": 0.2567447045707915, + "grad_norm": 0.4895882308483124, + "learning_rate": 8e-05, + "loss": 1.745, + "step": 4606 + }, + { + "epoch": 0.2568004459308807, + "grad_norm": 0.5340995788574219, + "learning_rate": 8e-05, + "loss": 1.7095, + "step": 4607 + }, + { + "epoch": 0.2568561872909699, + "grad_norm": 0.47565460205078125, + "learning_rate": 8e-05, + "loss": 1.5556, + "step": 4608 + }, + { + "epoch": 0.2569119286510591, + "grad_norm": 0.4609222412109375, + "learning_rate": 8e-05, + "loss": 1.6959, + "step": 4609 + }, + { + "epoch": 0.2569676700111483, + "grad_norm": 0.4868863523006439, + "learning_rate": 8e-05, + "loss": 1.6147, + "step": 4610 + }, + { + "epoch": 0.25702341137123746, + "grad_norm": 0.480629563331604, + "learning_rate": 8e-05, + 
"loss": 1.6158, + "step": 4611 + }, + { + "epoch": 0.25707915273132664, + "grad_norm": 0.5915615558624268, + "learning_rate": 8e-05, + "loss": 1.833, + "step": 4612 + }, + { + "epoch": 0.2571348940914158, + "grad_norm": 0.5114143490791321, + "learning_rate": 8e-05, + "loss": 1.7402, + "step": 4613 + }, + { + "epoch": 0.257190635451505, + "grad_norm": 0.46387219429016113, + "learning_rate": 8e-05, + "loss": 1.6528, + "step": 4614 + }, + { + "epoch": 0.2572463768115942, + "grad_norm": 0.5148956775665283, + "learning_rate": 8e-05, + "loss": 1.7891, + "step": 4615 + }, + { + "epoch": 0.2573021181716834, + "grad_norm": 0.5419014692306519, + "learning_rate": 8e-05, + "loss": 1.9276, + "step": 4616 + }, + { + "epoch": 0.2573578595317726, + "grad_norm": 0.5739691257476807, + "learning_rate": 8e-05, + "loss": 1.8287, + "step": 4617 + }, + { + "epoch": 0.25741360089186177, + "grad_norm": 0.5347944498062134, + "learning_rate": 8e-05, + "loss": 1.8231, + "step": 4618 + }, + { + "epoch": 0.25746934225195095, + "grad_norm": 0.47412458062171936, + "learning_rate": 8e-05, + "loss": 1.7517, + "step": 4619 + }, + { + "epoch": 0.25752508361204013, + "grad_norm": 0.5531072616577148, + "learning_rate": 8e-05, + "loss": 1.8875, + "step": 4620 + }, + { + "epoch": 0.2575808249721293, + "grad_norm": 0.4874540865421295, + "learning_rate": 8e-05, + "loss": 1.5426, + "step": 4621 + }, + { + "epoch": 0.2576365663322185, + "grad_norm": 0.5075742602348328, + "learning_rate": 8e-05, + "loss": 1.6955, + "step": 4622 + }, + { + "epoch": 0.25769230769230766, + "grad_norm": 0.5765818357467651, + "learning_rate": 8e-05, + "loss": 2.1356, + "step": 4623 + }, + { + "epoch": 0.2577480490523969, + "grad_norm": 0.5081427693367004, + "learning_rate": 8e-05, + "loss": 1.7109, + "step": 4624 + }, + { + "epoch": 0.2578037904124861, + "grad_norm": 0.5203375816345215, + "learning_rate": 8e-05, + "loss": 1.8658, + "step": 4625 + }, + { + "epoch": 0.25785953177257526, + "grad_norm": 0.49736112356185913, + 
"learning_rate": 8e-05, + "loss": 1.63, + "step": 4626 + }, + { + "epoch": 0.25791527313266444, + "grad_norm": 0.495447039604187, + "learning_rate": 8e-05, + "loss": 1.8224, + "step": 4627 + }, + { + "epoch": 0.2579710144927536, + "grad_norm": 0.4577196538448334, + "learning_rate": 8e-05, + "loss": 1.6015, + "step": 4628 + }, + { + "epoch": 0.2580267558528428, + "grad_norm": 0.47778797149658203, + "learning_rate": 8e-05, + "loss": 1.6896, + "step": 4629 + }, + { + "epoch": 0.25808249721293197, + "grad_norm": 0.4753131866455078, + "learning_rate": 8e-05, + "loss": 1.7952, + "step": 4630 + }, + { + "epoch": 0.2581382385730212, + "grad_norm": 0.5390071272850037, + "learning_rate": 8e-05, + "loss": 1.6938, + "step": 4631 + }, + { + "epoch": 0.2581939799331104, + "grad_norm": 0.5110119581222534, + "learning_rate": 8e-05, + "loss": 1.5397, + "step": 4632 + }, + { + "epoch": 0.25824972129319956, + "grad_norm": 0.4993075430393219, + "learning_rate": 8e-05, + "loss": 1.8401, + "step": 4633 + }, + { + "epoch": 0.25830546265328874, + "grad_norm": 0.5148491263389587, + "learning_rate": 8e-05, + "loss": 1.7549, + "step": 4634 + }, + { + "epoch": 0.2583612040133779, + "grad_norm": 0.5420222282409668, + "learning_rate": 8e-05, + "loss": 1.7288, + "step": 4635 + }, + { + "epoch": 0.2584169453734671, + "grad_norm": 0.5096121430397034, + "learning_rate": 8e-05, + "loss": 1.7989, + "step": 4636 + }, + { + "epoch": 0.2584726867335563, + "grad_norm": 0.5139524936676025, + "learning_rate": 8e-05, + "loss": 1.7249, + "step": 4637 + }, + { + "epoch": 0.2585284280936455, + "grad_norm": 0.5044840574264526, + "learning_rate": 8e-05, + "loss": 1.6603, + "step": 4638 + }, + { + "epoch": 0.2585841694537347, + "grad_norm": 0.5093978047370911, + "learning_rate": 8e-05, + "loss": 1.7897, + "step": 4639 + }, + { + "epoch": 0.25863991081382387, + "grad_norm": 0.5032713413238525, + "learning_rate": 8e-05, + "loss": 1.672, + "step": 4640 + }, + { + "epoch": 0.25869565217391305, + "grad_norm": 
0.5168723464012146, + "learning_rate": 8e-05, + "loss": 1.5844, + "step": 4641 + }, + { + "epoch": 0.2587513935340022, + "grad_norm": 0.5161342620849609, + "learning_rate": 8e-05, + "loss": 1.7342, + "step": 4642 + }, + { + "epoch": 0.2588071348940914, + "grad_norm": 0.5017462968826294, + "learning_rate": 8e-05, + "loss": 1.7623, + "step": 4643 + }, + { + "epoch": 0.2588628762541806, + "grad_norm": 0.49921777844429016, + "learning_rate": 8e-05, + "loss": 1.6982, + "step": 4644 + }, + { + "epoch": 0.25891861761426976, + "grad_norm": 0.47593289613723755, + "learning_rate": 8e-05, + "loss": 1.6082, + "step": 4645 + }, + { + "epoch": 0.258974358974359, + "grad_norm": 0.5113307237625122, + "learning_rate": 8e-05, + "loss": 1.7108, + "step": 4646 + }, + { + "epoch": 0.2590301003344482, + "grad_norm": 0.49452194571495056, + "learning_rate": 8e-05, + "loss": 1.699, + "step": 4647 + }, + { + "epoch": 0.25908584169453736, + "grad_norm": 0.4883759319782257, + "learning_rate": 8e-05, + "loss": 1.6703, + "step": 4648 + }, + { + "epoch": 0.25914158305462653, + "grad_norm": 0.5027516484260559, + "learning_rate": 8e-05, + "loss": 1.713, + "step": 4649 + }, + { + "epoch": 0.2591973244147157, + "grad_norm": 0.4828946888446808, + "learning_rate": 8e-05, + "loss": 1.5637, + "step": 4650 + }, + { + "epoch": 0.2592530657748049, + "grad_norm": 0.4641382396221161, + "learning_rate": 8e-05, + "loss": 1.4836, + "step": 4651 + }, + { + "epoch": 0.25930880713489407, + "grad_norm": 0.5482789278030396, + "learning_rate": 8e-05, + "loss": 1.885, + "step": 4652 + }, + { + "epoch": 0.2593645484949833, + "grad_norm": 0.4978908896446228, + "learning_rate": 8e-05, + "loss": 1.5631, + "step": 4653 + }, + { + "epoch": 0.2594202898550725, + "grad_norm": 0.4770525097846985, + "learning_rate": 8e-05, + "loss": 1.6258, + "step": 4654 + }, + { + "epoch": 0.25947603121516166, + "grad_norm": 0.5395385026931763, + "learning_rate": 8e-05, + "loss": 1.7133, + "step": 4655 + }, + { + "epoch": 0.25953177257525084, 
+ "grad_norm": 0.5239088535308838, + "learning_rate": 8e-05, + "loss": 1.8705, + "step": 4656 + }, + { + "epoch": 0.25958751393534, + "grad_norm": 0.45129743218421936, + "learning_rate": 8e-05, + "loss": 1.5612, + "step": 4657 + }, + { + "epoch": 0.2596432552954292, + "grad_norm": 0.5843803286552429, + "learning_rate": 8e-05, + "loss": 1.9104, + "step": 4658 + }, + { + "epoch": 0.2596989966555184, + "grad_norm": 0.5215731263160706, + "learning_rate": 8e-05, + "loss": 1.7122, + "step": 4659 + }, + { + "epoch": 0.25975473801560756, + "grad_norm": 0.4720430076122284, + "learning_rate": 8e-05, + "loss": 1.7054, + "step": 4660 + }, + { + "epoch": 0.2598104793756968, + "grad_norm": 0.5192232728004456, + "learning_rate": 8e-05, + "loss": 1.6112, + "step": 4661 + }, + { + "epoch": 0.25986622073578597, + "grad_norm": 0.5454217195510864, + "learning_rate": 8e-05, + "loss": 1.7424, + "step": 4662 + }, + { + "epoch": 0.25992196209587515, + "grad_norm": 0.5033514499664307, + "learning_rate": 8e-05, + "loss": 1.6306, + "step": 4663 + }, + { + "epoch": 0.2599777034559643, + "grad_norm": 0.48574110865592957, + "learning_rate": 8e-05, + "loss": 1.6805, + "step": 4664 + }, + { + "epoch": 0.2600334448160535, + "grad_norm": 0.5283193588256836, + "learning_rate": 8e-05, + "loss": 1.5043, + "step": 4665 + }, + { + "epoch": 0.2600891861761427, + "grad_norm": 0.4874545931816101, + "learning_rate": 8e-05, + "loss": 1.552, + "step": 4666 + }, + { + "epoch": 0.26014492753623186, + "grad_norm": 0.536874532699585, + "learning_rate": 8e-05, + "loss": 1.7943, + "step": 4667 + }, + { + "epoch": 0.2602006688963211, + "grad_norm": 0.5158773064613342, + "learning_rate": 8e-05, + "loss": 1.7771, + "step": 4668 + }, + { + "epoch": 0.2602564102564103, + "grad_norm": 0.5112432837486267, + "learning_rate": 8e-05, + "loss": 1.9079, + "step": 4669 + }, + { + "epoch": 0.26031215161649945, + "grad_norm": 0.5485742092132568, + "learning_rate": 8e-05, + "loss": 1.8392, + "step": 4670 + }, + { + "epoch": 
0.26036789297658863, + "grad_norm": 0.5091162919998169, + "learning_rate": 8e-05, + "loss": 1.5834, + "step": 4671 + }, + { + "epoch": 0.2604236343366778, + "grad_norm": 0.45446982979774475, + "learning_rate": 8e-05, + "loss": 1.6046, + "step": 4672 + }, + { + "epoch": 0.260479375696767, + "grad_norm": 0.5022279620170593, + "learning_rate": 8e-05, + "loss": 1.6982, + "step": 4673 + }, + { + "epoch": 0.26053511705685617, + "grad_norm": 0.4997698962688446, + "learning_rate": 8e-05, + "loss": 1.6421, + "step": 4674 + }, + { + "epoch": 0.26059085841694535, + "grad_norm": 0.49530526995658875, + "learning_rate": 8e-05, + "loss": 1.6325, + "step": 4675 + }, + { + "epoch": 0.2606465997770346, + "grad_norm": 0.490505576133728, + "learning_rate": 8e-05, + "loss": 1.6641, + "step": 4676 + }, + { + "epoch": 0.26070234113712376, + "grad_norm": 0.5120657086372375, + "learning_rate": 8e-05, + "loss": 1.6395, + "step": 4677 + }, + { + "epoch": 0.26075808249721294, + "grad_norm": 0.5384310483932495, + "learning_rate": 8e-05, + "loss": 1.7659, + "step": 4678 + }, + { + "epoch": 0.2608138238573021, + "grad_norm": 0.5185050964355469, + "learning_rate": 8e-05, + "loss": 1.6108, + "step": 4679 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 0.5490922331809998, + "learning_rate": 8e-05, + "loss": 1.8128, + "step": 4680 + }, + { + "epoch": 0.2609253065774805, + "grad_norm": 0.4805544316768646, + "learning_rate": 8e-05, + "loss": 1.6368, + "step": 4681 + }, + { + "epoch": 0.26098104793756965, + "grad_norm": 0.4884585440158844, + "learning_rate": 8e-05, + "loss": 1.534, + "step": 4682 + }, + { + "epoch": 0.2610367892976589, + "grad_norm": 0.4098069667816162, + "learning_rate": 8e-05, + "loss": 1.2498, + "step": 4683 + }, + { + "epoch": 0.26109253065774807, + "grad_norm": 0.546772301197052, + "learning_rate": 8e-05, + "loss": 1.673, + "step": 4684 + }, + { + "epoch": 0.26114827201783725, + "grad_norm": 0.5115727186203003, + "learning_rate": 8e-05, + "loss": 1.6959, + "step": 4685 + }, 
+ { + "epoch": 0.2612040133779264, + "grad_norm": 0.5167112350463867, + "learning_rate": 8e-05, + "loss": 1.9362, + "step": 4686 + }, + { + "epoch": 0.2612597547380156, + "grad_norm": 0.5140003561973572, + "learning_rate": 8e-05, + "loss": 1.6821, + "step": 4687 + }, + { + "epoch": 0.2613154960981048, + "grad_norm": 0.49699273705482483, + "learning_rate": 8e-05, + "loss": 1.8037, + "step": 4688 + }, + { + "epoch": 0.26137123745819396, + "grad_norm": 0.5371155142784119, + "learning_rate": 8e-05, + "loss": 1.8324, + "step": 4689 + }, + { + "epoch": 0.26142697881828314, + "grad_norm": 0.5561757683753967, + "learning_rate": 8e-05, + "loss": 1.8558, + "step": 4690 + }, + { + "epoch": 0.2614827201783724, + "grad_norm": 0.5155370831489563, + "learning_rate": 8e-05, + "loss": 1.6131, + "step": 4691 + }, + { + "epoch": 0.26153846153846155, + "grad_norm": 0.467456191778183, + "learning_rate": 8e-05, + "loss": 1.515, + "step": 4692 + }, + { + "epoch": 0.26159420289855073, + "grad_norm": 0.5295583009719849, + "learning_rate": 8e-05, + "loss": 1.5421, + "step": 4693 + }, + { + "epoch": 0.2616499442586399, + "grad_norm": 0.5042378306388855, + "learning_rate": 8e-05, + "loss": 1.6345, + "step": 4694 + }, + { + "epoch": 0.2617056856187291, + "grad_norm": 0.5297505855560303, + "learning_rate": 8e-05, + "loss": 1.9112, + "step": 4695 + }, + { + "epoch": 0.26176142697881827, + "grad_norm": 0.5251696109771729, + "learning_rate": 8e-05, + "loss": 2.0348, + "step": 4696 + }, + { + "epoch": 0.26181716833890745, + "grad_norm": 0.5099309086799622, + "learning_rate": 8e-05, + "loss": 1.8421, + "step": 4697 + }, + { + "epoch": 0.2618729096989967, + "grad_norm": 0.502705454826355, + "learning_rate": 8e-05, + "loss": 1.5744, + "step": 4698 + }, + { + "epoch": 0.26192865105908586, + "grad_norm": 0.5405106544494629, + "learning_rate": 8e-05, + "loss": 2.0006, + "step": 4699 + }, + { + "epoch": 0.26198439241917504, + "grad_norm": 0.5157017707824707, + "learning_rate": 8e-05, + "loss": 1.8455, + 
"step": 4700 + }, + { + "epoch": 0.2620401337792642, + "grad_norm": 0.5418422222137451, + "learning_rate": 8e-05, + "loss": 1.9059, + "step": 4701 + }, + { + "epoch": 0.2620958751393534, + "grad_norm": 0.5688309669494629, + "learning_rate": 8e-05, + "loss": 2.0853, + "step": 4702 + }, + { + "epoch": 0.2621516164994426, + "grad_norm": 0.5489581227302551, + "learning_rate": 8e-05, + "loss": 2.0183, + "step": 4703 + }, + { + "epoch": 0.26220735785953175, + "grad_norm": 0.49559980630874634, + "learning_rate": 8e-05, + "loss": 1.7079, + "step": 4704 + }, + { + "epoch": 0.26226309921962093, + "grad_norm": 0.5063319802284241, + "learning_rate": 8e-05, + "loss": 1.7444, + "step": 4705 + }, + { + "epoch": 0.26231884057971017, + "grad_norm": 0.47826236486434937, + "learning_rate": 8e-05, + "loss": 1.7103, + "step": 4706 + }, + { + "epoch": 0.26237458193979935, + "grad_norm": 0.5195013880729675, + "learning_rate": 8e-05, + "loss": 1.5908, + "step": 4707 + }, + { + "epoch": 0.2624303232998885, + "grad_norm": 0.5257779359817505, + "learning_rate": 8e-05, + "loss": 1.856, + "step": 4708 + }, + { + "epoch": 0.2624860646599777, + "grad_norm": 0.5113489627838135, + "learning_rate": 8e-05, + "loss": 1.9632, + "step": 4709 + }, + { + "epoch": 0.2625418060200669, + "grad_norm": 0.45053985714912415, + "learning_rate": 8e-05, + "loss": 1.5724, + "step": 4710 + }, + { + "epoch": 0.26259754738015606, + "grad_norm": 0.5337340831756592, + "learning_rate": 8e-05, + "loss": 1.8555, + "step": 4711 + }, + { + "epoch": 0.26265328874024524, + "grad_norm": 0.511391282081604, + "learning_rate": 8e-05, + "loss": 1.7864, + "step": 4712 + }, + { + "epoch": 0.2627090301003345, + "grad_norm": 0.47653481364250183, + "learning_rate": 8e-05, + "loss": 1.5625, + "step": 4713 + }, + { + "epoch": 0.26276477146042365, + "grad_norm": 0.4452597200870514, + "learning_rate": 8e-05, + "loss": 1.7052, + "step": 4714 + }, + { + "epoch": 0.26282051282051283, + "grad_norm": 0.5186541676521301, + "learning_rate": 8e-05, 
+ "loss": 1.7116, + "step": 4715 + }, + { + "epoch": 0.262876254180602, + "grad_norm": 0.5272562503814697, + "learning_rate": 8e-05, + "loss": 1.497, + "step": 4716 + }, + { + "epoch": 0.2629319955406912, + "grad_norm": 0.5084412097930908, + "learning_rate": 8e-05, + "loss": 1.6309, + "step": 4717 + }, + { + "epoch": 0.26298773690078037, + "grad_norm": 0.46219387650489807, + "learning_rate": 8e-05, + "loss": 1.6597, + "step": 4718 + }, + { + "epoch": 0.26304347826086955, + "grad_norm": 0.6059833765029907, + "learning_rate": 8e-05, + "loss": 1.8823, + "step": 4719 + }, + { + "epoch": 0.2630992196209587, + "grad_norm": 0.5173678994178772, + "learning_rate": 8e-05, + "loss": 1.9587, + "step": 4720 + }, + { + "epoch": 0.26315496098104796, + "grad_norm": 0.488073468208313, + "learning_rate": 8e-05, + "loss": 1.6157, + "step": 4721 + }, + { + "epoch": 0.26321070234113714, + "grad_norm": 0.5277428030967712, + "learning_rate": 8e-05, + "loss": 1.9368, + "step": 4722 + }, + { + "epoch": 0.2632664437012263, + "grad_norm": 0.5465556383132935, + "learning_rate": 8e-05, + "loss": 1.766, + "step": 4723 + }, + { + "epoch": 0.2633221850613155, + "grad_norm": 0.5116319060325623, + "learning_rate": 8e-05, + "loss": 1.7194, + "step": 4724 + }, + { + "epoch": 0.2633779264214047, + "grad_norm": 0.5023199915885925, + "learning_rate": 8e-05, + "loss": 1.5882, + "step": 4725 + }, + { + "epoch": 0.26343366778149385, + "grad_norm": 0.4644854664802551, + "learning_rate": 8e-05, + "loss": 1.7469, + "step": 4726 + }, + { + "epoch": 0.26348940914158303, + "grad_norm": 0.5060609579086304, + "learning_rate": 8e-05, + "loss": 1.7745, + "step": 4727 + }, + { + "epoch": 0.26354515050167227, + "grad_norm": 0.5291848182678223, + "learning_rate": 8e-05, + "loss": 1.8109, + "step": 4728 + }, + { + "epoch": 0.26360089186176144, + "grad_norm": 0.5288201570510864, + "learning_rate": 8e-05, + "loss": 1.7741, + "step": 4729 + }, + { + "epoch": 0.2636566332218506, + "grad_norm": 0.5466488599777222, + 
"learning_rate": 8e-05, + "loss": 1.901, + "step": 4730 + }, + { + "epoch": 0.2637123745819398, + "grad_norm": 0.5276413559913635, + "learning_rate": 8e-05, + "loss": 1.5624, + "step": 4731 + }, + { + "epoch": 0.263768115942029, + "grad_norm": 0.5265539288520813, + "learning_rate": 8e-05, + "loss": 1.8041, + "step": 4732 + }, + { + "epoch": 0.26382385730211816, + "grad_norm": 0.4656563997268677, + "learning_rate": 8e-05, + "loss": 1.5169, + "step": 4733 + }, + { + "epoch": 0.26387959866220734, + "grad_norm": 0.5088443160057068, + "learning_rate": 8e-05, + "loss": 1.7473, + "step": 4734 + }, + { + "epoch": 0.26393534002229657, + "grad_norm": 0.5077308416366577, + "learning_rate": 8e-05, + "loss": 1.6796, + "step": 4735 + }, + { + "epoch": 0.26399108138238575, + "grad_norm": 0.45367294549942017, + "learning_rate": 8e-05, + "loss": 1.5232, + "step": 4736 + }, + { + "epoch": 0.26404682274247493, + "grad_norm": 0.5136170983314514, + "learning_rate": 8e-05, + "loss": 1.7879, + "step": 4737 + }, + { + "epoch": 0.2641025641025641, + "grad_norm": 0.5601361393928528, + "learning_rate": 8e-05, + "loss": 1.8539, + "step": 4738 + }, + { + "epoch": 0.2641583054626533, + "grad_norm": 0.49229079484939575, + "learning_rate": 8e-05, + "loss": 1.6867, + "step": 4739 + }, + { + "epoch": 0.26421404682274247, + "grad_norm": 0.4874022305011749, + "learning_rate": 8e-05, + "loss": 1.6835, + "step": 4740 + }, + { + "epoch": 0.26426978818283164, + "grad_norm": 0.4711064398288727, + "learning_rate": 8e-05, + "loss": 1.4715, + "step": 4741 + }, + { + "epoch": 0.2643255295429208, + "grad_norm": 0.4626575708389282, + "learning_rate": 8e-05, + "loss": 1.3407, + "step": 4742 + }, + { + "epoch": 0.26438127090301006, + "grad_norm": 0.5319688320159912, + "learning_rate": 8e-05, + "loss": 1.7915, + "step": 4743 + }, + { + "epoch": 0.26443701226309924, + "grad_norm": 0.5631923079490662, + "learning_rate": 8e-05, + "loss": 1.8325, + "step": 4744 + }, + { + "epoch": 0.2644927536231884, + "grad_norm": 
0.5148433446884155, + "learning_rate": 8e-05, + "loss": 1.6658, + "step": 4745 + }, + { + "epoch": 0.2645484949832776, + "grad_norm": 0.5387375950813293, + "learning_rate": 8e-05, + "loss": 1.7299, + "step": 4746 + }, + { + "epoch": 0.26460423634336677, + "grad_norm": 0.48994046449661255, + "learning_rate": 8e-05, + "loss": 1.7253, + "step": 4747 + }, + { + "epoch": 0.26465997770345595, + "grad_norm": 0.4521850049495697, + "learning_rate": 8e-05, + "loss": 1.5103, + "step": 4748 + }, + { + "epoch": 0.26471571906354513, + "grad_norm": 0.49143075942993164, + "learning_rate": 8e-05, + "loss": 1.5406, + "step": 4749 + }, + { + "epoch": 0.26477146042363436, + "grad_norm": 0.5273923873901367, + "learning_rate": 8e-05, + "loss": 1.7373, + "step": 4750 + }, + { + "epoch": 0.26482720178372354, + "grad_norm": 0.513289749622345, + "learning_rate": 8e-05, + "loss": 1.6385, + "step": 4751 + }, + { + "epoch": 0.2648829431438127, + "grad_norm": 0.5224297642707825, + "learning_rate": 8e-05, + "loss": 1.8056, + "step": 4752 + }, + { + "epoch": 0.2649386845039019, + "grad_norm": 0.449360728263855, + "learning_rate": 8e-05, + "loss": 1.3766, + "step": 4753 + }, + { + "epoch": 0.2649944258639911, + "grad_norm": 0.5372068285942078, + "learning_rate": 8e-05, + "loss": 1.7699, + "step": 4754 + }, + { + "epoch": 0.26505016722408026, + "grad_norm": 0.523019552230835, + "learning_rate": 8e-05, + "loss": 1.6648, + "step": 4755 + }, + { + "epoch": 0.26510590858416944, + "grad_norm": 0.5103756785392761, + "learning_rate": 8e-05, + "loss": 1.7959, + "step": 4756 + }, + { + "epoch": 0.2651616499442586, + "grad_norm": 0.8579369783401489, + "learning_rate": 8e-05, + "loss": 1.6392, + "step": 4757 + }, + { + "epoch": 0.26521739130434785, + "grad_norm": 0.550744891166687, + "learning_rate": 8e-05, + "loss": 1.7364, + "step": 4758 + }, + { + "epoch": 0.26527313266443703, + "grad_norm": 0.5084554553031921, + "learning_rate": 8e-05, + "loss": 1.6298, + "step": 4759 + }, + { + "epoch": 
0.2653288740245262, + "grad_norm": 0.6770187616348267, + "learning_rate": 8e-05, + "loss": 1.5945, + "step": 4760 + }, + { + "epoch": 0.2653846153846154, + "grad_norm": 0.49051225185394287, + "learning_rate": 8e-05, + "loss": 1.5087, + "step": 4761 + }, + { + "epoch": 0.26544035674470456, + "grad_norm": 0.5356156826019287, + "learning_rate": 8e-05, + "loss": 1.64, + "step": 4762 + }, + { + "epoch": 0.26549609810479374, + "grad_norm": 0.5299071669578552, + "learning_rate": 8e-05, + "loss": 1.6133, + "step": 4763 + }, + { + "epoch": 0.2655518394648829, + "grad_norm": 0.48002979159355164, + "learning_rate": 8e-05, + "loss": 1.4577, + "step": 4764 + }, + { + "epoch": 0.26560758082497216, + "grad_norm": 0.48105019330978394, + "learning_rate": 8e-05, + "loss": 1.6341, + "step": 4765 + }, + { + "epoch": 0.26566332218506133, + "grad_norm": 0.5923892259597778, + "learning_rate": 8e-05, + "loss": 2.0087, + "step": 4766 + }, + { + "epoch": 0.2657190635451505, + "grad_norm": 0.48609718680381775, + "learning_rate": 8e-05, + "loss": 1.5758, + "step": 4767 + }, + { + "epoch": 0.2657748049052397, + "grad_norm": 0.49808210134506226, + "learning_rate": 8e-05, + "loss": 1.6638, + "step": 4768 + }, + { + "epoch": 0.26583054626532887, + "grad_norm": 0.4882586598396301, + "learning_rate": 8e-05, + "loss": 1.579, + "step": 4769 + }, + { + "epoch": 0.26588628762541805, + "grad_norm": 0.5722583532333374, + "learning_rate": 8e-05, + "loss": 1.9595, + "step": 4770 + }, + { + "epoch": 0.26594202898550723, + "grad_norm": 0.47601956129074097, + "learning_rate": 8e-05, + "loss": 1.5847, + "step": 4771 + }, + { + "epoch": 0.2659977703455964, + "grad_norm": 0.4781881868839264, + "learning_rate": 8e-05, + "loss": 1.61, + "step": 4772 + }, + { + "epoch": 0.26605351170568564, + "grad_norm": 0.5992628931999207, + "learning_rate": 8e-05, + "loss": 1.7718, + "step": 4773 + }, + { + "epoch": 0.2661092530657748, + "grad_norm": 0.49576088786125183, + "learning_rate": 8e-05, + "loss": 1.7456, + "step": 4774 
+ }, + { + "epoch": 0.266164994425864, + "grad_norm": 0.49744564294815063, + "learning_rate": 8e-05, + "loss": 1.6193, + "step": 4775 + }, + { + "epoch": 0.2662207357859532, + "grad_norm": 0.518963634967804, + "learning_rate": 8e-05, + "loss": 1.7641, + "step": 4776 + }, + { + "epoch": 0.26627647714604236, + "grad_norm": 0.48951300978660583, + "learning_rate": 8e-05, + "loss": 1.6035, + "step": 4777 + }, + { + "epoch": 0.26633221850613154, + "grad_norm": 0.5045288801193237, + "learning_rate": 8e-05, + "loss": 1.7325, + "step": 4778 + }, + { + "epoch": 0.2663879598662207, + "grad_norm": 0.5186219811439514, + "learning_rate": 8e-05, + "loss": 1.7007, + "step": 4779 + }, + { + "epoch": 0.26644370122630995, + "grad_norm": 0.5073248744010925, + "learning_rate": 8e-05, + "loss": 1.7715, + "step": 4780 + }, + { + "epoch": 0.2664994425863991, + "grad_norm": 0.5213096141815186, + "learning_rate": 8e-05, + "loss": 1.684, + "step": 4781 + }, + { + "epoch": 0.2665551839464883, + "grad_norm": 0.47361043095588684, + "learning_rate": 8e-05, + "loss": 1.6807, + "step": 4782 + }, + { + "epoch": 0.2666109253065775, + "grad_norm": 0.48980844020843506, + "learning_rate": 8e-05, + "loss": 1.7931, + "step": 4783 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.48362529277801514, + "learning_rate": 8e-05, + "loss": 1.639, + "step": 4784 + }, + { + "epoch": 0.26672240802675584, + "grad_norm": 0.5443851947784424, + "learning_rate": 8e-05, + "loss": 1.8447, + "step": 4785 + }, + { + "epoch": 0.266778149386845, + "grad_norm": 0.4934808015823364, + "learning_rate": 8e-05, + "loss": 1.8289, + "step": 4786 + }, + { + "epoch": 0.2668338907469342, + "grad_norm": 0.5453766584396362, + "learning_rate": 8e-05, + "loss": 1.9642, + "step": 4787 + }, + { + "epoch": 0.26688963210702343, + "grad_norm": 0.46825507283210754, + "learning_rate": 8e-05, + "loss": 1.5369, + "step": 4788 + }, + { + "epoch": 0.2669453734671126, + "grad_norm": 0.5072756409645081, + "learning_rate": 8e-05, + "loss": 
1.5771, + "step": 4789 + }, + { + "epoch": 0.2670011148272018, + "grad_norm": 0.5206819176673889, + "learning_rate": 8e-05, + "loss": 1.8058, + "step": 4790 + }, + { + "epoch": 0.26705685618729097, + "grad_norm": 0.47623395919799805, + "learning_rate": 8e-05, + "loss": 1.7826, + "step": 4791 + }, + { + "epoch": 0.26711259754738015, + "grad_norm": 0.5042856335639954, + "learning_rate": 8e-05, + "loss": 1.8121, + "step": 4792 + }, + { + "epoch": 0.2671683389074693, + "grad_norm": 0.48193052411079407, + "learning_rate": 8e-05, + "loss": 1.7285, + "step": 4793 + }, + { + "epoch": 0.2672240802675585, + "grad_norm": 0.4764842689037323, + "learning_rate": 8e-05, + "loss": 1.7433, + "step": 4794 + }, + { + "epoch": 0.26727982162764774, + "grad_norm": 0.5621132254600525, + "learning_rate": 8e-05, + "loss": 1.6013, + "step": 4795 + }, + { + "epoch": 0.2673355629877369, + "grad_norm": 0.48127683997154236, + "learning_rate": 8e-05, + "loss": 1.8514, + "step": 4796 + }, + { + "epoch": 0.2673913043478261, + "grad_norm": 0.46192049980163574, + "learning_rate": 8e-05, + "loss": 1.7205, + "step": 4797 + }, + { + "epoch": 0.2674470457079153, + "grad_norm": 0.46006837487220764, + "learning_rate": 8e-05, + "loss": 1.5512, + "step": 4798 + }, + { + "epoch": 0.26750278706800446, + "grad_norm": 0.4827045500278473, + "learning_rate": 8e-05, + "loss": 1.6037, + "step": 4799 + }, + { + "epoch": 0.26755852842809363, + "grad_norm": 0.534766435623169, + "learning_rate": 8e-05, + "loss": 1.4909, + "step": 4800 + }, + { + "epoch": 0.2676142697881828, + "grad_norm": 0.516715943813324, + "learning_rate": 8e-05, + "loss": 1.8948, + "step": 4801 + }, + { + "epoch": 0.267670011148272, + "grad_norm": 0.5046938061714172, + "learning_rate": 8e-05, + "loss": 1.7482, + "step": 4802 + }, + { + "epoch": 0.2677257525083612, + "grad_norm": 0.4876144826412201, + "learning_rate": 8e-05, + "loss": 1.6736, + "step": 4803 + }, + { + "epoch": 0.2677814938684504, + "grad_norm": 0.5733070969581604, + "learning_rate": 
8e-05, + "loss": 1.9056, + "step": 4804 + }, + { + "epoch": 0.2678372352285396, + "grad_norm": 0.5142390727996826, + "learning_rate": 8e-05, + "loss": 1.648, + "step": 4805 + }, + { + "epoch": 0.26789297658862876, + "grad_norm": 0.49695613980293274, + "learning_rate": 8e-05, + "loss": 1.715, + "step": 4806 + }, + { + "epoch": 0.26794871794871794, + "grad_norm": 0.4340948462486267, + "learning_rate": 8e-05, + "loss": 1.156, + "step": 4807 + }, + { + "epoch": 0.2680044593088071, + "grad_norm": 0.47531047463417053, + "learning_rate": 8e-05, + "loss": 1.5231, + "step": 4808 + }, + { + "epoch": 0.2680602006688963, + "grad_norm": 0.48843592405319214, + "learning_rate": 8e-05, + "loss": 1.5196, + "step": 4809 + }, + { + "epoch": 0.26811594202898553, + "grad_norm": 0.5055053234100342, + "learning_rate": 8e-05, + "loss": 1.5967, + "step": 4810 + }, + { + "epoch": 0.2681716833890747, + "grad_norm": 0.5026859641075134, + "learning_rate": 8e-05, + "loss": 1.8444, + "step": 4811 + }, + { + "epoch": 0.2682274247491639, + "grad_norm": 0.4923660159111023, + "learning_rate": 8e-05, + "loss": 1.6562, + "step": 4812 + }, + { + "epoch": 0.26828316610925307, + "grad_norm": 0.5042104721069336, + "learning_rate": 8e-05, + "loss": 1.3715, + "step": 4813 + }, + { + "epoch": 0.26833890746934225, + "grad_norm": 0.4890134930610657, + "learning_rate": 8e-05, + "loss": 1.7762, + "step": 4814 + }, + { + "epoch": 0.2683946488294314, + "grad_norm": 0.5024535655975342, + "learning_rate": 8e-05, + "loss": 1.4346, + "step": 4815 + }, + { + "epoch": 0.2684503901895206, + "grad_norm": 0.5002022981643677, + "learning_rate": 8e-05, + "loss": 1.6237, + "step": 4816 + }, + { + "epoch": 0.2685061315496098, + "grad_norm": 0.4712158441543579, + "learning_rate": 8e-05, + "loss": 1.5621, + "step": 4817 + }, + { + "epoch": 0.268561872909699, + "grad_norm": 0.553865909576416, + "learning_rate": 8e-05, + "loss": 1.8666, + "step": 4818 + }, + { + "epoch": 0.2686176142697882, + "grad_norm": 0.6038681268692017, + 
"learning_rate": 8e-05, + "loss": 1.5174, + "step": 4819 + }, + { + "epoch": 0.2686733556298774, + "grad_norm": 0.5450118184089661, + "learning_rate": 8e-05, + "loss": 1.9144, + "step": 4820 + }, + { + "epoch": 0.26872909698996655, + "grad_norm": 0.517101526260376, + "learning_rate": 8e-05, + "loss": 1.7423, + "step": 4821 + }, + { + "epoch": 0.26878483835005573, + "grad_norm": 0.5435220003128052, + "learning_rate": 8e-05, + "loss": 1.7849, + "step": 4822 + }, + { + "epoch": 0.2688405797101449, + "grad_norm": 0.5636661648750305, + "learning_rate": 8e-05, + "loss": 1.9104, + "step": 4823 + }, + { + "epoch": 0.2688963210702341, + "grad_norm": 0.4907018840312958, + "learning_rate": 8e-05, + "loss": 1.8114, + "step": 4824 + }, + { + "epoch": 0.2689520624303233, + "grad_norm": 0.4901597499847412, + "learning_rate": 8e-05, + "loss": 1.4494, + "step": 4825 + }, + { + "epoch": 0.2690078037904125, + "grad_norm": 0.5196972489356995, + "learning_rate": 8e-05, + "loss": 1.8353, + "step": 4826 + }, + { + "epoch": 0.2690635451505017, + "grad_norm": 0.4989320933818817, + "learning_rate": 8e-05, + "loss": 1.7714, + "step": 4827 + }, + { + "epoch": 0.26911928651059086, + "grad_norm": 0.4761400818824768, + "learning_rate": 8e-05, + "loss": 1.6661, + "step": 4828 + }, + { + "epoch": 0.26917502787068004, + "grad_norm": 0.4977497160434723, + "learning_rate": 8e-05, + "loss": 1.5345, + "step": 4829 + }, + { + "epoch": 0.2692307692307692, + "grad_norm": 0.6041051745414734, + "learning_rate": 8e-05, + "loss": 1.8069, + "step": 4830 + }, + { + "epoch": 0.2692865105908584, + "grad_norm": 0.5953145027160645, + "learning_rate": 8e-05, + "loss": 2.0057, + "step": 4831 + }, + { + "epoch": 0.2693422519509476, + "grad_norm": 0.4859643280506134, + "learning_rate": 8e-05, + "loss": 1.8089, + "step": 4832 + }, + { + "epoch": 0.2693979933110368, + "grad_norm": 0.5140557289123535, + "learning_rate": 8e-05, + "loss": 1.7321, + "step": 4833 + }, + { + "epoch": 0.269453734671126, + "grad_norm": 
0.561446487903595, + "learning_rate": 8e-05, + "loss": 1.7825, + "step": 4834 + }, + { + "epoch": 0.26950947603121517, + "grad_norm": 0.5055059194564819, + "learning_rate": 8e-05, + "loss": 1.7342, + "step": 4835 + }, + { + "epoch": 0.26956521739130435, + "grad_norm": 0.5057458281517029, + "learning_rate": 8e-05, + "loss": 1.6527, + "step": 4836 + }, + { + "epoch": 0.2696209587513935, + "grad_norm": 0.4766155183315277, + "learning_rate": 8e-05, + "loss": 1.5412, + "step": 4837 + }, + { + "epoch": 0.2696767001114827, + "grad_norm": 0.46934545040130615, + "learning_rate": 8e-05, + "loss": 1.5501, + "step": 4838 + }, + { + "epoch": 0.2697324414715719, + "grad_norm": 0.4834466278553009, + "learning_rate": 8e-05, + "loss": 1.4659, + "step": 4839 + }, + { + "epoch": 0.2697881828316611, + "grad_norm": 0.46945199370384216, + "learning_rate": 8e-05, + "loss": 1.4448, + "step": 4840 + }, + { + "epoch": 0.2698439241917503, + "grad_norm": 0.5626839995384216, + "learning_rate": 8e-05, + "loss": 1.7148, + "step": 4841 + }, + { + "epoch": 0.2698996655518395, + "grad_norm": 0.4671670198440552, + "learning_rate": 8e-05, + "loss": 1.5116, + "step": 4842 + }, + { + "epoch": 0.26995540691192865, + "grad_norm": 0.46738243103027344, + "learning_rate": 8e-05, + "loss": 1.624, + "step": 4843 + }, + { + "epoch": 0.27001114827201783, + "grad_norm": 0.49982571601867676, + "learning_rate": 8e-05, + "loss": 1.6875, + "step": 4844 + }, + { + "epoch": 0.270066889632107, + "grad_norm": 0.45273247361183167, + "learning_rate": 8e-05, + "loss": 1.5005, + "step": 4845 + }, + { + "epoch": 0.2701226309921962, + "grad_norm": 0.44871610403060913, + "learning_rate": 8e-05, + "loss": 1.4288, + "step": 4846 + }, + { + "epoch": 0.2701783723522854, + "grad_norm": 0.5660805106163025, + "learning_rate": 8e-05, + "loss": 1.8574, + "step": 4847 + }, + { + "epoch": 0.2702341137123746, + "grad_norm": 0.49467673897743225, + "learning_rate": 8e-05, + "loss": 1.6284, + "step": 4848 + }, + { + "epoch": 
0.2702898550724638, + "grad_norm": 0.5443269610404968, + "learning_rate": 8e-05, + "loss": 1.681, + "step": 4849 + }, + { + "epoch": 0.27034559643255296, + "grad_norm": 0.5486834049224854, + "learning_rate": 8e-05, + "loss": 1.7333, + "step": 4850 + }, + { + "epoch": 0.27040133779264214, + "grad_norm": 0.4856089651584625, + "learning_rate": 8e-05, + "loss": 1.545, + "step": 4851 + }, + { + "epoch": 0.2704570791527313, + "grad_norm": 0.5295234322547913, + "learning_rate": 8e-05, + "loss": 1.6445, + "step": 4852 + }, + { + "epoch": 0.2705128205128205, + "grad_norm": 0.5542267560958862, + "learning_rate": 8e-05, + "loss": 1.7465, + "step": 4853 + }, + { + "epoch": 0.2705685618729097, + "grad_norm": 0.43070581555366516, + "learning_rate": 8e-05, + "loss": 1.286, + "step": 4854 + }, + { + "epoch": 0.2706243032329989, + "grad_norm": 0.5551098585128784, + "learning_rate": 8e-05, + "loss": 1.7018, + "step": 4855 + }, + { + "epoch": 0.2706800445930881, + "grad_norm": 0.5335925817489624, + "learning_rate": 8e-05, + "loss": 1.7035, + "step": 4856 + }, + { + "epoch": 0.27073578595317727, + "grad_norm": 0.48526740074157715, + "learning_rate": 8e-05, + "loss": 1.6353, + "step": 4857 + }, + { + "epoch": 0.27079152731326644, + "grad_norm": 0.5087800025939941, + "learning_rate": 8e-05, + "loss": 1.8624, + "step": 4858 + }, + { + "epoch": 0.2708472686733556, + "grad_norm": 0.4876925051212311, + "learning_rate": 8e-05, + "loss": 1.6672, + "step": 4859 + }, + { + "epoch": 0.2709030100334448, + "grad_norm": 0.5388376712799072, + "learning_rate": 8e-05, + "loss": 1.9496, + "step": 4860 + }, + { + "epoch": 0.270958751393534, + "grad_norm": 0.46098411083221436, + "learning_rate": 8e-05, + "loss": 1.5343, + "step": 4861 + }, + { + "epoch": 0.2710144927536232, + "grad_norm": 0.4828426241874695, + "learning_rate": 8e-05, + "loss": 1.5259, + "step": 4862 + }, + { + "epoch": 0.2710702341137124, + "grad_norm": 0.5513729453086853, + "learning_rate": 8e-05, + "loss": 2.01, + "step": 4863 + }, + { 
+ "epoch": 0.2711259754738016, + "grad_norm": 0.49254027009010315, + "learning_rate": 8e-05, + "loss": 1.6194, + "step": 4864 + }, + { + "epoch": 0.27118171683389075, + "grad_norm": 0.5209529995918274, + "learning_rate": 8e-05, + "loss": 1.6279, + "step": 4865 + }, + { + "epoch": 0.27123745819397993, + "grad_norm": 0.5383169054985046, + "learning_rate": 8e-05, + "loss": 1.8516, + "step": 4866 + }, + { + "epoch": 0.2712931995540691, + "grad_norm": 0.4927859306335449, + "learning_rate": 8e-05, + "loss": 1.585, + "step": 4867 + }, + { + "epoch": 0.2713489409141583, + "grad_norm": 0.5537318587303162, + "learning_rate": 8e-05, + "loss": 1.9545, + "step": 4868 + }, + { + "epoch": 0.27140468227424747, + "grad_norm": 0.5104342103004456, + "learning_rate": 8e-05, + "loss": 1.6882, + "step": 4869 + }, + { + "epoch": 0.2714604236343367, + "grad_norm": 0.4823477268218994, + "learning_rate": 8e-05, + "loss": 1.8651, + "step": 4870 + }, + { + "epoch": 0.2715161649944259, + "grad_norm": 0.5014148950576782, + "learning_rate": 8e-05, + "loss": 1.9006, + "step": 4871 + }, + { + "epoch": 0.27157190635451506, + "grad_norm": 0.4821976125240326, + "learning_rate": 8e-05, + "loss": 1.6347, + "step": 4872 + }, + { + "epoch": 0.27162764771460424, + "grad_norm": 0.5137785077095032, + "learning_rate": 8e-05, + "loss": 1.8702, + "step": 4873 + }, + { + "epoch": 0.2716833890746934, + "grad_norm": 0.4899905323982239, + "learning_rate": 8e-05, + "loss": 1.7127, + "step": 4874 + }, + { + "epoch": 0.2717391304347826, + "grad_norm": 0.5382660627365112, + "learning_rate": 8e-05, + "loss": 1.683, + "step": 4875 + }, + { + "epoch": 0.2717948717948718, + "grad_norm": 0.47894027829170227, + "learning_rate": 8e-05, + "loss": 1.5703, + "step": 4876 + }, + { + "epoch": 0.271850613154961, + "grad_norm": 0.46718770265579224, + "learning_rate": 8e-05, + "loss": 1.6388, + "step": 4877 + }, + { + "epoch": 0.2719063545150502, + "grad_norm": 0.5733405947685242, + "learning_rate": 8e-05, + "loss": 1.9859, + 
"step": 4878 + }, + { + "epoch": 0.27196209587513936, + "grad_norm": 0.4837845265865326, + "learning_rate": 8e-05, + "loss": 1.5767, + "step": 4879 + }, + { + "epoch": 0.27201783723522854, + "grad_norm": 0.486235111951828, + "learning_rate": 8e-05, + "loss": 1.4791, + "step": 4880 + }, + { + "epoch": 0.2720735785953177, + "grad_norm": 0.48340904712677, + "learning_rate": 8e-05, + "loss": 1.7882, + "step": 4881 + }, + { + "epoch": 0.2721293199554069, + "grad_norm": 0.6102102994918823, + "learning_rate": 8e-05, + "loss": 1.6115, + "step": 4882 + }, + { + "epoch": 0.2721850613154961, + "grad_norm": 0.5262008309364319, + "learning_rate": 8e-05, + "loss": 1.5027, + "step": 4883 + }, + { + "epoch": 0.27224080267558526, + "grad_norm": 0.5147656202316284, + "learning_rate": 8e-05, + "loss": 1.5817, + "step": 4884 + }, + { + "epoch": 0.2722965440356745, + "grad_norm": 0.4796406328678131, + "learning_rate": 8e-05, + "loss": 1.6345, + "step": 4885 + }, + { + "epoch": 0.27235228539576367, + "grad_norm": 0.500372588634491, + "learning_rate": 8e-05, + "loss": 1.8484, + "step": 4886 + }, + { + "epoch": 0.27240802675585285, + "grad_norm": 0.49836209416389465, + "learning_rate": 8e-05, + "loss": 1.7545, + "step": 4887 + }, + { + "epoch": 0.27246376811594203, + "grad_norm": 0.5067594051361084, + "learning_rate": 8e-05, + "loss": 1.7822, + "step": 4888 + }, + { + "epoch": 0.2725195094760312, + "grad_norm": 0.5147234201431274, + "learning_rate": 8e-05, + "loss": 1.7458, + "step": 4889 + }, + { + "epoch": 0.2725752508361204, + "grad_norm": 0.5145500302314758, + "learning_rate": 8e-05, + "loss": 1.6601, + "step": 4890 + }, + { + "epoch": 0.27263099219620957, + "grad_norm": 0.4750419855117798, + "learning_rate": 8e-05, + "loss": 1.5363, + "step": 4891 + }, + { + "epoch": 0.2726867335562988, + "grad_norm": 0.4774273931980133, + "learning_rate": 8e-05, + "loss": 1.7132, + "step": 4892 + }, + { + "epoch": 0.272742474916388, + "grad_norm": 0.5947921276092529, + "learning_rate": 8e-05, + 
"loss": 1.5743, + "step": 4893 + }, + { + "epoch": 0.27279821627647716, + "grad_norm": 0.5759541988372803, + "learning_rate": 8e-05, + "loss": 1.9485, + "step": 4894 + }, + { + "epoch": 0.27285395763656634, + "grad_norm": 0.5098536014556885, + "learning_rate": 8e-05, + "loss": 1.8144, + "step": 4895 + }, + { + "epoch": 0.2729096989966555, + "grad_norm": 0.48607727885246277, + "learning_rate": 8e-05, + "loss": 1.6344, + "step": 4896 + }, + { + "epoch": 0.2729654403567447, + "grad_norm": 0.48327797651290894, + "learning_rate": 8e-05, + "loss": 1.5865, + "step": 4897 + }, + { + "epoch": 0.27302118171683387, + "grad_norm": 0.5167771577835083, + "learning_rate": 8e-05, + "loss": 1.5991, + "step": 4898 + }, + { + "epoch": 0.27307692307692305, + "grad_norm": 0.5023355484008789, + "learning_rate": 8e-05, + "loss": 1.5283, + "step": 4899 + }, + { + "epoch": 0.2731326644370123, + "grad_norm": 0.49115991592407227, + "learning_rate": 8e-05, + "loss": 1.6185, + "step": 4900 + }, + { + "epoch": 0.27318840579710146, + "grad_norm": 0.49574774503707886, + "learning_rate": 8e-05, + "loss": 1.6107, + "step": 4901 + }, + { + "epoch": 0.27324414715719064, + "grad_norm": 0.5031110048294067, + "learning_rate": 8e-05, + "loss": 1.804, + "step": 4902 + }, + { + "epoch": 0.2732998885172798, + "grad_norm": 0.5140913724899292, + "learning_rate": 8e-05, + "loss": 1.633, + "step": 4903 + }, + { + "epoch": 0.273355629877369, + "grad_norm": 0.5243213176727295, + "learning_rate": 8e-05, + "loss": 1.7039, + "step": 4904 + }, + { + "epoch": 0.2734113712374582, + "grad_norm": 0.5083054304122925, + "learning_rate": 8e-05, + "loss": 1.7294, + "step": 4905 + }, + { + "epoch": 0.27346711259754736, + "grad_norm": 0.5570263862609863, + "learning_rate": 8e-05, + "loss": 1.5553, + "step": 4906 + }, + { + "epoch": 0.2735228539576366, + "grad_norm": 0.5495603680610657, + "learning_rate": 8e-05, + "loss": 1.4761, + "step": 4907 + }, + { + "epoch": 0.27357859531772577, + "grad_norm": 0.5350089073181152, + 
"learning_rate": 8e-05, + "loss": 1.8056, + "step": 4908 + }, + { + "epoch": 0.27363433667781495, + "grad_norm": 0.4990091025829315, + "learning_rate": 8e-05, + "loss": 1.7132, + "step": 4909 + }, + { + "epoch": 0.2736900780379041, + "grad_norm": 0.5004948973655701, + "learning_rate": 8e-05, + "loss": 1.6129, + "step": 4910 + }, + { + "epoch": 0.2737458193979933, + "grad_norm": 0.48411688208580017, + "learning_rate": 8e-05, + "loss": 1.5139, + "step": 4911 + }, + { + "epoch": 0.2738015607580825, + "grad_norm": 0.511704683303833, + "learning_rate": 8e-05, + "loss": 1.7031, + "step": 4912 + }, + { + "epoch": 0.27385730211817166, + "grad_norm": 0.5102469325065613, + "learning_rate": 8e-05, + "loss": 1.6607, + "step": 4913 + }, + { + "epoch": 0.27391304347826084, + "grad_norm": 0.5354444980621338, + "learning_rate": 8e-05, + "loss": 1.6374, + "step": 4914 + }, + { + "epoch": 0.2739687848383501, + "grad_norm": 0.5184294581413269, + "learning_rate": 8e-05, + "loss": 1.7372, + "step": 4915 + }, + { + "epoch": 0.27402452619843926, + "grad_norm": 0.5330214500427246, + "learning_rate": 8e-05, + "loss": 1.8916, + "step": 4916 + }, + { + "epoch": 0.27408026755852843, + "grad_norm": 0.5457834601402283, + "learning_rate": 8e-05, + "loss": 1.9441, + "step": 4917 + }, + { + "epoch": 0.2741360089186176, + "grad_norm": 0.5252987146377563, + "learning_rate": 8e-05, + "loss": 1.6714, + "step": 4918 + }, + { + "epoch": 0.2741917502787068, + "grad_norm": 0.4930163025856018, + "learning_rate": 8e-05, + "loss": 1.8325, + "step": 4919 + }, + { + "epoch": 0.27424749163879597, + "grad_norm": 0.49244213104248047, + "learning_rate": 8e-05, + "loss": 1.8733, + "step": 4920 + }, + { + "epoch": 0.27430323299888515, + "grad_norm": 0.7317171692848206, + "learning_rate": 8e-05, + "loss": 1.5259, + "step": 4921 + }, + { + "epoch": 0.2743589743589744, + "grad_norm": 0.5334247946739197, + "learning_rate": 8e-05, + "loss": 1.9318, + "step": 4922 + }, + { + "epoch": 0.27441471571906356, + "grad_norm": 
0.4989931881427765, + "learning_rate": 8e-05, + "loss": 1.7103, + "step": 4923 + }, + { + "epoch": 0.27447045707915274, + "grad_norm": 0.4885168969631195, + "learning_rate": 8e-05, + "loss": 1.667, + "step": 4924 + }, + { + "epoch": 0.2745261984392419, + "grad_norm": 0.5465773940086365, + "learning_rate": 8e-05, + "loss": 1.7775, + "step": 4925 + }, + { + "epoch": 0.2745819397993311, + "grad_norm": 0.4521493911743164, + "learning_rate": 8e-05, + "loss": 1.3499, + "step": 4926 + }, + { + "epoch": 0.2746376811594203, + "grad_norm": 0.5232551693916321, + "learning_rate": 8e-05, + "loss": 1.889, + "step": 4927 + }, + { + "epoch": 0.27469342251950946, + "grad_norm": 0.5108181834220886, + "learning_rate": 8e-05, + "loss": 1.9147, + "step": 4928 + }, + { + "epoch": 0.27474916387959863, + "grad_norm": 0.500938355922699, + "learning_rate": 8e-05, + "loss": 1.5475, + "step": 4929 + }, + { + "epoch": 0.27480490523968787, + "grad_norm": 0.5091776847839355, + "learning_rate": 8e-05, + "loss": 1.7221, + "step": 4930 + }, + { + "epoch": 0.27486064659977705, + "grad_norm": 0.5064066052436829, + "learning_rate": 8e-05, + "loss": 1.5555, + "step": 4931 + }, + { + "epoch": 0.2749163879598662, + "grad_norm": 0.5267865657806396, + "learning_rate": 8e-05, + "loss": 1.7589, + "step": 4932 + }, + { + "epoch": 0.2749721293199554, + "grad_norm": 0.4712657630443573, + "learning_rate": 8e-05, + "loss": 1.5217, + "step": 4933 + }, + { + "epoch": 0.2750278706800446, + "grad_norm": 0.45334136486053467, + "learning_rate": 8e-05, + "loss": 1.6751, + "step": 4934 + }, + { + "epoch": 0.27508361204013376, + "grad_norm": 0.5091070532798767, + "learning_rate": 8e-05, + "loss": 1.6838, + "step": 4935 + }, + { + "epoch": 0.27513935340022294, + "grad_norm": 0.5040649175643921, + "learning_rate": 8e-05, + "loss": 1.7316, + "step": 4936 + }, + { + "epoch": 0.2751950947603122, + "grad_norm": 0.5236833691596985, + "learning_rate": 8e-05, + "loss": 1.7511, + "step": 4937 + }, + { + "epoch": 
0.27525083612040135, + "grad_norm": 0.4728735089302063, + "learning_rate": 8e-05, + "loss": 1.5224, + "step": 4938 + }, + { + "epoch": 0.27530657748049053, + "grad_norm": 0.4994482100009918, + "learning_rate": 8e-05, + "loss": 1.7859, + "step": 4939 + }, + { + "epoch": 0.2753623188405797, + "grad_norm": 0.5201264023780823, + "learning_rate": 8e-05, + "loss": 1.759, + "step": 4940 + }, + { + "epoch": 0.2754180602006689, + "grad_norm": 0.4370453655719757, + "learning_rate": 8e-05, + "loss": 1.4542, + "step": 4941 + }, + { + "epoch": 0.27547380156075807, + "grad_norm": 0.5194417834281921, + "learning_rate": 8e-05, + "loss": 1.9518, + "step": 4942 + }, + { + "epoch": 0.27552954292084725, + "grad_norm": 0.5058545470237732, + "learning_rate": 8e-05, + "loss": 1.6761, + "step": 4943 + }, + { + "epoch": 0.2755852842809365, + "grad_norm": 0.5490286946296692, + "learning_rate": 8e-05, + "loss": 1.8641, + "step": 4944 + }, + { + "epoch": 0.27564102564102566, + "grad_norm": 0.47824475169181824, + "learning_rate": 8e-05, + "loss": 1.7207, + "step": 4945 + }, + { + "epoch": 0.27569676700111484, + "grad_norm": 0.5228742957115173, + "learning_rate": 8e-05, + "loss": 1.729, + "step": 4946 + }, + { + "epoch": 0.275752508361204, + "grad_norm": 0.5374174118041992, + "learning_rate": 8e-05, + "loss": 1.6857, + "step": 4947 + }, + { + "epoch": 0.2758082497212932, + "grad_norm": 0.5289243459701538, + "learning_rate": 8e-05, + "loss": 1.7642, + "step": 4948 + }, + { + "epoch": 0.2758639910813824, + "grad_norm": 0.4823203980922699, + "learning_rate": 8e-05, + "loss": 1.74, + "step": 4949 + }, + { + "epoch": 0.27591973244147155, + "grad_norm": 0.4666809141635895, + "learning_rate": 8e-05, + "loss": 1.5925, + "step": 4950 + }, + { + "epoch": 0.27597547380156073, + "grad_norm": 0.4994995594024658, + "learning_rate": 8e-05, + "loss": 1.6443, + "step": 4951 + }, + { + "epoch": 0.27603121516164997, + "grad_norm": 0.47002774477005005, + "learning_rate": 8e-05, + "loss": 1.4266, + "step": 4952 + 
}, + { + "epoch": 0.27608695652173915, + "grad_norm": 0.5007594227790833, + "learning_rate": 8e-05, + "loss": 1.7219, + "step": 4953 + }, + { + "epoch": 0.2761426978818283, + "grad_norm": 0.49902820587158203, + "learning_rate": 8e-05, + "loss": 1.7606, + "step": 4954 + }, + { + "epoch": 0.2761984392419175, + "grad_norm": 0.5196755528450012, + "learning_rate": 8e-05, + "loss": 1.7113, + "step": 4955 + }, + { + "epoch": 0.2762541806020067, + "grad_norm": 0.4996965825557709, + "learning_rate": 8e-05, + "loss": 1.7651, + "step": 4956 + }, + { + "epoch": 0.27630992196209586, + "grad_norm": 0.52850741147995, + "learning_rate": 8e-05, + "loss": 1.7883, + "step": 4957 + }, + { + "epoch": 0.27636566332218504, + "grad_norm": 0.5271864533424377, + "learning_rate": 8e-05, + "loss": 1.9402, + "step": 4958 + }, + { + "epoch": 0.2764214046822743, + "grad_norm": 0.4749406576156616, + "learning_rate": 8e-05, + "loss": 1.5498, + "step": 4959 + }, + { + "epoch": 0.27647714604236345, + "grad_norm": 0.5235247611999512, + "learning_rate": 8e-05, + "loss": 1.8308, + "step": 4960 + }, + { + "epoch": 0.27653288740245263, + "grad_norm": 0.45240989327430725, + "learning_rate": 8e-05, + "loss": 1.1512, + "step": 4961 + }, + { + "epoch": 0.2765886287625418, + "grad_norm": 0.5295997262001038, + "learning_rate": 8e-05, + "loss": 1.7245, + "step": 4962 + }, + { + "epoch": 0.276644370122631, + "grad_norm": 0.5398911237716675, + "learning_rate": 8e-05, + "loss": 1.5923, + "step": 4963 + }, + { + "epoch": 0.27670011148272017, + "grad_norm": 0.5286577343940735, + "learning_rate": 8e-05, + "loss": 1.722, + "step": 4964 + }, + { + "epoch": 0.27675585284280935, + "grad_norm": 0.5557246208190918, + "learning_rate": 8e-05, + "loss": 1.7281, + "step": 4965 + }, + { + "epoch": 0.2768115942028985, + "grad_norm": 0.5491385459899902, + "learning_rate": 8e-05, + "loss": 1.9146, + "step": 4966 + }, + { + "epoch": 0.27686733556298776, + "grad_norm": 0.5278223156929016, + "learning_rate": 8e-05, + "loss": 1.7702, 
+ "step": 4967 + }, + { + "epoch": 0.27692307692307694, + "grad_norm": 0.5233393311500549, + "learning_rate": 8e-05, + "loss": 1.663, + "step": 4968 + }, + { + "epoch": 0.2769788182831661, + "grad_norm": 0.5258099436759949, + "learning_rate": 8e-05, + "loss": 1.6163, + "step": 4969 + }, + { + "epoch": 0.2770345596432553, + "grad_norm": 0.46531739830970764, + "learning_rate": 8e-05, + "loss": 1.5218, + "step": 4970 + }, + { + "epoch": 0.2770903010033445, + "grad_norm": 0.4865444004535675, + "learning_rate": 8e-05, + "loss": 1.7111, + "step": 4971 + }, + { + "epoch": 0.27714604236343365, + "grad_norm": 0.5154854655265808, + "learning_rate": 8e-05, + "loss": 1.6215, + "step": 4972 + }, + { + "epoch": 0.27720178372352283, + "grad_norm": 0.5724713802337646, + "learning_rate": 8e-05, + "loss": 1.7638, + "step": 4973 + }, + { + "epoch": 0.27725752508361207, + "grad_norm": 0.4830567240715027, + "learning_rate": 8e-05, + "loss": 1.6199, + "step": 4974 + }, + { + "epoch": 0.27731326644370125, + "grad_norm": 0.528709888458252, + "learning_rate": 8e-05, + "loss": 1.8588, + "step": 4975 + }, + { + "epoch": 0.2773690078037904, + "grad_norm": 0.4855372905731201, + "learning_rate": 8e-05, + "loss": 1.7285, + "step": 4976 + }, + { + "epoch": 0.2774247491638796, + "grad_norm": 0.5438281893730164, + "learning_rate": 8e-05, + "loss": 1.658, + "step": 4977 + }, + { + "epoch": 0.2774804905239688, + "grad_norm": 0.48184525966644287, + "learning_rate": 8e-05, + "loss": 1.6383, + "step": 4978 + }, + { + "epoch": 0.27753623188405796, + "grad_norm": 0.48708274960517883, + "learning_rate": 8e-05, + "loss": 1.7222, + "step": 4979 + }, + { + "epoch": 0.27759197324414714, + "grad_norm": 0.4790375828742981, + "learning_rate": 8e-05, + "loss": 1.6565, + "step": 4980 + }, + { + "epoch": 0.2776477146042363, + "grad_norm": 0.4739849269390106, + "learning_rate": 8e-05, + "loss": 1.6659, + "step": 4981 + }, + { + "epoch": 0.27770345596432555, + "grad_norm": 0.480943888425827, + "learning_rate": 8e-05, 
+ "loss": 1.6233, + "step": 4982 + }, + { + "epoch": 0.27775919732441473, + "grad_norm": 0.47466662526130676, + "learning_rate": 8e-05, + "loss": 1.7093, + "step": 4983 + }, + { + "epoch": 0.2778149386845039, + "grad_norm": 0.49456772208213806, + "learning_rate": 8e-05, + "loss": 1.6705, + "step": 4984 + }, + { + "epoch": 0.2778706800445931, + "grad_norm": 0.5678805112838745, + "learning_rate": 8e-05, + "loss": 1.76, + "step": 4985 + }, + { + "epoch": 0.27792642140468227, + "grad_norm": 0.4886873960494995, + "learning_rate": 8e-05, + "loss": 1.6806, + "step": 4986 + }, + { + "epoch": 0.27798216276477145, + "grad_norm": 0.5274010300636292, + "learning_rate": 8e-05, + "loss": 1.9928, + "step": 4987 + }, + { + "epoch": 0.2780379041248606, + "grad_norm": 0.5529814958572388, + "learning_rate": 8e-05, + "loss": 1.7622, + "step": 4988 + }, + { + "epoch": 0.27809364548494986, + "grad_norm": 0.4791403114795685, + "learning_rate": 8e-05, + "loss": 1.6132, + "step": 4989 + }, + { + "epoch": 0.27814938684503904, + "grad_norm": 0.5282229781150818, + "learning_rate": 8e-05, + "loss": 1.7558, + "step": 4990 + }, + { + "epoch": 0.2782051282051282, + "grad_norm": 0.47779935598373413, + "learning_rate": 8e-05, + "loss": 1.6692, + "step": 4991 + }, + { + "epoch": 0.2782608695652174, + "grad_norm": 0.5005864500999451, + "learning_rate": 8e-05, + "loss": 1.8308, + "step": 4992 + }, + { + "epoch": 0.2783166109253066, + "grad_norm": 0.49798935651779175, + "learning_rate": 8e-05, + "loss": 1.5786, + "step": 4993 + }, + { + "epoch": 0.27837235228539575, + "grad_norm": 0.5125159621238708, + "learning_rate": 8e-05, + "loss": 1.5815, + "step": 4994 + }, + { + "epoch": 0.27842809364548493, + "grad_norm": 0.5537694692611694, + "learning_rate": 8e-05, + "loss": 1.8341, + "step": 4995 + }, + { + "epoch": 0.2784838350055741, + "grad_norm": 0.552141547203064, + "learning_rate": 8e-05, + "loss": 1.4153, + "step": 4996 + }, + { + "epoch": 0.27853957636566334, + "grad_norm": 0.4947458803653717, + 
"learning_rate": 8e-05, + "loss": 1.6776, + "step": 4997 + }, + { + "epoch": 0.2785953177257525, + "grad_norm": 0.4945355951786041, + "learning_rate": 8e-05, + "loss": 1.5679, + "step": 4998 + }, + { + "epoch": 0.2786510590858417, + "grad_norm": 0.4887459874153137, + "learning_rate": 8e-05, + "loss": 1.5423, + "step": 4999 + }, + { + "epoch": 0.2787068004459309, + "grad_norm": 0.47073450684547424, + "learning_rate": 8e-05, + "loss": 1.5159, + "step": 5000 + }, + { + "epoch": 0.27876254180602006, + "grad_norm": 0.4986558258533478, + "learning_rate": 8e-05, + "loss": 1.523, + "step": 5001 + }, + { + "epoch": 0.27881828316610924, + "grad_norm": 0.5027152895927429, + "learning_rate": 8e-05, + "loss": 1.6087, + "step": 5002 + }, + { + "epoch": 0.2788740245261984, + "grad_norm": 0.4802919328212738, + "learning_rate": 8e-05, + "loss": 1.5582, + "step": 5003 + }, + { + "epoch": 0.27892976588628765, + "grad_norm": 0.5270285606384277, + "learning_rate": 8e-05, + "loss": 1.7817, + "step": 5004 + }, + { + "epoch": 0.27898550724637683, + "grad_norm": 0.49758392572402954, + "learning_rate": 8e-05, + "loss": 1.7247, + "step": 5005 + }, + { + "epoch": 0.279041248606466, + "grad_norm": 0.49667099118232727, + "learning_rate": 8e-05, + "loss": 1.6441, + "step": 5006 + }, + { + "epoch": 0.2790969899665552, + "grad_norm": 0.5154736638069153, + "learning_rate": 8e-05, + "loss": 1.7622, + "step": 5007 + }, + { + "epoch": 0.27915273132664437, + "grad_norm": 0.53494793176651, + "learning_rate": 8e-05, + "loss": 1.4958, + "step": 5008 + }, + { + "epoch": 0.27920847268673354, + "grad_norm": 0.5931455492973328, + "learning_rate": 8e-05, + "loss": 1.5992, + "step": 5009 + }, + { + "epoch": 0.2792642140468227, + "grad_norm": 0.4625272750854492, + "learning_rate": 8e-05, + "loss": 1.3788, + "step": 5010 + }, + { + "epoch": 0.2793199554069119, + "grad_norm": 0.47783440351486206, + "learning_rate": 8e-05, + "loss": 1.4587, + "step": 5011 + }, + { + "epoch": 0.27937569676700114, + "grad_norm": 
0.5056319832801819, + "learning_rate": 8e-05, + "loss": 1.7659, + "step": 5012 + }, + { + "epoch": 0.2794314381270903, + "grad_norm": 0.503319501876831, + "learning_rate": 8e-05, + "loss": 1.8524, + "step": 5013 + }, + { + "epoch": 0.2794871794871795, + "grad_norm": 0.49347031116485596, + "learning_rate": 8e-05, + "loss": 1.6201, + "step": 5014 + }, + { + "epoch": 0.27954292084726867, + "grad_norm": 0.4896809756755829, + "learning_rate": 8e-05, + "loss": 1.7937, + "step": 5015 + }, + { + "epoch": 0.27959866220735785, + "grad_norm": 0.516046404838562, + "learning_rate": 8e-05, + "loss": 1.794, + "step": 5016 + }, + { + "epoch": 0.27965440356744703, + "grad_norm": 0.46880456805229187, + "learning_rate": 8e-05, + "loss": 1.6199, + "step": 5017 + }, + { + "epoch": 0.2797101449275362, + "grad_norm": 0.49637213349342346, + "learning_rate": 8e-05, + "loss": 1.6879, + "step": 5018 + }, + { + "epoch": 0.27976588628762544, + "grad_norm": 0.5818153023719788, + "learning_rate": 8e-05, + "loss": 2.124, + "step": 5019 + }, + { + "epoch": 0.2798216276477146, + "grad_norm": 0.49280956387519836, + "learning_rate": 8e-05, + "loss": 1.7206, + "step": 5020 + }, + { + "epoch": 0.2798773690078038, + "grad_norm": 0.5034249424934387, + "learning_rate": 8e-05, + "loss": 1.7388, + "step": 5021 + }, + { + "epoch": 0.279933110367893, + "grad_norm": 0.4745542109012604, + "learning_rate": 8e-05, + "loss": 1.6114, + "step": 5022 + }, + { + "epoch": 0.27998885172798216, + "grad_norm": 0.5141556859016418, + "learning_rate": 8e-05, + "loss": 1.867, + "step": 5023 + }, + { + "epoch": 0.28004459308807134, + "grad_norm": 0.5179420113563538, + "learning_rate": 8e-05, + "loss": 1.8646, + "step": 5024 + }, + { + "epoch": 0.2801003344481605, + "grad_norm": 0.4926811158657074, + "learning_rate": 8e-05, + "loss": 1.6828, + "step": 5025 + }, + { + "epoch": 0.2801560758082497, + "grad_norm": 0.5116769671440125, + "learning_rate": 8e-05, + "loss": 1.576, + "step": 5026 + }, + { + "epoch": 0.28021181716833893, 
+ "grad_norm": 0.42399269342422485, + "learning_rate": 8e-05, + "loss": 1.3601, + "step": 5027 + }, + { + "epoch": 0.2802675585284281, + "grad_norm": 0.5206683278083801, + "learning_rate": 8e-05, + "loss": 1.7806, + "step": 5028 + }, + { + "epoch": 0.2803232998885173, + "grad_norm": 0.4614886939525604, + "learning_rate": 8e-05, + "loss": 1.6074, + "step": 5029 + }, + { + "epoch": 0.28037904124860646, + "grad_norm": 0.5836182236671448, + "learning_rate": 8e-05, + "loss": 1.9167, + "step": 5030 + }, + { + "epoch": 0.28043478260869564, + "grad_norm": 0.5373634099960327, + "learning_rate": 8e-05, + "loss": 1.8484, + "step": 5031 + }, + { + "epoch": 0.2804905239687848, + "grad_norm": 0.47761693596839905, + "learning_rate": 8e-05, + "loss": 1.7066, + "step": 5032 + }, + { + "epoch": 0.280546265328874, + "grad_norm": 0.47047945857048035, + "learning_rate": 8e-05, + "loss": 1.5976, + "step": 5033 + }, + { + "epoch": 0.28060200668896323, + "grad_norm": 0.47744011878967285, + "learning_rate": 8e-05, + "loss": 1.6201, + "step": 5034 + }, + { + "epoch": 0.2806577480490524, + "grad_norm": 0.5402758121490479, + "learning_rate": 8e-05, + "loss": 1.836, + "step": 5035 + }, + { + "epoch": 0.2807134894091416, + "grad_norm": 0.5157755017280579, + "learning_rate": 8e-05, + "loss": 1.7456, + "step": 5036 + }, + { + "epoch": 0.28076923076923077, + "grad_norm": 0.5129708051681519, + "learning_rate": 8e-05, + "loss": 1.5757, + "step": 5037 + }, + { + "epoch": 0.28082497212931995, + "grad_norm": 0.5077796578407288, + "learning_rate": 8e-05, + "loss": 1.7965, + "step": 5038 + }, + { + "epoch": 0.28088071348940913, + "grad_norm": 0.4889647364616394, + "learning_rate": 8e-05, + "loss": 1.6926, + "step": 5039 + }, + { + "epoch": 0.2809364548494983, + "grad_norm": 0.48703962564468384, + "learning_rate": 8e-05, + "loss": 1.6108, + "step": 5040 + }, + { + "epoch": 0.28099219620958754, + "grad_norm": 0.5404532551765442, + "learning_rate": 8e-05, + "loss": 1.8826, + "step": 5041 + }, + { + "epoch": 
0.2810479375696767, + "grad_norm": 0.5641488432884216, + "learning_rate": 8e-05, + "loss": 1.7598, + "step": 5042 + }, + { + "epoch": 0.2811036789297659, + "grad_norm": 0.4878641366958618, + "learning_rate": 8e-05, + "loss": 1.724, + "step": 5043 + }, + { + "epoch": 0.2811594202898551, + "grad_norm": 0.5139477849006653, + "learning_rate": 8e-05, + "loss": 1.6586, + "step": 5044 + }, + { + "epoch": 0.28121516164994426, + "grad_norm": 0.5409367084503174, + "learning_rate": 8e-05, + "loss": 1.7619, + "step": 5045 + }, + { + "epoch": 0.28127090301003344, + "grad_norm": 0.5175203084945679, + "learning_rate": 8e-05, + "loss": 1.7523, + "step": 5046 + }, + { + "epoch": 0.2813266443701226, + "grad_norm": 0.5151575207710266, + "learning_rate": 8e-05, + "loss": 1.6643, + "step": 5047 + }, + { + "epoch": 0.2813823857302118, + "grad_norm": 0.5394100546836853, + "learning_rate": 8e-05, + "loss": 1.5896, + "step": 5048 + }, + { + "epoch": 0.281438127090301, + "grad_norm": 0.5807749032974243, + "learning_rate": 8e-05, + "loss": 2.002, + "step": 5049 + }, + { + "epoch": 0.2814938684503902, + "grad_norm": 0.4935115575790405, + "learning_rate": 8e-05, + "loss": 1.5606, + "step": 5050 + }, + { + "epoch": 0.2815496098104794, + "grad_norm": 0.4910953640937805, + "learning_rate": 8e-05, + "loss": 1.6825, + "step": 5051 + }, + { + "epoch": 0.28160535117056856, + "grad_norm": 0.5092360377311707, + "learning_rate": 8e-05, + "loss": 1.7917, + "step": 5052 + }, + { + "epoch": 0.28166109253065774, + "grad_norm": 0.5052596926689148, + "learning_rate": 8e-05, + "loss": 1.6808, + "step": 5053 + }, + { + "epoch": 0.2817168338907469, + "grad_norm": 0.5162422060966492, + "learning_rate": 8e-05, + "loss": 1.8994, + "step": 5054 + }, + { + "epoch": 0.2817725752508361, + "grad_norm": 0.48943495750427246, + "learning_rate": 8e-05, + "loss": 1.6462, + "step": 5055 + }, + { + "epoch": 0.28182831661092533, + "grad_norm": 0.5406565070152283, + "learning_rate": 8e-05, + "loss": 1.8143, + "step": 5056 + }, + 
{ + "epoch": 0.2818840579710145, + "grad_norm": 0.502768874168396, + "learning_rate": 8e-05, + "loss": 1.7451, + "step": 5057 + }, + { + "epoch": 0.2819397993311037, + "grad_norm": 0.5879219770431519, + "learning_rate": 8e-05, + "loss": 2.0077, + "step": 5058 + }, + { + "epoch": 0.28199554069119287, + "grad_norm": 0.5409952998161316, + "learning_rate": 8e-05, + "loss": 1.7412, + "step": 5059 + }, + { + "epoch": 0.28205128205128205, + "grad_norm": 0.566804051399231, + "learning_rate": 8e-05, + "loss": 1.8172, + "step": 5060 + }, + { + "epoch": 0.2821070234113712, + "grad_norm": 0.5219933390617371, + "learning_rate": 8e-05, + "loss": 1.7815, + "step": 5061 + }, + { + "epoch": 0.2821627647714604, + "grad_norm": 0.5377077460289001, + "learning_rate": 8e-05, + "loss": 1.5289, + "step": 5062 + }, + { + "epoch": 0.2822185061315496, + "grad_norm": 0.49284616112709045, + "learning_rate": 8e-05, + "loss": 1.8427, + "step": 5063 + }, + { + "epoch": 0.2822742474916388, + "grad_norm": 0.4860459864139557, + "learning_rate": 8e-05, + "loss": 1.6642, + "step": 5064 + }, + { + "epoch": 0.282329988851728, + "grad_norm": 0.49022603034973145, + "learning_rate": 8e-05, + "loss": 1.5648, + "step": 5065 + }, + { + "epoch": 0.2823857302118172, + "grad_norm": 0.5152029395103455, + "learning_rate": 8e-05, + "loss": 1.778, + "step": 5066 + }, + { + "epoch": 0.28244147157190636, + "grad_norm": 0.47602224349975586, + "learning_rate": 8e-05, + "loss": 1.7115, + "step": 5067 + }, + { + "epoch": 0.28249721293199553, + "grad_norm": 0.4774375259876251, + "learning_rate": 8e-05, + "loss": 1.6216, + "step": 5068 + }, + { + "epoch": 0.2825529542920847, + "grad_norm": 0.585257887840271, + "learning_rate": 8e-05, + "loss": 1.8704, + "step": 5069 + }, + { + "epoch": 0.2826086956521739, + "grad_norm": 0.5467697978019714, + "learning_rate": 8e-05, + "loss": 1.69, + "step": 5070 + }, + { + "epoch": 0.2826644370122631, + "grad_norm": 0.5306287407875061, + "learning_rate": 8e-05, + "loss": 1.7611, + "step": 
5071 + }, + { + "epoch": 0.2827201783723523, + "grad_norm": 0.54197758436203, + "learning_rate": 8e-05, + "loss": 1.8773, + "step": 5072 + }, + { + "epoch": 0.2827759197324415, + "grad_norm": 0.5408424139022827, + "learning_rate": 8e-05, + "loss": 1.86, + "step": 5073 + }, + { + "epoch": 0.28283166109253066, + "grad_norm": 0.5417904257774353, + "learning_rate": 8e-05, + "loss": 1.8728, + "step": 5074 + }, + { + "epoch": 0.28288740245261984, + "grad_norm": 0.47730568051338196, + "learning_rate": 8e-05, + "loss": 1.4756, + "step": 5075 + }, + { + "epoch": 0.282943143812709, + "grad_norm": 0.5218319892883301, + "learning_rate": 8e-05, + "loss": 1.7237, + "step": 5076 + }, + { + "epoch": 0.2829988851727982, + "grad_norm": 0.5022181272506714, + "learning_rate": 8e-05, + "loss": 1.7086, + "step": 5077 + }, + { + "epoch": 0.2830546265328874, + "grad_norm": 0.49561530351638794, + "learning_rate": 8e-05, + "loss": 1.7091, + "step": 5078 + }, + { + "epoch": 0.2831103678929766, + "grad_norm": 0.494386225938797, + "learning_rate": 8e-05, + "loss": 1.3795, + "step": 5079 + }, + { + "epoch": 0.2831661092530658, + "grad_norm": 0.49516165256500244, + "learning_rate": 8e-05, + "loss": 1.5951, + "step": 5080 + }, + { + "epoch": 0.28322185061315497, + "grad_norm": 0.501606822013855, + "learning_rate": 8e-05, + "loss": 1.8065, + "step": 5081 + }, + { + "epoch": 0.28327759197324415, + "grad_norm": 0.527359127998352, + "learning_rate": 8e-05, + "loss": 1.8715, + "step": 5082 + }, + { + "epoch": 0.2833333333333333, + "grad_norm": 0.5170018076896667, + "learning_rate": 8e-05, + "loss": 1.7879, + "step": 5083 + }, + { + "epoch": 0.2833890746934225, + "grad_norm": 0.49888837337493896, + "learning_rate": 8e-05, + "loss": 1.4441, + "step": 5084 + }, + { + "epoch": 0.2834448160535117, + "grad_norm": 0.521267294883728, + "learning_rate": 8e-05, + "loss": 1.7673, + "step": 5085 + }, + { + "epoch": 0.2835005574136009, + "grad_norm": 0.49815288186073303, + "learning_rate": 8e-05, + "loss": 1.6354, 
+ "step": 5086 + }, + { + "epoch": 0.2835562987736901, + "grad_norm": 0.5356400609016418, + "learning_rate": 8e-05, + "loss": 1.7102, + "step": 5087 + }, + { + "epoch": 0.2836120401337793, + "grad_norm": 0.5145084857940674, + "learning_rate": 8e-05, + "loss": 1.7456, + "step": 5088 + }, + { + "epoch": 0.28366778149386845, + "grad_norm": 0.4927878677845001, + "learning_rate": 8e-05, + "loss": 1.7089, + "step": 5089 + }, + { + "epoch": 0.28372352285395763, + "grad_norm": 0.503649115562439, + "learning_rate": 8e-05, + "loss": 1.73, + "step": 5090 + }, + { + "epoch": 0.2837792642140468, + "grad_norm": 0.5390079021453857, + "learning_rate": 8e-05, + "loss": 1.8051, + "step": 5091 + }, + { + "epoch": 0.283835005574136, + "grad_norm": 0.48613032698631287, + "learning_rate": 8e-05, + "loss": 1.6163, + "step": 5092 + }, + { + "epoch": 0.28389074693422517, + "grad_norm": 0.4882640838623047, + "learning_rate": 8e-05, + "loss": 1.5683, + "step": 5093 + }, + { + "epoch": 0.2839464882943144, + "grad_norm": 0.5540668368339539, + "learning_rate": 8e-05, + "loss": 1.7487, + "step": 5094 + }, + { + "epoch": 0.2840022296544036, + "grad_norm": 0.5007288455963135, + "learning_rate": 8e-05, + "loss": 1.5905, + "step": 5095 + }, + { + "epoch": 0.28405797101449276, + "grad_norm": 0.5038129687309265, + "learning_rate": 8e-05, + "loss": 1.716, + "step": 5096 + }, + { + "epoch": 0.28411371237458194, + "grad_norm": 0.6086083054542542, + "learning_rate": 8e-05, + "loss": 1.9201, + "step": 5097 + }, + { + "epoch": 0.2841694537346711, + "grad_norm": 0.5607197880744934, + "learning_rate": 8e-05, + "loss": 1.7734, + "step": 5098 + }, + { + "epoch": 0.2842251950947603, + "grad_norm": 0.5821718573570251, + "learning_rate": 8e-05, + "loss": 1.7251, + "step": 5099 + }, + { + "epoch": 0.2842809364548495, + "grad_norm": 0.4806770384311676, + "learning_rate": 8e-05, + "loss": 1.5513, + "step": 5100 + }, + { + "epoch": 0.2843366778149387, + "grad_norm": 0.4750522971153259, + "learning_rate": 8e-05, + 
"loss": 1.5404, + "step": 5101 + }, + { + "epoch": 0.2843924191750279, + "grad_norm": 0.5024062991142273, + "learning_rate": 8e-05, + "loss": 1.691, + "step": 5102 + }, + { + "epoch": 0.28444816053511707, + "grad_norm": 0.498268187046051, + "learning_rate": 8e-05, + "loss": 1.835, + "step": 5103 + }, + { + "epoch": 0.28450390189520625, + "grad_norm": 0.5400916337966919, + "learning_rate": 8e-05, + "loss": 1.9, + "step": 5104 + }, + { + "epoch": 0.2845596432552954, + "grad_norm": 0.4609983563423157, + "learning_rate": 8e-05, + "loss": 1.6109, + "step": 5105 + }, + { + "epoch": 0.2846153846153846, + "grad_norm": 0.532023549079895, + "learning_rate": 8e-05, + "loss": 1.7207, + "step": 5106 + }, + { + "epoch": 0.2846711259754738, + "grad_norm": 0.49450594186782837, + "learning_rate": 8e-05, + "loss": 1.8646, + "step": 5107 + }, + { + "epoch": 0.28472686733556296, + "grad_norm": 0.4583391845226288, + "learning_rate": 8e-05, + "loss": 1.4679, + "step": 5108 + }, + { + "epoch": 0.2847826086956522, + "grad_norm": 0.5723952651023865, + "learning_rate": 8e-05, + "loss": 1.7621, + "step": 5109 + }, + { + "epoch": 0.2848383500557414, + "grad_norm": 0.48113951086997986, + "learning_rate": 8e-05, + "loss": 1.6413, + "step": 5110 + }, + { + "epoch": 0.28489409141583055, + "grad_norm": 0.509999692440033, + "learning_rate": 8e-05, + "loss": 1.7916, + "step": 5111 + }, + { + "epoch": 0.28494983277591973, + "grad_norm": 0.6294270753860474, + "learning_rate": 8e-05, + "loss": 2.0127, + "step": 5112 + }, + { + "epoch": 0.2850055741360089, + "grad_norm": 0.5116695165634155, + "learning_rate": 8e-05, + "loss": 1.6236, + "step": 5113 + }, + { + "epoch": 0.2850613154960981, + "grad_norm": 0.49915406107902527, + "learning_rate": 8e-05, + "loss": 1.7948, + "step": 5114 + }, + { + "epoch": 0.28511705685618727, + "grad_norm": 0.5811174511909485, + "learning_rate": 8e-05, + "loss": 1.6464, + "step": 5115 + }, + { + "epoch": 0.2851727982162765, + "grad_norm": 0.5079129934310913, + 
"learning_rate": 8e-05, + "loss": 1.784, + "step": 5116 + }, + { + "epoch": 0.2852285395763657, + "grad_norm": 0.5594106912612915, + "learning_rate": 8e-05, + "loss": 2.0082, + "step": 5117 + }, + { + "epoch": 0.28528428093645486, + "grad_norm": 0.5111921429634094, + "learning_rate": 8e-05, + "loss": 1.7059, + "step": 5118 + }, + { + "epoch": 0.28534002229654404, + "grad_norm": 0.5258628129959106, + "learning_rate": 8e-05, + "loss": 1.8113, + "step": 5119 + }, + { + "epoch": 0.2853957636566332, + "grad_norm": 0.49711883068084717, + "learning_rate": 8e-05, + "loss": 1.525, + "step": 5120 + }, + { + "epoch": 0.2854515050167224, + "grad_norm": 0.46852102875709534, + "learning_rate": 8e-05, + "loss": 1.4837, + "step": 5121 + }, + { + "epoch": 0.2855072463768116, + "grad_norm": 0.5145292282104492, + "learning_rate": 8e-05, + "loss": 1.7547, + "step": 5122 + }, + { + "epoch": 0.28556298773690075, + "grad_norm": 0.5454033613204956, + "learning_rate": 8e-05, + "loss": 1.699, + "step": 5123 + }, + { + "epoch": 0.28561872909699, + "grad_norm": 0.5002652406692505, + "learning_rate": 8e-05, + "loss": 1.7178, + "step": 5124 + }, + { + "epoch": 0.28567447045707917, + "grad_norm": 0.4545373320579529, + "learning_rate": 8e-05, + "loss": 1.4958, + "step": 5125 + }, + { + "epoch": 0.28573021181716834, + "grad_norm": 0.5078669786453247, + "learning_rate": 8e-05, + "loss": 1.7654, + "step": 5126 + }, + { + "epoch": 0.2857859531772575, + "grad_norm": 0.49822527170181274, + "learning_rate": 8e-05, + "loss": 1.7321, + "step": 5127 + }, + { + "epoch": 0.2858416945373467, + "grad_norm": 0.4882858097553253, + "learning_rate": 8e-05, + "loss": 1.6351, + "step": 5128 + }, + { + "epoch": 0.2858974358974359, + "grad_norm": 0.5043916702270508, + "learning_rate": 8e-05, + "loss": 1.7462, + "step": 5129 + }, + { + "epoch": 0.28595317725752506, + "grad_norm": 0.4957789480686188, + "learning_rate": 8e-05, + "loss": 1.7014, + "step": 5130 + }, + { + "epoch": 0.2860089186176143, + "grad_norm": 
0.47981375455856323, + "learning_rate": 8e-05, + "loss": 1.5465, + "step": 5131 + }, + { + "epoch": 0.2860646599777035, + "grad_norm": 0.46363550424575806, + "learning_rate": 8e-05, + "loss": 1.6081, + "step": 5132 + }, + { + "epoch": 0.28612040133779265, + "grad_norm": 0.5549232959747314, + "learning_rate": 8e-05, + "loss": 1.7287, + "step": 5133 + }, + { + "epoch": 0.28617614269788183, + "grad_norm": 0.4919549822807312, + "learning_rate": 8e-05, + "loss": 1.7172, + "step": 5134 + }, + { + "epoch": 0.286231884057971, + "grad_norm": 0.5521557927131653, + "learning_rate": 8e-05, + "loss": 2.0098, + "step": 5135 + }, + { + "epoch": 0.2862876254180602, + "grad_norm": 0.49092942476272583, + "learning_rate": 8e-05, + "loss": 1.8048, + "step": 5136 + }, + { + "epoch": 0.28634336677814937, + "grad_norm": 0.4958489239215851, + "learning_rate": 8e-05, + "loss": 1.8119, + "step": 5137 + }, + { + "epoch": 0.2863991081382386, + "grad_norm": 0.4862135052680969, + "learning_rate": 8e-05, + "loss": 1.7403, + "step": 5138 + }, + { + "epoch": 0.2864548494983278, + "grad_norm": 0.50542151927948, + "learning_rate": 8e-05, + "loss": 1.8351, + "step": 5139 + }, + { + "epoch": 0.28651059085841696, + "grad_norm": 0.48798254132270813, + "learning_rate": 8e-05, + "loss": 1.5942, + "step": 5140 + }, + { + "epoch": 0.28656633221850614, + "grad_norm": 0.5196491479873657, + "learning_rate": 8e-05, + "loss": 1.558, + "step": 5141 + }, + { + "epoch": 0.2866220735785953, + "grad_norm": 0.4845980703830719, + "learning_rate": 8e-05, + "loss": 1.4992, + "step": 5142 + }, + { + "epoch": 0.2866778149386845, + "grad_norm": 0.5071967244148254, + "learning_rate": 8e-05, + "loss": 1.69, + "step": 5143 + }, + { + "epoch": 0.2867335562987737, + "grad_norm": 0.4898008704185486, + "learning_rate": 8e-05, + "loss": 1.6055, + "step": 5144 + }, + { + "epoch": 0.28678929765886285, + "grad_norm": 0.5290776491165161, + "learning_rate": 8e-05, + "loss": 1.9193, + "step": 5145 + }, + { + "epoch": 0.2868450390189521, 
+ "grad_norm": 0.5390506982803345, + "learning_rate": 8e-05, + "loss": 1.654, + "step": 5146 + }, + { + "epoch": 0.28690078037904126, + "grad_norm": 0.5038630366325378, + "learning_rate": 8e-05, + "loss": 1.6403, + "step": 5147 + }, + { + "epoch": 0.28695652173913044, + "grad_norm": 0.5432470440864563, + "learning_rate": 8e-05, + "loss": 1.7249, + "step": 5148 + }, + { + "epoch": 0.2870122630992196, + "grad_norm": 0.48758211731910706, + "learning_rate": 8e-05, + "loss": 1.5862, + "step": 5149 + }, + { + "epoch": 0.2870680044593088, + "grad_norm": 0.541468620300293, + "learning_rate": 8e-05, + "loss": 1.971, + "step": 5150 + }, + { + "epoch": 0.287123745819398, + "grad_norm": 0.5260459780693054, + "learning_rate": 8e-05, + "loss": 1.8077, + "step": 5151 + }, + { + "epoch": 0.28717948717948716, + "grad_norm": 0.5259945392608643, + "learning_rate": 8e-05, + "loss": 1.6812, + "step": 5152 + }, + { + "epoch": 0.2872352285395764, + "grad_norm": 0.5329565405845642, + "learning_rate": 8e-05, + "loss": 1.8033, + "step": 5153 + }, + { + "epoch": 0.28729096989966557, + "grad_norm": 0.4936564266681671, + "learning_rate": 8e-05, + "loss": 1.6521, + "step": 5154 + }, + { + "epoch": 0.28734671125975475, + "grad_norm": 0.4688653349876404, + "learning_rate": 8e-05, + "loss": 1.5242, + "step": 5155 + }, + { + "epoch": 0.28740245261984393, + "grad_norm": 0.46789103746414185, + "learning_rate": 8e-05, + "loss": 1.5959, + "step": 5156 + }, + { + "epoch": 0.2874581939799331, + "grad_norm": 0.4493112862110138, + "learning_rate": 8e-05, + "loss": 1.3907, + "step": 5157 + }, + { + "epoch": 0.2875139353400223, + "grad_norm": 0.5266214609146118, + "learning_rate": 8e-05, + "loss": 1.7261, + "step": 5158 + }, + { + "epoch": 0.28756967670011147, + "grad_norm": 0.5526112914085388, + "learning_rate": 8e-05, + "loss": 1.9001, + "step": 5159 + }, + { + "epoch": 0.28762541806020064, + "grad_norm": 0.4800586402416229, + "learning_rate": 8e-05, + "loss": 1.5474, + "step": 5160 + }, + { + "epoch": 
0.2876811594202899, + "grad_norm": 0.5277012586593628, + "learning_rate": 8e-05, + "loss": 1.8067, + "step": 5161 + }, + { + "epoch": 0.28773690078037906, + "grad_norm": 0.5427259206771851, + "learning_rate": 8e-05, + "loss": 1.9604, + "step": 5162 + }, + { + "epoch": 0.28779264214046824, + "grad_norm": 0.5284680128097534, + "learning_rate": 8e-05, + "loss": 1.7382, + "step": 5163 + }, + { + "epoch": 0.2878483835005574, + "grad_norm": 0.5306553244590759, + "learning_rate": 8e-05, + "loss": 1.6126, + "step": 5164 + }, + { + "epoch": 0.2879041248606466, + "grad_norm": 0.5320262312889099, + "learning_rate": 8e-05, + "loss": 1.805, + "step": 5165 + }, + { + "epoch": 0.28795986622073577, + "grad_norm": 0.4699755609035492, + "learning_rate": 8e-05, + "loss": 1.4751, + "step": 5166 + }, + { + "epoch": 0.28801560758082495, + "grad_norm": 0.5151172876358032, + "learning_rate": 8e-05, + "loss": 1.7173, + "step": 5167 + }, + { + "epoch": 0.2880713489409142, + "grad_norm": 0.480150431394577, + "learning_rate": 8e-05, + "loss": 1.6986, + "step": 5168 + }, + { + "epoch": 0.28812709030100336, + "grad_norm": 0.5028404593467712, + "learning_rate": 8e-05, + "loss": 1.6821, + "step": 5169 + }, + { + "epoch": 0.28818283166109254, + "grad_norm": 0.49127790331840515, + "learning_rate": 8e-05, + "loss": 1.7527, + "step": 5170 + }, + { + "epoch": 0.2882385730211817, + "grad_norm": 0.5041865110397339, + "learning_rate": 8e-05, + "loss": 1.6777, + "step": 5171 + }, + { + "epoch": 0.2882943143812709, + "grad_norm": 0.47198405861854553, + "learning_rate": 8e-05, + "loss": 1.3965, + "step": 5172 + }, + { + "epoch": 0.2883500557413601, + "grad_norm": 0.48658695816993713, + "learning_rate": 8e-05, + "loss": 1.587, + "step": 5173 + }, + { + "epoch": 0.28840579710144926, + "grad_norm": 0.5005254149436951, + "learning_rate": 8e-05, + "loss": 1.5085, + "step": 5174 + }, + { + "epoch": 0.28846153846153844, + "grad_norm": 0.5224685072898865, + "learning_rate": 8e-05, + "loss": 1.6863, + "step": 5175 + 
}, + { + "epoch": 0.28851727982162767, + "grad_norm": 0.526444673538208, + "learning_rate": 8e-05, + "loss": 1.715, + "step": 5176 + }, + { + "epoch": 0.28857302118171685, + "grad_norm": 0.5164334177970886, + "learning_rate": 8e-05, + "loss": 1.6896, + "step": 5177 + }, + { + "epoch": 0.288628762541806, + "grad_norm": 0.5021805763244629, + "learning_rate": 8e-05, + "loss": 1.744, + "step": 5178 + }, + { + "epoch": 0.2886845039018952, + "grad_norm": 0.5422660708427429, + "learning_rate": 8e-05, + "loss": 1.7615, + "step": 5179 + }, + { + "epoch": 0.2887402452619844, + "grad_norm": 0.5039775967597961, + "learning_rate": 8e-05, + "loss": 1.8386, + "step": 5180 + }, + { + "epoch": 0.28879598662207356, + "grad_norm": 0.482729971408844, + "learning_rate": 8e-05, + "loss": 1.5795, + "step": 5181 + }, + { + "epoch": 0.28885172798216274, + "grad_norm": 0.5006514191627502, + "learning_rate": 8e-05, + "loss": 1.6781, + "step": 5182 + }, + { + "epoch": 0.288907469342252, + "grad_norm": 0.49958372116088867, + "learning_rate": 8e-05, + "loss": 1.714, + "step": 5183 + }, + { + "epoch": 0.28896321070234116, + "grad_norm": 0.4862821400165558, + "learning_rate": 8e-05, + "loss": 1.7902, + "step": 5184 + }, + { + "epoch": 0.28901895206243033, + "grad_norm": 0.5306202173233032, + "learning_rate": 8e-05, + "loss": 1.6501, + "step": 5185 + }, + { + "epoch": 0.2890746934225195, + "grad_norm": 0.49687203764915466, + "learning_rate": 8e-05, + "loss": 1.7856, + "step": 5186 + }, + { + "epoch": 0.2891304347826087, + "grad_norm": 0.4888293147087097, + "learning_rate": 8e-05, + "loss": 1.6805, + "step": 5187 + }, + { + "epoch": 0.28918617614269787, + "grad_norm": 0.5002862215042114, + "learning_rate": 8e-05, + "loss": 1.6364, + "step": 5188 + }, + { + "epoch": 0.28924191750278705, + "grad_norm": 0.5225688815116882, + "learning_rate": 8e-05, + "loss": 1.6908, + "step": 5189 + }, + { + "epoch": 0.28929765886287623, + "grad_norm": 0.5296038389205933, + "learning_rate": 8e-05, + "loss": 1.7575, + 
"step": 5190 + }, + { + "epoch": 0.28935340022296546, + "grad_norm": 0.5127953290939331, + "learning_rate": 8e-05, + "loss": 1.6997, + "step": 5191 + }, + { + "epoch": 0.28940914158305464, + "grad_norm": 0.525801420211792, + "learning_rate": 8e-05, + "loss": 1.8364, + "step": 5192 + }, + { + "epoch": 0.2894648829431438, + "grad_norm": 0.5201865434646606, + "learning_rate": 8e-05, + "loss": 1.6582, + "step": 5193 + }, + { + "epoch": 0.289520624303233, + "grad_norm": 0.5215128064155579, + "learning_rate": 8e-05, + "loss": 1.802, + "step": 5194 + }, + { + "epoch": 0.2895763656633222, + "grad_norm": 0.5403210520744324, + "learning_rate": 8e-05, + "loss": 1.7328, + "step": 5195 + }, + { + "epoch": 0.28963210702341136, + "grad_norm": 0.5021224021911621, + "learning_rate": 8e-05, + "loss": 1.6957, + "step": 5196 + }, + { + "epoch": 0.28968784838350053, + "grad_norm": 0.4970734715461731, + "learning_rate": 8e-05, + "loss": 1.8673, + "step": 5197 + }, + { + "epoch": 0.28974358974358977, + "grad_norm": 0.498276025056839, + "learning_rate": 8e-05, + "loss": 1.7437, + "step": 5198 + }, + { + "epoch": 0.28979933110367895, + "grad_norm": 0.5261366367340088, + "learning_rate": 8e-05, + "loss": 1.6561, + "step": 5199 + }, + { + "epoch": 0.2898550724637681, + "grad_norm": 0.5094021558761597, + "learning_rate": 8e-05, + "loss": 1.7651, + "step": 5200 + }, + { + "epoch": 0.2899108138238573, + "grad_norm": 0.5043343305587769, + "learning_rate": 8e-05, + "loss": 1.6313, + "step": 5201 + }, + { + "epoch": 0.2899665551839465, + "grad_norm": 0.5139153003692627, + "learning_rate": 8e-05, + "loss": 1.74, + "step": 5202 + }, + { + "epoch": 0.29002229654403566, + "grad_norm": 0.5476909279823303, + "learning_rate": 8e-05, + "loss": 2.0281, + "step": 5203 + }, + { + "epoch": 0.29007803790412484, + "grad_norm": 0.5021553635597229, + "learning_rate": 8e-05, + "loss": 1.6699, + "step": 5204 + }, + { + "epoch": 0.290133779264214, + "grad_norm": 0.5864331722259521, + "learning_rate": 8e-05, + 
"loss": 1.8308, + "step": 5205 + }, + { + "epoch": 0.29018952062430325, + "grad_norm": 0.5637518167495728, + "learning_rate": 8e-05, + "loss": 1.9774, + "step": 5206 + }, + { + "epoch": 0.29024526198439243, + "grad_norm": 0.4940563142299652, + "learning_rate": 8e-05, + "loss": 1.7939, + "step": 5207 + }, + { + "epoch": 0.2903010033444816, + "grad_norm": 0.4949653744697571, + "learning_rate": 8e-05, + "loss": 1.8159, + "step": 5208 + }, + { + "epoch": 0.2903567447045708, + "grad_norm": 0.5256124138832092, + "learning_rate": 8e-05, + "loss": 1.4621, + "step": 5209 + }, + { + "epoch": 0.29041248606465997, + "grad_norm": 0.4592047333717346, + "learning_rate": 8e-05, + "loss": 1.5091, + "step": 5210 + }, + { + "epoch": 0.29046822742474915, + "grad_norm": 0.5461311936378479, + "learning_rate": 8e-05, + "loss": 1.8522, + "step": 5211 + }, + { + "epoch": 0.2905239687848383, + "grad_norm": 0.48445531725883484, + "learning_rate": 8e-05, + "loss": 1.6062, + "step": 5212 + }, + { + "epoch": 0.29057971014492756, + "grad_norm": 0.5428815484046936, + "learning_rate": 8e-05, + "loss": 1.991, + "step": 5213 + }, + { + "epoch": 0.29063545150501674, + "grad_norm": 0.52653568983078, + "learning_rate": 8e-05, + "loss": 1.7512, + "step": 5214 + }, + { + "epoch": 0.2906911928651059, + "grad_norm": 0.5246649384498596, + "learning_rate": 8e-05, + "loss": 1.5794, + "step": 5215 + }, + { + "epoch": 0.2907469342251951, + "grad_norm": 0.5046067833900452, + "learning_rate": 8e-05, + "loss": 1.5843, + "step": 5216 + }, + { + "epoch": 0.2908026755852843, + "grad_norm": 0.4852803647518158, + "learning_rate": 8e-05, + "loss": 1.4408, + "step": 5217 + }, + { + "epoch": 0.29085841694537345, + "grad_norm": 0.506061851978302, + "learning_rate": 8e-05, + "loss": 1.6776, + "step": 5218 + }, + { + "epoch": 0.29091415830546263, + "grad_norm": 0.5446941256523132, + "learning_rate": 8e-05, + "loss": 1.7467, + "step": 5219 + }, + { + "epoch": 0.2909698996655518, + "grad_norm": 0.5437446236610413, + 
"learning_rate": 8e-05, + "loss": 1.5669, + "step": 5220 + }, + { + "epoch": 0.29102564102564105, + "grad_norm": 0.5718626976013184, + "learning_rate": 8e-05, + "loss": 1.7853, + "step": 5221 + }, + { + "epoch": 0.2910813823857302, + "grad_norm": 0.5383369326591492, + "learning_rate": 8e-05, + "loss": 1.8888, + "step": 5222 + }, + { + "epoch": 0.2911371237458194, + "grad_norm": 0.5096738338470459, + "learning_rate": 8e-05, + "loss": 1.6494, + "step": 5223 + }, + { + "epoch": 0.2911928651059086, + "grad_norm": 0.4973105788230896, + "learning_rate": 8e-05, + "loss": 1.7911, + "step": 5224 + }, + { + "epoch": 0.29124860646599776, + "grad_norm": 0.5213186144828796, + "learning_rate": 8e-05, + "loss": 1.6393, + "step": 5225 + }, + { + "epoch": 0.29130434782608694, + "grad_norm": 0.5377399325370789, + "learning_rate": 8e-05, + "loss": 1.8521, + "step": 5226 + }, + { + "epoch": 0.2913600891861761, + "grad_norm": 0.47744929790496826, + "learning_rate": 8e-05, + "loss": 1.5941, + "step": 5227 + }, + { + "epoch": 0.29141583054626535, + "grad_norm": 0.48108357191085815, + "learning_rate": 8e-05, + "loss": 1.544, + "step": 5228 + }, + { + "epoch": 0.29147157190635453, + "grad_norm": 0.5099393725395203, + "learning_rate": 8e-05, + "loss": 1.6005, + "step": 5229 + }, + { + "epoch": 0.2915273132664437, + "grad_norm": 0.5258042216300964, + "learning_rate": 8e-05, + "loss": 1.8084, + "step": 5230 + }, + { + "epoch": 0.2915830546265329, + "grad_norm": 0.5149073004722595, + "learning_rate": 8e-05, + "loss": 1.6328, + "step": 5231 + }, + { + "epoch": 0.29163879598662207, + "grad_norm": 0.5075093507766724, + "learning_rate": 8e-05, + "loss": 1.5967, + "step": 5232 + }, + { + "epoch": 0.29169453734671125, + "grad_norm": 0.5436050891876221, + "learning_rate": 8e-05, + "loss": 1.6569, + "step": 5233 + }, + { + "epoch": 0.2917502787068004, + "grad_norm": 0.6043053269386292, + "learning_rate": 8e-05, + "loss": 2.0332, + "step": 5234 + }, + { + "epoch": 0.29180602006688966, + "grad_norm": 
0.5168697834014893, + "learning_rate": 8e-05, + "loss": 1.7984, + "step": 5235 + }, + { + "epoch": 0.29186176142697884, + "grad_norm": 0.5074122548103333, + "learning_rate": 8e-05, + "loss": 1.5661, + "step": 5236 + }, + { + "epoch": 0.291917502787068, + "grad_norm": 0.4924065172672272, + "learning_rate": 8e-05, + "loss": 1.7552, + "step": 5237 + }, + { + "epoch": 0.2919732441471572, + "grad_norm": 0.5427802205085754, + "learning_rate": 8e-05, + "loss": 1.7798, + "step": 5238 + }, + { + "epoch": 0.2920289855072464, + "grad_norm": 0.4995916187763214, + "learning_rate": 8e-05, + "loss": 1.67, + "step": 5239 + }, + { + "epoch": 0.29208472686733555, + "grad_norm": 0.49779626727104187, + "learning_rate": 8e-05, + "loss": 1.4751, + "step": 5240 + }, + { + "epoch": 0.29214046822742473, + "grad_norm": 0.46107199788093567, + "learning_rate": 8e-05, + "loss": 1.5479, + "step": 5241 + }, + { + "epoch": 0.2921962095875139, + "grad_norm": 0.534938633441925, + "learning_rate": 8e-05, + "loss": 1.969, + "step": 5242 + }, + { + "epoch": 0.29225195094760315, + "grad_norm": 0.5011855363845825, + "learning_rate": 8e-05, + "loss": 1.6897, + "step": 5243 + }, + { + "epoch": 0.2923076923076923, + "grad_norm": 0.4805475175380707, + "learning_rate": 8e-05, + "loss": 1.431, + "step": 5244 + }, + { + "epoch": 0.2923634336677815, + "grad_norm": 0.5417031645774841, + "learning_rate": 8e-05, + "loss": 2.0226, + "step": 5245 + }, + { + "epoch": 0.2924191750278707, + "grad_norm": 0.48827773332595825, + "learning_rate": 8e-05, + "loss": 1.6413, + "step": 5246 + }, + { + "epoch": 0.29247491638795986, + "grad_norm": 0.5585089921951294, + "learning_rate": 8e-05, + "loss": 1.7636, + "step": 5247 + }, + { + "epoch": 0.29253065774804904, + "grad_norm": 0.5031596422195435, + "learning_rate": 8e-05, + "loss": 1.9364, + "step": 5248 + }, + { + "epoch": 0.2925863991081382, + "grad_norm": 0.50601726770401, + "learning_rate": 8e-05, + "loss": 1.7698, + "step": 5249 + }, + { + "epoch": 0.29264214046822745, + 
"grad_norm": 0.5573868155479431, + "learning_rate": 8e-05, + "loss": 1.9684, + "step": 5250 + }, + { + "epoch": 0.29269788182831663, + "grad_norm": 0.599646270275116, + "learning_rate": 8e-05, + "loss": 1.7193, + "step": 5251 + }, + { + "epoch": 0.2927536231884058, + "grad_norm": 0.48568132519721985, + "learning_rate": 8e-05, + "loss": 1.5653, + "step": 5252 + }, + { + "epoch": 0.292809364548495, + "grad_norm": 0.47863563895225525, + "learning_rate": 8e-05, + "loss": 1.6626, + "step": 5253 + }, + { + "epoch": 0.29286510590858417, + "grad_norm": 0.4812254011631012, + "learning_rate": 8e-05, + "loss": 1.5981, + "step": 5254 + }, + { + "epoch": 0.29292084726867335, + "grad_norm": 0.5033372640609741, + "learning_rate": 8e-05, + "loss": 1.6784, + "step": 5255 + }, + { + "epoch": 0.2929765886287625, + "grad_norm": 0.5056328177452087, + "learning_rate": 8e-05, + "loss": 1.5998, + "step": 5256 + }, + { + "epoch": 0.2930323299888517, + "grad_norm": 0.4394751191139221, + "learning_rate": 8e-05, + "loss": 1.4819, + "step": 5257 + }, + { + "epoch": 0.29308807134894094, + "grad_norm": 0.5667262673377991, + "learning_rate": 8e-05, + "loss": 1.7092, + "step": 5258 + }, + { + "epoch": 0.2931438127090301, + "grad_norm": 0.5049179792404175, + "learning_rate": 8e-05, + "loss": 1.7625, + "step": 5259 + }, + { + "epoch": 0.2931995540691193, + "grad_norm": 0.5217584371566772, + "learning_rate": 8e-05, + "loss": 1.8257, + "step": 5260 + }, + { + "epoch": 0.2932552954292085, + "grad_norm": 0.5195755362510681, + "learning_rate": 8e-05, + "loss": 1.9529, + "step": 5261 + }, + { + "epoch": 0.29331103678929765, + "grad_norm": 0.49125170707702637, + "learning_rate": 8e-05, + "loss": 1.5536, + "step": 5262 + }, + { + "epoch": 0.29336677814938683, + "grad_norm": 0.49988195300102234, + "learning_rate": 8e-05, + "loss": 1.5557, + "step": 5263 + }, + { + "epoch": 0.293422519509476, + "grad_norm": 0.5084020495414734, + "learning_rate": 8e-05, + "loss": 1.4499, + "step": 5264 + }, + { + "epoch": 
0.29347826086956524, + "grad_norm": 0.5017939805984497, + "learning_rate": 8e-05, + "loss": 1.7718, + "step": 5265 + }, + { + "epoch": 0.2935340022296544, + "grad_norm": 0.5145910978317261, + "learning_rate": 8e-05, + "loss": 1.838, + "step": 5266 + }, + { + "epoch": 0.2935897435897436, + "grad_norm": 0.516250729560852, + "learning_rate": 8e-05, + "loss": 1.7568, + "step": 5267 + }, + { + "epoch": 0.2936454849498328, + "grad_norm": 0.4948877990245819, + "learning_rate": 8e-05, + "loss": 1.7616, + "step": 5268 + }, + { + "epoch": 0.29370122630992196, + "grad_norm": 0.5350788831710815, + "learning_rate": 8e-05, + "loss": 1.9806, + "step": 5269 + }, + { + "epoch": 0.29375696767001114, + "grad_norm": 0.5112669467926025, + "learning_rate": 8e-05, + "loss": 1.7857, + "step": 5270 + }, + { + "epoch": 0.2938127090301003, + "grad_norm": 0.4432719945907593, + "learning_rate": 8e-05, + "loss": 1.4863, + "step": 5271 + }, + { + "epoch": 0.2938684503901895, + "grad_norm": 0.5258412957191467, + "learning_rate": 8e-05, + "loss": 1.7405, + "step": 5272 + }, + { + "epoch": 0.29392419175027873, + "grad_norm": 0.5164126753807068, + "learning_rate": 8e-05, + "loss": 1.7195, + "step": 5273 + }, + { + "epoch": 0.2939799331103679, + "grad_norm": 0.5118614435195923, + "learning_rate": 8e-05, + "loss": 1.744, + "step": 5274 + }, + { + "epoch": 0.2940356744704571, + "grad_norm": 0.5361714363098145, + "learning_rate": 8e-05, + "loss": 1.9204, + "step": 5275 + }, + { + "epoch": 0.29409141583054627, + "grad_norm": 0.5519838929176331, + "learning_rate": 8e-05, + "loss": 1.9456, + "step": 5276 + }, + { + "epoch": 0.29414715719063544, + "grad_norm": 0.5510969758033752, + "learning_rate": 8e-05, + "loss": 1.8676, + "step": 5277 + }, + { + "epoch": 0.2942028985507246, + "grad_norm": 0.502593994140625, + "learning_rate": 8e-05, + "loss": 1.6703, + "step": 5278 + }, + { + "epoch": 0.2942586399108138, + "grad_norm": 0.5204991698265076, + "learning_rate": 8e-05, + "loss": 1.6156, + "step": 5279 + }, + 
{ + "epoch": 0.29431438127090304, + "grad_norm": 0.5098587870597839, + "learning_rate": 8e-05, + "loss": 1.7501, + "step": 5280 + }, + { + "epoch": 0.2943701226309922, + "grad_norm": 0.47406288981437683, + "learning_rate": 8e-05, + "loss": 1.5785, + "step": 5281 + }, + { + "epoch": 0.2944258639910814, + "grad_norm": 0.48030877113342285, + "learning_rate": 8e-05, + "loss": 1.4928, + "step": 5282 + }, + { + "epoch": 0.29448160535117057, + "grad_norm": 0.4848138689994812, + "learning_rate": 8e-05, + "loss": 1.718, + "step": 5283 + }, + { + "epoch": 0.29453734671125975, + "grad_norm": 0.503160297870636, + "learning_rate": 8e-05, + "loss": 1.7247, + "step": 5284 + }, + { + "epoch": 0.29459308807134893, + "grad_norm": 0.536435604095459, + "learning_rate": 8e-05, + "loss": 1.7329, + "step": 5285 + }, + { + "epoch": 0.2946488294314381, + "grad_norm": 0.5351728200912476, + "learning_rate": 8e-05, + "loss": 1.7665, + "step": 5286 + }, + { + "epoch": 0.2947045707915273, + "grad_norm": 0.47767600417137146, + "learning_rate": 8e-05, + "loss": 1.5381, + "step": 5287 + }, + { + "epoch": 0.2947603121516165, + "grad_norm": 0.5618817806243896, + "learning_rate": 8e-05, + "loss": 1.7063, + "step": 5288 + }, + { + "epoch": 0.2948160535117057, + "grad_norm": 0.5656747221946716, + "learning_rate": 8e-05, + "loss": 1.9849, + "step": 5289 + }, + { + "epoch": 0.2948717948717949, + "grad_norm": 0.5192245841026306, + "learning_rate": 8e-05, + "loss": 1.7184, + "step": 5290 + }, + { + "epoch": 0.29492753623188406, + "grad_norm": 0.5602400898933411, + "learning_rate": 8e-05, + "loss": 1.8306, + "step": 5291 + }, + { + "epoch": 0.29498327759197324, + "grad_norm": 0.504332423210144, + "learning_rate": 8e-05, + "loss": 1.7853, + "step": 5292 + }, + { + "epoch": 0.2950390189520624, + "grad_norm": 0.5190404653549194, + "learning_rate": 8e-05, + "loss": 1.9234, + "step": 5293 + }, + { + "epoch": 0.2950947603121516, + "grad_norm": 0.4828835427761078, + "learning_rate": 8e-05, + "loss": 1.7414, + 
"step": 5294 + }, + { + "epoch": 0.29515050167224083, + "grad_norm": 0.4870109260082245, + "learning_rate": 8e-05, + "loss": 1.5827, + "step": 5295 + }, + { + "epoch": 0.29520624303233, + "grad_norm": 0.49624398350715637, + "learning_rate": 8e-05, + "loss": 1.5151, + "step": 5296 + }, + { + "epoch": 0.2952619843924192, + "grad_norm": 0.531680703163147, + "learning_rate": 8e-05, + "loss": 1.8093, + "step": 5297 + }, + { + "epoch": 0.29531772575250836, + "grad_norm": 0.49582523107528687, + "learning_rate": 8e-05, + "loss": 1.649, + "step": 5298 + }, + { + "epoch": 0.29537346711259754, + "grad_norm": 0.48187878727912903, + "learning_rate": 8e-05, + "loss": 1.4792, + "step": 5299 + }, + { + "epoch": 0.2954292084726867, + "grad_norm": 0.5246399641036987, + "learning_rate": 8e-05, + "loss": 1.6916, + "step": 5300 + }, + { + "epoch": 0.2954849498327759, + "grad_norm": 0.5118151307106018, + "learning_rate": 8e-05, + "loss": 1.6442, + "step": 5301 + }, + { + "epoch": 0.2955406911928651, + "grad_norm": 0.5700403451919556, + "learning_rate": 8e-05, + "loss": 1.7347, + "step": 5302 + }, + { + "epoch": 0.2955964325529543, + "grad_norm": 0.4879356026649475, + "learning_rate": 8e-05, + "loss": 1.4308, + "step": 5303 + }, + { + "epoch": 0.2956521739130435, + "grad_norm": 0.4735146164894104, + "learning_rate": 8e-05, + "loss": 1.4481, + "step": 5304 + }, + { + "epoch": 0.29570791527313267, + "grad_norm": 0.5084589123725891, + "learning_rate": 8e-05, + "loss": 1.6398, + "step": 5305 + }, + { + "epoch": 0.29576365663322185, + "grad_norm": 0.5700600147247314, + "learning_rate": 8e-05, + "loss": 1.8626, + "step": 5306 + }, + { + "epoch": 0.29581939799331103, + "grad_norm": 0.5779445171356201, + "learning_rate": 8e-05, + "loss": 1.7356, + "step": 5307 + }, + { + "epoch": 0.2958751393534002, + "grad_norm": 0.5010429620742798, + "learning_rate": 8e-05, + "loss": 1.6907, + "step": 5308 + }, + { + "epoch": 0.2959308807134894, + "grad_norm": 0.5222609639167786, + "learning_rate": 8e-05, + 
"loss": 1.8282, + "step": 5309 + }, + { + "epoch": 0.2959866220735786, + "grad_norm": 0.5360494256019592, + "learning_rate": 8e-05, + "loss": 1.6219, + "step": 5310 + }, + { + "epoch": 0.2960423634336678, + "grad_norm": 0.5279106497764587, + "learning_rate": 8e-05, + "loss": 1.8183, + "step": 5311 + }, + { + "epoch": 0.296098104793757, + "grad_norm": 0.5564893484115601, + "learning_rate": 8e-05, + "loss": 1.8302, + "step": 5312 + }, + { + "epoch": 0.29615384615384616, + "grad_norm": 0.5432298183441162, + "learning_rate": 8e-05, + "loss": 1.9451, + "step": 5313 + }, + { + "epoch": 0.29620958751393534, + "grad_norm": 0.5052030086517334, + "learning_rate": 8e-05, + "loss": 1.7607, + "step": 5314 + }, + { + "epoch": 0.2962653288740245, + "grad_norm": 0.49412575364112854, + "learning_rate": 8e-05, + "loss": 1.6384, + "step": 5315 + }, + { + "epoch": 0.2963210702341137, + "grad_norm": 0.5110658407211304, + "learning_rate": 8e-05, + "loss": 1.9352, + "step": 5316 + }, + { + "epoch": 0.29637681159420287, + "grad_norm": 0.5185591578483582, + "learning_rate": 8e-05, + "loss": 1.6942, + "step": 5317 + }, + { + "epoch": 0.2964325529542921, + "grad_norm": 0.5007186532020569, + "learning_rate": 8e-05, + "loss": 1.6965, + "step": 5318 + }, + { + "epoch": 0.2964882943143813, + "grad_norm": 0.5010005831718445, + "learning_rate": 8e-05, + "loss": 1.6339, + "step": 5319 + }, + { + "epoch": 0.29654403567447046, + "grad_norm": 0.5402158498764038, + "learning_rate": 8e-05, + "loss": 1.8016, + "step": 5320 + }, + { + "epoch": 0.29659977703455964, + "grad_norm": 0.4774928689002991, + "learning_rate": 8e-05, + "loss": 1.7116, + "step": 5321 + }, + { + "epoch": 0.2966555183946488, + "grad_norm": 0.7964354157447815, + "learning_rate": 8e-05, + "loss": 1.6258, + "step": 5322 + }, + { + "epoch": 0.296711259754738, + "grad_norm": 0.5286622643470764, + "learning_rate": 8e-05, + "loss": 1.7428, + "step": 5323 + }, + { + "epoch": 0.2967670011148272, + "grad_norm": 0.5173184871673584, + 
"learning_rate": 8e-05, + "loss": 1.7258, + "step": 5324 + }, + { + "epoch": 0.2968227424749164, + "grad_norm": 0.5637500286102295, + "learning_rate": 8e-05, + "loss": 2.0714, + "step": 5325 + }, + { + "epoch": 0.2968784838350056, + "grad_norm": 0.5035580992698669, + "learning_rate": 8e-05, + "loss": 1.7906, + "step": 5326 + }, + { + "epoch": 0.29693422519509477, + "grad_norm": 0.5470683574676514, + "learning_rate": 8e-05, + "loss": 1.736, + "step": 5327 + }, + { + "epoch": 0.29698996655518395, + "grad_norm": 0.5390167832374573, + "learning_rate": 8e-05, + "loss": 1.8918, + "step": 5328 + }, + { + "epoch": 0.2970457079152731, + "grad_norm": 0.501208484172821, + "learning_rate": 8e-05, + "loss": 1.733, + "step": 5329 + }, + { + "epoch": 0.2971014492753623, + "grad_norm": 0.47988513112068176, + "learning_rate": 8e-05, + "loss": 1.5519, + "step": 5330 + }, + { + "epoch": 0.2971571906354515, + "grad_norm": 0.48085033893585205, + "learning_rate": 8e-05, + "loss": 1.6031, + "step": 5331 + }, + { + "epoch": 0.29721293199554066, + "grad_norm": 0.4778374433517456, + "learning_rate": 8e-05, + "loss": 1.5619, + "step": 5332 + }, + { + "epoch": 0.2972686733556299, + "grad_norm": 0.49800926446914673, + "learning_rate": 8e-05, + "loss": 1.6148, + "step": 5333 + }, + { + "epoch": 0.2973244147157191, + "grad_norm": 0.5318037271499634, + "learning_rate": 8e-05, + "loss": 1.8083, + "step": 5334 + }, + { + "epoch": 0.29738015607580826, + "grad_norm": 0.5206844210624695, + "learning_rate": 8e-05, + "loss": 1.7107, + "step": 5335 + }, + { + "epoch": 0.29743589743589743, + "grad_norm": 0.5456350445747375, + "learning_rate": 8e-05, + "loss": 1.7222, + "step": 5336 + }, + { + "epoch": 0.2974916387959866, + "grad_norm": 0.5354796648025513, + "learning_rate": 8e-05, + "loss": 1.511, + "step": 5337 + }, + { + "epoch": 0.2975473801560758, + "grad_norm": 0.5405011773109436, + "learning_rate": 8e-05, + "loss": 1.7553, + "step": 5338 + }, + { + "epoch": 0.29760312151616497, + "grad_norm": 
0.5345763564109802, + "learning_rate": 8e-05, + "loss": 1.7713, + "step": 5339 + }, + { + "epoch": 0.2976588628762542, + "grad_norm": 0.4754033088684082, + "learning_rate": 8e-05, + "loss": 1.5024, + "step": 5340 + }, + { + "epoch": 0.2977146042363434, + "grad_norm": 0.45507925748825073, + "learning_rate": 8e-05, + "loss": 1.4949, + "step": 5341 + }, + { + "epoch": 0.29777034559643256, + "grad_norm": 0.49338576197624207, + "learning_rate": 8e-05, + "loss": 1.6169, + "step": 5342 + }, + { + "epoch": 0.29782608695652174, + "grad_norm": 0.48822399973869324, + "learning_rate": 8e-05, + "loss": 1.6804, + "step": 5343 + }, + { + "epoch": 0.2978818283166109, + "grad_norm": 0.4632672965526581, + "learning_rate": 8e-05, + "loss": 1.1824, + "step": 5344 + }, + { + "epoch": 0.2979375696767001, + "grad_norm": 0.5066871047019958, + "learning_rate": 8e-05, + "loss": 1.6845, + "step": 5345 + }, + { + "epoch": 0.2979933110367893, + "grad_norm": 0.4745085537433624, + "learning_rate": 8e-05, + "loss": 1.7372, + "step": 5346 + }, + { + "epoch": 0.2980490523968785, + "grad_norm": 0.47879093885421753, + "learning_rate": 8e-05, + "loss": 1.5042, + "step": 5347 + }, + { + "epoch": 0.2981047937569677, + "grad_norm": 0.4783836305141449, + "learning_rate": 8e-05, + "loss": 1.7095, + "step": 5348 + }, + { + "epoch": 0.29816053511705687, + "grad_norm": 0.4835946559906006, + "learning_rate": 8e-05, + "loss": 1.6553, + "step": 5349 + }, + { + "epoch": 0.29821627647714605, + "grad_norm": 0.5328168272972107, + "learning_rate": 8e-05, + "loss": 1.7335, + "step": 5350 + }, + { + "epoch": 0.2982720178372352, + "grad_norm": 0.5327500700950623, + "learning_rate": 8e-05, + "loss": 1.7231, + "step": 5351 + }, + { + "epoch": 0.2983277591973244, + "grad_norm": 0.6044172644615173, + "learning_rate": 8e-05, + "loss": 1.8734, + "step": 5352 + }, + { + "epoch": 0.2983835005574136, + "grad_norm": 0.5115371346473694, + "learning_rate": 8e-05, + "loss": 1.6742, + "step": 5353 + }, + { + "epoch": 
0.29843924191750276, + "grad_norm": 0.570870041847229, + "learning_rate": 8e-05, + "loss": 1.798, + "step": 5354 + }, + { + "epoch": 0.298494983277592, + "grad_norm": 0.5445014834403992, + "learning_rate": 8e-05, + "loss": 1.6347, + "step": 5355 + }, + { + "epoch": 0.2985507246376812, + "grad_norm": 0.5148465037345886, + "learning_rate": 8e-05, + "loss": 1.7464, + "step": 5356 + }, + { + "epoch": 0.29860646599777035, + "grad_norm": 0.5145999789237976, + "learning_rate": 8e-05, + "loss": 1.6643, + "step": 5357 + }, + { + "epoch": 0.29866220735785953, + "grad_norm": 0.49865972995758057, + "learning_rate": 8e-05, + "loss": 1.6532, + "step": 5358 + }, + { + "epoch": 0.2987179487179487, + "grad_norm": 0.495136559009552, + "learning_rate": 8e-05, + "loss": 1.7584, + "step": 5359 + }, + { + "epoch": 0.2987736900780379, + "grad_norm": 0.5089300870895386, + "learning_rate": 8e-05, + "loss": 1.7906, + "step": 5360 + }, + { + "epoch": 0.29882943143812707, + "grad_norm": 0.485957533121109, + "learning_rate": 8e-05, + "loss": 1.5954, + "step": 5361 + }, + { + "epoch": 0.2988851727982163, + "grad_norm": 0.550250232219696, + "learning_rate": 8e-05, + "loss": 1.814, + "step": 5362 + }, + { + "epoch": 0.2989409141583055, + "grad_norm": 0.5336512327194214, + "learning_rate": 8e-05, + "loss": 1.8362, + "step": 5363 + }, + { + "epoch": 0.29899665551839466, + "grad_norm": 0.516875147819519, + "learning_rate": 8e-05, + "loss": 1.7615, + "step": 5364 + }, + { + "epoch": 0.29905239687848384, + "grad_norm": 0.5087434649467468, + "learning_rate": 8e-05, + "loss": 1.608, + "step": 5365 + }, + { + "epoch": 0.299108138238573, + "grad_norm": 0.5574606657028198, + "learning_rate": 8e-05, + "loss": 1.5601, + "step": 5366 + }, + { + "epoch": 0.2991638795986622, + "grad_norm": 0.5068855881690979, + "learning_rate": 8e-05, + "loss": 1.7151, + "step": 5367 + }, + { + "epoch": 0.2992196209587514, + "grad_norm": 0.4880894124507904, + "learning_rate": 8e-05, + "loss": 1.46, + "step": 5368 + }, + { + 
"epoch": 0.29927536231884055, + "grad_norm": 0.4971500635147095, + "learning_rate": 8e-05, + "loss": 1.6906, + "step": 5369 + }, + { + "epoch": 0.2993311036789298, + "grad_norm": 0.5621704459190369, + "learning_rate": 8e-05, + "loss": 1.9147, + "step": 5370 + }, + { + "epoch": 0.29938684503901897, + "grad_norm": 0.5409892201423645, + "learning_rate": 8e-05, + "loss": 1.8615, + "step": 5371 + }, + { + "epoch": 0.29944258639910815, + "grad_norm": 0.4971315562725067, + "learning_rate": 8e-05, + "loss": 1.6824, + "step": 5372 + }, + { + "epoch": 0.2994983277591973, + "grad_norm": 0.49553629755973816, + "learning_rate": 8e-05, + "loss": 1.7716, + "step": 5373 + }, + { + "epoch": 0.2995540691192865, + "grad_norm": 0.4982132017612457, + "learning_rate": 8e-05, + "loss": 1.8004, + "step": 5374 + }, + { + "epoch": 0.2996098104793757, + "grad_norm": 0.49931418895721436, + "learning_rate": 8e-05, + "loss": 1.6361, + "step": 5375 + }, + { + "epoch": 0.29966555183946486, + "grad_norm": 0.4859783947467804, + "learning_rate": 8e-05, + "loss": 1.5661, + "step": 5376 + }, + { + "epoch": 0.2997212931995541, + "grad_norm": 0.5062422156333923, + "learning_rate": 8e-05, + "loss": 1.4738, + "step": 5377 + }, + { + "epoch": 0.2997770345596433, + "grad_norm": 0.5458669066429138, + "learning_rate": 8e-05, + "loss": 1.9687, + "step": 5378 + }, + { + "epoch": 0.29983277591973245, + "grad_norm": 0.5416478514671326, + "learning_rate": 8e-05, + "loss": 2.02, + "step": 5379 + }, + { + "epoch": 0.29988851727982163, + "grad_norm": 0.5097154974937439, + "learning_rate": 8e-05, + "loss": 1.5451, + "step": 5380 + }, + { + "epoch": 0.2999442586399108, + "grad_norm": 0.4724648594856262, + "learning_rate": 8e-05, + "loss": 1.6004, + "step": 5381 + }, + { + "epoch": 0.3, + "grad_norm": 0.5238843560218811, + "learning_rate": 8e-05, + "loss": 1.8858, + "step": 5382 + }, + { + "epoch": 0.30005574136008917, + "grad_norm": 0.4457351863384247, + "learning_rate": 8e-05, + "loss": 1.4063, + "step": 5383 + }, + { 
+ "epoch": 0.30011148272017835, + "grad_norm": 0.5066267848014832, + "learning_rate": 8e-05, + "loss": 1.9073, + "step": 5384 + }, + { + "epoch": 0.3001672240802676, + "grad_norm": 0.5180755853652954, + "learning_rate": 8e-05, + "loss": 1.7075, + "step": 5385 + }, + { + "epoch": 0.30022296544035676, + "grad_norm": 0.5159972906112671, + "learning_rate": 8e-05, + "loss": 1.7433, + "step": 5386 + }, + { + "epoch": 0.30027870680044594, + "grad_norm": 0.5109407305717468, + "learning_rate": 8e-05, + "loss": 1.5583, + "step": 5387 + }, + { + "epoch": 0.3003344481605351, + "grad_norm": 0.5248592495918274, + "learning_rate": 8e-05, + "loss": 1.7224, + "step": 5388 + }, + { + "epoch": 0.3003901895206243, + "grad_norm": 0.5111548900604248, + "learning_rate": 8e-05, + "loss": 1.7059, + "step": 5389 + }, + { + "epoch": 0.3004459308807135, + "grad_norm": 0.462700754404068, + "learning_rate": 8e-05, + "loss": 1.578, + "step": 5390 + }, + { + "epoch": 0.30050167224080265, + "grad_norm": 0.5779263377189636, + "learning_rate": 8e-05, + "loss": 1.8235, + "step": 5391 + }, + { + "epoch": 0.3005574136008919, + "grad_norm": 1.2829216718673706, + "learning_rate": 8e-05, + "loss": 1.575, + "step": 5392 + }, + { + "epoch": 0.30061315496098107, + "grad_norm": 0.5034168362617493, + "learning_rate": 8e-05, + "loss": 1.5587, + "step": 5393 + }, + { + "epoch": 0.30066889632107024, + "grad_norm": 0.49088746309280396, + "learning_rate": 8e-05, + "loss": 1.4346, + "step": 5394 + }, + { + "epoch": 0.3007246376811594, + "grad_norm": 0.5063590407371521, + "learning_rate": 8e-05, + "loss": 1.8368, + "step": 5395 + }, + { + "epoch": 0.3007803790412486, + "grad_norm": 0.5310071110725403, + "learning_rate": 8e-05, + "loss": 1.6953, + "step": 5396 + }, + { + "epoch": 0.3008361204013378, + "grad_norm": 0.5646356344223022, + "learning_rate": 8e-05, + "loss": 1.7004, + "step": 5397 + }, + { + "epoch": 0.30089186176142696, + "grad_norm": 0.4882517457008362, + "learning_rate": 8e-05, + "loss": 1.4631, + 
"step": 5398 + }, + { + "epoch": 0.30094760312151614, + "grad_norm": 0.5490998029708862, + "learning_rate": 8e-05, + "loss": 1.6273, + "step": 5399 + }, + { + "epoch": 0.3010033444816054, + "grad_norm": 0.4808351397514343, + "learning_rate": 8e-05, + "loss": 1.6429, + "step": 5400 + }, + { + "epoch": 0.30105908584169455, + "grad_norm": 0.49833354353904724, + "learning_rate": 8e-05, + "loss": 1.5635, + "step": 5401 + }, + { + "epoch": 0.30111482720178373, + "grad_norm": 0.5647911429405212, + "learning_rate": 8e-05, + "loss": 1.7225, + "step": 5402 + }, + { + "epoch": 0.3011705685618729, + "grad_norm": 0.5790539979934692, + "learning_rate": 8e-05, + "loss": 1.8063, + "step": 5403 + }, + { + "epoch": 0.3012263099219621, + "grad_norm": 0.44960105419158936, + "learning_rate": 8e-05, + "loss": 1.4754, + "step": 5404 + }, + { + "epoch": 0.30128205128205127, + "grad_norm": 0.5029364228248596, + "learning_rate": 8e-05, + "loss": 1.7869, + "step": 5405 + }, + { + "epoch": 0.30133779264214045, + "grad_norm": 0.5034712553024292, + "learning_rate": 8e-05, + "loss": 1.6554, + "step": 5406 + }, + { + "epoch": 0.3013935340022297, + "grad_norm": 0.5302921533584595, + "learning_rate": 8e-05, + "loss": 1.8542, + "step": 5407 + }, + { + "epoch": 0.30144927536231886, + "grad_norm": 0.5185369253158569, + "learning_rate": 8e-05, + "loss": 1.6084, + "step": 5408 + }, + { + "epoch": 0.30150501672240804, + "grad_norm": 0.48464417457580566, + "learning_rate": 8e-05, + "loss": 1.5505, + "step": 5409 + }, + { + "epoch": 0.3015607580824972, + "grad_norm": 0.5956443548202515, + "learning_rate": 8e-05, + "loss": 1.6358, + "step": 5410 + }, + { + "epoch": 0.3016164994425864, + "grad_norm": 0.5080055594444275, + "learning_rate": 8e-05, + "loss": 1.7066, + "step": 5411 + }, + { + "epoch": 0.3016722408026756, + "grad_norm": 0.5118548274040222, + "learning_rate": 8e-05, + "loss": 1.7763, + "step": 5412 + }, + { + "epoch": 0.30172798216276475, + "grad_norm": 0.5360879898071289, + "learning_rate": 
8e-05, + "loss": 1.8087, + "step": 5413 + }, + { + "epoch": 0.30178372352285393, + "grad_norm": 0.4963992238044739, + "learning_rate": 8e-05, + "loss": 1.7252, + "step": 5414 + }, + { + "epoch": 0.30183946488294316, + "grad_norm": 0.5266253352165222, + "learning_rate": 8e-05, + "loss": 1.7846, + "step": 5415 + }, + { + "epoch": 0.30189520624303234, + "grad_norm": 0.5456589460372925, + "learning_rate": 8e-05, + "loss": 1.8456, + "step": 5416 + }, + { + "epoch": 0.3019509476031215, + "grad_norm": 0.5212188959121704, + "learning_rate": 8e-05, + "loss": 1.6749, + "step": 5417 + }, + { + "epoch": 0.3020066889632107, + "grad_norm": 0.550622284412384, + "learning_rate": 8e-05, + "loss": 1.6791, + "step": 5418 + }, + { + "epoch": 0.3020624303232999, + "grad_norm": 0.4872855246067047, + "learning_rate": 8e-05, + "loss": 1.6679, + "step": 5419 + }, + { + "epoch": 0.30211817168338906, + "grad_norm": 0.5363277196884155, + "learning_rate": 8e-05, + "loss": 1.8039, + "step": 5420 + }, + { + "epoch": 0.30217391304347824, + "grad_norm": 0.48044562339782715, + "learning_rate": 8e-05, + "loss": 1.5689, + "step": 5421 + }, + { + "epoch": 0.30222965440356747, + "grad_norm": 0.5340180993080139, + "learning_rate": 8e-05, + "loss": 1.6537, + "step": 5422 + }, + { + "epoch": 0.30228539576365665, + "grad_norm": 0.4993794560432434, + "learning_rate": 8e-05, + "loss": 1.7971, + "step": 5423 + }, + { + "epoch": 0.30234113712374583, + "grad_norm": 0.5311400294303894, + "learning_rate": 8e-05, + "loss": 1.7135, + "step": 5424 + }, + { + "epoch": 0.302396878483835, + "grad_norm": 0.5154658555984497, + "learning_rate": 8e-05, + "loss": 1.7769, + "step": 5425 + }, + { + "epoch": 0.3024526198439242, + "grad_norm": 0.5265001058578491, + "learning_rate": 8e-05, + "loss": 1.7175, + "step": 5426 + }, + { + "epoch": 0.30250836120401337, + "grad_norm": 0.540492594242096, + "learning_rate": 8e-05, + "loss": 1.8148, + "step": 5427 + }, + { + "epoch": 0.30256410256410254, + "grad_norm": 0.5124557614326477, 
+ "learning_rate": 8e-05, + "loss": 1.7311, + "step": 5428 + }, + { + "epoch": 0.3026198439241917, + "grad_norm": 0.5288158059120178, + "learning_rate": 8e-05, + "loss": 1.8849, + "step": 5429 + }, + { + "epoch": 0.30267558528428096, + "grad_norm": 0.4883910119533539, + "learning_rate": 8e-05, + "loss": 1.6892, + "step": 5430 + }, + { + "epoch": 0.30273132664437014, + "grad_norm": 0.5443893074989319, + "learning_rate": 8e-05, + "loss": 2.0222, + "step": 5431 + }, + { + "epoch": 0.3027870680044593, + "grad_norm": 0.5112580060958862, + "learning_rate": 8e-05, + "loss": 1.7861, + "step": 5432 + }, + { + "epoch": 0.3028428093645485, + "grad_norm": 0.4806899428367615, + "learning_rate": 8e-05, + "loss": 1.7085, + "step": 5433 + }, + { + "epoch": 0.30289855072463767, + "grad_norm": 0.4477013647556305, + "learning_rate": 8e-05, + "loss": 1.2138, + "step": 5434 + }, + { + "epoch": 0.30295429208472685, + "grad_norm": 0.5485127568244934, + "learning_rate": 8e-05, + "loss": 1.8087, + "step": 5435 + }, + { + "epoch": 0.30301003344481603, + "grad_norm": 0.5672134757041931, + "learning_rate": 8e-05, + "loss": 1.8478, + "step": 5436 + }, + { + "epoch": 0.30306577480490526, + "grad_norm": 0.4998778998851776, + "learning_rate": 8e-05, + "loss": 1.6146, + "step": 5437 + }, + { + "epoch": 0.30312151616499444, + "grad_norm": 0.5240135788917542, + "learning_rate": 8e-05, + "loss": 1.6686, + "step": 5438 + }, + { + "epoch": 0.3031772575250836, + "grad_norm": 0.5832547545433044, + "learning_rate": 8e-05, + "loss": 1.9084, + "step": 5439 + }, + { + "epoch": 0.3032329988851728, + "grad_norm": 0.4926720857620239, + "learning_rate": 8e-05, + "loss": 1.675, + "step": 5440 + }, + { + "epoch": 0.303288740245262, + "grad_norm": 0.5205129981040955, + "learning_rate": 8e-05, + "loss": 1.7816, + "step": 5441 + }, + { + "epoch": 0.30334448160535116, + "grad_norm": 0.4628882110118866, + "learning_rate": 8e-05, + "loss": 1.4962, + "step": 5442 + }, + { + "epoch": 0.30340022296544034, + "grad_norm": 
0.5137123465538025, + "learning_rate": 8e-05, + "loss": 1.651, + "step": 5443 + }, + { + "epoch": 0.30345596432552957, + "grad_norm": 0.5469315648078918, + "learning_rate": 8e-05, + "loss": 1.553, + "step": 5444 + }, + { + "epoch": 0.30351170568561875, + "grad_norm": 0.5205637812614441, + "learning_rate": 8e-05, + "loss": 1.7943, + "step": 5445 + }, + { + "epoch": 0.3035674470457079, + "grad_norm": 0.54112309217453, + "learning_rate": 8e-05, + "loss": 1.7074, + "step": 5446 + }, + { + "epoch": 0.3036231884057971, + "grad_norm": 0.4951598644256592, + "learning_rate": 8e-05, + "loss": 1.792, + "step": 5447 + }, + { + "epoch": 0.3036789297658863, + "grad_norm": 0.5104148983955383, + "learning_rate": 8e-05, + "loss": 1.807, + "step": 5448 + }, + { + "epoch": 0.30373467112597546, + "grad_norm": 0.5905737280845642, + "learning_rate": 8e-05, + "loss": 1.6313, + "step": 5449 + }, + { + "epoch": 0.30379041248606464, + "grad_norm": 0.4556120038032532, + "learning_rate": 8e-05, + "loss": 1.2949, + "step": 5450 + }, + { + "epoch": 0.3038461538461538, + "grad_norm": 0.5705082416534424, + "learning_rate": 8e-05, + "loss": 1.8666, + "step": 5451 + }, + { + "epoch": 0.30390189520624306, + "grad_norm": 0.5135024785995483, + "learning_rate": 8e-05, + "loss": 1.858, + "step": 5452 + }, + { + "epoch": 0.30395763656633223, + "grad_norm": 0.6063351631164551, + "learning_rate": 8e-05, + "loss": 1.831, + "step": 5453 + }, + { + "epoch": 0.3040133779264214, + "grad_norm": 0.5353469252586365, + "learning_rate": 8e-05, + "loss": 1.7517, + "step": 5454 + }, + { + "epoch": 0.3040691192865106, + "grad_norm": 0.5837903618812561, + "learning_rate": 8e-05, + "loss": 2.0732, + "step": 5455 + }, + { + "epoch": 0.30412486064659977, + "grad_norm": 0.46746090054512024, + "learning_rate": 8e-05, + "loss": 1.5466, + "step": 5456 + }, + { + "epoch": 0.30418060200668895, + "grad_norm": 0.5374937653541565, + "learning_rate": 8e-05, + "loss": 1.8763, + "step": 5457 + }, + { + "epoch": 0.30423634336677813, + 
"grad_norm": 0.5043895244598389, + "learning_rate": 8e-05, + "loss": 1.8551, + "step": 5458 + }, + { + "epoch": 0.30429208472686736, + "grad_norm": 0.522622287273407, + "learning_rate": 8e-05, + "loss": 1.7695, + "step": 5459 + }, + { + "epoch": 0.30434782608695654, + "grad_norm": 0.5085619688034058, + "learning_rate": 8e-05, + "loss": 1.7313, + "step": 5460 + }, + { + "epoch": 0.3044035674470457, + "grad_norm": 0.5232932567596436, + "learning_rate": 8e-05, + "loss": 1.5676, + "step": 5461 + }, + { + "epoch": 0.3044593088071349, + "grad_norm": 0.499271422624588, + "learning_rate": 8e-05, + "loss": 1.8006, + "step": 5462 + }, + { + "epoch": 0.3045150501672241, + "grad_norm": 0.5241374969482422, + "learning_rate": 8e-05, + "loss": 1.7208, + "step": 5463 + }, + { + "epoch": 0.30457079152731326, + "grad_norm": 0.5245078206062317, + "learning_rate": 8e-05, + "loss": 1.9892, + "step": 5464 + }, + { + "epoch": 0.30462653288740243, + "grad_norm": 0.49973732233047485, + "learning_rate": 8e-05, + "loss": 1.6267, + "step": 5465 + }, + { + "epoch": 0.3046822742474916, + "grad_norm": 0.5292554497718811, + "learning_rate": 8e-05, + "loss": 1.3804, + "step": 5466 + }, + { + "epoch": 0.30473801560758085, + "grad_norm": 0.5208456516265869, + "learning_rate": 8e-05, + "loss": 1.6966, + "step": 5467 + }, + { + "epoch": 0.30479375696767, + "grad_norm": 0.49586382508277893, + "learning_rate": 8e-05, + "loss": 1.7829, + "step": 5468 + }, + { + "epoch": 0.3048494983277592, + "grad_norm": 0.5234887599945068, + "learning_rate": 8e-05, + "loss": 1.6937, + "step": 5469 + }, + { + "epoch": 0.3049052396878484, + "grad_norm": 0.49152565002441406, + "learning_rate": 8e-05, + "loss": 1.7413, + "step": 5470 + }, + { + "epoch": 0.30496098104793756, + "grad_norm": 0.4878113269805908, + "learning_rate": 8e-05, + "loss": 1.528, + "step": 5471 + }, + { + "epoch": 0.30501672240802674, + "grad_norm": 0.6157654523849487, + "learning_rate": 8e-05, + "loss": 1.8596, + "step": 5472 + }, + { + "epoch": 
0.3050724637681159, + "grad_norm": 0.5470446944236755, + "learning_rate": 8e-05, + "loss": 1.7643, + "step": 5473 + }, + { + "epoch": 0.30512820512820515, + "grad_norm": 0.4969521164894104, + "learning_rate": 8e-05, + "loss": 1.5607, + "step": 5474 + }, + { + "epoch": 0.30518394648829433, + "grad_norm": 0.4928933084011078, + "learning_rate": 8e-05, + "loss": 1.7484, + "step": 5475 + }, + { + "epoch": 0.3052396878483835, + "grad_norm": 0.5175816416740417, + "learning_rate": 8e-05, + "loss": 1.6989, + "step": 5476 + }, + { + "epoch": 0.3052954292084727, + "grad_norm": 0.6144953370094299, + "learning_rate": 8e-05, + "loss": 1.7375, + "step": 5477 + }, + { + "epoch": 0.30535117056856187, + "grad_norm": 0.5331835150718689, + "learning_rate": 8e-05, + "loss": 1.7825, + "step": 5478 + }, + { + "epoch": 0.30540691192865105, + "grad_norm": 0.49090802669525146, + "learning_rate": 8e-05, + "loss": 1.6912, + "step": 5479 + }, + { + "epoch": 0.3054626532887402, + "grad_norm": 0.5607184767723083, + "learning_rate": 8e-05, + "loss": 1.8406, + "step": 5480 + }, + { + "epoch": 0.3055183946488294, + "grad_norm": 0.48209232091903687, + "learning_rate": 8e-05, + "loss": 1.7219, + "step": 5481 + }, + { + "epoch": 0.30557413600891864, + "grad_norm": 0.5262097120285034, + "learning_rate": 8e-05, + "loss": 1.7903, + "step": 5482 + }, + { + "epoch": 0.3056298773690078, + "grad_norm": 0.5120763182640076, + "learning_rate": 8e-05, + "loss": 1.5862, + "step": 5483 + }, + { + "epoch": 0.305685618729097, + "grad_norm": 0.5171751379966736, + "learning_rate": 8e-05, + "loss": 1.6802, + "step": 5484 + }, + { + "epoch": 0.3057413600891862, + "grad_norm": 0.4827384352684021, + "learning_rate": 8e-05, + "loss": 1.6011, + "step": 5485 + }, + { + "epoch": 0.30579710144927535, + "grad_norm": 0.5480244159698486, + "learning_rate": 8e-05, + "loss": 1.7993, + "step": 5486 + }, + { + "epoch": 0.30585284280936453, + "grad_norm": 0.506036639213562, + "learning_rate": 8e-05, + "loss": 1.7115, + "step": 5487 + 
}, + { + "epoch": 0.3059085841694537, + "grad_norm": 0.4862762689590454, + "learning_rate": 8e-05, + "loss": 1.5576, + "step": 5488 + }, + { + "epoch": 0.30596432552954295, + "grad_norm": 0.5329150557518005, + "learning_rate": 8e-05, + "loss": 1.7333, + "step": 5489 + }, + { + "epoch": 0.3060200668896321, + "grad_norm": 0.5178322196006775, + "learning_rate": 8e-05, + "loss": 1.5498, + "step": 5490 + }, + { + "epoch": 0.3060758082497213, + "grad_norm": 0.4974115490913391, + "learning_rate": 8e-05, + "loss": 1.7897, + "step": 5491 + }, + { + "epoch": 0.3061315496098105, + "grad_norm": 0.494586318731308, + "learning_rate": 8e-05, + "loss": 1.6622, + "step": 5492 + }, + { + "epoch": 0.30618729096989966, + "grad_norm": 0.48024600744247437, + "learning_rate": 8e-05, + "loss": 1.6855, + "step": 5493 + }, + { + "epoch": 0.30624303232998884, + "grad_norm": 0.5140489339828491, + "learning_rate": 8e-05, + "loss": 1.706, + "step": 5494 + }, + { + "epoch": 0.306298773690078, + "grad_norm": 0.5151708722114563, + "learning_rate": 8e-05, + "loss": 1.6387, + "step": 5495 + }, + { + "epoch": 0.3063545150501672, + "grad_norm": 0.4907633364200592, + "learning_rate": 8e-05, + "loss": 1.6687, + "step": 5496 + }, + { + "epoch": 0.30641025641025643, + "grad_norm": 0.5302457809448242, + "learning_rate": 8e-05, + "loss": 1.7727, + "step": 5497 + }, + { + "epoch": 0.3064659977703456, + "grad_norm": 0.47479885816574097, + "learning_rate": 8e-05, + "loss": 1.6267, + "step": 5498 + }, + { + "epoch": 0.3065217391304348, + "grad_norm": 0.5059468746185303, + "learning_rate": 8e-05, + "loss": 1.642, + "step": 5499 + }, + { + "epoch": 0.30657748049052397, + "grad_norm": 0.46230581402778625, + "learning_rate": 8e-05, + "loss": 1.5666, + "step": 5500 + }, + { + "epoch": 0.30663322185061315, + "grad_norm": 0.49751943349838257, + "learning_rate": 8e-05, + "loss": 1.6127, + "step": 5501 + }, + { + "epoch": 0.3066889632107023, + "grad_norm": 0.5246805548667908, + "learning_rate": 8e-05, + "loss": 1.9026, 
+ "step": 5502 + }, + { + "epoch": 0.3067447045707915, + "grad_norm": 0.5045078992843628, + "learning_rate": 8e-05, + "loss": 1.5361, + "step": 5503 + }, + { + "epoch": 0.30680044593088074, + "grad_norm": 0.5169194340705872, + "learning_rate": 8e-05, + "loss": 1.7405, + "step": 5504 + }, + { + "epoch": 0.3068561872909699, + "grad_norm": 0.521348774433136, + "learning_rate": 8e-05, + "loss": 1.7625, + "step": 5505 + }, + { + "epoch": 0.3069119286510591, + "grad_norm": 0.5427587032318115, + "learning_rate": 8e-05, + "loss": 1.8069, + "step": 5506 + }, + { + "epoch": 0.3069676700111483, + "grad_norm": 0.5333295464515686, + "learning_rate": 8e-05, + "loss": 1.707, + "step": 5507 + }, + { + "epoch": 0.30702341137123745, + "grad_norm": 0.5025301575660706, + "learning_rate": 8e-05, + "loss": 1.4767, + "step": 5508 + }, + { + "epoch": 0.30707915273132663, + "grad_norm": 0.5663952827453613, + "learning_rate": 8e-05, + "loss": 1.8625, + "step": 5509 + }, + { + "epoch": 0.3071348940914158, + "grad_norm": 0.4802990257740021, + "learning_rate": 8e-05, + "loss": 1.5655, + "step": 5510 + }, + { + "epoch": 0.307190635451505, + "grad_norm": 0.5298011898994446, + "learning_rate": 8e-05, + "loss": 1.8292, + "step": 5511 + }, + { + "epoch": 0.3072463768115942, + "grad_norm": 0.5581608414649963, + "learning_rate": 8e-05, + "loss": 1.7637, + "step": 5512 + }, + { + "epoch": 0.3073021181716834, + "grad_norm": 0.5245633125305176, + "learning_rate": 8e-05, + "loss": 1.8399, + "step": 5513 + }, + { + "epoch": 0.3073578595317726, + "grad_norm": 0.5069871544837952, + "learning_rate": 8e-05, + "loss": 1.669, + "step": 5514 + }, + { + "epoch": 0.30741360089186176, + "grad_norm": 0.5340516567230225, + "learning_rate": 8e-05, + "loss": 1.802, + "step": 5515 + }, + { + "epoch": 0.30746934225195094, + "grad_norm": 0.506650447845459, + "learning_rate": 8e-05, + "loss": 1.7352, + "step": 5516 + }, + { + "epoch": 0.3075250836120401, + "grad_norm": 0.5162363052368164, + "learning_rate": 8e-05, + 
"loss": 1.8099, + "step": 5517 + }, + { + "epoch": 0.3075808249721293, + "grad_norm": 0.4924274682998657, + "learning_rate": 8e-05, + "loss": 1.6239, + "step": 5518 + }, + { + "epoch": 0.30763656633221853, + "grad_norm": 0.47701767086982727, + "learning_rate": 8e-05, + "loss": 1.6575, + "step": 5519 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.49266427755355835, + "learning_rate": 8e-05, + "loss": 1.8003, + "step": 5520 + }, + { + "epoch": 0.3077480490523969, + "grad_norm": 0.5872482061386108, + "learning_rate": 8e-05, + "loss": 1.6614, + "step": 5521 + }, + { + "epoch": 0.30780379041248607, + "grad_norm": 0.5668823719024658, + "learning_rate": 8e-05, + "loss": 1.5553, + "step": 5522 + }, + { + "epoch": 0.30785953177257525, + "grad_norm": 0.4845235347747803, + "learning_rate": 8e-05, + "loss": 1.7007, + "step": 5523 + }, + { + "epoch": 0.3079152731326644, + "grad_norm": 0.48626527190208435, + "learning_rate": 8e-05, + "loss": 1.7046, + "step": 5524 + }, + { + "epoch": 0.3079710144927536, + "grad_norm": 0.5815119743347168, + "learning_rate": 8e-05, + "loss": 2.0158, + "step": 5525 + }, + { + "epoch": 0.3080267558528428, + "grad_norm": 0.4801939129829407, + "learning_rate": 8e-05, + "loss": 1.5951, + "step": 5526 + }, + { + "epoch": 0.308082497212932, + "grad_norm": 0.5311259627342224, + "learning_rate": 8e-05, + "loss": 2.0277, + "step": 5527 + }, + { + "epoch": 0.3081382385730212, + "grad_norm": 0.5235937237739563, + "learning_rate": 8e-05, + "loss": 1.9141, + "step": 5528 + }, + { + "epoch": 0.3081939799331104, + "grad_norm": 0.5488254427909851, + "learning_rate": 8e-05, + "loss": 1.8013, + "step": 5529 + }, + { + "epoch": 0.30824972129319955, + "grad_norm": 0.4970884919166565, + "learning_rate": 8e-05, + "loss": 1.7506, + "step": 5530 + }, + { + "epoch": 0.30830546265328873, + "grad_norm": 0.5055081844329834, + "learning_rate": 8e-05, + "loss": 1.8335, + "step": 5531 + }, + { + "epoch": 0.3083612040133779, + "grad_norm": 0.4809627830982208, + 
"learning_rate": 8e-05, + "loss": 1.6935, + "step": 5532 + }, + { + "epoch": 0.3084169453734671, + "grad_norm": 0.47954025864601135, + "learning_rate": 8e-05, + "loss": 1.6094, + "step": 5533 + }, + { + "epoch": 0.3084726867335563, + "grad_norm": 0.5082470178604126, + "learning_rate": 8e-05, + "loss": 1.5929, + "step": 5534 + }, + { + "epoch": 0.3085284280936455, + "grad_norm": 0.5128086805343628, + "learning_rate": 8e-05, + "loss": 1.6322, + "step": 5535 + }, + { + "epoch": 0.3085841694537347, + "grad_norm": 0.4984266459941864, + "learning_rate": 8e-05, + "loss": 1.7852, + "step": 5536 + }, + { + "epoch": 0.30863991081382386, + "grad_norm": 0.49187591671943665, + "learning_rate": 8e-05, + "loss": 1.5455, + "step": 5537 + }, + { + "epoch": 0.30869565217391304, + "grad_norm": 0.5065392851829529, + "learning_rate": 8e-05, + "loss": 1.6408, + "step": 5538 + }, + { + "epoch": 0.3087513935340022, + "grad_norm": 0.511028528213501, + "learning_rate": 8e-05, + "loss": 1.566, + "step": 5539 + }, + { + "epoch": 0.3088071348940914, + "grad_norm": 0.5834546685218811, + "learning_rate": 8e-05, + "loss": 1.6334, + "step": 5540 + }, + { + "epoch": 0.30886287625418063, + "grad_norm": 0.5535149574279785, + "learning_rate": 8e-05, + "loss": 1.776, + "step": 5541 + }, + { + "epoch": 0.3089186176142698, + "grad_norm": 0.5388762354850769, + "learning_rate": 8e-05, + "loss": 1.2867, + "step": 5542 + }, + { + "epoch": 0.308974358974359, + "grad_norm": 0.5357641577720642, + "learning_rate": 8e-05, + "loss": 1.8904, + "step": 5543 + }, + { + "epoch": 0.30903010033444817, + "grad_norm": 0.5241168141365051, + "learning_rate": 8e-05, + "loss": 1.7835, + "step": 5544 + }, + { + "epoch": 0.30908584169453734, + "grad_norm": 0.5548775792121887, + "learning_rate": 8e-05, + "loss": 1.8013, + "step": 5545 + }, + { + "epoch": 0.3091415830546265, + "grad_norm": 0.4995080828666687, + "learning_rate": 8e-05, + "loss": 1.6661, + "step": 5546 + }, + { + "epoch": 0.3091973244147157, + "grad_norm": 
0.5398724675178528, + "learning_rate": 8e-05, + "loss": 1.8868, + "step": 5547 + }, + { + "epoch": 0.3092530657748049, + "grad_norm": 0.4948381185531616, + "learning_rate": 8e-05, + "loss": 1.8113, + "step": 5548 + }, + { + "epoch": 0.3093088071348941, + "grad_norm": 0.5208935737609863, + "learning_rate": 8e-05, + "loss": 1.7515, + "step": 5549 + }, + { + "epoch": 0.3093645484949833, + "grad_norm": 0.5310214757919312, + "learning_rate": 8e-05, + "loss": 1.6944, + "step": 5550 + }, + { + "epoch": 0.3094202898550725, + "grad_norm": 0.5287116765975952, + "learning_rate": 8e-05, + "loss": 1.797, + "step": 5551 + }, + { + "epoch": 0.30947603121516165, + "grad_norm": 0.5271665453910828, + "learning_rate": 8e-05, + "loss": 1.677, + "step": 5552 + }, + { + "epoch": 0.30953177257525083, + "grad_norm": 0.46743589639663696, + "learning_rate": 8e-05, + "loss": 1.5052, + "step": 5553 + }, + { + "epoch": 0.30958751393534, + "grad_norm": 0.4670599400997162, + "learning_rate": 8e-05, + "loss": 1.6102, + "step": 5554 + }, + { + "epoch": 0.3096432552954292, + "grad_norm": 0.5504299402236938, + "learning_rate": 8e-05, + "loss": 1.943, + "step": 5555 + }, + { + "epoch": 0.3096989966555184, + "grad_norm": 0.516667902469635, + "learning_rate": 8e-05, + "loss": 1.8104, + "step": 5556 + }, + { + "epoch": 0.3097547380156076, + "grad_norm": 0.5134239196777344, + "learning_rate": 8e-05, + "loss": 1.6219, + "step": 5557 + }, + { + "epoch": 0.3098104793756968, + "grad_norm": 0.5604968667030334, + "learning_rate": 8e-05, + "loss": 1.5912, + "step": 5558 + }, + { + "epoch": 0.30986622073578596, + "grad_norm": 0.4922977685928345, + "learning_rate": 8e-05, + "loss": 1.5991, + "step": 5559 + }, + { + "epoch": 0.30992196209587514, + "grad_norm": 0.49453943967819214, + "learning_rate": 8e-05, + "loss": 1.6388, + "step": 5560 + }, + { + "epoch": 0.3099777034559643, + "grad_norm": 0.48977014422416687, + "learning_rate": 8e-05, + "loss": 1.5058, + "step": 5561 + }, + { + "epoch": 0.3100334448160535, + 
"grad_norm": 0.5432201623916626, + "learning_rate": 8e-05, + "loss": 1.6409, + "step": 5562 + }, + { + "epoch": 0.3100891861761427, + "grad_norm": 0.560623288154602, + "learning_rate": 8e-05, + "loss": 1.8318, + "step": 5563 + }, + { + "epoch": 0.3101449275362319, + "grad_norm": 0.509880542755127, + "learning_rate": 8e-05, + "loss": 1.855, + "step": 5564 + }, + { + "epoch": 0.3102006688963211, + "grad_norm": 0.5146164298057556, + "learning_rate": 8e-05, + "loss": 1.5432, + "step": 5565 + }, + { + "epoch": 0.31025641025641026, + "grad_norm": 0.5598461031913757, + "learning_rate": 8e-05, + "loss": 1.8373, + "step": 5566 + }, + { + "epoch": 0.31031215161649944, + "grad_norm": 0.4729394018650055, + "learning_rate": 8e-05, + "loss": 1.5871, + "step": 5567 + }, + { + "epoch": 0.3103678929765886, + "grad_norm": 0.47193753719329834, + "learning_rate": 8e-05, + "loss": 1.5819, + "step": 5568 + }, + { + "epoch": 0.3104236343366778, + "grad_norm": 0.5307363867759705, + "learning_rate": 8e-05, + "loss": 1.6914, + "step": 5569 + }, + { + "epoch": 0.310479375696767, + "grad_norm": 0.5264828205108643, + "learning_rate": 8e-05, + "loss": 1.6782, + "step": 5570 + }, + { + "epoch": 0.3105351170568562, + "grad_norm": 0.5106006860733032, + "learning_rate": 8e-05, + "loss": 1.6445, + "step": 5571 + }, + { + "epoch": 0.3105908584169454, + "grad_norm": 0.49598777294158936, + "learning_rate": 8e-05, + "loss": 1.5761, + "step": 5572 + }, + { + "epoch": 0.31064659977703457, + "grad_norm": 0.48438408970832825, + "learning_rate": 8e-05, + "loss": 1.6204, + "step": 5573 + }, + { + "epoch": 0.31070234113712375, + "grad_norm": 0.5019887089729309, + "learning_rate": 8e-05, + "loss": 1.5752, + "step": 5574 + }, + { + "epoch": 0.31075808249721293, + "grad_norm": 0.49323540925979614, + "learning_rate": 8e-05, + "loss": 1.5624, + "step": 5575 + }, + { + "epoch": 0.3108138238573021, + "grad_norm": 0.4469727575778961, + "learning_rate": 8e-05, + "loss": 1.3725, + "step": 5576 + }, + { + "epoch": 
0.3108695652173913, + "grad_norm": 0.6091485619544983, + "learning_rate": 8e-05, + "loss": 1.6382, + "step": 5577 + }, + { + "epoch": 0.31092530657748046, + "grad_norm": 0.5549405813217163, + "learning_rate": 8e-05, + "loss": 1.6422, + "step": 5578 + }, + { + "epoch": 0.3109810479375697, + "grad_norm": 0.5730343461036682, + "learning_rate": 8e-05, + "loss": 1.7795, + "step": 5579 + }, + { + "epoch": 0.3110367892976589, + "grad_norm": 0.5435800552368164, + "learning_rate": 8e-05, + "loss": 1.7752, + "step": 5580 + }, + { + "epoch": 0.31109253065774806, + "grad_norm": 0.6630704998970032, + "learning_rate": 8e-05, + "loss": 2.1515, + "step": 5581 + }, + { + "epoch": 0.31114827201783724, + "grad_norm": 0.588741660118103, + "learning_rate": 8e-05, + "loss": 1.8684, + "step": 5582 + }, + { + "epoch": 0.3112040133779264, + "grad_norm": 0.5837110877037048, + "learning_rate": 8e-05, + "loss": 1.9009, + "step": 5583 + }, + { + "epoch": 0.3112597547380156, + "grad_norm": 0.5275765657424927, + "learning_rate": 8e-05, + "loss": 1.7688, + "step": 5584 + }, + { + "epoch": 0.31131549609810477, + "grad_norm": 0.5006588697433472, + "learning_rate": 8e-05, + "loss": 1.6435, + "step": 5585 + }, + { + "epoch": 0.311371237458194, + "grad_norm": 0.5062305331230164, + "learning_rate": 8e-05, + "loss": 1.5436, + "step": 5586 + }, + { + "epoch": 0.3114269788182832, + "grad_norm": 0.5304542779922485, + "learning_rate": 8e-05, + "loss": 1.8129, + "step": 5587 + }, + { + "epoch": 0.31148272017837236, + "grad_norm": 0.5773730874061584, + "learning_rate": 8e-05, + "loss": 2.1254, + "step": 5588 + }, + { + "epoch": 0.31153846153846154, + "grad_norm": 0.4913802742958069, + "learning_rate": 8e-05, + "loss": 1.7781, + "step": 5589 + }, + { + "epoch": 0.3115942028985507, + "grad_norm": 0.5441120266914368, + "learning_rate": 8e-05, + "loss": 1.878, + "step": 5590 + }, + { + "epoch": 0.3116499442586399, + "grad_norm": 0.5085309147834778, + "learning_rate": 8e-05, + "loss": 1.7295, + "step": 5591 + }, + 
{ + "epoch": 0.3117056856187291, + "grad_norm": 0.49739134311676025, + "learning_rate": 8e-05, + "loss": 1.6436, + "step": 5592 + }, + { + "epoch": 0.31176142697881826, + "grad_norm": 0.5027386546134949, + "learning_rate": 8e-05, + "loss": 1.6793, + "step": 5593 + }, + { + "epoch": 0.3118171683389075, + "grad_norm": 0.4946807026863098, + "learning_rate": 8e-05, + "loss": 1.7849, + "step": 5594 + }, + { + "epoch": 0.31187290969899667, + "grad_norm": 0.5363293290138245, + "learning_rate": 8e-05, + "loss": 1.568, + "step": 5595 + }, + { + "epoch": 0.31192865105908585, + "grad_norm": 0.501739501953125, + "learning_rate": 8e-05, + "loss": 1.7039, + "step": 5596 + }, + { + "epoch": 0.311984392419175, + "grad_norm": 0.5119023323059082, + "learning_rate": 8e-05, + "loss": 1.5821, + "step": 5597 + }, + { + "epoch": 0.3120401337792642, + "grad_norm": 0.5860930681228638, + "learning_rate": 8e-05, + "loss": 1.8778, + "step": 5598 + }, + { + "epoch": 0.3120958751393534, + "grad_norm": 0.5110373497009277, + "learning_rate": 8e-05, + "loss": 1.6558, + "step": 5599 + }, + { + "epoch": 0.31215161649944256, + "grad_norm": 0.45514994859695435, + "learning_rate": 8e-05, + "loss": 1.5136, + "step": 5600 + }, + { + "epoch": 0.3122073578595318, + "grad_norm": 0.5342680215835571, + "learning_rate": 8e-05, + "loss": 1.6569, + "step": 5601 + }, + { + "epoch": 0.312263099219621, + "grad_norm": 0.5177780389785767, + "learning_rate": 8e-05, + "loss": 1.7188, + "step": 5602 + }, + { + "epoch": 0.31231884057971016, + "grad_norm": 0.5124519467353821, + "learning_rate": 8e-05, + "loss": 1.6296, + "step": 5603 + }, + { + "epoch": 0.31237458193979933, + "grad_norm": 0.49985089898109436, + "learning_rate": 8e-05, + "loss": 1.5149, + "step": 5604 + }, + { + "epoch": 0.3124303232998885, + "grad_norm": 0.4940367639064789, + "learning_rate": 8e-05, + "loss": 1.6693, + "step": 5605 + }, + { + "epoch": 0.3124860646599777, + "grad_norm": 0.4849971532821655, + "learning_rate": 8e-05, + "loss": 1.4553, + 
"step": 5606 + }, + { + "epoch": 0.31254180602006687, + "grad_norm": 0.47547417879104614, + "learning_rate": 8e-05, + "loss": 1.8421, + "step": 5607 + }, + { + "epoch": 0.31259754738015605, + "grad_norm": 0.5307517647743225, + "learning_rate": 8e-05, + "loss": 1.7207, + "step": 5608 + }, + { + "epoch": 0.3126532887402453, + "grad_norm": 0.493900328874588, + "learning_rate": 8e-05, + "loss": 1.7417, + "step": 5609 + }, + { + "epoch": 0.31270903010033446, + "grad_norm": 0.4873904883861542, + "learning_rate": 8e-05, + "loss": 1.4415, + "step": 5610 + }, + { + "epoch": 0.31276477146042364, + "grad_norm": 0.5056470036506653, + "learning_rate": 8e-05, + "loss": 1.6919, + "step": 5611 + }, + { + "epoch": 0.3128205128205128, + "grad_norm": 0.47142380475997925, + "learning_rate": 8e-05, + "loss": 1.6281, + "step": 5612 + }, + { + "epoch": 0.312876254180602, + "grad_norm": 0.5202730894088745, + "learning_rate": 8e-05, + "loss": 1.9738, + "step": 5613 + }, + { + "epoch": 0.3129319955406912, + "grad_norm": 0.5567691326141357, + "learning_rate": 8e-05, + "loss": 1.6902, + "step": 5614 + }, + { + "epoch": 0.31298773690078036, + "grad_norm": 0.46796953678131104, + "learning_rate": 8e-05, + "loss": 1.5595, + "step": 5615 + }, + { + "epoch": 0.3130434782608696, + "grad_norm": 0.5604295134544373, + "learning_rate": 8e-05, + "loss": 2.0457, + "step": 5616 + }, + { + "epoch": 0.31309921962095877, + "grad_norm": 0.5760122537612915, + "learning_rate": 8e-05, + "loss": 2.0031, + "step": 5617 + }, + { + "epoch": 0.31315496098104795, + "grad_norm": 0.5523648858070374, + "learning_rate": 8e-05, + "loss": 1.7164, + "step": 5618 + }, + { + "epoch": 0.3132107023411371, + "grad_norm": 0.5255864262580872, + "learning_rate": 8e-05, + "loss": 1.6715, + "step": 5619 + }, + { + "epoch": 0.3132664437012263, + "grad_norm": 0.5574448704719543, + "learning_rate": 8e-05, + "loss": 1.6039, + "step": 5620 + }, + { + "epoch": 0.3133221850613155, + "grad_norm": 0.4774945378303528, + "learning_rate": 8e-05, + 
"loss": 1.8164, + "step": 5621 + }, + { + "epoch": 0.31337792642140466, + "grad_norm": 0.5023643970489502, + "learning_rate": 8e-05, + "loss": 1.5432, + "step": 5622 + }, + { + "epoch": 0.31343366778149384, + "grad_norm": 0.5021160244941711, + "learning_rate": 8e-05, + "loss": 1.8182, + "step": 5623 + }, + { + "epoch": 0.3134894091415831, + "grad_norm": 0.5544974207878113, + "learning_rate": 8e-05, + "loss": 1.6318, + "step": 5624 + }, + { + "epoch": 0.31354515050167225, + "grad_norm": 0.513565719127655, + "learning_rate": 8e-05, + "loss": 1.6485, + "step": 5625 + }, + { + "epoch": 0.31360089186176143, + "grad_norm": 0.5158968567848206, + "learning_rate": 8e-05, + "loss": 1.546, + "step": 5626 + }, + { + "epoch": 0.3136566332218506, + "grad_norm": 0.51337730884552, + "learning_rate": 8e-05, + "loss": 1.8192, + "step": 5627 + }, + { + "epoch": 0.3137123745819398, + "grad_norm": 0.5285170078277588, + "learning_rate": 8e-05, + "loss": 1.6804, + "step": 5628 + }, + { + "epoch": 0.31376811594202897, + "grad_norm": 0.5417566299438477, + "learning_rate": 8e-05, + "loss": 1.6337, + "step": 5629 + }, + { + "epoch": 0.31382385730211815, + "grad_norm": 0.5149359703063965, + "learning_rate": 8e-05, + "loss": 1.688, + "step": 5630 + }, + { + "epoch": 0.3138795986622074, + "grad_norm": 0.5370135307312012, + "learning_rate": 8e-05, + "loss": 1.7254, + "step": 5631 + }, + { + "epoch": 0.31393534002229656, + "grad_norm": 0.49705618619918823, + "learning_rate": 8e-05, + "loss": 1.6279, + "step": 5632 + }, + { + "epoch": 0.31399108138238574, + "grad_norm": 0.536984384059906, + "learning_rate": 8e-05, + "loss": 1.604, + "step": 5633 + }, + { + "epoch": 0.3140468227424749, + "grad_norm": 0.5114649534225464, + "learning_rate": 8e-05, + "loss": 1.8026, + "step": 5634 + }, + { + "epoch": 0.3141025641025641, + "grad_norm": 0.5303798317909241, + "learning_rate": 8e-05, + "loss": 1.5936, + "step": 5635 + }, + { + "epoch": 0.3141583054626533, + "grad_norm": 0.5324503183364868, + 
"learning_rate": 8e-05, + "loss": 1.7611, + "step": 5636 + }, + { + "epoch": 0.31421404682274245, + "grad_norm": 0.4875989258289337, + "learning_rate": 8e-05, + "loss": 1.5046, + "step": 5637 + }, + { + "epoch": 0.3142697881828317, + "grad_norm": 0.48831331729888916, + "learning_rate": 8e-05, + "loss": 1.6284, + "step": 5638 + }, + { + "epoch": 0.31432552954292087, + "grad_norm": 0.5204409956932068, + "learning_rate": 8e-05, + "loss": 1.7106, + "step": 5639 + }, + { + "epoch": 0.31438127090301005, + "grad_norm": 0.516324520111084, + "learning_rate": 8e-05, + "loss": 1.7483, + "step": 5640 + }, + { + "epoch": 0.3144370122630992, + "grad_norm": 0.5173261165618896, + "learning_rate": 8e-05, + "loss": 1.6939, + "step": 5641 + }, + { + "epoch": 0.3144927536231884, + "grad_norm": 0.5068714618682861, + "learning_rate": 8e-05, + "loss": 1.7087, + "step": 5642 + }, + { + "epoch": 0.3145484949832776, + "grad_norm": 0.4229615032672882, + "learning_rate": 8e-05, + "loss": 1.4433, + "step": 5643 + }, + { + "epoch": 0.31460423634336676, + "grad_norm": 0.5388889908790588, + "learning_rate": 8e-05, + "loss": 1.9121, + "step": 5644 + }, + { + "epoch": 0.31465997770345594, + "grad_norm": 0.4938584566116333, + "learning_rate": 8e-05, + "loss": 1.6384, + "step": 5645 + }, + { + "epoch": 0.3147157190635452, + "grad_norm": 0.5267650485038757, + "learning_rate": 8e-05, + "loss": 1.9155, + "step": 5646 + }, + { + "epoch": 0.31477146042363435, + "grad_norm": 0.5120892524719238, + "learning_rate": 8e-05, + "loss": 1.5734, + "step": 5647 + }, + { + "epoch": 0.31482720178372353, + "grad_norm": 0.5487409830093384, + "learning_rate": 8e-05, + "loss": 1.4687, + "step": 5648 + }, + { + "epoch": 0.3148829431438127, + "grad_norm": 0.5335332751274109, + "learning_rate": 8e-05, + "loss": 1.9635, + "step": 5649 + }, + { + "epoch": 0.3149386845039019, + "grad_norm": 0.4990962743759155, + "learning_rate": 8e-05, + "loss": 1.5531, + "step": 5650 + }, + { + "epoch": 0.31499442586399107, + "grad_norm": 
0.5237401127815247, + "learning_rate": 8e-05, + "loss": 1.8537, + "step": 5651 + }, + { + "epoch": 0.31505016722408025, + "grad_norm": 0.5074672698974609, + "learning_rate": 8e-05, + "loss": 1.6768, + "step": 5652 + }, + { + "epoch": 0.3151059085841695, + "grad_norm": 0.5036709308624268, + "learning_rate": 8e-05, + "loss": 1.6335, + "step": 5653 + }, + { + "epoch": 0.31516164994425866, + "grad_norm": 0.5381163358688354, + "learning_rate": 8e-05, + "loss": 1.701, + "step": 5654 + }, + { + "epoch": 0.31521739130434784, + "grad_norm": 0.5201897025108337, + "learning_rate": 8e-05, + "loss": 1.702, + "step": 5655 + }, + { + "epoch": 0.315273132664437, + "grad_norm": 0.5196989178657532, + "learning_rate": 8e-05, + "loss": 1.5655, + "step": 5656 + }, + { + "epoch": 0.3153288740245262, + "grad_norm": 0.5282043218612671, + "learning_rate": 8e-05, + "loss": 1.8807, + "step": 5657 + }, + { + "epoch": 0.3153846153846154, + "grad_norm": 0.5148075819015503, + "learning_rate": 8e-05, + "loss": 1.6046, + "step": 5658 + }, + { + "epoch": 0.31544035674470455, + "grad_norm": 0.5050842761993408, + "learning_rate": 8e-05, + "loss": 1.6702, + "step": 5659 + }, + { + "epoch": 0.31549609810479373, + "grad_norm": 0.5463430285453796, + "learning_rate": 8e-05, + "loss": 1.9318, + "step": 5660 + }, + { + "epoch": 0.31555183946488297, + "grad_norm": 0.5092810392379761, + "learning_rate": 8e-05, + "loss": 1.7894, + "step": 5661 + }, + { + "epoch": 0.31560758082497214, + "grad_norm": 0.5707821249961853, + "learning_rate": 8e-05, + "loss": 1.6085, + "step": 5662 + }, + { + "epoch": 0.3156633221850613, + "grad_norm": 0.5087369680404663, + "learning_rate": 8e-05, + "loss": 1.6965, + "step": 5663 + }, + { + "epoch": 0.3157190635451505, + "grad_norm": 0.5232574343681335, + "learning_rate": 8e-05, + "loss": 1.8687, + "step": 5664 + }, + { + "epoch": 0.3157748049052397, + "grad_norm": 0.5404393076896667, + "learning_rate": 8e-05, + "loss": 1.9312, + "step": 5665 + }, + { + "epoch": 0.31583054626532886, 
+ "grad_norm": 0.4950511157512665, + "learning_rate": 8e-05, + "loss": 1.6128, + "step": 5666 + }, + { + "epoch": 0.31588628762541804, + "grad_norm": 0.4811279773712158, + "learning_rate": 8e-05, + "loss": 1.7009, + "step": 5667 + }, + { + "epoch": 0.3159420289855073, + "grad_norm": 0.48683613538742065, + "learning_rate": 8e-05, + "loss": 1.5999, + "step": 5668 + }, + { + "epoch": 0.31599777034559645, + "grad_norm": 0.4522567391395569, + "learning_rate": 8e-05, + "loss": 1.4242, + "step": 5669 + }, + { + "epoch": 0.31605351170568563, + "grad_norm": 0.5745465159416199, + "learning_rate": 8e-05, + "loss": 1.3288, + "step": 5670 + }, + { + "epoch": 0.3161092530657748, + "grad_norm": 0.577358067035675, + "learning_rate": 8e-05, + "loss": 1.9754, + "step": 5671 + }, + { + "epoch": 0.316164994425864, + "grad_norm": 0.5135446190834045, + "learning_rate": 8e-05, + "loss": 1.8025, + "step": 5672 + }, + { + "epoch": 0.31622073578595317, + "grad_norm": 0.4923860728740692, + "learning_rate": 8e-05, + "loss": 1.6862, + "step": 5673 + }, + { + "epoch": 0.31627647714604235, + "grad_norm": 0.5172598958015442, + "learning_rate": 8e-05, + "loss": 1.7144, + "step": 5674 + }, + { + "epoch": 0.3163322185061315, + "grad_norm": 0.5160543918609619, + "learning_rate": 8e-05, + "loss": 1.6935, + "step": 5675 + }, + { + "epoch": 0.31638795986622076, + "grad_norm": 0.44993856549263, + "learning_rate": 8e-05, + "loss": 1.4565, + "step": 5676 + }, + { + "epoch": 0.31644370122630994, + "grad_norm": 0.5209502577781677, + "learning_rate": 8e-05, + "loss": 1.6713, + "step": 5677 + }, + { + "epoch": 0.3164994425863991, + "grad_norm": 0.5097101330757141, + "learning_rate": 8e-05, + "loss": 1.8201, + "step": 5678 + }, + { + "epoch": 0.3165551839464883, + "grad_norm": 0.51374351978302, + "learning_rate": 8e-05, + "loss": 1.7345, + "step": 5679 + }, + { + "epoch": 0.3166109253065775, + "grad_norm": 0.6368524432182312, + "learning_rate": 8e-05, + "loss": 2.1954, + "step": 5680 + }, + { + "epoch": 
0.31666666666666665, + "grad_norm": 0.5003951191902161, + "learning_rate": 8e-05, + "loss": 1.5184, + "step": 5681 + }, + { + "epoch": 0.31672240802675583, + "grad_norm": 0.5287581086158752, + "learning_rate": 8e-05, + "loss": 1.8125, + "step": 5682 + }, + { + "epoch": 0.31677814938684506, + "grad_norm": 0.5246060490608215, + "learning_rate": 8e-05, + "loss": 1.7134, + "step": 5683 + }, + { + "epoch": 0.31683389074693424, + "grad_norm": 0.5368450284004211, + "learning_rate": 8e-05, + "loss": 1.9415, + "step": 5684 + }, + { + "epoch": 0.3168896321070234, + "grad_norm": 0.5108862519264221, + "learning_rate": 8e-05, + "loss": 1.7992, + "step": 5685 + }, + { + "epoch": 0.3169453734671126, + "grad_norm": 0.5072665214538574, + "learning_rate": 8e-05, + "loss": 1.6403, + "step": 5686 + }, + { + "epoch": 0.3170011148272018, + "grad_norm": 0.5059048533439636, + "learning_rate": 8e-05, + "loss": 1.6965, + "step": 5687 + }, + { + "epoch": 0.31705685618729096, + "grad_norm": 0.4943862557411194, + "learning_rate": 8e-05, + "loss": 1.7319, + "step": 5688 + }, + { + "epoch": 0.31711259754738014, + "grad_norm": 0.5123859643936157, + "learning_rate": 8e-05, + "loss": 1.715, + "step": 5689 + }, + { + "epoch": 0.3171683389074693, + "grad_norm": 0.5278445482254028, + "learning_rate": 8e-05, + "loss": 1.7291, + "step": 5690 + }, + { + "epoch": 0.31722408026755855, + "grad_norm": 0.4850660562515259, + "learning_rate": 8e-05, + "loss": 1.6429, + "step": 5691 + }, + { + "epoch": 0.31727982162764773, + "grad_norm": 0.4988071322441101, + "learning_rate": 8e-05, + "loss": 1.7774, + "step": 5692 + }, + { + "epoch": 0.3173355629877369, + "grad_norm": 0.5240743160247803, + "learning_rate": 8e-05, + "loss": 1.8149, + "step": 5693 + }, + { + "epoch": 0.3173913043478261, + "grad_norm": 0.5346617698669434, + "learning_rate": 8e-05, + "loss": 1.9376, + "step": 5694 + }, + { + "epoch": 0.31744704570791527, + "grad_norm": 0.4753016531467438, + "learning_rate": 8e-05, + "loss": 1.5924, + "step": 5695 + 
}, + { + "epoch": 0.31750278706800444, + "grad_norm": 0.48297837376594543, + "learning_rate": 8e-05, + "loss": 1.5537, + "step": 5696 + }, + { + "epoch": 0.3175585284280936, + "grad_norm": 0.5097260475158691, + "learning_rate": 8e-05, + "loss": 1.7266, + "step": 5697 + }, + { + "epoch": 0.31761426978818286, + "grad_norm": 0.4937364459037781, + "learning_rate": 8e-05, + "loss": 1.6826, + "step": 5698 + }, + { + "epoch": 0.31767001114827204, + "grad_norm": 0.4757471978664398, + "learning_rate": 8e-05, + "loss": 1.6335, + "step": 5699 + }, + { + "epoch": 0.3177257525083612, + "grad_norm": 0.5071625709533691, + "learning_rate": 8e-05, + "loss": 1.6613, + "step": 5700 + }, + { + "epoch": 0.3177814938684504, + "grad_norm": 0.5106151700019836, + "learning_rate": 8e-05, + "loss": 1.6416, + "step": 5701 + }, + { + "epoch": 0.31783723522853957, + "grad_norm": 0.5215732455253601, + "learning_rate": 8e-05, + "loss": 1.5861, + "step": 5702 + }, + { + "epoch": 0.31789297658862875, + "grad_norm": 0.5385761260986328, + "learning_rate": 8e-05, + "loss": 1.7751, + "step": 5703 + }, + { + "epoch": 0.31794871794871793, + "grad_norm": 0.5467097163200378, + "learning_rate": 8e-05, + "loss": 1.6679, + "step": 5704 + }, + { + "epoch": 0.3180044593088071, + "grad_norm": 0.5180816650390625, + "learning_rate": 8e-05, + "loss": 1.6077, + "step": 5705 + }, + { + "epoch": 0.31806020066889634, + "grad_norm": 0.5191653966903687, + "learning_rate": 8e-05, + "loss": 1.6352, + "step": 5706 + }, + { + "epoch": 0.3181159420289855, + "grad_norm": 0.4900323152542114, + "learning_rate": 8e-05, + "loss": 1.6899, + "step": 5707 + }, + { + "epoch": 0.3181716833890747, + "grad_norm": 0.5084742307662964, + "learning_rate": 8e-05, + "loss": 1.5656, + "step": 5708 + }, + { + "epoch": 0.3182274247491639, + "grad_norm": 0.5018293857574463, + "learning_rate": 8e-05, + "loss": 1.5945, + "step": 5709 + }, + { + "epoch": 0.31828316610925306, + "grad_norm": 0.5255138278007507, + "learning_rate": 8e-05, + "loss": 
1.8835, + "step": 5710 + }, + { + "epoch": 0.31833890746934224, + "grad_norm": 0.5178418755531311, + "learning_rate": 8e-05, + "loss": 1.6581, + "step": 5711 + }, + { + "epoch": 0.3183946488294314, + "grad_norm": 0.496112197637558, + "learning_rate": 8e-05, + "loss": 1.605, + "step": 5712 + }, + { + "epoch": 0.31845039018952065, + "grad_norm": 0.519536018371582, + "learning_rate": 8e-05, + "loss": 1.8334, + "step": 5713 + }, + { + "epoch": 0.31850613154960983, + "grad_norm": 0.47761979699134827, + "learning_rate": 8e-05, + "loss": 1.6399, + "step": 5714 + }, + { + "epoch": 0.318561872909699, + "grad_norm": 0.5270336866378784, + "learning_rate": 8e-05, + "loss": 1.6663, + "step": 5715 + }, + { + "epoch": 0.3186176142697882, + "grad_norm": 0.5025304555892944, + "learning_rate": 8e-05, + "loss": 1.649, + "step": 5716 + }, + { + "epoch": 0.31867335562987736, + "grad_norm": 0.5432649850845337, + "learning_rate": 8e-05, + "loss": 1.895, + "step": 5717 + }, + { + "epoch": 0.31872909698996654, + "grad_norm": 0.553049623966217, + "learning_rate": 8e-05, + "loss": 1.8611, + "step": 5718 + }, + { + "epoch": 0.3187848383500557, + "grad_norm": 0.4806937873363495, + "learning_rate": 8e-05, + "loss": 1.6396, + "step": 5719 + }, + { + "epoch": 0.3188405797101449, + "grad_norm": 0.478455126285553, + "learning_rate": 8e-05, + "loss": 1.5705, + "step": 5720 + }, + { + "epoch": 0.31889632107023413, + "grad_norm": 0.5210627913475037, + "learning_rate": 8e-05, + "loss": 1.4042, + "step": 5721 + }, + { + "epoch": 0.3189520624303233, + "grad_norm": 0.5083332657814026, + "learning_rate": 8e-05, + "loss": 1.671, + "step": 5722 + }, + { + "epoch": 0.3190078037904125, + "grad_norm": 0.49854928255081177, + "learning_rate": 8e-05, + "loss": 1.8224, + "step": 5723 + }, + { + "epoch": 0.31906354515050167, + "grad_norm": 0.5092363953590393, + "learning_rate": 8e-05, + "loss": 1.8449, + "step": 5724 + }, + { + "epoch": 0.31911928651059085, + "grad_norm": 0.5526291728019714, + "learning_rate": 
8e-05, + "loss": 1.6813, + "step": 5725 + }, + { + "epoch": 0.31917502787068003, + "grad_norm": 0.4988034665584564, + "learning_rate": 8e-05, + "loss": 1.6409, + "step": 5726 + }, + { + "epoch": 0.3192307692307692, + "grad_norm": 0.49596595764160156, + "learning_rate": 8e-05, + "loss": 1.6347, + "step": 5727 + }, + { + "epoch": 0.31928651059085844, + "grad_norm": 0.5038948059082031, + "learning_rate": 8e-05, + "loss": 1.7396, + "step": 5728 + }, + { + "epoch": 0.3193422519509476, + "grad_norm": 0.5159317851066589, + "learning_rate": 8e-05, + "loss": 1.5135, + "step": 5729 + }, + { + "epoch": 0.3193979933110368, + "grad_norm": 0.536002516746521, + "learning_rate": 8e-05, + "loss": 1.6583, + "step": 5730 + }, + { + "epoch": 0.319453734671126, + "grad_norm": 0.5048378705978394, + "learning_rate": 8e-05, + "loss": 1.7345, + "step": 5731 + }, + { + "epoch": 0.31950947603121516, + "grad_norm": 0.4891044497489929, + "learning_rate": 8e-05, + "loss": 1.6499, + "step": 5732 + }, + { + "epoch": 0.31956521739130433, + "grad_norm": 0.5046207308769226, + "learning_rate": 8e-05, + "loss": 1.6562, + "step": 5733 + }, + { + "epoch": 0.3196209587513935, + "grad_norm": 0.4974361062049866, + "learning_rate": 8e-05, + "loss": 1.6007, + "step": 5734 + }, + { + "epoch": 0.31967670011148275, + "grad_norm": 0.506041407585144, + "learning_rate": 8e-05, + "loss": 1.5283, + "step": 5735 + }, + { + "epoch": 0.3197324414715719, + "grad_norm": 0.5237359404563904, + "learning_rate": 8e-05, + "loss": 1.7842, + "step": 5736 + }, + { + "epoch": 0.3197881828316611, + "grad_norm": 0.5203297138214111, + "learning_rate": 8e-05, + "loss": 1.787, + "step": 5737 + }, + { + "epoch": 0.3198439241917503, + "grad_norm": 0.5403977036476135, + "learning_rate": 8e-05, + "loss": 1.8318, + "step": 5738 + }, + { + "epoch": 0.31989966555183946, + "grad_norm": 0.5268109440803528, + "learning_rate": 8e-05, + "loss": 1.8816, + "step": 5739 + }, + { + "epoch": 0.31995540691192864, + "grad_norm": 0.5357729196548462, + 
"learning_rate": 8e-05, + "loss": 1.689, + "step": 5740 + }, + { + "epoch": 0.3200111482720178, + "grad_norm": 0.5156355500221252, + "learning_rate": 8e-05, + "loss": 1.6791, + "step": 5741 + }, + { + "epoch": 0.320066889632107, + "grad_norm": 0.493498831987381, + "learning_rate": 8e-05, + "loss": 1.6286, + "step": 5742 + }, + { + "epoch": 0.32012263099219623, + "grad_norm": 0.5426554083824158, + "learning_rate": 8e-05, + "loss": 1.9256, + "step": 5743 + }, + { + "epoch": 0.3201783723522854, + "grad_norm": 0.5164318680763245, + "learning_rate": 8e-05, + "loss": 1.6362, + "step": 5744 + }, + { + "epoch": 0.3202341137123746, + "grad_norm": 0.5163710713386536, + "learning_rate": 8e-05, + "loss": 1.4276, + "step": 5745 + }, + { + "epoch": 0.32028985507246377, + "grad_norm": 0.529309093952179, + "learning_rate": 8e-05, + "loss": 1.7972, + "step": 5746 + }, + { + "epoch": 0.32034559643255295, + "grad_norm": 0.4801371097564697, + "learning_rate": 8e-05, + "loss": 1.6195, + "step": 5747 + }, + { + "epoch": 0.3204013377926421, + "grad_norm": 0.5078192949295044, + "learning_rate": 8e-05, + "loss": 1.6717, + "step": 5748 + }, + { + "epoch": 0.3204570791527313, + "grad_norm": 0.5345617532730103, + "learning_rate": 8e-05, + "loss": 1.629, + "step": 5749 + }, + { + "epoch": 0.32051282051282054, + "grad_norm": 0.4875049591064453, + "learning_rate": 8e-05, + "loss": 1.6211, + "step": 5750 + }, + { + "epoch": 0.3205685618729097, + "grad_norm": 0.48048990964889526, + "learning_rate": 8e-05, + "loss": 1.6774, + "step": 5751 + }, + { + "epoch": 0.3206243032329989, + "grad_norm": 0.5032520890235901, + "learning_rate": 8e-05, + "loss": 1.8064, + "step": 5752 + }, + { + "epoch": 0.3206800445930881, + "grad_norm": 0.4913550615310669, + "learning_rate": 8e-05, + "loss": 1.6187, + "step": 5753 + }, + { + "epoch": 0.32073578595317725, + "grad_norm": 0.5281867980957031, + "learning_rate": 8e-05, + "loss": 1.8087, + "step": 5754 + }, + { + "epoch": 0.32079152731326643, + "grad_norm": 
0.49618878960609436, + "learning_rate": 8e-05, + "loss": 1.6328, + "step": 5755 + }, + { + "epoch": 0.3208472686733556, + "grad_norm": 0.5247597694396973, + "learning_rate": 8e-05, + "loss": 1.8147, + "step": 5756 + }, + { + "epoch": 0.3209030100334448, + "grad_norm": 0.5163070559501648, + "learning_rate": 8e-05, + "loss": 1.6118, + "step": 5757 + }, + { + "epoch": 0.320958751393534, + "grad_norm": 0.49757692217826843, + "learning_rate": 8e-05, + "loss": 1.5848, + "step": 5758 + }, + { + "epoch": 0.3210144927536232, + "grad_norm": 0.5048686265945435, + "learning_rate": 8e-05, + "loss": 1.7403, + "step": 5759 + }, + { + "epoch": 0.3210702341137124, + "grad_norm": 0.5307775139808655, + "learning_rate": 8e-05, + "loss": 1.8107, + "step": 5760 + }, + { + "epoch": 0.32112597547380156, + "grad_norm": 0.4905966520309448, + "learning_rate": 8e-05, + "loss": 1.5565, + "step": 5761 + }, + { + "epoch": 0.32118171683389074, + "grad_norm": 0.5203377604484558, + "learning_rate": 8e-05, + "loss": 1.6875, + "step": 5762 + }, + { + "epoch": 0.3212374581939799, + "grad_norm": 0.48834821581840515, + "learning_rate": 8e-05, + "loss": 1.6393, + "step": 5763 + }, + { + "epoch": 0.3212931995540691, + "grad_norm": 0.5044954419136047, + "learning_rate": 8e-05, + "loss": 1.8203, + "step": 5764 + }, + { + "epoch": 0.32134894091415833, + "grad_norm": 0.5049839615821838, + "learning_rate": 8e-05, + "loss": 1.6837, + "step": 5765 + }, + { + "epoch": 0.3214046822742475, + "grad_norm": 0.4517059922218323, + "learning_rate": 8e-05, + "loss": 1.5599, + "step": 5766 + }, + { + "epoch": 0.3214604236343367, + "grad_norm": 0.48859703540802, + "learning_rate": 8e-05, + "loss": 1.4708, + "step": 5767 + }, + { + "epoch": 0.32151616499442587, + "grad_norm": 0.5075569152832031, + "learning_rate": 8e-05, + "loss": 1.6787, + "step": 5768 + }, + { + "epoch": 0.32157190635451505, + "grad_norm": 0.4926925599575043, + "learning_rate": 8e-05, + "loss": 1.6868, + "step": 5769 + }, + { + "epoch": 0.3216276477146042, 
+ "grad_norm": 0.5494069457054138, + "learning_rate": 8e-05, + "loss": 1.6931, + "step": 5770 + }, + { + "epoch": 0.3216833890746934, + "grad_norm": 0.5246031284332275, + "learning_rate": 8e-05, + "loss": 1.6851, + "step": 5771 + }, + { + "epoch": 0.3217391304347826, + "grad_norm": 0.5542581677436829, + "learning_rate": 8e-05, + "loss": 1.8545, + "step": 5772 + }, + { + "epoch": 0.3217948717948718, + "grad_norm": 0.514141321182251, + "learning_rate": 8e-05, + "loss": 1.5756, + "step": 5773 + }, + { + "epoch": 0.321850613154961, + "grad_norm": 0.48406264185905457, + "learning_rate": 8e-05, + "loss": 1.703, + "step": 5774 + }, + { + "epoch": 0.3219063545150502, + "grad_norm": 0.5315650105476379, + "learning_rate": 8e-05, + "loss": 1.681, + "step": 5775 + }, + { + "epoch": 0.32196209587513935, + "grad_norm": 0.5040196776390076, + "learning_rate": 8e-05, + "loss": 1.8194, + "step": 5776 + }, + { + "epoch": 0.32201783723522853, + "grad_norm": 0.5190441608428955, + "learning_rate": 8e-05, + "loss": 1.5079, + "step": 5777 + }, + { + "epoch": 0.3220735785953177, + "grad_norm": 0.5137271285057068, + "learning_rate": 8e-05, + "loss": 1.7097, + "step": 5778 + }, + { + "epoch": 0.3221293199554069, + "grad_norm": 0.5241565108299255, + "learning_rate": 8e-05, + "loss": 1.7308, + "step": 5779 + }, + { + "epoch": 0.3221850613154961, + "grad_norm": 0.5239174365997314, + "learning_rate": 8e-05, + "loss": 1.7569, + "step": 5780 + }, + { + "epoch": 0.3222408026755853, + "grad_norm": 0.5030615925788879, + "learning_rate": 8e-05, + "loss": 1.6763, + "step": 5781 + }, + { + "epoch": 0.3222965440356745, + "grad_norm": 0.5678612589836121, + "learning_rate": 8e-05, + "loss": 1.9383, + "step": 5782 + }, + { + "epoch": 0.32235228539576366, + "grad_norm": 0.5458984375, + "learning_rate": 8e-05, + "loss": 1.7486, + "step": 5783 + }, + { + "epoch": 0.32240802675585284, + "grad_norm": 0.48285484313964844, + "learning_rate": 8e-05, + "loss": 1.6632, + "step": 5784 + }, + { + "epoch": 
0.322463768115942, + "grad_norm": 0.5187762975692749, + "learning_rate": 8e-05, + "loss": 1.7322, + "step": 5785 + }, + { + "epoch": 0.3225195094760312, + "grad_norm": 0.5410240292549133, + "learning_rate": 8e-05, + "loss": 1.9998, + "step": 5786 + }, + { + "epoch": 0.3225752508361204, + "grad_norm": 0.4980333149433136, + "learning_rate": 8e-05, + "loss": 1.8457, + "step": 5787 + }, + { + "epoch": 0.3226309921962096, + "grad_norm": 0.475260853767395, + "learning_rate": 8e-05, + "loss": 1.6318, + "step": 5788 + }, + { + "epoch": 0.3226867335562988, + "grad_norm": 0.575297474861145, + "learning_rate": 8e-05, + "loss": 1.9413, + "step": 5789 + }, + { + "epoch": 0.32274247491638797, + "grad_norm": 0.5367628335952759, + "learning_rate": 8e-05, + "loss": 1.8158, + "step": 5790 + }, + { + "epoch": 0.32279821627647715, + "grad_norm": 0.5099639296531677, + "learning_rate": 8e-05, + "loss": 1.8162, + "step": 5791 + }, + { + "epoch": 0.3228539576365663, + "grad_norm": 0.5249901413917542, + "learning_rate": 8e-05, + "loss": 1.6299, + "step": 5792 + }, + { + "epoch": 0.3229096989966555, + "grad_norm": 0.5240976214408875, + "learning_rate": 8e-05, + "loss": 1.7886, + "step": 5793 + }, + { + "epoch": 0.3229654403567447, + "grad_norm": 0.49148687720298767, + "learning_rate": 8e-05, + "loss": 1.6332, + "step": 5794 + }, + { + "epoch": 0.3230211817168339, + "grad_norm": 0.5600979924201965, + "learning_rate": 8e-05, + "loss": 1.6854, + "step": 5795 + }, + { + "epoch": 0.3230769230769231, + "grad_norm": 0.4522090554237366, + "learning_rate": 8e-05, + "loss": 1.6729, + "step": 5796 + }, + { + "epoch": 0.3231326644370123, + "grad_norm": 0.5222184658050537, + "learning_rate": 8e-05, + "loss": 1.8499, + "step": 5797 + }, + { + "epoch": 0.32318840579710145, + "grad_norm": 0.5561509132385254, + "learning_rate": 8e-05, + "loss": 1.8946, + "step": 5798 + }, + { + "epoch": 0.32324414715719063, + "grad_norm": 0.49291518330574036, + "learning_rate": 8e-05, + "loss": 1.8201, + "step": 5799 + }, + 
{ + "epoch": 0.3232998885172798, + "grad_norm": 0.4637990891933441, + "learning_rate": 8e-05, + "loss": 1.6406, + "step": 5800 + }, + { + "epoch": 0.323355629877369, + "grad_norm": 0.6826379895210266, + "learning_rate": 8e-05, + "loss": 2.0064, + "step": 5801 + }, + { + "epoch": 0.32341137123745817, + "grad_norm": 0.49902430176734924, + "learning_rate": 8e-05, + "loss": 1.5408, + "step": 5802 + }, + { + "epoch": 0.3234671125975474, + "grad_norm": 0.6668875813484192, + "learning_rate": 8e-05, + "loss": 2.1603, + "step": 5803 + }, + { + "epoch": 0.3235228539576366, + "grad_norm": 0.498045951128006, + "learning_rate": 8e-05, + "loss": 1.5568, + "step": 5804 + }, + { + "epoch": 0.32357859531772576, + "grad_norm": 0.5072228908538818, + "learning_rate": 8e-05, + "loss": 1.6002, + "step": 5805 + }, + { + "epoch": 0.32363433667781494, + "grad_norm": 0.5403440594673157, + "learning_rate": 8e-05, + "loss": 1.5875, + "step": 5806 + }, + { + "epoch": 0.3236900780379041, + "grad_norm": 0.47508928179740906, + "learning_rate": 8e-05, + "loss": 1.5958, + "step": 5807 + }, + { + "epoch": 0.3237458193979933, + "grad_norm": 0.5487821698188782, + "learning_rate": 8e-05, + "loss": 1.6753, + "step": 5808 + }, + { + "epoch": 0.3238015607580825, + "grad_norm": 0.4984150826931, + "learning_rate": 8e-05, + "loss": 1.6836, + "step": 5809 + }, + { + "epoch": 0.3238573021181717, + "grad_norm": 0.5286592245101929, + "learning_rate": 8e-05, + "loss": 1.8199, + "step": 5810 + }, + { + "epoch": 0.3239130434782609, + "grad_norm": 0.4898530840873718, + "learning_rate": 8e-05, + "loss": 1.584, + "step": 5811 + }, + { + "epoch": 0.32396878483835007, + "grad_norm": 0.48954761028289795, + "learning_rate": 8e-05, + "loss": 1.6911, + "step": 5812 + }, + { + "epoch": 0.32402452619843924, + "grad_norm": 0.5214674472808838, + "learning_rate": 8e-05, + "loss": 1.7365, + "step": 5813 + }, + { + "epoch": 0.3240802675585284, + "grad_norm": 0.47776153683662415, + "learning_rate": 8e-05, + "loss": 1.5675, + 
"step": 5814 + }, + { + "epoch": 0.3241360089186176, + "grad_norm": 0.4698529839515686, + "learning_rate": 8e-05, + "loss": 1.4761, + "step": 5815 + }, + { + "epoch": 0.3241917502787068, + "grad_norm": 0.4994838833808899, + "learning_rate": 8e-05, + "loss": 1.6804, + "step": 5816 + }, + { + "epoch": 0.32424749163879596, + "grad_norm": 0.5514016151428223, + "learning_rate": 8e-05, + "loss": 1.701, + "step": 5817 + }, + { + "epoch": 0.3243032329988852, + "grad_norm": 0.5442790389060974, + "learning_rate": 8e-05, + "loss": 1.7503, + "step": 5818 + }, + { + "epoch": 0.3243589743589744, + "grad_norm": 0.5837408304214478, + "learning_rate": 8e-05, + "loss": 1.6954, + "step": 5819 + }, + { + "epoch": 0.32441471571906355, + "grad_norm": 0.531805694103241, + "learning_rate": 8e-05, + "loss": 1.7151, + "step": 5820 + }, + { + "epoch": 0.32447045707915273, + "grad_norm": 0.49297383427619934, + "learning_rate": 8e-05, + "loss": 1.6382, + "step": 5821 + }, + { + "epoch": 0.3245261984392419, + "grad_norm": 0.5175424218177795, + "learning_rate": 8e-05, + "loss": 1.7722, + "step": 5822 + }, + { + "epoch": 0.3245819397993311, + "grad_norm": 0.5085750222206116, + "learning_rate": 8e-05, + "loss": 1.7484, + "step": 5823 + }, + { + "epoch": 0.32463768115942027, + "grad_norm": 0.45293039083480835, + "learning_rate": 8e-05, + "loss": 1.4519, + "step": 5824 + }, + { + "epoch": 0.3246934225195095, + "grad_norm": 0.5808523893356323, + "learning_rate": 8e-05, + "loss": 1.7628, + "step": 5825 + }, + { + "epoch": 0.3247491638795987, + "grad_norm": 0.5092936754226685, + "learning_rate": 8e-05, + "loss": 1.5918, + "step": 5826 + }, + { + "epoch": 0.32480490523968786, + "grad_norm": 0.5005234479904175, + "learning_rate": 8e-05, + "loss": 1.5775, + "step": 5827 + }, + { + "epoch": 0.32486064659977704, + "grad_norm": 0.4908946454524994, + "learning_rate": 8e-05, + "loss": 1.6302, + "step": 5828 + }, + { + "epoch": 0.3249163879598662, + "grad_norm": 0.5337138175964355, + "learning_rate": 8e-05, + 
"loss": 1.7442, + "step": 5829 + }, + { + "epoch": 0.3249721293199554, + "grad_norm": 0.5431962013244629, + "learning_rate": 8e-05, + "loss": 1.8264, + "step": 5830 + }, + { + "epoch": 0.3250278706800446, + "grad_norm": 0.489956259727478, + "learning_rate": 8e-05, + "loss": 1.4519, + "step": 5831 + }, + { + "epoch": 0.3250836120401338, + "grad_norm": 0.4591715931892395, + "learning_rate": 8e-05, + "loss": 1.3764, + "step": 5832 + }, + { + "epoch": 0.325139353400223, + "grad_norm": 0.5103399753570557, + "learning_rate": 8e-05, + "loss": 1.8932, + "step": 5833 + }, + { + "epoch": 0.32519509476031216, + "grad_norm": 0.5174875855445862, + "learning_rate": 8e-05, + "loss": 1.6451, + "step": 5834 + }, + { + "epoch": 0.32525083612040134, + "grad_norm": 0.5021142959594727, + "learning_rate": 8e-05, + "loss": 1.7556, + "step": 5835 + }, + { + "epoch": 0.3253065774804905, + "grad_norm": 0.517684817314148, + "learning_rate": 8e-05, + "loss": 1.8316, + "step": 5836 + }, + { + "epoch": 0.3253623188405797, + "grad_norm": 0.500555694103241, + "learning_rate": 8e-05, + "loss": 1.7339, + "step": 5837 + }, + { + "epoch": 0.3254180602006689, + "grad_norm": 0.5088232159614563, + "learning_rate": 8e-05, + "loss": 1.771, + "step": 5838 + }, + { + "epoch": 0.32547380156075806, + "grad_norm": 0.4987097680568695, + "learning_rate": 8e-05, + "loss": 1.737, + "step": 5839 + }, + { + "epoch": 0.3255295429208473, + "grad_norm": 0.5131197571754456, + "learning_rate": 8e-05, + "loss": 1.846, + "step": 5840 + }, + { + "epoch": 0.32558528428093647, + "grad_norm": 0.5244218707084656, + "learning_rate": 8e-05, + "loss": 1.8961, + "step": 5841 + }, + { + "epoch": 0.32564102564102565, + "grad_norm": 0.4499856233596802, + "learning_rate": 8e-05, + "loss": 1.5501, + "step": 5842 + }, + { + "epoch": 0.32569676700111483, + "grad_norm": 0.4926725924015045, + "learning_rate": 8e-05, + "loss": 1.5282, + "step": 5843 + }, + { + "epoch": 0.325752508361204, + "grad_norm": 0.49711015820503235, + "learning_rate": 
8e-05, + "loss": 1.6335, + "step": 5844 + }, + { + "epoch": 0.3258082497212932, + "grad_norm": 0.48810988664627075, + "learning_rate": 8e-05, + "loss": 1.7216, + "step": 5845 + }, + { + "epoch": 0.32586399108138236, + "grad_norm": 0.49561434984207153, + "learning_rate": 8e-05, + "loss": 1.6098, + "step": 5846 + }, + { + "epoch": 0.3259197324414716, + "grad_norm": 0.5036212205886841, + "learning_rate": 8e-05, + "loss": 1.749, + "step": 5847 + }, + { + "epoch": 0.3259754738015608, + "grad_norm": 0.5274983644485474, + "learning_rate": 8e-05, + "loss": 1.7112, + "step": 5848 + }, + { + "epoch": 0.32603121516164996, + "grad_norm": 0.45901793241500854, + "learning_rate": 8e-05, + "loss": 1.5345, + "step": 5849 + }, + { + "epoch": 0.32608695652173914, + "grad_norm": 0.4732411801815033, + "learning_rate": 8e-05, + "loss": 1.6603, + "step": 5850 + }, + { + "epoch": 0.3261426978818283, + "grad_norm": 0.5366707444190979, + "learning_rate": 8e-05, + "loss": 1.6819, + "step": 5851 + }, + { + "epoch": 0.3261984392419175, + "grad_norm": 0.48991480469703674, + "learning_rate": 8e-05, + "loss": 1.667, + "step": 5852 + }, + { + "epoch": 0.32625418060200667, + "grad_norm": 0.5174232721328735, + "learning_rate": 8e-05, + "loss": 1.7938, + "step": 5853 + }, + { + "epoch": 0.32630992196209585, + "grad_norm": 0.6045074462890625, + "learning_rate": 8e-05, + "loss": 1.9182, + "step": 5854 + }, + { + "epoch": 0.3263656633221851, + "grad_norm": 0.5650275945663452, + "learning_rate": 8e-05, + "loss": 1.8663, + "step": 5855 + }, + { + "epoch": 0.32642140468227426, + "grad_norm": 0.5077049732208252, + "learning_rate": 8e-05, + "loss": 1.3586, + "step": 5856 + }, + { + "epoch": 0.32647714604236344, + "grad_norm": 0.47277364134788513, + "learning_rate": 8e-05, + "loss": 1.5135, + "step": 5857 + }, + { + "epoch": 0.3265328874024526, + "grad_norm": 0.49664178490638733, + "learning_rate": 8e-05, + "loss": 1.7544, + "step": 5858 + }, + { + "epoch": 0.3265886287625418, + "grad_norm": 
0.47362759709358215, + "learning_rate": 8e-05, + "loss": 1.588, + "step": 5859 + }, + { + "epoch": 0.326644370122631, + "grad_norm": 0.5836063623428345, + "learning_rate": 8e-05, + "loss": 1.8409, + "step": 5860 + }, + { + "epoch": 0.32670011148272016, + "grad_norm": 0.524232029914856, + "learning_rate": 8e-05, + "loss": 1.7189, + "step": 5861 + }, + { + "epoch": 0.3267558528428094, + "grad_norm": 0.4531126618385315, + "learning_rate": 8e-05, + "loss": 1.5504, + "step": 5862 + }, + { + "epoch": 0.32681159420289857, + "grad_norm": 0.5362951755523682, + "learning_rate": 8e-05, + "loss": 1.8944, + "step": 5863 + }, + { + "epoch": 0.32686733556298775, + "grad_norm": 0.542114794254303, + "learning_rate": 8e-05, + "loss": 1.7909, + "step": 5864 + }, + { + "epoch": 0.3269230769230769, + "grad_norm": 0.5658450126647949, + "learning_rate": 8e-05, + "loss": 1.9196, + "step": 5865 + }, + { + "epoch": 0.3269788182831661, + "grad_norm": 0.495639830827713, + "learning_rate": 8e-05, + "loss": 1.5991, + "step": 5866 + }, + { + "epoch": 0.3270345596432553, + "grad_norm": 0.5281422734260559, + "learning_rate": 8e-05, + "loss": 1.5525, + "step": 5867 + }, + { + "epoch": 0.32709030100334446, + "grad_norm": 0.5133866667747498, + "learning_rate": 8e-05, + "loss": 1.8848, + "step": 5868 + }, + { + "epoch": 0.32714604236343364, + "grad_norm": 0.5450278520584106, + "learning_rate": 8e-05, + "loss": 1.7997, + "step": 5869 + }, + { + "epoch": 0.3272017837235229, + "grad_norm": 0.5412588119506836, + "learning_rate": 8e-05, + "loss": 1.8669, + "step": 5870 + }, + { + "epoch": 0.32725752508361206, + "grad_norm": 0.5211524963378906, + "learning_rate": 8e-05, + "loss": 1.6767, + "step": 5871 + }, + { + "epoch": 0.32731326644370123, + "grad_norm": 0.505767285823822, + "learning_rate": 8e-05, + "loss": 1.7258, + "step": 5872 + }, + { + "epoch": 0.3273690078037904, + "grad_norm": 0.5113710761070251, + "learning_rate": 8e-05, + "loss": 1.8295, + "step": 5873 + }, + { + "epoch": 0.3274247491638796, + 
"grad_norm": 0.5271311402320862, + "learning_rate": 8e-05, + "loss": 2.0436, + "step": 5874 + }, + { + "epoch": 0.32748049052396877, + "grad_norm": 0.4804200530052185, + "learning_rate": 8e-05, + "loss": 1.5799, + "step": 5875 + }, + { + "epoch": 0.32753623188405795, + "grad_norm": 0.5401303768157959, + "learning_rate": 8e-05, + "loss": 1.887, + "step": 5876 + }, + { + "epoch": 0.3275919732441472, + "grad_norm": 0.5247963666915894, + "learning_rate": 8e-05, + "loss": 1.9989, + "step": 5877 + }, + { + "epoch": 0.32764771460423636, + "grad_norm": 0.5222291350364685, + "learning_rate": 8e-05, + "loss": 1.9649, + "step": 5878 + }, + { + "epoch": 0.32770345596432554, + "grad_norm": 0.5129970908164978, + "learning_rate": 8e-05, + "loss": 1.861, + "step": 5879 + }, + { + "epoch": 0.3277591973244147, + "grad_norm": 0.48588502407073975, + "learning_rate": 8e-05, + "loss": 1.5694, + "step": 5880 + }, + { + "epoch": 0.3278149386845039, + "grad_norm": 0.4927371144294739, + "learning_rate": 8e-05, + "loss": 1.7139, + "step": 5881 + }, + { + "epoch": 0.3278706800445931, + "grad_norm": 0.4911452829837799, + "learning_rate": 8e-05, + "loss": 1.5714, + "step": 5882 + }, + { + "epoch": 0.32792642140468226, + "grad_norm": 0.5015363097190857, + "learning_rate": 8e-05, + "loss": 1.5972, + "step": 5883 + }, + { + "epoch": 0.32798216276477143, + "grad_norm": 0.5393746495246887, + "learning_rate": 8e-05, + "loss": 1.7549, + "step": 5884 + }, + { + "epoch": 0.32803790412486067, + "grad_norm": 0.4892618656158447, + "learning_rate": 8e-05, + "loss": 1.7331, + "step": 5885 + }, + { + "epoch": 0.32809364548494985, + "grad_norm": 0.47445163130760193, + "learning_rate": 8e-05, + "loss": 1.3907, + "step": 5886 + }, + { + "epoch": 0.328149386845039, + "grad_norm": 0.5472539663314819, + "learning_rate": 8e-05, + "loss": 1.8212, + "step": 5887 + }, + { + "epoch": 0.3282051282051282, + "grad_norm": 0.51388019323349, + "learning_rate": 8e-05, + "loss": 1.7439, + "step": 5888 + }, + { + "epoch": 
0.3282608695652174, + "grad_norm": 0.4770544767379761, + "learning_rate": 8e-05, + "loss": 1.5598, + "step": 5889 + }, + { + "epoch": 0.32831661092530656, + "grad_norm": 0.5853254199028015, + "learning_rate": 8e-05, + "loss": 1.9501, + "step": 5890 + }, + { + "epoch": 0.32837235228539574, + "grad_norm": 0.559180736541748, + "learning_rate": 8e-05, + "loss": 1.9934, + "step": 5891 + }, + { + "epoch": 0.328428093645485, + "grad_norm": 0.5387535095214844, + "learning_rate": 8e-05, + "loss": 1.648, + "step": 5892 + }, + { + "epoch": 0.32848383500557415, + "grad_norm": 0.5571669936180115, + "learning_rate": 8e-05, + "loss": 1.9088, + "step": 5893 + }, + { + "epoch": 0.32853957636566333, + "grad_norm": 0.5683614611625671, + "learning_rate": 8e-05, + "loss": 1.9428, + "step": 5894 + }, + { + "epoch": 0.3285953177257525, + "grad_norm": 0.4627940058708191, + "learning_rate": 8e-05, + "loss": 1.43, + "step": 5895 + }, + { + "epoch": 0.3286510590858417, + "grad_norm": 0.5188383460044861, + "learning_rate": 8e-05, + "loss": 1.5124, + "step": 5896 + }, + { + "epoch": 0.32870680044593087, + "grad_norm": 0.5177381634712219, + "learning_rate": 8e-05, + "loss": 1.7765, + "step": 5897 + }, + { + "epoch": 0.32876254180602005, + "grad_norm": 0.5070524215698242, + "learning_rate": 8e-05, + "loss": 1.6511, + "step": 5898 + }, + { + "epoch": 0.3288182831661092, + "grad_norm": 0.4960431158542633, + "learning_rate": 8e-05, + "loss": 1.7412, + "step": 5899 + }, + { + "epoch": 0.32887402452619846, + "grad_norm": 0.5233199596405029, + "learning_rate": 8e-05, + "loss": 1.647, + "step": 5900 + }, + { + "epoch": 0.32892976588628764, + "grad_norm": 0.535688579082489, + "learning_rate": 8e-05, + "loss": 1.7367, + "step": 5901 + }, + { + "epoch": 0.3289855072463768, + "grad_norm": 0.5019519925117493, + "learning_rate": 8e-05, + "loss": 2.0115, + "step": 5902 + }, + { + "epoch": 0.329041248606466, + "grad_norm": 0.5208526253700256, + "learning_rate": 8e-05, + "loss": 1.6949, + "step": 5903 + }, + { 
+ "epoch": 0.3290969899665552, + "grad_norm": 0.5380985140800476, + "learning_rate": 8e-05, + "loss": 1.8018, + "step": 5904 + }, + { + "epoch": 0.32915273132664435, + "grad_norm": 0.5154558420181274, + "learning_rate": 8e-05, + "loss": 1.8893, + "step": 5905 + }, + { + "epoch": 0.32920847268673353, + "grad_norm": 0.5352508425712585, + "learning_rate": 8e-05, + "loss": 1.7274, + "step": 5906 + }, + { + "epoch": 0.32926421404682277, + "grad_norm": 0.4826267957687378, + "learning_rate": 8e-05, + "loss": 1.342, + "step": 5907 + }, + { + "epoch": 0.32931995540691195, + "grad_norm": 0.5051599144935608, + "learning_rate": 8e-05, + "loss": 1.5785, + "step": 5908 + }, + { + "epoch": 0.3293756967670011, + "grad_norm": 0.5084307193756104, + "learning_rate": 8e-05, + "loss": 1.6622, + "step": 5909 + }, + { + "epoch": 0.3294314381270903, + "grad_norm": 0.5318410992622375, + "learning_rate": 8e-05, + "loss": 1.7986, + "step": 5910 + }, + { + "epoch": 0.3294871794871795, + "grad_norm": 0.4830995202064514, + "learning_rate": 8e-05, + "loss": 1.4463, + "step": 5911 + }, + { + "epoch": 0.32954292084726866, + "grad_norm": 0.552952229976654, + "learning_rate": 8e-05, + "loss": 1.8705, + "step": 5912 + }, + { + "epoch": 0.32959866220735784, + "grad_norm": 0.5508795976638794, + "learning_rate": 8e-05, + "loss": 1.7107, + "step": 5913 + }, + { + "epoch": 0.329654403567447, + "grad_norm": 0.49108216166496277, + "learning_rate": 8e-05, + "loss": 1.6975, + "step": 5914 + }, + { + "epoch": 0.32971014492753625, + "grad_norm": 0.5228166580200195, + "learning_rate": 8e-05, + "loss": 1.6342, + "step": 5915 + }, + { + "epoch": 0.32976588628762543, + "grad_norm": 0.5140056610107422, + "learning_rate": 8e-05, + "loss": 1.738, + "step": 5916 + }, + { + "epoch": 0.3298216276477146, + "grad_norm": 0.5653532147407532, + "learning_rate": 8e-05, + "loss": 1.8411, + "step": 5917 + }, + { + "epoch": 0.3298773690078038, + "grad_norm": 0.5260170102119446, + "learning_rate": 8e-05, + "loss": 1.7909, + 
"step": 5918 + }, + { + "epoch": 0.32993311036789297, + "grad_norm": 0.5184810757637024, + "learning_rate": 8e-05, + "loss": 1.5628, + "step": 5919 + }, + { + "epoch": 0.32998885172798215, + "grad_norm": 0.507621705532074, + "learning_rate": 8e-05, + "loss": 1.7064, + "step": 5920 + }, + { + "epoch": 0.3300445930880713, + "grad_norm": 0.4716615378856659, + "learning_rate": 8e-05, + "loss": 1.5359, + "step": 5921 + }, + { + "epoch": 0.33010033444816056, + "grad_norm": 0.5044958591461182, + "learning_rate": 8e-05, + "loss": 1.5808, + "step": 5922 + }, + { + "epoch": 0.33015607580824974, + "grad_norm": 0.5160906910896301, + "learning_rate": 8e-05, + "loss": 1.5896, + "step": 5923 + }, + { + "epoch": 0.3302118171683389, + "grad_norm": 0.5261350274085999, + "learning_rate": 8e-05, + "loss": 1.7253, + "step": 5924 + }, + { + "epoch": 0.3302675585284281, + "grad_norm": 0.5316385626792908, + "learning_rate": 8e-05, + "loss": 1.6345, + "step": 5925 + }, + { + "epoch": 0.3303232998885173, + "grad_norm": 0.5023976564407349, + "learning_rate": 8e-05, + "loss": 1.6139, + "step": 5926 + }, + { + "epoch": 0.33037904124860645, + "grad_norm": 0.5029090046882629, + "learning_rate": 8e-05, + "loss": 1.5976, + "step": 5927 + }, + { + "epoch": 0.33043478260869563, + "grad_norm": 0.4609101116657257, + "learning_rate": 8e-05, + "loss": 1.2439, + "step": 5928 + }, + { + "epoch": 0.3304905239687848, + "grad_norm": 0.5388715267181396, + "learning_rate": 8e-05, + "loss": 1.5517, + "step": 5929 + }, + { + "epoch": 0.33054626532887404, + "grad_norm": 0.5388485193252563, + "learning_rate": 8e-05, + "loss": 1.6922, + "step": 5930 + }, + { + "epoch": 0.3306020066889632, + "grad_norm": 0.46442028880119324, + "learning_rate": 8e-05, + "loss": 1.5227, + "step": 5931 + }, + { + "epoch": 0.3306577480490524, + "grad_norm": 0.47519493103027344, + "learning_rate": 8e-05, + "loss": 1.7977, + "step": 5932 + }, + { + "epoch": 0.3307134894091416, + "grad_norm": 0.5591146945953369, + "learning_rate": 8e-05, + 
"loss": 1.8752, + "step": 5933 + }, + { + "epoch": 0.33076923076923076, + "grad_norm": 0.5389036536216736, + "learning_rate": 8e-05, + "loss": 1.7068, + "step": 5934 + }, + { + "epoch": 0.33082497212931994, + "grad_norm": 0.53550124168396, + "learning_rate": 8e-05, + "loss": 1.7691, + "step": 5935 + }, + { + "epoch": 0.3308807134894091, + "grad_norm": 0.49706998467445374, + "learning_rate": 8e-05, + "loss": 1.6261, + "step": 5936 + }, + { + "epoch": 0.33093645484949835, + "grad_norm": 0.5716555118560791, + "learning_rate": 8e-05, + "loss": 1.671, + "step": 5937 + }, + { + "epoch": 0.33099219620958753, + "grad_norm": 0.5462481379508972, + "learning_rate": 8e-05, + "loss": 1.6752, + "step": 5938 + }, + { + "epoch": 0.3310479375696767, + "grad_norm": 0.5143697261810303, + "learning_rate": 8e-05, + "loss": 1.6878, + "step": 5939 + }, + { + "epoch": 0.3311036789297659, + "grad_norm": 0.4737571179866791, + "learning_rate": 8e-05, + "loss": 1.5534, + "step": 5940 + }, + { + "epoch": 0.33115942028985507, + "grad_norm": 0.519430935382843, + "learning_rate": 8e-05, + "loss": 1.6366, + "step": 5941 + }, + { + "epoch": 0.33121516164994425, + "grad_norm": 0.5122805833816528, + "learning_rate": 8e-05, + "loss": 1.7942, + "step": 5942 + }, + { + "epoch": 0.3312709030100334, + "grad_norm": 0.5048823952674866, + "learning_rate": 8e-05, + "loss": 1.6511, + "step": 5943 + }, + { + "epoch": 0.33132664437012266, + "grad_norm": 0.4657517075538635, + "learning_rate": 8e-05, + "loss": 1.6489, + "step": 5944 + }, + { + "epoch": 0.33138238573021184, + "grad_norm": 0.515855610370636, + "learning_rate": 8e-05, + "loss": 1.6975, + "step": 5945 + }, + { + "epoch": 0.331438127090301, + "grad_norm": 0.532136082649231, + "learning_rate": 8e-05, + "loss": 1.6791, + "step": 5946 + }, + { + "epoch": 0.3314938684503902, + "grad_norm": 0.541500449180603, + "learning_rate": 8e-05, + "loss": 1.7849, + "step": 5947 + }, + { + "epoch": 0.3315496098104794, + "grad_norm": 0.584726870059967, + 
"learning_rate": 8e-05, + "loss": 1.7246, + "step": 5948 + }, + { + "epoch": 0.33160535117056855, + "grad_norm": 0.5696508884429932, + "learning_rate": 8e-05, + "loss": 1.9362, + "step": 5949 + }, + { + "epoch": 0.33166109253065773, + "grad_norm": 0.5092194080352783, + "learning_rate": 8e-05, + "loss": 1.5644, + "step": 5950 + }, + { + "epoch": 0.3317168338907469, + "grad_norm": 0.5051195621490479, + "learning_rate": 8e-05, + "loss": 1.3563, + "step": 5951 + }, + { + "epoch": 0.33177257525083614, + "grad_norm": 0.5067978501319885, + "learning_rate": 8e-05, + "loss": 1.9072, + "step": 5952 + }, + { + "epoch": 0.3318283166109253, + "grad_norm": 0.5300750136375427, + "learning_rate": 8e-05, + "loss": 1.7816, + "step": 5953 + }, + { + "epoch": 0.3318840579710145, + "grad_norm": 0.5331507325172424, + "learning_rate": 8e-05, + "loss": 1.8968, + "step": 5954 + }, + { + "epoch": 0.3319397993311037, + "grad_norm": 0.480129212141037, + "learning_rate": 8e-05, + "loss": 1.338, + "step": 5955 + }, + { + "epoch": 0.33199554069119286, + "grad_norm": 0.5224987864494324, + "learning_rate": 8e-05, + "loss": 1.7565, + "step": 5956 + }, + { + "epoch": 0.33205128205128204, + "grad_norm": 0.5832299590110779, + "learning_rate": 8e-05, + "loss": 1.8018, + "step": 5957 + }, + { + "epoch": 0.3321070234113712, + "grad_norm": 0.5152959227561951, + "learning_rate": 8e-05, + "loss": 1.7769, + "step": 5958 + }, + { + "epoch": 0.33216276477146045, + "grad_norm": 0.5299351215362549, + "learning_rate": 8e-05, + "loss": 1.7476, + "step": 5959 + }, + { + "epoch": 0.33221850613154963, + "grad_norm": 0.5079212784767151, + "learning_rate": 8e-05, + "loss": 1.5925, + "step": 5960 + }, + { + "epoch": 0.3322742474916388, + "grad_norm": 0.5432602167129517, + "learning_rate": 8e-05, + "loss": 1.7368, + "step": 5961 + }, + { + "epoch": 0.332329988851728, + "grad_norm": 0.5113498568534851, + "learning_rate": 8e-05, + "loss": 1.9163, + "step": 5962 + }, + { + "epoch": 0.33238573021181717, + "grad_norm": 
0.5326887965202332, + "learning_rate": 8e-05, + "loss": 1.6331, + "step": 5963 + }, + { + "epoch": 0.33244147157190634, + "grad_norm": 0.5543447136878967, + "learning_rate": 8e-05, + "loss": 1.8223, + "step": 5964 + }, + { + "epoch": 0.3324972129319955, + "grad_norm": 0.4961104691028595, + "learning_rate": 8e-05, + "loss": 1.6367, + "step": 5965 + }, + { + "epoch": 0.3325529542920847, + "grad_norm": 0.4903685748577118, + "learning_rate": 8e-05, + "loss": 1.5351, + "step": 5966 + }, + { + "epoch": 0.33260869565217394, + "grad_norm": 0.5134567618370056, + "learning_rate": 8e-05, + "loss": 1.9116, + "step": 5967 + }, + { + "epoch": 0.3326644370122631, + "grad_norm": 0.5132040977478027, + "learning_rate": 8e-05, + "loss": 1.4611, + "step": 5968 + }, + { + "epoch": 0.3327201783723523, + "grad_norm": 0.5018925070762634, + "learning_rate": 8e-05, + "loss": 1.8217, + "step": 5969 + }, + { + "epoch": 0.33277591973244147, + "grad_norm": 0.6516698002815247, + "learning_rate": 8e-05, + "loss": 1.4352, + "step": 5970 + }, + { + "epoch": 0.33283166109253065, + "grad_norm": 0.5061227679252625, + "learning_rate": 8e-05, + "loss": 1.6212, + "step": 5971 + }, + { + "epoch": 0.33288740245261983, + "grad_norm": 0.4874931573867798, + "learning_rate": 8e-05, + "loss": 1.5874, + "step": 5972 + }, + { + "epoch": 0.332943143812709, + "grad_norm": 0.5170412063598633, + "learning_rate": 8e-05, + "loss": 1.8376, + "step": 5973 + }, + { + "epoch": 0.33299888517279824, + "grad_norm": 0.48148781061172485, + "learning_rate": 8e-05, + "loss": 1.6972, + "step": 5974 + }, + { + "epoch": 0.3330546265328874, + "grad_norm": 0.5671499967575073, + "learning_rate": 8e-05, + "loss": 1.8067, + "step": 5975 + }, + { + "epoch": 0.3331103678929766, + "grad_norm": 0.5110177397727966, + "learning_rate": 8e-05, + "loss": 1.7419, + "step": 5976 + }, + { + "epoch": 0.3331661092530658, + "grad_norm": 0.49894124269485474, + "learning_rate": 8e-05, + "loss": 1.847, + "step": 5977 + }, + { + "epoch": 
0.33322185061315496, + "grad_norm": 0.5159071087837219, + "learning_rate": 8e-05, + "loss": 1.759, + "step": 5978 + }, + { + "epoch": 0.33327759197324414, + "grad_norm": 0.5592484474182129, + "learning_rate": 8e-05, + "loss": 1.9145, + "step": 5979 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.5127020478248596, + "learning_rate": 8e-05, + "loss": 1.7149, + "step": 5980 + }, + { + "epoch": 0.3333890746934225, + "grad_norm": 0.5237054228782654, + "learning_rate": 8e-05, + "loss": 1.721, + "step": 5981 + }, + { + "epoch": 0.33344481605351173, + "grad_norm": 0.47445914149284363, + "learning_rate": 8e-05, + "loss": 1.5014, + "step": 5982 + }, + { + "epoch": 0.3335005574136009, + "grad_norm": 0.4995151162147522, + "learning_rate": 8e-05, + "loss": 1.7636, + "step": 5983 + }, + { + "epoch": 0.3335562987736901, + "grad_norm": 0.5311259031295776, + "learning_rate": 8e-05, + "loss": 1.8416, + "step": 5984 + }, + { + "epoch": 0.33361204013377926, + "grad_norm": 0.5449182391166687, + "learning_rate": 8e-05, + "loss": 1.9423, + "step": 5985 + }, + { + "epoch": 0.33366778149386844, + "grad_norm": 0.5676257014274597, + "learning_rate": 8e-05, + "loss": 1.8433, + "step": 5986 + }, + { + "epoch": 0.3337235228539576, + "grad_norm": 0.5525195002555847, + "learning_rate": 8e-05, + "loss": 1.7521, + "step": 5987 + }, + { + "epoch": 0.3337792642140468, + "grad_norm": 0.5231057405471802, + "learning_rate": 8e-05, + "loss": 1.7974, + "step": 5988 + }, + { + "epoch": 0.33383500557413603, + "grad_norm": 0.5249040126800537, + "learning_rate": 8e-05, + "loss": 1.5266, + "step": 5989 + }, + { + "epoch": 0.3338907469342252, + "grad_norm": 0.6000924706459045, + "learning_rate": 8e-05, + "loss": 1.9797, + "step": 5990 + }, + { + "epoch": 0.3339464882943144, + "grad_norm": 0.5164068937301636, + "learning_rate": 8e-05, + "loss": 1.6044, + "step": 5991 + }, + { + "epoch": 0.33400222965440357, + "grad_norm": 0.4820178747177124, + "learning_rate": 8e-05, + "loss": 1.6805, + "step": 5992 + 
}, + { + "epoch": 0.33405797101449275, + "grad_norm": 0.4954069256782532, + "learning_rate": 8e-05, + "loss": 1.6487, + "step": 5993 + }, + { + "epoch": 0.33411371237458193, + "grad_norm": 0.5077088475227356, + "learning_rate": 8e-05, + "loss": 1.7436, + "step": 5994 + }, + { + "epoch": 0.3341694537346711, + "grad_norm": 0.5106683373451233, + "learning_rate": 8e-05, + "loss": 1.7808, + "step": 5995 + }, + { + "epoch": 0.3342251950947603, + "grad_norm": 0.5065857768058777, + "learning_rate": 8e-05, + "loss": 1.5292, + "step": 5996 + }, + { + "epoch": 0.3342809364548495, + "grad_norm": 0.5739662647247314, + "learning_rate": 8e-05, + "loss": 1.9316, + "step": 5997 + }, + { + "epoch": 0.3343366778149387, + "grad_norm": 0.5219335556030273, + "learning_rate": 8e-05, + "loss": 1.5769, + "step": 5998 + }, + { + "epoch": 0.3343924191750279, + "grad_norm": 0.5704739689826965, + "learning_rate": 8e-05, + "loss": 2.1513, + "step": 5999 + }, + { + "epoch": 0.33444816053511706, + "grad_norm": 0.5052824020385742, + "learning_rate": 8e-05, + "loss": 1.6403, + "step": 6000 + }, + { + "epoch": 0.33450390189520623, + "grad_norm": 0.5386701226234436, + "learning_rate": 8e-05, + "loss": 1.881, + "step": 6001 + }, + { + "epoch": 0.3345596432552954, + "grad_norm": 0.5315998792648315, + "learning_rate": 8e-05, + "loss": 1.5746, + "step": 6002 + }, + { + "epoch": 0.3346153846153846, + "grad_norm": 0.5564172863960266, + "learning_rate": 8e-05, + "loss": 1.9253, + "step": 6003 + }, + { + "epoch": 0.3346711259754738, + "grad_norm": 0.500681459903717, + "learning_rate": 8e-05, + "loss": 1.9605, + "step": 6004 + }, + { + "epoch": 0.334726867335563, + "grad_norm": 0.5126969218254089, + "learning_rate": 8e-05, + "loss": 1.6541, + "step": 6005 + }, + { + "epoch": 0.3347826086956522, + "grad_norm": 0.5156999826431274, + "learning_rate": 8e-05, + "loss": 1.6307, + "step": 6006 + }, + { + "epoch": 0.33483835005574136, + "grad_norm": 0.5102741122245789, + "learning_rate": 8e-05, + "loss": 1.6506, + 
"step": 6007 + }, + { + "epoch": 0.33489409141583054, + "grad_norm": 0.5284451246261597, + "learning_rate": 8e-05, + "loss": 1.8545, + "step": 6008 + }, + { + "epoch": 0.3349498327759197, + "grad_norm": 0.5557467341423035, + "learning_rate": 8e-05, + "loss": 1.8182, + "step": 6009 + }, + { + "epoch": 0.3350055741360089, + "grad_norm": 0.4997308552265167, + "learning_rate": 8e-05, + "loss": 1.7201, + "step": 6010 + }, + { + "epoch": 0.3350613154960981, + "grad_norm": 0.5237441062927246, + "learning_rate": 8e-05, + "loss": 1.7175, + "step": 6011 + }, + { + "epoch": 0.3351170568561873, + "grad_norm": 0.5137330889701843, + "learning_rate": 8e-05, + "loss": 1.8051, + "step": 6012 + }, + { + "epoch": 0.3351727982162765, + "grad_norm": 0.5211109519004822, + "learning_rate": 8e-05, + "loss": 1.7609, + "step": 6013 + }, + { + "epoch": 0.33522853957636567, + "grad_norm": 0.5370551943778992, + "learning_rate": 8e-05, + "loss": 1.8979, + "step": 6014 + }, + { + "epoch": 0.33528428093645485, + "grad_norm": 0.5110623240470886, + "learning_rate": 8e-05, + "loss": 1.7271, + "step": 6015 + }, + { + "epoch": 0.335340022296544, + "grad_norm": 0.5268702507019043, + "learning_rate": 8e-05, + "loss": 1.8703, + "step": 6016 + }, + { + "epoch": 0.3353957636566332, + "grad_norm": 0.5116859078407288, + "learning_rate": 8e-05, + "loss": 1.6818, + "step": 6017 + }, + { + "epoch": 0.3354515050167224, + "grad_norm": 0.669144868850708, + "learning_rate": 8e-05, + "loss": 1.7151, + "step": 6018 + }, + { + "epoch": 0.3355072463768116, + "grad_norm": 0.5689597129821777, + "learning_rate": 8e-05, + "loss": 1.8494, + "step": 6019 + }, + { + "epoch": 0.3355629877369008, + "grad_norm": 0.557144820690155, + "learning_rate": 8e-05, + "loss": 1.7948, + "step": 6020 + }, + { + "epoch": 0.33561872909699, + "grad_norm": 0.5054897665977478, + "learning_rate": 8e-05, + "loss": 1.8311, + "step": 6021 + }, + { + "epoch": 0.33567447045707915, + "grad_norm": 0.516497790813446, + "learning_rate": 8e-05, + "loss": 
1.6477, + "step": 6022 + }, + { + "epoch": 0.33573021181716833, + "grad_norm": 0.534079372882843, + "learning_rate": 8e-05, + "loss": 1.5736, + "step": 6023 + }, + { + "epoch": 0.3357859531772575, + "grad_norm": 0.5206681489944458, + "learning_rate": 8e-05, + "loss": 1.7455, + "step": 6024 + }, + { + "epoch": 0.3358416945373467, + "grad_norm": 0.5690346360206604, + "learning_rate": 8e-05, + "loss": 1.7757, + "step": 6025 + }, + { + "epoch": 0.33589743589743587, + "grad_norm": 0.5441909432411194, + "learning_rate": 8e-05, + "loss": 1.8619, + "step": 6026 + }, + { + "epoch": 0.3359531772575251, + "grad_norm": 0.5242137908935547, + "learning_rate": 8e-05, + "loss": 1.618, + "step": 6027 + }, + { + "epoch": 0.3360089186176143, + "grad_norm": 0.5588452219963074, + "learning_rate": 8e-05, + "loss": 1.8945, + "step": 6028 + }, + { + "epoch": 0.33606465997770346, + "grad_norm": 0.4702637791633606, + "learning_rate": 8e-05, + "loss": 1.548, + "step": 6029 + }, + { + "epoch": 0.33612040133779264, + "grad_norm": 0.5237563848495483, + "learning_rate": 8e-05, + "loss": 1.6127, + "step": 6030 + }, + { + "epoch": 0.3361761426978818, + "grad_norm": 0.48560264706611633, + "learning_rate": 8e-05, + "loss": 1.6201, + "step": 6031 + }, + { + "epoch": 0.336231884057971, + "grad_norm": 0.47915685176849365, + "learning_rate": 8e-05, + "loss": 1.6187, + "step": 6032 + }, + { + "epoch": 0.3362876254180602, + "grad_norm": 0.5250439047813416, + "learning_rate": 8e-05, + "loss": 1.5716, + "step": 6033 + }, + { + "epoch": 0.3363433667781494, + "grad_norm": 0.4620858430862427, + "learning_rate": 8e-05, + "loss": 1.4799, + "step": 6034 + }, + { + "epoch": 0.3363991081382386, + "grad_norm": 0.5148380994796753, + "learning_rate": 8e-05, + "loss": 1.7257, + "step": 6035 + }, + { + "epoch": 0.33645484949832777, + "grad_norm": 0.5669644474983215, + "learning_rate": 8e-05, + "loss": 2.0105, + "step": 6036 + }, + { + "epoch": 0.33651059085841695, + "grad_norm": 0.4910716414451599, + "learning_rate": 
8e-05, + "loss": 1.7302, + "step": 6037 + }, + { + "epoch": 0.3365663322185061, + "grad_norm": 0.506493866443634, + "learning_rate": 8e-05, + "loss": 1.3614, + "step": 6038 + }, + { + "epoch": 0.3366220735785953, + "grad_norm": 0.5282632112503052, + "learning_rate": 8e-05, + "loss": 1.9953, + "step": 6039 + }, + { + "epoch": 0.3366778149386845, + "grad_norm": 0.49607253074645996, + "learning_rate": 8e-05, + "loss": 1.6552, + "step": 6040 + }, + { + "epoch": 0.3367335562987737, + "grad_norm": 0.5147976875305176, + "learning_rate": 8e-05, + "loss": 1.806, + "step": 6041 + }, + { + "epoch": 0.3367892976588629, + "grad_norm": 0.49952930212020874, + "learning_rate": 8e-05, + "loss": 1.5324, + "step": 6042 + }, + { + "epoch": 0.3368450390189521, + "grad_norm": 0.5874413251876831, + "learning_rate": 8e-05, + "loss": 2.2245, + "step": 6043 + }, + { + "epoch": 0.33690078037904125, + "grad_norm": 0.5175328850746155, + "learning_rate": 8e-05, + "loss": 1.762, + "step": 6044 + }, + { + "epoch": 0.33695652173913043, + "grad_norm": 0.5300764441490173, + "learning_rate": 8e-05, + "loss": 1.756, + "step": 6045 + }, + { + "epoch": 0.3370122630992196, + "grad_norm": 0.524803102016449, + "learning_rate": 8e-05, + "loss": 1.85, + "step": 6046 + }, + { + "epoch": 0.3370680044593088, + "grad_norm": 0.5062090158462524, + "learning_rate": 8e-05, + "loss": 1.4843, + "step": 6047 + }, + { + "epoch": 0.33712374581939797, + "grad_norm": 0.5162714123725891, + "learning_rate": 8e-05, + "loss": 1.9312, + "step": 6048 + }, + { + "epoch": 0.3371794871794872, + "grad_norm": 0.4670993685722351, + "learning_rate": 8e-05, + "loss": 1.5798, + "step": 6049 + }, + { + "epoch": 0.3372352285395764, + "grad_norm": 0.5087465643882751, + "learning_rate": 8e-05, + "loss": 1.7965, + "step": 6050 + }, + { + "epoch": 0.33729096989966556, + "grad_norm": 0.5819141864776611, + "learning_rate": 8e-05, + "loss": 1.9564, + "step": 6051 + }, + { + "epoch": 0.33734671125975474, + "grad_norm": 0.5214961171150208, + 
"learning_rate": 8e-05, + "loss": 1.6992, + "step": 6052 + }, + { + "epoch": 0.3374024526198439, + "grad_norm": 0.5660673379898071, + "learning_rate": 8e-05, + "loss": 1.8737, + "step": 6053 + }, + { + "epoch": 0.3374581939799331, + "grad_norm": 0.5306689739227295, + "learning_rate": 8e-05, + "loss": 1.3794, + "step": 6054 + }, + { + "epoch": 0.3375139353400223, + "grad_norm": 0.5800758004188538, + "learning_rate": 8e-05, + "loss": 1.6946, + "step": 6055 + }, + { + "epoch": 0.3375696767001115, + "grad_norm": 0.5740100741386414, + "learning_rate": 8e-05, + "loss": 2.1177, + "step": 6056 + }, + { + "epoch": 0.3376254180602007, + "grad_norm": 0.571855366230011, + "learning_rate": 8e-05, + "loss": 2.0269, + "step": 6057 + }, + { + "epoch": 0.33768115942028987, + "grad_norm": 0.5222740769386292, + "learning_rate": 8e-05, + "loss": 1.575, + "step": 6058 + }, + { + "epoch": 0.33773690078037905, + "grad_norm": 0.46818023920059204, + "learning_rate": 8e-05, + "loss": 1.3433, + "step": 6059 + }, + { + "epoch": 0.3377926421404682, + "grad_norm": 0.5272983312606812, + "learning_rate": 8e-05, + "loss": 1.4582, + "step": 6060 + }, + { + "epoch": 0.3378483835005574, + "grad_norm": 0.5301730632781982, + "learning_rate": 8e-05, + "loss": 1.7741, + "step": 6061 + }, + { + "epoch": 0.3379041248606466, + "grad_norm": 0.5356550812721252, + "learning_rate": 8e-05, + "loss": 1.7668, + "step": 6062 + }, + { + "epoch": 0.33795986622073576, + "grad_norm": 0.5093060731887817, + "learning_rate": 8e-05, + "loss": 1.7492, + "step": 6063 + }, + { + "epoch": 0.338015607580825, + "grad_norm": 0.4938412308692932, + "learning_rate": 8e-05, + "loss": 1.621, + "step": 6064 + }, + { + "epoch": 0.3380713489409142, + "grad_norm": 0.5446529388427734, + "learning_rate": 8e-05, + "loss": 1.7603, + "step": 6065 + }, + { + "epoch": 0.33812709030100335, + "grad_norm": 0.5054848194122314, + "learning_rate": 8e-05, + "loss": 1.4763, + "step": 6066 + }, + { + "epoch": 0.33818283166109253, + "grad_norm": 
0.48166799545288086, + "learning_rate": 8e-05, + "loss": 1.4409, + "step": 6067 + }, + { + "epoch": 0.3382385730211817, + "grad_norm": 0.5645819306373596, + "learning_rate": 8e-05, + "loss": 1.5985, + "step": 6068 + }, + { + "epoch": 0.3382943143812709, + "grad_norm": 0.5451626181602478, + "learning_rate": 8e-05, + "loss": 1.7397, + "step": 6069 + }, + { + "epoch": 0.33835005574136007, + "grad_norm": 0.4711497128009796, + "learning_rate": 8e-05, + "loss": 1.577, + "step": 6070 + }, + { + "epoch": 0.3384057971014493, + "grad_norm": 0.47993114590644836, + "learning_rate": 8e-05, + "loss": 1.5361, + "step": 6071 + }, + { + "epoch": 0.3384615384615385, + "grad_norm": 0.5799311995506287, + "learning_rate": 8e-05, + "loss": 1.7867, + "step": 6072 + }, + { + "epoch": 0.33851727982162766, + "grad_norm": 0.542795717716217, + "learning_rate": 8e-05, + "loss": 1.7606, + "step": 6073 + }, + { + "epoch": 0.33857302118171684, + "grad_norm": 0.5645315051078796, + "learning_rate": 8e-05, + "loss": 1.9333, + "step": 6074 + }, + { + "epoch": 0.338628762541806, + "grad_norm": 0.50261390209198, + "learning_rate": 8e-05, + "loss": 1.5687, + "step": 6075 + }, + { + "epoch": 0.3386845039018952, + "grad_norm": 0.5475636124610901, + "learning_rate": 8e-05, + "loss": 1.8117, + "step": 6076 + }, + { + "epoch": 0.3387402452619844, + "grad_norm": 0.5432072877883911, + "learning_rate": 8e-05, + "loss": 1.6912, + "step": 6077 + }, + { + "epoch": 0.33879598662207355, + "grad_norm": 0.48782506585121155, + "learning_rate": 8e-05, + "loss": 1.6705, + "step": 6078 + }, + { + "epoch": 0.3388517279821628, + "grad_norm": 0.552790641784668, + "learning_rate": 8e-05, + "loss": 1.941, + "step": 6079 + }, + { + "epoch": 0.33890746934225197, + "grad_norm": 0.45601341128349304, + "learning_rate": 8e-05, + "loss": 1.4809, + "step": 6080 + }, + { + "epoch": 0.33896321070234114, + "grad_norm": 0.5363773703575134, + "learning_rate": 8e-05, + "loss": 1.6767, + "step": 6081 + }, + { + "epoch": 0.3390189520624303, + 
"grad_norm": 0.5174465179443359, + "learning_rate": 8e-05, + "loss": 1.6438, + "step": 6082 + }, + { + "epoch": 0.3390746934225195, + "grad_norm": 0.5154566764831543, + "learning_rate": 8e-05, + "loss": 1.637, + "step": 6083 + }, + { + "epoch": 0.3391304347826087, + "grad_norm": 0.4398244619369507, + "learning_rate": 8e-05, + "loss": 1.3843, + "step": 6084 + }, + { + "epoch": 0.33918617614269786, + "grad_norm": 0.5024697780609131, + "learning_rate": 8e-05, + "loss": 1.6403, + "step": 6085 + }, + { + "epoch": 0.3392419175027871, + "grad_norm": 0.5018582344055176, + "learning_rate": 8e-05, + "loss": 1.5614, + "step": 6086 + }, + { + "epoch": 0.3392976588628763, + "grad_norm": 0.5023205280303955, + "learning_rate": 8e-05, + "loss": 1.819, + "step": 6087 + }, + { + "epoch": 0.33935340022296545, + "grad_norm": 0.5069661140441895, + "learning_rate": 8e-05, + "loss": 1.6936, + "step": 6088 + }, + { + "epoch": 0.33940914158305463, + "grad_norm": 0.49317628145217896, + "learning_rate": 8e-05, + "loss": 1.4198, + "step": 6089 + }, + { + "epoch": 0.3394648829431438, + "grad_norm": 0.4833192825317383, + "learning_rate": 8e-05, + "loss": 1.6803, + "step": 6090 + }, + { + "epoch": 0.339520624303233, + "grad_norm": 0.6212770938873291, + "learning_rate": 8e-05, + "loss": 2.2949, + "step": 6091 + }, + { + "epoch": 0.33957636566332217, + "grad_norm": 0.486361026763916, + "learning_rate": 8e-05, + "loss": 1.7127, + "step": 6092 + }, + { + "epoch": 0.33963210702341134, + "grad_norm": 0.5241421461105347, + "learning_rate": 8e-05, + "loss": 1.8728, + "step": 6093 + }, + { + "epoch": 0.3396878483835006, + "grad_norm": 0.5817832946777344, + "learning_rate": 8e-05, + "loss": 1.8213, + "step": 6094 + }, + { + "epoch": 0.33974358974358976, + "grad_norm": 0.5687096118927002, + "learning_rate": 8e-05, + "loss": 1.8387, + "step": 6095 + }, + { + "epoch": 0.33979933110367894, + "grad_norm": 0.5000899434089661, + "learning_rate": 8e-05, + "loss": 1.4907, + "step": 6096 + }, + { + "epoch": 
0.3398550724637681, + "grad_norm": 0.5093439221382141, + "learning_rate": 8e-05, + "loss": 1.7178, + "step": 6097 + }, + { + "epoch": 0.3399108138238573, + "grad_norm": 0.487914115190506, + "learning_rate": 8e-05, + "loss": 1.4353, + "step": 6098 + }, + { + "epoch": 0.3399665551839465, + "grad_norm": 0.5182102918624878, + "learning_rate": 8e-05, + "loss": 1.833, + "step": 6099 + }, + { + "epoch": 0.34002229654403565, + "grad_norm": 0.45939892530441284, + "learning_rate": 8e-05, + "loss": 1.4572, + "step": 6100 + }, + { + "epoch": 0.3400780379041249, + "grad_norm": 0.4695865213871002, + "learning_rate": 8e-05, + "loss": 1.5965, + "step": 6101 + }, + { + "epoch": 0.34013377926421406, + "grad_norm": 0.54413241147995, + "learning_rate": 8e-05, + "loss": 1.8549, + "step": 6102 + }, + { + "epoch": 0.34018952062430324, + "grad_norm": 0.5373137593269348, + "learning_rate": 8e-05, + "loss": 1.6635, + "step": 6103 + }, + { + "epoch": 0.3402452619843924, + "grad_norm": 0.5292257070541382, + "learning_rate": 8e-05, + "loss": 1.6941, + "step": 6104 + }, + { + "epoch": 0.3403010033444816, + "grad_norm": 0.5034290552139282, + "learning_rate": 8e-05, + "loss": 1.8359, + "step": 6105 + }, + { + "epoch": 0.3403567447045708, + "grad_norm": 0.5438308119773865, + "learning_rate": 8e-05, + "loss": 1.6713, + "step": 6106 + }, + { + "epoch": 0.34041248606465996, + "grad_norm": 0.5302205085754395, + "learning_rate": 8e-05, + "loss": 1.761, + "step": 6107 + }, + { + "epoch": 0.34046822742474914, + "grad_norm": 0.5735040903091431, + "learning_rate": 8e-05, + "loss": 1.8512, + "step": 6108 + }, + { + "epoch": 0.34052396878483837, + "grad_norm": 0.542824923992157, + "learning_rate": 8e-05, + "loss": 1.6337, + "step": 6109 + }, + { + "epoch": 0.34057971014492755, + "grad_norm": 0.532397449016571, + "learning_rate": 8e-05, + "loss": 1.6446, + "step": 6110 + }, + { + "epoch": 0.34063545150501673, + "grad_norm": 0.5326183438301086, + "learning_rate": 8e-05, + "loss": 1.6518, + "step": 6111 + }, + 
{ + "epoch": 0.3406911928651059, + "grad_norm": 0.560077965259552, + "learning_rate": 8e-05, + "loss": 1.736, + "step": 6112 + }, + { + "epoch": 0.3407469342251951, + "grad_norm": 0.56466144323349, + "learning_rate": 8e-05, + "loss": 1.7615, + "step": 6113 + }, + { + "epoch": 0.34080267558528426, + "grad_norm": 0.5226140022277832, + "learning_rate": 8e-05, + "loss": 1.9275, + "step": 6114 + }, + { + "epoch": 0.34085841694537344, + "grad_norm": 0.48684361577033997, + "learning_rate": 8e-05, + "loss": 1.164, + "step": 6115 + }, + { + "epoch": 0.3409141583054627, + "grad_norm": 0.5470103621482849, + "learning_rate": 8e-05, + "loss": 1.9557, + "step": 6116 + }, + { + "epoch": 0.34096989966555186, + "grad_norm": 0.4782428741455078, + "learning_rate": 8e-05, + "loss": 1.5178, + "step": 6117 + }, + { + "epoch": 0.34102564102564104, + "grad_norm": 0.49948710203170776, + "learning_rate": 8e-05, + "loss": 1.6443, + "step": 6118 + }, + { + "epoch": 0.3410813823857302, + "grad_norm": 0.5285187363624573, + "learning_rate": 8e-05, + "loss": 1.8012, + "step": 6119 + }, + { + "epoch": 0.3411371237458194, + "grad_norm": 0.5237113237380981, + "learning_rate": 8e-05, + "loss": 1.6156, + "step": 6120 + }, + { + "epoch": 0.34119286510590857, + "grad_norm": 0.5836105346679688, + "learning_rate": 8e-05, + "loss": 1.7364, + "step": 6121 + }, + { + "epoch": 0.34124860646599775, + "grad_norm": 0.5258899927139282, + "learning_rate": 8e-05, + "loss": 1.6938, + "step": 6122 + }, + { + "epoch": 0.34130434782608693, + "grad_norm": 0.5039440393447876, + "learning_rate": 8e-05, + "loss": 1.6582, + "step": 6123 + }, + { + "epoch": 0.34136008918617616, + "grad_norm": 0.5295379161834717, + "learning_rate": 8e-05, + "loss": 1.7974, + "step": 6124 + }, + { + "epoch": 0.34141583054626534, + "grad_norm": 0.5055199861526489, + "learning_rate": 8e-05, + "loss": 1.6236, + "step": 6125 + }, + { + "epoch": 0.3414715719063545, + "grad_norm": 0.516697108745575, + "learning_rate": 8e-05, + "loss": 1.5589, + 
"step": 6126 + }, + { + "epoch": 0.3415273132664437, + "grad_norm": 0.5174638628959656, + "learning_rate": 8e-05, + "loss": 1.4963, + "step": 6127 + }, + { + "epoch": 0.3415830546265329, + "grad_norm": 0.4991219937801361, + "learning_rate": 8e-05, + "loss": 1.4234, + "step": 6128 + }, + { + "epoch": 0.34163879598662206, + "grad_norm": 0.5199254155158997, + "learning_rate": 8e-05, + "loss": 1.6689, + "step": 6129 + }, + { + "epoch": 0.34169453734671124, + "grad_norm": 0.5277366042137146, + "learning_rate": 8e-05, + "loss": 1.8916, + "step": 6130 + }, + { + "epoch": 0.34175027870680047, + "grad_norm": 0.5037539601325989, + "learning_rate": 8e-05, + "loss": 1.7343, + "step": 6131 + }, + { + "epoch": 0.34180602006688965, + "grad_norm": 0.5122715830802917, + "learning_rate": 8e-05, + "loss": 1.673, + "step": 6132 + }, + { + "epoch": 0.3418617614269788, + "grad_norm": 0.4962614178657532, + "learning_rate": 8e-05, + "loss": 1.8025, + "step": 6133 + }, + { + "epoch": 0.341917502787068, + "grad_norm": 0.48662999272346497, + "learning_rate": 8e-05, + "loss": 1.5983, + "step": 6134 + }, + { + "epoch": 0.3419732441471572, + "grad_norm": 0.5290865302085876, + "learning_rate": 8e-05, + "loss": 1.6749, + "step": 6135 + }, + { + "epoch": 0.34202898550724636, + "grad_norm": 0.4744897782802582, + "learning_rate": 8e-05, + "loss": 1.452, + "step": 6136 + }, + { + "epoch": 0.34208472686733554, + "grad_norm": 0.5886744856834412, + "learning_rate": 8e-05, + "loss": 1.9132, + "step": 6137 + }, + { + "epoch": 0.3421404682274248, + "grad_norm": 0.5426172614097595, + "learning_rate": 8e-05, + "loss": 1.8144, + "step": 6138 + }, + { + "epoch": 0.34219620958751396, + "grad_norm": 0.5077410936355591, + "learning_rate": 8e-05, + "loss": 1.592, + "step": 6139 + }, + { + "epoch": 0.34225195094760313, + "grad_norm": 0.4840519428253174, + "learning_rate": 8e-05, + "loss": 1.581, + "step": 6140 + }, + { + "epoch": 0.3423076923076923, + "grad_norm": 0.5276888608932495, + "learning_rate": 8e-05, + 
"loss": 1.8234, + "step": 6141 + }, + { + "epoch": 0.3423634336677815, + "grad_norm": 0.5416687726974487, + "learning_rate": 8e-05, + "loss": 1.8545, + "step": 6142 + }, + { + "epoch": 0.34241917502787067, + "grad_norm": 0.48407217860221863, + "learning_rate": 8e-05, + "loss": 1.7246, + "step": 6143 + }, + { + "epoch": 0.34247491638795985, + "grad_norm": 0.5333887934684753, + "learning_rate": 8e-05, + "loss": 1.7349, + "step": 6144 + }, + { + "epoch": 0.34253065774804903, + "grad_norm": 0.49881041049957275, + "learning_rate": 8e-05, + "loss": 1.7543, + "step": 6145 + }, + { + "epoch": 0.34258639910813826, + "grad_norm": 0.5575197339057922, + "learning_rate": 8e-05, + "loss": 1.8022, + "step": 6146 + }, + { + "epoch": 0.34264214046822744, + "grad_norm": 0.5086601972579956, + "learning_rate": 8e-05, + "loss": 1.7278, + "step": 6147 + }, + { + "epoch": 0.3426978818283166, + "grad_norm": 0.5459162592887878, + "learning_rate": 8e-05, + "loss": 1.7454, + "step": 6148 + }, + { + "epoch": 0.3427536231884058, + "grad_norm": 0.5540486574172974, + "learning_rate": 8e-05, + "loss": 1.9281, + "step": 6149 + }, + { + "epoch": 0.342809364548495, + "grad_norm": 0.5860450267791748, + "learning_rate": 8e-05, + "loss": 1.8497, + "step": 6150 + }, + { + "epoch": 0.34286510590858416, + "grad_norm": 0.48789045214653015, + "learning_rate": 8e-05, + "loss": 1.5751, + "step": 6151 + }, + { + "epoch": 0.34292084726867333, + "grad_norm": 0.5314328670501709, + "learning_rate": 8e-05, + "loss": 1.7379, + "step": 6152 + }, + { + "epoch": 0.34297658862876257, + "grad_norm": 0.5092215538024902, + "learning_rate": 8e-05, + "loss": 1.6208, + "step": 6153 + }, + { + "epoch": 0.34303232998885175, + "grad_norm": 0.5262496471405029, + "learning_rate": 8e-05, + "loss": 1.6981, + "step": 6154 + }, + { + "epoch": 0.3430880713489409, + "grad_norm": 0.5319823622703552, + "learning_rate": 8e-05, + "loss": 1.6881, + "step": 6155 + }, + { + "epoch": 0.3431438127090301, + "grad_norm": 0.5669171214103699, + 
"learning_rate": 8e-05, + "loss": 1.9915, + "step": 6156 + }, + { + "epoch": 0.3431995540691193, + "grad_norm": 0.5114725232124329, + "learning_rate": 8e-05, + "loss": 1.7512, + "step": 6157 + }, + { + "epoch": 0.34325529542920846, + "grad_norm": 0.48004892468452454, + "learning_rate": 8e-05, + "loss": 1.3144, + "step": 6158 + }, + { + "epoch": 0.34331103678929764, + "grad_norm": 0.523747444152832, + "learning_rate": 8e-05, + "loss": 1.684, + "step": 6159 + }, + { + "epoch": 0.3433667781493868, + "grad_norm": 0.522097647190094, + "learning_rate": 8e-05, + "loss": 1.7017, + "step": 6160 + }, + { + "epoch": 0.34342251950947605, + "grad_norm": 0.5229143500328064, + "learning_rate": 8e-05, + "loss": 1.8869, + "step": 6161 + }, + { + "epoch": 0.34347826086956523, + "grad_norm": 0.5871850252151489, + "learning_rate": 8e-05, + "loss": 1.9267, + "step": 6162 + }, + { + "epoch": 0.3435340022296544, + "grad_norm": 0.4945095479488373, + "learning_rate": 8e-05, + "loss": 1.5654, + "step": 6163 + }, + { + "epoch": 0.3435897435897436, + "grad_norm": 0.5154891610145569, + "learning_rate": 8e-05, + "loss": 1.4761, + "step": 6164 + }, + { + "epoch": 0.34364548494983277, + "grad_norm": 0.5434823632240295, + "learning_rate": 8e-05, + "loss": 1.6896, + "step": 6165 + }, + { + "epoch": 0.34370122630992195, + "grad_norm": 0.5289078950881958, + "learning_rate": 8e-05, + "loss": 1.3661, + "step": 6166 + }, + { + "epoch": 0.3437569676700111, + "grad_norm": 0.507692813873291, + "learning_rate": 8e-05, + "loss": 1.7057, + "step": 6167 + }, + { + "epoch": 0.34381270903010036, + "grad_norm": 0.5627875924110413, + "learning_rate": 8e-05, + "loss": 1.7958, + "step": 6168 + }, + { + "epoch": 0.34386845039018954, + "grad_norm": 0.5589924454689026, + "learning_rate": 8e-05, + "loss": 1.8935, + "step": 6169 + }, + { + "epoch": 0.3439241917502787, + "grad_norm": 0.5078762769699097, + "learning_rate": 8e-05, + "loss": 1.7232, + "step": 6170 + }, + { + "epoch": 0.3439799331103679, + "grad_norm": 
0.4899272918701172, + "learning_rate": 8e-05, + "loss": 1.5772, + "step": 6171 + }, + { + "epoch": 0.3440356744704571, + "grad_norm": 0.5079790353775024, + "learning_rate": 8e-05, + "loss": 1.4953, + "step": 6172 + }, + { + "epoch": 0.34409141583054625, + "grad_norm": 0.5527381896972656, + "learning_rate": 8e-05, + "loss": 1.6653, + "step": 6173 + }, + { + "epoch": 0.34414715719063543, + "grad_norm": 0.5279654264450073, + "learning_rate": 8e-05, + "loss": 1.8035, + "step": 6174 + }, + { + "epoch": 0.3442028985507246, + "grad_norm": 0.572637677192688, + "learning_rate": 8e-05, + "loss": 1.7902, + "step": 6175 + }, + { + "epoch": 0.34425863991081385, + "grad_norm": 0.5261125564575195, + "learning_rate": 8e-05, + "loss": 1.8534, + "step": 6176 + }, + { + "epoch": 0.344314381270903, + "grad_norm": 0.521367073059082, + "learning_rate": 8e-05, + "loss": 1.5969, + "step": 6177 + }, + { + "epoch": 0.3443701226309922, + "grad_norm": 0.4918879568576813, + "learning_rate": 8e-05, + "loss": 1.7713, + "step": 6178 + }, + { + "epoch": 0.3444258639910814, + "grad_norm": 0.5039586424827576, + "learning_rate": 8e-05, + "loss": 1.6867, + "step": 6179 + }, + { + "epoch": 0.34448160535117056, + "grad_norm": 0.5196630954742432, + "learning_rate": 8e-05, + "loss": 1.8582, + "step": 6180 + }, + { + "epoch": 0.34453734671125974, + "grad_norm": 0.5284832715988159, + "learning_rate": 8e-05, + "loss": 1.7363, + "step": 6181 + }, + { + "epoch": 0.3445930880713489, + "grad_norm": 0.5373867750167847, + "learning_rate": 8e-05, + "loss": 1.5278, + "step": 6182 + }, + { + "epoch": 0.34464882943143815, + "grad_norm": 0.5544673800468445, + "learning_rate": 8e-05, + "loss": 1.7299, + "step": 6183 + }, + { + "epoch": 0.34470457079152733, + "grad_norm": 0.5601715445518494, + "learning_rate": 8e-05, + "loss": 1.7765, + "step": 6184 + }, + { + "epoch": 0.3447603121516165, + "grad_norm": 0.564301073551178, + "learning_rate": 8e-05, + "loss": 1.649, + "step": 6185 + }, + { + "epoch": 0.3448160535117057, + 
"grad_norm": 0.5370534062385559, + "learning_rate": 8e-05, + "loss": 2.023, + "step": 6186 + }, + { + "epoch": 0.34487179487179487, + "grad_norm": 0.5042305588722229, + "learning_rate": 8e-05, + "loss": 1.7116, + "step": 6187 + }, + { + "epoch": 0.34492753623188405, + "grad_norm": 0.4756479859352112, + "learning_rate": 8e-05, + "loss": 1.6318, + "step": 6188 + }, + { + "epoch": 0.3449832775919732, + "grad_norm": 0.573596715927124, + "learning_rate": 8e-05, + "loss": 1.8775, + "step": 6189 + }, + { + "epoch": 0.3450390189520624, + "grad_norm": 0.5731935501098633, + "learning_rate": 8e-05, + "loss": 1.5896, + "step": 6190 + }, + { + "epoch": 0.34509476031215164, + "grad_norm": 0.48869970440864563, + "learning_rate": 8e-05, + "loss": 1.7313, + "step": 6191 + }, + { + "epoch": 0.3451505016722408, + "grad_norm": 0.5444765686988831, + "learning_rate": 8e-05, + "loss": 1.7901, + "step": 6192 + }, + { + "epoch": 0.34520624303233, + "grad_norm": 0.5213355422019958, + "learning_rate": 8e-05, + "loss": 1.7404, + "step": 6193 + }, + { + "epoch": 0.3452619843924192, + "grad_norm": 0.4946276545524597, + "learning_rate": 8e-05, + "loss": 1.6513, + "step": 6194 + }, + { + "epoch": 0.34531772575250835, + "grad_norm": 0.505746603012085, + "learning_rate": 8e-05, + "loss": 1.5607, + "step": 6195 + }, + { + "epoch": 0.34537346711259753, + "grad_norm": 0.48780500888824463, + "learning_rate": 8e-05, + "loss": 1.6017, + "step": 6196 + }, + { + "epoch": 0.3454292084726867, + "grad_norm": 0.527519941329956, + "learning_rate": 8e-05, + "loss": 1.5679, + "step": 6197 + }, + { + "epoch": 0.34548494983277594, + "grad_norm": 0.5447553992271423, + "learning_rate": 8e-05, + "loss": 1.8504, + "step": 6198 + }, + { + "epoch": 0.3455406911928651, + "grad_norm": 0.505783200263977, + "learning_rate": 8e-05, + "loss": 1.7171, + "step": 6199 + }, + { + "epoch": 0.3455964325529543, + "grad_norm": 0.5543640851974487, + "learning_rate": 8e-05, + "loss": 1.7813, + "step": 6200 + }, + { + "epoch": 
0.3456521739130435, + "grad_norm": 0.6018317341804504, + "learning_rate": 8e-05, + "loss": 1.8848, + "step": 6201 + }, + { + "epoch": 0.34570791527313266, + "grad_norm": 0.5265443921089172, + "learning_rate": 8e-05, + "loss": 1.7686, + "step": 6202 + }, + { + "epoch": 0.34576365663322184, + "grad_norm": 0.48967650532722473, + "learning_rate": 8e-05, + "loss": 1.5091, + "step": 6203 + }, + { + "epoch": 0.345819397993311, + "grad_norm": 0.5279107093811035, + "learning_rate": 8e-05, + "loss": 1.6745, + "step": 6204 + }, + { + "epoch": 0.3458751393534002, + "grad_norm": 0.5227549076080322, + "learning_rate": 8e-05, + "loss": 1.7417, + "step": 6205 + }, + { + "epoch": 0.34593088071348943, + "grad_norm": 0.5156877040863037, + "learning_rate": 8e-05, + "loss": 1.6155, + "step": 6206 + }, + { + "epoch": 0.3459866220735786, + "grad_norm": 0.48437830805778503, + "learning_rate": 8e-05, + "loss": 1.4722, + "step": 6207 + }, + { + "epoch": 0.3460423634336678, + "grad_norm": 0.4888782203197479, + "learning_rate": 8e-05, + "loss": 1.7159, + "step": 6208 + }, + { + "epoch": 0.34609810479375697, + "grad_norm": 0.5110427141189575, + "learning_rate": 8e-05, + "loss": 1.6829, + "step": 6209 + }, + { + "epoch": 0.34615384615384615, + "grad_norm": 0.47628217935562134, + "learning_rate": 8e-05, + "loss": 1.6921, + "step": 6210 + }, + { + "epoch": 0.3462095875139353, + "grad_norm": 0.5243502259254456, + "learning_rate": 8e-05, + "loss": 1.7033, + "step": 6211 + }, + { + "epoch": 0.3462653288740245, + "grad_norm": 0.481799453496933, + "learning_rate": 8e-05, + "loss": 1.6188, + "step": 6212 + }, + { + "epoch": 0.34632107023411374, + "grad_norm": 0.4980103075504303, + "learning_rate": 8e-05, + "loss": 1.6574, + "step": 6213 + }, + { + "epoch": 0.3463768115942029, + "grad_norm": 0.5031689405441284, + "learning_rate": 8e-05, + "loss": 1.632, + "step": 6214 + }, + { + "epoch": 0.3464325529542921, + "grad_norm": 0.5591900944709778, + "learning_rate": 8e-05, + "loss": 1.8869, + "step": 6215 + 
}, + { + "epoch": 0.3464882943143813, + "grad_norm": 0.5446702837944031, + "learning_rate": 8e-05, + "loss": 1.7415, + "step": 6216 + }, + { + "epoch": 0.34654403567447045, + "grad_norm": 0.5175807476043701, + "learning_rate": 8e-05, + "loss": 1.6912, + "step": 6217 + }, + { + "epoch": 0.34659977703455963, + "grad_norm": 0.5340683460235596, + "learning_rate": 8e-05, + "loss": 1.7797, + "step": 6218 + }, + { + "epoch": 0.3466555183946488, + "grad_norm": 0.5005671977996826, + "learning_rate": 8e-05, + "loss": 1.6165, + "step": 6219 + }, + { + "epoch": 0.346711259754738, + "grad_norm": 0.5165536403656006, + "learning_rate": 8e-05, + "loss": 1.6956, + "step": 6220 + }, + { + "epoch": 0.3467670011148272, + "grad_norm": 0.5185880661010742, + "learning_rate": 8e-05, + "loss": 1.6665, + "step": 6221 + }, + { + "epoch": 0.3468227424749164, + "grad_norm": 0.5302398800849915, + "learning_rate": 8e-05, + "loss": 1.8853, + "step": 6222 + }, + { + "epoch": 0.3468784838350056, + "grad_norm": 0.4764886200428009, + "learning_rate": 8e-05, + "loss": 1.5683, + "step": 6223 + }, + { + "epoch": 0.34693422519509476, + "grad_norm": 0.515983521938324, + "learning_rate": 8e-05, + "loss": 1.6717, + "step": 6224 + }, + { + "epoch": 0.34698996655518394, + "grad_norm": 0.5122112035751343, + "learning_rate": 8e-05, + "loss": 1.7558, + "step": 6225 + }, + { + "epoch": 0.3470457079152731, + "grad_norm": 0.5676699876785278, + "learning_rate": 8e-05, + "loss": 2.0176, + "step": 6226 + }, + { + "epoch": 0.3471014492753623, + "grad_norm": 0.5953008532524109, + "learning_rate": 8e-05, + "loss": 1.8701, + "step": 6227 + }, + { + "epoch": 0.34715719063545153, + "grad_norm": 0.5327503681182861, + "learning_rate": 8e-05, + "loss": 1.7862, + "step": 6228 + }, + { + "epoch": 0.3472129319955407, + "grad_norm": 0.514793872833252, + "learning_rate": 8e-05, + "loss": 1.7198, + "step": 6229 + }, + { + "epoch": 0.3472686733556299, + "grad_norm": 0.531711757183075, + "learning_rate": 8e-05, + "loss": 1.7455, + 
"step": 6230 + }, + { + "epoch": 0.34732441471571907, + "grad_norm": 0.5069992542266846, + "learning_rate": 8e-05, + "loss": 1.6869, + "step": 6231 + }, + { + "epoch": 0.34738015607580824, + "grad_norm": 0.5153529644012451, + "learning_rate": 8e-05, + "loss": 1.8834, + "step": 6232 + }, + { + "epoch": 0.3474358974358974, + "grad_norm": 0.4857705235481262, + "learning_rate": 8e-05, + "loss": 1.6374, + "step": 6233 + }, + { + "epoch": 0.3474916387959866, + "grad_norm": 0.5472427606582642, + "learning_rate": 8e-05, + "loss": 1.9739, + "step": 6234 + }, + { + "epoch": 0.34754738015607584, + "grad_norm": 0.5616702437400818, + "learning_rate": 8e-05, + "loss": 1.6326, + "step": 6235 + }, + { + "epoch": 0.347603121516165, + "grad_norm": 0.5345883369445801, + "learning_rate": 8e-05, + "loss": 1.7221, + "step": 6236 + }, + { + "epoch": 0.3476588628762542, + "grad_norm": 0.48029625415802, + "learning_rate": 8e-05, + "loss": 1.5286, + "step": 6237 + }, + { + "epoch": 0.34771460423634337, + "grad_norm": 0.5803802013397217, + "learning_rate": 8e-05, + "loss": 1.7394, + "step": 6238 + }, + { + "epoch": 0.34777034559643255, + "grad_norm": 0.523873507976532, + "learning_rate": 8e-05, + "loss": 1.7965, + "step": 6239 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.48509761691093445, + "learning_rate": 8e-05, + "loss": 1.5798, + "step": 6240 + }, + { + "epoch": 0.3478818283166109, + "grad_norm": 0.5138911008834839, + "learning_rate": 8e-05, + "loss": 1.8276, + "step": 6241 + }, + { + "epoch": 0.3479375696767001, + "grad_norm": 0.5050959587097168, + "learning_rate": 8e-05, + "loss": 1.6347, + "step": 6242 + }, + { + "epoch": 0.3479933110367893, + "grad_norm": 0.4868180751800537, + "learning_rate": 8e-05, + "loss": 1.561, + "step": 6243 + }, + { + "epoch": 0.3480490523968785, + "grad_norm": 0.48366305232048035, + "learning_rate": 8e-05, + "loss": 1.8501, + "step": 6244 + }, + { + "epoch": 0.3481047937569677, + "grad_norm": 0.5158886313438416, + "learning_rate": 8e-05, + 
"loss": 1.7387, + "step": 6245 + }, + { + "epoch": 0.34816053511705686, + "grad_norm": 0.516280472278595, + "learning_rate": 8e-05, + "loss": 1.72, + "step": 6246 + }, + { + "epoch": 0.34821627647714604, + "grad_norm": 0.5243182182312012, + "learning_rate": 8e-05, + "loss": 1.8727, + "step": 6247 + }, + { + "epoch": 0.3482720178372352, + "grad_norm": 0.5397647619247437, + "learning_rate": 8e-05, + "loss": 1.5581, + "step": 6248 + }, + { + "epoch": 0.3483277591973244, + "grad_norm": 0.5544797778129578, + "learning_rate": 8e-05, + "loss": 1.5943, + "step": 6249 + }, + { + "epoch": 0.34838350055741363, + "grad_norm": 0.5458196997642517, + "learning_rate": 8e-05, + "loss": 1.6336, + "step": 6250 + }, + { + "epoch": 0.3484392419175028, + "grad_norm": 0.5505799055099487, + "learning_rate": 8e-05, + "loss": 1.8118, + "step": 6251 + }, + { + "epoch": 0.348494983277592, + "grad_norm": 0.5446839928627014, + "learning_rate": 8e-05, + "loss": 1.8311, + "step": 6252 + }, + { + "epoch": 0.34855072463768116, + "grad_norm": 0.5084923505783081, + "learning_rate": 8e-05, + "loss": 1.6369, + "step": 6253 + }, + { + "epoch": 0.34860646599777034, + "grad_norm": 0.5445770025253296, + "learning_rate": 8e-05, + "loss": 1.7987, + "step": 6254 + }, + { + "epoch": 0.3486622073578595, + "grad_norm": 0.5154903531074524, + "learning_rate": 8e-05, + "loss": 1.6847, + "step": 6255 + }, + { + "epoch": 0.3487179487179487, + "grad_norm": 0.5390969514846802, + "learning_rate": 8e-05, + "loss": 1.8958, + "step": 6256 + }, + { + "epoch": 0.3487736900780379, + "grad_norm": 0.496901273727417, + "learning_rate": 8e-05, + "loss": 1.7611, + "step": 6257 + }, + { + "epoch": 0.3488294314381271, + "grad_norm": 0.4763997495174408, + "learning_rate": 8e-05, + "loss": 1.46, + "step": 6258 + }, + { + "epoch": 0.3488851727982163, + "grad_norm": 0.5262258052825928, + "learning_rate": 8e-05, + "loss": 1.8513, + "step": 6259 + }, + { + "epoch": 0.34894091415830547, + "grad_norm": 0.5122326016426086, + "learning_rate": 
8e-05, + "loss": 1.6108, + "step": 6260 + }, + { + "epoch": 0.34899665551839465, + "grad_norm": 0.555101752281189, + "learning_rate": 8e-05, + "loss": 1.8278, + "step": 6261 + }, + { + "epoch": 0.34905239687848383, + "grad_norm": 0.5074917078018188, + "learning_rate": 8e-05, + "loss": 1.7252, + "step": 6262 + }, + { + "epoch": 0.349108138238573, + "grad_norm": 0.5135613083839417, + "learning_rate": 8e-05, + "loss": 1.6203, + "step": 6263 + }, + { + "epoch": 0.3491638795986622, + "grad_norm": 0.5522807836532593, + "learning_rate": 8e-05, + "loss": 1.8089, + "step": 6264 + }, + { + "epoch": 0.3492196209587514, + "grad_norm": 0.5222398042678833, + "learning_rate": 8e-05, + "loss": 1.788, + "step": 6265 + }, + { + "epoch": 0.3492753623188406, + "grad_norm": 0.5117469429969788, + "learning_rate": 8e-05, + "loss": 1.6157, + "step": 6266 + }, + { + "epoch": 0.3493311036789298, + "grad_norm": 0.5744069218635559, + "learning_rate": 8e-05, + "loss": 1.8723, + "step": 6267 + }, + { + "epoch": 0.34938684503901896, + "grad_norm": 0.5354539155960083, + "learning_rate": 8e-05, + "loss": 1.6102, + "step": 6268 + }, + { + "epoch": 0.34944258639910813, + "grad_norm": 0.6402024626731873, + "learning_rate": 8e-05, + "loss": 1.7213, + "step": 6269 + }, + { + "epoch": 0.3494983277591973, + "grad_norm": 0.4755125641822815, + "learning_rate": 8e-05, + "loss": 1.6028, + "step": 6270 + }, + { + "epoch": 0.3495540691192865, + "grad_norm": 0.5351192951202393, + "learning_rate": 8e-05, + "loss": 1.6167, + "step": 6271 + }, + { + "epoch": 0.34960981047937567, + "grad_norm": 0.5391672253608704, + "learning_rate": 8e-05, + "loss": 1.7226, + "step": 6272 + }, + { + "epoch": 0.3496655518394649, + "grad_norm": 0.5133790969848633, + "learning_rate": 8e-05, + "loss": 1.7819, + "step": 6273 + }, + { + "epoch": 0.3497212931995541, + "grad_norm": 0.5002404451370239, + "learning_rate": 8e-05, + "loss": 1.5547, + "step": 6274 + }, + { + "epoch": 0.34977703455964326, + "grad_norm": 0.4975956380367279, + 
"learning_rate": 8e-05, + "loss": 1.3753, + "step": 6275 + }, + { + "epoch": 0.34983277591973244, + "grad_norm": 0.5085654854774475, + "learning_rate": 8e-05, + "loss": 1.5977, + "step": 6276 + }, + { + "epoch": 0.3498885172798216, + "grad_norm": 0.5318102836608887, + "learning_rate": 8e-05, + "loss": 1.6468, + "step": 6277 + }, + { + "epoch": 0.3499442586399108, + "grad_norm": 0.4818606376647949, + "learning_rate": 8e-05, + "loss": 1.5751, + "step": 6278 + }, + { + "epoch": 0.35, + "grad_norm": 0.5304341316223145, + "learning_rate": 8e-05, + "loss": 1.6784, + "step": 6279 + }, + { + "epoch": 0.3500557413600892, + "grad_norm": 0.5689952969551086, + "learning_rate": 8e-05, + "loss": 1.9105, + "step": 6280 + }, + { + "epoch": 0.3501114827201784, + "grad_norm": 0.5430139303207397, + "learning_rate": 8e-05, + "loss": 1.8026, + "step": 6281 + }, + { + "epoch": 0.35016722408026757, + "grad_norm": 0.49770841002464294, + "learning_rate": 8e-05, + "loss": 1.5024, + "step": 6282 + }, + { + "epoch": 0.35022296544035675, + "grad_norm": 0.5617845058441162, + "learning_rate": 8e-05, + "loss": 1.8061, + "step": 6283 + }, + { + "epoch": 0.3502787068004459, + "grad_norm": 0.5018987655639648, + "learning_rate": 8e-05, + "loss": 1.6195, + "step": 6284 + }, + { + "epoch": 0.3503344481605351, + "grad_norm": 0.5285188555717468, + "learning_rate": 8e-05, + "loss": 1.6761, + "step": 6285 + }, + { + "epoch": 0.3503901895206243, + "grad_norm": 0.5185275077819824, + "learning_rate": 8e-05, + "loss": 1.6678, + "step": 6286 + }, + { + "epoch": 0.35044593088071346, + "grad_norm": 0.5226163864135742, + "learning_rate": 8e-05, + "loss": 1.709, + "step": 6287 + }, + { + "epoch": 0.3505016722408027, + "grad_norm": 0.520292341709137, + "learning_rate": 8e-05, + "loss": 1.7416, + "step": 6288 + }, + { + "epoch": 0.3505574136008919, + "grad_norm": 0.5003277063369751, + "learning_rate": 8e-05, + "loss": 1.6716, + "step": 6289 + }, + { + "epoch": 0.35061315496098105, + "grad_norm": 0.5214517116546631, + 
"learning_rate": 8e-05, + "loss": 1.6144, + "step": 6290 + }, + { + "epoch": 0.35066889632107023, + "grad_norm": 0.4936576783657074, + "learning_rate": 8e-05, + "loss": 1.5691, + "step": 6291 + }, + { + "epoch": 0.3507246376811594, + "grad_norm": 0.5251075029373169, + "learning_rate": 8e-05, + "loss": 1.7951, + "step": 6292 + }, + { + "epoch": 0.3507803790412486, + "grad_norm": 0.4800378382205963, + "learning_rate": 8e-05, + "loss": 1.613, + "step": 6293 + }, + { + "epoch": 0.35083612040133777, + "grad_norm": 0.4930437207221985, + "learning_rate": 8e-05, + "loss": 1.5607, + "step": 6294 + }, + { + "epoch": 0.350891861761427, + "grad_norm": 0.5461827516555786, + "learning_rate": 8e-05, + "loss": 1.7602, + "step": 6295 + }, + { + "epoch": 0.3509476031215162, + "grad_norm": 0.4616333544254303, + "learning_rate": 8e-05, + "loss": 1.4898, + "step": 6296 + }, + { + "epoch": 0.35100334448160536, + "grad_norm": 0.5303559899330139, + "learning_rate": 8e-05, + "loss": 1.7979, + "step": 6297 + }, + { + "epoch": 0.35105908584169454, + "grad_norm": 0.5381184220314026, + "learning_rate": 8e-05, + "loss": 1.7486, + "step": 6298 + }, + { + "epoch": 0.3511148272017837, + "grad_norm": 0.5684160590171814, + "learning_rate": 8e-05, + "loss": 1.8218, + "step": 6299 + }, + { + "epoch": 0.3511705685618729, + "grad_norm": 0.5133114457130432, + "learning_rate": 8e-05, + "loss": 1.6521, + "step": 6300 + }, + { + "epoch": 0.3512263099219621, + "grad_norm": 0.6434251070022583, + "learning_rate": 8e-05, + "loss": 2.0408, + "step": 6301 + }, + { + "epoch": 0.35128205128205126, + "grad_norm": 0.45654892921447754, + "learning_rate": 8e-05, + "loss": 1.5327, + "step": 6302 + }, + { + "epoch": 0.3513377926421405, + "grad_norm": 0.5252170562744141, + "learning_rate": 8e-05, + "loss": 1.7286, + "step": 6303 + }, + { + "epoch": 0.35139353400222967, + "grad_norm": 0.5745198726654053, + "learning_rate": 8e-05, + "loss": 1.8548, + "step": 6304 + }, + { + "epoch": 0.35144927536231885, + "grad_norm": 
0.5161747336387634, + "learning_rate": 8e-05, + "loss": 1.7415, + "step": 6305 + }, + { + "epoch": 0.351505016722408, + "grad_norm": 0.47877398133277893, + "learning_rate": 8e-05, + "loss": 1.6255, + "step": 6306 + }, + { + "epoch": 0.3515607580824972, + "grad_norm": 0.5062041878700256, + "learning_rate": 8e-05, + "loss": 1.5955, + "step": 6307 + }, + { + "epoch": 0.3516164994425864, + "grad_norm": 0.5496783256530762, + "learning_rate": 8e-05, + "loss": 1.9515, + "step": 6308 + }, + { + "epoch": 0.35167224080267556, + "grad_norm": 0.4971919655799866, + "learning_rate": 8e-05, + "loss": 1.6802, + "step": 6309 + }, + { + "epoch": 0.3517279821627648, + "grad_norm": 0.5056297779083252, + "learning_rate": 8e-05, + "loss": 1.6415, + "step": 6310 + }, + { + "epoch": 0.351783723522854, + "grad_norm": 0.5026938319206238, + "learning_rate": 8e-05, + "loss": 1.6872, + "step": 6311 + }, + { + "epoch": 0.35183946488294315, + "grad_norm": 0.5117358565330505, + "learning_rate": 8e-05, + "loss": 1.4986, + "step": 6312 + }, + { + "epoch": 0.35189520624303233, + "grad_norm": 0.49614956974983215, + "learning_rate": 8e-05, + "loss": 1.6072, + "step": 6313 + }, + { + "epoch": 0.3519509476031215, + "grad_norm": 0.5110148191452026, + "learning_rate": 8e-05, + "loss": 1.718, + "step": 6314 + }, + { + "epoch": 0.3520066889632107, + "grad_norm": 0.5317930579185486, + "learning_rate": 8e-05, + "loss": 1.631, + "step": 6315 + }, + { + "epoch": 0.35206243032329987, + "grad_norm": 0.553537130355835, + "learning_rate": 8e-05, + "loss": 1.7386, + "step": 6316 + }, + { + "epoch": 0.35211817168338905, + "grad_norm": 0.5011419653892517, + "learning_rate": 8e-05, + "loss": 1.572, + "step": 6317 + }, + { + "epoch": 0.3521739130434783, + "grad_norm": 0.5681989192962646, + "learning_rate": 8e-05, + "loss": 2.0663, + "step": 6318 + }, + { + "epoch": 0.35222965440356746, + "grad_norm": 0.5317137241363525, + "learning_rate": 8e-05, + "loss": 1.5384, + "step": 6319 + }, + { + "epoch": 0.35228539576365664, + 
"grad_norm": 0.4983222484588623, + "learning_rate": 8e-05, + "loss": 1.6649, + "step": 6320 + }, + { + "epoch": 0.3523411371237458, + "grad_norm": 0.5296564102172852, + "learning_rate": 8e-05, + "loss": 1.7091, + "step": 6321 + }, + { + "epoch": 0.352396878483835, + "grad_norm": 0.49374499917030334, + "learning_rate": 8e-05, + "loss": 1.4785, + "step": 6322 + }, + { + "epoch": 0.3524526198439242, + "grad_norm": 0.490217924118042, + "learning_rate": 8e-05, + "loss": 1.6052, + "step": 6323 + }, + { + "epoch": 0.35250836120401335, + "grad_norm": 0.5159461498260498, + "learning_rate": 8e-05, + "loss": 1.61, + "step": 6324 + }, + { + "epoch": 0.3525641025641026, + "grad_norm": 0.5397430658340454, + "learning_rate": 8e-05, + "loss": 1.8227, + "step": 6325 + }, + { + "epoch": 0.35261984392419177, + "grad_norm": 0.5083862543106079, + "learning_rate": 8e-05, + "loss": 1.6645, + "step": 6326 + }, + { + "epoch": 0.35267558528428095, + "grad_norm": 0.5055984854698181, + "learning_rate": 8e-05, + "loss": 1.8023, + "step": 6327 + }, + { + "epoch": 0.3527313266443701, + "grad_norm": 0.5376937389373779, + "learning_rate": 8e-05, + "loss": 1.8579, + "step": 6328 + }, + { + "epoch": 0.3527870680044593, + "grad_norm": 0.49673518538475037, + "learning_rate": 8e-05, + "loss": 1.6094, + "step": 6329 + }, + { + "epoch": 0.3528428093645485, + "grad_norm": 0.47112712264060974, + "learning_rate": 8e-05, + "loss": 1.6966, + "step": 6330 + }, + { + "epoch": 0.35289855072463766, + "grad_norm": 0.5200251340866089, + "learning_rate": 8e-05, + "loss": 1.8557, + "step": 6331 + }, + { + "epoch": 0.3529542920847269, + "grad_norm": 0.5550200939178467, + "learning_rate": 8e-05, + "loss": 1.9868, + "step": 6332 + }, + { + "epoch": 0.3530100334448161, + "grad_norm": 0.4754127860069275, + "learning_rate": 8e-05, + "loss": 1.441, + "step": 6333 + }, + { + "epoch": 0.35306577480490525, + "grad_norm": 0.5317299962043762, + "learning_rate": 8e-05, + "loss": 1.7727, + "step": 6334 + }, + { + "epoch": 
0.35312151616499443, + "grad_norm": 0.5096808075904846, + "learning_rate": 8e-05, + "loss": 1.7812, + "step": 6335 + }, + { + "epoch": 0.3531772575250836, + "grad_norm": 0.5273366570472717, + "learning_rate": 8e-05, + "loss": 1.6957, + "step": 6336 + }, + { + "epoch": 0.3532329988851728, + "grad_norm": 0.5220344662666321, + "learning_rate": 8e-05, + "loss": 1.6494, + "step": 6337 + }, + { + "epoch": 0.35328874024526197, + "grad_norm": 0.5042638778686523, + "learning_rate": 8e-05, + "loss": 1.6709, + "step": 6338 + }, + { + "epoch": 0.35334448160535115, + "grad_norm": 0.5413642525672913, + "learning_rate": 8e-05, + "loss": 1.7465, + "step": 6339 + }, + { + "epoch": 0.3534002229654404, + "grad_norm": 0.4990393817424774, + "learning_rate": 8e-05, + "loss": 1.6118, + "step": 6340 + }, + { + "epoch": 0.35345596432552956, + "grad_norm": 0.5102233290672302, + "learning_rate": 8e-05, + "loss": 1.6453, + "step": 6341 + }, + { + "epoch": 0.35351170568561874, + "grad_norm": 0.4874460995197296, + "learning_rate": 8e-05, + "loss": 1.4917, + "step": 6342 + }, + { + "epoch": 0.3535674470457079, + "grad_norm": 0.562700092792511, + "learning_rate": 8e-05, + "loss": 1.9172, + "step": 6343 + }, + { + "epoch": 0.3536231884057971, + "grad_norm": 0.5399877429008484, + "learning_rate": 8e-05, + "loss": 1.5813, + "step": 6344 + }, + { + "epoch": 0.3536789297658863, + "grad_norm": 0.45111870765686035, + "learning_rate": 8e-05, + "loss": 1.0555, + "step": 6345 + }, + { + "epoch": 0.35373467112597545, + "grad_norm": 0.49734896421432495, + "learning_rate": 8e-05, + "loss": 1.7715, + "step": 6346 + }, + { + "epoch": 0.3537904124860647, + "grad_norm": 0.5130085945129395, + "learning_rate": 8e-05, + "loss": 1.6804, + "step": 6347 + }, + { + "epoch": 0.35384615384615387, + "grad_norm": 0.5188106894493103, + "learning_rate": 8e-05, + "loss": 1.8666, + "step": 6348 + }, + { + "epoch": 0.35390189520624304, + "grad_norm": 0.5795222520828247, + "learning_rate": 8e-05, + "loss": 1.6269, + "step": 6349 
+ }, + { + "epoch": 0.3539576365663322, + "grad_norm": 0.48987892270088196, + "learning_rate": 8e-05, + "loss": 1.4014, + "step": 6350 + }, + { + "epoch": 0.3540133779264214, + "grad_norm": 0.5103864669799805, + "learning_rate": 8e-05, + "loss": 1.5583, + "step": 6351 + }, + { + "epoch": 0.3540691192865106, + "grad_norm": 0.5661724805831909, + "learning_rate": 8e-05, + "loss": 1.877, + "step": 6352 + }, + { + "epoch": 0.35412486064659976, + "grad_norm": 0.5406143665313721, + "learning_rate": 8e-05, + "loss": 1.8725, + "step": 6353 + }, + { + "epoch": 0.35418060200668894, + "grad_norm": 0.5083308219909668, + "learning_rate": 8e-05, + "loss": 1.5352, + "step": 6354 + }, + { + "epoch": 0.3542363433667782, + "grad_norm": 0.48224666714668274, + "learning_rate": 8e-05, + "loss": 1.7073, + "step": 6355 + }, + { + "epoch": 0.35429208472686735, + "grad_norm": 0.5201107263565063, + "learning_rate": 8e-05, + "loss": 1.7611, + "step": 6356 + }, + { + "epoch": 0.35434782608695653, + "grad_norm": 0.5390413999557495, + "learning_rate": 8e-05, + "loss": 1.8358, + "step": 6357 + }, + { + "epoch": 0.3544035674470457, + "grad_norm": 0.49942028522491455, + "learning_rate": 8e-05, + "loss": 1.6606, + "step": 6358 + }, + { + "epoch": 0.3544593088071349, + "grad_norm": 0.5088225603103638, + "learning_rate": 8e-05, + "loss": 1.6044, + "step": 6359 + }, + { + "epoch": 0.35451505016722407, + "grad_norm": 0.5360525250434875, + "learning_rate": 8e-05, + "loss": 1.756, + "step": 6360 + }, + { + "epoch": 0.35457079152731324, + "grad_norm": 0.5383181571960449, + "learning_rate": 8e-05, + "loss": 1.726, + "step": 6361 + }, + { + "epoch": 0.3546265328874025, + "grad_norm": 0.5377655029296875, + "learning_rate": 8e-05, + "loss": 1.8818, + "step": 6362 + }, + { + "epoch": 0.35468227424749166, + "grad_norm": 0.48970893025398254, + "learning_rate": 8e-05, + "loss": 1.7495, + "step": 6363 + }, + { + "epoch": 0.35473801560758084, + "grad_norm": 0.4807761609554291, + "learning_rate": 8e-05, + "loss": 
1.6249, + "step": 6364 + }, + { + "epoch": 0.35479375696767, + "grad_norm": 0.4989910125732422, + "learning_rate": 8e-05, + "loss": 1.6189, + "step": 6365 + }, + { + "epoch": 0.3548494983277592, + "grad_norm": 0.4965146780014038, + "learning_rate": 8e-05, + "loss": 1.5913, + "step": 6366 + }, + { + "epoch": 0.3549052396878484, + "grad_norm": 0.5162652134895325, + "learning_rate": 8e-05, + "loss": 1.8145, + "step": 6367 + }, + { + "epoch": 0.35496098104793755, + "grad_norm": 0.5167195200920105, + "learning_rate": 8e-05, + "loss": 1.7582, + "step": 6368 + }, + { + "epoch": 0.35501672240802673, + "grad_norm": 0.5981922149658203, + "learning_rate": 8e-05, + "loss": 1.7429, + "step": 6369 + }, + { + "epoch": 0.35507246376811596, + "grad_norm": 0.49923574924468994, + "learning_rate": 8e-05, + "loss": 1.6594, + "step": 6370 + }, + { + "epoch": 0.35512820512820514, + "grad_norm": 0.5337264537811279, + "learning_rate": 8e-05, + "loss": 1.7549, + "step": 6371 + }, + { + "epoch": 0.3551839464882943, + "grad_norm": 0.5021322965621948, + "learning_rate": 8e-05, + "loss": 1.7241, + "step": 6372 + }, + { + "epoch": 0.3552396878483835, + "grad_norm": 0.48789548873901367, + "learning_rate": 8e-05, + "loss": 1.4845, + "step": 6373 + }, + { + "epoch": 0.3552954292084727, + "grad_norm": 0.49966785311698914, + "learning_rate": 8e-05, + "loss": 1.809, + "step": 6374 + }, + { + "epoch": 0.35535117056856186, + "grad_norm": 0.5539280772209167, + "learning_rate": 8e-05, + "loss": 1.9406, + "step": 6375 + }, + { + "epoch": 0.35540691192865104, + "grad_norm": 0.5110356211662292, + "learning_rate": 8e-05, + "loss": 1.7554, + "step": 6376 + }, + { + "epoch": 0.35546265328874027, + "grad_norm": 0.5013741850852966, + "learning_rate": 8e-05, + "loss": 1.6376, + "step": 6377 + }, + { + "epoch": 0.35551839464882945, + "grad_norm": 0.5059554576873779, + "learning_rate": 8e-05, + "loss": 1.5147, + "step": 6378 + }, + { + "epoch": 0.35557413600891863, + "grad_norm": 0.5380630493164062, + 
"learning_rate": 8e-05, + "loss": 1.6972, + "step": 6379 + }, + { + "epoch": 0.3556298773690078, + "grad_norm": 0.5238441824913025, + "learning_rate": 8e-05, + "loss": 2.0016, + "step": 6380 + }, + { + "epoch": 0.355685618729097, + "grad_norm": 0.5427252650260925, + "learning_rate": 8e-05, + "loss": 1.8643, + "step": 6381 + }, + { + "epoch": 0.35574136008918616, + "grad_norm": 0.5313915610313416, + "learning_rate": 8e-05, + "loss": 1.9013, + "step": 6382 + }, + { + "epoch": 0.35579710144927534, + "grad_norm": 0.4783153235912323, + "learning_rate": 8e-05, + "loss": 1.6119, + "step": 6383 + }, + { + "epoch": 0.3558528428093645, + "grad_norm": 0.506710410118103, + "learning_rate": 8e-05, + "loss": 1.7333, + "step": 6384 + }, + { + "epoch": 0.35590858416945376, + "grad_norm": 0.552340567111969, + "learning_rate": 8e-05, + "loss": 1.8149, + "step": 6385 + }, + { + "epoch": 0.35596432552954294, + "grad_norm": 0.49930626153945923, + "learning_rate": 8e-05, + "loss": 1.4079, + "step": 6386 + }, + { + "epoch": 0.3560200668896321, + "grad_norm": 0.4994630515575409, + "learning_rate": 8e-05, + "loss": 1.6608, + "step": 6387 + }, + { + "epoch": 0.3560758082497213, + "grad_norm": 0.5089203119277954, + "learning_rate": 8e-05, + "loss": 1.5017, + "step": 6388 + }, + { + "epoch": 0.35613154960981047, + "grad_norm": 0.5555450320243835, + "learning_rate": 8e-05, + "loss": 1.7565, + "step": 6389 + }, + { + "epoch": 0.35618729096989965, + "grad_norm": 0.4765130281448364, + "learning_rate": 8e-05, + "loss": 1.7225, + "step": 6390 + }, + { + "epoch": 0.35624303232998883, + "grad_norm": 0.5181832313537598, + "learning_rate": 8e-05, + "loss": 1.6676, + "step": 6391 + }, + { + "epoch": 0.35629877369007806, + "grad_norm": 0.553586483001709, + "learning_rate": 8e-05, + "loss": 1.7953, + "step": 6392 + }, + { + "epoch": 0.35635451505016724, + "grad_norm": 0.5278187394142151, + "learning_rate": 8e-05, + "loss": 1.7932, + "step": 6393 + }, + { + "epoch": 0.3564102564102564, + "grad_norm": 
0.5500335693359375, + "learning_rate": 8e-05, + "loss": 1.6132, + "step": 6394 + }, + { + "epoch": 0.3564659977703456, + "grad_norm": 0.5639261603355408, + "learning_rate": 8e-05, + "loss": 1.7654, + "step": 6395 + }, + { + "epoch": 0.3565217391304348, + "grad_norm": 0.5675365924835205, + "learning_rate": 8e-05, + "loss": 1.897, + "step": 6396 + }, + { + "epoch": 0.35657748049052396, + "grad_norm": 0.5094690322875977, + "learning_rate": 8e-05, + "loss": 1.5559, + "step": 6397 + }, + { + "epoch": 0.35663322185061314, + "grad_norm": 0.5150576829910278, + "learning_rate": 8e-05, + "loss": 1.6622, + "step": 6398 + }, + { + "epoch": 0.3566889632107023, + "grad_norm": 0.5280166864395142, + "learning_rate": 8e-05, + "loss": 1.6834, + "step": 6399 + }, + { + "epoch": 0.35674470457079155, + "grad_norm": 0.5540995001792908, + "learning_rate": 8e-05, + "loss": 1.9026, + "step": 6400 + }, + { + "epoch": 0.3568004459308807, + "grad_norm": 0.5353748798370361, + "learning_rate": 8e-05, + "loss": 1.7235, + "step": 6401 + }, + { + "epoch": 0.3568561872909699, + "grad_norm": 0.5143972635269165, + "learning_rate": 8e-05, + "loss": 1.6317, + "step": 6402 + }, + { + "epoch": 0.3569119286510591, + "grad_norm": 0.4793776273727417, + "learning_rate": 8e-05, + "loss": 1.4453, + "step": 6403 + }, + { + "epoch": 0.35696767001114826, + "grad_norm": 0.5332164168357849, + "learning_rate": 8e-05, + "loss": 1.6527, + "step": 6404 + }, + { + "epoch": 0.35702341137123744, + "grad_norm": 0.5050964951515198, + "learning_rate": 8e-05, + "loss": 1.627, + "step": 6405 + }, + { + "epoch": 0.3570791527313266, + "grad_norm": 0.4644363224506378, + "learning_rate": 8e-05, + "loss": 1.451, + "step": 6406 + }, + { + "epoch": 0.35713489409141586, + "grad_norm": 0.5282057523727417, + "learning_rate": 8e-05, + "loss": 1.5854, + "step": 6407 + }, + { + "epoch": 0.35719063545150503, + "grad_norm": 0.5058812499046326, + "learning_rate": 8e-05, + "loss": 1.8473, + "step": 6408 + }, + { + "epoch": 0.3572463768115942, 
+ "grad_norm": 0.49288514256477356, + "learning_rate": 8e-05, + "loss": 1.6421, + "step": 6409 + }, + { + "epoch": 0.3573021181716834, + "grad_norm": 0.5028522610664368, + "learning_rate": 8e-05, + "loss": 1.7249, + "step": 6410 + }, + { + "epoch": 0.35735785953177257, + "grad_norm": 0.4753098487854004, + "learning_rate": 8e-05, + "loss": 1.5235, + "step": 6411 + }, + { + "epoch": 0.35741360089186175, + "grad_norm": 0.49716320633888245, + "learning_rate": 8e-05, + "loss": 1.582, + "step": 6412 + }, + { + "epoch": 0.35746934225195093, + "grad_norm": 0.5260248184204102, + "learning_rate": 8e-05, + "loss": 1.5284, + "step": 6413 + }, + { + "epoch": 0.3575250836120401, + "grad_norm": 0.5542808175086975, + "learning_rate": 8e-05, + "loss": 1.7856, + "step": 6414 + }, + { + "epoch": 0.35758082497212934, + "grad_norm": 0.5370000600814819, + "learning_rate": 8e-05, + "loss": 1.7328, + "step": 6415 + }, + { + "epoch": 0.3576365663322185, + "grad_norm": 0.5088168382644653, + "learning_rate": 8e-05, + "loss": 1.5591, + "step": 6416 + }, + { + "epoch": 0.3576923076923077, + "grad_norm": 0.5215413570404053, + "learning_rate": 8e-05, + "loss": 1.2636, + "step": 6417 + }, + { + "epoch": 0.3577480490523969, + "grad_norm": 0.528469979763031, + "learning_rate": 8e-05, + "loss": 1.6741, + "step": 6418 + }, + { + "epoch": 0.35780379041248606, + "grad_norm": 0.49583855271339417, + "learning_rate": 8e-05, + "loss": 1.6579, + "step": 6419 + }, + { + "epoch": 0.35785953177257523, + "grad_norm": 0.5597677826881409, + "learning_rate": 8e-05, + "loss": 1.7846, + "step": 6420 + }, + { + "epoch": 0.3579152731326644, + "grad_norm": 0.5637001395225525, + "learning_rate": 8e-05, + "loss": 1.8262, + "step": 6421 + }, + { + "epoch": 0.35797101449275365, + "grad_norm": 0.5613079071044922, + "learning_rate": 8e-05, + "loss": 1.7265, + "step": 6422 + }, + { + "epoch": 0.3580267558528428, + "grad_norm": 0.5505542159080505, + "learning_rate": 8e-05, + "loss": 1.6039, + "step": 6423 + }, + { + "epoch": 
0.358082497212932, + "grad_norm": 0.5501434206962585, + "learning_rate": 8e-05, + "loss": 1.7493, + "step": 6424 + }, + { + "epoch": 0.3581382385730212, + "grad_norm": 0.5145764946937561, + "learning_rate": 8e-05, + "loss": 1.7458, + "step": 6425 + }, + { + "epoch": 0.35819397993311036, + "grad_norm": 0.5651512742042542, + "learning_rate": 8e-05, + "loss": 1.8958, + "step": 6426 + }, + { + "epoch": 0.35824972129319954, + "grad_norm": 0.5379459857940674, + "learning_rate": 8e-05, + "loss": 1.9652, + "step": 6427 + }, + { + "epoch": 0.3583054626532887, + "grad_norm": 0.46046844124794006, + "learning_rate": 8e-05, + "loss": 1.4552, + "step": 6428 + }, + { + "epoch": 0.35836120401337795, + "grad_norm": 0.5376448035240173, + "learning_rate": 8e-05, + "loss": 1.6586, + "step": 6429 + }, + { + "epoch": 0.35841694537346713, + "grad_norm": 0.4953959584236145, + "learning_rate": 8e-05, + "loss": 1.3764, + "step": 6430 + }, + { + "epoch": 0.3584726867335563, + "grad_norm": 0.5916145443916321, + "learning_rate": 8e-05, + "loss": 2.108, + "step": 6431 + }, + { + "epoch": 0.3585284280936455, + "grad_norm": 0.5416443943977356, + "learning_rate": 8e-05, + "loss": 1.7952, + "step": 6432 + }, + { + "epoch": 0.35858416945373467, + "grad_norm": 0.516238272190094, + "learning_rate": 8e-05, + "loss": 1.6125, + "step": 6433 + }, + { + "epoch": 0.35863991081382385, + "grad_norm": 0.4888264536857605, + "learning_rate": 8e-05, + "loss": 1.5603, + "step": 6434 + }, + { + "epoch": 0.358695652173913, + "grad_norm": 0.5799896717071533, + "learning_rate": 8e-05, + "loss": 1.9252, + "step": 6435 + }, + { + "epoch": 0.3587513935340022, + "grad_norm": 0.4998963177204132, + "learning_rate": 8e-05, + "loss": 1.6797, + "step": 6436 + }, + { + "epoch": 0.35880713489409144, + "grad_norm": 0.5072805881500244, + "learning_rate": 8e-05, + "loss": 1.557, + "step": 6437 + }, + { + "epoch": 0.3588628762541806, + "grad_norm": 0.5456826686859131, + "learning_rate": 8e-05, + "loss": 1.5075, + "step": 6438 + }, + 
{ + "epoch": 0.3589186176142698, + "grad_norm": 0.5327960252761841, + "learning_rate": 8e-05, + "loss": 1.6091, + "step": 6439 + }, + { + "epoch": 0.358974358974359, + "grad_norm": 0.5547055602073669, + "learning_rate": 8e-05, + "loss": 1.6923, + "step": 6440 + }, + { + "epoch": 0.35903010033444815, + "grad_norm": 0.5358332395553589, + "learning_rate": 8e-05, + "loss": 1.7189, + "step": 6441 + }, + { + "epoch": 0.35908584169453733, + "grad_norm": 0.5085732340812683, + "learning_rate": 8e-05, + "loss": 1.5005, + "step": 6442 + }, + { + "epoch": 0.3591415830546265, + "grad_norm": 0.5534481406211853, + "learning_rate": 8e-05, + "loss": 2.0119, + "step": 6443 + }, + { + "epoch": 0.35919732441471575, + "grad_norm": 0.5384095907211304, + "learning_rate": 8e-05, + "loss": 1.792, + "step": 6444 + }, + { + "epoch": 0.3592530657748049, + "grad_norm": 0.4997360110282898, + "learning_rate": 8e-05, + "loss": 1.6842, + "step": 6445 + }, + { + "epoch": 0.3593088071348941, + "grad_norm": 0.5220462679862976, + "learning_rate": 8e-05, + "loss": 1.7234, + "step": 6446 + }, + { + "epoch": 0.3593645484949833, + "grad_norm": 0.5385878682136536, + "learning_rate": 8e-05, + "loss": 1.7578, + "step": 6447 + }, + { + "epoch": 0.35942028985507246, + "grad_norm": 0.5040372014045715, + "learning_rate": 8e-05, + "loss": 1.7622, + "step": 6448 + }, + { + "epoch": 0.35947603121516164, + "grad_norm": 0.512959897518158, + "learning_rate": 8e-05, + "loss": 1.8103, + "step": 6449 + }, + { + "epoch": 0.3595317725752508, + "grad_norm": 0.4947265088558197, + "learning_rate": 8e-05, + "loss": 1.711, + "step": 6450 + }, + { + "epoch": 0.35958751393534, + "grad_norm": 0.5018380284309387, + "learning_rate": 8e-05, + "loss": 1.695, + "step": 6451 + }, + { + "epoch": 0.35964325529542923, + "grad_norm": 0.5547189116477966, + "learning_rate": 8e-05, + "loss": 1.865, + "step": 6452 + }, + { + "epoch": 0.3596989966555184, + "grad_norm": 0.5413358807563782, + "learning_rate": 8e-05, + "loss": 1.8049, + "step": 
6453 + }, + { + "epoch": 0.3597547380156076, + "grad_norm": 0.5174626708030701, + "learning_rate": 8e-05, + "loss": 1.5548, + "step": 6454 + }, + { + "epoch": 0.35981047937569677, + "grad_norm": 0.5683441758155823, + "learning_rate": 8e-05, + "loss": 1.9717, + "step": 6455 + }, + { + "epoch": 0.35986622073578595, + "grad_norm": 0.5123081207275391, + "learning_rate": 8e-05, + "loss": 1.625, + "step": 6456 + }, + { + "epoch": 0.3599219620958751, + "grad_norm": 0.5193832516670227, + "learning_rate": 8e-05, + "loss": 1.6638, + "step": 6457 + }, + { + "epoch": 0.3599777034559643, + "grad_norm": 0.5199078321456909, + "learning_rate": 8e-05, + "loss": 1.6831, + "step": 6458 + }, + { + "epoch": 0.36003344481605354, + "grad_norm": 0.5766400098800659, + "learning_rate": 8e-05, + "loss": 1.7626, + "step": 6459 + }, + { + "epoch": 0.3600891861761427, + "grad_norm": 0.523861289024353, + "learning_rate": 8e-05, + "loss": 1.7019, + "step": 6460 + }, + { + "epoch": 0.3601449275362319, + "grad_norm": 0.5427674651145935, + "learning_rate": 8e-05, + "loss": 1.821, + "step": 6461 + }, + { + "epoch": 0.3602006688963211, + "grad_norm": 0.5394901633262634, + "learning_rate": 8e-05, + "loss": 1.7174, + "step": 6462 + }, + { + "epoch": 0.36025641025641025, + "grad_norm": 0.7988752126693726, + "learning_rate": 8e-05, + "loss": 1.9678, + "step": 6463 + }, + { + "epoch": 0.36031215161649943, + "grad_norm": 0.5276942253112793, + "learning_rate": 8e-05, + "loss": 1.8143, + "step": 6464 + }, + { + "epoch": 0.3603678929765886, + "grad_norm": 0.5382676720619202, + "learning_rate": 8e-05, + "loss": 1.9482, + "step": 6465 + }, + { + "epoch": 0.3604236343366778, + "grad_norm": 0.5459386110305786, + "learning_rate": 8e-05, + "loss": 1.9035, + "step": 6466 + }, + { + "epoch": 0.360479375696767, + "grad_norm": 0.4714954197406769, + "learning_rate": 8e-05, + "loss": 1.4949, + "step": 6467 + }, + { + "epoch": 0.3605351170568562, + "grad_norm": 0.5136751532554626, + "learning_rate": 8e-05, + "loss": 
1.7965, + "step": 6468 + }, + { + "epoch": 0.3605908584169454, + "grad_norm": 0.524118185043335, + "learning_rate": 8e-05, + "loss": 1.6161, + "step": 6469 + }, + { + "epoch": 0.36064659977703456, + "grad_norm": 0.4913913309574127, + "learning_rate": 8e-05, + "loss": 1.523, + "step": 6470 + }, + { + "epoch": 0.36070234113712374, + "grad_norm": 0.5972789525985718, + "learning_rate": 8e-05, + "loss": 1.9685, + "step": 6471 + }, + { + "epoch": 0.3607580824972129, + "grad_norm": 0.6274723410606384, + "learning_rate": 8e-05, + "loss": 1.7745, + "step": 6472 + }, + { + "epoch": 0.3608138238573021, + "grad_norm": 0.5555365681648254, + "learning_rate": 8e-05, + "loss": 1.67, + "step": 6473 + }, + { + "epoch": 0.36086956521739133, + "grad_norm": 0.5084021687507629, + "learning_rate": 8e-05, + "loss": 1.6604, + "step": 6474 + }, + { + "epoch": 0.3609253065774805, + "grad_norm": 0.5389943718910217, + "learning_rate": 8e-05, + "loss": 1.6026, + "step": 6475 + }, + { + "epoch": 0.3609810479375697, + "grad_norm": 0.5192217826843262, + "learning_rate": 8e-05, + "loss": 1.5066, + "step": 6476 + }, + { + "epoch": 0.36103678929765887, + "grad_norm": 0.5360554456710815, + "learning_rate": 8e-05, + "loss": 1.7987, + "step": 6477 + }, + { + "epoch": 0.36109253065774805, + "grad_norm": 0.5452953577041626, + "learning_rate": 8e-05, + "loss": 1.7616, + "step": 6478 + }, + { + "epoch": 0.3611482720178372, + "grad_norm": 0.5165794491767883, + "learning_rate": 8e-05, + "loss": 1.7592, + "step": 6479 + }, + { + "epoch": 0.3612040133779264, + "grad_norm": 0.48414236307144165, + "learning_rate": 8e-05, + "loss": 1.5535, + "step": 6480 + }, + { + "epoch": 0.3612597547380156, + "grad_norm": 0.5539426207542419, + "learning_rate": 8e-05, + "loss": 1.7877, + "step": 6481 + }, + { + "epoch": 0.3613154960981048, + "grad_norm": 0.5607216358184814, + "learning_rate": 8e-05, + "loss": 1.708, + "step": 6482 + }, + { + "epoch": 0.361371237458194, + "grad_norm": 0.5305792689323425, + "learning_rate": 8e-05, 
+ "loss": 1.861, + "step": 6483 + }, + { + "epoch": 0.3614269788182832, + "grad_norm": 0.4897555112838745, + "learning_rate": 8e-05, + "loss": 1.5322, + "step": 6484 + }, + { + "epoch": 0.36148272017837235, + "grad_norm": 0.4937852919101715, + "learning_rate": 8e-05, + "loss": 1.5797, + "step": 6485 + }, + { + "epoch": 0.36153846153846153, + "grad_norm": 0.49905410408973694, + "learning_rate": 8e-05, + "loss": 1.6547, + "step": 6486 + }, + { + "epoch": 0.3615942028985507, + "grad_norm": 0.49061068892478943, + "learning_rate": 8e-05, + "loss": 1.4706, + "step": 6487 + }, + { + "epoch": 0.3616499442586399, + "grad_norm": 0.4852028787136078, + "learning_rate": 8e-05, + "loss": 1.4891, + "step": 6488 + }, + { + "epoch": 0.3617056856187291, + "grad_norm": 0.5745864510536194, + "learning_rate": 8e-05, + "loss": 1.9354, + "step": 6489 + }, + { + "epoch": 0.3617614269788183, + "grad_norm": 0.5412963628768921, + "learning_rate": 8e-05, + "loss": 1.7391, + "step": 6490 + }, + { + "epoch": 0.3618171683389075, + "grad_norm": 0.4556156396865845, + "learning_rate": 8e-05, + "loss": 1.5082, + "step": 6491 + }, + { + "epoch": 0.36187290969899666, + "grad_norm": 0.575387179851532, + "learning_rate": 8e-05, + "loss": 1.8843, + "step": 6492 + }, + { + "epoch": 0.36192865105908584, + "grad_norm": 0.5384889245033264, + "learning_rate": 8e-05, + "loss": 1.8429, + "step": 6493 + }, + { + "epoch": 0.361984392419175, + "grad_norm": 0.6044797301292419, + "learning_rate": 8e-05, + "loss": 1.736, + "step": 6494 + }, + { + "epoch": 0.3620401337792642, + "grad_norm": 0.5215487480163574, + "learning_rate": 8e-05, + "loss": 1.5594, + "step": 6495 + }, + { + "epoch": 0.3620958751393534, + "grad_norm": 0.5874164700508118, + "learning_rate": 8e-05, + "loss": 1.8676, + "step": 6496 + }, + { + "epoch": 0.3621516164994426, + "grad_norm": 0.5641337037086487, + "learning_rate": 8e-05, + "loss": 1.6421, + "step": 6497 + }, + { + "epoch": 0.3622073578595318, + "grad_norm": 0.5324305295944214, + 
"learning_rate": 8e-05, + "loss": 1.5296, + "step": 6498 + }, + { + "epoch": 0.36226309921962097, + "grad_norm": 0.4652421772480011, + "learning_rate": 8e-05, + "loss": 1.4351, + "step": 6499 + }, + { + "epoch": 0.36231884057971014, + "grad_norm": 0.5650059580802917, + "learning_rate": 8e-05, + "loss": 1.9025, + "step": 6500 + }, + { + "epoch": 0.3623745819397993, + "grad_norm": 0.509933590888977, + "learning_rate": 8e-05, + "loss": 1.6018, + "step": 6501 + }, + { + "epoch": 0.3624303232998885, + "grad_norm": 0.6044166088104248, + "learning_rate": 8e-05, + "loss": 1.8909, + "step": 6502 + }, + { + "epoch": 0.3624860646599777, + "grad_norm": 0.5065827965736389, + "learning_rate": 8e-05, + "loss": 1.7081, + "step": 6503 + }, + { + "epoch": 0.3625418060200669, + "grad_norm": 0.5435416102409363, + "learning_rate": 8e-05, + "loss": 1.9141, + "step": 6504 + }, + { + "epoch": 0.3625975473801561, + "grad_norm": 0.538236677646637, + "learning_rate": 8e-05, + "loss": 1.6744, + "step": 6505 + }, + { + "epoch": 0.36265328874024527, + "grad_norm": 0.49007493257522583, + "learning_rate": 8e-05, + "loss": 1.5196, + "step": 6506 + }, + { + "epoch": 0.36270903010033445, + "grad_norm": 0.5888476967811584, + "learning_rate": 8e-05, + "loss": 1.9067, + "step": 6507 + }, + { + "epoch": 0.36276477146042363, + "grad_norm": 0.5109419226646423, + "learning_rate": 8e-05, + "loss": 1.9334, + "step": 6508 + }, + { + "epoch": 0.3628205128205128, + "grad_norm": 0.537640392780304, + "learning_rate": 8e-05, + "loss": 1.8233, + "step": 6509 + }, + { + "epoch": 0.362876254180602, + "grad_norm": 0.48754364252090454, + "learning_rate": 8e-05, + "loss": 1.5459, + "step": 6510 + }, + { + "epoch": 0.36293199554069117, + "grad_norm": 0.5003510117530823, + "learning_rate": 8e-05, + "loss": 1.6937, + "step": 6511 + }, + { + "epoch": 0.3629877369007804, + "grad_norm": 0.46658939123153687, + "learning_rate": 8e-05, + "loss": 1.4756, + "step": 6512 + }, + { + "epoch": 0.3630434782608696, + "grad_norm": 
0.5610101819038391, + "learning_rate": 8e-05, + "loss": 1.8659, + "step": 6513 + }, + { + "epoch": 0.36309921962095876, + "grad_norm": 0.49573513865470886, + "learning_rate": 8e-05, + "loss": 1.6819, + "step": 6514 + }, + { + "epoch": 0.36315496098104794, + "grad_norm": 0.49012234807014465, + "learning_rate": 8e-05, + "loss": 1.5958, + "step": 6515 + }, + { + "epoch": 0.3632107023411371, + "grad_norm": 0.5030548572540283, + "learning_rate": 8e-05, + "loss": 1.7471, + "step": 6516 + }, + { + "epoch": 0.3632664437012263, + "grad_norm": 0.4662206470966339, + "learning_rate": 8e-05, + "loss": 1.3778, + "step": 6517 + }, + { + "epoch": 0.3633221850613155, + "grad_norm": 0.5224818587303162, + "learning_rate": 8e-05, + "loss": 1.7238, + "step": 6518 + }, + { + "epoch": 0.3633779264214047, + "grad_norm": 0.4647224247455597, + "learning_rate": 8e-05, + "loss": 1.2414, + "step": 6519 + }, + { + "epoch": 0.3634336677814939, + "grad_norm": 0.5260183811187744, + "learning_rate": 8e-05, + "loss": 1.7534, + "step": 6520 + }, + { + "epoch": 0.36348940914158306, + "grad_norm": 0.4867696166038513, + "learning_rate": 8e-05, + "loss": 1.5077, + "step": 6521 + }, + { + "epoch": 0.36354515050167224, + "grad_norm": 0.5324907898902893, + "learning_rate": 8e-05, + "loss": 1.6597, + "step": 6522 + }, + { + "epoch": 0.3636008918617614, + "grad_norm": 0.5028617978096008, + "learning_rate": 8e-05, + "loss": 1.5938, + "step": 6523 + }, + { + "epoch": 0.3636566332218506, + "grad_norm": 0.556318998336792, + "learning_rate": 8e-05, + "loss": 1.899, + "step": 6524 + }, + { + "epoch": 0.3637123745819398, + "grad_norm": 0.5142127275466919, + "learning_rate": 8e-05, + "loss": 1.6227, + "step": 6525 + }, + { + "epoch": 0.36376811594202896, + "grad_norm": 0.5022664070129395, + "learning_rate": 8e-05, + "loss": 1.677, + "step": 6526 + }, + { + "epoch": 0.3638238573021182, + "grad_norm": 0.4740968942642212, + "learning_rate": 8e-05, + "loss": 1.4775, + "step": 6527 + }, + { + "epoch": 0.36387959866220737, 
+ "grad_norm": 0.6123348474502563, + "learning_rate": 8e-05, + "loss": 1.6875, + "step": 6528 + }, + { + "epoch": 0.36393534002229655, + "grad_norm": 0.5256551504135132, + "learning_rate": 8e-05, + "loss": 1.7736, + "step": 6529 + }, + { + "epoch": 0.36399108138238573, + "grad_norm": 0.5395634174346924, + "learning_rate": 8e-05, + "loss": 1.8819, + "step": 6530 + }, + { + "epoch": 0.3640468227424749, + "grad_norm": 0.5305240154266357, + "learning_rate": 8e-05, + "loss": 1.5619, + "step": 6531 + }, + { + "epoch": 0.3641025641025641, + "grad_norm": 0.4834372401237488, + "learning_rate": 8e-05, + "loss": 1.4774, + "step": 6532 + }, + { + "epoch": 0.36415830546265326, + "grad_norm": 0.5600225925445557, + "learning_rate": 8e-05, + "loss": 1.9042, + "step": 6533 + }, + { + "epoch": 0.3642140468227425, + "grad_norm": 0.5327090620994568, + "learning_rate": 8e-05, + "loss": 1.7592, + "step": 6534 + }, + { + "epoch": 0.3642697881828317, + "grad_norm": 0.48773810267448425, + "learning_rate": 8e-05, + "loss": 1.6035, + "step": 6535 + }, + { + "epoch": 0.36432552954292086, + "grad_norm": 0.5857282876968384, + "learning_rate": 8e-05, + "loss": 1.8036, + "step": 6536 + }, + { + "epoch": 0.36438127090301003, + "grad_norm": 0.5390805602073669, + "learning_rate": 8e-05, + "loss": 1.6281, + "step": 6537 + }, + { + "epoch": 0.3644370122630992, + "grad_norm": 0.4638005495071411, + "learning_rate": 8e-05, + "loss": 1.4809, + "step": 6538 + }, + { + "epoch": 0.3644927536231884, + "grad_norm": 0.4900088906288147, + "learning_rate": 8e-05, + "loss": 1.4965, + "step": 6539 + }, + { + "epoch": 0.36454849498327757, + "grad_norm": 0.5323425531387329, + "learning_rate": 8e-05, + "loss": 1.8889, + "step": 6540 + }, + { + "epoch": 0.3646042363433668, + "grad_norm": 0.5697327256202698, + "learning_rate": 8e-05, + "loss": 1.8247, + "step": 6541 + }, + { + "epoch": 0.364659977703456, + "grad_norm": 0.5275499224662781, + "learning_rate": 8e-05, + "loss": 1.6131, + "step": 6542 + }, + { + "epoch": 
0.36471571906354516, + "grad_norm": 0.5853115320205688, + "learning_rate": 8e-05, + "loss": 1.4321, + "step": 6543 + }, + { + "epoch": 0.36477146042363434, + "grad_norm": 0.5601681470870972, + "learning_rate": 8e-05, + "loss": 1.7446, + "step": 6544 + }, + { + "epoch": 0.3648272017837235, + "grad_norm": 0.5006238222122192, + "learning_rate": 8e-05, + "loss": 1.5411, + "step": 6545 + }, + { + "epoch": 0.3648829431438127, + "grad_norm": 0.5464562177658081, + "learning_rate": 8e-05, + "loss": 1.7087, + "step": 6546 + }, + { + "epoch": 0.3649386845039019, + "grad_norm": 0.5300664901733398, + "learning_rate": 8e-05, + "loss": 1.7823, + "step": 6547 + }, + { + "epoch": 0.36499442586399106, + "grad_norm": 0.5564868450164795, + "learning_rate": 8e-05, + "loss": 2.0165, + "step": 6548 + }, + { + "epoch": 0.3650501672240803, + "grad_norm": 0.574718177318573, + "learning_rate": 8e-05, + "loss": 1.9779, + "step": 6549 + }, + { + "epoch": 0.36510590858416947, + "grad_norm": 0.4962438941001892, + "learning_rate": 8e-05, + "loss": 1.7449, + "step": 6550 + }, + { + "epoch": 0.36516164994425865, + "grad_norm": 0.5202713012695312, + "learning_rate": 8e-05, + "loss": 1.6674, + "step": 6551 + }, + { + "epoch": 0.3652173913043478, + "grad_norm": 0.5091273188591003, + "learning_rate": 8e-05, + "loss": 1.7102, + "step": 6552 + }, + { + "epoch": 0.365273132664437, + "grad_norm": 0.4962243139743805, + "learning_rate": 8e-05, + "loss": 1.7279, + "step": 6553 + }, + { + "epoch": 0.3653288740245262, + "grad_norm": 0.5044227242469788, + "learning_rate": 8e-05, + "loss": 1.8611, + "step": 6554 + }, + { + "epoch": 0.36538461538461536, + "grad_norm": 0.5568208694458008, + "learning_rate": 8e-05, + "loss": 1.5701, + "step": 6555 + }, + { + "epoch": 0.3654403567447046, + "grad_norm": 0.4962807297706604, + "learning_rate": 8e-05, + "loss": 1.5269, + "step": 6556 + }, + { + "epoch": 0.3654960981047938, + "grad_norm": 0.48871636390686035, + "learning_rate": 8e-05, + "loss": 1.6167, + "step": 6557 + }, 
+ { + "epoch": 0.36555183946488295, + "grad_norm": 0.5097386837005615, + "learning_rate": 8e-05, + "loss": 1.7949, + "step": 6558 + }, + { + "epoch": 0.36560758082497213, + "grad_norm": 0.5374859571456909, + "learning_rate": 8e-05, + "loss": 1.64, + "step": 6559 + }, + { + "epoch": 0.3656633221850613, + "grad_norm": 0.5586405396461487, + "learning_rate": 8e-05, + "loss": 1.4684, + "step": 6560 + }, + { + "epoch": 0.3657190635451505, + "grad_norm": 0.5146247148513794, + "learning_rate": 8e-05, + "loss": 1.5647, + "step": 6561 + }, + { + "epoch": 0.36577480490523967, + "grad_norm": 0.5160596966743469, + "learning_rate": 8e-05, + "loss": 1.5378, + "step": 6562 + }, + { + "epoch": 0.36583054626532885, + "grad_norm": 0.5124092102050781, + "learning_rate": 8e-05, + "loss": 1.5913, + "step": 6563 + }, + { + "epoch": 0.3658862876254181, + "grad_norm": 0.5166916251182556, + "learning_rate": 8e-05, + "loss": 1.6796, + "step": 6564 + }, + { + "epoch": 0.36594202898550726, + "grad_norm": 0.512089729309082, + "learning_rate": 8e-05, + "loss": 1.5568, + "step": 6565 + }, + { + "epoch": 0.36599777034559644, + "grad_norm": 0.48678961396217346, + "learning_rate": 8e-05, + "loss": 1.5374, + "step": 6566 + }, + { + "epoch": 0.3660535117056856, + "grad_norm": 0.5077835321426392, + "learning_rate": 8e-05, + "loss": 1.695, + "step": 6567 + }, + { + "epoch": 0.3661092530657748, + "grad_norm": 0.5288443565368652, + "learning_rate": 8e-05, + "loss": 1.7829, + "step": 6568 + }, + { + "epoch": 0.366164994425864, + "grad_norm": 0.5403035283088684, + "learning_rate": 8e-05, + "loss": 1.7271, + "step": 6569 + }, + { + "epoch": 0.36622073578595316, + "grad_norm": 0.5623093247413635, + "learning_rate": 8e-05, + "loss": 1.862, + "step": 6570 + }, + { + "epoch": 0.3662764771460424, + "grad_norm": 0.5390522480010986, + "learning_rate": 8e-05, + "loss": 1.693, + "step": 6571 + }, + { + "epoch": 0.36633221850613157, + "grad_norm": 0.5267409086227417, + "learning_rate": 8e-05, + "loss": 1.5246, + 
"step": 6572 + }, + { + "epoch": 0.36638795986622075, + "grad_norm": 0.507342517375946, + "learning_rate": 8e-05, + "loss": 1.6128, + "step": 6573 + }, + { + "epoch": 0.3664437012263099, + "grad_norm": 0.5371213555335999, + "learning_rate": 8e-05, + "loss": 1.6659, + "step": 6574 + }, + { + "epoch": 0.3664994425863991, + "grad_norm": 0.5376846194267273, + "learning_rate": 8e-05, + "loss": 1.7991, + "step": 6575 + }, + { + "epoch": 0.3665551839464883, + "grad_norm": 0.5879801511764526, + "learning_rate": 8e-05, + "loss": 1.9445, + "step": 6576 + }, + { + "epoch": 0.36661092530657746, + "grad_norm": 0.5080875754356384, + "learning_rate": 8e-05, + "loss": 1.4541, + "step": 6577 + }, + { + "epoch": 0.36666666666666664, + "grad_norm": 0.49109145998954773, + "learning_rate": 8e-05, + "loss": 1.5978, + "step": 6578 + }, + { + "epoch": 0.3667224080267559, + "grad_norm": 0.5372000932693481, + "learning_rate": 8e-05, + "loss": 1.7174, + "step": 6579 + }, + { + "epoch": 0.36677814938684505, + "grad_norm": 0.5064409375190735, + "learning_rate": 8e-05, + "loss": 1.6052, + "step": 6580 + }, + { + "epoch": 0.36683389074693423, + "grad_norm": 0.4575348496437073, + "learning_rate": 8e-05, + "loss": 1.4182, + "step": 6581 + }, + { + "epoch": 0.3668896321070234, + "grad_norm": 0.5588882565498352, + "learning_rate": 8e-05, + "loss": 1.8103, + "step": 6582 + }, + { + "epoch": 0.3669453734671126, + "grad_norm": 0.5252819657325745, + "learning_rate": 8e-05, + "loss": 1.7079, + "step": 6583 + }, + { + "epoch": 0.36700111482720177, + "grad_norm": 0.5528586506843567, + "learning_rate": 8e-05, + "loss": 1.864, + "step": 6584 + }, + { + "epoch": 0.36705685618729095, + "grad_norm": 0.5005204081535339, + "learning_rate": 8e-05, + "loss": 1.5407, + "step": 6585 + }, + { + "epoch": 0.3671125975473802, + "grad_norm": 0.5155864953994751, + "learning_rate": 8e-05, + "loss": 1.7737, + "step": 6586 + }, + { + "epoch": 0.36716833890746936, + "grad_norm": 0.5482629537582397, + "learning_rate": 8e-05, + 
"loss": 1.7748, + "step": 6587 + }, + { + "epoch": 0.36722408026755854, + "grad_norm": 0.5585765838623047, + "learning_rate": 8e-05, + "loss": 1.7237, + "step": 6588 + }, + { + "epoch": 0.3672798216276477, + "grad_norm": 0.49062028527259827, + "learning_rate": 8e-05, + "loss": 1.5748, + "step": 6589 + }, + { + "epoch": 0.3673355629877369, + "grad_norm": 0.5338550209999084, + "learning_rate": 8e-05, + "loss": 1.7891, + "step": 6590 + }, + { + "epoch": 0.3673913043478261, + "grad_norm": 0.5884079337120056, + "learning_rate": 8e-05, + "loss": 1.6608, + "step": 6591 + }, + { + "epoch": 0.36744704570791525, + "grad_norm": 0.5195145606994629, + "learning_rate": 8e-05, + "loss": 1.7409, + "step": 6592 + }, + { + "epoch": 0.36750278706800443, + "grad_norm": 0.5180572271347046, + "learning_rate": 8e-05, + "loss": 1.6582, + "step": 6593 + }, + { + "epoch": 0.36755852842809367, + "grad_norm": 0.5171594619750977, + "learning_rate": 8e-05, + "loss": 1.7377, + "step": 6594 + }, + { + "epoch": 0.36761426978818285, + "grad_norm": 0.5494788289070129, + "learning_rate": 8e-05, + "loss": 2.0252, + "step": 6595 + }, + { + "epoch": 0.367670011148272, + "grad_norm": 0.5018215179443359, + "learning_rate": 8e-05, + "loss": 1.7187, + "step": 6596 + }, + { + "epoch": 0.3677257525083612, + "grad_norm": 0.5216608047485352, + "learning_rate": 8e-05, + "loss": 1.6828, + "step": 6597 + }, + { + "epoch": 0.3677814938684504, + "grad_norm": 0.4886898100376129, + "learning_rate": 8e-05, + "loss": 1.695, + "step": 6598 + }, + { + "epoch": 0.36783723522853956, + "grad_norm": 0.5049815773963928, + "learning_rate": 8e-05, + "loss": 1.6745, + "step": 6599 + }, + { + "epoch": 0.36789297658862874, + "grad_norm": 0.51988685131073, + "learning_rate": 8e-05, + "loss": 1.6152, + "step": 6600 + }, + { + "epoch": 0.367948717948718, + "grad_norm": 0.4952639043331146, + "learning_rate": 8e-05, + "loss": 1.5671, + "step": 6601 + }, + { + "epoch": 0.36800445930880715, + "grad_norm": 0.5094792246818542, + 
"learning_rate": 8e-05, + "loss": 1.6354, + "step": 6602 + }, + { + "epoch": 0.36806020066889633, + "grad_norm": 0.5375295877456665, + "learning_rate": 8e-05, + "loss": 1.5954, + "step": 6603 + }, + { + "epoch": 0.3681159420289855, + "grad_norm": 0.5196813941001892, + "learning_rate": 8e-05, + "loss": 1.5596, + "step": 6604 + }, + { + "epoch": 0.3681716833890747, + "grad_norm": 0.524641752243042, + "learning_rate": 8e-05, + "loss": 1.6315, + "step": 6605 + }, + { + "epoch": 0.36822742474916387, + "grad_norm": 0.5774500370025635, + "learning_rate": 8e-05, + "loss": 1.7054, + "step": 6606 + }, + { + "epoch": 0.36828316610925305, + "grad_norm": 0.5841911435127258, + "learning_rate": 8e-05, + "loss": 1.9424, + "step": 6607 + }, + { + "epoch": 0.3683389074693422, + "grad_norm": 0.5551748871803284, + "learning_rate": 8e-05, + "loss": 1.7258, + "step": 6608 + }, + { + "epoch": 0.36839464882943146, + "grad_norm": 0.5234334468841553, + "learning_rate": 8e-05, + "loss": 1.866, + "step": 6609 + }, + { + "epoch": 0.36845039018952064, + "grad_norm": 0.48736587166786194, + "learning_rate": 8e-05, + "loss": 1.3496, + "step": 6610 + }, + { + "epoch": 0.3685061315496098, + "grad_norm": 0.5116720795631409, + "learning_rate": 8e-05, + "loss": 1.8454, + "step": 6611 + }, + { + "epoch": 0.368561872909699, + "grad_norm": 0.4924512803554535, + "learning_rate": 8e-05, + "loss": 1.6009, + "step": 6612 + }, + { + "epoch": 0.3686176142697882, + "grad_norm": 0.49612775444984436, + "learning_rate": 8e-05, + "loss": 1.5169, + "step": 6613 + }, + { + "epoch": 0.36867335562987735, + "grad_norm": 0.5605433583259583, + "learning_rate": 8e-05, + "loss": 1.6678, + "step": 6614 + }, + { + "epoch": 0.36872909698996653, + "grad_norm": 0.5453359484672546, + "learning_rate": 8e-05, + "loss": 1.7332, + "step": 6615 + }, + { + "epoch": 0.36878483835005577, + "grad_norm": 0.507093608379364, + "learning_rate": 8e-05, + "loss": 1.7661, + "step": 6616 + }, + { + "epoch": 0.36884057971014494, + "grad_norm": 
0.5282663106918335, + "learning_rate": 8e-05, + "loss": 1.7891, + "step": 6617 + }, + { + "epoch": 0.3688963210702341, + "grad_norm": 0.5238633155822754, + "learning_rate": 8e-05, + "loss": 1.7113, + "step": 6618 + }, + { + "epoch": 0.3689520624303233, + "grad_norm": 0.5179184079170227, + "learning_rate": 8e-05, + "loss": 1.7173, + "step": 6619 + }, + { + "epoch": 0.3690078037904125, + "grad_norm": 0.5235730409622192, + "learning_rate": 8e-05, + "loss": 1.6982, + "step": 6620 + }, + { + "epoch": 0.36906354515050166, + "grad_norm": 0.49404484033584595, + "learning_rate": 8e-05, + "loss": 1.5118, + "step": 6621 + }, + { + "epoch": 0.36911928651059084, + "grad_norm": 0.5173282623291016, + "learning_rate": 8e-05, + "loss": 1.6177, + "step": 6622 + }, + { + "epoch": 0.36917502787068, + "grad_norm": 0.5297023057937622, + "learning_rate": 8e-05, + "loss": 1.7344, + "step": 6623 + }, + { + "epoch": 0.36923076923076925, + "grad_norm": 0.512212872505188, + "learning_rate": 8e-05, + "loss": 1.5957, + "step": 6624 + }, + { + "epoch": 0.36928651059085843, + "grad_norm": 0.504366934299469, + "learning_rate": 8e-05, + "loss": 1.672, + "step": 6625 + }, + { + "epoch": 0.3693422519509476, + "grad_norm": 0.5258657932281494, + "learning_rate": 8e-05, + "loss": 1.5942, + "step": 6626 + }, + { + "epoch": 0.3693979933110368, + "grad_norm": 0.5346935987472534, + "learning_rate": 8e-05, + "loss": 1.6521, + "step": 6627 + }, + { + "epoch": 0.36945373467112597, + "grad_norm": 0.4969017505645752, + "learning_rate": 8e-05, + "loss": 1.6157, + "step": 6628 + }, + { + "epoch": 0.36950947603121514, + "grad_norm": 0.5460304021835327, + "learning_rate": 8e-05, + "loss": 1.6787, + "step": 6629 + }, + { + "epoch": 0.3695652173913043, + "grad_norm": 0.4940931797027588, + "learning_rate": 8e-05, + "loss": 1.6568, + "step": 6630 + }, + { + "epoch": 0.36962095875139356, + "grad_norm": 0.5654184818267822, + "learning_rate": 8e-05, + "loss": 1.7727, + "step": 6631 + }, + { + "epoch": 0.36967670011148274, 
+ "grad_norm": 0.48522743582725525, + "learning_rate": 8e-05, + "loss": 1.4585, + "step": 6632 + }, + { + "epoch": 0.3697324414715719, + "grad_norm": 0.5106379985809326, + "learning_rate": 8e-05, + "loss": 1.5965, + "step": 6633 + }, + { + "epoch": 0.3697881828316611, + "grad_norm": 0.5287076830863953, + "learning_rate": 8e-05, + "loss": 1.6599, + "step": 6634 + }, + { + "epoch": 0.3698439241917503, + "grad_norm": 0.4883654713630676, + "learning_rate": 8e-05, + "loss": 1.6157, + "step": 6635 + }, + { + "epoch": 0.36989966555183945, + "grad_norm": 0.459191232919693, + "learning_rate": 8e-05, + "loss": 1.3488, + "step": 6636 + }, + { + "epoch": 0.36995540691192863, + "grad_norm": 0.5547154545783997, + "learning_rate": 8e-05, + "loss": 1.8267, + "step": 6637 + }, + { + "epoch": 0.37001114827201786, + "grad_norm": 0.5515210628509521, + "learning_rate": 8e-05, + "loss": 1.7662, + "step": 6638 + }, + { + "epoch": 0.37006688963210704, + "grad_norm": 0.571776270866394, + "learning_rate": 8e-05, + "loss": 1.8165, + "step": 6639 + }, + { + "epoch": 0.3701226309921962, + "grad_norm": 0.4707722067832947, + "learning_rate": 8e-05, + "loss": 1.6566, + "step": 6640 + }, + { + "epoch": 0.3701783723522854, + "grad_norm": 0.5285392999649048, + "learning_rate": 8e-05, + "loss": 1.6576, + "step": 6641 + }, + { + "epoch": 0.3702341137123746, + "grad_norm": 0.49318474531173706, + "learning_rate": 8e-05, + "loss": 1.6507, + "step": 6642 + }, + { + "epoch": 0.37028985507246376, + "grad_norm": 0.5350597500801086, + "learning_rate": 8e-05, + "loss": 1.7359, + "step": 6643 + }, + { + "epoch": 0.37034559643255294, + "grad_norm": 0.561744213104248, + "learning_rate": 8e-05, + "loss": 1.7681, + "step": 6644 + }, + { + "epoch": 0.3704013377926421, + "grad_norm": 0.529123842716217, + "learning_rate": 8e-05, + "loss": 1.6686, + "step": 6645 + }, + { + "epoch": 0.37045707915273135, + "grad_norm": 0.5006415247917175, + "learning_rate": 8e-05, + "loss": 1.4867, + "step": 6646 + }, + { + "epoch": 
0.37051282051282053, + "grad_norm": 0.47411513328552246, + "learning_rate": 8e-05, + "loss": 1.6221, + "step": 6647 + }, + { + "epoch": 0.3705685618729097, + "grad_norm": 0.535148561000824, + "learning_rate": 8e-05, + "loss": 1.4756, + "step": 6648 + }, + { + "epoch": 0.3706243032329989, + "grad_norm": 0.483823299407959, + "learning_rate": 8e-05, + "loss": 1.6542, + "step": 6649 + }, + { + "epoch": 0.37068004459308806, + "grad_norm": 0.5823395252227783, + "learning_rate": 8e-05, + "loss": 1.8614, + "step": 6650 + }, + { + "epoch": 0.37073578595317724, + "grad_norm": 0.4946765899658203, + "learning_rate": 8e-05, + "loss": 1.5182, + "step": 6651 + }, + { + "epoch": 0.3707915273132664, + "grad_norm": 0.5327061414718628, + "learning_rate": 8e-05, + "loss": 1.6711, + "step": 6652 + }, + { + "epoch": 0.37084726867335566, + "grad_norm": 0.49041929841041565, + "learning_rate": 8e-05, + "loss": 1.6707, + "step": 6653 + }, + { + "epoch": 0.37090301003344484, + "grad_norm": 0.5231907367706299, + "learning_rate": 8e-05, + "loss": 1.5303, + "step": 6654 + }, + { + "epoch": 0.370958751393534, + "grad_norm": 0.47933900356292725, + "learning_rate": 8e-05, + "loss": 1.4115, + "step": 6655 + }, + { + "epoch": 0.3710144927536232, + "grad_norm": 0.5437414646148682, + "learning_rate": 8e-05, + "loss": 1.7237, + "step": 6656 + }, + { + "epoch": 0.37107023411371237, + "grad_norm": 0.5546405911445618, + "learning_rate": 8e-05, + "loss": 1.8704, + "step": 6657 + }, + { + "epoch": 0.37112597547380155, + "grad_norm": 0.4913389980792999, + "learning_rate": 8e-05, + "loss": 1.6487, + "step": 6658 + }, + { + "epoch": 0.37118171683389073, + "grad_norm": 0.501367449760437, + "learning_rate": 8e-05, + "loss": 1.6942, + "step": 6659 + }, + { + "epoch": 0.3712374581939799, + "grad_norm": 0.5529482960700989, + "learning_rate": 8e-05, + "loss": 1.336, + "step": 6660 + }, + { + "epoch": 0.37129319955406914, + "grad_norm": 0.5055440664291382, + "learning_rate": 8e-05, + "loss": 1.3953, + "step": 6661 + 
}, + { + "epoch": 0.3713489409141583, + "grad_norm": 0.497901052236557, + "learning_rate": 8e-05, + "loss": 1.5928, + "step": 6662 + }, + { + "epoch": 0.3714046822742475, + "grad_norm": 0.5057888031005859, + "learning_rate": 8e-05, + "loss": 1.7644, + "step": 6663 + }, + { + "epoch": 0.3714604236343367, + "grad_norm": 0.5101109147071838, + "learning_rate": 8e-05, + "loss": 1.5919, + "step": 6664 + }, + { + "epoch": 0.37151616499442586, + "grad_norm": 0.4964488744735718, + "learning_rate": 8e-05, + "loss": 1.6232, + "step": 6665 + }, + { + "epoch": 0.37157190635451504, + "grad_norm": 0.5438522696495056, + "learning_rate": 8e-05, + "loss": 1.8482, + "step": 6666 + }, + { + "epoch": 0.3716276477146042, + "grad_norm": 0.5171390175819397, + "learning_rate": 8e-05, + "loss": 1.5641, + "step": 6667 + }, + { + "epoch": 0.37168338907469345, + "grad_norm": 0.6002838611602783, + "learning_rate": 8e-05, + "loss": 2.0131, + "step": 6668 + }, + { + "epoch": 0.3717391304347826, + "grad_norm": 0.5407944321632385, + "learning_rate": 8e-05, + "loss": 1.7647, + "step": 6669 + }, + { + "epoch": 0.3717948717948718, + "grad_norm": 0.5388062596321106, + "learning_rate": 8e-05, + "loss": 1.7898, + "step": 6670 + }, + { + "epoch": 0.371850613154961, + "grad_norm": 0.49192193150520325, + "learning_rate": 8e-05, + "loss": 1.6744, + "step": 6671 + }, + { + "epoch": 0.37190635451505016, + "grad_norm": 0.5632032752037048, + "learning_rate": 8e-05, + "loss": 1.946, + "step": 6672 + }, + { + "epoch": 0.37196209587513934, + "grad_norm": 0.5415680408477783, + "learning_rate": 8e-05, + "loss": 1.6364, + "step": 6673 + }, + { + "epoch": 0.3720178372352285, + "grad_norm": 0.5188789367675781, + "learning_rate": 8e-05, + "loss": 1.4743, + "step": 6674 + }, + { + "epoch": 0.3720735785953177, + "grad_norm": 0.5327577590942383, + "learning_rate": 8e-05, + "loss": 1.7789, + "step": 6675 + }, + { + "epoch": 0.37212931995540693, + "grad_norm": 0.5899271368980408, + "learning_rate": 8e-05, + "loss": 1.9106, + 
"step": 6676 + }, + { + "epoch": 0.3721850613154961, + "grad_norm": 0.5051175355911255, + "learning_rate": 8e-05, + "loss": 1.6377, + "step": 6677 + }, + { + "epoch": 0.3722408026755853, + "grad_norm": 0.5093581676483154, + "learning_rate": 8e-05, + "loss": 1.6424, + "step": 6678 + }, + { + "epoch": 0.37229654403567447, + "grad_norm": 0.5512845516204834, + "learning_rate": 8e-05, + "loss": 1.6879, + "step": 6679 + }, + { + "epoch": 0.37235228539576365, + "grad_norm": 0.5061362385749817, + "learning_rate": 8e-05, + "loss": 1.9053, + "step": 6680 + }, + { + "epoch": 0.37240802675585283, + "grad_norm": 0.48424118757247925, + "learning_rate": 8e-05, + "loss": 1.3633, + "step": 6681 + }, + { + "epoch": 0.372463768115942, + "grad_norm": 0.5293522477149963, + "learning_rate": 8e-05, + "loss": 1.6503, + "step": 6682 + }, + { + "epoch": 0.37251950947603124, + "grad_norm": 0.4804575443267822, + "learning_rate": 8e-05, + "loss": 1.3826, + "step": 6683 + }, + { + "epoch": 0.3725752508361204, + "grad_norm": 0.5213553309440613, + "learning_rate": 8e-05, + "loss": 1.9151, + "step": 6684 + }, + { + "epoch": 0.3726309921962096, + "grad_norm": 0.5248057842254639, + "learning_rate": 8e-05, + "loss": 1.8208, + "step": 6685 + }, + { + "epoch": 0.3726867335562988, + "grad_norm": 0.6494954824447632, + "learning_rate": 8e-05, + "loss": 1.5739, + "step": 6686 + }, + { + "epoch": 0.37274247491638796, + "grad_norm": 0.5033650994300842, + "learning_rate": 8e-05, + "loss": 1.5725, + "step": 6687 + }, + { + "epoch": 0.37279821627647713, + "grad_norm": 0.5093197226524353, + "learning_rate": 8e-05, + "loss": 1.7493, + "step": 6688 + }, + { + "epoch": 0.3728539576365663, + "grad_norm": 0.485580712556839, + "learning_rate": 8e-05, + "loss": 1.5371, + "step": 6689 + }, + { + "epoch": 0.3729096989966555, + "grad_norm": 0.5913149118423462, + "learning_rate": 8e-05, + "loss": 1.7906, + "step": 6690 + }, + { + "epoch": 0.3729654403567447, + "grad_norm": 0.5249662399291992, + "learning_rate": 8e-05, + 
"loss": 1.6659, + "step": 6691 + }, + { + "epoch": 0.3730211817168339, + "grad_norm": 0.5243616700172424, + "learning_rate": 8e-05, + "loss": 1.6433, + "step": 6692 + }, + { + "epoch": 0.3730769230769231, + "grad_norm": 0.5441550612449646, + "learning_rate": 8e-05, + "loss": 1.7376, + "step": 6693 + }, + { + "epoch": 0.37313266443701226, + "grad_norm": 0.5584784746170044, + "learning_rate": 8e-05, + "loss": 1.834, + "step": 6694 + }, + { + "epoch": 0.37318840579710144, + "grad_norm": 0.5328128933906555, + "learning_rate": 8e-05, + "loss": 1.7491, + "step": 6695 + }, + { + "epoch": 0.3732441471571906, + "grad_norm": 0.5425513982772827, + "learning_rate": 8e-05, + "loss": 1.8401, + "step": 6696 + }, + { + "epoch": 0.3732998885172798, + "grad_norm": 0.5354945063591003, + "learning_rate": 8e-05, + "loss": 1.626, + "step": 6697 + }, + { + "epoch": 0.37335562987736903, + "grad_norm": 0.5174336433410645, + "learning_rate": 8e-05, + "loss": 1.6046, + "step": 6698 + }, + { + "epoch": 0.3734113712374582, + "grad_norm": 0.6389894485473633, + "learning_rate": 8e-05, + "loss": 1.9224, + "step": 6699 + }, + { + "epoch": 0.3734671125975474, + "grad_norm": 0.4668770134449005, + "learning_rate": 8e-05, + "loss": 1.4228, + "step": 6700 + }, + { + "epoch": 0.37352285395763657, + "grad_norm": 0.46430280804634094, + "learning_rate": 8e-05, + "loss": 1.585, + "step": 6701 + }, + { + "epoch": 0.37357859531772575, + "grad_norm": 0.5275490880012512, + "learning_rate": 8e-05, + "loss": 1.7373, + "step": 6702 + }, + { + "epoch": 0.3736343366778149, + "grad_norm": 0.5056618452072144, + "learning_rate": 8e-05, + "loss": 1.6823, + "step": 6703 + }, + { + "epoch": 0.3736900780379041, + "grad_norm": 0.5731016993522644, + "learning_rate": 8e-05, + "loss": 1.8575, + "step": 6704 + }, + { + "epoch": 0.3737458193979933, + "grad_norm": 0.5207464098930359, + "learning_rate": 8e-05, + "loss": 1.5405, + "step": 6705 + }, + { + "epoch": 0.3738015607580825, + "grad_norm": 0.49173352122306824, + 
"learning_rate": 8e-05, + "loss": 1.4675, + "step": 6706 + }, + { + "epoch": 0.3738573021181717, + "grad_norm": 0.5099393129348755, + "learning_rate": 8e-05, + "loss": 1.582, + "step": 6707 + }, + { + "epoch": 0.3739130434782609, + "grad_norm": 0.5217099785804749, + "learning_rate": 8e-05, + "loss": 1.6141, + "step": 6708 + }, + { + "epoch": 0.37396878483835005, + "grad_norm": 0.5156376957893372, + "learning_rate": 8e-05, + "loss": 1.8233, + "step": 6709 + }, + { + "epoch": 0.37402452619843923, + "grad_norm": 0.4843328893184662, + "learning_rate": 8e-05, + "loss": 1.3098, + "step": 6710 + }, + { + "epoch": 0.3740802675585284, + "grad_norm": 0.5049893856048584, + "learning_rate": 8e-05, + "loss": 1.5087, + "step": 6711 + }, + { + "epoch": 0.3741360089186176, + "grad_norm": 0.5323360562324524, + "learning_rate": 8e-05, + "loss": 1.8198, + "step": 6712 + }, + { + "epoch": 0.3741917502787068, + "grad_norm": 0.5711098313331604, + "learning_rate": 8e-05, + "loss": 1.5649, + "step": 6713 + }, + { + "epoch": 0.374247491638796, + "grad_norm": 0.538690984249115, + "learning_rate": 8e-05, + "loss": 1.4958, + "step": 6714 + }, + { + "epoch": 0.3743032329988852, + "grad_norm": 0.5207656621932983, + "learning_rate": 8e-05, + "loss": 1.5492, + "step": 6715 + }, + { + "epoch": 0.37435897435897436, + "grad_norm": 0.5162418484687805, + "learning_rate": 8e-05, + "loss": 1.695, + "step": 6716 + }, + { + "epoch": 0.37441471571906354, + "grad_norm": 0.5555497407913208, + "learning_rate": 8e-05, + "loss": 1.7426, + "step": 6717 + }, + { + "epoch": 0.3744704570791527, + "grad_norm": 0.5439112782478333, + "learning_rate": 8e-05, + "loss": 1.6575, + "step": 6718 + }, + { + "epoch": 0.3745261984392419, + "grad_norm": 0.5380793809890747, + "learning_rate": 8e-05, + "loss": 1.7351, + "step": 6719 + }, + { + "epoch": 0.3745819397993311, + "grad_norm": 0.5121132731437683, + "learning_rate": 8e-05, + "loss": 1.544, + "step": 6720 + }, + { + "epoch": 0.3746376811594203, + "grad_norm": 
0.5033418536186218, + "learning_rate": 8e-05, + "loss": 1.544, + "step": 6721 + }, + { + "epoch": 0.3746934225195095, + "grad_norm": 0.5126864314079285, + "learning_rate": 8e-05, + "loss": 1.6579, + "step": 6722 + }, + { + "epoch": 0.37474916387959867, + "grad_norm": 0.5681543350219727, + "learning_rate": 8e-05, + "loss": 1.7232, + "step": 6723 + }, + { + "epoch": 0.37480490523968785, + "grad_norm": 0.5577385425567627, + "learning_rate": 8e-05, + "loss": 1.5906, + "step": 6724 + }, + { + "epoch": 0.374860646599777, + "grad_norm": 0.5277745723724365, + "learning_rate": 8e-05, + "loss": 1.6748, + "step": 6725 + }, + { + "epoch": 0.3749163879598662, + "grad_norm": 0.5073549747467041, + "learning_rate": 8e-05, + "loss": 1.3841, + "step": 6726 + }, + { + "epoch": 0.3749721293199554, + "grad_norm": 0.5121777653694153, + "learning_rate": 8e-05, + "loss": 1.6002, + "step": 6727 + }, + { + "epoch": 0.3750278706800446, + "grad_norm": 0.5338273644447327, + "learning_rate": 8e-05, + "loss": 1.6037, + "step": 6728 + }, + { + "epoch": 0.3750836120401338, + "grad_norm": 0.550675630569458, + "learning_rate": 8e-05, + "loss": 1.7317, + "step": 6729 + }, + { + "epoch": 0.375139353400223, + "grad_norm": 0.5400722026824951, + "learning_rate": 8e-05, + "loss": 1.8084, + "step": 6730 + }, + { + "epoch": 0.37519509476031215, + "grad_norm": 0.5145625472068787, + "learning_rate": 8e-05, + "loss": 1.4093, + "step": 6731 + }, + { + "epoch": 0.37525083612040133, + "grad_norm": 0.6066479086875916, + "learning_rate": 8e-05, + "loss": 1.9785, + "step": 6732 + }, + { + "epoch": 0.3753065774804905, + "grad_norm": 0.5051720142364502, + "learning_rate": 8e-05, + "loss": 1.7888, + "step": 6733 + }, + { + "epoch": 0.3753623188405797, + "grad_norm": 0.5142165422439575, + "learning_rate": 8e-05, + "loss": 1.6872, + "step": 6734 + }, + { + "epoch": 0.3754180602006689, + "grad_norm": 0.5311247706413269, + "learning_rate": 8e-05, + "loss": 1.6527, + "step": 6735 + }, + { + "epoch": 0.3754738015607581, + 
"grad_norm": 0.5546833872795105, + "learning_rate": 8e-05, + "loss": 1.8813, + "step": 6736 + }, + { + "epoch": 0.3755295429208473, + "grad_norm": 0.5479986071586609, + "learning_rate": 8e-05, + "loss": 1.6774, + "step": 6737 + }, + { + "epoch": 0.37558528428093646, + "grad_norm": 0.5656196475028992, + "learning_rate": 8e-05, + "loss": 1.9011, + "step": 6738 + }, + { + "epoch": 0.37564102564102564, + "grad_norm": 0.5137097239494324, + "learning_rate": 8e-05, + "loss": 1.7263, + "step": 6739 + }, + { + "epoch": 0.3756967670011148, + "grad_norm": 0.5318892598152161, + "learning_rate": 8e-05, + "loss": 1.75, + "step": 6740 + }, + { + "epoch": 0.375752508361204, + "grad_norm": 0.5124412178993225, + "learning_rate": 8e-05, + "loss": 1.6559, + "step": 6741 + }, + { + "epoch": 0.3758082497212932, + "grad_norm": 0.5239470601081848, + "learning_rate": 8e-05, + "loss": 1.7523, + "step": 6742 + }, + { + "epoch": 0.3758639910813824, + "grad_norm": 0.5155774354934692, + "learning_rate": 8e-05, + "loss": 1.7701, + "step": 6743 + }, + { + "epoch": 0.3759197324414716, + "grad_norm": 0.4978020191192627, + "learning_rate": 8e-05, + "loss": 1.5988, + "step": 6744 + }, + { + "epoch": 0.37597547380156077, + "grad_norm": 0.4940713047981262, + "learning_rate": 8e-05, + "loss": 1.78, + "step": 6745 + }, + { + "epoch": 0.37603121516164995, + "grad_norm": 0.4915592670440674, + "learning_rate": 8e-05, + "loss": 1.6901, + "step": 6746 + }, + { + "epoch": 0.3760869565217391, + "grad_norm": 0.5367061495780945, + "learning_rate": 8e-05, + "loss": 1.5299, + "step": 6747 + }, + { + "epoch": 0.3761426978818283, + "grad_norm": 0.5281707644462585, + "learning_rate": 8e-05, + "loss": 1.6676, + "step": 6748 + }, + { + "epoch": 0.3761984392419175, + "grad_norm": 0.5502864122390747, + "learning_rate": 8e-05, + "loss": 1.7959, + "step": 6749 + }, + { + "epoch": 0.3762541806020067, + "grad_norm": 0.48569023609161377, + "learning_rate": 8e-05, + "loss": 1.5379, + "step": 6750 + }, + { + "epoch": 
0.3763099219620959, + "grad_norm": 0.5276660919189453, + "learning_rate": 8e-05, + "loss": 1.5918, + "step": 6751 + }, + { + "epoch": 0.3763656633221851, + "grad_norm": 0.5254923701286316, + "learning_rate": 8e-05, + "loss": 1.6608, + "step": 6752 + }, + { + "epoch": 0.37642140468227425, + "grad_norm": 0.5019527077674866, + "learning_rate": 8e-05, + "loss": 1.4723, + "step": 6753 + }, + { + "epoch": 0.37647714604236343, + "grad_norm": 0.5919958353042603, + "learning_rate": 8e-05, + "loss": 1.8617, + "step": 6754 + }, + { + "epoch": 0.3765328874024526, + "grad_norm": 0.5578399300575256, + "learning_rate": 8e-05, + "loss": 1.8262, + "step": 6755 + }, + { + "epoch": 0.3765886287625418, + "grad_norm": 0.5366002917289734, + "learning_rate": 8e-05, + "loss": 1.7505, + "step": 6756 + }, + { + "epoch": 0.37664437012263097, + "grad_norm": 0.5503619909286499, + "learning_rate": 8e-05, + "loss": 1.7405, + "step": 6757 + }, + { + "epoch": 0.3767001114827202, + "grad_norm": 0.5201141238212585, + "learning_rate": 8e-05, + "loss": 1.8242, + "step": 6758 + }, + { + "epoch": 0.3767558528428094, + "grad_norm": 0.5590203404426575, + "learning_rate": 8e-05, + "loss": 1.7174, + "step": 6759 + }, + { + "epoch": 0.37681159420289856, + "grad_norm": 0.5271986722946167, + "learning_rate": 8e-05, + "loss": 1.7366, + "step": 6760 + }, + { + "epoch": 0.37686733556298774, + "grad_norm": 0.49543046951293945, + "learning_rate": 8e-05, + "loss": 1.5968, + "step": 6761 + }, + { + "epoch": 0.3769230769230769, + "grad_norm": 0.4976285398006439, + "learning_rate": 8e-05, + "loss": 1.7351, + "step": 6762 + }, + { + "epoch": 0.3769788182831661, + "grad_norm": 0.49661460518836975, + "learning_rate": 8e-05, + "loss": 1.4948, + "step": 6763 + }, + { + "epoch": 0.3770345596432553, + "grad_norm": 0.5244497656822205, + "learning_rate": 8e-05, + "loss": 1.8646, + "step": 6764 + }, + { + "epoch": 0.3770903010033445, + "grad_norm": 0.5240281224250793, + "learning_rate": 8e-05, + "loss": 1.8795, + "step": 6765 + 
}, + { + "epoch": 0.3771460423634337, + "grad_norm": 0.5917136073112488, + "learning_rate": 8e-05, + "loss": 2.0025, + "step": 6766 + }, + { + "epoch": 0.37720178372352287, + "grad_norm": 0.5583435893058777, + "learning_rate": 8e-05, + "loss": 1.9267, + "step": 6767 + }, + { + "epoch": 0.37725752508361204, + "grad_norm": 0.5000171065330505, + "learning_rate": 8e-05, + "loss": 1.6172, + "step": 6768 + }, + { + "epoch": 0.3773132664437012, + "grad_norm": 0.5241192579269409, + "learning_rate": 8e-05, + "loss": 1.5843, + "step": 6769 + }, + { + "epoch": 0.3773690078037904, + "grad_norm": 0.5357404947280884, + "learning_rate": 8e-05, + "loss": 1.8782, + "step": 6770 + }, + { + "epoch": 0.3774247491638796, + "grad_norm": 0.5429965257644653, + "learning_rate": 8e-05, + "loss": 1.7346, + "step": 6771 + }, + { + "epoch": 0.37748049052396876, + "grad_norm": 0.5275600552558899, + "learning_rate": 8e-05, + "loss": 1.603, + "step": 6772 + }, + { + "epoch": 0.377536231884058, + "grad_norm": 0.49835145473480225, + "learning_rate": 8e-05, + "loss": 1.8675, + "step": 6773 + }, + { + "epoch": 0.37759197324414717, + "grad_norm": 0.5513197183609009, + "learning_rate": 8e-05, + "loss": 1.905, + "step": 6774 + }, + { + "epoch": 0.37764771460423635, + "grad_norm": 0.48942214250564575, + "learning_rate": 8e-05, + "loss": 1.6214, + "step": 6775 + }, + { + "epoch": 0.37770345596432553, + "grad_norm": 0.45932549238204956, + "learning_rate": 8e-05, + "loss": 1.369, + "step": 6776 + }, + { + "epoch": 0.3777591973244147, + "grad_norm": 0.7865234613418579, + "learning_rate": 8e-05, + "loss": 1.7867, + "step": 6777 + }, + { + "epoch": 0.3778149386845039, + "grad_norm": 0.5142460465431213, + "learning_rate": 8e-05, + "loss": 1.7239, + "step": 6778 + }, + { + "epoch": 0.37787068004459307, + "grad_norm": 0.5079149007797241, + "learning_rate": 8e-05, + "loss": 1.6643, + "step": 6779 + }, + { + "epoch": 0.3779264214046823, + "grad_norm": 0.518399715423584, + "learning_rate": 8e-05, + "loss": 1.57, + 
"step": 6780 + }, + { + "epoch": 0.3779821627647715, + "grad_norm": 0.5325887799263, + "learning_rate": 8e-05, + "loss": 1.7435, + "step": 6781 + }, + { + "epoch": 0.37803790412486066, + "grad_norm": 0.5779329538345337, + "learning_rate": 8e-05, + "loss": 1.7795, + "step": 6782 + }, + { + "epoch": 0.37809364548494984, + "grad_norm": 0.5537606477737427, + "learning_rate": 8e-05, + "loss": 1.6176, + "step": 6783 + }, + { + "epoch": 0.378149386845039, + "grad_norm": 0.5910248160362244, + "learning_rate": 8e-05, + "loss": 1.7853, + "step": 6784 + }, + { + "epoch": 0.3782051282051282, + "grad_norm": 0.5300013422966003, + "learning_rate": 8e-05, + "loss": 1.5137, + "step": 6785 + }, + { + "epoch": 0.3782608695652174, + "grad_norm": 0.5802608132362366, + "learning_rate": 8e-05, + "loss": 1.7391, + "step": 6786 + }, + { + "epoch": 0.37831661092530655, + "grad_norm": 0.520869255065918, + "learning_rate": 8e-05, + "loss": 1.6134, + "step": 6787 + }, + { + "epoch": 0.3783723522853958, + "grad_norm": 0.5778328776359558, + "learning_rate": 8e-05, + "loss": 1.8975, + "step": 6788 + }, + { + "epoch": 0.37842809364548496, + "grad_norm": 0.5080073475837708, + "learning_rate": 8e-05, + "loss": 1.5634, + "step": 6789 + }, + { + "epoch": 0.37848383500557414, + "grad_norm": 0.5058872699737549, + "learning_rate": 8e-05, + "loss": 1.8211, + "step": 6790 + }, + { + "epoch": 0.3785395763656633, + "grad_norm": 0.5163450837135315, + "learning_rate": 8e-05, + "loss": 1.7062, + "step": 6791 + }, + { + "epoch": 0.3785953177257525, + "grad_norm": 0.5297355651855469, + "learning_rate": 8e-05, + "loss": 1.715, + "step": 6792 + }, + { + "epoch": 0.3786510590858417, + "grad_norm": 0.5093514919281006, + "learning_rate": 8e-05, + "loss": 1.7468, + "step": 6793 + }, + { + "epoch": 0.37870680044593086, + "grad_norm": 0.5215211510658264, + "learning_rate": 8e-05, + "loss": 1.504, + "step": 6794 + }, + { + "epoch": 0.3787625418060201, + "grad_norm": 0.5218589901924133, + "learning_rate": 8e-05, + "loss": 
1.7833, + "step": 6795 + }, + { + "epoch": 0.37881828316610927, + "grad_norm": 0.5648636817932129, + "learning_rate": 8e-05, + "loss": 1.7638, + "step": 6796 + }, + { + "epoch": 0.37887402452619845, + "grad_norm": 0.5256296992301941, + "learning_rate": 8e-05, + "loss": 1.7571, + "step": 6797 + }, + { + "epoch": 0.37892976588628763, + "grad_norm": 0.4967195391654968, + "learning_rate": 8e-05, + "loss": 1.7674, + "step": 6798 + }, + { + "epoch": 0.3789855072463768, + "grad_norm": 0.5004242658615112, + "learning_rate": 8e-05, + "loss": 1.6802, + "step": 6799 + }, + { + "epoch": 0.379041248606466, + "grad_norm": 0.5254338979721069, + "learning_rate": 8e-05, + "loss": 1.7531, + "step": 6800 + }, + { + "epoch": 0.37909698996655516, + "grad_norm": 0.5258433222770691, + "learning_rate": 8e-05, + "loss": 1.6567, + "step": 6801 + }, + { + "epoch": 0.37915273132664434, + "grad_norm": 0.5207537412643433, + "learning_rate": 8e-05, + "loss": 1.6734, + "step": 6802 + }, + { + "epoch": 0.3792084726867336, + "grad_norm": 0.5069604516029358, + "learning_rate": 8e-05, + "loss": 1.5255, + "step": 6803 + }, + { + "epoch": 0.37926421404682276, + "grad_norm": 0.5184929966926575, + "learning_rate": 8e-05, + "loss": 1.6458, + "step": 6804 + }, + { + "epoch": 0.37931995540691194, + "grad_norm": 0.5532615184783936, + "learning_rate": 8e-05, + "loss": 1.6805, + "step": 6805 + }, + { + "epoch": 0.3793756967670011, + "grad_norm": 0.49667033553123474, + "learning_rate": 8e-05, + "loss": 1.512, + "step": 6806 + }, + { + "epoch": 0.3794314381270903, + "grad_norm": 0.5174059271812439, + "learning_rate": 8e-05, + "loss": 1.6957, + "step": 6807 + }, + { + "epoch": 0.37948717948717947, + "grad_norm": 0.48986855149269104, + "learning_rate": 8e-05, + "loss": 1.5954, + "step": 6808 + }, + { + "epoch": 0.37954292084726865, + "grad_norm": 0.529532790184021, + "learning_rate": 8e-05, + "loss": 1.6124, + "step": 6809 + }, + { + "epoch": 0.3795986622073579, + "grad_norm": 0.564482569694519, + "learning_rate": 
8e-05, + "loss": 1.8692, + "step": 6810 + }, + { + "epoch": 0.37965440356744706, + "grad_norm": 0.5190215706825256, + "learning_rate": 8e-05, + "loss": 1.6718, + "step": 6811 + }, + { + "epoch": 0.37971014492753624, + "grad_norm": 0.5358736515045166, + "learning_rate": 8e-05, + "loss": 1.7521, + "step": 6812 + }, + { + "epoch": 0.3797658862876254, + "grad_norm": 0.5234962701797485, + "learning_rate": 8e-05, + "loss": 1.948, + "step": 6813 + }, + { + "epoch": 0.3798216276477146, + "grad_norm": 0.5307388305664062, + "learning_rate": 8e-05, + "loss": 1.6149, + "step": 6814 + }, + { + "epoch": 0.3798773690078038, + "grad_norm": 0.5353745222091675, + "learning_rate": 8e-05, + "loss": 1.8958, + "step": 6815 + }, + { + "epoch": 0.37993311036789296, + "grad_norm": 0.5111658573150635, + "learning_rate": 8e-05, + "loss": 1.5686, + "step": 6816 + }, + { + "epoch": 0.37998885172798214, + "grad_norm": 0.5570499300956726, + "learning_rate": 8e-05, + "loss": 1.7011, + "step": 6817 + }, + { + "epoch": 0.38004459308807137, + "grad_norm": 0.5320193767547607, + "learning_rate": 8e-05, + "loss": 1.641, + "step": 6818 + }, + { + "epoch": 0.38010033444816055, + "grad_norm": 0.498035192489624, + "learning_rate": 8e-05, + "loss": 1.6247, + "step": 6819 + }, + { + "epoch": 0.3801560758082497, + "grad_norm": 0.5418937802314758, + "learning_rate": 8e-05, + "loss": 1.8077, + "step": 6820 + }, + { + "epoch": 0.3802118171683389, + "grad_norm": 0.5447816252708435, + "learning_rate": 8e-05, + "loss": 1.7445, + "step": 6821 + }, + { + "epoch": 0.3802675585284281, + "grad_norm": 0.535697340965271, + "learning_rate": 8e-05, + "loss": 1.7219, + "step": 6822 + }, + { + "epoch": 0.38032329988851726, + "grad_norm": 0.543074369430542, + "learning_rate": 8e-05, + "loss": 1.6885, + "step": 6823 + }, + { + "epoch": 0.38037904124860644, + "grad_norm": 0.4806436002254486, + "learning_rate": 8e-05, + "loss": 1.5289, + "step": 6824 + }, + { + "epoch": 0.3804347826086957, + "grad_norm": 0.4711361825466156, + 
"learning_rate": 8e-05, + "loss": 1.4881, + "step": 6825 + }, + { + "epoch": 0.38049052396878486, + "grad_norm": 0.5814215540885925, + "learning_rate": 8e-05, + "loss": 1.6463, + "step": 6826 + }, + { + "epoch": 0.38054626532887403, + "grad_norm": 0.5161845684051514, + "learning_rate": 8e-05, + "loss": 1.7158, + "step": 6827 + }, + { + "epoch": 0.3806020066889632, + "grad_norm": 0.5395570993423462, + "learning_rate": 8e-05, + "loss": 1.6931, + "step": 6828 + }, + { + "epoch": 0.3806577480490524, + "grad_norm": 0.47877243161201477, + "learning_rate": 8e-05, + "loss": 1.4561, + "step": 6829 + }, + { + "epoch": 0.38071348940914157, + "grad_norm": 0.5157475471496582, + "learning_rate": 8e-05, + "loss": 1.6233, + "step": 6830 + }, + { + "epoch": 0.38076923076923075, + "grad_norm": 0.47996920347213745, + "learning_rate": 8e-05, + "loss": 1.747, + "step": 6831 + }, + { + "epoch": 0.38082497212932, + "grad_norm": 0.5504299402236938, + "learning_rate": 8e-05, + "loss": 1.8097, + "step": 6832 + }, + { + "epoch": 0.38088071348940916, + "grad_norm": 0.5216795802116394, + "learning_rate": 8e-05, + "loss": 1.6967, + "step": 6833 + }, + { + "epoch": 0.38093645484949834, + "grad_norm": 0.5156157612800598, + "learning_rate": 8e-05, + "loss": 1.6106, + "step": 6834 + }, + { + "epoch": 0.3809921962095875, + "grad_norm": 0.5302389860153198, + "learning_rate": 8e-05, + "loss": 1.6393, + "step": 6835 + }, + { + "epoch": 0.3810479375696767, + "grad_norm": 0.6114341020584106, + "learning_rate": 8e-05, + "loss": 1.7414, + "step": 6836 + }, + { + "epoch": 0.3811036789297659, + "grad_norm": 0.589827835559845, + "learning_rate": 8e-05, + "loss": 1.8041, + "step": 6837 + }, + { + "epoch": 0.38115942028985506, + "grad_norm": 0.5321907997131348, + "learning_rate": 8e-05, + "loss": 1.871, + "step": 6838 + }, + { + "epoch": 0.38121516164994423, + "grad_norm": 0.5287553071975708, + "learning_rate": 8e-05, + "loss": 1.6608, + "step": 6839 + }, + { + "epoch": 0.38127090301003347, + "grad_norm": 
0.5082924962043762, + "learning_rate": 8e-05, + "loss": 1.6053, + "step": 6840 + }, + { + "epoch": 0.38132664437012265, + "grad_norm": 0.5522058010101318, + "learning_rate": 8e-05, + "loss": 2.0149, + "step": 6841 + }, + { + "epoch": 0.3813823857302118, + "grad_norm": 0.5427312850952148, + "learning_rate": 8e-05, + "loss": 1.6895, + "step": 6842 + }, + { + "epoch": 0.381438127090301, + "grad_norm": 0.49162036180496216, + "learning_rate": 8e-05, + "loss": 1.5884, + "step": 6843 + }, + { + "epoch": 0.3814938684503902, + "grad_norm": 0.5044377446174622, + "learning_rate": 8e-05, + "loss": 1.8749, + "step": 6844 + }, + { + "epoch": 0.38154960981047936, + "grad_norm": 0.5048327445983887, + "learning_rate": 8e-05, + "loss": 1.7494, + "step": 6845 + }, + { + "epoch": 0.38160535117056854, + "grad_norm": 0.5744604468345642, + "learning_rate": 8e-05, + "loss": 2.0678, + "step": 6846 + }, + { + "epoch": 0.3816610925306578, + "grad_norm": 0.5219566226005554, + "learning_rate": 8e-05, + "loss": 1.685, + "step": 6847 + }, + { + "epoch": 0.38171683389074695, + "grad_norm": 0.5850833654403687, + "learning_rate": 8e-05, + "loss": 1.8884, + "step": 6848 + }, + { + "epoch": 0.38177257525083613, + "grad_norm": 0.510691225528717, + "learning_rate": 8e-05, + "loss": 1.6582, + "step": 6849 + }, + { + "epoch": 0.3818283166109253, + "grad_norm": 0.5360484719276428, + "learning_rate": 8e-05, + "loss": 1.8495, + "step": 6850 + }, + { + "epoch": 0.3818840579710145, + "grad_norm": 0.5596208572387695, + "learning_rate": 8e-05, + "loss": 1.8184, + "step": 6851 + }, + { + "epoch": 0.38193979933110367, + "grad_norm": 0.5178501009941101, + "learning_rate": 8e-05, + "loss": 1.6066, + "step": 6852 + }, + { + "epoch": 0.38199554069119285, + "grad_norm": 0.535118043422699, + "learning_rate": 8e-05, + "loss": 1.6646, + "step": 6853 + }, + { + "epoch": 0.382051282051282, + "grad_norm": 0.5289649963378906, + "learning_rate": 8e-05, + "loss": 1.8509, + "step": 6854 + }, + { + "epoch": 0.38210702341137126, 
+ "grad_norm": 0.5231328010559082, + "learning_rate": 8e-05, + "loss": 1.6752, + "step": 6855 + }, + { + "epoch": 0.38216276477146044, + "grad_norm": 0.5241426825523376, + "learning_rate": 8e-05, + "loss": 1.6721, + "step": 6856 + }, + { + "epoch": 0.3822185061315496, + "grad_norm": 0.5819863677024841, + "learning_rate": 8e-05, + "loss": 2.0239, + "step": 6857 + }, + { + "epoch": 0.3822742474916388, + "grad_norm": 0.5138490200042725, + "learning_rate": 8e-05, + "loss": 1.6158, + "step": 6858 + }, + { + "epoch": 0.382329988851728, + "grad_norm": 0.5236282348632812, + "learning_rate": 8e-05, + "loss": 1.7532, + "step": 6859 + }, + { + "epoch": 0.38238573021181715, + "grad_norm": 0.503381609916687, + "learning_rate": 8e-05, + "loss": 1.454, + "step": 6860 + }, + { + "epoch": 0.38244147157190633, + "grad_norm": 0.5174334049224854, + "learning_rate": 8e-05, + "loss": 1.5932, + "step": 6861 + }, + { + "epoch": 0.38249721293199557, + "grad_norm": 0.5292040705680847, + "learning_rate": 8e-05, + "loss": 1.7081, + "step": 6862 + }, + { + "epoch": 0.38255295429208475, + "grad_norm": 0.5251057147979736, + "learning_rate": 8e-05, + "loss": 1.8752, + "step": 6863 + }, + { + "epoch": 0.3826086956521739, + "grad_norm": 0.5487580895423889, + "learning_rate": 8e-05, + "loss": 1.8481, + "step": 6864 + }, + { + "epoch": 0.3826644370122631, + "grad_norm": 0.5405415892601013, + "learning_rate": 8e-05, + "loss": 1.9514, + "step": 6865 + }, + { + "epoch": 0.3827201783723523, + "grad_norm": 0.517487645149231, + "learning_rate": 8e-05, + "loss": 1.5871, + "step": 6866 + }, + { + "epoch": 0.38277591973244146, + "grad_norm": 0.5205005407333374, + "learning_rate": 8e-05, + "loss": 1.8664, + "step": 6867 + }, + { + "epoch": 0.38283166109253064, + "grad_norm": 0.5625116229057312, + "learning_rate": 8e-05, + "loss": 1.6272, + "step": 6868 + }, + { + "epoch": 0.3828874024526198, + "grad_norm": 0.5461683869361877, + "learning_rate": 8e-05, + "loss": 1.7246, + "step": 6869 + }, + { + "epoch": 
0.38294314381270905, + "grad_norm": 0.5413379073143005, + "learning_rate": 8e-05, + "loss": 1.7691, + "step": 6870 + }, + { + "epoch": 0.38299888517279823, + "grad_norm": 0.498203843832016, + "learning_rate": 8e-05, + "loss": 1.3738, + "step": 6871 + }, + { + "epoch": 0.3830546265328874, + "grad_norm": 0.5851048827171326, + "learning_rate": 8e-05, + "loss": 1.8351, + "step": 6872 + }, + { + "epoch": 0.3831103678929766, + "grad_norm": 0.693454921245575, + "learning_rate": 8e-05, + "loss": 1.7745, + "step": 6873 + }, + { + "epoch": 0.38316610925306577, + "grad_norm": 0.5207720994949341, + "learning_rate": 8e-05, + "loss": 1.6717, + "step": 6874 + }, + { + "epoch": 0.38322185061315495, + "grad_norm": 0.5102624297142029, + "learning_rate": 8e-05, + "loss": 1.7611, + "step": 6875 + }, + { + "epoch": 0.3832775919732441, + "grad_norm": 0.4984310567378998, + "learning_rate": 8e-05, + "loss": 1.6392, + "step": 6876 + }, + { + "epoch": 0.38333333333333336, + "grad_norm": 0.529915988445282, + "learning_rate": 8e-05, + "loss": 1.8102, + "step": 6877 + }, + { + "epoch": 0.38338907469342254, + "grad_norm": 0.50824373960495, + "learning_rate": 8e-05, + "loss": 1.553, + "step": 6878 + }, + { + "epoch": 0.3834448160535117, + "grad_norm": 0.5229014754295349, + "learning_rate": 8e-05, + "loss": 1.67, + "step": 6879 + }, + { + "epoch": 0.3835005574136009, + "grad_norm": 0.5327182412147522, + "learning_rate": 8e-05, + "loss": 1.8009, + "step": 6880 + }, + { + "epoch": 0.3835562987736901, + "grad_norm": 0.5739976167678833, + "learning_rate": 8e-05, + "loss": 1.5915, + "step": 6881 + }, + { + "epoch": 0.38361204013377925, + "grad_norm": 0.5743573307991028, + "learning_rate": 8e-05, + "loss": 1.8215, + "step": 6882 + }, + { + "epoch": 0.38366778149386843, + "grad_norm": 0.5695149302482605, + "learning_rate": 8e-05, + "loss": 1.8339, + "step": 6883 + }, + { + "epoch": 0.3837235228539576, + "grad_norm": 0.5400159955024719, + "learning_rate": 8e-05, + "loss": 1.6151, + "step": 6884 + }, + { 
+ "epoch": 0.38377926421404684, + "grad_norm": 0.5263397693634033, + "learning_rate": 8e-05, + "loss": 1.7196, + "step": 6885 + }, + { + "epoch": 0.383835005574136, + "grad_norm": 0.4755968451499939, + "learning_rate": 8e-05, + "loss": 1.6788, + "step": 6886 + }, + { + "epoch": 0.3838907469342252, + "grad_norm": 0.542471706867218, + "learning_rate": 8e-05, + "loss": 1.716, + "step": 6887 + }, + { + "epoch": 0.3839464882943144, + "grad_norm": 0.5046743154525757, + "learning_rate": 8e-05, + "loss": 1.6081, + "step": 6888 + }, + { + "epoch": 0.38400222965440356, + "grad_norm": 0.5250272750854492, + "learning_rate": 8e-05, + "loss": 1.8675, + "step": 6889 + }, + { + "epoch": 0.38405797101449274, + "grad_norm": 0.5886393785476685, + "learning_rate": 8e-05, + "loss": 1.9787, + "step": 6890 + }, + { + "epoch": 0.3841137123745819, + "grad_norm": 0.5748692154884338, + "learning_rate": 8e-05, + "loss": 1.7817, + "step": 6891 + }, + { + "epoch": 0.38416945373467115, + "grad_norm": 0.5253850817680359, + "learning_rate": 8e-05, + "loss": 1.535, + "step": 6892 + }, + { + "epoch": 0.38422519509476033, + "grad_norm": 0.5223969221115112, + "learning_rate": 8e-05, + "loss": 1.6191, + "step": 6893 + }, + { + "epoch": 0.3842809364548495, + "grad_norm": 0.5565198659896851, + "learning_rate": 8e-05, + "loss": 1.7, + "step": 6894 + }, + { + "epoch": 0.3843366778149387, + "grad_norm": 0.5230225324630737, + "learning_rate": 8e-05, + "loss": 1.6912, + "step": 6895 + }, + { + "epoch": 0.38439241917502787, + "grad_norm": 0.4867858290672302, + "learning_rate": 8e-05, + "loss": 1.4106, + "step": 6896 + }, + { + "epoch": 0.38444816053511705, + "grad_norm": 0.5560249090194702, + "learning_rate": 8e-05, + "loss": 1.78, + "step": 6897 + }, + { + "epoch": 0.3845039018952062, + "grad_norm": 0.5637099742889404, + "learning_rate": 8e-05, + "loss": 1.8057, + "step": 6898 + }, + { + "epoch": 0.3845596432552954, + "grad_norm": 0.5312997102737427, + "learning_rate": 8e-05, + "loss": 1.646, + "step": 6899 + 
}, + { + "epoch": 0.38461538461538464, + "grad_norm": 0.516556441783905, + "learning_rate": 8e-05, + "loss": 1.815, + "step": 6900 + }, + { + "epoch": 0.3846711259754738, + "grad_norm": 0.49218010902404785, + "learning_rate": 8e-05, + "loss": 1.5516, + "step": 6901 + }, + { + "epoch": 0.384726867335563, + "grad_norm": 0.5333677530288696, + "learning_rate": 8e-05, + "loss": 1.7512, + "step": 6902 + }, + { + "epoch": 0.3847826086956522, + "grad_norm": 0.5032493472099304, + "learning_rate": 8e-05, + "loss": 1.5828, + "step": 6903 + }, + { + "epoch": 0.38483835005574135, + "grad_norm": 0.5718285441398621, + "learning_rate": 8e-05, + "loss": 1.9966, + "step": 6904 + }, + { + "epoch": 0.38489409141583053, + "grad_norm": 0.5370532274246216, + "learning_rate": 8e-05, + "loss": 1.7849, + "step": 6905 + }, + { + "epoch": 0.3849498327759197, + "grad_norm": 0.5484476089477539, + "learning_rate": 8e-05, + "loss": 1.6092, + "step": 6906 + }, + { + "epoch": 0.38500557413600894, + "grad_norm": 0.5581532120704651, + "learning_rate": 8e-05, + "loss": 1.4118, + "step": 6907 + }, + { + "epoch": 0.3850613154960981, + "grad_norm": 0.47904351353645325, + "learning_rate": 8e-05, + "loss": 1.31, + "step": 6908 + }, + { + "epoch": 0.3851170568561873, + "grad_norm": 0.5527108311653137, + "learning_rate": 8e-05, + "loss": 1.7555, + "step": 6909 + }, + { + "epoch": 0.3851727982162765, + "grad_norm": 0.5678485631942749, + "learning_rate": 8e-05, + "loss": 1.6078, + "step": 6910 + }, + { + "epoch": 0.38522853957636566, + "grad_norm": 0.5740782022476196, + "learning_rate": 8e-05, + "loss": 1.9674, + "step": 6911 + }, + { + "epoch": 0.38528428093645484, + "grad_norm": 0.5345740914344788, + "learning_rate": 8e-05, + "loss": 1.6925, + "step": 6912 + }, + { + "epoch": 0.385340022296544, + "grad_norm": 0.5236703753471375, + "learning_rate": 8e-05, + "loss": 1.6034, + "step": 6913 + }, + { + "epoch": 0.3853957636566332, + "grad_norm": 0.6058480739593506, + "learning_rate": 8e-05, + "loss": 1.6309, + 
"step": 6914 + }, + { + "epoch": 0.38545150501672243, + "grad_norm": 0.5399019122123718, + "learning_rate": 8e-05, + "loss": 1.6592, + "step": 6915 + }, + { + "epoch": 0.3855072463768116, + "grad_norm": 0.5349549651145935, + "learning_rate": 8e-05, + "loss": 1.4853, + "step": 6916 + }, + { + "epoch": 0.3855629877369008, + "grad_norm": 0.5857842564582825, + "learning_rate": 8e-05, + "loss": 1.749, + "step": 6917 + }, + { + "epoch": 0.38561872909698997, + "grad_norm": 0.5716098546981812, + "learning_rate": 8e-05, + "loss": 1.7058, + "step": 6918 + }, + { + "epoch": 0.38567447045707914, + "grad_norm": 0.5078105330467224, + "learning_rate": 8e-05, + "loss": 1.6227, + "step": 6919 + }, + { + "epoch": 0.3857302118171683, + "grad_norm": 0.5407280921936035, + "learning_rate": 8e-05, + "loss": 1.7624, + "step": 6920 + }, + { + "epoch": 0.3857859531772575, + "grad_norm": 0.47967904806137085, + "learning_rate": 8e-05, + "loss": 1.6142, + "step": 6921 + }, + { + "epoch": 0.38584169453734674, + "grad_norm": 0.5285566449165344, + "learning_rate": 8e-05, + "loss": 1.7721, + "step": 6922 + }, + { + "epoch": 0.3858974358974359, + "grad_norm": 0.5036161541938782, + "learning_rate": 8e-05, + "loss": 1.7378, + "step": 6923 + }, + { + "epoch": 0.3859531772575251, + "grad_norm": 0.4978451132774353, + "learning_rate": 8e-05, + "loss": 1.8718, + "step": 6924 + }, + { + "epoch": 0.38600891861761427, + "grad_norm": 0.5122378468513489, + "learning_rate": 8e-05, + "loss": 1.658, + "step": 6925 + }, + { + "epoch": 0.38606465997770345, + "grad_norm": 0.5669604539871216, + "learning_rate": 8e-05, + "loss": 1.9589, + "step": 6926 + }, + { + "epoch": 0.38612040133779263, + "grad_norm": 0.5110654234886169, + "learning_rate": 8e-05, + "loss": 1.6009, + "step": 6927 + }, + { + "epoch": 0.3861761426978818, + "grad_norm": 0.5333113670349121, + "learning_rate": 8e-05, + "loss": 1.6887, + "step": 6928 + }, + { + "epoch": 0.38623188405797104, + "grad_norm": 0.5038242340087891, + "learning_rate": 8e-05, + 
"loss": 1.5244, + "step": 6929 + }, + { + "epoch": 0.3862876254180602, + "grad_norm": 0.5251333713531494, + "learning_rate": 8e-05, + "loss": 1.7132, + "step": 6930 + }, + { + "epoch": 0.3863433667781494, + "grad_norm": 0.5106619000434875, + "learning_rate": 8e-05, + "loss": 1.6036, + "step": 6931 + }, + { + "epoch": 0.3863991081382386, + "grad_norm": 0.5227919816970825, + "learning_rate": 8e-05, + "loss": 1.8909, + "step": 6932 + }, + { + "epoch": 0.38645484949832776, + "grad_norm": 0.5325038433074951, + "learning_rate": 8e-05, + "loss": 1.6279, + "step": 6933 + }, + { + "epoch": 0.38651059085841694, + "grad_norm": 0.5435842275619507, + "learning_rate": 8e-05, + "loss": 1.6574, + "step": 6934 + }, + { + "epoch": 0.3865663322185061, + "grad_norm": 0.5161607265472412, + "learning_rate": 8e-05, + "loss": 1.6595, + "step": 6935 + }, + { + "epoch": 0.3866220735785953, + "grad_norm": 0.5756670236587524, + "learning_rate": 8e-05, + "loss": 1.7544, + "step": 6936 + }, + { + "epoch": 0.3866778149386845, + "grad_norm": 0.5451154112815857, + "learning_rate": 8e-05, + "loss": 1.761, + "step": 6937 + }, + { + "epoch": 0.3867335562987737, + "grad_norm": 0.5060601830482483, + "learning_rate": 8e-05, + "loss": 1.8706, + "step": 6938 + }, + { + "epoch": 0.3867892976588629, + "grad_norm": 0.5163840055465698, + "learning_rate": 8e-05, + "loss": 1.7337, + "step": 6939 + }, + { + "epoch": 0.38684503901895206, + "grad_norm": 0.5261103510856628, + "learning_rate": 8e-05, + "loss": 1.6595, + "step": 6940 + }, + { + "epoch": 0.38690078037904124, + "grad_norm": 0.5704602003097534, + "learning_rate": 8e-05, + "loss": 1.738, + "step": 6941 + }, + { + "epoch": 0.3869565217391304, + "grad_norm": 0.5094172954559326, + "learning_rate": 8e-05, + "loss": 1.7202, + "step": 6942 + }, + { + "epoch": 0.3870122630992196, + "grad_norm": 0.5927731394767761, + "learning_rate": 8e-05, + "loss": 1.9517, + "step": 6943 + }, + { + "epoch": 0.38706800445930883, + "grad_norm": 0.5634392499923706, + 
"learning_rate": 8e-05, + "loss": 1.6567, + "step": 6944 + }, + { + "epoch": 0.387123745819398, + "grad_norm": 0.5229656100273132, + "learning_rate": 8e-05, + "loss": 1.6196, + "step": 6945 + }, + { + "epoch": 0.3871794871794872, + "grad_norm": 0.5949238538742065, + "learning_rate": 8e-05, + "loss": 1.9731, + "step": 6946 + }, + { + "epoch": 0.38723522853957637, + "grad_norm": 0.4674675166606903, + "learning_rate": 8e-05, + "loss": 1.5122, + "step": 6947 + }, + { + "epoch": 0.38729096989966555, + "grad_norm": 0.5145415663719177, + "learning_rate": 8e-05, + "loss": 1.5898, + "step": 6948 + }, + { + "epoch": 0.38734671125975473, + "grad_norm": 0.5329121351242065, + "learning_rate": 8e-05, + "loss": 1.6453, + "step": 6949 + }, + { + "epoch": 0.3874024526198439, + "grad_norm": 0.5534684658050537, + "learning_rate": 8e-05, + "loss": 1.9438, + "step": 6950 + }, + { + "epoch": 0.3874581939799331, + "grad_norm": 0.5507635474205017, + "learning_rate": 8e-05, + "loss": 1.7876, + "step": 6951 + }, + { + "epoch": 0.3875139353400223, + "grad_norm": 0.5376663208007812, + "learning_rate": 8e-05, + "loss": 1.7594, + "step": 6952 + }, + { + "epoch": 0.3875696767001115, + "grad_norm": 0.5363755822181702, + "learning_rate": 8e-05, + "loss": 1.7429, + "step": 6953 + }, + { + "epoch": 0.3876254180602007, + "grad_norm": 0.5718921422958374, + "learning_rate": 8e-05, + "loss": 1.7017, + "step": 6954 + }, + { + "epoch": 0.38768115942028986, + "grad_norm": 0.555400013923645, + "learning_rate": 8e-05, + "loss": 1.7949, + "step": 6955 + }, + { + "epoch": 0.38773690078037903, + "grad_norm": 0.49279558658599854, + "learning_rate": 8e-05, + "loss": 1.5243, + "step": 6956 + }, + { + "epoch": 0.3877926421404682, + "grad_norm": 0.5423111319541931, + "learning_rate": 8e-05, + "loss": 1.48, + "step": 6957 + }, + { + "epoch": 0.3878483835005574, + "grad_norm": 0.4846994876861572, + "learning_rate": 8e-05, + "loss": 1.574, + "step": 6958 + }, + { + "epoch": 0.3879041248606466, + "grad_norm": 
0.5224431157112122, + "learning_rate": 8e-05, + "loss": 1.6236, + "step": 6959 + }, + { + "epoch": 0.3879598662207358, + "grad_norm": 0.5335124135017395, + "learning_rate": 8e-05, + "loss": 1.7444, + "step": 6960 + }, + { + "epoch": 0.388015607580825, + "grad_norm": 0.5206796526908875, + "learning_rate": 8e-05, + "loss": 1.8457, + "step": 6961 + }, + { + "epoch": 0.38807134894091416, + "grad_norm": 0.5847195386886597, + "learning_rate": 8e-05, + "loss": 1.9442, + "step": 6962 + }, + { + "epoch": 0.38812709030100334, + "grad_norm": 0.4814377725124359, + "learning_rate": 8e-05, + "loss": 1.4659, + "step": 6963 + }, + { + "epoch": 0.3881828316610925, + "grad_norm": 0.4975985884666443, + "learning_rate": 8e-05, + "loss": 1.5948, + "step": 6964 + }, + { + "epoch": 0.3882385730211817, + "grad_norm": 0.5406726002693176, + "learning_rate": 8e-05, + "loss": 1.7501, + "step": 6965 + }, + { + "epoch": 0.3882943143812709, + "grad_norm": 0.5482088923454285, + "learning_rate": 8e-05, + "loss": 1.8163, + "step": 6966 + }, + { + "epoch": 0.3883500557413601, + "grad_norm": 0.5207962989807129, + "learning_rate": 8e-05, + "loss": 1.6247, + "step": 6967 + }, + { + "epoch": 0.3884057971014493, + "grad_norm": 0.5449426770210266, + "learning_rate": 8e-05, + "loss": 1.8502, + "step": 6968 + }, + { + "epoch": 0.38846153846153847, + "grad_norm": 0.6138060688972473, + "learning_rate": 8e-05, + "loss": 2.0023, + "step": 6969 + }, + { + "epoch": 0.38851727982162765, + "grad_norm": 0.5436367988586426, + "learning_rate": 8e-05, + "loss": 1.6418, + "step": 6970 + }, + { + "epoch": 0.3885730211817168, + "grad_norm": 0.5104245543479919, + "learning_rate": 8e-05, + "loss": 1.7074, + "step": 6971 + }, + { + "epoch": 0.388628762541806, + "grad_norm": 0.5240477919578552, + "learning_rate": 8e-05, + "loss": 1.7272, + "step": 6972 + }, + { + "epoch": 0.3886845039018952, + "grad_norm": 0.5188844203948975, + "learning_rate": 8e-05, + "loss": 1.6639, + "step": 6973 + }, + { + "epoch": 0.3887402452619844, + 
"grad_norm": 0.5346798300743103, + "learning_rate": 8e-05, + "loss": 1.88, + "step": 6974 + }, + { + "epoch": 0.3887959866220736, + "grad_norm": 0.5370174050331116, + "learning_rate": 8e-05, + "loss": 1.6109, + "step": 6975 + }, + { + "epoch": 0.3888517279821628, + "grad_norm": 0.497453510761261, + "learning_rate": 8e-05, + "loss": 1.4677, + "step": 6976 + }, + { + "epoch": 0.38890746934225195, + "grad_norm": 0.5281513333320618, + "learning_rate": 8e-05, + "loss": 1.7995, + "step": 6977 + }, + { + "epoch": 0.38896321070234113, + "grad_norm": 0.5266953706741333, + "learning_rate": 8e-05, + "loss": 1.7261, + "step": 6978 + }, + { + "epoch": 0.3890189520624303, + "grad_norm": 0.4925101399421692, + "learning_rate": 8e-05, + "loss": 1.6437, + "step": 6979 + }, + { + "epoch": 0.3890746934225195, + "grad_norm": 0.5393596291542053, + "learning_rate": 8e-05, + "loss": 1.7864, + "step": 6980 + }, + { + "epoch": 0.38913043478260867, + "grad_norm": 0.537490963935852, + "learning_rate": 8e-05, + "loss": 1.6231, + "step": 6981 + }, + { + "epoch": 0.3891861761426979, + "grad_norm": 0.5116598606109619, + "learning_rate": 8e-05, + "loss": 1.605, + "step": 6982 + }, + { + "epoch": 0.3892419175027871, + "grad_norm": 0.559950590133667, + "learning_rate": 8e-05, + "loss": 1.541, + "step": 6983 + }, + { + "epoch": 0.38929765886287626, + "grad_norm": 0.4906415343284607, + "learning_rate": 8e-05, + "loss": 1.3907, + "step": 6984 + }, + { + "epoch": 0.38935340022296544, + "grad_norm": 0.5318133234977722, + "learning_rate": 8e-05, + "loss": 1.9337, + "step": 6985 + }, + { + "epoch": 0.3894091415830546, + "grad_norm": 0.5461350083351135, + "learning_rate": 8e-05, + "loss": 1.7324, + "step": 6986 + }, + { + "epoch": 0.3894648829431438, + "grad_norm": 0.5531414151191711, + "learning_rate": 8e-05, + "loss": 1.7923, + "step": 6987 + }, + { + "epoch": 0.389520624303233, + "grad_norm": 0.49649012088775635, + "learning_rate": 8e-05, + "loss": 1.5552, + "step": 6988 + }, + { + "epoch": 
0.3895763656633222, + "grad_norm": 0.48604315519332886, + "learning_rate": 8e-05, + "loss": 1.6701, + "step": 6989 + }, + { + "epoch": 0.3896321070234114, + "grad_norm": 0.5059698820114136, + "learning_rate": 8e-05, + "loss": 1.5653, + "step": 6990 + }, + { + "epoch": 0.38968784838350057, + "grad_norm": 0.5029332637786865, + "learning_rate": 8e-05, + "loss": 1.5919, + "step": 6991 + }, + { + "epoch": 0.38974358974358975, + "grad_norm": 0.5438098907470703, + "learning_rate": 8e-05, + "loss": 1.7432, + "step": 6992 + }, + { + "epoch": 0.3897993311036789, + "grad_norm": 0.4913785755634308, + "learning_rate": 8e-05, + "loss": 1.5916, + "step": 6993 + }, + { + "epoch": 0.3898550724637681, + "grad_norm": 0.47481903433799744, + "learning_rate": 8e-05, + "loss": 1.5417, + "step": 6994 + }, + { + "epoch": 0.3899108138238573, + "grad_norm": 0.5361449122428894, + "learning_rate": 8e-05, + "loss": 1.9055, + "step": 6995 + }, + { + "epoch": 0.38996655518394646, + "grad_norm": 0.5341494083404541, + "learning_rate": 8e-05, + "loss": 1.7104, + "step": 6996 + }, + { + "epoch": 0.3900222965440357, + "grad_norm": 0.5641982555389404, + "learning_rate": 8e-05, + "loss": 1.9479, + "step": 6997 + }, + { + "epoch": 0.3900780379041249, + "grad_norm": 0.5358514189720154, + "learning_rate": 8e-05, + "loss": 1.7432, + "step": 6998 + }, + { + "epoch": 0.39013377926421405, + "grad_norm": 0.5224436521530151, + "learning_rate": 8e-05, + "loss": 1.8123, + "step": 6999 + }, + { + "epoch": 0.39018952062430323, + "grad_norm": 0.4992210268974304, + "learning_rate": 8e-05, + "loss": 1.8403, + "step": 7000 + }, + { + "epoch": 0.3902452619843924, + "grad_norm": 0.5072396397590637, + "learning_rate": 8e-05, + "loss": 1.6148, + "step": 7001 + }, + { + "epoch": 0.3903010033444816, + "grad_norm": 0.5484432578086853, + "learning_rate": 8e-05, + "loss": 1.7956, + "step": 7002 + }, + { + "epoch": 0.39035674470457077, + "grad_norm": 0.502047598361969, + "learning_rate": 8e-05, + "loss": 1.6104, + "step": 7003 + 
}, + { + "epoch": 0.39041248606466, + "grad_norm": 0.5068725943565369, + "learning_rate": 8e-05, + "loss": 1.6485, + "step": 7004 + }, + { + "epoch": 0.3904682274247492, + "grad_norm": 0.49112656712532043, + "learning_rate": 8e-05, + "loss": 1.557, + "step": 7005 + }, + { + "epoch": 0.39052396878483836, + "grad_norm": 0.5446252226829529, + "learning_rate": 8e-05, + "loss": 1.6979, + "step": 7006 + }, + { + "epoch": 0.39057971014492754, + "grad_norm": 0.47817152738571167, + "learning_rate": 8e-05, + "loss": 1.5078, + "step": 7007 + }, + { + "epoch": 0.3906354515050167, + "grad_norm": 0.4783325493335724, + "learning_rate": 8e-05, + "loss": 1.5962, + "step": 7008 + }, + { + "epoch": 0.3906911928651059, + "grad_norm": 0.5405673980712891, + "learning_rate": 8e-05, + "loss": 1.7521, + "step": 7009 + }, + { + "epoch": 0.3907469342251951, + "grad_norm": 0.5000050067901611, + "learning_rate": 8e-05, + "loss": 1.613, + "step": 7010 + }, + { + "epoch": 0.39080267558528425, + "grad_norm": 0.5314992666244507, + "learning_rate": 8e-05, + "loss": 1.6482, + "step": 7011 + }, + { + "epoch": 0.3908584169453735, + "grad_norm": 0.49853751063346863, + "learning_rate": 8e-05, + "loss": 1.6888, + "step": 7012 + }, + { + "epoch": 0.39091415830546267, + "grad_norm": 0.6656997203826904, + "learning_rate": 8e-05, + "loss": 2.1214, + "step": 7013 + }, + { + "epoch": 0.39096989966555185, + "grad_norm": 0.5526180863380432, + "learning_rate": 8e-05, + "loss": 1.9484, + "step": 7014 + }, + { + "epoch": 0.391025641025641, + "grad_norm": 0.5586705803871155, + "learning_rate": 8e-05, + "loss": 1.8734, + "step": 7015 + }, + { + "epoch": 0.3910813823857302, + "grad_norm": 0.5152546763420105, + "learning_rate": 8e-05, + "loss": 1.7155, + "step": 7016 + }, + { + "epoch": 0.3911371237458194, + "grad_norm": 0.5178968906402588, + "learning_rate": 8e-05, + "loss": 1.6688, + "step": 7017 + }, + { + "epoch": 0.39119286510590856, + "grad_norm": 0.5010287165641785, + "learning_rate": 8e-05, + "loss": 1.6412, + 
"step": 7018 + }, + { + "epoch": 0.3912486064659978, + "grad_norm": 0.49992191791534424, + "learning_rate": 8e-05, + "loss": 1.7122, + "step": 7019 + }, + { + "epoch": 0.391304347826087, + "grad_norm": 0.5425409078598022, + "learning_rate": 8e-05, + "loss": 1.846, + "step": 7020 + }, + { + "epoch": 0.39136008918617615, + "grad_norm": 0.4892907738685608, + "learning_rate": 8e-05, + "loss": 1.6649, + "step": 7021 + }, + { + "epoch": 0.39141583054626533, + "grad_norm": 0.5175873637199402, + "learning_rate": 8e-05, + "loss": 1.6877, + "step": 7022 + }, + { + "epoch": 0.3914715719063545, + "grad_norm": 0.5006701946258545, + "learning_rate": 8e-05, + "loss": 1.6376, + "step": 7023 + }, + { + "epoch": 0.3915273132664437, + "grad_norm": 0.5196325778961182, + "learning_rate": 8e-05, + "loss": 1.616, + "step": 7024 + }, + { + "epoch": 0.39158305462653287, + "grad_norm": 0.5506092309951782, + "learning_rate": 8e-05, + "loss": 1.8984, + "step": 7025 + }, + { + "epoch": 0.39163879598662205, + "grad_norm": 0.4763076603412628, + "learning_rate": 8e-05, + "loss": 1.5839, + "step": 7026 + }, + { + "epoch": 0.3916945373467113, + "grad_norm": 0.5732759833335876, + "learning_rate": 8e-05, + "loss": 1.6241, + "step": 7027 + }, + { + "epoch": 0.39175027870680046, + "grad_norm": 0.4679046869277954, + "learning_rate": 8e-05, + "loss": 1.4369, + "step": 7028 + }, + { + "epoch": 0.39180602006688964, + "grad_norm": 0.48116782307624817, + "learning_rate": 8e-05, + "loss": 1.4916, + "step": 7029 + }, + { + "epoch": 0.3918617614269788, + "grad_norm": 0.5586615800857544, + "learning_rate": 8e-05, + "loss": 1.8211, + "step": 7030 + }, + { + "epoch": 0.391917502787068, + "grad_norm": 0.49537360668182373, + "learning_rate": 8e-05, + "loss": 1.6, + "step": 7031 + }, + { + "epoch": 0.3919732441471572, + "grad_norm": 0.5766502618789673, + "learning_rate": 8e-05, + "loss": 1.7938, + "step": 7032 + }, + { + "epoch": 0.39202898550724635, + "grad_norm": 0.5225504636764526, + "learning_rate": 8e-05, + 
"loss": 1.7192, + "step": 7033 + }, + { + "epoch": 0.3920847268673356, + "grad_norm": 0.5227795243263245, + "learning_rate": 8e-05, + "loss": 1.5299, + "step": 7034 + }, + { + "epoch": 0.39214046822742477, + "grad_norm": 0.5982847213745117, + "learning_rate": 8e-05, + "loss": 2.0238, + "step": 7035 + }, + { + "epoch": 0.39219620958751394, + "grad_norm": 0.5800369381904602, + "learning_rate": 8e-05, + "loss": 1.6763, + "step": 7036 + }, + { + "epoch": 0.3922519509476031, + "grad_norm": 0.567676305770874, + "learning_rate": 8e-05, + "loss": 1.3901, + "step": 7037 + }, + { + "epoch": 0.3923076923076923, + "grad_norm": 0.5485690236091614, + "learning_rate": 8e-05, + "loss": 1.8169, + "step": 7038 + }, + { + "epoch": 0.3923634336677815, + "grad_norm": 0.5342855453491211, + "learning_rate": 8e-05, + "loss": 1.6875, + "step": 7039 + }, + { + "epoch": 0.39241917502787066, + "grad_norm": 0.4871984124183655, + "learning_rate": 8e-05, + "loss": 1.5819, + "step": 7040 + }, + { + "epoch": 0.3924749163879599, + "grad_norm": 0.5237048864364624, + "learning_rate": 8e-05, + "loss": 1.7995, + "step": 7041 + }, + { + "epoch": 0.39253065774804907, + "grad_norm": 0.5661356449127197, + "learning_rate": 8e-05, + "loss": 2.0471, + "step": 7042 + }, + { + "epoch": 0.39258639910813825, + "grad_norm": 0.49327969551086426, + "learning_rate": 8e-05, + "loss": 1.6038, + "step": 7043 + }, + { + "epoch": 0.39264214046822743, + "grad_norm": 0.5772015452384949, + "learning_rate": 8e-05, + "loss": 1.7543, + "step": 7044 + }, + { + "epoch": 0.3926978818283166, + "grad_norm": 0.5665246844291687, + "learning_rate": 8e-05, + "loss": 1.806, + "step": 7045 + }, + { + "epoch": 0.3927536231884058, + "grad_norm": 0.49184301495552063, + "learning_rate": 8e-05, + "loss": 1.5393, + "step": 7046 + }, + { + "epoch": 0.39280936454849497, + "grad_norm": 0.5374985337257385, + "learning_rate": 8e-05, + "loss": 1.7513, + "step": 7047 + }, + { + "epoch": 0.39286510590858414, + "grad_norm": 0.5255217552185059, + 
"learning_rate": 8e-05, + "loss": 1.7094, + "step": 7048 + }, + { + "epoch": 0.3929208472686734, + "grad_norm": 0.5010694861412048, + "learning_rate": 8e-05, + "loss": 1.5043, + "step": 7049 + }, + { + "epoch": 0.39297658862876256, + "grad_norm": 0.5062034130096436, + "learning_rate": 8e-05, + "loss": 1.4959, + "step": 7050 + }, + { + "epoch": 0.39303232998885174, + "grad_norm": 0.5208863019943237, + "learning_rate": 8e-05, + "loss": 1.4794, + "step": 7051 + }, + { + "epoch": 0.3930880713489409, + "grad_norm": 0.7044099569320679, + "learning_rate": 8e-05, + "loss": 1.6254, + "step": 7052 + }, + { + "epoch": 0.3931438127090301, + "grad_norm": 0.5580699443817139, + "learning_rate": 8e-05, + "loss": 1.6458, + "step": 7053 + }, + { + "epoch": 0.3931995540691193, + "grad_norm": 0.5844243168830872, + "learning_rate": 8e-05, + "loss": 1.8115, + "step": 7054 + }, + { + "epoch": 0.39325529542920845, + "grad_norm": 0.5109538435935974, + "learning_rate": 8e-05, + "loss": 1.7011, + "step": 7055 + }, + { + "epoch": 0.3933110367892977, + "grad_norm": 0.5302479267120361, + "learning_rate": 8e-05, + "loss": 1.6663, + "step": 7056 + }, + { + "epoch": 0.39336677814938686, + "grad_norm": 0.5831972360610962, + "learning_rate": 8e-05, + "loss": 2.0732, + "step": 7057 + }, + { + "epoch": 0.39342251950947604, + "grad_norm": 0.5115814805030823, + "learning_rate": 8e-05, + "loss": 1.5539, + "step": 7058 + }, + { + "epoch": 0.3934782608695652, + "grad_norm": 0.5479879379272461, + "learning_rate": 8e-05, + "loss": 1.6709, + "step": 7059 + }, + { + "epoch": 0.3935340022296544, + "grad_norm": 0.5461530089378357, + "learning_rate": 8e-05, + "loss": 1.6039, + "step": 7060 + }, + { + "epoch": 0.3935897435897436, + "grad_norm": 0.5113889575004578, + "learning_rate": 8e-05, + "loss": 1.6394, + "step": 7061 + }, + { + "epoch": 0.39364548494983276, + "grad_norm": 0.5324178338050842, + "learning_rate": 8e-05, + "loss": 1.7084, + "step": 7062 + }, + { + "epoch": 0.39370122630992194, + "grad_norm": 
0.5632378458976746, + "learning_rate": 8e-05, + "loss": 1.7095, + "step": 7063 + }, + { + "epoch": 0.39375696767001117, + "grad_norm": 0.5247626304626465, + "learning_rate": 8e-05, + "loss": 1.9485, + "step": 7064 + }, + { + "epoch": 0.39381270903010035, + "grad_norm": 0.5283797383308411, + "learning_rate": 8e-05, + "loss": 1.8232, + "step": 7065 + }, + { + "epoch": 0.39386845039018953, + "grad_norm": 0.5559601187705994, + "learning_rate": 8e-05, + "loss": 1.9815, + "step": 7066 + }, + { + "epoch": 0.3939241917502787, + "grad_norm": 0.5030553340911865, + "learning_rate": 8e-05, + "loss": 1.6524, + "step": 7067 + }, + { + "epoch": 0.3939799331103679, + "grad_norm": 0.4583421051502228, + "learning_rate": 8e-05, + "loss": 1.0681, + "step": 7068 + }, + { + "epoch": 0.39403567447045706, + "grad_norm": 0.4787176847457886, + "learning_rate": 8e-05, + "loss": 1.5423, + "step": 7069 + }, + { + "epoch": 0.39409141583054624, + "grad_norm": 0.5295685529708862, + "learning_rate": 8e-05, + "loss": 1.667, + "step": 7070 + }, + { + "epoch": 0.3941471571906355, + "grad_norm": 0.5449760556221008, + "learning_rate": 8e-05, + "loss": 1.7835, + "step": 7071 + }, + { + "epoch": 0.39420289855072466, + "grad_norm": 0.5539211630821228, + "learning_rate": 8e-05, + "loss": 1.7981, + "step": 7072 + }, + { + "epoch": 0.39425863991081384, + "grad_norm": 0.5042213201522827, + "learning_rate": 8e-05, + "loss": 1.304, + "step": 7073 + }, + { + "epoch": 0.394314381270903, + "grad_norm": 0.544682502746582, + "learning_rate": 8e-05, + "loss": 1.7827, + "step": 7074 + }, + { + "epoch": 0.3943701226309922, + "grad_norm": 0.5186235904693604, + "learning_rate": 8e-05, + "loss": 1.6544, + "step": 7075 + }, + { + "epoch": 0.39442586399108137, + "grad_norm": 0.5746743679046631, + "learning_rate": 8e-05, + "loss": 1.7657, + "step": 7076 + }, + { + "epoch": 0.39448160535117055, + "grad_norm": 0.5321332216262817, + "learning_rate": 8e-05, + "loss": 1.7641, + "step": 7077 + }, + { + "epoch": 
0.39453734671125973, + "grad_norm": 0.5100351572036743, + "learning_rate": 8e-05, + "loss": 1.3928, + "step": 7078 + }, + { + "epoch": 0.39459308807134896, + "grad_norm": 0.555130124092102, + "learning_rate": 8e-05, + "loss": 1.7825, + "step": 7079 + }, + { + "epoch": 0.39464882943143814, + "grad_norm": 0.5125172138214111, + "learning_rate": 8e-05, + "loss": 1.6196, + "step": 7080 + }, + { + "epoch": 0.3947045707915273, + "grad_norm": 0.5487686991691589, + "learning_rate": 8e-05, + "loss": 1.8139, + "step": 7081 + }, + { + "epoch": 0.3947603121516165, + "grad_norm": 0.554560124874115, + "learning_rate": 8e-05, + "loss": 1.8361, + "step": 7082 + }, + { + "epoch": 0.3948160535117057, + "grad_norm": 0.4659266173839569, + "learning_rate": 8e-05, + "loss": 1.4233, + "step": 7083 + }, + { + "epoch": 0.39487179487179486, + "grad_norm": 0.5364368557929993, + "learning_rate": 8e-05, + "loss": 1.6832, + "step": 7084 + }, + { + "epoch": 0.39492753623188404, + "grad_norm": 0.5149415731430054, + "learning_rate": 8e-05, + "loss": 1.5629, + "step": 7085 + }, + { + "epoch": 0.39498327759197327, + "grad_norm": 0.5535032749176025, + "learning_rate": 8e-05, + "loss": 1.6994, + "step": 7086 + }, + { + "epoch": 0.39503901895206245, + "grad_norm": 0.5155476927757263, + "learning_rate": 8e-05, + "loss": 1.4715, + "step": 7087 + }, + { + "epoch": 0.3950947603121516, + "grad_norm": 0.5397098064422607, + "learning_rate": 8e-05, + "loss": 1.6871, + "step": 7088 + }, + { + "epoch": 0.3951505016722408, + "grad_norm": 0.5228619575500488, + "learning_rate": 8e-05, + "loss": 1.6322, + "step": 7089 + }, + { + "epoch": 0.39520624303233, + "grad_norm": 0.5426560044288635, + "learning_rate": 8e-05, + "loss": 1.8848, + "step": 7090 + }, + { + "epoch": 0.39526198439241916, + "grad_norm": 0.6139596700668335, + "learning_rate": 8e-05, + "loss": 1.7845, + "step": 7091 + }, + { + "epoch": 0.39531772575250834, + "grad_norm": 0.5236718058586121, + "learning_rate": 8e-05, + "loss": 1.6079, + "step": 7092 + }, 
+ { + "epoch": 0.3953734671125975, + "grad_norm": 0.5317121148109436, + "learning_rate": 8e-05, + "loss": 1.8536, + "step": 7093 + }, + { + "epoch": 0.39542920847268676, + "grad_norm": 0.5438995361328125, + "learning_rate": 8e-05, + "loss": 1.5735, + "step": 7094 + }, + { + "epoch": 0.39548494983277593, + "grad_norm": 0.5763745307922363, + "learning_rate": 8e-05, + "loss": 1.7013, + "step": 7095 + }, + { + "epoch": 0.3955406911928651, + "grad_norm": 0.52718585729599, + "learning_rate": 8e-05, + "loss": 1.5212, + "step": 7096 + }, + { + "epoch": 0.3955964325529543, + "grad_norm": 0.5630643367767334, + "learning_rate": 8e-05, + "loss": 1.8717, + "step": 7097 + }, + { + "epoch": 0.39565217391304347, + "grad_norm": 0.5001387596130371, + "learning_rate": 8e-05, + "loss": 1.6107, + "step": 7098 + }, + { + "epoch": 0.39570791527313265, + "grad_norm": 0.5191920399665833, + "learning_rate": 8e-05, + "loss": 1.6995, + "step": 7099 + }, + { + "epoch": 0.3957636566332218, + "grad_norm": 0.5507417917251587, + "learning_rate": 8e-05, + "loss": 1.983, + "step": 7100 + }, + { + "epoch": 0.39581939799331106, + "grad_norm": 0.540391206741333, + "learning_rate": 8e-05, + "loss": 1.8635, + "step": 7101 + }, + { + "epoch": 0.39587513935340024, + "grad_norm": 0.534968912601471, + "learning_rate": 8e-05, + "loss": 1.6873, + "step": 7102 + }, + { + "epoch": 0.3959308807134894, + "grad_norm": 0.5403704047203064, + "learning_rate": 8e-05, + "loss": 1.8149, + "step": 7103 + }, + { + "epoch": 0.3959866220735786, + "grad_norm": 0.5288872122764587, + "learning_rate": 8e-05, + "loss": 1.6222, + "step": 7104 + }, + { + "epoch": 0.3960423634336678, + "grad_norm": 0.4927409589290619, + "learning_rate": 8e-05, + "loss": 1.7369, + "step": 7105 + }, + { + "epoch": 0.39609810479375696, + "grad_norm": 0.49208763241767883, + "learning_rate": 8e-05, + "loss": 1.5327, + "step": 7106 + }, + { + "epoch": 0.39615384615384613, + "grad_norm": 0.4779423177242279, + "learning_rate": 8e-05, + "loss": 1.428, + 
"step": 7107 + }, + { + "epoch": 0.3962095875139353, + "grad_norm": 0.52891606092453, + "learning_rate": 8e-05, + "loss": 1.792, + "step": 7108 + }, + { + "epoch": 0.39626532887402455, + "grad_norm": 0.5548779368400574, + "learning_rate": 8e-05, + "loss": 1.7652, + "step": 7109 + }, + { + "epoch": 0.3963210702341137, + "grad_norm": 0.5618845820426941, + "learning_rate": 8e-05, + "loss": 1.8268, + "step": 7110 + }, + { + "epoch": 0.3963768115942029, + "grad_norm": 0.5512269139289856, + "learning_rate": 8e-05, + "loss": 1.7835, + "step": 7111 + }, + { + "epoch": 0.3964325529542921, + "grad_norm": 0.5383572578430176, + "learning_rate": 8e-05, + "loss": 1.6328, + "step": 7112 + }, + { + "epoch": 0.39648829431438126, + "grad_norm": 0.5181984305381775, + "learning_rate": 8e-05, + "loss": 1.4108, + "step": 7113 + }, + { + "epoch": 0.39654403567447044, + "grad_norm": 0.6135736107826233, + "learning_rate": 8e-05, + "loss": 2.1073, + "step": 7114 + }, + { + "epoch": 0.3965997770345596, + "grad_norm": 0.5372074246406555, + "learning_rate": 8e-05, + "loss": 1.7575, + "step": 7115 + }, + { + "epoch": 0.39665551839464885, + "grad_norm": 0.5308025479316711, + "learning_rate": 8e-05, + "loss": 1.6895, + "step": 7116 + }, + { + "epoch": 0.39671125975473803, + "grad_norm": 0.5664640069007874, + "learning_rate": 8e-05, + "loss": 1.7114, + "step": 7117 + }, + { + "epoch": 0.3967670011148272, + "grad_norm": 0.5988225936889648, + "learning_rate": 8e-05, + "loss": 1.9322, + "step": 7118 + }, + { + "epoch": 0.3968227424749164, + "grad_norm": 0.5092202425003052, + "learning_rate": 8e-05, + "loss": 1.7817, + "step": 7119 + }, + { + "epoch": 0.39687848383500557, + "grad_norm": 0.5076021552085876, + "learning_rate": 8e-05, + "loss": 1.6029, + "step": 7120 + }, + { + "epoch": 0.39693422519509475, + "grad_norm": 0.5385578870773315, + "learning_rate": 8e-05, + "loss": 1.5666, + "step": 7121 + }, + { + "epoch": 0.3969899665551839, + "grad_norm": 0.52571040391922, + "learning_rate": 8e-05, + 
"loss": 1.6277, + "step": 7122 + }, + { + "epoch": 0.3970457079152731, + "grad_norm": 0.5378811359405518, + "learning_rate": 8e-05, + "loss": 1.7184, + "step": 7123 + }, + { + "epoch": 0.39710144927536234, + "grad_norm": 0.4979332983493805, + "learning_rate": 8e-05, + "loss": 1.6225, + "step": 7124 + }, + { + "epoch": 0.3971571906354515, + "grad_norm": 0.5347927212715149, + "learning_rate": 8e-05, + "loss": 1.6643, + "step": 7125 + }, + { + "epoch": 0.3972129319955407, + "grad_norm": 0.557803750038147, + "learning_rate": 8e-05, + "loss": 1.7504, + "step": 7126 + }, + { + "epoch": 0.3972686733556299, + "grad_norm": 0.5096932053565979, + "learning_rate": 8e-05, + "loss": 1.7177, + "step": 7127 + }, + { + "epoch": 0.39732441471571905, + "grad_norm": 0.5407583117485046, + "learning_rate": 8e-05, + "loss": 1.7523, + "step": 7128 + }, + { + "epoch": 0.39738015607580823, + "grad_norm": 0.46805867552757263, + "learning_rate": 8e-05, + "loss": 1.549, + "step": 7129 + }, + { + "epoch": 0.3974358974358974, + "grad_norm": 0.5960009098052979, + "learning_rate": 8e-05, + "loss": 1.941, + "step": 7130 + }, + { + "epoch": 0.39749163879598665, + "grad_norm": 0.5466600060462952, + "learning_rate": 8e-05, + "loss": 1.5596, + "step": 7131 + }, + { + "epoch": 0.3975473801560758, + "grad_norm": 0.5576518177986145, + "learning_rate": 8e-05, + "loss": 1.7435, + "step": 7132 + }, + { + "epoch": 0.397603121516165, + "grad_norm": 0.56622713804245, + "learning_rate": 8e-05, + "loss": 1.7719, + "step": 7133 + }, + { + "epoch": 0.3976588628762542, + "grad_norm": 0.515771746635437, + "learning_rate": 8e-05, + "loss": 1.5704, + "step": 7134 + }, + { + "epoch": 0.39771460423634336, + "grad_norm": 0.5629921555519104, + "learning_rate": 8e-05, + "loss": 1.7414, + "step": 7135 + }, + { + "epoch": 0.39777034559643254, + "grad_norm": 0.5997464656829834, + "learning_rate": 8e-05, + "loss": 1.8351, + "step": 7136 + }, + { + "epoch": 0.3978260869565217, + "grad_norm": 0.5276950001716614, + 
"learning_rate": 8e-05, + "loss": 1.5901, + "step": 7137 + }, + { + "epoch": 0.39788182831661095, + "grad_norm": 0.562311589717865, + "learning_rate": 8e-05, + "loss": 1.7781, + "step": 7138 + }, + { + "epoch": 0.39793756967670013, + "grad_norm": 0.5696790218353271, + "learning_rate": 8e-05, + "loss": 1.8847, + "step": 7139 + }, + { + "epoch": 0.3979933110367893, + "grad_norm": 0.5362411141395569, + "learning_rate": 8e-05, + "loss": 1.7381, + "step": 7140 + }, + { + "epoch": 0.3980490523968785, + "grad_norm": 0.5526409149169922, + "learning_rate": 8e-05, + "loss": 1.7339, + "step": 7141 + }, + { + "epoch": 0.39810479375696767, + "grad_norm": 0.5474283695220947, + "learning_rate": 8e-05, + "loss": 1.575, + "step": 7142 + }, + { + "epoch": 0.39816053511705685, + "grad_norm": 0.5464802980422974, + "learning_rate": 8e-05, + "loss": 1.9082, + "step": 7143 + }, + { + "epoch": 0.398216276477146, + "grad_norm": 0.5198264718055725, + "learning_rate": 8e-05, + "loss": 1.7022, + "step": 7144 + }, + { + "epoch": 0.3982720178372352, + "grad_norm": 0.5149979591369629, + "learning_rate": 8e-05, + "loss": 1.7212, + "step": 7145 + }, + { + "epoch": 0.39832775919732444, + "grad_norm": 0.5334619283676147, + "learning_rate": 8e-05, + "loss": 1.6983, + "step": 7146 + }, + { + "epoch": 0.3983835005574136, + "grad_norm": 0.5151000022888184, + "learning_rate": 8e-05, + "loss": 1.6459, + "step": 7147 + }, + { + "epoch": 0.3984392419175028, + "grad_norm": 0.557898998260498, + "learning_rate": 8e-05, + "loss": 1.8314, + "step": 7148 + }, + { + "epoch": 0.398494983277592, + "grad_norm": 0.5185675621032715, + "learning_rate": 8e-05, + "loss": 1.7921, + "step": 7149 + }, + { + "epoch": 0.39855072463768115, + "grad_norm": 0.5042035579681396, + "learning_rate": 8e-05, + "loss": 1.7054, + "step": 7150 + }, + { + "epoch": 0.39860646599777033, + "grad_norm": 0.5184841156005859, + "learning_rate": 8e-05, + "loss": 1.7933, + "step": 7151 + }, + { + "epoch": 0.3986622073578595, + "grad_norm": 
0.505743682384491, + "learning_rate": 8e-05, + "loss": 1.7687, + "step": 7152 + }, + { + "epoch": 0.39871794871794874, + "grad_norm": 0.47850340604782104, + "learning_rate": 8e-05, + "loss": 1.6095, + "step": 7153 + }, + { + "epoch": 0.3987736900780379, + "grad_norm": 0.5684821009635925, + "learning_rate": 8e-05, + "loss": 1.7756, + "step": 7154 + }, + { + "epoch": 0.3988294314381271, + "grad_norm": 0.5087734460830688, + "learning_rate": 8e-05, + "loss": 1.6656, + "step": 7155 + }, + { + "epoch": 0.3988851727982163, + "grad_norm": 0.5265142321586609, + "learning_rate": 8e-05, + "loss": 1.7659, + "step": 7156 + }, + { + "epoch": 0.39894091415830546, + "grad_norm": 0.4864795506000519, + "learning_rate": 8e-05, + "loss": 1.5684, + "step": 7157 + }, + { + "epoch": 0.39899665551839464, + "grad_norm": 0.4981810748577118, + "learning_rate": 8e-05, + "loss": 1.6102, + "step": 7158 + }, + { + "epoch": 0.3990523968784838, + "grad_norm": 0.5119472742080688, + "learning_rate": 8e-05, + "loss": 1.5057, + "step": 7159 + }, + { + "epoch": 0.399108138238573, + "grad_norm": 0.507240355014801, + "learning_rate": 8e-05, + "loss": 1.7858, + "step": 7160 + }, + { + "epoch": 0.39916387959866223, + "grad_norm": 0.5334985256195068, + "learning_rate": 8e-05, + "loss": 1.6685, + "step": 7161 + }, + { + "epoch": 0.3992196209587514, + "grad_norm": 0.5807051658630371, + "learning_rate": 8e-05, + "loss": 1.9291, + "step": 7162 + }, + { + "epoch": 0.3992753623188406, + "grad_norm": 0.5425968766212463, + "learning_rate": 8e-05, + "loss": 1.7255, + "step": 7163 + }, + { + "epoch": 0.39933110367892977, + "grad_norm": 0.5361913442611694, + "learning_rate": 8e-05, + "loss": 1.8213, + "step": 7164 + }, + { + "epoch": 0.39938684503901895, + "grad_norm": 0.5138218998908997, + "learning_rate": 8e-05, + "loss": 1.7108, + "step": 7165 + }, + { + "epoch": 0.3994425863991081, + "grad_norm": 0.4844566881656647, + "learning_rate": 8e-05, + "loss": 1.4822, + "step": 7166 + }, + { + "epoch": 0.3994983277591973, 
+ "grad_norm": 0.5321471691131592, + "learning_rate": 8e-05, + "loss": 1.7647, + "step": 7167 + }, + { + "epoch": 0.39955406911928654, + "grad_norm": 0.5588659048080444, + "learning_rate": 8e-05, + "loss": 1.8536, + "step": 7168 + }, + { + "epoch": 0.3996098104793757, + "grad_norm": 0.5184299945831299, + "learning_rate": 8e-05, + "loss": 1.4886, + "step": 7169 + }, + { + "epoch": 0.3996655518394649, + "grad_norm": 0.5337185263633728, + "learning_rate": 8e-05, + "loss": 1.6478, + "step": 7170 + }, + { + "epoch": 0.3997212931995541, + "grad_norm": 0.5601459741592407, + "learning_rate": 8e-05, + "loss": 1.7805, + "step": 7171 + }, + { + "epoch": 0.39977703455964325, + "grad_norm": 0.5493620038032532, + "learning_rate": 8e-05, + "loss": 1.9224, + "step": 7172 + }, + { + "epoch": 0.39983277591973243, + "grad_norm": 0.49881303310394287, + "learning_rate": 8e-05, + "loss": 1.5991, + "step": 7173 + }, + { + "epoch": 0.3998885172798216, + "grad_norm": 0.6255406141281128, + "learning_rate": 8e-05, + "loss": 1.8032, + "step": 7174 + }, + { + "epoch": 0.3999442586399108, + "grad_norm": 0.46618714928627014, + "learning_rate": 8e-05, + "loss": 1.4614, + "step": 7175 + }, + { + "epoch": 0.4, + "grad_norm": 0.5251069664955139, + "learning_rate": 8e-05, + "loss": 1.695, + "step": 7176 + }, + { + "epoch": 0.4000557413600892, + "grad_norm": 0.5375117063522339, + "learning_rate": 8e-05, + "loss": 1.7528, + "step": 7177 + }, + { + "epoch": 0.4001114827201784, + "grad_norm": 0.5366592407226562, + "learning_rate": 8e-05, + "loss": 1.8064, + "step": 7178 + }, + { + "epoch": 0.40016722408026756, + "grad_norm": 0.5342915654182434, + "learning_rate": 8e-05, + "loss": 1.7025, + "step": 7179 + }, + { + "epoch": 0.40022296544035674, + "grad_norm": 0.510113000869751, + "learning_rate": 8e-05, + "loss": 1.6142, + "step": 7180 + }, + { + "epoch": 0.4002787068004459, + "grad_norm": 0.5079729557037354, + "learning_rate": 8e-05, + "loss": 1.6545, + "step": 7181 + }, + { + "epoch": 0.4003344481605351, 
+ "grad_norm": 0.5121543407440186, + "learning_rate": 8e-05, + "loss": 1.5406, + "step": 7182 + }, + { + "epoch": 0.40039018952062433, + "grad_norm": 0.5430922508239746, + "learning_rate": 8e-05, + "loss": 1.6463, + "step": 7183 + }, + { + "epoch": 0.4004459308807135, + "grad_norm": 0.5592001080513, + "learning_rate": 8e-05, + "loss": 1.7763, + "step": 7184 + }, + { + "epoch": 0.4005016722408027, + "grad_norm": 0.5771913528442383, + "learning_rate": 8e-05, + "loss": 2.056, + "step": 7185 + }, + { + "epoch": 0.40055741360089187, + "grad_norm": 0.5104514360427856, + "learning_rate": 8e-05, + "loss": 1.5396, + "step": 7186 + }, + { + "epoch": 0.40061315496098104, + "grad_norm": 0.543381929397583, + "learning_rate": 8e-05, + "loss": 1.6641, + "step": 7187 + }, + { + "epoch": 0.4006688963210702, + "grad_norm": 0.5523921251296997, + "learning_rate": 8e-05, + "loss": 1.5779, + "step": 7188 + }, + { + "epoch": 0.4007246376811594, + "grad_norm": 0.4858691394329071, + "learning_rate": 8e-05, + "loss": 1.4625, + "step": 7189 + }, + { + "epoch": 0.4007803790412486, + "grad_norm": 0.5553864240646362, + "learning_rate": 8e-05, + "loss": 1.6574, + "step": 7190 + }, + { + "epoch": 0.4008361204013378, + "grad_norm": 0.6046305298805237, + "learning_rate": 8e-05, + "loss": 1.8414, + "step": 7191 + }, + { + "epoch": 0.400891861761427, + "grad_norm": 0.48796287178993225, + "learning_rate": 8e-05, + "loss": 1.4957, + "step": 7192 + }, + { + "epoch": 0.40094760312151617, + "grad_norm": 0.5404077768325806, + "learning_rate": 8e-05, + "loss": 1.4942, + "step": 7193 + }, + { + "epoch": 0.40100334448160535, + "grad_norm": 0.5653712153434753, + "learning_rate": 8e-05, + "loss": 1.8882, + "step": 7194 + }, + { + "epoch": 0.40105908584169453, + "grad_norm": 0.5747902393341064, + "learning_rate": 8e-05, + "loss": 1.8261, + "step": 7195 + }, + { + "epoch": 0.4011148272017837, + "grad_norm": 0.537380039691925, + "learning_rate": 8e-05, + "loss": 1.7867, + "step": 7196 + }, + { + "epoch": 
0.4011705685618729, + "grad_norm": 0.5171547532081604, + "learning_rate": 8e-05, + "loss": 1.6586, + "step": 7197 + }, + { + "epoch": 0.4012263099219621, + "grad_norm": 0.5561977624893188, + "learning_rate": 8e-05, + "loss": 1.6361, + "step": 7198 + }, + { + "epoch": 0.4012820512820513, + "grad_norm": 0.5058720707893372, + "learning_rate": 8e-05, + "loss": 1.5449, + "step": 7199 + }, + { + "epoch": 0.4013377926421405, + "grad_norm": 0.562718391418457, + "learning_rate": 8e-05, + "loss": 1.7685, + "step": 7200 + }, + { + "epoch": 0.40139353400222966, + "grad_norm": 0.5348118543624878, + "learning_rate": 8e-05, + "loss": 1.7649, + "step": 7201 + }, + { + "epoch": 0.40144927536231884, + "grad_norm": 0.5601673126220703, + "learning_rate": 8e-05, + "loss": 1.796, + "step": 7202 + }, + { + "epoch": 0.401505016722408, + "grad_norm": 0.492948979139328, + "learning_rate": 8e-05, + "loss": 1.5824, + "step": 7203 + }, + { + "epoch": 0.4015607580824972, + "grad_norm": 0.5347782969474792, + "learning_rate": 8e-05, + "loss": 1.559, + "step": 7204 + }, + { + "epoch": 0.40161649944258637, + "grad_norm": 0.5501527786254883, + "learning_rate": 8e-05, + "loss": 1.5969, + "step": 7205 + }, + { + "epoch": 0.4016722408026756, + "grad_norm": 0.5141997337341309, + "learning_rate": 8e-05, + "loss": 1.5827, + "step": 7206 + }, + { + "epoch": 0.4017279821627648, + "grad_norm": 0.49190807342529297, + "learning_rate": 8e-05, + "loss": 1.5513, + "step": 7207 + }, + { + "epoch": 0.40178372352285396, + "grad_norm": 0.5163851976394653, + "learning_rate": 8e-05, + "loss": 1.4754, + "step": 7208 + }, + { + "epoch": 0.40183946488294314, + "grad_norm": 0.5286582708358765, + "learning_rate": 8e-05, + "loss": 1.6241, + "step": 7209 + }, + { + "epoch": 0.4018952062430323, + "grad_norm": 0.5705527663230896, + "learning_rate": 8e-05, + "loss": 1.8793, + "step": 7210 + }, + { + "epoch": 0.4019509476031215, + "grad_norm": 0.505829393863678, + "learning_rate": 8e-05, + "loss": 1.6368, + "step": 7211 + }, + { 
+ "epoch": 0.4020066889632107, + "grad_norm": 0.6229416728019714, + "learning_rate": 8e-05, + "loss": 2.0446, + "step": 7212 + }, + { + "epoch": 0.4020624303232999, + "grad_norm": 0.5273987054824829, + "learning_rate": 8e-05, + "loss": 1.6026, + "step": 7213 + }, + { + "epoch": 0.4021181716833891, + "grad_norm": 0.5581182837486267, + "learning_rate": 8e-05, + "loss": 1.8189, + "step": 7214 + }, + { + "epoch": 0.40217391304347827, + "grad_norm": 0.5493568778038025, + "learning_rate": 8e-05, + "loss": 1.7455, + "step": 7215 + }, + { + "epoch": 0.40222965440356745, + "grad_norm": 0.5482844710350037, + "learning_rate": 8e-05, + "loss": 1.8096, + "step": 7216 + }, + { + "epoch": 0.40228539576365663, + "grad_norm": 0.5216826796531677, + "learning_rate": 8e-05, + "loss": 1.59, + "step": 7217 + }, + { + "epoch": 0.4023411371237458, + "grad_norm": 0.5004620552062988, + "learning_rate": 8e-05, + "loss": 1.5273, + "step": 7218 + }, + { + "epoch": 0.402396878483835, + "grad_norm": 0.5231639742851257, + "learning_rate": 8e-05, + "loss": 1.6749, + "step": 7219 + }, + { + "epoch": 0.40245261984392416, + "grad_norm": 0.4889169931411743, + "learning_rate": 8e-05, + "loss": 1.6555, + "step": 7220 + }, + { + "epoch": 0.4025083612040134, + "grad_norm": 0.49795204401016235, + "learning_rate": 8e-05, + "loss": 1.583, + "step": 7221 + }, + { + "epoch": 0.4025641025641026, + "grad_norm": 0.5205188989639282, + "learning_rate": 8e-05, + "loss": 1.6421, + "step": 7222 + }, + { + "epoch": 0.40261984392419176, + "grad_norm": 0.5181834101676941, + "learning_rate": 8e-05, + "loss": 1.6711, + "step": 7223 + }, + { + "epoch": 0.40267558528428093, + "grad_norm": 0.5610384941101074, + "learning_rate": 8e-05, + "loss": 1.5956, + "step": 7224 + }, + { + "epoch": 0.4027313266443701, + "grad_norm": 0.5183853507041931, + "learning_rate": 8e-05, + "loss": 1.5579, + "step": 7225 + }, + { + "epoch": 0.4027870680044593, + "grad_norm": 0.5921036005020142, + "learning_rate": 8e-05, + "loss": 1.8994, + "step": 
7226 + }, + { + "epoch": 0.40284280936454847, + "grad_norm": 0.54100501537323, + "learning_rate": 8e-05, + "loss": 1.6054, + "step": 7227 + }, + { + "epoch": 0.4028985507246377, + "grad_norm": 0.5243507027626038, + "learning_rate": 8e-05, + "loss": 1.5235, + "step": 7228 + }, + { + "epoch": 0.4029542920847269, + "grad_norm": 0.5220246315002441, + "learning_rate": 8e-05, + "loss": 1.5431, + "step": 7229 + }, + { + "epoch": 0.40301003344481606, + "grad_norm": 0.48561525344848633, + "learning_rate": 8e-05, + "loss": 1.4805, + "step": 7230 + }, + { + "epoch": 0.40306577480490524, + "grad_norm": 0.548069953918457, + "learning_rate": 8e-05, + "loss": 1.567, + "step": 7231 + }, + { + "epoch": 0.4031215161649944, + "grad_norm": 0.5422705411911011, + "learning_rate": 8e-05, + "loss": 1.7502, + "step": 7232 + }, + { + "epoch": 0.4031772575250836, + "grad_norm": 0.508958101272583, + "learning_rate": 8e-05, + "loss": 1.4607, + "step": 7233 + }, + { + "epoch": 0.4032329988851728, + "grad_norm": 0.5446996092796326, + "learning_rate": 8e-05, + "loss": 1.7624, + "step": 7234 + }, + { + "epoch": 0.403288740245262, + "grad_norm": 0.5247542858123779, + "learning_rate": 8e-05, + "loss": 1.5096, + "step": 7235 + }, + { + "epoch": 0.4033444816053512, + "grad_norm": 0.5961942076683044, + "learning_rate": 8e-05, + "loss": 1.9071, + "step": 7236 + }, + { + "epoch": 0.40340022296544037, + "grad_norm": 0.5433081984519958, + "learning_rate": 8e-05, + "loss": 1.6365, + "step": 7237 + }, + { + "epoch": 0.40345596432552955, + "grad_norm": 0.5088001489639282, + "learning_rate": 8e-05, + "loss": 1.7051, + "step": 7238 + }, + { + "epoch": 0.4035117056856187, + "grad_norm": 0.510620653629303, + "learning_rate": 8e-05, + "loss": 1.592, + "step": 7239 + }, + { + "epoch": 0.4035674470457079, + "grad_norm": 0.5772462487220764, + "learning_rate": 8e-05, + "loss": 1.7582, + "step": 7240 + }, + { + "epoch": 0.4036231884057971, + "grad_norm": 0.5558195114135742, + "learning_rate": 8e-05, + "loss": 1.7582, + 
"step": 7241 + }, + { + "epoch": 0.40367892976588626, + "grad_norm": 0.5695681571960449, + "learning_rate": 8e-05, + "loss": 1.819, + "step": 7242 + }, + { + "epoch": 0.4037346711259755, + "grad_norm": 0.4986192584037781, + "learning_rate": 8e-05, + "loss": 1.6319, + "step": 7243 + }, + { + "epoch": 0.4037904124860647, + "grad_norm": 0.5238422155380249, + "learning_rate": 8e-05, + "loss": 1.7019, + "step": 7244 + }, + { + "epoch": 0.40384615384615385, + "grad_norm": 0.517815351486206, + "learning_rate": 8e-05, + "loss": 1.6851, + "step": 7245 + }, + { + "epoch": 0.40390189520624303, + "grad_norm": 0.5185407400131226, + "learning_rate": 8e-05, + "loss": 1.6868, + "step": 7246 + }, + { + "epoch": 0.4039576365663322, + "grad_norm": 0.5118069648742676, + "learning_rate": 8e-05, + "loss": 1.8089, + "step": 7247 + }, + { + "epoch": 0.4040133779264214, + "grad_norm": 0.5143292546272278, + "learning_rate": 8e-05, + "loss": 1.4618, + "step": 7248 + }, + { + "epoch": 0.40406911928651057, + "grad_norm": 0.5812262892723083, + "learning_rate": 8e-05, + "loss": 1.8726, + "step": 7249 + }, + { + "epoch": 0.4041248606465998, + "grad_norm": 0.5687528848648071, + "learning_rate": 8e-05, + "loss": 1.9506, + "step": 7250 + }, + { + "epoch": 0.404180602006689, + "grad_norm": 0.5691890716552734, + "learning_rate": 8e-05, + "loss": 1.9047, + "step": 7251 + }, + { + "epoch": 0.40423634336677816, + "grad_norm": 0.5662111043930054, + "learning_rate": 8e-05, + "loss": 1.6631, + "step": 7252 + }, + { + "epoch": 0.40429208472686734, + "grad_norm": 0.5995144844055176, + "learning_rate": 8e-05, + "loss": 1.8842, + "step": 7253 + }, + { + "epoch": 0.4043478260869565, + "grad_norm": 0.5363901853561401, + "learning_rate": 8e-05, + "loss": 1.7278, + "step": 7254 + }, + { + "epoch": 0.4044035674470457, + "grad_norm": 0.5433478355407715, + "learning_rate": 8e-05, + "loss": 1.7402, + "step": 7255 + }, + { + "epoch": 0.4044593088071349, + "grad_norm": 0.5265305042266846, + "learning_rate": 8e-05, + 
"loss": 1.7695, + "step": 7256 + }, + { + "epoch": 0.40451505016722406, + "grad_norm": 0.4668007493019104, + "learning_rate": 8e-05, + "loss": 1.4825, + "step": 7257 + }, + { + "epoch": 0.4045707915273133, + "grad_norm": 0.5050549507141113, + "learning_rate": 8e-05, + "loss": 1.6319, + "step": 7258 + }, + { + "epoch": 0.40462653288740247, + "grad_norm": 0.5017398595809937, + "learning_rate": 8e-05, + "loss": 1.6405, + "step": 7259 + }, + { + "epoch": 0.40468227424749165, + "grad_norm": 0.5323314666748047, + "learning_rate": 8e-05, + "loss": 1.7371, + "step": 7260 + }, + { + "epoch": 0.4047380156075808, + "grad_norm": 0.5021803975105286, + "learning_rate": 8e-05, + "loss": 1.6651, + "step": 7261 + }, + { + "epoch": 0.40479375696767, + "grad_norm": 0.4910186529159546, + "learning_rate": 8e-05, + "loss": 1.5458, + "step": 7262 + }, + { + "epoch": 0.4048494983277592, + "grad_norm": 0.6216732263565063, + "learning_rate": 8e-05, + "loss": 2.0608, + "step": 7263 + }, + { + "epoch": 0.40490523968784836, + "grad_norm": 0.553127110004425, + "learning_rate": 8e-05, + "loss": 1.6896, + "step": 7264 + }, + { + "epoch": 0.4049609810479376, + "grad_norm": 0.5868520736694336, + "learning_rate": 8e-05, + "loss": 1.7069, + "step": 7265 + }, + { + "epoch": 0.4050167224080268, + "grad_norm": 0.46473756432533264, + "learning_rate": 8e-05, + "loss": 1.5142, + "step": 7266 + }, + { + "epoch": 0.40507246376811595, + "grad_norm": 0.5369247198104858, + "learning_rate": 8e-05, + "loss": 1.6509, + "step": 7267 + }, + { + "epoch": 0.40512820512820513, + "grad_norm": 0.515741765499115, + "learning_rate": 8e-05, + "loss": 1.7297, + "step": 7268 + }, + { + "epoch": 0.4051839464882943, + "grad_norm": 0.5385709404945374, + "learning_rate": 8e-05, + "loss": 1.7021, + "step": 7269 + }, + { + "epoch": 0.4052396878483835, + "grad_norm": 0.5441043972969055, + "learning_rate": 8e-05, + "loss": 1.6534, + "step": 7270 + }, + { + "epoch": 0.40529542920847267, + "grad_norm": 0.5380592346191406, + 
"learning_rate": 8e-05, + "loss": 1.6558, + "step": 7271 + }, + { + "epoch": 0.40535117056856185, + "grad_norm": 0.5165267586708069, + "learning_rate": 8e-05, + "loss": 1.6123, + "step": 7272 + }, + { + "epoch": 0.4054069119286511, + "grad_norm": 0.527855396270752, + "learning_rate": 8e-05, + "loss": 1.5234, + "step": 7273 + }, + { + "epoch": 0.40546265328874026, + "grad_norm": 0.5355624556541443, + "learning_rate": 8e-05, + "loss": 1.7963, + "step": 7274 + }, + { + "epoch": 0.40551839464882944, + "grad_norm": 0.5941885113716125, + "learning_rate": 8e-05, + "loss": 1.6391, + "step": 7275 + }, + { + "epoch": 0.4055741360089186, + "grad_norm": 0.525371253490448, + "learning_rate": 8e-05, + "loss": 1.7042, + "step": 7276 + }, + { + "epoch": 0.4056298773690078, + "grad_norm": 0.5089641809463501, + "learning_rate": 8e-05, + "loss": 1.5949, + "step": 7277 + }, + { + "epoch": 0.405685618729097, + "grad_norm": 0.47336864471435547, + "learning_rate": 8e-05, + "loss": 1.4321, + "step": 7278 + }, + { + "epoch": 0.40574136008918615, + "grad_norm": 0.520519495010376, + "learning_rate": 8e-05, + "loss": 1.6113, + "step": 7279 + }, + { + "epoch": 0.4057971014492754, + "grad_norm": 0.521248996257782, + "learning_rate": 8e-05, + "loss": 1.4845, + "step": 7280 + }, + { + "epoch": 0.40585284280936457, + "grad_norm": 0.51210618019104, + "learning_rate": 8e-05, + "loss": 1.6622, + "step": 7281 + }, + { + "epoch": 0.40590858416945375, + "grad_norm": 0.5206047892570496, + "learning_rate": 8e-05, + "loss": 1.8067, + "step": 7282 + }, + { + "epoch": 0.4059643255295429, + "grad_norm": 0.5226845145225525, + "learning_rate": 8e-05, + "loss": 1.644, + "step": 7283 + }, + { + "epoch": 0.4060200668896321, + "grad_norm": 0.5129978656768799, + "learning_rate": 8e-05, + "loss": 1.5691, + "step": 7284 + }, + { + "epoch": 0.4060758082497213, + "grad_norm": 0.5226308107376099, + "learning_rate": 8e-05, + "loss": 1.4112, + "step": 7285 + }, + { + "epoch": 0.40613154960981046, + "grad_norm": 
0.5508782267570496, + "learning_rate": 8e-05, + "loss": 1.6141, + "step": 7286 + }, + { + "epoch": 0.40618729096989964, + "grad_norm": 0.54766446352005, + "learning_rate": 8e-05, + "loss": 1.7305, + "step": 7287 + }, + { + "epoch": 0.4062430323299889, + "grad_norm": 0.5656951665878296, + "learning_rate": 8e-05, + "loss": 1.8182, + "step": 7288 + }, + { + "epoch": 0.40629877369007805, + "grad_norm": 0.6196290850639343, + "learning_rate": 8e-05, + "loss": 1.7658, + "step": 7289 + }, + { + "epoch": 0.40635451505016723, + "grad_norm": 0.5594131350517273, + "learning_rate": 8e-05, + "loss": 1.6559, + "step": 7290 + }, + { + "epoch": 0.4064102564102564, + "grad_norm": 0.7854731678962708, + "learning_rate": 8e-05, + "loss": 1.3569, + "step": 7291 + }, + { + "epoch": 0.4064659977703456, + "grad_norm": 0.49075648188591003, + "learning_rate": 8e-05, + "loss": 1.6244, + "step": 7292 + }, + { + "epoch": 0.40652173913043477, + "grad_norm": 0.5666341781616211, + "learning_rate": 8e-05, + "loss": 1.7787, + "step": 7293 + }, + { + "epoch": 0.40657748049052395, + "grad_norm": 0.5504280924797058, + "learning_rate": 8e-05, + "loss": 1.7641, + "step": 7294 + }, + { + "epoch": 0.4066332218506132, + "grad_norm": 0.5278729200363159, + "learning_rate": 8e-05, + "loss": 1.7668, + "step": 7295 + }, + { + "epoch": 0.40668896321070236, + "grad_norm": 0.5682119131088257, + "learning_rate": 8e-05, + "loss": 1.6458, + "step": 7296 + }, + { + "epoch": 0.40674470457079154, + "grad_norm": 0.6224889755249023, + "learning_rate": 8e-05, + "loss": 1.8824, + "step": 7297 + }, + { + "epoch": 0.4068004459308807, + "grad_norm": 0.5042921304702759, + "learning_rate": 8e-05, + "loss": 1.614, + "step": 7298 + }, + { + "epoch": 0.4068561872909699, + "grad_norm": 0.5012341141700745, + "learning_rate": 8e-05, + "loss": 1.5258, + "step": 7299 + }, + { + "epoch": 0.4069119286510591, + "grad_norm": 0.5259014964103699, + "learning_rate": 8e-05, + "loss": 1.6493, + "step": 7300 + }, + { + "epoch": 
0.40696767001114825, + "grad_norm": 0.493158221244812, + "learning_rate": 8e-05, + "loss": 1.4587, + "step": 7301 + }, + { + "epoch": 0.40702341137123743, + "grad_norm": 0.5391311049461365, + "learning_rate": 8e-05, + "loss": 1.8054, + "step": 7302 + }, + { + "epoch": 0.40707915273132667, + "grad_norm": 0.5361586213111877, + "learning_rate": 8e-05, + "loss": 1.4909, + "step": 7303 + }, + { + "epoch": 0.40713489409141584, + "grad_norm": 0.5893044471740723, + "learning_rate": 8e-05, + "loss": 1.8674, + "step": 7304 + }, + { + "epoch": 0.407190635451505, + "grad_norm": 0.540810763835907, + "learning_rate": 8e-05, + "loss": 1.6365, + "step": 7305 + }, + { + "epoch": 0.4072463768115942, + "grad_norm": 0.5277107357978821, + "learning_rate": 8e-05, + "loss": 1.6131, + "step": 7306 + }, + { + "epoch": 0.4073021181716834, + "grad_norm": 0.47652947902679443, + "learning_rate": 8e-05, + "loss": 1.3556, + "step": 7307 + }, + { + "epoch": 0.40735785953177256, + "grad_norm": 0.5251367688179016, + "learning_rate": 8e-05, + "loss": 1.795, + "step": 7308 + }, + { + "epoch": 0.40741360089186174, + "grad_norm": 0.5432030558586121, + "learning_rate": 8e-05, + "loss": 2.0776, + "step": 7309 + }, + { + "epoch": 0.40746934225195097, + "grad_norm": 0.5877689719200134, + "learning_rate": 8e-05, + "loss": 1.88, + "step": 7310 + }, + { + "epoch": 0.40752508361204015, + "grad_norm": 0.5788148641586304, + "learning_rate": 8e-05, + "loss": 1.8192, + "step": 7311 + }, + { + "epoch": 0.40758082497212933, + "grad_norm": 0.551967978477478, + "learning_rate": 8e-05, + "loss": 1.7107, + "step": 7312 + }, + { + "epoch": 0.4076365663322185, + "grad_norm": 0.49300578236579895, + "learning_rate": 8e-05, + "loss": 1.599, + "step": 7313 + }, + { + "epoch": 0.4076923076923077, + "grad_norm": 0.505588173866272, + "learning_rate": 8e-05, + "loss": 1.4808, + "step": 7314 + }, + { + "epoch": 0.40774804905239687, + "grad_norm": 0.6424862742424011, + "learning_rate": 8e-05, + "loss": 1.3891, + "step": 7315 + }, + 
{ + "epoch": 0.40780379041248604, + "grad_norm": 0.5149382948875427, + "learning_rate": 8e-05, + "loss": 1.5358, + "step": 7316 + }, + { + "epoch": 0.4078595317725752, + "grad_norm": 0.5695319771766663, + "learning_rate": 8e-05, + "loss": 1.7848, + "step": 7317 + }, + { + "epoch": 0.40791527313266446, + "grad_norm": 0.562187910079956, + "learning_rate": 8e-05, + "loss": 2.0304, + "step": 7318 + }, + { + "epoch": 0.40797101449275364, + "grad_norm": 0.5698602795600891, + "learning_rate": 8e-05, + "loss": 1.7808, + "step": 7319 + }, + { + "epoch": 0.4080267558528428, + "grad_norm": 0.4977051615715027, + "learning_rate": 8e-05, + "loss": 1.528, + "step": 7320 + }, + { + "epoch": 0.408082497212932, + "grad_norm": 0.48804786801338196, + "learning_rate": 8e-05, + "loss": 1.8455, + "step": 7321 + }, + { + "epoch": 0.4081382385730212, + "grad_norm": 0.4927138090133667, + "learning_rate": 8e-05, + "loss": 1.5468, + "step": 7322 + }, + { + "epoch": 0.40819397993311035, + "grad_norm": 0.5586680173873901, + "learning_rate": 8e-05, + "loss": 1.5726, + "step": 7323 + }, + { + "epoch": 0.40824972129319953, + "grad_norm": 0.5101229548454285, + "learning_rate": 8e-05, + "loss": 1.704, + "step": 7324 + }, + { + "epoch": 0.40830546265328876, + "grad_norm": 0.5254568457603455, + "learning_rate": 8e-05, + "loss": 1.7521, + "step": 7325 + }, + { + "epoch": 0.40836120401337794, + "grad_norm": 0.490192174911499, + "learning_rate": 8e-05, + "loss": 1.5068, + "step": 7326 + }, + { + "epoch": 0.4084169453734671, + "grad_norm": 0.5417366623878479, + "learning_rate": 8e-05, + "loss": 1.7213, + "step": 7327 + }, + { + "epoch": 0.4084726867335563, + "grad_norm": 0.5311920642852783, + "learning_rate": 8e-05, + "loss": 1.7244, + "step": 7328 + }, + { + "epoch": 0.4085284280936455, + "grad_norm": 0.5254058837890625, + "learning_rate": 8e-05, + "loss": 1.4401, + "step": 7329 + }, + { + "epoch": 0.40858416945373466, + "grad_norm": 0.5589635968208313, + "learning_rate": 8e-05, + "loss": 1.7735, + 
"step": 7330 + }, + { + "epoch": 0.40863991081382384, + "grad_norm": 0.5018014907836914, + "learning_rate": 8e-05, + "loss": 1.6477, + "step": 7331 + }, + { + "epoch": 0.40869565217391307, + "grad_norm": 0.525607168674469, + "learning_rate": 8e-05, + "loss": 1.5617, + "step": 7332 + }, + { + "epoch": 0.40875139353400225, + "grad_norm": 0.5388331413269043, + "learning_rate": 8e-05, + "loss": 1.6982, + "step": 7333 + }, + { + "epoch": 0.40880713489409143, + "grad_norm": 0.5607791543006897, + "learning_rate": 8e-05, + "loss": 1.7448, + "step": 7334 + }, + { + "epoch": 0.4088628762541806, + "grad_norm": 0.5692734122276306, + "learning_rate": 8e-05, + "loss": 1.5466, + "step": 7335 + }, + { + "epoch": 0.4089186176142698, + "grad_norm": 0.5394876003265381, + "learning_rate": 8e-05, + "loss": 1.7048, + "step": 7336 + }, + { + "epoch": 0.40897435897435896, + "grad_norm": 0.5117727518081665, + "learning_rate": 8e-05, + "loss": 1.5873, + "step": 7337 + }, + { + "epoch": 0.40903010033444814, + "grad_norm": 0.5213082432746887, + "learning_rate": 8e-05, + "loss": 1.6449, + "step": 7338 + }, + { + "epoch": 0.4090858416945373, + "grad_norm": 0.5686284303665161, + "learning_rate": 8e-05, + "loss": 2.0047, + "step": 7339 + }, + { + "epoch": 0.40914158305462656, + "grad_norm": 0.5361032485961914, + "learning_rate": 8e-05, + "loss": 1.9062, + "step": 7340 + }, + { + "epoch": 0.40919732441471574, + "grad_norm": 0.5334948897361755, + "learning_rate": 8e-05, + "loss": 1.812, + "step": 7341 + }, + { + "epoch": 0.4092530657748049, + "grad_norm": 0.5158889293670654, + "learning_rate": 8e-05, + "loss": 1.649, + "step": 7342 + }, + { + "epoch": 0.4093088071348941, + "grad_norm": 0.4964984357357025, + "learning_rate": 8e-05, + "loss": 1.4696, + "step": 7343 + }, + { + "epoch": 0.40936454849498327, + "grad_norm": NaN, + "learning_rate": 8e-05, + "loss": 1.7212, + "step": 7344 + }, + { + "epoch": 0.40942028985507245, + "grad_norm": 0.5129902362823486, + "learning_rate": 8e-05, + "loss": 1.6362, 
+ "step": 7345 + }, + { + "epoch": 0.40947603121516163, + "grad_norm": 0.5674779415130615, + "learning_rate": 8e-05, + "loss": 1.634, + "step": 7346 + }, + { + "epoch": 0.40953177257525086, + "grad_norm": 0.4873337149620056, + "learning_rate": 8e-05, + "loss": 1.7257, + "step": 7347 + }, + { + "epoch": 0.40958751393534004, + "grad_norm": 0.5569888353347778, + "learning_rate": 8e-05, + "loss": 1.6974, + "step": 7348 + }, + { + "epoch": 0.4096432552954292, + "grad_norm": 0.5687206387519836, + "learning_rate": 8e-05, + "loss": 1.8742, + "step": 7349 + }, + { + "epoch": 0.4096989966555184, + "grad_norm": 0.7041868567466736, + "learning_rate": 8e-05, + "loss": 1.7045, + "step": 7350 + }, + { + "epoch": 0.4097547380156076, + "grad_norm": 0.5319523811340332, + "learning_rate": 8e-05, + "loss": 1.759, + "step": 7351 + }, + { + "epoch": 0.40981047937569676, + "grad_norm": 0.523441731929779, + "learning_rate": 8e-05, + "loss": 1.8373, + "step": 7352 + }, + { + "epoch": 0.40986622073578594, + "grad_norm": 0.5501686930656433, + "learning_rate": 8e-05, + "loss": 1.6488, + "step": 7353 + }, + { + "epoch": 0.4099219620958751, + "grad_norm": 0.4980147182941437, + "learning_rate": 8e-05, + "loss": 1.6566, + "step": 7354 + }, + { + "epoch": 0.40997770345596435, + "grad_norm": 0.5074655413627625, + "learning_rate": 8e-05, + "loss": 1.5941, + "step": 7355 + }, + { + "epoch": 0.4100334448160535, + "grad_norm": 0.5464773178100586, + "learning_rate": 8e-05, + "loss": 1.7869, + "step": 7356 + }, + { + "epoch": 0.4100891861761427, + "grad_norm": 0.6277008652687073, + "learning_rate": 8e-05, + "loss": 2.2296, + "step": 7357 + }, + { + "epoch": 0.4101449275362319, + "grad_norm": 0.5693956613540649, + "learning_rate": 8e-05, + "loss": 1.7652, + "step": 7358 + }, + { + "epoch": 0.41020066889632106, + "grad_norm": 0.5990443229675293, + "learning_rate": 8e-05, + "loss": 2.0915, + "step": 7359 + }, + { + "epoch": 0.41025641025641024, + "grad_norm": 0.5059307813644409, + "learning_rate": 8e-05, + 
"loss": 1.6951, + "step": 7360 + }, + { + "epoch": 0.4103121516164994, + "grad_norm": 0.5810990333557129, + "learning_rate": 8e-05, + "loss": 2.0471, + "step": 7361 + }, + { + "epoch": 0.41036789297658866, + "grad_norm": 0.5133877992630005, + "learning_rate": 8e-05, + "loss": 1.4961, + "step": 7362 + }, + { + "epoch": 0.41042363433667783, + "grad_norm": 0.5552747845649719, + "learning_rate": 8e-05, + "loss": 1.8481, + "step": 7363 + }, + { + "epoch": 0.410479375696767, + "grad_norm": 0.5545029044151306, + "learning_rate": 8e-05, + "loss": 1.5988, + "step": 7364 + }, + { + "epoch": 0.4105351170568562, + "grad_norm": 0.522911787033081, + "learning_rate": 8e-05, + "loss": 1.6247, + "step": 7365 + }, + { + "epoch": 0.41059085841694537, + "grad_norm": 0.5258331894874573, + "learning_rate": 8e-05, + "loss": 1.7336, + "step": 7366 + }, + { + "epoch": 0.41064659977703455, + "grad_norm": 0.5287671685218811, + "learning_rate": 8e-05, + "loss": 1.5069, + "step": 7367 + }, + { + "epoch": 0.4107023411371237, + "grad_norm": 0.5577136278152466, + "learning_rate": 8e-05, + "loss": 1.5476, + "step": 7368 + }, + { + "epoch": 0.4107580824972129, + "grad_norm": 0.5114900469779968, + "learning_rate": 8e-05, + "loss": 1.5314, + "step": 7369 + }, + { + "epoch": 0.41081382385730214, + "grad_norm": 0.5116535425186157, + "learning_rate": 8e-05, + "loss": 1.6569, + "step": 7370 + }, + { + "epoch": 0.4108695652173913, + "grad_norm": 0.5354006290435791, + "learning_rate": 8e-05, + "loss": 1.628, + "step": 7371 + }, + { + "epoch": 0.4109253065774805, + "grad_norm": 0.4987451434135437, + "learning_rate": 8e-05, + "loss": 1.6674, + "step": 7372 + }, + { + "epoch": 0.4109810479375697, + "grad_norm": 0.5362124443054199, + "learning_rate": 8e-05, + "loss": 1.7775, + "step": 7373 + }, + { + "epoch": 0.41103678929765886, + "grad_norm": 0.5331243872642517, + "learning_rate": 8e-05, + "loss": 1.4266, + "step": 7374 + }, + { + "epoch": 0.41109253065774803, + "grad_norm": 0.552800178527832, + 
"learning_rate": 8e-05, + "loss": 1.5609, + "step": 7375 + }, + { + "epoch": 0.4111482720178372, + "grad_norm": 0.5207287669181824, + "learning_rate": 8e-05, + "loss": 1.7087, + "step": 7376 + }, + { + "epoch": 0.41120401337792645, + "grad_norm": 0.5250851511955261, + "learning_rate": 8e-05, + "loss": 1.6695, + "step": 7377 + }, + { + "epoch": 0.4112597547380156, + "grad_norm": 0.552610456943512, + "learning_rate": 8e-05, + "loss": 1.8226, + "step": 7378 + }, + { + "epoch": 0.4113154960981048, + "grad_norm": 0.5105003714561462, + "learning_rate": 8e-05, + "loss": 1.7584, + "step": 7379 + }, + { + "epoch": 0.411371237458194, + "grad_norm": 0.5159006714820862, + "learning_rate": 8e-05, + "loss": 1.627, + "step": 7380 + }, + { + "epoch": 0.41142697881828316, + "grad_norm": 0.5639457106590271, + "learning_rate": 8e-05, + "loss": 1.8453, + "step": 7381 + }, + { + "epoch": 0.41148272017837234, + "grad_norm": 0.4808430075645447, + "learning_rate": 8e-05, + "loss": 1.7023, + "step": 7382 + }, + { + "epoch": 0.4115384615384615, + "grad_norm": 0.5087721347808838, + "learning_rate": 8e-05, + "loss": 1.7405, + "step": 7383 + }, + { + "epoch": 0.4115942028985507, + "grad_norm": 0.5479757189750671, + "learning_rate": 8e-05, + "loss": 1.7867, + "step": 7384 + }, + { + "epoch": 0.41164994425863993, + "grad_norm": 0.5063915252685547, + "learning_rate": 8e-05, + "loss": 1.7109, + "step": 7385 + }, + { + "epoch": 0.4117056856187291, + "grad_norm": 0.5211130380630493, + "learning_rate": 8e-05, + "loss": 1.6334, + "step": 7386 + }, + { + "epoch": 0.4117614269788183, + "grad_norm": 0.5052521824836731, + "learning_rate": 8e-05, + "loss": 1.8544, + "step": 7387 + }, + { + "epoch": 0.41181716833890747, + "grad_norm": 0.5772587060928345, + "learning_rate": 8e-05, + "loss": 1.9451, + "step": 7388 + }, + { + "epoch": 0.41187290969899665, + "grad_norm": 0.5105565786361694, + "learning_rate": 8e-05, + "loss": 1.5803, + "step": 7389 + }, + { + "epoch": 0.4119286510590858, + "grad_norm": 
0.5484120845794678, + "learning_rate": 8e-05, + "loss": 1.7197, + "step": 7390 + }, + { + "epoch": 0.411984392419175, + "grad_norm": 0.5253351926803589, + "learning_rate": 8e-05, + "loss": 1.5121, + "step": 7391 + }, + { + "epoch": 0.41204013377926424, + "grad_norm": 0.523414134979248, + "learning_rate": 8e-05, + "loss": 1.6663, + "step": 7392 + }, + { + "epoch": 0.4120958751393534, + "grad_norm": 0.5172882676124573, + "learning_rate": 8e-05, + "loss": 1.5974, + "step": 7393 + }, + { + "epoch": 0.4121516164994426, + "grad_norm": 0.5120169520378113, + "learning_rate": 8e-05, + "loss": 1.6205, + "step": 7394 + }, + { + "epoch": 0.4122073578595318, + "grad_norm": 0.5658069849014282, + "learning_rate": 8e-05, + "loss": 1.6488, + "step": 7395 + }, + { + "epoch": 0.41226309921962095, + "grad_norm": 0.5338782668113708, + "learning_rate": 8e-05, + "loss": 1.8355, + "step": 7396 + }, + { + "epoch": 0.41231884057971013, + "grad_norm": 0.504389226436615, + "learning_rate": 8e-05, + "loss": 1.7015, + "step": 7397 + }, + { + "epoch": 0.4123745819397993, + "grad_norm": 0.535618007183075, + "learning_rate": 8e-05, + "loss": 1.5951, + "step": 7398 + }, + { + "epoch": 0.4124303232998885, + "grad_norm": 0.501131534576416, + "learning_rate": 8e-05, + "loss": 1.733, + "step": 7399 + }, + { + "epoch": 0.4124860646599777, + "grad_norm": 0.5304338932037354, + "learning_rate": 8e-05, + "loss": 1.674, + "step": 7400 + }, + { + "epoch": 0.4125418060200669, + "grad_norm": 0.6905393004417419, + "learning_rate": 8e-05, + "loss": 1.4084, + "step": 7401 + }, + { + "epoch": 0.4125975473801561, + "grad_norm": 0.5419348478317261, + "learning_rate": 8e-05, + "loss": 1.7082, + "step": 7402 + }, + { + "epoch": 0.41265328874024526, + "grad_norm": 0.4964982867240906, + "learning_rate": 8e-05, + "loss": 1.5791, + "step": 7403 + }, + { + "epoch": 0.41270903010033444, + "grad_norm": 0.5790861248970032, + "learning_rate": 8e-05, + "loss": 1.8076, + "step": 7404 + }, + { + "epoch": 0.4127647714604236, + 
"grad_norm": 0.5126116871833801, + "learning_rate": 8e-05, + "loss": 1.7128, + "step": 7405 + }, + { + "epoch": 0.4128205128205128, + "grad_norm": 0.5443001985549927, + "learning_rate": 8e-05, + "loss": 1.7598, + "step": 7406 + }, + { + "epoch": 0.41287625418060203, + "grad_norm": 0.4740257263183594, + "learning_rate": 8e-05, + "loss": 1.4271, + "step": 7407 + }, + { + "epoch": 0.4129319955406912, + "grad_norm": 0.5965816974639893, + "learning_rate": 8e-05, + "loss": 1.8464, + "step": 7408 + }, + { + "epoch": 0.4129877369007804, + "grad_norm": 0.5371137261390686, + "learning_rate": 8e-05, + "loss": 1.561, + "step": 7409 + }, + { + "epoch": 0.41304347826086957, + "grad_norm": 0.5329623222351074, + "learning_rate": 8e-05, + "loss": 1.7707, + "step": 7410 + }, + { + "epoch": 0.41309921962095875, + "grad_norm": 0.5217059254646301, + "learning_rate": 8e-05, + "loss": 1.5821, + "step": 7411 + }, + { + "epoch": 0.4131549609810479, + "grad_norm": 0.5564699172973633, + "learning_rate": 8e-05, + "loss": 1.7106, + "step": 7412 + }, + { + "epoch": 0.4132107023411371, + "grad_norm": 0.5382006168365479, + "learning_rate": 8e-05, + "loss": 1.7405, + "step": 7413 + }, + { + "epoch": 0.4132664437012263, + "grad_norm": 0.5441609025001526, + "learning_rate": 8e-05, + "loss": 1.6926, + "step": 7414 + }, + { + "epoch": 0.4133221850613155, + "grad_norm": 0.578251302242279, + "learning_rate": 8e-05, + "loss": 1.51, + "step": 7415 + }, + { + "epoch": 0.4133779264214047, + "grad_norm": 0.5474327802658081, + "learning_rate": 8e-05, + "loss": 1.7875, + "step": 7416 + }, + { + "epoch": 0.4134336677814939, + "grad_norm": 0.5703834891319275, + "learning_rate": 8e-05, + "loss": 1.8034, + "step": 7417 + }, + { + "epoch": 0.41348940914158305, + "grad_norm": 0.5354348421096802, + "learning_rate": 8e-05, + "loss": 1.746, + "step": 7418 + }, + { + "epoch": 0.41354515050167223, + "grad_norm": 0.5187190771102905, + "learning_rate": 8e-05, + "loss": 1.6389, + "step": 7419 + }, + { + "epoch": 
0.4136008918617614, + "grad_norm": 0.5314764380455017, + "learning_rate": 8e-05, + "loss": 1.7257, + "step": 7420 + }, + { + "epoch": 0.4136566332218506, + "grad_norm": 0.5247149467468262, + "learning_rate": 8e-05, + "loss": 1.568, + "step": 7421 + }, + { + "epoch": 0.4137123745819398, + "grad_norm": 0.4993450343608856, + "learning_rate": 8e-05, + "loss": 1.5826, + "step": 7422 + }, + { + "epoch": 0.413768115942029, + "grad_norm": 0.5261116623878479, + "learning_rate": 8e-05, + "loss": 1.6117, + "step": 7423 + }, + { + "epoch": 0.4138238573021182, + "grad_norm": 0.5468164086341858, + "learning_rate": 8e-05, + "loss": 1.5543, + "step": 7424 + }, + { + "epoch": 0.41387959866220736, + "grad_norm": 0.5935494303703308, + "learning_rate": 8e-05, + "loss": 1.7585, + "step": 7425 + }, + { + "epoch": 0.41393534002229654, + "grad_norm": 0.5638031363487244, + "learning_rate": 8e-05, + "loss": 1.9934, + "step": 7426 + }, + { + "epoch": 0.4139910813823857, + "grad_norm": 0.47739219665527344, + "learning_rate": 8e-05, + "loss": 1.5811, + "step": 7427 + }, + { + "epoch": 0.4140468227424749, + "grad_norm": 0.5171627998352051, + "learning_rate": 8e-05, + "loss": 1.7371, + "step": 7428 + }, + { + "epoch": 0.41410256410256413, + "grad_norm": 0.5365909337997437, + "learning_rate": 8e-05, + "loss": 1.7186, + "step": 7429 + }, + { + "epoch": 0.4141583054626533, + "grad_norm": 0.5287392735481262, + "learning_rate": 8e-05, + "loss": 1.5597, + "step": 7430 + }, + { + "epoch": 0.4142140468227425, + "grad_norm": 0.5019273161888123, + "learning_rate": 8e-05, + "loss": 1.4832, + "step": 7431 + }, + { + "epoch": 0.41426978818283167, + "grad_norm": 0.535295844078064, + "learning_rate": 8e-05, + "loss": 1.6458, + "step": 7432 + }, + { + "epoch": 0.41432552954292085, + "grad_norm": 0.5648143291473389, + "learning_rate": 8e-05, + "loss": 1.7733, + "step": 7433 + }, + { + "epoch": 0.41438127090301, + "grad_norm": 0.5082452893257141, + "learning_rate": 8e-05, + "loss": 1.7593, + "step": 7434 + }, + { 
+ "epoch": 0.4144370122630992, + "grad_norm": 0.515472412109375, + "learning_rate": 8e-05, + "loss": 1.5832, + "step": 7435 + }, + { + "epoch": 0.4144927536231884, + "grad_norm": 0.540582001209259, + "learning_rate": 8e-05, + "loss": 1.6328, + "step": 7436 + }, + { + "epoch": 0.4145484949832776, + "grad_norm": 0.5583634972572327, + "learning_rate": 8e-05, + "loss": 1.8097, + "step": 7437 + }, + { + "epoch": 0.4146042363433668, + "grad_norm": 0.5404437184333801, + "learning_rate": 8e-05, + "loss": 1.7001, + "step": 7438 + }, + { + "epoch": 0.414659977703456, + "grad_norm": 0.5578569769859314, + "learning_rate": 8e-05, + "loss": 1.6861, + "step": 7439 + }, + { + "epoch": 0.41471571906354515, + "grad_norm": 0.5309574007987976, + "learning_rate": 8e-05, + "loss": 1.5784, + "step": 7440 + }, + { + "epoch": 0.41477146042363433, + "grad_norm": 0.5452367663383484, + "learning_rate": 8e-05, + "loss": 1.687, + "step": 7441 + }, + { + "epoch": 0.4148272017837235, + "grad_norm": 0.5491836071014404, + "learning_rate": 8e-05, + "loss": 1.6507, + "step": 7442 + }, + { + "epoch": 0.4148829431438127, + "grad_norm": 0.5284445285797119, + "learning_rate": 8e-05, + "loss": 1.5607, + "step": 7443 + }, + { + "epoch": 0.4149386845039019, + "grad_norm": 0.5563288927078247, + "learning_rate": 8e-05, + "loss": 1.666, + "step": 7444 + }, + { + "epoch": 0.4149944258639911, + "grad_norm": 0.5210450887680054, + "learning_rate": 8e-05, + "loss": 1.5811, + "step": 7445 + }, + { + "epoch": 0.4150501672240803, + "grad_norm": 0.5133852958679199, + "learning_rate": 8e-05, + "loss": 1.7379, + "step": 7446 + }, + { + "epoch": 0.41510590858416946, + "grad_norm": 0.5505521297454834, + "learning_rate": 8e-05, + "loss": 1.7915, + "step": 7447 + }, + { + "epoch": 0.41516164994425864, + "grad_norm": 0.5652416348457336, + "learning_rate": 8e-05, + "loss": 1.6962, + "step": 7448 + }, + { + "epoch": 0.4152173913043478, + "grad_norm": 0.4884105622768402, + "learning_rate": 8e-05, + "loss": 1.6399, + "step": 7449 
+ }, + { + "epoch": 0.415273132664437, + "grad_norm": 0.5113058686256409, + "learning_rate": 8e-05, + "loss": 1.7623, + "step": 7450 + }, + { + "epoch": 0.4153288740245262, + "grad_norm": 0.533737063407898, + "learning_rate": 8e-05, + "loss": 1.5663, + "step": 7451 + }, + { + "epoch": 0.4153846153846154, + "grad_norm": 0.5064999461174011, + "learning_rate": 8e-05, + "loss": 1.639, + "step": 7452 + }, + { + "epoch": 0.4154403567447046, + "grad_norm": 0.600654661655426, + "learning_rate": 8e-05, + "loss": 1.62, + "step": 7453 + }, + { + "epoch": 0.41549609810479377, + "grad_norm": 0.5428963899612427, + "learning_rate": 8e-05, + "loss": 1.7833, + "step": 7454 + }, + { + "epoch": 0.41555183946488294, + "grad_norm": 0.5852295756340027, + "learning_rate": 8e-05, + "loss": 1.7379, + "step": 7455 + }, + { + "epoch": 0.4156075808249721, + "grad_norm": 0.49108296632766724, + "learning_rate": 8e-05, + "loss": 1.4555, + "step": 7456 + }, + { + "epoch": 0.4156633221850613, + "grad_norm": 0.5304026007652283, + "learning_rate": 8e-05, + "loss": 1.4129, + "step": 7457 + }, + { + "epoch": 0.4157190635451505, + "grad_norm": 0.5354292392730713, + "learning_rate": 8e-05, + "loss": 1.5815, + "step": 7458 + }, + { + "epoch": 0.4157748049052397, + "grad_norm": 0.49475952982902527, + "learning_rate": 8e-05, + "loss": 1.6129, + "step": 7459 + }, + { + "epoch": 0.4158305462653289, + "grad_norm": 0.5129939317703247, + "learning_rate": 8e-05, + "loss": 1.6688, + "step": 7460 + }, + { + "epoch": 0.41588628762541807, + "grad_norm": 0.5419784784317017, + "learning_rate": 8e-05, + "loss": 1.7965, + "step": 7461 + }, + { + "epoch": 0.41594202898550725, + "grad_norm": 0.521435022354126, + "learning_rate": 8e-05, + "loss": 1.6395, + "step": 7462 + }, + { + "epoch": 0.41599777034559643, + "grad_norm": 0.5104933977127075, + "learning_rate": 8e-05, + "loss": 1.6178, + "step": 7463 + }, + { + "epoch": 0.4160535117056856, + "grad_norm": 0.5301722288131714, + "learning_rate": 8e-05, + "loss": 1.7177, + 
"step": 7464 + }, + { + "epoch": 0.4161092530657748, + "grad_norm": 0.5151179432868958, + "learning_rate": 8e-05, + "loss": 1.667, + "step": 7465 + }, + { + "epoch": 0.41616499442586397, + "grad_norm": 0.4956851303577423, + "learning_rate": 8e-05, + "loss": 1.6084, + "step": 7466 + }, + { + "epoch": 0.4162207357859532, + "grad_norm": 0.5060463547706604, + "learning_rate": 8e-05, + "loss": 1.4551, + "step": 7467 + }, + { + "epoch": 0.4162764771460424, + "grad_norm": 0.5542873740196228, + "learning_rate": 8e-05, + "loss": 1.4436, + "step": 7468 + }, + { + "epoch": 0.41633221850613156, + "grad_norm": 0.5257938504219055, + "learning_rate": 8e-05, + "loss": 1.5713, + "step": 7469 + }, + { + "epoch": 0.41638795986622074, + "grad_norm": 0.5059783458709717, + "learning_rate": 8e-05, + "loss": 1.5782, + "step": 7470 + }, + { + "epoch": 0.4164437012263099, + "grad_norm": 0.5536264181137085, + "learning_rate": 8e-05, + "loss": 1.8385, + "step": 7471 + }, + { + "epoch": 0.4164994425863991, + "grad_norm": 0.5066181421279907, + "learning_rate": 8e-05, + "loss": 1.5314, + "step": 7472 + }, + { + "epoch": 0.41655518394648827, + "grad_norm": 0.5219850540161133, + "learning_rate": 8e-05, + "loss": 1.7979, + "step": 7473 + }, + { + "epoch": 0.4166109253065775, + "grad_norm": 0.5205042362213135, + "learning_rate": 8e-05, + "loss": 1.6734, + "step": 7474 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.49642428755760193, + "learning_rate": 8e-05, + "loss": 1.697, + "step": 7475 + }, + { + "epoch": 0.41672240802675586, + "grad_norm": 0.4865456223487854, + "learning_rate": 8e-05, + "loss": 1.5118, + "step": 7476 + }, + { + "epoch": 0.41677814938684504, + "grad_norm": 0.5142179131507874, + "learning_rate": 8e-05, + "loss": 1.5317, + "step": 7477 + }, + { + "epoch": 0.4168338907469342, + "grad_norm": 0.5317469835281372, + "learning_rate": 8e-05, + "loss": 1.601, + "step": 7478 + }, + { + "epoch": 0.4168896321070234, + "grad_norm": 0.501403272151947, + "learning_rate": 8e-05, + 
"loss": 1.6438, + "step": 7479 + }, + { + "epoch": 0.4169453734671126, + "grad_norm": 0.5238836407661438, + "learning_rate": 8e-05, + "loss": 1.7402, + "step": 7480 + }, + { + "epoch": 0.41700111482720176, + "grad_norm": 0.6039742231369019, + "learning_rate": 8e-05, + "loss": 1.8983, + "step": 7481 + }, + { + "epoch": 0.417056856187291, + "grad_norm": 0.593430757522583, + "learning_rate": 8e-05, + "loss": 1.9938, + "step": 7482 + }, + { + "epoch": 0.41711259754738017, + "grad_norm": 0.5287467837333679, + "learning_rate": 8e-05, + "loss": 1.7788, + "step": 7483 + }, + { + "epoch": 0.41716833890746935, + "grad_norm": 0.5970814228057861, + "learning_rate": 8e-05, + "loss": 1.8968, + "step": 7484 + }, + { + "epoch": 0.41722408026755853, + "grad_norm": 0.49905112385749817, + "learning_rate": 8e-05, + "loss": 1.5382, + "step": 7485 + }, + { + "epoch": 0.4172798216276477, + "grad_norm": 0.5791721343994141, + "learning_rate": 8e-05, + "loss": 1.87, + "step": 7486 + }, + { + "epoch": 0.4173355629877369, + "grad_norm": 0.534599244594574, + "learning_rate": 8e-05, + "loss": 1.7648, + "step": 7487 + }, + { + "epoch": 0.41739130434782606, + "grad_norm": 0.5286064147949219, + "learning_rate": 8e-05, + "loss": 1.4987, + "step": 7488 + }, + { + "epoch": 0.4174470457079153, + "grad_norm": 0.47725871205329895, + "learning_rate": 8e-05, + "loss": 1.4845, + "step": 7489 + }, + { + "epoch": 0.4175027870680045, + "grad_norm": 0.5819158554077148, + "learning_rate": 8e-05, + "loss": 1.8628, + "step": 7490 + }, + { + "epoch": 0.41755852842809366, + "grad_norm": 0.5254594683647156, + "learning_rate": 8e-05, + "loss": 1.5828, + "step": 7491 + }, + { + "epoch": 0.41761426978818283, + "grad_norm": 0.5000957250595093, + "learning_rate": 8e-05, + "loss": 1.5273, + "step": 7492 + }, + { + "epoch": 0.417670011148272, + "grad_norm": 0.5213155746459961, + "learning_rate": 8e-05, + "loss": 1.8125, + "step": 7493 + }, + { + "epoch": 0.4177257525083612, + "grad_norm": 0.4522757828235626, + 
"learning_rate": 8e-05, + "loss": 1.3973, + "step": 7494 + }, + { + "epoch": 0.41778149386845037, + "grad_norm": 0.5139923691749573, + "learning_rate": 8e-05, + "loss": 1.7558, + "step": 7495 + }, + { + "epoch": 0.41783723522853955, + "grad_norm": 0.5346956849098206, + "learning_rate": 8e-05, + "loss": 1.3351, + "step": 7496 + }, + { + "epoch": 0.4178929765886288, + "grad_norm": 0.48926034569740295, + "learning_rate": 8e-05, + "loss": 1.4097, + "step": 7497 + }, + { + "epoch": 0.41794871794871796, + "grad_norm": 0.5781483054161072, + "learning_rate": 8e-05, + "loss": 1.7858, + "step": 7498 + }, + { + "epoch": 0.41800445930880714, + "grad_norm": 0.5323367118835449, + "learning_rate": 8e-05, + "loss": 1.4932, + "step": 7499 + }, + { + "epoch": 0.4180602006688963, + "grad_norm": 0.6245522499084473, + "learning_rate": 8e-05, + "loss": 1.6934, + "step": 7500 + }, + { + "epoch": 0.4181159420289855, + "grad_norm": 0.5793091654777527, + "learning_rate": 8e-05, + "loss": 1.8131, + "step": 7501 + }, + { + "epoch": 0.4181716833890747, + "grad_norm": 0.5998666286468506, + "learning_rate": 8e-05, + "loss": 1.5865, + "step": 7502 + }, + { + "epoch": 0.41822742474916386, + "grad_norm": 0.513708770275116, + "learning_rate": 8e-05, + "loss": 1.4626, + "step": 7503 + }, + { + "epoch": 0.4182831661092531, + "grad_norm": 0.5088089108467102, + "learning_rate": 8e-05, + "loss": 1.6985, + "step": 7504 + }, + { + "epoch": 0.41833890746934227, + "grad_norm": 0.5325367450714111, + "learning_rate": 8e-05, + "loss": 1.7215, + "step": 7505 + }, + { + "epoch": 0.41839464882943145, + "grad_norm": 0.5224018692970276, + "learning_rate": 8e-05, + "loss": 1.5529, + "step": 7506 + }, + { + "epoch": 0.4184503901895206, + "grad_norm": 0.5467714667320251, + "learning_rate": 8e-05, + "loss": 1.8319, + "step": 7507 + }, + { + "epoch": 0.4185061315496098, + "grad_norm": 0.5274876356124878, + "learning_rate": 8e-05, + "loss": 1.7315, + "step": 7508 + }, + { + "epoch": 0.418561872909699, + "grad_norm": 
0.5074488520622253, + "learning_rate": 8e-05, + "loss": 1.5532, + "step": 7509 + }, + { + "epoch": 0.41861761426978816, + "grad_norm": 0.4909979999065399, + "learning_rate": 8e-05, + "loss": 1.4936, + "step": 7510 + }, + { + "epoch": 0.41867335562987734, + "grad_norm": 0.5296882390975952, + "learning_rate": 8e-05, + "loss": 1.6747, + "step": 7511 + }, + { + "epoch": 0.4187290969899666, + "grad_norm": 0.5459797382354736, + "learning_rate": 8e-05, + "loss": 1.7246, + "step": 7512 + }, + { + "epoch": 0.41878483835005575, + "grad_norm": 0.4935992360115051, + "learning_rate": 8e-05, + "loss": 1.5092, + "step": 7513 + }, + { + "epoch": 0.41884057971014493, + "grad_norm": 0.5477376580238342, + "learning_rate": 8e-05, + "loss": 1.7749, + "step": 7514 + }, + { + "epoch": 0.4188963210702341, + "grad_norm": 0.5266591906547546, + "learning_rate": 8e-05, + "loss": 1.7828, + "step": 7515 + }, + { + "epoch": 0.4189520624303233, + "grad_norm": 0.5456550717353821, + "learning_rate": 8e-05, + "loss": 1.6885, + "step": 7516 + }, + { + "epoch": 0.41900780379041247, + "grad_norm": 0.56923508644104, + "learning_rate": 8e-05, + "loss": 1.8137, + "step": 7517 + }, + { + "epoch": 0.41906354515050165, + "grad_norm": 0.5321353673934937, + "learning_rate": 8e-05, + "loss": 1.718, + "step": 7518 + }, + { + "epoch": 0.4191192865105909, + "grad_norm": 0.5207672715187073, + "learning_rate": 8e-05, + "loss": 1.4387, + "step": 7519 + }, + { + "epoch": 0.41917502787068006, + "grad_norm": 0.5619326233863831, + "learning_rate": 8e-05, + "loss": 1.8107, + "step": 7520 + }, + { + "epoch": 0.41923076923076924, + "grad_norm": 0.5416442155838013, + "learning_rate": 8e-05, + "loss": 1.7936, + "step": 7521 + }, + { + "epoch": 0.4192865105908584, + "grad_norm": 0.5835678577423096, + "learning_rate": 8e-05, + "loss": 1.7206, + "step": 7522 + }, + { + "epoch": 0.4193422519509476, + "grad_norm": 0.5465813279151917, + "learning_rate": 8e-05, + "loss": 1.6876, + "step": 7523 + }, + { + "epoch": 0.4193979933110368, 
+ "grad_norm": 0.5443020462989807, + "learning_rate": 8e-05, + "loss": 1.6822, + "step": 7524 + }, + { + "epoch": 0.41945373467112596, + "grad_norm": 0.4806618392467499, + "learning_rate": 8e-05, + "loss": 1.5436, + "step": 7525 + }, + { + "epoch": 0.4195094760312152, + "grad_norm": 0.5466196537017822, + "learning_rate": 8e-05, + "loss": 1.6569, + "step": 7526 + }, + { + "epoch": 0.41956521739130437, + "grad_norm": 0.520419716835022, + "learning_rate": 8e-05, + "loss": 1.6902, + "step": 7527 + }, + { + "epoch": 0.41962095875139355, + "grad_norm": 0.512060821056366, + "learning_rate": 8e-05, + "loss": 1.7119, + "step": 7528 + }, + { + "epoch": 0.4196767001114827, + "grad_norm": 0.5478929281234741, + "learning_rate": 8e-05, + "loss": 1.5737, + "step": 7529 + }, + { + "epoch": 0.4197324414715719, + "grad_norm": 0.5517770051956177, + "learning_rate": 8e-05, + "loss": 1.8199, + "step": 7530 + }, + { + "epoch": 0.4197881828316611, + "grad_norm": 0.5207688212394714, + "learning_rate": 8e-05, + "loss": 1.3724, + "step": 7531 + }, + { + "epoch": 0.41984392419175026, + "grad_norm": 0.5325443148612976, + "learning_rate": 8e-05, + "loss": 1.7362, + "step": 7532 + }, + { + "epoch": 0.41989966555183944, + "grad_norm": 0.5486525893211365, + "learning_rate": 8e-05, + "loss": 1.6209, + "step": 7533 + }, + { + "epoch": 0.4199554069119287, + "grad_norm": 0.5634220838546753, + "learning_rate": 8e-05, + "loss": 1.6899, + "step": 7534 + }, + { + "epoch": 0.42001114827201785, + "grad_norm": 0.603767454624176, + "learning_rate": 8e-05, + "loss": 1.9529, + "step": 7535 + }, + { + "epoch": 0.42006688963210703, + "grad_norm": 0.5488311052322388, + "learning_rate": 8e-05, + "loss": 1.6988, + "step": 7536 + }, + { + "epoch": 0.4201226309921962, + "grad_norm": 0.5185443758964539, + "learning_rate": 8e-05, + "loss": 1.7136, + "step": 7537 + }, + { + "epoch": 0.4201783723522854, + "grad_norm": 0.5392804741859436, + "learning_rate": 8e-05, + "loss": 1.9436, + "step": 7538 + }, + { + "epoch": 
0.42023411371237457, + "grad_norm": 0.5228851437568665, + "learning_rate": 8e-05, + "loss": 1.4759, + "step": 7539 + }, + { + "epoch": 0.42028985507246375, + "grad_norm": 0.48584380745887756, + "learning_rate": 8e-05, + "loss": 1.5736, + "step": 7540 + }, + { + "epoch": 0.420345596432553, + "grad_norm": 0.5334780812263489, + "learning_rate": 8e-05, + "loss": 1.5748, + "step": 7541 + }, + { + "epoch": 0.42040133779264216, + "grad_norm": 0.5429795384407043, + "learning_rate": 8e-05, + "loss": 1.627, + "step": 7542 + }, + { + "epoch": 0.42045707915273134, + "grad_norm": 0.5226675271987915, + "learning_rate": 8e-05, + "loss": 1.7006, + "step": 7543 + }, + { + "epoch": 0.4205128205128205, + "grad_norm": 0.5478998422622681, + "learning_rate": 8e-05, + "loss": 1.6005, + "step": 7544 + }, + { + "epoch": 0.4205685618729097, + "grad_norm": 0.5705276131629944, + "learning_rate": 8e-05, + "loss": 1.7977, + "step": 7545 + }, + { + "epoch": 0.4206243032329989, + "grad_norm": 0.5187798738479614, + "learning_rate": 8e-05, + "loss": 1.6782, + "step": 7546 + }, + { + "epoch": 0.42068004459308805, + "grad_norm": 0.5031110644340515, + "learning_rate": 8e-05, + "loss": 1.5819, + "step": 7547 + }, + { + "epoch": 0.42073578595317723, + "grad_norm": 0.5376731157302856, + "learning_rate": 8e-05, + "loss": 1.6474, + "step": 7548 + }, + { + "epoch": 0.42079152731326647, + "grad_norm": 0.5520094037055969, + "learning_rate": 8e-05, + "loss": 1.6007, + "step": 7549 + }, + { + "epoch": 0.42084726867335565, + "grad_norm": 0.51263827085495, + "learning_rate": 8e-05, + "loss": 1.5592, + "step": 7550 + }, + { + "epoch": 0.4209030100334448, + "grad_norm": 0.5615968108177185, + "learning_rate": 8e-05, + "loss": 1.6022, + "step": 7551 + }, + { + "epoch": 0.420958751393534, + "grad_norm": 0.5113683938980103, + "learning_rate": 8e-05, + "loss": 1.6102, + "step": 7552 + }, + { + "epoch": 0.4210144927536232, + "grad_norm": 0.5493238568305969, + "learning_rate": 8e-05, + "loss": 1.6307, + "step": 7553 + }, 
+ { + "epoch": 0.42107023411371236, + "grad_norm": 0.47366848587989807, + "learning_rate": 8e-05, + "loss": 1.4715, + "step": 7554 + }, + { + "epoch": 0.42112597547380154, + "grad_norm": 0.5336489677429199, + "learning_rate": 8e-05, + "loss": 1.7972, + "step": 7555 + }, + { + "epoch": 0.4211817168338908, + "grad_norm": 0.5131495594978333, + "learning_rate": 8e-05, + "loss": 1.6308, + "step": 7556 + }, + { + "epoch": 0.42123745819397995, + "grad_norm": 0.5199673771858215, + "learning_rate": 8e-05, + "loss": 1.6602, + "step": 7557 + }, + { + "epoch": 0.42129319955406913, + "grad_norm": 0.5832521915435791, + "learning_rate": 8e-05, + "loss": 1.7775, + "step": 7558 + }, + { + "epoch": 0.4213489409141583, + "grad_norm": 0.5680006146430969, + "learning_rate": 8e-05, + "loss": 1.7212, + "step": 7559 + }, + { + "epoch": 0.4214046822742475, + "grad_norm": 0.5315585136413574, + "learning_rate": 8e-05, + "loss": 1.672, + "step": 7560 + }, + { + "epoch": 0.42146042363433667, + "grad_norm": 0.5046219229698181, + "learning_rate": 8e-05, + "loss": 1.5325, + "step": 7561 + }, + { + "epoch": 0.42151616499442585, + "grad_norm": 0.5093092322349548, + "learning_rate": 8e-05, + "loss": 1.4543, + "step": 7562 + }, + { + "epoch": 0.421571906354515, + "grad_norm": 0.49265122413635254, + "learning_rate": 8e-05, + "loss": 1.4613, + "step": 7563 + }, + { + "epoch": 0.42162764771460426, + "grad_norm": 0.5940898656845093, + "learning_rate": 8e-05, + "loss": 1.7415, + "step": 7564 + }, + { + "epoch": 0.42168338907469344, + "grad_norm": 0.4995904862880707, + "learning_rate": 8e-05, + "loss": 1.6841, + "step": 7565 + }, + { + "epoch": 0.4217391304347826, + "grad_norm": 0.5423473715782166, + "learning_rate": 8e-05, + "loss": 1.6731, + "step": 7566 + }, + { + "epoch": 0.4217948717948718, + "grad_norm": 0.5287518501281738, + "learning_rate": 8e-05, + "loss": 1.5821, + "step": 7567 + }, + { + "epoch": 0.421850613154961, + "grad_norm": 0.6142107248306274, + "learning_rate": 8e-05, + "loss": 1.7342, + 
"step": 7568 + }, + { + "epoch": 0.42190635451505015, + "grad_norm": 0.5168833136558533, + "learning_rate": 8e-05, + "loss": 1.6403, + "step": 7569 + }, + { + "epoch": 0.42196209587513933, + "grad_norm": 0.49429258704185486, + "learning_rate": 8e-05, + "loss": 1.4016, + "step": 7570 + }, + { + "epoch": 0.42201783723522857, + "grad_norm": 0.5727221369743347, + "learning_rate": 8e-05, + "loss": 1.722, + "step": 7571 + }, + { + "epoch": 0.42207357859531774, + "grad_norm": 0.5960108041763306, + "learning_rate": 8e-05, + "loss": 1.8959, + "step": 7572 + }, + { + "epoch": 0.4221293199554069, + "grad_norm": 0.5104274153709412, + "learning_rate": 8e-05, + "loss": 1.6805, + "step": 7573 + }, + { + "epoch": 0.4221850613154961, + "grad_norm": 0.5847427248954773, + "learning_rate": 8e-05, + "loss": 1.7601, + "step": 7574 + }, + { + "epoch": 0.4222408026755853, + "grad_norm": 0.6041691899299622, + "learning_rate": 8e-05, + "loss": 1.6447, + "step": 7575 + }, + { + "epoch": 0.42229654403567446, + "grad_norm": 0.5371209383010864, + "learning_rate": 8e-05, + "loss": 1.6438, + "step": 7576 + }, + { + "epoch": 0.42235228539576364, + "grad_norm": 0.540814220905304, + "learning_rate": 8e-05, + "loss": 1.6554, + "step": 7577 + }, + { + "epoch": 0.4224080267558528, + "grad_norm": 0.5552213788032532, + "learning_rate": 8e-05, + "loss": 1.9482, + "step": 7578 + }, + { + "epoch": 0.42246376811594205, + "grad_norm": 0.565314769744873, + "learning_rate": 8e-05, + "loss": 1.9007, + "step": 7579 + }, + { + "epoch": 0.42251950947603123, + "grad_norm": 0.538690447807312, + "learning_rate": 8e-05, + "loss": 1.6039, + "step": 7580 + }, + { + "epoch": 0.4225752508361204, + "grad_norm": 0.5818107724189758, + "learning_rate": 8e-05, + "loss": 1.29, + "step": 7581 + }, + { + "epoch": 0.4226309921962096, + "grad_norm": 0.5533572435379028, + "learning_rate": 8e-05, + "loss": 1.7616, + "step": 7582 + }, + { + "epoch": 0.42268673355629877, + "grad_norm": 0.5512927174568176, + "learning_rate": 8e-05, + 
"loss": 1.651, + "step": 7583 + }, + { + "epoch": 0.42274247491638794, + "grad_norm": 0.5334979295730591, + "learning_rate": 8e-05, + "loss": 1.4996, + "step": 7584 + }, + { + "epoch": 0.4227982162764771, + "grad_norm": 0.48545053601264954, + "learning_rate": 8e-05, + "loss": 1.6261, + "step": 7585 + }, + { + "epoch": 0.42285395763656636, + "grad_norm": 0.47293198108673096, + "learning_rate": 8e-05, + "loss": 1.4688, + "step": 7586 + }, + { + "epoch": 0.42290969899665554, + "grad_norm": 0.5610591173171997, + "learning_rate": 8e-05, + "loss": 1.524, + "step": 7587 + }, + { + "epoch": 0.4229654403567447, + "grad_norm": 0.5938591957092285, + "learning_rate": 8e-05, + "loss": 2.0942, + "step": 7588 + }, + { + "epoch": 0.4230211817168339, + "grad_norm": 0.49670886993408203, + "learning_rate": 8e-05, + "loss": 1.7034, + "step": 7589 + }, + { + "epoch": 0.4230769230769231, + "grad_norm": 0.5407470464706421, + "learning_rate": 8e-05, + "loss": 1.7156, + "step": 7590 + }, + { + "epoch": 0.42313266443701225, + "grad_norm": 0.5480191111564636, + "learning_rate": 8e-05, + "loss": 1.4605, + "step": 7591 + }, + { + "epoch": 0.42318840579710143, + "grad_norm": 0.4956515431404114, + "learning_rate": 8e-05, + "loss": 1.5794, + "step": 7592 + }, + { + "epoch": 0.4232441471571906, + "grad_norm": 0.6380559802055359, + "learning_rate": 8e-05, + "loss": 1.9092, + "step": 7593 + }, + { + "epoch": 0.42329988851727984, + "grad_norm": 0.5132410526275635, + "learning_rate": 8e-05, + "loss": 1.5389, + "step": 7594 + }, + { + "epoch": 0.423355629877369, + "grad_norm": 0.5750256776809692, + "learning_rate": 8e-05, + "loss": 1.7148, + "step": 7595 + }, + { + "epoch": 0.4234113712374582, + "grad_norm": 0.5305300354957581, + "learning_rate": 8e-05, + "loss": 1.8729, + "step": 7596 + }, + { + "epoch": 0.4234671125975474, + "grad_norm": 0.5310443043708801, + "learning_rate": 8e-05, + "loss": 1.5722, + "step": 7597 + }, + { + "epoch": 0.42352285395763656, + "grad_norm": 0.5453668832778931, + 
"learning_rate": 8e-05, + "loss": 1.5986, + "step": 7598 + }, + { + "epoch": 0.42357859531772574, + "grad_norm": 0.4966369867324829, + "learning_rate": 8e-05, + "loss": 1.5603, + "step": 7599 + }, + { + "epoch": 0.4236343366778149, + "grad_norm": 0.5009279847145081, + "learning_rate": 8e-05, + "loss": 1.6332, + "step": 7600 + }, + { + "epoch": 0.42369007803790415, + "grad_norm": 0.5626919269561768, + "learning_rate": 8e-05, + "loss": 1.8475, + "step": 7601 + }, + { + "epoch": 0.42374581939799333, + "grad_norm": 0.48185285925865173, + "learning_rate": 8e-05, + "loss": 1.3049, + "step": 7602 + }, + { + "epoch": 0.4238015607580825, + "grad_norm": 0.5500190258026123, + "learning_rate": 8e-05, + "loss": 1.8254, + "step": 7603 + }, + { + "epoch": 0.4238573021181717, + "grad_norm": 0.5302898287773132, + "learning_rate": 8e-05, + "loss": 1.8387, + "step": 7604 + }, + { + "epoch": 0.42391304347826086, + "grad_norm": 0.5317866206169128, + "learning_rate": 8e-05, + "loss": 1.5846, + "step": 7605 + }, + { + "epoch": 0.42396878483835004, + "grad_norm": 0.5135228037834167, + "learning_rate": 8e-05, + "loss": 1.6452, + "step": 7606 + }, + { + "epoch": 0.4240245261984392, + "grad_norm": 0.5610030293464661, + "learning_rate": 8e-05, + "loss": 1.678, + "step": 7607 + }, + { + "epoch": 0.4240802675585284, + "grad_norm": 0.5058049559593201, + "learning_rate": 8e-05, + "loss": 1.8044, + "step": 7608 + }, + { + "epoch": 0.42413600891861764, + "grad_norm": 0.5683199167251587, + "learning_rate": 8e-05, + "loss": 1.8105, + "step": 7609 + }, + { + "epoch": 0.4241917502787068, + "grad_norm": 0.5189591646194458, + "learning_rate": 8e-05, + "loss": 1.4554, + "step": 7610 + }, + { + "epoch": 0.424247491638796, + "grad_norm": 0.5114647746086121, + "learning_rate": 8e-05, + "loss": 1.6832, + "step": 7611 + }, + { + "epoch": 0.42430323299888517, + "grad_norm": 0.5456311702728271, + "learning_rate": 8e-05, + "loss": 1.7849, + "step": 7612 + }, + { + "epoch": 0.42435897435897435, + "grad_norm": 
0.5578476190567017, + "learning_rate": 8e-05, + "loss": 1.8521, + "step": 7613 + }, + { + "epoch": 0.42441471571906353, + "grad_norm": 0.5680450201034546, + "learning_rate": 8e-05, + "loss": 1.8202, + "step": 7614 + }, + { + "epoch": 0.4244704570791527, + "grad_norm": 0.5414755344390869, + "learning_rate": 8e-05, + "loss": 1.7448, + "step": 7615 + }, + { + "epoch": 0.42452619843924194, + "grad_norm": 0.5823304653167725, + "learning_rate": 8e-05, + "loss": 1.8596, + "step": 7616 + }, + { + "epoch": 0.4245819397993311, + "grad_norm": 0.5097913146018982, + "learning_rate": 8e-05, + "loss": 1.4833, + "step": 7617 + }, + { + "epoch": 0.4246376811594203, + "grad_norm": 0.5405616164207458, + "learning_rate": 8e-05, + "loss": 1.764, + "step": 7618 + }, + { + "epoch": 0.4246934225195095, + "grad_norm": 0.513064980506897, + "learning_rate": 8e-05, + "loss": 1.6893, + "step": 7619 + }, + { + "epoch": 0.42474916387959866, + "grad_norm": 0.5439816117286682, + "learning_rate": 8e-05, + "loss": 1.6872, + "step": 7620 + }, + { + "epoch": 0.42480490523968784, + "grad_norm": 0.7042144536972046, + "learning_rate": 8e-05, + "loss": 1.8371, + "step": 7621 + }, + { + "epoch": 0.424860646599777, + "grad_norm": 0.4817110598087311, + "learning_rate": 8e-05, + "loss": 1.6254, + "step": 7622 + }, + { + "epoch": 0.4249163879598662, + "grad_norm": 0.5761969685554504, + "learning_rate": 8e-05, + "loss": 1.7819, + "step": 7623 + }, + { + "epoch": 0.4249721293199554, + "grad_norm": 0.6097943186759949, + "learning_rate": 8e-05, + "loss": 1.8692, + "step": 7624 + }, + { + "epoch": 0.4250278706800446, + "grad_norm": 0.5209263563156128, + "learning_rate": 8e-05, + "loss": 1.6544, + "step": 7625 + }, + { + "epoch": 0.4250836120401338, + "grad_norm": 0.5559192299842834, + "learning_rate": 8e-05, + "loss": 1.7777, + "step": 7626 + }, + { + "epoch": 0.42513935340022296, + "grad_norm": 0.5229249000549316, + "learning_rate": 8e-05, + "loss": 1.749, + "step": 7627 + }, + { + "epoch": 0.42519509476031214, + 
"grad_norm": 0.4869654178619385, + "learning_rate": 8e-05, + "loss": 1.4507, + "step": 7628 + }, + { + "epoch": 0.4252508361204013, + "grad_norm": 0.5297756791114807, + "learning_rate": 8e-05, + "loss": 1.7302, + "step": 7629 + }, + { + "epoch": 0.4253065774804905, + "grad_norm": 0.5154840350151062, + "learning_rate": 8e-05, + "loss": 1.7402, + "step": 7630 + }, + { + "epoch": 0.42536231884057973, + "grad_norm": 0.4986840784549713, + "learning_rate": 8e-05, + "loss": 1.4875, + "step": 7631 + }, + { + "epoch": 0.4254180602006689, + "grad_norm": 0.48561593890190125, + "learning_rate": 8e-05, + "loss": 1.5017, + "step": 7632 + }, + { + "epoch": 0.4254738015607581, + "grad_norm": 0.5005348920822144, + "learning_rate": 8e-05, + "loss": 1.67, + "step": 7633 + }, + { + "epoch": 0.42552954292084727, + "grad_norm": 0.5824643969535828, + "learning_rate": 8e-05, + "loss": 2.0515, + "step": 7634 + }, + { + "epoch": 0.42558528428093645, + "grad_norm": 0.512254536151886, + "learning_rate": 8e-05, + "loss": 1.8098, + "step": 7635 + }, + { + "epoch": 0.4256410256410256, + "grad_norm": 0.5454243421554565, + "learning_rate": 8e-05, + "loss": 1.9424, + "step": 7636 + }, + { + "epoch": 0.4256967670011148, + "grad_norm": 0.4775199890136719, + "learning_rate": 8e-05, + "loss": 1.3979, + "step": 7637 + }, + { + "epoch": 0.42575250836120404, + "grad_norm": 0.4912828207015991, + "learning_rate": 8e-05, + "loss": 1.4566, + "step": 7638 + }, + { + "epoch": 0.4258082497212932, + "grad_norm": 0.5596845149993896, + "learning_rate": 8e-05, + "loss": 1.7422, + "step": 7639 + }, + { + "epoch": 0.4258639910813824, + "grad_norm": 0.5112817287445068, + "learning_rate": 8e-05, + "loss": 1.5523, + "step": 7640 + }, + { + "epoch": 0.4259197324414716, + "grad_norm": 0.5548750758171082, + "learning_rate": 8e-05, + "loss": 1.7622, + "step": 7641 + }, + { + "epoch": 0.42597547380156076, + "grad_norm": 0.5437869429588318, + "learning_rate": 8e-05, + "loss": 1.6663, + "step": 7642 + }, + { + "epoch": 
0.42603121516164993, + "grad_norm": 0.5219390392303467, + "learning_rate": 8e-05, + "loss": 1.6779, + "step": 7643 + }, + { + "epoch": 0.4260869565217391, + "grad_norm": 0.5230270624160767, + "learning_rate": 8e-05, + "loss": 1.6244, + "step": 7644 + }, + { + "epoch": 0.4261426978818283, + "grad_norm": 0.5277723670005798, + "learning_rate": 8e-05, + "loss": 1.5704, + "step": 7645 + }, + { + "epoch": 0.4261984392419175, + "grad_norm": 0.5310590863227844, + "learning_rate": 8e-05, + "loss": 1.7704, + "step": 7646 + }, + { + "epoch": 0.4262541806020067, + "grad_norm": 0.5522536039352417, + "learning_rate": 8e-05, + "loss": 1.6834, + "step": 7647 + }, + { + "epoch": 0.4263099219620959, + "grad_norm": 0.5156833529472351, + "learning_rate": 8e-05, + "loss": 1.5644, + "step": 7648 + }, + { + "epoch": 0.42636566332218506, + "grad_norm": 0.5626975893974304, + "learning_rate": 8e-05, + "loss": 1.7787, + "step": 7649 + }, + { + "epoch": 0.42642140468227424, + "grad_norm": 0.5107052326202393, + "learning_rate": 8e-05, + "loss": 1.5207, + "step": 7650 + }, + { + "epoch": 0.4264771460423634, + "grad_norm": 0.5529700517654419, + "learning_rate": 8e-05, + "loss": 1.9028, + "step": 7651 + }, + { + "epoch": 0.4265328874024526, + "grad_norm": 0.5312693119049072, + "learning_rate": 8e-05, + "loss": 1.7605, + "step": 7652 + }, + { + "epoch": 0.42658862876254183, + "grad_norm": 0.5670287013053894, + "learning_rate": 8e-05, + "loss": 1.8526, + "step": 7653 + }, + { + "epoch": 0.426644370122631, + "grad_norm": 0.5672020316123962, + "learning_rate": 8e-05, + "loss": 1.4338, + "step": 7654 + }, + { + "epoch": 0.4267001114827202, + "grad_norm": 0.5201690793037415, + "learning_rate": 8e-05, + "loss": 1.5336, + "step": 7655 + }, + { + "epoch": 0.42675585284280937, + "grad_norm": 0.4957461953163147, + "learning_rate": 8e-05, + "loss": 1.5938, + "step": 7656 + }, + { + "epoch": 0.42681159420289855, + "grad_norm": 0.5092799663543701, + "learning_rate": 8e-05, + "loss": 1.5157, + "step": 7657 + }, 
+ { + "epoch": 0.4268673355629877, + "grad_norm": 0.5307939052581787, + "learning_rate": 8e-05, + "loss": 1.9267, + "step": 7658 + }, + { + "epoch": 0.4269230769230769, + "grad_norm": 0.5566995143890381, + "learning_rate": 8e-05, + "loss": 1.7914, + "step": 7659 + }, + { + "epoch": 0.4269788182831661, + "grad_norm": 0.5886173248291016, + "learning_rate": 8e-05, + "loss": 2.1544, + "step": 7660 + }, + { + "epoch": 0.4270345596432553, + "grad_norm": 0.547525942325592, + "learning_rate": 8e-05, + "loss": 1.7621, + "step": 7661 + }, + { + "epoch": 0.4270903010033445, + "grad_norm": 0.5709888935089111, + "learning_rate": 8e-05, + "loss": 1.7429, + "step": 7662 + }, + { + "epoch": 0.4271460423634337, + "grad_norm": 0.5912901759147644, + "learning_rate": 8e-05, + "loss": 1.6187, + "step": 7663 + }, + { + "epoch": 0.42720178372352285, + "grad_norm": 0.5148410797119141, + "learning_rate": 8e-05, + "loss": 1.7212, + "step": 7664 + }, + { + "epoch": 0.42725752508361203, + "grad_norm": 0.5268368124961853, + "learning_rate": 8e-05, + "loss": 1.5642, + "step": 7665 + }, + { + "epoch": 0.4273132664437012, + "grad_norm": 0.5531495213508606, + "learning_rate": 8e-05, + "loss": 1.7811, + "step": 7666 + }, + { + "epoch": 0.4273690078037904, + "grad_norm": 0.5273078680038452, + "learning_rate": 8e-05, + "loss": 1.7557, + "step": 7667 + }, + { + "epoch": 0.4274247491638796, + "grad_norm": 0.6302218437194824, + "learning_rate": 8e-05, + "loss": 1.6079, + "step": 7668 + }, + { + "epoch": 0.4274804905239688, + "grad_norm": 0.5131771564483643, + "learning_rate": 8e-05, + "loss": 1.6148, + "step": 7669 + }, + { + "epoch": 0.427536231884058, + "grad_norm": 0.4759463965892792, + "learning_rate": 8e-05, + "loss": 1.4748, + "step": 7670 + }, + { + "epoch": 0.42759197324414716, + "grad_norm": 0.6624416708946228, + "learning_rate": 8e-05, + "loss": 1.8419, + "step": 7671 + }, + { + "epoch": 0.42764771460423634, + "grad_norm": 0.5604111552238464, + "learning_rate": 8e-05, + "loss": 1.819, + 
"step": 7672 + }, + { + "epoch": 0.4277034559643255, + "grad_norm": 0.5581426620483398, + "learning_rate": 8e-05, + "loss": 1.585, + "step": 7673 + }, + { + "epoch": 0.4277591973244147, + "grad_norm": 0.5654292106628418, + "learning_rate": 8e-05, + "loss": 1.8578, + "step": 7674 + }, + { + "epoch": 0.4278149386845039, + "grad_norm": 0.5510108470916748, + "learning_rate": 8e-05, + "loss": 1.8643, + "step": 7675 + }, + { + "epoch": 0.4278706800445931, + "grad_norm": 0.5483967661857605, + "learning_rate": 8e-05, + "loss": 1.7746, + "step": 7676 + }, + { + "epoch": 0.4279264214046823, + "grad_norm": 0.5236601829528809, + "learning_rate": 8e-05, + "loss": 1.6946, + "step": 7677 + }, + { + "epoch": 0.42798216276477147, + "grad_norm": 0.5318403244018555, + "learning_rate": 8e-05, + "loss": 1.5735, + "step": 7678 + }, + { + "epoch": 0.42803790412486065, + "grad_norm": 0.5390852093696594, + "learning_rate": 8e-05, + "loss": 1.7182, + "step": 7679 + }, + { + "epoch": 0.4280936454849498, + "grad_norm": 0.6264533400535583, + "learning_rate": 8e-05, + "loss": 1.844, + "step": 7680 + }, + { + "epoch": 0.428149386845039, + "grad_norm": 0.5331520438194275, + "learning_rate": 8e-05, + "loss": 1.5223, + "step": 7681 + }, + { + "epoch": 0.4282051282051282, + "grad_norm": 0.5553402304649353, + "learning_rate": 8e-05, + "loss": 1.728, + "step": 7682 + }, + { + "epoch": 0.4282608695652174, + "grad_norm": 0.5179986357688904, + "learning_rate": 8e-05, + "loss": 1.4644, + "step": 7683 + }, + { + "epoch": 0.4283166109253066, + "grad_norm": 0.5654995441436768, + "learning_rate": 8e-05, + "loss": 1.5609, + "step": 7684 + }, + { + "epoch": 0.4283723522853958, + "grad_norm": 0.5537652373313904, + "learning_rate": 8e-05, + "loss": 1.5997, + "step": 7685 + }, + { + "epoch": 0.42842809364548495, + "grad_norm": 0.5693234801292419, + "learning_rate": 8e-05, + "loss": 1.7669, + "step": 7686 + }, + { + "epoch": 0.42848383500557413, + "grad_norm": 0.5033535361289978, + "learning_rate": 8e-05, + "loss": 
1.662, + "step": 7687 + }, + { + "epoch": 0.4285395763656633, + "grad_norm": 0.5271948575973511, + "learning_rate": 8e-05, + "loss": 1.5819, + "step": 7688 + }, + { + "epoch": 0.4285953177257525, + "grad_norm": 0.5141314268112183, + "learning_rate": 8e-05, + "loss": 1.6355, + "step": 7689 + }, + { + "epoch": 0.42865105908584167, + "grad_norm": 0.5537899732589722, + "learning_rate": 8e-05, + "loss": 1.7471, + "step": 7690 + }, + { + "epoch": 0.4287068004459309, + "grad_norm": 0.5063492059707642, + "learning_rate": 8e-05, + "loss": 1.3736, + "step": 7691 + }, + { + "epoch": 0.4287625418060201, + "grad_norm": 0.47556009888648987, + "learning_rate": 8e-05, + "loss": 1.5181, + "step": 7692 + }, + { + "epoch": 0.42881828316610926, + "grad_norm": 0.5070196986198425, + "learning_rate": 8e-05, + "loss": 1.7123, + "step": 7693 + }, + { + "epoch": 0.42887402452619844, + "grad_norm": 0.5480073094367981, + "learning_rate": 8e-05, + "loss": 1.6972, + "step": 7694 + }, + { + "epoch": 0.4289297658862876, + "grad_norm": 0.5210518836975098, + "learning_rate": 8e-05, + "loss": 1.7618, + "step": 7695 + }, + { + "epoch": 0.4289855072463768, + "grad_norm": 0.5803531408309937, + "learning_rate": 8e-05, + "loss": 1.8326, + "step": 7696 + }, + { + "epoch": 0.429041248606466, + "grad_norm": 0.4972003400325775, + "learning_rate": 8e-05, + "loss": 1.5183, + "step": 7697 + }, + { + "epoch": 0.4290969899665552, + "grad_norm": 0.5382688045501709, + "learning_rate": 8e-05, + "loss": 1.8117, + "step": 7698 + }, + { + "epoch": 0.4291527313266444, + "grad_norm": 0.47422686219215393, + "learning_rate": 8e-05, + "loss": 1.6818, + "step": 7699 + }, + { + "epoch": 0.42920847268673357, + "grad_norm": 0.5724363923072815, + "learning_rate": 8e-05, + "loss": 1.8173, + "step": 7700 + }, + { + "epoch": 0.42926421404682275, + "grad_norm": 0.5626831650733948, + "learning_rate": 8e-05, + "loss": 1.6295, + "step": 7701 + }, + { + "epoch": 0.4293199554069119, + "grad_norm": 0.5423737168312073, + "learning_rate": 
8e-05, + "loss": 1.7521, + "step": 7702 + }, + { + "epoch": 0.4293756967670011, + "grad_norm": 0.5201063752174377, + "learning_rate": 8e-05, + "loss": 1.6914, + "step": 7703 + }, + { + "epoch": 0.4294314381270903, + "grad_norm": 0.5397342443466187, + "learning_rate": 8e-05, + "loss": 1.6416, + "step": 7704 + }, + { + "epoch": 0.42948717948717946, + "grad_norm": 0.54704749584198, + "learning_rate": 8e-05, + "loss": 1.6658, + "step": 7705 + }, + { + "epoch": 0.4295429208472687, + "grad_norm": 0.5956423878669739, + "learning_rate": 8e-05, + "loss": 1.7879, + "step": 7706 + }, + { + "epoch": 0.4295986622073579, + "grad_norm": 0.5553403496742249, + "learning_rate": 8e-05, + "loss": 1.7613, + "step": 7707 + }, + { + "epoch": 0.42965440356744705, + "grad_norm": 0.5432010293006897, + "learning_rate": 8e-05, + "loss": 1.6631, + "step": 7708 + }, + { + "epoch": 0.42971014492753623, + "grad_norm": 0.5349817872047424, + "learning_rate": 8e-05, + "loss": 1.7419, + "step": 7709 + }, + { + "epoch": 0.4297658862876254, + "grad_norm": 0.5355441570281982, + "learning_rate": 8e-05, + "loss": 1.676, + "step": 7710 + }, + { + "epoch": 0.4298216276477146, + "grad_norm": 0.5206706523895264, + "learning_rate": 8e-05, + "loss": 1.499, + "step": 7711 + }, + { + "epoch": 0.42987736900780377, + "grad_norm": 0.5900402069091797, + "learning_rate": 8e-05, + "loss": 1.9597, + "step": 7712 + }, + { + "epoch": 0.429933110367893, + "grad_norm": 0.5922344326972961, + "learning_rate": 8e-05, + "loss": 1.969, + "step": 7713 + }, + { + "epoch": 0.4299888517279822, + "grad_norm": 0.5381130576133728, + "learning_rate": 8e-05, + "loss": 1.5954, + "step": 7714 + }, + { + "epoch": 0.43004459308807136, + "grad_norm": 0.5215221047401428, + "learning_rate": 8e-05, + "loss": 1.513, + "step": 7715 + }, + { + "epoch": 0.43010033444816054, + "grad_norm": 0.6906350255012512, + "learning_rate": 8e-05, + "loss": 1.7071, + "step": 7716 + }, + { + "epoch": 0.4301560758082497, + "grad_norm": 0.49984869360923767, + 
"learning_rate": 8e-05, + "loss": 1.5308, + "step": 7717 + }, + { + "epoch": 0.4302118171683389, + "grad_norm": 0.5267676711082458, + "learning_rate": 8e-05, + "loss": 1.6652, + "step": 7718 + }, + { + "epoch": 0.4302675585284281, + "grad_norm": 0.533749520778656, + "learning_rate": 8e-05, + "loss": 1.6965, + "step": 7719 + }, + { + "epoch": 0.43032329988851725, + "grad_norm": 0.5423862338066101, + "learning_rate": 8e-05, + "loss": 1.5306, + "step": 7720 + }, + { + "epoch": 0.4303790412486065, + "grad_norm": 0.525157630443573, + "learning_rate": 8e-05, + "loss": 1.5907, + "step": 7721 + }, + { + "epoch": 0.43043478260869567, + "grad_norm": 0.5443845391273499, + "learning_rate": 8e-05, + "loss": 1.6161, + "step": 7722 + }, + { + "epoch": 0.43049052396878484, + "grad_norm": 0.5968034267425537, + "learning_rate": 8e-05, + "loss": 1.7725, + "step": 7723 + }, + { + "epoch": 0.430546265328874, + "grad_norm": 0.5567757487297058, + "learning_rate": 8e-05, + "loss": 1.7576, + "step": 7724 + }, + { + "epoch": 0.4306020066889632, + "grad_norm": 0.5223360657691956, + "learning_rate": 8e-05, + "loss": 1.7543, + "step": 7725 + }, + { + "epoch": 0.4306577480490524, + "grad_norm": 0.5313509702682495, + "learning_rate": 8e-05, + "loss": 1.6799, + "step": 7726 + }, + { + "epoch": 0.43071348940914156, + "grad_norm": 0.5988759994506836, + "learning_rate": 8e-05, + "loss": 1.9165, + "step": 7727 + }, + { + "epoch": 0.4307692307692308, + "grad_norm": 0.5237802267074585, + "learning_rate": 8e-05, + "loss": 1.555, + "step": 7728 + }, + { + "epoch": 0.43082497212931997, + "grad_norm": 0.5154265761375427, + "learning_rate": 8e-05, + "loss": 1.7844, + "step": 7729 + }, + { + "epoch": 0.43088071348940915, + "grad_norm": 0.510456919670105, + "learning_rate": 8e-05, + "loss": 1.7146, + "step": 7730 + }, + { + "epoch": 0.43093645484949833, + "grad_norm": 0.47713032364845276, + "learning_rate": 8e-05, + "loss": 1.4793, + "step": 7731 + }, + { + "epoch": 0.4309921962095875, + "grad_norm": 
0.5002168416976929, + "learning_rate": 8e-05, + "loss": 1.7028, + "step": 7732 + }, + { + "epoch": 0.4310479375696767, + "grad_norm": 0.5037677884101868, + "learning_rate": 8e-05, + "loss": 1.7953, + "step": 7733 + }, + { + "epoch": 0.43110367892976587, + "grad_norm": 0.5853288173675537, + "learning_rate": 8e-05, + "loss": 1.8959, + "step": 7734 + }, + { + "epoch": 0.4311594202898551, + "grad_norm": 0.5197460651397705, + "learning_rate": 8e-05, + "loss": 1.6534, + "step": 7735 + }, + { + "epoch": 0.4312151616499443, + "grad_norm": 0.5077546834945679, + "learning_rate": 8e-05, + "loss": 1.6536, + "step": 7736 + }, + { + "epoch": 0.43127090301003346, + "grad_norm": 0.5528148412704468, + "learning_rate": 8e-05, + "loss": 1.8002, + "step": 7737 + }, + { + "epoch": 0.43132664437012264, + "grad_norm": 0.5611364245414734, + "learning_rate": 8e-05, + "loss": 1.6509, + "step": 7738 + }, + { + "epoch": 0.4313823857302118, + "grad_norm": 0.5474463701248169, + "learning_rate": 8e-05, + "loss": 1.8108, + "step": 7739 + }, + { + "epoch": 0.431438127090301, + "grad_norm": 0.568392276763916, + "learning_rate": 8e-05, + "loss": 1.7376, + "step": 7740 + }, + { + "epoch": 0.43149386845039017, + "grad_norm": 0.5201650261878967, + "learning_rate": 8e-05, + "loss": 1.5288, + "step": 7741 + }, + { + "epoch": 0.43154960981047935, + "grad_norm": 0.5635396242141724, + "learning_rate": 8e-05, + "loss": 1.5296, + "step": 7742 + }, + { + "epoch": 0.4316053511705686, + "grad_norm": 0.5454962849617004, + "learning_rate": 8e-05, + "loss": 1.7872, + "step": 7743 + }, + { + "epoch": 0.43166109253065776, + "grad_norm": 0.5474128127098083, + "learning_rate": 8e-05, + "loss": 1.7107, + "step": 7744 + }, + { + "epoch": 0.43171683389074694, + "grad_norm": 0.5751205682754517, + "learning_rate": 8e-05, + "loss": 2.0157, + "step": 7745 + }, + { + "epoch": 0.4317725752508361, + "grad_norm": 0.5645526647567749, + "learning_rate": 8e-05, + "loss": 1.5263, + "step": 7746 + }, + { + "epoch": 0.4318283166109253, 
+ "grad_norm": 0.5280218720436096, + "learning_rate": 8e-05, + "loss": 1.6702, + "step": 7747 + }, + { + "epoch": 0.4318840579710145, + "grad_norm": 0.5345996618270874, + "learning_rate": 8e-05, + "loss": 1.7274, + "step": 7748 + }, + { + "epoch": 0.43193979933110366, + "grad_norm": 0.500987708568573, + "learning_rate": 8e-05, + "loss": 1.5633, + "step": 7749 + }, + { + "epoch": 0.4319955406911929, + "grad_norm": 0.5084331631660461, + "learning_rate": 8e-05, + "loss": 1.493, + "step": 7750 + }, + { + "epoch": 0.43205128205128207, + "grad_norm": 0.5245458483695984, + "learning_rate": 8e-05, + "loss": 1.7815, + "step": 7751 + }, + { + "epoch": 0.43210702341137125, + "grad_norm": 0.5459529757499695, + "learning_rate": 8e-05, + "loss": 1.6608, + "step": 7752 + }, + { + "epoch": 0.43216276477146043, + "grad_norm": 0.505776584148407, + "learning_rate": 8e-05, + "loss": 1.5744, + "step": 7753 + }, + { + "epoch": 0.4322185061315496, + "grad_norm": 0.5133358240127563, + "learning_rate": 8e-05, + "loss": 1.7084, + "step": 7754 + }, + { + "epoch": 0.4322742474916388, + "grad_norm": 0.4842239320278168, + "learning_rate": 8e-05, + "loss": 1.5998, + "step": 7755 + }, + { + "epoch": 0.43232998885172796, + "grad_norm": 0.49821797013282776, + "learning_rate": 8e-05, + "loss": 1.5764, + "step": 7756 + }, + { + "epoch": 0.43238573021181714, + "grad_norm": 0.5373196005821228, + "learning_rate": 8e-05, + "loss": 1.536, + "step": 7757 + }, + { + "epoch": 0.4324414715719064, + "grad_norm": 0.5395300388336182, + "learning_rate": 8e-05, + "loss": 1.6068, + "step": 7758 + }, + { + "epoch": 0.43249721293199556, + "grad_norm": 0.5366533994674683, + "learning_rate": 8e-05, + "loss": 1.6368, + "step": 7759 + }, + { + "epoch": 0.43255295429208473, + "grad_norm": 0.5885009169578552, + "learning_rate": 8e-05, + "loss": 1.7807, + "step": 7760 + }, + { + "epoch": 0.4326086956521739, + "grad_norm": 0.5301985740661621, + "learning_rate": 8e-05, + "loss": 1.7191, + "step": 7761 + }, + { + "epoch": 
0.4326644370122631, + "grad_norm": 0.5283817648887634, + "learning_rate": 8e-05, + "loss": 1.6551, + "step": 7762 + }, + { + "epoch": 0.43272017837235227, + "grad_norm": 0.5542014837265015, + "learning_rate": 8e-05, + "loss": 1.7364, + "step": 7763 + }, + { + "epoch": 0.43277591973244145, + "grad_norm": 0.5553522706031799, + "learning_rate": 8e-05, + "loss": 1.5643, + "step": 7764 + }, + { + "epoch": 0.4328316610925307, + "grad_norm": 0.5287862420082092, + "learning_rate": 8e-05, + "loss": 1.763, + "step": 7765 + }, + { + "epoch": 0.43288740245261986, + "grad_norm": 0.5084366798400879, + "learning_rate": 8e-05, + "loss": 1.6354, + "step": 7766 + }, + { + "epoch": 0.43294314381270904, + "grad_norm": 0.5567695498466492, + "learning_rate": 8e-05, + "loss": 1.7928, + "step": 7767 + }, + { + "epoch": 0.4329988851727982, + "grad_norm": 0.5375629663467407, + "learning_rate": 8e-05, + "loss": 1.6925, + "step": 7768 + }, + { + "epoch": 0.4330546265328874, + "grad_norm": 0.5264116525650024, + "learning_rate": 8e-05, + "loss": 1.775, + "step": 7769 + }, + { + "epoch": 0.4331103678929766, + "grad_norm": 0.5006895661354065, + "learning_rate": 8e-05, + "loss": 1.626, + "step": 7770 + }, + { + "epoch": 0.43316610925306576, + "grad_norm": 0.5429582595825195, + "learning_rate": 8e-05, + "loss": 1.9145, + "step": 7771 + }, + { + "epoch": 0.43322185061315494, + "grad_norm": 0.46108904480934143, + "learning_rate": 8e-05, + "loss": 1.2811, + "step": 7772 + }, + { + "epoch": 0.43327759197324417, + "grad_norm": 0.5806688070297241, + "learning_rate": 8e-05, + "loss": 1.8541, + "step": 7773 + }, + { + "epoch": 0.43333333333333335, + "grad_norm": 0.5763636231422424, + "learning_rate": 8e-05, + "loss": 1.8307, + "step": 7774 + }, + { + "epoch": 0.4333890746934225, + "grad_norm": 0.5186631083488464, + "learning_rate": 8e-05, + "loss": 1.6144, + "step": 7775 + }, + { + "epoch": 0.4334448160535117, + "grad_norm": 0.5161386132240295, + "learning_rate": 8e-05, + "loss": 1.7043, + "step": 7776 + 
}, + { + "epoch": 0.4335005574136009, + "grad_norm": 0.5380999445915222, + "learning_rate": 8e-05, + "loss": 1.6233, + "step": 7777 + }, + { + "epoch": 0.43355629877369006, + "grad_norm": 0.5027433037757874, + "learning_rate": 8e-05, + "loss": 1.5026, + "step": 7778 + }, + { + "epoch": 0.43361204013377924, + "grad_norm": 0.4948747158050537, + "learning_rate": 8e-05, + "loss": 1.5456, + "step": 7779 + }, + { + "epoch": 0.4336677814938685, + "grad_norm": 0.5553415417671204, + "learning_rate": 8e-05, + "loss": 1.6105, + "step": 7780 + }, + { + "epoch": 0.43372352285395765, + "grad_norm": 0.5105648040771484, + "learning_rate": 8e-05, + "loss": 1.5264, + "step": 7781 + }, + { + "epoch": 0.43377926421404683, + "grad_norm": 0.5923818349838257, + "learning_rate": 8e-05, + "loss": 1.672, + "step": 7782 + }, + { + "epoch": 0.433835005574136, + "grad_norm": 0.5075348019599915, + "learning_rate": 8e-05, + "loss": 1.4665, + "step": 7783 + }, + { + "epoch": 0.4338907469342252, + "grad_norm": 0.5268019437789917, + "learning_rate": 8e-05, + "loss": 1.7234, + "step": 7784 + }, + { + "epoch": 0.43394648829431437, + "grad_norm": 0.48290082812309265, + "learning_rate": 8e-05, + "loss": 1.3133, + "step": 7785 + }, + { + "epoch": 0.43400222965440355, + "grad_norm": 0.5417959690093994, + "learning_rate": 8e-05, + "loss": 1.6926, + "step": 7786 + }, + { + "epoch": 0.4340579710144927, + "grad_norm": 0.7672746181488037, + "learning_rate": 8e-05, + "loss": 1.8159, + "step": 7787 + }, + { + "epoch": 0.43411371237458196, + "grad_norm": 0.5545976758003235, + "learning_rate": 8e-05, + "loss": 1.9052, + "step": 7788 + }, + { + "epoch": 0.43416945373467114, + "grad_norm": 0.5124754905700684, + "learning_rate": 8e-05, + "loss": 1.6812, + "step": 7789 + }, + { + "epoch": 0.4342251950947603, + "grad_norm": 0.5179421901702881, + "learning_rate": 8e-05, + "loss": 1.7352, + "step": 7790 + }, + { + "epoch": 0.4342809364548495, + "grad_norm": 0.5736626386642456, + "learning_rate": 8e-05, + "loss": 1.5641, 
+ "step": 7791 + }, + { + "epoch": 0.4343366778149387, + "grad_norm": 0.5768195986747742, + "learning_rate": 8e-05, + "loss": 1.7259, + "step": 7792 + }, + { + "epoch": 0.43439241917502786, + "grad_norm": 0.5163231492042542, + "learning_rate": 8e-05, + "loss": 1.7629, + "step": 7793 + }, + { + "epoch": 0.43444816053511703, + "grad_norm": 0.5233259797096252, + "learning_rate": 8e-05, + "loss": 1.5611, + "step": 7794 + }, + { + "epoch": 0.43450390189520627, + "grad_norm": 0.4899364411830902, + "learning_rate": 8e-05, + "loss": 1.2202, + "step": 7795 + }, + { + "epoch": 0.43455964325529545, + "grad_norm": 0.5041932463645935, + "learning_rate": 8e-05, + "loss": 1.6126, + "step": 7796 + }, + { + "epoch": 0.4346153846153846, + "grad_norm": 0.5223237872123718, + "learning_rate": 8e-05, + "loss": 1.7619, + "step": 7797 + }, + { + "epoch": 0.4346711259754738, + "grad_norm": 0.5277586579322815, + "learning_rate": 8e-05, + "loss": 1.6513, + "step": 7798 + }, + { + "epoch": 0.434726867335563, + "grad_norm": 0.5886757373809814, + "learning_rate": 8e-05, + "loss": 1.9693, + "step": 7799 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.503447413444519, + "learning_rate": 8e-05, + "loss": 1.6416, + "step": 7800 + }, + { + "epoch": 0.43483835005574134, + "grad_norm": 0.5232421159744263, + "learning_rate": 8e-05, + "loss": 1.5618, + "step": 7801 + }, + { + "epoch": 0.4348940914158305, + "grad_norm": 0.5342657566070557, + "learning_rate": 8e-05, + "loss": 1.7106, + "step": 7802 + }, + { + "epoch": 0.43494983277591975, + "grad_norm": 0.5697629451751709, + "learning_rate": 8e-05, + "loss": 1.6205, + "step": 7803 + }, + { + "epoch": 0.43500557413600893, + "grad_norm": 0.54024338722229, + "learning_rate": 8e-05, + "loss": 1.5715, + "step": 7804 + }, + { + "epoch": 0.4350613154960981, + "grad_norm": 0.4912549555301666, + "learning_rate": 8e-05, + "loss": 1.4373, + "step": 7805 + }, + { + "epoch": 0.4351170568561873, + "grad_norm": 0.5158436894416809, + "learning_rate": 8e-05, + 
"loss": 1.7721, + "step": 7806 + }, + { + "epoch": 0.43517279821627647, + "grad_norm": 0.5614274740219116, + "learning_rate": 8e-05, + "loss": 1.7082, + "step": 7807 + }, + { + "epoch": 0.43522853957636565, + "grad_norm": 0.5338239669799805, + "learning_rate": 8e-05, + "loss": 1.679, + "step": 7808 + }, + { + "epoch": 0.4352842809364548, + "grad_norm": 0.5141422748565674, + "learning_rate": 8e-05, + "loss": 1.481, + "step": 7809 + }, + { + "epoch": 0.43534002229654406, + "grad_norm": 0.5167403221130371, + "learning_rate": 8e-05, + "loss": 1.5169, + "step": 7810 + }, + { + "epoch": 0.43539576365663324, + "grad_norm": 0.5393883585929871, + "learning_rate": 8e-05, + "loss": 1.7736, + "step": 7811 + }, + { + "epoch": 0.4354515050167224, + "grad_norm": 0.5399210453033447, + "learning_rate": 8e-05, + "loss": 1.8407, + "step": 7812 + }, + { + "epoch": 0.4355072463768116, + "grad_norm": 0.5002278685569763, + "learning_rate": 8e-05, + "loss": 1.6819, + "step": 7813 + }, + { + "epoch": 0.4355629877369008, + "grad_norm": 0.5279465317726135, + "learning_rate": 8e-05, + "loss": 1.6198, + "step": 7814 + }, + { + "epoch": 0.43561872909698995, + "grad_norm": 0.5590936541557312, + "learning_rate": 8e-05, + "loss": 1.7535, + "step": 7815 + }, + { + "epoch": 0.43567447045707913, + "grad_norm": 0.5703123211860657, + "learning_rate": 8e-05, + "loss": 1.8356, + "step": 7816 + }, + { + "epoch": 0.4357302118171683, + "grad_norm": 0.5183179378509521, + "learning_rate": 8e-05, + "loss": 1.6322, + "step": 7817 + }, + { + "epoch": 0.43578595317725755, + "grad_norm": 0.5476742386817932, + "learning_rate": 8e-05, + "loss": 1.6851, + "step": 7818 + }, + { + "epoch": 0.4358416945373467, + "grad_norm": 0.5798848867416382, + "learning_rate": 8e-05, + "loss": 1.7472, + "step": 7819 + }, + { + "epoch": 0.4358974358974359, + "grad_norm": 0.4962586760520935, + "learning_rate": 8e-05, + "loss": 1.6727, + "step": 7820 + }, + { + "epoch": 0.4359531772575251, + "grad_norm": 0.5649489760398865, + 
"learning_rate": 8e-05, + "loss": 1.8386, + "step": 7821 + }, + { + "epoch": 0.43600891861761426, + "grad_norm": 0.5474053621292114, + "learning_rate": 8e-05, + "loss": 1.9251, + "step": 7822 + }, + { + "epoch": 0.43606465997770344, + "grad_norm": 0.5361340641975403, + "learning_rate": 8e-05, + "loss": 1.8305, + "step": 7823 + }, + { + "epoch": 0.4361204013377926, + "grad_norm": 0.5183270573616028, + "learning_rate": 8e-05, + "loss": 1.3515, + "step": 7824 + }, + { + "epoch": 0.43617614269788185, + "grad_norm": 0.5586016774177551, + "learning_rate": 8e-05, + "loss": 1.6641, + "step": 7825 + }, + { + "epoch": 0.43623188405797103, + "grad_norm": 0.5187327861785889, + "learning_rate": 8e-05, + "loss": 1.7939, + "step": 7826 + }, + { + "epoch": 0.4362876254180602, + "grad_norm": 0.49453333020210266, + "learning_rate": 8e-05, + "loss": 1.6929, + "step": 7827 + }, + { + "epoch": 0.4363433667781494, + "grad_norm": 0.485514372587204, + "learning_rate": 8e-05, + "loss": 1.352, + "step": 7828 + }, + { + "epoch": 0.43639910813823857, + "grad_norm": 0.6950998306274414, + "learning_rate": 8e-05, + "loss": 2.265, + "step": 7829 + }, + { + "epoch": 0.43645484949832775, + "grad_norm": 0.5308253765106201, + "learning_rate": 8e-05, + "loss": 1.8375, + "step": 7830 + }, + { + "epoch": 0.4365105908584169, + "grad_norm": 0.4920623302459717, + "learning_rate": 8e-05, + "loss": 1.5364, + "step": 7831 + }, + { + "epoch": 0.43656633221850616, + "grad_norm": 0.48871949315071106, + "learning_rate": 8e-05, + "loss": 1.6355, + "step": 7832 + }, + { + "epoch": 0.43662207357859534, + "grad_norm": 0.5611724257469177, + "learning_rate": 8e-05, + "loss": 1.7873, + "step": 7833 + }, + { + "epoch": 0.4366778149386845, + "grad_norm": 0.5402656197547913, + "learning_rate": 8e-05, + "loss": 1.5936, + "step": 7834 + }, + { + "epoch": 0.4367335562987737, + "grad_norm": 0.565010666847229, + "learning_rate": 8e-05, + "loss": 1.8024, + "step": 7835 + }, + { + "epoch": 0.4367892976588629, + "grad_norm": 
0.5193888545036316, + "learning_rate": 8e-05, + "loss": 1.6301, + "step": 7836 + }, + { + "epoch": 0.43684503901895205, + "grad_norm": 0.5342622995376587, + "learning_rate": 8e-05, + "loss": 1.5462, + "step": 7837 + }, + { + "epoch": 0.43690078037904123, + "grad_norm": 0.5685501098632812, + "learning_rate": 8e-05, + "loss": 1.8972, + "step": 7838 + }, + { + "epoch": 0.4369565217391304, + "grad_norm": 0.5733136534690857, + "learning_rate": 8e-05, + "loss": 1.7854, + "step": 7839 + }, + { + "epoch": 0.43701226309921964, + "grad_norm": 0.5411170125007629, + "learning_rate": 8e-05, + "loss": 1.8297, + "step": 7840 + }, + { + "epoch": 0.4370680044593088, + "grad_norm": 0.5439412593841553, + "learning_rate": 8e-05, + "loss": 1.8925, + "step": 7841 + }, + { + "epoch": 0.437123745819398, + "grad_norm": 0.5242007374763489, + "learning_rate": 8e-05, + "loss": 1.6811, + "step": 7842 + }, + { + "epoch": 0.4371794871794872, + "grad_norm": 0.5315377116203308, + "learning_rate": 8e-05, + "loss": 1.5802, + "step": 7843 + }, + { + "epoch": 0.43723522853957636, + "grad_norm": 0.5164995193481445, + "learning_rate": 8e-05, + "loss": 1.6785, + "step": 7844 + }, + { + "epoch": 0.43729096989966554, + "grad_norm": 0.5350561141967773, + "learning_rate": 8e-05, + "loss": 1.7579, + "step": 7845 + }, + { + "epoch": 0.4373467112597547, + "grad_norm": 0.5074156522750854, + "learning_rate": 8e-05, + "loss": 1.4924, + "step": 7846 + }, + { + "epoch": 0.43740245261984395, + "grad_norm": 0.5803791880607605, + "learning_rate": 8e-05, + "loss": 1.9426, + "step": 7847 + }, + { + "epoch": 0.43745819397993313, + "grad_norm": 0.4957139492034912, + "learning_rate": 8e-05, + "loss": 1.6834, + "step": 7848 + }, + { + "epoch": 0.4375139353400223, + "grad_norm": 0.5319690108299255, + "learning_rate": 8e-05, + "loss": 1.6855, + "step": 7849 + }, + { + "epoch": 0.4375696767001115, + "grad_norm": 0.5462393760681152, + "learning_rate": 8e-05, + "loss": 1.5779, + "step": 7850 + }, + { + "epoch": 
0.43762541806020067, + "grad_norm": 0.5485098958015442, + "learning_rate": 8e-05, + "loss": 1.6278, + "step": 7851 + }, + { + "epoch": 0.43768115942028984, + "grad_norm": 0.5800246000289917, + "learning_rate": 8e-05, + "loss": 1.8766, + "step": 7852 + }, + { + "epoch": 0.437736900780379, + "grad_norm": 0.5502092838287354, + "learning_rate": 8e-05, + "loss": 1.807, + "step": 7853 + }, + { + "epoch": 0.4377926421404682, + "grad_norm": 0.5767754912376404, + "learning_rate": 8e-05, + "loss": 1.8275, + "step": 7854 + }, + { + "epoch": 0.43784838350055744, + "grad_norm": 0.541012704372406, + "learning_rate": 8e-05, + "loss": 1.6968, + "step": 7855 + }, + { + "epoch": 0.4379041248606466, + "grad_norm": 0.4932102859020233, + "learning_rate": 8e-05, + "loss": 1.5194, + "step": 7856 + }, + { + "epoch": 0.4379598662207358, + "grad_norm": 0.5406380891799927, + "learning_rate": 8e-05, + "loss": 1.6381, + "step": 7857 + }, + { + "epoch": 0.438015607580825, + "grad_norm": 0.5157513618469238, + "learning_rate": 8e-05, + "loss": 1.7066, + "step": 7858 + }, + { + "epoch": 0.43807134894091415, + "grad_norm": 0.5263302326202393, + "learning_rate": 8e-05, + "loss": 1.5629, + "step": 7859 + }, + { + "epoch": 0.43812709030100333, + "grad_norm": 0.49297571182250977, + "learning_rate": 8e-05, + "loss": 1.4853, + "step": 7860 + }, + { + "epoch": 0.4381828316610925, + "grad_norm": 0.5538367629051208, + "learning_rate": 8e-05, + "loss": 1.76, + "step": 7861 + }, + { + "epoch": 0.43823857302118174, + "grad_norm": 0.5233355760574341, + "learning_rate": 8e-05, + "loss": 1.5433, + "step": 7862 + }, + { + "epoch": 0.4382943143812709, + "grad_norm": 0.5237699747085571, + "learning_rate": 8e-05, + "loss": 1.4886, + "step": 7863 + }, + { + "epoch": 0.4383500557413601, + "grad_norm": 0.5608038306236267, + "learning_rate": 8e-05, + "loss": 1.8291, + "step": 7864 + }, + { + "epoch": 0.4384057971014493, + "grad_norm": 0.5415871739387512, + "learning_rate": 8e-05, + "loss": 1.6691, + "step": 7865 + }, + { 
+ "epoch": 0.43846153846153846, + "grad_norm": 0.5716668963432312, + "learning_rate": 8e-05, + "loss": 1.7024, + "step": 7866 + }, + { + "epoch": 0.43851727982162764, + "grad_norm": 0.5166226625442505, + "learning_rate": 8e-05, + "loss": 1.6799, + "step": 7867 + }, + { + "epoch": 0.4385730211817168, + "grad_norm": 0.5312373638153076, + "learning_rate": 8e-05, + "loss": 1.6249, + "step": 7868 + }, + { + "epoch": 0.438628762541806, + "grad_norm": 0.5225555896759033, + "learning_rate": 8e-05, + "loss": 1.6405, + "step": 7869 + }, + { + "epoch": 0.43868450390189523, + "grad_norm": 0.5243179202079773, + "learning_rate": 8e-05, + "loss": 1.5884, + "step": 7870 + }, + { + "epoch": 0.4387402452619844, + "grad_norm": 0.5011211037635803, + "learning_rate": 8e-05, + "loss": 1.6761, + "step": 7871 + }, + { + "epoch": 0.4387959866220736, + "grad_norm": 0.5171734690666199, + "learning_rate": 8e-05, + "loss": 1.1844, + "step": 7872 + }, + { + "epoch": 0.43885172798216276, + "grad_norm": 0.5413135290145874, + "learning_rate": 8e-05, + "loss": 1.6946, + "step": 7873 + }, + { + "epoch": 0.43890746934225194, + "grad_norm": 0.5040024518966675, + "learning_rate": 8e-05, + "loss": 1.7646, + "step": 7874 + }, + { + "epoch": 0.4389632107023411, + "grad_norm": 0.5325048565864563, + "learning_rate": 8e-05, + "loss": 1.5024, + "step": 7875 + }, + { + "epoch": 0.4390189520624303, + "grad_norm": 0.4997828006744385, + "learning_rate": 8e-05, + "loss": 1.5721, + "step": 7876 + }, + { + "epoch": 0.43907469342251954, + "grad_norm": 0.5768607258796692, + "learning_rate": 8e-05, + "loss": 1.8523, + "step": 7877 + }, + { + "epoch": 0.4391304347826087, + "grad_norm": 0.5463048815727234, + "learning_rate": 8e-05, + "loss": 1.8202, + "step": 7878 + }, + { + "epoch": 0.4391861761426979, + "grad_norm": 0.5254857540130615, + "learning_rate": 8e-05, + "loss": 1.6109, + "step": 7879 + }, + { + "epoch": 0.43924191750278707, + "grad_norm": 0.5765327215194702, + "learning_rate": 8e-05, + "loss": 1.7585, + 
"step": 7880 + }, + { + "epoch": 0.43929765886287625, + "grad_norm": 0.47521743178367615, + "learning_rate": 8e-05, + "loss": 1.5456, + "step": 7881 + }, + { + "epoch": 0.43935340022296543, + "grad_norm": 0.5405141115188599, + "learning_rate": 8e-05, + "loss": 1.811, + "step": 7882 + }, + { + "epoch": 0.4394091415830546, + "grad_norm": 0.5534927248954773, + "learning_rate": 8e-05, + "loss": 1.7256, + "step": 7883 + }, + { + "epoch": 0.4394648829431438, + "grad_norm": 0.5321674942970276, + "learning_rate": 8e-05, + "loss": 1.6274, + "step": 7884 + }, + { + "epoch": 0.439520624303233, + "grad_norm": 0.5295695066452026, + "learning_rate": 8e-05, + "loss": 1.5753, + "step": 7885 + }, + { + "epoch": 0.4395763656633222, + "grad_norm": 0.574080228805542, + "learning_rate": 8e-05, + "loss": 1.8412, + "step": 7886 + }, + { + "epoch": 0.4396321070234114, + "grad_norm": 0.5514618754386902, + "learning_rate": 8e-05, + "loss": 1.7035, + "step": 7887 + }, + { + "epoch": 0.43968784838350056, + "grad_norm": 0.516113817691803, + "learning_rate": 8e-05, + "loss": 1.6944, + "step": 7888 + }, + { + "epoch": 0.43974358974358974, + "grad_norm": 0.49809619784355164, + "learning_rate": 8e-05, + "loss": 1.5288, + "step": 7889 + }, + { + "epoch": 0.4397993311036789, + "grad_norm": 0.5675715804100037, + "learning_rate": 8e-05, + "loss": 1.4908, + "step": 7890 + }, + { + "epoch": 0.4398550724637681, + "grad_norm": 0.585404098033905, + "learning_rate": 8e-05, + "loss": 1.7151, + "step": 7891 + }, + { + "epoch": 0.4399108138238573, + "grad_norm": 0.5580907464027405, + "learning_rate": 8e-05, + "loss": 1.8979, + "step": 7892 + }, + { + "epoch": 0.4399665551839465, + "grad_norm": 0.5543110966682434, + "learning_rate": 8e-05, + "loss": 1.6024, + "step": 7893 + }, + { + "epoch": 0.4400222965440357, + "grad_norm": 0.4939456582069397, + "learning_rate": 8e-05, + "loss": 1.4742, + "step": 7894 + }, + { + "epoch": 0.44007803790412486, + "grad_norm": 0.5418000817298889, + "learning_rate": 8e-05, + 
"loss": 1.6381, + "step": 7895 + }, + { + "epoch": 0.44013377926421404, + "grad_norm": 0.5609815716743469, + "learning_rate": 8e-05, + "loss": 1.7848, + "step": 7896 + }, + { + "epoch": 0.4401895206243032, + "grad_norm": 0.5530222654342651, + "learning_rate": 8e-05, + "loss": 1.7114, + "step": 7897 + }, + { + "epoch": 0.4402452619843924, + "grad_norm": 0.5275927782058716, + "learning_rate": 8e-05, + "loss": 1.5007, + "step": 7898 + }, + { + "epoch": 0.4403010033444816, + "grad_norm": 0.5392025709152222, + "learning_rate": 8e-05, + "loss": 1.5712, + "step": 7899 + }, + { + "epoch": 0.4403567447045708, + "grad_norm": 0.5584535598754883, + "learning_rate": 8e-05, + "loss": 1.5063, + "step": 7900 + }, + { + "epoch": 0.44041248606466, + "grad_norm": 0.545294463634491, + "learning_rate": 8e-05, + "loss": 1.7439, + "step": 7901 + }, + { + "epoch": 0.44046822742474917, + "grad_norm": 0.5408179759979248, + "learning_rate": 8e-05, + "loss": 1.4718, + "step": 7902 + }, + { + "epoch": 0.44052396878483835, + "grad_norm": 0.4967776834964752, + "learning_rate": 8e-05, + "loss": 1.3889, + "step": 7903 + }, + { + "epoch": 0.4405797101449275, + "grad_norm": 0.5502292513847351, + "learning_rate": 8e-05, + "loss": 1.607, + "step": 7904 + }, + { + "epoch": 0.4406354515050167, + "grad_norm": 0.5698793530464172, + "learning_rate": 8e-05, + "loss": 1.8125, + "step": 7905 + }, + { + "epoch": 0.4406911928651059, + "grad_norm": 0.5307198762893677, + "learning_rate": 8e-05, + "loss": 1.5135, + "step": 7906 + }, + { + "epoch": 0.4407469342251951, + "grad_norm": 0.5222761034965515, + "learning_rate": 8e-05, + "loss": 1.7111, + "step": 7907 + }, + { + "epoch": 0.4408026755852843, + "grad_norm": 0.5538968443870544, + "learning_rate": 8e-05, + "loss": 1.7281, + "step": 7908 + }, + { + "epoch": 0.4408584169453735, + "grad_norm": 0.653179943561554, + "learning_rate": 8e-05, + "loss": 1.596, + "step": 7909 + }, + { + "epoch": 0.44091415830546266, + "grad_norm": 0.5297260880470276, + "learning_rate": 
8e-05, + "loss": 1.554, + "step": 7910 + }, + { + "epoch": 0.44096989966555183, + "grad_norm": 0.5243282914161682, + "learning_rate": 8e-05, + "loss": 1.6496, + "step": 7911 + }, + { + "epoch": 0.441025641025641, + "grad_norm": 0.5334591269493103, + "learning_rate": 8e-05, + "loss": 1.5722, + "step": 7912 + }, + { + "epoch": 0.4410813823857302, + "grad_norm": 0.606288492679596, + "learning_rate": 8e-05, + "loss": 1.8366, + "step": 7913 + }, + { + "epoch": 0.44113712374581937, + "grad_norm": 0.582847535610199, + "learning_rate": 8e-05, + "loss": 1.7688, + "step": 7914 + }, + { + "epoch": 0.4411928651059086, + "grad_norm": 0.5157939195632935, + "learning_rate": 8e-05, + "loss": 1.6076, + "step": 7915 + }, + { + "epoch": 0.4412486064659978, + "grad_norm": 0.49170541763305664, + "learning_rate": 8e-05, + "loss": 1.5521, + "step": 7916 + }, + { + "epoch": 0.44130434782608696, + "grad_norm": 0.6261030435562134, + "learning_rate": 8e-05, + "loss": 1.823, + "step": 7917 + }, + { + "epoch": 0.44136008918617614, + "grad_norm": 0.5169453620910645, + "learning_rate": 8e-05, + "loss": 1.7102, + "step": 7918 + }, + { + "epoch": 0.4414158305462653, + "grad_norm": 0.5462013483047485, + "learning_rate": 8e-05, + "loss": 1.7069, + "step": 7919 + }, + { + "epoch": 0.4414715719063545, + "grad_norm": 0.5447542667388916, + "learning_rate": 8e-05, + "loss": 1.5881, + "step": 7920 + }, + { + "epoch": 0.4415273132664437, + "grad_norm": 0.5971205830574036, + "learning_rate": 8e-05, + "loss": 1.8295, + "step": 7921 + }, + { + "epoch": 0.4415830546265329, + "grad_norm": 0.5625289678573608, + "learning_rate": 8e-05, + "loss": 1.6345, + "step": 7922 + }, + { + "epoch": 0.4416387959866221, + "grad_norm": 0.5436424016952515, + "learning_rate": 8e-05, + "loss": 1.8182, + "step": 7923 + }, + { + "epoch": 0.44169453734671127, + "grad_norm": 0.5405243635177612, + "learning_rate": 8e-05, + "loss": 1.744, + "step": 7924 + }, + { + "epoch": 0.44175027870680045, + "grad_norm": 0.5950716733932495, + 
"learning_rate": 8e-05, + "loss": 1.859, + "step": 7925 + }, + { + "epoch": 0.4418060200668896, + "grad_norm": 0.531568169593811, + "learning_rate": 8e-05, + "loss": 1.7732, + "step": 7926 + }, + { + "epoch": 0.4418617614269788, + "grad_norm": 0.5521816611289978, + "learning_rate": 8e-05, + "loss": 1.5794, + "step": 7927 + }, + { + "epoch": 0.441917502787068, + "grad_norm": 0.5484359264373779, + "learning_rate": 8e-05, + "loss": 1.6711, + "step": 7928 + }, + { + "epoch": 0.4419732441471572, + "grad_norm": 0.5186398029327393, + "learning_rate": 8e-05, + "loss": 1.7833, + "step": 7929 + }, + { + "epoch": 0.4420289855072464, + "grad_norm": 0.5243989825248718, + "learning_rate": 8e-05, + "loss": 1.5285, + "step": 7930 + }, + { + "epoch": 0.4420847268673356, + "grad_norm": 0.4838677942752838, + "learning_rate": 8e-05, + "loss": 1.589, + "step": 7931 + }, + { + "epoch": 0.44214046822742475, + "grad_norm": 0.510401725769043, + "learning_rate": 8e-05, + "loss": 1.7556, + "step": 7932 + }, + { + "epoch": 0.44219620958751393, + "grad_norm": 0.5910479426383972, + "learning_rate": 8e-05, + "loss": 1.7874, + "step": 7933 + }, + { + "epoch": 0.4422519509476031, + "grad_norm": 0.577433168888092, + "learning_rate": 8e-05, + "loss": 1.784, + "step": 7934 + }, + { + "epoch": 0.4423076923076923, + "grad_norm": 0.49999353289604187, + "learning_rate": 8e-05, + "loss": 1.6503, + "step": 7935 + }, + { + "epoch": 0.44236343366778147, + "grad_norm": 0.48399224877357483, + "learning_rate": 8e-05, + "loss": 1.478, + "step": 7936 + }, + { + "epoch": 0.4424191750278707, + "grad_norm": 0.5222609639167786, + "learning_rate": 8e-05, + "loss": 1.5898, + "step": 7937 + }, + { + "epoch": 0.4424749163879599, + "grad_norm": 0.5327225923538208, + "learning_rate": 8e-05, + "loss": 1.7029, + "step": 7938 + }, + { + "epoch": 0.44253065774804906, + "grad_norm": 0.4968123137950897, + "learning_rate": 8e-05, + "loss": 1.4246, + "step": 7939 + }, + { + "epoch": 0.44258639910813824, + "grad_norm": 
0.5522922873497009, + "learning_rate": 8e-05, + "loss": 1.8884, + "step": 7940 + }, + { + "epoch": 0.4426421404682274, + "grad_norm": 0.5389187932014465, + "learning_rate": 8e-05, + "loss": 1.6626, + "step": 7941 + }, + { + "epoch": 0.4426978818283166, + "grad_norm": 0.5206403732299805, + "learning_rate": 8e-05, + "loss": 1.5838, + "step": 7942 + }, + { + "epoch": 0.4427536231884058, + "grad_norm": 0.4918576776981354, + "learning_rate": 8e-05, + "loss": 1.4641, + "step": 7943 + }, + { + "epoch": 0.442809364548495, + "grad_norm": 0.5084859728813171, + "learning_rate": 8e-05, + "loss": 1.7413, + "step": 7944 + }, + { + "epoch": 0.4428651059085842, + "grad_norm": 0.5442695021629333, + "learning_rate": 8e-05, + "loss": 1.8337, + "step": 7945 + }, + { + "epoch": 0.44292084726867337, + "grad_norm": 0.5141339302062988, + "learning_rate": 8e-05, + "loss": 1.6379, + "step": 7946 + }, + { + "epoch": 0.44297658862876255, + "grad_norm": 0.5276790261268616, + "learning_rate": 8e-05, + "loss": 1.7592, + "step": 7947 + }, + { + "epoch": 0.4430323299888517, + "grad_norm": 0.5303980708122253, + "learning_rate": 8e-05, + "loss": 1.6602, + "step": 7948 + }, + { + "epoch": 0.4430880713489409, + "grad_norm": 0.5147736072540283, + "learning_rate": 8e-05, + "loss": 1.6483, + "step": 7949 + }, + { + "epoch": 0.4431438127090301, + "grad_norm": 0.5501843094825745, + "learning_rate": 8e-05, + "loss": 1.7459, + "step": 7950 + }, + { + "epoch": 0.44319955406911926, + "grad_norm": 0.5527022480964661, + "learning_rate": 8e-05, + "loss": 1.8121, + "step": 7951 + }, + { + "epoch": 0.4432552954292085, + "grad_norm": 0.534709095954895, + "learning_rate": 8e-05, + "loss": 1.6494, + "step": 7952 + }, + { + "epoch": 0.4433110367892977, + "grad_norm": 0.5975018739700317, + "learning_rate": 8e-05, + "loss": 1.69, + "step": 7953 + }, + { + "epoch": 0.44336677814938685, + "grad_norm": 0.5425289869308472, + "learning_rate": 8e-05, + "loss": 1.6613, + "step": 7954 + }, + { + "epoch": 0.44342251950947603, + 
"grad_norm": 0.5638290047645569, + "learning_rate": 8e-05, + "loss": 1.8224, + "step": 7955 + }, + { + "epoch": 0.4434782608695652, + "grad_norm": 0.5615090131759644, + "learning_rate": 8e-05, + "loss": 1.7671, + "step": 7956 + }, + { + "epoch": 0.4435340022296544, + "grad_norm": 0.5214371681213379, + "learning_rate": 8e-05, + "loss": 1.5474, + "step": 7957 + }, + { + "epoch": 0.44358974358974357, + "grad_norm": 0.524421215057373, + "learning_rate": 8e-05, + "loss": 1.7009, + "step": 7958 + }, + { + "epoch": 0.4436454849498328, + "grad_norm": 0.5154446363449097, + "learning_rate": 8e-05, + "loss": 1.6709, + "step": 7959 + }, + { + "epoch": 0.443701226309922, + "grad_norm": 0.5744690299034119, + "learning_rate": 8e-05, + "loss": 2.029, + "step": 7960 + }, + { + "epoch": 0.44375696767001116, + "grad_norm": 0.5760914087295532, + "learning_rate": 8e-05, + "loss": 1.9362, + "step": 7961 + }, + { + "epoch": 0.44381270903010034, + "grad_norm": 0.5083246231079102, + "learning_rate": 8e-05, + "loss": 1.6227, + "step": 7962 + }, + { + "epoch": 0.4438684503901895, + "grad_norm": 0.5764332413673401, + "learning_rate": 8e-05, + "loss": 1.7809, + "step": 7963 + }, + { + "epoch": 0.4439241917502787, + "grad_norm": 0.5300820469856262, + "learning_rate": 8e-05, + "loss": 1.6536, + "step": 7964 + }, + { + "epoch": 0.4439799331103679, + "grad_norm": 0.4995599687099457, + "learning_rate": 8e-05, + "loss": 1.5449, + "step": 7965 + }, + { + "epoch": 0.44403567447045705, + "grad_norm": 0.5778630375862122, + "learning_rate": 8e-05, + "loss": 1.6878, + "step": 7966 + }, + { + "epoch": 0.4440914158305463, + "grad_norm": 0.5369563698768616, + "learning_rate": 8e-05, + "loss": 1.5915, + "step": 7967 + }, + { + "epoch": 0.44414715719063547, + "grad_norm": 0.5145952701568604, + "learning_rate": 8e-05, + "loss": 1.5626, + "step": 7968 + }, + { + "epoch": 0.44420289855072465, + "grad_norm": 0.48712003231048584, + "learning_rate": 8e-05, + "loss": 1.4918, + "step": 7969 + }, + { + "epoch": 
0.4442586399108138, + "grad_norm": 0.5495256781578064, + "learning_rate": 8e-05, + "loss": 1.7003, + "step": 7970 + }, + { + "epoch": 0.444314381270903, + "grad_norm": 0.5205605626106262, + "learning_rate": 8e-05, + "loss": 1.7063, + "step": 7971 + }, + { + "epoch": 0.4443701226309922, + "grad_norm": 0.5068941712379456, + "learning_rate": 8e-05, + "loss": 1.6017, + "step": 7972 + }, + { + "epoch": 0.44442586399108136, + "grad_norm": 0.5757874250411987, + "learning_rate": 8e-05, + "loss": 1.7222, + "step": 7973 + }, + { + "epoch": 0.4444816053511706, + "grad_norm": 0.5113906264305115, + "learning_rate": 8e-05, + "loss": 1.4284, + "step": 7974 + }, + { + "epoch": 0.4445373467112598, + "grad_norm": 0.6202803254127502, + "learning_rate": 8e-05, + "loss": 1.7175, + "step": 7975 + }, + { + "epoch": 0.44459308807134895, + "grad_norm": 0.5060017704963684, + "learning_rate": 8e-05, + "loss": 1.5773, + "step": 7976 + }, + { + "epoch": 0.44464882943143813, + "grad_norm": 0.5798066258430481, + "learning_rate": 8e-05, + "loss": 1.788, + "step": 7977 + }, + { + "epoch": 0.4447045707915273, + "grad_norm": 0.5086560249328613, + "learning_rate": 8e-05, + "loss": 1.5831, + "step": 7978 + }, + { + "epoch": 0.4447603121516165, + "grad_norm": 0.5149025917053223, + "learning_rate": 8e-05, + "loss": 1.5965, + "step": 7979 + }, + { + "epoch": 0.44481605351170567, + "grad_norm": 0.5903439521789551, + "learning_rate": 8e-05, + "loss": 1.8206, + "step": 7980 + }, + { + "epoch": 0.44487179487179485, + "grad_norm": 0.5610959529876709, + "learning_rate": 8e-05, + "loss": 1.6128, + "step": 7981 + }, + { + "epoch": 0.4449275362318841, + "grad_norm": 0.5200814008712769, + "learning_rate": 8e-05, + "loss": 1.5292, + "step": 7982 + }, + { + "epoch": 0.44498327759197326, + "grad_norm": 0.5364866256713867, + "learning_rate": 8e-05, + "loss": 1.7843, + "step": 7983 + }, + { + "epoch": 0.44503901895206244, + "grad_norm": 0.5320987105369568, + "learning_rate": 8e-05, + "loss": 1.662, + "step": 7984 + }, 
+ { + "epoch": 0.4450947603121516, + "grad_norm": 0.5191739201545715, + "learning_rate": 8e-05, + "loss": 1.6594, + "step": 7985 + }, + { + "epoch": 0.4451505016722408, + "grad_norm": 0.5547230243682861, + "learning_rate": 8e-05, + "loss": 1.655, + "step": 7986 + }, + { + "epoch": 0.44520624303233, + "grad_norm": 0.5270265340805054, + "learning_rate": 8e-05, + "loss": 1.5838, + "step": 7987 + }, + { + "epoch": 0.44526198439241915, + "grad_norm": 0.550730288028717, + "learning_rate": 8e-05, + "loss": 1.748, + "step": 7988 + }, + { + "epoch": 0.4453177257525084, + "grad_norm": 0.4731638729572296, + "learning_rate": 8e-05, + "loss": 1.5173, + "step": 7989 + }, + { + "epoch": 0.44537346711259757, + "grad_norm": 0.542497992515564, + "learning_rate": 8e-05, + "loss": 1.6523, + "step": 7990 + }, + { + "epoch": 0.44542920847268674, + "grad_norm": 0.6007255911827087, + "learning_rate": 8e-05, + "loss": 1.8565, + "step": 7991 + }, + { + "epoch": 0.4454849498327759, + "grad_norm": 0.5397570133209229, + "learning_rate": 8e-05, + "loss": 1.5885, + "step": 7992 + }, + { + "epoch": 0.4455406911928651, + "grad_norm": 0.5419074892997742, + "learning_rate": 8e-05, + "loss": 1.7126, + "step": 7993 + }, + { + "epoch": 0.4455964325529543, + "grad_norm": 0.5122414827346802, + "learning_rate": 8e-05, + "loss": 1.7067, + "step": 7994 + }, + { + "epoch": 0.44565217391304346, + "grad_norm": 0.5504677295684814, + "learning_rate": 8e-05, + "loss": 1.7417, + "step": 7995 + }, + { + "epoch": 0.44570791527313264, + "grad_norm": 0.536055862903595, + "learning_rate": 8e-05, + "loss": 1.7303, + "step": 7996 + }, + { + "epoch": 0.44576365663322187, + "grad_norm": 0.5506960153579712, + "learning_rate": 8e-05, + "loss": 1.6504, + "step": 7997 + }, + { + "epoch": 0.44581939799331105, + "grad_norm": 0.580141544342041, + "learning_rate": 8e-05, + "loss": 1.7647, + "step": 7998 + }, + { + "epoch": 0.44587513935340023, + "grad_norm": 0.5886245369911194, + "learning_rate": 8e-05, + "loss": 1.7583, + "step": 
7999 + }, + { + "epoch": 0.4459308807134894, + "grad_norm": 0.5789358019828796, + "learning_rate": 8e-05, + "loss": 1.8195, + "step": 8000 + }, + { + "epoch": 0.4459866220735786, + "grad_norm": 0.5332310795783997, + "learning_rate": 8e-05, + "loss": 1.8922, + "step": 8001 + }, + { + "epoch": 0.44604236343366777, + "grad_norm": 0.5233482718467712, + "learning_rate": 8e-05, + "loss": 1.5484, + "step": 8002 + }, + { + "epoch": 0.44609810479375694, + "grad_norm": 0.5520485639572144, + "learning_rate": 8e-05, + "loss": 1.8006, + "step": 8003 + }, + { + "epoch": 0.4461538461538462, + "grad_norm": 0.5176988840103149, + "learning_rate": 8e-05, + "loss": 1.7473, + "step": 8004 + }, + { + "epoch": 0.44620958751393536, + "grad_norm": 0.5222984552383423, + "learning_rate": 8e-05, + "loss": 1.68, + "step": 8005 + }, + { + "epoch": 0.44626532887402454, + "grad_norm": 0.5123849511146545, + "learning_rate": 8e-05, + "loss": 1.5285, + "step": 8006 + }, + { + "epoch": 0.4463210702341137, + "grad_norm": 0.6144101619720459, + "learning_rate": 8e-05, + "loss": 2.081, + "step": 8007 + }, + { + "epoch": 0.4463768115942029, + "grad_norm": 0.5395132303237915, + "learning_rate": 8e-05, + "loss": 1.7676, + "step": 8008 + }, + { + "epoch": 0.4464325529542921, + "grad_norm": 0.6325112581253052, + "learning_rate": 8e-05, + "loss": 2.0531, + "step": 8009 + }, + { + "epoch": 0.44648829431438125, + "grad_norm": 0.5071414709091187, + "learning_rate": 8e-05, + "loss": 1.551, + "step": 8010 + }, + { + "epoch": 0.44654403567447043, + "grad_norm": 0.5221314430236816, + "learning_rate": 8e-05, + "loss": 1.6285, + "step": 8011 + }, + { + "epoch": 0.44659977703455966, + "grad_norm": 0.5296001434326172, + "learning_rate": 8e-05, + "loss": 1.5697, + "step": 8012 + }, + { + "epoch": 0.44665551839464884, + "grad_norm": 0.5084794163703918, + "learning_rate": 8e-05, + "loss": 1.6643, + "step": 8013 + }, + { + "epoch": 0.446711259754738, + "grad_norm": 0.537729024887085, + "learning_rate": 8e-05, + "loss": 
1.8762, + "step": 8014 + }, + { + "epoch": 0.4467670011148272, + "grad_norm": 0.5437967777252197, + "learning_rate": 8e-05, + "loss": 1.5961, + "step": 8015 + }, + { + "epoch": 0.4468227424749164, + "grad_norm": 0.5017749071121216, + "learning_rate": 8e-05, + "loss": 1.6359, + "step": 8016 + }, + { + "epoch": 0.44687848383500556, + "grad_norm": 0.5402575731277466, + "learning_rate": 8e-05, + "loss": 1.7692, + "step": 8017 + }, + { + "epoch": 0.44693422519509474, + "grad_norm": 0.6567726135253906, + "learning_rate": 8e-05, + "loss": 2.1172, + "step": 8018 + }, + { + "epoch": 0.44698996655518397, + "grad_norm": 0.5226815342903137, + "learning_rate": 8e-05, + "loss": 1.5514, + "step": 8019 + }, + { + "epoch": 0.44704570791527315, + "grad_norm": 0.5185201168060303, + "learning_rate": 8e-05, + "loss": 1.6736, + "step": 8020 + }, + { + "epoch": 0.44710144927536233, + "grad_norm": 0.5502247214317322, + "learning_rate": 8e-05, + "loss": 1.712, + "step": 8021 + }, + { + "epoch": 0.4471571906354515, + "grad_norm": 0.5351677536964417, + "learning_rate": 8e-05, + "loss": 1.8209, + "step": 8022 + }, + { + "epoch": 0.4472129319955407, + "grad_norm": 0.5364797711372375, + "learning_rate": 8e-05, + "loss": 1.7622, + "step": 8023 + }, + { + "epoch": 0.44726867335562986, + "grad_norm": 0.47554463148117065, + "learning_rate": 8e-05, + "loss": 1.3493, + "step": 8024 + }, + { + "epoch": 0.44732441471571904, + "grad_norm": 0.5028263330459595, + "learning_rate": 8e-05, + "loss": 1.6095, + "step": 8025 + }, + { + "epoch": 0.4473801560758083, + "grad_norm": 0.5260576605796814, + "learning_rate": 8e-05, + "loss": 1.7044, + "step": 8026 + }, + { + "epoch": 0.44743589743589746, + "grad_norm": 0.5328836441040039, + "learning_rate": 8e-05, + "loss": 1.7109, + "step": 8027 + }, + { + "epoch": 0.44749163879598663, + "grad_norm": 0.4906134307384491, + "learning_rate": 8e-05, + "loss": 1.4654, + "step": 8028 + }, + { + "epoch": 0.4475473801560758, + "grad_norm": 0.5100238919258118, + 
"learning_rate": 8e-05, + "loss": 1.5336, + "step": 8029 + }, + { + "epoch": 0.447603121516165, + "grad_norm": 0.53555828332901, + "learning_rate": 8e-05, + "loss": 1.6962, + "step": 8030 + }, + { + "epoch": 0.44765886287625417, + "grad_norm": 0.5725340247154236, + "learning_rate": 8e-05, + "loss": 1.9263, + "step": 8031 + }, + { + "epoch": 0.44771460423634335, + "grad_norm": 0.5331685543060303, + "learning_rate": 8e-05, + "loss": 1.7079, + "step": 8032 + }, + { + "epoch": 0.44777034559643253, + "grad_norm": 0.5597028136253357, + "learning_rate": 8e-05, + "loss": 1.5791, + "step": 8033 + }, + { + "epoch": 0.44782608695652176, + "grad_norm": 0.5003613829612732, + "learning_rate": 8e-05, + "loss": 1.6993, + "step": 8034 + }, + { + "epoch": 0.44788182831661094, + "grad_norm": 0.5053821206092834, + "learning_rate": 8e-05, + "loss": 1.5431, + "step": 8035 + }, + { + "epoch": 0.4479375696767001, + "grad_norm": 0.6037608981132507, + "learning_rate": 8e-05, + "loss": 1.6782, + "step": 8036 + }, + { + "epoch": 0.4479933110367893, + "grad_norm": 0.5720326900482178, + "learning_rate": 8e-05, + "loss": 1.6984, + "step": 8037 + }, + { + "epoch": 0.4480490523968785, + "grad_norm": 0.5353522300720215, + "learning_rate": 8e-05, + "loss": 1.615, + "step": 8038 + }, + { + "epoch": 0.44810479375696766, + "grad_norm": 0.5460042953491211, + "learning_rate": 8e-05, + "loss": 1.6436, + "step": 8039 + }, + { + "epoch": 0.44816053511705684, + "grad_norm": 0.5569713711738586, + "learning_rate": 8e-05, + "loss": 1.7649, + "step": 8040 + }, + { + "epoch": 0.44821627647714607, + "grad_norm": 0.5759195685386658, + "learning_rate": 8e-05, + "loss": 1.5646, + "step": 8041 + }, + { + "epoch": 0.44827201783723525, + "grad_norm": 0.5390092134475708, + "learning_rate": 8e-05, + "loss": 1.667, + "step": 8042 + }, + { + "epoch": 0.4483277591973244, + "grad_norm": 0.5163521766662598, + "learning_rate": 8e-05, + "loss": 1.7887, + "step": 8043 + }, + { + "epoch": 0.4483835005574136, + "grad_norm": 
0.5562042593955994, + "learning_rate": 8e-05, + "loss": 1.8138, + "step": 8044 + }, + { + "epoch": 0.4484392419175028, + "grad_norm": 0.5692409873008728, + "learning_rate": 8e-05, + "loss": 1.7239, + "step": 8045 + }, + { + "epoch": 0.44849498327759196, + "grad_norm": 0.49043577909469604, + "learning_rate": 8e-05, + "loss": 1.4557, + "step": 8046 + }, + { + "epoch": 0.44855072463768114, + "grad_norm": 0.5952504873275757, + "learning_rate": 8e-05, + "loss": 1.9548, + "step": 8047 + }, + { + "epoch": 0.4486064659977703, + "grad_norm": 0.5599199533462524, + "learning_rate": 8e-05, + "loss": 1.6081, + "step": 8048 + }, + { + "epoch": 0.44866220735785955, + "grad_norm": 0.5060661435127258, + "learning_rate": 8e-05, + "loss": 1.496, + "step": 8049 + }, + { + "epoch": 0.44871794871794873, + "grad_norm": 0.5552914142608643, + "learning_rate": 8e-05, + "loss": 1.6489, + "step": 8050 + }, + { + "epoch": 0.4487736900780379, + "grad_norm": 0.5685629844665527, + "learning_rate": 8e-05, + "loss": 1.8693, + "step": 8051 + }, + { + "epoch": 0.4488294314381271, + "grad_norm": 0.45723462104797363, + "learning_rate": 8e-05, + "loss": 1.3753, + "step": 8052 + }, + { + "epoch": 0.44888517279821627, + "grad_norm": 0.4917463958263397, + "learning_rate": 8e-05, + "loss": 1.4215, + "step": 8053 + }, + { + "epoch": 0.44894091415830545, + "grad_norm": 0.5196283459663391, + "learning_rate": 8e-05, + "loss": 1.7516, + "step": 8054 + }, + { + "epoch": 0.4489966555183946, + "grad_norm": 0.538723349571228, + "learning_rate": 8e-05, + "loss": 1.7441, + "step": 8055 + }, + { + "epoch": 0.44905239687848386, + "grad_norm": 0.5024257898330688, + "learning_rate": 8e-05, + "loss": 1.4925, + "step": 8056 + }, + { + "epoch": 0.44910813823857304, + "grad_norm": 0.519081711769104, + "learning_rate": 8e-05, + "loss": 1.5659, + "step": 8057 + }, + { + "epoch": 0.4491638795986622, + "grad_norm": 0.5455350875854492, + "learning_rate": 8e-05, + "loss": 1.7434, + "step": 8058 + }, + { + "epoch": 
0.4492196209587514, + "grad_norm": 0.5286114811897278, + "learning_rate": 8e-05, + "loss": 1.6247, + "step": 8059 + }, + { + "epoch": 0.4492753623188406, + "grad_norm": 0.5353128910064697, + "learning_rate": 8e-05, + "loss": 1.6816, + "step": 8060 + }, + { + "epoch": 0.44933110367892976, + "grad_norm": 0.5198448896408081, + "learning_rate": 8e-05, + "loss": 1.6891, + "step": 8061 + }, + { + "epoch": 0.44938684503901893, + "grad_norm": 0.5549296736717224, + "learning_rate": 8e-05, + "loss": 1.8031, + "step": 8062 + }, + { + "epoch": 0.4494425863991081, + "grad_norm": 0.5344325304031372, + "learning_rate": 8e-05, + "loss": 1.6114, + "step": 8063 + }, + { + "epoch": 0.44949832775919735, + "grad_norm": 0.5231785178184509, + "learning_rate": 8e-05, + "loss": 1.5725, + "step": 8064 + }, + { + "epoch": 0.4495540691192865, + "grad_norm": 0.5388935208320618, + "learning_rate": 8e-05, + "loss": 1.5811, + "step": 8065 + }, + { + "epoch": 0.4496098104793757, + "grad_norm": 0.5113973617553711, + "learning_rate": 8e-05, + "loss": 1.4581, + "step": 8066 + }, + { + "epoch": 0.4496655518394649, + "grad_norm": 0.5376384258270264, + "learning_rate": 8e-05, + "loss": 1.7599, + "step": 8067 + }, + { + "epoch": 0.44972129319955406, + "grad_norm": 0.5223584175109863, + "learning_rate": 8e-05, + "loss": 1.7786, + "step": 8068 + }, + { + "epoch": 0.44977703455964324, + "grad_norm": 0.5178060531616211, + "learning_rate": 8e-05, + "loss": 1.7542, + "step": 8069 + }, + { + "epoch": 0.4498327759197324, + "grad_norm": 0.5555859804153442, + "learning_rate": 8e-05, + "loss": 1.6353, + "step": 8070 + }, + { + "epoch": 0.44988851727982165, + "grad_norm": 0.510905921459198, + "learning_rate": 8e-05, + "loss": 1.6363, + "step": 8071 + }, + { + "epoch": 0.44994425863991083, + "grad_norm": 0.5641573667526245, + "learning_rate": 8e-05, + "loss": 1.8944, + "step": 8072 + }, + { + "epoch": 0.45, + "grad_norm": 0.5425226092338562, + "learning_rate": 8e-05, + "loss": 1.6644, + "step": 8073 + }, + { + 
"epoch": 0.4500557413600892, + "grad_norm": 0.5517387986183167, + "learning_rate": 8e-05, + "loss": 1.7881, + "step": 8074 + }, + { + "epoch": 0.45011148272017837, + "grad_norm": 0.5022097229957581, + "learning_rate": 8e-05, + "loss": 1.5671, + "step": 8075 + }, + { + "epoch": 0.45016722408026755, + "grad_norm": 0.551947295665741, + "learning_rate": 8e-05, + "loss": 1.8847, + "step": 8076 + }, + { + "epoch": 0.4502229654403567, + "grad_norm": 0.584666907787323, + "learning_rate": 8e-05, + "loss": 1.7084, + "step": 8077 + }, + { + "epoch": 0.4502787068004459, + "grad_norm": 0.536909818649292, + "learning_rate": 8e-05, + "loss": 1.6183, + "step": 8078 + }, + { + "epoch": 0.45033444816053514, + "grad_norm": 0.5274991393089294, + "learning_rate": 8e-05, + "loss": 1.7355, + "step": 8079 + }, + { + "epoch": 0.4503901895206243, + "grad_norm": 0.49505487084388733, + "learning_rate": 8e-05, + "loss": 1.5412, + "step": 8080 + }, + { + "epoch": 0.4504459308807135, + "grad_norm": 0.5947615504264832, + "learning_rate": 8e-05, + "loss": 1.964, + "step": 8081 + }, + { + "epoch": 0.4505016722408027, + "grad_norm": 0.5649814605712891, + "learning_rate": 8e-05, + "loss": 1.7741, + "step": 8082 + }, + { + "epoch": 0.45055741360089185, + "grad_norm": 0.495584100484848, + "learning_rate": 8e-05, + "loss": 1.7114, + "step": 8083 + }, + { + "epoch": 0.45061315496098103, + "grad_norm": 0.5267391204833984, + "learning_rate": 8e-05, + "loss": 1.8227, + "step": 8084 + }, + { + "epoch": 0.4506688963210702, + "grad_norm": 0.5202919840812683, + "learning_rate": 8e-05, + "loss": 1.598, + "step": 8085 + }, + { + "epoch": 0.45072463768115945, + "grad_norm": 0.5348260998725891, + "learning_rate": 8e-05, + "loss": 1.53, + "step": 8086 + }, + { + "epoch": 0.4507803790412486, + "grad_norm": 0.5375006198883057, + "learning_rate": 8e-05, + "loss": 1.4686, + "step": 8087 + }, + { + "epoch": 0.4508361204013378, + "grad_norm": 0.5393670797348022, + "learning_rate": 8e-05, + "loss": 1.6661, + "step": 8088 + 
}, + { + "epoch": 0.450891861761427, + "grad_norm": 0.5716339349746704, + "learning_rate": 8e-05, + "loss": 1.8264, + "step": 8089 + }, + { + "epoch": 0.45094760312151616, + "grad_norm": 0.5319709777832031, + "learning_rate": 8e-05, + "loss": 1.6998, + "step": 8090 + }, + { + "epoch": 0.45100334448160534, + "grad_norm": 0.5010347366333008, + "learning_rate": 8e-05, + "loss": 1.5386, + "step": 8091 + }, + { + "epoch": 0.4510590858416945, + "grad_norm": 0.5667153596878052, + "learning_rate": 8e-05, + "loss": 1.9579, + "step": 8092 + }, + { + "epoch": 0.4511148272017837, + "grad_norm": 0.500321090221405, + "learning_rate": 8e-05, + "loss": 1.687, + "step": 8093 + }, + { + "epoch": 0.45117056856187293, + "grad_norm": 0.5381765365600586, + "learning_rate": 8e-05, + "loss": 1.8494, + "step": 8094 + }, + { + "epoch": 0.4512263099219621, + "grad_norm": 0.5462884902954102, + "learning_rate": 8e-05, + "loss": 1.6532, + "step": 8095 + }, + { + "epoch": 0.4512820512820513, + "grad_norm": 0.5044798851013184, + "learning_rate": 8e-05, + "loss": 1.7236, + "step": 8096 + }, + { + "epoch": 0.45133779264214047, + "grad_norm": 0.45130854845046997, + "learning_rate": 8e-05, + "loss": 1.0285, + "step": 8097 + }, + { + "epoch": 0.45139353400222965, + "grad_norm": 0.5413728356361389, + "learning_rate": 8e-05, + "loss": 1.7617, + "step": 8098 + }, + { + "epoch": 0.4514492753623188, + "grad_norm": 0.5397976636886597, + "learning_rate": 8e-05, + "loss": 1.6764, + "step": 8099 + }, + { + "epoch": 0.451505016722408, + "grad_norm": 0.562067985534668, + "learning_rate": 8e-05, + "loss": 1.7465, + "step": 8100 + }, + { + "epoch": 0.45156075808249724, + "grad_norm": 0.5796635746955872, + "learning_rate": 8e-05, + "loss": 1.7751, + "step": 8101 + }, + { + "epoch": 0.4516164994425864, + "grad_norm": 0.5075339078903198, + "learning_rate": 8e-05, + "loss": 1.5174, + "step": 8102 + }, + { + "epoch": 0.4516722408026756, + "grad_norm": 0.4336899220943451, + "learning_rate": 8e-05, + "loss": 1.0754, + 
"step": 8103 + }, + { + "epoch": 0.4517279821627648, + "grad_norm": 0.5077371001243591, + "learning_rate": 8e-05, + "loss": 1.4769, + "step": 8104 + }, + { + "epoch": 0.45178372352285395, + "grad_norm": 0.5493600964546204, + "learning_rate": 8e-05, + "loss": 1.8034, + "step": 8105 + }, + { + "epoch": 0.45183946488294313, + "grad_norm": 0.4964161217212677, + "learning_rate": 8e-05, + "loss": 1.6415, + "step": 8106 + }, + { + "epoch": 0.4518952062430323, + "grad_norm": 0.5899708867073059, + "learning_rate": 8e-05, + "loss": 1.6904, + "step": 8107 + }, + { + "epoch": 0.4519509476031215, + "grad_norm": 0.5874199271202087, + "learning_rate": 8e-05, + "loss": 1.7159, + "step": 8108 + }, + { + "epoch": 0.4520066889632107, + "grad_norm": 0.597144365310669, + "learning_rate": 8e-05, + "loss": 1.658, + "step": 8109 + }, + { + "epoch": 0.4520624303232999, + "grad_norm": 0.5587716102600098, + "learning_rate": 8e-05, + "loss": 1.4525, + "step": 8110 + }, + { + "epoch": 0.4521181716833891, + "grad_norm": 0.5530357360839844, + "learning_rate": 8e-05, + "loss": 1.6442, + "step": 8111 + }, + { + "epoch": 0.45217391304347826, + "grad_norm": 0.5947765111923218, + "learning_rate": 8e-05, + "loss": 1.8093, + "step": 8112 + }, + { + "epoch": 0.45222965440356744, + "grad_norm": 0.5420628190040588, + "learning_rate": 8e-05, + "loss": 1.4677, + "step": 8113 + }, + { + "epoch": 0.4522853957636566, + "grad_norm": 0.5657048225402832, + "learning_rate": 8e-05, + "loss": 1.796, + "step": 8114 + }, + { + "epoch": 0.4523411371237458, + "grad_norm": 0.5704300403594971, + "learning_rate": 8e-05, + "loss": 1.7856, + "step": 8115 + }, + { + "epoch": 0.45239687848383503, + "grad_norm": 0.581071674823761, + "learning_rate": 8e-05, + "loss": 1.7953, + "step": 8116 + }, + { + "epoch": 0.4524526198439242, + "grad_norm": 0.49436646699905396, + "learning_rate": 8e-05, + "loss": 1.3443, + "step": 8117 + }, + { + "epoch": 0.4525083612040134, + "grad_norm": 0.47054535150527954, + "learning_rate": 8e-05, + 
"loss": 1.5879, + "step": 8118 + }, + { + "epoch": 0.45256410256410257, + "grad_norm": 0.5406829118728638, + "learning_rate": 8e-05, + "loss": 1.5736, + "step": 8119 + }, + { + "epoch": 0.45261984392419174, + "grad_norm": 0.605000913143158, + "learning_rate": 8e-05, + "loss": 1.9112, + "step": 8120 + }, + { + "epoch": 0.4526755852842809, + "grad_norm": 0.56277996301651, + "learning_rate": 8e-05, + "loss": 1.7241, + "step": 8121 + }, + { + "epoch": 0.4527313266443701, + "grad_norm": 0.5486575961112976, + "learning_rate": 8e-05, + "loss": 1.6242, + "step": 8122 + }, + { + "epoch": 0.45278706800445934, + "grad_norm": 0.4798527657985687, + "learning_rate": 8e-05, + "loss": 1.339, + "step": 8123 + }, + { + "epoch": 0.4528428093645485, + "grad_norm": 0.5389805436134338, + "learning_rate": 8e-05, + "loss": 1.5075, + "step": 8124 + }, + { + "epoch": 0.4528985507246377, + "grad_norm": 0.5828685760498047, + "learning_rate": 8e-05, + "loss": 1.497, + "step": 8125 + }, + { + "epoch": 0.4529542920847269, + "grad_norm": 0.5291416645050049, + "learning_rate": 8e-05, + "loss": 1.5458, + "step": 8126 + }, + { + "epoch": 0.45301003344481605, + "grad_norm": 0.5561142563819885, + "learning_rate": 8e-05, + "loss": 1.7098, + "step": 8127 + }, + { + "epoch": 0.45306577480490523, + "grad_norm": 0.5542564392089844, + "learning_rate": 8e-05, + "loss": 1.9053, + "step": 8128 + }, + { + "epoch": 0.4531215161649944, + "grad_norm": 0.5583918690681458, + "learning_rate": 8e-05, + "loss": 1.7097, + "step": 8129 + }, + { + "epoch": 0.4531772575250836, + "grad_norm": 0.6063544750213623, + "learning_rate": 8e-05, + "loss": 1.6789, + "step": 8130 + }, + { + "epoch": 0.4532329988851728, + "grad_norm": 0.5362454056739807, + "learning_rate": 8e-05, + "loss": 1.5972, + "step": 8131 + }, + { + "epoch": 0.453288740245262, + "grad_norm": 0.5259758830070496, + "learning_rate": 8e-05, + "loss": 1.6535, + "step": 8132 + }, + { + "epoch": 0.4533444816053512, + "grad_norm": 0.5174777507781982, + "learning_rate": 
8e-05, + "loss": 1.6159, + "step": 8133 + }, + { + "epoch": 0.45340022296544036, + "grad_norm": 0.5773780345916748, + "learning_rate": 8e-05, + "loss": 1.842, + "step": 8134 + }, + { + "epoch": 0.45345596432552954, + "grad_norm": 0.5560287833213806, + "learning_rate": 8e-05, + "loss": 1.7839, + "step": 8135 + }, + { + "epoch": 0.4535117056856187, + "grad_norm": 0.6014232635498047, + "learning_rate": 8e-05, + "loss": 1.8046, + "step": 8136 + }, + { + "epoch": 0.4535674470457079, + "grad_norm": 0.5750653743743896, + "learning_rate": 8e-05, + "loss": 1.6044, + "step": 8137 + }, + { + "epoch": 0.45362318840579713, + "grad_norm": 0.4930271506309509, + "learning_rate": 8e-05, + "loss": 1.5389, + "step": 8138 + }, + { + "epoch": 0.4536789297658863, + "grad_norm": 0.515459418296814, + "learning_rate": 8e-05, + "loss": 1.5612, + "step": 8139 + }, + { + "epoch": 0.4537346711259755, + "grad_norm": 0.5473839640617371, + "learning_rate": 8e-05, + "loss": 1.645, + "step": 8140 + }, + { + "epoch": 0.45379041248606466, + "grad_norm": 0.5208000540733337, + "learning_rate": 8e-05, + "loss": 1.7808, + "step": 8141 + }, + { + "epoch": 0.45384615384615384, + "grad_norm": 0.5229403972625732, + "learning_rate": 8e-05, + "loss": 1.5968, + "step": 8142 + }, + { + "epoch": 0.453901895206243, + "grad_norm": 0.5802630186080933, + "learning_rate": 8e-05, + "loss": 1.6251, + "step": 8143 + }, + { + "epoch": 0.4539576365663322, + "grad_norm": 0.5335087180137634, + "learning_rate": 8e-05, + "loss": 1.7452, + "step": 8144 + }, + { + "epoch": 0.4540133779264214, + "grad_norm": 0.5810218453407288, + "learning_rate": 8e-05, + "loss": 1.8251, + "step": 8145 + }, + { + "epoch": 0.4540691192865106, + "grad_norm": 0.5198366045951843, + "learning_rate": 8e-05, + "loss": 1.6894, + "step": 8146 + }, + { + "epoch": 0.4541248606465998, + "grad_norm": 0.5838040709495544, + "learning_rate": 8e-05, + "loss": 1.7899, + "step": 8147 + }, + { + "epoch": 0.45418060200668897, + "grad_norm": 0.554544985294342, + 
"learning_rate": 8e-05, + "loss": 1.7979, + "step": 8148 + }, + { + "epoch": 0.45423634336677815, + "grad_norm": 0.5387526750564575, + "learning_rate": 8e-05, + "loss": 1.8184, + "step": 8149 + }, + { + "epoch": 0.45429208472686733, + "grad_norm": 0.5326625108718872, + "learning_rate": 8e-05, + "loss": 1.7459, + "step": 8150 + }, + { + "epoch": 0.4543478260869565, + "grad_norm": 0.5054154396057129, + "learning_rate": 8e-05, + "loss": 1.4972, + "step": 8151 + }, + { + "epoch": 0.4544035674470457, + "grad_norm": 0.5089333653450012, + "learning_rate": 8e-05, + "loss": 1.5232, + "step": 8152 + }, + { + "epoch": 0.4544593088071349, + "grad_norm": 0.5137532353401184, + "learning_rate": 8e-05, + "loss": 1.709, + "step": 8153 + }, + { + "epoch": 0.4545150501672241, + "grad_norm": 0.5468508005142212, + "learning_rate": 8e-05, + "loss": 1.7988, + "step": 8154 + }, + { + "epoch": 0.4545707915273133, + "grad_norm": 0.5106626152992249, + "learning_rate": 8e-05, + "loss": 1.5714, + "step": 8155 + }, + { + "epoch": 0.45462653288740246, + "grad_norm": 0.5454922318458557, + "learning_rate": 8e-05, + "loss": 1.7014, + "step": 8156 + }, + { + "epoch": 0.45468227424749164, + "grad_norm": 0.5221412181854248, + "learning_rate": 8e-05, + "loss": 1.6882, + "step": 8157 + }, + { + "epoch": 0.4547380156075808, + "grad_norm": 0.5512486100196838, + "learning_rate": 8e-05, + "loss": 1.75, + "step": 8158 + }, + { + "epoch": 0.45479375696767, + "grad_norm": 0.49368467926979065, + "learning_rate": 8e-05, + "loss": 1.5232, + "step": 8159 + }, + { + "epoch": 0.45484949832775917, + "grad_norm": 0.5954217910766602, + "learning_rate": 8e-05, + "loss": 1.8206, + "step": 8160 + }, + { + "epoch": 0.4549052396878484, + "grad_norm": 0.5530853271484375, + "learning_rate": 8e-05, + "loss": 1.7372, + "step": 8161 + }, + { + "epoch": 0.4549609810479376, + "grad_norm": 0.5027709007263184, + "learning_rate": 8e-05, + "loss": 1.3964, + "step": 8162 + }, + { + "epoch": 0.45501672240802676, + "grad_norm": 
0.5821712017059326, + "learning_rate": 8e-05, + "loss": 1.8771, + "step": 8163 + }, + { + "epoch": 0.45507246376811594, + "grad_norm": 0.5425708293914795, + "learning_rate": 8e-05, + "loss": 1.6491, + "step": 8164 + }, + { + "epoch": 0.4551282051282051, + "grad_norm": 0.5588460564613342, + "learning_rate": 8e-05, + "loss": 1.7911, + "step": 8165 + }, + { + "epoch": 0.4551839464882943, + "grad_norm": 0.5832526683807373, + "learning_rate": 8e-05, + "loss": 1.5692, + "step": 8166 + }, + { + "epoch": 0.4552396878483835, + "grad_norm": 0.5861908197402954, + "learning_rate": 8e-05, + "loss": 1.8063, + "step": 8167 + }, + { + "epoch": 0.4552954292084727, + "grad_norm": 0.5518962144851685, + "learning_rate": 8e-05, + "loss": 1.693, + "step": 8168 + }, + { + "epoch": 0.4553511705685619, + "grad_norm": 0.5097272992134094, + "learning_rate": 8e-05, + "loss": 1.5919, + "step": 8169 + }, + { + "epoch": 0.45540691192865107, + "grad_norm": 0.5612396597862244, + "learning_rate": 8e-05, + "loss": 1.6945, + "step": 8170 + }, + { + "epoch": 0.45546265328874025, + "grad_norm": 0.6262779235839844, + "learning_rate": 8e-05, + "loss": 1.754, + "step": 8171 + }, + { + "epoch": 0.45551839464882943, + "grad_norm": 0.5311568975448608, + "learning_rate": 8e-05, + "loss": 1.5949, + "step": 8172 + }, + { + "epoch": 0.4555741360089186, + "grad_norm": 0.5751360654830933, + "learning_rate": 8e-05, + "loss": 1.6484, + "step": 8173 + }, + { + "epoch": 0.4556298773690078, + "grad_norm": 0.6115161180496216, + "learning_rate": 8e-05, + "loss": 1.914, + "step": 8174 + }, + { + "epoch": 0.45568561872909696, + "grad_norm": 0.580069899559021, + "learning_rate": 8e-05, + "loss": 2.048, + "step": 8175 + }, + { + "epoch": 0.4557413600891862, + "grad_norm": 0.5459027290344238, + "learning_rate": 8e-05, + "loss": 1.6791, + "step": 8176 + }, + { + "epoch": 0.4557971014492754, + "grad_norm": 0.5423399806022644, + "learning_rate": 8e-05, + "loss": 1.8994, + "step": 8177 + }, + { + "epoch": 0.45585284280936456, + 
"grad_norm": 0.5573829412460327, + "learning_rate": 8e-05, + "loss": 1.9555, + "step": 8178 + }, + { + "epoch": 0.45590858416945373, + "grad_norm": 0.5473054051399231, + "learning_rate": 8e-05, + "loss": 1.8268, + "step": 8179 + }, + { + "epoch": 0.4559643255295429, + "grad_norm": 0.5032323002815247, + "learning_rate": 8e-05, + "loss": 1.6568, + "step": 8180 + }, + { + "epoch": 0.4560200668896321, + "grad_norm": 0.6185225248336792, + "learning_rate": 8e-05, + "loss": 1.6041, + "step": 8181 + }, + { + "epoch": 0.45607580824972127, + "grad_norm": 0.4937954545021057, + "learning_rate": 8e-05, + "loss": 1.5299, + "step": 8182 + }, + { + "epoch": 0.4561315496098105, + "grad_norm": 0.5103335380554199, + "learning_rate": 8e-05, + "loss": 1.7353, + "step": 8183 + }, + { + "epoch": 0.4561872909698997, + "grad_norm": 0.5169482231140137, + "learning_rate": 8e-05, + "loss": 1.645, + "step": 8184 + }, + { + "epoch": 0.45624303232998886, + "grad_norm": 0.5453347563743591, + "learning_rate": 8e-05, + "loss": 1.6612, + "step": 8185 + }, + { + "epoch": 0.45629877369007804, + "grad_norm": 0.5726072192192078, + "learning_rate": 8e-05, + "loss": 1.7785, + "step": 8186 + }, + { + "epoch": 0.4563545150501672, + "grad_norm": 0.5240086317062378, + "learning_rate": 8e-05, + "loss": 1.4897, + "step": 8187 + }, + { + "epoch": 0.4564102564102564, + "grad_norm": 0.6142365336418152, + "learning_rate": 8e-05, + "loss": 1.6084, + "step": 8188 + }, + { + "epoch": 0.4564659977703456, + "grad_norm": 0.5494696497917175, + "learning_rate": 8e-05, + "loss": 1.7353, + "step": 8189 + }, + { + "epoch": 0.45652173913043476, + "grad_norm": 0.5338488817214966, + "learning_rate": 8e-05, + "loss": 1.7151, + "step": 8190 + }, + { + "epoch": 0.456577480490524, + "grad_norm": 0.5470588207244873, + "learning_rate": 8e-05, + "loss": 1.8233, + "step": 8191 + }, + { + "epoch": 0.45663322185061317, + "grad_norm": 0.5292598605155945, + "learning_rate": 8e-05, + "loss": 1.7907, + "step": 8192 + }, + { + "epoch": 
0.45668896321070235, + "grad_norm": 0.52900230884552, + "learning_rate": 8e-05, + "loss": 1.8824, + "step": 8193 + }, + { + "epoch": 0.4567447045707915, + "grad_norm": 0.533981204032898, + "learning_rate": 8e-05, + "loss": 1.734, + "step": 8194 + }, + { + "epoch": 0.4568004459308807, + "grad_norm": 0.5497355461120605, + "learning_rate": 8e-05, + "loss": 1.644, + "step": 8195 + }, + { + "epoch": 0.4568561872909699, + "grad_norm": 0.5164412260055542, + "learning_rate": 8e-05, + "loss": 1.745, + "step": 8196 + }, + { + "epoch": 0.45691192865105906, + "grad_norm": 0.5101504921913147, + "learning_rate": 8e-05, + "loss": 1.7557, + "step": 8197 + }, + { + "epoch": 0.4569676700111483, + "grad_norm": 0.5320062637329102, + "learning_rate": 8e-05, + "loss": 1.7507, + "step": 8198 + }, + { + "epoch": 0.4570234113712375, + "grad_norm": 0.5199472308158875, + "learning_rate": 8e-05, + "loss": 1.6154, + "step": 8199 + }, + { + "epoch": 0.45707915273132665, + "grad_norm": 0.54566490650177, + "learning_rate": 8e-05, + "loss": 1.6001, + "step": 8200 + }, + { + "epoch": 0.45713489409141583, + "grad_norm": 0.4972429573535919, + "learning_rate": 8e-05, + "loss": 1.6148, + "step": 8201 + }, + { + "epoch": 0.457190635451505, + "grad_norm": 0.5039562582969666, + "learning_rate": 8e-05, + "loss": 1.4474, + "step": 8202 + }, + { + "epoch": 0.4572463768115942, + "grad_norm": 0.5226442217826843, + "learning_rate": 8e-05, + "loss": 1.6382, + "step": 8203 + }, + { + "epoch": 0.45730211817168337, + "grad_norm": 0.49594616889953613, + "learning_rate": 8e-05, + "loss": 1.5854, + "step": 8204 + }, + { + "epoch": 0.45735785953177255, + "grad_norm": 0.5066409111022949, + "learning_rate": 8e-05, + "loss": 1.4831, + "step": 8205 + }, + { + "epoch": 0.4574136008918618, + "grad_norm": 0.6288407444953918, + "learning_rate": 8e-05, + "loss": 1.9551, + "step": 8206 + }, + { + "epoch": 0.45746934225195096, + "grad_norm": 0.540301501750946, + "learning_rate": 8e-05, + "loss": 1.7047, + "step": 8207 + }, + { + 
"epoch": 0.45752508361204014, + "grad_norm": 0.5802655220031738, + "learning_rate": 8e-05, + "loss": 1.9917, + "step": 8208 + }, + { + "epoch": 0.4575808249721293, + "grad_norm": 0.5389567017555237, + "learning_rate": 8e-05, + "loss": 1.6907, + "step": 8209 + }, + { + "epoch": 0.4576365663322185, + "grad_norm": 0.49851271510124207, + "learning_rate": 8e-05, + "loss": 1.695, + "step": 8210 + }, + { + "epoch": 0.4576923076923077, + "grad_norm": 0.5140793919563293, + "learning_rate": 8e-05, + "loss": 1.6777, + "step": 8211 + }, + { + "epoch": 0.45774804905239685, + "grad_norm": 0.4872767925262451, + "learning_rate": 8e-05, + "loss": 1.4623, + "step": 8212 + }, + { + "epoch": 0.4578037904124861, + "grad_norm": 0.48600637912750244, + "learning_rate": 8e-05, + "loss": 1.4297, + "step": 8213 + }, + { + "epoch": 0.45785953177257527, + "grad_norm": 0.5143309235572815, + "learning_rate": 8e-05, + "loss": 1.5519, + "step": 8214 + }, + { + "epoch": 0.45791527313266445, + "grad_norm": 0.4996730089187622, + "learning_rate": 8e-05, + "loss": 1.5866, + "step": 8215 + }, + { + "epoch": 0.4579710144927536, + "grad_norm": 0.555542528629303, + "learning_rate": 8e-05, + "loss": 1.6117, + "step": 8216 + }, + { + "epoch": 0.4580267558528428, + "grad_norm": 0.5220987796783447, + "learning_rate": 8e-05, + "loss": 1.5935, + "step": 8217 + }, + { + "epoch": 0.458082497212932, + "grad_norm": 0.5406100749969482, + "learning_rate": 8e-05, + "loss": 1.6984, + "step": 8218 + }, + { + "epoch": 0.45813823857302116, + "grad_norm": 0.5386164784431458, + "learning_rate": 8e-05, + "loss": 1.7229, + "step": 8219 + }, + { + "epoch": 0.45819397993311034, + "grad_norm": 0.5644610524177551, + "learning_rate": 8e-05, + "loss": 1.7008, + "step": 8220 + }, + { + "epoch": 0.4582497212931996, + "grad_norm": 0.5660703182220459, + "learning_rate": 8e-05, + "loss": 1.7436, + "step": 8221 + }, + { + "epoch": 0.45830546265328875, + "grad_norm": 0.5259301662445068, + "learning_rate": 8e-05, + "loss": 1.7168, + "step": 
8222 + }, + { + "epoch": 0.45836120401337793, + "grad_norm": 0.5591362714767456, + "learning_rate": 8e-05, + "loss": 1.968, + "step": 8223 + }, + { + "epoch": 0.4584169453734671, + "grad_norm": 0.5394548773765564, + "learning_rate": 8e-05, + "loss": 1.6454, + "step": 8224 + }, + { + "epoch": 0.4584726867335563, + "grad_norm": 0.6189616322517395, + "learning_rate": 8e-05, + "loss": 2.2171, + "step": 8225 + }, + { + "epoch": 0.45852842809364547, + "grad_norm": 0.504211962223053, + "learning_rate": 8e-05, + "loss": 1.5599, + "step": 8226 + }, + { + "epoch": 0.45858416945373465, + "grad_norm": 0.5423411130905151, + "learning_rate": 8e-05, + "loss": 1.8359, + "step": 8227 + }, + { + "epoch": 0.4586399108138239, + "grad_norm": 0.6187911033630371, + "learning_rate": 8e-05, + "loss": 2.113, + "step": 8228 + }, + { + "epoch": 0.45869565217391306, + "grad_norm": 0.5360573530197144, + "learning_rate": 8e-05, + "loss": 1.6487, + "step": 8229 + }, + { + "epoch": 0.45875139353400224, + "grad_norm": 0.5503401160240173, + "learning_rate": 8e-05, + "loss": 1.6517, + "step": 8230 + }, + { + "epoch": 0.4588071348940914, + "grad_norm": 0.49501150846481323, + "learning_rate": 8e-05, + "loss": 1.5461, + "step": 8231 + }, + { + "epoch": 0.4588628762541806, + "grad_norm": 0.5979353785514832, + "learning_rate": 8e-05, + "loss": 1.9651, + "step": 8232 + }, + { + "epoch": 0.4589186176142698, + "grad_norm": 0.5210656523704529, + "learning_rate": 8e-05, + "loss": 1.5778, + "step": 8233 + }, + { + "epoch": 0.45897435897435895, + "grad_norm": 0.5004912614822388, + "learning_rate": 8e-05, + "loss": 1.5335, + "step": 8234 + }, + { + "epoch": 0.4590301003344482, + "grad_norm": 0.5546466708183289, + "learning_rate": 8e-05, + "loss": 1.8972, + "step": 8235 + }, + { + "epoch": 0.45908584169453737, + "grad_norm": 0.5235152840614319, + "learning_rate": 8e-05, + "loss": 1.6181, + "step": 8236 + }, + { + "epoch": 0.45914158305462655, + "grad_norm": 0.5469987392425537, + "learning_rate": 8e-05, + "loss": 
1.7178, + "step": 8237 + }, + { + "epoch": 0.4591973244147157, + "grad_norm": 0.5259850025177002, + "learning_rate": 8e-05, + "loss": 1.5475, + "step": 8238 + }, + { + "epoch": 0.4592530657748049, + "grad_norm": 0.5477474927902222, + "learning_rate": 8e-05, + "loss": 1.6689, + "step": 8239 + }, + { + "epoch": 0.4593088071348941, + "grad_norm": 0.5713329315185547, + "learning_rate": 8e-05, + "loss": 1.8525, + "step": 8240 + }, + { + "epoch": 0.45936454849498326, + "grad_norm": 0.5435706973075867, + "learning_rate": 8e-05, + "loss": 1.7169, + "step": 8241 + }, + { + "epoch": 0.45942028985507244, + "grad_norm": 0.5042891502380371, + "learning_rate": 8e-05, + "loss": 1.5887, + "step": 8242 + }, + { + "epoch": 0.4594760312151617, + "grad_norm": 0.5069809556007385, + "learning_rate": 8e-05, + "loss": 1.58, + "step": 8243 + }, + { + "epoch": 0.45953177257525085, + "grad_norm": 0.5203117728233337, + "learning_rate": 8e-05, + "loss": 1.643, + "step": 8244 + }, + { + "epoch": 0.45958751393534003, + "grad_norm": 0.5837569236755371, + "learning_rate": 8e-05, + "loss": 1.489, + "step": 8245 + }, + { + "epoch": 0.4596432552954292, + "grad_norm": 0.544971764087677, + "learning_rate": 8e-05, + "loss": 1.56, + "step": 8246 + }, + { + "epoch": 0.4596989966555184, + "grad_norm": 0.572570264339447, + "learning_rate": 8e-05, + "loss": 1.6885, + "step": 8247 + }, + { + "epoch": 0.45975473801560757, + "grad_norm": 0.5214897990226746, + "learning_rate": 8e-05, + "loss": 1.6651, + "step": 8248 + }, + { + "epoch": 0.45981047937569675, + "grad_norm": 0.5395656228065491, + "learning_rate": 8e-05, + "loss": 1.6458, + "step": 8249 + }, + { + "epoch": 0.459866220735786, + "grad_norm": 0.5213479399681091, + "learning_rate": 8e-05, + "loss": 1.5345, + "step": 8250 + }, + { + "epoch": 0.45992196209587516, + "grad_norm": 0.5405566692352295, + "learning_rate": 8e-05, + "loss": 1.7846, + "step": 8251 + }, + { + "epoch": 0.45997770345596434, + "grad_norm": 0.5674734115600586, + "learning_rate": 8e-05, 
+ "loss": 1.6661, + "step": 8252 + }, + { + "epoch": 0.4600334448160535, + "grad_norm": 0.5487441420555115, + "learning_rate": 8e-05, + "loss": 1.7767, + "step": 8253 + }, + { + "epoch": 0.4600891861761427, + "grad_norm": 0.5241776704788208, + "learning_rate": 8e-05, + "loss": 1.7186, + "step": 8254 + }, + { + "epoch": 0.4601449275362319, + "grad_norm": 0.5231916904449463, + "learning_rate": 8e-05, + "loss": 1.4834, + "step": 8255 + }, + { + "epoch": 0.46020066889632105, + "grad_norm": 0.5078309774398804, + "learning_rate": 8e-05, + "loss": 1.6286, + "step": 8256 + }, + { + "epoch": 0.46025641025641023, + "grad_norm": 0.6028822660446167, + "learning_rate": 8e-05, + "loss": 1.7446, + "step": 8257 + }, + { + "epoch": 0.46031215161649947, + "grad_norm": 0.49096351861953735, + "learning_rate": 8e-05, + "loss": 1.5099, + "step": 8258 + }, + { + "epoch": 0.46036789297658864, + "grad_norm": 0.5947312712669373, + "learning_rate": 8e-05, + "loss": 1.7774, + "step": 8259 + }, + { + "epoch": 0.4604236343366778, + "grad_norm": 0.5490503907203674, + "learning_rate": 8e-05, + "loss": 1.7369, + "step": 8260 + }, + { + "epoch": 0.460479375696767, + "grad_norm": 0.49701809883117676, + "learning_rate": 8e-05, + "loss": 1.4358, + "step": 8261 + }, + { + "epoch": 0.4605351170568562, + "grad_norm": 0.5505596399307251, + "learning_rate": 8e-05, + "loss": 1.5358, + "step": 8262 + }, + { + "epoch": 0.46059085841694536, + "grad_norm": 0.5386398434638977, + "learning_rate": 8e-05, + "loss": 1.5321, + "step": 8263 + }, + { + "epoch": 0.46064659977703454, + "grad_norm": 0.5729582905769348, + "learning_rate": 8e-05, + "loss": 1.7571, + "step": 8264 + }, + { + "epoch": 0.46070234113712377, + "grad_norm": 0.591675877571106, + "learning_rate": 8e-05, + "loss": 1.8728, + "step": 8265 + }, + { + "epoch": 0.46075808249721295, + "grad_norm": 0.5355835556983948, + "learning_rate": 8e-05, + "loss": 1.6417, + "step": 8266 + }, + { + "epoch": 0.46081382385730213, + "grad_norm": 0.5285724997520447, + 
"learning_rate": 8e-05, + "loss": 1.7523, + "step": 8267 + }, + { + "epoch": 0.4608695652173913, + "grad_norm": 0.49647387862205505, + "learning_rate": 8e-05, + "loss": 1.5041, + "step": 8268 + }, + { + "epoch": 0.4609253065774805, + "grad_norm": 0.5497033596038818, + "learning_rate": 8e-05, + "loss": 1.5984, + "step": 8269 + }, + { + "epoch": 0.46098104793756967, + "grad_norm": 0.5773898959159851, + "learning_rate": 8e-05, + "loss": 1.7893, + "step": 8270 + }, + { + "epoch": 0.46103678929765884, + "grad_norm": 0.5707722902297974, + "learning_rate": 8e-05, + "loss": 1.6682, + "step": 8271 + }, + { + "epoch": 0.461092530657748, + "grad_norm": 0.6010302901268005, + "learning_rate": 8e-05, + "loss": 1.7398, + "step": 8272 + }, + { + "epoch": 0.46114827201783726, + "grad_norm": 0.5731756091117859, + "learning_rate": 8e-05, + "loss": 1.7073, + "step": 8273 + }, + { + "epoch": 0.46120401337792644, + "grad_norm": 0.5730472207069397, + "learning_rate": 8e-05, + "loss": 1.655, + "step": 8274 + }, + { + "epoch": 0.4612597547380156, + "grad_norm": 0.5540065169334412, + "learning_rate": 8e-05, + "loss": 1.8349, + "step": 8275 + }, + { + "epoch": 0.4613154960981048, + "grad_norm": 0.5118550658226013, + "learning_rate": 8e-05, + "loss": 1.5659, + "step": 8276 + }, + { + "epoch": 0.461371237458194, + "grad_norm": 0.5175141096115112, + "learning_rate": 8e-05, + "loss": 1.7364, + "step": 8277 + }, + { + "epoch": 0.46142697881828315, + "grad_norm": 0.5034962296485901, + "learning_rate": 8e-05, + "loss": 1.5762, + "step": 8278 + }, + { + "epoch": 0.46148272017837233, + "grad_norm": 0.5668886303901672, + "learning_rate": 8e-05, + "loss": 1.7841, + "step": 8279 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 0.4840858280658722, + "learning_rate": 8e-05, + "loss": 1.6333, + "step": 8280 + }, + { + "epoch": 0.46159420289855074, + "grad_norm": 0.5296512246131897, + "learning_rate": 8e-05, + "loss": 1.7393, + "step": 8281 + }, + { + "epoch": 0.4616499442586399, + "grad_norm": 
0.5633383393287659, + "learning_rate": 8e-05, + "loss": 1.9755, + "step": 8282 + }, + { + "epoch": 0.4617056856187291, + "grad_norm": 0.5701639652252197, + "learning_rate": 8e-05, + "loss": 1.5747, + "step": 8283 + }, + { + "epoch": 0.4617614269788183, + "grad_norm": 0.5512565970420837, + "learning_rate": 8e-05, + "loss": 1.5175, + "step": 8284 + }, + { + "epoch": 0.46181716833890746, + "grad_norm": 0.5408527255058289, + "learning_rate": 8e-05, + "loss": 1.7483, + "step": 8285 + }, + { + "epoch": 0.46187290969899664, + "grad_norm": 0.5452939867973328, + "learning_rate": 8e-05, + "loss": 1.5374, + "step": 8286 + }, + { + "epoch": 0.4619286510590858, + "grad_norm": 0.5265973806381226, + "learning_rate": 8e-05, + "loss": 1.6867, + "step": 8287 + }, + { + "epoch": 0.46198439241917505, + "grad_norm": 0.5437179803848267, + "learning_rate": 8e-05, + "loss": 1.7102, + "step": 8288 + }, + { + "epoch": 0.46204013377926423, + "grad_norm": 0.5253024101257324, + "learning_rate": 8e-05, + "loss": 1.6835, + "step": 8289 + }, + { + "epoch": 0.4620958751393534, + "grad_norm": 0.524053156375885, + "learning_rate": 8e-05, + "loss": 1.6644, + "step": 8290 + }, + { + "epoch": 0.4621516164994426, + "grad_norm": 0.5442635416984558, + "learning_rate": 8e-05, + "loss": 1.6942, + "step": 8291 + }, + { + "epoch": 0.46220735785953176, + "grad_norm": 0.5474844574928284, + "learning_rate": 8e-05, + "loss": 1.636, + "step": 8292 + }, + { + "epoch": 0.46226309921962094, + "grad_norm": 0.5145108699798584, + "learning_rate": 8e-05, + "loss": 1.548, + "step": 8293 + }, + { + "epoch": 0.4623188405797101, + "grad_norm": 0.4863380789756775, + "learning_rate": 8e-05, + "loss": 1.5228, + "step": 8294 + }, + { + "epoch": 0.46237458193979936, + "grad_norm": 0.5318400263786316, + "learning_rate": 8e-05, + "loss": 1.3712, + "step": 8295 + }, + { + "epoch": 0.46243032329988853, + "grad_norm": 0.551691472530365, + "learning_rate": 8e-05, + "loss": 1.7128, + "step": 8296 + }, + { + "epoch": 0.4624860646599777, 
+ "grad_norm": 0.5184952616691589, + "learning_rate": 8e-05, + "loss": 1.39, + "step": 8297 + }, + { + "epoch": 0.4625418060200669, + "grad_norm": 0.5191618204116821, + "learning_rate": 8e-05, + "loss": 1.5686, + "step": 8298 + }, + { + "epoch": 0.46259754738015607, + "grad_norm": 0.55131596326828, + "learning_rate": 8e-05, + "loss": 1.5918, + "step": 8299 + }, + { + "epoch": 0.46265328874024525, + "grad_norm": 0.5466975569725037, + "learning_rate": 8e-05, + "loss": 1.7046, + "step": 8300 + }, + { + "epoch": 0.46270903010033443, + "grad_norm": 0.5339773297309875, + "learning_rate": 8e-05, + "loss": 1.6391, + "step": 8301 + }, + { + "epoch": 0.4627647714604236, + "grad_norm": 0.5683463215827942, + "learning_rate": 8e-05, + "loss": 1.7752, + "step": 8302 + }, + { + "epoch": 0.46282051282051284, + "grad_norm": 0.5539059042930603, + "learning_rate": 8e-05, + "loss": 1.7801, + "step": 8303 + }, + { + "epoch": 0.462876254180602, + "grad_norm": 0.5928812026977539, + "learning_rate": 8e-05, + "loss": 1.7587, + "step": 8304 + }, + { + "epoch": 0.4629319955406912, + "grad_norm": 0.6081556081771851, + "learning_rate": 8e-05, + "loss": 1.9025, + "step": 8305 + }, + { + "epoch": 0.4629877369007804, + "grad_norm": 0.5395975112915039, + "learning_rate": 8e-05, + "loss": 1.77, + "step": 8306 + }, + { + "epoch": 0.46304347826086956, + "grad_norm": 0.5138643383979797, + "learning_rate": 8e-05, + "loss": 1.6165, + "step": 8307 + }, + { + "epoch": 0.46309921962095874, + "grad_norm": 0.48940345644950867, + "learning_rate": 8e-05, + "loss": 1.6167, + "step": 8308 + }, + { + "epoch": 0.4631549609810479, + "grad_norm": 0.606408417224884, + "learning_rate": 8e-05, + "loss": 1.4015, + "step": 8309 + }, + { + "epoch": 0.46321070234113715, + "grad_norm": 0.5240685939788818, + "learning_rate": 8e-05, + "loss": 1.5085, + "step": 8310 + }, + { + "epoch": 0.4632664437012263, + "grad_norm": 0.5632039308547974, + "learning_rate": 8e-05, + "loss": 1.7416, + "step": 8311 + }, + { + "epoch": 
0.4633221850613155, + "grad_norm": 0.574478030204773, + "learning_rate": 8e-05, + "loss": 1.6456, + "step": 8312 + }, + { + "epoch": 0.4633779264214047, + "grad_norm": 0.5275530219078064, + "learning_rate": 8e-05, + "loss": 1.5196, + "step": 8313 + }, + { + "epoch": 0.46343366778149386, + "grad_norm": 0.5936029553413391, + "learning_rate": 8e-05, + "loss": 1.806, + "step": 8314 + }, + { + "epoch": 0.46348940914158304, + "grad_norm": 0.5386390686035156, + "learning_rate": 8e-05, + "loss": 1.7582, + "step": 8315 + }, + { + "epoch": 0.4635451505016722, + "grad_norm": 0.5835428237915039, + "learning_rate": 8e-05, + "loss": 1.7981, + "step": 8316 + }, + { + "epoch": 0.4636008918617614, + "grad_norm": 0.6085085868835449, + "learning_rate": 8e-05, + "loss": 2.0216, + "step": 8317 + }, + { + "epoch": 0.46365663322185063, + "grad_norm": 0.5607771277427673, + "learning_rate": 8e-05, + "loss": 1.5928, + "step": 8318 + }, + { + "epoch": 0.4637123745819398, + "grad_norm": 0.5809671878814697, + "learning_rate": 8e-05, + "loss": 1.7172, + "step": 8319 + }, + { + "epoch": 0.463768115942029, + "grad_norm": 0.5324718952178955, + "learning_rate": 8e-05, + "loss": 1.6558, + "step": 8320 + }, + { + "epoch": 0.46382385730211817, + "grad_norm": 0.5490670800209045, + "learning_rate": 8e-05, + "loss": 1.755, + "step": 8321 + }, + { + "epoch": 0.46387959866220735, + "grad_norm": 0.5422378778457642, + "learning_rate": 8e-05, + "loss": 1.7185, + "step": 8322 + }, + { + "epoch": 0.4639353400222965, + "grad_norm": 0.5793622136116028, + "learning_rate": 8e-05, + "loss": 1.7017, + "step": 8323 + }, + { + "epoch": 0.4639910813823857, + "grad_norm": 0.48198401927948, + "learning_rate": 8e-05, + "loss": 1.4639, + "step": 8324 + }, + { + "epoch": 0.46404682274247494, + "grad_norm": 0.9945459961891174, + "learning_rate": 8e-05, + "loss": 1.6198, + "step": 8325 + }, + { + "epoch": 0.4641025641025641, + "grad_norm": 0.5188176035881042, + "learning_rate": 8e-05, + "loss": 1.5543, + "step": 8326 + }, + { 
+ "epoch": 0.4641583054626533, + "grad_norm": 0.5144892930984497, + "learning_rate": 8e-05, + "loss": 1.5877, + "step": 8327 + }, + { + "epoch": 0.4642140468227425, + "grad_norm": 0.5563486218452454, + "learning_rate": 8e-05, + "loss": 1.6031, + "step": 8328 + }, + { + "epoch": 0.46426978818283166, + "grad_norm": 0.5223346948623657, + "learning_rate": 8e-05, + "loss": 1.7237, + "step": 8329 + }, + { + "epoch": 0.46432552954292083, + "grad_norm": 0.5695260763168335, + "learning_rate": 8e-05, + "loss": 1.7654, + "step": 8330 + }, + { + "epoch": 0.46438127090301, + "grad_norm": 0.5626814365386963, + "learning_rate": 8e-05, + "loss": 1.881, + "step": 8331 + }, + { + "epoch": 0.46443701226309925, + "grad_norm": 0.5374181866645813, + "learning_rate": 8e-05, + "loss": 1.7529, + "step": 8332 + }, + { + "epoch": 0.4644927536231884, + "grad_norm": 0.5513303279876709, + "learning_rate": 8e-05, + "loss": 1.8025, + "step": 8333 + }, + { + "epoch": 0.4645484949832776, + "grad_norm": 0.5162280201911926, + "learning_rate": 8e-05, + "loss": 1.5826, + "step": 8334 + }, + { + "epoch": 0.4646042363433668, + "grad_norm": 0.5183243751525879, + "learning_rate": 8e-05, + "loss": 1.4256, + "step": 8335 + }, + { + "epoch": 0.46465997770345596, + "grad_norm": 0.5621330738067627, + "learning_rate": 8e-05, + "loss": 1.5479, + "step": 8336 + }, + { + "epoch": 0.46471571906354514, + "grad_norm": 0.5799599289894104, + "learning_rate": 8e-05, + "loss": 1.9382, + "step": 8337 + }, + { + "epoch": 0.4647714604236343, + "grad_norm": 0.5067434310913086, + "learning_rate": 8e-05, + "loss": 1.423, + "step": 8338 + }, + { + "epoch": 0.4648272017837235, + "grad_norm": 0.5357816219329834, + "learning_rate": 8e-05, + "loss": 1.5762, + "step": 8339 + }, + { + "epoch": 0.46488294314381273, + "grad_norm": 0.4942900538444519, + "learning_rate": 8e-05, + "loss": 1.4523, + "step": 8340 + }, + { + "epoch": 0.4649386845039019, + "grad_norm": 0.5133934617042542, + "learning_rate": 8e-05, + "loss": 1.6795, + "step": 
8341 + }, + { + "epoch": 0.4649944258639911, + "grad_norm": 0.6044417023658752, + "learning_rate": 8e-05, + "loss": 1.8665, + "step": 8342 + }, + { + "epoch": 0.46505016722408027, + "grad_norm": 0.5712794065475464, + "learning_rate": 8e-05, + "loss": 1.7797, + "step": 8343 + }, + { + "epoch": 0.46510590858416945, + "grad_norm": 0.5416268706321716, + "learning_rate": 8e-05, + "loss": 1.5001, + "step": 8344 + }, + { + "epoch": 0.4651616499442586, + "grad_norm": 0.5173828601837158, + "learning_rate": 8e-05, + "loss": 1.6749, + "step": 8345 + }, + { + "epoch": 0.4652173913043478, + "grad_norm": 0.5193343162536621, + "learning_rate": 8e-05, + "loss": 1.6668, + "step": 8346 + }, + { + "epoch": 0.46527313266443704, + "grad_norm": 0.5364198088645935, + "learning_rate": 8e-05, + "loss": 1.9235, + "step": 8347 + }, + { + "epoch": 0.4653288740245262, + "grad_norm": 0.5241788029670715, + "learning_rate": 8e-05, + "loss": 1.659, + "step": 8348 + }, + { + "epoch": 0.4653846153846154, + "grad_norm": 0.5484277606010437, + "learning_rate": 8e-05, + "loss": 1.6348, + "step": 8349 + }, + { + "epoch": 0.4654403567447046, + "grad_norm": 0.5373669266700745, + "learning_rate": 8e-05, + "loss": 1.5944, + "step": 8350 + }, + { + "epoch": 0.46549609810479375, + "grad_norm": 0.5364479422569275, + "learning_rate": 8e-05, + "loss": 1.977, + "step": 8351 + }, + { + "epoch": 0.46555183946488293, + "grad_norm": 0.5501197576522827, + "learning_rate": 8e-05, + "loss": 1.5936, + "step": 8352 + }, + { + "epoch": 0.4656075808249721, + "grad_norm": 0.5254024267196655, + "learning_rate": 8e-05, + "loss": 1.8216, + "step": 8353 + }, + { + "epoch": 0.4656633221850613, + "grad_norm": 0.573980987071991, + "learning_rate": 8e-05, + "loss": 1.8467, + "step": 8354 + }, + { + "epoch": 0.4657190635451505, + "grad_norm": 0.5709452629089355, + "learning_rate": 8e-05, + "loss": 1.9287, + "step": 8355 + }, + { + "epoch": 0.4657748049052397, + "grad_norm": 0.5465955138206482, + "learning_rate": 8e-05, + "loss": 
1.7063, + "step": 8356 + }, + { + "epoch": 0.4658305462653289, + "grad_norm": 0.5366584658622742, + "learning_rate": 8e-05, + "loss": 1.5926, + "step": 8357 + }, + { + "epoch": 0.46588628762541806, + "grad_norm": 0.552971363067627, + "learning_rate": 8e-05, + "loss": 1.7139, + "step": 8358 + }, + { + "epoch": 0.46594202898550724, + "grad_norm": 0.46810775995254517, + "learning_rate": 8e-05, + "loss": 1.4272, + "step": 8359 + }, + { + "epoch": 0.4659977703455964, + "grad_norm": 0.536858081817627, + "learning_rate": 8e-05, + "loss": 1.8526, + "step": 8360 + }, + { + "epoch": 0.4660535117056856, + "grad_norm": 0.46676334738731384, + "learning_rate": 8e-05, + "loss": 1.5068, + "step": 8361 + }, + { + "epoch": 0.46610925306577483, + "grad_norm": 0.49330854415893555, + "learning_rate": 8e-05, + "loss": 1.5198, + "step": 8362 + }, + { + "epoch": 0.466164994425864, + "grad_norm": 0.6294517517089844, + "learning_rate": 8e-05, + "loss": 1.6882, + "step": 8363 + }, + { + "epoch": 0.4662207357859532, + "grad_norm": 0.6168404221534729, + "learning_rate": 8e-05, + "loss": 1.8669, + "step": 8364 + }, + { + "epoch": 0.46627647714604237, + "grad_norm": 0.5400106310844421, + "learning_rate": 8e-05, + "loss": 1.7639, + "step": 8365 + }, + { + "epoch": 0.46633221850613155, + "grad_norm": 0.5267928242683411, + "learning_rate": 8e-05, + "loss": 1.5111, + "step": 8366 + }, + { + "epoch": 0.4663879598662207, + "grad_norm": 0.5415551066398621, + "learning_rate": 8e-05, + "loss": 1.6299, + "step": 8367 + }, + { + "epoch": 0.4664437012263099, + "grad_norm": 0.505455493927002, + "learning_rate": 8e-05, + "loss": 1.5415, + "step": 8368 + }, + { + "epoch": 0.4664994425863991, + "grad_norm": 0.5676425695419312, + "learning_rate": 8e-05, + "loss": 1.7696, + "step": 8369 + }, + { + "epoch": 0.4665551839464883, + "grad_norm": 0.48888587951660156, + "learning_rate": 8e-05, + "loss": 1.4738, + "step": 8370 + }, + { + "epoch": 0.4666109253065775, + "grad_norm": 0.5303835272789001, + "learning_rate": 
8e-05, + "loss": 1.6513, + "step": 8371 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.5789093971252441, + "learning_rate": 8e-05, + "loss": 1.8686, + "step": 8372 + }, + { + "epoch": 0.46672240802675585, + "grad_norm": 0.5505883693695068, + "learning_rate": 8e-05, + "loss": 1.6741, + "step": 8373 + }, + { + "epoch": 0.46677814938684503, + "grad_norm": 0.6738007664680481, + "learning_rate": 8e-05, + "loss": 1.6415, + "step": 8374 + }, + { + "epoch": 0.4668338907469342, + "grad_norm": 0.6047912836074829, + "learning_rate": 8e-05, + "loss": 1.9487, + "step": 8375 + }, + { + "epoch": 0.4668896321070234, + "grad_norm": 0.5206901431083679, + "learning_rate": 8e-05, + "loss": 1.4995, + "step": 8376 + }, + { + "epoch": 0.4669453734671126, + "grad_norm": 0.5485989451408386, + "learning_rate": 8e-05, + "loss": 1.6406, + "step": 8377 + }, + { + "epoch": 0.4670011148272018, + "grad_norm": 0.5110633373260498, + "learning_rate": 8e-05, + "loss": 1.5564, + "step": 8378 + }, + { + "epoch": 0.467056856187291, + "grad_norm": 0.5560204982757568, + "learning_rate": 8e-05, + "loss": 1.8834, + "step": 8379 + }, + { + "epoch": 0.46711259754738016, + "grad_norm": 0.6657040119171143, + "learning_rate": 8e-05, + "loss": 1.4447, + "step": 8380 + }, + { + "epoch": 0.46716833890746934, + "grad_norm": 0.5403985381126404, + "learning_rate": 8e-05, + "loss": 1.733, + "step": 8381 + }, + { + "epoch": 0.4672240802675585, + "grad_norm": 0.5952733159065247, + "learning_rate": 8e-05, + "loss": 2.0547, + "step": 8382 + }, + { + "epoch": 0.4672798216276477, + "grad_norm": 0.5075384378433228, + "learning_rate": 8e-05, + "loss": 1.6152, + "step": 8383 + }, + { + "epoch": 0.4673355629877369, + "grad_norm": 0.5747597217559814, + "learning_rate": 8e-05, + "loss": 1.7946, + "step": 8384 + }, + { + "epoch": 0.4673913043478261, + "grad_norm": 0.6281068325042725, + "learning_rate": 8e-05, + "loss": 1.9414, + "step": 8385 + }, + { + "epoch": 0.4674470457079153, + "grad_norm": 0.5636025071144104, + 
"learning_rate": 8e-05, + "loss": 1.9194, + "step": 8386 + }, + { + "epoch": 0.46750278706800447, + "grad_norm": 0.5191909074783325, + "learning_rate": 8e-05, + "loss": 1.5481, + "step": 8387 + }, + { + "epoch": 0.46755852842809364, + "grad_norm": 0.5473641157150269, + "learning_rate": 8e-05, + "loss": 1.7042, + "step": 8388 + }, + { + "epoch": 0.4676142697881828, + "grad_norm": 0.5307797193527222, + "learning_rate": 8e-05, + "loss": 1.7302, + "step": 8389 + }, + { + "epoch": 0.467670011148272, + "grad_norm": 0.539536714553833, + "learning_rate": 8e-05, + "loss": 1.636, + "step": 8390 + }, + { + "epoch": 0.4677257525083612, + "grad_norm": 0.49172070622444153, + "learning_rate": 8e-05, + "loss": 1.5952, + "step": 8391 + }, + { + "epoch": 0.4677814938684504, + "grad_norm": 0.5307680368423462, + "learning_rate": 8e-05, + "loss": 1.6422, + "step": 8392 + }, + { + "epoch": 0.4678372352285396, + "grad_norm": 0.5460218787193298, + "learning_rate": 8e-05, + "loss": 1.8214, + "step": 8393 + }, + { + "epoch": 0.4678929765886288, + "grad_norm": 0.54899662733078, + "learning_rate": 8e-05, + "loss": 1.5739, + "step": 8394 + }, + { + "epoch": 0.46794871794871795, + "grad_norm": 0.5274426937103271, + "learning_rate": 8e-05, + "loss": 1.7815, + "step": 8395 + }, + { + "epoch": 0.46800445930880713, + "grad_norm": 0.49675053358078003, + "learning_rate": 8e-05, + "loss": 1.5168, + "step": 8396 + }, + { + "epoch": 0.4680602006688963, + "grad_norm": 0.5308000445365906, + "learning_rate": 8e-05, + "loss": 1.6003, + "step": 8397 + }, + { + "epoch": 0.4681159420289855, + "grad_norm": 0.5650284886360168, + "learning_rate": 8e-05, + "loss": 1.6046, + "step": 8398 + }, + { + "epoch": 0.46817168338907467, + "grad_norm": 0.5442639589309692, + "learning_rate": 8e-05, + "loss": 1.6954, + "step": 8399 + }, + { + "epoch": 0.4682274247491639, + "grad_norm": 0.6157032251358032, + "learning_rate": 8e-05, + "loss": 1.9056, + "step": 8400 + }, + { + "epoch": 0.4682831661092531, + "grad_norm": 
0.545790433883667, + "learning_rate": 8e-05, + "loss": 1.7362, + "step": 8401 + }, + { + "epoch": 0.46833890746934226, + "grad_norm": 0.5099235773086548, + "learning_rate": 8e-05, + "loss": 1.5272, + "step": 8402 + }, + { + "epoch": 0.46839464882943144, + "grad_norm": 0.5452757477760315, + "learning_rate": 8e-05, + "loss": 1.7499, + "step": 8403 + }, + { + "epoch": 0.4684503901895206, + "grad_norm": 0.5631169080734253, + "learning_rate": 8e-05, + "loss": 1.9252, + "step": 8404 + }, + { + "epoch": 0.4685061315496098, + "grad_norm": 0.5493608713150024, + "learning_rate": 8e-05, + "loss": 1.6625, + "step": 8405 + }, + { + "epoch": 0.468561872909699, + "grad_norm": 0.5756227374076843, + "learning_rate": 8e-05, + "loss": 1.8305, + "step": 8406 + }, + { + "epoch": 0.4686176142697882, + "grad_norm": 0.4990144670009613, + "learning_rate": 8e-05, + "loss": 1.4709, + "step": 8407 + }, + { + "epoch": 0.4686733556298774, + "grad_norm": 0.5849319100379944, + "learning_rate": 8e-05, + "loss": 1.8436, + "step": 8408 + }, + { + "epoch": 0.46872909698996656, + "grad_norm": 0.5466557145118713, + "learning_rate": 8e-05, + "loss": 1.8921, + "step": 8409 + }, + { + "epoch": 0.46878483835005574, + "grad_norm": 0.5994129776954651, + "learning_rate": 8e-05, + "loss": 1.7685, + "step": 8410 + }, + { + "epoch": 0.4688405797101449, + "grad_norm": 0.512111485004425, + "learning_rate": 8e-05, + "loss": 1.6275, + "step": 8411 + }, + { + "epoch": 0.4688963210702341, + "grad_norm": 0.536937952041626, + "learning_rate": 8e-05, + "loss": 1.7214, + "step": 8412 + }, + { + "epoch": 0.4689520624303233, + "grad_norm": 0.545914888381958, + "learning_rate": 8e-05, + "loss": 1.6529, + "step": 8413 + }, + { + "epoch": 0.46900780379041246, + "grad_norm": 0.5283071994781494, + "learning_rate": 8e-05, + "loss": 1.779, + "step": 8414 + }, + { + "epoch": 0.4690635451505017, + "grad_norm": 0.4989955723285675, + "learning_rate": 8e-05, + "loss": 1.3232, + "step": 8415 + }, + { + "epoch": 0.46911928651059087, + 
"grad_norm": 0.5377459526062012, + "learning_rate": 8e-05, + "loss": 1.6318, + "step": 8416 + }, + { + "epoch": 0.46917502787068005, + "grad_norm": 0.5159767270088196, + "learning_rate": 8e-05, + "loss": 1.6881, + "step": 8417 + }, + { + "epoch": 0.46923076923076923, + "grad_norm": 0.49078115820884705, + "learning_rate": 8e-05, + "loss": 1.7409, + "step": 8418 + }, + { + "epoch": 0.4692865105908584, + "grad_norm": 0.5152768492698669, + "learning_rate": 8e-05, + "loss": 1.5558, + "step": 8419 + }, + { + "epoch": 0.4693422519509476, + "grad_norm": 0.545647144317627, + "learning_rate": 8e-05, + "loss": 1.8672, + "step": 8420 + }, + { + "epoch": 0.46939799331103677, + "grad_norm": 0.5529956221580505, + "learning_rate": 8e-05, + "loss": 1.6215, + "step": 8421 + }, + { + "epoch": 0.469453734671126, + "grad_norm": 0.5335527062416077, + "learning_rate": 8e-05, + "loss": 1.7322, + "step": 8422 + }, + { + "epoch": 0.4695094760312152, + "grad_norm": 0.5761918425559998, + "learning_rate": 8e-05, + "loss": 1.8132, + "step": 8423 + }, + { + "epoch": 0.46956521739130436, + "grad_norm": 0.5925495028495789, + "learning_rate": 8e-05, + "loss": 1.8973, + "step": 8424 + }, + { + "epoch": 0.46962095875139354, + "grad_norm": 0.5872844457626343, + "learning_rate": 8e-05, + "loss": 1.6397, + "step": 8425 + }, + { + "epoch": 0.4696767001114827, + "grad_norm": 0.5724424123764038, + "learning_rate": 8e-05, + "loss": 1.6804, + "step": 8426 + }, + { + "epoch": 0.4697324414715719, + "grad_norm": 0.4837213158607483, + "learning_rate": 8e-05, + "loss": 1.4942, + "step": 8427 + }, + { + "epoch": 0.46978818283166107, + "grad_norm": 0.5545235276222229, + "learning_rate": 8e-05, + "loss": 1.3764, + "step": 8428 + }, + { + "epoch": 0.4698439241917503, + "grad_norm": 0.4970060884952545, + "learning_rate": 8e-05, + "loss": 1.5394, + "step": 8429 + }, + { + "epoch": 0.4698996655518395, + "grad_norm": 0.536975085735321, + "learning_rate": 8e-05, + "loss": 1.4085, + "step": 8430 + }, + { + "epoch": 
0.46995540691192866, + "grad_norm": 0.5321915745735168, + "learning_rate": 8e-05, + "loss": 1.4961, + "step": 8431 + }, + { + "epoch": 0.47001114827201784, + "grad_norm": 0.5285022258758545, + "learning_rate": 8e-05, + "loss": 1.4844, + "step": 8432 + }, + { + "epoch": 0.470066889632107, + "grad_norm": 0.5677052140235901, + "learning_rate": 8e-05, + "loss": 1.8176, + "step": 8433 + }, + { + "epoch": 0.4701226309921962, + "grad_norm": 0.5485408306121826, + "learning_rate": 8e-05, + "loss": 1.5995, + "step": 8434 + }, + { + "epoch": 0.4701783723522854, + "grad_norm": 0.5371769666671753, + "learning_rate": 8e-05, + "loss": 1.711, + "step": 8435 + }, + { + "epoch": 0.47023411371237456, + "grad_norm": 0.5499669909477234, + "learning_rate": 8e-05, + "loss": 1.5055, + "step": 8436 + }, + { + "epoch": 0.4702898550724638, + "grad_norm": 0.5971784591674805, + "learning_rate": 8e-05, + "loss": 1.6764, + "step": 8437 + }, + { + "epoch": 0.47034559643255297, + "grad_norm": 0.7127771377563477, + "learning_rate": 8e-05, + "loss": 1.1981, + "step": 8438 + }, + { + "epoch": 0.47040133779264215, + "grad_norm": 0.5157849192619324, + "learning_rate": 8e-05, + "loss": 1.4334, + "step": 8439 + }, + { + "epoch": 0.47045707915273133, + "grad_norm": 0.5078108906745911, + "learning_rate": 8e-05, + "loss": 1.5883, + "step": 8440 + }, + { + "epoch": 0.4705128205128205, + "grad_norm": 0.5465660095214844, + "learning_rate": 8e-05, + "loss": 1.691, + "step": 8441 + }, + { + "epoch": 0.4705685618729097, + "grad_norm": 0.5386843681335449, + "learning_rate": 8e-05, + "loss": 1.7306, + "step": 8442 + }, + { + "epoch": 0.47062430323299886, + "grad_norm": 0.5482935905456543, + "learning_rate": 8e-05, + "loss": 1.8809, + "step": 8443 + }, + { + "epoch": 0.4706800445930881, + "grad_norm": 0.5584401488304138, + "learning_rate": 8e-05, + "loss": 1.668, + "step": 8444 + }, + { + "epoch": 0.4707357859531773, + "grad_norm": 0.4966360926628113, + "learning_rate": 8e-05, + "loss": 1.5818, + "step": 8445 + }, + 
{ + "epoch": 0.47079152731326646, + "grad_norm": 0.5326342582702637, + "learning_rate": 8e-05, + "loss": 1.5713, + "step": 8446 + }, + { + "epoch": 0.47084726867335563, + "grad_norm": 0.542566180229187, + "learning_rate": 8e-05, + "loss": 1.7441, + "step": 8447 + }, + { + "epoch": 0.4709030100334448, + "grad_norm": 0.5604332089424133, + "learning_rate": 8e-05, + "loss": 1.6504, + "step": 8448 + }, + { + "epoch": 0.470958751393534, + "grad_norm": 0.5631572008132935, + "learning_rate": 8e-05, + "loss": 1.8132, + "step": 8449 + }, + { + "epoch": 0.47101449275362317, + "grad_norm": 0.5507049560546875, + "learning_rate": 8e-05, + "loss": 1.6097, + "step": 8450 + }, + { + "epoch": 0.47107023411371235, + "grad_norm": 0.5550432205200195, + "learning_rate": 8e-05, + "loss": 1.6979, + "step": 8451 + }, + { + "epoch": 0.4711259754738016, + "grad_norm": 0.5171401500701904, + "learning_rate": 8e-05, + "loss": 1.4894, + "step": 8452 + }, + { + "epoch": 0.47118171683389076, + "grad_norm": 0.5405150055885315, + "learning_rate": 8e-05, + "loss": 1.6846, + "step": 8453 + }, + { + "epoch": 0.47123745819397994, + "grad_norm": 0.536183774471283, + "learning_rate": 8e-05, + "loss": 1.4982, + "step": 8454 + }, + { + "epoch": 0.4712931995540691, + "grad_norm": 0.5685522556304932, + "learning_rate": 8e-05, + "loss": 1.785, + "step": 8455 + }, + { + "epoch": 0.4713489409141583, + "grad_norm": 0.5040280222892761, + "learning_rate": 8e-05, + "loss": 1.453, + "step": 8456 + }, + { + "epoch": 0.4714046822742475, + "grad_norm": 0.5402909517288208, + "learning_rate": 8e-05, + "loss": 1.7486, + "step": 8457 + }, + { + "epoch": 0.47146042363433666, + "grad_norm": 0.5560258030891418, + "learning_rate": 8e-05, + "loss": 1.636, + "step": 8458 + }, + { + "epoch": 0.4715161649944259, + "grad_norm": 0.5390499234199524, + "learning_rate": 8e-05, + "loss": 1.8613, + "step": 8459 + }, + { + "epoch": 0.47157190635451507, + "grad_norm": 0.5030994415283203, + "learning_rate": 8e-05, + "loss": 1.4811, + "step": 
8460 + }, + { + "epoch": 0.47162764771460425, + "grad_norm": 0.5098398923873901, + "learning_rate": 8e-05, + "loss": 1.6269, + "step": 8461 + }, + { + "epoch": 0.4716833890746934, + "grad_norm": 0.5928331017494202, + "learning_rate": 8e-05, + "loss": 1.8651, + "step": 8462 + }, + { + "epoch": 0.4717391304347826, + "grad_norm": 0.5353269577026367, + "learning_rate": 8e-05, + "loss": 1.6416, + "step": 8463 + }, + { + "epoch": 0.4717948717948718, + "grad_norm": 0.5925639867782593, + "learning_rate": 8e-05, + "loss": 1.7972, + "step": 8464 + }, + { + "epoch": 0.47185061315496096, + "grad_norm": 0.46216219663619995, + "learning_rate": 8e-05, + "loss": 1.4756, + "step": 8465 + }, + { + "epoch": 0.47190635451505014, + "grad_norm": 0.5451775193214417, + "learning_rate": 8e-05, + "loss": 1.7737, + "step": 8466 + }, + { + "epoch": 0.4719620958751394, + "grad_norm": 0.5439514517784119, + "learning_rate": 8e-05, + "loss": 1.8168, + "step": 8467 + }, + { + "epoch": 0.47201783723522855, + "grad_norm": 0.5746572613716125, + "learning_rate": 8e-05, + "loss": 1.8156, + "step": 8468 + }, + { + "epoch": 0.47207357859531773, + "grad_norm": 0.5776638388633728, + "learning_rate": 8e-05, + "loss": 1.808, + "step": 8469 + }, + { + "epoch": 0.4721293199554069, + "grad_norm": 0.5580638647079468, + "learning_rate": 8e-05, + "loss": 1.5294, + "step": 8470 + }, + { + "epoch": 0.4721850613154961, + "grad_norm": 0.5210429430007935, + "learning_rate": 8e-05, + "loss": 1.5537, + "step": 8471 + }, + { + "epoch": 0.47224080267558527, + "grad_norm": 0.5610958337783813, + "learning_rate": 8e-05, + "loss": 1.7074, + "step": 8472 + }, + { + "epoch": 0.47229654403567445, + "grad_norm": 0.5312358736991882, + "learning_rate": 8e-05, + "loss": 1.7302, + "step": 8473 + }, + { + "epoch": 0.4723522853957637, + "grad_norm": 0.5046020746231079, + "learning_rate": 8e-05, + "loss": 1.6344, + "step": 8474 + }, + { + "epoch": 0.47240802675585286, + "grad_norm": 0.5662330389022827, + "learning_rate": 8e-05, + "loss": 
1.8086, + "step": 8475 + }, + { + "epoch": 0.47246376811594204, + "grad_norm": 0.5743188261985779, + "learning_rate": 8e-05, + "loss": 1.7688, + "step": 8476 + }, + { + "epoch": 0.4725195094760312, + "grad_norm": 0.5933166742324829, + "learning_rate": 8e-05, + "loss": 1.6326, + "step": 8477 + }, + { + "epoch": 0.4725752508361204, + "grad_norm": 0.5768831968307495, + "learning_rate": 8e-05, + "loss": 1.6855, + "step": 8478 + }, + { + "epoch": 0.4726309921962096, + "grad_norm": 0.5617559552192688, + "learning_rate": 8e-05, + "loss": 1.5102, + "step": 8479 + }, + { + "epoch": 0.47268673355629875, + "grad_norm": 0.5413428544998169, + "learning_rate": 8e-05, + "loss": 1.6673, + "step": 8480 + }, + { + "epoch": 0.47274247491638793, + "grad_norm": 0.5708975195884705, + "learning_rate": 8e-05, + "loss": 1.7645, + "step": 8481 + }, + { + "epoch": 0.47279821627647717, + "grad_norm": 0.5631591081619263, + "learning_rate": 8e-05, + "loss": 1.7544, + "step": 8482 + }, + { + "epoch": 0.47285395763656635, + "grad_norm": 0.5494202375411987, + "learning_rate": 8e-05, + "loss": 1.6359, + "step": 8483 + }, + { + "epoch": 0.4729096989966555, + "grad_norm": 0.5949761867523193, + "learning_rate": 8e-05, + "loss": 1.7499, + "step": 8484 + }, + { + "epoch": 0.4729654403567447, + "grad_norm": 0.5369747877120972, + "learning_rate": 8e-05, + "loss": 1.6844, + "step": 8485 + }, + { + "epoch": 0.4730211817168339, + "grad_norm": 0.5614414215087891, + "learning_rate": 8e-05, + "loss": 1.7497, + "step": 8486 + }, + { + "epoch": 0.47307692307692306, + "grad_norm": 0.49822041392326355, + "learning_rate": 8e-05, + "loss": 1.4049, + "step": 8487 + }, + { + "epoch": 0.47313266443701224, + "grad_norm": 0.5776722431182861, + "learning_rate": 8e-05, + "loss": 1.6997, + "step": 8488 + }, + { + "epoch": 0.4731884057971015, + "grad_norm": null, + "learning_rate": 8e-05, + "loss": 2.2426, + "step": 8489 + }, + { + "epoch": 0.47324414715719065, + "grad_norm": 0.605370044708252, + "learning_rate": 8e-05, + 
"loss": 1.8601, + "step": 8490 + }, + { + "epoch": 0.47329988851727983, + "grad_norm": 0.5518434643745422, + "learning_rate": 8e-05, + "loss": 1.5683, + "step": 8491 + }, + { + "epoch": 0.473355629877369, + "grad_norm": 0.540189266204834, + "learning_rate": 8e-05, + "loss": 1.6447, + "step": 8492 + }, + { + "epoch": 0.4734113712374582, + "grad_norm": 0.5703498125076294, + "learning_rate": 8e-05, + "loss": 1.7412, + "step": 8493 + }, + { + "epoch": 0.47346711259754737, + "grad_norm": 0.5552210807800293, + "learning_rate": 8e-05, + "loss": 1.6012, + "step": 8494 + }, + { + "epoch": 0.47352285395763655, + "grad_norm": 0.5687077641487122, + "learning_rate": 8e-05, + "loss": 1.8304, + "step": 8495 + }, + { + "epoch": 0.4735785953177257, + "grad_norm": 0.5775901079177856, + "learning_rate": 8e-05, + "loss": 1.8197, + "step": 8496 + }, + { + "epoch": 0.47363433667781496, + "grad_norm": 0.5491219758987427, + "learning_rate": 8e-05, + "loss": 1.7912, + "step": 8497 + }, + { + "epoch": 0.47369007803790414, + "grad_norm": 0.520912230014801, + "learning_rate": 8e-05, + "loss": 1.7095, + "step": 8498 + }, + { + "epoch": 0.4737458193979933, + "grad_norm": 0.4933062791824341, + "learning_rate": 8e-05, + "loss": 1.4836, + "step": 8499 + }, + { + "epoch": 0.4738015607580825, + "grad_norm": 0.5397845506668091, + "learning_rate": 8e-05, + "loss": 1.8484, + "step": 8500 + }, + { + "epoch": 0.4738573021181717, + "grad_norm": 0.5235599279403687, + "learning_rate": 8e-05, + "loss": 1.823, + "step": 8501 + }, + { + "epoch": 0.47391304347826085, + "grad_norm": 0.5046244859695435, + "learning_rate": 8e-05, + "loss": 1.6333, + "step": 8502 + }, + { + "epoch": 0.47396878483835003, + "grad_norm": 0.5144981741905212, + "learning_rate": 8e-05, + "loss": 1.7431, + "step": 8503 + }, + { + "epoch": 0.47402452619843927, + "grad_norm": 0.5081440806388855, + "learning_rate": 8e-05, + "loss": 1.4979, + "step": 8504 + }, + { + "epoch": 0.47408026755852845, + "grad_norm": 0.550340473651886, + 
"learning_rate": 8e-05, + "loss": 1.6302, + "step": 8505 + }, + { + "epoch": 0.4741360089186176, + "grad_norm": 0.5056584477424622, + "learning_rate": 8e-05, + "loss": 1.4834, + "step": 8506 + }, + { + "epoch": 0.4741917502787068, + "grad_norm": 0.49061113595962524, + "learning_rate": 8e-05, + "loss": 1.4609, + "step": 8507 + }, + { + "epoch": 0.474247491638796, + "grad_norm": 0.6651285290718079, + "learning_rate": 8e-05, + "loss": 1.7197, + "step": 8508 + }, + { + "epoch": 0.47430323299888516, + "grad_norm": 0.5291346311569214, + "learning_rate": 8e-05, + "loss": 1.6432, + "step": 8509 + }, + { + "epoch": 0.47435897435897434, + "grad_norm": 0.5617138743400574, + "learning_rate": 8e-05, + "loss": 1.7347, + "step": 8510 + }, + { + "epoch": 0.4744147157190635, + "grad_norm": 0.498995304107666, + "learning_rate": 8e-05, + "loss": 1.5052, + "step": 8511 + }, + { + "epoch": 0.47447045707915275, + "grad_norm": 0.5444831252098083, + "learning_rate": 8e-05, + "loss": 1.7847, + "step": 8512 + }, + { + "epoch": 0.47452619843924193, + "grad_norm": 0.54232257604599, + "learning_rate": 8e-05, + "loss": 1.7109, + "step": 8513 + }, + { + "epoch": 0.4745819397993311, + "grad_norm": 0.5574086308479309, + "learning_rate": 8e-05, + "loss": 1.7192, + "step": 8514 + }, + { + "epoch": 0.4746376811594203, + "grad_norm": 0.5762770771980286, + "learning_rate": 8e-05, + "loss": 1.8808, + "step": 8515 + }, + { + "epoch": 0.47469342251950947, + "grad_norm": 0.5595988631248474, + "learning_rate": 8e-05, + "loss": 1.8657, + "step": 8516 + }, + { + "epoch": 0.47474916387959865, + "grad_norm": 0.5351122617721558, + "learning_rate": 8e-05, + "loss": 1.8992, + "step": 8517 + }, + { + "epoch": 0.4748049052396878, + "grad_norm": 0.5130336284637451, + "learning_rate": 8e-05, + "loss": 1.6199, + "step": 8518 + }, + { + "epoch": 0.47486064659977706, + "grad_norm": 0.5582343339920044, + "learning_rate": 8e-05, + "loss": 1.6847, + "step": 8519 + }, + { + "epoch": 0.47491638795986624, + "grad_norm": 
0.5283982157707214, + "learning_rate": 8e-05, + "loss": 1.6256, + "step": 8520 + }, + { + "epoch": 0.4749721293199554, + "grad_norm": 0.501308262348175, + "learning_rate": 8e-05, + "loss": 1.4266, + "step": 8521 + }, + { + "epoch": 0.4750278706800446, + "grad_norm": 0.5760260820388794, + "learning_rate": 8e-05, + "loss": 1.7956, + "step": 8522 + }, + { + "epoch": 0.4750836120401338, + "grad_norm": 0.5730380415916443, + "learning_rate": 8e-05, + "loss": 1.564, + "step": 8523 + }, + { + "epoch": 0.47513935340022295, + "grad_norm": 0.5436396598815918, + "learning_rate": 8e-05, + "loss": 1.7967, + "step": 8524 + }, + { + "epoch": 0.47519509476031213, + "grad_norm": 0.5383810997009277, + "learning_rate": 8e-05, + "loss": 1.6471, + "step": 8525 + }, + { + "epoch": 0.47525083612040137, + "grad_norm": 0.548120379447937, + "learning_rate": 8e-05, + "loss": 1.6541, + "step": 8526 + }, + { + "epoch": 0.47530657748049054, + "grad_norm": 0.5204375982284546, + "learning_rate": 8e-05, + "loss": 1.4784, + "step": 8527 + }, + { + "epoch": 0.4753623188405797, + "grad_norm": 0.562260091304779, + "learning_rate": 8e-05, + "loss": 1.6466, + "step": 8528 + }, + { + "epoch": 0.4754180602006689, + "grad_norm": 0.523824155330658, + "learning_rate": 8e-05, + "loss": 1.5682, + "step": 8529 + }, + { + "epoch": 0.4754738015607581, + "grad_norm": 0.5674720406532288, + "learning_rate": 8e-05, + "loss": 1.6642, + "step": 8530 + }, + { + "epoch": 0.47552954292084726, + "grad_norm": 0.49034515023231506, + "learning_rate": 8e-05, + "loss": 1.2825, + "step": 8531 + }, + { + "epoch": 0.47558528428093644, + "grad_norm": 0.5165945291519165, + "learning_rate": 8e-05, + "loss": 1.6967, + "step": 8532 + }, + { + "epoch": 0.4756410256410256, + "grad_norm": 0.5310879349708557, + "learning_rate": 8e-05, + "loss": 1.448, + "step": 8533 + }, + { + "epoch": 0.47569676700111485, + "grad_norm": 0.5824185609817505, + "learning_rate": 8e-05, + "loss": 1.836, + "step": 8534 + }, + { + "epoch": 0.47575250836120403, + 
"grad_norm": 0.4889031946659088, + "learning_rate": 8e-05, + "loss": 1.3147, + "step": 8535 + }, + { + "epoch": 0.4758082497212932, + "grad_norm": 0.6152478456497192, + "learning_rate": 8e-05, + "loss": 1.8151, + "step": 8536 + }, + { + "epoch": 0.4758639910813824, + "grad_norm": 0.5502965450286865, + "learning_rate": 8e-05, + "loss": 1.6876, + "step": 8537 + }, + { + "epoch": 0.47591973244147157, + "grad_norm": 0.5226942896842957, + "learning_rate": 8e-05, + "loss": 1.6196, + "step": 8538 + }, + { + "epoch": 0.47597547380156074, + "grad_norm": 0.5234166383743286, + "learning_rate": 8e-05, + "loss": 1.7366, + "step": 8539 + }, + { + "epoch": 0.4760312151616499, + "grad_norm": 0.5769938230514526, + "learning_rate": 8e-05, + "loss": 1.8112, + "step": 8540 + }, + { + "epoch": 0.47608695652173916, + "grad_norm": 0.5301511287689209, + "learning_rate": 8e-05, + "loss": 1.805, + "step": 8541 + }, + { + "epoch": 0.47614269788182834, + "grad_norm": 0.5410844087600708, + "learning_rate": 8e-05, + "loss": 1.7966, + "step": 8542 + }, + { + "epoch": 0.4761984392419175, + "grad_norm": 0.5708724856376648, + "learning_rate": 8e-05, + "loss": 1.7287, + "step": 8543 + }, + { + "epoch": 0.4762541806020067, + "grad_norm": 0.516877293586731, + "learning_rate": 8e-05, + "loss": 1.6566, + "step": 8544 + }, + { + "epoch": 0.4763099219620959, + "grad_norm": 0.5604516267776489, + "learning_rate": 8e-05, + "loss": 1.7456, + "step": 8545 + }, + { + "epoch": 0.47636566332218505, + "grad_norm": 0.5355717539787292, + "learning_rate": 8e-05, + "loss": 1.6313, + "step": 8546 + }, + { + "epoch": 0.47642140468227423, + "grad_norm": 0.5061933994293213, + "learning_rate": 8e-05, + "loss": 1.5322, + "step": 8547 + }, + { + "epoch": 0.4764771460423634, + "grad_norm": 0.5413419604301453, + "learning_rate": 8e-05, + "loss": 1.7215, + "step": 8548 + }, + { + "epoch": 0.47653288740245264, + "grad_norm": 0.5458749532699585, + "learning_rate": 8e-05, + "loss": 1.8465, + "step": 8549 + }, + { + "epoch": 
0.4765886287625418, + "grad_norm": 0.5370244979858398, + "learning_rate": 8e-05, + "loss": 1.5644, + "step": 8550 + }, + { + "epoch": 0.476644370122631, + "grad_norm": 0.5661664605140686, + "learning_rate": 8e-05, + "loss": 1.7357, + "step": 8551 + }, + { + "epoch": 0.4767001114827202, + "grad_norm": 0.5611180663108826, + "learning_rate": 8e-05, + "loss": 1.8036, + "step": 8552 + }, + { + "epoch": 0.47675585284280936, + "grad_norm": 0.5248943567276001, + "learning_rate": 8e-05, + "loss": 1.515, + "step": 8553 + }, + { + "epoch": 0.47681159420289854, + "grad_norm": 0.5423473715782166, + "learning_rate": 8e-05, + "loss": 1.6425, + "step": 8554 + }, + { + "epoch": 0.4768673355629877, + "grad_norm": 0.6139373183250427, + "learning_rate": 8e-05, + "loss": 1.7463, + "step": 8555 + }, + { + "epoch": 0.47692307692307695, + "grad_norm": 0.5869265794754028, + "learning_rate": 8e-05, + "loss": 1.7739, + "step": 8556 + }, + { + "epoch": 0.47697881828316613, + "grad_norm": 0.6074341535568237, + "learning_rate": 8e-05, + "loss": 1.9197, + "step": 8557 + }, + { + "epoch": 0.4770345596432553, + "grad_norm": 0.5665176510810852, + "learning_rate": 8e-05, + "loss": 1.6443, + "step": 8558 + }, + { + "epoch": 0.4770903010033445, + "grad_norm": 0.5026342272758484, + "learning_rate": 8e-05, + "loss": 1.4091, + "step": 8559 + }, + { + "epoch": 0.47714604236343366, + "grad_norm": 0.5059593915939331, + "learning_rate": 8e-05, + "loss": 1.5653, + "step": 8560 + }, + { + "epoch": 0.47720178372352284, + "grad_norm": 0.5121914744377136, + "learning_rate": 8e-05, + "loss": 1.3864, + "step": 8561 + }, + { + "epoch": 0.477257525083612, + "grad_norm": 0.5353608727455139, + "learning_rate": 8e-05, + "loss": 1.611, + "step": 8562 + }, + { + "epoch": 0.4773132664437012, + "grad_norm": 0.49897637963294983, + "learning_rate": 8e-05, + "loss": 1.6633, + "step": 8563 + }, + { + "epoch": 0.47736900780379043, + "grad_norm": 0.5345289707183838, + "learning_rate": 8e-05, + "loss": 1.7425, + "step": 8564 + }, 
+ { + "epoch": 0.4774247491638796, + "grad_norm": 0.5157424807548523, + "learning_rate": 8e-05, + "loss": 1.4158, + "step": 8565 + }, + { + "epoch": 0.4774804905239688, + "grad_norm": 0.49930742383003235, + "learning_rate": 8e-05, + "loss": 1.6463, + "step": 8566 + }, + { + "epoch": 0.47753623188405797, + "grad_norm": 0.5590001344680786, + "learning_rate": 8e-05, + "loss": 1.7468, + "step": 8567 + }, + { + "epoch": 0.47759197324414715, + "grad_norm": 0.603706419467926, + "learning_rate": 8e-05, + "loss": 1.8045, + "step": 8568 + }, + { + "epoch": 0.47764771460423633, + "grad_norm": 0.5573416948318481, + "learning_rate": 8e-05, + "loss": 1.7055, + "step": 8569 + }, + { + "epoch": 0.4777034559643255, + "grad_norm": 0.5743922591209412, + "learning_rate": 8e-05, + "loss": 1.8453, + "step": 8570 + }, + { + "epoch": 0.47775919732441474, + "grad_norm": 0.5786899328231812, + "learning_rate": 8e-05, + "loss": 1.8062, + "step": 8571 + }, + { + "epoch": 0.4778149386845039, + "grad_norm": 0.511255145072937, + "learning_rate": 8e-05, + "loss": 1.6474, + "step": 8572 + }, + { + "epoch": 0.4778706800445931, + "grad_norm": 0.51045823097229, + "learning_rate": 8e-05, + "loss": 1.4694, + "step": 8573 + }, + { + "epoch": 0.4779264214046823, + "grad_norm": 0.5715101361274719, + "learning_rate": 8e-05, + "loss": 1.7614, + "step": 8574 + }, + { + "epoch": 0.47798216276477146, + "grad_norm": 0.5859196186065674, + "learning_rate": 8e-05, + "loss": 1.6408, + "step": 8575 + }, + { + "epoch": 0.47803790412486064, + "grad_norm": 0.5789876580238342, + "learning_rate": 8e-05, + "loss": 1.8372, + "step": 8576 + }, + { + "epoch": 0.4780936454849498, + "grad_norm": 0.5685731768608093, + "learning_rate": 8e-05, + "loss": 1.5789, + "step": 8577 + }, + { + "epoch": 0.478149386845039, + "grad_norm": 0.5826317071914673, + "learning_rate": 8e-05, + "loss": 1.6124, + "step": 8578 + }, + { + "epoch": 0.4782051282051282, + "grad_norm": 0.5494159460067749, + "learning_rate": 8e-05, + "loss": 1.6284, + 
"step": 8579 + }, + { + "epoch": 0.4782608695652174, + "grad_norm": 0.4774226248264313, + "learning_rate": 8e-05, + "loss": 1.3509, + "step": 8580 + }, + { + "epoch": 0.4783166109253066, + "grad_norm": 0.5364505648612976, + "learning_rate": 8e-05, + "loss": 1.6744, + "step": 8581 + }, + { + "epoch": 0.47837235228539576, + "grad_norm": 0.5289744734764099, + "learning_rate": 8e-05, + "loss": 1.5394, + "step": 8582 + }, + { + "epoch": 0.47842809364548494, + "grad_norm": 0.579842746257782, + "learning_rate": 8e-05, + "loss": 1.7983, + "step": 8583 + }, + { + "epoch": 0.4784838350055741, + "grad_norm": 0.5630995035171509, + "learning_rate": 8e-05, + "loss": 1.6644, + "step": 8584 + }, + { + "epoch": 0.4785395763656633, + "grad_norm": 0.5356509685516357, + "learning_rate": 8e-05, + "loss": 1.6247, + "step": 8585 + }, + { + "epoch": 0.47859531772575253, + "grad_norm": 0.5014398097991943, + "learning_rate": 8e-05, + "loss": 1.5935, + "step": 8586 + }, + { + "epoch": 0.4786510590858417, + "grad_norm": 0.5287692546844482, + "learning_rate": 8e-05, + "loss": 1.7297, + "step": 8587 + }, + { + "epoch": 0.4787068004459309, + "grad_norm": 0.5526402592658997, + "learning_rate": 8e-05, + "loss": 1.5543, + "step": 8588 + }, + { + "epoch": 0.47876254180602007, + "grad_norm": 0.547342836856842, + "learning_rate": 8e-05, + "loss": 1.5193, + "step": 8589 + }, + { + "epoch": 0.47881828316610925, + "grad_norm": 0.5509900450706482, + "learning_rate": 8e-05, + "loss": 1.7922, + "step": 8590 + }, + { + "epoch": 0.4788740245261984, + "grad_norm": 0.5440711975097656, + "learning_rate": 8e-05, + "loss": 1.5476, + "step": 8591 + }, + { + "epoch": 0.4789297658862876, + "grad_norm": 0.5216376185417175, + "learning_rate": 8e-05, + "loss": 1.4191, + "step": 8592 + }, + { + "epoch": 0.4789855072463768, + "grad_norm": 0.5691701769828796, + "learning_rate": 8e-05, + "loss": 1.6273, + "step": 8593 + }, + { + "epoch": 0.479041248606466, + "grad_norm": 0.5230170488357544, + "learning_rate": 8e-05, + 
"loss": 1.6368, + "step": 8594 + }, + { + "epoch": 0.4790969899665552, + "grad_norm": 0.5238328576087952, + "learning_rate": 8e-05, + "loss": 1.6142, + "step": 8595 + }, + { + "epoch": 0.4791527313266444, + "grad_norm": 0.5560547709465027, + "learning_rate": 8e-05, + "loss": 1.6971, + "step": 8596 + }, + { + "epoch": 0.47920847268673356, + "grad_norm": 0.5416411757469177, + "learning_rate": 8e-05, + "loss": 1.7525, + "step": 8597 + }, + { + "epoch": 0.47926421404682273, + "grad_norm": 0.5670372843742371, + "learning_rate": 8e-05, + "loss": 1.7209, + "step": 8598 + }, + { + "epoch": 0.4793199554069119, + "grad_norm": 0.559459924697876, + "learning_rate": 8e-05, + "loss": 1.7211, + "step": 8599 + }, + { + "epoch": 0.4793756967670011, + "grad_norm": 0.5402252078056335, + "learning_rate": 8e-05, + "loss": 1.713, + "step": 8600 + }, + { + "epoch": 0.4794314381270903, + "grad_norm": 0.5224754214286804, + "learning_rate": 8e-05, + "loss": 1.5075, + "step": 8601 + }, + { + "epoch": 0.4794871794871795, + "grad_norm": 0.5436088442802429, + "learning_rate": 8e-05, + "loss": 1.4942, + "step": 8602 + }, + { + "epoch": 0.4795429208472687, + "grad_norm": 0.7566401362419128, + "learning_rate": 8e-05, + "loss": 1.8953, + "step": 8603 + }, + { + "epoch": 0.47959866220735786, + "grad_norm": 0.5927697420120239, + "learning_rate": 8e-05, + "loss": 1.8098, + "step": 8604 + }, + { + "epoch": 0.47965440356744704, + "grad_norm": 0.5384635925292969, + "learning_rate": 8e-05, + "loss": 1.7361, + "step": 8605 + }, + { + "epoch": 0.4797101449275362, + "grad_norm": 0.527228832244873, + "learning_rate": 8e-05, + "loss": 1.5468, + "step": 8606 + }, + { + "epoch": 0.4797658862876254, + "grad_norm": 0.5674744248390198, + "learning_rate": 8e-05, + "loss": 1.5369, + "step": 8607 + }, + { + "epoch": 0.4798216276477146, + "grad_norm": 0.5231080055236816, + "learning_rate": 8e-05, + "loss": 1.622, + "step": 8608 + }, + { + "epoch": 0.4798773690078038, + "grad_norm": 0.7882029414176941, + 
"learning_rate": 8e-05, + "loss": 1.726, + "step": 8609 + }, + { + "epoch": 0.479933110367893, + "grad_norm": 0.5434726476669312, + "learning_rate": 8e-05, + "loss": 1.8462, + "step": 8610 + }, + { + "epoch": 0.47998885172798217, + "grad_norm": 0.5238932371139526, + "learning_rate": 8e-05, + "loss": 1.6484, + "step": 8611 + }, + { + "epoch": 0.48004459308807135, + "grad_norm": 0.5556615591049194, + "learning_rate": 8e-05, + "loss": 1.7166, + "step": 8612 + }, + { + "epoch": 0.4801003344481605, + "grad_norm": 0.5250589847564697, + "learning_rate": 8e-05, + "loss": 1.758, + "step": 8613 + }, + { + "epoch": 0.4801560758082497, + "grad_norm": 0.5367085337638855, + "learning_rate": 8e-05, + "loss": 1.6922, + "step": 8614 + }, + { + "epoch": 0.4802118171683389, + "grad_norm": 0.572113037109375, + "learning_rate": 8e-05, + "loss": 1.6577, + "step": 8615 + }, + { + "epoch": 0.4802675585284281, + "grad_norm": 0.5322007536888123, + "learning_rate": 8e-05, + "loss": 1.7393, + "step": 8616 + }, + { + "epoch": 0.4803232998885173, + "grad_norm": 0.5749253630638123, + "learning_rate": 8e-05, + "loss": 1.4501, + "step": 8617 + }, + { + "epoch": 0.4803790412486065, + "grad_norm": 0.5245643854141235, + "learning_rate": 8e-05, + "loss": 1.5195, + "step": 8618 + }, + { + "epoch": 0.48043478260869565, + "grad_norm": 0.5711345672607422, + "learning_rate": 8e-05, + "loss": 1.8074, + "step": 8619 + }, + { + "epoch": 0.48049052396878483, + "grad_norm": 0.5806153416633606, + "learning_rate": 8e-05, + "loss": 1.6689, + "step": 8620 + }, + { + "epoch": 0.480546265328874, + "grad_norm": 0.5262450575828552, + "learning_rate": 8e-05, + "loss": 1.621, + "step": 8621 + }, + { + "epoch": 0.4806020066889632, + "grad_norm": 0.5317466259002686, + "learning_rate": 8e-05, + "loss": 1.7104, + "step": 8622 + }, + { + "epoch": 0.4806577480490524, + "grad_norm": 0.5976824164390564, + "learning_rate": 8e-05, + "loss": 1.6694, + "step": 8623 + }, + { + "epoch": 0.4807134894091416, + "grad_norm": 
0.5373111963272095, + "learning_rate": 8e-05, + "loss": 1.6995, + "step": 8624 + }, + { + "epoch": 0.4807692307692308, + "grad_norm": 0.5650762915611267, + "learning_rate": 8e-05, + "loss": 1.7193, + "step": 8625 + }, + { + "epoch": 0.48082497212931996, + "grad_norm": 0.4892181158065796, + "learning_rate": 8e-05, + "loss": 1.5967, + "step": 8626 + }, + { + "epoch": 0.48088071348940914, + "grad_norm": 0.4929823577404022, + "learning_rate": 8e-05, + "loss": 1.5012, + "step": 8627 + }, + { + "epoch": 0.4809364548494983, + "grad_norm": 0.5495728850364685, + "learning_rate": 8e-05, + "loss": 1.7687, + "step": 8628 + }, + { + "epoch": 0.4809921962095875, + "grad_norm": 0.5166272521018982, + "learning_rate": 8e-05, + "loss": 1.6092, + "step": 8629 + }, + { + "epoch": 0.4810479375696767, + "grad_norm": 0.5548287630081177, + "learning_rate": 8e-05, + "loss": 1.5799, + "step": 8630 + }, + { + "epoch": 0.4811036789297659, + "grad_norm": 0.5389736890792847, + "learning_rate": 8e-05, + "loss": 1.5249, + "step": 8631 + }, + { + "epoch": 0.4811594202898551, + "grad_norm": 0.5570147633552551, + "learning_rate": 8e-05, + "loss": 1.7399, + "step": 8632 + }, + { + "epoch": 0.48121516164994427, + "grad_norm": 0.528630793094635, + "learning_rate": 8e-05, + "loss": 1.5725, + "step": 8633 + }, + { + "epoch": 0.48127090301003345, + "grad_norm": 0.5272868275642395, + "learning_rate": 8e-05, + "loss": 1.6709, + "step": 8634 + }, + { + "epoch": 0.4813266443701226, + "grad_norm": 0.5915762782096863, + "learning_rate": 8e-05, + "loss": 1.9041, + "step": 8635 + }, + { + "epoch": 0.4813823857302118, + "grad_norm": 0.5859066247940063, + "learning_rate": 8e-05, + "loss": 1.942, + "step": 8636 + }, + { + "epoch": 0.481438127090301, + "grad_norm": 0.5061094760894775, + "learning_rate": 8e-05, + "loss": 1.5464, + "step": 8637 + }, + { + "epoch": 0.4814938684503902, + "grad_norm": 0.5594993233680725, + "learning_rate": 8e-05, + "loss": 1.6924, + "step": 8638 + }, + { + "epoch": 0.4815496098104794, + 
"grad_norm": 0.5258274674415588, + "learning_rate": 8e-05, + "loss": 1.5865, + "step": 8639 + }, + { + "epoch": 0.4816053511705686, + "grad_norm": 0.5139769315719604, + "learning_rate": 8e-05, + "loss": 1.6557, + "step": 8640 + }, + { + "epoch": 0.48166109253065775, + "grad_norm": 0.5273594856262207, + "learning_rate": 8e-05, + "loss": 1.6087, + "step": 8641 + }, + { + "epoch": 0.48171683389074693, + "grad_norm": 0.6034706830978394, + "learning_rate": 8e-05, + "loss": 1.7294, + "step": 8642 + }, + { + "epoch": 0.4817725752508361, + "grad_norm": 0.5553348660469055, + "learning_rate": 8e-05, + "loss": 1.4567, + "step": 8643 + }, + { + "epoch": 0.4818283166109253, + "grad_norm": 0.5267349481582642, + "learning_rate": 8e-05, + "loss": 1.517, + "step": 8644 + }, + { + "epoch": 0.48188405797101447, + "grad_norm": 0.5282378196716309, + "learning_rate": 8e-05, + "loss": 1.5921, + "step": 8645 + }, + { + "epoch": 0.4819397993311037, + "grad_norm": 0.526695728302002, + "learning_rate": 8e-05, + "loss": 1.7406, + "step": 8646 + }, + { + "epoch": 0.4819955406911929, + "grad_norm": 0.5395731925964355, + "learning_rate": 8e-05, + "loss": 1.4693, + "step": 8647 + }, + { + "epoch": 0.48205128205128206, + "grad_norm": 0.5188526511192322, + "learning_rate": 8e-05, + "loss": 1.4089, + "step": 8648 + }, + { + "epoch": 0.48210702341137124, + "grad_norm": 0.5890273451805115, + "learning_rate": 8e-05, + "loss": 1.765, + "step": 8649 + }, + { + "epoch": 0.4821627647714604, + "grad_norm": 0.5989070534706116, + "learning_rate": 8e-05, + "loss": 1.8159, + "step": 8650 + }, + { + "epoch": 0.4822185061315496, + "grad_norm": 0.5372903943061829, + "learning_rate": 8e-05, + "loss": 1.7536, + "step": 8651 + }, + { + "epoch": 0.4822742474916388, + "grad_norm": 0.5370186567306519, + "learning_rate": 8e-05, + "loss": 1.5296, + "step": 8652 + }, + { + "epoch": 0.482329988851728, + "grad_norm": 0.5499247908592224, + "learning_rate": 8e-05, + "loss": 1.5902, + "step": 8653 + }, + { + "epoch": 
0.4823857302118172, + "grad_norm": 0.5799142122268677, + "learning_rate": 8e-05, + "loss": 1.9175, + "step": 8654 + }, + { + "epoch": 0.48244147157190637, + "grad_norm": 0.562670886516571, + "learning_rate": 8e-05, + "loss": 1.7646, + "step": 8655 + }, + { + "epoch": 0.48249721293199554, + "grad_norm": 0.5562814474105835, + "learning_rate": 8e-05, + "loss": 1.7472, + "step": 8656 + }, + { + "epoch": 0.4825529542920847, + "grad_norm": 0.5497145652770996, + "learning_rate": 8e-05, + "loss": 1.7556, + "step": 8657 + }, + { + "epoch": 0.4826086956521739, + "grad_norm": 0.5814297795295715, + "learning_rate": 8e-05, + "loss": 1.6638, + "step": 8658 + }, + { + "epoch": 0.4826644370122631, + "grad_norm": 0.5385329723358154, + "learning_rate": 8e-05, + "loss": 1.6465, + "step": 8659 + }, + { + "epoch": 0.48272017837235226, + "grad_norm": 0.533889651298523, + "learning_rate": 8e-05, + "loss": 1.6298, + "step": 8660 + }, + { + "epoch": 0.4827759197324415, + "grad_norm": 0.5587199330329895, + "learning_rate": 8e-05, + "loss": 1.7957, + "step": 8661 + }, + { + "epoch": 0.4828316610925307, + "grad_norm": 0.5244656205177307, + "learning_rate": 8e-05, + "loss": 1.4407, + "step": 8662 + }, + { + "epoch": 0.48288740245261985, + "grad_norm": 0.5379255414009094, + "learning_rate": 8e-05, + "loss": 1.5525, + "step": 8663 + }, + { + "epoch": 0.48294314381270903, + "grad_norm": 0.5391064882278442, + "learning_rate": 8e-05, + "loss": 1.6002, + "step": 8664 + }, + { + "epoch": 0.4829988851727982, + "grad_norm": 0.5106524229049683, + "learning_rate": 8e-05, + "loss": 1.6711, + "step": 8665 + }, + { + "epoch": 0.4830546265328874, + "grad_norm": 0.5733168721199036, + "learning_rate": 8e-05, + "loss": 1.6508, + "step": 8666 + }, + { + "epoch": 0.48311036789297657, + "grad_norm": 0.5390850901603699, + "learning_rate": 8e-05, + "loss": 1.5811, + "step": 8667 + }, + { + "epoch": 0.4831661092530658, + "grad_norm": 0.5093480944633484, + "learning_rate": 8e-05, + "loss": 1.424, + "step": 8668 + }, + 
{ + "epoch": 0.483221850613155, + "grad_norm": 0.5804044008255005, + "learning_rate": 8e-05, + "loss": 2.0014, + "step": 8669 + }, + { + "epoch": 0.48327759197324416, + "grad_norm": 0.6597045063972473, + "learning_rate": 8e-05, + "loss": 2.1137, + "step": 8670 + }, + { + "epoch": 0.48333333333333334, + "grad_norm": 0.5206351280212402, + "learning_rate": 8e-05, + "loss": 1.5059, + "step": 8671 + }, + { + "epoch": 0.4833890746934225, + "grad_norm": 0.5995643138885498, + "learning_rate": 8e-05, + "loss": 1.505, + "step": 8672 + }, + { + "epoch": 0.4834448160535117, + "grad_norm": 0.5529085993766785, + "learning_rate": 8e-05, + "loss": 1.5522, + "step": 8673 + }, + { + "epoch": 0.4835005574136009, + "grad_norm": 0.5057050585746765, + "learning_rate": 8e-05, + "loss": 1.5109, + "step": 8674 + }, + { + "epoch": 0.48355629877369005, + "grad_norm": 0.5466620922088623, + "learning_rate": 8e-05, + "loss": 1.7737, + "step": 8675 + }, + { + "epoch": 0.4836120401337793, + "grad_norm": 0.5643208026885986, + "learning_rate": 8e-05, + "loss": 1.5089, + "step": 8676 + }, + { + "epoch": 0.48366778149386846, + "grad_norm": 0.5501327514648438, + "learning_rate": 8e-05, + "loss": 1.6649, + "step": 8677 + }, + { + "epoch": 0.48372352285395764, + "grad_norm": 0.5182400941848755, + "learning_rate": 8e-05, + "loss": 1.6111, + "step": 8678 + }, + { + "epoch": 0.4837792642140468, + "grad_norm": 0.5956326723098755, + "learning_rate": 8e-05, + "loss": 1.6881, + "step": 8679 + }, + { + "epoch": 0.483835005574136, + "grad_norm": 0.5279495716094971, + "learning_rate": 8e-05, + "loss": 1.542, + "step": 8680 + }, + { + "epoch": 0.4838907469342252, + "grad_norm": 0.537891149520874, + "learning_rate": 8e-05, + "loss": 1.651, + "step": 8681 + }, + { + "epoch": 0.48394648829431436, + "grad_norm": 0.5734157562255859, + "learning_rate": 8e-05, + "loss": 1.6414, + "step": 8682 + }, + { + "epoch": 0.4840022296544036, + "grad_norm": 0.5524048209190369, + "learning_rate": 8e-05, + "loss": 1.5915, + "step": 
8683 + }, + { + "epoch": 0.48405797101449277, + "grad_norm": 0.5250129103660583, + "learning_rate": 8e-05, + "loss": 1.7873, + "step": 8684 + }, + { + "epoch": 0.48411371237458195, + "grad_norm": 0.4993424415588379, + "learning_rate": 8e-05, + "loss": 1.5449, + "step": 8685 + }, + { + "epoch": 0.48416945373467113, + "grad_norm": 0.5354702472686768, + "learning_rate": 8e-05, + "loss": 1.6186, + "step": 8686 + }, + { + "epoch": 0.4842251950947603, + "grad_norm": 0.7202904224395752, + "learning_rate": 8e-05, + "loss": 1.5289, + "step": 8687 + }, + { + "epoch": 0.4842809364548495, + "grad_norm": 0.5833171010017395, + "learning_rate": 8e-05, + "loss": 1.7669, + "step": 8688 + }, + { + "epoch": 0.48433667781493867, + "grad_norm": 0.5454303026199341, + "learning_rate": 8e-05, + "loss": 1.5781, + "step": 8689 + }, + { + "epoch": 0.48439241917502784, + "grad_norm": 0.5747190117835999, + "learning_rate": 8e-05, + "loss": 1.8008, + "step": 8690 + }, + { + "epoch": 0.4844481605351171, + "grad_norm": 0.5227949023246765, + "learning_rate": 8e-05, + "loss": 1.7343, + "step": 8691 + }, + { + "epoch": 0.48450390189520626, + "grad_norm": 0.5211941599845886, + "learning_rate": 8e-05, + "loss": 1.773, + "step": 8692 + }, + { + "epoch": 0.48455964325529544, + "grad_norm": 0.511934220790863, + "learning_rate": 8e-05, + "loss": 1.6041, + "step": 8693 + }, + { + "epoch": 0.4846153846153846, + "grad_norm": 0.5146392583847046, + "learning_rate": 8e-05, + "loss": 1.5809, + "step": 8694 + }, + { + "epoch": 0.4846711259754738, + "grad_norm": 0.5503633618354797, + "learning_rate": 8e-05, + "loss": 1.6425, + "step": 8695 + }, + { + "epoch": 0.48472686733556297, + "grad_norm": 0.6414245963096619, + "learning_rate": 8e-05, + "loss": 1.6811, + "step": 8696 + }, + { + "epoch": 0.48478260869565215, + "grad_norm": 0.5404537916183472, + "learning_rate": 8e-05, + "loss": 1.7578, + "step": 8697 + }, + { + "epoch": 0.4848383500557414, + "grad_norm": 0.5513060688972473, + "learning_rate": 8e-05, + "loss": 
1.7655, + "step": 8698 + }, + { + "epoch": 0.48489409141583056, + "grad_norm": 0.6144512891769409, + "learning_rate": 8e-05, + "loss": 1.7569, + "step": 8699 + }, + { + "epoch": 0.48494983277591974, + "grad_norm": 0.5189093947410583, + "learning_rate": 8e-05, + "loss": 1.7075, + "step": 8700 + }, + { + "epoch": 0.4850055741360089, + "grad_norm": 0.5028544664382935, + "learning_rate": 8e-05, + "loss": 1.441, + "step": 8701 + }, + { + "epoch": 0.4850613154960981, + "grad_norm": 0.5841237902641296, + "learning_rate": 8e-05, + "loss": 1.7716, + "step": 8702 + }, + { + "epoch": 0.4851170568561873, + "grad_norm": 0.5114294290542603, + "learning_rate": 8e-05, + "loss": 1.4676, + "step": 8703 + }, + { + "epoch": 0.48517279821627646, + "grad_norm": 0.597727358341217, + "learning_rate": 8e-05, + "loss": 1.5788, + "step": 8704 + }, + { + "epoch": 0.48522853957636564, + "grad_norm": 0.5356034636497498, + "learning_rate": 8e-05, + "loss": 1.6905, + "step": 8705 + }, + { + "epoch": 0.48528428093645487, + "grad_norm": 0.5260893702507019, + "learning_rate": 8e-05, + "loss": 1.5722, + "step": 8706 + }, + { + "epoch": 0.48534002229654405, + "grad_norm": 0.5867304801940918, + "learning_rate": 8e-05, + "loss": 1.8526, + "step": 8707 + }, + { + "epoch": 0.48539576365663323, + "grad_norm": 0.5377045273780823, + "learning_rate": 8e-05, + "loss": 1.8251, + "step": 8708 + }, + { + "epoch": 0.4854515050167224, + "grad_norm": 0.5542226433753967, + "learning_rate": 8e-05, + "loss": 1.9328, + "step": 8709 + }, + { + "epoch": 0.4855072463768116, + "grad_norm": 0.6472811102867126, + "learning_rate": 8e-05, + "loss": 1.7699, + "step": 8710 + }, + { + "epoch": 0.48556298773690076, + "grad_norm": 0.5780431032180786, + "learning_rate": 8e-05, + "loss": 1.5492, + "step": 8711 + }, + { + "epoch": 0.48561872909698994, + "grad_norm": 0.6103213429450989, + "learning_rate": 8e-05, + "loss": 1.7291, + "step": 8712 + }, + { + "epoch": 0.4856744704570792, + "grad_norm": 0.4791732430458069, + "learning_rate": 
8e-05, + "loss": 1.4316, + "step": 8713 + }, + { + "epoch": 0.48573021181716836, + "grad_norm": 0.5246248245239258, + "learning_rate": 8e-05, + "loss": 1.5101, + "step": 8714 + }, + { + "epoch": 0.48578595317725753, + "grad_norm": 0.5121028423309326, + "learning_rate": 8e-05, + "loss": 1.6613, + "step": 8715 + }, + { + "epoch": 0.4858416945373467, + "grad_norm": 0.522429347038269, + "learning_rate": 8e-05, + "loss": 1.4679, + "step": 8716 + }, + { + "epoch": 0.4858974358974359, + "grad_norm": 0.5425698757171631, + "learning_rate": 8e-05, + "loss": 1.6178, + "step": 8717 + }, + { + "epoch": 0.48595317725752507, + "grad_norm": 0.5327525734901428, + "learning_rate": 8e-05, + "loss": 1.6725, + "step": 8718 + }, + { + "epoch": 0.48600891861761425, + "grad_norm": 0.5315846800804138, + "learning_rate": 8e-05, + "loss": 1.4996, + "step": 8719 + }, + { + "epoch": 0.48606465997770343, + "grad_norm": 0.6056599617004395, + "learning_rate": 8e-05, + "loss": 1.5545, + "step": 8720 + }, + { + "epoch": 0.48612040133779266, + "grad_norm": 0.535976767539978, + "learning_rate": 8e-05, + "loss": 1.6458, + "step": 8721 + }, + { + "epoch": 0.48617614269788184, + "grad_norm": 0.5951147079467773, + "learning_rate": 8e-05, + "loss": 1.8229, + "step": 8722 + }, + { + "epoch": 0.486231884057971, + "grad_norm": 0.5291656255722046, + "learning_rate": 8e-05, + "loss": 1.6449, + "step": 8723 + }, + { + "epoch": 0.4862876254180602, + "grad_norm": 0.4991705119609833, + "learning_rate": 8e-05, + "loss": 1.3861, + "step": 8724 + }, + { + "epoch": 0.4863433667781494, + "grad_norm": 0.538762629032135, + "learning_rate": 8e-05, + "loss": 1.802, + "step": 8725 + }, + { + "epoch": 0.48639910813823856, + "grad_norm": 0.5288955569267273, + "learning_rate": 8e-05, + "loss": 1.5841, + "step": 8726 + }, + { + "epoch": 0.48645484949832773, + "grad_norm": 0.5434188842773438, + "learning_rate": 8e-05, + "loss": 1.6843, + "step": 8727 + }, + { + "epoch": 0.48651059085841697, + "grad_norm": 0.5369993448257446, + 
"learning_rate": 8e-05, + "loss": 1.7506, + "step": 8728 + }, + { + "epoch": 0.48656633221850615, + "grad_norm": 0.5895953178405762, + "learning_rate": 8e-05, + "loss": 1.6276, + "step": 8729 + }, + { + "epoch": 0.4866220735785953, + "grad_norm": 0.5380129814147949, + "learning_rate": 8e-05, + "loss": 1.7047, + "step": 8730 + }, + { + "epoch": 0.4866778149386845, + "grad_norm": 0.5475728511810303, + "learning_rate": 8e-05, + "loss": 1.9109, + "step": 8731 + }, + { + "epoch": 0.4867335562987737, + "grad_norm": 0.5191141963005066, + "learning_rate": 8e-05, + "loss": 1.5978, + "step": 8732 + }, + { + "epoch": 0.48678929765886286, + "grad_norm": 0.5969076752662659, + "learning_rate": 8e-05, + "loss": 1.8285, + "step": 8733 + }, + { + "epoch": 0.48684503901895204, + "grad_norm": 0.49988871812820435, + "learning_rate": 8e-05, + "loss": 1.4702, + "step": 8734 + }, + { + "epoch": 0.4869007803790413, + "grad_norm": 0.5074156522750854, + "learning_rate": 8e-05, + "loss": 1.5778, + "step": 8735 + }, + { + "epoch": 0.48695652173913045, + "grad_norm": 0.5323285460472107, + "learning_rate": 8e-05, + "loss": 1.8674, + "step": 8736 + }, + { + "epoch": 0.48701226309921963, + "grad_norm": 0.5003831386566162, + "learning_rate": 8e-05, + "loss": 1.3662, + "step": 8737 + }, + { + "epoch": 0.4870680044593088, + "grad_norm": 0.5432720184326172, + "learning_rate": 8e-05, + "loss": 1.6234, + "step": 8738 + }, + { + "epoch": 0.487123745819398, + "grad_norm": 0.5797169804573059, + "learning_rate": 8e-05, + "loss": 1.8015, + "step": 8739 + }, + { + "epoch": 0.48717948717948717, + "grad_norm": 0.5347649455070496, + "learning_rate": 8e-05, + "loss": 1.7881, + "step": 8740 + }, + { + "epoch": 0.48723522853957635, + "grad_norm": 0.5669839978218079, + "learning_rate": 8e-05, + "loss": 1.6679, + "step": 8741 + }, + { + "epoch": 0.4872909698996655, + "grad_norm": 0.54695063829422, + "learning_rate": 8e-05, + "loss": 1.7627, + "step": 8742 + }, + { + "epoch": 0.48734671125975476, + "grad_norm": 
0.6649922132492065, + "learning_rate": 8e-05, + "loss": 2.0592, + "step": 8743 + }, + { + "epoch": 0.48740245261984394, + "grad_norm": 0.5175009369850159, + "learning_rate": 8e-05, + "loss": 1.4855, + "step": 8744 + }, + { + "epoch": 0.4874581939799331, + "grad_norm": 0.5442166924476624, + "learning_rate": 8e-05, + "loss": 1.4434, + "step": 8745 + }, + { + "epoch": 0.4875139353400223, + "grad_norm": 0.5374280214309692, + "learning_rate": 8e-05, + "loss": 1.4264, + "step": 8746 + }, + { + "epoch": 0.4875696767001115, + "grad_norm": 0.5635949373245239, + "learning_rate": 8e-05, + "loss": 1.7391, + "step": 8747 + }, + { + "epoch": 0.48762541806020065, + "grad_norm": 0.5555373430252075, + "learning_rate": 8e-05, + "loss": 1.5628, + "step": 8748 + }, + { + "epoch": 0.48768115942028983, + "grad_norm": 0.5785765051841736, + "learning_rate": 8e-05, + "loss": 1.6793, + "step": 8749 + }, + { + "epoch": 0.48773690078037907, + "grad_norm": 0.4377272427082062, + "learning_rate": 8e-05, + "loss": 0.8688, + "step": 8750 + }, + { + "epoch": 0.48779264214046825, + "grad_norm": 0.49414724111557007, + "learning_rate": 8e-05, + "loss": 1.4305, + "step": 8751 + }, + { + "epoch": 0.4878483835005574, + "grad_norm": 0.5328716039657593, + "learning_rate": 8e-05, + "loss": 1.458, + "step": 8752 + }, + { + "epoch": 0.4879041248606466, + "grad_norm": 0.5761605501174927, + "learning_rate": 8e-05, + "loss": 1.8068, + "step": 8753 + }, + { + "epoch": 0.4879598662207358, + "grad_norm": 0.47436168789863586, + "learning_rate": 8e-05, + "loss": 1.1729, + "step": 8754 + }, + { + "epoch": 0.48801560758082496, + "grad_norm": 0.5351918339729309, + "learning_rate": 8e-05, + "loss": 1.7398, + "step": 8755 + }, + { + "epoch": 0.48807134894091414, + "grad_norm": 0.6001772880554199, + "learning_rate": 8e-05, + "loss": 1.7006, + "step": 8756 + }, + { + "epoch": 0.4881270903010033, + "grad_norm": 0.5331566333770752, + "learning_rate": 8e-05, + "loss": 1.3804, + "step": 8757 + }, + { + "epoch": 
0.48818283166109255, + "grad_norm": 0.5804986953735352, + "learning_rate": 8e-05, + "loss": 1.7321, + "step": 8758 + }, + { + "epoch": 0.48823857302118173, + "grad_norm": 0.6281266212463379, + "learning_rate": 8e-05, + "loss": 1.5619, + "step": 8759 + }, + { + "epoch": 0.4882943143812709, + "grad_norm": 0.5269529819488525, + "learning_rate": 8e-05, + "loss": 1.5927, + "step": 8760 + }, + { + "epoch": 0.4883500557413601, + "grad_norm": 0.628817081451416, + "learning_rate": 8e-05, + "loss": 1.8212, + "step": 8761 + }, + { + "epoch": 0.48840579710144927, + "grad_norm": 0.5161125659942627, + "learning_rate": 8e-05, + "loss": 1.5617, + "step": 8762 + }, + { + "epoch": 0.48846153846153845, + "grad_norm": 0.5620384812355042, + "learning_rate": 8e-05, + "loss": 1.794, + "step": 8763 + }, + { + "epoch": 0.4885172798216276, + "grad_norm": 0.5161763429641724, + "learning_rate": 8e-05, + "loss": 1.4774, + "step": 8764 + }, + { + "epoch": 0.48857302118171686, + "grad_norm": 0.5568657517433167, + "learning_rate": 8e-05, + "loss": 1.7074, + "step": 8765 + }, + { + "epoch": 0.48862876254180604, + "grad_norm": 0.5740475058555603, + "learning_rate": 8e-05, + "loss": 1.6313, + "step": 8766 + }, + { + "epoch": 0.4886845039018952, + "grad_norm": 0.5516563653945923, + "learning_rate": 8e-05, + "loss": 1.6862, + "step": 8767 + }, + { + "epoch": 0.4887402452619844, + "grad_norm": 0.5552970170974731, + "learning_rate": 8e-05, + "loss": 1.7847, + "step": 8768 + }, + { + "epoch": 0.4887959866220736, + "grad_norm": 0.6541142463684082, + "learning_rate": 8e-05, + "loss": 1.9267, + "step": 8769 + }, + { + "epoch": 0.48885172798216275, + "grad_norm": 0.5554009675979614, + "learning_rate": 8e-05, + "loss": 1.8234, + "step": 8770 + }, + { + "epoch": 0.48890746934225193, + "grad_norm": 0.5801352858543396, + "learning_rate": 8e-05, + "loss": 1.7553, + "step": 8771 + }, + { + "epoch": 0.4889632107023411, + "grad_norm": 0.5918492674827576, + "learning_rate": 8e-05, + "loss": 1.5996, + "step": 8772 + 
}, + { + "epoch": 0.48901895206243035, + "grad_norm": 0.5583415627479553, + "learning_rate": 8e-05, + "loss": 1.6549, + "step": 8773 + }, + { + "epoch": 0.4890746934225195, + "grad_norm": 0.5346972346305847, + "learning_rate": 8e-05, + "loss": 1.867, + "step": 8774 + }, + { + "epoch": 0.4891304347826087, + "grad_norm": 0.5730275511741638, + "learning_rate": 8e-05, + "loss": 1.8839, + "step": 8775 + }, + { + "epoch": 0.4891861761426979, + "grad_norm": 0.5031627416610718, + "learning_rate": 8e-05, + "loss": 1.5669, + "step": 8776 + }, + { + "epoch": 0.48924191750278706, + "grad_norm": 0.5561438798904419, + "learning_rate": 8e-05, + "loss": 1.804, + "step": 8777 + }, + { + "epoch": 0.48929765886287624, + "grad_norm": 0.5484711527824402, + "learning_rate": 8e-05, + "loss": 1.7645, + "step": 8778 + }, + { + "epoch": 0.4893534002229654, + "grad_norm": 0.5624884963035583, + "learning_rate": 8e-05, + "loss": 1.551, + "step": 8779 + }, + { + "epoch": 0.48940914158305465, + "grad_norm": 0.5877845883369446, + "learning_rate": 8e-05, + "loss": 1.9774, + "step": 8780 + }, + { + "epoch": 0.48946488294314383, + "grad_norm": 0.557813823223114, + "learning_rate": 8e-05, + "loss": 1.7737, + "step": 8781 + }, + { + "epoch": 0.489520624303233, + "grad_norm": 0.5099141597747803, + "learning_rate": 8e-05, + "loss": 1.5899, + "step": 8782 + }, + { + "epoch": 0.4895763656633222, + "grad_norm": 0.5315130949020386, + "learning_rate": 8e-05, + "loss": 1.5977, + "step": 8783 + }, + { + "epoch": 0.48963210702341137, + "grad_norm": 0.5222629904747009, + "learning_rate": 8e-05, + "loss": 1.7737, + "step": 8784 + }, + { + "epoch": 0.48968784838350055, + "grad_norm": 0.5800378322601318, + "learning_rate": 8e-05, + "loss": 1.8103, + "step": 8785 + }, + { + "epoch": 0.4897435897435897, + "grad_norm": 0.5630063414573669, + "learning_rate": 8e-05, + "loss": 1.6663, + "step": 8786 + }, + { + "epoch": 0.4897993311036789, + "grad_norm": 0.5052672028541565, + "learning_rate": 8e-05, + "loss": 1.412, + 
"step": 8787 + }, + { + "epoch": 0.48985507246376814, + "grad_norm": 0.6610358953475952, + "learning_rate": 8e-05, + "loss": 2.2178, + "step": 8788 + }, + { + "epoch": 0.4899108138238573, + "grad_norm": 0.5247467756271362, + "learning_rate": 8e-05, + "loss": 1.6431, + "step": 8789 + }, + { + "epoch": 0.4899665551839465, + "grad_norm": 0.5510267019271851, + "learning_rate": 8e-05, + "loss": 1.7086, + "step": 8790 + }, + { + "epoch": 0.4900222965440357, + "grad_norm": 0.5938252210617065, + "learning_rate": 8e-05, + "loss": 1.7299, + "step": 8791 + }, + { + "epoch": 0.49007803790412485, + "grad_norm": 0.6057933568954468, + "learning_rate": 8e-05, + "loss": 1.9044, + "step": 8792 + }, + { + "epoch": 0.49013377926421403, + "grad_norm": 0.5210702419281006, + "learning_rate": 8e-05, + "loss": 1.6161, + "step": 8793 + }, + { + "epoch": 0.4901895206243032, + "grad_norm": 0.5469611883163452, + "learning_rate": 8e-05, + "loss": 1.5888, + "step": 8794 + }, + { + "epoch": 0.49024526198439244, + "grad_norm": 0.5292215347290039, + "learning_rate": 8e-05, + "loss": 1.6371, + "step": 8795 + }, + { + "epoch": 0.4903010033444816, + "grad_norm": 0.5211275219917297, + "learning_rate": 8e-05, + "loss": 1.5327, + "step": 8796 + }, + { + "epoch": 0.4903567447045708, + "grad_norm": 0.5240673422813416, + "learning_rate": 8e-05, + "loss": 1.6971, + "step": 8797 + }, + { + "epoch": 0.49041248606466, + "grad_norm": 0.4981025755405426, + "learning_rate": 8e-05, + "loss": 1.4021, + "step": 8798 + }, + { + "epoch": 0.49046822742474916, + "grad_norm": 0.5216448307037354, + "learning_rate": 8e-05, + "loss": 1.4958, + "step": 8799 + }, + { + "epoch": 0.49052396878483834, + "grad_norm": 0.531890332698822, + "learning_rate": 8e-05, + "loss": 1.6907, + "step": 8800 + }, + { + "epoch": 0.4905797101449275, + "grad_norm": 0.5103880763053894, + "learning_rate": 8e-05, + "loss": 1.3802, + "step": 8801 + }, + { + "epoch": 0.4906354515050167, + "grad_norm": 0.5482351183891296, + "learning_rate": 8e-05, + 
"loss": 1.5857, + "step": 8802 + }, + { + "epoch": 0.49069119286510593, + "grad_norm": 0.6087172031402588, + "learning_rate": 8e-05, + "loss": 1.4996, + "step": 8803 + }, + { + "epoch": 0.4907469342251951, + "grad_norm": 0.5360320210456848, + "learning_rate": 8e-05, + "loss": 1.6422, + "step": 8804 + }, + { + "epoch": 0.4908026755852843, + "grad_norm": 0.557442843914032, + "learning_rate": 8e-05, + "loss": 1.7739, + "step": 8805 + }, + { + "epoch": 0.49085841694537347, + "grad_norm": 0.5476164817810059, + "learning_rate": 8e-05, + "loss": 1.6352, + "step": 8806 + }, + { + "epoch": 0.49091415830546264, + "grad_norm": 0.5663220286369324, + "learning_rate": 8e-05, + "loss": 1.7057, + "step": 8807 + }, + { + "epoch": 0.4909698996655518, + "grad_norm": 0.5578323602676392, + "learning_rate": 8e-05, + "loss": 1.4855, + "step": 8808 + }, + { + "epoch": 0.491025641025641, + "grad_norm": 0.7541027069091797, + "learning_rate": 8e-05, + "loss": 1.7219, + "step": 8809 + }, + { + "epoch": 0.49108138238573024, + "grad_norm": 0.5288164615631104, + "learning_rate": 8e-05, + "loss": 1.4917, + "step": 8810 + }, + { + "epoch": 0.4911371237458194, + "grad_norm": 0.5490350127220154, + "learning_rate": 8e-05, + "loss": 1.7237, + "step": 8811 + }, + { + "epoch": 0.4911928651059086, + "grad_norm": 0.5328288078308105, + "learning_rate": 8e-05, + "loss": 1.6545, + "step": 8812 + }, + { + "epoch": 0.4912486064659978, + "grad_norm": 0.6055459380149841, + "learning_rate": 8e-05, + "loss": 1.5455, + "step": 8813 + }, + { + "epoch": 0.49130434782608695, + "grad_norm": 0.5313544273376465, + "learning_rate": 8e-05, + "loss": 1.5263, + "step": 8814 + }, + { + "epoch": 0.49136008918617613, + "grad_norm": 0.5419933199882507, + "learning_rate": 8e-05, + "loss": 1.7685, + "step": 8815 + }, + { + "epoch": 0.4914158305462653, + "grad_norm": 0.5350378155708313, + "learning_rate": 8e-05, + "loss": 1.5742, + "step": 8816 + }, + { + "epoch": 0.4914715719063545, + "grad_norm": 0.5508986711502075, + 
"learning_rate": 8e-05, + "loss": 1.7074, + "step": 8817 + }, + { + "epoch": 0.4915273132664437, + "grad_norm": 0.5655425786972046, + "learning_rate": 8e-05, + "loss": 1.7255, + "step": 8818 + }, + { + "epoch": 0.4915830546265329, + "grad_norm": 0.5334153175354004, + "learning_rate": 8e-05, + "loss": 1.534, + "step": 8819 + }, + { + "epoch": 0.4916387959866221, + "grad_norm": 0.5287074446678162, + "learning_rate": 8e-05, + "loss": 1.5271, + "step": 8820 + }, + { + "epoch": 0.49169453734671126, + "grad_norm": 0.5889909863471985, + "learning_rate": 8e-05, + "loss": 1.7964, + "step": 8821 + }, + { + "epoch": 0.49175027870680044, + "grad_norm": 0.5158841013908386, + "learning_rate": 8e-05, + "loss": 1.5585, + "step": 8822 + }, + { + "epoch": 0.4918060200668896, + "grad_norm": 0.5091415643692017, + "learning_rate": 8e-05, + "loss": 1.5794, + "step": 8823 + }, + { + "epoch": 0.4918617614269788, + "grad_norm": 0.5212059617042542, + "learning_rate": 8e-05, + "loss": 1.62, + "step": 8824 + }, + { + "epoch": 0.49191750278706803, + "grad_norm": 0.5812239050865173, + "learning_rate": 8e-05, + "loss": 1.8132, + "step": 8825 + }, + { + "epoch": 0.4919732441471572, + "grad_norm": 0.5414086580276489, + "learning_rate": 8e-05, + "loss": 1.5705, + "step": 8826 + }, + { + "epoch": 0.4920289855072464, + "grad_norm": 0.5411535501480103, + "learning_rate": 8e-05, + "loss": 1.6643, + "step": 8827 + }, + { + "epoch": 0.49208472686733556, + "grad_norm": 0.5157851576805115, + "learning_rate": 8e-05, + "loss": 1.7066, + "step": 8828 + }, + { + "epoch": 0.49214046822742474, + "grad_norm": 0.5341320633888245, + "learning_rate": 8e-05, + "loss": 1.772, + "step": 8829 + }, + { + "epoch": 0.4921962095875139, + "grad_norm": 0.5654478669166565, + "learning_rate": 8e-05, + "loss": 1.7265, + "step": 8830 + }, + { + "epoch": 0.4922519509476031, + "grad_norm": 0.5648899078369141, + "learning_rate": 8e-05, + "loss": 1.6301, + "step": 8831 + }, + { + "epoch": 0.49230769230769234, + "grad_norm": 
0.5131524801254272, + "learning_rate": 8e-05, + "loss": 1.648, + "step": 8832 + }, + { + "epoch": 0.4923634336677815, + "grad_norm": 0.49470341205596924, + "learning_rate": 8e-05, + "loss": 1.583, + "step": 8833 + }, + { + "epoch": 0.4924191750278707, + "grad_norm": 0.5371274352073669, + "learning_rate": 8e-05, + "loss": 1.743, + "step": 8834 + }, + { + "epoch": 0.49247491638795987, + "grad_norm": 0.5869753956794739, + "learning_rate": 8e-05, + "loss": 1.6407, + "step": 8835 + }, + { + "epoch": 0.49253065774804905, + "grad_norm": 0.49453967809677124, + "learning_rate": 8e-05, + "loss": 1.4267, + "step": 8836 + }, + { + "epoch": 0.49258639910813823, + "grad_norm": 0.5081074237823486, + "learning_rate": 8e-05, + "loss": 1.5216, + "step": 8837 + }, + { + "epoch": 0.4926421404682274, + "grad_norm": 0.5137898325920105, + "learning_rate": 8e-05, + "loss": 1.5087, + "step": 8838 + }, + { + "epoch": 0.4926978818283166, + "grad_norm": 0.5767127275466919, + "learning_rate": 8e-05, + "loss": 1.6743, + "step": 8839 + }, + { + "epoch": 0.4927536231884058, + "grad_norm": 0.5548331141471863, + "learning_rate": 8e-05, + "loss": 1.7111, + "step": 8840 + }, + { + "epoch": 0.492809364548495, + "grad_norm": 0.5236859321594238, + "learning_rate": 8e-05, + "loss": 1.6453, + "step": 8841 + }, + { + "epoch": 0.4928651059085842, + "grad_norm": 0.5255964994430542, + "learning_rate": 8e-05, + "loss": 1.7073, + "step": 8842 + }, + { + "epoch": 0.49292084726867336, + "grad_norm": 0.5545988082885742, + "learning_rate": 8e-05, + "loss": 1.6276, + "step": 8843 + }, + { + "epoch": 0.49297658862876254, + "grad_norm": 0.5834181308746338, + "learning_rate": 8e-05, + "loss": 1.727, + "step": 8844 + }, + { + "epoch": 0.4930323299888517, + "grad_norm": 0.5687046647071838, + "learning_rate": 8e-05, + "loss": 1.7085, + "step": 8845 + }, + { + "epoch": 0.4930880713489409, + "grad_norm": 0.5452062487602234, + "learning_rate": 8e-05, + "loss": 1.6304, + "step": 8846 + }, + { + "epoch": 0.4931438127090301, + 
"grad_norm": 0.5048710107803345, + "learning_rate": 8e-05, + "loss": 1.3044, + "step": 8847 + }, + { + "epoch": 0.4931995540691193, + "grad_norm": 0.534052848815918, + "learning_rate": 8e-05, + "loss": 1.7413, + "step": 8848 + }, + { + "epoch": 0.4932552954292085, + "grad_norm": 0.6282777190208435, + "learning_rate": 8e-05, + "loss": 2.0405, + "step": 8849 + }, + { + "epoch": 0.49331103678929766, + "grad_norm": 0.5243322253227234, + "learning_rate": 8e-05, + "loss": 1.4643, + "step": 8850 + }, + { + "epoch": 0.49336677814938684, + "grad_norm": 0.5178560018539429, + "learning_rate": 8e-05, + "loss": 1.5601, + "step": 8851 + }, + { + "epoch": 0.493422519509476, + "grad_norm": 0.5159735679626465, + "learning_rate": 8e-05, + "loss": 1.6775, + "step": 8852 + }, + { + "epoch": 0.4934782608695652, + "grad_norm": 0.5154753923416138, + "learning_rate": 8e-05, + "loss": 1.4601, + "step": 8853 + }, + { + "epoch": 0.4935340022296544, + "grad_norm": 0.5437743067741394, + "learning_rate": 8e-05, + "loss": 1.4441, + "step": 8854 + }, + { + "epoch": 0.4935897435897436, + "grad_norm": 0.5819849371910095, + "learning_rate": 8e-05, + "loss": 1.7588, + "step": 8855 + }, + { + "epoch": 0.4936454849498328, + "grad_norm": 0.5482854247093201, + "learning_rate": 8e-05, + "loss": 1.6496, + "step": 8856 + }, + { + "epoch": 0.49370122630992197, + "grad_norm": 0.4933509826660156, + "learning_rate": 8e-05, + "loss": 1.6415, + "step": 8857 + }, + { + "epoch": 0.49375696767001115, + "grad_norm": 0.5568447113037109, + "learning_rate": 8e-05, + "loss": 1.7922, + "step": 8858 + }, + { + "epoch": 0.4938127090301003, + "grad_norm": 0.548936665058136, + "learning_rate": 8e-05, + "loss": 1.6704, + "step": 8859 + }, + { + "epoch": 0.4938684503901895, + "grad_norm": 0.542045533657074, + "learning_rate": 8e-05, + "loss": 1.6765, + "step": 8860 + }, + { + "epoch": 0.4939241917502787, + "grad_norm": 0.5615548491477966, + "learning_rate": 8e-05, + "loss": 1.6885, + "step": 8861 + }, + { + "epoch": 
0.4939799331103679, + "grad_norm": 0.5786969661712646, + "learning_rate": 8e-05, + "loss": 1.6795, + "step": 8862 + }, + { + "epoch": 0.4940356744704571, + "grad_norm": 0.5402855277061462, + "learning_rate": 8e-05, + "loss": 1.6715, + "step": 8863 + }, + { + "epoch": 0.4940914158305463, + "grad_norm": 0.5332042574882507, + "learning_rate": 8e-05, + "loss": 1.5561, + "step": 8864 + }, + { + "epoch": 0.49414715719063546, + "grad_norm": 0.5454310178756714, + "learning_rate": 8e-05, + "loss": 1.6758, + "step": 8865 + }, + { + "epoch": 0.49420289855072463, + "grad_norm": 0.5213438868522644, + "learning_rate": 8e-05, + "loss": 1.5993, + "step": 8866 + }, + { + "epoch": 0.4942586399108138, + "grad_norm": 0.550634503364563, + "learning_rate": 8e-05, + "loss": 1.7791, + "step": 8867 + }, + { + "epoch": 0.494314381270903, + "grad_norm": 0.4944699704647064, + "learning_rate": 8e-05, + "loss": 1.4123, + "step": 8868 + }, + { + "epoch": 0.49437012263099217, + "grad_norm": 0.5702661275863647, + "learning_rate": 8e-05, + "loss": 1.9546, + "step": 8869 + }, + { + "epoch": 0.4944258639910814, + "grad_norm": 0.5053265690803528, + "learning_rate": 8e-05, + "loss": 1.5537, + "step": 8870 + }, + { + "epoch": 0.4944816053511706, + "grad_norm": 0.5766624808311462, + "learning_rate": 8e-05, + "loss": 1.957, + "step": 8871 + }, + { + "epoch": 0.49453734671125976, + "grad_norm": 0.5471301078796387, + "learning_rate": 8e-05, + "loss": 1.695, + "step": 8872 + }, + { + "epoch": 0.49459308807134894, + "grad_norm": 0.5663468837738037, + "learning_rate": 8e-05, + "loss": 1.5985, + "step": 8873 + }, + { + "epoch": 0.4946488294314381, + "grad_norm": 0.513062596321106, + "learning_rate": 8e-05, + "loss": 1.5668, + "step": 8874 + }, + { + "epoch": 0.4947045707915273, + "grad_norm": 0.5351914763450623, + "learning_rate": 8e-05, + "loss": 1.8147, + "step": 8875 + }, + { + "epoch": 0.4947603121516165, + "grad_norm": 0.5259719491004944, + "learning_rate": 8e-05, + "loss": 1.5631, + "step": 8876 + }, + { 
+ "epoch": 0.4948160535117057, + "grad_norm": 0.527617871761322, + "learning_rate": 8e-05, + "loss": 1.484, + "step": 8877 + }, + { + "epoch": 0.4948717948717949, + "grad_norm": 0.529051661491394, + "learning_rate": 8e-05, + "loss": 1.6024, + "step": 8878 + }, + { + "epoch": 0.49492753623188407, + "grad_norm": 0.5139374136924744, + "learning_rate": 8e-05, + "loss": 1.4492, + "step": 8879 + }, + { + "epoch": 0.49498327759197325, + "grad_norm": 0.523256778717041, + "learning_rate": 8e-05, + "loss": 1.6188, + "step": 8880 + }, + { + "epoch": 0.4950390189520624, + "grad_norm": 0.5505326390266418, + "learning_rate": 8e-05, + "loss": 1.675, + "step": 8881 + }, + { + "epoch": 0.4950947603121516, + "grad_norm": 0.5377850532531738, + "learning_rate": 8e-05, + "loss": 1.6074, + "step": 8882 + }, + { + "epoch": 0.4951505016722408, + "grad_norm": 0.5315950512886047, + "learning_rate": 8e-05, + "loss": 1.5816, + "step": 8883 + }, + { + "epoch": 0.49520624303232996, + "grad_norm": 0.6018045544624329, + "learning_rate": 8e-05, + "loss": 1.7899, + "step": 8884 + }, + { + "epoch": 0.4952619843924192, + "grad_norm": 0.579967200756073, + "learning_rate": 8e-05, + "loss": 1.6885, + "step": 8885 + }, + { + "epoch": 0.4953177257525084, + "grad_norm": 0.593023955821991, + "learning_rate": 8e-05, + "loss": 1.7914, + "step": 8886 + }, + { + "epoch": 0.49537346711259755, + "grad_norm": 0.5540221333503723, + "learning_rate": 8e-05, + "loss": 1.7083, + "step": 8887 + }, + { + "epoch": 0.49542920847268673, + "grad_norm": 0.5043448805809021, + "learning_rate": 8e-05, + "loss": 1.5477, + "step": 8888 + }, + { + "epoch": 0.4954849498327759, + "grad_norm": 0.6106610894203186, + "learning_rate": 8e-05, + "loss": 1.9431, + "step": 8889 + }, + { + "epoch": 0.4955406911928651, + "grad_norm": 0.5356171131134033, + "learning_rate": 8e-05, + "loss": 1.6505, + "step": 8890 + }, + { + "epoch": 0.49559643255295427, + "grad_norm": 0.5981549620628357, + "learning_rate": 8e-05, + "loss": 1.8112, + "step": 8891 
+ }, + { + "epoch": 0.4956521739130435, + "grad_norm": 0.5713699460029602, + "learning_rate": 8e-05, + "loss": 1.4266, + "step": 8892 + }, + { + "epoch": 0.4957079152731327, + "grad_norm": 0.5626436471939087, + "learning_rate": 8e-05, + "loss": 1.8815, + "step": 8893 + }, + { + "epoch": 0.49576365663322186, + "grad_norm": 0.5311765670776367, + "learning_rate": 8e-05, + "loss": 1.6903, + "step": 8894 + }, + { + "epoch": 0.49581939799331104, + "grad_norm": 0.5413386225700378, + "learning_rate": 8e-05, + "loss": 1.5235, + "step": 8895 + }, + { + "epoch": 0.4958751393534002, + "grad_norm": 0.4782915413379669, + "learning_rate": 8e-05, + "loss": 1.2485, + "step": 8896 + }, + { + "epoch": 0.4959308807134894, + "grad_norm": 0.5347909331321716, + "learning_rate": 8e-05, + "loss": 1.6659, + "step": 8897 + }, + { + "epoch": 0.4959866220735786, + "grad_norm": 0.5323268175125122, + "learning_rate": 8e-05, + "loss": 1.7437, + "step": 8898 + }, + { + "epoch": 0.49604236343366775, + "grad_norm": 0.5715739727020264, + "learning_rate": 8e-05, + "loss": 1.7099, + "step": 8899 + }, + { + "epoch": 0.496098104793757, + "grad_norm": 0.7310459613800049, + "learning_rate": 8e-05, + "loss": 1.7719, + "step": 8900 + }, + { + "epoch": 0.49615384615384617, + "grad_norm": 0.598199188709259, + "learning_rate": 8e-05, + "loss": 1.6053, + "step": 8901 + }, + { + "epoch": 0.49620958751393535, + "grad_norm": 0.5783191919326782, + "learning_rate": 8e-05, + "loss": 1.6202, + "step": 8902 + }, + { + "epoch": 0.4962653288740245, + "grad_norm": 0.533246636390686, + "learning_rate": 8e-05, + "loss": 1.4843, + "step": 8903 + }, + { + "epoch": 0.4963210702341137, + "grad_norm": 0.615612804889679, + "learning_rate": 8e-05, + "loss": 1.9378, + "step": 8904 + }, + { + "epoch": 0.4963768115942029, + "grad_norm": 0.4893304407596588, + "learning_rate": 8e-05, + "loss": 1.4438, + "step": 8905 + }, + { + "epoch": 0.49643255295429206, + "grad_norm": 0.5676378607749939, + "learning_rate": 8e-05, + "loss": 1.7779, + 
"step": 8906 + }, + { + "epoch": 0.4964882943143813, + "grad_norm": 0.5795214772224426, + "learning_rate": 8e-05, + "loss": 1.7449, + "step": 8907 + }, + { + "epoch": 0.4965440356744705, + "grad_norm": 0.5832824110984802, + "learning_rate": 8e-05, + "loss": 1.8669, + "step": 8908 + }, + { + "epoch": 0.49659977703455965, + "grad_norm": 0.7791319489479065, + "learning_rate": 8e-05, + "loss": 1.6592, + "step": 8909 + }, + { + "epoch": 0.49665551839464883, + "grad_norm": 0.5205925703048706, + "learning_rate": 8e-05, + "loss": 1.6496, + "step": 8910 + }, + { + "epoch": 0.496711259754738, + "grad_norm": 0.5032536387443542, + "learning_rate": 8e-05, + "loss": 1.4186, + "step": 8911 + }, + { + "epoch": 0.4967670011148272, + "grad_norm": 0.5437055826187134, + "learning_rate": 8e-05, + "loss": 1.7029, + "step": 8912 + }, + { + "epoch": 0.49682274247491637, + "grad_norm": 0.5419133305549622, + "learning_rate": 8e-05, + "loss": 1.7818, + "step": 8913 + }, + { + "epoch": 0.49687848383500555, + "grad_norm": 0.5332748889923096, + "learning_rate": 8e-05, + "loss": 1.4844, + "step": 8914 + }, + { + "epoch": 0.4969342251950948, + "grad_norm": 0.5628235936164856, + "learning_rate": 8e-05, + "loss": 1.7583, + "step": 8915 + }, + { + "epoch": 0.49698996655518396, + "grad_norm": 0.5357251763343811, + "learning_rate": 8e-05, + "loss": 1.9001, + "step": 8916 + }, + { + "epoch": 0.49704570791527314, + "grad_norm": 0.5667791366577148, + "learning_rate": 8e-05, + "loss": 1.7927, + "step": 8917 + }, + { + "epoch": 0.4971014492753623, + "grad_norm": 0.4990066587924957, + "learning_rate": 8e-05, + "loss": 1.4913, + "step": 8918 + }, + { + "epoch": 0.4971571906354515, + "grad_norm": 0.5187414288520813, + "learning_rate": 8e-05, + "loss": 1.4935, + "step": 8919 + }, + { + "epoch": 0.4972129319955407, + "grad_norm": 0.5602273344993591, + "learning_rate": 8e-05, + "loss": 1.6549, + "step": 8920 + }, + { + "epoch": 0.49726867335562985, + "grad_norm": 0.5536525249481201, + "learning_rate": 8e-05, + 
"loss": 1.5933, + "step": 8921 + }, + { + "epoch": 0.4973244147157191, + "grad_norm": 0.5444284677505493, + "learning_rate": 8e-05, + "loss": 1.6299, + "step": 8922 + }, + { + "epoch": 0.49738015607580827, + "grad_norm": 0.5906103849411011, + "learning_rate": 8e-05, + "loss": 1.7832, + "step": 8923 + }, + { + "epoch": 0.49743589743589745, + "grad_norm": 0.5382599234580994, + "learning_rate": 8e-05, + "loss": 1.6533, + "step": 8924 + }, + { + "epoch": 0.4974916387959866, + "grad_norm": 0.5361124873161316, + "learning_rate": 8e-05, + "loss": 1.6416, + "step": 8925 + }, + { + "epoch": 0.4975473801560758, + "grad_norm": 0.5967959761619568, + "learning_rate": 8e-05, + "loss": 1.9175, + "step": 8926 + }, + { + "epoch": 0.497603121516165, + "grad_norm": 0.5135853886604309, + "learning_rate": 8e-05, + "loss": 1.545, + "step": 8927 + }, + { + "epoch": 0.49765886287625416, + "grad_norm": 0.5927615761756897, + "learning_rate": 8e-05, + "loss": 1.9414, + "step": 8928 + }, + { + "epoch": 0.4977146042363434, + "grad_norm": 0.5337189435958862, + "learning_rate": 8e-05, + "loss": 1.7404, + "step": 8929 + }, + { + "epoch": 0.4977703455964326, + "grad_norm": 0.5466311573982239, + "learning_rate": 8e-05, + "loss": 1.7012, + "step": 8930 + }, + { + "epoch": 0.49782608695652175, + "grad_norm": 0.4856451153755188, + "learning_rate": 8e-05, + "loss": 1.572, + "step": 8931 + }, + { + "epoch": 0.49788182831661093, + "grad_norm": 0.5938416123390198, + "learning_rate": 8e-05, + "loss": 1.831, + "step": 8932 + }, + { + "epoch": 0.4979375696767001, + "grad_norm": 0.46626678109169006, + "learning_rate": 8e-05, + "loss": 1.2963, + "step": 8933 + }, + { + "epoch": 0.4979933110367893, + "grad_norm": 0.6333751678466797, + "learning_rate": 8e-05, + "loss": 1.4346, + "step": 8934 + }, + { + "epoch": 0.49804905239687847, + "grad_norm": 0.5655824542045593, + "learning_rate": 8e-05, + "loss": 1.8434, + "step": 8935 + }, + { + "epoch": 0.49810479375696765, + "grad_norm": 0.5203345417976379, + 
"learning_rate": 8e-05, + "loss": 1.7327, + "step": 8936 + }, + { + "epoch": 0.4981605351170569, + "grad_norm": 0.5423237085342407, + "learning_rate": 8e-05, + "loss": 1.6376, + "step": 8937 + }, + { + "epoch": 0.49821627647714606, + "grad_norm": 0.5489369630813599, + "learning_rate": 8e-05, + "loss": 1.7333, + "step": 8938 + }, + { + "epoch": 0.49827201783723524, + "grad_norm": 0.5339183807373047, + "learning_rate": 8e-05, + "loss": 1.6599, + "step": 8939 + }, + { + "epoch": 0.4983277591973244, + "grad_norm": 0.5917924642562866, + "learning_rate": 8e-05, + "loss": 2.0178, + "step": 8940 + }, + { + "epoch": 0.4983835005574136, + "grad_norm": 0.5788326263427734, + "learning_rate": 8e-05, + "loss": 1.607, + "step": 8941 + }, + { + "epoch": 0.4984392419175028, + "grad_norm": 0.49358728528022766, + "learning_rate": 8e-05, + "loss": 1.4111, + "step": 8942 + }, + { + "epoch": 0.49849498327759195, + "grad_norm": 0.5892625451087952, + "learning_rate": 8e-05, + "loss": 1.8303, + "step": 8943 + }, + { + "epoch": 0.4985507246376812, + "grad_norm": 0.6209052205085754, + "learning_rate": 8e-05, + "loss": 1.8335, + "step": 8944 + }, + { + "epoch": 0.49860646599777037, + "grad_norm": 0.5494616031646729, + "learning_rate": 8e-05, + "loss": 1.8625, + "step": 8945 + }, + { + "epoch": 0.49866220735785954, + "grad_norm": 0.5134598612785339, + "learning_rate": 8e-05, + "loss": 1.5633, + "step": 8946 + }, + { + "epoch": 0.4987179487179487, + "grad_norm": 0.5650904178619385, + "learning_rate": 8e-05, + "loss": 1.6744, + "step": 8947 + }, + { + "epoch": 0.4987736900780379, + "grad_norm": 0.545432984828949, + "learning_rate": 8e-05, + "loss": 1.9893, + "step": 8948 + }, + { + "epoch": 0.4988294314381271, + "grad_norm": 0.5759139657020569, + "learning_rate": 8e-05, + "loss": 1.624, + "step": 8949 + }, + { + "epoch": 0.49888517279821626, + "grad_norm": 0.5109109878540039, + "learning_rate": 8e-05, + "loss": 1.4989, + "step": 8950 + }, + { + "epoch": 0.49894091415830544, + "grad_norm": 
0.48945099115371704, + "learning_rate": 8e-05, + "loss": 1.3881, + "step": 8951 + }, + { + "epoch": 0.49899665551839467, + "grad_norm": 0.532191812992096, + "learning_rate": 8e-05, + "loss": 1.5542, + "step": 8952 + }, + { + "epoch": 0.49905239687848385, + "grad_norm": 0.9853920340538025, + "learning_rate": 8e-05, + "loss": 1.6086, + "step": 8953 + }, + { + "epoch": 0.49910813823857303, + "grad_norm": 0.5455578565597534, + "learning_rate": 8e-05, + "loss": 1.3766, + "step": 8954 + }, + { + "epoch": 0.4991638795986622, + "grad_norm": 0.5894210338592529, + "learning_rate": 8e-05, + "loss": 1.7001, + "step": 8955 + }, + { + "epoch": 0.4992196209587514, + "grad_norm": 0.5502829551696777, + "learning_rate": 8e-05, + "loss": 1.6782, + "step": 8956 + }, + { + "epoch": 0.49927536231884057, + "grad_norm": 0.5422848463058472, + "learning_rate": 8e-05, + "loss": 1.5955, + "step": 8957 + }, + { + "epoch": 0.49933110367892974, + "grad_norm": 0.577444851398468, + "learning_rate": 8e-05, + "loss": 1.8286, + "step": 8958 + }, + { + "epoch": 0.499386845039019, + "grad_norm": 0.5444875359535217, + "learning_rate": 8e-05, + "loss": 1.5021, + "step": 8959 + }, + { + "epoch": 0.49944258639910816, + "grad_norm": 0.5610191822052002, + "learning_rate": 8e-05, + "loss": 1.7752, + "step": 8960 + }, + { + "epoch": 0.49949832775919734, + "grad_norm": 0.5572718977928162, + "learning_rate": 8e-05, + "loss": 1.7961, + "step": 8961 + }, + { + "epoch": 0.4995540691192865, + "grad_norm": 0.5246376991271973, + "learning_rate": 8e-05, + "loss": 1.2681, + "step": 8962 + }, + { + "epoch": 0.4996098104793757, + "grad_norm": 0.5913357138633728, + "learning_rate": 8e-05, + "loss": 1.7517, + "step": 8963 + }, + { + "epoch": 0.49966555183946487, + "grad_norm": 0.520278811454773, + "learning_rate": 8e-05, + "loss": 1.5818, + "step": 8964 + }, + { + "epoch": 0.49972129319955405, + "grad_norm": 0.5451558232307434, + "learning_rate": 8e-05, + "loss": 1.6493, + "step": 8965 + }, + { + "epoch": 
0.49977703455964323, + "grad_norm": 0.5383256077766418, + "learning_rate": 8e-05, + "loss": 1.9797, + "step": 8966 + }, + { + "epoch": 0.49983277591973246, + "grad_norm": 0.5744273066520691, + "learning_rate": 8e-05, + "loss": 1.8506, + "step": 8967 + }, + { + "epoch": 0.49988851727982164, + "grad_norm": 0.6074442267417908, + "learning_rate": 8e-05, + "loss": 1.9342, + "step": 8968 + }, + { + "epoch": 0.4999442586399108, + "grad_norm": 0.5620627999305725, + "learning_rate": 8e-05, + "loss": 1.7157, + "step": 8969 + }, + { + "epoch": 0.5, + "grad_norm": 0.5319579243659973, + "learning_rate": 8e-05, + "loss": 1.5882, + "step": 8970 + }, + { + "epoch": 0.5000557413600892, + "grad_norm": 0.5307610034942627, + "learning_rate": 8e-05, + "loss": 1.6502, + "step": 8971 + }, + { + "epoch": 0.5001114827201784, + "grad_norm": 0.5093584060668945, + "learning_rate": 8e-05, + "loss": 1.4701, + "step": 8972 + }, + { + "epoch": 0.5001672240802676, + "grad_norm": 0.5648424029350281, + "learning_rate": 8e-05, + "loss": 1.6485, + "step": 8973 + }, + { + "epoch": 0.5002229654403567, + "grad_norm": 0.5174546241760254, + "learning_rate": 8e-05, + "loss": 1.6417, + "step": 8974 + }, + { + "epoch": 0.500278706800446, + "grad_norm": 0.5817108154296875, + "learning_rate": 8e-05, + "loss": 1.5185, + "step": 8975 + }, + { + "epoch": 0.5003344481605351, + "grad_norm": 0.5302509665489197, + "learning_rate": 8e-05, + "loss": 1.7699, + "step": 8976 + }, + { + "epoch": 0.5003901895206243, + "grad_norm": 0.5956892967224121, + "learning_rate": 8e-05, + "loss": 1.6252, + "step": 8977 + }, + { + "epoch": 0.5004459308807135, + "grad_norm": 0.564408540725708, + "learning_rate": 8e-05, + "loss": 1.4983, + "step": 8978 + }, + { + "epoch": 0.5005016722408027, + "grad_norm": 0.5849278569221497, + "learning_rate": 8e-05, + "loss": 1.7398, + "step": 8979 + }, + { + "epoch": 0.5005574136008919, + "grad_norm": 0.5324124097824097, + "learning_rate": 8e-05, + "loss": 1.6415, + "step": 8980 + }, + { + "epoch": 
0.500613154960981, + "grad_norm": 0.5535297393798828, + "learning_rate": 8e-05, + "loss": 1.6123, + "step": 8981 + }, + { + "epoch": 0.5006688963210703, + "grad_norm": 0.5630205273628235, + "learning_rate": 8e-05, + "loss": 1.7372, + "step": 8982 + }, + { + "epoch": 0.5007246376811594, + "grad_norm": 0.5356989502906799, + "learning_rate": 8e-05, + "loss": 1.7165, + "step": 8983 + }, + { + "epoch": 0.5007803790412486, + "grad_norm": 0.6177685856819153, + "learning_rate": 8e-05, + "loss": 1.8803, + "step": 8984 + }, + { + "epoch": 0.5008361204013378, + "grad_norm": 0.5300348401069641, + "learning_rate": 8e-05, + "loss": 1.5568, + "step": 8985 + }, + { + "epoch": 0.500891861761427, + "grad_norm": 0.5672892332077026, + "learning_rate": 8e-05, + "loss": 1.7536, + "step": 8986 + }, + { + "epoch": 0.5009476031215162, + "grad_norm": 0.5484430193901062, + "learning_rate": 8e-05, + "loss": 1.6806, + "step": 8987 + }, + { + "epoch": 0.5010033444816053, + "grad_norm": 0.49196168780326843, + "learning_rate": 8e-05, + "loss": 1.4351, + "step": 8988 + }, + { + "epoch": 0.5010590858416946, + "grad_norm": 0.49675580859184265, + "learning_rate": 8e-05, + "loss": 1.5384, + "step": 8989 + }, + { + "epoch": 0.5011148272017837, + "grad_norm": 0.5410286784172058, + "learning_rate": 8e-05, + "loss": 1.7398, + "step": 8990 + }, + { + "epoch": 0.5011705685618729, + "grad_norm": 0.5007629990577698, + "learning_rate": 8e-05, + "loss": 1.6711, + "step": 8991 + }, + { + "epoch": 0.501226309921962, + "grad_norm": 0.5875619649887085, + "learning_rate": 8e-05, + "loss": 1.5316, + "step": 8992 + }, + { + "epoch": 0.5012820512820513, + "grad_norm": 0.5303008556365967, + "learning_rate": 8e-05, + "loss": 1.5361, + "step": 8993 + }, + { + "epoch": 0.5013377926421405, + "grad_norm": 0.6002159714698792, + "learning_rate": 8e-05, + "loss": 1.9255, + "step": 8994 + }, + { + "epoch": 0.5013935340022296, + "grad_norm": 0.5164722204208374, + "learning_rate": 8e-05, + "loss": 1.5173, + "step": 8995 + }, + { + 
"epoch": 0.5014492753623189, + "grad_norm": 0.49862802028656006, + "learning_rate": 8e-05, + "loss": 1.6109, + "step": 8996 + }, + { + "epoch": 0.501505016722408, + "grad_norm": 0.594048798084259, + "learning_rate": 8e-05, + "loss": 1.8636, + "step": 8997 + }, + { + "epoch": 0.5015607580824972, + "grad_norm": 0.5271819233894348, + "learning_rate": 8e-05, + "loss": 1.6044, + "step": 8998 + }, + { + "epoch": 0.5016164994425863, + "grad_norm": 0.5142638683319092, + "learning_rate": 8e-05, + "loss": 1.6739, + "step": 8999 + }, + { + "epoch": 0.5016722408026756, + "grad_norm": 0.5490266680717468, + "learning_rate": 8e-05, + "loss": 1.5733, + "step": 9000 + } + ], + "logging_steps": 1, + "max_steps": 17940, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.788311104356352e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}