diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.36643459142543056, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00018321729571271528, + "grad_norm": 7.481241625973833, + "learning_rate": 0.0, + "loss": 0.764, + "step": 1 + }, + { + "epoch": 0.00036643459142543056, + "grad_norm": 7.936839352385908, + "learning_rate": 1.8315018315018317e-08, + "loss": 0.7505, + "step": 2 + }, + { + "epoch": 0.0005496518871381459, + "grad_norm": 8.381229660678587, + "learning_rate": 3.6630036630036635e-08, + "loss": 0.7767, + "step": 3 + }, + { + "epoch": 0.0007328691828508611, + "grad_norm": 7.533434041856819, + "learning_rate": 5.494505494505495e-08, + "loss": 0.7545, + "step": 4 + }, + { + "epoch": 0.0009160864785635764, + "grad_norm": 6.4876435204860545, + "learning_rate": 7.326007326007327e-08, + "loss": 0.7254, + "step": 5 + }, + { + "epoch": 0.0010993037742762918, + "grad_norm": 6.984439562746247, + "learning_rate": 9.157509157509159e-08, + "loss": 0.7238, + "step": 6 + }, + { + "epoch": 0.001282521069989007, + "grad_norm": 8.432795226889532, + "learning_rate": 1.098901098901099e-07, + "loss": 0.7667, + "step": 7 + }, + { + "epoch": 0.0014657383657017222, + "grad_norm": 6.615389400475257, + "learning_rate": 1.282051282051282e-07, + "loss": 0.739, + "step": 8 + }, + { + "epoch": 0.0016489556614144375, + "grad_norm": 7.043438048491804, + "learning_rate": 1.4652014652014654e-07, + "loss": 0.7412, + "step": 9 + }, + { + "epoch": 0.001832172957127153, + "grad_norm": 7.763888342467084, + "learning_rate": 1.6483516483516484e-07, + "loss": 0.739, + "step": 10 + }, + { + "epoch": 0.002015390252839868, + "grad_norm": 7.115680896600135, + "learning_rate": 1.8315018315018317e-07, + "loss": 0.6814, + "step": 11 + }, + { + "epoch": 0.0021986075485525836, + "grad_norm": 9.472751553447468, + "learning_rate": 2.014652014652015e-07, + "loss": 0.7694, + "step": 12 + }, + { + "epoch": 0.0023818248442652986, + "grad_norm": 9.164365397122241, + "learning_rate": 2.197802197802198e-07, + "loss": 0.8101, + "step": 13 + }, + { + "epoch": 0.002565042139978014, + "grad_norm": 7.831934204465113, + "learning_rate": 2.3809523809523811e-07, + "loss": 0.7504, + "step": 14 + }, + { + "epoch": 0.002748259435690729, + "grad_norm": 6.7423320112288865, + "learning_rate": 2.564102564102564e-07, + "loss": 0.7442, + "step": 15 + }, + { + "epoch": 0.0029314767314034445, + "grad_norm": 7.8103949840893145, + "learning_rate": 2.7472527472527475e-07, + "loss": 0.7782, + "step": 16 + }, + { + "epoch": 0.00311469402711616, + "grad_norm": 6.1519268993583465, + "learning_rate": 2.930402930402931e-07, + "loss": 0.663, + "step": 17 + }, + { + "epoch": 0.003297911322828875, + "grad_norm": 8.049047835270914, + "learning_rate": 3.113553113553114e-07, + "loss": 0.7934, + "step": 18 + }, + { + "epoch": 0.0034811286185415903, + "grad_norm": 6.388709624292669, + "learning_rate": 3.296703296703297e-07, + "loss": 0.7241, + "step": 19 + }, + { + "epoch": 0.003664345914254306, + "grad_norm": 7.039170133500172, + "learning_rate": 3.47985347985348e-07, + "loss": 0.7133, + "step": 20 + }, + { + "epoch": 0.003847563209967021, + "grad_norm": 8.416142106876258, + "learning_rate": 3.6630036630036635e-07, + "loss": 0.7787, + "step": 21 + }, + { + "epoch": 0.004030780505679736, + "grad_norm": 5.942542945633484, + "learning_rate": 3.846153846153847e-07, + "loss": 0.692, + "step": 22 + }, + { + "epoch": 0.004213997801392451, + "grad_norm": 6.397620608740201, + "learning_rate": 4.02930402930403e-07, + "loss": 0.6885, + "step": 23 + }, + { + "epoch": 0.004397215097105167, + "grad_norm": 4.789506080162222, + "learning_rate": 4.212454212454213e-07, + "loss": 0.6818, + "step": 24 + }, + { + "epoch": 0.004580432392817882, + "grad_norm": 5.4323837663846275, + "learning_rate": 4.395604395604396e-07, + "loss": 0.7202, + "step": 25 + }, + { + "epoch": 0.004763649688530597, + "grad_norm": 6.001418955741802, + "learning_rate": 4.578754578754579e-07, + "loss": 0.6915, + "step": 26 + }, + { + "epoch": 0.004946866984243312, + "grad_norm": 6.931290264891086, + "learning_rate": 4.7619047619047623e-07, + "loss": 0.7214, + "step": 27 + }, + { + "epoch": 0.005130084279956028, + "grad_norm": 5.423177642103934, + "learning_rate": 4.945054945054946e-07, + "loss": 0.6627, + "step": 28 + }, + { + "epoch": 0.005313301575668743, + "grad_norm": 5.824082478920574, + "learning_rate": 5.128205128205128e-07, + "loss": 0.7128, + "step": 29 + }, + { + "epoch": 0.005496518871381458, + "grad_norm": 4.837515053542817, + "learning_rate": 5.311355311355311e-07, + "loss": 0.7063, + "step": 30 + }, + { + "epoch": 0.005679736167094174, + "grad_norm": 4.2178492878732, + "learning_rate": 5.494505494505495e-07, + "loss": 0.6667, + "step": 31 + }, + { + "epoch": 0.005862953462806889, + "grad_norm": 3.7310273500960283, + "learning_rate": 5.677655677655678e-07, + "loss": 0.6724, + "step": 32 + }, + { + "epoch": 0.006046170758519604, + "grad_norm": 3.4436166557785013, + "learning_rate": 5.860805860805862e-07, + "loss": 0.6158, + "step": 33 + }, + { + "epoch": 0.00622938805423232, + "grad_norm": 2.6914219359650264, + "learning_rate": 6.043956043956044e-07, + "loss": 0.6384, + "step": 34 + }, + { + "epoch": 0.006412605349945035, + "grad_norm": 2.8611154351289803, + "learning_rate": 6.227106227106228e-07, + "loss": 0.6564, + "step": 35 + }, + { + "epoch": 0.00659582264565775, + "grad_norm": 2.915608387835296, + "learning_rate": 6.41025641025641e-07, + "loss": 0.6556, + "step": 36 + }, + { + "epoch": 0.006779039941370466, + "grad_norm": 3.104210670394775, + "learning_rate": 6.593406593406594e-07, + "loss": 0.6571, + "step": 37 + }, + { + "epoch": 0.006962257237083181, + "grad_norm": 2.416024185643152, + "learning_rate": 6.776556776556777e-07, + "loss": 0.6118, + "step": 38 + }, + { + "epoch": 0.007145474532795896, + "grad_norm": 2.2840327527366187, + "learning_rate": 6.95970695970696e-07, + "loss": 0.5773, + "step": 39 + }, + { + "epoch": 0.007328691828508612, + "grad_norm": 2.994955066139323, + "learning_rate": 7.142857142857143e-07, + "loss": 0.6621, + "step": 40 + }, + { + "epoch": 0.007511909124221327, + "grad_norm": 2.4977146662969956, + "learning_rate": 7.326007326007327e-07, + "loss": 0.5878, + "step": 41 + }, + { + "epoch": 0.007695126419934042, + "grad_norm": 2.2697538551105443, + "learning_rate": 7.50915750915751e-07, + "loss": 0.6161, + "step": 42 + }, + { + "epoch": 0.007878343715646757, + "grad_norm": 2.040084870467292, + "learning_rate": 7.692307692307694e-07, + "loss": 0.625, + "step": 43 + }, + { + "epoch": 0.008061561011359472, + "grad_norm": 1.4264410515550054, + "learning_rate": 7.875457875457876e-07, + "loss": 0.5991, + "step": 44 + }, + { + "epoch": 0.008244778307072188, + "grad_norm": 1.3307329745484617, + "learning_rate": 8.05860805860806e-07, + "loss": 0.5802, + "step": 45 + }, + { + "epoch": 0.008427995602784902, + "grad_norm": 1.3186002922814006, + "learning_rate": 8.241758241758242e-07, + "loss": 0.5971, + "step": 46 + }, + { + "epoch": 0.008611212898497618, + "grad_norm": 1.4454419288227918, + "learning_rate": 8.424908424908426e-07, + "loss": 0.6152, + "step": 47 + }, + { + "epoch": 0.008794430194210334, + "grad_norm": 1.2728865809977838, + "learning_rate": 8.608058608058609e-07, + "loss": 0.6176, + "step": 48 + }, + { + "epoch": 0.008977647489923048, + "grad_norm": 1.2387163683388445, + "learning_rate": 8.791208791208792e-07, + "loss": 0.6233, + "step": 49 + }, + { + "epoch": 0.009160864785635764, + "grad_norm": 1.1465107255795282, + "learning_rate": 8.974358974358975e-07, + "loss": 0.5795, + "step": 50 + }, + { + "epoch": 0.00934408208134848, + "grad_norm": 1.264265250343769, + "learning_rate": 9.157509157509158e-07, + "loss": 0.6262, + "step": 51 + }, + { + "epoch": 0.009527299377061194, + "grad_norm": 0.919312835052228, + "learning_rate": 9.340659340659342e-07, + "loss": 0.5535, + "step": 52 + }, + { + "epoch": 0.00971051667277391, + "grad_norm": 0.8672735287012353, + "learning_rate": 9.523809523809525e-07, + "loss": 0.5327, + "step": 53 + }, + { + "epoch": 0.009893733968486624, + "grad_norm": 0.8288169490471791, + "learning_rate": 9.706959706959708e-07, + "loss": 0.5673, + "step": 54 + }, + { + "epoch": 0.01007695126419934, + "grad_norm": 0.8603030677007734, + "learning_rate": 9.890109890109891e-07, + "loss": 0.5547, + "step": 55 + }, + { + "epoch": 0.010260168559912056, + "grad_norm": 0.7482006424960487, + "learning_rate": 1.0073260073260074e-06, + "loss": 0.6012, + "step": 56 + }, + { + "epoch": 0.01044338585562477, + "grad_norm": 0.6144742944660406, + "learning_rate": 1.0256410256410257e-06, + "loss": 0.5231, + "step": 57 + }, + { + "epoch": 0.010626603151337486, + "grad_norm": 0.634897676727318, + "learning_rate": 1.0439560439560442e-06, + "loss": 0.5726, + "step": 58 + }, + { + "epoch": 0.010809820447050202, + "grad_norm": 0.6410545094813134, + "learning_rate": 1.0622710622710622e-06, + "loss": 0.5695, + "step": 59 + }, + { + "epoch": 0.010993037742762916, + "grad_norm": 0.6609456014192859, + "learning_rate": 1.0805860805860807e-06, + "loss": 0.5805, + "step": 60 + }, + { + "epoch": 0.011176255038475632, + "grad_norm": 0.7416419125176478, + "learning_rate": 1.098901098901099e-06, + "loss": 0.634, + "step": 61 + }, + { + "epoch": 0.011359472334188348, + "grad_norm": 0.6851293138177442, + "learning_rate": 1.1172161172161173e-06, + "loss": 0.5544, + "step": 62 + }, + { + "epoch": 0.011542689629901062, + "grad_norm": 0.6383351643137376, + "learning_rate": 1.1355311355311355e-06, + "loss": 0.557, + "step": 63 + }, + { + "epoch": 0.011725906925613778, + "grad_norm": 0.6348085314283121, + "learning_rate": 1.153846153846154e-06, + "loss": 0.5632, + "step": 64 + }, + { + "epoch": 0.011909124221326494, + "grad_norm": 0.7024363928163538, + "learning_rate": 1.1721611721611723e-06, + "loss": 0.5839, + "step": 65 + }, + { + "epoch": 0.012092341517039208, + "grad_norm": 0.5900911656210056, + "learning_rate": 1.1904761904761906e-06, + "loss": 0.5344, + "step": 66 + }, + { + "epoch": 0.012275558812751924, + "grad_norm": 0.6275963867777459, + "learning_rate": 1.2087912087912089e-06, + "loss": 0.5584, + "step": 67 + }, + { + "epoch": 0.01245877610846464, + "grad_norm": 0.5865171487667508, + "learning_rate": 1.2271062271062271e-06, + "loss": 0.575, + "step": 68 + }, + { + "epoch": 0.012641993404177354, + "grad_norm": 0.674046870317235, + "learning_rate": 1.2454212454212456e-06, + "loss": 0.5744, + "step": 69 + }, + { + "epoch": 0.01282521069989007, + "grad_norm": 0.60918911566989, + "learning_rate": 1.2637362637362637e-06, + "loss": 0.5423, + "step": 70 + }, + { + "epoch": 0.013008427995602785, + "grad_norm": 0.6221860528344966, + "learning_rate": 1.282051282051282e-06, + "loss": 0.5652, + "step": 71 + }, + { + "epoch": 0.0131916452913155, + "grad_norm": 0.606665778835579, + "learning_rate": 1.3003663003663005e-06, + "loss": 0.6123, + "step": 72 + }, + { + "epoch": 0.013374862587028215, + "grad_norm": 0.5884723188789052, + "learning_rate": 1.3186813186813187e-06, + "loss": 0.5863, + "step": 73 + }, + { + "epoch": 0.013558079882740931, + "grad_norm": 0.5824062487652404, + "learning_rate": 1.336996336996337e-06, + "loss": 0.5918, + "step": 74 + }, + { + "epoch": 0.013741297178453645, + "grad_norm": 0.5460196146561194, + "learning_rate": 1.3553113553113553e-06, + "loss": 0.5912, + "step": 75 + }, + { + "epoch": 0.013924514474166361, + "grad_norm": 0.49213553293102813, + "learning_rate": 1.3736263736263738e-06, + "loss": 0.5421, + "step": 76 + }, + { + "epoch": 0.014107731769879077, + "grad_norm": 0.5594168254817149, + "learning_rate": 1.391941391941392e-06, + "loss": 0.5122, + "step": 77 + }, + { + "epoch": 0.014290949065591791, + "grad_norm": 0.5277088034821339, + "learning_rate": 1.4102564102564104e-06, + "loss": 0.5678, + "step": 78 + }, + { + "epoch": 0.014474166361304507, + "grad_norm": 0.49574136870511754, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.5498, + "step": 79 + }, + { + "epoch": 0.014657383657017223, + "grad_norm": 0.5944104688680958, + "learning_rate": 1.4468864468864471e-06, + "loss": 0.5469, + "step": 80 + }, + { + "epoch": 0.014840600952729937, + "grad_norm": 0.4732970613815555, + "learning_rate": 1.4652014652014654e-06, + "loss": 0.5233, + "step": 81 + }, + { + "epoch": 0.015023818248442653, + "grad_norm": 0.49591541086638596, + "learning_rate": 1.4835164835164837e-06, + "loss": 0.5367, + "step": 82 + }, + { + "epoch": 0.015207035544155369, + "grad_norm": 0.4883648219378977, + "learning_rate": 1.501831501831502e-06, + "loss": 0.5782, + "step": 83 + }, + { + "epoch": 0.015390252839868083, + "grad_norm": 0.48559293967760114, + "learning_rate": 1.5201465201465202e-06, + "loss": 0.5578, + "step": 84 + }, + { + "epoch": 0.015573470135580799, + "grad_norm": 0.5802435990379926, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.5562, + "step": 85 + }, + { + "epoch": 0.015756687431293513, + "grad_norm": 0.6655689083082568, + "learning_rate": 1.556776556776557e-06, + "loss": 0.5916, + "step": 86 + }, + { + "epoch": 0.01593990472700623, + "grad_norm": 0.5272919501805459, + "learning_rate": 1.5750915750915753e-06, + "loss": 0.5462, + "step": 87 + }, + { + "epoch": 0.016123122022718945, + "grad_norm": 0.5025768009972991, + "learning_rate": 1.5934065934065933e-06, + "loss": 0.564, + "step": 88 + }, + { + "epoch": 0.01630633931843166, + "grad_norm": 0.4868371012830415, + "learning_rate": 1.611721611721612e-06, + "loss": 0.557, + "step": 89 + }, + { + "epoch": 0.016489556614144377, + "grad_norm": 0.4724914851279723, + "learning_rate": 1.6300366300366301e-06, + "loss": 0.534, + "step": 90 + }, + { + "epoch": 0.01667277390985709, + "grad_norm": 0.4933114328584066, + "learning_rate": 1.6483516483516484e-06, + "loss": 0.5628, + "step": 91 + }, + { + "epoch": 0.016855991205569805, + "grad_norm": 0.5429724547645147, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.5693, + "step": 92 + }, + { + "epoch": 0.017039208501282523, + "grad_norm": 0.46970029586030615, + "learning_rate": 1.6849816849816852e-06, + "loss": 0.5761, + "step": 93 + }, + { + "epoch": 0.017222425796995237, + "grad_norm": 0.5466204682716642, + "learning_rate": 1.7032967032967034e-06, + "loss": 0.5407, + "step": 94 + }, + { + "epoch": 0.01740564309270795, + "grad_norm": 0.46448768993410167, + "learning_rate": 1.7216117216117217e-06, + "loss": 0.5784, + "step": 95 + }, + { + "epoch": 0.01758886038842067, + "grad_norm": 0.46769210989227256, + "learning_rate": 1.73992673992674e-06, + "loss": 0.5262, + "step": 96 + }, + { + "epoch": 0.017772077684133383, + "grad_norm": 0.4412027593954725, + "learning_rate": 1.7582417582417585e-06, + "loss": 0.5608, + "step": 97 + }, + { + "epoch": 0.017955294979846097, + "grad_norm": 0.5975733130220022, + "learning_rate": 1.7765567765567768e-06, + "loss": 0.5633, + "step": 98 + }, + { + "epoch": 0.018138512275558814, + "grad_norm": 0.46156861116006753, + "learning_rate": 1.794871794871795e-06, + "loss": 0.5576, + "step": 99 + }, + { + "epoch": 0.01832172957127153, + "grad_norm": 0.5193644526534718, + "learning_rate": 1.8131868131868133e-06, + "loss": 0.5533, + "step": 100 + }, + { + "epoch": 0.018504946866984243, + "grad_norm": 0.479596247036775, + "learning_rate": 1.8315018315018316e-06, + "loss": 0.5337, + "step": 101 + }, + { + "epoch": 0.01868816416269696, + "grad_norm": 0.4610500867184236, + "learning_rate": 1.84981684981685e-06, + "loss": 0.5314, + "step": 102 + }, + { + "epoch": 0.018871381458409674, + "grad_norm": 0.481950984787821, + "learning_rate": 1.8681318681318684e-06, + "loss": 0.5865, + "step": 103 + }, + { + "epoch": 0.01905459875412239, + "grad_norm": 0.5225151739123198, + "learning_rate": 1.8864468864468866e-06, + "loss": 0.5694, + "step": 104 + }, + { + "epoch": 0.019237816049835106, + "grad_norm": 0.46381468108353424, + "learning_rate": 1.904761904761905e-06, + "loss": 0.557, + "step": 105 + }, + { + "epoch": 0.01942103334554782, + "grad_norm": 0.45411743556679485, + "learning_rate": 1.9230769230769234e-06, + "loss": 0.5648, + "step": 106 + }, + { + "epoch": 0.019604250641260534, + "grad_norm": 0.4859210831259208, + "learning_rate": 1.9413919413919417e-06, + "loss": 0.5685, + "step": 107 + }, + { + "epoch": 0.01978746793697325, + "grad_norm": 0.4205701080898951, + "learning_rate": 1.95970695970696e-06, + "loss": 0.5426, + "step": 108 + }, + { + "epoch": 0.019970685232685966, + "grad_norm": 0.5044757429436201, + "learning_rate": 1.9780219780219782e-06, + "loss": 0.5509, + "step": 109 + }, + { + "epoch": 0.02015390252839868, + "grad_norm": 0.47571345015861244, + "learning_rate": 1.9963369963369965e-06, + "loss": 0.5281, + "step": 110 + }, + { + "epoch": 0.020337119824111394, + "grad_norm": 0.5681283820477757, + "learning_rate": 2.0146520146520148e-06, + "loss": 0.5398, + "step": 111 + }, + { + "epoch": 0.020520337119824112, + "grad_norm": 0.48199138528425167, + "learning_rate": 2.032967032967033e-06, + "loss": 0.56, + "step": 112 + }, + { + "epoch": 0.020703554415536826, + "grad_norm": 0.4841461198775233, + "learning_rate": 2.0512820512820513e-06, + "loss": 0.5454, + "step": 113 + }, + { + "epoch": 0.02088677171124954, + "grad_norm": 0.4680411378560794, + "learning_rate": 2.0695970695970696e-06, + "loss": 0.516, + "step": 114 + }, + { + "epoch": 0.021069989006962258, + "grad_norm": 0.4426302566203345, + "learning_rate": 2.0879120879120883e-06, + "loss": 0.5493, + "step": 115 + }, + { + "epoch": 0.021253206302674972, + "grad_norm": 0.5379521772056074, + "learning_rate": 2.1062271062271066e-06, + "loss": 0.5642, + "step": 116 + }, + { + "epoch": 0.021436423598387686, + "grad_norm": 0.42644965401865687, + "learning_rate": 2.1245421245421245e-06, + "loss": 0.564, + "step": 117 + }, + { + "epoch": 0.021619640894100404, + "grad_norm": 1.3341683924694292, + "learning_rate": 2.1428571428571427e-06, + "loss": 0.5831, + "step": 118 + }, + { + "epoch": 0.021802858189813118, + "grad_norm": 0.5613781478585862, + "learning_rate": 2.1611721611721614e-06, + "loss": 0.5598, + "step": 119 + }, + { + "epoch": 0.021986075485525832, + "grad_norm": 0.4389899980262906, + "learning_rate": 2.1794871794871797e-06, + "loss": 0.5498, + "step": 120 + }, + { + "epoch": 0.02216929278123855, + "grad_norm": 0.5433793107490897, + "learning_rate": 2.197802197802198e-06, + "loss": 0.5838, + "step": 121 + }, + { + "epoch": 0.022352510076951264, + "grad_norm": 0.49992126724035435, + "learning_rate": 2.2161172161172163e-06, + "loss": 0.5581, + "step": 122 + }, + { + "epoch": 0.022535727372663978, + "grad_norm": 0.5192253683114394, + "learning_rate": 2.2344322344322345e-06, + "loss": 0.5963, + "step": 123 + }, + { + "epoch": 0.022718944668376696, + "grad_norm": 0.5218720529273816, + "learning_rate": 2.252747252747253e-06, + "loss": 0.5568, + "step": 124 + }, + { + "epoch": 0.02290216196408941, + "grad_norm": 0.41135006043138017, + "learning_rate": 2.271062271062271e-06, + "loss": 0.5256, + "step": 125 + }, + { + "epoch": 0.023085379259802124, + "grad_norm": 0.4960132808732071, + "learning_rate": 2.2893772893772894e-06, + "loss": 0.5636, + "step": 126 + }, + { + "epoch": 0.02326859655551484, + "grad_norm": 0.40294874528313, + "learning_rate": 2.307692307692308e-06, + "loss": 0.5373, + "step": 127 + }, + { + "epoch": 0.023451813851227556, + "grad_norm": 0.49129951716432685, + "learning_rate": 2.3260073260073264e-06, + "loss": 0.5571, + "step": 128 + }, + { + "epoch": 0.02363503114694027, + "grad_norm": 0.4541626724950761, + "learning_rate": 2.3443223443223446e-06, + "loss": 0.5407, + "step": 129 + }, + { + "epoch": 0.023818248442652987, + "grad_norm": 0.44854257290105054, + "learning_rate": 2.362637362637363e-06, + "loss": 0.5516, + "step": 130 + }, + { + "epoch": 0.0240014657383657, + "grad_norm": 0.4439039076319426, + "learning_rate": 2.380952380952381e-06, + "loss": 0.5192, + "step": 131 + }, + { + "epoch": 0.024184683034078416, + "grad_norm": 0.4410959524536652, + "learning_rate": 2.3992673992673995e-06, + "loss": 0.5678, + "step": 132 + }, + { + "epoch": 0.024367900329791133, + "grad_norm": 0.49361868947998544, + "learning_rate": 2.4175824175824177e-06, + "loss": 0.5591, + "step": 133 + }, + { + "epoch": 0.024551117625503847, + "grad_norm": 0.5023022664764156, + "learning_rate": 2.435897435897436e-06, + "loss": 0.5818, + "step": 134 + }, + { + "epoch": 0.02473433492121656, + "grad_norm": 0.4287249843742935, + "learning_rate": 2.4542124542124543e-06, + "loss": 0.5532, + "step": 135 + }, + { + "epoch": 0.02491755221692928, + "grad_norm": 0.43595844702277614, + "learning_rate": 2.472527472527473e-06, + "loss": 0.534, + "step": 136 + }, + { + "epoch": 0.025100769512641993, + "grad_norm": 0.5546618594341888, + "learning_rate": 2.4908424908424913e-06, + "loss": 0.5058, + "step": 137 + }, + { + "epoch": 0.025283986808354707, + "grad_norm": 0.5017549598118495, + "learning_rate": 2.509157509157509e-06, + "loss": 0.5416, + "step": 138 + }, + { + "epoch": 0.025467204104067425, + "grad_norm": 0.47047331776441903, + "learning_rate": 2.5274725274725274e-06, + "loss": 0.5557, + "step": 139 + }, + { + "epoch": 0.02565042139978014, + "grad_norm": 0.4881037742451912, + "learning_rate": 2.5457875457875457e-06, + "loss": 0.5494, + "step": 140 + }, + { + "epoch": 0.025833638695492853, + "grad_norm": 0.41933017046045795, + "learning_rate": 2.564102564102564e-06, + "loss": 0.5729, + "step": 141 + }, + { + "epoch": 0.02601685599120557, + "grad_norm": 0.39974209370594543, + "learning_rate": 2.582417582417583e-06, + "loss": 0.5381, + "step": 142 + }, + { + "epoch": 0.026200073286918285, + "grad_norm": 0.4473481461077892, + "learning_rate": 2.600732600732601e-06, + "loss": 0.5336, + "step": 143 + }, + { + "epoch": 0.026383290582631, + "grad_norm": 0.45197895778035446, + "learning_rate": 2.6190476190476192e-06, + "loss": 0.5271, + "step": 144 + }, + { + "epoch": 0.026566507878343717, + "grad_norm": 0.49529182062519755, + "learning_rate": 2.6373626373626375e-06, + "loss": 0.5468, + "step": 145 + }, + { + "epoch": 0.02674972517405643, + "grad_norm": 0.5413416851523152, + "learning_rate": 2.6556776556776558e-06, + "loss": 0.5807, + "step": 146 + }, + { + "epoch": 0.026932942469769145, + "grad_norm": 0.4703582853460863, + "learning_rate": 2.673992673992674e-06, + "loss": 0.538, + "step": 147 + }, + { + "epoch": 0.027116159765481863, + "grad_norm": 0.4332295907365602, + "learning_rate": 2.6923076923076923e-06, + "loss": 0.5167, + "step": 148 + }, + { + "epoch": 0.027299377061194577, + "grad_norm": 0.458616583095158, + "learning_rate": 2.7106227106227106e-06, + "loss": 0.5226, + "step": 149 + }, + { + "epoch": 0.02748259435690729, + "grad_norm": 0.43740744109233864, + "learning_rate": 2.728937728937729e-06, + "loss": 0.541, + "step": 150 + }, + { + "epoch": 0.02766581165262001, + "grad_norm": 0.5735574364330706, + "learning_rate": 2.7472527472527476e-06, + "loss": 0.5835, + "step": 151 + }, + { + "epoch": 0.027849028948332723, + "grad_norm": 0.5055518766772779, + "learning_rate": 2.765567765567766e-06, + "loss": 0.5631, + "step": 152 + }, + { + "epoch": 0.028032246244045437, + "grad_norm": 0.43271616931082146, + "learning_rate": 2.783882783882784e-06, + "loss": 0.5426, + "step": 153 + }, + { + "epoch": 0.028215463539758154, + "grad_norm": 0.4971463373765023, + "learning_rate": 2.8021978021978024e-06, + "loss": 0.5389, + "step": 154 + }, + { + "epoch": 0.02839868083547087, + "grad_norm": 0.4591734918280654, + "learning_rate": 2.8205128205128207e-06, + "loss": 0.5309, + "step": 155 + }, + { + "epoch": 0.028581898131183583, + "grad_norm": 0.45247087926212365, + "learning_rate": 2.838827838827839e-06, + "loss": 0.5536, + "step": 156 + }, + { + "epoch": 0.0287651154268963, + "grad_norm": 0.46626015341742694, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.559, + "step": 157 + }, + { + "epoch": 0.028948332722609015, + "grad_norm": 0.4834854871636071, + "learning_rate": 2.8754578754578755e-06, + "loss": 0.5476, + "step": 158 + }, + { + "epoch": 0.02913155001832173, + "grad_norm": 0.570546858013933, + "learning_rate": 2.8937728937728942e-06, + "loss": 0.5394, + "step": 159 + }, + { + "epoch": 0.029314767314034446, + "grad_norm": 0.45441480104130544, + "learning_rate": 2.9120879120879125e-06, + "loss": 0.5029, + "step": 160 + }, + { + "epoch": 0.02949798460974716, + "grad_norm": 0.5206871098196969, + "learning_rate": 2.930402930402931e-06, + "loss": 0.528, + "step": 161 + }, + { + "epoch": 0.029681201905459875, + "grad_norm": 0.43359542229542136, + "learning_rate": 2.948717948717949e-06, + "loss": 0.5502, + "step": 162 + }, + { + "epoch": 0.029864419201172592, + "grad_norm": 0.46302617065984364, + "learning_rate": 2.9670329670329673e-06, + "loss": 0.5447, + "step": 163 + }, + { + "epoch": 0.030047636496885306, + "grad_norm": 0.3915642271916536, + "learning_rate": 2.9853479853479856e-06, + "loss": 0.503, + "step": 164 + }, + { + "epoch": 0.03023085379259802, + "grad_norm": 0.42991246482419715, + "learning_rate": 3.003663003663004e-06, + "loss": 0.521, + "step": 165 + }, + { + "epoch": 0.030414071088310738, + "grad_norm": 0.468175424095518, + "learning_rate": 3.021978021978022e-06, + "loss": 0.5101, + "step": 166 + }, + { + "epoch": 0.030597288384023452, + "grad_norm": 0.6468735604295471, + "learning_rate": 3.0402930402930405e-06, + "loss": 0.5617, + "step": 167 + }, + { + "epoch": 0.030780505679736166, + "grad_norm": 0.5058923699848836, + "learning_rate": 3.058608058608059e-06, + "loss": 0.5154, + "step": 168 + }, + { + "epoch": 0.030963722975448884, + "grad_norm": 0.45437537978064513, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.4993, + "step": 169 + }, + { + "epoch": 0.031146940271161598, + "grad_norm": 0.467183819204265, + "learning_rate": 3.0952380952380957e-06, + "loss": 0.5701, + "step": 170 + }, + { + "epoch": 0.031330157566874316, + "grad_norm": 0.3835674459614267, + "learning_rate": 3.113553113553114e-06, + "loss": 0.4902, + "step": 171 + }, + { + "epoch": 0.031513374862587026, + "grad_norm": 0.4292795112150875, + "learning_rate": 3.1318681318681323e-06, + "loss": 0.5237, + "step": 172 + }, + { + "epoch": 0.031696592158299744, + "grad_norm": 0.6593051731116806, + "learning_rate": 3.1501831501831505e-06, + "loss": 0.5361, + "step": 173 + }, + { + "epoch": 0.03187980945401246, + "grad_norm": 0.4718436483558741, + "learning_rate": 3.1684981684981684e-06, + "loss": 0.5324, + "step": 174 + }, + { + "epoch": 0.03206302674972517, + "grad_norm": 0.48018416222395494, + "learning_rate": 3.1868131868131867e-06, + "loss": 0.5273, + "step": 175 + }, + { + "epoch": 0.03224624404543789, + "grad_norm": 0.6316028239985647, + "learning_rate": 3.205128205128206e-06, + "loss": 0.5346, + "step": 176 + }, + { + "epoch": 0.03242946134115061, + "grad_norm": 0.4596646339699305, + "learning_rate": 3.223443223443224e-06, + "loss": 0.5263, + "step": 177 + }, + { + "epoch": 0.03261267863686332, + "grad_norm": 0.6529252419894329, + "learning_rate": 3.2417582417582424e-06, + "loss": 0.5442, + "step": 178 + }, + { + "epoch": 0.032795895932576036, + "grad_norm": 0.5538108341969676, + "learning_rate": 3.2600732600732602e-06, + "loss": 0.543, + "step": 179 + }, + { + "epoch": 0.03297911322828875, + "grad_norm": 0.46058359751530825, + "learning_rate": 3.2783882783882785e-06, + "loss": 0.5101, + "step": 180 + }, + { + "epoch": 0.033162330524001464, + "grad_norm": 0.42238032667898895, + "learning_rate": 3.2967032967032968e-06, + "loss": 0.5487, + "step": 181 + }, + { + "epoch": 0.03334554781971418, + "grad_norm": 0.49973876221605035, + "learning_rate": 3.315018315018315e-06, + "loss": 0.5333, + "step": 182 + }, + { + "epoch": 0.0335287651154269, + "grad_norm": 0.49185079032879564, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.5733, + "step": 183 + }, + { + "epoch": 0.03371198241113961, + "grad_norm": 0.41357500203470565, + "learning_rate": 3.3516483516483516e-06, + "loss": 0.5108, + "step": 184 + }, + { + "epoch": 0.03389519970685233, + "grad_norm": 0.45597659979596383, + "learning_rate": 3.3699633699633703e-06, + "loss": 0.5316, + "step": 185 + }, + { + "epoch": 0.034078417002565045, + "grad_norm": 0.523873134306111, + "learning_rate": 3.3882783882783886e-06, + "loss": 0.5516, + "step": 186 + }, + { + "epoch": 0.034261634298277756, + "grad_norm": 0.4604705346503309, + "learning_rate": 3.406593406593407e-06, + "loss": 0.5463, + "step": 187 + }, + { + "epoch": 0.03444485159399047, + "grad_norm": 0.4494660179181277, + "learning_rate": 3.424908424908425e-06, + "loss": 0.5195, + "step": 188 + }, + { + "epoch": 0.03462806888970319, + "grad_norm": 0.47161535336220833, + "learning_rate": 3.4432234432234434e-06, + "loss": 0.5495, + "step": 189 + }, + { + "epoch": 0.0348112861854159, + "grad_norm": 0.41422931205807795, + "learning_rate": 3.4615384615384617e-06, + "loss": 0.5165, + "step": 190 + }, + { + "epoch": 0.03499450348112862, + "grad_norm": 0.48552414748511474, + "learning_rate": 3.47985347985348e-06, + "loss": 0.5207, + "step": 191 + }, + { + "epoch": 0.03517772077684134, + "grad_norm": 0.49918231460984896, + "learning_rate": 3.4981684981684982e-06, + "loss": 0.5055, + "step": 192 + }, + { + "epoch": 0.03536093807255405, + "grad_norm": 0.5047784031000427, + "learning_rate": 3.516483516483517e-06, + "loss": 0.5541, + "step": 193 + }, + { + "epoch": 0.035544155368266765, + "grad_norm": 0.45515393167769386, + "learning_rate": 3.5347985347985352e-06, + "loss": 0.5324, + "step": 194 + }, + { + "epoch": 0.03572737266397948, + "grad_norm": 0.4697755766255993, + "learning_rate": 3.5531135531135535e-06, + "loss": 0.5317, + "step": 195 + }, + { + "epoch": 0.035910589959692193, + "grad_norm": 0.47670842684148323, + "learning_rate": 3.5714285714285718e-06, + "loss": 0.5331, + "step": 196 + }, + { + "epoch": 0.03609380725540491, + "grad_norm": 0.46160363428859325, + "learning_rate": 3.58974358974359e-06, + "loss": 0.5645, + "step": 197 + }, + { + "epoch": 0.03627702455111763, + "grad_norm": 0.5456279444043086, + "learning_rate": 3.6080586080586083e-06, + "loss": 0.4923, + "step": 198 + }, + { + "epoch": 0.03646024184683034, + "grad_norm": 0.45679324633890633, + "learning_rate": 3.6263736263736266e-06, + "loss": 0.5408, + "step": 199 + }, + { + "epoch": 0.03664345914254306, + "grad_norm": 0.4038620329916062, + "learning_rate": 3.644688644688645e-06, + "loss": 0.5542, + "step": 200 + }, + { + "epoch": 0.036826676438255775, + "grad_norm": 0.42338470275172285, + "learning_rate": 3.663003663003663e-06, + "loss": 0.5509, + "step": 201 + }, + { + "epoch": 0.037009893733968485, + "grad_norm": 0.48862631901451187, + "learning_rate": 3.681318681318682e-06, + "loss": 0.533, + "step": 202 + }, + { + "epoch": 0.0371931110296812, + "grad_norm": 0.4339739485667441, + "learning_rate": 3.6996336996337e-06, + "loss": 0.5331, + "step": 203 + }, + { + "epoch": 0.03737632832539392, + "grad_norm": 0.4040593227125272, + "learning_rate": 3.7179487179487184e-06, + "loss": 0.568, + "step": 204 + }, + { + "epoch": 0.03755954562110663, + "grad_norm": 0.5025361443024049, + "learning_rate": 3.7362637362637367e-06, + "loss": 0.5616, + "step": 205 + }, + { + "epoch": 0.03774276291681935, + "grad_norm": 0.43671528407867277, + "learning_rate": 3.754578754578755e-06, + "loss": 0.5468, + "step": 206 + }, + { + "epoch": 0.037925980212532066, + "grad_norm": 0.5426409729355149, + "learning_rate": 3.7728937728937733e-06, + "loss": 0.5274, + "step": 207 + }, + { + "epoch": 0.03810919750824478, + "grad_norm": 0.4383066270524436, + "learning_rate": 3.7912087912087915e-06, + "loss": 0.5491, + "step": 208 + }, + { + "epoch": 0.038292414803957495, + "grad_norm": 0.4541571165503901, + "learning_rate": 3.80952380952381e-06, + "loss": 0.511, + "step": 209 + }, + { + "epoch": 0.03847563209967021, + "grad_norm": 0.42914694472434756, + "learning_rate": 3.827838827838828e-06, + "loss": 0.4891, + "step": 210 + }, + { + "epoch": 0.03865884939538292, + "grad_norm": 0.4583586099579229, + "learning_rate": 3.846153846153847e-06, + "loss": 0.5359, + "step": 211 + }, + { + "epoch": 0.03884206669109564, + "grad_norm": 0.46332307938408596, + "learning_rate": 3.864468864468865e-06, + "loss": 0.506, + "step": 212 + }, + { + "epoch": 0.03902528398680836, + "grad_norm": 0.4979093495563886, + "learning_rate": 3.882783882783883e-06, + "loss": 0.5523, + "step": 213 + }, + { + "epoch": 0.03920850128252107, + "grad_norm": 0.524225251704003, + "learning_rate": 3.901098901098901e-06, + "loss": 0.543, + "step": 214 + }, + { + "epoch": 0.039391718578233786, + "grad_norm": 0.5147235908704881, + "learning_rate": 3.91941391941392e-06, + "loss": 0.5465, + "step": 215 + }, + { + "epoch": 0.0395749358739465, + "grad_norm": 0.4170039710750516, + "learning_rate": 3.937728937728938e-06, + "loss": 0.557, + "step": 216 + }, + { + "epoch": 0.039758153169659215, + "grad_norm": 0.4459151957597281, + "learning_rate": 3.9560439560439565e-06, + "loss": 0.5176, + "step": 217 + }, + { + "epoch": 0.03994137046537193, + "grad_norm": 0.4387028825678643, + "learning_rate": 3.974358974358974e-06, + "loss": 0.5433, + "step": 218 + }, + { + "epoch": 0.04012458776108464, + "grad_norm": 0.4606220492260897, + "learning_rate": 3.992673992673993e-06, + "loss": 0.5558, + "step": 219 + }, + { + "epoch": 0.04030780505679736, + "grad_norm": 0.43876788524484817, + "learning_rate": 4.010989010989012e-06, + "loss": 0.496, + "step": 220 + }, + { + "epoch": 0.04049102235251008, + "grad_norm": 0.5354442317270937, + "learning_rate": 4.0293040293040296e-06, + "loss": 0.4883, + "step": 221 + }, + { + "epoch": 0.04067423964822279, + "grad_norm": 0.4471338769246311, + "learning_rate": 4.047619047619048e-06, + "loss": 0.5248, + "step": 222 + }, + { + "epoch": 0.040857456943935506, + "grad_norm": 0.4728450428012797, + "learning_rate": 4.065934065934066e-06, + "loss": 0.5412, + "step": 223 + }, + { + "epoch": 0.041040674239648224, + "grad_norm": 0.5021628515290991, + "learning_rate": 4.084249084249085e-06, + "loss": 0.5398, + "step": 224 + }, + { + "epoch": 0.041223891535360935, + "grad_norm": 0.379469323319607, + "learning_rate": 4.102564102564103e-06, + "loss": 0.5484, + "step": 225 + }, + { + "epoch": 0.04140710883107365, + "grad_norm": 0.4682923741442823, + "learning_rate": 4.120879120879121e-06, + "loss": 0.5221, + "step": 226 + }, + { + "epoch": 0.04159032612678637, + "grad_norm": 0.5298230539031403, + "learning_rate": 4.139194139194139e-06, + "loss": 0.5248, + "step": 227 + }, + { + "epoch": 0.04177354342249908, + "grad_norm": 0.448919135267925, + "learning_rate": 4.157509157509158e-06, + "loss": 0.5492, + "step": 228 + }, + { + "epoch": 0.0419567607182118, + "grad_norm": 0.4651550006247672, + "learning_rate": 4.175824175824177e-06, + "loss": 0.5244, + "step": 229 + }, + { + "epoch": 0.042139978013924516, + "grad_norm": 0.46885786974024124, + "learning_rate": 4.1941391941391945e-06, + "loss": 0.5145, + "step": 230 + }, + { + "epoch": 0.04232319530963723, + "grad_norm": 0.9939588576814024, + "learning_rate": 4.212454212454213e-06, + "loss": 0.507, + "step": 231 + }, + { + "epoch": 0.042506412605349944, + "grad_norm": 0.42445387506164906, + "learning_rate": 4.230769230769231e-06, + "loss": 0.528, + "step": 232 + }, + { + "epoch": 0.04268962990106266, + "grad_norm": 0.4386077325175301, + "learning_rate": 4.249084249084249e-06, + "loss": 0.5235, + "step": 233 + }, + { + "epoch": 0.04287284719677537, + "grad_norm": 0.4275787850644743, + "learning_rate": 4.267399267399268e-06, + "loss": 0.5268, + "step": 234 + }, + { + "epoch": 0.04305606449248809, + "grad_norm": 0.3819534900324145, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.5108, + "step": 235 + }, + { + "epoch": 0.04323928178820081, + "grad_norm": 0.5069196499403993, + "learning_rate": 4.304029304029305e-06, + "loss": 0.4914, + "step": 236 + }, + { + "epoch": 0.04342249908391352, + "grad_norm": 0.4224671659155117, + "learning_rate": 4.322344322344323e-06, + "loss": 0.531, + "step": 237 + }, + { + "epoch": 0.043605716379626236, + "grad_norm": 0.49892435842001814, + "learning_rate": 4.340659340659341e-06, + "loss": 0.5309, + "step": 238 + }, + { + "epoch": 0.043788933675338954, + "grad_norm": 0.4435906661481072, + "learning_rate": 4.358974358974359e-06, + "loss": 0.5386, + "step": 239 + }, + { + "epoch": 0.043972150971051664, + "grad_norm": 0.4646701721197805, + "learning_rate": 4.377289377289377e-06, + "loss": 0.5445, + "step": 240 + }, + { + "epoch": 0.04415536826676438, + "grad_norm": 0.42345412346700445, + "learning_rate": 4.395604395604396e-06, + "loss": 0.5171, + "step": 241 + }, + { + "epoch": 0.0443385855624771, + "grad_norm": 0.4664513677280782, + "learning_rate": 4.413919413919414e-06, + "loss": 0.5471, + "step": 242 + }, + { + "epoch": 0.04452180285818981, + "grad_norm": 0.5007403600133091, + "learning_rate": 4.4322344322344325e-06, + "loss": 0.5224, + "step": 243 + }, + { + "epoch": 0.04470502015390253, + "grad_norm": 0.4202775330369337, + "learning_rate": 4.45054945054945e-06, + "loss": 0.5283, + "step": 244 + }, + { + "epoch": 0.044888237449615245, + "grad_norm": 0.485312343325994, + "learning_rate": 4.468864468864469e-06, + "loss": 0.5365, + "step": 245 + }, + { + "epoch": 0.045071454745327956, + "grad_norm": 0.4796744192562041, + "learning_rate": 4.487179487179488e-06, + "loss": 0.5481, + "step": 246 + }, + { + "epoch": 0.045254672041040674, + "grad_norm": 0.4699525411145978, + "learning_rate": 4.505494505494506e-06, + "loss": 0.526, + "step": 247 + }, + { + "epoch": 0.04543788933675339, + "grad_norm": 0.37235134296143163, + "learning_rate": 4.523809523809524e-06, + "loss": 0.4929, + "step": 248 + }, + { + "epoch": 0.0456211066324661, + "grad_norm": 0.4594065464998237, + "learning_rate": 4.542124542124542e-06, + "loss": 0.5435, + "step": 249 + }, + { + "epoch": 0.04580432392817882, + "grad_norm": 0.4812140084395847, + "learning_rate": 4.560439560439561e-06, + "loss": 0.4714, + "step": 250 + }, + { + "epoch": 0.04598754122389154, + "grad_norm": 0.44275937617791644, + "learning_rate": 4.578754578754579e-06, + "loss": 0.5289, + "step": 251 + }, + { + "epoch": 0.04617075851960425, + "grad_norm": 0.543477861772032, + "learning_rate": 4.5970695970695975e-06, + "loss": 0.5176, + "step": 252 + }, + { + "epoch": 0.046353975815316965, + "grad_norm": 0.9195336625704912, + "learning_rate": 4.615384615384616e-06, + "loss": 0.5315, + "step": 253 + }, + { + "epoch": 0.04653719311102968, + "grad_norm": 0.4922315670719196, + "learning_rate": 4.633699633699634e-06, + "loss": 0.5364, + "step": 254 + }, + { + "epoch": 0.046720410406742394, + "grad_norm": 0.47361690353516367, + "learning_rate": 4.652014652014653e-06, + "loss": 0.5591, + "step": 255 + }, + { + "epoch": 0.04690362770245511, + "grad_norm": 0.5164779414047217, + "learning_rate": 4.6703296703296706e-06, + "loss": 0.5538, + "step": 256 + }, + { + "epoch": 0.04708684499816783, + "grad_norm": 0.421397036777767, + "learning_rate": 4.688644688644689e-06, + "loss": 0.5338, + "step": 257 + }, + { + "epoch": 0.04727006229388054, + "grad_norm": 0.5053871259325204, + "learning_rate": 4.706959706959707e-06, + "loss": 0.544, + "step": 258 + }, + { + "epoch": 0.04745327958959326, + "grad_norm": 0.4607447877406368, + "learning_rate": 4.725274725274726e-06, + "loss": 0.4812, + "step": 259 + }, + { + "epoch": 0.047636496885305975, + "grad_norm": 0.4875422302168998, + "learning_rate": 4.743589743589744e-06, + "loss": 0.5614, + "step": 260 + }, + { + "epoch": 0.047819714181018685, + "grad_norm": 0.41579068200919733, + "learning_rate": 4.761904761904762e-06, + "loss": 0.4729, + "step": 261 + }, + { + "epoch": 0.0480029314767314, + "grad_norm": 0.46282242693556186, + "learning_rate": 4.780219780219781e-06, + "loss": 0.5224, + "step": 262 + }, + { + "epoch": 0.04818614877244412, + "grad_norm": 0.4482830307148575, + "learning_rate": 4.798534798534799e-06, + "loss": 0.5326, + "step": 263 + }, + { + "epoch": 0.04836936606815683, + "grad_norm": 0.4342637761169385, + "learning_rate": 4.816849816849818e-06, + "loss": 0.5056, + "step": 264 + }, + { + "epoch": 0.04855258336386955, + "grad_norm": 0.42342307710917526, + "learning_rate": 4.8351648351648355e-06, + "loss": 0.5049, + "step": 265 + }, + { + "epoch": 0.04873580065958227, + "grad_norm": 0.41567729548709964, + "learning_rate": 4.853479853479854e-06, + "loss": 0.5313, + "step": 266 + }, + { + "epoch": 0.04891901795529498, + "grad_norm": 0.44877730848158315, + "learning_rate": 4.871794871794872e-06, + "loss": 0.5027, + "step": 267 + }, + { + "epoch": 0.049102235251007695, + "grad_norm": 0.46298601715996757, + "learning_rate": 4.890109890109891e-06, + "loss": 0.5418, + "step": 268 + }, + { + "epoch": 0.04928545254672041, + "grad_norm": 0.4233152082129357, + "learning_rate": 4.908424908424909e-06, + "loss": 0.534, + "step": 269 + }, + { + "epoch": 0.04946866984243312, + "grad_norm": 0.42327618076780654, + "learning_rate": 4.926739926739927e-06, + "loss": 0.5073, + "step": 270 + }, + { + "epoch": 0.04965188713814584, + "grad_norm": 0.47132160003804374, + "learning_rate": 4.945054945054946e-06, + "loss": 0.564, + "step": 271 + }, + { + "epoch": 0.04983510443385856, + "grad_norm": 0.467395758671848, + "learning_rate": 4.963369963369964e-06, + "loss": 0.5194, + "step": 272 + }, + { + "epoch": 0.05001832172957127, + "grad_norm": 0.4377241688268797, + "learning_rate": 4.9816849816849826e-06, + "loss": 0.5473, + "step": 273 + }, + { + "epoch": 0.05020153902528399, + "grad_norm": 0.4344310773487788, + "learning_rate": 5e-06, + "loss": 0.5198, + "step": 274 + }, + { + "epoch": 0.050384756320996704, + "grad_norm": 0.505691771505538, + "learning_rate": 5.018315018315018e-06, + "loss": 0.5001, + "step": 275 + }, + { + "epoch": 0.050567973616709415, + "grad_norm": 0.4741702696479342, + "learning_rate": 5.036630036630037e-06, + "loss": 0.5269, + "step": 276 + }, + { + "epoch": 0.05075119091242213, + "grad_norm": 0.4123760853657366, + "learning_rate": 5.054945054945055e-06, + "loss": 0.5366, + "step": 277 + }, + { + "epoch": 0.05093440820813485, + "grad_norm": 0.5165952852785715, + "learning_rate": 5.0732600732600735e-06, + "loss": 0.5629, + "step": 278 + }, + { + "epoch": 0.05111762550384756, + "grad_norm": 0.6018743338441076, + "learning_rate": 5.091575091575091e-06, + "loss": 0.5268, + "step": 279 + }, + { + "epoch": 0.05130084279956028, + "grad_norm": 0.4647905088113548, + "learning_rate": 5.10989010989011e-06, + "loss": 0.5353, + "step": 280 + }, + { + "epoch": 0.051484060095272996, + "grad_norm": 0.4666862939383661, + "learning_rate": 5.128205128205128e-06, + "loss": 0.5582, + "step": 281 + }, + { + "epoch": 0.05166727739098571, + "grad_norm": 0.42295571485014016, + "learning_rate": 5.146520146520147e-06, + "loss": 0.5389, + "step": 282 + }, + { + "epoch": 0.051850494686698424, + "grad_norm": 0.5088982589681916, + "learning_rate": 5.164835164835166e-06, + "loss": 0.5114, + "step": 283 + }, + { + "epoch": 0.05203371198241114, + "grad_norm": 0.39252264391052066, + "learning_rate": 5.183150183150184e-06, + "loss": 0.5078, + "step": 284 + }, + { + "epoch": 0.05221692927812385, + "grad_norm": 0.47464881831711925, + "learning_rate": 5.201465201465202e-06, + "loss": 0.5719, + "step": 285 + }, + { + "epoch": 0.05240014657383657, + "grad_norm": 0.42811181510690394, + "learning_rate": 5.219780219780221e-06, + "loss": 0.536, + "step": 286 + }, + { + "epoch": 0.05258336386954929, + "grad_norm": 0.4983761768332983, + "learning_rate": 5.2380952380952384e-06, + "loss": 0.5454, + "step": 287 + }, + { + "epoch": 0.052766581165262, + "grad_norm": 0.3871752405078846, + "learning_rate": 5.256410256410257e-06, + "loss": 0.5172, + "step": 288 + }, + { + "epoch": 0.052949798460974716, + "grad_norm": 0.44878294094458826, + "learning_rate": 5.274725274725275e-06, + "loss": 0.4965, + "step": 289 + }, + { + "epoch": 0.053133015756687434, + "grad_norm": 0.4193937449864018, + "learning_rate": 5.293040293040294e-06, + "loss": 0.5418, + "step": 290 + }, + { + "epoch": 0.053316233052400144, + "grad_norm": 0.47326184468203625, + "learning_rate": 5.3113553113553116e-06, + "loss": 0.5465, + "step": 291 + }, + { + "epoch": 0.05349945034811286, + "grad_norm": 0.46890596965933473, + "learning_rate": 5.32967032967033e-06, + "loss": 0.5427, + "step": 292 + }, + { + "epoch": 0.05368266764382558, + "grad_norm": 0.4718404244115825, + "learning_rate": 5.347985347985348e-06, + "loss": 0.5315, + "step": 293 + }, + { + "epoch": 0.05386588493953829, + "grad_norm": 0.5313479853203268, + "learning_rate": 5.366300366300367e-06, + "loss": 0.5261, + "step": 294 + }, + { + "epoch": 0.05404910223525101, + "grad_norm": 0.4619862699623299, + "learning_rate": 5.384615384615385e-06, + "loss": 0.5224, + "step": 295 + }, + { + "epoch": 0.054232319530963725, + "grad_norm": 0.4235637361399875, + "learning_rate": 5.402930402930403e-06, + "loss": 0.5484, + "step": 296 + }, + { + "epoch": 0.054415536826676436, + "grad_norm": 0.4826898937600368, + "learning_rate": 5.421245421245421e-06, + "loss": 0.524, + "step": 297 + }, + { + "epoch": 0.054598754122389154, + "grad_norm": 0.43904512964940123, + "learning_rate": 5.43956043956044e-06, + "loss": 0.5486, + "step": 298 + }, + { + "epoch": 0.05478197141810187, + "grad_norm": 0.5045757484045217, + "learning_rate": 5.457875457875458e-06, + "loss": 0.5407, + "step": 299 + }, + { + "epoch": 0.05496518871381458, + "grad_norm": 0.47829971819207484, + "learning_rate": 5.476190476190477e-06, + "loss": 0.5344, + "step": 300 + }, + { + "epoch": 0.0551484060095273, + "grad_norm": 0.416644246441645, + "learning_rate": 5.494505494505495e-06, + "loss": 0.5111, + "step": 301 + }, + { + "epoch": 0.05533162330524002, + "grad_norm": 0.488275746902462, + "learning_rate": 5.512820512820514e-06, + "loss": 0.54, + "step": 302 + }, + { + "epoch": 0.05551484060095273, + "grad_norm": 0.43082352297647686, + "learning_rate": 5.531135531135532e-06, + "loss": 0.5219, + "step": 303 + }, + { + "epoch": 0.055698057896665445, + "grad_norm": 0.41708996725660685, + "learning_rate": 5.5494505494505504e-06, + "loss": 0.5272, + "step": 304 + }, + { + "epoch": 0.05588127519237816, + "grad_norm": 0.4748217492221608, + "learning_rate": 5.567765567765568e-06, + "loss": 0.5439, + "step": 305 + }, + { + "epoch": 0.056064492488090874, + "grad_norm": 0.5257169187612324, + "learning_rate": 5.586080586080587e-06, + "loss": 0.5671, + "step": 306 + }, + { + "epoch": 0.05624770978380359, + "grad_norm": 0.4243472668028098, + "learning_rate": 5.604395604395605e-06, + "loss": 0.5044, + "step": 307 + }, + { + "epoch": 0.05643092707951631, + "grad_norm": 0.43876355547814727, + "learning_rate": 5.6227106227106235e-06, + "loss": 0.4815, + "step": 308 + }, + { + "epoch": 0.05661414437522902, + "grad_norm": 0.43342398130791976, + "learning_rate": 5.641025641025641e-06, + "loss": 0.5308, + "step": 309 + }, + { + "epoch": 0.05679736167094174, + "grad_norm": 0.4660705177035744, + "learning_rate": 5.65934065934066e-06, + "loss": 0.5686, + "step": 310 + }, + { + "epoch": 0.056980578966654455, + "grad_norm": 0.4170384834874546, + "learning_rate": 5.677655677655678e-06, + "loss": 0.4965, + "step": 311 + }, + { + "epoch": 0.057163796262367166, + "grad_norm": 0.4568771189697619, + "learning_rate": 5.695970695970696e-06, + "loss": 0.5074, + "step": 312 + }, + { + "epoch": 0.05734701355807988, + "grad_norm": 0.4666988354433752, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.5225, + "step": 313 + }, + { + "epoch": 0.0575302308537926, + "grad_norm": 0.38305953336818827, + "learning_rate": 5.732600732600732e-06, + "loss": 0.5057, + "step": 314 + }, + { + "epoch": 0.05771344814950531, + "grad_norm": 0.8971752127635111, + "learning_rate": 5.750915750915751e-06, + "loss": 0.5556, + "step": 315 + }, + { + "epoch": 0.05789666544521803, + "grad_norm": 0.4580428245754137, + "learning_rate": 5.769230769230769e-06, + "loss": 0.5344, + "step": 316 + }, + { + "epoch": 0.05807988274093075, + "grad_norm": 0.414647309534276, + "learning_rate": 5.7875457875457885e-06, + "loss": 0.5177, + "step": 317 + }, + { + "epoch": 0.05826310003664346, + "grad_norm": 0.45495854796733415, + "learning_rate": 5.805860805860807e-06, + "loss": 0.5214, + "step": 318 + }, + { + "epoch": 0.058446317332356175, + "grad_norm": 0.47098300998769715, + "learning_rate": 5.824175824175825e-06, + "loss": 0.5519, + "step": 319 + }, + { + "epoch": 0.05862953462806889, + "grad_norm": 0.43335718768494447, + "learning_rate": 5.842490842490844e-06, + "loss": 0.5282, + "step": 320 + }, + { + "epoch": 0.0588127519237816, + "grad_norm": 0.4649559782288625, + "learning_rate": 5.860805860805862e-06, + "loss": 0.5163, + "step": 321 + }, + { + "epoch": 0.05899596921949432, + "grad_norm": 0.44623794630893965, + "learning_rate": 5.8791208791208794e-06, + "loss": 0.514, + "step": 322 + }, + { + "epoch": 0.05917918651520704, + "grad_norm": 0.4521538486948526, + "learning_rate": 5.897435897435898e-06, + "loss": 0.5248, + "step": 323 + }, + { + "epoch": 0.05936240381091975, + "grad_norm": 0.453853173278213, + "learning_rate": 5.915750915750916e-06, + "loss": 0.5105, + "step": 324 + }, + { + "epoch": 0.05954562110663247, + "grad_norm": 0.45615939787874893, + "learning_rate": 5.934065934065935e-06, + "loss": 0.5321, + "step": 325 + }, + { + "epoch": 0.059728838402345184, + "grad_norm": 0.5103507598238842, + "learning_rate": 5.9523809523809525e-06, + "loss": 0.4889, + "step": 326 + }, + { + "epoch": 0.059912055698057895, + "grad_norm": 0.4755868373014201, + "learning_rate": 5.970695970695971e-06, + "loss": 0.5233, + "step": 327 + }, + { + "epoch": 0.06009527299377061, + "grad_norm": 0.48186711326734255, + "learning_rate": 5.989010989010989e-06, + "loss": 0.5216, + "step": 328 + }, + { + "epoch": 0.06027849028948333, + "grad_norm": 0.406290487409726, + "learning_rate": 6.007326007326008e-06, + "loss": 0.5048, + "step": 329 + }, + { + "epoch": 0.06046170758519604, + "grad_norm": 0.47965200235867606, + "learning_rate": 6.025641025641026e-06, + "loss": 0.513, + "step": 330 + }, + { + "epoch": 0.06064492488090876, + "grad_norm": 0.492103246179344, + "learning_rate": 6.043956043956044e-06, + "loss": 0.5553, + "step": 331 + }, + { + "epoch": 0.060828142176621476, + "grad_norm": 0.49171785130479284, + "learning_rate": 6.062271062271062e-06, + "loss": 0.5289, + "step": 332 + }, + { + "epoch": 0.06101135947233419, + "grad_norm": 0.4659535600554528, + "learning_rate": 6.080586080586081e-06, + "loss": 0.5143, + "step": 333 + }, + { + "epoch": 0.061194576768046904, + "grad_norm": 0.43136677550682173, + "learning_rate": 6.0989010989011e-06, + "loss": 0.469, + "step": 334 + }, + { + "epoch": 0.06137779406375962, + "grad_norm": 0.6094192173431031, + "learning_rate": 6.117216117216118e-06, + "loss": 0.5121, + "step": 335 + }, + { + "epoch": 0.06156101135947233, + "grad_norm": 0.45352827270619606, + "learning_rate": 6.135531135531136e-06, + "loss": 0.5097, + "step": 336 + }, + { + "epoch": 0.06174422865518505, + "grad_norm": 0.42626268219917746, + "learning_rate": 6.153846153846155e-06, + "loss": 0.5234, + "step": 337 + }, + { + "epoch": 0.06192744595089777, + "grad_norm": 0.4438479757326601, + "learning_rate": 6.172161172161173e-06, + "loss": 0.4852, + "step": 338 + }, + { + "epoch": 0.06211066324661048, + "grad_norm": 0.688188614062373, + "learning_rate": 6.1904761904761914e-06, + "loss": 0.502, + "step": 339 + }, + { + "epoch": 0.062293880542323196, + "grad_norm": 0.5160627484540854, + "learning_rate": 6.208791208791209e-06, + "loss": 0.521, + "step": 340 + }, + { + "epoch": 0.062477097838035914, + "grad_norm": 0.4356067335955304, + "learning_rate": 6.227106227106228e-06, + "loss": 0.4918, + "step": 341 + }, + { + "epoch": 0.06266031513374863, + "grad_norm": 0.5001791021027777, + "learning_rate": 6.245421245421246e-06, + "loss": 0.5173, + "step": 342 + }, + { + "epoch": 0.06284353242946134, + "grad_norm": 0.45621279623031163, + "learning_rate": 6.2637362637362645e-06, + "loss": 0.5532, + "step": 343 + }, + { + "epoch": 0.06302674972517405, + "grad_norm": 0.4686583276600699, + "learning_rate": 6.282051282051282e-06, + "loss": 0.5544, + "step": 344 + }, + { + "epoch": 0.06320996702088677, + "grad_norm": 0.4500457038114704, + "learning_rate": 6.300366300366301e-06, + "loss": 0.5094, + "step": 345 + }, + { + "epoch": 0.06339318431659949, + "grad_norm": 0.4707435610591325, + "learning_rate": 6.318681318681319e-06, + "loss": 0.5317, + "step": 346 + }, + { + "epoch": 0.0635764016123122, + "grad_norm": 0.44910822533973516, + "learning_rate": 6.336996336996337e-06, + "loss": 0.5063, + "step": 347 + }, + { + "epoch": 0.06375961890802492, + "grad_norm": 0.4262957774336457, + "learning_rate": 6.3553113553113555e-06, + "loss": 0.5389, + "step": 348 + }, + { + "epoch": 0.06394283620373763, + "grad_norm": 0.4958715530578741, + "learning_rate": 6.373626373626373e-06, + "loss": 0.5253, + "step": 349 + }, + { + "epoch": 0.06412605349945034, + "grad_norm": 0.4706503863353741, + "learning_rate": 6.391941391941392e-06, + "loss": 0.5078, + "step": 350 + }, + { + "epoch": 0.06430927079516306, + "grad_norm": 0.5025375698465898, + "learning_rate": 6.410256410256412e-06, + "loss": 0.5366, + "step": 351 + }, + { + "epoch": 0.06449248809087578, + "grad_norm": 0.43307402952194485, + "learning_rate": 6.4285714285714295e-06, + "loss": 0.5053, + "step": 352 + }, + { + "epoch": 0.0646757053865885, + "grad_norm": 0.4145957842766705, + "learning_rate": 6.446886446886448e-06, + "loss": 0.5134, + "step": 353 + }, + { + "epoch": 0.06485892268230121, + "grad_norm": 0.42558146560441634, + "learning_rate": 6.465201465201466e-06, + "loss": 0.498, + "step": 354 + }, + { + "epoch": 0.06504213997801392, + "grad_norm": 0.6808736114735602, + "learning_rate": 6.483516483516485e-06, + "loss": 0.4865, + "step": 355 + }, + { + "epoch": 0.06522535727372664, + "grad_norm": 0.4858578720351213, + "learning_rate": 6.5018315018315026e-06, + "loss": 0.4977, + "step": 356 + }, + { + "epoch": 0.06540857456943935, + "grad_norm": 0.5523209851617275, + "learning_rate": 6.5201465201465204e-06, + "loss": 0.5278, + "step": 357 + }, + { + "epoch": 0.06559179186515207, + "grad_norm": 0.4756243917905379, + "learning_rate": 6.538461538461539e-06, + "loss": 0.5018, + "step": 358 + }, + { + "epoch": 0.06577500916086479, + "grad_norm": 0.5270789359516691, + "learning_rate": 6.556776556776557e-06, + "loss": 0.5127, + "step": 359 + }, + { + "epoch": 0.0659582264565775, + "grad_norm": 0.5213313487503423, + "learning_rate": 6.575091575091576e-06, + "loss": 0.504, + "step": 360 + }, + { + "epoch": 0.06614144375229021, + "grad_norm": 0.47699985237826076, + "learning_rate": 6.5934065934065935e-06, + "loss": 0.5356, + "step": 361 + }, + { + "epoch": 0.06632466104800293, + "grad_norm": 0.4398115760336562, + "learning_rate": 6.611721611721612e-06, + "loss": 0.5038, + "step": 362 + }, + { + "epoch": 0.06650787834371565, + "grad_norm": 0.5848808647247892, + "learning_rate": 6.63003663003663e-06, + "loss": 0.5393, + "step": 363 + }, + { + "epoch": 0.06669109563942836, + "grad_norm": 0.40297449884724584, + "learning_rate": 6.648351648351649e-06, + "loss": 0.5231, + "step": 364 + }, + { + "epoch": 0.06687431293514108, + "grad_norm": 0.4615954062532406, + "learning_rate": 6.666666666666667e-06, + "loss": 0.5017, + "step": 365 + }, + { + "epoch": 0.0670575302308538, + "grad_norm": 0.4204668889117905, + "learning_rate": 6.684981684981685e-06, + "loss": 0.5651, + "step": 366 + }, + { + "epoch": 0.0672407475265665, + "grad_norm": 0.4765525618556606, + "learning_rate": 6.703296703296703e-06, + "loss": 0.5253, + "step": 367 + }, + { + "epoch": 0.06742396482227922, + "grad_norm": 0.431663431335195, + "learning_rate": 6.721611721611723e-06, + "loss": 0.5398, + "step": 368 + }, + { + "epoch": 0.06760718211799194, + "grad_norm": 0.4403652862248983, + "learning_rate": 6.739926739926741e-06, + "loss": 0.5252, + "step": 369 + }, + { + "epoch": 0.06779039941370466, + "grad_norm": 0.483725795531489, + "learning_rate": 6.758241758241759e-06, + "loss": 0.5349, + "step": 370 + }, + { + "epoch": 0.06797361670941737, + "grad_norm": 0.8892216780264888, + "learning_rate": 6.776556776556777e-06, + "loss": 0.5184, + "step": 371 + }, + { + "epoch": 0.06815683400513009, + "grad_norm": 0.48775432933817636, + "learning_rate": 6.794871794871796e-06, + "loss": 0.5275, + "step": 372 + }, + { + "epoch": 0.0683400513008428, + "grad_norm": 0.4772597017011689, + "learning_rate": 6.813186813186814e-06, + "loss": 0.5517, + "step": 373 + }, + { + "epoch": 0.06852326859655551, + "grad_norm": 0.46418441159832313, + "learning_rate": 6.831501831501832e-06, + "loss": 0.5429, + "step": 374 + }, + { + "epoch": 0.06870648589226823, + "grad_norm": 0.45474945807496187, + "learning_rate": 6.84981684981685e-06, + "loss": 0.4999, + "step": 375 + }, + { + "epoch": 0.06888970318798095, + "grad_norm": 0.5576640330118253, + "learning_rate": 6.868131868131869e-06, + "loss": 0.5198, + "step": 376 + }, + { + "epoch": 0.06907292048369366, + "grad_norm": 0.4921371781399045, + "learning_rate": 6.886446886446887e-06, + "loss": 0.5379, + "step": 377 + }, + { + "epoch": 0.06925613777940638, + "grad_norm": 0.41639097386056484, + "learning_rate": 6.9047619047619055e-06, + "loss": 0.519, + "step": 378 + }, + { + "epoch": 0.06943935507511909, + "grad_norm": 0.4778909271586564, + "learning_rate": 6.923076923076923e-06, + "loss": 0.5336, + "step": 379 + }, + { + "epoch": 0.0696225723708318, + "grad_norm": 0.4735414706699025, + "learning_rate": 6.941391941391942e-06, + "loss": 0.5, + "step": 380 + }, + { + "epoch": 0.06980578966654452, + "grad_norm": 0.5646322357431656, + "learning_rate": 6.95970695970696e-06, + "loss": 0.5073, + "step": 381 + }, + { + "epoch": 0.06998900696225724, + "grad_norm": 0.48231469572609614, + "learning_rate": 6.978021978021979e-06, + "loss": 0.519, + "step": 382 + }, + { + "epoch": 0.07017222425796996, + "grad_norm": 0.5051213072897166, + "learning_rate": 6.9963369963369965e-06, + "loss": 0.5315, + "step": 383 + }, + { + "epoch": 0.07035544155368267, + "grad_norm": 0.5166250853237789, + "learning_rate": 7.014652014652014e-06, + "loss": 0.4978, + "step": 384 + }, + { + "epoch": 0.07053865884939538, + "grad_norm": 0.43900982727967125, + "learning_rate": 7.032967032967034e-06, + "loss": 0.4913, + "step": 385 + }, + { + "epoch": 0.0707218761451081, + "grad_norm": 0.5262819691743885, + "learning_rate": 7.051282051282053e-06, + "loss": 0.5189, + "step": 386 + }, + { + "epoch": 0.07090509344082081, + "grad_norm": 0.4906825009780104, + "learning_rate": 7.0695970695970705e-06, + "loss": 0.5586, + "step": 387 + }, + { + "epoch": 0.07108831073653353, + "grad_norm": 0.48825547184252527, + "learning_rate": 7.087912087912089e-06, + "loss": 0.5218, + "step": 388 + }, + { + "epoch": 0.07127152803224625, + "grad_norm": 0.5005437189224704, + "learning_rate": 7.106227106227107e-06, + "loss": 0.5088, + "step": 389 + }, + { + "epoch": 0.07145474532795897, + "grad_norm": 0.4820090004987099, + "learning_rate": 7.124542124542126e-06, + "loss": 0.5132, + "step": 390 + }, + { + "epoch": 0.07163796262367167, + "grad_norm": 0.46585258023856246, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.5198, + "step": 391 + }, + { + "epoch": 0.07182117991938439, + "grad_norm": 0.5351346796257885, + "learning_rate": 7.161172161172162e-06, + "loss": 0.5174, + "step": 392 + }, + { + "epoch": 0.0720043972150971, + "grad_norm": 0.505980784032641, + "learning_rate": 7.17948717948718e-06, + "loss": 0.517, + "step": 393 + }, + { + "epoch": 0.07218761451080982, + "grad_norm": 0.3991493735258601, + "learning_rate": 7.197802197802198e-06, + "loss": 0.5193, + "step": 394 + }, + { + "epoch": 0.07237083180652254, + "grad_norm": 0.47050897190858826, + "learning_rate": 7.216117216117217e-06, + "loss": 0.5278, + "step": 395 + }, + { + "epoch": 0.07255404910223526, + "grad_norm": 0.42709223237684557, + "learning_rate": 7.2344322344322345e-06, + "loss": 0.5233, + "step": 396 + }, + { + "epoch": 0.07273726639794796, + "grad_norm": 0.40701893214503493, + "learning_rate": 7.252747252747253e-06, + "loss": 0.4594, + "step": 397 + }, + { + "epoch": 0.07292048369366068, + "grad_norm": 0.4793723507064191, + "learning_rate": 7.271062271062271e-06, + "loss": 0.5022, + "step": 398 + }, + { + "epoch": 0.0731037009893734, + "grad_norm": 0.4084855491847111, + "learning_rate": 7.28937728937729e-06, + "loss": 0.5177, + "step": 399 + }, + { + "epoch": 0.07328691828508611, + "grad_norm": 0.48791204362548757, + "learning_rate": 7.307692307692308e-06, + "loss": 0.5336, + "step": 400 + }, + { + "epoch": 0.07347013558079883, + "grad_norm": 0.4721917787188149, + "learning_rate": 7.326007326007326e-06, + "loss": 0.5291, + "step": 401 + }, + { + "epoch": 0.07365335287651155, + "grad_norm": 0.4482564665450733, + "learning_rate": 7.344322344322346e-06, + "loss": 0.516, + "step": 402 + }, + { + "epoch": 0.07383657017222425, + "grad_norm": 0.503350703275625, + "learning_rate": 7.362637362637364e-06, + "loss": 0.5232, + "step": 403 + }, + { + "epoch": 0.07401978746793697, + "grad_norm": 0.5152368271826678, + "learning_rate": 7.380952380952382e-06, + "loss": 0.5258, + "step": 404 + }, + { + "epoch": 0.07420300476364969, + "grad_norm": 0.4523690267979874, + "learning_rate": 7.3992673992674e-06, + "loss": 0.5007, + "step": 405 + }, + { + "epoch": 0.0743862220593624, + "grad_norm": 0.44122338596098665, + "learning_rate": 7.417582417582418e-06, + "loss": 0.5325, + "step": 406 + }, + { + "epoch": 0.07456943935507512, + "grad_norm": 0.418021168922302, + "learning_rate": 7.435897435897437e-06, + "loss": 0.5048, + "step": 407 + }, + { + "epoch": 0.07475265665078784, + "grad_norm": 0.44537098554090276, + "learning_rate": 7.454212454212455e-06, + "loss": 0.5294, + "step": 408 + }, + { + "epoch": 0.07493587394650054, + "grad_norm": 0.41384101894590325, + "learning_rate": 7.472527472527473e-06, + "loss": 0.5119, + "step": 409 + }, + { + "epoch": 0.07511909124221326, + "grad_norm": 0.4647470180215214, + "learning_rate": 7.490842490842491e-06, + "loss": 0.5202, + "step": 410 + }, + { + "epoch": 0.07530230853792598, + "grad_norm": 0.4836409216489063, + "learning_rate": 7.50915750915751e-06, + "loss": 0.5056, + "step": 411 + }, + { + "epoch": 0.0754855258336387, + "grad_norm": 0.45195196209865357, + "learning_rate": 7.527472527472528e-06, + "loss": 0.5213, + "step": 412 + }, + { + "epoch": 0.07566874312935142, + "grad_norm": 0.5036829282617328, + "learning_rate": 7.5457875457875465e-06, + "loss": 0.5347, + "step": 413 + }, + { + "epoch": 0.07585196042506413, + "grad_norm": 0.4666038283310089, + "learning_rate": 7.564102564102564e-06, + "loss": 0.5151, + "step": 414 + }, + { + "epoch": 0.07603517772077684, + "grad_norm": 0.5067769344340564, + "learning_rate": 7.582417582417583e-06, + "loss": 0.5171, + "step": 415 + }, + { + "epoch": 0.07621839501648955, + "grad_norm": 0.4093230258713508, + "learning_rate": 7.600732600732601e-06, + "loss": 0.5039, + "step": 416 + }, + { + "epoch": 0.07640161231220227, + "grad_norm": 0.6498006485071868, + "learning_rate": 7.61904761904762e-06, + "loss": 0.5232, + "step": 417 + }, + { + "epoch": 0.07658482960791499, + "grad_norm": 0.4558152414900924, + "learning_rate": 7.637362637362638e-06, + "loss": 0.5232, + "step": 418 + }, + { + "epoch": 0.0767680469036277, + "grad_norm": 0.4804930639066785, + "learning_rate": 7.655677655677656e-06, + "loss": 0.5088, + "step": 419 + }, + { + "epoch": 0.07695126419934042, + "grad_norm": 0.5081120095319347, + "learning_rate": 7.673992673992676e-06, + "loss": 0.5308, + "step": 420 + }, + { + "epoch": 0.07713448149505313, + "grad_norm": 0.42393212923875745, + "learning_rate": 7.692307692307694e-06, + "loss": 0.5084, + "step": 421 + }, + { + "epoch": 0.07731769879076585, + "grad_norm": 0.44672075335716155, + "learning_rate": 7.710622710622711e-06, + "loss": 0.483, + "step": 422 + }, + { + "epoch": 0.07750091608647856, + "grad_norm": 0.4942211324726324, + "learning_rate": 7.72893772893773e-06, + "loss": 0.5236, + "step": 423 + }, + { + "epoch": 0.07768413338219128, + "grad_norm": 0.39817279842841763, + "learning_rate": 7.747252747252749e-06, + "loss": 0.5332, + "step": 424 + }, + { + "epoch": 0.077867350677904, + "grad_norm": 0.4608671155196199, + "learning_rate": 7.765567765567767e-06, + "loss": 0.523, + "step": 425 + }, + { + "epoch": 0.07805056797361672, + "grad_norm": 0.5225991254379231, + "learning_rate": 7.783882783882785e-06, + "loss": 0.5171, + "step": 426 + }, + { + "epoch": 0.07823378526932942, + "grad_norm": 0.5423490196220466, + "learning_rate": 7.802197802197802e-06, + "loss": 0.4985, + "step": 427 + }, + { + "epoch": 0.07841700256504214, + "grad_norm": 0.502419506962013, + "learning_rate": 7.820512820512822e-06, + "loss": 0.5087, + "step": 428 + }, + { + "epoch": 0.07860021986075486, + "grad_norm": 0.5102984182578104, + "learning_rate": 7.83882783882784e-06, + "loss": 0.5358, + "step": 429 + }, + { + "epoch": 0.07878343715646757, + "grad_norm": 0.4742126281062651, + "learning_rate": 7.857142857142858e-06, + "loss": 0.5585, + "step": 430 + }, + { + "epoch": 0.07896665445218029, + "grad_norm": 0.508787409495283, + "learning_rate": 7.875457875457876e-06, + "loss": 0.5411, + "step": 431 + }, + { + "epoch": 0.079149871747893, + "grad_norm": 0.4559258370924022, + "learning_rate": 7.893772893772893e-06, + "loss": 0.5229, + "step": 432 + }, + { + "epoch": 0.07933308904360571, + "grad_norm": 0.6320276655690735, + "learning_rate": 7.912087912087913e-06, + "loss": 0.4688, + "step": 433 + }, + { + "epoch": 0.07951630633931843, + "grad_norm": 0.9797165625621964, + "learning_rate": 7.93040293040293e-06, + "loss": 0.5372, + "step": 434 + }, + { + "epoch": 0.07969952363503115, + "grad_norm": 0.43648526458810954, + "learning_rate": 7.948717948717949e-06, + "loss": 0.4528, + "step": 435 + }, + { + "epoch": 0.07988274093074386, + "grad_norm": 0.46101717931557235, + "learning_rate": 7.967032967032966e-06, + "loss": 0.4836, + "step": 436 + }, + { + "epoch": 0.08006595822645658, + "grad_norm": 0.46019219132505085, + "learning_rate": 7.985347985347986e-06, + "loss": 0.4874, + "step": 437 + }, + { + "epoch": 0.08024917552216929, + "grad_norm": 0.4228905699110607, + "learning_rate": 8.003663003663006e-06, + "loss": 0.483, + "step": 438 + }, + { + "epoch": 0.080432392817882, + "grad_norm": 0.4920935432281776, + "learning_rate": 8.021978021978023e-06, + "loss": 0.489, + "step": 439 + }, + { + "epoch": 0.08061561011359472, + "grad_norm": 0.45385267234753407, + "learning_rate": 8.040293040293041e-06, + "loss": 0.5262, + "step": 440 + }, + { + "epoch": 0.08079882740930744, + "grad_norm": 0.48390990285633756, + "learning_rate": 8.058608058608059e-06, + "loss": 0.4969, + "step": 441 + }, + { + "epoch": 0.08098204470502016, + "grad_norm": 0.4424118373260341, + "learning_rate": 8.076923076923077e-06, + "loss": 0.5217, + "step": 442 + }, + { + "epoch": 0.08116526200073287, + "grad_norm": 0.4647327426734162, + "learning_rate": 8.095238095238097e-06, + "loss": 0.5103, + "step": 443 + }, + { + "epoch": 0.08134847929644558, + "grad_norm": 0.42019112334691044, + "learning_rate": 8.113553113553114e-06, + "loss": 0.5378, + "step": 444 + }, + { + "epoch": 0.0815316965921583, + "grad_norm": 0.40127483204178854, + "learning_rate": 8.131868131868132e-06, + "loss": 0.4813, + "step": 445 + }, + { + "epoch": 0.08171491388787101, + "grad_norm": 0.4606579465800615, + "learning_rate": 8.15018315018315e-06, + "loss": 0.5067, + "step": 446 + }, + { + "epoch": 0.08189813118358373, + "grad_norm": 0.4715535246345655, + "learning_rate": 8.16849816849817e-06, + "loss": 0.5181, + "step": 447 + }, + { + "epoch": 0.08208134847929645, + "grad_norm": 0.5386430386856308, + "learning_rate": 8.186813186813188e-06, + "loss": 0.5188, + "step": 448 + }, + { + "epoch": 0.08226456577500917, + "grad_norm": 0.5389030998275688, + "learning_rate": 8.205128205128205e-06, + "loss": 0.516, + "step": 449 + }, + { + "epoch": 0.08244778307072187, + "grad_norm": 0.5181954163327931, + "learning_rate": 8.223443223443223e-06, + "loss": 0.5099, + "step": 450 + }, + { + "epoch": 0.08263100036643459, + "grad_norm": 0.5425277965290457, + "learning_rate": 8.241758241758243e-06, + "loss": 0.5508, + "step": 451 + }, + { + "epoch": 0.0828142176621473, + "grad_norm": 0.4366152129754919, + "learning_rate": 8.26007326007326e-06, + "loss": 0.4772, + "step": 452 + }, + { + "epoch": 0.08299743495786002, + "grad_norm": 0.45607475441878936, + "learning_rate": 8.278388278388278e-06, + "loss": 0.5319, + "step": 453 + }, + { + "epoch": 0.08318065225357274, + "grad_norm": 0.4712644997156548, + "learning_rate": 8.296703296703298e-06, + "loss": 0.5477, + "step": 454 + }, + { + "epoch": 0.08336386954928546, + "grad_norm": 0.6856852588986854, + "learning_rate": 8.315018315018316e-06, + "loss": 0.5235, + "step": 455 + }, + { + "epoch": 0.08354708684499816, + "grad_norm": 0.5019345602931549, + "learning_rate": 8.333333333333334e-06, + "loss": 0.5101, + "step": 456 + }, + { + "epoch": 0.08373030414071088, + "grad_norm": 0.4155972768165925, + "learning_rate": 8.351648351648353e-06, + "loss": 0.4986, + "step": 457 + }, + { + "epoch": 0.0839135214364236, + "grad_norm": 0.45725631484653695, + "learning_rate": 8.369963369963371e-06, + "loss": 0.5233, + "step": 458 + }, + { + "epoch": 0.08409673873213631, + "grad_norm": 0.5500464841633228, + "learning_rate": 8.388278388278389e-06, + "loss": 0.5225, + "step": 459 + }, + { + "epoch": 0.08427995602784903, + "grad_norm": 0.5210381864086956, + "learning_rate": 8.406593406593407e-06, + "loss": 0.5509, + "step": 460 + }, + { + "epoch": 0.08446317332356175, + "grad_norm": 0.4559821107303101, + "learning_rate": 8.424908424908426e-06, + "loss": 0.5118, + "step": 461 + }, + { + "epoch": 0.08464639061927445, + "grad_norm": 0.48323545775926524, + "learning_rate": 8.443223443223444e-06, + "loss": 0.5049, + "step": 462 + }, + { + "epoch": 0.08482960791498717, + "grad_norm": 0.4812859935236779, + "learning_rate": 8.461538461538462e-06, + "loss": 0.5204, + "step": 463 + }, + { + "epoch": 0.08501282521069989, + "grad_norm": 0.454474341410942, + "learning_rate": 8.47985347985348e-06, + "loss": 0.5387, + "step": 464 + }, + { + "epoch": 0.0851960425064126, + "grad_norm": 0.6615650181808158, + "learning_rate": 8.498168498168498e-06, + "loss": 0.5393, + "step": 465 + }, + { + "epoch": 0.08537925980212532, + "grad_norm": 0.4355325124699567, + "learning_rate": 8.516483516483517e-06, + "loss": 0.5027, + "step": 466 + }, + { + "epoch": 0.08556247709783804, + "grad_norm": 0.43977659231205646, + "learning_rate": 8.534798534798535e-06, + "loss": 0.4851, + "step": 467 + }, + { + "epoch": 0.08574569439355074, + "grad_norm": 0.5490378411527629, + "learning_rate": 8.553113553113553e-06, + "loss": 0.5087, + "step": 468 + }, + { + "epoch": 0.08592891168926346, + "grad_norm": 0.4334113007101785, + "learning_rate": 8.571428571428571e-06, + "loss": 0.4874, + "step": 469 + }, + { + "epoch": 0.08611212898497618, + "grad_norm": 0.48093462345535853, + "learning_rate": 8.58974358974359e-06, + "loss": 0.5103, + "step": 470 + }, + { + "epoch": 0.0862953462806889, + "grad_norm": 0.509104778628553, + "learning_rate": 8.60805860805861e-06, + "loss": 0.5125, + "step": 471 + }, + { + "epoch": 0.08647856357640162, + "grad_norm": 0.46329459466937095, + "learning_rate": 8.626373626373628e-06, + "loss": 0.4862, + "step": 472 + }, + { + "epoch": 0.08666178087211433, + "grad_norm": 0.41221910536829426, + "learning_rate": 8.644688644688646e-06, + "loss": 0.5058, + "step": 473 + }, + { + "epoch": 0.08684499816782704, + "grad_norm": 0.4965957595147397, + "learning_rate": 8.663003663003664e-06, + "loss": 0.5045, + "step": 474 + }, + { + "epoch": 0.08702821546353975, + "grad_norm": 0.6831024596734047, + "learning_rate": 8.681318681318681e-06, + "loss": 0.533, + "step": 475 + }, + { + "epoch": 0.08721143275925247, + "grad_norm": 0.44117689667679433, + "learning_rate": 8.699633699633701e-06, + "loss": 0.494, + "step": 476 + }, + { + "epoch": 0.08739465005496519, + "grad_norm": 0.49552902332130444, + "learning_rate": 8.717948717948719e-06, + "loss": 0.5295, + "step": 477 + }, + { + "epoch": 0.08757786735067791, + "grad_norm": 0.5202864223018397, + "learning_rate": 8.736263736263737e-06, + "loss": 0.5315, + "step": 478 + }, + { + "epoch": 0.08776108464639062, + "grad_norm": 0.4531456591891388, + "learning_rate": 8.754578754578755e-06, + "loss": 0.5177, + "step": 479 + }, + { + "epoch": 0.08794430194210333, + "grad_norm": 0.5060531781359298, + "learning_rate": 8.772893772893774e-06, + "loss": 0.5422, + "step": 480 + }, + { + "epoch": 0.08812751923781605, + "grad_norm": 0.44222377285427994, + "learning_rate": 8.791208791208792e-06, + "loss": 0.4679, + "step": 481 + }, + { + "epoch": 0.08831073653352876, + "grad_norm": 0.4308135949373974, + "learning_rate": 8.80952380952381e-06, + "loss": 0.5057, + "step": 482 + }, + { + "epoch": 0.08849395382924148, + "grad_norm": 0.5107147323097014, + "learning_rate": 8.827838827838828e-06, + "loss": 0.5147, + "step": 483 + }, + { + "epoch": 0.0886771711249542, + "grad_norm": 0.4722359038239213, + "learning_rate": 8.846153846153847e-06, + "loss": 0.5069, + "step": 484 + }, + { + "epoch": 0.08886038842066692, + "grad_norm": 0.4913307139634766, + "learning_rate": 8.864468864468865e-06, + "loss": 0.4943, + "step": 485 + }, + { + "epoch": 0.08904360571637962, + "grad_norm": 0.47381589638250404, + "learning_rate": 8.882783882783883e-06, + "loss": 0.5432, + "step": 486 + }, + { + "epoch": 0.08922682301209234, + "grad_norm": 0.4763727220823097, + "learning_rate": 8.9010989010989e-06, + "loss": 0.4765, + "step": 487 + }, + { + "epoch": 0.08941004030780506, + "grad_norm": 0.48581942940784734, + "learning_rate": 8.91941391941392e-06, + "loss": 0.4918, + "step": 488 + }, + { + "epoch": 0.08959325760351777, + "grad_norm": 0.4771435540271998, + "learning_rate": 8.937728937728938e-06, + "loss": 0.5146, + "step": 489 + }, + { + "epoch": 0.08977647489923049, + "grad_norm": 0.5554558590150881, + "learning_rate": 8.956043956043958e-06, + "loss": 0.5139, + "step": 490 + }, + { + "epoch": 0.08995969219494321, + "grad_norm": 0.4409852140988399, + "learning_rate": 8.974358974358976e-06, + "loss": 0.494, + "step": 491 + }, + { + "epoch": 0.09014290949065591, + "grad_norm": 0.470617109732078, + "learning_rate": 8.992673992673993e-06, + "loss": 0.5293, + "step": 492 + }, + { + "epoch": 0.09032612678636863, + "grad_norm": 0.48719044521659705, + "learning_rate": 9.010989010989011e-06, + "loss": 0.5111, + "step": 493 + }, + { + "epoch": 0.09050934408208135, + "grad_norm": 0.46876439756978366, + "learning_rate": 9.02930402930403e-06, + "loss": 0.4942, + "step": 494 + }, + { + "epoch": 0.09069256137779406, + "grad_norm": 0.46671821534033575, + "learning_rate": 9.047619047619049e-06, + "loss": 0.5617, + "step": 495 + }, + { + "epoch": 0.09087577867350678, + "grad_norm": 0.4868372988815388, + "learning_rate": 9.065934065934067e-06, + "loss": 0.5195, + "step": 496 + }, + { + "epoch": 0.0910589959692195, + "grad_norm": 0.47526685976460153, + "learning_rate": 9.084249084249084e-06, + "loss": 0.4561, + "step": 497 + }, + { + "epoch": 0.0912422132649322, + "grad_norm": 0.526795639153496, + "learning_rate": 9.102564102564104e-06, + "loss": 0.4905, + "step": 498 + }, + { + "epoch": 0.09142543056064492, + "grad_norm": 0.47935527575760656, + "learning_rate": 9.120879120879122e-06, + "loss": 0.5129, + "step": 499 + }, + { + "epoch": 0.09160864785635764, + "grad_norm": 0.47020582683735346, + "learning_rate": 9.13919413919414e-06, + "loss": 0.511, + "step": 500 + }, + { + "epoch": 0.09179186515207036, + "grad_norm": 0.46760420885699033, + "learning_rate": 9.157509157509158e-06, + "loss": 0.4789, + "step": 501 + }, + { + "epoch": 0.09197508244778307, + "grad_norm": 0.4695251283934706, + "learning_rate": 9.175824175824175e-06, + "loss": 0.5351, + "step": 502 + }, + { + "epoch": 0.09215829974349579, + "grad_norm": 0.5153360795669553, + "learning_rate": 9.194139194139195e-06, + "loss": 0.5522, + "step": 503 + }, + { + "epoch": 0.0923415170392085, + "grad_norm": 0.44806081150815513, + "learning_rate": 9.212454212454213e-06, + "loss": 0.5173, + "step": 504 + }, + { + "epoch": 0.09252473433492121, + "grad_norm": 0.451536504838503, + "learning_rate": 9.230769230769232e-06, + "loss": 0.5082, + "step": 505 + }, + { + "epoch": 0.09270795163063393, + "grad_norm": 0.48436384147084766, + "learning_rate": 9.24908424908425e-06, + "loss": 0.5388, + "step": 506 + }, + { + "epoch": 0.09289116892634665, + "grad_norm": 0.4609113225702784, + "learning_rate": 9.267399267399268e-06, + "loss": 0.4892, + "step": 507 + }, + { + "epoch": 0.09307438622205937, + "grad_norm": 0.4098711492197863, + "learning_rate": 9.285714285714288e-06, + "loss": 0.4856, + "step": 508 + }, + { + "epoch": 0.09325760351777208, + "grad_norm": 0.4715675907322317, + "learning_rate": 9.304029304029305e-06, + "loss": 0.53, + "step": 509 + }, + { + "epoch": 0.09344082081348479, + "grad_norm": 0.5026574189450673, + "learning_rate": 9.322344322344323e-06, + "loss": 0.5314, + "step": 510 + }, + { + "epoch": 0.0936240381091975, + "grad_norm": 0.4253199666429807, + "learning_rate": 9.340659340659341e-06, + "loss": 0.5068, + "step": 511 + }, + { + "epoch": 0.09380725540491022, + "grad_norm": 0.45894485399853974, + "learning_rate": 9.358974358974359e-06, + "loss": 0.5235, + "step": 512 + }, + { + "epoch": 0.09399047270062294, + "grad_norm": 0.7513225377427756, + "learning_rate": 9.377289377289379e-06, + "loss": 0.5071, + "step": 513 + }, + { + "epoch": 0.09417368999633566, + "grad_norm": 0.5231017027547264, + "learning_rate": 9.395604395604396e-06, + "loss": 0.5339, + "step": 514 + }, + { + "epoch": 0.09435690729204838, + "grad_norm": 0.5146997140217462, + "learning_rate": 9.413919413919414e-06, + "loss": 0.5052, + "step": 515 + }, + { + "epoch": 0.09454012458776108, + "grad_norm": 0.47760493192836756, + "learning_rate": 9.432234432234432e-06, + "loss": 0.5102, + "step": 516 + }, + { + "epoch": 0.0947233418834738, + "grad_norm": 0.44117590454697736, + "learning_rate": 9.450549450549452e-06, + "loss": 0.5237, + "step": 517 + }, + { + "epoch": 0.09490655917918651, + "grad_norm": 0.5004680185091399, + "learning_rate": 9.46886446886447e-06, + "loss": 0.5243, + "step": 518 + }, + { + "epoch": 0.09508977647489923, + "grad_norm": 0.4549800276149351, + "learning_rate": 9.487179487179487e-06, + "loss": 0.5139, + "step": 519 + }, + { + "epoch": 0.09527299377061195, + "grad_norm": 0.4263327588755031, + "learning_rate": 9.505494505494505e-06, + "loss": 0.4877, + "step": 520 + }, + { + "epoch": 0.09545621106632467, + "grad_norm": 0.4484968491218323, + "learning_rate": 9.523809523809525e-06, + "loss": 0.4986, + "step": 521 + }, + { + "epoch": 0.09563942836203737, + "grad_norm": 0.39390345192807547, + "learning_rate": 9.542124542124543e-06, + "loss": 0.5116, + "step": 522 + }, + { + "epoch": 0.09582264565775009, + "grad_norm": 0.4529147798983982, + "learning_rate": 9.560439560439562e-06, + "loss": 0.5312, + "step": 523 + }, + { + "epoch": 0.0960058629534628, + "grad_norm": 0.49701803503982106, + "learning_rate": 9.57875457875458e-06, + "loss": 0.529, + "step": 524 + }, + { + "epoch": 0.09618908024917552, + "grad_norm": 0.4097214793479841, + "learning_rate": 9.597069597069598e-06, + "loss": 0.5005, + "step": 525 + }, + { + "epoch": 0.09637229754488824, + "grad_norm": 0.5148151884655592, + "learning_rate": 9.615384615384616e-06, + "loss": 0.53, + "step": 526 + }, + { + "epoch": 0.09655551484060096, + "grad_norm": 0.4264857545219357, + "learning_rate": 9.633699633699635e-06, + "loss": 0.5136, + "step": 527 + }, + { + "epoch": 0.09673873213631366, + "grad_norm": 0.45426464252638443, + "learning_rate": 9.652014652014653e-06, + "loss": 0.5117, + "step": 528 + }, + { + "epoch": 0.09692194943202638, + "grad_norm": 0.47571176071645493, + "learning_rate": 9.670329670329671e-06, + "loss": 0.5185, + "step": 529 + }, + { + "epoch": 0.0971051667277391, + "grad_norm": 0.42418752426373113, + "learning_rate": 9.688644688644689e-06, + "loss": 0.5285, + "step": 530 + }, + { + "epoch": 0.09728838402345182, + "grad_norm": 0.48590348616099827, + "learning_rate": 9.706959706959708e-06, + "loss": 0.5339, + "step": 531 + }, + { + "epoch": 0.09747160131916453, + "grad_norm": 0.463774513888958, + "learning_rate": 9.725274725274726e-06, + "loss": 0.5386, + "step": 532 + }, + { + "epoch": 0.09765481861487725, + "grad_norm": 0.4891808396070614, + "learning_rate": 9.743589743589744e-06, + "loss": 0.5326, + "step": 533 + }, + { + "epoch": 0.09783803591058995, + "grad_norm": 0.40841004082693305, + "learning_rate": 9.761904761904762e-06, + "loss": 0.5085, + "step": 534 + }, + { + "epoch": 0.09802125320630267, + "grad_norm": 0.42494728872814735, + "learning_rate": 9.780219780219781e-06, + "loss": 0.4751, + "step": 535 + }, + { + "epoch": 0.09820447050201539, + "grad_norm": 0.4594283822364781, + "learning_rate": 9.7985347985348e-06, + "loss": 0.5275, + "step": 536 + }, + { + "epoch": 0.09838768779772811, + "grad_norm": 0.4064302259117676, + "learning_rate": 9.816849816849817e-06, + "loss": 0.5089, + "step": 537 + }, + { + "epoch": 0.09857090509344082, + "grad_norm": 0.4860027724973198, + "learning_rate": 9.835164835164835e-06, + "loss": 0.505, + "step": 538 + }, + { + "epoch": 0.09875412238915354, + "grad_norm": 0.47625362870716265, + "learning_rate": 9.853479853479855e-06, + "loss": 0.4954, + "step": 539 + }, + { + "epoch": 0.09893733968486625, + "grad_norm": 0.4592940212317861, + "learning_rate": 9.871794871794872e-06, + "loss": 0.4871, + "step": 540 + }, + { + "epoch": 0.09912055698057896, + "grad_norm": 0.42717127053329573, + "learning_rate": 9.890109890109892e-06, + "loss": 0.5261, + "step": 541 + }, + { + "epoch": 0.09930377427629168, + "grad_norm": 0.43970535809042904, + "learning_rate": 9.90842490842491e-06, + "loss": 0.4866, + "step": 542 + }, + { + "epoch": 0.0994869915720044, + "grad_norm": 0.42143360797263807, + "learning_rate": 9.926739926739928e-06, + "loss": 0.5113, + "step": 543 + }, + { + "epoch": 0.09967020886771712, + "grad_norm": 0.4416711016318173, + "learning_rate": 9.945054945054946e-06, + "loss": 0.5116, + "step": 544 + }, + { + "epoch": 0.09985342616342983, + "grad_norm": 0.45325267149265236, + "learning_rate": 9.963369963369965e-06, + "loss": 0.5283, + "step": 545 + }, + { + "epoch": 0.10003664345914254, + "grad_norm": 0.3948707773420248, + "learning_rate": 9.981684981684983e-06, + "loss": 0.5114, + "step": 546 + }, + { + "epoch": 0.10021986075485526, + "grad_norm": 0.42983426021717475, + "learning_rate": 1e-05, + "loss": 0.5169, + "step": 547 + }, + { + "epoch": 0.10040307805056797, + "grad_norm": 0.5119731461361727, + "learning_rate": 9.999998977359419e-06, + "loss": 0.518, + "step": 548 + }, + { + "epoch": 0.10058629534628069, + "grad_norm": 0.48479775231872485, + "learning_rate": 9.999995909438092e-06, + "loss": 0.5207, + "step": 549 + }, + { + "epoch": 0.10076951264199341, + "grad_norm": 0.45005420614602115, + "learning_rate": 9.999990796237274e-06, + "loss": 0.4878, + "step": 550 + }, + { + "epoch": 0.10095272993770613, + "grad_norm": 0.40595587220813967, + "learning_rate": 9.999983637759059e-06, + "loss": 0.4712, + "step": 551 + }, + { + "epoch": 0.10113594723341883, + "grad_norm": 0.43893870910937327, + "learning_rate": 9.999974434006372e-06, + "loss": 0.4964, + "step": 552 + }, + { + "epoch": 0.10131916452913155, + "grad_norm": 0.4638079942473788, + "learning_rate": 9.99996318498298e-06, + "loss": 0.4834, + "step": 553 + }, + { + "epoch": 0.10150238182484426, + "grad_norm": 0.45078724585599195, + "learning_rate": 9.999949890693484e-06, + "loss": 0.4828, + "step": 554 + }, + { + "epoch": 0.10168559912055698, + "grad_norm": 0.4968230440871285, + "learning_rate": 9.999934551143319e-06, + "loss": 0.5146, + "step": 555 + }, + { + "epoch": 0.1018688164162697, + "grad_norm": 0.4431824586567993, + "learning_rate": 9.999917166338767e-06, + "loss": 0.5117, + "step": 556 + }, + { + "epoch": 0.10205203371198242, + "grad_norm": 0.40809319706771474, + "learning_rate": 9.999897736286932e-06, + "loss": 0.4942, + "step": 557 + }, + { + "epoch": 0.10223525100769512, + "grad_norm": 0.46904649406632837, + "learning_rate": 9.999876260995767e-06, + "loss": 0.5223, + "step": 558 + }, + { + "epoch": 0.10241846830340784, + "grad_norm": 0.45993616230767786, + "learning_rate": 9.999852740474054e-06, + "loss": 0.498, + "step": 559 + }, + { + "epoch": 0.10260168559912056, + "grad_norm": 0.41328723379038895, + "learning_rate": 9.999827174731414e-06, + "loss": 0.5294, + "step": 560 + }, + { + "epoch": 0.10278490289483327, + "grad_norm": 0.4584774156426854, + "learning_rate": 9.999799563778307e-06, + "loss": 0.5138, + "step": 561 + }, + { + "epoch": 0.10296812019054599, + "grad_norm": 0.4626480547701558, + "learning_rate": 9.999769907626024e-06, + "loss": 0.5122, + "step": 562 + }, + { + "epoch": 0.10315133748625871, + "grad_norm": 0.4647555669122866, + "learning_rate": 9.9997382062867e-06, + "loss": 0.5276, + "step": 563 + }, + { + "epoch": 0.10333455478197141, + "grad_norm": 0.41824885855037686, + "learning_rate": 9.9997044597733e-06, + "loss": 0.4976, + "step": 564 + }, + { + "epoch": 0.10351777207768413, + "grad_norm": 0.41187305924650414, + "learning_rate": 9.999668668099628e-06, + "loss": 0.4953, + "step": 565 + }, + { + "epoch": 0.10370098937339685, + "grad_norm": 0.42660595756816655, + "learning_rate": 9.999630831280329e-06, + "loss": 0.514, + "step": 566 + }, + { + "epoch": 0.10388420666910957, + "grad_norm": 0.5099786508796531, + "learning_rate": 9.999590949330876e-06, + "loss": 0.5038, + "step": 567 + }, + { + "epoch": 0.10406742396482228, + "grad_norm": 0.4926825490754442, + "learning_rate": 9.999549022267582e-06, + "loss": 0.5535, + "step": 568 + }, + { + "epoch": 0.104250641260535, + "grad_norm": 0.4578490443823955, + "learning_rate": 9.999505050107601e-06, + "loss": 0.5075, + "step": 569 + }, + { + "epoch": 0.1044338585562477, + "grad_norm": 0.4733323605035433, + "learning_rate": 9.99945903286892e-06, + "loss": 0.4747, + "step": 570 + }, + { + "epoch": 0.10461707585196042, + "grad_norm": 0.44893796479946446, + "learning_rate": 9.999410970570358e-06, + "loss": 0.4711, + "step": 571 + }, + { + "epoch": 0.10480029314767314, + "grad_norm": 0.5254914900315981, + "learning_rate": 9.99936086323158e-06, + "loss": 0.5358, + "step": 572 + }, + { + "epoch": 0.10498351044338586, + "grad_norm": 0.537923540220477, + "learning_rate": 9.999308710873083e-06, + "loss": 0.5079, + "step": 573 + }, + { + "epoch": 0.10516672773909858, + "grad_norm": 0.45536567675754, + "learning_rate": 9.999254513516196e-06, + "loss": 0.4908, + "step": 574 + }, + { + "epoch": 0.10534994503481128, + "grad_norm": 0.38323869490085266, + "learning_rate": 9.999198271183094e-06, + "loss": 0.4983, + "step": 575 + }, + { + "epoch": 0.105533162330524, + "grad_norm": 0.43580085679521197, + "learning_rate": 9.999139983896779e-06, + "loss": 0.5031, + "step": 576 + }, + { + "epoch": 0.10571637962623671, + "grad_norm": 0.4193926406628939, + "learning_rate": 9.999079651681096e-06, + "loss": 0.5026, + "step": 577 + }, + { + "epoch": 0.10589959692194943, + "grad_norm": 0.5149327291476714, + "learning_rate": 9.999017274560722e-06, + "loss": 0.5414, + "step": 578 + }, + { + "epoch": 0.10608281421766215, + "grad_norm": 0.44098660936462153, + "learning_rate": 9.998952852561176e-06, + "loss": 0.5032, + "step": 579 + }, + { + "epoch": 0.10626603151337487, + "grad_norm": 0.4413320145497695, + "learning_rate": 9.998886385708807e-06, + "loss": 0.497, + "step": 580 + }, + { + "epoch": 0.10644924880908757, + "grad_norm": 0.45575842795199323, + "learning_rate": 9.998817874030808e-06, + "loss": 0.5207, + "step": 581 + }, + { + "epoch": 0.10663246610480029, + "grad_norm": 0.532654166955093, + "learning_rate": 9.9987473175552e-06, + "loss": 0.5212, + "step": 582 + }, + { + "epoch": 0.106815683400513, + "grad_norm": 0.4157619606008366, + "learning_rate": 9.998674716310846e-06, + "loss": 0.5085, + "step": 583 + }, + { + "epoch": 0.10699890069622572, + "grad_norm": 0.4868388397165033, + "learning_rate": 9.998600070327444e-06, + "loss": 0.5306, + "step": 584 + }, + { + "epoch": 0.10718211799193844, + "grad_norm": 0.49401937562080056, + "learning_rate": 9.998523379635527e-06, + "loss": 0.5429, + "step": 585 + }, + { + "epoch": 0.10736533528765116, + "grad_norm": 0.3986626914030523, + "learning_rate": 9.99844464426647e-06, + "loss": 0.4695, + "step": 586 + }, + { + "epoch": 0.10754855258336386, + "grad_norm": 0.49524254606268836, + "learning_rate": 9.998363864252474e-06, + "loss": 0.5226, + "step": 587 + }, + { + "epoch": 0.10773176987907658, + "grad_norm": 0.46186447850660334, + "learning_rate": 9.998281039626588e-06, + "loss": 0.5294, + "step": 588 + }, + { + "epoch": 0.1079149871747893, + "grad_norm": 0.4207082230286084, + "learning_rate": 9.99819617042269e-06, + "loss": 0.5096, + "step": 589 + }, + { + "epoch": 0.10809820447050202, + "grad_norm": 0.3955565788721964, + "learning_rate": 9.998109256675496e-06, + "loss": 0.4931, + "step": 590 + }, + { + "epoch": 0.10828142176621473, + "grad_norm": 0.36723747707336185, + "learning_rate": 9.998020298420559e-06, + "loss": 0.4964, + "step": 591 + }, + { + "epoch": 0.10846463906192745, + "grad_norm": 0.4185544526751163, + "learning_rate": 9.997929295694266e-06, + "loss": 0.4801, + "step": 592 + }, + { + "epoch": 0.10864785635764015, + "grad_norm": 0.42948593775837285, + "learning_rate": 9.997836248533845e-06, + "loss": 0.5197, + "step": 593 + }, + { + "epoch": 0.10883107365335287, + "grad_norm": 0.43966759837587943, + "learning_rate": 9.997741156977356e-06, + "loss": 0.5254, + "step": 594 + }, + { + "epoch": 0.10901429094906559, + "grad_norm": 0.4632445570564666, + "learning_rate": 9.997644021063698e-06, + "loss": 0.4802, + "step": 595 + }, + { + "epoch": 0.10919750824477831, + "grad_norm": 0.46480621716730125, + "learning_rate": 9.997544840832604e-06, + "loss": 0.5219, + "step": 596 + }, + { + "epoch": 0.10938072554049103, + "grad_norm": 0.44567173311984926, + "learning_rate": 9.997443616324645e-06, + "loss": 0.5105, + "step": 597 + }, + { + "epoch": 0.10956394283620374, + "grad_norm": 0.4534263401515789, + "learning_rate": 9.997340347581226e-06, + "loss": 0.5212, + "step": 598 + }, + { + "epoch": 0.10974716013191645, + "grad_norm": 0.40359525147680225, + "learning_rate": 9.99723503464459e-06, + "loss": 0.5261, + "step": 599 + }, + { + "epoch": 0.10993037742762916, + "grad_norm": 0.8305073137318505, + "learning_rate": 9.99712767755782e-06, + "loss": 0.5173, + "step": 600 + }, + { + "epoch": 0.11011359472334188, + "grad_norm": 0.5039765167575108, + "learning_rate": 9.997018276364825e-06, + "loss": 0.4983, + "step": 601 + }, + { + "epoch": 0.1102968120190546, + "grad_norm": 0.4000444070826362, + "learning_rate": 9.99690683111036e-06, + "loss": 0.4876, + "step": 602 + }, + { + "epoch": 0.11048002931476732, + "grad_norm": 0.45535979366815893, + "learning_rate": 9.99679334184001e-06, + "loss": 0.5116, + "step": 603 + }, + { + "epoch": 0.11066324661048003, + "grad_norm": 0.5208092576973751, + "learning_rate": 9.996677808600202e-06, + "loss": 0.5079, + "step": 604 + }, + { + "epoch": 0.11084646390619274, + "grad_norm": 0.47721138790862244, + "learning_rate": 9.996560231438193e-06, + "loss": 0.5031, + "step": 605 + }, + { + "epoch": 0.11102968120190546, + "grad_norm": 0.4106250945716929, + "learning_rate": 9.996440610402078e-06, + "loss": 0.4789, + "step": 606 + }, + { + "epoch": 0.11121289849761817, + "grad_norm": 0.4981443340009068, + "learning_rate": 9.996318945540792e-06, + "loss": 0.4749, + "step": 607 + }, + { + "epoch": 0.11139611579333089, + "grad_norm": 0.4365748987869686, + "learning_rate": 9.996195236904098e-06, + "loss": 0.503, + "step": 608 + }, + { + "epoch": 0.11157933308904361, + "grad_norm": 0.42769357659836243, + "learning_rate": 9.996069484542605e-06, + "loss": 0.4883, + "step": 609 + }, + { + "epoch": 0.11176255038475633, + "grad_norm": 0.4468425715852285, + "learning_rate": 9.995941688507749e-06, + "loss": 0.528, + "step": 610 + }, + { + "epoch": 0.11194576768046903, + "grad_norm": 0.44305501127316244, + "learning_rate": 9.995811848851807e-06, + "loss": 0.5192, + "step": 611 + }, + { + "epoch": 0.11212898497618175, + "grad_norm": 0.4430509247761743, + "learning_rate": 9.995679965627891e-06, + "loss": 0.4879, + "step": 612 + }, + { + "epoch": 0.11231220227189447, + "grad_norm": 0.4330736232909904, + "learning_rate": 9.995546038889948e-06, + "loss": 0.5315, + "step": 613 + }, + { + "epoch": 0.11249541956760718, + "grad_norm": 0.38536887827486094, + "learning_rate": 9.995410068692763e-06, + "loss": 0.4789, + "step": 614 + }, + { + "epoch": 0.1126786368633199, + "grad_norm": 0.5420414741691238, + "learning_rate": 9.995272055091954e-06, + "loss": 0.4863, + "step": 615 + }, + { + "epoch": 0.11286185415903262, + "grad_norm": 0.4166977498062619, + "learning_rate": 9.995131998143976e-06, + "loss": 0.4863, + "step": 616 + }, + { + "epoch": 0.11304507145474532, + "grad_norm": 0.40681536551456327, + "learning_rate": 9.99498989790612e-06, + "loss": 0.5158, + "step": 617 + }, + { + "epoch": 0.11322828875045804, + "grad_norm": 0.4164498167643854, + "learning_rate": 9.994845754436516e-06, + "loss": 0.5287, + "step": 618 + }, + { + "epoch": 0.11341150604617076, + "grad_norm": 0.430875770675555, + "learning_rate": 9.994699567794123e-06, + "loss": 0.4949, + "step": 619 + }, + { + "epoch": 0.11359472334188347, + "grad_norm": 0.47209089186963143, + "learning_rate": 9.994551338038742e-06, + "loss": 0.5212, + "step": 620 + }, + { + "epoch": 0.11377794063759619, + "grad_norm": 0.4693460118731517, + "learning_rate": 9.994401065231008e-06, + "loss": 0.5218, + "step": 621 + }, + { + "epoch": 0.11396115793330891, + "grad_norm": 0.43645941737097255, + "learning_rate": 9.994248749432388e-06, + "loss": 0.5355, + "step": 622 + }, + { + "epoch": 0.11414437522902161, + "grad_norm": 0.45174829183811943, + "learning_rate": 9.994094390705189e-06, + "loss": 0.4941, + "step": 623 + }, + { + "epoch": 0.11432759252473433, + "grad_norm": 0.4609885421062077, + "learning_rate": 9.993937989112554e-06, + "loss": 0.5424, + "step": 624 + }, + { + "epoch": 0.11451080982044705, + "grad_norm": 0.4711205796072519, + "learning_rate": 9.993779544718459e-06, + "loss": 0.5054, + "step": 625 + }, + { + "epoch": 0.11469402711615977, + "grad_norm": 0.4692458224302816, + "learning_rate": 9.993619057587714e-06, + "loss": 0.5013, + "step": 626 + }, + { + "epoch": 0.11487724441187248, + "grad_norm": 0.44543459536907126, + "learning_rate": 9.99345652778597e-06, + "loss": 0.4882, + "step": 627 + }, + { + "epoch": 0.1150604617075852, + "grad_norm": 0.43461884283611496, + "learning_rate": 9.993291955379713e-06, + "loss": 0.5352, + "step": 628 + }, + { + "epoch": 0.1152436790032979, + "grad_norm": 0.41500076390959145, + "learning_rate": 9.993125340436258e-06, + "loss": 0.4895, + "step": 629 + }, + { + "epoch": 0.11542689629901062, + "grad_norm": 0.4307377636082482, + "learning_rate": 9.992956683023762e-06, + "loss": 0.5149, + "step": 630 + }, + { + "epoch": 0.11561011359472334, + "grad_norm": 0.45085961488644083, + "learning_rate": 9.992785983211214e-06, + "loss": 0.5061, + "step": 631 + }, + { + "epoch": 0.11579333089043606, + "grad_norm": 0.45702457348234676, + "learning_rate": 9.992613241068444e-06, + "loss": 0.4947, + "step": 632 + }, + { + "epoch": 0.11597654818614878, + "grad_norm": 0.46046219980195896, + "learning_rate": 9.992438456666108e-06, + "loss": 0.4727, + "step": 633 + }, + { + "epoch": 0.1161597654818615, + "grad_norm": 0.5109084834664002, + "learning_rate": 9.992261630075704e-06, + "loss": 0.5015, + "step": 634 + }, + { + "epoch": 0.1163429827775742, + "grad_norm": 0.4605028589793491, + "learning_rate": 9.992082761369567e-06, + "loss": 0.518, + "step": 635 + }, + { + "epoch": 0.11652620007328691, + "grad_norm": 0.4135254020313275, + "learning_rate": 9.991901850620861e-06, + "loss": 0.4833, + "step": 636 + }, + { + "epoch": 0.11670941736899963, + "grad_norm": 0.5226822189268372, + "learning_rate": 9.99171889790359e-06, + "loss": 0.5118, + "step": 637 + }, + { + "epoch": 0.11689263466471235, + "grad_norm": 0.4061164579366039, + "learning_rate": 9.991533903292592e-06, + "loss": 0.4953, + "step": 638 + }, + { + "epoch": 0.11707585196042507, + "grad_norm": 2.3765281557608486, + "learning_rate": 9.99134686686354e-06, + "loss": 0.4708, + "step": 639 + }, + { + "epoch": 0.11725906925613779, + "grad_norm": 0.4675538148921928, + "learning_rate": 9.991157788692942e-06, + "loss": 0.4994, + "step": 640 + }, + { + "epoch": 0.11744228655185049, + "grad_norm": 0.4467597960418954, + "learning_rate": 9.990966668858144e-06, + "loss": 0.4982, + "step": 641 + }, + { + "epoch": 0.1176255038475632, + "grad_norm": 0.46222076234046117, + "learning_rate": 9.99077350743732e-06, + "loss": 0.5208, + "step": 642 + }, + { + "epoch": 0.11780872114327592, + "grad_norm": 0.45856899421216063, + "learning_rate": 9.990578304509488e-06, + "loss": 0.5109, + "step": 643 + }, + { + "epoch": 0.11799193843898864, + "grad_norm": 0.5006843127431095, + "learning_rate": 9.990381060154496e-06, + "loss": 0.4941, + "step": 644 + }, + { + "epoch": 0.11817515573470136, + "grad_norm": 0.4638055815431298, + "learning_rate": 9.990181774453028e-06, + "loss": 0.509, + "step": 645 + }, + { + "epoch": 0.11835837303041408, + "grad_norm": 0.4429725714841303, + "learning_rate": 9.989980447486601e-06, + "loss": 0.5196, + "step": 646 + }, + { + "epoch": 0.11854159032612678, + "grad_norm": 0.44051330732830757, + "learning_rate": 9.989777079337572e-06, + "loss": 0.5316, + "step": 647 + }, + { + "epoch": 0.1187248076218395, + "grad_norm": 0.41555588890083767, + "learning_rate": 9.989571670089129e-06, + "loss": 0.5126, + "step": 648 + }, + { + "epoch": 0.11890802491755222, + "grad_norm": 0.46239837428682623, + "learning_rate": 9.989364219825295e-06, + "loss": 0.506, + "step": 649 + }, + { + "epoch": 0.11909124221326493, + "grad_norm": 0.416093035128269, + "learning_rate": 9.98915472863093e-06, + "loss": 0.5194, + "step": 650 + }, + { + "epoch": 0.11927445950897765, + "grad_norm": 0.43847114193541153, + "learning_rate": 9.988943196591727e-06, + "loss": 0.4994, + "step": 651 + }, + { + "epoch": 0.11945767680469037, + "grad_norm": 0.44451525143619486, + "learning_rate": 9.988729623794215e-06, + "loss": 0.5283, + "step": 652 + }, + { + "epoch": 0.11964089410040307, + "grad_norm": 0.4041297930697904, + "learning_rate": 9.988514010325758e-06, + "loss": 0.5049, + "step": 653 + }, + { + "epoch": 0.11982411139611579, + "grad_norm": 0.4075716530480146, + "learning_rate": 9.988296356274551e-06, + "loss": 0.4921, + "step": 654 + }, + { + "epoch": 0.12000732869182851, + "grad_norm": 0.40736195443308915, + "learning_rate": 9.988076661729631e-06, + "loss": 0.4805, + "step": 655 + }, + { + "epoch": 0.12019054598754123, + "grad_norm": 0.4545077167148167, + "learning_rate": 9.987854926780863e-06, + "loss": 0.4789, + "step": 656 + }, + { + "epoch": 0.12037376328325394, + "grad_norm": 0.5314832542562551, + "learning_rate": 9.987631151518948e-06, + "loss": 0.5263, + "step": 657 + }, + { + "epoch": 0.12055698057896666, + "grad_norm": 0.4967524623940482, + "learning_rate": 9.987405336035425e-06, + "loss": 0.5106, + "step": 658 + }, + { + "epoch": 0.12074019787467936, + "grad_norm": 0.4306664623952609, + "learning_rate": 9.987177480422663e-06, + "loss": 0.5376, + "step": 659 + }, + { + "epoch": 0.12092341517039208, + "grad_norm": 0.4158801663482348, + "learning_rate": 9.98694758477387e-06, + "loss": 0.4838, + "step": 660 + }, + { + "epoch": 0.1211066324661048, + "grad_norm": 0.40605820771792417, + "learning_rate": 9.986715649183084e-06, + "loss": 0.505, + "step": 661 + }, + { + "epoch": 0.12128984976181752, + "grad_norm": 0.4085721106839356, + "learning_rate": 9.986481673745183e-06, + "loss": 0.4993, + "step": 662 + }, + { + "epoch": 0.12147306705753023, + "grad_norm": 0.4785768918832099, + "learning_rate": 9.986245658555873e-06, + "loss": 0.5387, + "step": 663 + }, + { + "epoch": 0.12165628435324295, + "grad_norm": 0.4175528791505364, + "learning_rate": 9.986007603711698e-06, + "loss": 0.4976, + "step": 664 + }, + { + "epoch": 0.12183950164895566, + "grad_norm": 0.4323331836919794, + "learning_rate": 9.985767509310035e-06, + "loss": 0.5346, + "step": 665 + }, + { + "epoch": 0.12202271894466837, + "grad_norm": 0.4868011056584651, + "learning_rate": 9.9855253754491e-06, + "loss": 0.5309, + "step": 666 + }, + { + "epoch": 0.12220593624038109, + "grad_norm": 0.46372133085485157, + "learning_rate": 9.985281202227936e-06, + "loss": 0.5053, + "step": 667 + }, + { + "epoch": 0.12238915353609381, + "grad_norm": 0.48326269099644514, + "learning_rate": 9.985034989746423e-06, + "loss": 0.4941, + "step": 668 + }, + { + "epoch": 0.12257237083180653, + "grad_norm": 0.4557552947362274, + "learning_rate": 9.984786738105279e-06, + "loss": 0.5121, + "step": 669 + }, + { + "epoch": 0.12275558812751924, + "grad_norm": 0.39704238565295197, + "learning_rate": 9.98453644740605e-06, + "loss": 0.4962, + "step": 670 + }, + { + "epoch": 0.12293880542323195, + "grad_norm": 0.40645243334504044, + "learning_rate": 9.98428411775112e-06, + "loss": 0.5046, + "step": 671 + }, + { + "epoch": 0.12312202271894467, + "grad_norm": 0.42629021258457467, + "learning_rate": 9.984029749243707e-06, + "loss": 0.5084, + "step": 672 + }, + { + "epoch": 0.12330524001465738, + "grad_norm": 0.45965819318406503, + "learning_rate": 9.98377334198786e-06, + "loss": 0.4759, + "step": 673 + }, + { + "epoch": 0.1234884573103701, + "grad_norm": 0.41967629717267835, + "learning_rate": 9.983514896088466e-06, + "loss": 0.498, + "step": 674 + }, + { + "epoch": 0.12367167460608282, + "grad_norm": 0.4295939596150219, + "learning_rate": 9.983254411651242e-06, + "loss": 0.5243, + "step": 675 + }, + { + "epoch": 0.12385489190179554, + "grad_norm": 0.4808089074794197, + "learning_rate": 9.982991888782742e-06, + "loss": 0.5, + "step": 676 + }, + { + "epoch": 0.12403810919750824, + "grad_norm": 0.48728725704379316, + "learning_rate": 9.982727327590352e-06, + "loss": 0.5459, + "step": 677 + }, + { + "epoch": 0.12422132649322096, + "grad_norm": 0.42799437772375803, + "learning_rate": 9.982460728182292e-06, + "loss": 0.5488, + "step": 678 + }, + { + "epoch": 0.12440454378893367, + "grad_norm": 0.4334349975707624, + "learning_rate": 9.982192090667618e-06, + "loss": 0.501, + "step": 679 + }, + { + "epoch": 0.12458776108464639, + "grad_norm": 0.45671378695636383, + "learning_rate": 9.981921415156217e-06, + "loss": 0.5236, + "step": 680 + }, + { + "epoch": 0.12477097838035911, + "grad_norm": 0.49531406109254905, + "learning_rate": 9.98164870175881e-06, + "loss": 0.5224, + "step": 681 + }, + { + "epoch": 0.12495419567607183, + "grad_norm": 0.3902067301024602, + "learning_rate": 9.981373950586952e-06, + "loss": 0.4835, + "step": 682 + }, + { + "epoch": 0.12513741297178455, + "grad_norm": 0.46995820446689096, + "learning_rate": 9.981097161753032e-06, + "loss": 0.527, + "step": 683 + }, + { + "epoch": 0.12532063026749726, + "grad_norm": 0.43642001976560174, + "learning_rate": 9.980818335370273e-06, + "loss": 0.514, + "step": 684 + }, + { + "epoch": 0.12550384756320998, + "grad_norm": 0.5074521053811396, + "learning_rate": 9.980537471552728e-06, + "loss": 0.4897, + "step": 685 + }, + { + "epoch": 0.12568706485892267, + "grad_norm": 0.43625566798430576, + "learning_rate": 9.98025457041529e-06, + "loss": 0.5174, + "step": 686 + }, + { + "epoch": 0.1258702821546354, + "grad_norm": 0.4129578447344159, + "learning_rate": 9.979969632073678e-06, + "loss": 0.5135, + "step": 687 + }, + { + "epoch": 0.1260534994503481, + "grad_norm": 0.4553566655837448, + "learning_rate": 9.97968265664445e-06, + "loss": 0.5228, + "step": 688 + }, + { + "epoch": 0.12623671674606082, + "grad_norm": 0.4682536997157449, + "learning_rate": 9.979393644244992e-06, + "loss": 0.5039, + "step": 689 + }, + { + "epoch": 0.12641993404177354, + "grad_norm": 0.42839595687214543, + "learning_rate": 9.979102594993533e-06, + "loss": 0.5128, + "step": 690 + }, + { + "epoch": 0.12660315133748626, + "grad_norm": 0.46092632925079513, + "learning_rate": 9.978809509009121e-06, + "loss": 0.5011, + "step": 691 + }, + { + "epoch": 0.12678636863319898, + "grad_norm": 0.4486176186332823, + "learning_rate": 9.97851438641165e-06, + "loss": 0.5169, + "step": 692 + }, + { + "epoch": 0.1269695859289117, + "grad_norm": 0.47419826995019465, + "learning_rate": 9.978217227321837e-06, + "loss": 0.5131, + "step": 693 + }, + { + "epoch": 0.1271528032246244, + "grad_norm": 0.4004091408746546, + "learning_rate": 9.97791803186124e-06, + "loss": 0.4743, + "step": 694 + }, + { + "epoch": 0.12733602052033713, + "grad_norm": 0.38849935615066844, + "learning_rate": 9.977616800152248e-06, + "loss": 0.4944, + "step": 695 + }, + { + "epoch": 0.12751923781604985, + "grad_norm": 0.44216099670793213, + "learning_rate": 9.977313532318078e-06, + "loss": 0.5072, + "step": 696 + }, + { + "epoch": 0.12770245511176256, + "grad_norm": 0.4462646919434802, + "learning_rate": 9.977008228482785e-06, + "loss": 0.4901, + "step": 697 + }, + { + "epoch": 0.12788567240747525, + "grad_norm": 0.44769942596462214, + "learning_rate": 9.976700888771259e-06, + "loss": 0.5263, + "step": 698 + }, + { + "epoch": 0.12806888970318797, + "grad_norm": 0.5113829977752261, + "learning_rate": 9.976391513309212e-06, + "loss": 0.5202, + "step": 699 + }, + { + "epoch": 0.1282521069989007, + "grad_norm": 0.4245344457429005, + "learning_rate": 9.976080102223202e-06, + "loss": 0.5182, + "step": 700 + }, + { + "epoch": 0.1284353242946134, + "grad_norm": 0.4305381223047699, + "learning_rate": 9.97576665564061e-06, + "loss": 0.507, + "step": 701 + }, + { + "epoch": 0.12861854159032612, + "grad_norm": 0.47072785404283757, + "learning_rate": 9.975451173689658e-06, + "loss": 0.5166, + "step": 702 + }, + { + "epoch": 0.12880175888603884, + "grad_norm": 0.6675845713373886, + "learning_rate": 9.975133656499392e-06, + "loss": 0.47, + "step": 703 + }, + { + "epoch": 0.12898497618175156, + "grad_norm": 0.4533261672443661, + "learning_rate": 9.974814104199694e-06, + "loss": 0.5362, + "step": 704 + }, + { + "epoch": 0.12916819347746428, + "grad_norm": 0.4503943455070683, + "learning_rate": 9.97449251692128e-06, + "loss": 0.5134, + "step": 705 + }, + { + "epoch": 0.129351410773177, + "grad_norm": 0.44545859946020006, + "learning_rate": 9.974168894795698e-06, + "loss": 0.4981, + "step": 706 + }, + { + "epoch": 0.1295346280688897, + "grad_norm": 0.430864283858029, + "learning_rate": 9.973843237955328e-06, + "loss": 0.5329, + "step": 707 + }, + { + "epoch": 0.12971784536460243, + "grad_norm": 0.7865986917626206, + "learning_rate": 9.973515546533379e-06, + "loss": 0.504, + "step": 708 + }, + { + "epoch": 0.12990106266031515, + "grad_norm": 0.44976211133712124, + "learning_rate": 9.973185820663897e-06, + "loss": 0.4737, + "step": 709 + }, + { + "epoch": 0.13008427995602784, + "grad_norm": 0.4549304394815992, + "learning_rate": 9.97285406048176e-06, + "loss": 0.5425, + "step": 710 + }, + { + "epoch": 0.13026749725174055, + "grad_norm": 0.5049990473394447, + "learning_rate": 9.972520266122676e-06, + "loss": 0.5117, + "step": 711 + }, + { + "epoch": 0.13045071454745327, + "grad_norm": 0.42657669286902156, + "learning_rate": 9.972184437723182e-06, + "loss": 0.5057, + "step": 712 + }, + { + "epoch": 0.130633931843166, + "grad_norm": 0.5299863700854671, + "learning_rate": 9.971846575420656e-06, + "loss": 0.5262, + "step": 713 + }, + { + "epoch": 0.1308171491388787, + "grad_norm": 0.45345740582998295, + "learning_rate": 9.9715066793533e-06, + "loss": 0.4997, + "step": 714 + }, + { + "epoch": 0.13100036643459143, + "grad_norm": 0.39393086616308026, + "learning_rate": 9.971164749660149e-06, + "loss": 0.4996, + "step": 715 + }, + { + "epoch": 0.13118358373030414, + "grad_norm": 0.4577611248522203, + "learning_rate": 9.970820786481075e-06, + "loss": 0.5018, + "step": 716 + }, + { + "epoch": 0.13136680102601686, + "grad_norm": 0.43279352533859367, + "learning_rate": 9.970474789956775e-06, + "loss": 0.4914, + "step": 717 + }, + { + "epoch": 0.13155001832172958, + "grad_norm": 0.4366448132250816, + "learning_rate": 9.970126760228785e-06, + "loss": 0.4864, + "step": 718 + }, + { + "epoch": 0.1317332356174423, + "grad_norm": 0.43998158412052296, + "learning_rate": 9.969776697439463e-06, + "loss": 0.4834, + "step": 719 + }, + { + "epoch": 0.131916452913155, + "grad_norm": 0.46940772392002544, + "learning_rate": 9.969424601732011e-06, + "loss": 0.501, + "step": 720 + }, + { + "epoch": 0.1320996702088677, + "grad_norm": 0.43117110167391715, + "learning_rate": 9.96907047325045e-06, + "loss": 0.5562, + "step": 721 + }, + { + "epoch": 0.13228288750458042, + "grad_norm": 0.4119352967972661, + "learning_rate": 9.968714312139642e-06, + "loss": 0.4748, + "step": 722 + }, + { + "epoch": 0.13246610480029314, + "grad_norm": 0.5028443530663289, + "learning_rate": 9.968356118545277e-06, + "loss": 0.5344, + "step": 723 + }, + { + "epoch": 0.13264932209600586, + "grad_norm": 0.43089419048593447, + "learning_rate": 9.967995892613875e-06, + "loss": 0.4704, + "step": 724 + }, + { + "epoch": 0.13283253939171857, + "grad_norm": 0.5044933769783081, + "learning_rate": 9.967633634492788e-06, + "loss": 0.4996, + "step": 725 + }, + { + "epoch": 0.1330157566874313, + "grad_norm": 0.442554097189932, + "learning_rate": 9.967269344330201e-06, + "loss": 0.5278, + "step": 726 + }, + { + "epoch": 0.133198973983144, + "grad_norm": 0.431094495901828, + "learning_rate": 9.966903022275131e-06, + "loss": 0.4943, + "step": 727 + }, + { + "epoch": 0.13338219127885673, + "grad_norm": 0.6222105275898999, + "learning_rate": 9.966534668477421e-06, + "loss": 0.5215, + "step": 728 + }, + { + "epoch": 0.13356540857456944, + "grad_norm": 0.4562449049230116, + "learning_rate": 9.96616428308775e-06, + "loss": 0.5112, + "step": 729 + }, + { + "epoch": 0.13374862587028216, + "grad_norm": 0.4249667668518143, + "learning_rate": 9.965791866257626e-06, + "loss": 0.5083, + "step": 730 + }, + { + "epoch": 0.13393184316599488, + "grad_norm": 0.42736565388331654, + "learning_rate": 9.96541741813939e-06, + "loss": 0.5078, + "step": 731 + }, + { + "epoch": 0.1341150604617076, + "grad_norm": 0.42789645341508464, + "learning_rate": 9.96504093888621e-06, + "loss": 0.5148, + "step": 732 + }, + { + "epoch": 0.1342982777574203, + "grad_norm": 0.5018533252539279, + "learning_rate": 9.964662428652088e-06, + "loss": 0.4994, + "step": 733 + }, + { + "epoch": 0.134481495053133, + "grad_norm": 0.4561559749816332, + "learning_rate": 9.964281887591856e-06, + "loss": 0.5036, + "step": 734 + }, + { + "epoch": 0.13466471234884572, + "grad_norm": 0.43874828747054045, + "learning_rate": 9.963899315861176e-06, + "loss": 0.4976, + "step": 735 + }, + { + "epoch": 0.13484792964455844, + "grad_norm": 0.44683273370213666, + "learning_rate": 9.963514713616544e-06, + "loss": 0.4837, + "step": 736 + }, + { + "epoch": 0.13503114694027116, + "grad_norm": 0.4575940082218496, + "learning_rate": 9.963128081015282e-06, + "loss": 0.531, + "step": 737 + }, + { + "epoch": 0.13521436423598387, + "grad_norm": 0.4190671986873532, + "learning_rate": 9.962739418215545e-06, + "loss": 0.525, + "step": 738 + }, + { + "epoch": 0.1353975815316966, + "grad_norm": 0.41284775197321, + "learning_rate": 9.962348725376318e-06, + "loss": 0.519, + "step": 739 + }, + { + "epoch": 0.1355807988274093, + "grad_norm": 0.41220938724190687, + "learning_rate": 9.961956002657414e-06, + "loss": 0.535, + "step": 740 + }, + { + "epoch": 0.13576401612312203, + "grad_norm": 0.4428272636340964, + "learning_rate": 9.961561250219482e-06, + "loss": 0.5287, + "step": 741 + }, + { + "epoch": 0.13594723341883475, + "grad_norm": 0.42565025293852765, + "learning_rate": 9.961164468223996e-06, + "loss": 0.5329, + "step": 742 + }, + { + "epoch": 0.13613045071454746, + "grad_norm": 0.5963216630193675, + "learning_rate": 9.960765656833263e-06, + "loss": 0.5036, + "step": 743 + }, + { + "epoch": 0.13631366801026018, + "grad_norm": 0.42494573499796934, + "learning_rate": 9.96036481621042e-06, + "loss": 0.4936, + "step": 744 + }, + { + "epoch": 0.13649688530597287, + "grad_norm": 0.39224716002216886, + "learning_rate": 9.959961946519431e-06, + "loss": 0.4704, + "step": 745 + }, + { + "epoch": 0.1366801026016856, + "grad_norm": 0.42574265198042266, + "learning_rate": 9.959557047925095e-06, + "loss": 0.4964, + "step": 746 + }, + { + "epoch": 0.1368633198973983, + "grad_norm": 0.457270415221301, + "learning_rate": 9.959150120593035e-06, + "loss": 0.5322, + "step": 747 + }, + { + "epoch": 0.13704653719311102, + "grad_norm": 0.4245718410150897, + "learning_rate": 9.95874116468971e-06, + "loss": 0.4952, + "step": 748 + }, + { + "epoch": 0.13722975448882374, + "grad_norm": 0.47589556246220993, + "learning_rate": 9.958330180382405e-06, + "loss": 0.5124, + "step": 749 + }, + { + "epoch": 0.13741297178453646, + "grad_norm": 0.45440270321018955, + "learning_rate": 9.957917167839238e-06, + "loss": 0.5109, + "step": 750 + }, + { + "epoch": 0.13759618908024918, + "grad_norm": 0.5232685566183007, + "learning_rate": 9.95750212722915e-06, + "loss": 0.5174, + "step": 751 + }, + { + "epoch": 0.1377794063759619, + "grad_norm": 0.538012343254673, + "learning_rate": 9.957085058721918e-06, + "loss": 0.5105, + "step": 752 + }, + { + "epoch": 0.1379626236716746, + "grad_norm": 0.38959097348139504, + "learning_rate": 9.956665962488148e-06, + "loss": 0.4533, + "step": 753 + }, + { + "epoch": 0.13814584096738733, + "grad_norm": 0.4404610898092818, + "learning_rate": 9.956244838699271e-06, + "loss": 0.4978, + "step": 754 + }, + { + "epoch": 0.13832905826310005, + "grad_norm": 0.47384922621236214, + "learning_rate": 9.955821687527554e-06, + "loss": 0.5309, + "step": 755 + }, + { + "epoch": 0.13851227555881276, + "grad_norm": 0.45266354792605684, + "learning_rate": 9.955396509146084e-06, + "loss": 0.5151, + "step": 756 + }, + { + "epoch": 0.13869549285452545, + "grad_norm": 0.4098036061670406, + "learning_rate": 9.95496930372879e-06, + "loss": 0.5, + "step": 757 + }, + { + "epoch": 0.13887871015023817, + "grad_norm": 0.4520393902857722, + "learning_rate": 9.954540071450418e-06, + "loss": 0.4766, + "step": 758 + }, + { + "epoch": 0.1390619274459509, + "grad_norm": 0.41592431105518124, + "learning_rate": 9.95410881248655e-06, + "loss": 0.5279, + "step": 759 + }, + { + "epoch": 0.1392451447416636, + "grad_norm": 0.9055926932881602, + "learning_rate": 9.953675527013594e-06, + "loss": 0.4544, + "step": 760 + }, + { + "epoch": 0.13942836203737632, + "grad_norm": 0.4230083797544526, + "learning_rate": 9.953240215208787e-06, + "loss": 0.5174, + "step": 761 + }, + { + "epoch": 0.13961157933308904, + "grad_norm": 0.46167105853029766, + "learning_rate": 9.9528028772502e-06, + "loss": 0.5087, + "step": 762 + }, + { + "epoch": 0.13979479662880176, + "grad_norm": 0.4614988799168283, + "learning_rate": 9.952363513316727e-06, + "loss": 0.5035, + "step": 763 + }, + { + "epoch": 0.13997801392451448, + "grad_norm": 0.4599378559524754, + "learning_rate": 9.951922123588091e-06, + "loss": 0.4982, + "step": 764 + }, + { + "epoch": 0.1401612312202272, + "grad_norm": 0.4169187563233499, + "learning_rate": 9.951478708244847e-06, + "loss": 0.5205, + "step": 765 + }, + { + "epoch": 0.1403444485159399, + "grad_norm": 1.4346541301623688, + "learning_rate": 9.951033267468375e-06, + "loss": 0.4704, + "step": 766 + }, + { + "epoch": 0.14052766581165263, + "grad_norm": 0.4477016515011828, + "learning_rate": 9.950585801440889e-06, + "loss": 0.489, + "step": 767 + }, + { + "epoch": 0.14071088310736535, + "grad_norm": 0.4178701034776549, + "learning_rate": 9.950136310345425e-06, + "loss": 0.5147, + "step": 768 + }, + { + "epoch": 0.14089410040307804, + "grad_norm": 0.4893104037393506, + "learning_rate": 9.949684794365848e-06, + "loss": 0.4913, + "step": 769 + }, + { + "epoch": 0.14107731769879076, + "grad_norm": 0.4018838415277579, + "learning_rate": 9.949231253686857e-06, + "loss": 0.464, + "step": 770 + }, + { + "epoch": 0.14126053499450347, + "grad_norm": 0.38608379924790914, + "learning_rate": 9.948775688493974e-06, + "loss": 0.5173, + "step": 771 + }, + { + "epoch": 0.1414437522902162, + "grad_norm": 0.4162388206396457, + "learning_rate": 9.948318098973552e-06, + "loss": 0.4917, + "step": 772 + }, + { + "epoch": 0.1416269695859289, + "grad_norm": 0.3781950328417496, + "learning_rate": 9.947858485312772e-06, + "loss": 0.4446, + "step": 773 + }, + { + "epoch": 0.14181018688164163, + "grad_norm": 0.45959051057998834, + "learning_rate": 9.947396847699638e-06, + "loss": 0.4918, + "step": 774 + }, + { + "epoch": 0.14199340417735434, + "grad_norm": 0.480317102541819, + "learning_rate": 9.946933186322988e-06, + "loss": 0.5369, + "step": 775 + }, + { + "epoch": 0.14217662147306706, + "grad_norm": 0.5008985493720574, + "learning_rate": 9.946467501372485e-06, + "loss": 0.4992, + "step": 776 + }, + { + "epoch": 0.14235983876877978, + "grad_norm": 0.4117591563183844, + "learning_rate": 9.94599979303862e-06, + "loss": 0.4818, + "step": 777 + }, + { + "epoch": 0.1425430560644925, + "grad_norm": 0.531108956987235, + "learning_rate": 9.945530061512714e-06, + "loss": 0.517, + "step": 778 + }, + { + "epoch": 0.1427262733602052, + "grad_norm": 0.43817523582155865, + "learning_rate": 9.945058306986911e-06, + "loss": 0.5473, + "step": 779 + }, + { + "epoch": 0.14290949065591793, + "grad_norm": 0.4747080227792923, + "learning_rate": 9.944584529654187e-06, + "loss": 0.4819, + "step": 780 + }, + { + "epoch": 0.14309270795163062, + "grad_norm": 0.4249215515551791, + "learning_rate": 9.944108729708342e-06, + "loss": 0.4979, + "step": 781 + }, + { + "epoch": 0.14327592524734334, + "grad_norm": 0.43558257752578505, + "learning_rate": 9.943630907344008e-06, + "loss": 0.484, + "step": 782 + }, + { + "epoch": 0.14345914254305606, + "grad_norm": 0.4750219883418816, + "learning_rate": 9.943151062756638e-06, + "loss": 0.4825, + "step": 783 + }, + { + "epoch": 0.14364235983876877, + "grad_norm": 0.4332476417990451, + "learning_rate": 9.942669196142516e-06, + "loss": 0.4929, + "step": 784 + }, + { + "epoch": 0.1438255771344815, + "grad_norm": 0.48307720479460303, + "learning_rate": 9.942185307698754e-06, + "loss": 0.5099, + "step": 785 + }, + { + "epoch": 0.1440087944301942, + "grad_norm": 0.43004630110605213, + "learning_rate": 9.941699397623289e-06, + "loss": 0.4435, + "step": 786 + }, + { + "epoch": 0.14419201172590693, + "grad_norm": 0.5526090978976514, + "learning_rate": 9.941211466114883e-06, + "loss": 0.4831, + "step": 787 + }, + { + "epoch": 0.14437522902161964, + "grad_norm": 0.4299548315161719, + "learning_rate": 9.94072151337313e-06, + "loss": 0.5055, + "step": 788 + }, + { + "epoch": 0.14455844631733236, + "grad_norm": 0.46488795405005734, + "learning_rate": 9.940229539598449e-06, + "loss": 0.5247, + "step": 789 + }, + { + "epoch": 0.14474166361304508, + "grad_norm": 0.45711196453956754, + "learning_rate": 9.939735544992084e-06, + "loss": 0.5155, + "step": 790 + }, + { + "epoch": 0.1449248809087578, + "grad_norm": 0.39377908902960373, + "learning_rate": 9.939239529756106e-06, + "loss": 0.4861, + "step": 791 + }, + { + "epoch": 0.14510809820447051, + "grad_norm": 0.3780973854580167, + "learning_rate": 9.938741494093413e-06, + "loss": 0.4743, + "step": 792 + }, + { + "epoch": 0.1452913155001832, + "grad_norm": 0.4442667746426281, + "learning_rate": 9.93824143820773e-06, + "loss": 0.5124, + "step": 793 + }, + { + "epoch": 0.14547453279589592, + "grad_norm": 0.47139079982459514, + "learning_rate": 9.93773936230361e-06, + "loss": 0.5095, + "step": 794 + }, + { + "epoch": 0.14565775009160864, + "grad_norm": 0.47478113201389704, + "learning_rate": 9.937235266586425e-06, + "loss": 0.5242, + "step": 795 + }, + { + "epoch": 0.14584096738732136, + "grad_norm": 0.4253566981856297, + "learning_rate": 9.936729151262383e-06, + "loss": 0.4816, + "step": 796 + }, + { + "epoch": 0.14602418468303408, + "grad_norm": 0.3995444614750483, + "learning_rate": 9.936221016538514e-06, + "loss": 0.5114, + "step": 797 + }, + { + "epoch": 0.1462074019787468, + "grad_norm": 0.3958644449360531, + "learning_rate": 9.935710862622671e-06, + "loss": 0.5044, + "step": 798 + }, + { + "epoch": 0.1463906192744595, + "grad_norm": 0.39672728856696343, + "learning_rate": 9.935198689723537e-06, + "loss": 0.5079, + "step": 799 + }, + { + "epoch": 0.14657383657017223, + "grad_norm": 0.4401216989939036, + "learning_rate": 9.934684498050619e-06, + "loss": 0.5169, + "step": 800 + }, + { + "epoch": 0.14675705386588495, + "grad_norm": 0.42531985045283716, + "learning_rate": 9.93416828781425e-06, + "loss": 0.5039, + "step": 801 + }, + { + "epoch": 0.14694027116159766, + "grad_norm": 0.43641771981673, + "learning_rate": 9.93365005922559e-06, + "loss": 0.4705, + "step": 802 + }, + { + "epoch": 0.14712348845731038, + "grad_norm": 0.4154214727248315, + "learning_rate": 9.933129812496623e-06, + "loss": 0.4801, + "step": 803 + }, + { + "epoch": 0.1473067057530231, + "grad_norm": 0.4668765888893056, + "learning_rate": 9.932607547840161e-06, + "loss": 0.5262, + "step": 804 + }, + { + "epoch": 0.1474899230487358, + "grad_norm": 0.45163409733644266, + "learning_rate": 9.932083265469836e-06, + "loss": 0.5009, + "step": 805 + }, + { + "epoch": 0.1476731403444485, + "grad_norm": 0.44602336917824525, + "learning_rate": 9.93155696560011e-06, + "loss": 0.5071, + "step": 806 + }, + { + "epoch": 0.14785635764016122, + "grad_norm": 0.40571563214546336, + "learning_rate": 9.931028648446273e-06, + "loss": 0.4867, + "step": 807 + }, + { + "epoch": 0.14803957493587394, + "grad_norm": 0.412866633306789, + "learning_rate": 9.930498314224433e-06, + "loss": 0.5204, + "step": 808 + }, + { + "epoch": 0.14822279223158666, + "grad_norm": 0.4491417286505596, + "learning_rate": 9.929965963151526e-06, + "loss": 0.4959, + "step": 809 + }, + { + "epoch": 0.14840600952729938, + "grad_norm": 0.483698524685917, + "learning_rate": 9.929431595445315e-06, + "loss": 0.4945, + "step": 810 + }, + { + "epoch": 0.1485892268230121, + "grad_norm": 0.45809625895317874, + "learning_rate": 9.928895211324387e-06, + "loss": 0.4795, + "step": 811 + }, + { + "epoch": 0.1487724441187248, + "grad_norm": 0.4671855759216309, + "learning_rate": 9.928356811008153e-06, + "loss": 0.502, + "step": 812 + }, + { + "epoch": 0.14895566141443753, + "grad_norm": 0.49991252806247843, + "learning_rate": 9.927816394716847e-06, + "loss": 0.4989, + "step": 813 + }, + { + "epoch": 0.14913887871015025, + "grad_norm": 0.4930888660574426, + "learning_rate": 9.92727396267153e-06, + "loss": 0.5256, + "step": 814 + }, + { + "epoch": 0.14932209600586296, + "grad_norm": 0.4301245288190715, + "learning_rate": 9.926729515094092e-06, + "loss": 0.5062, + "step": 815 + }, + { + "epoch": 0.14950531330157568, + "grad_norm": 0.4254345495226345, + "learning_rate": 9.926183052207235e-06, + "loss": 0.4885, + "step": 816 + }, + { + "epoch": 0.14968853059728837, + "grad_norm": 0.40854658960332646, + "learning_rate": 9.925634574234499e-06, + "loss": 0.4909, + "step": 817 + }, + { + "epoch": 0.1498717478930011, + "grad_norm": 0.4307966660913799, + "learning_rate": 9.925084081400241e-06, + "loss": 0.4974, + "step": 818 + }, + { + "epoch": 0.1500549651887138, + "grad_norm": 0.44710983697734263, + "learning_rate": 9.924531573929641e-06, + "loss": 0.5107, + "step": 819 + }, + { + "epoch": 0.15023818248442652, + "grad_norm": 0.47247983608563787, + "learning_rate": 9.923977052048708e-06, + "loss": 0.5185, + "step": 820 + }, + { + "epoch": 0.15042139978013924, + "grad_norm": 0.43408666873611645, + "learning_rate": 9.923420515984272e-06, + "loss": 0.5256, + "step": 821 + }, + { + "epoch": 0.15060461707585196, + "grad_norm": 0.3618361050633503, + "learning_rate": 9.922861965963987e-06, + "loss": 0.4835, + "step": 822 + }, + { + "epoch": 0.15078783437156468, + "grad_norm": 0.478572088704784, + "learning_rate": 9.922301402216334e-06, + "loss": 0.4971, + "step": 823 + }, + { + "epoch": 0.1509710516672774, + "grad_norm": 0.4956723559981853, + "learning_rate": 9.921738824970611e-06, + "loss": 0.4664, + "step": 824 + }, + { + "epoch": 0.1511542689629901, + "grad_norm": 0.43104833444065105, + "learning_rate": 9.921174234456947e-06, + "loss": 0.5056, + "step": 825 + }, + { + "epoch": 0.15133748625870283, + "grad_norm": 0.38636058698792763, + "learning_rate": 9.920607630906289e-06, + "loss": 0.509, + "step": 826 + }, + { + "epoch": 0.15152070355441555, + "grad_norm": 0.4638108326048624, + "learning_rate": 9.920039014550413e-06, + "loss": 0.4928, + "step": 827 + }, + { + "epoch": 0.15170392085012827, + "grad_norm": 0.38962500919500936, + "learning_rate": 9.919468385621912e-06, + "loss": 0.5025, + "step": 828 + }, + { + "epoch": 0.15188713814584096, + "grad_norm": 0.423466480397551, + "learning_rate": 9.918895744354204e-06, + "loss": 0.4903, + "step": 829 + }, + { + "epoch": 0.15207035544155367, + "grad_norm": 0.5945183810824295, + "learning_rate": 9.918321090981537e-06, + "loss": 0.4961, + "step": 830 + }, + { + "epoch": 0.1522535727372664, + "grad_norm": 0.3857346095853992, + "learning_rate": 9.917744425738971e-06, + "loss": 0.4903, + "step": 831 + }, + { + "epoch": 0.1524367900329791, + "grad_norm": 0.4368019305898112, + "learning_rate": 9.917165748862398e-06, + "loss": 0.4985, + "step": 832 + }, + { + "epoch": 0.15262000732869183, + "grad_norm": 0.4622792781214721, + "learning_rate": 9.916585060588526e-06, + "loss": 0.5298, + "step": 833 + }, + { + "epoch": 0.15280322462440454, + "grad_norm": 0.43464073300377387, + "learning_rate": 9.916002361154894e-06, + "loss": 0.4956, + "step": 834 + }, + { + "epoch": 0.15298644192011726, + "grad_norm": 0.37914154472744427, + "learning_rate": 9.915417650799855e-06, + "loss": 0.4914, + "step": 835 + }, + { + "epoch": 0.15316965921582998, + "grad_norm": 0.41110768627195465, + "learning_rate": 9.914830929762588e-06, + "loss": 0.4933, + "step": 836 + }, + { + "epoch": 0.1533528765115427, + "grad_norm": 0.3922989216974703, + "learning_rate": 9.914242198283099e-06, + "loss": 0.5, + "step": 837 + }, + { + "epoch": 0.1535360938072554, + "grad_norm": 0.4310098182316404, + "learning_rate": 9.91365145660221e-06, + "loss": 0.4817, + "step": 838 + }, + { + "epoch": 0.15371931110296813, + "grad_norm": 0.40690680117071937, + "learning_rate": 9.913058704961565e-06, + "loss": 0.4778, + "step": 839 + }, + { + "epoch": 0.15390252839868085, + "grad_norm": 0.4584415585967629, + "learning_rate": 9.912463943603635e-06, + "loss": 0.4585, + "step": 840 + }, + { + "epoch": 0.15408574569439354, + "grad_norm": 0.48734349658323495, + "learning_rate": 9.911867172771711e-06, + "loss": 0.5345, + "step": 841 + }, + { + "epoch": 0.15426896299010626, + "grad_norm": 0.442686833828428, + "learning_rate": 9.911268392709908e-06, + "loss": 0.5134, + "step": 842 + }, + { + "epoch": 0.15445218028581897, + "grad_norm": 0.44506293028939, + "learning_rate": 9.910667603663156e-06, + "loss": 0.5158, + "step": 843 + }, + { + "epoch": 0.1546353975815317, + "grad_norm": 0.4311992637968541, + "learning_rate": 9.910064805877214e-06, + "loss": 0.4837, + "step": 844 + }, + { + "epoch": 0.1548186148772444, + "grad_norm": 0.4493099076015729, + "learning_rate": 9.90945999959866e-06, + "loss": 0.5108, + "step": 845 + }, + { + "epoch": 0.15500183217295713, + "grad_norm": 0.45478365363321205, + "learning_rate": 9.908853185074896e-06, + "loss": 0.4998, + "step": 846 + }, + { + "epoch": 0.15518504946866984, + "grad_norm": 0.46301414888311854, + "learning_rate": 9.90824436255414e-06, + "loss": 0.5225, + "step": 847 + }, + { + "epoch": 0.15536826676438256, + "grad_norm": 0.40118617155548936, + "learning_rate": 9.907633532285435e-06, + "loss": 0.4927, + "step": 848 + }, + { + "epoch": 0.15555148406009528, + "grad_norm": 0.49481295874709574, + "learning_rate": 9.907020694518646e-06, + "loss": 0.4933, + "step": 849 + }, + { + "epoch": 0.155734701355808, + "grad_norm": 0.4102626498510104, + "learning_rate": 9.90640584950446e-06, + "loss": 0.5111, + "step": 850 + }, + { + "epoch": 0.15591791865152071, + "grad_norm": 0.46514690031713113, + "learning_rate": 9.905788997494377e-06, + "loss": 0.4728, + "step": 851 + }, + { + "epoch": 0.15610113594723343, + "grad_norm": 0.4054441718781369, + "learning_rate": 9.905170138740732e-06, + "loss": 0.4828, + "step": 852 + }, + { + "epoch": 0.15628435324294612, + "grad_norm": 0.4400252577977126, + "learning_rate": 9.904549273496666e-06, + "loss": 0.501, + "step": 853 + }, + { + "epoch": 0.15646757053865884, + "grad_norm": 0.42167618939817325, + "learning_rate": 9.903926402016153e-06, + "loss": 0.4922, + "step": 854 + }, + { + "epoch": 0.15665078783437156, + "grad_norm": 0.4093707700968775, + "learning_rate": 9.90330152455398e-06, + "loss": 0.5014, + "step": 855 + }, + { + "epoch": 0.15683400513008428, + "grad_norm": 0.4523522809487888, + "learning_rate": 9.902674641365756e-06, + "loss": 0.4829, + "step": 856 + }, + { + "epoch": 0.157017222425797, + "grad_norm": 0.49143897815029397, + "learning_rate": 9.902045752707916e-06, + "loss": 0.5074, + "step": 857 + }, + { + "epoch": 0.1572004397215097, + "grad_norm": 0.43621740114283203, + "learning_rate": 9.901414858837706e-06, + "loss": 0.5182, + "step": 858 + }, + { + "epoch": 0.15738365701722243, + "grad_norm": 0.40732498942379025, + "learning_rate": 9.9007819600132e-06, + "loss": 0.4674, + "step": 859 + }, + { + "epoch": 0.15756687431293515, + "grad_norm": 0.43675476681733244, + "learning_rate": 9.900147056493285e-06, + "loss": 0.5374, + "step": 860 + }, + { + "epoch": 0.15775009160864786, + "grad_norm": 0.43060074322101793, + "learning_rate": 9.89951014853768e-06, + "loss": 0.4635, + "step": 861 + }, + { + "epoch": 0.15793330890436058, + "grad_norm": 0.41179202097945616, + "learning_rate": 9.898871236406907e-06, + "loss": 0.4951, + "step": 862 + }, + { + "epoch": 0.1581165262000733, + "grad_norm": 0.371370018002044, + "learning_rate": 9.898230320362323e-06, + "loss": 0.479, + "step": 863 + }, + { + "epoch": 0.158299743495786, + "grad_norm": 0.4300134296591519, + "learning_rate": 9.897587400666097e-06, + "loss": 0.4868, + "step": 864 + }, + { + "epoch": 0.1584829607914987, + "grad_norm": 0.43589209027697184, + "learning_rate": 9.896942477581221e-06, + "loss": 0.5124, + "step": 865 + }, + { + "epoch": 0.15866617808721142, + "grad_norm": 0.575247632832188, + "learning_rate": 9.896295551371503e-06, + "loss": 0.5053, + "step": 866 + }, + { + "epoch": 0.15884939538292414, + "grad_norm": 0.4273872012355264, + "learning_rate": 9.89564662230157e-06, + "loss": 0.5138, + "step": 867 + }, + { + "epoch": 0.15903261267863686, + "grad_norm": 0.4543163505662005, + "learning_rate": 9.894995690636874e-06, + "loss": 0.5229, + "step": 868 + }, + { + "epoch": 0.15921582997434958, + "grad_norm": 0.4491410597219586, + "learning_rate": 9.894342756643684e-06, + "loss": 0.5045, + "step": 869 + }, + { + "epoch": 0.1593990472700623, + "grad_norm": 0.47087708750396473, + "learning_rate": 9.893687820589081e-06, + "loss": 0.5087, + "step": 870 + }, + { + "epoch": 0.159582264565775, + "grad_norm": 0.4240236693967998, + "learning_rate": 9.893030882740977e-06, + "loss": 0.5052, + "step": 871 + }, + { + "epoch": 0.15976548186148773, + "grad_norm": 0.4287525963131784, + "learning_rate": 9.892371943368092e-06, + "loss": 0.527, + "step": 872 + }, + { + "epoch": 0.15994869915720045, + "grad_norm": 0.4131563755872441, + "learning_rate": 9.891711002739971e-06, + "loss": 0.5074, + "step": 873 + }, + { + "epoch": 0.16013191645291316, + "grad_norm": 0.39231143806918656, + "learning_rate": 9.891048061126975e-06, + "loss": 0.5018, + "step": 874 + }, + { + "epoch": 0.16031513374862588, + "grad_norm": 0.4391493046066672, + "learning_rate": 9.890383118800287e-06, + "loss": 0.518, + "step": 875 + }, + { + "epoch": 0.16049835104433857, + "grad_norm": 0.468016398258625, + "learning_rate": 9.889716176031903e-06, + "loss": 0.5191, + "step": 876 + }, + { + "epoch": 0.1606815683400513, + "grad_norm": 0.4619425954928644, + "learning_rate": 9.88904723309464e-06, + "loss": 0.5114, + "step": 877 + }, + { + "epoch": 0.160864785635764, + "grad_norm": 0.4581492770476195, + "learning_rate": 9.888376290262134e-06, + "loss": 0.5349, + "step": 878 + }, + { + "epoch": 0.16104800293147672, + "grad_norm": 0.390022001202663, + "learning_rate": 9.887703347808838e-06, + "loss": 0.507, + "step": 879 + }, + { + "epoch": 0.16123122022718944, + "grad_norm": 0.38271876665722593, + "learning_rate": 9.887028406010026e-06, + "loss": 0.4987, + "step": 880 + }, + { + "epoch": 0.16141443752290216, + "grad_norm": 0.39991523006604696, + "learning_rate": 9.886351465141785e-06, + "loss": 0.4807, + "step": 881 + }, + { + "epoch": 0.16159765481861488, + "grad_norm": 0.3867195675970219, + "learning_rate": 9.88567252548102e-06, + "loss": 0.515, + "step": 882 + }, + { + "epoch": 0.1617808721143276, + "grad_norm": 0.47566001886735476, + "learning_rate": 9.884991587305459e-06, + "loss": 0.4461, + "step": 883 + }, + { + "epoch": 0.1619640894100403, + "grad_norm": 0.4453653597243864, + "learning_rate": 9.884308650893642e-06, + "loss": 0.5493, + "step": 884 + }, + { + "epoch": 0.16214730670575303, + "grad_norm": 0.38787040868110323, + "learning_rate": 9.883623716524929e-06, + "loss": 0.4998, + "step": 885 + }, + { + "epoch": 0.16233052400146575, + "grad_norm": 0.4570258592705414, + "learning_rate": 9.882936784479498e-06, + "loss": 0.5226, + "step": 886 + }, + { + "epoch": 0.16251374129717847, + "grad_norm": 0.4438200453897428, + "learning_rate": 9.882247855038339e-06, + "loss": 0.5073, + "step": 887 + }, + { + "epoch": 0.16269695859289116, + "grad_norm": 0.3887912773501902, + "learning_rate": 9.881556928483266e-06, + "loss": 0.5213, + "step": 888 + }, + { + "epoch": 0.16288017588860387, + "grad_norm": 0.44048355093202696, + "learning_rate": 9.880864005096906e-06, + "loss": 0.511, + "step": 889 + }, + { + "epoch": 0.1630633931843166, + "grad_norm": 0.44235079469227967, + "learning_rate": 9.880169085162703e-06, + "loss": 0.5223, + "step": 890 + }, + { + "epoch": 0.1632466104800293, + "grad_norm": 0.3877832289424, + "learning_rate": 9.87947216896492e-06, + "loss": 0.4537, + "step": 891 + }, + { + "epoch": 0.16342982777574203, + "grad_norm": 0.44869938145307137, + "learning_rate": 9.878773256788635e-06, + "loss": 0.5013, + "step": 892 + }, + { + "epoch": 0.16361304507145474, + "grad_norm": 0.46375171983760244, + "learning_rate": 9.878072348919738e-06, + "loss": 0.489, + "step": 893 + }, + { + "epoch": 0.16379626236716746, + "grad_norm": 0.49608852880970994, + "learning_rate": 9.877369445644945e-06, + "loss": 0.5177, + "step": 894 + }, + { + "epoch": 0.16397947966288018, + "grad_norm": 0.4082183549684993, + "learning_rate": 9.876664547251781e-06, + "loss": 0.5091, + "step": 895 + }, + { + "epoch": 0.1641626969585929, + "grad_norm": 0.4347968628628821, + "learning_rate": 9.875957654028588e-06, + "loss": 0.5242, + "step": 896 + }, + { + "epoch": 0.16434591425430561, + "grad_norm": 0.45545255399303036, + "learning_rate": 9.875248766264527e-06, + "loss": 0.5406, + "step": 897 + }, + { + "epoch": 0.16452913155001833, + "grad_norm": 0.41358644063630506, + "learning_rate": 9.874537884249574e-06, + "loss": 0.511, + "step": 898 + }, + { + "epoch": 0.16471234884573105, + "grad_norm": 0.4576667619206321, + "learning_rate": 9.873825008274514e-06, + "loss": 0.5073, + "step": 899 + }, + { + "epoch": 0.16489556614144374, + "grad_norm": 0.47721781865779744, + "learning_rate": 9.87311013863096e-06, + "loss": 0.5044, + "step": 900 + }, + { + "epoch": 0.16507878343715646, + "grad_norm": 0.4622288619794578, + "learning_rate": 9.872393275611329e-06, + "loss": 0.5096, + "step": 901 + }, + { + "epoch": 0.16526200073286917, + "grad_norm": 0.41177688817899083, + "learning_rate": 9.871674419508864e-06, + "loss": 0.523, + "step": 902 + }, + { + "epoch": 0.1654452180285819, + "grad_norm": 0.371054810469729, + "learning_rate": 9.87095357061761e-06, + "loss": 0.4897, + "step": 903 + }, + { + "epoch": 0.1656284353242946, + "grad_norm": 0.41574608839188226, + "learning_rate": 9.87023072923244e-06, + "loss": 0.4823, + "step": 904 + }, + { + "epoch": 0.16581165262000733, + "grad_norm": 0.4218915251343208, + "learning_rate": 9.869505895649036e-06, + "loss": 0.5079, + "step": 905 + }, + { + "epoch": 0.16599486991572004, + "grad_norm": 0.5549831381708766, + "learning_rate": 9.868779070163895e-06, + "loss": 0.484, + "step": 906 + }, + { + "epoch": 0.16617808721143276, + "grad_norm": 0.3891473564827362, + "learning_rate": 9.868050253074328e-06, + "loss": 0.4903, + "step": 907 + }, + { + "epoch": 0.16636130450714548, + "grad_norm": 0.4268945705632106, + "learning_rate": 9.867319444678465e-06, + "loss": 0.5024, + "step": 908 + }, + { + "epoch": 0.1665445218028582, + "grad_norm": 0.40332206359603545, + "learning_rate": 9.866586645275247e-06, + "loss": 0.4477, + "step": 909 + }, + { + "epoch": 0.16672773909857092, + "grad_norm": 0.3720747589583317, + "learning_rate": 9.86585185516443e-06, + "loss": 0.469, + "step": 910 + }, + { + "epoch": 0.16691095639428363, + "grad_norm": 0.451113552639029, + "learning_rate": 9.865115074646583e-06, + "loss": 0.5221, + "step": 911 + }, + { + "epoch": 0.16709417368999632, + "grad_norm": 0.4454998332020719, + "learning_rate": 9.864376304023092e-06, + "loss": 0.495, + "step": 912 + }, + { + "epoch": 0.16727739098570904, + "grad_norm": 0.5363910151131063, + "learning_rate": 9.863635543596156e-06, + "loss": 0.5233, + "step": 913 + }, + { + "epoch": 0.16746060828142176, + "grad_norm": 0.4392403856880424, + "learning_rate": 9.862892793668787e-06, + "loss": 0.4998, + "step": 914 + }, + { + "epoch": 0.16764382557713448, + "grad_norm": 0.42093143672832734, + "learning_rate": 9.862148054544812e-06, + "loss": 0.4977, + "step": 915 + }, + { + "epoch": 0.1678270428728472, + "grad_norm": 0.39275787265485074, + "learning_rate": 9.86140132652887e-06, + "loss": 0.4694, + "step": 916 + }, + { + "epoch": 0.1680102601685599, + "grad_norm": 0.4159908861548446, + "learning_rate": 9.860652609926417e-06, + "loss": 0.4886, + "step": 917 + }, + { + "epoch": 0.16819347746427263, + "grad_norm": 0.38579180273261776, + "learning_rate": 9.859901905043718e-06, + "loss": 0.48, + "step": 918 + }, + { + "epoch": 0.16837669475998535, + "grad_norm": 0.4273621763370794, + "learning_rate": 9.859149212187855e-06, + "loss": 0.512, + "step": 919 + }, + { + "epoch": 0.16855991205569806, + "grad_norm": 0.45913559298162226, + "learning_rate": 9.85839453166672e-06, + "loss": 0.4877, + "step": 920 + }, + { + "epoch": 0.16874312935141078, + "grad_norm": 0.3847363797846655, + "learning_rate": 9.85763786378902e-06, + "loss": 0.4586, + "step": 921 + }, + { + "epoch": 0.1689263466471235, + "grad_norm": 0.4872256895625686, + "learning_rate": 9.856879208864277e-06, + "loss": 0.5035, + "step": 922 + }, + { + "epoch": 0.16910956394283622, + "grad_norm": 0.47756329412725707, + "learning_rate": 9.85611856720282e-06, + "loss": 0.5031, + "step": 923 + }, + { + "epoch": 0.1692927812385489, + "grad_norm": 0.474431940864528, + "learning_rate": 9.8553559391158e-06, + "loss": 0.512, + "step": 924 + }, + { + "epoch": 0.16947599853426162, + "grad_norm": 0.41972238109665655, + "learning_rate": 9.85459132491517e-06, + "loss": 0.5386, + "step": 925 + }, + { + "epoch": 0.16965921582997434, + "grad_norm": 0.4369771091332424, + "learning_rate": 9.8538247249137e-06, + "loss": 0.4909, + "step": 926 + }, + { + "epoch": 0.16984243312568706, + "grad_norm": 0.4523772832392917, + "learning_rate": 9.853056139424974e-06, + "loss": 0.4951, + "step": 927 + }, + { + "epoch": 0.17002565042139978, + "grad_norm": 0.40235086982371876, + "learning_rate": 9.852285568763387e-06, + "loss": 0.4828, + "step": 928 + }, + { + "epoch": 0.1702088677171125, + "grad_norm": 0.4150844402567988, + "learning_rate": 9.851513013244144e-06, + "loss": 0.4933, + "step": 929 + }, + { + "epoch": 0.1703920850128252, + "grad_norm": 0.3941602147072389, + "learning_rate": 9.850738473183266e-06, + "loss": 0.4855, + "step": 930 + }, + { + "epoch": 0.17057530230853793, + "grad_norm": 0.47118394280764797, + "learning_rate": 9.849961948897582e-06, + "loss": 0.5024, + "step": 931 + }, + { + "epoch": 0.17075851960425065, + "grad_norm": 0.3912878663413955, + "learning_rate": 9.849183440704735e-06, + "loss": 0.5041, + "step": 932 + }, + { + "epoch": 0.17094173689996336, + "grad_norm": 0.4001961484144372, + "learning_rate": 9.848402948923177e-06, + "loss": 0.4753, + "step": 933 + }, + { + "epoch": 0.17112495419567608, + "grad_norm": 0.4441137892348275, + "learning_rate": 9.847620473872172e-06, + "loss": 0.5102, + "step": 934 + }, + { + "epoch": 0.1713081714913888, + "grad_norm": 0.40101758764349804, + "learning_rate": 9.846836015871802e-06, + "loss": 0.4824, + "step": 935 + }, + { + "epoch": 0.1714913887871015, + "grad_norm": 0.41075944565503664, + "learning_rate": 9.846049575242949e-06, + "loss": 0.5235, + "step": 936 + }, + { + "epoch": 0.1716746060828142, + "grad_norm": 0.3780837940291423, + "learning_rate": 9.845261152307312e-06, + "loss": 0.5211, + "step": 937 + }, + { + "epoch": 0.17185782337852692, + "grad_norm": 0.45070991069476113, + "learning_rate": 9.844470747387403e-06, + "loss": 0.4926, + "step": 938 + }, + { + "epoch": 0.17204104067423964, + "grad_norm": 0.40836878453605646, + "learning_rate": 9.843678360806542e-06, + "loss": 0.5173, + "step": 939 + }, + { + "epoch": 0.17222425796995236, + "grad_norm": 0.36676941914948374, + "learning_rate": 9.842883992888855e-06, + "loss": 0.5101, + "step": 940 + }, + { + "epoch": 0.17240747526566508, + "grad_norm": 0.42894488109200807, + "learning_rate": 9.842087643959288e-06, + "loss": 0.5398, + "step": 941 + }, + { + "epoch": 0.1725906925613778, + "grad_norm": 0.4004768449690441, + "learning_rate": 9.841289314343591e-06, + "loss": 0.4967, + "step": 942 + }, + { + "epoch": 0.1727739098570905, + "grad_norm": 0.3781076276341992, + "learning_rate": 9.840489004368325e-06, + "loss": 0.4731, + "step": 943 + }, + { + "epoch": 0.17295712715280323, + "grad_norm": 0.45307945315267256, + "learning_rate": 9.839686714360864e-06, + "loss": 0.4737, + "step": 944 + }, + { + "epoch": 0.17314034444851595, + "grad_norm": 0.4079247794136399, + "learning_rate": 9.838882444649387e-06, + "loss": 0.4927, + "step": 945 + }, + { + "epoch": 0.17332356174422867, + "grad_norm": 0.47171018664243136, + "learning_rate": 9.838076195562886e-06, + "loss": 0.5009, + "step": 946 + }, + { + "epoch": 0.17350677903994138, + "grad_norm": 0.44572108382734166, + "learning_rate": 9.837267967431164e-06, + "loss": 0.4858, + "step": 947 + }, + { + "epoch": 0.17368999633565407, + "grad_norm": 0.40649730883742624, + "learning_rate": 9.83645776058483e-06, + "loss": 0.4943, + "step": 948 + }, + { + "epoch": 0.1738732136313668, + "grad_norm": 0.43002759908477306, + "learning_rate": 9.835645575355304e-06, + "loss": 0.4872, + "step": 949 + }, + { + "epoch": 0.1740564309270795, + "grad_norm": 0.39064612647120167, + "learning_rate": 9.834831412074816e-06, + "loss": 0.4409, + "step": 950 + }, + { + "epoch": 0.17423964822279223, + "grad_norm": 0.3761573603114394, + "learning_rate": 9.834015271076405e-06, + "loss": 0.459, + "step": 951 + }, + { + "epoch": 0.17442286551850494, + "grad_norm": 0.42796519720566795, + "learning_rate": 9.83319715269392e-06, + "loss": 0.4906, + "step": 952 + }, + { + "epoch": 0.17460608281421766, + "grad_norm": 0.5091807084893948, + "learning_rate": 9.832377057262015e-06, + "loss": 0.5516, + "step": 953 + }, + { + "epoch": 0.17478930010993038, + "grad_norm": 0.48065067696295527, + "learning_rate": 9.831554985116155e-06, + "loss": 0.5097, + "step": 954 + }, + { + "epoch": 0.1749725174056431, + "grad_norm": 0.4310364985277951, + "learning_rate": 9.830730936592615e-06, + "loss": 0.5025, + "step": 955 + }, + { + "epoch": 0.17515573470135581, + "grad_norm": 0.39964086093662415, + "learning_rate": 9.829904912028477e-06, + "loss": 0.494, + "step": 956 + }, + { + "epoch": 0.17533895199706853, + "grad_norm": 0.40935380850458175, + "learning_rate": 9.829076911761631e-06, + "loss": 0.5182, + "step": 957 + }, + { + "epoch": 0.17552216929278125, + "grad_norm": 0.454766087870644, + "learning_rate": 9.828246936130777e-06, + "loss": 0.5113, + "step": 958 + }, + { + "epoch": 0.17570538658849397, + "grad_norm": 0.5087972352515657, + "learning_rate": 9.827414985475419e-06, + "loss": 0.4797, + "step": 959 + }, + { + "epoch": 0.17588860388420666, + "grad_norm": 0.4255562501722622, + "learning_rate": 9.826581060135873e-06, + "loss": 0.5045, + "step": 960 + }, + { + "epoch": 0.17607182117991937, + "grad_norm": 0.45583787528367603, + "learning_rate": 9.825745160453264e-06, + "loss": 0.4755, + "step": 961 + }, + { + "epoch": 0.1762550384756321, + "grad_norm": 0.45648883029913284, + "learning_rate": 9.824907286769519e-06, + "loss": 0.4736, + "step": 962 + }, + { + "epoch": 0.1764382557713448, + "grad_norm": 0.436071698669034, + "learning_rate": 9.824067439427374e-06, + "loss": 0.5036, + "step": 963 + }, + { + "epoch": 0.17662147306705753, + "grad_norm": 0.41502000197263017, + "learning_rate": 9.823225618770378e-06, + "loss": 0.5121, + "step": 964 + }, + { + "epoch": 0.17680469036277024, + "grad_norm": 0.450252173902017, + "learning_rate": 9.822381825142879e-06, + "loss": 0.506, + "step": 965 + }, + { + "epoch": 0.17698790765848296, + "grad_norm": 0.43561778776659554, + "learning_rate": 9.82153605889004e-06, + "loss": 0.5304, + "step": 966 + }, + { + "epoch": 0.17717112495419568, + "grad_norm": 0.4306520452468297, + "learning_rate": 9.820688320357823e-06, + "loss": 0.5112, + "step": 967 + }, + { + "epoch": 0.1773543422499084, + "grad_norm": 0.39680977809710444, + "learning_rate": 9.819838609893005e-06, + "loss": 0.4904, + "step": 968 + }, + { + "epoch": 0.17753755954562112, + "grad_norm": 0.4225898008101114, + "learning_rate": 9.81898692784316e-06, + "loss": 0.5137, + "step": 969 + }, + { + "epoch": 0.17772077684133383, + "grad_norm": 0.4267062187327317, + "learning_rate": 9.818133274556679e-06, + "loss": 0.517, + "step": 970 + }, + { + "epoch": 0.17790399413704655, + "grad_norm": 0.4450452397955529, + "learning_rate": 9.81727765038275e-06, + "loss": 0.4949, + "step": 971 + }, + { + "epoch": 0.17808721143275924, + "grad_norm": 0.42730228271929965, + "learning_rate": 9.816420055671374e-06, + "loss": 0.5013, + "step": 972 + }, + { + "epoch": 0.17827042872847196, + "grad_norm": 0.4421383626296405, + "learning_rate": 9.815560490773356e-06, + "loss": 0.4679, + "step": 973 + }, + { + "epoch": 0.17845364602418468, + "grad_norm": 0.43307265096730885, + "learning_rate": 9.814698956040305e-06, + "loss": 0.4976, + "step": 974 + }, + { + "epoch": 0.1786368633198974, + "grad_norm": 0.40376464412932705, + "learning_rate": 9.813835451824636e-06, + "loss": 0.5021, + "step": 975 + }, + { + "epoch": 0.1788200806156101, + "grad_norm": 0.44775776339538953, + "learning_rate": 9.812969978479573e-06, + "loss": 0.4824, + "step": 976 + }, + { + "epoch": 0.17900329791132283, + "grad_norm": 0.468036097947629, + "learning_rate": 9.812102536359142e-06, + "loss": 0.4966, + "step": 977 + }, + { + "epoch": 0.17918651520703555, + "grad_norm": 0.49832149697786493, + "learning_rate": 9.811233125818176e-06, + "loss": 0.4853, + "step": 978 + }, + { + "epoch": 0.17936973250274826, + "grad_norm": 0.44754723833693727, + "learning_rate": 9.810361747212313e-06, + "loss": 0.4999, + "step": 979 + }, + { + "epoch": 0.17955294979846098, + "grad_norm": 0.4747011696315986, + "learning_rate": 9.809488400897996e-06, + "loss": 0.5108, + "step": 980 + }, + { + "epoch": 0.1797361670941737, + "grad_norm": 0.4055842519238836, + "learning_rate": 9.808613087232473e-06, + "loss": 0.4786, + "step": 981 + }, + { + "epoch": 0.17991938438988642, + "grad_norm": 0.3971701443501286, + "learning_rate": 9.807735806573795e-06, + "loss": 0.4925, + "step": 982 + }, + { + "epoch": 0.18010260168559913, + "grad_norm": 0.3784480421793096, + "learning_rate": 9.806856559280819e-06, + "loss": 0.4762, + "step": 983 + }, + { + "epoch": 0.18028581898131182, + "grad_norm": 0.41465183173286063, + "learning_rate": 9.80597534571321e-06, + "loss": 0.5037, + "step": 984 + }, + { + "epoch": 0.18046903627702454, + "grad_norm": 0.4713550462839489, + "learning_rate": 9.80509216623143e-06, + "loss": 0.4816, + "step": 985 + }, + { + "epoch": 0.18065225357273726, + "grad_norm": 0.34923786309805893, + "learning_rate": 9.804207021196751e-06, + "loss": 0.4903, + "step": 986 + }, + { + "epoch": 0.18083547086844998, + "grad_norm": 0.4359203385963395, + "learning_rate": 9.803319910971248e-06, + "loss": 0.5252, + "step": 987 + }, + { + "epoch": 0.1810186881641627, + "grad_norm": 0.40286824933567683, + "learning_rate": 9.802430835917796e-06, + "loss": 0.5248, + "step": 988 + }, + { + "epoch": 0.1812019054598754, + "grad_norm": 0.44314680454994215, + "learning_rate": 9.801539796400078e-06, + "loss": 0.5071, + "step": 989 + }, + { + "epoch": 0.18138512275558813, + "grad_norm": 0.44074612784921385, + "learning_rate": 9.80064679278258e-06, + "loss": 0.5057, + "step": 990 + }, + { + "epoch": 0.18156834005130085, + "grad_norm": 0.40968360376086815, + "learning_rate": 9.799751825430592e-06, + "loss": 0.4919, + "step": 991 + }, + { + "epoch": 0.18175155734701356, + "grad_norm": 0.4204735967781832, + "learning_rate": 9.798854894710202e-06, + "loss": 0.546, + "step": 992 + }, + { + "epoch": 0.18193477464272628, + "grad_norm": 0.36210624498761007, + "learning_rate": 9.79795600098831e-06, + "loss": 0.4904, + "step": 993 + }, + { + "epoch": 0.182117991938439, + "grad_norm": 0.45193717949202633, + "learning_rate": 9.797055144632609e-06, + "loss": 0.5292, + "step": 994 + }, + { + "epoch": 0.18230120923415172, + "grad_norm": 0.415630496229026, + "learning_rate": 9.796152326011604e-06, + "loss": 0.533, + "step": 995 + }, + { + "epoch": 0.1824844265298644, + "grad_norm": 0.39372729416385926, + "learning_rate": 9.795247545494594e-06, + "loss": 0.5147, + "step": 996 + }, + { + "epoch": 0.18266764382557713, + "grad_norm": 0.43540226077312505, + "learning_rate": 9.794340803451692e-06, + "loss": 0.49, + "step": 997 + }, + { + "epoch": 0.18285086112128984, + "grad_norm": 0.39372791981341043, + "learning_rate": 9.7934321002538e-06, + "loss": 0.4776, + "step": 998 + }, + { + "epoch": 0.18303407841700256, + "grad_norm": 0.40381508343023165, + "learning_rate": 9.792521436272633e-06, + "loss": 0.4566, + "step": 999 + }, + { + "epoch": 0.18321729571271528, + "grad_norm": 0.47729065322343966, + "learning_rate": 9.791608811880702e-06, + "loss": 0.5013, + "step": 1000 + }, + { + "epoch": 0.183400513008428, + "grad_norm": 0.4569898838129068, + "learning_rate": 9.79069422745132e-06, + "loss": 0.4834, + "step": 1001 + }, + { + "epoch": 0.1835837303041407, + "grad_norm": 0.41137738030360954, + "learning_rate": 9.789777683358607e-06, + "loss": 0.4982, + "step": 1002 + }, + { + "epoch": 0.18376694759985343, + "grad_norm": 0.443462511432857, + "learning_rate": 9.788859179977478e-06, + "loss": 0.4822, + "step": 1003 + }, + { + "epoch": 0.18395016489556615, + "grad_norm": 0.3803483898936361, + "learning_rate": 9.787938717683654e-06, + "loss": 0.4781, + "step": 1004 + }, + { + "epoch": 0.18413338219127887, + "grad_norm": 0.4711405888417879, + "learning_rate": 9.787016296853657e-06, + "loss": 0.5104, + "step": 1005 + }, + { + "epoch": 0.18431659948699158, + "grad_norm": 0.42018309024133216, + "learning_rate": 9.786091917864807e-06, + "loss": 0.4922, + "step": 1006 + }, + { + "epoch": 0.18449981678270427, + "grad_norm": 0.46053661876076, + "learning_rate": 9.785165581095225e-06, + "loss": 0.4777, + "step": 1007 + }, + { + "epoch": 0.184683034078417, + "grad_norm": 0.38543837395940844, + "learning_rate": 9.78423728692384e-06, + "loss": 0.5053, + "step": 1008 + }, + { + "epoch": 0.1848662513741297, + "grad_norm": 0.3681107156627339, + "learning_rate": 9.783307035730375e-06, + "loss": 0.463, + "step": 1009 + }, + { + "epoch": 0.18504946866984243, + "grad_norm": 0.42988658038887384, + "learning_rate": 9.782374827895353e-06, + "loss": 0.5179, + "step": 1010 + }, + { + "epoch": 0.18523268596555514, + "grad_norm": 0.3985975808728344, + "learning_rate": 9.781440663800099e-06, + "loss": 0.4979, + "step": 1011 + }, + { + "epoch": 0.18541590326126786, + "grad_norm": 0.419869548881785, + "learning_rate": 9.780504543826741e-06, + "loss": 0.4952, + "step": 1012 + }, + { + "epoch": 0.18559912055698058, + "grad_norm": 0.4348607167141751, + "learning_rate": 9.779566468358205e-06, + "loss": 0.4999, + "step": 1013 + }, + { + "epoch": 0.1857823378526933, + "grad_norm": 0.43518629406794973, + "learning_rate": 9.778626437778214e-06, + "loss": 0.5028, + "step": 1014 + }, + { + "epoch": 0.18596555514840601, + "grad_norm": 0.4475968637904264, + "learning_rate": 9.777684452471296e-06, + "loss": 0.4997, + "step": 1015 + }, + { + "epoch": 0.18614877244411873, + "grad_norm": 0.4834533700606862, + "learning_rate": 9.776740512822773e-06, + "loss": 0.5139, + "step": 1016 + }, + { + "epoch": 0.18633198973983145, + "grad_norm": 0.49443428074486423, + "learning_rate": 9.775794619218773e-06, + "loss": 0.4818, + "step": 1017 + }, + { + "epoch": 0.18651520703554417, + "grad_norm": 0.42771715112146097, + "learning_rate": 9.774846772046216e-06, + "loss": 0.507, + "step": 1018 + }, + { + "epoch": 0.18669842433125686, + "grad_norm": 0.43776726147874706, + "learning_rate": 9.77389697169283e-06, + "loss": 0.5002, + "step": 1019 + }, + { + "epoch": 0.18688164162696957, + "grad_norm": 0.44345646316302056, + "learning_rate": 9.77294521854713e-06, + "loss": 0.4953, + "step": 1020 + }, + { + "epoch": 0.1870648589226823, + "grad_norm": 0.4199805356687226, + "learning_rate": 9.77199151299844e-06, + "loss": 0.491, + "step": 1021 + }, + { + "epoch": 0.187248076218395, + "grad_norm": 0.4157991523056247, + "learning_rate": 9.77103585543688e-06, + "loss": 0.4809, + "step": 1022 + }, + { + "epoch": 0.18743129351410773, + "grad_norm": 0.3883786524287937, + "learning_rate": 9.770078246253367e-06, + "loss": 0.4811, + "step": 1023 + }, + { + "epoch": 0.18761451080982045, + "grad_norm": 0.4460460362572771, + "learning_rate": 9.769118685839616e-06, + "loss": 0.4869, + "step": 1024 + }, + { + "epoch": 0.18779772810553316, + "grad_norm": 0.4153757045736682, + "learning_rate": 9.768157174588144e-06, + "loss": 0.4464, + "step": 1025 + }, + { + "epoch": 0.18798094540124588, + "grad_norm": 0.4146331433621528, + "learning_rate": 9.767193712892259e-06, + "loss": 0.4588, + "step": 1026 + }, + { + "epoch": 0.1881641626969586, + "grad_norm": 0.42852199415357106, + "learning_rate": 9.766228301146074e-06, + "loss": 0.5132, + "step": 1027 + }, + { + "epoch": 0.18834737999267132, + "grad_norm": 0.40617799593038223, + "learning_rate": 9.765260939744496e-06, + "loss": 0.4715, + "step": 1028 + }, + { + "epoch": 0.18853059728838403, + "grad_norm": 0.43057345885222986, + "learning_rate": 9.76429162908323e-06, + "loss": 0.4613, + "step": 1029 + }, + { + "epoch": 0.18871381458409675, + "grad_norm": 0.4287852170015339, + "learning_rate": 9.763320369558777e-06, + "loss": 0.5029, + "step": 1030 + }, + { + "epoch": 0.18889703187980944, + "grad_norm": 0.4338612990572842, + "learning_rate": 9.762347161568441e-06, + "loss": 0.51, + "step": 1031 + }, + { + "epoch": 0.18908024917552216, + "grad_norm": 0.6099286344029112, + "learning_rate": 9.761372005510315e-06, + "loss": 0.5176, + "step": 1032 + }, + { + "epoch": 0.18926346647123488, + "grad_norm": 0.4139699553573524, + "learning_rate": 9.760394901783294e-06, + "loss": 0.4803, + "step": 1033 + }, + { + "epoch": 0.1894466837669476, + "grad_norm": 0.4613287614032801, + "learning_rate": 9.759415850787068e-06, + "loss": 0.5114, + "step": 1034 + }, + { + "epoch": 0.1896299010626603, + "grad_norm": 0.43042214487008984, + "learning_rate": 9.758434852922124e-06, + "loss": 0.5049, + "step": 1035 + }, + { + "epoch": 0.18981311835837303, + "grad_norm": 0.4708254422812291, + "learning_rate": 9.757451908589746e-06, + "loss": 0.5466, + "step": 1036 + }, + { + "epoch": 0.18999633565408575, + "grad_norm": 0.4823877518080949, + "learning_rate": 9.756467018192013e-06, + "loss": 0.4851, + "step": 1037 + }, + { + "epoch": 0.19017955294979846, + "grad_norm": 0.49290249361142535, + "learning_rate": 9.7554801821318e-06, + "loss": 0.4915, + "step": 1038 + }, + { + "epoch": 0.19036277024551118, + "grad_norm": 0.5097570694075713, + "learning_rate": 9.754491400812779e-06, + "loss": 0.5195, + "step": 1039 + }, + { + "epoch": 0.1905459875412239, + "grad_norm": 0.41886567376911943, + "learning_rate": 9.753500674639417e-06, + "loss": 0.5133, + "step": 1040 + }, + { + "epoch": 0.19072920483693662, + "grad_norm": 0.48924775085841404, + "learning_rate": 9.752508004016976e-06, + "loss": 0.5174, + "step": 1041 + }, + { + "epoch": 0.19091242213264933, + "grad_norm": 0.44418461742112236, + "learning_rate": 9.751513389351517e-06, + "loss": 0.4711, + "step": 1042 + }, + { + "epoch": 0.19109563942836202, + "grad_norm": 0.4103673664900802, + "learning_rate": 9.75051683104989e-06, + "loss": 0.478, + "step": 1043 + }, + { + "epoch": 0.19127885672407474, + "grad_norm": 0.45394849049437136, + "learning_rate": 9.749518329519745e-06, + "loss": 0.5069, + "step": 1044 + }, + { + "epoch": 0.19146207401978746, + "grad_norm": 0.4841087058964896, + "learning_rate": 9.748517885169525e-06, + "loss": 0.5226, + "step": 1045 + }, + { + "epoch": 0.19164529131550018, + "grad_norm": 0.4138362206735341, + "learning_rate": 9.747515498408466e-06, + "loss": 0.5085, + "step": 1046 + }, + { + "epoch": 0.1918285086112129, + "grad_norm": 0.4267417097417006, + "learning_rate": 9.746511169646604e-06, + "loss": 0.5074, + "step": 1047 + }, + { + "epoch": 0.1920117259069256, + "grad_norm": 0.4397686656970245, + "learning_rate": 9.745504899294764e-06, + "loss": 0.5242, + "step": 1048 + }, + { + "epoch": 0.19219494320263833, + "grad_norm": 0.6027364595972364, + "learning_rate": 9.744496687764568e-06, + "loss": 0.5046, + "step": 1049 + }, + { + "epoch": 0.19237816049835105, + "grad_norm": 0.45130752478163744, + "learning_rate": 9.743486535468431e-06, + "loss": 0.4699, + "step": 1050 + }, + { + "epoch": 0.19256137779406376, + "grad_norm": 0.4971542747077622, + "learning_rate": 9.742474442819561e-06, + "loss": 0.4446, + "step": 1051 + }, + { + "epoch": 0.19274459508977648, + "grad_norm": 0.566649600088142, + "learning_rate": 9.741460410231962e-06, + "loss": 0.5531, + "step": 1052 + }, + { + "epoch": 0.1929278123854892, + "grad_norm": 0.44162327591540923, + "learning_rate": 9.74044443812043e-06, + "loss": 0.5194, + "step": 1053 + }, + { + "epoch": 0.19311102968120192, + "grad_norm": 0.40323605263226747, + "learning_rate": 9.739426526900555e-06, + "loss": 0.5013, + "step": 1054 + }, + { + "epoch": 0.1932942469769146, + "grad_norm": 0.45567902759630347, + "learning_rate": 9.738406676988721e-06, + "loss": 0.4397, + "step": 1055 + }, + { + "epoch": 0.19347746427262733, + "grad_norm": 0.4604601552443739, + "learning_rate": 9.737384888802099e-06, + "loss": 0.4888, + "step": 1056 + }, + { + "epoch": 0.19366068156834004, + "grad_norm": 0.3927639599518843, + "learning_rate": 9.736361162758665e-06, + "loss": 0.4632, + "step": 1057 + }, + { + "epoch": 0.19384389886405276, + "grad_norm": 0.40880550818933825, + "learning_rate": 9.735335499277176e-06, + "loss": 0.484, + "step": 1058 + }, + { + "epoch": 0.19402711615976548, + "grad_norm": 0.4813436727815402, + "learning_rate": 9.734307898777187e-06, + "loss": 0.5351, + "step": 1059 + }, + { + "epoch": 0.1942103334554782, + "grad_norm": 0.44626763914485296, + "learning_rate": 9.733278361679042e-06, + "loss": 0.4847, + "step": 1060 + }, + { + "epoch": 0.1943935507511909, + "grad_norm": 0.3999429153352251, + "learning_rate": 9.732246888403885e-06, + "loss": 0.4888, + "step": 1061 + }, + { + "epoch": 0.19457676804690363, + "grad_norm": 0.39498667028752193, + "learning_rate": 9.731213479373643e-06, + "loss": 0.4947, + "step": 1062 + }, + { + "epoch": 0.19475998534261635, + "grad_norm": 0.4096597841615662, + "learning_rate": 9.730178135011036e-06, + "loss": 0.4747, + "step": 1063 + }, + { + "epoch": 0.19494320263832907, + "grad_norm": 0.4279641584926225, + "learning_rate": 9.729140855739585e-06, + "loss": 0.5013, + "step": 1064 + }, + { + "epoch": 0.19512641993404178, + "grad_norm": 0.5191493361942445, + "learning_rate": 9.72810164198359e-06, + "loss": 0.4947, + "step": 1065 + }, + { + "epoch": 0.1953096372297545, + "grad_norm": 0.4196384047011208, + "learning_rate": 9.727060494168148e-06, + "loss": 0.4825, + "step": 1066 + }, + { + "epoch": 0.1954928545254672, + "grad_norm": 0.4345875246850262, + "learning_rate": 9.726017412719151e-06, + "loss": 0.4734, + "step": 1067 + }, + { + "epoch": 0.1956760718211799, + "grad_norm": 0.45563595275942326, + "learning_rate": 9.724972398063273e-06, + "loss": 0.5153, + "step": 1068 + }, + { + "epoch": 0.19585928911689263, + "grad_norm": 0.44886022596575603, + "learning_rate": 9.723925450627988e-06, + "loss": 0.478, + "step": 1069 + }, + { + "epoch": 0.19604250641260534, + "grad_norm": 0.413358196754176, + "learning_rate": 9.722876570841554e-06, + "loss": 0.5132, + "step": 1070 + }, + { + "epoch": 0.19622572370831806, + "grad_norm": 0.41859249060654946, + "learning_rate": 9.721825759133022e-06, + "loss": 0.4911, + "step": 1071 + }, + { + "epoch": 0.19640894100403078, + "grad_norm": 0.4261104024613703, + "learning_rate": 9.720773015932234e-06, + "loss": 0.512, + "step": 1072 + }, + { + "epoch": 0.1965921582997435, + "grad_norm": 0.41207466582794494, + "learning_rate": 9.71971834166982e-06, + "loss": 0.4739, + "step": 1073 + }, + { + "epoch": 0.19677537559545621, + "grad_norm": 0.4191753315427551, + "learning_rate": 9.718661736777202e-06, + "loss": 0.4906, + "step": 1074 + }, + { + "epoch": 0.19695859289116893, + "grad_norm": 0.42085342925013736, + "learning_rate": 9.71760320168659e-06, + "loss": 0.5164, + "step": 1075 + }, + { + "epoch": 0.19714181018688165, + "grad_norm": 0.4268987095250944, + "learning_rate": 9.716542736830985e-06, + "loss": 0.4861, + "step": 1076 + }, + { + "epoch": 0.19732502748259437, + "grad_norm": 0.42610930294181376, + "learning_rate": 9.715480342644177e-06, + "loss": 0.475, + "step": 1077 + }, + { + "epoch": 0.19750824477830708, + "grad_norm": 0.46458365169769283, + "learning_rate": 9.714416019560744e-06, + "loss": 0.4949, + "step": 1078 + }, + { + "epoch": 0.19769146207401977, + "grad_norm": 0.40388410318318285, + "learning_rate": 9.713349768016055e-06, + "loss": 0.4886, + "step": 1079 + }, + { + "epoch": 0.1978746793697325, + "grad_norm": 0.47359324461508207, + "learning_rate": 9.712281588446267e-06, + "loss": 0.5166, + "step": 1080 + }, + { + "epoch": 0.1980578966654452, + "grad_norm": 0.3996969974243518, + "learning_rate": 9.711211481288323e-06, + "loss": 0.5208, + "step": 1081 + }, + { + "epoch": 0.19824111396115793, + "grad_norm": 0.37701213979445153, + "learning_rate": 9.710139446979961e-06, + "loss": 0.5, + "step": 1082 + }, + { + "epoch": 0.19842433125687065, + "grad_norm": 0.43491339118601396, + "learning_rate": 9.7090654859597e-06, + "loss": 0.4899, + "step": 1083 + }, + { + "epoch": 0.19860754855258336, + "grad_norm": 0.4651922376126236, + "learning_rate": 9.707989598666852e-06, + "loss": 0.4684, + "step": 1084 + }, + { + "epoch": 0.19879076584829608, + "grad_norm": 0.41200327863289965, + "learning_rate": 9.706911785541515e-06, + "loss": 0.4408, + "step": 1085 + }, + { + "epoch": 0.1989739831440088, + "grad_norm": 0.48300341584043166, + "learning_rate": 9.705832047024574e-06, + "loss": 0.53, + "step": 1086 + }, + { + "epoch": 0.19915720043972152, + "grad_norm": 0.4403619842708917, + "learning_rate": 9.704750383557707e-06, + "loss": 0.5361, + "step": 1087 + }, + { + "epoch": 0.19934041773543423, + "grad_norm": 0.3719557224033517, + "learning_rate": 9.70366679558337e-06, + "loss": 0.4948, + "step": 1088 + }, + { + "epoch": 0.19952363503114695, + "grad_norm": 0.4176787500801207, + "learning_rate": 9.702581283544813e-06, + "loss": 0.5145, + "step": 1089 + }, + { + "epoch": 0.19970685232685967, + "grad_norm": 0.4221402861265115, + "learning_rate": 9.701493847886075e-06, + "loss": 0.5128, + "step": 1090 + }, + { + "epoch": 0.19989006962257236, + "grad_norm": 0.40690517337963633, + "learning_rate": 9.700404489051974e-06, + "loss": 0.4911, + "step": 1091 + }, + { + "epoch": 0.20007328691828508, + "grad_norm": 0.46227138813892915, + "learning_rate": 9.69931320748812e-06, + "loss": 0.5037, + "step": 1092 + }, + { + "epoch": 0.2002565042139978, + "grad_norm": 0.3860882899117114, + "learning_rate": 9.69822000364091e-06, + "loss": 0.5063, + "step": 1093 + }, + { + "epoch": 0.2004397215097105, + "grad_norm": 0.512213232549586, + "learning_rate": 9.697124877957524e-06, + "loss": 0.5019, + "step": 1094 + }, + { + "epoch": 0.20062293880542323, + "grad_norm": 0.455116276042089, + "learning_rate": 9.69602783088593e-06, + "loss": 0.5145, + "step": 1095 + }, + { + "epoch": 0.20080615610113595, + "grad_norm": 0.4251117291337367, + "learning_rate": 9.694928862874883e-06, + "loss": 0.5096, + "step": 1096 + }, + { + "epoch": 0.20098937339684866, + "grad_norm": 0.46240748219044453, + "learning_rate": 9.693827974373925e-06, + "loss": 0.4506, + "step": 1097 + }, + { + "epoch": 0.20117259069256138, + "grad_norm": 0.35673376148210156, + "learning_rate": 9.692725165833377e-06, + "loss": 0.4688, + "step": 1098 + }, + { + "epoch": 0.2013558079882741, + "grad_norm": 0.469083523866663, + "learning_rate": 9.69162043770435e-06, + "loss": 0.5049, + "step": 1099 + }, + { + "epoch": 0.20153902528398682, + "grad_norm": 0.42797732688089807, + "learning_rate": 9.690513790438743e-06, + "loss": 0.4696, + "step": 1100 + }, + { + "epoch": 0.20172224257969953, + "grad_norm": 0.4281881317554375, + "learning_rate": 9.689405224489235e-06, + "loss": 0.4973, + "step": 1101 + }, + { + "epoch": 0.20190545987541225, + "grad_norm": 0.4513286311031276, + "learning_rate": 9.688294740309292e-06, + "loss": 0.4694, + "step": 1102 + }, + { + "epoch": 0.20208867717112494, + "grad_norm": 0.3940963878288531, + "learning_rate": 9.687182338353166e-06, + "loss": 0.4765, + "step": 1103 + }, + { + "epoch": 0.20227189446683766, + "grad_norm": 0.3902749441842877, + "learning_rate": 9.68606801907589e-06, + "loss": 0.4759, + "step": 1104 + }, + { + "epoch": 0.20245511176255038, + "grad_norm": 0.4789649740688253, + "learning_rate": 9.684951782933282e-06, + "loss": 0.4948, + "step": 1105 + }, + { + "epoch": 0.2026383290582631, + "grad_norm": 0.47833293392492787, + "learning_rate": 9.683833630381949e-06, + "loss": 0.5239, + "step": 1106 + }, + { + "epoch": 0.2028215463539758, + "grad_norm": 0.38231892821814056, + "learning_rate": 9.682713561879275e-06, + "loss": 0.4958, + "step": 1107 + }, + { + "epoch": 0.20300476364968853, + "grad_norm": 0.37675479403578893, + "learning_rate": 9.681591577883433e-06, + "loss": 0.4778, + "step": 1108 + }, + { + "epoch": 0.20318798094540125, + "grad_norm": 0.3570523937587689, + "learning_rate": 9.680467678853378e-06, + "loss": 0.484, + "step": 1109 + }, + { + "epoch": 0.20337119824111397, + "grad_norm": 0.4245386369867795, + "learning_rate": 9.679341865248848e-06, + "loss": 0.4866, + "step": 1110 + }, + { + "epoch": 0.20355441553682668, + "grad_norm": 0.4405417241764804, + "learning_rate": 9.678214137530361e-06, + "loss": 0.4532, + "step": 1111 + }, + { + "epoch": 0.2037376328325394, + "grad_norm": 0.3965079787279517, + "learning_rate": 9.677084496159224e-06, + "loss": 0.4731, + "step": 1112 + }, + { + "epoch": 0.20392085012825212, + "grad_norm": 0.38208118269449826, + "learning_rate": 9.675952941597522e-06, + "loss": 0.4397, + "step": 1113 + }, + { + "epoch": 0.20410406742396484, + "grad_norm": 0.39488634039550174, + "learning_rate": 9.674819474308126e-06, + "loss": 0.4928, + "step": 1114 + }, + { + "epoch": 0.20428728471967753, + "grad_norm": 0.3739080933909842, + "learning_rate": 9.673684094754686e-06, + "loss": 0.486, + "step": 1115 + }, + { + "epoch": 0.20447050201539024, + "grad_norm": 0.3958666571427383, + "learning_rate": 9.672546803401638e-06, + "loss": 0.5102, + "step": 1116 + }, + { + "epoch": 0.20465371931110296, + "grad_norm": 1.2610206438333627, + "learning_rate": 9.671407600714197e-06, + "loss": 0.4555, + "step": 1117 + }, + { + "epoch": 0.20483693660681568, + "grad_norm": 0.4461796609741918, + "learning_rate": 9.670266487158363e-06, + "loss": 0.4718, + "step": 1118 + }, + { + "epoch": 0.2050201539025284, + "grad_norm": 0.43024748470939017, + "learning_rate": 9.669123463200914e-06, + "loss": 0.4953, + "step": 1119 + }, + { + "epoch": 0.2052033711982411, + "grad_norm": 0.44154226794099993, + "learning_rate": 9.66797852930941e-06, + "loss": 0.5181, + "step": 1120 + }, + { + "epoch": 0.20538658849395383, + "grad_norm": 0.41502499575208784, + "learning_rate": 9.666831685952194e-06, + "loss": 0.4772, + "step": 1121 + }, + { + "epoch": 0.20556980578966655, + "grad_norm": 0.4084436936730005, + "learning_rate": 9.66568293359839e-06, + "loss": 0.5047, + "step": 1122 + }, + { + "epoch": 0.20575302308537927, + "grad_norm": 0.43479037966133816, + "learning_rate": 9.664532272717902e-06, + "loss": 0.4698, + "step": 1123 + }, + { + "epoch": 0.20593624038109198, + "grad_norm": 0.7606951229520282, + "learning_rate": 9.663379703781414e-06, + "loss": 0.4841, + "step": 1124 + }, + { + "epoch": 0.2061194576768047, + "grad_norm": 0.4006994667526495, + "learning_rate": 9.662225227260395e-06, + "loss": 0.4963, + "step": 1125 + }, + { + "epoch": 0.20630267497251742, + "grad_norm": 0.39866531966563096, + "learning_rate": 9.661068843627088e-06, + "loss": 0.4777, + "step": 1126 + }, + { + "epoch": 0.2064858922682301, + "grad_norm": 0.4035785610200985, + "learning_rate": 9.65991055335452e-06, + "loss": 0.4666, + "step": 1127 + }, + { + "epoch": 0.20666910956394283, + "grad_norm": 0.438045344209569, + "learning_rate": 9.658750356916494e-06, + "loss": 0.4956, + "step": 1128 + }, + { + "epoch": 0.20685232685965554, + "grad_norm": 0.3835030416878391, + "learning_rate": 9.657588254787598e-06, + "loss": 0.4886, + "step": 1129 + }, + { + "epoch": 0.20703554415536826, + "grad_norm": 0.4044887736362754, + "learning_rate": 9.6564242474432e-06, + "loss": 0.4831, + "step": 1130 + }, + { + "epoch": 0.20721876145108098, + "grad_norm": 0.40816269155213675, + "learning_rate": 9.655258335359438e-06, + "loss": 0.497, + "step": 1131 + }, + { + "epoch": 0.2074019787467937, + "grad_norm": 0.39413831022845913, + "learning_rate": 9.65409051901324e-06, + "loss": 0.4588, + "step": 1132 + }, + { + "epoch": 0.20758519604250641, + "grad_norm": 0.5053059172212088, + "learning_rate": 9.652920798882307e-06, + "loss": 0.4985, + "step": 1133 + }, + { + "epoch": 0.20776841333821913, + "grad_norm": 0.3669036545586826, + "learning_rate": 9.651749175445123e-06, + "loss": 0.4917, + "step": 1134 + }, + { + "epoch": 0.20795163063393185, + "grad_norm": 0.4268516645401171, + "learning_rate": 9.650575649180944e-06, + "loss": 0.4849, + "step": 1135 + }, + { + "epoch": 0.20813484792964457, + "grad_norm": 0.4262534436550349, + "learning_rate": 9.649400220569812e-06, + "loss": 0.5128, + "step": 1136 + }, + { + "epoch": 0.20831806522535729, + "grad_norm": 0.3944076446197719, + "learning_rate": 9.648222890092538e-06, + "loss": 0.5014, + "step": 1137 + }, + { + "epoch": 0.20850128252107, + "grad_norm": 0.40237247723443686, + "learning_rate": 9.647043658230723e-06, + "loss": 0.5162, + "step": 1138 + }, + { + "epoch": 0.2086844998167827, + "grad_norm": 0.40871120340037176, + "learning_rate": 9.645862525466734e-06, + "loss": 0.5021, + "step": 1139 + }, + { + "epoch": 0.2088677171124954, + "grad_norm": 0.40979376980666626, + "learning_rate": 9.644679492283723e-06, + "loss": 0.5104, + "step": 1140 + }, + { + "epoch": 0.20905093440820813, + "grad_norm": 0.41263650327798096, + "learning_rate": 9.643494559165616e-06, + "loss": 0.4764, + "step": 1141 + }, + { + "epoch": 0.20923415170392085, + "grad_norm": 0.37127003400941044, + "learning_rate": 9.64230772659712e-06, + "loss": 0.4858, + "step": 1142 + }, + { + "epoch": 0.20941736899963356, + "grad_norm": 0.4556209336445142, + "learning_rate": 9.641118995063712e-06, + "loss": 0.4668, + "step": 1143 + }, + { + "epoch": 0.20960058629534628, + "grad_norm": 0.37966915346806135, + "learning_rate": 9.639928365051655e-06, + "loss": 0.4733, + "step": 1144 + }, + { + "epoch": 0.209783803591059, + "grad_norm": 0.5527182868106905, + "learning_rate": 9.638735837047976e-06, + "loss": 0.5182, + "step": 1145 + }, + { + "epoch": 0.20996702088677172, + "grad_norm": 0.37939914018444165, + "learning_rate": 9.637541411540496e-06, + "loss": 0.4866, + "step": 1146 + }, + { + "epoch": 0.21015023818248443, + "grad_norm": 0.44598863023348195, + "learning_rate": 9.636345089017795e-06, + "loss": 0.5115, + "step": 1147 + }, + { + "epoch": 0.21033345547819715, + "grad_norm": 0.4509291233105335, + "learning_rate": 9.635146869969239e-06, + "loss": 0.5099, + "step": 1148 + }, + { + "epoch": 0.21051667277390987, + "grad_norm": 0.43540452743790453, + "learning_rate": 9.633946754884963e-06, + "loss": 0.4971, + "step": 1149 + }, + { + "epoch": 0.21069989006962256, + "grad_norm": 0.43503175511293474, + "learning_rate": 9.63274474425589e-06, + "loss": 0.535, + "step": 1150 + }, + { + "epoch": 0.21088310736533528, + "grad_norm": 0.4284909564211116, + "learning_rate": 9.6315408385737e-06, + "loss": 0.5195, + "step": 1151 + }, + { + "epoch": 0.211066324661048, + "grad_norm": 0.4144953761451503, + "learning_rate": 9.630335038330867e-06, + "loss": 0.4917, + "step": 1152 + }, + { + "epoch": 0.2112495419567607, + "grad_norm": 0.4897343272100308, + "learning_rate": 9.629127344020625e-06, + "loss": 0.5315, + "step": 1153 + }, + { + "epoch": 0.21143275925247343, + "grad_norm": 0.422768220232635, + "learning_rate": 9.627917756136991e-06, + "loss": 0.4751, + "step": 1154 + }, + { + "epoch": 0.21161597654818615, + "grad_norm": 0.4834794267838247, + "learning_rate": 9.626706275174754e-06, + "loss": 0.49, + "step": 1155 + }, + { + "epoch": 0.21179919384389886, + "grad_norm": 0.43277895942455413, + "learning_rate": 9.625492901629478e-06, + "loss": 0.4942, + "step": 1156 + }, + { + "epoch": 0.21198241113961158, + "grad_norm": 0.4500403514457418, + "learning_rate": 9.624277635997503e-06, + "loss": 0.4794, + "step": 1157 + }, + { + "epoch": 0.2121656284353243, + "grad_norm": 0.4099295617842837, + "learning_rate": 9.62306047877594e-06, + "loss": 0.5079, + "step": 1158 + }, + { + "epoch": 0.21234884573103702, + "grad_norm": 0.39697880470360636, + "learning_rate": 9.62184143046267e-06, + "loss": 0.4718, + "step": 1159 + }, + { + "epoch": 0.21253206302674973, + "grad_norm": 0.45285900320929984, + "learning_rate": 9.620620491556359e-06, + "loss": 0.4815, + "step": 1160 + }, + { + "epoch": 0.21271528032246245, + "grad_norm": 0.4616778523748189, + "learning_rate": 9.619397662556434e-06, + "loss": 0.5072, + "step": 1161 + }, + { + "epoch": 0.21289849761817514, + "grad_norm": 0.42472446401669417, + "learning_rate": 9.618172943963107e-06, + "loss": 0.5205, + "step": 1162 + }, + { + "epoch": 0.21308171491388786, + "grad_norm": 1.1202188747857207, + "learning_rate": 9.61694633627735e-06, + "loss": 0.4941, + "step": 1163 + }, + { + "epoch": 0.21326493220960058, + "grad_norm": 0.4238555504122449, + "learning_rate": 9.615717840000922e-06, + "loss": 0.5259, + "step": 1164 + }, + { + "epoch": 0.2134481495053133, + "grad_norm": 0.4331940724685078, + "learning_rate": 9.614487455636341e-06, + "loss": 0.4792, + "step": 1165 + }, + { + "epoch": 0.213631366801026, + "grad_norm": 0.42864744785979175, + "learning_rate": 9.613255183686906e-06, + "loss": 0.4737, + "step": 1166 + }, + { + "epoch": 0.21381458409673873, + "grad_norm": 0.43365153166481973, + "learning_rate": 9.612021024656685e-06, + "loss": 0.4709, + "step": 1167 + }, + { + "epoch": 0.21399780139245145, + "grad_norm": 0.43747080354215073, + "learning_rate": 9.610784979050519e-06, + "loss": 0.5012, + "step": 1168 + }, + { + "epoch": 0.21418101868816417, + "grad_norm": 0.4204470389686437, + "learning_rate": 9.609547047374018e-06, + "loss": 0.4905, + "step": 1169 + }, + { + "epoch": 0.21436423598387688, + "grad_norm": 0.42512506549883267, + "learning_rate": 9.608307230133566e-06, + "loss": 0.4672, + "step": 1170 + }, + { + "epoch": 0.2145474532795896, + "grad_norm": 0.4326161922702061, + "learning_rate": 9.607065527836324e-06, + "loss": 0.5035, + "step": 1171 + }, + { + "epoch": 0.21473067057530232, + "grad_norm": 0.4012535242609492, + "learning_rate": 9.60582194099021e-06, + "loss": 0.5042, + "step": 1172 + }, + { + "epoch": 0.21491388787101504, + "grad_norm": 0.407044902400079, + "learning_rate": 9.604576470103923e-06, + "loss": 0.4751, + "step": 1173 + }, + { + "epoch": 0.21509710516672773, + "grad_norm": 0.420735767335274, + "learning_rate": 9.603329115686934e-06, + "loss": 0.4872, + "step": 1174 + }, + { + "epoch": 0.21528032246244044, + "grad_norm": 0.41386495522739447, + "learning_rate": 9.60207987824948e-06, + "loss": 0.4554, + "step": 1175 + }, + { + "epoch": 0.21546353975815316, + "grad_norm": 0.3993847396510676, + "learning_rate": 9.600828758302568e-06, + "loss": 0.4769, + "step": 1176 + }, + { + "epoch": 0.21564675705386588, + "grad_norm": 0.43412675596114314, + "learning_rate": 9.599575756357974e-06, + "loss": 0.4604, + "step": 1177 + }, + { + "epoch": 0.2158299743495786, + "grad_norm": 0.42400411471398847, + "learning_rate": 9.598320872928251e-06, + "loss": 0.4644, + "step": 1178 + }, + { + "epoch": 0.2160131916452913, + "grad_norm": 0.43220524123609294, + "learning_rate": 9.597064108526715e-06, + "loss": 0.4931, + "step": 1179 + }, + { + "epoch": 0.21619640894100403, + "grad_norm": 0.4139513466321143, + "learning_rate": 9.595805463667452e-06, + "loss": 0.502, + "step": 1180 + }, + { + "epoch": 0.21637962623671675, + "grad_norm": 0.5080108758300018, + "learning_rate": 9.59454493886532e-06, + "loss": 0.4825, + "step": 1181 + }, + { + "epoch": 0.21656284353242947, + "grad_norm": 0.47000114005402754, + "learning_rate": 9.593282534635945e-06, + "loss": 0.4678, + "step": 1182 + }, + { + "epoch": 0.21674606082814218, + "grad_norm": 0.42032814714737154, + "learning_rate": 9.59201825149572e-06, + "loss": 0.5086, + "step": 1183 + }, + { + "epoch": 0.2169292781238549, + "grad_norm": 0.4011425053622946, + "learning_rate": 9.590752089961811e-06, + "loss": 0.5067, + "step": 1184 + }, + { + "epoch": 0.21711249541956762, + "grad_norm": 0.38415361503458767, + "learning_rate": 9.589484050552142e-06, + "loss": 0.5192, + "step": 1185 + }, + { + "epoch": 0.2172957127152803, + "grad_norm": 0.4099189657711175, + "learning_rate": 9.588214133785421e-06, + "loss": 0.5087, + "step": 1186 + }, + { + "epoch": 0.21747893001099303, + "grad_norm": 0.4259010610337274, + "learning_rate": 9.58694234018111e-06, + "loss": 0.4931, + "step": 1187 + }, + { + "epoch": 0.21766214730670574, + "grad_norm": 0.4709131450449139, + "learning_rate": 9.585668670259446e-06, + "loss": 0.4949, + "step": 1188 + }, + { + "epoch": 0.21784536460241846, + "grad_norm": 0.44547955151410057, + "learning_rate": 9.584393124541431e-06, + "loss": 0.4771, + "step": 1189 + }, + { + "epoch": 0.21802858189813118, + "grad_norm": 0.43579531600042903, + "learning_rate": 9.583115703548835e-06, + "loss": 0.5121, + "step": 1190 + }, + { + "epoch": 0.2182117991938439, + "grad_norm": 0.3959951122458681, + "learning_rate": 9.581836407804196e-06, + "loss": 0.4739, + "step": 1191 + }, + { + "epoch": 0.21839501648955661, + "grad_norm": 0.39913718386402464, + "learning_rate": 9.580555237830817e-06, + "loss": 0.476, + "step": 1192 + }, + { + "epoch": 0.21857823378526933, + "grad_norm": 0.3981984945355583, + "learning_rate": 9.579272194152767e-06, + "loss": 0.5042, + "step": 1193 + }, + { + "epoch": 0.21876145108098205, + "grad_norm": 0.4369820172983135, + "learning_rate": 9.577987277294887e-06, + "loss": 0.5072, + "step": 1194 + }, + { + "epoch": 0.21894466837669477, + "grad_norm": 0.43452616563791835, + "learning_rate": 9.576700487782775e-06, + "loss": 0.5111, + "step": 1195 + }, + { + "epoch": 0.21912788567240749, + "grad_norm": 0.45908350014787985, + "learning_rate": 9.575411826142806e-06, + "loss": 0.5284, + "step": 1196 + }, + { + "epoch": 0.2193111029681202, + "grad_norm": 0.40541576693311915, + "learning_rate": 9.57412129290211e-06, + "loss": 0.465, + "step": 1197 + }, + { + "epoch": 0.2194943202638329, + "grad_norm": 0.3894120792636639, + "learning_rate": 9.572828888588593e-06, + "loss": 0.502, + "step": 1198 + }, + { + "epoch": 0.2196775375595456, + "grad_norm": 0.41214065106611036, + "learning_rate": 9.571534613730915e-06, + "loss": 0.4999, + "step": 1199 + }, + { + "epoch": 0.21986075485525833, + "grad_norm": 0.38064251771626856, + "learning_rate": 9.57023846885851e-06, + "loss": 0.485, + "step": 1200 + }, + { + "epoch": 0.22004397215097105, + "grad_norm": 0.3546653255179806, + "learning_rate": 9.568940454501578e-06, + "loss": 0.4575, + "step": 1201 + }, + { + "epoch": 0.22022718944668376, + "grad_norm": 0.40806335023375157, + "learning_rate": 9.567640571191073e-06, + "loss": 0.4516, + "step": 1202 + }, + { + "epoch": 0.22041040674239648, + "grad_norm": 0.4688612954629896, + "learning_rate": 9.566338819458726e-06, + "loss": 0.5134, + "step": 1203 + }, + { + "epoch": 0.2205936240381092, + "grad_norm": 0.39969548266649896, + "learning_rate": 9.56503519983702e-06, + "loss": 0.4847, + "step": 1204 + }, + { + "epoch": 0.22077684133382192, + "grad_norm": 0.4383715060312841, + "learning_rate": 9.563729712859216e-06, + "loss": 0.5254, + "step": 1205 + }, + { + "epoch": 0.22096005862953463, + "grad_norm": 0.4352285842560572, + "learning_rate": 9.562422359059328e-06, + "loss": 0.5016, + "step": 1206 + }, + { + "epoch": 0.22114327592524735, + "grad_norm": 0.39691391609356236, + "learning_rate": 9.561113138972138e-06, + "loss": 0.5089, + "step": 1207 + }, + { + "epoch": 0.22132649322096007, + "grad_norm": 0.3960087971203996, + "learning_rate": 9.55980205313319e-06, + "loss": 0.4977, + "step": 1208 + }, + { + "epoch": 0.2215097105166728, + "grad_norm": 0.4287362367803256, + "learning_rate": 9.558489102078792e-06, + "loss": 0.4641, + "step": 1209 + }, + { + "epoch": 0.22169292781238548, + "grad_norm": 0.3811089768117096, + "learning_rate": 9.557174286346014e-06, + "loss": 0.5213, + "step": 1210 + }, + { + "epoch": 0.2218761451080982, + "grad_norm": 0.4593900206484617, + "learning_rate": 9.555857606472692e-06, + "loss": 0.514, + "step": 1211 + }, + { + "epoch": 0.2220593624038109, + "grad_norm": 0.3711599555884036, + "learning_rate": 9.554539062997421e-06, + "loss": 0.5003, + "step": 1212 + }, + { + "epoch": 0.22224257969952363, + "grad_norm": 0.42622144015650515, + "learning_rate": 9.553218656459558e-06, + "loss": 0.5486, + "step": 1213 + }, + { + "epoch": 0.22242579699523635, + "grad_norm": 0.4167867779019379, + "learning_rate": 9.551896387399226e-06, + "loss": 0.5184, + "step": 1214 + }, + { + "epoch": 0.22260901429094906, + "grad_norm": 0.3809013711065984, + "learning_rate": 9.550572256357305e-06, + "loss": 0.5022, + "step": 1215 + }, + { + "epoch": 0.22279223158666178, + "grad_norm": 0.44157858433876945, + "learning_rate": 9.54924626387544e-06, + "loss": 0.4605, + "step": 1216 + }, + { + "epoch": 0.2229754488823745, + "grad_norm": 0.38145210565349935, + "learning_rate": 9.547918410496037e-06, + "loss": 0.497, + "step": 1217 + }, + { + "epoch": 0.22315866617808722, + "grad_norm": 0.4245361578656831, + "learning_rate": 9.546588696762262e-06, + "loss": 0.5249, + "step": 1218 + }, + { + "epoch": 0.22334188347379993, + "grad_norm": 0.3806441240590457, + "learning_rate": 9.545257123218043e-06, + "loss": 0.489, + "step": 1219 + }, + { + "epoch": 0.22352510076951265, + "grad_norm": 0.3771756284454525, + "learning_rate": 9.54392369040807e-06, + "loss": 0.4835, + "step": 1220 + }, + { + "epoch": 0.22370831806522537, + "grad_norm": 0.42014861145100263, + "learning_rate": 9.542588398877787e-06, + "loss": 0.4908, + "step": 1221 + }, + { + "epoch": 0.22389153536093806, + "grad_norm": 0.40603250481690667, + "learning_rate": 9.54125124917341e-06, + "loss": 0.4936, + "step": 1222 + }, + { + "epoch": 0.22407475265665078, + "grad_norm": 0.4061177784088164, + "learning_rate": 9.539912241841904e-06, + "loss": 0.4827, + "step": 1223 + }, + { + "epoch": 0.2242579699523635, + "grad_norm": 0.3861564878611609, + "learning_rate": 9.538571377431e-06, + "loss": 0.4676, + "step": 1224 + }, + { + "epoch": 0.2244411872480762, + "grad_norm": 0.42382201486067383, + "learning_rate": 9.537228656489187e-06, + "loss": 0.4602, + "step": 1225 + }, + { + "epoch": 0.22462440454378893, + "grad_norm": 0.3822471479641513, + "learning_rate": 9.53588407956571e-06, + "loss": 0.4508, + "step": 1226 + }, + { + "epoch": 0.22480762183950165, + "grad_norm": 0.3882035970294104, + "learning_rate": 9.534537647210582e-06, + "loss": 0.4723, + "step": 1227 + }, + { + "epoch": 0.22499083913521437, + "grad_norm": 0.46953161111998076, + "learning_rate": 9.533189359974564e-06, + "loss": 0.5037, + "step": 1228 + }, + { + "epoch": 0.22517405643092708, + "grad_norm": 0.3742186217573921, + "learning_rate": 9.531839218409186e-06, + "loss": 0.4821, + "step": 1229 + }, + { + "epoch": 0.2253572737266398, + "grad_norm": 0.42732489656890116, + "learning_rate": 9.53048722306673e-06, + "loss": 0.4959, + "step": 1230 + }, + { + "epoch": 0.22554049102235252, + "grad_norm": 0.36370123103326807, + "learning_rate": 9.52913337450024e-06, + "loss": 0.509, + "step": 1231 + }, + { + "epoch": 0.22572370831806524, + "grad_norm": 0.3842157959300981, + "learning_rate": 9.527777673263512e-06, + "loss": 0.4776, + "step": 1232 + }, + { + "epoch": 0.22590692561377795, + "grad_norm": 0.39958584358419963, + "learning_rate": 9.526420119911109e-06, + "loss": 0.4937, + "step": 1233 + }, + { + "epoch": 0.22609014290949064, + "grad_norm": 0.3795874511893427, + "learning_rate": 9.525060714998341e-06, + "loss": 0.4673, + "step": 1234 + }, + { + "epoch": 0.22627336020520336, + "grad_norm": 0.443615180266169, + "learning_rate": 9.523699459081285e-06, + "loss": 0.497, + "step": 1235 + }, + { + "epoch": 0.22645657750091608, + "grad_norm": 0.4126187255162815, + "learning_rate": 9.52233635271677e-06, + "loss": 0.5026, + "step": 1236 + }, + { + "epoch": 0.2266397947966288, + "grad_norm": 0.3898731986202734, + "learning_rate": 9.520971396462383e-06, + "loss": 0.5142, + "step": 1237 + }, + { + "epoch": 0.2268230120923415, + "grad_norm": 0.4303787774334329, + "learning_rate": 9.519604590876471e-06, + "loss": 0.5027, + "step": 1238 + }, + { + "epoch": 0.22700622938805423, + "grad_norm": 0.4881712350764513, + "learning_rate": 9.51823593651813e-06, + "loss": 0.5194, + "step": 1239 + }, + { + "epoch": 0.22718944668376695, + "grad_norm": 0.3958516733915845, + "learning_rate": 9.516865433947218e-06, + "loss": 0.4873, + "step": 1240 + }, + { + "epoch": 0.22737266397947967, + "grad_norm": 0.37588183496475464, + "learning_rate": 9.515493083724348e-06, + "loss": 0.483, + "step": 1241 + }, + { + "epoch": 0.22755588127519238, + "grad_norm": 0.42594083794952703, + "learning_rate": 9.514118886410889e-06, + "loss": 0.515, + "step": 1242 + }, + { + "epoch": 0.2277390985709051, + "grad_norm": 0.42418243334298944, + "learning_rate": 9.512742842568964e-06, + "loss": 0.5057, + "step": 1243 + }, + { + "epoch": 0.22792231586661782, + "grad_norm": 0.4402484021876601, + "learning_rate": 9.511364952761453e-06, + "loss": 0.4961, + "step": 1244 + }, + { + "epoch": 0.22810553316233054, + "grad_norm": 0.39304351029537393, + "learning_rate": 9.509985217551989e-06, + "loss": 0.4803, + "step": 1245 + }, + { + "epoch": 0.22828875045804323, + "grad_norm": 0.40248532246553786, + "learning_rate": 9.508603637504962e-06, + "loss": 0.4801, + "step": 1246 + }, + { + "epoch": 0.22847196775375594, + "grad_norm": 0.4302259083201261, + "learning_rate": 9.507220213185517e-06, + "loss": 0.5027, + "step": 1247 + }, + { + "epoch": 0.22865518504946866, + "grad_norm": 0.40571061563755584, + "learning_rate": 9.505834945159552e-06, + "loss": 0.5162, + "step": 1248 + }, + { + "epoch": 0.22883840234518138, + "grad_norm": 0.4032017209651049, + "learning_rate": 9.504447833993717e-06, + "loss": 0.4948, + "step": 1249 + }, + { + "epoch": 0.2290216196408941, + "grad_norm": 0.4195307039259119, + "learning_rate": 9.503058880255423e-06, + "loss": 0.5007, + "step": 1250 + }, + { + "epoch": 0.22920483693660682, + "grad_norm": 0.4677433036389937, + "learning_rate": 9.501668084512827e-06, + "loss": 0.4676, + "step": 1251 + }, + { + "epoch": 0.22938805423231953, + "grad_norm": 0.44203412318706997, + "learning_rate": 9.500275447334843e-06, + "loss": 0.5178, + "step": 1252 + }, + { + "epoch": 0.22957127152803225, + "grad_norm": 0.3745369415002761, + "learning_rate": 9.498880969291138e-06, + "loss": 0.4705, + "step": 1253 + }, + { + "epoch": 0.22975448882374497, + "grad_norm": 0.3760740011440762, + "learning_rate": 9.497484650952133e-06, + "loss": 0.4982, + "step": 1254 + }, + { + "epoch": 0.22993770611945769, + "grad_norm": 0.38594463821493263, + "learning_rate": 9.496086492889001e-06, + "loss": 0.4887, + "step": 1255 + }, + { + "epoch": 0.2301209234151704, + "grad_norm": 0.4212966089256434, + "learning_rate": 9.494686495673665e-06, + "loss": 0.4837, + "step": 1256 + }, + { + "epoch": 0.23030414071088312, + "grad_norm": 0.4210112942058116, + "learning_rate": 9.493284659878802e-06, + "loss": 0.517, + "step": 1257 + }, + { + "epoch": 0.2304873580065958, + "grad_norm": 0.41531125869001356, + "learning_rate": 9.491880986077846e-06, + "loss": 0.5263, + "step": 1258 + }, + { + "epoch": 0.23067057530230853, + "grad_norm": 0.46209059357171517, + "learning_rate": 9.490475474844976e-06, + "loss": 0.5232, + "step": 1259 + }, + { + "epoch": 0.23085379259802125, + "grad_norm": 0.4297726055183601, + "learning_rate": 9.489068126755124e-06, + "loss": 0.4613, + "step": 1260 + }, + { + "epoch": 0.23103700989373396, + "grad_norm": 0.46903664754890034, + "learning_rate": 9.487658942383975e-06, + "loss": 0.5076, + "step": 1261 + }, + { + "epoch": 0.23122022718944668, + "grad_norm": 0.48896965014560756, + "learning_rate": 9.486247922307967e-06, + "loss": 0.487, + "step": 1262 + }, + { + "epoch": 0.2314034444851594, + "grad_norm": 0.4218926363173936, + "learning_rate": 9.484835067104285e-06, + "loss": 0.4914, + "step": 1263 + }, + { + "epoch": 0.23158666178087212, + "grad_norm": 0.40940946307427944, + "learning_rate": 9.483420377350865e-06, + "loss": 0.4984, + "step": 1264 + }, + { + "epoch": 0.23176987907658483, + "grad_norm": 0.45293257509435014, + "learning_rate": 9.482003853626396e-06, + "loss": 0.5082, + "step": 1265 + }, + { + "epoch": 0.23195309637229755, + "grad_norm": 0.3935023699694669, + "learning_rate": 9.480585496510315e-06, + "loss": 0.5099, + "step": 1266 + }, + { + "epoch": 0.23213631366801027, + "grad_norm": 0.39198925832901294, + "learning_rate": 9.479165306582811e-06, + "loss": 0.4682, + "step": 1267 + }, + { + "epoch": 0.232319530963723, + "grad_norm": 0.34440630702814135, + "learning_rate": 9.47774328442482e-06, + "loss": 0.4835, + "step": 1268 + }, + { + "epoch": 0.2325027482594357, + "grad_norm": 0.4696525716200109, + "learning_rate": 9.476319430618033e-06, + "loss": 0.5098, + "step": 1269 + }, + { + "epoch": 0.2326859655551484, + "grad_norm": 0.4172646920959579, + "learning_rate": 9.47489374574488e-06, + "loss": 0.5023, + "step": 1270 + }, + { + "epoch": 0.2328691828508611, + "grad_norm": 0.3756156161424675, + "learning_rate": 9.473466230388552e-06, + "loss": 0.4754, + "step": 1271 + }, + { + "epoch": 0.23305240014657383, + "grad_norm": 1.9398173346161394, + "learning_rate": 9.472036885132979e-06, + "loss": 0.5082, + "step": 1272 + }, + { + "epoch": 0.23323561744228655, + "grad_norm": 0.47073412743230747, + "learning_rate": 9.470605710562845e-06, + "loss": 0.5285, + "step": 1273 + }, + { + "epoch": 0.23341883473799926, + "grad_norm": 0.38789836660988597, + "learning_rate": 9.469172707263582e-06, + "loss": 0.4925, + "step": 1274 + }, + { + "epoch": 0.23360205203371198, + "grad_norm": 0.3825601936653152, + "learning_rate": 9.467737875821368e-06, + "loss": 0.4659, + "step": 1275 + }, + { + "epoch": 0.2337852693294247, + "grad_norm": 0.3973414450558941, + "learning_rate": 9.46630121682313e-06, + "loss": 0.4772, + "step": 1276 + }, + { + "epoch": 0.23396848662513742, + "grad_norm": 0.40322202930522744, + "learning_rate": 9.464862730856542e-06, + "loss": 0.4832, + "step": 1277 + }, + { + "epoch": 0.23415170392085013, + "grad_norm": 0.42699100304138604, + "learning_rate": 9.463422418510024e-06, + "loss": 0.5212, + "step": 1278 + }, + { + "epoch": 0.23433492121656285, + "grad_norm": 0.42176298384733996, + "learning_rate": 9.461980280372748e-06, + "loss": 0.4669, + "step": 1279 + }, + { + "epoch": 0.23451813851227557, + "grad_norm": 0.4081443681151287, + "learning_rate": 9.460536317034627e-06, + "loss": 0.5113, + "step": 1280 + }, + { + "epoch": 0.2347013558079883, + "grad_norm": 0.38795147083032466, + "learning_rate": 9.459090529086325e-06, + "loss": 0.4392, + "step": 1281 + }, + { + "epoch": 0.23488457310370098, + "grad_norm": 0.5217298258213892, + "learning_rate": 9.457642917119249e-06, + "loss": 0.5181, + "step": 1282 + }, + { + "epoch": 0.2350677903994137, + "grad_norm": 0.42457976380897766, + "learning_rate": 9.456193481725555e-06, + "loss": 0.5034, + "step": 1283 + }, + { + "epoch": 0.2352510076951264, + "grad_norm": 0.37057304524699214, + "learning_rate": 9.454742223498145e-06, + "loss": 0.4502, + "step": 1284 + }, + { + "epoch": 0.23543422499083913, + "grad_norm": 0.3946439030149264, + "learning_rate": 9.453289143030662e-06, + "loss": 0.4777, + "step": 1285 + }, + { + "epoch": 0.23561744228655185, + "grad_norm": 0.44885227772255887, + "learning_rate": 9.451834240917498e-06, + "loss": 0.4847, + "step": 1286 + }, + { + "epoch": 0.23580065958226457, + "grad_norm": 0.43519145234243767, + "learning_rate": 9.45037751775379e-06, + "loss": 0.5088, + "step": 1287 + }, + { + "epoch": 0.23598387687797728, + "grad_norm": 0.4309093065470007, + "learning_rate": 9.448918974135424e-06, + "loss": 0.4677, + "step": 1288 + }, + { + "epoch": 0.23616709417369, + "grad_norm": 0.47978496271668625, + "learning_rate": 9.447458610659019e-06, + "loss": 0.5116, + "step": 1289 + }, + { + "epoch": 0.23635031146940272, + "grad_norm": 0.41021471445709895, + "learning_rate": 9.445996427921951e-06, + "loss": 0.523, + "step": 1290 + }, + { + "epoch": 0.23653352876511544, + "grad_norm": 0.4296171869446803, + "learning_rate": 9.444532426522334e-06, + "loss": 0.4691, + "step": 1291 + }, + { + "epoch": 0.23671674606082815, + "grad_norm": 0.4077954317933853, + "learning_rate": 9.443066607059026e-06, + "loss": 0.4772, + "step": 1292 + }, + { + "epoch": 0.23689996335654084, + "grad_norm": 0.35793769775976464, + "learning_rate": 9.44159897013163e-06, + "loss": 0.4924, + "step": 1293 + }, + { + "epoch": 0.23708318065225356, + "grad_norm": 0.39909841974854016, + "learning_rate": 9.440129516340492e-06, + "loss": 0.4823, + "step": 1294 + }, + { + "epoch": 0.23726639794796628, + "grad_norm": 0.3675254686648848, + "learning_rate": 9.4386582462867e-06, + "loss": 0.506, + "step": 1295 + }, + { + "epoch": 0.237449615243679, + "grad_norm": 0.37951723029275103, + "learning_rate": 9.43718516057209e-06, + "loss": 0.4996, + "step": 1296 + }, + { + "epoch": 0.23763283253939171, + "grad_norm": 0.4395625646258237, + "learning_rate": 9.435710259799234e-06, + "loss": 0.4841, + "step": 1297 + }, + { + "epoch": 0.23781604983510443, + "grad_norm": 0.4409131115143356, + "learning_rate": 9.434233544571446e-06, + "loss": 0.524, + "step": 1298 + }, + { + "epoch": 0.23799926713081715, + "grad_norm": 0.38035953245650445, + "learning_rate": 9.432755015492794e-06, + "loss": 0.4833, + "step": 1299 + }, + { + "epoch": 0.23818248442652987, + "grad_norm": 0.37603352556682307, + "learning_rate": 9.431274673168072e-06, + "loss": 0.4771, + "step": 1300 + }, + { + "epoch": 0.23836570172224258, + "grad_norm": 0.3973590263816894, + "learning_rate": 9.429792518202826e-06, + "loss": 0.4519, + "step": 1301 + }, + { + "epoch": 0.2385489190179553, + "grad_norm": 0.46279982818002663, + "learning_rate": 9.428308551203342e-06, + "loss": 0.5067, + "step": 1302 + }, + { + "epoch": 0.23873213631366802, + "grad_norm": 0.44423083147602355, + "learning_rate": 9.426822772776645e-06, + "loss": 0.4955, + "step": 1303 + }, + { + "epoch": 0.23891535360938074, + "grad_norm": 0.3765473227250332, + "learning_rate": 9.425335183530501e-06, + "loss": 0.4688, + "step": 1304 + }, + { + "epoch": 0.23909857090509343, + "grad_norm": 0.38352159262743835, + "learning_rate": 9.42384578407342e-06, + "loss": 0.5013, + "step": 1305 + }, + { + "epoch": 0.23928178820080614, + "grad_norm": 0.38474924269669325, + "learning_rate": 9.422354575014644e-06, + "loss": 0.5259, + "step": 1306 + }, + { + "epoch": 0.23946500549651886, + "grad_norm": 0.3764815533565891, + "learning_rate": 9.42086155696417e-06, + "loss": 0.4907, + "step": 1307 + }, + { + "epoch": 0.23964822279223158, + "grad_norm": 0.40120663442717996, + "learning_rate": 9.41936673053272e-06, + "loss": 0.5082, + "step": 1308 + }, + { + "epoch": 0.2398314400879443, + "grad_norm": 0.38410734753609116, + "learning_rate": 9.417870096331764e-06, + "loss": 0.5235, + "step": 1309 + }, + { + "epoch": 0.24001465738365702, + "grad_norm": 0.4824543008653062, + "learning_rate": 9.416371654973513e-06, + "loss": 0.5306, + "step": 1310 + }, + { + "epoch": 0.24019787467936973, + "grad_norm": 0.4448880086308854, + "learning_rate": 9.414871407070906e-06, + "loss": 0.5045, + "step": 1311 + }, + { + "epoch": 0.24038109197508245, + "grad_norm": 0.3818507137941972, + "learning_rate": 9.413369353237637e-06, + "loss": 0.4903, + "step": 1312 + }, + { + "epoch": 0.24056430927079517, + "grad_norm": 0.42284475981294684, + "learning_rate": 9.411865494088124e-06, + "loss": 0.4964, + "step": 1313 + }, + { + "epoch": 0.24074752656650789, + "grad_norm": 0.39875491330187324, + "learning_rate": 9.410359830237534e-06, + "loss": 0.5081, + "step": 1314 + }, + { + "epoch": 0.2409307438622206, + "grad_norm": 0.3905340354403746, + "learning_rate": 9.408852362301768e-06, + "loss": 0.5006, + "step": 1315 + }, + { + "epoch": 0.24111396115793332, + "grad_norm": 0.5340945724213095, + "learning_rate": 9.407343090897464e-06, + "loss": 0.5028, + "step": 1316 + }, + { + "epoch": 0.241297178453646, + "grad_norm": 0.43175601981581013, + "learning_rate": 9.405832016641997e-06, + "loss": 0.5053, + "step": 1317 + }, + { + "epoch": 0.24148039574935873, + "grad_norm": 0.4128194078123402, + "learning_rate": 9.404319140153484e-06, + "loss": 0.446, + "step": 1318 + }, + { + "epoch": 0.24166361304507145, + "grad_norm": 0.3673682428697322, + "learning_rate": 9.402804462050776e-06, + "loss": 0.4664, + "step": 1319 + }, + { + "epoch": 0.24184683034078416, + "grad_norm": 0.35927179937888154, + "learning_rate": 9.401287982953462e-06, + "loss": 0.4772, + "step": 1320 + }, + { + "epoch": 0.24203004763649688, + "grad_norm": 0.436824106029447, + "learning_rate": 9.399769703481865e-06, + "loss": 0.4973, + "step": 1321 + }, + { + "epoch": 0.2422132649322096, + "grad_norm": 0.44788915497504966, + "learning_rate": 9.39824962425705e-06, + "loss": 0.5297, + "step": 1322 + }, + { + "epoch": 0.24239648222792232, + "grad_norm": 0.43045640486065784, + "learning_rate": 9.396727745900811e-06, + "loss": 0.4906, + "step": 1323 + }, + { + "epoch": 0.24257969952363503, + "grad_norm": 0.5421905192174398, + "learning_rate": 9.395204069035686e-06, + "loss": 0.4659, + "step": 1324 + }, + { + "epoch": 0.24276291681934775, + "grad_norm": 0.4219323385690796, + "learning_rate": 9.39367859428494e-06, + "loss": 0.5003, + "step": 1325 + }, + { + "epoch": 0.24294613411506047, + "grad_norm": 0.4119758712615104, + "learning_rate": 9.392151322272583e-06, + "loss": 0.499, + "step": 1326 + }, + { + "epoch": 0.2431293514107732, + "grad_norm": 0.4287819235082752, + "learning_rate": 9.390622253623353e-06, + "loss": 0.4939, + "step": 1327 + }, + { + "epoch": 0.2433125687064859, + "grad_norm": 0.38055263240566445, + "learning_rate": 9.389091388962723e-06, + "loss": 0.4712, + "step": 1328 + }, + { + "epoch": 0.2434957860021986, + "grad_norm": 0.36009711673291733, + "learning_rate": 9.387558728916905e-06, + "loss": 0.4728, + "step": 1329 + }, + { + "epoch": 0.2436790032979113, + "grad_norm": 0.43571235014716525, + "learning_rate": 9.386024274112842e-06, + "loss": 0.4821, + "step": 1330 + }, + { + "epoch": 0.24386222059362403, + "grad_norm": 0.3659773293799275, + "learning_rate": 9.384488025178214e-06, + "loss": 0.4572, + "step": 1331 + }, + { + "epoch": 0.24404543788933675, + "grad_norm": 0.37704886297840073, + "learning_rate": 9.382949982741429e-06, + "loss": 0.4336, + "step": 1332 + }, + { + "epoch": 0.24422865518504946, + "grad_norm": 0.4586883760808847, + "learning_rate": 9.38141014743164e-06, + "loss": 0.4783, + "step": 1333 + }, + { + "epoch": 0.24441187248076218, + "grad_norm": 0.5054332600007021, + "learning_rate": 9.379868519878718e-06, + "loss": 0.4883, + "step": 1334 + }, + { + "epoch": 0.2445950897764749, + "grad_norm": 0.3980887593736299, + "learning_rate": 9.378325100713283e-06, + "loss": 0.4865, + "step": 1335 + }, + { + "epoch": 0.24477830707218762, + "grad_norm": 0.4215392454714871, + "learning_rate": 9.376779890566675e-06, + "loss": 0.5154, + "step": 1336 + }, + { + "epoch": 0.24496152436790034, + "grad_norm": 0.42292016972151214, + "learning_rate": 9.375232890070973e-06, + "loss": 0.5184, + "step": 1337 + }, + { + "epoch": 0.24514474166361305, + "grad_norm": 0.38508367690050965, + "learning_rate": 9.373684099858989e-06, + "loss": 0.5014, + "step": 1338 + }, + { + "epoch": 0.24532795895932577, + "grad_norm": 0.4481685963998327, + "learning_rate": 9.372133520564264e-06, + "loss": 0.4831, + "step": 1339 + }, + { + "epoch": 0.2455111762550385, + "grad_norm": 0.5247732605935221, + "learning_rate": 9.37058115282107e-06, + "loss": 0.5238, + "step": 1340 + }, + { + "epoch": 0.24569439355075118, + "grad_norm": 0.3817180851569393, + "learning_rate": 9.369026997264417e-06, + "loss": 0.4513, + "step": 1341 + }, + { + "epoch": 0.2458776108464639, + "grad_norm": 0.41437756480910404, + "learning_rate": 9.36747105453004e-06, + "loss": 0.4692, + "step": 1342 + }, + { + "epoch": 0.2460608281421766, + "grad_norm": 0.45961904721533503, + "learning_rate": 9.365913325254406e-06, + "loss": 0.4824, + "step": 1343 + }, + { + "epoch": 0.24624404543788933, + "grad_norm": 0.411323007740071, + "learning_rate": 9.364353810074716e-06, + "loss": 0.4767, + "step": 1344 + }, + { + "epoch": 0.24642726273360205, + "grad_norm": 0.40884023238234113, + "learning_rate": 9.362792509628897e-06, + "loss": 0.5129, + "step": 1345 + }, + { + "epoch": 0.24661048002931477, + "grad_norm": 0.4143220748874932, + "learning_rate": 9.361229424555609e-06, + "loss": 0.4969, + "step": 1346 + }, + { + "epoch": 0.24679369732502748, + "grad_norm": 0.42312617400069397, + "learning_rate": 9.359664555494244e-06, + "loss": 0.4586, + "step": 1347 + }, + { + "epoch": 0.2469769146207402, + "grad_norm": 0.476793297997145, + "learning_rate": 9.35809790308492e-06, + "loss": 0.4933, + "step": 1348 + }, + { + "epoch": 0.24716013191645292, + "grad_norm": 0.42441332538018967, + "learning_rate": 9.356529467968485e-06, + "loss": 0.4588, + "step": 1349 + }, + { + "epoch": 0.24734334921216564, + "grad_norm": 0.39138693081487125, + "learning_rate": 9.354959250786519e-06, + "loss": 0.4602, + "step": 1350 + }, + { + "epoch": 0.24752656650787835, + "grad_norm": 0.44700626066197013, + "learning_rate": 9.353387252181328e-06, + "loss": 0.4954, + "step": 1351 + }, + { + "epoch": 0.24770978380359107, + "grad_norm": 0.5055157069210112, + "learning_rate": 9.351813472795947e-06, + "loss": 0.4796, + "step": 1352 + }, + { + "epoch": 0.24789300109930376, + "grad_norm": 0.45338574174735713, + "learning_rate": 9.350237913274143e-06, + "loss": 0.5119, + "step": 1353 + }, + { + "epoch": 0.24807621839501648, + "grad_norm": 0.392746885183511, + "learning_rate": 9.348660574260406e-06, + "loss": 0.514, + "step": 1354 + }, + { + "epoch": 0.2482594356907292, + "grad_norm": 0.4164788422614105, + "learning_rate": 9.347081456399958e-06, + "loss": 0.5089, + "step": 1355 + }, + { + "epoch": 0.24844265298644191, + "grad_norm": 0.7070530239591771, + "learning_rate": 9.345500560338745e-06, + "loss": 0.4702, + "step": 1356 + }, + { + "epoch": 0.24862587028215463, + "grad_norm": 0.4394194345754478, + "learning_rate": 9.343917886723444e-06, + "loss": 0.477, + "step": 1357 + }, + { + "epoch": 0.24880908757786735, + "grad_norm": 0.42911702918417705, + "learning_rate": 9.342333436201457e-06, + "loss": 0.4923, + "step": 1358 + }, + { + "epoch": 0.24899230487358007, + "grad_norm": 0.4470370063218885, + "learning_rate": 9.340747209420913e-06, + "loss": 0.5228, + "step": 1359 + }, + { + "epoch": 0.24917552216929278, + "grad_norm": 0.4599577217037793, + "learning_rate": 9.339159207030668e-06, + "loss": 0.5182, + "step": 1360 + }, + { + "epoch": 0.2493587394650055, + "grad_norm": 0.4405760361683581, + "learning_rate": 9.337569429680306e-06, + "loss": 0.5319, + "step": 1361 + }, + { + "epoch": 0.24954195676071822, + "grad_norm": 0.4164050414490823, + "learning_rate": 9.335977878020132e-06, + "loss": 0.5231, + "step": 1362 + }, + { + "epoch": 0.24972517405643094, + "grad_norm": 0.4968041900926753, + "learning_rate": 9.334384552701183e-06, + "loss": 0.5057, + "step": 1363 + }, + { + "epoch": 0.24990839135214366, + "grad_norm": 0.40916476625199594, + "learning_rate": 9.332789454375219e-06, + "loss": 0.4936, + "step": 1364 + }, + { + "epoch": 0.25009160864785634, + "grad_norm": 0.40707879703628913, + "learning_rate": 9.331192583694722e-06, + "loss": 0.4911, + "step": 1365 + }, + { + "epoch": 0.2502748259435691, + "grad_norm": 0.36742835933618884, + "learning_rate": 9.329593941312904e-06, + "loss": 0.4697, + "step": 1366 + }, + { + "epoch": 0.2504580432392818, + "grad_norm": 0.417024876859677, + "learning_rate": 9.327993527883698e-06, + "loss": 0.4726, + "step": 1367 + }, + { + "epoch": 0.2506412605349945, + "grad_norm": 0.4039071282595343, + "learning_rate": 9.326391344061767e-06, + "loss": 0.4545, + "step": 1368 + }, + { + "epoch": 0.2508244778307072, + "grad_norm": 0.5042927537180502, + "learning_rate": 9.32478739050249e-06, + "loss": 0.4994, + "step": 1369 + }, + { + "epoch": 0.25100769512641996, + "grad_norm": 0.43157899355551005, + "learning_rate": 9.323181667861976e-06, + "loss": 0.5033, + "step": 1370 + }, + { + "epoch": 0.25119091242213265, + "grad_norm": 0.4065168380531392, + "learning_rate": 9.321574176797055e-06, + "loss": 0.4767, + "step": 1371 + }, + { + "epoch": 0.25137412971784534, + "grad_norm": 0.3698342286632864, + "learning_rate": 9.31996491796528e-06, + "loss": 0.4863, + "step": 1372 + }, + { + "epoch": 0.2515573470135581, + "grad_norm": 0.43408179894940496, + "learning_rate": 9.318353892024934e-06, + "loss": 0.505, + "step": 1373 + }, + { + "epoch": 0.2517405643092708, + "grad_norm": 0.39467414262175576, + "learning_rate": 9.316741099635012e-06, + "loss": 0.4922, + "step": 1374 + }, + { + "epoch": 0.2519237816049835, + "grad_norm": 0.37099615952376014, + "learning_rate": 9.315126541455237e-06, + "loss": 0.4723, + "step": 1375 + }, + { + "epoch": 0.2521069989006962, + "grad_norm": 0.40654704513015383, + "learning_rate": 9.313510218146055e-06, + "loss": 0.4858, + "step": 1376 + }, + { + "epoch": 0.25229021619640896, + "grad_norm": 0.4320196580766476, + "learning_rate": 9.311892130368636e-06, + "loss": 0.4773, + "step": 1377 + }, + { + "epoch": 0.25247343349212165, + "grad_norm": 0.4649358511506784, + "learning_rate": 9.310272278784865e-06, + "loss": 0.506, + "step": 1378 + }, + { + "epoch": 0.2526566507878344, + "grad_norm": 0.37417265682881873, + "learning_rate": 9.308650664057352e-06, + "loss": 0.474, + "step": 1379 + }, + { + "epoch": 0.2528398680835471, + "grad_norm": 0.3877983907225597, + "learning_rate": 9.307027286849434e-06, + "loss": 0.5005, + "step": 1380 + }, + { + "epoch": 0.2530230853792598, + "grad_norm": 0.3604568123656264, + "learning_rate": 9.305402147825155e-06, + "loss": 0.4931, + "step": 1381 + }, + { + "epoch": 0.2532063026749725, + "grad_norm": 0.4076431370889414, + "learning_rate": 9.303775247649297e-06, + "loss": 0.4781, + "step": 1382 + }, + { + "epoch": 0.2533895199706852, + "grad_norm": 0.4125540102911716, + "learning_rate": 9.302146586987347e-06, + "loss": 0.489, + "step": 1383 + }, + { + "epoch": 0.25357273726639795, + "grad_norm": 0.38688511296298134, + "learning_rate": 9.300516166505523e-06, + "loss": 0.4997, + "step": 1384 + }, + { + "epoch": 0.25375595456211064, + "grad_norm": 0.3901423062453873, + "learning_rate": 9.298883986870757e-06, + "loss": 0.4599, + "step": 1385 + }, + { + "epoch": 0.2539391718578234, + "grad_norm": 0.3963777690852405, + "learning_rate": 9.297250048750702e-06, + "loss": 0.5013, + "step": 1386 + }, + { + "epoch": 0.2541223891535361, + "grad_norm": 0.42814597983549907, + "learning_rate": 9.295614352813732e-06, + "loss": 0.4841, + "step": 1387 + }, + { + "epoch": 0.2543056064492488, + "grad_norm": 0.4342615131111368, + "learning_rate": 9.293976899728936e-06, + "loss": 0.4893, + "step": 1388 + }, + { + "epoch": 0.2544888237449615, + "grad_norm": 0.3801898878490571, + "learning_rate": 9.292337690166127e-06, + "loss": 0.4921, + "step": 1389 + }, + { + "epoch": 0.25467204104067426, + "grad_norm": 0.41610528591828644, + "learning_rate": 9.290696724795833e-06, + "loss": 0.5179, + "step": 1390 + }, + { + "epoch": 0.25485525833638695, + "grad_norm": 0.4071057844053913, + "learning_rate": 9.289054004289302e-06, + "loss": 0.463, + "step": 1391 + }, + { + "epoch": 0.2550384756320997, + "grad_norm": 0.44828293293160687, + "learning_rate": 9.287409529318497e-06, + "loss": 0.4899, + "step": 1392 + }, + { + "epoch": 0.2552216929278124, + "grad_norm": 0.4044272763220796, + "learning_rate": 9.285763300556101e-06, + "loss": 0.5114, + "step": 1393 + }, + { + "epoch": 0.25540491022352513, + "grad_norm": 0.4027954029425126, + "learning_rate": 9.284115318675515e-06, + "loss": 0.4706, + "step": 1394 + }, + { + "epoch": 0.2555881275192378, + "grad_norm": 0.4318050133048906, + "learning_rate": 9.282465584350856e-06, + "loss": 0.4931, + "step": 1395 + }, + { + "epoch": 0.2557713448149505, + "grad_norm": 0.39621664861383626, + "learning_rate": 9.280814098256961e-06, + "loss": 0.4561, + "step": 1396 + }, + { + "epoch": 0.25595456211066325, + "grad_norm": 0.4720751749880995, + "learning_rate": 9.279160861069376e-06, + "loss": 0.4889, + "step": 1397 + }, + { + "epoch": 0.25613777940637594, + "grad_norm": 0.47784125077057304, + "learning_rate": 9.277505873464369e-06, + "loss": 0.5292, + "step": 1398 + }, + { + "epoch": 0.2563209967020887, + "grad_norm": 0.48714587413048666, + "learning_rate": 9.275849136118926e-06, + "loss": 0.4785, + "step": 1399 + }, + { + "epoch": 0.2565042139978014, + "grad_norm": 0.403984192319981, + "learning_rate": 9.274190649710743e-06, + "loss": 0.5076, + "step": 1400 + }, + { + "epoch": 0.2566874312935141, + "grad_norm": 0.39832914408918374, + "learning_rate": 9.272530414918236e-06, + "loss": 0.508, + "step": 1401 + }, + { + "epoch": 0.2568706485892268, + "grad_norm": 0.4345591492942482, + "learning_rate": 9.270868432420532e-06, + "loss": 0.434, + "step": 1402 + }, + { + "epoch": 0.25705386588493956, + "grad_norm": 0.38575416668818974, + "learning_rate": 9.269204702897476e-06, + "loss": 0.5123, + "step": 1403 + }, + { + "epoch": 0.25723708318065225, + "grad_norm": 0.46248145640766813, + "learning_rate": 9.267539227029628e-06, + "loss": 0.462, + "step": 1404 + }, + { + "epoch": 0.257420300476365, + "grad_norm": 0.41624325170317916, + "learning_rate": 9.265872005498263e-06, + "loss": 0.5224, + "step": 1405 + }, + { + "epoch": 0.2576035177720777, + "grad_norm": 0.45537999223861836, + "learning_rate": 9.264203038985362e-06, + "loss": 0.4863, + "step": 1406 + }, + { + "epoch": 0.2577867350677904, + "grad_norm": 0.41647722499814044, + "learning_rate": 9.262532328173633e-06, + "loss": 0.498, + "step": 1407 + }, + { + "epoch": 0.2579699523635031, + "grad_norm": 0.39714530799054115, + "learning_rate": 9.260859873746487e-06, + "loss": 0.4874, + "step": 1408 + }, + { + "epoch": 0.2581531696592158, + "grad_norm": 0.3703455263000783, + "learning_rate": 9.259185676388053e-06, + "loss": 0.4676, + "step": 1409 + }, + { + "epoch": 0.25833638695492855, + "grad_norm": 0.40896486520452896, + "learning_rate": 9.257509736783173e-06, + "loss": 0.4972, + "step": 1410 + }, + { + "epoch": 0.25851960425064124, + "grad_norm": 0.38349770802111716, + "learning_rate": 9.2558320556174e-06, + "loss": 0.4986, + "step": 1411 + }, + { + "epoch": 0.258702821546354, + "grad_norm": 0.38845233640399973, + "learning_rate": 9.254152633576996e-06, + "loss": 0.4781, + "step": 1412 + }, + { + "epoch": 0.2588860388420667, + "grad_norm": 0.4414806402475634, + "learning_rate": 9.252471471348945e-06, + "loss": 0.4998, + "step": 1413 + }, + { + "epoch": 0.2590692561377794, + "grad_norm": 0.6508729529767923, + "learning_rate": 9.250788569620933e-06, + "loss": 0.4731, + "step": 1414 + }, + { + "epoch": 0.2592524734334921, + "grad_norm": 0.3943363905618656, + "learning_rate": 9.249103929081362e-06, + "loss": 0.5024, + "step": 1415 + }, + { + "epoch": 0.25943569072920486, + "grad_norm": 0.40856928097689554, + "learning_rate": 9.247417550419347e-06, + "loss": 0.5212, + "step": 1416 + }, + { + "epoch": 0.25961890802491755, + "grad_norm": 0.441062293662255, + "learning_rate": 9.245729434324708e-06, + "loss": 0.4818, + "step": 1417 + }, + { + "epoch": 0.2598021253206303, + "grad_norm": 0.3948984673415181, + "learning_rate": 9.244039581487983e-06, + "loss": 0.4666, + "step": 1418 + }, + { + "epoch": 0.259985342616343, + "grad_norm": 0.3925260255665206, + "learning_rate": 9.242347992600416e-06, + "loss": 0.5116, + "step": 1419 + }, + { + "epoch": 0.2601685599120557, + "grad_norm": 0.4304859621407188, + "learning_rate": 9.24065466835396e-06, + "loss": 0.4982, + "step": 1420 + }, + { + "epoch": 0.2603517772077684, + "grad_norm": 0.40221018211406145, + "learning_rate": 9.23895960944128e-06, + "loss": 0.5005, + "step": 1421 + }, + { + "epoch": 0.2605349945034811, + "grad_norm": 0.41700277900738475, + "learning_rate": 9.237262816555755e-06, + "loss": 0.5002, + "step": 1422 + }, + { + "epoch": 0.26071821179919386, + "grad_norm": 0.4685774232060306, + "learning_rate": 9.235564290391461e-06, + "loss": 0.511, + "step": 1423 + }, + { + "epoch": 0.26090142909490655, + "grad_norm": 0.43210491917117466, + "learning_rate": 9.233864031643199e-06, + "loss": 0.4969, + "step": 1424 + }, + { + "epoch": 0.2610846463906193, + "grad_norm": 0.39899357606382013, + "learning_rate": 9.232162041006463e-06, + "loss": 0.5067, + "step": 1425 + }, + { + "epoch": 0.261267863686332, + "grad_norm": 0.3914730405010264, + "learning_rate": 9.230458319177467e-06, + "loss": 0.4567, + "step": 1426 + }, + { + "epoch": 0.2614510809820447, + "grad_norm": 0.4224757115631004, + "learning_rate": 9.22875286685313e-06, + "loss": 0.4878, + "step": 1427 + }, + { + "epoch": 0.2616342982777574, + "grad_norm": 0.44070937438452384, + "learning_rate": 9.227045684731075e-06, + "loss": 0.4751, + "step": 1428 + }, + { + "epoch": 0.26181751557347016, + "grad_norm": 0.41874124802141355, + "learning_rate": 9.225336773509637e-06, + "loss": 0.495, + "step": 1429 + }, + { + "epoch": 0.26200073286918285, + "grad_norm": 0.39663369299149254, + "learning_rate": 9.223626133887856e-06, + "loss": 0.502, + "step": 1430 + }, + { + "epoch": 0.26218395016489554, + "grad_norm": 0.3978403037913836, + "learning_rate": 9.22191376656548e-06, + "loss": 0.5057, + "step": 1431 + }, + { + "epoch": 0.2623671674606083, + "grad_norm": 0.39621285816390917, + "learning_rate": 9.220199672242965e-06, + "loss": 0.4787, + "step": 1432 + }, + { + "epoch": 0.262550384756321, + "grad_norm": 0.36823591327830396, + "learning_rate": 9.218483851621469e-06, + "loss": 0.4759, + "step": 1433 + }, + { + "epoch": 0.2627336020520337, + "grad_norm": 0.40956001204846254, + "learning_rate": 9.216766305402862e-06, + "loss": 0.4992, + "step": 1434 + }, + { + "epoch": 0.2629168193477464, + "grad_norm": 0.4909546794694869, + "learning_rate": 9.215047034289716e-06, + "loss": 0.4893, + "step": 1435 + }, + { + "epoch": 0.26310003664345916, + "grad_norm": 0.38305868280097505, + "learning_rate": 9.213326038985308e-06, + "loss": 0.4998, + "step": 1436 + }, + { + "epoch": 0.26328325393917185, + "grad_norm": 0.4506364554374671, + "learning_rate": 9.211603320193624e-06, + "loss": 0.5376, + "step": 1437 + }, + { + "epoch": 0.2634664712348846, + "grad_norm": 0.4532783129632112, + "learning_rate": 9.209878878619354e-06, + "loss": 0.4792, + "step": 1438 + }, + { + "epoch": 0.2636496885305973, + "grad_norm": 0.3927838537868758, + "learning_rate": 9.208152714967888e-06, + "loss": 0.4671, + "step": 1439 + }, + { + "epoch": 0.26383290582631, + "grad_norm": 0.43211843791598387, + "learning_rate": 9.206424829945326e-06, + "loss": 0.5083, + "step": 1440 + }, + { + "epoch": 0.2640161231220227, + "grad_norm": 0.4598413576241595, + "learning_rate": 9.20469522425847e-06, + "loss": 0.4946, + "step": 1441 + }, + { + "epoch": 0.2641993404177354, + "grad_norm": 0.4087512250232026, + "learning_rate": 9.202963898614825e-06, + "loss": 0.5002, + "step": 1442 + }, + { + "epoch": 0.26438255771344815, + "grad_norm": 0.4584659510926636, + "learning_rate": 9.201230853722603e-06, + "loss": 0.4979, + "step": 1443 + }, + { + "epoch": 0.26456577500916084, + "grad_norm": 0.4088655212775404, + "learning_rate": 9.199496090290713e-06, + "loss": 0.5083, + "step": 1444 + }, + { + "epoch": 0.2647489923048736, + "grad_norm": 0.39750766713674895, + "learning_rate": 9.197759609028774e-06, + "loss": 0.4447, + "step": 1445 + }, + { + "epoch": 0.2649322096005863, + "grad_norm": 2.5841998747640154, + "learning_rate": 9.196021410647104e-06, + "loss": 0.4938, + "step": 1446 + }, + { + "epoch": 0.265115426896299, + "grad_norm": 0.4352826266759048, + "learning_rate": 9.194281495856724e-06, + "loss": 0.4791, + "step": 1447 + }, + { + "epoch": 0.2652986441920117, + "grad_norm": 0.4347966702352818, + "learning_rate": 9.192539865369354e-06, + "loss": 0.4713, + "step": 1448 + }, + { + "epoch": 0.26548186148772446, + "grad_norm": 0.37418945105267554, + "learning_rate": 9.190796519897423e-06, + "loss": 0.4953, + "step": 1449 + }, + { + "epoch": 0.26566507878343715, + "grad_norm": 0.45051341568089304, + "learning_rate": 9.189051460154054e-06, + "loss": 0.4844, + "step": 1450 + }, + { + "epoch": 0.2658482960791499, + "grad_norm": 0.4213633340607822, + "learning_rate": 9.187304686853078e-06, + "loss": 0.4927, + "step": 1451 + }, + { + "epoch": 0.2660315133748626, + "grad_norm": 0.4083049015051242, + "learning_rate": 9.185556200709021e-06, + "loss": 0.4819, + "step": 1452 + }, + { + "epoch": 0.26621473067057533, + "grad_norm": 0.7665941590007657, + "learning_rate": 9.183806002437112e-06, + "loss": 0.5237, + "step": 1453 + }, + { + "epoch": 0.266397947966288, + "grad_norm": 0.4231366399299115, + "learning_rate": 9.182054092753281e-06, + "loss": 0.5165, + "step": 1454 + }, + { + "epoch": 0.2665811652620007, + "grad_norm": 0.3819742000154287, + "learning_rate": 9.180300472374158e-06, + "loss": 0.5068, + "step": 1455 + }, + { + "epoch": 0.26676438255771345, + "grad_norm": 0.430881793871794, + "learning_rate": 9.178545142017073e-06, + "loss": 0.4855, + "step": 1456 + }, + { + "epoch": 0.26694759985342614, + "grad_norm": 0.4120790750347018, + "learning_rate": 9.176788102400053e-06, + "loss": 0.4885, + "step": 1457 + }, + { + "epoch": 0.2671308171491389, + "grad_norm": 0.4538815285108574, + "learning_rate": 9.175029354241827e-06, + "loss": 0.4863, + "step": 1458 + }, + { + "epoch": 0.2673140344448516, + "grad_norm": 0.3830901020270506, + "learning_rate": 9.173268898261822e-06, + "loss": 0.4863, + "step": 1459 + }, + { + "epoch": 0.2674972517405643, + "grad_norm": 0.4245356876119995, + "learning_rate": 9.171506735180164e-06, + "loss": 0.4711, + "step": 1460 + }, + { + "epoch": 0.267680469036277, + "grad_norm": 0.47590664332011035, + "learning_rate": 9.169742865717675e-06, + "loss": 0.4902, + "step": 1461 + }, + { + "epoch": 0.26786368633198976, + "grad_norm": 0.4472826376520853, + "learning_rate": 9.167977290595879e-06, + "loss": 0.4752, + "step": 1462 + }, + { + "epoch": 0.26804690362770245, + "grad_norm": 0.45383853982527605, + "learning_rate": 9.166210010536996e-06, + "loss": 0.5041, + "step": 1463 + }, + { + "epoch": 0.2682301209234152, + "grad_norm": 0.4349556389365925, + "learning_rate": 9.164441026263939e-06, + "loss": 0.4841, + "step": 1464 + }, + { + "epoch": 0.2684133382191279, + "grad_norm": 0.3896144484943624, + "learning_rate": 9.162670338500323e-06, + "loss": 0.4691, + "step": 1465 + }, + { + "epoch": 0.2685965555148406, + "grad_norm": 0.40828579323895614, + "learning_rate": 9.160897947970463e-06, + "loss": 0.5087, + "step": 1466 + }, + { + "epoch": 0.2687797728105533, + "grad_norm": 0.44921268339324344, + "learning_rate": 9.159123855399364e-06, + "loss": 0.5205, + "step": 1467 + }, + { + "epoch": 0.268962990106266, + "grad_norm": 0.3749973142689185, + "learning_rate": 9.157348061512728e-06, + "loss": 0.5054, + "step": 1468 + }, + { + "epoch": 0.26914620740197875, + "grad_norm": 0.8460064155416173, + "learning_rate": 9.155570567036956e-06, + "loss": 0.504, + "step": 1469 + }, + { + "epoch": 0.26932942469769144, + "grad_norm": 0.40099978049747603, + "learning_rate": 9.153791372699141e-06, + "loss": 0.5169, + "step": 1470 + }, + { + "epoch": 0.2695126419934042, + "grad_norm": 0.399695867626504, + "learning_rate": 9.152010479227078e-06, + "loss": 0.4435, + "step": 1471 + }, + { + "epoch": 0.2696958592891169, + "grad_norm": 0.4522941668627639, + "learning_rate": 9.15022788734925e-06, + "loss": 0.4892, + "step": 1472 + }, + { + "epoch": 0.2698790765848296, + "grad_norm": 0.3907998733925797, + "learning_rate": 9.148443597794839e-06, + "loss": 0.4763, + "step": 1473 + }, + { + "epoch": 0.2700622938805423, + "grad_norm": 0.3989688222157153, + "learning_rate": 9.146657611293714e-06, + "loss": 0.4907, + "step": 1474 + }, + { + "epoch": 0.27024551117625506, + "grad_norm": 0.4158272717294187, + "learning_rate": 9.144869928576451e-06, + "loss": 0.4585, + "step": 1475 + }, + { + "epoch": 0.27042872847196775, + "grad_norm": 0.4122595114642358, + "learning_rate": 9.14308055037431e-06, + "loss": 0.5156, + "step": 1476 + }, + { + "epoch": 0.2706119457676805, + "grad_norm": 0.5428248637214301, + "learning_rate": 9.141289477419246e-06, + "loss": 0.5146, + "step": 1477 + }, + { + "epoch": 0.2707951630633932, + "grad_norm": 0.4081668756027033, + "learning_rate": 9.139496710443911e-06, + "loss": 0.4962, + "step": 1478 + }, + { + "epoch": 0.2709783803591059, + "grad_norm": 0.43919686618422127, + "learning_rate": 9.137702250181646e-06, + "loss": 0.5238, + "step": 1479 + }, + { + "epoch": 0.2711615976548186, + "grad_norm": 0.4199352917649959, + "learning_rate": 9.135906097366486e-06, + "loss": 0.5171, + "step": 1480 + }, + { + "epoch": 0.2713448149505313, + "grad_norm": 0.3812628825329553, + "learning_rate": 9.134108252733159e-06, + "loss": 0.4919, + "step": 1481 + }, + { + "epoch": 0.27152803224624406, + "grad_norm": 0.4062350904486568, + "learning_rate": 9.132308717017084e-06, + "loss": 0.4827, + "step": 1482 + }, + { + "epoch": 0.27171124954195675, + "grad_norm": 0.37649670407672037, + "learning_rate": 9.130507490954375e-06, + "loss": 0.4862, + "step": 1483 + }, + { + "epoch": 0.2718944668376695, + "grad_norm": 0.39510364308925733, + "learning_rate": 9.12870457528183e-06, + "loss": 0.4695, + "step": 1484 + }, + { + "epoch": 0.2720776841333822, + "grad_norm": 0.4412716554236446, + "learning_rate": 9.126899970736947e-06, + "loss": 0.4929, + "step": 1485 + }, + { + "epoch": 0.2722609014290949, + "grad_norm": 0.37777764528277363, + "learning_rate": 9.125093678057909e-06, + "loss": 0.4965, + "step": 1486 + }, + { + "epoch": 0.2724441187248076, + "grad_norm": 0.3701200088318955, + "learning_rate": 9.12328569798359e-06, + "loss": 0.4857, + "step": 1487 + }, + { + "epoch": 0.27262733602052036, + "grad_norm": 0.4323306961731567, + "learning_rate": 9.121476031253557e-06, + "loss": 0.47, + "step": 1488 + }, + { + "epoch": 0.27281055331623305, + "grad_norm": 0.40253559983281856, + "learning_rate": 9.119664678608067e-06, + "loss": 0.4474, + "step": 1489 + }, + { + "epoch": 0.27299377061194574, + "grad_norm": 0.36316272808709116, + "learning_rate": 9.117851640788064e-06, + "loss": 0.4743, + "step": 1490 + }, + { + "epoch": 0.2731769879076585, + "grad_norm": 0.39987807717863316, + "learning_rate": 9.11603691853518e-06, + "loss": 0.5074, + "step": 1491 + }, + { + "epoch": 0.2733602052033712, + "grad_norm": 0.36775081110408747, + "learning_rate": 9.11422051259174e-06, + "loss": 0.4805, + "step": 1492 + }, + { + "epoch": 0.2735434224990839, + "grad_norm": 0.41940289454678054, + "learning_rate": 9.112402423700759e-06, + "loss": 0.5071, + "step": 1493 + }, + { + "epoch": 0.2737266397947966, + "grad_norm": 0.3883498469145558, + "learning_rate": 9.110582652605934e-06, + "loss": 0.4839, + "step": 1494 + }, + { + "epoch": 0.27390985709050936, + "grad_norm": 0.40736107872427696, + "learning_rate": 9.108761200051655e-06, + "loss": 0.4769, + "step": 1495 + }, + { + "epoch": 0.27409307438622205, + "grad_norm": 0.42145067923082186, + "learning_rate": 9.106938066782998e-06, + "loss": 0.5176, + "step": 1496 + }, + { + "epoch": 0.2742762916819348, + "grad_norm": 0.4436513324175086, + "learning_rate": 9.105113253545727e-06, + "loss": 0.5185, + "step": 1497 + }, + { + "epoch": 0.2744595089776475, + "grad_norm": 0.3692944363116775, + "learning_rate": 9.103286761086294e-06, + "loss": 0.4369, + "step": 1498 + }, + { + "epoch": 0.2746427262733602, + "grad_norm": 0.4760665480063585, + "learning_rate": 9.101458590151837e-06, + "loss": 0.5006, + "step": 1499 + }, + { + "epoch": 0.2748259435690729, + "grad_norm": 0.4079202639977091, + "learning_rate": 9.099628741490179e-06, + "loss": 0.486, + "step": 1500 + }, + { + "epoch": 0.27500916086478566, + "grad_norm": 0.43182526673935, + "learning_rate": 9.097797215849834e-06, + "loss": 0.5081, + "step": 1501 + }, + { + "epoch": 0.27519237816049835, + "grad_norm": 0.39431567471837187, + "learning_rate": 9.095964013979998e-06, + "loss": 0.4692, + "step": 1502 + }, + { + "epoch": 0.27537559545621104, + "grad_norm": 0.4019693616622923, + "learning_rate": 9.094129136630552e-06, + "loss": 0.476, + "step": 1503 + }, + { + "epoch": 0.2755588127519238, + "grad_norm": 0.4313498077896387, + "learning_rate": 9.092292584552064e-06, + "loss": 0.4775, + "step": 1504 + }, + { + "epoch": 0.2757420300476365, + "grad_norm": 0.3604034259823722, + "learning_rate": 9.09045435849579e-06, + "loss": 0.497, + "step": 1505 + }, + { + "epoch": 0.2759252473433492, + "grad_norm": 0.4981716823895217, + "learning_rate": 9.088614459213665e-06, + "loss": 0.5151, + "step": 1506 + }, + { + "epoch": 0.2761084646390619, + "grad_norm": 0.42985286366030234, + "learning_rate": 9.086772887458314e-06, + "loss": 0.4724, + "step": 1507 + }, + { + "epoch": 0.27629168193477466, + "grad_norm": 0.40646371473453785, + "learning_rate": 9.08492964398304e-06, + "loss": 0.4875, + "step": 1508 + }, + { + "epoch": 0.27647489923048735, + "grad_norm": 0.3651071526478735, + "learning_rate": 9.083084729541836e-06, + "loss": 0.5, + "step": 1509 + }, + { + "epoch": 0.2766581165262001, + "grad_norm": 0.49895693881998293, + "learning_rate": 9.081238144889373e-06, + "loss": 0.5184, + "step": 1510 + }, + { + "epoch": 0.2768413338219128, + "grad_norm": 0.4128024780491911, + "learning_rate": 9.079389890781011e-06, + "loss": 0.4879, + "step": 1511 + }, + { + "epoch": 0.27702455111762553, + "grad_norm": 0.41553878397064187, + "learning_rate": 9.077539967972788e-06, + "loss": 0.5132, + "step": 1512 + }, + { + "epoch": 0.2772077684133382, + "grad_norm": 0.5023456269550239, + "learning_rate": 9.075688377221428e-06, + "loss": 0.5316, + "step": 1513 + }, + { + "epoch": 0.2773909857090509, + "grad_norm": 0.42658543986166453, + "learning_rate": 9.073835119284333e-06, + "loss": 0.4954, + "step": 1514 + }, + { + "epoch": 0.27757420300476365, + "grad_norm": 0.4257754248428276, + "learning_rate": 9.071980194919592e-06, + "loss": 0.4739, + "step": 1515 + }, + { + "epoch": 0.27775742030047634, + "grad_norm": 0.36513061047252116, + "learning_rate": 9.070123604885973e-06, + "loss": 0.4666, + "step": 1516 + }, + { + "epoch": 0.2779406375961891, + "grad_norm": 0.39730883326499855, + "learning_rate": 9.068265349942926e-06, + "loss": 0.4857, + "step": 1517 + }, + { + "epoch": 0.2781238548919018, + "grad_norm": 0.4089832030333659, + "learning_rate": 9.06640543085058e-06, + "loss": 0.5033, + "step": 1518 + }, + { + "epoch": 0.2783070721876145, + "grad_norm": 0.45930382348605114, + "learning_rate": 9.064543848369749e-06, + "loss": 0.5071, + "step": 1519 + }, + { + "epoch": 0.2784902894833272, + "grad_norm": 0.420241795630585, + "learning_rate": 9.062680603261923e-06, + "loss": 0.4966, + "step": 1520 + }, + { + "epoch": 0.27867350677903996, + "grad_norm": 0.45585947515287706, + "learning_rate": 9.060815696289273e-06, + "loss": 0.475, + "step": 1521 + }, + { + "epoch": 0.27885672407475265, + "grad_norm": 0.37205804655379004, + "learning_rate": 9.058949128214655e-06, + "loss": 0.5017, + "step": 1522 + }, + { + "epoch": 0.2790399413704654, + "grad_norm": 0.4529387207435349, + "learning_rate": 9.057080899801598e-06, + "loss": 0.482, + "step": 1523 + }, + { + "epoch": 0.2792231586661781, + "grad_norm": 0.3712655517985857, + "learning_rate": 9.055211011814312e-06, + "loss": 0.458, + "step": 1524 + }, + { + "epoch": 0.27940637596189083, + "grad_norm": 0.39642625126203734, + "learning_rate": 9.053339465017685e-06, + "loss": 0.4687, + "step": 1525 + }, + { + "epoch": 0.2795895932576035, + "grad_norm": 0.4136774885883469, + "learning_rate": 9.051466260177286e-06, + "loss": 0.5088, + "step": 1526 + }, + { + "epoch": 0.2797728105533162, + "grad_norm": 0.41134059761022984, + "learning_rate": 9.049591398059364e-06, + "loss": 0.4934, + "step": 1527 + }, + { + "epoch": 0.27995602784902895, + "grad_norm": 0.42078398613472917, + "learning_rate": 9.047714879430841e-06, + "loss": 0.488, + "step": 1528 + }, + { + "epoch": 0.28013924514474164, + "grad_norm": 0.3893031394642896, + "learning_rate": 9.045836705059316e-06, + "loss": 0.4787, + "step": 1529 + }, + { + "epoch": 0.2803224624404544, + "grad_norm": 0.4093693977481649, + "learning_rate": 9.043956875713071e-06, + "loss": 0.5038, + "step": 1530 + }, + { + "epoch": 0.2805056797361671, + "grad_norm": 0.39262539132038, + "learning_rate": 9.042075392161062e-06, + "loss": 0.4789, + "step": 1531 + }, + { + "epoch": 0.2806888970318798, + "grad_norm": 0.45335811960346867, + "learning_rate": 9.040192255172919e-06, + "loss": 0.4875, + "step": 1532 + }, + { + "epoch": 0.2808721143275925, + "grad_norm": 0.36503912559147556, + "learning_rate": 9.038307465518954e-06, + "loss": 0.477, + "step": 1533 + }, + { + "epoch": 0.28105533162330526, + "grad_norm": 0.4257211312986204, + "learning_rate": 9.03642102397015e-06, + "loss": 0.469, + "step": 1534 + }, + { + "epoch": 0.28123854891901795, + "grad_norm": 0.4252988723490824, + "learning_rate": 9.034532931298169e-06, + "loss": 0.4881, + "step": 1535 + }, + { + "epoch": 0.2814217662147307, + "grad_norm": 0.40947075405147687, + "learning_rate": 9.032643188275346e-06, + "loss": 0.4943, + "step": 1536 + }, + { + "epoch": 0.2816049835104434, + "grad_norm": 0.4104856496215327, + "learning_rate": 9.030751795674693e-06, + "loss": 0.5001, + "step": 1537 + }, + { + "epoch": 0.2817882008061561, + "grad_norm": 0.4108489148032505, + "learning_rate": 9.028858754269893e-06, + "loss": 0.486, + "step": 1538 + }, + { + "epoch": 0.2819714181018688, + "grad_norm": 0.400218515841904, + "learning_rate": 9.026964064835312e-06, + "loss": 0.4875, + "step": 1539 + }, + { + "epoch": 0.2821546353975815, + "grad_norm": 0.392307962036209, + "learning_rate": 9.02506772814598e-06, + "loss": 0.4696, + "step": 1540 + }, + { + "epoch": 0.28233785269329426, + "grad_norm": 0.39418249225011087, + "learning_rate": 9.023169744977607e-06, + "loss": 0.4944, + "step": 1541 + }, + { + "epoch": 0.28252106998900695, + "grad_norm": 0.36881071138331667, + "learning_rate": 9.021270116106574e-06, + "loss": 0.458, + "step": 1542 + }, + { + "epoch": 0.2827042872847197, + "grad_norm": 0.3604688853323364, + "learning_rate": 9.019368842309937e-06, + "loss": 0.4612, + "step": 1543 + }, + { + "epoch": 0.2828875045804324, + "grad_norm": 0.43502584499562924, + "learning_rate": 9.017465924365423e-06, + "loss": 0.535, + "step": 1544 + }, + { + "epoch": 0.2830707218761451, + "grad_norm": 0.3495743983793779, + "learning_rate": 9.015561363051434e-06, + "loss": 0.4884, + "step": 1545 + }, + { + "epoch": 0.2832539391718578, + "grad_norm": 0.38746814328257545, + "learning_rate": 9.01365515914704e-06, + "loss": 0.4684, + "step": 1546 + }, + { + "epoch": 0.28343715646757056, + "grad_norm": 0.42208621683162756, + "learning_rate": 9.011747313431988e-06, + "loss": 0.4894, + "step": 1547 + }, + { + "epoch": 0.28362037376328325, + "grad_norm": 0.40174728758795464, + "learning_rate": 9.009837826686693e-06, + "loss": 0.4892, + "step": 1548 + }, + { + "epoch": 0.283803591058996, + "grad_norm": 0.3804107018996848, + "learning_rate": 9.007926699692244e-06, + "loss": 0.4781, + "step": 1549 + }, + { + "epoch": 0.2839868083547087, + "grad_norm": 0.4347596518542267, + "learning_rate": 9.006013933230398e-06, + "loss": 0.4735, + "step": 1550 + }, + { + "epoch": 0.2841700256504214, + "grad_norm": 0.4121135548481947, + "learning_rate": 9.004099528083583e-06, + "loss": 0.4719, + "step": 1551 + }, + { + "epoch": 0.2843532429461341, + "grad_norm": 0.44795396529132614, + "learning_rate": 9.0021834850349e-06, + "loss": 0.5099, + "step": 1552 + }, + { + "epoch": 0.2845364602418468, + "grad_norm": 0.44372233437035424, + "learning_rate": 9.000265804868118e-06, + "loss": 0.4532, + "step": 1553 + }, + { + "epoch": 0.28471967753755956, + "grad_norm": 0.37122246988181606, + "learning_rate": 8.998346488367675e-06, + "loss": 0.5031, + "step": 1554 + }, + { + "epoch": 0.28490289483327225, + "grad_norm": 0.4132553844079889, + "learning_rate": 8.996425536318683e-06, + "loss": 0.4973, + "step": 1555 + }, + { + "epoch": 0.285086112128985, + "grad_norm": 0.44678572881841533, + "learning_rate": 8.994502949506914e-06, + "loss": 0.4937, + "step": 1556 + }, + { + "epoch": 0.2852693294246977, + "grad_norm": 0.3961645213095161, + "learning_rate": 8.99257872871882e-06, + "loss": 0.4917, + "step": 1557 + }, + { + "epoch": 0.2854525467204104, + "grad_norm": 0.4068575794251254, + "learning_rate": 8.99065287474151e-06, + "loss": 0.5059, + "step": 1558 + }, + { + "epoch": 0.2856357640161231, + "grad_norm": 0.3439421297681402, + "learning_rate": 8.98872538836277e-06, + "loss": 0.4943, + "step": 1559 + }, + { + "epoch": 0.28581898131183586, + "grad_norm": 0.4072178672567339, + "learning_rate": 8.986796270371047e-06, + "loss": 0.4884, + "step": 1560 + }, + { + "epoch": 0.28600219860754855, + "grad_norm": 0.42449273454289244, + "learning_rate": 8.984865521555464e-06, + "loss": 0.4875, + "step": 1561 + }, + { + "epoch": 0.28618541590326124, + "grad_norm": 0.37613787684490696, + "learning_rate": 8.9829331427058e-06, + "loss": 0.4857, + "step": 1562 + }, + { + "epoch": 0.286368633198974, + "grad_norm": 0.36486974332112687, + "learning_rate": 8.980999134612512e-06, + "loss": 0.5002, + "step": 1563 + }, + { + "epoch": 0.2865518504946867, + "grad_norm": 0.38386234504756434, + "learning_rate": 8.979063498066714e-06, + "loss": 0.4885, + "step": 1564 + }, + { + "epoch": 0.2867350677903994, + "grad_norm": 0.39062088012736546, + "learning_rate": 8.977126233860193e-06, + "loss": 0.4936, + "step": 1565 + }, + { + "epoch": 0.2869182850861121, + "grad_norm": 0.4126575331178653, + "learning_rate": 8.975187342785397e-06, + "loss": 0.4903, + "step": 1566 + }, + { + "epoch": 0.28710150238182486, + "grad_norm": 0.46684149874863134, + "learning_rate": 8.973246825635441e-06, + "loss": 0.527, + "step": 1567 + }, + { + "epoch": 0.28728471967753755, + "grad_norm": 0.39335552791237355, + "learning_rate": 8.97130468320411e-06, + "loss": 0.4779, + "step": 1568 + }, + { + "epoch": 0.2874679369732503, + "grad_norm": 0.4246548051800614, + "learning_rate": 8.969360916285844e-06, + "loss": 0.4815, + "step": 1569 + }, + { + "epoch": 0.287651154268963, + "grad_norm": 0.44025612619503274, + "learning_rate": 8.967415525675756e-06, + "loss": 0.4371, + "step": 1570 + }, + { + "epoch": 0.28783437156467573, + "grad_norm": 0.9703871617250511, + "learning_rate": 8.96546851216962e-06, + "loss": 0.4856, + "step": 1571 + }, + { + "epoch": 0.2880175888603884, + "grad_norm": 0.49217374632039296, + "learning_rate": 8.96351987656387e-06, + "loss": 0.5087, + "step": 1572 + }, + { + "epoch": 0.28820080615610116, + "grad_norm": 0.41378220482583816, + "learning_rate": 8.961569619655615e-06, + "loss": 0.4736, + "step": 1573 + }, + { + "epoch": 0.28838402345181385, + "grad_norm": 0.4186462441272065, + "learning_rate": 8.959617742242615e-06, + "loss": 0.493, + "step": 1574 + }, + { + "epoch": 0.28856724074752654, + "grad_norm": 0.3880931424436376, + "learning_rate": 8.957664245123297e-06, + "loss": 0.4844, + "step": 1575 + }, + { + "epoch": 0.2887504580432393, + "grad_norm": 0.3741132383694949, + "learning_rate": 8.955709129096752e-06, + "loss": 0.4055, + "step": 1576 + }, + { + "epoch": 0.288933675338952, + "grad_norm": 0.44863598418647255, + "learning_rate": 8.953752394962736e-06, + "loss": 0.5278, + "step": 1577 + }, + { + "epoch": 0.2891168926346647, + "grad_norm": 0.3876273635035489, + "learning_rate": 8.951794043521657e-06, + "loss": 0.4859, + "step": 1578 + }, + { + "epoch": 0.2893001099303774, + "grad_norm": 0.34806890820847197, + "learning_rate": 8.949834075574595e-06, + "loss": 0.4768, + "step": 1579 + }, + { + "epoch": 0.28948332722609016, + "grad_norm": 0.4072445168776299, + "learning_rate": 8.947872491923288e-06, + "loss": 0.487, + "step": 1580 + }, + { + "epoch": 0.28966654452180285, + "grad_norm": 0.4293336896066131, + "learning_rate": 8.94590929337013e-06, + "loss": 0.5008, + "step": 1581 + }, + { + "epoch": 0.2898497618175156, + "grad_norm": 0.41128872476228806, + "learning_rate": 8.943944480718184e-06, + "loss": 0.51, + "step": 1582 + }, + { + "epoch": 0.2900329791132283, + "grad_norm": 0.36929320177047475, + "learning_rate": 8.941978054771165e-06, + "loss": 0.5116, + "step": 1583 + }, + { + "epoch": 0.29021619640894103, + "grad_norm": 0.3533069262895081, + "learning_rate": 8.940010016333453e-06, + "loss": 0.462, + "step": 1584 + }, + { + "epoch": 0.2903994137046537, + "grad_norm": 0.4290477614884879, + "learning_rate": 8.938040366210088e-06, + "loss": 0.4886, + "step": 1585 + }, + { + "epoch": 0.2905826310003664, + "grad_norm": 0.4049126999073717, + "learning_rate": 8.936069105206767e-06, + "loss": 0.488, + "step": 1586 + }, + { + "epoch": 0.29076584829607915, + "grad_norm": 0.37625165658845383, + "learning_rate": 8.934096234129843e-06, + "loss": 0.4704, + "step": 1587 + }, + { + "epoch": 0.29094906559179184, + "grad_norm": 0.4368199590248626, + "learning_rate": 8.932121753786339e-06, + "loss": 0.5089, + "step": 1588 + }, + { + "epoch": 0.2911322828875046, + "grad_norm": 0.4293928339113946, + "learning_rate": 8.930145664983921e-06, + "loss": 0.4989, + "step": 1589 + }, + { + "epoch": 0.2913155001832173, + "grad_norm": 0.362276762434061, + "learning_rate": 8.928167968530921e-06, + "loss": 0.4848, + "step": 1590 + }, + { + "epoch": 0.29149871747893, + "grad_norm": 0.4545018284719774, + "learning_rate": 8.926188665236334e-06, + "loss": 0.5297, + "step": 1591 + }, + { + "epoch": 0.2916819347746427, + "grad_norm": 0.4271204305866213, + "learning_rate": 8.924207755909801e-06, + "loss": 0.4635, + "step": 1592 + }, + { + "epoch": 0.29186515207035546, + "grad_norm": 0.3895989149171432, + "learning_rate": 8.922225241361629e-06, + "loss": 0.4802, + "step": 1593 + }, + { + "epoch": 0.29204836936606815, + "grad_norm": 0.36495793219351547, + "learning_rate": 8.920241122402773e-06, + "loss": 0.4963, + "step": 1594 + }, + { + "epoch": 0.2922315866617809, + "grad_norm": 0.38869432485787914, + "learning_rate": 8.918255399844855e-06, + "loss": 0.4996, + "step": 1595 + }, + { + "epoch": 0.2924148039574936, + "grad_norm": 0.4601922887324616, + "learning_rate": 8.916268074500141e-06, + "loss": 0.5025, + "step": 1596 + }, + { + "epoch": 0.2925980212532063, + "grad_norm": 0.39205663001486896, + "learning_rate": 8.914279147181564e-06, + "loss": 0.4603, + "step": 1597 + }, + { + "epoch": 0.292781238548919, + "grad_norm": 0.4013458043993565, + "learning_rate": 8.912288618702706e-06, + "loss": 0.487, + "step": 1598 + }, + { + "epoch": 0.2929644558446317, + "grad_norm": 0.38939321801864035, + "learning_rate": 8.910296489877803e-06, + "loss": 0.5029, + "step": 1599 + }, + { + "epoch": 0.29314767314034446, + "grad_norm": 0.35395541109470974, + "learning_rate": 8.90830276152175e-06, + "loss": 0.4551, + "step": 1600 + }, + { + "epoch": 0.29333089043605715, + "grad_norm": 0.41856036352490694, + "learning_rate": 8.906307434450092e-06, + "loss": 0.4949, + "step": 1601 + }, + { + "epoch": 0.2935141077317699, + "grad_norm": 0.5275243050289077, + "learning_rate": 8.904310509479031e-06, + "loss": 0.4961, + "step": 1602 + }, + { + "epoch": 0.2936973250274826, + "grad_norm": 0.4742284206723536, + "learning_rate": 8.902311987425422e-06, + "loss": 0.5095, + "step": 1603 + }, + { + "epoch": 0.2938805423231953, + "grad_norm": 0.3390894324421048, + "learning_rate": 8.900311869106772e-06, + "loss": 0.4675, + "step": 1604 + }, + { + "epoch": 0.294063759618908, + "grad_norm": 0.39896932989796424, + "learning_rate": 8.898310155341245e-06, + "loss": 0.4819, + "step": 1605 + }, + { + "epoch": 0.29424697691462076, + "grad_norm": 0.5410338815825568, + "learning_rate": 8.896306846947649e-06, + "loss": 0.5032, + "step": 1606 + }, + { + "epoch": 0.29443019421033345, + "grad_norm": 0.4070700524177446, + "learning_rate": 8.894301944745453e-06, + "loss": 0.48, + "step": 1607 + }, + { + "epoch": 0.2946134115060462, + "grad_norm": 0.42862296543081163, + "learning_rate": 8.892295449554777e-06, + "loss": 0.4605, + "step": 1608 + }, + { + "epoch": 0.2947966288017589, + "grad_norm": 0.4287346890153464, + "learning_rate": 8.890287362196385e-06, + "loss": 0.485, + "step": 1609 + }, + { + "epoch": 0.2949798460974716, + "grad_norm": 0.43250442423486146, + "learning_rate": 8.888277683491701e-06, + "loss": 0.4854, + "step": 1610 + }, + { + "epoch": 0.2951630633931843, + "grad_norm": 0.49215086511267214, + "learning_rate": 8.886266414262797e-06, + "loss": 0.489, + "step": 1611 + }, + { + "epoch": 0.295346280688897, + "grad_norm": 0.4186341465088225, + "learning_rate": 8.884253555332393e-06, + "loss": 0.5078, + "step": 1612 + }, + { + "epoch": 0.29552949798460976, + "grad_norm": 0.4406315027714821, + "learning_rate": 8.882239107523863e-06, + "loss": 0.4937, + "step": 1613 + }, + { + "epoch": 0.29571271528032245, + "grad_norm": 0.3726428857037365, + "learning_rate": 8.88022307166123e-06, + "loss": 0.453, + "step": 1614 + }, + { + "epoch": 0.2958959325760352, + "grad_norm": 0.42821871054675215, + "learning_rate": 8.878205448569163e-06, + "loss": 0.4934, + "step": 1615 + }, + { + "epoch": 0.2960791498717479, + "grad_norm": 0.3670484899531654, + "learning_rate": 8.876186239072988e-06, + "loss": 0.486, + "step": 1616 + }, + { + "epoch": 0.2962623671674606, + "grad_norm": 0.34461787552049733, + "learning_rate": 8.874165443998672e-06, + "loss": 0.4761, + "step": 1617 + }, + { + "epoch": 0.2964455844631733, + "grad_norm": 0.37867132917069235, + "learning_rate": 8.872143064172834e-06, + "loss": 0.4821, + "step": 1618 + }, + { + "epoch": 0.29662880175888606, + "grad_norm": 0.3712153633078572, + "learning_rate": 8.870119100422743e-06, + "loss": 0.4931, + "step": 1619 + }, + { + "epoch": 0.29681201905459875, + "grad_norm": 0.36684611413431756, + "learning_rate": 8.86809355357631e-06, + "loss": 0.4616, + "step": 1620 + }, + { + "epoch": 0.29699523635031144, + "grad_norm": 0.38108459496354996, + "learning_rate": 8.866066424462103e-06, + "loss": 0.5118, + "step": 1621 + }, + { + "epoch": 0.2971784536460242, + "grad_norm": 0.382915281692164, + "learning_rate": 8.864037713909329e-06, + "loss": 0.4564, + "step": 1622 + }, + { + "epoch": 0.2973616709417369, + "grad_norm": 0.38365339546998406, + "learning_rate": 8.862007422747842e-06, + "loss": 0.516, + "step": 1623 + }, + { + "epoch": 0.2975448882374496, + "grad_norm": 0.5453723638931661, + "learning_rate": 8.85997555180815e-06, + "loss": 0.4722, + "step": 1624 + }, + { + "epoch": 0.2977281055331623, + "grad_norm": 0.38449352037816636, + "learning_rate": 8.8579421019214e-06, + "loss": 0.4583, + "step": 1625 + }, + { + "epoch": 0.29791132282887506, + "grad_norm": 0.4091259949883011, + "learning_rate": 8.855907073919388e-06, + "loss": 0.5102, + "step": 1626 + }, + { + "epoch": 0.29809454012458775, + "grad_norm": 0.46579655657632557, + "learning_rate": 8.853870468634554e-06, + "loss": 0.504, + "step": 1627 + }, + { + "epoch": 0.2982777574203005, + "grad_norm": 0.34914615282378225, + "learning_rate": 8.851832286899984e-06, + "loss": 0.4809, + "step": 1628 + }, + { + "epoch": 0.2984609747160132, + "grad_norm": 0.3465447713300964, + "learning_rate": 8.84979252954941e-06, + "loss": 0.4714, + "step": 1629 + }, + { + "epoch": 0.29864419201172593, + "grad_norm": 0.36719763001696804, + "learning_rate": 8.847751197417208e-06, + "loss": 0.4856, + "step": 1630 + }, + { + "epoch": 0.2988274093074386, + "grad_norm": 0.3954216747573634, + "learning_rate": 8.845708291338396e-06, + "loss": 0.4802, + "step": 1631 + }, + { + "epoch": 0.29901062660315136, + "grad_norm": 0.40271380964660825, + "learning_rate": 8.84366381214864e-06, + "loss": 0.4788, + "step": 1632 + }, + { + "epoch": 0.29919384389886405, + "grad_norm": 0.3751708327909397, + "learning_rate": 8.841617760684242e-06, + "loss": 0.4834, + "step": 1633 + }, + { + "epoch": 0.29937706119457674, + "grad_norm": 0.36638025333362195, + "learning_rate": 8.839570137782157e-06, + "loss": 0.4451, + "step": 1634 + }, + { + "epoch": 0.2995602784902895, + "grad_norm": 0.34260231840043137, + "learning_rate": 8.837520944279976e-06, + "loss": 0.4816, + "step": 1635 + }, + { + "epoch": 0.2997434957860022, + "grad_norm": 0.41835359853675946, + "learning_rate": 8.835470181015935e-06, + "loss": 0.4657, + "step": 1636 + }, + { + "epoch": 0.2999267130817149, + "grad_norm": 0.3853522103777551, + "learning_rate": 8.833417848828912e-06, + "loss": 0.4798, + "step": 1637 + }, + { + "epoch": 0.3001099303774276, + "grad_norm": 0.4311087142185147, + "learning_rate": 8.831363948558422e-06, + "loss": 0.5055, + "step": 1638 + }, + { + "epoch": 0.30029314767314036, + "grad_norm": 0.4121988474004896, + "learning_rate": 8.829308481044631e-06, + "loss": 0.4946, + "step": 1639 + }, + { + "epoch": 0.30047636496885305, + "grad_norm": 0.39926874503164067, + "learning_rate": 8.82725144712834e-06, + "loss": 0.5105, + "step": 1640 + }, + { + "epoch": 0.3006595822645658, + "grad_norm": 0.42930617786556713, + "learning_rate": 8.82519284765099e-06, + "loss": 0.4742, + "step": 1641 + }, + { + "epoch": 0.3008427995602785, + "grad_norm": 0.4756153230731036, + "learning_rate": 8.823132683454662e-06, + "loss": 0.4858, + "step": 1642 + }, + { + "epoch": 0.30102601685599123, + "grad_norm": 0.4105739310138263, + "learning_rate": 8.821070955382082e-06, + "loss": 0.4748, + "step": 1643 + }, + { + "epoch": 0.3012092341517039, + "grad_norm": 0.42461593671764813, + "learning_rate": 8.819007664276614e-06, + "loss": 0.4747, + "step": 1644 + }, + { + "epoch": 0.3013924514474166, + "grad_norm": 0.42619757195470465, + "learning_rate": 8.816942810982258e-06, + "loss": 0.4783, + "step": 1645 + }, + { + "epoch": 0.30157566874312935, + "grad_norm": 0.441815860292698, + "learning_rate": 8.814876396343655e-06, + "loss": 0.4822, + "step": 1646 + }, + { + "epoch": 0.30175888603884204, + "grad_norm": 0.39546125127248405, + "learning_rate": 8.812808421206083e-06, + "loss": 0.4751, + "step": 1647 + }, + { + "epoch": 0.3019421033345548, + "grad_norm": 0.4039788211552708, + "learning_rate": 8.810738886415464e-06, + "loss": 0.4774, + "step": 1648 + }, + { + "epoch": 0.3021253206302675, + "grad_norm": 0.4195550927374702, + "learning_rate": 8.80866779281835e-06, + "loss": 0.5184, + "step": 1649 + }, + { + "epoch": 0.3023085379259802, + "grad_norm": 0.4410446432137119, + "learning_rate": 8.80659514126194e-06, + "loss": 0.52, + "step": 1650 + }, + { + "epoch": 0.3024917552216929, + "grad_norm": 0.3910752829108485, + "learning_rate": 8.804520932594061e-06, + "loss": 0.4734, + "step": 1651 + }, + { + "epoch": 0.30267497251740566, + "grad_norm": 0.4084027351750214, + "learning_rate": 8.802445167663181e-06, + "loss": 0.4906, + "step": 1652 + }, + { + "epoch": 0.30285818981311835, + "grad_norm": 0.3779109317009322, + "learning_rate": 8.800367847318407e-06, + "loss": 0.4661, + "step": 1653 + }, + { + "epoch": 0.3030414071088311, + "grad_norm": 0.4434368130901037, + "learning_rate": 8.798288972409477e-06, + "loss": 0.4956, + "step": 1654 + }, + { + "epoch": 0.3032246244045438, + "grad_norm": 0.40653821037943955, + "learning_rate": 8.79620854378677e-06, + "loss": 0.5006, + "step": 1655 + }, + { + "epoch": 0.30340784170025653, + "grad_norm": 0.40448780548308255, + "learning_rate": 8.794126562301298e-06, + "loss": 0.5131, + "step": 1656 + }, + { + "epoch": 0.3035910589959692, + "grad_norm": 0.4185848391026118, + "learning_rate": 8.792043028804706e-06, + "loss": 0.4937, + "step": 1657 + }, + { + "epoch": 0.3037742762916819, + "grad_norm": 0.43416404958296123, + "learning_rate": 8.78995794414928e-06, + "loss": 0.4927, + "step": 1658 + }, + { + "epoch": 0.30395749358739466, + "grad_norm": 0.3941144660162759, + "learning_rate": 8.787871309187936e-06, + "loss": 0.4719, + "step": 1659 + }, + { + "epoch": 0.30414071088310735, + "grad_norm": 0.37126342352995245, + "learning_rate": 8.785783124774223e-06, + "loss": 0.4722, + "step": 1660 + }, + { + "epoch": 0.3043239281788201, + "grad_norm": 0.4103240007205328, + "learning_rate": 8.783693391762328e-06, + "loss": 0.4809, + "step": 1661 + }, + { + "epoch": 0.3045071454745328, + "grad_norm": 0.4249044003852677, + "learning_rate": 8.781602111007066e-06, + "loss": 0.4626, + "step": 1662 + }, + { + "epoch": 0.3046903627702455, + "grad_norm": 0.41638775694944496, + "learning_rate": 8.779509283363894e-06, + "loss": 0.4929, + "step": 1663 + }, + { + "epoch": 0.3048735800659582, + "grad_norm": 0.3861009808325714, + "learning_rate": 8.77741490968889e-06, + "loss": 0.4706, + "step": 1664 + }, + { + "epoch": 0.30505679736167096, + "grad_norm": 0.4284459852901781, + "learning_rate": 8.775318990838775e-06, + "loss": 0.542, + "step": 1665 + }, + { + "epoch": 0.30524001465738365, + "grad_norm": 0.39067748379098055, + "learning_rate": 8.773221527670896e-06, + "loss": 0.4921, + "step": 1666 + }, + { + "epoch": 0.3054232319530964, + "grad_norm": 0.4636902711928648, + "learning_rate": 8.771122521043236e-06, + "loss": 0.4847, + "step": 1667 + }, + { + "epoch": 0.3056064492488091, + "grad_norm": 0.4041218634315, + "learning_rate": 8.769021971814401e-06, + "loss": 0.4865, + "step": 1668 + }, + { + "epoch": 0.3057896665445218, + "grad_norm": 0.39459279955438104, + "learning_rate": 8.766919880843639e-06, + "loss": 0.519, + "step": 1669 + }, + { + "epoch": 0.3059728838402345, + "grad_norm": 0.40128357110983964, + "learning_rate": 8.764816248990822e-06, + "loss": 0.4841, + "step": 1670 + }, + { + "epoch": 0.3061561011359472, + "grad_norm": 0.36997768478257936, + "learning_rate": 8.762711077116453e-06, + "loss": 0.4771, + "step": 1671 + }, + { + "epoch": 0.30633931843165996, + "grad_norm": 0.3480828562287117, + "learning_rate": 8.760604366081665e-06, + "loss": 0.4876, + "step": 1672 + }, + { + "epoch": 0.30652253572737265, + "grad_norm": 0.417116733445613, + "learning_rate": 8.758496116748221e-06, + "loss": 0.4763, + "step": 1673 + }, + { + "epoch": 0.3067057530230854, + "grad_norm": 0.38990995104925313, + "learning_rate": 8.756386329978518e-06, + "loss": 0.4952, + "step": 1674 + }, + { + "epoch": 0.3068889703187981, + "grad_norm": 0.4102790428532168, + "learning_rate": 8.754275006635573e-06, + "loss": 0.4938, + "step": 1675 + }, + { + "epoch": 0.3070721876145108, + "grad_norm": 0.44424837968994635, + "learning_rate": 8.752162147583036e-06, + "loss": 0.4862, + "step": 1676 + }, + { + "epoch": 0.3072554049102235, + "grad_norm": 0.36829717881414353, + "learning_rate": 8.750047753685188e-06, + "loss": 0.4714, + "step": 1677 + }, + { + "epoch": 0.30743862220593626, + "grad_norm": 0.39758875548969896, + "learning_rate": 8.747931825806933e-06, + "loss": 0.4832, + "step": 1678 + }, + { + "epoch": 0.30762183950164895, + "grad_norm": 0.41272232708939577, + "learning_rate": 8.745814364813806e-06, + "loss": 0.4872, + "step": 1679 + }, + { + "epoch": 0.3078050567973617, + "grad_norm": 0.3854030542482913, + "learning_rate": 8.743695371571963e-06, + "loss": 0.484, + "step": 1680 + }, + { + "epoch": 0.3079882740930744, + "grad_norm": 0.36746498837241803, + "learning_rate": 8.741574846948198e-06, + "loss": 0.4711, + "step": 1681 + }, + { + "epoch": 0.3081714913887871, + "grad_norm": 0.411804382886457, + "learning_rate": 8.73945279180992e-06, + "loss": 0.4759, + "step": 1682 + }, + { + "epoch": 0.3083547086844998, + "grad_norm": 0.4026474390517101, + "learning_rate": 8.737329207025172e-06, + "loss": 0.502, + "step": 1683 + }, + { + "epoch": 0.3085379259802125, + "grad_norm": 0.3639512942893561, + "learning_rate": 8.735204093462617e-06, + "loss": 0.4647, + "step": 1684 + }, + { + "epoch": 0.30872114327592526, + "grad_norm": 0.4601114282657689, + "learning_rate": 8.733077451991546e-06, + "loss": 0.4794, + "step": 1685 + }, + { + "epoch": 0.30890436057163795, + "grad_norm": 0.4029114065064777, + "learning_rate": 8.730949283481877e-06, + "loss": 0.4581, + "step": 1686 + }, + { + "epoch": 0.3090875778673507, + "grad_norm": 0.47530481298922933, + "learning_rate": 8.72881958880415e-06, + "loss": 0.5358, + "step": 1687 + }, + { + "epoch": 0.3092707951630634, + "grad_norm": 0.46546993084658045, + "learning_rate": 8.72668836882953e-06, + "loss": 0.508, + "step": 1688 + }, + { + "epoch": 0.30945401245877613, + "grad_norm": 0.44880821238197843, + "learning_rate": 8.724555624429805e-06, + "loss": 0.494, + "step": 1689 + }, + { + "epoch": 0.3096372297544888, + "grad_norm": 0.39081723015666026, + "learning_rate": 8.722421356477389e-06, + "loss": 0.4845, + "step": 1690 + }, + { + "epoch": 0.30982044705020156, + "grad_norm": 0.3529090891828588, + "learning_rate": 8.720285565845313e-06, + "loss": 0.4672, + "step": 1691 + }, + { + "epoch": 0.31000366434591425, + "grad_norm": 0.39898106739463174, + "learning_rate": 8.718148253407242e-06, + "loss": 0.458, + "step": 1692 + }, + { + "epoch": 0.31018688164162694, + "grad_norm": 0.3651751506557362, + "learning_rate": 8.716009420037452e-06, + "loss": 0.4978, + "step": 1693 + }, + { + "epoch": 0.3103700989373397, + "grad_norm": 0.360403523409843, + "learning_rate": 8.713869066610847e-06, + "loss": 0.489, + "step": 1694 + }, + { + "epoch": 0.3105533162330524, + "grad_norm": 0.37968929495733417, + "learning_rate": 8.711727194002955e-06, + "loss": 0.493, + "step": 1695 + }, + { + "epoch": 0.3107365335287651, + "grad_norm": 0.38561595360319445, + "learning_rate": 8.709583803089919e-06, + "loss": 0.4864, + "step": 1696 + }, + { + "epoch": 0.3109197508244778, + "grad_norm": 0.3925407466954778, + "learning_rate": 8.707438894748508e-06, + "loss": 0.4592, + "step": 1697 + }, + { + "epoch": 0.31110296812019056, + "grad_norm": 0.38697234703046507, + "learning_rate": 8.70529246985611e-06, + "loss": 0.506, + "step": 1698 + }, + { + "epoch": 0.31128618541590325, + "grad_norm": 0.40545775127101236, + "learning_rate": 8.703144529290733e-06, + "loss": 0.498, + "step": 1699 + }, + { + "epoch": 0.311469402711616, + "grad_norm": 0.35555366552450657, + "learning_rate": 8.700995073931004e-06, + "loss": 0.4838, + "step": 1700 + }, + { + "epoch": 0.3116526200073287, + "grad_norm": 0.4022620468333227, + "learning_rate": 8.698844104656175e-06, + "loss": 0.4859, + "step": 1701 + }, + { + "epoch": 0.31183583730304143, + "grad_norm": 0.4370267955102546, + "learning_rate": 8.696691622346109e-06, + "loss": 0.4821, + "step": 1702 + }, + { + "epoch": 0.3120190545987541, + "grad_norm": 0.39148309791555397, + "learning_rate": 8.694537627881296e-06, + "loss": 0.4969, + "step": 1703 + }, + { + "epoch": 0.31220227189446687, + "grad_norm": 0.4149560429420508, + "learning_rate": 8.69238212214284e-06, + "loss": 0.4833, + "step": 1704 + }, + { + "epoch": 0.31238548919017955, + "grad_norm": 0.4012554167583925, + "learning_rate": 8.690225106012462e-06, + "loss": 0.4845, + "step": 1705 + }, + { + "epoch": 0.31256870648589224, + "grad_norm": 0.3880778144850858, + "learning_rate": 8.688066580372506e-06, + "loss": 0.4402, + "step": 1706 + }, + { + "epoch": 0.312751923781605, + "grad_norm": 0.3709906292066814, + "learning_rate": 8.685906546105925e-06, + "loss": 0.461, + "step": 1707 + }, + { + "epoch": 0.3129351410773177, + "grad_norm": 0.4038618996987991, + "learning_rate": 8.6837450040963e-06, + "loss": 0.4952, + "step": 1708 + }, + { + "epoch": 0.3131183583730304, + "grad_norm": 0.4498213583992615, + "learning_rate": 8.681581955227823e-06, + "loss": 0.463, + "step": 1709 + }, + { + "epoch": 0.3133015756687431, + "grad_norm": 0.6063342768243897, + "learning_rate": 8.6794174003853e-06, + "loss": 0.4692, + "step": 1710 + }, + { + "epoch": 0.31348479296445586, + "grad_norm": 0.393078000699868, + "learning_rate": 8.677251340454155e-06, + "loss": 0.477, + "step": 1711 + }, + { + "epoch": 0.31366801026016855, + "grad_norm": 0.5208615924956094, + "learning_rate": 8.67508377632043e-06, + "loss": 0.4686, + "step": 1712 + }, + { + "epoch": 0.3138512275558813, + "grad_norm": 0.4156650220868597, + "learning_rate": 8.672914708870782e-06, + "loss": 0.489, + "step": 1713 + }, + { + "epoch": 0.314034444851594, + "grad_norm": 0.39229916132917864, + "learning_rate": 8.67074413899248e-06, + "loss": 0.4854, + "step": 1714 + }, + { + "epoch": 0.31421766214730673, + "grad_norm": 0.3432761699669145, + "learning_rate": 8.668572067573409e-06, + "loss": 0.5026, + "step": 1715 + }, + { + "epoch": 0.3144008794430194, + "grad_norm": 0.4024782274517547, + "learning_rate": 8.666398495502068e-06, + "loss": 0.5145, + "step": 1716 + }, + { + "epoch": 0.3145840967387321, + "grad_norm": 0.4193753630351778, + "learning_rate": 8.664223423667571e-06, + "loss": 0.5072, + "step": 1717 + }, + { + "epoch": 0.31476731403444486, + "grad_norm": 0.44033356668027723, + "learning_rate": 8.662046852959644e-06, + "loss": 0.4962, + "step": 1718 + }, + { + "epoch": 0.31495053133015755, + "grad_norm": 0.4301457201335492, + "learning_rate": 8.65986878426863e-06, + "loss": 0.5034, + "step": 1719 + }, + { + "epoch": 0.3151337486258703, + "grad_norm": 0.42488867042789086, + "learning_rate": 8.657689218485476e-06, + "loss": 0.5188, + "step": 1720 + }, + { + "epoch": 0.315316965921583, + "grad_norm": 0.41341522328402974, + "learning_rate": 8.65550815650175e-06, + "loss": 0.5216, + "step": 1721 + }, + { + "epoch": 0.3155001832172957, + "grad_norm": 0.3771662909597496, + "learning_rate": 8.65332559920963e-06, + "loss": 0.5106, + "step": 1722 + }, + { + "epoch": 0.3156834005130084, + "grad_norm": 0.39212468702748254, + "learning_rate": 8.651141547501904e-06, + "loss": 0.4766, + "step": 1723 + }, + { + "epoch": 0.31586661780872116, + "grad_norm": 0.36654865806495635, + "learning_rate": 8.64895600227197e-06, + "loss": 0.4802, + "step": 1724 + }, + { + "epoch": 0.31604983510443385, + "grad_norm": 0.4065034981518445, + "learning_rate": 8.64676896441384e-06, + "loss": 0.5288, + "step": 1725 + }, + { + "epoch": 0.3162330524001466, + "grad_norm": 0.3854499856808352, + "learning_rate": 8.644580434822136e-06, + "loss": 0.4699, + "step": 1726 + }, + { + "epoch": 0.3164162696958593, + "grad_norm": 0.4057249078954745, + "learning_rate": 8.64239041439209e-06, + "loss": 0.4759, + "step": 1727 + }, + { + "epoch": 0.316599486991572, + "grad_norm": 0.44500981118266, + "learning_rate": 8.640198904019544e-06, + "loss": 0.4822, + "step": 1728 + }, + { + "epoch": 0.3167827042872847, + "grad_norm": 0.3896776477436031, + "learning_rate": 8.638005904600948e-06, + "loss": 0.49, + "step": 1729 + }, + { + "epoch": 0.3169659215829974, + "grad_norm": 0.4432593842454478, + "learning_rate": 8.635811417033361e-06, + "loss": 0.4952, + "step": 1730 + }, + { + "epoch": 0.31714913887871016, + "grad_norm": 0.38053644866499986, + "learning_rate": 8.633615442214452e-06, + "loss": 0.4579, + "step": 1731 + }, + { + "epoch": 0.31733235617442285, + "grad_norm": 0.36140726326704303, + "learning_rate": 8.6314179810425e-06, + "loss": 0.469, + "step": 1732 + }, + { + "epoch": 0.3175155734701356, + "grad_norm": 0.387898862687194, + "learning_rate": 8.62921903441639e-06, + "loss": 0.4903, + "step": 1733 + }, + { + "epoch": 0.3176987907658483, + "grad_norm": 0.4221126995607179, + "learning_rate": 8.627018603235613e-06, + "loss": 0.4959, + "step": 1734 + }, + { + "epoch": 0.317882008061561, + "grad_norm": 0.46009818640085887, + "learning_rate": 8.624816688400271e-06, + "loss": 0.4563, + "step": 1735 + }, + { + "epoch": 0.3180652253572737, + "grad_norm": 0.4685918929067162, + "learning_rate": 8.62261329081107e-06, + "loss": 0.4714, + "step": 1736 + }, + { + "epoch": 0.31824844265298646, + "grad_norm": 0.4412482439531488, + "learning_rate": 8.620408411369323e-06, + "loss": 0.5007, + "step": 1737 + }, + { + "epoch": 0.31843165994869915, + "grad_norm": 0.4547503190879881, + "learning_rate": 8.61820205097695e-06, + "loss": 0.4581, + "step": 1738 + }, + { + "epoch": 0.3186148772444119, + "grad_norm": 0.43395651034382204, + "learning_rate": 8.615994210536479e-06, + "loss": 0.49, + "step": 1739 + }, + { + "epoch": 0.3187980945401246, + "grad_norm": 0.4428071176266843, + "learning_rate": 8.613784890951036e-06, + "loss": 0.4922, + "step": 1740 + }, + { + "epoch": 0.3189813118358373, + "grad_norm": 0.40849265010723296, + "learning_rate": 8.61157409312436e-06, + "loss": 0.4743, + "step": 1741 + }, + { + "epoch": 0.31916452913155, + "grad_norm": 0.42591237999179565, + "learning_rate": 8.609361817960794e-06, + "loss": 0.505, + "step": 1742 + }, + { + "epoch": 0.3193477464272627, + "grad_norm": 0.46495694756695405, + "learning_rate": 8.607148066365278e-06, + "loss": 0.5111, + "step": 1743 + }, + { + "epoch": 0.31953096372297546, + "grad_norm": 0.3979181415316983, + "learning_rate": 8.604932839243363e-06, + "loss": 0.4911, + "step": 1744 + }, + { + "epoch": 0.31971418101868815, + "grad_norm": 0.47254311835047524, + "learning_rate": 8.602716137501201e-06, + "loss": 0.5059, + "step": 1745 + }, + { + "epoch": 0.3198973983144009, + "grad_norm": 0.4110415786461455, + "learning_rate": 8.600497962045551e-06, + "loss": 0.4931, + "step": 1746 + }, + { + "epoch": 0.3200806156101136, + "grad_norm": 0.448385298002212, + "learning_rate": 8.598278313783765e-06, + "loss": 0.4816, + "step": 1747 + }, + { + "epoch": 0.32026383290582633, + "grad_norm": 0.564876624397995, + "learning_rate": 8.59605719362381e-06, + "loss": 0.4949, + "step": 1748 + }, + { + "epoch": 0.320447050201539, + "grad_norm": 0.35471940765133153, + "learning_rate": 8.593834602474248e-06, + "loss": 0.4794, + "step": 1749 + }, + { + "epoch": 0.32063026749725176, + "grad_norm": 0.4192300945251544, + "learning_rate": 8.59161054124424e-06, + "loss": 0.4519, + "step": 1750 + }, + { + "epoch": 0.32081348479296445, + "grad_norm": 0.5038523389603609, + "learning_rate": 8.589385010843557e-06, + "loss": 0.4729, + "step": 1751 + }, + { + "epoch": 0.32099670208867714, + "grad_norm": 0.7356309775332599, + "learning_rate": 8.587158012182561e-06, + "loss": 0.5082, + "step": 1752 + }, + { + "epoch": 0.3211799193843899, + "grad_norm": 0.3818652835002793, + "learning_rate": 8.584929546172224e-06, + "loss": 0.4756, + "step": 1753 + }, + { + "epoch": 0.3213631366801026, + "grad_norm": 0.4435070016683062, + "learning_rate": 8.582699613724111e-06, + "loss": 0.5102, + "step": 1754 + }, + { + "epoch": 0.3215463539758153, + "grad_norm": 0.5377483293815659, + "learning_rate": 8.580468215750392e-06, + "loss": 0.4844, + "step": 1755 + }, + { + "epoch": 0.321729571271528, + "grad_norm": 0.4451665769582748, + "learning_rate": 8.578235353163832e-06, + "loss": 0.4617, + "step": 1756 + }, + { + "epoch": 0.32191278856724076, + "grad_norm": 0.4135616911329195, + "learning_rate": 8.576001026877802e-06, + "loss": 0.4919, + "step": 1757 + }, + { + "epoch": 0.32209600586295345, + "grad_norm": 0.4319033591486567, + "learning_rate": 8.573765237806262e-06, + "loss": 0.4699, + "step": 1758 + }, + { + "epoch": 0.3222792231586662, + "grad_norm": 0.37755085037719216, + "learning_rate": 8.571527986863775e-06, + "loss": 0.4816, + "step": 1759 + }, + { + "epoch": 0.3224624404543789, + "grad_norm": 0.3515458496554082, + "learning_rate": 8.569289274965507e-06, + "loss": 0.4742, + "step": 1760 + }, + { + "epoch": 0.32264565775009163, + "grad_norm": 0.4448770787291807, + "learning_rate": 8.567049103027215e-06, + "loss": 0.498, + "step": 1761 + }, + { + "epoch": 0.3228288750458043, + "grad_norm": 0.3941180324286139, + "learning_rate": 8.564807471965253e-06, + "loss": 0.5405, + "step": 1762 + }, + { + "epoch": 0.32301209234151707, + "grad_norm": 0.3435676251293926, + "learning_rate": 8.562564382696578e-06, + "loss": 0.4743, + "step": 1763 + }, + { + "epoch": 0.32319530963722976, + "grad_norm": 0.37988171856202396, + "learning_rate": 8.560319836138737e-06, + "loss": 0.4674, + "step": 1764 + }, + { + "epoch": 0.32337852693294244, + "grad_norm": 0.3836135165007263, + "learning_rate": 8.558073833209878e-06, + "loss": 0.4519, + "step": 1765 + }, + { + "epoch": 0.3235617442286552, + "grad_norm": 0.39708836154368815, + "learning_rate": 8.555826374828737e-06, + "loss": 0.4669, + "step": 1766 + }, + { + "epoch": 0.3237449615243679, + "grad_norm": 0.3771156156458366, + "learning_rate": 8.553577461914658e-06, + "loss": 0.4518, + "step": 1767 + }, + { + "epoch": 0.3239281788200806, + "grad_norm": 0.3598271555441401, + "learning_rate": 8.55132709538757e-06, + "loss": 0.4729, + "step": 1768 + }, + { + "epoch": 0.3241113961157933, + "grad_norm": 0.3951522532510331, + "learning_rate": 8.549075276167999e-06, + "loss": 0.4848, + "step": 1769 + }, + { + "epoch": 0.32429461341150606, + "grad_norm": 0.4147560955415563, + "learning_rate": 8.546822005177065e-06, + "loss": 0.5051, + "step": 1770 + }, + { + "epoch": 0.32447783070721875, + "grad_norm": 0.37548763357253123, + "learning_rate": 8.544567283336484e-06, + "loss": 0.4887, + "step": 1771 + }, + { + "epoch": 0.3246610480029315, + "grad_norm": 0.3562817695493491, + "learning_rate": 8.542311111568564e-06, + "loss": 0.4627, + "step": 1772 + }, + { + "epoch": 0.3248442652986442, + "grad_norm": 0.3403427009030267, + "learning_rate": 8.540053490796204e-06, + "loss": 0.498, + "step": 1773 + }, + { + "epoch": 0.32502748259435693, + "grad_norm": 0.3618093246911228, + "learning_rate": 8.5377944219429e-06, + "loss": 0.4711, + "step": 1774 + }, + { + "epoch": 0.3252106998900696, + "grad_norm": 0.37156147331448447, + "learning_rate": 8.535533905932739e-06, + "loss": 0.4914, + "step": 1775 + }, + { + "epoch": 0.3253939171857823, + "grad_norm": 0.4104697698807329, + "learning_rate": 8.533271943690397e-06, + "loss": 0.4926, + "step": 1776 + }, + { + "epoch": 0.32557713448149506, + "grad_norm": 0.4221079282420862, + "learning_rate": 8.531008536141143e-06, + "loss": 0.494, + "step": 1777 + }, + { + "epoch": 0.32576035177720775, + "grad_norm": 0.44238661768749327, + "learning_rate": 8.528743684210842e-06, + "loss": 0.4679, + "step": 1778 + }, + { + "epoch": 0.3259435690729205, + "grad_norm": 0.37351954296984535, + "learning_rate": 8.52647738882594e-06, + "loss": 0.4743, + "step": 1779 + }, + { + "epoch": 0.3261267863686332, + "grad_norm": 0.39302981254445146, + "learning_rate": 8.524209650913487e-06, + "loss": 0.4817, + "step": 1780 + }, + { + "epoch": 0.3263100036643459, + "grad_norm": 0.3421628958137739, + "learning_rate": 8.521940471401106e-06, + "loss": 0.4663, + "step": 1781 + }, + { + "epoch": 0.3264932209600586, + "grad_norm": 0.5533622907062358, + "learning_rate": 8.519669851217028e-06, + "loss": 0.4918, + "step": 1782 + }, + { + "epoch": 0.32667643825577136, + "grad_norm": 0.3970061640019179, + "learning_rate": 8.517397791290059e-06, + "loss": 0.4914, + "step": 1783 + }, + { + "epoch": 0.32685965555148405, + "grad_norm": 0.435520424701832, + "learning_rate": 8.5151242925496e-06, + "loss": 0.5093, + "step": 1784 + }, + { + "epoch": 0.3270428728471968, + "grad_norm": 0.3465454680201401, + "learning_rate": 8.512849355925641e-06, + "loss": 0.4718, + "step": 1785 + }, + { + "epoch": 0.3272260901429095, + "grad_norm": 0.4166371179539154, + "learning_rate": 8.510572982348759e-06, + "loss": 0.508, + "step": 1786 + }, + { + "epoch": 0.32740930743862223, + "grad_norm": 0.4192663642287507, + "learning_rate": 8.508295172750116e-06, + "loss": 0.4802, + "step": 1787 + }, + { + "epoch": 0.3275925247343349, + "grad_norm": 0.3935083715874497, + "learning_rate": 8.506015928061468e-06, + "loss": 0.4748, + "step": 1788 + }, + { + "epoch": 0.3277757420300476, + "grad_norm": 0.4149102123371367, + "learning_rate": 8.50373524921515e-06, + "loss": 0.4927, + "step": 1789 + }, + { + "epoch": 0.32795895932576036, + "grad_norm": 0.3867478956169408, + "learning_rate": 8.501453137144093e-06, + "loss": 0.5062, + "step": 1790 + }, + { + "epoch": 0.32814217662147305, + "grad_norm": 0.4017651304087876, + "learning_rate": 8.499169592781807e-06, + "loss": 0.5199, + "step": 1791 + }, + { + "epoch": 0.3283253939171858, + "grad_norm": 0.365617732104251, + "learning_rate": 8.496884617062389e-06, + "loss": 0.4777, + "step": 1792 + }, + { + "epoch": 0.3285086112128985, + "grad_norm": 0.5935783428527378, + "learning_rate": 8.494598210920522e-06, + "loss": 0.5327, + "step": 1793 + }, + { + "epoch": 0.32869182850861123, + "grad_norm": 0.4461319964673768, + "learning_rate": 8.492310375291478e-06, + "loss": 0.4783, + "step": 1794 + }, + { + "epoch": 0.3288750458043239, + "grad_norm": 0.4548047555505239, + "learning_rate": 8.490021111111108e-06, + "loss": 0.4793, + "step": 1795 + }, + { + "epoch": 0.32905826310003666, + "grad_norm": 0.3837539231745361, + "learning_rate": 8.48773041931585e-06, + "loss": 0.4656, + "step": 1796 + }, + { + "epoch": 0.32924148039574935, + "grad_norm": 0.38081008240354797, + "learning_rate": 8.485438300842725e-06, + "loss": 0.4538, + "step": 1797 + }, + { + "epoch": 0.3294246976914621, + "grad_norm": 0.381006774669353, + "learning_rate": 8.483144756629342e-06, + "loss": 0.4678, + "step": 1798 + }, + { + "epoch": 0.3296079149871748, + "grad_norm": 4.393546461902215, + "learning_rate": 8.480849787613883e-06, + "loss": 0.48, + "step": 1799 + }, + { + "epoch": 0.3297911322828875, + "grad_norm": 0.4216378380286295, + "learning_rate": 8.478553394735126e-06, + "loss": 0.4664, + "step": 1800 + }, + { + "epoch": 0.3299743495786002, + "grad_norm": 0.40689364633375075, + "learning_rate": 8.47625557893242e-06, + "loss": 0.4687, + "step": 1801 + }, + { + "epoch": 0.3301575668743129, + "grad_norm": 0.4232729156542768, + "learning_rate": 8.473956341145706e-06, + "loss": 0.4684, + "step": 1802 + }, + { + "epoch": 0.33034078417002566, + "grad_norm": 0.4289741259876096, + "learning_rate": 8.471655682315496e-06, + "loss": 0.4716, + "step": 1803 + }, + { + "epoch": 0.33052400146573835, + "grad_norm": 0.5275080199063381, + "learning_rate": 8.469353603382892e-06, + "loss": 0.4757, + "step": 1804 + }, + { + "epoch": 0.3307072187614511, + "grad_norm": 0.43974415262075855, + "learning_rate": 8.467050105289572e-06, + "loss": 0.4865, + "step": 1805 + }, + { + "epoch": 0.3308904360571638, + "grad_norm": 0.4814298309864084, + "learning_rate": 8.4647451889778e-06, + "loss": 0.4797, + "step": 1806 + }, + { + "epoch": 0.33107365335287653, + "grad_norm": 0.42391918940288414, + "learning_rate": 8.462438855390409e-06, + "loss": 0.4823, + "step": 1807 + }, + { + "epoch": 0.3312568706485892, + "grad_norm": 0.3665799179835301, + "learning_rate": 8.460131105470829e-06, + "loss": 0.4649, + "step": 1808 + }, + { + "epoch": 0.33144008794430196, + "grad_norm": 0.4035555626403075, + "learning_rate": 8.45782194016305e-06, + "loss": 0.4758, + "step": 1809 + }, + { + "epoch": 0.33162330524001465, + "grad_norm": 0.4425161709111113, + "learning_rate": 8.455511360411657e-06, + "loss": 0.4961, + "step": 1810 + }, + { + "epoch": 0.3318065225357274, + "grad_norm": 0.4297733230595355, + "learning_rate": 8.453199367161804e-06, + "loss": 0.4801, + "step": 1811 + }, + { + "epoch": 0.3319897398314401, + "grad_norm": 0.407563982418168, + "learning_rate": 8.450885961359227e-06, + "loss": 0.5109, + "step": 1812 + }, + { + "epoch": 0.3321729571271528, + "grad_norm": 0.40167352725847427, + "learning_rate": 8.448571143950239e-06, + "loss": 0.5056, + "step": 1813 + }, + { + "epoch": 0.3323561744228655, + "grad_norm": 0.41581106857716255, + "learning_rate": 8.446254915881733e-06, + "loss": 0.4432, + "step": 1814 + }, + { + "epoch": 0.3325393917185782, + "grad_norm": 0.42034315457885957, + "learning_rate": 8.443937278101171e-06, + "loss": 0.5129, + "step": 1815 + }, + { + "epoch": 0.33272260901429096, + "grad_norm": 0.49697266318929034, + "learning_rate": 8.441618231556604e-06, + "loss": 0.4914, + "step": 1816 + }, + { + "epoch": 0.33290582631000365, + "grad_norm": 0.45473540523372535, + "learning_rate": 8.439297777196645e-06, + "loss": 0.5046, + "step": 1817 + }, + { + "epoch": 0.3330890436057164, + "grad_norm": 0.38498739235497825, + "learning_rate": 8.436975915970496e-06, + "loss": 0.5125, + "step": 1818 + }, + { + "epoch": 0.3332722609014291, + "grad_norm": 0.3856310516540707, + "learning_rate": 8.434652648827925e-06, + "loss": 0.4877, + "step": 1819 + }, + { + "epoch": 0.33345547819714183, + "grad_norm": 0.3545408916699545, + "learning_rate": 8.432327976719281e-06, + "loss": 0.4819, + "step": 1820 + }, + { + "epoch": 0.3336386954928545, + "grad_norm": 0.3816671046542913, + "learning_rate": 8.430001900595487e-06, + "loss": 0.5074, + "step": 1821 + }, + { + "epoch": 0.33382191278856727, + "grad_norm": 0.4236478252674396, + "learning_rate": 8.427674421408037e-06, + "loss": 0.4746, + "step": 1822 + }, + { + "epoch": 0.33400513008427996, + "grad_norm": 0.4093466909489272, + "learning_rate": 8.425345540109e-06, + "loss": 0.4727, + "step": 1823 + }, + { + "epoch": 0.33418834737999265, + "grad_norm": 0.37909373859606427, + "learning_rate": 8.423015257651022e-06, + "loss": 0.482, + "step": 1824 + }, + { + "epoch": 0.3343715646757054, + "grad_norm": 0.3801638934135252, + "learning_rate": 8.420683574987319e-06, + "loss": 0.4763, + "step": 1825 + }, + { + "epoch": 0.3345547819714181, + "grad_norm": 1.0336015888999988, + "learning_rate": 8.418350493071677e-06, + "loss": 0.4797, + "step": 1826 + }, + { + "epoch": 0.3347379992671308, + "grad_norm": 0.35984342130185926, + "learning_rate": 8.41601601285846e-06, + "loss": 0.4868, + "step": 1827 + }, + { + "epoch": 0.3349212165628435, + "grad_norm": 0.3550613764764406, + "learning_rate": 8.413680135302604e-06, + "loss": 0.4706, + "step": 1828 + }, + { + "epoch": 0.33510443385855626, + "grad_norm": 0.3623869235380865, + "learning_rate": 8.411342861359612e-06, + "loss": 0.4602, + "step": 1829 + }, + { + "epoch": 0.33528765115426895, + "grad_norm": 0.4364870755091544, + "learning_rate": 8.40900419198556e-06, + "loss": 0.4832, + "step": 1830 + }, + { + "epoch": 0.3354708684499817, + "grad_norm": 0.4636618121853265, + "learning_rate": 8.406664128137095e-06, + "loss": 0.453, + "step": 1831 + }, + { + "epoch": 0.3356540857456944, + "grad_norm": 0.36221754354794844, + "learning_rate": 8.404322670771436e-06, + "loss": 0.4849, + "step": 1832 + }, + { + "epoch": 0.33583730304140713, + "grad_norm": 0.39602504845526065, + "learning_rate": 8.40197982084637e-06, + "loss": 0.4755, + "step": 1833 + }, + { + "epoch": 0.3360205203371198, + "grad_norm": 0.4091319003361619, + "learning_rate": 8.399635579320259e-06, + "loss": 0.5015, + "step": 1834 + }, + { + "epoch": 0.33620373763283257, + "grad_norm": 0.3415400300998229, + "learning_rate": 8.397289947152021e-06, + "loss": 0.4645, + "step": 1835 + }, + { + "epoch": 0.33638695492854526, + "grad_norm": 0.4295090419145507, + "learning_rate": 8.394942925301155e-06, + "loss": 0.4385, + "step": 1836 + }, + { + "epoch": 0.33657017222425795, + "grad_norm": 0.3919892057064269, + "learning_rate": 8.392594514727728e-06, + "loss": 0.5009, + "step": 1837 + }, + { + "epoch": 0.3367533895199707, + "grad_norm": 0.4276728302114804, + "learning_rate": 8.390244716392369e-06, + "loss": 0.5113, + "step": 1838 + }, + { + "epoch": 0.3369366068156834, + "grad_norm": 0.4027722155599858, + "learning_rate": 8.387893531256278e-06, + "loss": 0.4825, + "step": 1839 + }, + { + "epoch": 0.3371198241113961, + "grad_norm": 0.34856854492446304, + "learning_rate": 8.385540960281223e-06, + "loss": 0.4614, + "step": 1840 + }, + { + "epoch": 0.3373030414071088, + "grad_norm": 0.33714083093594926, + "learning_rate": 8.383187004429536e-06, + "loss": 0.4723, + "step": 1841 + }, + { + "epoch": 0.33748625870282156, + "grad_norm": 0.5015806620678309, + "learning_rate": 8.38083166466412e-06, + "loss": 0.4804, + "step": 1842 + }, + { + "epoch": 0.33766947599853425, + "grad_norm": 0.40706034865044627, + "learning_rate": 8.378474941948437e-06, + "loss": 0.4814, + "step": 1843 + }, + { + "epoch": 0.337852693294247, + "grad_norm": 0.42150894844205045, + "learning_rate": 8.376116837246525e-06, + "loss": 0.4959, + "step": 1844 + }, + { + "epoch": 0.3380359105899597, + "grad_norm": 0.37939627732061587, + "learning_rate": 8.373757351522976e-06, + "loss": 0.4737, + "step": 1845 + }, + { + "epoch": 0.33821912788567243, + "grad_norm": 0.37334004265146, + "learning_rate": 8.371396485742956e-06, + "loss": 0.4583, + "step": 1846 + }, + { + "epoch": 0.3384023451813851, + "grad_norm": 0.5502578818261286, + "learning_rate": 8.36903424087219e-06, + "loss": 0.5031, + "step": 1847 + }, + { + "epoch": 0.3385855624770978, + "grad_norm": 0.4305389526404514, + "learning_rate": 8.366670617876969e-06, + "loss": 0.5081, + "step": 1848 + }, + { + "epoch": 0.33876877977281056, + "grad_norm": 0.373003871725479, + "learning_rate": 8.36430561772415e-06, + "loss": 0.4674, + "step": 1849 + }, + { + "epoch": 0.33895199706852325, + "grad_norm": 0.36910312348131236, + "learning_rate": 8.361939241381148e-06, + "loss": 0.4824, + "step": 1850 + }, + { + "epoch": 0.339135214364236, + "grad_norm": 0.38233912239924234, + "learning_rate": 8.359571489815946e-06, + "loss": 0.5155, + "step": 1851 + }, + { + "epoch": 0.3393184316599487, + "grad_norm": 0.42080088842006386, + "learning_rate": 8.357202363997085e-06, + "loss": 0.4932, + "step": 1852 + }, + { + "epoch": 0.33950164895566143, + "grad_norm": 0.40024242741960253, + "learning_rate": 8.354831864893675e-06, + "loss": 0.4889, + "step": 1853 + }, + { + "epoch": 0.3396848662513741, + "grad_norm": 0.3818263254638861, + "learning_rate": 8.352459993475379e-06, + "loss": 0.4489, + "step": 1854 + }, + { + "epoch": 0.33986808354708686, + "grad_norm": 0.45691270540100465, + "learning_rate": 8.35008675071243e-06, + "loss": 0.445, + "step": 1855 + }, + { + "epoch": 0.34005130084279955, + "grad_norm": 0.40956822564503537, + "learning_rate": 8.347712137575614e-06, + "loss": 0.4875, + "step": 1856 + }, + { + "epoch": 0.3402345181385123, + "grad_norm": 0.4223915335148929, + "learning_rate": 8.34533615503628e-06, + "loss": 0.5017, + "step": 1857 + }, + { + "epoch": 0.340417735434225, + "grad_norm": 0.4339594623834613, + "learning_rate": 8.342958804066345e-06, + "loss": 0.5349, + "step": 1858 + }, + { + "epoch": 0.34060095272993773, + "grad_norm": 0.38617828743973426, + "learning_rate": 8.340580085638275e-06, + "loss": 0.4858, + "step": 1859 + }, + { + "epoch": 0.3407841700256504, + "grad_norm": 0.39545092131685405, + "learning_rate": 8.3382000007251e-06, + "loss": 0.4844, + "step": 1860 + }, + { + "epoch": 0.3409673873213631, + "grad_norm": 0.4257990961795386, + "learning_rate": 8.33581855030041e-06, + "loss": 0.5055, + "step": 1861 + }, + { + "epoch": 0.34115060461707586, + "grad_norm": 0.3528695250715933, + "learning_rate": 8.333435735338346e-06, + "loss": 0.4875, + "step": 1862 + }, + { + "epoch": 0.34133382191278855, + "grad_norm": 0.37091916482456505, + "learning_rate": 8.331051556813623e-06, + "loss": 0.4984, + "step": 1863 + }, + { + "epoch": 0.3415170392085013, + "grad_norm": 0.4264847608510263, + "learning_rate": 8.328666015701496e-06, + "loss": 0.4867, + "step": 1864 + }, + { + "epoch": 0.341700256504214, + "grad_norm": 0.4099601844664771, + "learning_rate": 8.326279112977791e-06, + "loss": 0.4994, + "step": 1865 + }, + { + "epoch": 0.34188347379992673, + "grad_norm": 0.3677590723427411, + "learning_rate": 8.323890849618882e-06, + "loss": 0.5027, + "step": 1866 + }, + { + "epoch": 0.3420666910956394, + "grad_norm": 0.4409098617591673, + "learning_rate": 8.321501226601702e-06, + "loss": 0.496, + "step": 1867 + }, + { + "epoch": 0.34224990839135216, + "grad_norm": 0.3926664857139486, + "learning_rate": 8.319110244903748e-06, + "loss": 0.5018, + "step": 1868 + }, + { + "epoch": 0.34243312568706485, + "grad_norm": 0.3981457421214149, + "learning_rate": 8.316717905503058e-06, + "loss": 0.4803, + "step": 1869 + }, + { + "epoch": 0.3426163429827776, + "grad_norm": 0.4111803238988, + "learning_rate": 8.314324209378237e-06, + "loss": 0.4944, + "step": 1870 + }, + { + "epoch": 0.3427995602784903, + "grad_norm": 0.3837210327181565, + "learning_rate": 8.31192915750844e-06, + "loss": 0.4637, + "step": 1871 + }, + { + "epoch": 0.342982777574203, + "grad_norm": 0.36141808942109716, + "learning_rate": 8.309532750873381e-06, + "loss": 0.5022, + "step": 1872 + }, + { + "epoch": 0.3431659948699157, + "grad_norm": 0.4135486922923667, + "learning_rate": 8.30713499045332e-06, + "loss": 0.5429, + "step": 1873 + }, + { + "epoch": 0.3433492121656284, + "grad_norm": 0.4100838578777171, + "learning_rate": 8.304735877229082e-06, + "loss": 0.4814, + "step": 1874 + }, + { + "epoch": 0.34353242946134116, + "grad_norm": 0.4419756386117236, + "learning_rate": 8.302335412182034e-06, + "loss": 0.506, + "step": 1875 + }, + { + "epoch": 0.34371564675705385, + "grad_norm": 0.4023978864041945, + "learning_rate": 8.299933596294104e-06, + "loss": 0.4835, + "step": 1876 + }, + { + "epoch": 0.3438988640527666, + "grad_norm": 0.37938282086115893, + "learning_rate": 8.297530430547767e-06, + "loss": 0.4802, + "step": 1877 + }, + { + "epoch": 0.3440820813484793, + "grad_norm": 0.44808143362476516, + "learning_rate": 8.295125915926057e-06, + "loss": 0.4948, + "step": 1878 + }, + { + "epoch": 0.34426529864419203, + "grad_norm": 0.38559125969803254, + "learning_rate": 8.292720053412553e-06, + "loss": 0.4685, + "step": 1879 + }, + { + "epoch": 0.3444485159399047, + "grad_norm": 0.33369724182793525, + "learning_rate": 8.290312843991388e-06, + "loss": 0.4395, + "step": 1880 + }, + { + "epoch": 0.34463173323561747, + "grad_norm": 0.36560725000329003, + "learning_rate": 8.287904288647246e-06, + "loss": 0.4769, + "step": 1881 + }, + { + "epoch": 0.34481495053133016, + "grad_norm": 0.40109807099240397, + "learning_rate": 8.285494388365364e-06, + "loss": 0.4896, + "step": 1882 + }, + { + "epoch": 0.34499816782704285, + "grad_norm": 0.3758614423812384, + "learning_rate": 8.283083144131523e-06, + "loss": 0.4591, + "step": 1883 + }, + { + "epoch": 0.3451813851227556, + "grad_norm": 0.4045825714686613, + "learning_rate": 8.28067055693206e-06, + "loss": 0.4983, + "step": 1884 + }, + { + "epoch": 0.3453646024184683, + "grad_norm": 0.5653539162553293, + "learning_rate": 8.278256627753857e-06, + "loss": 0.4912, + "step": 1885 + }, + { + "epoch": 0.345547819714181, + "grad_norm": 0.4120278659351058, + "learning_rate": 8.27584135758435e-06, + "loss": 0.5053, + "step": 1886 + }, + { + "epoch": 0.3457310370098937, + "grad_norm": 0.4087683124654251, + "learning_rate": 8.273424747411519e-06, + "loss": 0.4768, + "step": 1887 + }, + { + "epoch": 0.34591425430560646, + "grad_norm": 0.3868920894610502, + "learning_rate": 8.27100679822389e-06, + "loss": 0.4872, + "step": 1888 + }, + { + "epoch": 0.34609747160131915, + "grad_norm": 0.3672508442805955, + "learning_rate": 8.268587511010546e-06, + "loss": 0.4492, + "step": 1889 + }, + { + "epoch": 0.3462806888970319, + "grad_norm": 0.3226405339400022, + "learning_rate": 8.266166886761106e-06, + "loss": 0.447, + "step": 1890 + }, + { + "epoch": 0.3464639061927446, + "grad_norm": 0.3896492518387063, + "learning_rate": 8.263744926465744e-06, + "loss": 0.4952, + "step": 1891 + }, + { + "epoch": 0.34664712348845733, + "grad_norm": 0.40806210827621486, + "learning_rate": 8.26132163111518e-06, + "loss": 0.4942, + "step": 1892 + }, + { + "epoch": 0.34683034078417, + "grad_norm": 0.5064133110397161, + "learning_rate": 8.258897001700673e-06, + "loss": 0.4914, + "step": 1893 + }, + { + "epoch": 0.34701355807988277, + "grad_norm": 0.37350719921589426, + "learning_rate": 8.256471039214036e-06, + "loss": 0.4667, + "step": 1894 + }, + { + "epoch": 0.34719677537559546, + "grad_norm": 0.4472694649954096, + "learning_rate": 8.254043744647625e-06, + "loss": 0.4768, + "step": 1895 + }, + { + "epoch": 0.34737999267130815, + "grad_norm": 0.5451471608365284, + "learning_rate": 8.251615118994338e-06, + "loss": 0.4833, + "step": 1896 + }, + { + "epoch": 0.3475632099670209, + "grad_norm": 0.43194887420538414, + "learning_rate": 8.249185163247621e-06, + "loss": 0.5041, + "step": 1897 + }, + { + "epoch": 0.3477464272627336, + "grad_norm": 0.4440434179645274, + "learning_rate": 8.24675387840146e-06, + "loss": 0.4928, + "step": 1898 + }, + { + "epoch": 0.3479296445584463, + "grad_norm": 0.4470574728369111, + "learning_rate": 8.24432126545039e-06, + "loss": 0.4721, + "step": 1899 + }, + { + "epoch": 0.348112861854159, + "grad_norm": 0.37811511446638624, + "learning_rate": 8.241887325389486e-06, + "loss": 0.4916, + "step": 1900 + }, + { + "epoch": 0.34829607914987176, + "grad_norm": 0.401475995772846, + "learning_rate": 8.239452059214367e-06, + "loss": 0.4922, + "step": 1901 + }, + { + "epoch": 0.34847929644558445, + "grad_norm": 0.36588179778579716, + "learning_rate": 8.23701546792119e-06, + "loss": 0.4769, + "step": 1902 + }, + { + "epoch": 0.3486625137412972, + "grad_norm": 0.37419447204674516, + "learning_rate": 8.234577552506662e-06, + "loss": 0.4683, + "step": 1903 + }, + { + "epoch": 0.3488457310370099, + "grad_norm": 0.44910824354183104, + "learning_rate": 8.232138313968025e-06, + "loss": 0.5002, + "step": 1904 + }, + { + "epoch": 0.34902894833272263, + "grad_norm": 0.37853030952448913, + "learning_rate": 8.229697753303067e-06, + "loss": 0.499, + "step": 1905 + }, + { + "epoch": 0.3492121656284353, + "grad_norm": 0.3787302843131875, + "learning_rate": 8.227255871510111e-06, + "loss": 0.4652, + "step": 1906 + }, + { + "epoch": 0.349395382924148, + "grad_norm": 0.42294805152134374, + "learning_rate": 8.224812669588028e-06, + "loss": 0.4761, + "step": 1907 + }, + { + "epoch": 0.34957860021986076, + "grad_norm": 0.4251808716753822, + "learning_rate": 8.222368148536223e-06, + "loss": 0.4781, + "step": 1908 + }, + { + "epoch": 0.34976181751557345, + "grad_norm": 0.41741877899455326, + "learning_rate": 8.219922309354643e-06, + "loss": 0.4979, + "step": 1909 + }, + { + "epoch": 0.3499450348112862, + "grad_norm": 0.4528858990840802, + "learning_rate": 8.217475153043772e-06, + "loss": 0.4841, + "step": 1910 + }, + { + "epoch": 0.3501282521069989, + "grad_norm": 0.37317689057150794, + "learning_rate": 8.215026680604637e-06, + "loss": 0.4713, + "step": 1911 + }, + { + "epoch": 0.35031146940271163, + "grad_norm": 0.3667470219941718, + "learning_rate": 8.212576893038799e-06, + "loss": 0.4689, + "step": 1912 + }, + { + "epoch": 0.3504946866984243, + "grad_norm": 0.3979696962375165, + "learning_rate": 8.21012579134836e-06, + "loss": 0.4538, + "step": 1913 + }, + { + "epoch": 0.35067790399413706, + "grad_norm": 0.43331562093452664, + "learning_rate": 8.20767337653596e-06, + "loss": 0.4878, + "step": 1914 + }, + { + "epoch": 0.35086112128984975, + "grad_norm": 0.36256762359906813, + "learning_rate": 8.20521964960477e-06, + "loss": 0.4462, + "step": 1915 + }, + { + "epoch": 0.3510443385855625, + "grad_norm": 0.43213117928406414, + "learning_rate": 8.202764611558507e-06, + "loss": 0.5066, + "step": 1916 + }, + { + "epoch": 0.3512275558812752, + "grad_norm": 0.4293633236485711, + "learning_rate": 8.200308263401417e-06, + "loss": 0.5351, + "step": 1917 + }, + { + "epoch": 0.35141077317698793, + "grad_norm": 0.5287407111719371, + "learning_rate": 8.197850606138286e-06, + "loss": 0.4705, + "step": 1918 + }, + { + "epoch": 0.3515939904727006, + "grad_norm": 0.48855534444525384, + "learning_rate": 8.195391640774433e-06, + "loss": 0.4852, + "step": 1919 + }, + { + "epoch": 0.3517772077684133, + "grad_norm": 0.4148812235933253, + "learning_rate": 8.192931368315715e-06, + "loss": 0.4785, + "step": 1920 + }, + { + "epoch": 0.35196042506412606, + "grad_norm": 0.44919257408205365, + "learning_rate": 8.190469789768517e-06, + "loss": 0.4851, + "step": 1921 + }, + { + "epoch": 0.35214364235983875, + "grad_norm": 0.4359193627784823, + "learning_rate": 8.188006906139767e-06, + "loss": 0.5077, + "step": 1922 + }, + { + "epoch": 0.3523268596555515, + "grad_norm": 0.43124736358934584, + "learning_rate": 8.185542718436923e-06, + "loss": 0.4705, + "step": 1923 + }, + { + "epoch": 0.3525100769512642, + "grad_norm": 0.4270332132061218, + "learning_rate": 8.183077227667975e-06, + "loss": 0.5061, + "step": 1924 + }, + { + "epoch": 0.35269329424697693, + "grad_norm": 0.3709113632371592, + "learning_rate": 8.180610434841448e-06, + "loss": 0.4926, + "step": 1925 + }, + { + "epoch": 0.3528765115426896, + "grad_norm": 0.3686143335119551, + "learning_rate": 8.1781423409664e-06, + "loss": 0.4736, + "step": 1926 + }, + { + "epoch": 0.35305972883840236, + "grad_norm": 0.4768757941328196, + "learning_rate": 8.175672947052416e-06, + "loss": 0.4899, + "step": 1927 + }, + { + "epoch": 0.35324294613411505, + "grad_norm": 0.37435018310100715, + "learning_rate": 8.173202254109622e-06, + "loss": 0.5019, + "step": 1928 + }, + { + "epoch": 0.3534261634298278, + "grad_norm": 0.5886108613911327, + "learning_rate": 8.170730263148668e-06, + "loss": 0.5208, + "step": 1929 + }, + { + "epoch": 0.3536093807255405, + "grad_norm": 0.42394082862501326, + "learning_rate": 8.168256975180737e-06, + "loss": 0.5124, + "step": 1930 + }, + { + "epoch": 0.3537925980212532, + "grad_norm": 0.39473180614386977, + "learning_rate": 8.165782391217543e-06, + "loss": 0.4863, + "step": 1931 + }, + { + "epoch": 0.3539758153169659, + "grad_norm": 0.4022544573376633, + "learning_rate": 8.163306512271334e-06, + "loss": 0.4989, + "step": 1932 + }, + { + "epoch": 0.3541590326126786, + "grad_norm": 0.4154530765976519, + "learning_rate": 8.160829339354876e-06, + "loss": 0.4661, + "step": 1933 + }, + { + "epoch": 0.35434224990839136, + "grad_norm": 0.3921468409429114, + "learning_rate": 8.158350873481478e-06, + "loss": 0.4738, + "step": 1934 + }, + { + "epoch": 0.35452546720410405, + "grad_norm": 0.3954474064023121, + "learning_rate": 8.155871115664968e-06, + "loss": 0.51, + "step": 1935 + }, + { + "epoch": 0.3547086844998168, + "grad_norm": 0.3882950367284415, + "learning_rate": 8.15339006691971e-06, + "loss": 0.4642, + "step": 1936 + }, + { + "epoch": 0.3548919017955295, + "grad_norm": 0.37897867702653376, + "learning_rate": 8.150907728260592e-06, + "loss": 0.5044, + "step": 1937 + }, + { + "epoch": 0.35507511909124223, + "grad_norm": 0.3859422166696926, + "learning_rate": 8.14842410070303e-06, + "loss": 0.4551, + "step": 1938 + }, + { + "epoch": 0.3552583363869549, + "grad_norm": 0.38875341973643246, + "learning_rate": 8.145939185262963e-06, + "loss": 0.4944, + "step": 1939 + }, + { + "epoch": 0.35544155368266767, + "grad_norm": 0.3983931627212232, + "learning_rate": 8.143452982956866e-06, + "loss": 0.5008, + "step": 1940 + }, + { + "epoch": 0.35562477097838036, + "grad_norm": 0.39775757772326403, + "learning_rate": 8.140965494801733e-06, + "loss": 0.5263, + "step": 1941 + }, + { + "epoch": 0.3558079882740931, + "grad_norm": 0.3944293856829923, + "learning_rate": 8.13847672181509e-06, + "loss": 0.4771, + "step": 1942 + }, + { + "epoch": 0.3559912055698058, + "grad_norm": 0.3817860113481927, + "learning_rate": 8.13598666501498e-06, + "loss": 0.4666, + "step": 1943 + }, + { + "epoch": 0.3561744228655185, + "grad_norm": 0.42632726063844545, + "learning_rate": 8.133495325419983e-06, + "loss": 0.4663, + "step": 1944 + }, + { + "epoch": 0.3563576401612312, + "grad_norm": 0.3572805692583185, + "learning_rate": 8.131002704049189e-06, + "loss": 0.4708, + "step": 1945 + }, + { + "epoch": 0.3565408574569439, + "grad_norm": 0.404852300953083, + "learning_rate": 8.128508801922226e-06, + "loss": 0.5325, + "step": 1946 + }, + { + "epoch": 0.35672407475265666, + "grad_norm": 0.35876468020600716, + "learning_rate": 8.126013620059236e-06, + "loss": 0.4949, + "step": 1947 + }, + { + "epoch": 0.35690729204836935, + "grad_norm": 0.38545171520037624, + "learning_rate": 8.123517159480894e-06, + "loss": 0.4764, + "step": 1948 + }, + { + "epoch": 0.3570905093440821, + "grad_norm": 0.40541828483545733, + "learning_rate": 8.12101942120839e-06, + "loss": 0.4807, + "step": 1949 + }, + { + "epoch": 0.3572737266397948, + "grad_norm": 0.4243415026003622, + "learning_rate": 8.118520406263437e-06, + "loss": 0.5003, + "step": 1950 + }, + { + "epoch": 0.35745694393550753, + "grad_norm": 0.4384025995443247, + "learning_rate": 8.116020115668278e-06, + "loss": 0.4916, + "step": 1951 + }, + { + "epoch": 0.3576401612312202, + "grad_norm": 0.356464489561591, + "learning_rate": 8.113518550445667e-06, + "loss": 0.5068, + "step": 1952 + }, + { + "epoch": 0.35782337852693297, + "grad_norm": 0.36237464089967664, + "learning_rate": 8.111015711618888e-06, + "loss": 0.4691, + "step": 1953 + }, + { + "epoch": 0.35800659582264566, + "grad_norm": 0.41004625226837194, + "learning_rate": 8.108511600211741e-06, + "loss": 0.495, + "step": 1954 + }, + { + "epoch": 0.35818981311835835, + "grad_norm": 0.37519930557205516, + "learning_rate": 8.106006217248552e-06, + "loss": 0.527, + "step": 1955 + }, + { + "epoch": 0.3583730304140711, + "grad_norm": 0.3930139446184139, + "learning_rate": 8.103499563754159e-06, + "loss": 0.49, + "step": 1956 + }, + { + "epoch": 0.3585562477097838, + "grad_norm": 0.4049543770945572, + "learning_rate": 8.100991640753926e-06, + "loss": 0.4918, + "step": 1957 + }, + { + "epoch": 0.3587394650054965, + "grad_norm": 0.41598194124283755, + "learning_rate": 8.098482449273737e-06, + "loss": 0.4583, + "step": 1958 + }, + { + "epoch": 0.3589226823012092, + "grad_norm": 0.3497094599476565, + "learning_rate": 8.095971990339987e-06, + "loss": 0.441, + "step": 1959 + }, + { + "epoch": 0.35910589959692196, + "grad_norm": 0.857453529704147, + "learning_rate": 8.0934602649796e-06, + "loss": 0.4862, + "step": 1960 + }, + { + "epoch": 0.35928911689263465, + "grad_norm": 0.3948051526194389, + "learning_rate": 8.090947274220011e-06, + "loss": 0.4696, + "step": 1961 + }, + { + "epoch": 0.3594723341883474, + "grad_norm": 0.4025724340675242, + "learning_rate": 8.088433019089174e-06, + "loss": 0.4784, + "step": 1962 + }, + { + "epoch": 0.3596555514840601, + "grad_norm": 0.4250158054819026, + "learning_rate": 8.08591750061556e-06, + "loss": 0.4887, + "step": 1963 + }, + { + "epoch": 0.35983876877977283, + "grad_norm": 0.3987052277731631, + "learning_rate": 8.083400719828161e-06, + "loss": 0.486, + "step": 1964 + }, + { + "epoch": 0.3600219860754855, + "grad_norm": 0.3818650890686896, + "learning_rate": 8.080882677756479e-06, + "loss": 0.4961, + "step": 1965 + }, + { + "epoch": 0.36020520337119827, + "grad_norm": 0.37338678994677815, + "learning_rate": 8.078363375430534e-06, + "loss": 0.4849, + "step": 1966 + }, + { + "epoch": 0.36038842066691096, + "grad_norm": 0.4033555520259992, + "learning_rate": 8.075842813880865e-06, + "loss": 0.5092, + "step": 1967 + }, + { + "epoch": 0.36057163796262365, + "grad_norm": 0.3904145734640018, + "learning_rate": 8.073320994138522e-06, + "loss": 0.498, + "step": 1968 + }, + { + "epoch": 0.3607548552583364, + "grad_norm": 0.41968588248870525, + "learning_rate": 8.070797917235071e-06, + "loss": 0.4592, + "step": 1969 + }, + { + "epoch": 0.3609380725540491, + "grad_norm": 0.40225333028901955, + "learning_rate": 8.068273584202593e-06, + "loss": 0.498, + "step": 1970 + }, + { + "epoch": 0.36112128984976183, + "grad_norm": 0.4779749898506442, + "learning_rate": 8.065747996073681e-06, + "loss": 0.4852, + "step": 1971 + }, + { + "epoch": 0.3613045071454745, + "grad_norm": 0.3801653362164248, + "learning_rate": 8.063221153881443e-06, + "loss": 0.4821, + "step": 1972 + }, + { + "epoch": 0.36148772444118726, + "grad_norm": 0.4360824877290448, + "learning_rate": 8.0606930586595e-06, + "loss": 0.4867, + "step": 1973 + }, + { + "epoch": 0.36167094173689995, + "grad_norm": 0.4253700547037495, + "learning_rate": 8.058163711441986e-06, + "loss": 0.4829, + "step": 1974 + }, + { + "epoch": 0.3618541590326127, + "grad_norm": 0.4341340818854911, + "learning_rate": 8.055633113263543e-06, + "loss": 0.5169, + "step": 1975 + }, + { + "epoch": 0.3620373763283254, + "grad_norm": 0.4424513726038396, + "learning_rate": 8.053101265159331e-06, + "loss": 0.5005, + "step": 1976 + }, + { + "epoch": 0.36222059362403813, + "grad_norm": 0.40080376851886934, + "learning_rate": 8.050568168165018e-06, + "loss": 0.4831, + "step": 1977 + }, + { + "epoch": 0.3624038109197508, + "grad_norm": 0.4092773451489212, + "learning_rate": 8.048033823316784e-06, + "loss": 0.4886, + "step": 1978 + }, + { + "epoch": 0.3625870282154635, + "grad_norm": 0.5750732156961504, + "learning_rate": 8.045498231651314e-06, + "loss": 0.4924, + "step": 1979 + }, + { + "epoch": 0.36277024551117626, + "grad_norm": 0.4000642911770352, + "learning_rate": 8.042961394205812e-06, + "loss": 0.4856, + "step": 1980 + }, + { + "epoch": 0.36295346280688895, + "grad_norm": 0.38938145758582615, + "learning_rate": 8.040423312017986e-06, + "loss": 0.4778, + "step": 1981 + }, + { + "epoch": 0.3631366801026017, + "grad_norm": 0.3643427586018773, + "learning_rate": 8.037883986126054e-06, + "loss": 0.4909, + "step": 1982 + }, + { + "epoch": 0.3633198973983144, + "grad_norm": 0.3939262969339654, + "learning_rate": 8.035343417568742e-06, + "loss": 0.4714, + "step": 1983 + }, + { + "epoch": 0.36350311469402713, + "grad_norm": 0.4016181460823361, + "learning_rate": 8.032801607385288e-06, + "loss": 0.4931, + "step": 1984 + }, + { + "epoch": 0.3636863319897398, + "grad_norm": 0.44751708009914176, + "learning_rate": 8.030258556615433e-06, + "loss": 0.4894, + "step": 1985 + }, + { + "epoch": 0.36386954928545256, + "grad_norm": 0.34958387993124196, + "learning_rate": 8.02771426629943e-06, + "loss": 0.4598, + "step": 1986 + }, + { + "epoch": 0.36405276658116525, + "grad_norm": 0.348786463445316, + "learning_rate": 8.025168737478034e-06, + "loss": 0.447, + "step": 1987 + }, + { + "epoch": 0.364235983876878, + "grad_norm": 0.3854079222539858, + "learning_rate": 8.022621971192513e-06, + "loss": 0.4812, + "step": 1988 + }, + { + "epoch": 0.3644192011725907, + "grad_norm": 0.38734200128527324, + "learning_rate": 8.020073968484632e-06, + "loss": 0.4862, + "step": 1989 + }, + { + "epoch": 0.36460241846830344, + "grad_norm": 0.411711055657578, + "learning_rate": 8.017524730396673e-06, + "loss": 0.4907, + "step": 1990 + }, + { + "epoch": 0.3647856357640161, + "grad_norm": 0.440618505246582, + "learning_rate": 8.014974257971415e-06, + "loss": 0.4588, + "step": 1991 + }, + { + "epoch": 0.3649688530597288, + "grad_norm": 0.4841735582737877, + "learning_rate": 8.012422552252148e-06, + "loss": 0.4988, + "step": 1992 + }, + { + "epoch": 0.36515207035544156, + "grad_norm": 0.35566001438725936, + "learning_rate": 8.009869614282657e-06, + "loss": 0.5083, + "step": 1993 + }, + { + "epoch": 0.36533528765115425, + "grad_norm": 0.3607042141519043, + "learning_rate": 8.007315445107242e-06, + "loss": 0.4634, + "step": 1994 + }, + { + "epoch": 0.365518504946867, + "grad_norm": 0.3841642715938066, + "learning_rate": 8.004760045770702e-06, + "loss": 0.4972, + "step": 1995 + }, + { + "epoch": 0.3657017222425797, + "grad_norm": 0.39468083856708197, + "learning_rate": 8.002203417318335e-06, + "loss": 0.4779, + "step": 1996 + }, + { + "epoch": 0.36588493953829243, + "grad_norm": 0.419151699921075, + "learning_rate": 7.999645560795947e-06, + "loss": 0.5039, + "step": 1997 + }, + { + "epoch": 0.3660681568340051, + "grad_norm": 0.4148076872613475, + "learning_rate": 7.99708647724985e-06, + "loss": 0.4969, + "step": 1998 + }, + { + "epoch": 0.36625137412971787, + "grad_norm": 0.37720877965942756, + "learning_rate": 7.994526167726847e-06, + "loss": 0.4657, + "step": 1999 + }, + { + "epoch": 0.36643459142543056, + "grad_norm": 0.3456057298421402, + "learning_rate": 7.991964633274255e-06, + "loss": 0.5015, + "step": 2000 + } + ], + "logging_steps": 1, + "max_steps": 5458, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 357375823380480.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}