| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9993174061433447, | |
| "eval_steps": 183, | |
| "global_step": 549, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0018202502844141069, | |
| "grad_norm": 7.828993836368125, | |
| "learning_rate": 4.705882352941176e-07, | |
| "loss": 0.7927, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0036405005688282138, | |
| "grad_norm": 6.714284407371177, | |
| "learning_rate": 9.411764705882352e-07, | |
| "loss": 0.7939, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.005460750853242321, | |
| "grad_norm": 6.364056661574126, | |
| "learning_rate": 1.411764705882353e-06, | |
| "loss": 0.8186, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.0072810011376564275, | |
| "grad_norm": 6.001686432641966, | |
| "learning_rate": 1.8823529411764705e-06, | |
| "loss": 0.7232, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.009101251422070534, | |
| "grad_norm": 5.534730246504558, | |
| "learning_rate": 2.352941176470588e-06, | |
| "loss": 0.7891, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.010921501706484642, | |
| "grad_norm": 3.4121020699424713, | |
| "learning_rate": 2.823529411764706e-06, | |
| "loss": 0.7351, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.01274175199089875, | |
| "grad_norm": 3.9324785806724476, | |
| "learning_rate": 3.294117647058823e-06, | |
| "loss": 0.6899, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.014562002275312855, | |
| "grad_norm": 2.5947504275059496, | |
| "learning_rate": 3.764705882352941e-06, | |
| "loss": 0.6103, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.016382252559726963, | |
| "grad_norm": 2.5403086872594955, | |
| "learning_rate": 4.235294117647058e-06, | |
| "loss": 0.6377, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.01820250284414107, | |
| "grad_norm": 2.339440216808695, | |
| "learning_rate": 4.705882352941176e-06, | |
| "loss": 0.6723, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.020022753128555178, | |
| "grad_norm": 2.1983237574211922, | |
| "learning_rate": 5.176470588235294e-06, | |
| "loss": 0.684, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.021843003412969283, | |
| "grad_norm": 1.6424427461297102, | |
| "learning_rate": 5.647058823529412e-06, | |
| "loss": 0.6027, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.02366325369738339, | |
| "grad_norm": 1.6005922788724476, | |
| "learning_rate": 6.1176470588235285e-06, | |
| "loss": 0.7195, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.0254835039817975, | |
| "grad_norm": 1.8438931230594375, | |
| "learning_rate": 6.588235294117646e-06, | |
| "loss": 0.6329, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.027303754266211604, | |
| "grad_norm": 1.8086010126416687, | |
| "learning_rate": 7.058823529411764e-06, | |
| "loss": 0.6812, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.02912400455062571, | |
| "grad_norm": 1.6390713129791323, | |
| "learning_rate": 7.529411764705882e-06, | |
| "loss": 0.6848, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.03094425483503982, | |
| "grad_norm": 1.5725949727834614, | |
| "learning_rate": 8e-06, | |
| "loss": 0.6415, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.032764505119453925, | |
| "grad_norm": 1.5078400807519021, | |
| "learning_rate": 7.999930256262932e-06, | |
| "loss": 0.6284, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.03458475540386803, | |
| "grad_norm": 1.5292651701417765, | |
| "learning_rate": 7.999721027483818e-06, | |
| "loss": 0.6503, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.03640500568828214, | |
| "grad_norm": 1.4983054037314452, | |
| "learning_rate": 7.999372320958861e-06, | |
| "loss": 0.6167, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.03822525597269624, | |
| "grad_norm": 1.4412298959675836, | |
| "learning_rate": 7.998884148848109e-06, | |
| "loss": 0.6245, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.040045506257110355, | |
| "grad_norm": 1.2365167423542114, | |
| "learning_rate": 7.998256528175033e-06, | |
| "loss": 0.5953, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.04186575654152446, | |
| "grad_norm": 1.5357829692377438, | |
| "learning_rate": 7.997489480825941e-06, | |
| "loss": 0.6367, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.04368600682593857, | |
| "grad_norm": 1.3171822289121358, | |
| "learning_rate": 7.996583033549204e-06, | |
| "loss": 0.5577, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.04550625711035267, | |
| "grad_norm": 1.3245922461587984, | |
| "learning_rate": 7.995537217954335e-06, | |
| "loss": 0.5706, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.04732650739476678, | |
| "grad_norm": 1.4166210756319584, | |
| "learning_rate": 7.994352070510876e-06, | |
| "loss": 0.6612, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.049146757679180884, | |
| "grad_norm": 1.4367785667776511, | |
| "learning_rate": 7.993027632547137e-06, | |
| "loss": 0.5766, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.050967007963595, | |
| "grad_norm": 1.2803978303957413, | |
| "learning_rate": 7.991563950248739e-06, | |
| "loss": 0.6023, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.0527872582480091, | |
| "grad_norm": 1.3201964359281728, | |
| "learning_rate": 7.989961074657023e-06, | |
| "loss": 0.6026, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.05460750853242321, | |
| "grad_norm": 1.5466402476684098, | |
| "learning_rate": 7.988219061667252e-06, | |
| "loss": 0.5979, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.056427758816837315, | |
| "grad_norm": 1.372807482234666, | |
| "learning_rate": 7.986337972026678e-06, | |
| "loss": 0.5928, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.05824800910125142, | |
| "grad_norm": 1.2476758007512014, | |
| "learning_rate": 7.98431787133241e-06, | |
| "loss": 0.5506, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.060068259385665526, | |
| "grad_norm": 1.3332407362456573, | |
| "learning_rate": 7.982158830029133e-06, | |
| "loss": 0.5252, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.06188850967007964, | |
| "grad_norm": 1.2956829613552345, | |
| "learning_rate": 7.979860923406654e-06, | |
| "loss": 0.6162, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.06370875995449374, | |
| "grad_norm": 1.3358171325973744, | |
| "learning_rate": 7.977424231597266e-06, | |
| "loss": 0.6323, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.06552901023890785, | |
| "grad_norm": 1.2668022187917536, | |
| "learning_rate": 7.97484883957297e-06, | |
| "loss": 0.5481, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.06734926052332196, | |
| "grad_norm": 1.3032578120954865, | |
| "learning_rate": 7.972134837142497e-06, | |
| "loss": 0.6982, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.06916951080773606, | |
| "grad_norm": 1.438077454657845, | |
| "learning_rate": 7.969282318948179e-06, | |
| "loss": 0.6386, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.07098976109215017, | |
| "grad_norm": 1.2348261155030926, | |
| "learning_rate": 7.966291384462662e-06, | |
| "loss": 0.5691, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.07281001137656427, | |
| "grad_norm": 1.7781144758356542, | |
| "learning_rate": 7.963162137985416e-06, | |
| "loss": 0.6133, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.07463026166097839, | |
| "grad_norm": 1.3915614778891998, | |
| "learning_rate": 7.959894688639114e-06, | |
| "loss": 0.6097, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.07645051194539249, | |
| "grad_norm": 1.3714026109891253, | |
| "learning_rate": 7.956489150365818e-06, | |
| "loss": 0.7127, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.0782707622298066, | |
| "grad_norm": 1.2639378240340353, | |
| "learning_rate": 7.952945641923014e-06, | |
| "loss": 0.5649, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.08009101251422071, | |
| "grad_norm": 1.3658987432003644, | |
| "learning_rate": 7.949264286879461e-06, | |
| "loss": 0.5975, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.08191126279863481, | |
| "grad_norm": 1.3869528512684368, | |
| "learning_rate": 7.94544521361089e-06, | |
| "loss": 0.5851, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.08373151308304892, | |
| "grad_norm": 1.3457362580768018, | |
| "learning_rate": 7.941488555295519e-06, | |
| "loss": 0.6241, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.08555176336746302, | |
| "grad_norm": 1.3016443215121218, | |
| "learning_rate": 7.937394449909417e-06, | |
| "loss": 0.5603, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.08737201365187713, | |
| "grad_norm": 1.4891468132527028, | |
| "learning_rate": 7.933163040221691e-06, | |
| "loss": 0.6103, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.08919226393629125, | |
| "grad_norm": 1.3296810848522813, | |
| "learning_rate": 7.928794473789502e-06, | |
| "loss": 0.5823, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.09101251422070535, | |
| "grad_norm": 1.5647412617424314, | |
| "learning_rate": 7.924288902952924e-06, | |
| "loss": 0.6222, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.09283276450511946, | |
| "grad_norm": 1.224983623587384, | |
| "learning_rate": 7.91964648482963e-06, | |
| "loss": 0.5779, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.09465301478953356, | |
| "grad_norm": 1.3384527596977494, | |
| "learning_rate": 7.914867381309417e-06, | |
| "loss": 0.5721, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.09647326507394767, | |
| "grad_norm": 1.3725597146198896, | |
| "learning_rate": 7.909951759048553e-06, | |
| "loss": 0.6531, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.09829351535836177, | |
| "grad_norm": 1.4825184286499173, | |
| "learning_rate": 7.904899789463974e-06, | |
| "loss": 0.5767, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.10011376564277588, | |
| "grad_norm": 1.2710681313032695, | |
| "learning_rate": 7.899711648727295e-06, | |
| "loss": 0.5447, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.10193401592719, | |
| "grad_norm": 1.468311538305991, | |
| "learning_rate": 7.894387517758679e-06, | |
| "loss": 0.6303, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.1037542662116041, | |
| "grad_norm": 1.1661278644804365, | |
| "learning_rate": 7.888927582220521e-06, | |
| "loss": 0.606, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.1055745164960182, | |
| "grad_norm": 1.3217868795788283, | |
| "learning_rate": 7.883332032510978e-06, | |
| "loss": 0.5329, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.1073947667804323, | |
| "grad_norm": 1.33515774996865, | |
| "learning_rate": 7.877601063757322e-06, | |
| "loss": 0.5335, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.10921501706484642, | |
| "grad_norm": 1.4916296734659389, | |
| "learning_rate": 7.871734875809141e-06, | |
| "loss": 0.5705, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.11103526734926053, | |
| "grad_norm": 1.3630503455015932, | |
| "learning_rate": 7.86573367323137e-06, | |
| "loss": 0.6279, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.11285551763367463, | |
| "grad_norm": 1.2409472798363426, | |
| "learning_rate": 7.859597665297158e-06, | |
| "loss": 0.5096, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.11467576791808874, | |
| "grad_norm": 1.241131166674189, | |
| "learning_rate": 7.853327065980567e-06, | |
| "loss": 0.5792, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.11649601820250284, | |
| "grad_norm": 1.2488483644853932, | |
| "learning_rate": 7.84692209394911e-06, | |
| "loss": 0.5191, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.11831626848691695, | |
| "grad_norm": 1.4362376863954367, | |
| "learning_rate": 7.84038297255613e-06, | |
| "loss": 0.5749, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.12013651877133105, | |
| "grad_norm": 1.301660597552013, | |
| "learning_rate": 7.83370992983301e-06, | |
| "loss": 0.5598, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.12195676905574517, | |
| "grad_norm": 1.2871188302606258, | |
| "learning_rate": 7.826903198481218e-06, | |
| "loss": 0.6357, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.12377701934015928, | |
| "grad_norm": 1.2412889792540172, | |
| "learning_rate": 7.819963015864195e-06, | |
| "loss": 0.6025, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.12559726962457338, | |
| "grad_norm": 1.2417123308251397, | |
| "learning_rate": 7.812889623999077e-06, | |
| "loss": 0.5973, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.12741751990898748, | |
| "grad_norm": 1.334500041209377, | |
| "learning_rate": 7.805683269548253e-06, | |
| "loss": 0.5339, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.1292377701934016, | |
| "grad_norm": 1.2800707133087328, | |
| "learning_rate": 7.798344203810772e-06, | |
| "loss": 0.5506, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.1310580204778157, | |
| "grad_norm": 1.2634004680746123, | |
| "learning_rate": 7.790872682713567e-06, | |
| "loss": 0.554, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.1328782707622298, | |
| "grad_norm": 1.3715962042311087, | |
| "learning_rate": 7.783268966802538e-06, | |
| "loss": 0.5949, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.13469852104664393, | |
| "grad_norm": 1.4002060242886838, | |
| "learning_rate": 7.77553332123347e-06, | |
| "loss": 0.6422, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.13651877133105803, | |
| "grad_norm": 1.2759192431077615, | |
| "learning_rate": 7.767666015762775e-06, | |
| "loss": 0.607, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.13833902161547212, | |
| "grad_norm": 1.6921865669723448, | |
| "learning_rate": 7.7596673247381e-06, | |
| "loss": 0.6002, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.14015927189988622, | |
| "grad_norm": 1.3571851968954738, | |
| "learning_rate": 7.751537527088742e-06, | |
| "loss": 0.5215, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.14197952218430035, | |
| "grad_norm": 1.4785928356534102, | |
| "learning_rate": 7.743276906315936e-06, | |
| "loss": 0.6101, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.14379977246871445, | |
| "grad_norm": 1.465222696303414, | |
| "learning_rate": 7.734885750482967e-06, | |
| "loss": 0.6187, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.14562002275312855, | |
| "grad_norm": 1.264573511241066, | |
| "learning_rate": 7.726364352205116e-06, | |
| "loss": 0.5673, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.14744027303754267, | |
| "grad_norm": 1.238555624330946, | |
| "learning_rate": 7.717713008639463e-06, | |
| "loss": 0.6066, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.14926052332195677, | |
| "grad_norm": 1.20038139697854, | |
| "learning_rate": 7.708932021474524e-06, | |
| "loss": 0.5678, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.15108077360637087, | |
| "grad_norm": 1.3190323958334018, | |
| "learning_rate": 7.70002169691973e-06, | |
| "loss": 0.5544, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.15290102389078497, | |
| "grad_norm": 1.3333392166861238, | |
| "learning_rate": 7.690982345694746e-06, | |
| "loss": 0.5212, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.1547212741751991, | |
| "grad_norm": 1.5189079377057624, | |
| "learning_rate": 7.68181428301864e-06, | |
| "loss": 0.5411, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.1565415244596132, | |
| "grad_norm": 1.2341512401643826, | |
| "learning_rate": 7.67251782859889e-06, | |
| "loss": 0.5, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.1583617747440273, | |
| "grad_norm": 1.3710782832898465, | |
| "learning_rate": 7.663093306620228e-06, | |
| "loss": 0.567, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.16018202502844142, | |
| "grad_norm": 1.3224943188093254, | |
| "learning_rate": 7.653541045733351e-06, | |
| "loss": 0.6514, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.16200227531285552, | |
| "grad_norm": 1.3825136099871158, | |
| "learning_rate": 7.643861379043442e-06, | |
| "loss": 0.49, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.16382252559726962, | |
| "grad_norm": 1.6656667334345212, | |
| "learning_rate": 7.634054644098566e-06, | |
| "loss": 0.649, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.16564277588168372, | |
| "grad_norm": 1.3683202937271444, | |
| "learning_rate": 7.624121182877892e-06, | |
| "loss": 0.497, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.16746302616609784, | |
| "grad_norm": 1.5512864267072353, | |
| "learning_rate": 7.614061341779777e-06, | |
| "loss": 0.6176, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.16928327645051194, | |
| "grad_norm": 1.5790193819370095, | |
| "learning_rate": 7.6038754716096755e-06, | |
| "loss": 0.5634, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.17110352673492604, | |
| "grad_norm": 1.4344008312589909, | |
| "learning_rate": 7.593563927567915e-06, | |
| "loss": 0.5932, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.17292377701934017, | |
| "grad_norm": 1.2510278162330568, | |
| "learning_rate": 7.583127069237302e-06, | |
| "loss": 0.5604, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.17474402730375427, | |
| "grad_norm": 1.1926891094591303, | |
| "learning_rate": 7.5725652605705876e-06, | |
| "loss": 0.5746, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.17656427758816837, | |
| "grad_norm": 1.3458065119541616, | |
| "learning_rate": 7.561878869877778e-06, | |
| "loss": 0.5, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.1783845278725825, | |
| "grad_norm": 1.311426359460282, | |
| "learning_rate": 7.551068269813282e-06, | |
| "loss": 0.503, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.1802047781569966, | |
| "grad_norm": 1.2792152183438508, | |
| "learning_rate": 7.540133837362924e-06, | |
| "loss": 0.5279, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.1820250284414107, | |
| "grad_norm": 1.2349765362905594, | |
| "learning_rate": 7.5290759538307944e-06, | |
| "loss": 0.5159, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.1838452787258248, | |
| "grad_norm": 1.3587018583909733, | |
| "learning_rate": 7.517895004825955e-06, | |
| "loss": 0.573, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.18566552901023892, | |
| "grad_norm": 1.3554993825796526, | |
| "learning_rate": 7.506591380248991e-06, | |
| "loss": 0.5801, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.18748577929465302, | |
| "grad_norm": 1.2364653944665345, | |
| "learning_rate": 7.495165474278411e-06, | |
| "loss": 0.5618, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.18930602957906711, | |
| "grad_norm": 1.226975873209754, | |
| "learning_rate": 7.483617685356906e-06, | |
| "loss": 0.6663, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.19112627986348124, | |
| "grad_norm": 1.4312274290636884, | |
| "learning_rate": 7.471948416177452e-06, | |
| "loss": 0.5473, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.19294653014789534, | |
| "grad_norm": 1.467104665014613, | |
| "learning_rate": 7.460158073669271e-06, | |
| "loss": 0.5418, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.19476678043230944, | |
| "grad_norm": 1.1804815586636788, | |
| "learning_rate": 7.448247068983638e-06, | |
| "loss": 0.5378, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.19658703071672354, | |
| "grad_norm": 1.3602412614708939, | |
| "learning_rate": 7.43621581747954e-06, | |
| "loss": 0.5026, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.19840728100113766, | |
| "grad_norm": 1.2691524680339796, | |
| "learning_rate": 7.4240647387092e-06, | |
| "loss": 0.591, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.20022753128555176, | |
| "grad_norm": 1.2783869708675566, | |
| "learning_rate": 7.411794256403439e-06, | |
| "loss": 0.5085, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.20204778156996586, | |
| "grad_norm": 1.2804361153159327, | |
| "learning_rate": 7.399404798456901e-06, | |
| "loss": 0.6244, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.20386803185438, | |
| "grad_norm": 1.2444898246213776, | |
| "learning_rate": 7.3868967969131364e-06, | |
| "loss": 0.5313, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.2056882821387941, | |
| "grad_norm": 1.3015010555018793, | |
| "learning_rate": 7.374270687949531e-06, | |
| "loss": 0.5512, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.2075085324232082, | |
| "grad_norm": 1.2989535634921763, | |
| "learning_rate": 7.3615269118620945e-06, | |
| "loss": 0.5612, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.20932878270762229, | |
| "grad_norm": 1.2893299441070913, | |
| "learning_rate": 7.348665913050114e-06, | |
| "loss": 0.4779, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.2111490329920364, | |
| "grad_norm": 1.4374586177315487, | |
| "learning_rate": 7.3356881400006485e-06, | |
| "loss": 0.6057, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.2129692832764505, | |
| "grad_norm": 1.2587242021503462, | |
| "learning_rate": 7.3225940452728915e-06, | |
| "loss": 0.5679, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.2147895335608646, | |
| "grad_norm": 1.4250046519243573, | |
| "learning_rate": 7.309384085482396e-06, | |
| "loss": 0.5, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.21660978384527874, | |
| "grad_norm": 1.222873795275555, | |
| "learning_rate": 7.29605872128514e-06, | |
| "loss": 0.6714, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.21843003412969283, | |
| "grad_norm": 1.4243471522403268, | |
| "learning_rate": 7.282618417361476e-06, | |
| "loss": 0.6238, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.22025028441410693, | |
| "grad_norm": 1.3065783409043068, | |
| "learning_rate": 7.269063642399912e-06, | |
| "loss": 0.5464, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.22207053469852106, | |
| "grad_norm": 1.4517652323683794, | |
| "learning_rate": 7.25539486908078e-06, | |
| "loss": 0.4985, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.22389078498293516, | |
| "grad_norm": 1.1483510986901082, | |
| "learning_rate": 7.241612574059745e-06, | |
| "loss": 0.4978, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.22571103526734926, | |
| "grad_norm": 1.4389217565677268, | |
| "learning_rate": 7.227717237951189e-06, | |
| "loss": 0.6112, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.22753128555176336, | |
| "grad_norm": 1.3389550682905482, | |
| "learning_rate": 7.213709345311444e-06, | |
| "loss": 0.6476, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.22935153583617748, | |
| "grad_norm": 1.4169245520418259, | |
| "learning_rate": 7.1995893846219035e-06, | |
| "loss": 0.5354, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.23117178612059158, | |
| "grad_norm": 1.2613504469980097, | |
| "learning_rate": 7.185357848271977e-06, | |
| "loss": 0.5467, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.23299203640500568, | |
| "grad_norm": 1.1666125829814091, | |
| "learning_rate": 7.17101523254193e-06, | |
| "loss": 0.4698, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.2348122866894198, | |
| "grad_norm": 1.352110005355786, | |
| "learning_rate": 7.156562037585575e-06, | |
| "loss": 0.6109, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.2366325369738339, | |
| "grad_norm": 1.2180780408157383, | |
| "learning_rate": 7.1419987674128225e-06, | |
| "loss": 0.5332, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.238452787258248, | |
| "grad_norm": 1.3933377570677665, | |
| "learning_rate": 7.127325929872119e-06, | |
| "loss": 0.6671, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.2402730375426621, | |
| "grad_norm": 1.5246008222442193, | |
| "learning_rate": 7.1125440366327245e-06, | |
| "loss": 0.6212, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.24209328782707623, | |
| "grad_norm": 1.2745899656103845, | |
| "learning_rate": 7.0976536031668775e-06, | |
| "loss": 0.6395, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.24391353811149033, | |
| "grad_norm": 1.226190343569202, | |
| "learning_rate": 7.082655148731815e-06, | |
| "loss": 0.5761, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.24573378839590443, | |
| "grad_norm": 1.1905155038972701, | |
| "learning_rate": 7.067549196351669e-06, | |
| "loss": 0.5418, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.24755403868031856, | |
| "grad_norm": 1.2116012422540454, | |
| "learning_rate": 7.052336272799226e-06, | |
| "loss": 0.5273, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.24937428896473265, | |
| "grad_norm": 1.282002116739219, | |
| "learning_rate": 7.037016908577555e-06, | |
| "loss": 0.4506, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.25119453924914675, | |
| "grad_norm": 1.2499521919340497, | |
| "learning_rate": 7.02159163790151e-06, | |
| "loss": 0.5606, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.2530147895335609, | |
| "grad_norm": 1.7373606199315674, | |
| "learning_rate": 7.006060998679105e-06, | |
| "loss": 0.559, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.25483503981797495, | |
| "grad_norm": 1.4650449441633262, | |
| "learning_rate": 6.990425532492747e-06, | |
| "loss": 0.5135, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.2566552901023891, | |
| "grad_norm": 1.154885986165648, | |
| "learning_rate": 6.974685784580359e-06, | |
| "loss": 0.5039, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.2584755403868032, | |
| "grad_norm": 1.2096650723076037, | |
| "learning_rate": 6.958842303816359e-06, | |
| "loss": 0.5079, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.2602957906712173, | |
| "grad_norm": 1.1620810226211598, | |
| "learning_rate": 6.942895642692527e-06, | |
| "loss": 0.5245, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.2621160409556314, | |
| "grad_norm": 1.3200231462656904, | |
| "learning_rate": 6.926846357298732e-06, | |
| "loss": 0.5935, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.26393629124004553, | |
| "grad_norm": 1.2355460824632627, | |
| "learning_rate": 6.910695007303544e-06, | |
| "loss": 0.5543, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.2657565415244596, | |
| "grad_norm": 1.2033075815432748, | |
| "learning_rate": 6.894442155934719e-06, | |
| "loss": 0.4831, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.2675767918088737, | |
| "grad_norm": 1.1907917135137838, | |
| "learning_rate": 6.878088369959553e-06, | |
| "loss": 0.5221, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.26939704209328785, | |
| "grad_norm": 1.3255509879738674, | |
| "learning_rate": 6.861634219665117e-06, | |
| "loss": 0.6086, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.2712172923777019, | |
| "grad_norm": 1.2271219658785495, | |
| "learning_rate": 6.845080278838381e-06, | |
| "loss": 0.5825, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.27303754266211605, | |
| "grad_norm": 1.1690589610793065, | |
| "learning_rate": 6.82842712474619e-06, | |
| "loss": 0.5807, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.2748577929465301, | |
| "grad_norm": 1.2888207762623227, | |
| "learning_rate": 6.811675338115146e-06, | |
| "loss": 0.6188, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.27667804323094425, | |
| "grad_norm": 1.2514369430789496, | |
| "learning_rate": 6.7948255031113505e-06, | |
| "loss": 0.5913, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.2784982935153584, | |
| "grad_norm": 1.3076941421175066, | |
| "learning_rate": 6.777878207320034e-06, | |
| "loss": 0.5054, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.28031854379977245, | |
| "grad_norm": 1.2943985087075844, | |
| "learning_rate": 6.760834041725068e-06, | |
| "loss": 0.4915, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.2821387940841866, | |
| "grad_norm": 1.2887581327542428, | |
| "learning_rate": 6.743693600688352e-06, | |
| "loss": 0.5538, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.2839590443686007, | |
| "grad_norm": 1.157113796843012, | |
| "learning_rate": 6.726457481929095e-06, | |
| "loss": 0.537, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.28577929465301477, | |
| "grad_norm": 1.1766314672266696, | |
| "learning_rate": 6.7091262865029645e-06, | |
| "loss": 0.5896, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.2875995449374289, | |
| "grad_norm": 1.2292202798354899, | |
| "learning_rate": 6.691700618781126e-06, | |
| "loss": 0.6347, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.289419795221843, | |
| "grad_norm": 1.1513406588801496, | |
| "learning_rate": 6.674181086429177e-06, | |
| "loss": 0.4663, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.2912400455062571, | |
| "grad_norm": 1.2946840584006447, | |
| "learning_rate": 6.656568300385944e-06, | |
| "loss": 0.6247, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.2930602957906712, | |
| "grad_norm": 1.2952632442735728, | |
| "learning_rate": 6.6388628748421895e-06, | |
| "loss": 0.4728, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.29488054607508535, | |
| "grad_norm": 1.2684543190366842, | |
| "learning_rate": 6.62106542721918e-06, | |
| "loss": 0.5, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.2967007963594994, | |
| "grad_norm": 1.2434590334770437, | |
| "learning_rate": 6.603176578147174e-06, | |
| "loss": 0.552, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.29852104664391355, | |
| "grad_norm": 1.2853162665121605, | |
| "learning_rate": 6.585196951443763e-06, | |
| "loss": 0.5311, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.3003412969283277, | |
| "grad_norm": 1.3910181828529422, | |
| "learning_rate": 6.5671271740921266e-06, | |
| "loss": 0.5595, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.30216154721274174, | |
| "grad_norm": 1.318698152941268, | |
| "learning_rate": 6.548967876219163e-06, | |
| "loss": 0.5323, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.30398179749715587, | |
| "grad_norm": 1.259567167046916, | |
| "learning_rate": 6.530719691073521e-06, | |
| "loss": 0.5773, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.30580204778156994, | |
| "grad_norm": 1.3201679730014977, | |
| "learning_rate": 6.5123832550035165e-06, | |
| "loss": 0.5143, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.30762229806598407, | |
| "grad_norm": 1.3232034824966301, | |
| "learning_rate": 6.493959207434934e-06, | |
| "loss": 0.553, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.3094425483503982, | |
| "grad_norm": 1.3960220649200046, | |
| "learning_rate": 6.47544819084874e-06, | |
| "loss": 0.561, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.31126279863481227, | |
| "grad_norm": 1.3348060079340793, | |
| "learning_rate": 6.4568508507586715e-06, | |
| "loss": 0.5047, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.3130830489192264, | |
| "grad_norm": 1.2868651237482562, | |
| "learning_rate": 6.438167835688725e-06, | |
| "loss": 0.5094, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.3149032992036405, | |
| "grad_norm": 1.2603952904899627, | |
| "learning_rate": 6.41939979715055e-06, | |
| "loss": 0.5323, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.3167235494880546, | |
| "grad_norm": 1.2921556438401538, | |
| "learning_rate": 6.400547389620716e-06, | |
| "loss": 0.5554, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.3185437997724687, | |
| "grad_norm": 1.2590564886848532, | |
| "learning_rate": 6.3816112705178984e-06, | |
| "loss": 0.5288, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.32036405005688284, | |
| "grad_norm": 1.3084436554782835, | |
| "learning_rate": 6.362592100179958e-06, | |
| "loss": 0.6402, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.3221843003412969, | |
| "grad_norm": 1.4261334165831296, | |
| "learning_rate": 6.343490541840899e-06, | |
| "loss": 0.489, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.32400455062571104, | |
| "grad_norm": 1.4986503257367303, | |
| "learning_rate": 6.3243072616077535e-06, | |
| "loss": 0.5957, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.32582480091012517, | |
| "grad_norm": 1.2030242787629297, | |
| "learning_rate": 6.3050429284373465e-06, | |
| "loss": 0.4974, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.32764505119453924, | |
| "grad_norm": 1.3717989144113625, | |
| "learning_rate": 6.285698214112974e-06, | |
| "loss": 0.593, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.32946530147895337, | |
| "grad_norm": 1.2021644407962897, | |
| "learning_rate": 6.2662737932209695e-06, | |
| "loss": 0.616, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.33128555176336744, | |
| "grad_norm": 1.281410126238882, | |
| "learning_rate": 6.246770343127185e-06, | |
| "loss": 0.5598, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.33310580204778156, | |
| "grad_norm": 1.39438795848328, | |
| "learning_rate": 6.227188543953368e-06, | |
| "loss": 0.5932, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.33310580204778156, | |
| "eval_accuracy": 0.8092656088844726, | |
| "eval_accuracy_first_token": 0.7541679610645128, | |
| "eval_accuracy_first_token_<": 0.9296587926509187, | |
| "eval_accuracy_first_token_<_total": 1905, | |
| "eval_accuracy_first_token_<|python_tag|>": 0.8752515090543259, | |
| "eval_accuracy_first_token_<|python_tag|>_total": 994, | |
| "eval_accuracy_first_token_Certainly": 0.7024793388429752, | |
| "eval_accuracy_first_token_Certainly_total": 363, | |
| "eval_accuracy_first_token_The": 0.9059161873459326, | |
| "eval_accuracy_first_token_The_total": 2434, | |
| "eval_accuracy_first_token_To": 0.8237179487179487, | |
| "eval_accuracy_first_token_To_total": 936, | |
| "eval_loss": 0.5801064372062683, | |
| "eval_perplexity": 1.1141803737974993, | |
| "eval_runtime": 508.2626, | |
| "eval_samples_per_second": 1.371, | |
| "eval_steps_per_second": 0.687, | |
| "eval_total_number_first_token": 9657, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.3349260523321957, | |
| "grad_norm": 1.3081524231573554, | |
| "learning_rate": 6.207529078553444e-06, | |
| "loss": 0.5457, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.33674630261660976, | |
| "grad_norm": 1.3716078335539046, | |
| "learning_rate": 6.1877926324897085e-06, | |
| "loss": 0.5473, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.3385665529010239, | |
| "grad_norm": 1.4145939624062198, | |
| "learning_rate": 6.16797989400891e-06, | |
| "loss": 0.5786, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.340386803185438, | |
| "grad_norm": 1.2294721126569037, | |
| "learning_rate": 6.148091554018264e-06, | |
| "loss": 0.5902, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.3422070534698521, | |
| "grad_norm": 1.2925378088030424, | |
| "learning_rate": 6.128128306061346e-06, | |
| "loss": 0.5142, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.3440273037542662, | |
| "grad_norm": 1.2279588518524418, | |
| "learning_rate": 6.108090846293915e-06, | |
| "loss": 0.5135, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.34584755403868034, | |
| "grad_norm": 1.268146835786646, | |
| "learning_rate": 6.087979873459634e-06, | |
| "loss": 0.5447, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.3476678043230944, | |
| "grad_norm": 1.318995573559777, | |
| "learning_rate": 6.0677960888657015e-06, | |
| "loss": 0.6744, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.34948805460750854, | |
| "grad_norm": 1.231758642404661, | |
| "learning_rate": 6.047540196358404e-06, | |
| "loss": 0.5809, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.35130830489192266, | |
| "grad_norm": 1.2372891673165372, | |
| "learning_rate": 6.02721290229856e-06, | |
| "loss": 0.5807, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.35312855517633673, | |
| "grad_norm": 1.481210652387573, | |
| "learning_rate": 6.006814915536894e-06, | |
| "loss": 0.5936, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.35494880546075086, | |
| "grad_norm": 1.2305803524181071, | |
| "learning_rate": 5.9863469473893225e-06, | |
| "loss": 0.5438, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.356769055745165, | |
| "grad_norm": 1.2206928256434937, | |
| "learning_rate": 5.965809711612137e-06, | |
| "loss": 0.5005, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.35858930602957906, | |
| "grad_norm": 1.086573502847394, | |
| "learning_rate": 5.945203924377125e-06, | |
| "loss": 0.4889, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.3604095563139932, | |
| "grad_norm": 1.3513470624112347, | |
| "learning_rate": 5.92453030424659e-06, | |
| "loss": 0.5599, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.36222980659840726, | |
| "grad_norm": 1.1113956980921844, | |
| "learning_rate": 5.903789572148294e-06, | |
| "loss": 0.5182, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.3640500568828214, | |
| "grad_norm": 1.4891052629080104, | |
| "learning_rate": 5.88298245135032e-06, | |
| "loss": 0.5716, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.3658703071672355, | |
| "grad_norm": 1.3005774833983796, | |
| "learning_rate": 5.862109667435853e-06, | |
| "loss": 0.5665, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.3676905574516496, | |
| "grad_norm": 1.2151067893045482, | |
| "learning_rate": 5.8411719482778645e-06, | |
| "loss": 0.4965, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.3695108077360637, | |
| "grad_norm": 1.5031392413729012, | |
| "learning_rate": 5.820170024013746e-06, | |
| "loss": 0.5398, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.37133105802047783, | |
| "grad_norm": 1.1627104663425107, | |
| "learning_rate": 5.79910462701984e-06, | |
| "loss": 0.4461, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.3731513083048919, | |
| "grad_norm": 1.3656640622390992, | |
| "learning_rate": 5.777976491885903e-06, | |
| "loss": 0.6048, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.37497155858930603, | |
| "grad_norm": 1.2327820864728312, | |
| "learning_rate": 5.756786355389481e-06, | |
| "loss": 0.5052, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.37679180887372016, | |
| "grad_norm": 1.3098660955959893, | |
| "learning_rate": 5.735534956470232e-06, | |
| "loss": 0.5507, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.37861205915813423, | |
| "grad_norm": 1.2828934352712993, | |
| "learning_rate": 5.714223036204144e-06, | |
| "loss": 0.5973, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.38043230944254836, | |
| "grad_norm": 1.1860097743128348, | |
| "learning_rate": 5.6928513377777e-06, | |
| "loss": 0.4965, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.3822525597269625, | |
| "grad_norm": 1.2517621268060033, | |
| "learning_rate": 5.671420606461956e-06, | |
| "loss": 0.5487, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.38407281001137655, | |
| "grad_norm": 1.271873202647325, | |
| "learning_rate": 5.649931589586557e-06, | |
| "loss": 0.5979, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.3858930602957907, | |
| "grad_norm": 1.2859574150365818, | |
| "learning_rate": 5.628385036513676e-06, | |
| "loss": 0.4776, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.38771331058020475, | |
| "grad_norm": 1.354537404525919, | |
| "learning_rate": 5.606781698611878e-06, | |
| "loss": 0.4877, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.3895335608646189, | |
| "grad_norm": 1.3284236870109494, | |
| "learning_rate": 5.585122329229923e-06, | |
| "loss": 0.5859, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.391353811149033, | |
| "grad_norm": 1.2340130409038237, | |
| "learning_rate": 5.56340768367049e-06, | |
| "loss": 0.5305, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.3931740614334471, | |
| "grad_norm": 1.453487991090255, | |
| "learning_rate": 5.541638519163849e-06, | |
| "loss": 0.55, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.3949943117178612, | |
| "grad_norm": 1.298062739902415, | |
| "learning_rate": 5.51981559484144e-06, | |
| "loss": 0.5169, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.39681456200227533, | |
| "grad_norm": 1.311968473611326, | |
| "learning_rate": 5.49793967170941e-06, | |
| "loss": 0.5751, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.3986348122866894, | |
| "grad_norm": 1.2412899795871963, | |
| "learning_rate": 5.476011512622076e-06, | |
| "loss": 0.6166, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.4004550625711035, | |
| "grad_norm": 1.242116646999028, | |
| "learning_rate": 5.454031882255319e-06, | |
| "loss": 0.5578, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.40227531285551765, | |
| "grad_norm": 1.4168825327120473, | |
| "learning_rate": 5.43200154707992e-06, | |
| "loss": 0.5662, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.4040955631399317, | |
| "grad_norm": 1.426587220049501, | |
| "learning_rate": 5.4099212753348294e-06, | |
| "loss": 0.5169, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.40591581342434585, | |
| "grad_norm": 1.3014887756398712, | |
| "learning_rate": 5.3877918370003806e-06, | |
| "loss": 0.5117, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.40773606370876, | |
| "grad_norm": 1.3406748606110184, | |
| "learning_rate": 5.365614003771439e-06, | |
| "loss": 0.5549, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.40955631399317405, | |
| "grad_norm": 1.2661574936204552, | |
| "learning_rate": 5.343388549030491e-06, | |
| "loss": 0.5163, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.4113765642775882, | |
| "grad_norm": 1.085801335463159, | |
| "learning_rate": 5.321116247820669e-06, | |
| "loss": 0.5244, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.4131968145620023, | |
| "grad_norm": 1.1238570077454868, | |
| "learning_rate": 5.298797876818734e-06, | |
| "loss": 0.4877, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.4150170648464164, | |
| "grad_norm": 1.2501588033198834, | |
| "learning_rate": 5.276434214307986e-06, | |
| "loss": 0.5175, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.4168373151308305, | |
| "grad_norm": 1.1104132675236253, | |
| "learning_rate": 5.2540260401511255e-06, | |
| "loss": 0.4912, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.41865756541524457, | |
| "grad_norm": 1.3469476360006967, | |
| "learning_rate": 5.231574135763052e-06, | |
| "loss": 0.5119, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.4204778156996587, | |
| "grad_norm": 1.3917881004299013, | |
| "learning_rate": 5.209079284083626e-06, | |
| "loss": 0.5893, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.4222980659840728, | |
| "grad_norm": 1.3049155919134754, | |
| "learning_rate": 5.186542269550359e-06, | |
| "loss": 0.5863, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.4241183162684869, | |
| "grad_norm": 1.4688437185729748, | |
| "learning_rate": 5.163963878071058e-06, | |
| "loss": 0.6134, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.425938566552901, | |
| "grad_norm": 1.334594097803803, | |
| "learning_rate": 5.141344896996421e-06, | |
| "loss": 0.4778, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.42775881683731515, | |
| "grad_norm": 1.349733087487026, | |
| "learning_rate": 5.1186861150925844e-06, | |
| "loss": 0.5989, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.4295790671217292, | |
| "grad_norm": 1.1945891754612503, | |
| "learning_rate": 5.09598832251361e-06, | |
| "loss": 0.4466, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.43139931740614335, | |
| "grad_norm": 1.388728814777883, | |
| "learning_rate": 5.073252310773939e-06, | |
| "loss": 0.6193, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.43321956769055747, | |
| "grad_norm": 1.4284168159961905, | |
| "learning_rate": 5.050478872720782e-06, | |
| "loss": 0.5535, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.43503981797497154, | |
| "grad_norm": 1.3786811940208537, | |
| "learning_rate": 5.027668802506476e-06, | |
| "loss": 0.4974, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.43686006825938567, | |
| "grad_norm": 1.2927644778322318, | |
| "learning_rate": 5.004822895560794e-06, | |
| "loss": 0.5029, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.4386803185437998, | |
| "grad_norm": 1.252387489439096, | |
| "learning_rate": 4.981941948563196e-06, | |
| "loss": 0.5278, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.44050056882821387, | |
| "grad_norm": 1.339410297479304, | |
| "learning_rate": 4.959026759415061e-06, | |
| "loss": 0.4939, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.442320819112628, | |
| "grad_norm": 1.364314280772671, | |
| "learning_rate": 4.936078127211849e-06, | |
| "loss": 0.5951, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.4441410693970421, | |
| "grad_norm": 1.3075257340817037, | |
| "learning_rate": 4.913096852215248e-06, | |
| "loss": 0.6049, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.4459613196814562, | |
| "grad_norm": 1.4449459168578944, | |
| "learning_rate": 4.890083735825257e-06, | |
| "loss": 0.6495, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.4477815699658703, | |
| "grad_norm": 1.250520396487667, | |
| "learning_rate": 4.867039580552247e-06, | |
| "loss": 0.5094, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.4496018202502844, | |
| "grad_norm": 1.3729253299629682, | |
| "learning_rate": 4.843965189988969e-06, | |
| "loss": 0.601, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.4514220705346985, | |
| "grad_norm": 1.3588267674574899, | |
| "learning_rate": 4.820861368782537e-06, | |
| "loss": 0.6282, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.45324232081911264, | |
| "grad_norm": 1.2605268123722788, | |
| "learning_rate": 4.79772892260637e-06, | |
| "loss": 0.5305, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.4550625711035267, | |
| "grad_norm": 1.2681522997617236, | |
| "learning_rate": 4.774568658132086e-06, | |
| "loss": 0.5748, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.45688282138794084, | |
| "grad_norm": 1.3533665486552986, | |
| "learning_rate": 4.751381383001386e-06, | |
| "loss": 0.4689, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.45870307167235497, | |
| "grad_norm": 1.267434752321521, | |
| "learning_rate": 4.728167905797877e-06, | |
| "loss": 0.534, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.46052332195676904, | |
| "grad_norm": 1.2225430366963792, | |
| "learning_rate": 4.7049290360188875e-06, | |
| "loss": 0.5003, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.46234357224118316, | |
| "grad_norm": 1.3388181688742944, | |
| "learning_rate": 4.681665584047227e-06, | |
| "loss": 0.5219, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.4641638225255973, | |
| "grad_norm": 1.3028544028541067, | |
| "learning_rate": 4.658378361122936e-06, | |
| "loss": 0.5452, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.46598407281001136, | |
| "grad_norm": 1.157965088527389, | |
| "learning_rate": 4.6350681793149884e-06, | |
| "loss": 0.5229, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.4678043230944255, | |
| "grad_norm": 1.3045701677810966, | |
| "learning_rate": 4.611735851492984e-06, | |
| "loss": 0.5728, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.4696245733788396, | |
| "grad_norm": 1.2850295730378811, | |
| "learning_rate": 4.588382191298787e-06, | |
| "loss": 0.5537, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.4714448236632537, | |
| "grad_norm": 1.1541773263319788, | |
| "learning_rate": 4.5650080131181675e-06, | |
| "loss": 0.538, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.4732650739476678, | |
| "grad_norm": 1.2910982881953428, | |
| "learning_rate": 4.541614132052393e-06, | |
| "loss": 0.5612, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.4750853242320819, | |
| "grad_norm": 1.3654547247389965, | |
| "learning_rate": 4.51820136388981e-06, | |
| "loss": 0.4475, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.476905574516496, | |
| "grad_norm": 1.2698111645568326, | |
| "learning_rate": 4.494770525077391e-06, | |
| "loss": 0.5621, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.47872582480091014, | |
| "grad_norm": 1.3935553561279155, | |
| "learning_rate": 4.4713224326922655e-06, | |
| "loss": 0.599, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.4805460750853242, | |
| "grad_norm": 1.3156202456183232, | |
| "learning_rate": 4.447857904413231e-06, | |
| "loss": 0.532, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.48236632536973834, | |
| "grad_norm": 1.1337315757674442, | |
| "learning_rate": 4.424377758492232e-06, | |
| "loss": 0.5353, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.48418657565415246, | |
| "grad_norm": 1.4101220722381445, | |
| "learning_rate": 4.40088281372583e-06, | |
| "loss": 0.5006, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.48600682593856653, | |
| "grad_norm": 1.2300783759396936, | |
| "learning_rate": 4.377373889426649e-06, | |
| "loss": 0.5438, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.48782707622298066, | |
| "grad_norm": 1.17585511931677, | |
| "learning_rate": 4.353851805394808e-06, | |
| "loss": 0.5369, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.4896473265073948, | |
| "grad_norm": 1.2115284188121456, | |
| "learning_rate": 4.33031738188933e-06, | |
| "loss": 0.4524, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.49146757679180886, | |
| "grad_norm": 1.4317839486717954, | |
| "learning_rate": 4.306771439599534e-06, | |
| "loss": 0.6436, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.493287827076223, | |
| "grad_norm": 1.4621543028546877, | |
| "learning_rate": 4.283214799616428e-06, | |
| "loss": 0.4368, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.4951080773606371, | |
| "grad_norm": 1.2936498719418335, | |
| "learning_rate": 4.259648283404062e-06, | |
| "loss": 0.5541, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.4969283276450512, | |
| "grad_norm": 1.120893056901481, | |
| "learning_rate": 4.236072712770891e-06, | |
| "loss": 0.5822, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.4987485779294653, | |
| "grad_norm": 1.429920580926913, | |
| "learning_rate": 4.2124889098411175e-06, | |
| "loss": 0.5302, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.5005688282138794, | |
| "grad_norm": 1.312546385183068, | |
| "learning_rate": 4.1888976970260135e-06, | |
| "loss": 0.4835, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.5023890784982935, | |
| "grad_norm": 1.3107231912852029, | |
| "learning_rate": 4.165299896995252e-06, | |
| "loss": 0.5421, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.5042093287827076, | |
| "grad_norm": 1.232720241327702, | |
| "learning_rate": 4.141696332648216e-06, | |
| "loss": 0.5012, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.5060295790671218, | |
| "grad_norm": 1.2132932647482422, | |
| "learning_rate": 4.118087827085294e-06, | |
| "loss": 0.5463, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.5078498293515359, | |
| "grad_norm": 1.270716742837696, | |
| "learning_rate": 4.094475203579191e-06, | |
| "loss": 0.5383, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.5096700796359499, | |
| "grad_norm": 1.2438093689244545, | |
| "learning_rate": 4.070859285546209e-06, | |
| "loss": 0.5556, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.511490329920364, | |
| "grad_norm": 1.3516997697288733, | |
| "learning_rate": 4.047240896517539e-06, | |
| "loss": 0.6018, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.5133105802047782, | |
| "grad_norm": 1.2439174788361766, | |
| "learning_rate": 4.023620860110533e-06, | |
| "loss": 0.4133, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.5151308304891923, | |
| "grad_norm": 1.3778918541026397, | |
| "learning_rate": 4e-06, | |
| "loss": 0.6007, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.5169510807736064, | |
| "grad_norm": 1.2268835899477202, | |
| "learning_rate": 3.976379139889467e-06, | |
| "loss": 0.5331, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.5187713310580204, | |
| "grad_norm": 1.3429317349537357, | |
| "learning_rate": 3.9527591034824616e-06, | |
| "loss": 0.5311, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.5205915813424346, | |
| "grad_norm": 1.2134030361634403, | |
| "learning_rate": 3.929140714453791e-06, | |
| "loss": 0.481, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.5224118316268487, | |
| "grad_norm": 1.2462618006711519, | |
| "learning_rate": 3.9055247964208075e-06, | |
| "loss": 0.5273, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.5242320819112628, | |
| "grad_norm": 1.5559356945744065, | |
| "learning_rate": 3.8819121729147055e-06, | |
| "loss": 0.6021, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.5260523321956769, | |
| "grad_norm": 1.3832902722589653, | |
| "learning_rate": 3.8583036673517845e-06, | |
| "loss": 0.4454, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.5278725824800911, | |
| "grad_norm": 1.3978629990846738, | |
| "learning_rate": 3.834700103004747e-06, | |
| "loss": 0.5124, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.5296928327645051, | |
| "grad_norm": 1.2666994035299775, | |
| "learning_rate": 3.8111023029739866e-06, | |
| "loss": 0.4667, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.5315130830489192, | |
| "grad_norm": 1.3305625183535323, | |
| "learning_rate": 3.787511090158884e-06, | |
| "loss": 0.5368, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.5333333333333333, | |
| "grad_norm": 1.2554353067602742, | |
| "learning_rate": 3.763927287229109e-06, | |
| "loss": 0.5499, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.5351535836177475, | |
| "grad_norm": 1.307288214215709, | |
| "learning_rate": 3.740351716595939e-06, | |
| "loss": 0.5055, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.5369738339021616, | |
| "grad_norm": 1.3279514539943822, | |
| "learning_rate": 3.7167852003835723e-06, | |
| "loss": 0.511, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.5387940841865757, | |
| "grad_norm": 1.223828271060312, | |
| "learning_rate": 3.6932285604004656e-06, | |
| "loss": 0.4595, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.5406143344709897, | |
| "grad_norm": 1.332815030257366, | |
| "learning_rate": 3.669682618110671e-06, | |
| "loss": 0.6227, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.5424345847554038, | |
| "grad_norm": 1.2271598713957907, | |
| "learning_rate": 3.646148194605191e-06, | |
| "loss": 0.5925, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.544254835039818, | |
| "grad_norm": 1.2856112828358344, | |
| "learning_rate": 3.622626110573351e-06, | |
| "loss": 0.4888, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.5460750853242321, | |
| "grad_norm": 1.2933732045646906, | |
| "learning_rate": 3.5991171862741713e-06, | |
| "loss": 0.5072, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.5478953356086462, | |
| "grad_norm": 1.928545669129003, | |
| "learning_rate": 3.575622241507768e-06, | |
| "loss": 0.525, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.5497155858930602, | |
| "grad_norm": 1.1113802518291283, | |
| "learning_rate": 3.5521420955867683e-06, | |
| "loss": 0.5977, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.5515358361774744, | |
| "grad_norm": 1.3617289948905469, | |
| "learning_rate": 3.5286775673077332e-06, | |
| "loss": 0.5839, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.5533560864618885, | |
| "grad_norm": 1.391130826033813, | |
| "learning_rate": 3.505229474922609e-06, | |
| "loss": 0.5181, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.5551763367463026, | |
| "grad_norm": 1.2316652646361441, | |
| "learning_rate": 3.481798636110191e-06, | |
| "loss": 0.4945, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.5569965870307167, | |
| "grad_norm": 1.2873086430494702, | |
| "learning_rate": 3.458385867947607e-06, | |
| "loss": 0.4924, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.5588168373151309, | |
| "grad_norm": 1.430369532857129, | |
| "learning_rate": 3.434991986881833e-06, | |
| "loss": 0.4821, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.5606370875995449, | |
| "grad_norm": 1.1646260708946579, | |
| "learning_rate": 3.4116178087012136e-06, | |
| "loss": 0.5052, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.562457337883959, | |
| "grad_norm": 1.2327288356772756, | |
| "learning_rate": 3.388264148507016e-06, | |
| "loss": 0.5057, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.5642775881683731, | |
| "grad_norm": 1.3536008520463172, | |
| "learning_rate": 3.3649318206850116e-06, | |
| "loss": 0.5178, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.5660978384527873, | |
| "grad_norm": 1.1896041830424324, | |
| "learning_rate": 3.3416216388770635e-06, | |
| "loss": 0.5417, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.5679180887372014, | |
| "grad_norm": 1.2975919761075365, | |
| "learning_rate": 3.3183344159527736e-06, | |
| "loss": 0.5234, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.5697383390216155, | |
| "grad_norm": 1.1688323545338841, | |
| "learning_rate": 3.2950709639811134e-06, | |
| "loss": 0.4888, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.5715585893060295, | |
| "grad_norm": 1.3939258642019638, | |
| "learning_rate": 3.271832094202123e-06, | |
| "loss": 0.5183, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.5733788395904437, | |
| "grad_norm": 1.1897624823605304, | |
| "learning_rate": 3.2486186169986153e-06, | |
| "loss": 0.5454, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.5751990898748578, | |
| "grad_norm": 1.1944714828344472, | |
| "learning_rate": 3.2254313418679154e-06, | |
| "loss": 0.4807, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.5770193401592719, | |
| "grad_norm": 1.2256094296723554, | |
| "learning_rate": 3.2022710773936304e-06, | |
| "loss": 0.5223, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.578839590443686, | |
| "grad_norm": 1.2296819521179183, | |
| "learning_rate": 3.1791386312174633e-06, | |
| "loss": 0.4951, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.5806598407281001, | |
| "grad_norm": 1.4525163093513003, | |
| "learning_rate": 3.1560348100110315e-06, | |
| "loss": 0.4874, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.5824800910125142, | |
| "grad_norm": 1.4766150215295402, | |
| "learning_rate": 3.1329604194477535e-06, | |
| "loss": 0.5186, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.5843003412969283, | |
| "grad_norm": 1.5620410502792905, | |
| "learning_rate": 3.1099162641747427e-06, | |
| "loss": 0.5542, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.5861205915813424, | |
| "grad_norm": 1.1792003162185065, | |
| "learning_rate": 3.0869031477847507e-06, | |
| "loss": 0.4751, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.5879408418657566, | |
| "grad_norm": 1.4371388899245734, | |
| "learning_rate": 3.0639218727881508e-06, | |
| "loss": 0.5066, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.5897610921501707, | |
| "grad_norm": 1.3321828422036859, | |
| "learning_rate": 3.04097324058494e-06, | |
| "loss": 0.418, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.5915813424345847, | |
| "grad_norm": 1.2642329608748821, | |
| "learning_rate": 3.0180580514368034e-06, | |
| "loss": 0.6167, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.5934015927189988, | |
| "grad_norm": 1.1538255682096556, | |
| "learning_rate": 2.9951771044392066e-06, | |
| "loss": 0.5297, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.595221843003413, | |
| "grad_norm": 1.2453988044078719, | |
| "learning_rate": 2.972331197493523e-06, | |
| "loss": 0.4552, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.5970420932878271, | |
| "grad_norm": 1.3576907607149231, | |
| "learning_rate": 2.949521127279218e-06, | |
| "loss": 0.5003, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.5988623435722412, | |
| "grad_norm": 1.3497348777364608, | |
| "learning_rate": 2.926747689226062e-06, | |
| "loss": 0.5561, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.6006825938566553, | |
| "grad_norm": 1.259164100959422, | |
| "learning_rate": 2.9040116774863896e-06, | |
| "loss": 0.4856, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.6025028441410694, | |
| "grad_norm": 1.2529485220686642, | |
| "learning_rate": 2.881313884907416e-06, | |
| "loss": 0.5575, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.6043230944254835, | |
| "grad_norm": 1.0669208953569564, | |
| "learning_rate": 2.8586551030035797e-06, | |
| "loss": 0.4644, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.6061433447098976, | |
| "grad_norm": 1.3366563458096783, | |
| "learning_rate": 2.836036121928942e-06, | |
| "loss": 0.453, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.6079635949943117, | |
| "grad_norm": 1.1476061284968695, | |
| "learning_rate": 2.813457730449641e-06, | |
| "loss": 0.4207, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.6097838452787259, | |
| "grad_norm": 1.3411031407126155, | |
| "learning_rate": 2.790920715916372e-06, | |
| "loss": 0.5404, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.6116040955631399, | |
| "grad_norm": 1.1482077310699785, | |
| "learning_rate": 2.7684258642369484e-06, | |
| "loss": 0.5205, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.613424345847554, | |
| "grad_norm": 1.42649538957105, | |
| "learning_rate": 2.7459739598488762e-06, | |
| "loss": 0.5013, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.6152445961319681, | |
| "grad_norm": 1.2243282105614175, | |
| "learning_rate": 2.723565785692013e-06, | |
| "loss": 0.5464, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.6170648464163823, | |
| "grad_norm": 1.2484628027395077, | |
| "learning_rate": 2.701202123181266e-06, | |
| "loss": 0.5519, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.6188850967007964, | |
| "grad_norm": 1.3240902412697022, | |
| "learning_rate": 2.6788837521793328e-06, | |
| "loss": 0.5205, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.6207053469852105, | |
| "grad_norm": 1.2873575493742448, | |
| "learning_rate": 2.6566114509695096e-06, | |
| "loss": 0.4761, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.6225255972696245, | |
| "grad_norm": 1.2093203753299095, | |
| "learning_rate": 2.634385996228561e-06, | |
| "loss": 0.4753, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.6243458475540387, | |
| "grad_norm": 1.300179853101682, | |
| "learning_rate": 2.6122081629996195e-06, | |
| "loss": 0.4934, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.6261660978384528, | |
| "grad_norm": 1.2890047749069995, | |
| "learning_rate": 2.5900787246651715e-06, | |
| "loss": 0.4873, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.6279863481228669, | |
| "grad_norm": 1.4341436462807016, | |
| "learning_rate": 2.567998452920081e-06, | |
| "loss": 0.5213, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.629806598407281, | |
| "grad_norm": 1.299103431190263, | |
| "learning_rate": 2.5459681177446797e-06, | |
| "loss": 0.4783, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.631626848691695, | |
| "grad_norm": 1.331377030608932, | |
| "learning_rate": 2.523988487377924e-06, | |
| "loss": 0.5045, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.6334470989761092, | |
| "grad_norm": 1.3571947037370755, | |
| "learning_rate": 2.50206032829059e-06, | |
| "loss": 0.5005, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.6352673492605233, | |
| "grad_norm": 1.5380704302051296, | |
| "learning_rate": 2.4801844051585604e-06, | |
| "loss": 0.5238, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.6370875995449374, | |
| "grad_norm": 1.3042046637167102, | |
| "learning_rate": 2.4583614808361508e-06, | |
| "loss": 0.4785, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.6389078498293516, | |
| "grad_norm": 1.1803375558623432, | |
| "learning_rate": 2.4365923163295083e-06, | |
| "loss": 0.5518, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.6407281001137657, | |
| "grad_norm": 1.1305498748692666, | |
| "learning_rate": 2.4148776707700775e-06, | |
| "loss": 0.4627, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.6425483503981797, | |
| "grad_norm": 1.3696351562191598, | |
| "learning_rate": 2.393218301388123e-06, | |
| "loss": 0.4714, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.6443686006825938, | |
| "grad_norm": 1.1646766669675297, | |
| "learning_rate": 2.3716149634863244e-06, | |
| "loss": 0.461, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.646188850967008, | |
| "grad_norm": 1.432326181786707, | |
| "learning_rate": 2.3500684104134433e-06, | |
| "loss": 0.4775, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.6480091012514221, | |
| "grad_norm": 1.2811226649238618, | |
| "learning_rate": 2.328579393538046e-06, | |
| "loss": 0.4473, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.6498293515358362, | |
| "grad_norm": 1.2847418645420832, | |
| "learning_rate": 2.3071486622223e-06, | |
| "loss": 0.473, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.6516496018202503, | |
| "grad_norm": 1.1991535525500763, | |
| "learning_rate": 2.2857769637958554e-06, | |
| "loss": 0.4548, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.6534698521046644, | |
| "grad_norm": 1.3510869929117142, | |
| "learning_rate": 2.2644650435297675e-06, | |
| "loss": 0.474, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.6552901023890785, | |
| "grad_norm": 1.2247454838152558, | |
| "learning_rate": 2.243213644610519e-06, | |
| "loss": 0.4063, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.6571103526734926, | |
| "grad_norm": 1.224682187747472, | |
| "learning_rate": 2.2220235081140985e-06, | |
| "loss": 0.5137, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.6589306029579067, | |
| "grad_norm": 1.5257557938450914, | |
| "learning_rate": 2.2008953729801583e-06, | |
| "loss": 0.4591, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.6607508532423209, | |
| "grad_norm": 1.221121328273825, | |
| "learning_rate": 2.1798299759862545e-06, | |
| "loss": 0.5614, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.6625711035267349, | |
| "grad_norm": 1.2199344648755224, | |
| "learning_rate": 2.158828051722137e-06, | |
| "loss": 0.5104, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.664391353811149, | |
| "grad_norm": 1.3052741269665118, | |
| "learning_rate": 2.137890332564147e-06, | |
| "loss": 0.4732, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.6662116040955631, | |
| "grad_norm": 1.1647023893060888, | |
| "learning_rate": 2.117017548649678e-06, | |
| "loss": 0.5229, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.6662116040955631, | |
| "eval_accuracy": 0.8129295561130228, | |
| "eval_accuracy_first_token": 0.7684581132856995, | |
| "eval_accuracy_first_token_<": 0.9595800524934384, | |
| "eval_accuracy_first_token_<_total": 1905, | |
| "eval_accuracy_first_token_<|python_tag|>": 0.9094567404426559, | |
| "eval_accuracy_first_token_<|python_tag|>_total": 994, | |
| "eval_accuracy_first_token_Certainly": 0.7741046831955923, | |
| "eval_accuracy_first_token_Certainly_total": 363, | |
| "eval_accuracy_first_token_The": 0.8948233360723089, | |
| "eval_accuracy_first_token_The_total": 2434, | |
| "eval_accuracy_first_token_To": 0.8044871794871795, | |
| "eval_accuracy_first_token_To_total": 936, | |
| "eval_loss": 0.5655013918876648, | |
| "eval_perplexity": 1.1114110979501997, | |
| "eval_runtime": 507.2948, | |
| "eval_samples_per_second": 1.374, | |
| "eval_steps_per_second": 0.688, | |
| "eval_total_number_first_token": 9657, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.6680318543799773, | |
| "grad_norm": 1.2605323109478153, | |
| "learning_rate": 2.0962104278517058e-06, | |
| "loss": 0.4634, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.6698521046643914, | |
| "grad_norm": 1.153711484102447, | |
| "learning_rate": 2.0754696957534105e-06, | |
| "loss": 0.4578, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.6716723549488055, | |
| "grad_norm": 1.4112272127644152, | |
| "learning_rate": 2.0547960756228746e-06, | |
| "loss": 0.5903, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.6734926052332195, | |
| "grad_norm": 1.3058143917601592, | |
| "learning_rate": 2.0341902883878626e-06, | |
| "loss": 0.4261, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.6753128555176336, | |
| "grad_norm": 1.241032329122879, | |
| "learning_rate": 2.013653052610678e-06, | |
| "loss": 0.4901, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.6771331058020478, | |
| "grad_norm": 1.134116834066691, | |
| "learning_rate": 1.993185084463106e-06, | |
| "loss": 0.5478, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.6789533560864619, | |
| "grad_norm": 1.2621524843864569, | |
| "learning_rate": 1.97278709770144e-06, | |
| "loss": 0.4521, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.680773606370876, | |
| "grad_norm": 1.2445963074217277, | |
| "learning_rate": 1.952459803641597e-06, | |
| "loss": 0.5048, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.6825938566552902, | |
| "grad_norm": 1.3367185945909759, | |
| "learning_rate": 1.9322039111342977e-06, | |
| "loss": 0.4859, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.6844141069397042, | |
| "grad_norm": 1.502198228897516, | |
| "learning_rate": 1.912020126540366e-06, | |
| "loss": 0.5483, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.6862343572241183, | |
| "grad_norm": 1.5682296957615942, | |
| "learning_rate": 1.8919091537060847e-06, | |
| "loss": 0.5403, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.6880546075085324, | |
| "grad_norm": 1.2186683041461865, | |
| "learning_rate": 1.8718716939386541e-06, | |
| "loss": 0.4953, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.6898748577929465, | |
| "grad_norm": 1.2104649746142353, | |
| "learning_rate": 1.8519084459817362e-06, | |
| "loss": 0.4599, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.6916951080773607, | |
| "grad_norm": 1.1390426306451955, | |
| "learning_rate": 1.83202010599109e-06, | |
| "loss": 0.4164, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.6935153583617747, | |
| "grad_norm": 1.2956325376708957, | |
| "learning_rate": 1.8122073675102932e-06, | |
| "loss": 0.5417, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.6953356086461888, | |
| "grad_norm": 1.1586136644085798, | |
| "learning_rate": 1.792470921446557e-06, | |
| "loss": 0.4365, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.697155858930603, | |
| "grad_norm": 1.1975210529143365, | |
| "learning_rate": 1.7728114560466324e-06, | |
| "loss": 0.4956, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.6989761092150171, | |
| "grad_norm": 1.5675227569116297, | |
| "learning_rate": 1.753229656872815e-06, | |
| "loss": 0.4646, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.7007963594994312, | |
| "grad_norm": 1.1981622083221466, | |
| "learning_rate": 1.7337262067790319e-06, | |
| "loss": 0.5042, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.7026166097838453, | |
| "grad_norm": 1.291822326824022, | |
| "learning_rate": 1.7143017858870259e-06, | |
| "loss": 0.5786, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.7044368600682593, | |
| "grad_norm": 1.3381873610330526, | |
| "learning_rate": 1.6949570715626532e-06, | |
| "loss": 0.3987, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.7062571103526735, | |
| "grad_norm": 1.5233756050791378, | |
| "learning_rate": 1.675692738392247e-06, | |
| "loss": 0.5373, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.7080773606370876, | |
| "grad_norm": 1.405379762218711, | |
| "learning_rate": 1.6565094581591015e-06, | |
| "loss": 0.5151, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.7098976109215017, | |
| "grad_norm": 1.3827588130238773, | |
| "learning_rate": 1.6374078998200424e-06, | |
| "loss": 0.4868, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.7117178612059158, | |
| "grad_norm": 1.3281467896725871, | |
| "learning_rate": 1.6183887294820995e-06, | |
| "loss": 0.4892, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.71353811149033, | |
| "grad_norm": 1.5562464103926885, | |
| "learning_rate": 1.5994526103792852e-06, | |
| "loss": 0.5977, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.715358361774744, | |
| "grad_norm": 1.2645130650718202, | |
| "learning_rate": 1.5806002028494509e-06, | |
| "loss": 0.4245, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.7171786120591581, | |
| "grad_norm": 1.3281593922925885, | |
| "learning_rate": 1.5618321643112738e-06, | |
| "loss": 0.5813, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.7189988623435722, | |
| "grad_norm": 1.1215366227811656, | |
| "learning_rate": 1.5431491492413286e-06, | |
| "loss": 0.4276, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.7208191126279864, | |
| "grad_norm": 1.3212838118308114, | |
| "learning_rate": 1.52455180915126e-06, | |
| "loss": 0.5774, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.7226393629124005, | |
| "grad_norm": 1.2852914600481689, | |
| "learning_rate": 1.506040792565066e-06, | |
| "loss": 0.5057, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.7244596131968145, | |
| "grad_norm": 1.280275275618163, | |
| "learning_rate": 1.487616744996484e-06, | |
| "loss": 0.444, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.7262798634812286, | |
| "grad_norm": 1.1583238977099228, | |
| "learning_rate": 1.4692803089264772e-06, | |
| "loss": 0.5377, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.7281001137656428, | |
| "grad_norm": 1.435157708312753, | |
| "learning_rate": 1.4510321237808377e-06, | |
| "loss": 0.5444, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.7299203640500569, | |
| "grad_norm": 1.3208185752900872, | |
| "learning_rate": 1.4328728259078746e-06, | |
| "loss": 0.5566, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.731740614334471, | |
| "grad_norm": 1.2130339190915678, | |
| "learning_rate": 1.414803048556236e-06, | |
| "loss": 0.4988, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.7335608646188851, | |
| "grad_norm": 1.1363530661008532, | |
| "learning_rate": 1.396823421852825e-06, | |
| "loss": 0.6129, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.7353811149032992, | |
| "grad_norm": 1.3222588910481998, | |
| "learning_rate": 1.3789345727808207e-06, | |
| "loss": 0.546, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.7372013651877133, | |
| "grad_norm": 1.3949194783709729, | |
| "learning_rate": 1.3611371251578114e-06, | |
| "loss": 0.5583, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.7390216154721274, | |
| "grad_norm": 1.2917335175784925, | |
| "learning_rate": 1.3434316996140553e-06, | |
| "loss": 0.5151, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.7408418657565415, | |
| "grad_norm": 1.2895735708732046, | |
| "learning_rate": 1.3258189135708229e-06, | |
| "loss": 0.5098, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.7426621160409557, | |
| "grad_norm": 1.2978294874532978, | |
| "learning_rate": 1.3082993812188735e-06, | |
| "loss": 0.5414, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.7444823663253698, | |
| "grad_norm": 1.2095221030821062, | |
| "learning_rate": 1.2908737134970364e-06, | |
| "loss": 0.5268, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.7463026166097838, | |
| "grad_norm": 1.3840563503977592, | |
| "learning_rate": 1.2735425180709039e-06, | |
| "loss": 0.479, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.7481228668941979, | |
| "grad_norm": 1.2789076883026242, | |
| "learning_rate": 1.2563063993116482e-06, | |
| "loss": 0.5503, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.7499431171786121, | |
| "grad_norm": 1.283587802005637, | |
| "learning_rate": 1.239165958274933e-06, | |
| "loss": 0.4113, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.7517633674630262, | |
| "grad_norm": 1.2909165266250262, | |
| "learning_rate": 1.2221217926799652e-06, | |
| "loss": 0.535, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.7535836177474403, | |
| "grad_norm": 1.3531455484884616, | |
| "learning_rate": 1.2051744968886489e-06, | |
| "loss": 0.5052, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.7554038680318543, | |
| "grad_norm": 1.2730404093480168, | |
| "learning_rate": 1.1883246618848533e-06, | |
| "loss": 0.4566, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.7572241183162685, | |
| "grad_norm": 1.37616764437592, | |
| "learning_rate": 1.1715728752538101e-06, | |
| "loss": 0.566, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.7590443686006826, | |
| "grad_norm": 1.1512441975212944, | |
| "learning_rate": 1.1549197211616203e-06, | |
| "loss": 0.5044, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.7608646188850967, | |
| "grad_norm": 1.2438970988598956, | |
| "learning_rate": 1.1383657803348835e-06, | |
| "loss": 0.5109, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.7626848691695108, | |
| "grad_norm": 1.5233735431446764, | |
| "learning_rate": 1.1219116300404486e-06, | |
| "loss": 0.507, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.764505119453925, | |
| "grad_norm": 1.3253161212074762, | |
| "learning_rate": 1.10555784406528e-06, | |
| "loss": 0.5082, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.766325369738339, | |
| "grad_norm": 1.1775521474516462, | |
| "learning_rate": 1.089304992696455e-06, | |
| "loss": 0.46, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.7681456200227531, | |
| "grad_norm": 1.2462962157301152, | |
| "learning_rate": 1.0731536427012695e-06, | |
| "loss": 0.5253, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.7699658703071672, | |
| "grad_norm": 1.3347631673740097, | |
| "learning_rate": 1.0571043573074736e-06, | |
| "loss": 0.4449, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.7717861205915814, | |
| "grad_norm": 1.292727758187721, | |
| "learning_rate": 1.041157696183641e-06, | |
| "loss": 0.441, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.7736063708759955, | |
| "grad_norm": 1.293278742294603, | |
| "learning_rate": 1.0253142154196415e-06, | |
| "loss": 0.4867, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.7754266211604095, | |
| "grad_norm": 1.2102494852297525, | |
| "learning_rate": 1.0095744675072525e-06, | |
| "loss": 0.4898, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.7772468714448236, | |
| "grad_norm": 1.224313028246693, | |
| "learning_rate": 9.93939001320895e-07, | |
| "loss": 0.4686, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.7790671217292378, | |
| "grad_norm": 1.3632517015375165, | |
| "learning_rate": 9.784083620984884e-07, | |
| "loss": 0.4639, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.7808873720136519, | |
| "grad_norm": 1.3987002856426751, | |
| "learning_rate": 9.62983091422446e-07, | |
| "loss": 0.4528, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.782707622298066, | |
| "grad_norm": 1.329331750067852, | |
| "learning_rate": 9.476637272007746e-07, | |
| "loss": 0.4562, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.7845278725824801, | |
| "grad_norm": 1.4216744583623766, | |
| "learning_rate": 9.324508036483303e-07, | |
| "loss": 0.4622, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.7863481228668942, | |
| "grad_norm": 1.3060911776176307, | |
| "learning_rate": 9.173448512681848e-07, | |
| "loss": 0.5405, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.7881683731513083, | |
| "grad_norm": 1.3971532684012182, | |
| "learning_rate": 9.023463968331238e-07, | |
| "loss": 0.4642, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.7899886234357224, | |
| "grad_norm": 1.351332971443725, | |
| "learning_rate": 8.874559633672754e-07, | |
| "loss": 0.4146, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.7918088737201365, | |
| "grad_norm": 1.2506853747891504, | |
| "learning_rate": 8.726740701278808e-07, | |
| "loss": 0.5233, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.7936291240045507, | |
| "grad_norm": 1.2588296359051319, | |
| "learning_rate": 8.580012325871773e-07, | |
| "loss": 0.5196, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.7954493742889648, | |
| "grad_norm": 1.3656683873360818, | |
| "learning_rate": 8.434379624144261e-07, | |
| "loss": 0.4426, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.7972696245733788, | |
| "grad_norm": 1.3415371986074633, | |
| "learning_rate": 8.289847674580702e-07, | |
| "loss": 0.5025, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.7990898748577929, | |
| "grad_norm": 1.210310044679145, | |
| "learning_rate": 8.146421517280226e-07, | |
| "loss": 0.4922, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.800910125142207, | |
| "grad_norm": 1.675036054936253, | |
| "learning_rate": 8.004106153780967e-07, | |
| "loss": 0.4396, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.8027303754266212, | |
| "grad_norm": 1.1849449434556916, | |
| "learning_rate": 7.862906546885559e-07, | |
| "loss": 0.5348, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.8045506257110353, | |
| "grad_norm": 1.3294402423567042, | |
| "learning_rate": 7.722827620488108e-07, | |
| "loss": 0.4472, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.8063708759954493, | |
| "grad_norm": 1.2341888202472633, | |
| "learning_rate": 7.583874259402545e-07, | |
| "loss": 0.4926, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.8081911262798634, | |
| "grad_norm": 1.3727750069417188, | |
| "learning_rate": 7.446051309192203e-07, | |
| "loss": 0.5187, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.8100113765642776, | |
| "grad_norm": 1.1665673148184286, | |
| "learning_rate": 7.30936357600088e-07, | |
| "loss": 0.4459, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.8118316268486917, | |
| "grad_norm": 1.4461908262228584, | |
| "learning_rate": 7.173815826385246e-07, | |
| "loss": 0.5931, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.8136518771331058, | |
| "grad_norm": 1.2164762112018974, | |
| "learning_rate": 7.039412787148586e-07, | |
| "loss": 0.5769, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.81547212741752, | |
| "grad_norm": 1.3268169931538385, | |
| "learning_rate": 6.906159145176049e-07, | |
| "loss": 0.4962, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.817292377701934, | |
| "grad_norm": 1.3585533527783662, | |
| "learning_rate": 6.774059547271087e-07, | |
| "loss": 0.5011, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.8191126279863481, | |
| "grad_norm": 1.2715237655057547, | |
| "learning_rate": 6.643118599993518e-07, | |
| "loss": 0.4591, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.8209328782707622, | |
| "grad_norm": 1.1129340141314334, | |
| "learning_rate": 6.513340869498858e-07, | |
| "loss": 0.4818, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.8227531285551763, | |
| "grad_norm": 1.1993408265317198, | |
| "learning_rate": 6.384730881379048e-07, | |
| "loss": 0.4826, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.8245733788395905, | |
| "grad_norm": 1.305009025174831, | |
| "learning_rate": 6.257293120504692e-07, | |
| "loss": 0.4824, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.8263936291240046, | |
| "grad_norm": 1.2620160974509798, | |
| "learning_rate": 6.131032030868635e-07, | |
| "loss": 0.4479, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.8282138794084186, | |
| "grad_norm": 1.2693469945741236, | |
| "learning_rate": 6.005952015430993e-07, | |
| "loss": 0.5286, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.8300341296928327, | |
| "grad_norm": 1.2953927032105943, | |
| "learning_rate": 5.882057435965619e-07, | |
| "loss": 0.5802, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.8318543799772469, | |
| "grad_norm": 1.3055790274997285, | |
| "learning_rate": 5.759352612907999e-07, | |
| "loss": 0.5273, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.833674630261661, | |
| "grad_norm": 1.3009913306704852, | |
| "learning_rate": 5.637841825204588e-07, | |
| "loss": 0.4434, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.8354948805460751, | |
| "grad_norm": 1.3010149850935786, | |
| "learning_rate": 5.517529310163627e-07, | |
| "loss": 0.5302, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.8373151308304891, | |
| "grad_norm": 1.1588504398899486, | |
| "learning_rate": 5.398419263307281e-07, | |
| "loss": 0.4898, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.8391353811149033, | |
| "grad_norm": 1.427556447905731, | |
| "learning_rate": 5.280515838225477e-07, | |
| "loss": 0.4583, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.8409556313993174, | |
| "grad_norm": 1.3382828189315212, | |
| "learning_rate": 5.163823146430944e-07, | |
| "loss": 0.4544, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.8427758816837315, | |
| "grad_norm": 1.3048820751365628, | |
| "learning_rate": 5.048345257215892e-07, | |
| "loss": 0.5348, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.8445961319681456, | |
| "grad_norm": 1.3464339683482869, | |
| "learning_rate": 4.934086197510088e-07, | |
| "loss": 0.4866, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.8464163822525598, | |
| "grad_norm": 1.3076973707605393, | |
| "learning_rate": 4.821049951740441e-07, | |
| "loss": 0.4374, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.8482366325369738, | |
| "grad_norm": 1.207783472984328, | |
| "learning_rate": 4.7092404616920547e-07, | |
| "loss": 0.5268, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.8500568828213879, | |
| "grad_norm": 1.3340034898150066, | |
| "learning_rate": 4.59866162637077e-07, | |
| "loss": 0.5163, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 0.851877133105802, | |
| "grad_norm": 1.2793323359204207, | |
| "learning_rate": 4.4893173018671816e-07, | |
| "loss": 0.464, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.8536973833902162, | |
| "grad_norm": 1.3875887367624027, | |
| "learning_rate": 4.3812113012222164e-07, | |
| "loss": 0.5605, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.8555176336746303, | |
| "grad_norm": 1.2752397131609516, | |
| "learning_rate": 4.2743473942941177e-07, | |
| "loss": 0.5166, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.8573378839590444, | |
| "grad_norm": 1.361832548391048, | |
| "learning_rate": 4.168729307626977e-07, | |
| "loss": 0.4494, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.8591581342434584, | |
| "grad_norm": 1.3313280628055624, | |
| "learning_rate": 4.0643607243208455e-07, | |
| "loss": 0.4531, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.8609783845278726, | |
| "grad_norm": 1.238927541446331, | |
| "learning_rate": 3.9612452839032384e-07, | |
| "loss": 0.4629, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 0.8627986348122867, | |
| "grad_norm": 1.440299941933543, | |
| "learning_rate": 3.859386582202231e-07, | |
| "loss": 0.5238, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.8646188850967008, | |
| "grad_norm": 1.2998009060977955, | |
| "learning_rate": 3.758788171221079e-07, | |
| "loss": 0.4126, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.8664391353811149, | |
| "grad_norm": 1.2344313543035759, | |
| "learning_rate": 3.659453559014345e-07, | |
| "loss": 0.3997, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.868259385665529, | |
| "grad_norm": 1.3123945291493502, | |
| "learning_rate": 3.561386209565582e-07, | |
| "loss": 0.4354, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.8700796359499431, | |
| "grad_norm": 1.3385863981096489, | |
| "learning_rate": 3.464589542666485e-07, | |
| "loss": 0.5423, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.8718998862343572, | |
| "grad_norm": 1.4693361278099728, | |
| "learning_rate": 3.3690669337976996e-07, | |
| "loss": 0.5439, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 0.8737201365187713, | |
| "grad_norm": 1.219115488818529, | |
| "learning_rate": 3.2748217140111e-07, | |
| "loss": 0.55, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.8755403868031855, | |
| "grad_norm": 1.3943875753971013, | |
| "learning_rate": 3.1818571698135976e-07, | |
| "loss": 0.479, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 0.8773606370875996, | |
| "grad_norm": 1.31283354874802, | |
| "learning_rate": 3.0901765430525337e-07, | |
| "loss": 0.4546, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.8791808873720136, | |
| "grad_norm": 1.2760697675194013, | |
| "learning_rate": 2.9997830308027003e-07, | |
| "loss": 0.5241, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.8810011376564277, | |
| "grad_norm": 1.3057898765814404, | |
| "learning_rate": 2.9106797852547483e-07, | |
| "loss": 0.5045, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.8828213879408419, | |
| "grad_norm": 1.228721100779524, | |
| "learning_rate": 2.8228699136053726e-07, | |
| "loss": 0.4588, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.884641638225256, | |
| "grad_norm": 1.4327825477254865, | |
| "learning_rate": 2.7363564779488446e-07, | |
| "loss": 0.4911, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.8864618885096701, | |
| "grad_norm": 1.1675697744027835, | |
| "learning_rate": 2.6511424951703244e-07, | |
| "loss": 0.4503, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 0.8882821387940842, | |
| "grad_norm": 1.3573793521283821, | |
| "learning_rate": 2.567230936840632e-07, | |
| "loss": 0.5537, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.8901023890784983, | |
| "grad_norm": 1.2385857779190943, | |
| "learning_rate": 2.4846247291125897e-07, | |
| "loss": 0.5261, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.8919226393629124, | |
| "grad_norm": 1.3747886513978498, | |
| "learning_rate": 2.4033267526190057e-07, | |
| "loss": 0.5116, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.8937428896473265, | |
| "grad_norm": 1.3015002806547666, | |
| "learning_rate": 2.323339842372234e-07, | |
| "loss": 0.501, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 0.8955631399317406, | |
| "grad_norm": 1.2282471393147485, | |
| "learning_rate": 2.2446667876652968e-07, | |
| "loss": 0.5615, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.8973833902161548, | |
| "grad_norm": 1.2246787300329813, | |
| "learning_rate": 2.1673103319746146e-07, | |
| "loss": 0.5847, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 0.8992036405005688, | |
| "grad_norm": 1.381507003520726, | |
| "learning_rate": 2.0912731728643362e-07, | |
| "loss": 0.4593, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 0.9010238907849829, | |
| "grad_norm": 1.2236872730147548, | |
| "learning_rate": 2.0165579618922757e-07, | |
| "loss": 0.426, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.902844141069397, | |
| "grad_norm": 1.5642555968533283, | |
| "learning_rate": 1.943167304517459e-07, | |
| "loss": 0.4669, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.9046643913538112, | |
| "grad_norm": 1.4113035349877263, | |
| "learning_rate": 1.871103760009234e-07, | |
| "loss": 0.5189, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 0.9064846416382253, | |
| "grad_norm": 1.2945664446971985, | |
| "learning_rate": 1.8003698413580427e-07, | |
| "loss": 0.5331, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.9083048919226394, | |
| "grad_norm": 1.1216172834522593, | |
| "learning_rate": 1.7309680151878126e-07, | |
| "loss": 0.4596, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 0.9101251422070534, | |
| "grad_norm": 1.4490178400997769, | |
| "learning_rate": 1.6629007016698916e-07, | |
| "loss": 0.5719, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.9119453924914676, | |
| "grad_norm": 1.4927164965040023, | |
| "learning_rate": 1.5961702744386973e-07, | |
| "loss": 0.4637, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 0.9137656427758817, | |
| "grad_norm": 1.2926779903672145, | |
| "learning_rate": 1.5307790605089045e-07, | |
| "loss": 0.4931, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 0.9155858930602958, | |
| "grad_norm": 1.3434407972538571, | |
| "learning_rate": 1.4667293401943393e-07, | |
| "loss": 0.4843, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 0.9174061433447099, | |
| "grad_norm": 1.2627460036138376, | |
| "learning_rate": 1.404023347028418e-07, | |
| "loss": 0.4628, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 0.919226393629124, | |
| "grad_norm": 1.1980109325087624, | |
| "learning_rate": 1.342663267686297e-07, | |
| "loss": 0.547, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.9210466439135381, | |
| "grad_norm": 1.4394748326258473, | |
| "learning_rate": 1.2826512419085922e-07, | |
| "loss": 0.4852, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 0.9228668941979522, | |
| "grad_norm": 1.1712799414971835, | |
| "learning_rate": 1.223989362426785e-07, | |
| "loss": 0.5027, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 0.9246871444823663, | |
| "grad_norm": 1.2917639503148088, | |
| "learning_rate": 1.1666796748902142e-07, | |
| "loss": 0.4318, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 0.9265073947667805, | |
| "grad_norm": 1.407559329871179, | |
| "learning_rate": 1.1107241777947774e-07, | |
| "loss": 0.452, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 0.9283276450511946, | |
| "grad_norm": 1.4395176866301798, | |
| "learning_rate": 1.0561248224132091e-07, | |
| "loss": 0.5792, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.9301478953356086, | |
| "grad_norm": 1.3107228117658043, | |
| "learning_rate": 1.0028835127270552e-07, | |
| "loss": 0.523, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 0.9319681456200227, | |
| "grad_norm": 1.319280624009732, | |
| "learning_rate": 9.510021053602679e-08, | |
| "loss": 0.4903, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.9337883959044369, | |
| "grad_norm": 1.2825750147020196, | |
| "learning_rate": 9.004824095144581e-08, | |
| "loss": 0.486, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 0.935608646188851, | |
| "grad_norm": 1.3550036994824897, | |
| "learning_rate": 8.513261869058209e-08, | |
| "loss": 0.4342, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 0.9374288964732651, | |
| "grad_norm": 1.2912511428181583, | |
| "learning_rate": 8.035351517036914e-08, | |
| "loss": 0.4975, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.9392491467576792, | |
| "grad_norm": 1.2630516224119532, | |
| "learning_rate": 7.571109704707623e-08, | |
| "loss": 0.4942, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 0.9410693970420932, | |
| "grad_norm": 1.2630983628627157, | |
| "learning_rate": 7.120552621049825e-08, | |
| "loss": 0.4581, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 0.9428896473265074, | |
| "grad_norm": 1.184276479260659, | |
| "learning_rate": 6.68369597783096e-08, | |
| "loss": 0.4245, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 0.9447098976109215, | |
| "grad_norm": 1.3479750123046965, | |
| "learning_rate": 6.260555009058288e-08, | |
| "loss": 0.4734, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 0.9465301478953356, | |
| "grad_norm": 1.184265059530281, | |
| "learning_rate": 5.851144470448144e-08, | |
| "loss": 0.5263, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.9483503981797498, | |
| "grad_norm": 1.3131542129196199, | |
| "learning_rate": 5.455478638911071e-08, | |
| "loss": 0.369, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 0.9501706484641638, | |
| "grad_norm": 1.3396828056059393, | |
| "learning_rate": 5.073571312053815e-08, | |
| "loss": 0.5098, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 0.9519908987485779, | |
| "grad_norm": 1.3620509437765531, | |
| "learning_rate": 4.705435807698555e-08, | |
| "loss": 0.5595, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 0.953811149032992, | |
| "grad_norm": 1.3476395824069989, | |
| "learning_rate": 4.351084963418117e-08, | |
| "loss": 0.5332, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 0.9556313993174061, | |
| "grad_norm": 1.4056028428746756, | |
| "learning_rate": 4.010531136088691e-08, | |
| "loss": 0.5135, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.9574516496018203, | |
| "grad_norm": 1.2931973314368226, | |
| "learning_rate": 3.683786201458439e-08, | |
| "loss": 0.4869, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 0.9592718998862344, | |
| "grad_norm": 1.3728587745363008, | |
| "learning_rate": 3.370861553733784e-08, | |
| "loss": 0.544, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 0.9610921501706484, | |
| "grad_norm": 1.379130929011516, | |
| "learning_rate": 3.071768105181993e-08, | |
| "loss": 0.4312, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 0.9629124004550625, | |
| "grad_norm": 1.4162454724368647, | |
| "learning_rate": 2.786516285750373e-08, | |
| "loss": 0.4464, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 0.9647326507394767, | |
| "grad_norm": 1.310107669303508, | |
| "learning_rate": 2.5151160427029582e-08, | |
| "loss": 0.4641, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.9665529010238908, | |
| "grad_norm": 1.3049449814100964, | |
| "learning_rate": 2.2575768402733232e-08, | |
| "loss": 0.5079, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 0.9683731513083049, | |
| "grad_norm": 1.301610299072927, | |
| "learning_rate": 2.013907659334624e-08, | |
| "loss": 0.4798, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 0.970193401592719, | |
| "grad_norm": 1.2912511438851022, | |
| "learning_rate": 1.7841169970866042e-08, | |
| "loss": 0.4962, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 0.9720136518771331, | |
| "grad_norm": 1.3741948558886383, | |
| "learning_rate": 1.5682128667589e-08, | |
| "loss": 0.4556, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 0.9738339021615472, | |
| "grad_norm": 1.4394930710163565, | |
| "learning_rate": 1.3662027973320612e-08, | |
| "loss": 0.4808, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.9756541524459613, | |
| "grad_norm": 1.3189189784853037, | |
| "learning_rate": 1.1780938332746515e-08, | |
| "loss": 0.4601, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 0.9774744027303754, | |
| "grad_norm": 1.3675135301050803, | |
| "learning_rate": 1.0038925342977122e-08, | |
| "loss": 0.4862, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 0.9792946530147896, | |
| "grad_norm": 1.2496142970199702, | |
| "learning_rate": 8.43604975126011e-09, | |
| "loss": 0.4972, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 0.9811149032992036, | |
| "grad_norm": 1.2590220357743287, | |
| "learning_rate": 6.972367452863004e-09, | |
| "loss": 0.5048, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 0.9829351535836177, | |
| "grad_norm": 1.4321645089041766, | |
| "learning_rate": 5.647929489122738e-09, | |
| "loss": 0.5688, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.9847554038680318, | |
| "grad_norm": 1.3266758203446563, | |
| "learning_rate": 4.462782045664859e-09, | |
| "loss": 0.4745, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 0.986575654152446, | |
| "grad_norm": 1.1860109568892805, | |
| "learning_rate": 3.4169664507959216e-09, | |
| "loss": 0.4616, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 0.9883959044368601, | |
| "grad_norm": 1.2475217679660848, | |
| "learning_rate": 2.5105191740597553e-09, | |
| "loss": 0.6489, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 0.9902161547212742, | |
| "grad_norm": 1.4270596684886099, | |
| "learning_rate": 1.7434718249664803e-09, | |
| "loss": 0.4712, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 0.9920364050056882, | |
| "grad_norm": 1.2999695109285117, | |
| "learning_rate": 1.1158511518902791e-09, | |
| "loss": 0.5143, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.9938566552901024, | |
| "grad_norm": 1.5041014788909566, | |
| "learning_rate": 6.276790411372524e-10, | |
| "loss": 0.4971, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 0.9956769055745165, | |
| "grad_norm": 1.151430673100721, | |
| "learning_rate": 2.789725161806977e-10, | |
| "loss": 0.5446, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 0.9974971558589306, | |
| "grad_norm": 1.2513093960410882, | |
| "learning_rate": 6.974373706869486e-11, | |
| "loss": 0.5494, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 0.9993174061433447, | |
| "grad_norm": 1.3143110421818924, | |
| "learning_rate": 0.0, | |
| "loss": 0.4345, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 0.9993174061433447, | |
| "eval_accuracy": 0.8141491715694426, | |
| "eval_accuracy_first_token": 0.7775706741223982, | |
| "eval_accuracy_first_token_<": 0.9648293963254593, | |
| "eval_accuracy_first_token_<_total": 1905, | |
| "eval_accuracy_first_token_<|python_tag|>": 0.9014084507042254, | |
| "eval_accuracy_first_token_<|python_tag|>_total": 994, | |
| "eval_accuracy_first_token_Certainly": 0.743801652892562, | |
| "eval_accuracy_first_token_Certainly_total": 363, | |
| "eval_accuracy_first_token_The": 0.9030402629416598, | |
| "eval_accuracy_first_token_The_total": 2434, | |
| "eval_accuracy_first_token_To": 0.8076923076923077, | |
| "eval_accuracy_first_token_To_total": 936, | |
| "eval_loss": 0.5610479116439819, | |
| "eval_perplexity": 1.110590475782418, | |
| "eval_runtime": 507.5321, | |
| "eval_samples_per_second": 1.373, | |
| "eval_steps_per_second": 0.688, | |
| "eval_total_number_first_token": 9657, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 0.9993174061433447, | |
| "step": 549, | |
| "total_flos": 229846517022720.0, | |
| "train_loss": 0.5347839987994544, | |
| "train_runtime": 35925.1648, | |
| "train_samples_per_second": 0.245, | |
| "train_steps_per_second": 0.015 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 549, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 5.0, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": false, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 229846517022720.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |