{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9978925184404637,
  "eval_steps": 500,
  "global_step": 1896,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005268703898840885,
      "grad_norm": 1.8046875,
      "learning_rate": 9.142857142857142e-06,
      "loss": 0.5469,
      "step": 5
    },
    {
      "epoch": 0.01053740779768177,
      "grad_norm": 1.3203125,
      "learning_rate": 2.0571428571428566e-05,
      "loss": 0.493,
      "step": 10
    },
    {
      "epoch": 0.015806111696522657,
      "grad_norm": 0.73828125,
      "learning_rate": 3.2e-05,
      "loss": 0.4305,
      "step": 15
    },
    {
      "epoch": 0.02107481559536354,
      "grad_norm": 0.65625,
      "learning_rate": 4.3428571428571424e-05,
      "loss": 0.4097,
      "step": 20
    },
    {
      "epoch": 0.026343519494204427,
      "grad_norm": 0.58984375,
      "learning_rate": 5.485714285714285e-05,
      "loss": 0.3943,
      "step": 25
    },
    {
      "epoch": 0.03161222339304531,
      "grad_norm": 0.60546875,
      "learning_rate": 6.628571428571428e-05,
      "loss": 0.4013,
      "step": 30
    },
    {
      "epoch": 0.0368809272918862,
      "grad_norm": 0.640625,
      "learning_rate": 7.771428571428571e-05,
      "loss": 0.387,
      "step": 35
    },
    {
      "epoch": 0.04214963119072708,
      "grad_norm": 0.6171875,
      "learning_rate": 7.999970044289027e-05,
      "loss": 0.3758,
      "step": 40
    },
    {
      "epoch": 0.04741833508956796,
      "grad_norm": 0.609375,
      "learning_rate": 7.999848350238486e-05,
      "loss": 0.3781,
      "step": 45
    },
    {
      "epoch": 0.05268703898840885,
      "grad_norm": 0.5546875,
      "learning_rate": 7.999633049410842e-05,
      "loss": 0.3766,
      "step": 50
    },
    {
      "epoch": 0.05795574288724974,
      "grad_norm": 0.578125,
      "learning_rate": 7.999324148524308e-05,
      "loss": 0.3749,
      "step": 55
    },
    {
      "epoch": 0.06322444678609063,
      "grad_norm": 0.5625,
      "learning_rate": 7.998921657217774e-05,
      "loss": 0.3719,
      "step": 60
    },
    {
      "epoch": 0.0684931506849315,
      "grad_norm": 0.57421875,
      "learning_rate": 7.998425588050514e-05,
      "loss": 0.3694,
      "step": 65
    },
    {
      "epoch": 0.0737618545837724,
      "grad_norm": 0.57421875,
      "learning_rate": 7.99783595650179e-05,
      "loss": 0.3679,
      "step": 70
    },
    {
      "epoch": 0.07903055848261328,
      "grad_norm": 0.546875,
      "learning_rate": 7.997152780970364e-05,
      "loss": 0.3641,
      "step": 75
    },
    {
      "epoch": 0.08429926238145416,
      "grad_norm": 0.625,
      "learning_rate": 7.99637608277394e-05,
      "loss": 0.3585,
      "step": 80
    },
    {
      "epoch": 0.08956796628029505,
      "grad_norm": 0.55859375,
      "learning_rate": 7.99550588614848e-05,
      "loss": 0.3562,
      "step": 85
    },
    {
      "epoch": 0.09483667017913593,
      "grad_norm": 0.50390625,
      "learning_rate": 7.994542218247453e-05,
      "loss": 0.3496,
      "step": 90
    },
    {
      "epoch": 0.10010537407797682,
      "grad_norm": 0.56640625,
      "learning_rate": 7.993485109140998e-05,
      "loss": 0.3585,
      "step": 95
    },
    {
      "epoch": 0.1053740779768177,
      "grad_norm": 0.55078125,
      "learning_rate": 7.992334591814973e-05,
      "loss": 0.3568,
      "step": 100
    },
    {
      "epoch": 0.11064278187565858,
      "grad_norm": 0.53515625,
      "learning_rate": 7.991090702169934e-05,
      "loss": 0.349,
      "step": 105
    },
    {
      "epoch": 0.11591148577449947,
      "grad_norm": 0.5234375,
      "learning_rate": 7.989753479020009e-05,
      "loss": 0.345,
      "step": 110
    },
    {
      "epoch": 0.12118018967334036,
      "grad_norm": 0.5234375,
      "learning_rate": 7.98832296409169e-05,
      "loss": 0.3518,
      "step": 115
    },
    {
      "epoch": 0.12644889357218125,
      "grad_norm": 0.5703125,
      "learning_rate": 7.986799202022531e-05,
      "loss": 0.357,
      "step": 120
    },
    {
      "epoch": 0.13171759747102213,
      "grad_norm": 0.6328125,
      "learning_rate": 7.985182240359757e-05,
      "loss": 0.3563,
      "step": 125
    },
    {
      "epoch": 0.136986301369863,
      "grad_norm": 0.515625,
      "learning_rate": 7.983472129558769e-05,
      "loss": 0.3465,
      "step": 130
    },
    {
      "epoch": 0.1422550052687039,
      "grad_norm": 0.53515625,
      "learning_rate": 7.98166892298159e-05,
      "loss": 0.3397,
      "step": 135
    },
    {
      "epoch": 0.1475237091675448,
      "grad_norm": 0.5390625,
      "learning_rate": 7.979772676895182e-05,
      "loss": 0.3541,
      "step": 140
    },
    {
      "epoch": 0.15279241306638566,
      "grad_norm": 0.55859375,
      "learning_rate": 7.977783450469697e-05,
      "loss": 0.3431,
      "step": 145
    },
    {
      "epoch": 0.15806111696522657,
      "grad_norm": 0.51171875,
      "learning_rate": 7.97570130577663e-05,
      "loss": 0.3507,
      "step": 150
    },
    {
      "epoch": 0.16332982086406744,
      "grad_norm": 0.53515625,
      "learning_rate": 7.973526307786885e-05,
      "loss": 0.3416,
      "step": 155
    },
    {
      "epoch": 0.16859852476290832,
      "grad_norm": 0.48828125,
      "learning_rate": 7.971258524368743e-05,
      "loss": 0.3459,
      "step": 160
    },
    {
      "epoch": 0.17386722866174922,
      "grad_norm": 0.54296875,
      "learning_rate": 7.968898026285744e-05,
      "loss": 0.332,
      "step": 165
    },
    {
      "epoch": 0.1791359325605901,
      "grad_norm": 0.52734375,
      "learning_rate": 7.966444887194489e-05,
      "loss": 0.3496,
      "step": 170
    },
    {
      "epoch": 0.18440463645943098,
      "grad_norm": 0.515625,
      "learning_rate": 7.963899183642324e-05,
      "loss": 0.3409,
      "step": 175
    },
    {
      "epoch": 0.18967334035827185,
      "grad_norm": 0.58984375,
      "learning_rate": 7.961260995064969e-05,
      "loss": 0.3473,
      "step": 180
    },
    {
      "epoch": 0.19494204425711276,
      "grad_norm": 0.57421875,
      "learning_rate": 7.958530403784029e-05,
      "loss": 0.3389,
      "step": 185
    },
    {
      "epoch": 0.20021074815595363,
      "grad_norm": 0.51171875,
      "learning_rate": 7.955707495004427e-05,
      "loss": 0.3317,
      "step": 190
    },
    {
      "epoch": 0.2054794520547945,
      "grad_norm": 0.52734375,
      "learning_rate": 7.952792356811745e-05,
      "loss": 0.3424,
      "step": 195
    },
    {
      "epoch": 0.2107481559536354,
      "grad_norm": 0.5234375,
      "learning_rate": 7.949785080169479e-05,
      "loss": 0.3423,
      "step": 200
    },
    {
      "epoch": 0.2160168598524763,
      "grad_norm": 0.490234375,
      "learning_rate": 7.946685758916198e-05,
      "loss": 0.338,
      "step": 205
    },
    {
      "epoch": 0.22128556375131717,
      "grad_norm": 0.5234375,
      "learning_rate": 7.943494489762617e-05,
      "loss": 0.3256,
      "step": 210
    },
    {
      "epoch": 0.22655426765015807,
      "grad_norm": 0.515625,
      "learning_rate": 7.940211372288572e-05,
      "loss": 0.3237,
      "step": 215
    },
    {
      "epoch": 0.23182297154899895,
      "grad_norm": 0.54296875,
      "learning_rate": 7.936836508939928e-05,
      "loss": 0.3334,
      "step": 220
    },
    {
      "epoch": 0.23709167544783982,
      "grad_norm": 0.53125,
      "learning_rate": 7.933370005025367e-05,
      "loss": 0.3455,
      "step": 225
    },
    {
      "epoch": 0.24236037934668073,
      "grad_norm": 0.53125,
      "learning_rate": 7.92981196871311e-05,
      "loss": 0.3303,
      "step": 230
    },
    {
      "epoch": 0.2476290832455216,
      "grad_norm": 0.486328125,
      "learning_rate": 7.926162511027539e-05,
      "loss": 0.3389,
      "step": 235
    },
    {
      "epoch": 0.2528977871443625,
      "grad_norm": 0.5,
      "learning_rate": 7.922421745845734e-05,
      "loss": 0.3347,
      "step": 240
    },
    {
      "epoch": 0.2581664910432034,
      "grad_norm": 0.494140625,
      "learning_rate": 7.918589789893922e-05,
      "loss": 0.3215,
      "step": 245
    },
    {
      "epoch": 0.26343519494204426,
      "grad_norm": 0.50390625,
      "learning_rate": 7.914666762743831e-05,
      "loss": 0.3234,
      "step": 250
    },
    {
      "epoch": 0.26870389884088514,
      "grad_norm": 0.482421875,
      "learning_rate": 7.910652786808953e-05,
      "loss": 0.3211,
      "step": 255
    },
    {
      "epoch": 0.273972602739726,
      "grad_norm": 0.4765625,
      "learning_rate": 7.90654798734074e-05,
      "loss": 0.3332,
      "step": 260
    },
    {
      "epoch": 0.2792413066385669,
      "grad_norm": 0.486328125,
      "learning_rate": 7.902352492424682e-05,
      "loss": 0.3352,
      "step": 265
    },
    {
      "epoch": 0.2845100105374078,
      "grad_norm": 0.484375,
      "learning_rate": 7.898066432976318e-05,
      "loss": 0.3403,
      "step": 270
    },
    {
      "epoch": 0.2897787144362487,
      "grad_norm": 0.49609375,
      "learning_rate": 7.893689942737141e-05,
      "loss": 0.3351,
      "step": 275
    },
    {
      "epoch": 0.2950474183350896,
      "grad_norm": 0.53515625,
      "learning_rate": 7.88922315827044e-05,
      "loss": 0.3254,
      "step": 280
    },
    {
      "epoch": 0.30031612223393045,
      "grad_norm": 0.490234375,
      "learning_rate": 7.884666218957029e-05,
      "loss": 0.336,
      "step": 285
    },
    {
      "epoch": 0.3055848261327713,
      "grad_norm": 0.47265625,
      "learning_rate": 7.880019266990891e-05,
      "loss": 0.318,
      "step": 290
    },
    {
      "epoch": 0.3108535300316122,
      "grad_norm": 0.515625,
      "learning_rate": 7.875282447374757e-05,
      "loss": 0.3328,
      "step": 295
    },
    {
      "epoch": 0.31612223393045313,
      "grad_norm": 0.5,
      "learning_rate": 7.870455907915573e-05,
      "loss": 0.3268,
      "step": 300
    },
    {
      "epoch": 0.321390937829294,
      "grad_norm": 0.50390625,
      "learning_rate": 7.865539799219885e-05,
      "loss": 0.3188,
      "step": 305
    },
    {
      "epoch": 0.3266596417281349,
      "grad_norm": 0.51171875,
      "learning_rate": 7.860534274689147e-05,
      "loss": 0.3286,
      "step": 310
    },
    {
      "epoch": 0.33192834562697576,
      "grad_norm": 0.4609375,
      "learning_rate": 7.855439490514922e-05,
      "loss": 0.3259,
      "step": 315
    },
    {
      "epoch": 0.33719704952581664,
      "grad_norm": 0.5,
      "learning_rate": 7.850255605674026e-05,
      "loss": 0.3213,
      "step": 320
    },
    {
      "epoch": 0.3424657534246575,
      "grad_norm": 0.494140625,
      "learning_rate": 7.844982781923554e-05,
      "loss": 0.3194,
      "step": 325
    },
    {
      "epoch": 0.34773445732349845,
      "grad_norm": 0.5390625,
      "learning_rate": 7.839621183795833e-05,
      "loss": 0.3258,
      "step": 330
    },
    {
      "epoch": 0.3530031612223393,
      "grad_norm": 0.482421875,
      "learning_rate": 7.834170978593296e-05,
      "loss": 0.3232,
      "step": 335
    },
    {
      "epoch": 0.3582718651211802,
      "grad_norm": 0.51953125,
      "learning_rate": 7.828632336383253e-05,
      "loss": 0.3232,
      "step": 340
    },
    {
      "epoch": 0.3635405690200211,
      "grad_norm": 0.4921875,
      "learning_rate": 7.823005429992587e-05,
      "loss": 0.3198,
      "step": 345
    },
    {
      "epoch": 0.36880927291886195,
      "grad_norm": 0.4375,
      "learning_rate": 7.81729043500237e-05,
      "loss": 0.3222,
      "step": 350
    },
    {
      "epoch": 0.37407797681770283,
      "grad_norm": 0.478515625,
      "learning_rate": 7.811487529742366e-05,
      "loss": 0.3269,
      "step": 355
    },
    {
      "epoch": 0.3793466807165437,
      "grad_norm": 0.4765625,
      "learning_rate": 7.805596895285485e-05,
      "loss": 0.3221,
      "step": 360
    },
    {
      "epoch": 0.38461538461538464,
      "grad_norm": 0.4921875,
      "learning_rate": 7.799618715442116e-05,
      "loss": 0.3178,
      "step": 365
    },
    {
      "epoch": 0.3898840885142255,
      "grad_norm": 0.484375,
      "learning_rate": 7.79355317675441e-05,
      "loss": 0.3173,
      "step": 370
    },
    {
      "epoch": 0.3951527924130664,
      "grad_norm": 0.47265625,
      "learning_rate": 7.78740046849044e-05,
      "loss": 0.3206,
      "step": 375
    },
    {
      "epoch": 0.40042149631190727,
      "grad_norm": 0.455078125,
      "learning_rate": 7.781160782638307e-05,
      "loss": 0.3141,
      "step": 380
    },
    {
      "epoch": 0.40569020021074814,
      "grad_norm": 0.490234375,
      "learning_rate": 7.774834313900154e-05,
      "loss": 0.3107,
      "step": 385
    },
    {
      "epoch": 0.410958904109589,
      "grad_norm": 0.474609375,
      "learning_rate": 7.76842125968607e-05,
      "loss": 0.3315,
      "step": 390
    },
    {
      "epoch": 0.41622760800842995,
      "grad_norm": 0.474609375,
      "learning_rate": 7.761921820107951e-05,
      "loss": 0.315,
      "step": 395
    },
    {
      "epoch": 0.4214963119072708,
      "grad_norm": 0.484375,
      "learning_rate": 7.755336197973248e-05,
      "loss": 0.311,
      "step": 400
    },
    {
      "epoch": 0.4267650158061117,
      "grad_norm": 0.490234375,
      "learning_rate": 7.748664598778633e-05,
      "loss": 0.3257,
      "step": 405
    },
    {
      "epoch": 0.4320337197049526,
      "grad_norm": 0.474609375,
      "learning_rate": 7.7419072307036e-05,
      "loss": 0.3256,
      "step": 410
    },
    {
      "epoch": 0.43730242360379346,
      "grad_norm": 0.484375,
      "learning_rate": 7.735064304603954e-05,
      "loss": 0.3226,
      "step": 415
    },
    {
      "epoch": 0.44257112750263433,
      "grad_norm": 0.458984375,
      "learning_rate": 7.728136034005241e-05,
      "loss": 0.3197,
      "step": 420
    },
    {
      "epoch": 0.44783983140147526,
      "grad_norm": 0.458984375,
      "learning_rate": 7.721122635096086e-05,
      "loss": 0.3207,
      "step": 425
    },
    {
      "epoch": 0.45310853530031614,
      "grad_norm": 0.462890625,
      "learning_rate": 7.714024326721441e-05,
      "loss": 0.3092,
      "step": 430
    },
    {
      "epoch": 0.458377239199157,
      "grad_norm": 0.44140625,
      "learning_rate": 7.706841330375755e-05,
      "loss": 0.3104,
      "step": 435
    },
    {
      "epoch": 0.4636459430979979,
      "grad_norm": 0.470703125,
      "learning_rate": 7.699573870196074e-05,
      "loss": 0.3036,
      "step": 440
    },
    {
      "epoch": 0.46891464699683877,
      "grad_norm": 0.453125,
      "learning_rate": 7.692222172955035e-05,
      "loss": 0.3061,
      "step": 445
    },
    {
      "epoch": 0.47418335089567965,
      "grad_norm": 0.474609375,
      "learning_rate": 7.684786468053799e-05,
      "loss": 0.3148,
      "step": 450
    },
    {
      "epoch": 0.4794520547945205,
      "grad_norm": 0.466796875,
      "learning_rate": 7.677266987514882e-05,
      "loss": 0.3157,
      "step": 455
    },
    {
      "epoch": 0.48472075869336145,
      "grad_norm": 0.4453125,
      "learning_rate": 7.669663965974923e-05,
      "loss": 0.3147,
      "step": 460
    },
    {
      "epoch": 0.48998946259220233,
      "grad_norm": 0.44140625,
      "learning_rate": 7.661977640677366e-05,
      "loss": 0.3053,
      "step": 465
    },
    {
      "epoch": 0.4952581664910432,
      "grad_norm": 0.431640625,
      "learning_rate": 7.654208251465047e-05,
      "loss": 0.3111,
      "step": 470
    },
    {
      "epoch": 0.5005268703898841,
      "grad_norm": 0.4609375,
      "learning_rate": 7.646356040772716e-05,
      "loss": 0.3195,
      "step": 475
    },
    {
      "epoch": 0.505795574288725,
      "grad_norm": 0.455078125,
      "learning_rate": 7.638421253619466e-05,
      "loss": 0.3026,
      "step": 480
    },
    {
      "epoch": 0.5110642781875658,
      "grad_norm": 0.443359375,
      "learning_rate": 7.630404137601104e-05,
      "loss": 0.3004,
      "step": 485
    },
    {
      "epoch": 0.5163329820864068,
      "grad_norm": 0.48828125,
      "learning_rate": 7.622304942882402e-05,
      "loss": 0.3181,
      "step": 490
    },
    {
      "epoch": 0.5216016859852476,
      "grad_norm": 0.474609375,
      "learning_rate": 7.61412392218931e-05,
      "loss": 0.3061,
      "step": 495
    },
    {
      "epoch": 0.5268703898840885,
      "grad_norm": 0.455078125,
      "learning_rate": 7.605861330801056e-05,
      "loss": 0.3244,
      "step": 500
    },
    {
      "epoch": 0.5268703898840885,
      "eval_loss": 0.3330249488353729,
      "eval_runtime": 2.6362,
      "eval_samples_per_second": 17.829,
      "eval_steps_per_second": 17.829,
      "step": 500
    },
    {
      "epoch": 0.5321390937829295,
      "grad_norm": 0.466796875,
      "learning_rate": 7.597517426542193e-05,
      "loss": 0.3059,
      "step": 505
    },
    {
      "epoch": 0.5374077976817703,
      "grad_norm": 0.458984375,
      "learning_rate": 7.589092469774541e-05,
      "loss": 0.307,
      "step": 510
    },
    {
      "epoch": 0.5426765015806112,
      "grad_norm": 0.435546875,
      "learning_rate": 7.580586723389075e-05,
      "loss": 0.3093,
      "step": 515
    },
    {
      "epoch": 0.547945205479452,
      "grad_norm": 0.447265625,
      "learning_rate": 7.572000452797713e-05,
      "loss": 0.3079,
      "step": 520
    },
    {
      "epoch": 0.553213909378293,
      "grad_norm": 0.453125,
      "learning_rate": 7.563333925925036e-05,
      "loss": 0.3077,
      "step": 525
    },
    {
      "epoch": 0.5584826132771338,
      "grad_norm": 0.484375,
      "learning_rate": 7.554587413199932e-05,
      "loss": 0.3033,
      "step": 530
    },
    {
      "epoch": 0.5637513171759747,
      "grad_norm": 0.423828125,
      "learning_rate": 7.545761187547155e-05,
      "loss": 0.3131,
      "step": 535
    },
    {
      "epoch": 0.5690200210748156,
      "grad_norm": 0.421875,
      "learning_rate": 7.536855524378804e-05,
      "loss": 0.3053,
      "step": 540
    },
    {
      "epoch": 0.5742887249736565,
      "grad_norm": 0.44921875,
      "learning_rate": 7.527870701585735e-05,
      "loss": 0.3038,
      "step": 545
    },
    {
      "epoch": 0.5795574288724974,
      "grad_norm": 0.455078125,
      "learning_rate": 7.518806999528887e-05,
      "loss": 0.3086,
      "step": 550
    },
    {
      "epoch": 0.5848261327713382,
      "grad_norm": 0.455078125,
      "learning_rate": 7.50966470103054e-05,
      "loss": 0.3003,
      "step": 555
    },
    {
      "epoch": 0.5900948366701791,
      "grad_norm": 0.466796875,
      "learning_rate": 7.500444091365479e-05,
      "loss": 0.3078,
      "step": 560
    },
    {
      "epoch": 0.59536354056902,
      "grad_norm": 0.44140625,
      "learning_rate": 7.491145458252099e-05,
      "loss": 0.3034,
      "step": 565
    },
    {
      "epoch": 0.6006322444678609,
      "grad_norm": 0.47265625,
      "learning_rate": 7.481769091843424e-05,
      "loss": 0.3054,
      "step": 570
    },
    {
      "epoch": 0.6059009483667018,
      "grad_norm": 0.44140625,
      "learning_rate": 7.472315284718061e-05,
      "loss": 0.2977,
      "step": 575
    },
    {
      "epoch": 0.6111696522655427,
      "grad_norm": 0.427734375,
      "learning_rate": 7.46278433187106e-05,
      "loss": 0.2976,
      "step": 580
    },
    {
      "epoch": 0.6164383561643836,
      "grad_norm": 0.4375,
      "learning_rate": 7.453176530704713e-05,
      "loss": 0.3064,
      "step": 585
    },
    {
      "epoch": 0.6217070600632244,
      "grad_norm": 0.484375,
      "learning_rate": 7.443492181019277e-05,
      "loss": 0.2976,
      "step": 590
    },
    {
      "epoch": 0.6269757639620653,
      "grad_norm": 0.455078125,
      "learning_rate": 7.433731585003612e-05,
      "loss": 0.2993,
      "step": 595
    },
    {
      "epoch": 0.6322444678609063,
      "grad_norm": 0.4375,
      "learning_rate": 7.423895047225762e-05,
      "loss": 0.2986,
      "step": 600
    },
    {
      "epoch": 0.6375131717597471,
      "grad_norm": 0.44140625,
      "learning_rate": 7.413982874623443e-05,
      "loss": 0.2996,
      "step": 605
    },
    {
      "epoch": 0.642781875658588,
      "grad_norm": 0.462890625,
      "learning_rate": 7.403995376494465e-05,
      "loss": 0.3115,
      "step": 610
    },
    {
      "epoch": 0.6480505795574288,
      "grad_norm": 0.455078125,
      "learning_rate": 7.393932864487086e-05,
      "loss": 0.3031,
      "step": 615
    },
    {
      "epoch": 0.6533192834562698,
      "grad_norm": 0.462890625,
      "learning_rate": 7.383795652590285e-05,
      "loss": 0.3078,
      "step": 620
    },
    {
      "epoch": 0.6585879873551106,
      "grad_norm": 0.455078125,
      "learning_rate": 7.373584057123965e-05,
      "loss": 0.2977,
      "step": 625
    },
    {
      "epoch": 0.6638566912539515,
      "grad_norm": 0.41796875,
      "learning_rate": 7.363298396729077e-05,
      "loss": 0.2922,
      "step": 630
    },
    {
      "epoch": 0.6691253951527925,
      "grad_norm": 0.4296875,
      "learning_rate": 7.352938992357685e-05,
      "loss": 0.2964,
      "step": 635
    },
    {
      "epoch": 0.6743940990516333,
      "grad_norm": 0.44140625,
      "learning_rate": 7.342506167262951e-05,
      "loss": 0.3015,
      "step": 640
    },
    {
      "epoch": 0.6796628029504742,
      "grad_norm": 0.443359375,
      "learning_rate": 7.33200024698904e-05,
      "loss": 0.3019,
      "step": 645
    },
    {
      "epoch": 0.684931506849315,
      "grad_norm": 0.478515625,
      "learning_rate": 7.32142155936097e-05,
      "loss": 0.3052,
      "step": 650
    },
    {
      "epoch": 0.690200210748156,
      "grad_norm": 0.439453125,
      "learning_rate": 7.310770434474381e-05,
      "loss": 0.2968,
      "step": 655
    },
    {
      "epoch": 0.6954689146469969,
      "grad_norm": 0.439453125,
      "learning_rate": 7.300047204685228e-05,
      "loss": 0.3029,
      "step": 660
    },
    {
      "epoch": 0.7007376185458377,
      "grad_norm": 0.421875,
      "learning_rate": 7.28925220459942e-05,
      "loss": 0.3056,
      "step": 665
    },
    {
      "epoch": 0.7060063224446786,
      "grad_norm": 0.451171875,
      "learning_rate": 7.278385771062373e-05,
      "loss": 0.3053,
      "step": 670
    },
    {
      "epoch": 0.7112750263435195,
      "grad_norm": 0.44921875,
      "learning_rate": 7.267448243148501e-05,
      "loss": 0.2986,
      "step": 675
    },
    {
      "epoch": 0.7165437302423604,
      "grad_norm": 0.462890625,
      "learning_rate": 7.256439962150638e-05,
      "loss": 0.2891,
      "step": 680
    },
    {
      "epoch": 0.7218124341412012,
      "grad_norm": 0.416015625,
      "learning_rate": 7.245361271569382e-05,
      "loss": 0.2968,
      "step": 685
    },
    {
      "epoch": 0.7270811380400422,
      "grad_norm": 0.455078125,
      "learning_rate": 7.234212517102378e-05,
      "loss": 0.3014,
      "step": 690
    },
    {
      "epoch": 0.7323498419388831,
      "grad_norm": 0.48828125,
      "learning_rate": 7.222994046633541e-05,
      "loss": 0.3031,
      "step": 695
    },
    {
      "epoch": 0.7376185458377239,
      "grad_norm": 0.451171875,
      "learning_rate": 7.211706210222186e-05,
      "loss": 0.3031,
      "step": 700
    },
    {
      "epoch": 0.7428872497365648,
      "grad_norm": 0.421875,
      "learning_rate": 7.200349360092113e-05,
      "loss": 0.2889,
      "step": 705
    },
    {
      "epoch": 0.7481559536354057,
      "grad_norm": 0.427734375,
      "learning_rate": 7.188923850620616e-05,
      "loss": 0.2839,
      "step": 710
    },
    {
      "epoch": 0.7534246575342466,
      "grad_norm": 0.435546875,
      "learning_rate": 7.177430038327424e-05,
      "loss": 0.3007,
      "step": 715
    },
    {
      "epoch": 0.7586933614330874,
      "grad_norm": 0.431640625,
      "learning_rate": 7.165868281863572e-05,
      "loss": 0.2841,
      "step": 720
    },
    {
      "epoch": 0.7639620653319283,
      "grad_norm": 0.427734375,
      "learning_rate": 7.15423894200022e-05,
      "loss": 0.2979,
      "step": 725
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 0.435546875,
      "learning_rate": 7.142542381617388e-05,
      "loss": 0.2918,
      "step": 730
    },
    {
      "epoch": 0.7744994731296101,
      "grad_norm": 0.44140625,
      "learning_rate": 7.130778965692629e-05,
      "loss": 0.3029,
      "step": 735
    },
    {
      "epoch": 0.779768177028451,
      "grad_norm": 0.453125,
      "learning_rate": 7.118949061289649e-05,
      "loss": 0.3013,
      "step": 740
    },
    {
      "epoch": 0.7850368809272918,
      "grad_norm": 0.43359375,
      "learning_rate": 7.107053037546851e-05,
      "loss": 0.3047,
      "step": 745
    },
    {
      "epoch": 0.7903055848261328,
      "grad_norm": 0.439453125,
      "learning_rate": 7.095091265665814e-05,
      "loss": 0.2979,
      "step": 750
    },
    {
      "epoch": 0.7955742887249737,
      "grad_norm": 0.466796875,
      "learning_rate": 7.083064118899708e-05,
      "loss": 0.293,
      "step": 755
    },
    {
      "epoch": 0.8008429926238145,
      "grad_norm": 0.431640625,
      "learning_rate": 7.070971972541654e-05,
      "loss": 0.2998,
      "step": 760
    },
    {
      "epoch": 0.8061116965226555,
      "grad_norm": 0.427734375,
      "learning_rate": 7.05881520391301e-05,
      "loss": 0.2864,
      "step": 765
    },
    {
      "epoch": 0.8113804004214963,
      "grad_norm": 0.42578125,
      "learning_rate": 7.046594192351595e-05,
      "loss": 0.2927,
      "step": 770
    },
    {
      "epoch": 0.8166491043203372,
      "grad_norm": 0.4453125,
      "learning_rate": 7.034309319199853e-05,
      "loss": 0.294,
      "step": 775
    },
    {
      "epoch": 0.821917808219178,
      "grad_norm": 0.4375,
      "learning_rate": 7.021960967792956e-05,
      "loss": 0.2912,
      "step": 780
    },
    {
      "epoch": 0.827186512118019,
      "grad_norm": 0.435546875,
      "learning_rate": 7.009549523446842e-05,
      "loss": 0.292,
      "step": 785
    },
    {
      "epoch": 0.8324552160168599,
      "grad_norm": 0.439453125,
      "learning_rate": 6.997075373446187e-05,
      "loss": 0.3041,
      "step": 790
    },
    {
      "epoch": 0.8377239199157007,
      "grad_norm": 0.435546875,
      "learning_rate": 6.98453890703233e-05,
      "loss": 0.2996,
      "step": 795
    },
    {
      "epoch": 0.8429926238145417,
      "grad_norm": 0.443359375,
      "learning_rate": 6.971940515391118e-05,
      "loss": 0.2953,
      "step": 800
    },
    {
      "epoch": 0.8482613277133825,
      "grad_norm": 0.43359375,
      "learning_rate": 6.9592805916407e-05,
      "loss": 0.2954,
      "step": 805
    },
    {
      "epoch": 0.8535300316122234,
      "grad_norm": 0.447265625,
      "learning_rate": 6.946559530819265e-05,
      "loss": 0.2937,
      "step": 810
    },
    {
      "epoch": 0.8587987355110642,
      "grad_norm": 0.44921875,
      "learning_rate": 6.933777729872716e-05,
      "loss": 0.3035,
      "step": 815
    },
    {
      "epoch": 0.8640674394099052,
      "grad_norm": 0.435546875,
      "learning_rate": 6.920935587642278e-05,
      "loss": 0.2913,
      "step": 820
    },
    {
      "epoch": 0.8693361433087461,
      "grad_norm": 0.435546875,
      "learning_rate": 6.908033504852054e-05,
      "loss": 0.2961,
      "step": 825
    },
    {
      "epoch": 0.8746048472075869,
      "grad_norm": 0.43359375,
      "learning_rate": 6.895071884096526e-05,
      "loss": 0.2846,
      "step": 830
    },
    {
      "epoch": 0.8798735511064278,
      "grad_norm": 0.423828125,
      "learning_rate": 6.882051129827989e-05,
      "loss": 0.288,
      "step": 835
    },
    {
      "epoch": 0.8851422550052687,
      "grad_norm": 0.4453125,
      "learning_rate": 6.868971648343925e-05,
      "loss": 0.2927,
      "step": 840
    },
    {
      "epoch": 0.8904109589041096,
      "grad_norm": 0.423828125,
      "learning_rate": 6.855833847774337e-05,
      "loss": 0.286,
      "step": 845
    },
    {
      "epoch": 0.8956796628029505,
      "grad_norm": 0.447265625,
      "learning_rate": 6.842638138069003e-05,
      "loss": 0.2863,
      "step": 850
    },
    {
      "epoch": 0.9009483667017913,
      "grad_norm": 0.42578125,
      "learning_rate": 6.82938493098469e-05,
      "loss": 0.2901,
      "step": 855
    },
    {
      "epoch": 0.9062170706006323,
      "grad_norm": 0.439453125,
      "learning_rate": 6.816074640072305e-05,
      "loss": 0.2884,
      "step": 860
    },
    {
      "epoch": 0.9114857744994731,
      "grad_norm": 0.4375,
      "learning_rate": 6.802707680663987e-05,
      "loss": 0.2903,
      "step": 865
    },
    {
      "epoch": 0.916754478398314,
      "grad_norm": 0.43359375,
      "learning_rate": 6.789284469860146e-05,
      "loss": 0.2945,
      "step": 870
    },
    {
      "epoch": 0.9220231822971549,
      "grad_norm": 0.44140625,
      "learning_rate": 6.775805426516464e-05,
      "loss": 0.2864,
      "step": 875
    },
    {
      "epoch": 0.9272918861959958,
      "grad_norm": 0.439453125,
      "learning_rate": 6.762270971230799e-05,
      "loss": 0.287,
      "step": 880
    },
    {
      "epoch": 0.9325605900948367,
      "grad_norm": 0.44140625,
      "learning_rate": 6.748681526330078e-05,
      "loss": 0.2936,
      "step": 885
    },
    {
      "epoch": 0.9378292939936775,
      "grad_norm": 0.4375,
      "learning_rate": 6.73503751585712e-05,
      "loss": 0.2907,
      "step": 890
    },
    {
      "epoch": 0.9430979978925185,
      "grad_norm": 0.43359375,
      "learning_rate": 6.721339365557394e-05,
      "loss": 0.2864,
      "step": 895
    },
    {
      "epoch": 0.9483667017913593,
      "grad_norm": 0.419921875,
      "learning_rate": 6.707587502865739e-05,
      "loss": 0.2948,
      "step": 900
    },
    {
      "epoch": 0.9536354056902002,
      "grad_norm": 0.4375,
      "learning_rate": 6.693782356893032e-05,
      "loss": 0.2962,
      "step": 905
    },
    {
      "epoch": 0.958904109589041,
      "grad_norm": 0.439453125,
      "learning_rate": 6.679924358412785e-05,
      "loss": 0.2898,
      "step": 910
    },
    {
      "epoch": 0.964172813487882,
      "grad_norm": 0.451171875,
      "learning_rate": 6.666013939847719e-05,
      "loss": 0.2833,
      "step": 915
    },
    {
      "epoch": 0.9694415173867229,
      "grad_norm": 0.44140625,
      "learning_rate": 6.652051535256257e-05,
      "loss": 0.2995,
      "step": 920
    },
    {
      "epoch": 0.9747102212855637,
      "grad_norm": 0.44140625,
      "learning_rate": 6.638037580318988e-05,
      "loss": 0.2863,
      "step": 925
    },
    {
      "epoch": 0.9799789251844047,
      "grad_norm": 0.478515625,
      "learning_rate": 6.623972512325068e-05,
      "loss": 0.294,
      "step": 930
    },
    {
      "epoch": 0.9852476290832455,
      "grad_norm": 0.41015625,
      "learning_rate": 6.609856770158579e-05,
      "loss": 0.2806,
      "step": 935
    },
    {
      "epoch": 0.9905163329820864,
      "grad_norm": 0.421875,
      "learning_rate": 6.595690794284828e-05,
      "loss": 0.2901,
      "step": 940
    },
    {
      "epoch": 0.9957850368809273,
      "grad_norm": 0.400390625,
      "learning_rate": 6.581475026736611e-05,
      "loss": 0.2799,
      "step": 945
    },
    {
      "epoch": 0.9989462592202318,
      "eval_loss": 0.3042255640029907,
      "eval_runtime": 2.5158,
      "eval_samples_per_second": 18.682,
      "eval_steps_per_second": 18.682,
      "step": 948
    },
    {
      "epoch": 1.0010537407797682,
      "grad_norm": 0.59765625,
      "learning_rate": 6.56720991110041e-05,
      "loss": 0.2765,
      "step": 950
    },
    {
      "epoch": 1.006322444678609,
      "grad_norm": 0.51171875,
      "learning_rate": 6.552895892502563e-05,
      "loss": 0.2285,
      "step": 955
    },
    {
      "epoch": 1.01159114857745,
      "grad_norm": 0.416015625,
      "learning_rate": 6.538533417595359e-05,
      "loss": 0.2271,
      "step": 960
    },
    {
      "epoch": 1.0168598524762908,
      "grad_norm": 0.451171875,
      "learning_rate": 6.52412293454312e-05,
      "loss": 0.2292,
      "step": 965
    },
    {
      "epoch": 1.0221285563751317,
      "grad_norm": 0.44140625,
      "learning_rate": 6.5096648930082e-05,
      "loss": 0.2196,
      "step": 970
    },
    {
      "epoch": 1.0273972602739727,
      "grad_norm": 0.439453125,
      "learning_rate": 6.495159744136959e-05,
      "loss": 0.2249,
      "step": 975
    },
    {
      "epoch": 1.0326659641728135,
      "grad_norm": 0.431640625,
      "learning_rate": 6.480607940545692e-05,
      "loss": 0.2212,
      "step": 980
    },
    {
      "epoch": 1.0379346680716544,
      "grad_norm": 0.44921875,
      "learning_rate": 6.466009936306498e-05,
      "loss": 0.2272,
      "step": 985
    },
    {
      "epoch": 1.0432033719704952,
      "grad_norm": 0.408203125,
      "learning_rate": 6.45136618693311e-05,
      "loss": 0.2253,
      "step": 990
    },
    {
      "epoch": 1.0484720758693362,
      "grad_norm": 0.447265625,
      "learning_rate": 6.436677149366688e-05,
      "loss": 0.229,
      "step": 995
    },
    {
      "epoch": 1.053740779768177,
      "grad_norm": 0.427734375,
      "learning_rate": 6.42194328196156e-05,
      "loss": 0.2252,
      "step": 1000
    },
    {
      "epoch": 1.053740779768177,
      "eval_loss": 0.30919456481933594,
      "eval_runtime": 2.4891,
      "eval_samples_per_second": 18.882,
      "eval_steps_per_second": 18.882,
      "step": 1000
    },
    {
      "epoch": 1.0590094836670179,
      "grad_norm": 0.494140625,
      "learning_rate": 6.407165044470911e-05,
      "loss": 0.2241,
      "step": 1005
    },
    {
      "epoch": 1.064278187565859,
      "grad_norm": 0.4375,
      "learning_rate": 6.392342898032445e-05,
      "loss": 0.2296,
      "step": 1010
    },
    {
      "epoch": 1.0695468914646997,
      "grad_norm": 0.478515625,
      "learning_rate": 6.377477305153997e-05,
      "loss": 0.2257,
      "step": 1015
    },
    {
      "epoch": 1.0748155953635405,
      "grad_norm": 0.478515625,
      "learning_rate": 6.362568729699093e-05,
      "loss": 0.2269,
      "step": 1020
    },
    {
      "epoch": 1.0800842992623814,
      "grad_norm": 0.48828125,
      "learning_rate": 6.347617636872484e-05,
      "loss": 0.2314,
      "step": 1025
    },
    {
      "epoch": 1.0853530031612224,
      "grad_norm": 0.439453125,
      "learning_rate": 6.332624493205623e-05,
      "loss": 0.2261,
      "step": 1030
    },
    {
      "epoch": 1.0906217070600632,
      "grad_norm": 0.45703125,
      "learning_rate": 6.317589766542111e-05,
      "loss": 0.2239,
      "step": 1035
    },
    {
      "epoch": 1.095890410958904,
      "grad_norm": 0.431640625,
      "learning_rate": 6.3025139260231e-05,
      "loss": 0.2234,
      "step": 1040
    },
    {
      "epoch": 1.101159114857745,
      "grad_norm": 0.4453125,
      "learning_rate": 6.287397442072653e-05,
      "loss": 0.2342,
      "step": 1045
    },
    {
      "epoch": 1.106427818756586,
      "grad_norm": 0.443359375,
      "learning_rate": 6.272240786383057e-05,
      "loss": 0.2307,
      "step": 1050
    },
    {
      "epoch": 1.1116965226554267,
      "grad_norm": 0.443359375,
      "learning_rate": 6.257044431900121e-05,
      "loss": 0.2376,
      "step": 1055
    },
    {
      "epoch": 1.1169652265542676,
      "grad_norm": 0.4375,
      "learning_rate": 6.241808852808403e-05,
      "loss": 0.2319,
      "step": 1060
    },
    {
      "epoch": 1.1222339304531086,
      "grad_norm": 0.439453125,
      "learning_rate": 6.226534524516418e-05,
      "loss": 0.2285,
      "step": 1065
    },
    {
      "epoch": 1.1275026343519494,
      "grad_norm": 0.427734375,
      "learning_rate": 6.21122192364181e-05,
      "loss": 0.2254,
      "step": 1070
    },
    {
      "epoch": 1.1327713382507902,
      "grad_norm": 0.4453125,
      "learning_rate": 6.195871527996467e-05,
      "loss": 0.2325,
      "step": 1075
    },
    {
      "epoch": 1.1380400421496313,
      "grad_norm": 0.421875,
      "learning_rate": 6.180483816571628e-05,
      "loss": 0.2333,
      "step": 1080
    },
    {
      "epoch": 1.143308746048472,
      "grad_norm": 0.41796875,
      "learning_rate": 6.165059269522921e-05,
      "loss": 0.2162,
      "step": 1085
    },
    {
      "epoch": 1.148577449947313,
      "grad_norm": 0.44140625,
      "learning_rate": 6.149598368155386e-05,
      "loss": 0.2269,
      "step": 1090
    },
    {
      "epoch": 1.1538461538461537,
      "grad_norm": 0.419921875,
      "learning_rate": 6.13410159490846e-05,
      "loss": 0.2304,
      "step": 1095
    },
    {
      "epoch": 1.1591148577449948,
      "grad_norm": 0.48046875,
      "learning_rate": 6.118569433340927e-05,
      "loss": 0.2246,
      "step": 1100
    },
    {
      "epoch": 1.1643835616438356,
      "grad_norm": 0.427734375,
      "learning_rate": 6.103002368115805e-05,
      "loss": 0.219,
      "step": 1105
    },
    {
      "epoch": 1.1696522655426764,
      "grad_norm": 0.43359375,
      "learning_rate": 6.0874008849852566e-05,
      "loss": 0.2299,
      "step": 1110
    },
    {
      "epoch": 1.1749209694415175,
      "grad_norm": 0.4296875,
      "learning_rate": 6.071765470775406e-05,
      "loss": 0.2222,
      "step": 1115
    },
    {
      "epoch": 1.1801896733403583,
      "grad_norm": 0.423828125,
      "learning_rate": 6.056096613371163e-05,
      "loss": 0.224,
      "step": 1120
    },
    {
      "epoch": 1.1854583772391991,
      "grad_norm": 0.458984375,
      "learning_rate": 6.040394801700989e-05,
      "loss": 0.2249,
      "step": 1125
    },
    {
      "epoch": 1.1907270811380402,
      "grad_norm": 0.4375,
      "learning_rate": 6.024660525721645e-05,
      "loss": 0.2188,
      "step": 1130
    },
    {
      "epoch": 1.195995785036881,
      "grad_norm": 0.423828125,
      "learning_rate": 6.008894276402905e-05,
      "loss": 0.2212,
      "step": 1135
    },
    {
      "epoch": 1.2012644889357218,
      "grad_norm": 0.4375,
      "learning_rate": 5.993096545712233e-05,
      "loss": 0.2234,
      "step": 1140
    },
    {
      "epoch": 1.2065331928345626,
      "grad_norm": 0.42578125,
      "learning_rate": 5.977267826599435e-05,
      "loss": 0.2186,
      "step": 1145
    },
    {
      "epoch": 1.2118018967334037,
      "grad_norm": 0.4453125,
      "learning_rate": 5.9614086129812696e-05,
      "loss": 0.2269,
      "step": 1150
    },
    {
      "epoch": 1.2170706006322445,
      "grad_norm": 0.4609375,
      "learning_rate": 5.945519399726045e-05,
      "loss": 0.2257,
      "step": 1155
    },
    {
      "epoch": 1.2223393045310853,
      "grad_norm": 0.443359375,
      "learning_rate": 5.929600682638171e-05,
      "loss": 0.2274,
      "step": 1160
    },
    {
      "epoch": 1.2276080084299261,
      "grad_norm": 0.4296875,
      "learning_rate": 5.913652958442693e-05,
      "loss": 0.2169,
      "step": 1165
    },
    {
      "epoch": 1.2328767123287672,
      "grad_norm": 0.44921875,
      "learning_rate": 5.8976767247697856e-05,
      "loss": 0.2267,
      "step": 1170
    },
    {
      "epoch": 1.238145416227608,
      "grad_norm": 0.4296875,
      "learning_rate": 5.88167248013923e-05,
      "loss": 0.2205,
      "step": 1175
    },
    {
      "epoch": 1.2434141201264488,
      "grad_norm": 0.453125,
      "learning_rate": 5.865640723944859e-05,
      "loss": 0.2256,
      "step": 1180
    },
    {
      "epoch": 1.2486828240252899,
      "grad_norm": 0.44140625,
      "learning_rate": 5.849581956438969e-05,
      "loss": 0.2294,
      "step": 1185
    },
    {
      "epoch": 1.2539515279241307,
      "grad_norm": 0.470703125,
      "learning_rate": 5.8334966787167135e-05,
      "loss": 0.2247,
      "step": 1190
    },
    {
      "epoch": 1.2592202318229715,
      "grad_norm": 0.419921875,
      "learning_rate": 5.8173853927004676e-05,
      "loss": 0.2187,
      "step": 1195
    },
    {
      "epoch": 1.2644889357218125,
      "grad_norm": 0.4453125,
      "learning_rate": 5.801248601124164e-05,
      "loss": 0.2276,
      "step": 1200
    },
    {
      "epoch": 1.2697576396206534,
      "grad_norm": 0.42578125,
      "learning_rate": 5.7850868075176056e-05,
      "loss": 0.235,
      "step": 1205
    },
    {
      "epoch": 1.2750263435194942,
      "grad_norm": 0.416015625,
      "learning_rate": 5.7689005161907586e-05,
      "loss": 0.2252,
      "step": 1210
    },
    {
      "epoch": 1.2802950474183352,
      "grad_norm": 0.4296875,
      "learning_rate": 5.752690232218005e-05,
      "loss": 0.2244,
      "step": 1215
    },
    {
      "epoch": 1.285563751317176,
      "grad_norm": 0.44921875,
      "learning_rate": 5.7364564614223974e-05,
      "loss": 0.2224,
      "step": 1220
    },
    {
      "epoch": 1.2908324552160169,
      "grad_norm": 0.44140625,
      "learning_rate": 5.720199710359862e-05,
      "loss": 0.2329,
      "step": 1225
    },
    {
      "epoch": 1.2961011591148577,
      "grad_norm": 0.419921875,
      "learning_rate": 5.703920486303399e-05,
      "loss": 0.2193,
      "step": 1230
    },
    {
      "epoch": 1.3013698630136985,
      "grad_norm": 0.44140625,
      "learning_rate": 5.6876192972272516e-05,
      "loss": 0.2306,
      "step": 1235
    },
    {
      "epoch": 1.3066385669125395,
      "grad_norm": 0.443359375,
      "learning_rate": 5.6712966517910595e-05,
      "loss": 0.2199,
      "step": 1240
    },
    {
      "epoch": 1.3119072708113804,
      "grad_norm": 0.42578125,
      "learning_rate": 5.654953059323978e-05,
      "loss": 0.2217,
      "step": 1245
    },
    {
      "epoch": 1.3171759747102212,
      "grad_norm": 0.42578125,
      "learning_rate": 5.638589029808793e-05,
      "loss": 0.2288,
      "step": 1250
    },
    {
      "epoch": 1.3224446786090622,
      "grad_norm": 0.45703125,
      "learning_rate": 5.6222050738660043e-05,
      "loss": 0.2265,
      "step": 1255
    },
    {
      "epoch": 1.327713382507903,
      "grad_norm": 0.431640625,
      "learning_rate": 5.605801702737892e-05,
      "loss": 0.2341,
      "step": 1260
    },
    {
      "epoch": 1.3329820864067439,
      "grad_norm": 0.453125,
      "learning_rate": 5.5893794282725646e-05,
      "loss": 0.23,
      "step": 1265
    },
    {
      "epoch": 1.338250790305585,
      "grad_norm": 0.4453125,
      "learning_rate": 5.5729387629079884e-05,
      "loss": 0.2156,
      "step": 1270
    },
    {
      "epoch": 1.3435194942044257,
      "grad_norm": 0.4609375,
      "learning_rate": 5.556480219655995e-05,
      "loss": 0.2279,
      "step": 1275
    },
    {
      "epoch": 1.3487881981032666,
      "grad_norm": 0.4296875,
      "learning_rate": 5.540004312086276e-05,
      "loss": 0.2238,
      "step": 1280
    },
    {
      "epoch": 1.3540569020021076,
      "grad_norm": 0.474609375,
      "learning_rate": 5.523511554310354e-05,
      "loss": 0.234,
      "step": 1285
    },
    {
      "epoch": 1.3593256059009484,
      "grad_norm": 0.43359375,
      "learning_rate": 5.507002460965547e-05,
      "loss": 0.2243,
      "step": 1290
    },
    {
      "epoch": 1.3645943097997892,
      "grad_norm": 0.43359375,
      "learning_rate": 5.490477547198899e-05,
      "loss": 0.2217,
      "step": 1295
    },
    {
      "epoch": 1.36986301369863,
      "grad_norm": 0.455078125,
      "learning_rate": 5.47393732865112e-05,
      "loss": 0.2297,
      "step": 1300
    },
    {
      "epoch": 1.375131717597471,
      "grad_norm": 0.421875,
      "learning_rate": 5.457382321440477e-05,
      "loss": 0.2282,
      "step": 1305
    },
    {
      "epoch": 1.380400421496312,
      "grad_norm": 0.44921875,
      "learning_rate": 5.4408130421467115e-05,
      "loss": 0.2275,
      "step": 1310
    },
    {
      "epoch": 1.3856691253951527,
      "grad_norm": 0.43359375,
      "learning_rate": 5.424230007794903e-05,
      "loss": 0.2246,
      "step": 1315
    },
    {
      "epoch": 1.3909378292939936,
      "grad_norm": 0.486328125,
      "learning_rate": 5.40763373583934e-05,
      "loss": 0.2187,
      "step": 1320
    },
    {
      "epoch": 1.3962065331928346,
      "grad_norm": 0.45703125,
      "learning_rate": 5.391024744147379e-05,
      "loss": 0.2226,
      "step": 1325
    },
    {
      "epoch": 1.4014752370916754,
      "grad_norm": 0.455078125,
      "learning_rate": 5.374403550983279e-05,
      "loss": 0.2213,
      "step": 1330
    },
    {
      "epoch": 1.4067439409905163,
      "grad_norm": 0.455078125,
      "learning_rate": 5.357770674992032e-05,
      "loss": 0.226,
      "step": 1335
    },
    {
      "epoch": 1.4120126448893573,
      "grad_norm": 0.42578125,
      "learning_rate": 5.341126635183178e-05,
      "loss": 0.2262,
      "step": 1340
    },
    {
      "epoch": 1.4172813487881981,
      "grad_norm": 0.44921875,
      "learning_rate": 5.324471950914613e-05,
      "loss": 0.2298,
      "step": 1345
    },
    {
      "epoch": 1.422550052687039,
      "grad_norm": 0.4453125,
      "learning_rate": 5.30780714187638e-05,
      "loss": 0.2197,
      "step": 1350
    },
    {
      "epoch": 1.42781875658588,
      "grad_norm": 0.421875,
      "learning_rate": 5.291132728074453e-05,
      "loss": 0.2231,
      "step": 1355
    },
    {
      "epoch": 1.4330874604847208,
      "grad_norm": 0.44140625,
      "learning_rate": 5.2744492298145136e-05,
      "loss": 0.2251,
      "step": 1360
    },
    {
      "epoch": 1.4383561643835616,
      "grad_norm": 0.453125,
      "learning_rate": 5.25775716768571e-05,
      "loss": 0.2252,
      "step": 1365
    },
    {
      "epoch": 1.4436248682824027,
      "grad_norm": 0.42578125,
      "learning_rate": 5.24105706254442e-05,
      "loss": 0.2238,
      "step": 1370
    },
    {
      "epoch": 1.4488935721812435,
      "grad_norm": 0.4140625,
      "learning_rate": 5.224349435497989e-05,
      "loss": 0.2259,
      "step": 1375
    },
    {
      "epoch": 1.4541622760800843,
      "grad_norm": 0.44140625,
      "learning_rate": 5.207634807888481e-05,
      "loss": 0.2172,
      "step": 1380
    },
    {
      "epoch": 1.4594309799789251,
      "grad_norm": 0.43359375,
      "learning_rate": 5.190913701276398e-05,
      "loss": 0.2133,
      "step": 1385
    },
    {
      "epoch": 1.464699683877766,
      "grad_norm": 0.43359375,
      "learning_rate": 5.1741866374244174e-05,
      "loss": 0.2195,
      "step": 1390
    },
    {
      "epoch": 1.469968387776607,
      "grad_norm": 0.4296875,
      "learning_rate": 5.157454138281102e-05,
      "loss": 0.2223,
      "step": 1395
    },
    {
      "epoch": 1.4752370916754478,
      "grad_norm": 0.431640625,
      "learning_rate": 5.140716725964618e-05,
      "loss": 0.2163,
      "step": 1400
    },
    {
      "epoch": 1.4805057955742886,
      "grad_norm": 0.4140625,
      "learning_rate": 5.1239749227464393e-05,
      "loss": 0.2264,
      "step": 1405
    },
    {
      "epoch": 1.4857744994731297,
      "grad_norm": 0.416015625,
      "learning_rate": 5.107229251035056e-05,
      "loss": 0.2168,
      "step": 1410
    },
    {
      "epoch": 1.4910432033719705,
      "grad_norm": 0.4140625,
      "learning_rate": 5.090480233359667e-05,
      "loss": 0.2221,
      "step": 1415
    },
    {
      "epoch": 1.4963119072708113,
      "grad_norm": 0.435546875,
      "learning_rate": 5.07372839235388e-05,
      "loss": 0.2257,
      "step": 1420
    },
    {
      "epoch": 1.5015806111696524,
      "grad_norm": 0.443359375,
      "learning_rate": 5.056974250739401e-05,
      "loss": 0.2241,
      "step": 1425
    },
    {
      "epoch": 1.5068493150684932,
      "grad_norm": 0.451171875,
      "learning_rate": 5.0402183313097235e-05,
      "loss": 0.2253,
      "step": 1430
    },
    {
      "epoch": 1.512118018967334,
      "grad_norm": 0.4375,
      "learning_rate": 5.023461156913818e-05,
      "loss": 0.2195,
      "step": 1435
    },
    {
      "epoch": 1.517386722866175,
      "grad_norm": 0.43359375,
      "learning_rate": 5.0067032504398086e-05,
      "loss": 0.2172,
      "step": 1440
    },
    {
      "epoch": 1.5226554267650156,
      "grad_norm": 0.423828125,
      "learning_rate": 4.98994513479867e-05,
      "loss": 0.2243,
      "step": 1445
    },
    {
      "epoch": 1.5279241306638567,
      "grad_norm": 0.423828125,
      "learning_rate": 4.9731873329079e-05,
      "loss": 0.224,
      "step": 1450
    },
    {
      "epoch": 1.5331928345626977,
      "grad_norm": 0.404296875,
      "learning_rate": 4.9564303676752075e-05,
      "loss": 0.2187,
      "step": 1455
    },
    {
      "epoch": 1.5384615384615383,
      "grad_norm": 0.44140625,
      "learning_rate": 4.9396747619821925e-05,
      "loss": 0.2247,
      "step": 1460
    },
    {
      "epoch": 1.5437302423603794,
      "grad_norm": 0.451171875,
      "learning_rate": 4.922921038668035e-05,
      "loss": 0.2195,
      "step": 1465
    },
    {
      "epoch": 1.5489989462592202,
      "grad_norm": 0.41796875,
      "learning_rate": 4.906169720513176e-05,
      "loss": 0.222,
      "step": 1470
    },
    {
      "epoch": 1.554267650158061,
      "grad_norm": 0.431640625,
      "learning_rate": 4.8894213302230055e-05,
      "loss": 0.2275,
      "step": 1475
    },
    {
      "epoch": 1.559536354056902,
      "grad_norm": 0.421875,
      "learning_rate": 4.8726763904115556e-05,
      "loss": 0.2193,
      "step": 1480
    },
    {
      "epoch": 1.5648050579557429,
      "grad_norm": 0.44140625,
      "learning_rate": 4.8559354235851854e-05,
      "loss": 0.2289,
      "step": 1485
    },
    {
      "epoch": 1.5700737618545837,
      "grad_norm": 0.41796875,
      "learning_rate": 4.839198952126288e-05,
      "loss": 0.2217,
      "step": 1490
    },
    {
      "epoch": 1.5753424657534247,
      "grad_norm": 0.435546875,
      "learning_rate": 4.8224674982769796e-05,
      "loss": 0.2146,
      "step": 1495
    },
    {
      "epoch": 1.5806111696522656,
      "grad_norm": 0.421875,
      "learning_rate": 4.805741584122808e-05,
      "loss": 0.2147,
      "step": 1500
    },
    {
      "epoch": 1.5806111696522656,
      "eval_loss": 0.29037579894065857,
      "eval_runtime": 2.4367,
      "eval_samples_per_second": 19.289,
      "eval_steps_per_second": 19.289,
      "step": 1500
    },
    {
      "epoch": 1.5858798735511064,
      "grad_norm": 0.435546875,
      "learning_rate": 4.78902173157646e-05,
      "loss": 0.2182,
      "step": 1505
    },
    {
      "epoch": 1.5911485774499474,
      "grad_norm": 0.423828125,
      "learning_rate": 4.772308462361483e-05,
      "loss": 0.2202,
      "step": 1510
    },
    {
      "epoch": 1.5964172813487882,
      "grad_norm": 0.4296875,
      "learning_rate": 4.7556022979959925e-05,
      "loss": 0.2248,
      "step": 1515
    },
    {
      "epoch": 1.601685985247629,
      "grad_norm": 0.41796875,
      "learning_rate": 4.7389037597764084e-05,
      "loss": 0.2185,
      "step": 1520
    },
    {
      "epoch": 1.60695468914647,
      "grad_norm": 0.412109375,
      "learning_rate": 4.7222133687611846e-05,
      "loss": 0.2143,
      "step": 1525
    },
    {
      "epoch": 1.6122233930453107,
      "grad_norm": 0.427734375,
      "learning_rate": 4.705531645754552e-05,
      "loss": 0.228,
      "step": 1530
    },
    {
      "epoch": 1.6174920969441517,
      "grad_norm": 0.427734375,
      "learning_rate": 4.688859111290268e-05,
      "loss": 0.2205,
      "step": 1535
    },
    {
      "epoch": 1.6227608008429928,
      "grad_norm": 0.435546875,
      "learning_rate": 4.672196285615367e-05,
      "loss": 0.2179,
      "step": 1540
    },
    {
      "epoch": 1.6280295047418334,
      "grad_norm": 0.421875,
      "learning_rate": 4.655543688673936e-05,
      "loss": 0.2252,
      "step": 1545
    },
    {
      "epoch": 1.6332982086406744,
      "grad_norm": 0.419921875,
      "learning_rate": 4.638901840090886e-05,
      "loss": 0.2207,
      "step": 1550
    },
    {
      "epoch": 1.6385669125395153,
      "grad_norm": 0.42578125,
      "learning_rate": 4.6222712591557375e-05,
      "loss": 0.2213,
      "step": 1555
    },
    {
      "epoch": 1.643835616438356,
      "grad_norm": 0.427734375,
      "learning_rate": 4.6056524648064163e-05,
      "loss": 0.2202,
      "step": 1560
    },
    {
      "epoch": 1.6491043203371971,
      "grad_norm": 0.419921875,
      "learning_rate": 4.589045975613062e-05,
      "loss": 0.2162,
      "step": 1565
    },
    {
      "epoch": 1.654373024236038,
      "grad_norm": 0.412109375,
      "learning_rate": 4.572452309761847e-05,
      "loss": 0.2169,
      "step": 1570
    },
    {
      "epoch": 1.6596417281348788,
      "grad_norm": 0.439453125,
      "learning_rate": 4.555871985038805e-05,
      "loss": 0.2237,
      "step": 1575
    },
    {
      "epoch": 1.6649104320337198,
      "grad_norm": 0.427734375,
      "learning_rate": 4.539305518813677e-05,
      "loss": 0.2207,
      "step": 1580
    },
    {
      "epoch": 1.6701791359325606,
      "grad_norm": 0.4140625,
      "learning_rate": 4.5227534280237645e-05,
      "loss": 0.2167,
      "step": 1585
    },
    {
      "epoch": 1.6754478398314014,
      "grad_norm": 0.423828125,
      "learning_rate": 4.506216229157797e-05,
      "loss": 0.2191,
      "step": 1590
    },
    {
      "epoch": 1.6807165437302425,
      "grad_norm": 0.423828125,
      "learning_rate": 4.489694438239827e-05,
      "loss": 0.2171,
      "step": 1595
    },
    {
      "epoch": 1.685985247629083,
      "grad_norm": 0.416015625,
      "learning_rate": 4.4731885708131135e-05,
      "loss": 0.2191,
      "step": 1600
    },
    {
      "epoch": 1.6912539515279241,
      "grad_norm": 0.427734375,
      "learning_rate": 4.456699141924041e-05,
      "loss": 0.2195,
      "step": 1605
    },
    {
      "epoch": 1.6965226554267652,
      "grad_norm": 0.419921875,
      "learning_rate": 4.4402266661060535e-05,
      "loss": 0.2168,
      "step": 1610
    },
    {
      "epoch": 1.7017913593256058,
      "grad_norm": 0.42578125,
      "learning_rate": 4.4237716573635895e-05,
      "loss": 0.22,
      "step": 1615
    },
    {
      "epoch": 1.7070600632244468,
      "grad_norm": 0.4140625,
      "learning_rate": 4.40733462915605e-05,
      "loss": 0.2103,
      "step": 1620
    },
    {
      "epoch": 1.7123287671232876,
      "grad_norm": 0.453125,
      "learning_rate": 4.390916094381774e-05,
      "loss": 0.2154,
      "step": 1625
    },
    {
      "epoch": 1.7175974710221285,
      "grad_norm": 0.45703125,
      "learning_rate": 4.374516565362034e-05,
      "loss": 0.223,
      "step": 1630
    },
    {
      "epoch": 1.7228661749209695,
      "grad_norm": 0.419921875,
      "learning_rate": 4.35813655382505e-05,
      "loss": 0.2193,
      "step": 1635
    },
    {
      "epoch": 1.7281348788198103,
      "grad_norm": 0.4453125,
      "learning_rate": 4.341776570890024e-05,
      "loss": 0.213,
      "step": 1640
    },
    {
      "epoch": 1.7334035827186511,
      "grad_norm": 0.41796875,
      "learning_rate": 4.325437127051184e-05,
      "loss": 0.219,
      "step": 1645
    },
    {
      "epoch": 1.7386722866174922,
      "grad_norm": 0.419921875,
      "learning_rate": 4.309118732161865e-05,
      "loss": 0.2188,
      "step": 1650
    },
    {
      "epoch": 1.743940990516333,
      "grad_norm": 0.427734375,
      "learning_rate": 4.292821895418588e-05,
      "loss": 0.2181,
      "step": 1655
    },
    {
      "epoch": 1.7492096944151738,
      "grad_norm": 0.42578125,
      "learning_rate": 4.2765471253451824e-05,
      "loss": 0.2168,
      "step": 1660
    },
    {
      "epoch": 1.7544783983140149,
      "grad_norm": 0.4296875,
      "learning_rate": 4.260294929776911e-05,
      "loss": 0.2231,
      "step": 1665
    },
    {
      "epoch": 1.7597471022128557,
      "grad_norm": 0.41015625,
      "learning_rate": 4.244065815844624e-05,
      "loss": 0.2084,
      "step": 1670
    },
    {
      "epoch": 1.7650158061116965,
      "grad_norm": 0.421875,
      "learning_rate": 4.227860289958938e-05,
      "loss": 0.2095,
      "step": 1675
    },
    {
      "epoch": 1.7702845100105375,
      "grad_norm": 0.431640625,
      "learning_rate": 4.211678857794432e-05,
      "loss": 0.2118,
      "step": 1680
    },
    {
      "epoch": 1.7755532139093781,
      "grad_norm": 0.44140625,
      "learning_rate": 4.1955220242738666e-05,
      "loss": 0.2131,
      "step": 1685
    },
    {
      "epoch": 1.7808219178082192,
      "grad_norm": 0.427734375,
      "learning_rate": 4.1793902935524314e-05,
      "loss": 0.2141,
      "step": 1690
    },
    {
      "epoch": 1.78609062170706,
      "grad_norm": 0.421875,
      "learning_rate": 4.163284169002013e-05,
      "loss": 0.2102,
      "step": 1695
    },
    {
      "epoch": 1.7913593256059008,
      "grad_norm": 0.435546875,
      "learning_rate": 4.147204153195486e-05,
      "loss": 0.2179,
      "step": 1700
    },
    {
      "epoch": 1.7966280295047419,
      "grad_norm": 0.435546875,
      "learning_rate": 4.1311507478910346e-05,
      "loss": 0.2133,
      "step": 1705
    },
    {
      "epoch": 1.8018967334035827,
      "grad_norm": 0.453125,
      "learning_rate": 4.11512445401649e-05,
      "loss": 0.2259,
      "step": 1710
    },
    {
      "epoch": 1.8071654373024235,
      "grad_norm": 0.421875,
      "learning_rate": 4.099125771653703e-05,
      "loss": 0.2247,
      "step": 1715
    },
    {
      "epoch": 1.8124341412012646,
      "grad_norm": 0.423828125,
      "learning_rate": 4.083155200022942e-05,
      "loss": 0.2104,
      "step": 1720
    },
    {
      "epoch": 1.8177028451001054,
      "grad_norm": 0.416015625,
      "learning_rate": 4.067213237467312e-05,
      "loss": 0.2152,
      "step": 1725
    },
    {
      "epoch": 1.8229715489989462,
      "grad_norm": 0.431640625,
      "learning_rate": 4.051300381437201e-05,
      "loss": 0.2213,
      "step": 1730
    },
    {
      "epoch": 1.8282402528977872,
      "grad_norm": 0.408203125,
      "learning_rate": 4.035417128474769e-05,
      "loss": 0.2156,
      "step": 1735
    },
    {
      "epoch": 1.833508956796628,
      "grad_norm": 0.423828125,
      "learning_rate": 4.019563974198439e-05,
      "loss": 0.2119,
      "step": 1740
    },
    {
      "epoch": 1.8387776606954689,
      "grad_norm": 0.419921875,
      "learning_rate": 4.0037414132874454e-05,
      "loss": 0.2149,
      "step": 1745
    },
    {
      "epoch": 1.84404636459431,
      "grad_norm": 0.404296875,
      "learning_rate": 3.98794993946639e-05,
      "loss": 0.207,
      "step": 1750
    },
    {
      "epoch": 1.8493150684931505,
      "grad_norm": 0.423828125,
      "learning_rate": 3.972190045489838e-05,
| "loss": 0.207, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 1.8545837723919916, | |
| "grad_norm": 0.412109375, | |
| "learning_rate": 3.956462223126941e-05, | |
| "loss": 0.2176, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.8598524762908326, | |
| "grad_norm": 0.419921875, | |
| "learning_rate": 3.940766963146097e-05, | |
| "loss": 0.2195, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 1.8651211801896732, | |
| "grad_norm": 0.419921875, | |
| "learning_rate": 3.9251047552996304e-05, | |
| "loss": 0.2203, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.8703898840885143, | |
| "grad_norm": 0.43359375, | |
| "learning_rate": 3.9094760883085096e-05, | |
| "loss": 0.2217, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 1.875658587987355, | |
| "grad_norm": 0.404296875, | |
| "learning_rate": 3.8938814498471055e-05, | |
| "loss": 0.2036, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.880927291886196, | |
| "grad_norm": 0.41796875, | |
| "learning_rate": 3.8783213265279634e-05, | |
| "loss": 0.2165, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 1.886195995785037, | |
| "grad_norm": 0.41796875, | |
| "learning_rate": 3.8627962038866255e-05, | |
| "loss": 0.2108, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.8914646996838778, | |
| "grad_norm": 0.43359375, | |
| "learning_rate": 3.84730656636648e-05, | |
| "loss": 0.2106, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 1.8967334035827186, | |
| "grad_norm": 0.41796875, | |
| "learning_rate": 3.8318528973036395e-05, | |
| "loss": 0.2144, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.9020021074815596, | |
| "grad_norm": 0.41796875, | |
| "learning_rate": 3.816435678911868e-05, | |
| "loss": 0.2099, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 1.9072708113804004, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 3.801055392267523e-05, | |
| "loss": 0.2168, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.9125395152792413, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 3.785712517294552e-05, | |
| "loss": 0.2062, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 1.9178082191780823, | |
| "grad_norm": 0.439453125, | |
| "learning_rate": 3.770407532749519e-05, | |
| "loss": 0.2215, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.9230769230769231, | |
| "grad_norm": 0.416015625, | |
| "learning_rate": 3.755140916206654e-05, | |
| "loss": 0.2111, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 1.928345626975764, | |
| "grad_norm": 0.421875, | |
| "learning_rate": 3.739913144042963e-05, | |
| "loss": 0.2109, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.933614330874605, | |
| "grad_norm": 0.416015625, | |
| "learning_rate": 3.7247246914233584e-05, | |
| "loss": 0.2133, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 1.9388830347734456, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 3.709576032285829e-05, | |
| "loss": 0.2128, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.9441517386722866, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 3.694467639326656e-05, | |
| "loss": 0.2161, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 1.9494204425711275, | |
| "grad_norm": 0.43359375, | |
| "learning_rate": 3.679399983985663e-05, | |
| "loss": 0.2215, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.9546891464699683, | |
| "grad_norm": 0.412109375, | |
| "learning_rate": 3.6643735364314995e-05, | |
| "loss": 0.2111, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 1.9599578503688093, | |
| "grad_norm": 0.41015625, | |
| "learning_rate": 3.6493887655469796e-05, | |
| "loss": 0.2141, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.9652265542676501, | |
| "grad_norm": 0.4375, | |
| "learning_rate": 3.63444613891444e-05, | |
| "loss": 0.2102, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 1.970495258166491, | |
| "grad_norm": 0.412109375, | |
| "learning_rate": 3.619546122801158e-05, | |
| "loss": 0.204, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.975763962065332, | |
| "grad_norm": 0.419921875, | |
| "learning_rate": 3.604689182144798e-05, | |
| "loss": 0.2057, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 1.9810326659641728, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 3.589875780538906e-05, | |
| "loss": 0.2107, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.9863013698630136, | |
| "grad_norm": 0.4375, | |
| "learning_rate": 3.575106380218442e-05, | |
| "loss": 0.2202, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 1.9915700737618547, | |
| "grad_norm": 0.4140625, | |
| "learning_rate": 3.5603814420453566e-05, | |
| "loss": 0.2075, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.9968387776606955, | |
| "grad_norm": 0.416015625, | |
| "learning_rate": 3.5457014254942126e-05, | |
| "loss": 0.2163, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 1.9978925184404637, | |
| "eval_loss": 0.28488120436668396, | |
| "eval_runtime": 2.5242, | |
| "eval_samples_per_second": 18.619, | |
| "eval_steps_per_second": 18.619, | |
| "step": 1896 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2847, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.051871750229852e+18, | |
| "train_batch_size": 100, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
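
The `log_history` above interleaves periodic training entries (one every `logging_steps` = 5 steps) with evaluation records such as the final `eval_loss` entry at step 1896. A minimal sketch, assuming this file is saved under the usual name `trainer_state.json` as written by the Hugging Face Trainer, for pulling the loss curve and the last eval metrics out of it:

```python
# Minimal sketch (assumption: the JSON above is stored as trainer_state.json,
# the state file the Hugging Face Trainer writes into each checkpoint folder).
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

steps = [e["step"] for e in train_log]
losses = [e["loss"] for e in train_log]

print(f"logged {len(train_log)} training points up to step {state['global_step']}")
print(f"final train loss: {losses[-1]:.4f} at step {steps[-1]}")
if eval_log:
    last_eval = eval_log[-1]
    print(f"eval loss: {last_eval['eval_loss']:.4f} at epoch {last_eval['epoch']:.3f}")
```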