| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 1858, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0005382131324004305, | |
| "grad_norm": 24.46331800159023, | |
| "learning_rate": 5.376344086021506e-08, | |
| "loss": 1.3638, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.002691065662002153, | |
| "grad_norm": 24.232442902094746, | |
| "learning_rate": 2.688172043010753e-07, | |
| "loss": 1.3692, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.005382131324004306, | |
| "grad_norm": 14.752612983186452, | |
| "learning_rate": 5.376344086021506e-07, | |
| "loss": 1.3038, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.008073196986006458, | |
| "grad_norm": 11.800751606437952, | |
| "learning_rate": 8.064516129032258e-07, | |
| "loss": 1.166, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.010764262648008612, | |
| "grad_norm": 9.554957230427643, | |
| "learning_rate": 1.0752688172043011e-06, | |
| "loss": 1.0338, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.013455328310010764, | |
| "grad_norm": 3.628600831390835, | |
| "learning_rate": 1.3440860215053765e-06, | |
| "loss": 0.9317, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.016146393972012917, | |
| "grad_norm": 3.2255977477470297, | |
| "learning_rate": 1.6129032258064516e-06, | |
| "loss": 0.9012, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.01883745963401507, | |
| "grad_norm": 3.0289714775630516, | |
| "learning_rate": 1.881720430107527e-06, | |
| "loss": 0.8827, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.021528525296017224, | |
| "grad_norm": 3.1823522788295704, | |
| "learning_rate": 2.1505376344086023e-06, | |
| "loss": 0.8644, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.024219590958019375, | |
| "grad_norm": 2.8342525430348315, | |
| "learning_rate": 2.4193548387096776e-06, | |
| "loss": 0.8452, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.02691065662002153, | |
| "grad_norm": 3.052168326380914, | |
| "learning_rate": 2.688172043010753e-06, | |
| "loss": 0.8347, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.029601722282023683, | |
| "grad_norm": 2.8700531898378, | |
| "learning_rate": 2.9569892473118283e-06, | |
| "loss": 0.8117, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.03229278794402583, | |
| "grad_norm": 2.9876737446045385, | |
| "learning_rate": 3.225806451612903e-06, | |
| "loss": 0.815, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.03498385360602799, | |
| "grad_norm": 3.0224919131706023, | |
| "learning_rate": 3.494623655913979e-06, | |
| "loss": 0.7953, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.03767491926803014, | |
| "grad_norm": 3.0473808806620517, | |
| "learning_rate": 3.763440860215054e-06, | |
| "loss": 0.7859, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.040365984930032295, | |
| "grad_norm": 3.096384352129974, | |
| "learning_rate": 4.032258064516129e-06, | |
| "loss": 0.7971, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.04305705059203445, | |
| "grad_norm": 3.1948521945715074, | |
| "learning_rate": 4.3010752688172045e-06, | |
| "loss": 0.7941, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.045748116254036596, | |
| "grad_norm": 3.0704452147023473, | |
| "learning_rate": 4.56989247311828e-06, | |
| "loss": 0.7655, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.04843918191603875, | |
| "grad_norm": 3.213509560134052, | |
| "learning_rate": 4.838709677419355e-06, | |
| "loss": 0.7705, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.051130247578040904, | |
| "grad_norm": 3.551334383153254, | |
| "learning_rate": 5.1075268817204305e-06, | |
| "loss": 0.7731, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.05382131324004306, | |
| "grad_norm": 3.3115985648508595, | |
| "learning_rate": 5.376344086021506e-06, | |
| "loss": 0.7421, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.05651237890204521, | |
| "grad_norm": 3.024863181759173, | |
| "learning_rate": 5.645161290322582e-06, | |
| "loss": 0.7446, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.059203444564047365, | |
| "grad_norm": 3.003509243588486, | |
| "learning_rate": 5.9139784946236566e-06, | |
| "loss": 0.7168, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.06189451022604951, | |
| "grad_norm": 2.8904939122926785, | |
| "learning_rate": 6.182795698924732e-06, | |
| "loss": 0.7226, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.06458557588805167, | |
| "grad_norm": 3.0907313428716585, | |
| "learning_rate": 6.451612903225806e-06, | |
| "loss": 0.7287, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.06727664155005382, | |
| "grad_norm": 2.908190969126188, | |
| "learning_rate": 6.720430107526882e-06, | |
| "loss": 0.7362, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.06996770721205597, | |
| "grad_norm": 3.0277469501250267, | |
| "learning_rate": 6.989247311827958e-06, | |
| "loss": 0.7127, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.07265877287405813, | |
| "grad_norm": 2.9375347724298946, | |
| "learning_rate": 7.258064516129033e-06, | |
| "loss": 0.7057, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.07534983853606028, | |
| "grad_norm": 2.7968808634431097, | |
| "learning_rate": 7.526881720430108e-06, | |
| "loss": 0.7212, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.07804090419806244, | |
| "grad_norm": 3.0782635414890342, | |
| "learning_rate": 7.795698924731183e-06, | |
| "loss": 0.7121, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.08073196986006459, | |
| "grad_norm": 2.957567985639813, | |
| "learning_rate": 8.064516129032258e-06, | |
| "loss": 0.7285, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.08342303552206674, | |
| "grad_norm": 2.8558517093443045, | |
| "learning_rate": 8.333333333333334e-06, | |
| "loss": 0.7182, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.0861141011840689, | |
| "grad_norm": 2.875223992789043, | |
| "learning_rate": 8.602150537634409e-06, | |
| "loss": 0.7179, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.08880516684607104, | |
| "grad_norm": 3.0182318773591517, | |
| "learning_rate": 8.870967741935484e-06, | |
| "loss": 0.702, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.09149623250807319, | |
| "grad_norm": 2.7804594099140014, | |
| "learning_rate": 9.13978494623656e-06, | |
| "loss": 0.7105, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.09418729817007535, | |
| "grad_norm": 2.8174567099055157, | |
| "learning_rate": 9.408602150537635e-06, | |
| "loss": 0.7095, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.0968783638320775, | |
| "grad_norm": 2.803920967572859, | |
| "learning_rate": 9.67741935483871e-06, | |
| "loss": 0.7048, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.09956942949407965, | |
| "grad_norm": 2.7583241887420917, | |
| "learning_rate": 9.946236559139786e-06, | |
| "loss": 0.7161, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.10226049515608181, | |
| "grad_norm": 2.9304582816909286, | |
| "learning_rate": 9.999858783596665e-06, | |
| "loss": 0.7136, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.10495156081808396, | |
| "grad_norm": 2.8592073188711886, | |
| "learning_rate": 9.999285105629308e-06, | |
| "loss": 0.7213, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.10764262648008611, | |
| "grad_norm": 2.6445409308883256, | |
| "learning_rate": 9.998270190666602e-06, | |
| "loss": 0.71, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.11033369214208827, | |
| "grad_norm": 2.811803987053168, | |
| "learning_rate": 9.99681412828496e-06, | |
| "loss": 0.6946, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.11302475780409042, | |
| "grad_norm": 2.60382159101248, | |
| "learning_rate": 9.994917046996472e-06, | |
| "loss": 0.6824, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.11571582346609258, | |
| "grad_norm": 2.7632251330599327, | |
| "learning_rate": 9.99257911423757e-06, | |
| "loss": 0.712, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.11840688912809473, | |
| "grad_norm": 2.737121694120479, | |
| "learning_rate": 9.989800536354243e-06, | |
| "loss": 0.705, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.12109795479009688, | |
| "grad_norm": 2.7950967614991105, | |
| "learning_rate": 9.986581558583824e-06, | |
| "loss": 0.6785, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.12378902045209902, | |
| "grad_norm": 2.580895787265613, | |
| "learning_rate": 9.98292246503335e-06, | |
| "loss": 0.6921, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.1264800861141012, | |
| "grad_norm": 2.740755897744941, | |
| "learning_rate": 9.978823578654486e-06, | |
| "loss": 0.7008, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.12917115177610333, | |
| "grad_norm": 2.5740723463717514, | |
| "learning_rate": 9.97428526121502e-06, | |
| "loss": 0.6851, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.1318622174381055, | |
| "grad_norm": 2.5648119982196316, | |
| "learning_rate": 9.969307913266931e-06, | |
| "loss": 0.6918, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.13455328310010764, | |
| "grad_norm": 2.6146329921031968, | |
| "learning_rate": 9.963891974111042e-06, | |
| "loss": 0.686, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.1372443487621098, | |
| "grad_norm": 2.5693313714525225, | |
| "learning_rate": 9.958037921758241e-06, | |
| "loss": 0.6851, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.13993541442411195, | |
| "grad_norm": 2.5758611472453183, | |
| "learning_rate": 9.951746272887298e-06, | |
| "loss": 0.6825, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.1426264800861141, | |
| "grad_norm": 2.6882089017132866, | |
| "learning_rate": 9.945017582799256e-06, | |
| "loss": 0.6857, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.14531754574811626, | |
| "grad_norm": 2.471441857805247, | |
| "learning_rate": 9.937852445368427e-06, | |
| "loss": 0.7087, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.1480086114101184, | |
| "grad_norm": 2.499362888582163, | |
| "learning_rate": 9.930251492989972e-06, | |
| "loss": 0.6834, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.15069967707212056, | |
| "grad_norm": 2.4532511507065684, | |
| "learning_rate": 9.922215396524089e-06, | |
| "loss": 0.6841, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.1533907427341227, | |
| "grad_norm": 2.476586585638722, | |
| "learning_rate": 9.913744865236798e-06, | |
| "loss": 0.6738, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.15608180839612487, | |
| "grad_norm": 2.4741617401097593, | |
| "learning_rate": 9.904840646737346e-06, | |
| "loss": 0.6977, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.158772874058127, | |
| "grad_norm": 2.4646734777776405, | |
| "learning_rate": 9.895503526912224e-06, | |
| "loss": 0.6664, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.16146393972012918, | |
| "grad_norm": 2.541571671880384, | |
| "learning_rate": 9.885734329855798e-06, | |
| "loss": 0.6578, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.16415500538213132, | |
| "grad_norm": 2.512328994032953, | |
| "learning_rate": 9.875533917797579e-06, | |
| "loss": 0.6564, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.1668460710441335, | |
| "grad_norm": 2.608219630629651, | |
| "learning_rate": 9.864903191026125e-06, | |
| "loss": 0.6676, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.16953713670613563, | |
| "grad_norm": 2.5127310208806226, | |
| "learning_rate": 9.853843087809574e-06, | |
| "loss": 0.675, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.1722282023681378, | |
| "grad_norm": 2.954995883472683, | |
| "learning_rate": 9.842354584312841e-06, | |
| "loss": 0.6711, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.17491926803013993, | |
| "grad_norm": 2.4877912144296723, | |
| "learning_rate": 9.830438694511454e-06, | |
| "loss": 0.6532, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.17761033369214208, | |
| "grad_norm": 2.4109884507809998, | |
| "learning_rate": 9.818096470102067e-06, | |
| "loss": 0.6742, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.18030139935414424, | |
| "grad_norm": 2.7208831806314633, | |
| "learning_rate": 9.805329000409634e-06, | |
| "loss": 0.6626, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.18299246501614638, | |
| "grad_norm": 2.70446788462542, | |
| "learning_rate": 9.792137412291265e-06, | |
| "loss": 0.6578, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.18568353067814855, | |
| "grad_norm": 2.554719643429019, | |
| "learning_rate": 9.778522870036768e-06, | |
| "loss": 0.6386, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.1883745963401507, | |
| "grad_norm": 2.548018478530073, | |
| "learning_rate": 9.764486575265893e-06, | |
| "loss": 0.653, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.19106566200215286, | |
| "grad_norm": 2.720938193286558, | |
| "learning_rate": 9.750029766822277e-06, | |
| "loss": 0.6579, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.193756727664155, | |
| "grad_norm": 3.2604834173029267, | |
| "learning_rate": 9.735153720664096e-06, | |
| "loss": 0.6357, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.19644779332615717, | |
| "grad_norm": 2.854496420392015, | |
| "learning_rate": 9.719859749751462e-06, | |
| "loss": 0.643, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.1991388589881593, | |
| "grad_norm": 2.5148923925605295, | |
| "learning_rate": 9.704149203930522e-06, | |
| "loss": 0.6314, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.20182992465016147, | |
| "grad_norm": 2.6156775549028097, | |
| "learning_rate": 9.688023469814345e-06, | |
| "loss": 0.6291, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.20452099031216361, | |
| "grad_norm": 2.4436480149075903, | |
| "learning_rate": 9.671483970660519e-06, | |
| "loss": 0.6391, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.20721205597416578, | |
| "grad_norm": 2.6176248575730683, | |
| "learning_rate": 9.654532166245543e-06, | |
| "loss": 0.6451, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.20990312163616792, | |
| "grad_norm": 2.6284920127628704, | |
| "learning_rate": 9.637169552735985e-06, | |
| "loss": 0.6457, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.21259418729817006, | |
| "grad_norm": 2.5079900288395676, | |
| "learning_rate": 9.619397662556434e-06, | |
| "loss": 0.6363, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.21528525296017223, | |
| "grad_norm": 2.510239111931181, | |
| "learning_rate": 9.601218064254245e-06, | |
| "loss": 0.636, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.21797631862217437, | |
| "grad_norm": 2.6420717433236294, | |
| "learning_rate": 9.582632362361098e-06, | |
| "loss": 0.638, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.22066738428417654, | |
| "grad_norm": 2.4228761926249343, | |
| "learning_rate": 9.563642197251382e-06, | |
| "loss": 0.6373, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.22335844994617868, | |
| "grad_norm": 2.482460770882953, | |
| "learning_rate": 9.54424924499742e-06, | |
| "loss": 0.6138, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.22604951560818085, | |
| "grad_norm": 2.4554209699636904, | |
| "learning_rate": 9.524455217221537e-06, | |
| "loss": 0.6297, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.22874058127018299, | |
| "grad_norm": 2.2267259427452686, | |
| "learning_rate": 9.504261860944984e-06, | |
| "loss": 0.6345, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.23143164693218515, | |
| "grad_norm": 2.5062964865846467, | |
| "learning_rate": 9.48367095843376e-06, | |
| "loss": 0.6271, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.2341227125941873, | |
| "grad_norm": 2.5483397828557353, | |
| "learning_rate": 9.462684327041298e-06, | |
| "loss": 0.6137, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.23681377825618946, | |
| "grad_norm": 6.340986217773637, | |
| "learning_rate": 9.441303819048073e-06, | |
| "loss": 0.6236, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.2395048439181916, | |
| "grad_norm": 3.6533277008246787, | |
| "learning_rate": 9.41953132149811e-06, | |
| "loss": 0.6201, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.24219590958019377, | |
| "grad_norm": 2.539798703180873, | |
| "learning_rate": 9.397368756032445e-06, | |
| "loss": 0.623, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.2448869752421959, | |
| "grad_norm": 2.569421112776594, | |
| "learning_rate": 9.374818078719515e-06, | |
| "loss": 0.6129, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.24757804090419805, | |
| "grad_norm": 2.3715746149053953, | |
| "learning_rate": 9.351881279882512e-06, | |
| "loss": 0.6268, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.2502691065662002, | |
| "grad_norm": 2.572579471719146, | |
| "learning_rate": 9.328560383923724e-06, | |
| "loss": 0.6161, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.2529601722282024, | |
| "grad_norm": 2.4902601550757737, | |
| "learning_rate": 9.304857449145858e-06, | |
| "loss": 0.6244, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.2556512378902045, | |
| "grad_norm": 2.4501823079061658, | |
| "learning_rate": 9.280774567570372e-06, | |
| "loss": 0.6287, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.25834230355220666, | |
| "grad_norm": 2.5682717978671126, | |
| "learning_rate": 9.256313864752838e-06, | |
| "loss": 0.604, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.26103336921420883, | |
| "grad_norm": 2.404883352610733, | |
| "learning_rate": 9.231477499595333e-06, | |
| "loss": 0.6138, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.263724434876211, | |
| "grad_norm": 2.562822568126396, | |
| "learning_rate": 9.206267664155906e-06, | |
| "loss": 0.6054, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.2664155005382131, | |
| "grad_norm": 2.321411236526974, | |
| "learning_rate": 9.180686583455097e-06, | |
| "loss": 0.5948, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.2691065662002153, | |
| "grad_norm": 2.475132775898768, | |
| "learning_rate": 9.154736515279557e-06, | |
| "loss": 0.5905, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.27179763186221745, | |
| "grad_norm": 2.3355547839115354, | |
| "learning_rate": 9.12841974998278e-06, | |
| "loss": 0.5813, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.2744886975242196, | |
| "grad_norm": 2.742626737076485, | |
| "learning_rate": 9.101738610282956e-06, | |
| "loss": 0.6136, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.27717976318622173, | |
| "grad_norm": 2.52218472070317, | |
| "learning_rate": 9.074695451057966e-06, | |
| "loss": 0.6002, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.2798708288482239, | |
| "grad_norm": 2.364034819036867, | |
| "learning_rate": 9.047292659137542e-06, | |
| "loss": 0.6055, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.28256189451022606, | |
| "grad_norm": 2.5115978707447524, | |
| "learning_rate": 9.019532653092597e-06, | |
| "loss": 0.5978, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.2852529601722282, | |
| "grad_norm": 2.4450725301262457, | |
| "learning_rate": 8.99141788302178e-06, | |
| "loss": 0.5748, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.28794402583423034, | |
| "grad_norm": 2.5635891607971764, | |
| "learning_rate": 8.962950830335213e-06, | |
| "loss": 0.6019, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.2906350914962325, | |
| "grad_norm": 2.6838630282585316, | |
| "learning_rate": 8.93413400753549e-06, | |
| "loss": 0.6119, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.2933261571582347, | |
| "grad_norm": 2.559914467821027, | |
| "learning_rate": 8.90496995799592e-06, | |
| "loss": 0.5934, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.2960172228202368, | |
| "grad_norm": 2.3172236979073633, | |
| "learning_rate": 8.875461255736055e-06, | |
| "loss": 0.5923, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.29870828848223896, | |
| "grad_norm": 2.613084787164224, | |
| "learning_rate": 8.845610505194495e-06, | |
| "loss": 0.5881, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.3013993541442411, | |
| "grad_norm": 2.527915276378667, | |
| "learning_rate": 8.815420340999034e-06, | |
| "loss": 0.5877, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.3040904198062433, | |
| "grad_norm": 2.566801330647483, | |
| "learning_rate": 8.784893427734117e-06, | |
| "loss": 0.5742, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.3067814854682454, | |
| "grad_norm": 2.363950845115375, | |
| "learning_rate": 8.754032459705672e-06, | |
| "loss": 0.5828, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.3094725511302476, | |
| "grad_norm": 2.2951780141802307, | |
| "learning_rate": 8.722840160703304e-06, | |
| "loss": 0.5825, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.31216361679224974, | |
| "grad_norm": 2.53863020294738, | |
| "learning_rate": 8.691319283759896e-06, | |
| "loss": 0.5751, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.3148546824542519, | |
| "grad_norm": 2.357318941568188, | |
| "learning_rate": 8.659472610908628e-06, | |
| "loss": 0.5897, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.317545748116254, | |
| "grad_norm": 2.318670881116376, | |
| "learning_rate": 8.627302952937431e-06, | |
| "loss": 0.5649, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.3202368137782562, | |
| "grad_norm": 2.2618602596217388, | |
| "learning_rate": 8.594813149140908e-06, | |
| "loss": 0.577, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.32292787944025836, | |
| "grad_norm": 2.39290329100585, | |
| "learning_rate": 8.56200606706974e-06, | |
| "loss": 0.5543, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.32561894510226047, | |
| "grad_norm": 2.470727799467087, | |
| "learning_rate": 8.528884602277593e-06, | |
| "loss": 0.5823, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.32831001076426264, | |
| "grad_norm": 2.4343415189492403, | |
| "learning_rate": 8.495451678065563e-06, | |
| "loss": 0.583, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.3310010764262648, | |
| "grad_norm": 2.329527413930226, | |
| "learning_rate": 8.461710245224149e-06, | |
| "loss": 0.5596, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.333692142088267, | |
| "grad_norm": 2.710197180873677, | |
| "learning_rate": 8.42766328177284e-06, | |
| "loss": 0.5688, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.3363832077502691, | |
| "grad_norm": 2.3295706886393908, | |
| "learning_rate": 8.393313792697251e-06, | |
| "loss": 0.5606, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.33907427341227125, | |
| "grad_norm": 2.7411314085447893, | |
| "learning_rate": 8.358664809683926e-06, | |
| "loss": 0.5679, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.3417653390742734, | |
| "grad_norm": 2.354293086420231, | |
| "learning_rate": 8.323719390852735e-06, | |
| "loss": 0.5737, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.3444564047362756, | |
| "grad_norm": 2.5084140873961083, | |
| "learning_rate": 8.288480620486991e-06, | |
| "loss": 0.5479, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.3471474703982777, | |
| "grad_norm": 2.4064859741841427, | |
| "learning_rate": 8.252951608761217e-06, | |
| "loss": 0.5574, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.34983853606027987, | |
| "grad_norm": 2.4777423126966185, | |
| "learning_rate": 8.217135491466636e-06, | |
| "loss": 0.5666, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.35252960172228204, | |
| "grad_norm": 2.3905287579939416, | |
| "learning_rate": 8.181035429734423e-06, | |
| "loss": 0.5629, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.35522066738428415, | |
| "grad_norm": 2.327857140876721, | |
| "learning_rate": 8.144654609756685e-06, | |
| "loss": 0.5372, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.3579117330462863, | |
| "grad_norm": 2.290053555489138, | |
| "learning_rate": 8.10799624250527e-06, | |
| "loss": 0.5646, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.3606027987082885, | |
| "grad_norm": 2.297702496538287, | |
| "learning_rate": 8.071063563448341e-06, | |
| "loss": 0.553, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.36329386437029065, | |
| "grad_norm": 2.302044112251165, | |
| "learning_rate": 8.03385983226483e-06, | |
| "loss": 0.5381, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.36598493003229277, | |
| "grad_norm": 2.508769537248855, | |
| "learning_rate": 7.996388332556735e-06, | |
| "loss": 0.5433, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.36867599569429493, | |
| "grad_norm": 2.4009515047198193, | |
| "learning_rate": 7.958652371559313e-06, | |
| "loss": 0.5524, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.3713670613562971, | |
| "grad_norm": 2.3402259477581198, | |
| "learning_rate": 7.920655279849173e-06, | |
| "loss": 0.5609, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.37405812701829927, | |
| "grad_norm": 2.71945405647194, | |
| "learning_rate": 7.882400411050328e-06, | |
| "loss": 0.5414, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.3767491926803014, | |
| "grad_norm": 2.325373282991122, | |
| "learning_rate": 7.843891141538201e-06, | |
| "loss": 0.5352, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.37944025834230355, | |
| "grad_norm": 2.243347153689357, | |
| "learning_rate": 7.80513087014163e-06, | |
| "loss": 0.5503, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.3821313240043057, | |
| "grad_norm": 2.2504693819243506, | |
| "learning_rate": 7.766123017842877e-06, | |
| "loss": 0.5408, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.3848223896663079, | |
| "grad_norm": 2.24546531050267, | |
| "learning_rate": 7.726871027475709e-06, | |
| "loss": 0.5384, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.38751345532831, | |
| "grad_norm": 2.452293080065367, | |
| "learning_rate": 7.687378363421512e-06, | |
| "loss": 0.5435, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.39020452099031216, | |
| "grad_norm": 2.4702348319235625, | |
| "learning_rate": 7.647648511303545e-06, | |
| "loss": 0.5366, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.39289558665231433, | |
| "grad_norm": 2.6594839428477646, | |
| "learning_rate": 7.607684977679284e-06, | |
| "loss": 0.5311, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.39558665231431644, | |
| "grad_norm": 2.3853401333086097, | |
| "learning_rate": 7.567491289730944e-06, | |
| "loss": 0.5347, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.3982777179763186, | |
| "grad_norm": 2.305805010232876, | |
| "learning_rate": 7.52707099495416e-06, | |
| "loss": 0.5198, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.4009687836383208, | |
| "grad_norm": 2.32709407366988, | |
| "learning_rate": 7.4864276608448925e-06, | |
| "loss": 0.5162, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.40365984930032295, | |
| "grad_norm": 2.2558330926103993, | |
| "learning_rate": 7.44556487458456e-06, | |
| "loss": 0.5157, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.40635091496232506, | |
| "grad_norm": 2.358890714175164, | |
| "learning_rate": 7.404486242723428e-06, | |
| "loss": 0.5223, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.40904198062432723, | |
| "grad_norm": 2.4688588460570036, | |
| "learning_rate": 7.363195390862298e-06, | |
| "loss": 0.5306, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.4117330462863294, | |
| "grad_norm": 2.342696497908346, | |
| "learning_rate": 7.321695963332516e-06, | |
| "loss": 0.5331, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.41442411194833156, | |
| "grad_norm": 2.5064305240289144, | |
| "learning_rate": 7.279991622874319e-06, | |
| "loss": 0.5397, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.4171151776103337, | |
| "grad_norm": 2.3898169886899296, | |
| "learning_rate": 7.238086050313563e-06, | |
| "loss": 0.5197, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.41980624327233584, | |
| "grad_norm": 2.429842329992259, | |
| "learning_rate": 7.195982944236853e-06, | |
| "loss": 0.516, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.422497308934338, | |
| "grad_norm": 2.3477023797859813, | |
| "learning_rate": 7.1536860206651025e-06, | |
| "loss": 0.5215, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.4251883745963401, | |
| "grad_norm": 2.3798189210672587, | |
| "learning_rate": 7.1111990127255684e-06, | |
| "loss": 0.5033, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.4278794402583423, | |
| "grad_norm": 2.400485115987879, | |
| "learning_rate": 7.068525670322349e-06, | |
| "loss": 0.5301, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.43057050592034446, | |
| "grad_norm": 2.830405312883021, | |
| "learning_rate": 7.025669759805431e-06, | |
| "loss": 0.5226, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.4332615715823466, | |
| "grad_norm": 2.267489328220382, | |
| "learning_rate": 6.982635063638265e-06, | |
| "loss": 0.5065, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.43595263724434874, | |
| "grad_norm": 2.452358924939666, | |
| "learning_rate": 6.939425380063924e-06, | |
| "loss": 0.5037, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.4386437029063509, | |
| "grad_norm": 2.4030600337466588, | |
| "learning_rate": 6.896044522769879e-06, | |
| "loss": 0.5181, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.4413347685683531, | |
| "grad_norm": 2.385954958028487, | |
| "learning_rate": 6.852496320551387e-06, | |
| "loss": 0.513, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.44402583423035524, | |
| "grad_norm": 2.3598226733949064, | |
| "learning_rate": 6.808784616973581e-06, | |
| "loss": 0.5126, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.44671689989235736, | |
| "grad_norm": 2.3565314879299053, | |
| "learning_rate": 6.76491327003222e-06, | |
| "loss": 0.5029, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.4494079655543595, | |
| "grad_norm": 2.5160272506828756, | |
| "learning_rate": 6.720886151813194e-06, | |
| "loss": 0.4934, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.4520990312163617, | |
| "grad_norm": 2.386495536841602, | |
| "learning_rate": 6.676707148150763e-06, | |
| "loss": 0.5032, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.45479009687836386, | |
| "grad_norm": 2.436343528178474, | |
| "learning_rate": 6.632380158284607e-06, | |
| "loss": 0.4946, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.45748116254036597, | |
| "grad_norm": 2.4789859597196426, | |
| "learning_rate": 6.587909094515663e-06, | |
| "loss": 0.5066, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.46017222820236814, | |
| "grad_norm": 2.417011825533269, | |
| "learning_rate": 6.5432978818608395e-06, | |
| "loss": 0.5112, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.4628632938643703, | |
| "grad_norm": 2.3434112329726275, | |
| "learning_rate": 6.498550457706584e-06, | |
| "loss": 0.5045, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.4655543595263724, | |
| "grad_norm": 2.517810982631902, | |
| "learning_rate": 6.453670771461377e-06, | |
| "loss": 0.4905, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.4682454251883746, | |
| "grad_norm": 2.3345178470528385, | |
| "learning_rate": 6.408662784207149e-06, | |
| "loss": 0.5141, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.47093649085037675, | |
| "grad_norm": 2.39523867329611, | |
| "learning_rate": 6.363530468349686e-06, | |
| "loss": 0.4879, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.4736275565123789, | |
| "grad_norm": 2.308366752180236, | |
| "learning_rate": 6.318277807268013e-06, | |
| "loss": 0.4956, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.47631862217438103, | |
| "grad_norm": 2.2009448335762127, | |
| "learning_rate": 6.27290879496283e-06, | |
| "loss": 0.4946, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.4790096878363832, | |
| "grad_norm": 2.506472513608515, | |
| "learning_rate": 6.227427435703997e-06, | |
| "loss": 0.4747, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.48170075349838537, | |
| "grad_norm": 2.3312661677886157, | |
| "learning_rate": 6.181837743677118e-06, | |
| "loss": 0.4781, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.48439181916038754, | |
| "grad_norm": 2.415462314886686, | |
| "learning_rate": 6.136143742629252e-06, | |
| "loss": 0.4846, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.48708288482238965, | |
| "grad_norm": 2.5220206493746997, | |
| "learning_rate": 6.09034946551377e-06, | |
| "loss": 0.4891, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.4897739504843918, | |
| "grad_norm": 2.4607305732947853, | |
| "learning_rate": 6.044458954134411e-06, | |
| "loss": 0.5057, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.492465016146394, | |
| "grad_norm": 2.306884124252388, | |
| "learning_rate": 5.998476258788555e-06, | |
| "loss": 0.477, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.4951560818083961, | |
| "grad_norm": 2.385382261424978, | |
| "learning_rate": 5.952405437909738e-06, | |
| "loss": 0.4812, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.49784714747039827, | |
| "grad_norm": 2.2862418710775847, | |
| "learning_rate": 5.90625055770946e-06, | |
| "loss": 0.4784, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.5005382131324004, | |
| "grad_norm": 2.3522872723971093, | |
| "learning_rate": 5.860015691818292e-06, | |
| "loss": 0.4849, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.5032292787944026, | |
| "grad_norm": 2.257405887575007, | |
| "learning_rate": 5.813704920926352e-06, | |
| "loss": 0.4799, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.5059203444564048, | |
| "grad_norm": 2.5409180122003225, | |
| "learning_rate": 5.767322332423128e-06, | |
| "loss": 0.4753, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.5086114101184069, | |
| "grad_norm": 2.2912568364325137, | |
| "learning_rate": 5.720872020036734e-06, | |
| "loss": 0.4836, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.511302475780409, | |
| "grad_norm": 2.3404279089465256, | |
| "learning_rate": 5.674358083472598e-06, | |
| "loss": 0.4709, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.5139935414424112, | |
| "grad_norm": 2.3138319616276504, | |
| "learning_rate": 5.6277846280516125e-06, | |
| "loss": 0.4775, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.5166846071044133, | |
| "grad_norm": 2.319930769748735, | |
| "learning_rate": 5.581155764347812e-06, | |
| "loss": 0.4737, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.5193756727664155, | |
| "grad_norm": 2.2631450594519573, | |
| "learning_rate": 5.534475607825566e-06, | |
| "loss": 0.4681, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.5220667384284177, | |
| "grad_norm": 2.2993752726651877, | |
| "learning_rate": 5.487748278476342e-06, | |
| "loss": 0.4744, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.5247578040904198, | |
| "grad_norm": 2.6084393355683853, | |
| "learning_rate": 5.440977900455093e-06, | |
| "loss": 0.474, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.527448869752422, | |
| "grad_norm": 2.2812262068529767, | |
| "learning_rate": 5.39416860171624e-06, | |
| "loss": 0.4603, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.5301399354144241, | |
| "grad_norm": 2.366439732267079, | |
| "learning_rate": 5.347324513649352e-06, | |
| "loss": 0.4554, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.5328310010764262, | |
| "grad_norm": 2.282327173713366, | |
| "learning_rate": 5.300449770714502e-06, | |
| "loss": 0.4484, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.5355220667384284, | |
| "grad_norm": 2.388092616206348, | |
| "learning_rate": 5.253548510077366e-06, | |
| "loss": 0.4565, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 0.5382131324004306, | |
| "grad_norm": 2.347808334356873, | |
| "learning_rate": 5.206624871244066e-06, | |
| "loss": 0.4581, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.5409041980624327, | |
| "grad_norm": 2.2379521568960206, | |
| "learning_rate": 5.159682995695833e-06, | |
| "loss": 0.4477, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 0.5435952637244349, | |
| "grad_norm": 2.2336687017970296, | |
| "learning_rate": 5.112727026523461e-06, | |
| "loss": 0.4559, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.5462863293864371, | |
| "grad_norm": 2.3526152961096223, | |
| "learning_rate": 5.065761108061658e-06, | |
| "loss": 0.4517, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 0.5489773950484392, | |
| "grad_norm": 2.3430650960275305, | |
| "learning_rate": 5.018789385523245e-06, | |
| "loss": 0.4651, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.5516684607104413, | |
| "grad_norm": 2.238164378282626, | |
| "learning_rate": 4.971816004633323e-06, | |
| "loss": 0.4461, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.5543595263724435, | |
| "grad_norm": 2.357723214292236, | |
| "learning_rate": 4.924845111263349e-06, | |
| "loss": 0.4475, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.5570505920344456, | |
| "grad_norm": 2.416110748499095, | |
| "learning_rate": 4.877880851065238e-06, | |
| "loss": 0.4621, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 0.5597416576964478, | |
| "grad_norm": 2.446048600948981, | |
| "learning_rate": 4.830927369105457e-06, | |
| "loss": 0.4585, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.56243272335845, | |
| "grad_norm": 2.381977000693451, | |
| "learning_rate": 4.783988809499187e-06, | |
| "loss": 0.4544, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 0.5651237890204521, | |
| "grad_norm": 2.633241388056459, | |
| "learning_rate": 4.737069315044562e-06, | |
| "loss": 0.44, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.5678148546824543, | |
| "grad_norm": 2.2816953591294937, | |
| "learning_rate": 4.690173026857028e-06, | |
| "loss": 0.4501, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 0.5705059203444564, | |
| "grad_norm": 2.446396815822862, | |
| "learning_rate": 4.643304084003839e-06, | |
| "loss": 0.4506, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.5731969860064585, | |
| "grad_norm": 2.184428941729192, | |
| "learning_rate": 4.596466623138756e-06, | |
| "loss": 0.4308, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 0.5758880516684607, | |
| "grad_norm": 2.3003382421283844, | |
| "learning_rate": 4.549664778136933e-06, | |
| "loss": 0.4416, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.5785791173304629, | |
| "grad_norm": 2.2767749774728783, | |
| "learning_rate": 4.502902679730074e-06, | |
| "loss": 0.4315, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.581270182992465, | |
| "grad_norm": 2.3411661207469674, | |
| "learning_rate": 4.456184455141843e-06, | |
| "loss": 0.447, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.5839612486544672, | |
| "grad_norm": 2.381303755147278, | |
| "learning_rate": 4.4095142277236015e-06, | |
| "loss": 0.4397, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 0.5866523143164694, | |
| "grad_norm": 2.332530420483932, | |
| "learning_rate": 4.362896116590475e-06, | |
| "loss": 0.4392, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.5893433799784715, | |
| "grad_norm": 2.3579190260561704, | |
| "learning_rate": 4.316334236257818e-06, | |
| "loss": 0.4328, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 0.5920344456404736, | |
| "grad_norm": 2.2783721282271685, | |
| "learning_rate": 4.269832696278038e-06, | |
| "loss": 0.4336, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.5947255113024758, | |
| "grad_norm": 2.300306639814476, | |
| "learning_rate": 4.223395600877912e-06, | |
| "loss": 0.4242, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 0.5974165769644779, | |
| "grad_norm": 2.351752301438986, | |
| "learning_rate": 4.17702704859633e-06, | |
| "loss": 0.4442, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.6001076426264801, | |
| "grad_norm": 2.4962239067091847, | |
| "learning_rate": 4.130731131922574e-06, | |
| "loss": 0.4378, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 0.6027987082884823, | |
| "grad_norm": 2.361239331500982, | |
| "learning_rate": 4.0845119369350995e-06, | |
| "loss": 0.4262, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.6054897739504844, | |
| "grad_norm": 2.1310276875997936, | |
| "learning_rate": 4.038373542940905e-06, | |
| "loss": 0.4183, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.6081808396124866, | |
| "grad_norm": 2.3017752163117744, | |
| "learning_rate": 3.992320022115492e-06, | |
| "loss": 0.4064, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.6108719052744886, | |
| "grad_norm": 2.288740574605373, | |
| "learning_rate": 3.946355439143455e-06, | |
| "loss": 0.4133, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 0.6135629709364908, | |
| "grad_norm": 2.2674805407755194, | |
| "learning_rate": 3.900483850859735e-06, | |
| "loss": 0.4275, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.616254036598493, | |
| "grad_norm": 2.3917723784081706, | |
| "learning_rate": 3.854709305891557e-06, | |
| "loss": 0.4311, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 0.6189451022604952, | |
| "grad_norm": 2.24620365098204, | |
| "learning_rate": 3.8090358443010993e-06, | |
| "loss": 0.4111, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.6216361679224973, | |
| "grad_norm": 2.297246451401738, | |
| "learning_rate": 3.7634674972289227e-06, | |
| "loss": 0.422, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 0.6243272335844995, | |
| "grad_norm": 2.1634920709192462, | |
| "learning_rate": 3.718008286538179e-06, | |
| "loss": 0.4322, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.6270182992465017, | |
| "grad_norm": 2.2583945433268857, | |
| "learning_rate": 3.67266222445964e-06, | |
| "loss": 0.4235, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 0.6297093649085038, | |
| "grad_norm": 2.194753709445314, | |
| "learning_rate": 3.627433313237576e-06, | |
| "loss": 0.4219, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.6324004305705059, | |
| "grad_norm": 2.1815873696945323, | |
| "learning_rate": 3.5823255447765233e-06, | |
| "loss": 0.4185, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.635091496232508, | |
| "grad_norm": 2.115828204783862, | |
| "learning_rate": 3.5373429002889583e-06, | |
| "loss": 0.4015, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.6377825618945102, | |
| "grad_norm": 2.250398497407886, | |
| "learning_rate": 3.4924893499439096e-06, | |
| "loss": 0.4164, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 0.6404736275565124, | |
| "grad_norm": 2.3734751418562405, | |
| "learning_rate": 3.447768852516554e-06, | |
| "loss": 0.4031, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.6431646932185145, | |
| "grad_norm": 2.2971665886143056, | |
| "learning_rate": 3.4031853550388176e-06, | |
| "loss": 0.4204, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 0.6458557588805167, | |
| "grad_norm": 2.148355296872698, | |
| "learning_rate": 3.3587427924510086e-06, | |
| "loss": 0.4088, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.6485468245425189, | |
| "grad_norm": 2.2063900099924636, | |
| "learning_rate": 3.314445087254518e-06, | |
| "loss": 0.4038, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 0.6512378902045209, | |
| "grad_norm": 2.396941304529023, | |
| "learning_rate": 3.2702961491656197e-06, | |
| "loss": 0.4095, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.6539289558665231, | |
| "grad_norm": 2.4764518437414558, | |
| "learning_rate": 3.226299874770402e-06, | |
| "loss": 0.4006, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 0.6566200215285253, | |
| "grad_norm": 2.169456697824407, | |
| "learning_rate": 3.1824601471808504e-06, | |
| "loss": 0.4102, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.6593110871905274, | |
| "grad_norm": 2.2877199164126223, | |
| "learning_rate": 3.138780835692132e-06, | |
| "loss": 0.4044, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.6620021528525296, | |
| "grad_norm": 2.411168783471376, | |
| "learning_rate": 3.0952657954410792e-06, | |
| "loss": 0.3998, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.6646932185145318, | |
| "grad_norm": 2.1915427873990687, | |
| "learning_rate": 3.051918867065944e-06, | |
| "loss": 0.3935, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 0.667384284176534, | |
| "grad_norm": 2.236321241200045, | |
| "learning_rate": 3.0087438763674226e-06, | |
| "loss": 0.3968, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.670075349838536, | |
| "grad_norm": 2.153491088223765, | |
| "learning_rate": 2.9657446339709906e-06, | |
| "loss": 0.4108, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 0.6727664155005382, | |
| "grad_norm": 2.1667604177883066, | |
| "learning_rate": 2.9229249349905686e-06, | |
| "loss": 0.4067, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.6754574811625403, | |
| "grad_norm": 2.3480884088972216, | |
| "learning_rate": 2.8802885586935794e-06, | |
| "loss": 0.3973, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 0.6781485468245425, | |
| "grad_norm": 2.363888356448008, | |
| "learning_rate": 2.837839268167373e-06, | |
| "loss": 0.3997, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.6808396124865447, | |
| "grad_norm": 2.1759737776060906, | |
| "learning_rate": 2.7955808099871196e-06, | |
| "loss": 0.41, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 0.6835306781485468, | |
| "grad_norm": 2.145769279313241, | |
| "learning_rate": 2.7535169138851124e-06, | |
| "loss": 0.3968, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.686221743810549, | |
| "grad_norm": 2.3938641882258738, | |
| "learning_rate": 2.711651292421593e-06, | |
| "loss": 0.3943, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.6889128094725512, | |
| "grad_norm": 2.2985974347283435, | |
| "learning_rate": 2.6699876406570823e-06, | |
| "loss": 0.4057, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.6916038751345532, | |
| "grad_norm": 2.212164701308494, | |
| "learning_rate": 2.62852963582625e-06, | |
| "loss": 0.4069, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 0.6942949407965554, | |
| "grad_norm": 2.2332995585806783, | |
| "learning_rate": 2.5872809370133704e-06, | |
| "loss": 0.3929, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.6969860064585576, | |
| "grad_norm": 2.154755198337086, | |
| "learning_rate": 2.5462451848293535e-06, | |
| "loss": 0.395, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 0.6996770721205597, | |
| "grad_norm": 2.35147288064046, | |
| "learning_rate": 2.5054260010904423e-06, | |
| "loss": 0.4131, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.7023681377825619, | |
| "grad_norm": 2.307361805622348, | |
| "learning_rate": 2.464826988498544e-06, | |
| "loss": 0.3889, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 0.7050592034445641, | |
| "grad_norm": 2.2374159276691294, | |
| "learning_rate": 2.424451730323261e-06, | |
| "loss": 0.3911, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.7077502691065662, | |
| "grad_norm": 2.0218101351051714, | |
| "learning_rate": 2.3843037900856174e-06, | |
| "loss": 0.3744, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 0.7104413347685683, | |
| "grad_norm": 2.117430212547741, | |
| "learning_rate": 2.3443867112435585e-06, | |
| "loss": 0.3735, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.7131324004305705, | |
| "grad_norm": 2.4379881536686625, | |
| "learning_rate": 2.304704016879195e-06, | |
| "loss": 0.3808, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.7158234660925726, | |
| "grad_norm": 2.186063282716259, | |
| "learning_rate": 2.265259209387867e-06, | |
| "loss": 0.401, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.7185145317545748, | |
| "grad_norm": 2.1034516865579316, | |
| "learning_rate": 2.226055770169002e-06, | |
| "loss": 0.3867, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 0.721205597416577, | |
| "grad_norm": 2.257291682766681, | |
| "learning_rate": 2.1870971593188704e-06, | |
| "loss": 0.3827, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.7238966630785791, | |
| "grad_norm": 2.2641761303433983, | |
| "learning_rate": 2.148386815325179e-06, | |
| "loss": 0.3795, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 0.7265877287405813, | |
| "grad_norm": 2.485493712722777, | |
| "learning_rate": 2.109928154763606e-06, | |
| "loss": 0.3878, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.7292787944025835, | |
| "grad_norm": 2.1251911128194285, | |
| "learning_rate": 2.0717245719962347e-06, | |
| "loss": 0.3664, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 0.7319698600645855, | |
| "grad_norm": 2.276033486452104, | |
| "learning_rate": 2.0337794388719845e-06, | |
| "loss": 0.3813, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.7346609257265877, | |
| "grad_norm": 2.2995322739078747, | |
| "learning_rate": 1.9960961044290015e-06, | |
| "loss": 0.3744, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 0.7373519913885899, | |
| "grad_norm": 2.0899009167827343, | |
| "learning_rate": 1.9586778945990785e-06, | |
| "loss": 0.3721, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.740043057050592, | |
| "grad_norm": 2.2617171211103395, | |
| "learning_rate": 1.921528111914102e-06, | |
| "loss": 0.3785, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.7427341227125942, | |
| "grad_norm": 2.154826412069349, | |
| "learning_rate": 1.8846500352145753e-06, | |
| "loss": 0.3762, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.7454251883745964, | |
| "grad_norm": 2.099913553338307, | |
| "learning_rate": 1.848046919360225e-06, | |
| "loss": 0.3707, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 0.7481162540365985, | |
| "grad_norm": 2.0516253842538767, | |
| "learning_rate": 1.811721994942731e-06, | |
| "loss": 0.3744, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.7508073196986006, | |
| "grad_norm": 2.1300015889778425, | |
| "learning_rate": 1.775678468000589e-06, | |
| "loss": 0.3762, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 0.7534983853606028, | |
| "grad_norm": 2.09371262898784, | |
| "learning_rate": 1.7399195197361507e-06, | |
| "loss": 0.3767, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.7561894510226049, | |
| "grad_norm": 2.0414347937365194, | |
| "learning_rate": 1.7044483062348465e-06, | |
| "loss": 0.3769, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 0.7588805166846071, | |
| "grad_norm": 2.245862892723086, | |
| "learning_rate": 1.6692679581866334e-06, | |
| "loss": 0.3699, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.7615715823466093, | |
| "grad_norm": 2.0019206442455535, | |
| "learning_rate": 1.6343815806096764e-06, | |
| "loss": 0.3718, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 0.7642626480086114, | |
| "grad_norm": 2.0667622426417975, | |
| "learning_rate": 1.5997922525763015e-06, | |
| "loss": 0.3709, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.7669537136706136, | |
| "grad_norm": 2.2454001706534252, | |
| "learning_rate": 1.5655030269412375e-06, | |
| "loss": 0.378, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.7696447793326158, | |
| "grad_norm": 2.0453161314199035, | |
| "learning_rate": 1.5315169300721694e-06, | |
| "loss": 0.3798, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.7723358449946178, | |
| "grad_norm": 2.122627263333417, | |
| "learning_rate": 1.4978369615826316e-06, | |
| "loss": 0.3642, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 0.77502691065662, | |
| "grad_norm": 2.253665103900189, | |
| "learning_rate": 1.4644660940672628e-06, | |
| "loss": 0.3649, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.7777179763186222, | |
| "grad_norm": 2.223866039627806, | |
| "learning_rate": 1.431407272839443e-06, | |
| "loss": 0.3648, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 0.7804090419806243, | |
| "grad_norm": 2.124642169885474, | |
| "learning_rate": 1.3986634156713418e-06, | |
| "loss": 0.3686, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.7831001076426265, | |
| "grad_norm": 2.039777090168692, | |
| "learning_rate": 1.3662374125363954e-06, | |
| "loss": 0.365, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 0.7857911733046287, | |
| "grad_norm": 2.1116486617070405, | |
| "learning_rate": 1.334132125354236e-06, | |
| "loss": 0.3636, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.7884822389666308, | |
| "grad_norm": 2.1955820453419217, | |
| "learning_rate": 1.302350387738101e-06, | |
| "loss": 0.3623, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 0.7911733046286329, | |
| "grad_norm": 2.1223258884983074, | |
| "learning_rate": 1.270895004744737e-06, | |
| "loss": 0.3631, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.7938643702906351, | |
| "grad_norm": 2.272320371876702, | |
| "learning_rate": 1.2397687526268248e-06, | |
| "loss": 0.3714, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.7965554359526372, | |
| "grad_norm": 2.234677152395205, | |
| "learning_rate": 1.2089743785879493e-06, | |
| "loss": 0.3613, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.7992465016146394, | |
| "grad_norm": 2.112632819570097, | |
| "learning_rate": 1.1785146005401292e-06, | |
| "loss": 0.3676, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 0.8019375672766416, | |
| "grad_norm": 2.0426642476776857, | |
| "learning_rate": 1.1483921068639353e-06, | |
| "loss": 0.3589, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.8046286329386437, | |
| "grad_norm": 2.2682633423265006, | |
| "learning_rate": 1.118609556171213e-06, | |
| "loss": 0.3702, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 0.8073196986006459, | |
| "grad_norm": 2.206478926731362, | |
| "learning_rate": 1.0891695770704341e-06, | |
| "loss": 0.3669, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.810010764262648, | |
| "grad_norm": 2.0838005667062425, | |
| "learning_rate": 1.0600747679346956e-06, | |
| "loss": 0.3649, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 0.8127018299246501, | |
| "grad_norm": 2.0526302864143515, | |
| "learning_rate": 1.0313276966723867e-06, | |
| "loss": 0.3607, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.8153928955866523, | |
| "grad_norm": 2.2411019803506704, | |
| "learning_rate": 1.002930900500546e-06, | |
| "loss": 0.3608, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 0.8180839612486545, | |
| "grad_norm": 2.2183253148113877, | |
| "learning_rate": 9.74886885720925e-07, | |
| "loss": 0.3489, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.8207750269106566, | |
| "grad_norm": 2.2046741515838555, | |
| "learning_rate": 9.471981274987846e-07, | |
| "loss": 0.3582, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.8234660925726588, | |
| "grad_norm": 2.0776101459474936, | |
| "learning_rate": 9.198670696444339e-07, | |
| "loss": 0.3684, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.826157158234661, | |
| "grad_norm": 2.1260490300416053, | |
| "learning_rate": 8.928961243975437e-07, | |
| "loss": 0.3602, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 0.8288482238966631, | |
| "grad_norm": 2.203007778671738, | |
| "learning_rate": 8.662876722142327e-07, | |
| "loss": 0.3676, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.8315392895586652, | |
| "grad_norm": 2.0515304374779415, | |
| "learning_rate": 8.400440615569849e-07, | |
| "loss": 0.3666, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 0.8342303552206674, | |
| "grad_norm": 2.235903012764375, | |
| "learning_rate": 8.141676086873574e-07, | |
| "loss": 0.35, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.8369214208826695, | |
| "grad_norm": 1.9898897228797456, | |
| "learning_rate": 7.886605974615574e-07, | |
| "loss": 0.3619, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 0.8396124865446717, | |
| "grad_norm": 2.0672348209333102, | |
| "learning_rate": 7.635252791288611e-07, | |
| "loss": 0.346, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.8423035522066739, | |
| "grad_norm": 2.140977921399784, | |
| "learning_rate": 7.38763872132931e-07, | |
| "loss": 0.3579, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 0.844994617868676, | |
| "grad_norm": 2.0050132762403137, | |
| "learning_rate": 7.143785619160026e-07, | |
| "loss": 0.3561, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.8476856835306782, | |
| "grad_norm": 2.0184002747173104, | |
| "learning_rate": 6.903715007260043e-07, | |
| "loss": 0.3555, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.8503767491926802, | |
| "grad_norm": 1.8477534752755447, | |
| "learning_rate": 6.667448074265954e-07, | |
| "loss": 0.3517, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.8530678148546824, | |
| "grad_norm": 2.0049189975542965, | |
| "learning_rate": 6.435005673101646e-07, | |
| "loss": 0.3388, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 0.8557588805166846, | |
| "grad_norm": 2.0424760670973345, | |
| "learning_rate": 6.206408319137703e-07, | |
| "loss": 0.371, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.8584499461786868, | |
| "grad_norm": 2.0546063213646577, | |
| "learning_rate": 5.981676188380802e-07, | |
| "loss": 0.3467, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 0.8611410118406889, | |
| "grad_norm": 2.0825364915777085, | |
| "learning_rate": 5.760829115692907e-07, | |
| "loss": 0.3593, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.8638320775026911, | |
| "grad_norm": 2.276064828333872, | |
| "learning_rate": 5.543886593040737e-07, | |
| "loss": 0.3527, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 0.8665231431646933, | |
| "grad_norm": 2.0176514994976524, | |
| "learning_rate": 5.330867767775333e-07, | |
| "loss": 0.3532, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.8692142088266954, | |
| "grad_norm": 2.0708726570249922, | |
| "learning_rate": 5.121791440942131e-07, | |
| "loss": 0.3498, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 0.8719052744886975, | |
| "grad_norm": 1.9377547369040535, | |
| "learning_rate": 4.916676065621562e-07, | |
| "loss": 0.3485, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.8745963401506996, | |
| "grad_norm": 1.9361489483911087, | |
| "learning_rate": 4.715539745300429e-07, | |
| "loss": 0.3433, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.8772874058127018, | |
| "grad_norm": 2.362723902179469, | |
| "learning_rate": 4.5184002322740784e-07, | |
| "loss": 0.3529, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.879978471474704, | |
| "grad_norm": 2.0061855571363765, | |
| "learning_rate": 4.3252749260795533e-07, | |
| "loss": 0.3405, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 0.8826695371367062, | |
| "grad_norm": 2.332323171973721, | |
| "learning_rate": 4.1361808719599163e-07, | |
| "loss": 0.3522, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.8853606027987083, | |
| "grad_norm": 2.1057140470170155, | |
| "learning_rate": 3.951134759359854e-07, | |
| "loss": 0.346, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 0.8880516684607105, | |
| "grad_norm": 1.927661281496956, | |
| "learning_rate": 3.7701529204526856e-07, | |
| "loss": 0.3592, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.8907427341227125, | |
| "grad_norm": 2.118098677778136, | |
| "learning_rate": 3.5932513286988436e-07, | |
| "loss": 0.3533, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 0.8934337997847147, | |
| "grad_norm": 1.971012885096697, | |
| "learning_rate": 3.420445597436056e-07, | |
| "loss": 0.3485, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.8961248654467169, | |
| "grad_norm": 2.0143344650309265, | |
| "learning_rate": 3.251750978501339e-07, | |
| "loss": 0.3503, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 0.898815931108719, | |
| "grad_norm": 2.0292306602899193, | |
| "learning_rate": 3.087182360884872e-07, | |
| "loss": 0.3416, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.9015069967707212, | |
| "grad_norm": 1.9564839527078925, | |
| "learning_rate": 2.926754269415877e-07, | |
| "loss": 0.3507, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.9041980624327234, | |
| "grad_norm": 2.1080877292592373, | |
| "learning_rate": 2.77048086348064e-07, | |
| "loss": 0.3544, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.9068891280947255, | |
| "grad_norm": 2.0632107292197617, | |
| "learning_rate": 2.6183759357728543e-07, | |
| "loss": 0.3502, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 0.9095801937567277, | |
| "grad_norm": 1.937198592937774, | |
| "learning_rate": 2.470452911076227e-07, | |
| "loss": 0.3485, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.9122712594187298, | |
| "grad_norm": 2.013603885615683, | |
| "learning_rate": 2.326724845079653e-07, | |
| "loss": 0.3572, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 0.9149623250807319, | |
| "grad_norm": 2.1281726376758066, | |
| "learning_rate": 2.1872044232248646e-07, | |
| "loss": 0.3521, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.9176533907427341, | |
| "grad_norm": 2.1720549170018897, | |
| "learning_rate": 2.0519039595868706e-07, | |
| "loss": 0.355, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 0.9203444564047363, | |
| "grad_norm": 2.058083501208556, | |
| "learning_rate": 1.9208353957870684e-07, | |
| "loss": 0.3606, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.9230355220667384, | |
| "grad_norm": 2.083719674982006, | |
| "learning_rate": 1.7940102999393194e-07, | |
| "loss": 0.3548, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 0.9257265877287406, | |
| "grad_norm": 2.130647708015, | |
| "learning_rate": 1.6714398656289154e-07, | |
| "loss": 0.3497, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.9284176533907428, | |
| "grad_norm": 1.9132420017221308, | |
| "learning_rate": 1.5531349109246364e-07, | |
| "loss": 0.3356, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.9311087190527448, | |
| "grad_norm": 2.0946347399647687, | |
| "learning_rate": 1.439105877423963e-07, | |
| "loss": 0.3495, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.933799784714747, | |
| "grad_norm": 2.08738192237864, | |
| "learning_rate": 1.3293628293314876e-07, | |
| "loss": 0.3517, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 0.9364908503767492, | |
| "grad_norm": 1.9669616900562368, | |
| "learning_rate": 1.223915452570651e-07, | |
| "loss": 0.3624, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.9391819160387513, | |
| "grad_norm": 2.0421605949353694, | |
| "learning_rate": 1.1227730539288717e-07, | |
| "loss": 0.3452, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 0.9418729817007535, | |
| "grad_norm": 2.0355903627837098, | |
| "learning_rate": 1.0259445602361084e-07, | |
| "loss": 0.3492, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.9445640473627557, | |
| "grad_norm": 2.0896912495997255, | |
| "learning_rate": 9.334385175769955e-08, | |
| "loss": 0.3572, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 0.9472551130247578, | |
| "grad_norm": 2.002959531267022, | |
| "learning_rate": 8.452630905365633e-08, | |
| "loss": 0.3481, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.94994617868676, | |
| "grad_norm": 2.136820278680856, | |
| "learning_rate": 7.614260614796143e-08, | |
| "loss": 0.3435, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 0.9526372443487621, | |
| "grad_norm": 1.9052167785885679, | |
| "learning_rate": 6.819348298638839e-08, | |
| "loss": 0.3381, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.9553283100107642, | |
| "grad_norm": 2.1788907550411403, | |
| "learning_rate": 6.067964115869297e-08, | |
| "loss": 0.3577, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.9580193756727664, | |
| "grad_norm": 2.085202442833548, | |
| "learning_rate": 5.36017438366937e-08, | |
| "loss": 0.3443, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.9607104413347686, | |
| "grad_norm": 1.9066530306055132, | |
| "learning_rate": 4.696041571573773e-08, | |
| "loss": 0.3514, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 0.9634015069967707, | |
| "grad_norm": 2.030414548308391, | |
| "learning_rate": 4.0756242959567596e-08, | |
| "loss": 0.3458, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.9660925726587729, | |
| "grad_norm": 2.3356985672893034, | |
| "learning_rate": 3.498977314858487e-08, | |
| "loss": 0.3532, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 0.9687836383207751, | |
| "grad_norm": 2.008020036717182, | |
| "learning_rate": 2.96615152315205e-08, | |
| "loss": 0.3371, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.9714747039827771, | |
| "grad_norm": 1.996053166126216, | |
| "learning_rate": 2.4771939480516817e-08, | |
| "loss": 0.345, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 0.9741657696447793, | |
| "grad_norm": 1.9817659128174747, | |
| "learning_rate": 2.0321477449619098e-08, | |
| "loss": 0.3428, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.9768568353067815, | |
| "grad_norm": 1.968843928974531, | |
| "learning_rate": 1.6310521936688806e-08, | |
| "loss": 0.3475, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 0.9795479009687836, | |
| "grad_norm": 2.008709148033631, | |
| "learning_rate": 1.2739426948732426e-08, | |
| "loss": 0.3579, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.9822389666307858, | |
| "grad_norm": 2.0860175169319852, | |
| "learning_rate": 9.608507670659239e-09, | |
| "loss": 0.3269, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.984930032292788, | |
| "grad_norm": 2.1919123246360415, | |
| "learning_rate": 6.918040437463025e-09, | |
| "loss": 0.3546, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.9876210979547901, | |
| "grad_norm": 2.197666814347832, | |
| "learning_rate": 4.668262709830451e-09, | |
| "loss": 0.3476, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 0.9903121636167922, | |
| "grad_norm": 2.0435447242057716, | |
| "learning_rate": 2.8593730531861764e-09, | |
| "loss": 0.3309, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.9930032292787944, | |
| "grad_norm": 2.1036408486178244, | |
| "learning_rate": 1.4915311201635362e-09, | |
| "loss": 0.3428, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 0.9956942949407965, | |
| "grad_norm": 1.9217810613809283, | |
| "learning_rate": 5.648576365169245e-10, | |
| "loss": 0.3392, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.9983853606027987, | |
| "grad_norm": 2.1168830996512846, | |
| "learning_rate": 7.943439046531609e-11, | |
| "loss": 0.3501, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_runtime": 3.853, | |
| "eval_samples_per_second": 2.595, | |
| "eval_steps_per_second": 0.779, | |
| "step": 1858 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 1858, | |
| "total_flos": 194513700126720.0, | |
| "train_loss": 0.5110224842126967, | |
| "train_runtime": 16738.0887, | |
| "train_samples_per_second": 1.776, | |
| "train_steps_per_second": 0.111 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1858, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 194513700126720.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |