{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 515,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.009708737864077669,
      "grad_norm": 32.0991353240983,
      "learning_rate": 4.999994831641374e-06,
      "loss": 4.1052,
      "step": 1
    },
    {
      "epoch": 0.019417475728155338,
      "grad_norm": 29.524023707455505,
      "learning_rate": 4.9999793265868636e-06,
      "loss": 3.2535,
      "step": 2
    },
    {
      "epoch": 0.02912621359223301,
      "grad_norm": 25.915328826410054,
      "learning_rate": 4.999953484900578e-06,
      "loss": 3.0603,
      "step": 3
    },
    {
      "epoch": 0.038834951456310676,
      "grad_norm": 26.925318996290773,
      "learning_rate": 4.9999173066893655e-06,
      "loss": 3.8736,
      "step": 4
    },
    {
      "epoch": 0.04854368932038835,
      "grad_norm": 17.49179072124481,
      "learning_rate": 4.9998707921028104e-06,
      "loss": 3.2503,
      "step": 5
    },
    {
      "epoch": 0.05825242718446602,
      "grad_norm": 11.174584909912863,
      "learning_rate": 4.999813941333237e-06,
      "loss": 2.0295,
      "step": 6
    },
    {
      "epoch": 0.06796116504854369,
      "grad_norm": 9.90835381349434,
      "learning_rate": 4.999746754615704e-06,
      "loss": 1.9601,
      "step": 7
    },
    {
      "epoch": 0.07766990291262135,
      "grad_norm": 2.252755813363907,
      "learning_rate": 4.9996692322280085e-06,
      "loss": 1.1375,
      "step": 8
    },
    {
      "epoch": 0.08737864077669903,
      "grad_norm": 9.417040995705188,
      "learning_rate": 4.999581374490681e-06,
      "loss": 2.4889,
      "step": 9
    },
    {
      "epoch": 0.0970873786407767,
      "grad_norm": 5.128297805941186,
      "learning_rate": 4.999483181766986e-06,
      "loss": 1.9038,
      "step": 10
    },
    {
      "epoch": 0.10679611650485436,
      "grad_norm": 2.6424619563549627,
      "learning_rate": 4.999374654462919e-06,
      "loss": 1.3898,
      "step": 11
    },
    {
      "epoch": 0.11650485436893204,
      "grad_norm": 5.089573735718794,
      "learning_rate": 4.999255793027207e-06,
      "loss": 2.1286,
      "step": 12
    },
    {
      "epoch": 0.1262135922330097,
      "grad_norm": 2.251991573691883,
      "learning_rate": 4.999126597951305e-06,
      "loss": 1.0427,
      "step": 13
    },
    {
      "epoch": 0.13592233009708737,
      "grad_norm": 2.2920823891389333,
      "learning_rate": 4.998987069769394e-06,
      "loss": 1.1071,
      "step": 14
    },
    {
      "epoch": 0.14563106796116504,
      "grad_norm": 2.8377769456674247,
      "learning_rate": 4.998837209058379e-06,
      "loss": 1.6229,
      "step": 15
    },
    {
      "epoch": 0.1553398058252427,
      "grad_norm": 7.727434690224115,
      "learning_rate": 4.998677016437888e-06,
      "loss": 1.3171,
      "step": 16
    },
    {
      "epoch": 0.1650485436893204,
      "grad_norm": 3.74374589959768,
      "learning_rate": 4.998506492570266e-06,
      "loss": 1.7463,
      "step": 17
    },
    {
      "epoch": 0.17475728155339806,
      "grad_norm": 3.837005912107839,
      "learning_rate": 4.998325638160576e-06,
      "loss": 1.3993,
      "step": 18
    },
    {
      "epoch": 0.18446601941747573,
      "grad_norm": 5.797983985086783,
      "learning_rate": 4.998134453956596e-06,
      "loss": 1.1204,
      "step": 19
    },
    {
      "epoch": 0.1941747572815534,
      "grad_norm": 2.7548345494912807,
      "learning_rate": 4.997932940748811e-06,
      "loss": 1.2081,
      "step": 20
    },
    {
      "epoch": 0.20388349514563106,
      "grad_norm": 5.045371621372198,
      "learning_rate": 4.997721099370416e-06,
      "loss": 1.2124,
      "step": 21
    },
    {
      "epoch": 0.21359223300970873,
      "grad_norm": 2.051110624952864,
      "learning_rate": 4.997498930697308e-06,
      "loss": 1.3137,
      "step": 22
    },
    {
      "epoch": 0.22330097087378642,
      "grad_norm": 2.691454094294257,
      "learning_rate": 4.997266435648086e-06,
      "loss": 1.3098,
      "step": 23
    },
    {
      "epoch": 0.23300970873786409,
      "grad_norm": 2.4865134695003017,
      "learning_rate": 4.997023615184044e-06,
      "loss": 1.6151,
      "step": 24
    },
    {
      "epoch": 0.24271844660194175,
      "grad_norm": 2.8075788095017757,
      "learning_rate": 4.996770470309167e-06,
      "loss": 1.6531,
      "step": 25
    },
    {
      "epoch": 0.2524271844660194,
      "grad_norm": 1.768040918463024,
      "learning_rate": 4.996507002070131e-06,
      "loss": 1.1902,
      "step": 26
    },
    {
      "epoch": 0.2621359223300971,
      "grad_norm": 1.955363000769332,
      "learning_rate": 4.996233211556295e-06,
      "loss": 1.2454,
      "step": 27
    },
    {
      "epoch": 0.27184466019417475,
      "grad_norm": 4.241739298469607,
      "learning_rate": 4.9959490998996974e-06,
      "loss": 0.8855,
      "step": 28
    },
    {
      "epoch": 0.2815533980582524,
      "grad_norm": 7.222330025323884,
      "learning_rate": 4.995654668275049e-06,
      "loss": 1.2271,
      "step": 29
    },
    {
      "epoch": 0.2912621359223301,
      "grad_norm": 2.3651169987619527,
      "learning_rate": 4.995349917899735e-06,
      "loss": 1.3329,
      "step": 30
    },
    {
      "epoch": 0.30097087378640774,
      "grad_norm": 1.6948963634507868,
      "learning_rate": 4.9950348500338005e-06,
      "loss": 0.9562,
      "step": 31
    },
    {
      "epoch": 0.3106796116504854,
      "grad_norm": 1.3136462562718183,
      "learning_rate": 4.994709465979954e-06,
      "loss": 0.9246,
      "step": 32
    },
    {
      "epoch": 0.32038834951456313,
      "grad_norm": 3.6835006171200306,
      "learning_rate": 4.994373767083556e-06,
      "loss": 1.0724,
      "step": 33
    },
    {
      "epoch": 0.3300970873786408,
      "grad_norm": 3.163430858725886,
      "learning_rate": 4.994027754732616e-06,
      "loss": 1.1139,
      "step": 34
    },
    {
      "epoch": 0.33980582524271846,
      "grad_norm": 1.4746343819625023,
      "learning_rate": 4.993671430357788e-06,
      "loss": 0.9994,
      "step": 35
    },
    {
      "epoch": 0.34951456310679613,
      "grad_norm": 1.9654224512814882,
      "learning_rate": 4.99330479543236e-06,
      "loss": 1.3667,
      "step": 36
    },
    {
      "epoch": 0.3592233009708738,
      "grad_norm": 1.2495201306162087,
      "learning_rate": 4.992927851472254e-06,
      "loss": 1.1252,
      "step": 37
    },
    {
      "epoch": 0.36893203883495146,
      "grad_norm": 1.5824371832462278,
      "learning_rate": 4.992540600036014e-06,
      "loss": 1.3831,
      "step": 38
    },
    {
      "epoch": 0.3786407766990291,
      "grad_norm": 1.4702657605395504,
      "learning_rate": 4.992143042724805e-06,
      "loss": 1.1461,
      "step": 39
    },
    {
      "epoch": 0.3883495145631068,
      "grad_norm": 1.4506087672413808,
      "learning_rate": 4.991735181182401e-06,
      "loss": 1.4195,
      "step": 40
    },
    {
      "epoch": 0.39805825242718446,
      "grad_norm": 1.075865370590295,
      "learning_rate": 4.991317017095182e-06,
      "loss": 0.9813,
      "step": 41
    },
    {
      "epoch": 0.4077669902912621,
      "grad_norm": 1.2265855895293323,
      "learning_rate": 4.990888552192126e-06,
      "loss": 1.0049,
      "step": 42
    },
    {
      "epoch": 0.4174757281553398,
      "grad_norm": 2.4151552794703437,
      "learning_rate": 4.9904497882448004e-06,
      "loss": 1.1099,
      "step": 43
    },
    {
      "epoch": 0.42718446601941745,
      "grad_norm": 1.6677746145821908,
      "learning_rate": 4.990000727067357e-06,
      "loss": 1.1481,
      "step": 44
    },
    {
      "epoch": 0.4368932038834951,
      "grad_norm": 1.5396824114800838,
      "learning_rate": 4.989541370516523e-06,
      "loss": 0.9952,
      "step": 45
    },
    {
      "epoch": 0.44660194174757284,
      "grad_norm": 1.3928969634273873,
      "learning_rate": 4.989071720491595e-06,
      "loss": 0.9147,
      "step": 46
    },
    {
      "epoch": 0.4563106796116505,
      "grad_norm": 0.9492551299059091,
      "learning_rate": 4.988591778934428e-06,
      "loss": 0.6869,
      "step": 47
    },
    {
      "epoch": 0.46601941747572817,
      "grad_norm": 1.2714456311245477,
      "learning_rate": 4.9881015478294294e-06,
      "loss": 0.8276,
      "step": 48
    },
    {
      "epoch": 0.47572815533980584,
      "grad_norm": 2.6314825001613658,
      "learning_rate": 4.987601029203553e-06,
      "loss": 1.0247,
      "step": 49
    },
    {
      "epoch": 0.4854368932038835,
      "grad_norm": 1.416971495316372,
      "learning_rate": 4.987090225126285e-06,
      "loss": 1.012,
      "step": 50
    },
    {
      "epoch": 0.49514563106796117,
      "grad_norm": 1.361560829125586,
      "learning_rate": 4.98656913770964e-06,
      "loss": 0.8907,
      "step": 51
    },
    {
      "epoch": 0.5048543689320388,
      "grad_norm": 1.3035984152044737,
      "learning_rate": 4.986037769108154e-06,
      "loss": 1.056,
      "step": 52
    },
    {
      "epoch": 0.5145631067961165,
      "grad_norm": 1.0150205143363307,
      "learning_rate": 4.9854961215188676e-06,
      "loss": 1.0017,
      "step": 53
    },
    {
      "epoch": 0.5242718446601942,
      "grad_norm": 0.9609277033036215,
      "learning_rate": 4.984944197181324e-06,
      "loss": 0.7601,
      "step": 54
    },
    {
      "epoch": 0.5339805825242718,
      "grad_norm": 1.1898667428428673,
      "learning_rate": 4.9843819983775575e-06,
      "loss": 0.8858,
      "step": 55
    },
    {
      "epoch": 0.5436893203883495,
      "grad_norm": 1.0139938825198491,
      "learning_rate": 4.983809527432086e-06,
      "loss": 0.8071,
      "step": 56
    },
    {
      "epoch": 0.5533980582524272,
      "grad_norm": 1.247418732762796,
      "learning_rate": 4.983226786711895e-06,
      "loss": 0.9675,
      "step": 57
    },
    {
      "epoch": 0.5631067961165048,
      "grad_norm": 0.8942637413037233,
      "learning_rate": 4.982633778626437e-06,
      "loss": 0.8187,
      "step": 58
    },
    {
      "epoch": 0.5728155339805825,
      "grad_norm": 1.3943617986028647,
      "learning_rate": 4.982030505627613e-06,
      "loss": 1.0678,
      "step": 59
    },
    {
      "epoch": 0.5825242718446602,
      "grad_norm": 0.8610131821728051,
      "learning_rate": 4.98141697020977e-06,
      "loss": 0.8306,
      "step": 60
    },
    {
      "epoch": 0.5922330097087378,
      "grad_norm": 1.0768921632881472,
      "learning_rate": 4.9807931749096836e-06,
      "loss": 0.81,
      "step": 61
    },
    {
      "epoch": 0.6019417475728155,
      "grad_norm": 0.9625381291373968,
      "learning_rate": 4.980159122306551e-06,
      "loss": 0.892,
      "step": 62
    },
    {
      "epoch": 0.6116504854368932,
      "grad_norm": 0.9617828022691955,
      "learning_rate": 4.979514815021984e-06,
      "loss": 1.0243,
      "step": 63
    },
    {
      "epoch": 0.6213592233009708,
      "grad_norm": 1.2606711145146943,
      "learning_rate": 4.978860255719989e-06,
      "loss": 0.7773,
      "step": 64
    },
    {
      "epoch": 0.6310679611650486,
      "grad_norm": 0.8531956488376088,
      "learning_rate": 4.978195447106965e-06,
      "loss": 0.9458,
      "step": 65
    },
    {
      "epoch": 0.6407766990291263,
      "grad_norm": 0.8576435230419439,
      "learning_rate": 4.9775203919316864e-06,
      "loss": 0.7812,
      "step": 66
    },
    {
      "epoch": 0.6504854368932039,
      "grad_norm": 0.9030199864232913,
      "learning_rate": 4.976835092985297e-06,
      "loss": 0.9382,
      "step": 67
    },
    {
      "epoch": 0.6601941747572816,
      "grad_norm": 0.8483972987027173,
      "learning_rate": 4.976139553101291e-06,
      "loss": 0.8671,
      "step": 68
    },
    {
      "epoch": 0.6699029126213593,
      "grad_norm": 0.8960545150162272,
      "learning_rate": 4.975433775155509e-06,
      "loss": 0.8646,
      "step": 69
    },
    {
      "epoch": 0.6796116504854369,
      "grad_norm": 1.0017538898162217,
      "learning_rate": 4.974717762066123e-06,
      "loss": 0.8805,
      "step": 70
    },
    {
      "epoch": 0.6893203883495146,
      "grad_norm": 0.9212309895628247,
      "learning_rate": 4.973991516793621e-06,
      "loss": 1.0576,
      "step": 71
    },
    {
      "epoch": 0.6990291262135923,
      "grad_norm": 0.9305642899936102,
      "learning_rate": 4.973255042340801e-06,
      "loss": 0.8486,
      "step": 72
    },
    {
      "epoch": 0.7087378640776699,
      "grad_norm": 1.0773851273416222,
      "learning_rate": 4.972508341752754e-06,
      "loss": 1.0583,
      "step": 73
    },
    {
      "epoch": 0.7184466019417476,
      "grad_norm": 0.892091799769842,
      "learning_rate": 4.9717514181168534e-06,
      "loss": 0.7527,
      "step": 74
    },
    {
      "epoch": 0.7281553398058253,
      "grad_norm": 0.8481741026230679,
      "learning_rate": 4.970984274562741e-06,
      "loss": 0.7125,
      "step": 75
    },
    {
      "epoch": 0.7378640776699029,
      "grad_norm": 0.9312066959231862,
      "learning_rate": 4.970206914262315e-06,
      "loss": 0.8687,
      "step": 76
    },
    {
      "epoch": 0.7475728155339806,
      "grad_norm": 0.9215979104056321,
      "learning_rate": 4.969419340429717e-06,
      "loss": 0.7691,
      "step": 77
    },
    {
      "epoch": 0.7572815533980582,
      "grad_norm": 1.227175476240732,
      "learning_rate": 4.968621556321319e-06,
      "loss": 0.92,
      "step": 78
    },
    {
      "epoch": 0.7669902912621359,
      "grad_norm": 0.9330601553607981,
      "learning_rate": 4.967813565235708e-06,
      "loss": 0.8216,
      "step": 79
    },
    {
      "epoch": 0.7766990291262136,
      "grad_norm": 0.8252798793274818,
      "learning_rate": 4.966995370513675e-06,
      "loss": 0.7061,
      "step": 80
    },
    {
      "epoch": 0.7864077669902912,
      "grad_norm": 0.9922301648204198,
      "learning_rate": 4.966166975538197e-06,
      "loss": 1.0408,
      "step": 81
    },
    {
      "epoch": 0.7961165048543689,
      "grad_norm": 0.7891923797199208,
      "learning_rate": 4.965328383734429e-06,
      "loss": 0.7595,
      "step": 82
    },
    {
      "epoch": 0.8058252427184466,
      "grad_norm": 1.0034345889698408,
      "learning_rate": 4.964479598569686e-06,
      "loss": 1.0233,
      "step": 83
    },
    {
      "epoch": 0.8155339805825242,
      "grad_norm": 0.9634692778615994,
      "learning_rate": 4.963620623553428e-06,
      "loss": 0.929,
      "step": 84
    },
    {
      "epoch": 0.8252427184466019,
      "grad_norm": 0.8794808183269647,
      "learning_rate": 4.962751462237248e-06,
      "loss": 0.7247,
      "step": 85
    },
    {
      "epoch": 0.8349514563106796,
      "grad_norm": 0.8991558039524955,
      "learning_rate": 4.9618721182148564e-06,
      "loss": 0.6248,
      "step": 86
    },
    {
      "epoch": 0.8446601941747572,
      "grad_norm": 0.9291777621093169,
      "learning_rate": 4.960982595122064e-06,
      "loss": 0.7035,
      "step": 87
    },
    {
      "epoch": 0.8543689320388349,
      "grad_norm": 0.85791569259972,
      "learning_rate": 4.960082896636773e-06,
      "loss": 0.9148,
      "step": 88
    },
    {
      "epoch": 0.8640776699029126,
      "grad_norm": 0.9063445307146435,
      "learning_rate": 4.959173026478952e-06,
      "loss": 0.7805,
      "step": 89
    },
    {
      "epoch": 0.8737864077669902,
      "grad_norm": 0.7587205124423355,
      "learning_rate": 4.958252988410631e-06,
      "loss": 0.6329,
      "step": 90
    },
    {
      "epoch": 0.883495145631068,
      "grad_norm": 1.0713074090013364,
      "learning_rate": 4.9573227862358794e-06,
      "loss": 0.7955,
      "step": 91
    },
    {
      "epoch": 0.8932038834951457,
      "grad_norm": 0.8477622625159322,
      "learning_rate": 4.956382423800791e-06,
      "loss": 0.8325,
      "step": 92
    },
    {
      "epoch": 0.9029126213592233,
      "grad_norm": 0.9269059925838263,
      "learning_rate": 4.955431904993471e-06,
      "loss": 0.8194,
      "step": 93
    },
    {
      "epoch": 0.912621359223301,
      "grad_norm": 1.0475028792580197,
      "learning_rate": 4.954471233744015e-06,
      "loss": 0.5835,
      "step": 94
    },
    {
      "epoch": 0.9223300970873787,
      "grad_norm": 0.8985260034505762,
      "learning_rate": 4.9535004140245005e-06,
      "loss": 0.8063,
      "step": 95
    },
    {
      "epoch": 0.9320388349514563,
      "grad_norm": 1.0048572531965805,
      "learning_rate": 4.952519449848962e-06,
      "loss": 0.8127,
      "step": 96
    },
    {
      "epoch": 0.941747572815534,
      "grad_norm": 0.9062750156316196,
      "learning_rate": 4.951528345273379e-06,
      "loss": 0.7181,
      "step": 97
    },
    {
      "epoch": 0.9514563106796117,
      "grad_norm": 0.8778949835317967,
      "learning_rate": 4.950527104395659e-06,
      "loss": 0.8103,
      "step": 98
    },
    {
      "epoch": 0.9611650485436893,
      "grad_norm": 0.9728187484090823,
      "learning_rate": 4.9495157313556185e-06,
      "loss": 0.6329,
      "step": 99
    },
    {
      "epoch": 0.970873786407767,
      "grad_norm": 0.7352370132544686,
      "learning_rate": 4.94849423033497e-06,
      "loss": 0.5231,
      "step": 100
    },
    {
      "epoch": 0.9805825242718447,
      "grad_norm": 0.8479762742075609,
      "learning_rate": 4.9474626055573e-06,
      "loss": 0.9551,
      "step": 101
    },
    {
      "epoch": 0.9902912621359223,
      "grad_norm": 0.7351142523348447,
      "learning_rate": 4.946420861288051e-06,
      "loss": 0.738,
      "step": 102
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.9009004817884025,
      "learning_rate": 4.9453690018345144e-06,
      "loss": 0.7789,
      "step": 103
    },
    {
      "epoch": 1.0097087378640777,
      "grad_norm": 0.8057060436062744,
      "learning_rate": 4.944307031545797e-06,
      "loss": 0.6629,
      "step": 104
    },
    {
      "epoch": 1.0194174757281553,
      "grad_norm": 0.8977736266341005,
      "learning_rate": 4.943234954812812e-06,
      "loss": 0.9053,
      "step": 105
    },
    {
      "epoch": 1.029126213592233,
      "grad_norm": 0.7717440949849311,
      "learning_rate": 4.942152776068264e-06,
      "loss": 0.6404,
      "step": 106
    },
    {
      "epoch": 1.0388349514563107,
      "grad_norm": 0.890503065931168,
      "learning_rate": 4.941060499786622e-06,
      "loss": 0.9117,
      "step": 107
    },
    {
      "epoch": 1.0485436893203883,
      "grad_norm": 0.8814259279520221,
      "learning_rate": 4.939958130484106e-06,
      "loss": 0.7055,
      "step": 108
    },
    {
      "epoch": 1.058252427184466,
      "grad_norm": 0.671081647050562,
      "learning_rate": 4.938845672718668e-06,
      "loss": 0.6671,
      "step": 109
    },
    {
      "epoch": 1.0679611650485437,
      "grad_norm": 0.8278909841687843,
      "learning_rate": 4.937723131089974e-06,
      "loss": 0.7318,
      "step": 110
    },
    {
      "epoch": 1.0776699029126213,
      "grad_norm": 0.9269878913315724,
      "learning_rate": 4.93659051023938e-06,
      "loss": 0.7646,
      "step": 111
    },
    {
      "epoch": 1.087378640776699,
      "grad_norm": 0.665235455755867,
      "learning_rate": 4.93544781484992e-06,
      "loss": 0.4998,
      "step": 112
    },
    {
      "epoch": 1.0970873786407767,
      "grad_norm": 0.8487063874467249,
      "learning_rate": 4.9342950496462815e-06,
      "loss": 0.7435,
      "step": 113
    },
    {
      "epoch": 1.1067961165048543,
      "grad_norm": 0.8430924624096062,
      "learning_rate": 4.933132219394786e-06,
      "loss": 0.6992,
      "step": 114
    },
    {
      "epoch": 1.116504854368932,
      "grad_norm": 0.8993084000328078,
      "learning_rate": 4.931959328903376e-06,
      "loss": 0.8961,
      "step": 115
    },
    {
      "epoch": 1.1262135922330097,
      "grad_norm": 0.7373642852662377,
      "learning_rate": 4.930776383021584e-06,
      "loss": 0.6722,
      "step": 116
    },
    {
      "epoch": 1.1359223300970873,
      "grad_norm": 0.8356211654383496,
      "learning_rate": 4.92958338664052e-06,
      "loss": 0.528,
      "step": 117
    },
    {
      "epoch": 1.145631067961165,
      "grad_norm": 0.8788366905889159,
      "learning_rate": 4.928380344692853e-06,
      "loss": 0.6369,
      "step": 118
    },
    {
      "epoch": 1.1553398058252426,
      "grad_norm": 0.70400203786099,
      "learning_rate": 4.927167262152784e-06,
      "loss": 0.6961,
      "step": 119
    },
    {
      "epoch": 1.1650485436893203,
      "grad_norm": 0.7933675823948615,
      "learning_rate": 4.925944144036027e-06,
      "loss": 0.7316,
      "step": 120
    },
    {
      "epoch": 1.174757281553398,
      "grad_norm": 0.7413576987999055,
      "learning_rate": 4.924710995399796e-06,
      "loss": 0.6764,
      "step": 121
    },
    {
      "epoch": 1.1844660194174756,
      "grad_norm": 0.8385780569550869,
      "learning_rate": 4.923467821342773e-06,
      "loss": 0.7602,
      "step": 122
    },
    {
      "epoch": 1.1941747572815533,
      "grad_norm": 0.8531356062842188,
      "learning_rate": 4.922214627005092e-06,
      "loss": 0.749,
      "step": 123
    },
    {
      "epoch": 1.203883495145631,
      "grad_norm": 0.8216291410900652,
      "learning_rate": 4.920951417568323e-06,
      "loss": 0.8079,
      "step": 124
    },
    {
      "epoch": 1.2135922330097086,
      "grad_norm": 0.8320005263963047,
      "learning_rate": 4.919678198255438e-06,
      "loss": 0.6805,
      "step": 125
    },
    {
      "epoch": 1.2233009708737863,
      "grad_norm": 0.8268979479586152,
      "learning_rate": 4.918394974330801e-06,
      "loss": 0.7583,
      "step": 126
    },
    {
      "epoch": 1.233009708737864,
      "grad_norm": 0.7645562011859167,
      "learning_rate": 4.917101751100142e-06,
      "loss": 0.7109,
      "step": 127
    },
    {
      "epoch": 1.2427184466019416,
      "grad_norm": 0.8421630378850258,
      "learning_rate": 4.915798533910534e-06,
      "loss": 0.714,
      "step": 128
    },
    {
      "epoch": 1.2524271844660193,
      "grad_norm": 0.7552330522382976,
      "learning_rate": 4.9144853281503715e-06,
      "loss": 0.5679,
      "step": 129
    },
    {
      "epoch": 1.262135922330097,
      "grad_norm": 0.852273134838057,
      "learning_rate": 4.91316213924935e-06,
      "loss": 0.7952,
      "step": 130
    },
    {
      "epoch": 1.2718446601941746,
      "grad_norm": 0.8114602189230878,
      "learning_rate": 4.911828972678441e-06,
      "loss": 0.6176,
      "step": 131
    },
    {
      "epoch": 1.2815533980582523,
      "grad_norm": 0.8164393262307038,
      "learning_rate": 4.91048583394987e-06,
      "loss": 0.6477,
      "step": 132
    },
    {
      "epoch": 1.29126213592233,
      "grad_norm": 0.8581250231339308,
      "learning_rate": 4.909132728617095e-06,
      "loss": 0.9476,
      "step": 133
    },
    {
      "epoch": 1.3009708737864076,
      "grad_norm": 0.766359434418812,
      "learning_rate": 4.907769662274785e-06,
      "loss": 0.6528,
      "step": 134
    },
    {
      "epoch": 1.3106796116504853,
      "grad_norm": 0.6976692264719603,
      "learning_rate": 4.90639664055879e-06,
      "loss": 0.5637,
      "step": 135
    },
    {
      "epoch": 1.3203883495145632,
      "grad_norm": 0.7314627468283406,
      "learning_rate": 4.905013669146127e-06,
      "loss": 0.6096,
      "step": 136
    },
    {
      "epoch": 1.3300970873786409,
      "grad_norm": 0.7773436570942428,
      "learning_rate": 4.903620753754949e-06,
      "loss": 0.7461,
      "step": 137
    },
    {
      "epoch": 1.3398058252427185,
      "grad_norm": 0.8921198866699622,
      "learning_rate": 4.902217900144524e-06,
      "loss": 0.9358,
      "step": 138
    },
    {
      "epoch": 1.3495145631067962,
      "grad_norm": 0.7375485723614339,
      "learning_rate": 4.900805114115214e-06,
      "loss": 0.8942,
      "step": 139
    },
    {
      "epoch": 1.3592233009708738,
      "grad_norm": 0.7942794591041569,
      "learning_rate": 4.899382401508446e-06,
      "loss": 0.5492,
      "step": 140
    },
    {
      "epoch": 1.3689320388349515,
      "grad_norm": 0.8078627057423822,
      "learning_rate": 4.8979497682066916e-06,
      "loss": 0.7808,
      "step": 141
    },
    {
      "epoch": 1.3786407766990292,
      "grad_norm": 0.6837805173092772,
      "learning_rate": 4.89650722013344e-06,
      "loss": 0.5391,
      "step": 142
    },
    {
      "epoch": 1.3883495145631068,
      "grad_norm": 0.9832481462127313,
      "learning_rate": 4.895054763253177e-06,
      "loss": 0.7406,
      "step": 143
    },
    {
      "epoch": 1.3980582524271845,
      "grad_norm": 0.769076719995811,
      "learning_rate": 4.8935924035713564e-06,
      "loss": 0.5929,
      "step": 144
    },
    {
      "epoch": 1.4077669902912622,
      "grad_norm": 0.8430007967016218,
      "learning_rate": 4.892120147134378e-06,
      "loss": 0.699,
      "step": 145
    },
    {
      "epoch": 1.4174757281553398,
      "grad_norm": 0.9122194953590461,
      "learning_rate": 4.8906380000295615e-06,
      "loss": 0.6895,
      "step": 146
    },
    {
      "epoch": 1.4271844660194175,
      "grad_norm": 0.7898786206966036,
      "learning_rate": 4.889145968385121e-06,
      "loss": 0.6528,
      "step": 147
    },
    {
      "epoch": 1.4368932038834952,
      "grad_norm": 0.7666616277046007,
      "learning_rate": 4.887644058370139e-06,
      "loss": 0.531,
      "step": 148
    },
    {
      "epoch": 1.4466019417475728,
      "grad_norm": 0.7653718997690864,
      "learning_rate": 4.886132276194544e-06,
      "loss": 0.5768,
      "step": 149
    },
    {
      "epoch": 1.4563106796116505,
      "grad_norm": 0.90831243558374,
      "learning_rate": 4.884610628109082e-06,
      "loss": 0.5652,
      "step": 150
    },
    {
      "epoch": 1.4660194174757282,
      "grad_norm": 0.7442744457963555,
      "learning_rate": 4.883079120405292e-06,
      "loss": 0.6688,
      "step": 151
    },
    {
      "epoch": 1.4757281553398058,
      "grad_norm": 0.7344457110605118,
      "learning_rate": 4.881537759415478e-06,
      "loss": 0.5314,
      "step": 152
    },
    {
      "epoch": 1.4854368932038835,
      "grad_norm": 1.1695813268075903,
      "learning_rate": 4.879986551512684e-06,
      "loss": 0.7273,
      "step": 153
    },
    {
      "epoch": 1.4951456310679612,
      "grad_norm": 0.6738881315688398,
      "learning_rate": 4.878425503110672e-06,
      "loss": 0.5607,
      "step": 154
    },
    {
      "epoch": 1.5048543689320388,
      "grad_norm": 0.8581543167197856,
      "learning_rate": 4.876854620663887e-06,
      "loss": 0.6943,
      "step": 155
    },
    {
      "epoch": 1.5145631067961165,
      "grad_norm": 0.8060496260215426,
      "learning_rate": 4.875273910667434e-06,
      "loss": 0.6212,
      "step": 156
    },
    {
      "epoch": 1.5242718446601942,
      "grad_norm": 0.8453811238639173,
      "learning_rate": 4.873683379657057e-06,
      "loss": 0.5456,
      "step": 157
    },
    {
      "epoch": 1.5339805825242718,
      "grad_norm": 0.8844708129131296,
      "learning_rate": 4.8720830342091015e-06,
      "loss": 0.9448,
      "step": 158
    },
    {
      "epoch": 1.5436893203883495,
      "grad_norm": 0.861907269409231,
      "learning_rate": 4.870472880940496e-06,
      "loss": 0.7925,
      "step": 159
    },
    {
      "epoch": 1.5533980582524272,
      "grad_norm": 1.0829307275890252,
      "learning_rate": 4.868852926508721e-06,
      "loss": 0.8343,
      "step": 160
    },
    {
      "epoch": 1.5631067961165048,
      "grad_norm": 0.875424595240487,
      "learning_rate": 4.867223177611779e-06,
      "loss": 0.7158,
      "step": 161
    },
    {
      "epoch": 1.5728155339805825,
      "grad_norm": 0.6659133214470974,
      "learning_rate": 4.865583640988173e-06,
      "loss": 0.5684,
      "step": 162
    },
    {
      "epoch": 1.5825242718446602,
      "grad_norm": 0.772588673794732,
      "learning_rate": 4.863934323416871e-06,
      "loss": 0.6486,
      "step": 163
    },
    {
      "epoch": 1.5922330097087378,
      "grad_norm": 0.741499702229279,
      "learning_rate": 4.862275231717288e-06,
      "loss": 0.5758,
      "step": 164
    },
    {
      "epoch": 1.6019417475728155,
      "grad_norm": 0.920336013887185,
      "learning_rate": 4.860606372749247e-06,
      "loss": 0.5468,
      "step": 165
    },
    {
      "epoch": 1.6116504854368932,
      "grad_norm": 0.7093065195416415,
      "learning_rate": 4.858927753412958e-06,
      "loss": 0.6127,
      "step": 166
    },
    {
      "epoch": 1.6213592233009708,
      "grad_norm": 0.8110174812695655,
      "learning_rate": 4.857239380648985e-06,
      "loss": 0.6017,
      "step": 167
    },
    {
      "epoch": 1.6310679611650487,
      "grad_norm": 0.7375215564893128,
      "learning_rate": 4.855541261438223e-06,
      "loss": 0.6753,
      "step": 168
    },
    {
      "epoch": 1.6407766990291264,
      "grad_norm": 0.7500973220432647,
      "learning_rate": 4.8538334028018605e-06,
      "loss": 0.8343,
      "step": 169
    },
    {
      "epoch": 1.650485436893204,
      "grad_norm": 0.6948713105727569,
      "learning_rate": 4.8521158118013605e-06,
      "loss": 0.5493,
      "step": 170
    },
    {
      "epoch": 1.6601941747572817,
      "grad_norm": 0.773755912005905,
      "learning_rate": 4.850388495538423e-06,
      "loss": 0.6271,
      "step": 171
    },
    {
      "epoch": 1.6699029126213594,
      "grad_norm": 0.8759327452228973,
      "learning_rate": 4.84865146115496e-06,
      "loss": 0.5641,
      "step": 172
    },
    {
      "epoch": 1.679611650485437,
      "grad_norm": 0.8673872477886911,
      "learning_rate": 4.846904715833066e-06,
      "loss": 0.5295,
      "step": 173
    },
    {
      "epoch": 1.6893203883495147,
      "grad_norm": 0.7435589108212891,
      "learning_rate": 4.8451482667949836e-06,
      "loss": 0.7122,
      "step": 174
    },
    {
      "epoch": 1.6990291262135924,
      "grad_norm": 0.7510040594244859,
      "learning_rate": 4.843382121303082e-06,
      "loss": 0.6579,
      "step": 175
    },
    {
      "epoch": 1.70873786407767,
      "grad_norm": 0.7254228155893363,
      "learning_rate": 4.841606286659819e-06,
      "loss": 0.5591,
      "step": 176
    },
    {
      "epoch": 1.7184466019417477,
      "grad_norm": 0.7786787819553973,
      "learning_rate": 4.839820770207714e-06,
      "loss": 0.5417,
      "step": 177
    },
    {
      "epoch": 1.7281553398058254,
      "grad_norm": 0.7424090664501856,
      "learning_rate": 4.8380255793293195e-06,
      "loss": 0.5679,
      "step": 178
    },
    {
      "epoch": 1.737864077669903,
      "grad_norm": 0.880425961769788,
      "learning_rate": 4.8362207214471864e-06,
      "loss": 0.5504,
      "step": 179
    },
    {
      "epoch": 1.7475728155339807,
      "grad_norm": 0.7692052821337514,
      "learning_rate": 4.83440620402384e-06,
      "loss": 0.6217,
      "step": 180
    },
    {
      "epoch": 1.7572815533980584,
      "grad_norm": 0.7571562044181578,
      "learning_rate": 4.832582034561738e-06,
      "loss": 0.641,
      "step": 181
    },
    {
      "epoch": 1.766990291262136,
      "grad_norm": 0.8286514373124609,
      "learning_rate": 4.830748220603251e-06,
      "loss": 0.7462,
      "step": 182
    },
    {
      "epoch": 1.7766990291262137,
      "grad_norm": 0.8663863630421635,
      "learning_rate": 4.828904769730628e-06,
      "loss": 0.7598,
      "step": 183
    },
    {
      "epoch": 1.7864077669902914,
      "grad_norm": 0.8392430546644409,
      "learning_rate": 4.827051689565958e-06,
      "loss": 0.6278,
      "step": 184
    },
    {
      "epoch": 1.796116504854369,
      "grad_norm": 0.7431256674232911,
      "learning_rate": 4.825188987771149e-06,
      "loss": 0.7471,
      "step": 185
    },
    {
      "epoch": 1.8058252427184467,
      "grad_norm": 0.775602420628486,
      "learning_rate": 4.82331667204789e-06,
      "loss": 0.4658,
      "step": 186
    },
    {
      "epoch": 1.8155339805825244,
      "grad_norm": 0.8220707289832607,
      "learning_rate": 4.821434750137619e-06,
      "loss": 0.6212,
      "step": 187
    },
    {
      "epoch": 1.825242718446602,
      "grad_norm": 0.8721039703571108,
      "learning_rate": 4.819543229821494e-06,
      "loss": 0.7135,
      "step": 188
    },
    {
      "epoch": 1.8349514563106797,
      "grad_norm": 0.7196046947557964,
      "learning_rate": 4.8176421189203605e-06,
      "loss": 0.5464,
      "step": 189
    },
    {
      "epoch": 1.8446601941747574,
      "grad_norm": 0.7844213665577071,
      "learning_rate": 4.815731425294716e-06,
      "loss": 0.6536,
      "step": 190
    },
    {
      "epoch": 1.854368932038835,
      "grad_norm": 0.7921297235454372,
      "learning_rate": 4.813811156844681e-06,
      "loss": 0.7183,
      "step": 191
    },
    {
      "epoch": 1.8640776699029127,
      "grad_norm": 0.863196612505686,
      "learning_rate": 4.811881321509964e-06,
      "loss": 0.6976,
      "step": 192
    },
    {
      "epoch": 1.8737864077669903,
      "grad_norm": 0.7755881818234687,
      "learning_rate": 4.809941927269829e-06,
      "loss": 0.8491,
      "step": 193
    },
    {
      "epoch": 1.883495145631068,
      "grad_norm": 0.8186186296199729,
      "learning_rate": 4.807992982143064e-06,
      "loss": 0.6343,
      "step": 194
    },
    {
      "epoch": 1.8932038834951457,
      "grad_norm": 0.913496737962807,
      "learning_rate": 4.806034494187949e-06,
      "loss": 0.8436,
      "step": 195
    },
    {
      "epoch": 1.9029126213592233,
      "grad_norm": 0.7590143443155779,
      "learning_rate": 4.804066471502216e-06,
      "loss": 0.603,
      "step": 196
    },
    {
      "epoch": 1.912621359223301,
      "grad_norm": 0.766119646590294,
      "learning_rate": 4.802088922223024e-06,
      "loss": 0.5167,
      "step": 197
    },
    {
      "epoch": 1.9223300970873787,
      "grad_norm": 0.7274017571399753,
      "learning_rate": 4.80010185452692e-06,
      "loss": 0.6725,
      "step": 198
    },
    {
      "epoch": 1.9320388349514563,
      "grad_norm": 0.73780689049117,
      "learning_rate": 4.798105276629806e-06,
      "loss": 0.521,
      "step": 199
    },
    {
      "epoch": 1.941747572815534,
      "grad_norm": 0.7791155049733209,
      "learning_rate": 4.796099196786908e-06,
      "loss": 0.583,
      "step": 200
    },
    {
      "epoch": 1.9514563106796117,
      "grad_norm": 0.8004953963543238,
      "learning_rate": 4.794083623292737e-06,
      "loss": 0.7842,
      "step": 201
    },
    {
      "epoch": 1.9611650485436893,
      "grad_norm": 0.7807030577892377,
      "learning_rate": 4.792058564481058e-06,
      "loss": 0.5534,
      "step": 202
    },
    {
      "epoch": 1.970873786407767,
      "grad_norm": 0.7092633632036252,
      "learning_rate": 4.7900240287248554e-06,
      "loss": 0.7083,
      "step": 203
    },
    {
      "epoch": 1.9805825242718447,
      "grad_norm": 0.8875925228430702,
      "learning_rate": 4.7879800244362975e-06,
      "loss": 0.7782,
      "step": 204
    },
    {
      "epoch": 1.9902912621359223,
      "grad_norm": 0.8909791320483021,
      "learning_rate": 4.785926560066703e-06,
      "loss": 0.7681,
      "step": 205
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.7275875515555469,
      "learning_rate": 4.783863644106502e-06,
      "loss": 0.649,
      "step": 206
    },
    {
      "epoch": 2.0097087378640777,
      "grad_norm": 0.802116606621558,
      "learning_rate": 4.781791285085209e-06,
      "loss": 0.5568,
      "step": 207
    },
    {
      "epoch": 2.0194174757281553,
      "grad_norm": 0.7406594271373391,
      "learning_rate": 4.779709491571378e-06,
      "loss": 0.5144,
      "step": 208
    },
    {
      "epoch": 2.029126213592233,
      "grad_norm": 0.6857989185878482,
      "learning_rate": 4.777618272172573e-06,
      "loss": 0.6449,
      "step": 209
    },
    {
      "epoch": 2.0388349514563107,
      "grad_norm": 0.734748210095666,
      "learning_rate": 4.775517635535332e-06,
      "loss": 0.7377,
      "step": 210
    },
    {
      "epoch": 2.0485436893203883,
      "grad_norm": 0.7708218856857236,
      "learning_rate": 4.77340759034513e-06,
      "loss": 0.6855,
      "step": 211
    },
    {
      "epoch": 2.058252427184466,
      "grad_norm": 0.7279897754942517,
      "learning_rate": 4.771288145326343e-06,
      "loss": 0.6684,
      "step": 212
    },
    {
      "epoch": 2.0679611650485437,
      "grad_norm": 0.674474791686522,
      "learning_rate": 4.769159309242213e-06,
      "loss": 0.438,
      "step": 213
    },
    {
      "epoch": 2.0776699029126213,
      "grad_norm": 0.792505397680811,
      "learning_rate": 4.767021090894809e-06,
      "loss": 0.6831,
      "step": 214
    },
    {
      "epoch": 2.087378640776699,
      "grad_norm": 0.744545194492276,
      "learning_rate": 4.764873499124997e-06,
      "loss": 0.6976,
      "step": 215
    },
    {
      "epoch": 2.0970873786407767,
      "grad_norm": 0.7354909274219694,
      "learning_rate": 4.762716542812395e-06,
      "loss": 0.5495,
      "step": 216
    },
    {
      "epoch": 2.1067961165048543,
      "grad_norm": 0.7133200950602842,
      "learning_rate": 4.7605502308753415e-06,
      "loss": 0.5687,
      "step": 217
    },
    {
      "epoch": 2.116504854368932,
      "grad_norm": 0.7982098464573162,
      "learning_rate": 4.758374572270859e-06,
      "loss": 0.6886,
      "step": 218
    },
    {
      "epoch": 2.1262135922330097,
      "grad_norm": 0.6913449166288441,
      "learning_rate": 4.756189575994614e-06,
      "loss": 0.3812,
      "step": 219
    },
    {
      "epoch": 2.1359223300970873,
      "grad_norm": 0.7192629747712952,
      "learning_rate": 4.753995251080884e-06,
      "loss": 0.575,
      "step": 220
    },
    {
      "epoch": 2.145631067961165,
      "grad_norm": 0.6248283732386594,
      "learning_rate": 4.7517916066025126e-06,
      "loss": 0.463,
      "step": 221
    },
    {
      "epoch": 2.1553398058252426,
      "grad_norm": 0.8287445715621566,
      "learning_rate": 4.7495786516708806e-06,
      "loss": 0.7071,
      "step": 222
    },
    {
      "epoch": 2.1650485436893203,
      "grad_norm": 0.8318311223138934,
      "learning_rate": 4.747356395435865e-06,
      "loss": 0.4322,
      "step": 223
    },
    {
      "epoch": 2.174757281553398,
      "grad_norm": 0.8533840933951986,
      "learning_rate": 4.745124847085799e-06,
      "loss": 0.568,
      "step": 224
    },
    {
      "epoch": 2.1844660194174756,
      "grad_norm": 0.7901361363585824,
      "learning_rate": 4.742884015847436e-06,
      "loss": 0.5393,
      "step": 225
    },
    {
      "epoch": 2.1941747572815533,
      "grad_norm": 0.8111627610336898,
      "learning_rate": 4.740633910985911e-06,
      "loss": 0.5969,
      "step": 226
    },
    {
      "epoch": 2.203883495145631,
      "grad_norm": 0.7671540334640228,
      "learning_rate": 4.738374541804704e-06,
      "loss": 0.4576,
      "step": 227
    },
    {
      "epoch": 2.2135922330097086,
      "grad_norm": 0.7242562019069873,
      "learning_rate": 4.7361059176456e-06,
      "loss": 0.5186,
      "step": 228
    },
    {
      "epoch": 2.2233009708737863,
      "grad_norm": 0.7763865691421442,
      "learning_rate": 4.733828047888647e-06,
      "loss": 0.6682,
      "step": 229
    },
    {
      "epoch": 2.233009708737864,
      "grad_norm": 0.6906133600693856,
      "learning_rate": 4.731540941952126e-06,
      "loss": 0.4051,
      "step": 230
    },
    {
      "epoch": 2.2427184466019416,
      "grad_norm": 0.7481490824065317,
      "learning_rate": 4.7292446092925016e-06,
      "loss": 0.4902,
      "step": 231
    },
    {
      "epoch": 2.2524271844660193,
      "grad_norm": 0.7044106920537051,
      "learning_rate": 4.726939059404392e-06,
      "loss": 0.4499,
      "step": 232
    },
    {
      "epoch": 2.262135922330097,
      "grad_norm": 0.6853728707124397,
      "learning_rate": 4.724624301820524e-06,
      "loss": 0.4757,
      "step": 233
    },
    {
      "epoch": 2.2718446601941746,
      "grad_norm": 0.9364489830239873,
      "learning_rate": 4.722300346111695e-06,
      "loss": 0.6159,
      "step": 234
    },
    {
      "epoch": 2.2815533980582523,
      "grad_norm": 0.8138300584150231,
      "learning_rate": 4.719967201886734e-06,
      "loss": 0.5365,
      "step": 235
    },
    {
      "epoch": 2.29126213592233,
      "grad_norm": 0.7475372553230331,
      "learning_rate": 4.717624878792461e-06,
      "loss": 0.7347,
      "step": 236
    },
    {
      "epoch": 2.3009708737864076,
      "grad_norm": 0.9795714505586022,
      "learning_rate": 4.715273386513651e-06,
      "loss": 0.7175,
      "step": 237
    },
    {
      "epoch": 2.3106796116504853,
      "grad_norm": 0.7332975150286841,
      "learning_rate": 4.712912734772988e-06,
      "loss": 0.5204,
      "step": 238
    },
    {
      "epoch": 2.320388349514563,
      "grad_norm": 0.6488936041709285,
      "learning_rate": 4.710542933331025e-06,
      "loss": 0.5825,
      "step": 239
    },
    {
      "epoch": 2.3300970873786406,
      "grad_norm": 0.7371958257036496,
      "learning_rate": 4.708163991986152e-06,
      "loss": 0.6679,
      "step": 240
    },
    {
      "epoch": 2.3398058252427183,
      "grad_norm": 0.8214266450639852,
      "learning_rate": 4.705775920574546e-06,
      "loss": 0.6434,
      "step": 241
    },
    {
      "epoch": 2.349514563106796,
      "grad_norm": 0.7702926046174587,
      "learning_rate": 4.703378728970134e-06,
      "loss": 0.4755,
      "step": 242
    },
    {
      "epoch": 2.3592233009708736,
      "grad_norm": 0.6996698003201337,
      "learning_rate": 4.700972427084551e-06,
      "loss": 0.592,
      "step": 243
    },
    {
      "epoch": 2.3689320388349513,
      "grad_norm": 0.7560377791738715,
      "learning_rate": 4.698557024867105e-06,
      "loss": 0.4565,
      "step": 244
    },
    {
      "epoch": 2.378640776699029,
      "grad_norm": 0.7633818347510686,
      "learning_rate": 4.696132532304727e-06,
      "loss": 0.6659,
      "step": 245
    },
    {
      "epoch": 2.3883495145631066,
      "grad_norm": 0.7234976470337168,
      "learning_rate": 4.693698959421935e-06,
      "loss": 0.5678,
      "step": 246
    },
    {
      "epoch": 2.3980582524271843,
      "grad_norm": 0.750666487544434,
      "learning_rate": 4.691256316280789e-06,
      "loss": 0.8332,
      "step": 247
    },
    {
      "epoch": 2.407766990291262,
      "grad_norm": 0.7493376372555444,
      "learning_rate": 4.688804612980855e-06,
      "loss": 0.4837,
      "step": 248
    },
    {
      "epoch": 2.4174757281553396,
      "grad_norm": 0.6751745421938132,
      "learning_rate": 4.686343859659158e-06,
      "loss": 0.6951,
      "step": 249
    },
    {
      "epoch": 2.4271844660194173,
      "grad_norm": 0.734057362640619,
      "learning_rate": 4.683874066490143e-06,
      "loss": 0.5507,
      "step": 250
    },
    {
      "epoch": 2.436893203883495,
      "grad_norm": 0.7227243519681933,
      "learning_rate": 4.681395243685631e-06,
      "loss": 0.56,
      "step": 251
    },
    {
      "epoch": 2.4466019417475726,
      "grad_norm": 0.7751770457742055,
      "learning_rate": 4.67890740149478e-06,
      "loss": 0.518,
      "step": 252
    },
    {
      "epoch": 2.4563106796116507,
      "grad_norm": 0.8010398356492873,
      "learning_rate": 4.676410550204036e-06,
      "loss": 0.6707,
      "step": 253
    },
    {
      "epoch": 2.466019417475728,
      "grad_norm": 0.6322959904355947,
      "learning_rate": 4.673904700137098e-06,
      "loss": 0.3992,
      "step": 254
    },
    {
      "epoch": 2.475728155339806,
      "grad_norm": 0.6951997535390679,
      "learning_rate": 4.671389861654873e-06,
      "loss": 0.4745,
      "step": 255
    },
    {
      "epoch": 2.4854368932038833,
      "grad_norm": 0.7106909584689424,
      "learning_rate": 4.668866045155428e-06,
      "loss": 0.596,
      "step": 256
    },
    {
      "epoch": 2.4951456310679614,
      "grad_norm": 0.8847918134161639,
      "learning_rate": 4.666333261073956e-06,
      "loss": 0.6131,
      "step": 257
    },
    {
      "epoch": 2.5048543689320386,
      "grad_norm": 0.7875255376725511,
      "learning_rate": 4.6637915198827265e-06,
      "loss": 0.6808,
      "step": 258
    },
    {
      "epoch": 2.5145631067961167,
      "grad_norm": 0.74457845575171,
      "learning_rate": 4.661240832091042e-06,
      "loss": 0.534,
      "step": 259
    },
    {
      "epoch": 2.524271844660194,
      "grad_norm": 0.850799682933559,
      "learning_rate": 4.658681208245198e-06,
      "loss": 0.6302,
      "step": 260
    },
    {
      "epoch": 2.533980582524272,
      "grad_norm": 0.7621063847057238,
      "learning_rate": 4.65611265892844e-06,
      "loss": 0.5924,
      "step": 261
    },
    {
      "epoch": 2.5436893203883493,
      "grad_norm": 0.8162383044494845,
      "learning_rate": 4.653535194760912e-06,
      "loss": 0.5497,
      "step": 262
    },
    {
      "epoch": 2.5533980582524274,
      "grad_norm": 0.7920894927810362,
      "learning_rate": 4.650948826399624e-06,
      "loss": 0.392,
      "step": 263
    },
    {
      "epoch": 2.5631067961165046,
      "grad_norm": 0.7383821620603919,
      "learning_rate": 4.648353564538397e-06,
      "loss": 0.5049,
      "step": 264
    },
    {
      "epoch": 2.5728155339805827,
      "grad_norm": 0.7650056257507512,
      "learning_rate": 4.645749419907829e-06,
      "loss": 0.4973,
      "step": 265
    },
    {
      "epoch": 2.58252427184466,
      "grad_norm": 0.7120510594433922,
      "learning_rate": 4.64313640327524e-06,
      "loss": 0.406,
      "step": 266
    },
    {
      "epoch": 2.592233009708738,
      "grad_norm": 0.7835448578972172,
      "learning_rate": 4.640514525444637e-06,
      "loss": 0.6122,
      "step": 267
    },
    {
      "epoch": 2.6019417475728153,
      "grad_norm": 0.9134409217464837,
      "learning_rate": 4.637883797256663e-06,
      "loss": 0.845,
      "step": 268
    },
    {
      "epoch": 2.6116504854368934,
      "grad_norm": 0.7387942613013898,
      "learning_rate": 4.635244229588558e-06,
      "loss": 0.5296,
      "step": 269
    },
    {
      "epoch": 2.6213592233009706,
      "grad_norm": 0.6722442278200302,
      "learning_rate": 4.632595833354105e-06,
      "loss": 0.6111,
      "step": 270
    },
    {
      "epoch": 2.6310679611650487,
      "grad_norm": 0.790100355706828,
      "learning_rate": 4.629938619503593e-06,
      "loss": 0.4931,
      "step": 271
    },
    {
      "epoch": 2.6407766990291264,
      "grad_norm": 0.7457720631344418,
      "learning_rate": 4.627272599023772e-06,
      "loss": 0.6932,
      "step": 272
    },
    {
      "epoch": 2.650485436893204,
      "grad_norm": 0.7742836603714502,
      "learning_rate": 4.6245977829378e-06,
      "loss": 0.8069,
      "step": 273
    },
    {
      "epoch": 2.6601941747572817,
      "grad_norm": 0.7522977861605561,
      "learning_rate": 4.6219141823052035e-06,
      "loss": 0.4594,
      "step": 274
    },
    {
      "epoch": 2.6699029126213594,
      "grad_norm": 0.7897785740037921,
      "learning_rate": 4.619221808221833e-06,
      "loss": 0.7502,
      "step": 275
    },
    {
      "epoch": 2.679611650485437,
      "grad_norm": 0.6542147168655412,
      "learning_rate": 4.616520671819812e-06,
      "loss": 0.455,
      "step": 276
    },
    {
      "epoch": 2.6893203883495147,
      "grad_norm": 1.3324292561191493,
      "learning_rate": 4.613810784267492e-06,
      "loss": 0.615,
      "step": 277
    },
    {
      "epoch": 2.6990291262135924,
      "grad_norm": 0.6366824455259196,
      "learning_rate": 4.61109215676941e-06,
      "loss": 0.4398,
      "step": 278
    },
    {
      "epoch": 2.70873786407767,
      "grad_norm": 0.7464337608282651,
      "learning_rate": 4.608364800566241e-06,
      "loss": 0.587,
      "step": 279
    },
    {
      "epoch": 2.7184466019417477,
      "grad_norm": 0.6895928222340304,
      "learning_rate": 4.605628726934747e-06,
      "loss": 0.6703,
      "step": 280
    },
    {
      "epoch": 2.7281553398058254,
      "grad_norm": 0.6878653541621508,
      "learning_rate": 4.602883947187738e-06,
      "loss": 0.5948,
      "step": 281
    },
    {
      "epoch": 2.737864077669903,
      "grad_norm": 0.6926203162726917,
      "learning_rate": 4.600130472674017e-06,
      "loss": 0.4607,
      "step": 282
    },
    {
      "epoch": 2.7475728155339807,
      "grad_norm": 0.7781936564011823,
      "learning_rate": 4.5973683147783405e-06,
      "loss": 0.7486,
      "step": 283
    },
    {
      "epoch": 2.7572815533980584,
      "grad_norm": 0.7501081998810354,
      "learning_rate": 4.594597484921365e-06,
      "loss": 0.5782,
      "step": 284
    },
    {
      "epoch": 2.766990291262136,
      "grad_norm": 0.7647372977005877,
      "learning_rate": 4.5918179945596055e-06,
      "loss": 0.5159,
      "step": 285
    },
    {
      "epoch": 2.7766990291262137,
      "grad_norm": 0.6977238852528742,
      "learning_rate": 4.589029855185384e-06,
      "loss": 0.5334,
      "step": 286
    },
    {
      "epoch": 2.7864077669902914,
      "grad_norm": 0.9130101610006348,
      "learning_rate": 4.586233078326785e-06,
      "loss": 0.8354,
      "step": 287
    },
    {
      "epoch": 2.796116504854369,
      "grad_norm": 0.8318292434477851,
      "learning_rate": 4.583427675547602e-06,
      "loss": 0.6258,
      "step": 288
    },
    {
      "epoch": 2.8058252427184467,
      "grad_norm": 0.7324981921403528,
      "learning_rate": 4.580613658447301e-06,
      "loss": 0.709,
      "step": 289
    },
    {
      "epoch": 2.8155339805825244,
      "grad_norm": 0.7523751933345019,
      "learning_rate": 4.577791038660959e-06,
      "loss": 0.4851,
      "step": 290
    },
    {
      "epoch": 2.825242718446602,
      "grad_norm": 0.7542503570491638,
      "learning_rate": 4.574959827859226e-06,
      "loss": 0.4493,
      "step": 291
    },
    {
      "epoch": 2.8349514563106797,
      "grad_norm": 0.741187265691656,
      "learning_rate": 4.572120037748273e-06,
      "loss": 0.4682,
      "step": 292
    },
    {
      "epoch": 2.8446601941747574,
      "grad_norm": 0.7141919069039983,
      "learning_rate": 4.5692716800697415e-06,
      "loss": 0.6235,
      "step": 293
    },
    {
      "epoch": 2.854368932038835,
      "grad_norm": 0.7414161387983556,
      "learning_rate": 4.566414766600698e-06,
      "loss": 0.5613,
      "step": 294
    },
    {
      "epoch": 2.8640776699029127,
      "grad_norm": 0.700849509216115,
      "learning_rate": 4.563549309153589e-06,
      "loss": 0.443,
      "step": 295
    },
    {
      "epoch": 2.8737864077669903,
      "grad_norm": 0.7854807058221909,
      "learning_rate": 4.56067531957618e-06,
      "loss": 0.4657,
      "step": 296
    },
    {
      "epoch": 2.883495145631068,
      "grad_norm": 0.7744368603433763,
      "learning_rate": 4.557792809751519e-06,
      "loss": 0.6192,
      "step": 297
    },
    {
      "epoch": 2.8932038834951457,
      "grad_norm": 0.811186979862698,
      "learning_rate": 4.554901791597883e-06,
      "loss": 0.5432,
      "step": 298
    },
    {
      "epoch": 2.9029126213592233,
      "grad_norm": 0.7610230989372687,
      "learning_rate": 4.552002277068725e-06,
      "loss": 0.5689,
      "step": 299
    },
    {
      "epoch": 2.912621359223301,
      "grad_norm": 0.738268676455508,
      "learning_rate": 4.549094278152631e-06,
      "loss": 0.6102,
      "step": 300
    },
    {
      "epoch": 2.9223300970873787,
      "grad_norm": 0.8265276954113333,
      "learning_rate": 4.546177806873266e-06,
      "loss": 0.4803,
      "step": 301
    },
    {
      "epoch": 2.9320388349514563,
      "grad_norm": 0.7994330788168778,
      "learning_rate": 4.543252875289326e-06,
      "loss": 0.5232,
      "step": 302
    },
    {
      "epoch": 2.941747572815534,
      "grad_norm": 0.7803228028067196,
      "learning_rate": 4.540319495494486e-06,
      "loss": 0.5785,
      "step": 303
    },
    {
      "epoch": 2.9514563106796117,
      "grad_norm": 0.812661521852206,
      "learning_rate": 4.537377679617353e-06,
      "loss": 0.5857,
      "step": 304
    },
    {
      "epoch": 2.9611650485436893,
      "grad_norm": 0.8017682940418599,
      "learning_rate": 4.534427439821416e-06,
      "loss": 0.4679,
      "step": 305
    },
    {
      "epoch": 2.970873786407767,
      "grad_norm": 0.7557388320666087,
      "learning_rate": 4.531468788304992e-06,
      "loss": 0.4511,
      "step": 306
    },
    {
      "epoch": 2.9805825242718447,
      "grad_norm": 0.7990082356636408,
      "learning_rate": 4.5285017373011784e-06,
      "loss": 0.4999,
      "step": 307
    },
    {
      "epoch": 2.9902912621359223,
      "grad_norm": 0.8410650831835784,
      "learning_rate": 4.5255262990778024e-06,
      "loss": 0.4279,
      "step": 308
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.67842206115732,
      "learning_rate": 4.522542485937369e-06,
      "loss": 0.4778,
      "step": 309
    },
    {
      "epoch": 3.0097087378640777,
      "grad_norm": 0.7208972107613384,
      "learning_rate": 4.519550310217013e-06,
      "loss": 0.5166,
      "step": 310
    },
    {
      "epoch": 3.0194174757281553,
      "grad_norm": 0.73349262960244,
      "learning_rate": 4.516549784288442e-06,
      "loss": 0.4455,
      "step": 311
    },
    {
      "epoch": 3.029126213592233,
      "grad_norm": 0.6956083097152967,
      "learning_rate": 4.513540920557892e-06,
      "loss": 0.4371,
      "step": 312
    },
    {
      "epoch": 3.0388349514563107,
      "grad_norm": 0.7915417702341847,
      "learning_rate": 4.510523731466072e-06,
      "loss": 0.6448,
      "step": 313
    },
    {
      "epoch": 3.0485436893203883,
      "grad_norm": 0.5990172469379881,
      "learning_rate": 4.507498229488116e-06,
      "loss": 0.3384,
      "step": 314
    },
    {
      "epoch": 3.058252427184466,
      "grad_norm": 0.8318855981134111,
      "learning_rate": 4.504464427133527e-06,
      "loss": 0.4508,
      "step": 315
    },
    {
      "epoch": 3.0679611650485437,
      "grad_norm": 0.8758812451211815,
      "learning_rate": 4.501422336946126e-06,
      "loss": 0.4328,
      "step": 316
    },
    {
      "epoch": 3.0776699029126213,
      "grad_norm": 0.7844299628970943,
      "learning_rate": 4.498371971504005e-06,
      "loss": 0.4222,
      "step": 317
    },
    {
      "epoch": 3.087378640776699,
      "grad_norm": 0.7195150581975005,
      "learning_rate": 4.49531334341947e-06,
      "loss": 0.4417,
      "step": 318
    },
    {
      "epoch": 3.0970873786407767,
      "grad_norm": 0.7671602770728359,
      "learning_rate": 4.49224646533899e-06,
      "loss": 0.42,
      "step": 319
    },
    {
      "epoch": 3.1067961165048543,
      "grad_norm": 0.6929615407642132,
      "learning_rate": 4.489171349943144e-06,
      "loss": 0.4332,
      "step": 320
    },
    {
      "epoch": 3.116504854368932,
      "grad_norm": 0.7086258123510658,
      "learning_rate": 4.486088009946575e-06,
      "loss": 0.3632,
      "step": 321
    },
    {
      "epoch": 3.1262135922330097,
      "grad_norm": 0.7708151250786913,
      "learning_rate": 4.482996458097926e-06,
      "loss": 0.4975,
      "step": 322
    },
    {
      "epoch": 3.1359223300970873,
      "grad_norm": 0.846155524161367,
      "learning_rate": 4.479896707179796e-06,
      "loss": 0.6871,
      "step": 323
    },
    {
      "epoch": 3.145631067961165,
      "grad_norm": 0.8231242232551517,
      "learning_rate": 4.476788770008685e-06,
      "loss": 0.6574,
      "step": 324
    },
    {
      "epoch": 3.1553398058252426,
      "grad_norm": 0.8549632502448101,
      "learning_rate": 4.473672659434941e-06,
      "loss": 0.5856,
      "step": 325
    },
    {
      "epoch": 3.1650485436893203,
      "grad_norm": 0.7228128426745009,
      "learning_rate": 4.470548388342704e-06,
      "loss": 0.4776,
      "step": 326
    },
    {
      "epoch": 3.174757281553398,
      "grad_norm": 0.8108949680823984,
      "learning_rate": 4.467415969649858e-06,
      "loss": 0.4874,
      "step": 327
    },
    {
      "epoch": 3.1844660194174756,
      "grad_norm": 0.6851857911673899,
      "learning_rate": 4.464275416307973e-06,
      "loss": 0.4994,
      "step": 328
    },
    {
      "epoch": 3.1941747572815533,
      "grad_norm": 0.761950009953729,
      "learning_rate": 4.461126741302253e-06,
      "loss": 0.4929,
      "step": 329
    },
    {
      "epoch": 3.203883495145631,
      "grad_norm": 0.8432338612654883,
      "learning_rate": 4.457969957651485e-06,
      "loss": 0.4137,
      "step": 330
    },
    {
      "epoch": 3.2135922330097086,
      "grad_norm": 0.7741138798177974,
      "learning_rate": 4.454805078407979e-06,
      "loss": 0.6696,
      "step": 331
    },
    {
      "epoch": 3.2233009708737863,
      "grad_norm": 0.71765697779996,
      "learning_rate": 4.451632116657521e-06,
      "loss": 0.4506,
      "step": 332
    },
    {
      "epoch": 3.233009708737864,
      "grad_norm": 0.6253829454649015,
      "learning_rate": 4.448451085519314e-06,
      "loss": 0.3586,
      "step": 333
    },
    {
      "epoch": 3.2427184466019416,
      "grad_norm": 0.9686569714580476,
      "learning_rate": 4.445261998145927e-06,
      "loss": 0.4832,
      "step": 334
    },
    {
      "epoch": 3.2524271844660193,
      "grad_norm": 0.682456331535341,
      "learning_rate": 4.442064867723236e-06,
      "loss": 0.4737,
      "step": 335
    },
    {
      "epoch": 3.262135922330097,
      "grad_norm": 0.801934212380896,
      "learning_rate": 4.438859707470376e-06,
      "loss": 0.4988,
      "step": 336
    },
    {
      "epoch": 3.2718446601941746,
      "grad_norm": 0.6210847995583545,
      "learning_rate": 4.435646530639679e-06,
      "loss": 0.3549,
      "step": 337
    },
    {
      "epoch": 3.2815533980582523,
      "grad_norm": 0.7693142736109597,
      "learning_rate": 4.432425350516627e-06,
      "loss": 0.4612,
      "step": 338
    },
    {
      "epoch": 3.29126213592233,
      "grad_norm": 0.7469011823723339,
      "learning_rate": 4.42919618041979e-06,
      "loss": 0.3343,
      "step": 339
    },
    {
      "epoch": 3.3009708737864076,
      "grad_norm": 0.6080757362122666,
      "learning_rate": 4.425959033700776e-06,
      "loss": 0.2422,
      "step": 340
    },
    {
      "epoch": 3.3106796116504853,
      "grad_norm": 0.7696858723921102,
      "learning_rate": 4.422713923744174e-06,
      "loss": 0.5103,
      "step": 341
    },
    {
      "epoch": 3.320388349514563,
      "grad_norm": 0.8778656215024975,
      "learning_rate": 4.419460863967496e-06,
| "loss": 0.5335, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 3.3300970873786406, | |
| "grad_norm": 0.6994532036876416, | |
| "learning_rate": 4.416199867821126e-06, | |
| "loss": 0.5022, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 3.3398058252427183, | |
| "grad_norm": 0.7969656894134531, | |
| "learning_rate": 4.412930948788263e-06, | |
| "loss": 0.4246, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 3.349514563106796, | |
| "grad_norm": 0.8154555970057127, | |
| "learning_rate": 4.409654120384863e-06, | |
| "loss": 0.6664, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 3.3592233009708736, | |
| "grad_norm": 0.7113779335682774, | |
| "learning_rate": 4.406369396159585e-06, | |
| "loss": 0.6024, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 3.3689320388349513, | |
| "grad_norm": 0.7946176349594999, | |
| "learning_rate": 4.403076789693735e-06, | |
| "loss": 0.6273, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 3.378640776699029, | |
| "grad_norm": 0.6725243345900541, | |
| "learning_rate": 4.399776314601212e-06, | |
| "loss": 0.422, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 3.3883495145631066, | |
| "grad_norm": 0.8474457763132652, | |
| "learning_rate": 4.396467984528445e-06, | |
| "loss": 0.5515, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 3.3980582524271843, | |
| "grad_norm": 0.7635758657443303, | |
| "learning_rate": 4.393151813154345e-06, | |
| "loss": 0.417, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 3.407766990291262, | |
| "grad_norm": 0.7301687492460268, | |
| "learning_rate": 4.3898278141902396e-06, | |
| "loss": 0.4335, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 3.4174757281553396, | |
| "grad_norm": 0.8217888155242994, | |
| "learning_rate": 4.386496001379826e-06, | |
| "loss": 0.5301, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 3.4271844660194173, | |
| "grad_norm": 0.700440895441813, | |
| "learning_rate": 4.383156388499106e-06, | |
| "loss": 0.5289, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 3.436893203883495, | |
| "grad_norm": 0.676777993038823, | |
| "learning_rate": 4.3798089893563335e-06, | |
| "loss": 0.4079, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 3.4466019417475726, | |
| "grad_norm": 0.7359945465771945, | |
| "learning_rate": 4.3764538177919555e-06, | |
| "loss": 0.3024, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 3.4563106796116507, | |
| "grad_norm": 0.7431315486816701, | |
| "learning_rate": 4.3730908876785574e-06, | |
| "loss": 0.4715, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 3.466019417475728, | |
| "grad_norm": 0.858728130994808, | |
| "learning_rate": 4.3697202129208e-06, | |
| "loss": 0.5126, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 3.475728155339806, | |
| "grad_norm": 0.7006328621067651, | |
| "learning_rate": 4.36634180745537e-06, | |
| "loss": 0.3219, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 3.4854368932038833, | |
| "grad_norm": 0.8347983820993932, | |
| "learning_rate": 4.3629556852509145e-06, | |
| "loss": 0.7038, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 3.4951456310679614, | |
| "grad_norm": 0.6736438089942041, | |
| "learning_rate": 4.35956186030799e-06, | |
| "loss": 0.388, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 3.5048543689320386, | |
| "grad_norm": 0.7313432635941632, | |
| "learning_rate": 4.356160346659001e-06, | |
| "loss": 0.4803, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 3.5145631067961167, | |
| "grad_norm": 0.914378556446873, | |
| "learning_rate": 4.3527511583681384e-06, | |
| "loss": 0.9472, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 3.524271844660194, | |
| "grad_norm": 0.7928555788899265, | |
| "learning_rate": 4.34933430953133e-06, | |
| "loss": 0.5195, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 3.533980582524272, | |
| "grad_norm": 0.8552957566999786, | |
| "learning_rate": 4.345909814276177e-06, | |
| "loss": 0.5432, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 3.5436893203883493, | |
| "grad_norm": 0.6593786360114855, | |
| "learning_rate": 4.3424776867618935e-06, | |
| "loss": 0.5773, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 3.5533980582524274, | |
| "grad_norm": 0.7453073470450546, | |
| "learning_rate": 4.339037941179253e-06, | |
| "loss": 0.5414, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 3.5631067961165046, | |
| "grad_norm": 0.7285347424646024, | |
| "learning_rate": 4.335590591750526e-06, | |
| "loss": 0.4336, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 3.5728155339805827, | |
| "grad_norm": 0.7229430966497608, | |
| "learning_rate": 4.332135652729423e-06, | |
| "loss": 0.4226, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 3.58252427184466, | |
| "grad_norm": 0.7264295125746805, | |
| "learning_rate": 4.328673138401036e-06, | |
| "loss": 0.5502, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 3.592233009708738, | |
| "grad_norm": 0.7134753381323626, | |
| "learning_rate": 4.325203063081776e-06, | |
| "loss": 0.2876, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 3.6019417475728153, | |
| "grad_norm": 0.7636004423408037, | |
| "learning_rate": 4.32172544111932e-06, | |
| "loss": 0.3651, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 3.6116504854368934, | |
| "grad_norm": 0.731718208729643, | |
| "learning_rate": 4.318240286892544e-06, | |
| "loss": 0.4391, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 3.6213592233009706, | |
| "grad_norm": 0.6392786785401195, | |
| "learning_rate": 4.314747614811471e-06, | |
| "loss": 0.4575, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 3.6310679611650487, | |
| "grad_norm": 0.7082359191028809, | |
| "learning_rate": 4.3112474393172055e-06, | |
| "loss": 0.3473, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 3.6407766990291264, | |
| "grad_norm": 0.704495139070046, | |
| "learning_rate": 4.307739774881878e-06, | |
| "loss": 0.4346, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 3.650485436893204, | |
| "grad_norm": 0.7500124236230435, | |
| "learning_rate": 4.304224636008582e-06, | |
| "loss": 0.2937, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 3.6601941747572817, | |
| "grad_norm": 0.7473030765489458, | |
| "learning_rate": 4.300702037231318e-06, | |
| "loss": 0.5837, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 3.6699029126213594, | |
| "grad_norm": 0.6924606297437474, | |
| "learning_rate": 4.297171993114927e-06, | |
| "loss": 0.2863, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 3.679611650485437, | |
| "grad_norm": 0.7683874913737846, | |
| "learning_rate": 4.2936345182550365e-06, | |
| "loss": 0.5933, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 3.6893203883495147, | |
| "grad_norm": 0.738704129251105, | |
| "learning_rate": 4.290089627277998e-06, | |
| "loss": 0.3695, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.6990291262135924, | |
| "grad_norm": 0.730122979591364, | |
| "learning_rate": 4.286537334840825e-06, | |
| "loss": 0.5314, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 3.70873786407767, | |
| "grad_norm": 0.8561211215862192, | |
| "learning_rate": 4.2829776556311355e-06, | |
| "loss": 0.4077, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 3.7184466019417477, | |
| "grad_norm": 0.8670647072409673, | |
| "learning_rate": 4.279410604367088e-06, | |
| "loss": 0.6157, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 3.7281553398058254, | |
| "grad_norm": 0.8439911656034701, | |
| "learning_rate": 4.275836195797323e-06, | |
| "loss": 0.5611, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 3.737864077669903, | |
| "grad_norm": 0.8111642418779228, | |
| "learning_rate": 4.2722544447008995e-06, | |
| "loss": 0.6242, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 3.7475728155339807, | |
| "grad_norm": 0.8234444876560004, | |
| "learning_rate": 4.268665365887238e-06, | |
| "loss": 0.626, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 3.7572815533980584, | |
| "grad_norm": 0.7499486492298939, | |
| "learning_rate": 4.265068974196056e-06, | |
| "loss": 0.3372, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 3.766990291262136, | |
| "grad_norm": 0.8486583520088408, | |
| "learning_rate": 4.261465284497307e-06, | |
| "loss": 0.3682, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 3.7766990291262137, | |
| "grad_norm": 0.8004979648644168, | |
| "learning_rate": 4.257854311691118e-06, | |
| "loss": 0.5395, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 3.7864077669902914, | |
| "grad_norm": 0.7151044131463972, | |
| "learning_rate": 4.254236070707734e-06, | |
| "loss": 0.3921, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.796116504854369, | |
| "grad_norm": 0.6664683762506176, | |
| "learning_rate": 4.250610576507445e-06, | |
| "loss": 0.357, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 3.8058252427184467, | |
| "grad_norm": 0.8605859568498817, | |
| "learning_rate": 4.246977844080537e-06, | |
| "loss": 0.5304, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 3.8155339805825244, | |
| "grad_norm": 0.8012964647317763, | |
| "learning_rate": 4.24333788844722e-06, | |
| "loss": 0.4708, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 3.825242718446602, | |
| "grad_norm": 0.8135374110450408, | |
| "learning_rate": 4.239690724657571e-06, | |
| "loss": 0.4547, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 3.8349514563106797, | |
| "grad_norm": 1.1239630061318624, | |
| "learning_rate": 4.236036367791471e-06, | |
| "loss": 0.5222, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 3.8446601941747574, | |
| "grad_norm": 0.8761219541832255, | |
| "learning_rate": 4.23237483295854e-06, | |
| "loss": 0.584, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 3.854368932038835, | |
| "grad_norm": 0.7205334687878558, | |
| "learning_rate": 4.228706135298081e-06, | |
| "loss": 0.383, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 3.8640776699029127, | |
| "grad_norm": 0.778638883459709, | |
| "learning_rate": 4.225030289979006e-06, | |
| "loss": 0.3611, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 3.8737864077669903, | |
| "grad_norm": 0.775480657637448, | |
| "learning_rate": 4.221347312199788e-06, | |
| "loss": 0.3474, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 3.883495145631068, | |
| "grad_norm": 0.9124181162931588, | |
| "learning_rate": 4.2176572171883865e-06, | |
| "loss": 0.5554, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.8932038834951457, | |
| "grad_norm": 0.7674226802327085, | |
| "learning_rate": 4.213960020202187e-06, | |
| "loss": 0.5775, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 3.9029126213592233, | |
| "grad_norm": 0.8374874478064829, | |
| "learning_rate": 4.2102557365279435e-06, | |
| "loss": 0.6738, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 3.912621359223301, | |
| "grad_norm": 0.789430401614793, | |
| "learning_rate": 4.206544381481708e-06, | |
| "loss": 0.5645, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 3.9223300970873787, | |
| "grad_norm": 0.8301680910378548, | |
| "learning_rate": 4.202825970408772e-06, | |
| "loss": 0.3362, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 3.9320388349514563, | |
| "grad_norm": 0.7434161207431128, | |
| "learning_rate": 4.199100518683601e-06, | |
| "loss": 0.3865, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 3.941747572815534, | |
| "grad_norm": 0.8023276174001709, | |
| "learning_rate": 4.195368041709772e-06, | |
| "loss": 0.557, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 3.9514563106796117, | |
| "grad_norm": 0.6897944866344006, | |
| "learning_rate": 4.191628554919907e-06, | |
| "loss": 0.4306, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 3.9611650485436893, | |
| "grad_norm": 0.7587292046885276, | |
| "learning_rate": 4.187882073775615e-06, | |
| "loss": 0.4791, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 3.970873786407767, | |
| "grad_norm": 0.7402776540544128, | |
| "learning_rate": 4.184128613767422e-06, | |
| "loss": 0.4587, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 3.9805825242718447, | |
| "grad_norm": 0.918829002078532, | |
| "learning_rate": 4.18036819041471e-06, | |
| "loss": 0.4834, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 3.9902912621359223, | |
| "grad_norm": 0.6951873113715458, | |
| "learning_rate": 4.17660081926565e-06, | |
| "loss": 0.3421, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.6863501093124053, | |
| "learning_rate": 4.172826515897146e-06, | |
| "loss": 0.3388, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 4.009708737864078, | |
| "grad_norm": 0.7114971026737045, | |
| "learning_rate": 4.169045295914757e-06, | |
| "loss": 0.5117, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 4.019417475728155, | |
| "grad_norm": 0.669639982894413, | |
| "learning_rate": 4.165257174952647e-06, | |
| "loss": 0.2925, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 4.029126213592233, | |
| "grad_norm": 0.7192571477195556, | |
| "learning_rate": 4.161462168673508e-06, | |
| "loss": 0.4189, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 4.038834951456311, | |
| "grad_norm": 0.7186614597357849, | |
| "learning_rate": 4.157660292768502e-06, | |
| "loss": 0.5147, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 4.048543689320389, | |
| "grad_norm": 0.7615450499237213, | |
| "learning_rate": 4.1538515629571985e-06, | |
| "loss": 0.3839, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 4.058252427184466, | |
| "grad_norm": 0.7825364795958354, | |
| "learning_rate": 4.1500359949875e-06, | |
| "loss": 0.5142, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 4.067961165048544, | |
| "grad_norm": 0.7567301120413946, | |
| "learning_rate": 4.1462136046355864e-06, | |
| "loss": 0.3761, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 4.077669902912621, | |
| "grad_norm": 0.7496382866591231, | |
| "learning_rate": 4.142384407705846e-06, | |
| "loss": 0.6578, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 4.087378640776699, | |
| "grad_norm": 0.6832493076636806, | |
| "learning_rate": 4.138548420030808e-06, | |
| "loss": 0.2888, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 4.097087378640777, | |
| "grad_norm": 0.8082055763247596, | |
| "learning_rate": 4.13470565747108e-06, | |
| "loss": 0.4464, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 4.106796116504855, | |
| "grad_norm": 0.6922107618500224, | |
| "learning_rate": 4.130856135915282e-06, | |
| "loss": 0.414, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 4.116504854368932, | |
| "grad_norm": 0.8568390451993608, | |
| "learning_rate": 4.126999871279982e-06, | |
| "loss": 0.3524, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 4.12621359223301, | |
| "grad_norm": 0.7555701643934601, | |
| "learning_rate": 4.123136879509626e-06, | |
| "loss": 0.4294, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 4.135922330097087, | |
| "grad_norm": 0.7172542927002353, | |
| "learning_rate": 4.119267176576475e-06, | |
| "loss": 0.464, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 4.145631067961165, | |
| "grad_norm": 0.6682558575648311, | |
| "learning_rate": 4.11539077848054e-06, | |
| "loss": 0.4873, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 4.155339805825243, | |
| "grad_norm": 0.8598473815632454, | |
| "learning_rate": 4.111507701249513e-06, | |
| "loss": 0.4819, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 4.165048543689321, | |
| "grad_norm": 0.9167731886550053, | |
| "learning_rate": 4.107617960938702e-06, | |
| "loss": 0.1954, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 4.174757281553398, | |
| "grad_norm": 0.8392679998978078, | |
| "learning_rate": 4.103721573630965e-06, | |
| "loss": 0.3716, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 4.184466019417476, | |
| "grad_norm": 0.7039689710893157, | |
| "learning_rate": 4.099818555436645e-06, | |
| "loss": 0.399, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 4.194174757281553, | |
| "grad_norm": 0.8962803885867973, | |
| "learning_rate": 4.095908922493499e-06, | |
| "loss": 0.6665, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 4.203883495145631, | |
| "grad_norm": 1.919580266091257, | |
| "learning_rate": 4.091992690966636e-06, | |
| "loss": 0.2248, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 4.213592233009709, | |
| "grad_norm": 0.7602103224056724, | |
| "learning_rate": 4.088069877048447e-06, | |
| "loss": 0.3179, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 4.223300970873787, | |
| "grad_norm": 0.6467738319616958, | |
| "learning_rate": 4.084140496958539e-06, | |
| "loss": 0.2963, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 4.233009708737864, | |
| "grad_norm": 0.7220900194096829, | |
| "learning_rate": 4.080204566943668e-06, | |
| "loss": 0.2475, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 4.242718446601942, | |
| "grad_norm": 1.189054603182352, | |
| "learning_rate": 4.076262103277673e-06, | |
| "loss": 0.2943, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 4.252427184466019, | |
| "grad_norm": 0.7169713980642173, | |
| "learning_rate": 4.072313122261406e-06, | |
| "loss": 0.6236, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 4.262135922330097, | |
| "grad_norm": 0.631689118975417, | |
| "learning_rate": 4.068357640222668e-06, | |
| "loss": 0.2852, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 4.271844660194175, | |
| "grad_norm": 0.756930350000182, | |
| "learning_rate": 4.06439567351614e-06, | |
| "loss": 0.3246, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 4.281553398058253, | |
| "grad_norm": 0.6706899017753607, | |
| "learning_rate": 4.0604272385233105e-06, | |
| "loss": 0.3887, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 4.29126213592233, | |
| "grad_norm": 0.6775487389857546, | |
| "learning_rate": 4.056452351652418e-06, | |
| "loss": 0.2223, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 4.300970873786408, | |
| "grad_norm": 0.8187523228643651, | |
| "learning_rate": 4.052471029338375e-06, | |
| "loss": 0.4288, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 4.310679611650485, | |
| "grad_norm": 0.6200886929947741, | |
| "learning_rate": 4.048483288042703e-06, | |
| "loss": 0.24, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 4.320388349514563, | |
| "grad_norm": 0.7372457408545906, | |
| "learning_rate": 4.0444891442534615e-06, | |
| "loss": 0.3116, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 4.330097087378641, | |
| "grad_norm": 0.7260528356835414, | |
| "learning_rate": 4.040488614485187e-06, | |
| "loss": 0.4287, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 4.339805825242719, | |
| "grad_norm": 0.8887283733391886, | |
| "learning_rate": 4.036481715278818e-06, | |
| "loss": 0.4366, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 4.349514563106796, | |
| "grad_norm": 0.7417768044969674, | |
| "learning_rate": 4.032468463201626e-06, | |
| "loss": 0.4489, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 4.359223300970874, | |
| "grad_norm": 0.767167243913028, | |
| "learning_rate": 4.028448874847152e-06, | |
| "loss": 0.1889, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 4.368932038834951, | |
| "grad_norm": 0.7548738530081098, | |
| "learning_rate": 4.024422966835137e-06, | |
| "loss": 0.3836, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 4.378640776699029, | |
| "grad_norm": 0.7267980574762086, | |
| "learning_rate": 4.0203907558114475e-06, | |
| "loss": 0.3113, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 4.388349514563107, | |
| "grad_norm": 0.7384964389802123, | |
| "learning_rate": 4.016352258448016e-06, | |
| "loss": 0.2558, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 4.398058252427185, | |
| "grad_norm": 0.7354848571749719, | |
| "learning_rate": 4.0123074914427635e-06, | |
| "loss": 0.2631, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 4.407766990291262, | |
| "grad_norm": 0.6121502281910712, | |
| "learning_rate": 4.008256471519536e-06, | |
| "loss": 0.2496, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 4.41747572815534, | |
| "grad_norm": 0.8179123108880375, | |
| "learning_rate": 4.004199215428032e-06, | |
| "loss": 0.3859, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 4.427184466019417, | |
| "grad_norm": 0.9831209055765248, | |
| "learning_rate": 4.000135739943735e-06, | |
| "loss": 0.4455, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 4.436893203883495, | |
| "grad_norm": 0.8368630657095354, | |
| "learning_rate": 3.996066061867844e-06, | |
| "loss": 0.4183, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 4.446601941747573, | |
| "grad_norm": 0.7510366020026782, | |
| "learning_rate": 3.991990198027203e-06, | |
| "loss": 0.3243, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 4.456310679611651, | |
| "grad_norm": 0.7739088378814717, | |
| "learning_rate": 3.987908165274233e-06, | |
| "loss": 0.3937, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 4.466019417475728, | |
| "grad_norm": 0.7451064795923457, | |
| "learning_rate": 3.9838199804868635e-06, | |
| "loss": 0.4762, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 4.475728155339806, | |
| "grad_norm": 0.8005653610588523, | |
| "learning_rate": 3.979725660568456e-06, | |
| "loss": 0.4203, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 4.485436893203883, | |
| "grad_norm": 0.7571415862327683, | |
| "learning_rate": 3.975625222447742e-06, | |
| "loss": 0.2966, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 4.495145631067961, | |
| "grad_norm": 0.9407682824067278, | |
| "learning_rate": 3.97151868307875e-06, | |
| "loss": 0.4631, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 4.504854368932039, | |
| "grad_norm": 0.8280322987446443, | |
| "learning_rate": 3.9674060594407345e-06, | |
| "loss": 0.3439, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 4.514563106796117, | |
| "grad_norm": 0.7056881797171722, | |
| "learning_rate": 3.963287368538105e-06, | |
| "loss": 0.2049, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 4.524271844660194, | |
| "grad_norm": 0.9332990417689767, | |
| "learning_rate": 3.959162627400361e-06, | |
| "loss": 0.4755, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 4.533980582524272, | |
| "grad_norm": 1.1403012923588804, | |
| "learning_rate": 3.9550318530820145e-06, | |
| "loss": 0.4185, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 4.543689320388349, | |
| "grad_norm": 0.689075963184153, | |
| "learning_rate": 3.9508950626625244e-06, | |
| "loss": 0.2702, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 4.553398058252427, | |
| "grad_norm": 0.8020665017742961, | |
| "learning_rate": 3.946752273246224e-06, | |
| "loss": 0.4359, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 4.563106796116505, | |
| "grad_norm": 0.8212257599019372, | |
| "learning_rate": 3.942603501962249e-06, | |
| "loss": 0.352, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 4.572815533980583, | |
| "grad_norm": 0.7742300518258042, | |
| "learning_rate": 3.9384487659644716e-06, | |
| "loss": 0.2489, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 4.58252427184466, | |
| "grad_norm": 0.7529500735779854, | |
| "learning_rate": 3.934288082431423e-06, | |
| "loss": 0.4064, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 4.592233009708738, | |
| "grad_norm": 0.7285209333659659, | |
| "learning_rate": 3.930121468566227e-06, | |
| "loss": 0.4903, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 4.601941747572815, | |
| "grad_norm": 0.702088268251986, | |
| "learning_rate": 3.925948941596528e-06, | |
| "loss": 0.3524, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 4.611650485436893, | |
| "grad_norm": 0.6123252794922119, | |
| "learning_rate": 3.92177051877442e-06, | |
| "loss": 0.291, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 4.621359223300971, | |
| "grad_norm": 0.743156646498327, | |
| "learning_rate": 3.917586217376369e-06, | |
| "loss": 0.435, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 4.631067961165049, | |
| "grad_norm": 0.704106227231604, | |
| "learning_rate": 3.913396054703155e-06, | |
| "loss": 0.4408, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 4.640776699029126, | |
| "grad_norm": 0.7127662571747935, | |
| "learning_rate": 3.909200048079786e-06, | |
| "loss": 0.4177, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 4.650485436893204, | |
| "grad_norm": 0.748418825692663, | |
| "learning_rate": 3.9049982148554384e-06, | |
| "loss": 0.5271, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 4.660194174757281, | |
| "grad_norm": 1.0250685486452722, | |
| "learning_rate": 3.900790572403376e-06, | |
| "loss": 0.3461, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 4.669902912621359, | |
| "grad_norm": 0.7829872249544201, | |
| "learning_rate": 3.896577138120881e-06, | |
| "loss": 0.4164, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 4.679611650485437, | |
| "grad_norm": 0.9232004605724133, | |
| "learning_rate": 3.892357929429187e-06, | |
| "loss": 0.3579, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 4.689320388349515, | |
| "grad_norm": 0.784529648802751, | |
| "learning_rate": 3.8881329637734e-06, | |
| "loss": 0.2236, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 4.699029126213592, | |
| "grad_norm": 0.7262690017605888, | |
| "learning_rate": 3.883902258622431e-06, | |
| "loss": 0.3756, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 4.70873786407767, | |
| "grad_norm": 0.8102856355194789, | |
| "learning_rate": 3.8796658314689205e-06, | |
| "loss": 0.4178, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 4.718446601941747, | |
| "grad_norm": 0.7486005625149885, | |
| "learning_rate": 3.875423699829168e-06, | |
| "loss": 0.2835, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 4.728155339805825, | |
| "grad_norm": 0.7379335654656654, | |
| "learning_rate": 3.871175881243061e-06, | |
| "loss": 0.3489, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 4.737864077669903, | |
| "grad_norm": 0.8104226180405947, | |
| "learning_rate": 3.866922393273999e-06, | |
| "loss": 0.4974, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 4.747572815533981, | |
| "grad_norm": 0.7845427197820135, | |
| "learning_rate": 3.862663253508822e-06, | |
| "loss": 0.3453, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 4.757281553398058, | |
| "grad_norm": 0.7436234031119154, | |
| "learning_rate": 3.858398479557739e-06, | |
| "loss": 0.3273, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 4.766990291262136, | |
| "grad_norm": 2.217227869082823, | |
| "learning_rate": 3.8541280890542565e-06, | |
| "loss": 0.3229, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 4.776699029126213, | |
| "grad_norm": 1.0428680689060519, | |
| "learning_rate": 3.849852099655102e-06, | |
| "loss": 0.4476, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 4.786407766990291, | |
| "grad_norm": 0.7093952303779135, | |
| "learning_rate": 3.845570529040151e-06, | |
| "loss": 0.3531, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 4.796116504854369, | |
| "grad_norm": 0.7919634533614799, | |
| "learning_rate": 3.841283394912361e-06, | |
| "loss": 0.3435, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 4.805825242718447, | |
| "grad_norm": 0.7136080403274071, | |
| "learning_rate": 3.836990714997686e-06, | |
| "loss": 0.4444, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 4.815533980582524, | |
| "grad_norm": 0.8376006583714606, | |
| "learning_rate": 3.832692507045015e-06, | |
| "loss": 0.4478, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 4.825242718446602, | |
| "grad_norm": 0.6616123052279387, | |
| "learning_rate": 3.828388788826091e-06, | |
| "loss": 0.5166, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 4.834951456310679, | |
| "grad_norm": 0.7483368464194841, | |
| "learning_rate": 3.824079578135442e-06, | |
| "loss": 0.4151, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 4.844660194174757, | |
| "grad_norm": 0.646917604155803, | |
| "learning_rate": 3.819764892790307e-06, | |
| "loss": 0.4058, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 4.854368932038835, | |
| "grad_norm": 0.8267691480427513, | |
| "learning_rate": 3.815444750630555e-06, | |
| "loss": 0.3406, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 4.864077669902913, | |
| "grad_norm": 0.8928434754303194, | |
| "learning_rate": 3.811119169518624e-06, | |
| "loss": 0.541, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 4.87378640776699, | |
| "grad_norm": 0.9334290516881728, | |
| "learning_rate": 3.8067881673394363e-06, | |
| "loss": 0.6994, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 4.883495145631068, | |
| "grad_norm": 0.7272706791055442, | |
| "learning_rate": 3.802451762000331e-06, | |
| "loss": 0.3112, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 4.893203883495145, | |
| "grad_norm": 0.6753535996733946, | |
| "learning_rate": 3.7981099714309856e-06, | |
| "loss": 0.2749, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 4.902912621359223, | |
| "grad_norm": 0.8446207256750283, | |
| "learning_rate": 3.7937628135833453e-06, | |
| "loss": 0.354, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 4.9126213592233015, | |
| "grad_norm": 0.8427902134243664, | |
| "learning_rate": 3.7894103064315463e-06, | |
| "loss": 0.3765, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 4.922330097087379, | |
| "grad_norm": 0.7801637629685254, | |
| "learning_rate": 3.7850524679718424e-06, | |
| "loss": 0.3014, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 4.932038834951456, | |
| "grad_norm": 0.7448345484733, | |
| "learning_rate": 3.7806893162225328e-06, | |
| "loss": 0.2862, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 4.941747572815534, | |
| "grad_norm": 0.7506722271057215, | |
| "learning_rate": 3.7763208692238818e-06, | |
| "loss": 0.4005, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 4.951456310679612, | |
| "grad_norm": 0.7507341810173735, | |
| "learning_rate": 3.7719471450380518e-06, | |
| "loss": 0.3154, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 4.961165048543689, | |
| "grad_norm": 0.7509927539126146, | |
| "learning_rate": 3.7675681617490212e-06, | |
| "loss": 0.5283, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 4.970873786407767, | |
| "grad_norm": 0.7780851180094496, | |
| "learning_rate": 3.7631839374625167e-06, | |
| "loss": 0.5371, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 4.980582524271845, | |
| "grad_norm": 0.7269269443801724, | |
| "learning_rate": 3.758794490305932e-06, | |
| "loss": 0.3471, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 4.990291262135923, | |
| "grad_norm": 0.7023346099879195, | |
| "learning_rate": 3.7543998384282565e-06, | |
| "loss": 0.3844, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.6668852497704912, | |
| "learning_rate": 3.7500000000000005e-06, | |
| "loss": 0.4468, | |
| "step": 515 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 1545, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 15, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 57257695182848.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
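
The state above is the standard `trainer_state.json` that Hugging Face Transformers checkpoints write: `log_history` holds one record per logged step, and the trailing keys describe the run (1545 max steps, 15 epochs, batch size 1, logging every step). A minimal sketch of how one might load this file and plot the logged loss curve follows; the filename `trainer_state.json`, the output path, and the smoothing window are assumptions for illustration, not part of the original file.

```python
# Sketch: read a Transformers trainer_state.json and plot its loss curve.
# Assumes the JSON above is saved as "trainer_state.json" in the working dir.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only entries that carry a training loss (eval entries may lack it).
entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

# Per-step losses are noisy; overlay a simple moving average (window is
# an arbitrary choice here).
w = 10
smoothed = [
    sum(losses[max(0, i - w + 1): i + 1]) / (i - max(0, i - w + 1) + 1)
    for i in range(len(losses))
]

plt.plot(steps, losses, alpha=0.3, label="raw loss")
plt.plot(steps, smoothed, label=f"{w}-step moving average")
plt.xlabel("step")
plt.ylabel("training loss")
plt.legend()
plt.savefig("loss_curve.png")
```

The same pattern extracts any other logged field, e.g. `e["learning_rate"]`, whose values here trace a cosine decay from the 5e-6 peak seen at step 1.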