| { | |
| "best_metric": 0.029243575409054756, | |
| "best_model_checkpoint": "saves/chess/no_explain/checkpoint-4000", | |
| "epoch": 9.992390869042852, | |
| "eval_steps": 1000, | |
| "global_step": 12480, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.08009611533840609, | |
| "grad_norm": 0.8625897724596373, | |
| "learning_rate": 4.006410256410257e-07, | |
| "loss": 1.3897, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16019223067681218, | |
| "grad_norm": 0.8895947937892531, | |
| "learning_rate": 8.012820512820515e-07, | |
| "loss": 0.0598, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.24028834601521826, | |
| "grad_norm": 0.5221246844134636, | |
| "learning_rate": 1.201923076923077e-06, | |
| "loss": 0.0551, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.32038446135362436, | |
| "grad_norm": 0.5590357289952654, | |
| "learning_rate": 1.602564102564103e-06, | |
| "loss": 0.0516, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.4004805766920304, | |
| "grad_norm": 0.36991974174438536, | |
| "learning_rate": 2.0032051282051286e-06, | |
| "loss": 0.0501, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.4805766920304365, | |
| "grad_norm": 0.6389443947236714, | |
| "learning_rate": 2.403846153846154e-06, | |
| "loss": 0.0486, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.5606728073688426, | |
| "grad_norm": 0.44563280571067243, | |
| "learning_rate": 2.8044871794871797e-06, | |
| "loss": 0.0463, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.6407689227072487, | |
| "grad_norm": 0.44266380357676305, | |
| "learning_rate": 3.205128205128206e-06, | |
| "loss": 0.0447, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.7208650380456548, | |
| "grad_norm": 0.585654631503778, | |
| "learning_rate": 3.605769230769231e-06, | |
| "loss": 0.0441, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.8009611533840608, | |
| "grad_norm": 0.600751877456253, | |
| "learning_rate": 4.006410256410257e-06, | |
| "loss": 0.0429, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.8009611533840608, | |
| "eval_loss": 0.042210426181554794, | |
| "eval_runtime": 97.133, | |
| "eval_samples_per_second": 1462.17, | |
| "eval_steps_per_second": 2.862, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.8810572687224669, | |
| "grad_norm": 0.2641551118831142, | |
| "learning_rate": 4.4070512820512826e-06, | |
| "loss": 0.0414, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.961153384060873, | |
| "grad_norm": 0.29049561928975876, | |
| "learning_rate": 4.807692307692308e-06, | |
| "loss": 0.0402, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.0408490188225872, | |
| "grad_norm": 0.5344113116420023, | |
| "learning_rate": 4.999735579817769e-06, | |
| "loss": 0.0386, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.1209451341609933, | |
| "grad_norm": 0.31257482202449377, | |
| "learning_rate": 4.997740994288484e-06, | |
| "loss": 0.0373, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.2010412494993994, | |
| "grad_norm": 0.4593106982622164, | |
| "learning_rate": 4.993792498360407e-06, | |
| "loss": 0.0366, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.2811373648378055, | |
| "grad_norm": 0.2012883704449717, | |
| "learning_rate": 4.9878931808274796e-06, | |
| "loss": 0.0357, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.3612334801762114, | |
| "grad_norm": 0.22908626001592647, | |
| "learning_rate": 4.980047656554856e-06, | |
| "loss": 0.0352, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.4413295955146175, | |
| "grad_norm": 0.3169879320183415, | |
| "learning_rate": 4.970262062868821e-06, | |
| "loss": 0.0346, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.5214257108530236, | |
| "grad_norm": 0.2078878255601618, | |
| "learning_rate": 4.958544054755741e-06, | |
| "loss": 0.0336, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.6015218261914297, | |
| "grad_norm": 0.2978110993331312, | |
| "learning_rate": 4.944902798873794e-06, | |
| "loss": 0.0329, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.6015218261914297, | |
| "eval_loss": 0.03361953794956207, | |
| "eval_runtime": 97.2876, | |
| "eval_samples_per_second": 1459.847, | |
| "eval_steps_per_second": 2.858, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.6816179415298358, | |
| "grad_norm": 0.16678424956102253, | |
| "learning_rate": 4.92934896638215e-06, | |
| "loss": 0.0328, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.761714056868242, | |
| "grad_norm": 0.19029664571581045, | |
| "learning_rate": 4.91189472459324e-06, | |
| "loss": 0.0316, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.841810172206648, | |
| "grad_norm": 0.2388908631462674, | |
| "learning_rate": 4.892553727454616e-06, | |
| "loss": 0.0317, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.921906287545054, | |
| "grad_norm": 0.15794270702360638, | |
| "learning_rate": 4.8713411048678635e-06, | |
| "loss": 0.0309, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.0016019223067683, | |
| "grad_norm": 0.2103115075663395, | |
| "learning_rate": 4.848273450852921e-06, | |
| "loss": 0.0305, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.0816980376451744, | |
| "grad_norm": 0.28601246983481904, | |
| "learning_rate": 4.823368810567056e-06, | |
| "loss": 0.0268, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.1617941529835805, | |
| "grad_norm": 0.25522616878445004, | |
| "learning_rate": 4.796646666188663e-06, | |
| "loss": 0.0268, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.2418902683219866, | |
| "grad_norm": 0.2343538332348778, | |
| "learning_rate": 4.768127921676916e-06, | |
| "loss": 0.0272, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.3219863836603922, | |
| "grad_norm": 0.22903658893889398, | |
| "learning_rate": 4.737834886419217e-06, | |
| "loss": 0.0297, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.4020824989987988, | |
| "grad_norm": 0.19855668130980528, | |
| "learning_rate": 4.705791257779196e-06, | |
| "loss": 0.0275, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.4020824989987988, | |
| "eval_loss": 0.029653793200850487, | |
| "eval_runtime": 97.2179, | |
| "eval_samples_per_second": 1460.893, | |
| "eval_steps_per_second": 2.86, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.4821786143372044, | |
| "grad_norm": 0.1868527106405498, | |
| "learning_rate": 4.672022102558958e-06, | |
| "loss": 0.0269, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.562274729675611, | |
| "grad_norm": 0.1985255713449175, | |
| "learning_rate": 4.636553837390051e-06, | |
| "loss": 0.0269, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 2.6423708450140166, | |
| "grad_norm": 0.17528235376425527, | |
| "learning_rate": 4.5994142080684956e-06, | |
| "loss": 0.026, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 2.7224669603524227, | |
| "grad_norm": 0.20238382028782428, | |
| "learning_rate": 4.560632267850054e-06, | |
| "loss": 0.026, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 2.802563075690829, | |
| "grad_norm": 0.20789525240306345, | |
| "learning_rate": 4.5202383547227134e-06, | |
| "loss": 0.0257, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.882659191029235, | |
| "grad_norm": 0.2849074845845128, | |
| "learning_rate": 4.478264067674155e-06, | |
| "loss": 0.0256, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 2.962755306367641, | |
| "grad_norm": 0.1826392119567578, | |
| "learning_rate": 4.43474224197278e-06, | |
| "loss": 0.0255, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 3.0424509411293554, | |
| "grad_norm": 0.3254043272458406, | |
| "learning_rate": 4.389706923481633e-06, | |
| "loss": 0.0224, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 3.122547056467761, | |
| "grad_norm": 0.2695456046362865, | |
| "learning_rate": 4.34319334202531e-06, | |
| "loss": 0.0198, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 3.202643171806167, | |
| "grad_norm": 0.24345073976828904, | |
| "learning_rate": 4.2952378838306855e-06, | |
| "loss": 0.0202, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 3.202643171806167, | |
| "eval_loss": 0.029243575409054756, | |
| "eval_runtime": 97.6159, | |
| "eval_samples_per_second": 1454.937, | |
| "eval_steps_per_second": 2.848, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 3.2827392871445733, | |
| "grad_norm": 0.3753413906545954, | |
| "learning_rate": 4.245878063063022e-06, | |
| "loss": 0.0205, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 3.3628354024829794, | |
| "grad_norm": 0.2460926534460345, | |
| "learning_rate": 4.195152492479727e-06, | |
| "loss": 0.0205, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 3.4429315178213855, | |
| "grad_norm": 0.2704381094416959, | |
| "learning_rate": 4.143100853224714e-06, | |
| "loss": 0.0204, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 3.5230276331597916, | |
| "grad_norm": 0.32177852781904165, | |
| "learning_rate": 4.089763863786988e-06, | |
| "loss": 0.0204, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 3.6031237484981977, | |
| "grad_norm": 0.24794031349246146, | |
| "learning_rate": 4.035183248147752e-06, | |
| "loss": 0.0201, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 3.683219863836604, | |
| "grad_norm": 0.2548491545100107, | |
| "learning_rate": 3.979401703140955e-06, | |
| "loss": 0.02, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 3.76331597917501, | |
| "grad_norm": 0.28339343421860097, | |
| "learning_rate": 3.922462865052782e-06, | |
| "loss": 0.0206, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 3.843412094513416, | |
| "grad_norm": 0.25858197249007897, | |
| "learning_rate": 3.8644112754862614e-06, | |
| "loss": 0.0199, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 3.923508209851822, | |
| "grad_norm": 0.25917676686664276, | |
| "learning_rate": 3.805292346517659e-06, | |
| "loss": 0.0199, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 4.0032038446135365, | |
| "grad_norm": 0.205217434085613, | |
| "learning_rate": 3.745152325171921e-06, | |
| "loss": 0.0194, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 4.0032038446135365, | |
| "eval_loss": 0.02939535118639469, | |
| "eval_runtime": 97.4425, | |
| "eval_samples_per_second": 1457.526, | |
| "eval_steps_per_second": 2.853, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 4.083299959951942, | |
| "grad_norm": 0.3449210512817333, | |
| "learning_rate": 3.6840382572449733e-06, | |
| "loss": 0.0109, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 4.163396075290349, | |
| "grad_norm": 0.35702251381560834, | |
| "learning_rate": 3.621997950501156e-06, | |
| "loss": 0.0116, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 4.243492190628754, | |
| "grad_norm": 0.3503016584030036, | |
| "learning_rate": 3.5590799372745915e-06, | |
| "loss": 0.0119, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 4.323588305967161, | |
| "grad_norm": 0.32698876302828034, | |
| "learning_rate": 3.495333436503753e-06, | |
| "loss": 0.0125, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 4.403684421305567, | |
| "grad_norm": 0.29218555867917617, | |
| "learning_rate": 3.4308083152289073e-06, | |
| "loss": 0.0122, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 4.483780536643973, | |
| "grad_norm": 0.42870050776267266, | |
| "learning_rate": 3.3655550495825824e-06, | |
| "loss": 0.0121, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 4.563876651982379, | |
| "grad_norm": 0.3841200097431653, | |
| "learning_rate": 3.2996246853035417e-06, | |
| "loss": 0.0123, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 4.6439727673207845, | |
| "grad_norm": 0.27276776968480937, | |
| "learning_rate": 3.233068797805194e-06, | |
| "loss": 0.0121, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 4.724068882659191, | |
| "grad_norm": 0.37618566324117403, | |
| "learning_rate": 3.1659394518296303e-06, | |
| "loss": 0.0121, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 4.8041649979975976, | |
| "grad_norm": 0.3053361427605705, | |
| "learning_rate": 3.0982891607188948e-06, | |
| "loss": 0.0119, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 4.8041649979975976, | |
| "eval_loss": 0.031129568815231323, | |
| "eval_runtime": 97.1562, | |
| "eval_samples_per_second": 1461.821, | |
| "eval_steps_per_second": 2.861, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 4.884261113336003, | |
| "grad_norm": 0.24611176483050773, | |
| "learning_rate": 3.0301708453353118e-06, | |
| "loss": 0.0121, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 4.964357228674409, | |
| "grad_norm": 0.30724706018820913, | |
| "learning_rate": 2.961637792663032e-06, | |
| "loss": 0.0114, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 5.044052863436123, | |
| "grad_norm": 0.38244952925905945, | |
| "learning_rate": 2.8927436141231695e-06, | |
| "loss": 0.0075, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 5.12414897877453, | |
| "grad_norm": 0.20603254501695356, | |
| "learning_rate": 2.8235422036351384e-06, | |
| "loss": 0.0044, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 5.2042450941129355, | |
| "grad_norm": 0.2637357295160275, | |
| "learning_rate": 2.754087695457005e-06, | |
| "loss": 0.0043, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 5.284341209451342, | |
| "grad_norm": 0.5274953505653177, | |
| "learning_rate": 2.684434421837821e-06, | |
| "loss": 0.0045, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 5.364437324789748, | |
| "grad_norm": 0.4238975113115418, | |
| "learning_rate": 2.6146368705150854e-06, | |
| "loss": 0.0048, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 5.444533440128154, | |
| "grad_norm": 0.37315897649626995, | |
| "learning_rate": 2.5447496420905608e-06, | |
| "loss": 0.0045, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 5.52462955546656, | |
| "grad_norm": 0.33573760401057196, | |
| "learning_rate": 2.4748274073178114e-06, | |
| "loss": 0.0045, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 5.6047256708049655, | |
| "grad_norm": 0.4205706583224986, | |
| "learning_rate": 2.4049248643348512e-06, | |
| "loss": 0.0048, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 5.6047256708049655, | |
| "eval_loss": 0.04388193413615227, | |
| "eval_runtime": 97.237, | |
| "eval_samples_per_second": 1460.607, | |
| "eval_steps_per_second": 2.859, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 5.684821786143372, | |
| "grad_norm": 0.3352151910327754, | |
| "learning_rate": 2.3350966958753766e-06, | |
| "loss": 0.0043, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 5.764917901481779, | |
| "grad_norm": 0.30090375390642815, | |
| "learning_rate": 2.265397526492052e-06, | |
| "loss": 0.0043, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 5.845014016820184, | |
| "grad_norm": 0.3649497532401096, | |
| "learning_rate": 2.195881879825301e-06, | |
| "loss": 0.0043, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 5.92511013215859, | |
| "grad_norm": 0.26280545277109674, | |
| "learning_rate": 2.1266041359510456e-06, | |
| "loss": 0.0043, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 6.004805766920304, | |
| "grad_norm": 0.13356592430041458, | |
| "learning_rate": 2.057618488840745e-06, | |
| "loss": 0.004, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 6.084901882258711, | |
| "grad_norm": 0.5145664187486052, | |
| "learning_rate": 1.9889789039670276e-06, | |
| "loss": 0.0014, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 6.1649979975971165, | |
| "grad_norm": 0.4699723392536862, | |
| "learning_rate": 1.9207390760880605e-06, | |
| "loss": 0.0014, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 6.245094112935522, | |
| "grad_norm": 0.4374296333529995, | |
| "learning_rate": 1.852952387243698e-06, | |
| "loss": 0.0013, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 6.325190228273929, | |
| "grad_norm": 0.42961763489773475, | |
| "learning_rate": 1.7856718649962606e-06, | |
| "loss": 0.0013, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 6.405286343612334, | |
| "grad_norm": 0.2569945673800255, | |
| "learning_rate": 1.7189501409486061e-06, | |
| "loss": 0.0013, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 6.405286343612334, | |
| "eval_loss": 0.053785648196935654, | |
| "eval_runtime": 97.6698, | |
| "eval_samples_per_second": 1454.134, | |
| "eval_steps_per_second": 2.846, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 6.485382458950741, | |
| "grad_norm": 0.36869037970588475, | |
| "learning_rate": 1.6528394095719558e-06, | |
| "loss": 0.0013, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 6.565478574289147, | |
| "grad_norm": 0.2187928895873153, | |
| "learning_rate": 1.587391387375669e-06, | |
| "loss": 0.0014, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 6.645574689627553, | |
| "grad_norm": 0.3346302824445088, | |
| "learning_rate": 1.522657272450917e-06, | |
| "loss": 0.0013, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 6.725670804965959, | |
| "grad_norm": 0.22583442175391086, | |
| "learning_rate": 1.4586877044199015e-06, | |
| "loss": 0.0014, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 6.805766920304365, | |
| "grad_norm": 0.24275774632690653, | |
| "learning_rate": 1.3955327248219438e-06, | |
| "loss": 0.0014, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 6.885863035642771, | |
| "grad_norm": 0.322120144658376, | |
| "learning_rate": 1.3332417379674426e-06, | |
| "loss": 0.0012, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 6.965959150981178, | |
| "grad_norm": 0.2971892796613953, | |
| "learning_rate": 1.2718634722903073e-06, | |
| "loss": 0.0013, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 7.045654785742891, | |
| "grad_norm": 0.18481532134302478, | |
| "learning_rate": 1.2114459422291205e-06, | |
| "loss": 0.0007, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 7.125750901081298, | |
| "grad_norm": 0.02287021398890685, | |
| "learning_rate": 1.1520364106668342e-06, | |
| "loss": 0.0003, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 7.205847016419703, | |
| "grad_norm": 0.2801253618567114, | |
| "learning_rate": 1.093681351958383e-06, | |
| "loss": 0.0004, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 7.205847016419703, | |
| "eval_loss": 0.06704169511795044, | |
| "eval_runtime": 97.5519, | |
| "eval_samples_per_second": 1455.892, | |
| "eval_steps_per_second": 2.85, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 7.28594313175811, | |
| "grad_norm": 0.05323383136377585, | |
| "learning_rate": 1.0364264155751489e-06, | |
| "loss": 0.0004, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 7.3660392470965155, | |
| "grad_norm": 0.10232274475527954, | |
| "learning_rate": 9.803163903946952e-07, | |
| "loss": 0.0004, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 7.446135362434922, | |
| "grad_norm": 0.1820198743026229, | |
| "learning_rate": 9.253951696637311e-07, | |
| "loss": 0.0004, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 7.526231477773328, | |
| "grad_norm": 0.3077085461325738, | |
| "learning_rate": 8.717057166616926e-07, | |
| "loss": 0.0003, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 7.606327593111734, | |
| "grad_norm": 0.02788433448517323, | |
| "learning_rate": 8.192900310918206e-07, | |
| "loss": 0.0004, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 7.68642370845014, | |
| "grad_norm": 0.454935336405101, | |
| "learning_rate": 7.681891162260016e-07, | |
| "loss": 0.0004, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 7.766519823788546, | |
| "grad_norm": 0.404101132597737, | |
| "learning_rate": 7.184429468291023e-07, | |
| "loss": 0.0003, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 7.846615939126952, | |
| "grad_norm": 0.006797483493599147, | |
| "learning_rate": 6.700904378878675e-07, | |
| "loss": 0.0004, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 7.926712054465359, | |
| "grad_norm": 0.253219681405225, | |
| "learning_rate": 6.231694141688535e-07, | |
| "loss": 0.0003, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 8.006407689227073, | |
| "grad_norm": 0.01676261471498421, | |
| "learning_rate": 5.777165806292109e-07, | |
| "loss": 0.0003, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 8.006407689227073, | |
| "eval_loss": 0.06978683918714523, | |
| "eval_runtime": 97.4682, | |
| "eval_samples_per_second": 1457.142, | |
| "eval_steps_per_second": 2.852, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 8.086503804565478, | |
| "grad_norm": 0.023667739210693765, | |
| "learning_rate": 5.337674937034581e-07, | |
| "loss": 0.0001, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 8.166599919903884, | |
| "grad_norm": 0.19018699551662502, | |
| "learning_rate": 4.913565334887135e-07, | |
| "loss": 0.0001, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 8.246696035242291, | |
| "grad_norm": 0.15520052508971907, | |
| "learning_rate": 4.505168768501431e-07, | |
| "loss": 0.0001, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 8.326792150580697, | |
| "grad_norm": 0.002693072772497186, | |
| "learning_rate": 4.1128047146765936e-07, | |
| "loss": 0.0001, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 8.406888265919102, | |
| "grad_norm": 0.07912436909277526, | |
| "learning_rate": 3.736780108441762e-07, | |
| "loss": 0.0001, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 8.486984381257509, | |
| "grad_norm": 0.0704194063541305, | |
| "learning_rate": 3.3773891029497326e-07, | |
| "loss": 0.0001, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 8.567080496595915, | |
| "grad_norm": 0.0031901574938772484, | |
| "learning_rate": 3.034912839369447e-07, | |
| "loss": 0.0, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 8.647176611934322, | |
| "grad_norm": 0.3276934498065665, | |
| "learning_rate": 2.70961922695743e-07, | |
| "loss": 0.0001, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 8.727272727272727, | |
| "grad_norm": 0.0018329070981180388, | |
| "learning_rate": 2.401762733480115e-07, | |
| "loss": 0.0, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 8.807368842611133, | |
| "grad_norm": 0.002408780413295549, | |
| "learning_rate": 2.1115841861510945e-07, | |
| "loss": 0.0, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 8.807368842611133, | |
| "eval_loss": 0.08943355828523636, | |
| "eval_runtime": 97.2207, | |
| "eval_samples_per_second": 1460.852, | |
| "eval_steps_per_second": 2.859, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 8.88746495794954, | |
| "grad_norm": 0.005344361337035522, | |
| "learning_rate": 1.8393105832389791e-07, | |
| "loss": 0.0, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 8.967561073287946, | |
| "grad_norm": 0.008809607265012539, | |
| "learning_rate": 1.5851549164932118e-07, | |
| "loss": 0.0001, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 9.047256708049659, | |
| "grad_norm": 0.004351746843250683, | |
| "learning_rate": 1.349316004526824e-07, | |
| "loss": 0.0, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 9.127352823388065, | |
| "grad_norm": 0.0008942462172532464, | |
| "learning_rate": 1.1319783372863601e-07, | |
| "loss": 0.0, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 9.207448938726472, | |
| "grad_norm": 0.0009999088005623051, | |
| "learning_rate": 9.333119317307598e-08, | |
| "loss": 0.0, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 9.287545054064879, | |
| "grad_norm": 0.006833873365903121, | |
| "learning_rate": 7.534721988320143e-08, | |
| "loss": 0.0, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 9.367641169403283, | |
| "grad_norm": 0.001580786758369194, | |
| "learning_rate": 5.92599822001666e-08, | |
| "loss": 0.0, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 9.44773728474169, | |
| "grad_norm": 0.08223063305947663, | |
| "learning_rate": 4.508206470382554e-08, | |
| "loss": 0.0, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 9.527833400080096, | |
| "grad_norm": 0.0003265712066290809, | |
| "learning_rate": 3.2824558368179384e-08, | |
| "loss": 0.0, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 9.607929515418503, | |
| "grad_norm": 0.0005479447690907845, | |
| "learning_rate": 2.2497051885228825e-08, | |
| "loss": 0.0, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 9.607929515418503, | |
| "eval_loss": 0.09308738261461258, | |
| "eval_runtime": 97.3361, | |
| "eval_samples_per_second": 1459.119, | |
| "eval_steps_per_second": 2.856, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 9.688025630756908, | |
| "grad_norm": 0.01973266591029808, | |
| "learning_rate": 1.4107624164019229e-08, | |
| "loss": 0.0, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 9.768121746095314, | |
| "grad_norm": 0.0007774042502156854, | |
| "learning_rate": 7.662838010742413e-09, | |
| "loss": 0.0, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 9.84821786143372, | |
| "grad_norm": 0.0003574216553306887, | |
| "learning_rate": 3.1677349948461277e-09, | |
| "loss": 0.0, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 9.928313976772127, | |
| "grad_norm": 0.0005660328857731791, | |
| "learning_rate": 6.258315051568819e-10, | |
| "loss": 0.0, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 9.992390869042852, | |
| "step": 12480, | |
| "total_flos": 2784163811819520.0, | |
| "train_loss": 0.025371345406674895, | |
| "train_runtime": 36703.9164, | |
| "train_samples_per_second": 348.252, | |
| "train_steps_per_second": 0.34 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 12480, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2784163811819520.0, | |
| "train_batch_size": 64, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |