| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.999356582164458, | |
| "eval_steps": 1000, | |
| "global_step": 19425, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.012868356710848025, | |
| "grad_norm": 5.15623140335083, | |
| "learning_rate": 4.987387387387388e-05, | |
| "loss": 1.7328, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.02573671342169605, | |
| "grad_norm": 4.769791603088379, | |
| "learning_rate": 4.9745173745173744e-05, | |
| "loss": 1.3827, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.03860507013254407, | |
| "grad_norm": 3.983968734741211, | |
| "learning_rate": 4.9616473616473616e-05, | |
| "loss": 1.2993, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0514734268433921, | |
| "grad_norm": 4.153149127960205, | |
| "learning_rate": 4.9487773487773494e-05, | |
| "loss": 1.2884, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.06434178355424013, | |
| "grad_norm": 3.8115487098693848, | |
| "learning_rate": 4.935907335907336e-05, | |
| "loss": 1.2803, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.07721014026508814, | |
| "grad_norm": 4.397896766662598, | |
| "learning_rate": 4.923037323037323e-05, | |
| "loss": 1.2442, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.09007849697593617, | |
| "grad_norm": 4.619286060333252, | |
| "learning_rate": 4.91016731016731e-05, | |
| "loss": 1.2187, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.1029468536867842, | |
| "grad_norm": 4.834469795227051, | |
| "learning_rate": 4.8972972972972974e-05, | |
| "loss": 1.1784, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.11581521039763222, | |
| "grad_norm": 3.5152344703674316, | |
| "learning_rate": 4.8844272844272846e-05, | |
| "loss": 1.162, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.12868356710848025, | |
| "grad_norm": 4.991385459899902, | |
| "learning_rate": 4.871557271557272e-05, | |
| "loss": 1.2053, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.14155192381932827, | |
| "grad_norm": 3.806436538696289, | |
| "learning_rate": 4.8589446589446594e-05, | |
| "loss": 1.1554, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.15442028053017628, | |
| "grad_norm": 4.663485050201416, | |
| "learning_rate": 4.8460746460746466e-05, | |
| "loss": 1.135, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.16728863724102433, | |
| "grad_norm": 4.0664238929748535, | |
| "learning_rate": 4.833204633204633e-05, | |
| "loss": 1.1372, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.18015699395187235, | |
| "grad_norm": 3.8850207328796387, | |
| "learning_rate": 4.82033462033462e-05, | |
| "loss": 1.0818, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.19302535066272036, | |
| "grad_norm": 3.971874713897705, | |
| "learning_rate": 4.807464607464608e-05, | |
| "loss": 1.1086, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.2058937073735684, | |
| "grad_norm": 3.79402232170105, | |
| "learning_rate": 4.7945945945945946e-05, | |
| "loss": 1.1449, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.21876206408441642, | |
| "grad_norm": 4.102041721343994, | |
| "learning_rate": 4.781724581724582e-05, | |
| "loss": 1.1066, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.23163042079526444, | |
| "grad_norm": 4.198806285858154, | |
| "learning_rate": 4.768854568854569e-05, | |
| "loss": 1.1604, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.24449877750611246, | |
| "grad_norm": 3.196410655975342, | |
| "learning_rate": 4.755984555984556e-05, | |
| "loss": 1.1193, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.2573671342169605, | |
| "grad_norm": 4.250946521759033, | |
| "learning_rate": 4.743114543114543e-05, | |
| "loss": 1.1378, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.2573671342169605, | |
| "eval_loss": 1.026713252067566, | |
| "eval_runtime": 189.3511, | |
| "eval_samples_per_second": 102.592, | |
| "eval_steps_per_second": 10.261, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.2702354909278085, | |
| "grad_norm": 3.319427251815796, | |
| "learning_rate": 4.7302445302445304e-05, | |
| "loss": 1.0153, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.28310384763865654, | |
| "grad_norm": 3.8477160930633545, | |
| "learning_rate": 4.7173745173745175e-05, | |
| "loss": 1.0782, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.29597220434950455, | |
| "grad_norm": 3.674345016479492, | |
| "learning_rate": 4.704504504504505e-05, | |
| "loss": 1.0935, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.30884056106035257, | |
| "grad_norm": 3.0566294193267822, | |
| "learning_rate": 4.691634491634492e-05, | |
| "loss": 1.1357, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.32170891777120064, | |
| "grad_norm": 3.81984806060791, | |
| "learning_rate": 4.678764478764479e-05, | |
| "loss": 1.0254, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.33457727448204866, | |
| "grad_norm": 4.312713623046875, | |
| "learning_rate": 4.665894465894466e-05, | |
| "loss": 1.0809, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.3474456311928967, | |
| "grad_norm": 4.260924339294434, | |
| "learning_rate": 4.653024453024453e-05, | |
| "loss": 1.1019, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.3603139879037447, | |
| "grad_norm": 3.334003448486328, | |
| "learning_rate": 4.6401544401544405e-05, | |
| "loss": 1.0271, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.3731823446145927, | |
| "grad_norm": 3.480570077896118, | |
| "learning_rate": 4.6272844272844277e-05, | |
| "loss": 1.0714, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.3860507013254407, | |
| "grad_norm": 3.051344871520996, | |
| "learning_rate": 4.614414414414415e-05, | |
| "loss": 1.0437, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.39891905803628874, | |
| "grad_norm": 3.3421928882598877, | |
| "learning_rate": 4.601544401544402e-05, | |
| "loss": 1.0486, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.4117874147471368, | |
| "grad_norm": 3.3062002658843994, | |
| "learning_rate": 4.588674388674389e-05, | |
| "loss": 1.067, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.42465577145798483, | |
| "grad_norm": 3.173940420150757, | |
| "learning_rate": 4.575804375804376e-05, | |
| "loss": 1.0349, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.43752412816883285, | |
| "grad_norm": 3.4759039878845215, | |
| "learning_rate": 4.562934362934363e-05, | |
| "loss": 1.0471, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.45039248487968087, | |
| "grad_norm": 3.4710917472839355, | |
| "learning_rate": 4.55006435006435e-05, | |
| "loss": 1.0891, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.4632608415905289, | |
| "grad_norm": 3.515437602996826, | |
| "learning_rate": 4.537194337194338e-05, | |
| "loss": 1.0072, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.4761291983013769, | |
| "grad_norm": 3.6845717430114746, | |
| "learning_rate": 4.524324324324324e-05, | |
| "loss": 1.0141, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.4889975550122249, | |
| "grad_norm": 4.084773540496826, | |
| "learning_rate": 4.5114543114543114e-05, | |
| "loss": 1.003, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.5018659117230729, | |
| "grad_norm": 3.49133038520813, | |
| "learning_rate": 4.4985842985842986e-05, | |
| "loss": 1.056, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.514734268433921, | |
| "grad_norm": 3.321465015411377, | |
| "learning_rate": 4.485714285714286e-05, | |
| "loss": 1.0146, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.514734268433921, | |
| "eval_loss": 0.9642198085784912, | |
| "eval_runtime": 189.3709, | |
| "eval_samples_per_second": 102.582, | |
| "eval_steps_per_second": 10.26, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.527602625144769, | |
| "grad_norm": 4.292445182800293, | |
| "learning_rate": 4.472844272844273e-05, | |
| "loss": 1.079, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.540470981855617, | |
| "grad_norm": 3.905832529067993, | |
| "learning_rate": 4.45997425997426e-05, | |
| "loss": 0.9615, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.5533393385664651, | |
| "grad_norm": 3.985164165496826, | |
| "learning_rate": 4.447104247104247e-05, | |
| "loss": 1.0643, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.5662076952773131, | |
| "grad_norm": 3.4493820667266846, | |
| "learning_rate": 4.4342342342342344e-05, | |
| "loss": 1.0075, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.5790760519881611, | |
| "grad_norm": 3.188607931137085, | |
| "learning_rate": 4.4213642213642216e-05, | |
| "loss": 0.9822, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.5919444086990091, | |
| "grad_norm": 4.343353748321533, | |
| "learning_rate": 4.408494208494209e-05, | |
| "loss": 1.0451, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.6048127654098572, | |
| "grad_norm": 5.567046642303467, | |
| "learning_rate": 4.395624195624196e-05, | |
| "loss": 0.9741, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.6176811221207051, | |
| "grad_norm": 3.7193870544433594, | |
| "learning_rate": 4.382754182754183e-05, | |
| "loss": 1.0067, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.6305494788315532, | |
| "grad_norm": 3.9916462898254395, | |
| "learning_rate": 4.36988416988417e-05, | |
| "loss": 1.0139, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.6434178355424013, | |
| "grad_norm": 2.9249072074890137, | |
| "learning_rate": 4.3570141570141574e-05, | |
| "loss": 0.9773, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.6562861922532492, | |
| "grad_norm": 3.220916748046875, | |
| "learning_rate": 4.3441441441441445e-05, | |
| "loss": 0.9802, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.6691545489640973, | |
| "grad_norm": 3.6743297576904297, | |
| "learning_rate": 4.331274131274132e-05, | |
| "loss": 0.9568, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.6820229056749453, | |
| "grad_norm": 3.846135377883911, | |
| "learning_rate": 4.318404118404119e-05, | |
| "loss": 1.0341, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.6948912623857934, | |
| "grad_norm": 3.933671236038208, | |
| "learning_rate": 4.305534105534106e-05, | |
| "loss": 1.0272, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.7077596190966413, | |
| "grad_norm": 4.287942409515381, | |
| "learning_rate": 4.2926640926640925e-05, | |
| "loss": 0.9458, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.7206279758074894, | |
| "grad_norm": 3.385679006576538, | |
| "learning_rate": 4.2797940797940796e-05, | |
| "loss": 1.013, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.7334963325183375, | |
| "grad_norm": 3.283371925354004, | |
| "learning_rate": 4.2669240669240675e-05, | |
| "loss": 0.9173, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.7463646892291854, | |
| "grad_norm": 3.90366268157959, | |
| "learning_rate": 4.254054054054054e-05, | |
| "loss": 0.9605, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.7592330459400335, | |
| "grad_norm": 3.5078847408294678, | |
| "learning_rate": 4.241184041184041e-05, | |
| "loss": 1.0341, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.7721014026508815, | |
| "grad_norm": 3.1368582248687744, | |
| "learning_rate": 4.228314028314029e-05, | |
| "loss": 0.9344, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.7721014026508815, | |
| "eval_loss": 0.9136499762535095, | |
| "eval_runtime": 189.3685, | |
| "eval_samples_per_second": 102.583, | |
| "eval_steps_per_second": 10.26, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.7849697593617295, | |
| "grad_norm": 3.6094772815704346, | |
| "learning_rate": 4.2154440154440154e-05, | |
| "loss": 0.9633, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.7978381160725775, | |
| "grad_norm": 3.157749652862549, | |
| "learning_rate": 4.2025740025740026e-05, | |
| "loss": 0.9851, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.8107064727834256, | |
| "grad_norm": 2.6654834747314453, | |
| "learning_rate": 4.18970398970399e-05, | |
| "loss": 0.9317, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.8235748294942736, | |
| "grad_norm": 3.4694721698760986, | |
| "learning_rate": 4.176833976833977e-05, | |
| "loss": 0.9166, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.8364431862051216, | |
| "grad_norm": 4.163283824920654, | |
| "learning_rate": 4.163963963963964e-05, | |
| "loss": 1.0012, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.8493115429159697, | |
| "grad_norm": 2.8416895866394043, | |
| "learning_rate": 4.151093951093951e-05, | |
| "loss": 0.9063, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.8621798996268176, | |
| "grad_norm": 2.7988712787628174, | |
| "learning_rate": 4.1382239382239384e-05, | |
| "loss": 0.9155, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.8750482563376657, | |
| "grad_norm": 2.930269241333008, | |
| "learning_rate": 4.1253539253539256e-05, | |
| "loss": 0.9897, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.8879166130485137, | |
| "grad_norm": 3.0751140117645264, | |
| "learning_rate": 4.112483912483913e-05, | |
| "loss": 0.9355, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.9007849697593617, | |
| "grad_norm": 4.277087688446045, | |
| "learning_rate": 4.0996138996139e-05, | |
| "loss": 0.945, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.9136533264702098, | |
| "grad_norm": 4.510545253753662, | |
| "learning_rate": 4.086743886743887e-05, | |
| "loss": 0.97, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.9265216831810578, | |
| "grad_norm": 3.1346421241760254, | |
| "learning_rate": 4.073873873873874e-05, | |
| "loss": 0.9602, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.9393900398919058, | |
| "grad_norm": 2.9139790534973145, | |
| "learning_rate": 4.0610038610038614e-05, | |
| "loss": 0.939, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.9522583966027538, | |
| "grad_norm": 3.7701892852783203, | |
| "learning_rate": 4.0481338481338485e-05, | |
| "loss": 0.9806, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.9651267533136019, | |
| "grad_norm": 3.3391294479370117, | |
| "learning_rate": 4.035263835263836e-05, | |
| "loss": 0.9835, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.9779951100244498, | |
| "grad_norm": 4.155078887939453, | |
| "learning_rate": 4.022393822393822e-05, | |
| "loss": 0.9921, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.9908634667352979, | |
| "grad_norm": 3.9210457801818848, | |
| "learning_rate": 4.00952380952381e-05, | |
| "loss": 0.9831, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.0037318234461459, | |
| "grad_norm": 2.388692855834961, | |
| "learning_rate": 3.996653796653797e-05, | |
| "loss": 0.8888, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.016600180156994, | |
| "grad_norm": 3.42979097366333, | |
| "learning_rate": 3.983783783783784e-05, | |
| "loss": 0.76, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.029468536867842, | |
| "grad_norm": 3.304460048675537, | |
| "learning_rate": 3.970913770913771e-05, | |
| "loss": 0.752, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.029468536867842, | |
| "eval_loss": 0.8883536458015442, | |
| "eval_runtime": 189.2819, | |
| "eval_samples_per_second": 102.63, | |
| "eval_steps_per_second": 10.265, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.04233689357869, | |
| "grad_norm": 3.8564417362213135, | |
| "learning_rate": 3.9580437580437587e-05, | |
| "loss": 0.7679, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.055205250289538, | |
| "grad_norm": 3.680621385574341, | |
| "learning_rate": 3.945173745173745e-05, | |
| "loss": 0.7563, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.068073607000386, | |
| "grad_norm": 3.4106550216674805, | |
| "learning_rate": 3.932303732303732e-05, | |
| "loss": 0.7149, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.080941963711234, | |
| "grad_norm": 3.5690340995788574, | |
| "learning_rate": 3.9194337194337195e-05, | |
| "loss": 0.7185, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.0938103204220821, | |
| "grad_norm": 3.215930700302124, | |
| "learning_rate": 3.9065637065637066e-05, | |
| "loss": 0.762, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.1066786771329302, | |
| "grad_norm": 3.48146390914917, | |
| "learning_rate": 3.893693693693694e-05, | |
| "loss": 0.744, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.119547033843778, | |
| "grad_norm": 3.667818069458008, | |
| "learning_rate": 3.880823680823681e-05, | |
| "loss": 0.7564, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.1324153905546261, | |
| "grad_norm": 4.029824256896973, | |
| "learning_rate": 3.867953667953668e-05, | |
| "loss": 0.7631, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.1452837472654742, | |
| "grad_norm": 3.8039183616638184, | |
| "learning_rate": 3.855083655083655e-05, | |
| "loss": 0.7635, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.1581521039763223, | |
| "grad_norm": 2.8013832569122314, | |
| "learning_rate": 3.8422136422136424e-05, | |
| "loss": 0.6891, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.1710204606871701, | |
| "grad_norm": 3.721989631652832, | |
| "learning_rate": 3.8293436293436296e-05, | |
| "loss": 0.763, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.1838888173980182, | |
| "grad_norm": 2.802401304244995, | |
| "learning_rate": 3.816473616473617e-05, | |
| "loss": 0.7624, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.1967571741088663, | |
| "grad_norm": 3.3529865741729736, | |
| "learning_rate": 3.803603603603604e-05, | |
| "loss": 0.7673, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.2096255308197144, | |
| "grad_norm": 3.866051197052002, | |
| "learning_rate": 3.790733590733591e-05, | |
| "loss": 0.7713, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.2224938875305624, | |
| "grad_norm": 3.5920650959014893, | |
| "learning_rate": 3.777863577863578e-05, | |
| "loss": 0.7484, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.2353622442414103, | |
| "grad_norm": 4.17499303817749, | |
| "learning_rate": 3.7649935649935654e-05, | |
| "loss": 0.7279, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.2482306009522584, | |
| "grad_norm": 4.160298824310303, | |
| "learning_rate": 3.752123552123552e-05, | |
| "loss": 0.7442, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.2610989576631064, | |
| "grad_norm": 5.012088775634766, | |
| "learning_rate": 3.73925353925354e-05, | |
| "loss": 0.7322, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.2739673143739545, | |
| "grad_norm": 2.925732135772705, | |
| "learning_rate": 3.726383526383527e-05, | |
| "loss": 0.7284, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.2868356710848023, | |
| "grad_norm": 3.843261241912842, | |
| "learning_rate": 3.7135135135135134e-05, | |
| "loss": 0.7816, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.2868356710848023, | |
| "eval_loss": 0.8717387914657593, | |
| "eval_runtime": 189.4604, | |
| "eval_samples_per_second": 102.533, | |
| "eval_steps_per_second": 10.255, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.2997040277956504, | |
| "grad_norm": 3.363952398300171, | |
| "learning_rate": 3.7006435006435005e-05, | |
| "loss": 0.7719, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.3125723845064985, | |
| "grad_norm": 3.3399648666381836, | |
| "learning_rate": 3.6877734877734884e-05, | |
| "loss": 0.7282, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.3254407412173466, | |
| "grad_norm": 3.1839489936828613, | |
| "learning_rate": 3.674903474903475e-05, | |
| "loss": 0.7269, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.3383090979281946, | |
| "grad_norm": 4.2268266677856445, | |
| "learning_rate": 3.662033462033462e-05, | |
| "loss": 0.7438, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.3511774546390427, | |
| "grad_norm": 3.201995611190796, | |
| "learning_rate": 3.649163449163449e-05, | |
| "loss": 0.7396, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.3640458113498906, | |
| "grad_norm": 2.455885887145996, | |
| "learning_rate": 3.636293436293436e-05, | |
| "loss": 0.7292, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.3769141680607386, | |
| "grad_norm": 3.5545082092285156, | |
| "learning_rate": 3.6234234234234235e-05, | |
| "loss": 0.7118, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.3897825247715867, | |
| "grad_norm": 3.1645827293395996, | |
| "learning_rate": 3.6105534105534106e-05, | |
| "loss": 0.7368, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.4026508814824348, | |
| "grad_norm": 4.050566673278809, | |
| "learning_rate": 3.597683397683398e-05, | |
| "loss": 0.7368, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.4155192381932826, | |
| "grad_norm": 3.393502950668335, | |
| "learning_rate": 3.584813384813385e-05, | |
| "loss": 0.7496, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.4283875949041307, | |
| "grad_norm": 2.807429790496826, | |
| "learning_rate": 3.571943371943372e-05, | |
| "loss": 0.7224, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.4412559516149788, | |
| "grad_norm": 3.8620431423187256, | |
| "learning_rate": 3.559073359073359e-05, | |
| "loss": 0.7135, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.4541243083258268, | |
| "grad_norm": 3.6501224040985107, | |
| "learning_rate": 3.5462033462033465e-05, | |
| "loss": 0.7158, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.466992665036675, | |
| "grad_norm": 3.178494453430176, | |
| "learning_rate": 3.5333333333333336e-05, | |
| "loss": 0.7122, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.4798610217475228, | |
| "grad_norm": 3.163177251815796, | |
| "learning_rate": 3.520463320463321e-05, | |
| "loss": 0.7268, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.4927293784583708, | |
| "grad_norm": 3.3929202556610107, | |
| "learning_rate": 3.507593307593308e-05, | |
| "loss": 0.7099, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.505597735169219, | |
| "grad_norm": 4.053615570068359, | |
| "learning_rate": 3.494723294723295e-05, | |
| "loss": 0.7291, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.5184660918800668, | |
| "grad_norm": 3.453657865524292, | |
| "learning_rate": 3.4818532818532816e-05, | |
| "loss": 0.7373, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.5313344485909148, | |
| "grad_norm": 3.537903070449829, | |
| "learning_rate": 3.4689832689832694e-05, | |
| "loss": 0.7311, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.544202805301763, | |
| "grad_norm": 3.1102797985076904, | |
| "learning_rate": 3.4561132561132566e-05, | |
| "loss": 0.7702, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.544202805301763, | |
| "eval_loss": 0.856028139591217, | |
| "eval_runtime": 189.5266, | |
| "eval_samples_per_second": 102.497, | |
| "eval_steps_per_second": 10.252, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.557071162012611, | |
| "grad_norm": 3.3603110313415527, | |
| "learning_rate": 3.443243243243243e-05, | |
| "loss": 0.748, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.569939518723459, | |
| "grad_norm": 3.154524326324463, | |
| "learning_rate": 3.43037323037323e-05, | |
| "loss": 0.7241, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.5828078754343071, | |
| "grad_norm": 3.641023874282837, | |
| "learning_rate": 3.417503217503218e-05, | |
| "loss": 0.7564, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.5956762321451552, | |
| "grad_norm": 2.77715802192688, | |
| "learning_rate": 3.4046332046332045e-05, | |
| "loss": 0.709, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.608544588856003, | |
| "grad_norm": 3.438999652862549, | |
| "learning_rate": 3.391763191763192e-05, | |
| "loss": 0.7545, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.6214129455668511, | |
| "grad_norm": 4.376018524169922, | |
| "learning_rate": 3.378893178893179e-05, | |
| "loss": 0.7435, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.6342813022776992, | |
| "grad_norm": 3.45723819732666, | |
| "learning_rate": 3.366023166023166e-05, | |
| "loss": 0.7191, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.647149658988547, | |
| "grad_norm": 3.0947272777557373, | |
| "learning_rate": 3.353153153153153e-05, | |
| "loss": 0.7376, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.660018015699395, | |
| "grad_norm": 4.219300746917725, | |
| "learning_rate": 3.3402831402831403e-05, | |
| "loss": 0.7567, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.6728863724102432, | |
| "grad_norm": 3.1860275268554688, | |
| "learning_rate": 3.3274131274131275e-05, | |
| "loss": 0.7771, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.6857547291210913, | |
| "grad_norm": 2.766604423522949, | |
| "learning_rate": 3.314543114543115e-05, | |
| "loss": 0.7504, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.6986230858319393, | |
| "grad_norm": 3.745673894882202, | |
| "learning_rate": 3.301673101673102e-05, | |
| "loss": 0.7389, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.7114914425427874, | |
| "grad_norm": 3.8440420627593994, | |
| "learning_rate": 3.288803088803089e-05, | |
| "loss": 0.7298, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 1.7243597992536355, | |
| "grad_norm": 3.5810177326202393, | |
| "learning_rate": 3.276190476190477e-05, | |
| "loss": 0.7602, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.7372281559644833, | |
| "grad_norm": 3.4311060905456543, | |
| "learning_rate": 3.263320463320463e-05, | |
| "loss": 0.74, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 1.7500965126753314, | |
| "grad_norm": 2.345510721206665, | |
| "learning_rate": 3.250450450450451e-05, | |
| "loss": 0.733, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.7629648693861792, | |
| "grad_norm": 3.014230251312256, | |
| "learning_rate": 3.2375804375804375e-05, | |
| "loss": 0.7469, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 1.7758332260970273, | |
| "grad_norm": 2.7299680709838867, | |
| "learning_rate": 3.224710424710425e-05, | |
| "loss": 0.7183, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.7887015828078754, | |
| "grad_norm": 3.163944721221924, | |
| "learning_rate": 3.2118404118404125e-05, | |
| "loss": 0.763, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 1.8015699395187235, | |
| "grad_norm": 3.4352288246154785, | |
| "learning_rate": 3.198970398970399e-05, | |
| "loss": 0.7653, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.8015699395187235, | |
| "eval_loss": 0.8297092914581299, | |
| "eval_runtime": 189.396, | |
| "eval_samples_per_second": 102.568, | |
| "eval_steps_per_second": 10.259, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.8144382962295715, | |
| "grad_norm": 4.164456844329834, | |
| "learning_rate": 3.186100386100386e-05, | |
| "loss": 0.7168, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 1.8273066529404196, | |
| "grad_norm": 3.9557299613952637, | |
| "learning_rate": 3.173230373230373e-05, | |
| "loss": 0.7689, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 1.8401750096512677, | |
| "grad_norm": 3.7688467502593994, | |
| "learning_rate": 3.1603603603603605e-05, | |
| "loss": 0.7101, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 1.8530433663621155, | |
| "grad_norm": 3.025460720062256, | |
| "learning_rate": 3.1474903474903476e-05, | |
| "loss": 0.7311, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.8659117230729636, | |
| "grad_norm": 3.369191884994507, | |
| "learning_rate": 3.134620334620335e-05, | |
| "loss": 0.7063, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 1.8787800797838115, | |
| "grad_norm": 3.8419950008392334, | |
| "learning_rate": 3.121750321750322e-05, | |
| "loss": 0.7228, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 1.8916484364946595, | |
| "grad_norm": 4.265018939971924, | |
| "learning_rate": 3.108880308880309e-05, | |
| "loss": 0.7699, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 1.9045167932055076, | |
| "grad_norm": 3.287287712097168, | |
| "learning_rate": 3.096010296010296e-05, | |
| "loss": 0.7496, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 1.9173851499163557, | |
| "grad_norm": 4.200809001922607, | |
| "learning_rate": 3.0831402831402834e-05, | |
| "loss": 0.7039, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 1.9302535066272037, | |
| "grad_norm": 2.8199522495269775, | |
| "learning_rate": 3.07027027027027e-05, | |
| "loss": 0.7166, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.9431218633380518, | |
| "grad_norm": 3.1593542098999023, | |
| "learning_rate": 3.057400257400258e-05, | |
| "loss": 0.7288, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 1.9559902200488999, | |
| "grad_norm": 3.7583165168762207, | |
| "learning_rate": 3.044530244530245e-05, | |
| "loss": 0.7184, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 1.9688585767597477, | |
| "grad_norm": 2.7919719219207764, | |
| "learning_rate": 3.0316602316602317e-05, | |
| "loss": 0.7414, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 1.9817269334705958, | |
| "grad_norm": 3.475163459777832, | |
| "learning_rate": 3.018790218790219e-05, | |
| "loss": 0.6855, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.9945952901814439, | |
| "grad_norm": 3.133477210998535, | |
| "learning_rate": 3.0059202059202064e-05, | |
| "loss": 0.715, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 2.0074636468922917, | |
| "grad_norm": 2.9434854984283447, | |
| "learning_rate": 2.9930501930501932e-05, | |
| "loss": 0.6191, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.02033200360314, | |
| "grad_norm": 3.8272597789764404, | |
| "learning_rate": 2.9801801801801804e-05, | |
| "loss": 0.5678, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 2.033200360313988, | |
| "grad_norm": 3.036975860595703, | |
| "learning_rate": 2.9673101673101672e-05, | |
| "loss": 0.5251, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.046068717024836, | |
| "grad_norm": 3.401226758956909, | |
| "learning_rate": 2.9544401544401547e-05, | |
| "loss": 0.554, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 2.058937073735684, | |
| "grad_norm": 3.0268912315368652, | |
| "learning_rate": 2.941570141570142e-05, | |
| "loss": 0.5595, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.058937073735684, | |
| "eval_loss": 0.8340552449226379, | |
| "eval_runtime": 189.3362, | |
| "eval_samples_per_second": 102.601, | |
| "eval_steps_per_second": 10.262, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.071805430446532, | |
| "grad_norm": 3.2826826572418213, | |
| "learning_rate": 2.9287001287001287e-05, | |
| "loss": 0.537, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 2.08467378715738, | |
| "grad_norm": 3.539940357208252, | |
| "learning_rate": 2.915830115830116e-05, | |
| "loss": 0.5392, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.0975421438682282, | |
| "grad_norm": 3.780297040939331, | |
| "learning_rate": 2.9032175032175036e-05, | |
| "loss": 0.5642, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 2.110410500579076, | |
| "grad_norm": 3.1283602714538574, | |
| "learning_rate": 2.8903474903474904e-05, | |
| "loss": 0.5538, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.123278857289924, | |
| "grad_norm": 2.545053243637085, | |
| "learning_rate": 2.8774774774774775e-05, | |
| "loss": 0.4843, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 2.136147214000772, | |
| "grad_norm": 3.5739200115203857, | |
| "learning_rate": 2.864607464607465e-05, | |
| "loss": 0.5477, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.14901557071162, | |
| "grad_norm": 2.904505491256714, | |
| "learning_rate": 2.851737451737452e-05, | |
| "loss": 0.5377, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 2.161883927422468, | |
| "grad_norm": 4.482571601867676, | |
| "learning_rate": 2.838867438867439e-05, | |
| "loss": 0.5404, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.1747522841333162, | |
| "grad_norm": 3.0093982219696045, | |
| "learning_rate": 2.825997425997426e-05, | |
| "loss": 0.5446, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 2.1876206408441643, | |
| "grad_norm": 5.678364276885986, | |
| "learning_rate": 2.8131274131274134e-05, | |
| "loss": 0.5632, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.2004889975550124, | |
| "grad_norm": 3.4848239421844482, | |
| "learning_rate": 2.8002574002574005e-05, | |
| "loss": 0.5451, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 2.2133573542658604, | |
| "grad_norm": 3.782041549682617, | |
| "learning_rate": 2.7873873873873873e-05, | |
| "loss": 0.5654, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.226225710976708, | |
| "grad_norm": 3.711773157119751, | |
| "learning_rate": 2.774517374517375e-05, | |
| "loss": 0.5549, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 2.239094067687556, | |
| "grad_norm": 3.105684518814087, | |
| "learning_rate": 2.761647361647362e-05, | |
| "loss": 0.5741, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 2.251962424398404, | |
| "grad_norm": 3.3493452072143555, | |
| "learning_rate": 2.7487773487773488e-05, | |
| "loss": 0.5474, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 2.2648307811092523, | |
| "grad_norm": 2.916020631790161, | |
| "learning_rate": 2.7359073359073363e-05, | |
| "loss": 0.5704, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 2.2776991378201004, | |
| "grad_norm": 2.867391347885132, | |
| "learning_rate": 2.7230373230373228e-05, | |
| "loss": 0.5116, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 2.2905674945309484, | |
| "grad_norm": 3.349109649658203, | |
| "learning_rate": 2.7101673101673103e-05, | |
| "loss": 0.5523, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 2.3034358512417965, | |
| "grad_norm": 3.198293685913086, | |
| "learning_rate": 2.6972972972972978e-05, | |
| "loss": 0.5437, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 2.3163042079526446, | |
| "grad_norm": 3.9013428688049316, | |
| "learning_rate": 2.6844272844272843e-05, | |
| "loss": 0.5501, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.3163042079526446, | |
| "eval_loss": 0.8361700773239136, | |
| "eval_runtime": 189.4593, | |
| "eval_samples_per_second": 102.534, | |
| "eval_steps_per_second": 10.256, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.3291725646634927, | |
| "grad_norm": 2.5401031970977783, | |
| "learning_rate": 2.6715572715572718e-05, | |
| "loss": 0.5441, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 2.3420409213743403, | |
| "grad_norm": 3.330003261566162, | |
| "learning_rate": 2.6586872586872586e-05, | |
| "loss": 0.5292, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 2.3549092780851884, | |
| "grad_norm": 3.724376916885376, | |
| "learning_rate": 2.6458172458172458e-05, | |
| "loss": 0.5555, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 2.3677776347960364, | |
| "grad_norm": 3.3693745136260986, | |
| "learning_rate": 2.6329472329472333e-05, | |
| "loss": 0.5638, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 2.3806459915068845, | |
| "grad_norm": 3.3780782222747803, | |
| "learning_rate": 2.62007722007722e-05, | |
| "loss": 0.5724, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 2.3935143482177326, | |
| "grad_norm": 3.949384927749634, | |
| "learning_rate": 2.6072072072072072e-05, | |
| "loss": 0.5548, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 2.4063827049285806, | |
| "grad_norm": 3.7752134799957275, | |
| "learning_rate": 2.5943371943371947e-05, | |
| "loss": 0.5401, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 2.4192510616394287, | |
| "grad_norm": 4.018200874328613, | |
| "learning_rate": 2.5814671814671816e-05, | |
| "loss": 0.5154, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 2.432119418350277, | |
| "grad_norm": 2.818852186203003, | |
| "learning_rate": 2.5685971685971687e-05, | |
| "loss": 0.5317, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 2.444987775061125, | |
| "grad_norm": 3.676013946533203, | |
| "learning_rate": 2.5557271557271556e-05, | |
| "loss": 0.5683, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.4578561317719725, | |
| "grad_norm": 3.144169807434082, | |
| "learning_rate": 2.542857142857143e-05, | |
| "loss": 0.5491, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 2.4707244884828206, | |
| "grad_norm": 3.605506658554077, | |
| "learning_rate": 2.5299871299871302e-05, | |
| "loss": 0.5733, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 2.4835928451936686, | |
| "grad_norm": 3.2025864124298096, | |
| "learning_rate": 2.517117117117117e-05, | |
| "loss": 0.5581, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 2.4964612019045167, | |
| "grad_norm": 3.0987846851348877, | |
| "learning_rate": 2.5042471042471045e-05, | |
| "loss": 0.5246, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 2.5093295586153648, | |
| "grad_norm": 2.3385045528411865, | |
| "learning_rate": 2.4913770913770914e-05, | |
| "loss": 0.5643, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 2.522197915326213, | |
| "grad_norm": 3.4975156784057617, | |
| "learning_rate": 2.4785070785070785e-05, | |
| "loss": 0.5793, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 2.535066272037061, | |
| "grad_norm": 3.657727003097534, | |
| "learning_rate": 2.465637065637066e-05, | |
| "loss": 0.561, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 2.547934628747909, | |
| "grad_norm": 3.558779001235962, | |
| "learning_rate": 2.452767052767053e-05, | |
| "loss": 0.5442, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 2.560802985458757, | |
| "grad_norm": 3.5756425857543945, | |
| "learning_rate": 2.43989703989704e-05, | |
| "loss": 0.5467, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 2.5736713421696047, | |
| "grad_norm": 4.302220821380615, | |
| "learning_rate": 2.427027027027027e-05, | |
| "loss": 0.5322, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.5736713421696047, | |
| "eval_loss": 0.8265257477760315, | |
| "eval_runtime": 189.4664, | |
| "eval_samples_per_second": 102.53, | |
| "eval_steps_per_second": 10.255, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.586539698880453, | |
| "grad_norm": 3.3012359142303467, | |
| "learning_rate": 2.4141570141570143e-05, | |
| "loss": 0.542, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 2.599408055591301, | |
| "grad_norm": 3.0096490383148193, | |
| "learning_rate": 2.4012870012870015e-05, | |
| "loss": 0.5641, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 2.612276412302149, | |
| "grad_norm": 3.803586006164551, | |
| "learning_rate": 2.3884169884169886e-05, | |
| "loss": 0.5375, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 2.625144769012997, | |
| "grad_norm": 3.69391131401062, | |
| "learning_rate": 2.3755469755469755e-05, | |
| "loss": 0.5579, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 2.638013125723845, | |
| "grad_norm": 3.184783458709717, | |
| "learning_rate": 2.362676962676963e-05, | |
| "loss": 0.5185, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 2.650881482434693, | |
| "grad_norm": 3.191938638687134, | |
| "learning_rate": 2.34980694980695e-05, | |
| "loss": 0.5644, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 2.663749839145541, | |
| "grad_norm": 2.8819658756256104, | |
| "learning_rate": 2.336936936936937e-05, | |
| "loss": 0.5318, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 2.6766181958563893, | |
| "grad_norm": 2.8010735511779785, | |
| "learning_rate": 2.324066924066924e-05, | |
| "loss": 0.5286, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 2.689486552567237, | |
| "grad_norm": 3.6001551151275635, | |
| "learning_rate": 2.3111969111969113e-05, | |
| "loss": 0.5699, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 2.7023549092780854, | |
| "grad_norm": 3.6811742782592773, | |
| "learning_rate": 2.2983268983268984e-05, | |
| "loss": 0.5693, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 2.715223265988933, | |
| "grad_norm": 3.8935017585754395, | |
| "learning_rate": 2.2854568854568856e-05, | |
| "loss": 0.5471, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 2.728091622699781, | |
| "grad_norm": 4.385620594024658, | |
| "learning_rate": 2.2725868725868727e-05, | |
| "loss": 0.5496, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 2.740959979410629, | |
| "grad_norm": 3.178663730621338, | |
| "learning_rate": 2.2597168597168596e-05, | |
| "loss": 0.5669, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 2.7538283361214773, | |
| "grad_norm": 2.5699925422668457, | |
| "learning_rate": 2.246846846846847e-05, | |
| "loss": 0.5445, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 2.7666966928323253, | |
| "grad_norm": 3.409348964691162, | |
| "learning_rate": 2.2339768339768342e-05, | |
| "loss": 0.5195, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 2.7795650495431734, | |
| "grad_norm": 3.578507900238037, | |
| "learning_rate": 2.221106821106821e-05, | |
| "loss": 0.5664, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 2.7924334062540215, | |
| "grad_norm": 3.959843873977661, | |
| "learning_rate": 2.2082368082368082e-05, | |
| "loss": 0.5509, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 2.8053017629648695, | |
| "grad_norm": 4.0397868156433105, | |
| "learning_rate": 2.1953667953667957e-05, | |
| "loss": 0.5287, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 2.8181701196757176, | |
| "grad_norm": 4.056464195251465, | |
| "learning_rate": 2.1824967824967825e-05, | |
| "loss": 0.5493, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 2.8310384763865653, | |
| "grad_norm": 2.7274129390716553, | |
| "learning_rate": 2.1696267696267697e-05, | |
| "loss": 0.5536, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 2.8310384763865653, | |
| "eval_loss": 0.8111644983291626, | |
| "eval_runtime": 189.4072, | |
| "eval_samples_per_second": 102.562, | |
| "eval_steps_per_second": 10.258, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 2.8439068330974133, | |
| "grad_norm": 4.217979907989502, | |
| "learning_rate": 2.156756756756757e-05, | |
| "loss": 0.5206, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 2.8567751898082614, | |
| "grad_norm": 3.4003894329071045, | |
| "learning_rate": 2.143886743886744e-05, | |
| "loss": 0.5389, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 2.8696435465191095, | |
| "grad_norm": 3.1953563690185547, | |
| "learning_rate": 2.1310167310167312e-05, | |
| "loss": 0.5533, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 2.8825119032299575, | |
| "grad_norm": 4.147376537322998, | |
| "learning_rate": 2.1181467181467183e-05, | |
| "loss": 0.5537, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 2.8953802599408056, | |
| "grad_norm": 3.726901054382324, | |
| "learning_rate": 2.105276705276705e-05, | |
| "loss": 0.527, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 2.9082486166516537, | |
| "grad_norm": 3.210848331451416, | |
| "learning_rate": 2.0924066924066927e-05, | |
| "loss": 0.5561, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 2.9211169733625018, | |
| "grad_norm": 4.512522220611572, | |
| "learning_rate": 2.0795366795366798e-05, | |
| "loss": 0.5642, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 2.93398533007335, | |
| "grad_norm": 3.112410545349121, | |
| "learning_rate": 2.0666666666666666e-05, | |
| "loss": 0.5678, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 2.9468536867841975, | |
| "grad_norm": 3.0859549045562744, | |
| "learning_rate": 2.0537966537966538e-05, | |
| "loss": 0.5203, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 2.9597220434950455, | |
| "grad_norm": 2.518927812576294, | |
| "learning_rate": 2.0409266409266413e-05, | |
| "loss": 0.5326, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 2.9725904002058936, | |
| "grad_norm": 3.686985492706299, | |
| "learning_rate": 2.028056628056628e-05, | |
| "loss": 0.5419, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 2.9854587569167417, | |
| "grad_norm": 3.1635522842407227, | |
| "learning_rate": 2.0151866151866153e-05, | |
| "loss": 0.5593, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 2.9983271136275897, | |
| "grad_norm": 3.734297037124634, | |
| "learning_rate": 2.0023166023166024e-05, | |
| "loss": 0.5233, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 3.011195470338438, | |
| "grad_norm": 4.363897800445557, | |
| "learning_rate": 1.9894465894465893e-05, | |
| "loss": 0.3853, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 3.024063827049286, | |
| "grad_norm": 3.380840539932251, | |
| "learning_rate": 1.9765765765765768e-05, | |
| "loss": 0.4048, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 3.036932183760134, | |
| "grad_norm": 2.8301613330841064, | |
| "learning_rate": 1.963706563706564e-05, | |
| "loss": 0.4104, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 3.049800540470982, | |
| "grad_norm": 3.5483977794647217, | |
| "learning_rate": 1.9508365508365508e-05, | |
| "loss": 0.4002, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 3.0626688971818297, | |
| "grad_norm": 3.0657596588134766, | |
| "learning_rate": 1.937966537966538e-05, | |
| "loss": 0.3955, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 3.0755372538926777, | |
| "grad_norm": 3.3583223819732666, | |
| "learning_rate": 1.9250965250965254e-05, | |
| "loss": 0.4258, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 3.088405610603526, | |
| "grad_norm": 2.8227546215057373, | |
| "learning_rate": 1.9122265122265122e-05, | |
| "loss": 0.4279, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.088405610603526, | |
| "eval_loss": 0.8435577154159546, | |
| "eval_runtime": 189.2953, | |
| "eval_samples_per_second": 102.623, | |
| "eval_steps_per_second": 10.264, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.101273967314374, | |
| "grad_norm": 3.359844923019409, | |
| "learning_rate": 1.8993564993564994e-05, | |
| "loss": 0.4204, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 3.114142324025222, | |
| "grad_norm": 3.8054656982421875, | |
| "learning_rate": 1.8864864864864866e-05, | |
| "loss": 0.419, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 3.12701068073607, | |
| "grad_norm": 3.396296977996826, | |
| "learning_rate": 1.8736164736164737e-05, | |
| "loss": 0.4167, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 3.139879037446918, | |
| "grad_norm": 3.7082102298736572, | |
| "learning_rate": 1.860746460746461e-05, | |
| "loss": 0.4047, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 3.152747394157766, | |
| "grad_norm": 3.589115619659424, | |
| "learning_rate": 1.847876447876448e-05, | |
| "loss": 0.4245, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 3.1656157508686142, | |
| "grad_norm": 4.928313732147217, | |
| "learning_rate": 1.835006435006435e-05, | |
| "loss": 0.4054, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 3.178484107579462, | |
| "grad_norm": 3.3157694339752197, | |
| "learning_rate": 1.8221364221364224e-05, | |
| "loss": 0.3828, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 3.19135246429031, | |
| "grad_norm": 2.832195997238159, | |
| "learning_rate": 1.8092664092664095e-05, | |
| "loss": 0.3985, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 3.204220821001158, | |
| "grad_norm": 3.6316025257110596, | |
| "learning_rate": 1.7963963963963963e-05, | |
| "loss": 0.4009, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 3.217089177712006, | |
| "grad_norm": 2.810399293899536, | |
| "learning_rate": 1.7835263835263835e-05, | |
| "loss": 0.3942, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 3.229957534422854, | |
| "grad_norm": 4.077607154846191, | |
| "learning_rate": 1.770656370656371e-05, | |
| "loss": 0.4139, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 3.2428258911337022, | |
| "grad_norm": 4.511129856109619, | |
| "learning_rate": 1.7577863577863578e-05, | |
| "loss": 0.4151, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 3.2556942478445503, | |
| "grad_norm": 4.129230976104736, | |
| "learning_rate": 1.744916344916345e-05, | |
| "loss": 0.4174, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 3.2685626045553984, | |
| "grad_norm": 2.6795713901519775, | |
| "learning_rate": 1.732046332046332e-05, | |
| "loss": 0.4305, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 3.2814309612662464, | |
| "grad_norm": 3.7392494678497314, | |
| "learning_rate": 1.7191763191763193e-05, | |
| "loss": 0.4221, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 3.294299317977094, | |
| "grad_norm": 3.319235324859619, | |
| "learning_rate": 1.7063063063063065e-05, | |
| "loss": 0.4506, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 3.3071676746879426, | |
| "grad_norm": 3.564242362976074, | |
| "learning_rate": 1.6934362934362936e-05, | |
| "loss": 0.4287, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 3.32003603139879, | |
| "grad_norm": 4.851574420928955, | |
| "learning_rate": 1.6805662805662805e-05, | |
| "loss": 0.4002, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 3.3329043881096383, | |
| "grad_norm": 3.396198034286499, | |
| "learning_rate": 1.667696267696268e-05, | |
| "loss": 0.3979, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 3.3457727448204864, | |
| "grad_norm": 2.8444738388061523, | |
| "learning_rate": 1.654826254826255e-05, | |
| "loss": 0.4015, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.3457727448204864, | |
| "eval_loss": 0.8406257033348083, | |
| "eval_runtime": 189.5238, | |
| "eval_samples_per_second": 102.499, | |
| "eval_steps_per_second": 10.252, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.3586411015313344, | |
| "grad_norm": 3.6319420337677, | |
| "learning_rate": 1.641956241956242e-05, | |
| "loss": 0.4175, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 3.3715094582421825, | |
| "grad_norm": 3.819413900375366, | |
| "learning_rate": 1.629086229086229e-05, | |
| "loss": 0.4132, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 3.3843778149530306, | |
| "grad_norm": 3.5537872314453125, | |
| "learning_rate": 1.6162162162162163e-05, | |
| "loss": 0.4347, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 3.3972461716638787, | |
| "grad_norm": 3.193889617919922, | |
| "learning_rate": 1.6033462033462034e-05, | |
| "loss": 0.4082, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 3.4101145283747267, | |
| "grad_norm": 2.0629472732543945, | |
| "learning_rate": 1.5904761904761906e-05, | |
| "loss": 0.4157, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 3.422982885085575, | |
| "grad_norm": 3.1376850605010986, | |
| "learning_rate": 1.5776061776061777e-05, | |
| "loss": 0.4063, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 3.4358512417964224, | |
| "grad_norm": 3.6547293663024902, | |
| "learning_rate": 1.5647361647361646e-05, | |
| "loss": 0.4006, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 3.4487195985072705, | |
| "grad_norm": 3.7299160957336426, | |
| "learning_rate": 1.551866151866152e-05, | |
| "loss": 0.4103, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 3.4615879552181186, | |
| "grad_norm": 4.016064167022705, | |
| "learning_rate": 1.5389961389961392e-05, | |
| "loss": 0.4128, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 3.4744563119289666, | |
| "grad_norm": 3.9914636611938477, | |
| "learning_rate": 1.526126126126126e-05, | |
| "loss": 0.4417, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 3.4873246686398147, | |
| "grad_norm": 3.342193126678467, | |
| "learning_rate": 1.5132561132561132e-05, | |
| "loss": 0.4369, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 3.500193025350663, | |
| "grad_norm": 3.600562810897827, | |
| "learning_rate": 1.5003861003861005e-05, | |
| "loss": 0.4186, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 3.513061382061511, | |
| "grad_norm": 2.986332654953003, | |
| "learning_rate": 1.4875160875160877e-05, | |
| "loss": 0.4109, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 3.525929738772359, | |
| "grad_norm": 3.3687069416046143, | |
| "learning_rate": 1.4746460746460747e-05, | |
| "loss": 0.4193, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 3.538798095483207, | |
| "grad_norm": 3.519691228866577, | |
| "learning_rate": 1.4617760617760617e-05, | |
| "loss": 0.4046, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 3.5516664521940546, | |
| "grad_norm": 2.984351396560669, | |
| "learning_rate": 1.448906048906049e-05, | |
| "loss": 0.4013, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 3.5645348089049027, | |
| "grad_norm": 4.0301899909973145, | |
| "learning_rate": 1.4360360360360362e-05, | |
| "loss": 0.425, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 3.577403165615751, | |
| "grad_norm": 3.606745481491089, | |
| "learning_rate": 1.4231660231660232e-05, | |
| "loss": 0.395, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 3.590271522326599, | |
| "grad_norm": 3.490335702896118, | |
| "learning_rate": 1.4102960102960103e-05, | |
| "loss": 0.4104, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 3.603139879037447, | |
| "grad_norm": 4.385608673095703, | |
| "learning_rate": 1.3974259974259976e-05, | |
| "loss": 0.4235, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 3.603139879037447, | |
| "eval_loss": 0.8373395800590515, | |
| "eval_runtime": 189.5425, | |
| "eval_samples_per_second": 102.489, | |
| "eval_steps_per_second": 10.251, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 3.616008235748295, | |
| "grad_norm": 3.1057236194610596, | |
| "learning_rate": 1.3845559845559846e-05, | |
| "loss": 0.4055, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 3.628876592459143, | |
| "grad_norm": 3.3940112590789795, | |
| "learning_rate": 1.3716859716859718e-05, | |
| "loss": 0.3916, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 3.641744949169991, | |
| "grad_norm": 3.4843504428863525, | |
| "learning_rate": 1.3588159588159588e-05, | |
| "loss": 0.4483, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 3.654613305880839, | |
| "grad_norm": 4.122684001922607, | |
| "learning_rate": 1.3462033462033463e-05, | |
| "loss": 0.4241, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 3.667481662591687, | |
| "grad_norm": 3.975159168243408, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 0.417, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 3.680350019302535, | |
| "grad_norm": 4.270761966705322, | |
| "learning_rate": 1.3204633204633205e-05, | |
| "loss": 0.4137, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 3.693218376013383, | |
| "grad_norm": 2.448530673980713, | |
| "learning_rate": 1.307850707850708e-05, | |
| "loss": 0.4277, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 3.706086732724231, | |
| "grad_norm": 3.0755138397216797, | |
| "learning_rate": 1.294980694980695e-05, | |
| "loss": 0.4003, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 3.718955089435079, | |
| "grad_norm": 3.795004367828369, | |
| "learning_rate": 1.282110682110682e-05, | |
| "loss": 0.4128, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 3.731823446145927, | |
| "grad_norm": 3.655588150024414, | |
| "learning_rate": 1.2692406692406693e-05, | |
| "loss": 0.4243, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 3.7446918028567753, | |
| "grad_norm": 3.4732940196990967, | |
| "learning_rate": 1.2563706563706565e-05, | |
| "loss": 0.432, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 3.7575601595676233, | |
| "grad_norm": 3.737543821334839, | |
| "learning_rate": 1.2435006435006435e-05, | |
| "loss": 0.4214, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 3.7704285162784714, | |
| "grad_norm": 4.61754035949707, | |
| "learning_rate": 1.2306306306306308e-05, | |
| "loss": 0.4075, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 3.783296872989319, | |
| "grad_norm": 3.1483190059661865, | |
| "learning_rate": 1.2177606177606178e-05, | |
| "loss": 0.4136, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 3.7961652297001676, | |
| "grad_norm": 3.9902536869049072, | |
| "learning_rate": 1.204890604890605e-05, | |
| "loss": 0.399, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 3.809033586411015, | |
| "grad_norm": 3.6237611770629883, | |
| "learning_rate": 1.1920205920205921e-05, | |
| "loss": 0.4144, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 3.8219019431218633, | |
| "grad_norm": 4.140763282775879, | |
| "learning_rate": 1.1791505791505791e-05, | |
| "loss": 0.4009, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 3.8347702998327113, | |
| "grad_norm": 3.700617790222168, | |
| "learning_rate": 1.1662805662805663e-05, | |
| "loss": 0.3993, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 3.8476386565435594, | |
| "grad_norm": 3.43810772895813, | |
| "learning_rate": 1.1534105534105535e-05, | |
| "loss": 0.4153, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 3.8605070132544075, | |
| "grad_norm": 3.04858136177063, | |
| "learning_rate": 1.1405405405405406e-05, | |
| "loss": 0.4056, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 3.8605070132544075, | |
| "eval_loss": 0.8282376527786255, | |
| "eval_runtime": 189.5165, | |
| "eval_samples_per_second": 102.503, | |
| "eval_steps_per_second": 10.252, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 3.8733753699652556, | |
| "grad_norm": 3.8784329891204834, | |
| "learning_rate": 1.1276705276705276e-05, | |
| "loss": 0.3866, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 3.8862437266761036, | |
| "grad_norm": 3.622274398803711, | |
| "learning_rate": 1.114800514800515e-05, | |
| "loss": 0.4208, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 3.8991120833869513, | |
| "grad_norm": 3.2869110107421875, | |
| "learning_rate": 1.101930501930502e-05, | |
| "loss": 0.4302, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 3.9119804400977998, | |
| "grad_norm": 2.9988114833831787, | |
| "learning_rate": 1.0890604890604891e-05, | |
| "loss": 0.4153, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 3.9248487968086474, | |
| "grad_norm": 3.2975871562957764, | |
| "learning_rate": 1.0761904761904763e-05, | |
| "loss": 0.4178, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 3.9377171535194955, | |
| "grad_norm": 2.9260878562927246, | |
| "learning_rate": 1.0633204633204634e-05, | |
| "loss": 0.4048, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 3.9505855102303435, | |
| "grad_norm": 3.6288230419158936, | |
| "learning_rate": 1.0504504504504504e-05, | |
| "loss": 0.4007, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 3.9634538669411916, | |
| "grad_norm": 4.0753173828125, | |
| "learning_rate": 1.0375804375804377e-05, | |
| "loss": 0.397, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 3.9763222236520397, | |
| "grad_norm": 3.9990575313568115, | |
| "learning_rate": 1.0247104247104247e-05, | |
| "loss": 0.4043, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 3.9891905803628878, | |
| "grad_norm": 3.895235776901245, | |
| "learning_rate": 1.0118404118404119e-05, | |
| "loss": 0.4147, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 4.002058937073736, | |
| "grad_norm": 3.6188671588897705, | |
| "learning_rate": 9.98970398970399e-06, | |
| "loss": 0.4038, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 4.0149272937845835, | |
| "grad_norm": 2.461876392364502, | |
| "learning_rate": 9.861003861003862e-06, | |
| "loss": 0.3183, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 4.027795650495432, | |
| "grad_norm": 3.0858776569366455, | |
| "learning_rate": 9.732303732303732e-06, | |
| "loss": 0.3477, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 4.04066400720628, | |
| "grad_norm": 3.3975493907928467, | |
| "learning_rate": 9.603603603603605e-06, | |
| "loss": 0.3225, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 4.053532363917128, | |
| "grad_norm": 3.6678659915924072, | |
| "learning_rate": 9.474903474903475e-06, | |
| "loss": 0.3294, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 4.066400720627976, | |
| "grad_norm": 3.5393424034118652, | |
| "learning_rate": 9.346203346203347e-06, | |
| "loss": 0.3421, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 4.079269077338824, | |
| "grad_norm": 3.039139747619629, | |
| "learning_rate": 9.217503217503218e-06, | |
| "loss": 0.328, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 4.092137434049672, | |
| "grad_norm": 4.036282062530518, | |
| "learning_rate": 9.08880308880309e-06, | |
| "loss": 0.3287, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 4.1050057907605195, | |
| "grad_norm": 3.0405406951904297, | |
| "learning_rate": 8.96010296010296e-06, | |
| "loss": 0.3335, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 4.117874147471368, | |
| "grad_norm": 2.893115520477295, | |
| "learning_rate": 8.831402831402833e-06, | |
| "loss": 0.3092, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.117874147471368, | |
| "eval_loss": 0.8520528674125671, | |
| "eval_runtime": 189.4809, | |
| "eval_samples_per_second": 102.522, | |
| "eval_steps_per_second": 10.254, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.130742504182216, | |
| "grad_norm": 4.632415771484375, | |
| "learning_rate": 8.702702702702703e-06, | |
| "loss": 0.3281, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 4.143610860893064, | |
| "grad_norm": 3.4265694618225098, | |
| "learning_rate": 8.574002574002575e-06, | |
| "loss": 0.3147, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 4.156479217603912, | |
| "grad_norm": 2.336419105529785, | |
| "learning_rate": 8.445302445302446e-06, | |
| "loss": 0.3013, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 4.16934757431476, | |
| "grad_norm": 3.3826658725738525, | |
| "learning_rate": 8.316602316602316e-06, | |
| "loss": 0.3124, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 4.182215931025608, | |
| "grad_norm": 2.8319571018218994, | |
| "learning_rate": 8.187902187902188e-06, | |
| "loss": 0.3503, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 4.1950842877364565, | |
| "grad_norm": 5.399038314819336, | |
| "learning_rate": 8.05920205920206e-06, | |
| "loss": 0.3394, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 4.207952644447304, | |
| "grad_norm": 2.59490966796875, | |
| "learning_rate": 7.930501930501931e-06, | |
| "loss": 0.3317, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 4.220821001158152, | |
| "grad_norm": 3.371151924133301, | |
| "learning_rate": 7.801801801801801e-06, | |
| "loss": 0.3123, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 4.233689357869, | |
| "grad_norm": 3.447014808654785, | |
| "learning_rate": 7.673101673101674e-06, | |
| "loss": 0.3348, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 4.246557714579848, | |
| "grad_norm": 4.9008893966674805, | |
| "learning_rate": 7.544401544401544e-06, | |
| "loss": 0.34, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 4.259426071290696, | |
| "grad_norm": 3.1242849826812744, | |
| "learning_rate": 7.415701415701416e-06, | |
| "loss": 0.3064, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 4.272294428001544, | |
| "grad_norm": 4.190210342407227, | |
| "learning_rate": 7.287001287001287e-06, | |
| "loss": 0.3316, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 4.2851627847123925, | |
| "grad_norm": 4.833367824554443, | |
| "learning_rate": 7.158301158301159e-06, | |
| "loss": 0.3151, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 4.29803114142324, | |
| "grad_norm": 3.561443328857422, | |
| "learning_rate": 7.02960102960103e-06, | |
| "loss": 0.3284, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 4.310899498134089, | |
| "grad_norm": 3.202888011932373, | |
| "learning_rate": 6.903474903474904e-06, | |
| "loss": 0.3361, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 4.323767854844936, | |
| "grad_norm": 4.517894744873047, | |
| "learning_rate": 6.774774774774775e-06, | |
| "loss": 0.3371, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 4.336636211555784, | |
| "grad_norm": 3.958711862564087, | |
| "learning_rate": 6.646074646074646e-06, | |
| "loss": 0.3236, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 4.3495045682666325, | |
| "grad_norm": 2.8120596408843994, | |
| "learning_rate": 6.517374517374518e-06, | |
| "loss": 0.3128, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 4.36237292497748, | |
| "grad_norm": 2.320220947265625, | |
| "learning_rate": 6.388674388674388e-06, | |
| "loss": 0.3079, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 4.375241281688329, | |
| "grad_norm": 2.833205223083496, | |
| "learning_rate": 6.259974259974261e-06, | |
| "loss": 0.3127, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.375241281688329, | |
| "eval_loss": 0.8534464836120605, | |
| "eval_runtime": 189.5459, | |
| "eval_samples_per_second": 102.487, | |
| "eval_steps_per_second": 10.251, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.388109638399176, | |
| "grad_norm": 4.218430519104004, | |
| "learning_rate": 6.1312741312741316e-06, | |
| "loss": 0.3375, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 4.400977995110025, | |
| "grad_norm": 3.2581334114074707, | |
| "learning_rate": 6.002574002574003e-06, | |
| "loss": 0.3081, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 4.413846351820872, | |
| "grad_norm": 3.2041099071502686, | |
| "learning_rate": 5.873873873873874e-06, | |
| "loss": 0.3368, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 4.426714708531721, | |
| "grad_norm": 4.340978622436523, | |
| "learning_rate": 5.7451737451737455e-06, | |
| "loss": 0.3404, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 4.4395830652425685, | |
| "grad_norm": 4.946235179901123, | |
| "learning_rate": 5.616473616473616e-06, | |
| "loss": 0.3199, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 4.452451421953416, | |
| "grad_norm": 3.671445846557617, | |
| "learning_rate": 5.487773487773488e-06, | |
| "loss": 0.3137, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 4.465319778664265, | |
| "grad_norm": 4.509451389312744, | |
| "learning_rate": 5.3590733590733595e-06, | |
| "loss": 0.332, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 4.478188135375112, | |
| "grad_norm": 3.3261749744415283, | |
| "learning_rate": 5.23037323037323e-06, | |
| "loss": 0.3339, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 4.491056492085961, | |
| "grad_norm": 3.5157628059387207, | |
| "learning_rate": 5.101673101673102e-06, | |
| "loss": 0.3256, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 4.503924848796808, | |
| "grad_norm": 3.7138776779174805, | |
| "learning_rate": 4.9729729729729735e-06, | |
| "loss": 0.3278, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 4.516793205507657, | |
| "grad_norm": 3.2979509830474854, | |
| "learning_rate": 4.844272844272844e-06, | |
| "loss": 0.3249, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 4.529661562218505, | |
| "grad_norm": 3.117856740951538, | |
| "learning_rate": 4.715572715572716e-06, | |
| "loss": 0.3174, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 4.542529918929353, | |
| "grad_norm": 3.44389009475708, | |
| "learning_rate": 4.586872586872587e-06, | |
| "loss": 0.3072, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 4.555398275640201, | |
| "grad_norm": 4.093995094299316, | |
| "learning_rate": 4.458172458172458e-06, | |
| "loss": 0.3118, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 4.568266632351049, | |
| "grad_norm": 2.943690061569214, | |
| "learning_rate": 4.32947232947233e-06, | |
| "loss": 0.3123, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 4.581134989061897, | |
| "grad_norm": 3.0897843837738037, | |
| "learning_rate": 4.200772200772201e-06, | |
| "loss": 0.3194, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 4.5940033457727445, | |
| "grad_norm": 3.8259048461914062, | |
| "learning_rate": 4.072072072072072e-06, | |
| "loss": 0.3321, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 4.606871702483593, | |
| "grad_norm": 4.632271766662598, | |
| "learning_rate": 3.943371943371944e-06, | |
| "loss": 0.3223, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 4.619740059194441, | |
| "grad_norm": 3.449892282485962, | |
| "learning_rate": 3.814671814671815e-06, | |
| "loss": 0.3152, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 4.632608415905289, | |
| "grad_norm": 4.951483726501465, | |
| "learning_rate": 3.685971685971686e-06, | |
| "loss": 0.3333, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 4.632608415905289, | |
| "eval_loss": 0.8536800742149353, | |
| "eval_runtime": 189.5498, | |
| "eval_samples_per_second": 102.485, | |
| "eval_steps_per_second": 10.251, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 4.645476772616137, | |
| "grad_norm": 2.879723310470581, | |
| "learning_rate": 3.5572715572715578e-06, | |
| "loss": 0.3311, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 4.658345129326985, | |
| "grad_norm": 2.263468027114868, | |
| "learning_rate": 3.428571428571429e-06, | |
| "loss": 0.3288, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 4.671213486037833, | |
| "grad_norm": 4.122782230377197, | |
| "learning_rate": 3.2998712998713e-06, | |
| "loss": 0.2881, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 4.684081842748681, | |
| "grad_norm": 2.713583469390869, | |
| "learning_rate": 3.1711711711711718e-06, | |
| "loss": 0.335, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 4.696950199459529, | |
| "grad_norm": 4.133466720581055, | |
| "learning_rate": 3.0424710424710425e-06, | |
| "loss": 0.3389, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 4.709818556170377, | |
| "grad_norm": 3.080181121826172, | |
| "learning_rate": 2.9137709137709137e-06, | |
| "loss": 0.3049, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 4.722686912881225, | |
| "grad_norm": 3.2087597846984863, | |
| "learning_rate": 2.7850707850707853e-06, | |
| "loss": 0.3021, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 4.735555269592073, | |
| "grad_norm": 3.037031888961792, | |
| "learning_rate": 2.6563706563706565e-06, | |
| "loss": 0.3343, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 4.748423626302921, | |
| "grad_norm": 3.936049699783325, | |
| "learning_rate": 2.5276705276705277e-06, | |
| "loss": 0.351, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 4.761291983013769, | |
| "grad_norm": 2.863168239593506, | |
| "learning_rate": 2.3989703989703993e-06, | |
| "loss": 0.3062, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 4.7741603397246175, | |
| "grad_norm": 3.105039358139038, | |
| "learning_rate": 2.2702702702702705e-06, | |
| "loss": 0.3139, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 4.787028696435465, | |
| "grad_norm": 4.5207390785217285, | |
| "learning_rate": 2.1415701415701416e-06, | |
| "loss": 0.3295, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 4.799897053146314, | |
| "grad_norm": 4.3474345207214355, | |
| "learning_rate": 2.0128700128700133e-06, | |
| "loss": 0.3256, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 4.812765409857161, | |
| "grad_norm": 3.58796763420105, | |
| "learning_rate": 1.8841698841698844e-06, | |
| "loss": 0.32, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 4.825633766568009, | |
| "grad_norm": 2.583491325378418, | |
| "learning_rate": 1.7554697554697556e-06, | |
| "loss": 0.3409, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 4.838502123278857, | |
| "grad_norm": 2.490678071975708, | |
| "learning_rate": 1.6267696267696266e-06, | |
| "loss": 0.3164, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 4.851370479989705, | |
| "grad_norm": 2.505882740020752, | |
| "learning_rate": 1.4980694980694982e-06, | |
| "loss": 0.2992, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 4.864238836700554, | |
| "grad_norm": 3.8381474018096924, | |
| "learning_rate": 1.371943371943372e-06, | |
| "loss": 0.3178, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 4.877107193411401, | |
| "grad_norm": 4.065976619720459, | |
| "learning_rate": 1.2432432432432434e-06, | |
| "loss": 0.312, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 4.88997555012225, | |
| "grad_norm": 2.3685247898101807, | |
| "learning_rate": 1.1145431145431146e-06, | |
| "loss": 0.3103, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 4.88997555012225, | |
| "eval_loss": 0.8551310896873474, | |
| "eval_runtime": 189.5403, | |
| "eval_samples_per_second": 102.49, | |
| "eval_steps_per_second": 10.251, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 4.902843906833097, | |
| "grad_norm": 1.829939365386963, | |
| "learning_rate": 9.85842985842986e-07, | |
| "loss": 0.3012, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 4.915712263543945, | |
| "grad_norm": 3.9715230464935303, | |
| "learning_rate": 8.571428571428572e-07, | |
| "loss": 0.3258, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 4.9285806202547935, | |
| "grad_norm": 3.2242319583892822, | |
| "learning_rate": 7.284427284427284e-07, | |
| "loss": 0.319, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 4.941448976965641, | |
| "grad_norm": 2.4675443172454834, | |
| "learning_rate": 5.997425997425998e-07, | |
| "loss": 0.3183, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 4.95431733367649, | |
| "grad_norm": 4.066344738006592, | |
| "learning_rate": 4.710424710424711e-07, | |
| "loss": 0.3368, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 4.967185690387337, | |
| "grad_norm": 3.9160056114196777, | |
| "learning_rate": 3.423423423423424e-07, | |
| "loss": 0.3002, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 4.980054047098186, | |
| "grad_norm": 3.2470641136169434, | |
| "learning_rate": 2.1364221364221366e-07, | |
| "loss": 0.3168, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 4.992922403809033, | |
| "grad_norm": 4.328388690948486, | |
| "learning_rate": 8.494208494208495e-08, | |
| "loss": 0.3105, | |
| "step": 19400 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 19425, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9.343651794926285e+17, | |
| "train_batch_size": 10, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |