{
  "best_global_step": 785,
  "best_metric": 0.04556597024202347,
  "best_model_checkpoint": "outputs/checkpoint-785",
  "epoch": 14.0,
  "eval_steps": 500,
  "global_step": 2198,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.06369426751592357,
      "grad_norm": 0.5893169641494751,
      "learning_rate": 0.000199991726421274,
      "loss": 0.3173,
      "step": 10
    },
    {
      "epoch": 0.12738853503184713,
      "grad_norm": 0.4055931568145752,
      "learning_rate": 0.00019996312815368718,
      "loss": 0.1199,
      "step": 20
    },
    {
      "epoch": 0.1910828025477707,
      "grad_norm": 0.49983513355255127,
      "learning_rate": 0.00019991410889510497,
      "loss": 0.0987,
      "step": 30
    },
    {
      "epoch": 0.25477707006369427,
      "grad_norm": 0.4363257586956024,
      "learning_rate": 0.00019984467865943805,
      "loss": 0.0775,
      "step": 40
    },
    {
      "epoch": 0.3184713375796178,
      "grad_norm": 0.26337388157844543,
      "learning_rate": 0.00019975485163025835,
      "loss": 0.0573,
      "step": 50
    },
    {
      "epoch": 0.3821656050955414,
      "grad_norm": 0.24820806086063385,
      "learning_rate": 0.00019964464615790156,
      "loss": 0.0647,
      "step": 60
    },
    {
      "epoch": 0.445859872611465,
      "grad_norm": 0.1877005398273468,
      "learning_rate": 0.0001995140847557183,
      "loss": 0.0608,
      "step": 70
    },
    {
      "epoch": 0.5095541401273885,
      "grad_norm": 0.2699015736579895,
      "learning_rate": 0.00019936319409547513,
      "loss": 0.0533,
      "step": 80
    },
    {
      "epoch": 0.5732484076433121,
      "grad_norm": 0.32841813564300537,
      "learning_rate": 0.00019919200500190587,
      "loss": 0.0622,
      "step": 90
    },
    {
      "epoch": 0.6369426751592356,
      "grad_norm": 0.2413034588098526,
      "learning_rate": 0.00019900055244641447,
      "loss": 0.0664,
      "step": 100
    },
    {
      "epoch": 0.7006369426751592,
      "grad_norm": 0.3198252320289612,
      "learning_rate": 0.000198788875539931,
      "loss": 0.049,
      "step": 110
    },
    {
      "epoch": 0.7643312101910829,
      "grad_norm": 0.27829986810684204,
      "learning_rate": 0.00019855701752492176,
      "loss": 0.0574,
      "step": 120
    },
    {
      "epoch": 0.8280254777070064,
      "grad_norm": 0.2980740964412689,
      "learning_rate": 0.00019830502576655552,
      "loss": 0.0494,
      "step": 130
    },
    {
      "epoch": 0.89171974522293,
      "grad_norm": 0.3273659348487854,
      "learning_rate": 0.00019803295174302752,
      "loss": 0.0486,
      "step": 140
    },
    {
      "epoch": 0.9554140127388535,
      "grad_norm": 0.23626568913459778,
      "learning_rate": 0.00019774085103504326,
      "loss": 0.0408,
      "step": 150
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.05023716390132904,
      "eval_runtime": 152.5451,
      "eval_samples_per_second": 2.74,
      "eval_steps_per_second": 0.347,
      "step": 157
    },
    {
      "epoch": 1.019108280254777,
      "grad_norm": 0.2541586458683014,
      "learning_rate": 0.00019742878331446414,
      "loss": 0.0473,
      "step": 160
    },
    {
      "epoch": 1.0828025477707006,
      "grad_norm": 0.27645406126976013,
      "learning_rate": 0.00019709681233211733,
      "loss": 0.0224,
      "step": 170
    },
    {
      "epoch": 1.1464968152866242,
      "grad_norm": 0.3336308002471924,
      "learning_rate": 0.0001967450059047726,
      "loss": 0.0478,
      "step": 180
    },
    {
      "epoch": 1.2101910828025477,
      "grad_norm": 0.1680356115102768,
      "learning_rate": 0.00019637343590128809,
      "loss": 0.0315,
      "step": 190
    },
    {
      "epoch": 1.2738853503184713,
      "grad_norm": 0.1790059506893158,
      "learning_rate": 0.00019598217822792892,
      "loss": 0.0326,
      "step": 200
    },
    {
      "epoch": 1.3375796178343948,
      "grad_norm": 0.22847947478294373,
      "learning_rate": 0.00019557131281286024,
      "loss": 0.0481,
      "step": 210
    },
    {
      "epoch": 1.4012738853503186,
      "grad_norm": 0.20187054574489594,
      "learning_rate": 0.0001951409235898194,
      "loss": 0.0407,
      "step": 220
    },
    {
      "epoch": 1.4649681528662422,
      "grad_norm": 0.14863349497318268,
      "learning_rate": 0.0001946910984809694,
      "loss": 0.0406,
      "step": 230
    },
    {
      "epoch": 1.5286624203821657,
      "grad_norm": 0.17791222035884857,
      "learning_rate": 0.00019422192937893775,
      "loss": 0.0328,
      "step": 240
    },
    {
      "epoch": 1.5923566878980893,
      "grad_norm": 0.15719226002693176,
      "learning_rate": 0.00019373351212804404,
      "loss": 0.0337,
      "step": 250
    },
    {
      "epoch": 1.6560509554140128,
      "grad_norm": 0.2113306075334549,
      "learning_rate": 0.0001932259465047206,
      "loss": 0.0353,
      "step": 260
    },
    {
      "epoch": 1.7197452229299364,
      "grad_norm": 0.19012148678302765,
      "learning_rate": 0.0001926993361971293,
      "loss": 0.0328,
      "step": 270
    },
    {
      "epoch": 1.78343949044586,
      "grad_norm": 0.1509159654378891,
      "learning_rate": 0.00019215378878397997,
      "loss": 0.0407,
      "step": 280
    },
    {
      "epoch": 1.8471337579617835,
      "grad_norm": 0.1625605821609497,
      "learning_rate": 0.00019158941571255337,
      "loss": 0.0379,
      "step": 290
    },
    {
      "epoch": 1.910828025477707,
      "grad_norm": 0.12683314085006714,
      "learning_rate": 0.0001910063322759343,
      "loss": 0.037,
      "step": 300
    },
    {
      "epoch": 1.9745222929936306,
      "grad_norm": 0.14843901991844177,
      "learning_rate": 0.00019040465758945883,
      "loss": 0.0353,
      "step": 310
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.0473560094833374,
      "eval_runtime": 151.5547,
      "eval_samples_per_second": 2.758,
      "eval_steps_per_second": 0.35,
      "step": 314
    },
    {
      "epoch": 2.038216560509554,
      "grad_norm": 0.16260643303394318,
      "learning_rate": 0.00018978451456638088,
      "loss": 0.0365,
      "step": 320
    },
    {
      "epoch": 2.1019108280254777,
      "grad_norm": 0.09349235892295837,
      "learning_rate": 0.00018914602989276294,
      "loss": 0.0327,
      "step": 330
    },
    {
      "epoch": 2.1656050955414012,
      "grad_norm": 0.17592753469944,
      "learning_rate": 0.00018848933400159569,
      "loss": 0.0258,
      "step": 340
    },
    {
      "epoch": 2.229299363057325,
      "grad_norm": 0.1627691686153412,
      "learning_rate": 0.00018781456104615272,
      "loss": 0.0274,
      "step": 350
    },
    {
      "epoch": 2.2929936305732483,
      "grad_norm": 0.14966121315956116,
      "learning_rate": 0.00018712184887258494,
      "loss": 0.0293,
      "step": 360
    },
    {
      "epoch": 2.356687898089172,
      "grad_norm": 0.17441661655902863,
      "learning_rate": 0.0001864113389917606,
      "loss": 0.0304,
      "step": 370
    },
    {
      "epoch": 2.4203821656050954,
      "grad_norm": 0.09554579854011536,
      "learning_rate": 0.00018568317655035676,
      "loss": 0.0321,
      "step": 380
    },
    {
      "epoch": 2.484076433121019,
      "grad_norm": 0.2536601126194,
      "learning_rate": 0.00018493751030120793,
      "loss": 0.0257,
      "step": 390
    },
    {
      "epoch": 2.5477707006369426,
      "grad_norm": 0.16189263761043549,
      "learning_rate": 0.00018417449257291803,
      "loss": 0.0279,
      "step": 400
    },
    {
      "epoch": 2.611464968152866,
      "grad_norm": 0.1018817350268364,
      "learning_rate": 0.00018339427923874207,
      "loss": 0.0289,
      "step": 410
    },
    {
      "epoch": 2.6751592356687897,
      "grad_norm": 0.14323534071445465,
      "learning_rate": 0.00018259702968474327,
      "loss": 0.0275,
      "step": 420
    },
    {
      "epoch": 2.738853503184713,
      "grad_norm": 0.11969427019357681,
      "learning_rate": 0.00018178290677723312,
      "loss": 0.0281,
      "step": 430
    },
    {
      "epoch": 2.802547770700637,
      "grad_norm": 0.2024673968553543,
      "learning_rate": 0.00018095207682950005,
      "loss": 0.0314,
      "step": 440
    },
    {
      "epoch": 2.8662420382165603,
      "grad_norm": 0.17737647891044617,
      "learning_rate": 0.00018010470956783406,
      "loss": 0.0279,
      "step": 450
    },
    {
      "epoch": 2.9299363057324843,
      "grad_norm": 0.18298077583312988,
      "learning_rate": 0.00017924097809685424,
      "loss": 0.0257,
      "step": 460
    },
    {
      "epoch": 2.9936305732484074,
      "grad_norm": 0.2549296021461487,
      "learning_rate": 0.00017836105886414596,
      "loss": 0.0304,
      "step": 470
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.05053602159023285,
      "eval_runtime": 152.6545,
      "eval_samples_per_second": 2.738,
      "eval_steps_per_second": 0.347,
      "step": 471
    },
    {
      "epoch": 3.0573248407643314,
      "grad_norm": 0.17169545590877533,
      "learning_rate": 0.00017746513162421535,
      "loss": 0.0208,
      "step": 480
    },
    {
      "epoch": 3.121019108280255,
      "grad_norm": 0.21220606565475464,
      "learning_rate": 0.00017655337940176793,
      "loss": 0.0226,
      "step": 490
    },
    {
      "epoch": 3.1847133757961785,
      "grad_norm": 0.12988293170928955,
      "learning_rate": 0.00017562598845431956,
      "loss": 0.0217,
      "step": 500
    },
    {
      "epoch": 3.248407643312102,
      "grad_norm": 0.3005841076374054,
      "learning_rate": 0.00017468314823414669,
      "loss": 0.0187,
      "step": 510
    },
    {
      "epoch": 3.3121019108280256,
      "grad_norm": 0.21529339253902435,
      "learning_rate": 0.000173725051349584,
      "loss": 0.022,
      "step": 520
    },
    {
      "epoch": 3.375796178343949,
      "grad_norm": 0.07448782026767731,
      "learning_rate": 0.00017275189352567745,
      "loss": 0.0311,
      "step": 530
    },
    {
      "epoch": 3.4394904458598727,
      "grad_norm": 0.10172971338033676,
      "learning_rate": 0.0001717638735642005,
      "loss": 0.0223,
      "step": 540
    },
    {
      "epoch": 3.5031847133757963,
      "grad_norm": 0.19833995401859283,
      "learning_rate": 0.0001707611933030419,
      "loss": 0.0249,
      "step": 550
    },
    {
      "epoch": 3.56687898089172,
      "grad_norm": 0.234901562333107,
      "learning_rate": 0.00016974405757497318,
      "loss": 0.0383,
      "step": 560
    },
    {
      "epoch": 3.6305732484076434,
      "grad_norm": 0.21425440907478333,
      "learning_rate": 0.0001687126741658041,
      "loss": 0.0266,
      "step": 570
    },
    {
      "epoch": 3.694267515923567,
      "grad_norm": 0.08396715670824051,
      "learning_rate": 0.00016766725377193557,
      "loss": 0.0265,
      "step": 580
    },
    {
      "epoch": 3.7579617834394905,
      "grad_norm": 0.08471404016017914,
      "learning_rate": 0.00016660800995731693,
      "loss": 0.0238,
      "step": 590
    },
    {
      "epoch": 3.821656050955414,
      "grad_norm": 0.0868527814745903,
      "learning_rate": 0.00016553515910981847,
      "loss": 0.0269,
      "step": 600
    },
    {
      "epoch": 3.8853503184713376,
      "grad_norm": 0.3129713535308838,
      "learning_rate": 0.0001644489203970263,
      "loss": 0.0223,
      "step": 610
    },
    {
      "epoch": 3.949044585987261,
      "grad_norm": 0.11113307625055313,
      "learning_rate": 0.00016334951572146965,
      "loss": 0.0238,
      "step": 620
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.04629155993461609,
      "eval_runtime": 148.9174,
      "eval_samples_per_second": 2.807,
      "eval_steps_per_second": 0.356,
      "step": 628
    },
    {
      "epoch": 4.012738853503185,
      "grad_norm": 0.0971466675400734,
      "learning_rate": 0.00016223716967528958,
      "loss": 0.018,
      "step": 630
    },
    {
      "epoch": 4.076433121019108,
      "grad_norm": 0.15950708091259003,
      "learning_rate": 0.00016111210949435815,
      "loss": 0.0158,
      "step": 640
    },
    {
      "epoch": 4.140127388535032,
      "grad_norm": 0.20078063011169434,
      "learning_rate": 0.00015997456501185727,
      "loss": 0.0179,
      "step": 650
    },
    {
      "epoch": 4.203821656050955,
      "grad_norm": 0.1425529569387436,
      "learning_rate": 0.0001588247686113274,
      "loss": 0.0245,
      "step": 660
    },
    {
      "epoch": 4.267515923566879,
      "grad_norm": 0.1291467249393463,
      "learning_rate": 0.00015766295517919497,
      "loss": 0.0241,
      "step": 670
    },
    {
      "epoch": 4.3312101910828025,
      "grad_norm": 0.08179380744695663,
      "learning_rate": 0.00015648936205678838,
      "loss": 0.023,
      "step": 680
    },
    {
      "epoch": 4.3949044585987265,
      "grad_norm": 0.15069672465324402,
      "learning_rate": 0.00015530422899185298,
      "loss": 0.0304,
      "step": 690
    },
    {
      "epoch": 4.45859872611465,
      "grad_norm": 0.14441800117492676,
      "learning_rate": 0.00015410779808957385,
      "loss": 0.0246,
      "step": 700
    },
    {
      "epoch": 4.522292993630574,
      "grad_norm": 0.07218258827924728,
      "learning_rate": 0.0001529003137631175,
      "loss": 0.0232,
      "step": 710
    },
    {
      "epoch": 4.585987261146497,
      "grad_norm": 0.28358036279678345,
      "learning_rate": 0.0001516820226837017,
      "loss": 0.0357,
      "step": 720
    },
    {
      "epoch": 4.649681528662421,
      "grad_norm": 0.48727092146873474,
      "learning_rate": 0.00015045317373020426,
      "loss": 0.0192,
      "step": 730
    },
    {
      "epoch": 4.713375796178344,
      "grad_norm": 0.14064273238182068,
      "learning_rate": 0.00014921401793832094,
      "loss": 0.0221,
      "step": 740
    },
    {
      "epoch": 4.777070063694268,
      "grad_norm": 0.4577218294143677,
      "learning_rate": 0.00014796480844928218,
      "loss": 0.0171,
      "step": 750
    },
    {
      "epoch": 4.840764331210191,
      "grad_norm": 0.07277490198612213,
      "learning_rate": 0.0001467058004581404,
      "loss": 0.0244,
      "step": 760
    },
    {
      "epoch": 4.904458598726115,
      "grad_norm": 0.3607349693775177,
      "learning_rate": 0.0001454372511616373,
      "loss": 0.0227,
      "step": 770
    },
    {
      "epoch": 4.968152866242038,
      "grad_norm": 1.5265377759933472,
      "learning_rate": 0.00014415941970566233,
      "loss": 0.0239,
      "step": 780
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.04556597024202347,
      "eval_runtime": 148.8315,
      "eval_samples_per_second": 2.809,
      "eval_steps_per_second": 0.356,
      "step": 785
    },
    {
      "epoch": 5.031847133757962,
      "grad_norm": 0.08108412474393845,
      "learning_rate": 0.00014287256713231314,
      "loss": 0.0179,
      "step": 790
    },
    {
      "epoch": 5.095541401273885,
      "grad_norm": 0.07250893861055374,
      "learning_rate": 0.00014157695632656837,
      "loss": 0.0128,
      "step": 800
    },
    {
      "epoch": 5.159235668789809,
      "grad_norm": 0.12970279157161713,
      "learning_rate": 0.00014027285196258426,
      "loss": 0.0127,
      "step": 810
    },
    {
      "epoch": 5.222929936305732,
      "grad_norm": 0.20146997272968292,
      "learning_rate": 0.00013896052044962557,
      "loss": 0.014,
      "step": 820
    },
    {
      "epoch": 5.286624203821656,
      "grad_norm": 0.165513277053833,
      "learning_rate": 0.00013764022987764209,
      "loss": 0.0182,
      "step": 830
    },
    {
      "epoch": 5.350318471337579,
      "grad_norm": 0.1810760200023651,
      "learning_rate": 0.00013631224996250185,
      "loss": 0.0164,
      "step": 840
    },
    {
      "epoch": 5.414012738853503,
      "grad_norm": 0.13012060523033142,
      "learning_rate": 0.00013497685199089217,
      "loss": 0.0162,
      "step": 850
    },
    {
      "epoch": 5.477707006369426,
      "grad_norm": 0.1861809343099594,
      "learning_rate": 0.00013363430876489976,
      "loss": 0.0141,
      "step": 860
    },
    {
      "epoch": 5.54140127388535,
      "grad_norm": 0.2427922487258911,
      "learning_rate": 0.00013228489454628127,
      "loss": 0.0191,
      "step": 870
    },
    {
      "epoch": 5.6050955414012735,
      "grad_norm": 0.06246360391378403,
      "learning_rate": 0.00013092888500043566,
      "loss": 0.0143,
      "step": 880
    },
    {
      "epoch": 5.6687898089171975,
      "grad_norm": 0.15271341800689697,
      "learning_rate": 0.0001295665571400899,
      "loss": 0.0185,
      "step": 890
    },
    {
      "epoch": 5.732484076433121,
      "grad_norm": 0.04112791642546654,
      "learning_rate": 0.00012819818926870942,
      "loss": 0.0122,
      "step": 900
    },
    {
      "epoch": 5.796178343949045,
      "grad_norm": 0.13756102323532104,
      "learning_rate": 0.00012682406092364446,
      "loss": 0.0205,
      "step": 910
    },
    {
      "epoch": 5.859872611464969,
      "grad_norm": 0.1089109405875206,
      "learning_rate": 0.00012544445281902512,
      "loss": 0.0175,
      "step": 920
    },
    {
      "epoch": 5.923566878980892,
      "grad_norm": 0.5035731792449951,
      "learning_rate": 0.00012405964678841556,
      "loss": 0.0164,
      "step": 930
    },
    {
      "epoch": 5.987261146496815,
      "grad_norm": 0.04808522015810013,
      "learning_rate": 0.0001226699257272393,
      "loss": 0.0178,
      "step": 940
    },
    {
      "epoch": 6.0,
      "eval_loss": 0.048118457198143005,
      "eval_runtime": 148.8942,
      "eval_samples_per_second": 2.807,
      "eval_steps_per_second": 0.356,
      "step": 942
    },
    {
      "epoch": 6.050955414012739,
      "grad_norm": 0.09140116721391678,
      "learning_rate": 0.00012127557353498806,
      "loss": 0.0095,
      "step": 950
    },
    {
      "epoch": 6.114649681528663,
      "grad_norm": 0.0554346963763237,
      "learning_rate": 0.00011987687505722532,
      "loss": 0.0102,
      "step": 960
    },
    {
      "epoch": 6.178343949044586,
      "grad_norm": 0.0764077678322792,
      "learning_rate": 0.00011847411602739645,
      "loss": 0.0097,
      "step": 970
    },
    {
      "epoch": 6.24203821656051,
      "grad_norm": 0.0922919437289238,
      "learning_rate": 0.00011706758300845771,
      "loss": 0.0127,
      "step": 980
    },
    {
      "epoch": 6.305732484076433,
      "grad_norm": 0.12583084404468536,
      "learning_rate": 0.0001156575633343355,
      "loss": 0.0119,
      "step": 990
    },
    {
      "epoch": 6.369426751592357,
      "grad_norm": 0.14942176640033722,
      "learning_rate": 0.00011424434505122851,
      "loss": 0.0132,
      "step": 1000
    },
    {
      "epoch": 6.43312101910828,
      "grad_norm": 0.2156478315591812,
      "learning_rate": 0.00011282821685876399,
      "loss": 0.012,
      "step": 1010
    },
    {
      "epoch": 6.496815286624204,
      "grad_norm": 0.12194344401359558,
      "learning_rate": 0.00011140946805102059,
      "loss": 0.0136,
      "step": 1020
    },
    {
      "epoch": 6.560509554140127,
      "grad_norm": 0.1328732818365097,
      "learning_rate": 0.00010998838845743011,
      "loss": 0.0131,
      "step": 1030
    },
    {
      "epoch": 6.624203821656051,
      "grad_norm": 0.30128493905067444,
      "learning_rate": 0.00010856526838356941,
      "loss": 0.0109,
      "step": 1040
    },
    {
      "epoch": 6.687898089171974,
      "grad_norm": 0.16975216567516327,
      "learning_rate": 0.00010714039855185539,
      "loss": 0.0149,
      "step": 1050
    },
    {
      "epoch": 6.751592356687898,
      "grad_norm": 0.08274857699871063,
      "learning_rate": 0.00010571407004215447,
      "loss": 0.0155,
      "step": 1060
    },
    {
      "epoch": 6.8152866242038215,
      "grad_norm": 0.1308615654706955,
      "learning_rate": 0.00010428657423231969,
      "loss": 0.0135,
      "step": 1070
    },
    {
      "epoch": 6.8789808917197455,
      "grad_norm": 0.05005017668008804,
      "learning_rate": 0.00010285820273866613,
      "loss": 0.0157,
      "step": 1080
    },
    {
      "epoch": 6.942675159235669,
      "grad_norm": 0.23810291290283203,
      "learning_rate": 0.00010142924735639819,
      "loss": 0.0141,
      "step": 1090
    },
    {
      "epoch": 7.0,
      "eval_loss": 0.047865718603134155,
      "eval_runtime": 148.8655,
      "eval_samples_per_second": 2.808,
      "eval_steps_per_second": 0.356,
      "step": 1099
    },
    {
      "epoch": 7.006369426751593,
      "grad_norm": 0.09416891634464264,
      "learning_rate": 0.0001,
      "loss": 0.0146,
      "step": 1100
    },
    {
      "epoch": 7.070063694267516,
      "grad_norm": 0.7533183097839355,
      "learning_rate": 9.857075264360185e-05,
      "loss": 0.0113,
      "step": 1110
    },
    {
      "epoch": 7.13375796178344,
      "grad_norm": 0.16101513803005219,
      "learning_rate": 9.714179726133388e-05,
      "loss": 0.0075,
      "step": 1120
    },
    {
      "epoch": 7.197452229299363,
      "grad_norm": 0.1908101737499237,
      "learning_rate": 9.571342576768035e-05,
      "loss": 0.009,
      "step": 1130
    },
    {
      "epoch": 7.261146496815287,
      "grad_norm": 0.06933945417404175,
      "learning_rate": 9.428592995784554e-05,
      "loss": 0.0089,
      "step": 1140
    },
    {
      "epoch": 7.32484076433121,
      "grad_norm": 0.054749008268117905,
      "learning_rate": 9.285960144814465e-05,
      "loss": 0.0097,
      "step": 1150
    },
    {
      "epoch": 7.388535031847134,
      "grad_norm": 0.21135513484477997,
      "learning_rate": 9.14347316164306e-05,
      "loss": 0.011,
      "step": 1160
    },
    {
      "epoch": 7.452229299363057,
      "grad_norm": 0.0507340133190155,
      "learning_rate": 9.00116115425699e-05,
      "loss": 0.0095,
      "step": 1170
    },
    {
      "epoch": 7.515923566878981,
      "grad_norm": 0.06733115762472153,
      "learning_rate": 8.859053194897942e-05,
      "loss": 0.0108,
      "step": 1180
    },
    {
      "epoch": 7.579617834394904,
      "grad_norm": 0.07039262354373932,
      "learning_rate": 8.717178314123605e-05,
      "loss": 0.0082,
      "step": 1190
    },
    {
      "epoch": 7.643312101910828,
      "grad_norm": 0.08534280955791473,
      "learning_rate": 8.575565494877147e-05,
      "loss": 0.0099,
      "step": 1200
    },
    {
      "epoch": 7.707006369426751,
      "grad_norm": 0.11997800320386887,
      "learning_rate": 8.434243666566451e-05,
      "loss": 0.011,
      "step": 1210
    },
    {
      "epoch": 7.770700636942675,
      "grad_norm": 0.05447472259402275,
      "learning_rate": 8.293241699154231e-05,
      "loss": 0.0089,
      "step": 1220
    },
    {
      "epoch": 7.834394904458598,
      "grad_norm": 0.045016925781965256,
      "learning_rate": 8.152588397260357e-05,
      "loss": 0.0087,
      "step": 1230
    },
    {
      "epoch": 7.898089171974522,
      "grad_norm": 0.10037513077259064,
      "learning_rate": 8.012312494277472e-05,
      "loss": 0.0088,
      "step": 1240
    },
    {
      "epoch": 7.961783439490446,
      "grad_norm": 0.09553356468677521,
      "learning_rate": 7.872442646501199e-05,
      "loss": 0.008,
      "step": 1250
    },
    {
      "epoch": 8.0,
      "eval_loss": 0.05469883605837822,
      "eval_runtime": 148.8965,
      "eval_samples_per_second": 2.807,
      "eval_steps_per_second": 0.356,
      "step": 1256
    },
    {
      "epoch": 8.02547770700637,
      "grad_norm": 0.099692702293396,
      "learning_rate": 7.733007427276075e-05,
      "loss": 0.008,
      "step": 1260
    },
    {
      "epoch": 8.089171974522293,
      "grad_norm": 0.05955551564693451,
      "learning_rate": 7.594035321158445e-05,
      "loss": 0.0062,
      "step": 1270
    },
    {
      "epoch": 8.152866242038217,
      "grad_norm": 0.055873848497867584,
      "learning_rate": 7.455554718097487e-05,
      "loss": 0.0051,
      "step": 1280
    },
    {
      "epoch": 8.21656050955414,
      "grad_norm": 0.11832093447446823,
      "learning_rate": 7.317593907635558e-05,
      "loss": 0.0068,
      "step": 1290
    },
    {
      "epoch": 8.280254777070065,
      "grad_norm": 0.11872788518667221,
      "learning_rate": 7.180181073129061e-05,
      "loss": 0.0055,
      "step": 1300
    },
    {
      "epoch": 8.343949044585987,
      "grad_norm": 0.12502700090408325,
      "learning_rate": 7.043344285991012e-05,
      "loss": 0.006,
      "step": 1310
    },
    {
      "epoch": 8.40764331210191,
      "grad_norm": 0.07949739694595337,
      "learning_rate": 6.907111499956439e-05,
      "loss": 0.0056,
      "step": 1320
    },
    {
      "epoch": 8.471337579617835,
      "grad_norm": 0.08610483258962631,
      "learning_rate": 6.77151054537188e-05,
      "loss": 0.0048,
      "step": 1330
    },
    {
      "epoch": 8.535031847133759,
      "grad_norm": 0.08261114358901978,
      "learning_rate": 6.636569123510027e-05,
      "loss": 0.0047,
      "step": 1340
    },
    {
      "epoch": 8.598726114649681,
      "grad_norm": 0.030890854075551033,
      "learning_rate": 6.502314800910785e-05,
      "loss": 0.0052,
      "step": 1350
    },
    {
      "epoch": 8.662420382165605,
      "grad_norm": 0.07963161170482635,
      "learning_rate": 6.368775003749816e-05,
      "loss": 0.0099,
      "step": 1360
    },
    {
      "epoch": 8.726114649681529,
      "grad_norm": 0.14875206351280212,
      "learning_rate": 6.235977012235792e-05,
      "loss": 0.006,
      "step": 1370
    },
    {
      "epoch": 8.789808917197453,
      "grad_norm": 0.21349501609802246,
      "learning_rate": 6.103947955037446e-05,
      "loss": 0.0047,
      "step": 1380
    },
    {
      "epoch": 8.853503184713375,
      "grad_norm": 0.05400541424751282,
      "learning_rate": 5.972714803741577e-05,
      "loss": 0.006,
      "step": 1390
    },
    {
      "epoch": 8.9171974522293,
      "grad_norm": 0.14428143203258514,
      "learning_rate": 5.842304367343161e-05,
      "loss": 0.0095,
      "step": 1400
    },
    {
      "epoch": 8.980891719745223,
      "grad_norm": 0.07769430428743362,
      "learning_rate": 5.712743286768687e-05,
      "loss": 0.0053,
      "step": 1410
    },
    {
      "epoch": 9.0,
      "eval_loss": 0.05676256865262985,
      "eval_runtime": 148.9241,
      "eval_samples_per_second": 2.807,
      "eval_steps_per_second": 0.356,
      "step": 1413
    },
    {
      "epoch": 9.044585987261147,
      "grad_norm": 0.10749530047178268,
      "learning_rate": 5.584058029433766e-05,
      "loss": 0.0038,
      "step": 1420
    },
    {
      "epoch": 9.10828025477707,
      "grad_norm": 0.016948334872722626,
      "learning_rate": 5.4562748838362735e-05,
      "loss": 0.0051,
      "step": 1430
    },
    {
      "epoch": 9.171974522292993,
      "grad_norm": 0.008234160952270031,
      "learning_rate": 5.329419954185965e-05,
      "loss": 0.0043,
      "step": 1440
    },
    {
      "epoch": 9.235668789808917,
      "grad_norm": 0.04994361847639084,
      "learning_rate": 5.203519155071785e-05,
      "loss": 0.0039,
      "step": 1450
    },
    {
      "epoch": 9.299363057324841,
      "grad_norm": 0.037435177713632584,
      "learning_rate": 5.078598206167912e-05,
      "loss": 0.0033,
      "step": 1460
    },
    {
      "epoch": 9.363057324840764,
      "grad_norm": 0.11694881319999695,
      "learning_rate": 4.9546826269795765e-05,
      "loss": 0.0036,
      "step": 1470
    },
    {
      "epoch": 9.426751592356688,
      "grad_norm": 0.05153834447264671,
      "learning_rate": 4.831797731629835e-05,
      "loss": 0.0042,
      "step": 1480
    },
    {
      "epoch": 9.490445859872612,
      "grad_norm": 0.09336938709020615,
      "learning_rate": 4.709968623688254e-05,
      "loss": 0.0028,
      "step": 1490
    },
    {
      "epoch": 9.554140127388536,
      "grad_norm": 0.03732943907380104,
      "learning_rate": 4.589220191042616e-05,
      "loss": 0.0034,
      "step": 1500
    },
    {
      "epoch": 9.617834394904458,
      "grad_norm": 0.14202427864074707,
      "learning_rate": 4.469577100814705e-05,
      "loss": 0.0031,
      "step": 1510
    },
    {
      "epoch": 9.681528662420382,
      "grad_norm": 0.09861844778060913,
      "learning_rate": 4.351063794321165e-05,
      "loss": 0.003,
      "step": 1520
    },
    {
      "epoch": 9.745222929936306,
      "grad_norm": 0.16652171313762665,
      "learning_rate": 4.233704482080504e-05,
      "loss": 0.0041,
      "step": 1530
    },
    {
      "epoch": 9.80891719745223,
      "grad_norm": 0.05778292566537857,
      "learning_rate": 4.11752313886726e-05,
      "loss": 0.0042,
      "step": 1540
    },
    {
      "epoch": 9.872611464968152,
      "grad_norm": 0.012399845756590366,
      "learning_rate": 4.0025434988142766e-05,
      "loss": 0.0037,
      "step": 1550
    },
    {
      "epoch": 9.936305732484076,
      "grad_norm": 0.0798059031367302,
      "learning_rate": 3.888789050564188e-05,
      "loss": 0.0047,
      "step": 1560
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.04369504004716873,
      "learning_rate": 3.776283032471044e-05,
      "loss": 0.0029,
      "step": 1570
    },
    {
      "epoch": 10.0,
      "eval_loss": 0.07112779468297958,
      "eval_runtime": 148.9356,
      "eval_samples_per_second": 2.807,
      "eval_steps_per_second": 0.356,
      "step": 1570
    },
    {
      "epoch": 10.063694267515924,
      "grad_norm": 0.02837546356022358,
      "learning_rate": 3.6650484278530387e-05,
      "loss": 0.0023,
      "step": 1580
    },
    {
      "epoch": 10.127388535031848,
      "grad_norm": 0.00527458218857646,
      "learning_rate": 3.5551079602973734e-05,
      "loss": 0.0041,
      "step": 1590
    },
    {
      "epoch": 10.19108280254777,
      "grad_norm": 0.013476898893713951,
      "learning_rate": 3.446484089018153e-05,
      "loss": 0.0028,
      "step": 1600
    },
    {
      "epoch": 10.254777070063694,
      "grad_norm": 0.009121859446167946,
      "learning_rate": 3.3391990042683055e-05,
      "loss": 0.0032,
      "step": 1610
    },
    {
      "epoch": 10.318471337579618,
      "grad_norm": 0.035027824342250824,
      "learning_rate": 3.233274622806446e-05,
      "loss": 0.0038,
      "step": 1620
    },
    {
      "epoch": 10.382165605095542,
      "grad_norm": 0.05406291410326958,
      "learning_rate": 3.1287325834195915e-05,
      "loss": 0.0027,
      "step": 1630
    },
    {
      "epoch": 10.445859872611464,
      "grad_norm": 0.021644996479153633,
      "learning_rate": 3.025594242502684e-05,
      "loss": 0.0022,
      "step": 1640
    },
    {
      "epoch": 10.509554140127388,
      "grad_norm": 0.04471028223633766,
      "learning_rate": 2.9238806696958087e-05,
      "loss": 0.0034,
      "step": 1650
    },
    {
      "epoch": 10.573248407643312,
      "grad_norm": 0.014225292019546032,
      "learning_rate": 2.823612643579949e-05,
      "loss": 0.002,
      "step": 1660
    },
    {
      "epoch": 10.636942675159236,
      "grad_norm": 0.005709750112146139,
      "learning_rate": 2.7248106474322554e-05,
      "loss": 0.0021,
      "step": 1670
    },
    {
      "epoch": 10.700636942675159,
      "grad_norm": 0.015249662101268768,
      "learning_rate": 2.627494865041602e-05,
      "loss": 0.002,
      "step": 1680
    },
    {
      "epoch": 10.764331210191083,
      "grad_norm": 0.061606843024492264,
      "learning_rate": 2.5316851765853344e-05,
      "loss": 0.0035,
      "step": 1690
    },
    {
      "epoch": 10.828025477707007,
      "grad_norm": 0.056455183774232864,
      "learning_rate": 2.437401154568044e-05,
      "loss": 0.0016,
      "step": 1700
    },
    {
      "epoch": 10.89171974522293,
      "grad_norm": 0.050760120153427124,
      "learning_rate": 2.3446620598232104e-05,
      "loss": 0.0031,
      "step": 1710
    },
    {
      "epoch": 10.955414012738853,
      "grad_norm": 0.0020009365398436785,
      "learning_rate": 2.253486837578468e-05,
      "loss": 0.0024,
      "step": 1720
    },
    {
      "epoch": 11.0,
      "eval_loss": 0.07927798479795456,
      "eval_runtime": 148.9334,
      "eval_samples_per_second": 2.807,
      "eval_steps_per_second": 0.356,
      "step": 1727
    },
    {
      "epoch": 11.019108280254777,
      "grad_norm": 0.010104876011610031,
      "learning_rate": 2.163894113585404e-05,
      "loss": 0.0021,
      "step": 1730
    },
    {
      "epoch": 11.0828025477707,
      "grad_norm": 0.00586892431601882,
      "learning_rate": 2.075902190314578e-05,
      "loss": 0.0018,
      "step": 1740
    },
    {
      "epoch": 11.146496815286625,
      "grad_norm": 0.004691167734563351,
      "learning_rate": 1.9895290432165935e-05,
      "loss": 0.0021,
      "step": 1750
    },
    {
      "epoch": 11.210191082802547,
      "grad_norm": 0.0037749160546809435,
      "learning_rate": 1.904792317049996e-05,
      "loss": 0.0016,
      "step": 1760
    },
    {
      "epoch": 11.273885350318471,
      "grad_norm": 0.0812540128827095,
      "learning_rate": 1.82170932227669e-05,
      "loss": 0.0023,
      "step": 1770
    },
    {
      "epoch": 11.337579617834395,
      "grad_norm": 0.03934706375002861,
      "learning_rate": 1.740297031525674e-05,
      "loss": 0.0027,
      "step": 1780
    },
    {
      "epoch": 11.401273885350319,
      "grad_norm": 0.005765652749687433,
      "learning_rate": 1.660572076125797e-05,
      "loss": 0.0017,
      "step": 1790
    },
    {
      "epoch": 11.464968152866241,
      "grad_norm": 0.006609324831515551,
      "learning_rate": 1.5825507427081976e-05,
      "loss": 0.0019,
      "step": 1800
    },
    {
      "epoch": 11.528662420382165,
      "grad_norm": 0.046800799667835236,
      "learning_rate": 1.5062489698792082e-05,
      "loss": 0.0024,
      "step": 1810
    },
    {
      "epoch": 11.59235668789809,
      "grad_norm": 0.0012369153555482626,
      "learning_rate": 1.4316823449643257e-05,
      "loss": 0.0015,
      "step": 1820
    },
    {
      "epoch": 11.656050955414013,
      "grad_norm": 0.008111722767353058,
      "learning_rate": 1.3588661008239412e-05,
      "loss": 0.0023,
      "step": 1830
    },
    {
      "epoch": 11.719745222929935,
      "grad_norm": 0.0041403137147426605,
      "learning_rate": 1.2878151127415094e-05,
      "loss": 0.0021,
      "step": 1840
    },
    {
      "epoch": 11.78343949044586,
      "grad_norm": 0.19675485789775848,
      "learning_rate": 1.2185438953847328e-05,
      "loss": 0.0032,
      "step": 1850
    },
    {
      "epoch": 11.847133757961783,
      "grad_norm": 0.006231395993381739,
      "learning_rate": 1.1510665998404336e-05,
      "loss": 0.0022,
      "step": 1860
    },
    {
      "epoch": 11.910828025477707,
      "grad_norm": 0.0019039853941649199,
      "learning_rate": 1.0853970107237088e-05,
      "loss": 0.0028,
      "step": 1870
    },
    {
      "epoch": 11.97452229299363,
      "grad_norm": 0.0023176763206720352,
      "learning_rate": 1.0215485433619132e-05,
      "loss": 0.0017,
      "step": 1880
    },
    {
      "epoch": 12.0,
      "eval_loss": 0.08632908761501312,
      "eval_runtime": 148.9624,
      "eval_samples_per_second": 2.806,
      "eval_steps_per_second": 0.356,
      "step": 1884
    },
    {
      "epoch": 12.038216560509554,
      "grad_norm": 0.0012192321009933949,
      "learning_rate": 9.595342410541209e-06,
      "loss": 0.0017,
      "step": 1890
    },
    {
      "epoch": 12.101910828025478,
      "grad_norm": 0.01771487295627594,
      "learning_rate": 8.993667724065747e-06,
      "loss": 0.002,
      "step": 1900
    },
    {
      "epoch": 12.165605095541402,
      "grad_norm": 0.004799762275069952,
      "learning_rate": 8.410584287446643e-06,
      "loss": 0.0018,
      "step": 1910
    },
    {
      "epoch": 12.229299363057326,
      "grad_norm": 0.0028903819620609283,
      "learning_rate": 7.846211216020039e-06,
      "loss": 0.0017,
      "step": 1920
    },
    {
      "epoch": 12.292993630573248,
      "grad_norm": 0.002215326763689518,
      "learning_rate": 7.3006638028707e-06,
      "loss": 0.0018,
      "step": 1930
    },
    {
      "epoch": 12.356687898089172,
      "grad_norm": 0.0046812682412564754,
      "learning_rate": 6.77405349527942e-06,
      "loss": 0.0022,
      "step": 1940
    },
    {
      "epoch": 12.420382165605096,
      "grad_norm": 0.00792867224663496,
      "learning_rate": 6.266487871955962e-06,
      "loss": 0.0018,
      "step": 1950
    },
    {
      "epoch": 12.48407643312102,
      "grad_norm": 0.0029917878564447165,
      "learning_rate": 5.778070621062281e-06,
      "loss": 0.0019,
      "step": 1960
    },
    {
      "epoch": 12.547770700636942,
      "grad_norm": 0.0012242052471265197,
      "learning_rate": 5.308901519030607e-06,
      "loss": 0.0015,
      "step": 1970
    },
    {
      "epoch": 12.611464968152866,
      "grad_norm": 0.0016652451595291495,
      "learning_rate": 4.859076410180629e-06,
      "loss": 0.0018,
      "step": 1980
    },
    {
      "epoch": 12.67515923566879,
      "grad_norm": 0.0020181615836918354,
      "learning_rate": 4.42868718713978e-06,
      "loss": 0.0026,
      "step": 1990
    },
    {
      "epoch": 12.738853503184714,
      "grad_norm": 0.01197089534252882,
      "learning_rate": 4.017821772071084e-06,
      "loss": 0.0018,
      "step": 2000
    },
    {
      "epoch": 12.802547770700636,
      "grad_norm": 0.0036414351779967546,
      "learning_rate": 3.6265640987119042e-06,
      "loss": 0.0016,
      "step": 2010
    },
    {
      "epoch": 12.86624203821656,
      "grad_norm": 0.005194537341594696,
      "learning_rate": 3.2549940952274483e-06,
      "loss": 0.0017,
      "step": 2020
    },
    {
      "epoch": 12.929936305732484,
      "grad_norm": 0.0025584339164197445,
      "learning_rate": 2.903187667882701e-06,
      "loss": 0.0019,
      "step": 2030
    },
    {
      "epoch": 12.993630573248408,
      "grad_norm": 0.08787062019109726,
      "learning_rate": 2.5712166855359045e-06,
      "loss": 0.0027,
      "step": 2040
    },
    {
      "epoch": 13.0,
      "eval_loss": 0.09124071151018143,
      "eval_runtime": 148.9651,
      "eval_samples_per_second": 2.806,
      "eval_steps_per_second": 0.356,
      "step": 2041
    },
    {
      "epoch": 13.05732484076433,
      "grad_norm": 0.001067174132913351,
      "learning_rate": 2.2591489649567587e-06,
      "loss": 0.0019,
      "step": 2050
    },
    {
      "epoch": 13.121019108280255,
      "grad_norm": 0.04148571938276291,
      "learning_rate": 1.967048256972492e-06,
      "loss": 0.0018,
      "step": 2060
    },
    {
      "epoch": 13.184713375796179,
      "grad_norm": 0.0012596879387274384,
      "learning_rate": 1.6949742334445018e-06,
      "loss": 0.0017,
      "step": 2070
    },
    {
      "epoch": 13.248407643312103,
      "grad_norm": 0.0008848529541864991,
      "learning_rate": 1.4429824750782583e-06,
      "loss": 0.0019,
      "step": 2080
    },
    {
      "epoch": 13.312101910828025,
      "grad_norm": 0.004621226340532303,
      "learning_rate": 1.211124460069013e-06,
      "loss": 0.0022,
      "step": 2090
    },
    {
      "epoch": 13.375796178343949,
      "grad_norm": 0.0037146620452404022,
      "learning_rate": 9.99447553585542e-07,
      "loss": 0.0018,
      "step": 2100
    },
    {
      "epoch": 13.439490445859873,
      "grad_norm": 0.0031733817886561155,
      "learning_rate": 8.079949980941526e-07,
      "loss": 0.0016,
      "step": 2110
    },
    {
      "epoch": 13.503184713375797,
      "grad_norm": 0.04481673985719681,
      "learning_rate": 6.368059045248842e-07,
      "loss": 0.0021,
      "step": 2120
    },
    {
      "epoch": 13.566878980891719,
      "grad_norm": 0.008498159237205982,
      "learning_rate": 4.859152442817205e-07,
      "loss": 0.002,
      "step": 2130
    },
    {
      "epoch": 13.630573248407643,
      "grad_norm": 0.003298922209069133,
      "learning_rate": 3.5535384209846036e-07,
      "loss": 0.0017,
      "step": 2140
    },
    {
      "epoch": 13.694267515923567,
      "grad_norm": 0.003794416319578886,
      "learning_rate": 2.4514836974165454e-07,
      "loss": 0.0016,
      "step": 2150
    },
    {
      "epoch": 13.757961783439491,
      "grad_norm": 0.005654670298099518,
      "learning_rate": 1.5532134056196468e-07,
      "loss": 0.0017,
      "step": 2160
    },
    {
      "epoch": 13.821656050955415,
      "grad_norm": 0.0006748574669472873,
      "learning_rate": 8.589110489505281e-08,
      "loss": 0.0019,
      "step": 2170
    },
    {
      "epoch": 13.885350318471337,
      "grad_norm": 0.05673813074827194,
      "learning_rate": 3.687184631284701e-08,
      "loss": 0.0023,
      "step": 2180
    },
    {
      "epoch": 13.949044585987261,
      "grad_norm": 0.044436752796173096,
      "learning_rate": 8.273578726014642e-09,
      "loss": 0.002,
      "step": 2190
    },
    {
      "epoch": 14.0,
      "eval_loss": 0.09217014908790588,
      "eval_runtime": 148.9613,
      "eval_samples_per_second": 2.806,
      "eval_steps_per_second": 0.356,
      "step": 2198
    }
  ],
  "logging_steps": 10,
  "max_steps": 2198,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 14,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.05580823441408e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}