{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9994683678894205,
  "eval_steps": 500,
  "global_step": 940,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01063264221158958,
      "grad_norm": 0.9233230352401733,
      "learning_rate": 0.00019808510638297873,
      "loss": 1.7298,
      "step": 10
    },
    {
      "epoch": 0.02126528442317916,
      "grad_norm": 0.9088625311851501,
      "learning_rate": 0.00019595744680851065,
      "loss": 1.2536,
      "step": 20
    },
    {
      "epoch": 0.03189792663476874,
      "grad_norm": 0.9639036655426025,
      "learning_rate": 0.00019382978723404257,
      "loss": 1.1759,
      "step": 30
    },
    {
      "epoch": 0.04253056884635832,
      "grad_norm": 0.9426536560058594,
      "learning_rate": 0.00019170212765957448,
      "loss": 1.0703,
      "step": 40
    },
    {
      "epoch": 0.0531632110579479,
      "grad_norm": 0.9788757562637329,
      "learning_rate": 0.0001895744680851064,
      "loss": 1.0505,
      "step": 50
    },
    {
      "epoch": 0.06379585326953748,
      "grad_norm": 1.2088581323623657,
      "learning_rate": 0.00018744680851063832,
      "loss": 1.0358,
      "step": 60
    },
    {
      "epoch": 0.07442849548112707,
      "grad_norm": 0.9232538342475891,
      "learning_rate": 0.0001853191489361702,
      "loss": 0.9859,
      "step": 70
    },
    {
      "epoch": 0.08506113769271664,
      "grad_norm": 1.2805695533752441,
      "learning_rate": 0.00018319148936170215,
      "loss": 0.9763,
      "step": 80
    },
    {
      "epoch": 0.09569377990430622,
      "grad_norm": 1.067161202430725,
      "learning_rate": 0.00018106382978723404,
      "loss": 1.0738,
      "step": 90
    },
    {
      "epoch": 0.1063264221158958,
      "grad_norm": 1.2387498617172241,
      "learning_rate": 0.00017893617021276596,
      "loss": 0.9779,
      "step": 100
    },
    {
      "epoch": 0.11695906432748537,
      "grad_norm": 1.0024847984313965,
      "learning_rate": 0.00017680851063829787,
      "loss": 0.9815,
      "step": 110
    },
    {
      "epoch": 0.12759170653907495,
      "grad_norm": 1.9225773811340332,
      "learning_rate": 0.0001746808510638298,
      "loss": 0.9203,
      "step": 120
    },
    {
      "epoch": 0.13822434875066453,
      "grad_norm": 1.3451205492019653,
      "learning_rate": 0.0001725531914893617,
      "loss": 0.9678,
      "step": 130
    },
    {
      "epoch": 0.14885699096225413,
      "grad_norm": 1.4681438207626343,
      "learning_rate": 0.00017042553191489362,
      "loss": 0.813,
      "step": 140
    },
    {
      "epoch": 0.1594896331738437,
      "grad_norm": 1.244214415550232,
      "learning_rate": 0.00016829787234042554,
      "loss": 0.7149,
      "step": 150
    },
    {
      "epoch": 0.17012227538543329,
      "grad_norm": 1.4099949598312378,
      "learning_rate": 0.00016617021276595746,
      "loss": 0.8472,
      "step": 160
    },
    {
      "epoch": 0.18075491759702286,
      "grad_norm": 1.4794244766235352,
      "learning_rate": 0.00016404255319148937,
      "loss": 0.6425,
      "step": 170
    },
    {
      "epoch": 0.19138755980861244,
      "grad_norm": 1.574625849723816,
      "learning_rate": 0.0001619148936170213,
      "loss": 0.7604,
      "step": 180
    },
    {
      "epoch": 0.20202020202020202,
      "grad_norm": 1.8564409017562866,
      "learning_rate": 0.0001597872340425532,
      "loss": 0.6777,
      "step": 190
    },
    {
      "epoch": 0.2126528442317916,
      "grad_norm": 1.1089837551116943,
      "learning_rate": 0.00015765957446808512,
      "loss": 0.6804,
      "step": 200
    },
    {
      "epoch": 0.22328548644338117,
      "grad_norm": 1.5858855247497559,
      "learning_rate": 0.00015553191489361701,
      "loss": 0.6962,
      "step": 210
    },
    {
      "epoch": 0.23391812865497075,
      "grad_norm": 1.0457383394241333,
      "learning_rate": 0.00015340425531914896,
      "loss": 0.6975,
      "step": 220
    },
    {
      "epoch": 0.24455077086656035,
      "grad_norm": 1.007315993309021,
      "learning_rate": 0.00015127659574468085,
      "loss": 0.6911,
      "step": 230
    },
    {
      "epoch": 0.2551834130781499,
      "grad_norm": 1.4641199111938477,
      "learning_rate": 0.00014914893617021276,
      "loss": 0.6968,
      "step": 240
    },
    {
      "epoch": 0.2658160552897395,
      "grad_norm": 1.127540111541748,
      "learning_rate": 0.00014702127659574468,
      "loss": 0.6226,
      "step": 250
    },
    {
      "epoch": 0.27644869750132905,
      "grad_norm": 1.841412901878357,
      "learning_rate": 0.0001448936170212766,
      "loss": 0.5781,
      "step": 260
    },
    {
      "epoch": 0.28708133971291866,
      "grad_norm": 1.6684510707855225,
      "learning_rate": 0.00014276595744680851,
      "loss": 0.6214,
      "step": 270
    },
    {
      "epoch": 0.29771398192450826,
      "grad_norm": 1.7632583379745483,
      "learning_rate": 0.00014063829787234043,
      "loss": 0.5107,
      "step": 280
    },
    {
      "epoch": 0.3083466241360978,
      "grad_norm": 1.3282111883163452,
      "learning_rate": 0.00013851063829787235,
      "loss": 0.6491,
      "step": 290
    },
    {
      "epoch": 0.3189792663476874,
      "grad_norm": 1.7586910724639893,
      "learning_rate": 0.00013638297872340427,
      "loss": 0.7237,
      "step": 300
    },
    {
      "epoch": 0.32961190855927697,
      "grad_norm": 1.5256597995758057,
      "learning_rate": 0.00013425531914893618,
      "loss": 0.6446,
      "step": 310
    },
    {
      "epoch": 0.34024455077086657,
      "grad_norm": 1.4499211311340332,
      "learning_rate": 0.0001321276595744681,
      "loss": 0.557,
      "step": 320
    },
    {
      "epoch": 0.3508771929824561,
      "grad_norm": 1.7797976732254028,
      "learning_rate": 0.00013000000000000002,
      "loss": 0.6162,
      "step": 330
    },
    {
      "epoch": 0.3615098351940457,
      "grad_norm": 1.2894353866577148,
      "learning_rate": 0.0001278723404255319,
      "loss": 0.4761,
      "step": 340
    },
    {
      "epoch": 0.3721424774056353,
      "grad_norm": 1.3315067291259766,
      "learning_rate": 0.00012574468085106382,
      "loss": 0.5865,
      "step": 350
    },
    {
      "epoch": 0.3827751196172249,
      "grad_norm": 1.4586937427520752,
      "learning_rate": 0.00012361702127659577,
      "loss": 0.5099,
      "step": 360
    },
    {
      "epoch": 0.3934077618288145,
      "grad_norm": 1.2912027835845947,
      "learning_rate": 0.00012148936170212766,
      "loss": 0.5801,
      "step": 370
    },
    {
      "epoch": 0.40404040404040403,
      "grad_norm": 1.2132781744003296,
      "learning_rate": 0.00011936170212765959,
      "loss": 0.3565,
      "step": 380
    },
    {
      "epoch": 0.41467304625199364,
      "grad_norm": 1.2837001085281372,
      "learning_rate": 0.0001172340425531915,
      "loss": 0.3634,
      "step": 390
    },
    {
      "epoch": 0.4253056884635832,
      "grad_norm": 1.6399765014648438,
      "learning_rate": 0.0001151063829787234,
      "loss": 0.5858,
      "step": 400
    },
    {
      "epoch": 0.4359383306751728,
      "grad_norm": 1.2120444774627686,
      "learning_rate": 0.00011297872340425532,
      "loss": 0.5533,
      "step": 410
    },
    {
      "epoch": 0.44657097288676234,
      "grad_norm": 2.2904655933380127,
      "learning_rate": 0.00011085106382978725,
      "loss": 0.5185,
      "step": 420
    },
    {
      "epoch": 0.45720361509835195,
      "grad_norm": 0.9909681081771851,
      "learning_rate": 0.00010872340425531916,
      "loss": 0.4418,
      "step": 430
    },
    {
      "epoch": 0.4678362573099415,
      "grad_norm": 1.9283276796340942,
      "learning_rate": 0.00010659574468085107,
      "loss": 0.458,
      "step": 440
    },
    {
      "epoch": 0.4784688995215311,
      "grad_norm": 1.5563241243362427,
      "learning_rate": 0.00010446808510638298,
      "loss": 0.3739,
      "step": 450
    },
    {
      "epoch": 0.4891015417331207,
      "grad_norm": 0.8688263893127441,
      "learning_rate": 0.0001023404255319149,
      "loss": 0.3839,
      "step": 460
    },
    {
      "epoch": 0.49973418394471025,
      "grad_norm": 1.012356162071228,
      "learning_rate": 0.00010021276595744682,
      "loss": 0.3637,
      "step": 470
    },
    {
      "epoch": 0.5103668261562998,
      "grad_norm": 1.2394040822982788,
      "learning_rate": 9.808510638297873e-05,
      "loss": 0.4456,
      "step": 480
    },
    {
      "epoch": 0.5209994683678895,
      "grad_norm": 2.0661351680755615,
      "learning_rate": 9.595744680851064e-05,
      "loss": 0.3375,
      "step": 490
    },
    {
      "epoch": 0.531632110579479,
      "grad_norm": 0.8300966024398804,
      "learning_rate": 9.382978723404256e-05,
      "loss": 0.2703,
      "step": 500
    },
    {
      "epoch": 0.5422647527910686,
      "grad_norm": 2.6386091709136963,
      "learning_rate": 9.170212765957448e-05,
      "loss": 0.3582,
      "step": 510
    },
    {
      "epoch": 0.5528973950026581,
      "grad_norm": 1.5658433437347412,
      "learning_rate": 8.95744680851064e-05,
      "loss": 0.5284,
      "step": 520
    },
    {
      "epoch": 0.5635300372142478,
      "grad_norm": 1.4143650531768799,
      "learning_rate": 8.74468085106383e-05,
      "loss": 0.3619,
      "step": 530
    },
    {
      "epoch": 0.5741626794258373,
      "grad_norm": 1.0321277379989624,
      "learning_rate": 8.531914893617021e-05,
      "loss": 0.449,
      "step": 540
    },
    {
      "epoch": 0.5847953216374269,
      "grad_norm": 1.4047714471817017,
      "learning_rate": 8.319148936170213e-05,
      "loss": 0.3873,
      "step": 550
    },
    {
      "epoch": 0.5954279638490165,
      "grad_norm": 1.176665186882019,
      "learning_rate": 8.106382978723405e-05,
      "loss": 0.3208,
      "step": 560
    },
    {
      "epoch": 0.6060606060606061,
      "grad_norm": 1.349563479423523,
      "learning_rate": 7.893617021276596e-05,
      "loss": 0.3534,
      "step": 570
    },
    {
      "epoch": 0.6166932482721956,
      "grad_norm": 1.899173617362976,
      "learning_rate": 7.680851063829788e-05,
      "loss": 0.4149,
      "step": 580
    },
    {
      "epoch": 0.6273258904837852,
      "grad_norm": 1.041756272315979,
      "learning_rate": 7.46808510638298e-05,
      "loss": 0.2761,
      "step": 590
    },
    {
      "epoch": 0.6379585326953748,
      "grad_norm": 1.1541553735733032,
      "learning_rate": 7.25531914893617e-05,
      "loss": 0.3183,
      "step": 600
    },
    {
      "epoch": 0.6485911749069644,
      "grad_norm": 1.7732151746749878,
      "learning_rate": 7.042553191489362e-05,
      "loss": 0.4356,
      "step": 610
    },
    {
      "epoch": 0.6592238171185539,
      "grad_norm": 2.6027865409851074,
      "learning_rate": 6.829787234042554e-05,
      "loss": 0.5153,
      "step": 620
    },
    {
      "epoch": 0.6698564593301436,
      "grad_norm": 1.1163185834884644,
      "learning_rate": 6.617021276595745e-05,
      "loss": 0.3758,
      "step": 630
    },
    {
      "epoch": 0.6804891015417331,
      "grad_norm": 0.8950490355491638,
      "learning_rate": 6.404255319148937e-05,
      "loss": 0.3092,
      "step": 640
    },
    {
      "epoch": 0.6911217437533227,
      "grad_norm": 0.8353213667869568,
      "learning_rate": 6.191489361702127e-05,
      "loss": 0.2713,
      "step": 650
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 0.9185741543769836,
      "learning_rate": 5.9787234042553196e-05,
      "loss": 0.2692,
      "step": 660
    },
    {
      "epoch": 0.7123870281765019,
      "grad_norm": 1.5412646532058716,
      "learning_rate": 5.7659574468085106e-05,
      "loss": 0.252,
      "step": 670
    },
    {
      "epoch": 0.7230196703880915,
      "grad_norm": 1.210580825805664,
      "learning_rate": 5.553191489361702e-05,
      "loss": 0.2394,
      "step": 680
    },
    {
      "epoch": 0.733652312599681,
      "grad_norm": 2.3778483867645264,
      "learning_rate": 5.3404255319148946e-05,
      "loss": 0.2672,
      "step": 690
    },
    {
      "epoch": 0.7442849548112705,
      "grad_norm": 2.204791784286499,
      "learning_rate": 5.1276595744680856e-05,
      "loss": 0.3295,
      "step": 700
    },
    {
      "epoch": 0.7549175970228602,
      "grad_norm": 1.610378623008728,
      "learning_rate": 4.9148936170212766e-05,
      "loss": 0.3868,
      "step": 710
    },
    {
      "epoch": 0.7655502392344498,
      "grad_norm": 1.7490154504776,
      "learning_rate": 4.702127659574468e-05,
      "loss": 0.302,
      "step": 720
    },
    {
      "epoch": 0.7761828814460393,
      "grad_norm": 1.022546410560608,
      "learning_rate": 4.489361702127659e-05,
      "loss": 0.2969,
      "step": 730
    },
    {
      "epoch": 0.786815523657629,
      "grad_norm": 1.0458086729049683,
      "learning_rate": 4.276595744680851e-05,
      "loss": 0.3652,
      "step": 740
    },
    {
      "epoch": 0.7974481658692185,
      "grad_norm": 1.330607295036316,
      "learning_rate": 4.063829787234043e-05,
      "loss": 0.2184,
      "step": 750
    },
    {
      "epoch": 0.8080808080808081,
      "grad_norm": 1.8219746351242065,
      "learning_rate": 3.8510638297872344e-05,
      "loss": 0.2359,
      "step": 760
    },
    {
      "epoch": 0.8187134502923976,
      "grad_norm": 3.080618143081665,
      "learning_rate": 3.638297872340426e-05,
      "loss": 0.3008,
      "step": 770
    },
    {
      "epoch": 0.8293460925039873,
      "grad_norm": 2.212218999862671,
      "learning_rate": 3.425531914893617e-05,
      "loss": 0.2675,
      "step": 780
    },
    {
      "epoch": 0.8399787347155768,
      "grad_norm": 1.714879035949707,
      "learning_rate": 3.212765957446809e-05,
      "loss": 0.2914,
      "step": 790
    },
    {
      "epoch": 0.8506113769271664,
      "grad_norm": 2.811004161834717,
      "learning_rate": 3e-05,
      "loss": 0.2235,
      "step": 800
    },
    {
      "epoch": 0.861244019138756,
      "grad_norm": 2.3071866035461426,
      "learning_rate": 2.7872340425531918e-05,
      "loss": 0.2387,
      "step": 810
    },
    {
      "epoch": 0.8718766613503456,
      "grad_norm": 2.134385108947754,
      "learning_rate": 2.574468085106383e-05,
      "loss": 0.2365,
      "step": 820
    },
    {
      "epoch": 0.8825093035619351,
      "grad_norm": 1.6607768535614014,
      "learning_rate": 2.3617021276595748e-05,
      "loss": 0.3073,
      "step": 830
    },
    {
      "epoch": 0.8931419457735247,
      "grad_norm": 2.4962167739868164,
      "learning_rate": 2.148936170212766e-05,
      "loss": 0.2799,
      "step": 840
    },
    {
      "epoch": 0.9037745879851143,
      "grad_norm": 3.272426128387451,
      "learning_rate": 1.9361702127659575e-05,
      "loss": 0.2627,
      "step": 850
    },
    {
      "epoch": 0.9144072301967039,
      "grad_norm": 0.6173011064529419,
      "learning_rate": 1.723404255319149e-05,
      "loss": 0.2336,
      "step": 860
    },
    {
      "epoch": 0.9250398724082934,
      "grad_norm": 1.584494948387146,
      "learning_rate": 1.5106382978723405e-05,
      "loss": 0.2081,
      "step": 870
    },
    {
      "epoch": 0.935672514619883,
      "grad_norm": 0.16361036896705627,
      "learning_rate": 1.2978723404255318e-05,
      "loss": 0.3102,
      "step": 880
    },
    {
      "epoch": 0.9463051568314726,
      "grad_norm": 1.1021312475204468,
      "learning_rate": 1.0851063829787235e-05,
      "loss": 0.2414,
      "step": 890
    },
    {
      "epoch": 0.9569377990430622,
      "grad_norm": 1.4342254400253296,
      "learning_rate": 8.72340425531915e-06,
      "loss": 0.2268,
      "step": 900
    },
    {
      "epoch": 0.9675704412546517,
      "grad_norm": 3.8059401512145996,
      "learning_rate": 6.595744680851064e-06,
      "loss": 0.192,
      "step": 910
    },
    {
      "epoch": 0.9782030834662414,
      "grad_norm": 3.46911883354187,
      "learning_rate": 4.468085106382979e-06,
      "loss": 0.2758,
      "step": 920
    },
    {
      "epoch": 0.988835725677831,
      "grad_norm": 1.878474235534668,
      "learning_rate": 2.3404255319148935e-06,
      "loss": 0.2006,
      "step": 930
    },
    {
      "epoch": 0.9994683678894205,
      "grad_norm": 4.247520923614502,
      "learning_rate": 2.1276595744680852e-07,
      "loss": 0.2977,
      "step": 940
    }
  ],
  "logging_steps": 10,
  "max_steps": 940,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.314124516155392e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}