{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994683678894205, "eval_steps": 500, "global_step": 940, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01063264221158958, "grad_norm": 0.9233230352401733, "learning_rate": 0.00019808510638297873, "loss": 1.7298, "step": 10 }, { "epoch": 0.02126528442317916, "grad_norm": 0.9088625311851501, "learning_rate": 0.00019595744680851065, "loss": 1.2536, "step": 20 }, { "epoch": 0.03189792663476874, "grad_norm": 0.9639036655426025, "learning_rate": 0.00019382978723404257, "loss": 1.1759, "step": 30 }, { "epoch": 0.04253056884635832, "grad_norm": 0.9426536560058594, "learning_rate": 0.00019170212765957448, "loss": 1.0703, "step": 40 }, { "epoch": 0.0531632110579479, "grad_norm": 0.9788757562637329, "learning_rate": 0.0001895744680851064, "loss": 1.0505, "step": 50 }, { "epoch": 0.06379585326953748, "grad_norm": 1.2088581323623657, "learning_rate": 0.00018744680851063832, "loss": 1.0358, "step": 60 }, { "epoch": 0.07442849548112707, "grad_norm": 0.9232538342475891, "learning_rate": 0.0001853191489361702, "loss": 0.9859, "step": 70 }, { "epoch": 0.08506113769271664, "grad_norm": 1.2805695533752441, "learning_rate": 0.00018319148936170215, "loss": 0.9763, "step": 80 }, { "epoch": 0.09569377990430622, "grad_norm": 1.067161202430725, "learning_rate": 0.00018106382978723404, "loss": 1.0738, "step": 90 }, { "epoch": 0.1063264221158958, "grad_norm": 1.2387498617172241, "learning_rate": 0.00017893617021276596, "loss": 0.9779, "step": 100 }, { "epoch": 0.11695906432748537, "grad_norm": 1.0024847984313965, "learning_rate": 0.00017680851063829787, "loss": 0.9815, "step": 110 }, { "epoch": 0.12759170653907495, "grad_norm": 1.9225773811340332, "learning_rate": 0.0001746808510638298, "loss": 0.9203, "step": 120 }, { "epoch": 0.13822434875066453, "grad_norm": 1.3451205492019653, "learning_rate": 0.0001725531914893617, "loss": 0.9678, "step": 130 }, { "epoch": 0.14885699096225413, "grad_norm": 1.4681438207626343, "learning_rate": 0.00017042553191489362, "loss": 0.813, "step": 140 }, { "epoch": 0.1594896331738437, "grad_norm": 1.244214415550232, "learning_rate": 0.00016829787234042554, "loss": 0.7149, "step": 150 }, { "epoch": 0.17012227538543329, "grad_norm": 1.4099949598312378, "learning_rate": 0.00016617021276595746, "loss": 0.8472, "step": 160 }, { "epoch": 0.18075491759702286, "grad_norm": 1.4794244766235352, "learning_rate": 0.00016404255319148937, "loss": 0.6425, "step": 170 }, { "epoch": 0.19138755980861244, "grad_norm": 1.574625849723816, "learning_rate": 0.0001619148936170213, "loss": 0.7604, "step": 180 }, { "epoch": 0.20202020202020202, "grad_norm": 1.8564409017562866, "learning_rate": 0.0001597872340425532, "loss": 0.6777, "step": 190 }, { "epoch": 0.2126528442317916, "grad_norm": 1.1089837551116943, "learning_rate": 0.00015765957446808512, "loss": 0.6804, "step": 200 }, { "epoch": 0.22328548644338117, "grad_norm": 1.5858855247497559, "learning_rate": 0.00015553191489361701, "loss": 0.6962, "step": 210 }, { "epoch": 0.23391812865497075, "grad_norm": 1.0457383394241333, "learning_rate": 0.00015340425531914896, "loss": 0.6975, "step": 220 }, { "epoch": 0.24455077086656035, "grad_norm": 1.007315993309021, "learning_rate": 0.00015127659574468085, "loss": 0.6911, "step": 230 }, { "epoch": 0.2551834130781499, "grad_norm": 1.4641199111938477, "learning_rate": 0.00014914893617021276, "loss": 0.6968, "step": 240 }, { "epoch": 0.2658160552897395, "grad_norm": 1.127540111541748, "learning_rate": 0.00014702127659574468, "loss": 0.6226, "step": 250 }, { "epoch": 0.27644869750132905, "grad_norm": 1.841412901878357, "learning_rate": 0.0001448936170212766, "loss": 0.5781, "step": 260 }, { "epoch": 0.28708133971291866, "grad_norm": 1.6684510707855225, "learning_rate": 0.00014276595744680851, "loss": 0.6214, "step": 270 }, { "epoch": 0.29771398192450826, "grad_norm": 1.7632583379745483, "learning_rate": 0.00014063829787234043, "loss": 0.5107, "step": 280 }, { "epoch": 0.3083466241360978, "grad_norm": 1.3282111883163452, "learning_rate": 0.00013851063829787235, "loss": 0.6491, "step": 290 }, { "epoch": 0.3189792663476874, "grad_norm": 1.7586910724639893, "learning_rate": 0.00013638297872340427, "loss": 0.7237, "step": 300 }, { "epoch": 0.32961190855927697, "grad_norm": 1.5256597995758057, "learning_rate": 0.00013425531914893618, "loss": 0.6446, "step": 310 }, { "epoch": 0.34024455077086657, "grad_norm": 1.4499211311340332, "learning_rate": 0.0001321276595744681, "loss": 0.557, "step": 320 }, { "epoch": 0.3508771929824561, "grad_norm": 1.7797976732254028, "learning_rate": 0.00013000000000000002, "loss": 0.6162, "step": 330 }, { "epoch": 0.3615098351940457, "grad_norm": 1.2894353866577148, "learning_rate": 0.0001278723404255319, "loss": 0.4761, "step": 340 }, { "epoch": 0.3721424774056353, "grad_norm": 1.3315067291259766, "learning_rate": 0.00012574468085106382, "loss": 0.5865, "step": 350 }, { "epoch": 0.3827751196172249, "grad_norm": 1.4586937427520752, "learning_rate": 0.00012361702127659577, "loss": 0.5099, "step": 360 }, { "epoch": 0.3934077618288145, "grad_norm": 1.2912027835845947, "learning_rate": 0.00012148936170212766, "loss": 0.5801, "step": 370 }, { "epoch": 0.40404040404040403, "grad_norm": 1.2132781744003296, "learning_rate": 0.00011936170212765959, "loss": 0.3565, "step": 380 }, { "epoch": 0.41467304625199364, "grad_norm": 1.2837001085281372, "learning_rate": 0.0001172340425531915, "loss": 0.3634, "step": 390 }, { "epoch": 0.4253056884635832, "grad_norm": 1.6399765014648438, "learning_rate": 0.0001151063829787234, "loss": 0.5858, "step": 400 }, { "epoch": 0.4359383306751728, "grad_norm": 1.2120444774627686, "learning_rate": 0.00011297872340425532, "loss": 0.5533, "step": 410 }, { "epoch": 0.44657097288676234, "grad_norm": 2.2904655933380127, "learning_rate": 0.00011085106382978725, "loss": 0.5185, "step": 420 }, { "epoch": 0.45720361509835195, "grad_norm": 0.9909681081771851, "learning_rate": 0.00010872340425531916, "loss": 0.4418, "step": 430 }, { "epoch": 0.4678362573099415, "grad_norm": 1.9283276796340942, "learning_rate": 0.00010659574468085107, "loss": 0.458, "step": 440 }, { "epoch": 0.4784688995215311, "grad_norm": 1.5563241243362427, "learning_rate": 0.00010446808510638298, "loss": 0.3739, "step": 450 }, { "epoch": 0.4891015417331207, "grad_norm": 0.8688263893127441, "learning_rate": 0.0001023404255319149, "loss": 0.3839, "step": 460 }, { "epoch": 0.49973418394471025, "grad_norm": 1.012356162071228, "learning_rate": 0.00010021276595744682, "loss": 0.3637, "step": 470 }, { "epoch": 0.5103668261562998, "grad_norm": 1.2394040822982788, "learning_rate": 9.808510638297873e-05, "loss": 0.4456, "step": 480 }, { "epoch": 0.5209994683678895, "grad_norm": 2.0661351680755615, "learning_rate": 9.595744680851064e-05, "loss": 0.3375, "step": 490 }, { "epoch": 0.531632110579479, "grad_norm": 0.8300966024398804, "learning_rate": 9.382978723404256e-05, "loss": 0.2703, "step": 500 }, { "epoch": 0.5422647527910686, "grad_norm": 2.6386091709136963, "learning_rate": 9.170212765957448e-05, "loss": 0.3582, "step": 510 }, { "epoch": 0.5528973950026581, "grad_norm": 1.5658433437347412, "learning_rate": 8.95744680851064e-05, "loss": 0.5284, "step": 520 }, { "epoch": 0.5635300372142478, "grad_norm": 1.4143650531768799, "learning_rate": 8.74468085106383e-05, "loss": 0.3619, "step": 530 }, { "epoch": 0.5741626794258373, "grad_norm": 1.0321277379989624, "learning_rate": 8.531914893617021e-05, "loss": 0.449, "step": 540 }, { "epoch": 0.5847953216374269, "grad_norm": 1.4047714471817017, "learning_rate": 8.319148936170213e-05, "loss": 0.3873, "step": 550 }, { "epoch": 0.5954279638490165, "grad_norm": 1.176665186882019, "learning_rate": 8.106382978723405e-05, "loss": 0.3208, "step": 560 }, { "epoch": 0.6060606060606061, "grad_norm": 1.349563479423523, "learning_rate": 7.893617021276596e-05, "loss": 0.3534, "step": 570 }, { "epoch": 0.6166932482721956, "grad_norm": 1.899173617362976, "learning_rate": 7.680851063829788e-05, "loss": 0.4149, "step": 580 }, { "epoch": 0.6273258904837852, "grad_norm": 1.041756272315979, "learning_rate": 7.46808510638298e-05, "loss": 0.2761, "step": 590 }, { "epoch": 0.6379585326953748, "grad_norm": 1.1541553735733032, "learning_rate": 7.25531914893617e-05, "loss": 0.3183, "step": 600 }, { "epoch": 0.6485911749069644, "grad_norm": 1.7732151746749878, "learning_rate": 7.042553191489362e-05, "loss": 0.4356, "step": 610 }, { "epoch": 0.6592238171185539, "grad_norm": 2.6027865409851074, "learning_rate": 6.829787234042554e-05, "loss": 0.5153, "step": 620 }, { "epoch": 0.6698564593301436, "grad_norm": 1.1163185834884644, "learning_rate": 6.617021276595745e-05, "loss": 0.3758, "step": 630 }, { "epoch": 0.6804891015417331, "grad_norm": 0.8950490355491638, "learning_rate": 6.404255319148937e-05, "loss": 0.3092, "step": 640 }, { "epoch": 0.6911217437533227, "grad_norm": 0.8353213667869568, "learning_rate": 6.191489361702127e-05, "loss": 0.2713, "step": 650 }, { "epoch": 0.7017543859649122, "grad_norm": 0.9185741543769836, "learning_rate": 5.9787234042553196e-05, "loss": 0.2692, "step": 660 }, { "epoch": 0.7123870281765019, "grad_norm": 1.5412646532058716, "learning_rate": 5.7659574468085106e-05, "loss": 0.252, "step": 670 }, { "epoch": 0.7230196703880915, "grad_norm": 1.210580825805664, "learning_rate": 5.553191489361702e-05, "loss": 0.2394, "step": 680 }, { "epoch": 0.733652312599681, "grad_norm": 2.3778483867645264, "learning_rate": 5.3404255319148946e-05, "loss": 0.2672, "step": 690 }, { "epoch": 0.7442849548112705, "grad_norm": 2.204791784286499, "learning_rate": 5.1276595744680856e-05, "loss": 0.3295, "step": 700 }, { "epoch": 0.7549175970228602, "grad_norm": 1.610378623008728, "learning_rate": 4.9148936170212766e-05, "loss": 0.3868, "step": 710 }, { "epoch": 0.7655502392344498, "grad_norm": 1.7490154504776, "learning_rate": 4.702127659574468e-05, "loss": 0.302, "step": 720 }, { "epoch": 0.7761828814460393, "grad_norm": 1.022546410560608, "learning_rate": 4.489361702127659e-05, "loss": 0.2969, "step": 730 }, { "epoch": 0.786815523657629, "grad_norm": 1.0458086729049683, "learning_rate": 4.276595744680851e-05, "loss": 0.3652, "step": 740 }, { "epoch": 0.7974481658692185, "grad_norm": 1.330607295036316, "learning_rate": 4.063829787234043e-05, "loss": 0.2184, "step": 750 }, { "epoch": 0.8080808080808081, "grad_norm": 1.8219746351242065, "learning_rate": 3.8510638297872344e-05, "loss": 0.2359, "step": 760 }, { "epoch": 0.8187134502923976, "grad_norm": 3.080618143081665, "learning_rate": 3.638297872340426e-05, "loss": 0.3008, "step": 770 }, { "epoch": 0.8293460925039873, "grad_norm": 2.212218999862671, "learning_rate": 3.425531914893617e-05, "loss": 0.2675, "step": 780 }, { "epoch": 0.8399787347155768, "grad_norm": 1.714879035949707, "learning_rate": 3.212765957446809e-05, "loss": 0.2914, "step": 790 }, { "epoch": 0.8506113769271664, "grad_norm": 2.811004161834717, "learning_rate": 3e-05, "loss": 0.2235, "step": 800 }, { "epoch": 0.861244019138756, "grad_norm": 2.3071866035461426, "learning_rate": 2.7872340425531918e-05, "loss": 0.2387, "step": 810 }, { "epoch": 0.8718766613503456, "grad_norm": 2.134385108947754, "learning_rate": 2.574468085106383e-05, "loss": 0.2365, "step": 820 }, { "epoch": 0.8825093035619351, "grad_norm": 1.6607768535614014, "learning_rate": 2.3617021276595748e-05, "loss": 0.3073, "step": 830 }, { "epoch": 0.8931419457735247, "grad_norm": 2.4962167739868164, "learning_rate": 2.148936170212766e-05, "loss": 0.2799, "step": 840 }, { "epoch": 0.9037745879851143, "grad_norm": 3.272426128387451, "learning_rate": 1.9361702127659575e-05, "loss": 0.2627, "step": 850 }, { "epoch": 0.9144072301967039, "grad_norm": 0.6173011064529419, "learning_rate": 1.723404255319149e-05, "loss": 0.2336, "step": 860 }, { "epoch": 0.9250398724082934, "grad_norm": 1.584494948387146, "learning_rate": 1.5106382978723405e-05, "loss": 0.2081, "step": 870 }, { "epoch": 0.935672514619883, "grad_norm": 0.16361036896705627, "learning_rate": 1.2978723404255318e-05, "loss": 0.3102, "step": 880 }, { "epoch": 0.9463051568314726, "grad_norm": 1.1021312475204468, "learning_rate": 1.0851063829787235e-05, "loss": 0.2414, "step": 890 }, { "epoch": 0.9569377990430622, "grad_norm": 1.4342254400253296, "learning_rate": 8.72340425531915e-06, "loss": 0.2268, "step": 900 }, { "epoch": 0.9675704412546517, "grad_norm": 3.8059401512145996, "learning_rate": 6.595744680851064e-06, "loss": 0.192, "step": 910 }, { "epoch": 0.9782030834662414, "grad_norm": 3.46911883354187, "learning_rate": 4.468085106382979e-06, "loss": 0.2758, "step": 920 }, { "epoch": 0.988835725677831, "grad_norm": 1.878474235534668, "learning_rate": 2.3404255319148935e-06, "loss": 0.2006, "step": 930 }, { "epoch": 0.9994683678894205, "grad_norm": 4.247520923614502, "learning_rate": 2.1276595744680852e-07, "loss": 0.2977, "step": 940 } ], "logging_steps": 10, "max_steps": 940, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.314124516155392e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }