{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9869451697127936, "eval_steps": 500, "global_step": 573, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05221932114882506, "grad_norm": 5.394797325134277, "learning_rate": 1.9650959860383945e-05, "loss": 1.2108, "step": 10 }, { "epoch": 0.10443864229765012, "grad_norm": 4.011317729949951, "learning_rate": 1.930191972076789e-05, "loss": 0.9376, "step": 20 }, { "epoch": 0.1566579634464752, "grad_norm": 3.838289260864258, "learning_rate": 1.895287958115183e-05, "loss": 0.7876, "step": 30 }, { "epoch": 0.20887728459530025, "grad_norm": 4.1263933181762695, "learning_rate": 1.8603839441535778e-05, "loss": 0.7373, "step": 40 }, { "epoch": 0.26109660574412535, "grad_norm": 3.705490827560425, "learning_rate": 1.825479930191972e-05, "loss": 0.6564, "step": 50 }, { "epoch": 0.3133159268929504, "grad_norm": 3.4956674575805664, "learning_rate": 1.7905759162303668e-05, "loss": 0.6102, "step": 60 }, { "epoch": 0.36553524804177545, "grad_norm": 3.823296308517456, "learning_rate": 1.755671902268761e-05, "loss": 0.4846, "step": 70 }, { "epoch": 0.4177545691906005, "grad_norm": 2.9303112030029297, "learning_rate": 1.7207678883071554e-05, "loss": 0.447, "step": 80 }, { "epoch": 0.4699738903394256, "grad_norm": 3.7392730712890625, "learning_rate": 1.6858638743455497e-05, "loss": 0.3669, "step": 90 }, { "epoch": 0.5221932114882507, "grad_norm": 3.2671146392822266, "learning_rate": 1.6509598603839444e-05, "loss": 0.34, "step": 100 }, { "epoch": 0.5744125326370757, "grad_norm": 3.2053961753845215, "learning_rate": 1.6160558464223387e-05, "loss": 0.2791, "step": 110 }, { "epoch": 0.6266318537859008, "grad_norm": 2.1108620166778564, "learning_rate": 1.581151832460733e-05, "loss": 0.2593, "step": 120 }, { "epoch": 0.6788511749347258, "grad_norm": 3.154852867126465, "learning_rate": 1.5462478184991274e-05, "loss": 0.2308, "step": 130 }, { "epoch": 0.7310704960835509, "grad_norm": 2.451120376586914, "learning_rate": 1.511343804537522e-05, "loss": 0.2126, "step": 140 }, { "epoch": 0.783289817232376, "grad_norm": 3.1815454959869385, "learning_rate": 1.4764397905759162e-05, "loss": 0.1987, "step": 150 }, { "epoch": 0.835509138381201, "grad_norm": 2.3058295249938965, "learning_rate": 1.4415357766143108e-05, "loss": 0.2129, "step": 160 }, { "epoch": 0.8877284595300261, "grad_norm": 2.5228800773620605, "learning_rate": 1.4066317626527052e-05, "loss": 0.2154, "step": 170 }, { "epoch": 0.9399477806788512, "grad_norm": 3.707547426223755, "learning_rate": 1.3717277486910996e-05, "loss": 0.1779, "step": 180 }, { "epoch": 0.9921671018276762, "grad_norm": 1.675297737121582, "learning_rate": 1.336823734729494e-05, "loss": 0.1925, "step": 190 }, { "epoch": 1.04177545691906, "grad_norm": 2.4128589630126953, "learning_rate": 1.3019197207678885e-05, "loss": 0.1193, "step": 200 }, { "epoch": 1.0939947780678851, "grad_norm": 1.474452018737793, "learning_rate": 1.2670157068062828e-05, "loss": 0.1135, "step": 210 }, { "epoch": 1.1462140992167102, "grad_norm": 1.6879887580871582, "learning_rate": 1.2321116928446773e-05, "loss": 0.1064, "step": 220 }, { "epoch": 1.1984334203655354, "grad_norm": 1.3061121702194214, "learning_rate": 1.1972076788830716e-05, "loss": 0.1125, "step": 230 }, { "epoch": 1.2506527415143602, "grad_norm": 1.8820029497146606, "learning_rate": 1.162303664921466e-05, "loss": 0.1156, "step": 240 }, { "epoch": 1.3028720626631853, "grad_norm": 2.1106035709381104, "learning_rate": 1.1273996509598604e-05, "loss": 0.1112, "step": 250 }, { "epoch": 1.3550913838120104, "grad_norm": 1.6610087156295776, "learning_rate": 1.0924956369982549e-05, "loss": 0.1147, "step": 260 }, { "epoch": 1.4073107049608355, "grad_norm": 1.0979489088058472, "learning_rate": 1.0575916230366492e-05, "loss": 0.0959, "step": 270 }, { "epoch": 1.4595300261096606, "grad_norm": 1.7349355220794678, "learning_rate": 1.0226876090750437e-05, "loss": 0.0959, "step": 280 }, { "epoch": 1.5117493472584855, "grad_norm": 1.3557089567184448, "learning_rate": 9.877835951134382e-06, "loss": 0.0961, "step": 290 }, { "epoch": 1.5639686684073109, "grad_norm": 2.409390687942505, "learning_rate": 9.528795811518325e-06, "loss": 0.1122, "step": 300 }, { "epoch": 1.6161879895561357, "grad_norm": 2.6092705726623535, "learning_rate": 9.17975567190227e-06, "loss": 0.1012, "step": 310 }, { "epoch": 1.6684073107049608, "grad_norm": 0.9906852841377258, "learning_rate": 8.830715532286213e-06, "loss": 0.0949, "step": 320 }, { "epoch": 1.720626631853786, "grad_norm": 1.7950801849365234, "learning_rate": 8.481675392670158e-06, "loss": 0.108, "step": 330 }, { "epoch": 1.7728459530026108, "grad_norm": 5.53033971786499, "learning_rate": 8.132635253054101e-06, "loss": 0.0999, "step": 340 }, { "epoch": 1.8250652741514362, "grad_norm": 1.3689807653427124, "learning_rate": 7.783595113438046e-06, "loss": 0.0848, "step": 350 }, { "epoch": 1.877284595300261, "grad_norm": 0.9605047702789307, "learning_rate": 7.43455497382199e-06, "loss": 0.0889, "step": 360 }, { "epoch": 1.9295039164490861, "grad_norm": 2.4679007530212402, "learning_rate": 7.0855148342059345e-06, "loss": 0.0942, "step": 370 }, { "epoch": 1.9817232375979112, "grad_norm": 1.0207858085632324, "learning_rate": 6.7364746945898785e-06, "loss": 0.0897, "step": 380 }, { "epoch": 2.031331592689295, "grad_norm": 0.7235077023506165, "learning_rate": 6.3874345549738226e-06, "loss": 0.0767, "step": 390 }, { "epoch": 2.08355091383812, "grad_norm": 1.5406379699707031, "learning_rate": 6.038394415357767e-06, "loss": 0.064, "step": 400 }, { "epoch": 2.1357702349869454, "grad_norm": 1.5837807655334473, "learning_rate": 5.689354275741711e-06, "loss": 0.0624, "step": 410 }, { "epoch": 2.1879895561357703, "grad_norm": 0.8071414828300476, "learning_rate": 5.340314136125655e-06, "loss": 0.0618, "step": 420 }, { "epoch": 2.240208877284595, "grad_norm": 1.0322142839431763, "learning_rate": 4.991273996509599e-06, "loss": 0.0626, "step": 430 }, { "epoch": 2.2924281984334205, "grad_norm": 0.9583885073661804, "learning_rate": 4.642233856893543e-06, "loss": 0.0588, "step": 440 }, { "epoch": 2.3446475195822454, "grad_norm": 1.3839960098266602, "learning_rate": 4.293193717277487e-06, "loss": 0.0609, "step": 450 }, { "epoch": 2.3968668407310707, "grad_norm": 1.018333911895752, "learning_rate": 3.944153577661432e-06, "loss": 0.0627, "step": 460 }, { "epoch": 2.4490861618798956, "grad_norm": 0.6483064889907837, "learning_rate": 3.5951134380453755e-06, "loss": 0.0635, "step": 470 }, { "epoch": 2.5013054830287205, "grad_norm": 1.3443909883499146, "learning_rate": 3.2460732984293196e-06, "loss": 0.0599, "step": 480 }, { "epoch": 2.553524804177546, "grad_norm": 0.8122425675392151, "learning_rate": 2.897033158813264e-06, "loss": 0.0606, "step": 490 }, { "epoch": 2.6057441253263707, "grad_norm": 0.841248631477356, "learning_rate": 2.547993019197208e-06, "loss": 0.0583, "step": 500 }, { "epoch": 2.657963446475196, "grad_norm": 0.8031327128410339, "learning_rate": 2.198952879581152e-06, "loss": 0.0601, "step": 510 }, { "epoch": 2.710182767624021, "grad_norm": 0.9310224056243896, "learning_rate": 1.8499127399650962e-06, "loss": 0.0605, "step": 520 }, { "epoch": 2.7624020887728458, "grad_norm": 0.9988183379173279, "learning_rate": 1.5008726003490403e-06, "loss": 0.0607, "step": 530 }, { "epoch": 2.814621409921671, "grad_norm": 0.6764442920684814, "learning_rate": 1.1518324607329843e-06, "loss": 0.0577, "step": 540 }, { "epoch": 2.866840731070496, "grad_norm": 0.7667761445045471, "learning_rate": 8.027923211169285e-07, "loss": 0.0605, "step": 550 }, { "epoch": 2.9190600522193213, "grad_norm": 1.5497705936431885, "learning_rate": 4.537521815008726e-07, "loss": 0.059, "step": 560 }, { "epoch": 2.971279373368146, "grad_norm": 0.8243622779846191, "learning_rate": 1.0471204188481677e-07, "loss": 0.055, "step": 570 } ], "logging_steps": 10, "max_steps": 573, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4542744030543872e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }