{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9869451697127936,
  "eval_steps": 500,
  "global_step": 573,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05221932114882506,
      "grad_norm": 5.394797325134277,
      "learning_rate": 1.9650959860383945e-05,
      "loss": 1.2108,
      "step": 10
    },
    {
      "epoch": 0.10443864229765012,
      "grad_norm": 4.011317729949951,
      "learning_rate": 1.930191972076789e-05,
      "loss": 0.9376,
      "step": 20
    },
    {
      "epoch": 0.1566579634464752,
      "grad_norm": 3.838289260864258,
      "learning_rate": 1.895287958115183e-05,
      "loss": 0.7876,
      "step": 30
    },
    {
      "epoch": 0.20887728459530025,
      "grad_norm": 4.1263933181762695,
      "learning_rate": 1.8603839441535778e-05,
      "loss": 0.7373,
      "step": 40
    },
    {
      "epoch": 0.26109660574412535,
      "grad_norm": 3.705490827560425,
      "learning_rate": 1.825479930191972e-05,
      "loss": 0.6564,
      "step": 50
    },
    {
      "epoch": 0.3133159268929504,
      "grad_norm": 3.4956674575805664,
      "learning_rate": 1.7905759162303668e-05,
      "loss": 0.6102,
      "step": 60
    },
    {
      "epoch": 0.36553524804177545,
      "grad_norm": 3.823296308517456,
      "learning_rate": 1.755671902268761e-05,
      "loss": 0.4846,
      "step": 70
    },
    {
      "epoch": 0.4177545691906005,
      "grad_norm": 2.9303112030029297,
      "learning_rate": 1.7207678883071554e-05,
      "loss": 0.447,
      "step": 80
    },
    {
      "epoch": 0.4699738903394256,
      "grad_norm": 3.7392730712890625,
      "learning_rate": 1.6858638743455497e-05,
      "loss": 0.3669,
      "step": 90
    },
    {
      "epoch": 0.5221932114882507,
      "grad_norm": 3.2671146392822266,
      "learning_rate": 1.6509598603839444e-05,
      "loss": 0.34,
      "step": 100
    },
    {
      "epoch": 0.5744125326370757,
      "grad_norm": 3.2053961753845215,
      "learning_rate": 1.6160558464223387e-05,
      "loss": 0.2791,
      "step": 110
    },
    {
      "epoch": 0.6266318537859008,
      "grad_norm": 2.1108620166778564,
      "learning_rate": 1.581151832460733e-05,
      "loss": 0.2593,
      "step": 120
    },
    {
      "epoch": 0.6788511749347258,
      "grad_norm": 3.154852867126465,
      "learning_rate": 1.5462478184991274e-05,
      "loss": 0.2308,
      "step": 130
    },
    {
      "epoch": 0.7310704960835509,
      "grad_norm": 2.451120376586914,
      "learning_rate": 1.511343804537522e-05,
      "loss": 0.2126,
      "step": 140
    },
    {
      "epoch": 0.783289817232376,
      "grad_norm": 3.1815454959869385,
      "learning_rate": 1.4764397905759162e-05,
      "loss": 0.1987,
      "step": 150
    },
    {
      "epoch": 0.835509138381201,
      "grad_norm": 2.3058295249938965,
      "learning_rate": 1.4415357766143108e-05,
      "loss": 0.2129,
      "step": 160
    },
    {
      "epoch": 0.8877284595300261,
      "grad_norm": 2.5228800773620605,
      "learning_rate": 1.4066317626527052e-05,
      "loss": 0.2154,
      "step": 170
    },
    {
      "epoch": 0.9399477806788512,
      "grad_norm": 3.707547426223755,
      "learning_rate": 1.3717277486910996e-05,
      "loss": 0.1779,
      "step": 180
    },
    {
      "epoch": 0.9921671018276762,
      "grad_norm": 1.675297737121582,
      "learning_rate": 1.336823734729494e-05,
      "loss": 0.1925,
      "step": 190
    },
    {
      "epoch": 1.04177545691906,
      "grad_norm": 2.4128589630126953,
      "learning_rate": 1.3019197207678885e-05,
      "loss": 0.1193,
      "step": 200
    },
    {
      "epoch": 1.0939947780678851,
      "grad_norm": 1.474452018737793,
      "learning_rate": 1.2670157068062828e-05,
      "loss": 0.1135,
      "step": 210
    },
    {
      "epoch": 1.1462140992167102,
      "grad_norm": 1.6879887580871582,
      "learning_rate": 1.2321116928446773e-05,
      "loss": 0.1064,
      "step": 220
    },
    {
      "epoch": 1.1984334203655354,
      "grad_norm": 1.3061121702194214,
      "learning_rate": 1.1972076788830716e-05,
      "loss": 0.1125,
      "step": 230
    },
    {
      "epoch": 1.2506527415143602,
      "grad_norm": 1.8820029497146606,
      "learning_rate": 1.162303664921466e-05,
      "loss": 0.1156,
      "step": 240
    },
    {
      "epoch": 1.3028720626631853,
      "grad_norm": 2.1106035709381104,
      "learning_rate": 1.1273996509598604e-05,
      "loss": 0.1112,
      "step": 250
    },
    {
      "epoch": 1.3550913838120104,
      "grad_norm": 1.6610087156295776,
      "learning_rate": 1.0924956369982549e-05,
      "loss": 0.1147,
      "step": 260
    },
    {
      "epoch": 1.4073107049608355,
      "grad_norm": 1.0979489088058472,
      "learning_rate": 1.0575916230366492e-05,
      "loss": 0.0959,
      "step": 270
    },
    {
      "epoch": 1.4595300261096606,
      "grad_norm": 1.7349355220794678,
      "learning_rate": 1.0226876090750437e-05,
      "loss": 0.0959,
      "step": 280
    },
    {
      "epoch": 1.5117493472584855,
      "grad_norm": 1.3557089567184448,
      "learning_rate": 9.877835951134382e-06,
      "loss": 0.0961,
      "step": 290
    },
    {
      "epoch": 1.5639686684073109,
      "grad_norm": 2.409390687942505,
      "learning_rate": 9.528795811518325e-06,
      "loss": 0.1122,
      "step": 300
    },
    {
      "epoch": 1.6161879895561357,
      "grad_norm": 2.6092705726623535,
      "learning_rate": 9.17975567190227e-06,
      "loss": 0.1012,
      "step": 310
    },
    {
      "epoch": 1.6684073107049608,
      "grad_norm": 0.9906852841377258,
      "learning_rate": 8.830715532286213e-06,
      "loss": 0.0949,
      "step": 320
    },
    {
      "epoch": 1.720626631853786,
      "grad_norm": 1.7950801849365234,
      "learning_rate": 8.481675392670158e-06,
      "loss": 0.108,
      "step": 330
    },
    {
      "epoch": 1.7728459530026108,
      "grad_norm": 5.53033971786499,
      "learning_rate": 8.132635253054101e-06,
      "loss": 0.0999,
      "step": 340
    },
    {
      "epoch": 1.8250652741514362,
      "grad_norm": 1.3689807653427124,
      "learning_rate": 7.783595113438046e-06,
      "loss": 0.0848,
      "step": 350
    },
    {
      "epoch": 1.877284595300261,
      "grad_norm": 0.9605047702789307,
      "learning_rate": 7.43455497382199e-06,
      "loss": 0.0889,
      "step": 360
    },
    {
      "epoch": 1.9295039164490861,
      "grad_norm": 2.4679007530212402,
      "learning_rate": 7.0855148342059345e-06,
      "loss": 0.0942,
      "step": 370
    },
    {
      "epoch": 1.9817232375979112,
      "grad_norm": 1.0207858085632324,
      "learning_rate": 6.7364746945898785e-06,
      "loss": 0.0897,
      "step": 380
    },
    {
      "epoch": 2.031331592689295,
      "grad_norm": 0.7235077023506165,
      "learning_rate": 6.3874345549738226e-06,
      "loss": 0.0767,
      "step": 390
    },
    {
      "epoch": 2.08355091383812,
      "grad_norm": 1.5406379699707031,
      "learning_rate": 6.038394415357767e-06,
      "loss": 0.064,
      "step": 400
    },
    {
      "epoch": 2.1357702349869454,
      "grad_norm": 1.5837807655334473,
      "learning_rate": 5.689354275741711e-06,
      "loss": 0.0624,
      "step": 410
    },
    {
      "epoch": 2.1879895561357703,
      "grad_norm": 0.8071414828300476,
      "learning_rate": 5.340314136125655e-06,
      "loss": 0.0618,
      "step": 420
    },
    {
      "epoch": 2.240208877284595,
      "grad_norm": 1.0322142839431763,
      "learning_rate": 4.991273996509599e-06,
      "loss": 0.0626,
      "step": 430
    },
    {
      "epoch": 2.2924281984334205,
      "grad_norm": 0.9583885073661804,
      "learning_rate": 4.642233856893543e-06,
      "loss": 0.0588,
      "step": 440
    },
    {
      "epoch": 2.3446475195822454,
      "grad_norm": 1.3839960098266602,
      "learning_rate": 4.293193717277487e-06,
      "loss": 0.0609,
      "step": 450
    },
    {
      "epoch": 2.3968668407310707,
      "grad_norm": 1.018333911895752,
      "learning_rate": 3.944153577661432e-06,
      "loss": 0.0627,
      "step": 460
    },
    {
      "epoch": 2.4490861618798956,
      "grad_norm": 0.6483064889907837,
      "learning_rate": 3.5951134380453755e-06,
      "loss": 0.0635,
      "step": 470
    },
    {
      "epoch": 2.5013054830287205,
      "grad_norm": 1.3443909883499146,
      "learning_rate": 3.2460732984293196e-06,
      "loss": 0.0599,
      "step": 480
    },
    {
      "epoch": 2.553524804177546,
      "grad_norm": 0.8122425675392151,
      "learning_rate": 2.897033158813264e-06,
      "loss": 0.0606,
      "step": 490
    },
    {
      "epoch": 2.6057441253263707,
      "grad_norm": 0.841248631477356,
      "learning_rate": 2.547993019197208e-06,
      "loss": 0.0583,
      "step": 500
    },
    {
      "epoch": 2.657963446475196,
      "grad_norm": 0.8031327128410339,
      "learning_rate": 2.198952879581152e-06,
      "loss": 0.0601,
      "step": 510
    },
    {
      "epoch": 2.710182767624021,
      "grad_norm": 0.9310224056243896,
      "learning_rate": 1.8499127399650962e-06,
      "loss": 0.0605,
      "step": 520
    },
    {
      "epoch": 2.7624020887728458,
      "grad_norm": 0.9988183379173279,
      "learning_rate": 1.5008726003490403e-06,
      "loss": 0.0607,
      "step": 530
    },
    {
      "epoch": 2.814621409921671,
      "grad_norm": 0.6764442920684814,
      "learning_rate": 1.1518324607329843e-06,
      "loss": 0.0577,
      "step": 540
    },
    {
      "epoch": 2.866840731070496,
      "grad_norm": 0.7667761445045471,
      "learning_rate": 8.027923211169285e-07,
      "loss": 0.0605,
      "step": 550
    },
    {
      "epoch": 2.9190600522193213,
      "grad_norm": 1.5497705936431885,
      "learning_rate": 4.537521815008726e-07,
      "loss": 0.059,
      "step": 560
    },
    {
      "epoch": 2.971279373368146,
      "grad_norm": 0.8243622779846191,
      "learning_rate": 1.0471204188481677e-07,
      "loss": 0.055,
      "step": 570
    }
  ],
  "logging_steps": 10,
  "max_steps": 573,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.4542744030543872e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}