{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9946917109024094, "eval_steps": 612, "global_step": 2448, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08166598611678236, "grad_norm": 1.7321007251739502, "learning_rate": 2.5e-05, "loss": 3.105, "step": 50 }, { "epoch": 0.16333197223356472, "grad_norm": 2.5774662494659424, "learning_rate": 5e-05, "loss": 2.1147, "step": 100 }, { "epoch": 0.24499795835034707, "grad_norm": 2.3063881397247314, "learning_rate": 4.9964806340932865e-05, "loss": 1.7456, "step": 150 }, { "epoch": 0.32666394446712943, "grad_norm": 2.3037455081939697, "learning_rate": 4.985932445122257e-05, "loss": 1.5373, "step": 200 }, { "epoch": 0.4083299305839118, "grad_norm": 3.1852424144744873, "learning_rate": 4.968385131436222e-05, "loss": 1.4322, "step": 250 }, { "epoch": 0.48999591670069415, "grad_norm": 2.7717628479003906, "learning_rate": 4.943888097369216e-05, "loss": 1.3964, "step": 300 }, { "epoch": 0.5716619028174765, "grad_norm": 2.9692201614379883, "learning_rate": 4.912510314142448e-05, "loss": 1.3724, "step": 350 }, { "epoch": 0.6533278889342589, "grad_norm": 2.7853329181671143, "learning_rate": 4.87434012567633e-05, "loss": 1.2211, "step": 400 }, { "epoch": 0.7349938750510412, "grad_norm": 3.02131986618042, "learning_rate": 4.829484999858815e-05, "loss": 1.2873, "step": 450 }, { "epoch": 0.8166598611678236, "grad_norm": 2.9002573490142822, "learning_rate": 4.77807122597034e-05, "loss": 1.1954, "step": 500 }, { "epoch": 0.898325847284606, "grad_norm": 3.3557584285736084, "learning_rate": 4.7202435591172676e-05, "loss": 1.1836, "step": 550 }, { "epoch": 0.9799918334013883, "grad_norm": 2.2068400382995605, "learning_rate": 4.656164812674951e-05, "loss": 1.1539, "step": 600 }, { "epoch": 0.9995916700694161, "eval_loss": 1.0969716310501099, "eval_runtime": 142.2066, "eval_samples_per_second": 3.832, "eval_steps_per_second": 3.832, "step": 612 }, { "epoch": 1.060432829726419, "grad_norm": 3.3319382667541504, "learning_rate": 4.5860153998878494e-05, "loss": 1.0427, "step": 650 }, { "epoch": 1.1420988158432013, "grad_norm": 3.4751930236816406, "learning_rate": 4.5099928259173516e-05, "loss": 0.9736, "step": 700 }, { "epoch": 1.2237648019599836, "grad_norm": 2.9710006713867188, "learning_rate": 4.4283111317674374e-05, "loss": 1.0045, "step": 750 }, { "epoch": 1.305430788076766, "grad_norm": 3.5268781185150146, "learning_rate": 4.341200291653781e-05, "loss": 0.9427, "step": 800 }, { "epoch": 1.3870967741935485, "grad_norm": 2.7967159748077393, "learning_rate": 4.248905565513023e-05, "loss": 0.9968, "step": 850 }, { "epoch": 1.4687627603103308, "grad_norm": 3.274461030960083, "learning_rate": 4.151686808475204e-05, "loss": 0.8969, "step": 900 }, { "epoch": 1.5504287464271131, "grad_norm": 3.7996115684509277, "learning_rate": 4.049817739243532e-05, "loss": 0.9398, "step": 950 }, { "epoch": 1.6320947325438955, "grad_norm": 3.434795379638672, "learning_rate": 3.94358516944137e-05, "loss": 0.9255, "step": 1000 }, { "epoch": 1.7137607186606778, "grad_norm": 2.722684383392334, "learning_rate": 3.833288196096194e-05, "loss": 0.9775, "step": 1050 }, { "epoch": 1.7954267047774601, "grad_norm": 4.152195930480957, "learning_rate": 3.719237359534087e-05, "loss": 0.8806, "step": 1100 }, { "epoch": 1.8770926908942425, "grad_norm": 2.783254623413086, "learning_rate": 3.6017537690557115e-05, "loss": 0.9433, "step": 1150 }, { "epoch": 1.958758677011025, "grad_norm": 3.3357818126678467, "learning_rate": 3.481168198855409e-05, "loss": 0.9619, "step": 1200 }, { "epoch": 1.9979583503470804, "eval_loss": 0.9826937913894653, "eval_runtime": 141.9315, "eval_samples_per_second": 3.84, "eval_steps_per_second": 3.84, "step": 1224 }, { "epoch": 2.0391996733360553, "grad_norm": 3.218412160873413, "learning_rate": 3.357820156728867e-05, "loss": 0.8787, "step": 1250 }, { "epoch": 2.120865659452838, "grad_norm": 3.419764280319214, "learning_rate": 3.232056928191376e-05, "loss": 0.7141, "step": 1300 }, { "epoch": 2.2025316455696204, "grad_norm": 3.141345739364624, "learning_rate": 3.1042325986980064e-05, "loss": 0.7485, "step": 1350 }, { "epoch": 2.2841976316864026, "grad_norm": 2.911731719970703, "learning_rate": 2.974707056718571e-05, "loss": 0.7787, "step": 1400 }, { "epoch": 2.365863617803185, "grad_norm": 3.447298765182495, "learning_rate": 2.8438449804742628e-05, "loss": 0.7358, "step": 1450 }, { "epoch": 2.447529603919967, "grad_norm": 3.8683292865753174, "learning_rate": 2.7120148111887732e-05, "loss": 0.7281, "step": 1500 }, { "epoch": 2.5291955900367498, "grad_norm": 3.7542524337768555, "learning_rate": 2.579587715744712e-05, "loss": 0.7169, "step": 1550 }, { "epoch": 2.610861576153532, "grad_norm": 2.9598772525787354, "learning_rate": 2.446936541665941e-05, "loss": 0.7425, "step": 1600 }, { "epoch": 2.6925275622703144, "grad_norm": 3.7450504302978516, "learning_rate": 2.3144347673680936e-05, "loss": 0.699, "step": 1650 }, { "epoch": 2.774193548387097, "grad_norm": 3.8803372383117676, "learning_rate": 2.182455450632803e-05, "loss": 0.751, "step": 1700 }, { "epoch": 2.855859534503879, "grad_norm": 3.553741216659546, "learning_rate": 2.0513701782662366e-05, "loss": 0.7451, "step": 1750 }, { "epoch": 2.9375255206206616, "grad_norm": 3.795320987701416, "learning_rate": 1.9215480198991466e-05, "loss": 0.7101, "step": 1800 }, { "epoch": 2.9963250306247446, "eval_loss": 0.9423996210098267, "eval_runtime": 141.7974, "eval_samples_per_second": 3.844, "eval_steps_per_second": 3.844, "step": 1836 }, { "epoch": 3.017966516945692, "grad_norm": 3.2628490924835205, "learning_rate": 1.793354488874006e-05, "loss": 0.681, "step": 1850 }, { "epoch": 3.0996325030624745, "grad_norm": 3.2649693489074707, "learning_rate": 1.6696527865384627e-05, "loss": 0.6026, "step": 1900 }, { "epoch": 3.181298489179257, "grad_norm": 2.9757375717163086, "learning_rate": 1.5457433537180068e-05, "loss": 0.5814, "step": 1950 }, { "epoch": 3.262964475296039, "grad_norm": 2.891662836074829, "learning_rate": 1.424520623543294e-05, "loss": 0.5761, "step": 2000 }, { "epoch": 3.3446304614128217, "grad_norm": 3.558239221572876, "learning_rate": 1.3063258977292814e-05, "loss": 0.5863, "step": 2050 }, { "epoch": 3.426296447529604, "grad_norm": 3.837745428085327, "learning_rate": 1.1914919526666754e-05, "loss": 0.5972, "step": 2100 }, { "epoch": 3.5079624336463864, "grad_norm": 3.7010719776153564, "learning_rate": 1.0803421024924246e-05, "loss": 0.5797, "step": 2150 }, { "epoch": 3.5896284197631685, "grad_norm": 3.509216785430908, "learning_rate": 9.731892888011207e-06, "loss": 0.5959, "step": 2200 }, { "epoch": 3.671294405879951, "grad_norm": 4.380457878112793, "learning_rate": 8.703351995602158e-06, "loss": 0.5605, "step": 2250 }, { "epoch": 3.7529603919967336, "grad_norm": 3.9303319454193115, "learning_rate": 7.720694197097405e-06, "loss": 0.5507, "step": 2300 }, { "epoch": 3.8346263781135157, "grad_norm": 3.130398988723755, "learning_rate": 6.786686158380176e-06, "loss": 0.5661, "step": 2350 }, { "epoch": 3.916292364230298, "grad_norm": 3.736706495285034, "learning_rate": 5.903957572288923e-06, "loss": 0.5643, "step": 2400 }, { "epoch": 3.9946917109024094, "eval_loss": 0.9574249982833862, "eval_runtime": 141.5072, "eval_samples_per_second": 3.851, "eval_steps_per_second": 3.851, "step": 2448 } ], "logging_steps": 50, "max_steps": 3060, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 612, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.802763943153664e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }