{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.9946917109024094,
  "eval_steps": 612,
  "global_step": 2448,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08166598611678236,
      "grad_norm": 1.7321007251739502,
      "learning_rate": 2.5e-05,
      "loss": 3.105,
      "step": 50
    },
    {
      "epoch": 0.16333197223356472,
      "grad_norm": 2.5774662494659424,
      "learning_rate": 5e-05,
      "loss": 2.1147,
      "step": 100
    },
    {
      "epoch": 0.24499795835034707,
      "grad_norm": 2.3063881397247314,
      "learning_rate": 4.9964806340932865e-05,
      "loss": 1.7456,
      "step": 150
    },
    {
      "epoch": 0.32666394446712943,
      "grad_norm": 2.3037455081939697,
      "learning_rate": 4.985932445122257e-05,
      "loss": 1.5373,
      "step": 200
    },
    {
      "epoch": 0.4083299305839118,
      "grad_norm": 3.1852424144744873,
      "learning_rate": 4.968385131436222e-05,
      "loss": 1.4322,
      "step": 250
    },
    {
      "epoch": 0.48999591670069415,
      "grad_norm": 2.7717628479003906,
      "learning_rate": 4.943888097369216e-05,
      "loss": 1.3964,
      "step": 300
    },
    {
      "epoch": 0.5716619028174765,
      "grad_norm": 2.9692201614379883,
      "learning_rate": 4.912510314142448e-05,
      "loss": 1.3724,
      "step": 350
    },
    {
      "epoch": 0.6533278889342589,
      "grad_norm": 2.7853329181671143,
      "learning_rate": 4.87434012567633e-05,
      "loss": 1.2211,
      "step": 400
    },
    {
      "epoch": 0.7349938750510412,
      "grad_norm": 3.02131986618042,
      "learning_rate": 4.829484999858815e-05,
      "loss": 1.2873,
      "step": 450
    },
    {
      "epoch": 0.8166598611678236,
      "grad_norm": 2.9002573490142822,
      "learning_rate": 4.77807122597034e-05,
      "loss": 1.1954,
      "step": 500
    },
    {
      "epoch": 0.898325847284606,
      "grad_norm": 3.3557584285736084,
      "learning_rate": 4.7202435591172676e-05,
      "loss": 1.1836,
      "step": 550
    },
    {
      "epoch": 0.9799918334013883,
      "grad_norm": 2.2068400382995605,
      "learning_rate": 4.656164812674951e-05,
      "loss": 1.1539,
      "step": 600
    },
    {
      "epoch": 0.9995916700694161,
      "eval_loss": 1.0969716310501099,
      "eval_runtime": 142.2066,
      "eval_samples_per_second": 3.832,
      "eval_steps_per_second": 3.832,
      "step": 612
    },
    {
      "epoch": 1.060432829726419,
      "grad_norm": 3.3319382667541504,
      "learning_rate": 4.5860153998878494e-05,
      "loss": 1.0427,
      "step": 650
    },
    {
      "epoch": 1.1420988158432013,
      "grad_norm": 3.4751930236816406,
      "learning_rate": 4.5099928259173516e-05,
      "loss": 0.9736,
      "step": 700
    },
    {
      "epoch": 1.2237648019599836,
      "grad_norm": 2.9710006713867188,
      "learning_rate": 4.4283111317674374e-05,
      "loss": 1.0045,
      "step": 750
    },
    {
      "epoch": 1.305430788076766,
      "grad_norm": 3.5268781185150146,
      "learning_rate": 4.341200291653781e-05,
      "loss": 0.9427,
      "step": 800
    },
    {
      "epoch": 1.3870967741935485,
      "grad_norm": 2.7967159748077393,
      "learning_rate": 4.248905565513023e-05,
      "loss": 0.9968,
      "step": 850
    },
    {
      "epoch": 1.4687627603103308,
      "grad_norm": 3.274461030960083,
      "learning_rate": 4.151686808475204e-05,
      "loss": 0.8969,
      "step": 900
    },
    {
      "epoch": 1.5504287464271131,
      "grad_norm": 3.7996115684509277,
      "learning_rate": 4.049817739243532e-05,
      "loss": 0.9398,
      "step": 950
    },
    {
      "epoch": 1.6320947325438955,
      "grad_norm": 3.434795379638672,
      "learning_rate": 3.94358516944137e-05,
      "loss": 0.9255,
      "step": 1000
    },
    {
      "epoch": 1.7137607186606778,
      "grad_norm": 2.722684383392334,
      "learning_rate": 3.833288196096194e-05,
      "loss": 0.9775,
      "step": 1050
    },
    {
      "epoch": 1.7954267047774601,
      "grad_norm": 4.152195930480957,
      "learning_rate": 3.719237359534087e-05,
      "loss": 0.8806,
      "step": 1100
    },
    {
      "epoch": 1.8770926908942425,
      "grad_norm": 2.783254623413086,
      "learning_rate": 3.6017537690557115e-05,
      "loss": 0.9433,
      "step": 1150
    },
    {
      "epoch": 1.958758677011025,
      "grad_norm": 3.3357818126678467,
      "learning_rate": 3.481168198855409e-05,
      "loss": 0.9619,
      "step": 1200
    },
    {
      "epoch": 1.9979583503470804,
      "eval_loss": 0.9826937913894653,
      "eval_runtime": 141.9315,
      "eval_samples_per_second": 3.84,
      "eval_steps_per_second": 3.84,
      "step": 1224
    },
    {
      "epoch": 2.0391996733360553,
      "grad_norm": 3.218412160873413,
      "learning_rate": 3.357820156728867e-05,
      "loss": 0.8787,
      "step": 1250
    },
    {
      "epoch": 2.120865659452838,
      "grad_norm": 3.419764280319214,
      "learning_rate": 3.232056928191376e-05,
      "loss": 0.7141,
      "step": 1300
    },
    {
      "epoch": 2.2025316455696204,
      "grad_norm": 3.141345739364624,
      "learning_rate": 3.1042325986980064e-05,
      "loss": 0.7485,
      "step": 1350
    },
    {
      "epoch": 2.2841976316864026,
      "grad_norm": 2.911731719970703,
      "learning_rate": 2.974707056718571e-05,
      "loss": 0.7787,
      "step": 1400
    },
    {
      "epoch": 2.365863617803185,
      "grad_norm": 3.447298765182495,
      "learning_rate": 2.8438449804742628e-05,
      "loss": 0.7358,
      "step": 1450
    },
    {
      "epoch": 2.447529603919967,
      "grad_norm": 3.8683292865753174,
      "learning_rate": 2.7120148111887732e-05,
      "loss": 0.7281,
      "step": 1500
    },
    {
      "epoch": 2.5291955900367498,
      "grad_norm": 3.7542524337768555,
      "learning_rate": 2.579587715744712e-05,
      "loss": 0.7169,
      "step": 1550
    },
    {
      "epoch": 2.610861576153532,
      "grad_norm": 2.9598772525787354,
      "learning_rate": 2.446936541665941e-05,
      "loss": 0.7425,
      "step": 1600
    },
    {
      "epoch": 2.6925275622703144,
      "grad_norm": 3.7450504302978516,
      "learning_rate": 2.3144347673680936e-05,
      "loss": 0.699,
      "step": 1650
    },
    {
      "epoch": 2.774193548387097,
      "grad_norm": 3.8803372383117676,
      "learning_rate": 2.182455450632803e-05,
      "loss": 0.751,
      "step": 1700
    },
    {
      "epoch": 2.855859534503879,
      "grad_norm": 3.553741216659546,
      "learning_rate": 2.0513701782662366e-05,
      "loss": 0.7451,
      "step": 1750
    },
    {
      "epoch": 2.9375255206206616,
      "grad_norm": 3.795320987701416,
      "learning_rate": 1.9215480198991466e-05,
      "loss": 0.7101,
      "step": 1800
    },
    {
      "epoch": 2.9963250306247446,
      "eval_loss": 0.9423996210098267,
      "eval_runtime": 141.7974,
      "eval_samples_per_second": 3.844,
      "eval_steps_per_second": 3.844,
      "step": 1836
    },
    {
      "epoch": 3.017966516945692,
      "grad_norm": 3.2628490924835205,
      "learning_rate": 1.793354488874006e-05,
      "loss": 0.681,
      "step": 1850
    },
    {
      "epoch": 3.0996325030624745,
      "grad_norm": 3.2649693489074707,
      "learning_rate": 1.6696527865384627e-05,
      "loss": 0.6026,
      "step": 1900
    },
    {
      "epoch": 3.181298489179257,
      "grad_norm": 2.9757375717163086,
      "learning_rate": 1.5457433537180068e-05,
      "loss": 0.5814,
      "step": 1950
    },
    {
      "epoch": 3.262964475296039,
      "grad_norm": 2.891662836074829,
      "learning_rate": 1.424520623543294e-05,
      "loss": 0.5761,
      "step": 2000
    },
    {
      "epoch": 3.3446304614128217,
      "grad_norm": 3.558239221572876,
      "learning_rate": 1.3063258977292814e-05,
      "loss": 0.5863,
      "step": 2050
    },
    {
      "epoch": 3.426296447529604,
      "grad_norm": 3.837745428085327,
      "learning_rate": 1.1914919526666754e-05,
      "loss": 0.5972,
      "step": 2100
    },
    {
      "epoch": 3.5079624336463864,
      "grad_norm": 3.7010719776153564,
      "learning_rate": 1.0803421024924246e-05,
      "loss": 0.5797,
      "step": 2150
    },
    {
      "epoch": 3.5896284197631685,
      "grad_norm": 3.509216785430908,
      "learning_rate": 9.731892888011207e-06,
      "loss": 0.5959,
      "step": 2200
    },
    {
      "epoch": 3.671294405879951,
      "grad_norm": 4.380457878112793,
      "learning_rate": 8.703351995602158e-06,
      "loss": 0.5605,
      "step": 2250
    },
    {
      "epoch": 3.7529603919967336,
      "grad_norm": 3.9303319454193115,
      "learning_rate": 7.720694197097405e-06,
      "loss": 0.5507,
      "step": 2300
    },
    {
      "epoch": 3.8346263781135157,
      "grad_norm": 3.130398988723755,
      "learning_rate": 6.786686158380176e-06,
      "loss": 0.5661,
      "step": 2350
    },
    {
      "epoch": 3.916292364230298,
      "grad_norm": 3.736706495285034,
      "learning_rate": 5.903957572288923e-06,
      "loss": 0.5643,
      "step": 2400
    },
    {
      "epoch": 3.9946917109024094,
      "eval_loss": 0.9574249982833862,
      "eval_runtime": 141.5072,
      "eval_samples_per_second": 3.851,
      "eval_steps_per_second": 3.851,
      "step": 2448
    }
  ],
  "logging_steps": 50,
  "max_steps": 3060,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 612,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.802763943153664e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}