| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 1000, |
| "global_step": 19061, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.026231572320444888, |
| "grad_norm": 1.3818440437316895, |
| "learning_rate": 4.868842138397776e-05, |
| "loss": 4.3761, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.052463144640889775, |
| "grad_norm": 2.8775651454925537, |
| "learning_rate": 4.7376842767955515e-05, |
| "loss": 2.9069, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.052463144640889775, |
| "eval_accuracy": 0.44778331183560627, |
| "eval_loss": 2.44579815864563, |
| "eval_runtime": 53.4802, |
| "eval_samples_per_second": 114.697, |
| "eval_steps_per_second": 3.59, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.07869471696133466, |
| "grad_norm": 2.087350368499756, |
| "learning_rate": 4.606526415193327e-05, |
| "loss": 2.2789, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.10492628928177955, |
| "grad_norm": 1.8893406391143799, |
| "learning_rate": 4.475368553591103e-05, |
| "loss": 1.9731, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.10492628928177955, |
| "eval_accuracy": 0.5686255574030745, |
| "eval_loss": 1.793091058731079, |
| "eval_runtime": 53.0446, |
| "eval_samples_per_second": 115.639, |
| "eval_steps_per_second": 3.62, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.13115786160222442, |
| "grad_norm": 1.4629850387573242, |
| "learning_rate": 4.3442106919888784e-05, |
| "loss": 1.8193, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.15738943392266933, |
| "grad_norm": 1.3035223484039307, |
| "learning_rate": 4.213052830386653e-05, |
| "loss": 1.7234, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.15738943392266933, |
| "eval_accuracy": 0.6008603552909747, |
| "eval_loss": 1.6119489669799805, |
| "eval_runtime": 53.4678, |
| "eval_samples_per_second": 114.723, |
| "eval_steps_per_second": 3.591, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.1836210062431142, |
| "grad_norm": 1.390590786933899, |
| "learning_rate": 4.0818949687844296e-05, |
| "loss": 1.6587, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.2098525785635591, |
| "grad_norm": 1.2905086278915405, |
| "learning_rate": 3.9507371071822046e-05, |
| "loss": 1.6063, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.2098525785635591, |
| "eval_accuracy": 0.6191917492074207, |
| "eval_loss": 1.511812686920166, |
| "eval_runtime": 53.5523, |
| "eval_samples_per_second": 114.542, |
| "eval_steps_per_second": 3.585, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.23608415088400397, |
| "grad_norm": 1.2503050565719604, |
| "learning_rate": 3.81957924557998e-05, |
| "loss": 1.5624, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.26231572320444885, |
| "grad_norm": 1.2357672452926636, |
| "learning_rate": 3.688421383977756e-05, |
| "loss": 1.5331, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.26231572320444885, |
| "eval_accuracy": 0.6299159118558132, |
| "eval_loss": 1.4537405967712402, |
| "eval_runtime": 54.1401, |
| "eval_samples_per_second": 113.299, |
| "eval_steps_per_second": 3.546, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.2885472955248938, |
| "grad_norm": 1.2465555667877197, |
| "learning_rate": 3.5572635223755315e-05, |
| "loss": 1.5044, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.31477886784533865, |
| "grad_norm": 1.2649658918380737, |
| "learning_rate": 3.426105660773307e-05, |
| "loss": 1.4812, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.31477886784533865, |
| "eval_accuracy": 0.6392144357635486, |
| "eval_loss": 1.405892252922058, |
| "eval_runtime": 53.4127, |
| "eval_samples_per_second": 114.842, |
| "eval_steps_per_second": 3.595, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.3410104401657835, |
| "grad_norm": 1.2671959400177002, |
| "learning_rate": 3.294947799171083e-05, |
| "loss": 1.4634, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.3672420124862284, |
| "grad_norm": 1.1386276483535767, |
| "learning_rate": 3.163789937568858e-05, |
| "loss": 1.4428, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.3672420124862284, |
| "eval_accuracy": 0.6457219204466172, |
| "eval_loss": 1.3719512224197388, |
| "eval_runtime": 53.3481, |
| "eval_samples_per_second": 114.981, |
| "eval_steps_per_second": 3.599, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.3934735848066733, |
| "grad_norm": 1.2336556911468506, |
| "learning_rate": 3.0326320759666336e-05, |
| "loss": 1.4271, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.4197051571271182, |
| "grad_norm": 1.1540193557739258, |
| "learning_rate": 2.9014742143644093e-05, |
| "loss": 1.4149, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.4197051571271182, |
| "eval_accuracy": 0.6509898994148602, |
| "eval_loss": 1.343773603439331, |
| "eval_runtime": 53.3234, |
| "eval_samples_per_second": 115.034, |
| "eval_steps_per_second": 3.601, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.4459367294475631, |
| "grad_norm": 1.0972239971160889, |
| "learning_rate": 2.7703163527621846e-05, |
| "loss": 1.3979, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.47216830176800795, |
| "grad_norm": 1.1413357257843018, |
| "learning_rate": 2.6391584911599605e-05, |
| "loss": 1.3857, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.47216830176800795, |
| "eval_accuracy": 0.6563627375706007, |
| "eval_loss": 1.3179402351379395, |
| "eval_runtime": 52.7311, |
| "eval_samples_per_second": 116.326, |
| "eval_steps_per_second": 3.641, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.4983998740884529, |
| "grad_norm": 1.1442935466766357, |
| "learning_rate": 2.5080006295577358e-05, |
| "loss": 1.3798, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.5246314464088977, |
| "grad_norm": 1.1634732484817505, |
| "learning_rate": 2.3768427679555114e-05, |
| "loss": 1.3654, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.5246314464088977, |
| "eval_accuracy": 0.6599810169811327, |
| "eval_loss": 1.2988349199295044, |
| "eval_runtime": 52.7299, |
| "eval_samples_per_second": 116.329, |
| "eval_steps_per_second": 3.641, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.5508630187293426, |
| "grad_norm": 1.173746943473816, |
| "learning_rate": 2.245684906353287e-05, |
| "loss": 1.3583, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.5770945910497876, |
| "grad_norm": 1.1500358581542969, |
| "learning_rate": 2.1145270447510627e-05, |
| "loss": 1.3449, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.5770945910497876, |
| "eval_accuracy": 0.6630447219653862, |
| "eval_loss": 1.2830150127410889, |
| "eval_runtime": 52.9419, |
| "eval_samples_per_second": 115.863, |
| "eval_steps_per_second": 3.627, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.6033261633702324, |
| "grad_norm": 1.18638277053833, |
| "learning_rate": 1.983369183148838e-05, |
| "loss": 1.3371, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.6295577356906773, |
| "grad_norm": 1.1297719478607178, |
| "learning_rate": 1.8522113215466136e-05, |
| "loss": 1.3302, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.6295577356906773, |
| "eval_accuracy": 0.6660457982859825, |
| "eval_loss": 1.2687662839889526, |
| "eval_runtime": 52.8433, |
| "eval_samples_per_second": 116.079, |
| "eval_steps_per_second": 3.633, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.6557893080111222, |
| "grad_norm": 1.1766819953918457, |
| "learning_rate": 1.7210534599443892e-05, |
| "loss": 1.3227, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.682020880331567, |
| "grad_norm": 1.128738284111023, |
| "learning_rate": 1.589895598342165e-05, |
| "loss": 1.3174, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.682020880331567, |
| "eval_accuracy": 0.6682683987237139, |
| "eval_loss": 1.2574915885925293, |
| "eval_runtime": 53.1648, |
| "eval_samples_per_second": 115.377, |
| "eval_steps_per_second": 3.611, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.708252452652012, |
| "grad_norm": 1.147472620010376, |
| "learning_rate": 1.4587377367399401e-05, |
| "loss": 1.3118, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.7344840249724568, |
| "grad_norm": 1.12171471118927, |
| "learning_rate": 1.3275798751377158e-05, |
| "loss": 1.3052, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.7344840249724568, |
| "eval_accuracy": 0.6708049392820683, |
| "eval_loss": 1.2456800937652588, |
| "eval_runtime": 52.8767, |
| "eval_samples_per_second": 116.006, |
| "eval_steps_per_second": 3.631, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.7607155972929017, |
| "grad_norm": 1.106832504272461, |
| "learning_rate": 1.1964220135354914e-05, |
| "loss": 1.3006, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.7869471696133467, |
| "grad_norm": 1.1112337112426758, |
| "learning_rate": 1.0652641519332669e-05, |
| "loss": 1.2959, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.7869471696133467, |
| "eval_accuracy": 0.6724595790142662, |
| "eval_loss": 1.2370907068252563, |
| "eval_runtime": 52.7887, |
| "eval_samples_per_second": 116.199, |
| "eval_steps_per_second": 3.637, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.8131787419337915, |
| "grad_norm": 1.1115854978561401, |
| "learning_rate": 9.341062903310425e-06, |
| "loss": 1.2901, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.8394103142542364, |
| "grad_norm": 1.1028845310211182, |
| "learning_rate": 8.029484287288181e-06, |
| "loss": 1.2847, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.8394103142542364, |
| "eval_accuracy": 0.6743223435167859, |
| "eval_loss": 1.2278393507003784, |
| "eval_runtime": 52.9124, |
| "eval_samples_per_second": 115.927, |
| "eval_steps_per_second": 3.629, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.8656418865746813, |
| "grad_norm": 1.1097549200057983, |
| "learning_rate": 6.717905671265937e-06, |
| "loss": 1.2842, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.8918734588951261, |
| "grad_norm": 1.1161189079284668, |
| "learning_rate": 5.406327055243691e-06, |
| "loss": 1.28, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.8918734588951261, |
| "eval_accuracy": 0.6759062272014932, |
| "eval_loss": 1.2205840349197388, |
| "eval_runtime": 52.9403, |
| "eval_samples_per_second": 115.866, |
| "eval_steps_per_second": 3.627, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.9181050312155711, |
| "grad_norm": 1.112290859222412, |
| "learning_rate": 4.0947484392214475e-06, |
| "loss": 1.2749, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.9443366035360159, |
| "grad_norm": 1.10244619846344, |
| "learning_rate": 2.7831698231992025e-06, |
| "loss": 1.27, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.9443366035360159, |
| "eval_accuracy": 0.6768434261098102, |
| "eval_loss": 1.2162342071533203, |
| "eval_runtime": 52.5316, |
| "eval_samples_per_second": 116.768, |
| "eval_steps_per_second": 3.655, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.9705681758564608, |
| "grad_norm": 1.1379034519195557, |
| "learning_rate": 1.4715912071769583e-06, |
| "loss": 1.2756, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.9967997481769058, |
| "grad_norm": 1.120842456817627, |
| "learning_rate": 1.6001259115471381e-07, |
| "loss": 1.272, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.9967997481769058, |
| "eval_accuracy": 0.6775780778641618, |
| "eval_loss": 1.2128527164459229, |
| "eval_runtime": 53.048, |
| "eval_samples_per_second": 115.631, |
| "eval_steps_per_second": 3.619, |
| "step": 19000 |
| }, |
| { |
| "epoch": 1.0, |
| "step": 19061, |
| "total_flos": 3.18739175571456e+17, |
| "train_loss": 1.5558179828629188, |
| "train_runtime": 7460.0601, |
| "train_samples_per_second": 81.759, |
| "train_steps_per_second": 2.555 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 19061, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.18739175571456e+17, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|