{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 220,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02280501710376283,
"grad_norm": 32.61074447631836,
"learning_rate": 2.9999999999999997e-05,
"loss": 12.6644,
"num_input_tokens_seen": 61696,
"step": 5,
"train_runtime": 24.3804,
"train_tokens_per_second": 2530.557
},
{
"epoch": 0.04561003420752566,
"grad_norm": 18.813081741333008,
"learning_rate": 6.75e-05,
"loss": 5.0641,
"num_input_tokens_seen": 123136,
"step": 10,
"train_runtime": 47.2224,
"train_tokens_per_second": 2607.578
},
{
"epoch": 0.06841505131128849,
"grad_norm": 6.64304256439209,
"learning_rate": 0.00010499999999999999,
"loss": 1.5053,
"num_input_tokens_seen": 184832,
"step": 15,
"train_runtime": 70.2288,
"train_tokens_per_second": 2631.856
},
{
"epoch": 0.09122006841505131,
"grad_norm": 1.0877084732055664,
"learning_rate": 0.0001425,
"loss": 0.6917,
"num_input_tokens_seen": 247552,
"step": 20,
"train_runtime": 93.5022,
"train_tokens_per_second": 2647.553
},
{
"epoch": 0.11402508551881414,
"grad_norm": 2.6032357215881348,
"learning_rate": 0.00017999999999999998,
"loss": 0.5721,
"num_input_tokens_seen": 308992,
"step": 25,
"train_runtime": 116.3387,
"train_tokens_per_second": 2655.97
},
{
"epoch": 0.13683010262257697,
"grad_norm": 3.105325698852539,
"learning_rate": 0.00021749999999999997,
"loss": 0.6254,
"num_input_tokens_seen": 370176,
"step": 30,
"train_runtime": 139.2197,
"train_tokens_per_second": 2658.935
},
{
"epoch": 0.15963511972633979,
"grad_norm": 0.9677203297615051,
"learning_rate": 0.00025499999999999996,
"loss": 0.6389,
"num_input_tokens_seen": 430720,
"step": 35,
"train_runtime": 161.8821,
"train_tokens_per_second": 2660.702
},
{
"epoch": 0.18244013683010263,
"grad_norm": 1.2506942749023438,
"learning_rate": 0.00029249999999999995,
"loss": 0.5258,
"num_input_tokens_seen": 492800,
"step": 40,
"train_runtime": 185.2921,
"train_tokens_per_second": 2659.584
},
{
"epoch": 0.20524515393386544,
"grad_norm": 6.443251132965088,
"learning_rate": 0.0002996346075389736,
"loss": 0.5041,
"num_input_tokens_seen": 553984,
"step": 45,
"train_runtime": 208.2146,
"train_tokens_per_second": 2660.639
},
{
"epoch": 0.22805017103762829,
"grad_norm": 0.32593730092048645,
"learning_rate": 0.00029815325108927063,
"loss": 0.4831,
"num_input_tokens_seen": 615040,
"step": 50,
"train_runtime": 231.0893,
"train_tokens_per_second": 2661.482
},
{
"epoch": 0.2508551881413911,
"grad_norm": 0.46452033519744873,
"learning_rate": 0.0002955443589413994,
"loss": 0.4957,
"num_input_tokens_seen": 676736,
"step": 55,
"train_runtime": 254.0788,
"train_tokens_per_second": 2663.488
},
{
"epoch": 0.27366020524515394,
"grad_norm": 0.26457586884498596,
"learning_rate": 0.00029182778633989753,
"loss": 0.4764,
"num_input_tokens_seen": 738176,
"step": 60,
"train_runtime": 277.0778,
"train_tokens_per_second": 2664.147
},
{
"epoch": 0.29646522234891676,
"grad_norm": 0.14807738363742828,
"learning_rate": 0.0002870318186463901,
"loss": 0.4829,
"num_input_tokens_seen": 799488,
"step": 65,
"train_runtime": 300.021,
"train_tokens_per_second": 2664.773
},
{
"epoch": 0.31927023945267957,
"grad_norm": 0.3857417404651642,
"learning_rate": 0.00028119295607090933,
"loss": 0.478,
"num_input_tokens_seen": 861568,
"step": 70,
"train_runtime": 323.133,
"train_tokens_per_second": 2666.296
},
{
"epoch": 0.34207525655644244,
"grad_norm": 0.22548428177833557,
"learning_rate": 0.0002743556358832562,
"loss": 0.4771,
"num_input_tokens_seen": 924544,
"step": 75,
"train_runtime": 346.7417,
"train_tokens_per_second": 2666.376
},
{
"epoch": 0.36488027366020526,
"grad_norm": 0.22088338434696198,
"learning_rate": 0.0002665718942185456,
"loss": 0.4657,
"num_input_tokens_seen": 985472,
"step": 80,
"train_runtime": 369.5185,
"train_tokens_per_second": 2666.908
},
{
"epoch": 0.38768529076396807,
"grad_norm": 0.3554978668689728,
"learning_rate": 0.00025790097005079764,
"loss": 0.4831,
"num_input_tokens_seen": 1046912,
"step": 85,
"train_runtime": 392.5346,
"train_tokens_per_second": 2667.056
},
{
"epoch": 0.4104903078677309,
"grad_norm": 0.131392702460289,
"learning_rate": 0.0002484088543485761,
"loss": 0.4778,
"num_input_tokens_seen": 1108992,
"step": 90,
"train_runtime": 415.6419,
"train_tokens_per_second": 2668.143
},
{
"epoch": 0.43329532497149376,
"grad_norm": 0.30744704604148865,
"learning_rate": 0.00023816778784387094,
"loss": 0.455,
"num_input_tokens_seen": 1170048,
"step": 95,
"train_runtime": 438.5167,
"train_tokens_per_second": 2668.195
},
{
"epoch": 0.45610034207525657,
"grad_norm": 0.18866093456745148,
"learning_rate": 0.00022725571123650813,
"loss": 0.4571,
"num_input_tokens_seen": 1230464,
"step": 100,
"train_runtime": 461.123,
"train_tokens_per_second": 2668.407
},
{
"epoch": 0.45610034207525657,
"eval_loss": 0.4646710157394409,
"eval_runtime": 32.6532,
"eval_samples_per_second": 95.488,
"eval_steps_per_second": 5.972,
"num_input_tokens_seen": 1230464,
"step": 100
},
{
"epoch": 0.4789053591790194,
"grad_norm": 0.23965908586978912,
"learning_rate": 0.0002157556720183616,
"loss": 0.4667,
"num_input_tokens_seen": 1292288,
"step": 105,
"train_runtime": 516.8377,
"train_tokens_per_second": 2500.375
},
{
"epoch": 0.5017103762827823,
"grad_norm": 0.25870010256767273,
"learning_rate": 0.000203755192431795,
"loss": 0.4758,
"num_input_tokens_seen": 1353344,
"step": 110,
"train_runtime": 539.6435,
"train_tokens_per_second": 2507.848
},
{
"epoch": 0.5245153933865451,
"grad_norm": 0.26426610350608826,
"learning_rate": 0.00019134560337254986,
"loss": 0.4783,
"num_input_tokens_seen": 1415040,
"step": 115,
"train_runtime": 562.5569,
"train_tokens_per_second": 2515.372
},
{
"epoch": 0.5473204104903079,
"grad_norm": 0.22881975769996643,
"learning_rate": 0.0001786213493064817,
"loss": 0.4643,
"num_input_tokens_seen": 1476480,
"step": 120,
"train_runtime": 585.45,
"train_tokens_per_second": 2521.958
},
{
"epoch": 0.5701254275940707,
"grad_norm": 0.2016284018754959,
"learning_rate": 0.000165679269490148,
"loss": 0.4542,
"num_input_tokens_seen": 1537664,
"step": 125,
"train_runtime": 608.3455,
"train_tokens_per_second": 2527.616
},
{
"epoch": 0.5929304446978335,
"grad_norm": 0.1549897938966751,
"learning_rate": 0.00015261786096559254,
"loss": 0.4539,
"num_input_tokens_seen": 1598848,
"step": 130,
"train_runtime": 631.1849,
"train_tokens_per_second": 2533.09
},
{
"epoch": 0.6157354618015963,
"grad_norm": 0.4272724986076355,
"learning_rate": 0.00013953652893838119,
"loss": 0.4563,
"num_input_tokens_seen": 1660800,
"step": 135,
"train_runtime": 654.1659,
"train_tokens_per_second": 2538.806
},
{
"epoch": 0.6385404789053591,
"grad_norm": 0.2291804850101471,
"learning_rate": 0.00012653483024396533,
"loss": 0.4434,
"num_input_tokens_seen": 1721600,
"step": 140,
"train_runtime": 676.8974,
"train_tokens_per_second": 2543.369
},
{
"epoch": 0.661345496009122,
"grad_norm": 0.3113957345485687,
"learning_rate": 0.00011371171566004985,
"loss": 0.4484,
"num_input_tokens_seen": 1783168,
"step": 145,
"train_runtime": 699.8482,
"train_tokens_per_second": 2547.935
},
{
"epoch": 0.6841505131128849,
"grad_norm": 0.25940707325935364,
"learning_rate": 0.00010116477683142652,
"loss": 0.4314,
"num_input_tokens_seen": 1844992,
"step": 150,
"train_runtime": 722.8201,
"train_tokens_per_second": 2552.491
},
{
"epoch": 0.7069555302166477,
"grad_norm": 0.28187158703804016,
"learning_rate": 8.898950353862998e-05,
"loss": 0.4211,
"num_input_tokens_seen": 1906048,
"step": 155,
"train_runtime": 745.621,
"train_tokens_per_second": 2556.323
},
{
"epoch": 0.7297605473204105,
"grad_norm": 0.317065954208374,
"learning_rate": 7.727855696304944e-05,
"loss": 0.4324,
"num_input_tokens_seen": 1967744,
"step": 160,
"train_runtime": 768.5694,
"train_tokens_per_second": 2560.268
},
{
"epoch": 0.7525655644241733,
"grad_norm": 0.2838670313358307,
"learning_rate": 6.612106447938799e-05,
"loss": 0.4093,
"num_input_tokens_seen": 2028032,
"step": 165,
"train_runtime": 791.1533,
"train_tokens_per_second": 2563.387
},
{
"epoch": 0.7753705815279361,
"grad_norm": 0.29432976245880127,
"learning_rate": 5.56019413425244e-05,
"loss": 0.4113,
"num_input_tokens_seen": 2088448,
"step": 170,
"train_runtime": 813.7398,
"train_tokens_per_second": 2566.481
},
{
"epoch": 0.798175598631699,
"grad_norm": 0.2523755133152008,
"learning_rate": 4.5801244431150394e-05,
"loss": 0.4142,
"num_input_tokens_seen": 2150144,
"step": 175,
"train_runtime": 836.6712,
"train_tokens_per_second": 2569.879
},
{
"epoch": 0.8209806157354618,
"grad_norm": 0.25406619906425476,
"learning_rate": 3.6793562966584196e-05,
"loss": 0.407,
"num_input_tokens_seen": 2211584,
"step": 180,
"train_runtime": 859.6283,
"train_tokens_per_second": 2572.721
},
{
"epoch": 0.8437856328392246,
"grad_norm": 0.35335877537727356,
"learning_rate": 2.8647450843757897e-05,
"loss": 0.3836,
"num_input_tokens_seen": 2272256,
"step": 185,
"train_runtime": 882.269,
"train_tokens_per_second": 2575.469
},
{
"epoch": 0.8665906499429875,
"grad_norm": 0.3066641688346863,
"learning_rate": 2.1424904894683165e-05,
"loss": 0.3904,
"num_input_tokens_seen": 2333696,
"step": 190,
"train_runtime": 905.2082,
"train_tokens_per_second": 2578.076
},
{
"epoch": 0.8893956670467503,
"grad_norm": 0.3389071524143219,
"learning_rate": 1.5180893055124977e-05,
"loss": 0.4011,
"num_input_tokens_seen": 2394880,
"step": 195,
"train_runtime": 928.0104,
"train_tokens_per_second": 2580.661
},
{
"epoch": 0.9122006841505131,
"grad_norm": 0.3012617826461792,
"learning_rate": 9.962936025419754e-06,
"loss": 0.3809,
"num_input_tokens_seen": 2455680,
"step": 200,
"train_runtime": 950.7291,
"train_tokens_per_second": 2582.944
},
{
"epoch": 0.9122006841505131,
"eval_loss": 0.3808976411819458,
"eval_runtime": 32.6975,
"eval_samples_per_second": 95.359,
"eval_steps_per_second": 5.964,
"num_input_tokens_seen": 2455680,
"step": 200
},
{
"epoch": 0.935005701254276,
"grad_norm": 0.2728487253189087,
"learning_rate": 5.810745609252165e-06,
"loss": 0.3799,
"num_input_tokens_seen": 2517376,
"step": 205,
"train_runtime": 1006.4425,
"train_tokens_per_second": 2501.262
},
{
"epoch": 0.9578107183580388,
"grad_norm": 0.29773786664009094,
"learning_rate": 2.7559224828504035e-06,
"loss": 0.3944,
"num_input_tokens_seen": 2578816,
"step": 210,
"train_runtime": 1029.2761,
"train_tokens_per_second": 2505.466
},
{
"epoch": 0.9806157354618016,
"grad_norm": 0.3076622486114502,
"learning_rate": 8.217156947590064e-07,
"loss": 0.3721,
"num_input_tokens_seen": 2640128,
"step": 215,
"train_runtime": 1052.1692,
"train_tokens_per_second": 2509.224
},
{
"epoch": 1.0,
"grad_norm": 0.5157559514045715,
"learning_rate": 2.284572654130956e-08,
"loss": 0.3749,
"num_input_tokens_seen": 2691984,
"step": 220,
"train_runtime": 1071.5462,
"train_tokens_per_second": 2512.243
},
{
"epoch": 1.0,
"num_input_tokens_seen": 2691984,
"step": 220,
"total_flos": 1.0930399586117222e+17,
"train_loss": 0.8674792235547846,
"train_runtime": 1074.777,
"train_samples_per_second": 26.106,
"train_steps_per_second": 0.205
}
],
"logging_steps": 5,
"max_steps": 220,
"num_input_tokens_seen": 2691984,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0930399586117222e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}