{
"best_metric": 1.7067729234695435,
"best_model_checkpoint": "4bit_repro_03022025/host18_seed_42_full_det_fp16_no_flash_attn_fix_pad_llama-3.2-instruct-l16-cot-wt_feb7-4ep-lr3e04-ws20-bs8-ga4-fp16-16022025/checkpoint-110",
"epoch": 3.9357798165137616,
"eval_steps": 500,
"global_step": 216,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01834862385321101,
"grad_norm": 0.8146735429763794,
"learning_rate": 1.4999999999999999e-05,
"loss": 3.458,
"step": 1
},
{
"epoch": 0.03669724770642202,
"grad_norm": 0.8460527658462524,
"learning_rate": 2.9999999999999997e-05,
"loss": 3.318,
"step": 2
},
{
"epoch": 0.05504587155963303,
"grad_norm": 1.0199122428894043,
"learning_rate": 4.4999999999999996e-05,
"loss": 3.4251,
"step": 3
},
{
"epoch": 0.07339449541284404,
"grad_norm": 0.738707959651947,
"learning_rate": 5.9999999999999995e-05,
"loss": 3.3918,
"step": 4
},
{
"epoch": 0.09174311926605505,
"grad_norm": 0.9168875217437744,
"learning_rate": 7.5e-05,
"loss": 3.2804,
"step": 5
},
{
"epoch": 0.11009174311926606,
"grad_norm": 0.7749077081680298,
"learning_rate": 8.999999999999999e-05,
"loss": 3.1672,
"step": 6
},
{
"epoch": 0.12844036697247707,
"grad_norm": 0.8257707953453064,
"learning_rate": 0.00010499999999999999,
"loss": 3.1118,
"step": 7
},
{
"epoch": 0.14678899082568808,
"grad_norm": 0.7854360342025757,
"learning_rate": 0.00011999999999999999,
"loss": 3.1685,
"step": 8
},
{
"epoch": 0.1651376146788991,
"grad_norm": 0.9187895059585571,
"learning_rate": 0.000135,
"loss": 2.821,
"step": 9
},
{
"epoch": 0.1834862385321101,
"grad_norm": 0.70020991563797,
"learning_rate": 0.00015,
"loss": 2.7924,
"step": 10
},
{
"epoch": 0.2018348623853211,
"grad_norm": 0.513304591178894,
"learning_rate": 0.000165,
"loss": 2.8753,
"step": 11
},
{
"epoch": 0.22018348623853212,
"grad_norm": 0.49817588925361633,
"learning_rate": 0.00017999999999999998,
"loss": 2.6701,
"step": 12
},
{
"epoch": 0.23853211009174313,
"grad_norm": 0.4605925977230072,
"learning_rate": 0.000195,
"loss": 2.7468,
"step": 13
},
{
"epoch": 0.25688073394495414,
"grad_norm": 0.5559924244880676,
"learning_rate": 0.00020999999999999998,
"loss": 2.532,
"step": 14
},
{
"epoch": 0.27522935779816515,
"grad_norm": 0.5250511765480042,
"learning_rate": 0.000225,
"loss": 2.8411,
"step": 15
},
{
"epoch": 0.29357798165137616,
"grad_norm": 0.5654911398887634,
"learning_rate": 0.00023999999999999998,
"loss": 2.5375,
"step": 16
},
{
"epoch": 0.3119266055045872,
"grad_norm": 0.6329001188278198,
"learning_rate": 0.00025499999999999996,
"loss": 2.431,
"step": 17
},
{
"epoch": 0.3302752293577982,
"grad_norm": 0.6397466063499451,
"learning_rate": 0.00027,
"loss": 2.4786,
"step": 18
},
{
"epoch": 0.3486238532110092,
"grad_norm": 0.5889459252357483,
"learning_rate": 0.000285,
"loss": 2.6437,
"step": 19
},
{
"epoch": 0.3669724770642202,
"grad_norm": 0.5488238334655762,
"learning_rate": 0.0003,
"loss": 2.539,
"step": 20
},
{
"epoch": 0.3853211009174312,
"grad_norm": 0.5296560525894165,
"learning_rate": 0.00029846938775510205,
"loss": 2.3395,
"step": 21
},
{
"epoch": 0.4036697247706422,
"grad_norm": 0.4363678991794586,
"learning_rate": 0.0002969387755102041,
"loss": 2.2912,
"step": 22
},
{
"epoch": 0.42201834862385323,
"grad_norm": 0.4143035411834717,
"learning_rate": 0.0002954081632653061,
"loss": 2.3358,
"step": 23
},
{
"epoch": 0.44036697247706424,
"grad_norm": 0.4551326632499695,
"learning_rate": 0.0002938775510204081,
"loss": 2.2228,
"step": 24
},
{
"epoch": 0.45871559633027525,
"grad_norm": 0.39562949538230896,
"learning_rate": 0.0002923469387755102,
"loss": 2.0389,
"step": 25
},
{
"epoch": 0.47706422018348627,
"grad_norm": 0.35010403394699097,
"learning_rate": 0.00029081632653061223,
"loss": 2.3651,
"step": 26
},
{
"epoch": 0.4954128440366973,
"grad_norm": 0.3502652943134308,
"learning_rate": 0.00028928571428571425,
"loss": 1.8895,
"step": 27
},
{
"epoch": 0.5137614678899083,
"grad_norm": 0.3823269307613373,
"learning_rate": 0.0002877551020408163,
"loss": 2.3629,
"step": 28
},
{
"epoch": 0.5321100917431193,
"grad_norm": 0.3375133275985718,
"learning_rate": 0.00028622448979591836,
"loss": 2.1808,
"step": 29
},
{
"epoch": 0.5504587155963303,
"grad_norm": 0.3216778635978699,
"learning_rate": 0.0002846938775510204,
"loss": 2.2516,
"step": 30
},
{
"epoch": 0.5688073394495413,
"grad_norm": 0.30818313360214233,
"learning_rate": 0.0002831632653061224,
"loss": 2.1212,
"step": 31
},
{
"epoch": 0.5871559633027523,
"grad_norm": 0.32901531457901,
"learning_rate": 0.0002816326530612245,
"loss": 1.9435,
"step": 32
},
{
"epoch": 0.6055045871559633,
"grad_norm": 0.3240410387516022,
"learning_rate": 0.0002801020408163265,
"loss": 1.8977,
"step": 33
},
{
"epoch": 0.6238532110091743,
"grad_norm": 0.31374916434288025,
"learning_rate": 0.00027857142857142854,
"loss": 2.2115,
"step": 34
},
{
"epoch": 0.6422018348623854,
"grad_norm": 0.31450748443603516,
"learning_rate": 0.00027704081632653056,
"loss": 2.0926,
"step": 35
},
{
"epoch": 0.6605504587155964,
"grad_norm": 0.30289527773857117,
"learning_rate": 0.00027551020408163264,
"loss": 2.1462,
"step": 36
},
{
"epoch": 0.6788990825688074,
"grad_norm": 0.3504111170768738,
"learning_rate": 0.00027397959183673466,
"loss": 2.2673,
"step": 37
},
{
"epoch": 0.6972477064220184,
"grad_norm": 0.3244895040988922,
"learning_rate": 0.0002724489795918367,
"loss": 2.1864,
"step": 38
},
{
"epoch": 0.7155963302752294,
"grad_norm": 0.3336032032966614,
"learning_rate": 0.0002709183673469387,
"loss": 1.8489,
"step": 39
},
{
"epoch": 0.7339449541284404,
"grad_norm": 0.3516104519367218,
"learning_rate": 0.0002693877551020408,
"loss": 2.3057,
"step": 40
},
{
"epoch": 0.7522935779816514,
"grad_norm": 0.30326011776924133,
"learning_rate": 0.00026785714285714287,
"loss": 1.9602,
"step": 41
},
{
"epoch": 0.7706422018348624,
"grad_norm": 0.34224340319633484,
"learning_rate": 0.0002663265306122449,
"loss": 1.994,
"step": 42
},
{
"epoch": 0.7889908256880734,
"grad_norm": 0.33715730905532837,
"learning_rate": 0.0002647959183673469,
"loss": 1.9931,
"step": 43
},
{
"epoch": 0.8073394495412844,
"grad_norm": 0.3272618353366852,
"learning_rate": 0.00026326530612244894,
"loss": 2.1401,
"step": 44
},
{
"epoch": 0.8256880733944955,
"grad_norm": 0.3652991056442261,
"learning_rate": 0.000261734693877551,
"loss": 1.625,
"step": 45
},
{
"epoch": 0.8440366972477065,
"grad_norm": 0.3317834138870239,
"learning_rate": 0.00026020408163265305,
"loss": 2.0808,
"step": 46
},
{
"epoch": 0.8623853211009175,
"grad_norm": 0.36255204677581787,
"learning_rate": 0.00025867346938775507,
"loss": 1.7574,
"step": 47
},
{
"epoch": 0.8807339449541285,
"grad_norm": 0.35033008456230164,
"learning_rate": 0.0002571428571428571,
"loss": 1.9578,
"step": 48
},
{
"epoch": 0.8990825688073395,
"grad_norm": 0.3665790855884552,
"learning_rate": 0.0002556122448979592,
"loss": 1.9175,
"step": 49
},
{
"epoch": 0.9174311926605505,
"grad_norm": 0.3547409772872925,
"learning_rate": 0.0002540816326530612,
"loss": 1.8527,
"step": 50
},
{
"epoch": 0.9357798165137615,
"grad_norm": 0.37087056040763855,
"learning_rate": 0.0002525510204081632,
"loss": 1.7837,
"step": 51
},
{
"epoch": 0.9541284403669725,
"grad_norm": 0.40799495577812195,
"learning_rate": 0.0002510204081632653,
"loss": 1.5264,
"step": 52
},
{
"epoch": 0.9724770642201835,
"grad_norm": 0.3731648027896881,
"learning_rate": 0.00024948979591836733,
"loss": 1.8249,
"step": 53
},
{
"epoch": 0.9908256880733946,
"grad_norm": 0.3571811616420746,
"learning_rate": 0.00024795918367346935,
"loss": 1.7617,
"step": 54
},
{
"epoch": 1.0,
"grad_norm": 0.5152439475059509,
"learning_rate": 0.0002464285714285714,
"loss": 1.8187,
"step": 55
},
{
"epoch": 1.0,
"eval_loss": 1.8342303037643433,
"eval_runtime": 40.5217,
"eval_samples_per_second": 8.193,
"eval_steps_per_second": 4.097,
"step": 55
},
{
"epoch": 1.018348623853211,
"grad_norm": 0.38449227809906006,
"learning_rate": 0.00024489795918367346,
"loss": 1.7036,
"step": 56
},
{
"epoch": 1.036697247706422,
"grad_norm": 0.3873405158519745,
"learning_rate": 0.00024336734693877548,
"loss": 1.9332,
"step": 57
},
{
"epoch": 1.0550458715596331,
"grad_norm": 0.4064755439758301,
"learning_rate": 0.00024183673469387753,
"loss": 1.7656,
"step": 58
},
{
"epoch": 1.073394495412844,
"grad_norm": 0.39023351669311523,
"learning_rate": 0.00024030612244897956,
"loss": 1.8004,
"step": 59
},
{
"epoch": 1.091743119266055,
"grad_norm": 0.40451326966285706,
"learning_rate": 0.0002387755102040816,
"loss": 1.674,
"step": 60
},
{
"epoch": 1.110091743119266,
"grad_norm": 0.3661345839500427,
"learning_rate": 0.00023724489795918366,
"loss": 1.9113,
"step": 61
},
{
"epoch": 1.1284403669724772,
"grad_norm": 0.4025841951370239,
"learning_rate": 0.00023571428571428569,
"loss": 1.8991,
"step": 62
},
{
"epoch": 1.146788990825688,
"grad_norm": 0.41999149322509766,
"learning_rate": 0.00023418367346938774,
"loss": 1.7361,
"step": 63
},
{
"epoch": 1.165137614678899,
"grad_norm": 0.39743533730506897,
"learning_rate": 0.00023265306122448976,
"loss": 1.5829,
"step": 64
},
{
"epoch": 1.18348623853211,
"grad_norm": 0.42366546392440796,
"learning_rate": 0.00023112244897959181,
"loss": 1.7228,
"step": 65
},
{
"epoch": 1.2018348623853212,
"grad_norm": 0.37864112854003906,
"learning_rate": 0.00022959183673469384,
"loss": 1.636,
"step": 66
},
{
"epoch": 1.2201834862385321,
"grad_norm": 0.392802357673645,
"learning_rate": 0.0002280612244897959,
"loss": 1.5435,
"step": 67
},
{
"epoch": 1.238532110091743,
"grad_norm": 0.4146474301815033,
"learning_rate": 0.00022653061224489791,
"loss": 1.7305,
"step": 68
},
{
"epoch": 1.2568807339449541,
"grad_norm": 0.401034951210022,
"learning_rate": 0.000225,
"loss": 1.6097,
"step": 69
},
{
"epoch": 1.2752293577981653,
"grad_norm": 0.4014700949192047,
"learning_rate": 0.00022346938775510205,
"loss": 1.8824,
"step": 70
},
{
"epoch": 1.2935779816513762,
"grad_norm": 0.4407334327697754,
"learning_rate": 0.00022193877551020407,
"loss": 1.6208,
"step": 71
},
{
"epoch": 1.311926605504587,
"grad_norm": 0.3951621353626251,
"learning_rate": 0.00022040816326530612,
"loss": 1.7436,
"step": 72
},
{
"epoch": 1.3302752293577982,
"grad_norm": 0.40659868717193604,
"learning_rate": 0.00021887755102040815,
"loss": 1.7545,
"step": 73
},
{
"epoch": 1.3486238532110093,
"grad_norm": 0.40831202268600464,
"learning_rate": 0.0002173469387755102,
"loss": 1.8652,
"step": 74
},
{
"epoch": 1.3669724770642202,
"grad_norm": 0.44127607345581055,
"learning_rate": 0.00021581632653061222,
"loss": 1.5706,
"step": 75
},
{
"epoch": 1.385321100917431,
"grad_norm": 0.413889616727829,
"learning_rate": 0.00021428571428571427,
"loss": 1.5272,
"step": 76
},
{
"epoch": 1.4036697247706422,
"grad_norm": 0.4785701036453247,
"learning_rate": 0.0002127551020408163,
"loss": 1.572,
"step": 77
},
{
"epoch": 1.4220183486238533,
"grad_norm": 0.45182228088378906,
"learning_rate": 0.00021122448979591835,
"loss": 1.5978,
"step": 78
},
{
"epoch": 1.4403669724770642,
"grad_norm": 0.4382292628288269,
"learning_rate": 0.0002096938775510204,
"loss": 1.6424,
"step": 79
},
{
"epoch": 1.4587155963302751,
"grad_norm": 0.529373288154602,
"learning_rate": 0.00020816326530612243,
"loss": 1.5756,
"step": 80
},
{
"epoch": 1.4770642201834863,
"grad_norm": 0.4828866720199585,
"learning_rate": 0.00020663265306122448,
"loss": 1.6252,
"step": 81
},
{
"epoch": 1.4954128440366974,
"grad_norm": 0.455864816904068,
"learning_rate": 0.0002051020408163265,
"loss": 1.6719,
"step": 82
},
{
"epoch": 1.5137614678899083,
"grad_norm": 0.43042704463005066,
"learning_rate": 0.00020357142857142856,
"loss": 1.4768,
"step": 83
},
{
"epoch": 1.5321100917431192,
"grad_norm": 0.43703892827033997,
"learning_rate": 0.00020204081632653058,
"loss": 1.4823,
"step": 84
},
{
"epoch": 1.5504587155963303,
"grad_norm": 0.6060220003128052,
"learning_rate": 0.00020051020408163263,
"loss": 1.7325,
"step": 85
},
{
"epoch": 1.5688073394495414,
"grad_norm": 0.4146731495857239,
"learning_rate": 0.00019897959183673466,
"loss": 1.5784,
"step": 86
},
{
"epoch": 1.5871559633027523,
"grad_norm": 0.48514559864997864,
"learning_rate": 0.0001974489795918367,
"loss": 1.7322,
"step": 87
},
{
"epoch": 1.6055045871559632,
"grad_norm": 0.4649484157562256,
"learning_rate": 0.00019591836734693873,
"loss": 1.4178,
"step": 88
},
{
"epoch": 1.6238532110091743,
"grad_norm": 0.48133671283721924,
"learning_rate": 0.0001943877551020408,
"loss": 1.5013,
"step": 89
},
{
"epoch": 1.6422018348623855,
"grad_norm": 0.4548419415950775,
"learning_rate": 0.00019285714285714286,
"loss": 1.5874,
"step": 90
},
{
"epoch": 1.6605504587155964,
"grad_norm": 0.5536527633666992,
"learning_rate": 0.0001913265306122449,
"loss": 1.4934,
"step": 91
},
{
"epoch": 1.6788990825688073,
"grad_norm": 0.5949488878250122,
"learning_rate": 0.00018979591836734694,
"loss": 1.5306,
"step": 92
},
{
"epoch": 1.6972477064220184,
"grad_norm": 0.5679605603218079,
"learning_rate": 0.00018826530612244896,
"loss": 1.5162,
"step": 93
},
{
"epoch": 1.7155963302752295,
"grad_norm": 0.5724030137062073,
"learning_rate": 0.00018673469387755102,
"loss": 1.4972,
"step": 94
},
{
"epoch": 1.7339449541284404,
"grad_norm": 0.5926903486251831,
"learning_rate": 0.00018520408163265304,
"loss": 1.512,
"step": 95
},
{
"epoch": 1.7522935779816513,
"grad_norm": 0.49787193536758423,
"learning_rate": 0.0001836734693877551,
"loss": 1.5754,
"step": 96
},
{
"epoch": 1.7706422018348624,
"grad_norm": 0.521364152431488,
"learning_rate": 0.00018214285714285712,
"loss": 1.5558,
"step": 97
},
{
"epoch": 1.7889908256880735,
"grad_norm": 0.5252606868743896,
"learning_rate": 0.00018061224489795917,
"loss": 1.5562,
"step": 98
},
{
"epoch": 1.8073394495412844,
"grad_norm": 0.520020604133606,
"learning_rate": 0.00017908163265306122,
"loss": 1.5431,
"step": 99
},
{
"epoch": 1.8256880733944953,
"grad_norm": 0.5684699416160583,
"learning_rate": 0.00017755102040816325,
"loss": 1.4776,
"step": 100
},
{
"epoch": 1.8440366972477065,
"grad_norm": 0.49918070435523987,
"learning_rate": 0.0001760204081632653,
"loss": 1.5665,
"step": 101
},
{
"epoch": 1.8623853211009176,
"grad_norm": 0.5219622850418091,
"learning_rate": 0.00017448979591836732,
"loss": 1.4726,
"step": 102
},
{
"epoch": 1.8807339449541285,
"grad_norm": 0.49506455659866333,
"learning_rate": 0.00017295918367346937,
"loss": 1.5677,
"step": 103
},
{
"epoch": 1.8990825688073394,
"grad_norm": 0.48011767864227295,
"learning_rate": 0.0001714285714285714,
"loss": 1.4628,
"step": 104
},
{
"epoch": 1.9174311926605505,
"grad_norm": 0.49670523405075073,
"learning_rate": 0.00016989795918367345,
"loss": 1.343,
"step": 105
},
{
"epoch": 1.9357798165137616,
"grad_norm": 0.5730084180831909,
"learning_rate": 0.00016836734693877547,
"loss": 1.5211,
"step": 106
},
{
"epoch": 1.9541284403669725,
"grad_norm": 0.5185048580169678,
"learning_rate": 0.00016683673469387753,
"loss": 1.4416,
"step": 107
},
{
"epoch": 1.9724770642201834,
"grad_norm": 0.5075457692146301,
"learning_rate": 0.00016530612244897955,
"loss": 1.3877,
"step": 108
},
{
"epoch": 1.9908256880733946,
"grad_norm": 0.5157256126403809,
"learning_rate": 0.00016377551020408163,
"loss": 1.4932,
"step": 109
},
{
"epoch": 2.0,
"grad_norm": 0.8046102523803711,
"learning_rate": 0.00016224489795918368,
"loss": 1.3092,
"step": 110
},
{
"epoch": 2.0,
"eval_loss": 1.7067729234695435,
"eval_runtime": 40.3292,
"eval_samples_per_second": 8.232,
"eval_steps_per_second": 4.116,
"step": 110
},
{
"epoch": 2.018348623853211,
"grad_norm": 0.5372384786605835,
"learning_rate": 0.0001607142857142857,
"loss": 1.4086,
"step": 111
},
{
"epoch": 2.036697247706422,
"grad_norm": 0.5499650239944458,
"learning_rate": 0.00015918367346938776,
"loss": 1.4309,
"step": 112
},
{
"epoch": 2.055045871559633,
"grad_norm": 0.5393022298812866,
"learning_rate": 0.00015765306122448978,
"loss": 1.3377,
"step": 113
},
{
"epoch": 2.073394495412844,
"grad_norm": 0.4644460678100586,
"learning_rate": 0.00015612244897959183,
"loss": 1.3486,
"step": 114
},
{
"epoch": 2.091743119266055,
"grad_norm": 0.5637883543968201,
"learning_rate": 0.00015459183673469386,
"loss": 1.3423,
"step": 115
},
{
"epoch": 2.1100917431192663,
"grad_norm": 0.5196585655212402,
"learning_rate": 0.0001530612244897959,
"loss": 1.1701,
"step": 116
},
{
"epoch": 2.128440366972477,
"grad_norm": 0.5293002724647522,
"learning_rate": 0.00015153061224489794,
"loss": 1.3339,
"step": 117
},
{
"epoch": 2.146788990825688,
"grad_norm": 0.5287687182426453,
"learning_rate": 0.00015,
"loss": 1.2385,
"step": 118
},
{
"epoch": 2.165137614678899,
"grad_norm": 0.5759944319725037,
"learning_rate": 0.00014846938775510204,
"loss": 1.3614,
"step": 119
},
{
"epoch": 2.18348623853211,
"grad_norm": 0.6487063765525818,
"learning_rate": 0.00014693877551020406,
"loss": 1.392,
"step": 120
},
{
"epoch": 2.2018348623853212,
"grad_norm": 0.5643731355667114,
"learning_rate": 0.00014540816326530611,
"loss": 1.3722,
"step": 121
},
{
"epoch": 2.220183486238532,
"grad_norm": 0.6352585554122925,
"learning_rate": 0.00014387755102040814,
"loss": 1.4423,
"step": 122
},
{
"epoch": 2.238532110091743,
"grad_norm": 0.5605873465538025,
"learning_rate": 0.0001423469387755102,
"loss": 1.3284,
"step": 123
},
{
"epoch": 2.2568807339449544,
"grad_norm": 0.49541911482810974,
"learning_rate": 0.00014081632653061224,
"loss": 1.324,
"step": 124
},
{
"epoch": 2.2752293577981653,
"grad_norm": 0.5412710905075073,
"learning_rate": 0.00013928571428571427,
"loss": 1.2605,
"step": 125
},
{
"epoch": 2.293577981651376,
"grad_norm": 0.575342059135437,
"learning_rate": 0.00013775510204081632,
"loss": 1.2615,
"step": 126
},
{
"epoch": 2.311926605504587,
"grad_norm": 0.6061179637908936,
"learning_rate": 0.00013622448979591834,
"loss": 1.3793,
"step": 127
},
{
"epoch": 2.330275229357798,
"grad_norm": 0.5834862589836121,
"learning_rate": 0.0001346938775510204,
"loss": 1.3506,
"step": 128
},
{
"epoch": 2.3486238532110093,
"grad_norm": 0.5381261110305786,
"learning_rate": 0.00013316326530612245,
"loss": 1.224,
"step": 129
},
{
"epoch": 2.36697247706422,
"grad_norm": 0.7020912170410156,
"learning_rate": 0.00013163265306122447,
"loss": 1.3398,
"step": 130
},
{
"epoch": 2.385321100917431,
"grad_norm": 0.5954611897468567,
"learning_rate": 0.00013010204081632652,
"loss": 1.2508,
"step": 131
},
{
"epoch": 2.4036697247706424,
"grad_norm": 0.5576110482215881,
"learning_rate": 0.00012857142857142855,
"loss": 1.266,
"step": 132
},
{
"epoch": 2.4220183486238533,
"grad_norm": 0.5651321411132812,
"learning_rate": 0.0001270408163265306,
"loss": 1.1969,
"step": 133
},
{
"epoch": 2.4403669724770642,
"grad_norm": 0.6407105326652527,
"learning_rate": 0.00012551020408163265,
"loss": 1.2142,
"step": 134
},
{
"epoch": 2.458715596330275,
"grad_norm": 0.6058876514434814,
"learning_rate": 0.00012397959183673468,
"loss": 1.2383,
"step": 135
},
{
"epoch": 2.477064220183486,
"grad_norm": 0.5757945775985718,
"learning_rate": 0.00012244897959183673,
"loss": 1.286,
"step": 136
},
{
"epoch": 2.4954128440366974,
"grad_norm": 0.5286776423454285,
"learning_rate": 0.00012091836734693877,
"loss": 1.364,
"step": 137
},
{
"epoch": 2.5137614678899083,
"grad_norm": 0.5179428458213806,
"learning_rate": 0.0001193877551020408,
"loss": 1.1119,
"step": 138
},
{
"epoch": 2.532110091743119,
"grad_norm": 0.6148931384086609,
"learning_rate": 0.00011785714285714284,
"loss": 1.2517,
"step": 139
},
{
"epoch": 2.5504587155963305,
"grad_norm": 0.5199230313301086,
"learning_rate": 0.00011632653061224488,
"loss": 1.2244,
"step": 140
},
{
"epoch": 2.5688073394495414,
"grad_norm": 0.8426241278648376,
"learning_rate": 0.00011479591836734692,
"loss": 1.2683,
"step": 141
},
{
"epoch": 2.5871559633027523,
"grad_norm": 0.5697855949401855,
"learning_rate": 0.00011326530612244896,
"loss": 1.1894,
"step": 142
},
{
"epoch": 2.6055045871559632,
"grad_norm": 0.6371927261352539,
"learning_rate": 0.00011173469387755102,
"loss": 1.1667,
"step": 143
},
{
"epoch": 2.623853211009174,
"grad_norm": 0.5687111616134644,
"learning_rate": 0.00011020408163265306,
"loss": 1.2333,
"step": 144
},
{
"epoch": 2.6422018348623855,
"grad_norm": 0.7226176261901855,
"learning_rate": 0.0001086734693877551,
"loss": 1.1316,
"step": 145
},
{
"epoch": 2.6605504587155964,
"grad_norm": 0.5449386239051819,
"learning_rate": 0.00010714285714285714,
"loss": 1.3315,
"step": 146
},
{
"epoch": 2.6788990825688073,
"grad_norm": 0.5627453327178955,
"learning_rate": 0.00010561224489795918,
"loss": 1.1903,
"step": 147
},
{
"epoch": 2.6972477064220186,
"grad_norm": 0.6717932820320129,
"learning_rate": 0.00010408163265306121,
"loss": 1.1551,
"step": 148
},
{
"epoch": 2.7155963302752295,
"grad_norm": 0.6216678619384766,
"learning_rate": 0.00010255102040816325,
"loss": 1.3409,
"step": 149
},
{
"epoch": 2.7339449541284404,
"grad_norm": 0.565977156162262,
"learning_rate": 0.00010102040816326529,
"loss": 1.2799,
"step": 150
},
{
"epoch": 2.7522935779816513,
"grad_norm": 0.6974563598632812,
"learning_rate": 9.948979591836733e-05,
"loss": 1.2034,
"step": 151
},
{
"epoch": 2.770642201834862,
"grad_norm": 0.5820234417915344,
"learning_rate": 9.795918367346937e-05,
"loss": 1.1747,
"step": 152
},
{
"epoch": 2.7889908256880735,
"grad_norm": 0.5635000467300415,
"learning_rate": 9.642857142857143e-05,
"loss": 1.2143,
"step": 153
},
{
"epoch": 2.8073394495412844,
"grad_norm": 0.6183028817176819,
"learning_rate": 9.489795918367347e-05,
"loss": 1.0934,
"step": 154
},
{
"epoch": 2.8256880733944953,
"grad_norm": 0.5435863733291626,
"learning_rate": 9.336734693877551e-05,
"loss": 1.254,
"step": 155
},
{
"epoch": 2.8440366972477067,
"grad_norm": 0.6071304678916931,
"learning_rate": 9.183673469387755e-05,
"loss": 1.0866,
"step": 156
},
{
"epoch": 2.8623853211009176,
"grad_norm": 0.6845288276672363,
"learning_rate": 9.030612244897958e-05,
"loss": 1.0657,
"step": 157
},
{
"epoch": 2.8807339449541285,
"grad_norm": 0.7707186341285706,
"learning_rate": 8.877551020408162e-05,
"loss": 1.1873,
"step": 158
},
{
"epoch": 2.8990825688073394,
"grad_norm": 0.6403721570968628,
"learning_rate": 8.724489795918366e-05,
"loss": 1.2217,
"step": 159
},
{
"epoch": 2.9174311926605503,
"grad_norm": 0.6184455752372742,
"learning_rate": 8.57142857142857e-05,
"loss": 1.2699,
"step": 160
},
{
"epoch": 2.9357798165137616,
"grad_norm": 0.6127010583877563,
"learning_rate": 8.418367346938774e-05,
"loss": 1.1339,
"step": 161
},
{
"epoch": 2.9541284403669725,
"grad_norm": 0.5781142711639404,
"learning_rate": 8.265306122448978e-05,
"loss": 1.0983,
"step": 162
},
{
"epoch": 2.9724770642201834,
"grad_norm": 0.6162577271461487,
"learning_rate": 8.112244897959184e-05,
"loss": 1.225,
"step": 163
},
{
"epoch": 2.9908256880733948,
"grad_norm": 0.5873180031776428,
"learning_rate": 7.959183673469388e-05,
"loss": 1.1027,
"step": 164
},
{
"epoch": 3.0,
"grad_norm": 0.9156239032745361,
"learning_rate": 7.806122448979592e-05,
"loss": 1.2634,
"step": 165
},
{
"epoch": 3.0,
"eval_loss": 1.765793800354004,
"eval_runtime": 40.2043,
"eval_samples_per_second": 8.258,
"eval_steps_per_second": 4.129,
"step": 165
},
{
"epoch": 3.018348623853211,
"grad_norm": 2.7086777687072754,
"learning_rate": 7.653061224489796e-05,
"loss": 0.9086,
"step": 166
},
{
"epoch": 3.036697247706422,
"grad_norm": 0.798815131187439,
"learning_rate": 7.5e-05,
"loss": 0.9862,
"step": 167
},
{
"epoch": 3.055045871559633,
"grad_norm": 0.6483126878738403,
"learning_rate": 7.346938775510203e-05,
"loss": 1.0914,
"step": 168
},
{
"epoch": 3.073394495412844,
"grad_norm": 0.9064918160438538,
"learning_rate": 7.193877551020407e-05,
"loss": 0.9983,
"step": 169
},
{
"epoch": 3.091743119266055,
"grad_norm": 0.5859102010726929,
"learning_rate": 7.040816326530612e-05,
"loss": 1.2501,
"step": 170
},
{
"epoch": 3.1100917431192663,
"grad_norm": 1.0210392475128174,
"learning_rate": 6.887755102040816e-05,
"loss": 0.9503,
"step": 171
},
{
"epoch": 3.128440366972477,
"grad_norm": 0.5909506678581238,
"learning_rate": 6.73469387755102e-05,
"loss": 1.1646,
"step": 172
},
{
"epoch": 3.146788990825688,
"grad_norm": 0.8601213693618774,
"learning_rate": 6.581632653061224e-05,
"loss": 1.089,
"step": 173
},
{
"epoch": 3.165137614678899,
"grad_norm": 1.5392725467681885,
"learning_rate": 6.428571428571427e-05,
"loss": 1.1588,
"step": 174
},
{
"epoch": 3.18348623853211,
"grad_norm": 0.6926484107971191,
"learning_rate": 6.275510204081633e-05,
"loss": 1.0608,
"step": 175
},
{
"epoch": 3.2018348623853212,
"grad_norm": 0.6359078288078308,
"learning_rate": 6.122448979591836e-05,
"loss": 1.2014,
"step": 176
},
{
"epoch": 3.220183486238532,
"grad_norm": 0.7411192655563354,
"learning_rate": 5.96938775510204e-05,
"loss": 1.0524,
"step": 177
},
{
"epoch": 3.238532110091743,
"grad_norm": 0.7913389801979065,
"learning_rate": 5.816326530612244e-05,
"loss": 0.9702,
"step": 178
},
{
"epoch": 3.2568807339449544,
"grad_norm": 0.5389623045921326,
"learning_rate": 5.663265306122448e-05,
"loss": 1.0675,
"step": 179
},
{
"epoch": 3.2752293577981653,
"grad_norm": 0.4771580398082733,
"learning_rate": 5.510204081632653e-05,
"loss": 1.0254,
"step": 180
},
{
"epoch": 3.293577981651376,
"grad_norm": 0.7268071174621582,
"learning_rate": 5.357142857142857e-05,
"loss": 1.0475,
"step": 181
},
{
"epoch": 3.311926605504587,
"grad_norm": 0.6685888767242432,
"learning_rate": 5.204081632653061e-05,
"loss": 0.9658,
"step": 182
},
{
"epoch": 3.330275229357798,
"grad_norm": 0.5965012907981873,
"learning_rate": 5.0510204081632645e-05,
"loss": 1.0657,
"step": 183
},
{
"epoch": 3.3486238532110093,
"grad_norm": 0.5370911955833435,
"learning_rate": 4.897959183673468e-05,
"loss": 1.0277,
"step": 184
},
{
"epoch": 3.36697247706422,
"grad_norm": 0.646511971950531,
"learning_rate": 4.7448979591836735e-05,
"loss": 1.1439,
"step": 185
},
{
"epoch": 3.385321100917431,
"grad_norm": 0.5768170356750488,
"learning_rate": 4.591836734693877e-05,
"loss": 1.253,
"step": 186
},
{
"epoch": 3.4036697247706424,
"grad_norm": 0.6067250370979309,
"learning_rate": 4.438775510204081e-05,
"loss": 0.9355,
"step": 187
},
{
"epoch": 3.4220183486238533,
"grad_norm": 0.8360852003097534,
"learning_rate": 4.285714285714285e-05,
"loss": 1.0526,
"step": 188
},
{
"epoch": 3.4403669724770642,
"grad_norm": 0.59422767162323,
"learning_rate": 4.132653061224489e-05,
"loss": 1.1794,
"step": 189
},
{
"epoch": 3.458715596330275,
"grad_norm": 0.5541112422943115,
"learning_rate": 3.979591836734694e-05,
"loss": 0.9477,
"step": 190
},
{
"epoch": 3.477064220183486,
"grad_norm": 0.587552547454834,
"learning_rate": 3.826530612244898e-05,
"loss": 1.0714,
"step": 191
},
{
"epoch": 3.4954128440366974,
"grad_norm": 0.5143390893936157,
"learning_rate": 3.6734693877551016e-05,
"loss": 1.0379,
"step": 192
},
{
"epoch": 3.5137614678899083,
"grad_norm": 0.6476455330848694,
"learning_rate": 3.520408163265306e-05,
"loss": 0.9505,
"step": 193
},
{
"epoch": 3.532110091743119,
"grad_norm": 0.5314425230026245,
"learning_rate": 3.36734693877551e-05,
"loss": 1.0504,
"step": 194
},
{
"epoch": 3.5504587155963305,
"grad_norm": 0.6685779094696045,
"learning_rate": 3.214285714285714e-05,
"loss": 1.083,
"step": 195
},
{
"epoch": 3.5688073394495414,
"grad_norm": 0.5587136149406433,
"learning_rate": 3.061224489795918e-05,
"loss": 0.9242,
"step": 196
},
{
"epoch": 3.5871559633027523,
"grad_norm": 0.5730684995651245,
"learning_rate": 2.908163265306122e-05,
"loss": 1.0271,
"step": 197
},
{
"epoch": 3.6055045871559632,
"grad_norm": 0.5254084467887878,
"learning_rate": 2.7551020408163265e-05,
"loss": 1.0828,
"step": 198
},
{
"epoch": 3.623853211009174,
"grad_norm": 0.5611628890037537,
"learning_rate": 2.6020408163265303e-05,
"loss": 1.1933,
"step": 199
},
{
"epoch": 3.6422018348623855,
"grad_norm": 0.541988730430603,
"learning_rate": 2.448979591836734e-05,
"loss": 0.9135,
"step": 200
},
{
"epoch": 3.6605504587155964,
"grad_norm": 0.5615597367286682,
"learning_rate": 2.2959183673469387e-05,
"loss": 0.9679,
"step": 201
},
{
"epoch": 3.6788990825688073,
"grad_norm": 0.664253830909729,
"learning_rate": 2.1428571428571425e-05,
"loss": 0.9984,
"step": 202
},
{
"epoch": 3.6972477064220186,
"grad_norm": 0.5762522220611572,
"learning_rate": 1.989795918367347e-05,
"loss": 1.0341,
"step": 203
},
{
"epoch": 3.7155963302752295,
"grad_norm": 0.544408917427063,
"learning_rate": 1.8367346938775508e-05,
"loss": 1.2647,
"step": 204
},
{
"epoch": 3.7339449541284404,
"grad_norm": 0.5570142865180969,
"learning_rate": 1.683673469387755e-05,
"loss": 0.8697,
"step": 205
},
{
"epoch": 3.7522935779816513,
"grad_norm": 0.5823831558227539,
"learning_rate": 1.530612244897959e-05,
"loss": 1.0817,
"step": 206
},
{
"epoch": 3.770642201834862,
"grad_norm": 0.672044038772583,
"learning_rate": 1.3775510204081633e-05,
"loss": 1.029,
"step": 207
},
{
"epoch": 3.7889908256880735,
"grad_norm": 0.6006896495819092,
"learning_rate": 1.224489795918367e-05,
"loss": 1.1158,
"step": 208
},
{
"epoch": 3.8073394495412844,
"grad_norm": 0.546748697757721,
"learning_rate": 1.0714285714285712e-05,
"loss": 0.8507,
"step": 209
},
{
"epoch": 3.8256880733944953,
"grad_norm": 0.6468778252601624,
"learning_rate": 9.183673469387754e-06,
"loss": 1.0383,
"step": 210
},
{
"epoch": 3.8440366972477067,
"grad_norm": 0.510528564453125,
"learning_rate": 7.653061224489796e-06,
"loss": 0.8932,
"step": 211
},
{
"epoch": 3.8623853211009176,
"grad_norm": 0.5235452055931091,
"learning_rate": 6.122448979591835e-06,
"loss": 0.9912,
"step": 212
},
{
"epoch": 3.8807339449541285,
"grad_norm": 0.589995801448822,
"learning_rate": 4.591836734693877e-06,
"loss": 0.9821,
"step": 213
},
{
"epoch": 3.8990825688073394,
"grad_norm": 0.5095317959785461,
"learning_rate": 3.0612244897959177e-06,
"loss": 0.9985,
"step": 214
},
{
"epoch": 3.9174311926605503,
"grad_norm": 0.6477232575416565,
"learning_rate": 1.5306122448979589e-06,
"loss": 0.9372,
"step": 215
},
{
"epoch": 3.9357798165137616,
"grad_norm": 0.5697444677352905,
"learning_rate": 0.0,
"loss": 1.0124,
"step": 216
},
{
"epoch": 3.9357798165137616,
"eval_loss": 1.7458010911941528,
"eval_runtime": 39.9333,
"eval_samples_per_second": 8.314,
"eval_steps_per_second": 4.157,
"step": 216
}
],
"logging_steps": 1,
"max_steps": 216,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.809327135010898e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}