| { |
| "best_metric": 1.7067729234695435, |
| "best_model_checkpoint": "4bit_repro_03022025/host18_seed_42_full_det_fp16_no_flash_attn_fix_pad_llama-3.2-instruct-l16-cot-wt_feb7-4ep-lr3e04-ws20-bs8-ga4-fp16-16022025/checkpoint-110", |
| "epoch": 3.9357798165137616, |
| "eval_steps": 500, |
| "global_step": 216, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.01834862385321101, |
| "grad_norm": 0.8146735429763794, |
| "learning_rate": 1.4999999999999999e-05, |
| "loss": 3.458, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.03669724770642202, |
| "grad_norm": 0.8460527658462524, |
| "learning_rate": 2.9999999999999997e-05, |
| "loss": 3.318, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.05504587155963303, |
| "grad_norm": 1.0199122428894043, |
| "learning_rate": 4.4999999999999996e-05, |
| "loss": 3.4251, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.07339449541284404, |
| "grad_norm": 0.738707959651947, |
| "learning_rate": 5.9999999999999995e-05, |
| "loss": 3.3918, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.09174311926605505, |
| "grad_norm": 0.9168875217437744, |
| "learning_rate": 7.5e-05, |
| "loss": 3.2804, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.11009174311926606, |
| "grad_norm": 0.7749077081680298, |
| "learning_rate": 8.999999999999999e-05, |
| "loss": 3.1672, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.12844036697247707, |
| "grad_norm": 0.8257707953453064, |
| "learning_rate": 0.00010499999999999999, |
| "loss": 3.1118, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.14678899082568808, |
| "grad_norm": 0.7854360342025757, |
| "learning_rate": 0.00011999999999999999, |
| "loss": 3.1685, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.1651376146788991, |
| "grad_norm": 0.9187895059585571, |
| "learning_rate": 0.000135, |
| "loss": 2.821, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.1834862385321101, |
| "grad_norm": 0.70020991563797, |
| "learning_rate": 0.00015, |
| "loss": 2.7924, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.2018348623853211, |
| "grad_norm": 0.513304591178894, |
| "learning_rate": 0.000165, |
| "loss": 2.8753, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.22018348623853212, |
| "grad_norm": 0.49817588925361633, |
| "learning_rate": 0.00017999999999999998, |
| "loss": 2.6701, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.23853211009174313, |
| "grad_norm": 0.4605925977230072, |
| "learning_rate": 0.000195, |
| "loss": 2.7468, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.25688073394495414, |
| "grad_norm": 0.5559924244880676, |
| "learning_rate": 0.00020999999999999998, |
| "loss": 2.532, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.27522935779816515, |
| "grad_norm": 0.5250511765480042, |
| "learning_rate": 0.000225, |
| "loss": 2.8411, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.29357798165137616, |
| "grad_norm": 0.5654911398887634, |
| "learning_rate": 0.00023999999999999998, |
| "loss": 2.5375, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.3119266055045872, |
| "grad_norm": 0.6329001188278198, |
| "learning_rate": 0.00025499999999999996, |
| "loss": 2.431, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.3302752293577982, |
| "grad_norm": 0.6397466063499451, |
| "learning_rate": 0.00027, |
| "loss": 2.4786, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.3486238532110092, |
| "grad_norm": 0.5889459252357483, |
| "learning_rate": 0.000285, |
| "loss": 2.6437, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.3669724770642202, |
| "grad_norm": 0.5488238334655762, |
| "learning_rate": 0.0003, |
| "loss": 2.539, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.3853211009174312, |
| "grad_norm": 0.5296560525894165, |
| "learning_rate": 0.00029846938775510205, |
| "loss": 2.3395, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.4036697247706422, |
| "grad_norm": 0.4363678991794586, |
| "learning_rate": 0.0002969387755102041, |
| "loss": 2.2912, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.42201834862385323, |
| "grad_norm": 0.4143035411834717, |
| "learning_rate": 0.0002954081632653061, |
| "loss": 2.3358, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.44036697247706424, |
| "grad_norm": 0.4551326632499695, |
| "learning_rate": 0.0002938775510204081, |
| "loss": 2.2228, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.45871559633027525, |
| "grad_norm": 0.39562949538230896, |
| "learning_rate": 0.0002923469387755102, |
| "loss": 2.0389, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.47706422018348627, |
| "grad_norm": 0.35010403394699097, |
| "learning_rate": 0.00029081632653061223, |
| "loss": 2.3651, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.4954128440366973, |
| "grad_norm": 0.3502652943134308, |
| "learning_rate": 0.00028928571428571425, |
| "loss": 1.8895, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.5137614678899083, |
| "grad_norm": 0.3823269307613373, |
| "learning_rate": 0.0002877551020408163, |
| "loss": 2.3629, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.5321100917431193, |
| "grad_norm": 0.3375133275985718, |
| "learning_rate": 0.00028622448979591836, |
| "loss": 2.1808, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.5504587155963303, |
| "grad_norm": 0.3216778635978699, |
| "learning_rate": 0.0002846938775510204, |
| "loss": 2.2516, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.5688073394495413, |
| "grad_norm": 0.30818313360214233, |
| "learning_rate": 0.0002831632653061224, |
| "loss": 2.1212, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.5871559633027523, |
| "grad_norm": 0.32901531457901, |
| "learning_rate": 0.0002816326530612245, |
| "loss": 1.9435, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.6055045871559633, |
| "grad_norm": 0.3240410387516022, |
| "learning_rate": 0.0002801020408163265, |
| "loss": 1.8977, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.6238532110091743, |
| "grad_norm": 0.31374916434288025, |
| "learning_rate": 0.00027857142857142854, |
| "loss": 2.2115, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.6422018348623854, |
| "grad_norm": 0.31450748443603516, |
| "learning_rate": 0.00027704081632653056, |
| "loss": 2.0926, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.6605504587155964, |
| "grad_norm": 0.30289527773857117, |
| "learning_rate": 0.00027551020408163264, |
| "loss": 2.1462, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.6788990825688074, |
| "grad_norm": 0.3504111170768738, |
| "learning_rate": 0.00027397959183673466, |
| "loss": 2.2673, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.6972477064220184, |
| "grad_norm": 0.3244895040988922, |
| "learning_rate": 0.0002724489795918367, |
| "loss": 2.1864, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.7155963302752294, |
| "grad_norm": 0.3336032032966614, |
| "learning_rate": 0.0002709183673469387, |
| "loss": 1.8489, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.7339449541284404, |
| "grad_norm": 0.3516104519367218, |
| "learning_rate": 0.0002693877551020408, |
| "loss": 2.3057, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.7522935779816514, |
| "grad_norm": 0.30326011776924133, |
| "learning_rate": 0.00026785714285714287, |
| "loss": 1.9602, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.7706422018348624, |
| "grad_norm": 0.34224340319633484, |
| "learning_rate": 0.0002663265306122449, |
| "loss": 1.994, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.7889908256880734, |
| "grad_norm": 0.33715730905532837, |
| "learning_rate": 0.0002647959183673469, |
| "loss": 1.9931, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.8073394495412844, |
| "grad_norm": 0.3272618353366852, |
| "learning_rate": 0.00026326530612244894, |
| "loss": 2.1401, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.8256880733944955, |
| "grad_norm": 0.3652991056442261, |
| "learning_rate": 0.000261734693877551, |
| "loss": 1.625, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.8440366972477065, |
| "grad_norm": 0.3317834138870239, |
| "learning_rate": 0.00026020408163265305, |
| "loss": 2.0808, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.8623853211009175, |
| "grad_norm": 0.36255204677581787, |
| "learning_rate": 0.00025867346938775507, |
| "loss": 1.7574, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.8807339449541285, |
| "grad_norm": 0.35033008456230164, |
| "learning_rate": 0.0002571428571428571, |
| "loss": 1.9578, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.8990825688073395, |
| "grad_norm": 0.3665790855884552, |
| "learning_rate": 0.0002556122448979592, |
| "loss": 1.9175, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.9174311926605505, |
| "grad_norm": 0.3547409772872925, |
| "learning_rate": 0.0002540816326530612, |
| "loss": 1.8527, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.9357798165137615, |
| "grad_norm": 0.37087056040763855, |
| "learning_rate": 0.0002525510204081632, |
| "loss": 1.7837, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.9541284403669725, |
| "grad_norm": 0.40799495577812195, |
| "learning_rate": 0.0002510204081632653, |
| "loss": 1.5264, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.9724770642201835, |
| "grad_norm": 0.3731648027896881, |
| "learning_rate": 0.00024948979591836733, |
| "loss": 1.8249, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.9908256880733946, |
| "grad_norm": 0.3571811616420746, |
| "learning_rate": 0.00024795918367346935, |
| "loss": 1.7617, |
| "step": 54 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.5152439475059509, |
| "learning_rate": 0.0002464285714285714, |
| "loss": 1.8187, |
| "step": 55 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 1.8342303037643433, |
| "eval_runtime": 40.5217, |
| "eval_samples_per_second": 8.193, |
| "eval_steps_per_second": 4.097, |
| "step": 55 |
| }, |
| { |
| "epoch": 1.018348623853211, |
| "grad_norm": 0.38449227809906006, |
| "learning_rate": 0.00024489795918367346, |
| "loss": 1.7036, |
| "step": 56 |
| }, |
| { |
| "epoch": 1.036697247706422, |
| "grad_norm": 0.3873405158519745, |
| "learning_rate": 0.00024336734693877548, |
| "loss": 1.9332, |
| "step": 57 |
| }, |
| { |
| "epoch": 1.0550458715596331, |
| "grad_norm": 0.4064755439758301, |
| "learning_rate": 0.00024183673469387753, |
| "loss": 1.7656, |
| "step": 58 |
| }, |
| { |
| "epoch": 1.073394495412844, |
| "grad_norm": 0.39023351669311523, |
| "learning_rate": 0.00024030612244897956, |
| "loss": 1.8004, |
| "step": 59 |
| }, |
| { |
| "epoch": 1.091743119266055, |
| "grad_norm": 0.40451326966285706, |
| "learning_rate": 0.0002387755102040816, |
| "loss": 1.674, |
| "step": 60 |
| }, |
| { |
| "epoch": 1.110091743119266, |
| "grad_norm": 0.3661345839500427, |
| "learning_rate": 0.00023724489795918366, |
| "loss": 1.9113, |
| "step": 61 |
| }, |
| { |
| "epoch": 1.1284403669724772, |
| "grad_norm": 0.4025841951370239, |
| "learning_rate": 0.00023571428571428569, |
| "loss": 1.8991, |
| "step": 62 |
| }, |
| { |
| "epoch": 1.146788990825688, |
| "grad_norm": 0.41999149322509766, |
| "learning_rate": 0.00023418367346938774, |
| "loss": 1.7361, |
| "step": 63 |
| }, |
| { |
| "epoch": 1.165137614678899, |
| "grad_norm": 0.39743533730506897, |
| "learning_rate": 0.00023265306122448976, |
| "loss": 1.5829, |
| "step": 64 |
| }, |
| { |
| "epoch": 1.18348623853211, |
| "grad_norm": 0.42366546392440796, |
| "learning_rate": 0.00023112244897959181, |
| "loss": 1.7228, |
| "step": 65 |
| }, |
| { |
| "epoch": 1.2018348623853212, |
| "grad_norm": 0.37864112854003906, |
| "learning_rate": 0.00022959183673469384, |
| "loss": 1.636, |
| "step": 66 |
| }, |
| { |
| "epoch": 1.2201834862385321, |
| "grad_norm": 0.392802357673645, |
| "learning_rate": 0.0002280612244897959, |
| "loss": 1.5435, |
| "step": 67 |
| }, |
| { |
| "epoch": 1.238532110091743, |
| "grad_norm": 0.4146474301815033, |
| "learning_rate": 0.00022653061224489791, |
| "loss": 1.7305, |
| "step": 68 |
| }, |
| { |
| "epoch": 1.2568807339449541, |
| "grad_norm": 0.401034951210022, |
| "learning_rate": 0.000225, |
| "loss": 1.6097, |
| "step": 69 |
| }, |
| { |
| "epoch": 1.2752293577981653, |
| "grad_norm": 0.4014700949192047, |
| "learning_rate": 0.00022346938775510205, |
| "loss": 1.8824, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.2935779816513762, |
| "grad_norm": 0.4407334327697754, |
| "learning_rate": 0.00022193877551020407, |
| "loss": 1.6208, |
| "step": 71 |
| }, |
| { |
| "epoch": 1.311926605504587, |
| "grad_norm": 0.3951621353626251, |
| "learning_rate": 0.00022040816326530612, |
| "loss": 1.7436, |
| "step": 72 |
| }, |
| { |
| "epoch": 1.3302752293577982, |
| "grad_norm": 0.40659868717193604, |
| "learning_rate": 0.00021887755102040815, |
| "loss": 1.7545, |
| "step": 73 |
| }, |
| { |
| "epoch": 1.3486238532110093, |
| "grad_norm": 0.40831202268600464, |
| "learning_rate": 0.0002173469387755102, |
| "loss": 1.8652, |
| "step": 74 |
| }, |
| { |
| "epoch": 1.3669724770642202, |
| "grad_norm": 0.44127607345581055, |
| "learning_rate": 0.00021581632653061222, |
| "loss": 1.5706, |
| "step": 75 |
| }, |
| { |
| "epoch": 1.385321100917431, |
| "grad_norm": 0.413889616727829, |
| "learning_rate": 0.00021428571428571427, |
| "loss": 1.5272, |
| "step": 76 |
| }, |
| { |
| "epoch": 1.4036697247706422, |
| "grad_norm": 0.4785701036453247, |
| "learning_rate": 0.0002127551020408163, |
| "loss": 1.572, |
| "step": 77 |
| }, |
| { |
| "epoch": 1.4220183486238533, |
| "grad_norm": 0.45182228088378906, |
| "learning_rate": 0.00021122448979591835, |
| "loss": 1.5978, |
| "step": 78 |
| }, |
| { |
| "epoch": 1.4403669724770642, |
| "grad_norm": 0.4382292628288269, |
| "learning_rate": 0.0002096938775510204, |
| "loss": 1.6424, |
| "step": 79 |
| }, |
| { |
| "epoch": 1.4587155963302751, |
| "grad_norm": 0.529373288154602, |
| "learning_rate": 0.00020816326530612243, |
| "loss": 1.5756, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.4770642201834863, |
| "grad_norm": 0.4828866720199585, |
| "learning_rate": 0.00020663265306122448, |
| "loss": 1.6252, |
| "step": 81 |
| }, |
| { |
| "epoch": 1.4954128440366974, |
| "grad_norm": 0.455864816904068, |
| "learning_rate": 0.0002051020408163265, |
| "loss": 1.6719, |
| "step": 82 |
| }, |
| { |
| "epoch": 1.5137614678899083, |
| "grad_norm": 0.43042704463005066, |
| "learning_rate": 0.00020357142857142856, |
| "loss": 1.4768, |
| "step": 83 |
| }, |
| { |
| "epoch": 1.5321100917431192, |
| "grad_norm": 0.43703892827033997, |
| "learning_rate": 0.00020204081632653058, |
| "loss": 1.4823, |
| "step": 84 |
| }, |
| { |
| "epoch": 1.5504587155963303, |
| "grad_norm": 0.6060220003128052, |
| "learning_rate": 0.00020051020408163263, |
| "loss": 1.7325, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.5688073394495414, |
| "grad_norm": 0.4146731495857239, |
| "learning_rate": 0.00019897959183673466, |
| "loss": 1.5784, |
| "step": 86 |
| }, |
| { |
| "epoch": 1.5871559633027523, |
| "grad_norm": 0.48514559864997864, |
| "learning_rate": 0.0001974489795918367, |
| "loss": 1.7322, |
| "step": 87 |
| }, |
| { |
| "epoch": 1.6055045871559632, |
| "grad_norm": 0.4649484157562256, |
| "learning_rate": 0.00019591836734693873, |
| "loss": 1.4178, |
| "step": 88 |
| }, |
| { |
| "epoch": 1.6238532110091743, |
| "grad_norm": 0.48133671283721924, |
| "learning_rate": 0.0001943877551020408, |
| "loss": 1.5013, |
| "step": 89 |
| }, |
| { |
| "epoch": 1.6422018348623855, |
| "grad_norm": 0.4548419415950775, |
| "learning_rate": 0.00019285714285714286, |
| "loss": 1.5874, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.6605504587155964, |
| "grad_norm": 0.5536527633666992, |
| "learning_rate": 0.0001913265306122449, |
| "loss": 1.4934, |
| "step": 91 |
| }, |
| { |
| "epoch": 1.6788990825688073, |
| "grad_norm": 0.5949488878250122, |
| "learning_rate": 0.00018979591836734694, |
| "loss": 1.5306, |
| "step": 92 |
| }, |
| { |
| "epoch": 1.6972477064220184, |
| "grad_norm": 0.5679605603218079, |
| "learning_rate": 0.00018826530612244896, |
| "loss": 1.5162, |
| "step": 93 |
| }, |
| { |
| "epoch": 1.7155963302752295, |
| "grad_norm": 0.5724030137062073, |
| "learning_rate": 0.00018673469387755102, |
| "loss": 1.4972, |
| "step": 94 |
| }, |
| { |
| "epoch": 1.7339449541284404, |
| "grad_norm": 0.5926903486251831, |
| "learning_rate": 0.00018520408163265304, |
| "loss": 1.512, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.7522935779816513, |
| "grad_norm": 0.49787193536758423, |
| "learning_rate": 0.0001836734693877551, |
| "loss": 1.5754, |
| "step": 96 |
| }, |
| { |
| "epoch": 1.7706422018348624, |
| "grad_norm": 0.521364152431488, |
| "learning_rate": 0.00018214285714285712, |
| "loss": 1.5558, |
| "step": 97 |
| }, |
| { |
| "epoch": 1.7889908256880735, |
| "grad_norm": 0.5252606868743896, |
| "learning_rate": 0.00018061224489795917, |
| "loss": 1.5562, |
| "step": 98 |
| }, |
| { |
| "epoch": 1.8073394495412844, |
| "grad_norm": 0.520020604133606, |
| "learning_rate": 0.00017908163265306122, |
| "loss": 1.5431, |
| "step": 99 |
| }, |
| { |
| "epoch": 1.8256880733944953, |
| "grad_norm": 0.5684699416160583, |
| "learning_rate": 0.00017755102040816325, |
| "loss": 1.4776, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.8440366972477065, |
| "grad_norm": 0.49918070435523987, |
| "learning_rate": 0.0001760204081632653, |
| "loss": 1.5665, |
| "step": 101 |
| }, |
| { |
| "epoch": 1.8623853211009176, |
| "grad_norm": 0.5219622850418091, |
| "learning_rate": 0.00017448979591836732, |
| "loss": 1.4726, |
| "step": 102 |
| }, |
| { |
| "epoch": 1.8807339449541285, |
| "grad_norm": 0.49506455659866333, |
| "learning_rate": 0.00017295918367346937, |
| "loss": 1.5677, |
| "step": 103 |
| }, |
| { |
| "epoch": 1.8990825688073394, |
| "grad_norm": 0.48011767864227295, |
| "learning_rate": 0.0001714285714285714, |
| "loss": 1.4628, |
| "step": 104 |
| }, |
| { |
| "epoch": 1.9174311926605505, |
| "grad_norm": 0.49670523405075073, |
| "learning_rate": 0.00016989795918367345, |
| "loss": 1.343, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.9357798165137616, |
| "grad_norm": 0.5730084180831909, |
| "learning_rate": 0.00016836734693877547, |
| "loss": 1.5211, |
| "step": 106 |
| }, |
| { |
| "epoch": 1.9541284403669725, |
| "grad_norm": 0.5185048580169678, |
| "learning_rate": 0.00016683673469387753, |
| "loss": 1.4416, |
| "step": 107 |
| }, |
| { |
| "epoch": 1.9724770642201834, |
| "grad_norm": 0.5075457692146301, |
| "learning_rate": 0.00016530612244897955, |
| "loss": 1.3877, |
| "step": 108 |
| }, |
| { |
| "epoch": 1.9908256880733946, |
| "grad_norm": 0.5157256126403809, |
| "learning_rate": 0.00016377551020408163, |
| "loss": 1.4932, |
| "step": 109 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.8046102523803711, |
| "learning_rate": 0.00016224489795918368, |
| "loss": 1.3092, |
| "step": 110 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 1.7067729234695435, |
| "eval_runtime": 40.3292, |
| "eval_samples_per_second": 8.232, |
| "eval_steps_per_second": 4.116, |
| "step": 110 |
| }, |
| { |
| "epoch": 2.018348623853211, |
| "grad_norm": 0.5372384786605835, |
| "learning_rate": 0.0001607142857142857, |
| "loss": 1.4086, |
| "step": 111 |
| }, |
| { |
| "epoch": 2.036697247706422, |
| "grad_norm": 0.5499650239944458, |
| "learning_rate": 0.00015918367346938776, |
| "loss": 1.4309, |
| "step": 112 |
| }, |
| { |
| "epoch": 2.055045871559633, |
| "grad_norm": 0.5393022298812866, |
| "learning_rate": 0.00015765306122448978, |
| "loss": 1.3377, |
| "step": 113 |
| }, |
| { |
| "epoch": 2.073394495412844, |
| "grad_norm": 0.4644460678100586, |
| "learning_rate": 0.00015612244897959183, |
| "loss": 1.3486, |
| "step": 114 |
| }, |
| { |
| "epoch": 2.091743119266055, |
| "grad_norm": 0.5637883543968201, |
| "learning_rate": 0.00015459183673469386, |
| "loss": 1.3423, |
| "step": 115 |
| }, |
| { |
| "epoch": 2.1100917431192663, |
| "grad_norm": 0.5196585655212402, |
| "learning_rate": 0.0001530612244897959, |
| "loss": 1.1701, |
| "step": 116 |
| }, |
| { |
| "epoch": 2.128440366972477, |
| "grad_norm": 0.5293002724647522, |
| "learning_rate": 0.00015153061224489794, |
| "loss": 1.3339, |
| "step": 117 |
| }, |
| { |
| "epoch": 2.146788990825688, |
| "grad_norm": 0.5287687182426453, |
| "learning_rate": 0.00015, |
| "loss": 1.2385, |
| "step": 118 |
| }, |
| { |
| "epoch": 2.165137614678899, |
| "grad_norm": 0.5759944319725037, |
| "learning_rate": 0.00014846938775510204, |
| "loss": 1.3614, |
| "step": 119 |
| }, |
| { |
| "epoch": 2.18348623853211, |
| "grad_norm": 0.6487063765525818, |
| "learning_rate": 0.00014693877551020406, |
| "loss": 1.392, |
| "step": 120 |
| }, |
| { |
| "epoch": 2.2018348623853212, |
| "grad_norm": 0.5643731355667114, |
| "learning_rate": 0.00014540816326530611, |
| "loss": 1.3722, |
| "step": 121 |
| }, |
| { |
| "epoch": 2.220183486238532, |
| "grad_norm": 0.6352585554122925, |
| "learning_rate": 0.00014387755102040814, |
| "loss": 1.4423, |
| "step": 122 |
| }, |
| { |
| "epoch": 2.238532110091743, |
| "grad_norm": 0.5605873465538025, |
| "learning_rate": 0.0001423469387755102, |
| "loss": 1.3284, |
| "step": 123 |
| }, |
| { |
| "epoch": 2.2568807339449544, |
| "grad_norm": 0.49541911482810974, |
| "learning_rate": 0.00014081632653061224, |
| "loss": 1.324, |
| "step": 124 |
| }, |
| { |
| "epoch": 2.2752293577981653, |
| "grad_norm": 0.5412710905075073, |
| "learning_rate": 0.00013928571428571427, |
| "loss": 1.2605, |
| "step": 125 |
| }, |
| { |
| "epoch": 2.293577981651376, |
| "grad_norm": 0.575342059135437, |
| "learning_rate": 0.00013775510204081632, |
| "loss": 1.2615, |
| "step": 126 |
| }, |
| { |
| "epoch": 2.311926605504587, |
| "grad_norm": 0.6061179637908936, |
| "learning_rate": 0.00013622448979591834, |
| "loss": 1.3793, |
| "step": 127 |
| }, |
| { |
| "epoch": 2.330275229357798, |
| "grad_norm": 0.5834862589836121, |
| "learning_rate": 0.0001346938775510204, |
| "loss": 1.3506, |
| "step": 128 |
| }, |
| { |
| "epoch": 2.3486238532110093, |
| "grad_norm": 0.5381261110305786, |
| "learning_rate": 0.00013316326530612245, |
| "loss": 1.224, |
| "step": 129 |
| }, |
| { |
| "epoch": 2.36697247706422, |
| "grad_norm": 0.7020912170410156, |
| "learning_rate": 0.00013163265306122447, |
| "loss": 1.3398, |
| "step": 130 |
| }, |
| { |
| "epoch": 2.385321100917431, |
| "grad_norm": 0.5954611897468567, |
| "learning_rate": 0.00013010204081632652, |
| "loss": 1.2508, |
| "step": 131 |
| }, |
| { |
| "epoch": 2.4036697247706424, |
| "grad_norm": 0.5576110482215881, |
| "learning_rate": 0.00012857142857142855, |
| "loss": 1.266, |
| "step": 132 |
| }, |
| { |
| "epoch": 2.4220183486238533, |
| "grad_norm": 0.5651321411132812, |
| "learning_rate": 0.0001270408163265306, |
| "loss": 1.1969, |
| "step": 133 |
| }, |
| { |
| "epoch": 2.4403669724770642, |
| "grad_norm": 0.6407105326652527, |
| "learning_rate": 0.00012551020408163265, |
| "loss": 1.2142, |
| "step": 134 |
| }, |
| { |
| "epoch": 2.458715596330275, |
| "grad_norm": 0.6058876514434814, |
| "learning_rate": 0.00012397959183673468, |
| "loss": 1.2383, |
| "step": 135 |
| }, |
| { |
| "epoch": 2.477064220183486, |
| "grad_norm": 0.5757945775985718, |
| "learning_rate": 0.00012244897959183673, |
| "loss": 1.286, |
| "step": 136 |
| }, |
| { |
| "epoch": 2.4954128440366974, |
| "grad_norm": 0.5286776423454285, |
| "learning_rate": 0.00012091836734693877, |
| "loss": 1.364, |
| "step": 137 |
| }, |
| { |
| "epoch": 2.5137614678899083, |
| "grad_norm": 0.5179428458213806, |
| "learning_rate": 0.0001193877551020408, |
| "loss": 1.1119, |
| "step": 138 |
| }, |
| { |
| "epoch": 2.532110091743119, |
| "grad_norm": 0.6148931384086609, |
| "learning_rate": 0.00011785714285714284, |
| "loss": 1.2517, |
| "step": 139 |
| }, |
| { |
| "epoch": 2.5504587155963305, |
| "grad_norm": 0.5199230313301086, |
| "learning_rate": 0.00011632653061224488, |
| "loss": 1.2244, |
| "step": 140 |
| }, |
| { |
| "epoch": 2.5688073394495414, |
| "grad_norm": 0.8426241278648376, |
| "learning_rate": 0.00011479591836734692, |
| "loss": 1.2683, |
| "step": 141 |
| }, |
| { |
| "epoch": 2.5871559633027523, |
| "grad_norm": 0.5697855949401855, |
| "learning_rate": 0.00011326530612244896, |
| "loss": 1.1894, |
| "step": 142 |
| }, |
| { |
| "epoch": 2.6055045871559632, |
| "grad_norm": 0.6371927261352539, |
| "learning_rate": 0.00011173469387755102, |
| "loss": 1.1667, |
| "step": 143 |
| }, |
| { |
| "epoch": 2.623853211009174, |
| "grad_norm": 0.5687111616134644, |
| "learning_rate": 0.00011020408163265306, |
| "loss": 1.2333, |
| "step": 144 |
| }, |
| { |
| "epoch": 2.6422018348623855, |
| "grad_norm": 0.7226176261901855, |
| "learning_rate": 0.0001086734693877551, |
| "loss": 1.1316, |
| "step": 145 |
| }, |
| { |
| "epoch": 2.6605504587155964, |
| "grad_norm": 0.5449386239051819, |
| "learning_rate": 0.00010714285714285714, |
| "loss": 1.3315, |
| "step": 146 |
| }, |
| { |
| "epoch": 2.6788990825688073, |
| "grad_norm": 0.5627453327178955, |
| "learning_rate": 0.00010561224489795918, |
| "loss": 1.1903, |
| "step": 147 |
| }, |
| { |
| "epoch": 2.6972477064220186, |
| "grad_norm": 0.6717932820320129, |
| "learning_rate": 0.00010408163265306121, |
| "loss": 1.1551, |
| "step": 148 |
| }, |
| { |
| "epoch": 2.7155963302752295, |
| "grad_norm": 0.6216678619384766, |
| "learning_rate": 0.00010255102040816325, |
| "loss": 1.3409, |
| "step": 149 |
| }, |
| { |
| "epoch": 2.7339449541284404, |
| "grad_norm": 0.565977156162262, |
| "learning_rate": 0.00010102040816326529, |
| "loss": 1.2799, |
| "step": 150 |
| }, |
| { |
| "epoch": 2.7522935779816513, |
| "grad_norm": 0.6974563598632812, |
| "learning_rate": 9.948979591836733e-05, |
| "loss": 1.2034, |
| "step": 151 |
| }, |
| { |
| "epoch": 2.770642201834862, |
| "grad_norm": 0.5820234417915344, |
| "learning_rate": 9.795918367346937e-05, |
| "loss": 1.1747, |
| "step": 152 |
| }, |
| { |
| "epoch": 2.7889908256880735, |
| "grad_norm": 0.5635000467300415, |
| "learning_rate": 9.642857142857143e-05, |
| "loss": 1.2143, |
| "step": 153 |
| }, |
| { |
| "epoch": 2.8073394495412844, |
| "grad_norm": 0.6183028817176819, |
| "learning_rate": 9.489795918367347e-05, |
| "loss": 1.0934, |
| "step": 154 |
| }, |
| { |
| "epoch": 2.8256880733944953, |
| "grad_norm": 0.5435863733291626, |
| "learning_rate": 9.336734693877551e-05, |
| "loss": 1.254, |
| "step": 155 |
| }, |
| { |
| "epoch": 2.8440366972477067, |
| "grad_norm": 0.6071304678916931, |
| "learning_rate": 9.183673469387755e-05, |
| "loss": 1.0866, |
| "step": 156 |
| }, |
| { |
| "epoch": 2.8623853211009176, |
| "grad_norm": 0.6845288276672363, |
| "learning_rate": 9.030612244897958e-05, |
| "loss": 1.0657, |
| "step": 157 |
| }, |
| { |
| "epoch": 2.8807339449541285, |
| "grad_norm": 0.7707186341285706, |
| "learning_rate": 8.877551020408162e-05, |
| "loss": 1.1873, |
| "step": 158 |
| }, |
| { |
| "epoch": 2.8990825688073394, |
| "grad_norm": 0.6403721570968628, |
| "learning_rate": 8.724489795918366e-05, |
| "loss": 1.2217, |
| "step": 159 |
| }, |
| { |
| "epoch": 2.9174311926605503, |
| "grad_norm": 0.6184455752372742, |
| "learning_rate": 8.57142857142857e-05, |
| "loss": 1.2699, |
| "step": 160 |
| }, |
| { |
| "epoch": 2.9357798165137616, |
| "grad_norm": 0.6127010583877563, |
| "learning_rate": 8.418367346938774e-05, |
| "loss": 1.1339, |
| "step": 161 |
| }, |
| { |
| "epoch": 2.9541284403669725, |
| "grad_norm": 0.5781142711639404, |
| "learning_rate": 8.265306122448978e-05, |
| "loss": 1.0983, |
| "step": 162 |
| }, |
| { |
| "epoch": 2.9724770642201834, |
| "grad_norm": 0.6162577271461487, |
| "learning_rate": 8.112244897959184e-05, |
| "loss": 1.225, |
| "step": 163 |
| }, |
| { |
| "epoch": 2.9908256880733948, |
| "grad_norm": 0.5873180031776428, |
| "learning_rate": 7.959183673469388e-05, |
| "loss": 1.1027, |
| "step": 164 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 0.9156239032745361, |
| "learning_rate": 7.806122448979592e-05, |
| "loss": 1.2634, |
| "step": 165 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 1.765793800354004, |
| "eval_runtime": 40.2043, |
| "eval_samples_per_second": 8.258, |
| "eval_steps_per_second": 4.129, |
| "step": 165 |
| }, |
| { |
| "epoch": 3.018348623853211, |
| "grad_norm": 2.7086777687072754, |
| "learning_rate": 7.653061224489796e-05, |
| "loss": 0.9086, |
| "step": 166 |
| }, |
| { |
| "epoch": 3.036697247706422, |
| "grad_norm": 0.798815131187439, |
| "learning_rate": 7.5e-05, |
| "loss": 0.9862, |
| "step": 167 |
| }, |
| { |
| "epoch": 3.055045871559633, |
| "grad_norm": 0.6483126878738403, |
| "learning_rate": 7.346938775510203e-05, |
| "loss": 1.0914, |
| "step": 168 |
| }, |
| { |
| "epoch": 3.073394495412844, |
| "grad_norm": 0.9064918160438538, |
| "learning_rate": 7.193877551020407e-05, |
| "loss": 0.9983, |
| "step": 169 |
| }, |
| { |
| "epoch": 3.091743119266055, |
| "grad_norm": 0.5859102010726929, |
| "learning_rate": 7.040816326530612e-05, |
| "loss": 1.2501, |
| "step": 170 |
| }, |
| { |
| "epoch": 3.1100917431192663, |
| "grad_norm": 1.0210392475128174, |
| "learning_rate": 6.887755102040816e-05, |
| "loss": 0.9503, |
| "step": 171 |
| }, |
| { |
| "epoch": 3.128440366972477, |
| "grad_norm": 0.5909506678581238, |
| "learning_rate": 6.73469387755102e-05, |
| "loss": 1.1646, |
| "step": 172 |
| }, |
| { |
| "epoch": 3.146788990825688, |
| "grad_norm": 0.8601213693618774, |
| "learning_rate": 6.581632653061224e-05, |
| "loss": 1.089, |
| "step": 173 |
| }, |
| { |
| "epoch": 3.165137614678899, |
| "grad_norm": 1.5392725467681885, |
| "learning_rate": 6.428571428571427e-05, |
| "loss": 1.1588, |
| "step": 174 |
| }, |
| { |
| "epoch": 3.18348623853211, |
| "grad_norm": 0.6926484107971191, |
| "learning_rate": 6.275510204081633e-05, |
| "loss": 1.0608, |
| "step": 175 |
| }, |
| { |
| "epoch": 3.2018348623853212, |
| "grad_norm": 0.6359078288078308, |
| "learning_rate": 6.122448979591836e-05, |
| "loss": 1.2014, |
| "step": 176 |
| }, |
| { |
| "epoch": 3.220183486238532, |
| "grad_norm": 0.7411192655563354, |
| "learning_rate": 5.96938775510204e-05, |
| "loss": 1.0524, |
| "step": 177 |
| }, |
| { |
| "epoch": 3.238532110091743, |
| "grad_norm": 0.7913389801979065, |
| "learning_rate": 5.816326530612244e-05, |
| "loss": 0.9702, |
| "step": 178 |
| }, |
| { |
| "epoch": 3.2568807339449544, |
| "grad_norm": 0.5389623045921326, |
| "learning_rate": 5.663265306122448e-05, |
| "loss": 1.0675, |
| "step": 179 |
| }, |
| { |
| "epoch": 3.2752293577981653, |
| "grad_norm": 0.4771580398082733, |
| "learning_rate": 5.510204081632653e-05, |
| "loss": 1.0254, |
| "step": 180 |
| }, |
| { |
| "epoch": 3.293577981651376, |
| "grad_norm": 0.7268071174621582, |
| "learning_rate": 5.357142857142857e-05, |
| "loss": 1.0475, |
| "step": 181 |
| }, |
| { |
| "epoch": 3.311926605504587, |
| "grad_norm": 0.6685888767242432, |
| "learning_rate": 5.204081632653061e-05, |
| "loss": 0.9658, |
| "step": 182 |
| }, |
| { |
| "epoch": 3.330275229357798, |
| "grad_norm": 0.5965012907981873, |
| "learning_rate": 5.0510204081632645e-05, |
| "loss": 1.0657, |
| "step": 183 |
| }, |
| { |
| "epoch": 3.3486238532110093, |
| "grad_norm": 0.5370911955833435, |
| "learning_rate": 4.897959183673468e-05, |
| "loss": 1.0277, |
| "step": 184 |
| }, |
| { |
| "epoch": 3.36697247706422, |
| "grad_norm": 0.646511971950531, |
| "learning_rate": 4.7448979591836735e-05, |
| "loss": 1.1439, |
| "step": 185 |
| }, |
| { |
| "epoch": 3.385321100917431, |
| "grad_norm": 0.5768170356750488, |
| "learning_rate": 4.591836734693877e-05, |
| "loss": 1.253, |
| "step": 186 |
| }, |
| { |
| "epoch": 3.4036697247706424, |
| "grad_norm": 0.6067250370979309, |
| "learning_rate": 4.438775510204081e-05, |
| "loss": 0.9355, |
| "step": 187 |
| }, |
| { |
| "epoch": 3.4220183486238533, |
| "grad_norm": 0.8360852003097534, |
| "learning_rate": 4.285714285714285e-05, |
| "loss": 1.0526, |
| "step": 188 |
| }, |
| { |
| "epoch": 3.4403669724770642, |
| "grad_norm": 0.59422767162323, |
| "learning_rate": 4.132653061224489e-05, |
| "loss": 1.1794, |
| "step": 189 |
| }, |
| { |
| "epoch": 3.458715596330275, |
| "grad_norm": 0.5541112422943115, |
| "learning_rate": 3.979591836734694e-05, |
| "loss": 0.9477, |
| "step": 190 |
| }, |
| { |
| "epoch": 3.477064220183486, |
| "grad_norm": 0.587552547454834, |
| "learning_rate": 3.826530612244898e-05, |
| "loss": 1.0714, |
| "step": 191 |
| }, |
| { |
| "epoch": 3.4954128440366974, |
| "grad_norm": 0.5143390893936157, |
| "learning_rate": 3.6734693877551016e-05, |
| "loss": 1.0379, |
| "step": 192 |
| }, |
| { |
| "epoch": 3.5137614678899083, |
| "grad_norm": 0.6476455330848694, |
| "learning_rate": 3.520408163265306e-05, |
| "loss": 0.9505, |
| "step": 193 |
| }, |
| { |
| "epoch": 3.532110091743119, |
| "grad_norm": 0.5314425230026245, |
| "learning_rate": 3.36734693877551e-05, |
| "loss": 1.0504, |
| "step": 194 |
| }, |
| { |
| "epoch": 3.5504587155963305, |
| "grad_norm": 0.6685779094696045, |
| "learning_rate": 3.214285714285714e-05, |
| "loss": 1.083, |
| "step": 195 |
| }, |
| { |
| "epoch": 3.5688073394495414, |
| "grad_norm": 0.5587136149406433, |
| "learning_rate": 3.061224489795918e-05, |
| "loss": 0.9242, |
| "step": 196 |
| }, |
| { |
| "epoch": 3.5871559633027523, |
| "grad_norm": 0.5730684995651245, |
| "learning_rate": 2.908163265306122e-05, |
| "loss": 1.0271, |
| "step": 197 |
| }, |
| { |
| "epoch": 3.6055045871559632, |
| "grad_norm": 0.5254084467887878, |
| "learning_rate": 2.7551020408163265e-05, |
| "loss": 1.0828, |
| "step": 198 |
| }, |
| { |
| "epoch": 3.623853211009174, |
| "grad_norm": 0.5611628890037537, |
| "learning_rate": 2.6020408163265303e-05, |
| "loss": 1.1933, |
| "step": 199 |
| }, |
| { |
| "epoch": 3.6422018348623855, |
| "grad_norm": 0.541988730430603, |
| "learning_rate": 2.448979591836734e-05, |
| "loss": 0.9135, |
| "step": 200 |
| }, |
| { |
| "epoch": 3.6605504587155964, |
| "grad_norm": 0.5615597367286682, |
| "learning_rate": 2.2959183673469387e-05, |
| "loss": 0.9679, |
| "step": 201 |
| }, |
| { |
| "epoch": 3.6788990825688073, |
| "grad_norm": 0.664253830909729, |
| "learning_rate": 2.1428571428571425e-05, |
| "loss": 0.9984, |
| "step": 202 |
| }, |
| { |
| "epoch": 3.6972477064220186, |
| "grad_norm": 0.5762522220611572, |
| "learning_rate": 1.989795918367347e-05, |
| "loss": 1.0341, |
| "step": 203 |
| }, |
| { |
| "epoch": 3.7155963302752295, |
| "grad_norm": 0.544408917427063, |
| "learning_rate": 1.8367346938775508e-05, |
| "loss": 1.2647, |
| "step": 204 |
| }, |
| { |
| "epoch": 3.7339449541284404, |
| "grad_norm": 0.5570142865180969, |
| "learning_rate": 1.683673469387755e-05, |
| "loss": 0.8697, |
| "step": 205 |
| }, |
| { |
| "epoch": 3.7522935779816513, |
| "grad_norm": 0.5823831558227539, |
| "learning_rate": 1.530612244897959e-05, |
| "loss": 1.0817, |
| "step": 206 |
| }, |
| { |
| "epoch": 3.770642201834862, |
| "grad_norm": 0.672044038772583, |
| "learning_rate": 1.3775510204081633e-05, |
| "loss": 1.029, |
| "step": 207 |
| }, |
| { |
| "epoch": 3.7889908256880735, |
| "grad_norm": 0.6006896495819092, |
| "learning_rate": 1.224489795918367e-05, |
| "loss": 1.1158, |
| "step": 208 |
| }, |
| { |
| "epoch": 3.8073394495412844, |
| "grad_norm": 0.546748697757721, |
| "learning_rate": 1.0714285714285712e-05, |
| "loss": 0.8507, |
| "step": 209 |
| }, |
| { |
| "epoch": 3.8256880733944953, |
| "grad_norm": 0.6468778252601624, |
| "learning_rate": 9.183673469387754e-06, |
| "loss": 1.0383, |
| "step": 210 |
| }, |
| { |
| "epoch": 3.8440366972477067, |
| "grad_norm": 0.510528564453125, |
| "learning_rate": 7.653061224489796e-06, |
| "loss": 0.8932, |
| "step": 211 |
| }, |
| { |
| "epoch": 3.8623853211009176, |
| "grad_norm": 0.5235452055931091, |
| "learning_rate": 6.122448979591835e-06, |
| "loss": 0.9912, |
| "step": 212 |
| }, |
| { |
| "epoch": 3.8807339449541285, |
| "grad_norm": 0.589995801448822, |
| "learning_rate": 4.591836734693877e-06, |
| "loss": 0.9821, |
| "step": 213 |
| }, |
| { |
| "epoch": 3.8990825688073394, |
| "grad_norm": 0.5095317959785461, |
| "learning_rate": 3.0612244897959177e-06, |
| "loss": 0.9985, |
| "step": 214 |
| }, |
| { |
| "epoch": 3.9174311926605503, |
| "grad_norm": 0.6477232575416565, |
| "learning_rate": 1.5306122448979589e-06, |
| "loss": 0.9372, |
| "step": 215 |
| }, |
| { |
| "epoch": 3.9357798165137616, |
| "grad_norm": 0.5697444677352905, |
| "learning_rate": 0.0, |
| "loss": 1.0124, |
| "step": 216 |
| }, |
| { |
| "epoch": 3.9357798165137616, |
| "eval_loss": 1.7458010911941528, |
| "eval_runtime": 39.9333, |
| "eval_samples_per_second": 8.314, |
| "eval_steps_per_second": 4.157, |
| "step": 216 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 216, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.809327135010898e+17, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|