{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 178, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011235955056179775, "grad_norm": 10.042351722717285, "learning_rate": 4e-05, "loss": 9.033, "step": 1 }, { "epoch": 0.02247191011235955, "grad_norm": 10.187106132507324, "learning_rate": 8e-05, "loss": 9.1846, "step": 2 }, { "epoch": 0.033707865168539325, "grad_norm": 8.38694953918457, "learning_rate": 0.00012, "loss": 7.9832, "step": 3 }, { "epoch": 0.0449438202247191, "grad_norm": 5.938446521759033, "learning_rate": 0.00016, "loss": 5.6493, "step": 4 }, { "epoch": 0.056179775280898875, "grad_norm": 4.586862087249756, "learning_rate": 0.0002, "loss": 4.0131, "step": 5 }, { "epoch": 0.06741573033707865, "grad_norm": 4.84490442276001, "learning_rate": 0.00019954545454545455, "loss": 2.9931, "step": 6 }, { "epoch": 0.07865168539325842, "grad_norm": 4.460867881774902, "learning_rate": 0.0001990909090909091, "loss": 2.261, "step": 7 }, { "epoch": 0.0898876404494382, "grad_norm": 3.067777395248413, "learning_rate": 0.00019863636363636364, "loss": 1.8861, "step": 8 }, { "epoch": 0.10112359550561797, "grad_norm": 1.5425913333892822, "learning_rate": 0.00019818181818181821, "loss": 1.4675, "step": 9 }, { "epoch": 0.11235955056179775, "grad_norm": 1.346417784690857, "learning_rate": 0.00019772727272727273, "loss": 1.3642, "step": 10 }, { "epoch": 0.12359550561797752, "grad_norm": 1.1825133562088013, "learning_rate": 0.00019727272727272728, "loss": 1.0109, "step": 11 }, { "epoch": 0.1348314606741573, "grad_norm": 0.9976914525032043, "learning_rate": 0.00019681818181818185, "loss": 0.9873, "step": 12 }, { "epoch": 0.14606741573033707, "grad_norm": 0.6481193900108337, "learning_rate": 0.00019636363636363636, "loss": 0.6746, "step": 13 }, { "epoch": 0.15730337078651685, "grad_norm": 0.5585954189300537, "learning_rate": 0.0001959090909090909, "loss": 0.5289, "step": 14 }, { "epoch": 0.16853932584269662, "grad_norm": 0.9624674916267395, "learning_rate": 0.00019545454545454548, "loss": 0.5552, "step": 15 }, { "epoch": 0.1797752808988764, "grad_norm": 0.7732966542243958, "learning_rate": 0.000195, "loss": 0.5047, "step": 16 }, { "epoch": 0.19101123595505617, "grad_norm": 0.7540618777275085, "learning_rate": 0.00019454545454545457, "loss": 0.5803, "step": 17 }, { "epoch": 0.20224719101123595, "grad_norm": 0.9345465302467346, "learning_rate": 0.0001940909090909091, "loss": 0.611, "step": 18 }, { "epoch": 0.21348314606741572, "grad_norm": 0.5613678693771362, "learning_rate": 0.00019363636363636363, "loss": 0.4007, "step": 19 }, { "epoch": 0.2247191011235955, "grad_norm": 0.8053308725357056, "learning_rate": 0.0001931818181818182, "loss": 0.5202, "step": 20 }, { "epoch": 0.23595505617977527, "grad_norm": 0.6621015667915344, "learning_rate": 0.00019272727272727274, "loss": 0.4851, "step": 21 }, { "epoch": 0.24719101123595505, "grad_norm": 0.797894299030304, "learning_rate": 0.00019227272727272726, "loss": 0.5401, "step": 22 }, { "epoch": 0.25842696629213485, "grad_norm": 0.5635455250740051, "learning_rate": 0.00019181818181818183, "loss": 0.4295, "step": 23 }, { "epoch": 0.2696629213483146, "grad_norm": 0.5012619495391846, "learning_rate": 0.00019136363636363638, "loss": 0.3133, "step": 24 }, { "epoch": 0.2808988764044944, "grad_norm": 0.5143508911132812, "learning_rate": 0.00019090909090909092, "loss": 0.3207, "step": 25 }, { "epoch": 0.29213483146067415, "grad_norm": NaN, "learning_rate": 0.00019090909090909092, "loss": 0.622, "step": 26 }, { "epoch": 0.30337078651685395, "grad_norm": NaN, "learning_rate": 0.00019090909090909092, "loss": 0.5544, "step": 27 }, { "epoch": 0.3146067415730337, "grad_norm": NaN, "learning_rate": 0.00019090909090909092, "loss": 0.4251, "step": 28 }, { "epoch": 0.3258426966292135, "grad_norm": NaN, "learning_rate": 0.00019090909090909092, "loss": 0.626, "step": 29 }, { "epoch": 0.33707865168539325, "grad_norm": 551.2264404296875, "learning_rate": 0.00019045454545454547, "loss": 0.5757, "step": 30 }, { "epoch": 0.34831460674157305, "grad_norm": 0.5639562010765076, "learning_rate": 0.00019, "loss": 0.3197, "step": 31 }, { "epoch": 0.3595505617977528, "grad_norm": 0.5104588270187378, "learning_rate": 0.00018954545454545455, "loss": 0.2792, "step": 32 }, { "epoch": 0.3707865168539326, "grad_norm": 0.6046687960624695, "learning_rate": 0.0001890909090909091, "loss": 0.4017, "step": 33 }, { "epoch": 0.38202247191011235, "grad_norm": 0.5290363430976868, "learning_rate": 0.00018863636363636364, "loss": 0.2522, "step": 34 }, { "epoch": 0.39325842696629215, "grad_norm": 0.4888989329338074, "learning_rate": 0.0001881818181818182, "loss": 0.2513, "step": 35 }, { "epoch": 0.4044943820224719, "grad_norm": 0.4538341164588928, "learning_rate": 0.00018772727272727273, "loss": 0.2394, "step": 36 }, { "epoch": 0.4157303370786517, "grad_norm": 0.5249782800674438, "learning_rate": 0.00018727272727272728, "loss": 0.3383, "step": 37 }, { "epoch": 0.42696629213483145, "grad_norm": 0.544735312461853, "learning_rate": 0.00018681818181818182, "loss": 0.3345, "step": 38 }, { "epoch": 0.43820224719101125, "grad_norm": 0.3868374228477478, "learning_rate": 0.00018636363636363636, "loss": 0.2425, "step": 39 }, { "epoch": 0.449438202247191, "grad_norm": 0.41138726472854614, "learning_rate": 0.0001859090909090909, "loss": 0.2625, "step": 40 }, { "epoch": 0.4606741573033708, "grad_norm": 0.684403657913208, "learning_rate": 0.00018545454545454545, "loss": 0.3363, "step": 41 }, { "epoch": 0.47191011235955055, "grad_norm": 0.4800611138343811, "learning_rate": 0.00018500000000000002, "loss": 0.2339, "step": 42 }, { "epoch": 0.48314606741573035, "grad_norm": 0.47672778367996216, "learning_rate": 0.00018454545454545454, "loss": 0.2908, "step": 43 }, { "epoch": 0.4943820224719101, "grad_norm": 0.44971343874931335, "learning_rate": 0.00018409090909090909, "loss": 0.2553, "step": 44 }, { "epoch": 0.5056179775280899, "grad_norm": 0.45129504799842834, "learning_rate": 0.00018363636363636366, "loss": 0.325, "step": 45 }, { "epoch": 0.5168539325842697, "grad_norm": 0.48597297072410583, "learning_rate": 0.00018318181818181817, "loss": 0.2843, "step": 46 }, { "epoch": 0.5280898876404494, "grad_norm": 0.3514116704463959, "learning_rate": 0.00018272727272727275, "loss": 0.2128, "step": 47 }, { "epoch": 0.5393258426966292, "grad_norm": 0.46826595067977905, "learning_rate": 0.0001822727272727273, "loss": 0.2601, "step": 48 }, { "epoch": 0.550561797752809, "grad_norm": 0.3118669390678406, "learning_rate": 0.00018181818181818183, "loss": 0.2192, "step": 49 }, { "epoch": 0.5617977528089888, "grad_norm": 0.37151020765304565, "learning_rate": 0.00018136363636363638, "loss": 0.2255, "step": 50 }, { "epoch": 0.5730337078651685, "grad_norm": 0.4220457971096039, "learning_rate": 0.00018090909090909092, "loss": 0.2463, "step": 51 }, { "epoch": 0.5842696629213483, "grad_norm": 0.298927366733551, "learning_rate": 0.00018045454545454547, "loss": 0.1939, "step": 52 }, { "epoch": 0.5955056179775281, "grad_norm": 0.3494245707988739, "learning_rate": 0.00018, "loss": 0.2276, "step": 53 }, { "epoch": 0.6067415730337079, "grad_norm": 0.7135240435600281, "learning_rate": 0.00017954545454545456, "loss": 0.316, "step": 54 }, { "epoch": 0.6179775280898876, "grad_norm": 0.47558313608169556, "learning_rate": 0.0001790909090909091, "loss": 0.1915, "step": 55 }, { "epoch": 0.6292134831460674, "grad_norm": 0.47424033284187317, "learning_rate": 0.00017863636363636364, "loss": 0.268, "step": 56 }, { "epoch": 0.6404494382022472, "grad_norm": 0.3898255228996277, "learning_rate": 0.0001781818181818182, "loss": 0.2346, "step": 57 }, { "epoch": 0.651685393258427, "grad_norm": 0.3044908940792084, "learning_rate": 0.00017772727272727273, "loss": 0.2244, "step": 58 }, { "epoch": 0.6629213483146067, "grad_norm": 0.4012211561203003, "learning_rate": 0.00017727272727272728, "loss": 0.2548, "step": 59 }, { "epoch": 0.6741573033707865, "grad_norm": 0.36978745460510254, "learning_rate": 0.00017681818181818182, "loss": 0.1945, "step": 60 }, { "epoch": 0.6853932584269663, "grad_norm": 0.3404884338378906, "learning_rate": 0.00017636363636363637, "loss": 0.2486, "step": 61 }, { "epoch": 0.6966292134831461, "grad_norm": 0.3647669851779938, "learning_rate": 0.0001759090909090909, "loss": 0.2267, "step": 62 }, { "epoch": 0.7078651685393258, "grad_norm": 0.2504497170448303, "learning_rate": 0.00017545454545454548, "loss": 0.1854, "step": 63 }, { "epoch": 0.7191011235955056, "grad_norm": 0.3424750566482544, "learning_rate": 0.000175, "loss": 0.2586, "step": 64 }, { "epoch": 0.7303370786516854, "grad_norm": 0.3164011836051941, "learning_rate": 0.00017454545454545454, "loss": 0.217, "step": 65 }, { "epoch": 0.7415730337078652, "grad_norm": 0.32831573486328125, "learning_rate": 0.00017409090909090911, "loss": 0.2326, "step": 66 }, { "epoch": 0.7528089887640449, "grad_norm": 0.3514035940170288, "learning_rate": 0.00017363636363636363, "loss": 0.2591, "step": 67 }, { "epoch": 0.7640449438202247, "grad_norm": 0.30864688754081726, "learning_rate": 0.0001731818181818182, "loss": 0.2073, "step": 68 }, { "epoch": 0.7752808988764045, "grad_norm": 0.32905542850494385, "learning_rate": 0.00017272727272727275, "loss": 0.2224, "step": 69 }, { "epoch": 0.7865168539325843, "grad_norm": 0.27842977643013, "learning_rate": 0.00017227272727272726, "loss": 0.1893, "step": 70 }, { "epoch": 0.797752808988764, "grad_norm": 0.3048170804977417, "learning_rate": 0.00017181818181818184, "loss": 0.2206, "step": 71 }, { "epoch": 0.8089887640449438, "grad_norm": 0.25344306230545044, "learning_rate": 0.00017136363636363638, "loss": 0.1876, "step": 72 }, { "epoch": 0.8202247191011236, "grad_norm": 0.3092687427997589, "learning_rate": 0.0001709090909090909, "loss": 0.1791, "step": 73 }, { "epoch": 0.8314606741573034, "grad_norm": 0.30857375264167786, "learning_rate": 0.00017045454545454547, "loss": 0.2099, "step": 74 }, { "epoch": 0.8426966292134831, "grad_norm": 0.20455722510814667, "learning_rate": 0.00017, "loss": 0.1705, "step": 75 }, { "epoch": 0.8539325842696629, "grad_norm": 0.273429274559021, "learning_rate": 0.00016954545454545456, "loss": 0.162, "step": 76 }, { "epoch": 0.8651685393258427, "grad_norm": 0.25422728061676025, "learning_rate": 0.0001690909090909091, "loss": 0.1804, "step": 77 }, { "epoch": 0.8764044943820225, "grad_norm": 0.2745291292667389, "learning_rate": 0.00016863636363636364, "loss": 0.2184, "step": 78 }, { "epoch": 0.8876404494382022, "grad_norm": 0.30162861943244934, "learning_rate": 0.0001681818181818182, "loss": 0.2371, "step": 79 }, { "epoch": 0.898876404494382, "grad_norm": 0.29733604192733765, "learning_rate": 0.00016772727272727273, "loss": 0.2018, "step": 80 }, { "epoch": 0.9101123595505618, "grad_norm": 0.22824084758758545, "learning_rate": 0.00016727272727272728, "loss": 0.1819, "step": 81 }, { "epoch": 0.9213483146067416, "grad_norm": 0.2034444659948349, "learning_rate": 0.00016681818181818182, "loss": 0.1715, "step": 82 }, { "epoch": 0.9325842696629213, "grad_norm": 0.2859365940093994, "learning_rate": 0.00016636363636363637, "loss": 0.2157, "step": 83 }, { "epoch": 0.9438202247191011, "grad_norm": 0.2006848305463791, "learning_rate": 0.00016590909090909094, "loss": 0.1595, "step": 84 }, { "epoch": 0.9550561797752809, "grad_norm": 0.3615463972091675, "learning_rate": 0.00016545454545454545, "loss": 0.2056, "step": 85 }, { "epoch": 0.9662921348314607, "grad_norm": 0.24181589484214783, "learning_rate": 0.000165, "loss": 0.2051, "step": 86 }, { "epoch": 0.9775280898876404, "grad_norm": 0.1747487634420395, "learning_rate": 0.00016454545454545457, "loss": 0.155, "step": 87 }, { "epoch": 0.9887640449438202, "grad_norm": 0.240757554769516, "learning_rate": 0.0001640909090909091, "loss": 0.1786, "step": 88 }, { "epoch": 1.0, "grad_norm": 0.27025192975997925, "learning_rate": 0.00016363636363636366, "loss": 0.1894, "step": 89 }, { "epoch": 1.0112359550561798, "grad_norm": 0.2341006100177765, "learning_rate": 0.0001631818181818182, "loss": 0.2067, "step": 90 }, { "epoch": 1.0224719101123596, "grad_norm": 0.2805992364883423, "learning_rate": 0.00016272727272727272, "loss": 0.223, "step": 91 }, { "epoch": 1.0337078651685394, "grad_norm": 0.21516099572181702, "learning_rate": 0.0001622727272727273, "loss": 0.1891, "step": 92 }, { "epoch": 1.0449438202247192, "grad_norm": 0.2255050241947174, "learning_rate": 0.00016181818181818184, "loss": 0.195, "step": 93 }, { "epoch": 1.0561797752808988, "grad_norm": 0.278639554977417, "learning_rate": 0.00016136363636363635, "loss": 0.2074, "step": 94 }, { "epoch": 1.0674157303370786, "grad_norm": 0.2584143877029419, "learning_rate": 0.00016090909090909092, "loss": 0.2121, "step": 95 }, { "epoch": 1.0786516853932584, "grad_norm": 0.2629978358745575, "learning_rate": 0.00016045454545454547, "loss": 0.2038, "step": 96 }, { "epoch": 1.0898876404494382, "grad_norm": 0.2283640205860138, "learning_rate": 0.00016, "loss": 0.1768, "step": 97 }, { "epoch": 1.101123595505618, "grad_norm": 0.2216864973306656, "learning_rate": 0.00015954545454545456, "loss": 0.1989, "step": 98 }, { "epoch": 1.1123595505617978, "grad_norm": 0.18183070421218872, "learning_rate": 0.0001590909090909091, "loss": 0.1631, "step": 99 }, { "epoch": 1.1235955056179776, "grad_norm": 0.19485391676425934, "learning_rate": 0.00015863636363636365, "loss": 0.1748, "step": 100 }, { "epoch": 1.1348314606741572, "grad_norm": 0.2579084038734436, "learning_rate": 0.0001581818181818182, "loss": 0.2012, "step": 101 }, { "epoch": 1.146067415730337, "grad_norm": 0.1952996402978897, "learning_rate": 0.00015772727272727273, "loss": 0.1634, "step": 102 }, { "epoch": 1.1573033707865168, "grad_norm": 0.20118536055088043, "learning_rate": 0.00015727272727272728, "loss": 0.1839, "step": 103 }, { "epoch": 1.1685393258426966, "grad_norm": 0.23753251135349274, "learning_rate": 0.00015681818181818182, "loss": 0.1861, "step": 104 }, { "epoch": 1.1797752808988764, "grad_norm": 0.26496222615242004, "learning_rate": 0.00015636363636363637, "loss": 0.226, "step": 105 }, { "epoch": 1.1910112359550562, "grad_norm": 0.17998002469539642, "learning_rate": 0.0001559090909090909, "loss": 0.1595, "step": 106 }, { "epoch": 1.202247191011236, "grad_norm": 0.25317177176475525, "learning_rate": 0.00015545454545454546, "loss": 0.1825, "step": 107 }, { "epoch": 1.2134831460674158, "grad_norm": 0.20935533940792084, "learning_rate": 0.000155, "loss": 0.1747, "step": 108 }, { "epoch": 1.2247191011235956, "grad_norm": 0.23691172897815704, "learning_rate": 0.00015454545454545454, "loss": 0.2247, "step": 109 }, { "epoch": 1.2359550561797752, "grad_norm": 0.22647719085216522, "learning_rate": 0.00015409090909090912, "loss": 0.1829, "step": 110 }, { "epoch": 1.247191011235955, "grad_norm": 0.23936079442501068, "learning_rate": 0.00015363636363636363, "loss": 0.1938, "step": 111 }, { "epoch": 1.2584269662921348, "grad_norm": 0.19330653548240662, "learning_rate": 0.00015318181818181818, "loss": 0.1603, "step": 112 }, { "epoch": 1.2696629213483146, "grad_norm": 0.19124047458171844, "learning_rate": 0.00015272727272727275, "loss": 0.184, "step": 113 }, { "epoch": 1.2808988764044944, "grad_norm": 0.1912234127521515, "learning_rate": 0.00015227272727272727, "loss": 0.1659, "step": 114 }, { "epoch": 1.2921348314606742, "grad_norm": 0.1903829127550125, "learning_rate": 0.0001518181818181818, "loss": 0.1755, "step": 115 }, { "epoch": 1.303370786516854, "grad_norm": 0.1836494654417038, "learning_rate": 0.00015136363636363638, "loss": 0.1575, "step": 116 }, { "epoch": 1.3146067415730336, "grad_norm": 0.21919198334217072, "learning_rate": 0.0001509090909090909, "loss": 0.1978, "step": 117 }, { "epoch": 1.3258426966292136, "grad_norm": 0.22532597184181213, "learning_rate": 0.00015045454545454547, "loss": 0.1604, "step": 118 }, { "epoch": 1.3370786516853932, "grad_norm": 0.22674380242824554, "learning_rate": 0.00015000000000000001, "loss": 0.183, "step": 119 }, { "epoch": 1.348314606741573, "grad_norm": 0.23995709419250488, "learning_rate": 0.00014954545454545453, "loss": 0.1777, "step": 120 }, { "epoch": 1.3595505617977528, "grad_norm": 0.15110474824905396, "learning_rate": 0.0001490909090909091, "loss": 0.154, "step": 121 }, { "epoch": 1.3707865168539326, "grad_norm": 0.1769479662179947, "learning_rate": 0.00014863636363636365, "loss": 0.1824, "step": 122 }, { "epoch": 1.3820224719101124, "grad_norm": 0.2355855405330658, "learning_rate": 0.0001481818181818182, "loss": 0.187, "step": 123 }, { "epoch": 1.3932584269662922, "grad_norm": 0.21928639709949493, "learning_rate": 0.00014772727272727274, "loss": 0.2154, "step": 124 }, { "epoch": 1.404494382022472, "grad_norm": 0.23884353041648865, "learning_rate": 0.00014727272727272728, "loss": 0.2144, "step": 125 }, { "epoch": 1.4157303370786516, "grad_norm": 0.19905094802379608, "learning_rate": 0.00014681818181818182, "loss": 0.1758, "step": 126 }, { "epoch": 1.4269662921348314, "grad_norm": 0.20008406043052673, "learning_rate": 0.00014636363636363637, "loss": 0.1988, "step": 127 }, { "epoch": 1.4382022471910112, "grad_norm": 0.14788155257701874, "learning_rate": 0.0001459090909090909, "loss": 0.1591, "step": 128 }, { "epoch": 1.449438202247191, "grad_norm": 0.17816966772079468, "learning_rate": 0.00014545454545454546, "loss": 0.1607, "step": 129 }, { "epoch": 1.4606741573033708, "grad_norm": 0.2414962351322174, "learning_rate": 0.000145, "loss": 0.1988, "step": 130 }, { "epoch": 1.4719101123595506, "grad_norm": 0.21710173785686493, "learning_rate": 0.00014454545454545457, "loss": 0.1749, "step": 131 }, { "epoch": 1.4831460674157304, "grad_norm": 0.30704858899116516, "learning_rate": 0.0001440909090909091, "loss": 0.2106, "step": 132 }, { "epoch": 1.49438202247191, "grad_norm": 0.22256992757320404, "learning_rate": 0.00014363636363636363, "loss": 0.1828, "step": 133 }, { "epoch": 1.50561797752809, "grad_norm": 0.2698194980621338, "learning_rate": 0.0001431818181818182, "loss": 0.2418, "step": 134 }, { "epoch": 1.5168539325842696, "grad_norm": 0.22738061845302582, "learning_rate": 0.00014272727272727272, "loss": 0.2023, "step": 135 }, { "epoch": 1.5280898876404494, "grad_norm": 0.2749398946762085, "learning_rate": 0.00014227272727272727, "loss": 0.2201, "step": 136 }, { "epoch": 1.5393258426966292, "grad_norm": 0.20003195106983185, "learning_rate": 0.00014181818181818184, "loss": 0.1658, "step": 137 }, { "epoch": 1.550561797752809, "grad_norm": 0.1827768236398697, "learning_rate": 0.00014136363636363635, "loss": 0.1787, "step": 138 }, { "epoch": 1.5617977528089888, "grad_norm": 0.20226892828941345, "learning_rate": 0.00014090909090909093, "loss": 0.1812, "step": 139 }, { "epoch": 1.5730337078651684, "grad_norm": 0.18449921905994415, "learning_rate": 0.00014045454545454547, "loss": 0.183, "step": 140 }, { "epoch": 1.5842696629213484, "grad_norm": 0.22666248679161072, "learning_rate": 0.00014, "loss": 0.2163, "step": 141 }, { "epoch": 1.595505617977528, "grad_norm": 0.2258187234401703, "learning_rate": 0.00013954545454545456, "loss": 0.2024, "step": 142 }, { "epoch": 1.606741573033708, "grad_norm": 0.17761889100074768, "learning_rate": 0.0001390909090909091, "loss": 0.1822, "step": 143 }, { "epoch": 1.6179775280898876, "grad_norm": 0.1967982053756714, "learning_rate": 0.00013863636363636365, "loss": 0.1768, "step": 144 }, { "epoch": 1.6292134831460674, "grad_norm": 0.21614395081996918, "learning_rate": 0.0001381818181818182, "loss": 0.1941, "step": 145 }, { "epoch": 1.6404494382022472, "grad_norm": 0.21362508833408356, "learning_rate": 0.00013772727272727274, "loss": 0.2053, "step": 146 }, { "epoch": 1.651685393258427, "grad_norm": 0.1829160451889038, "learning_rate": 0.00013727272727272728, "loss": 0.1601, "step": 147 }, { "epoch": 1.6629213483146068, "grad_norm": 0.19813786447048187, "learning_rate": 0.00013681818181818182, "loss": 0.2027, "step": 148 }, { "epoch": 1.6741573033707864, "grad_norm": 0.19203445315361023, "learning_rate": 0.00013636363636363637, "loss": 0.1638, "step": 149 }, { "epoch": 1.6853932584269664, "grad_norm": 0.1693754941225052, "learning_rate": 0.0001359090909090909, "loss": 0.1695, "step": 150 }, { "epoch": 1.696629213483146, "grad_norm": 0.19884233176708221, "learning_rate": 0.00013545454545454546, "loss": 0.2037, "step": 151 }, { "epoch": 1.7078651685393258, "grad_norm": 0.20025449991226196, "learning_rate": 0.00013500000000000003, "loss": 0.1927, "step": 152 }, { "epoch": 1.7191011235955056, "grad_norm": 0.1659104973077774, "learning_rate": 0.00013454545454545455, "loss": 0.164, "step": 153 }, { "epoch": 1.7303370786516854, "grad_norm": 0.16635450720787048, "learning_rate": 0.0001340909090909091, "loss": 0.1729, "step": 154 }, { "epoch": 1.7415730337078652, "grad_norm": 0.2119787037372589, "learning_rate": 0.00013363636363636366, "loss": 0.1761, "step": 155 }, { "epoch": 1.7528089887640448, "grad_norm": 0.1592678278684616, "learning_rate": 0.00013318181818181818, "loss": 0.1582, "step": 156 }, { "epoch": 1.7640449438202248, "grad_norm": 0.20228628814220428, "learning_rate": 0.00013272727272727275, "loss": 0.1833, "step": 157 }, { "epoch": 1.7752808988764044, "grad_norm": 0.19576573371887207, "learning_rate": 0.0001322727272727273, "loss": 0.1716, "step": 158 }, { "epoch": 1.7865168539325844, "grad_norm": 0.17609822750091553, "learning_rate": 0.0001318181818181818, "loss": 0.1566, "step": 159 }, { "epoch": 1.797752808988764, "grad_norm": 0.19382323324680328, "learning_rate": 0.00013136363636363638, "loss": 0.1637, "step": 160 }, { "epoch": 1.8089887640449438, "grad_norm": 0.26488256454467773, "learning_rate": 0.00013090909090909093, "loss": 0.2064, "step": 161 }, { "epoch": 1.8202247191011236, "grad_norm": 0.23723822832107544, "learning_rate": 0.00013045454545454544, "loss": 0.177, "step": 162 }, { "epoch": 1.8314606741573034, "grad_norm": 0.1890057623386383, "learning_rate": 0.00013000000000000002, "loss": 0.1784, "step": 163 }, { "epoch": 1.8426966292134832, "grad_norm": 0.1911257803440094, "learning_rate": 0.00012954545454545456, "loss": 0.1788, "step": 164 }, { "epoch": 1.8539325842696628, "grad_norm": 0.2343178540468216, "learning_rate": 0.0001290909090909091, "loss": 0.1833, "step": 165 }, { "epoch": 1.8651685393258428, "grad_norm": 0.1506495475769043, "learning_rate": 0.00012863636363636365, "loss": 0.1655, "step": 166 }, { "epoch": 1.8764044943820224, "grad_norm": 0.17160733044147491, "learning_rate": 0.0001281818181818182, "loss": 0.1586, "step": 167 }, { "epoch": 1.8876404494382022, "grad_norm": 0.13511358201503754, "learning_rate": 0.00012772727272727274, "loss": 0.1439, "step": 168 }, { "epoch": 1.898876404494382, "grad_norm": 0.1821577399969101, "learning_rate": 0.00012727272727272728, "loss": 0.1973, "step": 169 }, { "epoch": 1.9101123595505618, "grad_norm": 0.18846295773983002, "learning_rate": 0.00012681818181818183, "loss": 0.1767, "step": 170 }, { "epoch": 1.9213483146067416, "grad_norm": 0.17300470173358917, "learning_rate": 0.00012636363636363637, "loss": 0.1726, "step": 171 }, { "epoch": 1.9325842696629212, "grad_norm": 0.20833244919776917, "learning_rate": 0.00012590909090909091, "loss": 0.2248, "step": 172 }, { "epoch": 1.9438202247191012, "grad_norm": 0.17537686228752136, "learning_rate": 0.00012545454545454546, "loss": 0.147, "step": 173 }, { "epoch": 1.9550561797752808, "grad_norm": 0.17684531211853027, "learning_rate": 0.000125, "loss": 0.1774, "step": 174 }, { "epoch": 1.9662921348314608, "grad_norm": 0.20325636863708496, "learning_rate": 0.00012454545454545455, "loss": 0.1921, "step": 175 }, { "epoch": 1.9775280898876404, "grad_norm": 0.2229178249835968, "learning_rate": 0.0001240909090909091, "loss": 0.2058, "step": 176 }, { "epoch": 1.9887640449438202, "grad_norm": 0.18187855184078217, "learning_rate": 0.00012363636363636364, "loss": 0.1883, "step": 177 }, { "epoch": 2.0, "grad_norm": 0.18570281565189362, "learning_rate": 0.0001231818181818182, "loss": 0.1716, "step": 178 } ], "logging_steps": 1, "max_steps": 445, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.933479100108186e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }