{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 200,
  "global_step": 762,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.003940886699507389, "grad_norm": 1.078125, "learning_rate": 0.0, "loss": 1.7215, "step": 1},
    {"epoch": 0.019704433497536946, "grad_norm": 1.078125, "learning_rate": 1.038961038961039e-05, "loss": 1.7672, "step": 5},
    {"epoch": 0.03940886699507389, "grad_norm": 0.99609375, "learning_rate": 2.3376623376623376e-05, "loss": 1.7749, "step": 10},
    {"epoch": 0.059113300492610835, "grad_norm": 0.8828125, "learning_rate": 3.6363636363636364e-05, "loss": 1.7334, "step": 15},
    {"epoch": 0.07881773399014778, "grad_norm": 0.796875, "learning_rate": 4.9350649350649355e-05, "loss": 1.7012, "step": 20},
    {"epoch": 0.09852216748768473, "grad_norm": 0.59765625, "learning_rate": 6.233766233766233e-05, "loss": 1.6683, "step": 25},
    {"epoch": 0.11822660098522167, "grad_norm": 0.4765625, "learning_rate": 7.532467532467533e-05, "loss": 1.6283, "step": 30},
    {"epoch": 0.13793103448275862, "grad_norm": 0.341796875, "learning_rate": 8.831168831168831e-05, "loss": 1.5815, "step": 35},
    {"epoch": 0.15763546798029557, "grad_norm": 0.2578125, "learning_rate": 0.0001012987012987013, "loss": 1.5755, "step": 40},
    {"epoch": 0.17733990147783252, "grad_norm": 0.2177734375, "learning_rate": 0.00011428571428571428, "loss": 1.5314, "step": 45},
    {"epoch": 0.19704433497536947, "grad_norm": 0.185546875, "learning_rate": 0.00012727272727272728, "loss": 1.5258, "step": 50},
    {"epoch": 0.21674876847290642, "grad_norm": 0.1611328125, "learning_rate": 0.00014025974025974028, "loss": 1.5131, "step": 55},
    {"epoch": 0.23645320197044334, "grad_norm": 0.1484375, "learning_rate": 0.00015324675324675325, "loss": 1.498, "step": 60},
    {"epoch": 0.2561576354679803, "grad_norm": 0.1455078125, "learning_rate": 0.00016623376623376625, "loss": 1.4953, "step": 65},
    {"epoch": 0.27586206896551724, "grad_norm": 0.1318359375, "learning_rate": 0.00017922077922077922, "loss": 1.4889, "step": 70},
    {"epoch": 0.2955665024630542, "grad_norm": 0.1279296875, "learning_rate": 0.00019220779220779222, "loss": 1.4768, "step": 75},
    {"epoch": 0.31527093596059114, "grad_norm": 0.12158203125, "learning_rate": 0.00019999579326114222, "loss": 1.4826, "step": 80},
    {"epoch": 0.33497536945812806, "grad_norm": 0.1259765625, "learning_rate": 0.00019994847151359784, "loss": 1.4514, "step": 85},
    {"epoch": 0.35467980295566504, "grad_norm": 0.115234375, "learning_rate": 0.0001998485945607536, "loss": 1.469, "step": 90},
    {"epoch": 0.37438423645320196, "grad_norm": 0.11865234375, "learning_rate": 0.00019969621492020869, "loss": 1.4381, "step": 95},
    {"epoch": 0.39408866995073893, "grad_norm": 0.12890625, "learning_rate": 0.00019949141271668306, "loss": 1.4513, "step": 100},
    {"epoch": 0.41379310344827586, "grad_norm": 0.107421875, "learning_rate": 0.00019923429563988614, "loss": 1.4403, "step": 105},
    {"epoch": 0.43349753694581283, "grad_norm": 0.1083984375, "learning_rate": 0.00019892499888789098, "loss": 1.4521, "step": 110},
    {"epoch": 0.45320197044334976, "grad_norm": 0.12158203125, "learning_rate": 0.00019856368509604412, "loss": 1.4494, "step": 115},
    {"epoch": 0.4729064039408867, "grad_norm": 0.115234375, "learning_rate": 0.00019815054425144815, "loss": 1.4289, "step": 120},
    {"epoch": 0.49261083743842365, "grad_norm": 0.1103515625, "learning_rate": 0.00019768579359306205, "loss": 1.4261, "step": 125},
    {"epoch": 0.5123152709359606, "grad_norm": 0.1123046875, "learning_rate": 0.00019716967749747207, "loss": 1.4212, "step": 130},
    {"epoch": 0.5320197044334976, "grad_norm": 0.1044921875, "learning_rate": 0.00019660246735039266, "loss": 1.4517, "step": 135},
    {"epoch": 0.5517241379310345, "grad_norm": 0.1171875, "learning_rate": 0.00019598446140396605, "loss": 1.4208, "step": 140},
    {"epoch": 0.5714285714285714, "grad_norm": 0.1103515625, "learning_rate": 0.00019531598461993392, "loss": 1.4357, "step": 145},
    {"epoch": 0.5911330049261084, "grad_norm": 0.1015625, "learning_rate": 0.00019459738849876543, "loss": 1.4108, "step": 150},
    {"epoch": 0.6108374384236454, "grad_norm": 0.11376953125, "learning_rate": 0.00019382905089482995, "loss": 1.4396, "step": 155},
    {"epoch": 0.6305418719211823, "grad_norm": 0.0986328125, "learning_rate": 0.00019301137581771266, "loss": 1.4114, "step": 160},
    {"epoch": 0.6502463054187192, "grad_norm": 0.10302734375, "learning_rate": 0.00019214479321977697, "loss": 1.4221, "step": 165},
    {"epoch": 0.6699507389162561, "grad_norm": 0.10400390625, "learning_rate": 0.00019122975877008567, "loss": 1.4234, "step": 170},
    {"epoch": 0.6896551724137931, "grad_norm": 0.1044921875, "learning_rate": 0.00019026675361479969, "loss": 1.4378, "step": 175},
    {"epoch": 0.7093596059113301, "grad_norm": 0.10400390625, "learning_rate": 0.0001892562841241804, "loss": 1.4178, "step": 180},
    {"epoch": 0.729064039408867, "grad_norm": 0.10693359375, "learning_rate": 0.00018819888162632838, "loss": 1.4221, "step": 185},
    {"epoch": 0.7487684729064039, "grad_norm": 0.1005859375, "learning_rate": 0.00018709510212779903, "loss": 1.4267, "step": 190},
    {"epoch": 0.7684729064039408, "grad_norm": 0.107421875, "learning_rate": 0.0001859455260212414, "loss": 1.4226, "step": 195},
    {"epoch": 0.7881773399014779, "grad_norm": 0.10986328125, "learning_rate": 0.00018475075778021438, "loss": 1.4328, "step": 200},
    {"epoch": 0.7881773399014779, "eval_loss": 1.4307746887207031, "eval_runtime": 143.9646, "eval_samples_per_second": 50.116, "eval_steps_per_second": 3.133, "step": 200},
    {"epoch": 0.8078817733990148, "grad_norm": 0.10009765625, "learning_rate": 0.00018351142564134078, "loss": 1.4341, "step": 205},
    {"epoch": 0.8275862068965517, "grad_norm": 0.0966796875, "learning_rate": 0.0001822281812739659, "loss": 1.4172, "step": 210},
    {"epoch": 0.8472906403940886, "grad_norm": 0.10009765625, "learning_rate": 0.00018090169943749476, "loss": 1.41, "step": 215},
    {"epoch": 0.8669950738916257, "grad_norm": 0.10400390625, "learning_rate": 0.00017953267762658827, "loss": 1.4099, "step": 220},
    {"epoch": 0.8866995073891626, "grad_norm": 0.099609375, "learning_rate": 0.00017812183570440428, "loss": 1.4176, "step": 225},
    {"epoch": 0.9064039408866995, "grad_norm": 0.10546875, "learning_rate": 0.00017666991552407724, "loss": 1.4101, "step": 230},
    {"epoch": 0.9261083743842364, "grad_norm": 0.1044921875, "learning_rate": 0.0001751776805386344, "loss": 1.4029, "step": 235},
    {"epoch": 0.9458128078817734, "grad_norm": 0.10009765625, "learning_rate": 0.000173645915399555, "loss": 1.3944, "step": 240},
    {"epoch": 0.9655172413793104, "grad_norm": 0.0986328125, "learning_rate": 0.00017207542554418227, "loss": 1.4001, "step": 245},
    {"epoch": 0.9852216748768473, "grad_norm": 0.103515625, "learning_rate": 0.000170467036772206, "loss": 1.4076, "step": 250},
    {"epoch": 1.0039408866995074, "grad_norm": 0.10009765625, "learning_rate": 0.00016882159481143802, "loss": 1.4025, "step": 255},
    {"epoch": 1.0236453201970444, "grad_norm": 0.09423828125, "learning_rate": 0.00016713996487310916, "loss": 1.405, "step": 260},
    {"epoch": 1.0433497536945813, "grad_norm": 0.09619140625, "learning_rate": 0.00016542303119692129, "loss": 1.4017, "step": 265},
    {"epoch": 1.0630541871921182, "grad_norm": 0.10107421875, "learning_rate": 0.00016367169658609355, "loss": 1.4038, "step": 270},
    {"epoch": 1.0827586206896551, "grad_norm": 0.0966796875, "learning_rate": 0.0001618868819326479, "loss": 1.3942, "step": 275},
    {"epoch": 1.102463054187192, "grad_norm": 0.10595703125, "learning_rate": 0.00016006952573318278, "loss": 1.3961, "step": 280},
    {"epoch": 1.1221674876847292, "grad_norm": 0.10107421875, "learning_rate": 0.00015822058359539002, "loss": 1.4007, "step": 285},
    {"epoch": 1.141871921182266, "grad_norm": 0.10009765625, "learning_rate": 0.0001563410277355743, "loss": 1.4071, "step": 290},
    {"epoch": 1.161576354679803, "grad_norm": 0.09814453125, "learning_rate": 0.0001544318464674397, "loss": 1.385, "step": 295},
    {"epoch": 1.18128078817734, "grad_norm": 0.0947265625, "learning_rate": 0.00015249404368241116, "loss": 1.3933, "step": 300},
    {"epoch": 1.2009852216748769, "grad_norm": 0.09423828125, "learning_rate": 0.0001505286383217657, "loss": 1.3886, "step": 305},
    {"epoch": 1.2206896551724138, "grad_norm": 0.10302734375, "learning_rate": 0.0001485366638408496, "loss": 1.387, "step": 310},
    {"epoch": 1.2403940886699507, "grad_norm": 0.107421875, "learning_rate": 0.0001465191676656634, "loss": 1.3949, "step": 315},
    {"epoch": 1.2600985221674876, "grad_norm": 0.09423828125, "learning_rate": 0.00014447721064210186, "loss": 1.3927, "step": 320},
    {"epoch": 1.2798029556650246, "grad_norm": 0.09619140625, "learning_rate": 0.00014241186647813626, "loss": 1.4009, "step": 325},
    {"epoch": 1.2995073891625615, "grad_norm": 0.091796875, "learning_rate": 0.00014032422117923426, "loss": 1.3838, "step": 330},
    {"epoch": 1.3192118226600984, "grad_norm": 0.0947265625, "learning_rate": 0.00013821537247731336, "loss": 1.3958, "step": 335},
    {"epoch": 1.3389162561576355, "grad_norm": 0.0947265625, "learning_rate": 0.00013608642925352793, "loss": 1.4111, "step": 340},
    {"epoch": 1.3586206896551725, "grad_norm": 0.0966796875, "learning_rate": 0.00013393851095519423, "loss": 1.4007, "step": 345},
    {"epoch": 1.3783251231527094, "grad_norm": 0.09521484375, "learning_rate": 0.00013177274700715914, "loss": 1.3933, "step": 350},
    {"epoch": 1.3980295566502463, "grad_norm": 0.09814453125, "learning_rate": 0.00012959027621792265, "loss": 1.3997, "step": 355},
    {"epoch": 1.4177339901477832, "grad_norm": 0.09716796875, "learning_rate": 0.00012739224618082612, "loss": 1.4075, "step": 360},
    {"epoch": 1.4374384236453202, "grad_norm": 0.1064453125, "learning_rate": 0.00012517981267062134, "loss": 1.3927, "step": 365},
    {"epoch": 1.457142857142857, "grad_norm": 0.095703125, "learning_rate": 0.00012295413903573756, "loss": 1.3956, "step": 370},
    {"epoch": 1.4768472906403942, "grad_norm": 0.09423828125, "learning_rate": 0.00012071639558656614, "loss": 1.4016, "step": 375},
    {"epoch": 1.4965517241379311, "grad_norm": 0.0947265625, "learning_rate": 0.00011846775898008438, "loss": 1.3889, "step": 380},
    {"epoch": 1.516256157635468, "grad_norm": 0.0986328125, "learning_rate": 0.00011620941160114229, "loss": 1.3962, "step": 385},
    {"epoch": 1.535960591133005, "grad_norm": 0.095703125, "learning_rate": 0.0001139425409407374, "loss": 1.3996, "step": 390},
    {"epoch": 1.555665024630542, "grad_norm": 0.09228515625, "learning_rate": 0.00011166833897160465, "loss": 1.4013, "step": 395},
    {"epoch": 1.5753694581280788, "grad_norm": 0.09814453125, "learning_rate": 0.00010938800152144984, "loss": 1.3788, "step": 400},
    {"epoch": 1.5753694581280788, "eval_loss": 1.4156588315963745, "eval_runtime": 144.1156, "eval_samples_per_second": 50.064, "eval_steps_per_second": 3.129, "step": 400},
    {"epoch": 1.5950738916256157, "grad_norm": 0.09521484375, "learning_rate": 0.00010710272764415566, "loss": 1.3997, "step": 405},
    {"epoch": 1.6147783251231527, "grad_norm": 0.09375, "learning_rate": 0.00010481371898929186, "loss": 1.3844, "step": 410},
    {"epoch": 1.6344827586206896, "grad_norm": 0.095703125, "learning_rate": 0.0001025221791702601, "loss": 1.4087, "step": 415},
    {"epoch": 1.6541871921182265, "grad_norm": 0.09765625, "learning_rate": 0.00010022931313140638, "loss": 1.3909, "step": 420},
    {"epoch": 1.6738916256157634, "grad_norm": 0.091796875, "learning_rate": 9.793632651443357e-05, "loss": 1.3925, "step": 425},
    {"epoch": 1.6935960591133004, "grad_norm": 0.09375, "learning_rate": 9.564442502444735e-05, "loss": 1.3906, "step": 430},
    {"epoch": 1.7133004926108373, "grad_norm": 0.09521484375, "learning_rate": 9.33548137959686e-05, "loss": 1.4122, "step": 435},
    {"epoch": 1.7330049261083744, "grad_norm": 0.1025390625, "learning_rate": 9.106869675924605e-05, "loss": 1.3932, "step": 440},
    {"epoch": 1.7527093596059113, "grad_norm": 0.0947265625, "learning_rate": 8.878727600720207e-05, "loss": 1.3974, "step": 445},
    {"epoch": 1.7724137931034483, "grad_norm": 0.0927734375, "learning_rate": 8.651175116334443e-05, "loss": 1.3872, "step": 450},
    {"epoch": 1.7921182266009852, "grad_norm": 0.09619140625, "learning_rate": 8.424331875097688e-05, "loss": 1.4025, "step": 455},
    {"epoch": 1.8118226600985223, "grad_norm": 0.0966796875, "learning_rate": 8.19831715640394e-05, "loss": 1.4054, "step": 460},
    {"epoch": 1.8315270935960593, "grad_norm": 0.09326171875, "learning_rate": 7.973249803991006e-05, "loss": 1.3878, "step": 465},
    {"epoch": 1.8512315270935962, "grad_norm": 0.0986328125, "learning_rate": 7.749248163449693e-05, "loss": 1.3862, "step": 470},
    {"epoch": 1.870935960591133, "grad_norm": 0.099609375, "learning_rate": 7.526430019995001e-05, "loss": 1.4118, "step": 475},
    {"epoch": 1.89064039408867, "grad_norm": 0.09326171875, "learning_rate": 7.304912536531944e-05, "loss": 1.3954, "step": 480},
    {"epoch": 1.910344827586207, "grad_norm": 0.09130859375, "learning_rate": 7.084812192048594e-05, "loss": 1.4003, "step": 485},
    {"epoch": 1.9300492610837439, "grad_norm": 0.1005859375, "learning_rate": 6.866244720368737e-05, "loss": 1.3998, "step": 490},
    {"epoch": 1.9497536945812808, "grad_norm": 0.09423828125, "learning_rate": 6.6493250492964e-05, "loss": 1.3837, "step": 495},
    {"epoch": 1.9694581280788177, "grad_norm": 0.09326171875, "learning_rate": 6.434167240184135e-05, "loss": 1.411, "step": 500},
    {"epoch": 1.9891625615763546, "grad_norm": 0.09228515625, "learning_rate": 6.220884427956953e-05, "loss": 1.3949, "step": 505},
    {"epoch": 2.007881773399015, "grad_norm": 0.0927734375, "learning_rate": 6.0095887616233796e-05, "loss": 1.3871, "step": 510},
    {"epoch": 2.027586206896552, "grad_norm": 0.09228515625, "learning_rate": 5.800391345304914e-05, "loss": 1.3871, "step": 515},
    {"epoch": 2.0472906403940887, "grad_norm": 0.09033203125, "learning_rate": 5.593402179814944e-05, "loss": 1.3887, "step": 520},
    {"epoch": 2.0669950738916256, "grad_norm": 0.0927734375, "learning_rate": 5.388730104817769e-05, "loss": 1.3913, "step": 525},
    {"epoch": 2.0866995073891625, "grad_norm": 0.095703125, "learning_rate": 5.18648274159821e-05, "loss": 1.3854, "step": 530},
    {"epoch": 2.1064039408866995, "grad_norm": 0.09326171875, "learning_rate": 4.9867664364718725e-05, "loss": 1.3915, "step": 535},
    {"epoch": 2.1261083743842364, "grad_norm": 0.09033203125, "learning_rate": 4.7896862048657965e-05, "loss": 1.3917, "step": 540},
    {"epoch": 2.1458128078817733, "grad_norm": 0.095703125, "learning_rate": 4.595345676098923e-05, "loss": 1.3855, "step": 545},
    {"epoch": 2.1655172413793102, "grad_norm": 0.09130859375, "learning_rate": 4.403847038891424e-05, "loss": 1.3838, "step": 550},
    {"epoch": 2.185221674876847, "grad_norm": 0.09033203125, "learning_rate": 4.2152909876315316e-05, "loss": 1.3919, "step": 555},
    {"epoch": 2.204926108374384, "grad_norm": 0.0966796875, "learning_rate": 4.0297766694280915e-05, "loss": 1.3979, "step": 560},
    {"epoch": 2.224630541871921, "grad_norm": 0.091796875, "learning_rate": 3.8474016319767435e-05, "loss": 1.3865, "step": 565},
    {"epoch": 2.2443349753694584, "grad_norm": 0.0986328125, "learning_rate": 3.6682617722671096e-05, "loss": 1.3903, "step": 570},
    {"epoch": 2.264039408866995, "grad_norm": 0.09033203125, "learning_rate": 3.4924512861579315e-05, "loss": 1.3841, "step": 575},
    {"epoch": 2.283743842364532, "grad_norm": 0.09375, "learning_rate": 3.3200626188467344e-05, "loss": 1.3965, "step": 580},
    {"epoch": 2.303448275862069, "grad_norm": 0.08984375, "learning_rate": 3.151186416260006e-05, "loss": 1.4112, "step": 585},
    {"epoch": 2.323152709359606, "grad_norm": 0.09521484375, "learning_rate": 2.9859114773895025e-05, "loss": 1.385, "step": 590},
    {"epoch": 2.342857142857143, "grad_norm": 0.0986328125, "learning_rate": 2.8243247075996693e-05, "loss": 1.3838, "step": 595},
    {"epoch": 2.36256157635468, "grad_norm": 0.0927734375, "learning_rate": 2.6665110729308263e-05, "loss": 1.3938, "step": 600},
    {"epoch": 2.36256157635468, "eval_loss": 1.4132238626480103, "eval_runtime": 144.0354, "eval_samples_per_second": 50.092, "eval_steps_per_second": 3.131, "step": 600},
    {"epoch": 2.382266009852217, "grad_norm": 0.09423828125, "learning_rate": 2.5125535554220482e-05, "loss": 1.3974, "step": 605},
    {"epoch": 2.4019704433497537, "grad_norm": 0.09375, "learning_rate": 2.3625331094773206e-05, "loss": 1.3814, "step": 610},
    {"epoch": 2.4216748768472907, "grad_norm": 0.091796875, "learning_rate": 2.2165286192978342e-05, "loss": 1.3858, "step": 615},
    {"epoch": 2.4413793103448276, "grad_norm": 0.0947265625, "learning_rate": 2.074616857402867e-05, "loss": 1.3804, "step": 620},
    {"epoch": 2.4610837438423645, "grad_norm": 0.0927734375, "learning_rate": 1.936872444261022e-05, "loss": 1.3868, "step": 625},
    {"epoch": 2.4807881773399014, "grad_norm": 0.09228515625, "learning_rate": 1.8033678090530813e-05, "loss": 1.3923, "step": 630},
    {"epoch": 2.5004926108374383, "grad_norm": 0.091796875, "learning_rate": 1.6741731515870594e-05, "loss": 1.3889, "step": 635},
    {"epoch": 2.5201970443349753, "grad_norm": 0.09814453125, "learning_rate": 1.549356405385538e-05, "loss": 1.3736, "step": 640},
    {"epoch": 2.539901477832512, "grad_norm": 0.0947265625, "learning_rate": 1.428983201964662e-05, "loss": 1.3955, "step": 645},
    {"epoch": 2.559605911330049, "grad_norm": 0.09375, "learning_rate": 1.313116836323568e-05, "loss": 1.3946, "step": 650},
    {"epoch": 2.5793103448275865, "grad_norm": 0.09521484375, "learning_rate": 1.2018182336624273e-05, "loss": 1.3907, "step": 655},
    {"epoch": 2.599014778325123, "grad_norm": 0.09326171875, "learning_rate": 1.0951459173465629e-05, "loss": 1.4041, "step": 660},
    {"epoch": 2.6187192118226603, "grad_norm": 0.09423828125, "learning_rate": 9.93155978133541e-06, "loss": 1.391, "step": 665},
    {"epoch": 2.638423645320197, "grad_norm": 0.09033203125, "learning_rate": 8.959020446793288e-06, "loss": 1.3882, "step": 670},
    {"epoch": 2.658128078817734, "grad_norm": 0.09375, "learning_rate": 8.034352553391367e-06, "loss": 1.4001, "step": 675},
    {"epoch": 2.677832512315271, "grad_norm": 0.0927734375, "learning_rate": 7.158042312776847e-06, "loss": 1.3824, "step": 680},
    {"epoch": 2.697536945812808, "grad_norm": 0.0908203125, "learning_rate": 6.330550509030852e-06, "loss": 1.379, "step": 685},
    {"epoch": 2.717241379310345, "grad_norm": 0.08984375, "learning_rate": 5.552312256377423e-06, "loss": 1.3787, "step": 690},
    {"epoch": 2.736945812807882, "grad_norm": 0.09228515625, "learning_rate": 4.823736770390552e-06, "loss": 1.3902, "step": 695},
    {"epoch": 2.7566502463054188, "grad_norm": 0.0927734375, "learning_rate": 4.14520715281923e-06, "loss": 1.3991, "step": 700},
    {"epoch": 2.7763546798029557, "grad_norm": 0.09375, "learning_rate": 3.517080190143629e-06, "loss": 1.3866, "step": 705},
    {"epoch": 2.7960591133004926, "grad_norm": 0.09033203125, "learning_rate": 2.9396861659686915e-06, "loss": 1.3864, "step": 710},
    {"epoch": 2.8157635467980295, "grad_norm": 0.0927734375, "learning_rate": 2.4133286873533112e-06, "loss": 1.373, "step": 715},
    {"epoch": 2.8354679802955665, "grad_norm": 0.0966796875, "learning_rate": 1.9382845251668335e-06, "loss": 1.384, "step": 720},
    {"epoch": 2.8551724137931034, "grad_norm": 0.091796875, "learning_rate": 1.514803468556547e-06, "loss": 1.3768, "step": 725},
    {"epoch": 2.8748768472906403, "grad_norm": 0.0908203125, "learning_rate": 1.14310819360276e-06, "loss": 1.3968, "step": 730},
    {"epoch": 2.8945812807881772, "grad_norm": 0.09130859375, "learning_rate": 8.233941462306271e-07, "loss": 1.3811, "step": 735},
    {"epoch": 2.914285714285714, "grad_norm": 0.0908203125, "learning_rate": 5.558294394402253e-07, "loss": 1.4115, "step": 740},
    {"epoch": 2.933990147783251, "grad_norm": 0.08984375, "learning_rate": 3.405547649087959e-07, "loss": 1.4119, "step": 745},
    {"epoch": 2.9536945812807884, "grad_norm": 0.09130859375, "learning_rate": 1.7768331901187875e-07, "loss": 1.395, "step": 750},
    {"epoch": 2.973399014778325, "grad_norm": 0.0947265625, "learning_rate": 6.730074330203451e-08, "loss": 1.3966, "step": 755},
    {"epoch": 2.9931034482758623, "grad_norm": 0.09228515625, "learning_rate": 9.46507947655606e-09, "loss": 1.3686, "step": 760},
    {"epoch": 3.0, "step": 762, "total_flos": 7.533943292711404e+17, "train_loss": 1.4212341141200129, "train_runtime": 13229.3679, "train_samples_per_second": 14.73, "train_steps_per_second": 0.058}
  ],
  "logging_steps": 5,
  "max_steps": 762,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 1000000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.533943292711404e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}