| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.5074024226110363, | |
| "eval_steps": 500, | |
| "global_step": 1050, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.007178106774338268, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0001, | |
| "loss": 6.8185, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.014356213548676536, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0001, | |
| "loss": 5.3587, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.021534320323014805, | |
| "grad_norm": 0.984375, | |
| "learning_rate": 0.0001, | |
| "loss": 3.9044, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.028712427097353072, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 0.0001, | |
| "loss": 2.4036, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.03589053387169134, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.0001, | |
| "loss": 1.5506, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.04306864064602961, | |
| "grad_norm": 0.44921875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8859, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.05024674742036788, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3927, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.057424854194706144, | |
| "grad_norm": 0.11669921875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1452, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.06460296096904442, | |
| "grad_norm": 0.10400390625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0693, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.07178106774338268, | |
| "grad_norm": 0.040283203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0279, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.07895917451772096, | |
| "grad_norm": 0.46484375, | |
| "learning_rate": 0.0001, | |
| "loss": 1.6299, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.08613728129205922, | |
| "grad_norm": 0.201171875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9721, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.09331538806639748, | |
| "grad_norm": 0.1953125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8273, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.10049349484073576, | |
| "grad_norm": 0.1259765625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6694, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.10767160161507403, | |
| "grad_norm": 0.1171875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5689, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.11484970838941229, | |
| "grad_norm": 0.1357421875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.35, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.12202781516375057, | |
| "grad_norm": 0.06640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1548, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.12920592193808883, | |
| "grad_norm": 0.0791015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0625, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.1363840287124271, | |
| "grad_norm": 0.0284423828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0345, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.14356213548676536, | |
| "grad_norm": 0.0654296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0194, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.15074024226110364, | |
| "grad_norm": 0.2490234375, | |
| "learning_rate": 0.0001, | |
| "loss": 1.1732, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.1579183490354419, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.87, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.16509645580978016, | |
| "grad_norm": 0.1298828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7213, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.17227456258411844, | |
| "grad_norm": 0.158203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5522, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.17945266935845672, | |
| "grad_norm": 0.1015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4513, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.18663077613279497, | |
| "grad_norm": 0.1064453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2306, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.19380888290713325, | |
| "grad_norm": 0.06591796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0997, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.20098698968147152, | |
| "grad_norm": 0.060546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0362, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.20816509645580977, | |
| "grad_norm": 0.037109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0274, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.21534320323014805, | |
| "grad_norm": 0.0234375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0054, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.22252131000448633, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0624, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.22969941677882458, | |
| "grad_norm": 0.1787109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.829, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.23687752355316286, | |
| "grad_norm": 0.15234375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6497, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.24405563032750113, | |
| "grad_norm": 0.1142578125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5721, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.2512337371018394, | |
| "grad_norm": 0.1533203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4299, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.25841184387617766, | |
| "grad_norm": 0.11962890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2842, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.26558995065051594, | |
| "grad_norm": 0.049560546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1096, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.2727680574248542, | |
| "grad_norm": 0.072265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0362, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.27994616419919244, | |
| "grad_norm": 0.0634765625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0188, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.2871242709735307, | |
| "grad_norm": 0.0167236328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0077, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.294302377747869, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0719, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.30148048452220727, | |
| "grad_norm": 0.1669921875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.79, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.30865859129654555, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6307, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.3158366980708838, | |
| "grad_norm": 0.126953125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5041, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.32301480484522205, | |
| "grad_norm": 0.1748046875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4389, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.3301929116195603, | |
| "grad_norm": 0.1181640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2337, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.3373710183938986, | |
| "grad_norm": 0.08154296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1152, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.3445491251682369, | |
| "grad_norm": 0.038818359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0224, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.35172723194257516, | |
| "grad_norm": 0.0703125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0363, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.35890533871691344, | |
| "grad_norm": 0.0400390625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0073, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.36608344549125166, | |
| "grad_norm": 0.1650390625, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0824, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.37326155226558994, | |
| "grad_norm": 0.12890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8525, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.3804396590399282, | |
| "grad_norm": 0.1484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6736, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.3876177658142665, | |
| "grad_norm": 0.16015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5694, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.39479587258860477, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4329, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.40197397936294305, | |
| "grad_norm": 0.095703125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2051, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.40915208613728127, | |
| "grad_norm": 0.130859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1067, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.41633019291161955, | |
| "grad_norm": 0.10302734375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0365, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.4235082996859578, | |
| "grad_norm": 0.05126953125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0252, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.4306864064602961, | |
| "grad_norm": 0.0029449462890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0046, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.4378645132346344, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0461, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.44504262000897266, | |
| "grad_norm": 0.1474609375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7834, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.4522207267833109, | |
| "grad_norm": 0.11669921875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6162, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.45939883355764916, | |
| "grad_norm": 0.1123046875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4886, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.46657694033198743, | |
| "grad_norm": 0.11962890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3858, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.4737550471063257, | |
| "grad_norm": 0.09521484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2249, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.480933153880664, | |
| "grad_norm": 0.061279296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0778, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.48811126065500227, | |
| "grad_norm": 0.04931640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0258, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.4952893674293405, | |
| "grad_norm": 0.0283203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0245, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.5024674742036788, | |
| "grad_norm": 0.0218505859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0108, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.509645580978017, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 0.0001, | |
| "loss": 1.1229, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.5168236877523553, | |
| "grad_norm": 0.130859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7767, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.5240017945266936, | |
| "grad_norm": 0.1162109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6151, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.5311799013010319, | |
| "grad_norm": 0.11767578125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4997, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.5383580080753702, | |
| "grad_norm": 0.1181640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3645, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.5455361148497084, | |
| "grad_norm": 0.09228515625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2487, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.5527142216240467, | |
| "grad_norm": 0.043212890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1116, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.5598923283983849, | |
| "grad_norm": 0.0262451171875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0278, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.5670704351727232, | |
| "grad_norm": 0.048583984375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0104, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.5742485419470614, | |
| "grad_norm": 0.0458984375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0104, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5814266487213997, | |
| "grad_norm": 0.1953125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9303, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.588604755495738, | |
| "grad_norm": 0.1513671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.766, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.5957828622700763, | |
| "grad_norm": 0.130859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5917, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.6029609690444145, | |
| "grad_norm": 0.10595703125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5611, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.6101390758187528, | |
| "grad_norm": 0.1220703125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3833, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.6173171825930911, | |
| "grad_norm": 0.11865234375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2563, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.6244952893674294, | |
| "grad_norm": 0.07568359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1056, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.6316733961417677, | |
| "grad_norm": 0.087890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0343, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.6388515029161059, | |
| "grad_norm": 0.038818359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0113, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.6460296096904441, | |
| "grad_norm": 0.0194091796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0062, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.6532077164647824, | |
| "grad_norm": 0.18359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.894, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.6603858232391207, | |
| "grad_norm": 0.158203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7454, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.6675639300134589, | |
| "grad_norm": 0.123046875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5539, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.6747420367877972, | |
| "grad_norm": 0.1357421875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5263, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.6819201435621355, | |
| "grad_norm": 0.09521484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3882, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.6890982503364738, | |
| "grad_norm": 0.07958984375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2243, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.696276357110812, | |
| "grad_norm": 0.08154296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0728, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.7034544638851503, | |
| "grad_norm": 0.048583984375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0205, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.7106325706594886, | |
| "grad_norm": 0.06103515625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0179, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.7178106774338269, | |
| "grad_norm": 0.031494140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0072, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.7249887842081651, | |
| "grad_norm": 0.2470703125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9516, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.7321668909825033, | |
| "grad_norm": 0.1240234375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6854, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.7393449977568416, | |
| "grad_norm": 0.1552734375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5769, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.7465231045311799, | |
| "grad_norm": 0.11962890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4634, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.7537012113055181, | |
| "grad_norm": 0.11962890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3856, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.7608793180798564, | |
| "grad_norm": 0.10791015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2155, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.7680574248541947, | |
| "grad_norm": 0.0634765625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0857, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.775235531628533, | |
| "grad_norm": 0.07861328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0233, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.7824136384028713, | |
| "grad_norm": 0.0281982421875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.013, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.7895917451772095, | |
| "grad_norm": 0.016845703125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0061, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.7967698519515478, | |
| "grad_norm": 0.1796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8853, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.8039479587258861, | |
| "grad_norm": 0.154296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.726, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.8111260655002244, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.62, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.8183041722745625, | |
| "grad_norm": 0.126953125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5036, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.8254822790489008, | |
| "grad_norm": 0.1279296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4053, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.8326603858232391, | |
| "grad_norm": 0.1142578125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2355, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.8398384925975774, | |
| "grad_norm": 0.045654296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0751, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.8470165993719156, | |
| "grad_norm": 0.109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0226, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.8541947061462539, | |
| "grad_norm": 0.027587890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0055, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.8613728129205922, | |
| "grad_norm": 0.05712890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0052, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.8685509196949305, | |
| "grad_norm": 0.1943359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9366, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.8757290264692688, | |
| "grad_norm": 0.1484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7429, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.882907133243607, | |
| "grad_norm": 0.14453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.564, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.8900852400179453, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5045, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.8972633467922836, | |
| "grad_norm": 0.1259765625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3997, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.9044414535666218, | |
| "grad_norm": 0.0830078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1856, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.91161956034096, | |
| "grad_norm": 0.06298828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0583, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.9187976671152983, | |
| "grad_norm": 0.033935546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0274, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.9259757738896366, | |
| "grad_norm": 0.03271484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0078, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.9331538806639749, | |
| "grad_norm": 0.0244140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.003, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.9403319874383131, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9234, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.9475100942126514, | |
| "grad_norm": 0.1494140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7145, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.9546882009869897, | |
| "grad_norm": 0.138671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5764, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.961866307761328, | |
| "grad_norm": 0.1298828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4568, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.9690444145356663, | |
| "grad_norm": 0.10400390625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2681, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.9762225213100045, | |
| "grad_norm": 0.080078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1399, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.9834006280843428, | |
| "grad_norm": 0.068359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0375, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.990578734858681, | |
| "grad_norm": 0.040283203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0108, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.9977568416330193, | |
| "grad_norm": 0.022216796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0082, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 1.0049349484073575, | |
| "grad_norm": 0.193359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6031, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.012113055181696, | |
| "grad_norm": 0.1640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7291, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 1.019291161956034, | |
| "grad_norm": 0.1708984375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5393, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.0264692687303723, | |
| "grad_norm": 0.1416015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.413, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 1.0336473755047106, | |
| "grad_norm": 0.11669921875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3693, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.0408254822790488, | |
| "grad_norm": 0.123046875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2104, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 1.0480035890533872, | |
| "grad_norm": 0.055908203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0834, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.0551816958277254, | |
| "grad_norm": 0.0546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0144, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 1.0623598026020638, | |
| "grad_norm": 0.11181640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0119, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.069537909376402, | |
| "grad_norm": 0.0034332275390625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0023, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 1.0767160161507403, | |
| "grad_norm": 0.2490234375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5662, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.0838941229250785, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7079, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 1.0910722296994169, | |
| "grad_norm": 0.1904296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5619, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.098250336473755, | |
| "grad_norm": 0.12890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4236, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 1.1054284432480934, | |
| "grad_norm": 0.11328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3422, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.1126065500224316, | |
| "grad_norm": 0.11181640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2757, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 1.1197846567967698, | |
| "grad_norm": 0.1103515625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.101, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.1269627635711081, | |
| "grad_norm": 0.0615234375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0292, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 1.1341408703454463, | |
| "grad_norm": 0.01123046875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0117, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.1413189771197847, | |
| "grad_norm": 0.0311279296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0068, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 1.1484970838941229, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5275, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.1556751906684612, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7151, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 1.1628532974427994, | |
| "grad_norm": 0.1708984375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5625, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.1700314042171378, | |
| "grad_norm": 0.1708984375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4765, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 1.177209510991476, | |
| "grad_norm": 0.1875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3728, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.1843876177658144, | |
| "grad_norm": 0.11181640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2169, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 1.1915657245401525, | |
| "grad_norm": 0.0888671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.09, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.198743831314491, | |
| "grad_norm": 0.06396484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0261, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 1.205921938088829, | |
| "grad_norm": 0.06591796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0169, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.2131000448631672, | |
| "grad_norm": 0.01409912109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0041, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 1.2202781516375056, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5508, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.2274562584118438, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7281, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 1.2346343651861822, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.499, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.2418124719605204, | |
| "grad_norm": 0.1767578125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5054, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 1.2489905787348587, | |
| "grad_norm": 0.1513671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3918, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.256168685509197, | |
| "grad_norm": 0.1318359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2211, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 1.263346792283535, | |
| "grad_norm": 0.053955078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.099, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.2705248990578735, | |
| "grad_norm": 0.0263671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0239, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 1.2777030058322119, | |
| "grad_norm": 0.055908203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0203, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.28488111260655, | |
| "grad_norm": 0.0172119140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0053, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 1.2920592193808882, | |
| "grad_norm": 0.1943359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4856, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.2992373261552266, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7204, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 1.3064154329295647, | |
| "grad_norm": 0.19140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5374, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.3135935397039031, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.48, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 1.3207716464782413, | |
| "grad_norm": 0.19921875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3897, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.3279497532525797, | |
| "grad_norm": 0.10205078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2242, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 1.3351278600269179, | |
| "grad_norm": 0.08251953125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1292, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.3423059668012562, | |
| "grad_norm": 0.068359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0242, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 1.3494840735755944, | |
| "grad_norm": 0.08544921875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0092, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.3566621803499328, | |
| "grad_norm": 0.004241943359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0039, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 1.363840287124271, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5465, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.3710183938986091, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6114, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 1.3781965006729475, | |
| "grad_norm": 0.244140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5226, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.385374607447286, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4234, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 1.392552714221624, | |
| "grad_norm": 0.130859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3595, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.3997308209959622, | |
| "grad_norm": 0.123046875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2464, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 1.4069089277703006, | |
| "grad_norm": 0.11767578125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.11, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.4140870345446388, | |
| "grad_norm": 0.05322265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0205, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 1.4212651413189772, | |
| "grad_norm": 0.0206298828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0102, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.4284432480933154, | |
| "grad_norm": 0.0250244140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0044, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 1.4356213548676537, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4827, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.442799461641992, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6536, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 1.44997756841633, | |
| "grad_norm": 0.1953125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5993, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.4571556751906685, | |
| "grad_norm": 0.158203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4176, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 1.4643337819650069, | |
| "grad_norm": 0.1689453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.307, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.471511888739345, | |
| "grad_norm": 0.1005859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2381, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 1.4786899955136832, | |
| "grad_norm": 0.06396484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.084, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.4858681022880216, | |
| "grad_norm": 0.01153564453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0165, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 1.4930462090623597, | |
| "grad_norm": 0.0283203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0059, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.500224315836698, | |
| "grad_norm": 0.0380859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0051, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 1.5074024226110363, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5321, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.5074024226110363, | |
| "step": 1050, | |
| "total_flos": 4.83809405232513e+18, | |
| "train_loss": 0.43872410982492427, | |
| "train_runtime": 144968.4958, | |
| "train_samples_per_second": 0.464, | |
| "train_steps_per_second": 0.007 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1050, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 90, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.83809405232513e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |