{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.5074024226110363, "eval_steps": 500, "global_step": 1050, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007178106774338268, "grad_norm": 0.65234375, "learning_rate": 0.0001, "loss": 6.8185, "step": 5 }, { "epoch": 0.014356213548676536, "grad_norm": 0.69921875, "learning_rate": 0.0001, "loss": 5.3587, "step": 10 }, { "epoch": 0.021534320323014805, "grad_norm": 0.984375, "learning_rate": 0.0001, "loss": 3.9044, "step": 15 }, { "epoch": 0.028712427097353072, "grad_norm": 0.84765625, "learning_rate": 0.0001, "loss": 2.4036, "step": 20 }, { "epoch": 0.03589053387169134, "grad_norm": 0.63671875, "learning_rate": 0.0001, "loss": 1.5506, "step": 25 }, { "epoch": 0.04306864064602961, "grad_norm": 0.44921875, "learning_rate": 0.0001, "loss": 0.8859, "step": 30 }, { "epoch": 0.05024674742036788, "grad_norm": 0.259765625, "learning_rate": 0.0001, "loss": 0.3927, "step": 35 }, { "epoch": 0.057424854194706144, "grad_norm": 0.11669921875, "learning_rate": 0.0001, "loss": 0.1452, "step": 40 }, { "epoch": 0.06460296096904442, "grad_norm": 0.10400390625, "learning_rate": 0.0001, "loss": 0.0693, "step": 45 }, { "epoch": 0.07178106774338268, "grad_norm": 0.040283203125, "learning_rate": 0.0001, "loss": 0.0279, "step": 50 }, { "epoch": 0.07895917451772096, "grad_norm": 0.46484375, "learning_rate": 0.0001, "loss": 1.6299, "step": 55 }, { "epoch": 0.08613728129205922, "grad_norm": 0.201171875, "learning_rate": 0.0001, "loss": 0.9721, "step": 60 }, { "epoch": 0.09331538806639748, "grad_norm": 0.1953125, "learning_rate": 0.0001, "loss": 0.8273, "step": 65 }, { "epoch": 0.10049349484073576, "grad_norm": 0.1259765625, "learning_rate": 0.0001, "loss": 0.6694, "step": 70 }, { "epoch": 0.10767160161507403, "grad_norm": 0.1171875, "learning_rate": 0.0001, "loss": 0.5689, "step": 75 }, { "epoch": 0.11484970838941229, "grad_norm": 0.1357421875, "learning_rate": 0.0001, "loss": 0.35, "step": 80 }, { "epoch": 0.12202781516375057, "grad_norm": 0.06640625, "learning_rate": 0.0001, "loss": 0.1548, "step": 85 }, { "epoch": 0.12920592193808883, "grad_norm": 0.0791015625, "learning_rate": 0.0001, "loss": 0.0625, "step": 90 }, { "epoch": 0.1363840287124271, "grad_norm": 0.0284423828125, "learning_rate": 0.0001, "loss": 0.0345, "step": 95 }, { "epoch": 0.14356213548676536, "grad_norm": 0.0654296875, "learning_rate": 0.0001, "loss": 0.0194, "step": 100 }, { "epoch": 0.15074024226110364, "grad_norm": 0.2490234375, "learning_rate": 0.0001, "loss": 1.1732, "step": 105 }, { "epoch": 0.1579183490354419, "grad_norm": 0.2099609375, "learning_rate": 0.0001, "loss": 0.87, "step": 110 }, { "epoch": 0.16509645580978016, "grad_norm": 0.1298828125, "learning_rate": 0.0001, "loss": 0.7213, "step": 115 }, { "epoch": 0.17227456258411844, "grad_norm": 0.158203125, "learning_rate": 0.0001, "loss": 0.5522, "step": 120 }, { "epoch": 0.17945266935845672, "grad_norm": 0.1015625, "learning_rate": 0.0001, "loss": 0.4513, "step": 125 }, { "epoch": 0.18663077613279497, "grad_norm": 0.1064453125, "learning_rate": 0.0001, "loss": 0.2306, "step": 130 }, { "epoch": 0.19380888290713325, "grad_norm": 0.06591796875, "learning_rate": 0.0001, "loss": 0.0997, "step": 135 }, { "epoch": 0.20098698968147152, "grad_norm": 0.060546875, "learning_rate": 0.0001, "loss": 0.0362, "step": 140 }, { "epoch": 0.20816509645580977, "grad_norm": 0.037109375, "learning_rate": 0.0001, "loss": 0.0274, "step": 145 }, { "epoch": 0.21534320323014805, "grad_norm": 0.0234375, "learning_rate": 0.0001, "loss": 0.0054, "step": 150 }, { "epoch": 0.22252131000448633, "grad_norm": 0.337890625, "learning_rate": 0.0001, "loss": 1.0624, "step": 155 }, { "epoch": 0.22969941677882458, "grad_norm": 0.1787109375, "learning_rate": 0.0001, "loss": 0.829, "step": 160 }, { "epoch": 0.23687752355316286, "grad_norm": 0.15234375, "learning_rate": 0.0001, "loss": 0.6497, "step": 165 }, { "epoch": 0.24405563032750113, "grad_norm": 0.1142578125, "learning_rate": 0.0001, "loss": 0.5721, "step": 170 }, { "epoch": 0.2512337371018394, "grad_norm": 0.1533203125, "learning_rate": 0.0001, "loss": 0.4299, "step": 175 }, { "epoch": 0.25841184387617766, "grad_norm": 0.11962890625, "learning_rate": 0.0001, "loss": 0.2842, "step": 180 }, { "epoch": 0.26558995065051594, "grad_norm": 0.049560546875, "learning_rate": 0.0001, "loss": 0.1096, "step": 185 }, { "epoch": 0.2727680574248542, "grad_norm": 0.072265625, "learning_rate": 0.0001, "loss": 0.0362, "step": 190 }, { "epoch": 0.27994616419919244, "grad_norm": 0.0634765625, "learning_rate": 0.0001, "loss": 0.0188, "step": 195 }, { "epoch": 0.2871242709735307, "grad_norm": 0.0167236328125, "learning_rate": 0.0001, "loss": 0.0077, "step": 200 }, { "epoch": 0.294302377747869, "grad_norm": 0.2109375, "learning_rate": 0.0001, "loss": 1.0719, "step": 205 }, { "epoch": 0.30148048452220727, "grad_norm": 0.1669921875, "learning_rate": 0.0001, "loss": 0.79, "step": 210 }, { "epoch": 0.30865859129654555, "grad_norm": 0.1328125, "learning_rate": 0.0001, "loss": 0.6307, "step": 215 }, { "epoch": 0.3158366980708838, "grad_norm": 0.126953125, "learning_rate": 0.0001, "loss": 0.5041, "step": 220 }, { "epoch": 0.32301480484522205, "grad_norm": 0.1748046875, "learning_rate": 0.0001, "loss": 0.4389, "step": 225 }, { "epoch": 0.3301929116195603, "grad_norm": 0.1181640625, "learning_rate": 0.0001, "loss": 0.2337, "step": 230 }, { "epoch": 0.3373710183938986, "grad_norm": 0.08154296875, "learning_rate": 0.0001, "loss": 0.1152, "step": 235 }, { "epoch": 0.3445491251682369, "grad_norm": 0.038818359375, "learning_rate": 0.0001, "loss": 0.0224, "step": 240 }, { "epoch": 0.35172723194257516, "grad_norm": 0.0703125, "learning_rate": 0.0001, "loss": 0.0363, "step": 245 }, { "epoch": 0.35890533871691344, "grad_norm": 0.0400390625, "learning_rate": 0.0001, "loss": 0.0073, "step": 250 }, { "epoch": 0.36608344549125166, "grad_norm": 0.1650390625, "learning_rate": 0.0001, "loss": 1.0824, "step": 255 }, { "epoch": 0.37326155226558994, "grad_norm": 0.12890625, "learning_rate": 0.0001, "loss": 0.8525, "step": 260 }, { "epoch": 0.3804396590399282, "grad_norm": 0.1484375, "learning_rate": 0.0001, "loss": 0.6736, "step": 265 }, { "epoch": 0.3876177658142665, "grad_norm": 0.16015625, "learning_rate": 0.0001, "loss": 0.5694, "step": 270 }, { "epoch": 0.39479587258860477, "grad_norm": 0.146484375, "learning_rate": 0.0001, "loss": 0.4329, "step": 275 }, { "epoch": 0.40197397936294305, "grad_norm": 0.095703125, "learning_rate": 0.0001, "loss": 0.2051, "step": 280 }, { "epoch": 0.40915208613728127, "grad_norm": 0.130859375, "learning_rate": 0.0001, "loss": 0.1067, "step": 285 }, { "epoch": 0.41633019291161955, "grad_norm": 0.10302734375, "learning_rate": 0.0001, "loss": 0.0365, "step": 290 }, { "epoch": 0.4235082996859578, "grad_norm": 0.05126953125, "learning_rate": 0.0001, "loss": 0.0252, "step": 295 }, { "epoch": 0.4306864064602961, "grad_norm": 0.0029449462890625, "learning_rate": 0.0001, "loss": 0.0046, "step": 300 }, { "epoch": 0.4378645132346344, "grad_norm": 0.2177734375, "learning_rate": 0.0001, "loss": 1.0461, "step": 305 }, { "epoch": 0.44504262000897266, "grad_norm": 0.1474609375, "learning_rate": 0.0001, "loss": 0.7834, "step": 310 }, { "epoch": 0.4522207267833109, "grad_norm": 0.11669921875, "learning_rate": 0.0001, "loss": 0.6162, "step": 315 }, { "epoch": 0.45939883355764916, "grad_norm": 0.1123046875, "learning_rate": 0.0001, "loss": 0.4886, "step": 320 }, { "epoch": 0.46657694033198743, "grad_norm": 0.11962890625, "learning_rate": 0.0001, "loss": 0.3858, "step": 325 }, { "epoch": 0.4737550471063257, "grad_norm": 0.09521484375, "learning_rate": 0.0001, "loss": 0.2249, "step": 330 }, { "epoch": 0.480933153880664, "grad_norm": 0.061279296875, "learning_rate": 0.0001, "loss": 0.0778, "step": 335 }, { "epoch": 0.48811126065500227, "grad_norm": 0.04931640625, "learning_rate": 0.0001, "loss": 0.0258, "step": 340 }, { "epoch": 0.4952893674293405, "grad_norm": 0.0283203125, "learning_rate": 0.0001, "loss": 0.0245, "step": 345 }, { "epoch": 0.5024674742036788, "grad_norm": 0.0218505859375, "learning_rate": 0.0001, "loss": 0.0108, "step": 350 }, { "epoch": 0.509645580978017, "grad_norm": 0.2060546875, "learning_rate": 0.0001, "loss": 1.1229, "step": 355 }, { "epoch": 0.5168236877523553, "grad_norm": 0.130859375, "learning_rate": 0.0001, "loss": 0.7767, "step": 360 }, { "epoch": 0.5240017945266936, "grad_norm": 0.1162109375, "learning_rate": 0.0001, "loss": 0.6151, "step": 365 }, { "epoch": 0.5311799013010319, "grad_norm": 0.11767578125, "learning_rate": 0.0001, "loss": 0.4997, "step": 370 }, { "epoch": 0.5383580080753702, "grad_norm": 0.1181640625, "learning_rate": 0.0001, "loss": 0.3645, "step": 375 }, { "epoch": 0.5455361148497084, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.2487, "step": 380 }, { "epoch": 0.5527142216240467, "grad_norm": 0.043212890625, "learning_rate": 0.0001, "loss": 0.1116, "step": 385 }, { "epoch": 0.5598923283983849, "grad_norm": 0.0262451171875, "learning_rate": 0.0001, "loss": 0.0278, "step": 390 }, { "epoch": 0.5670704351727232, "grad_norm": 0.048583984375, "learning_rate": 0.0001, "loss": 0.0104, "step": 395 }, { "epoch": 0.5742485419470614, "grad_norm": 0.0458984375, "learning_rate": 0.0001, "loss": 0.0104, "step": 400 }, { "epoch": 0.5814266487213997, "grad_norm": 0.1953125, "learning_rate": 0.0001, "loss": 0.9303, "step": 405 }, { "epoch": 0.588604755495738, "grad_norm": 0.1513671875, "learning_rate": 0.0001, "loss": 0.766, "step": 410 }, { "epoch": 0.5957828622700763, "grad_norm": 0.130859375, "learning_rate": 0.0001, "loss": 0.5917, "step": 415 }, { "epoch": 0.6029609690444145, "grad_norm": 0.10595703125, "learning_rate": 0.0001, "loss": 0.5611, "step": 420 }, { "epoch": 0.6101390758187528, "grad_norm": 0.1220703125, "learning_rate": 0.0001, "loss": 0.3833, "step": 425 }, { "epoch": 0.6173171825930911, "grad_norm": 0.11865234375, "learning_rate": 0.0001, "loss": 0.2563, "step": 430 }, { "epoch": 0.6244952893674294, "grad_norm": 0.07568359375, "learning_rate": 0.0001, "loss": 0.1056, "step": 435 }, { "epoch": 0.6316733961417677, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.0343, "step": 440 }, { "epoch": 0.6388515029161059, "grad_norm": 0.038818359375, "learning_rate": 0.0001, "loss": 0.0113, "step": 445 }, { "epoch": 0.6460296096904441, "grad_norm": 0.0194091796875, "learning_rate": 0.0001, "loss": 0.0062, "step": 450 }, { "epoch": 0.6532077164647824, "grad_norm": 0.18359375, "learning_rate": 0.0001, "loss": 0.894, "step": 455 }, { "epoch": 0.6603858232391207, "grad_norm": 0.158203125, "learning_rate": 0.0001, "loss": 0.7454, "step": 460 }, { "epoch": 0.6675639300134589, "grad_norm": 0.123046875, "learning_rate": 0.0001, "loss": 0.5539, "step": 465 }, { "epoch": 0.6747420367877972, "grad_norm": 0.1357421875, "learning_rate": 0.0001, "loss": 0.5263, "step": 470 }, { "epoch": 0.6819201435621355, "grad_norm": 0.09521484375, "learning_rate": 0.0001, "loss": 0.3882, "step": 475 }, { "epoch": 0.6890982503364738, "grad_norm": 0.07958984375, "learning_rate": 0.0001, "loss": 0.2243, "step": 480 }, { "epoch": 0.696276357110812, "grad_norm": 0.08154296875, "learning_rate": 0.0001, "loss": 0.0728, "step": 485 }, { "epoch": 0.7034544638851503, "grad_norm": 0.048583984375, "learning_rate": 0.0001, "loss": 0.0205, "step": 490 }, { "epoch": 0.7106325706594886, "grad_norm": 0.06103515625, "learning_rate": 0.0001, "loss": 0.0179, "step": 495 }, { "epoch": 0.7178106774338269, "grad_norm": 0.031494140625, "learning_rate": 0.0001, "loss": 0.0072, "step": 500 }, { "epoch": 0.7249887842081651, "grad_norm": 0.2470703125, "learning_rate": 0.0001, "loss": 0.9516, "step": 505 }, { "epoch": 0.7321668909825033, "grad_norm": 0.1240234375, "learning_rate": 0.0001, "loss": 0.6854, "step": 510 }, { "epoch": 0.7393449977568416, "grad_norm": 0.1552734375, "learning_rate": 0.0001, "loss": 0.5769, "step": 515 }, { "epoch": 0.7465231045311799, "grad_norm": 0.11962890625, "learning_rate": 0.0001, "loss": 0.4634, "step": 520 }, { "epoch": 0.7537012113055181, "grad_norm": 0.11962890625, "learning_rate": 0.0001, "loss": 0.3856, "step": 525 }, { "epoch": 0.7608793180798564, "grad_norm": 0.10791015625, "learning_rate": 0.0001, "loss": 0.2155, "step": 530 }, { "epoch": 0.7680574248541947, "grad_norm": 0.0634765625, "learning_rate": 0.0001, "loss": 0.0857, "step": 535 }, { "epoch": 0.775235531628533, "grad_norm": 0.07861328125, "learning_rate": 0.0001, "loss": 0.0233, "step": 540 }, { "epoch": 0.7824136384028713, "grad_norm": 0.0281982421875, "learning_rate": 0.0001, "loss": 0.013, "step": 545 }, { "epoch": 0.7895917451772095, "grad_norm": 0.016845703125, "learning_rate": 0.0001, "loss": 0.0061, "step": 550 }, { "epoch": 0.7967698519515478, "grad_norm": 0.1796875, "learning_rate": 0.0001, "loss": 0.8853, "step": 555 }, { "epoch": 0.8039479587258861, "grad_norm": 0.154296875, "learning_rate": 0.0001, "loss": 0.726, "step": 560 }, { "epoch": 0.8111260655002244, "grad_norm": 0.1328125, "learning_rate": 0.0001, "loss": 0.62, "step": 565 }, { "epoch": 0.8183041722745625, "grad_norm": 0.126953125, "learning_rate": 0.0001, "loss": 0.5036, "step": 570 }, { "epoch": 0.8254822790489008, "grad_norm": 0.1279296875, "learning_rate": 0.0001, "loss": 0.4053, "step": 575 }, { "epoch": 0.8326603858232391, "grad_norm": 0.1142578125, "learning_rate": 0.0001, "loss": 0.2355, "step": 580 }, { "epoch": 0.8398384925975774, "grad_norm": 0.045654296875, "learning_rate": 0.0001, "loss": 0.0751, "step": 585 }, { "epoch": 0.8470165993719156, "grad_norm": 0.109375, "learning_rate": 0.0001, "loss": 0.0226, "step": 590 }, { "epoch": 0.8541947061462539, "grad_norm": 0.027587890625, "learning_rate": 0.0001, "loss": 0.0055, "step": 595 }, { "epoch": 0.8613728129205922, "grad_norm": 0.05712890625, "learning_rate": 0.0001, "loss": 0.0052, "step": 600 }, { "epoch": 0.8685509196949305, "grad_norm": 0.1943359375, "learning_rate": 0.0001, "loss": 0.9366, "step": 605 }, { "epoch": 0.8757290264692688, "grad_norm": 0.1484375, "learning_rate": 0.0001, "loss": 0.7429, "step": 610 }, { "epoch": 0.882907133243607, "grad_norm": 0.14453125, "learning_rate": 0.0001, "loss": 0.564, "step": 615 }, { "epoch": 0.8900852400179453, "grad_norm": 0.140625, "learning_rate": 0.0001, "loss": 0.5045, "step": 620 }, { "epoch": 0.8972633467922836, "grad_norm": 0.1259765625, "learning_rate": 0.0001, "loss": 0.3997, "step": 625 }, { "epoch": 0.9044414535666218, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.1856, "step": 630 }, { "epoch": 0.91161956034096, "grad_norm": 0.06298828125, "learning_rate": 0.0001, "loss": 0.0583, "step": 635 }, { "epoch": 0.9187976671152983, "grad_norm": 0.033935546875, "learning_rate": 0.0001, "loss": 0.0274, "step": 640 }, { "epoch": 0.9259757738896366, "grad_norm": 0.03271484375, "learning_rate": 0.0001, "loss": 0.0078, "step": 645 }, { "epoch": 0.9331538806639749, "grad_norm": 0.0244140625, "learning_rate": 0.0001, "loss": 0.003, "step": 650 }, { "epoch": 0.9403319874383131, "grad_norm": 0.220703125, "learning_rate": 0.0001, "loss": 0.9234, "step": 655 }, { "epoch": 0.9475100942126514, "grad_norm": 0.1494140625, "learning_rate": 0.0001, "loss": 0.7145, "step": 660 }, { "epoch": 0.9546882009869897, "grad_norm": 0.138671875, "learning_rate": 0.0001, "loss": 0.5764, "step": 665 }, { "epoch": 0.961866307761328, "grad_norm": 0.1298828125, "learning_rate": 0.0001, "loss": 0.4568, "step": 670 }, { "epoch": 0.9690444145356663, "grad_norm": 0.10400390625, "learning_rate": 0.0001, "loss": 0.2681, "step": 675 }, { "epoch": 0.9762225213100045, "grad_norm": 0.080078125, "learning_rate": 0.0001, "loss": 0.1399, "step": 680 }, { "epoch": 0.9834006280843428, "grad_norm": 0.068359375, "learning_rate": 0.0001, "loss": 0.0375, "step": 685 }, { "epoch": 0.990578734858681, "grad_norm": 0.040283203125, "learning_rate": 0.0001, "loss": 0.0108, "step": 690 }, { "epoch": 0.9977568416330193, "grad_norm": 0.022216796875, "learning_rate": 0.0001, "loss": 0.0082, "step": 695 }, { "epoch": 1.0049349484073575, "grad_norm": 0.193359375, "learning_rate": 0.0001, "loss": 0.6031, "step": 700 }, { "epoch": 1.012113055181696, "grad_norm": 0.1640625, "learning_rate": 0.0001, "loss": 0.7291, "step": 705 }, { "epoch": 1.019291161956034, "grad_norm": 0.1708984375, "learning_rate": 0.0001, "loss": 0.5393, "step": 710 }, { "epoch": 1.0264692687303723, "grad_norm": 0.1416015625, "learning_rate": 0.0001, "loss": 0.413, "step": 715 }, { "epoch": 1.0336473755047106, "grad_norm": 0.11669921875, "learning_rate": 0.0001, "loss": 0.3693, "step": 720 }, { "epoch": 1.0408254822790488, "grad_norm": 0.123046875, "learning_rate": 0.0001, "loss": 0.2104, "step": 725 }, { "epoch": 1.0480035890533872, "grad_norm": 0.055908203125, "learning_rate": 0.0001, "loss": 0.0834, "step": 730 }, { "epoch": 1.0551816958277254, "grad_norm": 0.0546875, "learning_rate": 0.0001, "loss": 0.0144, "step": 735 }, { "epoch": 1.0623598026020638, "grad_norm": 0.11181640625, "learning_rate": 0.0001, "loss": 0.0119, "step": 740 }, { "epoch": 1.069537909376402, "grad_norm": 0.0034332275390625, "learning_rate": 0.0001, "loss": 0.0023, "step": 745 }, { "epoch": 1.0767160161507403, "grad_norm": 0.2490234375, "learning_rate": 0.0001, "loss": 0.5662, "step": 750 }, { "epoch": 1.0838941229250785, "grad_norm": 0.2177734375, "learning_rate": 0.0001, "loss": 0.7079, "step": 755 }, { "epoch": 1.0910722296994169, "grad_norm": 0.1904296875, "learning_rate": 0.0001, "loss": 0.5619, "step": 760 }, { "epoch": 1.098250336473755, "grad_norm": 0.12890625, "learning_rate": 0.0001, "loss": 0.4236, "step": 765 }, { "epoch": 1.1054284432480934, "grad_norm": 0.11328125, "learning_rate": 0.0001, "loss": 0.3422, "step": 770 }, { "epoch": 1.1126065500224316, "grad_norm": 0.11181640625, "learning_rate": 0.0001, "loss": 0.2757, "step": 775 }, { "epoch": 1.1197846567967698, "grad_norm": 0.1103515625, "learning_rate": 0.0001, "loss": 0.101, "step": 780 }, { "epoch": 1.1269627635711081, "grad_norm": 0.0615234375, "learning_rate": 0.0001, "loss": 0.0292, "step": 785 }, { "epoch": 1.1341408703454463, "grad_norm": 0.01123046875, "learning_rate": 0.0001, "loss": 0.0117, "step": 790 }, { "epoch": 1.1413189771197847, "grad_norm": 0.0311279296875, "learning_rate": 0.0001, "loss": 0.0068, "step": 795 }, { "epoch": 1.1484970838941229, "grad_norm": 0.2236328125, "learning_rate": 0.0001, "loss": 0.5275, "step": 800 }, { "epoch": 1.1556751906684612, "grad_norm": 0.2060546875, "learning_rate": 0.0001, "loss": 0.7151, "step": 805 }, { "epoch": 1.1628532974427994, "grad_norm": 0.1708984375, "learning_rate": 0.0001, "loss": 0.5625, "step": 810 }, { "epoch": 1.1700314042171378, "grad_norm": 0.1708984375, "learning_rate": 0.0001, "loss": 0.4765, "step": 815 }, { "epoch": 1.177209510991476, "grad_norm": 0.1875, "learning_rate": 0.0001, "loss": 0.3728, "step": 820 }, { "epoch": 1.1843876177658144, "grad_norm": 0.11181640625, "learning_rate": 0.0001, "loss": 0.2169, "step": 825 }, { "epoch": 1.1915657245401525, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.09, "step": 830 }, { "epoch": 1.198743831314491, "grad_norm": 0.06396484375, "learning_rate": 0.0001, "loss": 0.0261, "step": 835 }, { "epoch": 1.205921938088829, "grad_norm": 0.06591796875, "learning_rate": 0.0001, "loss": 0.0169, "step": 840 }, { "epoch": 1.2131000448631672, "grad_norm": 0.01409912109375, "learning_rate": 0.0001, "loss": 0.0041, "step": 845 }, { "epoch": 1.2202781516375056, "grad_norm": 0.2265625, "learning_rate": 0.0001, "loss": 0.5508, "step": 850 }, { "epoch": 1.2274562584118438, "grad_norm": 0.255859375, "learning_rate": 0.0001, "loss": 0.7281, "step": 855 }, { "epoch": 1.2346343651861822, "grad_norm": 0.212890625, "learning_rate": 0.0001, "loss": 0.499, "step": 860 }, { "epoch": 1.2418124719605204, "grad_norm": 0.1767578125, "learning_rate": 0.0001, "loss": 0.5054, "step": 865 }, { "epoch": 1.2489905787348587, "grad_norm": 0.1513671875, "learning_rate": 0.0001, "loss": 0.3918, "step": 870 }, { "epoch": 1.256168685509197, "grad_norm": 0.1318359375, "learning_rate": 0.0001, "loss": 0.2211, "step": 875 }, { "epoch": 1.263346792283535, "grad_norm": 0.053955078125, "learning_rate": 0.0001, "loss": 0.099, "step": 880 }, { "epoch": 1.2705248990578735, "grad_norm": 0.0263671875, "learning_rate": 0.0001, "loss": 0.0239, "step": 885 }, { "epoch": 1.2777030058322119, "grad_norm": 0.055908203125, "learning_rate": 0.0001, "loss": 0.0203, "step": 890 }, { "epoch": 1.28488111260655, "grad_norm": 0.0172119140625, "learning_rate": 0.0001, "loss": 0.0053, "step": 895 }, { "epoch": 1.2920592193808882, "grad_norm": 0.1943359375, "learning_rate": 0.0001, "loss": 0.4856, "step": 900 }, { "epoch": 1.2992373261552266, "grad_norm": 0.2138671875, "learning_rate": 0.0001, "loss": 0.7204, "step": 905 }, { "epoch": 1.3064154329295647, "grad_norm": 0.19140625, "learning_rate": 0.0001, "loss": 0.5374, "step": 910 }, { "epoch": 1.3135935397039031, "grad_norm": 0.216796875, "learning_rate": 0.0001, "loss": 0.48, "step": 915 }, { "epoch": 1.3207716464782413, "grad_norm": 0.19921875, "learning_rate": 0.0001, "loss": 0.3897, "step": 920 }, { "epoch": 1.3279497532525797, "grad_norm": 0.10205078125, "learning_rate": 0.0001, "loss": 0.2242, "step": 925 }, { "epoch": 1.3351278600269179, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.1292, "step": 930 }, { "epoch": 1.3423059668012562, "grad_norm": 0.068359375, "learning_rate": 0.0001, "loss": 0.0242, "step": 935 }, { "epoch": 1.3494840735755944, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.0092, "step": 940 }, { "epoch": 1.3566621803499328, "grad_norm": 0.004241943359375, "learning_rate": 0.0001, "loss": 0.0039, "step": 945 }, { "epoch": 1.363840287124271, "grad_norm": 0.25390625, "learning_rate": 0.0001, "loss": 0.5465, "step": 950 }, { "epoch": 1.3710183938986091, "grad_norm": 0.2412109375, "learning_rate": 0.0001, "loss": 0.6114, "step": 955 }, { "epoch": 1.3781965006729475, "grad_norm": 0.244140625, "learning_rate": 0.0001, "loss": 0.5226, "step": 960 }, { "epoch": 1.385374607447286, "grad_norm": 0.205078125, "learning_rate": 0.0001, "loss": 0.4234, "step": 965 }, { "epoch": 1.392552714221624, "grad_norm": 0.130859375, "learning_rate": 0.0001, "loss": 0.3595, "step": 970 }, { "epoch": 1.3997308209959622, "grad_norm": 0.123046875, "learning_rate": 0.0001, "loss": 0.2464, "step": 975 }, { "epoch": 1.4069089277703006, "grad_norm": 0.11767578125, "learning_rate": 0.0001, "loss": 0.11, "step": 980 }, { "epoch": 1.4140870345446388, "grad_norm": 0.05322265625, "learning_rate": 0.0001, "loss": 0.0205, "step": 985 }, { "epoch": 1.4212651413189772, "grad_norm": 0.0206298828125, "learning_rate": 0.0001, "loss": 0.0102, "step": 990 }, { "epoch": 1.4284432480933154, "grad_norm": 0.0250244140625, "learning_rate": 0.0001, "loss": 0.0044, "step": 995 }, { "epoch": 1.4356213548676537, "grad_norm": 0.23046875, "learning_rate": 0.0001, "loss": 0.4827, "step": 1000 }, { "epoch": 1.442799461641992, "grad_norm": 0.2314453125, "learning_rate": 0.0001, "loss": 0.6536, "step": 1005 }, { "epoch": 1.44997756841633, "grad_norm": 0.1953125, "learning_rate": 0.0001, "loss": 0.5993, "step": 1010 }, { "epoch": 1.4571556751906685, "grad_norm": 0.158203125, "learning_rate": 0.0001, "loss": 0.4176, "step": 1015 }, { "epoch": 1.4643337819650069, "grad_norm": 0.1689453125, "learning_rate": 0.0001, "loss": 0.307, "step": 1020 }, { "epoch": 1.471511888739345, "grad_norm": 0.1005859375, "learning_rate": 0.0001, "loss": 0.2381, "step": 1025 }, { "epoch": 1.4786899955136832, "grad_norm": 0.06396484375, "learning_rate": 0.0001, "loss": 0.084, "step": 1030 }, { "epoch": 1.4858681022880216, "grad_norm": 0.01153564453125, "learning_rate": 0.0001, "loss": 0.0165, "step": 1035 }, { "epoch": 1.4930462090623597, "grad_norm": 0.0283203125, "learning_rate": 0.0001, "loss": 0.0059, "step": 1040 }, { "epoch": 1.500224315836698, "grad_norm": 0.0380859375, "learning_rate": 0.0001, "loss": 0.0051, "step": 1045 }, { "epoch": 1.5074024226110363, "grad_norm": 0.296875, "learning_rate": 0.0001, "loss": 0.5321, "step": 1050 }, { "epoch": 1.5074024226110363, "step": 1050, "total_flos": 4.83809405232513e+18, "train_loss": 0.43872410982492427, "train_runtime": 144968.4958, "train_samples_per_second": 0.464, "train_steps_per_second": 0.007 } ], "logging_steps": 5, "max_steps": 1050, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 90, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.83809405232513e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }