{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1047, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02865329512893983, "grad_norm": 2.6186984732280814, "learning_rate": 8.571428571428572e-07, "loss": 0.5731, "step": 10 }, { "epoch": 0.05730659025787966, "grad_norm": 1.0232811311533683, "learning_rate": 1.8095238095238097e-06, "loss": 0.5412, "step": 20 }, { "epoch": 0.08595988538681948, "grad_norm": 0.9701094427215733, "learning_rate": 2.7619047619047625e-06, "loss": 0.4937, "step": 30 }, { "epoch": 0.11461318051575932, "grad_norm": 0.5323948276277348, "learning_rate": 3.7142857142857146e-06, "loss": 0.4582, "step": 40 }, { "epoch": 0.14326647564469913, "grad_norm": 0.36885209030841964, "learning_rate": 4.666666666666667e-06, "loss": 0.4392, "step": 50 }, { "epoch": 0.17191977077363896, "grad_norm": 0.27190324169109104, "learning_rate": 5.619047619047619e-06, "loss": 0.4255, "step": 60 }, { "epoch": 0.20057306590257878, "grad_norm": 0.284362183163526, "learning_rate": 6.571428571428572e-06, "loss": 0.42, "step": 70 }, { "epoch": 0.22922636103151864, "grad_norm": 0.2415087591622169, "learning_rate": 7.523809523809524e-06, "loss": 0.4006, "step": 80 }, { "epoch": 0.25787965616045844, "grad_norm": 0.2832959959768393, "learning_rate": 8.476190476190477e-06, "loss": 0.3976, "step": 90 }, { "epoch": 0.28653295128939826, "grad_norm": 0.2819249571240325, "learning_rate": 9.42857142857143e-06, "loss": 0.3887, "step": 100 }, { "epoch": 0.3151862464183381, "grad_norm": 0.26289638357241263, "learning_rate": 9.999555111181558e-06, "loss": 0.3914, "step": 110 }, { "epoch": 0.3438395415472779, "grad_norm": 0.31346359091894765, "learning_rate": 9.994551021152415e-06, "loss": 0.3847, "step": 120 }, { "epoch": 0.37249283667621774, "grad_norm": 0.25447340280629915, "learning_rate": 9.983992313852776e-06, "loss": 0.3883, "step": 130 }, { "epoch": 0.40114613180515757, "grad_norm": 0.2713615676238601, "learning_rate": 9.967890731995383e-06, "loss": 0.3885, "step": 140 }, { "epoch": 0.4297994269340974, "grad_norm": 0.29785890393022935, "learning_rate": 9.946264182720295e-06, "loss": 0.3811, "step": 150 }, { "epoch": 0.4584527220630373, "grad_norm": 0.31068169293825476, "learning_rate": 9.919136717679723e-06, "loss": 0.3755, "step": 160 }, { "epoch": 0.4871060171919771, "grad_norm": 0.291914065891215, "learning_rate": 9.88653850628933e-06, "loss": 0.3727, "step": 170 }, { "epoch": 0.5157593123209169, "grad_norm": 0.28645514791771215, "learning_rate": 9.848505802175762e-06, "loss": 0.3694, "step": 180 }, { "epoch": 0.5444126074498568, "grad_norm": 0.26191461156518464, "learning_rate": 9.8050809028577e-06, "loss": 0.3704, "step": 190 }, { "epoch": 0.5730659025787965, "grad_norm": 0.2782451749750521, "learning_rate": 9.756312102705284e-06, "loss": 0.3766, "step": 200 }, { "epoch": 0.6017191977077364, "grad_norm": 0.31746617801856314, "learning_rate": 9.702253639230246e-06, "loss": 0.3639, "step": 210 }, { "epoch": 0.6303724928366762, "grad_norm": 0.3104703712769699, "learning_rate": 9.642965632766437e-06, "loss": 0.3749, "step": 220 }, { "epoch": 0.6590257879656161, "grad_norm": 0.25481045697197613, "learning_rate": 9.57851401960788e-06, "loss": 0.3691, "step": 230 }, { "epoch": 0.6876790830945558, "grad_norm": 0.2569207157034273, "learning_rate": 9.508970478678676e-06, "loss": 0.364, "step": 240 }, { "epoch": 0.7163323782234957, "grad_norm": 0.3010191468789677, "learning_rate": 9.434412351816329e-06, "loss": 0.3699, "step": 250 }, { "epoch": 0.7449856733524355, "grad_norm": 0.2845353099007207, "learning_rate": 9.354922557757153e-06, "loss": 0.3626, "step": 260 }, { "epoch": 0.7736389684813754, "grad_norm": 0.2583262736091065, "learning_rate": 9.270589499919405e-06, "loss": 0.367, "step": 270 }, { "epoch": 0.8022922636103151, "grad_norm": 0.2734910827357129, "learning_rate": 9.181506968086696e-06, "loss": 0.3616, "step": 280 }, { "epoch": 0.830945558739255, "grad_norm": 0.3163373871358028, "learning_rate": 9.087774034101069e-06, "loss": 0.3603, "step": 290 }, { "epoch": 0.8595988538681948, "grad_norm": 0.27019612276333577, "learning_rate": 8.989494941681672e-06, "loss": 0.3625, "step": 300 }, { "epoch": 0.8882521489971347, "grad_norm": 0.26011182825983586, "learning_rate": 8.886778990491632e-06, "loss": 0.3571, "step": 310 }, { "epoch": 0.9169054441260746, "grad_norm": 0.2602289671922979, "learning_rate": 8.77974041458202e-06, "loss": 0.3551, "step": 320 }, { "epoch": 0.9455587392550143, "grad_norm": 0.28489190685342874, "learning_rate": 8.668498255348119e-06, "loss": 0.3651, "step": 330 }, { "epoch": 0.9742120343839542, "grad_norm": 0.2907841661352505, "learning_rate": 8.553176229139262e-06, "loss": 0.3621, "step": 340 }, { "epoch": 1.002865329512894, "grad_norm": 0.29410444361118165, "learning_rate": 8.433902589669489e-06, "loss": 0.3526, "step": 350 }, { "epoch": 1.0315186246418337, "grad_norm": 0.2629411999849927, "learning_rate": 8.310809985382059e-06, "loss": 0.3428, "step": 360 }, { "epoch": 1.0601719197707737, "grad_norm": 0.2664030042868273, "learning_rate": 8.184035311926397e-06, "loss": 0.3352, "step": 370 }, { "epoch": 1.0888252148997135, "grad_norm": 0.27448456132450433, "learning_rate": 8.053719559911605e-06, "loss": 0.3313, "step": 380 }, { "epoch": 1.1174785100286533, "grad_norm": 0.280680513900802, "learning_rate": 7.92000765810579e-06, "loss": 0.3372, "step": 390 }, { "epoch": 1.146131805157593, "grad_norm": 0.2696702891136822, "learning_rate": 7.783048312255653e-06, "loss": 0.3418, "step": 400 }, { "epoch": 1.174785100286533, "grad_norm": 0.31145780582148586, "learning_rate": 7.642993839705557e-06, "loss": 0.3374, "step": 410 }, { "epoch": 1.2034383954154728, "grad_norm": 0.2869659364327292, "learning_rate": 7.500000000000001e-06, "loss": 0.3363, "step": 420 }, { "epoch": 1.2320916905444126, "grad_norm": 0.25325335643256375, "learning_rate": 7.3542258216579136e-06, "loss": 0.3316, "step": 430 }, { "epoch": 1.2607449856733524, "grad_norm": 0.2772339994096704, "learning_rate": 7.205833425311394e-06, "loss": 0.3436, "step": 440 }, { "epoch": 1.2893982808022924, "grad_norm": 0.29278050839602937, "learning_rate": 7.0549878434056155e-06, "loss": 0.3406, "step": 450 }, { "epoch": 1.3180515759312321, "grad_norm": 0.28430937748006735, "learning_rate": 6.901856836660386e-06, "loss": 0.3432, "step": 460 }, { "epoch": 1.346704871060172, "grad_norm": 0.2803846351745633, "learning_rate": 6.746610707497511e-06, "loss": 0.34, "step": 470 }, { "epoch": 1.3753581661891117, "grad_norm": 0.27906165695648083, "learning_rate": 6.58942211064142e-06, "loss": 0.3353, "step": 480 }, { "epoch": 1.4040114613180517, "grad_norm": 0.28984790937034516, "learning_rate": 6.43046586110374e-06, "loss": 0.3309, "step": 490 }, { "epoch": 1.4326647564469914, "grad_norm": 0.23182165178077277, "learning_rate": 6.269918739765313e-06, "loss": 0.3355, "step": 500 }, { "epoch": 1.4613180515759312, "grad_norm": 0.2488774130896262, "learning_rate": 6.107959296771915e-06, "loss": 0.3329, "step": 510 }, { "epoch": 1.4899713467048712, "grad_norm": 0.24874727627825863, "learning_rate": 5.944767652962309e-06, "loss": 0.3438, "step": 520 }, { "epoch": 1.518624641833811, "grad_norm": 0.2911542320121491, "learning_rate": 5.780525299549473e-06, "loss": 0.3359, "step": 530 }, { "epoch": 1.5472779369627507, "grad_norm": 0.28119928239607705, "learning_rate": 5.615414896277786e-06, "loss": 0.336, "step": 540 }, { "epoch": 1.5759312320916905, "grad_norm": 0.2765768847959203, "learning_rate": 5.44962006828065e-06, "loss": 0.3404, "step": 550 }, { "epoch": 1.6045845272206303, "grad_norm": 0.27806174427298036, "learning_rate": 5.283325201864475e-06, "loss": 0.3304, "step": 560 }, { "epoch": 1.63323782234957, "grad_norm": 0.2404022014715492, "learning_rate": 5.116715239446121e-06, "loss": 0.3295, "step": 570 }, { "epoch": 1.66189111747851, "grad_norm": 0.2513689234570333, "learning_rate": 4.9499754738718835e-06, "loss": 0.3342, "step": 580 }, { "epoch": 1.6905444126074498, "grad_norm": 0.2429044062755473, "learning_rate": 4.7832913423467555e-06, "loss": 0.3364, "step": 590 }, { "epoch": 1.7191977077363898, "grad_norm": 0.23681124768665515, "learning_rate": 4.616848220203124e-06, "loss": 0.3275, "step": 600 }, { "epoch": 1.7478510028653296, "grad_norm": 0.2491649241512219, "learning_rate": 4.450831214738303e-06, "loss": 0.3385, "step": 610 }, { "epoch": 1.7765042979942693, "grad_norm": 0.7732756792555575, "learning_rate": 4.285424959350139e-06, "loss": 0.3303, "step": 620 }, { "epoch": 1.8051575931232091, "grad_norm": 0.24958098955725797, "learning_rate": 4.1208134081996625e-06, "loss": 0.3382, "step": 630 }, { "epoch": 1.8338108882521489, "grad_norm": 0.2713258221481738, "learning_rate": 3.957179631629148e-06, "loss": 0.3348, "step": 640 }, { "epoch": 1.8624641833810889, "grad_norm": 0.23269097420301693, "learning_rate": 3.7947056125630904e-06, "loss": 0.3332, "step": 650 }, { "epoch": 1.8911174785100286, "grad_norm": 0.2800452866317965, "learning_rate": 3.6335720441185474e-06, "loss": 0.3263, "step": 660 }, { "epoch": 1.9197707736389686, "grad_norm": 0.2244027553245592, "learning_rate": 3.4739581286499147e-06, "loss": 0.3347, "step": 670 }, { "epoch": 1.9484240687679084, "grad_norm": 0.36236606952760386, "learning_rate": 3.3160413784516342e-06, "loss": 0.3251, "step": 680 }, { "epoch": 1.9770773638968482, "grad_norm": 0.2560138762411745, "learning_rate": 3.1599974183404784e-06, "loss": 0.3311, "step": 690 }, { "epoch": 2.005730659025788, "grad_norm": 0.24846783217066923, "learning_rate": 3.0059997903369658e-06, "loss": 0.3263, "step": 700 }, { "epoch": 2.0343839541547277, "grad_norm": 0.2561038844246594, "learning_rate": 2.854219760663125e-06, "loss": 0.3194, "step": 710 }, { "epoch": 2.0630372492836675, "grad_norm": 0.21888324162969877, "learning_rate": 2.704826129271257e-06, "loss": 0.3106, "step": 720 }, { "epoch": 2.0916905444126073, "grad_norm": 0.23634859060639352, "learning_rate": 2.5579850421155294e-06, "loss": 0.3215, "step": 730 }, { "epoch": 2.1203438395415475, "grad_norm": 0.40366184729596755, "learning_rate": 2.413859806375159e-06, "loss": 0.3119, "step": 740 }, { "epoch": 2.1489971346704873, "grad_norm": 0.21759429767669508, "learning_rate": 2.272610708834719e-06, "loss": 0.3094, "step": 750 }, { "epoch": 2.177650429799427, "grad_norm": 0.21797932422789995, "learning_rate": 2.1343948376235146e-06, "loss": 0.3081, "step": 760 }, { "epoch": 2.206303724928367, "grad_norm": 0.22015734090400496, "learning_rate": 1.9993659075123117e-06, "loss": 0.3118, "step": 770 }, { "epoch": 2.2349570200573066, "grad_norm": 0.5859620771202967, "learning_rate": 1.8676740889616835e-06, "loss": 0.3194, "step": 780 }, { "epoch": 2.2636103151862463, "grad_norm": 0.21809409710426375, "learning_rate": 1.739465841112125e-06, "loss": 0.3156, "step": 790 }, { "epoch": 2.292263610315186, "grad_norm": 0.311384088738318, "learning_rate": 1.6148837489016406e-06, "loss": 0.3105, "step": 800 }, { "epoch": 2.3209169054441263, "grad_norm": 0.22764092498020874, "learning_rate": 1.49406636449199e-06, "loss": 0.3209, "step": 810 }, { "epoch": 2.349570200573066, "grad_norm": 0.22336147023416364, "learning_rate": 1.3771480531799054e-06, "loss": 0.3217, "step": 820 }, { "epoch": 2.378223495702006, "grad_norm": 0.22717728346077168, "learning_rate": 1.2642588439646951e-06, "loss": 0.3211, "step": 830 }, { "epoch": 2.4068767908309456, "grad_norm": 0.208665334531538, "learning_rate": 1.1555242849383668e-06, "loss": 0.3183, "step": 840 }, { "epoch": 2.4355300859598854, "grad_norm": 0.21821518140433077, "learning_rate": 1.0510653036591583e-06, "loss": 0.3188, "step": 850 }, { "epoch": 2.464183381088825, "grad_norm": 0.22416726569351633, "learning_rate": 9.509980726637003e-07, "loss": 0.3167, "step": 860 }, { "epoch": 2.492836676217765, "grad_norm": 0.2135708085936856, "learning_rate": 8.5543388026743e-07, "loss": 0.315, "step": 870 }, { "epoch": 2.5214899713467047, "grad_norm": 0.20626691858117885, "learning_rate": 7.644790067969005e-07, "loss": 0.3151, "step": 880 }, { "epoch": 2.5501432664756445, "grad_norm": 0.203689315528671, "learning_rate": 6.7823460639167e-07, "loss": 0.3122, "step": 890 }, { "epoch": 2.5787965616045847, "grad_norm": 0.20939291314273653, "learning_rate": 5.967965945071896e-07, "loss": 0.3153, "step": 900 }, { "epoch": 2.6074498567335245, "grad_norm": 0.23023262138795553, "learning_rate": 5.202555412438309e-07, "loss": 0.3094, "step": 910 }, { "epoch": 2.6361031518624642, "grad_norm": 0.235572721264662, "learning_rate": 4.486965706206597e-07, "loss": 0.3146, "step": 920 }, { "epoch": 2.664756446991404, "grad_norm": 0.2315057700635808, "learning_rate": 3.8219926590600365e-07, "loss": 0.3144, "step": 930 }, { "epoch": 2.693409742120344, "grad_norm": 0.21729980205720664, "learning_rate": 3.2083758111006946e-07, "loss": 0.3191, "step": 940 }, { "epoch": 2.7220630372492836, "grad_norm": 0.2031412349579613, "learning_rate": 2.6467975873807617e-07, "loss": 0.3127, "step": 950 }, { "epoch": 2.7507163323782233, "grad_norm": 0.19878302412351165, "learning_rate": 2.1378825389533508e-07, "loss": 0.3169, "step": 960 }, { "epoch": 2.7793696275071635, "grad_norm": 0.20562940798947288, "learning_rate": 1.6821966482872264e-07, "loss": 0.3197, "step": 970 }, { "epoch": 2.8080229226361033, "grad_norm": 0.20192515439147846, "learning_rate": 1.28024669981755e-07, "loss": 0.3154, "step": 980 }, { "epoch": 2.836676217765043, "grad_norm": 0.20191838585812194, "learning_rate": 9.324797163330012e-08, "loss": 0.3125, "step": 990 }, { "epoch": 2.865329512893983, "grad_norm": 0.2153477946539778, "learning_rate": 6.39282461825852e-08, "loss": 0.3119, "step": 1000 }, { "epoch": 2.8939828080229226, "grad_norm": 0.21913342629281218, "learning_rate": 4.009810113580426e-08, "loss": 0.3175, "step": 1010 }, { "epoch": 2.9226361031518624, "grad_norm": 0.19798983439143433, "learning_rate": 2.178403884215141e-08, "loss": 0.3081, "step": 1020 }, { "epoch": 2.951289398280802, "grad_norm": 0.22183768018570765, "learning_rate": 9.006427019622177e-09, "loss": 0.3181, "step": 1030 }, { "epoch": 2.9799426934097424, "grad_norm": 0.1912655799602591, "learning_rate": 1.7794761033496089e-09, "loss": 0.3146, "step": 1040 } ], "logging_steps": 10, "max_steps": 1047, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5074006013116416.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }