{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.47357293868921774, "eval_steps": 500, "global_step": 448, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005285412262156448, "grad_norm": 8.903882026672363, "learning_rate": 9.166551619047618e-06, "loss": 1.2181, "num_tokens": 152231.0, "step": 5 }, { "epoch": 0.010570824524312896, "grad_norm": 1.0083556175231934, "learning_rate": 2.062474114285714e-05, "loss": 0.6681, "num_tokens": 317223.0, "step": 10 }, { "epoch": 0.015856236786469344, "grad_norm": 0.746456503868103, "learning_rate": 3.2082930666666666e-05, "loss": 0.4249, "num_tokens": 473767.0, "step": 15 }, { "epoch": 0.021141649048625793, "grad_norm": 0.6790260076522827, "learning_rate": 4.3541120190476185e-05, "loss": 0.3982, "num_tokens": 659649.0, "step": 20 }, { "epoch": 0.026427061310782242, "grad_norm": 0.7228266596794128, "learning_rate": 5.499930971428571e-05, "loss": 0.3887, "num_tokens": 832364.0, "step": 25 }, { "epoch": 0.03171247357293869, "grad_norm": 0.6723021268844604, "learning_rate": 6.645749923809523e-05, "loss": 0.3467, "num_tokens": 991414.0, "step": 30 }, { "epoch": 0.03699788583509514, "grad_norm": 0.5942872166633606, "learning_rate": 7.791568876190476e-05, "loss": 0.3578, "num_tokens": 1158859.0, "step": 35 }, { "epoch": 0.042283298097251586, "grad_norm": 0.6299146413803101, "learning_rate": 8.020446518214365e-05, "loss": 0.3567, "num_tokens": 1312426.0, "step": 40 }, { "epoch": 0.04756871035940803, "grad_norm": 0.43629753589630127, "learning_rate": 8.0192841334398e-05, "loss": 0.3251, "num_tokens": 1488799.0, "step": 45 }, { "epoch": 0.052854122621564484, "grad_norm": 0.7118616700172424, "learning_rate": 8.017227973373715e-05, "loss": 0.3148, "num_tokens": 1659228.0, "step": 50 }, { "epoch": 0.05813953488372093, "grad_norm": 0.4712906777858734, "learning_rate": 8.014278649308742e-05, "loss": 0.3528, "num_tokens": 1824722.0, "step": 55 }, { "epoch": 0.06342494714587738, "grad_norm": 0.5827367305755615, "learning_rate": 8.010437038073538e-05, "loss": 0.3307, "num_tokens": 1999570.0, "step": 60 }, { "epoch": 0.06871035940803383, "grad_norm": 0.46786293387413025, "learning_rate": 8.005704281772099e-05, "loss": 0.3722, "num_tokens": 2149278.0, "step": 65 }, { "epoch": 0.07399577167019028, "grad_norm": 0.46910208463668823, "learning_rate": 8.000081787444232e-05, "loss": 0.3281, "num_tokens": 2321362.0, "step": 70 }, { "epoch": 0.07928118393234672, "grad_norm": 0.46508923172950745, "learning_rate": 7.993571226647224e-05, "loss": 0.31, "num_tokens": 2513422.0, "step": 75 }, { "epoch": 0.08456659619450317, "grad_norm": 0.3090206980705261, "learning_rate": 7.98617453495891e-05, "loss": 0.3267, "num_tokens": 2682030.0, "step": 80 }, { "epoch": 0.08985200845665962, "grad_norm": 0.4037857949733734, "learning_rate": 7.977893911402208e-05, "loss": 0.3497, "num_tokens": 2838889.0, "step": 85 }, { "epoch": 0.09513742071881606, "grad_norm": 0.3317049443721771, "learning_rate": 7.968731817791378e-05, "loss": 0.4027, "num_tokens": 2984527.0, "step": 90 }, { "epoch": 0.10042283298097252, "grad_norm": 0.4948605000972748, "learning_rate": 7.958690978000108e-05, "loss": 0.3301, "num_tokens": 3129850.0, "step": 95 }, { "epoch": 0.10570824524312897, "grad_norm": 0.5033330917358398, "learning_rate": 7.947774377151723e-05, "loss": 0.3574, "num_tokens": 3289081.0, "step": 100 }, { "epoch": 0.1109936575052854, "grad_norm": 0.6163113117218018, "learning_rate": 7.935985260731712e-05, "loss": 0.3465, "num_tokens": 3456305.0, "step": 105 }, { "epoch": 0.11627906976744186, "grad_norm": 0.3303355574607849, "learning_rate": 7.923327133622843e-05, "loss": 0.3013, "num_tokens": 3632845.0, "step": 110 }, { "epoch": 0.12156448202959831, "grad_norm": 0.3407858908176422, "learning_rate": 7.909803759063184e-05, "loss": 0.3059, "num_tokens": 3829460.0, "step": 115 }, { "epoch": 0.12684989429175475, "grad_norm": 0.39908474683761597, "learning_rate": 7.895419157527279e-05, "loss": 0.3627, "num_tokens": 3992992.0, "step": 120 }, { "epoch": 0.1321353065539112, "grad_norm": 0.29309067130088806, "learning_rate": 7.880177605530884e-05, "loss": 0.2892, "num_tokens": 4164780.0, "step": 125 }, { "epoch": 0.13742071881606766, "grad_norm": 0.37892845273017883, "learning_rate": 7.864083634359562e-05, "loss": 0.3028, "num_tokens": 4322872.0, "step": 130 }, { "epoch": 0.1427061310782241, "grad_norm": 0.34533941745758057, "learning_rate": 7.847142028721538e-05, "loss": 0.3528, "num_tokens": 4484774.0, "step": 135 }, { "epoch": 0.14799154334038056, "grad_norm": 0.47042974829673767, "learning_rate": 7.829357825325212e-05, "loss": 0.3279, "num_tokens": 4661609.0, "step": 140 }, { "epoch": 0.15327695560253699, "grad_norm": 0.3045842945575714, "learning_rate": 7.810736311381762e-05, "loss": 0.3445, "num_tokens": 4827667.0, "step": 145 }, { "epoch": 0.15856236786469344, "grad_norm": 0.4161529541015625, "learning_rate": 7.791283023033264e-05, "loss": 0.3148, "num_tokens": 5000163.0, "step": 150 }, { "epoch": 0.1638477801268499, "grad_norm": 0.4866187274456024, "learning_rate": 7.771003743706797e-05, "loss": 0.3225, "num_tokens": 5168441.0, "step": 155 }, { "epoch": 0.16913319238900634, "grad_norm": 0.35074010491371155, "learning_rate": 7.749904502395058e-05, "loss": 0.3089, "num_tokens": 5338454.0, "step": 160 }, { "epoch": 0.1744186046511628, "grad_norm": 0.34164947271347046, "learning_rate": 7.727991571863935e-05, "loss": 0.3025, "num_tokens": 5512939.0, "step": 165 }, { "epoch": 0.17970401691331925, "grad_norm": 0.43914279341697693, "learning_rate": 7.705271466787641e-05, "loss": 0.3334, "num_tokens": 5665993.0, "step": 170 }, { "epoch": 0.1849894291754757, "grad_norm": 0.405638724565506, "learning_rate": 7.681750941811905e-05, "loss": 0.3739, "num_tokens": 5824925.0, "step": 175 }, { "epoch": 0.19027484143763213, "grad_norm": 0.42802131175994873, "learning_rate": 7.657436989545827e-05, "loss": 0.3284, "num_tokens": 5982641.0, "step": 180 }, { "epoch": 0.19556025369978858, "grad_norm": 0.41498205065727234, "learning_rate": 7.632336838482996e-05, "loss": 0.3022, "num_tokens": 6149598.0, "step": 185 }, { "epoch": 0.20084566596194503, "grad_norm": 0.40431639552116394, "learning_rate": 7.60645795085246e-05, "loss": 0.3159, "num_tokens": 6320515.0, "step": 190 }, { "epoch": 0.20613107822410148, "grad_norm": 0.32176268100738525, "learning_rate": 7.579808020400232e-05, "loss": 0.2938, "num_tokens": 6492543.0, "step": 195 }, { "epoch": 0.21141649048625794, "grad_norm": 0.4644106924533844, "learning_rate": 7.55239497010194e-05, "loss": 0.3349, "num_tokens": 6672523.0, "step": 200 }, { "epoch": 0.2167019027484144, "grad_norm": 0.35432812571525574, "learning_rate": 7.52422694980736e-05, "loss": 0.3041, "num_tokens": 6844107.0, "step": 205 }, { "epoch": 0.2219873150105708, "grad_norm": 0.44315239787101746, "learning_rate": 7.495312333817455e-05, "loss": 0.2612, "num_tokens": 7007468.0, "step": 210 }, { "epoch": 0.22727272727272727, "grad_norm": 0.36817148327827454, "learning_rate": 7.465659718394734e-05, "loss": 0.3221, "num_tokens": 7156859.0, "step": 215 }, { "epoch": 0.23255813953488372, "grad_norm": 0.39306601881980896, "learning_rate": 7.43527791920758e-05, "loss": 0.3426, "num_tokens": 7311920.0, "step": 220 }, { "epoch": 0.23784355179704017, "grad_norm": 0.39284104108810425, "learning_rate": 7.404175968709388e-05, "loss": 0.3232, "num_tokens": 7461910.0, "step": 225 }, { "epoch": 0.24312896405919662, "grad_norm": 0.3719322085380554, "learning_rate": 7.372363113453213e-05, "loss": 0.3277, "num_tokens": 7629200.0, "step": 230 }, { "epoch": 0.24841437632135308, "grad_norm": 0.3720446527004242, "learning_rate": 7.339848811342796e-05, "loss": 0.3122, "num_tokens": 7790842.0, "step": 235 }, { "epoch": 0.2536997885835095, "grad_norm": 0.34994134306907654, "learning_rate": 7.306642728820755e-05, "loss": 0.3404, "num_tokens": 7961093.0, "step": 240 }, { "epoch": 0.25898520084566595, "grad_norm": 0.379666268825531, "learning_rate": 7.272754737994752e-05, "loss": 0.3254, "num_tokens": 8118785.0, "step": 245 }, { "epoch": 0.2642706131078224, "grad_norm": 0.5034386515617371, "learning_rate": 7.238194913702544e-05, "loss": 0.3185, "num_tokens": 8279200.0, "step": 250 }, { "epoch": 0.26955602536997886, "grad_norm": 0.38826555013656616, "learning_rate": 7.202973530516749e-05, "loss": 0.3021, "num_tokens": 8487897.0, "step": 255 }, { "epoch": 0.2748414376321353, "grad_norm": 0.3884974420070648, "learning_rate": 7.167101059690238e-05, "loss": 0.3221, "num_tokens": 8636638.0, "step": 260 }, { "epoch": 0.28012684989429176, "grad_norm": 0.4962138533592224, "learning_rate": 7.130588166043048e-05, "loss": 0.3485, "num_tokens": 8798198.0, "step": 265 }, { "epoch": 0.2854122621564482, "grad_norm": 0.4131626486778259, "learning_rate": 7.093445704791747e-05, "loss": 0.2897, "num_tokens": 8990965.0, "step": 270 }, { "epoch": 0.29069767441860467, "grad_norm": 0.38657164573669434, "learning_rate": 7.055684718322205e-05, "loss": 0.3492, "num_tokens": 9152215.0, "step": 275 }, { "epoch": 0.2959830866807611, "grad_norm": 0.2804111838340759, "learning_rate": 7.017316432906707e-05, "loss": 0.3243, "num_tokens": 9311537.0, "step": 280 }, { "epoch": 0.3012684989429176, "grad_norm": 0.3840746283531189, "learning_rate": 6.978352255366406e-05, "loss": 0.3657, "num_tokens": 9446553.0, "step": 285 }, { "epoch": 0.30655391120507397, "grad_norm": 0.29615920782089233, "learning_rate": 6.938803769680094e-05, "loss": 0.3144, "num_tokens": 9625894.0, "step": 290 }, { "epoch": 0.3118393234672304, "grad_norm": 0.3887929320335388, "learning_rate": 6.898682733540313e-05, "loss": 0.2967, "num_tokens": 9820246.0, "step": 295 }, { "epoch": 0.3171247357293869, "grad_norm": 0.4418380856513977, "learning_rate": 6.8580010748578e-05, "loss": 0.3299, "num_tokens": 9995021.0, "step": 300 }, { "epoch": 0.3224101479915433, "grad_norm": 0.327908992767334, "learning_rate": 6.816770888215352e-05, "loss": 0.2879, "num_tokens": 10186048.0, "step": 305 }, { "epoch": 0.3276955602536998, "grad_norm": 0.31937530636787415, "learning_rate": 6.775004431272132e-05, "loss": 0.3504, "num_tokens": 10343442.0, "step": 310 }, { "epoch": 0.33298097251585623, "grad_norm": 0.3883694112300873, "learning_rate": 6.732714121119478e-05, "loss": 0.3188, "num_tokens": 10505725.0, "step": 315 }, { "epoch": 0.3382663847780127, "grad_norm": 0.520715594291687, "learning_rate": 6.68991253058933e-05, "loss": 0.3348, "num_tokens": 10685454.0, "step": 320 }, { "epoch": 0.34355179704016914, "grad_norm": 0.3867979943752289, "learning_rate": 6.646612384516355e-05, "loss": 0.2958, "num_tokens": 10839868.0, "step": 325 }, { "epoch": 0.3488372093023256, "grad_norm": 0.4362066686153412, "learning_rate": 6.602826555954866e-05, "loss": 0.3052, "num_tokens": 10987360.0, "step": 330 }, { "epoch": 0.35412262156448204, "grad_norm": 0.3443554639816284, "learning_rate": 6.558568062351694e-05, "loss": 0.3133, "num_tokens": 11147325.0, "step": 335 }, { "epoch": 0.3594080338266385, "grad_norm": 0.3375532329082489, "learning_rate": 6.513850061676129e-05, "loss": 0.3187, "num_tokens": 11328945.0, "step": 340 }, { "epoch": 0.36469344608879495, "grad_norm": 0.48123109340667725, "learning_rate": 6.468685848508066e-05, "loss": 0.3234, "num_tokens": 11491151.0, "step": 345 }, { "epoch": 0.3699788583509514, "grad_norm": 0.350399911403656, "learning_rate": 6.423088850085563e-05, "loss": 0.2754, "num_tokens": 11659065.0, "step": 350 }, { "epoch": 0.3752642706131078, "grad_norm": 0.3584173917770386, "learning_rate": 6.377072622312942e-05, "loss": 0.3336, "num_tokens": 11863689.0, "step": 355 }, { "epoch": 0.38054968287526425, "grad_norm": 0.3181978464126587, "learning_rate": 6.330650845730648e-05, "loss": 0.3418, "num_tokens": 12006544.0, "step": 360 }, { "epoch": 0.3858350951374207, "grad_norm": 0.30926552414894104, "learning_rate": 6.283837321448044e-05, "loss": 0.3248, "num_tokens": 12155215.0, "step": 365 }, { "epoch": 0.39112050739957716, "grad_norm": 0.30399248003959656, "learning_rate": 6.236645967040363e-05, "loss": 0.3199, "num_tokens": 12304714.0, "step": 370 }, { "epoch": 0.3964059196617336, "grad_norm": 0.3858378827571869, "learning_rate": 6.189090812411056e-05, "loss": 0.2905, "num_tokens": 12467243.0, "step": 375 }, { "epoch": 0.40169133192389006, "grad_norm": 0.3069511950016022, "learning_rate": 6.141185995620703e-05, "loss": 0.3314, "num_tokens": 12641866.0, "step": 380 }, { "epoch": 0.4069767441860465, "grad_norm": 0.31766510009765625, "learning_rate": 6.0929457586838143e-05, "loss": 0.3084, "num_tokens": 12805214.0, "step": 385 }, { "epoch": 0.41226215644820297, "grad_norm": 0.5858482122421265, "learning_rate": 6.044384443334691e-05, "loss": 0.2878, "num_tokens": 12967228.0, "step": 390 }, { "epoch": 0.4175475687103594, "grad_norm": 0.34514257311820984, "learning_rate": 5.9955164867636644e-05, "loss": 0.3471, "num_tokens": 13104829.0, "step": 395 }, { "epoch": 0.42283298097251587, "grad_norm": 0.4240577816963196, "learning_rate": 5.9463564173249374e-05, "loss": 0.2973, "num_tokens": 13288474.0, "step": 400 }, { "epoch": 0.4281183932346723, "grad_norm": 0.3454580008983612, "learning_rate": 5.896918850217336e-05, "loss": 0.3241, "num_tokens": 13451876.0, "step": 405 }, { "epoch": 0.4334038054968288, "grad_norm": 0.3258069157600403, "learning_rate": 5.8472184831392364e-05, "loss": 0.2968, "num_tokens": 13633506.0, "step": 410 }, { "epoch": 0.43868921775898523, "grad_norm": 0.35886386036872864, "learning_rate": 5.797270091918965e-05, "loss": 0.3173, "num_tokens": 13794654.0, "step": 415 }, { "epoch": 0.4439746300211416, "grad_norm": 0.5046164393424988, "learning_rate": 5.747088526121975e-05, "loss": 0.3439, "num_tokens": 13928763.0, "step": 420 }, { "epoch": 0.4492600422832981, "grad_norm": 0.34629306197166443, "learning_rate": 5.696688704636091e-05, "loss": 0.3095, "num_tokens": 14091776.0, "step": 425 }, { "epoch": 0.45454545454545453, "grad_norm": 0.35323911905288696, "learning_rate": 5.6460856112361576e-05, "loss": 0.3053, "num_tokens": 14283410.0, "step": 430 }, { "epoch": 0.459830866807611, "grad_norm": 0.3977009356021881, "learning_rate": 5.595294290129389e-05, "loss": 0.2885, "num_tokens": 14450115.0, "step": 435 }, { "epoch": 0.46511627906976744, "grad_norm": 0.34842291474342346, "learning_rate": 5.5443298414827514e-05, "loss": 0.2904, "num_tokens": 14629092.0, "step": 440 }, { "epoch": 0.4704016913319239, "grad_norm": 0.3181321322917938, "learning_rate": 5.4932074169337124e-05, "loss": 0.3289, "num_tokens": 14766334.0, "step": 445 } ], "logging_steps": 5, "max_steps": 946, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8225814430631854e+18, "train_batch_size": 14, "trial_name": null, "trial_params": null }