{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9922822491730982, "eval_steps": 500, "global_step": 900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011025358324145534, "grad_norm": 22.754425048828125, "learning_rate": 4.945054945054946e-07, "loss": 1.2587, "step": 10 }, { "epoch": 0.022050716648291068, "grad_norm": 7.113337993621826, "learning_rate": 1.0439560439560442e-06, "loss": 0.945, "step": 20 }, { "epoch": 0.03307607497243661, "grad_norm": 6.807334899902344, "learning_rate": 1.5934065934065933e-06, "loss": 0.7046, "step": 30 }, { "epoch": 0.044101433296582136, "grad_norm": 4.160985946655273, "learning_rate": 2.1428571428571427e-06, "loss": 0.4965, "step": 40 }, { "epoch": 0.05512679162072767, "grad_norm": 2.954188108444214, "learning_rate": 2.6923076923076923e-06, "loss": 0.4281, "step": 50 }, { "epoch": 0.06615214994487321, "grad_norm": 3.6939523220062256, "learning_rate": 3.2417582417582424e-06, "loss": 0.3876, "step": 60 }, { "epoch": 0.07717750826901874, "grad_norm": 3.4522147178649902, "learning_rate": 3.7912087912087915e-06, "loss": 0.3595, "step": 70 }, { "epoch": 0.08820286659316427, "grad_norm": 3.375483274459839, "learning_rate": 4.340659340659341e-06, "loss": 0.3622, "step": 80 }, { "epoch": 0.09922822491730982, "grad_norm": 3.3890573978424072, "learning_rate": 4.890109890109891e-06, "loss": 0.3491, "step": 90 }, { "epoch": 0.11025358324145534, "grad_norm": 2.7892234325408936, "learning_rate": 4.998814299283415e-06, "loss": 0.3274, "step": 100 }, { "epoch": 0.12127894156560089, "grad_norm": 3.0670788288116455, "learning_rate": 4.993999317659293e-06, "loss": 0.3478, "step": 110 }, { "epoch": 0.13230429988974643, "grad_norm": 2.922692060470581, "learning_rate": 4.985488079432037e-06, "loss": 0.3286, "step": 120 }, { "epoch": 0.14332965821389196, "grad_norm": 2.717001438140869, "learning_rate": 4.973293198767286e-06, "loss": 0.324, "step": 130 }, { "epoch": 0.1543550165380375, "grad_norm": 2.99025821685791, "learning_rate": 4.957432749209755e-06, "loss": 0.3256, "step": 140 }, { "epoch": 0.16538037486218302, "grad_norm": 2.956892251968384, "learning_rate": 4.937930236897151e-06, "loss": 0.323, "step": 150 }, { "epoch": 0.17640573318632854, "grad_norm": 2.796661615371704, "learning_rate": 4.914814565722671e-06, "loss": 0.3198, "step": 160 }, { "epoch": 0.1874310915104741, "grad_norm": 2.8440957069396973, "learning_rate": 4.888119994497701e-06, "loss": 0.3164, "step": 170 }, { "epoch": 0.19845644983461963, "grad_norm": 2.8787901401519775, "learning_rate": 4.857886086178194e-06, "loss": 0.3155, "step": 180 }, { "epoch": 0.20948180815876516, "grad_norm": 3.005969524383545, "learning_rate": 4.824157649230005e-06, "loss": 0.314, "step": 190 }, { "epoch": 0.2205071664829107, "grad_norm": 2.5972912311553955, "learning_rate": 4.786984671220053e-06, "loss": 0.2983, "step": 200 }, { "epoch": 0.23153252480705622, "grad_norm": 2.5450024604797363, "learning_rate": 4.746422244731743e-06, "loss": 0.3146, "step": 210 }, { "epoch": 0.24255788313120177, "grad_norm": 2.8743643760681152, "learning_rate": 4.702530485714462e-06, "loss": 0.3144, "step": 220 }, { "epoch": 0.2535832414553473, "grad_norm": 2.759552001953125, "learning_rate": 4.655374444388127e-06, "loss": 0.2969, "step": 230 }, { "epoch": 0.26460859977949286, "grad_norm": 2.5727880001068115, "learning_rate": 4.6050240088348634e-06, "loss": 0.3076, "step": 240 }, { "epoch": 0.2756339581036384, "grad_norm": 2.36820387840271, "learning_rate": 4.551553801420671e-06, "loss": 0.2894, "step": 250 }, { "epoch": 0.2866593164277839, "grad_norm": 2.892141103744507, "learning_rate": 4.4950430682005995e-06, "loss": 0.3084, "step": 260 }, { "epoch": 0.29768467475192945, "grad_norm": 2.46459698677063, "learning_rate": 4.435575561471346e-06, "loss": 0.2968, "step": 270 }, { "epoch": 0.308710033076075, "grad_norm": 2.428185224533081, "learning_rate": 4.373239415645324e-06, "loss": 0.2923, "step": 280 }, { "epoch": 0.3197353914002205, "grad_norm": 2.903369188308716, "learning_rate": 4.308127016630176e-06, "loss": 0.3055, "step": 290 }, { "epoch": 0.33076074972436603, "grad_norm": 2.4219541549682617, "learning_rate": 4.240334864907317e-06, "loss": 0.2966, "step": 300 }, { "epoch": 0.34178610804851156, "grad_norm": 2.812272548675537, "learning_rate": 4.169963432512436e-06, "loss": 0.2833, "step": 310 }, { "epoch": 0.3528114663726571, "grad_norm": 2.7986788749694824, "learning_rate": 4.097117014129903e-06, "loss": 0.2946, "step": 320 }, { "epoch": 0.3638368246968026, "grad_norm": 2.4811017513275146, "learning_rate": 4.021903572521802e-06, "loss": 0.28, "step": 330 }, { "epoch": 0.3748621830209482, "grad_norm": 2.9100232124328613, "learning_rate": 3.9444345785206285e-06, "loss": 0.2973, "step": 340 }, { "epoch": 0.38588754134509373, "grad_norm": 2.6646060943603516, "learning_rate": 3.864824845822837e-06, "loss": 0.2914, "step": 350 }, { "epoch": 0.39691289966923926, "grad_norm": 2.480253219604492, "learning_rate": 3.7831923608280516e-06, "loss": 0.278, "step": 360 }, { "epoch": 0.4079382579933848, "grad_norm": 2.5499134063720703, "learning_rate": 3.699658107776148e-06, "loss": 0.2827, "step": 370 }, { "epoch": 0.4189636163175303, "grad_norm": 3.0008158683776855, "learning_rate": 3.6143458894413463e-06, "loss": 0.2829, "step": 380 }, { "epoch": 0.42998897464167585, "grad_norm": 2.6385183334350586, "learning_rate": 3.527382143649075e-06, "loss": 0.2823, "step": 390 }, { "epoch": 0.4410143329658214, "grad_norm": 2.9152820110321045, "learning_rate": 3.438895755887532e-06, "loss": 0.2723, "step": 400 }, { "epoch": 0.4520396912899669, "grad_norm": 2.694945812225342, "learning_rate": 3.3490178682916534e-06, "loss": 0.2836, "step": 410 }, { "epoch": 0.46306504961411243, "grad_norm": 2.779081106185913, "learning_rate": 3.257881685282609e-06, "loss": 0.2522, "step": 420 }, { "epoch": 0.474090407938258, "grad_norm": 2.4859516620635986, "learning_rate": 3.1656222761508525e-06, "loss": 0.2783, "step": 430 }, { "epoch": 0.48511576626240355, "grad_norm": 2.886754035949707, "learning_rate": 3.0723763748753354e-06, "loss": 0.2619, "step": 440 }, { "epoch": 0.4961411245865491, "grad_norm": 2.810600519180298, "learning_rate": 2.9782821774755454e-06, "loss": 0.2847, "step": 450 }, { "epoch": 0.5071664829106945, "grad_norm": 2.813720226287842, "learning_rate": 2.883479137196714e-06, "loss": 0.2665, "step": 460 }, { "epoch": 0.5181918412348401, "grad_norm": 2.1755166053771973, "learning_rate": 2.7881077578317445e-06, "loss": 0.2701, "step": 470 }, { "epoch": 0.5292171995589857, "grad_norm": 2.8941328525543213, "learning_rate": 2.6923093854861597e-06, "loss": 0.2584, "step": 480 }, { "epoch": 0.5402425578831312, "grad_norm": 2.368398427963257, "learning_rate": 2.596225999094696e-06, "loss": 0.2684, "step": 490 }, { "epoch": 0.5512679162072768, "grad_norm": 2.5928003787994385, "learning_rate": 2.5e-06, "loss": 0.2546, "step": 500 }, { "epoch": 0.5622932745314223, "grad_norm": 2.835794687271118, "learning_rate": 2.4037740009053053e-06, "loss": 0.2653, "step": 510 }, { "epoch": 0.5733186328555678, "grad_norm": 2.550260543823242, "learning_rate": 2.3076906145138407e-06, "loss": 0.264, "step": 520 }, { "epoch": 0.5843439911797134, "grad_norm": 2.7682080268859863, "learning_rate": 2.2118922421682563e-06, "loss": 0.2613, "step": 530 }, { "epoch": 0.5953693495038589, "grad_norm": 2.712428569793701, "learning_rate": 2.1165208628032863e-06, "loss": 0.2488, "step": 540 }, { "epoch": 0.6063947078280044, "grad_norm": 2.650545358657837, "learning_rate": 2.0217178225244554e-06, "loss": 0.258, "step": 550 }, { "epoch": 0.61742006615215, "grad_norm": 2.616968870162964, "learning_rate": 1.9276236251246655e-06, "loss": 0.2552, "step": 560 }, { "epoch": 0.6284454244762955, "grad_norm": 2.836118698120117, "learning_rate": 1.8343777238491477e-06, "loss": 0.251, "step": 570 }, { "epoch": 0.639470782800441, "grad_norm": 2.6420810222625732, "learning_rate": 1.7421183147173915e-06, "loss": 0.2587, "step": 580 }, { "epoch": 0.6504961411245865, "grad_norm": 2.2103841304779053, "learning_rate": 1.6509821317083466e-06, "loss": 0.2528, "step": 590 }, { "epoch": 0.6615214994487321, "grad_norm": 2.5041706562042236, "learning_rate": 1.5611042441124687e-06, "loss": 0.2466, "step": 600 }, { "epoch": 0.6725468577728776, "grad_norm": 3.019406795501709, "learning_rate": 1.4726178563509258e-06, "loss": 0.247, "step": 610 }, { "epoch": 0.6835722160970231, "grad_norm": 2.844505548477173, "learning_rate": 1.3856541105586545e-06, "loss": 0.2487, "step": 620 }, { "epoch": 0.6945975744211687, "grad_norm": 2.6431515216827393, "learning_rate": 1.300341892223852e-06, "loss": 0.2596, "step": 630 }, { "epoch": 0.7056229327453142, "grad_norm": 2.4335758686065674, "learning_rate": 1.2168076391719492e-06, "loss": 0.2467, "step": 640 }, { "epoch": 0.7166482910694597, "grad_norm": 2.8177690505981445, "learning_rate": 1.1351751541771644e-06, "loss": 0.2345, "step": 650 }, { "epoch": 0.7276736493936052, "grad_norm": 2.927981376647949, "learning_rate": 1.0555654214793723e-06, "loss": 0.249, "step": 660 }, { "epoch": 0.7386990077177509, "grad_norm": 3.3543944358825684, "learning_rate": 9.780964274781984e-07, "loss": 0.2612, "step": 670 }, { "epoch": 0.7497243660418964, "grad_norm": 3.0488715171813965, "learning_rate": 9.028829858700974e-07, "loss": 0.2355, "step": 680 }, { "epoch": 0.7607497243660419, "grad_norm": 2.7804830074310303, "learning_rate": 8.300365674875652e-07, "loss": 0.2562, "step": 690 }, { "epoch": 0.7717750826901875, "grad_norm": 2.2365427017211914, "learning_rate": 7.596651350926837e-07, "loss": 0.2355, "step": 700 }, { "epoch": 0.782800441014333, "grad_norm": 2.474679708480835, "learning_rate": 6.91872983369826e-07, "loss": 0.2279, "step": 710 }, { "epoch": 0.7938257993384785, "grad_norm": 2.5682637691497803, "learning_rate": 6.267605843546768e-07, "loss": 0.2561, "step": 720 }, { "epoch": 0.804851157662624, "grad_norm": 2.0722885131835938, "learning_rate": 5.644244385286548e-07, "loss": 0.2311, "step": 730 }, { "epoch": 0.8158765159867696, "grad_norm": 2.517178773880005, "learning_rate": 5.049569317994013e-07, "loss": 0.2356, "step": 740 }, { "epoch": 0.8269018743109151, "grad_norm": 2.4816508293151855, "learning_rate": 4.484461985793298e-07, "loss": 0.2499, "step": 750 }, { "epoch": 0.8379272326350606, "grad_norm": 2.7292091846466064, "learning_rate": 3.9497599116513714e-07, "loss": 0.2577, "step": 760 }, { "epoch": 0.8489525909592062, "grad_norm": 2.906921148300171, "learning_rate": 3.446255556118736e-07, "loss": 0.2316, "step": 770 }, { "epoch": 0.8599779492833517, "grad_norm": 2.4659173488616943, "learning_rate": 2.9746951428553884e-07, "loss": 0.2407, "step": 780 }, { "epoch": 0.8710033076074972, "grad_norm": 2.541038751602173, "learning_rate": 2.535777552682578e-07, "loss": 0.2399, "step": 790 }, { "epoch": 0.8820286659316428, "grad_norm": 2.69195556640625, "learning_rate": 2.1301532877994747e-07, "loss": 0.2339, "step": 800 }, { "epoch": 0.8930540242557883, "grad_norm": 2.950699806213379, "learning_rate": 1.7584235076999468e-07, "loss": 0.2384, "step": 810 }, { "epoch": 0.9040793825799338, "grad_norm": 3.1038763523101807, "learning_rate": 1.421139138218064e-07, "loss": 0.2414, "step": 820 }, { "epoch": 0.9151047409040793, "grad_norm": 2.633563756942749, "learning_rate": 1.1188000550230005e-07, "loss": 0.2364, "step": 830 }, { "epoch": 0.9261300992282249, "grad_norm": 2.5586025714874268, "learning_rate": 8.518543427732951e-08, "loss": 0.2324, "step": 840 }, { "epoch": 0.9371554575523704, "grad_norm": 2.448216199874878, "learning_rate": 6.206976310284996e-08, "loss": 0.242, "step": 850 }, { "epoch": 0.948180815876516, "grad_norm": 2.557136297225952, "learning_rate": 4.256725079024554e-08, "loss": 0.2349, "step": 860 }, { "epoch": 0.9592061742006616, "grad_norm": 2.8758692741394043, "learning_rate": 2.670680123271402e-08, "loss": 0.2337, "step": 870 }, { "epoch": 0.9702315325248071, "grad_norm": 2.5542616844177246, "learning_rate": 1.4511920567963911e-08, "loss": 0.2404, "step": 880 }, { "epoch": 0.9812568908489526, "grad_norm": 2.937741994857788, "learning_rate": 6.00068234070772e-09, "loss": 0.2434, "step": 890 }, { "epoch": 0.9922822491730982, "grad_norm": 2.797497272491455, "learning_rate": 1.1857007165852475e-09, "loss": 0.2483, "step": 900 } ], "logging_steps": 10, "max_steps": 907, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.777891824668508e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }