{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9922822491730982,
  "eval_steps": 500,
  "global_step": 900,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011025358324145534,
      "grad_norm": 22.754425048828125,
      "learning_rate": 4.945054945054946e-07,
      "loss": 1.2587,
      "step": 10
    },
    {
      "epoch": 0.022050716648291068,
      "grad_norm": 7.113337993621826,
      "learning_rate": 1.0439560439560442e-06,
      "loss": 0.945,
      "step": 20
    },
    {
      "epoch": 0.03307607497243661,
      "grad_norm": 6.807334899902344,
      "learning_rate": 1.5934065934065933e-06,
      "loss": 0.7046,
      "step": 30
    },
    {
      "epoch": 0.044101433296582136,
      "grad_norm": 4.160985946655273,
      "learning_rate": 2.1428571428571427e-06,
      "loss": 0.4965,
      "step": 40
    },
    {
      "epoch": 0.05512679162072767,
      "grad_norm": 2.954188108444214,
      "learning_rate": 2.6923076923076923e-06,
      "loss": 0.4281,
      "step": 50
    },
    {
      "epoch": 0.06615214994487321,
      "grad_norm": 3.6939523220062256,
      "learning_rate": 3.2417582417582424e-06,
      "loss": 0.3876,
      "step": 60
    },
    {
      "epoch": 0.07717750826901874,
      "grad_norm": 3.4522147178649902,
      "learning_rate": 3.7912087912087915e-06,
      "loss": 0.3595,
      "step": 70
    },
    {
      "epoch": 0.08820286659316427,
      "grad_norm": 3.375483274459839,
      "learning_rate": 4.340659340659341e-06,
      "loss": 0.3622,
      "step": 80
    },
    {
      "epoch": 0.09922822491730982,
      "grad_norm": 3.3890573978424072,
      "learning_rate": 4.890109890109891e-06,
      "loss": 0.3491,
      "step": 90
    },
    {
      "epoch": 0.11025358324145534,
      "grad_norm": 2.7892234325408936,
      "learning_rate": 4.998814299283415e-06,
      "loss": 0.3274,
      "step": 100
    },
    {
      "epoch": 0.12127894156560089,
      "grad_norm": 3.0670788288116455,
      "learning_rate": 4.993999317659293e-06,
      "loss": 0.3478,
      "step": 110
    },
    {
      "epoch": 0.13230429988974643,
      "grad_norm": 2.922692060470581,
      "learning_rate": 4.985488079432037e-06,
      "loss": 0.3286,
      "step": 120
    },
    {
      "epoch": 0.14332965821389196,
      "grad_norm": 2.717001438140869,
      "learning_rate": 4.973293198767286e-06,
      "loss": 0.324,
      "step": 130
    },
    {
      "epoch": 0.1543550165380375,
      "grad_norm": 2.99025821685791,
      "learning_rate": 4.957432749209755e-06,
      "loss": 0.3256,
      "step": 140
    },
    {
      "epoch": 0.16538037486218302,
      "grad_norm": 2.956892251968384,
      "learning_rate": 4.937930236897151e-06,
      "loss": 0.323,
      "step": 150
    },
    {
      "epoch": 0.17640573318632854,
      "grad_norm": 2.796661615371704,
      "learning_rate": 4.914814565722671e-06,
      "loss": 0.3198,
      "step": 160
    },
    {
      "epoch": 0.1874310915104741,
      "grad_norm": 2.8440957069396973,
      "learning_rate": 4.888119994497701e-06,
      "loss": 0.3164,
      "step": 170
    },
    {
      "epoch": 0.19845644983461963,
      "grad_norm": 2.8787901401519775,
      "learning_rate": 4.857886086178194e-06,
      "loss": 0.3155,
      "step": 180
    },
    {
      "epoch": 0.20948180815876516,
      "grad_norm": 3.005969524383545,
      "learning_rate": 4.824157649230005e-06,
      "loss": 0.314,
      "step": 190
    },
    {
      "epoch": 0.2205071664829107,
      "grad_norm": 2.5972912311553955,
      "learning_rate": 4.786984671220053e-06,
      "loss": 0.2983,
      "step": 200
    },
    {
      "epoch": 0.23153252480705622,
      "grad_norm": 2.5450024604797363,
      "learning_rate": 4.746422244731743e-06,
      "loss": 0.3146,
      "step": 210
    },
    {
      "epoch": 0.24255788313120177,
      "grad_norm": 2.8743643760681152,
      "learning_rate": 4.702530485714462e-06,
      "loss": 0.3144,
      "step": 220
    },
    {
      "epoch": 0.2535832414553473,
      "grad_norm": 2.759552001953125,
      "learning_rate": 4.655374444388127e-06,
      "loss": 0.2969,
      "step": 230
    },
    {
      "epoch": 0.26460859977949286,
      "grad_norm": 2.5727880001068115,
      "learning_rate": 4.6050240088348634e-06,
      "loss": 0.3076,
      "step": 240
    },
    {
      "epoch": 0.2756339581036384,
      "grad_norm": 2.36820387840271,
      "learning_rate": 4.551553801420671e-06,
      "loss": 0.2894,
      "step": 250
    },
    {
      "epoch": 0.2866593164277839,
      "grad_norm": 2.892141103744507,
      "learning_rate": 4.4950430682005995e-06,
      "loss": 0.3084,
      "step": 260
    },
    {
      "epoch": 0.29768467475192945,
      "grad_norm": 2.46459698677063,
      "learning_rate": 4.435575561471346e-06,
      "loss": 0.2968,
      "step": 270
    },
    {
      "epoch": 0.308710033076075,
      "grad_norm": 2.428185224533081,
      "learning_rate": 4.373239415645324e-06,
      "loss": 0.2923,
      "step": 280
    },
    {
      "epoch": 0.3197353914002205,
      "grad_norm": 2.903369188308716,
      "learning_rate": 4.308127016630176e-06,
      "loss": 0.3055,
      "step": 290
    },
    {
      "epoch": 0.33076074972436603,
      "grad_norm": 2.4219541549682617,
      "learning_rate": 4.240334864907317e-06,
      "loss": 0.2966,
      "step": 300
    },
    {
      "epoch": 0.34178610804851156,
      "grad_norm": 2.812272548675537,
      "learning_rate": 4.169963432512436e-06,
      "loss": 0.2833,
      "step": 310
    },
    {
      "epoch": 0.3528114663726571,
      "grad_norm": 2.7986788749694824,
      "learning_rate": 4.097117014129903e-06,
      "loss": 0.2946,
      "step": 320
    },
    {
      "epoch": 0.3638368246968026,
      "grad_norm": 2.4811017513275146,
      "learning_rate": 4.021903572521802e-06,
      "loss": 0.28,
      "step": 330
    },
    {
      "epoch": 0.3748621830209482,
      "grad_norm": 2.9100232124328613,
      "learning_rate": 3.9444345785206285e-06,
      "loss": 0.2973,
      "step": 340
    },
    {
      "epoch": 0.38588754134509373,
      "grad_norm": 2.6646060943603516,
      "learning_rate": 3.864824845822837e-06,
      "loss": 0.2914,
      "step": 350
    },
    {
      "epoch": 0.39691289966923926,
      "grad_norm": 2.480253219604492,
      "learning_rate": 3.7831923608280516e-06,
      "loss": 0.278,
      "step": 360
    },
    {
      "epoch": 0.4079382579933848,
      "grad_norm": 2.5499134063720703,
      "learning_rate": 3.699658107776148e-06,
      "loss": 0.2827,
      "step": 370
    },
    {
      "epoch": 0.4189636163175303,
      "grad_norm": 3.0008158683776855,
      "learning_rate": 3.6143458894413463e-06,
      "loss": 0.2829,
      "step": 380
    },
    {
      "epoch": 0.42998897464167585,
      "grad_norm": 2.6385183334350586,
      "learning_rate": 3.527382143649075e-06,
      "loss": 0.2823,
      "step": 390
    },
    {
      "epoch": 0.4410143329658214,
      "grad_norm": 2.9152820110321045,
      "learning_rate": 3.438895755887532e-06,
      "loss": 0.2723,
      "step": 400
    },
    {
      "epoch": 0.4520396912899669,
      "grad_norm": 2.694945812225342,
      "learning_rate": 3.3490178682916534e-06,
      "loss": 0.2836,
      "step": 410
    },
    {
      "epoch": 0.46306504961411243,
      "grad_norm": 2.779081106185913,
      "learning_rate": 3.257881685282609e-06,
      "loss": 0.2522,
      "step": 420
    },
    {
      "epoch": 0.474090407938258,
      "grad_norm": 2.4859516620635986,
      "learning_rate": 3.1656222761508525e-06,
      "loss": 0.2783,
      "step": 430
    },
    {
      "epoch": 0.48511576626240355,
      "grad_norm": 2.886754035949707,
      "learning_rate": 3.0723763748753354e-06,
      "loss": 0.2619,
      "step": 440
    },
    {
      "epoch": 0.4961411245865491,
      "grad_norm": 2.810600519180298,
      "learning_rate": 2.9782821774755454e-06,
      "loss": 0.2847,
      "step": 450
    },
    {
      "epoch": 0.5071664829106945,
      "grad_norm": 2.813720226287842,
      "learning_rate": 2.883479137196714e-06,
      "loss": 0.2665,
      "step": 460
    },
    {
      "epoch": 0.5181918412348401,
      "grad_norm": 2.1755166053771973,
      "learning_rate": 2.7881077578317445e-06,
      "loss": 0.2701,
      "step": 470
    },
    {
      "epoch": 0.5292171995589857,
      "grad_norm": 2.8941328525543213,
      "learning_rate": 2.6923093854861597e-06,
      "loss": 0.2584,
      "step": 480
    },
    {
      "epoch": 0.5402425578831312,
      "grad_norm": 2.368398427963257,
      "learning_rate": 2.596225999094696e-06,
      "loss": 0.2684,
      "step": 490
    },
    {
      "epoch": 0.5512679162072768,
      "grad_norm": 2.5928003787994385,
      "learning_rate": 2.5e-06,
      "loss": 0.2546,
      "step": 500
    },
    {
      "epoch": 0.5622932745314223,
      "grad_norm": 2.835794687271118,
      "learning_rate": 2.4037740009053053e-06,
      "loss": 0.2653,
      "step": 510
    },
    {
      "epoch": 0.5733186328555678,
      "grad_norm": 2.550260543823242,
      "learning_rate": 2.3076906145138407e-06,
      "loss": 0.264,
      "step": 520
    },
    {
      "epoch": 0.5843439911797134,
      "grad_norm": 2.7682080268859863,
      "learning_rate": 2.2118922421682563e-06,
      "loss": 0.2613,
      "step": 530
    },
    {
      "epoch": 0.5953693495038589,
      "grad_norm": 2.712428569793701,
      "learning_rate": 2.1165208628032863e-06,
      "loss": 0.2488,
      "step": 540
    },
    {
      "epoch": 0.6063947078280044,
      "grad_norm": 2.650545358657837,
      "learning_rate": 2.0217178225244554e-06,
      "loss": 0.258,
      "step": 550
    },
    {
      "epoch": 0.61742006615215,
      "grad_norm": 2.616968870162964,
      "learning_rate": 1.9276236251246655e-06,
      "loss": 0.2552,
      "step": 560
    },
    {
      "epoch": 0.6284454244762955,
      "grad_norm": 2.836118698120117,
      "learning_rate": 1.8343777238491477e-06,
      "loss": 0.251,
      "step": 570
    },
    {
      "epoch": 0.639470782800441,
      "grad_norm": 2.6420810222625732,
      "learning_rate": 1.7421183147173915e-06,
      "loss": 0.2587,
      "step": 580
    },
    {
      "epoch": 0.6504961411245865,
      "grad_norm": 2.2103841304779053,
      "learning_rate": 1.6509821317083466e-06,
      "loss": 0.2528,
      "step": 590
    },
    {
      "epoch": 0.6615214994487321,
      "grad_norm": 2.5041706562042236,
      "learning_rate": 1.5611042441124687e-06,
      "loss": 0.2466,
      "step": 600
    },
    {
      "epoch": 0.6725468577728776,
      "grad_norm": 3.019406795501709,
      "learning_rate": 1.4726178563509258e-06,
      "loss": 0.247,
      "step": 610
    },
    {
      "epoch": 0.6835722160970231,
      "grad_norm": 2.844505548477173,
      "learning_rate": 1.3856541105586545e-06,
      "loss": 0.2487,
      "step": 620
    },
    {
      "epoch": 0.6945975744211687,
      "grad_norm": 2.6431515216827393,
      "learning_rate": 1.300341892223852e-06,
      "loss": 0.2596,
      "step": 630
    },
    {
      "epoch": 0.7056229327453142,
      "grad_norm": 2.4335758686065674,
      "learning_rate": 1.2168076391719492e-06,
      "loss": 0.2467,
      "step": 640
    },
    {
      "epoch": 0.7166482910694597,
      "grad_norm": 2.8177690505981445,
      "learning_rate": 1.1351751541771644e-06,
      "loss": 0.2345,
      "step": 650
    },
    {
      "epoch": 0.7276736493936052,
      "grad_norm": 2.927981376647949,
      "learning_rate": 1.0555654214793723e-06,
      "loss": 0.249,
      "step": 660
    },
    {
      "epoch": 0.7386990077177509,
      "grad_norm": 3.3543944358825684,
      "learning_rate": 9.780964274781984e-07,
      "loss": 0.2612,
      "step": 670
    },
    {
      "epoch": 0.7497243660418964,
      "grad_norm": 3.0488715171813965,
      "learning_rate": 9.028829858700974e-07,
      "loss": 0.2355,
      "step": 680
    },
    {
      "epoch": 0.7607497243660419,
      "grad_norm": 2.7804830074310303,
      "learning_rate": 8.300365674875652e-07,
      "loss": 0.2562,
      "step": 690
    },
    {
      "epoch": 0.7717750826901875,
      "grad_norm": 2.2365427017211914,
      "learning_rate": 7.596651350926837e-07,
      "loss": 0.2355,
      "step": 700
    },
    {
      "epoch": 0.782800441014333,
      "grad_norm": 2.474679708480835,
      "learning_rate": 6.91872983369826e-07,
      "loss": 0.2279,
      "step": 710
    },
    {
      "epoch": 0.7938257993384785,
      "grad_norm": 2.5682637691497803,
      "learning_rate": 6.267605843546768e-07,
      "loss": 0.2561,
      "step": 720
    },
    {
      "epoch": 0.804851157662624,
      "grad_norm": 2.0722885131835938,
      "learning_rate": 5.644244385286548e-07,
      "loss": 0.2311,
      "step": 730
    },
    {
      "epoch": 0.8158765159867696,
      "grad_norm": 2.517178773880005,
      "learning_rate": 5.049569317994013e-07,
      "loss": 0.2356,
      "step": 740
    },
    {
      "epoch": 0.8269018743109151,
      "grad_norm": 2.4816508293151855,
      "learning_rate": 4.484461985793298e-07,
      "loss": 0.2499,
      "step": 750
    },
    {
      "epoch": 0.8379272326350606,
      "grad_norm": 2.7292091846466064,
      "learning_rate": 3.9497599116513714e-07,
      "loss": 0.2577,
      "step": 760
    },
    {
      "epoch": 0.8489525909592062,
      "grad_norm": 2.906921148300171,
      "learning_rate": 3.446255556118736e-07,
      "loss": 0.2316,
      "step": 770
    },
    {
      "epoch": 0.8599779492833517,
      "grad_norm": 2.4659173488616943,
      "learning_rate": 2.9746951428553884e-07,
      "loss": 0.2407,
      "step": 780
    },
    {
      "epoch": 0.8710033076074972,
      "grad_norm": 2.541038751602173,
      "learning_rate": 2.535777552682578e-07,
      "loss": 0.2399,
      "step": 790
    },
    {
      "epoch": 0.8820286659316428,
      "grad_norm": 2.69195556640625,
      "learning_rate": 2.1301532877994747e-07,
      "loss": 0.2339,
      "step": 800
    },
    {
      "epoch": 0.8930540242557883,
      "grad_norm": 2.950699806213379,
      "learning_rate": 1.7584235076999468e-07,
      "loss": 0.2384,
      "step": 810
    },
    {
      "epoch": 0.9040793825799338,
      "grad_norm": 3.1038763523101807,
      "learning_rate": 1.421139138218064e-07,
      "loss": 0.2414,
      "step": 820
    },
    {
      "epoch": 0.9151047409040793,
      "grad_norm": 2.633563756942749,
      "learning_rate": 1.1188000550230005e-07,
      "loss": 0.2364,
      "step": 830
    },
    {
      "epoch": 0.9261300992282249,
      "grad_norm": 2.5586025714874268,
      "learning_rate": 8.518543427732951e-08,
      "loss": 0.2324,
      "step": 840
    },
    {
      "epoch": 0.9371554575523704,
      "grad_norm": 2.448216199874878,
      "learning_rate": 6.206976310284996e-08,
      "loss": 0.242,
      "step": 850
    },
    {
      "epoch": 0.948180815876516,
      "grad_norm": 2.557136297225952,
      "learning_rate": 4.256725079024554e-08,
      "loss": 0.2349,
      "step": 860
    },
    {
      "epoch": 0.9592061742006616,
      "grad_norm": 2.8758692741394043,
      "learning_rate": 2.670680123271402e-08,
      "loss": 0.2337,
      "step": 870
    },
    {
      "epoch": 0.9702315325248071,
      "grad_norm": 2.5542616844177246,
      "learning_rate": 1.4511920567963911e-08,
      "loss": 0.2404,
      "step": 880
    },
    {
      "epoch": 0.9812568908489526,
      "grad_norm": 2.937741994857788,
      "learning_rate": 6.00068234070772e-09,
      "loss": 0.2434,
      "step": 890
    },
    {
      "epoch": 0.9922822491730982,
      "grad_norm": 2.797497272491455,
      "learning_rate": 1.1857007165852475e-09,
      "loss": 0.2483,
      "step": 900
    }
  ],
  "logging_steps": 10,
  "max_steps": 907,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.777891824668508e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}