| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 990, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.030349013657056147, |
| "grad_norm": 0.49088624119758606, |
| "learning_rate": 1.8e-05, |
| "loss": 2.5243, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.06069802731411229, |
| "grad_norm": 0.6470591425895691, |
| "learning_rate": 3.8e-05, |
| "loss": 2.389, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.09104704097116843, |
| "grad_norm": 0.5612089037895203, |
| "learning_rate": 5.8e-05, |
| "loss": 2.164, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.12139605462822459, |
| "grad_norm": 1.2414666414260864, |
| "learning_rate": 7.800000000000001e-05, |
| "loss": 1.8563, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.15174506828528073, |
| "grad_norm": 0.7067261934280396, |
| "learning_rate": 9.8e-05, |
| "loss": 1.3996, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.18209408194233687, |
| "grad_norm": 0.6404679417610168, |
| "learning_rate": 0.000118, |
| "loss": 1.0518, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.212443095599393, |
| "grad_norm": 0.7477788329124451, |
| "learning_rate": 0.000138, |
| "loss": 0.8398, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.24279210925644917, |
| "grad_norm": 0.7206103205680847, |
| "learning_rate": 0.00015800000000000002, |
| "loss": 0.6644, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.2731411229135053, |
| "grad_norm": 0.6917621493339539, |
| "learning_rate": 0.00017800000000000002, |
| "loss": 0.5447, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.30349013657056145, |
| "grad_norm": 0.7294086813926697, |
| "learning_rate": 0.00019800000000000002, |
| "loss": 0.4877, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.3338391502276176, |
| "grad_norm": 0.7436180114746094, |
| "learning_rate": 0.00019797752808988766, |
| "loss": 0.4499, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.36418816388467373, |
| "grad_norm": 0.7850086688995361, |
| "learning_rate": 0.0001957303370786517, |
| "loss": 0.4262, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.3945371775417299, |
| "grad_norm": 0.69960618019104, |
| "learning_rate": 0.00019348314606741572, |
| "loss": 0.4107, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.424886191198786, |
| "grad_norm": 0.5339717864990234, |
| "learning_rate": 0.0001912359550561798, |
| "loss": 0.4023, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.4552352048558422, |
| "grad_norm": 0.7217109203338623, |
| "learning_rate": 0.00018898876404494384, |
| "loss": 0.3713, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.48558421851289835, |
| "grad_norm": 0.5403777360916138, |
| "learning_rate": 0.00018674157303370787, |
| "loss": 0.3451, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.5159332321699545, |
| "grad_norm": 0.7086395621299744, |
| "learning_rate": 0.00018449438202247192, |
| "loss": 0.3433, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.5462822458270106, |
| "grad_norm": 0.5336319804191589, |
| "learning_rate": 0.00018224719101123598, |
| "loss": 0.3467, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.5766312594840668, |
| "grad_norm": 0.5295460224151611, |
| "learning_rate": 0.00018, |
| "loss": 0.3625, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.6069802731411229, |
| "grad_norm": 0.6301100254058838, |
| "learning_rate": 0.00017775280898876404, |
| "loss": 0.342, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.637329286798179, |
| "grad_norm": 0.5921112298965454, |
| "learning_rate": 0.0001755056179775281, |
| "loss": 0.3418, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.6676783004552352, |
| "grad_norm": 0.7228449583053589, |
| "learning_rate": 0.00017325842696629216, |
| "loss": 0.3276, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.6980273141122914, |
| "grad_norm": 0.6273078918457031, |
| "learning_rate": 0.00017101123595505619, |
| "loss": 0.3226, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.7283763277693475, |
| "grad_norm": 0.5710306167602539, |
| "learning_rate": 0.00016876404494382024, |
| "loss": 0.3238, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.7587253414264037, |
| "grad_norm": 0.43547865748405457, |
| "learning_rate": 0.00016651685393258427, |
| "loss": 0.3264, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.7890743550834598, |
| "grad_norm": 0.4803106188774109, |
| "learning_rate": 0.00016426966292134833, |
| "loss": 0.3125, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.8194233687405159, |
| "grad_norm": 0.45547714829444885, |
| "learning_rate": 0.00016202247191011236, |
| "loss": 0.3015, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.849772382397572, |
| "grad_norm": 0.4463222622871399, |
| "learning_rate": 0.00015977528089887642, |
| "loss": 0.3006, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.8801213960546282, |
| "grad_norm": 0.6434076428413391, |
| "learning_rate": 0.00015752808988764045, |
| "loss": 0.3159, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.9104704097116844, |
| "grad_norm": 0.5581756830215454, |
| "learning_rate": 0.0001552808988764045, |
| "loss": 0.3042, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.9408194233687405, |
| "grad_norm": 0.5555682182312012, |
| "learning_rate": 0.00015303370786516856, |
| "loss": 0.3011, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.9711684370257967, |
| "grad_norm": 0.47584882378578186, |
| "learning_rate": 0.0001507865168539326, |
| "loss": 0.2971, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.9439553618431091, |
| "learning_rate": 0.00014853932584269662, |
| "loss": 0.3148, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.0303490136570561, |
| "grad_norm": 0.41724225878715515, |
| "learning_rate": 0.00014629213483146068, |
| "loss": 0.2885, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.0606980273141122, |
| "grad_norm": 0.6222860813140869, |
| "learning_rate": 0.00014404494382022474, |
| "loss": 0.2941, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.0910470409711683, |
| "grad_norm": 0.43433964252471924, |
| "learning_rate": 0.00014179775280898877, |
| "loss": 0.2825, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.1213960546282247, |
| "grad_norm": 0.5504065752029419, |
| "learning_rate": 0.0001395505617977528, |
| "loss": 0.2917, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.1517450682852808, |
| "grad_norm": 0.4902341663837433, |
| "learning_rate": 0.00013730337078651686, |
| "loss": 0.2803, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.182094081942337, |
| "grad_norm": 0.5374056100845337, |
| "learning_rate": 0.00013505617977528091, |
| "loss": 0.2799, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.212443095599393, |
| "grad_norm": 0.47176507115364075, |
| "learning_rate": 0.00013280898876404494, |
| "loss": 0.2783, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.2427921092564491, |
| "grad_norm": 0.4779718816280365, |
| "learning_rate": 0.000130561797752809, |
| "loss": 0.2819, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.2731411229135052, |
| "grad_norm": 0.4197782278060913, |
| "learning_rate": 0.00012831460674157303, |
| "loss": 0.279, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.3034901365705616, |
| "grad_norm": 0.3616682291030884, |
| "learning_rate": 0.0001260674157303371, |
| "loss": 0.2732, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.3338391502276177, |
| "grad_norm": 0.4093301594257355, |
| "learning_rate": 0.00012382022471910112, |
| "loss": 0.2719, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.3641881638846738, |
| "grad_norm": 0.389291375875473, |
| "learning_rate": 0.00012157303370786516, |
| "loss": 0.2774, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.39453717754173, |
| "grad_norm": 0.41108396649360657, |
| "learning_rate": 0.0001193258426966292, |
| "loss": 0.274, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.424886191198786, |
| "grad_norm": 0.45677822828292847, |
| "learning_rate": 0.00011707865168539326, |
| "loss": 0.2701, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.4552352048558421, |
| "grad_norm": 0.45333346724510193, |
| "learning_rate": 0.00011483146067415731, |
| "loss": 0.27, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.4855842185128982, |
| "grad_norm": 0.4619373679161072, |
| "learning_rate": 0.00011258426966292135, |
| "loss": 0.2619, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.5159332321699543, |
| "grad_norm": 0.36644256114959717, |
| "learning_rate": 0.00011033707865168538, |
| "loss": 0.2606, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.5462822458270105, |
| "grad_norm": 0.41749054193496704, |
| "learning_rate": 0.00010808988764044945, |
| "loss": 0.2745, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.5766312594840668, |
| "grad_norm": 0.3515094518661499, |
| "learning_rate": 0.00010584269662921348, |
| "loss": 0.2681, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.606980273141123, |
| "grad_norm": 0.3448503315448761, |
| "learning_rate": 0.00010359550561797753, |
| "loss": 0.2702, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.637329286798179, |
| "grad_norm": 0.37155789136886597, |
| "learning_rate": 0.00010134831460674157, |
| "loss": 0.2677, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.6676783004552354, |
| "grad_norm": 0.4785250425338745, |
| "learning_rate": 9.910112359550561e-05, |
| "loss": 0.2652, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.6980273141122915, |
| "grad_norm": 0.3583938181400299, |
| "learning_rate": 9.685393258426967e-05, |
| "loss": 0.2742, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.7283763277693476, |
| "grad_norm": 0.487437903881073, |
| "learning_rate": 9.46067415730337e-05, |
| "loss": 0.2632, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.7587253414264037, |
| "grad_norm": 0.3896310031414032, |
| "learning_rate": 9.235955056179776e-05, |
| "loss": 0.2659, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.7890743550834598, |
| "grad_norm": 0.4289703369140625, |
| "learning_rate": 9.01123595505618e-05, |
| "loss": 0.2602, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.819423368740516, |
| "grad_norm": 0.39631426334381104, |
| "learning_rate": 8.786516853932585e-05, |
| "loss": 0.2611, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.849772382397572, |
| "grad_norm": 0.43037208914756775, |
| "learning_rate": 8.561797752808989e-05, |
| "loss": 0.2593, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.8801213960546281, |
| "grad_norm": 0.40060803294181824, |
| "learning_rate": 8.337078651685393e-05, |
| "loss": 0.2608, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.9104704097116842, |
| "grad_norm": 0.4095540940761566, |
| "learning_rate": 8.112359550561798e-05, |
| "loss": 0.2599, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.9408194233687404, |
| "grad_norm": 0.36683136224746704, |
| "learning_rate": 7.887640449438202e-05, |
| "loss": 0.2627, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.9711684370257967, |
| "grad_norm": 0.4494629502296448, |
| "learning_rate": 7.662921348314607e-05, |
| "loss": 0.2492, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.6300239562988281, |
| "learning_rate": 7.438202247191012e-05, |
| "loss": 0.2625, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.030349013657056, |
| "grad_norm": 0.3551422655582428, |
| "learning_rate": 7.213483146067415e-05, |
| "loss": 0.2519, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.0606980273141122, |
| "grad_norm": 0.3787732720375061, |
| "learning_rate": 6.988764044943821e-05, |
| "loss": 0.2515, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.0910470409711683, |
| "grad_norm": 0.5085782408714294, |
| "learning_rate": 6.764044943820224e-05, |
| "loss": 0.2531, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.1213960546282244, |
| "grad_norm": 0.3131832480430603, |
| "learning_rate": 6.53932584269663e-05, |
| "loss": 0.2556, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.1517450682852806, |
| "grad_norm": 0.4718491733074188, |
| "learning_rate": 6.314606741573034e-05, |
| "loss": 0.2497, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.1820940819423367, |
| "grad_norm": 0.39790356159210205, |
| "learning_rate": 6.0898876404494385e-05, |
| "loss": 0.2526, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.212443095599393, |
| "grad_norm": 0.4287549555301666, |
| "learning_rate": 5.865168539325843e-05, |
| "loss": 0.2461, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.2427921092564493, |
| "grad_norm": 0.30572280287742615, |
| "learning_rate": 5.640449438202248e-05, |
| "loss": 0.2502, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.2731411229135055, |
| "grad_norm": 0.40425729751586914, |
| "learning_rate": 5.415730337078652e-05, |
| "loss": 0.2507, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.3034901365705616, |
| "grad_norm": 0.3986862897872925, |
| "learning_rate": 5.191011235955057e-05, |
| "loss": 0.2514, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.3338391502276177, |
| "grad_norm": 0.38151049613952637, |
| "learning_rate": 4.966292134831461e-05, |
| "loss": 0.2465, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.364188163884674, |
| "grad_norm": 0.4567468762397766, |
| "learning_rate": 4.7415730337078655e-05, |
| "loss": 0.2477, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.39453717754173, |
| "grad_norm": 0.4813973605632782, |
| "learning_rate": 4.51685393258427e-05, |
| "loss": 0.2548, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.424886191198786, |
| "grad_norm": 0.37060871720314026, |
| "learning_rate": 4.292134831460675e-05, |
| "loss": 0.2484, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.455235204855842, |
| "grad_norm": 0.3922988474369049, |
| "learning_rate": 4.067415730337079e-05, |
| "loss": 0.255, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.4855842185128982, |
| "grad_norm": 0.4330635964870453, |
| "learning_rate": 3.842696629213483e-05, |
| "loss": 0.2518, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.5159332321699543, |
| "grad_norm": 0.3923832178115845, |
| "learning_rate": 3.6179775280898874e-05, |
| "loss": 0.2495, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.5462822458270105, |
| "grad_norm": 0.4159257411956787, |
| "learning_rate": 3.393258426966292e-05, |
| "loss": 0.2481, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.576631259484067, |
| "grad_norm": 0.4878959655761719, |
| "learning_rate": 3.168539325842697e-05, |
| "loss": 0.2467, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.606980273141123, |
| "grad_norm": 0.49331000447273254, |
| "learning_rate": 2.9438202247191012e-05, |
| "loss": 0.2466, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.6373292867981792, |
| "grad_norm": 0.38977667689323425, |
| "learning_rate": 2.7191011235955055e-05, |
| "loss": 0.2501, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.6676783004552354, |
| "grad_norm": 0.42586463689804077, |
| "learning_rate": 2.4943820224719103e-05, |
| "loss": 0.249, |
| "step": 880 |
| }, |
| { |
| "epoch": 2.6980273141122915, |
| "grad_norm": 0.4059686064720154, |
| "learning_rate": 2.2696629213483146e-05, |
| "loss": 0.2488, |
| "step": 890 |
| }, |
| { |
| "epoch": 2.7283763277693476, |
| "grad_norm": 0.4038703143596649, |
| "learning_rate": 2.0449438202247194e-05, |
| "loss": 0.2431, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.7587253414264037, |
| "grad_norm": 0.434299111366272, |
| "learning_rate": 1.8202247191011237e-05, |
| "loss": 0.2449, |
| "step": 910 |
| }, |
| { |
| "epoch": 2.78907435508346, |
| "grad_norm": 0.431682288646698, |
| "learning_rate": 1.595505617977528e-05, |
| "loss": 0.2439, |
| "step": 920 |
| }, |
| { |
| "epoch": 2.819423368740516, |
| "grad_norm": 0.41127917170524597, |
| "learning_rate": 1.3707865168539327e-05, |
| "loss": 0.2476, |
| "step": 930 |
| }, |
| { |
| "epoch": 2.849772382397572, |
| "grad_norm": 0.3838565945625305, |
| "learning_rate": 1.146067415730337e-05, |
| "loss": 0.2497, |
| "step": 940 |
| }, |
| { |
| "epoch": 2.880121396054628, |
| "grad_norm": 0.389149934053421, |
| "learning_rate": 9.213483146067416e-06, |
| "loss": 0.2473, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.9104704097116842, |
| "grad_norm": 0.4304480254650116, |
| "learning_rate": 6.96629213483146e-06, |
| "loss": 0.2473, |
| "step": 960 |
| }, |
| { |
| "epoch": 2.9408194233687404, |
| "grad_norm": 0.3889082372188568, |
| "learning_rate": 4.719101123595506e-06, |
| "loss": 0.2465, |
| "step": 970 |
| }, |
| { |
| "epoch": 2.9711684370257965, |
| "grad_norm": 0.3653116822242737, |
| "learning_rate": 2.4719101123595505e-06, |
| "loss": 0.2453, |
| "step": 980 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 0.8728406429290771, |
| "learning_rate": 2.2471910112359554e-07, |
| "loss": 0.2431, |
| "step": 990 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 990, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.8095218810159104e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|