{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 782,
  "global_step": 3910,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 186.5393524169922,
      "learning_rate": 1.2787723785166243e-08,
      "loss": 9.9143,
      "step": 1
    },
    {
      "epoch": 0.04,
      "grad_norm": 79.81620788574219,
      "learning_rate": 4.0920716112531976e-07,
      "loss": 9.1648,
      "step": 32
    },
    {
      "epoch": 0.08,
      "grad_norm": 42.39503860473633,
      "learning_rate": 8.184143222506395e-07,
      "loss": 7.594,
      "step": 64
    },
    {
      "epoch": 0.12,
      "grad_norm": 39.50730514526367,
      "learning_rate": 1.2276214833759592e-06,
      "loss": 7.1797,
      "step": 96
    },
    {
      "epoch": 0.16,
      "grad_norm": 66.16382598876953,
      "learning_rate": 1.636828644501279e-06,
      "loss": 6.8731,
      "step": 128
    },
    {
      "epoch": 0.2,
      "grad_norm": 73.76081848144531,
      "learning_rate": 2.0460358056265987e-06,
      "loss": 6.2698,
      "step": 160
    },
    {
      "epoch": 0.25,
      "grad_norm": 83.69901275634766,
      "learning_rate": 2.4552429667519184e-06,
      "loss": 5.4165,
      "step": 192
    },
    {
      "epoch": 0.29,
      "grad_norm": 83.07150268554688,
      "learning_rate": 2.864450127877238e-06,
      "loss": 4.1512,
      "step": 224
    },
    {
      "epoch": 0.33,
      "grad_norm": 89.9841079711914,
      "learning_rate": 3.273657289002558e-06,
      "loss": 2.7473,
      "step": 256
    },
    {
      "epoch": 0.37,
      "grad_norm": 47.78603744506836,
      "learning_rate": 3.6828644501278778e-06,
      "loss": 1.3029,
      "step": 288
    },
    {
      "epoch": 0.41,
      "grad_norm": 16.27153205871582,
      "learning_rate": 4.092071611253197e-06,
      "loss": 0.5736,
      "step": 320
    },
    {
      "epoch": 0.45,
      "grad_norm": 8.598386764526367,
      "learning_rate": 4.501278772378517e-06,
      "loss": 0.4362,
      "step": 352
    },
    {
      "epoch": 0.49,
      "grad_norm": 6.162290096282959,
      "learning_rate": 4.910485933503837e-06,
      "loss": 0.3712,
      "step": 384
    },
    {
      "epoch": 0.53,
      "grad_norm": 4.307323455810547,
      "learning_rate": 4.999377365452712e-06,
      "loss": 0.3602,
      "step": 416
    },
    {
      "epoch": 0.57,
      "grad_norm": 3.5536632537841797,
      "learning_rate": 4.996763860622537e-06,
      "loss": 0.3408,
      "step": 448
    },
    {
      "epoch": 0.61,
      "grad_norm": 2.9953174591064453,
      "learning_rate": 4.992112801917064e-06,
      "loss": 0.3275,
      "step": 480
    },
    {
      "epoch": 0.65,
      "grad_norm": 2.5501198768615723,
      "learning_rate": 4.985427984962641e-06,
      "loss": 0.3398,
      "step": 512
    },
    {
      "epoch": 0.7,
      "grad_norm": 2.0887036323547363,
      "learning_rate": 4.976714865090827e-06,
      "loss": 0.3333,
      "step": 544
    },
    {
      "epoch": 0.74,
      "grad_norm": 2.0213687419891357,
      "learning_rate": 4.965980552886427e-06,
      "loss": 0.3231,
      "step": 576
    },
    {
      "epoch": 0.78,
      "grad_norm": 1.822816252708435,
      "learning_rate": 4.953233808384689e-06,
      "loss": 0.3235,
      "step": 608
    },
    {
      "epoch": 0.82,
      "grad_norm": 1.6860909461975098,
      "learning_rate": 4.9384850339224405e-06,
      "loss": 0.3146,
      "step": 640
    },
    {
      "epoch": 0.86,
      "grad_norm": 1.4786981344223022,
      "learning_rate": 4.92174626564897e-06,
      "loss": 0.3147,
      "step": 672
    },
    {
      "epoch": 0.9,
      "grad_norm": 1.4971543550491333,
      "learning_rate": 4.903031163703588e-06,
      "loss": 0.3159,
      "step": 704
    },
    {
      "epoch": 0.94,
      "grad_norm": 1.6120744943618774,
      "learning_rate": 4.882355001067892e-06,
      "loss": 0.3172,
      "step": 736
    },
    {
      "epoch": 0.98,
      "grad_norm": 1.7643611431121826,
      "learning_rate": 4.859734651101821e-06,
      "loss": 0.3179,
      "step": 768
    },
    {
      "epoch": 1.02,
      "grad_norm": 1.4471312761306763,
      "learning_rate": 4.835188573773681e-06,
      "loss": 0.3146,
      "step": 800
    },
    {
      "epoch": 1.06,
      "grad_norm": 1.5208021402359009,
      "learning_rate": 4.808736800595372e-06,
      "loss": 0.2978,
      "step": 832
    },
    {
      "epoch": 1.1,
      "grad_norm": 1.2985948324203491,
      "learning_rate": 4.78040091827511e-06,
      "loss": 0.2945,
      "step": 864
    },
    {
      "epoch": 1.15,
      "grad_norm": 1.3271950483322144,
      "learning_rate": 4.750204051100996e-06,
      "loss": 0.2864,
      "step": 896
    },
    {
      "epoch": 1.19,
      "grad_norm": 1.5903184413909912,
      "learning_rate": 4.718170842069793e-06,
      "loss": 0.3022,
      "step": 928
    },
    {
      "epoch": 1.23,
      "grad_norm": 1.446515679359436,
      "learning_rate": 4.6843274327763165e-06,
      "loss": 0.3016,
      "step": 960
    },
    {
      "epoch": 1.27,
      "grad_norm": 1.1784732341766357,
      "learning_rate": 4.648701442079864e-06,
      "loss": 0.2862,
      "step": 992
    },
    {
      "epoch": 1.31,
      "grad_norm": 1.4098988771438599,
      "learning_rate": 4.611321943565065e-06,
      "loss": 0.2882,
      "step": 1024
    },
    {
      "epoch": 1.35,
      "grad_norm": 1.194462537765503,
      "learning_rate": 4.5722194418155756e-06,
      "loss": 0.2867,
      "step": 1056
    },
    {
      "epoch": 1.39,
      "grad_norm": 1.4424467086791992,
      "learning_rate": 4.531425847519958e-06,
      "loss": 0.2979,
      "step": 1088
    },
    {
      "epoch": 1.43,
      "grad_norm": 1.1310840845108032,
      "learning_rate": 4.488974451430077e-06,
      "loss": 0.304,
      "step": 1120
    },
    {
      "epoch": 1.47,
      "grad_norm": 1.291769027709961,
      "learning_rate": 4.444899897193247e-06,
      "loss": 0.295,
      "step": 1152
    },
    {
      "epoch": 1.51,
      "grad_norm": 1.2577590942382812,
      "learning_rate": 4.399238153080317e-06,
      "loss": 0.2827,
      "step": 1184
    },
    {
      "epoch": 1.55,
      "grad_norm": 1.5202428102493286,
      "learning_rate": 4.352026482632762e-06,
      "loss": 0.2936,
      "step": 1216
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.2438831329345703,
      "learning_rate": 4.303303414252724e-06,
      "loss": 0.2861,
      "step": 1248
    },
    {
      "epoch": 1.64,
      "grad_norm": 1.170016884803772,
      "learning_rate": 4.253108709760838e-06,
      "loss": 0.2949,
      "step": 1280
    },
    {
      "epoch": 1.68,
      "grad_norm": 1.502139925956726,
      "learning_rate": 4.201483331947488e-06,
      "loss": 0.29,
      "step": 1312
    },
    {
      "epoch": 1.72,
      "grad_norm": 1.021010398864746,
      "learning_rate": 4.148469411143973e-06,
      "loss": 0.2832,
      "step": 1344
    },
    {
      "epoch": 1.76,
      "grad_norm": 1.2760310173034668,
      "learning_rate": 4.094110210840879e-06,
      "loss": 0.2855,
      "step": 1376
    },
    {
      "epoch": 1.8,
      "grad_norm": 1.6488782167434692,
      "learning_rate": 4.038450092381697e-06,
      "loss": 0.2897,
      "step": 1408
    },
    {
      "epoch": 1.84,
      "grad_norm": 1.4734816551208496,
      "learning_rate": 3.981534478760508e-06,
      "loss": 0.2945,
      "step": 1440
    },
    {
      "epoch": 1.88,
      "grad_norm": 1.2717450857162476,
      "learning_rate": 3.923409817553284e-06,
      "loss": 0.2865,
      "step": 1472
    },
    {
      "epoch": 1.92,
      "grad_norm": 1.449500322341919,
      "learning_rate": 3.864123543013044e-06,
      "loss": 0.2913,
      "step": 1504
    },
    {
      "epoch": 1.96,
      "grad_norm": 0.9587951302528381,
      "learning_rate": 3.8037240373598077e-06,
      "loss": 0.2798,
      "step": 1536
    },
    {
      "epoch": 2.01,
      "grad_norm": 1.0608307123184204,
      "learning_rate": 3.7422605912969334e-06,
      "loss": 0.2832,
      "step": 1568
    },
    {
      "epoch": 2.05,
      "grad_norm": 1.510848045349121,
      "learning_rate": 3.679783363786063e-06,
      "loss": 0.27,
      "step": 1600
    },
    {
      "epoch": 2.09,
      "grad_norm": 1.4769214391708374,
      "learning_rate": 3.6163433411135003e-06,
      "loss": 0.2616,
      "step": 1632
    },
    {
      "epoch": 2.13,
      "grad_norm": 1.3239907026290894,
      "learning_rate": 3.551992295281431e-06,
      "loss": 0.2653,
      "step": 1664
    },
    {
      "epoch": 2.17,
      "grad_norm": 1.6486111879348755,
      "learning_rate": 3.48678274175793e-06,
      "loss": 0.2579,
      "step": 1696
    },
    {
      "epoch": 2.21,
      "grad_norm": 1.3881120681762695,
      "learning_rate": 3.420767896620249e-06,
      "loss": 0.2657,
      "step": 1728
    },
    {
      "epoch": 2.25,
      "grad_norm": 1.0850417613983154,
      "learning_rate": 3.3540016331263526e-06,
      "loss": 0.2594,
      "step": 1760
    },
    {
      "epoch": 2.29,
      "grad_norm": 1.369093656539917,
      "learning_rate": 3.2865384377501385e-06,
      "loss": 0.2605,
      "step": 1792
    },
    {
      "epoch": 2.33,
      "grad_norm": 1.3315595388412476,
      "learning_rate": 3.2184333657162297e-06,
      "loss": 0.2538,
      "step": 1824
    },
    {
      "epoch": 2.37,
      "grad_norm": 1.4239180088043213,
      "learning_rate": 3.1497419960706235e-06,
      "loss": 0.2632,
      "step": 1856
    },
    {
      "epoch": 2.41,
      "grad_norm": 1.4137816429138184,
      "learning_rate": 3.080520386323853e-06,
      "loss": 0.2639,
      "step": 1888
    },
    {
      "epoch": 2.46,
      "grad_norm": 1.2701876163482666,
      "learning_rate": 3.0108250267036976e-06,
      "loss": 0.2664,
      "step": 1920
    },
    {
      "epoch": 2.5,
      "grad_norm": 1.0828937292099,
      "learning_rate": 2.9407127940547485e-06,
      "loss": 0.2667,
      "step": 1952
    },
    {
      "epoch": 2.54,
      "grad_norm": 1.4425878524780273,
      "learning_rate": 2.870240905422476e-06,
      "loss": 0.2587,
      "step": 1984
    },
    {
      "epoch": 2.58,
      "grad_norm": 1.6281284093856812,
      "learning_rate": 2.7994668713596598e-06,
      "loss": 0.2593,
      "step": 2016
    },
    {
      "epoch": 2.62,
      "grad_norm": 1.3322839736938477,
      "learning_rate": 2.728448448993292e-06,
      "loss": 0.2596,
      "step": 2048
    },
    {
      "epoch": 2.66,
      "grad_norm": 1.1276755332946777,
      "learning_rate": 2.65724359489027e-06,
      "loss": 0.2562,
      "step": 2080
    },
    {
      "epoch": 2.7,
      "grad_norm": 1.6861284971237183,
      "learning_rate": 2.5859104177603146e-06,
      "loss": 0.2667,
      "step": 2112
    },
    {
      "epoch": 2.74,
      "grad_norm": 1.2902480363845825,
      "learning_rate": 2.514507131034735e-06,
      "loss": 0.2605,
      "step": 2144
    },
    {
      "epoch": 2.78,
      "grad_norm": 1.439092755317688,
      "learning_rate": 2.443092005359736e-06,
      "loss": 0.2616,
      "step": 2176
    },
    {
      "epoch": 2.82,
      "grad_norm": 1.6176005601882935,
      "learning_rate": 2.3717233210430258e-06,
      "loss": 0.2627,
      "step": 2208
    },
    {
      "epoch": 2.86,
      "grad_norm": 1.1382880210876465,
      "learning_rate": 2.300459320492547e-06,
      "loss": 0.2536,
      "step": 2240
    },
    {
      "epoch": 2.91,
      "grad_norm": 1.4830528497695923,
      "learning_rate": 2.2293581606861298e-06,
      "loss": 0.2613,
      "step": 2272
    },
    {
      "epoch": 2.95,
      "grad_norm": 1.468441367149353,
      "learning_rate": 2.158477865710868e-06,
      "loss": 0.2661,
      "step": 2304
    },
    {
      "epoch": 2.99,
      "grad_norm": 1.5543335676193237,
      "learning_rate": 2.087876279410942e-06,
      "loss": 0.2628,
      "step": 2336
    },
    {
      "epoch": 3.03,
      "grad_norm": 1.4273053407669067,
      "learning_rate": 2.017611018182533e-06,
      "loss": 0.2549,
      "step": 2368
    },
    {
      "epoch": 3.07,
      "grad_norm": 1.910224199295044,
      "learning_rate": 1.94773942395436e-06,
      "loss": 0.2345,
      "step": 2400
    },
    {
      "epoch": 3.11,
      "grad_norm": 1.6112534999847412,
      "learning_rate": 1.8783185173921847e-06,
      "loss": 0.2335,
      "step": 2432
    },
    {
      "epoch": 3.15,
      "grad_norm": 1.7274821996688843,
      "learning_rate": 1.8094049513655191e-06,
      "loss": 0.2391,
      "step": 2464
    },
    {
      "epoch": 3.19,
      "grad_norm": 1.3582186698913574,
      "learning_rate": 1.7410549647144598e-06,
      "loss": 0.2339,
      "step": 2496
    },
    {
      "epoch": 3.23,
      "grad_norm": 1.3842142820358276,
      "learning_rate": 1.6733243363544154e-06,
      "loss": 0.2326,
      "step": 2528
    },
    {
      "epoch": 3.27,
      "grad_norm": 1.635377287864685,
      "learning_rate": 1.606268339756166e-06,
      "loss": 0.2416,
      "step": 2560
    },
    {
      "epoch": 3.31,
      "grad_norm": 1.555497407913208,
      "learning_rate": 1.5399416978383985e-06,
      "loss": 0.2285,
      "step": 2592
    },
    {
      "epoch": 3.36,
      "grad_norm": 1.3637293577194214,
      "learning_rate": 1.4743985383095478e-06,
      "loss": 0.2283,
      "step": 2624
    },
    {
      "epoch": 3.4,
      "grad_norm": 1.5432817935943604,
      "learning_rate": 1.409692349495363e-06,
      "loss": 0.2302,
      "step": 2656
    },
    {
      "epoch": 3.44,
      "grad_norm": 1.5474470853805542,
      "learning_rate": 1.345875936688268e-06,
      "loss": 0.2356,
      "step": 2688
    },
    {
      "epoch": 3.48,
      "grad_norm": 1.5453095436096191,
      "learning_rate": 1.283001379054128e-06,
      "loss": 0.2421,
      "step": 2720
    },
    {
      "epoch": 3.52,
      "grad_norm": 1.3231130838394165,
      "learning_rate": 1.2211199871315932e-06,
      "loss": 0.2303,
      "step": 2752
    },
    {
      "epoch": 3.56,
      "grad_norm": 1.3008874654769897,
      "learning_rate": 1.160282260958692e-06,
      "loss": 0.2337,
      "step": 2784
    },
    {
      "epoch": 3.6,
      "grad_norm": 1.9900566339492798,
      "learning_rate": 1.1005378488608783e-06,
      "loss": 0.2331,
      "step": 2816
    },
    {
      "epoch": 3.64,
      "grad_norm": 1.4739086627960205,
      "learning_rate": 1.0419355069341206e-06,
      "loss": 0.2363,
      "step": 2848
    },
    {
      "epoch": 3.68,
      "grad_norm": 1.4675333499908447,
      "learning_rate": 9.845230592561273e-07,
      "loss": 0.2207,
      "step": 2880
    },
    {
      "epoch": 3.72,
      "grad_norm": 1.4848682880401611,
      "learning_rate": 9.283473588581784e-07,
      "loss": 0.2287,
      "step": 2912
    },
    {
      "epoch": 3.76,
      "grad_norm": 1.7442777156829834,
      "learning_rate": 8.734542494893955e-07,
      "loss": 0.2341,
      "step": 2944
    },
    {
      "epoch": 3.81,
      "grad_norm": 1.6904102563858032,
      "learning_rate": 8.198885282046712e-07,
      "loss": 0.2304,
      "step": 2976
    },
    {
      "epoch": 3.85,
      "grad_norm": 1.308547019958496,
      "learning_rate": 7.676939088067847e-07,
      "loss": 0.231,
      "step": 3008
    },
    {
      "epoch": 3.89,
      "grad_norm": 1.2075304985046387,
      "learning_rate": 7.169129861725297e-07,
      "loss": 0.2337,
      "step": 3040
    },
    {
      "epoch": 3.93,
      "grad_norm": 1.1886279582977295,
      "learning_rate": 6.675872014919738e-07,
      "loss": 0.2263,
      "step": 3072
    },
    {
      "epoch": 3.97,
      "grad_norm": 1.3249859809875488,
      "learning_rate": 6.197568084492203e-07,
      "loss": 0.2287,
      "step": 3104
    },
    {
      "epoch": 4.01,
      "grad_norm": 1.322100281715393,
      "learning_rate": 5.734608403722674e-07,
      "loss": 0.2245,
      "step": 3136
    },
    {
      "epoch": 4.05,
      "grad_norm": 1.4475897550582886,
      "learning_rate": 5.287370783787649e-07,
      "loss": 0.2136,
      "step": 3168
    },
    {
      "epoch": 4.09,
      "grad_norm": 1.3953217267990112,
      "learning_rate": 4.856220205436834e-07,
      "loss": 0.2062,
      "step": 3200
    },
    {
      "epoch": 4.13,
      "grad_norm": 1.5530751943588257,
      "learning_rate": 4.441508521140392e-07,
      "loss": 0.2133,
      "step": 3232
    },
    {
      "epoch": 4.17,
      "grad_norm": 1.3993407487869263,
      "learning_rate": 4.043574167949893e-07,
      "loss": 0.2086,
      "step": 3264
    },
    {
      "epoch": 4.21,
      "grad_norm": 1.5585782527923584,
      "learning_rate": 3.66274189130732e-07,
      "loss": 0.2176,
      "step": 3296
    },
    {
      "epoch": 4.26,
      "grad_norm": 1.8112726211547852,
      "learning_rate": 3.299322480027498e-07,
      "loss": 0.2095,
      "step": 3328
    },
    {
      "epoch": 4.3,
      "grad_norm": 1.3818844556808472,
      "learning_rate": 2.9536125126701565e-07,
      "loss": 0.2083,
      "step": 3360
    },
    {
      "epoch": 4.34,
      "grad_norm": 2.006455898284912,
      "learning_rate": 2.6258941155087783e-07,
      "loss": 0.2165,
      "step": 3392
    },
    {
      "epoch": 4.38,
      "grad_norm": 1.5955146551132202,
      "learning_rate": 2.3164347322935127e-07,
      "loss": 0.2116,
      "step": 3424
    },
    {
      "epoch": 4.42,
      "grad_norm": 1.6406195163726807,
      "learning_rate": 2.0254869059962628e-07,
      "loss": 0.2085,
      "step": 3456
    },
    {
      "epoch": 4.46,
      "grad_norm": 1.5686984062194824,
      "learning_rate": 1.75328807271595e-07,
      "loss": 0.2083,
      "step": 3488
    },
    {
      "epoch": 4.5,
      "grad_norm": 1.7452423572540283,
      "learning_rate": 1.5000603679121456e-07,
      "loss": 0.2067,
      "step": 3520
    },
    {
      "epoch": 4.54,
      "grad_norm": 1.6517360210418701,
      "learning_rate": 1.2660104451252043e-07,
      "loss": 0.2091,
      "step": 3552
    },
    {
      "epoch": 4.58,
      "grad_norm": 1.559053897857666,
      "learning_rate": 1.0513293073309089e-07,
      "loss": 0.2133,
      "step": 3584
    },
    {
      "epoch": 4.62,
      "grad_norm": 1.491980791091919,
      "learning_rate": 8.561921510671312e-08,
      "loss": 0.2109,
      "step": 3616
    },
    {
      "epoch": 4.66,
      "grad_norm": 1.7280569076538086,
      "learning_rate": 6.807582234598042e-08,
      "loss": 0.2195,
      "step": 3648
    },
    {
      "epoch": 4.71,
      "grad_norm": 1.3292275667190552,
      "learning_rate": 5.2517069226488694e-08,
      "loss": 0.2126,
      "step": 3680
    },
    {
      "epoch": 4.75,
      "grad_norm": 1.9652115106582642,
      "learning_rate": 3.8955652903228114e-08,
      "loss": 0.2184,
      "step": 3712
    },
    {
      "epoch": 4.79,
      "grad_norm": 1.3413430452346802,
      "learning_rate": 2.7402640548717107e-08,
      "loss": 0.2089,
      "step": 3744
    },
    {
      "epoch": 4.83,
      "grad_norm": 1.6649166345596313,
      "learning_rate": 1.786746032132747e-08,
      "loss": 0.2077,
      "step": 3776
    },
    {
      "epoch": 4.87,
      "grad_norm": 1.5024484395980835,
      "learning_rate": 1.0357893671171793e-08,
      "loss": 0.2167,
      "step": 3808
    },
    {
      "epoch": 4.91,
      "grad_norm": 1.7963902950286865,
      "learning_rate": 4.8800689898337304e-09,
      "loss": 0.2073,
      "step": 3840
    },
    {
      "epoch": 4.95,
      "grad_norm": 1.2430423498153687,
      "learning_rate": 1.4384566091227293e-09,
      "loss": 0.2086,
      "step": 3872
    },
    {
      "epoch": 4.99,
      "grad_norm": 1.498355507850647,
      "learning_rate": 3.586515293557691e-11,
      "loss": 0.2099,
      "step": 3904
    }
  ],
  "logging_steps": 32,
  "max_steps": 3910,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 782,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}