| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.029098898061104047, | |
| "eval_steps": 500, | |
| "global_step": 1000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0002909889806110405, | |
| "grad_norm": 3.3640072345733643, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2361, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.000581977961222081, | |
| "grad_norm": 0.2830420136451721, | |
| "learning_rate": 0.0002, | |
| "loss": 0.1232, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0008729669418331215, | |
| "grad_norm": 0.3788599669933319, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0874, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.001163955922444162, | |
| "grad_norm": 0.9566423892974854, | |
| "learning_rate": 0.0002, | |
| "loss": 0.1099, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.0014549449030552023, | |
| "grad_norm": 0.5953503847122192, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0727, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.001745933883666243, | |
| "grad_norm": 0.3039131164550781, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0575, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0020369228642772835, | |
| "grad_norm": 0.27436479926109314, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0511, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.002327911844888324, | |
| "grad_norm": 0.22409753501415253, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0495, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.0026189008254993646, | |
| "grad_norm": 0.5258105397224426, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0447, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.0029098898061104047, | |
| "grad_norm": 0.3420485854148865, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0447, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0032008787867214453, | |
| "grad_norm": 0.18883126974105835, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0437, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.003491867767332486, | |
| "grad_norm": 0.09849688410758972, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0428, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.0037828567479435264, | |
| "grad_norm": 0.5861080884933472, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0416, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.004073845728554567, | |
| "grad_norm": 0.8478333353996277, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0415, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.004364834709165607, | |
| "grad_norm": 0.6563957333564758, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0416, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.004655823689776648, | |
| "grad_norm": 0.1496465653181076, | |
| "learning_rate": 0.0002, | |
| "loss": 0.04, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.004946812670387688, | |
| "grad_norm": 0.7356327176094055, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0406, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.005237801650998729, | |
| "grad_norm": 0.5485235452651978, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0415, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.005528790631609769, | |
| "grad_norm": 0.28617948293685913, | |
| "learning_rate": 0.0002, | |
| "loss": 0.04, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.005819779612220809, | |
| "grad_norm": 0.7351231575012207, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0462, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.00611076859283185, | |
| "grad_norm": 0.6597175598144531, | |
| "learning_rate": 0.0002, | |
| "loss": 0.042, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.0064017575734428905, | |
| "grad_norm": 0.5418401956558228, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0425, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.0066927465540539315, | |
| "grad_norm": 0.4611615836620331, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0409, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.006983735534664972, | |
| "grad_norm": 0.039530955255031586, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0414, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.007274724515276012, | |
| "grad_norm": 0.03446557745337486, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0393, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.007565713495887053, | |
| "grad_norm": 0.7747415900230408, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0419, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.007856702476498093, | |
| "grad_norm": 0.3428023159503937, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0423, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.008147691457109134, | |
| "grad_norm": 0.2668132781982422, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0404, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.008438680437720175, | |
| "grad_norm": 0.06787655502557755, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0373, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.008729669418331214, | |
| "grad_norm": 0.17109806835651398, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0371, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.009020658398942255, | |
| "grad_norm": 0.1489395946264267, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0377, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.009311647379553296, | |
| "grad_norm": 0.20444560050964355, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0385, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.009602636360164335, | |
| "grad_norm": 1.251767873764038, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0395, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.009893625340775376, | |
| "grad_norm": 0.17964421212673187, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0403, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.010184614321386417, | |
| "grad_norm": 0.20001742243766785, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0374, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.010475603301997458, | |
| "grad_norm": 0.07885689288377762, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0366, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.010766592282608498, | |
| "grad_norm": 0.1755530834197998, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0366, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.011057581263219539, | |
| "grad_norm": 0.24720287322998047, | |
| "learning_rate": 0.0002, | |
| "loss": 0.036, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.01134857024383058, | |
| "grad_norm": 0.13627253472805023, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0384, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.011639559224441619, | |
| "grad_norm": 0.10238471627235413, | |
| "learning_rate": 0.0002, | |
| "loss": 0.037, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.01193054820505266, | |
| "grad_norm": 0.1790493279695511, | |
| "learning_rate": 0.0002, | |
| "loss": 0.036, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.0122215371856637, | |
| "grad_norm": 0.8142262101173401, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0366, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.012512526166274742, | |
| "grad_norm": 0.3142533600330353, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0373, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.012803515146885781, | |
| "grad_norm": 0.5743248462677002, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0364, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.013094504127496822, | |
| "grad_norm": 0.10562433302402496, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0357, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.013385493108107863, | |
| "grad_norm": 0.1285354048013687, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0348, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.013676482088718902, | |
| "grad_norm": 0.31671035289764404, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0367, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.013967471069329943, | |
| "grad_norm": 0.10281776636838913, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0367, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.014258460049940984, | |
| "grad_norm": 0.10469332337379456, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0354, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.014549449030552023, | |
| "grad_norm": 0.04262165352702141, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0358, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.014840438011163064, | |
| "grad_norm": 0.08256979286670685, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0352, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.015131426991774105, | |
| "grad_norm": 0.2147534042596817, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0352, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.015422415972385146, | |
| "grad_norm": 0.7227026224136353, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0367, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.015713404952996186, | |
| "grad_norm": 0.3186182975769043, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0366, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.01600439393360723, | |
| "grad_norm": 0.5129309296607971, | |
| "learning_rate": 0.0002, | |
| "loss": 0.036, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.016295382914218268, | |
| "grad_norm": 0.3547574281692505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0363, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.016586371894829307, | |
| "grad_norm": 0.29844892024993896, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0369, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.01687736087544035, | |
| "grad_norm": 0.25678157806396484, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0352, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.01716834985605139, | |
| "grad_norm": 0.07419384270906448, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0344, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.017459338836662428, | |
| "grad_norm": 0.15620607137680054, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0343, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.01775032781727347, | |
| "grad_norm": 0.17426913976669312, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0326, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.01804131679788451, | |
| "grad_norm": 0.18652600049972534, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0358, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.01833230577849555, | |
| "grad_norm": 0.5866808295249939, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0364, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.018623294759106592, | |
| "grad_norm": 0.5470107793807983, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0385, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.01891428373971763, | |
| "grad_norm": 0.4430047273635864, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0355, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.01920527272032867, | |
| "grad_norm": 0.44153594970703125, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0374, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.019496261700939713, | |
| "grad_norm": 0.2025349885225296, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0361, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.019787250681550753, | |
| "grad_norm": 0.05006701499223709, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0352, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.020078239662161792, | |
| "grad_norm": 0.15291444957256317, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0339, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.020369228642772835, | |
| "grad_norm": 0.20080982148647308, | |
| "learning_rate": 0.0002, | |
| "loss": 0.034, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.020660217623383874, | |
| "grad_norm": 0.1934683471918106, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0338, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.020951206603994917, | |
| "grad_norm": 0.2093890905380249, | |
| "learning_rate": 0.0002, | |
| "loss": 0.033, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.021242195584605956, | |
| "grad_norm": 0.17411717772483826, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0329, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.021533184565216995, | |
| "grad_norm": 0.06554729491472244, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0328, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.021824173545828038, | |
| "grad_norm": 0.3035508692264557, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0345, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.022115162526439077, | |
| "grad_norm": 0.1284075379371643, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0344, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.022406151507050116, | |
| "grad_norm": 0.06972914189100266, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0326, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.02269714048766116, | |
| "grad_norm": 0.2625221908092499, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0343, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.0229881294682722, | |
| "grad_norm": 0.2056276947259903, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0341, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.023279118448883238, | |
| "grad_norm": 0.06602438539266586, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0331, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.02357010742949428, | |
| "grad_norm": 0.1302807331085205, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0328, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.02386109641010532, | |
| "grad_norm": 0.07038327306509018, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0338, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.02415208539071636, | |
| "grad_norm": 0.3151911199092865, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0353, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.0244430743713274, | |
| "grad_norm": 0.2942112982273102, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0336, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.02473406335193844, | |
| "grad_norm": 0.09775586426258087, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0331, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.025025052332549483, | |
| "grad_norm": 0.06825686991214752, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0338, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.025316041313160523, | |
| "grad_norm": 0.08698020130395889, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0329, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.025607030293771562, | |
| "grad_norm": 0.29394668340682983, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0328, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.025898019274382605, | |
| "grad_norm": 0.2138691395521164, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0327, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.026189008254993644, | |
| "grad_norm": 0.22760023176670074, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0349, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.026479997235604683, | |
| "grad_norm": 0.07050047069787979, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0327, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.026770986216215726, | |
| "grad_norm": 0.0632275640964508, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0332, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.027061975196826765, | |
| "grad_norm": 0.2537945508956909, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0334, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.027352964177437804, | |
| "grad_norm": 0.17872551083564758, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0339, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.027643953158048847, | |
| "grad_norm": 0.1240101158618927, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0328, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.027934942138659886, | |
| "grad_norm": 0.24408769607543945, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0337, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.028225931119270926, | |
| "grad_norm": 0.06075837463140488, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0327, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.02851692009988197, | |
| "grad_norm": 0.09202170372009277, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0326, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.028807909080493008, | |
| "grad_norm": 0.09207413345575333, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0316, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.029098898061104047, | |
| "grad_norm": 0.056632377207279205, | |
| "learning_rate": 0.0002, | |
| "loss": 0.0331, | |
| "step": 1000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 40000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 250, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.102264300163564e+18, | |
| "train_batch_size": 96, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |