| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.9980781550288276, |
| "eval_steps": 500, |
| "global_step": 1755, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.01708306641042067, |
| "grad_norm": 30.988456901868677, |
| "learning_rate": 5e-06, |
| "loss": 1.0716, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.03416613282084134, |
| "grad_norm": 3.981743724280262, |
| "learning_rate": 5e-06, |
| "loss": 0.9952, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.05124919923126201, |
| "grad_norm": 0.9238645350776216, |
| "learning_rate": 5e-06, |
| "loss": 0.9502, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.06833226564168268, |
| "grad_norm": 1.0528903755575718, |
| "learning_rate": 5e-06, |
| "loss": 0.9206, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.08541533205210335, |
| "grad_norm": 1.1750428009605969, |
| "learning_rate": 5e-06, |
| "loss": 0.9102, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.10249839846252402, |
| "grad_norm": 0.7437586737148641, |
| "learning_rate": 5e-06, |
| "loss": 0.8945, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.11958146487294469, |
| "grad_norm": 0.6775772425198474, |
| "learning_rate": 5e-06, |
| "loss": 0.8836, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.13666453128336536, |
| "grad_norm": 0.5992902350468045, |
| "learning_rate": 5e-06, |
| "loss": 0.8771, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.15374759769378604, |
| "grad_norm": 0.5246527905235602, |
| "learning_rate": 5e-06, |
| "loss": 0.8688, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.1708306641042067, |
| "grad_norm": 0.6744242377338667, |
| "learning_rate": 5e-06, |
| "loss": 0.8674, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.18791373051462737, |
| "grad_norm": 0.5815020177486712, |
| "learning_rate": 5e-06, |
| "loss": 0.8658, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.20499679692504805, |
| "grad_norm": 0.612553188685598, |
| "learning_rate": 5e-06, |
| "loss": 0.8607, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.22207986333546872, |
| "grad_norm": 0.550502713970348, |
| "learning_rate": 5e-06, |
| "loss": 0.8597, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.23916292974588937, |
| "grad_norm": 0.6852509049344128, |
| "learning_rate": 5e-06, |
| "loss": 0.854, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.25624599615631005, |
| "grad_norm": 0.578544920290097, |
| "learning_rate": 5e-06, |
| "loss": 0.8583, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.27332906256673073, |
| "grad_norm": 0.5189175609736144, |
| "learning_rate": 5e-06, |
| "loss": 0.851, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.2904121289771514, |
| "grad_norm": 0.5087573435881564, |
| "learning_rate": 5e-06, |
| "loss": 0.8505, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.3074951953875721, |
| "grad_norm": 0.6966346988112697, |
| "learning_rate": 5e-06, |
| "loss": 0.8454, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.32457826179799276, |
| "grad_norm": 0.5997059592749316, |
| "learning_rate": 5e-06, |
| "loss": 0.8484, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.3416613282084134, |
| "grad_norm": 0.6852631417751086, |
| "learning_rate": 5e-06, |
| "loss": 0.8483, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.35874439461883406, |
| "grad_norm": 0.6164595381164006, |
| "learning_rate": 5e-06, |
| "loss": 0.8462, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.37582746102925474, |
| "grad_norm": 0.6904944801515591, |
| "learning_rate": 5e-06, |
| "loss": 0.8445, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.3929105274396754, |
| "grad_norm": 0.6512004476683885, |
| "learning_rate": 5e-06, |
| "loss": 0.8399, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.4099935938500961, |
| "grad_norm": 0.6184327993659001, |
| "learning_rate": 5e-06, |
| "loss": 0.8391, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.42707666026051677, |
| "grad_norm": 0.802165112115216, |
| "learning_rate": 5e-06, |
| "loss": 0.8384, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.44415972667093745, |
| "grad_norm": 0.6499733286909369, |
| "learning_rate": 5e-06, |
| "loss": 0.8364, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.4612427930813581, |
| "grad_norm": 0.5388351566067404, |
| "learning_rate": 5e-06, |
| "loss": 0.8303, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.47832585949177875, |
| "grad_norm": 0.5438990498346035, |
| "learning_rate": 5e-06, |
| "loss": 0.832, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.4954089259021994, |
| "grad_norm": 0.5187067150821502, |
| "learning_rate": 5e-06, |
| "loss": 0.8325, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.5124919923126201, |
| "grad_norm": 0.6697424824465937, |
| "learning_rate": 5e-06, |
| "loss": 0.8337, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5295750587230408, |
| "grad_norm": 0.6633949919937228, |
| "learning_rate": 5e-06, |
| "loss": 0.8296, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.5466581251334615, |
| "grad_norm": 0.5550184232084733, |
| "learning_rate": 5e-06, |
| "loss": 0.8293, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.5637411915438821, |
| "grad_norm": 0.725923344191194, |
| "learning_rate": 5e-06, |
| "loss": 0.8315, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.5808242579543028, |
| "grad_norm": 0.6017986140852183, |
| "learning_rate": 5e-06, |
| "loss": 0.828, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.5979073243647235, |
| "grad_norm": 0.5482521266135052, |
| "learning_rate": 5e-06, |
| "loss": 0.8299, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.6149903907751442, |
| "grad_norm": 0.5278215410540681, |
| "learning_rate": 5e-06, |
| "loss": 0.832, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.6320734571855648, |
| "grad_norm": 0.6984298439291815, |
| "learning_rate": 5e-06, |
| "loss": 0.8291, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.6491565235959855, |
| "grad_norm": 0.5017079870431141, |
| "learning_rate": 5e-06, |
| "loss": 0.827, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.6662395900064062, |
| "grad_norm": 0.5032298742038609, |
| "learning_rate": 5e-06, |
| "loss": 0.8272, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.6833226564168268, |
| "grad_norm": 0.5330416146652471, |
| "learning_rate": 5e-06, |
| "loss": 0.8247, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7004057228272474, |
| "grad_norm": 0.57373005832922, |
| "learning_rate": 5e-06, |
| "loss": 0.8242, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.7174887892376681, |
| "grad_norm": 0.5257091555093115, |
| "learning_rate": 5e-06, |
| "loss": 0.8266, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.7345718556480888, |
| "grad_norm": 0.5789856149074786, |
| "learning_rate": 5e-06, |
| "loss": 0.8242, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.7516549220585095, |
| "grad_norm": 0.5299653272462573, |
| "learning_rate": 5e-06, |
| "loss": 0.816, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.7687379884689302, |
| "grad_norm": 0.5964593947123102, |
| "learning_rate": 5e-06, |
| "loss": 0.8242, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.7858210548793508, |
| "grad_norm": 0.7283098877992732, |
| "learning_rate": 5e-06, |
| "loss": 0.8241, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.8029041212897715, |
| "grad_norm": 0.5985448658584591, |
| "learning_rate": 5e-06, |
| "loss": 0.8197, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.8199871877001922, |
| "grad_norm": 0.5623410005491558, |
| "learning_rate": 5e-06, |
| "loss": 0.8213, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.8370702541106129, |
| "grad_norm": 0.6408816581220068, |
| "learning_rate": 5e-06, |
| "loss": 0.823, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.8541533205210335, |
| "grad_norm": 0.6249632483859644, |
| "learning_rate": 5e-06, |
| "loss": 0.8184, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8712363869314542, |
| "grad_norm": 0.5922346578431508, |
| "learning_rate": 5e-06, |
| "loss": 0.8144, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.8883194533418749, |
| "grad_norm": 0.690797808616181, |
| "learning_rate": 5e-06, |
| "loss": 0.8179, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.9054025197522956, |
| "grad_norm": 0.5637410385766849, |
| "learning_rate": 5e-06, |
| "loss": 0.8153, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.9224855861627163, |
| "grad_norm": 0.6713092701845222, |
| "learning_rate": 5e-06, |
| "loss": 0.8156, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.9395686525731369, |
| "grad_norm": 0.5614251903253611, |
| "learning_rate": 5e-06, |
| "loss": 0.8151, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.9566517189835575, |
| "grad_norm": 0.488524190594288, |
| "learning_rate": 5e-06, |
| "loss": 0.8165, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.9737347853939782, |
| "grad_norm": 0.5588451830957717, |
| "learning_rate": 5e-06, |
| "loss": 0.8147, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.9908178518043989, |
| "grad_norm": 0.5319341754740086, |
| "learning_rate": 5e-06, |
| "loss": 0.8146, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.9993593850096092, |
| "eval_loss": 0.8145768046379089, |
| "eval_runtime": 623.2585, |
| "eval_samples_per_second": 25.311, |
| "eval_steps_per_second": 0.396, |
| "step": 585 |
| }, |
| { |
| "epoch": 1.0079009182148195, |
| "grad_norm": 0.7444773824556985, |
| "learning_rate": 5e-06, |
| "loss": 0.8416, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.0249839846252402, |
| "grad_norm": 0.6182001774270124, |
| "learning_rate": 5e-06, |
| "loss": 0.7786, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.0420670510356609, |
| "grad_norm": 0.5471145639195996, |
| "learning_rate": 5e-06, |
| "loss": 0.7689, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.0591501174460816, |
| "grad_norm": 0.5749072203498992, |
| "learning_rate": 5e-06, |
| "loss": 0.774, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.0762331838565022, |
| "grad_norm": 0.5458121480997504, |
| "learning_rate": 5e-06, |
| "loss": 0.7727, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.093316250266923, |
| "grad_norm": 0.57658998771773, |
| "learning_rate": 5e-06, |
| "loss": 0.7723, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.1103993166773436, |
| "grad_norm": 0.718911287142942, |
| "learning_rate": 5e-06, |
| "loss": 0.7761, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.1274823830877643, |
| "grad_norm": 0.7129614149484951, |
| "learning_rate": 5e-06, |
| "loss": 0.7791, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.144565449498185, |
| "grad_norm": 0.5411663435831485, |
| "learning_rate": 5e-06, |
| "loss": 0.7737, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.1616485159086056, |
| "grad_norm": 0.7987307718934373, |
| "learning_rate": 5e-06, |
| "loss": 0.7665, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.1787315823190263, |
| "grad_norm": 0.5752310814305064, |
| "learning_rate": 5e-06, |
| "loss": 0.7742, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.195814648729447, |
| "grad_norm": 0.5310768207788683, |
| "learning_rate": 5e-06, |
| "loss": 0.7738, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.2128977151398677, |
| "grad_norm": 0.5646734820206145, |
| "learning_rate": 5e-06, |
| "loss": 0.7745, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.2299807815502883, |
| "grad_norm": 0.5131848643270003, |
| "learning_rate": 5e-06, |
| "loss": 0.7749, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.247063847960709, |
| "grad_norm": 0.7018347821869065, |
| "learning_rate": 5e-06, |
| "loss": 0.7761, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.2641469143711297, |
| "grad_norm": 0.5677858771240941, |
| "learning_rate": 5e-06, |
| "loss": 0.7733, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.2812299807815504, |
| "grad_norm": 0.5314774866996713, |
| "learning_rate": 5e-06, |
| "loss": 0.7751, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.298313047191971, |
| "grad_norm": 0.6656368518895404, |
| "learning_rate": 5e-06, |
| "loss": 0.7749, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.3153961136023917, |
| "grad_norm": 0.5039492371844833, |
| "learning_rate": 5e-06, |
| "loss": 0.7741, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.3324791800128124, |
| "grad_norm": 0.5105033014036762, |
| "learning_rate": 5e-06, |
| "loss": 0.7784, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.349562246423233, |
| "grad_norm": 0.5030749236842763, |
| "learning_rate": 5e-06, |
| "loss": 0.7758, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.3666453128336538, |
| "grad_norm": 0.5846299051076495, |
| "learning_rate": 5e-06, |
| "loss": 0.7733, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.3837283792440744, |
| "grad_norm": 0.5992440442463463, |
| "learning_rate": 5e-06, |
| "loss": 0.7739, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.4008114456544951, |
| "grad_norm": 0.547090040748775, |
| "learning_rate": 5e-06, |
| "loss": 0.7744, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.4178945120649158, |
| "grad_norm": 0.5741884776951681, |
| "learning_rate": 5e-06, |
| "loss": 0.7703, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.4349775784753362, |
| "grad_norm": 0.5238258323687885, |
| "learning_rate": 5e-06, |
| "loss": 0.7701, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.452060644885757, |
| "grad_norm": 0.5265486787202277, |
| "learning_rate": 5e-06, |
| "loss": 0.7687, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.4691437112961776, |
| "grad_norm": 0.5591682134523662, |
| "learning_rate": 5e-06, |
| "loss": 0.7694, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.4862267777065983, |
| "grad_norm": 0.5637486227419112, |
| "learning_rate": 5e-06, |
| "loss": 0.7713, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.503309844117019, |
| "grad_norm": 0.5276872431482891, |
| "learning_rate": 5e-06, |
| "loss": 0.7687, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.5203929105274396, |
| "grad_norm": 0.5299879511165935, |
| "learning_rate": 5e-06, |
| "loss": 0.7719, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.5374759769378603, |
| "grad_norm": 0.48443212446653844, |
| "learning_rate": 5e-06, |
| "loss": 0.7704, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.554559043348281, |
| "grad_norm": 0.5258029162836203, |
| "learning_rate": 5e-06, |
| "loss": 0.7681, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.5716421097587017, |
| "grad_norm": 0.5839360099287706, |
| "learning_rate": 5e-06, |
| "loss": 0.772, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.5887251761691223, |
| "grad_norm": 0.5806331874369932, |
| "learning_rate": 5e-06, |
| "loss": 0.7736, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.605808242579543, |
| "grad_norm": 0.6613985728737157, |
| "learning_rate": 5e-06, |
| "loss": 0.7724, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.6228913089899637, |
| "grad_norm": 0.5224489011940004, |
| "learning_rate": 5e-06, |
| "loss": 0.7711, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.6399743754003844, |
| "grad_norm": 0.5454437716534818, |
| "learning_rate": 5e-06, |
| "loss": 0.7715, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.657057441810805, |
| "grad_norm": 0.5161526858636564, |
| "learning_rate": 5e-06, |
| "loss": 0.7783, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.6741405082212257, |
| "grad_norm": 0.7631274530949943, |
| "learning_rate": 5e-06, |
| "loss": 0.7721, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.6912235746316464, |
| "grad_norm": 0.6083126340996768, |
| "learning_rate": 5e-06, |
| "loss": 0.7718, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.708306641042067, |
| "grad_norm": 0.5310268793627193, |
| "learning_rate": 5e-06, |
| "loss": 0.7741, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.7253897074524878, |
| "grad_norm": 0.4883757515317452, |
| "learning_rate": 5e-06, |
| "loss": 0.771, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.7424727738629084, |
| "grad_norm": 0.5215621795180689, |
| "learning_rate": 5e-06, |
| "loss": 0.7728, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.759555840273329, |
| "grad_norm": 0.49887803010112675, |
| "learning_rate": 5e-06, |
| "loss": 0.7702, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.7766389066837496, |
| "grad_norm": 0.538143965723932, |
| "learning_rate": 5e-06, |
| "loss": 0.7695, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.7937219730941703, |
| "grad_norm": 0.5943865951120142, |
| "learning_rate": 5e-06, |
| "loss": 0.7713, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.810805039504591, |
| "grad_norm": 0.5034904524114908, |
| "learning_rate": 5e-06, |
| "loss": 0.7707, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.8278881059150116, |
| "grad_norm": 0.5739027654813702, |
| "learning_rate": 5e-06, |
| "loss": 0.767, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.8449711723254323, |
| "grad_norm": 0.5345337736484315, |
| "learning_rate": 5e-06, |
| "loss": 0.7739, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.862054238735853, |
| "grad_norm": 0.8233000709404549, |
| "learning_rate": 5e-06, |
| "loss": 0.7697, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.8791373051462736, |
| "grad_norm": 0.5699216631288021, |
| "learning_rate": 5e-06, |
| "loss": 0.7715, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.8962203715566943, |
| "grad_norm": 0.5480692157923471, |
| "learning_rate": 5e-06, |
| "loss": 0.7715, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.913303437967115, |
| "grad_norm": 0.6493917189844246, |
| "learning_rate": 5e-06, |
| "loss": 0.776, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.9303865043775357, |
| "grad_norm": 0.4971193417821817, |
| "learning_rate": 5e-06, |
| "loss": 0.7689, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.9474695707879563, |
| "grad_norm": 0.5213534104860004, |
| "learning_rate": 5e-06, |
| "loss": 0.7691, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.964552637198377, |
| "grad_norm": 0.5515331643144213, |
| "learning_rate": 5e-06, |
| "loss": 0.7684, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.9816357036087977, |
| "grad_norm": 0.6132524891266977, |
| "learning_rate": 5e-06, |
| "loss": 0.7651, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.9987187700192184, |
| "grad_norm": 0.5207251406889574, |
| "learning_rate": 5e-06, |
| "loss": 0.7679, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.9987187700192184, |
| "eval_loss": 0.8001261949539185, |
| "eval_runtime": 623.1773, |
| "eval_samples_per_second": 25.314, |
| "eval_steps_per_second": 0.396, |
| "step": 1170 |
| }, |
| { |
| "epoch": 2.015801836429639, |
| "grad_norm": 0.696280654661383, |
| "learning_rate": 5e-06, |
| "loss": 0.7761, |
| "step": 1180 |
| }, |
| { |
| "epoch": 2.0328849028400597, |
| "grad_norm": 0.6141244434895877, |
| "learning_rate": 5e-06, |
| "loss": 0.7251, |
| "step": 1190 |
| }, |
| { |
| "epoch": 2.0499679692504804, |
| "grad_norm": 0.6049456191917907, |
| "learning_rate": 5e-06, |
| "loss": 0.7275, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.067051035660901, |
| "grad_norm": 0.5271804189451788, |
| "learning_rate": 5e-06, |
| "loss": 0.7222, |
| "step": 1210 |
| }, |
| { |
| "epoch": 2.0841341020713218, |
| "grad_norm": 0.5825184609454925, |
| "learning_rate": 5e-06, |
| "loss": 0.7265, |
| "step": 1220 |
| }, |
| { |
| "epoch": 2.1012171684817424, |
| "grad_norm": 0.5032946228138522, |
| "learning_rate": 5e-06, |
| "loss": 0.7253, |
| "step": 1230 |
| }, |
| { |
| "epoch": 2.118300234892163, |
| "grad_norm": 0.5147327767567343, |
| "learning_rate": 5e-06, |
| "loss": 0.7237, |
| "step": 1240 |
| }, |
| { |
| "epoch": 2.135383301302584, |
| "grad_norm": 0.5337482087146928, |
| "learning_rate": 5e-06, |
| "loss": 0.7281, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.1524663677130045, |
| "grad_norm": 0.59826922882338, |
| "learning_rate": 5e-06, |
| "loss": 0.7328, |
| "step": 1260 |
| }, |
| { |
| "epoch": 2.169549434123425, |
| "grad_norm": 0.5946570187866338, |
| "learning_rate": 5e-06, |
| "loss": 0.7295, |
| "step": 1270 |
| }, |
| { |
| "epoch": 2.186632500533846, |
| "grad_norm": 0.6437960040336966, |
| "learning_rate": 5e-06, |
| "loss": 0.7337, |
| "step": 1280 |
| }, |
| { |
| "epoch": 2.2037155669442665, |
| "grad_norm": 0.5667571654097528, |
| "learning_rate": 5e-06, |
| "loss": 0.7257, |
| "step": 1290 |
| }, |
| { |
| "epoch": 2.220798633354687, |
| "grad_norm": 0.5711027651356156, |
| "learning_rate": 5e-06, |
| "loss": 0.7295, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.237881699765108, |
| "grad_norm": 0.5604913831263466, |
| "learning_rate": 5e-06, |
| "loss": 0.7258, |
| "step": 1310 |
| }, |
| { |
| "epoch": 2.2549647661755285, |
| "grad_norm": 0.49584992475251044, |
| "learning_rate": 5e-06, |
| "loss": 0.7282, |
| "step": 1320 |
| }, |
| { |
| "epoch": 2.2720478325859492, |
| "grad_norm": 0.48636367384700585, |
| "learning_rate": 5e-06, |
| "loss": 0.7289, |
| "step": 1330 |
| }, |
| { |
| "epoch": 2.28913089899637, |
| "grad_norm": 0.5593094273198317, |
| "learning_rate": 5e-06, |
| "loss": 0.7261, |
| "step": 1340 |
| }, |
| { |
| "epoch": 2.3062139654067906, |
| "grad_norm": 0.5334937979304919, |
| "learning_rate": 5e-06, |
| "loss": 0.7274, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.3232970318172113, |
| "grad_norm": 0.63384705400206, |
| "learning_rate": 5e-06, |
| "loss": 0.7295, |
| "step": 1360 |
| }, |
| { |
| "epoch": 2.340380098227632, |
| "grad_norm": 0.5143434005458392, |
| "learning_rate": 5e-06, |
| "loss": 0.7283, |
| "step": 1370 |
| }, |
| { |
| "epoch": 2.3574631646380526, |
| "grad_norm": 0.6076783258962472, |
| "learning_rate": 5e-06, |
| "loss": 0.7296, |
| "step": 1380 |
| }, |
| { |
| "epoch": 2.3745462310484733, |
| "grad_norm": 0.6067756593571197, |
| "learning_rate": 5e-06, |
| "loss": 0.7248, |
| "step": 1390 |
| }, |
| { |
| "epoch": 2.391629297458894, |
| "grad_norm": 0.6071395306047531, |
| "learning_rate": 5e-06, |
| "loss": 0.7318, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.4087123638693146, |
| "grad_norm": 0.5997962743795122, |
| "learning_rate": 5e-06, |
| "loss": 0.7272, |
| "step": 1410 |
| }, |
| { |
| "epoch": 2.4257954302797353, |
| "grad_norm": 0.6633499264729928, |
| "learning_rate": 5e-06, |
| "loss": 0.7277, |
| "step": 1420 |
| }, |
| { |
| "epoch": 2.442878496690156, |
| "grad_norm": 0.5544821713317218, |
| "learning_rate": 5e-06, |
| "loss": 0.7307, |
| "step": 1430 |
| }, |
| { |
| "epoch": 2.4599615631005767, |
| "grad_norm": 0.5348856989916878, |
| "learning_rate": 5e-06, |
| "loss": 0.7313, |
| "step": 1440 |
| }, |
| { |
| "epoch": 2.4770446295109974, |
| "grad_norm": 0.5567282104551005, |
| "learning_rate": 5e-06, |
| "loss": 0.7304, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.494127695921418, |
| "grad_norm": 0.49476735212888745, |
| "learning_rate": 5e-06, |
| "loss": 0.7301, |
| "step": 1460 |
| }, |
| { |
| "epoch": 2.5112107623318387, |
| "grad_norm": 0.5172195774829064, |
| "learning_rate": 5e-06, |
| "loss": 0.7279, |
| "step": 1470 |
| }, |
| { |
| "epoch": 2.5282938287422594, |
| "grad_norm": 0.5822469173059942, |
| "learning_rate": 5e-06, |
| "loss": 0.728, |
| "step": 1480 |
| }, |
| { |
| "epoch": 2.54537689515268, |
| "grad_norm": 0.5665275024242861, |
| "learning_rate": 5e-06, |
| "loss": 0.727, |
| "step": 1490 |
| }, |
| { |
| "epoch": 2.5624599615631007, |
| "grad_norm": 0.5261365140179813, |
| "learning_rate": 5e-06, |
| "loss": 0.7326, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.5795430279735214, |
| "grad_norm": 0.5317881820166406, |
| "learning_rate": 5e-06, |
| "loss": 0.7316, |
| "step": 1510 |
| }, |
| { |
| "epoch": 2.596626094383942, |
| "grad_norm": 0.5623482133625999, |
| "learning_rate": 5e-06, |
| "loss": 0.7312, |
| "step": 1520 |
| }, |
| { |
| "epoch": 2.6137091607943628, |
| "grad_norm": 0.5379867031953368, |
| "learning_rate": 5e-06, |
| "loss": 0.729, |
| "step": 1530 |
| }, |
| { |
| "epoch": 2.6307922272047835, |
| "grad_norm": 0.552606133346205, |
| "learning_rate": 5e-06, |
| "loss": 0.7282, |
| "step": 1540 |
| }, |
| { |
| "epoch": 2.647875293615204, |
| "grad_norm": 0.5260176310975024, |
| "learning_rate": 5e-06, |
| "loss": 0.7316, |
| "step": 1550 |
| }, |
| { |
| "epoch": 2.664958360025625, |
| "grad_norm": 0.5417204925891144, |
| "learning_rate": 5e-06, |
| "loss": 0.7311, |
| "step": 1560 |
| }, |
| { |
| "epoch": 2.6820414264360455, |
| "grad_norm": 0.5191792624557837, |
| "learning_rate": 5e-06, |
| "loss": 0.7317, |
| "step": 1570 |
| }, |
| { |
| "epoch": 2.699124492846466, |
| "grad_norm": 0.5082503207244659, |
| "learning_rate": 5e-06, |
| "loss": 0.7308, |
| "step": 1580 |
| }, |
| { |
| "epoch": 2.716207559256887, |
| "grad_norm": 0.5352199374254042, |
| "learning_rate": 5e-06, |
| "loss": 0.7322, |
| "step": 1590 |
| }, |
| { |
| "epoch": 2.7332906256673075, |
| "grad_norm": 0.4938969791102419, |
| "learning_rate": 5e-06, |
| "loss": 0.7299, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.750373692077728, |
| "grad_norm": 0.5349619842682974, |
| "learning_rate": 5e-06, |
| "loss": 0.7311, |
| "step": 1610 |
| }, |
| { |
| "epoch": 2.767456758488149, |
| "grad_norm": 0.5982776306942509, |
| "learning_rate": 5e-06, |
| "loss": 0.7326, |
| "step": 1620 |
| }, |
| { |
| "epoch": 2.7845398248985695, |
| "grad_norm": 0.5610641447482575, |
| "learning_rate": 5e-06, |
| "loss": 0.7283, |
| "step": 1630 |
| }, |
| { |
| "epoch": 2.8016228913089902, |
| "grad_norm": 0.5289582066062115, |
| "learning_rate": 5e-06, |
| "loss": 0.7322, |
| "step": 1640 |
| }, |
| { |
| "epoch": 2.818705957719411, |
| "grad_norm": 0.5307084408188756, |
| "learning_rate": 5e-06, |
| "loss": 0.7305, |
| "step": 1650 |
| }, |
| { |
| "epoch": 2.8357890241298316, |
| "grad_norm": 0.5768867367143191, |
| "learning_rate": 5e-06, |
| "loss": 0.7318, |
| "step": 1660 |
| }, |
| { |
| "epoch": 2.852872090540252, |
| "grad_norm": 0.5013755884966334, |
| "learning_rate": 5e-06, |
| "loss": 0.7261, |
| "step": 1670 |
| }, |
| { |
| "epoch": 2.8699551569506725, |
| "grad_norm": 0.5386292168646896, |
| "learning_rate": 5e-06, |
| "loss": 0.7326, |
| "step": 1680 |
| }, |
| { |
| "epoch": 2.887038223361093, |
| "grad_norm": 0.5042887110473108, |
| "learning_rate": 5e-06, |
| "loss": 0.7267, |
| "step": 1690 |
| }, |
| { |
| "epoch": 2.904121289771514, |
| "grad_norm": 0.5778864247918416, |
| "learning_rate": 5e-06, |
| "loss": 0.7304, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.9212043561819345, |
| "grad_norm": 0.4628270969407437, |
| "learning_rate": 5e-06, |
| "loss": 0.7337, |
| "step": 1710 |
| }, |
| { |
| "epoch": 2.938287422592355, |
| "grad_norm": 0.4828533328054976, |
| "learning_rate": 5e-06, |
| "loss": 0.7305, |
| "step": 1720 |
| }, |
| { |
| "epoch": 2.955370489002776, |
| "grad_norm": 0.5335294858018457, |
| "learning_rate": 5e-06, |
| "loss": 0.7255, |
| "step": 1730 |
| }, |
| { |
| "epoch": 2.9724535554131966, |
| "grad_norm": 0.4855853932089583, |
| "learning_rate": 5e-06, |
| "loss": 0.7299, |
| "step": 1740 |
| }, |
| { |
| "epoch": 2.9895366218236172, |
| "grad_norm": 0.4876186210552259, |
| "learning_rate": 5e-06, |
| "loss": 0.7294, |
| "step": 1750 |
| }, |
| { |
| "epoch": 2.9980781550288276, |
| "eval_loss": 0.7988265156745911, |
| "eval_runtime": 623.1353, |
| "eval_samples_per_second": 25.316, |
| "eval_steps_per_second": 0.396, |
| "step": 1755 |
| }, |
| { |
| "epoch": 2.9980781550288276, |
| "step": 1755, |
| "total_flos": 2939480986091520.0, |
| "train_loss": 0.7835989086716263, |
| "train_runtime": 103740.1935, |
| "train_samples_per_second": 8.667, |
| "train_steps_per_second": 0.017 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1755, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2939480986091520.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|