| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 1722, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.017421602787456445, | |
| "grad_norm": 1.110547889420809, | |
| "learning_rate": 5e-06, | |
| "loss": 0.8085, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.03484320557491289, | |
| "grad_norm": 0.9516650393814576, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7273, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.05226480836236934, | |
| "grad_norm": 0.8745560758252257, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7151, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.06968641114982578, | |
| "grad_norm": 0.6458692027569163, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7042, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.08710801393728224, | |
| "grad_norm": 0.6419702080069286, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6797, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.10452961672473868, | |
| "grad_norm": 1.3167787941292113, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6939, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.12195121951219512, | |
| "grad_norm": 0.5165069959452926, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6731, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.13937282229965156, | |
| "grad_norm": 0.6494172258411438, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6653, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.156794425087108, | |
| "grad_norm": 0.8433361988376273, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6834, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.17421602787456447, | |
| "grad_norm": 0.7333617572332245, | |
| "learning_rate": 5e-06, | |
| "loss": 0.674, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.1916376306620209, | |
| "grad_norm": 0.46328020438675316, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6705, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.20905923344947736, | |
| "grad_norm": 0.5375878274323563, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6546, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.2264808362369338, | |
| "grad_norm": 0.6043348556726694, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6506, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.24390243902439024, | |
| "grad_norm": 0.4508554439777698, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6536, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.2613240418118467, | |
| "grad_norm": 1.1359138470696808, | |
| "learning_rate": 5e-06, | |
| "loss": 0.638, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.2787456445993031, | |
| "grad_norm": 0.5757321381081676, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6492, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.2961672473867596, | |
| "grad_norm": 0.49845234216314765, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6481, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.313588850174216, | |
| "grad_norm": 0.4885975758468272, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6445, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.3310104529616725, | |
| "grad_norm": 0.46363877509236, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6437, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.34843205574912894, | |
| "grad_norm": 0.4685718300832952, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6386, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.36585365853658536, | |
| "grad_norm": 0.5605596050023267, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6507, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.3832752613240418, | |
| "grad_norm": 0.6403424736477891, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6418, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.40069686411149824, | |
| "grad_norm": 0.5838158484945601, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6493, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.4181184668989547, | |
| "grad_norm": 0.45330751179903367, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6498, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.4355400696864111, | |
| "grad_norm": 0.47780598447174244, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6359, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.4529616724738676, | |
| "grad_norm": 0.4754778849361075, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6402, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.47038327526132406, | |
| "grad_norm": 0.9267353139931963, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6453, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.4878048780487805, | |
| "grad_norm": 1.09959569850648, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6381, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.5052264808362369, | |
| "grad_norm": 0.46907276389959596, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6142, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.5226480836236934, | |
| "grad_norm": 0.9009500821986945, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6349, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.5400696864111498, | |
| "grad_norm": 0.8732778985630532, | |
| "learning_rate": 5e-06, | |
| "loss": 0.633, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.5574912891986062, | |
| "grad_norm": 0.5330855012377655, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6292, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.5749128919860628, | |
| "grad_norm": 0.5021412884291475, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6253, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.5923344947735192, | |
| "grad_norm": 0.5447288950929059, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6308, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.6097560975609756, | |
| "grad_norm": 0.7644080552686335, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6267, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.627177700348432, | |
| "grad_norm": 0.4607717863440372, | |
| "learning_rate": 5e-06, | |
| "loss": 0.637, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.6445993031358885, | |
| "grad_norm": 0.6059522443076883, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6276, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.662020905923345, | |
| "grad_norm": 0.6368838367327173, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6227, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.6794425087108014, | |
| "grad_norm": 0.4571682914981839, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6204, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.6968641114982579, | |
| "grad_norm": 0.7704076665510348, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6332, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 0.48016934744783224, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6317, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.7317073170731707, | |
| "grad_norm": 0.4659861774424128, | |
| "learning_rate": 5e-06, | |
| "loss": 0.614, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.7491289198606271, | |
| "grad_norm": 0.5709628311962424, | |
| "learning_rate": 5e-06, | |
| "loss": 0.617, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.7665505226480837, | |
| "grad_norm": 0.9275541597160887, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6259, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.7839721254355401, | |
| "grad_norm": 0.4590863644330183, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6432, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.8013937282229965, | |
| "grad_norm": 0.44311113231679206, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6325, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.818815331010453, | |
| "grad_norm": 0.42872958673136763, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6254, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.8362369337979094, | |
| "grad_norm": 0.6968494949424339, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6282, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.8536585365853658, | |
| "grad_norm": 0.475991569929859, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6104, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.8710801393728222, | |
| "grad_norm": 0.4036764356634414, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6192, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8885017421602788, | |
| "grad_norm": 0.4800314428435892, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6226, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.9059233449477352, | |
| "grad_norm": 0.4448196930678713, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6141, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.9233449477351916, | |
| "grad_norm": 0.43823469299056167, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6137, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.9407665505226481, | |
| "grad_norm": 0.46401352050703015, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6155, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.9581881533101045, | |
| "grad_norm": 0.4741335427712111, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6239, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.975609756097561, | |
| "grad_norm": 0.5404449413086233, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6228, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.9930313588850174, | |
| "grad_norm": 0.4865700073464584, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6057, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.6242489218711853, | |
| "eval_runtime": 156.3164, | |
| "eval_samples_per_second": 98.902, | |
| "eval_steps_per_second": 0.39, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 1.0104529616724738, | |
| "grad_norm": 0.49999642055125576, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5959, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.0278745644599303, | |
| "grad_norm": 0.45102982348932047, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5783, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.0452961672473868, | |
| "grad_norm": 0.4675746175963686, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5907, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.0627177700348431, | |
| "grad_norm": 0.4332204857821093, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5721, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.0801393728222997, | |
| "grad_norm": 0.46797084979476816, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5728, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.0975609756097562, | |
| "grad_norm": 0.45462105008725134, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5766, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.1149825783972125, | |
| "grad_norm": 0.45583632727945284, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5684, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.132404181184669, | |
| "grad_norm": 0.6045690088290018, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5839, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.1498257839721253, | |
| "grad_norm": 0.4772407436839459, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5745, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.1672473867595818, | |
| "grad_norm": 0.7134135897555454, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5763, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.1846689895470384, | |
| "grad_norm": 0.5626507275472277, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5699, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.202090592334495, | |
| "grad_norm": 0.4587849239706485, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5975, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.2195121951219512, | |
| "grad_norm": 0.45602175911022613, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5727, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.2369337979094077, | |
| "grad_norm": 0.4760989235025255, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5743, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.254355400696864, | |
| "grad_norm": 0.57636457602915, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5723, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.2717770034843205, | |
| "grad_norm": 0.7070463930124464, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5734, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.289198606271777, | |
| "grad_norm": 0.451570248773914, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5654, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.3066202090592334, | |
| "grad_norm": 0.4634040338505886, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5734, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.32404181184669, | |
| "grad_norm": 0.461184295216135, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5659, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.3414634146341464, | |
| "grad_norm": 0.5502438318822999, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5638, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.3588850174216027, | |
| "grad_norm": 0.40334764475120544, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5741, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.3763066202090593, | |
| "grad_norm": 0.46619326588491156, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5816, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.3937282229965158, | |
| "grad_norm": 0.43450365905550786, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5879, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.411149825783972, | |
| "grad_norm": 0.4491332694126357, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5834, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.4285714285714286, | |
| "grad_norm": 0.4492632982117168, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5704, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.445993031358885, | |
| "grad_norm": 0.48204516908878015, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5726, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.4634146341463414, | |
| "grad_norm": 0.4451434847610245, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5725, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.480836236933798, | |
| "grad_norm": 0.4817077921497448, | |
| "learning_rate": 5e-06, | |
| "loss": 0.578, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.4982578397212545, | |
| "grad_norm": 0.4114565463440688, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5756, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.5156794425087108, | |
| "grad_norm": 0.4364153090393298, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5781, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.533101045296167, | |
| "grad_norm": 0.4641250735638994, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5858, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.5505226480836236, | |
| "grad_norm": 0.5414804208775645, | |
| "learning_rate": 5e-06, | |
| "loss": 0.576, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.5679442508710801, | |
| "grad_norm": 0.4046063470124716, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5764, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.5853658536585367, | |
| "grad_norm": 0.5021694200790203, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5832, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.6027874564459932, | |
| "grad_norm": 0.44359324558365465, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5702, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.6202090592334495, | |
| "grad_norm": 0.4590082845753106, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5702, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.6376306620209058, | |
| "grad_norm": 0.42340672053613443, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5788, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.6550522648083623, | |
| "grad_norm": 0.4144352252936342, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5766, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.6724738675958188, | |
| "grad_norm": 0.4171029941084573, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5695, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.6898954703832754, | |
| "grad_norm": 0.43911657202438914, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5661, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.7073170731707317, | |
| "grad_norm": 0.4114986035010171, | |
| "learning_rate": 5e-06, | |
| "loss": 0.571, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.7247386759581882, | |
| "grad_norm": 0.43109782590499385, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5747, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.7421602787456445, | |
| "grad_norm": 0.4565580100991871, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5771, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.759581881533101, | |
| "grad_norm": 0.4138828268586911, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5629, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.7770034843205575, | |
| "grad_norm": 0.4542995316054866, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5856, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.794425087108014, | |
| "grad_norm": 0.6622604112941458, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5679, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.8118466898954704, | |
| "grad_norm": 0.4507517659371942, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5774, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.8292682926829267, | |
| "grad_norm": 0.4309645347763417, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5704, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.8466898954703832, | |
| "grad_norm": 0.41185890423116756, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5734, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.8641114982578397, | |
| "grad_norm": 0.4455851719114888, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5747, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.8815331010452963, | |
| "grad_norm": 0.41686045633860264, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5771, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.8989547038327528, | |
| "grad_norm": 0.4333938820779194, | |
| "learning_rate": 5e-06, | |
| "loss": 0.579, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.916376306620209, | |
| "grad_norm": 0.4045207725207787, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5758, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.9337979094076654, | |
| "grad_norm": 0.4520222740004976, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5563, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.951219512195122, | |
| "grad_norm": 0.4311997583086096, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5681, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.9686411149825784, | |
| "grad_norm": 0.4383316797763638, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5819, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.986062717770035, | |
| "grad_norm": 0.5477336634704406, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5711, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.6141585111618042, | |
| "eval_runtime": 156.1142, | |
| "eval_samples_per_second": 99.03, | |
| "eval_steps_per_second": 0.391, | |
| "step": 1148 | |
| }, | |
| { | |
| "epoch": 2.0034843205574915, | |
| "grad_norm": 0.4640944945247334, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5571, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.0209059233449476, | |
| "grad_norm": 0.45843359238558995, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5249, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 2.038327526132404, | |
| "grad_norm": 0.4837328490077125, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5282, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 2.0557491289198606, | |
| "grad_norm": 0.44590774801900834, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5337, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 2.073170731707317, | |
| "grad_norm": 0.43363076292724995, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5374, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 2.0905923344947737, | |
| "grad_norm": 0.4931511449549238, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5309, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.10801393728223, | |
| "grad_norm": 0.4504251344307032, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5271, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 2.1254355400696863, | |
| "grad_norm": 0.49526543085134606, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5319, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 2.142857142857143, | |
| "grad_norm": 0.4677844909297682, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5306, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 2.1602787456445993, | |
| "grad_norm": 0.47260212803851326, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5228, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 2.177700348432056, | |
| "grad_norm": 0.5490671664415253, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5276, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.1951219512195124, | |
| "grad_norm": 0.4780364714560436, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5307, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 2.2125435540069684, | |
| "grad_norm": 0.44095382556774654, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5273, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 2.229965156794425, | |
| "grad_norm": 0.471163066518852, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5095, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 2.2473867595818815, | |
| "grad_norm": 0.4509784260620687, | |
| "learning_rate": 5e-06, | |
| "loss": 0.525, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 2.264808362369338, | |
| "grad_norm": 0.42926159657173546, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5254, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.2822299651567945, | |
| "grad_norm": 0.47746501662453417, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5229, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 2.2996515679442506, | |
| "grad_norm": 0.4747495543682741, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5259, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.317073170731707, | |
| "grad_norm": 0.5228901131403929, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5243, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 2.3344947735191637, | |
| "grad_norm": 0.4834167699517272, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5306, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 2.35191637630662, | |
| "grad_norm": 0.45707310674276885, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5276, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.3693379790940767, | |
| "grad_norm": 0.47685998083437814, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5312, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.3867595818815333, | |
| "grad_norm": 0.47877017672465005, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5274, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 2.40418118466899, | |
| "grad_norm": 0.5258496984168791, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5226, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.421602787456446, | |
| "grad_norm": 0.47709678060118926, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5189, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.4390243902439024, | |
| "grad_norm": 0.581254404109644, | |
| "learning_rate": 5e-06, | |
| "loss": 0.54, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.456445993031359, | |
| "grad_norm": 0.43351552520462094, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5345, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.4738675958188154, | |
| "grad_norm": 0.4485068008247544, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5361, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 2.491289198606272, | |
| "grad_norm": 0.4729416207097551, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5351, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 2.508710801393728, | |
| "grad_norm": 0.44072110157252653, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5294, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.5261324041811846, | |
| "grad_norm": 0.5219007442544961, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5238, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.543554006968641, | |
| "grad_norm": 0.4809656167510028, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5288, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 2.5609756097560976, | |
| "grad_norm": 0.4531184946595288, | |
| "learning_rate": 5e-06, | |
| "loss": 0.536, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.578397212543554, | |
| "grad_norm": 0.48164003860440047, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5324, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 2.59581881533101, | |
| "grad_norm": 0.4758292139458944, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5133, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 2.6132404181184667, | |
| "grad_norm": 0.4634797349634793, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5295, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.6306620209059233, | |
| "grad_norm": 0.47375952821313455, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5273, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 2.64808362369338, | |
| "grad_norm": 0.4441843444201693, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5334, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 2.6655052264808363, | |
| "grad_norm": 0.42211766638465, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5138, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 2.682926829268293, | |
| "grad_norm": 0.48817131427198845, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5134, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 2.7003484320557494, | |
| "grad_norm": 0.45942488735176895, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5361, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.7177700348432055, | |
| "grad_norm": 0.4854175238598141, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5394, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 2.735191637630662, | |
| "grad_norm": 0.4604289027255158, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5187, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 2.7526132404181185, | |
| "grad_norm": 0.4936790258884402, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5246, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 2.770034843205575, | |
| "grad_norm": 0.541536408959734, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5278, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 2.7874564459930316, | |
| "grad_norm": 0.484918416204369, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5382, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.8048780487804876, | |
| "grad_norm": 0.4885761374101617, | |
| "learning_rate": 5e-06, | |
| "loss": 0.533, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 2.822299651567944, | |
| "grad_norm": 0.4349789229036019, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5388, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 2.8397212543554007, | |
| "grad_norm": 0.4314276877403793, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5391, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 2.857142857142857, | |
| "grad_norm": 0.43396532487130346, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5302, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 2.8745644599303137, | |
| "grad_norm": 0.43625746208041316, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5275, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.89198606271777, | |
| "grad_norm": 0.4646206916137346, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5209, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 2.9094076655052263, | |
| "grad_norm": 0.4840563025343255, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5246, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.926829268292683, | |
| "grad_norm": 0.45066810285940856, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5318, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.9442508710801394, | |
| "grad_norm": 0.46846871284532293, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5324, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 2.961672473867596, | |
| "grad_norm": 0.5421178557930114, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5317, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.979094076655052, | |
| "grad_norm": 0.44573049149554844, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5284, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.996515679442509, | |
| "grad_norm": 0.4720545406402456, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5286, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.6146459579467773, | |
| "eval_runtime": 156.0276, | |
| "eval_samples_per_second": 99.085, | |
| "eval_steps_per_second": 0.391, | |
| "step": 1722 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 1722, | |
| "total_flos": 2883576618024960.0, | |
| "train_loss": 0.5825082723410426, | |
| "train_runtime": 26219.2762, | |
| "train_samples_per_second": 33.609, | |
| "train_steps_per_second": 0.066 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1722, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2883576618024960.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |