| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 3063, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0032653061224489797, | |
| "grad_norm": 1.763192892074585, | |
| "learning_rate": 0.0005, | |
| "loss": 6.1164, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.006530612244897959, | |
| "grad_norm": 1.1696356534957886, | |
| "learning_rate": 0.0005, | |
| "loss": 5.8499, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.009795918367346938, | |
| "grad_norm": 1.1040449142456055, | |
| "learning_rate": 0.0005, | |
| "loss": 5.6782, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.013061224489795919, | |
| "grad_norm": 1.3148446083068848, | |
| "learning_rate": 0.0005, | |
| "loss": 5.6237, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.0163265306122449, | |
| "grad_norm": 1.438623070716858, | |
| "learning_rate": 0.0005, | |
| "loss": 5.5178, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.019591836734693877, | |
| "grad_norm": 1.2186000347137451, | |
| "learning_rate": 0.0005, | |
| "loss": 5.5895, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.022857142857142857, | |
| "grad_norm": 1.088930606842041, | |
| "learning_rate": 0.0005, | |
| "loss": 5.4889, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.026122448979591838, | |
| "grad_norm": 1.1969929933547974, | |
| "learning_rate": 0.0005, | |
| "loss": 5.3515, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.029387755102040815, | |
| "grad_norm": 1.0405751466751099, | |
| "learning_rate": 0.0005, | |
| "loss": 5.2943, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.0326530612244898, | |
| "grad_norm": 1.6840550899505615, | |
| "learning_rate": 0.0005, | |
| "loss": 5.1347, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.035918367346938776, | |
| "grad_norm": 1.0448979139328003, | |
| "learning_rate": 0.0005, | |
| "loss": 5.2275, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.03918367346938775, | |
| "grad_norm": 1.2239587306976318, | |
| "learning_rate": 0.0005, | |
| "loss": 5.1866, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.04244897959183674, | |
| "grad_norm": 1.0111207962036133, | |
| "learning_rate": 0.0005, | |
| "loss": 5.1201, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.045714285714285714, | |
| "grad_norm": 1.0412639379501343, | |
| "learning_rate": 0.0005, | |
| "loss": 5.0317, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.04897959183673469, | |
| "grad_norm": 1.6490758657455444, | |
| "learning_rate": 0.0005, | |
| "loss": 4.8709, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.052244897959183675, | |
| "grad_norm": 1.320185661315918, | |
| "learning_rate": 0.0005, | |
| "loss": 5.0482, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.05551020408163265, | |
| "grad_norm": 1.23984694480896, | |
| "learning_rate": 0.0005, | |
| "loss": 5.1341, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.05877551020408163, | |
| "grad_norm": 1.0305520296096802, | |
| "learning_rate": 0.0005, | |
| "loss": 4.9743, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.062040816326530614, | |
| "grad_norm": 1.1690291166305542, | |
| "learning_rate": 0.0005, | |
| "loss": 4.8896, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.0653061224489796, | |
| "grad_norm": 1.5160081386566162, | |
| "learning_rate": 0.0005, | |
| "loss": 4.7129, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.06857142857142857, | |
| "grad_norm": 1.293174147605896, | |
| "learning_rate": 0.0005, | |
| "loss": 4.8936, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.07183673469387755, | |
| "grad_norm": 0.9752352237701416, | |
| "learning_rate": 0.0005, | |
| "loss": 4.9842, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.07510204081632653, | |
| "grad_norm": 1.2358481884002686, | |
| "learning_rate": 0.0005, | |
| "loss": 4.8956, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.0783673469387755, | |
| "grad_norm": 1.4549710750579834, | |
| "learning_rate": 0.0005, | |
| "loss": 4.7804, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.08163265306122448, | |
| "grad_norm": 1.6275160312652588, | |
| "learning_rate": 0.0005, | |
| "loss": 4.6489, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.08489795918367347, | |
| "grad_norm": 1.3446155786514282, | |
| "learning_rate": 0.0005, | |
| "loss": 4.8603, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.08816326530612245, | |
| "grad_norm": 1.3125636577606201, | |
| "learning_rate": 0.0005, | |
| "loss": 4.8614, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 1.5089269876480103, | |
| "learning_rate": 0.0005, | |
| "loss": 4.7528, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.0946938775510204, | |
| "grad_norm": 1.240677833557129, | |
| "learning_rate": 0.0005, | |
| "loss": 4.6491, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.09795918367346938, | |
| "grad_norm": 1.6580984592437744, | |
| "learning_rate": 0.0005, | |
| "loss": 4.6021, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.10122448979591837, | |
| "grad_norm": 1.293299674987793, | |
| "learning_rate": 0.0005, | |
| "loss": 4.6556, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.10448979591836735, | |
| "grad_norm": 1.0091382265090942, | |
| "learning_rate": 0.0005, | |
| "loss": 4.8806, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.10775510204081633, | |
| "grad_norm": 1.2688143253326416, | |
| "learning_rate": 0.0005, | |
| "loss": 4.706, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.1110204081632653, | |
| "grad_norm": 1.514876365661621, | |
| "learning_rate": 0.0005, | |
| "loss": 4.649, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 1.8324439525604248, | |
| "learning_rate": 0.0005, | |
| "loss": 4.4951, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.11755102040816326, | |
| "grad_norm": 1.6239265203475952, | |
| "learning_rate": 0.0005, | |
| "loss": 4.5715, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.12081632653061225, | |
| "grad_norm": 1.2813515663146973, | |
| "learning_rate": 0.0005, | |
| "loss": 4.8563, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.12408163265306123, | |
| "grad_norm": 1.2220065593719482, | |
| "learning_rate": 0.0005, | |
| "loss": 4.6549, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.1273469387755102, | |
| "grad_norm": 1.176604986190796, | |
| "learning_rate": 0.0005, | |
| "loss": 4.5817, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.1306122448979592, | |
| "grad_norm": 1.7251240015029907, | |
| "learning_rate": 0.0005, | |
| "loss": 4.4422, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.13387755102040816, | |
| "grad_norm": 1.2302827835083008, | |
| "learning_rate": 0.0005, | |
| "loss": 4.4658, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.13714285714285715, | |
| "grad_norm": 1.0500273704528809, | |
| "learning_rate": 0.0005, | |
| "loss": 4.722, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.1404081632653061, | |
| "grad_norm": 1.2972230911254883, | |
| "learning_rate": 0.0005, | |
| "loss": 4.6021, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.1436734693877551, | |
| "grad_norm": 1.294978141784668, | |
| "learning_rate": 0.0005, | |
| "loss": 4.4754, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.1469387755102041, | |
| "grad_norm": 1.6263995170593262, | |
| "learning_rate": 0.0005, | |
| "loss": 4.4881, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.15020408163265306, | |
| "grad_norm": 1.2330529689788818, | |
| "learning_rate": 0.0005, | |
| "loss": 4.4441, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.15346938775510205, | |
| "grad_norm": 0.8953180909156799, | |
| "learning_rate": 0.0005, | |
| "loss": 4.6281, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.156734693877551, | |
| "grad_norm": 1.2303706407546997, | |
| "learning_rate": 0.0005, | |
| "loss": 4.5374, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 1.4355968236923218, | |
| "learning_rate": 0.0005, | |
| "loss": 4.5039, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.16326530612244897, | |
| "grad_norm": 2.549136161804199, | |
| "learning_rate": 0.0005, | |
| "loss": 4.3848, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.16326530612244897, | |
| "eval_loss": 4.471920967102051, | |
| "eval_runtime": 44.0079, | |
| "eval_samples_per_second": 45.446, | |
| "eval_steps_per_second": 11.362, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.16653061224489796, | |
| "grad_norm": 1.3125051259994507, | |
| "learning_rate": 0.0005, | |
| "loss": 4.429, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.16979591836734695, | |
| "grad_norm": 1.1771793365478516, | |
| "learning_rate": 0.0005, | |
| "loss": 4.6117, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.1730612244897959, | |
| "grad_norm": 1.0817294120788574, | |
| "learning_rate": 0.0005, | |
| "loss": 4.5011, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.1763265306122449, | |
| "grad_norm": 1.7551524639129639, | |
| "learning_rate": 0.0005, | |
| "loss": 4.4272, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.17959183673469387, | |
| "grad_norm": 2.062394142150879, | |
| "learning_rate": 0.0005, | |
| "loss": 4.2264, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.18285714285714286, | |
| "grad_norm": 1.150291919708252, | |
| "learning_rate": 0.0005, | |
| "loss": 4.2967, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.18612244897959185, | |
| "grad_norm": 1.318264126777649, | |
| "learning_rate": 0.0005, | |
| "loss": 4.5895, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.1893877551020408, | |
| "grad_norm": 1.267132043838501, | |
| "learning_rate": 0.0005, | |
| "loss": 4.4743, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.1926530612244898, | |
| "grad_norm": 1.0571893453598022, | |
| "learning_rate": 0.0005, | |
| "loss": 4.3215, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.19591836734693877, | |
| "grad_norm": 1.7111858129501343, | |
| "learning_rate": 0.0005, | |
| "loss": 4.3655, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.19918367346938776, | |
| "grad_norm": 1.3666787147521973, | |
| "learning_rate": 0.0005, | |
| "loss": 4.2062, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.20244897959183675, | |
| "grad_norm": 1.1153773069381714, | |
| "learning_rate": 0.0005, | |
| "loss": 4.5956, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.2057142857142857, | |
| "grad_norm": 1.3033626079559326, | |
| "learning_rate": 0.0005, | |
| "loss": 4.3626, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.2089795918367347, | |
| "grad_norm": 1.2020397186279297, | |
| "learning_rate": 0.0005, | |
| "loss": 4.2874, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.21224489795918366, | |
| "grad_norm": 1.620437741279602, | |
| "learning_rate": 0.0005, | |
| "loss": 4.3094, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.21551020408163266, | |
| "grad_norm": 1.3368499279022217, | |
| "learning_rate": 0.0005, | |
| "loss": 4.1476, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.21877551020408162, | |
| "grad_norm": 1.3473259210586548, | |
| "learning_rate": 0.0005, | |
| "loss": 4.515, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.2220408163265306, | |
| "grad_norm": 1.3251821994781494, | |
| "learning_rate": 0.0005, | |
| "loss": 4.2691, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.2253061224489796, | |
| "grad_norm": 1.4138338565826416, | |
| "learning_rate": 0.0005, | |
| "loss": 4.2523, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 1.8546305894851685, | |
| "learning_rate": 0.0005, | |
| "loss": 4.2569, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.23183673469387756, | |
| "grad_norm": 1.3483269214630127, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0691, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.23510204081632652, | |
| "grad_norm": 1.0672351121902466, | |
| "learning_rate": 0.0005, | |
| "loss": 4.4731, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.2383673469387755, | |
| "grad_norm": 1.168258786201477, | |
| "learning_rate": 0.0005, | |
| "loss": 4.4081, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.2416326530612245, | |
| "grad_norm": 1.1797970533370972, | |
| "learning_rate": 0.0005, | |
| "loss": 4.2101, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.24489795918367346, | |
| "grad_norm": 1.774629831314087, | |
| "learning_rate": 0.0005, | |
| "loss": 4.1419, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.24816326530612245, | |
| "grad_norm": 1.4174522161483765, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0785, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.25142857142857145, | |
| "grad_norm": 1.1591559648513794, | |
| "learning_rate": 0.0005, | |
| "loss": 4.4851, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.2546938775510204, | |
| "grad_norm": 1.3229284286499023, | |
| "learning_rate": 0.0005, | |
| "loss": 4.2819, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.25795918367346937, | |
| "grad_norm": 1.174056887626648, | |
| "learning_rate": 0.0005, | |
| "loss": 4.1234, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.2612244897959184, | |
| "grad_norm": 1.6990975141525269, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0965, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.26448979591836735, | |
| "grad_norm": 1.4492194652557373, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0653, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.2677551020408163, | |
| "grad_norm": 0.9765328764915466, | |
| "learning_rate": 0.0005, | |
| "loss": 4.4531, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.2710204081632653, | |
| "grad_norm": 1.4868978261947632, | |
| "learning_rate": 0.0005, | |
| "loss": 4.2941, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.2742857142857143, | |
| "grad_norm": 1.3501718044281006, | |
| "learning_rate": 0.0005, | |
| "loss": 4.1963, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.27755102040816326, | |
| "grad_norm": 1.9492390155792236, | |
| "learning_rate": 0.0005, | |
| "loss": 4.1563, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.2808163265306122, | |
| "grad_norm": 1.2434947490692139, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0553, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.28408163265306124, | |
| "grad_norm": 1.1206755638122559, | |
| "learning_rate": 0.0005, | |
| "loss": 4.3843, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.2873469387755102, | |
| "grad_norm": 1.1947944164276123, | |
| "learning_rate": 0.0005, | |
| "loss": 4.2625, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.29061224489795917, | |
| "grad_norm": 1.1833423376083374, | |
| "learning_rate": 0.0005, | |
| "loss": 4.1207, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.2938775510204082, | |
| "grad_norm": 1.6545424461364746, | |
| "learning_rate": 0.0005, | |
| "loss": 4.1006, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.29714285714285715, | |
| "grad_norm": 1.1993494033813477, | |
| "learning_rate": 0.0005, | |
| "loss": 3.9992, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.3004081632653061, | |
| "grad_norm": 1.017640233039856, | |
| "learning_rate": 0.0005, | |
| "loss": 4.3819, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.3036734693877551, | |
| "grad_norm": 1.300307273864746, | |
| "learning_rate": 0.0005, | |
| "loss": 4.182, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.3069387755102041, | |
| "grad_norm": 1.4982844591140747, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0585, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.31020408163265306, | |
| "grad_norm": 1.8079185485839844, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0825, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.313469387755102, | |
| "grad_norm": 1.2230055332183838, | |
| "learning_rate": 0.0005, | |
| "loss": 3.9335, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.31673469387755104, | |
| "grad_norm": 1.372392177581787, | |
| "learning_rate": 0.0005, | |
| "loss": 4.3513, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 1.4195263385772705, | |
| "learning_rate": 0.0005, | |
| "loss": 4.1446, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.32326530612244897, | |
| "grad_norm": 1.5404787063598633, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0195, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.32653061224489793, | |
| "grad_norm": 1.884207844734192, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0313, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.32653061224489793, | |
| "eval_loss": 4.118765354156494, | |
| "eval_runtime": 43.1312, | |
| "eval_samples_per_second": 46.37, | |
| "eval_steps_per_second": 11.593, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.32979591836734695, | |
| "grad_norm": 1.2172777652740479, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8635, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.3330612244897959, | |
| "grad_norm": 1.3385504484176636, | |
| "learning_rate": 0.0005, | |
| "loss": 4.2695, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.3363265306122449, | |
| "grad_norm": 1.2645728588104248, | |
| "learning_rate": 0.0005, | |
| "loss": 4.1298, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.3395918367346939, | |
| "grad_norm": 1.1611703634262085, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0871, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 1.9043705463409424, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0687, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.3461224489795918, | |
| "grad_norm": 1.109143853187561, | |
| "learning_rate": 0.0005, | |
| "loss": 3.9311, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.34938775510204084, | |
| "grad_norm": 1.3506762981414795, | |
| "learning_rate": 0.0005, | |
| "loss": 4.2937, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.3526530612244898, | |
| "grad_norm": 1.5633002519607544, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0552, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.35591836734693877, | |
| "grad_norm": 1.3422085046768188, | |
| "learning_rate": 0.0005, | |
| "loss": 4.1209, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.35918367346938773, | |
| "grad_norm": 1.8398027420043945, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0818, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.36244897959183675, | |
| "grad_norm": 1.0788570642471313, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8917, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.3657142857142857, | |
| "grad_norm": 1.3736050128936768, | |
| "learning_rate": 0.0005, | |
| "loss": 4.2967, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.3689795918367347, | |
| "grad_norm": 1.249338150024414, | |
| "learning_rate": 0.0005, | |
| "loss": 4.1155, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.3722448979591837, | |
| "grad_norm": 1.3408821821212769, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0186, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.37551020408163266, | |
| "grad_norm": 1.4768186807632446, | |
| "learning_rate": 0.0005, | |
| "loss": 3.9932, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.3787755102040816, | |
| "grad_norm": 1.274889349937439, | |
| "learning_rate": 0.0005, | |
| "loss": 3.7688, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.3820408163265306, | |
| "grad_norm": 1.17601478099823, | |
| "learning_rate": 0.0005, | |
| "loss": 4.1915, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.3853061224489796, | |
| "grad_norm": 1.2624982595443726, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0991, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.38857142857142857, | |
| "grad_norm": 1.4248754978179932, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0275, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.39183673469387753, | |
| "grad_norm": 1.8577288389205933, | |
| "learning_rate": 0.0005, | |
| "loss": 3.9708, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.39510204081632655, | |
| "grad_norm": 1.2153960466384888, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8195, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.3983673469387755, | |
| "grad_norm": 1.238071084022522, | |
| "learning_rate": 0.0005, | |
| "loss": 4.3136, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.4016326530612245, | |
| "grad_norm": 1.4334009885787964, | |
| "learning_rate": 0.0005, | |
| "loss": 4.1319, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.4048979591836735, | |
| "grad_norm": 1.3810005187988281, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0287, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.40816326530612246, | |
| "grad_norm": 2.1688148975372314, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0158, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.4114285714285714, | |
| "grad_norm": 1.2777838706970215, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8439, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.4146938775510204, | |
| "grad_norm": 1.1235041618347168, | |
| "learning_rate": 0.0005, | |
| "loss": 4.2695, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.4179591836734694, | |
| "grad_norm": 1.2477576732635498, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0986, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.42122448979591837, | |
| "grad_norm": 1.2333978414535522, | |
| "learning_rate": 0.0005, | |
| "loss": 4.007, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.42448979591836733, | |
| "grad_norm": 2.194262981414795, | |
| "learning_rate": 0.0005, | |
| "loss": 3.9411, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.42775510204081635, | |
| "grad_norm": 1.084908366203308, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6208, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.4310204081632653, | |
| "grad_norm": 1.275455117225647, | |
| "learning_rate": 0.0005, | |
| "loss": 4.2136, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.4342857142857143, | |
| "grad_norm": 1.4374445676803589, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0137, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.43755102040816324, | |
| "grad_norm": 1.3499337434768677, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8929, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.44081632653061226, | |
| "grad_norm": 1.9091603755950928, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8582, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.4440816326530612, | |
| "grad_norm": 1.0902653932571411, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6813, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.4473469387755102, | |
| "grad_norm": 1.1447824239730835, | |
| "learning_rate": 0.0005, | |
| "loss": 4.236, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.4506122448979592, | |
| "grad_norm": 1.4679241180419922, | |
| "learning_rate": 0.0005, | |
| "loss": 3.9569, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.45387755102040817, | |
| "grad_norm": 1.7417553663253784, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8949, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 1.7916682958602905, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8576, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.46040816326530615, | |
| "grad_norm": 1.053566813468933, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6635, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.4636734693877551, | |
| "grad_norm": 1.0085692405700684, | |
| "learning_rate": 0.0005, | |
| "loss": 4.11, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.4669387755102041, | |
| "grad_norm": 1.3383585214614868, | |
| "learning_rate": 0.0005, | |
| "loss": 3.9825, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.47020408163265304, | |
| "grad_norm": 1.3227241039276123, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8703, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.47346938775510206, | |
| "grad_norm": 2.0333919525146484, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8328, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.476734693877551, | |
| "grad_norm": 1.2266664505004883, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6192, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.3952326774597168, | |
| "learning_rate": 0.0005, | |
| "loss": 4.149, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.483265306122449, | |
| "grad_norm": 1.4793697595596313, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0945, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.48653061224489796, | |
| "grad_norm": 1.4412100315093994, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8816, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.4897959183673469, | |
| "grad_norm": 1.8379197120666504, | |
| "learning_rate": 0.0005, | |
| "loss": 3.766, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.4897959183673469, | |
| "eval_loss": 3.902669906616211, | |
| "eval_runtime": 43.0604, | |
| "eval_samples_per_second": 46.446, | |
| "eval_steps_per_second": 11.612, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.4930612244897959, | |
| "grad_norm": 1.1561907529830933, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5564, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.4963265306122449, | |
| "grad_norm": 1.1232249736785889, | |
| "learning_rate": 0.0005, | |
| "loss": 4.1061, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.4995918367346939, | |
| "grad_norm": 1.1983751058578491, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0355, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.5028571428571429, | |
| "grad_norm": 1.4547094106674194, | |
| "learning_rate": 0.0005, | |
| "loss": 3.9718, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.5061224489795918, | |
| "grad_norm": 2.0942959785461426, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8531, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.5093877551020408, | |
| "grad_norm": 1.4254790544509888, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6136, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.5126530612244898, | |
| "grad_norm": 1.3434633016586304, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0861, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.5159183673469387, | |
| "grad_norm": 1.2875784635543823, | |
| "learning_rate": 0.0005, | |
| "loss": 3.9535, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.5191836734693878, | |
| "grad_norm": 1.324489951133728, | |
| "learning_rate": 0.0005, | |
| "loss": 3.7898, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.5224489795918368, | |
| "grad_norm": 1.9661375284194946, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8496, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.5257142857142857, | |
| "grad_norm": 1.1303874254226685, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5894, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.5289795918367347, | |
| "grad_norm": 1.2739813327789307, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0466, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.5322448979591837, | |
| "grad_norm": 1.3381832838058472, | |
| "learning_rate": 0.0005, | |
| "loss": 3.9463, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.5355102040816326, | |
| "grad_norm": 1.3114221096038818, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8021, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.5387755102040817, | |
| "grad_norm": 1.9492048025131226, | |
| "learning_rate": 0.0005, | |
| "loss": 3.7187, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.5420408163265306, | |
| "grad_norm": 1.1877042055130005, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5491, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.5453061224489796, | |
| "grad_norm": 1.1403299570083618, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0365, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.5485714285714286, | |
| "grad_norm": 1.369149088859558, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8219, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.5518367346938775, | |
| "grad_norm": 1.3963128328323364, | |
| "learning_rate": 0.0005, | |
| "loss": 3.9018, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.5551020408163265, | |
| "grad_norm": 1.9626106023788452, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8125, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.5583673469387755, | |
| "grad_norm": 1.3681797981262207, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5254, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.5616326530612245, | |
| "grad_norm": 1.3346383571624756, | |
| "learning_rate": 0.0005, | |
| "loss": 4.1034, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.5648979591836735, | |
| "grad_norm": 1.2607356309890747, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8995, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.5681632653061225, | |
| "grad_norm": 1.4595521688461304, | |
| "learning_rate": 0.0005, | |
| "loss": 3.7525, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 2.0250532627105713, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8421, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5746938775510204, | |
| "grad_norm": 1.277748942375183, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6177, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.5779591836734694, | |
| "grad_norm": 1.3413293361663818, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0805, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.5812244897959183, | |
| "grad_norm": 1.1629762649536133, | |
| "learning_rate": 0.0005, | |
| "loss": 3.9218, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.5844897959183674, | |
| "grad_norm": 1.3503767251968384, | |
| "learning_rate": 0.0005, | |
| "loss": 3.7097, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.5877551020408164, | |
| "grad_norm": 1.966658353805542, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8113, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.5910204081632653, | |
| "grad_norm": 1.1706182956695557, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6205, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.5942857142857143, | |
| "grad_norm": 1.095694661140442, | |
| "learning_rate": 0.0005, | |
| "loss": 3.9975, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.5975510204081632, | |
| "grad_norm": 1.3853328227996826, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8375, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.6008163265306122, | |
| "grad_norm": 1.396588683128357, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8137, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.6040816326530613, | |
| "grad_norm": 1.8910547494888306, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8027, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.6073469387755102, | |
| "grad_norm": 1.1377391815185547, | |
| "learning_rate": 0.0005, | |
| "loss": 3.4935, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.6106122448979592, | |
| "grad_norm": 1.1120401620864868, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0356, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.6138775510204082, | |
| "grad_norm": 1.2026475667953491, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8108, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.6171428571428571, | |
| "grad_norm": 1.4205538034439087, | |
| "learning_rate": 0.0005, | |
| "loss": 3.731, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.6204081632653061, | |
| "grad_norm": 1.925189733505249, | |
| "learning_rate": 0.0005, | |
| "loss": 3.7237, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.6236734693877551, | |
| "grad_norm": 1.1621805429458618, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5831, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.626938775510204, | |
| "grad_norm": 1.2125190496444702, | |
| "learning_rate": 0.0005, | |
| "loss": 3.9736, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.6302040816326531, | |
| "grad_norm": 1.4116473197937012, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8547, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.6334693877551021, | |
| "grad_norm": 1.5884093046188354, | |
| "learning_rate": 0.0005, | |
| "loss": 3.7234, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.636734693877551, | |
| "grad_norm": 1.875867486000061, | |
| "learning_rate": 0.0005, | |
| "loss": 3.7336, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.1436996459960938, | |
| "learning_rate": 0.0005, | |
| "loss": 3.4685, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.643265306122449, | |
| "grad_norm": 1.4208112955093384, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0489, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.6465306122448979, | |
| "grad_norm": 1.3270920515060425, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8098, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.649795918367347, | |
| "grad_norm": 1.6381317377090454, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6228, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.6530612244897959, | |
| "grad_norm": 2.0222079753875732, | |
| "learning_rate": 0.0005, | |
| "loss": 3.678, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6530612244897959, | |
| "eval_loss": 3.763411521911621, | |
| "eval_runtime": 43.2803, | |
| "eval_samples_per_second": 46.21, | |
| "eval_steps_per_second": 11.553, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6563265306122449, | |
| "grad_norm": 1.211680293083191, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5285, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.6595918367346939, | |
| "grad_norm": 1.27492094039917, | |
| "learning_rate": 0.0005, | |
| "loss": 3.977, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.6628571428571428, | |
| "grad_norm": 1.4437798261642456, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8207, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.6661224489795918, | |
| "grad_norm": 1.3550224304199219, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6058, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.6693877551020408, | |
| "grad_norm": 2.0937228202819824, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6986, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.6726530612244898, | |
| "grad_norm": 1.0755195617675781, | |
| "learning_rate": 0.0005, | |
| "loss": 3.4733, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.6759183673469388, | |
| "grad_norm": 1.157943606376648, | |
| "learning_rate": 0.0005, | |
| "loss": 3.9306, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.6791836734693878, | |
| "grad_norm": 1.1048219203948975, | |
| "learning_rate": 0.0005, | |
| "loss": 3.7846, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.6824489795918367, | |
| "grad_norm": 1.385745882987976, | |
| "learning_rate": 0.0005, | |
| "loss": 3.804, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.6857142857142857, | |
| "grad_norm": 1.8955817222595215, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6585, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.6889795918367347, | |
| "grad_norm": 1.0233700275421143, | |
| "learning_rate": 0.0005, | |
| "loss": 3.4444, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.6922448979591836, | |
| "grad_norm": 1.180027723312378, | |
| "learning_rate": 0.0005, | |
| "loss": 3.9621, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.6955102040816327, | |
| "grad_norm": 1.2139476537704468, | |
| "learning_rate": 0.0005, | |
| "loss": 3.785, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.6987755102040817, | |
| "grad_norm": 1.285551905632019, | |
| "learning_rate": 0.0005, | |
| "loss": 3.7749, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.7020408163265306, | |
| "grad_norm": 1.9963250160217285, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6162, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.7053061224489796, | |
| "grad_norm": 1.216117262840271, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5505, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.7085714285714285, | |
| "grad_norm": 1.181820034980774, | |
| "learning_rate": 0.0005, | |
| "loss": 4.0493, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.7118367346938775, | |
| "grad_norm": 1.181449294090271, | |
| "learning_rate": 0.0005, | |
| "loss": 3.7653, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.7151020408163266, | |
| "grad_norm": 1.350578784942627, | |
| "learning_rate": 0.0005, | |
| "loss": 3.7466, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.7183673469387755, | |
| "grad_norm": 2.2694125175476074, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6147, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.7216326530612245, | |
| "grad_norm": 1.4255095720291138, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5044, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.7248979591836735, | |
| "grad_norm": 1.037293791770935, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8757, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.7281632653061224, | |
| "grad_norm": 1.4334427118301392, | |
| "learning_rate": 0.0005, | |
| "loss": 3.856, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.7314285714285714, | |
| "grad_norm": 1.3529642820358276, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6443, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.7346938775510204, | |
| "grad_norm": 2.0167791843414307, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6821, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.7379591836734694, | |
| "grad_norm": 1.011146068572998, | |
| "learning_rate": 0.0005, | |
| "loss": 3.2779, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.7412244897959184, | |
| "grad_norm": 1.0214072465896606, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8809, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.7444897959183674, | |
| "grad_norm": 1.3080716133117676, | |
| "learning_rate": 0.0005, | |
| "loss": 3.7988, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.7477551020408163, | |
| "grad_norm": 1.4071499109268188, | |
| "learning_rate": 0.0005, | |
| "loss": 3.7096, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.7510204081632653, | |
| "grad_norm": 2.372915744781494, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6902, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.7542857142857143, | |
| "grad_norm": 1.1310540437698364, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5033, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.7575510204081632, | |
| "grad_norm": 1.1746577024459839, | |
| "learning_rate": 0.0005, | |
| "loss": 3.9845, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.7608163265306123, | |
| "grad_norm": 1.3103886842727661, | |
| "learning_rate": 0.0005, | |
| "loss": 3.7691, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.7640816326530612, | |
| "grad_norm": 1.3824633359909058, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5941, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.7673469387755102, | |
| "grad_norm": 2.100325345993042, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6041, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.7706122448979592, | |
| "grad_norm": 1.1913516521453857, | |
| "learning_rate": 0.0005, | |
| "loss": 3.3779, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.7738775510204081, | |
| "grad_norm": 0.9938827753067017, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8768, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.7771428571428571, | |
| "grad_norm": 1.2203476428985596, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6631, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.7804081632653062, | |
| "grad_norm": 1.2797229290008545, | |
| "learning_rate": 0.0005, | |
| "loss": 3.682, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.7836734693877551, | |
| "grad_norm": 1.820462703704834, | |
| "learning_rate": 0.0005, | |
| "loss": 3.63, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.7869387755102041, | |
| "grad_norm": 1.147829532623291, | |
| "learning_rate": 0.0005, | |
| "loss": 3.3512, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.7902040816326531, | |
| "grad_norm": 1.2591899633407593, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8567, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.793469387755102, | |
| "grad_norm": 1.2597836256027222, | |
| "learning_rate": 0.0005, | |
| "loss": 3.7401, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.796734693877551, | |
| "grad_norm": 1.3258607387542725, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5487, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 2.17008376121521, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5872, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.803265306122449, | |
| "grad_norm": 1.0918580293655396, | |
| "learning_rate": 0.0005, | |
| "loss": 3.3497, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.806530612244898, | |
| "grad_norm": 1.1970785856246948, | |
| "learning_rate": 0.0005, | |
| "loss": 3.9383, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.809795918367347, | |
| "grad_norm": 1.2330458164215088, | |
| "learning_rate": 0.0005, | |
| "loss": 3.7035, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.8130612244897959, | |
| "grad_norm": 1.3976151943206787, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6041, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.8163265306122449, | |
| "grad_norm": 1.8973948955535889, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5651, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8163265306122449, | |
| "eval_loss": 3.65913724899292, | |
| "eval_runtime": 42.7312, | |
| "eval_samples_per_second": 46.804, | |
| "eval_steps_per_second": 11.701, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8195918367346938, | |
| "grad_norm": 1.126738429069519, | |
| "learning_rate": 0.0005, | |
| "loss": 3.3749, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.8228571428571428, | |
| "grad_norm": 1.2251478433609009, | |
| "learning_rate": 0.0005, | |
| "loss": 3.884, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.8261224489795919, | |
| "grad_norm": 1.219794750213623, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6404, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.8293877551020408, | |
| "grad_norm": 1.3533365726470947, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5655, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.8326530612244898, | |
| "grad_norm": 1.7649017572402954, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6185, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.8359183673469388, | |
| "grad_norm": 1.0244803428649902, | |
| "learning_rate": 0.0005, | |
| "loss": 3.4428, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.8391836734693877, | |
| "grad_norm": 1.3756012916564941, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8133, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.8424489795918367, | |
| "grad_norm": 1.4523794651031494, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6916, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.8457142857142858, | |
| "grad_norm": 1.4527126550674438, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5812, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.8489795918367347, | |
| "grad_norm": 1.7975162267684937, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6506, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.8522448979591837, | |
| "grad_norm": 1.29179048538208, | |
| "learning_rate": 0.0005, | |
| "loss": 3.4787, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.8555102040816327, | |
| "grad_norm": 1.2570093870162964, | |
| "learning_rate": 0.0005, | |
| "loss": 3.933, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.8587755102040816, | |
| "grad_norm": 1.350013017654419, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6734, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.8620408163265306, | |
| "grad_norm": 1.4130898714065552, | |
| "learning_rate": 0.0005, | |
| "loss": 3.4979, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.8653061224489796, | |
| "grad_norm": 2.0199756622314453, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5747, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.8685714285714285, | |
| "grad_norm": 1.0159718990325928, | |
| "learning_rate": 0.0005, | |
| "loss": 3.4062, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.8718367346938776, | |
| "grad_norm": 1.2518935203552246, | |
| "learning_rate": 0.0005, | |
| "loss": 3.7682, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.8751020408163265, | |
| "grad_norm": 1.2634557485580444, | |
| "learning_rate": 0.0005, | |
| "loss": 3.7365, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.8783673469387755, | |
| "grad_norm": 1.2956312894821167, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6749, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.8816326530612245, | |
| "grad_norm": 1.8405163288116455, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5182, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.8848979591836734, | |
| "grad_norm": 1.1379750967025757, | |
| "learning_rate": 0.0005, | |
| "loss": 3.376, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.8881632653061224, | |
| "grad_norm": 1.3452417850494385, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8657, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.8914285714285715, | |
| "grad_norm": 1.1698819398880005, | |
| "learning_rate": 0.0005, | |
| "loss": 3.633, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.8946938775510204, | |
| "grad_norm": 1.42051100730896, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5665, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.8979591836734694, | |
| "grad_norm": 1.9082551002502441, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5884, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.9012244897959184, | |
| "grad_norm": 1.3937710523605347, | |
| "learning_rate": 0.0005, | |
| "loss": 3.3543, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.9044897959183673, | |
| "grad_norm": 1.2848858833312988, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8298, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.9077551020408163, | |
| "grad_norm": 1.2677395343780518, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6608, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.9110204081632653, | |
| "grad_norm": 1.372312307357788, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5296, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.9142857142857143, | |
| "grad_norm": 1.928770899772644, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5219, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.9175510204081633, | |
| "grad_norm": 1.1431926488876343, | |
| "learning_rate": 0.0005, | |
| "loss": 3.3378, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.9208163265306123, | |
| "grad_norm": 1.5195988416671753, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8191, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.9240816326530612, | |
| "grad_norm": 1.3690383434295654, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5557, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.9273469387755102, | |
| "grad_norm": 1.473240852355957, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5422, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.9306122448979591, | |
| "grad_norm": 2.0330464839935303, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6382, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.9338775510204081, | |
| "grad_norm": 1.0458343029022217, | |
| "learning_rate": 0.0005, | |
| "loss": 3.3883, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.9371428571428572, | |
| "grad_norm": 1.1635581254959106, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8158, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.9404081632653061, | |
| "grad_norm": 1.3448679447174072, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6389, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.9436734693877551, | |
| "grad_norm": 1.3173009157180786, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5386, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.9469387755102041, | |
| "grad_norm": 2.144378662109375, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5892, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.950204081632653, | |
| "grad_norm": 1.1590373516082764, | |
| "learning_rate": 0.0005, | |
| "loss": 3.397, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.953469387755102, | |
| "grad_norm": 1.253893494606018, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8054, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.9567346938775511, | |
| "grad_norm": 1.215649962425232, | |
| "learning_rate": 0.0005, | |
| "loss": 3.612, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 1.4085159301757812, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5056, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.963265306122449, | |
| "grad_norm": 1.908341646194458, | |
| "learning_rate": 0.0005, | |
| "loss": 3.4516, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.966530612244898, | |
| "grad_norm": 1.1971689462661743, | |
| "learning_rate": 0.0005, | |
| "loss": 3.3546, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.9697959183673469, | |
| "grad_norm": 1.2646092176437378, | |
| "learning_rate": 0.0005, | |
| "loss": 3.8237, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.9730612244897959, | |
| "grad_norm": 1.3261162042617798, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6123, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.976326530612245, | |
| "grad_norm": 1.4025673866271973, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5225, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.9795918367346939, | |
| "grad_norm": 1.9254060983657837, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5353, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.9795918367346939, | |
| "eval_loss": 3.575631618499756, | |
| "eval_runtime": 42.8247, | |
| "eval_samples_per_second": 46.702, | |
| "eval_steps_per_second": 11.675, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.9828571428571429, | |
| "grad_norm": 1.1695542335510254, | |
| "learning_rate": 0.0005, | |
| "loss": 3.1419, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.9861224489795918, | |
| "grad_norm": 1.1839579343795776, | |
| "learning_rate": 0.0005, | |
| "loss": 3.7104, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.9893877551020408, | |
| "grad_norm": 1.3528823852539062, | |
| "learning_rate": 0.0005, | |
| "loss": 3.6772, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.9926530612244898, | |
| "grad_norm": 1.42439603805542, | |
| "learning_rate": 0.0005, | |
| "loss": 3.544, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.9959183673469387, | |
| "grad_norm": 2.387221574783325, | |
| "learning_rate": 0.0005, | |
| "loss": 3.536, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.9991836734693877, | |
| "grad_norm": 1.4861875772476196, | |
| "learning_rate": 0.0005, | |
| "loss": 3.5749, | |
| "step": 3060 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3063, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 28910317824000.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |