| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 20.0, |
| "eval_steps": 500, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.1, |
| "grad_norm": 8.593878746032715, |
| "learning_rate": 1e-05, |
| "loss": 1.4852, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 7.231457233428955, |
| "learning_rate": 2e-05, |
| "loss": 1.4207, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 3.319319248199463, |
| "learning_rate": 3e-05, |
| "loss": 1.0614, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 3.60310435295105, |
| "learning_rate": 4e-05, |
| "loss": 0.7256, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 15.761638641357422, |
| "learning_rate": 5e-05, |
| "loss": 0.5587, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 24.317516326904297, |
| "learning_rate": 6e-05, |
| "loss": 0.5146, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 4.110610008239746, |
| "learning_rate": 7e-05, |
| "loss": 0.4222, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 4.141885757446289, |
| "learning_rate": 8e-05, |
| "loss": 0.4355, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 3.7163727283477783, |
| "learning_rate": 9e-05, |
| "loss": 0.3939, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 8.189441680908203, |
| "learning_rate": 0.0001, |
| "loss": 0.3712, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.1, |
| "grad_norm": 2.9945340156555176, |
| "learning_rate": 9.999316524962345e-05, |
| "loss": 0.3548, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 1.997642159461975, |
| "learning_rate": 9.997266286704631e-05, |
| "loss": 0.2898, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.3, |
| "grad_norm": 1.9837431907653809, |
| "learning_rate": 9.993849845741524e-05, |
| "loss": 0.2528, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 2.294177532196045, |
| "learning_rate": 9.989068136093873e-05, |
| "loss": 0.2187, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 1.4083565473556519, |
| "learning_rate": 9.98292246503335e-05, |
| "loss": 0.1986, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 7.1001811027526855, |
| "learning_rate": 9.975414512725057e-05, |
| "loss": 0.1738, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.7, |
| "grad_norm": 1.4144936800003052, |
| "learning_rate": 9.966546331768191e-05, |
| "loss": 0.1537, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 1.5691213607788086, |
| "learning_rate": 9.956320346634876e-05, |
| "loss": 0.1519, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.9, |
| "grad_norm": 1.2897148132324219, |
| "learning_rate": 9.944739353007344e-05, |
| "loss": 0.1287, |
| "step": 190 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 1.6297178268432617, |
| "learning_rate": 9.931806517013612e-05, |
| "loss": 0.1169, |
| "step": 200 |
| }, |
| { |
| "epoch": 2.1, |
| "grad_norm": 1.364025354385376, |
| "learning_rate": 9.917525374361912e-05, |
| "loss": 0.1246, |
| "step": 210 |
| }, |
| { |
| "epoch": 2.2, |
| "grad_norm": 1.02783203125, |
| "learning_rate": 9.901899829374047e-05, |
| "loss": 0.1205, |
| "step": 220 |
| }, |
| { |
| "epoch": 2.3, |
| "grad_norm": 1.1652231216430664, |
| "learning_rate": 9.884934153917997e-05, |
| "loss": 0.1123, |
| "step": 230 |
| }, |
| { |
| "epoch": 2.4, |
| "grad_norm": 1.3376855850219727, |
| "learning_rate": 9.86663298624003e-05, |
| "loss": 0.1073, |
| "step": 240 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 0.9518831968307495, |
| "learning_rate": 9.847001329696653e-05, |
| "loss": 0.0983, |
| "step": 250 |
| }, |
| { |
| "epoch": 2.6, |
| "grad_norm": 1.0183520317077637, |
| "learning_rate": 9.826044551386744e-05, |
| "loss": 0.0994, |
| "step": 260 |
| }, |
| { |
| "epoch": 2.7, |
| "grad_norm": 1.088984727859497, |
| "learning_rate": 9.803768380684242e-05, |
| "loss": 0.1057, |
| "step": 270 |
| }, |
| { |
| "epoch": 2.8, |
| "grad_norm": 0.7311228513717651, |
| "learning_rate": 9.780178907671789e-05, |
| "loss": 0.0979, |
| "step": 280 |
| }, |
| { |
| "epoch": 2.9, |
| "grad_norm": 0.4584773778915405, |
| "learning_rate": 9.755282581475769e-05, |
| "loss": 0.1015, |
| "step": 290 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 1.5885437726974487, |
| "learning_rate": 9.729086208503174e-05, |
| "loss": 0.0798, |
| "step": 300 |
| }, |
| { |
| "epoch": 3.1, |
| "grad_norm": 0.7659353017807007, |
| "learning_rate": 9.701596950580806e-05, |
| "loss": 0.0978, |
| "step": 310 |
| }, |
| { |
| "epoch": 3.2, |
| "grad_norm": 0.472594290971756, |
| "learning_rate": 9.672822322997305e-05, |
| "loss": 0.0849, |
| "step": 320 |
| }, |
| { |
| "epoch": 3.3, |
| "grad_norm": 0.9068973064422607, |
| "learning_rate": 9.642770192448536e-05, |
| "loss": 0.0917, |
| "step": 330 |
| }, |
| { |
| "epoch": 3.4, |
| "grad_norm": 0.7755366563796997, |
| "learning_rate": 9.611448774886924e-05, |
| "loss": 0.0843, |
| "step": 340 |
| }, |
| { |
| "epoch": 3.5, |
| "grad_norm": 1.4321980476379395, |
| "learning_rate": 9.578866633275288e-05, |
| "loss": 0.0813, |
| "step": 350 |
| }, |
| { |
| "epoch": 3.6, |
| "grad_norm": 0.7324996590614319, |
| "learning_rate": 9.545032675245813e-05, |
| "loss": 0.0822, |
| "step": 360 |
| }, |
| { |
| "epoch": 3.7, |
| "grad_norm": 0.5374558568000793, |
| "learning_rate": 9.509956150664796e-05, |
| "loss": 0.0877, |
| "step": 370 |
| }, |
| { |
| "epoch": 3.8, |
| "grad_norm": 0.7615280151367188, |
| "learning_rate": 9.473646649103818e-05, |
| "loss": 0.0745, |
| "step": 380 |
| }, |
| { |
| "epoch": 3.9, |
| "grad_norm": 1.008134365081787, |
| "learning_rate": 9.43611409721806e-05, |
| "loss": 0.0739, |
| "step": 390 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 1.376315951347351, |
| "learning_rate": 9.397368756032445e-05, |
| "loss": 0.0716, |
| "step": 400 |
| }, |
| { |
| "epoch": 4.1, |
| "grad_norm": 1.1624388694763184, |
| "learning_rate": 9.357421218136386e-05, |
| "loss": 0.0787, |
| "step": 410 |
| }, |
| { |
| "epoch": 4.2, |
| "grad_norm": 0.5839523077011108, |
| "learning_rate": 9.316282404787871e-05, |
| "loss": 0.0786, |
| "step": 420 |
| }, |
| { |
| "epoch": 4.3, |
| "grad_norm": 0.9589359760284424, |
| "learning_rate": 9.273963562927695e-05, |
| "loss": 0.0735, |
| "step": 430 |
| }, |
| { |
| "epoch": 4.4, |
| "grad_norm": 1.2537989616394043, |
| "learning_rate": 9.230476262104677e-05, |
| "loss": 0.0787, |
| "step": 440 |
| }, |
| { |
| "epoch": 4.5, |
| "grad_norm": 0.9431202411651611, |
| "learning_rate": 9.185832391312644e-05, |
| "loss": 0.0772, |
| "step": 450 |
| }, |
| { |
| "epoch": 4.6, |
| "grad_norm": 0.689441978931427, |
| "learning_rate": 9.140044155740101e-05, |
| "loss": 0.0775, |
| "step": 460 |
| }, |
| { |
| "epoch": 4.7, |
| "grad_norm": 0.6225442290306091, |
| "learning_rate": 9.093124073433463e-05, |
| "loss": 0.0838, |
| "step": 470 |
| }, |
| { |
| "epoch": 4.8, |
| "grad_norm": 0.8038715720176697, |
| "learning_rate": 9.045084971874738e-05, |
| "loss": 0.0708, |
| "step": 480 |
| }, |
| { |
| "epoch": 4.9, |
| "grad_norm": 1.0997047424316406, |
| "learning_rate": 8.995939984474624e-05, |
| "loss": 0.0736, |
| "step": 490 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 1.7090861797332764, |
| "learning_rate": 8.945702546981969e-05, |
| "loss": 0.0664, |
| "step": 500 |
| }, |
| { |
| "epoch": 5.1, |
| "grad_norm": 0.6094951629638672, |
| "learning_rate": 8.894386393810563e-05, |
| "loss": 0.0718, |
| "step": 510 |
| }, |
| { |
| "epoch": 5.2, |
| "grad_norm": 0.577790379524231, |
| "learning_rate": 8.842005554284296e-05, |
| "loss": 0.0625, |
| "step": 520 |
| }, |
| { |
| "epoch": 5.3, |
| "grad_norm": 0.7675192952156067, |
| "learning_rate": 8.788574348801675e-05, |
| "loss": 0.0637, |
| "step": 530 |
| }, |
| { |
| "epoch": 5.4, |
| "grad_norm": 0.5569060444831848, |
| "learning_rate": 8.73410738492077e-05, |
| "loss": 0.0667, |
| "step": 540 |
| }, |
| { |
| "epoch": 5.5, |
| "grad_norm": 0.6511795520782471, |
| "learning_rate": 8.678619553365659e-05, |
| "loss": 0.07, |
| "step": 550 |
| }, |
| { |
| "epoch": 5.6, |
| "grad_norm": 0.7688615322113037, |
| "learning_rate": 8.622126023955446e-05, |
| "loss": 0.0579, |
| "step": 560 |
| }, |
| { |
| "epoch": 5.7, |
| "grad_norm": 0.668289840221405, |
| "learning_rate": 8.564642241456986e-05, |
| "loss": 0.0599, |
| "step": 570 |
| }, |
| { |
| "epoch": 5.8, |
| "grad_norm": 0.8834435939788818, |
| "learning_rate": 8.506183921362443e-05, |
| "loss": 0.0639, |
| "step": 580 |
| }, |
| { |
| "epoch": 5.9, |
| "grad_norm": 0.727353572845459, |
| "learning_rate": 8.44676704559283e-05, |
| "loss": 0.0675, |
| "step": 590 |
| }, |
| { |
| "epoch": 6.0, |
| "grad_norm": 2.630605936050415, |
| "learning_rate": 8.386407858128706e-05, |
| "loss": 0.0699, |
| "step": 600 |
| }, |
| { |
| "epoch": 6.1, |
| "grad_norm": 1.2145522832870483, |
| "learning_rate": 8.32512286056924e-05, |
| "loss": 0.0724, |
| "step": 610 |
| }, |
| { |
| "epoch": 6.2, |
| "grad_norm": 1.142543911933899, |
| "learning_rate": 8.262928807620843e-05, |
| "loss": 0.0727, |
| "step": 620 |
| }, |
| { |
| "epoch": 6.3, |
| "grad_norm": 0.6511779427528381, |
| "learning_rate": 8.199842702516583e-05, |
| "loss": 0.0611, |
| "step": 630 |
| }, |
| { |
| "epoch": 6.4, |
| "grad_norm": 0.5131811499595642, |
| "learning_rate": 8.135881792367686e-05, |
| "loss": 0.0591, |
| "step": 640 |
| }, |
| { |
| "epoch": 6.5, |
| "grad_norm": 0.8611190915107727, |
| "learning_rate": 8.07106356344834e-05, |
| "loss": 0.0644, |
| "step": 650 |
| }, |
| { |
| "epoch": 6.6, |
| "grad_norm": 0.7479195594787598, |
| "learning_rate": 8.005405736415126e-05, |
| "loss": 0.0638, |
| "step": 660 |
| }, |
| { |
| "epoch": 6.7, |
| "grad_norm": 1.030855655670166, |
| "learning_rate": 7.938926261462366e-05, |
| "loss": 0.0644, |
| "step": 670 |
| }, |
| { |
| "epoch": 6.8, |
| "grad_norm": 0.9318546056747437, |
| "learning_rate": 7.871643313414718e-05, |
| "loss": 0.0698, |
| "step": 680 |
| }, |
| { |
| "epoch": 6.9, |
| "grad_norm": 0.8005062937736511, |
| "learning_rate": 7.803575286758364e-05, |
| "loss": 0.064, |
| "step": 690 |
| }, |
| { |
| "epoch": 7.0, |
| "grad_norm": 1.5523598194122314, |
| "learning_rate": 7.734740790612136e-05, |
| "loss": 0.0581, |
| "step": 700 |
| }, |
| { |
| "epoch": 7.1, |
| "grad_norm": 0.7943648099899292, |
| "learning_rate": 7.66515864363997e-05, |
| "loss": 0.0699, |
| "step": 710 |
| }, |
| { |
| "epoch": 7.2, |
| "grad_norm": 0.775920569896698, |
| "learning_rate": 7.594847868906076e-05, |
| "loss": 0.0643, |
| "step": 720 |
| }, |
| { |
| "epoch": 7.3, |
| "grad_norm": 0.7172490954399109, |
| "learning_rate": 7.52382768867422e-05, |
| "loss": 0.055, |
| "step": 730 |
| }, |
| { |
| "epoch": 7.4, |
| "grad_norm": 0.7170266509056091, |
| "learning_rate": 7.452117519152542e-05, |
| "loss": 0.0561, |
| "step": 740 |
| }, |
| { |
| "epoch": 7.5, |
| "grad_norm": 0.5932777523994446, |
| "learning_rate": 7.379736965185368e-05, |
| "loss": 0.0541, |
| "step": 750 |
| }, |
| { |
| "epoch": 7.6, |
| "grad_norm": 0.5876290202140808, |
| "learning_rate": 7.30670581489344e-05, |
| "loss": 0.0568, |
| "step": 760 |
| }, |
| { |
| "epoch": 7.7, |
| "grad_norm": 0.7758692502975464, |
| "learning_rate": 7.233044034264034e-05, |
| "loss": 0.055, |
| "step": 770 |
| }, |
| { |
| "epoch": 7.8, |
| "grad_norm": 0.6511589288711548, |
| "learning_rate": 7.158771761692464e-05, |
| "loss": 0.0581, |
| "step": 780 |
| }, |
| { |
| "epoch": 7.9, |
| "grad_norm": 0.777108371257782, |
| "learning_rate": 7.083909302476453e-05, |
| "loss": 0.0491, |
| "step": 790 |
| }, |
| { |
| "epoch": 8.0, |
| "grad_norm": 3.6956264972686768, |
| "learning_rate": 7.008477123264848e-05, |
| "loss": 0.057, |
| "step": 800 |
| }, |
| { |
| "epoch": 8.1, |
| "grad_norm": 0.69562828540802, |
| "learning_rate": 6.932495846462261e-05, |
| "loss": 0.0572, |
| "step": 810 |
| }, |
| { |
| "epoch": 8.2, |
| "grad_norm": 0.7596244812011719, |
| "learning_rate": 6.855986244591104e-05, |
| "loss": 0.0598, |
| "step": 820 |
| }, |
| { |
| "epoch": 8.3, |
| "grad_norm": 0.8167744278907776, |
| "learning_rate": 6.778969234612584e-05, |
| "loss": 0.0523, |
| "step": 830 |
| }, |
| { |
| "epoch": 8.4, |
| "grad_norm": 0.7772257328033447, |
| "learning_rate": 6.701465872208216e-05, |
| "loss": 0.0563, |
| "step": 840 |
| }, |
| { |
| "epoch": 8.5, |
| "grad_norm": 0.6254410147666931, |
| "learning_rate": 6.623497346023418e-05, |
| "loss": 0.0527, |
| "step": 850 |
| }, |
| { |
| "epoch": 8.6, |
| "grad_norm": 0.4882695972919464, |
| "learning_rate": 6.545084971874738e-05, |
| "loss": 0.0543, |
| "step": 860 |
| }, |
| { |
| "epoch": 8.7, |
| "grad_norm": 0.8698508739471436, |
| "learning_rate": 6.466250186922325e-05, |
| "loss": 0.0495, |
| "step": 870 |
| }, |
| { |
| "epoch": 8.8, |
| "grad_norm": 0.6745582818984985, |
| "learning_rate": 6.387014543809223e-05, |
| "loss": 0.0505, |
| "step": 880 |
| }, |
| { |
| "epoch": 8.9, |
| "grad_norm": 1.0861890316009521, |
| "learning_rate": 6.307399704769099e-05, |
| "loss": 0.044, |
| "step": 890 |
| }, |
| { |
| "epoch": 9.0, |
| "grad_norm": 0.43595385551452637, |
| "learning_rate": 6.227427435703997e-05, |
| "loss": 0.0416, |
| "step": 900 |
| }, |
| { |
| "epoch": 9.1, |
| "grad_norm": 1.1349962949752808, |
| "learning_rate": 6.147119600233758e-05, |
| "loss": 0.0576, |
| "step": 910 |
| }, |
| { |
| "epoch": 9.2, |
| "grad_norm": 0.8424226641654968, |
| "learning_rate": 6.066498153718735e-05, |
| "loss": 0.0478, |
| "step": 920 |
| }, |
| { |
| "epoch": 9.3, |
| "grad_norm": 1.0484619140625, |
| "learning_rate": 5.985585137257401e-05, |
| "loss": 0.0549, |
| "step": 930 |
| }, |
| { |
| "epoch": 9.4, |
| "grad_norm": 0.6300575137138367, |
| "learning_rate": 5.90440267166055e-05, |
| "loss": 0.0507, |
| "step": 940 |
| }, |
| { |
| "epoch": 9.5, |
| "grad_norm": 0.6857895851135254, |
| "learning_rate": 5.8229729514036705e-05, |
| "loss": 0.0463, |
| "step": 950 |
| }, |
| { |
| "epoch": 9.6, |
| "grad_norm": 0.6959238648414612, |
| "learning_rate": 5.74131823855921e-05, |
| "loss": 0.0529, |
| "step": 960 |
| }, |
| { |
| "epoch": 9.7, |
| "grad_norm": 0.6516974568367004, |
| "learning_rate": 5.6594608567103456e-05, |
| "loss": 0.0461, |
| "step": 970 |
| }, |
| { |
| "epoch": 9.8, |
| "grad_norm": 1.1341180801391602, |
| "learning_rate": 5.577423184847932e-05, |
| "loss": 0.0458, |
| "step": 980 |
| }, |
| { |
| "epoch": 9.9, |
| "grad_norm": 0.6836873292922974, |
| "learning_rate": 5.495227651252315e-05, |
| "loss": 0.0498, |
| "step": 990 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 1.8491439819335938, |
| "learning_rate": 5.4128967273616625e-05, |
| "loss": 0.0459, |
| "step": 1000 |
| }, |
| { |
| "epoch": 10.1, |
| "grad_norm": 0.7286385297775269, |
| "learning_rate": 5.330452921628497e-05, |
| "loss": 0.0495, |
| "step": 1010 |
| }, |
| { |
| "epoch": 10.2, |
| "grad_norm": 0.9026761651039124, |
| "learning_rate": 5.247918773366112e-05, |
| "loss": 0.0439, |
| "step": 1020 |
| }, |
| { |
| "epoch": 10.3, |
| "grad_norm": 0.5952501893043518, |
| "learning_rate": 5.165316846586541e-05, |
| "loss": 0.0438, |
| "step": 1030 |
| }, |
| { |
| "epoch": 10.4, |
| "grad_norm": 0.7074761986732483, |
| "learning_rate": 5.0826697238317935e-05, |
| "loss": 0.0444, |
| "step": 1040 |
| }, |
| { |
| "epoch": 10.5, |
| "grad_norm": 0.7149840593338013, |
| "learning_rate": 5e-05, |
| "loss": 0.0464, |
| "step": 1050 |
| }, |
| { |
| "epoch": 10.6, |
| "grad_norm": 0.9573127627372742, |
| "learning_rate": 4.917330276168208e-05, |
| "loss": 0.0434, |
| "step": 1060 |
| }, |
| { |
| "epoch": 10.7, |
| "grad_norm": 0.3760146498680115, |
| "learning_rate": 4.834683153413459e-05, |
| "loss": 0.0536, |
| "step": 1070 |
| }, |
| { |
| "epoch": 10.8, |
| "grad_norm": 0.7248881459236145, |
| "learning_rate": 4.7520812266338885e-05, |
| "loss": 0.052, |
| "step": 1080 |
| }, |
| { |
| "epoch": 10.9, |
| "grad_norm": 0.7167366147041321, |
| "learning_rate": 4.669547078371504e-05, |
| "loss": 0.0511, |
| "step": 1090 |
| }, |
| { |
| "epoch": 11.0, |
| "grad_norm": 0.554724931716919, |
| "learning_rate": 4.5871032726383386e-05, |
| "loss": 0.0361, |
| "step": 1100 |
| }, |
| { |
| "epoch": 11.1, |
| "grad_norm": 0.8976012468338013, |
| "learning_rate": 4.504772348747687e-05, |
| "loss": 0.0446, |
| "step": 1110 |
| }, |
| { |
| "epoch": 11.2, |
| "grad_norm": 1.2764275074005127, |
| "learning_rate": 4.4225768151520694e-05, |
| "loss": 0.0477, |
| "step": 1120 |
| }, |
| { |
| "epoch": 11.3, |
| "grad_norm": 0.9568806886672974, |
| "learning_rate": 4.3405391432896555e-05, |
| "loss": 0.0489, |
| "step": 1130 |
| }, |
| { |
| "epoch": 11.4, |
| "grad_norm": 0.5885640978813171, |
| "learning_rate": 4.2586817614407895e-05, |
| "loss": 0.0534, |
| "step": 1140 |
| }, |
| { |
| "epoch": 11.5, |
| "grad_norm": 0.48282381892204285, |
| "learning_rate": 4.17702704859633e-05, |
| "loss": 0.0489, |
| "step": 1150 |
| }, |
| { |
| "epoch": 11.6, |
| "grad_norm": 0.6082215905189514, |
| "learning_rate": 4.095597328339452e-05, |
| "loss": 0.0455, |
| "step": 1160 |
| }, |
| { |
| "epoch": 11.7, |
| "grad_norm": 0.8037371635437012, |
| "learning_rate": 4.0144148627425993e-05, |
| "loss": 0.0454, |
| "step": 1170 |
| }, |
| { |
| "epoch": 11.8, |
| "grad_norm": 0.6086834669113159, |
| "learning_rate": 3.933501846281267e-05, |
| "loss": 0.0427, |
| "step": 1180 |
| }, |
| { |
| "epoch": 11.9, |
| "grad_norm": 0.6202229857444763, |
| "learning_rate": 3.852880399766243e-05, |
| "loss": 0.0434, |
| "step": 1190 |
| }, |
| { |
| "epoch": 12.0, |
| "grad_norm": 1.6280860900878906, |
| "learning_rate": 3.772572564296005e-05, |
| "loss": 0.0405, |
| "step": 1200 |
| }, |
| { |
| "epoch": 12.1, |
| "grad_norm": 0.7294513583183289, |
| "learning_rate": 3.6926002952309016e-05, |
| "loss": 0.0474, |
| "step": 1210 |
| }, |
| { |
| "epoch": 12.2, |
| "grad_norm": 1.0804405212402344, |
| "learning_rate": 3.612985456190778e-05, |
| "loss": 0.0455, |
| "step": 1220 |
| }, |
| { |
| "epoch": 12.3, |
| "grad_norm": 0.8139125108718872, |
| "learning_rate": 3.533749813077677e-05, |
| "loss": 0.0396, |
| "step": 1230 |
| }, |
| { |
| "epoch": 12.4, |
| "grad_norm": 0.6082015037536621, |
| "learning_rate": 3.4549150281252636e-05, |
| "loss": 0.0455, |
| "step": 1240 |
| }, |
| { |
| "epoch": 12.5, |
| "grad_norm": 0.5969621539115906, |
| "learning_rate": 3.3765026539765834e-05, |
| "loss": 0.0393, |
| "step": 1250 |
| }, |
| { |
| "epoch": 12.6, |
| "grad_norm": 0.6664546728134155, |
| "learning_rate": 3.298534127791785e-05, |
| "loss": 0.0448, |
| "step": 1260 |
| }, |
| { |
| "epoch": 12.7, |
| "grad_norm": 0.5217233300209045, |
| "learning_rate": 3.221030765387417e-05, |
| "loss": 0.0446, |
| "step": 1270 |
| }, |
| { |
| "epoch": 12.8, |
| "grad_norm": 0.7529652118682861, |
| "learning_rate": 3.144013755408895e-05, |
| "loss": 0.0426, |
| "step": 1280 |
| }, |
| { |
| "epoch": 12.9, |
| "grad_norm": 0.41002699732780457, |
| "learning_rate": 3.0675041535377405e-05, |
| "loss": 0.0438, |
| "step": 1290 |
| }, |
| { |
| "epoch": 13.0, |
| "grad_norm": 3.2146363258361816, |
| "learning_rate": 2.991522876735154e-05, |
| "loss": 0.0435, |
| "step": 1300 |
| }, |
| { |
| "epoch": 13.1, |
| "grad_norm": 0.6749954223632812, |
| "learning_rate": 2.916090697523549e-05, |
| "loss": 0.0468, |
| "step": 1310 |
| }, |
| { |
| "epoch": 13.2, |
| "grad_norm": 0.7005327343940735, |
| "learning_rate": 2.8412282383075363e-05, |
| "loss": 0.0357, |
| "step": 1320 |
| }, |
| { |
| "epoch": 13.3, |
| "grad_norm": 0.6995335817337036, |
| "learning_rate": 2.766955965735968e-05, |
| "loss": 0.0426, |
| "step": 1330 |
| }, |
| { |
| "epoch": 13.4, |
| "grad_norm": 0.6105914115905762, |
| "learning_rate": 2.693294185106562e-05, |
| "loss": 0.0406, |
| "step": 1340 |
| }, |
| { |
| "epoch": 13.5, |
| "grad_norm": 0.6400395035743713, |
| "learning_rate": 2.6202630348146324e-05, |
| "loss": 0.043, |
| "step": 1350 |
| }, |
| { |
| "epoch": 13.6, |
| "grad_norm": 0.8495157957077026, |
| "learning_rate": 2.547882480847461e-05, |
| "loss": 0.0403, |
| "step": 1360 |
| }, |
| { |
| "epoch": 13.7, |
| "grad_norm": 0.6124421954154968, |
| "learning_rate": 2.476172311325783e-05, |
| "loss": 0.0395, |
| "step": 1370 |
| }, |
| { |
| "epoch": 13.8, |
| "grad_norm": 0.9979932904243469, |
| "learning_rate": 2.405152131093926e-05, |
| "loss": 0.0386, |
| "step": 1380 |
| }, |
| { |
| "epoch": 13.9, |
| "grad_norm": 0.9076727628707886, |
| "learning_rate": 2.3348413563600325e-05, |
| "loss": 0.0391, |
| "step": 1390 |
| }, |
| { |
| "epoch": 14.0, |
| "grad_norm": 0.8010100722312927, |
| "learning_rate": 2.2652592093878666e-05, |
| "loss": 0.0327, |
| "step": 1400 |
| }, |
| { |
| "epoch": 14.1, |
| "grad_norm": 0.7930116653442383, |
| "learning_rate": 2.196424713241637e-05, |
| "loss": 0.0416, |
| "step": 1410 |
| }, |
| { |
| "epoch": 14.2, |
| "grad_norm": 0.4719603359699249, |
| "learning_rate": 2.128356686585282e-05, |
| "loss": 0.0339, |
| "step": 1420 |
| }, |
| { |
| "epoch": 14.3, |
| "grad_norm": 0.5977267622947693, |
| "learning_rate": 2.061073738537635e-05, |
| "loss": 0.0433, |
| "step": 1430 |
| }, |
| { |
| "epoch": 14.4, |
| "grad_norm": 0.924698531627655, |
| "learning_rate": 1.9945942635848748e-05, |
| "loss": 0.0436, |
| "step": 1440 |
| }, |
| { |
| "epoch": 14.5, |
| "grad_norm": 0.3880954384803772, |
| "learning_rate": 1.928936436551661e-05, |
| "loss": 0.0403, |
| "step": 1450 |
| }, |
| { |
| "epoch": 14.6, |
| "grad_norm": 0.4617973566055298, |
| "learning_rate": 1.8641182076323148e-05, |
| "loss": 0.0373, |
| "step": 1460 |
| }, |
| { |
| "epoch": 14.7, |
| "grad_norm": 0.666461169719696, |
| "learning_rate": 1.800157297483417e-05, |
| "loss": 0.0418, |
| "step": 1470 |
| }, |
| { |
| "epoch": 14.8, |
| "grad_norm": 0.4455702304840088, |
| "learning_rate": 1.7370711923791567e-05, |
| "loss": 0.0352, |
| "step": 1480 |
| }, |
| { |
| "epoch": 14.9, |
| "grad_norm": 1.0331430435180664, |
| "learning_rate": 1.6748771394307585e-05, |
| "loss": 0.05, |
| "step": 1490 |
| }, |
| { |
| "epoch": 15.0, |
| "grad_norm": 1.1350957155227661, |
| "learning_rate": 1.6135921418712956e-05, |
| "loss": 0.0365, |
| "step": 1500 |
| }, |
| { |
| "epoch": 15.1, |
| "grad_norm": 0.6353791952133179, |
| "learning_rate": 1.553232954407171e-05, |
| "loss": 0.0399, |
| "step": 1510 |
| }, |
| { |
| "epoch": 15.2, |
| "grad_norm": 0.5416865944862366, |
| "learning_rate": 1.4938160786375572e-05, |
| "loss": 0.0451, |
| "step": 1520 |
| }, |
| { |
| "epoch": 15.3, |
| "grad_norm": 0.864016056060791, |
| "learning_rate": 1.435357758543015e-05, |
| "loss": 0.0379, |
| "step": 1530 |
| }, |
| { |
| "epoch": 15.4, |
| "grad_norm": 0.4366832971572876, |
| "learning_rate": 1.3778739760445552e-05, |
| "loss": 0.0405, |
| "step": 1540 |
| }, |
| { |
| "epoch": 15.5, |
| "grad_norm": 0.5444951057434082, |
| "learning_rate": 1.3213804466343421e-05, |
| "loss": 0.0368, |
| "step": 1550 |
| }, |
| { |
| "epoch": 15.6, |
| "grad_norm": 1.8182613849639893, |
| "learning_rate": 1.2658926150792322e-05, |
| "loss": 0.0497, |
| "step": 1560 |
| }, |
| { |
| "epoch": 15.7, |
| "grad_norm": 0.5947335362434387, |
| "learning_rate": 1.2114256511983274e-05, |
| "loss": 0.0391, |
| "step": 1570 |
| }, |
| { |
| "epoch": 15.8, |
| "grad_norm": 0.794297993183136, |
| "learning_rate": 1.157994445715706e-05, |
| "loss": 0.0372, |
| "step": 1580 |
| }, |
| { |
| "epoch": 15.9, |
| "grad_norm": 0.6530230641365051, |
| "learning_rate": 1.1056136061894384e-05, |
| "loss": 0.0385, |
| "step": 1590 |
| }, |
| { |
| "epoch": 16.0, |
| "grad_norm": 2.78792405128479, |
| "learning_rate": 1.0542974530180327e-05, |
| "loss": 0.0485, |
| "step": 1600 |
| }, |
| { |
| "epoch": 16.1, |
| "grad_norm": 0.9636825323104858, |
| "learning_rate": 1.0040600155253765e-05, |
| "loss": 0.0371, |
| "step": 1610 |
| }, |
| { |
| "epoch": 16.2, |
| "grad_norm": 0.4774838984012604, |
| "learning_rate": 9.549150281252633e-06, |
| "loss": 0.0379, |
| "step": 1620 |
| }, |
| { |
| "epoch": 16.3, |
| "grad_norm": 0.6014675498008728, |
| "learning_rate": 9.068759265665384e-06, |
| "loss": 0.0354, |
| "step": 1630 |
| }, |
| { |
| "epoch": 16.4, |
| "grad_norm": 0.7133916020393372, |
| "learning_rate": 8.599558442598998e-06, |
| "loss": 0.036, |
| "step": 1640 |
| }, |
| { |
| "epoch": 16.5, |
| "grad_norm": 0.765894889831543, |
| "learning_rate": 8.141676086873572e-06, |
| "loss": 0.0371, |
| "step": 1650 |
| }, |
| { |
| "epoch": 16.6, |
| "grad_norm": 0.49927645921707153, |
| "learning_rate": 7.695237378953223e-06, |
| "loss": 0.0398, |
| "step": 1660 |
| }, |
| { |
| "epoch": 16.7, |
| "grad_norm": 1.3441693782806396, |
| "learning_rate": 7.260364370723044e-06, |
| "loss": 0.0393, |
| "step": 1670 |
| }, |
| { |
| "epoch": 16.8, |
| "grad_norm": 0.6928268671035767, |
| "learning_rate": 6.837175952121306e-06, |
| "loss": 0.0333, |
| "step": 1680 |
| }, |
| { |
| "epoch": 16.9, |
| "grad_norm": 0.4891086220741272, |
| "learning_rate": 6.425787818636131e-06, |
| "loss": 0.0357, |
| "step": 1690 |
| }, |
| { |
| "epoch": 17.0, |
| "grad_norm": 1.0727057456970215, |
| "learning_rate": 6.026312439675552e-06, |
| "loss": 0.0403, |
| "step": 1700 |
| }, |
| { |
| "epoch": 17.1, |
| "grad_norm": 0.5739259123802185, |
| "learning_rate": 5.6388590278194096e-06, |
| "loss": 0.0366, |
| "step": 1710 |
| }, |
| { |
| "epoch": 17.2, |
| "grad_norm": 0.5994676351547241, |
| "learning_rate": 5.263533508961827e-06, |
| "loss": 0.0365, |
| "step": 1720 |
| }, |
| { |
| "epoch": 17.3, |
| "grad_norm": 0.7043574452400208, |
| "learning_rate": 4.900438493352055e-06, |
| "loss": 0.0404, |
| "step": 1730 |
| }, |
| { |
| "epoch": 17.4, |
| "grad_norm": 0.4768543839454651, |
| "learning_rate": 4.549673247541875e-06, |
| "loss": 0.0419, |
| "step": 1740 |
| }, |
| { |
| "epoch": 17.5, |
| "grad_norm": 0.800778329372406, |
| "learning_rate": 4.2113336672471245e-06, |
| "loss": 0.0379, |
| "step": 1750 |
| }, |
| { |
| "epoch": 17.6, |
| "grad_norm": 0.9242330193519592, |
| "learning_rate": 3.885512251130763e-06, |
| "loss": 0.0337, |
| "step": 1760 |
| }, |
| { |
| "epoch": 17.7, |
| "grad_norm": 0.37277647852897644, |
| "learning_rate": 3.5722980755146517e-06, |
| "loss": 0.0381, |
| "step": 1770 |
| }, |
| { |
| "epoch": 17.8, |
| "grad_norm": 0.8925457000732422, |
| "learning_rate": 3.271776770026963e-06, |
| "loss": 0.0383, |
| "step": 1780 |
| }, |
| { |
| "epoch": 17.9, |
| "grad_norm": 0.8878549337387085, |
| "learning_rate": 2.9840304941919415e-06, |
| "loss": 0.0405, |
| "step": 1790 |
| }, |
| { |
| "epoch": 18.0, |
| "grad_norm": 0.779109537601471, |
| "learning_rate": 2.7091379149682685e-06, |
| "loss": 0.0341, |
| "step": 1800 |
| }, |
| { |
| "epoch": 18.1, |
| "grad_norm": 0.4987022280693054, |
| "learning_rate": 2.4471741852423237e-06, |
| "loss": 0.0435, |
| "step": 1810 |
| }, |
| { |
| "epoch": 18.2, |
| "grad_norm": 0.8446706533432007, |
| "learning_rate": 2.1982109232821178e-06, |
| "loss": 0.0401, |
| "step": 1820 |
| }, |
| { |
| "epoch": 18.3, |
| "grad_norm": 0.6136398911476135, |
| "learning_rate": 1.962316193157593e-06, |
| "loss": 0.0405, |
| "step": 1830 |
| }, |
| { |
| "epoch": 18.4, |
| "grad_norm": 0.5189019441604614, |
| "learning_rate": 1.7395544861325718e-06, |
| "loss": 0.035, |
| "step": 1840 |
| }, |
| { |
| "epoch": 18.5, |
| "grad_norm": 1.2081691026687622, |
| "learning_rate": 1.5299867030334814e-06, |
| "loss": 0.0301, |
| "step": 1850 |
| }, |
| { |
| "epoch": 18.6, |
| "grad_norm": 0.5128328204154968, |
| "learning_rate": 1.333670137599713e-06, |
| "loss": 0.0342, |
| "step": 1860 |
| }, |
| { |
| "epoch": 18.7, |
| "grad_norm": 0.6271176338195801, |
| "learning_rate": 1.1506584608200367e-06, |
| "loss": 0.0374, |
| "step": 1870 |
| }, |
| { |
| "epoch": 18.8, |
| "grad_norm": 0.7536263465881348, |
| "learning_rate": 9.810017062595322e-07, |
| "loss": 0.0432, |
| "step": 1880 |
| }, |
| { |
| "epoch": 18.9, |
| "grad_norm": 0.70732581615448, |
| "learning_rate": 8.247462563808817e-07, |
| "loss": 0.0389, |
| "step": 1890 |
| }, |
| { |
| "epoch": 19.0, |
| "grad_norm": 3.046426296234131, |
| "learning_rate": 6.819348298638839e-07, |
| "loss": 0.0339, |
| "step": 1900 |
| }, |
| { |
| "epoch": 19.1, |
| "grad_norm": 0.5528438091278076, |
| "learning_rate": 5.526064699265753e-07, |
| "loss": 0.0356, |
| "step": 1910 |
| }, |
| { |
| "epoch": 19.2, |
| "grad_norm": 0.4317626357078552, |
| "learning_rate": 4.367965336512403e-07, |
| "loss": 0.0415, |
| "step": 1920 |
| }, |
| { |
| "epoch": 19.3, |
| "grad_norm": 0.7551414966583252, |
| "learning_rate": 3.3453668231809286e-07, |
| "loss": 0.0375, |
| "step": 1930 |
| }, |
| { |
| "epoch": 19.4, |
| "grad_norm": 0.737501859664917, |
| "learning_rate": 2.458548727494292e-07, |
| "loss": 0.0391, |
| "step": 1940 |
| }, |
| { |
| "epoch": 19.5, |
| "grad_norm": 0.3441050052642822, |
| "learning_rate": 1.7077534966650766e-07, |
| "loss": 0.0356, |
| "step": 1950 |
| }, |
| { |
| "epoch": 19.6, |
| "grad_norm": 0.7213258147239685, |
| "learning_rate": 1.0931863906127327e-07, |
| "loss": 0.0377, |
| "step": 1960 |
| }, |
| { |
| "epoch": 19.7, |
| "grad_norm": 0.6739200353622437, |
| "learning_rate": 6.150154258476315e-08, |
| "loss": 0.0346, |
| "step": 1970 |
| }, |
| { |
| "epoch": 19.8, |
| "grad_norm": 0.4855494499206543, |
| "learning_rate": 2.7337132953697554e-08, |
| "loss": 0.0315, |
| "step": 1980 |
| }, |
| { |
| "epoch": 19.9, |
| "grad_norm": 0.8237863779067993, |
| "learning_rate": 6.834750376549792e-09, |
| "loss": 0.0403, |
| "step": 1990 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 0.4662252366542816, |
| "learning_rate": 0.0, |
| "loss": 0.0408, |
| "step": 2000 |
| }, |
| { |
| "epoch": 20.0, |
| "step": 2000, |
| "total_flos": 1.357860048774336e+17, |
| "train_loss": 0.09518647351861, |
| "train_runtime": 632.3751, |
| "train_samples_per_second": 100.257, |
| "train_steps_per_second": 3.163 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 2000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 20, |
| "save_steps": 20000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.357860048774336e+17, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|