| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 9.31899641577061, | |
| "eval_steps": 500, | |
| "global_step": 2600, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 240.98789978027344, | |
| "learning_rate": 9.978494623655915e-06, | |
| "loss": 4.1336, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": null, | |
| "learning_rate": 9.953405017921148e-06, | |
| "loss": 3.7709, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 414.7696533203125, | |
| "learning_rate": 9.917562724014338e-06, | |
| "loss": 3.5379, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 160.37644958496094, | |
| "learning_rate": 9.881720430107527e-06, | |
| "loss": 3.2512, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 1477.7020263671875, | |
| "learning_rate": 9.845878136200718e-06, | |
| "loss": 2.8186, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 444.38775634765625, | |
| "learning_rate": 9.810035842293908e-06, | |
| "loss": 2.5583, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 119.81842041015625, | |
| "learning_rate": 9.774193548387097e-06, | |
| "loss": 2.4916, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 175.25006103515625, | |
| "learning_rate": 9.74193548387097e-06, | |
| "loss": 2.4299, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 116.39012145996094, | |
| "learning_rate": 9.706093189964158e-06, | |
| "loss": 2.2983, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 313.2846984863281, | |
| "learning_rate": 9.670250896057349e-06, | |
| "loss": 2.233, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 228.02125549316406, | |
| "learning_rate": 9.634408602150539e-06, | |
| "loss": 2.0787, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 82.76283264160156, | |
| "learning_rate": 9.598566308243728e-06, | |
| "loss": 2.2126, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 94.6029052734375, | |
| "learning_rate": 9.562724014336918e-06, | |
| "loss": 2.0342, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 259.7580871582031, | |
| "learning_rate": 9.526881720430107e-06, | |
| "loss": 1.8917, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 453.753173828125, | |
| "learning_rate": 9.491039426523298e-06, | |
| "loss": 2.0701, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 131.5532989501953, | |
| "learning_rate": 9.455197132616488e-06, | |
| "loss": 1.8425, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 528.49560546875, | |
| "learning_rate": 9.419354838709677e-06, | |
| "loss": 1.7829, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 510.9673156738281, | |
| "learning_rate": 9.38351254480287e-06, | |
| "loss": 1.8383, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 140.07981872558594, | |
| "learning_rate": 9.347670250896058e-06, | |
| "loss": 2.0396, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 284.4012145996094, | |
| "learning_rate": 9.311827956989249e-06, | |
| "loss": 1.8723, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 105.90425109863281, | |
| "learning_rate": 9.27598566308244e-06, | |
| "loss": 1.8539, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 77.53498077392578, | |
| "learning_rate": 9.240143369175628e-06, | |
| "loss": 1.8487, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 203.25697326660156, | |
| "learning_rate": 9.204301075268819e-06, | |
| "loss": 1.7915, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 167.5474853515625, | |
| "learning_rate": 9.168458781362007e-06, | |
| "loss": 1.6751, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 141.84117126464844, | |
| "learning_rate": 9.132616487455198e-06, | |
| "loss": 1.7533, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 170.94772338867188, | |
| "learning_rate": 9.096774193548388e-06, | |
| "loss": 1.6008, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 155.2710418701172, | |
| "learning_rate": 9.060931899641577e-06, | |
| "loss": 1.7155, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 165.6481170654297, | |
| "learning_rate": 9.025089605734768e-06, | |
| "loss": 1.6922, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 108.59546661376953, | |
| "learning_rate": 8.989247311827958e-06, | |
| "loss": 1.737, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 119.43255615234375, | |
| "learning_rate": 8.953405017921147e-06, | |
| "loss": 1.5624, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 203.9807891845703, | |
| "learning_rate": 8.917562724014338e-06, | |
| "loss": 1.5555, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 196.70803833007812, | |
| "learning_rate": 8.881720430107528e-06, | |
| "loss": 1.4945, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 218.3573760986328, | |
| "learning_rate": 8.845878136200717e-06, | |
| "loss": 1.6784, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 218.7974395751953, | |
| "learning_rate": 8.810035842293907e-06, | |
| "loss": 1.5671, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 110.03742980957031, | |
| "learning_rate": 8.774193548387098e-06, | |
| "loss": 1.5509, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 536.1112060546875, | |
| "learning_rate": 8.738351254480287e-06, | |
| "loss": 1.5836, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 91.13652038574219, | |
| "learning_rate": 8.702508960573477e-06, | |
| "loss": 1.5235, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 111.45140075683594, | |
| "learning_rate": 8.666666666666668e-06, | |
| "loss": 1.5352, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 83.86921691894531, | |
| "learning_rate": 8.630824372759857e-06, | |
| "loss": 1.5994, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 64.33961486816406, | |
| "learning_rate": 8.594982078853047e-06, | |
| "loss": 1.5309, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 74.24256134033203, | |
| "learning_rate": 8.559139784946238e-06, | |
| "loss": 1.5243, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 88.8588638305664, | |
| "learning_rate": 8.523297491039427e-06, | |
| "loss": 1.4779, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 99.83770751953125, | |
| "learning_rate": 8.487455197132617e-06, | |
| "loss": 1.4671, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 219.4841766357422, | |
| "learning_rate": 8.451612903225808e-06, | |
| "loss": 1.4821, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 145.25010681152344, | |
| "learning_rate": 8.415770609318998e-06, | |
| "loss": 1.5258, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 109.16207885742188, | |
| "learning_rate": 8.379928315412187e-06, | |
| "loss": 1.4973, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 385.5920104980469, | |
| "learning_rate": 8.344086021505376e-06, | |
| "loss": 1.5602, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 78.8775634765625, | |
| "learning_rate": 8.308243727598568e-06, | |
| "loss": 1.4304, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 54.696563720703125, | |
| "learning_rate": 8.272401433691757e-06, | |
| "loss": 1.4581, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 145.9872283935547, | |
| "learning_rate": 8.236559139784947e-06, | |
| "loss": 1.4707, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 86.8324203491211, | |
| "learning_rate": 8.200716845878138e-06, | |
| "loss": 1.4593, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 328.5557556152344, | |
| "learning_rate": 8.164874551971327e-06, | |
| "loss": 1.4688, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 126.05339813232422, | |
| "learning_rate": 8.129032258064517e-06, | |
| "loss": 1.5903, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 216.65573120117188, | |
| "learning_rate": 8.093189964157708e-06, | |
| "loss": 1.6609, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 150.45306396484375, | |
| "learning_rate": 8.057347670250897e-06, | |
| "loss": 1.5142, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 84.9225845336914, | |
| "learning_rate": 8.021505376344087e-06, | |
| "loss": 1.5273, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 63.36981201171875, | |
| "learning_rate": 7.985663082437278e-06, | |
| "loss": 1.4743, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 70.1873779296875, | |
| "learning_rate": 7.949820788530466e-06, | |
| "loss": 1.5223, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 208.42889404296875, | |
| "learning_rate": 7.913978494623657e-06, | |
| "loss": 1.4587, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 59.578243255615234, | |
| "learning_rate": 7.878136200716846e-06, | |
| "loss": 1.3852, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 121.19114685058594, | |
| "learning_rate": 7.842293906810036e-06, | |
| "loss": 1.3795, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 81.8535385131836, | |
| "learning_rate": 7.806451612903227e-06, | |
| "loss": 1.3634, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 171.9092559814453, | |
| "learning_rate": 7.770609318996416e-06, | |
| "loss": 1.4038, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 189.8422393798828, | |
| "learning_rate": 7.734767025089606e-06, | |
| "loss": 1.359, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 191.68310546875, | |
| "learning_rate": 7.698924731182797e-06, | |
| "loss": 1.4473, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 60.011138916015625, | |
| "learning_rate": 7.663082437275985e-06, | |
| "loss": 1.3605, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 85.5229721069336, | |
| "learning_rate": 7.627240143369177e-06, | |
| "loss": 1.3775, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 72.5903549194336, | |
| "learning_rate": 7.5913978494623665e-06, | |
| "loss": 1.3748, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 246.1010284423828, | |
| "learning_rate": 7.555555555555556e-06, | |
| "loss": 1.3974, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 70.74227905273438, | |
| "learning_rate": 7.519713261648746e-06, | |
| "loss": 1.352, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 131.19888305664062, | |
| "learning_rate": 7.483870967741936e-06, | |
| "loss": 1.3262, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 178.96922302246094, | |
| "learning_rate": 7.448028673835126e-06, | |
| "loss": 1.3966, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 173.15139770507812, | |
| "learning_rate": 7.412186379928316e-06, | |
| "loss": 1.3522, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 139.38653564453125, | |
| "learning_rate": 7.376344086021506e-06, | |
| "loss": 1.3824, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 91.99787902832031, | |
| "learning_rate": 7.340501792114696e-06, | |
| "loss": 1.3789, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 113.1898193359375, | |
| "learning_rate": 7.3046594982078856e-06, | |
| "loss": 1.4338, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 92.90975952148438, | |
| "learning_rate": 7.268817204301076e-06, | |
| "loss": 1.3135, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 62.657371520996094, | |
| "learning_rate": 7.232974910394266e-06, | |
| "loss": 1.3329, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 44.4366340637207, | |
| "learning_rate": 7.1971326164874554e-06, | |
| "loss": 1.372, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 73.2843017578125, | |
| "learning_rate": 7.161290322580646e-06, | |
| "loss": 1.4003, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 33.83133316040039, | |
| "learning_rate": 7.125448028673836e-06, | |
| "loss": 1.3719, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 95.57901763916016, | |
| "learning_rate": 7.089605734767025e-06, | |
| "loss": 1.3175, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 90.033935546875, | |
| "learning_rate": 7.053763440860215e-06, | |
| "loss": 1.3899, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 3.01, | |
| "grad_norm": 73.42399597167969, | |
| "learning_rate": 7.0179211469534055e-06, | |
| "loss": 1.3479, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 3.05, | |
| "grad_norm": 90.77163696289062, | |
| "learning_rate": 6.982078853046595e-06, | |
| "loss": 1.3747, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 73.06351470947266, | |
| "learning_rate": 6.946236559139785e-06, | |
| "loss": 1.3437, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 177.39906311035156, | |
| "learning_rate": 6.910394265232976e-06, | |
| "loss": 1.3008, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 49.48398208618164, | |
| "learning_rate": 6.874551971326166e-06, | |
| "loss": 1.3858, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 3.19, | |
| "grad_norm": 60.6556396484375, | |
| "learning_rate": 6.838709677419355e-06, | |
| "loss": 1.3837, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 3.23, | |
| "grad_norm": 133.91407775878906, | |
| "learning_rate": 6.802867383512546e-06, | |
| "loss": 1.3502, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 3.26, | |
| "grad_norm": 100.1251449584961, | |
| "learning_rate": 6.767025089605736e-06, | |
| "loss": 1.3186, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 3.3, | |
| "grad_norm": 56.571807861328125, | |
| "learning_rate": 6.731182795698925e-06, | |
| "loss": 1.2846, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 37.74870681762695, | |
| "learning_rate": 6.695340501792115e-06, | |
| "loss": 1.3624, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 3.37, | |
| "grad_norm": 334.59722900390625, | |
| "learning_rate": 6.659498207885306e-06, | |
| "loss": 1.3377, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 3.41, | |
| "grad_norm": 67.167724609375, | |
| "learning_rate": 6.623655913978495e-06, | |
| "loss": 1.3796, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 46.475154876708984, | |
| "learning_rate": 6.587813620071685e-06, | |
| "loss": 1.3168, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 94.3563232421875, | |
| "learning_rate": 6.5519713261648755e-06, | |
| "loss": 1.2965, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 3.51, | |
| "grad_norm": 49.33637619018555, | |
| "learning_rate": 6.516129032258065e-06, | |
| "loss": 1.2851, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 3.55, | |
| "grad_norm": 78.11804962158203, | |
| "learning_rate": 6.480286738351255e-06, | |
| "loss": 1.4281, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "grad_norm": 135.7293243408203, | |
| "learning_rate": 6.444444444444445e-06, | |
| "loss": 1.361, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 3.62, | |
| "grad_norm": 449.07757568359375, | |
| "learning_rate": 6.408602150537635e-06, | |
| "loss": 1.3035, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 3.66, | |
| "grad_norm": 50.195037841796875, | |
| "learning_rate": 6.372759856630825e-06, | |
| "loss": 1.3036, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 3.69, | |
| "grad_norm": 51.2278938293457, | |
| "learning_rate": 6.336917562724015e-06, | |
| "loss": 1.3152, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "grad_norm": 115.5573501586914, | |
| "learning_rate": 6.301075268817205e-06, | |
| "loss": 1.2863, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "grad_norm": 48.20037078857422, | |
| "learning_rate": 6.2652329749103945e-06, | |
| "loss": 1.289, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 82.9049301147461, | |
| "learning_rate": 6.229390681003584e-06, | |
| "loss": 1.3162, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 49.85783386230469, | |
| "learning_rate": 6.193548387096775e-06, | |
| "loss": 1.439, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 3.87, | |
| "grad_norm": 72.38436126708984, | |
| "learning_rate": 6.157706093189964e-06, | |
| "loss": 1.3553, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 3.91, | |
| "grad_norm": 69.86207580566406, | |
| "learning_rate": 6.121863799283154e-06, | |
| "loss": 1.3005, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 3.94, | |
| "grad_norm": 121.75460052490234, | |
| "learning_rate": 6.086021505376345e-06, | |
| "loss": 1.3552, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 3.98, | |
| "grad_norm": 74.01500701904297, | |
| "learning_rate": 6.050179211469534e-06, | |
| "loss": 1.2324, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 4.01, | |
| "grad_norm": 120.46449279785156, | |
| "learning_rate": 6.014336917562724e-06, | |
| "loss": 1.2653, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 4.05, | |
| "grad_norm": 129.35137939453125, | |
| "learning_rate": 5.978494623655915e-06, | |
| "loss": 1.2437, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 4.09, | |
| "grad_norm": 114.84891510009766, | |
| "learning_rate": 5.942652329749104e-06, | |
| "loss": 1.2631, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 4.12, | |
| "grad_norm": 48.180904388427734, | |
| "learning_rate": 5.906810035842294e-06, | |
| "loss": 1.2735, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 4.16, | |
| "grad_norm": 40.2717170715332, | |
| "learning_rate": 5.8709677419354835e-06, | |
| "loss": 1.2728, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 4.19, | |
| "grad_norm": 71.15570068359375, | |
| "learning_rate": 5.835125448028675e-06, | |
| "loss": 1.3059, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 4.23, | |
| "grad_norm": 62.20613479614258, | |
| "learning_rate": 5.7992831541218645e-06, | |
| "loss": 1.2574, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 4.27, | |
| "grad_norm": 83.15790557861328, | |
| "learning_rate": 5.763440860215054e-06, | |
| "loss": 1.3026, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 4.3, | |
| "grad_norm": 79.50647735595703, | |
| "learning_rate": 5.727598566308245e-06, | |
| "loss": 1.237, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 4.34, | |
| "grad_norm": 69.64093780517578, | |
| "learning_rate": 5.691756272401434e-06, | |
| "loss": 1.2479, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 4.37, | |
| "grad_norm": 40.61994171142578, | |
| "learning_rate": 5.655913978494624e-06, | |
| "loss": 1.2842, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 4.41, | |
| "grad_norm": 53.86878967285156, | |
| "learning_rate": 5.620071684587815e-06, | |
| "loss": 1.267, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 4.44, | |
| "grad_norm": 53.299129486083984, | |
| "learning_rate": 5.584229390681004e-06, | |
| "loss": 1.2256, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 4.48, | |
| "grad_norm": 124.71385192871094, | |
| "learning_rate": 5.548387096774194e-06, | |
| "loss": 1.245, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 4.52, | |
| "grad_norm": 48.88494873046875, | |
| "learning_rate": 5.5125448028673844e-06, | |
| "loss": 1.2428, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 4.55, | |
| "grad_norm": 34.74821472167969, | |
| "learning_rate": 5.476702508960574e-06, | |
| "loss": 1.241, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 4.59, | |
| "grad_norm": 92.53400421142578, | |
| "learning_rate": 5.440860215053764e-06, | |
| "loss": 1.2655, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 4.62, | |
| "grad_norm": 79.42727661132812, | |
| "learning_rate": 5.4050179211469535e-06, | |
| "loss": 1.2246, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 4.66, | |
| "grad_norm": 42.84294891357422, | |
| "learning_rate": 5.369175627240144e-06, | |
| "loss": 1.234, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 4.7, | |
| "grad_norm": 43.57499313354492, | |
| "learning_rate": 5.333333333333334e-06, | |
| "loss": 1.2637, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 4.73, | |
| "grad_norm": 141.61326599121094, | |
| "learning_rate": 5.297491039426523e-06, | |
| "loss": 1.2883, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 4.77, | |
| "grad_norm": 71.41532897949219, | |
| "learning_rate": 5.261648745519714e-06, | |
| "loss": 1.2633, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 56.57035827636719, | |
| "learning_rate": 5.2258064516129035e-06, | |
| "loss": 1.238, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 4.84, | |
| "grad_norm": 38.39865493774414, | |
| "learning_rate": 5.189964157706093e-06, | |
| "loss": 1.2587, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 4.87, | |
| "grad_norm": 45.26354217529297, | |
| "learning_rate": 5.154121863799284e-06, | |
| "loss": 1.2644, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 4.91, | |
| "grad_norm": 62.85947799682617, | |
| "learning_rate": 5.118279569892473e-06, | |
| "loss": 1.3197, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 4.95, | |
| "grad_norm": 171.12010192871094, | |
| "learning_rate": 5.082437275985663e-06, | |
| "loss": 1.2333, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 4.98, | |
| "grad_norm": 38.02899932861328, | |
| "learning_rate": 5.0465949820788544e-06, | |
| "loss": 1.2987, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 5.02, | |
| "grad_norm": 47.150367736816406, | |
| "learning_rate": 5.010752688172043e-06, | |
| "loss": 1.2651, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 5.05, | |
| "grad_norm": 50.13650894165039, | |
| "learning_rate": 4.974910394265233e-06, | |
| "loss": 1.224, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 5.09, | |
| "grad_norm": 56.77107238769531, | |
| "learning_rate": 4.9390681003584234e-06, | |
| "loss": 1.1944, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 5.13, | |
| "grad_norm": 45.99001693725586, | |
| "learning_rate": 4.903225806451613e-06, | |
| "loss": 1.2481, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 5.16, | |
| "grad_norm": 94.43379974365234, | |
| "learning_rate": 4.867383512544804e-06, | |
| "loss": 1.2333, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 5.2, | |
| "grad_norm": 92.9940414428711, | |
| "learning_rate": 4.831541218637993e-06, | |
| "loss": 1.2503, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 5.23, | |
| "grad_norm": 62.211280822753906, | |
| "learning_rate": 4.795698924731183e-06, | |
| "loss": 1.238, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 5.27, | |
| "grad_norm": 56.011722564697266, | |
| "learning_rate": 4.7598566308243735e-06, | |
| "loss": 1.2228, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 5.3, | |
| "grad_norm": 131.8081817626953, | |
| "learning_rate": 4.724014336917563e-06, | |
| "loss": 1.1854, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 5.34, | |
| "grad_norm": 38.84933090209961, | |
| "learning_rate": 4.688172043010753e-06, | |
| "loss": 1.2052, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 5.38, | |
| "grad_norm": 53.045658111572266, | |
| "learning_rate": 4.652329749103943e-06, | |
| "loss": 1.3113, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 5.41, | |
| "grad_norm": 48.19401931762695, | |
| "learning_rate": 4.616487455197133e-06, | |
| "loss": 1.2192, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 5.45, | |
| "grad_norm": 31.12712860107422, | |
| "learning_rate": 4.580645161290323e-06, | |
| "loss": 1.2003, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 5.48, | |
| "grad_norm": 2096.69091796875, | |
| "learning_rate": 4.544802867383513e-06, | |
| "loss": 1.2451, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 5.52, | |
| "grad_norm": 76.25735473632812, | |
| "learning_rate": 4.508960573476703e-06, | |
| "loss": 1.2152, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 5.56, | |
| "grad_norm": 66.96879577636719, | |
| "learning_rate": 4.473118279569893e-06, | |
| "loss": 1.1985, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 5.59, | |
| "grad_norm": 74.71087646484375, | |
| "learning_rate": 4.437275985663082e-06, | |
| "loss": 1.2033, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 5.63, | |
| "grad_norm": 64.05677795410156, | |
| "learning_rate": 4.401433691756273e-06, | |
| "loss": 1.2461, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 5.66, | |
| "grad_norm": 35.51209259033203, | |
| "learning_rate": 4.365591397849463e-06, | |
| "loss": 1.2277, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 5.7, | |
| "grad_norm": 50.55402374267578, | |
| "learning_rate": 4.329749103942653e-06, | |
| "loss": 1.2312, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 5.73, | |
| "grad_norm": 39.16292953491211, | |
| "learning_rate": 4.293906810035843e-06, | |
| "loss": 1.1947, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 5.77, | |
| "grad_norm": 108.2676773071289, | |
| "learning_rate": 4.258064516129032e-06, | |
| "loss": 1.2399, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 5.81, | |
| "grad_norm": 37.47825622558594, | |
| "learning_rate": 4.222222222222223e-06, | |
| "loss": 1.2542, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 5.84, | |
| "grad_norm": 41.715301513671875, | |
| "learning_rate": 4.1863799283154125e-06, | |
| "loss": 1.1657, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 5.88, | |
| "grad_norm": 70.7906723022461, | |
| "learning_rate": 4.150537634408602e-06, | |
| "loss": 1.18, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 5.91, | |
| "grad_norm": 73.00814056396484, | |
| "learning_rate": 4.114695340501793e-06, | |
| "loss": 1.1753, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 5.95, | |
| "grad_norm": 40.9326286315918, | |
| "learning_rate": 4.078853046594982e-06, | |
| "loss": 1.1954, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 5.99, | |
| "grad_norm": 71.95938873291016, | |
| "learning_rate": 4.043010752688172e-06, | |
| "loss": 1.239, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 6.02, | |
| "grad_norm": 35.70452880859375, | |
| "learning_rate": 4.0071684587813626e-06, | |
| "loss": 1.2607, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 6.06, | |
| "grad_norm": 33.75741958618164, | |
| "learning_rate": 3.971326164874552e-06, | |
| "loss": 1.2169, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 6.09, | |
| "grad_norm": 39.85159683227539, | |
| "learning_rate": 3.935483870967742e-06, | |
| "loss": 1.198, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 6.13, | |
| "grad_norm": 51.76079177856445, | |
| "learning_rate": 3.8996415770609324e-06, | |
| "loss": 1.2028, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 6.16, | |
| "grad_norm": 41.39125061035156, | |
| "learning_rate": 3.863799283154122e-06, | |
| "loss": 1.2141, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 6.2, | |
| "grad_norm": 53.27019119262695, | |
| "learning_rate": 3.827956989247313e-06, | |
| "loss": 1.1815, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 6.24, | |
| "grad_norm": 54.405303955078125, | |
| "learning_rate": 3.792114695340502e-06, | |
| "loss": 1.2037, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 6.27, | |
| "grad_norm": 60.07841491699219, | |
| "learning_rate": 3.756272401433692e-06, | |
| "loss": 1.2098, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 6.31, | |
| "grad_norm": 82.04875183105469, | |
| "learning_rate": 3.720430107526882e-06, | |
| "loss": 1.1644, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 6.34, | |
| "grad_norm": 26.22403335571289, | |
| "learning_rate": 3.6845878136200717e-06, | |
| "loss": 1.1614, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 6.38, | |
| "grad_norm": 48.65098190307617, | |
| "learning_rate": 3.6487455197132623e-06, | |
| "loss": 1.2599, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 6.42, | |
| "grad_norm": 42.07708740234375, | |
| "learning_rate": 3.6129032258064515e-06, | |
| "loss": 1.0894, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 6.45, | |
| "grad_norm": 36.60328674316406, | |
| "learning_rate": 3.577060931899642e-06, | |
| "loss": 1.1918, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 6.49, | |
| "grad_norm": 61.540828704833984, | |
| "learning_rate": 3.541218637992832e-06, | |
| "loss": 1.1573, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 6.52, | |
| "grad_norm": 55.06793975830078, | |
| "learning_rate": 3.505376344086022e-06, | |
| "loss": 1.2161, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 6.56, | |
| "grad_norm": 52.780025482177734, | |
| "learning_rate": 3.469534050179212e-06, | |
| "loss": 1.2136, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 6.59, | |
| "grad_norm": 36.805023193359375, | |
| "learning_rate": 3.4336917562724016e-06, | |
| "loss": 1.2059, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 6.63, | |
| "grad_norm": 34.84747314453125, | |
| "learning_rate": 3.3978494623655917e-06, | |
| "loss": 1.1723, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 6.67, | |
| "grad_norm": 29.03413200378418, | |
| "learning_rate": 3.3620071684587818e-06, | |
| "loss": 1.2304, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 6.7, | |
| "grad_norm": 43.57373809814453, | |
| "learning_rate": 3.3261648745519714e-06, | |
| "loss": 1.2636, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 6.74, | |
| "grad_norm": 40.338401794433594, | |
| "learning_rate": 3.2903225806451615e-06, | |
| "loss": 1.201, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 6.77, | |
| "grad_norm": 49.39339828491211, | |
| "learning_rate": 3.254480286738351e-06, | |
| "loss": 1.1907, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 6.81, | |
| "grad_norm": 44.37787628173828, | |
| "learning_rate": 3.2186379928315413e-06, | |
| "loss": 1.1608, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 6.85, | |
| "grad_norm": 30.73851776123047, | |
| "learning_rate": 3.182795698924732e-06, | |
| "loss": 1.0991, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 6.88, | |
| "grad_norm": 75.66316986083984, | |
| "learning_rate": 3.146953405017921e-06, | |
| "loss": 1.2072, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 6.92, | |
| "grad_norm": 79.0891342163086, | |
| "learning_rate": 3.1111111111111116e-06, | |
| "loss": 1.1899, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 6.95, | |
| "grad_norm": 44.473907470703125, | |
| "learning_rate": 3.0752688172043017e-06, | |
| "loss": 1.2603, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 6.99, | |
| "grad_norm": 70.97260284423828, | |
| "learning_rate": 3.0394265232974914e-06, | |
| "loss": 1.1917, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 7.03, | |
| "grad_norm": 113.41941833496094, | |
| "learning_rate": 3.0035842293906814e-06, | |
| "loss": 1.2268, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 7.06, | |
| "grad_norm": 31.316911697387695, | |
| "learning_rate": 2.967741935483871e-06, | |
| "loss": 1.1049, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 7.1, | |
| "grad_norm": 35.60115051269531, | |
| "learning_rate": 2.9318996415770612e-06, | |
| "loss": 1.1542, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 7.13, | |
| "grad_norm": 57.42852783203125, | |
| "learning_rate": 2.8960573476702513e-06, | |
| "loss": 1.2427, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 7.17, | |
| "grad_norm": 40.23640441894531, | |
| "learning_rate": 2.860215053763441e-06, | |
| "loss": 1.1448, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 7.2, | |
| "grad_norm": 36.7147102355957, | |
| "learning_rate": 2.824372759856631e-06, | |
| "loss": 1.1392, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 7.24, | |
| "grad_norm": 33.4290657043457, | |
| "learning_rate": 2.7885304659498208e-06, | |
| "loss": 1.1704, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 7.28, | |
| "grad_norm": 56.32290267944336, | |
| "learning_rate": 2.752688172043011e-06, | |
| "loss": 1.1495, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 7.31, | |
| "grad_norm": 43.89424133300781, | |
| "learning_rate": 2.716845878136201e-06, | |
| "loss": 1.1387, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 7.35, | |
| "grad_norm": 39.107975006103516, | |
| "learning_rate": 2.6810035842293906e-06, | |
| "loss": 1.3479, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 7.38, | |
| "grad_norm": 56.70566177368164, | |
| "learning_rate": 2.645161290322581e-06, | |
| "loss": 1.1315, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 7.42, | |
| "grad_norm": 28.802082061767578, | |
| "learning_rate": 2.6093189964157704e-06, | |
| "loss": 1.1935, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 7.46, | |
| "grad_norm": 48.39360809326172, | |
| "learning_rate": 2.573476702508961e-06, | |
| "loss": 1.2482, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 7.49, | |
| "grad_norm": 32.54924774169922, | |
| "learning_rate": 2.537634408602151e-06, | |
| "loss": 1.2273, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 7.53, | |
| "grad_norm": 39.42136764526367, | |
| "learning_rate": 2.5017921146953407e-06, | |
| "loss": 1.1796, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 7.56, | |
| "grad_norm": 45.770755767822266, | |
| "learning_rate": 2.4659498207885308e-06, | |
| "loss": 1.1732, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 7.6, | |
| "grad_norm": 38.276832580566406, | |
| "learning_rate": 2.4301075268817204e-06, | |
| "loss": 1.1732, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 7.63, | |
| "grad_norm": 41.075775146484375, | |
| "learning_rate": 2.3942652329749105e-06, | |
| "loss": 1.1713, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 7.67, | |
| "grad_norm": 65.88035583496094, | |
| "learning_rate": 2.3584229390681006e-06, | |
| "loss": 1.1167, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 7.71, | |
| "grad_norm": 36.76555633544922, | |
| "learning_rate": 2.3225806451612907e-06, | |
| "loss": 1.1947, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 7.74, | |
| "grad_norm": 60.711055755615234, | |
| "learning_rate": 2.2867383512544804e-06, | |
| "loss": 1.1164, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 7.78, | |
| "grad_norm": 37.08338928222656, | |
| "learning_rate": 2.2508960573476705e-06, | |
| "loss": 1.1199, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 7.81, | |
| "grad_norm": 41.63785171508789, | |
| "learning_rate": 2.21505376344086e-06, | |
| "loss": 1.2048, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 7.85, | |
| "grad_norm": 62.35722351074219, | |
| "learning_rate": 2.1792114695340507e-06, | |
| "loss": 1.2011, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 7.89, | |
| "grad_norm": 44.37682342529297, | |
| "learning_rate": 2.1433691756272404e-06, | |
| "loss": 1.1709, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 7.92, | |
| "grad_norm": 40.216739654541016, | |
| "learning_rate": 2.1075268817204305e-06, | |
| "loss": 1.1959, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 7.96, | |
| "grad_norm": 35.443302154541016, | |
| "learning_rate": 2.07168458781362e-06, | |
| "loss": 1.1123, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 7.99, | |
| "grad_norm": 59.596954345703125, | |
| "learning_rate": 2.0358422939068102e-06, | |
| "loss": 1.1139, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 8.03, | |
| "grad_norm": 34.79387283325195, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 1.1845, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 8.06, | |
| "grad_norm": 50.571712493896484, | |
| "learning_rate": 1.96415770609319e-06, | |
| "loss": 1.1528, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 8.1, | |
| "grad_norm": 33.95150375366211, | |
| "learning_rate": 1.92831541218638e-06, | |
| "loss": 1.1858, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 8.14, | |
| "grad_norm": 64.71048736572266, | |
| "learning_rate": 1.89247311827957e-06, | |
| "loss": 1.1199, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 8.17, | |
| "grad_norm": 76.19322967529297, | |
| "learning_rate": 1.8566308243727599e-06, | |
| "loss": 1.1457, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 8.21, | |
| "grad_norm": 40.506675720214844, | |
| "learning_rate": 1.82078853046595e-06, | |
| "loss": 1.1886, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 8.24, | |
| "grad_norm": 34.859432220458984, | |
| "learning_rate": 1.7849462365591399e-06, | |
| "loss": 1.1133, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 8.28, | |
| "grad_norm": 39.036376953125, | |
| "learning_rate": 1.74910394265233e-06, | |
| "loss": 1.1845, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 8.32, | |
| "grad_norm": 72.71634674072266, | |
| "learning_rate": 1.7132616487455198e-06, | |
| "loss": 1.1685, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 8.35, | |
| "grad_norm": 29.658227920532227, | |
| "learning_rate": 1.67741935483871e-06, | |
| "loss": 1.1849, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 8.39, | |
| "grad_norm": 40.88108825683594, | |
| "learning_rate": 1.6415770609318998e-06, | |
| "loss": 1.1473, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 8.42, | |
| "grad_norm": 46.79905700683594, | |
| "learning_rate": 1.6057347670250897e-06, | |
| "loss": 1.1407, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 8.46, | |
| "grad_norm": 40.85004806518555, | |
| "learning_rate": 1.5698924731182796e-06, | |
| "loss": 1.0891, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 8.49, | |
| "grad_norm": 43.564849853515625, | |
| "learning_rate": 1.5340501792114695e-06, | |
| "loss": 1.1173, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 8.53, | |
| "grad_norm": 39.988792419433594, | |
| "learning_rate": 1.4982078853046598e-06, | |
| "loss": 1.0948, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 8.57, | |
| "grad_norm": 33.0150260925293, | |
| "learning_rate": 1.4623655913978497e-06, | |
| "loss": 1.1763, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 8.6, | |
| "grad_norm": 36.02336120605469, | |
| "learning_rate": 1.4265232974910395e-06, | |
| "loss": 1.1621, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 8.64, | |
| "grad_norm": 34.898765563964844, | |
| "learning_rate": 1.3906810035842294e-06, | |
| "loss": 1.2091, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 8.67, | |
| "grad_norm": 34.42953109741211, | |
| "learning_rate": 1.3548387096774195e-06, | |
| "loss": 1.1255, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 8.71, | |
| "grad_norm": 86.2882080078125, | |
| "learning_rate": 1.3189964157706094e-06, | |
| "loss": 1.1501, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 8.75, | |
| "grad_norm": 43.11504364013672, | |
| "learning_rate": 1.2831541218637993e-06, | |
| "loss": 1.2159, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 8.78, | |
| "grad_norm": 37.7353630065918, | |
| "learning_rate": 1.2473118279569894e-06, | |
| "loss": 1.1255, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 8.82, | |
| "grad_norm": 33.8388671875, | |
| "learning_rate": 1.2114695340501793e-06, | |
| "loss": 1.1677, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 8.85, | |
| "grad_norm": 129.2806396484375, | |
| "learning_rate": 1.1756272401433692e-06, | |
| "loss": 1.1739, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 8.89, | |
| "grad_norm": 31.244264602661133, | |
| "learning_rate": 1.1397849462365593e-06, | |
| "loss": 1.1247, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 8.92, | |
| "grad_norm": 30.236568450927734, | |
| "learning_rate": 1.1039426523297491e-06, | |
| "loss": 1.093, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 8.96, | |
| "grad_norm": 42.34114456176758, | |
| "learning_rate": 1.0681003584229392e-06, | |
| "loss": 1.1688, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "grad_norm": 34.8842887878418, | |
| "learning_rate": 1.0322580645161291e-06, | |
| "loss": 1.1828, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 9.03, | |
| "grad_norm": 34.398521423339844, | |
| "learning_rate": 9.96415770609319e-07, | |
| "loss": 1.1351, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 9.07, | |
| "grad_norm": 42.38921356201172, | |
| "learning_rate": 9.60573476702509e-07, | |
| "loss": 1.1673, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 9.1, | |
| "grad_norm": 62.44378662109375, | |
| "learning_rate": 9.24731182795699e-07, | |
| "loss": 1.1386, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 9.14, | |
| "grad_norm": 30.45479965209961, | |
| "learning_rate": 8.88888888888889e-07, | |
| "loss": 1.1577, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 9.18, | |
| "grad_norm": 37.57691192626953, | |
| "learning_rate": 8.530465949820789e-07, | |
| "loss": 1.1291, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 9.21, | |
| "grad_norm": 54.45702362060547, | |
| "learning_rate": 8.17204301075269e-07, | |
| "loss": 1.1575, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 9.25, | |
| "grad_norm": 57.186737060546875, | |
| "learning_rate": 7.813620071684588e-07, | |
| "loss": 1.132, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 9.28, | |
| "grad_norm": 35.08234405517578, | |
| "learning_rate": 7.455197132616488e-07, | |
| "loss": 1.1816, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 9.32, | |
| "grad_norm": 87.56676483154297, | |
| "learning_rate": 7.096774193548388e-07, | |
| "loss": 1.1528, | |
| "step": 2600 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2790, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 200, | |
| "total_flos": 3.965868410199552e+19, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |