{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 3930,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002544529262086514,
      "grad_norm": 2.004002094268799,
      "learning_rate": 3.0508474576271192e-06,
      "loss": 0.1146,
      "step": 10
    },
    {
      "epoch": 0.005089058524173028,
      "grad_norm": 2.2637343406677246,
      "learning_rate": 6.440677966101695e-06,
      "loss": 0.1546,
      "step": 20
    },
    {
      "epoch": 0.007633587786259542,
      "grad_norm": 1.814893364906311,
      "learning_rate": 9.830508474576272e-06,
      "loss": 0.0832,
      "step": 30
    },
    {
      "epoch": 0.010178117048346057,
      "grad_norm": 2.5804829597473145,
      "learning_rate": 1.3220338983050848e-05,
      "loss": 0.0677,
      "step": 40
    },
    {
      "epoch": 0.01272264631043257,
      "grad_norm": 2.116039991378784,
      "learning_rate": 1.6610169491525424e-05,
      "loss": 0.0305,
      "step": 50
    },
    {
      "epoch": 0.015267175572519083,
      "grad_norm": 3.1931254863739014,
      "learning_rate": 2e-05,
      "loss": 0.0375,
      "step": 60
    },
    {
      "epoch": 0.017811704834605598,
      "grad_norm": 0.12293073534965515,
      "learning_rate": 2.338983050847458e-05,
      "loss": 0.0239,
      "step": 70
    },
    {
      "epoch": 0.020356234096692113,
      "grad_norm": 1.320569634437561,
      "learning_rate": 2.6779661016949153e-05,
      "loss": 0.0311,
      "step": 80
    },
    {
      "epoch": 0.022900763358778626,
      "grad_norm": 1.4948492050170898,
      "learning_rate": 3.016949152542373e-05,
      "loss": 0.0258,
      "step": 90
    },
    {
      "epoch": 0.02544529262086514,
      "grad_norm": 1.2656677961349487,
      "learning_rate": 3.355932203389831e-05,
      "loss": 0.0258,
      "step": 100
    },
    {
      "epoch": 0.027989821882951654,
      "grad_norm": 1.5219528675079346,
      "learning_rate": 3.6949152542372886e-05,
      "loss": 0.0109,
      "step": 110
    },
    {
      "epoch": 0.030534351145038167,
      "grad_norm": 0.7582465410232544,
      "learning_rate": 3.9989506820566634e-05,
      "loss": 0.0178,
      "step": 120
    },
    {
      "epoch": 0.03307888040712468,
      "grad_norm": 0.43421927094459534,
      "learning_rate": 3.9884575026232955e-05,
      "loss": 0.018,
      "step": 130
    },
    {
      "epoch": 0.035623409669211195,
      "grad_norm": 0.03073570877313614,
      "learning_rate": 3.977964323189927e-05,
      "loss": 0.0135,
      "step": 140
    },
    {
      "epoch": 0.03816793893129771,
      "grad_norm": 0.28750094771385193,
      "learning_rate": 3.967471143756559e-05,
      "loss": 0.0173,
      "step": 150
    },
    {
      "epoch": 0.04071246819338423,
      "grad_norm": 0.47052061557769775,
      "learning_rate": 3.9569779643231905e-05,
      "loss": 0.0117,
      "step": 160
    },
    {
      "epoch": 0.043256997455470736,
      "grad_norm": 1.7329446077346802,
      "learning_rate": 3.946484784889822e-05,
      "loss": 0.0089,
      "step": 170
    },
    {
      "epoch": 0.04580152671755725,
      "grad_norm": 0.037718575447797775,
      "learning_rate": 3.935991605456454e-05,
      "loss": 0.0202,
      "step": 180
    },
    {
      "epoch": 0.04834605597964377,
      "grad_norm": 0.4059722423553467,
      "learning_rate": 3.9254984260230855e-05,
      "loss": 0.0066,
      "step": 190
    },
    {
      "epoch": 0.05089058524173028,
      "grad_norm": 0.043499384075403214,
      "learning_rate": 3.915005246589717e-05,
      "loss": 0.0111,
      "step": 200
    },
    {
      "epoch": 0.05343511450381679,
      "grad_norm": 0.504442036151886,
      "learning_rate": 3.904512067156349e-05,
      "loss": 0.0188,
      "step": 210
    },
    {
      "epoch": 0.05597964376590331,
      "grad_norm": 1.7431628704071045,
      "learning_rate": 3.8940188877229805e-05,
      "loss": 0.0129,
      "step": 220
    },
    {
      "epoch": 0.058524173027989825,
      "grad_norm": 0.18788424134254456,
      "learning_rate": 3.8835257082896126e-05,
      "loss": 0.0155,
      "step": 230
    },
    {
      "epoch": 0.061068702290076333,
      "grad_norm": 0.03526950255036354,
      "learning_rate": 3.873032528856244e-05,
      "loss": 0.0134,
      "step": 240
    },
    {
      "epoch": 0.06361323155216285,
      "grad_norm": 0.7251449823379517,
      "learning_rate": 3.8625393494228755e-05,
      "loss": 0.0078,
      "step": 250
    },
    {
      "epoch": 0.06615776081424936,
      "grad_norm": 0.05955138057470322,
      "learning_rate": 3.8520461699895076e-05,
      "loss": 0.016,
      "step": 260
    },
    {
      "epoch": 0.06870229007633588,
      "grad_norm": 0.11732513457536697,
      "learning_rate": 3.841552990556139e-05,
      "loss": 0.022,
      "step": 270
    },
    {
      "epoch": 0.07124681933842239,
      "grad_norm": 0.03472783789038658,
      "learning_rate": 3.8310598111227705e-05,
      "loss": 0.013,
      "step": 280
    },
    {
      "epoch": 0.0737913486005089,
      "grad_norm": 0.7197083234786987,
      "learning_rate": 3.8205666316894026e-05,
      "loss": 0.0102,
      "step": 290
    },
    {
      "epoch": 0.07633587786259542,
      "grad_norm": 0.03828183189034462,
      "learning_rate": 3.810073452256034e-05,
      "loss": 0.0078,
      "step": 300
    },
    {
      "epoch": 0.07888040712468193,
      "grad_norm": 0.6017931699752808,
      "learning_rate": 3.7995802728226654e-05,
      "loss": 0.0205,
      "step": 310
    },
    {
      "epoch": 0.08142493638676845,
      "grad_norm": 0.44738417863845825,
      "learning_rate": 3.7890870933892976e-05,
      "loss": 0.0178,
      "step": 320
    },
    {
      "epoch": 0.08396946564885496,
      "grad_norm": 0.03330178186297417,
      "learning_rate": 3.778593913955929e-05,
      "loss": 0.0145,
      "step": 330
    },
    {
      "epoch": 0.08651399491094147,
      "grad_norm": 0.01718255504965782,
      "learning_rate": 3.768100734522561e-05,
      "loss": 0.0147,
      "step": 340
    },
    {
      "epoch": 0.089058524173028,
      "grad_norm": 0.8818934559822083,
      "learning_rate": 3.7576075550891925e-05,
      "loss": 0.0128,
      "step": 350
    },
    {
      "epoch": 0.0916030534351145,
      "grad_norm": 0.6107049584388733,
      "learning_rate": 3.747114375655824e-05,
      "loss": 0.0182,
      "step": 360
    },
    {
      "epoch": 0.09414758269720101,
      "grad_norm": 0.7103002071380615,
      "learning_rate": 3.736621196222456e-05,
      "loss": 0.005,
      "step": 370
    },
    {
      "epoch": 0.09669211195928754,
      "grad_norm": 0.548647403717041,
      "learning_rate": 3.7261280167890875e-05,
      "loss": 0.0201,
      "step": 380
    },
    {
      "epoch": 0.09923664122137404,
      "grad_norm": 0.0742243304848671,
      "learning_rate": 3.715634837355719e-05,
      "loss": 0.0129,
      "step": 390
    },
    {
      "epoch": 0.10178117048346055,
      "grad_norm": 1.2889010906219482,
      "learning_rate": 3.705141657922351e-05,
      "loss": 0.0231,
      "step": 400
    },
    {
      "epoch": 0.10432569974554708,
      "grad_norm": 1.0989108085632324,
      "learning_rate": 3.6946484784889825e-05,
      "loss": 0.0149,
      "step": 410
    },
    {
      "epoch": 0.10687022900763359,
      "grad_norm": 0.6415719985961914,
      "learning_rate": 3.684155299055614e-05,
      "loss": 0.0143,
      "step": 420
    },
    {
      "epoch": 0.10941475826972011,
      "grad_norm": 0.5659549236297607,
      "learning_rate": 3.673662119622246e-05,
      "loss": 0.0149,
      "step": 430
    },
    {
      "epoch": 0.11195928753180662,
      "grad_norm": 0.05708279460668564,
      "learning_rate": 3.6631689401888775e-05,
      "loss": 0.0043,
      "step": 440
    },
    {
      "epoch": 0.11450381679389313,
      "grad_norm": 1.086536169052124,
      "learning_rate": 3.652675760755509e-05,
      "loss": 0.007,
      "step": 450
    },
    {
      "epoch": 0.11704834605597965,
      "grad_norm": 0.17354388535022736,
      "learning_rate": 3.642182581322141e-05,
      "loss": 0.0037,
      "step": 460
    },
    {
      "epoch": 0.11959287531806616,
      "grad_norm": 0.34753769636154175,
      "learning_rate": 3.6316894018887725e-05,
      "loss": 0.0058,
      "step": 470
    },
    {
      "epoch": 0.12213740458015267,
      "grad_norm": 0.010063248686492443,
      "learning_rate": 3.6211962224554046e-05,
      "loss": 0.0097,
      "step": 480
    },
    {
      "epoch": 0.12468193384223919,
      "grad_norm": 0.027567019686102867,
      "learning_rate": 3.610703043022036e-05,
      "loss": 0.0095,
      "step": 490
    },
    {
      "epoch": 0.1272264631043257,
      "grad_norm": 0.16245301067829132,
      "learning_rate": 3.6002098635886675e-05,
      "loss": 0.0169,
      "step": 500
    },
    {
      "epoch": 0.1272264631043257,
      "eval_loss": 0.03691313415765762,
      "eval_runtime": 134.0261,
      "eval_samples_per_second": 59.556,
      "eval_steps_per_second": 0.47,
      "step": 500
    },
    {
      "epoch": 0.1297709923664122,
      "grad_norm": 1.1503355503082275,
      "learning_rate": 3.5897166841552996e-05,
      "loss": 0.0209,
      "step": 510
    },
    {
      "epoch": 0.13231552162849872,
      "grad_norm": 0.19143138825893402,
      "learning_rate": 3.579223504721931e-05,
      "loss": 0.0116,
      "step": 520
    },
    {
      "epoch": 0.13486005089058525,
      "grad_norm": 0.09249290823936462,
      "learning_rate": 3.5687303252885625e-05,
      "loss": 0.0052,
      "step": 530
    },
    {
      "epoch": 0.13740458015267176,
      "grad_norm": 0.4119039475917816,
      "learning_rate": 3.5582371458551946e-05,
      "loss": 0.0092,
      "step": 540
    },
    {
      "epoch": 0.13994910941475827,
      "grad_norm": 0.48269760608673096,
      "learning_rate": 3.547743966421826e-05,
      "loss": 0.0087,
      "step": 550
    },
    {
      "epoch": 0.14249363867684478,
      "grad_norm": 0.7463809251785278,
      "learning_rate": 3.5372507869884575e-05,
      "loss": 0.0122,
      "step": 560
    },
    {
      "epoch": 0.1450381679389313,
      "grad_norm": 0.09356890618801117,
      "learning_rate": 3.5267576075550896e-05,
      "loss": 0.0115,
      "step": 570
    },
    {
      "epoch": 0.1475826972010178,
      "grad_norm": 0.6823534965515137,
      "learning_rate": 3.516264428121721e-05,
      "loss": 0.0038,
      "step": 580
    },
    {
      "epoch": 0.15012722646310434,
      "grad_norm": 0.012240025214850903,
      "learning_rate": 3.505771248688353e-05,
      "loss": 0.0104,
      "step": 590
    },
    {
      "epoch": 0.15267175572519084,
      "grad_norm": 0.22518590092658997,
      "learning_rate": 3.4952780692549846e-05,
      "loss": 0.0027,
      "step": 600
    },
    {
      "epoch": 0.15521628498727735,
      "grad_norm": 0.32648953795433044,
      "learning_rate": 3.484784889821616e-05,
      "loss": 0.0112,
      "step": 610
    },
    {
      "epoch": 0.15776081424936386,
      "grad_norm": 0.07640829682350159,
      "learning_rate": 3.474291710388248e-05,
      "loss": 0.0122,
      "step": 620
    },
    {
      "epoch": 0.16030534351145037,
      "grad_norm": 0.027291180565953255,
      "learning_rate": 3.4637985309548795e-05,
      "loss": 0.0145,
      "step": 630
    },
    {
      "epoch": 0.1628498727735369,
      "grad_norm": 0.9284016489982605,
      "learning_rate": 3.453305351521511e-05,
      "loss": 0.0086,
      "step": 640
    },
    {
      "epoch": 0.16539440203562342,
      "grad_norm": 0.06572818756103516,
      "learning_rate": 3.442812172088143e-05,
      "loss": 0.0046,
      "step": 650
    },
    {
      "epoch": 0.16793893129770993,
      "grad_norm": 0.5768460631370544,
      "learning_rate": 3.4323189926547745e-05,
      "loss": 0.0099,
      "step": 660
    },
    {
      "epoch": 0.17048346055979643,
      "grad_norm": 0.13030396401882172,
      "learning_rate": 3.421825813221406e-05,
      "loss": 0.006,
      "step": 670
    },
    {
      "epoch": 0.17302798982188294,
      "grad_norm": 0.061677299439907074,
      "learning_rate": 3.411332633788038e-05,
      "loss": 0.0172,
      "step": 680
    },
    {
      "epoch": 0.17557251908396945,
      "grad_norm": 0.4030856788158417,
      "learning_rate": 3.4008394543546695e-05,
      "loss": 0.0205,
      "step": 690
    },
    {
      "epoch": 0.178117048346056,
      "grad_norm": 0.7717676758766174,
      "learning_rate": 3.3903462749213016e-05,
      "loss": 0.0112,
      "step": 700
    },
    {
      "epoch": 0.1806615776081425,
      "grad_norm": 0.867887556552887,
      "learning_rate": 3.379853095487933e-05,
      "loss": 0.0065,
      "step": 710
    },
    {
      "epoch": 0.183206106870229,
      "grad_norm": 0.12486184388399124,
      "learning_rate": 3.3693599160545645e-05,
      "loss": 0.0105,
      "step": 720
    },
    {
      "epoch": 0.18575063613231552,
      "grad_norm": 0.6370356678962708,
      "learning_rate": 3.3588667366211966e-05,
      "loss": 0.0058,
      "step": 730
    },
    {
      "epoch": 0.18829516539440203,
      "grad_norm": 0.30326998233795166,
      "learning_rate": 3.348373557187828e-05,
      "loss": 0.0114,
      "step": 740
    },
    {
      "epoch": 0.19083969465648856,
      "grad_norm": 0.03924720734357834,
      "learning_rate": 3.3378803777544595e-05,
      "loss": 0.0103,
      "step": 750
    },
    {
      "epoch": 0.19338422391857507,
      "grad_norm": 0.5556716918945312,
      "learning_rate": 3.3273871983210916e-05,
      "loss": 0.0059,
      "step": 760
    },
    {
      "epoch": 0.19592875318066158,
      "grad_norm": 0.12600132822990417,
      "learning_rate": 3.316894018887723e-05,
      "loss": 0.0032,
      "step": 770
    },
    {
      "epoch": 0.1984732824427481,
      "grad_norm": 0.21691930294036865,
      "learning_rate": 3.3064008394543545e-05,
      "loss": 0.0099,
      "step": 780
    },
    {
      "epoch": 0.2010178117048346,
      "grad_norm": 0.8290544748306274,
      "learning_rate": 3.2959076600209866e-05,
      "loss": 0.006,
      "step": 790
    },
    {
      "epoch": 0.2035623409669211,
      "grad_norm": 1.1044622659683228,
      "learning_rate": 3.285414480587618e-05,
      "loss": 0.0169,
      "step": 800
    },
    {
      "epoch": 0.20610687022900764,
      "grad_norm": 0.24777106940746307,
      "learning_rate": 3.27492130115425e-05,
      "loss": 0.0144,
      "step": 810
    },
    {
      "epoch": 0.20865139949109415,
      "grad_norm": 0.06799839437007904,
      "learning_rate": 3.2644281217208816e-05,
      "loss": 0.0038,
      "step": 820
    },
    {
      "epoch": 0.21119592875318066,
      "grad_norm": 0.5728234052658081,
      "learning_rate": 3.253934942287513e-05,
      "loss": 0.0085,
      "step": 830
    },
    {
      "epoch": 0.21374045801526717,
      "grad_norm": 1.0202137231826782,
      "learning_rate": 3.243441762854145e-05,
      "loss": 0.0163,
      "step": 840
    },
    {
      "epoch": 0.21628498727735368,
      "grad_norm": 0.00811342429369688,
      "learning_rate": 3.2329485834207766e-05,
      "loss": 0.0057,
      "step": 850
    },
    {
      "epoch": 0.21882951653944022,
      "grad_norm": 0.22676686942577362,
      "learning_rate": 3.222455403987408e-05,
      "loss": 0.0153,
      "step": 860
    },
    {
      "epoch": 0.22137404580152673,
      "grad_norm": 0.02477932535111904,
      "learning_rate": 3.21196222455404e-05,
      "loss": 0.0035,
      "step": 870
    },
    {
      "epoch": 0.22391857506361323,
      "grad_norm": 0.08315183967351913,
      "learning_rate": 3.2014690451206715e-05,
      "loss": 0.0068,
      "step": 880
    },
    {
      "epoch": 0.22646310432569974,
      "grad_norm": 1.0003092288970947,
      "learning_rate": 3.190975865687304e-05,
      "loss": 0.0119,
      "step": 890
    },
    {
      "epoch": 0.22900763358778625,
      "grad_norm": 0.034200772643089294,
      "learning_rate": 3.180482686253935e-05,
      "loss": 0.008,
      "step": 900
    },
    {
      "epoch": 0.23155216284987276,
      "grad_norm": 0.0161778274923563,
      "learning_rate": 3.1699895068205665e-05,
      "loss": 0.0105,
      "step": 910
    },
    {
      "epoch": 0.2340966921119593,
      "grad_norm": 0.07032974809408188,
      "learning_rate": 3.1594963273871987e-05,
      "loss": 0.0096,
      "step": 920
    },
    {
      "epoch": 0.2366412213740458,
      "grad_norm": 0.10155557096004486,
      "learning_rate": 3.14900314795383e-05,
      "loss": 0.0066,
      "step": 930
    },
    {
      "epoch": 0.23918575063613232,
      "grad_norm": 0.004129286855459213,
      "learning_rate": 3.1385099685204615e-05,
      "loss": 0.0016,
      "step": 940
    },
    {
      "epoch": 0.24173027989821882,
      "grad_norm": 0.00802706554532051,
      "learning_rate": 3.1280167890870936e-05,
      "loss": 0.0118,
      "step": 950
    },
    {
      "epoch": 0.24427480916030533,
      "grad_norm": 0.09856593608856201,
      "learning_rate": 3.117523609653725e-05,
      "loss": 0.0023,
      "step": 960
    },
    {
      "epoch": 0.24681933842239187,
      "grad_norm": 0.11473847180604935,
      "learning_rate": 3.107030430220357e-05,
      "loss": 0.0056,
      "step": 970
    },
    {
      "epoch": 0.24936386768447838,
      "grad_norm": 0.02289162203669548,
      "learning_rate": 3.0965372507869886e-05,
      "loss": 0.0131,
      "step": 980
    },
    {
      "epoch": 0.25190839694656486,
      "grad_norm": 0.08203335851430893,
      "learning_rate": 3.08604407135362e-05,
      "loss": 0.0049,
      "step": 990
    },
    {
      "epoch": 0.2544529262086514,
      "grad_norm": 0.8752655982971191,
      "learning_rate": 3.075550891920252e-05,
      "loss": 0.0102,
      "step": 1000
    },
    {
      "epoch": 0.2544529262086514,
      "eval_loss": 0.03172941133379936,
      "eval_runtime": 133.9654,
      "eval_samples_per_second": 59.583,
      "eval_steps_per_second": 0.47,
      "step": 1000
    },
    {
      "epoch": 0.25699745547073793,
      "grad_norm": 0.04296811670064926,
      "learning_rate": 3.0650577124868836e-05,
      "loss": 0.0045,
      "step": 1010
    },
    {
      "epoch": 0.2595419847328244,
      "grad_norm": 0.14025937020778656,
      "learning_rate": 3.054564533053516e-05,
      "loss": 0.0069,
      "step": 1020
    },
    {
      "epoch": 0.26208651399491095,
      "grad_norm": 0.5874730348587036,
      "learning_rate": 3.0440713536201468e-05,
      "loss": 0.0057,
      "step": 1030
    },
    {
      "epoch": 0.26463104325699743,
      "grad_norm": 0.45463526248931885,
      "learning_rate": 3.033578174186779e-05,
      "loss": 0.0119,
      "step": 1040
    },
    {
      "epoch": 0.26717557251908397,
      "grad_norm": 0.4644736051559448,
      "learning_rate": 3.0230849947534104e-05,
      "loss": 0.004,
      "step": 1050
    },
    {
      "epoch": 0.2697201017811705,
      "grad_norm": 0.23533551394939423,
      "learning_rate": 3.0125918153200425e-05,
      "loss": 0.002,
      "step": 1060
    },
    {
      "epoch": 0.272264631043257,
      "grad_norm": 0.744405210018158,
      "learning_rate": 3.002098635886674e-05,
      "loss": 0.0088,
      "step": 1070
    },
    {
      "epoch": 0.2748091603053435,
      "grad_norm": 0.4194672703742981,
      "learning_rate": 2.9916054564533054e-05,
      "loss": 0.0108,
      "step": 1080
    },
    {
      "epoch": 0.27735368956743,
      "grad_norm": 0.5709512829780579,
      "learning_rate": 2.9811122770199375e-05,
      "loss": 0.0068,
      "step": 1090
    },
    {
      "epoch": 0.27989821882951654,
      "grad_norm": 0.029444962739944458,
      "learning_rate": 2.970619097586569e-05,
      "loss": 0.0103,
      "step": 1100
    },
    {
      "epoch": 0.2824427480916031,
      "grad_norm": 0.024460772052407265,
      "learning_rate": 2.9601259181532003e-05,
      "loss": 0.0069,
      "step": 1110
    },
    {
      "epoch": 0.28498727735368956,
      "grad_norm": 0.9423603415489197,
      "learning_rate": 2.9496327387198325e-05,
      "loss": 0.018,
      "step": 1120
    },
    {
      "epoch": 0.2875318066157761,
      "grad_norm": 0.1192903071641922,
      "learning_rate": 2.939139559286464e-05,
      "loss": 0.007,
      "step": 1130
    },
    {
      "epoch": 0.2900763358778626,
      "grad_norm": 0.12459497898817062,
      "learning_rate": 2.9286463798530957e-05,
      "loss": 0.004,
      "step": 1140
    },
    {
      "epoch": 0.2926208651399491,
      "grad_norm": 0.02133631706237793,
      "learning_rate": 2.9181532004197274e-05,
      "loss": 0.0097,
      "step": 1150
    },
    {
      "epoch": 0.2951653944020356,
      "grad_norm": 0.20045505464076996,
      "learning_rate": 2.907660020986359e-05,
      "loss": 0.0096,
      "step": 1160
    },
    {
      "epoch": 0.29770992366412213,
      "grad_norm": 0.8285555243492126,
      "learning_rate": 2.897166841552991e-05,
      "loss": 0.0153,
      "step": 1170
    },
    {
      "epoch": 0.30025445292620867,
      "grad_norm": 0.11353142559528351,
      "learning_rate": 2.8866736621196224e-05,
      "loss": 0.004,
      "step": 1180
    },
    {
      "epoch": 0.30279898218829515,
      "grad_norm": 0.316145658493042,
      "learning_rate": 2.8761804826862542e-05,
      "loss": 0.0118,
      "step": 1190
    },
    {
      "epoch": 0.3053435114503817,
      "grad_norm": 0.20705455541610718,
      "learning_rate": 2.865687303252886e-05,
      "loss": 0.0048,
      "step": 1200
    },
    {
      "epoch": 0.30788804071246817,
      "grad_norm": 0.004804595839232206,
      "learning_rate": 2.8551941238195174e-05,
      "loss": 0.0077,
      "step": 1210
    },
    {
      "epoch": 0.3104325699745547,
      "grad_norm": 0.003488596761599183,
      "learning_rate": 2.8447009443861492e-05,
      "loss": 0.0115,
      "step": 1220
    },
    {
      "epoch": 0.31297709923664124,
      "grad_norm": 0.10154404491186142,
      "learning_rate": 2.834207764952781e-05,
      "loss": 0.0146,
      "step": 1230
    },
    {
      "epoch": 0.3155216284987277,
      "grad_norm": 0.008709642104804516,
      "learning_rate": 2.8237145855194124e-05,
      "loss": 0.0061,
      "step": 1240
    },
    {
      "epoch": 0.31806615776081426,
      "grad_norm": 0.018860768526792526,
      "learning_rate": 2.8132214060860442e-05,
      "loss": 0.0091,
      "step": 1250
    },
    {
      "epoch": 0.32061068702290074,
      "grad_norm": 0.10052315890789032,
      "learning_rate": 2.802728226652676e-05,
      "loss": 0.0189,
      "step": 1260
    },
    {
      "epoch": 0.3231552162849873,
      "grad_norm": 0.012599390931427479,
      "learning_rate": 2.7922350472193077e-05,
      "loss": 0.0037,
      "step": 1270
    },
    {
      "epoch": 0.3256997455470738,
      "grad_norm": 0.08410943299531937,
      "learning_rate": 2.7817418677859395e-05,
      "loss": 0.0047,
      "step": 1280
    },
    {
      "epoch": 0.3282442748091603,
      "grad_norm": 0.11278515309095383,
      "learning_rate": 2.771248688352571e-05,
      "loss": 0.0033,
      "step": 1290
    },
    {
      "epoch": 0.33078880407124683,
      "grad_norm": 0.508403480052948,
      "learning_rate": 2.7607555089192027e-05,
      "loss": 0.008,
      "step": 1300
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.2811458706855774,
      "learning_rate": 2.7502623294858345e-05,
      "loss": 0.0116,
      "step": 1310
    },
    {
      "epoch": 0.33587786259541985,
      "grad_norm": 0.012120573781430721,
      "learning_rate": 2.7397691500524663e-05,
      "loss": 0.0094,
      "step": 1320
    },
    {
      "epoch": 0.3384223918575064,
      "grad_norm": 0.24956151843070984,
      "learning_rate": 2.7292759706190977e-05,
      "loss": 0.0031,
      "step": 1330
    },
    {
      "epoch": 0.34096692111959287,
      "grad_norm": 0.7218794226646423,
      "learning_rate": 2.7187827911857295e-05,
      "loss": 0.0105,
      "step": 1340
    },
    {
      "epoch": 0.3435114503816794,
      "grad_norm": 0.24596373736858368,
      "learning_rate": 2.7082896117523613e-05,
      "loss": 0.0123,
      "step": 1350
    },
    {
      "epoch": 0.3460559796437659,
      "grad_norm": 1.128461480140686,
      "learning_rate": 2.6977964323189927e-05,
      "loss": 0.0129,
      "step": 1360
    },
    {
      "epoch": 0.3486005089058524,
      "grad_norm": 0.029505103826522827,
      "learning_rate": 2.6873032528856248e-05,
      "loss": 0.0066,
      "step": 1370
    },
    {
      "epoch": 0.3511450381679389,
      "grad_norm": 0.2863447964191437,
      "learning_rate": 2.6768100734522562e-05,
      "loss": 0.0084,
      "step": 1380
    },
    {
      "epoch": 0.35368956743002544,
      "grad_norm": 0.6585884690284729,
      "learning_rate": 2.666316894018888e-05,
      "loss": 0.0121,
      "step": 1390
    },
    {
      "epoch": 0.356234096692112,
      "grad_norm": 0.12626522779464722,
      "learning_rate": 2.6558237145855198e-05,
      "loss": 0.0061,
      "step": 1400
    },
    {
      "epoch": 0.35877862595419846,
      "grad_norm": 0.042731188237667084,
      "learning_rate": 2.6453305351521512e-05,
      "loss": 0.0079,
      "step": 1410
    },
    {
      "epoch": 0.361323155216285,
      "grad_norm": 0.49549469351768494,
      "learning_rate": 2.6348373557187833e-05,
      "loss": 0.0107,
      "step": 1420
    },
    {
      "epoch": 0.3638676844783715,
      "grad_norm": 0.05457068234682083,
      "learning_rate": 2.6243441762854148e-05,
      "loss": 0.0051,
      "step": 1430
    },
    {
      "epoch": 0.366412213740458,
      "grad_norm": 0.03797609731554985,
      "learning_rate": 2.6138509968520462e-05,
      "loss": 0.0146,
      "step": 1440
    },
    {
      "epoch": 0.36895674300254455,
      "grad_norm": 0.025207195430994034,
      "learning_rate": 2.6033578174186783e-05,
      "loss": 0.0081,
      "step": 1450
    },
    {
      "epoch": 0.37150127226463103,
      "grad_norm": 0.03196549415588379,
      "learning_rate": 2.5928646379853098e-05,
      "loss": 0.01,
      "step": 1460
    },
    {
      "epoch": 0.37404580152671757,
      "grad_norm": 0.322093665599823,
      "learning_rate": 2.5823714585519412e-05,
      "loss": 0.0031,
      "step": 1470
    },
    {
      "epoch": 0.37659033078880405,
      "grad_norm": 0.29793310165405273,
      "learning_rate": 2.5718782791185733e-05,
      "loss": 0.0138,
      "step": 1480
    },
    {
      "epoch": 0.3791348600508906,
      "grad_norm": 0.1609342247247696,
      "learning_rate": 2.5613850996852048e-05,
      "loss": 0.0057,
      "step": 1490
    },
    {
      "epoch": 0.3816793893129771,
      "grad_norm": 0.014492412097752094,
      "learning_rate": 2.5508919202518362e-05,
      "loss": 0.0124,
      "step": 1500
    },
    {
      "epoch": 0.3816793893129771,
      "eval_loss": 0.03146525099873543,
      "eval_runtime": 133.9818,
      "eval_samples_per_second": 59.575,
      "eval_steps_per_second": 0.47,
      "step": 1500
    },
    {
      "epoch": 0.3842239185750636,
      "grad_norm": 0.6495288610458374,
      "learning_rate": 2.5403987408184683e-05,
      "loss": 0.0137,
      "step": 1510
    },
    {
      "epoch": 0.38676844783715014,
      "grad_norm": 0.1830611675977707,
      "learning_rate": 2.5299055613850997e-05,
      "loss": 0.0029,
      "step": 1520
    },
    {
      "epoch": 0.3893129770992366,
      "grad_norm": 0.1593904048204422,
      "learning_rate": 2.519412381951732e-05,
      "loss": 0.006,
      "step": 1530
    },
    {
      "epoch": 0.39185750636132316,
      "grad_norm": 0.012680516578257084,
      "learning_rate": 2.5089192025183633e-05,
      "loss": 0.0032,
      "step": 1540
    },
    {
      "epoch": 0.3944020356234097,
      "grad_norm": 0.6925283074378967,
      "learning_rate": 2.4984260230849947e-05,
      "loss": 0.0074,
      "step": 1550
    },
    {
      "epoch": 0.3969465648854962,
      "grad_norm": 0.3493160605430603,
      "learning_rate": 2.487932843651627e-05,
      "loss": 0.012,
      "step": 1560
    },
    {
      "epoch": 0.3994910941475827,
      "grad_norm": 0.14689674973487854,
      "learning_rate": 2.4774396642182583e-05,
      "loss": 0.0038,
      "step": 1570
    },
    {
      "epoch": 0.4020356234096692,
      "grad_norm": 0.015744954347610474,
      "learning_rate": 2.4669464847848897e-05,
      "loss": 0.003,
      "step": 1580
    },
    {
      "epoch": 0.40458015267175573,
      "grad_norm": 0.8252820372581482,
      "learning_rate": 2.456453305351522e-05,
      "loss": 0.0084,
      "step": 1590
    },
    {
      "epoch": 0.4071246819338422,
      "grad_norm": 0.07908014953136444,
      "learning_rate": 2.4459601259181533e-05,
      "loss": 0.0031,
      "step": 1600
    },
    {
      "epoch": 0.40966921119592875,
      "grad_norm": 0.019779745489358902,
      "learning_rate": 2.435466946484785e-05,
      "loss": 0.0025,
      "step": 1610
    },
    {
      "epoch": 0.4122137404580153,
      "grad_norm": 0.19202356040477753,
      "learning_rate": 2.4249737670514168e-05,
      "loss": 0.0018,
      "step": 1620
    },
    {
      "epoch": 0.41475826972010177,
      "grad_norm": 0.9459726214408875,
      "learning_rate": 2.4144805876180483e-05,
      "loss": 0.005,
      "step": 1630
    },
    {
      "epoch": 0.4173027989821883,
      "grad_norm": 0.7893335819244385,
      "learning_rate": 2.4039874081846804e-05,
      "loss": 0.0081,
      "step": 1640
    },
    {
      "epoch": 0.4198473282442748,
      "grad_norm": 0.0010036466410383582,
      "learning_rate": 2.3934942287513118e-05,
      "loss": 0.0021,
      "step": 1650
    },
    {
      "epoch": 0.4223918575063613,
      "grad_norm": 0.015236877836287022,
      "learning_rate": 2.3830010493179432e-05,
      "loss": 0.008,
      "step": 1660
    },
    {
      "epoch": 0.42493638676844786,
      "grad_norm": 0.00819339882582426,
      "learning_rate": 2.3725078698845754e-05,
      "loss": 0.005,
      "step": 1670
    },
    {
      "epoch": 0.42748091603053434,
      "grad_norm": 0.05410633608698845,
      "learning_rate": 2.3620146904512068e-05,
      "loss": 0.0053,
      "step": 1680
    },
    {
      "epoch": 0.4300254452926209,
      "grad_norm": 0.10227911174297333,
      "learning_rate": 2.3515215110178386e-05,
      "loss": 0.0101,
      "step": 1690
    },
    {
      "epoch": 0.43256997455470736,
      "grad_norm": 0.08396944403648376,
      "learning_rate": 2.3410283315844703e-05,
      "loss": 0.0074,
      "step": 1700
    },
    {
      "epoch": 0.4351145038167939,
      "grad_norm": 0.8800942897796631,
      "learning_rate": 2.3305351521511018e-05,
      "loss": 0.0108,
      "step": 1710
    },
    {
      "epoch": 0.43765903307888043,
      "grad_norm": 0.0064217569306492805,
      "learning_rate": 2.3200419727177336e-05,
      "loss": 0.0053,
      "step": 1720
    },
    {
      "epoch": 0.4402035623409669,
      "grad_norm": 0.9264568090438843,
      "learning_rate": 2.3095487932843653e-05,
      "loss": 0.0084,
      "step": 1730
    },
    {
      "epoch": 0.44274809160305345,
      "grad_norm": 0.03843522444367409,
      "learning_rate": 2.299055613850997e-05,
      "loss": 0.0025,
      "step": 1740
    },
    {
      "epoch": 0.44529262086513993,
      "grad_norm": 0.7688780426979065,
      "learning_rate": 2.288562434417629e-05,
      "loss": 0.0094,
      "step": 1750
    },
    {
      "epoch": 0.44783715012722647,
      "grad_norm": 0.007313898764550686,
      "learning_rate": 2.2780692549842603e-05,
      "loss": 0.0074,
      "step": 1760
    },
    {
      "epoch": 0.45038167938931295,
      "grad_norm": 0.10244712233543396,
      "learning_rate": 2.267576075550892e-05,
      "loss": 0.0075,
      "step": 1770
    },
    {
      "epoch": 0.4529262086513995,
      "grad_norm": 0.6030476093292236,
      "learning_rate": 2.257082896117524e-05,
      "loss": 0.0172,
      "step": 1780
    },
    {
      "epoch": 0.455470737913486,
      "grad_norm": 0.08637584000825882,
      "learning_rate": 2.2465897166841556e-05,
      "loss": 0.0068,
      "step": 1790
    },
    {
      "epoch": 0.4580152671755725,
      "grad_norm": 0.5561503767967224,
      "learning_rate": 2.236096537250787e-05,
      "loss": 0.0104,
      "step": 1800
    },
    {
      "epoch": 0.46055979643765904,
      "grad_norm": 0.08013448119163513,
      "learning_rate": 2.225603357817419e-05,
      "loss": 0.0078,
      "step": 1810
    },
    {
      "epoch": 0.4631043256997455,
      "grad_norm": 0.08120064437389374,
      "learning_rate": 2.2151101783840506e-05,
      "loss": 0.0047,
      "step": 1820
    },
    {
      "epoch": 0.46564885496183206,
      "grad_norm": 0.021734222769737244,
      "learning_rate": 2.204616998950682e-05,
      "loss": 0.0067,
      "step": 1830
    },
    {
      "epoch": 0.4681933842239186,
      "grad_norm": 0.014017133973538876,
      "learning_rate": 2.194123819517314e-05,
      "loss": 0.0047,
      "step": 1840
    },
    {
      "epoch": 0.4707379134860051,
      "grad_norm": 0.10124704986810684,
      "learning_rate": 2.1836306400839456e-05,
      "loss": 0.0041,
      "step": 1850
    },
    {
      "epoch": 0.4732824427480916,
      "grad_norm": 0.2559342086315155,
      "learning_rate": 2.1731374606505774e-05,
      "loss": 0.0171,
      "step": 1860
    },
    {
      "epoch": 0.4758269720101781,
      "grad_norm": 0.003074784530326724,
      "learning_rate": 2.1626442812172092e-05,
      "loss": 0.0087,
      "step": 1870
    },
    {
      "epoch": 0.47837150127226463,
      "grad_norm": 0.2251855880022049,
      "learning_rate": 2.1521511017838406e-05,
      "loss": 0.0026,
      "step": 1880
    },
    {
      "epoch": 0.48091603053435117,
      "grad_norm": 0.7686161994934082,
      "learning_rate": 2.1416579223504724e-05,
      "loss": 0.011,
      "step": 1890
    },
    {
      "epoch": 0.48346055979643765,
      "grad_norm": 0.06315213441848755,
      "learning_rate": 2.131164742917104e-05,
      "loss": 0.0098,
      "step": 1900
    },
    {
      "epoch": 0.4860050890585242,
      "grad_norm": 0.1871805340051651,
      "learning_rate": 2.1206715634837356e-05,
      "loss": 0.0065,
      "step": 1910
    },
    {
      "epoch": 0.48854961832061067,
      "grad_norm": 0.1880640983581543,
      "learning_rate": 2.1101783840503677e-05,
      "loss": 0.0021,
      "step": 1920
    },
    {
      "epoch": 0.4910941475826972,
      "grad_norm": 1.1694693565368652,
      "learning_rate": 2.099685204616999e-05,
      "loss": 0.0083,
      "step": 1930
    },
    {
      "epoch": 0.49363867684478374,
      "grad_norm": 0.2776516079902649,
      "learning_rate": 2.0891920251836306e-05,
      "loss": 0.0053,
      "step": 1940
    },
    {
      "epoch": 0.4961832061068702,
      "grad_norm": 0.023003289476037025,
      "learning_rate": 2.0786988457502627e-05,
      "loss": 0.01,
      "step": 1950
    },
    {
      "epoch": 0.49872773536895676,
      "grad_norm": 0.07819438725709915,
      "learning_rate": 2.068205666316894e-05,
      "loss": 0.0038,
      "step": 1960
    },
    {
      "epoch": 0.5012722646310432,
      "grad_norm": 0.6131492853164673,
      "learning_rate": 2.0577124868835262e-05,
      "loss": 0.0053,
      "step": 1970
    },
    {
      "epoch": 0.5038167938931297,
      "grad_norm": 0.2298940122127533,
      "learning_rate": 2.0472193074501577e-05,
      "loss": 0.0075,
      "step": 1980
    },
    {
      "epoch": 0.5063613231552163,
      "grad_norm": 0.09217698127031326,
      "learning_rate": 2.036726128016789e-05,
      "loss": 0.0024,
      "step": 1990
    },
    {
      "epoch": 0.5089058524173028,
      "grad_norm": 0.11719799041748047,
      "learning_rate": 2.0262329485834212e-05,
      "loss": 0.0017,
      "step": 2000
    },
    {
      "epoch": 0.5089058524173028,
      "eval_loss": 0.028622597455978394,
      "eval_runtime": 133.9853,
      "eval_samples_per_second": 59.574,
      "eval_steps_per_second": 0.47,
      "step": 2000
    },
    {
      "epoch": 0.5114503816793893,
      "grad_norm": 0.0031085775699466467,
      "learning_rate": 2.0157397691500527e-05,
      "loss": 0.0109,
      "step": 2010
    },
    {
      "epoch": 0.5139949109414759,
      "grad_norm": 0.45600050687789917,
      "learning_rate": 2.005246589716684e-05,
      "loss": 0.0052,
      "step": 2020
    },
    {
      "epoch": 0.5165394402035624,
      "grad_norm": 0.15607476234436035,
      "learning_rate": 1.994753410283316e-05,
      "loss": 0.0072,
      "step": 2030
    },
    {
      "epoch": 0.5190839694656488,
      "grad_norm": 0.4832943081855774,
      "learning_rate": 1.9842602308499477e-05,
      "loss": 0.0088,
      "step": 2040
    },
    {
      "epoch": 0.5216284987277354,
      "grad_norm": 0.13958820700645447,
      "learning_rate": 1.9737670514165794e-05,
      "loss": 0.0075,
      "step": 2050
    },
    {
      "epoch": 0.5241730279898219,
      "grad_norm": 0.9070898294448853,
      "learning_rate": 1.9632738719832112e-05,
      "loss": 0.0062,
      "step": 2060
    },
    {
      "epoch": 0.5267175572519084,
      "grad_norm": 0.3955434262752533,
      "learning_rate": 1.9527806925498426e-05,
      "loss": 0.0112,
      "step": 2070
    },
    {
      "epoch": 0.5292620865139949,
      "grad_norm": 0.616914689540863,
      "learning_rate": 1.9422875131164744e-05,
      "loss": 0.0133,
      "step": 2080
    },
    {
      "epoch": 0.5318066157760815,
      "grad_norm": 0.006997487973421812,
      "learning_rate": 1.9317943336831062e-05,
      "loss": 0.0103,
      "step": 2090
    },
    {
      "epoch": 0.5343511450381679,
      "grad_norm": 0.7367590665817261,
      "learning_rate": 1.9213011542497376e-05,
      "loss": 0.0117,
      "step": 2100
    },
    {
      "epoch": 0.5368956743002544,
      "grad_norm": 0.10802368819713593,
      "learning_rate": 1.9108079748163694e-05,
      "loss": 0.0092,
      "step": 2110
    },
    {
      "epoch": 0.539440203562341,
      "grad_norm": 0.1059407964348793,
      "learning_rate": 1.9003147953830012e-05,
      "loss": 0.0065,
      "step": 2120
    },
    {
      "epoch": 0.5419847328244275,
      "grad_norm": 0.03868965432047844,
      "learning_rate": 1.889821615949633e-05,
      "loss": 0.0052,
      "step": 2130
    },
    {
      "epoch": 0.544529262086514,
      "grad_norm": 0.004114597570151091,
      "learning_rate": 1.8793284365162644e-05,
      "loss": 0.011,
      "step": 2140
    },
    {
      "epoch": 0.5470737913486005,
      "grad_norm": 0.15624436736106873,
      "learning_rate": 1.868835257082896e-05,
      "loss": 0.0098,
      "step": 2150
    },
    {
      "epoch": 0.549618320610687,
      "grad_norm": 0.46892330050468445,
      "learning_rate": 1.858342077649528e-05,
      "loss": 0.0059,
      "step": 2160
    },
    {
      "epoch": 0.5521628498727735,
      "grad_norm": 0.046456072479486465,
      "learning_rate": 1.8478488982161594e-05,
      "loss": 0.0041,
      "step": 2170
    },
    {
      "epoch": 0.55470737913486,
      "grad_norm": 0.6324033737182617,
      "learning_rate": 1.837355718782791e-05,
      "loss": 0.0079,
      "step": 2180
    },
    {
      "epoch": 0.5572519083969466,
      "grad_norm": 0.08168061077594757,
      "learning_rate": 1.826862539349423e-05,
      "loss": 0.0081,
      "step": 2190
    },
    {
      "epoch": 0.5597964376590331,
      "grad_norm": 0.0022136031184345484,
      "learning_rate": 1.8163693599160547e-05,
      "loss": 0.0087,
      "step": 2200
    },
    {
      "epoch": 0.5623409669211196,
      "grad_norm": 0.8067908883094788,
      "learning_rate": 1.8058761804826865e-05,
      "loss": 0.0054,
      "step": 2210
    },
    {
      "epoch": 0.5648854961832062,
      "grad_norm": 0.438232958316803,
      "learning_rate": 1.795383001049318e-05,
      "loss": 0.0091,
      "step": 2220
    },
    {
      "epoch": 0.5674300254452926,
      "grad_norm": 0.8731472492218018,
      "learning_rate": 1.7848898216159497e-05,
      "loss": 0.0153,
      "step": 2230
    },
    {
      "epoch": 0.5699745547073791,
      "grad_norm": 0.09311806410551071,
      "learning_rate": 1.7743966421825815e-05,
      "loss": 0.0025,
      "step": 2240
    },
    {
      "epoch": 0.5725190839694656,
      "grad_norm": 0.27608296275138855,
      "learning_rate": 1.7639034627492132e-05,
      "loss": 0.0097,
      "step": 2250
    },
    {
      "epoch": 0.5750636132315522,
      "grad_norm": 0.023715125396847725,
      "learning_rate": 1.7534102833158447e-05,
      "loss": 0.0068,
      "step": 2260
    },
    {
      "epoch": 0.5776081424936387,
      "grad_norm": 0.390055388212204,
      "learning_rate": 1.7429171038824765e-05,
      "loss": 0.0063,
      "step": 2270
    },
    {
      "epoch": 0.5801526717557252,
      "grad_norm": 0.5786376595497131,
      "learning_rate": 1.7324239244491082e-05,
      "loss": 0.0086,
      "step": 2280
    },
    {
      "epoch": 0.5826972010178118,
      "grad_norm": 0.7800132632255554,
      "learning_rate": 1.72193074501574e-05,
      "loss": 0.0076,
      "step": 2290
    },
    {
      "epoch": 0.5852417302798982,
      "grad_norm": 0.022780494764447212,
      "learning_rate": 1.7114375655823718e-05,
      "loss": 0.0007,
      "step": 2300
    },
    {
      "epoch": 0.5877862595419847,
      "grad_norm": 0.4210648238658905,
      "learning_rate": 1.7009443861490032e-05,
      "loss": 0.0026,
      "step": 2310
    },
    {
      "epoch": 0.5903307888040712,
      "grad_norm": 0.046605486422777176,
      "learning_rate": 1.690451206715635e-05,
      "loss": 0.0096,
      "step": 2320
    },
    {
      "epoch": 0.5928753180661578,
      "grad_norm": 0.09751195460557938,
      "learning_rate": 1.6799580272822668e-05,
      "loss": 0.0049,
      "step": 2330
    },
    {
      "epoch": 0.5954198473282443,
      "grad_norm": 0.08370577543973923,
      "learning_rate": 1.6694648478488985e-05,
      "loss": 0.0057,
      "step": 2340
    },
    {
      "epoch": 0.5979643765903307,
      "grad_norm": 0.003502418752759695,
      "learning_rate": 1.65897166841553e-05,
      "loss": 0.0084,
      "step": 2350
    },
    {
      "epoch": 0.6005089058524173,
      "grad_norm": 0.27206945419311523,
      "learning_rate": 1.6484784889821618e-05,
      "loss": 0.0022,
      "step": 2360
    },
    {
      "epoch": 0.6030534351145038,
      "grad_norm": 0.11041576415300369,
      "learning_rate": 1.6379853095487935e-05,
      "loss": 0.0045,
      "step": 2370
    },
    {
      "epoch": 0.6055979643765903,
      "grad_norm": 0.0404311828315258,
      "learning_rate": 1.6274921301154253e-05,
      "loss": 0.0101,
      "step": 2380
    },
    {
      "epoch": 0.6081424936386769,
      "grad_norm": 0.47096529603004456,
      "learning_rate": 1.6169989506820567e-05,
      "loss": 0.0073,
      "step": 2390
    },
    {
      "epoch": 0.6106870229007634,
      "grad_norm": 0.0981617122888565,
      "learning_rate": 1.6065057712486885e-05,
      "loss": 0.011,
      "step": 2400
    },
    {
      "epoch": 0.6132315521628499,
      "grad_norm": 0.029454654082655907,
      "learning_rate": 1.5960125918153203e-05,
      "loss": 0.0083,
      "step": 2410
    },
    {
      "epoch": 0.6157760814249363,
      "grad_norm": 0.6878058910369873,
      "learning_rate": 1.585519412381952e-05,
      "loss": 0.0134,
      "step": 2420
    },
    {
      "epoch": 0.6183206106870229,
      "grad_norm": 0.06291262060403824,
      "learning_rate": 1.5750262329485835e-05,
      "loss": 0.0043,
      "step": 2430
    },
    {
      "epoch": 0.6208651399491094,
      "grad_norm": 0.01707449182868004,
      "learning_rate": 1.5645330535152153e-05,
      "loss": 0.0087,
      "step": 2440
    },
    {
      "epoch": 0.6234096692111959,
      "grad_norm": 0.18086305260658264,
      "learning_rate": 1.554039874081847e-05,
      "loss": 0.004,
      "step": 2450
    },
    {
      "epoch": 0.6259541984732825,
      "grad_norm": 0.632637619972229,
      "learning_rate": 1.5435466946484785e-05,
      "loss": 0.0116,
      "step": 2460
    },
    {
      "epoch": 0.628498727735369,
      "grad_norm": 0.2877883017063141,
      "learning_rate": 1.5330535152151103e-05,
      "loss": 0.0042,
      "step": 2470
    },
    {
      "epoch": 0.6310432569974554,
      "grad_norm": 0.42460644245147705,
      "learning_rate": 1.522560335781742e-05,
      "loss": 0.0197,
      "step": 2480
    },
    {
      "epoch": 0.6335877862595419,
      "grad_norm": 0.013305261731147766,
      "learning_rate": 1.5120671563483738e-05,
      "loss": 0.0101,
      "step": 2490
    },
    {
      "epoch": 0.6361323155216285,
      "grad_norm": 0.5830298066139221,
      "learning_rate": 1.5015739769150053e-05,
      "loss": 0.0051,
      "step": 2500
    },
    {
      "epoch": 0.6361323155216285,
      "eval_loss": 0.026752423495054245,
      "eval_runtime": 133.9291,
      "eval_samples_per_second": 59.599,
      "eval_steps_per_second": 0.47,
      "step": 2500
    },
    {
      "epoch": 0.638676844783715,
      "grad_norm": 0.48519253730773926,
      "learning_rate": 1.491080797481637e-05,
      "loss": 0.0136,
      "step": 2510
    },
    {
      "epoch": 0.6412213740458015,
      "grad_norm": 0.4439609944820404,
      "learning_rate": 1.4805876180482688e-05,
      "loss": 0.0098,
      "step": 2520
    },
    {
      "epoch": 0.6437659033078881,
      "grad_norm": 0.27948108315467834,
      "learning_rate": 1.4700944386149006e-05,
      "loss": 0.0086,
      "step": 2530
    },
    {
      "epoch": 0.6463104325699746,
      "grad_norm": 0.01617351733148098,
      "learning_rate": 1.459601259181532e-05,
      "loss": 0.0079,
      "step": 2540
    },
    {
      "epoch": 0.648854961832061,
      "grad_norm": 0.8637906312942505,
      "learning_rate": 1.4491080797481638e-05,
      "loss": 0.0083,
      "step": 2550
    },
    {
      "epoch": 0.6513994910941476,
      "grad_norm": 0.47218918800354004,
      "learning_rate": 1.4386149003147956e-05,
      "loss": 0.0243,
      "step": 2560
    },
    {
      "epoch": 0.6539440203562341,
      "grad_norm": 0.5402618646621704,
      "learning_rate": 1.428121720881427e-05,
      "loss": 0.0082,
      "step": 2570
    },
    {
      "epoch": 0.6564885496183206,
      "grad_norm": 0.03326863422989845,
      "learning_rate": 1.4176285414480588e-05,
      "loss": 0.0112,
      "step": 2580
    },
    {
      "epoch": 0.6590330788804071,
      "grad_norm": 1.7870920896530151,
      "learning_rate": 1.4071353620146906e-05,
      "loss": 0.0132,
      "step": 2590
    },
    {
      "epoch": 0.6615776081424937,
      "grad_norm": 0.06419245153665543,
      "learning_rate": 1.3966421825813223e-05,
      "loss": 0.0037,
      "step": 2600
    },
    {
      "epoch": 0.6641221374045801,
      "grad_norm": 0.020061425864696503,
      "learning_rate": 1.386149003147954e-05,
      "loss": 0.0104,
      "step": 2610
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.018068986013531685,
      "learning_rate": 1.3756558237145855e-05,
      "loss": 0.0038,
      "step": 2620
    },
    {
      "epoch": 0.6692111959287532,
      "grad_norm": 0.038132619112730026,
      "learning_rate": 1.3651626442812173e-05,
      "loss": 0.0029,
      "step": 2630
    },
    {
      "epoch": 0.6717557251908397,
      "grad_norm": 0.6635996103286743,
      "learning_rate": 1.3546694648478491e-05,
      "loss": 0.0064,
      "step": 2640
    },
    {
      "epoch": 0.6743002544529262,
      "grad_norm": 0.01742667704820633,
      "learning_rate": 1.3441762854144807e-05,
      "loss": 0.0045,
      "step": 2650
    },
    {
      "epoch": 0.6768447837150128,
      "grad_norm": 0.5041590332984924,
      "learning_rate": 1.3336831059811125e-05,
      "loss": 0.0028,
      "step": 2660
    },
    {
      "epoch": 0.6793893129770993,
      "grad_norm": 0.2859777808189392,
      "learning_rate": 1.323189926547744e-05,
      "loss": 0.0085,
      "step": 2670
    },
    {
      "epoch": 0.6819338422391857,
      "grad_norm": 0.8118346929550171,
      "learning_rate": 1.3126967471143757e-05,
      "loss": 0.0261,
      "step": 2680
    },
    {
      "epoch": 0.6844783715012722,
      "grad_norm": 0.898858904838562,
      "learning_rate": 1.3022035676810075e-05,
      "loss": 0.0151,
      "step": 2690
    },
    {
      "epoch": 0.6870229007633588,
      "grad_norm": 1.0029804706573486,
      "learning_rate": 1.2917103882476392e-05,
      "loss": 0.0079,
      "step": 2700
    },
    {
      "epoch": 0.6895674300254453,
      "grad_norm": 1.1035183668136597,
      "learning_rate": 1.2812172088142708e-05,
      "loss": 0.0082,
      "step": 2710
    },
    {
      "epoch": 0.6921119592875318,
      "grad_norm": 0.05390460416674614,
      "learning_rate": 1.2707240293809024e-05,
      "loss": 0.0085,
      "step": 2720
    },
    {
      "epoch": 0.6946564885496184,
      "grad_norm": 0.12546318769454956,
      "learning_rate": 1.2602308499475342e-05,
      "loss": 0.0084,
      "step": 2730
    },
    {
      "epoch": 0.6972010178117048,
      "grad_norm": 0.0319611094892025,
      "learning_rate": 1.249737670514166e-05,
      "loss": 0.0112,
      "step": 2740
    },
    {
      "epoch": 0.6997455470737913,
      "grad_norm": 0.46342381834983826,
      "learning_rate": 1.2392444910807974e-05,
      "loss": 0.0042,
      "step": 2750
    },
    {
      "epoch": 0.7022900763358778,
      "grad_norm": 0.17658625543117523,
      "learning_rate": 1.2287513116474292e-05,
      "loss": 0.0049,
      "step": 2760
    },
    {
      "epoch": 0.7048346055979644,
      "grad_norm": 0.18953083455562592,
      "learning_rate": 1.218258132214061e-05,
      "loss": 0.0061,
      "step": 2770
    },
    {
      "epoch": 0.7073791348600509,
      "grad_norm": 0.010515202768146992,
      "learning_rate": 1.2077649527806928e-05,
      "loss": 0.0008,
      "step": 2780
    },
    {
      "epoch": 0.7099236641221374,
      "grad_norm": 0.004728379193693399,
      "learning_rate": 1.1972717733473242e-05,
      "loss": 0.0026,
      "step": 2790
    },
    {
      "epoch": 0.712468193384224,
      "grad_norm": 0.29097694158554077,
      "learning_rate": 1.186778593913956e-05,
      "loss": 0.0054,
      "step": 2800
    },
    {
      "epoch": 0.7150127226463104,
      "grad_norm": 0.010688016191124916,
      "learning_rate": 1.1762854144805877e-05,
      "loss": 0.0048,
      "step": 2810
    },
    {
      "epoch": 0.7175572519083969,
      "grad_norm": 0.5915406346321106,
      "learning_rate": 1.1657922350472195e-05,
      "loss": 0.0046,
      "step": 2820
    },
    {
      "epoch": 0.7201017811704835,
      "grad_norm": 0.009668947197496891,
      "learning_rate": 1.155299055613851e-05,
      "loss": 0.0018,
      "step": 2830
    },
    {
      "epoch": 0.72264631043257,
      "grad_norm": 0.010339943692088127,
      "learning_rate": 1.1448058761804827e-05,
      "loss": 0.0137,
      "step": 2840
    },
    {
      "epoch": 0.7251908396946565,
      "grad_norm": 0.00322342268191278,
      "learning_rate": 1.1343126967471145e-05,
      "loss": 0.0059,
      "step": 2850
    },
    {
      "epoch": 0.727735368956743,
      "grad_norm": 0.5668567419052124,
      "learning_rate": 1.1238195173137461e-05,
      "loss": 0.0128,
      "step": 2860
    },
    {
      "epoch": 0.7302798982188295,
      "grad_norm": 0.30744919180870056,
      "learning_rate": 1.1133263378803777e-05,
      "loss": 0.0009,
      "step": 2870
    },
    {
      "epoch": 0.732824427480916,
      "grad_norm": 1.329842448234558,
      "learning_rate": 1.1028331584470095e-05,
      "loss": 0.0056,
      "step": 2880
    },
    {
      "epoch": 0.7353689567430025,
      "grad_norm": 0.3226444125175476,
      "learning_rate": 1.0923399790136413e-05,
      "loss": 0.0084,
      "step": 2890
    },
    {
      "epoch": 0.7379134860050891,
      "grad_norm": 0.26656076312065125,
      "learning_rate": 1.0818467995802729e-05,
      "loss": 0.01,
      "step": 2900
    },
    {
      "epoch": 0.7404580152671756,
      "grad_norm": 0.029421096667647362,
      "learning_rate": 1.0713536201469047e-05,
      "loss": 0.0122,
      "step": 2910
    },
    {
      "epoch": 0.7430025445292621,
      "grad_norm": 0.036649610847234726,
      "learning_rate": 1.0608604407135363e-05,
      "loss": 0.0041,
      "step": 2920
    },
    {
      "epoch": 0.7455470737913485,
      "grad_norm": 0.14049018919467926,
      "learning_rate": 1.050367261280168e-05,
      "loss": 0.0021,
      "step": 2930
    },
    {
      "epoch": 0.7480916030534351,
      "grad_norm": 0.8101888298988342,
      "learning_rate": 1.0398740818467996e-05,
      "loss": 0.0061,
      "step": 2940
    },
    {
      "epoch": 0.7506361323155216,
      "grad_norm": 0.3166317045688629,
      "learning_rate": 1.0293809024134314e-05,
      "loss": 0.0103,
      "step": 2950
    },
    {
      "epoch": 0.7531806615776081,
      "grad_norm": 0.4369036853313446,
      "learning_rate": 1.0188877229800632e-05,
      "loss": 0.006,
      "step": 2960
    },
    {
      "epoch": 0.7557251908396947,
      "grad_norm": 0.6003983616828918,
      "learning_rate": 1.0083945435466946e-05,
      "loss": 0.0063,
      "step": 2970
    },
    {
      "epoch": 0.7582697201017812,
      "grad_norm": 0.3092733323574066,
      "learning_rate": 9.979013641133264e-06,
      "loss": 0.0061,
      "step": 2980
    },
    {
      "epoch": 0.7608142493638677,
      "grad_norm": 0.17131660878658295,
      "learning_rate": 9.874081846799582e-06,
      "loss": 0.0026,
      "step": 2990
    },
    {
      "epoch": 0.7633587786259542,
      "grad_norm": 0.6556366682052612,
      "learning_rate": 9.769150052465898e-06,
      "loss": 0.0071,
      "step": 3000
    },
    {
      "epoch": 0.7633587786259542,
      "eval_loss": 0.02788231521844864,
      "eval_runtime": 134.0287,
      "eval_samples_per_second": 59.554,
      "eval_steps_per_second": 0.47,
      "step": 3000
    },
    {
      "epoch": 0.7659033078880407,
      "grad_norm": 0.02394355833530426,
      "learning_rate": 9.664218258132216e-06,
      "loss": 0.0062,
      "step": 3010
    },
    {
      "epoch": 0.7684478371501272,
      "grad_norm": 0.047644879668951035,
      "learning_rate": 9.559286463798532e-06,
      "loss": 0.0061,
      "step": 3020
    },
    {
      "epoch": 0.7709923664122137,
      "grad_norm": 1.3568006753921509,
      "learning_rate": 9.45435466946485e-06,
      "loss": 0.0058,
      "step": 3030
    },
    {
      "epoch": 0.7735368956743003,
      "grad_norm": 0.316701740026474,
      "learning_rate": 9.349422875131165e-06,
      "loss": 0.0034,
      "step": 3040
    },
    {
      "epoch": 0.7760814249363868,
      "grad_norm": 0.02915433794260025,
      "learning_rate": 9.244491080797482e-06,
      "loss": 0.0024,
      "step": 3050
    },
    {
      "epoch": 0.7786259541984732,
      "grad_norm": 0.9025008678436279,
      "learning_rate": 9.1395592864638e-06,
      "loss": 0.01,
      "step": 3060
    },
    {
      "epoch": 0.7811704834605598,
      "grad_norm": 0.05889623612165451,
      "learning_rate": 9.034627492130115e-06,
      "loss": 0.0175,
      "step": 3070
    },
    {
      "epoch": 0.7837150127226463,
      "grad_norm": 0.2562934160232544,
      "learning_rate": 8.929695697796433e-06,
      "loss": 0.0037,
      "step": 3080
    },
    {
      "epoch": 0.7862595419847328,
      "grad_norm": 0.019730977714061737,
      "learning_rate": 8.824763903462749e-06,
      "loss": 0.0072,
      "step": 3090
    },
    {
      "epoch": 0.7888040712468194,
      "grad_norm": 0.24575015902519226,
      "learning_rate": 8.719832109129067e-06,
      "loss": 0.0066,
      "step": 3100
    },
    {
      "epoch": 0.7913486005089059,
      "grad_norm": 0.13040441274642944,
      "learning_rate": 8.614900314795383e-06,
      "loss": 0.0091,
      "step": 3110
    },
    {
      "epoch": 0.7938931297709924,
      "grad_norm": 0.10326778143644333,
      "learning_rate": 8.5099685204617e-06,
      "loss": 0.0053,
      "step": 3120
    },
    {
      "epoch": 0.7964376590330788,
      "grad_norm": 0.011873223818838596,
      "learning_rate": 8.405036726128017e-06,
      "loss": 0.0114,
      "step": 3130
    },
    {
      "epoch": 0.7989821882951654,
      "grad_norm": 0.1212642639875412,
      "learning_rate": 8.300104931794335e-06,
      "loss": 0.0059,
      "step": 3140
    },
    {
      "epoch": 0.8015267175572519,
      "grad_norm": 0.04367879405617714,
      "learning_rate": 8.19517313746065e-06,
      "loss": 0.0052,
      "step": 3150
    },
    {
      "epoch": 0.8040712468193384,
      "grad_norm": 0.5733098387718201,
      "learning_rate": 8.090241343126968e-06,
      "loss": 0.0158,
      "step": 3160
    },
    {
      "epoch": 0.806615776081425,
      "grad_norm": 0.03896990418434143,
      "learning_rate": 7.985309548793284e-06,
      "loss": 0.0092,
      "step": 3170
    },
    {
      "epoch": 0.8091603053435115,
      "grad_norm": 0.39722007513046265,
      "learning_rate": 7.880377754459602e-06,
      "loss": 0.0069,
      "step": 3180
    },
    {
      "epoch": 0.811704834605598,
      "grad_norm": 0.06284640729427338,
      "learning_rate": 7.77544596012592e-06,
      "loss": 0.0124,
      "step": 3190
    },
    {
      "epoch": 0.8142493638676844,
      "grad_norm": 0.8230289816856384,
      "learning_rate": 7.670514165792236e-06,
      "loss": 0.0253,
      "step": 3200
    },
    {
      "epoch": 0.816793893129771,
      "grad_norm": 0.045468445867300034,
      "learning_rate": 7.565582371458553e-06,
      "loss": 0.0012,
      "step": 3210
    },
    {
      "epoch": 0.8193384223918575,
      "grad_norm": 0.3366369307041168,
      "learning_rate": 7.46065057712487e-06,
      "loss": 0.0021,
      "step": 3220
    },
    {
      "epoch": 0.821882951653944,
      "grad_norm": 0.47629743814468384,
      "learning_rate": 7.355718782791186e-06,
      "loss": 0.0134,
      "step": 3230
    },
    {
      "epoch": 0.8244274809160306,
      "grad_norm": 0.18790696561336517,
      "learning_rate": 7.250786988457504e-06,
      "loss": 0.0045,
      "step": 3240
    },
    {
      "epoch": 0.8269720101781171,
      "grad_norm": 0.17202238738536835,
      "learning_rate": 7.14585519412382e-06,
      "loss": 0.0042,
      "step": 3250
    },
    {
      "epoch": 0.8295165394402035,
      "grad_norm": 0.4404882788658142,
      "learning_rate": 7.040923399790137e-06,
      "loss": 0.0042,
      "step": 3260
    },
    {
      "epoch": 0.8320610687022901,
      "grad_norm": 0.5554683208465576,
      "learning_rate": 6.9359916054564535e-06,
      "loss": 0.0049,
      "step": 3270
    },
    {
      "epoch": 0.8346055979643766,
      "grad_norm": 0.6356526613235474,
      "learning_rate": 6.831059811122771e-06,
      "loss": 0.0142,
      "step": 3280
    },
| { | |
| "epoch": 0.8371501272264631, | |
| "grad_norm": 0.2484181523323059, | |
| "learning_rate": 6.726128016789087e-06, | |
| "loss": 0.0065, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.8396946564885496, | |
| "grad_norm": 0.1756751537322998, | |
| "learning_rate": 6.621196222455405e-06, | |
| "loss": 0.0102, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.8422391857506362, | |
| "grad_norm": 0.5005545616149902, | |
| "learning_rate": 6.516264428121721e-06, | |
| "loss": 0.0018, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.8447837150127226, | |
| "grad_norm": 0.06506593525409698, | |
| "learning_rate": 6.411332633788039e-06, | |
| "loss": 0.0058, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.8473282442748091, | |
| "grad_norm": 0.07923749834299088, | |
| "learning_rate": 6.306400839454355e-06, | |
| "loss": 0.0055, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.8498727735368957, | |
| "grad_norm": 0.03359365090727806, | |
| "learning_rate": 6.201469045120672e-06, | |
| "loss": 0.01, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.8524173027989822, | |
| "grad_norm": 0.5358174443244934, | |
| "learning_rate": 6.096537250786989e-06, | |
| "loss": 0.0041, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.8549618320610687, | |
| "grad_norm": 0.26434558629989624, | |
| "learning_rate": 5.991605456453306e-06, | |
| "loss": 0.0117, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.8575063613231552, | |
| "grad_norm": 0.1969001740217209, | |
| "learning_rate": 5.886673662119623e-06, | |
| "loss": 0.0079, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.8600508905852418, | |
| "grad_norm": 0.3824959993362427, | |
| "learning_rate": 5.781741867785939e-06, | |
| "loss": 0.008, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.8625954198473282, | |
| "grad_norm": 0.0074090552516281605, | |
| "learning_rate": 5.676810073452257e-06, | |
| "loss": 0.0031, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.8651399491094147, | |
| "grad_norm": 0.22043420374393463, | |
| "learning_rate": 5.571878279118573e-06, | |
| "loss": 0.0048, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.8676844783715013, | |
| "grad_norm": 0.6048237681388855, | |
| "learning_rate": 5.466946484784891e-06, | |
| "loss": 0.0127, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.8702290076335878, | |
| "grad_norm": 0.004376487340778112, | |
| "learning_rate": 5.362014690451207e-06, | |
| "loss": 0.0044, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.8727735368956743, | |
| "grad_norm": 0.3400680422782898, | |
| "learning_rate": 5.257082896117524e-06, | |
| "loss": 0.0025, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.8753180661577609, | |
| "grad_norm": 0.004490234889090061, | |
| "learning_rate": 5.152151101783841e-06, | |
| "loss": 0.0024, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.8778625954198473, | |
| "grad_norm": 0.005273368675261736, | |
| "learning_rate": 5.047219307450158e-06, | |
| "loss": 0.0089, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.8804071246819338, | |
| "grad_norm": 0.0007822876796126366, | |
| "learning_rate": 4.942287513116475e-06, | |
| "loss": 0.0039, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.8829516539440203, | |
| "grad_norm": 0.088668093085289, | |
| "learning_rate": 4.8373557187827916e-06, | |
| "loss": 0.0022, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.8854961832061069, | |
| "grad_norm": 0.08840341866016388, | |
| "learning_rate": 4.7324239244491085e-06, | |
| "loss": 0.0047, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.8880407124681934, | |
| "grad_norm": 0.4743495285511017, | |
| "learning_rate": 4.627492130115425e-06, | |
| "loss": 0.0075, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.8905852417302799, | |
| "grad_norm": 0.08260899037122726, | |
| "learning_rate": 4.522560335781742e-06, | |
| "loss": 0.0188, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.8905852417302799, | |
| "eval_loss": 0.026213819161057472, | |
| "eval_runtime": 133.9096, | |
| "eval_samples_per_second": 59.607, | |
| "eval_steps_per_second": 0.47, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.8931297709923665, | |
| "grad_norm": 0.002863268367946148, | |
| "learning_rate": 4.417628541448059e-06, | |
| "loss": 0.0016, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.8956743002544529, | |
| "grad_norm": 0.4324285686016083, | |
| "learning_rate": 4.312696747114376e-06, | |
| "loss": 0.0029, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.8982188295165394, | |
| "grad_norm": 0.21640141308307648, | |
| "learning_rate": 4.207764952780693e-06, | |
| "loss": 0.0045, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.9007633587786259, | |
| "grad_norm": 1.0017634630203247, | |
| "learning_rate": 4.10283315844701e-06, | |
| "loss": 0.0092, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.9033078880407125, | |
| "grad_norm": 0.01565241441130638, | |
| "learning_rate": 3.997901364113327e-06, | |
| "loss": 0.0059, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.905852417302799, | |
| "grad_norm": 0.25544649362564087, | |
| "learning_rate": 3.892969569779644e-06, | |
| "loss": 0.0046, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.9083969465648855, | |
| "grad_norm": 0.007674202788621187, | |
| "learning_rate": 3.78803777544596e-06, | |
| "loss": 0.0085, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.910941475826972, | |
| "grad_norm": 0.004819818306714296, | |
| "learning_rate": 3.683105981112277e-06, | |
| "loss": 0.0054, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.9134860050890585, | |
| "grad_norm": 0.39050644636154175, | |
| "learning_rate": 3.578174186778594e-06, | |
| "loss": 0.0075, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.916030534351145, | |
| "grad_norm": 0.5367786884307861, | |
| "learning_rate": 3.473242392444911e-06, | |
| "loss": 0.0036, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.9185750636132316, | |
| "grad_norm": 0.002182970056310296, | |
| "learning_rate": 3.368310598111228e-06, | |
| "loss": 0.0005, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.9211195928753181, | |
| "grad_norm": 0.010968153364956379, | |
| "learning_rate": 3.2633788037775447e-06, | |
| "loss": 0.006, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.9236641221374046, | |
| "grad_norm": 0.04364249110221863, | |
| "learning_rate": 3.1584470094438616e-06, | |
| "loss": 0.0101, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.926208651399491, | |
| "grad_norm": 0.009574225172400475, | |
| "learning_rate": 3.053515215110179e-06, | |
| "loss": 0.0095, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.9287531806615776, | |
| "grad_norm": 0.023343412205576897, | |
| "learning_rate": 2.948583420776496e-06, | |
| "loss": 0.0057, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.9312977099236641, | |
| "grad_norm": 0.013017148710787296, | |
| "learning_rate": 2.8436516264428128e-06, | |
| "loss": 0.0039, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.9338422391857506, | |
| "grad_norm": 0.07142795622348785, | |
| "learning_rate": 2.7387198321091293e-06, | |
| "loss": 0.001, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.9363867684478372, | |
| "grad_norm": 0.03349682688713074, | |
| "learning_rate": 2.633788037775446e-06, | |
| "loss": 0.0018, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.9389312977099237, | |
| "grad_norm": 0.09319507330656052, | |
| "learning_rate": 2.528856243441763e-06, | |
| "loss": 0.0049, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.9414758269720102, | |
| "grad_norm": 0.08466564863920212, | |
| "learning_rate": 2.42392444910808e-06, | |
| "loss": 0.0101, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.9440203562340967, | |
| "grad_norm": 0.03436915576457977, | |
| "learning_rate": 2.318992654774397e-06, | |
| "loss": 0.0016, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.9465648854961832, | |
| "grad_norm": 0.05093689262866974, | |
| "learning_rate": 2.214060860440714e-06, | |
| "loss": 0.0055, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.9491094147582697, | |
| "grad_norm": 0.0019584556575864553, | |
| "learning_rate": 2.1091290661070307e-06, | |
| "loss": 0.0089, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.9516539440203562, | |
| "grad_norm": 0.6479087471961975, | |
| "learning_rate": 2.004197271773347e-06, | |
| "loss": 0.005, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.9541984732824428, | |
| "grad_norm": 0.22916148602962494, | |
| "learning_rate": 1.8992654774396643e-06, | |
| "loss": 0.0018, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.9567430025445293, | |
| "grad_norm": 0.021227942779660225, | |
| "learning_rate": 1.7943336831059812e-06, | |
| "loss": 0.0053, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.9592875318066157, | |
| "grad_norm": 0.028795473277568817, | |
| "learning_rate": 1.6894018887722981e-06, | |
| "loss": 0.0098, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.9618320610687023, | |
| "grad_norm": 0.23935425281524658, | |
| "learning_rate": 1.5844700944386152e-06, | |
| "loss": 0.0069, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.9643765903307888, | |
| "grad_norm": 0.2875511348247528, | |
| "learning_rate": 1.479538300104932e-06, | |
| "loss": 0.0028, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.9669211195928753, | |
| "grad_norm": 0.02270675264298916, | |
| "learning_rate": 1.3746065057712488e-06, | |
| "loss": 0.0049, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.9694656488549618, | |
| "grad_norm": 0.42798078060150146, | |
| "learning_rate": 1.2696747114375657e-06, | |
| "loss": 0.0097, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.9720101781170484, | |
| "grad_norm": 0.051214683800935745, | |
| "learning_rate": 1.1647429171038824e-06, | |
| "loss": 0.0025, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.9745547073791349, | |
| "grad_norm": 0.34169483184814453, | |
| "learning_rate": 1.0598111227701995e-06, | |
| "loss": 0.0186, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.9770992366412213, | |
| "grad_norm": 0.013889187946915627, | |
| "learning_rate": 9.548793284365165e-07, | |
| "loss": 0.0134, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.9796437659033079, | |
| "grad_norm": 0.08551329374313354, | |
| "learning_rate": 8.499475341028333e-07, | |
| "loss": 0.0094, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.9821882951653944, | |
| "grad_norm": 0.008397153578698635, | |
| "learning_rate": 7.450157397691502e-07, | |
| "loss": 0.0061, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.9847328244274809, | |
| "grad_norm": 0.14278727769851685, | |
| "learning_rate": 6.40083945435467e-07, | |
| "loss": 0.0015, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.9872773536895675, | |
| "grad_norm": 0.07914524525403976, | |
| "learning_rate": 5.351521511017839e-07, | |
| "loss": 0.0045, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.989821882951654, | |
| "grad_norm": 0.024170175194740295, | |
| "learning_rate": 4.3022035676810077e-07, | |
| "loss": 0.0052, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.9923664122137404, | |
| "grad_norm": 0.004945623688399792, | |
| "learning_rate": 3.252885624344176e-07, | |
| "loss": 0.0048, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.9949109414758269, | |
| "grad_norm": 0.21786199510097504, | |
| "learning_rate": 2.2035676810073456e-07, | |
| "loss": 0.0049, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.9974554707379135, | |
| "grad_norm": 0.11055775731801987, | |
| "learning_rate": 1.1542497376705142e-07, | |
| "loss": 0.0009, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 5.363585296436213e-05, | |
| "learning_rate": 1.0493179433368311e-08, | |
| "loss": 0.0035, | |
| "step": 3930 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3930, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.273606668827065e+18, | |
| "train_batch_size": 128, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |