{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9995267392333176,
  "eval_steps": 500,
  "global_step": 1584,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01,
      "grad_norm": 3.7190074920654297,
      "learning_rate": 4.9995083170283816e-05,
      "loss": 2.9245,
      "step": 10
    },
    {
      "epoch": 0.01,
      "grad_norm": 3.431870222091675,
      "learning_rate": 4.998033461515242e-05,
      "loss": 2.0053,
      "step": 20
    },
    {
      "epoch": 0.02,
      "grad_norm": 2.3315682411193848,
      "learning_rate": 4.9955760135896534e-05,
      "loss": 1.888,
      "step": 30
    },
    {
      "epoch": 0.03,
      "grad_norm": 3.2937276363372803,
      "learning_rate": 4.992136939879856e-05,
      "loss": 1.8447,
      "step": 40
    },
    {
      "epoch": 0.03,
      "grad_norm": 2.7375714778900146,
      "learning_rate": 4.9877175931330346e-05,
      "loss": 1.8212,
      "step": 50
    },
    {
      "epoch": 0.04,
      "grad_norm": 2.15061092376709,
      "learning_rate": 4.982319711683221e-05,
      "loss": 1.793,
      "step": 60
    },
    {
      "epoch": 0.04,
      "grad_norm": 2.0427424907684326,
      "learning_rate": 4.975945418767529e-05,
      "loss": 1.756,
      "step": 70
    },
    {
      "epoch": 0.05,
      "grad_norm": 2.107785224914551,
      "learning_rate": 4.968597221690986e-05,
      "loss": 1.7285,
      "step": 80
    },
    {
      "epoch": 0.06,
      "grad_norm": 2.100552558898926,
      "learning_rate": 4.96027801084029e-05,
      "loss": 1.7297,
      "step": 90
    },
    {
      "epoch": 0.06,
      "grad_norm": 2.2227377891540527,
      "learning_rate": 4.950991058546893e-05,
      "loss": 1.7602,
      "step": 100
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.535144567489624,
      "learning_rate": 4.940740017799833e-05,
      "loss": 1.7433,
      "step": 110
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.6522979736328125,
      "learning_rate": 4.929528920808854e-05,
      "loss": 1.7363,
      "step": 120
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.8091869354248047,
      "learning_rate": 4.917362177418342e-05,
      "loss": 1.6872,
      "step": 130
    },
    {
      "epoch": 0.09,
      "grad_norm": 2.1017510890960693,
      "learning_rate": 4.904244573372733e-05,
      "loss": 1.7084,
      "step": 140
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.6424258947372437,
      "learning_rate": 4.8901812684340564e-05,
      "loss": 1.6997,
      "step": 150
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.4547488689422607,
      "learning_rate": 4.8751777943523634e-05,
      "loss": 1.6747,
      "step": 160
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.6251146793365479,
      "learning_rate": 4.8592400526898314e-05,
      "loss": 1.6836,
      "step": 170
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.098386526107788,
      "learning_rate": 4.842374312499405e-05,
      "loss": 1.6552,
      "step": 180
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.2387640476226807,
      "learning_rate": 4.824587207858888e-05,
      "loss": 1.6489,
      "step": 190
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.7299611568450928,
      "learning_rate": 4.805885735261454e-05,
      "loss": 1.6576,
      "step": 200
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.5701665878295898,
      "learning_rate": 4.786277250863599e-05,
      "loss": 1.6533,
      "step": 210
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.417296886444092,
      "learning_rate": 4.765769467591625e-05,
      "loss": 1.6356,
      "step": 220
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.2636029720306396,
      "learning_rate": 4.744370452107789e-05,
      "loss": 1.6389,
      "step": 230
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.576324224472046,
      "learning_rate": 4.722088621637309e-05,
      "loss": 1.6546,
      "step": 240
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.9720542430877686,
      "learning_rate": 4.698932740657479e-05,
      "loss": 1.6354,
      "step": 250
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.5250279903411865,
      "learning_rate": 4.6749119174501975e-05,
      "loss": 1.6342,
      "step": 260
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.4737966060638428,
      "learning_rate": 4.6500356005192514e-05,
      "loss": 1.6407,
      "step": 270
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.2792372703552246,
      "learning_rate": 4.6243135748737864e-05,
      "loss": 1.6339,
      "step": 280
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.5593037605285645,
      "learning_rate": 4.597755958179406e-05,
      "loss": 1.6095,
      "step": 290
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.3141404390335083,
      "learning_rate": 4.570373196778427e-05,
      "loss": 1.6036,
      "step": 300
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.2617065906524658,
      "learning_rate": 4.5421760615808474e-05,
      "loss": 1.6244,
      "step": 310
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.64117431640625,
      "learning_rate": 4.513175643827647e-05,
      "loss": 1.6449,
      "step": 320
    },
    {
      "epoch": 0.21,
      "grad_norm": 1.7132749557495117,
      "learning_rate": 4.4833833507280884e-05,
      "loss": 1.5948,
      "step": 330
    },
    {
      "epoch": 0.21,
      "grad_norm": 2.1323654651641846,
      "learning_rate": 4.4528109009727336e-05,
      "loss": 1.627,
      "step": 340
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.253115653991699,
      "learning_rate": 4.42147032012394e-05,
      "loss": 1.6151,
      "step": 350
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.6143097877502441,
      "learning_rate": 4.389373935885646e-05,
      "loss": 1.5838,
      "step": 360
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.3353707790374756,
      "learning_rate": 4.356534373254316e-05,
      "loss": 1.5935,
      "step": 370
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.283742904663086,
      "learning_rate": 4.322964549552943e-05,
      "loss": 1.6015,
      "step": 380
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.437249779701233,
      "learning_rate": 4.288677669350066e-05,
      "loss": 1.577,
      "step": 390
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.5190638303756714,
      "learning_rate": 4.2536872192658036e-05,
      "loss": 1.5843,
      "step": 400
    },
    {
      "epoch": 0.26,
      "grad_norm": 2.1320886611938477,
      "learning_rate": 4.218006962666934e-05,
      "loss": 1.6145,
      "step": 410
    },
    {
      "epoch": 0.27,
      "grad_norm": 1.0696591138839722,
      "learning_rate": 4.181650934253132e-05,
      "loss": 1.5601,
      "step": 420
    },
    {
      "epoch": 0.27,
      "grad_norm": 1.3149545192718506,
      "learning_rate": 4.144633434536467e-05,
      "loss": 1.5664,
      "step": 430
    },
    {
      "epoch": 0.28,
      "grad_norm": 1.3661577701568604,
      "learning_rate": 4.1069690242163484e-05,
      "loss": 1.6002,
      "step": 440
    },
    {
      "epoch": 0.28,
      "grad_norm": 1.6984481811523438,
      "learning_rate": 4.06867251845213e-05,
      "loss": 1.576,
      "step": 450
    },
    {
      "epoch": 0.29,
      "grad_norm": 1.2728784084320068,
      "learning_rate": 4.0297589810356165e-05,
      "loss": 1.5448,
      "step": 460
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.4147616624832153,
      "learning_rate": 3.9902437184657784e-05,
      "loss": 1.5595,
      "step": 470
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.2289011478424072,
      "learning_rate": 3.9501422739279956e-05,
      "loss": 1.5628,
      "step": 480
    },
    {
      "epoch": 0.31,
      "grad_norm": 1.5690233707427979,
      "learning_rate": 3.909470421180201e-05,
      "loss": 1.5731,
      "step": 490
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.4935098886489868,
      "learning_rate": 3.8682441583483314e-05,
      "loss": 1.545,
      "step": 500
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.2939772605895996,
      "learning_rate": 3.8264797016335205e-05,
      "loss": 1.5793,
      "step": 510
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.2150651216506958,
      "learning_rate": 3.7841934789335164e-05,
      "loss": 1.5378,
      "step": 520
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.2153139114379883,
      "learning_rate": 3.741402123380828e-05,
      "loss": 1.5345,
      "step": 530
    },
    {
      "epoch": 0.34,
      "grad_norm": 1.290591835975647,
      "learning_rate": 3.6981224668001424e-05,
      "loss": 1.5517,
      "step": 540
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.1924967765808105,
      "learning_rate": 3.654371533087586e-05,
      "loss": 1.5472,
      "step": 550
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.6345056295394897,
      "learning_rate": 3.610166531514436e-05,
      "loss": 1.5564,
      "step": 560
    },
    {
      "epoch": 0.36,
      "grad_norm": 2.185119867324829,
      "learning_rate": 3.565524849957921e-05,
      "loss": 1.5574,
      "step": 570
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.3646321296691895,
      "learning_rate": 3.520464048061758e-05,
      "loss": 1.5584,
      "step": 580
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.2333228588104248,
      "learning_rate": 3.47500185032913e-05,
      "loss": 1.518,
      "step": 590
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.3945318460464478,
      "learning_rate": 3.4291561391508185e-05,
      "loss": 1.5339,
      "step": 600
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.304306149482727,
      "learning_rate": 3.3829449477712324e-05,
      "loss": 1.5339,
      "step": 610
    },
    {
      "epoch": 0.39,
      "grad_norm": 1.6393932104110718,
      "learning_rate": 3.336386453195088e-05,
      "loss": 1.5399,
      "step": 620
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.2000635862350464,
      "learning_rate": 3.2894989690375626e-05,
      "loss": 1.5233,
      "step": 630
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.1479601860046387,
      "learning_rate": 3.2423009383206876e-05,
      "loss": 1.538,
      "step": 640
    },
    {
      "epoch": 0.41,
      "grad_norm": 1.1483389139175415,
      "learning_rate": 3.194810926218861e-05,
      "loss": 1.528,
      "step": 650
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.2403253316879272,
      "learning_rate": 3.147047612756302e-05,
      "loss": 1.5307,
      "step": 660
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.3997712135314941,
      "learning_rate": 3.099029785459328e-05,
      "loss": 1.4915,
      "step": 670
    },
    {
      "epoch": 0.43,
      "grad_norm": 1.2010352611541748,
      "learning_rate": 3.0507763319663517e-05,
      "loss": 1.5268,
      "step": 680
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.0670932531356812,
      "learning_rate": 3.002306232598497e-05,
      "loss": 1.5273,
      "step": 690
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.2283655405044556,
      "learning_rate": 2.9536385528937567e-05,
      "loss": 1.5273,
      "step": 700
    },
    {
      "epoch": 0.45,
      "grad_norm": 1.1306476593017578,
      "learning_rate": 2.9047924361076345e-05,
      "loss": 1.5072,
      "step": 710
    },
    {
      "epoch": 0.45,
      "grad_norm": 1.1699943542480469,
      "learning_rate": 2.8557870956832132e-05,
      "loss": 1.4856,
      "step": 720
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.2550854682922363,
      "learning_rate": 2.8066418076936167e-05,
      "loss": 1.4983,
      "step": 730
    },
    {
      "epoch": 0.47,
      "grad_norm": 1.0610970258712769,
      "learning_rate": 2.7573759032598366e-05,
      "loss": 1.5518,
      "step": 740
    },
    {
      "epoch": 0.47,
      "grad_norm": 1.1754754781723022,
      "learning_rate": 2.7080087609469062e-05,
      "loss": 1.4998,
      "step": 750
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.1955766677856445,
      "learning_rate": 2.6585597991414114e-05,
      "loss": 1.5109,
      "step": 760
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.0891656875610352,
      "learning_rate": 2.6090484684133404e-05,
      "loss": 1.5007,
      "step": 770
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.0880335569381714,
      "learning_rate": 2.5594942438652688e-05,
      "loss": 1.5049,
      "step": 780
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.345954418182373,
      "learning_rate": 2.509916617471903e-05,
      "loss": 1.5154,
      "step": 790
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.1668224334716797,
      "learning_rate": 2.46033509041298e-05,
      "loss": 1.4883,
      "step": 800
    },
    {
      "epoch": 0.51,
      "grad_norm": 1.055127501487732,
      "learning_rate": 2.410769165402549e-05,
      "loss": 1.5053,
      "step": 810
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.0528500080108643,
      "learning_rate": 2.3612383390176503e-05,
      "loss": 1.4871,
      "step": 820
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.328258991241455,
      "learning_rate": 2.3117620940294048e-05,
      "loss": 1.5037,
      "step": 830
    },
    {
      "epoch": 0.53,
      "grad_norm": 1.0326772928237915,
      "learning_rate": 2.2623598917395438e-05,
      "loss": 1.4525,
      "step": 840
    },
    {
      "epoch": 0.54,
      "grad_norm": 3.057058811187744,
      "learning_rate": 2.213051164325366e-05,
      "loss": 1.4898,
      "step": 850
    },
    {
      "epoch": 0.54,
      "grad_norm": 1.1190940141677856,
      "learning_rate": 2.1638553071961708e-05,
      "loss": 1.488,
      "step": 860
    },
    {
      "epoch": 0.55,
      "grad_norm": 1.1501041650772095,
      "learning_rate": 2.1147916713641367e-05,
      "loss": 1.4711,
      "step": 870
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.090022325515747,
      "learning_rate": 2.0658795558326743e-05,
      "loss": 1.488,
      "step": 880
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.0642565488815308,
      "learning_rate": 2.017138200005236e-05,
      "loss": 1.4791,
      "step": 890
    },
    {
      "epoch": 0.57,
      "grad_norm": 1.3562296628952026,
      "learning_rate": 1.9685867761175584e-05,
      "loss": 1.4956,
      "step": 900
    },
    {
      "epoch": 0.57,
      "grad_norm": 1.2069261074066162,
      "learning_rate": 1.9202443816963425e-05,
      "loss": 1.4918,
      "step": 910
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.3227437734603882,
      "learning_rate": 1.872130032047302e-05,
      "loss": 1.4577,
      "step": 920
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.0784181356430054,
      "learning_rate": 1.824262652775568e-05,
      "loss": 1.4888,
      "step": 930
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.000135898590088,
      "learning_rate": 1.7766610723413684e-05,
      "loss": 1.4673,
      "step": 940
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.136026382446289,
      "learning_rate": 1.7293440146539196e-05,
      "loss": 1.4779,
      "step": 950
    },
    {
      "epoch": 0.61,
      "grad_norm": 1.123252272605896,
      "learning_rate": 1.682330091706446e-05,
      "loss": 1.4583,
      "step": 960
    },
    {
      "epoch": 0.61,
      "grad_norm": 1.0559343099594116,
      "learning_rate": 1.6356377962552238e-05,
      "loss": 1.4471,
      "step": 970
    },
    {
      "epoch": 0.62,
      "grad_norm": 1.0266658067703247,
      "learning_rate": 1.589285494545514e-05,
      "loss": 1.4632,
      "step": 980
    },
    {
      "epoch": 0.62,
      "grad_norm": 1.1371444463729858,
      "learning_rate": 1.5432914190872757e-05,
      "loss": 1.4732,
      "step": 990
    },
    {
      "epoch": 0.63,
      "grad_norm": 1.1203784942626953,
      "learning_rate": 1.4976736614834664e-05,
      "loss": 1.452,
      "step": 1000
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.0037944316864014,
      "learning_rate": 1.4524501653137787e-05,
      "loss": 1.461,
      "step": 1010
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.1353282928466797,
      "learning_rate": 1.4076387190766017e-05,
      "loss": 1.4538,
      "step": 1020
    },
    {
      "epoch": 0.65,
      "grad_norm": 1.1203887462615967,
      "learning_rate": 1.363256949191972e-05,
      "loss": 1.4681,
      "step": 1030
    },
    {
      "epoch": 0.66,
      "grad_norm": 1.0686651468276978,
      "learning_rate": 1.3193223130682936e-05,
      "loss": 1.4548,
      "step": 1040
    },
    {
      "epoch": 0.66,
      "grad_norm": 1.0339988470077515,
      "learning_rate": 1.2758520922355226e-05,
      "loss": 1.4535,
      "step": 1050
    },
    {
      "epoch": 0.67,
      "grad_norm": 1.4555269479751587,
      "learning_rate": 1.2328633855475429e-05,
      "loss": 1.4621,
      "step": 1060
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.0318940877914429,
      "learning_rate": 1.1903731024563966e-05,
      "loss": 1.4621,
      "step": 1070
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.084612488746643,
      "learning_rate": 1.148397956361007e-05,
      "loss": 1.4636,
      "step": 1080
    },
    {
      "epoch": 0.69,
      "grad_norm": 1.0705621242523193,
      "learning_rate": 1.106954458033026e-05,
      "loss": 1.4495,
      "step": 1090
    },
    {
      "epoch": 0.69,
      "grad_norm": 1.050857424736023,
      "learning_rate": 1.0660589091223855e-05,
      "loss": 1.4395,
      "step": 1100
    },
    {
      "epoch": 0.7,
      "grad_norm": 1.0744839906692505,
      "learning_rate": 1.025727395745095e-05,
      "loss": 1.4583,
      "step": 1110
    },
    {
      "epoch": 0.71,
      "grad_norm": 1.0446105003356934,
      "learning_rate": 9.859757821558337e-06,
      "loss": 1.4606,
      "step": 1120
    },
    {
      "epoch": 0.71,
      "grad_norm": 1.1479051113128662,
      "learning_rate": 9.468197045077976e-06,
      "loss": 1.454,
      "step": 1130
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.985953152179718,
      "learning_rate": 9.082745647022797e-06,
      "loss": 1.4654,
      "step": 1140
    },
    {
      "epoch": 0.73,
      "grad_norm": 1.1085201501846313,
      "learning_rate": 8.703555243303835e-06,
      "loss": 1.4526,
      "step": 1150
    },
    {
      "epoch": 0.73,
      "grad_norm": 1.2304482460021973,
      "learning_rate": 8.330774987092712e-06,
      "loss": 1.448,
      "step": 1160
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.0740071535110474,
      "learning_rate": 7.96455151015272e-06,
      "loss": 1.4606,
      "step": 1170
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.0380760431289673,
      "learning_rate": 7.605028865161809e-06,
      "loss": 1.4661,
      "step": 1180
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.1115810871124268,
      "learning_rate": 7.25234846904993e-06,
      "loss": 1.4567,
      "step": 1190
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.9248858094215393,
      "learning_rate": 6.906649047373246e-06,
      "loss": 1.4372,
      "step": 1200
    },
    {
      "epoch": 0.76,
      "grad_norm": 1.0288389921188354,
      "learning_rate": 6.568066579746901e-06,
      "loss": 1.4542,
      "step": 1210
    },
    {
      "epoch": 0.77,
      "grad_norm": 1.0125763416290283,
      "learning_rate": 6.2367342463579475e-06,
      "loss": 1.4426,
      "step": 1220
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.9536031484603882,
      "learning_rate": 5.912782375579412e-06,
      "loss": 1.4292,
      "step": 1230
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.993061363697052,
      "learning_rate": 5.596338392706077e-06,
      "loss": 1.432,
      "step": 1240
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.9642956852912903,
      "learning_rate": 5.2875267698322325e-06,
      "loss": 1.4427,
      "step": 1250
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.9925894737243652,
      "learning_rate": 4.986468976890993e-06,
      "loss": 1.4199,
      "step": 1260
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.0030889511108398,
      "learning_rate": 4.693283433874565e-06,
      "loss": 1.4253,
      "step": 1270
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.986602783203125,
      "learning_rate": 4.408085464254183e-06,
      "loss": 1.4382,
      "step": 1280
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.9463419318199158,
      "learning_rate": 4.130987249617993e-06,
      "loss": 1.439,
      "step": 1290
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.9418216347694397,
      "learning_rate": 3.8620977855448935e-06,
      "loss": 1.4322,
      "step": 1300
    },
    {
      "epoch": 0.83,
      "grad_norm": 1.067226529121399,
      "learning_rate": 3.601522838731461e-06,
      "loss": 1.4305,
      "step": 1310
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.9662885665893555,
      "learning_rate": 3.3493649053890326e-06,
      "loss": 1.4188,
      "step": 1320
    },
    {
      "epoch": 0.84,
      "grad_norm": 1.1397868394851685,
      "learning_rate": 3.1057231709272077e-06,
      "loss": 1.4426,
      "step": 1330
    },
    {
      "epoch": 0.85,
      "grad_norm": 1.0030759572982788,
      "learning_rate": 2.8706934709395892e-06,
      "loss": 1.4185,
      "step": 1340
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.9549908638000488,
      "learning_rate": 2.6443682535072177e-06,
      "loss": 1.4276,
      "step": 1350
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.9839365482330322,
      "learning_rate": 2.4268365428344736e-06,
      "loss": 1.4174,
      "step": 1360
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.954189121723175,
      "learning_rate": 2.21818390423168e-06,
      "loss": 1.441,
      "step": 1370
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.9914742708206177,
      "learning_rate": 2.0184924104583613e-06,
      "loss": 1.4322,
      "step": 1380
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.9965653419494629,
      "learning_rate": 1.8278406094401623e-06,
      "loss": 1.411,
      "step": 1390
    },
    {
      "epoch": 0.88,
      "grad_norm": 1.0744175910949707,
      "learning_rate": 1.6463034933723337e-06,
      "loss": 1.4368,
      "step": 1400
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.9871243238449097,
      "learning_rate": 1.4739524692218314e-06,
      "loss": 1.396,
      "step": 1410
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.9976981282234192,
      "learning_rate": 1.3108553306396265e-06,
      "loss": 1.439,
      "step": 1420
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.9817109704017639,
      "learning_rate": 1.1570762312943295e-06,
      "loss": 1.4113,
      "step": 1430
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.9741029143333435,
      "learning_rate": 1.0126756596375686e-06,
      "loss": 1.4438,
      "step": 1440
    },
    {
      "epoch": 0.91,
      "grad_norm": 1.0171328783035278,
      "learning_rate": 8.777104151110826e-07,
      "loss": 1.4365,
      "step": 1450
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.980021595954895,
      "learning_rate": 7.522335858048707e-07,
      "loss": 1.4355,
      "step": 1460
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.9966154098510742,
      "learning_rate": 6.362945275751736e-07,
      "loss": 1.431,
      "step": 1470
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.9687898755073547,
      "learning_rate": 5.299388446305343e-07,
      "loss": 1.4057,
      "step": 1480
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.9906119704246521,
      "learning_rate": 4.3320837159353813e-07,
      "loss": 1.421,
      "step": 1490
    },
    {
      "epoch": 0.95,
      "grad_norm": 1.0227527618408203,
      "learning_rate": 3.4614115704533767e-07,
      "loss": 1.4319,
      "step": 1500
    },
    {
      "epoch": 0.95,
      "grad_norm": 1.0115277767181396,
      "learning_rate": 2.687714485593462e-07,
      "loss": 1.4295,
      "step": 1510
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.993654727935791,
      "learning_rate": 2.011296792301165e-07,
      "loss": 1.4294,
      "step": 1520
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.8775748014450073,
      "learning_rate": 1.4324245570256633e-07,
      "loss": 1.4562,
      "step": 1530
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.9754842519760132,
      "learning_rate": 9.513254770636137e-08,
      "loss": 1.4447,
      "step": 1540
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.9996697902679443,
      "learning_rate": 5.681887909952388e-08,
      "loss": 1.4229,
      "step": 1550
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.9914098381996155,
      "learning_rate": 2.831652042480093e-08,
      "loss": 1.4458,
      "step": 1560
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.9639108777046204,
      "learning_rate": 9.636682981720158e-09,
      "loss": 1.4267,
      "step": 1570
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.9515108466148376,
      "learning_rate": 7.867144166728846e-10,
      "loss": 1.4373,
      "step": 1580
    },
    {
      "epoch": 1.0,
      "step": 1584,
      "total_flos": 1.1098698583858217e+18,
      "train_loss": 1.5383612829627413,
      "train_runtime": 4681.1872,
      "train_samples_per_second": 21.666,
      "train_steps_per_second": 0.338
    }
  ],
  "logging_steps": 10,
  "max_steps": 1584,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1000,
  "total_flos": 1.1098698583858217e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}