| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9981024667931688, | |
| "eval_steps": 500, | |
| "global_step": 1580, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01265022137887413, | |
| "grad_norm": 13.099321365356445, | |
| "learning_rate": 6.329113924050633e-08, | |
| "loss": 2.5391, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.02530044275774826, | |
| "grad_norm": 27.764171600341797, | |
| "learning_rate": 1.2658227848101266e-07, | |
| "loss": 2.5382, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.03795066413662239, | |
| "grad_norm": 13.548388481140137, | |
| "learning_rate": 1.89873417721519e-07, | |
| "loss": 2.5232, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.05060088551549652, | |
| "grad_norm": 12.454045295715332, | |
| "learning_rate": 2.5316455696202533e-07, | |
| "loss": 2.4481, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.06325110689437065, | |
| "grad_norm": 8.77104377746582, | |
| "learning_rate": 3.1645569620253163e-07, | |
| "loss": 2.3842, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.07590132827324478, | |
| "grad_norm": 6.88944149017334, | |
| "learning_rate": 3.79746835443038e-07, | |
| "loss": 2.2827, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.08855154965211891, | |
| "grad_norm": 6.3905930519104, | |
| "learning_rate": 4.4303797468354424e-07, | |
| "loss": 2.2146, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.10120177103099304, | |
| "grad_norm": 8.130488395690918, | |
| "learning_rate": 5.063291139240507e-07, | |
| "loss": 2.0978, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.11385199240986717, | |
| "grad_norm": 21.30768394470215, | |
| "learning_rate": 5.69620253164557e-07, | |
| "loss": 2.0624, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.1265022137887413, | |
| "grad_norm": 6.106363296508789, | |
| "learning_rate": 6.329113924050633e-07, | |
| "loss": 2.0176, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.13915243516761544, | |
| "grad_norm": 6.555318832397461, | |
| "learning_rate": 6.962025316455696e-07, | |
| "loss": 1.9714, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.15180265654648956, | |
| "grad_norm": 6.091899394989014, | |
| "learning_rate": 7.59493670886076e-07, | |
| "loss": 1.8976, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.1644528779253637, | |
| "grad_norm": 8.193241119384766, | |
| "learning_rate": 8.227848101265823e-07, | |
| "loss": 1.8571, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.17710309930423782, | |
| "grad_norm": 7.589028835296631, | |
| "learning_rate": 8.860759493670885e-07, | |
| "loss": 1.845, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.18975332068311196, | |
| "grad_norm": 7.830214023590088, | |
| "learning_rate": 9.493670886075948e-07, | |
| "loss": 1.8197, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.20240354206198607, | |
| "grad_norm": 6.8579535484313965, | |
| "learning_rate": 9.99995119100718e-07, | |
| "loss": 1.8233, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.21505376344086022, | |
| "grad_norm": 6.225603103637695, | |
| "learning_rate": 9.998242976313776e-07, | |
| "loss": 1.7624, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.22770398481973433, | |
| "grad_norm": 7.000970363616943, | |
| "learning_rate": 9.994095264822903e-07, | |
| "loss": 1.7696, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.24035420619860848, | |
| "grad_norm": 10.97808837890625, | |
| "learning_rate": 9.987510080911721e-07, | |
| "loss": 1.7406, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.2530044275774826, | |
| "grad_norm": 6.758321762084961, | |
| "learning_rate": 9.97849063861667e-07, | |
| "loss": 1.7793, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.2656546489563567, | |
| "grad_norm": 12.631876945495605, | |
| "learning_rate": 9.967041340064793e-07, | |
| "loss": 1.7416, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.2783048703352309, | |
| "grad_norm": 14.51364517211914, | |
| "learning_rate": 9.953167773325195e-07, | |
| "loss": 1.7273, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.290955091714105, | |
| "grad_norm": 9.859718322753906, | |
| "learning_rate": 9.936876709681666e-07, | |
| "loss": 1.7137, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.3036053130929791, | |
| "grad_norm": 8.631091117858887, | |
| "learning_rate": 9.91817610032781e-07, | |
| "loss": 1.7117, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.3162555344718533, | |
| "grad_norm": 9.83065414428711, | |
| "learning_rate": 9.897075072486298e-07, | |
| "loss": 1.7011, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.3289057558507274, | |
| "grad_norm": 17.304086685180664, | |
| "learning_rate": 9.87358392495415e-07, | |
| "loss": 1.7106, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.3415559772296015, | |
| "grad_norm": 10.813619613647461, | |
| "learning_rate": 9.847714123076173e-07, | |
| "loss": 1.6754, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.35420619860847563, | |
| "grad_norm": 10.511194229125977, | |
| "learning_rate": 9.81947829314908e-07, | |
| "loss": 1.6962, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.3668564199873498, | |
| "grad_norm": 10.410032272338867, | |
| "learning_rate": 9.788890216258938e-07, | |
| "loss": 1.6962, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.3795066413662239, | |
| "grad_norm": 7.6379714012146, | |
| "learning_rate": 9.755964821555046e-07, | |
| "loss": 1.6905, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.39215686274509803, | |
| "grad_norm": 16.694063186645508, | |
| "learning_rate": 9.720718178963446e-07, | |
| "loss": 1.6889, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.40480708412397215, | |
| "grad_norm": 10.317975044250488, | |
| "learning_rate": 9.68316749134364e-07, | |
| "loss": 1.6611, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.4174573055028463, | |
| "grad_norm": 10.705253601074219, | |
| "learning_rate": 9.643331086092404e-07, | |
| "loss": 1.6706, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.43010752688172044, | |
| "grad_norm": 10.81264591217041, | |
| "learning_rate": 9.601228406198703e-07, | |
| "loss": 1.6597, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.44275774826059455, | |
| "grad_norm": 7.797952175140381, | |
| "learning_rate": 9.55688000075414e-07, | |
| "loss": 1.667, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.45540796963946867, | |
| "grad_norm": 11.544702529907227, | |
| "learning_rate": 9.510307514923536e-07, | |
| "loss": 1.6463, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.46805819101834284, | |
| "grad_norm": 8.987153053283691, | |
| "learning_rate": 9.461533679380567e-07, | |
| "loss": 1.6518, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.48070841239721696, | |
| "grad_norm": 8.551813125610352, | |
| "learning_rate": 9.410582299213572e-07, | |
| "loss": 1.645, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.49335863377609107, | |
| "grad_norm": 13.203941345214844, | |
| "learning_rate": 9.357478242306996e-07, | |
| "loss": 1.633, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.5060088551549652, | |
| "grad_norm": 60.77254104614258, | |
| "learning_rate": 9.302247427204087e-07, | |
| "loss": 1.6537, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5186590765338394, | |
| "grad_norm": 23.738460540771484, | |
| "learning_rate": 9.24491681045682e-07, | |
| "loss": 1.6392, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.5313092979127134, | |
| "grad_norm": 12.887944221496582, | |
| "learning_rate": 9.185514373469179e-07, | |
| "loss": 1.6342, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.5439595192915876, | |
| "grad_norm": 11.586170196533203, | |
| "learning_rate": 9.124069108840264e-07, | |
| "loss": 1.6542, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.5566097406704618, | |
| "grad_norm": 6.504334926605225, | |
| "learning_rate": 9.060611006213832e-07, | |
| "loss": 1.6413, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.5692599620493358, | |
| "grad_norm": 8.936307907104492, | |
| "learning_rate": 8.995171037641234e-07, | |
| "loss": 1.6305, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.58191018342821, | |
| "grad_norm": 10.616469383239746, | |
| "learning_rate": 8.927781142464858e-07, | |
| "loss": 1.6135, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.5945604048070842, | |
| "grad_norm": 6.386105537414551, | |
| "learning_rate": 8.858474211729469e-07, | |
| "loss": 1.6249, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.6072106261859582, | |
| "grad_norm": 7.467748165130615, | |
| "learning_rate": 8.787284072129037e-07, | |
| "loss": 1.6282, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.6198608475648324, | |
| "grad_norm": 11.522847175598145, | |
| "learning_rate": 8.714245469496931e-07, | |
| "loss": 1.633, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.6325110689437066, | |
| "grad_norm": 5.804441928863525, | |
| "learning_rate": 8.639394051847471e-07, | |
| "loss": 1.6011, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.6451612903225806, | |
| "grad_norm": 7.378643035888672, | |
| "learning_rate": 8.562766351977181e-07, | |
| "loss": 1.6185, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.6578115117014548, | |
| "grad_norm": 6.906543254852295, | |
| "learning_rate": 8.484399769634203e-07, | |
| "loss": 1.6326, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.6704617330803289, | |
| "grad_norm": 7.340395927429199, | |
| "learning_rate": 8.404332553264546e-07, | |
| "loss": 1.6306, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.683111954459203, | |
| "grad_norm": 13.938148498535156, | |
| "learning_rate": 8.32260378134416e-07, | |
| "loss": 1.6156, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.6957621758380772, | |
| "grad_norm": 9.445398330688477, | |
| "learning_rate": 8.239253343305847e-07, | |
| "loss": 1.6228, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.7084123972169513, | |
| "grad_norm": 6.176075458526611, | |
| "learning_rate": 8.154321920070412e-07, | |
| "loss": 1.5906, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.7210626185958254, | |
| "grad_norm": 8.291935920715332, | |
| "learning_rate": 8.067850964191475e-07, | |
| "loss": 1.6202, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.7337128399746996, | |
| "grad_norm": 11.122963905334473, | |
| "learning_rate": 7.979882679623694e-07, | |
| "loss": 1.6181, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.7463630613535737, | |
| "grad_norm": 8.314069747924805, | |
| "learning_rate": 7.890460001124241e-07, | |
| "loss": 1.5892, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.7590132827324478, | |
| "grad_norm": 6.397180080413818, | |
| "learning_rate": 7.799626573297604e-07, | |
| "loss": 1.5916, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.7716635041113219, | |
| "grad_norm": 8.690041542053223, | |
| "learning_rate": 7.707426729293915e-07, | |
| "loss": 1.5919, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.7843137254901961, | |
| "grad_norm": 7.349343776702881, | |
| "learning_rate": 7.613905469171245e-07, | |
| "loss": 1.5858, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.7969639468690702, | |
| "grad_norm": 9.25490665435791, | |
| "learning_rate": 7.519108437932378e-07, | |
| "loss": 1.6114, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.8096141682479443, | |
| "grad_norm": 7.29209566116333, | |
| "learning_rate": 7.423081903246813e-07, | |
| "loss": 1.607, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.8222643896268185, | |
| "grad_norm": 7.68463659286499, | |
| "learning_rate": 7.325872732868869e-07, | |
| "loss": 1.5746, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.8349146110056926, | |
| "grad_norm": 6.869282245635986, | |
| "learning_rate": 7.227528371762896e-07, | |
| "loss": 1.5811, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.8475648323845667, | |
| "grad_norm": 17.074424743652344, | |
| "learning_rate": 7.128096818946769e-07, | |
| "loss": 1.5897, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.8602150537634409, | |
| "grad_norm": 11.504667282104492, | |
| "learning_rate": 7.027626604064969e-07, | |
| "loss": 1.5794, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.872865275142315, | |
| "grad_norm": 8.88005256652832, | |
| "learning_rate": 6.926166763702672e-07, | |
| "loss": 1.6042, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.8855154965211891, | |
| "grad_norm": 24.4627685546875, | |
| "learning_rate": 6.823766817452424e-07, | |
| "loss": 1.5639, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.8981657179000633, | |
| "grad_norm": 20.67166519165039, | |
| "learning_rate": 6.720476743745072e-07, | |
| "loss": 1.587, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.9108159392789373, | |
| "grad_norm": 16.13395118713379, | |
| "learning_rate": 6.616346955456742e-07, | |
| "loss": 1.6019, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.9234661606578115, | |
| "grad_norm": 16.747365951538086, | |
| "learning_rate": 6.511428275303785e-07, | |
| "loss": 1.6014, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.9361163820366857, | |
| "grad_norm": 7.6073994636535645, | |
| "learning_rate": 6.405771911037697e-07, | |
| "loss": 1.5753, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.9487666034155597, | |
| "grad_norm": 8.412175178527832, | |
| "learning_rate": 6.299429430452096e-07, | |
| "loss": 1.5632, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.9614168247944339, | |
| "grad_norm": 6.702009677886963, | |
| "learning_rate": 6.192452736213987e-07, | |
| "loss": 1.5773, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.9740670461733081, | |
| "grad_norm": 6.783812046051025, | |
| "learning_rate": 6.084894040531589e-07, | |
| "loss": 1.5662, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.9867172675521821, | |
| "grad_norm": 10.526253700256348, | |
| "learning_rate": 5.976805839671071e-07, | |
| "loss": 1.5854, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.9993674889310563, | |
| "grad_norm": 17.669225692749023, | |
| "learning_rate": 5.868240888334652e-07, | |
| "loss": 1.588, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.0113851992409868, | |
| "grad_norm": 7.780857086181641, | |
| "learning_rate": 5.759252173912572e-07, | |
| "loss": 1.479, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.0240354206198607, | |
| "grad_norm": 49.61854934692383, | |
| "learning_rate": 5.64989289062149e-07, | |
| "loss": 1.5685, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.036685641998735, | |
| "grad_norm": 31.814254760742188, | |
| "learning_rate": 5.540216413541936e-07, | |
| "loss": 1.5346, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.049335863377609, | |
| "grad_norm": 18.944929122924805, | |
| "learning_rate": 5.430276272567485e-07, | |
| "loss": 1.5623, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.0619860847564833, | |
| "grad_norm": 7.601119518280029, | |
| "learning_rate": 5.320126126278379e-07, | |
| "loss": 1.569, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.0746363061353574, | |
| "grad_norm": 6.994897842407227, | |
| "learning_rate": 5.209819735752341e-07, | |
| "loss": 1.555, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.0872865275142316, | |
| "grad_norm": 7.816515922546387, | |
| "learning_rate": 5.09941093832535e-07, | |
| "loss": 1.5482, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.0999367488931056, | |
| "grad_norm": 9.505668640136719, | |
| "learning_rate": 4.988953621315213e-07, | |
| "loss": 1.5455, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.1125869702719797, | |
| "grad_norm": 6.980685234069824, | |
| "learning_rate": 4.87850169572073e-07, | |
| "loss": 1.5679, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.125237191650854, | |
| "grad_norm": 5.63450288772583, | |
| "learning_rate": 4.7681090699093066e-07, | |
| "loss": 1.5502, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.137887413029728, | |
| "grad_norm": 11.722896575927734, | |
| "learning_rate": 4.657829623305859e-07, | |
| "loss": 1.5628, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.1505376344086022, | |
| "grad_norm": 14.06059455871582, | |
| "learning_rate": 4.5477171800958203e-07, | |
| "loss": 1.5144, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.1631878557874762, | |
| "grad_norm": 14.6784029006958, | |
| "learning_rate": 4.437825482955139e-07, | |
| "loss": 1.5457, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.1758380771663504, | |
| "grad_norm": 18.590673446655273, | |
| "learning_rate": 4.3282081668200327e-07, | |
| "loss": 1.5526, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.1884882985452245, | |
| "grad_norm": 27.646364212036133, | |
| "learning_rate": 4.218918732709342e-07, | |
| "loss": 1.5234, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.2011385199240987, | |
| "grad_norm": 8.348926544189453, | |
| "learning_rate": 4.1100105216122496e-07, | |
| "loss": 1.5587, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.2137887413029729, | |
| "grad_norm": 9.07374382019043, | |
| "learning_rate": 4.0015366884540814e-07, | |
| "loss": 1.5576, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.226438962681847, | |
| "grad_norm": 6.855799198150635, | |
| "learning_rate": 3.893550176152954e-07, | |
| "loss": 1.5354, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.239089184060721, | |
| "grad_norm": 8.235871315002441, | |
| "learning_rate": 3.78610368977986e-07, | |
| "loss": 1.5196, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.2517394054395952, | |
| "grad_norm": 8.418612480163574, | |
| "learning_rate": 3.6792496708348774e-07, | |
| "loss": 1.5618, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.2643896268184693, | |
| "grad_norm": 8.189360618591309, | |
| "learning_rate": 3.5730402716519826e-07, | |
| "loss": 1.5453, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.2770398481973435, | |
| "grad_norm": 8.278912544250488, | |
| "learning_rate": 3.4675273299450256e-07, | |
| "loss": 1.5456, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.2896900695762175, | |
| "grad_norm": 5.916414260864258, | |
| "learning_rate": 3.362762343507257e-07, | |
| "loss": 1.5276, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.3023402909550916, | |
| "grad_norm": 7.753712177276611, | |
| "learning_rate": 3.258796445076738e-07, | |
| "loss": 1.5288, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.3149905123339658, | |
| "grad_norm": 6.598722457885742, | |
| "learning_rate": 3.1556803773799613e-07, | |
| "loss": 1.5544, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.32764073371284, | |
| "grad_norm": 13.177520751953125, | |
| "learning_rate": 3.053464468365785e-07, | |
| "loss": 1.5548, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.3402909550917141, | |
| "grad_norm": 8.982342720031738, | |
| "learning_rate": 2.9521986066418446e-07, | |
| "loss": 1.5316, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.3529411764705883, | |
| "grad_norm": 8.048005104064941, | |
| "learning_rate": 2.8519322171253604e-07, | |
| "loss": 1.5499, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.3655913978494625, | |
| "grad_norm": 13.070209503173828, | |
| "learning_rate": 2.7527142369202875e-07, | |
| "loss": 1.5515, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.3782416192283364, | |
| "grad_norm": 10.934873580932617, | |
| "learning_rate": 2.6545930914325374e-07, | |
| "loss": 1.5432, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.3908918406072106, | |
| "grad_norm": 22.81064796447754, | |
| "learning_rate": 2.5576166707349384e-07, | |
| "loss": 1.5591, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.4035420619860848, | |
| "grad_norm": 6.494205474853516, | |
| "learning_rate": 2.4618323061935093e-07, | |
| "loss": 1.5343, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.416192283364959, | |
| "grad_norm": 9.8826322555542, | |
| "learning_rate": 2.3672867473663672e-07, | |
| "loss": 1.5541, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.428842504743833, | |
| "grad_norm": 15.296801567077637, | |
| "learning_rate": 2.2740261391866633e-07, | |
| "loss": 1.521, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.441492726122707, | |
| "grad_norm": 9.621323585510254, | |
| "learning_rate": 2.182095999440552e-07, | |
| "loss": 1.5235, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.4541429475015812, | |
| "grad_norm": 6.374513626098633, | |
| "learning_rate": 2.091541196551318e-07, | |
| "loss": 1.5362, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.4667931688804554, | |
| "grad_norm": 8.73390007019043, | |
| "learning_rate": 2.0024059276803739e-07, | |
| "loss": 1.5475, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.4794433902593296, | |
| "grad_norm": 7.136387348175049, | |
| "learning_rate": 1.9147336971559448e-07, | |
| "loss": 1.5519, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.4920936116382038, | |
| "grad_norm": 15.520442962646484, | |
| "learning_rate": 1.8285672952398446e-07, | |
| "loss": 1.5551, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.504743833017078, | |
| "grad_norm": 8.319755554199219, | |
| "learning_rate": 1.743948777242814e-07, | |
| "loss": 1.5433, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.5173940543959519, | |
| "grad_norm": 41.71631622314453, | |
| "learning_rate": 1.6609194429985436e-07, | |
| "loss": 1.5308, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.530044275774826, | |
| "grad_norm": 19.970256805419922, | |
| "learning_rate": 1.5795198167064249e-07, | |
| "loss": 1.5446, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.5426944971537002, | |
| "grad_norm": 7.31848669052124, | |
| "learning_rate": 1.4997896271528737e-07, | |
| "loss": 1.5417, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.5553447185325742, | |
| "grad_norm": 11.576011657714844, | |
| "learning_rate": 1.4217677883208624e-07, | |
| "loss": 1.5312, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.5679949399114483, | |
| "grad_norm": 6.723977088928223, | |
| "learning_rate": 1.3454923803971418e-07, | |
| "loss": 1.5214, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.5806451612903225, | |
| "grad_norm": 12.752711296081543, | |
| "learning_rate": 1.2710006311864103e-07, | |
| "loss": 1.5196, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.5932953826691967, | |
| "grad_norm": 7.53589391708374, | |
| "learning_rate": 1.1983288979415062e-07, | |
| "loss": 1.5456, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.6059456040480709, | |
| "grad_norm": 26.84853172302246, | |
| "learning_rate": 1.1275126496184917e-07, | |
| "loss": 1.5518, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.618595825426945, | |
| "grad_norm": 25.39905548095703, | |
| "learning_rate": 1.0585864495652896e-07, | |
| "loss": 1.5198, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.6312460468058192, | |
| "grad_norm": 29.904163360595703, | |
| "learning_rate": 9.915839386523211e-08, | |
| "loss": 1.5363, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.6438962681846934, | |
| "grad_norm": 12.930685043334961, | |
| "learning_rate": 9.265378188533696e-08, | |
| "loss": 1.5213, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.6565464895635673, | |
| "grad_norm": 7.995054244995117, | |
| "learning_rate": 8.634798372847146e-08, | |
| "loss": 1.5326, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.6691967109424415, | |
| "grad_norm": 8.757749557495117, | |
| "learning_rate": 8.024407707102698e-08, | |
| "loss": 1.5254, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.6818469323213157, | |
| "grad_norm": 6.595693588256836, | |
| "learning_rate": 7.434504105203621e-08, | |
| "loss": 1.5285, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.6944971537001896, | |
| "grad_norm": 9.962479591369629, | |
| "learning_rate": 6.865375481914015e-08, | |
| "loss": 1.5483, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.7071473750790638, | |
| "grad_norm": 18.681095123291016, | |
| "learning_rate": 6.317299612336146e-08, | |
| "loss": 1.5408, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.719797596457938, | |
| "grad_norm": 9.540229797363281, | |
| "learning_rate": 5.790543996336466e-08, | |
| "loss": 1.5333, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.7324478178368121, | |
| "grad_norm": 18.783493041992188, | |
| "learning_rate": 5.285365727986707e-08, | |
| "loss": 1.5343, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.7450980392156863, | |
| "grad_norm": 31.82489776611328, | |
| "learning_rate": 4.802011370083747e-08, | |
| "loss": 1.5412, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.7577482605945605, | |
| "grad_norm": 5.806077480316162, | |
| "learning_rate": 4.3407168338095325e-08, | |
| "loss": 1.5522, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.7703984819734346, | |
| "grad_norm": 15.603910446166992, | |
| "learning_rate": 3.901707263589671e-08, | |
| "loss": 1.5457, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.7830487033523088, | |
| "grad_norm": 6.486084461212158, | |
| "learning_rate": 3.485196927206985e-08, | |
| "loss": 1.537, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.7956989247311828, | |
| "grad_norm": 7.477235794067383, | |
| "learning_rate": 3.091389111223691e-08, | |
| "loss": 1.5367, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.808349146110057, | |
| "grad_norm": 5.301967620849609, | |
| "learning_rate": 2.7204760217631074e-08, | |
| "loss": 1.555, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.820999367488931, | |
| "grad_norm": 11.517831802368164, | |
| "learning_rate": 2.3726386906994688e-08, | |
| "loss": 1.5269, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.833649588867805, | |
| "grad_norm": 6.416511058807373, | |
| "learning_rate": 2.0480468873015298e-08, | |
| "loss": 1.5494, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.8462998102466792, | |
| "grad_norm": 12.837623596191406, | |
| "learning_rate": 1.7468590353731495e-08, | |
| "loss": 1.517, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.8589500316255534, | |
| "grad_norm": 6.5693159103393555, | |
| "learning_rate": 1.4692221359312196e-08, | |
| "loss": 1.5285, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.8716002530044276, | |
| "grad_norm": 36.55837631225586, | |
| "learning_rate": 1.2152716954587694e-08, | |
| "loss": 1.517, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.8842504743833017, | |
| "grad_norm": 9.189281463623047, | |
| "learning_rate": 9.851316597681959e-09, | |
| "loss": 1.5424, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.896900695762176, | |
| "grad_norm": 5.850296497344971, | |
| "learning_rate": 7.789143535069153e-09, | |
| "loss": 1.5468, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.90955091714105, | |
| "grad_norm": 20.80071258544922, | |
| "learning_rate": 5.9672042533499e-09, | |
| "loss": 1.5156, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.9222011385199242, | |
| "grad_norm": 15.397515296936035, | |
| "learning_rate": 4.386387988014273e-09, | |
| "loss": 1.5257, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.9348513598987982, | |
| "grad_norm": 7.598056316375732, | |
| "learning_rate": 3.0474662894321437e-09, | |
| "loss": 1.5253, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.9475015812776724, | |
| "grad_norm": 17.48052978515625, | |
| "learning_rate": 1.9510926462816823e-09, | |
| "loss": 1.514, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.9601518026565465, | |
| "grad_norm": 9.740424156188965, | |
| "learning_rate": 1.0978021666005476e-09, | |
| "loss": 1.5153, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.9728020240354205, | |
| "grad_norm": 7.099216461181641, | |
| "learning_rate": 4.880113166155774e-10, | |
| "loss": 1.5389, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.9854522454142947, | |
| "grad_norm": 12.578405380249023, | |
| "learning_rate": 1.2201771747727407e-10, | |
| "loss": 1.5433, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.9981024667931688, | |
| "grad_norm": 10.554688453674316, | |
| "learning_rate": 0.0, | |
| "loss": 1.5311, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.9981024667931688, | |
| "step": 1580, | |
| "total_flos": 2.5201109024647414e+18, | |
| "train_loss": 1.6402756485757948, | |
| "train_runtime": 7842.2323, | |
| "train_samples_per_second": 3.225, | |
| "train_steps_per_second": 0.201 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1580, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.5201109024647414e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |