{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.2643896268184693,
  "eval_steps": 500,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01265022137887413,
      "grad_norm": 13.099321365356445,
      "learning_rate": 6.329113924050633e-08,
      "loss": 2.5391,
      "step": 10
    },
    {
      "epoch": 0.02530044275774826,
      "grad_norm": 27.764171600341797,
      "learning_rate": 1.2658227848101266e-07,
      "loss": 2.5382,
      "step": 20
    },
    {
      "epoch": 0.03795066413662239,
      "grad_norm": 13.548388481140137,
      "learning_rate": 1.89873417721519e-07,
      "loss": 2.5232,
      "step": 30
    },
    {
      "epoch": 0.05060088551549652,
      "grad_norm": 12.454045295715332,
      "learning_rate": 2.5316455696202533e-07,
      "loss": 2.4481,
      "step": 40
    },
    {
      "epoch": 0.06325110689437065,
      "grad_norm": 8.77104377746582,
      "learning_rate": 3.1645569620253163e-07,
      "loss": 2.3842,
      "step": 50
    },
    {
      "epoch": 0.07590132827324478,
      "grad_norm": 6.88944149017334,
      "learning_rate": 3.79746835443038e-07,
      "loss": 2.2827,
      "step": 60
    },
    {
      "epoch": 0.08855154965211891,
      "grad_norm": 6.3905930519104,
      "learning_rate": 4.4303797468354424e-07,
      "loss": 2.2146,
      "step": 70
    },
    {
      "epoch": 0.10120177103099304,
      "grad_norm": 8.130488395690918,
      "learning_rate": 5.063291139240507e-07,
      "loss": 2.0978,
      "step": 80
    },
    {
      "epoch": 0.11385199240986717,
      "grad_norm": 21.30768394470215,
      "learning_rate": 5.69620253164557e-07,
      "loss": 2.0624,
      "step": 90
    },
    {
      "epoch": 0.1265022137887413,
      "grad_norm": 6.106363296508789,
      "learning_rate": 6.329113924050633e-07,
      "loss": 2.0176,
      "step": 100
    },
    {
      "epoch": 0.13915243516761544,
      "grad_norm": 6.555318832397461,
      "learning_rate": 6.962025316455696e-07,
      "loss": 1.9714,
      "step": 110
    },
    {
      "epoch": 0.15180265654648956,
      "grad_norm": 6.091899394989014,
      "learning_rate": 7.59493670886076e-07,
      "loss": 1.8976,
      "step": 120
    },
    {
      "epoch": 0.1644528779253637,
      "grad_norm": 8.193241119384766,
      "learning_rate": 8.227848101265823e-07,
      "loss": 1.8571,
      "step": 130
    },
    {
      "epoch": 0.17710309930423782,
      "grad_norm": 7.589028835296631,
      "learning_rate": 8.860759493670885e-07,
      "loss": 1.845,
      "step": 140
    },
    {
      "epoch": 0.18975332068311196,
      "grad_norm": 7.830214023590088,
      "learning_rate": 9.493670886075948e-07,
      "loss": 1.8197,
      "step": 150
    },
    {
      "epoch": 0.20240354206198607,
      "grad_norm": 6.8579535484313965,
      "learning_rate": 9.99995119100718e-07,
      "loss": 1.8233,
      "step": 160
    },
    {
      "epoch": 0.21505376344086022,
      "grad_norm": 6.225603103637695,
      "learning_rate": 9.998242976313776e-07,
      "loss": 1.7624,
      "step": 170
    },
    {
      "epoch": 0.22770398481973433,
      "grad_norm": 7.000970363616943,
      "learning_rate": 9.994095264822903e-07,
      "loss": 1.7696,
      "step": 180
    },
    {
      "epoch": 0.24035420619860848,
      "grad_norm": 10.97808837890625,
      "learning_rate": 9.987510080911721e-07,
      "loss": 1.7406,
      "step": 190
    },
    {
      "epoch": 0.2530044275774826,
      "grad_norm": 6.758321762084961,
      "learning_rate": 9.97849063861667e-07,
      "loss": 1.7793,
      "step": 200
    },
    {
      "epoch": 0.2656546489563567,
      "grad_norm": 12.631876945495605,
      "learning_rate": 9.967041340064793e-07,
      "loss": 1.7416,
      "step": 210
    },
    {
      "epoch": 0.2783048703352309,
      "grad_norm": 14.51364517211914,
      "learning_rate": 9.953167773325195e-07,
      "loss": 1.7273,
      "step": 220
    },
    {
      "epoch": 0.290955091714105,
      "grad_norm": 9.859718322753906,
      "learning_rate": 9.936876709681666e-07,
      "loss": 1.7137,
      "step": 230
    },
    {
      "epoch": 0.3036053130929791,
      "grad_norm": 8.631091117858887,
      "learning_rate": 9.91817610032781e-07,
      "loss": 1.7117,
      "step": 240
    },
    {
      "epoch": 0.3162555344718533,
      "grad_norm": 9.83065414428711,
      "learning_rate": 9.897075072486298e-07,
      "loss": 1.7011,
      "step": 250
    },
    {
      "epoch": 0.3289057558507274,
      "grad_norm": 17.304086685180664,
      "learning_rate": 9.87358392495415e-07,
      "loss": 1.7106,
      "step": 260
    },
    {
      "epoch": 0.3415559772296015,
      "grad_norm": 10.813619613647461,
      "learning_rate": 9.847714123076173e-07,
      "loss": 1.6754,
      "step": 270
    },
    {
      "epoch": 0.35420619860847563,
      "grad_norm": 10.511194229125977,
      "learning_rate": 9.81947829314908e-07,
      "loss": 1.6962,
      "step": 280
    },
    {
      "epoch": 0.3668564199873498,
      "grad_norm": 10.410032272338867,
      "learning_rate": 9.788890216258938e-07,
      "loss": 1.6962,
      "step": 290
    },
    {
      "epoch": 0.3795066413662239,
      "grad_norm": 7.6379714012146,
      "learning_rate": 9.755964821555046e-07,
      "loss": 1.6905,
      "step": 300
    },
    {
      "epoch": 0.39215686274509803,
      "grad_norm": 16.694063186645508,
      "learning_rate": 9.720718178963446e-07,
      "loss": 1.6889,
      "step": 310
    },
    {
      "epoch": 0.40480708412397215,
      "grad_norm": 10.317975044250488,
      "learning_rate": 9.68316749134364e-07,
      "loss": 1.6611,
      "step": 320
    },
    {
      "epoch": 0.4174573055028463,
      "grad_norm": 10.705253601074219,
      "learning_rate": 9.643331086092404e-07,
      "loss": 1.6706,
      "step": 330
    },
    {
      "epoch": 0.43010752688172044,
      "grad_norm": 10.81264591217041,
      "learning_rate": 9.601228406198703e-07,
      "loss": 1.6597,
      "step": 340
    },
    {
      "epoch": 0.44275774826059455,
      "grad_norm": 7.797952175140381,
      "learning_rate": 9.55688000075414e-07,
      "loss": 1.667,
      "step": 350
    },
    {
      "epoch": 0.45540796963946867,
      "grad_norm": 11.544702529907227,
      "learning_rate": 9.510307514923536e-07,
      "loss": 1.6463,
      "step": 360
    },
    {
      "epoch": 0.46805819101834284,
      "grad_norm": 8.987153053283691,
      "learning_rate": 9.461533679380567e-07,
      "loss": 1.6518,
      "step": 370
    },
    {
      "epoch": 0.48070841239721696,
      "grad_norm": 8.551813125610352,
      "learning_rate": 9.410582299213572e-07,
      "loss": 1.645,
      "step": 380
    },
    {
      "epoch": 0.49335863377609107,
      "grad_norm": 13.203941345214844,
      "learning_rate": 9.357478242306996e-07,
      "loss": 1.633,
      "step": 390
    },
    {
      "epoch": 0.5060088551549652,
      "grad_norm": 60.77254104614258,
      "learning_rate": 9.302247427204087e-07,
      "loss": 1.6537,
      "step": 400
    },
    {
      "epoch": 0.5186590765338394,
      "grad_norm": 23.738460540771484,
      "learning_rate": 9.24491681045682e-07,
      "loss": 1.6392,
      "step": 410
    },
    {
      "epoch": 0.5313092979127134,
      "grad_norm": 12.887944221496582,
      "learning_rate": 9.185514373469179e-07,
      "loss": 1.6342,
      "step": 420
    },
    {
      "epoch": 0.5439595192915876,
      "grad_norm": 11.586170196533203,
      "learning_rate": 9.124069108840264e-07,
      "loss": 1.6542,
      "step": 430
    },
    {
      "epoch": 0.5566097406704618,
      "grad_norm": 6.504334926605225,
      "learning_rate": 9.060611006213832e-07,
      "loss": 1.6413,
      "step": 440
    },
    {
      "epoch": 0.5692599620493358,
      "grad_norm": 8.936307907104492,
      "learning_rate": 8.995171037641234e-07,
      "loss": 1.6305,
      "step": 450
    },
    {
      "epoch": 0.58191018342821,
      "grad_norm": 10.616469383239746,
      "learning_rate": 8.927781142464858e-07,
      "loss": 1.6135,
      "step": 460
    },
    {
      "epoch": 0.5945604048070842,
      "grad_norm": 6.386105537414551,
      "learning_rate": 8.858474211729469e-07,
      "loss": 1.6249,
      "step": 470
    },
    {
      "epoch": 0.6072106261859582,
      "grad_norm": 7.467748165130615,
      "learning_rate": 8.787284072129037e-07,
      "loss": 1.6282,
      "step": 480
    },
    {
      "epoch": 0.6198608475648324,
      "grad_norm": 11.522847175598145,
      "learning_rate": 8.714245469496931e-07,
      "loss": 1.633,
      "step": 490
    },
    {
      "epoch": 0.6325110689437066,
      "grad_norm": 5.804441928863525,
      "learning_rate": 8.639394051847471e-07,
      "loss": 1.6011,
      "step": 500
    },
    {
      "epoch": 0.6451612903225806,
      "grad_norm": 7.378643035888672,
      "learning_rate": 8.562766351977181e-07,
      "loss": 1.6185,
      "step": 510
    },
    {
      "epoch": 0.6578115117014548,
      "grad_norm": 6.906543254852295,
      "learning_rate": 8.484399769634203e-07,
      "loss": 1.6326,
      "step": 520
    },
    {
      "epoch": 0.6704617330803289,
      "grad_norm": 7.340395927429199,
      "learning_rate": 8.404332553264546e-07,
      "loss": 1.6306,
      "step": 530
    },
    {
      "epoch": 0.683111954459203,
      "grad_norm": 13.938148498535156,
      "learning_rate": 8.32260378134416e-07,
      "loss": 1.6156,
      "step": 540
    },
    {
      "epoch": 0.6957621758380772,
      "grad_norm": 9.445398330688477,
      "learning_rate": 8.239253343305847e-07,
      "loss": 1.6228,
      "step": 550
    },
    {
      "epoch": 0.7084123972169513,
      "grad_norm": 6.176075458526611,
      "learning_rate": 8.154321920070412e-07,
      "loss": 1.5906,
      "step": 560
    },
    {
      "epoch": 0.7210626185958254,
      "grad_norm": 8.291935920715332,
      "learning_rate": 8.067850964191475e-07,
      "loss": 1.6202,
      "step": 570
    },
    {
      "epoch": 0.7337128399746996,
      "grad_norm": 11.122963905334473,
      "learning_rate": 7.979882679623694e-07,
      "loss": 1.6181,
      "step": 580
    },
    {
      "epoch": 0.7463630613535737,
      "grad_norm": 8.314069747924805,
      "learning_rate": 7.890460001124241e-07,
      "loss": 1.5892,
      "step": 590
    },
    {
      "epoch": 0.7590132827324478,
      "grad_norm": 6.397180080413818,
      "learning_rate": 7.799626573297604e-07,
      "loss": 1.5916,
      "step": 600
    },
    {
      "epoch": 0.7716635041113219,
      "grad_norm": 8.690041542053223,
      "learning_rate": 7.707426729293915e-07,
      "loss": 1.5919,
      "step": 610
    },
    {
      "epoch": 0.7843137254901961,
      "grad_norm": 7.349343776702881,
      "learning_rate": 7.613905469171245e-07,
      "loss": 1.5858,
      "step": 620
    },
    {
      "epoch": 0.7969639468690702,
      "grad_norm": 9.25490665435791,
      "learning_rate": 7.519108437932378e-07,
      "loss": 1.6114,
      "step": 630
    },
    {
      "epoch": 0.8096141682479443,
      "grad_norm": 7.29209566116333,
      "learning_rate": 7.423081903246813e-07,
      "loss": 1.607,
      "step": 640
    },
    {
      "epoch": 0.8222643896268185,
      "grad_norm": 7.68463659286499,
      "learning_rate": 7.325872732868869e-07,
      "loss": 1.5746,
      "step": 650
    },
    {
      "epoch": 0.8349146110056926,
      "grad_norm": 6.869282245635986,
      "learning_rate": 7.227528371762896e-07,
      "loss": 1.5811,
      "step": 660
    },
    {
      "epoch": 0.8475648323845667,
      "grad_norm": 17.074424743652344,
      "learning_rate": 7.128096818946769e-07,
      "loss": 1.5897,
      "step": 670
    },
    {
      "epoch": 0.8602150537634409,
      "grad_norm": 11.504667282104492,
      "learning_rate": 7.027626604064969e-07,
      "loss": 1.5794,
      "step": 680
    },
    {
      "epoch": 0.872865275142315,
      "grad_norm": 8.88005256652832,
      "learning_rate": 6.926166763702672e-07,
      "loss": 1.6042,
      "step": 690
    },
    {
      "epoch": 0.8855154965211891,
      "grad_norm": 24.4627685546875,
      "learning_rate": 6.823766817452424e-07,
      "loss": 1.5639,
      "step": 700
    },
    {
      "epoch": 0.8981657179000633,
      "grad_norm": 20.67166519165039,
      "learning_rate": 6.720476743745072e-07,
      "loss": 1.587,
      "step": 710
    },
    {
      "epoch": 0.9108159392789373,
      "grad_norm": 16.13395118713379,
      "learning_rate": 6.616346955456742e-07,
      "loss": 1.6019,
      "step": 720
    },
    {
      "epoch": 0.9234661606578115,
      "grad_norm": 16.747365951538086,
      "learning_rate": 6.511428275303785e-07,
      "loss": 1.6014,
      "step": 730
    },
    {
      "epoch": 0.9361163820366857,
      "grad_norm": 7.6073994636535645,
      "learning_rate": 6.405771911037697e-07,
      "loss": 1.5753,
      "step": 740
    },
    {
      "epoch": 0.9487666034155597,
      "grad_norm": 8.412175178527832,
      "learning_rate": 6.299429430452096e-07,
      "loss": 1.5632,
      "step": 750
    },
    {
      "epoch": 0.9614168247944339,
      "grad_norm": 6.702009677886963,
      "learning_rate": 6.192452736213987e-07,
      "loss": 1.5773,
      "step": 760
    },
    {
      "epoch": 0.9740670461733081,
      "grad_norm": 6.783812046051025,
      "learning_rate": 6.084894040531589e-07,
      "loss": 1.5662,
      "step": 770
    },
    {
      "epoch": 0.9867172675521821,
      "grad_norm": 10.526253700256348,
      "learning_rate": 5.976805839671071e-07,
      "loss": 1.5854,
      "step": 780
    },
    {
      "epoch": 0.9993674889310563,
      "grad_norm": 17.669225692749023,
      "learning_rate": 5.868240888334652e-07,
      "loss": 1.588,
      "step": 790
    },
    {
      "epoch": 1.0113851992409868,
      "grad_norm": 7.780857086181641,
      "learning_rate": 5.759252173912572e-07,
      "loss": 1.479,
      "step": 800
    },
    {
      "epoch": 1.0240354206198607,
      "grad_norm": 49.61854934692383,
      "learning_rate": 5.64989289062149e-07,
      "loss": 1.5685,
      "step": 810
    },
    {
      "epoch": 1.036685641998735,
      "grad_norm": 31.814254760742188,
      "learning_rate": 5.540216413541936e-07,
      "loss": 1.5346,
      "step": 820
    },
    {
      "epoch": 1.049335863377609,
      "grad_norm": 18.944929122924805,
      "learning_rate": 5.430276272567485e-07,
      "loss": 1.5623,
      "step": 830
    },
    {
      "epoch": 1.0619860847564833,
      "grad_norm": 7.601119518280029,
      "learning_rate": 5.320126126278379e-07,
      "loss": 1.569,
      "step": 840
    },
    {
      "epoch": 1.0746363061353574,
      "grad_norm": 6.994897842407227,
      "learning_rate": 5.209819735752341e-07,
      "loss": 1.555,
      "step": 850
    },
    {
      "epoch": 1.0872865275142316,
      "grad_norm": 7.816515922546387,
      "learning_rate": 5.09941093832535e-07,
      "loss": 1.5482,
      "step": 860
    },
    {
      "epoch": 1.0999367488931056,
      "grad_norm": 9.505668640136719,
      "learning_rate": 4.988953621315213e-07,
      "loss": 1.5455,
      "step": 870
    },
    {
      "epoch": 1.1125869702719797,
      "grad_norm": 6.980685234069824,
      "learning_rate": 4.87850169572073e-07,
      "loss": 1.5679,
      "step": 880
    },
    {
      "epoch": 1.125237191650854,
      "grad_norm": 5.63450288772583,
      "learning_rate": 4.7681090699093066e-07,
      "loss": 1.5502,
      "step": 890
    },
    {
      "epoch": 1.137887413029728,
      "grad_norm": 11.722896575927734,
      "learning_rate": 4.657829623305859e-07,
      "loss": 1.5628,
      "step": 900
    },
    {
      "epoch": 1.1505376344086022,
      "grad_norm": 14.06059455871582,
      "learning_rate": 4.5477171800958203e-07,
      "loss": 1.5144,
      "step": 910
    },
    {
      "epoch": 1.1631878557874762,
      "grad_norm": 14.6784029006958,
      "learning_rate": 4.437825482955139e-07,
      "loss": 1.5457,
      "step": 920
    },
    {
      "epoch": 1.1758380771663504,
      "grad_norm": 18.590673446655273,
      "learning_rate": 4.3282081668200327e-07,
      "loss": 1.5526,
      "step": 930
    },
    {
      "epoch": 1.1884882985452245,
      "grad_norm": 27.646364212036133,
      "learning_rate": 4.218918732709342e-07,
      "loss": 1.5234,
      "step": 940
    },
    {
      "epoch": 1.2011385199240987,
      "grad_norm": 8.348926544189453,
      "learning_rate": 4.1100105216122496e-07,
      "loss": 1.5587,
      "step": 950
    },
    {
      "epoch": 1.2137887413029729,
      "grad_norm": 9.07374382019043,
      "learning_rate": 4.0015366884540814e-07,
      "loss": 1.5576,
      "step": 960
    },
    {
      "epoch": 1.226438962681847,
      "grad_norm": 6.855799198150635,
      "learning_rate": 3.893550176152954e-07,
      "loss": 1.5354,
      "step": 970
    },
    {
      "epoch": 1.239089184060721,
      "grad_norm": 8.235871315002441,
      "learning_rate": 3.78610368977986e-07,
      "loss": 1.5196,
      "step": 980
    },
    {
      "epoch": 1.2517394054395952,
      "grad_norm": 8.418612480163574,
      "learning_rate": 3.6792496708348774e-07,
      "loss": 1.5618,
      "step": 990
    },
    {
      "epoch": 1.2643896268184693,
      "grad_norm": 8.189360618591309,
      "learning_rate": 3.5730402716519826e-07,
      "loss": 1.5453,
      "step": 1000
    }
  ],
  "logging_steps": 10,
  "max_steps": 1580,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.5953517615670886e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}