{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "grad_norm": 0.18099799752235413, "learning_rate": 3e-06, "loss": 1.2088, "step": 10 }, { "grad_norm": 0.20619013905525208, "learning_rate": 6.333333333333334e-06, "loss": 1.1961, "step": 20 }, { "grad_norm": 0.14481662213802338, "learning_rate": 9.666666666666667e-06, "loss": 1.1631, "step": 30 }, { "grad_norm": 0.14480064809322357, "learning_rate": 1.3000000000000001e-05, "loss": 1.12, "step": 40 }, { "grad_norm": 0.1346646249294281, "learning_rate": 1.6333333333333335e-05, "loss": 1.0839, "step": 50 }, { "grad_norm": 0.1372521072626114, "learning_rate": 1.9666666666666666e-05, "loss": 1.0649, "step": 60 }, { "grad_norm": 0.2891208529472351, "learning_rate": 2.3000000000000003e-05, "loss": 1.0489, "step": 70 }, { "grad_norm": 0.2914903163909912, "learning_rate": 2.633333333333333e-05, "loss": 1.0182, "step": 80 }, { "grad_norm": 0.4698174297809601, "learning_rate": 2.9666666666666672e-05, "loss": 0.9654, "step": 90 }, { "grad_norm": 0.8523975014686584, "learning_rate": 3.3e-05, "loss": 0.9062, "step": 100 }, { "grad_norm": 0.5535483956336975, "learning_rate": 3.633333333333333e-05, "loss": 0.8529, "step": 110 }, { "grad_norm": 0.6099287271499634, "learning_rate": 3.966666666666667e-05, "loss": 0.8047, "step": 120 }, { "grad_norm": 0.6395930051803589, "learning_rate": 4.3e-05, "loss": 0.7568, "step": 130 }, { "grad_norm": 0.7526710033416748, "learning_rate": 4.633333333333333e-05, "loss": 0.7168, "step": 140 }, { "grad_norm": 0.7764474153518677, "learning_rate": 4.966666666666667e-05, "loss": 0.6804, "step": 150 }, { "grad_norm": 0.9331451654434204, "learning_rate": 5.300000000000001e-05, "loss": 0.6504, "step": 160 }, { "grad_norm": 0.702415943145752, "learning_rate": 5.633333333333334e-05, "loss": 0.625, "step": 170 }, { "grad_norm": 1.1818166971206665, "learning_rate": 5.966666666666667e-05, "loss": 0.6007, "step": 180 }, { "grad_norm": 1.1777819395065308, "learning_rate": 6.3e-05, "loss": 0.5718, "step": 190 }, { "grad_norm": 1.0473010540008545, "learning_rate": 6.633333333333334e-05, "loss": 0.5543, "step": 200 }, { "grad_norm": 0.873248815536499, "learning_rate": 6.966666666666668e-05, "loss": 0.5387, "step": 210 }, { "grad_norm": 0.8804641962051392, "learning_rate": 7.3e-05, "loss": 0.5221, "step": 220 }, { "grad_norm": 0.915578305721283, "learning_rate": 7.633333333333334e-05, "loss": 0.5033, "step": 230 }, { "grad_norm": 1.316872000694275, "learning_rate": 7.966666666666666e-05, "loss": 0.4825, "step": 240 }, { "grad_norm": 1.1146589517593384, "learning_rate": 8.3e-05, "loss": 0.4626, "step": 250 }, { "grad_norm": 1.5606942176818848, "learning_rate": 8.633333333333334e-05, "loss": 0.4351, "step": 260 }, { "grad_norm": 1.0714901685714722, "learning_rate": 8.966666666666666e-05, "loss": 0.4128, "step": 270 }, { "grad_norm": 1.3251256942749023, "learning_rate": 9.300000000000001e-05, "loss": 0.3907, "step": 280 }, { "grad_norm": 1.0007801055908203, "learning_rate": 9.633333333333335e-05, "loss": 0.3783, "step": 290 }, { "grad_norm": 1.3502074480056763, "learning_rate": 9.966666666666667e-05, "loss": 0.3597, "step": 300 }, { "grad_norm": 1.1962398290634155, "learning_rate": 9.999938485971279e-05, "loss": 0.3383, "step": 310 }, { "grad_norm": 1.634523630142212, "learning_rate": 9.999725846827562e-05, "loss": 0.3226, "step": 320 }, { "grad_norm": 1.4946134090423584, "learning_rate": 9.999361329594254e-05, "loss": 0.2976, "step": 330 }, { "grad_norm": 1.2499020099639893, "learning_rate": 9.998844945344405e-05, "loss": 0.2673, "step": 340 }, { "grad_norm": 1.481104850769043, "learning_rate": 9.99817670976436e-05, "loss": 0.2459, "step": 350 }, { "grad_norm": 1.554062008857727, "learning_rate": 9.997356643153303e-05, "loss": 0.2272, "step": 360 }, { "grad_norm": 1.8656765222549438, "learning_rate": 9.996384770422629e-05, "loss": 0.2146, "step": 370 }, { "grad_norm": 1.0278624296188354, "learning_rate": 9.995261121095194e-05, "loss": 0.1967, "step": 380 }, { "grad_norm": 1.3932757377624512, "learning_rate": 9.993985729304408e-05, "loss": 0.1756, "step": 390 }, { "grad_norm": 1.618746042251587, "learning_rate": 9.992558633793212e-05, "loss": 0.1501, "step": 400 }, { "grad_norm": 1.1260430812835693, "learning_rate": 9.990979877912891e-05, "loss": 0.1335, "step": 410 }, { "grad_norm": 1.0491538047790527, "learning_rate": 9.989249509621759e-05, "loss": 0.1259, "step": 420 }, { "grad_norm": 1.284968614578247, "learning_rate": 9.987367581483705e-05, "loss": 0.122, "step": 430 }, { "grad_norm": 1.0914318561553955, "learning_rate": 9.985334150666592e-05, "loss": 0.1172, "step": 440 }, { "grad_norm": 1.1247401237487793, "learning_rate": 9.983149278940526e-05, "loss": 0.1097, "step": 450 }, { "grad_norm": 1.192862868309021, "learning_rate": 9.980813032675974e-05, "loss": 0.1, "step": 460 }, { "grad_norm": 0.8659683465957642, "learning_rate": 9.978325482841753e-05, "loss": 0.1027, "step": 470 }, { "grad_norm": 1.0352890491485596, "learning_rate": 9.975686705002867e-05, "loss": 0.1003, "step": 480 }, { "grad_norm": 1.2678529024124146, "learning_rate": 9.972896779318219e-05, "loss": 0.0971, "step": 490 }, { "grad_norm": 1.1098188161849976, "learning_rate": 9.969955790538175e-05, "loss": 0.0992, "step": 500 }, { "grad_norm": 1.181512713432312, "learning_rate": 9.966863828001982e-05, "loss": 0.0965, "step": 510 }, { "grad_norm": 1.0185215473175049, "learning_rate": 9.963620985635065e-05, "loss": 0.0877, "step": 520 }, { "grad_norm": 1.1182595491409302, "learning_rate": 9.960227361946164e-05, "loss": 0.0843, "step": 530 }, { "grad_norm": 0.9148073196411133, "learning_rate": 9.95668306002435e-05, "loss": 0.0923, "step": 540 }, { "grad_norm": 0.9969584941864014, "learning_rate": 9.952988187535886e-05, "loss": 0.0849, "step": 550 }, { "grad_norm": 1.2698750495910645, "learning_rate": 9.949142856720961e-05, "loss": 0.0835, "step": 560 }, { "grad_norm": 1.0291186571121216, "learning_rate": 9.945147184390278e-05, "loss": 0.0878, "step": 570 }, { "grad_norm": 0.9522308111190796, "learning_rate": 9.941001291921512e-05, "loss": 0.0823, "step": 580 }, { "grad_norm": 1.0067387819290161, "learning_rate": 9.936705305255612e-05, "loss": 0.084, "step": 590 }, { "grad_norm": 1.1268168687820435, "learning_rate": 9.932259354892984e-05, "loss": 0.0824, "step": 600 }, { "grad_norm": 0.8802523016929626, "learning_rate": 9.927663575889521e-05, "loss": 0.0792, "step": 610 }, { "grad_norm": 1.3417941331863403, "learning_rate": 9.922918107852504e-05, "loss": 0.0811, "step": 620 }, { "grad_norm": 1.097968578338623, "learning_rate": 9.918023094936363e-05, "loss": 0.077, "step": 630 }, { "grad_norm": 1.0577588081359863, "learning_rate": 9.912978685838294e-05, "loss": 0.0802, "step": 640 }, { "grad_norm": 1.1714197397232056, "learning_rate": 9.90778503379374e-05, "loss": 0.078, "step": 650 }, { "grad_norm": 0.8812937140464783, "learning_rate": 9.902442296571743e-05, "loss": 0.0708, "step": 660 }, { "grad_norm": 0.9380112886428833, "learning_rate": 9.896950636470147e-05, "loss": 0.0803, "step": 670 }, { "grad_norm": 1.1852452754974365, "learning_rate": 9.891310220310666e-05, "loss": 0.0757, "step": 680 }, { "grad_norm": 1.0475136041641235, "learning_rate": 9.885521219433823e-05, "loss": 0.0727, "step": 690 }, { "grad_norm": 1.1130269765853882, "learning_rate": 9.879583809693738e-05, "loss": 0.0711, "step": 700 }, { "grad_norm": 0.9928076863288879, "learning_rate": 9.873498171452789e-05, "loss": 0.0703, "step": 710 }, { "grad_norm": 0.979897141456604, "learning_rate": 9.867264489576135e-05, "loss": 0.0687, "step": 720 }, { "grad_norm": 1.0989563465118408, "learning_rate": 9.860882953426099e-05, "loss": 0.0747, "step": 730 }, { "grad_norm": 1.0301982164382935, "learning_rate": 9.854353756856412e-05, "loss": 0.0699, "step": 740 }, { "grad_norm": 1.101908802986145, "learning_rate": 9.847677098206332e-05, "loss": 0.069, "step": 750 }, { "grad_norm": 0.8733094334602356, "learning_rate": 9.840853180294608e-05, "loss": 0.0672, "step": 760 }, { "grad_norm": 1.0546811819076538, "learning_rate": 9.833882210413332e-05, "loss": 0.0706, "step": 770 }, { "grad_norm": 0.8678887486457825, "learning_rate": 9.826764400321633e-05, "loss": 0.0702, "step": 780 }, { "grad_norm": 0.8769698739051819, "learning_rate": 9.819499966239243e-05, "loss": 0.0678, "step": 790 }, { "grad_norm": 1.1157478094100952, "learning_rate": 9.812089128839938e-05, "loss": 0.0693, "step": 800 }, { "grad_norm": 1.0164200067520142, "learning_rate": 9.804532113244828e-05, "loss": 0.0624, "step": 810 }, { "grad_norm": 0.915485680103302, "learning_rate": 9.796829149015517e-05, "loss": 0.0647, "step": 820 }, { "grad_norm": 0.8830865025520325, "learning_rate": 9.788980470147132e-05, "loss": 0.0613, "step": 830 }, { "grad_norm": 1.0174789428710938, "learning_rate": 9.780986315061218e-05, "loss": 0.0641, "step": 840 }, { "grad_norm": 0.7468952536582947, "learning_rate": 9.772846926598491e-05, "loss": 0.0716, "step": 850 }, { "grad_norm": 0.7474204301834106, "learning_rate": 9.76456255201146e-05, "loss": 0.0636, "step": 860 }, { "grad_norm": 1.0349617004394531, "learning_rate": 9.756133442956923e-05, "loss": 0.0612, "step": 870 }, { "grad_norm": 0.8907390236854553, "learning_rate": 9.747559855488313e-05, "loss": 0.0656, "step": 880 }, { "grad_norm": 0.8625577092170715, "learning_rate": 9.73884205004793e-05, "loss": 0.0637, "step": 890 }, { "grad_norm": 0.895968496799469, "learning_rate": 9.729980291459019e-05, "loss": 0.0635, "step": 900 }, { "grad_norm": 0.9742909073829651, "learning_rate": 9.720974848917735e-05, "loss": 0.0596, "step": 910 }, { "grad_norm": 0.7080522775650024, "learning_rate": 9.711825995984957e-05, "loss": 0.0604, "step": 920 }, { "grad_norm": 0.7485001087188721, "learning_rate": 9.702534010577991e-05, "loss": 0.0627, "step": 930 }, { "grad_norm": 0.8010299801826477, "learning_rate": 9.693099174962103e-05, "loss": 0.0584, "step": 940 }, { "grad_norm": 0.8207157254219055, "learning_rate": 9.683521775741977e-05, "loss": 0.0606, "step": 950 }, { "grad_norm": 0.7718232870101929, "learning_rate": 9.673802103852979e-05, "loss": 0.0586, "step": 960 }, { "grad_norm": 1.0081161260604858, "learning_rate": 9.663940454552342e-05, "loss": 0.0595, "step": 970 }, { "grad_norm": 0.8325558304786682, "learning_rate": 9.65393712741018e-05, "loss": 0.0581, "step": 980 }, { "grad_norm": 0.9128422737121582, "learning_rate": 9.6437924263004e-05, "loss": 0.0613, "step": 990 }, { "grad_norm": 0.8013613224029541, "learning_rate": 9.63350665939146e-05, "loss": 0.059, "step": 1000 }, { "grad_norm": 0.8024020791053772, "learning_rate": 9.623080139137023e-05, "loss": 0.0585, "step": 1010 }, { "grad_norm": 0.8608654737472534, "learning_rate": 9.612513182266447e-05, "loss": 0.0573, "step": 1020 }, { "grad_norm": 0.7992358803749084, "learning_rate": 9.601806109775179e-05, "loss": 0.0588, "step": 1030 }, { "grad_norm": 0.9951710104942322, "learning_rate": 9.590959246914995e-05, "loss": 0.0549, "step": 1040 }, { "grad_norm": 0.7153400778770447, "learning_rate": 9.579972923184122e-05, "loss": 0.0581, "step": 1050 }, { "grad_norm": 0.8274824023246765, "learning_rate": 9.568847472317232e-05, "loss": 0.0528, "step": 1060 }, { "grad_norm": 0.6790134906768799, "learning_rate": 9.557583232275303e-05, "loss": 0.0554, "step": 1070 }, { "grad_norm": 0.7177821397781372, "learning_rate": 9.546180545235344e-05, "loss": 0.0525, "step": 1080 }, { "grad_norm": 0.8989811539649963, "learning_rate": 9.534639757580013e-05, "loss": 0.0515, "step": 1090 }, { "grad_norm": 0.8031622767448425, "learning_rate": 9.522961219887092e-05, "loss": 0.0564, "step": 1100 }, { "grad_norm": 0.8315763473510742, "learning_rate": 9.511145286918828e-05, "loss": 0.0567, "step": 1110 }, { "grad_norm": 0.7631978988647461, "learning_rate": 9.499192317611167e-05, "loss": 0.0524, "step": 1120 }, { "grad_norm": 0.8047354817390442, "learning_rate": 9.487102675062851e-05, "loss": 0.0563, "step": 1130 }, { "grad_norm": 0.5823233127593994, "learning_rate": 9.474876726524374e-05, "loss": 0.0507, "step": 1140 }, { "grad_norm": 0.7840980887413025, "learning_rate": 9.462514843386845e-05, "loss": 0.0522, "step": 1150 }, { "grad_norm": 0.7950931191444397, "learning_rate": 9.450017401170689e-05, "loss": 0.0544, "step": 1160 }, { "grad_norm": 0.7812637090682983, "learning_rate": 9.437384779514256e-05, "loss": 0.0538, "step": 1170 }, { "grad_norm": 0.8743076920509338, "learning_rate": 9.424617362162271e-05, "loss": 0.0551, "step": 1180 }, { "grad_norm": 0.7778111100196838, "learning_rate": 9.411715536954196e-05, "loss": 0.0515, "step": 1190 }, { "grad_norm": 0.799373209476471, "learning_rate": 9.39867969581243e-05, "loss": 0.0514, "step": 1200 }, { "grad_norm": 0.7291685342788696, "learning_rate": 9.385510234730415e-05, "loss": 0.0524, "step": 1210 }, { "grad_norm": 0.830124020576477, "learning_rate": 9.372207553760603e-05, "loss": 0.0506, "step": 1220 }, { "grad_norm": 0.6252336502075195, "learning_rate": 9.358772057002312e-05, "loss": 0.0502, "step": 1230 }, { "grad_norm": 0.7802227735519409, "learning_rate": 9.345204152589428e-05, "loss": 0.0505, "step": 1240 }, { "grad_norm": 0.7094554901123047, "learning_rate": 9.331504252678037e-05, "loss": 0.0537, "step": 1250 }, { "grad_norm": 0.7272769808769226, "learning_rate": 9.317672773433876e-05, "loss": 0.0506, "step": 1260 }, { "grad_norm": 0.6488326191902161, "learning_rate": 9.30371013501972e-05, "loss": 0.0484, "step": 1270 }, { "grad_norm": 0.6355553865432739, "learning_rate": 9.289616761582587e-05, "loss": 0.0544, "step": 1280 }, { "grad_norm": 0.769917905330658, "learning_rate": 9.275393081240882e-05, "loss": 0.048, "step": 1290 }, { "grad_norm": 0.6785501837730408, "learning_rate": 9.261039526071374e-05, "loss": 0.0484, "step": 1300 }, { "grad_norm": 0.7995139360427856, "learning_rate": 9.246556532096078e-05, "loss": 0.05, "step": 1310 }, { "grad_norm": 0.7101492285728455, "learning_rate": 9.231944539269009e-05, "loss": 0.0499, "step": 1320 }, { "grad_norm": 0.6292925477027893, "learning_rate": 9.217203991462815e-05, "loss": 0.0499, "step": 1330 }, { "grad_norm": 0.6308528780937195, "learning_rate": 9.202335336455296e-05, "loss": 0.049, "step": 1340 }, { "grad_norm": 0.6749176979064941, "learning_rate": 9.187339025915802e-05, "loss": 0.0488, "step": 1350 }, { "grad_norm": 0.5973607897758484, "learning_rate": 9.17221551539151e-05, "loss": 0.0475, "step": 1360 }, { "grad_norm": 0.6545643210411072, "learning_rate": 9.156965264293586e-05, "loss": 0.0476, "step": 1370 }, { "grad_norm": 0.6092913746833801, "learning_rate": 9.141588735883232e-05, "loss": 0.0433, "step": 1380 }, { "grad_norm": 0.5947241187095642, "learning_rate": 9.126086397257612e-05, "loss": 0.0471, "step": 1390 }, { "grad_norm": 0.5612359046936035, "learning_rate": 9.110458719335659e-05, "loss": 0.0463, "step": 1400 }, { "grad_norm": 0.654656708240509, "learning_rate": 9.094706176843777e-05, "loss": 0.0486, "step": 1410 }, { "grad_norm": 0.7321748733520508, "learning_rate": 9.078829248301417e-05, "loss": 0.0451, "step": 1420 }, { "grad_norm": 0.7481226325035095, "learning_rate": 9.062828416006539e-05, "loss": 0.0503, "step": 1430 }, { "grad_norm": 0.6706563234329224, "learning_rate": 9.046704166020961e-05, "loss": 0.0472, "step": 1440 }, { "grad_norm": 0.6942538619041443, "learning_rate": 9.030456988155596e-05, "loss": 0.0462, "step": 1450 }, { "grad_norm": 0.65287184715271, "learning_rate": 9.014087375955573e-05, "loss": 0.0469, "step": 1460 }, { "grad_norm": 0.7019280195236206, "learning_rate": 8.997595826685243e-05, "loss": 0.0514, "step": 1470 }, { "grad_norm": 0.6150776147842407, "learning_rate": 8.980982841313074e-05, "loss": 0.0466, "step": 1480 }, { "grad_norm": 0.784782350063324, "learning_rate": 8.964248924496435e-05, "loss": 0.0434, "step": 1490 }, { "grad_norm": 0.6784024834632874, "learning_rate": 8.947394584566258e-05, "loss": 0.0438, "step": 1500 }, { "grad_norm": 0.5981051921844482, "learning_rate": 8.930420333511606e-05, "loss": 0.0427, "step": 1510 }, { "grad_norm": 0.7331579923629761, "learning_rate": 8.913326686964117e-05, "loss": 0.0432, "step": 1520 }, { "grad_norm": 0.6730307936668396, "learning_rate": 8.89611416418234e-05, "loss": 0.0424, "step": 1530 }, { "grad_norm": 0.5771387219429016, "learning_rate": 8.878783288035957e-05, "loss": 0.0432, "step": 1540 }, { "grad_norm": 0.7928068041801453, "learning_rate": 8.86133458498991e-05, "loss": 0.0475, "step": 1550 }, { "grad_norm": 0.6628245115280151, "learning_rate": 8.843768585088393e-05, "loss": 0.0432, "step": 1560 }, { "grad_norm": 0.7262830138206482, "learning_rate": 8.82608582193877e-05, "loss": 0.0451, "step": 1570 }, { "grad_norm": 0.6896581649780273, "learning_rate": 8.80828683269535e-05, "loss": 0.0429, "step": 1580 }, { "grad_norm": 0.6019271016120911, "learning_rate": 8.790372158043074e-05, "loss": 0.0416, "step": 1590 }, { "grad_norm": 0.6586809754371643, "learning_rate": 8.772342342181095e-05, "loss": 0.0435, "step": 1600 }, { "grad_norm": 0.741075336933136, "learning_rate": 8.75419793280624e-05, "loss": 0.0428, "step": 1610 }, { "grad_norm": 0.7138071656227112, "learning_rate": 8.735939481096378e-05, "loss": 0.0415, "step": 1620 }, { "grad_norm": 0.665623128414154, "learning_rate": 8.717567541693673e-05, "loss": 0.0437, "step": 1630 }, { "grad_norm": 0.6723113059997559, "learning_rate": 8.699082672687734e-05, "loss": 0.0442, "step": 1640 }, { "grad_norm": 0.5757609605789185, "learning_rate": 8.680485435598673e-05, "loss": 0.0473, "step": 1650 }, { "grad_norm": 0.646248459815979, "learning_rate": 8.661776395360029e-05, "loss": 0.0443, "step": 1660 }, { "grad_norm": 0.7440095543861389, "learning_rate": 8.642956120301626e-05, "loss": 0.0414, "step": 1670 }, { "grad_norm": 0.6682982444763184, "learning_rate": 8.624025182132292e-05, "loss": 0.042, "step": 1680 }, { "grad_norm": 0.6209063529968262, "learning_rate": 8.604984155922506e-05, "loss": 0.0422, "step": 1690 }, { "grad_norm": 0.6250181198120117, "learning_rate": 8.585833620086918e-05, "loss": 0.042, "step": 1700 }, { "grad_norm": 0.709252655506134, "learning_rate": 8.566574156366784e-05, "loss": 0.0369, "step": 1710 }, { "grad_norm": 0.783593475818634, "learning_rate": 8.547206349812298e-05, "loss": 0.0445, "step": 1720 }, { "grad_norm": 0.5931394100189209, "learning_rate": 8.527730788764805e-05, "loss": 0.0449, "step": 1730 }, { "grad_norm": 0.5985734462738037, "learning_rate": 8.508148064838948e-05, "loss": 0.0412, "step": 1740 }, { "grad_norm": 0.528599739074707, "learning_rate": 8.488458772904684e-05, "loss": 0.0398, "step": 1750 }, { "grad_norm": 0.6593722701072693, "learning_rate": 8.468663511069217e-05, "loss": 0.0408, "step": 1760 }, { "grad_norm": 0.5931499600410461, "learning_rate": 8.448762880658825e-05, "loss": 0.0414, "step": 1770 }, { "grad_norm": 0.5673992037773132, "learning_rate": 8.428757486200603e-05, "loss": 0.041, "step": 1780 }, { "grad_norm": 0.7802947759628296, "learning_rate": 8.40864793540409e-05, "loss": 0.0421, "step": 1790 }, { "grad_norm": 0.5950642228126526, "learning_rate": 8.388434839142813e-05, "loss": 0.0424, "step": 1800 }, { "grad_norm": 0.6841787099838257, "learning_rate": 8.368118811435726e-05, "loss": 0.0391, "step": 1810 }, { "grad_norm": 0.5789716839790344, "learning_rate": 8.347700469428564e-05, "loss": 0.0386, "step": 1820 }, { "grad_norm": 0.6306881904602051, "learning_rate": 8.327180433375091e-05, "loss": 0.0404, "step": 1830 }, { "grad_norm": 0.5804703831672668, "learning_rate": 8.306559326618259e-05, "loss": 0.0392, "step": 1840 }, { "grad_norm": 0.6453599333763123, "learning_rate": 8.285837775571276e-05, "loss": 0.0398, "step": 1850 }, { "grad_norm": 0.5413265228271484, "learning_rate": 8.265016409698573e-05, "loss": 0.0389, "step": 1860 }, { "grad_norm": 0.5259561538696289, "learning_rate": 8.244095861496686e-05, "loss": 0.0389, "step": 1870 }, { "grad_norm": 0.6392974853515625, "learning_rate": 8.223076766475035e-05, "loss": 0.0404, "step": 1880 }, { "grad_norm": 0.7087792754173279, "learning_rate": 8.201959763136633e-05, "loss": 0.0388, "step": 1890 }, { "grad_norm": 0.7540794610977173, "learning_rate": 8.180745492958674e-05, "loss": 0.0419, "step": 1900 }, { "grad_norm": 0.5628899335861206, "learning_rate": 8.159434600373061e-05, "loss": 0.0375, "step": 1910 }, { "grad_norm": 0.5828471779823303, "learning_rate": 8.138027732746818e-05, "loss": 0.0394, "step": 1920 }, { "grad_norm": 0.6918069124221802, "learning_rate": 8.116525540362434e-05, "loss": 0.0377, "step": 1930 }, { "grad_norm": 0.5691211819648743, "learning_rate": 8.094928676398101e-05, "loss": 0.0389, "step": 1940 }, { "grad_norm": 0.5968996286392212, "learning_rate": 8.073237796907882e-05, "loss": 0.0361, "step": 1950 }, { "grad_norm": 0.5921427011489868, "learning_rate": 8.051453560801772e-05, "loss": 0.0433, "step": 1960 }, { "grad_norm": 0.5701543688774109, "learning_rate": 8.029576629825687e-05, "loss": 0.0368, "step": 1970 }, { "grad_norm": 0.6130271553993225, "learning_rate": 8.007607668541362e-05, "loss": 0.0395, "step": 1980 }, { "grad_norm": 0.6060221195220947, "learning_rate": 7.985547344306161e-05, "loss": 0.0438, "step": 1990 }, { "grad_norm": 0.709045946598053, "learning_rate": 7.963396327252812e-05, "loss": 0.0414, "step": 2000 }, { "grad_norm": 0.6804901361465454, "learning_rate": 7.941155290269038e-05, "loss": 0.0394, "step": 2010 }, { "grad_norm": 0.5408011078834534, "learning_rate": 7.918824908977123e-05, "loss": 0.0367, "step": 2020 }, { "grad_norm": 0.554338812828064, "learning_rate": 7.896405861713394e-05, "loss": 0.0356, "step": 2030 }, { "grad_norm": 0.711392879486084, "learning_rate": 7.873898829507606e-05, "loss": 0.0371, "step": 2040 }, { "grad_norm": 0.6779384613037109, "learning_rate": 7.851304496062254e-05, "loss": 0.038, "step": 2050 }, { "grad_norm": 0.6775013208389282, "learning_rate": 7.828623547731818e-05, "loss": 0.038, "step": 2060 }, { "grad_norm": 0.5738393664360046, "learning_rate": 7.80585667350189e-05, "loss": 0.0388, "step": 2070 }, { "grad_norm": 0.5050686001777649, "learning_rate": 7.783004564968263e-05, "loss": 0.0381, "step": 2080 }, { "grad_norm": 0.6223453283309937, "learning_rate": 7.760067916315921e-05, "loss": 0.0382, "step": 2090 }, { "grad_norm": 0.6240858435630798, "learning_rate": 7.737047424297941e-05, "loss": 0.0345, "step": 2100 }, { "grad_norm": 0.5866036415100098, "learning_rate": 7.713943788214337e-05, "loss": 0.0341, "step": 2110 }, { "grad_norm": 0.6695197224617004, "learning_rate": 7.690757709890812e-05, "loss": 0.0354, "step": 2120 }, { "grad_norm": 0.5520651340484619, "learning_rate": 7.66748989365744e-05, "loss": 0.0366, "step": 2130 }, { "grad_norm": 0.5425397157669067, "learning_rate": 7.644141046327271e-05, "loss": 0.0339, "step": 2140 }, { "grad_norm": 0.5396847128868103, "learning_rate": 7.620711877174866e-05, "loss": 0.037, "step": 2150 }, { "grad_norm": 0.633583128452301, "learning_rate": 7.597203097914732e-05, "loss": 0.0358, "step": 2160 }, { "grad_norm": 0.5030661821365356, "learning_rate": 7.573615422679726e-05, "loss": 0.0372, "step": 2170 }, { "grad_norm": 0.7198052406311035, "learning_rate": 7.549949567999345e-05, "loss": 0.0344, "step": 2180 }, { "grad_norm": 0.5248534679412842, "learning_rate": 7.526206252777968e-05, "loss": 0.0382, "step": 2190 }, { "grad_norm": 0.6668030619621277, "learning_rate": 7.50238619827301e-05, "loss": 0.0375, "step": 2200 }, { "grad_norm": 0.6512902975082397, "learning_rate": 7.478490128073022e-05, "loss": 0.0365, "step": 2210 }, { "grad_norm": 0.5244461894035339, "learning_rate": 7.454518768075704e-05, "loss": 0.0369, "step": 2220 }, { "grad_norm": 0.5693942308425903, "learning_rate": 7.430472846465856e-05, "loss": 0.0344, "step": 2230 }, { "grad_norm": 0.6084948182106018, "learning_rate": 7.406353093693253e-05, "loss": 0.035, "step": 2240 }, { "grad_norm": 0.536939263343811, "learning_rate": 7.382160242450469e-05, "loss": 0.0356, "step": 2250 }, { "grad_norm": 0.5331522226333618, "learning_rate": 7.357895027650598e-05, "loss": 0.031, "step": 2260 }, { "grad_norm": 0.45928630232810974, "learning_rate": 7.333558186404958e-05, "loss": 0.0327, "step": 2270 }, { "grad_norm": 0.528089165687561, "learning_rate": 7.309150458000668e-05, "loss": 0.0359, "step": 2280 }, { "grad_norm": 0.581315279006958, "learning_rate": 7.284672583878219e-05, "loss": 0.0343, "step": 2290 }, { "grad_norm": 0.558525800704956, "learning_rate": 7.260125307608929e-05, "loss": 0.0367, "step": 2300 }, { "grad_norm": 0.47152066230773926, "learning_rate": 7.235509374872373e-05, "loss": 0.035, "step": 2310 }, { "grad_norm": 0.6010111570358276, "learning_rate": 7.210825533433719e-05, "loss": 0.0335, "step": 2320 }, { "grad_norm": 0.6121062636375427, "learning_rate": 7.186074533121013e-05, "loss": 0.0336, "step": 2330 }, { "grad_norm": 0.5350408554077148, "learning_rate": 7.161257125802413e-05, "loss": 0.0353, "step": 2340 }, { "grad_norm": 0.5239009857177734, "learning_rate": 7.136374065363334e-05, "loss": 0.037, "step": 2350 }, { "grad_norm": 0.4956763982772827, "learning_rate": 7.11142610768356e-05, "loss": 0.0354, "step": 2360 }, { "grad_norm": 0.5018975138664246, "learning_rate": 7.086414010614276e-05, "loss": 0.0338, "step": 2370 }, { "grad_norm": 0.4875252842903137, "learning_rate": 7.061338533955043e-05, "loss": 0.0362, "step": 2380 }, { "grad_norm": 0.47811827063560486, "learning_rate": 7.036200439430725e-05, "loss": 0.0376, "step": 2390 }, { "grad_norm": 0.5614078044891357, "learning_rate": 7.01100049066835e-05, "loss": 0.0339, "step": 2400 }, { "grad_norm": 0.6021232008934021, "learning_rate": 6.985739453173903e-05, "loss": 0.0372, "step": 2410 }, { "grad_norm": 0.5856548547744751, "learning_rate": 6.960418094309085e-05, "loss": 0.0353, "step": 2420 }, { "grad_norm": 0.46249493956565857, "learning_rate": 6.93503718326799e-05, "loss": 0.0334, "step": 2430 }, { "grad_norm": 0.5227417945861816, "learning_rate": 6.909597491053751e-05, "loss": 0.0342, "step": 2440 }, { "grad_norm": 0.607357382774353, "learning_rate": 6.884099790455113e-05, "loss": 0.0324, "step": 2450 }, { "grad_norm": 0.485953152179718, "learning_rate": 6.858544856022952e-05, "loss": 0.0348, "step": 2460 }, { "grad_norm": 0.571148157119751, "learning_rate": 6.83293346404676e-05, "loss": 0.0347, "step": 2470 }, { "grad_norm": 0.5217222571372986, "learning_rate": 6.80726639253105e-05, "loss": 0.0368, "step": 2480 }, { "grad_norm": 0.4487457573413849, "learning_rate": 6.781544421171732e-05, "loss": 0.0355, "step": 2490 }, { "grad_norm": 0.47729650139808655, "learning_rate": 6.755768331332424e-05, "loss": 0.0343, "step": 2500 }, { "grad_norm": 0.4894144535064697, "learning_rate": 6.729938906020713e-05, "loss": 0.0353, "step": 2510 }, { "grad_norm": 0.544179379940033, "learning_rate": 6.704056929864376e-05, "loss": 0.0331, "step": 2520 }, { "grad_norm": 0.6115988492965698, "learning_rate": 6.67812318908754e-05, "loss": 0.0326, "step": 2530 }, { "grad_norm": 0.5752000212669373, "learning_rate": 6.6521384714868e-05, "loss": 0.0312, "step": 2540 }, { "grad_norm": 0.47528618574142456, "learning_rate": 6.626103566407295e-05, "loss": 0.0331, "step": 2550 }, { "grad_norm": 0.5542522072792053, "learning_rate": 6.600019264718713e-05, "loss": 0.0327, "step": 2560 }, { "grad_norm": 0.5280784368515015, "learning_rate": 6.573886358791285e-05, "loss": 0.0347, "step": 2570 }, { "grad_norm": 0.5374977588653564, "learning_rate": 6.547705642471703e-05, "loss": 0.0331, "step": 2580 }, { "grad_norm": 0.3995784521102905, "learning_rate": 6.521477911059008e-05, "loss": 0.0287, "step": 2590 }, { "grad_norm": 0.43667104840278625, "learning_rate": 6.495203961280434e-05, "loss": 0.0327, "step": 2600 }, { "grad_norm": 0.5405910611152649, "learning_rate": 6.468884591267204e-05, "loss": 0.0325, "step": 2610 }, { "grad_norm": 0.46785178780555725, "learning_rate": 6.44252060053028e-05, "loss": 0.0318, "step": 2620 }, { "grad_norm": 0.45796945691108704, "learning_rate": 6.416112789936086e-05, "loss": 0.0331, "step": 2630 }, { "grad_norm": 0.4898403286933899, "learning_rate": 6.389661961682173e-05, "loss": 0.0317, "step": 2640 }, { "grad_norm": 0.5258901119232178, "learning_rate": 6.363168919272846e-05, "loss": 0.0317, "step": 2650 }, { "grad_norm": 0.492632120847702, "learning_rate": 6.336634467494768e-05, "loss": 0.0306, "step": 2660 }, { "grad_norm": 0.5009192824363708, "learning_rate": 6.310059412392505e-05, "loss": 0.0304, "step": 2670 }, { "grad_norm": 0.6297652721405029, "learning_rate": 6.283444561244042e-05, "loss": 0.0304, "step": 2680 }, { "grad_norm": 0.4868377149105072, "learning_rate": 6.256790722536251e-05, "loss": 0.0313, "step": 2690 }, { "grad_norm": 0.5541006922721863, "learning_rate": 6.230098705940354e-05, "loss": 0.0316, "step": 2700 }, { "grad_norm": 0.42766621708869934, "learning_rate": 6.203369322287306e-05, "loss": 0.0327, "step": 2710 }, { "grad_norm": 0.5170658826828003, "learning_rate": 6.17660338354317e-05, "loss": 0.0293, "step": 2720 }, { "grad_norm": 0.4898792505264282, "learning_rate": 6.149801702784456e-05, "loss": 0.0288, "step": 2730 }, { "grad_norm": 0.4858188033103943, "learning_rate": 6.122965094173424e-05, "loss": 0.031, "step": 2740 }, { "grad_norm": 0.5073441863059998, "learning_rate": 6.0960943729333374e-05, "loss": 0.034, "step": 2750 }, { "grad_norm": 0.4941282570362091, "learning_rate": 6.069190355323717e-05, "loss": 0.0305, "step": 2760 }, { "grad_norm": 0.4680149257183075, "learning_rate": 6.042253858615532e-05, "loss": 0.0308, "step": 2770 }, { "grad_norm": 0.4339468777179718, "learning_rate": 6.015285701066382e-05, "loss": 0.0333, "step": 2780 }, { "grad_norm": 0.46258655190467834, "learning_rate": 5.988286701895631e-05, "loss": 0.0349, "step": 2790 }, { "grad_norm": 0.490296870470047, "learning_rate": 5.961257681259535e-05, "loss": 0.0343, "step": 2800 }, { "grad_norm": 0.5121153593063354, "learning_rate": 5.934199460226317e-05, "loss": 0.0332, "step": 2810 }, { "grad_norm": 0.4576858878135681, "learning_rate": 5.9071128607512285e-05, "loss": 0.0308, "step": 2820 }, { "grad_norm": 0.4811716675758362, "learning_rate": 5.8799987056515804e-05, "loss": 0.0304, "step": 2830 }, { "grad_norm": 0.6127643585205078, "learning_rate": 5.8528578185817514e-05, "loss": 0.0318, "step": 2840 }, { "grad_norm": 0.48503780364990234, "learning_rate": 5.825691024008162e-05, "loss": 0.0294, "step": 2850 }, { "grad_norm": 0.555530846118927, "learning_rate": 5.798499147184233e-05, "loss": 0.0307, "step": 2860 }, { "grad_norm": 0.562579870223999, "learning_rate": 5.771283014125317e-05, "loss": 0.0338, "step": 2870 }, { "grad_norm": 0.5244818925857544, "learning_rate": 5.7440434515836064e-05, "loss": 0.0284, "step": 2880 }, { "grad_norm": 0.3919405937194824, "learning_rate": 5.7167812870230094e-05, "loss": 0.0305, "step": 2890 }, { "grad_norm": 0.46723607182502747, "learning_rate": 5.689497348594035e-05, "loss": 0.0292, "step": 2900 }, { "grad_norm": 0.47963953018188477, "learning_rate": 5.662192465108613e-05, "loss": 0.0303, "step": 2910 }, { "grad_norm": 0.4416669011116028, "learning_rate": 5.634867466014932e-05, "loss": 0.0282, "step": 2920 }, { "grad_norm": 0.3962218761444092, "learning_rate": 5.607523181372234e-05, "loss": 0.0308, "step": 2930 }, { "grad_norm": 0.4772116243839264, "learning_rate": 5.5801604418256117e-05, "loss": 0.0292, "step": 2940 }, { "grad_norm": 0.40191689133644104, "learning_rate": 5.552780078580756e-05, "loss": 0.0275, "step": 2950 }, { "grad_norm": 0.4422965943813324, "learning_rate": 5.525382923378728e-05, "loss": 0.0292, "step": 2960 }, { "grad_norm": 0.4391031563282013, "learning_rate": 5.49796980847068e-05, "loss": 0.0311, "step": 2970 }, { "grad_norm": 0.4302864372730255, "learning_rate": 5.470541566592573e-05, "loss": 0.0303, "step": 2980 }, { "grad_norm": 0.4752635359764099, "learning_rate": 5.443099030939887e-05, "loss": 0.0284, "step": 2990 }, { "grad_norm": 0.42526647448539734, "learning_rate": 5.415643035142309e-05, "loss": 0.0279, "step": 3000 } ], "logging_steps": 10, "max_steps": 6000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 128, "trial_name": null, "trial_params": null }