{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 2344, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 2.1641297340393066, "eval_runtime": 2.1264, "eval_samples_per_second": 9.405, "eval_steps_per_second": 0.941, "step": 0 }, { "epoch": 0.0008535154166222128, "grad_norm": 4.247310161590576, "learning_rate": 5.681818181818182e-08, "loss": 1.0032, "step": 1 }, { "epoch": 0.008535154166222128, "grad_norm": 4.285112380981445, "learning_rate": 5.681818181818182e-07, "loss": 0.9823, "step": 10 }, { "epoch": 0.017070308332444255, "grad_norm": 3.0184619426727295, "learning_rate": 1.1363636363636364e-06, "loss": 0.9382, "step": 20 }, { "epoch": 0.02560546249866638, "grad_norm": 2.0186476707458496, "learning_rate": 1.7045454545454546e-06, "loss": 0.8583, "step": 30 }, { "epoch": 0.03414061666488851, "grad_norm": 1.175138235092163, "learning_rate": 2.2727272727272728e-06, "loss": 0.764, "step": 40 }, { "epoch": 0.04267577083111064, "grad_norm": 0.9045423269271851, "learning_rate": 2.8409090909090916e-06, "loss": 0.714, "step": 50 }, { "epoch": 0.05121092499733276, "grad_norm": 0.8305149078369141, "learning_rate": 3.409090909090909e-06, "loss": 0.6835, "step": 60 }, { "epoch": 0.05974607916355489, "grad_norm": 0.7908885478973389, "learning_rate": 3.9772727272727275e-06, "loss": 0.6549, "step": 70 }, { "epoch": 0.06828123332977702, "grad_norm": 0.8054904341697693, "learning_rate": 4.5454545454545455e-06, "loss": 0.6368, "step": 80 }, { "epoch": 0.07681638749599914, "grad_norm": 0.7891983985900879, "learning_rate": 5.113636363636364e-06, "loss": 0.631, "step": 90 }, { "epoch": 0.08535154166222128, "grad_norm": 0.7775433659553528, "learning_rate": 5.681818181818183e-06, "loss": 0.6186, "step": 100 }, { "epoch": 0.0938866958284434, "grad_norm": 0.8340434432029724, "learning_rate": 6.25e-06, "loss": 0.6028, "step": 110 }, { "epoch": 0.10242184999466553, "grad_norm": 0.7749195098876953, "learning_rate": 6.818181818181818e-06, "loss": 0.5965, "step": 120 }, { "epoch": 0.11095700416088766, "grad_norm": 0.7949061393737793, "learning_rate": 7.386363636363637e-06, "loss": 0.5837, "step": 130 }, { "epoch": 0.11949215832710978, "grad_norm": 0.8714171051979065, "learning_rate": 7.954545454545455e-06, "loss": 0.5865, "step": 140 }, { "epoch": 0.1280273124933319, "grad_norm": 0.7890238761901855, "learning_rate": 8.522727272727273e-06, "loss": 0.5712, "step": 150 }, { "epoch": 0.13656246665955404, "grad_norm": 0.8195155262947083, "learning_rate": 9.090909090909091e-06, "loss": 0.5774, "step": 160 }, { "epoch": 0.14509762082577615, "grad_norm": 0.8219712972640991, "learning_rate": 9.65909090909091e-06, "loss": 0.5767, "step": 170 }, { "epoch": 0.1536327749919983, "grad_norm": 0.8124852776527405, "learning_rate": 1.0227272727272729e-05, "loss": 0.5643, "step": 180 }, { "epoch": 0.16216792915822043, "grad_norm": 0.8383850455284119, "learning_rate": 1.0795454545454547e-05, "loss": 0.5734, "step": 190 }, { "epoch": 0.17070308332444256, "grad_norm": 0.7605228424072266, "learning_rate": 1.1363636363636366e-05, "loss": 0.5661, "step": 200 }, { "epoch": 0.17923823749066467, "grad_norm": 0.8585752844810486, "learning_rate": 1.1931818181818183e-05, "loss": 0.5534, "step": 210 }, { "epoch": 0.1877733916568868, "grad_norm": 0.8643070459365845, "learning_rate": 1.25e-05, "loss": 0.5549, "step": 220 }, { "epoch": 0.19630854582310894, "grad_norm": 0.9826616644859314, "learning_rate": 1.306818181818182e-05, "loss": 0.5531, "step": 230 }, { "epoch": 0.20484369998933105, "grad_norm": 0.8799106478691101, "learning_rate": 1.3636363636363637e-05, "loss": 0.5371, "step": 240 }, { "epoch": 0.2133788541555532, "grad_norm": 0.757698118686676, "learning_rate": 1.4204545454545455e-05, "loss": 0.5459, "step": 250 }, { "epoch": 0.22191400832177532, "grad_norm": 0.8704412579536438, "learning_rate": 1.4772727272727274e-05, "loss": 0.5369, "step": 260 }, { "epoch": 0.23044916248799743, "grad_norm": 0.7941352725028992, "learning_rate": 1.5340909090909094e-05, "loss": 0.5359, "step": 270 }, { "epoch": 0.23898431665421957, "grad_norm": 0.7990615367889404, "learning_rate": 1.590909090909091e-05, "loss": 0.5285, "step": 280 }, { "epoch": 0.2475194708204417, "grad_norm": 0.7938647270202637, "learning_rate": 1.647727272727273e-05, "loss": 0.5391, "step": 290 }, { "epoch": 0.2560546249866638, "grad_norm": 0.7677845358848572, "learning_rate": 1.7045454545454546e-05, "loss": 0.5195, "step": 300 }, { "epoch": 0.26458977915288595, "grad_norm": 0.7977807521820068, "learning_rate": 1.7613636363636366e-05, "loss": 0.5246, "step": 310 }, { "epoch": 0.2731249333191081, "grad_norm": 0.819622814655304, "learning_rate": 1.8181818181818182e-05, "loss": 0.5176, "step": 320 }, { "epoch": 0.2816600874853302, "grad_norm": 0.8428648114204407, "learning_rate": 1.8750000000000002e-05, "loss": 0.513, "step": 330 }, { "epoch": 0.2901952416515523, "grad_norm": 0.7542017102241516, "learning_rate": 1.931818181818182e-05, "loss": 0.5196, "step": 340 }, { "epoch": 0.29873039581777444, "grad_norm": 0.8601102232933044, "learning_rate": 1.9886363636363638e-05, "loss": 0.5206, "step": 350 }, { "epoch": 0.3072655499839966, "grad_norm": 0.7532691955566406, "learning_rate": 1.9999683918961086e-05, "loss": 0.5162, "step": 360 }, { "epoch": 0.3158007041502187, "grad_norm": 0.7489638924598694, "learning_rate": 1.999839987398595e-05, "loss": 0.5122, "step": 370 }, { "epoch": 0.32433585831644085, "grad_norm": 0.7671491503715515, "learning_rate": 1.9996128236743682e-05, "loss": 0.5161, "step": 380 }, { "epoch": 0.332871012482663, "grad_norm": 0.7274801731109619, "learning_rate": 1.9992869231615323e-05, "loss": 0.5167, "step": 390 }, { "epoch": 0.3414061666488851, "grad_norm": 0.7289360165596008, "learning_rate": 1.9988623180509206e-05, "loss": 0.5127, "step": 400 }, { "epoch": 0.3499413208151072, "grad_norm": 0.7416843175888062, "learning_rate": 1.9983390502829168e-05, "loss": 0.5208, "step": 410 }, { "epoch": 0.35847647498132934, "grad_norm": 0.7530054450035095, "learning_rate": 1.997717171543311e-05, "loss": 0.5011, "step": 420 }, { "epoch": 0.3670116291475515, "grad_norm": 0.7372049689292908, "learning_rate": 1.9969967432581962e-05, "loss": 0.5091, "step": 430 }, { "epoch": 0.3755467833137736, "grad_norm": 0.8357003927230835, "learning_rate": 1.996177836587899e-05, "loss": 0.5114, "step": 440 }, { "epoch": 0.38408193747999575, "grad_norm": 0.8466181755065918, "learning_rate": 1.9952605324199516e-05, "loss": 0.5009, "step": 450 }, { "epoch": 0.3926170916462179, "grad_norm": 0.7260451316833496, "learning_rate": 1.9942449213611028e-05, "loss": 0.5087, "step": 460 }, { "epoch": 0.40115224581243997, "grad_norm": 0.6815033555030823, "learning_rate": 1.9931311037283673e-05, "loss": 0.5033, "step": 470 }, { "epoch": 0.4096873999786621, "grad_norm": 0.706380307674408, "learning_rate": 1.9919191895391176e-05, "loss": 0.4974, "step": 480 }, { "epoch": 0.41822255414488424, "grad_norm": 0.736223042011261, "learning_rate": 1.9906092985002163e-05, "loss": 0.4981, "step": 490 }, { "epoch": 0.4267577083111064, "grad_norm": 0.9467535614967346, "learning_rate": 1.9892015599961927e-05, "loss": 0.5017, "step": 500 }, { "epoch": 0.4352928624773285, "grad_norm": 0.7372705936431885, "learning_rate": 1.9876961130764624e-05, "loss": 0.5047, "step": 510 }, { "epoch": 0.44382801664355065, "grad_norm": 0.7471463680267334, "learning_rate": 1.9860931064415934e-05, "loss": 0.5009, "step": 520 }, { "epoch": 0.45236317080977273, "grad_norm": 0.6775240898132324, "learning_rate": 1.9843926984286165e-05, "loss": 0.5045, "step": 530 }, { "epoch": 0.46089832497599487, "grad_norm": 0.6793776750564575, "learning_rate": 1.9825950569953884e-05, "loss": 0.4978, "step": 540 }, { "epoch": 0.469433479142217, "grad_norm": 0.6851901412010193, "learning_rate": 1.980700359703999e-05, "loss": 0.494, "step": 550 }, { "epoch": 0.47796863330843914, "grad_norm": 0.6855452060699463, "learning_rate": 1.9787087937032333e-05, "loss": 0.4952, "step": 560 }, { "epoch": 0.4865037874746613, "grad_norm": 0.707453727722168, "learning_rate": 1.976620555710087e-05, "loss": 0.4949, "step": 570 }, { "epoch": 0.4950389416408834, "grad_norm": 0.6674401760101318, "learning_rate": 1.9744358519903343e-05, "loss": 0.4863, "step": 580 }, { "epoch": 0.5035740958071055, "grad_norm": 0.7680632472038269, "learning_rate": 1.9721548983381554e-05, "loss": 0.4882, "step": 590 }, { "epoch": 0.5121092499733276, "grad_norm": 0.6789171099662781, "learning_rate": 1.9697779200548202e-05, "loss": 0.4848, "step": 600 }, { "epoch": 0.5206444041395498, "grad_norm": 0.6551903486251831, "learning_rate": 1.9673051519264342e-05, "loss": 0.4951, "step": 610 }, { "epoch": 0.5291795583057719, "grad_norm": 0.7067411541938782, "learning_rate": 1.964736838200749e-05, "loss": 0.493, "step": 620 }, { "epoch": 0.537714712471994, "grad_norm": 0.6695910692214966, "learning_rate": 1.9620732325630342e-05, "loss": 0.4938, "step": 630 }, { "epoch": 0.5462498666382162, "grad_norm": 0.6625940203666687, "learning_rate": 1.9593145981110223e-05, "loss": 0.4873, "step": 640 }, { "epoch": 0.5547850208044383, "grad_norm": 0.7263137102127075, "learning_rate": 1.9564612073289192e-05, "loss": 0.4964, "step": 650 }, { "epoch": 0.5633201749706604, "grad_norm": 0.7034249901771545, "learning_rate": 1.9535133420604905e-05, "loss": 0.4952, "step": 660 }, { "epoch": 0.5718553291368825, "grad_norm": 0.633065402507782, "learning_rate": 1.9504712934812228e-05, "loss": 0.4982, "step": 670 }, { "epoch": 0.5803904833031046, "grad_norm": 0.7597299814224243, "learning_rate": 1.9473353620695614e-05, "loss": 0.4839, "step": 680 }, { "epoch": 0.5889256374693268, "grad_norm": 0.6461811065673828, "learning_rate": 1.9441058575772317e-05, "loss": 0.4853, "step": 690 }, { "epoch": 0.5974607916355489, "grad_norm": 0.6678339242935181, "learning_rate": 1.940783098998643e-05, "loss": 0.4814, "step": 700 }, { "epoch": 0.6059959458017711, "grad_norm": 0.6317865252494812, "learning_rate": 1.9373674145393804e-05, "loss": 0.4896, "step": 710 }, { "epoch": 0.6145310999679932, "grad_norm": 0.6536778211593628, "learning_rate": 1.9338591415837856e-05, "loss": 0.4795, "step": 720 }, { "epoch": 0.6230662541342153, "grad_norm": 0.653390645980835, "learning_rate": 1.9302586266616318e-05, "loss": 0.4862, "step": 730 }, { "epoch": 0.6316014083004374, "grad_norm": 0.6988620758056641, "learning_rate": 1.9265662254138958e-05, "loss": 0.4913, "step": 740 }, { "epoch": 0.6401365624666595, "grad_norm": 0.6603461503982544, "learning_rate": 1.922782302557628e-05, "loss": 0.4813, "step": 750 }, { "epoch": 0.6486717166328817, "grad_norm": 0.7291558980941772, "learning_rate": 1.918907231849931e-05, "loss": 0.4843, "step": 760 }, { "epoch": 0.6572068707991038, "grad_norm": 0.678535521030426, "learning_rate": 1.914941396051036e-05, "loss": 0.4819, "step": 770 }, { "epoch": 0.665742024965326, "grad_norm": 0.7419494390487671, "learning_rate": 1.910885186886502e-05, "loss": 0.4759, "step": 780 }, { "epoch": 0.674277179131548, "grad_norm": 0.7254301309585571, "learning_rate": 1.9067390050085183e-05, "loss": 0.4754, "step": 790 }, { "epoch": 0.6828123332977702, "grad_norm": 0.6892661452293396, "learning_rate": 1.902503259956333e-05, "loss": 0.4759, "step": 800 }, { "epoch": 0.6913474874639923, "grad_norm": 0.6353612542152405, "learning_rate": 1.8981783701157985e-05, "loss": 0.4787, "step": 810 }, { "epoch": 0.6998826416302144, "grad_norm": 0.578580915927887, "learning_rate": 1.8937647626780473e-05, "loss": 0.4748, "step": 820 }, { "epoch": 0.7084177957964366, "grad_norm": 0.6349884867668152, "learning_rate": 1.889262873597295e-05, "loss": 0.4817, "step": 830 }, { "epoch": 0.7169529499626587, "grad_norm": 0.6372865438461304, "learning_rate": 1.8846731475477796e-05, "loss": 0.4811, "step": 840 }, { "epoch": 0.7254881041288809, "grad_norm": 0.70332932472229, "learning_rate": 1.8799960378798382e-05, "loss": 0.4854, "step": 850 }, { "epoch": 0.734023258295103, "grad_norm": 0.6345937252044678, "learning_rate": 1.8752320065751276e-05, "loss": 0.4804, "step": 860 }, { "epoch": 0.742558412461325, "grad_norm": 0.7001857757568359, "learning_rate": 1.8703815242009927e-05, "loss": 0.4823, "step": 870 }, { "epoch": 0.7510935666275472, "grad_norm": 0.6997294425964355, "learning_rate": 1.8654450698639845e-05, "loss": 0.4772, "step": 880 }, { "epoch": 0.7596287207937693, "grad_norm": 0.6218723058700562, "learning_rate": 1.860423131162538e-05, "loss": 0.4741, "step": 890 }, { "epoch": 0.7681638749599915, "grad_norm": 0.6111359000205994, "learning_rate": 1.8553162041388096e-05, "loss": 0.4724, "step": 900 }, { "epoch": 0.7766990291262136, "grad_norm": 0.6032017469406128, "learning_rate": 1.8501247932296785e-05, "loss": 0.4769, "step": 910 }, { "epoch": 0.7852341832924358, "grad_norm": 0.6396908164024353, "learning_rate": 1.8448494112169234e-05, "loss": 0.4747, "step": 920 }, { "epoch": 0.7937693374586579, "grad_norm": 0.690779983997345, "learning_rate": 1.8394905791765714e-05, "loss": 0.4719, "step": 930 }, { "epoch": 0.8023044916248799, "grad_norm": 0.6640828251838684, "learning_rate": 1.8340488264274285e-05, "loss": 0.477, "step": 940 }, { "epoch": 0.8108396457911021, "grad_norm": 0.6492927074432373, "learning_rate": 1.8285246904787968e-05, "loss": 0.4626, "step": 950 }, { "epoch": 0.8193747999573242, "grad_norm": 0.6751061677932739, "learning_rate": 1.8229187169773805e-05, "loss": 0.4698, "step": 960 }, { "epoch": 0.8279099541235464, "grad_norm": 0.6475481390953064, "learning_rate": 1.8172314596533914e-05, "loss": 0.4698, "step": 970 }, { "epoch": 0.8364451082897685, "grad_norm": 0.6659871935844421, "learning_rate": 1.8114634802658542e-05, "loss": 0.4778, "step": 980 }, { "epoch": 0.8449802624559906, "grad_norm": 0.6479447484016418, "learning_rate": 1.8056153485471167e-05, "loss": 0.4703, "step": 990 }, { "epoch": 0.8535154166222128, "grad_norm": 0.7254726886749268, "learning_rate": 1.7996876421465764e-05, "loss": 0.4767, "step": 1000 }, { "epoch": 0.8620505707884348, "grad_norm": 0.6294846534729004, "learning_rate": 1.7936809465736223e-05, "loss": 0.4615, "step": 1010 }, { "epoch": 0.870585724954657, "grad_norm": 0.6530686020851135, "learning_rate": 1.7875958551398023e-05, "loss": 0.4642, "step": 1020 }, { "epoch": 0.8791208791208791, "grad_norm": 0.713152289390564, "learning_rate": 1.781432968900217e-05, "loss": 0.4692, "step": 1030 }, { "epoch": 0.8876560332871013, "grad_norm": 0.7651381492614746, "learning_rate": 1.775192896594151e-05, "loss": 0.4646, "step": 1040 }, { "epoch": 0.8961911874533234, "grad_norm": 0.5913043022155762, "learning_rate": 1.7688762545849466e-05, "loss": 0.4688, "step": 1050 }, { "epoch": 0.9047263416195455, "grad_norm": 0.6034150719642639, "learning_rate": 1.7624836667991195e-05, "loss": 0.4574, "step": 1060 }, { "epoch": 0.9132614957857677, "grad_norm": 0.6374826431274414, "learning_rate": 1.7560157646647335e-05, "loss": 0.4651, "step": 1070 }, { "epoch": 0.9217966499519897, "grad_norm": 0.6261687278747559, "learning_rate": 1.749473187049028e-05, "loss": 0.4608, "step": 1080 }, { "epoch": 0.9303318041182119, "grad_norm": 0.6067594885826111, "learning_rate": 1.742856580195316e-05, "loss": 0.4592, "step": 1090 }, { "epoch": 0.938866958284434, "grad_norm": 0.6240684986114502, "learning_rate": 1.7361665976591513e-05, "loss": 0.4663, "step": 1100 }, { "epoch": 0.9474021124506561, "grad_norm": 0.6192986965179443, "learning_rate": 1.7294039002437724e-05, "loss": 0.476, "step": 1110 }, { "epoch": 0.9559372666168783, "grad_norm": 0.6121706962585449, "learning_rate": 1.7225691559348333e-05, "loss": 0.4531, "step": 1120 }, { "epoch": 0.9644724207831004, "grad_norm": 0.5956526398658752, "learning_rate": 1.715663039834421e-05, "loss": 0.4643, "step": 1130 }, { "epoch": 0.9730075749493226, "grad_norm": 0.5699427127838135, "learning_rate": 1.7086862340943745e-05, "loss": 0.4601, "step": 1140 }, { "epoch": 0.9815427291155446, "grad_norm": 0.6840018630027771, "learning_rate": 1.701639427848903e-05, "loss": 0.4555, "step": 1150 }, { "epoch": 0.9900778832817668, "grad_norm": 0.6009624004364014, "learning_rate": 1.6945233171465193e-05, "loss": 0.4632, "step": 1160 }, { "epoch": 0.9986130374479889, "grad_norm": 0.5962963104248047, "learning_rate": 1.6873386048812854e-05, "loss": 0.4702, "step": 1170 }, { "epoch": 1.0068281233329777, "grad_norm": 0.679568350315094, "learning_rate": 1.680086000723385e-05, "loss": 0.4318, "step": 1180 }, { "epoch": 1.0153632774991999, "grad_norm": 0.6293564438819885, "learning_rate": 1.672766221049025e-05, "loss": 0.4231, "step": 1190 }, { "epoch": 1.023898431665422, "grad_norm": 0.6619155406951904, "learning_rate": 1.6653799888696777e-05, "loss": 0.4251, "step": 1200 }, { "epoch": 1.032433585831644, "grad_norm": 0.6258383393287659, "learning_rate": 1.6579280337606615e-05, "loss": 0.4237, "step": 1210 }, { "epoch": 1.0409687399978662, "grad_norm": 0.6029568314552307, "learning_rate": 1.650411091789082e-05, "loss": 0.42, "step": 1220 }, { "epoch": 1.0495038941640884, "grad_norm": 0.5933378338813782, "learning_rate": 1.6428299054411212e-05, "loss": 0.4325, "step": 1230 }, { "epoch": 1.0580390483303104, "grad_norm": 0.593717098236084, "learning_rate": 1.635185223548704e-05, "loss": 0.4289, "step": 1240 }, { "epoch": 1.0665742024965326, "grad_norm": 0.699511706829071, "learning_rate": 1.627477801215528e-05, "loss": 0.4218, "step": 1250 }, { "epoch": 1.0751093566627548, "grad_norm": 0.6585178971290588, "learning_rate": 1.619708399742481e-05, "loss": 0.4222, "step": 1260 }, { "epoch": 1.083644510828977, "grad_norm": 0.6042758822441101, "learning_rate": 1.6118777865524414e-05, "loss": 0.4231, "step": 1270 }, { "epoch": 1.092179664995199, "grad_norm": 0.5816319584846497, "learning_rate": 1.6039867351144778e-05, "loss": 0.4199, "step": 1280 }, { "epoch": 1.100714819161421, "grad_norm": 0.7652620673179626, "learning_rate": 1.5960360248674478e-05, "loss": 0.428, "step": 1290 }, { "epoch": 1.1092499733276433, "grad_norm": 0.6318002343177795, "learning_rate": 1.5880264411430106e-05, "loss": 0.4263, "step": 1300 }, { "epoch": 1.1177851274938653, "grad_norm": 0.5835772752761841, "learning_rate": 1.579958775088054e-05, "loss": 0.4277, "step": 1310 }, { "epoch": 1.1263202816600875, "grad_norm": 0.6293865442276001, "learning_rate": 1.5718338235865505e-05, "loss": 0.4226, "step": 1320 }, { "epoch": 1.1348554358263097, "grad_norm": 0.6008028984069824, "learning_rate": 1.5636523891808452e-05, "loss": 0.4247, "step": 1330 }, { "epoch": 1.1433905899925318, "grad_norm": 0.6461915373802185, "learning_rate": 1.5554152799923824e-05, "loss": 0.429, "step": 1340 }, { "epoch": 1.1519257441587538, "grad_norm": 0.606767475605011, "learning_rate": 1.547123309641885e-05, "loss": 0.4268, "step": 1350 }, { "epoch": 1.160460898324976, "grad_norm": 0.637474000453949, "learning_rate": 1.538777297168991e-05, "loss": 0.4378, "step": 1360 }, { "epoch": 1.1689960524911982, "grad_norm": 0.6810766458511353, "learning_rate": 1.5303780669513472e-05, "loss": 0.42, "step": 1370 }, { "epoch": 1.1775312066574202, "grad_norm": 0.6600694060325623, "learning_rate": 1.5219264486231882e-05, "loss": 0.4181, "step": 1380 }, { "epoch": 1.1860663608236424, "grad_norm": 0.7521950006484985, "learning_rate": 1.5134232769933836e-05, "loss": 0.4326, "step": 1390 }, { "epoch": 1.1946015149898646, "grad_norm": 0.6155983805656433, "learning_rate": 1.5048693919629837e-05, "loss": 0.4263, "step": 1400 }, { "epoch": 1.2031366691560867, "grad_norm": 0.6304906606674194, "learning_rate": 1.4962656384422555e-05, "loss": 0.4321, "step": 1410 }, { "epoch": 1.2116718233223087, "grad_norm": 0.5815775990486145, "learning_rate": 1.4876128662672277e-05, "loss": 0.4248, "step": 1420 }, { "epoch": 1.220206977488531, "grad_norm": 0.619872510433197, "learning_rate": 1.4789119301157491e-05, "loss": 0.4192, "step": 1430 }, { "epoch": 1.228742131654753, "grad_norm": 0.6162813305854797, "learning_rate": 1.4701636894230655e-05, "loss": 0.4199, "step": 1440 }, { "epoch": 1.237277285820975, "grad_norm": 0.6389777064323425, "learning_rate": 1.4613690082969311e-05, "loss": 0.4239, "step": 1450 }, { "epoch": 1.2458124399871973, "grad_norm": 0.6258131265640259, "learning_rate": 1.4525287554322558e-05, "loss": 0.4229, "step": 1460 }, { "epoch": 1.2543475941534195, "grad_norm": 0.678900957107544, "learning_rate": 1.4436438040252983e-05, "loss": 0.431, "step": 1470 }, { "epoch": 1.2628827483196416, "grad_norm": 0.564929723739624, "learning_rate": 1.4347150316874179e-05, "loss": 0.4178, "step": 1480 }, { "epoch": 1.2714179024858636, "grad_norm": 0.5994879603385925, "learning_rate": 1.4257433203583876e-05, "loss": 0.4271, "step": 1490 }, { "epoch": 1.2799530566520858, "grad_norm": 0.5823336839675903, "learning_rate": 1.4167295562192808e-05, "loss": 0.4246, "step": 1500 }, { "epoch": 1.2884882108183078, "grad_norm": 0.624218225479126, "learning_rate": 1.4076746296049387e-05, "loss": 0.4278, "step": 1510 }, { "epoch": 1.29702336498453, "grad_norm": 0.6522708535194397, "learning_rate": 1.3985794349160267e-05, "loss": 0.4225, "step": 1520 }, { "epoch": 1.3055585191507522, "grad_norm": 0.5826681852340698, "learning_rate": 1.3894448705306908e-05, "loss": 0.4195, "step": 1530 }, { "epoch": 1.3140936733169744, "grad_norm": 0.6364330053329468, "learning_rate": 1.3802718387158208e-05, "loss": 0.4167, "step": 1540 }, { "epoch": 1.3226288274831965, "grad_norm": 0.6075431704521179, "learning_rate": 1.3710612455379268e-05, "loss": 0.4216, "step": 1550 }, { "epoch": 1.3311639816494185, "grad_norm": 0.6225044131278992, "learning_rate": 1.3618140007736442e-05, "loss": 0.4181, "step": 1560 }, { "epoch": 1.3396991358156407, "grad_norm": 0.5846056938171387, "learning_rate": 1.3525310178198707e-05, "loss": 0.4264, "step": 1570 }, { "epoch": 1.3482342899818627, "grad_norm": 0.6489505171775818, "learning_rate": 1.3432132136035443e-05, "loss": 0.4267, "step": 1580 }, { "epoch": 1.3567694441480849, "grad_norm": 0.7888132929801941, "learning_rate": 1.3338615084910737e-05, "loss": 0.4173, "step": 1590 }, { "epoch": 1.365304598314307, "grad_norm": 0.6617109775543213, "learning_rate": 1.3244768261974307e-05, "loss": 0.422, "step": 1600 }, { "epoch": 1.3738397524805293, "grad_norm": 0.5967974066734314, "learning_rate": 1.3150600936949092e-05, "loss": 0.422, "step": 1610 }, { "epoch": 1.3823749066467514, "grad_norm": 0.6307809352874756, "learning_rate": 1.305612241121562e-05, "loss": 0.422, "step": 1620 }, { "epoch": 1.3909100608129734, "grad_norm": 0.5797590017318726, "learning_rate": 1.2961342016893302e-05, "loss": 0.4168, "step": 1630 }, { "epoch": 1.3994452149791956, "grad_norm": 0.5861985087394714, "learning_rate": 1.2866269115918606e-05, "loss": 0.4289, "step": 1640 }, { "epoch": 1.4079803691454176, "grad_norm": 0.6140124797821045, "learning_rate": 1.2770913099120374e-05, "loss": 0.4294, "step": 1650 }, { "epoch": 1.4165155233116398, "grad_norm": 0.5719994306564331, "learning_rate": 1.2675283385292212e-05, "loss": 0.4254, "step": 1660 }, { "epoch": 1.425050677477862, "grad_norm": 0.5942404866218567, "learning_rate": 1.2579389420262151e-05, "loss": 0.4212, "step": 1670 }, { "epoch": 1.4335858316440842, "grad_norm": 0.6488329172134399, "learning_rate": 1.248324067595966e-05, "loss": 0.4173, "step": 1680 }, { "epoch": 1.4421209858103061, "grad_norm": 0.5882269144058228, "learning_rate": 1.2386846649480036e-05, "loss": 0.4206, "step": 1690 }, { "epoch": 1.4506561399765283, "grad_norm": 0.6740261316299438, "learning_rate": 1.2290216862146309e-05, "loss": 0.4157, "step": 1700 }, { "epoch": 1.4591912941427505, "grad_norm": 0.6225579977035522, "learning_rate": 1.2193360858568824e-05, "loss": 0.4165, "step": 1710 }, { "epoch": 1.4677264483089725, "grad_norm": 0.6018004417419434, "learning_rate": 1.2096288205702431e-05, "loss": 0.4225, "step": 1720 }, { "epoch": 1.4762616024751947, "grad_norm": 0.6020993590354919, "learning_rate": 1.1999008491901511e-05, "loss": 0.4247, "step": 1730 }, { "epoch": 1.4847967566414169, "grad_norm": 0.6166216731071472, "learning_rate": 1.1901531325972911e-05, "loss": 0.4227, "step": 1740 }, { "epoch": 1.493331910807639, "grad_norm": 0.5822474956512451, "learning_rate": 1.180386633622681e-05, "loss": 0.4132, "step": 1750 }, { "epoch": 1.5018670649738612, "grad_norm": 0.6379313468933105, "learning_rate": 1.1706023169525691e-05, "loss": 0.4213, "step": 1760 }, { "epoch": 1.5104022191400832, "grad_norm": 0.5989879965782166, "learning_rate": 1.160801149033147e-05, "loss": 0.4222, "step": 1770 }, { "epoch": 1.5189373733063052, "grad_norm": 0.6491897702217102, "learning_rate": 1.1509840979750895e-05, "loss": 0.4186, "step": 1780 }, { "epoch": 1.5274725274725274, "grad_norm": 0.5892138481140137, "learning_rate": 1.1411521334579288e-05, "loss": 0.4215, "step": 1790 }, { "epoch": 1.5360076816387496, "grad_norm": 0.5960173606872559, "learning_rate": 1.131306226634274e-05, "loss": 0.418, "step": 1800 }, { "epoch": 1.5445428358049718, "grad_norm": 0.5612832307815552, "learning_rate": 1.1214473500338862e-05, "loss": 0.4203, "step": 1810 }, { "epoch": 1.553077989971194, "grad_norm": 0.5860463380813599, "learning_rate": 1.1115764774676172e-05, "loss": 0.422, "step": 1820 }, { "epoch": 1.5616131441374161, "grad_norm": 0.6095026135444641, "learning_rate": 1.1016945839312202e-05, "loss": 0.4153, "step": 1830 }, { "epoch": 1.5701482983036381, "grad_norm": 0.5922375321388245, "learning_rate": 1.0918026455090447e-05, "loss": 0.4195, "step": 1840 }, { "epoch": 1.57868345246986, "grad_norm": 0.5867443084716797, "learning_rate": 1.0819016392776245e-05, "loss": 0.4169, "step": 1850 }, { "epoch": 1.5872186066360823, "grad_norm": 0.6018590927124023, "learning_rate": 1.0719925432091671e-05, "loss": 0.4111, "step": 1860 }, { "epoch": 1.5957537608023045, "grad_norm": 0.5756944417953491, "learning_rate": 1.0620763360749534e-05, "loss": 0.4189, "step": 1870 }, { "epoch": 1.6042889149685267, "grad_norm": 0.6248449683189392, "learning_rate": 1.0521539973486592e-05, "loss": 0.4183, "step": 1880 }, { "epoch": 1.6128240691347489, "grad_norm": 0.6250705122947693, "learning_rate": 1.0422265071096101e-05, "loss": 0.4233, "step": 1890 }, { "epoch": 1.6213592233009708, "grad_norm": 0.5823164582252502, "learning_rate": 1.0322948459459716e-05, "loss": 0.4143, "step": 1900 }, { "epoch": 1.629894377467193, "grad_norm": 0.574417769908905, "learning_rate": 1.0223599948578923e-05, "loss": 0.423, "step": 1910 }, { "epoch": 1.638429531633415, "grad_norm": 0.547025203704834, "learning_rate": 1.0124229351606065e-05, "loss": 0.4164, "step": 1920 }, { "epoch": 1.6469646857996372, "grad_norm": 0.5772645473480225, "learning_rate": 1.002484648387503e-05, "loss": 0.4116, "step": 1930 }, { "epoch": 1.6554998399658594, "grad_norm": 0.5808480381965637, "learning_rate": 9.925461161931758e-06, "loss": 0.4147, "step": 1940 }, { "epoch": 1.6640349941320816, "grad_norm": 0.5725694894790649, "learning_rate": 9.826083202564596e-06, "loss": 0.4036, "step": 1950 }, { "epoch": 1.6725701482983037, "grad_norm": 0.563442587852478, "learning_rate": 9.726722421834664e-06, "loss": 0.421, "step": 1960 }, { "epoch": 1.6811053024645257, "grad_norm": 0.5920106172561646, "learning_rate": 9.627388634106245e-06, "loss": 0.4214, "step": 1970 }, { "epoch": 1.689640456630748, "grad_norm": 0.5918905138969421, "learning_rate": 9.528091651077404e-06, "loss": 0.417, "step": 1980 }, { "epoch": 1.6981756107969699, "grad_norm": 0.576627790927887, "learning_rate": 9.428841280810811e-06, "loss": 0.4151, "step": 1990 }, { "epoch": 1.706710764963192, "grad_norm": 0.5786689519882202, "learning_rate": 9.329647326764963e-06, "loss": 0.4076, "step": 2000 }, { "epoch": 1.7152459191294143, "grad_norm": 0.621134340763092, "learning_rate": 9.23051958682584e-06, "loss": 0.4165, "step": 2010 }, { "epoch": 1.7237810732956365, "grad_norm": 0.5719323754310608, "learning_rate": 9.131467852339123e-06, "loss": 0.4156, "step": 2020 }, { "epoch": 1.7323162274618586, "grad_norm": 0.5678684711456299, "learning_rate": 9.032501907143053e-06, "loss": 0.4191, "step": 2030 }, { "epoch": 1.7408513816280806, "grad_norm": 0.5779105424880981, "learning_rate": 8.933631526602028e-06, "loss": 0.4046, "step": 2040 }, { "epoch": 1.7493865357943028, "grad_norm": 0.5759092569351196, "learning_rate": 8.834866476641048e-06, "loss": 0.4143, "step": 2050 }, { "epoch": 1.7579216899605248, "grad_norm": 0.5681161284446716, "learning_rate": 8.73621651278108e-06, "loss": 0.424, "step": 2060 }, { "epoch": 1.766456844126747, "grad_norm": 0.614236056804657, "learning_rate": 8.637691379175453e-06, "loss": 0.4148, "step": 2070 }, { "epoch": 1.7749919982929692, "grad_norm": 0.5703302621841431, "learning_rate": 8.539300807647396e-06, "loss": 0.4179, "step": 2080 }, { "epoch": 1.7835271524591914, "grad_norm": 0.6135383248329163, "learning_rate": 8.441054516728747e-06, "loss": 0.4173, "step": 2090 }, { "epoch": 1.7920623066254135, "grad_norm": 0.5604820847511292, "learning_rate": 8.342962210700043e-06, "loss": 0.4167, "step": 2100 }, { "epoch": 1.8005974607916355, "grad_norm": 0.5805196166038513, "learning_rate": 8.245033578631939e-06, "loss": 0.4232, "step": 2110 }, { "epoch": 1.8091326149578577, "grad_norm": 0.5662888884544373, "learning_rate": 8.147278293428199e-06, "loss": 0.4066, "step": 2120 }, { "epoch": 1.8176677691240797, "grad_norm": 0.581598162651062, "learning_rate": 8.049706010870241e-06, "loss": 0.416, "step": 2130 }, { "epoch": 1.8262029232903019, "grad_norm": 0.5490183234214783, "learning_rate": 7.952326368663404e-06, "loss": 0.4088, "step": 2140 }, { "epoch": 1.834738077456524, "grad_norm": 0.6272237300872803, "learning_rate": 7.855148985484946e-06, "loss": 0.4099, "step": 2150 }, { "epoch": 1.8432732316227463, "grad_norm": 0.6257729530334473, "learning_rate": 7.758183460034005e-06, "loss": 0.4068, "step": 2160 }, { "epoch": 1.8518083857889684, "grad_norm": 0.6253445744514465, "learning_rate": 7.661439370083456e-06, "loss": 0.4137, "step": 2170 }, { "epoch": 1.8603435399551904, "grad_norm": 0.6083528995513916, "learning_rate": 7.564926271533876e-06, "loss": 0.4112, "step": 2180 }, { "epoch": 1.8688786941214126, "grad_norm": 0.598172128200531, "learning_rate": 7.468653697469655e-06, "loss": 0.4146, "step": 2190 }, { "epoch": 1.8774138482876346, "grad_norm": 0.5930103659629822, "learning_rate": 7.372631157217377e-06, "loss": 0.4093, "step": 2200 }, { "epoch": 1.8859490024538568, "grad_norm": 0.5641002655029297, "learning_rate": 7.276868135406523e-06, "loss": 0.4162, "step": 2210 }, { "epoch": 1.894484156620079, "grad_norm": 0.5636462569236755, "learning_rate": 7.181374091032635e-06, "loss": 0.408, "step": 2220 }, { "epoch": 1.9030193107863012, "grad_norm": 0.6011704802513123, "learning_rate": 7.0861584565229955e-06, "loss": 0.4119, "step": 2230 }, { "epoch": 1.9115544649525233, "grad_norm": 0.5807620882987976, "learning_rate": 6.991230636804953e-06, "loss": 0.4056, "step": 2240 }, { "epoch": 1.9200896191187453, "grad_norm": 0.6091267466545105, "learning_rate": 6.896600008376926e-06, "loss": 0.4095, "step": 2250 }, { "epoch": 1.9286247732849675, "grad_norm": 0.5541015267372131, "learning_rate": 6.802275918382266e-06, "loss": 0.4085, "step": 2260 }, { "epoch": 1.9371599274511895, "grad_norm": 0.5683854222297668, "learning_rate": 6.708267683685978e-06, "loss": 0.4098, "step": 2270 }, { "epoch": 1.9456950816174117, "grad_norm": 0.6314034461975098, "learning_rate": 6.614584589954447e-06, "loss": 0.4135, "step": 2280 }, { "epoch": 1.9542302357836339, "grad_norm": 0.5882005095481873, "learning_rate": 6.521235890738251e-06, "loss": 0.4102, "step": 2290 }, { "epoch": 1.962765389949856, "grad_norm": 0.5736626386642456, "learning_rate": 6.42823080655814e-06, "loss": 0.4037, "step": 2300 }, { "epoch": 1.9713005441160782, "grad_norm": 0.5581937432289124, "learning_rate": 6.33557852399428e-06, "loss": 0.4155, "step": 2310 }, { "epoch": 1.9798356982823002, "grad_norm": 0.5713927745819092, "learning_rate": 6.243288194778837e-06, "loss": 0.4058, "step": 2320 }, { "epoch": 1.9883708524485222, "grad_norm": 0.6096407771110535, "learning_rate": 6.151368934892028e-06, "loss": 0.4096, "step": 2330 }, { "epoch": 1.9969060066147444, "grad_norm": 0.5943323373794556, "learning_rate": 6.059829823661692e-06, "loss": 0.413, "step": 2340 } ], "logging_steps": 10, "max_steps": 3513, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.459688578893742e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }