{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "grad_norm": 0.7378236055374146, "learning_rate": 1.8e-06, "loss": 1.0762, "step": 10 }, { "grad_norm": 0.295481413602829, "learning_rate": 3.8e-06, "loss": 1.068, "step": 20 }, { "grad_norm": 0.14812137186527252, "learning_rate": 5.8e-06, "loss": 1.0568, "step": 30 }, { "grad_norm": 0.08043279498815536, "learning_rate": 7.8e-06, "loss": 1.0493, "step": 40 }, { "grad_norm": 0.08855466544628143, "learning_rate": 9.800000000000001e-06, "loss": 1.0441, "step": 50 }, { "grad_norm": 0.0877288430929184, "learning_rate": 1.18e-05, "loss": 1.0395, "step": 60 }, { "grad_norm": 0.09314433485269547, "learning_rate": 1.3800000000000002e-05, "loss": 1.0285, "step": 70 }, { "grad_norm": 0.10957096517086029, "learning_rate": 1.58e-05, "loss": 1.0216, "step": 80 }, { "grad_norm": 0.09380817413330078, "learning_rate": 1.78e-05, "loss": 1.0177, "step": 90 }, { "grad_norm": 0.12308425456285477, "learning_rate": 1.9800000000000004e-05, "loss": 1.0174, "step": 100 }, { "grad_norm": 0.20681029558181763, "learning_rate": 2.18e-05, "loss": 1.0002, "step": 110 }, { "grad_norm": 0.43010827898979187, "learning_rate": 2.38e-05, "loss": 0.9756, "step": 120 }, { "grad_norm": 0.5976276993751526, "learning_rate": 2.58e-05, "loss": 0.9313, "step": 130 }, { "grad_norm": 0.7003195881843567, "learning_rate": 2.7800000000000005e-05, "loss": 0.8658, "step": 140 }, { "grad_norm": 0.9297671318054199, "learning_rate": 2.98e-05, "loss": 0.7985, "step": 150 }, { "grad_norm": 0.8993100523948669, "learning_rate": 3.18e-05, "loss": 0.7352, "step": 160 }, { "grad_norm": 0.9233132004737854, "learning_rate": 3.38e-05, "loss": 0.6818, "step": 170 }, { "grad_norm": 0.8877270221710205, "learning_rate": 3.58e-05, "loss": 0.6224, "step": 180 }, { "grad_norm": 0.8032844662666321, "learning_rate": 3.7800000000000004e-05, "loss": 0.5745, "step": 190 }, { "grad_norm": 1.4975675344467163, "learning_rate": 3.9800000000000005e-05, "loss": 0.5299, "step": 200 }, { "grad_norm": 0.8472157716751099, "learning_rate": 4.18e-05, "loss": 0.498, "step": 210 }, { "grad_norm": 0.974686324596405, "learning_rate": 4.38e-05, "loss": 0.4575, "step": 220 }, { "grad_norm": 0.9499194025993347, "learning_rate": 4.58e-05, "loss": 0.4184, "step": 230 }, { "grad_norm": 0.9380892515182495, "learning_rate": 4.78e-05, "loss": 0.3887, "step": 240 }, { "grad_norm": 1.1189147233963013, "learning_rate": 4.9800000000000004e-05, "loss": 0.356, "step": 250 }, { "grad_norm": 1.1425296068191528, "learning_rate": 5.1800000000000005e-05, "loss": 0.3323, "step": 260 }, { "grad_norm": 1.6566969156265259, "learning_rate": 5.380000000000001e-05, "loss": 0.3011, "step": 270 }, { "grad_norm": 1.130071759223938, "learning_rate": 5.580000000000001e-05, "loss": 0.266, "step": 280 }, { "grad_norm": 0.748692512512207, "learning_rate": 5.7799999999999995e-05, "loss": 0.2453, "step": 290 }, { "grad_norm": 1.0026001930236816, "learning_rate": 5.9800000000000003e-05, "loss": 0.2321, "step": 300 }, { "grad_norm": 1.0121994018554688, "learning_rate": 6.18e-05, "loss": 0.2158, "step": 310 }, { "grad_norm": 1.153480052947998, "learning_rate": 6.38e-05, "loss": 0.1899, "step": 320 }, { "grad_norm": 1.013115644454956, "learning_rate": 6.58e-05, "loss": 0.1817, "step": 330 }, { "grad_norm": 0.9551103115081787, "learning_rate": 6.780000000000001e-05, "loss": 0.1799, "step": 340 }, { "grad_norm": 1.242201566696167, "learning_rate": 6.98e-05, "loss": 0.1602, "step": 350 }, { "grad_norm": 1.2399349212646484, "learning_rate": 7.18e-05, "loss": 0.1463, "step": 360 }, { "grad_norm": 0.9403181076049805, "learning_rate": 7.38e-05, "loss": 0.1228, "step": 370 }, { "grad_norm": 0.894763171672821, "learning_rate": 7.58e-05, "loss": 0.1122, "step": 380 }, { "grad_norm": 0.9855173230171204, "learning_rate": 7.780000000000001e-05, "loss": 0.1044, "step": 390 }, { "grad_norm": 1.3088891506195068, "learning_rate": 7.98e-05, "loss": 0.0922, "step": 400 }, { "grad_norm": 1.064687967300415, "learning_rate": 8.18e-05, "loss": 0.0824, "step": 410 }, { "grad_norm": 1.112962007522583, "learning_rate": 8.38e-05, "loss": 0.077, "step": 420 }, { "grad_norm": 1.0096707344055176, "learning_rate": 8.58e-05, "loss": 0.0816, "step": 430 }, { "grad_norm": 0.956439733505249, "learning_rate": 8.78e-05, "loss": 0.0721, "step": 440 }, { "grad_norm": 0.841948926448822, "learning_rate": 8.98e-05, "loss": 0.0723, "step": 450 }, { "grad_norm": 1.0005617141723633, "learning_rate": 9.180000000000001e-05, "loss": 0.0676, "step": 460 }, { "grad_norm": 0.8577463030815125, "learning_rate": 9.38e-05, "loss": 0.0647, "step": 470 }, { "grad_norm": 0.9084122776985168, "learning_rate": 9.58e-05, "loss": 0.0677, "step": 480 }, { "grad_norm": 1.0833115577697754, "learning_rate": 9.78e-05, "loss": 0.0653, "step": 490 }, { "grad_norm": 0.8259796500205994, "learning_rate": 9.98e-05, "loss": 0.0669, "step": 500 }, { "grad_norm": 0.8890817165374756, "learning_rate": 9.9999778549206e-05, "loss": 0.0611, "step": 510 }, { "grad_norm": 0.8948380351066589, "learning_rate": 9.999901304280685e-05, "loss": 0.0622, "step": 520 }, { "grad_norm": 0.848558247089386, "learning_rate": 9.999770075521164e-05, "loss": 0.0625, "step": 530 }, { "grad_norm": 0.7437359094619751, "learning_rate": 9.99958417007713e-05, "loss": 0.0626, "step": 540 }, { "grad_norm": 0.8651896715164185, "learning_rate": 9.999343589981615e-05, "loss": 0.0531, "step": 550 }, { "grad_norm": 0.8850076198577881, "learning_rate": 9.999048337865568e-05, "loss": 0.0615, "step": 560 }, { "grad_norm": 1.15691077709198, "learning_rate": 9.998698416957815e-05, "loss": 0.0641, "step": 570 }, { "grad_norm": 0.7086659073829651, "learning_rate": 9.998293831085037e-05, "loss": 0.0596, "step": 580 }, { "grad_norm": 0.7347807288169861, "learning_rate": 9.997834584671719e-05, "loss": 0.0572, "step": 590 }, { "grad_norm": 0.7725936770439148, "learning_rate": 9.997320682740107e-05, "loss": 0.0595, "step": 600 }, { "grad_norm": 0.769038200378418, "learning_rate": 9.996752130910149e-05, "loss": 0.0549, "step": 610 }, { "grad_norm": 0.6636187434196472, "learning_rate": 9.99612893539944e-05, "loss": 0.0538, "step": 620 }, { "grad_norm": 0.7348890900611877, "learning_rate": 9.995451103023144e-05, "loss": 0.0527, "step": 630 }, { "grad_norm": 0.7872657179832458, "learning_rate": 9.994718641193928e-05, "loss": 0.0557, "step": 640 }, { "grad_norm": 0.6932393312454224, "learning_rate": 9.993931557921874e-05, "loss": 0.0548, "step": 650 }, { "grad_norm": 0.7634221911430359, "learning_rate": 9.993089861814402e-05, "loss": 0.0524, "step": 660 }, { "grad_norm": 0.7409372925758362, "learning_rate": 9.992193562076166e-05, "loss": 0.0496, "step": 670 }, { "grad_norm": 0.7612417936325073, "learning_rate": 9.991242668508954e-05, "loss": 0.0461, "step": 680 }, { "grad_norm": 0.7743764519691467, "learning_rate": 9.990237191511587e-05, "loss": 0.0435, "step": 690 }, { "grad_norm": 0.725627601146698, "learning_rate": 9.989177142079802e-05, "loss": 0.0471, "step": 700 }, { "grad_norm": 0.5516918301582336, "learning_rate": 9.988062531806126e-05, "loss": 0.0462, "step": 710 }, { "grad_norm": 0.730378270149231, "learning_rate": 9.986893372879762e-05, "loss": 0.0487, "step": 720 }, { "grad_norm": 0.6389397382736206, "learning_rate": 9.985669678086443e-05, "loss": 0.0488, "step": 730 }, { "grad_norm": 0.8831709623336792, "learning_rate": 9.984391460808298e-05, "loss": 0.0514, "step": 740 }, { "grad_norm": 0.6661105751991272, "learning_rate": 9.983058735023709e-05, "loss": 0.0514, "step": 750 }, { "grad_norm": 0.7856804728507996, "learning_rate": 9.98167151530715e-05, "loss": 0.0471, "step": 760 }, { "grad_norm": 0.7654492259025574, "learning_rate": 9.980229816829034e-05, "loss": 0.0505, "step": 770 }, { "grad_norm": 0.6101555228233337, "learning_rate": 9.978733655355544e-05, "loss": 0.047, "step": 780 }, { "grad_norm": 0.7730712890625, "learning_rate": 9.977183047248464e-05, "loss": 0.0424, "step": 790 }, { "grad_norm": 0.8450173139572144, "learning_rate": 9.975578009464992e-05, "loss": 0.0455, "step": 800 }, { "grad_norm": 0.5586540102958679, "learning_rate": 9.97391855955757e-05, "loss": 0.04, "step": 810 }, { "grad_norm": 0.8502600789070129, "learning_rate": 9.972204715673669e-05, "loss": 0.0418, "step": 820 }, { "grad_norm": 0.6090761423110962, "learning_rate": 9.970436496555617e-05, "loss": 0.042, "step": 830 }, { "grad_norm": 0.6097173690795898, "learning_rate": 9.968613921540373e-05, "loss": 0.0451, "step": 840 }, { "grad_norm": 0.765418291091919, "learning_rate": 9.966737010559326e-05, "loss": 0.0447, "step": 850 }, { "grad_norm": 0.7200655937194824, "learning_rate": 9.964805784138072e-05, "loss": 0.0439, "step": 860 }, { "grad_norm": 0.6888765692710876, "learning_rate": 9.962820263396195e-05, "loss": 0.0416, "step": 870 }, { "grad_norm": 0.5708920359611511, "learning_rate": 9.960780470047033e-05, "loss": 0.0459, "step": 880 }, { "grad_norm": 0.7507001757621765, "learning_rate": 9.958686426397437e-05, "loss": 0.0425, "step": 890 }, { "grad_norm": 0.5076937079429626, "learning_rate": 9.956538155347534e-05, "loss": 0.0455, "step": 900 }, { "grad_norm": 0.5799984335899353, "learning_rate": 9.95433568039047e-05, "loss": 0.0399, "step": 910 }, { "grad_norm": 0.6337814927101135, "learning_rate": 9.952079025612162e-05, "loss": 0.0381, "step": 920 }, { "grad_norm": 0.7000153660774231, "learning_rate": 9.949768215691022e-05, "loss": 0.0411, "step": 930 }, { "grad_norm": 0.5318272709846497, "learning_rate": 9.9474032758977e-05, "loss": 0.0401, "step": 940 }, { "grad_norm": 0.700434148311615, "learning_rate": 9.944984232094794e-05, "loss": 0.0435, "step": 950 }, { "grad_norm": 0.605954647064209, "learning_rate": 9.942511110736584e-05, "loss": 0.0411, "step": 960 }, { "grad_norm": 0.5715162754058838, "learning_rate": 9.939983938868726e-05, "loss": 0.0414, "step": 970 }, { "grad_norm": 0.6310116648674011, "learning_rate": 9.93740274412797e-05, "loss": 0.0383, "step": 980 }, { "grad_norm": 0.680823564529419, "learning_rate": 9.934767554741846e-05, "loss": 0.0457, "step": 990 }, { "grad_norm": 0.632407546043396, "learning_rate": 9.932078399528361e-05, "loss": 0.0374, "step": 1000 }, { "grad_norm": 0.5892583727836609, "learning_rate": 9.929335307895689e-05, "loss": 0.0368, "step": 1010 }, { "grad_norm": 0.6278207898139954, "learning_rate": 9.926538309841839e-05, "loss": 0.0434, "step": 1020 }, { "grad_norm": 0.5285525321960449, "learning_rate": 9.923687435954334e-05, "loss": 0.0363, "step": 1030 }, { "grad_norm": 0.6097428798675537, "learning_rate": 9.920782717409873e-05, "loss": 0.0348, "step": 1040 }, { "grad_norm": 0.6607808470726013, "learning_rate": 9.917824185973994e-05, "loss": 0.0344, "step": 1050 }, { "grad_norm": 0.5603345036506653, "learning_rate": 9.914811874000723e-05, "loss": 0.0339, "step": 1060 }, { "grad_norm": 0.5727225542068481, "learning_rate": 9.911745814432218e-05, "loss": 0.0371, "step": 1070 }, { "grad_norm": 0.5944136381149292, "learning_rate": 9.90862604079842e-05, "loss": 0.0398, "step": 1080 }, { "grad_norm": 0.6860123872756958, "learning_rate": 9.90545258721667e-05, "loss": 0.0378, "step": 1090 }, { "grad_norm": 0.6331652402877808, "learning_rate": 9.90222548839135e-05, "loss": 0.0352, "step": 1100 }, { "grad_norm": 0.5666359066963196, "learning_rate": 9.898944779613495e-05, "loss": 0.034, "step": 1110 }, { "grad_norm": 0.5733403563499451, "learning_rate": 9.89561049676041e-05, "loss": 0.0352, "step": 1120 }, { "grad_norm": 0.5164110660552979, "learning_rate": 9.89222267629528e-05, "loss": 0.0379, "step": 1130 }, { "grad_norm": 0.6303825378417969, "learning_rate": 9.888781355266763e-05, "loss": 0.0369, "step": 1140 }, { "grad_norm": 0.5613416433334351, "learning_rate": 9.885286571308598e-05, "loss": 0.0338, "step": 1150 }, { "grad_norm": 0.6414242386817932, "learning_rate": 9.881738362639182e-05, "loss": 0.0375, "step": 1160 }, { "grad_norm": 0.5172221660614014, "learning_rate": 9.878136768061154e-05, "loss": 0.0376, "step": 1170 }, { "grad_norm": 0.6341432332992554, "learning_rate": 9.874481826960979e-05, "loss": 0.0374, "step": 1180 }, { "grad_norm": 0.624677836894989, "learning_rate": 9.870773579308503e-05, "loss": 0.0341, "step": 1190 }, { "grad_norm": 0.4869997203350067, "learning_rate": 9.867012065656533e-05, "loss": 0.0381, "step": 1200 }, { "grad_norm": 0.5759740471839905, "learning_rate": 9.863197327140376e-05, "loss": 0.0333, "step": 1210 }, { "grad_norm": 0.48775139451026917, "learning_rate": 9.859329405477403e-05, "loss": 0.0331, "step": 1220 }, { "grad_norm": 0.6388097405433655, "learning_rate": 9.855408342966585e-05, "loss": 0.0352, "step": 1230 }, { "grad_norm": 0.5959818363189697, "learning_rate": 9.851434182488033e-05, "loss": 0.0338, "step": 1240 }, { "grad_norm": 0.657508373260498, "learning_rate": 9.84740696750253e-05, "loss": 0.0331, "step": 1250 }, { "grad_norm": 0.7012799978256226, "learning_rate": 9.843326742051055e-05, "loss": 0.0348, "step": 1260 }, { "grad_norm": 0.5348427295684814, "learning_rate": 9.839193550754297e-05, "loss": 0.0337, "step": 1270 }, { "grad_norm": 0.7294585704803467, "learning_rate": 9.835007438812177e-05, "loss": 0.038, "step": 1280 }, { "grad_norm": 0.6077402830123901, "learning_rate": 9.830768452003341e-05, "loss": 0.0342, "step": 1290 }, { "grad_norm": 0.5021491050720215, "learning_rate": 9.826476636684671e-05, "loss": 0.0339, "step": 1300 }, { "grad_norm": 0.42891937494277954, "learning_rate": 9.822132039790773e-05, "loss": 0.0322, "step": 1310 }, { "grad_norm": 0.5746376514434814, "learning_rate": 9.817734708833461e-05, "loss": 0.0302, "step": 1320 }, { "grad_norm": 0.591606616973877, "learning_rate": 9.813284691901243e-05, "loss": 0.039, "step": 1330 }, { "grad_norm": 0.5928114056587219, "learning_rate": 9.808782037658792e-05, "loss": 0.0367, "step": 1340 }, { "grad_norm": 0.5678219199180603, "learning_rate": 9.804226795346411e-05, "loss": 0.0343, "step": 1350 }, { "grad_norm": 0.5018511414527893, "learning_rate": 9.799619014779503e-05, "loss": 0.0331, "step": 1360 }, { "grad_norm": 0.5295028686523438, "learning_rate": 9.794958746348013e-05, "loss": 0.0337, "step": 1370 }, { "grad_norm": 0.6938942074775696, "learning_rate": 9.790246041015896e-05, "loss": 0.0306, "step": 1380 }, { "grad_norm": 0.5297317504882812, "learning_rate": 9.785480950320538e-05, "loss": 0.0331, "step": 1390 }, { "grad_norm": 0.637657105922699, "learning_rate": 9.78066352637221e-05, "loss": 0.0311, "step": 1400 }, { "grad_norm": 0.5819315314292908, "learning_rate": 9.775793821853488e-05, "loss": 0.0327, "step": 1410 }, { "grad_norm": 0.7160147428512573, "learning_rate": 9.77087189001868e-05, "loss": 0.0323, "step": 1420 }, { "grad_norm": 0.7221500873565674, "learning_rate": 9.765897784693243e-05, "loss": 0.0332, "step": 1430 }, { "grad_norm": 0.5845819711685181, "learning_rate": 9.760871560273197e-05, "loss": 0.0312, "step": 1440 }, { "grad_norm": 0.5930690765380859, "learning_rate": 9.755793271724526e-05, "loss": 0.0305, "step": 1450 }, { "grad_norm": 0.4570452570915222, "learning_rate": 9.750662974582584e-05, "loss": 0.0372, "step": 1460 }, { "grad_norm": 0.5543919801712036, "learning_rate": 9.745480724951473e-05, "loss": 0.0314, "step": 1470 }, { "grad_norm": 0.5798304677009583, "learning_rate": 9.740246579503447e-05, "loss": 0.0336, "step": 1480 }, { "grad_norm": 0.5464045405387878, "learning_rate": 9.734960595478284e-05, "loss": 0.032, "step": 1490 }, { "grad_norm": 0.5292957425117493, "learning_rate": 9.729622830682657e-05, "loss": 0.0308, "step": 1500 }, { "grad_norm": 0.4644886255264282, "learning_rate": 9.724233343489504e-05, "loss": 0.0341, "step": 1510 }, { "grad_norm": 0.4468748867511749, "learning_rate": 9.718792192837396e-05, "loss": 0.029, "step": 1520 }, { "grad_norm": 0.5442079305648804, "learning_rate": 9.713299438229886e-05, "loss": 0.0337, "step": 1530 }, { "grad_norm": 0.46528083086013794, "learning_rate": 9.707755139734855e-05, "loss": 0.0338, "step": 1540 }, { "grad_norm": 0.5214895009994507, "learning_rate": 9.702159357983866e-05, "loss": 0.0315, "step": 1550 }, { "grad_norm": 0.564447820186615, "learning_rate": 9.696512154171492e-05, "loss": 0.0329, "step": 1560 }, { "grad_norm": 0.471500962972641, "learning_rate": 9.690813590054645e-05, "loss": 0.0326, "step": 1570 }, { "grad_norm": 0.52679044008255, "learning_rate": 9.685063727951914e-05, "loss": 0.0305, "step": 1580 }, { "grad_norm": 0.4842182993888855, "learning_rate": 9.679262630742865e-05, "loss": 0.0317, "step": 1590 }, { "grad_norm": 0.5807623267173767, "learning_rate": 9.673410361867373e-05, "loss": 0.0336, "step": 1600 }, { "grad_norm": 0.41651445627212524, "learning_rate": 9.667506985324909e-05, "loss": 0.031, "step": 1610 }, { "grad_norm": 0.4714881479740143, "learning_rate": 9.661552565673855e-05, "loss": 0.028, "step": 1620 }, { "grad_norm": 0.4803926944732666, "learning_rate": 9.655547168030789e-05, "loss": 0.0321, "step": 1630 }, { "grad_norm": 0.5974353551864624, "learning_rate": 9.649490858069777e-05, "loss": 0.0295, "step": 1640 }, { "grad_norm": 0.4964613914489746, "learning_rate": 9.643383702021658e-05, "loss": 0.0297, "step": 1650 }, { "grad_norm": 0.6017008423805237, "learning_rate": 9.637225766673307e-05, "loss": 0.0286, "step": 1660 }, { "grad_norm": 0.5116117000579834, "learning_rate": 9.631017119366922e-05, "loss": 0.0296, "step": 1670 }, { "grad_norm": 0.5510458946228027, "learning_rate": 9.624757827999273e-05, "loss": 0.0322, "step": 1680 }, { "grad_norm": 0.41116780042648315, "learning_rate": 9.618447961020971e-05, "loss": 0.0365, "step": 1690 }, { "grad_norm": 0.4938806891441345, "learning_rate": 9.612087587435707e-05, "loss": 0.0346, "step": 1700 }, { "grad_norm": 0.5194259881973267, "learning_rate": 9.605676776799508e-05, "loss": 0.0311, "step": 1710 }, { "grad_norm": 0.4529009163379669, "learning_rate": 9.599215599219973e-05, "loss": 0.0306, "step": 1720 }, { "grad_norm": 0.4386800229549408, "learning_rate": 9.592704125355505e-05, "loss": 0.0303, "step": 1730 }, { "grad_norm": 0.44015586376190186, "learning_rate": 9.586142426414538e-05, "loss": 0.0291, "step": 1740 }, { "grad_norm": 0.5530741810798645, "learning_rate": 9.57953057415476e-05, "loss": 0.0328, "step": 1750 }, { "grad_norm": 0.28225114941596985, "learning_rate": 9.572868640882328e-05, "loss": 0.0296, "step": 1760 }, { "grad_norm": 0.6074041724205017, "learning_rate": 9.56615669945108e-05, "loss": 0.0324, "step": 1770 }, { "grad_norm": 0.5109390616416931, "learning_rate": 9.55939482326173e-05, "loss": 0.03, "step": 1780 }, { "grad_norm": 0.5892201662063599, "learning_rate": 9.552583086261069e-05, "loss": 0.0316, "step": 1790 }, { "grad_norm": 0.4495730400085449, "learning_rate": 9.545721562941168e-05, "loss": 0.0295, "step": 1800 }, { "grad_norm": 0.5142664313316345, "learning_rate": 9.538810328338543e-05, "loss": 0.0277, "step": 1810 }, { "grad_norm": 0.4616416394710541, "learning_rate": 9.531849458033349e-05, "loss": 0.0307, "step": 1820 }, { "grad_norm": 0.4885185658931732, "learning_rate": 9.524839028148547e-05, "loss": 0.0298, "step": 1830 }, { "grad_norm": 0.4711757004261017, "learning_rate": 9.517779115349077e-05, "loss": 0.0304, "step": 1840 }, { "grad_norm": 0.4843687117099762, "learning_rate": 9.510669796841014e-05, "loss": 0.0301, "step": 1850 }, { "grad_norm": 0.5420807003974915, "learning_rate": 9.503511150370727e-05, "loss": 0.0326, "step": 1860 }, { "grad_norm": 0.644017219543457, "learning_rate": 9.496303254224024e-05, "loss": 0.0318, "step": 1870 }, { "grad_norm": 0.4648231565952301, "learning_rate": 9.489046187225306e-05, "loss": 0.0301, "step": 1880 }, { "grad_norm": 0.5046685338020325, "learning_rate": 9.481740028736692e-05, "loss": 0.0314, "step": 1890 }, { "grad_norm": 0.49768561124801636, "learning_rate": 9.474384858657164e-05, "loss": 0.0291, "step": 1900 }, { "grad_norm": 0.5587893724441528, "learning_rate": 9.466980757421679e-05, "loss": 0.0296, "step": 1910 }, { "grad_norm": 0.5340442061424255, "learning_rate": 9.459527806000305e-05, "loss": 0.0313, "step": 1920 }, { "grad_norm": 0.5392602682113647, "learning_rate": 9.452026085897325e-05, "loss": 0.0308, "step": 1930 }, { "grad_norm": 0.4618771970272064, "learning_rate": 9.444475679150348e-05, "loss": 0.0296, "step": 1940 }, { "grad_norm": 0.4055277705192566, "learning_rate": 9.436876668329411e-05, "loss": 0.028, "step": 1950 }, { "grad_norm": 0.5005772113800049, "learning_rate": 9.429229136536079e-05, "loss": 0.0273, "step": 1960 }, { "grad_norm": 0.42232707142829895, "learning_rate": 9.421533167402534e-05, "loss": 0.0286, "step": 1970 }, { "grad_norm": 0.5429880619049072, "learning_rate": 9.413788845090666e-05, "loss": 0.029, "step": 1980 }, { "grad_norm": 0.4448404312133789, "learning_rate": 9.405996254291136e-05, "loss": 0.0284, "step": 1990 }, { "grad_norm": 0.5074642300605774, "learning_rate": 9.398155480222474e-05, "loss": 0.0283, "step": 2000 }, { "grad_norm": 0.4470667243003845, "learning_rate": 9.390266608630128e-05, "loss": 0.0267, "step": 2010 }, { "grad_norm": 0.47320127487182617, "learning_rate": 9.38232972578553e-05, "loss": 0.0303, "step": 2020 }, { "grad_norm": 0.5718346238136292, "learning_rate": 9.374344918485164e-05, "loss": 0.0296, "step": 2030 }, { "grad_norm": 0.4110424518585205, "learning_rate": 9.366312274049602e-05, "loss": 0.028, "step": 2040 }, { "grad_norm": 0.41520369052886963, "learning_rate": 9.358231880322554e-05, "loss": 0.0296, "step": 2050 }, { "grad_norm": 0.4130535423755646, "learning_rate": 9.350103825669916e-05, "loss": 0.0286, "step": 2060 }, { "grad_norm": 0.5143803358078003, "learning_rate": 9.341928198978787e-05, "loss": 0.0285, "step": 2070 }, { "grad_norm": 0.5418136119842529, "learning_rate": 9.333705089656512e-05, "loss": 0.0264, "step": 2080 }, { "grad_norm": 0.46870583295822144, "learning_rate": 9.325434587629698e-05, "loss": 0.0317, "step": 2090 }, { "grad_norm": 0.417431116104126, "learning_rate": 9.31711678334323e-05, "loss": 0.0284, "step": 2100 }, { "grad_norm": 0.49152880907058716, "learning_rate": 9.308751767759282e-05, "loss": 0.025, "step": 2110 }, { "grad_norm": 0.378698468208313, "learning_rate": 9.300339632356325e-05, "loss": 0.027, "step": 2120 }, { "grad_norm": 0.4329814016819, "learning_rate": 9.291880469128124e-05, "loss": 0.0299, "step": 2130 }, { "grad_norm": 0.49008893966674805, "learning_rate": 9.283374370582732e-05, "loss": 0.0273, "step": 2140 }, { "grad_norm": 0.3793398141860962, "learning_rate": 9.274821429741482e-05, "loss": 0.0264, "step": 2150 }, { "grad_norm": 0.3981456458568573, "learning_rate": 9.266221740137961e-05, "loss": 0.026, "step": 2160 }, { "grad_norm": 0.5248379707336426, "learning_rate": 9.257575395817001e-05, "loss": 0.0255, "step": 2170 }, { "grad_norm": 0.39933085441589355, "learning_rate": 9.248882491333637e-05, "loss": 0.0261, "step": 2180 }, { "grad_norm": 0.5205438733100891, "learning_rate": 9.240143121752076e-05, "loss": 0.0253, "step": 2190 }, { "grad_norm": 0.4256397783756256, "learning_rate": 9.23135738264467e-05, "loss": 0.0313, "step": 2200 }, { "grad_norm": 0.4098120629787445, "learning_rate": 9.222525370090849e-05, "loss": 0.0286, "step": 2210 }, { "grad_norm": 0.452364444732666, "learning_rate": 9.213647180676088e-05, "loss": 0.0313, "step": 2220 }, { "grad_norm": 0.5283573269844055, "learning_rate": 9.204722911490846e-05, "loss": 0.027, "step": 2230 }, { "grad_norm": 0.4341718852519989, "learning_rate": 9.1957526601295e-05, "loss": 0.026, "step": 2240 }, { "grad_norm": 0.5589078664779663, "learning_rate": 9.186736524689281e-05, "loss": 0.0277, "step": 2250 }, { "grad_norm": 0.3732184171676636, "learning_rate": 9.177674603769204e-05, "loss": 0.0289, "step": 2260 }, { "grad_norm": 0.39962926506996155, "learning_rate": 9.168566996468983e-05, "loss": 0.0262, "step": 2270 }, { "grad_norm": 0.43201372027397156, "learning_rate": 9.159413802387951e-05, "loss": 0.0239, "step": 2280 }, { "grad_norm": 0.4189751446247101, "learning_rate": 9.150215121623974e-05, "loss": 0.0266, "step": 2290 }, { "grad_norm": 0.3986872136592865, "learning_rate": 9.140971054772349e-05, "loss": 0.0255, "step": 2300 }, { "grad_norm": 0.4998125731945038, "learning_rate": 9.131681702924713e-05, "loss": 0.0281, "step": 2310 }, { "grad_norm": 0.4827892482280731, "learning_rate": 9.122347167667926e-05, "loss": 0.0281, "step": 2320 }, { "grad_norm": 0.4876689016819, "learning_rate": 9.112967551082973e-05, "loss": 0.0319, "step": 2330 }, { "grad_norm": 0.36984163522720337, "learning_rate": 9.103542955743835e-05, "loss": 0.0242, "step": 2340 }, { "grad_norm": 0.465818852186203, "learning_rate": 9.094073484716381e-05, "loss": 0.0314, "step": 2350 }, { "grad_norm": 0.37877270579338074, "learning_rate": 9.084559241557226e-05, "loss": 0.0262, "step": 2360 }, { "grad_norm": 0.4463783800601959, "learning_rate": 9.075000330312608e-05, "loss": 0.0263, "step": 2370 }, { "grad_norm": 0.47452881932258606, "learning_rate": 9.065396855517253e-05, "loss": 0.0272, "step": 2380 }, { "grad_norm": 0.424927681684494, "learning_rate": 9.055748922193219e-05, "loss": 0.0278, "step": 2390 }, { "grad_norm": 0.3524123728275299, "learning_rate": 9.046056635848761e-05, "loss": 0.0268, "step": 2400 }, { "grad_norm": 0.39357349276542664, "learning_rate": 9.036320102477169e-05, "loss": 0.0235, "step": 2410 }, { "grad_norm": 0.38801810145378113, "learning_rate": 9.02653942855561e-05, "loss": 0.0309, "step": 2420 }, { "grad_norm": 0.42541712522506714, "learning_rate": 9.016714721043971e-05, "loss": 0.027, "step": 2430 }, { "grad_norm": 0.42861104011535645, "learning_rate": 9.006846087383675e-05, "loss": 0.0274, "step": 2440 }, { "grad_norm": 0.44381630420684814, "learning_rate": 8.996933635496523e-05, "loss": 0.0264, "step": 2450 }, { "grad_norm": 0.5069416761398315, "learning_rate": 8.986977473783498e-05, "loss": 0.0243, "step": 2460 }, { "grad_norm": 0.5593004822731018, "learning_rate": 8.97697771112359e-05, "loss": 0.0266, "step": 2470 }, { "grad_norm": 0.49196624755859375, "learning_rate": 8.966934456872602e-05, "loss": 0.0254, "step": 2480 }, { "grad_norm": 0.42328518629074097, "learning_rate": 8.95684782086195e-05, "loss": 0.0317, "step": 2490 }, { "grad_norm": 0.40058237314224243, "learning_rate": 8.946717913397476e-05, "loss": 0.0257, "step": 2500 }, { "grad_norm": 0.45824214816093445, "learning_rate": 8.93654484525822e-05, "loss": 0.0267, "step": 2510 }, { "grad_norm": 0.47785720229148865, "learning_rate": 8.926328727695226e-05, "loss": 0.026, "step": 2520 }, { "grad_norm": 0.5189729928970337, "learning_rate": 8.916069672430319e-05, "loss": 0.0264, "step": 2530 }, { "grad_norm": 0.3164174556732178, "learning_rate": 8.905767791654884e-05, "loss": 0.0244, "step": 2540 }, { "grad_norm": 0.45995235443115234, "learning_rate": 8.895423198028638e-05, "loss": 0.0272, "step": 2550 }, { "grad_norm": 0.40050774812698364, "learning_rate": 8.885036004678402e-05, "loss": 0.0292, "step": 2560 }, { "grad_norm": 0.3644542098045349, "learning_rate": 8.874606325196857e-05, "loss": 0.0237, "step": 2570 }, { "grad_norm": 0.4536350667476654, "learning_rate": 8.864134273641304e-05, "loss": 0.025, "step": 2580 }, { "grad_norm": 0.35245734453201294, "learning_rate": 8.853619964532427e-05, "loss": 0.0233, "step": 2590 }, { "grad_norm": 0.4429668188095093, "learning_rate": 8.843063512853019e-05, "loss": 0.0285, "step": 2600 }, { "grad_norm": 0.43946343660354614, "learning_rate": 8.832465034046749e-05, "loss": 0.0263, "step": 2610 }, { "grad_norm": 0.4406358599662781, "learning_rate": 8.821824644016882e-05, "loss": 0.0254, "step": 2620 }, { "grad_norm": 0.48885712027549744, "learning_rate": 8.811142459125019e-05, "loss": 0.025, "step": 2630 }, { "grad_norm": 0.42471176385879517, "learning_rate": 8.800418596189822e-05, "loss": 0.0265, "step": 2640 }, { "grad_norm": 0.3454952836036682, "learning_rate": 8.789653172485737e-05, "loss": 0.0261, "step": 2650 }, { "grad_norm": 0.4365542232990265, "learning_rate": 8.778846305741715e-05, "loss": 0.0253, "step": 2660 }, { "grad_norm": 0.3438829779624939, "learning_rate": 8.767998114139918e-05, "loss": 0.0251, "step": 2670 }, { "grad_norm": 0.3312196433544159, "learning_rate": 8.757108716314429e-05, "loss": 0.0254, "step": 2680 }, { "grad_norm": 0.40338999032974243, "learning_rate": 8.746178231349962e-05, "loss": 0.0275, "step": 2690 }, { "grad_norm": 0.4243628978729248, "learning_rate": 8.735206778780549e-05, "loss": 0.0239, "step": 2700 }, { "grad_norm": 0.4020898938179016, "learning_rate": 8.724194478588234e-05, "loss": 0.0234, "step": 2710 }, { "grad_norm": 0.4327259361743927, "learning_rate": 8.713141451201772e-05, "loss": 0.0248, "step": 2720 }, { "grad_norm": 0.3352695107460022, "learning_rate": 8.702047817495295e-05, "loss": 0.0258, "step": 2730 }, { "grad_norm": 0.3333274722099304, "learning_rate": 8.69091369878701e-05, "loss": 0.0238, "step": 2740 }, { "grad_norm": 0.42753326892852783, "learning_rate": 8.679739216837849e-05, "loss": 0.0222, "step": 2750 }, { "grad_norm": 0.3095396161079407, "learning_rate": 8.66852449385016e-05, "loss": 0.0233, "step": 2760 }, { "grad_norm": 0.3271157741546631, "learning_rate": 8.657269652466356e-05, "loss": 0.0267, "step": 2770 }, { "grad_norm": 0.4156598150730133, "learning_rate": 8.645974815767577e-05, "loss": 0.0225, "step": 2780 }, { "grad_norm": 0.35358086228370667, "learning_rate": 8.634640107272351e-05, "loss": 0.023, "step": 2790 }, { "grad_norm": 0.43658044934272766, "learning_rate": 8.623265650935234e-05, "loss": 0.0256, "step": 2800 }, { "grad_norm": 0.39249366521835327, "learning_rate": 8.611851571145456e-05, "loss": 0.0256, "step": 2810 }, { "grad_norm": 0.4045146107673645, "learning_rate": 8.600397992725566e-05, "loss": 0.0265, "step": 2820 }, { "grad_norm": 0.46040022373199463, "learning_rate": 8.588905040930061e-05, "loss": 0.0223, "step": 2830 }, { "grad_norm": 0.4740993082523346, "learning_rate": 8.577372841444022e-05, "loss": 0.0238, "step": 2840 }, { "grad_norm": 0.5013309717178345, "learning_rate": 8.565801520381736e-05, "loss": 0.0244, "step": 2850 }, { "grad_norm": 0.5572243332862854, "learning_rate": 8.554191204285313e-05, "loss": 0.0278, "step": 2860 }, { "grad_norm": 0.4136684536933899, "learning_rate": 8.542542020123315e-05, "loss": 0.0268, "step": 2870 }, { "grad_norm": 0.4344552159309387, "learning_rate": 8.530854095289347e-05, "loss": 0.0237, "step": 2880 }, { "grad_norm": 0.44237616658210754, "learning_rate": 8.519127557600688e-05, "loss": 0.0258, "step": 2890 }, { "grad_norm": 0.40503379702568054, "learning_rate": 8.507362535296871e-05, "loss": 0.0245, "step": 2900 }, { "grad_norm": 0.4115789234638214, "learning_rate": 8.495559157038299e-05, "loss": 0.0228, "step": 2910 }, { "grad_norm": 0.47725898027420044, "learning_rate": 8.483717551904823e-05, "loss": 0.0255, "step": 2920 }, { "grad_norm": 0.39906537532806396, "learning_rate": 8.47183784939434e-05, "loss": 0.028, "step": 2930 }, { "grad_norm": 0.3738861083984375, "learning_rate": 8.459920179421374e-05, "loss": 0.0276, "step": 2940 }, { "grad_norm": 0.4364217519760132, "learning_rate": 8.447964672315656e-05, "loss": 0.0242, "step": 2950 }, { "grad_norm": 0.40446925163269043, "learning_rate": 8.435971458820692e-05, "loss": 0.0245, "step": 2960 }, { "grad_norm": 0.43057939410209656, "learning_rate": 8.423940670092345e-05, "loss": 0.0229, "step": 2970 }, { "grad_norm": 0.3856443464756012, "learning_rate": 8.411872437697394e-05, "loss": 0.0217, "step": 2980 }, { "grad_norm": 0.3364211320877075, "learning_rate": 8.399766893612096e-05, "loss": 0.0223, "step": 2990 }, { "grad_norm": 0.47767484188079834, "learning_rate": 8.38762417022074e-05, "loss": 0.0231, "step": 3000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }