{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.92, "eval_steps": 500, "global_step": 2400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008, "grad_norm": 10.566986083984375, "learning_rate": 0.0, "loss": 14.1421, "step": 1 }, { "epoch": 0.008, "grad_norm": 12.296218872070312, "learning_rate": 1.730769230769231e-05, "loss": 13.35, "step": 10 }, { "epoch": 0.016, "grad_norm": 6.457699775695801, "learning_rate": 3.653846153846154e-05, "loss": 11.8957, "step": 20 }, { "epoch": 0.024, "grad_norm": 6.461245059967041, "learning_rate": 5.576923076923077e-05, "loss": 11.2465, "step": 30 }, { "epoch": 0.032, "grad_norm": 6.351202011108398, "learning_rate": 7.500000000000001e-05, "loss": 10.7197, "step": 40 }, { "epoch": 0.04, "grad_norm": 5.675596714019775, "learning_rate": 9.423076923076924e-05, "loss": 10.4108, "step": 50 }, { "epoch": 0.048, "grad_norm": 6.45210599899292, "learning_rate": 0.00011346153846153846, "loss": 9.499, "step": 60 }, { "epoch": 0.056, "grad_norm": 3.97434663772583, "learning_rate": 0.0001326923076923077, "loss": 9.2464, "step": 70 }, { "epoch": 0.064, "grad_norm": 4.443643093109131, "learning_rate": 0.00015192307692307692, "loss": 9.0007, "step": 80 }, { "epoch": 0.072, "grad_norm": 4.448770046234131, "learning_rate": 0.00017115384615384616, "loss": 8.8057, "step": 90 }, { "epoch": 0.08, "grad_norm": 5.425487041473389, "learning_rate": 0.00019038461538461538, "loss": 8.9744, "step": 100 }, { "epoch": 0.088, "grad_norm": 4.242831230163574, "learning_rate": 0.00019999785100910492, "loss": 8.9241, "step": 110 }, { "epoch": 0.096, "grad_norm": 3.6791751384735107, "learning_rate": 0.00019998065963611962, "loss": 8.8742, "step": 120 }, { "epoch": 0.104, "grad_norm": 5.0801777839660645, "learning_rate": 0.00019994627984564557, "loss": 8.8388, "step": 130 }, { "epoch": 0.112, "grad_norm": 5.117883205413818, "learning_rate": 0.00019989471754816785, "loss": 8.412, "step": 140 }, { "epoch": 0.12, "grad_norm": 5.7691802978515625, "learning_rate": 0.00019982598160814377, "loss": 8.7482, "step": 150 }, { "epoch": 0.128, "grad_norm": 4.111888885498047, "learning_rate": 0.00019974008384247908, "loss": 8.6456, "step": 160 }, { "epoch": 0.136, "grad_norm": 3.717806816101074, "learning_rate": 0.0001996370390184965, "loss": 8.3429, "step": 170 }, { "epoch": 0.144, "grad_norm": 5.897804260253906, "learning_rate": 0.00019951686485139672, "loss": 8.5481, "step": 180 }, { "epoch": 0.152, "grad_norm": 4.452871799468994, "learning_rate": 0.00019937958200121303, "loss": 8.6357, "step": 190 }, { "epoch": 0.16, "grad_norm": 5.103796482086182, "learning_rate": 0.0001992252140692594, "loss": 8.5245, "step": 200 }, { "epoch": 0.168, "grad_norm": 5.806966304779053, "learning_rate": 0.00019905378759407314, "loss": 8.4875, "step": 210 }, { "epoch": 0.176, "grad_norm": 4.293936729431152, "learning_rate": 0.00019886533204685228, "loss": 8.3073, "step": 220 }, { "epoch": 0.184, "grad_norm": 3.6153390407562256, "learning_rate": 0.00019865987982638914, "loss": 8.5256, "step": 230 }, { "epoch": 0.192, "grad_norm": 5.031829357147217, "learning_rate": 0.00019843746625350028, "loss": 8.3936, "step": 240 }, { "epoch": 0.2, "grad_norm": 4.666059970855713, "learning_rate": 0.0001981981295649543, "loss": 7.9453, "step": 250 }, { "epoch": 0.208, "grad_norm": 4.338928699493408, "learning_rate": 0.0001979419109068982, "loss": 8.5403, "step": 260 }, { "epoch": 0.216, "grad_norm": 5.491336345672607, "learning_rate": 0.0001976688543277838, "loss": 8.5499, "step": 270 }, { "epoch": 0.224, "grad_norm": 4.206221580505371, "learning_rate": 0.00019737900677079483, "loss": 8.202, "step": 280 }, { "epoch": 0.232, "grad_norm": 4.248091220855713, "learning_rate": 0.0001970724180657768, "loss": 8.1605, "step": 290 }, { "epoch": 0.24, "grad_norm": 4.153928279876709, "learning_rate": 0.00019674914092067015, "loss": 8.2001, "step": 300 }, { "epoch": 0.248, "grad_norm": 4.331130027770996, "learning_rate": 0.00019640923091244906, "loss": 8.3479, "step": 310 }, { "epoch": 0.256, "grad_norm": 4.888726711273193, "learning_rate": 0.0001960527464775666, "loss": 8.0348, "step": 320 }, { "epoch": 0.264, "grad_norm": 4.52598762512207, "learning_rate": 0.00019567974890190865, "loss": 7.9916, "step": 330 }, { "epoch": 0.272, "grad_norm": 5.109200477600098, "learning_rate": 0.00019529030231025776, "loss": 7.8621, "step": 340 }, { "epoch": 0.28, "grad_norm": 4.465454578399658, "learning_rate": 0.0001948844736552688, "loss": 8.1529, "step": 350 }, { "epoch": 0.288, "grad_norm": 4.94981050491333, "learning_rate": 0.00019446233270595896, "loss": 7.9475, "step": 360 }, { "epoch": 0.296, "grad_norm": 4.898144245147705, "learning_rate": 0.00019402395203571286, "loss": 8.1256, "step": 370 }, { "epoch": 0.304, "grad_norm": 4.506499767303467, "learning_rate": 0.00019356940700980625, "loss": 7.9425, "step": 380 }, { "epoch": 0.312, "grad_norm": 4.715751647949219, "learning_rate": 0.00019309877577244924, "loss": 7.8867, "step": 390 }, { "epoch": 0.32, "grad_norm": 6.232232570648193, "learning_rate": 0.00019261213923335194, "loss": 8.0137, "step": 400 }, { "epoch": 0.328, "grad_norm": 5.4095258712768555, "learning_rate": 0.0001921095810538148, "loss": 7.655, "step": 410 }, { "epoch": 0.336, "grad_norm": 8.021153450012207, "learning_rate": 0.00019159118763234555, "loss": 8.239, "step": 420 }, { "epoch": 0.344, "grad_norm": 4.821053504943848, "learning_rate": 0.0001910570480898061, "loss": 7.6991, "step": 430 }, { "epoch": 0.352, "grad_norm": 4.676478385925293, "learning_rate": 0.00019050725425409076, "loss": 7.9241, "step": 440 }, { "epoch": 0.36, "grad_norm": 6.322430610656738, "learning_rate": 0.0001899419006443397, "loss": 8.0194, "step": 450 }, { "epoch": 0.368, "grad_norm": 3.8518083095550537, "learning_rate": 0.0001893610844546894, "loss": 7.7739, "step": 460 }, { "epoch": 0.376, "grad_norm": 4.104583263397217, "learning_rate": 0.00018876490553756313, "loss": 7.7344, "step": 470 }, { "epoch": 0.384, "grad_norm": 5.830111980438232, "learning_rate": 0.00018815346638650487, "loss": 7.4569, "step": 480 }, { "epoch": 0.392, "grad_norm": 5.279020309448242, "learning_rate": 0.0001875268721185585, "loss": 7.6329, "step": 490 }, { "epoch": 0.4, "grad_norm": 6.486227989196777, "learning_rate": 0.00018688523045619674, "loss": 7.6998, "step": 500 }, { "epoch": 0.408, "grad_norm": 3.6189093589782715, "learning_rate": 0.00018622865170880151, "loss": 7.3692, "step": 510 }, { "epoch": 0.416, "grad_norm": 4.793766498565674, "learning_rate": 0.00018555724875369997, "loss": 7.4387, "step": 520 }, { "epoch": 0.424, "grad_norm": 3.8143932819366455, "learning_rate": 0.00018487113701675883, "loss": 7.6311, "step": 530 }, { "epoch": 0.432, "grad_norm": 5.568665027618408, "learning_rate": 0.00018417043445254075, "loss": 7.0967, "step": 540 }, { "epoch": 0.44, "grad_norm": 5.059378623962402, "learning_rate": 0.00018345526152402573, "loss": 7.2701, "step": 550 }, { "epoch": 0.448, "grad_norm": 5.139848232269287, "learning_rate": 0.00018272574118190167, "loss": 7.2647, "step": 560 }, { "epoch": 0.456, "grad_norm": 6.435779571533203, "learning_rate": 0.00018198199884342673, "loss": 7.161, "step": 570 }, { "epoch": 0.464, "grad_norm": 5.199296951293945, "learning_rate": 0.0001812241623708682, "loss": 7.5137, "step": 580 }, { "epoch": 0.472, "grad_norm": 4.161045074462891, "learning_rate": 0.00018045236204952044, "loss": 7.356, "step": 590 }, { "epoch": 0.48, "grad_norm": 5.084685325622559, "learning_rate": 0.00017966673056530686, "loss": 7.3859, "step": 600 }, { "epoch": 0.488, "grad_norm": 7.625977516174316, "learning_rate": 0.00017886740298196863, "loss": 7.0858, "step": 610 }, { "epoch": 0.496, "grad_norm": 5.039551734924316, "learning_rate": 0.00017805451671784516, "loss": 7.4888, "step": 620 }, { "epoch": 0.504, "grad_norm": 5.905925273895264, "learning_rate": 0.00017722821152224924, "loss": 7.2393, "step": 630 }, { "epoch": 0.512, "grad_norm": 9.78266716003418, "learning_rate": 0.00017638862945144182, "loss": 7.4102, "step": 640 }, { "epoch": 0.52, "grad_norm": 6.097099781036377, "learning_rate": 0.00017553591484421004, "loss": 7.297, "step": 650 }, { "epoch": 0.528, "grad_norm": 4.922943115234375, "learning_rate": 0.00017467021429705285, "loss": 7.3741, "step": 660 }, { "epoch": 0.536, "grad_norm": 5.79689884185791, "learning_rate": 0.00017379167663897856, "loss": 7.1571, "step": 670 }, { "epoch": 0.544, "grad_norm": 3.795285701751709, "learning_rate": 0.00017290045290591858, "loss": 6.8519, "step": 680 }, { "epoch": 0.552, "grad_norm": 4.697098731994629, "learning_rate": 0.0001719966963147616, "loss": 7.111, "step": 690 }, { "epoch": 0.56, "grad_norm": 4.46597146987915, "learning_rate": 0.000171080562237013, "loss": 7.0099, "step": 700 }, { "epoch": 0.568, "grad_norm": 6.007630348205566, "learning_rate": 0.00017015220817208376, "loss": 7.326, "step": 710 }, { "epoch": 0.576, "grad_norm": 4.53595495223999, "learning_rate": 0.0001692117937202136, "loss": 7.0785, "step": 720 }, { "epoch": 0.584, "grad_norm": 6.850202560424805, "learning_rate": 0.00016825948055503294, "loss": 7.2174, "step": 730 }, { "epoch": 0.592, "grad_norm": 5.931501865386963, "learning_rate": 0.00016729543239576828, "loss": 6.995, "step": 740 }, { "epoch": 0.6, "grad_norm": 5.900391101837158, "learning_rate": 0.0001663198149790961, "loss": 6.8995, "step": 750 }, { "epoch": 0.608, "grad_norm": 5.841864109039307, "learning_rate": 0.00016533279603064978, "loss": 6.7632, "step": 760 }, { "epoch": 0.616, "grad_norm": 5.902273178100586, "learning_rate": 0.00016433454523618482, "loss": 6.9055, "step": 770 }, { "epoch": 0.624, "grad_norm": 5.881319046020508, "learning_rate": 0.00016332523421240658, "loss": 6.686, "step": 780 }, { "epoch": 0.632, "grad_norm": 5.178507328033447, "learning_rate": 0.00016230503647746657, "loss": 6.6281, "step": 790 }, { "epoch": 0.64, "grad_norm": 4.807728290557861, "learning_rate": 0.00016127412742113185, "loss": 6.9373, "step": 800 }, { "epoch": 0.648, "grad_norm": 5.6972975730896, "learning_rate": 0.000160232684274632, "loss": 6.7103, "step": 810 }, { "epoch": 0.656, "grad_norm": 5.271605968475342, "learning_rate": 0.00015918088608019043, "loss": 6.9209, "step": 820 }, { "epoch": 0.664, "grad_norm": 5.366481304168701, "learning_rate": 0.00015811891366024358, "loss": 6.9491, "step": 830 }, { "epoch": 0.672, "grad_norm": 4.8482866287231445, "learning_rate": 0.00015704694958635468, "loss": 6.6238, "step": 840 }, { "epoch": 0.68, "grad_norm": 7.680044651031494, "learning_rate": 0.0001559651781478263, "loss": 6.7933, "step": 850 }, { "epoch": 0.688, "grad_norm": 4.3361496925354, "learning_rate": 0.00015487378532001782, "loss": 6.6105, "step": 860 }, { "epoch": 0.696, "grad_norm": 8.266775131225586, "learning_rate": 0.0001537729587323732, "loss": 6.5274, "step": 870 }, { "epoch": 0.704, "grad_norm": 5.751520156860352, "learning_rate": 0.00015266288763616403, "loss": 6.6585, "step": 880 }, { "epoch": 0.712, "grad_norm": 6.401413440704346, "learning_rate": 0.000151543762871954, "loss": 6.5739, "step": 890 }, { "epoch": 0.72, "grad_norm": 5.519962310791016, "learning_rate": 0.0001504157768367901, "loss": 6.6899, "step": 900 }, { "epoch": 0.728, "grad_norm": 5.094054222106934, "learning_rate": 0.00014927912345112616, "loss": 6.2008, "step": 910 }, { "epoch": 0.736, "grad_norm": 6.107059955596924, "learning_rate": 0.0001481339981254846, "loss": 6.2149, "step": 920 }, { "epoch": 0.744, "grad_norm": 6.355636119842529, "learning_rate": 0.00014698059772686202, "loss": 6.7521, "step": 930 }, { "epoch": 0.752, "grad_norm": 7.230486869812012, "learning_rate": 0.00014581912054488413, "loss": 6.3823, "step": 940 }, { "epoch": 0.76, "grad_norm": 4.960805416107178, "learning_rate": 0.00014464976625771654, "loss": 6.6149, "step": 950 }, { "epoch": 0.768, "grad_norm": 5.269943714141846, "learning_rate": 0.00014347273589773637, "loss": 6.1001, "step": 960 }, { "epoch": 0.776, "grad_norm": 6.840855598449707, "learning_rate": 0.0001422882318169716, "loss": 6.3677, "step": 970 }, { "epoch": 0.784, "grad_norm": 5.1594038009643555, "learning_rate": 0.00014109645765231278, "loss": 6.3929, "step": 980 }, { "epoch": 0.792, "grad_norm": 5.096086025238037, "learning_rate": 0.00013989761829050475, "loss": 6.1354, "step": 990 }, { "epoch": 0.8, "grad_norm": 5.235525608062744, "learning_rate": 0.00013869191983292283, "loss": 6.4954, "step": 1000 }, { "epoch": 0.808, "grad_norm": 5.518918991088867, "learning_rate": 0.00013747956956014037, "loss": 6.449, "step": 1010 }, { "epoch": 0.816, "grad_norm": 4.848990440368652, "learning_rate": 0.00013626077589629367, "loss": 6.392, "step": 1020 }, { "epoch": 0.824, "grad_norm": 7.234468460083008, "learning_rate": 0.00013503574837325015, "loss": 6.5465, "step": 1030 }, { "epoch": 0.832, "grad_norm": 6.593731880187988, "learning_rate": 0.00013380469759458643, "loss": 6.574, "step": 1040 }, { "epoch": 0.84, "grad_norm": 5.687368392944336, "learning_rate": 0.00013256783519938154, "loss": 6.1995, "step": 1050 }, { "epoch": 0.848, "grad_norm": 4.857635498046875, "learning_rate": 0.00013132537382583274, "loss": 5.8422, "step": 1060 }, { "epoch": 0.856, "grad_norm": 7.068734645843506, "learning_rate": 0.00013007752707469924, "loss": 6.0601, "step": 1070 }, { "epoch": 0.864, "grad_norm": 4.396754741668701, "learning_rate": 0.00012882450947258045, "loss": 5.8387, "step": 1080 }, { "epoch": 0.872, "grad_norm": 9.501909255981445, "learning_rate": 0.0001275665364350352, "loss": 5.9831, "step": 1090 }, { "epoch": 0.88, "grad_norm": 6.957056522369385, "learning_rate": 0.00012630382422954795, "loss": 6.1359, "step": 1100 }, { "epoch": 0.888, "grad_norm": 5.782343864440918, "learning_rate": 0.00012503658993834885, "loss": 6.0754, "step": 1110 }, { "epoch": 0.896, "grad_norm": 5.452831268310547, "learning_rate": 0.0001237650514210932, "loss": 5.6186, "step": 1120 }, { "epoch": 0.904, "grad_norm": 6.382038593292236, "learning_rate": 0.00012248942727740783, "loss": 5.7174, "step": 1130 }, { "epoch": 0.912, "grad_norm": 6.288851261138916, "learning_rate": 0.00012120993680931003, "loss": 5.6529, "step": 1140 }, { "epoch": 0.92, "grad_norm": 6.7387166023254395, "learning_rate": 0.0001199267999835055, "loss": 5.603, "step": 1150 }, { "epoch": 0.928, "grad_norm": 5.694065093994141, "learning_rate": 0.00011864023739357235, "loss": 5.2627, "step": 1160 }, { "epoch": 0.936, "grad_norm": 6.711731910705566, "learning_rate": 0.00011735047022203741, "loss": 5.4706, "step": 1170 }, { "epoch": 0.944, "grad_norm": 5.517411708831787, "learning_rate": 0.00011605772020235072, "loss": 5.6277, "step": 1180 }, { "epoch": 0.952, "grad_norm": 6.785055160522461, "learning_rate": 0.00011476220958076607, "loss": 5.9611, "step": 1190 }, { "epoch": 0.96, "grad_norm": 5.702793121337891, "learning_rate": 0.00011346416107813267, "loss": 5.9226, "step": 1200 }, { "epoch": 0.968, "grad_norm": 7.974459648132324, "learning_rate": 0.00011216379785160578, "loss": 5.8474, "step": 1210 }, { "epoch": 0.976, "grad_norm": 5.838559150695801, "learning_rate": 0.000110861343456282, "loss": 6.0194, "step": 1220 }, { "epoch": 0.984, "grad_norm": 7.7551703453063965, "learning_rate": 0.00010955702180676632, "loss": 5.7078, "step": 1230 }, { "epoch": 0.992, "grad_norm": 7.658422470092773, "learning_rate": 0.00010825105713867724, "loss": 5.6615, "step": 1240 }, { "epoch": 1.0, "grad_norm": 6.197235584259033, "learning_rate": 0.0001069436739700968, "loss": 5.7792, "step": 1250 }, { "epoch": 1.008, "grad_norm": 6.991013526916504, "learning_rate": 0.00010563509706297188, "loss": 5.3963, "step": 1260 }, { "epoch": 1.016, "grad_norm": 7.606940269470215, "learning_rate": 0.00010432555138447404, "loss": 5.9168, "step": 1270 }, { "epoch": 1.024, "grad_norm": 6.3427042961120605, "learning_rate": 0.0001030152620683233, "loss": 5.6694, "step": 1280 }, { "epoch": 1.032, "grad_norm": 5.197830677032471, "learning_rate": 0.00010170445437608403, "loss": 5.4628, "step": 1290 }, { "epoch": 1.04, "grad_norm": 7.1471381187438965, "learning_rate": 0.00010039335365843851, "loss": 5.4371, "step": 1300 }, { "epoch": 1.048, "grad_norm": 6.546257495880127, "learning_rate": 9.908218531644521e-05, "loss": 5.7931, "step": 1310 }, { "epoch": 1.056, "grad_norm": 8.735387802124023, "learning_rate": 9.77711747627883e-05, "loss": 5.354, "step": 1320 }, { "epoch": 1.064, "grad_norm": 6.352960109710693, "learning_rate": 9.646054738302551e-05, "loss": 5.0677, "step": 1330 }, { "epoch": 1.072, "grad_norm": 7.88266658782959, "learning_rate": 9.515052849684019e-05, "loss": 5.4004, "step": 1340 }, { "epoch": 1.08, "grad_norm": 5.571359634399414, "learning_rate": 9.384134331930513e-05, "loss": 5.3187, "step": 1350 }, { "epoch": 1.088, "grad_norm": 8.573116302490234, "learning_rate": 9.253321692216402e-05, "loss": 5.2719, "step": 1360 }, { "epoch": 1.096, "grad_norm": 9.162277221679688, "learning_rate": 9.122637419513778e-05, "loss": 5.6781, "step": 1370 }, { "epoch": 1.104, "grad_norm": 11.947822570800781, "learning_rate": 8.992103980726207e-05, "loss": 5.593, "step": 1380 }, { "epoch": 1.112, "grad_norm": 8.124156951904297, "learning_rate": 8.861743816826274e-05, "loss": 5.3142, "step": 1390 }, { "epoch": 1.12, "grad_norm": 7.134088039398193, "learning_rate": 8.731579338997594e-05, "loss": 5.3372, "step": 1400 }, { "epoch": 1.1280000000000001, "grad_norm": 8.928452491760254, "learning_rate": 8.601632924781935e-05, "loss": 5.4594, "step": 1410 }, { "epoch": 1.1360000000000001, "grad_norm": 5.617236614227295, "learning_rate": 8.471926914232137e-05, "loss": 5.0226, "step": 1420 }, { "epoch": 1.144, "grad_norm": 6.080244064331055, "learning_rate": 8.34248360607145e-05, "loss": 5.3677, "step": 1430 }, { "epoch": 1.152, "grad_norm": 9.846885681152344, "learning_rate": 8.213325253860013e-05, "loss": 5.2629, "step": 1440 }, { "epoch": 1.16, "grad_norm": 10.537776947021484, "learning_rate": 8.084474062169071e-05, "loss": 5.5022, "step": 1450 }, { "epoch": 1.168, "grad_norm": 5.898927688598633, "learning_rate": 7.955952182763624e-05, "loss": 5.2349, "step": 1460 }, { "epoch": 1.176, "grad_norm": 6.621062755584717, "learning_rate": 7.827781710794146e-05, "loss": 5.4929, "step": 1470 }, { "epoch": 1.184, "grad_norm": 6.793276309967041, "learning_rate": 7.699984680998063e-05, "loss": 5.136, "step": 1480 }, { "epoch": 1.192, "grad_norm": 8.44278335571289, "learning_rate": 7.57258306391157e-05, "loss": 4.9147, "step": 1490 }, { "epoch": 1.2, "grad_norm": 8.787193298339844, "learning_rate": 7.445598762092537e-05, "loss": 5.4473, "step": 1500 }, { "epoch": 1.208, "grad_norm": 7.027050495147705, "learning_rate": 7.319053606355061e-05, "loss": 5.1231, "step": 1510 }, { "epoch": 1.216, "grad_norm": 6.161197662353516, "learning_rate": 7.192969352016383e-05, "loss": 5.1066, "step": 1520 }, { "epoch": 1.224, "grad_norm": 5.8758745193481445, "learning_rate": 7.067367675156758e-05, "loss": 5.1743, "step": 1530 }, { "epoch": 1.232, "grad_norm": 8.138237953186035, "learning_rate": 6.942270168892959e-05, "loss": 5.2096, "step": 1540 }, { "epoch": 1.24, "grad_norm": 7.498131275177002, "learning_rate": 6.817698339666066e-05, "loss": 5.1718, "step": 1550 }, { "epoch": 1.248, "grad_norm": 5.885447025299072, "learning_rate": 6.693673603544097e-05, "loss": 5.4034, "step": 1560 }, { "epoch": 1.256, "grad_norm": 5.564108848571777, "learning_rate": 6.570217282540247e-05, "loss": 5.0762, "step": 1570 }, { "epoch": 1.264, "grad_norm": 5.904695510864258, "learning_rate": 6.447350600947236e-05, "loss": 4.9994, "step": 1580 }, { "epoch": 1.272, "grad_norm": 6.91570520401001, "learning_rate": 6.325094681688503e-05, "loss": 5.1286, "step": 1590 }, { "epoch": 1.28, "grad_norm": 5.768303394317627, "learning_rate": 6.203470542686798e-05, "loss": 4.975, "step": 1600 }, { "epoch": 1.288, "grad_norm": 9.351409912109375, "learning_rate": 6.082499093250831e-05, "loss": 4.9613, "step": 1610 }, { "epoch": 1.296, "grad_norm": 9.572948455810547, "learning_rate": 5.962201130480618e-05, "loss": 4.7479, "step": 1620 }, { "epoch": 1.304, "grad_norm": 7.169508457183838, "learning_rate": 5.842597335692067e-05, "loss": 5.2066, "step": 1630 }, { "epoch": 1.312, "grad_norm": 7.029344081878662, "learning_rate": 5.7237082708615186e-05, "loss": 4.7997, "step": 1640 }, { "epoch": 1.32, "grad_norm": 6.824025630950928, "learning_rate": 5.605554375090784e-05, "loss": 4.6027, "step": 1650 }, { "epoch": 1.328, "grad_norm": 12.187256813049316, "learning_rate": 5.488155961093298e-05, "loss": 5.03, "step": 1660 }, { "epoch": 1.336, "grad_norm": 5.913212776184082, "learning_rate": 5.3715332117020154e-05, "loss": 4.7387, "step": 1670 }, { "epoch": 1.3439999999999999, "grad_norm": 6.245068550109863, "learning_rate": 5.2557061763996354e-05, "loss": 5.0907, "step": 1680 }, { "epoch": 1.3519999999999999, "grad_norm": 9.76169490814209, "learning_rate": 5.140694767871733e-05, "loss": 5.1469, "step": 1690 }, { "epoch": 1.3599999999999999, "grad_norm": 10.653762817382812, "learning_rate": 5.026518758583434e-05, "loss": 4.9712, "step": 1700 }, { "epoch": 1.3679999999999999, "grad_norm": 7.157083511352539, "learning_rate": 4.9131977773801765e-05, "loss": 4.6749, "step": 1710 }, { "epoch": 1.376, "grad_norm": 8.09749698638916, "learning_rate": 4.8007513061131736e-05, "loss": 4.5003, "step": 1720 }, { "epoch": 1.384, "grad_norm": 6.536433219909668, "learning_rate": 4.6891986762901406e-05, "loss": 4.2644, "step": 1730 }, { "epoch": 1.392, "grad_norm": 7.866100311279297, "learning_rate": 4.578559065751873e-05, "loss": 4.721, "step": 1740 }, { "epoch": 1.4, "grad_norm": 5.806945323944092, "learning_rate": 4.4688514953752515e-05, "loss": 4.6537, "step": 1750 }, { "epoch": 1.408, "grad_norm": 8.229887962341309, "learning_rate": 4.360094825803203e-05, "loss": 4.7068, "step": 1760 }, { "epoch": 1.416, "grad_norm": 10.401880264282227, "learning_rate": 4.25230775420224e-05, "loss": 4.5295, "step": 1770 }, { "epoch": 1.424, "grad_norm": 7.166602611541748, "learning_rate": 4.145508811048089e-05, "loss": 4.5715, "step": 1780 }, { "epoch": 1.432, "grad_norm": 8.82656478881836, "learning_rate": 4.039716356939981e-05, "loss": 4.8924, "step": 1790 }, { "epoch": 1.44, "grad_norm": 8.747090339660645, "learning_rate": 3.9349485794441395e-05, "loss": 4.5401, "step": 1800 }, { "epoch": 1.448, "grad_norm": 12.537016868591309, "learning_rate": 3.831223489967025e-05, "loss": 4.7768, "step": 1810 }, { "epoch": 1.456, "grad_norm": 6.526289939880371, "learning_rate": 3.728558920658868e-05, "loss": 4.4916, "step": 1820 }, { "epoch": 1.464, "grad_norm": 7.882378101348877, "learning_rate": 3.6269725213479846e-05, "loss": 4.5072, "step": 1830 }, { "epoch": 1.472, "grad_norm": 10.795116424560547, "learning_rate": 3.526481756506498e-05, "loss": 4.7454, "step": 1840 }, { "epoch": 1.48, "grad_norm": 9.107118606567383, "learning_rate": 3.4271039022478694e-05, "loss": 4.3667, "step": 1850 }, { "epoch": 1.488, "grad_norm": 8.725346565246582, "learning_rate": 3.328856043356837e-05, "loss": 4.8818, "step": 1860 }, { "epoch": 1.496, "grad_norm": 8.258055686950684, "learning_rate": 3.231755070352249e-05, "loss": 4.4017, "step": 1870 }, { "epoch": 1.504, "grad_norm": 9.932522773742676, "learning_rate": 3.1358176765832915e-05, "loss": 4.3575, "step": 1880 }, { "epoch": 1.512, "grad_norm": 7.825517654418945, "learning_rate": 3.041060355359594e-05, "loss": 4.2122, "step": 1890 }, { "epoch": 1.52, "grad_norm": 8.71921443939209, "learning_rate": 2.9474993971157605e-05, "loss": 4.4648, "step": 1900 }, { "epoch": 1.528, "grad_norm": 9.51145076751709, "learning_rate": 2.8551508866107514e-05, "loss": 4.4388, "step": 1910 }, { "epoch": 1.536, "grad_norm": 6.032364845275879, "learning_rate": 2.764030700162633e-05, "loss": 4.6544, "step": 1920 }, { "epoch": 1.544, "grad_norm": 6.442480564117432, "learning_rate": 2.6741545029191674e-05, "loss": 4.3281, "step": 1930 }, { "epoch": 1.552, "grad_norm": 9.142407417297363, "learning_rate": 2.5855377461646902e-05, "loss": 4.7258, "step": 1940 }, { "epoch": 1.56, "grad_norm": 13.672632217407227, "learning_rate": 2.4981956646637815e-05, "loss": 4.076, "step": 1950 }, { "epoch": 1.568, "grad_norm": 10.532658576965332, "learning_rate": 2.412143274042129e-05, "loss": 4.3765, "step": 1960 }, { "epoch": 1.576, "grad_norm": 6.73195743560791, "learning_rate": 2.327395368205084e-05, "loss": 4.4419, "step": 1970 }, { "epoch": 1.584, "grad_norm": 7.2371745109558105, "learning_rate": 2.243966516794338e-05, "loss": 4.5395, "step": 1980 }, { "epoch": 1.592, "grad_norm": 9.002269744873047, "learning_rate": 2.161871062683145e-05, "loss": 4.5577, "step": 1990 }, { "epoch": 1.6, "grad_norm": 10.184229850769043, "learning_rate": 2.0811231195105186e-05, "loss": 4.3631, "step": 2000 }, { "epoch": 1.608, "grad_norm": 7.962112903594971, "learning_rate": 2.0017365692548717e-05, "loss": 4.6822, "step": 2010 }, { "epoch": 1.616, "grad_norm": 7.3697190284729, "learning_rate": 1.9237250598474564e-05, "loss": 4.1389, "step": 2020 }, { "epoch": 1.624, "grad_norm": 8.01472282409668, "learning_rate": 1.8471020028260368e-05, "loss": 3.9598, "step": 2030 }, { "epoch": 1.6320000000000001, "grad_norm": 5.376262187957764, "learning_rate": 1.7718805710292208e-05, "loss": 4.3192, "step": 2040 }, { "epoch": 1.6400000000000001, "grad_norm": 10.927371978759766, "learning_rate": 1.6980736963318177e-05, "loss": 4.192, "step": 2050 }, { "epoch": 1.6480000000000001, "grad_norm": 6.14822244644165, "learning_rate": 1.625694067421626e-05, "loss": 4.4155, "step": 2060 }, { "epoch": 1.6560000000000001, "grad_norm": 8.27953815460205, "learning_rate": 1.554754127618019e-05, "loss": 4.5526, "step": 2070 }, { "epoch": 1.6640000000000001, "grad_norm": 6.160037517547607, "learning_rate": 1.4852660727327361e-05, "loss": 4.453, "step": 2080 }, { "epoch": 1.6720000000000002, "grad_norm": 7.098100662231445, "learning_rate": 1.4172418489731942e-05, "loss": 4.4741, "step": 2090 }, { "epoch": 1.6800000000000002, "grad_norm": 6.903653144836426, "learning_rate": 1.3506931508887333e-05, "loss": 4.0234, "step": 2100 }, { "epoch": 1.688, "grad_norm": 6.967901706695557, "learning_rate": 1.2856314193601216e-05, "loss": 4.2137, "step": 2110 }, { "epoch": 1.696, "grad_norm": 7.635384559631348, "learning_rate": 1.2220678396326678e-05, "loss": 4.38, "step": 2120 }, { "epoch": 1.704, "grad_norm": 9.351762771606445, "learning_rate": 1.160013339393281e-05, "loss": 4.4418, "step": 2130 }, { "epoch": 1.712, "grad_norm": 7.01410436630249, "learning_rate": 1.0994785868918101e-05, "loss": 4.4396, "step": 2140 }, { "epoch": 1.72, "grad_norm": 6.974796772003174, "learning_rate": 1.040473989106988e-05, "loss": 4.5075, "step": 2150 }, { "epoch": 1.728, "grad_norm": 7.545105934143066, "learning_rate": 9.830096899572927e-06, "loss": 4.7132, "step": 2160 }, { "epoch": 1.736, "grad_norm": 7.074202537536621, "learning_rate": 9.270955685570226e-06, "loss": 4.2393, "step": 2170 }, { "epoch": 1.744, "grad_norm": 6.088277339935303, "learning_rate": 8.727412375179156e-06, "loss": 4.8092, "step": 2180 }, { "epoch": 1.752, "grad_norm": 5.859469413757324, "learning_rate": 8.199560412965634e-06, "loss": 4.6403, "step": 2190 }, { "epoch": 1.76, "grad_norm": 6.770120143890381, "learning_rate": 7.687490545879461e-06, "loss": 4.2544, "step": 2200 }, { "epoch": 1.768, "grad_norm": 8.092095375061035, "learning_rate": 7.191290807653251e-06, "loss": 4.454, "step": 2210 }, { "epoch": 1.776, "grad_norm": 6.498937606811523, "learning_rate": 6.711046503667983e-06, "loss": 4.0243, "step": 2220 }, { "epoch": 1.784, "grad_norm": 5.894200801849365, "learning_rate": 6.24684019628744e-06, "loss": 4.0666, "step": 2230 }, { "epoch": 1.792, "grad_norm": 6.309505462646484, "learning_rate": 5.79875169066435e-06, "loss": 4.2441, "step": 2240 }, { "epoch": 1.8, "grad_norm": 6.20737886428833, "learning_rate": 5.366858021020471e-06, "loss": 4.1951, "step": 2250 }, { "epoch": 1.808, "grad_norm": 8.314438819885254, "learning_rate": 4.951233437403102e-06, "loss": 4.4043, "step": 2260 }, { "epoch": 1.8159999999999998, "grad_norm": 6.5354108810424805, "learning_rate": 4.551949392920118e-06, "loss": 4.1528, "step": 2270 }, { "epoch": 1.8239999999999998, "grad_norm": 8.503190040588379, "learning_rate": 4.169074531456063e-06, "loss": 3.8358, "step": 2280 }, { "epoch": 1.8319999999999999, "grad_norm": 7.091247081756592, "learning_rate": 3.802674675870932e-06, "loss": 4.3702, "step": 2290 }, { "epoch": 1.8399999999999999, "grad_norm": 6.265818119049072, "learning_rate": 3.4528128166842033e-06, "loss": 4.3354, "step": 2300 }, { "epoch": 1.8479999999999999, "grad_norm": 7.026565074920654, "learning_rate": 3.119549101245567e-06, "loss": 3.9832, "step": 2310 }, { "epoch": 1.8559999999999999, "grad_norm": 6.407781600952148, "learning_rate": 2.8029408233946177e-06, "loss": 4.1522, "step": 2320 }, { "epoch": 1.8639999999999999, "grad_norm": 13.10364055633545, "learning_rate": 2.503042413611001e-06, "loss": 4.354, "step": 2330 }, { "epoch": 1.8719999999999999, "grad_norm": 12.458951950073242, "learning_rate": 2.219905429656899e-06, "loss": 4.5043, "step": 2340 }, { "epoch": 1.88, "grad_norm": 9.545763969421387, "learning_rate": 1.9535785477133195e-06, "loss": 4.2234, "step": 2350 }, { "epoch": 1.888, "grad_norm": 9.243309020996094, "learning_rate": 1.7041075540118578e-06, "loss": 4.2382, "step": 2360 }, { "epoch": 1.896, "grad_norm": 6.87458610534668, "learning_rate": 1.4715353369631924e-06, "loss": 3.9852, "step": 2370 }, { "epoch": 1.904, "grad_norm": 6.957976341247559, "learning_rate": 1.2559018797838384e-06, "loss": 3.9335, "step": 2380 }, { "epoch": 1.912, "grad_norm": 5.689143657684326, "learning_rate": 1.0572442536223692e-06, "loss": 4.0, "step": 2390 }, { "epoch": 1.92, "grad_norm": 5.896793842315674, "learning_rate": 8.755966111861913e-07, "loss": 4.2807, "step": 2400 } ], "logging_steps": 10, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }