| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.92, | |
| "eval_steps": 500, | |
| "global_step": 2400, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0008, | |
| "grad_norm": 10.566986083984375, | |
| "learning_rate": 0.0, | |
| "loss": 14.1421, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": 12.296218872070312, | |
| "learning_rate": 1.730769230769231e-05, | |
| "loss": 13.35, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 6.457699775695801, | |
| "learning_rate": 3.653846153846154e-05, | |
| "loss": 11.8957, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "grad_norm": 6.461245059967041, | |
| "learning_rate": 5.576923076923077e-05, | |
| "loss": 11.2465, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 6.351202011108398, | |
| "learning_rate": 7.500000000000001e-05, | |
| "loss": 10.7197, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 5.675596714019775, | |
| "learning_rate": 9.423076923076924e-05, | |
| "loss": 10.4108, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 6.45210599899292, | |
| "learning_rate": 0.00011346153846153846, | |
| "loss": 9.499, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "grad_norm": 3.97434663772583, | |
| "learning_rate": 0.0001326923076923077, | |
| "loss": 9.2464, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 4.443643093109131, | |
| "learning_rate": 0.00015192307692307692, | |
| "loss": 9.0007, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "grad_norm": 4.448770046234131, | |
| "learning_rate": 0.00017115384615384616, | |
| "loss": 8.8057, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 5.425487041473389, | |
| "learning_rate": 0.00019038461538461538, | |
| "loss": 8.9744, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "grad_norm": 4.242831230163574, | |
| "learning_rate": 0.00019999785100910492, | |
| "loss": 8.9241, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 3.6791751384735107, | |
| "learning_rate": 0.00019998065963611962, | |
| "loss": 8.8742, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "grad_norm": 5.0801777839660645, | |
| "learning_rate": 0.00019994627984564557, | |
| "loss": 8.8388, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 5.117883205413818, | |
| "learning_rate": 0.00019989471754816785, | |
| "loss": 8.412, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 5.7691802978515625, | |
| "learning_rate": 0.00019982598160814377, | |
| "loss": 8.7482, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 4.111888885498047, | |
| "learning_rate": 0.00019974008384247908, | |
| "loss": 8.6456, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.136, | |
| "grad_norm": 3.717806816101074, | |
| "learning_rate": 0.0001996370390184965, | |
| "loss": 8.3429, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 5.897804260253906, | |
| "learning_rate": 0.00019951686485139672, | |
| "loss": 8.5481, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.152, | |
| "grad_norm": 4.452871799468994, | |
| "learning_rate": 0.00019937958200121303, | |
| "loss": 8.6357, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.103796482086182, | |
| "learning_rate": 0.0001992252140692594, | |
| "loss": 8.5245, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.168, | |
| "grad_norm": 5.806966304779053, | |
| "learning_rate": 0.00019905378759407314, | |
| "loss": 8.4875, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 4.293936729431152, | |
| "learning_rate": 0.00019886533204685228, | |
| "loss": 8.3073, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.184, | |
| "grad_norm": 3.6153390407562256, | |
| "learning_rate": 0.00019865987982638914, | |
| "loss": 8.5256, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 5.031829357147217, | |
| "learning_rate": 0.00019843746625350028, | |
| "loss": 8.3936, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 4.666059970855713, | |
| "learning_rate": 0.0001981981295649543, | |
| "loss": 7.9453, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 4.338928699493408, | |
| "learning_rate": 0.0001979419109068982, | |
| "loss": 8.5403, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.216, | |
| "grad_norm": 5.491336345672607, | |
| "learning_rate": 0.0001976688543277838, | |
| "loss": 8.5499, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 4.206221580505371, | |
| "learning_rate": 0.00019737900677079483, | |
| "loss": 8.202, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.232, | |
| "grad_norm": 4.248091220855713, | |
| "learning_rate": 0.0001970724180657768, | |
| "loss": 8.1605, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 4.153928279876709, | |
| "learning_rate": 0.00019674914092067015, | |
| "loss": 8.2001, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.248, | |
| "grad_norm": 4.331130027770996, | |
| "learning_rate": 0.00019640923091244906, | |
| "loss": 8.3479, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 4.888726711273193, | |
| "learning_rate": 0.0001960527464775666, | |
| "loss": 8.0348, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.264, | |
| "grad_norm": 4.52598762512207, | |
| "learning_rate": 0.00019567974890190865, | |
| "loss": 7.9916, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 5.109200477600098, | |
| "learning_rate": 0.00019529030231025776, | |
| "loss": 7.8621, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 4.465454578399658, | |
| "learning_rate": 0.0001948844736552688, | |
| "loss": 8.1529, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 4.94981050491333, | |
| "learning_rate": 0.00019446233270595896, | |
| "loss": 7.9475, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.296, | |
| "grad_norm": 4.898144245147705, | |
| "learning_rate": 0.00019402395203571286, | |
| "loss": 8.1256, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 4.506499767303467, | |
| "learning_rate": 0.00019356940700980625, | |
| "loss": 7.9425, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.312, | |
| "grad_norm": 4.715751647949219, | |
| "learning_rate": 0.00019309877577244924, | |
| "loss": 7.8867, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 6.232232570648193, | |
| "learning_rate": 0.00019261213923335194, | |
| "loss": 8.0137, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.328, | |
| "grad_norm": 5.4095258712768555, | |
| "learning_rate": 0.0001921095810538148, | |
| "loss": 7.655, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 8.021153450012207, | |
| "learning_rate": 0.00019159118763234555, | |
| "loss": 8.239, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.344, | |
| "grad_norm": 4.821053504943848, | |
| "learning_rate": 0.0001910570480898061, | |
| "loss": 7.6991, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 4.676478385925293, | |
| "learning_rate": 0.00019050725425409076, | |
| "loss": 7.9241, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 6.322430610656738, | |
| "learning_rate": 0.0001899419006443397, | |
| "loss": 8.0194, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 3.8518083095550537, | |
| "learning_rate": 0.0001893610844546894, | |
| "loss": 7.7739, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.376, | |
| "grad_norm": 4.104583263397217, | |
| "learning_rate": 0.00018876490553756313, | |
| "loss": 7.7344, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 5.830111980438232, | |
| "learning_rate": 0.00018815346638650487, | |
| "loss": 7.4569, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.392, | |
| "grad_norm": 5.279020309448242, | |
| "learning_rate": 0.0001875268721185585, | |
| "loss": 7.6329, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 6.486227989196777, | |
| "learning_rate": 0.00018688523045619674, | |
| "loss": 7.6998, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.408, | |
| "grad_norm": 3.6189093589782715, | |
| "learning_rate": 0.00018622865170880151, | |
| "loss": 7.3692, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 4.793766498565674, | |
| "learning_rate": 0.00018555724875369997, | |
| "loss": 7.4387, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.424, | |
| "grad_norm": 3.8143932819366455, | |
| "learning_rate": 0.00018487113701675883, | |
| "loss": 7.6311, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 5.568665027618408, | |
| "learning_rate": 0.00018417043445254075, | |
| "loss": 7.0967, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 5.059378623962402, | |
| "learning_rate": 0.00018345526152402573, | |
| "loss": 7.2701, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 5.139848232269287, | |
| "learning_rate": 0.00018272574118190167, | |
| "loss": 7.2647, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.456, | |
| "grad_norm": 6.435779571533203, | |
| "learning_rate": 0.00018198199884342673, | |
| "loss": 7.161, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 5.199296951293945, | |
| "learning_rate": 0.0001812241623708682, | |
| "loss": 7.5137, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.472, | |
| "grad_norm": 4.161045074462891, | |
| "learning_rate": 0.00018045236204952044, | |
| "loss": 7.356, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 5.084685325622559, | |
| "learning_rate": 0.00017966673056530686, | |
| "loss": 7.3859, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.488, | |
| "grad_norm": 7.625977516174316, | |
| "learning_rate": 0.00017886740298196863, | |
| "loss": 7.0858, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 5.039551734924316, | |
| "learning_rate": 0.00017805451671784516, | |
| "loss": 7.4888, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.504, | |
| "grad_norm": 5.905925273895264, | |
| "learning_rate": 0.00017722821152224924, | |
| "loss": 7.2393, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 9.78266716003418, | |
| "learning_rate": 0.00017638862945144182, | |
| "loss": 7.4102, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 6.097099781036377, | |
| "learning_rate": 0.00017553591484421004, | |
| "loss": 7.297, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 4.922943115234375, | |
| "learning_rate": 0.00017467021429705285, | |
| "loss": 7.3741, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.536, | |
| "grad_norm": 5.79689884185791, | |
| "learning_rate": 0.00017379167663897856, | |
| "loss": 7.1571, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 3.795285701751709, | |
| "learning_rate": 0.00017290045290591858, | |
| "loss": 6.8519, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.552, | |
| "grad_norm": 4.697098731994629, | |
| "learning_rate": 0.0001719966963147616, | |
| "loss": 7.111, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 4.46597146987915, | |
| "learning_rate": 0.000171080562237013, | |
| "loss": 7.0099, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.568, | |
| "grad_norm": 6.007630348205566, | |
| "learning_rate": 0.00017015220817208376, | |
| "loss": 7.326, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 4.53595495223999, | |
| "learning_rate": 0.0001692117937202136, | |
| "loss": 7.0785, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.584, | |
| "grad_norm": 6.850202560424805, | |
| "learning_rate": 0.00016825948055503294, | |
| "loss": 7.2174, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 5.931501865386963, | |
| "learning_rate": 0.00016729543239576828, | |
| "loss": 6.995, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 5.900391101837158, | |
| "learning_rate": 0.0001663198149790961, | |
| "loss": 6.8995, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 5.841864109039307, | |
| "learning_rate": 0.00016533279603064978, | |
| "loss": 6.7632, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.616, | |
| "grad_norm": 5.902273178100586, | |
| "learning_rate": 0.00016433454523618482, | |
| "loss": 6.9055, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 5.881319046020508, | |
| "learning_rate": 0.00016332523421240658, | |
| "loss": 6.686, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.632, | |
| "grad_norm": 5.178507328033447, | |
| "learning_rate": 0.00016230503647746657, | |
| "loss": 6.6281, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 4.807728290557861, | |
| "learning_rate": 0.00016127412742113185, | |
| "loss": 6.9373, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.648, | |
| "grad_norm": 5.6972975730896, | |
| "learning_rate": 0.000160232684274632, | |
| "loss": 6.7103, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 5.271605968475342, | |
| "learning_rate": 0.00015918088608019043, | |
| "loss": 6.9209, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.664, | |
| "grad_norm": 5.366481304168701, | |
| "learning_rate": 0.00015811891366024358, | |
| "loss": 6.9491, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 4.8482866287231445, | |
| "learning_rate": 0.00015704694958635468, | |
| "loss": 6.6238, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 7.680044651031494, | |
| "learning_rate": 0.0001559651781478263, | |
| "loss": 6.7933, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 4.3361496925354, | |
| "learning_rate": 0.00015487378532001782, | |
| "loss": 6.6105, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.696, | |
| "grad_norm": 8.266775131225586, | |
| "learning_rate": 0.0001537729587323732, | |
| "loss": 6.5274, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 5.751520156860352, | |
| "learning_rate": 0.00015266288763616403, | |
| "loss": 6.6585, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.712, | |
| "grad_norm": 6.401413440704346, | |
| "learning_rate": 0.000151543762871954, | |
| "loss": 6.5739, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 5.519962310791016, | |
| "learning_rate": 0.0001504157768367901, | |
| "loss": 6.6899, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.728, | |
| "grad_norm": 5.094054222106934, | |
| "learning_rate": 0.00014927912345112616, | |
| "loss": 6.2008, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 6.107059955596924, | |
| "learning_rate": 0.0001481339981254846, | |
| "loss": 6.2149, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.744, | |
| "grad_norm": 6.355636119842529, | |
| "learning_rate": 0.00014698059772686202, | |
| "loss": 6.7521, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 7.230486869812012, | |
| "learning_rate": 0.00014581912054488413, | |
| "loss": 6.3823, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 4.960805416107178, | |
| "learning_rate": 0.00014464976625771654, | |
| "loss": 6.6149, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 5.269943714141846, | |
| "learning_rate": 0.00014347273589773637, | |
| "loss": 6.1001, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.776, | |
| "grad_norm": 6.840855598449707, | |
| "learning_rate": 0.0001422882318169716, | |
| "loss": 6.3677, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 5.1594038009643555, | |
| "learning_rate": 0.00014109645765231278, | |
| "loss": 6.3929, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.792, | |
| "grad_norm": 5.096086025238037, | |
| "learning_rate": 0.00013989761829050475, | |
| "loss": 6.1354, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 5.235525608062744, | |
| "learning_rate": 0.00013869191983292283, | |
| "loss": 6.4954, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.808, | |
| "grad_norm": 5.518918991088867, | |
| "learning_rate": 0.00013747956956014037, | |
| "loss": 6.449, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 4.848990440368652, | |
| "learning_rate": 0.00013626077589629367, | |
| "loss": 6.392, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.824, | |
| "grad_norm": 7.234468460083008, | |
| "learning_rate": 0.00013503574837325015, | |
| "loss": 6.5465, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 6.593731880187988, | |
| "learning_rate": 0.00013380469759458643, | |
| "loss": 6.574, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 5.687368392944336, | |
| "learning_rate": 0.00013256783519938154, | |
| "loss": 6.1995, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 4.857635498046875, | |
| "learning_rate": 0.00013132537382583274, | |
| "loss": 5.8422, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.856, | |
| "grad_norm": 7.068734645843506, | |
| "learning_rate": 0.00013007752707469924, | |
| "loss": 6.0601, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 4.396754741668701, | |
| "learning_rate": 0.00012882450947258045, | |
| "loss": 5.8387, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.872, | |
| "grad_norm": 9.501909255981445, | |
| "learning_rate": 0.0001275665364350352, | |
| "loss": 5.9831, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 6.957056522369385, | |
| "learning_rate": 0.00012630382422954795, | |
| "loss": 6.1359, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.888, | |
| "grad_norm": 5.782343864440918, | |
| "learning_rate": 0.00012503658993834885, | |
| "loss": 6.0754, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 5.452831268310547, | |
| "learning_rate": 0.0001237650514210932, | |
| "loss": 5.6186, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.904, | |
| "grad_norm": 6.382038593292236, | |
| "learning_rate": 0.00012248942727740783, | |
| "loss": 5.7174, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 6.288851261138916, | |
| "learning_rate": 0.00012120993680931003, | |
| "loss": 5.6529, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 6.7387166023254395, | |
| "learning_rate": 0.0001199267999835055, | |
| "loss": 5.603, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 5.694065093994141, | |
| "learning_rate": 0.00011864023739357235, | |
| "loss": 5.2627, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.936, | |
| "grad_norm": 6.711731910705566, | |
| "learning_rate": 0.00011735047022203741, | |
| "loss": 5.4706, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 5.517411708831787, | |
| "learning_rate": 0.00011605772020235072, | |
| "loss": 5.6277, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.952, | |
| "grad_norm": 6.785055160522461, | |
| "learning_rate": 0.00011476220958076607, | |
| "loss": 5.9611, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 5.702793121337891, | |
| "learning_rate": 0.00011346416107813267, | |
| "loss": 5.9226, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.968, | |
| "grad_norm": 7.974459648132324, | |
| "learning_rate": 0.00011216379785160578, | |
| "loss": 5.8474, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": 5.838559150695801, | |
| "learning_rate": 0.000110861343456282, | |
| "loss": 6.0194, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.984, | |
| "grad_norm": 7.7551703453063965, | |
| "learning_rate": 0.00010955702180676632, | |
| "loss": 5.7078, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 7.658422470092773, | |
| "learning_rate": 0.00010825105713867724, | |
| "loss": 5.6615, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 6.197235584259033, | |
| "learning_rate": 0.0001069436739700968, | |
| "loss": 5.7792, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.008, | |
| "grad_norm": 6.991013526916504, | |
| "learning_rate": 0.00010563509706297188, | |
| "loss": 5.3963, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.016, | |
| "grad_norm": 7.606940269470215, | |
| "learning_rate": 0.00010432555138447404, | |
| "loss": 5.9168, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.024, | |
| "grad_norm": 6.3427042961120605, | |
| "learning_rate": 0.0001030152620683233, | |
| "loss": 5.6694, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.032, | |
| "grad_norm": 5.197830677032471, | |
| "learning_rate": 0.00010170445437608403, | |
| "loss": 5.4628, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 7.1471381187438965, | |
| "learning_rate": 0.00010039335365843851, | |
| "loss": 5.4371, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.048, | |
| "grad_norm": 6.546257495880127, | |
| "learning_rate": 9.908218531644521e-05, | |
| "loss": 5.7931, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.056, | |
| "grad_norm": 8.735387802124023, | |
| "learning_rate": 9.77711747627883e-05, | |
| "loss": 5.354, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.064, | |
| "grad_norm": 6.352960109710693, | |
| "learning_rate": 9.646054738302551e-05, | |
| "loss": 5.0677, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.072, | |
| "grad_norm": 7.88266658782959, | |
| "learning_rate": 9.515052849684019e-05, | |
| "loss": 5.4004, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 5.571359634399414, | |
| "learning_rate": 9.384134331930513e-05, | |
| "loss": 5.3187, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.088, | |
| "grad_norm": 8.573116302490234, | |
| "learning_rate": 9.253321692216402e-05, | |
| "loss": 5.2719, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.096, | |
| "grad_norm": 9.162277221679688, | |
| "learning_rate": 9.122637419513778e-05, | |
| "loss": 5.6781, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.104, | |
| "grad_norm": 11.947822570800781, | |
| "learning_rate": 8.992103980726207e-05, | |
| "loss": 5.593, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.112, | |
| "grad_norm": 8.124156951904297, | |
| "learning_rate": 8.861743816826274e-05, | |
| "loss": 5.3142, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 7.134088039398193, | |
| "learning_rate": 8.731579338997594e-05, | |
| "loss": 5.3372, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.1280000000000001, | |
| "grad_norm": 8.928452491760254, | |
| "learning_rate": 8.601632924781935e-05, | |
| "loss": 5.4594, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.1360000000000001, | |
| "grad_norm": 5.617236614227295, | |
| "learning_rate": 8.471926914232137e-05, | |
| "loss": 5.0226, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.144, | |
| "grad_norm": 6.080244064331055, | |
| "learning_rate": 8.34248360607145e-05, | |
| "loss": 5.3677, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.152, | |
| "grad_norm": 9.846885681152344, | |
| "learning_rate": 8.213325253860013e-05, | |
| "loss": 5.2629, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 10.537776947021484, | |
| "learning_rate": 8.084474062169071e-05, | |
| "loss": 5.5022, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.168, | |
| "grad_norm": 5.898927688598633, | |
| "learning_rate": 7.955952182763624e-05, | |
| "loss": 5.2349, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.176, | |
| "grad_norm": 6.621062755584717, | |
| "learning_rate": 7.827781710794146e-05, | |
| "loss": 5.4929, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.184, | |
| "grad_norm": 6.793276309967041, | |
| "learning_rate": 7.699984680998063e-05, | |
| "loss": 5.136, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.192, | |
| "grad_norm": 8.44278335571289, | |
| "learning_rate": 7.57258306391157e-05, | |
| "loss": 4.9147, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 8.787193298339844, | |
| "learning_rate": 7.445598762092537e-05, | |
| "loss": 5.4473, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.208, | |
| "grad_norm": 7.027050495147705, | |
| "learning_rate": 7.319053606355061e-05, | |
| "loss": 5.1231, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.216, | |
| "grad_norm": 6.161197662353516, | |
| "learning_rate": 7.192969352016383e-05, | |
| "loss": 5.1066, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.224, | |
| "grad_norm": 5.8758745193481445, | |
| "learning_rate": 7.067367675156758e-05, | |
| "loss": 5.1743, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.232, | |
| "grad_norm": 8.138237953186035, | |
| "learning_rate": 6.942270168892959e-05, | |
| "loss": 5.2096, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 7.498131275177002, | |
| "learning_rate": 6.817698339666066e-05, | |
| "loss": 5.1718, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.248, | |
| "grad_norm": 5.885447025299072, | |
| "learning_rate": 6.693673603544097e-05, | |
| "loss": 5.4034, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.256, | |
| "grad_norm": 5.564108848571777, | |
| "learning_rate": 6.570217282540247e-05, | |
| "loss": 5.0762, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.264, | |
| "grad_norm": 5.904695510864258, | |
| "learning_rate": 6.447350600947236e-05, | |
| "loss": 4.9994, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.272, | |
| "grad_norm": 6.91570520401001, | |
| "learning_rate": 6.325094681688503e-05, | |
| "loss": 5.1286, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 5.768303394317627, | |
| "learning_rate": 6.203470542686798e-05, | |
| "loss": 4.975, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.288, | |
| "grad_norm": 9.351409912109375, | |
| "learning_rate": 6.082499093250831e-05, | |
| "loss": 4.9613, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.296, | |
| "grad_norm": 9.572948455810547, | |
| "learning_rate": 5.962201130480618e-05, | |
| "loss": 4.7479, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.304, | |
| "grad_norm": 7.169508457183838, | |
| "learning_rate": 5.842597335692067e-05, | |
| "loss": 5.2066, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.312, | |
| "grad_norm": 7.029344081878662, | |
| "learning_rate": 5.7237082708615186e-05, | |
| "loss": 4.7997, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 6.824025630950928, | |
| "learning_rate": 5.605554375090784e-05, | |
| "loss": 4.6027, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.328, | |
| "grad_norm": 12.187256813049316, | |
| "learning_rate": 5.488155961093298e-05, | |
| "loss": 5.03, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.336, | |
| "grad_norm": 5.913212776184082, | |
| "learning_rate": 5.3715332117020154e-05, | |
| "loss": 4.7387, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 6.245068550109863, | |
| "learning_rate": 5.2557061763996354e-05, | |
| "loss": 5.0907, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.3519999999999999, | |
| "grad_norm": 9.76169490814209, | |
| "learning_rate": 5.140694767871733e-05, | |
| "loss": 5.1469, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 10.653762817382812, | |
| "learning_rate": 5.026518758583434e-05, | |
| "loss": 4.9712, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.3679999999999999, | |
| "grad_norm": 7.157083511352539, | |
| "learning_rate": 4.9131977773801765e-05, | |
| "loss": 4.6749, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.376, | |
| "grad_norm": 8.09749698638916, | |
| "learning_rate": 4.8007513061131736e-05, | |
| "loss": 4.5003, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.384, | |
| "grad_norm": 6.536433219909668, | |
| "learning_rate": 4.6891986762901406e-05, | |
| "loss": 4.2644, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.392, | |
| "grad_norm": 7.866100311279297, | |
| "learning_rate": 4.578559065751873e-05, | |
| "loss": 4.721, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 5.806945323944092, | |
| "learning_rate": 4.4688514953752515e-05, | |
| "loss": 4.6537, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.408, | |
| "grad_norm": 8.229887962341309, | |
| "learning_rate": 4.360094825803203e-05, | |
| "loss": 4.7068, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.416, | |
| "grad_norm": 10.401880264282227, | |
| "learning_rate": 4.25230775420224e-05, | |
| "loss": 4.5295, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.424, | |
| "grad_norm": 7.166602611541748, | |
| "learning_rate": 4.145508811048089e-05, | |
| "loss": 4.5715, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.432, | |
| "grad_norm": 8.82656478881836, | |
| "learning_rate": 4.039716356939981e-05, | |
| "loss": 4.8924, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 8.747090339660645, | |
| "learning_rate": 3.9349485794441395e-05, | |
| "loss": 4.5401, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.448, | |
| "grad_norm": 12.537016868591309, | |
| "learning_rate": 3.831223489967025e-05, | |
| "loss": 4.7768, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.456, | |
| "grad_norm": 6.526289939880371, | |
| "learning_rate": 3.728558920658868e-05, | |
| "loss": 4.4916, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.464, | |
| "grad_norm": 7.882378101348877, | |
| "learning_rate": 3.6269725213479846e-05, | |
| "loss": 4.5072, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.472, | |
| "grad_norm": 10.795116424560547, | |
| "learning_rate": 3.526481756506498e-05, | |
| "loss": 4.7454, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 9.107118606567383, | |
| "learning_rate": 3.4271039022478694e-05, | |
| "loss": 4.3667, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.488, | |
| "grad_norm": 8.725346565246582, | |
| "learning_rate": 3.328856043356837e-05, | |
| "loss": 4.8818, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.496, | |
| "grad_norm": 8.258055686950684, | |
| "learning_rate": 3.231755070352249e-05, | |
| "loss": 4.4017, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.504, | |
| "grad_norm": 9.932522773742676, | |
| "learning_rate": 3.1358176765832915e-05, | |
| "loss": 4.3575, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.512, | |
| "grad_norm": 7.825517654418945, | |
| "learning_rate": 3.041060355359594e-05, | |
| "loss": 4.2122, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 8.71921443939209, | |
| "learning_rate": 2.9474993971157605e-05, | |
| "loss": 4.4648, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.528, | |
| "grad_norm": 9.51145076751709, | |
| "learning_rate": 2.8551508866107514e-05, | |
| "loss": 4.4388, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.536, | |
| "grad_norm": 6.032364845275879, | |
| "learning_rate": 2.764030700162633e-05, | |
| "loss": 4.6544, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.544, | |
| "grad_norm": 6.442480564117432, | |
| "learning_rate": 2.6741545029191674e-05, | |
| "loss": 4.3281, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.552, | |
| "grad_norm": 9.142407417297363, | |
| "learning_rate": 2.5855377461646902e-05, | |
| "loss": 4.7258, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 13.672632217407227, | |
| "learning_rate": 2.4981956646637815e-05, | |
| "loss": 4.076, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.568, | |
| "grad_norm": 10.532658576965332, | |
| "learning_rate": 2.412143274042129e-05, | |
| "loss": 4.3765, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.576, | |
| "grad_norm": 6.73195743560791, | |
| "learning_rate": 2.327395368205084e-05, | |
| "loss": 4.4419, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.584, | |
| "grad_norm": 7.2371745109558105, | |
| "learning_rate": 2.243966516794338e-05, | |
| "loss": 4.5395, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.592, | |
| "grad_norm": 9.002269744873047, | |
| "learning_rate": 2.161871062683145e-05, | |
| "loss": 4.5577, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 10.184229850769043, | |
| "learning_rate": 2.0811231195105186e-05, | |
| "loss": 4.3631, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.608, | |
| "grad_norm": 7.962112903594971, | |
| "learning_rate": 2.0017365692548717e-05, | |
| "loss": 4.6822, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.616, | |
| "grad_norm": 7.3697190284729, | |
| "learning_rate": 1.9237250598474564e-05, | |
| "loss": 4.1389, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.624, | |
| "grad_norm": 8.01472282409668, | |
| "learning_rate": 1.8471020028260368e-05, | |
| "loss": 3.9598, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": 5.376262187957764, | |
| "learning_rate": 1.7718805710292208e-05, | |
| "loss": 4.3192, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.6400000000000001, | |
| "grad_norm": 10.927371978759766, | |
| "learning_rate": 1.6980736963318177e-05, | |
| "loss": 4.192, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.6480000000000001, | |
| "grad_norm": 6.14822244644165, | |
| "learning_rate": 1.625694067421626e-05, | |
| "loss": 4.4155, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.6560000000000001, | |
| "grad_norm": 8.27953815460205, | |
| "learning_rate": 1.554754127618019e-05, | |
| "loss": 4.5526, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": 6.160037517547607, | |
| "learning_rate": 1.4852660727327361e-05, | |
| "loss": 4.453, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.6720000000000002, | |
| "grad_norm": 7.098100662231445, | |
| "learning_rate": 1.4172418489731942e-05, | |
| "loss": 4.4741, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 6.903653144836426, | |
| "learning_rate": 1.3506931508887333e-05, | |
| "loss": 4.0234, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.688, | |
| "grad_norm": 6.967901706695557, | |
| "learning_rate": 1.2856314193601216e-05, | |
| "loss": 4.2137, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 1.696, | |
| "grad_norm": 7.635384559631348, | |
| "learning_rate": 1.2220678396326678e-05, | |
| "loss": 4.38, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.704, | |
| "grad_norm": 9.351762771606445, | |
| "learning_rate": 1.160013339393281e-05, | |
| "loss": 4.4418, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 1.712, | |
| "grad_norm": 7.01410436630249, | |
| "learning_rate": 1.0994785868918101e-05, | |
| "loss": 4.4396, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 6.974796772003174, | |
| "learning_rate": 1.040473989106988e-05, | |
| "loss": 4.5075, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.728, | |
| "grad_norm": 7.545105934143066, | |
| "learning_rate": 9.830096899572927e-06, | |
| "loss": 4.7132, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.736, | |
| "grad_norm": 7.074202537536621, | |
| "learning_rate": 9.270955685570226e-06, | |
| "loss": 4.2393, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.744, | |
| "grad_norm": 6.088277339935303, | |
| "learning_rate": 8.727412375179156e-06, | |
| "loss": 4.8092, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.752, | |
| "grad_norm": 5.859469413757324, | |
| "learning_rate": 8.199560412965634e-06, | |
| "loss": 4.6403, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 6.770120143890381, | |
| "learning_rate": 7.687490545879461e-06, | |
| "loss": 4.2544, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.768, | |
| "grad_norm": 8.092095375061035, | |
| "learning_rate": 7.191290807653251e-06, | |
| "loss": 4.454, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 1.776, | |
| "grad_norm": 6.498937606811523, | |
| "learning_rate": 6.711046503667983e-06, | |
| "loss": 4.0243, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.784, | |
| "grad_norm": 5.894200801849365, | |
| "learning_rate": 6.24684019628744e-06, | |
| "loss": 4.0666, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 1.792, | |
| "grad_norm": 6.309505462646484, | |
| "learning_rate": 5.79875169066435e-06, | |
| "loss": 4.2441, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 6.20737886428833, | |
| "learning_rate": 5.366858021020471e-06, | |
| "loss": 4.1951, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.808, | |
| "grad_norm": 8.314438819885254, | |
| "learning_rate": 4.951233437403102e-06, | |
| "loss": 4.4043, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 1.8159999999999998, | |
| "grad_norm": 6.5354108810424805, | |
| "learning_rate": 4.551949392920118e-06, | |
| "loss": 4.1528, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": 8.503190040588379, | |
| "learning_rate": 4.169074531456063e-06, | |
| "loss": 3.8358, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 1.8319999999999999, | |
| "grad_norm": 7.091247081756592, | |
| "learning_rate": 3.802674675870932e-06, | |
| "loss": 4.3702, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 6.265818119049072, | |
| "learning_rate": 3.4528128166842033e-06, | |
| "loss": 4.3354, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.8479999999999999, | |
| "grad_norm": 7.026565074920654, | |
| "learning_rate": 3.119549101245567e-06, | |
| "loss": 3.9832, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": 6.407781600952148, | |
| "learning_rate": 2.8029408233946177e-06, | |
| "loss": 4.1522, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 1.8639999999999999, | |
| "grad_norm": 13.10364055633545, | |
| "learning_rate": 2.503042413611001e-06, | |
| "loss": 4.354, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 1.8719999999999999, | |
| "grad_norm": 12.458951950073242, | |
| "learning_rate": 2.219905429656899e-06, | |
| "loss": 4.5043, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 9.545763969421387, | |
| "learning_rate": 1.9535785477133195e-06, | |
| "loss": 4.2234, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.888, | |
| "grad_norm": 9.243309020996094, | |
| "learning_rate": 1.7041075540118578e-06, | |
| "loss": 4.2382, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 1.896, | |
| "grad_norm": 6.87458610534668, | |
| "learning_rate": 1.4715353369631924e-06, | |
| "loss": 3.9852, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 1.904, | |
| "grad_norm": 6.957976341247559, | |
| "learning_rate": 1.2559018797838384e-06, | |
| "loss": 3.9335, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 1.912, | |
| "grad_norm": 5.689143657684326, | |
| "learning_rate": 1.0572442536223692e-06, | |
| "loss": 4.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 5.896793842315674, | |
| "learning_rate": 8.755966111861913e-07, | |
| "loss": 4.2807, | |
| "step": 2400 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 300, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |