diff --git "a/checkpoint-4291/trainer_state.json" "b/checkpoint-4291/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-4291/trainer_state.json" @@ -0,0 +1,30070 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999223240640049, + "eval_steps": 500, + "global_step": 4291, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00023302780798508622, + "grad_norm": 1.3110069036483765, + "learning_rate": 5.0000000000000004e-08, + "loss": 0.9853, + "step": 1 + }, + { + "epoch": 0.00046605561597017243, + "grad_norm": 1.3046332597732544, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.9665, + "step": 2 + }, + { + "epoch": 0.0006990834239552587, + "grad_norm": 1.3271926641464233, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.9758, + "step": 3 + }, + { + "epoch": 0.0009321112319403449, + "grad_norm": 1.3110666275024414, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.9576, + "step": 4 + }, + { + "epoch": 0.0011651390399254312, + "grad_norm": 1.2493504285812378, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.9439, + "step": 5 + }, + { + "epoch": 0.0013981668479105174, + "grad_norm": 1.2979836463928223, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.9846, + "step": 6 + }, + { + "epoch": 0.0016311946558956035, + "grad_norm": 1.3058552742004395, + "learning_rate": 3.5000000000000004e-07, + "loss": 0.964, + "step": 7 + }, + { + "epoch": 0.0018642224638806897, + "grad_norm": 1.3901036977767944, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.9779, + "step": 8 + }, + { + "epoch": 0.002097250271865776, + "grad_norm": 1.2552282810211182, + "learning_rate": 4.5000000000000003e-07, + "loss": 0.9838, + "step": 9 + }, + { + "epoch": 0.0023302780798508623, + "grad_norm": 1.2276779413223267, + "learning_rate": 5.000000000000001e-07, + "loss": 0.9122, + "step": 10 + }, + { + "epoch": 0.0025633058878359485, + "grad_norm": 1.306789517402649, + "learning_rate": 5.5e-07, + "loss": 1.0011, + "step": 11 + }, + { + "epoch": 0.0027963336958210347, + "grad_norm": 1.2753934860229492, + "learning_rate": 6.000000000000001e-07, + "loss": 0.9585, + "step": 12 + }, + { + "epoch": 0.003029361503806121, + "grad_norm": 1.2781143188476562, + "learning_rate": 6.5e-07, + "loss": 0.9728, + "step": 13 + }, + { + "epoch": 0.003262389311791207, + "grad_norm": 1.2627602815628052, + "learning_rate": 7.000000000000001e-07, + "loss": 0.9864, + "step": 14 + }, + { + "epoch": 0.0034954171197762933, + "grad_norm": 1.204742431640625, + "learning_rate": 7.5e-07, + "loss": 0.9463, + "step": 15 + }, + { + "epoch": 0.0037284449277613795, + "grad_norm": 1.2309420108795166, + "learning_rate": 8.000000000000001e-07, + "loss": 0.9584, + "step": 16 + }, + { + "epoch": 0.003961472735746466, + "grad_norm": 1.1872025728225708, + "learning_rate": 8.500000000000001e-07, + "loss": 0.9511, + "step": 17 + }, + { + "epoch": 0.004194500543731552, + "grad_norm": 1.1508766412734985, + "learning_rate": 9.000000000000001e-07, + "loss": 0.9364, + "step": 18 + }, + { + "epoch": 0.0044275283517166385, + "grad_norm": 1.0602127313613892, + "learning_rate": 9.500000000000001e-07, + "loss": 0.919, + "step": 19 + }, + { + "epoch": 0.004660556159701725, + "grad_norm": 1.0241538286209106, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9264, + "step": 20 + }, + { + "epoch": 0.004893583967686811, + "grad_norm": 1.0005977153778076, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.934, + "step": 21 + }, + { + "epoch": 0.005126611775671897, + "grad_norm": 0.9035549163818359, + "learning_rate": 1.1e-06, + "loss": 0.9096, + "step": 22 + }, + { + "epoch": 0.005359639583656983, + "grad_norm": 0.8883580565452576, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.8991, + "step": 23 + }, + { + "epoch": 0.005592667391642069, + "grad_norm": 0.7974752187728882, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.9089, + "step": 24 + }, + { + "epoch": 0.005825695199627156, + "grad_norm": 0.7034953832626343, + "learning_rate": 1.25e-06, + "loss": 0.8837, + "step": 25 + }, + { + "epoch": 0.006058723007612242, + "grad_norm": 0.6669341921806335, + "learning_rate": 1.3e-06, + "loss": 0.9094, + "step": 26 + }, + { + "epoch": 0.006291750815597328, + "grad_norm": 0.6528049111366272, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.9335, + "step": 27 + }, + { + "epoch": 0.006524778623582414, + "grad_norm": 0.5770772695541382, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.8982, + "step": 28 + }, + { + "epoch": 0.0067578064315675, + "grad_norm": 0.5549561977386475, + "learning_rate": 1.45e-06, + "loss": 0.9045, + "step": 29 + }, + { + "epoch": 0.0069908342395525866, + "grad_norm": 0.5312375426292419, + "learning_rate": 1.5e-06, + "loss": 0.9188, + "step": 30 + }, + { + "epoch": 0.007223862047537673, + "grad_norm": 0.5100304484367371, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.8858, + "step": 31 + }, + { + "epoch": 0.007456889855522759, + "grad_norm": 0.5262665152549744, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.8997, + "step": 32 + }, + { + "epoch": 0.007689917663507845, + "grad_norm": 0.4866653084754944, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.8885, + "step": 33 + }, + { + "epoch": 0.007922945471492932, + "grad_norm": 0.5153486728668213, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.8986, + "step": 34 + }, + { + "epoch": 0.008155973279478018, + "grad_norm": 0.4663233757019043, + "learning_rate": 1.75e-06, + "loss": 0.867, + "step": 35 + }, + { + "epoch": 0.008389001087463105, + "grad_norm": 0.47794193029403687, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.8714, + "step": 36 + }, + { + "epoch": 0.00862202889544819, + "grad_norm": 0.4731766879558563, + "learning_rate": 1.85e-06, + "loss": 0.8669, + "step": 37 + }, + { + "epoch": 0.008855056703433277, + "grad_norm": 0.45312952995300293, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.8904, + "step": 38 + }, + { + "epoch": 0.009088084511418362, + "grad_norm": 0.44134730100631714, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.8469, + "step": 39 + }, + { + "epoch": 0.00932111231940345, + "grad_norm": 0.4474064111709595, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8191, + "step": 40 + }, + { + "epoch": 0.009554140127388535, + "grad_norm": 0.4580775201320648, + "learning_rate": 2.05e-06, + "loss": 0.8645, + "step": 41 + }, + { + "epoch": 0.009787167935373622, + "grad_norm": 0.41531872749328613, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.8578, + "step": 42 + }, + { + "epoch": 0.010020195743358707, + "grad_norm": 0.4022881090641022, + "learning_rate": 2.15e-06, + "loss": 0.837, + "step": 43 + }, + { + "epoch": 0.010253223551343794, + "grad_norm": 0.38351377844810486, + "learning_rate": 2.2e-06, + "loss": 0.8286, + "step": 44 + }, + { + "epoch": 0.01048625135932888, + "grad_norm": 0.37268322706222534, + "learning_rate": 2.25e-06, + "loss": 0.8373, + "step": 45 + }, + { + "epoch": 0.010719279167313966, + "grad_norm": 0.3749820590019226, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.8465, + "step": 46 + }, + { + "epoch": 0.010952306975299052, + "grad_norm": 0.36987820267677307, + "learning_rate": 2.35e-06, + "loss": 0.8022, + "step": 47 + }, + { + "epoch": 0.011185334783284139, + "grad_norm": 0.35212600231170654, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.808, + "step": 48 + }, + { + "epoch": 0.011418362591269224, + "grad_norm": 0.3424849510192871, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.7895, + "step": 49 + }, + { + "epoch": 0.011651390399254311, + "grad_norm": 0.3109133839607239, + "learning_rate": 2.5e-06, + "loss": 0.8213, + "step": 50 + }, + { + "epoch": 0.011884418207239397, + "grad_norm": 0.28981292247772217, + "learning_rate": 2.55e-06, + "loss": 0.7973, + "step": 51 + }, + { + "epoch": 0.012117446015224484, + "grad_norm": 0.27374207973480225, + "learning_rate": 2.6e-06, + "loss": 0.8026, + "step": 52 + }, + { + "epoch": 0.012350473823209569, + "grad_norm": 0.2604827284812927, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.7849, + "step": 53 + }, + { + "epoch": 0.012583501631194656, + "grad_norm": 0.2770780920982361, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.7797, + "step": 54 + }, + { + "epoch": 0.012816529439179741, + "grad_norm": 0.2674211859703064, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.8028, + "step": 55 + }, + { + "epoch": 0.013049557247164828, + "grad_norm": 0.26464080810546875, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.8145, + "step": 56 + }, + { + "epoch": 0.013282585055149915, + "grad_norm": 0.26782912015914917, + "learning_rate": 2.85e-06, + "loss": 0.8258, + "step": 57 + }, + { + "epoch": 0.013515612863135, + "grad_norm": 0.2634070813655853, + "learning_rate": 2.9e-06, + "loss": 0.8198, + "step": 58 + }, + { + "epoch": 0.013748640671120088, + "grad_norm": 0.2565610408782959, + "learning_rate": 2.95e-06, + "loss": 0.7911, + "step": 59 + }, + { + "epoch": 0.013981668479105173, + "grad_norm": 0.25742948055267334, + "learning_rate": 3e-06, + "loss": 0.8125, + "step": 60 + }, + { + "epoch": 0.01421469628709026, + "grad_norm": 0.25982847809791565, + "learning_rate": 3.05e-06, + "loss": 0.7982, + "step": 61 + }, + { + "epoch": 0.014447724095075345, + "grad_norm": 0.24355986714363098, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.7709, + "step": 62 + }, + { + "epoch": 0.014680751903060433, + "grad_norm": 0.25315988063812256, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.8219, + "step": 63 + }, + { + "epoch": 0.014913779711045518, + "grad_norm": 0.2541501820087433, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.7906, + "step": 64 + }, + { + "epoch": 0.015146807519030605, + "grad_norm": 0.2325265109539032, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.7888, + "step": 65 + }, + { + "epoch": 0.01537983532701569, + "grad_norm": 0.23053106665611267, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.7883, + "step": 66 + }, + { + "epoch": 0.015612863135000777, + "grad_norm": 0.23249517381191254, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.7952, + "step": 67 + }, + { + "epoch": 0.015845890942985864, + "grad_norm": 0.23336932063102722, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.8057, + "step": 68 + }, + { + "epoch": 0.01607891875097095, + "grad_norm": 0.24779729545116425, + "learning_rate": 3.45e-06, + "loss": 0.8311, + "step": 69 + }, + { + "epoch": 0.016311946558956035, + "grad_norm": 0.24076716601848602, + "learning_rate": 3.5e-06, + "loss": 0.8284, + "step": 70 + }, + { + "epoch": 0.01654497436694112, + "grad_norm": 0.2726815342903137, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.7999, + "step": 71 + }, + { + "epoch": 0.01677800217492621, + "grad_norm": 0.22827991843223572, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.7989, + "step": 72 + }, + { + "epoch": 0.017011029982911294, + "grad_norm": 0.229026660323143, + "learning_rate": 3.65e-06, + "loss": 0.8221, + "step": 73 + }, + { + "epoch": 0.01724405779089638, + "grad_norm": 0.2230895608663559, + "learning_rate": 3.7e-06, + "loss": 0.8035, + "step": 74 + }, + { + "epoch": 0.017477085598881465, + "grad_norm": 0.23196783661842346, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.7886, + "step": 75 + }, + { + "epoch": 0.017710113406866554, + "grad_norm": 0.22168371081352234, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.7804, + "step": 76 + }, + { + "epoch": 0.01794314121485164, + "grad_norm": 0.2273039072751999, + "learning_rate": 3.85e-06, + "loss": 0.8005, + "step": 77 + }, + { + "epoch": 0.018176169022836725, + "grad_norm": 0.22388514876365662, + "learning_rate": 3.900000000000001e-06, + "loss": 0.7894, + "step": 78 + }, + { + "epoch": 0.01840919683082181, + "grad_norm": 0.21821601688861847, + "learning_rate": 3.95e-06, + "loss": 0.7627, + "step": 79 + }, + { + "epoch": 0.0186422246388069, + "grad_norm": 0.21988703310489655, + "learning_rate": 4.000000000000001e-06, + "loss": 0.7611, + "step": 80 + }, + { + "epoch": 0.018875252446791984, + "grad_norm": 0.23125998675823212, + "learning_rate": 4.05e-06, + "loss": 0.8258, + "step": 81 + }, + { + "epoch": 0.01910828025477707, + "grad_norm": 0.21021096408367157, + "learning_rate": 4.1e-06, + "loss": 0.7564, + "step": 82 + }, + { + "epoch": 0.019341308062762155, + "grad_norm": 0.20952369272708893, + "learning_rate": 4.15e-06, + "loss": 0.7543, + "step": 83 + }, + { + "epoch": 0.019574335870747243, + "grad_norm": 0.2296966314315796, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.7936, + "step": 84 + }, + { + "epoch": 0.01980736367873233, + "grad_norm": 0.22012834250926971, + "learning_rate": 4.25e-06, + "loss": 0.7714, + "step": 85 + }, + { + "epoch": 0.020040391486717414, + "grad_norm": 0.22611187398433685, + "learning_rate": 4.3e-06, + "loss": 0.7672, + "step": 86 + }, + { + "epoch": 0.020273419294702503, + "grad_norm": 0.21921581029891968, + "learning_rate": 4.350000000000001e-06, + "loss": 0.7659, + "step": 87 + }, + { + "epoch": 0.020506447102687588, + "grad_norm": 0.22477608919143677, + "learning_rate": 4.4e-06, + "loss": 0.7743, + "step": 88 + }, + { + "epoch": 0.020739474910672673, + "grad_norm": 0.2186264544725418, + "learning_rate": 4.450000000000001e-06, + "loss": 0.7649, + "step": 89 + }, + { + "epoch": 0.02097250271865776, + "grad_norm": 0.21717782318592072, + "learning_rate": 4.5e-06, + "loss": 0.7674, + "step": 90 + }, + { + "epoch": 0.021205530526642848, + "grad_norm": 0.21880236268043518, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.7954, + "step": 91 + }, + { + "epoch": 0.021438558334627933, + "grad_norm": 0.21165220439434052, + "learning_rate": 4.600000000000001e-06, + "loss": 0.772, + "step": 92 + }, + { + "epoch": 0.021671586142613018, + "grad_norm": 0.21881164610385895, + "learning_rate": 4.65e-06, + "loss": 0.7887, + "step": 93 + }, + { + "epoch": 0.021904613950598104, + "grad_norm": 0.2075614631175995, + "learning_rate": 4.7e-06, + "loss": 0.7309, + "step": 94 + }, + { + "epoch": 0.022137641758583192, + "grad_norm": 0.2024187296628952, + "learning_rate": 4.75e-06, + "loss": 0.7208, + "step": 95 + }, + { + "epoch": 0.022370669566568278, + "grad_norm": 0.21097055077552795, + "learning_rate": 4.800000000000001e-06, + "loss": 0.7592, + "step": 96 + }, + { + "epoch": 0.022603697374553363, + "grad_norm": 0.20701804757118225, + "learning_rate": 4.85e-06, + "loss": 0.7563, + "step": 97 + }, + { + "epoch": 0.02283672518253845, + "grad_norm": 0.20521406829357147, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.7632, + "step": 98 + }, + { + "epoch": 0.023069752990523537, + "grad_norm": 0.21903039515018463, + "learning_rate": 4.95e-06, + "loss": 0.748, + "step": 99 + }, + { + "epoch": 0.023302780798508622, + "grad_norm": 0.21777278184890747, + "learning_rate": 5e-06, + "loss": 0.7669, + "step": 100 + }, + { + "epoch": 0.023535808606493708, + "grad_norm": 0.21491429209709167, + "learning_rate": 4.999999981242694e-06, + "loss": 0.7607, + "step": 101 + }, + { + "epoch": 0.023768836414478793, + "grad_norm": 0.21780641376972198, + "learning_rate": 4.999999924970773e-06, + "loss": 0.7931, + "step": 102 + }, + { + "epoch": 0.024001864222463882, + "grad_norm": 0.21689319610595703, + "learning_rate": 4.999999831184241e-06, + "loss": 0.7828, + "step": 103 + }, + { + "epoch": 0.024234892030448967, + "grad_norm": 0.21614891290664673, + "learning_rate": 4.9999996998830965e-06, + "loss": 0.7525, + "step": 104 + }, + { + "epoch": 0.024467919838434053, + "grad_norm": 0.2099386304616928, + "learning_rate": 4.999999531067345e-06, + "loss": 0.766, + "step": 105 + }, + { + "epoch": 0.024700947646419138, + "grad_norm": 0.2126135528087616, + "learning_rate": 4.999999324736985e-06, + "loss": 0.7519, + "step": 106 + }, + { + "epoch": 0.024933975454404227, + "grad_norm": 0.21956445276737213, + "learning_rate": 4.999999080892023e-06, + "loss": 0.7577, + "step": 107 + }, + { + "epoch": 0.025167003262389312, + "grad_norm": 0.20770469307899475, + "learning_rate": 4.999998799532459e-06, + "loss": 0.7352, + "step": 108 + }, + { + "epoch": 0.025400031070374397, + "grad_norm": 0.21246927976608276, + "learning_rate": 4.999998480658301e-06, + "loss": 0.7708, + "step": 109 + }, + { + "epoch": 0.025633058878359483, + "grad_norm": 0.210545614361763, + "learning_rate": 4.999998124269552e-06, + "loss": 0.7648, + "step": 110 + }, + { + "epoch": 0.02586608668634457, + "grad_norm": 0.21258406341075897, + "learning_rate": 4.999997730366218e-06, + "loss": 0.7908, + "step": 111 + }, + { + "epoch": 0.026099114494329657, + "grad_norm": 0.20755688846111298, + "learning_rate": 4.999997298948304e-06, + "loss": 0.7506, + "step": 112 + }, + { + "epoch": 0.026332142302314742, + "grad_norm": 0.2095147967338562, + "learning_rate": 4.999996830015817e-06, + "loss": 0.7682, + "step": 113 + }, + { + "epoch": 0.02656517011029983, + "grad_norm": 0.21282409131526947, + "learning_rate": 4.999996323568763e-06, + "loss": 0.759, + "step": 114 + }, + { + "epoch": 0.026798197918284916, + "grad_norm": 0.20473438501358032, + "learning_rate": 4.999995779607152e-06, + "loss": 0.7484, + "step": 115 + }, + { + "epoch": 0.02703122572627, + "grad_norm": 0.2132248878479004, + "learning_rate": 4.99999519813099e-06, + "loss": 0.7698, + "step": 116 + }, + { + "epoch": 0.027264253534255087, + "grad_norm": 0.20986324548721313, + "learning_rate": 4.999994579140287e-06, + "loss": 0.7862, + "step": 117 + }, + { + "epoch": 0.027497281342240176, + "grad_norm": 0.21242764592170715, + "learning_rate": 4.99999392263505e-06, + "loss": 0.751, + "step": 118 + }, + { + "epoch": 0.02773030915022526, + "grad_norm": 0.21054607629776, + "learning_rate": 4.999993228615293e-06, + "loss": 0.7757, + "step": 119 + }, + { + "epoch": 0.027963336958210346, + "grad_norm": 0.2049800306558609, + "learning_rate": 4.999992497081023e-06, + "loss": 0.7466, + "step": 120 + }, + { + "epoch": 0.02819636476619543, + "grad_norm": 0.21881400048732758, + "learning_rate": 4.999991728032252e-06, + "loss": 0.7578, + "step": 121 + }, + { + "epoch": 0.02842939257418052, + "grad_norm": 0.20710553228855133, + "learning_rate": 4.999990921468991e-06, + "loss": 0.7514, + "step": 122 + }, + { + "epoch": 0.028662420382165606, + "grad_norm": 0.21309322118759155, + "learning_rate": 4.999990077391253e-06, + "loss": 0.7776, + "step": 123 + }, + { + "epoch": 0.02889544819015069, + "grad_norm": 0.21584945917129517, + "learning_rate": 4.9999891957990505e-06, + "loss": 0.7267, + "step": 124 + }, + { + "epoch": 0.029128475998135776, + "grad_norm": 0.21612006425857544, + "learning_rate": 4.999988276692396e-06, + "loss": 0.7563, + "step": 125 + }, + { + "epoch": 0.029361503806120865, + "grad_norm": 0.21165628731250763, + "learning_rate": 4.999987320071304e-06, + "loss": 0.7635, + "step": 126 + }, + { + "epoch": 0.02959453161410595, + "grad_norm": 0.21232764422893524, + "learning_rate": 4.999986325935789e-06, + "loss": 0.7505, + "step": 127 + }, + { + "epoch": 0.029827559422091036, + "grad_norm": 0.21750356256961823, + "learning_rate": 4.999985294285866e-06, + "loss": 0.746, + "step": 128 + }, + { + "epoch": 0.03006058723007612, + "grad_norm": 0.21733710169792175, + "learning_rate": 4.9999842251215495e-06, + "loss": 0.732, + "step": 129 + }, + { + "epoch": 0.03029361503806121, + "grad_norm": 0.21034637093544006, + "learning_rate": 4.999983118442856e-06, + "loss": 0.7508, + "step": 130 + }, + { + "epoch": 0.030526642846046295, + "grad_norm": 0.21561205387115479, + "learning_rate": 4.999981974249802e-06, + "loss": 0.7561, + "step": 131 + }, + { + "epoch": 0.03075967065403138, + "grad_norm": 0.20917284488677979, + "learning_rate": 4.9999807925424045e-06, + "loss": 0.7506, + "step": 132 + }, + { + "epoch": 0.030992698462016466, + "grad_norm": 0.21881957352161407, + "learning_rate": 4.999979573320683e-06, + "loss": 0.7731, + "step": 133 + }, + { + "epoch": 0.031225726270001555, + "grad_norm": 0.21721650660037994, + "learning_rate": 4.999978316584654e-06, + "loss": 0.8205, + "step": 134 + }, + { + "epoch": 0.031458754077986636, + "grad_norm": 0.20491044223308563, + "learning_rate": 4.999977022334336e-06, + "loss": 0.7534, + "step": 135 + }, + { + "epoch": 0.03169178188597173, + "grad_norm": 0.21985279023647308, + "learning_rate": 4.99997569056975e-06, + "loss": 0.7528, + "step": 136 + }, + { + "epoch": 0.031924809693956814, + "grad_norm": 0.2122112214565277, + "learning_rate": 4.999974321290915e-06, + "loss": 0.7383, + "step": 137 + }, + { + "epoch": 0.0321578375019419, + "grad_norm": 0.21406878530979156, + "learning_rate": 4.9999729144978514e-06, + "loss": 0.7636, + "step": 138 + }, + { + "epoch": 0.032390865309926985, + "grad_norm": 0.21860897541046143, + "learning_rate": 4.999971470190581e-06, + "loss": 0.7619, + "step": 139 + }, + { + "epoch": 0.03262389311791207, + "grad_norm": 0.21503332257270813, + "learning_rate": 4.999969988369125e-06, + "loss": 0.7485, + "step": 140 + }, + { + "epoch": 0.032856920925897155, + "grad_norm": 0.2177797555923462, + "learning_rate": 4.9999684690335055e-06, + "loss": 0.7673, + "step": 141 + }, + { + "epoch": 0.03308994873388224, + "grad_norm": 0.21829889714717865, + "learning_rate": 4.999966912183746e-06, + "loss": 0.7455, + "step": 142 + }, + { + "epoch": 0.03332297654186733, + "grad_norm": 0.2195976823568344, + "learning_rate": 4.9999653178198685e-06, + "loss": 0.763, + "step": 143 + }, + { + "epoch": 0.03355600434985242, + "grad_norm": 0.21256093680858612, + "learning_rate": 4.999963685941899e-06, + "loss": 0.7304, + "step": 144 + }, + { + "epoch": 0.033789032157837504, + "grad_norm": 0.2156277596950531, + "learning_rate": 4.99996201654986e-06, + "loss": 0.7574, + "step": 145 + }, + { + "epoch": 0.03402205996582259, + "grad_norm": 0.21632106602191925, + "learning_rate": 4.9999603096437775e-06, + "loss": 0.734, + "step": 146 + }, + { + "epoch": 0.034255087773807674, + "grad_norm": 0.22344687581062317, + "learning_rate": 4.999958565223677e-06, + "loss": 0.751, + "step": 147 + }, + { + "epoch": 0.03448811558179276, + "grad_norm": 0.21789836883544922, + "learning_rate": 4.9999567832895845e-06, + "loss": 0.7515, + "step": 148 + }, + { + "epoch": 0.034721143389777845, + "grad_norm": 0.22056196630001068, + "learning_rate": 4.999954963841528e-06, + "loss": 0.7631, + "step": 149 + }, + { + "epoch": 0.03495417119776293, + "grad_norm": 0.21013368666172028, + "learning_rate": 4.999953106879532e-06, + "loss": 0.7316, + "step": 150 + }, + { + "epoch": 0.03518719900574802, + "grad_norm": 0.2180197536945343, + "learning_rate": 4.999951212403627e-06, + "loss": 0.7556, + "step": 151 + }, + { + "epoch": 0.03542022681373311, + "grad_norm": 0.22545045614242554, + "learning_rate": 4.99994928041384e-06, + "loss": 0.7415, + "step": 152 + }, + { + "epoch": 0.03565325462171819, + "grad_norm": 0.21654920279979706, + "learning_rate": 4.999947310910201e-06, + "loss": 0.7708, + "step": 153 + }, + { + "epoch": 0.03588628242970328, + "grad_norm": 0.20974504947662354, + "learning_rate": 4.9999453038927396e-06, + "loss": 0.7178, + "step": 154 + }, + { + "epoch": 0.036119310237688364, + "grad_norm": 0.22206845879554749, + "learning_rate": 4.9999432593614845e-06, + "loss": 0.7422, + "step": 155 + }, + { + "epoch": 0.03635233804567345, + "grad_norm": 0.22387872636318207, + "learning_rate": 4.9999411773164676e-06, + "loss": 0.7467, + "step": 156 + }, + { + "epoch": 0.036585365853658534, + "grad_norm": 0.22495611011981964, + "learning_rate": 4.999939057757719e-06, + "loss": 0.7699, + "step": 157 + }, + { + "epoch": 0.03681839366164362, + "grad_norm": 0.2160438597202301, + "learning_rate": 4.999936900685272e-06, + "loss": 0.7249, + "step": 158 + }, + { + "epoch": 0.03705142146962871, + "grad_norm": 0.215169757604599, + "learning_rate": 4.9999347060991584e-06, + "loss": 0.7407, + "step": 159 + }, + { + "epoch": 0.0372844492776138, + "grad_norm": 0.2201366275548935, + "learning_rate": 4.999932473999411e-06, + "loss": 0.755, + "step": 160 + }, + { + "epoch": 0.03751747708559888, + "grad_norm": 0.22283214330673218, + "learning_rate": 4.999930204386063e-06, + "loss": 0.7294, + "step": 161 + }, + { + "epoch": 0.03775050489358397, + "grad_norm": 0.215604767203331, + "learning_rate": 4.999927897259149e-06, + "loss": 0.7351, + "step": 162 + }, + { + "epoch": 0.03798353270156905, + "grad_norm": 0.22387312352657318, + "learning_rate": 4.999925552618703e-06, + "loss": 0.7683, + "step": 163 + }, + { + "epoch": 0.03821656050955414, + "grad_norm": 0.21581925451755524, + "learning_rate": 4.99992317046476e-06, + "loss": 0.7449, + "step": 164 + }, + { + "epoch": 0.038449588317539224, + "grad_norm": 0.22226959466934204, + "learning_rate": 4.999920750797357e-06, + "loss": 0.7574, + "step": 165 + }, + { + "epoch": 0.03868261612552431, + "grad_norm": 0.22044377028942108, + "learning_rate": 4.99991829361653e-06, + "loss": 0.7361, + "step": 166 + }, + { + "epoch": 0.0389156439335094, + "grad_norm": 0.22568173706531525, + "learning_rate": 4.999915798922315e-06, + "loss": 0.7707, + "step": 167 + }, + { + "epoch": 0.03914867174149449, + "grad_norm": 0.23213784396648407, + "learning_rate": 4.99991326671475e-06, + "loss": 0.7661, + "step": 168 + }, + { + "epoch": 0.03938169954947957, + "grad_norm": 0.22136245667934418, + "learning_rate": 4.999910696993873e-06, + "loss": 0.7497, + "step": 169 + }, + { + "epoch": 0.03961472735746466, + "grad_norm": 0.22211574018001556, + "learning_rate": 4.999908089759722e-06, + "loss": 0.7489, + "step": 170 + }, + { + "epoch": 0.03984775516544974, + "grad_norm": 0.211950421333313, + "learning_rate": 4.9999054450123365e-06, + "loss": 0.7106, + "step": 171 + }, + { + "epoch": 0.04008078297343483, + "grad_norm": 0.21984447538852692, + "learning_rate": 4.999902762751757e-06, + "loss": 0.7698, + "step": 172 + }, + { + "epoch": 0.04031381078141991, + "grad_norm": 0.21994370222091675, + "learning_rate": 4.9999000429780225e-06, + "loss": 0.7466, + "step": 173 + }, + { + "epoch": 0.040546838589405006, + "grad_norm": 0.22428268194198608, + "learning_rate": 4.999897285691174e-06, + "loss": 0.771, + "step": 174 + }, + { + "epoch": 0.04077986639739009, + "grad_norm": 0.23170886933803558, + "learning_rate": 4.999894490891254e-06, + "loss": 0.7392, + "step": 175 + }, + { + "epoch": 0.041012894205375176, + "grad_norm": 0.2380857616662979, + "learning_rate": 4.999891658578303e-06, + "loss": 0.7287, + "step": 176 + }, + { + "epoch": 0.04124592201336026, + "grad_norm": 0.2259863168001175, + "learning_rate": 4.999888788752365e-06, + "loss": 0.7283, + "step": 177 + }, + { + "epoch": 0.04147894982134535, + "grad_norm": 0.22356554865837097, + "learning_rate": 4.999885881413482e-06, + "loss": 0.7598, + "step": 178 + }, + { + "epoch": 0.04171197762933043, + "grad_norm": 0.22653083503246307, + "learning_rate": 4.999882936561697e-06, + "loss": 0.7344, + "step": 179 + }, + { + "epoch": 0.04194500543731552, + "grad_norm": 0.23027479648590088, + "learning_rate": 4.9998799541970555e-06, + "loss": 0.762, + "step": 180 + }, + { + "epoch": 0.0421780332453006, + "grad_norm": 0.2211555540561676, + "learning_rate": 4.999876934319602e-06, + "loss": 0.723, + "step": 181 + }, + { + "epoch": 0.042411061053285695, + "grad_norm": 0.22502490878105164, + "learning_rate": 4.999873876929381e-06, + "loss": 0.7392, + "step": 182 + }, + { + "epoch": 0.04264408886127078, + "grad_norm": 0.22317567467689514, + "learning_rate": 4.99987078202644e-06, + "loss": 0.7662, + "step": 183 + }, + { + "epoch": 0.042877116669255866, + "grad_norm": 0.21702991425991058, + "learning_rate": 4.9998676496108235e-06, + "loss": 0.7101, + "step": 184 + }, + { + "epoch": 0.04311014447724095, + "grad_norm": 0.23088614642620087, + "learning_rate": 4.99986447968258e-06, + "loss": 0.7627, + "step": 185 + }, + { + "epoch": 0.043343172285226036, + "grad_norm": 0.22956077754497528, + "learning_rate": 4.999861272241756e-06, + "loss": 0.742, + "step": 186 + }, + { + "epoch": 0.04357620009321112, + "grad_norm": 0.2265009582042694, + "learning_rate": 4.9998580272884e-06, + "loss": 0.7242, + "step": 187 + }, + { + "epoch": 0.04380922790119621, + "grad_norm": 0.22329825162887573, + "learning_rate": 4.999854744822562e-06, + "loss": 0.7298, + "step": 188 + }, + { + "epoch": 0.04404225570918129, + "grad_norm": 0.22264976799488068, + "learning_rate": 4.999851424844289e-06, + "loss": 0.7277, + "step": 189 + }, + { + "epoch": 0.044275283517166385, + "grad_norm": 0.2162184715270996, + "learning_rate": 4.999848067353632e-06, + "loss": 0.6926, + "step": 190 + }, + { + "epoch": 0.04450831132515147, + "grad_norm": 0.22213086485862732, + "learning_rate": 4.999844672350641e-06, + "loss": 0.7315, + "step": 191 + }, + { + "epoch": 0.044741339133136555, + "grad_norm": 0.22618098556995392, + "learning_rate": 4.999841239835368e-06, + "loss": 0.73, + "step": 192 + }, + { + "epoch": 0.04497436694112164, + "grad_norm": 0.22303909063339233, + "learning_rate": 4.999837769807864e-06, + "loss": 0.703, + "step": 193 + }, + { + "epoch": 0.045207394749106726, + "grad_norm": 0.22598472237586975, + "learning_rate": 4.999834262268179e-06, + "loss": 0.7398, + "step": 194 + }, + { + "epoch": 0.04544042255709181, + "grad_norm": 0.23237083852291107, + "learning_rate": 4.999830717216369e-06, + "loss": 0.7473, + "step": 195 + }, + { + "epoch": 0.0456734503650769, + "grad_norm": 0.21514421701431274, + "learning_rate": 4.999827134652485e-06, + "loss": 0.7293, + "step": 196 + }, + { + "epoch": 0.04590647817306198, + "grad_norm": 0.22203579545021057, + "learning_rate": 4.999823514576582e-06, + "loss": 0.7425, + "step": 197 + }, + { + "epoch": 0.046139505981047074, + "grad_norm": 0.2246609330177307, + "learning_rate": 4.999819856988714e-06, + "loss": 0.7149, + "step": 198 + }, + { + "epoch": 0.04637253378903216, + "grad_norm": 0.2282305806875229, + "learning_rate": 4.9998161618889344e-06, + "loss": 0.7276, + "step": 199 + }, + { + "epoch": 0.046605561597017245, + "grad_norm": 0.22761468589305878, + "learning_rate": 4.9998124292773e-06, + "loss": 0.7324, + "step": 200 + }, + { + "epoch": 0.04683858940500233, + "grad_norm": 0.23388297855854034, + "learning_rate": 4.999808659153867e-06, + "loss": 0.7358, + "step": 201 + }, + { + "epoch": 0.047071617212987416, + "grad_norm": 0.22498762607574463, + "learning_rate": 4.9998048515186915e-06, + "loss": 0.725, + "step": 202 + }, + { + "epoch": 0.0473046450209725, + "grad_norm": 0.23367926478385925, + "learning_rate": 4.999801006371831e-06, + "loss": 0.7694, + "step": 203 + }, + { + "epoch": 0.047537672828957586, + "grad_norm": 0.22767691314220428, + "learning_rate": 4.999797123713342e-06, + "loss": 0.7302, + "step": 204 + }, + { + "epoch": 0.04777070063694268, + "grad_norm": 0.2235887348651886, + "learning_rate": 4.999793203543285e-06, + "loss": 0.7327, + "step": 205 + }, + { + "epoch": 0.048003728444927764, + "grad_norm": 0.22135455906391144, + "learning_rate": 4.999789245861717e-06, + "loss": 0.7295, + "step": 206 + }, + { + "epoch": 0.04823675625291285, + "grad_norm": 0.23671281337738037, + "learning_rate": 4.999785250668698e-06, + "loss": 0.7578, + "step": 207 + }, + { + "epoch": 0.048469784060897934, + "grad_norm": 0.24161268770694733, + "learning_rate": 4.999781217964288e-06, + "loss": 0.7498, + "step": 208 + }, + { + "epoch": 0.04870281186888302, + "grad_norm": 0.2337157428264618, + "learning_rate": 4.999777147748548e-06, + "loss": 0.7408, + "step": 209 + }, + { + "epoch": 0.048935839676868105, + "grad_norm": 0.2388341873884201, + "learning_rate": 4.999773040021537e-06, + "loss": 0.744, + "step": 210 + }, + { + "epoch": 0.04916886748485319, + "grad_norm": 0.2359924167394638, + "learning_rate": 4.999768894783319e-06, + "loss": 0.7153, + "step": 211 + }, + { + "epoch": 0.049401895292838276, + "grad_norm": 0.23011301457881927, + "learning_rate": 4.999764712033956e-06, + "loss": 0.706, + "step": 212 + }, + { + "epoch": 0.04963492310082337, + "grad_norm": 0.2365659922361374, + "learning_rate": 4.999760491773509e-06, + "loss": 0.7359, + "step": 213 + }, + { + "epoch": 0.04986795090880845, + "grad_norm": 0.2266506552696228, + "learning_rate": 4.9997562340020424e-06, + "loss": 0.7082, + "step": 214 + }, + { + "epoch": 0.05010097871679354, + "grad_norm": 0.23107214272022247, + "learning_rate": 4.9997519387196205e-06, + "loss": 0.7537, + "step": 215 + }, + { + "epoch": 0.050334006524778624, + "grad_norm": 0.22786951065063477, + "learning_rate": 4.999747605926307e-06, + "loss": 0.7467, + "step": 216 + }, + { + "epoch": 0.05056703433276371, + "grad_norm": 0.2349010407924652, + "learning_rate": 4.999743235622168e-06, + "loss": 0.7433, + "step": 217 + }, + { + "epoch": 0.050800062140748795, + "grad_norm": 0.22448864579200745, + "learning_rate": 4.999738827807268e-06, + "loss": 0.7418, + "step": 218 + }, + { + "epoch": 0.05103308994873388, + "grad_norm": 0.23586194217205048, + "learning_rate": 4.999734382481674e-06, + "loss": 0.7235, + "step": 219 + }, + { + "epoch": 0.051266117756718965, + "grad_norm": 0.22801582515239716, + "learning_rate": 4.999729899645452e-06, + "loss": 0.732, + "step": 220 + }, + { + "epoch": 0.05149914556470406, + "grad_norm": 0.229914590716362, + "learning_rate": 4.9997253792986685e-06, + "loss": 0.7335, + "step": 221 + }, + { + "epoch": 0.05173217337268914, + "grad_norm": 0.23865897953510284, + "learning_rate": 4.999720821441393e-06, + "loss": 0.7612, + "step": 222 + }, + { + "epoch": 0.05196520118067423, + "grad_norm": 0.23582662642002106, + "learning_rate": 4.9997162260736935e-06, + "loss": 0.7273, + "step": 223 + }, + { + "epoch": 0.05219822898865931, + "grad_norm": 0.2305140644311905, + "learning_rate": 4.999711593195638e-06, + "loss": 0.7194, + "step": 224 + }, + { + "epoch": 0.0524312567966444, + "grad_norm": 0.2359943836927414, + "learning_rate": 4.999706922807297e-06, + "loss": 0.7442, + "step": 225 + }, + { + "epoch": 0.052664284604629484, + "grad_norm": 0.22906522452831268, + "learning_rate": 4.9997022149087405e-06, + "loss": 0.732, + "step": 226 + }, + { + "epoch": 0.05289731241261457, + "grad_norm": 0.2394052892923355, + "learning_rate": 4.999697469500038e-06, + "loss": 0.7066, + "step": 227 + }, + { + "epoch": 0.05313034022059966, + "grad_norm": 0.24094820022583008, + "learning_rate": 4.9996926865812625e-06, + "loss": 0.744, + "step": 228 + }, + { + "epoch": 0.05336336802858475, + "grad_norm": 0.23332065343856812, + "learning_rate": 4.999687866152485e-06, + "loss": 0.7172, + "step": 229 + }, + { + "epoch": 0.05359639583656983, + "grad_norm": 0.23934350907802582, + "learning_rate": 4.999683008213777e-06, + "loss": 0.7422, + "step": 230 + }, + { + "epoch": 0.05382942364455492, + "grad_norm": 0.24818432331085205, + "learning_rate": 4.999678112765213e-06, + "loss": 0.7487, + "step": 231 + }, + { + "epoch": 0.05406245145254, + "grad_norm": 0.2531030774116516, + "learning_rate": 4.999673179806866e-06, + "loss": 0.7271, + "step": 232 + }, + { + "epoch": 0.05429547926052509, + "grad_norm": 0.23697523772716522, + "learning_rate": 4.999668209338808e-06, + "loss": 0.7471, + "step": 233 + }, + { + "epoch": 0.054528507068510174, + "grad_norm": 0.24024754762649536, + "learning_rate": 4.999663201361116e-06, + "loss": 0.7483, + "step": 234 + }, + { + "epoch": 0.05476153487649526, + "grad_norm": 0.2395297884941101, + "learning_rate": 4.999658155873864e-06, + "loss": 0.7367, + "step": 235 + }, + { + "epoch": 0.05499456268448035, + "grad_norm": 0.22585037350654602, + "learning_rate": 4.999653072877129e-06, + "loss": 0.7072, + "step": 236 + }, + { + "epoch": 0.055227590492465437, + "grad_norm": 0.23874503374099731, + "learning_rate": 4.999647952370985e-06, + "loss": 0.7088, + "step": 237 + }, + { + "epoch": 0.05546061830045052, + "grad_norm": 0.23172254860401154, + "learning_rate": 4.999642794355511e-06, + "loss": 0.7486, + "step": 238 + }, + { + "epoch": 0.05569364610843561, + "grad_norm": 0.2458873689174652, + "learning_rate": 4.999637598830783e-06, + "loss": 0.7344, + "step": 239 + }, + { + "epoch": 0.05592667391642069, + "grad_norm": 0.22992630302906036, + "learning_rate": 4.99963236579688e-06, + "loss": 0.7319, + "step": 240 + }, + { + "epoch": 0.05615970172440578, + "grad_norm": 0.2370229810476303, + "learning_rate": 4.9996270952538804e-06, + "loss": 0.7308, + "step": 241 + }, + { + "epoch": 0.05639272953239086, + "grad_norm": 0.23465345799922943, + "learning_rate": 4.999621787201862e-06, + "loss": 0.726, + "step": 242 + }, + { + "epoch": 0.05662575734037595, + "grad_norm": 0.23192419111728668, + "learning_rate": 4.999616441640906e-06, + "loss": 0.7147, + "step": 243 + }, + { + "epoch": 0.05685878514836104, + "grad_norm": 0.2386251986026764, + "learning_rate": 4.999611058571092e-06, + "loss": 0.7407, + "step": 244 + }, + { + "epoch": 0.057091812956346126, + "grad_norm": 0.24743086099624634, + "learning_rate": 4.9996056379925e-06, + "loss": 0.7469, + "step": 245 + }, + { + "epoch": 0.05732484076433121, + "grad_norm": 0.24018095433712006, + "learning_rate": 4.999600179905213e-06, + "loss": 0.7354, + "step": 246 + }, + { + "epoch": 0.0575578685723163, + "grad_norm": 0.23809517920017242, + "learning_rate": 4.999594684309312e-06, + "loss": 0.7174, + "step": 247 + }, + { + "epoch": 0.05779089638030138, + "grad_norm": 0.24602758884429932, + "learning_rate": 4.99958915120488e-06, + "loss": 0.7106, + "step": 248 + }, + { + "epoch": 0.05802392418828647, + "grad_norm": 0.23758511245250702, + "learning_rate": 4.999583580591999e-06, + "loss": 0.6975, + "step": 249 + }, + { + "epoch": 0.05825695199627155, + "grad_norm": 0.24656106531620026, + "learning_rate": 4.999577972470753e-06, + "loss": 0.7558, + "step": 250 + }, + { + "epoch": 0.05848997980425664, + "grad_norm": 0.25386252999305725, + "learning_rate": 4.999572326841226e-06, + "loss": 0.7313, + "step": 251 + }, + { + "epoch": 0.05872300761224173, + "grad_norm": 0.25689932703971863, + "learning_rate": 4.999566643703504e-06, + "loss": 0.7457, + "step": 252 + }, + { + "epoch": 0.058956035420226816, + "grad_norm": 0.23852482438087463, + "learning_rate": 4.9995609230576704e-06, + "loss": 0.7425, + "step": 253 + }, + { + "epoch": 0.0591890632282119, + "grad_norm": 0.2362796664237976, + "learning_rate": 4.999555164903811e-06, + "loss": 0.7093, + "step": 254 + }, + { + "epoch": 0.059422091036196986, + "grad_norm": 0.2364169806241989, + "learning_rate": 4.999549369242015e-06, + "loss": 0.7365, + "step": 255 + }, + { + "epoch": 0.05965511884418207, + "grad_norm": 0.2310011386871338, + "learning_rate": 4.999543536072367e-06, + "loss": 0.7222, + "step": 256 + }, + { + "epoch": 0.05988814665216716, + "grad_norm": 0.2424667477607727, + "learning_rate": 4.999537665394954e-06, + "loss": 0.7627, + "step": 257 + }, + { + "epoch": 0.06012117446015224, + "grad_norm": 0.25183749198913574, + "learning_rate": 4.999531757209866e-06, + "loss": 0.734, + "step": 258 + }, + { + "epoch": 0.060354202268137334, + "grad_norm": 0.2403503656387329, + "learning_rate": 4.9995258115171906e-06, + "loss": 0.7072, + "step": 259 + }, + { + "epoch": 0.06058723007612242, + "grad_norm": 0.25794631242752075, + "learning_rate": 4.999519828317017e-06, + "loss": 0.7412, + "step": 260 + }, + { + "epoch": 0.060820257884107505, + "grad_norm": 0.2556687295436859, + "learning_rate": 4.999513807609435e-06, + "loss": 0.7273, + "step": 261 + }, + { + "epoch": 0.06105328569209259, + "grad_norm": 0.2458338886499405, + "learning_rate": 4.999507749394536e-06, + "loss": 0.7565, + "step": 262 + }, + { + "epoch": 0.061286313500077676, + "grad_norm": 0.2361326813697815, + "learning_rate": 4.99950165367241e-06, + "loss": 0.729, + "step": 263 + }, + { + "epoch": 0.06151934130806276, + "grad_norm": 0.24156038463115692, + "learning_rate": 4.999495520443147e-06, + "loss": 0.7227, + "step": 264 + }, + { + "epoch": 0.061752369116047846, + "grad_norm": 0.23659822344779968, + "learning_rate": 4.999489349706842e-06, + "loss": 0.744, + "step": 265 + }, + { + "epoch": 0.06198539692403293, + "grad_norm": 0.23918043076992035, + "learning_rate": 4.999483141463586e-06, + "loss": 0.726, + "step": 266 + }, + { + "epoch": 0.062218424732018024, + "grad_norm": 0.24140456318855286, + "learning_rate": 4.999476895713472e-06, + "loss": 0.7321, + "step": 267 + }, + { + "epoch": 0.06245145254000311, + "grad_norm": 0.2419995367527008, + "learning_rate": 4.999470612456594e-06, + "loss": 0.7091, + "step": 268 + }, + { + "epoch": 0.0626844803479882, + "grad_norm": 0.25285908579826355, + "learning_rate": 4.999464291693046e-06, + "loss": 0.7354, + "step": 269 + }, + { + "epoch": 0.06291750815597327, + "grad_norm": 0.25021588802337646, + "learning_rate": 4.9994579334229235e-06, + "loss": 0.7479, + "step": 270 + }, + { + "epoch": 0.06315053596395837, + "grad_norm": 0.24748603999614716, + "learning_rate": 4.999451537646322e-06, + "loss": 0.7375, + "step": 271 + }, + { + "epoch": 0.06338356377194346, + "grad_norm": 0.2499394714832306, + "learning_rate": 4.9994451043633366e-06, + "loss": 0.7522, + "step": 272 + }, + { + "epoch": 0.06361659157992854, + "grad_norm": 0.25145795941352844, + "learning_rate": 4.999438633574064e-06, + "loss": 0.7472, + "step": 273 + }, + { + "epoch": 0.06384961938791363, + "grad_norm": 0.24611398577690125, + "learning_rate": 4.999432125278601e-06, + "loss": 0.719, + "step": 274 + }, + { + "epoch": 0.0640826471958987, + "grad_norm": 0.23927611112594604, + "learning_rate": 4.999425579477047e-06, + "loss": 0.7358, + "step": 275 + }, + { + "epoch": 0.0643156750038838, + "grad_norm": 0.24265727400779724, + "learning_rate": 4.999418996169499e-06, + "loss": 0.722, + "step": 276 + }, + { + "epoch": 0.06454870281186888, + "grad_norm": 0.2381640374660492, + "learning_rate": 4.999412375356055e-06, + "loss": 0.7206, + "step": 277 + }, + { + "epoch": 0.06478173061985397, + "grad_norm": 0.24963288009166718, + "learning_rate": 4.999405717036816e-06, + "loss": 0.731, + "step": 278 + }, + { + "epoch": 0.06501475842783906, + "grad_norm": 0.25031325221061707, + "learning_rate": 4.999399021211881e-06, + "loss": 0.7253, + "step": 279 + }, + { + "epoch": 0.06524778623582414, + "grad_norm": 0.25496459007263184, + "learning_rate": 4.999392287881351e-06, + "loss": 0.7423, + "step": 280 + }, + { + "epoch": 0.06548081404380923, + "grad_norm": 0.25405147671699524, + "learning_rate": 4.999385517045325e-06, + "loss": 0.765, + "step": 281 + }, + { + "epoch": 0.06571384185179431, + "grad_norm": 0.25281643867492676, + "learning_rate": 4.9993787087039075e-06, + "loss": 0.7257, + "step": 282 + }, + { + "epoch": 0.0659468696597794, + "grad_norm": 0.25418737530708313, + "learning_rate": 4.999371862857199e-06, + "loss": 0.755, + "step": 283 + }, + { + "epoch": 0.06617989746776448, + "grad_norm": 0.2420542687177658, + "learning_rate": 4.999364979505303e-06, + "loss": 0.7365, + "step": 284 + }, + { + "epoch": 0.06641292527574957, + "grad_norm": 0.2453756034374237, + "learning_rate": 4.9993580586483225e-06, + "loss": 0.7361, + "step": 285 + }, + { + "epoch": 0.06664595308373467, + "grad_norm": 0.24084922671318054, + "learning_rate": 4.999351100286361e-06, + "loss": 0.6936, + "step": 286 + }, + { + "epoch": 0.06687898089171974, + "grad_norm": 0.25480175018310547, + "learning_rate": 4.999344104419523e-06, + "loss": 0.7625, + "step": 287 + }, + { + "epoch": 0.06711200869970484, + "grad_norm": 0.24831323325634003, + "learning_rate": 4.999337071047914e-06, + "loss": 0.7145, + "step": 288 + }, + { + "epoch": 0.06734503650768991, + "grad_norm": 0.23863226175308228, + "learning_rate": 4.999330000171639e-06, + "loss": 0.6951, + "step": 289 + }, + { + "epoch": 0.06757806431567501, + "grad_norm": 0.24551492929458618, + "learning_rate": 4.999322891790804e-06, + "loss": 0.7037, + "step": 290 + }, + { + "epoch": 0.06781109212366009, + "grad_norm": 0.24633821845054626, + "learning_rate": 4.999315745905517e-06, + "loss": 0.747, + "step": 291 + }, + { + "epoch": 0.06804411993164518, + "grad_norm": 0.25016093254089355, + "learning_rate": 4.999308562515883e-06, + "loss": 0.7377, + "step": 292 + }, + { + "epoch": 0.06827714773963026, + "grad_norm": 0.2484135776758194, + "learning_rate": 4.999301341622011e-06, + "loss": 0.7422, + "step": 293 + }, + { + "epoch": 0.06851017554761535, + "grad_norm": 0.24550265073776245, + "learning_rate": 4.99929408322401e-06, + "loss": 0.7261, + "step": 294 + }, + { + "epoch": 0.06874320335560044, + "grad_norm": 0.2469964325428009, + "learning_rate": 4.999286787321989e-06, + "loss": 0.7113, + "step": 295 + }, + { + "epoch": 0.06897623116358552, + "grad_norm": 0.24772542715072632, + "learning_rate": 4.999279453916055e-06, + "loss": 0.7136, + "step": 296 + }, + { + "epoch": 0.06920925897157061, + "grad_norm": 0.2547084391117096, + "learning_rate": 4.999272083006322e-06, + "loss": 0.6882, + "step": 297 + }, + { + "epoch": 0.06944228677955569, + "grad_norm": 0.2466219663619995, + "learning_rate": 4.999264674592896e-06, + "loss": 0.7157, + "step": 298 + }, + { + "epoch": 0.06967531458754078, + "grad_norm": 0.2598658800125122, + "learning_rate": 4.999257228675892e-06, + "loss": 0.6889, + "step": 299 + }, + { + "epoch": 0.06990834239552586, + "grad_norm": 0.2506358325481415, + "learning_rate": 4.999249745255419e-06, + "loss": 0.7552, + "step": 300 + }, + { + "epoch": 0.07014137020351095, + "grad_norm": 0.242702916264534, + "learning_rate": 4.999242224331592e-06, + "loss": 0.728, + "step": 301 + }, + { + "epoch": 0.07037439801149604, + "grad_norm": 0.250826358795166, + "learning_rate": 4.999234665904522e-06, + "loss": 0.6755, + "step": 302 + }, + { + "epoch": 0.07060742581948112, + "grad_norm": 0.25157541036605835, + "learning_rate": 4.999227069974323e-06, + "loss": 0.7229, + "step": 303 + }, + { + "epoch": 0.07084045362746622, + "grad_norm": 0.2394135594367981, + "learning_rate": 4.999219436541109e-06, + "loss": 0.7191, + "step": 304 + }, + { + "epoch": 0.0710734814354513, + "grad_norm": 0.26184362173080444, + "learning_rate": 4.999211765604995e-06, + "loss": 0.7366, + "step": 305 + }, + { + "epoch": 0.07130650924343639, + "grad_norm": 0.245759978890419, + "learning_rate": 4.999204057166094e-06, + "loss": 0.7079, + "step": 306 + }, + { + "epoch": 0.07153953705142146, + "grad_norm": 0.2391767054796219, + "learning_rate": 4.999196311224523e-06, + "loss": 0.7391, + "step": 307 + }, + { + "epoch": 0.07177256485940656, + "grad_norm": 0.25215908885002136, + "learning_rate": 4.9991885277804e-06, + "loss": 0.7359, + "step": 308 + }, + { + "epoch": 0.07200559266739164, + "grad_norm": 0.251105934381485, + "learning_rate": 4.999180706833839e-06, + "loss": 0.7299, + "step": 309 + }, + { + "epoch": 0.07223862047537673, + "grad_norm": 0.25377991795539856, + "learning_rate": 4.99917284838496e-06, + "loss": 0.7027, + "step": 310 + }, + { + "epoch": 0.07247164828336182, + "grad_norm": 0.2490369826555252, + "learning_rate": 4.9991649524338785e-06, + "loss": 0.7581, + "step": 311 + }, + { + "epoch": 0.0727046760913469, + "grad_norm": 0.2513660490512848, + "learning_rate": 4.9991570189807135e-06, + "loss": 0.7343, + "step": 312 + }, + { + "epoch": 0.07293770389933199, + "grad_norm": 0.24804595112800598, + "learning_rate": 4.999149048025586e-06, + "loss": 0.7136, + "step": 313 + }, + { + "epoch": 0.07317073170731707, + "grad_norm": 0.2489134669303894, + "learning_rate": 4.999141039568613e-06, + "loss": 0.7058, + "step": 314 + }, + { + "epoch": 0.07340375951530216, + "grad_norm": 0.24533462524414062, + "learning_rate": 4.999132993609916e-06, + "loss": 0.7166, + "step": 315 + }, + { + "epoch": 0.07363678732328724, + "grad_norm": 0.2505927085876465, + "learning_rate": 4.999124910149616e-06, + "loss": 0.7194, + "step": 316 + }, + { + "epoch": 0.07386981513127233, + "grad_norm": 0.25363633036613464, + "learning_rate": 4.999116789187834e-06, + "loss": 0.7543, + "step": 317 + }, + { + "epoch": 0.07410284293925742, + "grad_norm": 0.24882932007312775, + "learning_rate": 4.99910863072469e-06, + "loss": 0.7179, + "step": 318 + }, + { + "epoch": 0.0743358707472425, + "grad_norm": 0.23970524966716766, + "learning_rate": 4.999100434760309e-06, + "loss": 0.7236, + "step": 319 + }, + { + "epoch": 0.0745688985552276, + "grad_norm": 0.25722166895866394, + "learning_rate": 4.999092201294814e-06, + "loss": 0.7633, + "step": 320 + }, + { + "epoch": 0.07480192636321267, + "grad_norm": 0.2641606330871582, + "learning_rate": 4.999083930328327e-06, + "loss": 0.7386, + "step": 321 + }, + { + "epoch": 0.07503495417119777, + "grad_norm": 0.2548326551914215, + "learning_rate": 4.999075621860973e-06, + "loss": 0.7306, + "step": 322 + }, + { + "epoch": 0.07526798197918284, + "grad_norm": 0.2574126422405243, + "learning_rate": 4.999067275892877e-06, + "loss": 0.7299, + "step": 323 + }, + { + "epoch": 0.07550100978716794, + "grad_norm": 0.25842714309692383, + "learning_rate": 4.999058892424163e-06, + "loss": 0.7329, + "step": 324 + }, + { + "epoch": 0.07573403759515303, + "grad_norm": 0.2642553448677063, + "learning_rate": 4.999050471454957e-06, + "loss": 0.7357, + "step": 325 + }, + { + "epoch": 0.0759670654031381, + "grad_norm": 0.25944018363952637, + "learning_rate": 4.9990420129853876e-06, + "loss": 0.7304, + "step": 326 + }, + { + "epoch": 0.0762000932111232, + "grad_norm": 0.2534201741218567, + "learning_rate": 4.999033517015579e-06, + "loss": 0.7143, + "step": 327 + }, + { + "epoch": 0.07643312101910828, + "grad_norm": 0.2712051570415497, + "learning_rate": 4.999024983545659e-06, + "loss": 0.7282, + "step": 328 + }, + { + "epoch": 0.07666614882709337, + "grad_norm": 0.256282776594162, + "learning_rate": 4.999016412575757e-06, + "loss": 0.7307, + "step": 329 + }, + { + "epoch": 0.07689917663507845, + "grad_norm": 0.25298354029655457, + "learning_rate": 4.999007804106001e-06, + "loss": 0.7144, + "step": 330 + }, + { + "epoch": 0.07713220444306354, + "grad_norm": 0.25451895594596863, + "learning_rate": 4.99899915813652e-06, + "loss": 0.7112, + "step": 331 + }, + { + "epoch": 0.07736523225104862, + "grad_norm": 0.2686389684677124, + "learning_rate": 4.998990474667444e-06, + "loss": 0.7308, + "step": 332 + }, + { + "epoch": 0.07759826005903371, + "grad_norm": 0.2620779871940613, + "learning_rate": 4.998981753698903e-06, + "loss": 0.7035, + "step": 333 + }, + { + "epoch": 0.0778312878670188, + "grad_norm": 0.25993025302886963, + "learning_rate": 4.9989729952310285e-06, + "loss": 0.7511, + "step": 334 + }, + { + "epoch": 0.07806431567500388, + "grad_norm": 0.2550770342350006, + "learning_rate": 4.998964199263951e-06, + "loss": 0.7035, + "step": 335 + }, + { + "epoch": 0.07829734348298897, + "grad_norm": 0.25614380836486816, + "learning_rate": 4.998955365797803e-06, + "loss": 0.7219, + "step": 336 + }, + { + "epoch": 0.07853037129097405, + "grad_norm": 0.259289026260376, + "learning_rate": 4.998946494832717e-06, + "loss": 0.6984, + "step": 337 + }, + { + "epoch": 0.07876339909895914, + "grad_norm": 0.25654953718185425, + "learning_rate": 4.998937586368826e-06, + "loss": 0.7238, + "step": 338 + }, + { + "epoch": 0.07899642690694422, + "grad_norm": 0.2503283619880676, + "learning_rate": 4.998928640406264e-06, + "loss": 0.7122, + "step": 339 + }, + { + "epoch": 0.07922945471492931, + "grad_norm": 0.2624177038669586, + "learning_rate": 4.998919656945165e-06, + "loss": 0.7129, + "step": 340 + }, + { + "epoch": 0.07946248252291441, + "grad_norm": 0.2542095184326172, + "learning_rate": 4.9989106359856645e-06, + "loss": 0.7049, + "step": 341 + }, + { + "epoch": 0.07969551033089949, + "grad_norm": 0.2504916787147522, + "learning_rate": 4.998901577527896e-06, + "loss": 0.7298, + "step": 342 + }, + { + "epoch": 0.07992853813888458, + "grad_norm": 0.2611362636089325, + "learning_rate": 4.998892481571998e-06, + "loss": 0.7411, + "step": 343 + }, + { + "epoch": 0.08016156594686966, + "grad_norm": 0.2716434895992279, + "learning_rate": 4.998883348118104e-06, + "loss": 0.7063, + "step": 344 + }, + { + "epoch": 0.08039459375485475, + "grad_norm": 0.2586671710014343, + "learning_rate": 4.998874177166354e-06, + "loss": 0.7085, + "step": 345 + }, + { + "epoch": 0.08062762156283983, + "grad_norm": 0.2720732092857361, + "learning_rate": 4.998864968716884e-06, + "loss": 0.7207, + "step": 346 + }, + { + "epoch": 0.08086064937082492, + "grad_norm": 0.25597047805786133, + "learning_rate": 4.998855722769833e-06, + "loss": 0.7355, + "step": 347 + }, + { + "epoch": 0.08109367717881001, + "grad_norm": 0.2722516357898712, + "learning_rate": 4.9988464393253385e-06, + "loss": 0.718, + "step": 348 + }, + { + "epoch": 0.08132670498679509, + "grad_norm": 0.2619338929653168, + "learning_rate": 4.99883711838354e-06, + "loss": 0.7315, + "step": 349 + }, + { + "epoch": 0.08155973279478018, + "grad_norm": 0.27276793122291565, + "learning_rate": 4.998827759944579e-06, + "loss": 0.7243, + "step": 350 + }, + { + "epoch": 0.08179276060276526, + "grad_norm": 0.2667602598667145, + "learning_rate": 4.998818364008595e-06, + "loss": 0.7239, + "step": 351 + }, + { + "epoch": 0.08202578841075035, + "grad_norm": 0.2747623920440674, + "learning_rate": 4.998808930575728e-06, + "loss": 0.7402, + "step": 352 + }, + { + "epoch": 0.08225881621873543, + "grad_norm": 0.2709905803203583, + "learning_rate": 4.998799459646121e-06, + "loss": 0.7192, + "step": 353 + }, + { + "epoch": 0.08249184402672052, + "grad_norm": 0.26394927501678467, + "learning_rate": 4.998789951219915e-06, + "loss": 0.7285, + "step": 354 + }, + { + "epoch": 0.0827248718347056, + "grad_norm": 0.26141220331192017, + "learning_rate": 4.998780405297253e-06, + "loss": 0.7386, + "step": 355 + }, + { + "epoch": 0.0829578996426907, + "grad_norm": 0.25983375310897827, + "learning_rate": 4.998770821878279e-06, + "loss": 0.7451, + "step": 356 + }, + { + "epoch": 0.08319092745067579, + "grad_norm": 0.2517448365688324, + "learning_rate": 4.998761200963137e-06, + "loss": 0.7094, + "step": 357 + }, + { + "epoch": 0.08342395525866086, + "grad_norm": 0.2560007870197296, + "learning_rate": 4.998751542551969e-06, + "loss": 0.7465, + "step": 358 + }, + { + "epoch": 0.08365698306664596, + "grad_norm": 0.2673647701740265, + "learning_rate": 4.998741846644923e-06, + "loss": 0.7455, + "step": 359 + }, + { + "epoch": 0.08389001087463104, + "grad_norm": 0.26824450492858887, + "learning_rate": 4.998732113242142e-06, + "loss": 0.6901, + "step": 360 + }, + { + "epoch": 0.08412303868261613, + "grad_norm": 0.26135963201522827, + "learning_rate": 4.9987223423437734e-06, + "loss": 0.7332, + "step": 361 + }, + { + "epoch": 0.0843560664906012, + "grad_norm": 0.25458693504333496, + "learning_rate": 4.998712533949964e-06, + "loss": 0.7265, + "step": 362 + }, + { + "epoch": 0.0845890942985863, + "grad_norm": 0.2519102096557617, + "learning_rate": 4.99870268806086e-06, + "loss": 0.7281, + "step": 363 + }, + { + "epoch": 0.08482212210657139, + "grad_norm": 0.2623507082462311, + "learning_rate": 4.99869280467661e-06, + "loss": 0.7053, + "step": 364 + }, + { + "epoch": 0.08505514991455647, + "grad_norm": 0.2610650360584259, + "learning_rate": 4.998682883797362e-06, + "loss": 0.7062, + "step": 365 + }, + { + "epoch": 0.08528817772254156, + "grad_norm": 0.2616136968135834, + "learning_rate": 4.998672925423264e-06, + "loss": 0.7205, + "step": 366 + }, + { + "epoch": 0.08552120553052664, + "grad_norm": 0.27179041504859924, + "learning_rate": 4.998662929554467e-06, + "loss": 0.73, + "step": 367 + }, + { + "epoch": 0.08575423333851173, + "grad_norm": 0.2715303897857666, + "learning_rate": 4.99865289619112e-06, + "loss": 0.7391, + "step": 368 + }, + { + "epoch": 0.08598726114649681, + "grad_norm": 0.2580897808074951, + "learning_rate": 4.998642825333374e-06, + "loss": 0.728, + "step": 369 + }, + { + "epoch": 0.0862202889544819, + "grad_norm": 0.26969707012176514, + "learning_rate": 4.9986327169813795e-06, + "loss": 0.7152, + "step": 370 + }, + { + "epoch": 0.086453316762467, + "grad_norm": 0.2700117230415344, + "learning_rate": 4.9986225711352895e-06, + "loss": 0.7553, + "step": 371 + }, + { + "epoch": 0.08668634457045207, + "grad_norm": 0.2550394833087921, + "learning_rate": 4.9986123877952555e-06, + "loss": 0.7158, + "step": 372 + }, + { + "epoch": 0.08691937237843717, + "grad_norm": 0.25190266966819763, + "learning_rate": 4.998602166961429e-06, + "loss": 0.7193, + "step": 373 + }, + { + "epoch": 0.08715240018642224, + "grad_norm": 0.2563241422176361, + "learning_rate": 4.998591908633965e-06, + "loss": 0.7239, + "step": 374 + }, + { + "epoch": 0.08738542799440734, + "grad_norm": 0.2611631453037262, + "learning_rate": 4.9985816128130175e-06, + "loss": 0.715, + "step": 375 + }, + { + "epoch": 0.08761845580239241, + "grad_norm": 0.25888943672180176, + "learning_rate": 4.998571279498739e-06, + "loss": 0.6994, + "step": 376 + }, + { + "epoch": 0.0878514836103775, + "grad_norm": 0.2674463987350464, + "learning_rate": 4.998560908691288e-06, + "loss": 0.7193, + "step": 377 + }, + { + "epoch": 0.08808451141836258, + "grad_norm": 0.25503867864608765, + "learning_rate": 4.998550500390817e-06, + "loss": 0.7119, + "step": 378 + }, + { + "epoch": 0.08831753922634768, + "grad_norm": 0.2583731412887573, + "learning_rate": 4.998540054597484e-06, + "loss": 0.6991, + "step": 379 + }, + { + "epoch": 0.08855056703433277, + "grad_norm": 0.26033318042755127, + "learning_rate": 4.998529571311445e-06, + "loss": 0.6943, + "step": 380 + }, + { + "epoch": 0.08878359484231785, + "grad_norm": 0.26900988817214966, + "learning_rate": 4.998519050532857e-06, + "loss": 0.724, + "step": 381 + }, + { + "epoch": 0.08901662265030294, + "grad_norm": 0.26029253005981445, + "learning_rate": 4.998508492261878e-06, + "loss": 0.7036, + "step": 382 + }, + { + "epoch": 0.08924965045828802, + "grad_norm": 0.26662078499794006, + "learning_rate": 4.998497896498668e-06, + "loss": 0.7297, + "step": 383 + }, + { + "epoch": 0.08948267826627311, + "grad_norm": 0.26625850796699524, + "learning_rate": 4.998487263243384e-06, + "loss": 0.6876, + "step": 384 + }, + { + "epoch": 0.08971570607425819, + "grad_norm": 0.27606239914894104, + "learning_rate": 4.998476592496186e-06, + "loss": 0.7129, + "step": 385 + }, + { + "epoch": 0.08994873388224328, + "grad_norm": 0.2647344470024109, + "learning_rate": 4.9984658842572354e-06, + "loss": 0.7128, + "step": 386 + }, + { + "epoch": 0.09018176169022837, + "grad_norm": 0.26329436898231506, + "learning_rate": 4.998455138526691e-06, + "loss": 0.7098, + "step": 387 + }, + { + "epoch": 0.09041478949821345, + "grad_norm": 0.26317068934440613, + "learning_rate": 4.9984443553047145e-06, + "loss": 0.7004, + "step": 388 + }, + { + "epoch": 0.09064781730619854, + "grad_norm": 0.25156447291374207, + "learning_rate": 4.998433534591469e-06, + "loss": 0.6939, + "step": 389 + }, + { + "epoch": 0.09088084511418362, + "grad_norm": 0.26344063878059387, + "learning_rate": 4.998422676387116e-06, + "loss": 0.6832, + "step": 390 + }, + { + "epoch": 0.09111387292216871, + "grad_norm": 0.26562049984931946, + "learning_rate": 4.998411780691819e-06, + "loss": 0.7377, + "step": 391 + }, + { + "epoch": 0.0913469007301538, + "grad_norm": 0.262561172246933, + "learning_rate": 4.99840084750574e-06, + "loss": 0.7145, + "step": 392 + }, + { + "epoch": 0.09157992853813889, + "grad_norm": 0.25424498319625854, + "learning_rate": 4.998389876829044e-06, + "loss": 0.6637, + "step": 393 + }, + { + "epoch": 0.09181295634612396, + "grad_norm": 0.26349854469299316, + "learning_rate": 4.998378868661896e-06, + "loss": 0.7269, + "step": 394 + }, + { + "epoch": 0.09204598415410906, + "grad_norm": 0.26946550607681274, + "learning_rate": 4.998367823004461e-06, + "loss": 0.7094, + "step": 395 + }, + { + "epoch": 0.09227901196209415, + "grad_norm": 0.26526933908462524, + "learning_rate": 4.9983567398569044e-06, + "loss": 0.7219, + "step": 396 + }, + { + "epoch": 0.09251203977007923, + "grad_norm": 0.2611550986766815, + "learning_rate": 4.998345619219392e-06, + "loss": 0.7068, + "step": 397 + }, + { + "epoch": 0.09274506757806432, + "grad_norm": 0.2719970643520355, + "learning_rate": 4.998334461092092e-06, + "loss": 0.7288, + "step": 398 + }, + { + "epoch": 0.0929780953860494, + "grad_norm": 0.26935720443725586, + "learning_rate": 4.998323265475172e-06, + "loss": 0.7132, + "step": 399 + }, + { + "epoch": 0.09321112319403449, + "grad_norm": 0.26246386766433716, + "learning_rate": 4.998312032368799e-06, + "loss": 0.6887, + "step": 400 + }, + { + "epoch": 0.09344415100201957, + "grad_norm": 0.27224159240722656, + "learning_rate": 4.99830076177314e-06, + "loss": 0.7466, + "step": 401 + }, + { + "epoch": 0.09367717881000466, + "grad_norm": 0.2665623128414154, + "learning_rate": 4.998289453688367e-06, + "loss": 0.7039, + "step": 402 + }, + { + "epoch": 0.09391020661798975, + "grad_norm": 0.28303226828575134, + "learning_rate": 4.998278108114649e-06, + "loss": 0.7292, + "step": 403 + }, + { + "epoch": 0.09414323442597483, + "grad_norm": 0.2726888954639435, + "learning_rate": 4.9982667250521564e-06, + "loss": 0.6881, + "step": 404 + }, + { + "epoch": 0.09437626223395992, + "grad_norm": 0.2580481767654419, + "learning_rate": 4.998255304501059e-06, + "loss": 0.695, + "step": 405 + }, + { + "epoch": 0.094609290041945, + "grad_norm": 0.26251721382141113, + "learning_rate": 4.998243846461528e-06, + "loss": 0.7118, + "step": 406 + }, + { + "epoch": 0.0948423178499301, + "grad_norm": 0.26338014006614685, + "learning_rate": 4.998232350933736e-06, + "loss": 0.6886, + "step": 407 + }, + { + "epoch": 0.09507534565791517, + "grad_norm": 0.2690316140651703, + "learning_rate": 4.998220817917855e-06, + "loss": 0.7125, + "step": 408 + }, + { + "epoch": 0.09530837346590026, + "grad_norm": 0.27284082770347595, + "learning_rate": 4.998209247414059e-06, + "loss": 0.7315, + "step": 409 + }, + { + "epoch": 0.09554140127388536, + "grad_norm": 0.27260881662368774, + "learning_rate": 4.9981976394225215e-06, + "loss": 0.714, + "step": 410 + }, + { + "epoch": 0.09577442908187044, + "grad_norm": 0.2741718590259552, + "learning_rate": 4.998185993943416e-06, + "loss": 0.7194, + "step": 411 + }, + { + "epoch": 0.09600745688985553, + "grad_norm": 0.2764624357223511, + "learning_rate": 4.998174310976919e-06, + "loss": 0.7318, + "step": 412 + }, + { + "epoch": 0.0962404846978406, + "grad_norm": 0.267203152179718, + "learning_rate": 4.998162590523202e-06, + "loss": 0.7189, + "step": 413 + }, + { + "epoch": 0.0964735125058257, + "grad_norm": 0.27287253737449646, + "learning_rate": 4.9981508325824455e-06, + "loss": 0.7327, + "step": 414 + }, + { + "epoch": 0.09670654031381078, + "grad_norm": 0.2635997235774994, + "learning_rate": 4.998139037154822e-06, + "loss": 0.6811, + "step": 415 + }, + { + "epoch": 0.09693956812179587, + "grad_norm": 0.2751455008983612, + "learning_rate": 4.998127204240511e-06, + "loss": 0.7123, + "step": 416 + }, + { + "epoch": 0.09717259592978095, + "grad_norm": 0.2580989897251129, + "learning_rate": 4.998115333839689e-06, + "loss": 0.7055, + "step": 417 + }, + { + "epoch": 0.09740562373776604, + "grad_norm": 0.282155841588974, + "learning_rate": 4.998103425952535e-06, + "loss": 0.7249, + "step": 418 + }, + { + "epoch": 0.09763865154575113, + "grad_norm": 0.266447514295578, + "learning_rate": 4.998091480579227e-06, + "loss": 0.711, + "step": 419 + }, + { + "epoch": 0.09787167935373621, + "grad_norm": 0.28153368830680847, + "learning_rate": 4.998079497719943e-06, + "loss": 0.7119, + "step": 420 + }, + { + "epoch": 0.0981047071617213, + "grad_norm": 0.2687012553215027, + "learning_rate": 4.9980674773748656e-06, + "loss": 0.7119, + "step": 421 + }, + { + "epoch": 0.09833773496970638, + "grad_norm": 0.2772180438041687, + "learning_rate": 4.998055419544174e-06, + "loss": 0.7274, + "step": 422 + }, + { + "epoch": 0.09857076277769147, + "grad_norm": 0.27405598759651184, + "learning_rate": 4.998043324228047e-06, + "loss": 0.7095, + "step": 423 + }, + { + "epoch": 0.09880379058567655, + "grad_norm": 0.2776533365249634, + "learning_rate": 4.9980311914266696e-06, + "loss": 0.7262, + "step": 424 + }, + { + "epoch": 0.09903681839366164, + "grad_norm": 0.26940077543258667, + "learning_rate": 4.998019021140222e-06, + "loss": 0.7102, + "step": 425 + }, + { + "epoch": 0.09926984620164674, + "grad_norm": 0.27262529730796814, + "learning_rate": 4.9980068133688865e-06, + "loss": 0.6989, + "step": 426 + }, + { + "epoch": 0.09950287400963181, + "grad_norm": 0.27136725187301636, + "learning_rate": 4.997994568112847e-06, + "loss": 0.7156, + "step": 427 + }, + { + "epoch": 0.0997359018176169, + "grad_norm": 0.28153106570243835, + "learning_rate": 4.997982285372287e-06, + "loss": 0.7348, + "step": 428 + }, + { + "epoch": 0.09996892962560198, + "grad_norm": 0.26744544506073, + "learning_rate": 4.997969965147392e-06, + "loss": 0.7217, + "step": 429 + }, + { + "epoch": 0.10020195743358708, + "grad_norm": 0.27284687757492065, + "learning_rate": 4.997957607438345e-06, + "loss": 0.7166, + "step": 430 + }, + { + "epoch": 0.10043498524157216, + "grad_norm": 0.27115944027900696, + "learning_rate": 4.997945212245332e-06, + "loss": 0.7204, + "step": 431 + }, + { + "epoch": 0.10066801304955725, + "grad_norm": 0.26061660051345825, + "learning_rate": 4.997932779568539e-06, + "loss": 0.7149, + "step": 432 + }, + { + "epoch": 0.10090104085754234, + "grad_norm": 0.26815304160118103, + "learning_rate": 4.997920309408153e-06, + "loss": 0.6698, + "step": 433 + }, + { + "epoch": 0.10113406866552742, + "grad_norm": 0.27857542037963867, + "learning_rate": 4.9979078017643615e-06, + "loss": 0.6933, + "step": 434 + }, + { + "epoch": 0.10136709647351251, + "grad_norm": 0.2685565948486328, + "learning_rate": 4.9978952566373506e-06, + "loss": 0.7017, + "step": 435 + }, + { + "epoch": 0.10160012428149759, + "grad_norm": 0.26038601994514465, + "learning_rate": 4.9978826740273105e-06, + "loss": 0.7031, + "step": 436 + }, + { + "epoch": 0.10183315208948268, + "grad_norm": 0.2697908282279968, + "learning_rate": 4.997870053934429e-06, + "loss": 0.7039, + "step": 437 + }, + { + "epoch": 0.10206617989746776, + "grad_norm": 0.270423948764801, + "learning_rate": 4.997857396358895e-06, + "loss": 0.7024, + "step": 438 + }, + { + "epoch": 0.10229920770545285, + "grad_norm": 0.27088746428489685, + "learning_rate": 4.997844701300899e-06, + "loss": 0.6742, + "step": 439 + }, + { + "epoch": 0.10253223551343793, + "grad_norm": 0.2678142488002777, + "learning_rate": 4.997831968760632e-06, + "loss": 0.6916, + "step": 440 + }, + { + "epoch": 0.10276526332142302, + "grad_norm": 0.28872260451316833, + "learning_rate": 4.997819198738284e-06, + "loss": 0.7522, + "step": 441 + }, + { + "epoch": 0.10299829112940811, + "grad_norm": 0.27392515540122986, + "learning_rate": 4.997806391234048e-06, + "loss": 0.7229, + "step": 442 + }, + { + "epoch": 0.1032313189373932, + "grad_norm": 0.2679874002933502, + "learning_rate": 4.9977935462481145e-06, + "loss": 0.6999, + "step": 443 + }, + { + "epoch": 0.10346434674537829, + "grad_norm": 0.2744854688644409, + "learning_rate": 4.997780663780677e-06, + "loss": 0.7382, + "step": 444 + }, + { + "epoch": 0.10369737455336336, + "grad_norm": 0.2624858319759369, + "learning_rate": 4.99776774383193e-06, + "loss": 0.6931, + "step": 445 + }, + { + "epoch": 0.10393040236134846, + "grad_norm": 0.2833358943462372, + "learning_rate": 4.9977547864020644e-06, + "loss": 0.7268, + "step": 446 + }, + { + "epoch": 0.10416343016933353, + "grad_norm": 0.28012388944625854, + "learning_rate": 4.9977417914912775e-06, + "loss": 0.7146, + "step": 447 + }, + { + "epoch": 0.10439645797731863, + "grad_norm": 0.2652537226676941, + "learning_rate": 4.997728759099764e-06, + "loss": 0.7039, + "step": 448 + }, + { + "epoch": 0.10462948578530372, + "grad_norm": 0.9657779335975647, + "learning_rate": 4.997715689227718e-06, + "loss": 0.681, + "step": 449 + }, + { + "epoch": 0.1048625135932888, + "grad_norm": 0.2833934724330902, + "learning_rate": 4.997702581875337e-06, + "loss": 0.7174, + "step": 450 + }, + { + "epoch": 0.10509554140127389, + "grad_norm": 0.2703401744365692, + "learning_rate": 4.997689437042817e-06, + "loss": 0.7138, + "step": 451 + }, + { + "epoch": 0.10532856920925897, + "grad_norm": 0.267728716135025, + "learning_rate": 4.997676254730355e-06, + "loss": 0.7162, + "step": 452 + }, + { + "epoch": 0.10556159701724406, + "grad_norm": 0.27212557196617126, + "learning_rate": 4.997663034938148e-06, + "loss": 0.699, + "step": 453 + }, + { + "epoch": 0.10579462482522914, + "grad_norm": 0.2720823884010315, + "learning_rate": 4.997649777666397e-06, + "loss": 0.6851, + "step": 454 + }, + { + "epoch": 0.10602765263321423, + "grad_norm": 0.2817004919052124, + "learning_rate": 4.9976364829153e-06, + "loss": 0.7119, + "step": 455 + }, + { + "epoch": 0.10626068044119932, + "grad_norm": 0.26471012830734253, + "learning_rate": 4.9976231506850546e-06, + "loss": 0.7132, + "step": 456 + }, + { + "epoch": 0.1064937082491844, + "grad_norm": 0.2735552489757538, + "learning_rate": 4.997609780975863e-06, + "loss": 0.7175, + "step": 457 + }, + { + "epoch": 0.1067267360571695, + "grad_norm": 0.2700481414794922, + "learning_rate": 4.997596373787925e-06, + "loss": 0.6942, + "step": 458 + }, + { + "epoch": 0.10695976386515457, + "grad_norm": 0.27634599804878235, + "learning_rate": 4.997582929121442e-06, + "loss": 0.7003, + "step": 459 + }, + { + "epoch": 0.10719279167313966, + "grad_norm": 0.26627108454704285, + "learning_rate": 4.997569446976614e-06, + "loss": 0.6964, + "step": 460 + }, + { + "epoch": 0.10742581948112474, + "grad_norm": 0.2728627026081085, + "learning_rate": 4.9975559273536465e-06, + "loss": 0.6653, + "step": 461 + }, + { + "epoch": 0.10765884728910984, + "grad_norm": 0.2738238275051117, + "learning_rate": 4.997542370252741e-06, + "loss": 0.6893, + "step": 462 + }, + { + "epoch": 0.10789187509709491, + "grad_norm": 0.2788858413696289, + "learning_rate": 4.9975287756741e-06, + "loss": 0.7193, + "step": 463 + }, + { + "epoch": 0.10812490290508, + "grad_norm": 0.28782856464385986, + "learning_rate": 4.997515143617929e-06, + "loss": 0.7212, + "step": 464 + }, + { + "epoch": 0.1083579307130651, + "grad_norm": 0.2826350927352905, + "learning_rate": 4.997501474084431e-06, + "loss": 0.7003, + "step": 465 + }, + { + "epoch": 0.10859095852105018, + "grad_norm": 0.28830617666244507, + "learning_rate": 4.997487767073812e-06, + "loss": 0.7408, + "step": 466 + }, + { + "epoch": 0.10882398632903527, + "grad_norm": 0.2735765874385834, + "learning_rate": 4.997474022586277e-06, + "loss": 0.6838, + "step": 467 + }, + { + "epoch": 0.10905701413702035, + "grad_norm": 0.273650199174881, + "learning_rate": 4.997460240622034e-06, + "loss": 0.7311, + "step": 468 + }, + { + "epoch": 0.10929004194500544, + "grad_norm": 0.26199105381965637, + "learning_rate": 4.997446421181288e-06, + "loss": 0.7093, + "step": 469 + }, + { + "epoch": 0.10952306975299052, + "grad_norm": 0.2845688462257385, + "learning_rate": 4.997432564264247e-06, + "loss": 0.6869, + "step": 470 + }, + { + "epoch": 0.10975609756097561, + "grad_norm": 0.28403785824775696, + "learning_rate": 4.997418669871119e-06, + "loss": 0.6926, + "step": 471 + }, + { + "epoch": 0.1099891253689607, + "grad_norm": 0.2828761637210846, + "learning_rate": 4.9974047380021115e-06, + "loss": 0.7251, + "step": 472 + }, + { + "epoch": 0.11022215317694578, + "grad_norm": 0.27970263361930847, + "learning_rate": 4.997390768657437e-06, + "loss": 0.7329, + "step": 473 + }, + { + "epoch": 0.11045518098493087, + "grad_norm": 0.26096630096435547, + "learning_rate": 4.9973767618373e-06, + "loss": 0.7077, + "step": 474 + }, + { + "epoch": 0.11068820879291595, + "grad_norm": 0.2903180420398712, + "learning_rate": 4.997362717541915e-06, + "loss": 0.7268, + "step": 475 + }, + { + "epoch": 0.11092123660090104, + "grad_norm": 0.28232213854789734, + "learning_rate": 4.997348635771489e-06, + "loss": 0.6929, + "step": 476 + }, + { + "epoch": 0.11115426440888612, + "grad_norm": 0.29427483677864075, + "learning_rate": 4.997334516526238e-06, + "loss": 0.6959, + "step": 477 + }, + { + "epoch": 0.11138729221687121, + "grad_norm": 0.2723221182823181, + "learning_rate": 4.99732035980637e-06, + "loss": 0.6972, + "step": 478 + }, + { + "epoch": 0.11162032002485629, + "grad_norm": 0.28987693786621094, + "learning_rate": 4.997306165612098e-06, + "loss": 0.7064, + "step": 479 + }, + { + "epoch": 0.11185334783284138, + "grad_norm": 0.2901027500629425, + "learning_rate": 4.997291933943637e-06, + "loss": 0.7226, + "step": 480 + }, + { + "epoch": 0.11208637564082648, + "grad_norm": 0.28449949622154236, + "learning_rate": 4.997277664801198e-06, + "loss": 0.6659, + "step": 481 + }, + { + "epoch": 0.11231940344881156, + "grad_norm": 0.2740713059902191, + "learning_rate": 4.997263358184997e-06, + "loss": 0.7052, + "step": 482 + }, + { + "epoch": 0.11255243125679665, + "grad_norm": 0.2873528301715851, + "learning_rate": 4.9972490140952475e-06, + "loss": 0.7108, + "step": 483 + }, + { + "epoch": 0.11278545906478173, + "grad_norm": 0.27575674653053284, + "learning_rate": 4.997234632532166e-06, + "loss": 0.7021, + "step": 484 + }, + { + "epoch": 0.11301848687276682, + "grad_norm": 0.27255427837371826, + "learning_rate": 4.997220213495967e-06, + "loss": 0.7016, + "step": 485 + }, + { + "epoch": 0.1132515146807519, + "grad_norm": 0.27455398440361023, + "learning_rate": 4.997205756986867e-06, + "loss": 0.674, + "step": 486 + }, + { + "epoch": 0.11348454248873699, + "grad_norm": 0.3010913133621216, + "learning_rate": 4.997191263005084e-06, + "loss": 0.694, + "step": 487 + }, + { + "epoch": 0.11371757029672208, + "grad_norm": 0.29097482562065125, + "learning_rate": 4.9971767315508354e-06, + "loss": 0.7394, + "step": 488 + }, + { + "epoch": 0.11395059810470716, + "grad_norm": 0.28544801473617554, + "learning_rate": 4.997162162624339e-06, + "loss": 0.682, + "step": 489 + }, + { + "epoch": 0.11418362591269225, + "grad_norm": 0.27722764015197754, + "learning_rate": 4.9971475562258115e-06, + "loss": 0.712, + "step": 490 + }, + { + "epoch": 0.11441665372067733, + "grad_norm": 0.28289729356765747, + "learning_rate": 4.997132912355474e-06, + "loss": 0.7377, + "step": 491 + }, + { + "epoch": 0.11464968152866242, + "grad_norm": 0.2930034399032593, + "learning_rate": 4.997118231013547e-06, + "loss": 0.7268, + "step": 492 + }, + { + "epoch": 0.1148827093366475, + "grad_norm": 0.28607070446014404, + "learning_rate": 4.99710351220025e-06, + "loss": 0.6526, + "step": 493 + }, + { + "epoch": 0.1151157371446326, + "grad_norm": 0.28152915835380554, + "learning_rate": 4.997088755915802e-06, + "loss": 0.7411, + "step": 494 + }, + { + "epoch": 0.11534876495261769, + "grad_norm": 0.27191075682640076, + "learning_rate": 4.9970739621604265e-06, + "loss": 0.7089, + "step": 495 + }, + { + "epoch": 0.11558179276060276, + "grad_norm": 0.28994372487068176, + "learning_rate": 4.9970591309343465e-06, + "loss": 0.7005, + "step": 496 + }, + { + "epoch": 0.11581482056858786, + "grad_norm": 0.28294068574905396, + "learning_rate": 4.997044262237782e-06, + "loss": 0.7161, + "step": 497 + }, + { + "epoch": 0.11604784837657293, + "grad_norm": 0.2823251783847809, + "learning_rate": 4.997029356070957e-06, + "loss": 0.687, + "step": 498 + }, + { + "epoch": 0.11628087618455803, + "grad_norm": 0.2785414457321167, + "learning_rate": 4.997014412434097e-06, + "loss": 0.7074, + "step": 499 + }, + { + "epoch": 0.1165139039925431, + "grad_norm": 0.2828596532344818, + "learning_rate": 4.996999431327423e-06, + "loss": 0.7362, + "step": 500 + }, + { + "epoch": 0.1167469318005282, + "grad_norm": 0.27480649948120117, + "learning_rate": 4.996984412751162e-06, + "loss": 0.6886, + "step": 501 + }, + { + "epoch": 0.11697995960851328, + "grad_norm": 0.2780996263027191, + "learning_rate": 4.996969356705539e-06, + "loss": 0.697, + "step": 502 + }, + { + "epoch": 0.11721298741649837, + "grad_norm": 0.2799265384674072, + "learning_rate": 4.996954263190779e-06, + "loss": 0.7121, + "step": 503 + }, + { + "epoch": 0.11744601522448346, + "grad_norm": 0.26891598105430603, + "learning_rate": 4.99693913220711e-06, + "loss": 0.7176, + "step": 504 + }, + { + "epoch": 0.11767904303246854, + "grad_norm": 0.2739698588848114, + "learning_rate": 4.996923963754758e-06, + "loss": 0.6974, + "step": 505 + }, + { + "epoch": 0.11791207084045363, + "grad_norm": 0.28994348645210266, + "learning_rate": 4.996908757833951e-06, + "loss": 0.7278, + "step": 506 + }, + { + "epoch": 0.11814509864843871, + "grad_norm": 0.2842945456504822, + "learning_rate": 4.996893514444917e-06, + "loss": 0.6901, + "step": 507 + }, + { + "epoch": 0.1183781264564238, + "grad_norm": 0.2860533595085144, + "learning_rate": 4.9968782335878854e-06, + "loss": 0.7225, + "step": 508 + }, + { + "epoch": 0.11861115426440888, + "grad_norm": 0.29213234782218933, + "learning_rate": 4.996862915263084e-06, + "loss": 0.7342, + "step": 509 + }, + { + "epoch": 0.11884418207239397, + "grad_norm": 0.2886195778846741, + "learning_rate": 4.996847559470744e-06, + "loss": 0.704, + "step": 510 + }, + { + "epoch": 0.11907720988037906, + "grad_norm": 0.2848113775253296, + "learning_rate": 4.996832166211096e-06, + "loss": 0.7116, + "step": 511 + }, + { + "epoch": 0.11931023768836414, + "grad_norm": 0.26638761162757874, + "learning_rate": 4.99681673548437e-06, + "loss": 0.6897, + "step": 512 + }, + { + "epoch": 0.11954326549634924, + "grad_norm": 0.2708003520965576, + "learning_rate": 4.996801267290797e-06, + "loss": 0.6808, + "step": 513 + }, + { + "epoch": 0.11977629330433431, + "grad_norm": 0.28945574164390564, + "learning_rate": 4.9967857616306115e-06, + "loss": 0.7201, + "step": 514 + }, + { + "epoch": 0.1200093211123194, + "grad_norm": 0.2842789888381958, + "learning_rate": 4.996770218504044e-06, + "loss": 0.6857, + "step": 515 + }, + { + "epoch": 0.12024234892030448, + "grad_norm": 0.26787251234054565, + "learning_rate": 4.996754637911329e-06, + "loss": 0.6832, + "step": 516 + }, + { + "epoch": 0.12047537672828958, + "grad_norm": 0.27095159888267517, + "learning_rate": 4.9967390198527e-06, + "loss": 0.6813, + "step": 517 + }, + { + "epoch": 0.12070840453627467, + "grad_norm": 0.28457996249198914, + "learning_rate": 4.996723364328391e-06, + "loss": 0.7039, + "step": 518 + }, + { + "epoch": 0.12094143234425975, + "grad_norm": 0.2792329788208008, + "learning_rate": 4.9967076713386365e-06, + "loss": 0.7212, + "step": 519 + }, + { + "epoch": 0.12117446015224484, + "grad_norm": 0.2768925726413727, + "learning_rate": 4.996691940883672e-06, + "loss": 0.6719, + "step": 520 + }, + { + "epoch": 0.12140748796022992, + "grad_norm": 0.29109251499176025, + "learning_rate": 4.996676172963736e-06, + "loss": 0.7223, + "step": 521 + }, + { + "epoch": 0.12164051576821501, + "grad_norm": 0.28198501467704773, + "learning_rate": 4.996660367579061e-06, + "loss": 0.7116, + "step": 522 + }, + { + "epoch": 0.12187354357620009, + "grad_norm": 0.2883881628513336, + "learning_rate": 4.996644524729888e-06, + "loss": 0.7007, + "step": 523 + }, + { + "epoch": 0.12210657138418518, + "grad_norm": 0.2818365693092346, + "learning_rate": 4.996628644416453e-06, + "loss": 0.7235, + "step": 524 + }, + { + "epoch": 0.12233959919217026, + "grad_norm": 0.2791387140750885, + "learning_rate": 4.996612726638994e-06, + "loss": 0.6933, + "step": 525 + }, + { + "epoch": 0.12257262700015535, + "grad_norm": 0.2868289053440094, + "learning_rate": 4.9965967713977506e-06, + "loss": 0.7168, + "step": 526 + }, + { + "epoch": 0.12280565480814044, + "grad_norm": 0.284287691116333, + "learning_rate": 4.996580778692962e-06, + "loss": 0.693, + "step": 527 + }, + { + "epoch": 0.12303868261612552, + "grad_norm": 0.2871397137641907, + "learning_rate": 4.996564748524868e-06, + "loss": 0.6935, + "step": 528 + }, + { + "epoch": 0.12327171042411061, + "grad_norm": 0.290457546710968, + "learning_rate": 4.9965486808937095e-06, + "loss": 0.7246, + "step": 529 + }, + { + "epoch": 0.12350473823209569, + "grad_norm": 0.28620800375938416, + "learning_rate": 4.996532575799727e-06, + "loss": 0.718, + "step": 530 + }, + { + "epoch": 0.12373776604008078, + "grad_norm": 0.27761298418045044, + "learning_rate": 4.996516433243162e-06, + "loss": 0.7167, + "step": 531 + }, + { + "epoch": 0.12397079384806586, + "grad_norm": 0.28706222772598267, + "learning_rate": 4.996500253224258e-06, + "loss": 0.7043, + "step": 532 + }, + { + "epoch": 0.12420382165605096, + "grad_norm": 0.284008264541626, + "learning_rate": 4.996484035743256e-06, + "loss": 0.7015, + "step": 533 + }, + { + "epoch": 0.12443684946403605, + "grad_norm": 0.28923410177230835, + "learning_rate": 4.996467780800401e-06, + "loss": 0.6934, + "step": 534 + }, + { + "epoch": 0.12466987727202113, + "grad_norm": 0.297838419675827, + "learning_rate": 4.996451488395937e-06, + "loss": 0.7153, + "step": 535 + }, + { + "epoch": 0.12490290508000622, + "grad_norm": 0.2865050733089447, + "learning_rate": 4.996435158530107e-06, + "loss": 0.7073, + "step": 536 + }, + { + "epoch": 0.1251359328879913, + "grad_norm": 0.2982148230075836, + "learning_rate": 4.996418791203158e-06, + "loss": 0.7005, + "step": 537 + }, + { + "epoch": 0.1253689606959764, + "grad_norm": 0.2930370271205902, + "learning_rate": 4.996402386415333e-06, + "loss": 0.6938, + "step": 538 + }, + { + "epoch": 0.12560198850396148, + "grad_norm": 0.28063294291496277, + "learning_rate": 4.996385944166881e-06, + "loss": 0.7108, + "step": 539 + }, + { + "epoch": 0.12583501631194655, + "grad_norm": 0.2815013825893402, + "learning_rate": 4.996369464458046e-06, + "loss": 0.6996, + "step": 540 + }, + { + "epoch": 0.12606804411993164, + "grad_norm": 0.2848454415798187, + "learning_rate": 4.996352947289078e-06, + "loss": 0.7173, + "step": 541 + }, + { + "epoch": 0.12630107192791673, + "grad_norm": 0.2997225522994995, + "learning_rate": 4.996336392660223e-06, + "loss": 0.7092, + "step": 542 + }, + { + "epoch": 0.12653409973590182, + "grad_norm": 0.27983635663986206, + "learning_rate": 4.996319800571729e-06, + "loss": 0.7193, + "step": 543 + }, + { + "epoch": 0.12676712754388691, + "grad_norm": 0.2749515175819397, + "learning_rate": 4.996303171023847e-06, + "loss": 0.7129, + "step": 544 + }, + { + "epoch": 0.12700015535187198, + "grad_norm": 0.28092387318611145, + "learning_rate": 4.9962865040168256e-06, + "loss": 0.7035, + "step": 545 + }, + { + "epoch": 0.12723318315985707, + "grad_norm": 0.2733262777328491, + "learning_rate": 4.996269799550914e-06, + "loss": 0.6778, + "step": 546 + }, + { + "epoch": 0.12746621096784216, + "grad_norm": 0.2983027398586273, + "learning_rate": 4.996253057626363e-06, + "loss": 0.7048, + "step": 547 + }, + { + "epoch": 0.12769923877582726, + "grad_norm": 0.2896121144294739, + "learning_rate": 4.996236278243426e-06, + "loss": 0.7022, + "step": 548 + }, + { + "epoch": 0.12793226658381235, + "grad_norm": 0.2866162955760956, + "learning_rate": 4.996219461402352e-06, + "loss": 0.6942, + "step": 549 + }, + { + "epoch": 0.1281652943917974, + "grad_norm": 0.29023391008377075, + "learning_rate": 4.996202607103395e-06, + "loss": 0.7294, + "step": 550 + }, + { + "epoch": 0.1283983221997825, + "grad_norm": 0.27646350860595703, + "learning_rate": 4.996185715346808e-06, + "loss": 0.6822, + "step": 551 + }, + { + "epoch": 0.1286313500077676, + "grad_norm": 0.2973344922065735, + "learning_rate": 4.996168786132843e-06, + "loss": 0.6913, + "step": 552 + }, + { + "epoch": 0.1288643778157527, + "grad_norm": 0.2969902455806732, + "learning_rate": 4.996151819461756e-06, + "loss": 0.7006, + "step": 553 + }, + { + "epoch": 0.12909740562373775, + "grad_norm": 0.2780134081840515, + "learning_rate": 4.9961348153338e-06, + "loss": 0.6992, + "step": 554 + }, + { + "epoch": 0.12933043343172285, + "grad_norm": 0.28041157126426697, + "learning_rate": 4.996117773749231e-06, + "loss": 0.6915, + "step": 555 + }, + { + "epoch": 0.12956346123970794, + "grad_norm": 0.27521926164627075, + "learning_rate": 4.996100694708305e-06, + "loss": 0.6825, + "step": 556 + }, + { + "epoch": 0.12979648904769303, + "grad_norm": 0.2831442058086395, + "learning_rate": 4.996083578211276e-06, + "loss": 0.6806, + "step": 557 + }, + { + "epoch": 0.13002951685567812, + "grad_norm": 0.2870921194553375, + "learning_rate": 4.996066424258404e-06, + "loss": 0.7004, + "step": 558 + }, + { + "epoch": 0.1302625446636632, + "grad_norm": 0.2789919674396515, + "learning_rate": 4.996049232849944e-06, + "loss": 0.6745, + "step": 559 + }, + { + "epoch": 0.13049557247164828, + "grad_norm": 0.2808243930339813, + "learning_rate": 4.996032003986157e-06, + "loss": 0.6866, + "step": 560 + }, + { + "epoch": 0.13072860027963337, + "grad_norm": 0.2794012427330017, + "learning_rate": 4.996014737667297e-06, + "loss": 0.6914, + "step": 561 + }, + { + "epoch": 0.13096162808761846, + "grad_norm": 0.2844577431678772, + "learning_rate": 4.995997433893628e-06, + "loss": 0.6951, + "step": 562 + }, + { + "epoch": 0.13119465589560353, + "grad_norm": 0.3051019608974457, + "learning_rate": 4.995980092665406e-06, + "loss": 0.6867, + "step": 563 + }, + { + "epoch": 0.13142768370358862, + "grad_norm": 0.3089688718318939, + "learning_rate": 4.995962713982893e-06, + "loss": 0.7442, + "step": 564 + }, + { + "epoch": 0.1316607115115737, + "grad_norm": 0.2827191948890686, + "learning_rate": 4.995945297846349e-06, + "loss": 0.6956, + "step": 565 + }, + { + "epoch": 0.1318937393195588, + "grad_norm": 0.30197271704673767, + "learning_rate": 4.995927844256035e-06, + "loss": 0.7348, + "step": 566 + }, + { + "epoch": 0.1321267671275439, + "grad_norm": 0.28756970167160034, + "learning_rate": 4.9959103532122144e-06, + "loss": 0.6977, + "step": 567 + }, + { + "epoch": 0.13235979493552896, + "grad_norm": 0.2906602621078491, + "learning_rate": 4.995892824715148e-06, + "loss": 0.7104, + "step": 568 + }, + { + "epoch": 0.13259282274351405, + "grad_norm": 0.2923468351364136, + "learning_rate": 4.995875258765101e-06, + "loss": 0.713, + "step": 569 + }, + { + "epoch": 0.13282585055149915, + "grad_norm": 0.30007535219192505, + "learning_rate": 4.995857655362334e-06, + "loss": 0.7017, + "step": 570 + }, + { + "epoch": 0.13305887835948424, + "grad_norm": 0.27815499901771545, + "learning_rate": 4.995840014507114e-06, + "loss": 0.7004, + "step": 571 + }, + { + "epoch": 0.13329190616746933, + "grad_norm": 0.29244887828826904, + "learning_rate": 4.9958223361997035e-06, + "loss": 0.6874, + "step": 572 + }, + { + "epoch": 0.1335249339754544, + "grad_norm": 0.28718239068984985, + "learning_rate": 4.995804620440369e-06, + "loss": 0.7222, + "step": 573 + }, + { + "epoch": 0.1337579617834395, + "grad_norm": 0.290454238653183, + "learning_rate": 4.995786867229377e-06, + "loss": 0.7033, + "step": 574 + }, + { + "epoch": 0.13399098959142458, + "grad_norm": 0.286780446767807, + "learning_rate": 4.995769076566992e-06, + "loss": 0.6937, + "step": 575 + }, + { + "epoch": 0.13422401739940967, + "grad_norm": 0.29561716318130493, + "learning_rate": 4.995751248453483e-06, + "loss": 0.7136, + "step": 576 + }, + { + "epoch": 0.13445704520739474, + "grad_norm": 0.2873508930206299, + "learning_rate": 4.995733382889115e-06, + "loss": 0.6905, + "step": 577 + }, + { + "epoch": 0.13469007301537983, + "grad_norm": 0.29414796829223633, + "learning_rate": 4.995715479874159e-06, + "loss": 0.7144, + "step": 578 + }, + { + "epoch": 0.13492310082336492, + "grad_norm": 0.27985337376594543, + "learning_rate": 4.995697539408881e-06, + "loss": 0.6932, + "step": 579 + }, + { + "epoch": 0.13515612863135001, + "grad_norm": 0.28410232067108154, + "learning_rate": 4.995679561493553e-06, + "loss": 0.6832, + "step": 580 + }, + { + "epoch": 0.1353891564393351, + "grad_norm": 0.28381258249282837, + "learning_rate": 4.995661546128443e-06, + "loss": 0.6993, + "step": 581 + }, + { + "epoch": 0.13562218424732017, + "grad_norm": 0.2869824767112732, + "learning_rate": 4.99564349331382e-06, + "loss": 0.6895, + "step": 582 + }, + { + "epoch": 0.13585521205530526, + "grad_norm": 0.2836722433567047, + "learning_rate": 4.995625403049957e-06, + "loss": 0.7061, + "step": 583 + }, + { + "epoch": 0.13608823986329036, + "grad_norm": 0.296436071395874, + "learning_rate": 4.995607275337126e-06, + "loss": 0.6936, + "step": 584 + }, + { + "epoch": 0.13632126767127545, + "grad_norm": 0.28385958075523376, + "learning_rate": 4.995589110175597e-06, + "loss": 0.7111, + "step": 585 + }, + { + "epoch": 0.1365542954792605, + "grad_norm": 0.28489843010902405, + "learning_rate": 4.995570907565644e-06, + "loss": 0.7088, + "step": 586 + }, + { + "epoch": 0.1367873232872456, + "grad_norm": 0.2872679531574249, + "learning_rate": 4.995552667507539e-06, + "loss": 0.6986, + "step": 587 + }, + { + "epoch": 0.1370203510952307, + "grad_norm": 0.3037053942680359, + "learning_rate": 4.995534390001557e-06, + "loss": 0.7121, + "step": 588 + }, + { + "epoch": 0.1372533789032158, + "grad_norm": 0.28891274333000183, + "learning_rate": 4.995516075047972e-06, + "loss": 0.7128, + "step": 589 + }, + { + "epoch": 0.13748640671120088, + "grad_norm": 0.28105786442756653, + "learning_rate": 4.995497722647058e-06, + "loss": 0.6908, + "step": 590 + }, + { + "epoch": 0.13771943451918595, + "grad_norm": 0.2814849317073822, + "learning_rate": 4.995479332799092e-06, + "loss": 0.6827, + "step": 591 + }, + { + "epoch": 0.13795246232717104, + "grad_norm": 0.2872859835624695, + "learning_rate": 4.995460905504347e-06, + "loss": 0.7152, + "step": 592 + }, + { + "epoch": 0.13818549013515613, + "grad_norm": 0.289855420589447, + "learning_rate": 4.995442440763102e-06, + "loss": 0.6931, + "step": 593 + }, + { + "epoch": 0.13841851794314122, + "grad_norm": 0.2874184846878052, + "learning_rate": 4.995423938575633e-06, + "loss": 0.7006, + "step": 594 + }, + { + "epoch": 0.1386515457511263, + "grad_norm": 0.2856663763523102, + "learning_rate": 4.995405398942219e-06, + "loss": 0.7226, + "step": 595 + }, + { + "epoch": 0.13888457355911138, + "grad_norm": 0.2995941936969757, + "learning_rate": 4.995386821863136e-06, + "loss": 0.7229, + "step": 596 + }, + { + "epoch": 0.13911760136709647, + "grad_norm": 0.28040891885757446, + "learning_rate": 4.995368207338666e-06, + "loss": 0.7141, + "step": 597 + }, + { + "epoch": 0.13935062917508156, + "grad_norm": 0.27735310792922974, + "learning_rate": 4.995349555369084e-06, + "loss": 0.6928, + "step": 598 + }, + { + "epoch": 0.13958365698306666, + "grad_norm": 0.29008832573890686, + "learning_rate": 4.995330865954674e-06, + "loss": 0.7354, + "step": 599 + }, + { + "epoch": 0.13981668479105172, + "grad_norm": 0.29084211587905884, + "learning_rate": 4.995312139095713e-06, + "loss": 0.7044, + "step": 600 + }, + { + "epoch": 0.1400497125990368, + "grad_norm": 0.2796522378921509, + "learning_rate": 4.995293374792486e-06, + "loss": 0.696, + "step": 601 + }, + { + "epoch": 0.1402827404070219, + "grad_norm": 0.29648593068122864, + "learning_rate": 4.995274573045271e-06, + "loss": 0.7176, + "step": 602 + }, + { + "epoch": 0.140515768215007, + "grad_norm": 0.2902360260486603, + "learning_rate": 4.995255733854352e-06, + "loss": 0.7071, + "step": 603 + }, + { + "epoch": 0.1407487960229921, + "grad_norm": 0.3007471263408661, + "learning_rate": 4.99523685722001e-06, + "loss": 0.7072, + "step": 604 + }, + { + "epoch": 0.14098182383097715, + "grad_norm": 0.2811586558818817, + "learning_rate": 4.99521794314253e-06, + "loss": 0.7055, + "step": 605 + }, + { + "epoch": 0.14121485163896225, + "grad_norm": 0.28654035925865173, + "learning_rate": 4.995198991622195e-06, + "loss": 0.6916, + "step": 606 + }, + { + "epoch": 0.14144787944694734, + "grad_norm": 0.2852920889854431, + "learning_rate": 4.99518000265929e-06, + "loss": 0.6729, + "step": 607 + }, + { + "epoch": 0.14168090725493243, + "grad_norm": 0.29553723335266113, + "learning_rate": 4.9951609762541e-06, + "loss": 0.718, + "step": 608 + }, + { + "epoch": 0.1419139350629175, + "grad_norm": 0.2878788709640503, + "learning_rate": 4.995141912406909e-06, + "loss": 0.6997, + "step": 609 + }, + { + "epoch": 0.1421469628709026, + "grad_norm": 0.29290252923965454, + "learning_rate": 4.995122811118004e-06, + "loss": 0.6997, + "step": 610 + }, + { + "epoch": 0.14237999067888768, + "grad_norm": 0.28711509704589844, + "learning_rate": 4.995103672387673e-06, + "loss": 0.7021, + "step": 611 + }, + { + "epoch": 0.14261301848687277, + "grad_norm": 0.2926309406757355, + "learning_rate": 4.995084496216201e-06, + "loss": 0.6906, + "step": 612 + }, + { + "epoch": 0.14284604629485786, + "grad_norm": 0.2969643175601959, + "learning_rate": 4.995065282603877e-06, + "loss": 0.7086, + "step": 613 + }, + { + "epoch": 0.14307907410284293, + "grad_norm": 0.29061949253082275, + "learning_rate": 4.995046031550988e-06, + "loss": 0.6872, + "step": 614 + }, + { + "epoch": 0.14331210191082802, + "grad_norm": 0.2971344590187073, + "learning_rate": 4.995026743057825e-06, + "loss": 0.6883, + "step": 615 + }, + { + "epoch": 0.1435451297188131, + "grad_norm": 0.29349300265312195, + "learning_rate": 4.995007417124676e-06, + "loss": 0.6823, + "step": 616 + }, + { + "epoch": 0.1437781575267982, + "grad_norm": 0.28206148743629456, + "learning_rate": 4.994988053751832e-06, + "loss": 0.6777, + "step": 617 + }, + { + "epoch": 0.14401118533478327, + "grad_norm": 0.30278339982032776, + "learning_rate": 4.994968652939582e-06, + "loss": 0.6906, + "step": 618 + }, + { + "epoch": 0.14424421314276836, + "grad_norm": 0.2912931442260742, + "learning_rate": 4.994949214688219e-06, + "loss": 0.6853, + "step": 619 + }, + { + "epoch": 0.14447724095075345, + "grad_norm": 0.2938266396522522, + "learning_rate": 4.994929738998032e-06, + "loss": 0.71, + "step": 620 + }, + { + "epoch": 0.14471026875873855, + "grad_norm": 0.29091596603393555, + "learning_rate": 4.994910225869316e-06, + "loss": 0.6771, + "step": 621 + }, + { + "epoch": 0.14494329656672364, + "grad_norm": 0.2998121976852417, + "learning_rate": 4.994890675302363e-06, + "loss": 0.7057, + "step": 622 + }, + { + "epoch": 0.1451763243747087, + "grad_norm": 0.287182480096817, + "learning_rate": 4.994871087297466e-06, + "loss": 0.6727, + "step": 623 + }, + { + "epoch": 0.1454093521826938, + "grad_norm": 0.2917057275772095, + "learning_rate": 4.994851461854918e-06, + "loss": 0.6948, + "step": 624 + }, + { + "epoch": 0.1456423799906789, + "grad_norm": 0.2944914996623993, + "learning_rate": 4.994831798975016e-06, + "loss": 0.6736, + "step": 625 + }, + { + "epoch": 0.14587540779866398, + "grad_norm": 0.2945144772529602, + "learning_rate": 4.994812098658053e-06, + "loss": 0.6717, + "step": 626 + }, + { + "epoch": 0.14610843560664907, + "grad_norm": 0.28766441345214844, + "learning_rate": 4.994792360904325e-06, + "loss": 0.6784, + "step": 627 + }, + { + "epoch": 0.14634146341463414, + "grad_norm": 0.29533708095550537, + "learning_rate": 4.994772585714129e-06, + "loss": 0.7064, + "step": 628 + }, + { + "epoch": 0.14657449122261923, + "grad_norm": 0.30078238248825073, + "learning_rate": 4.994752773087761e-06, + "loss": 0.6728, + "step": 629 + }, + { + "epoch": 0.14680751903060432, + "grad_norm": 0.29112541675567627, + "learning_rate": 4.9947329230255185e-06, + "loss": 0.6795, + "step": 630 + }, + { + "epoch": 0.14704054683858941, + "grad_norm": 0.2986755073070526, + "learning_rate": 4.994713035527699e-06, + "loss": 0.7011, + "step": 631 + }, + { + "epoch": 0.14727357464657448, + "grad_norm": 0.3102971017360687, + "learning_rate": 4.9946931105946015e-06, + "loss": 0.7185, + "step": 632 + }, + { + "epoch": 0.14750660245455957, + "grad_norm": 0.30071836709976196, + "learning_rate": 4.994673148226524e-06, + "loss": 0.7019, + "step": 633 + }, + { + "epoch": 0.14773963026254466, + "grad_norm": 0.2878577411174774, + "learning_rate": 4.994653148423768e-06, + "loss": 0.7041, + "step": 634 + }, + { + "epoch": 0.14797265807052976, + "grad_norm": 0.28542059659957886, + "learning_rate": 4.994633111186632e-06, + "loss": 0.6998, + "step": 635 + }, + { + "epoch": 0.14820568587851485, + "grad_norm": 0.2916441261768341, + "learning_rate": 4.994613036515417e-06, + "loss": 0.7109, + "step": 636 + }, + { + "epoch": 0.1484387136864999, + "grad_norm": 0.29448115825653076, + "learning_rate": 4.9945929244104245e-06, + "loss": 0.704, + "step": 637 + }, + { + "epoch": 0.148671741494485, + "grad_norm": 0.3023439049720764, + "learning_rate": 4.994572774871955e-06, + "loss": 0.6872, + "step": 638 + }, + { + "epoch": 0.1489047693024701, + "grad_norm": 0.3026156425476074, + "learning_rate": 4.9945525879003135e-06, + "loss": 0.6833, + "step": 639 + }, + { + "epoch": 0.1491377971104552, + "grad_norm": 0.29606306552886963, + "learning_rate": 4.9945323634958005e-06, + "loss": 0.7161, + "step": 640 + }, + { + "epoch": 0.14937082491844025, + "grad_norm": 0.29458752274513245, + "learning_rate": 4.994512101658721e-06, + "loss": 0.6954, + "step": 641 + }, + { + "epoch": 0.14960385272642535, + "grad_norm": 0.3028385043144226, + "learning_rate": 4.994491802389378e-06, + "loss": 0.7135, + "step": 642 + }, + { + "epoch": 0.14983688053441044, + "grad_norm": 0.30123192071914673, + "learning_rate": 4.994471465688077e-06, + "loss": 0.7055, + "step": 643 + }, + { + "epoch": 0.15006990834239553, + "grad_norm": 0.2845219075679779, + "learning_rate": 4.9944510915551224e-06, + "loss": 0.6981, + "step": 644 + }, + { + "epoch": 0.15030293615038062, + "grad_norm": 0.291064977645874, + "learning_rate": 4.99443067999082e-06, + "loss": 0.678, + "step": 645 + }, + { + "epoch": 0.1505359639583657, + "grad_norm": 0.2965686619281769, + "learning_rate": 4.994410230995478e-06, + "loss": 0.701, + "step": 646 + }, + { + "epoch": 0.15076899176635078, + "grad_norm": 0.3055955767631531, + "learning_rate": 4.9943897445694e-06, + "loss": 0.6871, + "step": 647 + }, + { + "epoch": 0.15100201957433587, + "grad_norm": 0.30113545060157776, + "learning_rate": 4.994369220712895e-06, + "loss": 0.6942, + "step": 648 + }, + { + "epoch": 0.15123504738232096, + "grad_norm": 0.2944492995738983, + "learning_rate": 4.994348659426271e-06, + "loss": 0.693, + "step": 649 + }, + { + "epoch": 0.15146807519030606, + "grad_norm": 0.28904175758361816, + "learning_rate": 4.994328060709838e-06, + "loss": 0.6756, + "step": 650 + }, + { + "epoch": 0.15170110299829112, + "grad_norm": 0.31396597623825073, + "learning_rate": 4.9943074245639025e-06, + "loss": 0.7006, + "step": 651 + }, + { + "epoch": 0.1519341308062762, + "grad_norm": 0.30177727341651917, + "learning_rate": 4.994286750988775e-06, + "loss": 0.6714, + "step": 652 + }, + { + "epoch": 0.1521671586142613, + "grad_norm": 0.3121333420276642, + "learning_rate": 4.994266039984766e-06, + "loss": 0.7234, + "step": 653 + }, + { + "epoch": 0.1524001864222464, + "grad_norm": 0.31091129779815674, + "learning_rate": 4.994245291552186e-06, + "loss": 0.7204, + "step": 654 + }, + { + "epoch": 0.15263321423023146, + "grad_norm": 0.2879391610622406, + "learning_rate": 4.994224505691348e-06, + "loss": 0.7161, + "step": 655 + }, + { + "epoch": 0.15286624203821655, + "grad_norm": 0.298545241355896, + "learning_rate": 4.994203682402562e-06, + "loss": 0.6885, + "step": 656 + }, + { + "epoch": 0.15309926984620165, + "grad_norm": 0.299441397190094, + "learning_rate": 4.994182821686141e-06, + "loss": 0.7123, + "step": 657 + }, + { + "epoch": 0.15333229765418674, + "grad_norm": 0.30985963344573975, + "learning_rate": 4.994161923542398e-06, + "loss": 0.7094, + "step": 658 + }, + { + "epoch": 0.15356532546217183, + "grad_norm": 0.3077718913555145, + "learning_rate": 4.994140987971647e-06, + "loss": 0.7087, + "step": 659 + }, + { + "epoch": 0.1537983532701569, + "grad_norm": 0.2969335615634918, + "learning_rate": 4.994120014974201e-06, + "loss": 0.7046, + "step": 660 + }, + { + "epoch": 0.154031381078142, + "grad_norm": 0.2992897629737854, + "learning_rate": 4.994099004550376e-06, + "loss": 0.6758, + "step": 661 + }, + { + "epoch": 0.15426440888612708, + "grad_norm": 0.2993827760219574, + "learning_rate": 4.994077956700487e-06, + "loss": 0.7021, + "step": 662 + }, + { + "epoch": 0.15449743669411217, + "grad_norm": 0.3054139316082001, + "learning_rate": 4.99405687142485e-06, + "loss": 0.7091, + "step": 663 + }, + { + "epoch": 0.15473046450209724, + "grad_norm": 0.29991456866264343, + "learning_rate": 4.99403574872378e-06, + "loss": 0.7125, + "step": 664 + }, + { + "epoch": 0.15496349231008233, + "grad_norm": 0.30858808755874634, + "learning_rate": 4.994014588597595e-06, + "loss": 0.7134, + "step": 665 + }, + { + "epoch": 0.15519652011806742, + "grad_norm": 0.3052411377429962, + "learning_rate": 4.993993391046614e-06, + "loss": 0.7159, + "step": 666 + }, + { + "epoch": 0.1554295479260525, + "grad_norm": 0.2996300458908081, + "learning_rate": 4.993972156071153e-06, + "loss": 0.6785, + "step": 667 + }, + { + "epoch": 0.1556625757340376, + "grad_norm": 0.3047278821468353, + "learning_rate": 4.993950883671531e-06, + "loss": 0.6935, + "step": 668 + }, + { + "epoch": 0.15589560354202267, + "grad_norm": 0.2926061153411865, + "learning_rate": 4.993929573848067e-06, + "loss": 0.6847, + "step": 669 + }, + { + "epoch": 0.15612863135000776, + "grad_norm": 0.31860196590423584, + "learning_rate": 4.993908226601082e-06, + "loss": 0.6877, + "step": 670 + }, + { + "epoch": 0.15636165915799285, + "grad_norm": 0.30044296383857727, + "learning_rate": 4.993886841930896e-06, + "loss": 0.683, + "step": 671 + }, + { + "epoch": 0.15659468696597795, + "grad_norm": 0.288571298122406, + "learning_rate": 4.993865419837829e-06, + "loss": 0.696, + "step": 672 + }, + { + "epoch": 0.15682771477396304, + "grad_norm": 0.30127960443496704, + "learning_rate": 4.993843960322203e-06, + "loss": 0.696, + "step": 673 + }, + { + "epoch": 0.1570607425819481, + "grad_norm": 0.29243579506874084, + "learning_rate": 4.99382246338434e-06, + "loss": 0.6935, + "step": 674 + }, + { + "epoch": 0.1572937703899332, + "grad_norm": 0.3010440766811371, + "learning_rate": 4.993800929024562e-06, + "loss": 0.6796, + "step": 675 + }, + { + "epoch": 0.1575267981979183, + "grad_norm": 0.2879869043827057, + "learning_rate": 4.993779357243193e-06, + "loss": 0.6934, + "step": 676 + }, + { + "epoch": 0.15775982600590338, + "grad_norm": 0.3031919300556183, + "learning_rate": 4.993757748040557e-06, + "loss": 0.7197, + "step": 677 + }, + { + "epoch": 0.15799285381388845, + "grad_norm": 0.3012710511684418, + "learning_rate": 4.993736101416977e-06, + "loss": 0.6679, + "step": 678 + }, + { + "epoch": 0.15822588162187354, + "grad_norm": 0.2978542149066925, + "learning_rate": 4.993714417372779e-06, + "loss": 0.6876, + "step": 679 + }, + { + "epoch": 0.15845890942985863, + "grad_norm": 0.29764682054519653, + "learning_rate": 4.993692695908287e-06, + "loss": 0.7146, + "step": 680 + }, + { + "epoch": 0.15869193723784372, + "grad_norm": 0.3129620850086212, + "learning_rate": 4.993670937023829e-06, + "loss": 0.7223, + "step": 681 + }, + { + "epoch": 0.15892496504582881, + "grad_norm": 0.29770827293395996, + "learning_rate": 4.993649140719729e-06, + "loss": 0.6871, + "step": 682 + }, + { + "epoch": 0.15915799285381388, + "grad_norm": 0.2983894348144531, + "learning_rate": 4.993627306996316e-06, + "loss": 0.6982, + "step": 683 + }, + { + "epoch": 0.15939102066179897, + "grad_norm": 0.29265254735946655, + "learning_rate": 4.993605435853918e-06, + "loss": 0.7139, + "step": 684 + }, + { + "epoch": 0.15962404846978406, + "grad_norm": 0.2941584289073944, + "learning_rate": 4.993583527292862e-06, + "loss": 0.6835, + "step": 685 + }, + { + "epoch": 0.15985707627776916, + "grad_norm": 0.31259962916374207, + "learning_rate": 4.993561581313475e-06, + "loss": 0.7046, + "step": 686 + }, + { + "epoch": 0.16009010408575422, + "grad_norm": 0.31096795201301575, + "learning_rate": 4.99353959791609e-06, + "loss": 0.7145, + "step": 687 + }, + { + "epoch": 0.1603231318937393, + "grad_norm": 0.2942263185977936, + "learning_rate": 4.993517577101034e-06, + "loss": 0.6992, + "step": 688 + }, + { + "epoch": 0.1605561597017244, + "grad_norm": 0.30139651894569397, + "learning_rate": 4.9934955188686405e-06, + "loss": 0.6874, + "step": 689 + }, + { + "epoch": 0.1607891875097095, + "grad_norm": 0.2974080443382263, + "learning_rate": 4.993473423219237e-06, + "loss": 0.68, + "step": 690 + }, + { + "epoch": 0.1610222153176946, + "grad_norm": 0.2922239303588867, + "learning_rate": 4.993451290153158e-06, + "loss": 0.6761, + "step": 691 + }, + { + "epoch": 0.16125524312567965, + "grad_norm": 0.29046618938446045, + "learning_rate": 4.993429119670734e-06, + "loss": 0.6532, + "step": 692 + }, + { + "epoch": 0.16148827093366475, + "grad_norm": 0.30687785148620605, + "learning_rate": 4.993406911772297e-06, + "loss": 0.697, + "step": 693 + }, + { + "epoch": 0.16172129874164984, + "grad_norm": 0.2958100140094757, + "learning_rate": 4.993384666458182e-06, + "loss": 0.7025, + "step": 694 + }, + { + "epoch": 0.16195432654963493, + "grad_norm": 0.2987540364265442, + "learning_rate": 4.993362383728722e-06, + "loss": 0.6933, + "step": 695 + }, + { + "epoch": 0.16218735435762002, + "grad_norm": 0.3071579337120056, + "learning_rate": 4.9933400635842514e-06, + "loss": 0.7004, + "step": 696 + }, + { + "epoch": 0.1624203821656051, + "grad_norm": 0.3087875247001648, + "learning_rate": 4.993317706025105e-06, + "loss": 0.696, + "step": 697 + }, + { + "epoch": 0.16265340997359018, + "grad_norm": 0.305466890335083, + "learning_rate": 4.993295311051619e-06, + "loss": 0.6875, + "step": 698 + }, + { + "epoch": 0.16288643778157527, + "grad_norm": 0.31651973724365234, + "learning_rate": 4.993272878664128e-06, + "loss": 0.6887, + "step": 699 + }, + { + "epoch": 0.16311946558956036, + "grad_norm": 0.3118724226951599, + "learning_rate": 4.9932504088629705e-06, + "loss": 0.6614, + "step": 700 + }, + { + "epoch": 0.16335249339754543, + "grad_norm": 0.2947903871536255, + "learning_rate": 4.993227901648483e-06, + "loss": 0.6825, + "step": 701 + }, + { + "epoch": 0.16358552120553052, + "grad_norm": 0.3035338222980499, + "learning_rate": 4.993205357021002e-06, + "loss": 0.6746, + "step": 702 + }, + { + "epoch": 0.1638185490135156, + "grad_norm": 0.2950892150402069, + "learning_rate": 4.993182774980867e-06, + "loss": 0.6914, + "step": 703 + }, + { + "epoch": 0.1640515768215007, + "grad_norm": 0.3125010132789612, + "learning_rate": 4.993160155528417e-06, + "loss": 0.6925, + "step": 704 + }, + { + "epoch": 0.1642846046294858, + "grad_norm": 0.31467851996421814, + "learning_rate": 4.99313749866399e-06, + "loss": 0.7077, + "step": 705 + }, + { + "epoch": 0.16451763243747086, + "grad_norm": 0.32034242153167725, + "learning_rate": 4.993114804387929e-06, + "loss": 0.7003, + "step": 706 + }, + { + "epoch": 0.16475066024545595, + "grad_norm": 0.3025346100330353, + "learning_rate": 4.9930920727005705e-06, + "loss": 0.6924, + "step": 707 + }, + { + "epoch": 0.16498368805344105, + "grad_norm": 0.308503121137619, + "learning_rate": 4.993069303602258e-06, + "loss": 0.6914, + "step": 708 + }, + { + "epoch": 0.16521671586142614, + "grad_norm": 0.3070871829986572, + "learning_rate": 4.993046497093334e-06, + "loss": 0.7015, + "step": 709 + }, + { + "epoch": 0.1654497436694112, + "grad_norm": 0.3085547387599945, + "learning_rate": 4.9930236531741386e-06, + "loss": 0.7094, + "step": 710 + }, + { + "epoch": 0.1656827714773963, + "grad_norm": 0.3031659722328186, + "learning_rate": 4.993000771845016e-06, + "loss": 0.6795, + "step": 711 + }, + { + "epoch": 0.1659157992853814, + "grad_norm": 0.31133806705474854, + "learning_rate": 4.992977853106309e-06, + "loss": 0.7239, + "step": 712 + }, + { + "epoch": 0.16614882709336648, + "grad_norm": 0.3030085563659668, + "learning_rate": 4.992954896958362e-06, + "loss": 0.6861, + "step": 713 + }, + { + "epoch": 0.16638185490135157, + "grad_norm": 0.30885449051856995, + "learning_rate": 4.9929319034015185e-06, + "loss": 0.6931, + "step": 714 + }, + { + "epoch": 0.16661488270933664, + "grad_norm": 0.3116648495197296, + "learning_rate": 4.992908872436125e-06, + "loss": 0.7156, + "step": 715 + }, + { + "epoch": 0.16684791051732173, + "grad_norm": 0.2987951636314392, + "learning_rate": 4.992885804062525e-06, + "loss": 0.7159, + "step": 716 + }, + { + "epoch": 0.16708093832530682, + "grad_norm": 0.28926190733909607, + "learning_rate": 4.992862698281067e-06, + "loss": 0.6641, + "step": 717 + }, + { + "epoch": 0.1673139661332919, + "grad_norm": 0.31572502851486206, + "learning_rate": 4.992839555092096e-06, + "loss": 0.6989, + "step": 718 + }, + { + "epoch": 0.167546993941277, + "grad_norm": 0.3006322979927063, + "learning_rate": 4.99281637449596e-06, + "loss": 0.6875, + "step": 719 + }, + { + "epoch": 0.16778002174926207, + "grad_norm": 0.3082723617553711, + "learning_rate": 4.992793156493008e-06, + "loss": 0.6929, + "step": 720 + }, + { + "epoch": 0.16801304955724716, + "grad_norm": 0.31215739250183105, + "learning_rate": 4.992769901083586e-06, + "loss": 0.6694, + "step": 721 + }, + { + "epoch": 0.16824607736523225, + "grad_norm": 0.2989502251148224, + "learning_rate": 4.992746608268045e-06, + "loss": 0.6802, + "step": 722 + }, + { + "epoch": 0.16847910517321735, + "grad_norm": 0.30942055583000183, + "learning_rate": 4.992723278046731e-06, + "loss": 0.6984, + "step": 723 + }, + { + "epoch": 0.1687121329812024, + "grad_norm": 0.31884604692459106, + "learning_rate": 4.992699910419999e-06, + "loss": 0.6842, + "step": 724 + }, + { + "epoch": 0.1689451607891875, + "grad_norm": 0.30471524596214294, + "learning_rate": 4.992676505388197e-06, + "loss": 0.6916, + "step": 725 + }, + { + "epoch": 0.1691781885971726, + "grad_norm": 0.3024379014968872, + "learning_rate": 4.992653062951677e-06, + "loss": 0.6889, + "step": 726 + }, + { + "epoch": 0.1694112164051577, + "grad_norm": 0.3035619556903839, + "learning_rate": 4.992629583110791e-06, + "loss": 0.7061, + "step": 727 + }, + { + "epoch": 0.16964424421314278, + "grad_norm": 0.29914483428001404, + "learning_rate": 4.992606065865889e-06, + "loss": 0.6398, + "step": 728 + }, + { + "epoch": 0.16987727202112785, + "grad_norm": 0.30270639061927795, + "learning_rate": 4.992582511217327e-06, + "loss": 0.6636, + "step": 729 + }, + { + "epoch": 0.17011029982911294, + "grad_norm": 0.30574485659599304, + "learning_rate": 4.992558919165457e-06, + "loss": 0.7085, + "step": 730 + }, + { + "epoch": 0.17034332763709803, + "grad_norm": 0.30577272176742554, + "learning_rate": 4.992535289710632e-06, + "loss": 0.6813, + "step": 731 + }, + { + "epoch": 0.17057635544508312, + "grad_norm": 0.29934442043304443, + "learning_rate": 4.992511622853209e-06, + "loss": 0.7067, + "step": 732 + }, + { + "epoch": 0.1708093832530682, + "grad_norm": 0.30844244360923767, + "learning_rate": 4.99248791859354e-06, + "loss": 0.7146, + "step": 733 + }, + { + "epoch": 0.17104241106105328, + "grad_norm": 0.31358230113983154, + "learning_rate": 4.992464176931985e-06, + "loss": 0.6699, + "step": 734 + }, + { + "epoch": 0.17127543886903837, + "grad_norm": 0.31964826583862305, + "learning_rate": 4.992440397868896e-06, + "loss": 0.6913, + "step": 735 + }, + { + "epoch": 0.17150846667702346, + "grad_norm": 0.30717557668685913, + "learning_rate": 4.9924165814046325e-06, + "loss": 0.7057, + "step": 736 + }, + { + "epoch": 0.17174149448500856, + "grad_norm": 0.30307239294052124, + "learning_rate": 4.99239272753955e-06, + "loss": 0.7034, + "step": 737 + }, + { + "epoch": 0.17197452229299362, + "grad_norm": 0.29861584305763245, + "learning_rate": 4.992368836274008e-06, + "loss": 0.7045, + "step": 738 + }, + { + "epoch": 0.1722075501009787, + "grad_norm": 0.2987031042575836, + "learning_rate": 4.992344907608365e-06, + "loss": 0.7179, + "step": 739 + }, + { + "epoch": 0.1724405779089638, + "grad_norm": 0.299699068069458, + "learning_rate": 4.992320941542979e-06, + "loss": 0.7155, + "step": 740 + }, + { + "epoch": 0.1726736057169489, + "grad_norm": 0.2987295985221863, + "learning_rate": 4.99229693807821e-06, + "loss": 0.6967, + "step": 741 + }, + { + "epoch": 0.172906633524934, + "grad_norm": 0.30071499943733215, + "learning_rate": 4.992272897214419e-06, + "loss": 0.7062, + "step": 742 + }, + { + "epoch": 0.17313966133291905, + "grad_norm": 0.3024517595767975, + "learning_rate": 4.992248818951966e-06, + "loss": 0.6663, + "step": 743 + }, + { + "epoch": 0.17337268914090415, + "grad_norm": 0.30913013219833374, + "learning_rate": 4.992224703291212e-06, + "loss": 0.6827, + "step": 744 + }, + { + "epoch": 0.17360571694888924, + "grad_norm": 0.31381648778915405, + "learning_rate": 4.992200550232519e-06, + "loss": 0.6728, + "step": 745 + }, + { + "epoch": 0.17383874475687433, + "grad_norm": 0.33769795298576355, + "learning_rate": 4.99217635977625e-06, + "loss": 0.6647, + "step": 746 + }, + { + "epoch": 0.1740717725648594, + "grad_norm": 0.30636337399482727, + "learning_rate": 4.992152131922768e-06, + "loss": 0.6855, + "step": 747 + }, + { + "epoch": 0.1743048003728445, + "grad_norm": 0.31338930130004883, + "learning_rate": 4.992127866672436e-06, + "loss": 0.7024, + "step": 748 + }, + { + "epoch": 0.17453782818082958, + "grad_norm": 0.31845077872276306, + "learning_rate": 4.992103564025619e-06, + "loss": 0.6962, + "step": 749 + }, + { + "epoch": 0.17477085598881467, + "grad_norm": 0.31712737679481506, + "learning_rate": 4.99207922398268e-06, + "loss": 0.687, + "step": 750 + }, + { + "epoch": 0.17500388379679976, + "grad_norm": 0.30331987142562866, + "learning_rate": 4.9920548465439865e-06, + "loss": 0.6943, + "step": 751 + }, + { + "epoch": 0.17523691160478483, + "grad_norm": 0.29878899455070496, + "learning_rate": 4.992030431709902e-06, + "loss": 0.6936, + "step": 752 + }, + { + "epoch": 0.17546993941276992, + "grad_norm": 0.32260632514953613, + "learning_rate": 4.992005979480795e-06, + "loss": 0.6925, + "step": 753 + }, + { + "epoch": 0.175702967220755, + "grad_norm": 0.29739686846733093, + "learning_rate": 4.99198148985703e-06, + "loss": 0.6758, + "step": 754 + }, + { + "epoch": 0.1759359950287401, + "grad_norm": 0.3142729699611664, + "learning_rate": 4.991956962838978e-06, + "loss": 0.6791, + "step": 755 + }, + { + "epoch": 0.17616902283672517, + "grad_norm": 0.312031090259552, + "learning_rate": 4.991932398427004e-06, + "loss": 0.7104, + "step": 756 + }, + { + "epoch": 0.17640205064471026, + "grad_norm": 0.3036330044269562, + "learning_rate": 4.991907796621478e-06, + "loss": 0.6704, + "step": 757 + }, + { + "epoch": 0.17663507845269535, + "grad_norm": 0.30892109870910645, + "learning_rate": 4.991883157422768e-06, + "loss": 0.6705, + "step": 758 + }, + { + "epoch": 0.17686810626068045, + "grad_norm": 0.32213160395622253, + "learning_rate": 4.991858480831244e-06, + "loss": 0.6903, + "step": 759 + }, + { + "epoch": 0.17710113406866554, + "grad_norm": 0.3143679201602936, + "learning_rate": 4.991833766847278e-06, + "loss": 0.6837, + "step": 760 + }, + { + "epoch": 0.1773341618766506, + "grad_norm": 0.33420804142951965, + "learning_rate": 4.991809015471239e-06, + "loss": 0.7172, + "step": 761 + }, + { + "epoch": 0.1775671896846357, + "grad_norm": 0.3267275094985962, + "learning_rate": 4.9917842267034985e-06, + "loss": 0.6971, + "step": 762 + }, + { + "epoch": 0.1778002174926208, + "grad_norm": 0.32033008337020874, + "learning_rate": 4.99175940054443e-06, + "loss": 0.6936, + "step": 763 + }, + { + "epoch": 0.17803324530060588, + "grad_norm": 0.30232009291648865, + "learning_rate": 4.991734536994405e-06, + "loss": 0.6638, + "step": 764 + }, + { + "epoch": 0.17826627310859094, + "grad_norm": 0.2977076768875122, + "learning_rate": 4.9917096360537966e-06, + "loss": 0.7057, + "step": 765 + }, + { + "epoch": 0.17849930091657604, + "grad_norm": 0.319747656583786, + "learning_rate": 4.991684697722978e-06, + "loss": 0.6922, + "step": 766 + }, + { + "epoch": 0.17873232872456113, + "grad_norm": 0.33538225293159485, + "learning_rate": 4.9916597220023245e-06, + "loss": 0.6875, + "step": 767 + }, + { + "epoch": 0.17896535653254622, + "grad_norm": 0.3111737370491028, + "learning_rate": 4.99163470889221e-06, + "loss": 0.6905, + "step": 768 + }, + { + "epoch": 0.1791983843405313, + "grad_norm": 0.31114619970321655, + "learning_rate": 4.9916096583930105e-06, + "loss": 0.6937, + "step": 769 + }, + { + "epoch": 0.17943141214851638, + "grad_norm": 0.30948296189308167, + "learning_rate": 4.991584570505101e-06, + "loss": 0.6905, + "step": 770 + }, + { + "epoch": 0.17966443995650147, + "grad_norm": 0.30805283784866333, + "learning_rate": 4.991559445228858e-06, + "loss": 0.7042, + "step": 771 + }, + { + "epoch": 0.17989746776448656, + "grad_norm": 0.31451675295829773, + "learning_rate": 4.991534282564661e-06, + "loss": 0.6999, + "step": 772 + }, + { + "epoch": 0.18013049557247165, + "grad_norm": 0.3157965838909149, + "learning_rate": 4.9915090825128845e-06, + "loss": 0.6971, + "step": 773 + }, + { + "epoch": 0.18036352338045675, + "grad_norm": 0.3217350244522095, + "learning_rate": 4.991483845073908e-06, + "loss": 0.7063, + "step": 774 + }, + { + "epoch": 0.1805965511884418, + "grad_norm": 0.3110254406929016, + "learning_rate": 4.991458570248109e-06, + "loss": 0.6833, + "step": 775 + }, + { + "epoch": 0.1808295789964269, + "grad_norm": 0.31662318110466003, + "learning_rate": 4.991433258035869e-06, + "loss": 0.7006, + "step": 776 + }, + { + "epoch": 0.181062606804412, + "grad_norm": 0.3376653790473938, + "learning_rate": 4.991407908437566e-06, + "loss": 0.6956, + "step": 777 + }, + { + "epoch": 0.1812956346123971, + "grad_norm": 0.3087911307811737, + "learning_rate": 4.991382521453582e-06, + "loss": 0.6817, + "step": 778 + }, + { + "epoch": 0.18152866242038215, + "grad_norm": 0.2997356653213501, + "learning_rate": 4.991357097084295e-06, + "loss": 0.6938, + "step": 779 + }, + { + "epoch": 0.18176169022836725, + "grad_norm": 0.31942835450172424, + "learning_rate": 4.99133163533009e-06, + "loss": 0.7052, + "step": 780 + }, + { + "epoch": 0.18199471803635234, + "grad_norm": 0.32617196440696716, + "learning_rate": 4.9913061361913475e-06, + "loss": 0.6957, + "step": 781 + }, + { + "epoch": 0.18222774584433743, + "grad_norm": 0.3160213232040405, + "learning_rate": 4.991280599668449e-06, + "loss": 0.69, + "step": 782 + }, + { + "epoch": 0.18246077365232252, + "grad_norm": 0.31434017419815063, + "learning_rate": 4.99125502576178e-06, + "loss": 0.7033, + "step": 783 + }, + { + "epoch": 0.1826938014603076, + "grad_norm": 0.3154219686985016, + "learning_rate": 4.991229414471723e-06, + "loss": 0.6993, + "step": 784 + }, + { + "epoch": 0.18292682926829268, + "grad_norm": 0.30268388986587524, + "learning_rate": 4.991203765798662e-06, + "loss": 0.6742, + "step": 785 + }, + { + "epoch": 0.18315985707627777, + "grad_norm": 0.3019484281539917, + "learning_rate": 4.991178079742982e-06, + "loss": 0.6759, + "step": 786 + }, + { + "epoch": 0.18339288488426286, + "grad_norm": 0.3150714039802551, + "learning_rate": 4.991152356305069e-06, + "loss": 0.6625, + "step": 787 + }, + { + "epoch": 0.18362591269224793, + "grad_norm": 0.31490543484687805, + "learning_rate": 4.99112659548531e-06, + "loss": 0.6876, + "step": 788 + }, + { + "epoch": 0.18385894050023302, + "grad_norm": 0.31966719031333923, + "learning_rate": 4.9911007972840895e-06, + "loss": 0.7215, + "step": 789 + }, + { + "epoch": 0.1840919683082181, + "grad_norm": 0.32380110025405884, + "learning_rate": 4.991074961701795e-06, + "loss": 0.6543, + "step": 790 + }, + { + "epoch": 0.1843249961162032, + "grad_norm": 0.3227287232875824, + "learning_rate": 4.9910490887388155e-06, + "loss": 0.6919, + "step": 791 + }, + { + "epoch": 0.1845580239241883, + "grad_norm": 0.32635679841041565, + "learning_rate": 4.9910231783955374e-06, + "loss": 0.715, + "step": 792 + }, + { + "epoch": 0.18479105173217336, + "grad_norm": 0.3026251494884491, + "learning_rate": 4.990997230672351e-06, + "loss": 0.6789, + "step": 793 + }, + { + "epoch": 0.18502407954015845, + "grad_norm": 0.30521073937416077, + "learning_rate": 4.990971245569645e-06, + "loss": 0.7014, + "step": 794 + }, + { + "epoch": 0.18525710734814355, + "grad_norm": 0.30537697672843933, + "learning_rate": 4.9909452230878096e-06, + "loss": 0.6825, + "step": 795 + }, + { + "epoch": 0.18549013515612864, + "grad_norm": 0.3197531998157501, + "learning_rate": 4.990919163227235e-06, + "loss": 0.6972, + "step": 796 + }, + { + "epoch": 0.18572316296411373, + "grad_norm": 0.3134068250656128, + "learning_rate": 4.990893065988313e-06, + "loss": 0.7093, + "step": 797 + }, + { + "epoch": 0.1859561907720988, + "grad_norm": 0.297159880399704, + "learning_rate": 4.9908669313714345e-06, + "loss": 0.6794, + "step": 798 + }, + { + "epoch": 0.1861892185800839, + "grad_norm": 0.301148384809494, + "learning_rate": 4.990840759376991e-06, + "loss": 0.6892, + "step": 799 + }, + { + "epoch": 0.18642224638806898, + "grad_norm": 0.3051984906196594, + "learning_rate": 4.990814550005376e-06, + "loss": 0.6646, + "step": 800 + }, + { + "epoch": 0.18665527419605407, + "grad_norm": 0.31295686960220337, + "learning_rate": 4.990788303256984e-06, + "loss": 0.7179, + "step": 801 + }, + { + "epoch": 0.18688830200403914, + "grad_norm": 0.32010987401008606, + "learning_rate": 4.990762019132207e-06, + "loss": 0.6643, + "step": 802 + }, + { + "epoch": 0.18712132981202423, + "grad_norm": 0.3115476369857788, + "learning_rate": 4.99073569763144e-06, + "loss": 0.6841, + "step": 803 + }, + { + "epoch": 0.18735435762000932, + "grad_norm": 0.3164718449115753, + "learning_rate": 4.990709338755078e-06, + "loss": 0.6779, + "step": 804 + }, + { + "epoch": 0.1875873854279944, + "grad_norm": 0.3165794312953949, + "learning_rate": 4.9906829425035176e-06, + "loss": 0.7026, + "step": 805 + }, + { + "epoch": 0.1878204132359795, + "grad_norm": 0.2998059093952179, + "learning_rate": 4.990656508877153e-06, + "loss": 0.6917, + "step": 806 + }, + { + "epoch": 0.18805344104396457, + "grad_norm": 0.3217999339103699, + "learning_rate": 4.990630037876383e-06, + "loss": 0.6716, + "step": 807 + }, + { + "epoch": 0.18828646885194966, + "grad_norm": 0.3058147728443146, + "learning_rate": 4.990603529501602e-06, + "loss": 0.6932, + "step": 808 + }, + { + "epoch": 0.18851949665993475, + "grad_norm": 0.3243579864501953, + "learning_rate": 4.990576983753211e-06, + "loss": 0.7242, + "step": 809 + }, + { + "epoch": 0.18875252446791985, + "grad_norm": 0.30855730175971985, + "learning_rate": 4.990550400631605e-06, + "loss": 0.7139, + "step": 810 + }, + { + "epoch": 0.1889855522759049, + "grad_norm": 0.31107524037361145, + "learning_rate": 4.990523780137186e-06, + "loss": 0.6919, + "step": 811 + }, + { + "epoch": 0.18921858008389, + "grad_norm": 0.31353241205215454, + "learning_rate": 4.990497122270351e-06, + "loss": 0.6879, + "step": 812 + }, + { + "epoch": 0.1894516078918751, + "grad_norm": 0.32754525542259216, + "learning_rate": 4.9904704270315015e-06, + "loss": 0.6767, + "step": 813 + }, + { + "epoch": 0.1896846356998602, + "grad_norm": 0.3003893494606018, + "learning_rate": 4.990443694421038e-06, + "loss": 0.7107, + "step": 814 + }, + { + "epoch": 0.18991766350784528, + "grad_norm": 0.30822816491127014, + "learning_rate": 4.990416924439361e-06, + "loss": 0.6962, + "step": 815 + }, + { + "epoch": 0.19015069131583034, + "grad_norm": 0.2961769700050354, + "learning_rate": 4.990390117086872e-06, + "loss": 0.6831, + "step": 816 + }, + { + "epoch": 0.19038371912381544, + "grad_norm": 0.31952205300331116, + "learning_rate": 4.990363272363975e-06, + "loss": 0.6757, + "step": 817 + }, + { + "epoch": 0.19061674693180053, + "grad_norm": 0.3112236559391022, + "learning_rate": 4.99033639027107e-06, + "loss": 0.6886, + "step": 818 + }, + { + "epoch": 0.19084977473978562, + "grad_norm": 0.29789745807647705, + "learning_rate": 4.990309470808563e-06, + "loss": 0.6711, + "step": 819 + }, + { + "epoch": 0.1910828025477707, + "grad_norm": 0.30415651202201843, + "learning_rate": 4.9902825139768576e-06, + "loss": 0.6892, + "step": 820 + }, + { + "epoch": 0.19131583035575578, + "grad_norm": 0.31669631600379944, + "learning_rate": 4.990255519776358e-06, + "loss": 0.6976, + "step": 821 + }, + { + "epoch": 0.19154885816374087, + "grad_norm": 0.3271667957305908, + "learning_rate": 4.990228488207467e-06, + "loss": 0.6974, + "step": 822 + }, + { + "epoch": 0.19178188597172596, + "grad_norm": 0.3100816011428833, + "learning_rate": 4.990201419270594e-06, + "loss": 0.6784, + "step": 823 + }, + { + "epoch": 0.19201491377971106, + "grad_norm": 0.306534081697464, + "learning_rate": 4.990174312966143e-06, + "loss": 0.6656, + "step": 824 + }, + { + "epoch": 0.19224794158769612, + "grad_norm": 0.331881046295166, + "learning_rate": 4.99014716929452e-06, + "loss": 0.6584, + "step": 825 + }, + { + "epoch": 0.1924809693956812, + "grad_norm": 0.3119177520275116, + "learning_rate": 4.990119988256134e-06, + "loss": 0.7172, + "step": 826 + }, + { + "epoch": 0.1927139972036663, + "grad_norm": 0.2935415506362915, + "learning_rate": 4.990092769851394e-06, + "loss": 0.6906, + "step": 827 + }, + { + "epoch": 0.1929470250116514, + "grad_norm": 0.3127956688404083, + "learning_rate": 4.990065514080704e-06, + "loss": 0.6965, + "step": 828 + }, + { + "epoch": 0.1931800528196365, + "grad_norm": 0.3116763234138489, + "learning_rate": 4.990038220944477e-06, + "loss": 0.6854, + "step": 829 + }, + { + "epoch": 0.19341308062762155, + "grad_norm": 0.31973204016685486, + "learning_rate": 4.990010890443122e-06, + "loss": 0.6934, + "step": 830 + }, + { + "epoch": 0.19364610843560665, + "grad_norm": 0.3199937641620636, + "learning_rate": 4.989983522577048e-06, + "loss": 0.6841, + "step": 831 + }, + { + "epoch": 0.19387913624359174, + "grad_norm": 0.3148503303527832, + "learning_rate": 4.989956117346666e-06, + "loss": 0.7091, + "step": 832 + }, + { + "epoch": 0.19411216405157683, + "grad_norm": 0.30885228514671326, + "learning_rate": 4.989928674752387e-06, + "loss": 0.6993, + "step": 833 + }, + { + "epoch": 0.1943451918595619, + "grad_norm": 0.31106770038604736, + "learning_rate": 4.989901194794624e-06, + "loss": 0.7304, + "step": 834 + }, + { + "epoch": 0.194578219667547, + "grad_norm": 0.3182736039161682, + "learning_rate": 4.9898736774737874e-06, + "loss": 0.7037, + "step": 835 + }, + { + "epoch": 0.19481124747553208, + "grad_norm": 0.31246218085289, + "learning_rate": 4.989846122790291e-06, + "loss": 0.6903, + "step": 836 + }, + { + "epoch": 0.19504427528351717, + "grad_norm": 0.3041127920150757, + "learning_rate": 4.98981853074455e-06, + "loss": 0.6731, + "step": 837 + }, + { + "epoch": 0.19527730309150226, + "grad_norm": 0.3129989206790924, + "learning_rate": 4.989790901336976e-06, + "loss": 0.6918, + "step": 838 + }, + { + "epoch": 0.19551033089948733, + "grad_norm": 0.30565863847732544, + "learning_rate": 4.989763234567985e-06, + "loss": 0.6882, + "step": 839 + }, + { + "epoch": 0.19574335870747242, + "grad_norm": 0.32209983468055725, + "learning_rate": 4.989735530437992e-06, + "loss": 0.6703, + "step": 840 + }, + { + "epoch": 0.1959763865154575, + "grad_norm": 0.3255407512187958, + "learning_rate": 4.989707788947412e-06, + "loss": 0.6625, + "step": 841 + }, + { + "epoch": 0.1962094143234426, + "grad_norm": 0.31158989667892456, + "learning_rate": 4.989680010096662e-06, + "loss": 0.6556, + "step": 842 + }, + { + "epoch": 0.1964424421314277, + "grad_norm": 0.333229660987854, + "learning_rate": 4.989652193886158e-06, + "loss": 0.6987, + "step": 843 + }, + { + "epoch": 0.19667546993941276, + "grad_norm": 0.32209059596061707, + "learning_rate": 4.989624340316318e-06, + "loss": 0.7121, + "step": 844 + }, + { + "epoch": 0.19690849774739785, + "grad_norm": 0.3253222703933716, + "learning_rate": 4.98959644938756e-06, + "loss": 0.6759, + "step": 845 + }, + { + "epoch": 0.19714152555538295, + "grad_norm": 0.3341846168041229, + "learning_rate": 4.989568521100302e-06, + "loss": 0.7057, + "step": 846 + }, + { + "epoch": 0.19737455336336804, + "grad_norm": 0.3113526999950409, + "learning_rate": 4.989540555454964e-06, + "loss": 0.6912, + "step": 847 + }, + { + "epoch": 0.1976075811713531, + "grad_norm": 0.3049372732639313, + "learning_rate": 4.989512552451966e-06, + "loss": 0.6989, + "step": 848 + }, + { + "epoch": 0.1978406089793382, + "grad_norm": 0.3069918155670166, + "learning_rate": 4.989484512091727e-06, + "loss": 0.7007, + "step": 849 + }, + { + "epoch": 0.1980736367873233, + "grad_norm": 0.3059154748916626, + "learning_rate": 4.989456434374668e-06, + "loss": 0.677, + "step": 850 + }, + { + "epoch": 0.19830666459530838, + "grad_norm": 0.30385711789131165, + "learning_rate": 4.98942831930121e-06, + "loss": 0.6816, + "step": 851 + }, + { + "epoch": 0.19853969240329347, + "grad_norm": 0.31075239181518555, + "learning_rate": 4.9894001668717765e-06, + "loss": 0.6874, + "step": 852 + }, + { + "epoch": 0.19877272021127854, + "grad_norm": 0.31055590510368347, + "learning_rate": 4.989371977086787e-06, + "loss": 0.6755, + "step": 853 + }, + { + "epoch": 0.19900574801926363, + "grad_norm": 0.3091159462928772, + "learning_rate": 4.989343749946668e-06, + "loss": 0.6824, + "step": 854 + }, + { + "epoch": 0.19923877582724872, + "grad_norm": 0.3108750581741333, + "learning_rate": 4.989315485451841e-06, + "loss": 0.6755, + "step": 855 + }, + { + "epoch": 0.1994718036352338, + "grad_norm": 0.3127971589565277, + "learning_rate": 4.9892871836027305e-06, + "loss": 0.6599, + "step": 856 + }, + { + "epoch": 0.19970483144321888, + "grad_norm": 0.3139951527118683, + "learning_rate": 4.9892588443997615e-06, + "loss": 0.7255, + "step": 857 + }, + { + "epoch": 0.19993785925120397, + "grad_norm": 0.32461312413215637, + "learning_rate": 4.989230467843359e-06, + "loss": 0.6835, + "step": 858 + }, + { + "epoch": 0.20017088705918906, + "grad_norm": 0.32544243335723877, + "learning_rate": 4.989202053933949e-06, + "loss": 0.6687, + "step": 859 + }, + { + "epoch": 0.20040391486717415, + "grad_norm": 0.32397815585136414, + "learning_rate": 4.989173602671957e-06, + "loss": 0.6884, + "step": 860 + }, + { + "epoch": 0.20063694267515925, + "grad_norm": 0.32511264085769653, + "learning_rate": 4.989145114057811e-06, + "loss": 0.679, + "step": 861 + }, + { + "epoch": 0.2008699704831443, + "grad_norm": 0.31888827681541443, + "learning_rate": 4.989116588091938e-06, + "loss": 0.6781, + "step": 862 + }, + { + "epoch": 0.2011029982911294, + "grad_norm": 0.31558793783187866, + "learning_rate": 4.989088024774767e-06, + "loss": 0.689, + "step": 863 + }, + { + "epoch": 0.2013360260991145, + "grad_norm": 0.31991255283355713, + "learning_rate": 4.989059424106725e-06, + "loss": 0.6801, + "step": 864 + }, + { + "epoch": 0.2015690539070996, + "grad_norm": 0.332898885011673, + "learning_rate": 4.989030786088241e-06, + "loss": 0.689, + "step": 865 + }, + { + "epoch": 0.20180208171508468, + "grad_norm": 0.31651759147644043, + "learning_rate": 4.989002110719747e-06, + "loss": 0.6906, + "step": 866 + }, + { + "epoch": 0.20203510952306974, + "grad_norm": 0.3150373101234436, + "learning_rate": 4.988973398001672e-06, + "loss": 0.6863, + "step": 867 + }, + { + "epoch": 0.20226813733105484, + "grad_norm": 0.31643638014793396, + "learning_rate": 4.988944647934447e-06, + "loss": 0.6764, + "step": 868 + }, + { + "epoch": 0.20250116513903993, + "grad_norm": 0.3135892450809479, + "learning_rate": 4.988915860518504e-06, + "loss": 0.7006, + "step": 869 + }, + { + "epoch": 0.20273419294702502, + "grad_norm": 0.324581503868103, + "learning_rate": 4.988887035754273e-06, + "loss": 0.6788, + "step": 870 + }, + { + "epoch": 0.20296722075501009, + "grad_norm": 0.32707783579826355, + "learning_rate": 4.988858173642188e-06, + "loss": 0.683, + "step": 871 + }, + { + "epoch": 0.20320024856299518, + "grad_norm": 0.31686946749687195, + "learning_rate": 4.988829274182682e-06, + "loss": 0.7171, + "step": 872 + }, + { + "epoch": 0.20343327637098027, + "grad_norm": 0.3203548491001129, + "learning_rate": 4.988800337376189e-06, + "loss": 0.705, + "step": 873 + }, + { + "epoch": 0.20366630417896536, + "grad_norm": 0.3286099433898926, + "learning_rate": 4.9887713632231415e-06, + "loss": 0.6888, + "step": 874 + }, + { + "epoch": 0.20389933198695046, + "grad_norm": 0.3374185562133789, + "learning_rate": 4.988742351723977e-06, + "loss": 0.6373, + "step": 875 + }, + { + "epoch": 0.20413235979493552, + "grad_norm": 0.32489535212516785, + "learning_rate": 4.988713302879129e-06, + "loss": 0.6898, + "step": 876 + }, + { + "epoch": 0.2043653876029206, + "grad_norm": 0.3170216679573059, + "learning_rate": 4.988684216689034e-06, + "loss": 0.6838, + "step": 877 + }, + { + "epoch": 0.2045984154109057, + "grad_norm": 0.3239687383174896, + "learning_rate": 4.988655093154128e-06, + "loss": 0.6871, + "step": 878 + }, + { + "epoch": 0.2048314432188908, + "grad_norm": 0.31675198674201965, + "learning_rate": 4.988625932274848e-06, + "loss": 0.6694, + "step": 879 + }, + { + "epoch": 0.20506447102687586, + "grad_norm": 0.29677361249923706, + "learning_rate": 4.988596734051633e-06, + "loss": 0.6726, + "step": 880 + }, + { + "epoch": 0.20529749883486095, + "grad_norm": 0.3072007894515991, + "learning_rate": 4.988567498484918e-06, + "loss": 0.6905, + "step": 881 + }, + { + "epoch": 0.20553052664284605, + "grad_norm": 0.3128240704536438, + "learning_rate": 4.988538225575145e-06, + "loss": 0.6831, + "step": 882 + }, + { + "epoch": 0.20576355445083114, + "grad_norm": 0.32844415307044983, + "learning_rate": 4.988508915322753e-06, + "loss": 0.6857, + "step": 883 + }, + { + "epoch": 0.20599658225881623, + "grad_norm": 0.3029480278491974, + "learning_rate": 4.98847956772818e-06, + "loss": 0.6343, + "step": 884 + }, + { + "epoch": 0.2062296100668013, + "grad_norm": 0.3151710331439972, + "learning_rate": 4.988450182791867e-06, + "loss": 0.6822, + "step": 885 + }, + { + "epoch": 0.2064626378747864, + "grad_norm": 0.32395777106285095, + "learning_rate": 4.988420760514255e-06, + "loss": 0.7023, + "step": 886 + }, + { + "epoch": 0.20669566568277148, + "grad_norm": 0.33223193883895874, + "learning_rate": 4.988391300895786e-06, + "loss": 0.6679, + "step": 887 + }, + { + "epoch": 0.20692869349075657, + "grad_norm": 0.3161953091621399, + "learning_rate": 4.988361803936902e-06, + "loss": 0.6971, + "step": 888 + }, + { + "epoch": 0.20716172129874166, + "grad_norm": 0.32543936371803284, + "learning_rate": 4.988332269638045e-06, + "loss": 0.6703, + "step": 889 + }, + { + "epoch": 0.20739474910672673, + "grad_norm": 0.33734849095344543, + "learning_rate": 4.988302697999658e-06, + "loss": 0.6875, + "step": 890 + }, + { + "epoch": 0.20762777691471182, + "grad_norm": 0.32218724489212036, + "learning_rate": 4.988273089022187e-06, + "loss": 0.6773, + "step": 891 + }, + { + "epoch": 0.2078608047226969, + "grad_norm": 0.3049621284008026, + "learning_rate": 4.988243442706073e-06, + "loss": 0.668, + "step": 892 + }, + { + "epoch": 0.208093832530682, + "grad_norm": 0.30014267563819885, + "learning_rate": 4.9882137590517634e-06, + "loss": 0.6594, + "step": 893 + }, + { + "epoch": 0.20832686033866707, + "grad_norm": 0.3145594298839569, + "learning_rate": 4.988184038059702e-06, + "loss": 0.6674, + "step": 894 + }, + { + "epoch": 0.20855988814665216, + "grad_norm": 0.3056693971157074, + "learning_rate": 4.988154279730335e-06, + "loss": 0.7101, + "step": 895 + }, + { + "epoch": 0.20879291595463725, + "grad_norm": 0.3176927864551544, + "learning_rate": 4.988124484064111e-06, + "loss": 0.7167, + "step": 896 + }, + { + "epoch": 0.20902594376262235, + "grad_norm": 0.3169732391834259, + "learning_rate": 4.988094651061475e-06, + "loss": 0.6489, + "step": 897 + }, + { + "epoch": 0.20925897157060744, + "grad_norm": 0.31884342432022095, + "learning_rate": 4.988064780722874e-06, + "loss": 0.6785, + "step": 898 + }, + { + "epoch": 0.2094919993785925, + "grad_norm": 0.3164225220680237, + "learning_rate": 4.9880348730487596e-06, + "loss": 0.665, + "step": 899 + }, + { + "epoch": 0.2097250271865776, + "grad_norm": 0.32530754804611206, + "learning_rate": 4.988004928039577e-06, + "loss": 0.6582, + "step": 900 + }, + { + "epoch": 0.2099580549945627, + "grad_norm": 0.3108154535293579, + "learning_rate": 4.987974945695777e-06, + "loss": 0.6976, + "step": 901 + }, + { + "epoch": 0.21019108280254778, + "grad_norm": 0.334484726190567, + "learning_rate": 4.98794492601781e-06, + "loss": 0.6711, + "step": 902 + }, + { + "epoch": 0.21042411061053284, + "grad_norm": 0.31173184514045715, + "learning_rate": 4.9879148690061255e-06, + "loss": 0.6933, + "step": 903 + }, + { + "epoch": 0.21065713841851794, + "grad_norm": 0.32134583592414856, + "learning_rate": 4.9878847746611755e-06, + "loss": 0.6907, + "step": 904 + }, + { + "epoch": 0.21089016622650303, + "grad_norm": 0.30610403418540955, + "learning_rate": 4.987854642983411e-06, + "loss": 0.7023, + "step": 905 + }, + { + "epoch": 0.21112319403448812, + "grad_norm": 0.3174522817134857, + "learning_rate": 4.987824473973284e-06, + "loss": 0.7086, + "step": 906 + }, + { + "epoch": 0.2113562218424732, + "grad_norm": 0.32686710357666016, + "learning_rate": 4.9877942676312475e-06, + "loss": 0.7179, + "step": 907 + }, + { + "epoch": 0.21158924965045828, + "grad_norm": 0.31729856133461, + "learning_rate": 4.987764023957755e-06, + "loss": 0.7222, + "step": 908 + }, + { + "epoch": 0.21182227745844337, + "grad_norm": 0.31572768092155457, + "learning_rate": 4.98773374295326e-06, + "loss": 0.6877, + "step": 909 + }, + { + "epoch": 0.21205530526642846, + "grad_norm": 0.3098528981208801, + "learning_rate": 4.9877034246182166e-06, + "loss": 0.6871, + "step": 910 + }, + { + "epoch": 0.21228833307441355, + "grad_norm": 0.30946585536003113, + "learning_rate": 4.98767306895308e-06, + "loss": 0.6712, + "step": 911 + }, + { + "epoch": 0.21252136088239865, + "grad_norm": 0.31838321685791016, + "learning_rate": 4.987642675958306e-06, + "loss": 0.6858, + "step": 912 + }, + { + "epoch": 0.2127543886903837, + "grad_norm": 0.32269299030303955, + "learning_rate": 4.98761224563435e-06, + "loss": 0.6738, + "step": 913 + }, + { + "epoch": 0.2129874164983688, + "grad_norm": 0.31012025475502014, + "learning_rate": 4.98758177798167e-06, + "loss": 0.6842, + "step": 914 + }, + { + "epoch": 0.2132204443063539, + "grad_norm": 0.3235667943954468, + "learning_rate": 4.987551273000723e-06, + "loss": 0.6752, + "step": 915 + }, + { + "epoch": 0.213453472114339, + "grad_norm": 0.31368494033813477, + "learning_rate": 4.987520730691965e-06, + "loss": 0.6788, + "step": 916 + }, + { + "epoch": 0.21368649992232405, + "grad_norm": 0.3298737704753876, + "learning_rate": 4.987490151055855e-06, + "loss": 0.6906, + "step": 917 + }, + { + "epoch": 0.21391952773030914, + "grad_norm": 0.3258292078971863, + "learning_rate": 4.987459534092853e-06, + "loss": 0.6612, + "step": 918 + }, + { + "epoch": 0.21415255553829424, + "grad_norm": 0.3185737133026123, + "learning_rate": 4.987428879803418e-06, + "loss": 0.6602, + "step": 919 + }, + { + "epoch": 0.21438558334627933, + "grad_norm": 0.31575673818588257, + "learning_rate": 4.987398188188009e-06, + "loss": 0.6865, + "step": 920 + }, + { + "epoch": 0.21461861115426442, + "grad_norm": 0.3217650353908539, + "learning_rate": 4.987367459247087e-06, + "loss": 0.6802, + "step": 921 + }, + { + "epoch": 0.21485163896224949, + "grad_norm": 0.3315964341163635, + "learning_rate": 4.987336692981113e-06, + "loss": 0.6675, + "step": 922 + }, + { + "epoch": 0.21508466677023458, + "grad_norm": 0.32140815258026123, + "learning_rate": 4.98730588939055e-06, + "loss": 0.6882, + "step": 923 + }, + { + "epoch": 0.21531769457821967, + "grad_norm": 0.31715768575668335, + "learning_rate": 4.9872750484758595e-06, + "loss": 0.6702, + "step": 924 + }, + { + "epoch": 0.21555072238620476, + "grad_norm": 0.3184153139591217, + "learning_rate": 4.987244170237503e-06, + "loss": 0.695, + "step": 925 + }, + { + "epoch": 0.21578375019418983, + "grad_norm": 0.3181294798851013, + "learning_rate": 4.987213254675945e-06, + "loss": 0.6851, + "step": 926 + }, + { + "epoch": 0.21601677800217492, + "grad_norm": 0.3019131124019623, + "learning_rate": 4.98718230179165e-06, + "loss": 0.6902, + "step": 927 + }, + { + "epoch": 0.21624980581016, + "grad_norm": 0.3133629262447357, + "learning_rate": 4.987151311585082e-06, + "loss": 0.6771, + "step": 928 + }, + { + "epoch": 0.2164828336181451, + "grad_norm": 0.3170763850212097, + "learning_rate": 4.987120284056705e-06, + "loss": 0.6835, + "step": 929 + }, + { + "epoch": 0.2167158614261302, + "grad_norm": 0.3109990656375885, + "learning_rate": 4.987089219206986e-06, + "loss": 0.6877, + "step": 930 + }, + { + "epoch": 0.21694888923411526, + "grad_norm": 0.3352920711040497, + "learning_rate": 4.98705811703639e-06, + "loss": 0.6902, + "step": 931 + }, + { + "epoch": 0.21718191704210035, + "grad_norm": 0.3126963973045349, + "learning_rate": 4.987026977545385e-06, + "loss": 0.6627, + "step": 932 + }, + { + "epoch": 0.21741494485008545, + "grad_norm": 0.31306400895118713, + "learning_rate": 4.986995800734437e-06, + "loss": 0.6892, + "step": 933 + }, + { + "epoch": 0.21764797265807054, + "grad_norm": 0.30529454350471497, + "learning_rate": 4.986964586604015e-06, + "loss": 0.6588, + "step": 934 + }, + { + "epoch": 0.2178810004660556, + "grad_norm": 0.3121471703052521, + "learning_rate": 4.986933335154586e-06, + "loss": 0.6819, + "step": 935 + }, + { + "epoch": 0.2181140282740407, + "grad_norm": 0.32725220918655396, + "learning_rate": 4.98690204638662e-06, + "loss": 0.7281, + "step": 936 + }, + { + "epoch": 0.2183470560820258, + "grad_norm": 0.3239481747150421, + "learning_rate": 4.986870720300586e-06, + "loss": 0.7076, + "step": 937 + }, + { + "epoch": 0.21858008389001088, + "grad_norm": 0.31875649094581604, + "learning_rate": 4.986839356896956e-06, + "loss": 0.7077, + "step": 938 + }, + { + "epoch": 0.21881311169799597, + "grad_norm": 0.31592074036598206, + "learning_rate": 4.986807956176197e-06, + "loss": 0.6728, + "step": 939 + }, + { + "epoch": 0.21904613950598104, + "grad_norm": 0.3200913965702057, + "learning_rate": 4.986776518138783e-06, + "loss": 0.73, + "step": 940 + }, + { + "epoch": 0.21927916731396613, + "grad_norm": 0.3183765113353729, + "learning_rate": 4.986745042785185e-06, + "loss": 0.7014, + "step": 941 + }, + { + "epoch": 0.21951219512195122, + "grad_norm": 0.3215160667896271, + "learning_rate": 4.986713530115874e-06, + "loss": 0.6629, + "step": 942 + }, + { + "epoch": 0.2197452229299363, + "grad_norm": 0.3064996004104614, + "learning_rate": 4.986681980131326e-06, + "loss": 0.6615, + "step": 943 + }, + { + "epoch": 0.2199782507379214, + "grad_norm": 0.3177923858165741, + "learning_rate": 4.986650392832011e-06, + "loss": 0.7035, + "step": 944 + }, + { + "epoch": 0.22021127854590647, + "grad_norm": 0.32579976320266724, + "learning_rate": 4.986618768218405e-06, + "loss": 0.7123, + "step": 945 + }, + { + "epoch": 0.22044430635389156, + "grad_norm": 0.31881824135780334, + "learning_rate": 4.986587106290983e-06, + "loss": 0.6937, + "step": 946 + }, + { + "epoch": 0.22067733416187665, + "grad_norm": 0.3212588131427765, + "learning_rate": 4.986555407050218e-06, + "loss": 0.6622, + "step": 947 + }, + { + "epoch": 0.22091036196986175, + "grad_norm": 0.31766781210899353, + "learning_rate": 4.9865236704965884e-06, + "loss": 0.6648, + "step": 948 + }, + { + "epoch": 0.2211433897778468, + "grad_norm": 0.31368860602378845, + "learning_rate": 4.986491896630568e-06, + "loss": 0.679, + "step": 949 + }, + { + "epoch": 0.2213764175858319, + "grad_norm": 0.3226605951786041, + "learning_rate": 4.986460085452634e-06, + "loss": 0.6947, + "step": 950 + }, + { + "epoch": 0.221609445393817, + "grad_norm": 0.3288770616054535, + "learning_rate": 4.986428236963264e-06, + "loss": 0.6695, + "step": 951 + }, + { + "epoch": 0.2218424732018021, + "grad_norm": 0.31415286660194397, + "learning_rate": 4.986396351162937e-06, + "loss": 0.6665, + "step": 952 + }, + { + "epoch": 0.22207550100978718, + "grad_norm": 0.3283250331878662, + "learning_rate": 4.986364428052131e-06, + "loss": 0.7251, + "step": 953 + }, + { + "epoch": 0.22230852881777224, + "grad_norm": 0.308844655752182, + "learning_rate": 4.986332467631324e-06, + "loss": 0.6823, + "step": 954 + }, + { + "epoch": 0.22254155662575734, + "grad_norm": 0.32204872369766235, + "learning_rate": 4.9863004699009965e-06, + "loss": 0.674, + "step": 955 + }, + { + "epoch": 0.22277458443374243, + "grad_norm": 0.3426995873451233, + "learning_rate": 4.986268434861628e-06, + "loss": 0.6883, + "step": 956 + }, + { + "epoch": 0.22300761224172752, + "grad_norm": 0.3239719271659851, + "learning_rate": 4.9862363625137e-06, + "loss": 0.6799, + "step": 957 + }, + { + "epoch": 0.22324064004971259, + "grad_norm": 0.3150714635848999, + "learning_rate": 4.986204252857694e-06, + "loss": 0.7011, + "step": 958 + }, + { + "epoch": 0.22347366785769768, + "grad_norm": 0.3101712465286255, + "learning_rate": 4.98617210589409e-06, + "loss": 0.6938, + "step": 959 + }, + { + "epoch": 0.22370669566568277, + "grad_norm": 0.3265264928340912, + "learning_rate": 4.986139921623372e-06, + "loss": 0.6939, + "step": 960 + }, + { + "epoch": 0.22393972347366786, + "grad_norm": 0.31700029969215393, + "learning_rate": 4.9861077000460235e-06, + "loss": 0.6823, + "step": 961 + }, + { + "epoch": 0.22417275128165295, + "grad_norm": 0.31733718514442444, + "learning_rate": 4.986075441162526e-06, + "loss": 0.6843, + "step": 962 + }, + { + "epoch": 0.22440577908963802, + "grad_norm": 0.30548855662345886, + "learning_rate": 4.9860431449733645e-06, + "loss": 0.671, + "step": 963 + }, + { + "epoch": 0.2246388068976231, + "grad_norm": 0.32768377661705017, + "learning_rate": 4.986010811479025e-06, + "loss": 0.6767, + "step": 964 + }, + { + "epoch": 0.2248718347056082, + "grad_norm": 0.32733145356178284, + "learning_rate": 4.985978440679991e-06, + "loss": 0.6815, + "step": 965 + }, + { + "epoch": 0.2251048625135933, + "grad_norm": 0.32724952697753906, + "learning_rate": 4.985946032576748e-06, + "loss": 0.6862, + "step": 966 + }, + { + "epoch": 0.2253378903215784, + "grad_norm": 0.3247838616371155, + "learning_rate": 4.985913587169785e-06, + "loss": 0.6859, + "step": 967 + }, + { + "epoch": 0.22557091812956345, + "grad_norm": 0.3151308298110962, + "learning_rate": 4.985881104459585e-06, + "loss": 0.7005, + "step": 968 + }, + { + "epoch": 0.22580394593754854, + "grad_norm": 0.32546621561050415, + "learning_rate": 4.985848584446638e-06, + "loss": 0.6763, + "step": 969 + }, + { + "epoch": 0.22603697374553364, + "grad_norm": 0.3261764645576477, + "learning_rate": 4.985816027131431e-06, + "loss": 0.6854, + "step": 970 + }, + { + "epoch": 0.22627000155351873, + "grad_norm": 0.31922075152397156, + "learning_rate": 4.985783432514454e-06, + "loss": 0.6805, + "step": 971 + }, + { + "epoch": 0.2265030293615038, + "grad_norm": 0.3158024847507477, + "learning_rate": 4.985750800596194e-06, + "loss": 0.6841, + "step": 972 + }, + { + "epoch": 0.22673605716948889, + "grad_norm": 0.31959623098373413, + "learning_rate": 4.985718131377142e-06, + "loss": 0.6931, + "step": 973 + }, + { + "epoch": 0.22696908497747398, + "grad_norm": 0.3227651119232178, + "learning_rate": 4.985685424857788e-06, + "loss": 0.6596, + "step": 974 + }, + { + "epoch": 0.22720211278545907, + "grad_norm": 0.320505291223526, + "learning_rate": 4.985652681038622e-06, + "loss": 0.7218, + "step": 975 + }, + { + "epoch": 0.22743514059344416, + "grad_norm": 0.3225525915622711, + "learning_rate": 4.985619899920137e-06, + "loss": 0.6955, + "step": 976 + }, + { + "epoch": 0.22766816840142923, + "grad_norm": 0.3074674606323242, + "learning_rate": 4.9855870815028235e-06, + "loss": 0.6571, + "step": 977 + }, + { + "epoch": 0.22790119620941432, + "grad_norm": 0.33230888843536377, + "learning_rate": 4.9855542257871744e-06, + "loss": 0.6683, + "step": 978 + }, + { + "epoch": 0.2281342240173994, + "grad_norm": 0.3268888294696808, + "learning_rate": 4.985521332773683e-06, + "loss": 0.6815, + "step": 979 + }, + { + "epoch": 0.2283672518253845, + "grad_norm": 0.31517982482910156, + "learning_rate": 4.9854884024628425e-06, + "loss": 0.672, + "step": 980 + }, + { + "epoch": 0.22860027963336957, + "grad_norm": 0.3164212703704834, + "learning_rate": 4.985455434855147e-06, + "loss": 0.6838, + "step": 981 + }, + { + "epoch": 0.22883330744135466, + "grad_norm": 0.3225736916065216, + "learning_rate": 4.9854224299510915e-06, + "loss": 0.6592, + "step": 982 + }, + { + "epoch": 0.22906633524933975, + "grad_norm": 0.32018762826919556, + "learning_rate": 4.985389387751171e-06, + "loss": 0.6853, + "step": 983 + }, + { + "epoch": 0.22929936305732485, + "grad_norm": 0.3260047733783722, + "learning_rate": 4.985356308255882e-06, + "loss": 0.6892, + "step": 984 + }, + { + "epoch": 0.22953239086530994, + "grad_norm": 0.33310064673423767, + "learning_rate": 4.985323191465719e-06, + "loss": 0.6911, + "step": 985 + }, + { + "epoch": 0.229765418673295, + "grad_norm": 0.3268904983997345, + "learning_rate": 4.985290037381182e-06, + "loss": 0.6713, + "step": 986 + }, + { + "epoch": 0.2299984464812801, + "grad_norm": 0.3028082251548767, + "learning_rate": 4.985256846002766e-06, + "loss": 0.6489, + "step": 987 + }, + { + "epoch": 0.2302314742892652, + "grad_norm": 0.32314130663871765, + "learning_rate": 4.98522361733097e-06, + "loss": 0.708, + "step": 988 + }, + { + "epoch": 0.23046450209725028, + "grad_norm": 0.330923467874527, + "learning_rate": 4.985190351366293e-06, + "loss": 0.7033, + "step": 989 + }, + { + "epoch": 0.23069752990523537, + "grad_norm": 0.33708706498146057, + "learning_rate": 4.9851570481092336e-06, + "loss": 0.6799, + "step": 990 + }, + { + "epoch": 0.23093055771322044, + "grad_norm": 0.3309376835823059, + "learning_rate": 4.985123707560291e-06, + "loss": 0.6874, + "step": 991 + }, + { + "epoch": 0.23116358552120553, + "grad_norm": 0.3181137144565582, + "learning_rate": 4.985090329719967e-06, + "loss": 0.6844, + "step": 992 + }, + { + "epoch": 0.23139661332919062, + "grad_norm": 0.32914844155311584, + "learning_rate": 4.985056914588761e-06, + "loss": 0.7064, + "step": 993 + }, + { + "epoch": 0.2316296411371757, + "grad_norm": 0.32259196043014526, + "learning_rate": 4.985023462167174e-06, + "loss": 0.6758, + "step": 994 + }, + { + "epoch": 0.23186266894516078, + "grad_norm": 0.32219961285591125, + "learning_rate": 4.984989972455711e-06, + "loss": 0.6808, + "step": 995 + }, + { + "epoch": 0.23209569675314587, + "grad_norm": 0.2993902564048767, + "learning_rate": 4.984956445454872e-06, + "loss": 0.6595, + "step": 996 + }, + { + "epoch": 0.23232872456113096, + "grad_norm": 0.3115287721157074, + "learning_rate": 4.984922881165159e-06, + "loss": 0.6854, + "step": 997 + }, + { + "epoch": 0.23256175236911605, + "grad_norm": 0.3306537866592407, + "learning_rate": 4.984889279587079e-06, + "loss": 0.6997, + "step": 998 + }, + { + "epoch": 0.23279478017710115, + "grad_norm": 0.32630330324172974, + "learning_rate": 4.984855640721134e-06, + "loss": 0.6989, + "step": 999 + }, + { + "epoch": 0.2330278079850862, + "grad_norm": 0.3192913234233856, + "learning_rate": 4.9848219645678295e-06, + "loss": 0.6723, + "step": 1000 + }, + { + "epoch": 0.2332608357930713, + "grad_norm": 0.3126539885997772, + "learning_rate": 4.98478825112767e-06, + "loss": 0.6949, + "step": 1001 + }, + { + "epoch": 0.2334938636010564, + "grad_norm": 0.32951727509498596, + "learning_rate": 4.984754500401163e-06, + "loss": 0.6754, + "step": 1002 + }, + { + "epoch": 0.2337268914090415, + "grad_norm": 0.30512815713882446, + "learning_rate": 4.984720712388814e-06, + "loss": 0.7017, + "step": 1003 + }, + { + "epoch": 0.23395991921702655, + "grad_norm": 0.3149077296257019, + "learning_rate": 4.984686887091129e-06, + "loss": 0.6752, + "step": 1004 + }, + { + "epoch": 0.23419294702501164, + "grad_norm": 0.3332644999027252, + "learning_rate": 4.984653024508618e-06, + "loss": 0.6762, + "step": 1005 + }, + { + "epoch": 0.23442597483299674, + "grad_norm": 0.3328660726547241, + "learning_rate": 4.984619124641787e-06, + "loss": 0.7122, + "step": 1006 + }, + { + "epoch": 0.23465900264098183, + "grad_norm": 0.319530189037323, + "learning_rate": 4.984585187491145e-06, + "loss": 0.6844, + "step": 1007 + }, + { + "epoch": 0.23489203044896692, + "grad_norm": 0.3192739188671112, + "learning_rate": 4.9845512130572015e-06, + "loss": 0.704, + "step": 1008 + }, + { + "epoch": 0.23512505825695199, + "grad_norm": 0.3231336772441864, + "learning_rate": 4.984517201340468e-06, + "loss": 0.683, + "step": 1009 + }, + { + "epoch": 0.23535808606493708, + "grad_norm": 0.3182506561279297, + "learning_rate": 4.984483152341452e-06, + "loss": 0.6706, + "step": 1010 + }, + { + "epoch": 0.23559111387292217, + "grad_norm": 0.3149191439151764, + "learning_rate": 4.984449066060666e-06, + "loss": 0.6662, + "step": 1011 + }, + { + "epoch": 0.23582414168090726, + "grad_norm": 0.32004258036613464, + "learning_rate": 4.984414942498622e-06, + "loss": 0.6983, + "step": 1012 + }, + { + "epoch": 0.23605716948889235, + "grad_norm": 0.32421594858169556, + "learning_rate": 4.984380781655831e-06, + "loss": 0.6891, + "step": 1013 + }, + { + "epoch": 0.23629019729687742, + "grad_norm": 0.3202371895313263, + "learning_rate": 4.984346583532806e-06, + "loss": 0.6775, + "step": 1014 + }, + { + "epoch": 0.2365232251048625, + "grad_norm": 0.31756332516670227, + "learning_rate": 4.9843123481300596e-06, + "loss": 0.685, + "step": 1015 + }, + { + "epoch": 0.2367562529128476, + "grad_norm": 0.31431037187576294, + "learning_rate": 4.984278075448107e-06, + "loss": 0.6598, + "step": 1016 + }, + { + "epoch": 0.2369892807208327, + "grad_norm": 0.32881680130958557, + "learning_rate": 4.984243765487461e-06, + "loss": 0.6747, + "step": 1017 + }, + { + "epoch": 0.23722230852881776, + "grad_norm": 0.31124937534332275, + "learning_rate": 4.984209418248637e-06, + "loss": 0.6637, + "step": 1018 + }, + { + "epoch": 0.23745533633680285, + "grad_norm": 0.3284285366535187, + "learning_rate": 4.984175033732151e-06, + "loss": 0.6814, + "step": 1019 + }, + { + "epoch": 0.23768836414478794, + "grad_norm": 0.3177911937236786, + "learning_rate": 4.984140611938517e-06, + "loss": 0.6893, + "step": 1020 + }, + { + "epoch": 0.23792139195277304, + "grad_norm": 0.308485209941864, + "learning_rate": 4.984106152868254e-06, + "loss": 0.6802, + "step": 1021 + }, + { + "epoch": 0.23815441976075813, + "grad_norm": 0.31310921907424927, + "learning_rate": 4.984071656521878e-06, + "loss": 0.6755, + "step": 1022 + }, + { + "epoch": 0.2383874475687432, + "grad_norm": 0.32530835270881653, + "learning_rate": 4.984037122899906e-06, + "loss": 0.6969, + "step": 1023 + }, + { + "epoch": 0.23862047537672829, + "grad_norm": 0.3131357729434967, + "learning_rate": 4.984002552002858e-06, + "loss": 0.729, + "step": 1024 + }, + { + "epoch": 0.23885350318471338, + "grad_norm": 0.3161475956439972, + "learning_rate": 4.9839679438312505e-06, + "loss": 0.6767, + "step": 1025 + }, + { + "epoch": 0.23908653099269847, + "grad_norm": 0.3180074691772461, + "learning_rate": 4.983933298385605e-06, + "loss": 0.6966, + "step": 1026 + }, + { + "epoch": 0.23931955880068354, + "grad_norm": 0.32768023014068604, + "learning_rate": 4.983898615666439e-06, + "loss": 0.6986, + "step": 1027 + }, + { + "epoch": 0.23955258660866863, + "grad_norm": 0.3414887487888336, + "learning_rate": 4.983863895674275e-06, + "loss": 0.7014, + "step": 1028 + }, + { + "epoch": 0.23978561441665372, + "grad_norm": 0.34052568674087524, + "learning_rate": 4.983829138409634e-06, + "loss": 0.714, + "step": 1029 + }, + { + "epoch": 0.2400186422246388, + "grad_norm": 0.31681525707244873, + "learning_rate": 4.983794343873036e-06, + "loss": 0.6774, + "step": 1030 + }, + { + "epoch": 0.2402516700326239, + "grad_norm": 0.30409830808639526, + "learning_rate": 4.983759512065004e-06, + "loss": 0.6891, + "step": 1031 + }, + { + "epoch": 0.24048469784060897, + "grad_norm": 0.31787604093551636, + "learning_rate": 4.983724642986061e-06, + "loss": 0.6884, + "step": 1032 + }, + { + "epoch": 0.24071772564859406, + "grad_norm": 0.33168041706085205, + "learning_rate": 4.98368973663673e-06, + "loss": 0.6933, + "step": 1033 + }, + { + "epoch": 0.24095075345657915, + "grad_norm": 0.32437780499458313, + "learning_rate": 4.983654793017534e-06, + "loss": 0.7189, + "step": 1034 + }, + { + "epoch": 0.24118378126456425, + "grad_norm": 0.3383296728134155, + "learning_rate": 4.983619812128998e-06, + "loss": 0.6868, + "step": 1035 + }, + { + "epoch": 0.24141680907254934, + "grad_norm": 0.32842451333999634, + "learning_rate": 4.983584793971648e-06, + "loss": 0.7017, + "step": 1036 + }, + { + "epoch": 0.2416498368805344, + "grad_norm": 0.3163357973098755, + "learning_rate": 4.983549738546007e-06, + "loss": 0.6855, + "step": 1037 + }, + { + "epoch": 0.2418828646885195, + "grad_norm": 0.3236447274684906, + "learning_rate": 4.983514645852604e-06, + "loss": 0.6987, + "step": 1038 + }, + { + "epoch": 0.2421158924965046, + "grad_norm": 0.34931090474128723, + "learning_rate": 4.983479515891963e-06, + "loss": 0.6808, + "step": 1039 + }, + { + "epoch": 0.24234892030448968, + "grad_norm": 0.31541189551353455, + "learning_rate": 4.9834443486646125e-06, + "loss": 0.6708, + "step": 1040 + }, + { + "epoch": 0.24258194811247474, + "grad_norm": 0.3185347616672516, + "learning_rate": 4.98340914417108e-06, + "loss": 0.6815, + "step": 1041 + }, + { + "epoch": 0.24281497592045984, + "grad_norm": 0.337496280670166, + "learning_rate": 4.9833739024118935e-06, + "loss": 0.668, + "step": 1042 + }, + { + "epoch": 0.24304800372844493, + "grad_norm": 0.3248627185821533, + "learning_rate": 4.983338623387582e-06, + "loss": 0.7107, + "step": 1043 + }, + { + "epoch": 0.24328103153643002, + "grad_norm": 0.33839982748031616, + "learning_rate": 4.9833033070986756e-06, + "loss": 0.6853, + "step": 1044 + }, + { + "epoch": 0.2435140593444151, + "grad_norm": 0.3312564194202423, + "learning_rate": 4.983267953545703e-06, + "loss": 0.6913, + "step": 1045 + }, + { + "epoch": 0.24374708715240018, + "grad_norm": 0.32821324467658997, + "learning_rate": 4.983232562729196e-06, + "loss": 0.6667, + "step": 1046 + }, + { + "epoch": 0.24398011496038527, + "grad_norm": 0.3222935199737549, + "learning_rate": 4.983197134649684e-06, + "loss": 0.648, + "step": 1047 + }, + { + "epoch": 0.24421314276837036, + "grad_norm": 0.3185495138168335, + "learning_rate": 4.9831616693077e-06, + "loss": 0.6611, + "step": 1048 + }, + { + "epoch": 0.24444617057635545, + "grad_norm": 0.3268357515335083, + "learning_rate": 4.983126166703776e-06, + "loss": 0.6956, + "step": 1049 + }, + { + "epoch": 0.24467919838434052, + "grad_norm": 0.31512388586997986, + "learning_rate": 4.983090626838444e-06, + "loss": 0.6799, + "step": 1050 + }, + { + "epoch": 0.2449122261923256, + "grad_norm": 0.31316640973091125, + "learning_rate": 4.983055049712238e-06, + "loss": 0.6928, + "step": 1051 + }, + { + "epoch": 0.2451452540003107, + "grad_norm": 0.30760249495506287, + "learning_rate": 4.9830194353256925e-06, + "loss": 0.6744, + "step": 1052 + }, + { + "epoch": 0.2453782818082958, + "grad_norm": 0.3210604190826416, + "learning_rate": 4.9829837836793405e-06, + "loss": 0.6656, + "step": 1053 + }, + { + "epoch": 0.2456113096162809, + "grad_norm": 0.31622931361198425, + "learning_rate": 4.9829480947737175e-06, + "loss": 0.6669, + "step": 1054 + }, + { + "epoch": 0.24584433742426595, + "grad_norm": 0.3119875490665436, + "learning_rate": 4.982912368609359e-06, + "loss": 0.6869, + "step": 1055 + }, + { + "epoch": 0.24607736523225104, + "grad_norm": 0.3131190538406372, + "learning_rate": 4.982876605186802e-06, + "loss": 0.6638, + "step": 1056 + }, + { + "epoch": 0.24631039304023614, + "grad_norm": 0.31808602809906006, + "learning_rate": 4.982840804506582e-06, + "loss": 0.6879, + "step": 1057 + }, + { + "epoch": 0.24654342084822123, + "grad_norm": 0.3290175497531891, + "learning_rate": 4.982804966569236e-06, + "loss": 0.7016, + "step": 1058 + }, + { + "epoch": 0.24677644865620632, + "grad_norm": 0.32719728350639343, + "learning_rate": 4.982769091375303e-06, + "loss": 0.6836, + "step": 1059 + }, + { + "epoch": 0.24700947646419139, + "grad_norm": 0.3295661509037018, + "learning_rate": 4.98273317892532e-06, + "loss": 0.6572, + "step": 1060 + }, + { + "epoch": 0.24724250427217648, + "grad_norm": 0.3341047763824463, + "learning_rate": 4.9826972292198285e-06, + "loss": 0.6918, + "step": 1061 + }, + { + "epoch": 0.24747553208016157, + "grad_norm": 0.31995072960853577, + "learning_rate": 4.982661242259364e-06, + "loss": 0.6724, + "step": 1062 + }, + { + "epoch": 0.24770855988814666, + "grad_norm": 0.3117062449455261, + "learning_rate": 4.98262521804447e-06, + "loss": 0.6708, + "step": 1063 + }, + { + "epoch": 0.24794158769613173, + "grad_norm": 0.31006914377212524, + "learning_rate": 4.9825891565756855e-06, + "loss": 0.692, + "step": 1064 + }, + { + "epoch": 0.24817461550411682, + "grad_norm": 0.32288649678230286, + "learning_rate": 4.9825530578535515e-06, + "loss": 0.69, + "step": 1065 + }, + { + "epoch": 0.2484076433121019, + "grad_norm": 0.3262838125228882, + "learning_rate": 4.98251692187861e-06, + "loss": 0.7044, + "step": 1066 + }, + { + "epoch": 0.248640671120087, + "grad_norm": 0.31438615918159485, + "learning_rate": 4.982480748651404e-06, + "loss": 0.6879, + "step": 1067 + }, + { + "epoch": 0.2488736989280721, + "grad_norm": 0.3240039646625519, + "learning_rate": 4.982444538172474e-06, + "loss": 0.6839, + "step": 1068 + }, + { + "epoch": 0.24910672673605716, + "grad_norm": 0.32020998001098633, + "learning_rate": 4.9824082904423665e-06, + "loss": 0.6608, + "step": 1069 + }, + { + "epoch": 0.24933975454404225, + "grad_norm": 0.31015968322753906, + "learning_rate": 4.9823720054616236e-06, + "loss": 0.6458, + "step": 1070 + }, + { + "epoch": 0.24957278235202734, + "grad_norm": 0.3322699964046478, + "learning_rate": 4.982335683230789e-06, + "loss": 0.7054, + "step": 1071 + }, + { + "epoch": 0.24980581016001244, + "grad_norm": 0.33780789375305176, + "learning_rate": 4.982299323750409e-06, + "loss": 0.6755, + "step": 1072 + }, + { + "epoch": 0.25003883796799753, + "grad_norm": 0.3332003653049469, + "learning_rate": 4.98226292702103e-06, + "loss": 0.6897, + "step": 1073 + }, + { + "epoch": 0.2502718657759826, + "grad_norm": 0.31305354833602905, + "learning_rate": 4.982226493043196e-06, + "loss": 0.6491, + "step": 1074 + }, + { + "epoch": 0.2505048935839677, + "grad_norm": 0.34786874055862427, + "learning_rate": 4.982190021817456e-06, + "loss": 0.675, + "step": 1075 + }, + { + "epoch": 0.2507379213919528, + "grad_norm": 0.3354516625404358, + "learning_rate": 4.982153513344356e-06, + "loss": 0.6907, + "step": 1076 + }, + { + "epoch": 0.25097094919993784, + "grad_norm": 0.32449620962142944, + "learning_rate": 4.982116967624443e-06, + "loss": 0.6725, + "step": 1077 + }, + { + "epoch": 0.25120397700792296, + "grad_norm": 0.33061128854751587, + "learning_rate": 4.9820803846582675e-06, + "loss": 0.6995, + "step": 1078 + }, + { + "epoch": 0.251437004815908, + "grad_norm": 0.3355608880519867, + "learning_rate": 4.982043764446377e-06, + "loss": 0.7114, + "step": 1079 + }, + { + "epoch": 0.2516700326238931, + "grad_norm": 0.32722851634025574, + "learning_rate": 4.982007106989322e-06, + "loss": 0.6612, + "step": 1080 + }, + { + "epoch": 0.2519030604318782, + "grad_norm": 0.3304991126060486, + "learning_rate": 4.981970412287651e-06, + "loss": 0.6826, + "step": 1081 + }, + { + "epoch": 0.2521360882398633, + "grad_norm": 0.3070150911808014, + "learning_rate": 4.981933680341916e-06, + "loss": 0.6807, + "step": 1082 + }, + { + "epoch": 0.2523691160478484, + "grad_norm": 0.3161391317844391, + "learning_rate": 4.9818969111526675e-06, + "loss": 0.6907, + "step": 1083 + }, + { + "epoch": 0.25260214385583346, + "grad_norm": 0.30827802419662476, + "learning_rate": 4.981860104720458e-06, + "loss": 0.6824, + "step": 1084 + }, + { + "epoch": 0.2528351716638185, + "grad_norm": 0.32472050189971924, + "learning_rate": 4.981823261045839e-06, + "loss": 0.7006, + "step": 1085 + }, + { + "epoch": 0.25306819947180365, + "grad_norm": 0.3271133005619049, + "learning_rate": 4.981786380129364e-06, + "loss": 0.6786, + "step": 1086 + }, + { + "epoch": 0.2533012272797887, + "grad_norm": 0.32003071904182434, + "learning_rate": 4.981749461971586e-06, + "loss": 0.6928, + "step": 1087 + }, + { + "epoch": 0.25353425508777383, + "grad_norm": 0.3227982819080353, + "learning_rate": 4.98171250657306e-06, + "loss": 0.6758, + "step": 1088 + }, + { + "epoch": 0.2537672828957589, + "grad_norm": 0.32913100719451904, + "learning_rate": 4.981675513934338e-06, + "loss": 0.719, + "step": 1089 + }, + { + "epoch": 0.25400031070374396, + "grad_norm": 0.308261513710022, + "learning_rate": 4.981638484055978e-06, + "loss": 0.675, + "step": 1090 + }, + { + "epoch": 0.2542333385117291, + "grad_norm": 0.31638050079345703, + "learning_rate": 4.9816014169385345e-06, + "loss": 0.6578, + "step": 1091 + }, + { + "epoch": 0.25446636631971414, + "grad_norm": 0.3201076090335846, + "learning_rate": 4.981564312582563e-06, + "loss": 0.6754, + "step": 1092 + }, + { + "epoch": 0.25469939412769926, + "grad_norm": 0.31934770941734314, + "learning_rate": 4.981527170988621e-06, + "loss": 0.6615, + "step": 1093 + }, + { + "epoch": 0.25493242193568433, + "grad_norm": 0.3143176734447479, + "learning_rate": 4.981489992157266e-06, + "loss": 0.6467, + "step": 1094 + }, + { + "epoch": 0.2551654497436694, + "grad_norm": 0.3193483054637909, + "learning_rate": 4.9814527760890565e-06, + "loss": 0.6631, + "step": 1095 + }, + { + "epoch": 0.2553984775516545, + "grad_norm": 0.3184541165828705, + "learning_rate": 4.981415522784549e-06, + "loss": 0.7029, + "step": 1096 + }, + { + "epoch": 0.2556315053596396, + "grad_norm": 0.35324546694755554, + "learning_rate": 4.981378232244304e-06, + "loss": 0.6817, + "step": 1097 + }, + { + "epoch": 0.2558645331676247, + "grad_norm": 0.3153809607028961, + "learning_rate": 4.981340904468881e-06, + "loss": 0.6552, + "step": 1098 + }, + { + "epoch": 0.25609756097560976, + "grad_norm": 0.33551350235939026, + "learning_rate": 4.98130353945884e-06, + "loss": 0.7084, + "step": 1099 + }, + { + "epoch": 0.2563305887835948, + "grad_norm": 0.33167293667793274, + "learning_rate": 4.981266137214741e-06, + "loss": 0.6773, + "step": 1100 + }, + { + "epoch": 0.25656361659157995, + "grad_norm": 0.3189348578453064, + "learning_rate": 4.981228697737147e-06, + "loss": 0.6695, + "step": 1101 + }, + { + "epoch": 0.256796644399565, + "grad_norm": 0.32993099093437195, + "learning_rate": 4.981191221026618e-06, + "loss": 0.6928, + "step": 1102 + }, + { + "epoch": 0.2570296722075501, + "grad_norm": 0.3275751769542694, + "learning_rate": 4.981153707083717e-06, + "loss": 0.6619, + "step": 1103 + }, + { + "epoch": 0.2572627000155352, + "grad_norm": 0.3442035913467407, + "learning_rate": 4.981116155909007e-06, + "loss": 0.6484, + "step": 1104 + }, + { + "epoch": 0.25749572782352026, + "grad_norm": 0.322083443403244, + "learning_rate": 4.981078567503051e-06, + "loss": 0.6485, + "step": 1105 + }, + { + "epoch": 0.2577287556315054, + "grad_norm": 0.3244823217391968, + "learning_rate": 4.981040941866414e-06, + "loss": 0.6758, + "step": 1106 + }, + { + "epoch": 0.25796178343949044, + "grad_norm": 0.32457441091537476, + "learning_rate": 4.9810032789996595e-06, + "loss": 0.6862, + "step": 1107 + }, + { + "epoch": 0.2581948112474755, + "grad_norm": 0.3417382538318634, + "learning_rate": 4.9809655789033536e-06, + "loss": 0.6969, + "step": 1108 + }, + { + "epoch": 0.25842783905546063, + "grad_norm": 0.3285803496837616, + "learning_rate": 4.9809278415780616e-06, + "loss": 0.6803, + "step": 1109 + }, + { + "epoch": 0.2586608668634457, + "grad_norm": 0.3371686041355133, + "learning_rate": 4.98089006702435e-06, + "loss": 0.689, + "step": 1110 + }, + { + "epoch": 0.2588938946714308, + "grad_norm": 0.3267279267311096, + "learning_rate": 4.980852255242786e-06, + "loss": 0.6717, + "step": 1111 + }, + { + "epoch": 0.2591269224794159, + "grad_norm": 0.34054896235466003, + "learning_rate": 4.980814406233936e-06, + "loss": 0.7054, + "step": 1112 + }, + { + "epoch": 0.25935995028740094, + "grad_norm": 0.3284096419811249, + "learning_rate": 4.980776519998368e-06, + "loss": 0.6687, + "step": 1113 + }, + { + "epoch": 0.25959297809538606, + "grad_norm": 0.3172536790370941, + "learning_rate": 4.9807385965366515e-06, + "loss": 0.6461, + "step": 1114 + }, + { + "epoch": 0.2598260059033711, + "grad_norm": 0.33623701333999634, + "learning_rate": 4.980700635849355e-06, + "loss": 0.6784, + "step": 1115 + }, + { + "epoch": 0.26005903371135625, + "grad_norm": 0.34166187047958374, + "learning_rate": 4.980662637937048e-06, + "loss": 0.6714, + "step": 1116 + }, + { + "epoch": 0.2602920615193413, + "grad_norm": 0.32317808270454407, + "learning_rate": 4.980624602800302e-06, + "loss": 0.6654, + "step": 1117 + }, + { + "epoch": 0.2605250893273264, + "grad_norm": 0.3245809078216553, + "learning_rate": 4.980586530439685e-06, + "loss": 0.6971, + "step": 1118 + }, + { + "epoch": 0.2607581171353115, + "grad_norm": 0.33168426156044006, + "learning_rate": 4.98054842085577e-06, + "loss": 0.6627, + "step": 1119 + }, + { + "epoch": 0.26099114494329656, + "grad_norm": 0.3556671738624573, + "learning_rate": 4.980510274049129e-06, + "loss": 0.6908, + "step": 1120 + }, + { + "epoch": 0.2612241727512817, + "grad_norm": 0.3098551332950592, + "learning_rate": 4.980472090020335e-06, + "loss": 0.6744, + "step": 1121 + }, + { + "epoch": 0.26145720055926674, + "grad_norm": 0.3297421634197235, + "learning_rate": 4.980433868769959e-06, + "loss": 0.6775, + "step": 1122 + }, + { + "epoch": 0.2616902283672518, + "grad_norm": 0.32819730043411255, + "learning_rate": 4.980395610298577e-06, + "loss": 0.6793, + "step": 1123 + }, + { + "epoch": 0.26192325617523693, + "grad_norm": 0.3369044065475464, + "learning_rate": 4.980357314606762e-06, + "loss": 0.6945, + "step": 1124 + }, + { + "epoch": 0.262156283983222, + "grad_norm": 0.3214326798915863, + "learning_rate": 4.980318981695087e-06, + "loss": 0.6807, + "step": 1125 + }, + { + "epoch": 0.26238931179120706, + "grad_norm": 0.32153618335723877, + "learning_rate": 4.98028061156413e-06, + "loss": 0.6583, + "step": 1126 + }, + { + "epoch": 0.2626223395991922, + "grad_norm": 0.325228214263916, + "learning_rate": 4.980242204214465e-06, + "loss": 0.676, + "step": 1127 + }, + { + "epoch": 0.26285536740717724, + "grad_norm": 0.32421669363975525, + "learning_rate": 4.980203759646668e-06, + "loss": 0.6774, + "step": 1128 + }, + { + "epoch": 0.26308839521516236, + "grad_norm": 0.3236892521381378, + "learning_rate": 4.980165277861318e-06, + "loss": 0.6637, + "step": 1129 + }, + { + "epoch": 0.2633214230231474, + "grad_norm": 0.32830357551574707, + "learning_rate": 4.98012675885899e-06, + "loss": 0.6796, + "step": 1130 + }, + { + "epoch": 0.2635544508311325, + "grad_norm": 0.3153664469718933, + "learning_rate": 4.980088202640264e-06, + "loss": 0.6617, + "step": 1131 + }, + { + "epoch": 0.2637874786391176, + "grad_norm": 0.34275203943252563, + "learning_rate": 4.9800496092057185e-06, + "loss": 0.6994, + "step": 1132 + }, + { + "epoch": 0.2640205064471027, + "grad_norm": 0.3406108319759369, + "learning_rate": 4.98001097855593e-06, + "loss": 0.7067, + "step": 1133 + }, + { + "epoch": 0.2642535342550878, + "grad_norm": 0.32589617371559143, + "learning_rate": 4.979972310691482e-06, + "loss": 0.6851, + "step": 1134 + }, + { + "epoch": 0.26448656206307286, + "grad_norm": 0.33518075942993164, + "learning_rate": 4.979933605612951e-06, + "loss": 0.7133, + "step": 1135 + }, + { + "epoch": 0.2647195898710579, + "grad_norm": 0.31793200969696045, + "learning_rate": 4.979894863320921e-06, + "loss": 0.6861, + "step": 1136 + }, + { + "epoch": 0.26495261767904305, + "grad_norm": 0.32094424962997437, + "learning_rate": 4.979856083815972e-06, + "loss": 0.6892, + "step": 1137 + }, + { + "epoch": 0.2651856454870281, + "grad_norm": 0.3373265564441681, + "learning_rate": 4.979817267098685e-06, + "loss": 0.7065, + "step": 1138 + }, + { + "epoch": 0.26541867329501323, + "grad_norm": 0.3219834864139557, + "learning_rate": 4.9797784131696434e-06, + "loss": 0.677, + "step": 1139 + }, + { + "epoch": 0.2656517011029983, + "grad_norm": 0.33354637026786804, + "learning_rate": 4.97973952202943e-06, + "loss": 0.7, + "step": 1140 + }, + { + "epoch": 0.26588472891098336, + "grad_norm": 0.3264922797679901, + "learning_rate": 4.979700593678629e-06, + "loss": 0.7088, + "step": 1141 + }, + { + "epoch": 0.2661177567189685, + "grad_norm": 0.30343928933143616, + "learning_rate": 4.979661628117825e-06, + "loss": 0.6586, + "step": 1142 + }, + { + "epoch": 0.26635078452695354, + "grad_norm": 0.3384201228618622, + "learning_rate": 4.9796226253476e-06, + "loss": 0.6742, + "step": 1143 + }, + { + "epoch": 0.26658381233493866, + "grad_norm": 0.3338489234447479, + "learning_rate": 4.979583585368543e-06, + "loss": 0.6766, + "step": 1144 + }, + { + "epoch": 0.26681684014292373, + "grad_norm": 0.34501194953918457, + "learning_rate": 4.979544508181237e-06, + "loss": 0.7057, + "step": 1145 + }, + { + "epoch": 0.2670498679509088, + "grad_norm": 0.34783637523651123, + "learning_rate": 4.979505393786269e-06, + "loss": 0.6659, + "step": 1146 + }, + { + "epoch": 0.2672828957588939, + "grad_norm": 0.3296046853065491, + "learning_rate": 4.979466242184227e-06, + "loss": 0.6811, + "step": 1147 + }, + { + "epoch": 0.267515923566879, + "grad_norm": 0.32453563809394836, + "learning_rate": 4.979427053375697e-06, + "loss": 0.6526, + "step": 1148 + }, + { + "epoch": 0.26774895137486404, + "grad_norm": 0.3431033194065094, + "learning_rate": 4.979387827361269e-06, + "loss": 0.6971, + "step": 1149 + }, + { + "epoch": 0.26798197918284916, + "grad_norm": 0.3361447751522064, + "learning_rate": 4.979348564141529e-06, + "loss": 0.6631, + "step": 1150 + }, + { + "epoch": 0.2682150069908342, + "grad_norm": 0.32666468620300293, + "learning_rate": 4.9793092637170695e-06, + "loss": 0.7038, + "step": 1151 + }, + { + "epoch": 0.26844803479881935, + "grad_norm": 0.32629555463790894, + "learning_rate": 4.979269926088477e-06, + "loss": 0.6951, + "step": 1152 + }, + { + "epoch": 0.2686810626068044, + "grad_norm": 0.33472225069999695, + "learning_rate": 4.979230551256344e-06, + "loss": 0.7206, + "step": 1153 + }, + { + "epoch": 0.2689140904147895, + "grad_norm": 0.35302734375, + "learning_rate": 4.97919113922126e-06, + "loss": 0.6635, + "step": 1154 + }, + { + "epoch": 0.2691471182227746, + "grad_norm": 0.32378703355789185, + "learning_rate": 4.979151689983817e-06, + "loss": 0.6727, + "step": 1155 + }, + { + "epoch": 0.26938014603075966, + "grad_norm": 0.3213959038257599, + "learning_rate": 4.979112203544607e-06, + "loss": 0.681, + "step": 1156 + }, + { + "epoch": 0.2696131738387448, + "grad_norm": 0.32677143812179565, + "learning_rate": 4.979072679904222e-06, + "loss": 0.6913, + "step": 1157 + }, + { + "epoch": 0.26984620164672984, + "grad_norm": 0.327044278383255, + "learning_rate": 4.979033119063257e-06, + "loss": 0.7015, + "step": 1158 + }, + { + "epoch": 0.2700792294547149, + "grad_norm": 0.34587791562080383, + "learning_rate": 4.978993521022303e-06, + "loss": 0.6828, + "step": 1159 + }, + { + "epoch": 0.27031225726270003, + "grad_norm": 0.3155994117259979, + "learning_rate": 4.978953885781955e-06, + "loss": 0.6763, + "step": 1160 + }, + { + "epoch": 0.2705452850706851, + "grad_norm": 0.3262394666671753, + "learning_rate": 4.978914213342808e-06, + "loss": 0.6755, + "step": 1161 + }, + { + "epoch": 0.2707783128786702, + "grad_norm": 0.329606831073761, + "learning_rate": 4.978874503705458e-06, + "loss": 0.6833, + "step": 1162 + }, + { + "epoch": 0.2710113406866553, + "grad_norm": 0.3400686979293823, + "learning_rate": 4.978834756870501e-06, + "loss": 0.6926, + "step": 1163 + }, + { + "epoch": 0.27124436849464034, + "grad_norm": 0.32114097476005554, + "learning_rate": 4.978794972838532e-06, + "loss": 0.6905, + "step": 1164 + }, + { + "epoch": 0.27147739630262546, + "grad_norm": 0.3182801604270935, + "learning_rate": 4.978755151610149e-06, + "loss": 0.6933, + "step": 1165 + }, + { + "epoch": 0.2717104241106105, + "grad_norm": 0.3098074495792389, + "learning_rate": 4.97871529318595e-06, + "loss": 0.6769, + "step": 1166 + }, + { + "epoch": 0.27194345191859565, + "grad_norm": 0.33235201239585876, + "learning_rate": 4.978675397566531e-06, + "loss": 0.6777, + "step": 1167 + }, + { + "epoch": 0.2721764797265807, + "grad_norm": 0.32080531120300293, + "learning_rate": 4.978635464752492e-06, + "loss": 0.6638, + "step": 1168 + }, + { + "epoch": 0.2724095075345658, + "grad_norm": 0.34010323882102966, + "learning_rate": 4.9785954947444336e-06, + "loss": 0.6762, + "step": 1169 + }, + { + "epoch": 0.2726425353425509, + "grad_norm": 0.33953315019607544, + "learning_rate": 4.978555487542954e-06, + "loss": 0.6732, + "step": 1170 + }, + { + "epoch": 0.27287556315053596, + "grad_norm": 0.31337615847587585, + "learning_rate": 4.9785154431486535e-06, + "loss": 0.6625, + "step": 1171 + }, + { + "epoch": 0.273108590958521, + "grad_norm": 0.31656894087791443, + "learning_rate": 4.978475361562133e-06, + "loss": 0.6888, + "step": 1172 + }, + { + "epoch": 0.27334161876650614, + "grad_norm": 0.3215431571006775, + "learning_rate": 4.978435242783995e-06, + "loss": 0.657, + "step": 1173 + }, + { + "epoch": 0.2735746465744912, + "grad_norm": 0.3264496624469757, + "learning_rate": 4.97839508681484e-06, + "loss": 0.664, + "step": 1174 + }, + { + "epoch": 0.27380767438247633, + "grad_norm": 0.3303684592247009, + "learning_rate": 4.9783548936552725e-06, + "loss": 0.6929, + "step": 1175 + }, + { + "epoch": 0.2740407021904614, + "grad_norm": 0.3294624090194702, + "learning_rate": 4.978314663305893e-06, + "loss": 0.6799, + "step": 1176 + }, + { + "epoch": 0.27427372999844646, + "grad_norm": 0.33381983637809753, + "learning_rate": 4.978274395767308e-06, + "loss": 0.6694, + "step": 1177 + }, + { + "epoch": 0.2745067578064316, + "grad_norm": 0.3268805742263794, + "learning_rate": 4.97823409104012e-06, + "loss": 0.6709, + "step": 1178 + }, + { + "epoch": 0.27473978561441664, + "grad_norm": 0.31954631209373474, + "learning_rate": 4.978193749124934e-06, + "loss": 0.7034, + "step": 1179 + }, + { + "epoch": 0.27497281342240176, + "grad_norm": 0.33598634600639343, + "learning_rate": 4.978153370022356e-06, + "loss": 0.6897, + "step": 1180 + }, + { + "epoch": 0.2752058412303868, + "grad_norm": 0.33294931054115295, + "learning_rate": 4.978112953732992e-06, + "loss": 0.6812, + "step": 1181 + }, + { + "epoch": 0.2754388690383719, + "grad_norm": 0.3250420093536377, + "learning_rate": 4.978072500257447e-06, + "loss": 0.6741, + "step": 1182 + }, + { + "epoch": 0.275671896846357, + "grad_norm": 0.3255693018436432, + "learning_rate": 4.97803200959633e-06, + "loss": 0.6859, + "step": 1183 + }, + { + "epoch": 0.2759049246543421, + "grad_norm": 0.3343731462955475, + "learning_rate": 4.977991481750247e-06, + "loss": 0.6629, + "step": 1184 + }, + { + "epoch": 0.2761379524623272, + "grad_norm": 0.3359210193157196, + "learning_rate": 4.977950916719807e-06, + "loss": 0.7201, + "step": 1185 + }, + { + "epoch": 0.27637098027031226, + "grad_norm": 0.3182503283023834, + "learning_rate": 4.977910314505619e-06, + "loss": 0.6366, + "step": 1186 + }, + { + "epoch": 0.2766040080782973, + "grad_norm": 0.3269363343715668, + "learning_rate": 4.977869675108291e-06, + "loss": 0.6941, + "step": 1187 + }, + { + "epoch": 0.27683703588628245, + "grad_norm": 0.3201512098312378, + "learning_rate": 4.977828998528434e-06, + "loss": 0.6859, + "step": 1188 + }, + { + "epoch": 0.2770700636942675, + "grad_norm": 0.3153619170188904, + "learning_rate": 4.977788284766658e-06, + "loss": 0.6536, + "step": 1189 + }, + { + "epoch": 0.2773030915022526, + "grad_norm": 0.32349175214767456, + "learning_rate": 4.977747533823574e-06, + "loss": 0.6877, + "step": 1190 + }, + { + "epoch": 0.2775361193102377, + "grad_norm": 0.32620033621788025, + "learning_rate": 4.977706745699793e-06, + "loss": 0.6881, + "step": 1191 + }, + { + "epoch": 0.27776914711822276, + "grad_norm": 0.3228050470352173, + "learning_rate": 4.9776659203959284e-06, + "loss": 0.661, + "step": 1192 + }, + { + "epoch": 0.2780021749262079, + "grad_norm": 0.32657092809677124, + "learning_rate": 4.977625057912591e-06, + "loss": 0.6892, + "step": 1193 + }, + { + "epoch": 0.27823520273419294, + "grad_norm": 0.32486894726753235, + "learning_rate": 4.9775841582503956e-06, + "loss": 0.6862, + "step": 1194 + }, + { + "epoch": 0.278468230542178, + "grad_norm": 0.33233556151390076, + "learning_rate": 4.977543221409955e-06, + "loss": 0.6862, + "step": 1195 + }, + { + "epoch": 0.27870125835016313, + "grad_norm": 0.32121655344963074, + "learning_rate": 4.977502247391884e-06, + "loss": 0.6639, + "step": 1196 + }, + { + "epoch": 0.2789342861581482, + "grad_norm": 0.3228181004524231, + "learning_rate": 4.977461236196797e-06, + "loss": 0.6563, + "step": 1197 + }, + { + "epoch": 0.2791673139661333, + "grad_norm": 0.3242126703262329, + "learning_rate": 4.9774201878253096e-06, + "loss": 0.674, + "step": 1198 + }, + { + "epoch": 0.2794003417741184, + "grad_norm": 0.3366967737674713, + "learning_rate": 4.9773791022780375e-06, + "loss": 0.6818, + "step": 1199 + }, + { + "epoch": 0.27963336958210344, + "grad_norm": 0.33272233605384827, + "learning_rate": 4.977337979555597e-06, + "loss": 0.6512, + "step": 1200 + }, + { + "epoch": 0.27986639739008856, + "grad_norm": 0.3313521146774292, + "learning_rate": 4.977296819658607e-06, + "loss": 0.6807, + "step": 1201 + }, + { + "epoch": 0.2800994251980736, + "grad_norm": 0.31650835275650024, + "learning_rate": 4.977255622587683e-06, + "loss": 0.6687, + "step": 1202 + }, + { + "epoch": 0.28033245300605875, + "grad_norm": 0.33946067094802856, + "learning_rate": 4.977214388343444e-06, + "loss": 0.6786, + "step": 1203 + }, + { + "epoch": 0.2805654808140438, + "grad_norm": 0.3237524926662445, + "learning_rate": 4.97717311692651e-06, + "loss": 0.6663, + "step": 1204 + }, + { + "epoch": 0.2807985086220289, + "grad_norm": 0.32818326354026794, + "learning_rate": 4.977131808337497e-06, + "loss": 0.6831, + "step": 1205 + }, + { + "epoch": 0.281031536430014, + "grad_norm": 0.3310922384262085, + "learning_rate": 4.977090462577029e-06, + "loss": 0.6723, + "step": 1206 + }, + { + "epoch": 0.28126456423799906, + "grad_norm": 0.3216572403907776, + "learning_rate": 4.977049079645723e-06, + "loss": 0.6771, + "step": 1207 + }, + { + "epoch": 0.2814975920459842, + "grad_norm": 0.3296744227409363, + "learning_rate": 4.977007659544202e-06, + "loss": 0.7123, + "step": 1208 + }, + { + "epoch": 0.28173061985396924, + "grad_norm": 0.3326682150363922, + "learning_rate": 4.976966202273087e-06, + "loss": 0.705, + "step": 1209 + }, + { + "epoch": 0.2819636476619543, + "grad_norm": 0.3582572638988495, + "learning_rate": 4.976924707832999e-06, + "loss": 0.658, + "step": 1210 + }, + { + "epoch": 0.28219667546993943, + "grad_norm": 0.33549171686172485, + "learning_rate": 4.9768831762245625e-06, + "loss": 0.6445, + "step": 1211 + }, + { + "epoch": 0.2824297032779245, + "grad_norm": 0.3363671600818634, + "learning_rate": 4.976841607448399e-06, + "loss": 0.6694, + "step": 1212 + }, + { + "epoch": 0.28266273108590956, + "grad_norm": 0.3305470049381256, + "learning_rate": 4.976800001505135e-06, + "loss": 0.6748, + "step": 1213 + }, + { + "epoch": 0.2828957588938947, + "grad_norm": 0.333059698343277, + "learning_rate": 4.976758358395392e-06, + "loss": 0.6938, + "step": 1214 + }, + { + "epoch": 0.28312878670187974, + "grad_norm": 0.3261781930923462, + "learning_rate": 4.976716678119795e-06, + "loss": 0.6871, + "step": 1215 + }, + { + "epoch": 0.28336181450986486, + "grad_norm": 0.33105453848838806, + "learning_rate": 4.976674960678971e-06, + "loss": 0.6645, + "step": 1216 + }, + { + "epoch": 0.2835948423178499, + "grad_norm": 0.3435531258583069, + "learning_rate": 4.976633206073546e-06, + "loss": 0.7017, + "step": 1217 + }, + { + "epoch": 0.283827870125835, + "grad_norm": 0.32856670022010803, + "learning_rate": 4.9765914143041444e-06, + "loss": 0.6549, + "step": 1218 + }, + { + "epoch": 0.2840608979338201, + "grad_norm": 0.3154817223548889, + "learning_rate": 4.9765495853713955e-06, + "loss": 0.6836, + "step": 1219 + }, + { + "epoch": 0.2842939257418052, + "grad_norm": 0.338883638381958, + "learning_rate": 4.976507719275927e-06, + "loss": 0.6928, + "step": 1220 + }, + { + "epoch": 0.2845269535497903, + "grad_norm": 0.3774440586566925, + "learning_rate": 4.976465816018366e-06, + "loss": 0.6384, + "step": 1221 + }, + { + "epoch": 0.28475998135777536, + "grad_norm": 0.31949707865715027, + "learning_rate": 4.976423875599341e-06, + "loss": 0.6726, + "step": 1222 + }, + { + "epoch": 0.2849930091657604, + "grad_norm": 0.3362560272216797, + "learning_rate": 4.976381898019483e-06, + "loss": 0.7068, + "step": 1223 + }, + { + "epoch": 0.28522603697374554, + "grad_norm": 0.3268083333969116, + "learning_rate": 4.976339883279421e-06, + "loss": 0.7035, + "step": 1224 + }, + { + "epoch": 0.2854590647817306, + "grad_norm": 0.364164263010025, + "learning_rate": 4.9762978313797846e-06, + "loss": 0.6936, + "step": 1225 + }, + { + "epoch": 0.28569209258971573, + "grad_norm": 0.33498162031173706, + "learning_rate": 4.976255742321206e-06, + "loss": 0.654, + "step": 1226 + }, + { + "epoch": 0.2859251203977008, + "grad_norm": 0.333462119102478, + "learning_rate": 4.9762136161043165e-06, + "loss": 0.702, + "step": 1227 + }, + { + "epoch": 0.28615814820568586, + "grad_norm": 0.33537304401397705, + "learning_rate": 4.976171452729749e-06, + "loss": 0.6985, + "step": 1228 + }, + { + "epoch": 0.286391176013671, + "grad_norm": 0.33682459592819214, + "learning_rate": 4.976129252198134e-06, + "loss": 0.6887, + "step": 1229 + }, + { + "epoch": 0.28662420382165604, + "grad_norm": 0.35498911142349243, + "learning_rate": 4.976087014510107e-06, + "loss": 0.7015, + "step": 1230 + }, + { + "epoch": 0.28685723162964116, + "grad_norm": 0.3236118257045746, + "learning_rate": 4.976044739666301e-06, + "loss": 0.6661, + "step": 1231 + }, + { + "epoch": 0.2870902594376262, + "grad_norm": 0.3204387426376343, + "learning_rate": 4.97600242766735e-06, + "loss": 0.7015, + "step": 1232 + }, + { + "epoch": 0.2873232872456113, + "grad_norm": 0.32678601145744324, + "learning_rate": 4.9759600785138895e-06, + "loss": 0.7094, + "step": 1233 + }, + { + "epoch": 0.2875563150535964, + "grad_norm": 0.36387044191360474, + "learning_rate": 4.975917692206554e-06, + "loss": 0.6836, + "step": 1234 + }, + { + "epoch": 0.2877893428615815, + "grad_norm": 0.3519711196422577, + "learning_rate": 4.975875268745981e-06, + "loss": 0.7121, + "step": 1235 + }, + { + "epoch": 0.28802237066956654, + "grad_norm": 0.32280227541923523, + "learning_rate": 4.975832808132807e-06, + "loss": 0.69, + "step": 1236 + }, + { + "epoch": 0.28825539847755166, + "grad_norm": 0.3270997405052185, + "learning_rate": 4.975790310367668e-06, + "loss": 0.6721, + "step": 1237 + }, + { + "epoch": 0.2884884262855367, + "grad_norm": 0.3828691244125366, + "learning_rate": 4.975747775451202e-06, + "loss": 0.6704, + "step": 1238 + }, + { + "epoch": 0.28872145409352185, + "grad_norm": 0.3381083309650421, + "learning_rate": 4.975705203384047e-06, + "loss": 0.6814, + "step": 1239 + }, + { + "epoch": 0.2889544819015069, + "grad_norm": 0.3429172933101654, + "learning_rate": 4.975662594166843e-06, + "loss": 0.6855, + "step": 1240 + }, + { + "epoch": 0.289187509709492, + "grad_norm": 0.34118086099624634, + "learning_rate": 4.975619947800228e-06, + "loss": 0.6689, + "step": 1241 + }, + { + "epoch": 0.2894205375174771, + "grad_norm": 0.3280944526195526, + "learning_rate": 4.9755772642848444e-06, + "loss": 0.6634, + "step": 1242 + }, + { + "epoch": 0.28965356532546216, + "grad_norm": 0.3340398371219635, + "learning_rate": 4.97553454362133e-06, + "loss": 0.6863, + "step": 1243 + }, + { + "epoch": 0.2898865931334473, + "grad_norm": 0.3207405209541321, + "learning_rate": 4.975491785810327e-06, + "loss": 0.6673, + "step": 1244 + }, + { + "epoch": 0.29011962094143234, + "grad_norm": 0.33891165256500244, + "learning_rate": 4.975448990852476e-06, + "loss": 0.7043, + "step": 1245 + }, + { + "epoch": 0.2903526487494174, + "grad_norm": 0.35398241877555847, + "learning_rate": 4.975406158748422e-06, + "loss": 0.7024, + "step": 1246 + }, + { + "epoch": 0.29058567655740253, + "grad_norm": 0.3343062102794647, + "learning_rate": 4.975363289498803e-06, + "loss": 0.7019, + "step": 1247 + }, + { + "epoch": 0.2908187043653876, + "grad_norm": 0.3250543773174286, + "learning_rate": 4.975320383104268e-06, + "loss": 0.7064, + "step": 1248 + }, + { + "epoch": 0.2910517321733727, + "grad_norm": 0.3200162351131439, + "learning_rate": 4.975277439565456e-06, + "loss": 0.6827, + "step": 1249 + }, + { + "epoch": 0.2912847599813578, + "grad_norm": 0.32905685901641846, + "learning_rate": 4.975234458883014e-06, + "loss": 0.6955, + "step": 1250 + }, + { + "epoch": 0.29151778778934284, + "grad_norm": 0.3250727355480194, + "learning_rate": 4.9751914410575855e-06, + "loss": 0.6846, + "step": 1251 + }, + { + "epoch": 0.29175081559732796, + "grad_norm": 0.32665541768074036, + "learning_rate": 4.975148386089818e-06, + "loss": 0.6599, + "step": 1252 + }, + { + "epoch": 0.291983843405313, + "grad_norm": 0.32674768567085266, + "learning_rate": 4.975105293980355e-06, + "loss": 0.6793, + "step": 1253 + }, + { + "epoch": 0.29221687121329815, + "grad_norm": 0.3193095028400421, + "learning_rate": 4.9750621647298455e-06, + "loss": 0.6859, + "step": 1254 + }, + { + "epoch": 0.2924498990212832, + "grad_norm": 0.33316028118133545, + "learning_rate": 4.9750189983389355e-06, + "loss": 0.678, + "step": 1255 + }, + { + "epoch": 0.2926829268292683, + "grad_norm": 0.32702332735061646, + "learning_rate": 4.974975794808273e-06, + "loss": 0.673, + "step": 1256 + }, + { + "epoch": 0.2929159546372534, + "grad_norm": 0.3176928460597992, + "learning_rate": 4.9749325541385065e-06, + "loss": 0.6589, + "step": 1257 + }, + { + "epoch": 0.29314898244523846, + "grad_norm": 0.33263736963272095, + "learning_rate": 4.974889276330284e-06, + "loss": 0.691, + "step": 1258 + }, + { + "epoch": 0.2933820102532235, + "grad_norm": 0.3262975513935089, + "learning_rate": 4.974845961384256e-06, + "loss": 0.6694, + "step": 1259 + }, + { + "epoch": 0.29361503806120864, + "grad_norm": 0.31891730427742004, + "learning_rate": 4.974802609301072e-06, + "loss": 0.6778, + "step": 1260 + }, + { + "epoch": 0.2938480658691937, + "grad_norm": 0.31289660930633545, + "learning_rate": 4.974759220081382e-06, + "loss": 0.6597, + "step": 1261 + }, + { + "epoch": 0.29408109367717883, + "grad_norm": 0.33140507340431213, + "learning_rate": 4.974715793725838e-06, + "loss": 0.6691, + "step": 1262 + }, + { + "epoch": 0.2943141214851639, + "grad_norm": 0.3315129578113556, + "learning_rate": 4.974672330235091e-06, + "loss": 0.6578, + "step": 1263 + }, + { + "epoch": 0.29454714929314896, + "grad_norm": 0.32520338892936707, + "learning_rate": 4.9746288296097935e-06, + "loss": 0.6563, + "step": 1264 + }, + { + "epoch": 0.2947801771011341, + "grad_norm": 0.3262653350830078, + "learning_rate": 4.974585291850599e-06, + "loss": 0.6945, + "step": 1265 + }, + { + "epoch": 0.29501320490911914, + "grad_norm": 0.320806622505188, + "learning_rate": 4.974541716958159e-06, + "loss": 0.6673, + "step": 1266 + }, + { + "epoch": 0.29524623271710426, + "grad_norm": 0.3515656292438507, + "learning_rate": 4.97449810493313e-06, + "loss": 0.6969, + "step": 1267 + }, + { + "epoch": 0.2954792605250893, + "grad_norm": 0.3243502974510193, + "learning_rate": 4.9744544557761634e-06, + "loss": 0.647, + "step": 1268 + }, + { + "epoch": 0.2957122883330744, + "grad_norm": 0.339293509721756, + "learning_rate": 4.974410769487916e-06, + "loss": 0.6831, + "step": 1269 + }, + { + "epoch": 0.2959453161410595, + "grad_norm": 0.3240710198879242, + "learning_rate": 4.974367046069044e-06, + "loss": 0.6798, + "step": 1270 + }, + { + "epoch": 0.2961783439490446, + "grad_norm": 0.3185398280620575, + "learning_rate": 4.9743232855202015e-06, + "loss": 0.6921, + "step": 1271 + }, + { + "epoch": 0.2964113717570297, + "grad_norm": 0.3241291642189026, + "learning_rate": 4.974279487842046e-06, + "loss": 0.6838, + "step": 1272 + }, + { + "epoch": 0.29664439956501476, + "grad_norm": 0.33373790979385376, + "learning_rate": 4.974235653035235e-06, + "loss": 0.698, + "step": 1273 + }, + { + "epoch": 0.2968774273729998, + "grad_norm": 0.33697187900543213, + "learning_rate": 4.974191781100427e-06, + "loss": 0.6861, + "step": 1274 + }, + { + "epoch": 0.29711045518098494, + "grad_norm": 0.32973381876945496, + "learning_rate": 4.9741478720382795e-06, + "loss": 0.6616, + "step": 1275 + }, + { + "epoch": 0.29734348298897, + "grad_norm": 0.33947205543518066, + "learning_rate": 4.974103925849451e-06, + "loss": 0.6687, + "step": 1276 + }, + { + "epoch": 0.29757651079695513, + "grad_norm": 0.3271055817604065, + "learning_rate": 4.9740599425346016e-06, + "loss": 0.6615, + "step": 1277 + }, + { + "epoch": 0.2978095386049402, + "grad_norm": 0.3290695250034332, + "learning_rate": 4.974015922094391e-06, + "loss": 0.7102, + "step": 1278 + }, + { + "epoch": 0.29804256641292526, + "grad_norm": 0.3332037925720215, + "learning_rate": 4.97397186452948e-06, + "loss": 0.6607, + "step": 1279 + }, + { + "epoch": 0.2982755942209104, + "grad_norm": 0.3230850100517273, + "learning_rate": 4.973927769840529e-06, + "loss": 0.6831, + "step": 1280 + }, + { + "epoch": 0.29850862202889544, + "grad_norm": 0.32722553610801697, + "learning_rate": 4.9738836380282e-06, + "loss": 0.6913, + "step": 1281 + }, + { + "epoch": 0.2987416498368805, + "grad_norm": 0.3400437533855438, + "learning_rate": 4.973839469093157e-06, + "loss": 0.66, + "step": 1282 + }, + { + "epoch": 0.2989746776448656, + "grad_norm": 0.3300645649433136, + "learning_rate": 4.97379526303606e-06, + "loss": 0.6846, + "step": 1283 + }, + { + "epoch": 0.2992077054528507, + "grad_norm": 0.3195590376853943, + "learning_rate": 4.973751019857574e-06, + "loss": 0.6916, + "step": 1284 + }, + { + "epoch": 0.2994407332608358, + "grad_norm": 0.32309019565582275, + "learning_rate": 4.973706739558363e-06, + "loss": 0.6877, + "step": 1285 + }, + { + "epoch": 0.2996737610688209, + "grad_norm": 0.3287172317504883, + "learning_rate": 4.97366242213909e-06, + "loss": 0.6876, + "step": 1286 + }, + { + "epoch": 0.29990678887680594, + "grad_norm": 0.3174317181110382, + "learning_rate": 4.973618067600422e-06, + "loss": 0.6512, + "step": 1287 + }, + { + "epoch": 0.30013981668479106, + "grad_norm": 0.33624881505966187, + "learning_rate": 4.973573675943023e-06, + "loss": 0.6689, + "step": 1288 + }, + { + "epoch": 0.3003728444927761, + "grad_norm": 0.3421054184436798, + "learning_rate": 4.97352924716756e-06, + "loss": 0.6369, + "step": 1289 + }, + { + "epoch": 0.30060587230076125, + "grad_norm": 0.3369331657886505, + "learning_rate": 4.973484781274699e-06, + "loss": 0.7103, + "step": 1290 + }, + { + "epoch": 0.3008389001087463, + "grad_norm": 0.3262912631034851, + "learning_rate": 4.973440278265108e-06, + "loss": 0.6854, + "step": 1291 + }, + { + "epoch": 0.3010719279167314, + "grad_norm": 0.333082377910614, + "learning_rate": 4.973395738139455e-06, + "loss": 0.6717, + "step": 1292 + }, + { + "epoch": 0.3013049557247165, + "grad_norm": 0.3398587703704834, + "learning_rate": 4.973351160898407e-06, + "loss": 0.6579, + "step": 1293 + }, + { + "epoch": 0.30153798353270156, + "grad_norm": 0.3400275707244873, + "learning_rate": 4.973306546542634e-06, + "loss": 0.6636, + "step": 1294 + }, + { + "epoch": 0.3017710113406867, + "grad_norm": 0.3500739634037018, + "learning_rate": 4.973261895072805e-06, + "loss": 0.6585, + "step": 1295 + }, + { + "epoch": 0.30200403914867174, + "grad_norm": 0.3260088562965393, + "learning_rate": 4.973217206489591e-06, + "loss": 0.6816, + "step": 1296 + }, + { + "epoch": 0.3022370669566568, + "grad_norm": 0.34469497203826904, + "learning_rate": 4.973172480793661e-06, + "loss": 0.692, + "step": 1297 + }, + { + "epoch": 0.30247009476464193, + "grad_norm": 0.31993481516838074, + "learning_rate": 4.9731277179856875e-06, + "loss": 0.666, + "step": 1298 + }, + { + "epoch": 0.302703122572627, + "grad_norm": 0.3278704583644867, + "learning_rate": 4.973082918066341e-06, + "loss": 0.6502, + "step": 1299 + }, + { + "epoch": 0.3029361503806121, + "grad_norm": 0.32976511120796204, + "learning_rate": 4.973038081036295e-06, + "loss": 0.6716, + "step": 1300 + }, + { + "epoch": 0.3031691781885972, + "grad_norm": 0.33230850100517273, + "learning_rate": 4.9729932068962225e-06, + "loss": 0.6478, + "step": 1301 + }, + { + "epoch": 0.30340220599658224, + "grad_norm": 0.31722718477249146, + "learning_rate": 4.972948295646796e-06, + "loss": 0.658, + "step": 1302 + }, + { + "epoch": 0.30363523380456736, + "grad_norm": 0.34632936120033264, + "learning_rate": 4.972903347288688e-06, + "loss": 0.6877, + "step": 1303 + }, + { + "epoch": 0.3038682616125524, + "grad_norm": 0.3193000257015228, + "learning_rate": 4.972858361822576e-06, + "loss": 0.681, + "step": 1304 + }, + { + "epoch": 0.3041012894205375, + "grad_norm": 0.3409803509712219, + "learning_rate": 4.972813339249134e-06, + "loss": 0.6829, + "step": 1305 + }, + { + "epoch": 0.3043343172285226, + "grad_norm": 0.3246142864227295, + "learning_rate": 4.972768279569036e-06, + "loss": 0.6726, + "step": 1306 + }, + { + "epoch": 0.3045673450365077, + "grad_norm": 0.3239786922931671, + "learning_rate": 4.9727231827829594e-06, + "loss": 0.672, + "step": 1307 + }, + { + "epoch": 0.3048003728444928, + "grad_norm": 0.35233545303344727, + "learning_rate": 4.972678048891582e-06, + "loss": 0.7037, + "step": 1308 + }, + { + "epoch": 0.30503340065247786, + "grad_norm": 0.3429258167743683, + "learning_rate": 4.972632877895579e-06, + "loss": 0.6967, + "step": 1309 + }, + { + "epoch": 0.3052664284604629, + "grad_norm": 0.3299075663089752, + "learning_rate": 4.97258766979563e-06, + "loss": 0.658, + "step": 1310 + }, + { + "epoch": 0.30549945626844804, + "grad_norm": 0.3192426860332489, + "learning_rate": 4.9725424245924114e-06, + "loss": 0.6884, + "step": 1311 + }, + { + "epoch": 0.3057324840764331, + "grad_norm": 0.32650989294052124, + "learning_rate": 4.972497142286604e-06, + "loss": 0.6631, + "step": 1312 + }, + { + "epoch": 0.30596551188441823, + "grad_norm": 0.3478202223777771, + "learning_rate": 4.972451822878887e-06, + "loss": 0.6741, + "step": 1313 + }, + { + "epoch": 0.3061985396924033, + "grad_norm": 0.32752013206481934, + "learning_rate": 4.97240646636994e-06, + "loss": 0.6438, + "step": 1314 + }, + { + "epoch": 0.30643156750038836, + "grad_norm": 0.33994001150131226, + "learning_rate": 4.972361072760444e-06, + "loss": 0.704, + "step": 1315 + }, + { + "epoch": 0.3066645953083735, + "grad_norm": 0.32744064927101135, + "learning_rate": 4.972315642051079e-06, + "loss": 0.6549, + "step": 1316 + }, + { + "epoch": 0.30689762311635854, + "grad_norm": 0.3362392783164978, + "learning_rate": 4.972270174242527e-06, + "loss": 0.6909, + "step": 1317 + }, + { + "epoch": 0.30713065092434366, + "grad_norm": 0.3341652750968933, + "learning_rate": 4.9722246693354725e-06, + "loss": 0.6858, + "step": 1318 + }, + { + "epoch": 0.3073636787323287, + "grad_norm": 0.3378065228462219, + "learning_rate": 4.972179127330596e-06, + "loss": 0.6953, + "step": 1319 + }, + { + "epoch": 0.3075967065403138, + "grad_norm": 0.3364918828010559, + "learning_rate": 4.9721335482285815e-06, + "loss": 0.6604, + "step": 1320 + }, + { + "epoch": 0.3078297343482989, + "grad_norm": 0.3280162811279297, + "learning_rate": 4.972087932030113e-06, + "loss": 0.6509, + "step": 1321 + }, + { + "epoch": 0.308062762156284, + "grad_norm": 0.31110915541648865, + "learning_rate": 4.972042278735875e-06, + "loss": 0.6488, + "step": 1322 + }, + { + "epoch": 0.3082957899642691, + "grad_norm": 0.34272050857543945, + "learning_rate": 4.971996588346553e-06, + "loss": 0.696, + "step": 1323 + }, + { + "epoch": 0.30852881777225416, + "grad_norm": 0.32411742210388184, + "learning_rate": 4.971950860862833e-06, + "loss": 0.6927, + "step": 1324 + }, + { + "epoch": 0.3087618455802392, + "grad_norm": 0.34221556782722473, + "learning_rate": 4.971905096285399e-06, + "loss": 0.6693, + "step": 1325 + }, + { + "epoch": 0.30899487338822434, + "grad_norm": 0.3373861014842987, + "learning_rate": 4.97185929461494e-06, + "loss": 0.6877, + "step": 1326 + }, + { + "epoch": 0.3092279011962094, + "grad_norm": 0.3335096836090088, + "learning_rate": 4.971813455852142e-06, + "loss": 0.6784, + "step": 1327 + }, + { + "epoch": 0.3094609290041945, + "grad_norm": 0.33188334107398987, + "learning_rate": 4.971767579997694e-06, + "loss": 0.687, + "step": 1328 + }, + { + "epoch": 0.3096939568121796, + "grad_norm": 0.33043885231018066, + "learning_rate": 4.971721667052282e-06, + "loss": 0.6746, + "step": 1329 + }, + { + "epoch": 0.30992698462016466, + "grad_norm": 0.31568336486816406, + "learning_rate": 4.9716757170165985e-06, + "loss": 0.6589, + "step": 1330 + }, + { + "epoch": 0.3101600124281498, + "grad_norm": 0.33193090558052063, + "learning_rate": 4.971629729891331e-06, + "loss": 0.6828, + "step": 1331 + }, + { + "epoch": 0.31039304023613484, + "grad_norm": 0.3316151201725006, + "learning_rate": 4.9715837056771694e-06, + "loss": 0.6922, + "step": 1332 + }, + { + "epoch": 0.3106260680441199, + "grad_norm": 0.33882445096969604, + "learning_rate": 4.971537644374805e-06, + "loss": 0.6677, + "step": 1333 + }, + { + "epoch": 0.310859095852105, + "grad_norm": 0.34290367364883423, + "learning_rate": 4.971491545984928e-06, + "loss": 0.7158, + "step": 1334 + }, + { + "epoch": 0.3110921236600901, + "grad_norm": 0.31752586364746094, + "learning_rate": 4.971445410508231e-06, + "loss": 0.6609, + "step": 1335 + }, + { + "epoch": 0.3113251514680752, + "grad_norm": 0.32989704608917236, + "learning_rate": 4.9713992379454066e-06, + "loss": 0.696, + "step": 1336 + }, + { + "epoch": 0.3115581792760603, + "grad_norm": 0.31605765223503113, + "learning_rate": 4.9713530282971474e-06, + "loss": 0.6809, + "step": 1337 + }, + { + "epoch": 0.31179120708404534, + "grad_norm": 0.324421763420105, + "learning_rate": 4.971306781564146e-06, + "loss": 0.6543, + "step": 1338 + }, + { + "epoch": 0.31202423489203046, + "grad_norm": 0.3282513916492462, + "learning_rate": 4.971260497747098e-06, + "loss": 0.6655, + "step": 1339 + }, + { + "epoch": 0.3122572627000155, + "grad_norm": 0.3427858054637909, + "learning_rate": 4.9712141768466955e-06, + "loss": 0.6842, + "step": 1340 + }, + { + "epoch": 0.31249029050800065, + "grad_norm": 0.3279074728488922, + "learning_rate": 4.9711678188636355e-06, + "loss": 0.6792, + "step": 1341 + }, + { + "epoch": 0.3127233183159857, + "grad_norm": 0.3270131051540375, + "learning_rate": 4.971121423798614e-06, + "loss": 0.6828, + "step": 1342 + }, + { + "epoch": 0.3129563461239708, + "grad_norm": 0.31755512952804565, + "learning_rate": 4.971074991652326e-06, + "loss": 0.6894, + "step": 1343 + }, + { + "epoch": 0.3131893739319559, + "grad_norm": 0.31231531500816345, + "learning_rate": 4.9710285224254675e-06, + "loss": 0.6649, + "step": 1344 + }, + { + "epoch": 0.31342240173994096, + "grad_norm": 0.32646313309669495, + "learning_rate": 4.970982016118738e-06, + "loss": 0.6446, + "step": 1345 + }, + { + "epoch": 0.3136554295479261, + "grad_norm": 0.35008662939071655, + "learning_rate": 4.970935472732834e-06, + "loss": 0.6754, + "step": 1346 + }, + { + "epoch": 0.31388845735591114, + "grad_norm": 0.32250162959098816, + "learning_rate": 4.970888892268454e-06, + "loss": 0.6512, + "step": 1347 + }, + { + "epoch": 0.3141214851638962, + "grad_norm": 0.32305940985679626, + "learning_rate": 4.9708422747262975e-06, + "loss": 0.6745, + "step": 1348 + }, + { + "epoch": 0.31435451297188133, + "grad_norm": 0.35666224360466003, + "learning_rate": 4.970795620107064e-06, + "loss": 0.6693, + "step": 1349 + }, + { + "epoch": 0.3145875407798664, + "grad_norm": 0.3351663649082184, + "learning_rate": 4.970748928411453e-06, + "loss": 0.6709, + "step": 1350 + }, + { + "epoch": 0.31482056858785146, + "grad_norm": 0.33183228969573975, + "learning_rate": 4.970702199640165e-06, + "loss": 0.7028, + "step": 1351 + }, + { + "epoch": 0.3150535963958366, + "grad_norm": 0.33055007457733154, + "learning_rate": 4.970655433793902e-06, + "loss": 0.6871, + "step": 1352 + }, + { + "epoch": 0.31528662420382164, + "grad_norm": 0.34457311034202576, + "learning_rate": 4.970608630873366e-06, + "loss": 0.6968, + "step": 1353 + }, + { + "epoch": 0.31551965201180676, + "grad_norm": 0.320267915725708, + "learning_rate": 4.9705617908792575e-06, + "loss": 0.6667, + "step": 1354 + }, + { + "epoch": 0.3157526798197918, + "grad_norm": 0.32821550965309143, + "learning_rate": 4.970514913812281e-06, + "loss": 0.6754, + "step": 1355 + }, + { + "epoch": 0.3159857076277769, + "grad_norm": 0.33787867426872253, + "learning_rate": 4.97046799967314e-06, + "loss": 0.6708, + "step": 1356 + }, + { + "epoch": 0.316218735435762, + "grad_norm": 0.3337002694606781, + "learning_rate": 4.9704210484625384e-06, + "loss": 0.6645, + "step": 1357 + }, + { + "epoch": 0.3164517632437471, + "grad_norm": 0.3271748423576355, + "learning_rate": 4.97037406018118e-06, + "loss": 0.6635, + "step": 1358 + }, + { + "epoch": 0.3166847910517322, + "grad_norm": 0.3291759788990021, + "learning_rate": 4.97032703482977e-06, + "loss": 0.6766, + "step": 1359 + }, + { + "epoch": 0.31691781885971726, + "grad_norm": 0.33234846591949463, + "learning_rate": 4.970279972409014e-06, + "loss": 0.6715, + "step": 1360 + }, + { + "epoch": 0.3171508466677023, + "grad_norm": 0.34253254532814026, + "learning_rate": 4.970232872919619e-06, + "loss": 0.6465, + "step": 1361 + }, + { + "epoch": 0.31738387447568744, + "grad_norm": 0.3257928490638733, + "learning_rate": 4.970185736362291e-06, + "loss": 0.6559, + "step": 1362 + }, + { + "epoch": 0.3176169022836725, + "grad_norm": 0.3326355218887329, + "learning_rate": 4.970138562737738e-06, + "loss": 0.6677, + "step": 1363 + }, + { + "epoch": 0.31784993009165763, + "grad_norm": 0.33395668864250183, + "learning_rate": 4.970091352046667e-06, + "loss": 0.6846, + "step": 1364 + }, + { + "epoch": 0.3180829578996427, + "grad_norm": 0.3367978036403656, + "learning_rate": 4.9700441042897875e-06, + "loss": 0.6927, + "step": 1365 + }, + { + "epoch": 0.31831598570762776, + "grad_norm": 0.3197716772556305, + "learning_rate": 4.969996819467808e-06, + "loss": 0.6719, + "step": 1366 + }, + { + "epoch": 0.3185490135156129, + "grad_norm": 0.32522016763687134, + "learning_rate": 4.969949497581438e-06, + "loss": 0.7017, + "step": 1367 + }, + { + "epoch": 0.31878204132359794, + "grad_norm": 0.33278292417526245, + "learning_rate": 4.969902138631386e-06, + "loss": 0.6682, + "step": 1368 + }, + { + "epoch": 0.31901506913158306, + "grad_norm": 0.33482375741004944, + "learning_rate": 4.969854742618366e-06, + "loss": 0.6823, + "step": 1369 + }, + { + "epoch": 0.3192480969395681, + "grad_norm": 0.3382750153541565, + "learning_rate": 4.9698073095430874e-06, + "loss": 0.6826, + "step": 1370 + }, + { + "epoch": 0.3194811247475532, + "grad_norm": 0.34116655588150024, + "learning_rate": 4.9697598394062615e-06, + "loss": 0.6783, + "step": 1371 + }, + { + "epoch": 0.3197141525555383, + "grad_norm": 0.33539387583732605, + "learning_rate": 4.969712332208602e-06, + "loss": 0.6622, + "step": 1372 + }, + { + "epoch": 0.3199471803635234, + "grad_norm": 0.3345773220062256, + "learning_rate": 4.969664787950819e-06, + "loss": 0.6864, + "step": 1373 + }, + { + "epoch": 0.32018020817150844, + "grad_norm": 0.324045330286026, + "learning_rate": 4.96961720663363e-06, + "loss": 0.6682, + "step": 1374 + }, + { + "epoch": 0.32041323597949356, + "grad_norm": 0.3316758871078491, + "learning_rate": 4.969569588257746e-06, + "loss": 0.6303, + "step": 1375 + }, + { + "epoch": 0.3206462637874786, + "grad_norm": 0.33933529257774353, + "learning_rate": 4.969521932823883e-06, + "loss": 0.6745, + "step": 1376 + }, + { + "epoch": 0.32087929159546374, + "grad_norm": 0.31629621982574463, + "learning_rate": 4.9694742403327545e-06, + "loss": 0.6885, + "step": 1377 + }, + { + "epoch": 0.3211123194034488, + "grad_norm": 0.3404847979545593, + "learning_rate": 4.9694265107850785e-06, + "loss": 0.6899, + "step": 1378 + }, + { + "epoch": 0.3213453472114339, + "grad_norm": 0.3451565206050873, + "learning_rate": 4.9693787441815685e-06, + "loss": 0.6752, + "step": 1379 + }, + { + "epoch": 0.321578375019419, + "grad_norm": 0.33518263697624207, + "learning_rate": 4.969330940522944e-06, + "loss": 0.6685, + "step": 1380 + }, + { + "epoch": 0.32181140282740406, + "grad_norm": 0.33433669805526733, + "learning_rate": 4.969283099809921e-06, + "loss": 0.6759, + "step": 1381 + }, + { + "epoch": 0.3220444306353892, + "grad_norm": 0.3320857286453247, + "learning_rate": 4.969235222043216e-06, + "loss": 0.6481, + "step": 1382 + }, + { + "epoch": 0.32227745844337424, + "grad_norm": 0.33632907271385193, + "learning_rate": 4.96918730722355e-06, + "loss": 0.6718, + "step": 1383 + }, + { + "epoch": 0.3225104862513593, + "grad_norm": 0.3452127277851105, + "learning_rate": 4.969139355351641e-06, + "loss": 0.6296, + "step": 1384 + }, + { + "epoch": 0.3227435140593444, + "grad_norm": 0.33205586671829224, + "learning_rate": 4.969091366428208e-06, + "loss": 0.6484, + "step": 1385 + }, + { + "epoch": 0.3229765418673295, + "grad_norm": 0.33604803681373596, + "learning_rate": 4.969043340453972e-06, + "loss": 0.6774, + "step": 1386 + }, + { + "epoch": 0.3232095696753146, + "grad_norm": 0.35917821526527405, + "learning_rate": 4.968995277429653e-06, + "loss": 0.6765, + "step": 1387 + }, + { + "epoch": 0.3234425974832997, + "grad_norm": 0.34565892815589905, + "learning_rate": 4.968947177355973e-06, + "loss": 0.6929, + "step": 1388 + }, + { + "epoch": 0.32367562529128474, + "grad_norm": 0.32982537150382996, + "learning_rate": 4.968899040233652e-06, + "loss": 0.6924, + "step": 1389 + }, + { + "epoch": 0.32390865309926986, + "grad_norm": 0.36234843730926514, + "learning_rate": 4.968850866063415e-06, + "loss": 0.654, + "step": 1390 + }, + { + "epoch": 0.3241416809072549, + "grad_norm": 0.3566433787345886, + "learning_rate": 4.968802654845982e-06, + "loss": 0.6794, + "step": 1391 + }, + { + "epoch": 0.32437470871524005, + "grad_norm": 0.3353135585784912, + "learning_rate": 4.968754406582079e-06, + "loss": 0.6372, + "step": 1392 + }, + { + "epoch": 0.3246077365232251, + "grad_norm": 0.3528743386268616, + "learning_rate": 4.9687061212724296e-06, + "loss": 0.6418, + "step": 1393 + }, + { + "epoch": 0.3248407643312102, + "grad_norm": 0.34912416338920593, + "learning_rate": 4.968657798917757e-06, + "loss": 0.7001, + "step": 1394 + }, + { + "epoch": 0.3250737921391953, + "grad_norm": 0.33514919877052307, + "learning_rate": 4.968609439518786e-06, + "loss": 0.6865, + "step": 1395 + }, + { + "epoch": 0.32530681994718036, + "grad_norm": 0.33943676948547363, + "learning_rate": 4.968561043076244e-06, + "loss": 0.668, + "step": 1396 + }, + { + "epoch": 0.3255398477551654, + "grad_norm": 0.32568252086639404, + "learning_rate": 4.968512609590857e-06, + "loss": 0.6737, + "step": 1397 + }, + { + "epoch": 0.32577287556315054, + "grad_norm": 0.34173136949539185, + "learning_rate": 4.968464139063351e-06, + "loss": 0.6784, + "step": 1398 + }, + { + "epoch": 0.3260059033711356, + "grad_norm": 0.34435778856277466, + "learning_rate": 4.968415631494454e-06, + "loss": 0.6686, + "step": 1399 + }, + { + "epoch": 0.32623893117912073, + "grad_norm": 0.3178437054157257, + "learning_rate": 4.968367086884894e-06, + "loss": 0.6579, + "step": 1400 + }, + { + "epoch": 0.3264719589871058, + "grad_norm": 0.31875333189964294, + "learning_rate": 4.9683185052353964e-06, + "loss": 0.6746, + "step": 1401 + }, + { + "epoch": 0.32670498679509086, + "grad_norm": 0.33802083134651184, + "learning_rate": 4.968269886546695e-06, + "loss": 0.6529, + "step": 1402 + }, + { + "epoch": 0.326938014603076, + "grad_norm": 0.34064623713493347, + "learning_rate": 4.9682212308195165e-06, + "loss": 0.6622, + "step": 1403 + }, + { + "epoch": 0.32717104241106104, + "grad_norm": 0.3312382698059082, + "learning_rate": 4.968172538054592e-06, + "loss": 0.6494, + "step": 1404 + }, + { + "epoch": 0.32740407021904616, + "grad_norm": 0.32981160283088684, + "learning_rate": 4.968123808252652e-06, + "loss": 0.6653, + "step": 1405 + }, + { + "epoch": 0.3276370980270312, + "grad_norm": 0.34164994955062866, + "learning_rate": 4.968075041414427e-06, + "loss": 0.678, + "step": 1406 + }, + { + "epoch": 0.3278701258350163, + "grad_norm": 0.3467571437358856, + "learning_rate": 4.968026237540651e-06, + "loss": 0.6762, + "step": 1407 + }, + { + "epoch": 0.3281031536430014, + "grad_norm": 0.3421630859375, + "learning_rate": 4.967977396632053e-06, + "loss": 0.6968, + "step": 1408 + }, + { + "epoch": 0.3283361814509865, + "grad_norm": 0.3460288345813751, + "learning_rate": 4.967928518689368e-06, + "loss": 0.6614, + "step": 1409 + }, + { + "epoch": 0.3285692092589716, + "grad_norm": 0.33323007822036743, + "learning_rate": 4.967879603713329e-06, + "loss": 0.686, + "step": 1410 + }, + { + "epoch": 0.32880223706695666, + "grad_norm": 0.35468199849128723, + "learning_rate": 4.96783065170467e-06, + "loss": 0.672, + "step": 1411 + }, + { + "epoch": 0.3290352648749417, + "grad_norm": 0.33295881748199463, + "learning_rate": 4.967781662664127e-06, + "loss": 0.6718, + "step": 1412 + }, + { + "epoch": 0.32926829268292684, + "grad_norm": 0.3279052674770355, + "learning_rate": 4.967732636592433e-06, + "loss": 0.6743, + "step": 1413 + }, + { + "epoch": 0.3295013204909119, + "grad_norm": 0.33248627185821533, + "learning_rate": 4.967683573490324e-06, + "loss": 0.6691, + "step": 1414 + }, + { + "epoch": 0.32973434829889703, + "grad_norm": 0.3366410434246063, + "learning_rate": 4.967634473358537e-06, + "loss": 0.6585, + "step": 1415 + }, + { + "epoch": 0.3299673761068821, + "grad_norm": 0.337537944316864, + "learning_rate": 4.9675853361978074e-06, + "loss": 0.6753, + "step": 1416 + }, + { + "epoch": 0.33020040391486716, + "grad_norm": 0.3411688208580017, + "learning_rate": 4.967536162008875e-06, + "loss": 0.681, + "step": 1417 + }, + { + "epoch": 0.3304334317228523, + "grad_norm": 0.3610337972640991, + "learning_rate": 4.967486950792476e-06, + "loss": 0.6437, + "step": 1418 + }, + { + "epoch": 0.33066645953083734, + "grad_norm": 0.3333829343318939, + "learning_rate": 4.967437702549349e-06, + "loss": 0.6538, + "step": 1419 + }, + { + "epoch": 0.3308994873388224, + "grad_norm": 0.3339460492134094, + "learning_rate": 4.967388417280234e-06, + "loss": 0.6712, + "step": 1420 + }, + { + "epoch": 0.3311325151468075, + "grad_norm": 0.3398650288581848, + "learning_rate": 4.967339094985869e-06, + "loss": 0.6833, + "step": 1421 + }, + { + "epoch": 0.3313655429547926, + "grad_norm": 0.3282124400138855, + "learning_rate": 4.967289735666995e-06, + "loss": 0.6794, + "step": 1422 + }, + { + "epoch": 0.3315985707627777, + "grad_norm": 0.34293466806411743, + "learning_rate": 4.967240339324353e-06, + "loss": 0.6783, + "step": 1423 + }, + { + "epoch": 0.3318315985707628, + "grad_norm": 0.3506075143814087, + "learning_rate": 4.967190905958683e-06, + "loss": 0.6689, + "step": 1424 + }, + { + "epoch": 0.33206462637874784, + "grad_norm": 0.3233243227005005, + "learning_rate": 4.967141435570728e-06, + "loss": 0.6534, + "step": 1425 + }, + { + "epoch": 0.33229765418673296, + "grad_norm": 0.3266931176185608, + "learning_rate": 4.967091928161229e-06, + "loss": 0.6609, + "step": 1426 + }, + { + "epoch": 0.332530681994718, + "grad_norm": 0.33955803513526917, + "learning_rate": 4.967042383730931e-06, + "loss": 0.6693, + "step": 1427 + }, + { + "epoch": 0.33276370980270314, + "grad_norm": 0.3354465961456299, + "learning_rate": 4.966992802280577e-06, + "loss": 0.6526, + "step": 1428 + }, + { + "epoch": 0.3329967376106882, + "grad_norm": 0.3509282171726227, + "learning_rate": 4.966943183810909e-06, + "loss": 0.7017, + "step": 1429 + }, + { + "epoch": 0.3332297654186733, + "grad_norm": 0.34357669949531555, + "learning_rate": 4.966893528322673e-06, + "loss": 0.6901, + "step": 1430 + }, + { + "epoch": 0.3334627932266584, + "grad_norm": 0.3232961893081665, + "learning_rate": 4.966843835816615e-06, + "loss": 0.6912, + "step": 1431 + }, + { + "epoch": 0.33369582103464346, + "grad_norm": 0.32485634088516235, + "learning_rate": 4.966794106293478e-06, + "loss": 0.6829, + "step": 1432 + }, + { + "epoch": 0.3339288488426286, + "grad_norm": 0.3440361022949219, + "learning_rate": 4.96674433975401e-06, + "loss": 0.6936, + "step": 1433 + }, + { + "epoch": 0.33416187665061364, + "grad_norm": 0.321914941072464, + "learning_rate": 4.9666945361989595e-06, + "loss": 0.6618, + "step": 1434 + }, + { + "epoch": 0.3343949044585987, + "grad_norm": 0.33769160509109497, + "learning_rate": 4.966644695629071e-06, + "loss": 0.6803, + "step": 1435 + }, + { + "epoch": 0.3346279322665838, + "grad_norm": 0.3410460948944092, + "learning_rate": 4.966594818045093e-06, + "loss": 0.691, + "step": 1436 + }, + { + "epoch": 0.3348609600745689, + "grad_norm": 0.3339146673679352, + "learning_rate": 4.966544903447775e-06, + "loss": 0.695, + "step": 1437 + }, + { + "epoch": 0.335093987882554, + "grad_norm": 0.32087549567222595, + "learning_rate": 4.966494951837865e-06, + "loss": 0.6637, + "step": 1438 + }, + { + "epoch": 0.3353270156905391, + "grad_norm": 0.329897403717041, + "learning_rate": 4.966444963216114e-06, + "loss": 0.6635, + "step": 1439 + }, + { + "epoch": 0.33556004349852414, + "grad_norm": 0.33904460072517395, + "learning_rate": 4.96639493758327e-06, + "loss": 0.6495, + "step": 1440 + }, + { + "epoch": 0.33579307130650926, + "grad_norm": 0.32955342531204224, + "learning_rate": 4.966344874940085e-06, + "loss": 0.6597, + "step": 1441 + }, + { + "epoch": 0.3360260991144943, + "grad_norm": 0.33158713579177856, + "learning_rate": 4.96629477528731e-06, + "loss": 0.6564, + "step": 1442 + }, + { + "epoch": 0.3362591269224794, + "grad_norm": 0.3594280779361725, + "learning_rate": 4.966244638625697e-06, + "loss": 0.6544, + "step": 1443 + }, + { + "epoch": 0.3364921547304645, + "grad_norm": 0.34677043557167053, + "learning_rate": 4.966194464955998e-06, + "loss": 0.6862, + "step": 1444 + }, + { + "epoch": 0.3367251825384496, + "grad_norm": 0.3245130181312561, + "learning_rate": 4.966144254278966e-06, + "loss": 0.6605, + "step": 1445 + }, + { + "epoch": 0.3369582103464347, + "grad_norm": 0.32185593247413635, + "learning_rate": 4.966094006595354e-06, + "loss": 0.6677, + "step": 1446 + }, + { + "epoch": 0.33719123815441976, + "grad_norm": 0.3378206193447113, + "learning_rate": 4.966043721905917e-06, + "loss": 0.6618, + "step": 1447 + }, + { + "epoch": 0.3374242659624048, + "grad_norm": 0.32731786370277405, + "learning_rate": 4.9659934002114094e-06, + "loss": 0.6571, + "step": 1448 + }, + { + "epoch": 0.33765729377038994, + "grad_norm": 0.3646664023399353, + "learning_rate": 4.965943041512585e-06, + "loss": 0.6466, + "step": 1449 + }, + { + "epoch": 0.337890321578375, + "grad_norm": 0.34014469385147095, + "learning_rate": 4.965892645810201e-06, + "loss": 0.6784, + "step": 1450 + }, + { + "epoch": 0.33812334938636013, + "grad_norm": 0.3487859070301056, + "learning_rate": 4.965842213105013e-06, + "loss": 0.663, + "step": 1451 + }, + { + "epoch": 0.3383563771943452, + "grad_norm": 0.3366861641407013, + "learning_rate": 4.965791743397779e-06, + "loss": 0.6686, + "step": 1452 + }, + { + "epoch": 0.33858940500233026, + "grad_norm": 0.34097161889076233, + "learning_rate": 4.965741236689253e-06, + "loss": 0.6641, + "step": 1453 + }, + { + "epoch": 0.3388224328103154, + "grad_norm": 0.3294290006160736, + "learning_rate": 4.965690692980197e-06, + "loss": 0.6696, + "step": 1454 + }, + { + "epoch": 0.33905546061830044, + "grad_norm": 0.3328244388103485, + "learning_rate": 4.965640112271367e-06, + "loss": 0.6549, + "step": 1455 + }, + { + "epoch": 0.33928848842628556, + "grad_norm": 0.32607483863830566, + "learning_rate": 4.965589494563522e-06, + "loss": 0.6586, + "step": 1456 + }, + { + "epoch": 0.3395215162342706, + "grad_norm": 0.33046114444732666, + "learning_rate": 4.965538839857422e-06, + "loss": 0.6535, + "step": 1457 + }, + { + "epoch": 0.3397545440422557, + "grad_norm": 0.33702173829078674, + "learning_rate": 4.965488148153829e-06, + "loss": 0.676, + "step": 1458 + }, + { + "epoch": 0.3399875718502408, + "grad_norm": 0.33732154965400696, + "learning_rate": 4.9654374194535e-06, + "loss": 0.6797, + "step": 1459 + }, + { + "epoch": 0.3402205996582259, + "grad_norm": 0.3321245312690735, + "learning_rate": 4.965386653757199e-06, + "loss": 0.6519, + "step": 1460 + }, + { + "epoch": 0.340453627466211, + "grad_norm": 0.337451696395874, + "learning_rate": 4.965335851065686e-06, + "loss": 0.6799, + "step": 1461 + }, + { + "epoch": 0.34068665527419606, + "grad_norm": 0.3249598741531372, + "learning_rate": 4.965285011379725e-06, + "loss": 0.6719, + "step": 1462 + }, + { + "epoch": 0.3409196830821811, + "grad_norm": 0.33367300033569336, + "learning_rate": 4.9652341347000785e-06, + "loss": 0.665, + "step": 1463 + }, + { + "epoch": 0.34115271089016624, + "grad_norm": 0.34850284457206726, + "learning_rate": 4.9651832210275094e-06, + "loss": 0.6571, + "step": 1464 + }, + { + "epoch": 0.3413857386981513, + "grad_norm": 0.33590009808540344, + "learning_rate": 4.965132270362781e-06, + "loss": 0.6749, + "step": 1465 + }, + { + "epoch": 0.3416187665061364, + "grad_norm": 0.321455180644989, + "learning_rate": 4.96508128270666e-06, + "loss": 0.6915, + "step": 1466 + }, + { + "epoch": 0.3418517943141215, + "grad_norm": 0.3373476564884186, + "learning_rate": 4.96503025805991e-06, + "loss": 0.6791, + "step": 1467 + }, + { + "epoch": 0.34208482212210656, + "grad_norm": 0.31687116622924805, + "learning_rate": 4.9649791964232965e-06, + "loss": 0.6568, + "step": 1468 + }, + { + "epoch": 0.3423178499300917, + "grad_norm": 0.33823466300964355, + "learning_rate": 4.964928097797586e-06, + "loss": 0.6606, + "step": 1469 + }, + { + "epoch": 0.34255087773807674, + "grad_norm": 0.32286176085472107, + "learning_rate": 4.964876962183546e-06, + "loss": 0.6658, + "step": 1470 + }, + { + "epoch": 0.3427839055460618, + "grad_norm": 0.3416803479194641, + "learning_rate": 4.964825789581943e-06, + "loss": 0.6556, + "step": 1471 + }, + { + "epoch": 0.3430169333540469, + "grad_norm": 0.33235323429107666, + "learning_rate": 4.964774579993546e-06, + "loss": 0.6667, + "step": 1472 + }, + { + "epoch": 0.343249961162032, + "grad_norm": 0.3292483687400818, + "learning_rate": 4.964723333419121e-06, + "loss": 0.6868, + "step": 1473 + }, + { + "epoch": 0.3434829889700171, + "grad_norm": 0.3646247982978821, + "learning_rate": 4.964672049859439e-06, + "loss": 0.7001, + "step": 1474 + }, + { + "epoch": 0.3437160167780022, + "grad_norm": 0.33400335907936096, + "learning_rate": 4.96462072931527e-06, + "loss": 0.6775, + "step": 1475 + }, + { + "epoch": 0.34394904458598724, + "grad_norm": 0.32747116684913635, + "learning_rate": 4.964569371787383e-06, + "loss": 0.665, + "step": 1476 + }, + { + "epoch": 0.34418207239397236, + "grad_norm": 0.33993470668792725, + "learning_rate": 4.964517977276549e-06, + "loss": 0.6292, + "step": 1477 + }, + { + "epoch": 0.3444151002019574, + "grad_norm": 0.3309602737426758, + "learning_rate": 4.964466545783538e-06, + "loss": 0.6425, + "step": 1478 + }, + { + "epoch": 0.34464812800994254, + "grad_norm": 0.33237558603286743, + "learning_rate": 4.964415077309124e-06, + "loss": 0.7033, + "step": 1479 + }, + { + "epoch": 0.3448811558179276, + "grad_norm": 0.3518960475921631, + "learning_rate": 4.964363571854078e-06, + "loss": 0.6683, + "step": 1480 + }, + { + "epoch": 0.3451141836259127, + "grad_norm": 0.36643266677856445, + "learning_rate": 4.964312029419173e-06, + "loss": 0.7128, + "step": 1481 + }, + { + "epoch": 0.3453472114338978, + "grad_norm": 0.32175979018211365, + "learning_rate": 4.9642604500051825e-06, + "loss": 0.6467, + "step": 1482 + }, + { + "epoch": 0.34558023924188286, + "grad_norm": 0.3339715600013733, + "learning_rate": 4.964208833612882e-06, + "loss": 0.6724, + "step": 1483 + }, + { + "epoch": 0.345813267049868, + "grad_norm": 0.32880422472953796, + "learning_rate": 4.9641571802430425e-06, + "loss": 0.6551, + "step": 1484 + }, + { + "epoch": 0.34604629485785304, + "grad_norm": 0.3312155604362488, + "learning_rate": 4.9641054898964424e-06, + "loss": 0.6588, + "step": 1485 + }, + { + "epoch": 0.3462793226658381, + "grad_norm": 0.34586378931999207, + "learning_rate": 4.964053762573856e-06, + "loss": 0.6606, + "step": 1486 + }, + { + "epoch": 0.3465123504738232, + "grad_norm": 0.3550679683685303, + "learning_rate": 4.964001998276059e-06, + "loss": 0.6755, + "step": 1487 + }, + { + "epoch": 0.3467453782818083, + "grad_norm": 0.33396396040916443, + "learning_rate": 4.963950197003829e-06, + "loss": 0.6734, + "step": 1488 + }, + { + "epoch": 0.34697840608979336, + "grad_norm": 0.33681562542915344, + "learning_rate": 4.963898358757944e-06, + "loss": 0.682, + "step": 1489 + }, + { + "epoch": 0.3472114338977785, + "grad_norm": 0.33345746994018555, + "learning_rate": 4.9638464835391805e-06, + "loss": 0.6565, + "step": 1490 + }, + { + "epoch": 0.34744446170576354, + "grad_norm": 0.34245574474334717, + "learning_rate": 4.963794571348318e-06, + "loss": 0.6414, + "step": 1491 + }, + { + "epoch": 0.34767748951374866, + "grad_norm": 0.3369658887386322, + "learning_rate": 4.963742622186134e-06, + "loss": 0.67, + "step": 1492 + }, + { + "epoch": 0.3479105173217337, + "grad_norm": 0.3422485291957855, + "learning_rate": 4.96369063605341e-06, + "loss": 0.6981, + "step": 1493 + }, + { + "epoch": 0.3481435451297188, + "grad_norm": 0.3342357575893402, + "learning_rate": 4.963638612950924e-06, + "loss": 0.6727, + "step": 1494 + }, + { + "epoch": 0.3483765729377039, + "grad_norm": 0.33848434686660767, + "learning_rate": 4.963586552879459e-06, + "loss": 0.6806, + "step": 1495 + }, + { + "epoch": 0.348609600745689, + "grad_norm": 0.3396807312965393, + "learning_rate": 4.963534455839794e-06, + "loss": 0.6775, + "step": 1496 + }, + { + "epoch": 0.3488426285536741, + "grad_norm": 0.3285427391529083, + "learning_rate": 4.963482321832712e-06, + "loss": 0.6862, + "step": 1497 + }, + { + "epoch": 0.34907565636165916, + "grad_norm": 0.32928505539894104, + "learning_rate": 4.963430150858996e-06, + "loss": 0.6654, + "step": 1498 + }, + { + "epoch": 0.3493086841696442, + "grad_norm": 0.33978602290153503, + "learning_rate": 4.963377942919426e-06, + "loss": 0.68, + "step": 1499 + }, + { + "epoch": 0.34954171197762934, + "grad_norm": 0.33161461353302, + "learning_rate": 4.963325698014789e-06, + "loss": 0.6726, + "step": 1500 + }, + { + "epoch": 0.3497747397856144, + "grad_norm": 0.33270370960235596, + "learning_rate": 4.963273416145866e-06, + "loss": 0.6489, + "step": 1501 + }, + { + "epoch": 0.35000776759359953, + "grad_norm": 0.3288179039955139, + "learning_rate": 4.963221097313444e-06, + "loss": 0.6598, + "step": 1502 + }, + { + "epoch": 0.3502407954015846, + "grad_norm": 0.33383846282958984, + "learning_rate": 4.9631687415183064e-06, + "loss": 0.6586, + "step": 1503 + }, + { + "epoch": 0.35047382320956966, + "grad_norm": 0.3354722857475281, + "learning_rate": 4.963116348761239e-06, + "loss": 0.6621, + "step": 1504 + }, + { + "epoch": 0.3507068510175548, + "grad_norm": 0.33293452858924866, + "learning_rate": 4.9630639190430285e-06, + "loss": 0.6658, + "step": 1505 + }, + { + "epoch": 0.35093987882553984, + "grad_norm": 0.3416721522808075, + "learning_rate": 4.9630114523644615e-06, + "loss": 0.6599, + "step": 1506 + }, + { + "epoch": 0.35117290663352496, + "grad_norm": 0.3372819423675537, + "learning_rate": 4.962958948726326e-06, + "loss": 0.6782, + "step": 1507 + }, + { + "epoch": 0.35140593444151, + "grad_norm": 0.34058842062950134, + "learning_rate": 4.962906408129409e-06, + "loss": 0.6626, + "step": 1508 + }, + { + "epoch": 0.3516389622494951, + "grad_norm": 0.36225181818008423, + "learning_rate": 4.962853830574498e-06, + "loss": 0.6777, + "step": 1509 + }, + { + "epoch": 0.3518719900574802, + "grad_norm": 0.3396381139755249, + "learning_rate": 4.962801216062384e-06, + "loss": 0.6531, + "step": 1510 + }, + { + "epoch": 0.3521050178654653, + "grad_norm": 0.3358972668647766, + "learning_rate": 4.962748564593856e-06, + "loss": 0.6714, + "step": 1511 + }, + { + "epoch": 0.35233804567345034, + "grad_norm": 0.33602288365364075, + "learning_rate": 4.962695876169703e-06, + "loss": 0.6683, + "step": 1512 + }, + { + "epoch": 0.35257107348143546, + "grad_norm": 0.3487638533115387, + "learning_rate": 4.962643150790716e-06, + "loss": 0.6908, + "step": 1513 + }, + { + "epoch": 0.3528041012894205, + "grad_norm": 0.31988584995269775, + "learning_rate": 4.962590388457687e-06, + "loss": 0.6568, + "step": 1514 + }, + { + "epoch": 0.35303712909740564, + "grad_norm": 0.3450043499469757, + "learning_rate": 4.9625375891714075e-06, + "loss": 0.6781, + "step": 1515 + }, + { + "epoch": 0.3532701569053907, + "grad_norm": 0.3263136148452759, + "learning_rate": 4.962484752932669e-06, + "loss": 0.6692, + "step": 1516 + }, + { + "epoch": 0.3535031847133758, + "grad_norm": 0.3288547098636627, + "learning_rate": 4.962431879742265e-06, + "loss": 0.6633, + "step": 1517 + }, + { + "epoch": 0.3537362125213609, + "grad_norm": 0.32152634859085083, + "learning_rate": 4.96237896960099e-06, + "loss": 0.6765, + "step": 1518 + }, + { + "epoch": 0.35396924032934596, + "grad_norm": 0.32595255970954895, + "learning_rate": 4.962326022509635e-06, + "loss": 0.6832, + "step": 1519 + }, + { + "epoch": 0.3542022681373311, + "grad_norm": 0.3305436670780182, + "learning_rate": 4.962273038468997e-06, + "loss": 0.6647, + "step": 1520 + }, + { + "epoch": 0.35443529594531614, + "grad_norm": 0.32116127014160156, + "learning_rate": 4.96222001747987e-06, + "loss": 0.6566, + "step": 1521 + }, + { + "epoch": 0.3546683237533012, + "grad_norm": 0.33138781785964966, + "learning_rate": 4.9621669595430496e-06, + "loss": 0.6615, + "step": 1522 + }, + { + "epoch": 0.3549013515612863, + "grad_norm": 0.3344120383262634, + "learning_rate": 4.962113864659334e-06, + "loss": 0.6755, + "step": 1523 + }, + { + "epoch": 0.3551343793692714, + "grad_norm": 0.34403133392333984, + "learning_rate": 4.962060732829517e-06, + "loss": 0.6659, + "step": 1524 + }, + { + "epoch": 0.3553674071772565, + "grad_norm": 0.32567688822746277, + "learning_rate": 4.962007564054397e-06, + "loss": 0.6833, + "step": 1525 + }, + { + "epoch": 0.3556004349852416, + "grad_norm": 0.3177163898944855, + "learning_rate": 4.961954358334773e-06, + "loss": 0.6619, + "step": 1526 + }, + { + "epoch": 0.35583346279322664, + "grad_norm": 0.3349158763885498, + "learning_rate": 4.961901115671442e-06, + "loss": 0.6731, + "step": 1527 + }, + { + "epoch": 0.35606649060121176, + "grad_norm": 0.3401324152946472, + "learning_rate": 4.961847836065204e-06, + "loss": 0.6457, + "step": 1528 + }, + { + "epoch": 0.3562995184091968, + "grad_norm": 0.3316843509674072, + "learning_rate": 4.961794519516857e-06, + "loss": 0.7027, + "step": 1529 + }, + { + "epoch": 0.3565325462171819, + "grad_norm": 0.3355189561843872, + "learning_rate": 4.9617411660272015e-06, + "loss": 0.6708, + "step": 1530 + }, + { + "epoch": 0.356765574025167, + "grad_norm": 0.33535894751548767, + "learning_rate": 4.961687775597039e-06, + "loss": 0.6553, + "step": 1531 + }, + { + "epoch": 0.3569986018331521, + "grad_norm": 0.3297029733657837, + "learning_rate": 4.9616343482271706e-06, + "loss": 0.6691, + "step": 1532 + }, + { + "epoch": 0.3572316296411372, + "grad_norm": 0.3326868414878845, + "learning_rate": 4.9615808839183974e-06, + "loss": 0.6891, + "step": 1533 + }, + { + "epoch": 0.35746465744912226, + "grad_norm": 0.3398496210575104, + "learning_rate": 4.961527382671521e-06, + "loss": 0.6801, + "step": 1534 + }, + { + "epoch": 0.3576976852571073, + "grad_norm": 0.32764020562171936, + "learning_rate": 4.961473844487347e-06, + "loss": 0.6751, + "step": 1535 + }, + { + "epoch": 0.35793071306509244, + "grad_norm": 0.3305548131465912, + "learning_rate": 4.961420269366676e-06, + "loss": 0.6748, + "step": 1536 + }, + { + "epoch": 0.3581637408730775, + "grad_norm": 0.3380449712276459, + "learning_rate": 4.961366657310312e-06, + "loss": 0.6994, + "step": 1537 + }, + { + "epoch": 0.3583967686810626, + "grad_norm": 0.33517783880233765, + "learning_rate": 4.961313008319062e-06, + "loss": 0.6623, + "step": 1538 + }, + { + "epoch": 0.3586297964890477, + "grad_norm": 0.316273957490921, + "learning_rate": 4.961259322393729e-06, + "loss": 0.649, + "step": 1539 + }, + { + "epoch": 0.35886282429703276, + "grad_norm": 0.32361137866973877, + "learning_rate": 4.961205599535118e-06, + "loss": 0.6624, + "step": 1540 + }, + { + "epoch": 0.3590958521050179, + "grad_norm": 0.3326389789581299, + "learning_rate": 4.961151839744037e-06, + "loss": 0.6663, + "step": 1541 + }, + { + "epoch": 0.35932887991300294, + "grad_norm": 0.3337155878543854, + "learning_rate": 4.9610980430212915e-06, + "loss": 0.6727, + "step": 1542 + }, + { + "epoch": 0.35956190772098806, + "grad_norm": 0.3241748809814453, + "learning_rate": 4.9610442093676896e-06, + "loss": 0.675, + "step": 1543 + }, + { + "epoch": 0.3597949355289731, + "grad_norm": 0.3282434642314911, + "learning_rate": 4.960990338784038e-06, + "loss": 0.6858, + "step": 1544 + }, + { + "epoch": 0.3600279633369582, + "grad_norm": 0.332741916179657, + "learning_rate": 4.960936431271146e-06, + "loss": 0.6743, + "step": 1545 + }, + { + "epoch": 0.3602609911449433, + "grad_norm": 0.34384360909461975, + "learning_rate": 4.9608824868298225e-06, + "loss": 0.6554, + "step": 1546 + }, + { + "epoch": 0.3604940189529284, + "grad_norm": 0.33145928382873535, + "learning_rate": 4.960828505460877e-06, + "loss": 0.6722, + "step": 1547 + }, + { + "epoch": 0.3607270467609135, + "grad_norm": 0.35948124527931213, + "learning_rate": 4.960774487165118e-06, + "loss": 0.6602, + "step": 1548 + }, + { + "epoch": 0.36096007456889856, + "grad_norm": 0.33065855503082275, + "learning_rate": 4.960720431943359e-06, + "loss": 0.6815, + "step": 1549 + }, + { + "epoch": 0.3611931023768836, + "grad_norm": 0.3315471112728119, + "learning_rate": 4.960666339796408e-06, + "loss": 0.6429, + "step": 1550 + }, + { + "epoch": 0.36142613018486874, + "grad_norm": 0.32013222575187683, + "learning_rate": 4.96061221072508e-06, + "loss": 0.6713, + "step": 1551 + }, + { + "epoch": 0.3616591579928538, + "grad_norm": 0.3553844392299652, + "learning_rate": 4.960558044730184e-06, + "loss": 0.6851, + "step": 1552 + }, + { + "epoch": 0.3618921858008389, + "grad_norm": 0.351507306098938, + "learning_rate": 4.960503841812535e-06, + "loss": 0.6866, + "step": 1553 + }, + { + "epoch": 0.362125213608824, + "grad_norm": 0.3341979384422302, + "learning_rate": 4.960449601972945e-06, + "loss": 0.6712, + "step": 1554 + }, + { + "epoch": 0.36235824141680906, + "grad_norm": 0.3357712924480438, + "learning_rate": 4.960395325212229e-06, + "loss": 0.6863, + "step": 1555 + }, + { + "epoch": 0.3625912692247942, + "grad_norm": 0.35046669840812683, + "learning_rate": 4.960341011531201e-06, + "loss": 0.6411, + "step": 1556 + }, + { + "epoch": 0.36282429703277924, + "grad_norm": 0.34484097361564636, + "learning_rate": 4.960286660930677e-06, + "loss": 0.677, + "step": 1557 + }, + { + "epoch": 0.3630573248407643, + "grad_norm": 0.35046783089637756, + "learning_rate": 4.960232273411471e-06, + "loss": 0.692, + "step": 1558 + }, + { + "epoch": 0.3632903526487494, + "grad_norm": 0.34226495027542114, + "learning_rate": 4.960177848974399e-06, + "loss": 0.6395, + "step": 1559 + }, + { + "epoch": 0.3635233804567345, + "grad_norm": 0.346538245677948, + "learning_rate": 4.960123387620278e-06, + "loss": 0.7211, + "step": 1560 + }, + { + "epoch": 0.3637564082647196, + "grad_norm": 0.34203317761421204, + "learning_rate": 4.960068889349927e-06, + "loss": 0.6529, + "step": 1561 + }, + { + "epoch": 0.3639894360727047, + "grad_norm": 0.33744996786117554, + "learning_rate": 4.960014354164162e-06, + "loss": 0.7104, + "step": 1562 + }, + { + "epoch": 0.36422246388068974, + "grad_norm": 0.3274959325790405, + "learning_rate": 4.959959782063803e-06, + "loss": 0.6539, + "step": 1563 + }, + { + "epoch": 0.36445549168867486, + "grad_norm": 0.33168455958366394, + "learning_rate": 4.9599051730496655e-06, + "loss": 0.6717, + "step": 1564 + }, + { + "epoch": 0.3646885194966599, + "grad_norm": 0.34095802903175354, + "learning_rate": 4.959850527122572e-06, + "loss": 0.6805, + "step": 1565 + }, + { + "epoch": 0.36492154730464504, + "grad_norm": 0.35089337825775146, + "learning_rate": 4.959795844283342e-06, + "loss": 0.6766, + "step": 1566 + }, + { + "epoch": 0.3651545751126301, + "grad_norm": 0.3399343490600586, + "learning_rate": 4.959741124532795e-06, + "loss": 0.6608, + "step": 1567 + }, + { + "epoch": 0.3653876029206152, + "grad_norm": 0.332829087972641, + "learning_rate": 4.959686367871755e-06, + "loss": 0.6611, + "step": 1568 + }, + { + "epoch": 0.3656206307286003, + "grad_norm": 0.3586636483669281, + "learning_rate": 4.959631574301039e-06, + "loss": 0.6904, + "step": 1569 + }, + { + "epoch": 0.36585365853658536, + "grad_norm": 0.3340654671192169, + "learning_rate": 4.959576743821473e-06, + "loss": 0.6584, + "step": 1570 + }, + { + "epoch": 0.3660866863445705, + "grad_norm": 0.33988139033317566, + "learning_rate": 4.959521876433878e-06, + "loss": 0.6548, + "step": 1571 + }, + { + "epoch": 0.36631971415255554, + "grad_norm": 0.3362710475921631, + "learning_rate": 4.959466972139078e-06, + "loss": 0.6699, + "step": 1572 + }, + { + "epoch": 0.3665527419605406, + "grad_norm": 0.3283473253250122, + "learning_rate": 4.959412030937898e-06, + "loss": 0.675, + "step": 1573 + }, + { + "epoch": 0.3667857697685257, + "grad_norm": 0.33544567227363586, + "learning_rate": 4.9593570528311595e-06, + "loss": 0.6462, + "step": 1574 + }, + { + "epoch": 0.3670187975765108, + "grad_norm": 0.34647125005722046, + "learning_rate": 4.95930203781969e-06, + "loss": 0.6634, + "step": 1575 + }, + { + "epoch": 0.36725182538449586, + "grad_norm": 0.3306637108325958, + "learning_rate": 4.959246985904314e-06, + "loss": 0.6541, + "step": 1576 + }, + { + "epoch": 0.367484853192481, + "grad_norm": 0.31689196825027466, + "learning_rate": 4.959191897085857e-06, + "loss": 0.6569, + "step": 1577 + }, + { + "epoch": 0.36771788100046604, + "grad_norm": 0.33510029315948486, + "learning_rate": 4.959136771365148e-06, + "loss": 0.688, + "step": 1578 + }, + { + "epoch": 0.36795090880845116, + "grad_norm": 0.3365011513233185, + "learning_rate": 4.959081608743011e-06, + "loss": 0.6829, + "step": 1579 + }, + { + "epoch": 0.3681839366164362, + "grad_norm": 0.35050442814826965, + "learning_rate": 4.9590264092202775e-06, + "loss": 0.6934, + "step": 1580 + }, + { + "epoch": 0.3684169644244213, + "grad_norm": 0.3280167877674103, + "learning_rate": 4.958971172797772e-06, + "loss": 0.6592, + "step": 1581 + }, + { + "epoch": 0.3686499922324064, + "grad_norm": 0.32912278175354004, + "learning_rate": 4.958915899476327e-06, + "loss": 0.6794, + "step": 1582 + }, + { + "epoch": 0.3688830200403915, + "grad_norm": 0.34846240282058716, + "learning_rate": 4.958860589256768e-06, + "loss": 0.684, + "step": 1583 + }, + { + "epoch": 0.3691160478483766, + "grad_norm": 0.3347542881965637, + "learning_rate": 4.958805242139928e-06, + "loss": 0.6738, + "step": 1584 + }, + { + "epoch": 0.36934907565636166, + "grad_norm": 0.3299737572669983, + "learning_rate": 4.958749858126636e-06, + "loss": 0.6577, + "step": 1585 + }, + { + "epoch": 0.3695821034643467, + "grad_norm": 0.3441544771194458, + "learning_rate": 4.958694437217724e-06, + "loss": 0.6577, + "step": 1586 + }, + { + "epoch": 0.36981513127233184, + "grad_norm": 0.3369065523147583, + "learning_rate": 4.958638979414024e-06, + "loss": 0.6687, + "step": 1587 + }, + { + "epoch": 0.3700481590803169, + "grad_norm": 0.34210944175720215, + "learning_rate": 4.958583484716367e-06, + "loss": 0.6503, + "step": 1588 + }, + { + "epoch": 0.370281186888302, + "grad_norm": 0.3385675847530365, + "learning_rate": 4.9585279531255855e-06, + "loss": 0.683, + "step": 1589 + }, + { + "epoch": 0.3705142146962871, + "grad_norm": 0.3563666045665741, + "learning_rate": 4.958472384642514e-06, + "loss": 0.6865, + "step": 1590 + }, + { + "epoch": 0.37074724250427216, + "grad_norm": 0.33408495783805847, + "learning_rate": 4.958416779267987e-06, + "loss": 0.668, + "step": 1591 + }, + { + "epoch": 0.3709802703122573, + "grad_norm": 0.3468257784843445, + "learning_rate": 4.9583611370028365e-06, + "loss": 0.6696, + "step": 1592 + }, + { + "epoch": 0.37121329812024234, + "grad_norm": 0.32883039116859436, + "learning_rate": 4.958305457847899e-06, + "loss": 0.6391, + "step": 1593 + }, + { + "epoch": 0.37144632592822746, + "grad_norm": 0.3413340747356415, + "learning_rate": 4.95824974180401e-06, + "loss": 0.6548, + "step": 1594 + }, + { + "epoch": 0.3716793537362125, + "grad_norm": 0.33973047137260437, + "learning_rate": 4.958193988872004e-06, + "loss": 0.6822, + "step": 1595 + }, + { + "epoch": 0.3719123815441976, + "grad_norm": 0.3220883011817932, + "learning_rate": 4.958138199052721e-06, + "loss": 0.666, + "step": 1596 + }, + { + "epoch": 0.3721454093521827, + "grad_norm": 0.3393138647079468, + "learning_rate": 4.958082372346995e-06, + "loss": 0.6879, + "step": 1597 + }, + { + "epoch": 0.3723784371601678, + "grad_norm": 0.33591750264167786, + "learning_rate": 4.958026508755664e-06, + "loss": 0.6974, + "step": 1598 + }, + { + "epoch": 0.37261146496815284, + "grad_norm": 0.3552340269088745, + "learning_rate": 4.957970608279569e-06, + "loss": 0.6972, + "step": 1599 + }, + { + "epoch": 0.37284449277613796, + "grad_norm": 0.3448086380958557, + "learning_rate": 4.957914670919546e-06, + "loss": 0.6363, + "step": 1600 + }, + { + "epoch": 0.373077520584123, + "grad_norm": 0.3337414562702179, + "learning_rate": 4.957858696676435e-06, + "loss": 0.6427, + "step": 1601 + }, + { + "epoch": 0.37331054839210814, + "grad_norm": 0.3307848274707794, + "learning_rate": 4.957802685551077e-06, + "loss": 0.6619, + "step": 1602 + }, + { + "epoch": 0.3735435762000932, + "grad_norm": 0.3575986921787262, + "learning_rate": 4.9577466375443115e-06, + "loss": 0.6726, + "step": 1603 + }, + { + "epoch": 0.3737766040080783, + "grad_norm": 0.3505476415157318, + "learning_rate": 4.9576905526569795e-06, + "loss": 0.6842, + "step": 1604 + }, + { + "epoch": 0.3740096318160634, + "grad_norm": 0.35714641213417053, + "learning_rate": 4.957634430889924e-06, + "loss": 0.6809, + "step": 1605 + }, + { + "epoch": 0.37424265962404846, + "grad_norm": 0.338337779045105, + "learning_rate": 4.957578272243985e-06, + "loss": 0.6655, + "step": 1606 + }, + { + "epoch": 0.3744756874320336, + "grad_norm": 0.3614133894443512, + "learning_rate": 4.957522076720006e-06, + "loss": 0.6969, + "step": 1607 + }, + { + "epoch": 0.37470871524001864, + "grad_norm": 0.34252122044563293, + "learning_rate": 4.9574658443188306e-06, + "loss": 0.6733, + "step": 1608 + }, + { + "epoch": 0.3749417430480037, + "grad_norm": 0.3478531837463379, + "learning_rate": 4.957409575041303e-06, + "loss": 0.6816, + "step": 1609 + }, + { + "epoch": 0.3751747708559888, + "grad_norm": 0.3269883990287781, + "learning_rate": 4.957353268888268e-06, + "loss": 0.6966, + "step": 1610 + }, + { + "epoch": 0.3754077986639739, + "grad_norm": 0.34591448307037354, + "learning_rate": 4.957296925860568e-06, + "loss": 0.6567, + "step": 1611 + }, + { + "epoch": 0.375640826471959, + "grad_norm": 0.3345402479171753, + "learning_rate": 4.957240545959051e-06, + "loss": 0.6582, + "step": 1612 + }, + { + "epoch": 0.3758738542799441, + "grad_norm": 0.3423495590686798, + "learning_rate": 4.957184129184563e-06, + "loss": 0.668, + "step": 1613 + }, + { + "epoch": 0.37610688208792914, + "grad_norm": 0.33866196870803833, + "learning_rate": 4.957127675537948e-06, + "loss": 0.6527, + "step": 1614 + }, + { + "epoch": 0.37633990989591426, + "grad_norm": 0.3447354733943939, + "learning_rate": 4.9570711850200555e-06, + "loss": 0.6496, + "step": 1615 + }, + { + "epoch": 0.3765729377038993, + "grad_norm": 0.3413432538509369, + "learning_rate": 4.957014657631733e-06, + "loss": 0.663, + "step": 1616 + }, + { + "epoch": 0.37680596551188444, + "grad_norm": 0.3393695652484894, + "learning_rate": 4.956958093373828e-06, + "loss": 0.6428, + "step": 1617 + }, + { + "epoch": 0.3770389933198695, + "grad_norm": 0.3595902919769287, + "learning_rate": 4.9569014922471895e-06, + "loss": 0.6918, + "step": 1618 + }, + { + "epoch": 0.3772720211278546, + "grad_norm": 0.3418142795562744, + "learning_rate": 4.956844854252667e-06, + "loss": 0.6583, + "step": 1619 + }, + { + "epoch": 0.3775050489358397, + "grad_norm": 0.3333628177642822, + "learning_rate": 4.956788179391111e-06, + "loss": 0.6696, + "step": 1620 + }, + { + "epoch": 0.37773807674382476, + "grad_norm": 0.3433617353439331, + "learning_rate": 4.95673146766337e-06, + "loss": 0.6717, + "step": 1621 + }, + { + "epoch": 0.3779711045518098, + "grad_norm": 0.349901020526886, + "learning_rate": 4.956674719070297e-06, + "loss": 0.6628, + "step": 1622 + }, + { + "epoch": 0.37820413235979494, + "grad_norm": 0.31437140703201294, + "learning_rate": 4.956617933612743e-06, + "loss": 0.6182, + "step": 1623 + }, + { + "epoch": 0.37843716016778, + "grad_norm": 0.329340398311615, + "learning_rate": 4.95656111129156e-06, + "loss": 0.6637, + "step": 1624 + }, + { + "epoch": 0.3786701879757651, + "grad_norm": 0.33582091331481934, + "learning_rate": 4.9565042521076e-06, + "loss": 0.6949, + "step": 1625 + }, + { + "epoch": 0.3789032157837502, + "grad_norm": 0.3454348146915436, + "learning_rate": 4.956447356061717e-06, + "loss": 0.6792, + "step": 1626 + }, + { + "epoch": 0.37913624359173526, + "grad_norm": 0.3384022116661072, + "learning_rate": 4.956390423154766e-06, + "loss": 0.6788, + "step": 1627 + }, + { + "epoch": 0.3793692713997204, + "grad_norm": 0.34388267993927, + "learning_rate": 4.9563334533875985e-06, + "loss": 0.6875, + "step": 1628 + }, + { + "epoch": 0.37960229920770544, + "grad_norm": 0.35815274715423584, + "learning_rate": 4.956276446761072e-06, + "loss": 0.6856, + "step": 1629 + }, + { + "epoch": 0.37983532701569056, + "grad_norm": 0.3462885916233063, + "learning_rate": 4.956219403276039e-06, + "loss": 0.655, + "step": 1630 + }, + { + "epoch": 0.3800683548236756, + "grad_norm": 0.3552197813987732, + "learning_rate": 4.956162322933359e-06, + "loss": 0.6661, + "step": 1631 + }, + { + "epoch": 0.3803013826316607, + "grad_norm": 0.3418506383895874, + "learning_rate": 4.956105205733886e-06, + "loss": 0.6846, + "step": 1632 + }, + { + "epoch": 0.3805344104396458, + "grad_norm": 0.3540204167366028, + "learning_rate": 4.956048051678478e-06, + "loss": 0.6648, + "step": 1633 + }, + { + "epoch": 0.3807674382476309, + "grad_norm": 0.33124321699142456, + "learning_rate": 4.955990860767992e-06, + "loss": 0.6663, + "step": 1634 + }, + { + "epoch": 0.381000466055616, + "grad_norm": 0.3355477750301361, + "learning_rate": 4.955933633003288e-06, + "loss": 0.6741, + "step": 1635 + }, + { + "epoch": 0.38123349386360106, + "grad_norm": 0.33315402269363403, + "learning_rate": 4.955876368385223e-06, + "loss": 0.6592, + "step": 1636 + }, + { + "epoch": 0.3814665216715861, + "grad_norm": 0.3531157076358795, + "learning_rate": 4.955819066914656e-06, + "loss": 0.6846, + "step": 1637 + }, + { + "epoch": 0.38169954947957124, + "grad_norm": 0.3344787061214447, + "learning_rate": 4.955761728592449e-06, + "loss": 0.6588, + "step": 1638 + }, + { + "epoch": 0.3819325772875563, + "grad_norm": 0.34091925621032715, + "learning_rate": 4.955704353419459e-06, + "loss": 0.678, + "step": 1639 + }, + { + "epoch": 0.3821656050955414, + "grad_norm": 0.344499796628952, + "learning_rate": 4.9556469413965516e-06, + "loss": 0.6898, + "step": 1640 + }, + { + "epoch": 0.3823986329035265, + "grad_norm": 0.33654898405075073, + "learning_rate": 4.955589492524584e-06, + "loss": 0.6598, + "step": 1641 + }, + { + "epoch": 0.38263166071151156, + "grad_norm": 0.3479369580745697, + "learning_rate": 4.95553200680442e-06, + "loss": 0.6624, + "step": 1642 + }, + { + "epoch": 0.3828646885194967, + "grad_norm": 0.351240873336792, + "learning_rate": 4.955474484236923e-06, + "loss": 0.7015, + "step": 1643 + }, + { + "epoch": 0.38309771632748174, + "grad_norm": 0.34370705485343933, + "learning_rate": 4.955416924822955e-06, + "loss": 0.6857, + "step": 1644 + }, + { + "epoch": 0.3833307441354668, + "grad_norm": 0.3616243600845337, + "learning_rate": 4.95535932856338e-06, + "loss": 0.6835, + "step": 1645 + }, + { + "epoch": 0.3835637719434519, + "grad_norm": 0.32643356919288635, + "learning_rate": 4.955301695459063e-06, + "loss": 0.6656, + "step": 1646 + }, + { + "epoch": 0.383796799751437, + "grad_norm": 0.33337071537971497, + "learning_rate": 4.955244025510867e-06, + "loss": 0.6653, + "step": 1647 + }, + { + "epoch": 0.3840298275594221, + "grad_norm": 0.3460000157356262, + "learning_rate": 4.9551863187196605e-06, + "loss": 0.6623, + "step": 1648 + }, + { + "epoch": 0.3842628553674072, + "grad_norm": 0.33485469222068787, + "learning_rate": 4.955128575086306e-06, + "loss": 0.6648, + "step": 1649 + }, + { + "epoch": 0.38449588317539224, + "grad_norm": 0.3250187337398529, + "learning_rate": 4.955070794611672e-06, + "loss": 0.6601, + "step": 1650 + }, + { + "epoch": 0.38472891098337736, + "grad_norm": 0.3555416166782379, + "learning_rate": 4.955012977296626e-06, + "loss": 0.6501, + "step": 1651 + }, + { + "epoch": 0.3849619387913624, + "grad_norm": 0.3402034342288971, + "learning_rate": 4.954955123142034e-06, + "loss": 0.6463, + "step": 1652 + }, + { + "epoch": 0.38519496659934754, + "grad_norm": 0.3691380023956299, + "learning_rate": 4.954897232148764e-06, + "loss": 0.6533, + "step": 1653 + }, + { + "epoch": 0.3854279944073326, + "grad_norm": 0.38897407054901123, + "learning_rate": 4.954839304317686e-06, + "loss": 0.6505, + "step": 1654 + }, + { + "epoch": 0.3856610222153177, + "grad_norm": 0.3390321433544159, + "learning_rate": 4.954781339649669e-06, + "loss": 0.6696, + "step": 1655 + }, + { + "epoch": 0.3858940500233028, + "grad_norm": 0.3287765681743622, + "learning_rate": 4.954723338145583e-06, + "loss": 0.6701, + "step": 1656 + }, + { + "epoch": 0.38612707783128786, + "grad_norm": 0.33425477147102356, + "learning_rate": 4.954665299806298e-06, + "loss": 0.664, + "step": 1657 + }, + { + "epoch": 0.386360105639273, + "grad_norm": 0.32371294498443604, + "learning_rate": 4.954607224632685e-06, + "loss": 0.6597, + "step": 1658 + }, + { + "epoch": 0.38659313344725804, + "grad_norm": 0.33574768900871277, + "learning_rate": 4.954549112625614e-06, + "loss": 0.6691, + "step": 1659 + }, + { + "epoch": 0.3868261612552431, + "grad_norm": 0.33252713084220886, + "learning_rate": 4.95449096378596e-06, + "loss": 0.6833, + "step": 1660 + }, + { + "epoch": 0.3870591890632282, + "grad_norm": 0.33966872096061707, + "learning_rate": 4.954432778114593e-06, + "loss": 0.6732, + "step": 1661 + }, + { + "epoch": 0.3872922168712133, + "grad_norm": 0.3408563733100891, + "learning_rate": 4.9543745556123875e-06, + "loss": 0.6856, + "step": 1662 + }, + { + "epoch": 0.3875252446791984, + "grad_norm": 0.3324352502822876, + "learning_rate": 4.954316296280217e-06, + "loss": 0.6667, + "step": 1663 + }, + { + "epoch": 0.3877582724871835, + "grad_norm": 0.33298394083976746, + "learning_rate": 4.954258000118956e-06, + "loss": 0.6864, + "step": 1664 + }, + { + "epoch": 0.38799130029516854, + "grad_norm": 0.3265496790409088, + "learning_rate": 4.954199667129477e-06, + "loss": 0.6758, + "step": 1665 + }, + { + "epoch": 0.38822432810315366, + "grad_norm": 0.34990039467811584, + "learning_rate": 4.9541412973126576e-06, + "loss": 0.7145, + "step": 1666 + }, + { + "epoch": 0.3884573559111387, + "grad_norm": 0.33468717336654663, + "learning_rate": 4.954082890669374e-06, + "loss": 0.6886, + "step": 1667 + }, + { + "epoch": 0.3886903837191238, + "grad_norm": 0.3419089913368225, + "learning_rate": 4.954024447200502e-06, + "loss": 0.6798, + "step": 1668 + }, + { + "epoch": 0.3889234115271089, + "grad_norm": 0.33942127227783203, + "learning_rate": 4.953965966906917e-06, + "loss": 0.6711, + "step": 1669 + }, + { + "epoch": 0.389156439335094, + "grad_norm": 0.344032347202301, + "learning_rate": 4.9539074497894994e-06, + "loss": 0.6843, + "step": 1670 + }, + { + "epoch": 0.3893894671430791, + "grad_norm": 0.3337257206439972, + "learning_rate": 4.953848895849126e-06, + "loss": 0.6827, + "step": 1671 + }, + { + "epoch": 0.38962249495106416, + "grad_norm": 0.3413871228694916, + "learning_rate": 4.953790305086674e-06, + "loss": 0.6645, + "step": 1672 + }, + { + "epoch": 0.3898555227590492, + "grad_norm": 0.34587734937667847, + "learning_rate": 4.953731677503025e-06, + "loss": 0.6718, + "step": 1673 + }, + { + "epoch": 0.39008855056703434, + "grad_norm": 0.3454611897468567, + "learning_rate": 4.953673013099057e-06, + "loss": 0.6814, + "step": 1674 + }, + { + "epoch": 0.3903215783750194, + "grad_norm": 0.3341832756996155, + "learning_rate": 4.953614311875652e-06, + "loss": 0.6504, + "step": 1675 + }, + { + "epoch": 0.3905546061830045, + "grad_norm": 0.32990822196006775, + "learning_rate": 4.953555573833689e-06, + "loss": 0.6487, + "step": 1676 + }, + { + "epoch": 0.3907876339909896, + "grad_norm": 0.3368721902370453, + "learning_rate": 4.95349679897405e-06, + "loss": 0.6527, + "step": 1677 + }, + { + "epoch": 0.39102066179897466, + "grad_norm": 0.3382852375507355, + "learning_rate": 4.953437987297618e-06, + "loss": 0.659, + "step": 1678 + }, + { + "epoch": 0.3912536896069598, + "grad_norm": 0.33389759063720703, + "learning_rate": 4.9533791388052744e-06, + "loss": 0.6779, + "step": 1679 + }, + { + "epoch": 0.39148671741494484, + "grad_norm": 0.33047276735305786, + "learning_rate": 4.953320253497903e-06, + "loss": 0.6608, + "step": 1680 + }, + { + "epoch": 0.39171974522292996, + "grad_norm": 0.32769301533699036, + "learning_rate": 4.953261331376387e-06, + "loss": 0.672, + "step": 1681 + }, + { + "epoch": 0.391952773030915, + "grad_norm": 0.3583182990550995, + "learning_rate": 4.95320237244161e-06, + "loss": 0.6563, + "step": 1682 + }, + { + "epoch": 0.3921858008389001, + "grad_norm": 0.330965131521225, + "learning_rate": 4.9531433766944585e-06, + "loss": 0.6608, + "step": 1683 + }, + { + "epoch": 0.3924188286468852, + "grad_norm": 0.32593464851379395, + "learning_rate": 4.953084344135815e-06, + "loss": 0.6602, + "step": 1684 + }, + { + "epoch": 0.3926518564548703, + "grad_norm": 0.34240472316741943, + "learning_rate": 4.953025274766568e-06, + "loss": 0.6943, + "step": 1685 + }, + { + "epoch": 0.3928848842628554, + "grad_norm": 0.3450424373149872, + "learning_rate": 4.952966168587603e-06, + "loss": 0.6946, + "step": 1686 + }, + { + "epoch": 0.39311791207084046, + "grad_norm": 0.3360675275325775, + "learning_rate": 4.952907025599806e-06, + "loss": 0.6642, + "step": 1687 + }, + { + "epoch": 0.3933509398788255, + "grad_norm": 0.3306034803390503, + "learning_rate": 4.952847845804066e-06, + "loss": 0.6671, + "step": 1688 + }, + { + "epoch": 0.39358396768681064, + "grad_norm": 0.35195815563201904, + "learning_rate": 4.952788629201271e-06, + "loss": 0.6776, + "step": 1689 + }, + { + "epoch": 0.3938169954947957, + "grad_norm": 0.3332144618034363, + "learning_rate": 4.952729375792307e-06, + "loss": 0.6752, + "step": 1690 + }, + { + "epoch": 0.39405002330278077, + "grad_norm": 0.3285732567310333, + "learning_rate": 4.952670085578066e-06, + "loss": 0.6621, + "step": 1691 + }, + { + "epoch": 0.3942830511107659, + "grad_norm": 0.3489218056201935, + "learning_rate": 4.952610758559437e-06, + "loss": 0.6588, + "step": 1692 + }, + { + "epoch": 0.39451607891875096, + "grad_norm": 0.34411221742630005, + "learning_rate": 4.95255139473731e-06, + "loss": 0.6797, + "step": 1693 + }, + { + "epoch": 0.3947491067267361, + "grad_norm": 0.3395288288593292, + "learning_rate": 4.952491994112576e-06, + "loss": 0.6536, + "step": 1694 + }, + { + "epoch": 0.39498213453472114, + "grad_norm": 0.3389856517314911, + "learning_rate": 4.9524325566861265e-06, + "loss": 0.6725, + "step": 1695 + }, + { + "epoch": 0.3952151623427062, + "grad_norm": 0.34349897503852844, + "learning_rate": 4.952373082458852e-06, + "loss": 0.6757, + "step": 1696 + }, + { + "epoch": 0.3954481901506913, + "grad_norm": 0.3340815007686615, + "learning_rate": 4.952313571431647e-06, + "loss": 0.6716, + "step": 1697 + }, + { + "epoch": 0.3956812179586764, + "grad_norm": 0.33736929297447205, + "learning_rate": 4.952254023605403e-06, + "loss": 0.6521, + "step": 1698 + }, + { + "epoch": 0.3959142457666615, + "grad_norm": 0.34748896956443787, + "learning_rate": 4.952194438981014e-06, + "loss": 0.6709, + "step": 1699 + }, + { + "epoch": 0.3961472735746466, + "grad_norm": 0.3248806297779083, + "learning_rate": 4.9521348175593754e-06, + "loss": 0.6758, + "step": 1700 + }, + { + "epoch": 0.39638030138263164, + "grad_norm": 0.3407350480556488, + "learning_rate": 4.9520751593413795e-06, + "loss": 0.6881, + "step": 1701 + }, + { + "epoch": 0.39661332919061676, + "grad_norm": 0.3421764671802521, + "learning_rate": 4.952015464327922e-06, + "loss": 0.6817, + "step": 1702 + }, + { + "epoch": 0.3968463569986018, + "grad_norm": 0.34055161476135254, + "learning_rate": 4.951955732519902e-06, + "loss": 0.6345, + "step": 1703 + }, + { + "epoch": 0.39707938480658694, + "grad_norm": 0.32435688376426697, + "learning_rate": 4.9518959639182115e-06, + "loss": 0.6671, + "step": 1704 + }, + { + "epoch": 0.397312412614572, + "grad_norm": 0.3469674587249756, + "learning_rate": 4.95183615852375e-06, + "loss": 0.6674, + "step": 1705 + }, + { + "epoch": 0.3975454404225571, + "grad_norm": 0.3325175344944, + "learning_rate": 4.951776316337414e-06, + "loss": 0.653, + "step": 1706 + }, + { + "epoch": 0.3977784682305422, + "grad_norm": 0.34760206937789917, + "learning_rate": 4.9517164373601e-06, + "loss": 0.6614, + "step": 1707 + }, + { + "epoch": 0.39801149603852726, + "grad_norm": 0.3471429646015167, + "learning_rate": 4.951656521592709e-06, + "loss": 0.6822, + "step": 1708 + }, + { + "epoch": 0.3982445238465124, + "grad_norm": 0.3425239622592926, + "learning_rate": 4.951596569036139e-06, + "loss": 0.6516, + "step": 1709 + }, + { + "epoch": 0.39847755165449744, + "grad_norm": 0.33928027749061584, + "learning_rate": 4.9515365796912904e-06, + "loss": 0.6659, + "step": 1710 + }, + { + "epoch": 0.3987105794624825, + "grad_norm": 0.34166333079338074, + "learning_rate": 4.951476553559063e-06, + "loss": 0.689, + "step": 1711 + }, + { + "epoch": 0.3989436072704676, + "grad_norm": 0.35232672095298767, + "learning_rate": 4.951416490640356e-06, + "loss": 0.6538, + "step": 1712 + }, + { + "epoch": 0.3991766350784527, + "grad_norm": 0.33543989062309265, + "learning_rate": 4.951356390936073e-06, + "loss": 0.6728, + "step": 1713 + }, + { + "epoch": 0.39940966288643776, + "grad_norm": 0.3472687304019928, + "learning_rate": 4.951296254447115e-06, + "loss": 0.6604, + "step": 1714 + }, + { + "epoch": 0.3996426906944229, + "grad_norm": 0.3372616171836853, + "learning_rate": 4.951236081174384e-06, + "loss": 0.6574, + "step": 1715 + }, + { + "epoch": 0.39987571850240794, + "grad_norm": 0.3572330176830292, + "learning_rate": 4.951175871118783e-06, + "loss": 0.6696, + "step": 1716 + }, + { + "epoch": 0.40010874631039306, + "grad_norm": 0.34363383054733276, + "learning_rate": 4.951115624281216e-06, + "loss": 0.6908, + "step": 1717 + }, + { + "epoch": 0.4003417741183781, + "grad_norm": 0.3367461562156677, + "learning_rate": 4.9510553406625866e-06, + "loss": 0.6836, + "step": 1718 + }, + { + "epoch": 0.4005748019263632, + "grad_norm": 0.338622510433197, + "learning_rate": 4.9509950202638e-06, + "loss": 0.6393, + "step": 1719 + }, + { + "epoch": 0.4008078297343483, + "grad_norm": 0.352492094039917, + "learning_rate": 4.95093466308576e-06, + "loss": 0.6822, + "step": 1720 + }, + { + "epoch": 0.4010408575423334, + "grad_norm": 0.33983314037323, + "learning_rate": 4.950874269129374e-06, + "loss": 0.6459, + "step": 1721 + }, + { + "epoch": 0.4012738853503185, + "grad_norm": 0.35303840041160583, + "learning_rate": 4.950813838395547e-06, + "loss": 0.6603, + "step": 1722 + }, + { + "epoch": 0.40150691315830356, + "grad_norm": 0.35449764132499695, + "learning_rate": 4.950753370885188e-06, + "loss": 0.6485, + "step": 1723 + }, + { + "epoch": 0.4017399409662886, + "grad_norm": 0.3749799430370331, + "learning_rate": 4.9506928665992005e-06, + "loss": 0.649, + "step": 1724 + }, + { + "epoch": 0.40197296877427374, + "grad_norm": 0.3282085061073303, + "learning_rate": 4.950632325538496e-06, + "loss": 0.664, + "step": 1725 + }, + { + "epoch": 0.4022059965822588, + "grad_norm": 0.3568965196609497, + "learning_rate": 4.950571747703981e-06, + "loss": 0.6772, + "step": 1726 + }, + { + "epoch": 0.4024390243902439, + "grad_norm": 0.4035707712173462, + "learning_rate": 4.950511133096565e-06, + "loss": 0.6752, + "step": 1727 + }, + { + "epoch": 0.402672052198229, + "grad_norm": 0.3327253460884094, + "learning_rate": 4.950450481717159e-06, + "loss": 0.6244, + "step": 1728 + }, + { + "epoch": 0.40290508000621406, + "grad_norm": 0.355476975440979, + "learning_rate": 4.95038979356667e-06, + "loss": 0.6619, + "step": 1729 + }, + { + "epoch": 0.4031381078141992, + "grad_norm": 0.3695465624332428, + "learning_rate": 4.950329068646011e-06, + "loss": 0.6491, + "step": 1730 + }, + { + "epoch": 0.40337113562218424, + "grad_norm": 0.3560361862182617, + "learning_rate": 4.950268306956093e-06, + "loss": 0.688, + "step": 1731 + }, + { + "epoch": 0.40360416343016936, + "grad_norm": 0.3342687487602234, + "learning_rate": 4.950207508497828e-06, + "loss": 0.6845, + "step": 1732 + }, + { + "epoch": 0.4038371912381544, + "grad_norm": 0.35703787207603455, + "learning_rate": 4.950146673272127e-06, + "loss": 0.6784, + "step": 1733 + }, + { + "epoch": 0.4040702190461395, + "grad_norm": 0.3340420722961426, + "learning_rate": 4.950085801279904e-06, + "loss": 0.6821, + "step": 1734 + }, + { + "epoch": 0.4043032468541246, + "grad_norm": 0.33699601888656616, + "learning_rate": 4.9500248925220714e-06, + "loss": 0.6514, + "step": 1735 + }, + { + "epoch": 0.4045362746621097, + "grad_norm": 0.3259906768798828, + "learning_rate": 4.949963946999545e-06, + "loss": 0.6425, + "step": 1736 + }, + { + "epoch": 0.40476930247009474, + "grad_norm": 0.32370150089263916, + "learning_rate": 4.949902964713238e-06, + "loss": 0.6383, + "step": 1737 + }, + { + "epoch": 0.40500233027807986, + "grad_norm": 0.349104106426239, + "learning_rate": 4.949841945664066e-06, + "loss": 0.668, + "step": 1738 + }, + { + "epoch": 0.4052353580860649, + "grad_norm": 0.3610500991344452, + "learning_rate": 4.949780889852943e-06, + "loss": 0.7159, + "step": 1739 + }, + { + "epoch": 0.40546838589405004, + "grad_norm": 0.3394101560115814, + "learning_rate": 4.949719797280788e-06, + "loss": 0.621, + "step": 1740 + }, + { + "epoch": 0.4057014137020351, + "grad_norm": 0.3298831880092621, + "learning_rate": 4.949658667948516e-06, + "loss": 0.6547, + "step": 1741 + }, + { + "epoch": 0.40593444151002017, + "grad_norm": 0.37258607149124146, + "learning_rate": 4.949597501857045e-06, + "loss": 0.6393, + "step": 1742 + }, + { + "epoch": 0.4061674693180053, + "grad_norm": 0.33922743797302246, + "learning_rate": 4.949536299007292e-06, + "loss": 0.6869, + "step": 1743 + }, + { + "epoch": 0.40640049712599036, + "grad_norm": 0.3323351740837097, + "learning_rate": 4.9494750594001765e-06, + "loss": 0.6675, + "step": 1744 + }, + { + "epoch": 0.4066335249339755, + "grad_norm": 0.3612413704395294, + "learning_rate": 4.9494137830366165e-06, + "loss": 0.6758, + "step": 1745 + }, + { + "epoch": 0.40686655274196054, + "grad_norm": 0.3318728804588318, + "learning_rate": 4.949352469917531e-06, + "loss": 0.6737, + "step": 1746 + }, + { + "epoch": 0.4070995805499456, + "grad_norm": 0.3428286910057068, + "learning_rate": 4.949291120043842e-06, + "loss": 0.656, + "step": 1747 + }, + { + "epoch": 0.4073326083579307, + "grad_norm": 0.3430292010307312, + "learning_rate": 4.9492297334164685e-06, + "loss": 0.6621, + "step": 1748 + }, + { + "epoch": 0.4075656361659158, + "grad_norm": 0.3563692271709442, + "learning_rate": 4.949168310036332e-06, + "loss": 0.6875, + "step": 1749 + }, + { + "epoch": 0.4077986639739009, + "grad_norm": 0.3334141671657562, + "learning_rate": 4.9491068499043546e-06, + "loss": 0.6764, + "step": 1750 + }, + { + "epoch": 0.408031691781886, + "grad_norm": 0.334791898727417, + "learning_rate": 4.949045353021459e-06, + "loss": 0.6989, + "step": 1751 + }, + { + "epoch": 0.40826471958987104, + "grad_norm": 0.3507644534111023, + "learning_rate": 4.9489838193885664e-06, + "loss": 0.6714, + "step": 1752 + }, + { + "epoch": 0.40849774739785616, + "grad_norm": 0.3352418541908264, + "learning_rate": 4.948922249006602e-06, + "loss": 0.6959, + "step": 1753 + }, + { + "epoch": 0.4087307752058412, + "grad_norm": 0.3390520513057709, + "learning_rate": 4.948860641876488e-06, + "loss": 0.6852, + "step": 1754 + }, + { + "epoch": 0.40896380301382634, + "grad_norm": 0.34767231345176697, + "learning_rate": 4.94879899799915e-06, + "loss": 0.6682, + "step": 1755 + }, + { + "epoch": 0.4091968308218114, + "grad_norm": 0.31935757398605347, + "learning_rate": 4.948737317375513e-06, + "loss": 0.6861, + "step": 1756 + }, + { + "epoch": 0.4094298586297965, + "grad_norm": 0.3387490510940552, + "learning_rate": 4.948675600006502e-06, + "loss": 0.6432, + "step": 1757 + }, + { + "epoch": 0.4096628864377816, + "grad_norm": 0.3330536484718323, + "learning_rate": 4.9486138458930434e-06, + "loss": 0.683, + "step": 1758 + }, + { + "epoch": 0.40989591424576666, + "grad_norm": 0.3468201160430908, + "learning_rate": 4.948552055036063e-06, + "loss": 0.6708, + "step": 1759 + }, + { + "epoch": 0.4101289420537517, + "grad_norm": 0.3471274673938751, + "learning_rate": 4.948490227436491e-06, + "loss": 0.6589, + "step": 1760 + }, + { + "epoch": 0.41036196986173684, + "grad_norm": 0.3250395953655243, + "learning_rate": 4.948428363095251e-06, + "loss": 0.6488, + "step": 1761 + }, + { + "epoch": 0.4105949976697219, + "grad_norm": 0.3314000368118286, + "learning_rate": 4.948366462013274e-06, + "loss": 0.6662, + "step": 1762 + }, + { + "epoch": 0.410828025477707, + "grad_norm": 0.34250009059906006, + "learning_rate": 4.948304524191489e-06, + "loss": 0.684, + "step": 1763 + }, + { + "epoch": 0.4110610532856921, + "grad_norm": 0.3585408329963684, + "learning_rate": 4.948242549630824e-06, + "loss": 0.6549, + "step": 1764 + }, + { + "epoch": 0.41129408109367716, + "grad_norm": 0.3291746973991394, + "learning_rate": 4.94818053833221e-06, + "loss": 0.6383, + "step": 1765 + }, + { + "epoch": 0.4115271089016623, + "grad_norm": 0.34441739320755005, + "learning_rate": 4.948118490296578e-06, + "loss": 0.6417, + "step": 1766 + }, + { + "epoch": 0.41176013670964734, + "grad_norm": 0.34976986050605774, + "learning_rate": 4.948056405524857e-06, + "loss": 0.6671, + "step": 1767 + }, + { + "epoch": 0.41199316451763246, + "grad_norm": 0.346294641494751, + "learning_rate": 4.947994284017981e-06, + "loss": 0.6829, + "step": 1768 + }, + { + "epoch": 0.4122261923256175, + "grad_norm": 0.34444451332092285, + "learning_rate": 4.94793212577688e-06, + "loss": 0.6735, + "step": 1769 + }, + { + "epoch": 0.4124592201336026, + "grad_norm": 0.3410640358924866, + "learning_rate": 4.947869930802489e-06, + "loss": 0.6556, + "step": 1770 + }, + { + "epoch": 0.4126922479415877, + "grad_norm": 0.34041473269462585, + "learning_rate": 4.947807699095739e-06, + "loss": 0.665, + "step": 1771 + }, + { + "epoch": 0.4129252757495728, + "grad_norm": 0.33902254700660706, + "learning_rate": 4.947745430657566e-06, + "loss": 0.6939, + "step": 1772 + }, + { + "epoch": 0.4131583035575579, + "grad_norm": 0.350528359413147, + "learning_rate": 4.947683125488903e-06, + "loss": 0.6649, + "step": 1773 + }, + { + "epoch": 0.41339133136554296, + "grad_norm": 0.3637930750846863, + "learning_rate": 4.947620783590686e-06, + "loss": 0.6715, + "step": 1774 + }, + { + "epoch": 0.413624359173528, + "grad_norm": 0.32345283031463623, + "learning_rate": 4.947558404963848e-06, + "loss": 0.6682, + "step": 1775 + }, + { + "epoch": 0.41385738698151314, + "grad_norm": 0.33449557423591614, + "learning_rate": 4.947495989609329e-06, + "loss": 0.6598, + "step": 1776 + }, + { + "epoch": 0.4140904147894982, + "grad_norm": 0.35374686121940613, + "learning_rate": 4.947433537528062e-06, + "loss": 0.6692, + "step": 1777 + }, + { + "epoch": 0.4143234425974833, + "grad_norm": 0.3391699194908142, + "learning_rate": 4.947371048720986e-06, + "loss": 0.6673, + "step": 1778 + }, + { + "epoch": 0.4145564704054684, + "grad_norm": 0.3354116380214691, + "learning_rate": 4.947308523189038e-06, + "loss": 0.6702, + "step": 1779 + }, + { + "epoch": 0.41478949821345346, + "grad_norm": 0.34017205238342285, + "learning_rate": 4.947245960933157e-06, + "loss": 0.7073, + "step": 1780 + }, + { + "epoch": 0.4150225260214386, + "grad_norm": 0.34678107500076294, + "learning_rate": 4.947183361954282e-06, + "loss": 0.6903, + "step": 1781 + }, + { + "epoch": 0.41525555382942364, + "grad_norm": 0.345950186252594, + "learning_rate": 4.9471207262533506e-06, + "loss": 0.6454, + "step": 1782 + }, + { + "epoch": 0.4154885816374087, + "grad_norm": 0.3504173159599304, + "learning_rate": 4.947058053831304e-06, + "loss": 0.6576, + "step": 1783 + }, + { + "epoch": 0.4157216094453938, + "grad_norm": 0.3447764813899994, + "learning_rate": 4.946995344689083e-06, + "loss": 0.6762, + "step": 1784 + }, + { + "epoch": 0.4159546372533789, + "grad_norm": 0.36346936225891113, + "learning_rate": 4.946932598827627e-06, + "loss": 0.6922, + "step": 1785 + }, + { + "epoch": 0.416187665061364, + "grad_norm": 0.35050731897354126, + "learning_rate": 4.94686981624788e-06, + "loss": 0.6645, + "step": 1786 + }, + { + "epoch": 0.4164206928693491, + "grad_norm": 0.32814982533454895, + "learning_rate": 4.946806996950781e-06, + "loss": 0.6461, + "step": 1787 + }, + { + "epoch": 0.41665372067733414, + "grad_norm": 0.35874247550964355, + "learning_rate": 4.946744140937276e-06, + "loss": 0.6741, + "step": 1788 + }, + { + "epoch": 0.41688674848531926, + "grad_norm": 0.34394165873527527, + "learning_rate": 4.946681248208306e-06, + "loss": 0.6551, + "step": 1789 + }, + { + "epoch": 0.4171197762933043, + "grad_norm": 0.32596510648727417, + "learning_rate": 4.946618318764816e-06, + "loss": 0.6638, + "step": 1790 + }, + { + "epoch": 0.41735280410128944, + "grad_norm": 0.3426658809185028, + "learning_rate": 4.946555352607748e-06, + "loss": 0.7024, + "step": 1791 + }, + { + "epoch": 0.4175858319092745, + "grad_norm": 0.327951043844223, + "learning_rate": 4.94649234973805e-06, + "loss": 0.6749, + "step": 1792 + }, + { + "epoch": 0.41781885971725957, + "grad_norm": 0.34522995352745056, + "learning_rate": 4.946429310156665e-06, + "loss": 0.6497, + "step": 1793 + }, + { + "epoch": 0.4180518875252447, + "grad_norm": 0.34774723649024963, + "learning_rate": 4.94636623386454e-06, + "loss": 0.6816, + "step": 1794 + }, + { + "epoch": 0.41828491533322976, + "grad_norm": 0.3330458402633667, + "learning_rate": 4.9463031208626224e-06, + "loss": 0.6635, + "step": 1795 + }, + { + "epoch": 0.4185179431412149, + "grad_norm": 0.33796951174736023, + "learning_rate": 4.946239971151857e-06, + "loss": 0.6679, + "step": 1796 + }, + { + "epoch": 0.41875097094919994, + "grad_norm": 0.3397925794124603, + "learning_rate": 4.946176784733194e-06, + "loss": 0.6535, + "step": 1797 + }, + { + "epoch": 0.418983998757185, + "grad_norm": 0.34677398204803467, + "learning_rate": 4.94611356160758e-06, + "loss": 0.6604, + "step": 1798 + }, + { + "epoch": 0.4192170265651701, + "grad_norm": 0.34160876274108887, + "learning_rate": 4.946050301775963e-06, + "loss": 0.6547, + "step": 1799 + }, + { + "epoch": 0.4194500543731552, + "grad_norm": 0.3424573540687561, + "learning_rate": 4.9459870052392935e-06, + "loss": 0.6443, + "step": 1800 + }, + { + "epoch": 0.4196830821811403, + "grad_norm": 0.34834882616996765, + "learning_rate": 4.945923671998521e-06, + "loss": 0.6713, + "step": 1801 + }, + { + "epoch": 0.4199161099891254, + "grad_norm": 0.3429913818836212, + "learning_rate": 4.945860302054596e-06, + "loss": 0.6906, + "step": 1802 + }, + { + "epoch": 0.42014913779711044, + "grad_norm": 0.3339388072490692, + "learning_rate": 4.945796895408469e-06, + "loss": 0.6894, + "step": 1803 + }, + { + "epoch": 0.42038216560509556, + "grad_norm": 0.33159393072128296, + "learning_rate": 4.945733452061093e-06, + "loss": 0.6551, + "step": 1804 + }, + { + "epoch": 0.4206151934130806, + "grad_norm": 0.34279656410217285, + "learning_rate": 4.9456699720134185e-06, + "loss": 0.6567, + "step": 1805 + }, + { + "epoch": 0.4208482212210657, + "grad_norm": 0.3406655192375183, + "learning_rate": 4.945606455266398e-06, + "loss": 0.6534, + "step": 1806 + }, + { + "epoch": 0.4210812490290508, + "grad_norm": 0.3469056785106659, + "learning_rate": 4.945542901820984e-06, + "loss": 0.6475, + "step": 1807 + }, + { + "epoch": 0.4213142768370359, + "grad_norm": 0.32995009422302246, + "learning_rate": 4.945479311678133e-06, + "loss": 0.642, + "step": 1808 + }, + { + "epoch": 0.421547304645021, + "grad_norm": 0.33879077434539795, + "learning_rate": 4.945415684838796e-06, + "loss": 0.6548, + "step": 1809 + }, + { + "epoch": 0.42178033245300606, + "grad_norm": 0.34693577885627747, + "learning_rate": 4.94535202130393e-06, + "loss": 0.6809, + "step": 1810 + }, + { + "epoch": 0.4220133602609911, + "grad_norm": 0.3340616524219513, + "learning_rate": 4.945288321074489e-06, + "loss": 0.6581, + "step": 1811 + }, + { + "epoch": 0.42224638806897624, + "grad_norm": 0.35153448581695557, + "learning_rate": 4.94522458415143e-06, + "loss": 0.6715, + "step": 1812 + }, + { + "epoch": 0.4224794158769613, + "grad_norm": 0.3310188055038452, + "learning_rate": 4.945160810535708e-06, + "loss": 0.6648, + "step": 1813 + }, + { + "epoch": 0.4227124436849464, + "grad_norm": 0.3211494982242584, + "learning_rate": 4.945097000228282e-06, + "loss": 0.6332, + "step": 1814 + }, + { + "epoch": 0.4229454714929315, + "grad_norm": 0.34392520785331726, + "learning_rate": 4.945033153230108e-06, + "loss": 0.6537, + "step": 1815 + }, + { + "epoch": 0.42317849930091656, + "grad_norm": 0.3541039824485779, + "learning_rate": 4.944969269542144e-06, + "loss": 0.6471, + "step": 1816 + }, + { + "epoch": 0.4234115271089017, + "grad_norm": 0.32842588424682617, + "learning_rate": 4.944905349165349e-06, + "loss": 0.6574, + "step": 1817 + }, + { + "epoch": 0.42364455491688674, + "grad_norm": 0.32187607884407043, + "learning_rate": 4.9448413921006824e-06, + "loss": 0.6376, + "step": 1818 + }, + { + "epoch": 0.42387758272487186, + "grad_norm": 0.32731229066848755, + "learning_rate": 4.944777398349103e-06, + "loss": 0.6492, + "step": 1819 + }, + { + "epoch": 0.4241106105328569, + "grad_norm": 0.33163848519325256, + "learning_rate": 4.944713367911573e-06, + "loss": 0.6311, + "step": 1820 + }, + { + "epoch": 0.424343638340842, + "grad_norm": 0.34121736884117126, + "learning_rate": 4.944649300789052e-06, + "loss": 0.6882, + "step": 1821 + }, + { + "epoch": 0.4245766661488271, + "grad_norm": 0.3194497227668762, + "learning_rate": 4.9445851969825e-06, + "loss": 0.6665, + "step": 1822 + }, + { + "epoch": 0.4248096939568122, + "grad_norm": 0.32579803466796875, + "learning_rate": 4.944521056492882e-06, + "loss": 0.6632, + "step": 1823 + }, + { + "epoch": 0.4250427217647973, + "grad_norm": 0.3217863142490387, + "learning_rate": 4.9444568793211574e-06, + "loss": 0.6788, + "step": 1824 + }, + { + "epoch": 0.42527574957278236, + "grad_norm": 0.3432362377643585, + "learning_rate": 4.944392665468292e-06, + "loss": 0.6777, + "step": 1825 + }, + { + "epoch": 0.4255087773807674, + "grad_norm": 0.35607829689979553, + "learning_rate": 4.944328414935247e-06, + "loss": 0.6407, + "step": 1826 + }, + { + "epoch": 0.42574180518875254, + "grad_norm": 0.3298324942588806, + "learning_rate": 4.944264127722989e-06, + "loss": 0.6515, + "step": 1827 + }, + { + "epoch": 0.4259748329967376, + "grad_norm": 0.3370838761329651, + "learning_rate": 4.9441998038324795e-06, + "loss": 0.6483, + "step": 1828 + }, + { + "epoch": 0.42620786080472267, + "grad_norm": 0.3540722727775574, + "learning_rate": 4.9441354432646865e-06, + "loss": 0.6987, + "step": 1829 + }, + { + "epoch": 0.4264408886127078, + "grad_norm": 0.37742921710014343, + "learning_rate": 4.9440710460205745e-06, + "loss": 0.636, + "step": 1830 + }, + { + "epoch": 0.42667391642069286, + "grad_norm": 0.3406807482242584, + "learning_rate": 4.944006612101111e-06, + "loss": 0.6353, + "step": 1831 + }, + { + "epoch": 0.426906944228678, + "grad_norm": 0.3647693693637848, + "learning_rate": 4.943942141507261e-06, + "loss": 0.6539, + "step": 1832 + }, + { + "epoch": 0.42713997203666304, + "grad_norm": 0.3553480803966522, + "learning_rate": 4.943877634239993e-06, + "loss": 0.6876, + "step": 1833 + }, + { + "epoch": 0.4273729998446481, + "grad_norm": 0.3423396050930023, + "learning_rate": 4.943813090300275e-06, + "loss": 0.6718, + "step": 1834 + }, + { + "epoch": 0.4276060276526332, + "grad_norm": 0.3400339186191559, + "learning_rate": 4.943748509689076e-06, + "loss": 0.6716, + "step": 1835 + }, + { + "epoch": 0.4278390554606183, + "grad_norm": 0.356384813785553, + "learning_rate": 4.943683892407364e-06, + "loss": 0.671, + "step": 1836 + }, + { + "epoch": 0.4280720832686034, + "grad_norm": 0.34584617614746094, + "learning_rate": 4.9436192384561096e-06, + "loss": 0.6823, + "step": 1837 + }, + { + "epoch": 0.4283051110765885, + "grad_norm": 0.332420289516449, + "learning_rate": 4.943554547836282e-06, + "loss": 0.649, + "step": 1838 + }, + { + "epoch": 0.42853813888457354, + "grad_norm": 0.3330012261867523, + "learning_rate": 4.943489820548854e-06, + "loss": 0.6695, + "step": 1839 + }, + { + "epoch": 0.42877116669255866, + "grad_norm": 0.3482713997364044, + "learning_rate": 4.9434250565947935e-06, + "loss": 0.6622, + "step": 1840 + }, + { + "epoch": 0.4290041945005437, + "grad_norm": 0.33144357800483704, + "learning_rate": 4.943360255975075e-06, + "loss": 0.701, + "step": 1841 + }, + { + "epoch": 0.42923722230852884, + "grad_norm": 0.3374062180519104, + "learning_rate": 4.9432954186906705e-06, + "loss": 0.6619, + "step": 1842 + }, + { + "epoch": 0.4294702501165139, + "grad_norm": 0.32982903718948364, + "learning_rate": 4.9432305447425525e-06, + "loss": 0.6533, + "step": 1843 + }, + { + "epoch": 0.42970327792449897, + "grad_norm": 0.33117732405662537, + "learning_rate": 4.943165634131695e-06, + "loss": 0.685, + "step": 1844 + }, + { + "epoch": 0.4299363057324841, + "grad_norm": 0.33971142768859863, + "learning_rate": 4.9431006868590704e-06, + "loss": 0.6699, + "step": 1845 + }, + { + "epoch": 0.43016933354046916, + "grad_norm": 0.3486189544200897, + "learning_rate": 4.943035702925655e-06, + "loss": 0.6684, + "step": 1846 + }, + { + "epoch": 0.4304023613484543, + "grad_norm": 0.34855806827545166, + "learning_rate": 4.942970682332423e-06, + "loss": 0.691, + "step": 1847 + }, + { + "epoch": 0.43063538915643934, + "grad_norm": 0.3541366755962372, + "learning_rate": 4.942905625080352e-06, + "loss": 0.6703, + "step": 1848 + }, + { + "epoch": 0.4308684169644244, + "grad_norm": 0.35368266701698303, + "learning_rate": 4.9428405311704155e-06, + "loss": 0.6497, + "step": 1849 + }, + { + "epoch": 0.4311014447724095, + "grad_norm": 0.31725722551345825, + "learning_rate": 4.942775400603592e-06, + "loss": 0.6687, + "step": 1850 + }, + { + "epoch": 0.4313344725803946, + "grad_norm": 0.3506354093551636, + "learning_rate": 4.942710233380858e-06, + "loss": 0.6794, + "step": 1851 + }, + { + "epoch": 0.43156750038837965, + "grad_norm": 0.33650723099708557, + "learning_rate": 4.942645029503192e-06, + "loss": 0.652, + "step": 1852 + }, + { + "epoch": 0.4318005281963648, + "grad_norm": 0.34796279668807983, + "learning_rate": 4.942579788971572e-06, + "loss": 0.6792, + "step": 1853 + }, + { + "epoch": 0.43203355600434984, + "grad_norm": 0.3405900299549103, + "learning_rate": 4.942514511786978e-06, + "loss": 0.6708, + "step": 1854 + }, + { + "epoch": 0.43226658381233496, + "grad_norm": 0.3173640966415405, + "learning_rate": 4.942449197950388e-06, + "loss": 0.6425, + "step": 1855 + }, + { + "epoch": 0.43249961162032, + "grad_norm": 0.3378995358943939, + "learning_rate": 4.942383847462783e-06, + "loss": 0.6693, + "step": 1856 + }, + { + "epoch": 0.4327326394283051, + "grad_norm": 0.32878416776657104, + "learning_rate": 4.9423184603251435e-06, + "loss": 0.6485, + "step": 1857 + }, + { + "epoch": 0.4329656672362902, + "grad_norm": 0.3428669273853302, + "learning_rate": 4.942253036538451e-06, + "loss": 0.6754, + "step": 1858 + }, + { + "epoch": 0.4331986950442753, + "grad_norm": 0.34098395705223083, + "learning_rate": 4.942187576103687e-06, + "loss": 0.6696, + "step": 1859 + }, + { + "epoch": 0.4334317228522604, + "grad_norm": 0.3321053087711334, + "learning_rate": 4.942122079021833e-06, + "loss": 0.6388, + "step": 1860 + }, + { + "epoch": 0.43366475066024546, + "grad_norm": 0.344250351190567, + "learning_rate": 4.942056545293872e-06, + "loss": 0.6666, + "step": 1861 + }, + { + "epoch": 0.4338977784682305, + "grad_norm": 0.332223117351532, + "learning_rate": 4.941990974920789e-06, + "loss": 0.678, + "step": 1862 + }, + { + "epoch": 0.43413080627621564, + "grad_norm": 0.33907264471054077, + "learning_rate": 4.941925367903567e-06, + "loss": 0.6607, + "step": 1863 + }, + { + "epoch": 0.4343638340842007, + "grad_norm": 0.3244437277317047, + "learning_rate": 4.94185972424319e-06, + "loss": 0.6459, + "step": 1864 + }, + { + "epoch": 0.4345968618921858, + "grad_norm": 0.33086422085762024, + "learning_rate": 4.941794043940643e-06, + "loss": 0.6654, + "step": 1865 + }, + { + "epoch": 0.4348298897001709, + "grad_norm": 0.3431732654571533, + "learning_rate": 4.941728326996912e-06, + "loss": 0.656, + "step": 1866 + }, + { + "epoch": 0.43506291750815596, + "grad_norm": 0.33987000584602356, + "learning_rate": 4.941662573412983e-06, + "loss": 0.6691, + "step": 1867 + }, + { + "epoch": 0.4352959453161411, + "grad_norm": 0.35245755314826965, + "learning_rate": 4.941596783189844e-06, + "loss": 0.6392, + "step": 1868 + }, + { + "epoch": 0.43552897312412614, + "grad_norm": 0.3439905643463135, + "learning_rate": 4.9415309563284795e-06, + "loss": 0.6605, + "step": 1869 + }, + { + "epoch": 0.4357620009321112, + "grad_norm": 0.35251450538635254, + "learning_rate": 4.9414650928298806e-06, + "loss": 0.6837, + "step": 1870 + }, + { + "epoch": 0.4359950287400963, + "grad_norm": 0.34666281938552856, + "learning_rate": 4.941399192695032e-06, + "loss": 0.6751, + "step": 1871 + }, + { + "epoch": 0.4362280565480814, + "grad_norm": 0.341030478477478, + "learning_rate": 4.941333255924926e-06, + "loss": 0.6476, + "step": 1872 + }, + { + "epoch": 0.4364610843560665, + "grad_norm": 0.33805760741233826, + "learning_rate": 4.9412672825205485e-06, + "loss": 0.6623, + "step": 1873 + }, + { + "epoch": 0.4366941121640516, + "grad_norm": 0.34336331486701965, + "learning_rate": 4.941201272482893e-06, + "loss": 0.6946, + "step": 1874 + }, + { + "epoch": 0.43692713997203664, + "grad_norm": 0.34336891770362854, + "learning_rate": 4.9411352258129485e-06, + "loss": 0.6883, + "step": 1875 + }, + { + "epoch": 0.43716016778002176, + "grad_norm": 0.35696253180503845, + "learning_rate": 4.941069142511706e-06, + "loss": 0.6663, + "step": 1876 + }, + { + "epoch": 0.4373931955880068, + "grad_norm": 0.33432096242904663, + "learning_rate": 4.941003022580157e-06, + "loss": 0.6689, + "step": 1877 + }, + { + "epoch": 0.43762622339599194, + "grad_norm": 0.3365829885005951, + "learning_rate": 4.940936866019294e-06, + "loss": 0.6651, + "step": 1878 + }, + { + "epoch": 0.437859251203977, + "grad_norm": 0.35070231556892395, + "learning_rate": 4.940870672830109e-06, + "loss": 0.6874, + "step": 1879 + }, + { + "epoch": 0.43809227901196207, + "grad_norm": 0.34392017126083374, + "learning_rate": 4.940804443013597e-06, + "loss": 0.6746, + "step": 1880 + }, + { + "epoch": 0.4383253068199472, + "grad_norm": 0.35446518659591675, + "learning_rate": 4.940738176570751e-06, + "loss": 0.6711, + "step": 1881 + }, + { + "epoch": 0.43855833462793226, + "grad_norm": 0.35856354236602783, + "learning_rate": 4.940671873502564e-06, + "loss": 0.6506, + "step": 1882 + }, + { + "epoch": 0.4387913624359174, + "grad_norm": 0.3437099754810333, + "learning_rate": 4.940605533810032e-06, + "loss": 0.6408, + "step": 1883 + }, + { + "epoch": 0.43902439024390244, + "grad_norm": 0.32710912823677063, + "learning_rate": 4.940539157494152e-06, + "loss": 0.6738, + "step": 1884 + }, + { + "epoch": 0.4392574180518875, + "grad_norm": 0.3601798415184021, + "learning_rate": 4.940472744555918e-06, + "loss": 0.6571, + "step": 1885 + }, + { + "epoch": 0.4394904458598726, + "grad_norm": 0.3446096181869507, + "learning_rate": 4.940406294996326e-06, + "loss": 0.7064, + "step": 1886 + }, + { + "epoch": 0.4397234736678577, + "grad_norm": 0.3344641625881195, + "learning_rate": 4.940339808816376e-06, + "loss": 0.653, + "step": 1887 + }, + { + "epoch": 0.4399565014758428, + "grad_norm": 0.3304932415485382, + "learning_rate": 4.940273286017063e-06, + "loss": 0.6673, + "step": 1888 + }, + { + "epoch": 0.4401895292838279, + "grad_norm": 0.32900694012641907, + "learning_rate": 4.9402067265993865e-06, + "loss": 0.6385, + "step": 1889 + }, + { + "epoch": 0.44042255709181294, + "grad_norm": 0.3487311005592346, + "learning_rate": 4.940140130564345e-06, + "loss": 0.6605, + "step": 1890 + }, + { + "epoch": 0.44065558489979806, + "grad_norm": 0.34781455993652344, + "learning_rate": 4.940073497912938e-06, + "loss": 0.6861, + "step": 1891 + }, + { + "epoch": 0.4408886127077831, + "grad_norm": 0.34696295857429504, + "learning_rate": 4.940006828646166e-06, + "loss": 0.6498, + "step": 1892 + }, + { + "epoch": 0.4411216405157682, + "grad_norm": 0.3341001868247986, + "learning_rate": 4.939940122765027e-06, + "loss": 0.6383, + "step": 1893 + }, + { + "epoch": 0.4413546683237533, + "grad_norm": 0.3485175669193268, + "learning_rate": 4.939873380270525e-06, + "loss": 0.6698, + "step": 1894 + }, + { + "epoch": 0.44158769613173837, + "grad_norm": 0.33241575956344604, + "learning_rate": 4.939806601163659e-06, + "loss": 0.6631, + "step": 1895 + }, + { + "epoch": 0.4418207239397235, + "grad_norm": 0.3525029718875885, + "learning_rate": 4.939739785445433e-06, + "loss": 0.6628, + "step": 1896 + }, + { + "epoch": 0.44205375174770856, + "grad_norm": 0.3264833986759186, + "learning_rate": 4.939672933116848e-06, + "loss": 0.6632, + "step": 1897 + }, + { + "epoch": 0.4422867795556936, + "grad_norm": 0.349259614944458, + "learning_rate": 4.939606044178909e-06, + "loss": 0.6609, + "step": 1898 + }, + { + "epoch": 0.44251980736367874, + "grad_norm": 0.3453799784183502, + "learning_rate": 4.939539118632618e-06, + "loss": 0.6573, + "step": 1899 + }, + { + "epoch": 0.4427528351716638, + "grad_norm": 0.3277939260005951, + "learning_rate": 4.93947215647898e-06, + "loss": 0.6457, + "step": 1900 + }, + { + "epoch": 0.4429858629796489, + "grad_norm": 0.33755090832710266, + "learning_rate": 4.939405157719001e-06, + "loss": 0.6709, + "step": 1901 + }, + { + "epoch": 0.443218890787634, + "grad_norm": 0.3394086956977844, + "learning_rate": 4.939338122353685e-06, + "loss": 0.6589, + "step": 1902 + }, + { + "epoch": 0.44345191859561905, + "grad_norm": 0.3714866042137146, + "learning_rate": 4.939271050384037e-06, + "loss": 0.6897, + "step": 1903 + }, + { + "epoch": 0.4436849464036042, + "grad_norm": 0.3506304919719696, + "learning_rate": 4.939203941811066e-06, + "loss": 0.6664, + "step": 1904 + }, + { + "epoch": 0.44391797421158924, + "grad_norm": 0.34467026591300964, + "learning_rate": 4.939136796635777e-06, + "loss": 0.6616, + "step": 1905 + }, + { + "epoch": 0.44415100201957436, + "grad_norm": 0.34170782566070557, + "learning_rate": 4.939069614859178e-06, + "loss": 0.6696, + "step": 1906 + }, + { + "epoch": 0.4443840298275594, + "grad_norm": 0.323437362909317, + "learning_rate": 4.939002396482277e-06, + "loss": 0.649, + "step": 1907 + }, + { + "epoch": 0.4446170576355445, + "grad_norm": 0.33163708448410034, + "learning_rate": 4.938935141506084e-06, + "loss": 0.6723, + "step": 1908 + }, + { + "epoch": 0.4448500854435296, + "grad_norm": 0.35104507207870483, + "learning_rate": 4.938867849931607e-06, + "loss": 0.6863, + "step": 1909 + }, + { + "epoch": 0.4450831132515147, + "grad_norm": 0.3493117094039917, + "learning_rate": 4.938800521759855e-06, + "loss": 0.662, + "step": 1910 + }, + { + "epoch": 0.4453161410594998, + "grad_norm": 0.338409423828125, + "learning_rate": 4.93873315699184e-06, + "loss": 0.6276, + "step": 1911 + }, + { + "epoch": 0.44554916886748486, + "grad_norm": 0.34810155630111694, + "learning_rate": 4.938665755628572e-06, + "loss": 0.6795, + "step": 1912 + }, + { + "epoch": 0.4457821966754699, + "grad_norm": 0.34380555152893066, + "learning_rate": 4.938598317671064e-06, + "loss": 0.6722, + "step": 1913 + }, + { + "epoch": 0.44601522448345504, + "grad_norm": 0.32595911622047424, + "learning_rate": 4.938530843120325e-06, + "loss": 0.6691, + "step": 1914 + }, + { + "epoch": 0.4462482522914401, + "grad_norm": 0.32678142189979553, + "learning_rate": 4.938463331977369e-06, + "loss": 0.643, + "step": 1915 + }, + { + "epoch": 0.44648128009942517, + "grad_norm": 0.34854981303215027, + "learning_rate": 4.938395784243209e-06, + "loss": 0.6776, + "step": 1916 + }, + { + "epoch": 0.4467143079074103, + "grad_norm": 0.3612758219242096, + "learning_rate": 4.938328199918861e-06, + "loss": 0.6581, + "step": 1917 + }, + { + "epoch": 0.44694733571539536, + "grad_norm": 0.37052950263023376, + "learning_rate": 4.938260579005334e-06, + "loss": 0.6315, + "step": 1918 + }, + { + "epoch": 0.4471803635233805, + "grad_norm": 0.3535763919353485, + "learning_rate": 4.938192921503647e-06, + "loss": 0.6548, + "step": 1919 + }, + { + "epoch": 0.44741339133136554, + "grad_norm": 0.3444952070713043, + "learning_rate": 4.9381252274148136e-06, + "loss": 0.6735, + "step": 1920 + }, + { + "epoch": 0.4476464191393506, + "grad_norm": 0.3540029525756836, + "learning_rate": 4.93805749673985e-06, + "loss": 0.6762, + "step": 1921 + }, + { + "epoch": 0.4478794469473357, + "grad_norm": 0.3438248038291931, + "learning_rate": 4.9379897294797725e-06, + "loss": 0.6561, + "step": 1922 + }, + { + "epoch": 0.4481124747553208, + "grad_norm": 0.36112090945243835, + "learning_rate": 4.937921925635598e-06, + "loss": 0.6518, + "step": 1923 + }, + { + "epoch": 0.4483455025633059, + "grad_norm": 0.38106769323349, + "learning_rate": 4.937854085208344e-06, + "loss": 0.6372, + "step": 1924 + }, + { + "epoch": 0.448578530371291, + "grad_norm": 0.3448070287704468, + "learning_rate": 4.937786208199027e-06, + "loss": 0.6509, + "step": 1925 + }, + { + "epoch": 0.44881155817927604, + "grad_norm": 0.35030901432037354, + "learning_rate": 4.937718294608668e-06, + "loss": 0.6767, + "step": 1926 + }, + { + "epoch": 0.44904458598726116, + "grad_norm": 0.37124526500701904, + "learning_rate": 4.937650344438285e-06, + "loss": 0.6545, + "step": 1927 + }, + { + "epoch": 0.4492776137952462, + "grad_norm": 0.39224597811698914, + "learning_rate": 4.937582357688898e-06, + "loss": 0.6583, + "step": 1928 + }, + { + "epoch": 0.44951064160323134, + "grad_norm": 0.33804187178611755, + "learning_rate": 4.937514334361526e-06, + "loss": 0.6655, + "step": 1929 + }, + { + "epoch": 0.4497436694112164, + "grad_norm": 0.3457462191581726, + "learning_rate": 4.937446274457191e-06, + "loss": 0.6745, + "step": 1930 + }, + { + "epoch": 0.44997669721920147, + "grad_norm": 0.36189696192741394, + "learning_rate": 4.937378177976915e-06, + "loss": 0.6576, + "step": 1931 + }, + { + "epoch": 0.4502097250271866, + "grad_norm": 0.3467555642127991, + "learning_rate": 4.937310044921717e-06, + "loss": 0.7063, + "step": 1932 + }, + { + "epoch": 0.45044275283517166, + "grad_norm": 0.33450090885162354, + "learning_rate": 4.937241875292623e-06, + "loss": 0.6609, + "step": 1933 + }, + { + "epoch": 0.4506757806431568, + "grad_norm": 0.33516567945480347, + "learning_rate": 4.937173669090653e-06, + "loss": 0.6701, + "step": 1934 + }, + { + "epoch": 0.45090880845114184, + "grad_norm": 0.34392353892326355, + "learning_rate": 4.9371054263168315e-06, + "loss": 0.632, + "step": 1935 + }, + { + "epoch": 0.4511418362591269, + "grad_norm": 0.3397997319698334, + "learning_rate": 4.937037146972183e-06, + "loss": 0.6847, + "step": 1936 + }, + { + "epoch": 0.451374864067112, + "grad_norm": 0.3283977508544922, + "learning_rate": 4.9369688310577325e-06, + "loss": 0.664, + "step": 1937 + }, + { + "epoch": 0.4516078918750971, + "grad_norm": 0.35051071643829346, + "learning_rate": 4.936900478574503e-06, + "loss": 0.6402, + "step": 1938 + }, + { + "epoch": 0.45184091968308215, + "grad_norm": 0.33347511291503906, + "learning_rate": 4.936832089523522e-06, + "loss": 0.6858, + "step": 1939 + }, + { + "epoch": 0.4520739474910673, + "grad_norm": 0.3427417576313019, + "learning_rate": 4.936763663905816e-06, + "loss": 0.6651, + "step": 1940 + }, + { + "epoch": 0.45230697529905234, + "grad_norm": 0.36389806866645813, + "learning_rate": 4.93669520172241e-06, + "loss": 0.6512, + "step": 1941 + }, + { + "epoch": 0.45254000310703746, + "grad_norm": 0.3406047821044922, + "learning_rate": 4.9366267029743335e-06, + "loss": 0.6903, + "step": 1942 + }, + { + "epoch": 0.4527730309150225, + "grad_norm": 0.33689039945602417, + "learning_rate": 4.9365581676626126e-06, + "loss": 0.6766, + "step": 1943 + }, + { + "epoch": 0.4530060587230076, + "grad_norm": 0.34832045435905457, + "learning_rate": 4.9364895957882765e-06, + "loss": 0.6717, + "step": 1944 + }, + { + "epoch": 0.4532390865309927, + "grad_norm": 0.34105607867240906, + "learning_rate": 4.936420987352354e-06, + "loss": 0.6496, + "step": 1945 + }, + { + "epoch": 0.45347211433897777, + "grad_norm": 0.35318198800086975, + "learning_rate": 4.936352342355875e-06, + "loss": 0.662, + "step": 1946 + }, + { + "epoch": 0.4537051421469629, + "grad_norm": 0.33909276127815247, + "learning_rate": 4.9362836607998685e-06, + "loss": 0.6819, + "step": 1947 + }, + { + "epoch": 0.45393816995494796, + "grad_norm": 0.3597166836261749, + "learning_rate": 4.936214942685367e-06, + "loss": 0.6545, + "step": 1948 + }, + { + "epoch": 0.454171197762933, + "grad_norm": 0.3549489974975586, + "learning_rate": 4.9361461880133995e-06, + "loss": 0.6603, + "step": 1949 + }, + { + "epoch": 0.45440422557091814, + "grad_norm": 0.34495845437049866, + "learning_rate": 4.936077396785e-06, + "loss": 0.678, + "step": 1950 + }, + { + "epoch": 0.4546372533789032, + "grad_norm": 0.3443203568458557, + "learning_rate": 4.936008569001199e-06, + "loss": 0.6683, + "step": 1951 + }, + { + "epoch": 0.4548702811868883, + "grad_norm": 0.348441481590271, + "learning_rate": 4.93593970466303e-06, + "loss": 0.6847, + "step": 1952 + }, + { + "epoch": 0.4551033089948734, + "grad_norm": 0.3346801698207855, + "learning_rate": 4.9358708037715264e-06, + "loss": 0.6636, + "step": 1953 + }, + { + "epoch": 0.45533633680285845, + "grad_norm": 0.3380433917045593, + "learning_rate": 4.935801866327722e-06, + "loss": 0.6847, + "step": 1954 + }, + { + "epoch": 0.4555693646108436, + "grad_norm": 0.33647602796554565, + "learning_rate": 4.935732892332651e-06, + "loss": 0.6596, + "step": 1955 + }, + { + "epoch": 0.45580239241882864, + "grad_norm": 0.3503870368003845, + "learning_rate": 4.935663881787349e-06, + "loss": 0.6718, + "step": 1956 + }, + { + "epoch": 0.45603542022681376, + "grad_norm": 0.3643966615200043, + "learning_rate": 4.935594834692852e-06, + "loss": 0.6588, + "step": 1957 + }, + { + "epoch": 0.4562684480347988, + "grad_norm": 0.3441069722175598, + "learning_rate": 4.935525751050194e-06, + "loss": 0.6757, + "step": 1958 + }, + { + "epoch": 0.4565014758427839, + "grad_norm": 0.33687901496887207, + "learning_rate": 4.9354566308604135e-06, + "loss": 0.6531, + "step": 1959 + }, + { + "epoch": 0.456734503650769, + "grad_norm": 0.34576699137687683, + "learning_rate": 4.935387474124546e-06, + "loss": 0.6287, + "step": 1960 + }, + { + "epoch": 0.4569675314587541, + "grad_norm": 0.33949020504951477, + "learning_rate": 4.935318280843633e-06, + "loss": 0.6476, + "step": 1961 + }, + { + "epoch": 0.45720055926673914, + "grad_norm": 0.34629350900650024, + "learning_rate": 4.935249051018708e-06, + "loss": 0.646, + "step": 1962 + }, + { + "epoch": 0.45743358707472426, + "grad_norm": 0.3446328341960907, + "learning_rate": 4.935179784650814e-06, + "loss": 0.694, + "step": 1963 + }, + { + "epoch": 0.4576666148827093, + "grad_norm": 0.35406917333602905, + "learning_rate": 4.935110481740988e-06, + "loss": 0.6647, + "step": 1964 + }, + { + "epoch": 0.45789964269069444, + "grad_norm": 0.33509257435798645, + "learning_rate": 4.935041142290269e-06, + "loss": 0.6392, + "step": 1965 + }, + { + "epoch": 0.4581326704986795, + "grad_norm": 0.3374606966972351, + "learning_rate": 4.9349717662997e-06, + "loss": 0.6582, + "step": 1966 + }, + { + "epoch": 0.45836569830666457, + "grad_norm": 0.3423045575618744, + "learning_rate": 4.934902353770321e-06, + "loss": 0.6787, + "step": 1967 + }, + { + "epoch": 0.4585987261146497, + "grad_norm": 0.3516309857368469, + "learning_rate": 4.934832904703174e-06, + "loss": 0.6531, + "step": 1968 + }, + { + "epoch": 0.45883175392263476, + "grad_norm": 0.32963767647743225, + "learning_rate": 4.9347634190993e-06, + "loss": 0.656, + "step": 1969 + }, + { + "epoch": 0.4590647817306199, + "grad_norm": 0.33289390802383423, + "learning_rate": 4.934693896959744e-06, + "loss": 0.6585, + "step": 1970 + }, + { + "epoch": 0.45929780953860494, + "grad_norm": 0.3355274498462677, + "learning_rate": 4.934624338285546e-06, + "loss": 0.6513, + "step": 1971 + }, + { + "epoch": 0.45953083734659, + "grad_norm": 0.36725667119026184, + "learning_rate": 4.934554743077752e-06, + "loss": 0.6468, + "step": 1972 + }, + { + "epoch": 0.4597638651545751, + "grad_norm": 0.35186630487442017, + "learning_rate": 4.934485111337406e-06, + "loss": 0.6561, + "step": 1973 + }, + { + "epoch": 0.4599968929625602, + "grad_norm": 0.3456324338912964, + "learning_rate": 4.934415443065552e-06, + "loss": 0.657, + "step": 1974 + }, + { + "epoch": 0.4602299207705453, + "grad_norm": 0.32745951414108276, + "learning_rate": 4.934345738263237e-06, + "loss": 0.6405, + "step": 1975 + }, + { + "epoch": 0.4604629485785304, + "grad_norm": 0.3430924117565155, + "learning_rate": 4.934275996931506e-06, + "loss": 0.6473, + "step": 1976 + }, + { + "epoch": 0.46069597638651544, + "grad_norm": 0.3553910255432129, + "learning_rate": 4.934206219071405e-06, + "loss": 0.676, + "step": 1977 + }, + { + "epoch": 0.46092900419450056, + "grad_norm": 0.34862270951271057, + "learning_rate": 4.934136404683982e-06, + "loss": 0.6616, + "step": 1978 + }, + { + "epoch": 0.4611620320024856, + "grad_norm": 0.3491661250591278, + "learning_rate": 4.934066553770284e-06, + "loss": 0.6649, + "step": 1979 + }, + { + "epoch": 0.46139505981047074, + "grad_norm": 0.34327757358551025, + "learning_rate": 4.93399666633136e-06, + "loss": 0.6425, + "step": 1980 + }, + { + "epoch": 0.4616280876184558, + "grad_norm": 0.33885884284973145, + "learning_rate": 4.933926742368258e-06, + "loss": 0.6996, + "step": 1981 + }, + { + "epoch": 0.46186111542644087, + "grad_norm": 0.3380642831325531, + "learning_rate": 4.933856781882028e-06, + "loss": 0.6526, + "step": 1982 + }, + { + "epoch": 0.462094143234426, + "grad_norm": 0.3637973964214325, + "learning_rate": 4.933786784873718e-06, + "loss": 0.6616, + "step": 1983 + }, + { + "epoch": 0.46232717104241106, + "grad_norm": 0.33312949538230896, + "learning_rate": 4.933716751344381e-06, + "loss": 0.6514, + "step": 1984 + }, + { + "epoch": 0.4625601988503961, + "grad_norm": 0.3393290936946869, + "learning_rate": 4.933646681295066e-06, + "loss": 0.6624, + "step": 1985 + }, + { + "epoch": 0.46279322665838124, + "grad_norm": 0.33298400044441223, + "learning_rate": 4.933576574726826e-06, + "loss": 0.638, + "step": 1986 + }, + { + "epoch": 0.4630262544663663, + "grad_norm": 0.34938380122184753, + "learning_rate": 4.933506431640711e-06, + "loss": 0.6312, + "step": 1987 + }, + { + "epoch": 0.4632592822743514, + "grad_norm": 0.36868390440940857, + "learning_rate": 4.933436252037774e-06, + "loss": 0.6515, + "step": 1988 + }, + { + "epoch": 0.4634923100823365, + "grad_norm": 0.33582809567451477, + "learning_rate": 4.93336603591907e-06, + "loss": 0.6335, + "step": 1989 + }, + { + "epoch": 0.46372533789032155, + "grad_norm": 0.32757478952407837, + "learning_rate": 4.933295783285651e-06, + "loss": 0.6513, + "step": 1990 + }, + { + "epoch": 0.4639583656983067, + "grad_norm": 0.3301256597042084, + "learning_rate": 4.933225494138571e-06, + "loss": 0.6931, + "step": 1991 + }, + { + "epoch": 0.46419139350629174, + "grad_norm": 0.3358980119228363, + "learning_rate": 4.933155168478886e-06, + "loss": 0.6702, + "step": 1992 + }, + { + "epoch": 0.46442442131427686, + "grad_norm": 0.33238673210144043, + "learning_rate": 4.933084806307651e-06, + "loss": 0.6693, + "step": 1993 + }, + { + "epoch": 0.4646574491222619, + "grad_norm": 0.348239004611969, + "learning_rate": 4.93301440762592e-06, + "loss": 0.6562, + "step": 1994 + }, + { + "epoch": 0.464890476930247, + "grad_norm": 0.3256385922431946, + "learning_rate": 4.932943972434752e-06, + "loss": 0.6341, + "step": 1995 + }, + { + "epoch": 0.4651235047382321, + "grad_norm": 0.3421926200389862, + "learning_rate": 4.932873500735202e-06, + "loss": 0.6677, + "step": 1996 + }, + { + "epoch": 0.46535653254621717, + "grad_norm": 0.340618371963501, + "learning_rate": 4.9328029925283295e-06, + "loss": 0.653, + "step": 1997 + }, + { + "epoch": 0.4655895603542023, + "grad_norm": 0.35727640986442566, + "learning_rate": 4.93273244781519e-06, + "loss": 0.6861, + "step": 1998 + }, + { + "epoch": 0.46582258816218736, + "grad_norm": 0.33504724502563477, + "learning_rate": 4.932661866596844e-06, + "loss": 0.673, + "step": 1999 + }, + { + "epoch": 0.4660556159701724, + "grad_norm": 0.33034440875053406, + "learning_rate": 4.93259124887435e-06, + "loss": 0.6253, + "step": 2000 + }, + { + "epoch": 0.46628864377815754, + "grad_norm": 0.33925461769104004, + "learning_rate": 4.932520594648768e-06, + "loss": 0.6614, + "step": 2001 + }, + { + "epoch": 0.4665216715861426, + "grad_norm": 0.33077698945999146, + "learning_rate": 4.9324499039211575e-06, + "loss": 0.6556, + "step": 2002 + }, + { + "epoch": 0.4667546993941277, + "grad_norm": 0.3444373607635498, + "learning_rate": 4.9323791766925784e-06, + "loss": 0.6659, + "step": 2003 + }, + { + "epoch": 0.4669877272021128, + "grad_norm": 0.3422435224056244, + "learning_rate": 4.932308412964095e-06, + "loss": 0.6578, + "step": 2004 + }, + { + "epoch": 0.46722075501009785, + "grad_norm": 0.3478015959262848, + "learning_rate": 4.9322376127367675e-06, + "loss": 0.6623, + "step": 2005 + }, + { + "epoch": 0.467453782818083, + "grad_norm": 0.3475801944732666, + "learning_rate": 4.9321667760116574e-06, + "loss": 0.6658, + "step": 2006 + }, + { + "epoch": 0.46768681062606804, + "grad_norm": 0.36094075441360474, + "learning_rate": 4.93209590278983e-06, + "loss": 0.6749, + "step": 2007 + }, + { + "epoch": 0.4679198384340531, + "grad_norm": 0.345959335565567, + "learning_rate": 4.932024993072347e-06, + "loss": 0.6557, + "step": 2008 + }, + { + "epoch": 0.4681528662420382, + "grad_norm": 0.3349626064300537, + "learning_rate": 4.931954046860271e-06, + "loss": 0.6581, + "step": 2009 + }, + { + "epoch": 0.4683858940500233, + "grad_norm": 0.3383091688156128, + "learning_rate": 4.93188306415467e-06, + "loss": 0.6868, + "step": 2010 + }, + { + "epoch": 0.4686189218580084, + "grad_norm": 0.34955522418022156, + "learning_rate": 4.931812044956608e-06, + "loss": 0.6711, + "step": 2011 + }, + { + "epoch": 0.4688519496659935, + "grad_norm": 0.3561055362224579, + "learning_rate": 4.931740989267148e-06, + "loss": 0.6479, + "step": 2012 + }, + { + "epoch": 0.46908497747397854, + "grad_norm": 0.36517080664634705, + "learning_rate": 4.931669897087361e-06, + "loss": 0.6752, + "step": 2013 + }, + { + "epoch": 0.46931800528196366, + "grad_norm": 0.32239022850990295, + "learning_rate": 4.931598768418311e-06, + "loss": 0.6198, + "step": 2014 + }, + { + "epoch": 0.4695510330899487, + "grad_norm": 0.3472045660018921, + "learning_rate": 4.9315276032610644e-06, + "loss": 0.6576, + "step": 2015 + }, + { + "epoch": 0.46978406089793384, + "grad_norm": 0.33949777483940125, + "learning_rate": 4.931456401616692e-06, + "loss": 0.6307, + "step": 2016 + }, + { + "epoch": 0.4700170887059189, + "grad_norm": 0.354637086391449, + "learning_rate": 4.93138516348626e-06, + "loss": 0.6779, + "step": 2017 + }, + { + "epoch": 0.47025011651390397, + "grad_norm": 0.3476591110229492, + "learning_rate": 4.931313888870838e-06, + "loss": 0.638, + "step": 2018 + }, + { + "epoch": 0.4704831443218891, + "grad_norm": 0.33732566237449646, + "learning_rate": 4.9312425777714955e-06, + "loss": 0.6472, + "step": 2019 + }, + { + "epoch": 0.47071617212987416, + "grad_norm": 0.339277446269989, + "learning_rate": 4.931171230189302e-06, + "loss": 0.685, + "step": 2020 + }, + { + "epoch": 0.4709491999378593, + "grad_norm": 0.3490895926952362, + "learning_rate": 4.93109984612533e-06, + "loss": 0.635, + "step": 2021 + }, + { + "epoch": 0.47118222774584434, + "grad_norm": 0.34943822026252747, + "learning_rate": 4.931028425580649e-06, + "loss": 0.6507, + "step": 2022 + }, + { + "epoch": 0.4714152555538294, + "grad_norm": 0.3454621434211731, + "learning_rate": 4.930956968556331e-06, + "loss": 0.6513, + "step": 2023 + }, + { + "epoch": 0.4716482833618145, + "grad_norm": 0.3614349067211151, + "learning_rate": 4.930885475053448e-06, + "loss": 0.6717, + "step": 2024 + }, + { + "epoch": 0.4718813111697996, + "grad_norm": 0.3302850127220154, + "learning_rate": 4.930813945073074e-06, + "loss": 0.6641, + "step": 2025 + }, + { + "epoch": 0.4721143389777847, + "grad_norm": 0.3391430377960205, + "learning_rate": 4.930742378616281e-06, + "loss": 0.6798, + "step": 2026 + }, + { + "epoch": 0.4723473667857698, + "grad_norm": 0.3367573916912079, + "learning_rate": 4.930670775684144e-06, + "loss": 0.6614, + "step": 2027 + }, + { + "epoch": 0.47258039459375484, + "grad_norm": 0.3508684039115906, + "learning_rate": 4.930599136277737e-06, + "loss": 0.6781, + "step": 2028 + }, + { + "epoch": 0.47281342240173996, + "grad_norm": 0.3506162166595459, + "learning_rate": 4.930527460398136e-06, + "loss": 0.6742, + "step": 2029 + }, + { + "epoch": 0.473046450209725, + "grad_norm": 0.36696121096611023, + "learning_rate": 4.9304557480464145e-06, + "loss": 0.6683, + "step": 2030 + }, + { + "epoch": 0.4732794780177101, + "grad_norm": 0.32933297753334045, + "learning_rate": 4.930383999223649e-06, + "loss": 0.624, + "step": 2031 + }, + { + "epoch": 0.4735125058256952, + "grad_norm": 0.3691255450248718, + "learning_rate": 4.930312213930918e-06, + "loss": 0.6722, + "step": 2032 + }, + { + "epoch": 0.47374553363368027, + "grad_norm": 0.3559948801994324, + "learning_rate": 4.930240392169298e-06, + "loss": 0.7052, + "step": 2033 + }, + { + "epoch": 0.4739785614416654, + "grad_norm": 0.3873662054538727, + "learning_rate": 4.930168533939865e-06, + "loss": 0.6396, + "step": 2034 + }, + { + "epoch": 0.47421158924965046, + "grad_norm": 0.3418651819229126, + "learning_rate": 4.930096639243699e-06, + "loss": 0.6576, + "step": 2035 + }, + { + "epoch": 0.4744446170576355, + "grad_norm": 0.34678104519844055, + "learning_rate": 4.930024708081879e-06, + "loss": 0.671, + "step": 2036 + }, + { + "epoch": 0.47467764486562064, + "grad_norm": 0.377640038728714, + "learning_rate": 4.929952740455483e-06, + "loss": 0.6583, + "step": 2037 + }, + { + "epoch": 0.4749106726736057, + "grad_norm": 0.3382796049118042, + "learning_rate": 4.929880736365592e-06, + "loss": 0.6825, + "step": 2038 + }, + { + "epoch": 0.4751437004815908, + "grad_norm": 0.3476635813713074, + "learning_rate": 4.929808695813286e-06, + "loss": 0.6434, + "step": 2039 + }, + { + "epoch": 0.4753767282895759, + "grad_norm": 0.3502070903778076, + "learning_rate": 4.929736618799647e-06, + "loss": 0.6606, + "step": 2040 + }, + { + "epoch": 0.47560975609756095, + "grad_norm": 0.335993230342865, + "learning_rate": 4.929664505325754e-06, + "loss": 0.662, + "step": 2041 + }, + { + "epoch": 0.4758427839055461, + "grad_norm": 0.3608747720718384, + "learning_rate": 4.929592355392693e-06, + "loss": 0.6688, + "step": 2042 + }, + { + "epoch": 0.47607581171353114, + "grad_norm": 0.3402598202228546, + "learning_rate": 4.929520169001544e-06, + "loss": 0.6568, + "step": 2043 + }, + { + "epoch": 0.47630883952151626, + "grad_norm": 0.32449713349342346, + "learning_rate": 4.929447946153391e-06, + "loss": 0.6412, + "step": 2044 + }, + { + "epoch": 0.4765418673295013, + "grad_norm": 0.3535619080066681, + "learning_rate": 4.929375686849317e-06, + "loss": 0.6629, + "step": 2045 + }, + { + "epoch": 0.4767748951374864, + "grad_norm": 0.36256715655326843, + "learning_rate": 4.929303391090408e-06, + "loss": 0.6422, + "step": 2046 + }, + { + "epoch": 0.4770079229454715, + "grad_norm": 0.33516040444374084, + "learning_rate": 4.929231058877746e-06, + "loss": 0.6687, + "step": 2047 + }, + { + "epoch": 0.47724095075345657, + "grad_norm": 0.34236401319503784, + "learning_rate": 4.92915869021242e-06, + "loss": 0.6751, + "step": 2048 + }, + { + "epoch": 0.4774739785614417, + "grad_norm": 0.3655480146408081, + "learning_rate": 4.929086285095513e-06, + "loss": 0.6576, + "step": 2049 + }, + { + "epoch": 0.47770700636942676, + "grad_norm": 0.34615617990493774, + "learning_rate": 4.929013843528113e-06, + "loss": 0.6738, + "step": 2050 + }, + { + "epoch": 0.4779400341774118, + "grad_norm": 0.3455753028392792, + "learning_rate": 4.928941365511308e-06, + "loss": 0.6611, + "step": 2051 + }, + { + "epoch": 0.47817306198539694, + "grad_norm": 0.33355802297592163, + "learning_rate": 4.928868851046182e-06, + "loss": 0.6617, + "step": 2052 + }, + { + "epoch": 0.478406089793382, + "grad_norm": 0.33715471625328064, + "learning_rate": 4.928796300133826e-06, + "loss": 0.6415, + "step": 2053 + }, + { + "epoch": 0.47863911760136707, + "grad_norm": 0.34479057788848877, + "learning_rate": 4.928723712775329e-06, + "loss": 0.6815, + "step": 2054 + }, + { + "epoch": 0.4788721454093522, + "grad_norm": 0.35430002212524414, + "learning_rate": 4.928651088971778e-06, + "loss": 0.6722, + "step": 2055 + }, + { + "epoch": 0.47910517321733725, + "grad_norm": 0.32509660720825195, + "learning_rate": 4.9285784287242655e-06, + "loss": 0.6435, + "step": 2056 + }, + { + "epoch": 0.4793382010253224, + "grad_norm": 0.34744372963905334, + "learning_rate": 4.92850573203388e-06, + "loss": 0.6718, + "step": 2057 + }, + { + "epoch": 0.47957122883330744, + "grad_norm": 0.3411124348640442, + "learning_rate": 4.928432998901713e-06, + "loss": 0.6578, + "step": 2058 + }, + { + "epoch": 0.4798042566412925, + "grad_norm": 0.34777212142944336, + "learning_rate": 4.928360229328856e-06, + "loss": 0.6632, + "step": 2059 + }, + { + "epoch": 0.4800372844492776, + "grad_norm": 0.3456726372241974, + "learning_rate": 4.9282874233163995e-06, + "loss": 0.6622, + "step": 2060 + }, + { + "epoch": 0.4802703122572627, + "grad_norm": 0.3298170864582062, + "learning_rate": 4.928214580865438e-06, + "loss": 0.6672, + "step": 2061 + }, + { + "epoch": 0.4805033400652478, + "grad_norm": 0.3360856771469116, + "learning_rate": 4.928141701977064e-06, + "loss": 0.6568, + "step": 2062 + }, + { + "epoch": 0.4807363678732329, + "grad_norm": 0.3521275818347931, + "learning_rate": 4.928068786652371e-06, + "loss": 0.6869, + "step": 2063 + }, + { + "epoch": 0.48096939568121794, + "grad_norm": 0.3405437171459198, + "learning_rate": 4.9279958348924526e-06, + "loss": 0.6565, + "step": 2064 + }, + { + "epoch": 0.48120242348920306, + "grad_norm": 0.3265219032764435, + "learning_rate": 4.927922846698404e-06, + "loss": 0.6504, + "step": 2065 + }, + { + "epoch": 0.4814354512971881, + "grad_norm": 0.32374072074890137, + "learning_rate": 4.92784982207132e-06, + "loss": 0.6727, + "step": 2066 + }, + { + "epoch": 0.48166847910517324, + "grad_norm": 0.3466837406158447, + "learning_rate": 4.927776761012298e-06, + "loss": 0.672, + "step": 2067 + }, + { + "epoch": 0.4819015069131583, + "grad_norm": 0.33999893069267273, + "learning_rate": 4.927703663522433e-06, + "loss": 0.6311, + "step": 2068 + }, + { + "epoch": 0.48213453472114337, + "grad_norm": 0.33274218440055847, + "learning_rate": 4.927630529602821e-06, + "loss": 0.6942, + "step": 2069 + }, + { + "epoch": 0.4823675625291285, + "grad_norm": 0.33769360184669495, + "learning_rate": 4.927557359254561e-06, + "loss": 0.64, + "step": 2070 + }, + { + "epoch": 0.48260059033711356, + "grad_norm": 0.33648666739463806, + "learning_rate": 4.927484152478751e-06, + "loss": 0.6701, + "step": 2071 + }, + { + "epoch": 0.4828336181450987, + "grad_norm": 0.33954161405563354, + "learning_rate": 4.927410909276488e-06, + "loss": 0.6869, + "step": 2072 + }, + { + "epoch": 0.48306664595308374, + "grad_norm": 0.32898953557014465, + "learning_rate": 4.927337629648873e-06, + "loss": 0.6344, + "step": 2073 + }, + { + "epoch": 0.4832996737610688, + "grad_norm": 0.329923152923584, + "learning_rate": 4.927264313597004e-06, + "loss": 0.649, + "step": 2074 + }, + { + "epoch": 0.4835327015690539, + "grad_norm": 0.3360907733440399, + "learning_rate": 4.927190961121982e-06, + "loss": 0.6471, + "step": 2075 + }, + { + "epoch": 0.483765729377039, + "grad_norm": 0.3478478193283081, + "learning_rate": 4.927117572224908e-06, + "loss": 0.6879, + "step": 2076 + }, + { + "epoch": 0.48399875718502405, + "grad_norm": 0.3353518545627594, + "learning_rate": 4.927044146906882e-06, + "loss": 0.6478, + "step": 2077 + }, + { + "epoch": 0.4842317849930092, + "grad_norm": 0.3452572822570801, + "learning_rate": 4.9269706851690075e-06, + "loss": 0.6813, + "step": 2078 + }, + { + "epoch": 0.48446481280099424, + "grad_norm": 0.3549037277698517, + "learning_rate": 4.9268971870123864e-06, + "loss": 0.6613, + "step": 2079 + }, + { + "epoch": 0.48469784060897936, + "grad_norm": 0.34084680676460266, + "learning_rate": 4.92682365243812e-06, + "loss": 0.6512, + "step": 2080 + }, + { + "epoch": 0.4849308684169644, + "grad_norm": 0.32705533504486084, + "learning_rate": 4.9267500814473135e-06, + "loss": 0.6437, + "step": 2081 + }, + { + "epoch": 0.4851638962249495, + "grad_norm": 0.3274748921394348, + "learning_rate": 4.92667647404107e-06, + "loss": 0.6202, + "step": 2082 + }, + { + "epoch": 0.4853969240329346, + "grad_norm": 0.3341066241264343, + "learning_rate": 4.926602830220495e-06, + "loss": 0.6483, + "step": 2083 + }, + { + "epoch": 0.48562995184091967, + "grad_norm": 0.34731820225715637, + "learning_rate": 4.9265291499866925e-06, + "loss": 0.6727, + "step": 2084 + }, + { + "epoch": 0.4858629796489048, + "grad_norm": 0.34217527508735657, + "learning_rate": 4.926455433340769e-06, + "loss": 0.6766, + "step": 2085 + }, + { + "epoch": 0.48609600745688986, + "grad_norm": 0.3347722589969635, + "learning_rate": 4.92638168028383e-06, + "loss": 0.6732, + "step": 2086 + }, + { + "epoch": 0.4863290352648749, + "grad_norm": 0.33862629532814026, + "learning_rate": 4.926307890816983e-06, + "loss": 0.6581, + "step": 2087 + }, + { + "epoch": 0.48656206307286004, + "grad_norm": 0.34749701619148254, + "learning_rate": 4.926234064941334e-06, + "loss": 0.6627, + "step": 2088 + }, + { + "epoch": 0.4867950908808451, + "grad_norm": 0.33946022391319275, + "learning_rate": 4.926160202657992e-06, + "loss": 0.6371, + "step": 2089 + }, + { + "epoch": 0.4870281186888302, + "grad_norm": 0.32810527086257935, + "learning_rate": 4.926086303968065e-06, + "loss": 0.6485, + "step": 2090 + }, + { + "epoch": 0.4872611464968153, + "grad_norm": 0.33966711163520813, + "learning_rate": 4.9260123688726615e-06, + "loss": 0.6495, + "step": 2091 + }, + { + "epoch": 0.48749417430480035, + "grad_norm": 0.3659641742706299, + "learning_rate": 4.925938397372892e-06, + "loss": 0.6684, + "step": 2092 + }, + { + "epoch": 0.4877272021127855, + "grad_norm": 0.3359503746032715, + "learning_rate": 4.925864389469865e-06, + "loss": 0.6553, + "step": 2093 + }, + { + "epoch": 0.48796022992077054, + "grad_norm": 0.34418871998786926, + "learning_rate": 4.925790345164693e-06, + "loss": 0.6956, + "step": 2094 + }, + { + "epoch": 0.48819325772875566, + "grad_norm": 0.3405565023422241, + "learning_rate": 4.925716264458485e-06, + "loss": 0.6672, + "step": 2095 + }, + { + "epoch": 0.4884262855367407, + "grad_norm": 0.3466337025165558, + "learning_rate": 4.925642147352354e-06, + "loss": 0.666, + "step": 2096 + }, + { + "epoch": 0.4886593133447258, + "grad_norm": 0.3301486074924469, + "learning_rate": 4.9255679938474126e-06, + "loss": 0.6514, + "step": 2097 + }, + { + "epoch": 0.4888923411527109, + "grad_norm": 0.34689193964004517, + "learning_rate": 4.925493803944772e-06, + "loss": 0.6629, + "step": 2098 + }, + { + "epoch": 0.48912536896069597, + "grad_norm": 0.3345968723297119, + "learning_rate": 4.925419577645547e-06, + "loss": 0.6429, + "step": 2099 + }, + { + "epoch": 0.48935839676868104, + "grad_norm": 0.34352630376815796, + "learning_rate": 4.92534531495085e-06, + "loss": 0.6904, + "step": 2100 + }, + { + "epoch": 0.48959142457666616, + "grad_norm": 0.3366606831550598, + "learning_rate": 4.925271015861796e-06, + "loss": 0.6651, + "step": 2101 + }, + { + "epoch": 0.4898244523846512, + "grad_norm": 0.34791046380996704, + "learning_rate": 4.9251966803795e-06, + "loss": 0.6753, + "step": 2102 + }, + { + "epoch": 0.49005748019263634, + "grad_norm": 0.33426377177238464, + "learning_rate": 4.925122308505078e-06, + "loss": 0.6563, + "step": 2103 + }, + { + "epoch": 0.4902905080006214, + "grad_norm": 0.34348320960998535, + "learning_rate": 4.925047900239644e-06, + "loss": 0.6654, + "step": 2104 + }, + { + "epoch": 0.49052353580860647, + "grad_norm": 0.3600376546382904, + "learning_rate": 4.9249734555843174e-06, + "loss": 0.6778, + "step": 2105 + }, + { + "epoch": 0.4907565636165916, + "grad_norm": 0.3337770700454712, + "learning_rate": 4.924898974540215e-06, + "loss": 0.6457, + "step": 2106 + }, + { + "epoch": 0.49098959142457665, + "grad_norm": 0.3568193018436432, + "learning_rate": 4.9248244571084515e-06, + "loss": 0.6631, + "step": 2107 + }, + { + "epoch": 0.4912226192325618, + "grad_norm": 0.349202960729599, + "learning_rate": 4.924749903290147e-06, + "loss": 0.6349, + "step": 2108 + }, + { + "epoch": 0.49145564704054684, + "grad_norm": 0.3698953092098236, + "learning_rate": 4.924675313086421e-06, + "loss": 0.6567, + "step": 2109 + }, + { + "epoch": 0.4916886748485319, + "grad_norm": 0.35257214307785034, + "learning_rate": 4.924600686498391e-06, + "loss": 0.6545, + "step": 2110 + }, + { + "epoch": 0.491921702656517, + "grad_norm": 0.3367707133293152, + "learning_rate": 4.924526023527178e-06, + "loss": 0.6418, + "step": 2111 + }, + { + "epoch": 0.4921547304645021, + "grad_norm": 0.3664289116859436, + "learning_rate": 4.9244513241739036e-06, + "loss": 0.6501, + "step": 2112 + }, + { + "epoch": 0.4923877582724872, + "grad_norm": 0.3507499396800995, + "learning_rate": 4.924376588439686e-06, + "loss": 0.6694, + "step": 2113 + }, + { + "epoch": 0.4926207860804723, + "grad_norm": 0.3555581867694855, + "learning_rate": 4.924301816325649e-06, + "loss": 0.6257, + "step": 2114 + }, + { + "epoch": 0.49285381388845734, + "grad_norm": 0.5021249055862427, + "learning_rate": 4.924227007832913e-06, + "loss": 0.6443, + "step": 2115 + }, + { + "epoch": 0.49308684169644246, + "grad_norm": 0.34352928400039673, + "learning_rate": 4.924152162962601e-06, + "loss": 0.6633, + "step": 2116 + }, + { + "epoch": 0.4933198695044275, + "grad_norm": 0.34953758120536804, + "learning_rate": 4.924077281715837e-06, + "loss": 0.6713, + "step": 2117 + }, + { + "epoch": 0.49355289731241264, + "grad_norm": 0.35992223024368286, + "learning_rate": 4.924002364093744e-06, + "loss": 0.6593, + "step": 2118 + }, + { + "epoch": 0.4937859251203977, + "grad_norm": 0.3374341130256653, + "learning_rate": 4.923927410097446e-06, + "loss": 0.6596, + "step": 2119 + }, + { + "epoch": 0.49401895292838277, + "grad_norm": 0.36175262928009033, + "learning_rate": 4.923852419728067e-06, + "loss": 0.6531, + "step": 2120 + }, + { + "epoch": 0.4942519807363679, + "grad_norm": 0.34456637501716614, + "learning_rate": 4.923777392986734e-06, + "loss": 0.6518, + "step": 2121 + }, + { + "epoch": 0.49448500854435296, + "grad_norm": 0.33838656544685364, + "learning_rate": 4.923702329874572e-06, + "loss": 0.6355, + "step": 2122 + }, + { + "epoch": 0.494718036352338, + "grad_norm": 0.33080607652664185, + "learning_rate": 4.923627230392707e-06, + "loss": 0.6715, + "step": 2123 + }, + { + "epoch": 0.49495106416032314, + "grad_norm": 0.3383380174636841, + "learning_rate": 4.923552094542268e-06, + "loss": 0.655, + "step": 2124 + }, + { + "epoch": 0.4951840919683082, + "grad_norm": 0.3362676203250885, + "learning_rate": 4.923476922324379e-06, + "loss": 0.6649, + "step": 2125 + }, + { + "epoch": 0.4954171197762933, + "grad_norm": 0.3496564030647278, + "learning_rate": 4.92340171374017e-06, + "loss": 0.6668, + "step": 2126 + }, + { + "epoch": 0.4956501475842784, + "grad_norm": 0.32933101058006287, + "learning_rate": 4.92332646879077e-06, + "loss": 0.6317, + "step": 2127 + }, + { + "epoch": 0.49588317539226345, + "grad_norm": 0.3406134247779846, + "learning_rate": 4.923251187477308e-06, + "loss": 0.6621, + "step": 2128 + }, + { + "epoch": 0.4961162032002486, + "grad_norm": 0.34316784143447876, + "learning_rate": 4.923175869800912e-06, + "loss": 0.6719, + "step": 2129 + }, + { + "epoch": 0.49634923100823364, + "grad_norm": 0.33800289034843445, + "learning_rate": 4.923100515762714e-06, + "loss": 0.6777, + "step": 2130 + }, + { + "epoch": 0.49658225881621876, + "grad_norm": 0.3126704692840576, + "learning_rate": 4.923025125363845e-06, + "loss": 0.6348, + "step": 2131 + }, + { + "epoch": 0.4968152866242038, + "grad_norm": 0.3406670093536377, + "learning_rate": 4.922949698605435e-06, + "loss": 0.6573, + "step": 2132 + }, + { + "epoch": 0.4970483144321889, + "grad_norm": 0.34644854068756104, + "learning_rate": 4.922874235488616e-06, + "loss": 0.6747, + "step": 2133 + }, + { + "epoch": 0.497281342240174, + "grad_norm": 0.3520788550376892, + "learning_rate": 4.92279873601452e-06, + "loss": 0.6696, + "step": 2134 + }, + { + "epoch": 0.49751437004815907, + "grad_norm": 0.34209099411964417, + "learning_rate": 4.922723200184282e-06, + "loss": 0.6605, + "step": 2135 + }, + { + "epoch": 0.4977473978561442, + "grad_norm": 0.3448578417301178, + "learning_rate": 4.922647627999033e-06, + "loss": 0.6781, + "step": 2136 + }, + { + "epoch": 0.49798042566412926, + "grad_norm": 0.3539183735847473, + "learning_rate": 4.922572019459909e-06, + "loss": 0.6478, + "step": 2137 + }, + { + "epoch": 0.4982134534721143, + "grad_norm": 0.32777678966522217, + "learning_rate": 4.922496374568043e-06, + "loss": 0.6573, + "step": 2138 + }, + { + "epoch": 0.49844648128009944, + "grad_norm": 0.3525262475013733, + "learning_rate": 4.922420693324571e-06, + "loss": 0.6694, + "step": 2139 + }, + { + "epoch": 0.4986795090880845, + "grad_norm": 0.3409246504306793, + "learning_rate": 4.922344975730629e-06, + "loss": 0.6259, + "step": 2140 + }, + { + "epoch": 0.4989125368960696, + "grad_norm": 0.32405295968055725, + "learning_rate": 4.922269221787352e-06, + "loss": 0.6646, + "step": 2141 + }, + { + "epoch": 0.4991455647040547, + "grad_norm": 0.34400761127471924, + "learning_rate": 4.922193431495879e-06, + "loss": 0.6258, + "step": 2142 + }, + { + "epoch": 0.49937859251203975, + "grad_norm": 0.32595187425613403, + "learning_rate": 4.922117604857344e-06, + "loss": 0.6428, + "step": 2143 + }, + { + "epoch": 0.4996116203200249, + "grad_norm": 0.34102311730384827, + "learning_rate": 4.922041741872887e-06, + "loss": 0.6436, + "step": 2144 + }, + { + "epoch": 0.49984464812800994, + "grad_norm": 0.3514825999736786, + "learning_rate": 4.921965842543645e-06, + "loss": 0.6699, + "step": 2145 + }, + { + "epoch": 0.5000776759359951, + "grad_norm": 0.33871781826019287, + "learning_rate": 4.92188990687076e-06, + "loss": 0.6487, + "step": 2146 + }, + { + "epoch": 0.5003107037439801, + "grad_norm": 0.32575950026512146, + "learning_rate": 4.9218139348553686e-06, + "loss": 0.6413, + "step": 2147 + }, + { + "epoch": 0.5005437315519652, + "grad_norm": 0.33293935656547546, + "learning_rate": 4.9217379264986106e-06, + "loss": 0.6247, + "step": 2148 + }, + { + "epoch": 0.5007767593599503, + "grad_norm": 0.3210507929325104, + "learning_rate": 4.9216618818016285e-06, + "loss": 0.6363, + "step": 2149 + }, + { + "epoch": 0.5010097871679354, + "grad_norm": 0.3283553421497345, + "learning_rate": 4.9215858007655625e-06, + "loss": 0.6748, + "step": 2150 + }, + { + "epoch": 0.5012428149759205, + "grad_norm": 0.3590323328971863, + "learning_rate": 4.9215096833915536e-06, + "loss": 0.6565, + "step": 2151 + }, + { + "epoch": 0.5014758427839056, + "grad_norm": 0.3632393181324005, + "learning_rate": 4.921433529680745e-06, + "loss": 0.6538, + "step": 2152 + }, + { + "epoch": 0.5017088705918906, + "grad_norm": 0.34654244780540466, + "learning_rate": 4.92135733963428e-06, + "loss": 0.669, + "step": 2153 + }, + { + "epoch": 0.5019418983998757, + "grad_norm": 0.3367750942707062, + "learning_rate": 4.9212811132532995e-06, + "loss": 0.6741, + "step": 2154 + }, + { + "epoch": 0.5021749262078608, + "grad_norm": 0.36952176690101624, + "learning_rate": 4.92120485053895e-06, + "loss": 0.6713, + "step": 2155 + }, + { + "epoch": 0.5024079540158459, + "grad_norm": 0.35217970609664917, + "learning_rate": 4.921128551492374e-06, + "loss": 0.6476, + "step": 2156 + }, + { + "epoch": 0.502640981823831, + "grad_norm": 0.3271322548389435, + "learning_rate": 4.9210522161147176e-06, + "loss": 0.6329, + "step": 2157 + }, + { + "epoch": 0.502874009631816, + "grad_norm": 0.35471200942993164, + "learning_rate": 4.9209758444071256e-06, + "loss": 0.6402, + "step": 2158 + }, + { + "epoch": 0.5031070374398011, + "grad_norm": 0.34042298793792725, + "learning_rate": 4.9208994363707445e-06, + "loss": 0.6549, + "step": 2159 + }, + { + "epoch": 0.5033400652477862, + "grad_norm": 0.3772978186607361, + "learning_rate": 4.9208229920067194e-06, + "loss": 0.6405, + "step": 2160 + }, + { + "epoch": 0.5035730930557714, + "grad_norm": 0.34636762738227844, + "learning_rate": 4.9207465113162e-06, + "loss": 0.6477, + "step": 2161 + }, + { + "epoch": 0.5038061208637564, + "grad_norm": 0.3633784055709839, + "learning_rate": 4.920669994300332e-06, + "loss": 0.6332, + "step": 2162 + }, + { + "epoch": 0.5040391486717415, + "grad_norm": 0.34145501255989075, + "learning_rate": 4.920593440960265e-06, + "loss": 0.674, + "step": 2163 + }, + { + "epoch": 0.5042721764797266, + "grad_norm": 0.3538043797016144, + "learning_rate": 4.920516851297146e-06, + "loss": 0.6313, + "step": 2164 + }, + { + "epoch": 0.5045052042877116, + "grad_norm": 0.3264778256416321, + "learning_rate": 4.920440225312127e-06, + "loss": 0.6584, + "step": 2165 + }, + { + "epoch": 0.5047382320956968, + "grad_norm": 0.33227279782295227, + "learning_rate": 4.920363563006354e-06, + "loss": 0.6562, + "step": 2166 + }, + { + "epoch": 0.5049712599036819, + "grad_norm": 0.35200807452201843, + "learning_rate": 4.92028686438098e-06, + "loss": 0.6567, + "step": 2167 + }, + { + "epoch": 0.5052042877116669, + "grad_norm": 0.37121519446372986, + "learning_rate": 4.920210129437155e-06, + "loss": 0.6528, + "step": 2168 + }, + { + "epoch": 0.505437315519652, + "grad_norm": 0.342461496591568, + "learning_rate": 4.920133358176032e-06, + "loss": 0.6591, + "step": 2169 + }, + { + "epoch": 0.505670343327637, + "grad_norm": 0.35781458020210266, + "learning_rate": 4.9200565505987615e-06, + "loss": 0.643, + "step": 2170 + }, + { + "epoch": 0.5059033711356222, + "grad_norm": 0.3494258224964142, + "learning_rate": 4.919979706706496e-06, + "loss": 0.6704, + "step": 2171 + }, + { + "epoch": 0.5061363989436073, + "grad_norm": 0.36566266417503357, + "learning_rate": 4.919902826500389e-06, + "loss": 0.6641, + "step": 2172 + }, + { + "epoch": 0.5063694267515924, + "grad_norm": 0.34396126866340637, + "learning_rate": 4.919825909981595e-06, + "loss": 0.654, + "step": 2173 + }, + { + "epoch": 0.5066024545595774, + "grad_norm": 0.3575603663921356, + "learning_rate": 4.919748957151267e-06, + "loss": 0.6583, + "step": 2174 + }, + { + "epoch": 0.5068354823675625, + "grad_norm": 0.3584180176258087, + "learning_rate": 4.91967196801056e-06, + "loss": 0.6277, + "step": 2175 + }, + { + "epoch": 0.5070685101755477, + "grad_norm": 0.3573862314224243, + "learning_rate": 4.919594942560629e-06, + "loss": 0.6725, + "step": 2176 + }, + { + "epoch": 0.5073015379835327, + "grad_norm": 0.3348008990287781, + "learning_rate": 4.919517880802631e-06, + "loss": 0.6289, + "step": 2177 + }, + { + "epoch": 0.5075345657915178, + "grad_norm": 0.35310712456703186, + "learning_rate": 4.9194407827377214e-06, + "loss": 0.6601, + "step": 2178 + }, + { + "epoch": 0.5077675935995029, + "grad_norm": 0.3592340648174286, + "learning_rate": 4.919363648367057e-06, + "loss": 0.6487, + "step": 2179 + }, + { + "epoch": 0.5080006214074879, + "grad_norm": 0.38663622736930847, + "learning_rate": 4.919286477691796e-06, + "loss": 0.6407, + "step": 2180 + }, + { + "epoch": 0.5082336492154731, + "grad_norm": 0.34971094131469727, + "learning_rate": 4.919209270713095e-06, + "loss": 0.6349, + "step": 2181 + }, + { + "epoch": 0.5084666770234582, + "grad_norm": 0.3304547965526581, + "learning_rate": 4.919132027432115e-06, + "loss": 0.6682, + "step": 2182 + }, + { + "epoch": 0.5086997048314432, + "grad_norm": 0.4249407947063446, + "learning_rate": 4.919054747850014e-06, + "loss": 0.6515, + "step": 2183 + }, + { + "epoch": 0.5089327326394283, + "grad_norm": 0.37271708250045776, + "learning_rate": 4.918977431967951e-06, + "loss": 0.6457, + "step": 2184 + }, + { + "epoch": 0.5091657604474134, + "grad_norm": 0.3310740888118744, + "learning_rate": 4.9189000797870855e-06, + "loss": 0.6527, + "step": 2185 + }, + { + "epoch": 0.5093987882553985, + "grad_norm": 0.35197943449020386, + "learning_rate": 4.9188226913085805e-06, + "loss": 0.6556, + "step": 2186 + }, + { + "epoch": 0.5096318160633836, + "grad_norm": 0.38824209570884705, + "learning_rate": 4.918745266533594e-06, + "loss": 0.6431, + "step": 2187 + }, + { + "epoch": 0.5098648438713687, + "grad_norm": 0.34277838468551636, + "learning_rate": 4.918667805463292e-06, + "loss": 0.6266, + "step": 2188 + }, + { + "epoch": 0.5100978716793537, + "grad_norm": 0.33438852429389954, + "learning_rate": 4.918590308098834e-06, + "loss": 0.6409, + "step": 2189 + }, + { + "epoch": 0.5103308994873388, + "grad_norm": 0.3482678234577179, + "learning_rate": 4.918512774441384e-06, + "loss": 0.6596, + "step": 2190 + }, + { + "epoch": 0.510563927295324, + "grad_norm": 0.35401132702827454, + "learning_rate": 4.9184352044921046e-06, + "loss": 0.6421, + "step": 2191 + }, + { + "epoch": 0.510796955103309, + "grad_norm": 0.3418525159358978, + "learning_rate": 4.91835759825216e-06, + "loss": 0.6748, + "step": 2192 + }, + { + "epoch": 0.5110299829112941, + "grad_norm": 0.3219141364097595, + "learning_rate": 4.918279955722716e-06, + "loss": 0.6513, + "step": 2193 + }, + { + "epoch": 0.5112630107192792, + "grad_norm": 0.3431146740913391, + "learning_rate": 4.918202276904936e-06, + "loss": 0.6713, + "step": 2194 + }, + { + "epoch": 0.5114960385272642, + "grad_norm": 0.3417932987213135, + "learning_rate": 4.918124561799987e-06, + "loss": 0.648, + "step": 2195 + }, + { + "epoch": 0.5117290663352494, + "grad_norm": 0.343296617269516, + "learning_rate": 4.918046810409034e-06, + "loss": 0.6557, + "step": 2196 + }, + { + "epoch": 0.5119620941432345, + "grad_norm": 0.3592709004878998, + "learning_rate": 4.917969022733244e-06, + "loss": 0.6706, + "step": 2197 + }, + { + "epoch": 0.5121951219512195, + "grad_norm": 0.34267303347587585, + "learning_rate": 4.917891198773785e-06, + "loss": 0.665, + "step": 2198 + }, + { + "epoch": 0.5124281497592046, + "grad_norm": 0.349433034658432, + "learning_rate": 4.917813338531825e-06, + "loss": 0.6481, + "step": 2199 + }, + { + "epoch": 0.5126611775671897, + "grad_norm": 0.34374693036079407, + "learning_rate": 4.917735442008531e-06, + "loss": 0.6751, + "step": 2200 + }, + { + "epoch": 0.5128942053751747, + "grad_norm": 0.35130101442337036, + "learning_rate": 4.917657509205073e-06, + "loss": 0.6854, + "step": 2201 + }, + { + "epoch": 0.5131272331831599, + "grad_norm": 0.33672839403152466, + "learning_rate": 4.91757954012262e-06, + "loss": 0.6424, + "step": 2202 + }, + { + "epoch": 0.513360260991145, + "grad_norm": 0.32252731919288635, + "learning_rate": 4.917501534762342e-06, + "loss": 0.6372, + "step": 2203 + }, + { + "epoch": 0.51359328879913, + "grad_norm": 0.3462819457054138, + "learning_rate": 4.917423493125409e-06, + "loss": 0.6382, + "step": 2204 + }, + { + "epoch": 0.5138263166071151, + "grad_norm": 0.3351527750492096, + "learning_rate": 4.917345415212994e-06, + "loss": 0.6778, + "step": 2205 + }, + { + "epoch": 0.5140593444151002, + "grad_norm": 0.3288668394088745, + "learning_rate": 4.917267301026266e-06, + "loss": 0.6404, + "step": 2206 + }, + { + "epoch": 0.5142923722230853, + "grad_norm": 0.3449069857597351, + "learning_rate": 4.917189150566399e-06, + "loss": 0.6562, + "step": 2207 + }, + { + "epoch": 0.5145254000310704, + "grad_norm": 0.3366752862930298, + "learning_rate": 4.917110963834566e-06, + "loss": 0.6571, + "step": 2208 + }, + { + "epoch": 0.5147584278390555, + "grad_norm": 0.32655763626098633, + "learning_rate": 4.917032740831938e-06, + "loss": 0.657, + "step": 2209 + }, + { + "epoch": 0.5149914556470405, + "grad_norm": 0.34977301955223083, + "learning_rate": 4.9169544815596915e-06, + "loss": 0.6317, + "step": 2210 + }, + { + "epoch": 0.5152244834550256, + "grad_norm": 0.3457763195037842, + "learning_rate": 4.916876186019e-06, + "loss": 0.6476, + "step": 2211 + }, + { + "epoch": 0.5154575112630108, + "grad_norm": 0.33431002497673035, + "learning_rate": 4.916797854211036e-06, + "loss": 0.6464, + "step": 2212 + }, + { + "epoch": 0.5156905390709958, + "grad_norm": 0.33988437056541443, + "learning_rate": 4.916719486136978e-06, + "loss": 0.641, + "step": 2213 + }, + { + "epoch": 0.5159235668789809, + "grad_norm": 0.3330911099910736, + "learning_rate": 4.9166410817980015e-06, + "loss": 0.6719, + "step": 2214 + }, + { + "epoch": 0.516156594686966, + "grad_norm": 0.3573972284793854, + "learning_rate": 4.916562641195282e-06, + "loss": 0.6773, + "step": 2215 + }, + { + "epoch": 0.516389622494951, + "grad_norm": 0.3362848162651062, + "learning_rate": 4.916484164329996e-06, + "loss": 0.6842, + "step": 2216 + }, + { + "epoch": 0.5166226503029362, + "grad_norm": 0.33173999190330505, + "learning_rate": 4.916405651203325e-06, + "loss": 0.6682, + "step": 2217 + }, + { + "epoch": 0.5168556781109213, + "grad_norm": 0.3385947644710541, + "learning_rate": 4.916327101816441e-06, + "loss": 0.6556, + "step": 2218 + }, + { + "epoch": 0.5170887059189063, + "grad_norm": 0.33107516169548035, + "learning_rate": 4.916248516170528e-06, + "loss": 0.6603, + "step": 2219 + }, + { + "epoch": 0.5173217337268914, + "grad_norm": 0.33487486839294434, + "learning_rate": 4.916169894266762e-06, + "loss": 0.6593, + "step": 2220 + }, + { + "epoch": 0.5175547615348765, + "grad_norm": 0.34543734788894653, + "learning_rate": 4.916091236106324e-06, + "loss": 0.6756, + "step": 2221 + }, + { + "epoch": 0.5177877893428616, + "grad_norm": 0.3437902629375458, + "learning_rate": 4.916012541690395e-06, + "loss": 0.6585, + "step": 2222 + }, + { + "epoch": 0.5180208171508467, + "grad_norm": 0.34741589426994324, + "learning_rate": 4.915933811020156e-06, + "loss": 0.6697, + "step": 2223 + }, + { + "epoch": 0.5182538449588318, + "grad_norm": 0.33628612756729126, + "learning_rate": 4.915855044096786e-06, + "loss": 0.6606, + "step": 2224 + }, + { + "epoch": 0.5184868727668168, + "grad_norm": 0.3240652084350586, + "learning_rate": 4.915776240921469e-06, + "loss": 0.6468, + "step": 2225 + }, + { + "epoch": 0.5187199005748019, + "grad_norm": 0.3400583267211914, + "learning_rate": 4.915697401495388e-06, + "loss": 0.6395, + "step": 2226 + }, + { + "epoch": 0.5189529283827871, + "grad_norm": 0.34326183795928955, + "learning_rate": 4.915618525819724e-06, + "loss": 0.6851, + "step": 2227 + }, + { + "epoch": 0.5191859561907721, + "grad_norm": 0.3429807424545288, + "learning_rate": 4.915539613895662e-06, + "loss": 0.6535, + "step": 2228 + }, + { + "epoch": 0.5194189839987572, + "grad_norm": 0.34450820088386536, + "learning_rate": 4.9154606657243865e-06, + "loss": 0.6805, + "step": 2229 + }, + { + "epoch": 0.5196520118067423, + "grad_norm": 0.34548258781433105, + "learning_rate": 4.915381681307081e-06, + "loss": 0.6514, + "step": 2230 + }, + { + "epoch": 0.5198850396147273, + "grad_norm": 0.34493348002433777, + "learning_rate": 4.915302660644932e-06, + "loss": 0.6293, + "step": 2231 + }, + { + "epoch": 0.5201180674227125, + "grad_norm": 0.3592972159385681, + "learning_rate": 4.9152236037391245e-06, + "loss": 0.6551, + "step": 2232 + }, + { + "epoch": 0.5203510952306976, + "grad_norm": 0.34040287137031555, + "learning_rate": 4.915144510590844e-06, + "loss": 0.6289, + "step": 2233 + }, + { + "epoch": 0.5205841230386826, + "grad_norm": 0.33005625009536743, + "learning_rate": 4.915065381201279e-06, + "loss": 0.6628, + "step": 2234 + }, + { + "epoch": 0.5208171508466677, + "grad_norm": 0.34237411618232727, + "learning_rate": 4.914986215571615e-06, + "loss": 0.6661, + "step": 2235 + }, + { + "epoch": 0.5210501786546528, + "grad_norm": 0.34381723403930664, + "learning_rate": 4.914907013703043e-06, + "loss": 0.6471, + "step": 2236 + }, + { + "epoch": 0.5212832064626379, + "grad_norm": 0.34940969944000244, + "learning_rate": 4.914827775596749e-06, + "loss": 0.653, + "step": 2237 + }, + { + "epoch": 0.521516234270623, + "grad_norm": 0.3428685665130615, + "learning_rate": 4.914748501253922e-06, + "loss": 0.6496, + "step": 2238 + }, + { + "epoch": 0.5217492620786081, + "grad_norm": 0.3507705628871918, + "learning_rate": 4.914669190675753e-06, + "loss": 0.6776, + "step": 2239 + }, + { + "epoch": 0.5219822898865931, + "grad_norm": 0.35161134600639343, + "learning_rate": 4.91458984386343e-06, + "loss": 0.6567, + "step": 2240 + }, + { + "epoch": 0.5222153176945782, + "grad_norm": 0.34149423241615295, + "learning_rate": 4.914510460818146e-06, + "loss": 0.6728, + "step": 2241 + }, + { + "epoch": 0.5224483455025634, + "grad_norm": 0.33269616961479187, + "learning_rate": 4.914431041541092e-06, + "loss": 0.6556, + "step": 2242 + }, + { + "epoch": 0.5226813733105484, + "grad_norm": 0.3348952829837799, + "learning_rate": 4.914351586033456e-06, + "loss": 0.6312, + "step": 2243 + }, + { + "epoch": 0.5229144011185335, + "grad_norm": 0.334817111492157, + "learning_rate": 4.9142720942964364e-06, + "loss": 0.6707, + "step": 2244 + }, + { + "epoch": 0.5231474289265186, + "grad_norm": 0.3517555594444275, + "learning_rate": 4.914192566331222e-06, + "loss": 0.6614, + "step": 2245 + }, + { + "epoch": 0.5233804567345036, + "grad_norm": 0.34644511342048645, + "learning_rate": 4.914113002139006e-06, + "loss": 0.6237, + "step": 2246 + }, + { + "epoch": 0.5236134845424887, + "grad_norm": 0.3431161344051361, + "learning_rate": 4.9140334017209845e-06, + "loss": 0.6494, + "step": 2247 + }, + { + "epoch": 0.5238465123504739, + "grad_norm": 0.3501669466495514, + "learning_rate": 4.913953765078351e-06, + "loss": 0.6536, + "step": 2248 + }, + { + "epoch": 0.5240795401584589, + "grad_norm": 0.3383510112762451, + "learning_rate": 4.9138740922123e-06, + "loss": 0.6417, + "step": 2249 + }, + { + "epoch": 0.524312567966444, + "grad_norm": 0.3458317518234253, + "learning_rate": 4.913794383124027e-06, + "loss": 0.6449, + "step": 2250 + }, + { + "epoch": 0.524545595774429, + "grad_norm": 0.35702529549598694, + "learning_rate": 4.9137146378147285e-06, + "loss": 0.6656, + "step": 2251 + }, + { + "epoch": 0.5247786235824141, + "grad_norm": 0.34159600734710693, + "learning_rate": 4.913634856285602e-06, + "loss": 0.6603, + "step": 2252 + }, + { + "epoch": 0.5250116513903993, + "grad_norm": 0.3396241068840027, + "learning_rate": 4.9135550385378435e-06, + "loss": 0.666, + "step": 2253 + }, + { + "epoch": 0.5252446791983844, + "grad_norm": 0.3594509959220886, + "learning_rate": 4.913475184572651e-06, + "loss": 0.6542, + "step": 2254 + }, + { + "epoch": 0.5254777070063694, + "grad_norm": 0.34263506531715393, + "learning_rate": 4.913395294391224e-06, + "loss": 0.6543, + "step": 2255 + }, + { + "epoch": 0.5257107348143545, + "grad_norm": 0.3306314945220947, + "learning_rate": 4.9133153679947584e-06, + "loss": 0.6702, + "step": 2256 + }, + { + "epoch": 0.5259437626223396, + "grad_norm": 0.3479105234146118, + "learning_rate": 4.913235405384457e-06, + "loss": 0.6617, + "step": 2257 + }, + { + "epoch": 0.5261767904303247, + "grad_norm": 0.34403738379478455, + "learning_rate": 4.913155406561517e-06, + "loss": 0.6555, + "step": 2258 + }, + { + "epoch": 0.5264098182383098, + "grad_norm": 0.35052725672721863, + "learning_rate": 4.913075371527141e-06, + "loss": 0.6634, + "step": 2259 + }, + { + "epoch": 0.5266428460462949, + "grad_norm": 0.35049498081207275, + "learning_rate": 4.912995300282528e-06, + "loss": 0.6605, + "step": 2260 + }, + { + "epoch": 0.5268758738542799, + "grad_norm": 0.3582190275192261, + "learning_rate": 4.912915192828881e-06, + "loss": 0.6538, + "step": 2261 + }, + { + "epoch": 0.527108901662265, + "grad_norm": 0.36185333132743835, + "learning_rate": 4.912835049167401e-06, + "loss": 0.659, + "step": 2262 + }, + { + "epoch": 0.5273419294702502, + "grad_norm": 0.3506518006324768, + "learning_rate": 4.912754869299291e-06, + "loss": 0.6486, + "step": 2263 + }, + { + "epoch": 0.5275749572782352, + "grad_norm": 0.34385251998901367, + "learning_rate": 4.912674653225755e-06, + "loss": 0.6737, + "step": 2264 + }, + { + "epoch": 0.5278079850862203, + "grad_norm": 0.3446044623851776, + "learning_rate": 4.912594400947996e-06, + "loss": 0.6536, + "step": 2265 + }, + { + "epoch": 0.5280410128942054, + "grad_norm": 0.3594140410423279, + "learning_rate": 4.912514112467218e-06, + "loss": 0.6439, + "step": 2266 + }, + { + "epoch": 0.5282740407021904, + "grad_norm": 0.33050665259361267, + "learning_rate": 4.9124337877846266e-06, + "loss": 0.647, + "step": 2267 + }, + { + "epoch": 0.5285070685101756, + "grad_norm": 0.33734118938446045, + "learning_rate": 4.912353426901426e-06, + "loss": 0.6657, + "step": 2268 + }, + { + "epoch": 0.5287400963181607, + "grad_norm": 0.32278677821159363, + "learning_rate": 4.912273029818824e-06, + "loss": 0.654, + "step": 2269 + }, + { + "epoch": 0.5289731241261457, + "grad_norm": 0.33750322461128235, + "learning_rate": 4.912192596538024e-06, + "loss": 0.6843, + "step": 2270 + }, + { + "epoch": 0.5292061519341308, + "grad_norm": 0.34458690881729126, + "learning_rate": 4.9121121270602355e-06, + "loss": 0.6561, + "step": 2271 + }, + { + "epoch": 0.5294391797421159, + "grad_norm": 0.35507944226264954, + "learning_rate": 4.9120316213866654e-06, + "loss": 0.6587, + "step": 2272 + }, + { + "epoch": 0.529672207550101, + "grad_norm": 0.3373720347881317, + "learning_rate": 4.911951079518521e-06, + "loss": 0.6688, + "step": 2273 + }, + { + "epoch": 0.5299052353580861, + "grad_norm": 0.3393058180809021, + "learning_rate": 4.911870501457012e-06, + "loss": 0.6499, + "step": 2274 + }, + { + "epoch": 0.5301382631660712, + "grad_norm": 0.3671293556690216, + "learning_rate": 4.911789887203348e-06, + "loss": 0.6841, + "step": 2275 + }, + { + "epoch": 0.5303712909740562, + "grad_norm": 0.3534620404243469, + "learning_rate": 4.911709236758736e-06, + "loss": 0.6432, + "step": 2276 + }, + { + "epoch": 0.5306043187820413, + "grad_norm": 0.33960312604904175, + "learning_rate": 4.911628550124389e-06, + "loss": 0.6423, + "step": 2277 + }, + { + "epoch": 0.5308373465900265, + "grad_norm": 0.34401312470436096, + "learning_rate": 4.911547827301516e-06, + "loss": 0.6791, + "step": 2278 + }, + { + "epoch": 0.5310703743980115, + "grad_norm": 0.34428709745407104, + "learning_rate": 4.911467068291329e-06, + "loss": 0.6498, + "step": 2279 + }, + { + "epoch": 0.5313034022059966, + "grad_norm": 0.3451349139213562, + "learning_rate": 4.91138627309504e-06, + "loss": 0.6498, + "step": 2280 + }, + { + "epoch": 0.5315364300139817, + "grad_norm": 0.33686813712120056, + "learning_rate": 4.911305441713862e-06, + "loss": 0.6655, + "step": 2281 + }, + { + "epoch": 0.5317694578219667, + "grad_norm": 0.3288014233112335, + "learning_rate": 4.911224574149007e-06, + "loss": 0.6408, + "step": 2282 + }, + { + "epoch": 0.5320024856299519, + "grad_norm": 0.33597010374069214, + "learning_rate": 4.911143670401688e-06, + "loss": 0.6551, + "step": 2283 + }, + { + "epoch": 0.532235513437937, + "grad_norm": 0.3487309217453003, + "learning_rate": 4.91106273047312e-06, + "loss": 0.6806, + "step": 2284 + }, + { + "epoch": 0.532468541245922, + "grad_norm": 0.34059005975723267, + "learning_rate": 4.910981754364518e-06, + "loss": 0.6596, + "step": 2285 + }, + { + "epoch": 0.5327015690539071, + "grad_norm": 0.34350356459617615, + "learning_rate": 4.910900742077096e-06, + "loss": 0.6569, + "step": 2286 + }, + { + "epoch": 0.5329345968618922, + "grad_norm": 0.3523580729961395, + "learning_rate": 4.91081969361207e-06, + "loss": 0.6691, + "step": 2287 + }, + { + "epoch": 0.5331676246698773, + "grad_norm": 0.3324601352214813, + "learning_rate": 4.910738608970656e-06, + "loss": 0.6338, + "step": 2288 + }, + { + "epoch": 0.5334006524778624, + "grad_norm": 0.34204235672950745, + "learning_rate": 4.910657488154071e-06, + "loss": 0.6498, + "step": 2289 + }, + { + "epoch": 0.5336336802858475, + "grad_norm": 0.3379378318786621, + "learning_rate": 4.910576331163533e-06, + "loss": 0.6602, + "step": 2290 + }, + { + "epoch": 0.5338667080938325, + "grad_norm": 0.32650208473205566, + "learning_rate": 4.910495138000259e-06, + "loss": 0.6492, + "step": 2291 + }, + { + "epoch": 0.5340997359018176, + "grad_norm": 0.332234263420105, + "learning_rate": 4.910413908665467e-06, + "loss": 0.6549, + "step": 2292 + }, + { + "epoch": 0.5343327637098026, + "grad_norm": 0.3547701835632324, + "learning_rate": 4.910332643160377e-06, + "loss": 0.6828, + "step": 2293 + }, + { + "epoch": 0.5345657915177878, + "grad_norm": 0.32514768838882446, + "learning_rate": 4.910251341486208e-06, + "loss": 0.6607, + "step": 2294 + }, + { + "epoch": 0.5347988193257729, + "grad_norm": 0.3410443663597107, + "learning_rate": 4.910170003644179e-06, + "loss": 0.6958, + "step": 2295 + }, + { + "epoch": 0.535031847133758, + "grad_norm": 0.34449511766433716, + "learning_rate": 4.910088629635512e-06, + "loss": 0.6673, + "step": 2296 + }, + { + "epoch": 0.535264874941743, + "grad_norm": 0.34092459082603455, + "learning_rate": 4.910007219461428e-06, + "loss": 0.6611, + "step": 2297 + }, + { + "epoch": 0.5354979027497281, + "grad_norm": 0.33642879128456116, + "learning_rate": 4.909925773123147e-06, + "loss": 0.6582, + "step": 2298 + }, + { + "epoch": 0.5357309305577133, + "grad_norm": 0.3488908112049103, + "learning_rate": 4.909844290621894e-06, + "loss": 0.67, + "step": 2299 + }, + { + "epoch": 0.5359639583656983, + "grad_norm": 0.35699182748794556, + "learning_rate": 4.9097627719588885e-06, + "loss": 0.6718, + "step": 2300 + }, + { + "epoch": 0.5361969861736834, + "grad_norm": 0.34304389357566833, + "learning_rate": 4.9096812171353556e-06, + "loss": 0.6727, + "step": 2301 + }, + { + "epoch": 0.5364300139816685, + "grad_norm": 0.33010053634643555, + "learning_rate": 4.909599626152519e-06, + "loss": 0.6575, + "step": 2302 + }, + { + "epoch": 0.5366630417896535, + "grad_norm": 0.33269450068473816, + "learning_rate": 4.909517999011603e-06, + "loss": 0.6418, + "step": 2303 + }, + { + "epoch": 0.5368960695976387, + "grad_norm": 0.3270528018474579, + "learning_rate": 4.909436335713832e-06, + "loss": 0.646, + "step": 2304 + }, + { + "epoch": 0.5371290974056238, + "grad_norm": 0.33139559626579285, + "learning_rate": 4.9093546362604314e-06, + "loss": 0.652, + "step": 2305 + }, + { + "epoch": 0.5373621252136088, + "grad_norm": 0.33372819423675537, + "learning_rate": 4.909272900652629e-06, + "loss": 0.64, + "step": 2306 + }, + { + "epoch": 0.5375951530215939, + "grad_norm": 0.3431796133518219, + "learning_rate": 4.909191128891648e-06, + "loss": 0.6354, + "step": 2307 + }, + { + "epoch": 0.537828180829579, + "grad_norm": 0.3427954614162445, + "learning_rate": 4.909109320978718e-06, + "loss": 0.6531, + "step": 2308 + }, + { + "epoch": 0.5380612086375641, + "grad_norm": 0.34308671951293945, + "learning_rate": 4.909027476915066e-06, + "loss": 0.6446, + "step": 2309 + }, + { + "epoch": 0.5382942364455492, + "grad_norm": 0.355960488319397, + "learning_rate": 4.9089455967019205e-06, + "loss": 0.6447, + "step": 2310 + }, + { + "epoch": 0.5385272642535343, + "grad_norm": 0.34466877579689026, + "learning_rate": 4.908863680340509e-06, + "loss": 0.6887, + "step": 2311 + }, + { + "epoch": 0.5387602920615193, + "grad_norm": 0.3579148054122925, + "learning_rate": 4.9087817278320614e-06, + "loss": 0.6801, + "step": 2312 + }, + { + "epoch": 0.5389933198695044, + "grad_norm": 0.3546207845211029, + "learning_rate": 4.908699739177808e-06, + "loss": 0.6526, + "step": 2313 + }, + { + "epoch": 0.5392263476774896, + "grad_norm": 0.33563873171806335, + "learning_rate": 4.908617714378978e-06, + "loss": 0.6689, + "step": 2314 + }, + { + "epoch": 0.5394593754854746, + "grad_norm": 0.32727229595184326, + "learning_rate": 4.908535653436803e-06, + "loss": 0.6279, + "step": 2315 + }, + { + "epoch": 0.5396924032934597, + "grad_norm": 0.33615097403526306, + "learning_rate": 4.908453556352514e-06, + "loss": 0.6617, + "step": 2316 + }, + { + "epoch": 0.5399254311014448, + "grad_norm": 0.33803093433380127, + "learning_rate": 4.9083714231273434e-06, + "loss": 0.6587, + "step": 2317 + }, + { + "epoch": 0.5401584589094298, + "grad_norm": 0.33965784311294556, + "learning_rate": 4.908289253762524e-06, + "loss": 0.652, + "step": 2318 + }, + { + "epoch": 0.540391486717415, + "grad_norm": 0.3347463011741638, + "learning_rate": 4.908207048259288e-06, + "loss": 0.6706, + "step": 2319 + }, + { + "epoch": 0.5406245145254001, + "grad_norm": 0.3391139507293701, + "learning_rate": 4.9081248066188694e-06, + "loss": 0.6398, + "step": 2320 + }, + { + "epoch": 0.5408575423333851, + "grad_norm": 0.3307684659957886, + "learning_rate": 4.908042528842502e-06, + "loss": 0.6655, + "step": 2321 + }, + { + "epoch": 0.5410905701413702, + "grad_norm": 0.35939478874206543, + "learning_rate": 4.90796021493142e-06, + "loss": 0.6524, + "step": 2322 + }, + { + "epoch": 0.5413235979493553, + "grad_norm": 0.33294814825057983, + "learning_rate": 4.90787786488686e-06, + "loss": 0.6654, + "step": 2323 + }, + { + "epoch": 0.5415566257573404, + "grad_norm": 0.345940500497818, + "learning_rate": 4.9077954787100564e-06, + "loss": 0.6791, + "step": 2324 + }, + { + "epoch": 0.5417896535653255, + "grad_norm": 0.3352663218975067, + "learning_rate": 4.907713056402245e-06, + "loss": 0.6421, + "step": 2325 + }, + { + "epoch": 0.5420226813733106, + "grad_norm": 0.34249022603034973, + "learning_rate": 4.907630597964666e-06, + "loss": 0.637, + "step": 2326 + }, + { + "epoch": 0.5422557091812956, + "grad_norm": 0.3307454586029053, + "learning_rate": 4.9075481033985524e-06, + "loss": 0.6616, + "step": 2327 + }, + { + "epoch": 0.5424887369892807, + "grad_norm": 0.3444777727127075, + "learning_rate": 4.907465572705145e-06, + "loss": 0.6705, + "step": 2328 + }, + { + "epoch": 0.5427217647972659, + "grad_norm": 0.33814331889152527, + "learning_rate": 4.907383005885681e-06, + "loss": 0.6518, + "step": 2329 + }, + { + "epoch": 0.5429547926052509, + "grad_norm": 0.31716158986091614, + "learning_rate": 4.9073004029414e-06, + "loss": 0.6487, + "step": 2330 + }, + { + "epoch": 0.543187820413236, + "grad_norm": 0.3474617898464203, + "learning_rate": 4.90721776387354e-06, + "loss": 0.6737, + "step": 2331 + }, + { + "epoch": 0.543420848221221, + "grad_norm": 0.3517071008682251, + "learning_rate": 4.907135088683345e-06, + "loss": 0.6531, + "step": 2332 + }, + { + "epoch": 0.5436538760292061, + "grad_norm": 0.3631241023540497, + "learning_rate": 4.90705237737205e-06, + "loss": 0.6392, + "step": 2333 + }, + { + "epoch": 0.5438869038371913, + "grad_norm": 0.3421372175216675, + "learning_rate": 4.906969629940901e-06, + "loss": 0.662, + "step": 2334 + }, + { + "epoch": 0.5441199316451764, + "grad_norm": 0.35106325149536133, + "learning_rate": 4.9068868463911364e-06, + "loss": 0.676, + "step": 2335 + }, + { + "epoch": 0.5443529594531614, + "grad_norm": 0.3519647419452667, + "learning_rate": 4.906804026724e-06, + "loss": 0.6399, + "step": 2336 + }, + { + "epoch": 0.5445859872611465, + "grad_norm": 0.3458939790725708, + "learning_rate": 4.9067211709407354e-06, + "loss": 0.6653, + "step": 2337 + }, + { + "epoch": 0.5448190150691316, + "grad_norm": 0.3563222289085388, + "learning_rate": 4.906638279042584e-06, + "loss": 0.6662, + "step": 2338 + }, + { + "epoch": 0.5450520428771166, + "grad_norm": 0.3438166379928589, + "learning_rate": 4.906555351030791e-06, + "loss": 0.6849, + "step": 2339 + }, + { + "epoch": 0.5452850706851018, + "grad_norm": 0.33213913440704346, + "learning_rate": 4.9064723869066e-06, + "loss": 0.6602, + "step": 2340 + }, + { + "epoch": 0.5455180984930869, + "grad_norm": 0.3418883681297302, + "learning_rate": 4.906389386671256e-06, + "loss": 0.6879, + "step": 2341 + }, + { + "epoch": 0.5457511263010719, + "grad_norm": 0.3544636070728302, + "learning_rate": 4.9063063503260055e-06, + "loss": 0.6614, + "step": 2342 + }, + { + "epoch": 0.545984154109057, + "grad_norm": 0.33971333503723145, + "learning_rate": 4.906223277872094e-06, + "loss": 0.6449, + "step": 2343 + }, + { + "epoch": 0.546217181917042, + "grad_norm": 0.36545130610466003, + "learning_rate": 4.906140169310767e-06, + "loss": 0.6494, + "step": 2344 + }, + { + "epoch": 0.5464502097250272, + "grad_norm": 0.3544323444366455, + "learning_rate": 4.9060570246432735e-06, + "loss": 0.6494, + "step": 2345 + }, + { + "epoch": 0.5466832375330123, + "grad_norm": 0.35562190413475037, + "learning_rate": 4.905973843870859e-06, + "loss": 0.6873, + "step": 2346 + }, + { + "epoch": 0.5469162653409974, + "grad_norm": 0.3317176401615143, + "learning_rate": 4.905890626994774e-06, + "loss": 0.6529, + "step": 2347 + }, + { + "epoch": 0.5471492931489824, + "grad_norm": 0.36502039432525635, + "learning_rate": 4.905807374016265e-06, + "loss": 0.6308, + "step": 2348 + }, + { + "epoch": 0.5473823209569675, + "grad_norm": 0.3325105905532837, + "learning_rate": 4.905724084936583e-06, + "loss": 0.6262, + "step": 2349 + }, + { + "epoch": 0.5476153487649527, + "grad_norm": 0.3440399765968323, + "learning_rate": 4.9056407597569775e-06, + "loss": 0.6623, + "step": 2350 + }, + { + "epoch": 0.5478483765729377, + "grad_norm": 0.34611979126930237, + "learning_rate": 4.905557398478698e-06, + "loss": 0.6673, + "step": 2351 + }, + { + "epoch": 0.5480814043809228, + "grad_norm": 0.339703232049942, + "learning_rate": 4.9054740011029966e-06, + "loss": 0.6688, + "step": 2352 + }, + { + "epoch": 0.5483144321889079, + "grad_norm": 0.3511140048503876, + "learning_rate": 4.905390567631123e-06, + "loss": 0.6509, + "step": 2353 + }, + { + "epoch": 0.5485474599968929, + "grad_norm": 0.3623688220977783, + "learning_rate": 4.905307098064332e-06, + "loss": 0.6511, + "step": 2354 + }, + { + "epoch": 0.5487804878048781, + "grad_norm": 0.35583463311195374, + "learning_rate": 4.905223592403873e-06, + "loss": 0.6696, + "step": 2355 + }, + { + "epoch": 0.5490135156128632, + "grad_norm": 0.3501174747943878, + "learning_rate": 4.905140050651001e-06, + "loss": 0.6564, + "step": 2356 + }, + { + "epoch": 0.5492465434208482, + "grad_norm": 0.3420566916465759, + "learning_rate": 4.9050564728069695e-06, + "loss": 0.6735, + "step": 2357 + }, + { + "epoch": 0.5494795712288333, + "grad_norm": 0.3361741006374359, + "learning_rate": 4.904972858873032e-06, + "loss": 0.6663, + "step": 2358 + }, + { + "epoch": 0.5497125990368184, + "grad_norm": 0.3442595601081848, + "learning_rate": 4.904889208850443e-06, + "loss": 0.6727, + "step": 2359 + }, + { + "epoch": 0.5499456268448035, + "grad_norm": 0.3452279567718506, + "learning_rate": 4.904805522740459e-06, + "loss": 0.6808, + "step": 2360 + }, + { + "epoch": 0.5501786546527886, + "grad_norm": 0.355076402425766, + "learning_rate": 4.904721800544334e-06, + "loss": 0.6462, + "step": 2361 + }, + { + "epoch": 0.5504116824607737, + "grad_norm": 0.34952065348625183, + "learning_rate": 4.904638042263326e-06, + "loss": 0.6543, + "step": 2362 + }, + { + "epoch": 0.5506447102687587, + "grad_norm": 0.3441038727760315, + "learning_rate": 4.904554247898691e-06, + "loss": 0.6387, + "step": 2363 + }, + { + "epoch": 0.5508777380767438, + "grad_norm": 0.35111913084983826, + "learning_rate": 4.904470417451686e-06, + "loss": 0.6604, + "step": 2364 + }, + { + "epoch": 0.551110765884729, + "grad_norm": 0.3159680962562561, + "learning_rate": 4.904386550923571e-06, + "loss": 0.6471, + "step": 2365 + }, + { + "epoch": 0.551343793692714, + "grad_norm": 0.3476852476596832, + "learning_rate": 4.904302648315603e-06, + "loss": 0.691, + "step": 2366 + }, + { + "epoch": 0.5515768215006991, + "grad_norm": 0.3454022705554962, + "learning_rate": 4.904218709629039e-06, + "loss": 0.6554, + "step": 2367 + }, + { + "epoch": 0.5518098493086842, + "grad_norm": 0.34841468930244446, + "learning_rate": 4.904134734865143e-06, + "loss": 0.671, + "step": 2368 + }, + { + "epoch": 0.5520428771166692, + "grad_norm": 0.3602234423160553, + "learning_rate": 4.904050724025172e-06, + "loss": 0.6668, + "step": 2369 + }, + { + "epoch": 0.5522759049246544, + "grad_norm": 0.3374183177947998, + "learning_rate": 4.903966677110386e-06, + "loss": 0.6652, + "step": 2370 + }, + { + "epoch": 0.5525089327326395, + "grad_norm": 0.3375241160392761, + "learning_rate": 4.90388259412205e-06, + "loss": 0.6673, + "step": 2371 + }, + { + "epoch": 0.5527419605406245, + "grad_norm": 0.33248451352119446, + "learning_rate": 4.903798475061422e-06, + "loss": 0.6422, + "step": 2372 + }, + { + "epoch": 0.5529749883486096, + "grad_norm": 0.35253652930259705, + "learning_rate": 4.903714319929765e-06, + "loss": 0.6504, + "step": 2373 + }, + { + "epoch": 0.5532080161565947, + "grad_norm": 0.3544929027557373, + "learning_rate": 4.903630128728344e-06, + "loss": 0.6461, + "step": 2374 + }, + { + "epoch": 0.5534410439645798, + "grad_norm": 0.3363276422023773, + "learning_rate": 4.903545901458419e-06, + "loss": 0.6288, + "step": 2375 + }, + { + "epoch": 0.5536740717725649, + "grad_norm": 0.34455910325050354, + "learning_rate": 4.9034616381212575e-06, + "loss": 0.6397, + "step": 2376 + }, + { + "epoch": 0.55390709958055, + "grad_norm": 0.3566710650920868, + "learning_rate": 4.9033773387181206e-06, + "loss": 0.6626, + "step": 2377 + }, + { + "epoch": 0.554140127388535, + "grad_norm": 0.3519829213619232, + "learning_rate": 4.903293003250276e-06, + "loss": 0.6814, + "step": 2378 + }, + { + "epoch": 0.5543731551965201, + "grad_norm": 0.35133442282676697, + "learning_rate": 4.903208631718987e-06, + "loss": 0.6804, + "step": 2379 + }, + { + "epoch": 0.5546061830045051, + "grad_norm": 0.33776378631591797, + "learning_rate": 4.903124224125521e-06, + "loss": 0.6534, + "step": 2380 + }, + { + "epoch": 0.5548392108124903, + "grad_norm": 0.3447825014591217, + "learning_rate": 4.903039780471145e-06, + "loss": 0.6594, + "step": 2381 + }, + { + "epoch": 0.5550722386204754, + "grad_norm": 0.3430171608924866, + "learning_rate": 4.902955300757125e-06, + "loss": 0.6354, + "step": 2382 + }, + { + "epoch": 0.5553052664284605, + "grad_norm": 0.3599168062210083, + "learning_rate": 4.902870784984729e-06, + "loss": 0.6699, + "step": 2383 + }, + { + "epoch": 0.5555382942364455, + "grad_norm": 0.33511489629745483, + "learning_rate": 4.902786233155226e-06, + "loss": 0.6551, + "step": 2384 + }, + { + "epoch": 0.5557713220444306, + "grad_norm": 0.3587178885936737, + "learning_rate": 4.902701645269883e-06, + "loss": 0.667, + "step": 2385 + }, + { + "epoch": 0.5560043498524158, + "grad_norm": 0.33937975764274597, + "learning_rate": 4.902617021329971e-06, + "loss": 0.6793, + "step": 2386 + }, + { + "epoch": 0.5562373776604008, + "grad_norm": 0.3573654592037201, + "learning_rate": 4.902532361336759e-06, + "loss": 0.6593, + "step": 2387 + }, + { + "epoch": 0.5564704054683859, + "grad_norm": 0.335524320602417, + "learning_rate": 4.9024476652915185e-06, + "loss": 0.6349, + "step": 2388 + }, + { + "epoch": 0.556703433276371, + "grad_norm": 0.3466155529022217, + "learning_rate": 4.90236293319552e-06, + "loss": 0.6734, + "step": 2389 + }, + { + "epoch": 0.556936461084356, + "grad_norm": 0.34680041670799255, + "learning_rate": 4.902278165050033e-06, + "loss": 0.642, + "step": 2390 + }, + { + "epoch": 0.5571694888923412, + "grad_norm": 0.3400244414806366, + "learning_rate": 4.902193360856332e-06, + "loss": 0.6403, + "step": 2391 + }, + { + "epoch": 0.5574025167003263, + "grad_norm": 0.3358001410961151, + "learning_rate": 4.902108520615689e-06, + "loss": 0.6612, + "step": 2392 + }, + { + "epoch": 0.5576355445083113, + "grad_norm": 0.3506890833377838, + "learning_rate": 4.902023644329377e-06, + "loss": 0.6587, + "step": 2393 + }, + { + "epoch": 0.5578685723162964, + "grad_norm": 0.33254167437553406, + "learning_rate": 4.901938731998669e-06, + "loss": 0.6528, + "step": 2394 + }, + { + "epoch": 0.5581016001242814, + "grad_norm": 0.34346768260002136, + "learning_rate": 4.9018537836248395e-06, + "loss": 0.6816, + "step": 2395 + }, + { + "epoch": 0.5583346279322666, + "grad_norm": 0.3343004286289215, + "learning_rate": 4.901768799209163e-06, + "loss": 0.6448, + "step": 2396 + }, + { + "epoch": 0.5585676557402517, + "grad_norm": 0.32830286026000977, + "learning_rate": 4.901683778752916e-06, + "loss": 0.6457, + "step": 2397 + }, + { + "epoch": 0.5588006835482368, + "grad_norm": 0.3438209891319275, + "learning_rate": 4.9015987222573735e-06, + "loss": 0.6765, + "step": 2398 + }, + { + "epoch": 0.5590337113562218, + "grad_norm": 0.35312771797180176, + "learning_rate": 4.901513629723811e-06, + "loss": 0.6598, + "step": 2399 + }, + { + "epoch": 0.5592667391642069, + "grad_norm": 0.3309422731399536, + "learning_rate": 4.901428501153507e-06, + "loss": 0.6632, + "step": 2400 + }, + { + "epoch": 0.5594997669721921, + "grad_norm": 0.34470367431640625, + "learning_rate": 4.901343336547737e-06, + "loss": 0.6581, + "step": 2401 + }, + { + "epoch": 0.5597327947801771, + "grad_norm": 0.3378477394580841, + "learning_rate": 4.901258135907781e-06, + "loss": 0.6628, + "step": 2402 + }, + { + "epoch": 0.5599658225881622, + "grad_norm": 0.3376944363117218, + "learning_rate": 4.901172899234916e-06, + "loss": 0.6194, + "step": 2403 + }, + { + "epoch": 0.5601988503961473, + "grad_norm": 0.33680328726768494, + "learning_rate": 4.9010876265304215e-06, + "loss": 0.637, + "step": 2404 + }, + { + "epoch": 0.5604318782041323, + "grad_norm": 0.3307522237300873, + "learning_rate": 4.901002317795577e-06, + "loss": 0.6304, + "step": 2405 + }, + { + "epoch": 0.5606649060121175, + "grad_norm": 0.34014543890953064, + "learning_rate": 4.900916973031663e-06, + "loss": 0.6556, + "step": 2406 + }, + { + "epoch": 0.5608979338201026, + "grad_norm": 0.3475813865661621, + "learning_rate": 4.900831592239961e-06, + "loss": 0.6454, + "step": 2407 + }, + { + "epoch": 0.5611309616280876, + "grad_norm": 0.3610076606273651, + "learning_rate": 4.90074617542175e-06, + "loss": 0.6709, + "step": 2408 + }, + { + "epoch": 0.5613639894360727, + "grad_norm": 0.35430780053138733, + "learning_rate": 4.900660722578313e-06, + "loss": 0.6884, + "step": 2409 + }, + { + "epoch": 0.5615970172440578, + "grad_norm": 0.3345937430858612, + "learning_rate": 4.900575233710932e-06, + "loss": 0.6309, + "step": 2410 + }, + { + "epoch": 0.5618300450520429, + "grad_norm": 0.35633665323257446, + "learning_rate": 4.900489708820891e-06, + "loss": 0.6433, + "step": 2411 + }, + { + "epoch": 0.562063072860028, + "grad_norm": 0.3282504975795746, + "learning_rate": 4.9004041479094724e-06, + "loss": 0.6762, + "step": 2412 + }, + { + "epoch": 0.562296100668013, + "grad_norm": 0.3310927152633667, + "learning_rate": 4.90031855097796e-06, + "loss": 0.6787, + "step": 2413 + }, + { + "epoch": 0.5625291284759981, + "grad_norm": 0.33326902985572815, + "learning_rate": 4.900232918027638e-06, + "loss": 0.6523, + "step": 2414 + }, + { + "epoch": 0.5627621562839832, + "grad_norm": 0.3364003300666809, + "learning_rate": 4.9001472490597926e-06, + "loss": 0.6376, + "step": 2415 + }, + { + "epoch": 0.5629951840919684, + "grad_norm": 0.359142929315567, + "learning_rate": 4.900061544075707e-06, + "loss": 0.6753, + "step": 2416 + }, + { + "epoch": 0.5632282118999534, + "grad_norm": 0.3519333600997925, + "learning_rate": 4.89997580307667e-06, + "loss": 0.6369, + "step": 2417 + }, + { + "epoch": 0.5634612397079385, + "grad_norm": 0.3271421492099762, + "learning_rate": 4.899890026063967e-06, + "loss": 0.6377, + "step": 2418 + }, + { + "epoch": 0.5636942675159236, + "grad_norm": 0.3526638448238373, + "learning_rate": 4.899804213038885e-06, + "loss": 0.6763, + "step": 2419 + }, + { + "epoch": 0.5639272953239086, + "grad_norm": 0.33895790576934814, + "learning_rate": 4.899718364002712e-06, + "loss": 0.6439, + "step": 2420 + }, + { + "epoch": 0.5641603231318938, + "grad_norm": 0.32559123635292053, + "learning_rate": 4.899632478956736e-06, + "loss": 0.6364, + "step": 2421 + }, + { + "epoch": 0.5643933509398789, + "grad_norm": 0.355796217918396, + "learning_rate": 4.8995465579022464e-06, + "loss": 0.6879, + "step": 2422 + }, + { + "epoch": 0.5646263787478639, + "grad_norm": 0.3464781939983368, + "learning_rate": 4.899460600840531e-06, + "loss": 0.6655, + "step": 2423 + }, + { + "epoch": 0.564859406555849, + "grad_norm": 0.34073665738105774, + "learning_rate": 4.899374607772881e-06, + "loss": 0.6714, + "step": 2424 + }, + { + "epoch": 0.565092434363834, + "grad_norm": 0.33552080392837524, + "learning_rate": 4.899288578700587e-06, + "loss": 0.6376, + "step": 2425 + }, + { + "epoch": 0.5653254621718191, + "grad_norm": 0.3370748460292816, + "learning_rate": 4.899202513624939e-06, + "loss": 0.6852, + "step": 2426 + }, + { + "epoch": 0.5655584899798043, + "grad_norm": 0.33203592896461487, + "learning_rate": 4.899116412547229e-06, + "loss": 0.6558, + "step": 2427 + }, + { + "epoch": 0.5657915177877894, + "grad_norm": 0.33031588792800903, + "learning_rate": 4.899030275468749e-06, + "loss": 0.6526, + "step": 2428 + }, + { + "epoch": 0.5660245455957744, + "grad_norm": 0.3247145712375641, + "learning_rate": 4.898944102390791e-06, + "loss": 0.6348, + "step": 2429 + }, + { + "epoch": 0.5662575734037595, + "grad_norm": 0.34293025732040405, + "learning_rate": 4.898857893314649e-06, + "loss": 0.646, + "step": 2430 + }, + { + "epoch": 0.5664906012117445, + "grad_norm": 0.3392256498336792, + "learning_rate": 4.898771648241616e-06, + "loss": 0.6489, + "step": 2431 + }, + { + "epoch": 0.5667236290197297, + "grad_norm": 0.38269832730293274, + "learning_rate": 4.898685367172987e-06, + "loss": 0.7034, + "step": 2432 + }, + { + "epoch": 0.5669566568277148, + "grad_norm": 0.34509748220443726, + "learning_rate": 4.8985990501100556e-06, + "loss": 0.6276, + "step": 2433 + }, + { + "epoch": 0.5671896846356999, + "grad_norm": 0.34161463379859924, + "learning_rate": 4.898512697054117e-06, + "loss": 0.6464, + "step": 2434 + }, + { + "epoch": 0.5674227124436849, + "grad_norm": 0.35252299904823303, + "learning_rate": 4.8984263080064685e-06, + "loss": 0.6832, + "step": 2435 + }, + { + "epoch": 0.56765574025167, + "grad_norm": 0.33504241704940796, + "learning_rate": 4.898339882968406e-06, + "loss": 0.6537, + "step": 2436 + }, + { + "epoch": 0.5678887680596552, + "grad_norm": 0.34399154782295227, + "learning_rate": 4.898253421941224e-06, + "loss": 0.6535, + "step": 2437 + }, + { + "epoch": 0.5681217958676402, + "grad_norm": 0.3319203555583954, + "learning_rate": 4.8981669249262244e-06, + "loss": 0.6598, + "step": 2438 + }, + { + "epoch": 0.5683548236756253, + "grad_norm": 0.34160080552101135, + "learning_rate": 4.898080391924701e-06, + "loss": 0.6538, + "step": 2439 + }, + { + "epoch": 0.5685878514836104, + "grad_norm": 0.33524981141090393, + "learning_rate": 4.897993822937954e-06, + "loss": 0.6795, + "step": 2440 + }, + { + "epoch": 0.5688208792915954, + "grad_norm": 0.33954674005508423, + "learning_rate": 4.897907217967282e-06, + "loss": 0.6649, + "step": 2441 + }, + { + "epoch": 0.5690539070995806, + "grad_norm": 0.3493073284626007, + "learning_rate": 4.897820577013985e-06, + "loss": 0.6514, + "step": 2442 + }, + { + "epoch": 0.5692869349075657, + "grad_norm": 0.3541712462902069, + "learning_rate": 4.897733900079363e-06, + "loss": 0.6694, + "step": 2443 + }, + { + "epoch": 0.5695199627155507, + "grad_norm": 0.33940112590789795, + "learning_rate": 4.897647187164718e-06, + "loss": 0.6494, + "step": 2444 + }, + { + "epoch": 0.5697529905235358, + "grad_norm": 0.3500059247016907, + "learning_rate": 4.897560438271348e-06, + "loss": 0.64, + "step": 2445 + }, + { + "epoch": 0.5699860183315208, + "grad_norm": 0.34839195013046265, + "learning_rate": 4.897473653400558e-06, + "loss": 0.6628, + "step": 2446 + }, + { + "epoch": 0.570219046139506, + "grad_norm": 0.3687993586063385, + "learning_rate": 4.897386832553648e-06, + "loss": 0.6431, + "step": 2447 + }, + { + "epoch": 0.5704520739474911, + "grad_norm": 0.34782466292381287, + "learning_rate": 4.897299975731922e-06, + "loss": 0.674, + "step": 2448 + }, + { + "epoch": 0.5706851017554762, + "grad_norm": 0.34641706943511963, + "learning_rate": 4.897213082936684e-06, + "loss": 0.6486, + "step": 2449 + }, + { + "epoch": 0.5709181295634612, + "grad_norm": 0.36632996797561646, + "learning_rate": 4.897126154169236e-06, + "loss": 0.6862, + "step": 2450 + }, + { + "epoch": 0.5711511573714463, + "grad_norm": 0.3543465733528137, + "learning_rate": 4.897039189430884e-06, + "loss": 0.6611, + "step": 2451 + }, + { + "epoch": 0.5713841851794315, + "grad_norm": 0.3409847915172577, + "learning_rate": 4.896952188722931e-06, + "loss": 0.6583, + "step": 2452 + }, + { + "epoch": 0.5716172129874165, + "grad_norm": 0.34669995307922363, + "learning_rate": 4.896865152046685e-06, + "loss": 0.6292, + "step": 2453 + }, + { + "epoch": 0.5718502407954016, + "grad_norm": 0.3335089385509491, + "learning_rate": 4.896778079403451e-06, + "loss": 0.6423, + "step": 2454 + }, + { + "epoch": 0.5720832686033867, + "grad_norm": 0.33839696645736694, + "learning_rate": 4.896690970794536e-06, + "loss": 0.6649, + "step": 2455 + }, + { + "epoch": 0.5723162964113717, + "grad_norm": 0.33959248661994934, + "learning_rate": 4.896603826221246e-06, + "loss": 0.6477, + "step": 2456 + }, + { + "epoch": 0.5725493242193569, + "grad_norm": 0.3379269540309906, + "learning_rate": 4.896516645684889e-06, + "loss": 0.6627, + "step": 2457 + }, + { + "epoch": 0.572782352027342, + "grad_norm": 0.3408333957195282, + "learning_rate": 4.896429429186775e-06, + "loss": 0.6624, + "step": 2458 + }, + { + "epoch": 0.573015379835327, + "grad_norm": 0.3499879539012909, + "learning_rate": 4.89634217672821e-06, + "loss": 0.65, + "step": 2459 + }, + { + "epoch": 0.5732484076433121, + "grad_norm": 0.3485375940799713, + "learning_rate": 4.896254888310505e-06, + "loss": 0.6584, + "step": 2460 + }, + { + "epoch": 0.5734814354512972, + "grad_norm": 0.3403288424015045, + "learning_rate": 4.89616756393497e-06, + "loss": 0.6522, + "step": 2461 + }, + { + "epoch": 0.5737144632592823, + "grad_norm": 0.33820897340774536, + "learning_rate": 4.8960802036029145e-06, + "loss": 0.6294, + "step": 2462 + }, + { + "epoch": 0.5739474910672674, + "grad_norm": 0.36951151490211487, + "learning_rate": 4.89599280731565e-06, + "loss": 0.6594, + "step": 2463 + }, + { + "epoch": 0.5741805188752525, + "grad_norm": 0.34227415919303894, + "learning_rate": 4.895905375074488e-06, + "loss": 0.646, + "step": 2464 + }, + { + "epoch": 0.5744135466832375, + "grad_norm": 0.3511560559272766, + "learning_rate": 4.89581790688074e-06, + "loss": 0.6632, + "step": 2465 + }, + { + "epoch": 0.5746465744912226, + "grad_norm": 0.3507237434387207, + "learning_rate": 4.895730402735719e-06, + "loss": 0.6649, + "step": 2466 + }, + { + "epoch": 0.5748796022992078, + "grad_norm": 0.3660028874874115, + "learning_rate": 4.895642862640737e-06, + "loss": 0.642, + "step": 2467 + }, + { + "epoch": 0.5751126301071928, + "grad_norm": 0.3318891227245331, + "learning_rate": 4.895555286597109e-06, + "loss": 0.6516, + "step": 2468 + }, + { + "epoch": 0.5753456579151779, + "grad_norm": 0.35382750630378723, + "learning_rate": 4.895467674606149e-06, + "loss": 0.6457, + "step": 2469 + }, + { + "epoch": 0.575578685723163, + "grad_norm": 0.36818501353263855, + "learning_rate": 4.895380026669171e-06, + "loss": 0.6287, + "step": 2470 + }, + { + "epoch": 0.575811713531148, + "grad_norm": 0.3364720940589905, + "learning_rate": 4.895292342787491e-06, + "loss": 0.6606, + "step": 2471 + }, + { + "epoch": 0.5760447413391331, + "grad_norm": 0.34661829471588135, + "learning_rate": 4.895204622962424e-06, + "loss": 0.683, + "step": 2472 + }, + { + "epoch": 0.5762777691471183, + "grad_norm": 0.35747838020324707, + "learning_rate": 4.895116867195286e-06, + "loss": 0.6674, + "step": 2473 + }, + { + "epoch": 0.5765107969551033, + "grad_norm": 0.3476683795452118, + "learning_rate": 4.895029075487395e-06, + "loss": 0.6288, + "step": 2474 + }, + { + "epoch": 0.5767438247630884, + "grad_norm": 0.32421252131462097, + "learning_rate": 4.8949412478400675e-06, + "loss": 0.6063, + "step": 2475 + }, + { + "epoch": 0.5769768525710735, + "grad_norm": 0.3481488823890686, + "learning_rate": 4.894853384254622e-06, + "loss": 0.6584, + "step": 2476 + }, + { + "epoch": 0.5772098803790585, + "grad_norm": 0.34439271688461304, + "learning_rate": 4.894765484732377e-06, + "loss": 0.6646, + "step": 2477 + }, + { + "epoch": 0.5774429081870437, + "grad_norm": 0.34084150195121765, + "learning_rate": 4.894677549274651e-06, + "loss": 0.6609, + "step": 2478 + }, + { + "epoch": 0.5776759359950288, + "grad_norm": 0.33294644951820374, + "learning_rate": 4.894589577882764e-06, + "loss": 0.6368, + "step": 2479 + }, + { + "epoch": 0.5779089638030138, + "grad_norm": 0.374744713306427, + "learning_rate": 4.894501570558035e-06, + "loss": 0.663, + "step": 2480 + }, + { + "epoch": 0.5781419916109989, + "grad_norm": 0.3488067388534546, + "learning_rate": 4.8944135273017865e-06, + "loss": 0.6822, + "step": 2481 + }, + { + "epoch": 0.578375019418984, + "grad_norm": 0.352668434381485, + "learning_rate": 4.894325448115339e-06, + "loss": 0.6603, + "step": 2482 + }, + { + "epoch": 0.5786080472269691, + "grad_norm": 0.33418941497802734, + "learning_rate": 4.894237333000014e-06, + "loss": 0.6144, + "step": 2483 + }, + { + "epoch": 0.5788410750349542, + "grad_norm": 0.3422946035861969, + "learning_rate": 4.894149181957133e-06, + "loss": 0.6383, + "step": 2484 + }, + { + "epoch": 0.5790741028429393, + "grad_norm": 0.3422331213951111, + "learning_rate": 4.894060994988019e-06, + "loss": 0.6572, + "step": 2485 + }, + { + "epoch": 0.5793071306509243, + "grad_norm": 0.3405226469039917, + "learning_rate": 4.893972772093996e-06, + "loss": 0.6492, + "step": 2486 + }, + { + "epoch": 0.5795401584589094, + "grad_norm": 0.32643023133277893, + "learning_rate": 4.893884513276388e-06, + "loss": 0.6512, + "step": 2487 + }, + { + "epoch": 0.5797731862668946, + "grad_norm": 0.3567380905151367, + "learning_rate": 4.893796218536518e-06, + "loss": 0.6309, + "step": 2488 + }, + { + "epoch": 0.5800062140748796, + "grad_norm": 0.3345935344696045, + "learning_rate": 4.893707887875713e-06, + "loss": 0.6565, + "step": 2489 + }, + { + "epoch": 0.5802392418828647, + "grad_norm": 0.35161277651786804, + "learning_rate": 4.893619521295298e-06, + "loss": 0.6518, + "step": 2490 + }, + { + "epoch": 0.5804722696908498, + "grad_norm": 0.33171215653419495, + "learning_rate": 4.8935311187965975e-06, + "loss": 0.6625, + "step": 2491 + }, + { + "epoch": 0.5807052974988348, + "grad_norm": 0.3473520576953888, + "learning_rate": 4.893442680380939e-06, + "loss": 0.6386, + "step": 2492 + }, + { + "epoch": 0.58093832530682, + "grad_norm": 0.34616607427597046, + "learning_rate": 4.8933542060496485e-06, + "loss": 0.6397, + "step": 2493 + }, + { + "epoch": 0.5811713531148051, + "grad_norm": 0.3544016182422638, + "learning_rate": 4.893265695804056e-06, + "loss": 0.663, + "step": 2494 + }, + { + "epoch": 0.5814043809227901, + "grad_norm": 0.34805864095687866, + "learning_rate": 4.893177149645488e-06, + "loss": 0.656, + "step": 2495 + }, + { + "epoch": 0.5816374087307752, + "grad_norm": 0.3590967357158661, + "learning_rate": 4.893088567575274e-06, + "loss": 0.6781, + "step": 2496 + }, + { + "epoch": 0.5818704365387602, + "grad_norm": 0.3215422034263611, + "learning_rate": 4.892999949594743e-06, + "loss": 0.6647, + "step": 2497 + }, + { + "epoch": 0.5821034643467454, + "grad_norm": 0.3562380373477936, + "learning_rate": 4.892911295705224e-06, + "loss": 0.6778, + "step": 2498 + }, + { + "epoch": 0.5823364921547305, + "grad_norm": 0.34939372539520264, + "learning_rate": 4.892822605908048e-06, + "loss": 0.6516, + "step": 2499 + }, + { + "epoch": 0.5825695199627156, + "grad_norm": 0.3535986840724945, + "learning_rate": 4.892733880204546e-06, + "loss": 0.6417, + "step": 2500 + }, + { + "epoch": 0.5828025477707006, + "grad_norm": 0.3369734585285187, + "learning_rate": 4.8926451185960486e-06, + "loss": 0.6288, + "step": 2501 + }, + { + "epoch": 0.5830355755786857, + "grad_norm": 0.3484233021736145, + "learning_rate": 4.892556321083889e-06, + "loss": 0.6635, + "step": 2502 + }, + { + "epoch": 0.5832686033866709, + "grad_norm": 0.3375922739505768, + "learning_rate": 4.892467487669399e-06, + "loss": 0.6376, + "step": 2503 + }, + { + "epoch": 0.5835016311946559, + "grad_norm": 0.3397925794124603, + "learning_rate": 4.892378618353912e-06, + "loss": 0.6429, + "step": 2504 + }, + { + "epoch": 0.583734659002641, + "grad_norm": 0.35540929436683655, + "learning_rate": 4.892289713138759e-06, + "loss": 0.6268, + "step": 2505 + }, + { + "epoch": 0.583967686810626, + "grad_norm": 0.3427555561065674, + "learning_rate": 4.892200772025278e-06, + "loss": 0.6575, + "step": 2506 + }, + { + "epoch": 0.5842007146186111, + "grad_norm": 0.34498658776283264, + "learning_rate": 4.892111795014803e-06, + "loss": 0.6771, + "step": 2507 + }, + { + "epoch": 0.5844337424265963, + "grad_norm": 0.34904202818870544, + "learning_rate": 4.8920227821086665e-06, + "loss": 0.6728, + "step": 2508 + }, + { + "epoch": 0.5846667702345814, + "grad_norm": 0.3489423990249634, + "learning_rate": 4.8919337333082065e-06, + "loss": 0.6863, + "step": 2509 + }, + { + "epoch": 0.5848997980425664, + "grad_norm": 0.330411821603775, + "learning_rate": 4.891844648614758e-06, + "loss": 0.6202, + "step": 2510 + }, + { + "epoch": 0.5851328258505515, + "grad_norm": 0.3372916579246521, + "learning_rate": 4.891755528029659e-06, + "loss": 0.6431, + "step": 2511 + }, + { + "epoch": 0.5853658536585366, + "grad_norm": 0.33095675706863403, + "learning_rate": 4.891666371554247e-06, + "loss": 0.644, + "step": 2512 + }, + { + "epoch": 0.5855988814665217, + "grad_norm": 0.352293461561203, + "learning_rate": 4.891577179189858e-06, + "loss": 0.6699, + "step": 2513 + }, + { + "epoch": 0.5858319092745068, + "grad_norm": 0.3475862443447113, + "learning_rate": 4.891487950937831e-06, + "loss": 0.6524, + "step": 2514 + }, + { + "epoch": 0.5860649370824919, + "grad_norm": 0.3525044620037079, + "learning_rate": 4.891398686799507e-06, + "loss": 0.6467, + "step": 2515 + }, + { + "epoch": 0.5862979648904769, + "grad_norm": 0.3392266035079956, + "learning_rate": 4.891309386776223e-06, + "loss": 0.6652, + "step": 2516 + }, + { + "epoch": 0.586530992698462, + "grad_norm": 0.3535720109939575, + "learning_rate": 4.89122005086932e-06, + "loss": 0.6698, + "step": 2517 + }, + { + "epoch": 0.586764020506447, + "grad_norm": 0.3465578854084015, + "learning_rate": 4.8911306790801374e-06, + "loss": 0.6509, + "step": 2518 + }, + { + "epoch": 0.5869970483144322, + "grad_norm": 0.34013986587524414, + "learning_rate": 4.891041271410019e-06, + "loss": 0.6199, + "step": 2519 + }, + { + "epoch": 0.5872300761224173, + "grad_norm": 0.3575129508972168, + "learning_rate": 4.890951827860304e-06, + "loss": 0.6733, + "step": 2520 + }, + { + "epoch": 0.5874631039304024, + "grad_norm": 0.35315898060798645, + "learning_rate": 4.890862348432336e-06, + "loss": 0.6263, + "step": 2521 + }, + { + "epoch": 0.5876961317383874, + "grad_norm": 0.3743421137332916, + "learning_rate": 4.890772833127456e-06, + "loss": 0.6424, + "step": 2522 + }, + { + "epoch": 0.5879291595463725, + "grad_norm": 0.3582768738269806, + "learning_rate": 4.890683281947009e-06, + "loss": 0.6374, + "step": 2523 + }, + { + "epoch": 0.5881621873543577, + "grad_norm": 0.33387085795402527, + "learning_rate": 4.8905936948923395e-06, + "loss": 0.6446, + "step": 2524 + }, + { + "epoch": 0.5883952151623427, + "grad_norm": 0.3350699841976166, + "learning_rate": 4.890504071964789e-06, + "loss": 0.6292, + "step": 2525 + }, + { + "epoch": 0.5886282429703278, + "grad_norm": 0.34523510932922363, + "learning_rate": 4.890414413165704e-06, + "loss": 0.6535, + "step": 2526 + }, + { + "epoch": 0.5888612707783129, + "grad_norm": 0.3659435212612152, + "learning_rate": 4.8903247184964304e-06, + "loss": 0.6668, + "step": 2527 + }, + { + "epoch": 0.5890942985862979, + "grad_norm": 0.34710124135017395, + "learning_rate": 4.890234987958313e-06, + "loss": 0.647, + "step": 2528 + }, + { + "epoch": 0.5893273263942831, + "grad_norm": 0.3437765836715698, + "learning_rate": 4.890145221552699e-06, + "loss": 0.6265, + "step": 2529 + }, + { + "epoch": 0.5895603542022682, + "grad_norm": 0.348409503698349, + "learning_rate": 4.890055419280935e-06, + "loss": 0.6787, + "step": 2530 + }, + { + "epoch": 0.5897933820102532, + "grad_norm": 0.353401780128479, + "learning_rate": 4.889965581144369e-06, + "loss": 0.683, + "step": 2531 + }, + { + "epoch": 0.5900264098182383, + "grad_norm": 0.3387104868888855, + "learning_rate": 4.8898757071443484e-06, + "loss": 0.6351, + "step": 2532 + }, + { + "epoch": 0.5902594376262233, + "grad_norm": 0.3497883677482605, + "learning_rate": 4.889785797282222e-06, + "loss": 0.6787, + "step": 2533 + }, + { + "epoch": 0.5904924654342085, + "grad_norm": 0.3397500514984131, + "learning_rate": 4.889695851559341e-06, + "loss": 0.6365, + "step": 2534 + }, + { + "epoch": 0.5907254932421936, + "grad_norm": 0.3715534806251526, + "learning_rate": 4.889605869977052e-06, + "loss": 0.634, + "step": 2535 + }, + { + "epoch": 0.5909585210501787, + "grad_norm": 0.3507728576660156, + "learning_rate": 4.889515852536707e-06, + "loss": 0.654, + "step": 2536 + }, + { + "epoch": 0.5911915488581637, + "grad_norm": 0.358055979013443, + "learning_rate": 4.889425799239656e-06, + "loss": 0.6144, + "step": 2537 + }, + { + "epoch": 0.5914245766661488, + "grad_norm": 0.3644821047782898, + "learning_rate": 4.889335710087252e-06, + "loss": 0.6585, + "step": 2538 + }, + { + "epoch": 0.591657604474134, + "grad_norm": 0.345765084028244, + "learning_rate": 4.889245585080845e-06, + "loss": 0.6409, + "step": 2539 + }, + { + "epoch": 0.591890632282119, + "grad_norm": 0.3698848783969879, + "learning_rate": 4.889155424221788e-06, + "loss": 0.6184, + "step": 2540 + }, + { + "epoch": 0.5921236600901041, + "grad_norm": 0.370880126953125, + "learning_rate": 4.889065227511434e-06, + "loss": 0.6448, + "step": 2541 + }, + { + "epoch": 0.5923566878980892, + "grad_norm": 0.3936658799648285, + "learning_rate": 4.888974994951136e-06, + "loss": 0.6441, + "step": 2542 + }, + { + "epoch": 0.5925897157060742, + "grad_norm": 0.34025487303733826, + "learning_rate": 4.888884726542248e-06, + "loss": 0.6427, + "step": 2543 + }, + { + "epoch": 0.5928227435140594, + "grad_norm": 0.3503722548484802, + "learning_rate": 4.888794422286126e-06, + "loss": 0.6627, + "step": 2544 + }, + { + "epoch": 0.5930557713220445, + "grad_norm": 0.3342708647251129, + "learning_rate": 4.888704082184124e-06, + "loss": 0.6519, + "step": 2545 + }, + { + "epoch": 0.5932887991300295, + "grad_norm": 0.3641882538795471, + "learning_rate": 4.8886137062375974e-06, + "loss": 0.6637, + "step": 2546 + }, + { + "epoch": 0.5935218269380146, + "grad_norm": 0.3467912971973419, + "learning_rate": 4.888523294447903e-06, + "loss": 0.6303, + "step": 2547 + }, + { + "epoch": 0.5937548547459996, + "grad_norm": 0.34553229808807373, + "learning_rate": 4.888432846816398e-06, + "loss": 0.6632, + "step": 2548 + }, + { + "epoch": 0.5939878825539848, + "grad_norm": 0.34393686056137085, + "learning_rate": 4.888342363344437e-06, + "loss": 0.6384, + "step": 2549 + }, + { + "epoch": 0.5942209103619699, + "grad_norm": 0.3331138491630554, + "learning_rate": 4.888251844033381e-06, + "loss": 0.6549, + "step": 2550 + }, + { + "epoch": 0.594453938169955, + "grad_norm": 0.35536664724349976, + "learning_rate": 4.888161288884586e-06, + "loss": 0.6778, + "step": 2551 + }, + { + "epoch": 0.59468696597794, + "grad_norm": 0.34683260321617126, + "learning_rate": 4.888070697899413e-06, + "loss": 0.6446, + "step": 2552 + }, + { + "epoch": 0.5949199937859251, + "grad_norm": 0.35574328899383545, + "learning_rate": 4.8879800710792195e-06, + "loss": 0.692, + "step": 2553 + }, + { + "epoch": 0.5951530215939103, + "grad_norm": 0.3349967896938324, + "learning_rate": 4.8878894084253655e-06, + "loss": 0.6446, + "step": 2554 + }, + { + "epoch": 0.5953860494018953, + "grad_norm": 0.35106703639030457, + "learning_rate": 4.887798709939213e-06, + "loss": 0.6777, + "step": 2555 + }, + { + "epoch": 0.5956190772098804, + "grad_norm": 0.33402925729751587, + "learning_rate": 4.8877079756221224e-06, + "loss": 0.6616, + "step": 2556 + }, + { + "epoch": 0.5958521050178655, + "grad_norm": 0.3368925154209137, + "learning_rate": 4.887617205475455e-06, + "loss": 0.6379, + "step": 2557 + }, + { + "epoch": 0.5960851328258505, + "grad_norm": 0.3367069959640503, + "learning_rate": 4.887526399500572e-06, + "loss": 0.6696, + "step": 2558 + }, + { + "epoch": 0.5963181606338357, + "grad_norm": 0.353932648897171, + "learning_rate": 4.887435557698836e-06, + "loss": 0.6613, + "step": 2559 + }, + { + "epoch": 0.5965511884418208, + "grad_norm": 0.3459806442260742, + "learning_rate": 4.8873446800716125e-06, + "loss": 0.6642, + "step": 2560 + }, + { + "epoch": 0.5967842162498058, + "grad_norm": 0.34992364048957825, + "learning_rate": 4.887253766620264e-06, + "loss": 0.6712, + "step": 2561 + }, + { + "epoch": 0.5970172440577909, + "grad_norm": 0.33918505907058716, + "learning_rate": 4.887162817346154e-06, + "loss": 0.6112, + "step": 2562 + }, + { + "epoch": 0.597250271865776, + "grad_norm": 0.34996750950813293, + "learning_rate": 4.887071832250647e-06, + "loss": 0.6634, + "step": 2563 + }, + { + "epoch": 0.597483299673761, + "grad_norm": 0.33108842372894287, + "learning_rate": 4.88698081133511e-06, + "loss": 0.6433, + "step": 2564 + }, + { + "epoch": 0.5977163274817462, + "grad_norm": 0.34658336639404297, + "learning_rate": 4.886889754600907e-06, + "loss": 0.6495, + "step": 2565 + }, + { + "epoch": 0.5979493552897313, + "grad_norm": 0.3292175531387329, + "learning_rate": 4.886798662049406e-06, + "loss": 0.6564, + "step": 2566 + }, + { + "epoch": 0.5981823830977163, + "grad_norm": 0.35623353719711304, + "learning_rate": 4.886707533681973e-06, + "loss": 0.6646, + "step": 2567 + }, + { + "epoch": 0.5984154109057014, + "grad_norm": 0.34437036514282227, + "learning_rate": 4.886616369499975e-06, + "loss": 0.6537, + "step": 2568 + }, + { + "epoch": 0.5986484387136864, + "grad_norm": 0.34387362003326416, + "learning_rate": 4.886525169504781e-06, + "loss": 0.6632, + "step": 2569 + }, + { + "epoch": 0.5988814665216716, + "grad_norm": 0.3416000306606293, + "learning_rate": 4.886433933697759e-06, + "loss": 0.6225, + "step": 2570 + }, + { + "epoch": 0.5991144943296567, + "grad_norm": 0.3664180040359497, + "learning_rate": 4.886342662080279e-06, + "loss": 0.666, + "step": 2571 + }, + { + "epoch": 0.5993475221376418, + "grad_norm": 0.35274410247802734, + "learning_rate": 4.886251354653709e-06, + "loss": 0.6583, + "step": 2572 + }, + { + "epoch": 0.5995805499456268, + "grad_norm": 0.35752496123313904, + "learning_rate": 4.886160011419421e-06, + "loss": 0.6631, + "step": 2573 + }, + { + "epoch": 0.5998135777536119, + "grad_norm": 0.3541601002216339, + "learning_rate": 4.886068632378784e-06, + "loss": 0.6276, + "step": 2574 + }, + { + "epoch": 0.6000466055615971, + "grad_norm": 0.34447914361953735, + "learning_rate": 4.88597721753317e-06, + "loss": 0.675, + "step": 2575 + }, + { + "epoch": 0.6002796333695821, + "grad_norm": 0.339218407869339, + "learning_rate": 4.8858857668839495e-06, + "loss": 0.6569, + "step": 2576 + }, + { + "epoch": 0.6005126611775672, + "grad_norm": 0.34126797318458557, + "learning_rate": 4.885794280432498e-06, + "loss": 0.6524, + "step": 2577 + }, + { + "epoch": 0.6007456889855523, + "grad_norm": 0.34601423144340515, + "learning_rate": 4.885702758180186e-06, + "loss": 0.653, + "step": 2578 + }, + { + "epoch": 0.6009787167935373, + "grad_norm": 0.36766380071640015, + "learning_rate": 4.885611200128385e-06, + "loss": 0.6549, + "step": 2579 + }, + { + "epoch": 0.6012117446015225, + "grad_norm": 0.3594924807548523, + "learning_rate": 4.8855196062784725e-06, + "loss": 0.6561, + "step": 2580 + }, + { + "epoch": 0.6014447724095076, + "grad_norm": 0.3527984321117401, + "learning_rate": 4.885427976631821e-06, + "loss": 0.6672, + "step": 2581 + }, + { + "epoch": 0.6016778002174926, + "grad_norm": 0.3369966745376587, + "learning_rate": 4.885336311189807e-06, + "loss": 0.6403, + "step": 2582 + }, + { + "epoch": 0.6019108280254777, + "grad_norm": 0.34029021859169006, + "learning_rate": 4.885244609953805e-06, + "loss": 0.6814, + "step": 2583 + }, + { + "epoch": 0.6021438558334627, + "grad_norm": 0.3441813290119171, + "learning_rate": 4.88515287292519e-06, + "loss": 0.6575, + "step": 2584 + }, + { + "epoch": 0.6023768836414479, + "grad_norm": 0.33488985896110535, + "learning_rate": 4.88506110010534e-06, + "loss": 0.6372, + "step": 2585 + }, + { + "epoch": 0.602609911449433, + "grad_norm": 0.3547884523868561, + "learning_rate": 4.8849692914956315e-06, + "loss": 0.6479, + "step": 2586 + }, + { + "epoch": 0.602842939257418, + "grad_norm": 0.35694798827171326, + "learning_rate": 4.884877447097443e-06, + "loss": 0.6451, + "step": 2587 + }, + { + "epoch": 0.6030759670654031, + "grad_norm": 0.35762032866477966, + "learning_rate": 4.884785566912152e-06, + "loss": 0.667, + "step": 2588 + }, + { + "epoch": 0.6033089948733882, + "grad_norm": 0.3530557155609131, + "learning_rate": 4.884693650941137e-06, + "loss": 0.6447, + "step": 2589 + }, + { + "epoch": 0.6035420226813734, + "grad_norm": 0.33933025598526, + "learning_rate": 4.884601699185778e-06, + "loss": 0.6668, + "step": 2590 + }, + { + "epoch": 0.6037750504893584, + "grad_norm": 0.34513285756111145, + "learning_rate": 4.884509711647454e-06, + "loss": 0.6486, + "step": 2591 + }, + { + "epoch": 0.6040080782973435, + "grad_norm": 0.3470650017261505, + "learning_rate": 4.884417688327546e-06, + "loss": 0.6845, + "step": 2592 + }, + { + "epoch": 0.6042411061053286, + "grad_norm": 0.34426867961883545, + "learning_rate": 4.884325629227436e-06, + "loss": 0.6535, + "step": 2593 + }, + { + "epoch": 0.6044741339133136, + "grad_norm": 0.3360567092895508, + "learning_rate": 4.884233534348503e-06, + "loss": 0.6358, + "step": 2594 + }, + { + "epoch": 0.6047071617212988, + "grad_norm": 0.3453570008277893, + "learning_rate": 4.88414140369213e-06, + "loss": 0.6285, + "step": 2595 + }, + { + "epoch": 0.6049401895292839, + "grad_norm": 0.33319783210754395, + "learning_rate": 4.884049237259699e-06, + "loss": 0.6328, + "step": 2596 + }, + { + "epoch": 0.6051732173372689, + "grad_norm": 0.35306379199028015, + "learning_rate": 4.883957035052594e-06, + "loss": 0.6578, + "step": 2597 + }, + { + "epoch": 0.605406245145254, + "grad_norm": 0.3419654667377472, + "learning_rate": 4.883864797072199e-06, + "loss": 0.666, + "step": 2598 + }, + { + "epoch": 0.605639272953239, + "grad_norm": 0.34253019094467163, + "learning_rate": 4.8837725233198965e-06, + "loss": 0.6658, + "step": 2599 + }, + { + "epoch": 0.6058723007612242, + "grad_norm": 0.38206812739372253, + "learning_rate": 4.883680213797072e-06, + "loss": 0.6823, + "step": 2600 + }, + { + "epoch": 0.6061053285692093, + "grad_norm": 0.3403790593147278, + "learning_rate": 4.883587868505111e-06, + "loss": 0.6461, + "step": 2601 + }, + { + "epoch": 0.6063383563771944, + "grad_norm": 0.35757386684417725, + "learning_rate": 4.883495487445399e-06, + "loss": 0.6324, + "step": 2602 + }, + { + "epoch": 0.6065713841851794, + "grad_norm": 0.3894188702106476, + "learning_rate": 4.883403070619322e-06, + "loss": 0.6407, + "step": 2603 + }, + { + "epoch": 0.6068044119931645, + "grad_norm": 0.3396351635456085, + "learning_rate": 4.883310618028266e-06, + "loss": 0.659, + "step": 2604 + }, + { + "epoch": 0.6070374398011497, + "grad_norm": 0.35221219062805176, + "learning_rate": 4.88321812967362e-06, + "loss": 0.6687, + "step": 2605 + }, + { + "epoch": 0.6072704676091347, + "grad_norm": 0.34768787026405334, + "learning_rate": 4.883125605556771e-06, + "loss": 0.6568, + "step": 2606 + }, + { + "epoch": 0.6075034954171198, + "grad_norm": 0.3504268527030945, + "learning_rate": 4.883033045679108e-06, + "loss": 0.6444, + "step": 2607 + }, + { + "epoch": 0.6077365232251049, + "grad_norm": 0.33256351947784424, + "learning_rate": 4.882940450042018e-06, + "loss": 0.6412, + "step": 2608 + }, + { + "epoch": 0.6079695510330899, + "grad_norm": 0.3416499197483063, + "learning_rate": 4.882847818646893e-06, + "loss": 0.6666, + "step": 2609 + }, + { + "epoch": 0.608202578841075, + "grad_norm": 0.3553072512149811, + "learning_rate": 4.882755151495122e-06, + "loss": 0.6512, + "step": 2610 + }, + { + "epoch": 0.6084356066490602, + "grad_norm": 0.3574197292327881, + "learning_rate": 4.882662448588095e-06, + "loss": 0.6519, + "step": 2611 + }, + { + "epoch": 0.6086686344570452, + "grad_norm": 0.3322905898094177, + "learning_rate": 4.882569709927204e-06, + "loss": 0.6396, + "step": 2612 + }, + { + "epoch": 0.6089016622650303, + "grad_norm": 0.33434003591537476, + "learning_rate": 4.88247693551384e-06, + "loss": 0.6283, + "step": 2613 + }, + { + "epoch": 0.6091346900730154, + "grad_norm": 0.35748299956321716, + "learning_rate": 4.882384125349395e-06, + "loss": 0.6632, + "step": 2614 + }, + { + "epoch": 0.6093677178810004, + "grad_norm": 0.3398849070072174, + "learning_rate": 4.882291279435263e-06, + "loss": 0.6422, + "step": 2615 + }, + { + "epoch": 0.6096007456889856, + "grad_norm": 0.345988392829895, + "learning_rate": 4.882198397772835e-06, + "loss": 0.6236, + "step": 2616 + }, + { + "epoch": 0.6098337734969707, + "grad_norm": 0.33715295791625977, + "learning_rate": 4.882105480363507e-06, + "loss": 0.6642, + "step": 2617 + }, + { + "epoch": 0.6100668013049557, + "grad_norm": 0.3321945071220398, + "learning_rate": 4.882012527208672e-06, + "loss": 0.629, + "step": 2618 + }, + { + "epoch": 0.6102998291129408, + "grad_norm": 0.3532421886920929, + "learning_rate": 4.881919538309725e-06, + "loss": 0.6758, + "step": 2619 + }, + { + "epoch": 0.6105328569209258, + "grad_norm": 0.3501252233982086, + "learning_rate": 4.881826513668061e-06, + "loss": 0.646, + "step": 2620 + }, + { + "epoch": 0.610765884728911, + "grad_norm": 0.3478323817253113, + "learning_rate": 4.881733453285078e-06, + "loss": 0.6414, + "step": 2621 + }, + { + "epoch": 0.6109989125368961, + "grad_norm": 0.3388736844062805, + "learning_rate": 4.881640357162169e-06, + "loss": 0.6451, + "step": 2622 + }, + { + "epoch": 0.6112319403448812, + "grad_norm": 0.33821845054626465, + "learning_rate": 4.881547225300734e-06, + "loss": 0.6451, + "step": 2623 + }, + { + "epoch": 0.6114649681528662, + "grad_norm": 0.33728325366973877, + "learning_rate": 4.8814540577021694e-06, + "loss": 0.6664, + "step": 2624 + }, + { + "epoch": 0.6116979959608513, + "grad_norm": 0.3347976803779602, + "learning_rate": 4.881360854367873e-06, + "loss": 0.6597, + "step": 2625 + }, + { + "epoch": 0.6119310237688365, + "grad_norm": 0.3523538410663605, + "learning_rate": 4.881267615299244e-06, + "loss": 0.6489, + "step": 2626 + }, + { + "epoch": 0.6121640515768215, + "grad_norm": 0.3526168167591095, + "learning_rate": 4.881174340497682e-06, + "loss": 0.6461, + "step": 2627 + }, + { + "epoch": 0.6123970793848066, + "grad_norm": 0.3471056818962097, + "learning_rate": 4.881081029964585e-06, + "loss": 0.6543, + "step": 2628 + }, + { + "epoch": 0.6126301071927917, + "grad_norm": 0.3577391803264618, + "learning_rate": 4.880987683701353e-06, + "loss": 0.6469, + "step": 2629 + }, + { + "epoch": 0.6128631350007767, + "grad_norm": 0.3389747738838196, + "learning_rate": 4.88089430170939e-06, + "loss": 0.6357, + "step": 2630 + }, + { + "epoch": 0.6130961628087619, + "grad_norm": 0.37186405062675476, + "learning_rate": 4.880800883990094e-06, + "loss": 0.6564, + "step": 2631 + }, + { + "epoch": 0.613329190616747, + "grad_norm": 0.3473242521286011, + "learning_rate": 4.880707430544867e-06, + "loss": 0.6395, + "step": 2632 + }, + { + "epoch": 0.613562218424732, + "grad_norm": 0.3425268530845642, + "learning_rate": 4.8806139413751144e-06, + "loss": 0.6477, + "step": 2633 + }, + { + "epoch": 0.6137952462327171, + "grad_norm": 0.35096481442451477, + "learning_rate": 4.880520416482235e-06, + "loss": 0.6563, + "step": 2634 + }, + { + "epoch": 0.6140282740407021, + "grad_norm": 0.34155115485191345, + "learning_rate": 4.880426855867635e-06, + "loss": 0.6451, + "step": 2635 + }, + { + "epoch": 0.6142613018486873, + "grad_norm": 0.3373972773551941, + "learning_rate": 4.880333259532718e-06, + "loss": 0.6363, + "step": 2636 + }, + { + "epoch": 0.6144943296566724, + "grad_norm": 0.3239922821521759, + "learning_rate": 4.880239627478886e-06, + "loss": 0.6382, + "step": 2637 + }, + { + "epoch": 0.6147273574646575, + "grad_norm": 0.351337194442749, + "learning_rate": 4.880145959707548e-06, + "loss": 0.6506, + "step": 2638 + }, + { + "epoch": 0.6149603852726425, + "grad_norm": 0.33865582942962646, + "learning_rate": 4.880052256220107e-06, + "loss": 0.6552, + "step": 2639 + }, + { + "epoch": 0.6151934130806276, + "grad_norm": 0.3487749993801117, + "learning_rate": 4.87995851701797e-06, + "loss": 0.6557, + "step": 2640 + }, + { + "epoch": 0.6154264408886128, + "grad_norm": 0.349433034658432, + "learning_rate": 4.8798647421025425e-06, + "loss": 0.6437, + "step": 2641 + }, + { + "epoch": 0.6156594686965978, + "grad_norm": 0.3349849581718445, + "learning_rate": 4.879770931475234e-06, + "loss": 0.6307, + "step": 2642 + }, + { + "epoch": 0.6158924965045829, + "grad_norm": 0.35225069522857666, + "learning_rate": 4.8796770851374496e-06, + "loss": 0.6524, + "step": 2643 + }, + { + "epoch": 0.616125524312568, + "grad_norm": 0.3360063433647156, + "learning_rate": 4.879583203090598e-06, + "loss": 0.6849, + "step": 2644 + }, + { + "epoch": 0.616358552120553, + "grad_norm": 0.3564579486846924, + "learning_rate": 4.8794892853360896e-06, + "loss": 0.637, + "step": 2645 + }, + { + "epoch": 0.6165915799285382, + "grad_norm": 0.3398532569408417, + "learning_rate": 4.879395331875333e-06, + "loss": 0.6688, + "step": 2646 + }, + { + "epoch": 0.6168246077365233, + "grad_norm": 0.3442870080471039, + "learning_rate": 4.8793013427097366e-06, + "loss": 0.6517, + "step": 2647 + }, + { + "epoch": 0.6170576355445083, + "grad_norm": 0.3502770960330963, + "learning_rate": 4.879207317840713e-06, + "loss": 0.6656, + "step": 2648 + }, + { + "epoch": 0.6172906633524934, + "grad_norm": 0.3342411518096924, + "learning_rate": 4.879113257269672e-06, + "loss": 0.6474, + "step": 2649 + }, + { + "epoch": 0.6175236911604784, + "grad_norm": 0.34520772099494934, + "learning_rate": 4.879019160998025e-06, + "loss": 0.6557, + "step": 2650 + }, + { + "epoch": 0.6177567189684636, + "grad_norm": 0.3340608775615692, + "learning_rate": 4.878925029027185e-06, + "loss": 0.653, + "step": 2651 + }, + { + "epoch": 0.6179897467764487, + "grad_norm": 0.351064532995224, + "learning_rate": 4.878830861358562e-06, + "loss": 0.6655, + "step": 2652 + }, + { + "epoch": 0.6182227745844338, + "grad_norm": 0.3670513331890106, + "learning_rate": 4.878736657993572e-06, + "loss": 0.6375, + "step": 2653 + }, + { + "epoch": 0.6184558023924188, + "grad_norm": 0.3375775218009949, + "learning_rate": 4.878642418933627e-06, + "loss": 0.6418, + "step": 2654 + }, + { + "epoch": 0.6186888302004039, + "grad_norm": 0.33090412616729736, + "learning_rate": 4.878548144180142e-06, + "loss": 0.6273, + "step": 2655 + }, + { + "epoch": 0.618921858008389, + "grad_norm": 0.34983256459236145, + "learning_rate": 4.8784538337345306e-06, + "loss": 0.6689, + "step": 2656 + }, + { + "epoch": 0.6191548858163741, + "grad_norm": 0.37634211778640747, + "learning_rate": 4.878359487598208e-06, + "loss": 0.6205, + "step": 2657 + }, + { + "epoch": 0.6193879136243592, + "grad_norm": 0.34730198979377747, + "learning_rate": 4.878265105772592e-06, + "loss": 0.64, + "step": 2658 + }, + { + "epoch": 0.6196209414323443, + "grad_norm": 0.3499267101287842, + "learning_rate": 4.878170688259096e-06, + "loss": 0.6797, + "step": 2659 + }, + { + "epoch": 0.6198539692403293, + "grad_norm": 0.35644441843032837, + "learning_rate": 4.878076235059139e-06, + "loss": 0.6123, + "step": 2660 + }, + { + "epoch": 0.6200869970483144, + "grad_norm": 0.350254625082016, + "learning_rate": 4.8779817461741375e-06, + "loss": 0.628, + "step": 2661 + }, + { + "epoch": 0.6203200248562996, + "grad_norm": 0.35024625062942505, + "learning_rate": 4.87788722160551e-06, + "loss": 0.6652, + "step": 2662 + }, + { + "epoch": 0.6205530526642846, + "grad_norm": 0.35706692934036255, + "learning_rate": 4.8777926613546735e-06, + "loss": 0.6262, + "step": 2663 + }, + { + "epoch": 0.6207860804722697, + "grad_norm": 0.34361332654953003, + "learning_rate": 4.877698065423048e-06, + "loss": 0.6842, + "step": 2664 + }, + { + "epoch": 0.6210191082802548, + "grad_norm": 0.35742130875587463, + "learning_rate": 4.877603433812053e-06, + "loss": 0.6634, + "step": 2665 + }, + { + "epoch": 0.6212521360882398, + "grad_norm": 0.35057124495506287, + "learning_rate": 4.877508766523108e-06, + "loss": 0.6539, + "step": 2666 + }, + { + "epoch": 0.621485163896225, + "grad_norm": 0.33745867013931274, + "learning_rate": 4.877414063557634e-06, + "loss": 0.643, + "step": 2667 + }, + { + "epoch": 0.62171819170421, + "grad_norm": 0.33556294441223145, + "learning_rate": 4.877319324917052e-06, + "loss": 0.6698, + "step": 2668 + }, + { + "epoch": 0.6219512195121951, + "grad_norm": 0.343355655670166, + "learning_rate": 4.877224550602783e-06, + "loss": 0.671, + "step": 2669 + }, + { + "epoch": 0.6221842473201802, + "grad_norm": 0.3411237895488739, + "learning_rate": 4.877129740616252e-06, + "loss": 0.6327, + "step": 2670 + }, + { + "epoch": 0.6224172751281652, + "grad_norm": 0.3260954022407532, + "learning_rate": 4.8770348949588775e-06, + "loss": 0.659, + "step": 2671 + }, + { + "epoch": 0.6226503029361504, + "grad_norm": 0.34767940640449524, + "learning_rate": 4.876940013632086e-06, + "loss": 0.6634, + "step": 2672 + }, + { + "epoch": 0.6228833307441355, + "grad_norm": 0.3397616446018219, + "learning_rate": 4.8768450966373e-06, + "loss": 0.6599, + "step": 2673 + }, + { + "epoch": 0.6231163585521206, + "grad_norm": 0.3360998332500458, + "learning_rate": 4.876750143975943e-06, + "loss": 0.6692, + "step": 2674 + }, + { + "epoch": 0.6233493863601056, + "grad_norm": 0.3534923791885376, + "learning_rate": 4.87665515564944e-06, + "loss": 0.6458, + "step": 2675 + }, + { + "epoch": 0.6235824141680907, + "grad_norm": 0.3447485864162445, + "learning_rate": 4.876560131659218e-06, + "loss": 0.6371, + "step": 2676 + }, + { + "epoch": 0.6238154419760759, + "grad_norm": 0.34595727920532227, + "learning_rate": 4.876465072006703e-06, + "loss": 0.6552, + "step": 2677 + }, + { + "epoch": 0.6240484697840609, + "grad_norm": 0.3534361720085144, + "learning_rate": 4.8763699766933184e-06, + "loss": 0.661, + "step": 2678 + }, + { + "epoch": 0.624281497592046, + "grad_norm": 0.3505156338214874, + "learning_rate": 4.876274845720494e-06, + "loss": 0.6716, + "step": 2679 + }, + { + "epoch": 0.624514525400031, + "grad_norm": 0.3807438313961029, + "learning_rate": 4.876179679089657e-06, + "loss": 0.619, + "step": 2680 + }, + { + "epoch": 0.6247475532080161, + "grad_norm": 0.3422558307647705, + "learning_rate": 4.876084476802236e-06, + "loss": 0.6731, + "step": 2681 + }, + { + "epoch": 0.6249805810160013, + "grad_norm": 0.3344363272190094, + "learning_rate": 4.875989238859656e-06, + "loss": 0.6598, + "step": 2682 + }, + { + "epoch": 0.6252136088239864, + "grad_norm": 0.3365171551704407, + "learning_rate": 4.87589396526335e-06, + "loss": 0.6176, + "step": 2683 + }, + { + "epoch": 0.6254466366319714, + "grad_norm": 0.34471437335014343, + "learning_rate": 4.875798656014747e-06, + "loss": 0.6363, + "step": 2684 + }, + { + "epoch": 0.6256796644399565, + "grad_norm": 0.34522297978401184, + "learning_rate": 4.8757033111152765e-06, + "loss": 0.6418, + "step": 2685 + }, + { + "epoch": 0.6259126922479415, + "grad_norm": 0.33947569131851196, + "learning_rate": 4.875607930566369e-06, + "loss": 0.6559, + "step": 2686 + }, + { + "epoch": 0.6261457200559267, + "grad_norm": 0.33999699354171753, + "learning_rate": 4.875512514369455e-06, + "loss": 0.6595, + "step": 2687 + }, + { + "epoch": 0.6263787478639118, + "grad_norm": 0.3446522355079651, + "learning_rate": 4.875417062525968e-06, + "loss": 0.6493, + "step": 2688 + }, + { + "epoch": 0.6266117756718969, + "grad_norm": 0.35614413022994995, + "learning_rate": 4.87532157503734e-06, + "loss": 0.6479, + "step": 2689 + }, + { + "epoch": 0.6268448034798819, + "grad_norm": 0.35252052545547485, + "learning_rate": 4.875226051905003e-06, + "loss": 0.641, + "step": 2690 + }, + { + "epoch": 0.627077831287867, + "grad_norm": 0.363535612821579, + "learning_rate": 4.875130493130392e-06, + "loss": 0.6536, + "step": 2691 + }, + { + "epoch": 0.6273108590958522, + "grad_norm": 0.3655862808227539, + "learning_rate": 4.875034898714939e-06, + "loss": 0.6656, + "step": 2692 + }, + { + "epoch": 0.6275438869038372, + "grad_norm": 0.3467523753643036, + "learning_rate": 4.87493926866008e-06, + "loss": 0.661, + "step": 2693 + }, + { + "epoch": 0.6277769147118223, + "grad_norm": 0.33741840720176697, + "learning_rate": 4.874843602967249e-06, + "loss": 0.6602, + "step": 2694 + }, + { + "epoch": 0.6280099425198074, + "grad_norm": 0.3381771445274353, + "learning_rate": 4.874747901637882e-06, + "loss": 0.6509, + "step": 2695 + }, + { + "epoch": 0.6282429703277924, + "grad_norm": 0.35409238934516907, + "learning_rate": 4.8746521646734156e-06, + "loss": 0.6456, + "step": 2696 + }, + { + "epoch": 0.6284759981357776, + "grad_norm": 0.36146101355552673, + "learning_rate": 4.874556392075284e-06, + "loss": 0.6374, + "step": 2697 + }, + { + "epoch": 0.6287090259437627, + "grad_norm": 0.33394113183021545, + "learning_rate": 4.8744605838449284e-06, + "loss": 0.6436, + "step": 2698 + }, + { + "epoch": 0.6289420537517477, + "grad_norm": 0.39864417910575867, + "learning_rate": 4.874364739983783e-06, + "loss": 0.6661, + "step": 2699 + }, + { + "epoch": 0.6291750815597328, + "grad_norm": 0.3570095896720886, + "learning_rate": 4.8742688604932885e-06, + "loss": 0.6662, + "step": 2700 + }, + { + "epoch": 0.6294081093677178, + "grad_norm": 0.35780996084213257, + "learning_rate": 4.874172945374882e-06, + "loss": 0.6272, + "step": 2701 + }, + { + "epoch": 0.6296411371757029, + "grad_norm": 0.3480808138847351, + "learning_rate": 4.874076994630003e-06, + "loss": 0.6433, + "step": 2702 + }, + { + "epoch": 0.6298741649836881, + "grad_norm": 0.3490018844604492, + "learning_rate": 4.873981008260091e-06, + "loss": 0.6427, + "step": 2703 + }, + { + "epoch": 0.6301071927916732, + "grad_norm": 0.3607998192310333, + "learning_rate": 4.873884986266587e-06, + "loss": 0.6713, + "step": 2704 + }, + { + "epoch": 0.6303402205996582, + "grad_norm": 0.35129204392433167, + "learning_rate": 4.8737889286509324e-06, + "loss": 0.6601, + "step": 2705 + }, + { + "epoch": 0.6305732484076433, + "grad_norm": 0.35245653986930847, + "learning_rate": 4.8736928354145675e-06, + "loss": 0.6554, + "step": 2706 + }, + { + "epoch": 0.6308062762156283, + "grad_norm": 0.35255396366119385, + "learning_rate": 4.873596706558935e-06, + "loss": 0.6529, + "step": 2707 + }, + { + "epoch": 0.6310393040236135, + "grad_norm": 0.35473546385765076, + "learning_rate": 4.873500542085477e-06, + "loss": 0.6513, + "step": 2708 + }, + { + "epoch": 0.6312723318315986, + "grad_norm": 0.3592835068702698, + "learning_rate": 4.8734043419956374e-06, + "loss": 0.6469, + "step": 2709 + }, + { + "epoch": 0.6315053596395837, + "grad_norm": 0.3458676338195801, + "learning_rate": 4.873308106290858e-06, + "loss": 0.6405, + "step": 2710 + }, + { + "epoch": 0.6317383874475687, + "grad_norm": 0.34526655077934265, + "learning_rate": 4.873211834972585e-06, + "loss": 0.6422, + "step": 2711 + }, + { + "epoch": 0.6319714152555538, + "grad_norm": 0.3476266860961914, + "learning_rate": 4.873115528042261e-06, + "loss": 0.6585, + "step": 2712 + }, + { + "epoch": 0.632204443063539, + "grad_norm": 0.3423970639705658, + "learning_rate": 4.873019185501333e-06, + "loss": 0.6383, + "step": 2713 + }, + { + "epoch": 0.632437470871524, + "grad_norm": 0.33545127511024475, + "learning_rate": 4.872922807351246e-06, + "loss": 0.6567, + "step": 2714 + }, + { + "epoch": 0.6326704986795091, + "grad_norm": 0.3428058326244354, + "learning_rate": 4.872826393593445e-06, + "loss": 0.6226, + "step": 2715 + }, + { + "epoch": 0.6329035264874942, + "grad_norm": 0.3498053252696991, + "learning_rate": 4.872729944229379e-06, + "loss": 0.666, + "step": 2716 + }, + { + "epoch": 0.6331365542954792, + "grad_norm": 0.3345945179462433, + "learning_rate": 4.872633459260494e-06, + "loss": 0.66, + "step": 2717 + }, + { + "epoch": 0.6333695821034644, + "grad_norm": 0.38933902978897095, + "learning_rate": 4.8725369386882374e-06, + "loss": 0.6809, + "step": 2718 + }, + { + "epoch": 0.6336026099114495, + "grad_norm": 0.3400987386703491, + "learning_rate": 4.872440382514059e-06, + "loss": 0.6409, + "step": 2719 + }, + { + "epoch": 0.6338356377194345, + "grad_norm": 0.34950584173202515, + "learning_rate": 4.872343790739406e-06, + "loss": 0.653, + "step": 2720 + }, + { + "epoch": 0.6340686655274196, + "grad_norm": 0.3537040650844574, + "learning_rate": 4.87224716336573e-06, + "loss": 0.6596, + "step": 2721 + }, + { + "epoch": 0.6343016933354046, + "grad_norm": 0.33301302790641785, + "learning_rate": 4.872150500394478e-06, + "loss": 0.644, + "step": 2722 + }, + { + "epoch": 0.6345347211433898, + "grad_norm": 0.3557029068470001, + "learning_rate": 4.872053801827104e-06, + "loss": 0.6815, + "step": 2723 + }, + { + "epoch": 0.6347677489513749, + "grad_norm": 0.34169986844062805, + "learning_rate": 4.871957067665056e-06, + "loss": 0.6761, + "step": 2724 + }, + { + "epoch": 0.63500077675936, + "grad_norm": 0.3482593894004822, + "learning_rate": 4.871860297909787e-06, + "loss": 0.641, + "step": 2725 + }, + { + "epoch": 0.635233804567345, + "grad_norm": 0.3446268141269684, + "learning_rate": 4.871763492562749e-06, + "loss": 0.6774, + "step": 2726 + }, + { + "epoch": 0.6354668323753301, + "grad_norm": 0.3549942076206207, + "learning_rate": 4.8716666516253955e-06, + "loss": 0.6151, + "step": 2727 + }, + { + "epoch": 0.6356998601833153, + "grad_norm": 0.3563139736652374, + "learning_rate": 4.871569775099178e-06, + "loss": 0.6597, + "step": 2728 + }, + { + "epoch": 0.6359328879913003, + "grad_norm": 0.3395681381225586, + "learning_rate": 4.8714728629855515e-06, + "loss": 0.6588, + "step": 2729 + }, + { + "epoch": 0.6361659157992854, + "grad_norm": 0.32859528064727783, + "learning_rate": 4.871375915285969e-06, + "loss": 0.6504, + "step": 2730 + }, + { + "epoch": 0.6363989436072705, + "grad_norm": 0.3714424967765808, + "learning_rate": 4.871278932001886e-06, + "loss": 0.6388, + "step": 2731 + }, + { + "epoch": 0.6366319714152555, + "grad_norm": 0.34649062156677246, + "learning_rate": 4.871181913134758e-06, + "loss": 0.6465, + "step": 2732 + }, + { + "epoch": 0.6368649992232407, + "grad_norm": 0.34210923314094543, + "learning_rate": 4.871084858686041e-06, + "loss": 0.665, + "step": 2733 + }, + { + "epoch": 0.6370980270312258, + "grad_norm": 0.3536020815372467, + "learning_rate": 4.870987768657192e-06, + "loss": 0.6391, + "step": 2734 + }, + { + "epoch": 0.6373310548392108, + "grad_norm": 0.3370494544506073, + "learning_rate": 4.870890643049665e-06, + "loss": 0.6263, + "step": 2735 + }, + { + "epoch": 0.6375640826471959, + "grad_norm": 0.34429457783699036, + "learning_rate": 4.87079348186492e-06, + "loss": 0.6465, + "step": 2736 + }, + { + "epoch": 0.637797110455181, + "grad_norm": 0.36213988065719604, + "learning_rate": 4.870696285104415e-06, + "loss": 0.6606, + "step": 2737 + }, + { + "epoch": 0.6380301382631661, + "grad_norm": 0.3559233248233795, + "learning_rate": 4.870599052769608e-06, + "loss": 0.6548, + "step": 2738 + }, + { + "epoch": 0.6382631660711512, + "grad_norm": 0.3494240343570709, + "learning_rate": 4.870501784861958e-06, + "loss": 0.632, + "step": 2739 + }, + { + "epoch": 0.6384961938791363, + "grad_norm": 0.35347408056259155, + "learning_rate": 4.870404481382925e-06, + "loss": 0.68, + "step": 2740 + }, + { + "epoch": 0.6387292216871213, + "grad_norm": 0.33819475769996643, + "learning_rate": 4.870307142333968e-06, + "loss": 0.6443, + "step": 2741 + }, + { + "epoch": 0.6389622494951064, + "grad_norm": 0.34380877017974854, + "learning_rate": 4.870209767716548e-06, + "loss": 0.6345, + "step": 2742 + }, + { + "epoch": 0.6391952773030914, + "grad_norm": 0.35041069984436035, + "learning_rate": 4.870112357532127e-06, + "loss": 0.6377, + "step": 2743 + }, + { + "epoch": 0.6394283051110766, + "grad_norm": 0.3498706519603729, + "learning_rate": 4.8700149117821675e-06, + "loss": 0.6655, + "step": 2744 + }, + { + "epoch": 0.6396613329190617, + "grad_norm": 0.3244394361972809, + "learning_rate": 4.8699174304681285e-06, + "loss": 0.6266, + "step": 2745 + }, + { + "epoch": 0.6398943607270468, + "grad_norm": 0.340507447719574, + "learning_rate": 4.8698199135914756e-06, + "loss": 0.6285, + "step": 2746 + }, + { + "epoch": 0.6401273885350318, + "grad_norm": 0.3316322863101959, + "learning_rate": 4.8697223611536715e-06, + "loss": 0.6604, + "step": 2747 + }, + { + "epoch": 0.6403604163430169, + "grad_norm": 0.3557249903678894, + "learning_rate": 4.86962477315618e-06, + "loss": 0.6363, + "step": 2748 + }, + { + "epoch": 0.6405934441510021, + "grad_norm": 0.34624597430229187, + "learning_rate": 4.869527149600466e-06, + "loss": 0.6643, + "step": 2749 + }, + { + "epoch": 0.6408264719589871, + "grad_norm": 0.34341302514076233, + "learning_rate": 4.869429490487992e-06, + "loss": 0.6665, + "step": 2750 + }, + { + "epoch": 0.6410594997669722, + "grad_norm": 0.34530991315841675, + "learning_rate": 4.869331795820227e-06, + "loss": 0.6569, + "step": 2751 + }, + { + "epoch": 0.6412925275749572, + "grad_norm": 0.34045401215553284, + "learning_rate": 4.869234065598635e-06, + "loss": 0.6765, + "step": 2752 + }, + { + "epoch": 0.6415255553829423, + "grad_norm": 0.34729668498039246, + "learning_rate": 4.8691362998246825e-06, + "loss": 0.6255, + "step": 2753 + }, + { + "epoch": 0.6417585831909275, + "grad_norm": 0.3269015848636627, + "learning_rate": 4.869038498499836e-06, + "loss": 0.6832, + "step": 2754 + }, + { + "epoch": 0.6419916109989126, + "grad_norm": 0.34684592485427856, + "learning_rate": 4.868940661625565e-06, + "loss": 0.6552, + "step": 2755 + }, + { + "epoch": 0.6422246388068976, + "grad_norm": 0.33162179589271545, + "learning_rate": 4.868842789203336e-06, + "loss": 0.6709, + "step": 2756 + }, + { + "epoch": 0.6424576666148827, + "grad_norm": 0.3570788502693176, + "learning_rate": 4.868744881234619e-06, + "loss": 0.6429, + "step": 2757 + }, + { + "epoch": 0.6426906944228677, + "grad_norm": 0.33613571524620056, + "learning_rate": 4.868646937720882e-06, + "loss": 0.6443, + "step": 2758 + }, + { + "epoch": 0.6429237222308529, + "grad_norm": 0.35032811760902405, + "learning_rate": 4.868548958663594e-06, + "loss": 0.6448, + "step": 2759 + }, + { + "epoch": 0.643156750038838, + "grad_norm": 0.3344932496547699, + "learning_rate": 4.868450944064229e-06, + "loss": 0.6592, + "step": 2760 + }, + { + "epoch": 0.643389777846823, + "grad_norm": 0.3320918381214142, + "learning_rate": 4.868352893924253e-06, + "loss": 0.6593, + "step": 2761 + }, + { + "epoch": 0.6436228056548081, + "grad_norm": 0.3433143198490143, + "learning_rate": 4.868254808245141e-06, + "loss": 0.6738, + "step": 2762 + }, + { + "epoch": 0.6438558334627932, + "grad_norm": 0.33561864495277405, + "learning_rate": 4.8681566870283616e-06, + "loss": 0.6396, + "step": 2763 + }, + { + "epoch": 0.6440888612707784, + "grad_norm": 0.34229180216789246, + "learning_rate": 4.86805853027539e-06, + "loss": 0.6777, + "step": 2764 + }, + { + "epoch": 0.6443218890787634, + "grad_norm": 0.33946692943573, + "learning_rate": 4.867960337987698e-06, + "loss": 0.6695, + "step": 2765 + }, + { + "epoch": 0.6445549168867485, + "grad_norm": 0.3509640097618103, + "learning_rate": 4.86786211016676e-06, + "loss": 0.6486, + "step": 2766 + }, + { + "epoch": 0.6447879446947336, + "grad_norm": 0.3436738848686218, + "learning_rate": 4.867763846814048e-06, + "loss": 0.6407, + "step": 2767 + }, + { + "epoch": 0.6450209725027186, + "grad_norm": 0.33398401737213135, + "learning_rate": 4.867665547931038e-06, + "loss": 0.6491, + "step": 2768 + }, + { + "epoch": 0.6452540003107038, + "grad_norm": 0.35334259271621704, + "learning_rate": 4.867567213519205e-06, + "loss": 0.6706, + "step": 2769 + }, + { + "epoch": 0.6454870281186889, + "grad_norm": 0.3458956182003021, + "learning_rate": 4.867468843580025e-06, + "loss": 0.6537, + "step": 2770 + }, + { + "epoch": 0.6457200559266739, + "grad_norm": 0.3310525417327881, + "learning_rate": 4.867370438114971e-06, + "loss": 0.67, + "step": 2771 + }, + { + "epoch": 0.645953083734659, + "grad_norm": 0.34056296944618225, + "learning_rate": 4.867271997125524e-06, + "loss": 0.676, + "step": 2772 + }, + { + "epoch": 0.646186111542644, + "grad_norm": 0.35037916898727417, + "learning_rate": 4.867173520613159e-06, + "loss": 0.6613, + "step": 2773 + }, + { + "epoch": 0.6464191393506292, + "grad_norm": 0.34831205010414124, + "learning_rate": 4.867075008579354e-06, + "loss": 0.6393, + "step": 2774 + }, + { + "epoch": 0.6466521671586143, + "grad_norm": 0.3406579792499542, + "learning_rate": 4.866976461025587e-06, + "loss": 0.6211, + "step": 2775 + }, + { + "epoch": 0.6468851949665994, + "grad_norm": 0.3390808403491974, + "learning_rate": 4.866877877953337e-06, + "loss": 0.6497, + "step": 2776 + }, + { + "epoch": 0.6471182227745844, + "grad_norm": 0.34273776412010193, + "learning_rate": 4.8667792593640835e-06, + "loss": 0.6257, + "step": 2777 + }, + { + "epoch": 0.6473512505825695, + "grad_norm": 0.33239561319351196, + "learning_rate": 4.8666806052593075e-06, + "loss": 0.6701, + "step": 2778 + }, + { + "epoch": 0.6475842783905547, + "grad_norm": 0.3302348852157593, + "learning_rate": 4.866581915640486e-06, + "loss": 0.6079, + "step": 2779 + }, + { + "epoch": 0.6478173061985397, + "grad_norm": 0.33615437150001526, + "learning_rate": 4.866483190509103e-06, + "loss": 0.649, + "step": 2780 + }, + { + "epoch": 0.6480503340065248, + "grad_norm": 0.3496010899543762, + "learning_rate": 4.86638442986664e-06, + "loss": 0.6592, + "step": 2781 + }, + { + "epoch": 0.6482833618145099, + "grad_norm": 0.3648754358291626, + "learning_rate": 4.866285633714577e-06, + "loss": 0.6391, + "step": 2782 + }, + { + "epoch": 0.6485163896224949, + "grad_norm": 0.34704235196113586, + "learning_rate": 4.866186802054398e-06, + "loss": 0.6432, + "step": 2783 + }, + { + "epoch": 0.6487494174304801, + "grad_norm": 0.3397449553012848, + "learning_rate": 4.866087934887584e-06, + "loss": 0.634, + "step": 2784 + }, + { + "epoch": 0.6489824452384652, + "grad_norm": 0.3426058888435364, + "learning_rate": 4.865989032215622e-06, + "loss": 0.6539, + "step": 2785 + }, + { + "epoch": 0.6492154730464502, + "grad_norm": 0.34963729977607727, + "learning_rate": 4.865890094039994e-06, + "loss": 0.6435, + "step": 2786 + }, + { + "epoch": 0.6494485008544353, + "grad_norm": 0.34288206696510315, + "learning_rate": 4.865791120362183e-06, + "loss": 0.6585, + "step": 2787 + }, + { + "epoch": 0.6496815286624203, + "grad_norm": 0.33330968022346497, + "learning_rate": 4.865692111183679e-06, + "loss": 0.6372, + "step": 2788 + }, + { + "epoch": 0.6499145564704054, + "grad_norm": 0.3410674035549164, + "learning_rate": 4.865593066505963e-06, + "loss": 0.6659, + "step": 2789 + }, + { + "epoch": 0.6501475842783906, + "grad_norm": 0.33911049365997314, + "learning_rate": 4.865493986330523e-06, + "loss": 0.6468, + "step": 2790 + }, + { + "epoch": 0.6503806120863757, + "grad_norm": 0.3270745277404785, + "learning_rate": 4.865394870658846e-06, + "loss": 0.6725, + "step": 2791 + }, + { + "epoch": 0.6506136398943607, + "grad_norm": 0.3470090925693512, + "learning_rate": 4.86529571949242e-06, + "loss": 0.6497, + "step": 2792 + }, + { + "epoch": 0.6508466677023458, + "grad_norm": 0.34180912375450134, + "learning_rate": 4.865196532832731e-06, + "loss": 0.6554, + "step": 2793 + }, + { + "epoch": 0.6510796955103308, + "grad_norm": 0.33934417366981506, + "learning_rate": 4.865097310681269e-06, + "loss": 0.6365, + "step": 2794 + }, + { + "epoch": 0.651312723318316, + "grad_norm": 0.3333466649055481, + "learning_rate": 4.864998053039522e-06, + "loss": 0.5969, + "step": 2795 + }, + { + "epoch": 0.6515457511263011, + "grad_norm": 0.32886889576911926, + "learning_rate": 4.86489875990898e-06, + "loss": 0.6333, + "step": 2796 + }, + { + "epoch": 0.6517787789342862, + "grad_norm": 0.35057389736175537, + "learning_rate": 4.8647994312911326e-06, + "loss": 0.6572, + "step": 2797 + }, + { + "epoch": 0.6520118067422712, + "grad_norm": 0.35462504625320435, + "learning_rate": 4.86470006718747e-06, + "loss": 0.6691, + "step": 2798 + }, + { + "epoch": 0.6522448345502563, + "grad_norm": 0.3505004346370697, + "learning_rate": 4.864600667599484e-06, + "loss": 0.6543, + "step": 2799 + }, + { + "epoch": 0.6524778623582415, + "grad_norm": 0.34445837140083313, + "learning_rate": 4.864501232528666e-06, + "loss": 0.6568, + "step": 2800 + }, + { + "epoch": 0.6527108901662265, + "grad_norm": 0.34208616614341736, + "learning_rate": 4.864401761976508e-06, + "loss": 0.6431, + "step": 2801 + }, + { + "epoch": 0.6529439179742116, + "grad_norm": 0.35546135902404785, + "learning_rate": 4.864302255944503e-06, + "loss": 0.651, + "step": 2802 + }, + { + "epoch": 0.6531769457821966, + "grad_norm": 0.35717707872390747, + "learning_rate": 4.864202714434143e-06, + "loss": 0.651, + "step": 2803 + }, + { + "epoch": 0.6534099735901817, + "grad_norm": 0.3360876142978668, + "learning_rate": 4.864103137446923e-06, + "loss": 0.6692, + "step": 2804 + }, + { + "epoch": 0.6536430013981669, + "grad_norm": 0.35091736912727356, + "learning_rate": 4.864003524984336e-06, + "loss": 0.6332, + "step": 2805 + }, + { + "epoch": 0.653876029206152, + "grad_norm": 0.36690616607666016, + "learning_rate": 4.863903877047878e-06, + "loss": 0.6667, + "step": 2806 + }, + { + "epoch": 0.654109057014137, + "grad_norm": 0.348014771938324, + "learning_rate": 4.863804193639043e-06, + "loss": 0.6533, + "step": 2807 + }, + { + "epoch": 0.6543420848221221, + "grad_norm": 0.3407752513885498, + "learning_rate": 4.863704474759329e-06, + "loss": 0.6583, + "step": 2808 + }, + { + "epoch": 0.6545751126301071, + "grad_norm": 0.35023775696754456, + "learning_rate": 4.86360472041023e-06, + "loss": 0.6156, + "step": 2809 + }, + { + "epoch": 0.6548081404380923, + "grad_norm": 0.34118255972862244, + "learning_rate": 4.863504930593245e-06, + "loss": 0.622, + "step": 2810 + }, + { + "epoch": 0.6550411682460774, + "grad_norm": 0.3330015242099762, + "learning_rate": 4.863405105309869e-06, + "loss": 0.6412, + "step": 2811 + }, + { + "epoch": 0.6552741960540625, + "grad_norm": 0.34056833386421204, + "learning_rate": 4.863305244561601e-06, + "loss": 0.6381, + "step": 2812 + }, + { + "epoch": 0.6555072238620475, + "grad_norm": 0.3489595353603363, + "learning_rate": 4.863205348349941e-06, + "loss": 0.6823, + "step": 2813 + }, + { + "epoch": 0.6557402516700326, + "grad_norm": 0.3412863612174988, + "learning_rate": 4.8631054166763865e-06, + "loss": 0.6418, + "step": 2814 + }, + { + "epoch": 0.6559732794780178, + "grad_norm": 0.3303406238555908, + "learning_rate": 4.863005449542437e-06, + "loss": 0.6327, + "step": 2815 + }, + { + "epoch": 0.6562063072860028, + "grad_norm": 0.36000415682792664, + "learning_rate": 4.8629054469495935e-06, + "loss": 0.6457, + "step": 2816 + }, + { + "epoch": 0.6564393350939879, + "grad_norm": 0.34471964836120605, + "learning_rate": 4.862805408899356e-06, + "loss": 0.6743, + "step": 2817 + }, + { + "epoch": 0.656672362901973, + "grad_norm": 0.33806005120277405, + "learning_rate": 4.862705335393225e-06, + "loss": 0.6289, + "step": 2818 + }, + { + "epoch": 0.656905390709958, + "grad_norm": 0.3388570547103882, + "learning_rate": 4.862605226432703e-06, + "loss": 0.6322, + "step": 2819 + }, + { + "epoch": 0.6571384185179432, + "grad_norm": 0.3500990569591522, + "learning_rate": 4.862505082019293e-06, + "loss": 0.6528, + "step": 2820 + }, + { + "epoch": 0.6573714463259283, + "grad_norm": 0.33545202016830444, + "learning_rate": 4.862404902154496e-06, + "loss": 0.6213, + "step": 2821 + }, + { + "epoch": 0.6576044741339133, + "grad_norm": 0.3362768292427063, + "learning_rate": 4.8623046868398165e-06, + "loss": 0.6513, + "step": 2822 + }, + { + "epoch": 0.6578375019418984, + "grad_norm": 0.35069894790649414, + "learning_rate": 4.862204436076758e-06, + "loss": 0.6652, + "step": 2823 + }, + { + "epoch": 0.6580705297498834, + "grad_norm": 0.3208925426006317, + "learning_rate": 4.862104149866825e-06, + "loss": 0.6334, + "step": 2824 + }, + { + "epoch": 0.6583035575578686, + "grad_norm": 0.33345505595207214, + "learning_rate": 4.862003828211522e-06, + "loss": 0.6576, + "step": 2825 + }, + { + "epoch": 0.6585365853658537, + "grad_norm": 0.3366893529891968, + "learning_rate": 4.861903471112355e-06, + "loss": 0.6337, + "step": 2826 + }, + { + "epoch": 0.6587696131738388, + "grad_norm": 0.3494217097759247, + "learning_rate": 4.861803078570828e-06, + "loss": 0.6617, + "step": 2827 + }, + { + "epoch": 0.6590026409818238, + "grad_norm": 0.3470281660556793, + "learning_rate": 4.861702650588451e-06, + "loss": 0.6699, + "step": 2828 + }, + { + "epoch": 0.6592356687898089, + "grad_norm": 0.3386491537094116, + "learning_rate": 4.861602187166729e-06, + "loss": 0.6523, + "step": 2829 + }, + { + "epoch": 0.6594686965977941, + "grad_norm": 0.3473060429096222, + "learning_rate": 4.861501688307168e-06, + "loss": 0.6527, + "step": 2830 + }, + { + "epoch": 0.6597017244057791, + "grad_norm": 0.33927130699157715, + "learning_rate": 4.8614011540112786e-06, + "loss": 0.6541, + "step": 2831 + }, + { + "epoch": 0.6599347522137642, + "grad_norm": 0.33874186873435974, + "learning_rate": 4.861300584280569e-06, + "loss": 0.6713, + "step": 2832 + }, + { + "epoch": 0.6601677800217493, + "grad_norm": 0.3390222489833832, + "learning_rate": 4.861199979116547e-06, + "loss": 0.6612, + "step": 2833 + }, + { + "epoch": 0.6604008078297343, + "grad_norm": 0.32675495743751526, + "learning_rate": 4.861099338520723e-06, + "loss": 0.637, + "step": 2834 + }, + { + "epoch": 0.6606338356377194, + "grad_norm": 0.34253883361816406, + "learning_rate": 4.860998662494607e-06, + "loss": 0.6654, + "step": 2835 + }, + { + "epoch": 0.6608668634457046, + "grad_norm": 0.34504395723342896, + "learning_rate": 4.86089795103971e-06, + "loss": 0.678, + "step": 2836 + }, + { + "epoch": 0.6610998912536896, + "grad_norm": 0.357848584651947, + "learning_rate": 4.8607972041575445e-06, + "loss": 0.6777, + "step": 2837 + }, + { + "epoch": 0.6613329190616747, + "grad_norm": 0.3386964499950409, + "learning_rate": 4.86069642184962e-06, + "loss": 0.6578, + "step": 2838 + }, + { + "epoch": 0.6615659468696597, + "grad_norm": 0.3492048680782318, + "learning_rate": 4.86059560411745e-06, + "loss": 0.6309, + "step": 2839 + }, + { + "epoch": 0.6617989746776448, + "grad_norm": 0.355458527803421, + "learning_rate": 4.860494750962547e-06, + "loss": 0.6701, + "step": 2840 + }, + { + "epoch": 0.66203200248563, + "grad_norm": 0.35371872782707214, + "learning_rate": 4.860393862386426e-06, + "loss": 0.6335, + "step": 2841 + }, + { + "epoch": 0.662265030293615, + "grad_norm": 0.3366334140300751, + "learning_rate": 4.860292938390597e-06, + "loss": 0.6292, + "step": 2842 + }, + { + "epoch": 0.6624980581016001, + "grad_norm": 0.33677372336387634, + "learning_rate": 4.860191978976579e-06, + "loss": 0.6362, + "step": 2843 + }, + { + "epoch": 0.6627310859095852, + "grad_norm": 0.332354873418808, + "learning_rate": 4.860090984145884e-06, + "loss": 0.6091, + "step": 2844 + }, + { + "epoch": 0.6629641137175702, + "grad_norm": 0.34227824211120605, + "learning_rate": 4.859989953900029e-06, + "loss": 0.6336, + "step": 2845 + }, + { + "epoch": 0.6631971415255554, + "grad_norm": 0.34174931049346924, + "learning_rate": 4.85988888824053e-06, + "loss": 0.655, + "step": 2846 + }, + { + "epoch": 0.6634301693335405, + "grad_norm": 0.3393741548061371, + "learning_rate": 4.859787787168901e-06, + "loss": 0.6516, + "step": 2847 + }, + { + "epoch": 0.6636631971415256, + "grad_norm": 0.35587507486343384, + "learning_rate": 4.859686650686663e-06, + "loss": 0.6669, + "step": 2848 + }, + { + "epoch": 0.6638962249495106, + "grad_norm": 0.37388330698013306, + "learning_rate": 4.859585478795332e-06, + "loss": 0.6875, + "step": 2849 + }, + { + "epoch": 0.6641292527574957, + "grad_norm": 0.3419080376625061, + "learning_rate": 4.859484271496425e-06, + "loss": 0.6277, + "step": 2850 + }, + { + "epoch": 0.6643622805654809, + "grad_norm": 0.3436039090156555, + "learning_rate": 4.859383028791462e-06, + "loss": 0.6637, + "step": 2851 + }, + { + "epoch": 0.6645953083734659, + "grad_norm": 0.3466782867908478, + "learning_rate": 4.859281750681963e-06, + "loss": 0.64, + "step": 2852 + }, + { + "epoch": 0.664828336181451, + "grad_norm": 0.338115930557251, + "learning_rate": 4.859180437169445e-06, + "loss": 0.6197, + "step": 2853 + }, + { + "epoch": 0.665061363989436, + "grad_norm": 0.35147324204444885, + "learning_rate": 4.8590790882554306e-06, + "loss": 0.6619, + "step": 2854 + }, + { + "epoch": 0.6652943917974211, + "grad_norm": 0.34695670008659363, + "learning_rate": 4.8589777039414395e-06, + "loss": 0.6802, + "step": 2855 + }, + { + "epoch": 0.6655274196054063, + "grad_norm": 0.34973281621932983, + "learning_rate": 4.858876284228995e-06, + "loss": 0.6409, + "step": 2856 + }, + { + "epoch": 0.6657604474133914, + "grad_norm": 0.3435687720775604, + "learning_rate": 4.858774829119616e-06, + "loss": 0.6569, + "step": 2857 + }, + { + "epoch": 0.6659934752213764, + "grad_norm": 0.33112290501594543, + "learning_rate": 4.8586733386148276e-06, + "loss": 0.6273, + "step": 2858 + }, + { + "epoch": 0.6662265030293615, + "grad_norm": 0.3697720468044281, + "learning_rate": 4.858571812716151e-06, + "loss": 0.6213, + "step": 2859 + }, + { + "epoch": 0.6664595308373465, + "grad_norm": 0.34170958399772644, + "learning_rate": 4.8584702514251105e-06, + "loss": 0.6419, + "step": 2860 + }, + { + "epoch": 0.6666925586453317, + "grad_norm": 0.339370459318161, + "learning_rate": 4.858368654743231e-06, + "loss": 0.6499, + "step": 2861 + }, + { + "epoch": 0.6669255864533168, + "grad_norm": 0.34881240129470825, + "learning_rate": 4.858267022672034e-06, + "loss": 0.6729, + "step": 2862 + }, + { + "epoch": 0.6671586142613019, + "grad_norm": 0.33713486790657043, + "learning_rate": 4.858165355213048e-06, + "loss": 0.6538, + "step": 2863 + }, + { + "epoch": 0.6673916420692869, + "grad_norm": 0.33915430307388306, + "learning_rate": 4.858063652367797e-06, + "loss": 0.6546, + "step": 2864 + }, + { + "epoch": 0.667624669877272, + "grad_norm": 0.3317718803882599, + "learning_rate": 4.857961914137807e-06, + "loss": 0.6586, + "step": 2865 + }, + { + "epoch": 0.6678576976852572, + "grad_norm": 0.33998554944992065, + "learning_rate": 4.857860140524604e-06, + "loss": 0.6672, + "step": 2866 + }, + { + "epoch": 0.6680907254932422, + "grad_norm": 0.3538658022880554, + "learning_rate": 4.8577583315297175e-06, + "loss": 0.6444, + "step": 2867 + }, + { + "epoch": 0.6683237533012273, + "grad_norm": 0.3445742130279541, + "learning_rate": 4.857656487154674e-06, + "loss": 0.6619, + "step": 2868 + }, + { + "epoch": 0.6685567811092124, + "grad_norm": 0.3325398862361908, + "learning_rate": 4.857554607401001e-06, + "loss": 0.6577, + "step": 2869 + }, + { + "epoch": 0.6687898089171974, + "grad_norm": 0.34330880641937256, + "learning_rate": 4.857452692270228e-06, + "loss": 0.6511, + "step": 2870 + }, + { + "epoch": 0.6690228367251826, + "grad_norm": 0.34497305750846863, + "learning_rate": 4.857350741763884e-06, + "loss": 0.6515, + "step": 2871 + }, + { + "epoch": 0.6692558645331677, + "grad_norm": 0.3537960946559906, + "learning_rate": 4.857248755883499e-06, + "loss": 0.6713, + "step": 2872 + }, + { + "epoch": 0.6694888923411527, + "grad_norm": 0.34577253460884094, + "learning_rate": 4.857146734630604e-06, + "loss": 0.6454, + "step": 2873 + }, + { + "epoch": 0.6697219201491378, + "grad_norm": 0.34073495864868164, + "learning_rate": 4.85704467800673e-06, + "loss": 0.6383, + "step": 2874 + }, + { + "epoch": 0.6699549479571228, + "grad_norm": 0.3555724322795868, + "learning_rate": 4.856942586013407e-06, + "loss": 0.6411, + "step": 2875 + }, + { + "epoch": 0.670187975765108, + "grad_norm": 0.3386792540550232, + "learning_rate": 4.856840458652168e-06, + "loss": 0.642, + "step": 2876 + }, + { + "epoch": 0.6704210035730931, + "grad_norm": 0.34501326084136963, + "learning_rate": 4.856738295924544e-06, + "loss": 0.6442, + "step": 2877 + }, + { + "epoch": 0.6706540313810782, + "grad_norm": 0.3413112163543701, + "learning_rate": 4.856636097832071e-06, + "loss": 0.6454, + "step": 2878 + }, + { + "epoch": 0.6708870591890632, + "grad_norm": 0.33614087104797363, + "learning_rate": 4.856533864376282e-06, + "loss": 0.6463, + "step": 2879 + }, + { + "epoch": 0.6711200869970483, + "grad_norm": 0.33526965975761414, + "learning_rate": 4.856431595558708e-06, + "loss": 0.6318, + "step": 2880 + }, + { + "epoch": 0.6713531148050333, + "grad_norm": 0.33629870414733887, + "learning_rate": 4.856329291380887e-06, + "loss": 0.6754, + "step": 2881 + }, + { + "epoch": 0.6715861426130185, + "grad_norm": 0.34817975759506226, + "learning_rate": 4.8562269518443515e-06, + "loss": 0.6546, + "step": 2882 + }, + { + "epoch": 0.6718191704210036, + "grad_norm": 0.3533041477203369, + "learning_rate": 4.85612457695064e-06, + "loss": 0.6561, + "step": 2883 + }, + { + "epoch": 0.6720521982289887, + "grad_norm": 0.3460318148136139, + "learning_rate": 4.8560221667012864e-06, + "loss": 0.6798, + "step": 2884 + }, + { + "epoch": 0.6722852260369737, + "grad_norm": 0.34081417322158813, + "learning_rate": 4.855919721097828e-06, + "loss": 0.6309, + "step": 2885 + }, + { + "epoch": 0.6725182538449588, + "grad_norm": 0.3334617018699646, + "learning_rate": 4.8558172401418035e-06, + "loss": 0.6372, + "step": 2886 + }, + { + "epoch": 0.672751281652944, + "grad_norm": 0.34845229983329773, + "learning_rate": 4.855714723834749e-06, + "loss": 0.6411, + "step": 2887 + }, + { + "epoch": 0.672984309460929, + "grad_norm": 0.350900262594223, + "learning_rate": 4.855612172178203e-06, + "loss": 0.6422, + "step": 2888 + }, + { + "epoch": 0.6732173372689141, + "grad_norm": 0.35360944271087646, + "learning_rate": 4.855509585173705e-06, + "loss": 0.6285, + "step": 2889 + }, + { + "epoch": 0.6734503650768991, + "grad_norm": 0.3312782943248749, + "learning_rate": 4.855406962822795e-06, + "loss": 0.6557, + "step": 2890 + }, + { + "epoch": 0.6736833928848842, + "grad_norm": 0.34877073764801025, + "learning_rate": 4.855304305127011e-06, + "loss": 0.6501, + "step": 2891 + }, + { + "epoch": 0.6739164206928694, + "grad_norm": 0.35978442430496216, + "learning_rate": 4.855201612087895e-06, + "loss": 0.6597, + "step": 2892 + }, + { + "epoch": 0.6741494485008545, + "grad_norm": 0.3494722545146942, + "learning_rate": 4.8550988837069885e-06, + "loss": 0.6627, + "step": 2893 + }, + { + "epoch": 0.6743824763088395, + "grad_norm": 0.37660762667655945, + "learning_rate": 4.854996119985832e-06, + "loss": 0.6345, + "step": 2894 + }, + { + "epoch": 0.6746155041168246, + "grad_norm": 0.32363277673721313, + "learning_rate": 4.8548933209259665e-06, + "loss": 0.6552, + "step": 2895 + }, + { + "epoch": 0.6748485319248096, + "grad_norm": 0.35478800535202026, + "learning_rate": 4.854790486528937e-06, + "loss": 0.6465, + "step": 2896 + }, + { + "epoch": 0.6750815597327948, + "grad_norm": 0.3585166037082672, + "learning_rate": 4.854687616796285e-06, + "loss": 0.6792, + "step": 2897 + }, + { + "epoch": 0.6753145875407799, + "grad_norm": 0.3506454825401306, + "learning_rate": 4.854584711729554e-06, + "loss": 0.666, + "step": 2898 + }, + { + "epoch": 0.675547615348765, + "grad_norm": 0.33273911476135254, + "learning_rate": 4.854481771330289e-06, + "loss": 0.6496, + "step": 2899 + }, + { + "epoch": 0.67578064315675, + "grad_norm": 0.334081768989563, + "learning_rate": 4.8543787956000355e-06, + "loss": 0.6637, + "step": 2900 + }, + { + "epoch": 0.6760136709647351, + "grad_norm": 0.3325260579586029, + "learning_rate": 4.854275784540336e-06, + "loss": 0.6242, + "step": 2901 + }, + { + "epoch": 0.6762466987727203, + "grad_norm": 0.36122196912765503, + "learning_rate": 4.8541727381527396e-06, + "loss": 0.67, + "step": 2902 + }, + { + "epoch": 0.6764797265807053, + "grad_norm": 0.3491709530353546, + "learning_rate": 4.85406965643879e-06, + "loss": 0.6602, + "step": 2903 + }, + { + "epoch": 0.6767127543886904, + "grad_norm": 0.3406173884868622, + "learning_rate": 4.8539665394000345e-06, + "loss": 0.638, + "step": 2904 + }, + { + "epoch": 0.6769457821966754, + "grad_norm": 0.35326820611953735, + "learning_rate": 4.853863387038022e-06, + "loss": 0.6566, + "step": 2905 + }, + { + "epoch": 0.6771788100046605, + "grad_norm": 0.3404378294944763, + "learning_rate": 4.853760199354299e-06, + "loss": 0.6531, + "step": 2906 + }, + { + "epoch": 0.6774118378126457, + "grad_norm": 0.3591305613517761, + "learning_rate": 4.853656976350414e-06, + "loss": 0.6542, + "step": 2907 + }, + { + "epoch": 0.6776448656206308, + "grad_norm": 0.33614981174468994, + "learning_rate": 4.853553718027916e-06, + "loss": 0.6648, + "step": 2908 + }, + { + "epoch": 0.6778778934286158, + "grad_norm": 0.36403846740722656, + "learning_rate": 4.853450424388355e-06, + "loss": 0.6593, + "step": 2909 + }, + { + "epoch": 0.6781109212366009, + "grad_norm": 0.3421976864337921, + "learning_rate": 4.85334709543328e-06, + "loss": 0.6682, + "step": 2910 + }, + { + "epoch": 0.678343949044586, + "grad_norm": 0.33858761191368103, + "learning_rate": 4.853243731164243e-06, + "loss": 0.6326, + "step": 2911 + }, + { + "epoch": 0.6785769768525711, + "grad_norm": 0.34632447361946106, + "learning_rate": 4.853140331582795e-06, + "loss": 0.6651, + "step": 2912 + }, + { + "epoch": 0.6788100046605562, + "grad_norm": 0.3369079828262329, + "learning_rate": 4.853036896690486e-06, + "loss": 0.6542, + "step": 2913 + }, + { + "epoch": 0.6790430324685413, + "grad_norm": 0.34388890862464905, + "learning_rate": 4.852933426488869e-06, + "loss": 0.609, + "step": 2914 + }, + { + "epoch": 0.6792760602765263, + "grad_norm": 0.33297649025917053, + "learning_rate": 4.852829920979497e-06, + "loss": 0.6487, + "step": 2915 + }, + { + "epoch": 0.6795090880845114, + "grad_norm": 0.35561275482177734, + "learning_rate": 4.852726380163923e-06, + "loss": 0.6461, + "step": 2916 + }, + { + "epoch": 0.6797421158924966, + "grad_norm": 0.38698068261146545, + "learning_rate": 4.852622804043699e-06, + "loss": 0.6309, + "step": 2917 + }, + { + "epoch": 0.6799751437004816, + "grad_norm": 0.3304995596408844, + "learning_rate": 4.852519192620382e-06, + "loss": 0.6568, + "step": 2918 + }, + { + "epoch": 0.6802081715084667, + "grad_norm": 0.361907422542572, + "learning_rate": 4.852415545895526e-06, + "loss": 0.6659, + "step": 2919 + }, + { + "epoch": 0.6804411993164518, + "grad_norm": 0.3358997702598572, + "learning_rate": 4.852311863870685e-06, + "loss": 0.6375, + "step": 2920 + }, + { + "epoch": 0.6806742271244368, + "grad_norm": 0.3543286621570587, + "learning_rate": 4.852208146547417e-06, + "loss": 0.6594, + "step": 2921 + }, + { + "epoch": 0.680907254932422, + "grad_norm": 0.34796246886253357, + "learning_rate": 4.852104393927276e-06, + "loss": 0.6602, + "step": 2922 + }, + { + "epoch": 0.681140282740407, + "grad_norm": 0.3449416756629944, + "learning_rate": 4.85200060601182e-06, + "loss": 0.6726, + "step": 2923 + }, + { + "epoch": 0.6813733105483921, + "grad_norm": 0.3245369493961334, + "learning_rate": 4.8518967828026065e-06, + "loss": 0.6433, + "step": 2924 + }, + { + "epoch": 0.6816063383563772, + "grad_norm": 0.3442113995552063, + "learning_rate": 4.851792924301194e-06, + "loss": 0.6443, + "step": 2925 + }, + { + "epoch": 0.6818393661643622, + "grad_norm": 0.3526158332824707, + "learning_rate": 4.8516890305091395e-06, + "loss": 0.6343, + "step": 2926 + }, + { + "epoch": 0.6820723939723473, + "grad_norm": 0.34400737285614014, + "learning_rate": 4.8515851014280035e-06, + "loss": 0.6216, + "step": 2927 + }, + { + "epoch": 0.6823054217803325, + "grad_norm": 0.33903104066848755, + "learning_rate": 4.8514811370593445e-06, + "loss": 0.616, + "step": 2928 + }, + { + "epoch": 0.6825384495883176, + "grad_norm": 0.328902930021286, + "learning_rate": 4.851377137404724e-06, + "loss": 0.6135, + "step": 2929 + }, + { + "epoch": 0.6827714773963026, + "grad_norm": 0.34282952547073364, + "learning_rate": 4.8512731024657e-06, + "loss": 0.6285, + "step": 2930 + }, + { + "epoch": 0.6830045052042877, + "grad_norm": 0.3574405908584595, + "learning_rate": 4.851169032243837e-06, + "loss": 0.655, + "step": 2931 + }, + { + "epoch": 0.6832375330122727, + "grad_norm": 0.36354970932006836, + "learning_rate": 4.851064926740693e-06, + "loss": 0.617, + "step": 2932 + }, + { + "epoch": 0.6834705608202579, + "grad_norm": 0.3604641556739807, + "learning_rate": 4.8509607859578335e-06, + "loss": 0.6291, + "step": 2933 + }, + { + "epoch": 0.683703588628243, + "grad_norm": 0.3616548180580139, + "learning_rate": 4.850856609896819e-06, + "loss": 0.6642, + "step": 2934 + }, + { + "epoch": 0.683936616436228, + "grad_norm": 0.3557944893836975, + "learning_rate": 4.850752398559214e-06, + "loss": 0.6547, + "step": 2935 + }, + { + "epoch": 0.6841696442442131, + "grad_norm": 0.36621055006980896, + "learning_rate": 4.850648151946582e-06, + "loss": 0.6175, + "step": 2936 + }, + { + "epoch": 0.6844026720521982, + "grad_norm": 0.3524903655052185, + "learning_rate": 4.850543870060487e-06, + "loss": 0.6531, + "step": 2937 + }, + { + "epoch": 0.6846356998601834, + "grad_norm": 0.3588731288909912, + "learning_rate": 4.850439552902493e-06, + "loss": 0.6495, + "step": 2938 + }, + { + "epoch": 0.6848687276681684, + "grad_norm": 0.3340667188167572, + "learning_rate": 4.850335200474168e-06, + "loss": 0.6232, + "step": 2939 + }, + { + "epoch": 0.6851017554761535, + "grad_norm": 0.33896929025650024, + "learning_rate": 4.850230812777075e-06, + "loss": 0.6572, + "step": 2940 + }, + { + "epoch": 0.6853347832841385, + "grad_norm": 0.3591836094856262, + "learning_rate": 4.850126389812783e-06, + "loss": 0.6438, + "step": 2941 + }, + { + "epoch": 0.6855678110921236, + "grad_norm": 0.3465839624404907, + "learning_rate": 4.850021931582855e-06, + "loss": 0.6282, + "step": 2942 + }, + { + "epoch": 0.6858008389001088, + "grad_norm": 0.3423473834991455, + "learning_rate": 4.849917438088864e-06, + "loss": 0.6459, + "step": 2943 + }, + { + "epoch": 0.6860338667080939, + "grad_norm": 0.33350712060928345, + "learning_rate": 4.849812909332374e-06, + "loss": 0.6278, + "step": 2944 + }, + { + "epoch": 0.6862668945160789, + "grad_norm": 0.3349320888519287, + "learning_rate": 4.849708345314954e-06, + "loss": 0.6734, + "step": 2945 + }, + { + "epoch": 0.686499922324064, + "grad_norm": 0.3445034325122833, + "learning_rate": 4.849603746038175e-06, + "loss": 0.6532, + "step": 2946 + }, + { + "epoch": 0.686732950132049, + "grad_norm": 0.3361227810382843, + "learning_rate": 4.849499111503604e-06, + "loss": 0.647, + "step": 2947 + }, + { + "epoch": 0.6869659779400342, + "grad_norm": 0.3305503726005554, + "learning_rate": 4.849394441712814e-06, + "loss": 0.6493, + "step": 2948 + }, + { + "epoch": 0.6871990057480193, + "grad_norm": 0.34169331192970276, + "learning_rate": 4.849289736667373e-06, + "loss": 0.6645, + "step": 2949 + }, + { + "epoch": 0.6874320335560044, + "grad_norm": 0.3470440208911896, + "learning_rate": 4.8491849963688535e-06, + "loss": 0.629, + "step": 2950 + }, + { + "epoch": 0.6876650613639894, + "grad_norm": 0.3384636342525482, + "learning_rate": 4.849080220818827e-06, + "loss": 0.6435, + "step": 2951 + }, + { + "epoch": 0.6878980891719745, + "grad_norm": 0.3450942933559418, + "learning_rate": 4.848975410018866e-06, + "loss": 0.6865, + "step": 2952 + }, + { + "epoch": 0.6881311169799597, + "grad_norm": 0.3334246575832367, + "learning_rate": 4.848870563970543e-06, + "loss": 0.6315, + "step": 2953 + }, + { + "epoch": 0.6883641447879447, + "grad_norm": 0.36027050018310547, + "learning_rate": 4.8487656826754316e-06, + "loss": 0.6186, + "step": 2954 + }, + { + "epoch": 0.6885971725959298, + "grad_norm": 0.3396907448768616, + "learning_rate": 4.848660766135105e-06, + "loss": 0.6405, + "step": 2955 + }, + { + "epoch": 0.6888302004039148, + "grad_norm": 0.352873295545578, + "learning_rate": 4.848555814351138e-06, + "loss": 0.6502, + "step": 2956 + }, + { + "epoch": 0.6890632282118999, + "grad_norm": 0.34933972358703613, + "learning_rate": 4.8484508273251055e-06, + "loss": 0.6858, + "step": 2957 + }, + { + "epoch": 0.6892962560198851, + "grad_norm": 0.3380581736564636, + "learning_rate": 4.848345805058583e-06, + "loss": 0.6542, + "step": 2958 + }, + { + "epoch": 0.6895292838278702, + "grad_norm": 0.3402542471885681, + "learning_rate": 4.848240747553146e-06, + "loss": 0.6435, + "step": 2959 + }, + { + "epoch": 0.6897623116358552, + "grad_norm": 0.35029080510139465, + "learning_rate": 4.848135654810371e-06, + "loss": 0.6509, + "step": 2960 + }, + { + "epoch": 0.6899953394438403, + "grad_norm": 0.3417990803718567, + "learning_rate": 4.848030526831836e-06, + "loss": 0.6447, + "step": 2961 + }, + { + "epoch": 0.6902283672518253, + "grad_norm": 0.3578505516052246, + "learning_rate": 4.847925363619118e-06, + "loss": 0.6215, + "step": 2962 + }, + { + "epoch": 0.6904613950598105, + "grad_norm": 0.3469124436378479, + "learning_rate": 4.847820165173794e-06, + "loss": 0.6783, + "step": 2963 + }, + { + "epoch": 0.6906944228677956, + "grad_norm": 0.3646489083766937, + "learning_rate": 4.847714931497444e-06, + "loss": 0.6405, + "step": 2964 + }, + { + "epoch": 0.6909274506757807, + "grad_norm": 0.3659578263759613, + "learning_rate": 4.847609662591646e-06, + "loss": 0.661, + "step": 2965 + }, + { + "epoch": 0.6911604784837657, + "grad_norm": 0.3520257771015167, + "learning_rate": 4.847504358457981e-06, + "loss": 0.6274, + "step": 2966 + }, + { + "epoch": 0.6913935062917508, + "grad_norm": 0.34371551871299744, + "learning_rate": 4.847399019098028e-06, + "loss": 0.6364, + "step": 2967 + }, + { + "epoch": 0.691626534099736, + "grad_norm": 0.3622281551361084, + "learning_rate": 4.847293644513369e-06, + "loss": 0.6249, + "step": 2968 + }, + { + "epoch": 0.691859561907721, + "grad_norm": 0.35088858008384705, + "learning_rate": 4.847188234705583e-06, + "loss": 0.6475, + "step": 2969 + }, + { + "epoch": 0.6920925897157061, + "grad_norm": 0.3484809994697571, + "learning_rate": 4.847082789676254e-06, + "loss": 0.6589, + "step": 2970 + }, + { + "epoch": 0.6923256175236912, + "grad_norm": 0.3647858500480652, + "learning_rate": 4.846977309426963e-06, + "loss": 0.6417, + "step": 2971 + }, + { + "epoch": 0.6925586453316762, + "grad_norm": 0.3574081361293793, + "learning_rate": 4.846871793959293e-06, + "loss": 0.6394, + "step": 2972 + }, + { + "epoch": 0.6927916731396613, + "grad_norm": 0.3508208990097046, + "learning_rate": 4.846766243274829e-06, + "loss": 0.6361, + "step": 2973 + }, + { + "epoch": 0.6930247009476465, + "grad_norm": 0.33730489015579224, + "learning_rate": 4.846660657375152e-06, + "loss": 0.6351, + "step": 2974 + }, + { + "epoch": 0.6932577287556315, + "grad_norm": 0.3719382584095001, + "learning_rate": 4.846555036261849e-06, + "loss": 0.6402, + "step": 2975 + }, + { + "epoch": 0.6934907565636166, + "grad_norm": 0.3311218321323395, + "learning_rate": 4.846449379936503e-06, + "loss": 0.6499, + "step": 2976 + }, + { + "epoch": 0.6937237843716016, + "grad_norm": 0.3509080708026886, + "learning_rate": 4.8463436884007e-06, + "loss": 0.6758, + "step": 2977 + }, + { + "epoch": 0.6939568121795867, + "grad_norm": 0.35390159487724304, + "learning_rate": 4.846237961656026e-06, + "loss": 0.6316, + "step": 2978 + }, + { + "epoch": 0.6941898399875719, + "grad_norm": 0.34578967094421387, + "learning_rate": 4.846132199704069e-06, + "loss": 0.633, + "step": 2979 + }, + { + "epoch": 0.694422867795557, + "grad_norm": 0.34393900632858276, + "learning_rate": 4.846026402546415e-06, + "loss": 0.6458, + "step": 2980 + }, + { + "epoch": 0.694655895603542, + "grad_norm": 0.3548341393470764, + "learning_rate": 4.84592057018465e-06, + "loss": 0.665, + "step": 2981 + }, + { + "epoch": 0.6948889234115271, + "grad_norm": 0.337525337934494, + "learning_rate": 4.845814702620364e-06, + "loss": 0.6332, + "step": 2982 + }, + { + "epoch": 0.6951219512195121, + "grad_norm": 0.33709782361984253, + "learning_rate": 4.845708799855145e-06, + "loss": 0.6384, + "step": 2983 + }, + { + "epoch": 0.6953549790274973, + "grad_norm": 0.35517528653144836, + "learning_rate": 4.845602861890583e-06, + "loss": 0.6451, + "step": 2984 + }, + { + "epoch": 0.6955880068354824, + "grad_norm": 0.3453751802444458, + "learning_rate": 4.845496888728266e-06, + "loss": 0.6547, + "step": 2985 + }, + { + "epoch": 0.6958210346434675, + "grad_norm": 0.3348550498485565, + "learning_rate": 4.845390880369786e-06, + "loss": 0.6559, + "step": 2986 + }, + { + "epoch": 0.6960540624514525, + "grad_norm": 0.34439152479171753, + "learning_rate": 4.845284836816733e-06, + "loss": 0.6124, + "step": 2987 + }, + { + "epoch": 0.6962870902594376, + "grad_norm": 0.34060293436050415, + "learning_rate": 4.8451787580706985e-06, + "loss": 0.6663, + "step": 2988 + }, + { + "epoch": 0.6965201180674228, + "grad_norm": 0.342243492603302, + "learning_rate": 4.845072644133273e-06, + "loss": 0.6437, + "step": 2989 + }, + { + "epoch": 0.6967531458754078, + "grad_norm": 0.3387835919857025, + "learning_rate": 4.844966495006051e-06, + "loss": 0.6632, + "step": 2990 + }, + { + "epoch": 0.6969861736833929, + "grad_norm": 0.33499377965927124, + "learning_rate": 4.8448603106906234e-06, + "loss": 0.6529, + "step": 2991 + }, + { + "epoch": 0.697219201491378, + "grad_norm": 0.34000635147094727, + "learning_rate": 4.844754091188584e-06, + "loss": 0.6394, + "step": 2992 + }, + { + "epoch": 0.697452229299363, + "grad_norm": 0.3396921455860138, + "learning_rate": 4.844647836501528e-06, + "loss": 0.5994, + "step": 2993 + }, + { + "epoch": 0.6976852571073482, + "grad_norm": 0.35427793860435486, + "learning_rate": 4.8445415466310485e-06, + "loss": 0.649, + "step": 2994 + }, + { + "epoch": 0.6979182849153333, + "grad_norm": 0.3381088674068451, + "learning_rate": 4.844435221578741e-06, + "loss": 0.6346, + "step": 2995 + }, + { + "epoch": 0.6981513127233183, + "grad_norm": 0.3444913923740387, + "learning_rate": 4.844328861346201e-06, + "loss": 0.6644, + "step": 2996 + }, + { + "epoch": 0.6983843405313034, + "grad_norm": 0.35512056946754456, + "learning_rate": 4.844222465935024e-06, + "loss": 0.64, + "step": 2997 + }, + { + "epoch": 0.6986173683392884, + "grad_norm": 0.3807167708873749, + "learning_rate": 4.844116035346807e-06, + "loss": 0.6464, + "step": 2998 + }, + { + "epoch": 0.6988503961472736, + "grad_norm": 0.3541911840438843, + "learning_rate": 4.844009569583148e-06, + "loss": 0.6561, + "step": 2999 + }, + { + "epoch": 0.6990834239552587, + "grad_norm": 0.36753973364830017, + "learning_rate": 4.843903068645643e-06, + "loss": 0.651, + "step": 3000 + }, + { + "epoch": 0.6993164517632438, + "grad_norm": 0.34998029470443726, + "learning_rate": 4.843796532535891e-06, + "loss": 0.6731, + "step": 3001 + }, + { + "epoch": 0.6995494795712288, + "grad_norm": 0.34349244832992554, + "learning_rate": 4.84368996125549e-06, + "loss": 0.6337, + "step": 3002 + }, + { + "epoch": 0.6997825073792139, + "grad_norm": 0.3486270010471344, + "learning_rate": 4.843583354806041e-06, + "loss": 0.6528, + "step": 3003 + }, + { + "epoch": 0.7000155351871991, + "grad_norm": 0.3546736538410187, + "learning_rate": 4.843476713189141e-06, + "loss": 0.6546, + "step": 3004 + }, + { + "epoch": 0.7002485629951841, + "grad_norm": 0.34208038449287415, + "learning_rate": 4.843370036406391e-06, + "loss": 0.6506, + "step": 3005 + }, + { + "epoch": 0.7004815908031692, + "grad_norm": 0.35807856917381287, + "learning_rate": 4.8432633244593945e-06, + "loss": 0.67, + "step": 3006 + }, + { + "epoch": 0.7007146186111542, + "grad_norm": 0.325324684381485, + "learning_rate": 4.84315657734975e-06, + "loss": 0.6673, + "step": 3007 + }, + { + "epoch": 0.7009476464191393, + "grad_norm": 0.3421734571456909, + "learning_rate": 4.84304979507906e-06, + "loss": 0.6695, + "step": 3008 + }, + { + "epoch": 0.7011806742271245, + "grad_norm": 0.3403767943382263, + "learning_rate": 4.8429429776489265e-06, + "loss": 0.6485, + "step": 3009 + }, + { + "epoch": 0.7014137020351096, + "grad_norm": 0.3300060033798218, + "learning_rate": 4.842836125060953e-06, + "loss": 0.6611, + "step": 3010 + }, + { + "epoch": 0.7016467298430946, + "grad_norm": 0.3444751799106598, + "learning_rate": 4.842729237316742e-06, + "loss": 0.642, + "step": 3011 + }, + { + "epoch": 0.7018797576510797, + "grad_norm": 0.34277957677841187, + "learning_rate": 4.842622314417899e-06, + "loss": 0.6612, + "step": 3012 + }, + { + "epoch": 0.7021127854590647, + "grad_norm": 0.36179110407829285, + "learning_rate": 4.842515356366028e-06, + "loss": 0.6538, + "step": 3013 + }, + { + "epoch": 0.7023458132670499, + "grad_norm": 0.3417898416519165, + "learning_rate": 4.842408363162733e-06, + "loss": 0.6205, + "step": 3014 + }, + { + "epoch": 0.702578841075035, + "grad_norm": 0.33406367897987366, + "learning_rate": 4.8423013348096205e-06, + "loss": 0.6465, + "step": 3015 + }, + { + "epoch": 0.70281186888302, + "grad_norm": 0.36337172985076904, + "learning_rate": 4.8421942713082955e-06, + "loss": 0.649, + "step": 3016 + }, + { + "epoch": 0.7030448966910051, + "grad_norm": 0.36949628591537476, + "learning_rate": 4.842087172660366e-06, + "loss": 0.6688, + "step": 3017 + }, + { + "epoch": 0.7032779244989902, + "grad_norm": 0.3414747416973114, + "learning_rate": 4.841980038867438e-06, + "loss": 0.6271, + "step": 3018 + }, + { + "epoch": 0.7035109523069752, + "grad_norm": 0.34710419178009033, + "learning_rate": 4.84187286993112e-06, + "loss": 0.6706, + "step": 3019 + }, + { + "epoch": 0.7037439801149604, + "grad_norm": 0.349898099899292, + "learning_rate": 4.841765665853019e-06, + "loss": 0.6477, + "step": 3020 + }, + { + "epoch": 0.7039770079229455, + "grad_norm": 0.34529709815979004, + "learning_rate": 4.841658426634745e-06, + "loss": 0.6471, + "step": 3021 + }, + { + "epoch": 0.7042100357309306, + "grad_norm": 0.3373080790042877, + "learning_rate": 4.841551152277906e-06, + "loss": 0.6385, + "step": 3022 + }, + { + "epoch": 0.7044430635389156, + "grad_norm": 0.3442961275577545, + "learning_rate": 4.841443842784112e-06, + "loss": 0.6281, + "step": 3023 + }, + { + "epoch": 0.7046760913469007, + "grad_norm": 0.3504621088504791, + "learning_rate": 4.841336498154975e-06, + "loss": 0.6682, + "step": 3024 + }, + { + "epoch": 0.7049091191548859, + "grad_norm": 0.37694406509399414, + "learning_rate": 4.841229118392104e-06, + "loss": 0.6647, + "step": 3025 + }, + { + "epoch": 0.7051421469628709, + "grad_norm": 0.33996033668518066, + "learning_rate": 4.841121703497109e-06, + "loss": 0.65, + "step": 3026 + }, + { + "epoch": 0.705375174770856, + "grad_norm": 0.33455154299736023, + "learning_rate": 4.841014253471606e-06, + "loss": 0.658, + "step": 3027 + }, + { + "epoch": 0.705608202578841, + "grad_norm": 0.35557061433792114, + "learning_rate": 4.8409067683172025e-06, + "loss": 0.6367, + "step": 3028 + }, + { + "epoch": 0.7058412303868261, + "grad_norm": 0.3744569718837738, + "learning_rate": 4.8407992480355145e-06, + "loss": 0.6531, + "step": 3029 + }, + { + "epoch": 0.7060742581948113, + "grad_norm": 0.4311912953853607, + "learning_rate": 4.840691692628155e-06, + "loss": 0.6252, + "step": 3030 + }, + { + "epoch": 0.7063072860027964, + "grad_norm": 0.34359443187713623, + "learning_rate": 4.84058410209674e-06, + "loss": 0.6524, + "step": 3031 + }, + { + "epoch": 0.7065403138107814, + "grad_norm": 0.36617326736450195, + "learning_rate": 4.840476476442878e-06, + "loss": 0.6358, + "step": 3032 + }, + { + "epoch": 0.7067733416187665, + "grad_norm": 0.44412803649902344, + "learning_rate": 4.840368815668191e-06, + "loss": 0.637, + "step": 3033 + }, + { + "epoch": 0.7070063694267515, + "grad_norm": 0.35716357827186584, + "learning_rate": 4.840261119774289e-06, + "loss": 0.6427, + "step": 3034 + }, + { + "epoch": 0.7072393972347367, + "grad_norm": 0.3464042842388153, + "learning_rate": 4.840153388762791e-06, + "loss": 0.6821, + "step": 3035 + }, + { + "epoch": 0.7074724250427218, + "grad_norm": 0.367305189371109, + "learning_rate": 4.840045622635314e-06, + "loss": 0.67, + "step": 3036 + }, + { + "epoch": 0.7077054528507069, + "grad_norm": 0.38649559020996094, + "learning_rate": 4.839937821393472e-06, + "loss": 0.6403, + "step": 3037 + }, + { + "epoch": 0.7079384806586919, + "grad_norm": 0.34432047605514526, + "learning_rate": 4.839829985038886e-06, + "loss": 0.6564, + "step": 3038 + }, + { + "epoch": 0.708171508466677, + "grad_norm": 0.34674617648124695, + "learning_rate": 4.8397221135731724e-06, + "loss": 0.6345, + "step": 3039 + }, + { + "epoch": 0.7084045362746622, + "grad_norm": 0.33704710006713867, + "learning_rate": 4.839614206997951e-06, + "loss": 0.6397, + "step": 3040 + }, + { + "epoch": 0.7086375640826472, + "grad_norm": 0.3546775281429291, + "learning_rate": 4.83950626531484e-06, + "loss": 0.6346, + "step": 3041 + }, + { + "epoch": 0.7088705918906323, + "grad_norm": 0.3604985773563385, + "learning_rate": 4.83939828852546e-06, + "loss": 0.6669, + "step": 3042 + }, + { + "epoch": 0.7091036196986173, + "grad_norm": 0.36211642622947693, + "learning_rate": 4.8392902766314295e-06, + "loss": 0.6342, + "step": 3043 + }, + { + "epoch": 0.7093366475066024, + "grad_norm": 0.3557267487049103, + "learning_rate": 4.839182229634372e-06, + "loss": 0.6529, + "step": 3044 + }, + { + "epoch": 0.7095696753145876, + "grad_norm": 0.3443957269191742, + "learning_rate": 4.839074147535907e-06, + "loss": 0.6516, + "step": 3045 + }, + { + "epoch": 0.7098027031225727, + "grad_norm": 0.3509231209754944, + "learning_rate": 4.838966030337657e-06, + "loss": 0.6541, + "step": 3046 + }, + { + "epoch": 0.7100357309305577, + "grad_norm": 0.35817667841911316, + "learning_rate": 4.838857878041244e-06, + "loss": 0.6589, + "step": 3047 + }, + { + "epoch": 0.7102687587385428, + "grad_norm": 0.35795050859451294, + "learning_rate": 4.838749690648292e-06, + "loss": 0.648, + "step": 3048 + }, + { + "epoch": 0.7105017865465278, + "grad_norm": 0.3547944128513336, + "learning_rate": 4.838641468160424e-06, + "loss": 0.6436, + "step": 3049 + }, + { + "epoch": 0.710734814354513, + "grad_norm": 0.34723007678985596, + "learning_rate": 4.838533210579262e-06, + "loss": 0.6416, + "step": 3050 + }, + { + "epoch": 0.7109678421624981, + "grad_norm": 0.34855005145072937, + "learning_rate": 4.838424917906433e-06, + "loss": 0.6352, + "step": 3051 + }, + { + "epoch": 0.7112008699704832, + "grad_norm": 0.38094112277030945, + "learning_rate": 4.83831659014356e-06, + "loss": 0.6182, + "step": 3052 + }, + { + "epoch": 0.7114338977784682, + "grad_norm": 0.3358006477355957, + "learning_rate": 4.83820822729227e-06, + "loss": 0.6562, + "step": 3053 + }, + { + "epoch": 0.7116669255864533, + "grad_norm": 0.34185928106307983, + "learning_rate": 4.83809982935419e-06, + "loss": 0.6528, + "step": 3054 + }, + { + "epoch": 0.7118999533944385, + "grad_norm": 0.3454000651836395, + "learning_rate": 4.837991396330944e-06, + "loss": 0.6583, + "step": 3055 + }, + { + "epoch": 0.7121329812024235, + "grad_norm": 0.3546416461467743, + "learning_rate": 4.837882928224161e-06, + "loss": 0.6572, + "step": 3056 + }, + { + "epoch": 0.7123660090104086, + "grad_norm": 0.35580116510391235, + "learning_rate": 4.837774425035468e-06, + "loss": 0.6463, + "step": 3057 + }, + { + "epoch": 0.7125990368183936, + "grad_norm": 0.36378344893455505, + "learning_rate": 4.837665886766494e-06, + "loss": 0.6168, + "step": 3058 + }, + { + "epoch": 0.7128320646263787, + "grad_norm": 0.34941327571868896, + "learning_rate": 4.837557313418866e-06, + "loss": 0.6696, + "step": 3059 + }, + { + "epoch": 0.7130650924343638, + "grad_norm": 0.3522895574569702, + "learning_rate": 4.837448704994215e-06, + "loss": 0.6438, + "step": 3060 + }, + { + "epoch": 0.713298120242349, + "grad_norm": 0.3689976930618286, + "learning_rate": 4.83734006149417e-06, + "loss": 0.6297, + "step": 3061 + }, + { + "epoch": 0.713531148050334, + "grad_norm": 0.33458220958709717, + "learning_rate": 4.83723138292036e-06, + "loss": 0.6227, + "step": 3062 + }, + { + "epoch": 0.7137641758583191, + "grad_norm": 0.33983147144317627, + "learning_rate": 4.837122669274419e-06, + "loss": 0.6366, + "step": 3063 + }, + { + "epoch": 0.7139972036663041, + "grad_norm": 0.3449232876300812, + "learning_rate": 4.837013920557975e-06, + "loss": 0.6763, + "step": 3064 + }, + { + "epoch": 0.7142302314742892, + "grad_norm": 0.3500492572784424, + "learning_rate": 4.836905136772662e-06, + "loss": 0.6369, + "step": 3065 + }, + { + "epoch": 0.7144632592822744, + "grad_norm": 0.3482242524623871, + "learning_rate": 4.836796317920112e-06, + "loss": 0.6413, + "step": 3066 + }, + { + "epoch": 0.7146962870902595, + "grad_norm": 0.3292717933654785, + "learning_rate": 4.836687464001957e-06, + "loss": 0.6376, + "step": 3067 + }, + { + "epoch": 0.7149293148982445, + "grad_norm": 0.3457491099834442, + "learning_rate": 4.836578575019831e-06, + "loss": 0.6339, + "step": 3068 + }, + { + "epoch": 0.7151623427062296, + "grad_norm": 0.3303210735321045, + "learning_rate": 4.836469650975369e-06, + "loss": 0.6525, + "step": 3069 + }, + { + "epoch": 0.7153953705142146, + "grad_norm": 0.34094277024269104, + "learning_rate": 4.8363606918702025e-06, + "loss": 0.653, + "step": 3070 + }, + { + "epoch": 0.7156283983221998, + "grad_norm": 0.3195515275001526, + "learning_rate": 4.83625169770597e-06, + "loss": 0.6135, + "step": 3071 + }, + { + "epoch": 0.7158614261301849, + "grad_norm": 0.3582059442996979, + "learning_rate": 4.836142668484306e-06, + "loss": 0.6728, + "step": 3072 + }, + { + "epoch": 0.71609445393817, + "grad_norm": 0.3356880843639374, + "learning_rate": 4.836033604206845e-06, + "loss": 0.63, + "step": 3073 + }, + { + "epoch": 0.716327481746155, + "grad_norm": 0.33821728825569153, + "learning_rate": 4.835924504875225e-06, + "loss": 0.643, + "step": 3074 + }, + { + "epoch": 0.7165605095541401, + "grad_norm": 0.3330288231372833, + "learning_rate": 4.8358153704910834e-06, + "loss": 0.6666, + "step": 3075 + }, + { + "epoch": 0.7167935373621253, + "grad_norm": 0.345687597990036, + "learning_rate": 4.8357062010560576e-06, + "loss": 0.6485, + "step": 3076 + }, + { + "epoch": 0.7170265651701103, + "grad_norm": 0.33962100744247437, + "learning_rate": 4.835596996571785e-06, + "loss": 0.6412, + "step": 3077 + }, + { + "epoch": 0.7172595929780954, + "grad_norm": 0.341589093208313, + "learning_rate": 4.8354877570399055e-06, + "loss": 0.6526, + "step": 3078 + }, + { + "epoch": 0.7174926207860804, + "grad_norm": 0.34056365489959717, + "learning_rate": 4.835378482462057e-06, + "loss": 0.6537, + "step": 3079 + }, + { + "epoch": 0.7177256485940655, + "grad_norm": 0.33900412917137146, + "learning_rate": 4.83526917283988e-06, + "loss": 0.6181, + "step": 3080 + }, + { + "epoch": 0.7179586764020507, + "grad_norm": 0.340034544467926, + "learning_rate": 4.835159828175016e-06, + "loss": 0.6324, + "step": 3081 + }, + { + "epoch": 0.7181917042100358, + "grad_norm": 0.33777105808258057, + "learning_rate": 4.835050448469103e-06, + "loss": 0.6394, + "step": 3082 + }, + { + "epoch": 0.7184247320180208, + "grad_norm": 0.33987337350845337, + "learning_rate": 4.834941033723785e-06, + "loss": 0.6778, + "step": 3083 + }, + { + "epoch": 0.7186577598260059, + "grad_norm": 0.35367244482040405, + "learning_rate": 4.834831583940701e-06, + "loss": 0.6267, + "step": 3084 + }, + { + "epoch": 0.718890787633991, + "grad_norm": 0.3414097726345062, + "learning_rate": 4.834722099121497e-06, + "loss": 0.6472, + "step": 3085 + }, + { + "epoch": 0.7191238154419761, + "grad_norm": 0.3408921957015991, + "learning_rate": 4.8346125792678134e-06, + "loss": 0.6329, + "step": 3086 + }, + { + "epoch": 0.7193568432499612, + "grad_norm": 0.36017513275146484, + "learning_rate": 4.8345030243812944e-06, + "loss": 0.671, + "step": 3087 + }, + { + "epoch": 0.7195898710579463, + "grad_norm": 0.33565688133239746, + "learning_rate": 4.834393434463583e-06, + "loss": 0.6426, + "step": 3088 + }, + { + "epoch": 0.7198228988659313, + "grad_norm": 0.32277658581733704, + "learning_rate": 4.834283809516325e-06, + "loss": 0.6488, + "step": 3089 + }, + { + "epoch": 0.7200559266739164, + "grad_norm": 0.3388022482395172, + "learning_rate": 4.834174149541165e-06, + "loss": 0.6796, + "step": 3090 + }, + { + "epoch": 0.7202889544819016, + "grad_norm": 0.3463088572025299, + "learning_rate": 4.834064454539749e-06, + "loss": 0.667, + "step": 3091 + }, + { + "epoch": 0.7205219822898866, + "grad_norm": 0.34728720784187317, + "learning_rate": 4.8339547245137216e-06, + "loss": 0.6615, + "step": 3092 + }, + { + "epoch": 0.7207550100978717, + "grad_norm": 0.3316285014152527, + "learning_rate": 4.833844959464731e-06, + "loss": 0.625, + "step": 3093 + }, + { + "epoch": 0.7209880379058567, + "grad_norm": 0.34473657608032227, + "learning_rate": 4.833735159394423e-06, + "loss": 0.6438, + "step": 3094 + }, + { + "epoch": 0.7212210657138418, + "grad_norm": 0.34249183535575867, + "learning_rate": 4.833625324304446e-06, + "loss": 0.6465, + "step": 3095 + }, + { + "epoch": 0.721454093521827, + "grad_norm": 0.33761391043663025, + "learning_rate": 4.833515454196449e-06, + "loss": 0.6292, + "step": 3096 + }, + { + "epoch": 0.721687121329812, + "grad_norm": 0.34786856174468994, + "learning_rate": 4.833405549072079e-06, + "loss": 0.6483, + "step": 3097 + }, + { + "epoch": 0.7219201491377971, + "grad_norm": 0.3349517583847046, + "learning_rate": 4.833295608932986e-06, + "loss": 0.6113, + "step": 3098 + }, + { + "epoch": 0.7221531769457822, + "grad_norm": 0.34533506631851196, + "learning_rate": 4.83318563378082e-06, + "loss": 0.6677, + "step": 3099 + }, + { + "epoch": 0.7223862047537672, + "grad_norm": 0.3447478413581848, + "learning_rate": 4.83307562361723e-06, + "loss": 0.6061, + "step": 3100 + }, + { + "epoch": 0.7226192325617524, + "grad_norm": 0.34756892919540405, + "learning_rate": 4.8329655784438686e-06, + "loss": 0.6456, + "step": 3101 + }, + { + "epoch": 0.7228522603697375, + "grad_norm": 0.3407537341117859, + "learning_rate": 4.832855498262386e-06, + "loss": 0.6411, + "step": 3102 + }, + { + "epoch": 0.7230852881777226, + "grad_norm": 0.3321404755115509, + "learning_rate": 4.832745383074435e-06, + "loss": 0.6289, + "step": 3103 + }, + { + "epoch": 0.7233183159857076, + "grad_norm": 0.342989444732666, + "learning_rate": 4.832635232881667e-06, + "loss": 0.6411, + "step": 3104 + }, + { + "epoch": 0.7235513437936927, + "grad_norm": 0.35097041726112366, + "learning_rate": 4.832525047685736e-06, + "loss": 0.6459, + "step": 3105 + }, + { + "epoch": 0.7237843716016777, + "grad_norm": 0.3481845557689667, + "learning_rate": 4.8324148274882935e-06, + "loss": 0.644, + "step": 3106 + }, + { + "epoch": 0.7240173994096629, + "grad_norm": 0.3474588990211487, + "learning_rate": 4.832304572290994e-06, + "loss": 0.6847, + "step": 3107 + }, + { + "epoch": 0.724250427217648, + "grad_norm": 0.3435742259025574, + "learning_rate": 4.832194282095494e-06, + "loss": 0.6599, + "step": 3108 + }, + { + "epoch": 0.724483455025633, + "grad_norm": 0.3473348915576935, + "learning_rate": 4.8320839569034476e-06, + "loss": 0.6281, + "step": 3109 + }, + { + "epoch": 0.7247164828336181, + "grad_norm": 0.35198062658309937, + "learning_rate": 4.8319735967165086e-06, + "loss": 0.6484, + "step": 3110 + }, + { + "epoch": 0.7249495106416032, + "grad_norm": 0.34075573086738586, + "learning_rate": 4.831863201536334e-06, + "loss": 0.6615, + "step": 3111 + }, + { + "epoch": 0.7251825384495884, + "grad_norm": 0.32439160346984863, + "learning_rate": 4.831752771364581e-06, + "loss": 0.6274, + "step": 3112 + }, + { + "epoch": 0.7254155662575734, + "grad_norm": 0.33533644676208496, + "learning_rate": 4.831642306202906e-06, + "loss": 0.6519, + "step": 3113 + }, + { + "epoch": 0.7256485940655585, + "grad_norm": 0.33312398195266724, + "learning_rate": 4.831531806052967e-06, + "loss": 0.6621, + "step": 3114 + }, + { + "epoch": 0.7258816218735435, + "grad_norm": 0.3438466787338257, + "learning_rate": 4.831421270916422e-06, + "loss": 0.6271, + "step": 3115 + }, + { + "epoch": 0.7261146496815286, + "grad_norm": 0.35301724076271057, + "learning_rate": 4.83131070079493e-06, + "loss": 0.6147, + "step": 3116 + }, + { + "epoch": 0.7263476774895138, + "grad_norm": 0.3372238278388977, + "learning_rate": 4.83120009569015e-06, + "loss": 0.6541, + "step": 3117 + }, + { + "epoch": 0.7265807052974989, + "grad_norm": 0.33956417441368103, + "learning_rate": 4.831089455603743e-06, + "loss": 0.6305, + "step": 3118 + }, + { + "epoch": 0.7268137331054839, + "grad_norm": 0.34471753239631653, + "learning_rate": 4.830978780537365e-06, + "loss": 0.6248, + "step": 3119 + }, + { + "epoch": 0.727046760913469, + "grad_norm": 0.34800395369529724, + "learning_rate": 4.830868070492682e-06, + "loss": 0.6639, + "step": 3120 + }, + { + "epoch": 0.727279788721454, + "grad_norm": 0.35558366775512695, + "learning_rate": 4.830757325471352e-06, + "loss": 0.6533, + "step": 3121 + }, + { + "epoch": 0.7275128165294392, + "grad_norm": 0.3419859707355499, + "learning_rate": 4.830646545475039e-06, + "loss": 0.6663, + "step": 3122 + }, + { + "epoch": 0.7277458443374243, + "grad_norm": 0.3423082232475281, + "learning_rate": 4.830535730505402e-06, + "loss": 0.648, + "step": 3123 + }, + { + "epoch": 0.7279788721454094, + "grad_norm": 0.34456920623779297, + "learning_rate": 4.830424880564107e-06, + "loss": 0.6583, + "step": 3124 + }, + { + "epoch": 0.7282118999533944, + "grad_norm": 0.339545875787735, + "learning_rate": 4.830313995652817e-06, + "loss": 0.672, + "step": 3125 + }, + { + "epoch": 0.7284449277613795, + "grad_norm": 0.3444019854068756, + "learning_rate": 4.8302030757731956e-06, + "loss": 0.6603, + "step": 3126 + }, + { + "epoch": 0.7286779555693647, + "grad_norm": 0.3373524248600006, + "learning_rate": 4.8300921209269055e-06, + "loss": 0.6355, + "step": 3127 + }, + { + "epoch": 0.7289109833773497, + "grad_norm": 0.34043005108833313, + "learning_rate": 4.829981131115615e-06, + "loss": 0.6637, + "step": 3128 + }, + { + "epoch": 0.7291440111853348, + "grad_norm": 0.3272787630558014, + "learning_rate": 4.8298701063409866e-06, + "loss": 0.6621, + "step": 3129 + }, + { + "epoch": 0.7293770389933198, + "grad_norm": 0.3438282012939453, + "learning_rate": 4.829759046604687e-06, + "loss": 0.636, + "step": 3130 + }, + { + "epoch": 0.7296100668013049, + "grad_norm": 0.3606633245944977, + "learning_rate": 4.829647951908385e-06, + "loss": 0.6512, + "step": 3131 + }, + { + "epoch": 0.7298430946092901, + "grad_norm": 0.3499276638031006, + "learning_rate": 4.829536822253744e-06, + "loss": 0.6457, + "step": 3132 + }, + { + "epoch": 0.7300761224172752, + "grad_norm": 0.337262898683548, + "learning_rate": 4.829425657642435e-06, + "loss": 0.6365, + "step": 3133 + }, + { + "epoch": 0.7303091502252602, + "grad_norm": 0.33490800857543945, + "learning_rate": 4.829314458076124e-06, + "loss": 0.6117, + "step": 3134 + }, + { + "epoch": 0.7305421780332453, + "grad_norm": 0.35111069679260254, + "learning_rate": 4.82920322355648e-06, + "loss": 0.6505, + "step": 3135 + }, + { + "epoch": 0.7307752058412303, + "grad_norm": 0.351300448179245, + "learning_rate": 4.829091954085172e-06, + "loss": 0.6432, + "step": 3136 + }, + { + "epoch": 0.7310082336492155, + "grad_norm": 0.34908854961395264, + "learning_rate": 4.828980649663872e-06, + "loss": 0.6525, + "step": 3137 + }, + { + "epoch": 0.7312412614572006, + "grad_norm": 0.34278157353401184, + "learning_rate": 4.828869310294246e-06, + "loss": 0.6694, + "step": 3138 + }, + { + "epoch": 0.7314742892651857, + "grad_norm": 0.347072958946228, + "learning_rate": 4.828757935977969e-06, + "loss": 0.6131, + "step": 3139 + }, + { + "epoch": 0.7317073170731707, + "grad_norm": 0.34395596385002136, + "learning_rate": 4.828646526716709e-06, + "loss": 0.6553, + "step": 3140 + }, + { + "epoch": 0.7319403448811558, + "grad_norm": 0.33891159296035767, + "learning_rate": 4.8285350825121394e-06, + "loss": 0.641, + "step": 3141 + }, + { + "epoch": 0.732173372689141, + "grad_norm": 0.33914095163345337, + "learning_rate": 4.828423603365933e-06, + "loss": 0.6598, + "step": 3142 + }, + { + "epoch": 0.732406400497126, + "grad_norm": 0.3518467843532562, + "learning_rate": 4.828312089279761e-06, + "loss": 0.6416, + "step": 3143 + }, + { + "epoch": 0.7326394283051111, + "grad_norm": 0.35851022601127625, + "learning_rate": 4.828200540255298e-06, + "loss": 0.6573, + "step": 3144 + }, + { + "epoch": 0.7328724561130961, + "grad_norm": 0.35274019837379456, + "learning_rate": 4.8280889562942175e-06, + "loss": 0.6414, + "step": 3145 + }, + { + "epoch": 0.7331054839210812, + "grad_norm": 0.3277902901172638, + "learning_rate": 4.827977337398194e-06, + "loss": 0.6517, + "step": 3146 + }, + { + "epoch": 0.7333385117290664, + "grad_norm": 0.35696157813072205, + "learning_rate": 4.827865683568902e-06, + "loss": 0.6403, + "step": 3147 + }, + { + "epoch": 0.7335715395370515, + "grad_norm": 0.3485742509365082, + "learning_rate": 4.827753994808017e-06, + "loss": 0.6615, + "step": 3148 + }, + { + "epoch": 0.7338045673450365, + "grad_norm": 0.34345585107803345, + "learning_rate": 4.8276422711172165e-06, + "loss": 0.637, + "step": 3149 + }, + { + "epoch": 0.7340375951530216, + "grad_norm": 0.3350767493247986, + "learning_rate": 4.827530512498176e-06, + "loss": 0.626, + "step": 3150 + }, + { + "epoch": 0.7342706229610066, + "grad_norm": 0.347811758518219, + "learning_rate": 4.827418718952571e-06, + "loss": 0.6448, + "step": 3151 + }, + { + "epoch": 0.7345036507689917, + "grad_norm": 0.34380751848220825, + "learning_rate": 4.827306890482082e-06, + "loss": 0.6476, + "step": 3152 + }, + { + "epoch": 0.7347366785769769, + "grad_norm": 0.33038488030433655, + "learning_rate": 4.8271950270883845e-06, + "loss": 0.6383, + "step": 3153 + }, + { + "epoch": 0.734969706384962, + "grad_norm": 0.3463952839374542, + "learning_rate": 4.827083128773158e-06, + "loss": 0.6746, + "step": 3154 + }, + { + "epoch": 0.735202734192947, + "grad_norm": 0.34071052074432373, + "learning_rate": 4.826971195538082e-06, + "loss": 0.6439, + "step": 3155 + }, + { + "epoch": 0.7354357620009321, + "grad_norm": 0.34183603525161743, + "learning_rate": 4.826859227384836e-06, + "loss": 0.6138, + "step": 3156 + }, + { + "epoch": 0.7356687898089171, + "grad_norm": 0.344794899225235, + "learning_rate": 4.8267472243151e-06, + "loss": 0.644, + "step": 3157 + }, + { + "epoch": 0.7359018176169023, + "grad_norm": 0.3323722779750824, + "learning_rate": 4.826635186330555e-06, + "loss": 0.6532, + "step": 3158 + }, + { + "epoch": 0.7361348454248874, + "grad_norm": 0.3376578390598297, + "learning_rate": 4.826523113432882e-06, + "loss": 0.6496, + "step": 3159 + }, + { + "epoch": 0.7363678732328724, + "grad_norm": 0.3405734896659851, + "learning_rate": 4.826411005623762e-06, + "loss": 0.6504, + "step": 3160 + }, + { + "epoch": 0.7366009010408575, + "grad_norm": 0.33469974994659424, + "learning_rate": 4.826298862904878e-06, + "loss": 0.636, + "step": 3161 + }, + { + "epoch": 0.7368339288488426, + "grad_norm": 0.3384998142719269, + "learning_rate": 4.8261866852779136e-06, + "loss": 0.6224, + "step": 3162 + }, + { + "epoch": 0.7370669566568278, + "grad_norm": 0.3405486047267914, + "learning_rate": 4.82607447274455e-06, + "loss": 0.6439, + "step": 3163 + }, + { + "epoch": 0.7372999844648128, + "grad_norm": 0.3470497727394104, + "learning_rate": 4.825962225306474e-06, + "loss": 0.6542, + "step": 3164 + }, + { + "epoch": 0.7375330122727979, + "grad_norm": 0.35120874643325806, + "learning_rate": 4.825849942965368e-06, + "loss": 0.6652, + "step": 3165 + }, + { + "epoch": 0.737766040080783, + "grad_norm": 0.35674694180488586, + "learning_rate": 4.825737625722917e-06, + "loss": 0.6508, + "step": 3166 + }, + { + "epoch": 0.737999067888768, + "grad_norm": 0.34095001220703125, + "learning_rate": 4.825625273580806e-06, + "loss": 0.6007, + "step": 3167 + }, + { + "epoch": 0.7382320956967532, + "grad_norm": 0.35138973593711853, + "learning_rate": 4.825512886540723e-06, + "loss": 0.6546, + "step": 3168 + }, + { + "epoch": 0.7384651235047383, + "grad_norm": 0.33919015526771545, + "learning_rate": 4.825400464604352e-06, + "loss": 0.6517, + "step": 3169 + }, + { + "epoch": 0.7386981513127233, + "grad_norm": 0.3524872660636902, + "learning_rate": 4.825288007773381e-06, + "loss": 0.6448, + "step": 3170 + }, + { + "epoch": 0.7389311791207084, + "grad_norm": 0.3526056110858917, + "learning_rate": 4.825175516049499e-06, + "loss": 0.6632, + "step": 3171 + }, + { + "epoch": 0.7391642069286934, + "grad_norm": 0.3452853858470917, + "learning_rate": 4.82506298943439e-06, + "loss": 0.6564, + "step": 3172 + }, + { + "epoch": 0.7393972347366786, + "grad_norm": 0.3447176516056061, + "learning_rate": 4.824950427929747e-06, + "loss": 0.6702, + "step": 3173 + }, + { + "epoch": 0.7396302625446637, + "grad_norm": 0.36104512214660645, + "learning_rate": 4.8248378315372566e-06, + "loss": 0.643, + "step": 3174 + }, + { + "epoch": 0.7398632903526488, + "grad_norm": 0.35542795062065125, + "learning_rate": 4.82472520025861e-06, + "loss": 0.6477, + "step": 3175 + }, + { + "epoch": 0.7400963181606338, + "grad_norm": 0.32988762855529785, + "learning_rate": 4.824612534095496e-06, + "loss": 0.6374, + "step": 3176 + }, + { + "epoch": 0.7403293459686189, + "grad_norm": 0.3546084463596344, + "learning_rate": 4.824499833049604e-06, + "loss": 0.6502, + "step": 3177 + }, + { + "epoch": 0.740562373776604, + "grad_norm": 0.3374345302581787, + "learning_rate": 4.824387097122628e-06, + "loss": 0.6543, + "step": 3178 + }, + { + "epoch": 0.7407954015845891, + "grad_norm": 0.3604571521282196, + "learning_rate": 4.824274326316258e-06, + "loss": 0.6542, + "step": 3179 + }, + { + "epoch": 0.7410284293925742, + "grad_norm": 0.34523072838783264, + "learning_rate": 4.824161520632187e-06, + "loss": 0.6404, + "step": 3180 + }, + { + "epoch": 0.7412614572005592, + "grad_norm": 0.33610016107559204, + "learning_rate": 4.824048680072107e-06, + "loss": 0.6478, + "step": 3181 + }, + { + "epoch": 0.7414944850085443, + "grad_norm": 0.3363073766231537, + "learning_rate": 4.823935804637713e-06, + "loss": 0.6396, + "step": 3182 + }, + { + "epoch": 0.7417275128165295, + "grad_norm": 0.3265811502933502, + "learning_rate": 4.823822894330697e-06, + "loss": 0.6633, + "step": 3183 + }, + { + "epoch": 0.7419605406245146, + "grad_norm": 0.3369697630405426, + "learning_rate": 4.8237099491527536e-06, + "loss": 0.6566, + "step": 3184 + }, + { + "epoch": 0.7421935684324996, + "grad_norm": 0.348147451877594, + "learning_rate": 4.823596969105577e-06, + "loss": 0.6456, + "step": 3185 + }, + { + "epoch": 0.7424265962404847, + "grad_norm": 0.347522497177124, + "learning_rate": 4.823483954190865e-06, + "loss": 0.6454, + "step": 3186 + }, + { + "epoch": 0.7426596240484697, + "grad_norm": 0.3396416902542114, + "learning_rate": 4.82337090441031e-06, + "loss": 0.6257, + "step": 3187 + }, + { + "epoch": 0.7428926518564549, + "grad_norm": 0.3369756042957306, + "learning_rate": 4.823257819765612e-06, + "loss": 0.6325, + "step": 3188 + }, + { + "epoch": 0.74312567966444, + "grad_norm": 0.34554407000541687, + "learning_rate": 4.823144700258466e-06, + "loss": 0.6444, + "step": 3189 + }, + { + "epoch": 0.743358707472425, + "grad_norm": 0.3345966637134552, + "learning_rate": 4.823031545890569e-06, + "loss": 0.6764, + "step": 3190 + }, + { + "epoch": 0.7435917352804101, + "grad_norm": 0.3428337275981903, + "learning_rate": 4.82291835666362e-06, + "loss": 0.6405, + "step": 3191 + }, + { + "epoch": 0.7438247630883952, + "grad_norm": 0.33425986766815186, + "learning_rate": 4.8228051325793165e-06, + "loss": 0.6501, + "step": 3192 + }, + { + "epoch": 0.7440577908963804, + "grad_norm": 0.34907636046409607, + "learning_rate": 4.822691873639359e-06, + "loss": 0.6649, + "step": 3193 + }, + { + "epoch": 0.7442908187043654, + "grad_norm": 0.33475905656814575, + "learning_rate": 4.822578579845446e-06, + "loss": 0.6177, + "step": 3194 + }, + { + "epoch": 0.7445238465123505, + "grad_norm": 0.3396044671535492, + "learning_rate": 4.822465251199278e-06, + "loss": 0.6346, + "step": 3195 + }, + { + "epoch": 0.7447568743203355, + "grad_norm": 0.35666534304618835, + "learning_rate": 4.822351887702556e-06, + "loss": 0.655, + "step": 3196 + }, + { + "epoch": 0.7449899021283206, + "grad_norm": 0.3472031354904175, + "learning_rate": 4.82223848935698e-06, + "loss": 0.6411, + "step": 3197 + }, + { + "epoch": 0.7452229299363057, + "grad_norm": 0.35439929366111755, + "learning_rate": 4.822125056164251e-06, + "loss": 0.6266, + "step": 3198 + }, + { + "epoch": 0.7454559577442909, + "grad_norm": 0.3473011553287506, + "learning_rate": 4.822011588126073e-06, + "loss": 0.6259, + "step": 3199 + }, + { + "epoch": 0.7456889855522759, + "grad_norm": 0.33244064450263977, + "learning_rate": 4.821898085244149e-06, + "loss": 0.6485, + "step": 3200 + }, + { + "epoch": 0.745922013360261, + "grad_norm": 0.34926390647888184, + "learning_rate": 4.82178454752018e-06, + "loss": 0.6731, + "step": 3201 + }, + { + "epoch": 0.746155041168246, + "grad_norm": 0.3370054066181183, + "learning_rate": 4.821670974955871e-06, + "loss": 0.6484, + "step": 3202 + }, + { + "epoch": 0.7463880689762311, + "grad_norm": 0.3446631133556366, + "learning_rate": 4.821557367552927e-06, + "loss": 0.6348, + "step": 3203 + }, + { + "epoch": 0.7466210967842163, + "grad_norm": 0.3424868881702423, + "learning_rate": 4.821443725313052e-06, + "loss": 0.6583, + "step": 3204 + }, + { + "epoch": 0.7468541245922014, + "grad_norm": 0.3607289493083954, + "learning_rate": 4.82133004823795e-06, + "loss": 0.6772, + "step": 3205 + }, + { + "epoch": 0.7470871524001864, + "grad_norm": 0.3564506471157074, + "learning_rate": 4.821216336329329e-06, + "loss": 0.6289, + "step": 3206 + }, + { + "epoch": 0.7473201802081715, + "grad_norm": 0.35301801562309265, + "learning_rate": 4.821102589588894e-06, + "loss": 0.6715, + "step": 3207 + }, + { + "epoch": 0.7475532080161565, + "grad_norm": 0.35231146216392517, + "learning_rate": 4.820988808018353e-06, + "loss": 0.6408, + "step": 3208 + }, + { + "epoch": 0.7477862358241417, + "grad_norm": 0.34647899866104126, + "learning_rate": 4.820874991619412e-06, + "loss": 0.6441, + "step": 3209 + }, + { + "epoch": 0.7480192636321268, + "grad_norm": 0.34790894389152527, + "learning_rate": 4.820761140393779e-06, + "loss": 0.6445, + "step": 3210 + }, + { + "epoch": 0.7482522914401118, + "grad_norm": 0.351002961397171, + "learning_rate": 4.820647254343164e-06, + "loss": 0.6491, + "step": 3211 + }, + { + "epoch": 0.7484853192480969, + "grad_norm": 0.33095428347587585, + "learning_rate": 4.820533333469275e-06, + "loss": 0.68, + "step": 3212 + }, + { + "epoch": 0.748718347056082, + "grad_norm": 0.3562941253185272, + "learning_rate": 4.820419377773821e-06, + "loss": 0.6506, + "step": 3213 + }, + { + "epoch": 0.7489513748640672, + "grad_norm": 0.35631048679351807, + "learning_rate": 4.820305387258512e-06, + "loss": 0.655, + "step": 3214 + }, + { + "epoch": 0.7491844026720522, + "grad_norm": 0.3447266221046448, + "learning_rate": 4.82019136192506e-06, + "loss": 0.6632, + "step": 3215 + }, + { + "epoch": 0.7494174304800373, + "grad_norm": 0.33963173627853394, + "learning_rate": 4.820077301775175e-06, + "loss": 0.6514, + "step": 3216 + }, + { + "epoch": 0.7496504582880223, + "grad_norm": 0.3554181158542633, + "learning_rate": 4.819963206810568e-06, + "loss": 0.6717, + "step": 3217 + }, + { + "epoch": 0.7498834860960074, + "grad_norm": 0.3423314094543457, + "learning_rate": 4.819849077032952e-06, + "loss": 0.6316, + "step": 3218 + }, + { + "epoch": 0.7501165139039926, + "grad_norm": 0.3533771336078644, + "learning_rate": 4.8197349124440395e-06, + "loss": 0.6488, + "step": 3219 + }, + { + "epoch": 0.7503495417119777, + "grad_norm": 0.3453848659992218, + "learning_rate": 4.819620713045543e-06, + "loss": 0.6492, + "step": 3220 + }, + { + "epoch": 0.7505825695199627, + "grad_norm": 0.3555521070957184, + "learning_rate": 4.819506478839178e-06, + "loss": 0.6775, + "step": 3221 + }, + { + "epoch": 0.7508155973279478, + "grad_norm": 0.339569628238678, + "learning_rate": 4.819392209826656e-06, + "loss": 0.6451, + "step": 3222 + }, + { + "epoch": 0.7510486251359328, + "grad_norm": 0.34634456038475037, + "learning_rate": 4.8192779060096925e-06, + "loss": 0.6346, + "step": 3223 + }, + { + "epoch": 0.751281652943918, + "grad_norm": 0.33622679114341736, + "learning_rate": 4.819163567390004e-06, + "loss": 0.6762, + "step": 3224 + }, + { + "epoch": 0.7515146807519031, + "grad_norm": 0.33932310342788696, + "learning_rate": 4.8190491939693055e-06, + "loss": 0.6147, + "step": 3225 + }, + { + "epoch": 0.7517477085598882, + "grad_norm": 0.3475668430328369, + "learning_rate": 4.818934785749312e-06, + "loss": 0.6647, + "step": 3226 + }, + { + "epoch": 0.7519807363678732, + "grad_norm": 0.3365150988101959, + "learning_rate": 4.818820342731742e-06, + "loss": 0.6413, + "step": 3227 + }, + { + "epoch": 0.7522137641758583, + "grad_norm": 0.35051050782203674, + "learning_rate": 4.8187058649183135e-06, + "loss": 0.6436, + "step": 3228 + }, + { + "epoch": 0.7524467919838435, + "grad_norm": 0.3427262604236603, + "learning_rate": 4.8185913523107415e-06, + "loss": 0.6507, + "step": 3229 + }, + { + "epoch": 0.7526798197918285, + "grad_norm": 0.3571871519088745, + "learning_rate": 4.8184768049107474e-06, + "loss": 0.6744, + "step": 3230 + }, + { + "epoch": 0.7529128475998136, + "grad_norm": 0.3444487154483795, + "learning_rate": 4.818362222720049e-06, + "loss": 0.6493, + "step": 3231 + }, + { + "epoch": 0.7531458754077986, + "grad_norm": 0.33227455615997314, + "learning_rate": 4.818247605740364e-06, + "loss": 0.6398, + "step": 3232 + }, + { + "epoch": 0.7533789032157837, + "grad_norm": 0.3508170545101166, + "learning_rate": 4.818132953973415e-06, + "loss": 0.6588, + "step": 3233 + }, + { + "epoch": 0.7536119310237689, + "grad_norm": 0.3524635434150696, + "learning_rate": 4.81801826742092e-06, + "loss": 0.6293, + "step": 3234 + }, + { + "epoch": 0.753844958831754, + "grad_norm": 0.34870293736457825, + "learning_rate": 4.817903546084602e-06, + "loss": 0.6218, + "step": 3235 + }, + { + "epoch": 0.754077986639739, + "grad_norm": 0.34636205434799194, + "learning_rate": 4.817788789966181e-06, + "loss": 0.6571, + "step": 3236 + }, + { + "epoch": 0.7543110144477241, + "grad_norm": 0.3511344790458679, + "learning_rate": 4.8176739990673805e-06, + "loss": 0.6168, + "step": 3237 + }, + { + "epoch": 0.7545440422557091, + "grad_norm": 0.34113338589668274, + "learning_rate": 4.817559173389921e-06, + "loss": 0.6271, + "step": 3238 + }, + { + "epoch": 0.7547770700636943, + "grad_norm": 0.32975825667381287, + "learning_rate": 4.817444312935527e-06, + "loss": 0.6207, + "step": 3239 + }, + { + "epoch": 0.7550100978716794, + "grad_norm": 0.3544616401195526, + "learning_rate": 4.817329417705922e-06, + "loss": 0.6273, + "step": 3240 + }, + { + "epoch": 0.7552431256796645, + "grad_norm": 0.3333622217178345, + "learning_rate": 4.8172144877028315e-06, + "loss": 0.6454, + "step": 3241 + }, + { + "epoch": 0.7554761534876495, + "grad_norm": 0.35102710127830505, + "learning_rate": 4.817099522927977e-06, + "loss": 0.6537, + "step": 3242 + }, + { + "epoch": 0.7557091812956346, + "grad_norm": 0.33809036016464233, + "learning_rate": 4.8169845233830856e-06, + "loss": 0.6647, + "step": 3243 + }, + { + "epoch": 0.7559422091036196, + "grad_norm": 0.3254709839820862, + "learning_rate": 4.816869489069882e-06, + "loss": 0.6538, + "step": 3244 + }, + { + "epoch": 0.7561752369116048, + "grad_norm": 0.343944787979126, + "learning_rate": 4.8167544199900936e-06, + "loss": 0.6496, + "step": 3245 + }, + { + "epoch": 0.7564082647195899, + "grad_norm": 0.33066585659980774, + "learning_rate": 4.816639316145446e-06, + "loss": 0.6492, + "step": 3246 + }, + { + "epoch": 0.756641292527575, + "grad_norm": 0.3279355764389038, + "learning_rate": 4.816524177537667e-06, + "loss": 0.6105, + "step": 3247 + }, + { + "epoch": 0.75687432033556, + "grad_norm": 0.34206706285476685, + "learning_rate": 4.816409004168484e-06, + "loss": 0.6522, + "step": 3248 + }, + { + "epoch": 0.7571073481435451, + "grad_norm": 0.34786108136177063, + "learning_rate": 4.816293796039626e-06, + "loss": 0.6585, + "step": 3249 + }, + { + "epoch": 0.7573403759515303, + "grad_norm": 0.34715160727500916, + "learning_rate": 4.81617855315282e-06, + "loss": 0.6548, + "step": 3250 + }, + { + "epoch": 0.7575734037595153, + "grad_norm": 0.33835330605506897, + "learning_rate": 4.816063275509798e-06, + "loss": 0.6568, + "step": 3251 + }, + { + "epoch": 0.7578064315675004, + "grad_norm": 0.3498075306415558, + "learning_rate": 4.815947963112288e-06, + "loss": 0.6441, + "step": 3252 + }, + { + "epoch": 0.7580394593754854, + "grad_norm": 0.34846168756484985, + "learning_rate": 4.815832615962022e-06, + "loss": 0.6295, + "step": 3253 + }, + { + "epoch": 0.7582724871834705, + "grad_norm": 0.3515395224094391, + "learning_rate": 4.815717234060728e-06, + "loss": 0.6543, + "step": 3254 + }, + { + "epoch": 0.7585055149914557, + "grad_norm": 0.34635064005851746, + "learning_rate": 4.8156018174101395e-06, + "loss": 0.6461, + "step": 3255 + }, + { + "epoch": 0.7587385427994408, + "grad_norm": 0.3492947816848755, + "learning_rate": 4.815486366011989e-06, + "loss": 0.6385, + "step": 3256 + }, + { + "epoch": 0.7589715706074258, + "grad_norm": 0.3354594111442566, + "learning_rate": 4.815370879868007e-06, + "loss": 0.6422, + "step": 3257 + }, + { + "epoch": 0.7592045984154109, + "grad_norm": 0.3410017192363739, + "learning_rate": 4.815255358979927e-06, + "loss": 0.6254, + "step": 3258 + }, + { + "epoch": 0.759437626223396, + "grad_norm": 0.3537224531173706, + "learning_rate": 4.815139803349484e-06, + "loss": 0.6645, + "step": 3259 + }, + { + "epoch": 0.7596706540313811, + "grad_norm": 0.35006555914878845, + "learning_rate": 4.81502421297841e-06, + "loss": 0.657, + "step": 3260 + }, + { + "epoch": 0.7599036818393662, + "grad_norm": 0.34229782223701477, + "learning_rate": 4.814908587868441e-06, + "loss": 0.6747, + "step": 3261 + }, + { + "epoch": 0.7601367096473512, + "grad_norm": 0.3591444492340088, + "learning_rate": 4.814792928021311e-06, + "loss": 0.6444, + "step": 3262 + }, + { + "epoch": 0.7603697374553363, + "grad_norm": 0.3441005349159241, + "learning_rate": 4.814677233438757e-06, + "loss": 0.6496, + "step": 3263 + }, + { + "epoch": 0.7606027652633214, + "grad_norm": 0.34978365898132324, + "learning_rate": 4.814561504122514e-06, + "loss": 0.6339, + "step": 3264 + }, + { + "epoch": 0.7608357930713066, + "grad_norm": 0.35024142265319824, + "learning_rate": 4.814445740074318e-06, + "loss": 0.6323, + "step": 3265 + }, + { + "epoch": 0.7610688208792916, + "grad_norm": 0.3472408652305603, + "learning_rate": 4.8143299412959075e-06, + "loss": 0.6471, + "step": 3266 + }, + { + "epoch": 0.7613018486872767, + "grad_norm": 0.34186065196990967, + "learning_rate": 4.81421410778902e-06, + "loss": 0.6615, + "step": 3267 + }, + { + "epoch": 0.7615348764952617, + "grad_norm": 0.35786116123199463, + "learning_rate": 4.814098239555392e-06, + "loss": 0.6304, + "step": 3268 + }, + { + "epoch": 0.7617679043032468, + "grad_norm": 0.3526514172554016, + "learning_rate": 4.8139823365967655e-06, + "loss": 0.6637, + "step": 3269 + }, + { + "epoch": 0.762000932111232, + "grad_norm": 0.3377237915992737, + "learning_rate": 4.813866398914876e-06, + "loss": 0.6493, + "step": 3270 + }, + { + "epoch": 0.762233959919217, + "grad_norm": 0.37075015902519226, + "learning_rate": 4.8137504265114655e-06, + "loss": 0.6572, + "step": 3271 + }, + { + "epoch": 0.7624669877272021, + "grad_norm": 0.3407856523990631, + "learning_rate": 4.8136344193882735e-06, + "loss": 0.6449, + "step": 3272 + }, + { + "epoch": 0.7627000155351872, + "grad_norm": 0.36103707551956177, + "learning_rate": 4.813518377547042e-06, + "loss": 0.6439, + "step": 3273 + }, + { + "epoch": 0.7629330433431722, + "grad_norm": 0.34060922265052795, + "learning_rate": 4.81340230098951e-06, + "loss": 0.6537, + "step": 3274 + }, + { + "epoch": 0.7631660711511574, + "grad_norm": 0.3362058401107788, + "learning_rate": 4.813286189717421e-06, + "loss": 0.6621, + "step": 3275 + }, + { + "epoch": 0.7633990989591425, + "grad_norm": 0.33695897459983826, + "learning_rate": 4.813170043732516e-06, + "loss": 0.6416, + "step": 3276 + }, + { + "epoch": 0.7636321267671276, + "grad_norm": 0.3482149541378021, + "learning_rate": 4.813053863036541e-06, + "loss": 0.6702, + "step": 3277 + }, + { + "epoch": 0.7638651545751126, + "grad_norm": 0.358127623796463, + "learning_rate": 4.812937647631236e-06, + "loss": 0.6472, + "step": 3278 + }, + { + "epoch": 0.7640981823830977, + "grad_norm": 0.3415375351905823, + "learning_rate": 4.812821397518346e-06, + "loss": 0.6463, + "step": 3279 + }, + { + "epoch": 0.7643312101910829, + "grad_norm": 0.35667741298675537, + "learning_rate": 4.812705112699616e-06, + "loss": 0.6444, + "step": 3280 + }, + { + "epoch": 0.7645642379990679, + "grad_norm": 0.3553098142147064, + "learning_rate": 4.81258879317679e-06, + "loss": 0.635, + "step": 3281 + }, + { + "epoch": 0.764797265807053, + "grad_norm": 0.35836637020111084, + "learning_rate": 4.812472438951615e-06, + "loss": 0.6597, + "step": 3282 + }, + { + "epoch": 0.765030293615038, + "grad_norm": 0.348099023103714, + "learning_rate": 4.812356050025835e-06, + "loss": 0.6491, + "step": 3283 + }, + { + "epoch": 0.7652633214230231, + "grad_norm": 0.35730811953544617, + "learning_rate": 4.812239626401199e-06, + "loss": 0.6456, + "step": 3284 + }, + { + "epoch": 0.7654963492310083, + "grad_norm": 0.3458344638347626, + "learning_rate": 4.81212316807945e-06, + "loss": 0.634, + "step": 3285 + }, + { + "epoch": 0.7657293770389934, + "grad_norm": 0.34174299240112305, + "learning_rate": 4.81200667506234e-06, + "loss": 0.6667, + "step": 3286 + }, + { + "epoch": 0.7659624048469784, + "grad_norm": 0.3534471392631531, + "learning_rate": 4.811890147351615e-06, + "loss": 0.6404, + "step": 3287 + }, + { + "epoch": 0.7661954326549635, + "grad_norm": 0.3495620787143707, + "learning_rate": 4.8117735849490235e-06, + "loss": 0.6638, + "step": 3288 + }, + { + "epoch": 0.7664284604629485, + "grad_norm": 0.33748164772987366, + "learning_rate": 4.811656987856315e-06, + "loss": 0.6651, + "step": 3289 + }, + { + "epoch": 0.7666614882709336, + "grad_norm": 0.35684001445770264, + "learning_rate": 4.811540356075238e-06, + "loss": 0.6528, + "step": 3290 + }, + { + "epoch": 0.7668945160789188, + "grad_norm": 0.3413209319114685, + "learning_rate": 4.811423689607545e-06, + "loss": 0.6247, + "step": 3291 + }, + { + "epoch": 0.7671275438869039, + "grad_norm": 0.3645796775817871, + "learning_rate": 4.811306988454985e-06, + "loss": 0.6524, + "step": 3292 + }, + { + "epoch": 0.7673605716948889, + "grad_norm": 0.3357072174549103, + "learning_rate": 4.8111902526193086e-06, + "loss": 0.6243, + "step": 3293 + }, + { + "epoch": 0.767593599502874, + "grad_norm": 0.3499735891819, + "learning_rate": 4.81107348210227e-06, + "loss": 0.6655, + "step": 3294 + }, + { + "epoch": 0.767826627310859, + "grad_norm": 0.35434502363204956, + "learning_rate": 4.810956676905619e-06, + "loss": 0.6491, + "step": 3295 + }, + { + "epoch": 0.7680596551188442, + "grad_norm": 0.33859920501708984, + "learning_rate": 4.810839837031109e-06, + "loss": 0.6232, + "step": 3296 + }, + { + "epoch": 0.7682926829268293, + "grad_norm": 0.3312697112560272, + "learning_rate": 4.810722962480495e-06, + "loss": 0.6552, + "step": 3297 + }, + { + "epoch": 0.7685257107348143, + "grad_norm": 0.34180116653442383, + "learning_rate": 4.810606053255528e-06, + "loss": 0.6509, + "step": 3298 + }, + { + "epoch": 0.7687587385427994, + "grad_norm": 0.36605459451675415, + "learning_rate": 4.810489109357964e-06, + "loss": 0.6277, + "step": 3299 + }, + { + "epoch": 0.7689917663507845, + "grad_norm": 0.3543647825717926, + "learning_rate": 4.810372130789558e-06, + "loss": 0.6368, + "step": 3300 + }, + { + "epoch": 0.7692247941587697, + "grad_norm": 0.3504234850406647, + "learning_rate": 4.810255117552065e-06, + "loss": 0.6635, + "step": 3301 + }, + { + "epoch": 0.7694578219667547, + "grad_norm": 0.3769257962703705, + "learning_rate": 4.81013806964724e-06, + "loss": 0.6471, + "step": 3302 + }, + { + "epoch": 0.7696908497747398, + "grad_norm": 0.3553674817085266, + "learning_rate": 4.810020987076841e-06, + "loss": 0.6398, + "step": 3303 + }, + { + "epoch": 0.7699238775827248, + "grad_norm": 0.33601364493370056, + "learning_rate": 4.809903869842623e-06, + "loss": 0.6357, + "step": 3304 + }, + { + "epoch": 0.7701569053907099, + "grad_norm": 0.33319151401519775, + "learning_rate": 4.809786717946345e-06, + "loss": 0.625, + "step": 3305 + }, + { + "epoch": 0.7703899331986951, + "grad_norm": 0.3358577787876129, + "learning_rate": 4.809669531389765e-06, + "loss": 0.6554, + "step": 3306 + }, + { + "epoch": 0.7706229610066802, + "grad_norm": 0.32989537715911865, + "learning_rate": 4.809552310174641e-06, + "loss": 0.638, + "step": 3307 + }, + { + "epoch": 0.7708559888146652, + "grad_norm": 0.34901347756385803, + "learning_rate": 4.809435054302732e-06, + "loss": 0.6471, + "step": 3308 + }, + { + "epoch": 0.7710890166226503, + "grad_norm": 0.3564160168170929, + "learning_rate": 4.809317763775797e-06, + "loss": 0.6649, + "step": 3309 + }, + { + "epoch": 0.7713220444306353, + "grad_norm": 0.3366166651248932, + "learning_rate": 4.809200438595596e-06, + "loss": 0.6369, + "step": 3310 + }, + { + "epoch": 0.7715550722386205, + "grad_norm": 0.3331778943538666, + "learning_rate": 4.809083078763891e-06, + "loss": 0.6509, + "step": 3311 + }, + { + "epoch": 0.7717881000466056, + "grad_norm": 0.3285420835018158, + "learning_rate": 4.808965684282442e-06, + "loss": 0.6633, + "step": 3312 + }, + { + "epoch": 0.7720211278545906, + "grad_norm": 0.338467001914978, + "learning_rate": 4.808848255153011e-06, + "loss": 0.6554, + "step": 3313 + }, + { + "epoch": 0.7722541556625757, + "grad_norm": 0.35426223278045654, + "learning_rate": 4.80873079137736e-06, + "loss": 0.6116, + "step": 3314 + }, + { + "epoch": 0.7724871834705608, + "grad_norm": 0.34492039680480957, + "learning_rate": 4.808613292957252e-06, + "loss": 0.6703, + "step": 3315 + }, + { + "epoch": 0.772720211278546, + "grad_norm": 0.33859747648239136, + "learning_rate": 4.808495759894448e-06, + "loss": 0.6418, + "step": 3316 + }, + { + "epoch": 0.772953239086531, + "grad_norm": 0.3504963219165802, + "learning_rate": 4.808378192190715e-06, + "loss": 0.6597, + "step": 3317 + }, + { + "epoch": 0.7731862668945161, + "grad_norm": 0.3513990640640259, + "learning_rate": 4.808260589847815e-06, + "loss": 0.6557, + "step": 3318 + }, + { + "epoch": 0.7734192947025011, + "grad_norm": 0.3302898108959198, + "learning_rate": 4.808142952867513e-06, + "loss": 0.6845, + "step": 3319 + }, + { + "epoch": 0.7736523225104862, + "grad_norm": 0.35810258984565735, + "learning_rate": 4.808025281251576e-06, + "loss": 0.667, + "step": 3320 + }, + { + "epoch": 0.7738853503184714, + "grad_norm": 0.33869293332099915, + "learning_rate": 4.807907575001767e-06, + "loss": 0.6283, + "step": 3321 + }, + { + "epoch": 0.7741183781264565, + "grad_norm": 0.33658963441848755, + "learning_rate": 4.8077898341198544e-06, + "loss": 0.6323, + "step": 3322 + }, + { + "epoch": 0.7743514059344415, + "grad_norm": 0.3501031696796417, + "learning_rate": 4.807672058607604e-06, + "loss": 0.6522, + "step": 3323 + }, + { + "epoch": 0.7745844337424266, + "grad_norm": 0.3606230616569519, + "learning_rate": 4.8075542484667825e-06, + "loss": 0.6407, + "step": 3324 + }, + { + "epoch": 0.7748174615504116, + "grad_norm": 0.34794485569000244, + "learning_rate": 4.8074364036991596e-06, + "loss": 0.6419, + "step": 3325 + }, + { + "epoch": 0.7750504893583968, + "grad_norm": 0.333779513835907, + "learning_rate": 4.807318524306503e-06, + "loss": 0.6531, + "step": 3326 + }, + { + "epoch": 0.7752835171663819, + "grad_norm": 0.3404308259487152, + "learning_rate": 4.8072006102905795e-06, + "loss": 0.6364, + "step": 3327 + }, + { + "epoch": 0.775516544974367, + "grad_norm": 0.36359313130378723, + "learning_rate": 4.807082661653162e-06, + "loss": 0.6223, + "step": 3328 + }, + { + "epoch": 0.775749572782352, + "grad_norm": 0.34931132197380066, + "learning_rate": 4.806964678396018e-06, + "loss": 0.6472, + "step": 3329 + }, + { + "epoch": 0.7759826005903371, + "grad_norm": 0.3372730314731598, + "learning_rate": 4.806846660520918e-06, + "loss": 0.6253, + "step": 3330 + }, + { + "epoch": 0.7762156283983223, + "grad_norm": 0.34978267550468445, + "learning_rate": 4.806728608029635e-06, + "loss": 0.6442, + "step": 3331 + }, + { + "epoch": 0.7764486562063073, + "grad_norm": 0.3448777496814728, + "learning_rate": 4.806610520923938e-06, + "loss": 0.638, + "step": 3332 + }, + { + "epoch": 0.7766816840142924, + "grad_norm": 0.3575165271759033, + "learning_rate": 4.806492399205601e-06, + "loss": 0.6319, + "step": 3333 + }, + { + "epoch": 0.7769147118222774, + "grad_norm": 0.3343985378742218, + "learning_rate": 4.806374242876395e-06, + "loss": 0.6408, + "step": 3334 + }, + { + "epoch": 0.7771477396302625, + "grad_norm": 0.34966790676116943, + "learning_rate": 4.806256051938093e-06, + "loss": 0.6213, + "step": 3335 + }, + { + "epoch": 0.7773807674382476, + "grad_norm": 0.3388005495071411, + "learning_rate": 4.8061378263924696e-06, + "loss": 0.6223, + "step": 3336 + }, + { + "epoch": 0.7776137952462328, + "grad_norm": 0.3339974284172058, + "learning_rate": 4.806019566241298e-06, + "loss": 0.6535, + "step": 3337 + }, + { + "epoch": 0.7778468230542178, + "grad_norm": 0.34717074036598206, + "learning_rate": 4.805901271486354e-06, + "loss": 0.6569, + "step": 3338 + }, + { + "epoch": 0.7780798508622029, + "grad_norm": 0.34198662638664246, + "learning_rate": 4.805782942129411e-06, + "loss": 0.6295, + "step": 3339 + }, + { + "epoch": 0.778312878670188, + "grad_norm": 0.35474860668182373, + "learning_rate": 4.805664578172246e-06, + "loss": 0.6562, + "step": 3340 + }, + { + "epoch": 0.778545906478173, + "grad_norm": 0.34749650955200195, + "learning_rate": 4.805546179616635e-06, + "loss": 0.6286, + "step": 3341 + }, + { + "epoch": 0.7787789342861582, + "grad_norm": 0.33356916904449463, + "learning_rate": 4.805427746464354e-06, + "loss": 0.6358, + "step": 3342 + }, + { + "epoch": 0.7790119620941433, + "grad_norm": 0.34433960914611816, + "learning_rate": 4.805309278717181e-06, + "loss": 0.6608, + "step": 3343 + }, + { + "epoch": 0.7792449899021283, + "grad_norm": 0.3433540165424347, + "learning_rate": 4.805190776376892e-06, + "loss": 0.6522, + "step": 3344 + }, + { + "epoch": 0.7794780177101134, + "grad_norm": 0.3446847200393677, + "learning_rate": 4.805072239445267e-06, + "loss": 0.6493, + "step": 3345 + }, + { + "epoch": 0.7797110455180984, + "grad_norm": 0.3348809480667114, + "learning_rate": 4.804953667924085e-06, + "loss": 0.6282, + "step": 3346 + }, + { + "epoch": 0.7799440733260836, + "grad_norm": 0.34557926654815674, + "learning_rate": 4.8048350618151245e-06, + "loss": 0.6214, + "step": 3347 + }, + { + "epoch": 0.7801771011340687, + "grad_norm": 0.3284865915775299, + "learning_rate": 4.804716421120164e-06, + "loss": 0.6243, + "step": 3348 + }, + { + "epoch": 0.7804101289420537, + "grad_norm": 0.3499979078769684, + "learning_rate": 4.804597745840986e-06, + "loss": 0.6396, + "step": 3349 + }, + { + "epoch": 0.7806431567500388, + "grad_norm": 0.350607305765152, + "learning_rate": 4.8044790359793715e-06, + "loss": 0.6405, + "step": 3350 + }, + { + "epoch": 0.7808761845580239, + "grad_norm": 0.3477213382720947, + "learning_rate": 4.8043602915370995e-06, + "loss": 0.6105, + "step": 3351 + }, + { + "epoch": 0.781109212366009, + "grad_norm": 0.34984785318374634, + "learning_rate": 4.8042415125159535e-06, + "loss": 0.6515, + "step": 3352 + }, + { + "epoch": 0.7813422401739941, + "grad_norm": 0.33690351247787476, + "learning_rate": 4.804122698917715e-06, + "loss": 0.644, + "step": 3353 + }, + { + "epoch": 0.7815752679819792, + "grad_norm": 0.3436413109302521, + "learning_rate": 4.804003850744168e-06, + "loss": 0.6219, + "step": 3354 + }, + { + "epoch": 0.7818082957899642, + "grad_norm": 0.3376931846141815, + "learning_rate": 4.803884967997095e-06, + "loss": 0.6543, + "step": 3355 + }, + { + "epoch": 0.7820413235979493, + "grad_norm": 0.3458283841609955, + "learning_rate": 4.80376605067828e-06, + "loss": 0.6284, + "step": 3356 + }, + { + "epoch": 0.7822743514059345, + "grad_norm": 0.34370461106300354, + "learning_rate": 4.803647098789508e-06, + "loss": 0.5996, + "step": 3357 + }, + { + "epoch": 0.7825073792139196, + "grad_norm": 0.3493337035179138, + "learning_rate": 4.803528112332564e-06, + "loss": 0.6351, + "step": 3358 + }, + { + "epoch": 0.7827404070219046, + "grad_norm": 0.3461698293685913, + "learning_rate": 4.803409091309233e-06, + "loss": 0.6455, + "step": 3359 + }, + { + "epoch": 0.7829734348298897, + "grad_norm": 0.3432633578777313, + "learning_rate": 4.803290035721301e-06, + "loss": 0.6295, + "step": 3360 + }, + { + "epoch": 0.7832064626378747, + "grad_norm": 0.3374421000480652, + "learning_rate": 4.803170945570555e-06, + "loss": 0.6573, + "step": 3361 + }, + { + "epoch": 0.7834394904458599, + "grad_norm": 0.3490377366542816, + "learning_rate": 4.80305182085878e-06, + "loss": 0.6559, + "step": 3362 + }, + { + "epoch": 0.783672518253845, + "grad_norm": 0.333173543214798, + "learning_rate": 4.802932661587768e-06, + "loss": 0.6331, + "step": 3363 + }, + { + "epoch": 0.78390554606183, + "grad_norm": 0.3467191159725189, + "learning_rate": 4.802813467759303e-06, + "loss": 0.6502, + "step": 3364 + }, + { + "epoch": 0.7841385738698151, + "grad_norm": 0.3380971848964691, + "learning_rate": 4.802694239375175e-06, + "loss": 0.6506, + "step": 3365 + }, + { + "epoch": 0.7843716016778002, + "grad_norm": 0.33960583806037903, + "learning_rate": 4.8025749764371734e-06, + "loss": 0.6374, + "step": 3366 + }, + { + "epoch": 0.7846046294857854, + "grad_norm": 0.34525594115257263, + "learning_rate": 4.8024556789470876e-06, + "loss": 0.6481, + "step": 3367 + }, + { + "epoch": 0.7848376572937704, + "grad_norm": 0.33117812871932983, + "learning_rate": 4.802336346906708e-06, + "loss": 0.59, + "step": 3368 + }, + { + "epoch": 0.7850706851017555, + "grad_norm": 0.34371519088745117, + "learning_rate": 4.802216980317825e-06, + "loss": 0.6708, + "step": 3369 + }, + { + "epoch": 0.7853037129097405, + "grad_norm": 0.3465151786804199, + "learning_rate": 4.80209757918223e-06, + "loss": 0.6375, + "step": 3370 + }, + { + "epoch": 0.7855367407177256, + "grad_norm": 0.35109269618988037, + "learning_rate": 4.801978143501714e-06, + "loss": 0.648, + "step": 3371 + }, + { + "epoch": 0.7857697685257108, + "grad_norm": 0.3426593840122223, + "learning_rate": 4.801858673278069e-06, + "loss": 0.6265, + "step": 3372 + }, + { + "epoch": 0.7860027963336959, + "grad_norm": 0.344473272562027, + "learning_rate": 4.80173916851309e-06, + "loss": 0.6107, + "step": 3373 + }, + { + "epoch": 0.7862358241416809, + "grad_norm": 0.3430682122707367, + "learning_rate": 4.801619629208568e-06, + "loss": 0.6628, + "step": 3374 + }, + { + "epoch": 0.786468851949666, + "grad_norm": 0.3250817656517029, + "learning_rate": 4.801500055366298e-06, + "loss": 0.6319, + "step": 3375 + }, + { + "epoch": 0.786701879757651, + "grad_norm": 0.35907450318336487, + "learning_rate": 4.801380446988073e-06, + "loss": 0.6555, + "step": 3376 + }, + { + "epoch": 0.7869349075656362, + "grad_norm": 0.353375643491745, + "learning_rate": 4.801260804075691e-06, + "loss": 0.6468, + "step": 3377 + }, + { + "epoch": 0.7871679353736213, + "grad_norm": 0.34833234548568726, + "learning_rate": 4.801141126630943e-06, + "loss": 0.6544, + "step": 3378 + }, + { + "epoch": 0.7874009631816064, + "grad_norm": 0.33815374970436096, + "learning_rate": 4.801021414655627e-06, + "loss": 0.6251, + "step": 3379 + }, + { + "epoch": 0.7876339909895914, + "grad_norm": 0.36220574378967285, + "learning_rate": 4.80090166815154e-06, + "loss": 0.626, + "step": 3380 + }, + { + "epoch": 0.7878670187975765, + "grad_norm": 0.35337746143341064, + "learning_rate": 4.800781887120477e-06, + "loss": 0.643, + "step": 3381 + }, + { + "epoch": 0.7881000466055615, + "grad_norm": 0.35741761326789856, + "learning_rate": 4.800662071564237e-06, + "loss": 0.6556, + "step": 3382 + }, + { + "epoch": 0.7883330744135467, + "grad_norm": 0.343038409948349, + "learning_rate": 4.800542221484619e-06, + "loss": 0.6389, + "step": 3383 + }, + { + "epoch": 0.7885661022215318, + "grad_norm": 0.3472886085510254, + "learning_rate": 4.8004223368834185e-06, + "loss": 0.6343, + "step": 3384 + }, + { + "epoch": 0.7887991300295168, + "grad_norm": 0.34490400552749634, + "learning_rate": 4.800302417762437e-06, + "loss": 0.6355, + "step": 3385 + }, + { + "epoch": 0.7890321578375019, + "grad_norm": 0.34558990597724915, + "learning_rate": 4.800182464123472e-06, + "loss": 0.6571, + "step": 3386 + }, + { + "epoch": 0.789265185645487, + "grad_norm": 0.3421529233455658, + "learning_rate": 4.800062475968324e-06, + "loss": 0.6451, + "step": 3387 + }, + { + "epoch": 0.7894982134534722, + "grad_norm": 0.3589954376220703, + "learning_rate": 4.799942453298794e-06, + "loss": 0.6384, + "step": 3388 + }, + { + "epoch": 0.7897312412614572, + "grad_norm": 0.3614846169948578, + "learning_rate": 4.799822396116685e-06, + "loss": 0.6379, + "step": 3389 + }, + { + "epoch": 0.7899642690694423, + "grad_norm": 0.33443912863731384, + "learning_rate": 4.799702304423796e-06, + "loss": 0.6463, + "step": 3390 + }, + { + "epoch": 0.7901972968774273, + "grad_norm": 0.34689754247665405, + "learning_rate": 4.799582178221929e-06, + "loss": 0.635, + "step": 3391 + }, + { + "epoch": 0.7904303246854124, + "grad_norm": 0.35653364658355713, + "learning_rate": 4.799462017512887e-06, + "loss": 0.6504, + "step": 3392 + }, + { + "epoch": 0.7906633524933976, + "grad_norm": 0.35625550150871277, + "learning_rate": 4.7993418222984735e-06, + "loss": 0.6696, + "step": 3393 + }, + { + "epoch": 0.7908963803013827, + "grad_norm": 0.34109050035476685, + "learning_rate": 4.799221592580492e-06, + "loss": 0.6354, + "step": 3394 + }, + { + "epoch": 0.7911294081093677, + "grad_norm": 0.3542843461036682, + "learning_rate": 4.799101328360747e-06, + "loss": 0.6066, + "step": 3395 + }, + { + "epoch": 0.7913624359173528, + "grad_norm": 0.34802430868148804, + "learning_rate": 4.798981029641042e-06, + "loss": 0.6651, + "step": 3396 + }, + { + "epoch": 0.7915954637253378, + "grad_norm": 0.3436896502971649, + "learning_rate": 4.798860696423184e-06, + "loss": 0.6151, + "step": 3397 + }, + { + "epoch": 0.791828491533323, + "grad_norm": 0.34335216879844666, + "learning_rate": 4.798740328708978e-06, + "loss": 0.6324, + "step": 3398 + }, + { + "epoch": 0.7920615193413081, + "grad_norm": 0.36352935433387756, + "learning_rate": 4.798619926500229e-06, + "loss": 0.6345, + "step": 3399 + }, + { + "epoch": 0.7922945471492931, + "grad_norm": 0.33290818333625793, + "learning_rate": 4.7984994897987445e-06, + "loss": 0.6286, + "step": 3400 + }, + { + "epoch": 0.7925275749572782, + "grad_norm": 0.3587884306907654, + "learning_rate": 4.798379018606332e-06, + "loss": 0.6644, + "step": 3401 + }, + { + "epoch": 0.7927606027652633, + "grad_norm": 0.3574630618095398, + "learning_rate": 4.7982585129248e-06, + "loss": 0.6685, + "step": 3402 + }, + { + "epoch": 0.7929936305732485, + "grad_norm": 0.3500847816467285, + "learning_rate": 4.798137972755955e-06, + "loss": 0.6527, + "step": 3403 + }, + { + "epoch": 0.7932266583812335, + "grad_norm": 0.3590225875377655, + "learning_rate": 4.798017398101607e-06, + "loss": 0.6645, + "step": 3404 + }, + { + "epoch": 0.7934596861892186, + "grad_norm": 0.3463137745857239, + "learning_rate": 4.797896788963565e-06, + "loss": 0.6339, + "step": 3405 + }, + { + "epoch": 0.7936927139972036, + "grad_norm": 0.35178813338279724, + "learning_rate": 4.79777614534364e-06, + "loss": 0.6462, + "step": 3406 + }, + { + "epoch": 0.7939257418051887, + "grad_norm": 0.36860114336013794, + "learning_rate": 4.797655467243641e-06, + "loss": 0.6308, + "step": 3407 + }, + { + "epoch": 0.7941587696131739, + "grad_norm": 0.3418652415275574, + "learning_rate": 4.797534754665378e-06, + "loss": 0.6523, + "step": 3408 + }, + { + "epoch": 0.794391797421159, + "grad_norm": 0.3461449444293976, + "learning_rate": 4.797414007610664e-06, + "loss": 0.6528, + "step": 3409 + }, + { + "epoch": 0.794624825229144, + "grad_norm": 0.34676727652549744, + "learning_rate": 4.797293226081311e-06, + "loss": 0.6294, + "step": 3410 + }, + { + "epoch": 0.7948578530371291, + "grad_norm": 0.33554011583328247, + "learning_rate": 4.797172410079131e-06, + "loss": 0.6374, + "step": 3411 + }, + { + "epoch": 0.7950908808451141, + "grad_norm": 0.34221649169921875, + "learning_rate": 4.797051559605936e-06, + "loss": 0.6303, + "step": 3412 + }, + { + "epoch": 0.7953239086530993, + "grad_norm": 0.34373748302459717, + "learning_rate": 4.796930674663542e-06, + "loss": 0.6455, + "step": 3413 + }, + { + "epoch": 0.7955569364610844, + "grad_norm": 0.34897828102111816, + "learning_rate": 4.7968097552537595e-06, + "loss": 0.6582, + "step": 3414 + }, + { + "epoch": 0.7957899642690694, + "grad_norm": 0.33766621351242065, + "learning_rate": 4.796688801378405e-06, + "loss": 0.6137, + "step": 3415 + }, + { + "epoch": 0.7960229920770545, + "grad_norm": 0.33797985315322876, + "learning_rate": 4.796567813039294e-06, + "loss": 0.6622, + "step": 3416 + }, + { + "epoch": 0.7962560198850396, + "grad_norm": 0.3486398756504059, + "learning_rate": 4.796446790238241e-06, + "loss": 0.6514, + "step": 3417 + }, + { + "epoch": 0.7964890476930248, + "grad_norm": 0.3488197326660156, + "learning_rate": 4.7963257329770626e-06, + "loss": 0.6507, + "step": 3418 + }, + { + "epoch": 0.7967220755010098, + "grad_norm": 0.3402903974056244, + "learning_rate": 4.796204641257574e-06, + "loss": 0.6238, + "step": 3419 + }, + { + "epoch": 0.7969551033089949, + "grad_norm": 0.3529575765132904, + "learning_rate": 4.796083515081596e-06, + "loss": 0.6598, + "step": 3420 + }, + { + "epoch": 0.79718813111698, + "grad_norm": 0.35335344076156616, + "learning_rate": 4.795962354450942e-06, + "loss": 0.6655, + "step": 3421 + }, + { + "epoch": 0.797421158924965, + "grad_norm": 0.343791663646698, + "learning_rate": 4.795841159367432e-06, + "loss": 0.6452, + "step": 3422 + }, + { + "epoch": 0.7976541867329501, + "grad_norm": 0.35568544268608093, + "learning_rate": 4.7957199298328835e-06, + "loss": 0.6471, + "step": 3423 + }, + { + "epoch": 0.7978872145409353, + "grad_norm": 0.3370612859725952, + "learning_rate": 4.795598665849118e-06, + "loss": 0.6434, + "step": 3424 + }, + { + "epoch": 0.7981202423489203, + "grad_norm": 0.3526553511619568, + "learning_rate": 4.795477367417953e-06, + "loss": 0.6467, + "step": 3425 + }, + { + "epoch": 0.7983532701569054, + "grad_norm": 0.331854909658432, + "learning_rate": 4.795356034541209e-06, + "loss": 0.6436, + "step": 3426 + }, + { + "epoch": 0.7985862979648904, + "grad_norm": 0.3477550745010376, + "learning_rate": 4.795234667220709e-06, + "loss": 0.6479, + "step": 3427 + }, + { + "epoch": 0.7988193257728755, + "grad_norm": 0.3347671627998352, + "learning_rate": 4.7951132654582704e-06, + "loss": 0.6354, + "step": 3428 + }, + { + "epoch": 0.7990523535808607, + "grad_norm": 0.3578953742980957, + "learning_rate": 4.794991829255717e-06, + "loss": 0.6603, + "step": 3429 + }, + { + "epoch": 0.7992853813888458, + "grad_norm": 0.3546769320964813, + "learning_rate": 4.794870358614871e-06, + "loss": 0.6398, + "step": 3430 + }, + { + "epoch": 0.7995184091968308, + "grad_norm": 0.34097540378570557, + "learning_rate": 4.794748853537556e-06, + "loss": 0.643, + "step": 3431 + }, + { + "epoch": 0.7997514370048159, + "grad_norm": 0.34203553199768066, + "learning_rate": 4.7946273140255935e-06, + "loss": 0.6446, + "step": 3432 + }, + { + "epoch": 0.7999844648128009, + "grad_norm": 0.35310596227645874, + "learning_rate": 4.794505740080809e-06, + "loss": 0.6174, + "step": 3433 + }, + { + "epoch": 0.8002174926207861, + "grad_norm": 0.3516800105571747, + "learning_rate": 4.794384131705026e-06, + "loss": 0.6403, + "step": 3434 + }, + { + "epoch": 0.8004505204287712, + "grad_norm": 0.34104123711586, + "learning_rate": 4.7942624889000685e-06, + "loss": 0.6208, + "step": 3435 + }, + { + "epoch": 0.8006835482367562, + "grad_norm": 0.36415594816207886, + "learning_rate": 4.794140811667763e-06, + "loss": 0.6413, + "step": 3436 + }, + { + "epoch": 0.8009165760447413, + "grad_norm": 0.35284191370010376, + "learning_rate": 4.794019100009935e-06, + "loss": 0.6124, + "step": 3437 + }, + { + "epoch": 0.8011496038527264, + "grad_norm": 0.34393295645713806, + "learning_rate": 4.793897353928411e-06, + "loss": 0.628, + "step": 3438 + }, + { + "epoch": 0.8013826316607116, + "grad_norm": 0.3540259897708893, + "learning_rate": 4.793775573425018e-06, + "loss": 0.6664, + "step": 3439 + }, + { + "epoch": 0.8016156594686966, + "grad_norm": 0.33636727929115295, + "learning_rate": 4.793653758501583e-06, + "loss": 0.6202, + "step": 3440 + }, + { + "epoch": 0.8018486872766817, + "grad_norm": 0.3374824523925781, + "learning_rate": 4.793531909159934e-06, + "loss": 0.6342, + "step": 3441 + }, + { + "epoch": 0.8020817150846667, + "grad_norm": 0.35357093811035156, + "learning_rate": 4.7934100254019e-06, + "loss": 0.637, + "step": 3442 + }, + { + "epoch": 0.8023147428926518, + "grad_norm": 0.3415897488594055, + "learning_rate": 4.79328810722931e-06, + "loss": 0.6639, + "step": 3443 + }, + { + "epoch": 0.802547770700637, + "grad_norm": 0.3494749963283539, + "learning_rate": 4.793166154643993e-06, + "loss": 0.6306, + "step": 3444 + }, + { + "epoch": 0.802780798508622, + "grad_norm": 0.33512672781944275, + "learning_rate": 4.793044167647778e-06, + "loss": 0.6422, + "step": 3445 + }, + { + "epoch": 0.8030138263166071, + "grad_norm": 0.34917211532592773, + "learning_rate": 4.792922146242498e-06, + "loss": 0.654, + "step": 3446 + }, + { + "epoch": 0.8032468541245922, + "grad_norm": 0.35369348526000977, + "learning_rate": 4.792800090429982e-06, + "loss": 0.6499, + "step": 3447 + }, + { + "epoch": 0.8034798819325772, + "grad_norm": 0.3361832797527313, + "learning_rate": 4.792678000212063e-06, + "loss": 0.6454, + "step": 3448 + }, + { + "epoch": 0.8037129097405624, + "grad_norm": 0.3295805752277374, + "learning_rate": 4.792555875590571e-06, + "loss": 0.6283, + "step": 3449 + }, + { + "epoch": 0.8039459375485475, + "grad_norm": 0.3366629183292389, + "learning_rate": 4.792433716567341e-06, + "loss": 0.641, + "step": 3450 + }, + { + "epoch": 0.8041789653565325, + "grad_norm": 0.3441862165927887, + "learning_rate": 4.792311523144204e-06, + "loss": 0.5955, + "step": 3451 + }, + { + "epoch": 0.8044119931645176, + "grad_norm": 0.3561890125274658, + "learning_rate": 4.792189295322995e-06, + "loss": 0.659, + "step": 3452 + }, + { + "epoch": 0.8046450209725027, + "grad_norm": 0.34788811206817627, + "learning_rate": 4.792067033105548e-06, + "loss": 0.6566, + "step": 3453 + }, + { + "epoch": 0.8048780487804879, + "grad_norm": 0.34147217869758606, + "learning_rate": 4.791944736493697e-06, + "loss": 0.6561, + "step": 3454 + }, + { + "epoch": 0.8051110765884729, + "grad_norm": 0.35561302304267883, + "learning_rate": 4.791822405489278e-06, + "loss": 0.6595, + "step": 3455 + }, + { + "epoch": 0.805344104396458, + "grad_norm": 0.3585987985134125, + "learning_rate": 4.791700040094125e-06, + "loss": 0.6592, + "step": 3456 + }, + { + "epoch": 0.805577132204443, + "grad_norm": 0.35012462735176086, + "learning_rate": 4.791577640310077e-06, + "loss": 0.6542, + "step": 3457 + }, + { + "epoch": 0.8058101600124281, + "grad_norm": 0.3354967534542084, + "learning_rate": 4.791455206138968e-06, + "loss": 0.6167, + "step": 3458 + }, + { + "epoch": 0.8060431878204133, + "grad_norm": 0.3484934866428375, + "learning_rate": 4.7913327375826365e-06, + "loss": 0.6684, + "step": 3459 + }, + { + "epoch": 0.8062762156283984, + "grad_norm": 0.3459482192993164, + "learning_rate": 4.7912102346429205e-06, + "loss": 0.6678, + "step": 3460 + }, + { + "epoch": 0.8065092434363834, + "grad_norm": 0.3444315195083618, + "learning_rate": 4.791087697321657e-06, + "loss": 0.6305, + "step": 3461 + }, + { + "epoch": 0.8067422712443685, + "grad_norm": 0.3545803427696228, + "learning_rate": 4.790965125620687e-06, + "loss": 0.6354, + "step": 3462 + }, + { + "epoch": 0.8069752990523535, + "grad_norm": 0.3469056785106659, + "learning_rate": 4.7908425195418475e-06, + "loss": 0.6539, + "step": 3463 + }, + { + "epoch": 0.8072083268603387, + "grad_norm": 0.35787448287010193, + "learning_rate": 4.790719879086979e-06, + "loss": 0.6408, + "step": 3464 + }, + { + "epoch": 0.8074413546683238, + "grad_norm": 0.3311176002025604, + "learning_rate": 4.790597204257922e-06, + "loss": 0.6264, + "step": 3465 + }, + { + "epoch": 0.8076743824763088, + "grad_norm": 0.3681873679161072, + "learning_rate": 4.790474495056519e-06, + "loss": 0.642, + "step": 3466 + }, + { + "epoch": 0.8079074102842939, + "grad_norm": 0.33507776260375977, + "learning_rate": 4.790351751484608e-06, + "loss": 0.6458, + "step": 3467 + }, + { + "epoch": 0.808140438092279, + "grad_norm": 0.3315590023994446, + "learning_rate": 4.790228973544033e-06, + "loss": 0.6229, + "step": 3468 + }, + { + "epoch": 0.808373465900264, + "grad_norm": 0.3723466396331787, + "learning_rate": 4.790106161236636e-06, + "loss": 0.6846, + "step": 3469 + }, + { + "epoch": 0.8086064937082492, + "grad_norm": 0.33870378136634827, + "learning_rate": 4.78998331456426e-06, + "loss": 0.6332, + "step": 3470 + }, + { + "epoch": 0.8088395215162343, + "grad_norm": 0.33959805965423584, + "learning_rate": 4.789860433528748e-06, + "loss": 0.6479, + "step": 3471 + }, + { + "epoch": 0.8090725493242193, + "grad_norm": 0.3452370762825012, + "learning_rate": 4.789737518131945e-06, + "loss": 0.648, + "step": 3472 + }, + { + "epoch": 0.8093055771322044, + "grad_norm": 0.34398525953292847, + "learning_rate": 4.789614568375694e-06, + "loss": 0.6383, + "step": 3473 + }, + { + "epoch": 0.8095386049401895, + "grad_norm": 0.33644118905067444, + "learning_rate": 4.78949158426184e-06, + "loss": 0.6515, + "step": 3474 + }, + { + "epoch": 0.8097716327481747, + "grad_norm": 0.3495814800262451, + "learning_rate": 4.78936856579223e-06, + "loss": 0.6558, + "step": 3475 + }, + { + "epoch": 0.8100046605561597, + "grad_norm": 0.33554190397262573, + "learning_rate": 4.78924551296871e-06, + "loss": 0.6463, + "step": 3476 + }, + { + "epoch": 0.8102376883641448, + "grad_norm": 0.34942367672920227, + "learning_rate": 4.789122425793124e-06, + "loss": 0.6294, + "step": 3477 + }, + { + "epoch": 0.8104707161721298, + "grad_norm": 0.36052700877189636, + "learning_rate": 4.788999304267322e-06, + "loss": 0.6416, + "step": 3478 + }, + { + "epoch": 0.8107037439801149, + "grad_norm": 0.34621015191078186, + "learning_rate": 4.78887614839315e-06, + "loss": 0.6537, + "step": 3479 + }, + { + "epoch": 0.8109367717881001, + "grad_norm": 0.3436248302459717, + "learning_rate": 4.788752958172456e-06, + "loss": 0.6319, + "step": 3480 + }, + { + "epoch": 0.8111697995960852, + "grad_norm": 0.33372271060943604, + "learning_rate": 4.788629733607089e-06, + "loss": 0.6388, + "step": 3481 + }, + { + "epoch": 0.8114028274040702, + "grad_norm": 0.3615459203720093, + "learning_rate": 4.788506474698898e-06, + "loss": 0.639, + "step": 3482 + }, + { + "epoch": 0.8116358552120553, + "grad_norm": 0.3467318117618561, + "learning_rate": 4.788383181449733e-06, + "loss": 0.6642, + "step": 3483 + }, + { + "epoch": 0.8118688830200403, + "grad_norm": 0.33406418561935425, + "learning_rate": 4.788259853861442e-06, + "loss": 0.615, + "step": 3484 + }, + { + "epoch": 0.8121019108280255, + "grad_norm": 0.3649868071079254, + "learning_rate": 4.7881364919358785e-06, + "loss": 0.6514, + "step": 3485 + }, + { + "epoch": 0.8123349386360106, + "grad_norm": 0.34707707166671753, + "learning_rate": 4.7880130956748925e-06, + "loss": 0.6419, + "step": 3486 + }, + { + "epoch": 0.8125679664439956, + "grad_norm": 0.33541181683540344, + "learning_rate": 4.787889665080336e-06, + "loss": 0.6227, + "step": 3487 + }, + { + "epoch": 0.8128009942519807, + "grad_norm": 0.3467240333557129, + "learning_rate": 4.7877662001540595e-06, + "loss": 0.6241, + "step": 3488 + }, + { + "epoch": 0.8130340220599658, + "grad_norm": 0.34976351261138916, + "learning_rate": 4.787642700897918e-06, + "loss": 0.6589, + "step": 3489 + }, + { + "epoch": 0.813267049867951, + "grad_norm": 0.32904309034347534, + "learning_rate": 4.787519167313763e-06, + "loss": 0.6379, + "step": 3490 + }, + { + "epoch": 0.813500077675936, + "grad_norm": 0.338296502828598, + "learning_rate": 4.787395599403449e-06, + "loss": 0.6576, + "step": 3491 + }, + { + "epoch": 0.8137331054839211, + "grad_norm": 0.33876076340675354, + "learning_rate": 4.7872719971688305e-06, + "loss": 0.6244, + "step": 3492 + }, + { + "epoch": 0.8139661332919061, + "grad_norm": 0.345537394285202, + "learning_rate": 4.787148360611763e-06, + "loss": 0.6546, + "step": 3493 + }, + { + "epoch": 0.8141991610998912, + "grad_norm": 0.34872615337371826, + "learning_rate": 4.787024689734099e-06, + "loss": 0.6691, + "step": 3494 + }, + { + "epoch": 0.8144321889078764, + "grad_norm": 0.33204713463783264, + "learning_rate": 4.7869009845376966e-06, + "loss": 0.6281, + "step": 3495 + }, + { + "epoch": 0.8146652167158615, + "grad_norm": 0.33278027176856995, + "learning_rate": 4.786777245024412e-06, + "loss": 0.611, + "step": 3496 + }, + { + "epoch": 0.8148982445238465, + "grad_norm": 0.3494824469089508, + "learning_rate": 4.786653471196101e-06, + "loss": 0.616, + "step": 3497 + }, + { + "epoch": 0.8151312723318316, + "grad_norm": 0.3493565618991852, + "learning_rate": 4.786529663054621e-06, + "loss": 0.6512, + "step": 3498 + }, + { + "epoch": 0.8153643001398166, + "grad_norm": 0.3456593155860901, + "learning_rate": 4.7864058206018304e-06, + "loss": 0.6396, + "step": 3499 + }, + { + "epoch": 0.8155973279478018, + "grad_norm": 0.350557416677475, + "learning_rate": 4.786281943839588e-06, + "loss": 0.6523, + "step": 3500 + }, + { + "epoch": 0.8158303557557869, + "grad_norm": 0.3375759720802307, + "learning_rate": 4.7861580327697524e-06, + "loss": 0.6392, + "step": 3501 + }, + { + "epoch": 0.816063383563772, + "grad_norm": 0.34060508012771606, + "learning_rate": 4.786034087394182e-06, + "loss": 0.6553, + "step": 3502 + }, + { + "epoch": 0.816296411371757, + "grad_norm": 0.3507816791534424, + "learning_rate": 4.785910107714737e-06, + "loss": 0.646, + "step": 3503 + }, + { + "epoch": 0.8165294391797421, + "grad_norm": 0.3531794846057892, + "learning_rate": 4.785786093733279e-06, + "loss": 0.6329, + "step": 3504 + }, + { + "epoch": 0.8167624669877273, + "grad_norm": 0.33740612864494324, + "learning_rate": 4.785662045451668e-06, + "loss": 0.659, + "step": 3505 + }, + { + "epoch": 0.8169954947957123, + "grad_norm": 0.33168211579322815, + "learning_rate": 4.785537962871766e-06, + "loss": 0.6433, + "step": 3506 + }, + { + "epoch": 0.8172285226036974, + "grad_norm": 0.3485158085823059, + "learning_rate": 4.785413845995433e-06, + "loss": 0.6672, + "step": 3507 + }, + { + "epoch": 0.8174615504116824, + "grad_norm": 0.35644084215164185, + "learning_rate": 4.785289694824536e-06, + "loss": 0.6487, + "step": 3508 + }, + { + "epoch": 0.8176945782196675, + "grad_norm": 0.3270479738712311, + "learning_rate": 4.785165509360932e-06, + "loss": 0.6129, + "step": 3509 + }, + { + "epoch": 0.8179276060276527, + "grad_norm": 0.3597686290740967, + "learning_rate": 4.785041289606489e-06, + "loss": 0.6598, + "step": 3510 + }, + { + "epoch": 0.8181606338356378, + "grad_norm": 0.33789029717445374, + "learning_rate": 4.784917035563069e-06, + "loss": 0.6145, + "step": 3511 + }, + { + "epoch": 0.8183936616436228, + "grad_norm": 0.3503327965736389, + "learning_rate": 4.784792747232538e-06, + "loss": 0.657, + "step": 3512 + }, + { + "epoch": 0.8186266894516079, + "grad_norm": 0.3403548300266266, + "learning_rate": 4.784668424616759e-06, + "loss": 0.6348, + "step": 3513 + }, + { + "epoch": 0.818859717259593, + "grad_norm": 0.3634667992591858, + "learning_rate": 4.7845440677176e-06, + "loss": 0.6619, + "step": 3514 + }, + { + "epoch": 0.819092745067578, + "grad_norm": 0.3459954261779785, + "learning_rate": 4.784419676536926e-06, + "loss": 0.6235, + "step": 3515 + }, + { + "epoch": 0.8193257728755632, + "grad_norm": 0.3490508794784546, + "learning_rate": 4.784295251076601e-06, + "loss": 0.6402, + "step": 3516 + }, + { + "epoch": 0.8195588006835482, + "grad_norm": 0.35218897461891174, + "learning_rate": 4.7841707913384976e-06, + "loss": 0.6207, + "step": 3517 + }, + { + "epoch": 0.8197918284915333, + "grad_norm": 0.3641635477542877, + "learning_rate": 4.784046297324478e-06, + "loss": 0.6402, + "step": 3518 + }, + { + "epoch": 0.8200248562995184, + "grad_norm": 0.3427915573120117, + "learning_rate": 4.783921769036414e-06, + "loss": 0.6376, + "step": 3519 + }, + { + "epoch": 0.8202578841075034, + "grad_norm": 0.3421076238155365, + "learning_rate": 4.7837972064761715e-06, + "loss": 0.6214, + "step": 3520 + }, + { + "epoch": 0.8204909119154886, + "grad_norm": 0.3693750500679016, + "learning_rate": 4.783672609645623e-06, + "loss": 0.6597, + "step": 3521 + }, + { + "epoch": 0.8207239397234737, + "grad_norm": 0.3425615429878235, + "learning_rate": 4.7835479785466355e-06, + "loss": 0.6528, + "step": 3522 + }, + { + "epoch": 0.8209569675314587, + "grad_norm": 0.3432477116584778, + "learning_rate": 4.7834233131810795e-06, + "loss": 0.6572, + "step": 3523 + }, + { + "epoch": 0.8211899953394438, + "grad_norm": 0.3543875217437744, + "learning_rate": 4.783298613550827e-06, + "loss": 0.6479, + "step": 3524 + }, + { + "epoch": 0.8214230231474289, + "grad_norm": 0.3713391125202179, + "learning_rate": 4.783173879657747e-06, + "loss": 0.6382, + "step": 3525 + }, + { + "epoch": 0.821656050955414, + "grad_norm": 0.34535127878189087, + "learning_rate": 4.783049111503713e-06, + "loss": 0.6728, + "step": 3526 + }, + { + "epoch": 0.8218890787633991, + "grad_norm": 0.37491628527641296, + "learning_rate": 4.782924309090598e-06, + "loss": 0.627, + "step": 3527 + }, + { + "epoch": 0.8221221065713842, + "grad_norm": 0.3445989787578583, + "learning_rate": 4.782799472420274e-06, + "loss": 0.6256, + "step": 3528 + }, + { + "epoch": 0.8223551343793692, + "grad_norm": 0.3363819718360901, + "learning_rate": 4.782674601494613e-06, + "loss": 0.6624, + "step": 3529 + }, + { + "epoch": 0.8225881621873543, + "grad_norm": 0.36682814359664917, + "learning_rate": 4.7825496963154895e-06, + "loss": 0.6235, + "step": 3530 + }, + { + "epoch": 0.8228211899953395, + "grad_norm": 0.3390207886695862, + "learning_rate": 4.782424756884778e-06, + "loss": 0.6301, + "step": 3531 + }, + { + "epoch": 0.8230542178033246, + "grad_norm": 0.3494780361652374, + "learning_rate": 4.782299783204354e-06, + "loss": 0.6264, + "step": 3532 + }, + { + "epoch": 0.8232872456113096, + "grad_norm": 0.3474465310573578, + "learning_rate": 4.7821747752760915e-06, + "loss": 0.6588, + "step": 3533 + }, + { + "epoch": 0.8235202734192947, + "grad_norm": 0.3526511788368225, + "learning_rate": 4.782049733101868e-06, + "loss": 0.6496, + "step": 3534 + }, + { + "epoch": 0.8237533012272797, + "grad_norm": 0.3634152412414551, + "learning_rate": 4.781924656683558e-06, + "loss": 0.6588, + "step": 3535 + }, + { + "epoch": 0.8239863290352649, + "grad_norm": 0.3683382272720337, + "learning_rate": 4.78179954602304e-06, + "loss": 0.6513, + "step": 3536 + }, + { + "epoch": 0.82421935684325, + "grad_norm": 0.356330931186676, + "learning_rate": 4.7816744011221905e-06, + "loss": 0.6511, + "step": 3537 + }, + { + "epoch": 0.824452384651235, + "grad_norm": 0.3361359238624573, + "learning_rate": 4.781549221982887e-06, + "loss": 0.6376, + "step": 3538 + }, + { + "epoch": 0.8246854124592201, + "grad_norm": 0.3478468358516693, + "learning_rate": 4.781424008607009e-06, + "loss": 0.6434, + "step": 3539 + }, + { + "epoch": 0.8249184402672052, + "grad_norm": 0.35386502742767334, + "learning_rate": 4.781298760996434e-06, + "loss": 0.6656, + "step": 3540 + }, + { + "epoch": 0.8251514680751904, + "grad_norm": 0.341420441865921, + "learning_rate": 4.781173479153044e-06, + "loss": 0.6728, + "step": 3541 + }, + { + "epoch": 0.8253844958831754, + "grad_norm": 0.33990949392318726, + "learning_rate": 4.781048163078716e-06, + "loss": 0.6458, + "step": 3542 + }, + { + "epoch": 0.8256175236911605, + "grad_norm": 0.34597262740135193, + "learning_rate": 4.780922812775333e-06, + "loss": 0.6547, + "step": 3543 + }, + { + "epoch": 0.8258505514991455, + "grad_norm": 0.34609514474868774, + "learning_rate": 4.780797428244774e-06, + "loss": 0.6122, + "step": 3544 + }, + { + "epoch": 0.8260835793071306, + "grad_norm": 0.34852975606918335, + "learning_rate": 4.780672009488922e-06, + "loss": 0.6661, + "step": 3545 + }, + { + "epoch": 0.8263166071151158, + "grad_norm": 0.33743152022361755, + "learning_rate": 4.780546556509657e-06, + "loss": 0.632, + "step": 3546 + }, + { + "epoch": 0.8265496349231009, + "grad_norm": 0.3528708219528198, + "learning_rate": 4.780421069308864e-06, + "loss": 0.6474, + "step": 3547 + }, + { + "epoch": 0.8267826627310859, + "grad_norm": 0.3545174300670624, + "learning_rate": 4.780295547888425e-06, + "loss": 0.6373, + "step": 3548 + }, + { + "epoch": 0.827015690539071, + "grad_norm": 0.33902719616889954, + "learning_rate": 4.780169992250223e-06, + "loss": 0.6194, + "step": 3549 + }, + { + "epoch": 0.827248718347056, + "grad_norm": 0.342777281999588, + "learning_rate": 4.780044402396142e-06, + "loss": 0.665, + "step": 3550 + }, + { + "epoch": 0.8274817461550412, + "grad_norm": 0.34932076930999756, + "learning_rate": 4.7799187783280685e-06, + "loss": 0.6604, + "step": 3551 + }, + { + "epoch": 0.8277147739630263, + "grad_norm": 0.33992263674736023, + "learning_rate": 4.779793120047885e-06, + "loss": 0.6494, + "step": 3552 + }, + { + "epoch": 0.8279478017710113, + "grad_norm": 0.3384428322315216, + "learning_rate": 4.779667427557478e-06, + "loss": 0.6566, + "step": 3553 + }, + { + "epoch": 0.8281808295789964, + "grad_norm": 0.34827300906181335, + "learning_rate": 4.779541700858735e-06, + "loss": 0.6247, + "step": 3554 + }, + { + "epoch": 0.8284138573869815, + "grad_norm": 0.3646458685398102, + "learning_rate": 4.779415939953541e-06, + "loss": 0.6415, + "step": 3555 + }, + { + "epoch": 0.8286468851949667, + "grad_norm": 0.33528098464012146, + "learning_rate": 4.779290144843784e-06, + "loss": 0.6581, + "step": 3556 + }, + { + "epoch": 0.8288799130029517, + "grad_norm": 0.3435686528682709, + "learning_rate": 4.7791643155313516e-06, + "loss": 0.6632, + "step": 3557 + }, + { + "epoch": 0.8291129408109368, + "grad_norm": 0.3450467884540558, + "learning_rate": 4.779038452018131e-06, + "loss": 0.6414, + "step": 3558 + }, + { + "epoch": 0.8293459686189218, + "grad_norm": 0.3399979770183563, + "learning_rate": 4.778912554306012e-06, + "loss": 0.6387, + "step": 3559 + }, + { + "epoch": 0.8295789964269069, + "grad_norm": 0.3345566987991333, + "learning_rate": 4.778786622396884e-06, + "loss": 0.6614, + "step": 3560 + }, + { + "epoch": 0.829812024234892, + "grad_norm": 0.3753139078617096, + "learning_rate": 4.778660656292636e-06, + "loss": 0.6146, + "step": 3561 + }, + { + "epoch": 0.8300450520428772, + "grad_norm": 0.34224462509155273, + "learning_rate": 4.778534655995157e-06, + "loss": 0.6249, + "step": 3562 + }, + { + "epoch": 0.8302780798508622, + "grad_norm": 0.3342699706554413, + "learning_rate": 4.77840862150634e-06, + "loss": 0.648, + "step": 3563 + }, + { + "epoch": 0.8305111076588473, + "grad_norm": 0.33843812346458435, + "learning_rate": 4.778282552828075e-06, + "loss": 0.6221, + "step": 3564 + }, + { + "epoch": 0.8307441354668323, + "grad_norm": 0.34959957003593445, + "learning_rate": 4.778156449962255e-06, + "loss": 0.6309, + "step": 3565 + }, + { + "epoch": 0.8309771632748174, + "grad_norm": 0.3421951234340668, + "learning_rate": 4.778030312910771e-06, + "loss": 0.6484, + "step": 3566 + }, + { + "epoch": 0.8312101910828026, + "grad_norm": 0.3402029871940613, + "learning_rate": 4.777904141675516e-06, + "loss": 0.6432, + "step": 3567 + }, + { + "epoch": 0.8314432188907876, + "grad_norm": 0.33745989203453064, + "learning_rate": 4.777777936258384e-06, + "loss": 0.6248, + "step": 3568 + }, + { + "epoch": 0.8316762466987727, + "grad_norm": 0.3413679301738739, + "learning_rate": 4.777651696661268e-06, + "loss": 0.6432, + "step": 3569 + }, + { + "epoch": 0.8319092745067578, + "grad_norm": 0.3475130796432495, + "learning_rate": 4.777525422886062e-06, + "loss": 0.6559, + "step": 3570 + }, + { + "epoch": 0.8321423023147428, + "grad_norm": 0.33477458357810974, + "learning_rate": 4.777399114934662e-06, + "loss": 0.6164, + "step": 3571 + }, + { + "epoch": 0.832375330122728, + "grad_norm": 0.3448219299316406, + "learning_rate": 4.777272772808964e-06, + "loss": 0.6457, + "step": 3572 + }, + { + "epoch": 0.8326083579307131, + "grad_norm": 0.3410978317260742, + "learning_rate": 4.777146396510861e-06, + "loss": 0.6427, + "step": 3573 + }, + { + "epoch": 0.8328413857386981, + "grad_norm": 0.3390243649482727, + "learning_rate": 4.777019986042252e-06, + "loss": 0.6252, + "step": 3574 + }, + { + "epoch": 0.8330744135466832, + "grad_norm": 0.34073886275291443, + "learning_rate": 4.7768935414050335e-06, + "loss": 0.6394, + "step": 3575 + }, + { + "epoch": 0.8333074413546683, + "grad_norm": 0.3321382701396942, + "learning_rate": 4.776767062601102e-06, + "loss": 0.6394, + "step": 3576 + }, + { + "epoch": 0.8335404691626535, + "grad_norm": 0.34478917717933655, + "learning_rate": 4.776640549632356e-06, + "loss": 0.6183, + "step": 3577 + }, + { + "epoch": 0.8337734969706385, + "grad_norm": 0.35002583265304565, + "learning_rate": 4.776514002500693e-06, + "loss": 0.6591, + "step": 3578 + }, + { + "epoch": 0.8340065247786236, + "grad_norm": 0.3401683270931244, + "learning_rate": 4.776387421208013e-06, + "loss": 0.6234, + "step": 3579 + }, + { + "epoch": 0.8342395525866086, + "grad_norm": 0.3427402675151825, + "learning_rate": 4.776260805756215e-06, + "loss": 0.6247, + "step": 3580 + }, + { + "epoch": 0.8344725803945937, + "grad_norm": 0.34600913524627686, + "learning_rate": 4.7761341561472005e-06, + "loss": 0.6642, + "step": 3581 + }, + { + "epoch": 0.8347056082025789, + "grad_norm": 0.34193286299705505, + "learning_rate": 4.776007472382868e-06, + "loss": 0.6374, + "step": 3582 + }, + { + "epoch": 0.834938636010564, + "grad_norm": 0.34889349341392517, + "learning_rate": 4.775880754465119e-06, + "loss": 0.6291, + "step": 3583 + }, + { + "epoch": 0.835171663818549, + "grad_norm": 0.3575896918773651, + "learning_rate": 4.775754002395854e-06, + "loss": 0.6504, + "step": 3584 + }, + { + "epoch": 0.8354046916265341, + "grad_norm": 0.35383114218711853, + "learning_rate": 4.7756272161769775e-06, + "loss": 0.656, + "step": 3585 + }, + { + "epoch": 0.8356377194345191, + "grad_norm": 0.3438124358654022, + "learning_rate": 4.77550039581039e-06, + "loss": 0.6307, + "step": 3586 + }, + { + "epoch": 0.8358707472425043, + "grad_norm": 0.34965625405311584, + "learning_rate": 4.775373541297996e-06, + "loss": 0.6497, + "step": 3587 + }, + { + "epoch": 0.8361037750504894, + "grad_norm": 0.3427966833114624, + "learning_rate": 4.7752466526416985e-06, + "loss": 0.627, + "step": 3588 + }, + { + "epoch": 0.8363368028584744, + "grad_norm": 0.33154386281967163, + "learning_rate": 4.7751197298434e-06, + "loss": 0.6507, + "step": 3589 + }, + { + "epoch": 0.8365698306664595, + "grad_norm": 0.33288225531578064, + "learning_rate": 4.774992772905009e-06, + "loss": 0.6269, + "step": 3590 + }, + { + "epoch": 0.8368028584744446, + "grad_norm": 0.3339562714099884, + "learning_rate": 4.774865781828425e-06, + "loss": 0.6296, + "step": 3591 + }, + { + "epoch": 0.8370358862824298, + "grad_norm": 0.3419848680496216, + "learning_rate": 4.774738756615558e-06, + "loss": 0.6276, + "step": 3592 + }, + { + "epoch": 0.8372689140904148, + "grad_norm": 0.3482993543148041, + "learning_rate": 4.774611697268313e-06, + "loss": 0.6354, + "step": 3593 + }, + { + "epoch": 0.8375019418983999, + "grad_norm": 0.34213802218437195, + "learning_rate": 4.774484603788597e-06, + "loss": 0.6553, + "step": 3594 + }, + { + "epoch": 0.837734969706385, + "grad_norm": 0.3432997763156891, + "learning_rate": 4.7743574761783156e-06, + "loss": 0.6576, + "step": 3595 + }, + { + "epoch": 0.83796799751437, + "grad_norm": 0.3474951982498169, + "learning_rate": 4.774230314439378e-06, + "loss": 0.6386, + "step": 3596 + }, + { + "epoch": 0.8382010253223552, + "grad_norm": 0.3478947877883911, + "learning_rate": 4.7741031185736914e-06, + "loss": 0.6197, + "step": 3597 + }, + { + "epoch": 0.8384340531303403, + "grad_norm": 0.3475710451602936, + "learning_rate": 4.7739758885831645e-06, + "loss": 0.6452, + "step": 3598 + }, + { + "epoch": 0.8386670809383253, + "grad_norm": 0.34099340438842773, + "learning_rate": 4.773848624469708e-06, + "loss": 0.6279, + "step": 3599 + }, + { + "epoch": 0.8389001087463104, + "grad_norm": 0.36203837394714355, + "learning_rate": 4.77372132623523e-06, + "loss": 0.6384, + "step": 3600 + }, + { + "epoch": 0.8391331365542954, + "grad_norm": 0.3584960401058197, + "learning_rate": 4.773593993881641e-06, + "loss": 0.6597, + "step": 3601 + }, + { + "epoch": 0.8393661643622806, + "grad_norm": 0.3363056778907776, + "learning_rate": 4.773466627410852e-06, + "loss": 0.6588, + "step": 3602 + }, + { + "epoch": 0.8395991921702657, + "grad_norm": 0.33978939056396484, + "learning_rate": 4.773339226824773e-06, + "loss": 0.6387, + "step": 3603 + }, + { + "epoch": 0.8398322199782507, + "grad_norm": 0.345813125371933, + "learning_rate": 4.773211792125319e-06, + "loss": 0.6254, + "step": 3604 + }, + { + "epoch": 0.8400652477862358, + "grad_norm": 0.34553074836730957, + "learning_rate": 4.773084323314399e-06, + "loss": 0.645, + "step": 3605 + }, + { + "epoch": 0.8402982755942209, + "grad_norm": 0.35533279180526733, + "learning_rate": 4.7729568203939274e-06, + "loss": 0.6513, + "step": 3606 + }, + { + "epoch": 0.8405313034022059, + "grad_norm": 0.35136911273002625, + "learning_rate": 4.772829283365817e-06, + "loss": 0.6422, + "step": 3607 + }, + { + "epoch": 0.8407643312101911, + "grad_norm": 0.33584821224212646, + "learning_rate": 4.7727017122319815e-06, + "loss": 0.643, + "step": 3608 + }, + { + "epoch": 0.8409973590181762, + "grad_norm": 0.3331029713153839, + "learning_rate": 4.772574106994335e-06, + "loss": 0.6698, + "step": 3609 + }, + { + "epoch": 0.8412303868261612, + "grad_norm": 0.35537636280059814, + "learning_rate": 4.772446467654794e-06, + "loss": 0.6508, + "step": 3610 + }, + { + "epoch": 0.8414634146341463, + "grad_norm": 0.3618346154689789, + "learning_rate": 4.772318794215272e-06, + "loss": 0.6663, + "step": 3611 + }, + { + "epoch": 0.8416964424421314, + "grad_norm": 0.3594073951244354, + "learning_rate": 4.772191086677685e-06, + "loss": 0.6488, + "step": 3612 + }, + { + "epoch": 0.8419294702501166, + "grad_norm": 0.3444766700267792, + "learning_rate": 4.772063345043949e-06, + "loss": 0.6674, + "step": 3613 + }, + { + "epoch": 0.8421624980581016, + "grad_norm": 0.3426531255245209, + "learning_rate": 4.771935569315983e-06, + "loss": 0.612, + "step": 3614 + }, + { + "epoch": 0.8423955258660867, + "grad_norm": 0.3570743203163147, + "learning_rate": 4.771807759495702e-06, + "loss": 0.6354, + "step": 3615 + }, + { + "epoch": 0.8426285536740717, + "grad_norm": 0.3364807069301605, + "learning_rate": 4.771679915585025e-06, + "loss": 0.652, + "step": 3616 + }, + { + "epoch": 0.8428615814820568, + "grad_norm": 0.3517146110534668, + "learning_rate": 4.77155203758587e-06, + "loss": 0.6661, + "step": 3617 + }, + { + "epoch": 0.843094609290042, + "grad_norm": 0.34496501088142395, + "learning_rate": 4.771424125500158e-06, + "loss": 0.6562, + "step": 3618 + }, + { + "epoch": 0.843327637098027, + "grad_norm": 0.3499245345592499, + "learning_rate": 4.771296179329805e-06, + "loss": 0.6527, + "step": 3619 + }, + { + "epoch": 0.8435606649060121, + "grad_norm": 0.34507372975349426, + "learning_rate": 4.771168199076732e-06, + "loss": 0.6587, + "step": 3620 + }, + { + "epoch": 0.8437936927139972, + "grad_norm": 0.33565396070480347, + "learning_rate": 4.771040184742862e-06, + "loss": 0.6231, + "step": 3621 + }, + { + "epoch": 0.8440267205219822, + "grad_norm": 0.3398193120956421, + "learning_rate": 4.770912136330113e-06, + "loss": 0.6218, + "step": 3622 + }, + { + "epoch": 0.8442597483299674, + "grad_norm": 0.3409017026424408, + "learning_rate": 4.770784053840407e-06, + "loss": 0.6521, + "step": 3623 + }, + { + "epoch": 0.8444927761379525, + "grad_norm": 0.3628937602043152, + "learning_rate": 4.770655937275667e-06, + "loss": 0.6627, + "step": 3624 + }, + { + "epoch": 0.8447258039459375, + "grad_norm": 0.3473515808582306, + "learning_rate": 4.770527786637815e-06, + "loss": 0.6611, + "step": 3625 + }, + { + "epoch": 0.8449588317539226, + "grad_norm": 0.34823256731033325, + "learning_rate": 4.770399601928774e-06, + "loss": 0.6535, + "step": 3626 + }, + { + "epoch": 0.8451918595619077, + "grad_norm": 0.3422960937023163, + "learning_rate": 4.770271383150468e-06, + "loss": 0.6364, + "step": 3627 + }, + { + "epoch": 0.8454248873698929, + "grad_norm": 0.35001474618911743, + "learning_rate": 4.770143130304819e-06, + "loss": 0.6578, + "step": 3628 + }, + { + "epoch": 0.8456579151778779, + "grad_norm": 0.33360588550567627, + "learning_rate": 4.770014843393754e-06, + "loss": 0.5935, + "step": 3629 + }, + { + "epoch": 0.845890942985863, + "grad_norm": 0.3480827510356903, + "learning_rate": 4.769886522419197e-06, + "loss": 0.6232, + "step": 3630 + }, + { + "epoch": 0.846123970793848, + "grad_norm": 0.343196839094162, + "learning_rate": 4.769758167383074e-06, + "loss": 0.6327, + "step": 3631 + }, + { + "epoch": 0.8463569986018331, + "grad_norm": 0.3365664482116699, + "learning_rate": 4.7696297782873105e-06, + "loss": 0.6683, + "step": 3632 + }, + { + "epoch": 0.8465900264098183, + "grad_norm": 0.33868080377578735, + "learning_rate": 4.7695013551338335e-06, + "loss": 0.6637, + "step": 3633 + }, + { + "epoch": 0.8468230542178034, + "grad_norm": 0.34339094161987305, + "learning_rate": 4.769372897924569e-06, + "loss": 0.6316, + "step": 3634 + }, + { + "epoch": 0.8470560820257884, + "grad_norm": 0.3428562581539154, + "learning_rate": 4.769244406661447e-06, + "loss": 0.6628, + "step": 3635 + }, + { + "epoch": 0.8472891098337735, + "grad_norm": 0.35074523091316223, + "learning_rate": 4.7691158813463925e-06, + "loss": 0.6407, + "step": 3636 + }, + { + "epoch": 0.8475221376417585, + "grad_norm": 0.349380761384964, + "learning_rate": 4.7689873219813364e-06, + "loss": 0.6486, + "step": 3637 + }, + { + "epoch": 0.8477551654497437, + "grad_norm": 0.36744967103004456, + "learning_rate": 4.768858728568208e-06, + "loss": 0.6316, + "step": 3638 + }, + { + "epoch": 0.8479881932577288, + "grad_norm": 0.3484974801540375, + "learning_rate": 4.768730101108936e-06, + "loss": 0.6541, + "step": 3639 + }, + { + "epoch": 0.8482212210657138, + "grad_norm": 0.3430306017398834, + "learning_rate": 4.76860143960545e-06, + "loss": 0.6342, + "step": 3640 + }, + { + "epoch": 0.8484542488736989, + "grad_norm": 0.3440837860107422, + "learning_rate": 4.768472744059682e-06, + "loss": 0.6162, + "step": 3641 + }, + { + "epoch": 0.848687276681684, + "grad_norm": 0.33143728971481323, + "learning_rate": 4.768344014473563e-06, + "loss": 0.638, + "step": 3642 + }, + { + "epoch": 0.8489203044896692, + "grad_norm": 0.34617966413497925, + "learning_rate": 4.768215250849023e-06, + "loss": 0.6556, + "step": 3643 + }, + { + "epoch": 0.8491533322976542, + "grad_norm": 0.3537744879722595, + "learning_rate": 4.7680864531879964e-06, + "loss": 0.6484, + "step": 3644 + }, + { + "epoch": 0.8493863601056393, + "grad_norm": 0.3475654721260071, + "learning_rate": 4.767957621492415e-06, + "loss": 0.6473, + "step": 3645 + }, + { + "epoch": 0.8496193879136243, + "grad_norm": 0.33842095732688904, + "learning_rate": 4.767828755764212e-06, + "loss": 0.6416, + "step": 3646 + }, + { + "epoch": 0.8498524157216094, + "grad_norm": 0.3356778919696808, + "learning_rate": 4.7676998560053215e-06, + "loss": 0.6569, + "step": 3647 + }, + { + "epoch": 0.8500854435295946, + "grad_norm": 0.3453678488731384, + "learning_rate": 4.7675709222176765e-06, + "loss": 0.6209, + "step": 3648 + }, + { + "epoch": 0.8503184713375797, + "grad_norm": 0.34416916966438293, + "learning_rate": 4.767441954403214e-06, + "loss": 0.6089, + "step": 3649 + }, + { + "epoch": 0.8505514991455647, + "grad_norm": 0.351468563079834, + "learning_rate": 4.767312952563867e-06, + "loss": 0.6551, + "step": 3650 + }, + { + "epoch": 0.8507845269535498, + "grad_norm": 0.3481277823448181, + "learning_rate": 4.767183916701573e-06, + "loss": 0.6303, + "step": 3651 + }, + { + "epoch": 0.8510175547615348, + "grad_norm": 0.3394913971424103, + "learning_rate": 4.767054846818267e-06, + "loss": 0.6256, + "step": 3652 + }, + { + "epoch": 0.8512505825695199, + "grad_norm": 0.3396850824356079, + "learning_rate": 4.766925742915887e-06, + "loss": 0.6285, + "step": 3653 + }, + { + "epoch": 0.8514836103775051, + "grad_norm": 0.3363351821899414, + "learning_rate": 4.766796604996369e-06, + "loss": 0.6498, + "step": 3654 + }, + { + "epoch": 0.8517166381854901, + "grad_norm": 0.3571386933326721, + "learning_rate": 4.766667433061652e-06, + "loss": 0.6369, + "step": 3655 + }, + { + "epoch": 0.8519496659934752, + "grad_norm": 0.34092798829078674, + "learning_rate": 4.7665382271136725e-06, + "loss": 0.6577, + "step": 3656 + }, + { + "epoch": 0.8521826938014603, + "grad_norm": 0.33877652883529663, + "learning_rate": 4.766408987154372e-06, + "loss": 0.6638, + "step": 3657 + }, + { + "epoch": 0.8524157216094453, + "grad_norm": 0.34733322262763977, + "learning_rate": 4.766279713185688e-06, + "loss": 0.6629, + "step": 3658 + }, + { + "epoch": 0.8526487494174305, + "grad_norm": 0.3269679844379425, + "learning_rate": 4.766150405209562e-06, + "loss": 0.6001, + "step": 3659 + }, + { + "epoch": 0.8528817772254156, + "grad_norm": 0.3325427770614624, + "learning_rate": 4.766021063227933e-06, + "loss": 0.6418, + "step": 3660 + }, + { + "epoch": 0.8531148050334006, + "grad_norm": 0.34992220997810364, + "learning_rate": 4.765891687242741e-06, + "loss": 0.663, + "step": 3661 + }, + { + "epoch": 0.8533478328413857, + "grad_norm": 0.3632676303386688, + "learning_rate": 4.765762277255929e-06, + "loss": 0.6421, + "step": 3662 + }, + { + "epoch": 0.8535808606493708, + "grad_norm": 0.3500070571899414, + "learning_rate": 4.7656328332694375e-06, + "loss": 0.6372, + "step": 3663 + }, + { + "epoch": 0.853813888457356, + "grad_norm": 0.3404751121997833, + "learning_rate": 4.765503355285211e-06, + "loss": 0.6337, + "step": 3664 + }, + { + "epoch": 0.854046916265341, + "grad_norm": 0.34594452381134033, + "learning_rate": 4.765373843305192e-06, + "loss": 0.6361, + "step": 3665 + }, + { + "epoch": 0.8542799440733261, + "grad_norm": 0.3495174050331116, + "learning_rate": 4.765244297331323e-06, + "loss": 0.6033, + "step": 3666 + }, + { + "epoch": 0.8545129718813111, + "grad_norm": 0.34797024726867676, + "learning_rate": 4.7651147173655475e-06, + "loss": 0.6355, + "step": 3667 + }, + { + "epoch": 0.8547459996892962, + "grad_norm": 0.33717137575149536, + "learning_rate": 4.764985103409811e-06, + "loss": 0.6199, + "step": 3668 + }, + { + "epoch": 0.8549790274972814, + "grad_norm": 0.3414860963821411, + "learning_rate": 4.764855455466058e-06, + "loss": 0.6541, + "step": 3669 + }, + { + "epoch": 0.8552120553052664, + "grad_norm": 0.34276923537254333, + "learning_rate": 4.764725773536233e-06, + "loss": 0.6247, + "step": 3670 + }, + { + "epoch": 0.8554450831132515, + "grad_norm": 0.3419957160949707, + "learning_rate": 4.764596057622284e-06, + "loss": 0.6366, + "step": 3671 + }, + { + "epoch": 0.8556781109212366, + "grad_norm": 0.34173014760017395, + "learning_rate": 4.764466307726157e-06, + "loss": 0.627, + "step": 3672 + }, + { + "epoch": 0.8559111387292216, + "grad_norm": 0.3366159498691559, + "learning_rate": 4.764336523849798e-06, + "loss": 0.6279, + "step": 3673 + }, + { + "epoch": 0.8561441665372068, + "grad_norm": 0.34921789169311523, + "learning_rate": 4.7642067059951554e-06, + "loss": 0.6116, + "step": 3674 + }, + { + "epoch": 0.8563771943451919, + "grad_norm": 0.3382711112499237, + "learning_rate": 4.764076854164177e-06, + "loss": 0.6125, + "step": 3675 + }, + { + "epoch": 0.856610222153177, + "grad_norm": 0.34190601110458374, + "learning_rate": 4.763946968358811e-06, + "loss": 0.6304, + "step": 3676 + }, + { + "epoch": 0.856843249961162, + "grad_norm": 0.3405074179172516, + "learning_rate": 4.763817048581007e-06, + "loss": 0.6465, + "step": 3677 + }, + { + "epoch": 0.8570762777691471, + "grad_norm": 0.35165947675704956, + "learning_rate": 4.7636870948327145e-06, + "loss": 0.6526, + "step": 3678 + }, + { + "epoch": 0.8573093055771323, + "grad_norm": 0.32140257954597473, + "learning_rate": 4.7635571071158825e-06, + "loss": 0.6212, + "step": 3679 + }, + { + "epoch": 0.8575423333851173, + "grad_norm": 0.3465662896633148, + "learning_rate": 4.763427085432463e-06, + "loss": 0.6698, + "step": 3680 + }, + { + "epoch": 0.8577753611931024, + "grad_norm": 0.35291674733161926, + "learning_rate": 4.763297029784406e-06, + "loss": 0.641, + "step": 3681 + }, + { + "epoch": 0.8580083890010874, + "grad_norm": 0.3425975739955902, + "learning_rate": 4.763166940173663e-06, + "loss": 0.6393, + "step": 3682 + }, + { + "epoch": 0.8582414168090725, + "grad_norm": 0.3384883403778076, + "learning_rate": 4.7630368166021884e-06, + "loss": 0.6305, + "step": 3683 + }, + { + "epoch": 0.8584744446170577, + "grad_norm": 0.3371085524559021, + "learning_rate": 4.762906659071932e-06, + "loss": 0.6167, + "step": 3684 + }, + { + "epoch": 0.8587074724250428, + "grad_norm": 0.36864525079727173, + "learning_rate": 4.762776467584849e-06, + "loss": 0.6453, + "step": 3685 + }, + { + "epoch": 0.8589405002330278, + "grad_norm": 0.3552547097206116, + "learning_rate": 4.762646242142891e-06, + "loss": 0.6376, + "step": 3686 + }, + { + "epoch": 0.8591735280410129, + "grad_norm": 0.3299630284309387, + "learning_rate": 4.762515982748013e-06, + "loss": 0.6241, + "step": 3687 + }, + { + "epoch": 0.8594065558489979, + "grad_norm": 0.3419981002807617, + "learning_rate": 4.76238568940217e-06, + "loss": 0.6518, + "step": 3688 + }, + { + "epoch": 0.8596395836569831, + "grad_norm": 0.3431527316570282, + "learning_rate": 4.762255362107318e-06, + "loss": 0.6231, + "step": 3689 + }, + { + "epoch": 0.8598726114649682, + "grad_norm": 0.3457905054092407, + "learning_rate": 4.762125000865411e-06, + "loss": 0.6327, + "step": 3690 + }, + { + "epoch": 0.8601056392729532, + "grad_norm": 0.3587702214717865, + "learning_rate": 4.761994605678406e-06, + "loss": 0.6351, + "step": 3691 + }, + { + "epoch": 0.8603386670809383, + "grad_norm": 0.3324383497238159, + "learning_rate": 4.76186417654826e-06, + "loss": 0.6193, + "step": 3692 + }, + { + "epoch": 0.8605716948889234, + "grad_norm": 0.34697145223617554, + "learning_rate": 4.761733713476929e-06, + "loss": 0.6304, + "step": 3693 + }, + { + "epoch": 0.8608047226969086, + "grad_norm": 0.339828222990036, + "learning_rate": 4.761603216466371e-06, + "loss": 0.6412, + "step": 3694 + }, + { + "epoch": 0.8610377505048936, + "grad_norm": 0.33261317014694214, + "learning_rate": 4.761472685518546e-06, + "loss": 0.6457, + "step": 3695 + }, + { + "epoch": 0.8612707783128787, + "grad_norm": 0.3494134545326233, + "learning_rate": 4.76134212063541e-06, + "loss": 0.6252, + "step": 3696 + }, + { + "epoch": 0.8615038061208637, + "grad_norm": 0.3391890525817871, + "learning_rate": 4.761211521818925e-06, + "loss": 0.6292, + "step": 3697 + }, + { + "epoch": 0.8617368339288488, + "grad_norm": 0.3290880024433136, + "learning_rate": 4.761080889071049e-06, + "loss": 0.6338, + "step": 3698 + }, + { + "epoch": 0.8619698617368339, + "grad_norm": 0.33601072430610657, + "learning_rate": 4.760950222393742e-06, + "loss": 0.6399, + "step": 3699 + }, + { + "epoch": 0.862202889544819, + "grad_norm": 0.33911827206611633, + "learning_rate": 4.7608195217889665e-06, + "loss": 0.6428, + "step": 3700 + }, + { + "epoch": 0.8624359173528041, + "grad_norm": 0.3555572032928467, + "learning_rate": 4.760688787258681e-06, + "loss": 0.6358, + "step": 3701 + }, + { + "epoch": 0.8626689451607892, + "grad_norm": 0.3594367802143097, + "learning_rate": 4.7605580188048505e-06, + "loss": 0.665, + "step": 3702 + }, + { + "epoch": 0.8629019729687742, + "grad_norm": 0.355993390083313, + "learning_rate": 4.760427216429435e-06, + "loss": 0.6695, + "step": 3703 + }, + { + "epoch": 0.8631350007767593, + "grad_norm": 0.35719361901283264, + "learning_rate": 4.7602963801343995e-06, + "loss": 0.6515, + "step": 3704 + }, + { + "epoch": 0.8633680285847445, + "grad_norm": 0.3659099042415619, + "learning_rate": 4.760165509921704e-06, + "loss": 0.6278, + "step": 3705 + }, + { + "epoch": 0.8636010563927295, + "grad_norm": 0.3454124331474304, + "learning_rate": 4.760034605793315e-06, + "loss": 0.6459, + "step": 3706 + }, + { + "epoch": 0.8638340842007146, + "grad_norm": 0.3464796543121338, + "learning_rate": 4.759903667751195e-06, + "loss": 0.6468, + "step": 3707 + }, + { + "epoch": 0.8640671120086997, + "grad_norm": 0.3788662254810333, + "learning_rate": 4.759772695797312e-06, + "loss": 0.6519, + "step": 3708 + }, + { + "epoch": 0.8643001398166847, + "grad_norm": 0.34219467639923096, + "learning_rate": 4.759641689933627e-06, + "loss": 0.6467, + "step": 3709 + }, + { + "epoch": 0.8645331676246699, + "grad_norm": 0.35705649852752686, + "learning_rate": 4.759510650162109e-06, + "loss": 0.6083, + "step": 3710 + }, + { + "epoch": 0.864766195432655, + "grad_norm": 0.3396720886230469, + "learning_rate": 4.759379576484724e-06, + "loss": 0.6308, + "step": 3711 + }, + { + "epoch": 0.86499922324064, + "grad_norm": 0.35597357153892517, + "learning_rate": 4.759248468903438e-06, + "loss": 0.6203, + "step": 3712 + }, + { + "epoch": 0.8652322510486251, + "grad_norm": 0.3375095725059509, + "learning_rate": 4.759117327420218e-06, + "loss": 0.6356, + "step": 3713 + }, + { + "epoch": 0.8654652788566102, + "grad_norm": 0.3338155448436737, + "learning_rate": 4.7589861520370325e-06, + "loss": 0.6164, + "step": 3714 + }, + { + "epoch": 0.8656983066645954, + "grad_norm": 0.3265370726585388, + "learning_rate": 4.75885494275585e-06, + "loss": 0.6411, + "step": 3715 + }, + { + "epoch": 0.8659313344725804, + "grad_norm": 0.3357608914375305, + "learning_rate": 4.75872369957864e-06, + "loss": 0.6457, + "step": 3716 + }, + { + "epoch": 0.8661643622805655, + "grad_norm": 0.34640446305274963, + "learning_rate": 4.758592422507371e-06, + "loss": 0.6094, + "step": 3717 + }, + { + "epoch": 0.8663973900885505, + "grad_norm": 0.32726234197616577, + "learning_rate": 4.758461111544014e-06, + "loss": 0.6119, + "step": 3718 + }, + { + "epoch": 0.8666304178965356, + "grad_norm": 0.344874769449234, + "learning_rate": 4.758329766690537e-06, + "loss": 0.6372, + "step": 3719 + }, + { + "epoch": 0.8668634457045208, + "grad_norm": 0.34715405106544495, + "learning_rate": 4.758198387948914e-06, + "loss": 0.6243, + "step": 3720 + }, + { + "epoch": 0.8670964735125058, + "grad_norm": 0.3316483199596405, + "learning_rate": 4.758066975321114e-06, + "loss": 0.6163, + "step": 3721 + }, + { + "epoch": 0.8673295013204909, + "grad_norm": 0.3491971492767334, + "learning_rate": 4.75793552880911e-06, + "loss": 0.6436, + "step": 3722 + }, + { + "epoch": 0.867562529128476, + "grad_norm": 0.3420524001121521, + "learning_rate": 4.757804048414875e-06, + "loss": 0.6531, + "step": 3723 + }, + { + "epoch": 0.867795556936461, + "grad_norm": 0.3357638716697693, + "learning_rate": 4.75767253414038e-06, + "loss": 0.6308, + "step": 3724 + }, + { + "epoch": 0.8680285847444462, + "grad_norm": 0.332003653049469, + "learning_rate": 4.757540985987602e-06, + "loss": 0.6084, + "step": 3725 + }, + { + "epoch": 0.8682616125524313, + "grad_norm": 0.3483152985572815, + "learning_rate": 4.757409403958512e-06, + "loss": 0.6405, + "step": 3726 + }, + { + "epoch": 0.8684946403604163, + "grad_norm": 0.34742143750190735, + "learning_rate": 4.757277788055086e-06, + "loss": 0.6315, + "step": 3727 + }, + { + "epoch": 0.8687276681684014, + "grad_norm": 0.3378693461418152, + "learning_rate": 4.757146138279297e-06, + "loss": 0.6334, + "step": 3728 + }, + { + "epoch": 0.8689606959763865, + "grad_norm": 0.34703418612480164, + "learning_rate": 4.757014454633123e-06, + "loss": 0.6243, + "step": 3729 + }, + { + "epoch": 0.8691937237843717, + "grad_norm": 0.34256330132484436, + "learning_rate": 4.756882737118538e-06, + "loss": 0.6344, + "step": 3730 + }, + { + "epoch": 0.8694267515923567, + "grad_norm": 0.3402869403362274, + "learning_rate": 4.75675098573752e-06, + "loss": 0.634, + "step": 3731 + }, + { + "epoch": 0.8696597794003418, + "grad_norm": 0.3486390709877014, + "learning_rate": 4.756619200492045e-06, + "loss": 0.61, + "step": 3732 + }, + { + "epoch": 0.8698928072083268, + "grad_norm": 0.3503628373146057, + "learning_rate": 4.756487381384092e-06, + "loss": 0.6503, + "step": 3733 + }, + { + "epoch": 0.8701258350163119, + "grad_norm": 0.3521903157234192, + "learning_rate": 4.756355528415639e-06, + "loss": 0.668, + "step": 3734 + }, + { + "epoch": 0.8703588628242971, + "grad_norm": 0.34037309885025024, + "learning_rate": 4.756223641588661e-06, + "loss": 0.6419, + "step": 3735 + }, + { + "epoch": 0.8705918906322822, + "grad_norm": 0.3580534756183624, + "learning_rate": 4.756091720905142e-06, + "loss": 0.655, + "step": 3736 + }, + { + "epoch": 0.8708249184402672, + "grad_norm": 0.3522266447544098, + "learning_rate": 4.755959766367057e-06, + "loss": 0.6298, + "step": 3737 + }, + { + "epoch": 0.8710579462482523, + "grad_norm": 0.3397982120513916, + "learning_rate": 4.755827777976391e-06, + "loss": 0.6204, + "step": 3738 + }, + { + "epoch": 0.8712909740562373, + "grad_norm": 0.3403220474720001, + "learning_rate": 4.75569575573512e-06, + "loss": 0.616, + "step": 3739 + }, + { + "epoch": 0.8715240018642224, + "grad_norm": 0.3355109393596649, + "learning_rate": 4.755563699645229e-06, + "loss": 0.6414, + "step": 3740 + }, + { + "epoch": 0.8717570296722076, + "grad_norm": 0.3633451759815216, + "learning_rate": 4.755431609708696e-06, + "loss": 0.6318, + "step": 3741 + }, + { + "epoch": 0.8719900574801926, + "grad_norm": 0.356184720993042, + "learning_rate": 4.755299485927506e-06, + "loss": 0.6169, + "step": 3742 + }, + { + "epoch": 0.8722230852881777, + "grad_norm": 0.35011857748031616, + "learning_rate": 4.75516732830364e-06, + "loss": 0.6568, + "step": 3743 + }, + { + "epoch": 0.8724561130961628, + "grad_norm": 0.33296555280685425, + "learning_rate": 4.755035136839082e-06, + "loss": 0.628, + "step": 3744 + }, + { + "epoch": 0.8726891409041478, + "grad_norm": 0.34425947070121765, + "learning_rate": 4.7549029115358145e-06, + "loss": 0.6627, + "step": 3745 + }, + { + "epoch": 0.872922168712133, + "grad_norm": 0.34002235531806946, + "learning_rate": 4.754770652395823e-06, + "loss": 0.6569, + "step": 3746 + }, + { + "epoch": 0.8731551965201181, + "grad_norm": 0.33823031187057495, + "learning_rate": 4.7546383594210915e-06, + "loss": 0.6699, + "step": 3747 + }, + { + "epoch": 0.8733882243281031, + "grad_norm": 0.3531865179538727, + "learning_rate": 4.754506032613605e-06, + "loss": 0.639, + "step": 3748 + }, + { + "epoch": 0.8736212521360882, + "grad_norm": 0.36787378787994385, + "learning_rate": 4.75437367197535e-06, + "loss": 0.6401, + "step": 3749 + }, + { + "epoch": 0.8738542799440733, + "grad_norm": 0.35047170519828796, + "learning_rate": 4.754241277508311e-06, + "loss": 0.6347, + "step": 3750 + }, + { + "epoch": 0.8740873077520585, + "grad_norm": 0.3472587764263153, + "learning_rate": 4.754108849214477e-06, + "loss": 0.6367, + "step": 3751 + }, + { + "epoch": 0.8743203355600435, + "grad_norm": 0.3372192978858948, + "learning_rate": 4.753976387095834e-06, + "loss": 0.6499, + "step": 3752 + }, + { + "epoch": 0.8745533633680286, + "grad_norm": 0.3461858630180359, + "learning_rate": 4.753843891154369e-06, + "loss": 0.6439, + "step": 3753 + }, + { + "epoch": 0.8747863911760136, + "grad_norm": 0.35338857769966125, + "learning_rate": 4.753711361392071e-06, + "loss": 0.6394, + "step": 3754 + }, + { + "epoch": 0.8750194189839987, + "grad_norm": 0.3446579575538635, + "learning_rate": 4.75357879781093e-06, + "loss": 0.6353, + "step": 3755 + }, + { + "epoch": 0.8752524467919839, + "grad_norm": 0.33312469720840454, + "learning_rate": 4.753446200412932e-06, + "loss": 0.647, + "step": 3756 + }, + { + "epoch": 0.875485474599969, + "grad_norm": 0.3516089916229248, + "learning_rate": 4.753313569200069e-06, + "loss": 0.627, + "step": 3757 + }, + { + "epoch": 0.875718502407954, + "grad_norm": 0.35476839542388916, + "learning_rate": 4.753180904174332e-06, + "loss": 0.6421, + "step": 3758 + }, + { + "epoch": 0.8759515302159391, + "grad_norm": 0.3633028566837311, + "learning_rate": 4.753048205337709e-06, + "loss": 0.6198, + "step": 3759 + }, + { + "epoch": 0.8761845580239241, + "grad_norm": 0.35467103123664856, + "learning_rate": 4.752915472692194e-06, + "loss": 0.6388, + "step": 3760 + }, + { + "epoch": 0.8764175858319093, + "grad_norm": 0.3557610809803009, + "learning_rate": 4.752782706239777e-06, + "loss": 0.6355, + "step": 3761 + }, + { + "epoch": 0.8766506136398944, + "grad_norm": 0.3655168116092682, + "learning_rate": 4.752649905982451e-06, + "loss": 0.6523, + "step": 3762 + }, + { + "epoch": 0.8768836414478794, + "grad_norm": 0.3764427602291107, + "learning_rate": 4.752517071922208e-06, + "loss": 0.6164, + "step": 3763 + }, + { + "epoch": 0.8771166692558645, + "grad_norm": 0.35654428601264954, + "learning_rate": 4.7523842040610434e-06, + "loss": 0.6442, + "step": 3764 + }, + { + "epoch": 0.8773496970638496, + "grad_norm": 0.3583972454071045, + "learning_rate": 4.752251302400949e-06, + "loss": 0.6461, + "step": 3765 + }, + { + "epoch": 0.8775827248718348, + "grad_norm": 0.36657923460006714, + "learning_rate": 4.75211836694392e-06, + "loss": 0.6397, + "step": 3766 + }, + { + "epoch": 0.8778157526798198, + "grad_norm": 0.3608044385910034, + "learning_rate": 4.75198539769195e-06, + "loss": 0.6349, + "step": 3767 + }, + { + "epoch": 0.8780487804878049, + "grad_norm": 0.34390753507614136, + "learning_rate": 4.751852394647036e-06, + "loss": 0.621, + "step": 3768 + }, + { + "epoch": 0.87828180829579, + "grad_norm": 0.3366774320602417, + "learning_rate": 4.751719357811172e-06, + "loss": 0.6411, + "step": 3769 + }, + { + "epoch": 0.878514836103775, + "grad_norm": 0.3541969358921051, + "learning_rate": 4.751586287186356e-06, + "loss": 0.6505, + "step": 3770 + }, + { + "epoch": 0.8787478639117602, + "grad_norm": 0.35459470748901367, + "learning_rate": 4.7514531827745845e-06, + "loss": 0.6287, + "step": 3771 + }, + { + "epoch": 0.8789808917197452, + "grad_norm": 0.34577852487564087, + "learning_rate": 4.751320044577854e-06, + "loss": 0.6434, + "step": 3772 + }, + { + "epoch": 0.8792139195277303, + "grad_norm": 0.35078680515289307, + "learning_rate": 4.751186872598163e-06, + "loss": 0.6714, + "step": 3773 + }, + { + "epoch": 0.8794469473357154, + "grad_norm": 0.33998918533325195, + "learning_rate": 4.751053666837509e-06, + "loss": 0.6358, + "step": 3774 + }, + { + "epoch": 0.8796799751437004, + "grad_norm": 0.34751832485198975, + "learning_rate": 4.750920427297893e-06, + "loss": 0.6637, + "step": 3775 + }, + { + "epoch": 0.8799130029516856, + "grad_norm": 0.3659592568874359, + "learning_rate": 4.750787153981312e-06, + "loss": 0.624, + "step": 3776 + }, + { + "epoch": 0.8801460307596707, + "grad_norm": 0.3273109495639801, + "learning_rate": 4.750653846889768e-06, + "loss": 0.6098, + "step": 3777 + }, + { + "epoch": 0.8803790585676557, + "grad_norm": 0.3544033467769623, + "learning_rate": 4.750520506025259e-06, + "loss": 0.5987, + "step": 3778 + }, + { + "epoch": 0.8806120863756408, + "grad_norm": 0.34231942892074585, + "learning_rate": 4.750387131389788e-06, + "loss": 0.6641, + "step": 3779 + }, + { + "epoch": 0.8808451141836259, + "grad_norm": 0.34847745299339294, + "learning_rate": 4.750253722985354e-06, + "loss": 0.6393, + "step": 3780 + }, + { + "epoch": 0.881078141991611, + "grad_norm": 0.3480171859264374, + "learning_rate": 4.750120280813962e-06, + "loss": 0.6354, + "step": 3781 + }, + { + "epoch": 0.8813111697995961, + "grad_norm": 0.3404203951358795, + "learning_rate": 4.749986804877612e-06, + "loss": 0.6409, + "step": 3782 + }, + { + "epoch": 0.8815441976075812, + "grad_norm": 0.33142969012260437, + "learning_rate": 4.749853295178309e-06, + "loss": 0.6599, + "step": 3783 + }, + { + "epoch": 0.8817772254155662, + "grad_norm": 0.3530192971229553, + "learning_rate": 4.749719751718054e-06, + "loss": 0.622, + "step": 3784 + }, + { + "epoch": 0.8820102532235513, + "grad_norm": 0.3339332640171051, + "learning_rate": 4.749586174498852e-06, + "loss": 0.6372, + "step": 3785 + }, + { + "epoch": 0.8822432810315364, + "grad_norm": 0.35504427552223206, + "learning_rate": 4.749452563522707e-06, + "loss": 0.6322, + "step": 3786 + }, + { + "epoch": 0.8824763088395216, + "grad_norm": 0.34452399611473083, + "learning_rate": 4.749318918791626e-06, + "loss": 0.6237, + "step": 3787 + }, + { + "epoch": 0.8827093366475066, + "grad_norm": 0.35303494334220886, + "learning_rate": 4.749185240307612e-06, + "loss": 0.6571, + "step": 3788 + }, + { + "epoch": 0.8829423644554917, + "grad_norm": 0.3551463186740875, + "learning_rate": 4.749051528072672e-06, + "loss": 0.6465, + "step": 3789 + }, + { + "epoch": 0.8831753922634767, + "grad_norm": 0.3522144854068756, + "learning_rate": 4.748917782088813e-06, + "loss": 0.6426, + "step": 3790 + }, + { + "epoch": 0.8834084200714618, + "grad_norm": 0.35137492418289185, + "learning_rate": 4.748784002358041e-06, + "loss": 0.6233, + "step": 3791 + }, + { + "epoch": 0.883641447879447, + "grad_norm": 0.3363114893436432, + "learning_rate": 4.7486501888823645e-06, + "loss": 0.6195, + "step": 3792 + }, + { + "epoch": 0.883874475687432, + "grad_norm": 0.34726542234420776, + "learning_rate": 4.7485163416637895e-06, + "loss": 0.6451, + "step": 3793 + }, + { + "epoch": 0.8841075034954171, + "grad_norm": 0.37782999873161316, + "learning_rate": 4.748382460704327e-06, + "loss": 0.6508, + "step": 3794 + }, + { + "epoch": 0.8843405313034022, + "grad_norm": 0.3602677881717682, + "learning_rate": 4.748248546005984e-06, + "loss": 0.6544, + "step": 3795 + }, + { + "epoch": 0.8845735591113872, + "grad_norm": 0.3412230908870697, + "learning_rate": 4.748114597570771e-06, + "loss": 0.6321, + "step": 3796 + }, + { + "epoch": 0.8848065869193724, + "grad_norm": 0.3554471731185913, + "learning_rate": 4.747980615400698e-06, + "loss": 0.5994, + "step": 3797 + }, + { + "epoch": 0.8850396147273575, + "grad_norm": 0.3435530662536621, + "learning_rate": 4.747846599497775e-06, + "loss": 0.6155, + "step": 3798 + }, + { + "epoch": 0.8852726425353425, + "grad_norm": 0.35305020213127136, + "learning_rate": 4.747712549864013e-06, + "loss": 0.6356, + "step": 3799 + }, + { + "epoch": 0.8855056703433276, + "grad_norm": 0.35838738083839417, + "learning_rate": 4.7475784665014246e-06, + "loss": 0.6319, + "step": 3800 + }, + { + "epoch": 0.8857386981513127, + "grad_norm": 0.3539108335971832, + "learning_rate": 4.747444349412022e-06, + "loss": 0.6378, + "step": 3801 + }, + { + "epoch": 0.8859717259592979, + "grad_norm": 0.3532589375972748, + "learning_rate": 4.747310198597816e-06, + "loss": 0.6263, + "step": 3802 + }, + { + "epoch": 0.8862047537672829, + "grad_norm": 0.3501448333263397, + "learning_rate": 4.747176014060819e-06, + "loss": 0.6604, + "step": 3803 + }, + { + "epoch": 0.886437781575268, + "grad_norm": 0.3451497256755829, + "learning_rate": 4.747041795803048e-06, + "loss": 0.6157, + "step": 3804 + }, + { + "epoch": 0.886670809383253, + "grad_norm": 0.3765154182910919, + "learning_rate": 4.746907543826515e-06, + "loss": 0.6348, + "step": 3805 + }, + { + "epoch": 0.8869038371912381, + "grad_norm": 0.3569197654724121, + "learning_rate": 4.746773258133233e-06, + "loss": 0.6284, + "step": 3806 + }, + { + "epoch": 0.8871368649992233, + "grad_norm": 0.34582531452178955, + "learning_rate": 4.7466389387252195e-06, + "loss": 0.6196, + "step": 3807 + }, + { + "epoch": 0.8873698928072083, + "grad_norm": 0.3365514874458313, + "learning_rate": 4.74650458560449e-06, + "loss": 0.6148, + "step": 3808 + }, + { + "epoch": 0.8876029206151934, + "grad_norm": 0.3495114743709564, + "learning_rate": 4.7463701987730595e-06, + "loss": 0.639, + "step": 3809 + }, + { + "epoch": 0.8878359484231785, + "grad_norm": 0.32894816994667053, + "learning_rate": 4.7462357782329454e-06, + "loss": 0.6346, + "step": 3810 + }, + { + "epoch": 0.8880689762311635, + "grad_norm": 0.3428363502025604, + "learning_rate": 4.746101323986164e-06, + "loss": 0.6509, + "step": 3811 + }, + { + "epoch": 0.8883020040391487, + "grad_norm": 0.35523200035095215, + "learning_rate": 4.745966836034733e-06, + "loss": 0.6426, + "step": 3812 + }, + { + "epoch": 0.8885350318471338, + "grad_norm": 0.3565545380115509, + "learning_rate": 4.7458323143806715e-06, + "loss": 0.6565, + "step": 3813 + }, + { + "epoch": 0.8887680596551188, + "grad_norm": 0.3601287007331848, + "learning_rate": 4.7456977590259965e-06, + "loss": 0.6218, + "step": 3814 + }, + { + "epoch": 0.8890010874631039, + "grad_norm": 0.34503766894340515, + "learning_rate": 4.745563169972729e-06, + "loss": 0.6235, + "step": 3815 + }, + { + "epoch": 0.889234115271089, + "grad_norm": 0.3645241856575012, + "learning_rate": 4.745428547222887e-06, + "loss": 0.6554, + "step": 3816 + }, + { + "epoch": 0.8894671430790742, + "grad_norm": 0.34489375352859497, + "learning_rate": 4.745293890778492e-06, + "loss": 0.6535, + "step": 3817 + }, + { + "epoch": 0.8897001708870592, + "grad_norm": 0.3429029583930969, + "learning_rate": 4.745159200641563e-06, + "loss": 0.6394, + "step": 3818 + }, + { + "epoch": 0.8899331986950443, + "grad_norm": 0.35112065076828003, + "learning_rate": 4.745024476814122e-06, + "loss": 0.6184, + "step": 3819 + }, + { + "epoch": 0.8901662265030293, + "grad_norm": 0.3634589910507202, + "learning_rate": 4.744889719298191e-06, + "loss": 0.6603, + "step": 3820 + }, + { + "epoch": 0.8903992543110144, + "grad_norm": 0.34621715545654297, + "learning_rate": 4.7447549280957925e-06, + "loss": 0.6287, + "step": 3821 + }, + { + "epoch": 0.8906322821189996, + "grad_norm": 0.3533332645893097, + "learning_rate": 4.744620103208948e-06, + "loss": 0.6471, + "step": 3822 + }, + { + "epoch": 0.8908653099269846, + "grad_norm": 0.35788652300834656, + "learning_rate": 4.7444852446396804e-06, + "loss": 0.6409, + "step": 3823 + }, + { + "epoch": 0.8910983377349697, + "grad_norm": 0.33965203166007996, + "learning_rate": 4.744350352390016e-06, + "loss": 0.6353, + "step": 3824 + }, + { + "epoch": 0.8913313655429548, + "grad_norm": 0.35039815306663513, + "learning_rate": 4.744215426461975e-06, + "loss": 0.6277, + "step": 3825 + }, + { + "epoch": 0.8915643933509398, + "grad_norm": 0.37441912293434143, + "learning_rate": 4.744080466857585e-06, + "loss": 0.646, + "step": 3826 + }, + { + "epoch": 0.891797421158925, + "grad_norm": 0.338395357131958, + "learning_rate": 4.7439454735788705e-06, + "loss": 0.6279, + "step": 3827 + }, + { + "epoch": 0.8920304489669101, + "grad_norm": 0.33567681908607483, + "learning_rate": 4.743810446627857e-06, + "loss": 0.6541, + "step": 3828 + }, + { + "epoch": 0.8922634767748951, + "grad_norm": 0.33691978454589844, + "learning_rate": 4.743675386006571e-06, + "loss": 0.6455, + "step": 3829 + }, + { + "epoch": 0.8924965045828802, + "grad_norm": 0.3600013852119446, + "learning_rate": 4.743540291717038e-06, + "loss": 0.623, + "step": 3830 + }, + { + "epoch": 0.8927295323908653, + "grad_norm": 0.3391219973564148, + "learning_rate": 4.743405163761287e-06, + "loss": 0.635, + "step": 3831 + }, + { + "epoch": 0.8929625601988503, + "grad_norm": 0.3535481095314026, + "learning_rate": 4.743270002141345e-06, + "loss": 0.626, + "step": 3832 + }, + { + "epoch": 0.8931955880068355, + "grad_norm": 0.3476249873638153, + "learning_rate": 4.743134806859239e-06, + "loss": 0.6236, + "step": 3833 + }, + { + "epoch": 0.8934286158148206, + "grad_norm": 0.33879852294921875, + "learning_rate": 4.742999577916999e-06, + "loss": 0.6137, + "step": 3834 + }, + { + "epoch": 0.8936616436228056, + "grad_norm": 0.34976089000701904, + "learning_rate": 4.742864315316655e-06, + "loss": 0.6602, + "step": 3835 + }, + { + "epoch": 0.8938946714307907, + "grad_norm": 0.3615668714046478, + "learning_rate": 4.742729019060236e-06, + "loss": 0.6484, + "step": 3836 + }, + { + "epoch": 0.8941276992387758, + "grad_norm": 0.35952138900756836, + "learning_rate": 4.74259368914977e-06, + "loss": 0.6538, + "step": 3837 + }, + { + "epoch": 0.894360727046761, + "grad_norm": 0.35889166593551636, + "learning_rate": 4.742458325587291e-06, + "loss": 0.6038, + "step": 3838 + }, + { + "epoch": 0.894593754854746, + "grad_norm": 0.33610525727272034, + "learning_rate": 4.742322928374829e-06, + "loss": 0.6249, + "step": 3839 + }, + { + "epoch": 0.8948267826627311, + "grad_norm": 0.34461140632629395, + "learning_rate": 4.742187497514416e-06, + "loss": 0.6532, + "step": 3840 + }, + { + "epoch": 0.8950598104707161, + "grad_norm": 0.3389437794685364, + "learning_rate": 4.742052033008083e-06, + "loss": 0.677, + "step": 3841 + }, + { + "epoch": 0.8952928382787012, + "grad_norm": 0.352697491645813, + "learning_rate": 4.741916534857864e-06, + "loss": 0.646, + "step": 3842 + }, + { + "epoch": 0.8955258660866864, + "grad_norm": 0.34109431505203247, + "learning_rate": 4.741781003065792e-06, + "loss": 0.6487, + "step": 3843 + }, + { + "epoch": 0.8957588938946714, + "grad_norm": 0.33301639556884766, + "learning_rate": 4.741645437633902e-06, + "loss": 0.6587, + "step": 3844 + }, + { + "epoch": 0.8959919217026565, + "grad_norm": 0.33717355132102966, + "learning_rate": 4.741509838564225e-06, + "loss": 0.6268, + "step": 3845 + }, + { + "epoch": 0.8962249495106416, + "grad_norm": 0.3375745713710785, + "learning_rate": 4.7413742058587995e-06, + "loss": 0.6418, + "step": 3846 + }, + { + "epoch": 0.8964579773186266, + "grad_norm": 0.3435687720775604, + "learning_rate": 4.741238539519658e-06, + "loss": 0.6234, + "step": 3847 + }, + { + "epoch": 0.8966910051266118, + "grad_norm": 0.34973636269569397, + "learning_rate": 4.741102839548838e-06, + "loss": 0.6445, + "step": 3848 + }, + { + "epoch": 0.8969240329345969, + "grad_norm": 0.3533630967140198, + "learning_rate": 4.740967105948375e-06, + "loss": 0.6241, + "step": 3849 + }, + { + "epoch": 0.897157060742582, + "grad_norm": 0.3520508408546448, + "learning_rate": 4.740831338720305e-06, + "loss": 0.6151, + "step": 3850 + }, + { + "epoch": 0.897390088550567, + "grad_norm": 0.3425200879573822, + "learning_rate": 4.740695537866668e-06, + "loss": 0.6291, + "step": 3851 + }, + { + "epoch": 0.8976231163585521, + "grad_norm": 0.36475512385368347, + "learning_rate": 4.740559703389499e-06, + "loss": 0.6509, + "step": 3852 + }, + { + "epoch": 0.8978561441665373, + "grad_norm": 0.33554431796073914, + "learning_rate": 4.740423835290837e-06, + "loss": 0.637, + "step": 3853 + }, + { + "epoch": 0.8980891719745223, + "grad_norm": 0.3561400771141052, + "learning_rate": 4.7402879335727225e-06, + "loss": 0.6253, + "step": 3854 + }, + { + "epoch": 0.8983221997825074, + "grad_norm": 0.3329872190952301, + "learning_rate": 4.740151998237193e-06, + "loss": 0.6554, + "step": 3855 + }, + { + "epoch": 0.8985552275904924, + "grad_norm": 0.3630790710449219, + "learning_rate": 4.740016029286289e-06, + "loss": 0.6927, + "step": 3856 + }, + { + "epoch": 0.8987882553984775, + "grad_norm": 0.34806379675865173, + "learning_rate": 4.739880026722049e-06, + "loss": 0.6339, + "step": 3857 + }, + { + "epoch": 0.8990212832064627, + "grad_norm": 0.3382173478603363, + "learning_rate": 4.739743990546517e-06, + "loss": 0.6284, + "step": 3858 + }, + { + "epoch": 0.8992543110144477, + "grad_norm": 0.35595330595970154, + "learning_rate": 4.739607920761733e-06, + "loss": 0.6554, + "step": 3859 + }, + { + "epoch": 0.8994873388224328, + "grad_norm": 0.34291785955429077, + "learning_rate": 4.739471817369738e-06, + "loss": 0.6414, + "step": 3860 + }, + { + "epoch": 0.8997203666304179, + "grad_norm": 0.3440500497817993, + "learning_rate": 4.739335680372576e-06, + "loss": 0.6021, + "step": 3861 + }, + { + "epoch": 0.8999533944384029, + "grad_norm": 0.360556036233902, + "learning_rate": 4.739199509772288e-06, + "loss": 0.644, + "step": 3862 + }, + { + "epoch": 0.9001864222463881, + "grad_norm": 0.34190911054611206, + "learning_rate": 4.739063305570919e-06, + "loss": 0.6486, + "step": 3863 + }, + { + "epoch": 0.9004194500543732, + "grad_norm": 0.352434366941452, + "learning_rate": 4.738927067770512e-06, + "loss": 0.6514, + "step": 3864 + }, + { + "epoch": 0.9006524778623582, + "grad_norm": 0.34317663311958313, + "learning_rate": 4.738790796373111e-06, + "loss": 0.6395, + "step": 3865 + }, + { + "epoch": 0.9008855056703433, + "grad_norm": 0.339983195066452, + "learning_rate": 4.738654491380761e-06, + "loss": 0.647, + "step": 3866 + }, + { + "epoch": 0.9011185334783284, + "grad_norm": 0.34246137738227844, + "learning_rate": 4.738518152795508e-06, + "loss": 0.6208, + "step": 3867 + }, + { + "epoch": 0.9013515612863136, + "grad_norm": 0.3379640579223633, + "learning_rate": 4.738381780619398e-06, + "loss": 0.6272, + "step": 3868 + }, + { + "epoch": 0.9015845890942986, + "grad_norm": 0.3563805818557739, + "learning_rate": 4.738245374854477e-06, + "loss": 0.6064, + "step": 3869 + }, + { + "epoch": 0.9018176169022837, + "grad_norm": 0.3397010266780853, + "learning_rate": 4.738108935502791e-06, + "loss": 0.6304, + "step": 3870 + }, + { + "epoch": 0.9020506447102687, + "grad_norm": 0.36420896649360657, + "learning_rate": 4.7379724625663885e-06, + "loss": 0.655, + "step": 3871 + }, + { + "epoch": 0.9022836725182538, + "grad_norm": 0.35871726274490356, + "learning_rate": 4.737835956047317e-06, + "loss": 0.6505, + "step": 3872 + }, + { + "epoch": 0.902516700326239, + "grad_norm": 0.33052441477775574, + "learning_rate": 4.737699415947626e-06, + "loss": 0.6326, + "step": 3873 + }, + { + "epoch": 0.902749728134224, + "grad_norm": 0.3357349932193756, + "learning_rate": 4.737562842269363e-06, + "loss": 0.6367, + "step": 3874 + }, + { + "epoch": 0.9029827559422091, + "grad_norm": 0.34254252910614014, + "learning_rate": 4.737426235014578e-06, + "loss": 0.6456, + "step": 3875 + }, + { + "epoch": 0.9032157837501942, + "grad_norm": 0.36372002959251404, + "learning_rate": 4.73728959418532e-06, + "loss": 0.6165, + "step": 3876 + }, + { + "epoch": 0.9034488115581792, + "grad_norm": 0.34341010451316833, + "learning_rate": 4.737152919783641e-06, + "loss": 0.6409, + "step": 3877 + }, + { + "epoch": 0.9036818393661643, + "grad_norm": 0.33796125650405884, + "learning_rate": 4.737016211811591e-06, + "loss": 0.6636, + "step": 3878 + }, + { + "epoch": 0.9039148671741495, + "grad_norm": 0.3355461657047272, + "learning_rate": 4.73687947027122e-06, + "loss": 0.6171, + "step": 3879 + }, + { + "epoch": 0.9041478949821345, + "grad_norm": 0.34699252247810364, + "learning_rate": 4.736742695164582e-06, + "loss": 0.6499, + "step": 3880 + }, + { + "epoch": 0.9043809227901196, + "grad_norm": 0.35485294461250305, + "learning_rate": 4.73660588649373e-06, + "loss": 0.6322, + "step": 3881 + }, + { + "epoch": 0.9046139505981047, + "grad_norm": 0.3490213453769684, + "learning_rate": 4.7364690442607144e-06, + "loss": 0.6439, + "step": 3882 + }, + { + "epoch": 0.9048469784060897, + "grad_norm": 0.3430531620979309, + "learning_rate": 4.7363321684675915e-06, + "loss": 0.6285, + "step": 3883 + }, + { + "epoch": 0.9050800062140749, + "grad_norm": 0.3388710618019104, + "learning_rate": 4.736195259116412e-06, + "loss": 0.6357, + "step": 3884 + }, + { + "epoch": 0.90531303402206, + "grad_norm": 0.35716938972473145, + "learning_rate": 4.7360583162092335e-06, + "loss": 0.6632, + "step": 3885 + }, + { + "epoch": 0.905546061830045, + "grad_norm": 0.34946081042289734, + "learning_rate": 4.735921339748108e-06, + "loss": 0.6229, + "step": 3886 + }, + { + "epoch": 0.9057790896380301, + "grad_norm": 0.33701908588409424, + "learning_rate": 4.735784329735094e-06, + "loss": 0.6603, + "step": 3887 + }, + { + "epoch": 0.9060121174460152, + "grad_norm": 0.3414117991924286, + "learning_rate": 4.735647286172245e-06, + "loss": 0.645, + "step": 3888 + }, + { + "epoch": 0.9062451452540004, + "grad_norm": 0.34050947427749634, + "learning_rate": 4.735510209061619e-06, + "loss": 0.6415, + "step": 3889 + }, + { + "epoch": 0.9064781730619854, + "grad_norm": 0.3546944260597229, + "learning_rate": 4.7353730984052714e-06, + "loss": 0.6413, + "step": 3890 + }, + { + "epoch": 0.9067112008699705, + "grad_norm": 0.35964566469192505, + "learning_rate": 4.735235954205261e-06, + "loss": 0.6736, + "step": 3891 + }, + { + "epoch": 0.9069442286779555, + "grad_norm": 0.34705570340156555, + "learning_rate": 4.735098776463646e-06, + "loss": 0.6458, + "step": 3892 + }, + { + "epoch": 0.9071772564859406, + "grad_norm": 0.33552995324134827, + "learning_rate": 4.734961565182483e-06, + "loss": 0.6484, + "step": 3893 + }, + { + "epoch": 0.9074102842939258, + "grad_norm": 0.37147006392478943, + "learning_rate": 4.734824320363833e-06, + "loss": 0.665, + "step": 3894 + }, + { + "epoch": 0.9076433121019108, + "grad_norm": 0.34786590933799744, + "learning_rate": 4.734687042009754e-06, + "loss": 0.632, + "step": 3895 + }, + { + "epoch": 0.9078763399098959, + "grad_norm": 0.35050633549690247, + "learning_rate": 4.734549730122307e-06, + "loss": 0.6429, + "step": 3896 + }, + { + "epoch": 0.908109367717881, + "grad_norm": 0.34226545691490173, + "learning_rate": 4.734412384703553e-06, + "loss": 0.6276, + "step": 3897 + }, + { + "epoch": 0.908342395525866, + "grad_norm": 0.3417789340019226, + "learning_rate": 4.734275005755551e-06, + "loss": 0.6585, + "step": 3898 + }, + { + "epoch": 0.9085754233338512, + "grad_norm": 0.34869465231895447, + "learning_rate": 4.734137593280365e-06, + "loss": 0.6496, + "step": 3899 + }, + { + "epoch": 0.9088084511418363, + "grad_norm": 0.3504216969013214, + "learning_rate": 4.734000147280054e-06, + "loss": 0.6399, + "step": 3900 + }, + { + "epoch": 0.9090414789498213, + "grad_norm": 0.35288649797439575, + "learning_rate": 4.733862667756682e-06, + "loss": 0.6415, + "step": 3901 + }, + { + "epoch": 0.9092745067578064, + "grad_norm": 0.3530333936214447, + "learning_rate": 4.733725154712313e-06, + "loss": 0.655, + "step": 3902 + }, + { + "epoch": 0.9095075345657915, + "grad_norm": 0.34417417645454407, + "learning_rate": 4.73358760814901e-06, + "loss": 0.6436, + "step": 3903 + }, + { + "epoch": 0.9097405623737767, + "grad_norm": 0.3469332158565521, + "learning_rate": 4.733450028068835e-06, + "loss": 0.6647, + "step": 3904 + }, + { + "epoch": 0.9099735901817617, + "grad_norm": 0.34454256296157837, + "learning_rate": 4.733312414473855e-06, + "loss": 0.6323, + "step": 3905 + }, + { + "epoch": 0.9102066179897468, + "grad_norm": 0.3486971855163574, + "learning_rate": 4.733174767366134e-06, + "loss": 0.6217, + "step": 3906 + }, + { + "epoch": 0.9104396457977318, + "grad_norm": 0.36047473549842834, + "learning_rate": 4.733037086747737e-06, + "loss": 0.6452, + "step": 3907 + }, + { + "epoch": 0.9106726736057169, + "grad_norm": 0.3514440655708313, + "learning_rate": 4.732899372620732e-06, + "loss": 0.665, + "step": 3908 + }, + { + "epoch": 0.9109057014137021, + "grad_norm": 0.33752280473709106, + "learning_rate": 4.732761624987183e-06, + "loss": 0.6151, + "step": 3909 + }, + { + "epoch": 0.9111387292216871, + "grad_norm": 0.34417960047721863, + "learning_rate": 4.732623843849159e-06, + "loss": 0.6352, + "step": 3910 + }, + { + "epoch": 0.9113717570296722, + "grad_norm": 0.34237760305404663, + "learning_rate": 4.732486029208725e-06, + "loss": 0.6282, + "step": 3911 + }, + { + "epoch": 0.9116047848376573, + "grad_norm": 0.3623654246330261, + "learning_rate": 4.732348181067952e-06, + "loss": 0.6576, + "step": 3912 + }, + { + "epoch": 0.9118378126456423, + "grad_norm": 0.354366660118103, + "learning_rate": 4.732210299428907e-06, + "loss": 0.6286, + "step": 3913 + }, + { + "epoch": 0.9120708404536275, + "grad_norm": 0.33483007550239563, + "learning_rate": 4.732072384293659e-06, + "loss": 0.645, + "step": 3914 + }, + { + "epoch": 0.9123038682616126, + "grad_norm": 0.35395142436027527, + "learning_rate": 4.731934435664278e-06, + "loss": 0.6461, + "step": 3915 + }, + { + "epoch": 0.9125368960695976, + "grad_norm": 0.3502449095249176, + "learning_rate": 4.731796453542834e-06, + "loss": 0.6374, + "step": 3916 + }, + { + "epoch": 0.9127699238775827, + "grad_norm": 0.34913840889930725, + "learning_rate": 4.731658437931397e-06, + "loss": 0.6221, + "step": 3917 + }, + { + "epoch": 0.9130029516855678, + "grad_norm": 0.345021516084671, + "learning_rate": 4.731520388832039e-06, + "loss": 0.6177, + "step": 3918 + }, + { + "epoch": 0.913235979493553, + "grad_norm": 0.3445473611354828, + "learning_rate": 4.731382306246831e-06, + "loss": 0.6056, + "step": 3919 + }, + { + "epoch": 0.913469007301538, + "grad_norm": 0.33472028374671936, + "learning_rate": 4.731244190177844e-06, + "loss": 0.6299, + "step": 3920 + }, + { + "epoch": 0.9137020351095231, + "grad_norm": 0.34397560358047485, + "learning_rate": 4.731106040627153e-06, + "loss": 0.6629, + "step": 3921 + }, + { + "epoch": 0.9139350629175081, + "grad_norm": 0.3379632830619812, + "learning_rate": 4.730967857596828e-06, + "loss": 0.6252, + "step": 3922 + }, + { + "epoch": 0.9141680907254932, + "grad_norm": 0.3550339341163635, + "learning_rate": 4.730829641088945e-06, + "loss": 0.6585, + "step": 3923 + }, + { + "epoch": 0.9144011185334783, + "grad_norm": 0.33841466903686523, + "learning_rate": 4.730691391105577e-06, + "loss": 0.6327, + "step": 3924 + }, + { + "epoch": 0.9146341463414634, + "grad_norm": 0.34635213017463684, + "learning_rate": 4.730553107648799e-06, + "loss": 0.6305, + "step": 3925 + }, + { + "epoch": 0.9148671741494485, + "grad_norm": 0.32462501525878906, + "learning_rate": 4.730414790720685e-06, + "loss": 0.6129, + "step": 3926 + }, + { + "epoch": 0.9151002019574336, + "grad_norm": 0.3589008152484894, + "learning_rate": 4.730276440323312e-06, + "loss": 0.6578, + "step": 3927 + }, + { + "epoch": 0.9153332297654186, + "grad_norm": 0.3234192430973053, + "learning_rate": 4.730138056458755e-06, + "loss": 0.6036, + "step": 3928 + }, + { + "epoch": 0.9155662575734037, + "grad_norm": 0.3388063311576843, + "learning_rate": 4.729999639129092e-06, + "loss": 0.6376, + "step": 3929 + }, + { + "epoch": 0.9157992853813889, + "grad_norm": 0.33314982056617737, + "learning_rate": 4.729861188336399e-06, + "loss": 0.627, + "step": 3930 + }, + { + "epoch": 0.916032313189374, + "grad_norm": 0.3448079526424408, + "learning_rate": 4.7297227040827526e-06, + "loss": 0.6252, + "step": 3931 + }, + { + "epoch": 0.916265340997359, + "grad_norm": 0.33442458510398865, + "learning_rate": 4.729584186370233e-06, + "loss": 0.6346, + "step": 3932 + }, + { + "epoch": 0.9164983688053441, + "grad_norm": 0.36105877161026, + "learning_rate": 4.729445635200917e-06, + "loss": 0.6547, + "step": 3933 + }, + { + "epoch": 0.9167313966133291, + "grad_norm": 0.341082900762558, + "learning_rate": 4.729307050576884e-06, + "loss": 0.6092, + "step": 3934 + }, + { + "epoch": 0.9169644244213143, + "grad_norm": 0.3469405174255371, + "learning_rate": 4.729168432500215e-06, + "loss": 0.6347, + "step": 3935 + }, + { + "epoch": 0.9171974522292994, + "grad_norm": 0.3389022946357727, + "learning_rate": 4.729029780972988e-06, + "loss": 0.6473, + "step": 3936 + }, + { + "epoch": 0.9174304800372844, + "grad_norm": 0.3401764929294586, + "learning_rate": 4.728891095997286e-06, + "loss": 0.6472, + "step": 3937 + }, + { + "epoch": 0.9176635078452695, + "grad_norm": 0.330806165933609, + "learning_rate": 4.728752377575188e-06, + "loss": 0.6311, + "step": 3938 + }, + { + "epoch": 0.9178965356532546, + "grad_norm": 0.3510688841342926, + "learning_rate": 4.728613625708776e-06, + "loss": 0.618, + "step": 3939 + }, + { + "epoch": 0.9181295634612398, + "grad_norm": 0.34087854623794556, + "learning_rate": 4.7284748404001315e-06, + "loss": 0.6279, + "step": 3940 + }, + { + "epoch": 0.9183625912692248, + "grad_norm": 0.3598979115486145, + "learning_rate": 4.72833602165134e-06, + "loss": 0.6562, + "step": 3941 + }, + { + "epoch": 0.9185956190772099, + "grad_norm": 0.3396141231060028, + "learning_rate": 4.728197169464482e-06, + "loss": 0.6421, + "step": 3942 + }, + { + "epoch": 0.9188286468851949, + "grad_norm": 0.3487478792667389, + "learning_rate": 4.72805828384164e-06, + "loss": 0.6422, + "step": 3943 + }, + { + "epoch": 0.91906167469318, + "grad_norm": 0.3431631624698639, + "learning_rate": 4.727919364784901e-06, + "loss": 0.6335, + "step": 3944 + }, + { + "epoch": 0.9192947025011652, + "grad_norm": 0.34448882937431335, + "learning_rate": 4.727780412296349e-06, + "loss": 0.6223, + "step": 3945 + }, + { + "epoch": 0.9195277303091502, + "grad_norm": 0.35726746916770935, + "learning_rate": 4.727641426378068e-06, + "loss": 0.6318, + "step": 3946 + }, + { + "epoch": 0.9197607581171353, + "grad_norm": 0.35291051864624023, + "learning_rate": 4.727502407032143e-06, + "loss": 0.6556, + "step": 3947 + }, + { + "epoch": 0.9199937859251204, + "grad_norm": 0.3497447073459625, + "learning_rate": 4.727363354260662e-06, + "loss": 0.6449, + "step": 3948 + }, + { + "epoch": 0.9202268137331054, + "grad_norm": 0.3685568869113922, + "learning_rate": 4.727224268065711e-06, + "loss": 0.6345, + "step": 3949 + }, + { + "epoch": 0.9204598415410906, + "grad_norm": 0.3459789454936981, + "learning_rate": 4.727085148449376e-06, + "loss": 0.6321, + "step": 3950 + }, + { + "epoch": 0.9206928693490757, + "grad_norm": 0.35603341460227966, + "learning_rate": 4.7269459954137465e-06, + "loss": 0.6466, + "step": 3951 + }, + { + "epoch": 0.9209258971570607, + "grad_norm": 0.3626619577407837, + "learning_rate": 4.726806808960909e-06, + "loss": 0.6565, + "step": 3952 + }, + { + "epoch": 0.9211589249650458, + "grad_norm": 0.34058302640914917, + "learning_rate": 4.7266675890929534e-06, + "loss": 0.6205, + "step": 3953 + }, + { + "epoch": 0.9213919527730309, + "grad_norm": 0.3609338402748108, + "learning_rate": 4.726528335811967e-06, + "loss": 0.6282, + "step": 3954 + }, + { + "epoch": 0.921624980581016, + "grad_norm": 0.3524787127971649, + "learning_rate": 4.726389049120041e-06, + "loss": 0.6278, + "step": 3955 + }, + { + "epoch": 0.9218580083890011, + "grad_norm": 0.3527066111564636, + "learning_rate": 4.726249729019266e-06, + "loss": 0.616, + "step": 3956 + }, + { + "epoch": 0.9220910361969862, + "grad_norm": 0.3433379828929901, + "learning_rate": 4.72611037551173e-06, + "loss": 0.6783, + "step": 3957 + }, + { + "epoch": 0.9223240640049712, + "grad_norm": 0.3391532301902771, + "learning_rate": 4.725970988599527e-06, + "loss": 0.6459, + "step": 3958 + }, + { + "epoch": 0.9225570918129563, + "grad_norm": 0.3363184928894043, + "learning_rate": 4.725831568284747e-06, + "loss": 0.6578, + "step": 3959 + }, + { + "epoch": 0.9227901196209415, + "grad_norm": 0.36636167764663696, + "learning_rate": 4.725692114569483e-06, + "loss": 0.617, + "step": 3960 + }, + { + "epoch": 0.9230231474289265, + "grad_norm": 0.35327252745628357, + "learning_rate": 4.725552627455826e-06, + "loss": 0.6462, + "step": 3961 + }, + { + "epoch": 0.9232561752369116, + "grad_norm": 0.35970330238342285, + "learning_rate": 4.725413106945871e-06, + "loss": 0.6435, + "step": 3962 + }, + { + "epoch": 0.9234892030448967, + "grad_norm": 0.34565651416778564, + "learning_rate": 4.72527355304171e-06, + "loss": 0.648, + "step": 3963 + }, + { + "epoch": 0.9237222308528817, + "grad_norm": 0.34877097606658936, + "learning_rate": 4.725133965745439e-06, + "loss": 0.6515, + "step": 3964 + }, + { + "epoch": 0.9239552586608669, + "grad_norm": 0.35003483295440674, + "learning_rate": 4.7249943450591515e-06, + "loss": 0.6208, + "step": 3965 + }, + { + "epoch": 0.924188286468852, + "grad_norm": 0.34210526943206787, + "learning_rate": 4.724854690984943e-06, + "loss": 0.6403, + "step": 3966 + }, + { + "epoch": 0.924421314276837, + "grad_norm": 0.35930535197257996, + "learning_rate": 4.724715003524908e-06, + "loss": 0.6583, + "step": 3967 + }, + { + "epoch": 0.9246543420848221, + "grad_norm": 0.3419696092605591, + "learning_rate": 4.7245752826811445e-06, + "loss": 0.6137, + "step": 3968 + }, + { + "epoch": 0.9248873698928072, + "grad_norm": 0.35197046399116516, + "learning_rate": 4.724435528455747e-06, + "loss": 0.6433, + "step": 3969 + }, + { + "epoch": 0.9251203977007922, + "grad_norm": 0.367239773273468, + "learning_rate": 4.724295740850814e-06, + "loss": 0.6162, + "step": 3970 + }, + { + "epoch": 0.9253534255087774, + "grad_norm": 0.33211272954940796, + "learning_rate": 4.724155919868444e-06, + "loss": 0.6366, + "step": 3971 + }, + { + "epoch": 0.9255864533167625, + "grad_norm": 0.34288662672042847, + "learning_rate": 4.724016065510733e-06, + "loss": 0.6323, + "step": 3972 + }, + { + "epoch": 0.9258194811247475, + "grad_norm": 0.3504854440689087, + "learning_rate": 4.723876177779781e-06, + "loss": 0.6602, + "step": 3973 + }, + { + "epoch": 0.9260525089327326, + "grad_norm": 0.35490524768829346, + "learning_rate": 4.723736256677687e-06, + "loss": 0.6436, + "step": 3974 + }, + { + "epoch": 0.9262855367407177, + "grad_norm": 0.3470008671283722, + "learning_rate": 4.72359630220655e-06, + "loss": 0.6531, + "step": 3975 + }, + { + "epoch": 0.9265185645487028, + "grad_norm": 0.34154486656188965, + "learning_rate": 4.723456314368471e-06, + "loss": 0.6466, + "step": 3976 + }, + { + "epoch": 0.9267515923566879, + "grad_norm": 0.3397318124771118, + "learning_rate": 4.72331629316555e-06, + "loss": 0.6373, + "step": 3977 + }, + { + "epoch": 0.926984620164673, + "grad_norm": 0.34282854199409485, + "learning_rate": 4.723176238599888e-06, + "loss": 0.6492, + "step": 3978 + }, + { + "epoch": 0.927217647972658, + "grad_norm": 0.3641519546508789, + "learning_rate": 4.723036150673586e-06, + "loss": 0.6362, + "step": 3979 + }, + { + "epoch": 0.9274506757806431, + "grad_norm": 0.3472311198711395, + "learning_rate": 4.722896029388749e-06, + "loss": 0.6531, + "step": 3980 + }, + { + "epoch": 0.9276837035886283, + "grad_norm": 0.3367893397808075, + "learning_rate": 4.7227558747474775e-06, + "loss": 0.6533, + "step": 3981 + }, + { + "epoch": 0.9279167313966133, + "grad_norm": 0.3454027473926544, + "learning_rate": 4.722615686751874e-06, + "loss": 0.6337, + "step": 3982 + }, + { + "epoch": 0.9281497592045984, + "grad_norm": 0.35066643357276917, + "learning_rate": 4.7224754654040435e-06, + "loss": 0.6433, + "step": 3983 + }, + { + "epoch": 0.9283827870125835, + "grad_norm": 0.36555030941963196, + "learning_rate": 4.722335210706089e-06, + "loss": 0.6456, + "step": 3984 + }, + { + "epoch": 0.9286158148205685, + "grad_norm": 0.3492906093597412, + "learning_rate": 4.722194922660117e-06, + "loss": 0.6457, + "step": 3985 + }, + { + "epoch": 0.9288488426285537, + "grad_norm": 0.3600741922855377, + "learning_rate": 4.722054601268231e-06, + "loss": 0.6373, + "step": 3986 + }, + { + "epoch": 0.9290818704365388, + "grad_norm": 0.345948725938797, + "learning_rate": 4.721914246532537e-06, + "loss": 0.6477, + "step": 3987 + }, + { + "epoch": 0.9293148982445238, + "grad_norm": 0.38339513540267944, + "learning_rate": 4.7217738584551405e-06, + "loss": 0.6669, + "step": 3988 + }, + { + "epoch": 0.9295479260525089, + "grad_norm": 0.3660281300544739, + "learning_rate": 4.72163343703815e-06, + "loss": 0.6248, + "step": 3989 + }, + { + "epoch": 0.929780953860494, + "grad_norm": 0.3529433608055115, + "learning_rate": 4.721492982283671e-06, + "loss": 0.6264, + "step": 3990 + }, + { + "epoch": 0.9300139816684792, + "grad_norm": 0.3576503098011017, + "learning_rate": 4.721352494193812e-06, + "loss": 0.6473, + "step": 3991 + }, + { + "epoch": 0.9302470094764642, + "grad_norm": 0.3469160199165344, + "learning_rate": 4.721211972770681e-06, + "loss": 0.6218, + "step": 3992 + }, + { + "epoch": 0.9304800372844493, + "grad_norm": 0.36292245984077454, + "learning_rate": 4.721071418016386e-06, + "loss": 0.6372, + "step": 3993 + }, + { + "epoch": 0.9307130650924343, + "grad_norm": 0.34668219089508057, + "learning_rate": 4.720930829933036e-06, + "loss": 0.6388, + "step": 3994 + }, + { + "epoch": 0.9309460929004194, + "grad_norm": 0.3582836389541626, + "learning_rate": 4.720790208522742e-06, + "loss": 0.6564, + "step": 3995 + }, + { + "epoch": 0.9311791207084046, + "grad_norm": 0.339874267578125, + "learning_rate": 4.720649553787613e-06, + "loss": 0.6353, + "step": 3996 + }, + { + "epoch": 0.9314121485163896, + "grad_norm": 0.3574848771095276, + "learning_rate": 4.720508865729761e-06, + "loss": 0.657, + "step": 3997 + }, + { + "epoch": 0.9316451763243747, + "grad_norm": 0.362556129693985, + "learning_rate": 4.720368144351295e-06, + "loss": 0.6723, + "step": 3998 + }, + { + "epoch": 0.9318782041323598, + "grad_norm": 0.35559675097465515, + "learning_rate": 4.720227389654328e-06, + "loss": 0.6401, + "step": 3999 + }, + { + "epoch": 0.9321112319403448, + "grad_norm": 0.34215158224105835, + "learning_rate": 4.720086601640972e-06, + "loss": 0.6402, + "step": 4000 + }, + { + "epoch": 0.93234425974833, + "grad_norm": 0.364961713552475, + "learning_rate": 4.7199457803133395e-06, + "loss": 0.6229, + "step": 4001 + }, + { + "epoch": 0.9325772875563151, + "grad_norm": 0.34748655557632446, + "learning_rate": 4.719804925673545e-06, + "loss": 0.6089, + "step": 4002 + }, + { + "epoch": 0.9328103153643001, + "grad_norm": 0.36629366874694824, + "learning_rate": 4.719664037723699e-06, + "loss": 0.6491, + "step": 4003 + }, + { + "epoch": 0.9330433431722852, + "grad_norm": 0.36294519901275635, + "learning_rate": 4.719523116465918e-06, + "loss": 0.6414, + "step": 4004 + }, + { + "epoch": 0.9332763709802703, + "grad_norm": 0.3514779210090637, + "learning_rate": 4.719382161902317e-06, + "loss": 0.6349, + "step": 4005 + }, + { + "epoch": 0.9335093987882555, + "grad_norm": 0.35359108448028564, + "learning_rate": 4.71924117403501e-06, + "loss": 0.6592, + "step": 4006 + }, + { + "epoch": 0.9337424265962405, + "grad_norm": 0.4285716116428375, + "learning_rate": 4.719100152866112e-06, + "loss": 0.6268, + "step": 4007 + }, + { + "epoch": 0.9339754544042256, + "grad_norm": 0.36638516187667847, + "learning_rate": 4.718959098397741e-06, + "loss": 0.6486, + "step": 4008 + }, + { + "epoch": 0.9342084822122106, + "grad_norm": 0.3425025939941406, + "learning_rate": 4.718818010632012e-06, + "loss": 0.6341, + "step": 4009 + }, + { + "epoch": 0.9344415100201957, + "grad_norm": 0.3720821440219879, + "learning_rate": 4.718676889571044e-06, + "loss": 0.6584, + "step": 4010 + }, + { + "epoch": 0.9346745378281809, + "grad_norm": 0.37117549777030945, + "learning_rate": 4.7185357352169535e-06, + "loss": 0.6268, + "step": 4011 + }, + { + "epoch": 0.934907565636166, + "grad_norm": 0.3590240776538849, + "learning_rate": 4.718394547571858e-06, + "loss": 0.648, + "step": 4012 + }, + { + "epoch": 0.935140593444151, + "grad_norm": 0.3648211658000946, + "learning_rate": 4.7182533266378765e-06, + "loss": 0.654, + "step": 4013 + }, + { + "epoch": 0.9353736212521361, + "grad_norm": 0.3608511984348297, + "learning_rate": 4.7181120724171285e-06, + "loss": 0.627, + "step": 4014 + }, + { + "epoch": 0.9356066490601211, + "grad_norm": 0.36417120695114136, + "learning_rate": 4.717970784911734e-06, + "loss": 0.6072, + "step": 4015 + }, + { + "epoch": 0.9358396768681062, + "grad_norm": 0.3368236720561981, + "learning_rate": 4.717829464123812e-06, + "loss": 0.637, + "step": 4016 + }, + { + "epoch": 0.9360727046760914, + "grad_norm": 0.34973829984664917, + "learning_rate": 4.717688110055485e-06, + "loss": 0.6251, + "step": 4017 + }, + { + "epoch": 0.9363057324840764, + "grad_norm": 0.34785187244415283, + "learning_rate": 4.7175467227088726e-06, + "loss": 0.641, + "step": 4018 + }, + { + "epoch": 0.9365387602920615, + "grad_norm": 0.34182265400886536, + "learning_rate": 4.717405302086096e-06, + "loss": 0.6396, + "step": 4019 + }, + { + "epoch": 0.9367717881000466, + "grad_norm": 0.3493020236492157, + "learning_rate": 4.717263848189279e-06, + "loss": 0.6534, + "step": 4020 + }, + { + "epoch": 0.9370048159080316, + "grad_norm": 0.32276132702827454, + "learning_rate": 4.717122361020543e-06, + "loss": 0.6256, + "step": 4021 + }, + { + "epoch": 0.9372378437160168, + "grad_norm": 0.3459576964378357, + "learning_rate": 4.7169808405820114e-06, + "loss": 0.6375, + "step": 4022 + }, + { + "epoch": 0.9374708715240019, + "grad_norm": 0.3443538546562195, + "learning_rate": 4.716839286875808e-06, + "loss": 0.6556, + "step": 4023 + }, + { + "epoch": 0.937703899331987, + "grad_norm": 0.374693900346756, + "learning_rate": 4.716697699904057e-06, + "loss": 0.6446, + "step": 4024 + }, + { + "epoch": 0.937936927139972, + "grad_norm": 0.3662647604942322, + "learning_rate": 4.716556079668884e-06, + "loss": 0.6253, + "step": 4025 + }, + { + "epoch": 0.9381699549479571, + "grad_norm": 0.35500001907348633, + "learning_rate": 4.716414426172412e-06, + "loss": 0.6126, + "step": 4026 + }, + { + "epoch": 0.9384029827559422, + "grad_norm": 0.3564988374710083, + "learning_rate": 4.716272739416768e-06, + "loss": 0.6315, + "step": 4027 + }, + { + "epoch": 0.9386360105639273, + "grad_norm": 0.3453284800052643, + "learning_rate": 4.716131019404077e-06, + "loss": 0.6458, + "step": 4028 + }, + { + "epoch": 0.9388690383719124, + "grad_norm": 0.3758915662765503, + "learning_rate": 4.715989266136467e-06, + "loss": 0.6533, + "step": 4029 + }, + { + "epoch": 0.9391020661798974, + "grad_norm": 0.35876256227493286, + "learning_rate": 4.715847479616065e-06, + "loss": 0.6347, + "step": 4030 + }, + { + "epoch": 0.9393350939878825, + "grad_norm": 0.35994216799736023, + "learning_rate": 4.7157056598449966e-06, + "loss": 0.6508, + "step": 4031 + }, + { + "epoch": 0.9395681217958677, + "grad_norm": 0.3564876914024353, + "learning_rate": 4.715563806825393e-06, + "loss": 0.6319, + "step": 4032 + }, + { + "epoch": 0.9398011496038527, + "grad_norm": 0.3481888175010681, + "learning_rate": 4.715421920559381e-06, + "loss": 0.6359, + "step": 4033 + }, + { + "epoch": 0.9400341774118378, + "grad_norm": 0.34253546595573425, + "learning_rate": 4.715280001049091e-06, + "loss": 0.6372, + "step": 4034 + }, + { + "epoch": 0.9402672052198229, + "grad_norm": 0.3431030809879303, + "learning_rate": 4.71513804829665e-06, + "loss": 0.644, + "step": 4035 + }, + { + "epoch": 0.9405002330278079, + "grad_norm": 0.3438045084476471, + "learning_rate": 4.714996062304192e-06, + "loss": 0.63, + "step": 4036 + }, + { + "epoch": 0.9407332608357931, + "grad_norm": 0.34359902143478394, + "learning_rate": 4.714854043073843e-06, + "loss": 0.6583, + "step": 4037 + }, + { + "epoch": 0.9409662886437782, + "grad_norm": 0.34080246090888977, + "learning_rate": 4.714711990607737e-06, + "loss": 0.6142, + "step": 4038 + }, + { + "epoch": 0.9411993164517632, + "grad_norm": 0.34857818484306335, + "learning_rate": 4.7145699049080064e-06, + "loss": 0.6308, + "step": 4039 + }, + { + "epoch": 0.9414323442597483, + "grad_norm": 0.3472273647785187, + "learning_rate": 4.714427785976781e-06, + "loss": 0.6203, + "step": 4040 + }, + { + "epoch": 0.9416653720677334, + "grad_norm": 0.3369632065296173, + "learning_rate": 4.714285633816195e-06, + "loss": 0.6395, + "step": 4041 + }, + { + "epoch": 0.9418983998757186, + "grad_norm": 0.35332348942756653, + "learning_rate": 4.714143448428381e-06, + "loss": 0.6559, + "step": 4042 + }, + { + "epoch": 0.9421314276837036, + "grad_norm": 0.34638896584510803, + "learning_rate": 4.714001229815473e-06, + "loss": 0.6411, + "step": 4043 + }, + { + "epoch": 0.9423644554916887, + "grad_norm": 0.32606467604637146, + "learning_rate": 4.713858977979604e-06, + "loss": 0.6205, + "step": 4044 + }, + { + "epoch": 0.9425974832996737, + "grad_norm": 0.3366117477416992, + "learning_rate": 4.71371669292291e-06, + "loss": 0.6286, + "step": 4045 + }, + { + "epoch": 0.9428305111076588, + "grad_norm": 0.33556997776031494, + "learning_rate": 4.713574374647525e-06, + "loss": 0.6334, + "step": 4046 + }, + { + "epoch": 0.943063538915644, + "grad_norm": 0.3407151401042938, + "learning_rate": 4.713432023155585e-06, + "loss": 0.6241, + "step": 4047 + }, + { + "epoch": 0.943296566723629, + "grad_norm": 0.334515243768692, + "learning_rate": 4.713289638449227e-06, + "loss": 0.6205, + "step": 4048 + }, + { + "epoch": 0.9435295945316141, + "grad_norm": 0.343294233083725, + "learning_rate": 4.713147220530585e-06, + "loss": 0.6249, + "step": 4049 + }, + { + "epoch": 0.9437626223395992, + "grad_norm": 0.3651510179042816, + "learning_rate": 4.7130047694018e-06, + "loss": 0.6281, + "step": 4050 + }, + { + "epoch": 0.9439956501475842, + "grad_norm": 0.36329931020736694, + "learning_rate": 4.712862285065006e-06, + "loss": 0.6326, + "step": 4051 + }, + { + "epoch": 0.9442286779555694, + "grad_norm": 0.34905749559402466, + "learning_rate": 4.712719767522343e-06, + "loss": 0.6077, + "step": 4052 + }, + { + "epoch": 0.9444617057635545, + "grad_norm": 0.3637372553348541, + "learning_rate": 4.712577216775949e-06, + "loss": 0.6358, + "step": 4053 + }, + { + "epoch": 0.9446947335715395, + "grad_norm": 0.3700665533542633, + "learning_rate": 4.712434632827964e-06, + "loss": 0.6091, + "step": 4054 + }, + { + "epoch": 0.9449277613795246, + "grad_norm": 0.35119858384132385, + "learning_rate": 4.712292015680527e-06, + "loss": 0.6194, + "step": 4055 + }, + { + "epoch": 0.9451607891875097, + "grad_norm": 0.3548450469970703, + "learning_rate": 4.712149365335776e-06, + "loss": 0.6401, + "step": 4056 + }, + { + "epoch": 0.9453938169954949, + "grad_norm": 0.33072131872177124, + "learning_rate": 4.712006681795855e-06, + "loss": 0.6374, + "step": 4057 + }, + { + "epoch": 0.9456268448034799, + "grad_norm": 0.3509339988231659, + "learning_rate": 4.711863965062903e-06, + "loss": 0.6129, + "step": 4058 + }, + { + "epoch": 0.945859872611465, + "grad_norm": 0.34420838952064514, + "learning_rate": 4.711721215139063e-06, + "loss": 0.6327, + "step": 4059 + }, + { + "epoch": 0.94609290041945, + "grad_norm": 0.3433762192726135, + "learning_rate": 4.711578432026475e-06, + "loss": 0.6426, + "step": 4060 + }, + { + "epoch": 0.9463259282274351, + "grad_norm": 0.3458874225616455, + "learning_rate": 4.711435615727284e-06, + "loss": 0.6517, + "step": 4061 + }, + { + "epoch": 0.9465589560354202, + "grad_norm": 0.36765947937965393, + "learning_rate": 4.7112927662436316e-06, + "loss": 0.6361, + "step": 4062 + }, + { + "epoch": 0.9467919838434053, + "grad_norm": 0.3558279573917389, + "learning_rate": 4.711149883577662e-06, + "loss": 0.6512, + "step": 4063 + }, + { + "epoch": 0.9470250116513904, + "grad_norm": 0.34259936213493347, + "learning_rate": 4.711006967731519e-06, + "loss": 0.6498, + "step": 4064 + }, + { + "epoch": 0.9472580394593755, + "grad_norm": 0.3572194576263428, + "learning_rate": 4.710864018707348e-06, + "loss": 0.6586, + "step": 4065 + }, + { + "epoch": 0.9474910672673605, + "grad_norm": 0.35302966833114624, + "learning_rate": 4.710721036507291e-06, + "loss": 0.6546, + "step": 4066 + }, + { + "epoch": 0.9477240950753456, + "grad_norm": 0.33911585807800293, + "learning_rate": 4.7105780211334975e-06, + "loss": 0.6243, + "step": 4067 + }, + { + "epoch": 0.9479571228833308, + "grad_norm": 0.3554805815219879, + "learning_rate": 4.710434972588111e-06, + "loss": 0.6258, + "step": 4068 + }, + { + "epoch": 0.9481901506913158, + "grad_norm": 0.3463744521141052, + "learning_rate": 4.71029189087328e-06, + "loss": 0.6377, + "step": 4069 + }, + { + "epoch": 0.9484231784993009, + "grad_norm": 0.3514041006565094, + "learning_rate": 4.710148775991149e-06, + "loss": 0.6371, + "step": 4070 + }, + { + "epoch": 0.948656206307286, + "grad_norm": 0.35029953718185425, + "learning_rate": 4.7100056279438674e-06, + "loss": 0.6291, + "step": 4071 + }, + { + "epoch": 0.948889234115271, + "grad_norm": 0.3556520938873291, + "learning_rate": 4.709862446733584e-06, + "loss": 0.6402, + "step": 4072 + }, + { + "epoch": 0.9491222619232562, + "grad_norm": 0.356517493724823, + "learning_rate": 4.709719232362446e-06, + "loss": 0.6659, + "step": 4073 + }, + { + "epoch": 0.9493552897312413, + "grad_norm": 0.3460703492164612, + "learning_rate": 4.709575984832602e-06, + "loss": 0.6135, + "step": 4074 + }, + { + "epoch": 0.9495883175392263, + "grad_norm": 0.341627836227417, + "learning_rate": 4.709432704146203e-06, + "loss": 0.639, + "step": 4075 + }, + { + "epoch": 0.9498213453472114, + "grad_norm": 0.34520676732063293, + "learning_rate": 4.7092893903053975e-06, + "loss": 0.6481, + "step": 4076 + }, + { + "epoch": 0.9500543731551965, + "grad_norm": 0.3539208769798279, + "learning_rate": 4.709146043312337e-06, + "loss": 0.6229, + "step": 4077 + }, + { + "epoch": 0.9502874009631816, + "grad_norm": 0.34183570742607117, + "learning_rate": 4.709002663169173e-06, + "loss": 0.6505, + "step": 4078 + }, + { + "epoch": 0.9505204287711667, + "grad_norm": 0.3455747067928314, + "learning_rate": 4.708859249878057e-06, + "loss": 0.631, + "step": 4079 + }, + { + "epoch": 0.9507534565791518, + "grad_norm": 0.33745333552360535, + "learning_rate": 4.708715803441139e-06, + "loss": 0.6557, + "step": 4080 + }, + { + "epoch": 0.9509864843871368, + "grad_norm": 0.3433181047439575, + "learning_rate": 4.708572323860574e-06, + "loss": 0.6458, + "step": 4081 + }, + { + "epoch": 0.9512195121951219, + "grad_norm": 0.3469127416610718, + "learning_rate": 4.708428811138514e-06, + "loss": 0.6099, + "step": 4082 + }, + { + "epoch": 0.9514525400031071, + "grad_norm": 0.3461785614490509, + "learning_rate": 4.7082852652771124e-06, + "loss": 0.6285, + "step": 4083 + }, + { + "epoch": 0.9516855678110921, + "grad_norm": 0.34390559792518616, + "learning_rate": 4.708141686278524e-06, + "loss": 0.6257, + "step": 4084 + }, + { + "epoch": 0.9519185956190772, + "grad_norm": 0.3508131206035614, + "learning_rate": 4.707998074144902e-06, + "loss": 0.6371, + "step": 4085 + }, + { + "epoch": 0.9521516234270623, + "grad_norm": 0.3504517674446106, + "learning_rate": 4.7078544288784025e-06, + "loss": 0.6355, + "step": 4086 + }, + { + "epoch": 0.9523846512350473, + "grad_norm": 0.35657748579978943, + "learning_rate": 4.70771075048118e-06, + "loss": 0.6382, + "step": 4087 + }, + { + "epoch": 0.9526176790430325, + "grad_norm": 0.35951074957847595, + "learning_rate": 4.707567038955392e-06, + "loss": 0.6342, + "step": 4088 + }, + { + "epoch": 0.9528507068510176, + "grad_norm": 0.34907835721969604, + "learning_rate": 4.707423294303194e-06, + "loss": 0.6311, + "step": 4089 + }, + { + "epoch": 0.9530837346590026, + "grad_norm": 0.35337841510772705, + "learning_rate": 4.707279516526742e-06, + "loss": 0.6092, + "step": 4090 + }, + { + "epoch": 0.9533167624669877, + "grad_norm": 0.3493017256259918, + "learning_rate": 4.7071357056281965e-06, + "loss": 0.6551, + "step": 4091 + }, + { + "epoch": 0.9535497902749728, + "grad_norm": 0.3360094726085663, + "learning_rate": 4.706991861609713e-06, + "loss": 0.6383, + "step": 4092 + }, + { + "epoch": 0.953782818082958, + "grad_norm": 0.35654398798942566, + "learning_rate": 4.706847984473451e-06, + "loss": 0.6158, + "step": 4093 + }, + { + "epoch": 0.954015845890943, + "grad_norm": 0.33953747153282166, + "learning_rate": 4.706704074221569e-06, + "loss": 0.6266, + "step": 4094 + }, + { + "epoch": 0.9542488736989281, + "grad_norm": 0.3467637896537781, + "learning_rate": 4.706560130856227e-06, + "loss": 0.645, + "step": 4095 + }, + { + "epoch": 0.9544819015069131, + "grad_norm": 0.340591698884964, + "learning_rate": 4.706416154379585e-06, + "loss": 0.6163, + "step": 4096 + }, + { + "epoch": 0.9547149293148982, + "grad_norm": 0.3187772333621979, + "learning_rate": 4.706272144793802e-06, + "loss": 0.6025, + "step": 4097 + }, + { + "epoch": 0.9549479571228834, + "grad_norm": 0.35161280632019043, + "learning_rate": 4.706128102101041e-06, + "loss": 0.641, + "step": 4098 + }, + { + "epoch": 0.9551809849308684, + "grad_norm": 0.34821173548698425, + "learning_rate": 4.705984026303463e-06, + "loss": 0.6449, + "step": 4099 + }, + { + "epoch": 0.9554140127388535, + "grad_norm": 0.3433050811290741, + "learning_rate": 4.705839917403229e-06, + "loss": 0.6503, + "step": 4100 + }, + { + "epoch": 0.9556470405468386, + "grad_norm": 0.35014182329177856, + "learning_rate": 4.7056957754025026e-06, + "loss": 0.6596, + "step": 4101 + }, + { + "epoch": 0.9558800683548236, + "grad_norm": 0.34333646297454834, + "learning_rate": 4.705551600303445e-06, + "loss": 0.6505, + "step": 4102 + }, + { + "epoch": 0.9561130961628087, + "grad_norm": 0.35003596544265747, + "learning_rate": 4.7054073921082224e-06, + "loss": 0.6462, + "step": 4103 + }, + { + "epoch": 0.9563461239707939, + "grad_norm": 0.35982751846313477, + "learning_rate": 4.705263150818997e-06, + "loss": 0.6474, + "step": 4104 + }, + { + "epoch": 0.956579151778779, + "grad_norm": 0.3419579863548279, + "learning_rate": 4.705118876437933e-06, + "loss": 0.6331, + "step": 4105 + }, + { + "epoch": 0.956812179586764, + "grad_norm": 0.326788991689682, + "learning_rate": 4.704974568967197e-06, + "loss": 0.6353, + "step": 4106 + }, + { + "epoch": 0.9570452073947491, + "grad_norm": 0.34831392765045166, + "learning_rate": 4.704830228408953e-06, + "loss": 0.6447, + "step": 4107 + }, + { + "epoch": 0.9572782352027341, + "grad_norm": 0.3612050712108612, + "learning_rate": 4.704685854765368e-06, + "loss": 0.6344, + "step": 4108 + }, + { + "epoch": 0.9575112630107193, + "grad_norm": 0.33644765615463257, + "learning_rate": 4.704541448038606e-06, + "loss": 0.6334, + "step": 4109 + }, + { + "epoch": 0.9577442908187044, + "grad_norm": 0.3433167040348053, + "learning_rate": 4.704397008230838e-06, + "loss": 0.6269, + "step": 4110 + }, + { + "epoch": 0.9579773186266894, + "grad_norm": 0.34118807315826416, + "learning_rate": 4.704252535344228e-06, + "loss": 0.6444, + "step": 4111 + }, + { + "epoch": 0.9582103464346745, + "grad_norm": 0.3747541904449463, + "learning_rate": 4.704108029380945e-06, + "loss": 0.6747, + "step": 4112 + }, + { + "epoch": 0.9584433742426596, + "grad_norm": 0.33478042483329773, + "learning_rate": 4.703963490343157e-06, + "loss": 0.6458, + "step": 4113 + }, + { + "epoch": 0.9586764020506447, + "grad_norm": 0.3495170474052429, + "learning_rate": 4.703818918233035e-06, + "loss": 0.6536, + "step": 4114 + }, + { + "epoch": 0.9589094298586298, + "grad_norm": 0.3637564480304718, + "learning_rate": 4.703674313052746e-06, + "loss": 0.6361, + "step": 4115 + }, + { + "epoch": 0.9591424576666149, + "grad_norm": 0.33975401520729065, + "learning_rate": 4.703529674804461e-06, + "loss": 0.6252, + "step": 4116 + }, + { + "epoch": 0.9593754854745999, + "grad_norm": 0.3318953812122345, + "learning_rate": 4.70338500349035e-06, + "loss": 0.6172, + "step": 4117 + }, + { + "epoch": 0.959608513282585, + "grad_norm": 0.3549575209617615, + "learning_rate": 4.703240299112585e-06, + "loss": 0.6421, + "step": 4118 + }, + { + "epoch": 0.9598415410905702, + "grad_norm": 0.34093573689460754, + "learning_rate": 4.7030955616733355e-06, + "loss": 0.6465, + "step": 4119 + }, + { + "epoch": 0.9600745688985552, + "grad_norm": 0.346332848072052, + "learning_rate": 4.7029507911747755e-06, + "loss": 0.6095, + "step": 4120 + }, + { + "epoch": 0.9603075967065403, + "grad_norm": 0.33521321415901184, + "learning_rate": 4.702805987619076e-06, + "loss": 0.6433, + "step": 4121 + }, + { + "epoch": 0.9605406245145254, + "grad_norm": 0.33941054344177246, + "learning_rate": 4.702661151008411e-06, + "loss": 0.6382, + "step": 4122 + }, + { + "epoch": 0.9607736523225104, + "grad_norm": 0.34671664237976074, + "learning_rate": 4.702516281344952e-06, + "loss": 0.6331, + "step": 4123 + }, + { + "epoch": 0.9610066801304956, + "grad_norm": 0.36346057057380676, + "learning_rate": 4.702371378630875e-06, + "loss": 0.632, + "step": 4124 + }, + { + "epoch": 0.9612397079384807, + "grad_norm": 0.3462219834327698, + "learning_rate": 4.702226442868353e-06, + "loss": 0.6053, + "step": 4125 + }, + { + "epoch": 0.9614727357464657, + "grad_norm": 0.3610422909259796, + "learning_rate": 4.7020814740595614e-06, + "loss": 0.6624, + "step": 4126 + }, + { + "epoch": 0.9617057635544508, + "grad_norm": 0.3555166721343994, + "learning_rate": 4.701936472206676e-06, + "loss": 0.6273, + "step": 4127 + }, + { + "epoch": 0.9619387913624359, + "grad_norm": 0.3433380722999573, + "learning_rate": 4.701791437311871e-06, + "loss": 0.6262, + "step": 4128 + }, + { + "epoch": 0.962171819170421, + "grad_norm": 0.3463190793991089, + "learning_rate": 4.701646369377325e-06, + "loss": 0.631, + "step": 4129 + }, + { + "epoch": 0.9624048469784061, + "grad_norm": 0.34544050693511963, + "learning_rate": 4.701501268405214e-06, + "loss": 0.6261, + "step": 4130 + }, + { + "epoch": 0.9626378747863912, + "grad_norm": 0.3471633195877075, + "learning_rate": 4.701356134397715e-06, + "loss": 0.6194, + "step": 4131 + }, + { + "epoch": 0.9628709025943762, + "grad_norm": 0.35447537899017334, + "learning_rate": 4.701210967357006e-06, + "loss": 0.6564, + "step": 4132 + }, + { + "epoch": 0.9631039304023613, + "grad_norm": 0.3469873368740082, + "learning_rate": 4.701065767285265e-06, + "loss": 0.6308, + "step": 4133 + }, + { + "epoch": 0.9633369582103465, + "grad_norm": 0.352437287569046, + "learning_rate": 4.700920534184672e-06, + "loss": 0.6229, + "step": 4134 + }, + { + "epoch": 0.9635699860183315, + "grad_norm": 0.3516117334365845, + "learning_rate": 4.7007752680574056e-06, + "loss": 0.6139, + "step": 4135 + }, + { + "epoch": 0.9638030138263166, + "grad_norm": 0.34800562262535095, + "learning_rate": 4.700629968905646e-06, + "loss": 0.6248, + "step": 4136 + }, + { + "epoch": 0.9640360416343017, + "grad_norm": 0.34464031457901, + "learning_rate": 4.700484636731573e-06, + "loss": 0.6512, + "step": 4137 + }, + { + "epoch": 0.9642690694422867, + "grad_norm": 0.3386528491973877, + "learning_rate": 4.7003392715373666e-06, + "loss": 0.6637, + "step": 4138 + }, + { + "epoch": 0.9645020972502719, + "grad_norm": 0.3451685309410095, + "learning_rate": 4.7001938733252094e-06, + "loss": 0.6373, + "step": 4139 + }, + { + "epoch": 0.964735125058257, + "grad_norm": 0.3547638952732086, + "learning_rate": 4.700048442097284e-06, + "loss": 0.6303, + "step": 4140 + }, + { + "epoch": 0.964968152866242, + "grad_norm": 0.3364916443824768, + "learning_rate": 4.6999029778557716e-06, + "loss": 0.6118, + "step": 4141 + }, + { + "epoch": 0.9652011806742271, + "grad_norm": 0.3459072709083557, + "learning_rate": 4.699757480602855e-06, + "loss": 0.6271, + "step": 4142 + }, + { + "epoch": 0.9654342084822122, + "grad_norm": 0.3522323668003082, + "learning_rate": 4.699611950340717e-06, + "loss": 0.6289, + "step": 4143 + }, + { + "epoch": 0.9656672362901974, + "grad_norm": 0.3535788059234619, + "learning_rate": 4.699466387071542e-06, + "loss": 0.6402, + "step": 4144 + }, + { + "epoch": 0.9659002640981824, + "grad_norm": 0.35009515285491943, + "learning_rate": 4.699320790797515e-06, + "loss": 0.6382, + "step": 4145 + }, + { + "epoch": 0.9661332919061675, + "grad_norm": 0.35433080792427063, + "learning_rate": 4.69917516152082e-06, + "loss": 0.6488, + "step": 4146 + }, + { + "epoch": 0.9663663197141525, + "grad_norm": 0.34390050172805786, + "learning_rate": 4.699029499243642e-06, + "loss": 0.6524, + "step": 4147 + }, + { + "epoch": 0.9665993475221376, + "grad_norm": 0.3563655912876129, + "learning_rate": 4.698883803968167e-06, + "loss": 0.6342, + "step": 4148 + }, + { + "epoch": 0.9668323753301227, + "grad_norm": 0.35178545117378235, + "learning_rate": 4.698738075696583e-06, + "loss": 0.6447, + "step": 4149 + }, + { + "epoch": 0.9670654031381078, + "grad_norm": 0.35255277156829834, + "learning_rate": 4.698592314431074e-06, + "loss": 0.6361, + "step": 4150 + }, + { + "epoch": 0.9672984309460929, + "grad_norm": 0.35265469551086426, + "learning_rate": 4.698446520173828e-06, + "loss": 0.6389, + "step": 4151 + }, + { + "epoch": 0.967531458754078, + "grad_norm": 0.3523028790950775, + "learning_rate": 4.698300692927034e-06, + "loss": 0.6529, + "step": 4152 + }, + { + "epoch": 0.967764486562063, + "grad_norm": 0.3491208553314209, + "learning_rate": 4.69815483269288e-06, + "loss": 0.6363, + "step": 4153 + }, + { + "epoch": 0.9679975143700481, + "grad_norm": 0.36037132143974304, + "learning_rate": 4.698008939473554e-06, + "loss": 0.6421, + "step": 4154 + }, + { + "epoch": 0.9682305421780333, + "grad_norm": 0.3322261571884155, + "learning_rate": 4.697863013271246e-06, + "loss": 0.6295, + "step": 4155 + }, + { + "epoch": 0.9684635699860183, + "grad_norm": 0.34726381301879883, + "learning_rate": 4.697717054088145e-06, + "loss": 0.6525, + "step": 4156 + }, + { + "epoch": 0.9686965977940034, + "grad_norm": 0.38901031017303467, + "learning_rate": 4.697571061926441e-06, + "loss": 0.5619, + "step": 4157 + }, + { + "epoch": 0.9689296256019885, + "grad_norm": 0.3436329662799835, + "learning_rate": 4.697425036788326e-06, + "loss": 0.6239, + "step": 4158 + }, + { + "epoch": 0.9691626534099735, + "grad_norm": 0.35217103362083435, + "learning_rate": 4.69727897867599e-06, + "loss": 0.617, + "step": 4159 + }, + { + "epoch": 0.9693956812179587, + "grad_norm": 0.3780081272125244, + "learning_rate": 4.697132887591626e-06, + "loss": 0.606, + "step": 4160 + }, + { + "epoch": 0.9696287090259438, + "grad_norm": 0.33276063203811646, + "learning_rate": 4.696986763537425e-06, + "loss": 0.621, + "step": 4161 + }, + { + "epoch": 0.9698617368339288, + "grad_norm": 0.35217997431755066, + "learning_rate": 4.69684060651558e-06, + "loss": 0.6123, + "step": 4162 + }, + { + "epoch": 0.9700947646419139, + "grad_norm": 0.3560169041156769, + "learning_rate": 4.696694416528285e-06, + "loss": 0.6343, + "step": 4163 + }, + { + "epoch": 0.970327792449899, + "grad_norm": 0.3605276048183441, + "learning_rate": 4.696548193577732e-06, + "loss": 0.6174, + "step": 4164 + }, + { + "epoch": 0.9705608202578841, + "grad_norm": 0.35538920760154724, + "learning_rate": 4.696401937666117e-06, + "loss": 0.6377, + "step": 4165 + }, + { + "epoch": 0.9707938480658692, + "grad_norm": 0.3460730016231537, + "learning_rate": 4.6962556487956344e-06, + "loss": 0.6347, + "step": 4166 + }, + { + "epoch": 0.9710268758738543, + "grad_norm": 0.3481353521347046, + "learning_rate": 4.696109326968479e-06, + "loss": 0.649, + "step": 4167 + }, + { + "epoch": 0.9712599036818393, + "grad_norm": 0.3317468464374542, + "learning_rate": 4.695962972186846e-06, + "loss": 0.6404, + "step": 4168 + }, + { + "epoch": 0.9714929314898244, + "grad_norm": 0.3346211314201355, + "learning_rate": 4.695816584452931e-06, + "loss": 0.6158, + "step": 4169 + }, + { + "epoch": 0.9717259592978096, + "grad_norm": 0.33837655186653137, + "learning_rate": 4.695670163768933e-06, + "loss": 0.6537, + "step": 4170 + }, + { + "epoch": 0.9719589871057946, + "grad_norm": 0.34390321373939514, + "learning_rate": 4.695523710137047e-06, + "loss": 0.6504, + "step": 4171 + }, + { + "epoch": 0.9721920149137797, + "grad_norm": 0.38088130950927734, + "learning_rate": 4.695377223559473e-06, + "loss": 0.6385, + "step": 4172 + }, + { + "epoch": 0.9724250427217648, + "grad_norm": 0.34042176604270935, + "learning_rate": 4.695230704038407e-06, + "loss": 0.6555, + "step": 4173 + }, + { + "epoch": 0.9726580705297498, + "grad_norm": 0.34779244661331177, + "learning_rate": 4.695084151576048e-06, + "loss": 0.6428, + "step": 4174 + }, + { + "epoch": 0.972891098337735, + "grad_norm": 0.3373328447341919, + "learning_rate": 4.694937566174596e-06, + "loss": 0.6357, + "step": 4175 + }, + { + "epoch": 0.9731241261457201, + "grad_norm": 0.3511413335800171, + "learning_rate": 4.69479094783625e-06, + "loss": 0.634, + "step": 4176 + }, + { + "epoch": 0.9733571539537051, + "grad_norm": 0.35043513774871826, + "learning_rate": 4.69464429656321e-06, + "loss": 0.6693, + "step": 4177 + }, + { + "epoch": 0.9735901817616902, + "grad_norm": 0.3519209623336792, + "learning_rate": 4.6944976123576765e-06, + "loss": 0.6385, + "step": 4178 + }, + { + "epoch": 0.9738232095696753, + "grad_norm": 0.33685749769210815, + "learning_rate": 4.694350895221852e-06, + "loss": 0.6234, + "step": 4179 + }, + { + "epoch": 0.9740562373776604, + "grad_norm": 0.34730666875839233, + "learning_rate": 4.694204145157937e-06, + "loss": 0.6646, + "step": 4180 + }, + { + "epoch": 0.9742892651856455, + "grad_norm": 0.34268319606781006, + "learning_rate": 4.694057362168133e-06, + "loss": 0.6287, + "step": 4181 + }, + { + "epoch": 0.9745222929936306, + "grad_norm": 0.3469334840774536, + "learning_rate": 4.693910546254644e-06, + "loss": 0.612, + "step": 4182 + }, + { + "epoch": 0.9747553208016156, + "grad_norm": 0.330902636051178, + "learning_rate": 4.693763697419672e-06, + "loss": 0.6036, + "step": 4183 + }, + { + "epoch": 0.9749883486096007, + "grad_norm": 0.3341575860977173, + "learning_rate": 4.693616815665421e-06, + "loss": 0.6256, + "step": 4184 + }, + { + "epoch": 0.9752213764175859, + "grad_norm": 0.34232351183891296, + "learning_rate": 4.693469900994095e-06, + "loss": 0.6299, + "step": 4185 + }, + { + "epoch": 0.975454404225571, + "grad_norm": 0.3517410457134247, + "learning_rate": 4.693322953407899e-06, + "loss": 0.629, + "step": 4186 + }, + { + "epoch": 0.975687432033556, + "grad_norm": 0.3420118987560272, + "learning_rate": 4.693175972909037e-06, + "loss": 0.6392, + "step": 4187 + }, + { + "epoch": 0.9759204598415411, + "grad_norm": 0.3478136360645294, + "learning_rate": 4.693028959499717e-06, + "loss": 0.6503, + "step": 4188 + }, + { + "epoch": 0.9761534876495261, + "grad_norm": 0.3653351366519928, + "learning_rate": 4.692881913182142e-06, + "loss": 0.6491, + "step": 4189 + }, + { + "epoch": 0.9763865154575113, + "grad_norm": 0.3805861175060272, + "learning_rate": 4.692734833958521e-06, + "loss": 0.6358, + "step": 4190 + }, + { + "epoch": 0.9766195432654964, + "grad_norm": 0.3610598146915436, + "learning_rate": 4.692587721831059e-06, + "loss": 0.6451, + "step": 4191 + }, + { + "epoch": 0.9768525710734814, + "grad_norm": 0.34295403957366943, + "learning_rate": 4.692440576801964e-06, + "loss": 0.6162, + "step": 4192 + }, + { + "epoch": 0.9770855988814665, + "grad_norm": 0.3399611711502075, + "learning_rate": 4.692293398873447e-06, + "loss": 0.6171, + "step": 4193 + }, + { + "epoch": 0.9773186266894516, + "grad_norm": 0.33708441257476807, + "learning_rate": 4.6921461880477114e-06, + "loss": 0.6388, + "step": 4194 + }, + { + "epoch": 0.9775516544974366, + "grad_norm": 0.35223227739334106, + "learning_rate": 4.6919989443269705e-06, + "loss": 0.6224, + "step": 4195 + }, + { + "epoch": 0.9777846823054218, + "grad_norm": 0.3487841784954071, + "learning_rate": 4.691851667713432e-06, + "loss": 0.6388, + "step": 4196 + }, + { + "epoch": 0.9780177101134069, + "grad_norm": 0.3607700765132904, + "learning_rate": 4.6917043582093055e-06, + "loss": 0.6444, + "step": 4197 + }, + { + "epoch": 0.9782507379213919, + "grad_norm": 0.3615007698535919, + "learning_rate": 4.691557015816802e-06, + "loss": 0.6688, + "step": 4198 + }, + { + "epoch": 0.978483765729377, + "grad_norm": 0.3561670482158661, + "learning_rate": 4.6914096405381335e-06, + "loss": 0.6243, + "step": 4199 + }, + { + "epoch": 0.9787167935373621, + "grad_norm": 0.36228471994400024, + "learning_rate": 4.6912622323755105e-06, + "loss": 0.6414, + "step": 4200 + }, + { + "epoch": 0.9789498213453472, + "grad_norm": 0.35121461749076843, + "learning_rate": 4.691114791331145e-06, + "loss": 0.6323, + "step": 4201 + }, + { + "epoch": 0.9791828491533323, + "grad_norm": 0.3521095812320709, + "learning_rate": 4.690967317407249e-06, + "loss": 0.6311, + "step": 4202 + }, + { + "epoch": 0.9794158769613174, + "grad_norm": 0.36342135071754456, + "learning_rate": 4.6908198106060365e-06, + "loss": 0.6371, + "step": 4203 + }, + { + "epoch": 0.9796489047693024, + "grad_norm": 0.35885167121887207, + "learning_rate": 4.6906722709297206e-06, + "loss": 0.6303, + "step": 4204 + }, + { + "epoch": 0.9798819325772875, + "grad_norm": 0.3604649007320404, + "learning_rate": 4.690524698380515e-06, + "loss": 0.617, + "step": 4205 + }, + { + "epoch": 0.9801149603852727, + "grad_norm": 0.3410576283931732, + "learning_rate": 4.690377092960635e-06, + "loss": 0.6264, + "step": 4206 + }, + { + "epoch": 0.9803479881932577, + "grad_norm": 0.346603661775589, + "learning_rate": 4.690229454672295e-06, + "loss": 0.6335, + "step": 4207 + }, + { + "epoch": 0.9805810160012428, + "grad_norm": 0.3362673223018646, + "learning_rate": 4.690081783517709e-06, + "loss": 0.6338, + "step": 4208 + }, + { + "epoch": 0.9808140438092279, + "grad_norm": 0.3472369611263275, + "learning_rate": 4.689934079499096e-06, + "loss": 0.6487, + "step": 4209 + }, + { + "epoch": 0.9810470716172129, + "grad_norm": 0.341979056596756, + "learning_rate": 4.689786342618669e-06, + "loss": 0.6233, + "step": 4210 + }, + { + "epoch": 0.9812800994251981, + "grad_norm": 0.3421529233455658, + "learning_rate": 4.689638572878647e-06, + "loss": 0.6135, + "step": 4211 + }, + { + "epoch": 0.9815131272331832, + "grad_norm": 0.3419298827648163, + "learning_rate": 4.689490770281248e-06, + "loss": 0.6449, + "step": 4212 + }, + { + "epoch": 0.9817461550411682, + "grad_norm": 0.34866249561309814, + "learning_rate": 4.689342934828687e-06, + "loss": 0.6589, + "step": 4213 + }, + { + "epoch": 0.9819791828491533, + "grad_norm": 0.36052942276000977, + "learning_rate": 4.689195066523186e-06, + "loss": 0.6471, + "step": 4214 + }, + { + "epoch": 0.9822122106571384, + "grad_norm": 0.3542186915874481, + "learning_rate": 4.689047165366961e-06, + "loss": 0.6463, + "step": 4215 + }, + { + "epoch": 0.9824452384651235, + "grad_norm": 0.34356263279914856, + "learning_rate": 4.688899231362234e-06, + "loss": 0.6164, + "step": 4216 + }, + { + "epoch": 0.9826782662731086, + "grad_norm": 0.33817258477211, + "learning_rate": 4.688751264511222e-06, + "loss": 0.6191, + "step": 4217 + }, + { + "epoch": 0.9829112940810937, + "grad_norm": 0.3424202501773834, + "learning_rate": 4.688603264816148e-06, + "loss": 0.6138, + "step": 4218 + }, + { + "epoch": 0.9831443218890787, + "grad_norm": 0.3457900583744049, + "learning_rate": 4.688455232279231e-06, + "loss": 0.6712, + "step": 4219 + }, + { + "epoch": 0.9833773496970638, + "grad_norm": 0.35190868377685547, + "learning_rate": 4.688307166902693e-06, + "loss": 0.6168, + "step": 4220 + }, + { + "epoch": 0.983610377505049, + "grad_norm": 0.34234094619750977, + "learning_rate": 4.688159068688756e-06, + "loss": 0.6402, + "step": 4221 + }, + { + "epoch": 0.983843405313034, + "grad_norm": 0.347786009311676, + "learning_rate": 4.688010937639642e-06, + "loss": 0.6653, + "step": 4222 + }, + { + "epoch": 0.9840764331210191, + "grad_norm": 0.34836938977241516, + "learning_rate": 4.687862773757574e-06, + "loss": 0.6573, + "step": 4223 + }, + { + "epoch": 0.9843094609290042, + "grad_norm": 0.34940803050994873, + "learning_rate": 4.6877145770447766e-06, + "loss": 0.6321, + "step": 4224 + }, + { + "epoch": 0.9845424887369892, + "grad_norm": 0.3335087299346924, + "learning_rate": 4.687566347503471e-06, + "loss": 0.6614, + "step": 4225 + }, + { + "epoch": 0.9847755165449744, + "grad_norm": 0.33274802565574646, + "learning_rate": 4.687418085135883e-06, + "loss": 0.6521, + "step": 4226 + }, + { + "epoch": 0.9850085443529595, + "grad_norm": 0.3553895056247711, + "learning_rate": 4.687269789944238e-06, + "loss": 0.6394, + "step": 4227 + }, + { + "epoch": 0.9852415721609445, + "grad_norm": 0.33954721689224243, + "learning_rate": 4.687121461930761e-06, + "loss": 0.625, + "step": 4228 + }, + { + "epoch": 0.9854745999689296, + "grad_norm": 0.32733067870140076, + "learning_rate": 4.686973101097676e-06, + "loss": 0.5965, + "step": 4229 + }, + { + "epoch": 0.9857076277769147, + "grad_norm": 0.3356587588787079, + "learning_rate": 4.686824707447211e-06, + "loss": 0.6219, + "step": 4230 + }, + { + "epoch": 0.9859406555848998, + "grad_norm": 0.34108981490135193, + "learning_rate": 4.6866762809815925e-06, + "loss": 0.6082, + "step": 4231 + }, + { + "epoch": 0.9861736833928849, + "grad_norm": 0.3382529318332672, + "learning_rate": 4.686527821703048e-06, + "loss": 0.632, + "step": 4232 + }, + { + "epoch": 0.98640671120087, + "grad_norm": 0.3665127456188202, + "learning_rate": 4.6863793296138046e-06, + "loss": 0.6526, + "step": 4233 + }, + { + "epoch": 0.986639739008855, + "grad_norm": 0.35475605726242065, + "learning_rate": 4.686230804716092e-06, + "loss": 0.6157, + "step": 4234 + }, + { + "epoch": 0.9868727668168401, + "grad_norm": 0.3404821753501892, + "learning_rate": 4.686082247012137e-06, + "loss": 0.641, + "step": 4235 + }, + { + "epoch": 0.9871057946248253, + "grad_norm": 0.36233729124069214, + "learning_rate": 4.685933656504169e-06, + "loss": 0.6599, + "step": 4236 + }, + { + "epoch": 0.9873388224328103, + "grad_norm": 0.33920085430145264, + "learning_rate": 4.685785033194419e-06, + "loss": 0.6166, + "step": 4237 + }, + { + "epoch": 0.9875718502407954, + "grad_norm": 0.3503301441669464, + "learning_rate": 4.685636377085117e-06, + "loss": 0.6445, + "step": 4238 + }, + { + "epoch": 0.9878048780487805, + "grad_norm": 0.3675065338611603, + "learning_rate": 4.685487688178492e-06, + "loss": 0.6466, + "step": 4239 + }, + { + "epoch": 0.9880379058567655, + "grad_norm": 0.348318874835968, + "learning_rate": 4.685338966476779e-06, + "loss": 0.6434, + "step": 4240 + }, + { + "epoch": 0.9882709336647506, + "grad_norm": 0.3550128638744354, + "learning_rate": 4.685190211982206e-06, + "loss": 0.6534, + "step": 4241 + }, + { + "epoch": 0.9885039614727358, + "grad_norm": 0.34767258167266846, + "learning_rate": 4.685041424697005e-06, + "loss": 0.6368, + "step": 4242 + }, + { + "epoch": 0.9887369892807208, + "grad_norm": 0.35387736558914185, + "learning_rate": 4.6848926046234125e-06, + "loss": 0.649, + "step": 4243 + }, + { + "epoch": 0.9889700170887059, + "grad_norm": 0.3393484652042389, + "learning_rate": 4.684743751763657e-06, + "loss": 0.6435, + "step": 4244 + }, + { + "epoch": 0.989203044896691, + "grad_norm": 0.3452407717704773, + "learning_rate": 4.684594866119976e-06, + "loss": 0.625, + "step": 4245 + }, + { + "epoch": 0.989436072704676, + "grad_norm": 0.34635162353515625, + "learning_rate": 4.684445947694601e-06, + "loss": 0.6306, + "step": 4246 + }, + { + "epoch": 0.9896691005126612, + "grad_norm": 0.3605519235134125, + "learning_rate": 4.684296996489769e-06, + "loss": 0.6556, + "step": 4247 + }, + { + "epoch": 0.9899021283206463, + "grad_norm": 0.3307499885559082, + "learning_rate": 4.684148012507713e-06, + "loss": 0.6204, + "step": 4248 + }, + { + "epoch": 0.9901351561286313, + "grad_norm": 0.3384837508201599, + "learning_rate": 4.68399899575067e-06, + "loss": 0.6277, + "step": 4249 + }, + { + "epoch": 0.9903681839366164, + "grad_norm": 0.3591001033782959, + "learning_rate": 4.6838499462208745e-06, + "loss": 0.6545, + "step": 4250 + }, + { + "epoch": 0.9906012117446015, + "grad_norm": 0.3386015295982361, + "learning_rate": 4.6837008639205656e-06, + "loss": 0.6379, + "step": 4251 + }, + { + "epoch": 0.9908342395525866, + "grad_norm": 0.3350871205329895, + "learning_rate": 4.683551748851978e-06, + "loss": 0.6366, + "step": 4252 + }, + { + "epoch": 0.9910672673605717, + "grad_norm": 0.3523252308368683, + "learning_rate": 4.683402601017351e-06, + "loss": 0.6401, + "step": 4253 + }, + { + "epoch": 0.9913002951685568, + "grad_norm": 0.3424891531467438, + "learning_rate": 4.683253420418922e-06, + "loss": 0.6186, + "step": 4254 + }, + { + "epoch": 0.9915333229765418, + "grad_norm": 0.33324313163757324, + "learning_rate": 4.68310420705893e-06, + "loss": 0.6268, + "step": 4255 + }, + { + "epoch": 0.9917663507845269, + "grad_norm": 0.3392321765422821, + "learning_rate": 4.682954960939613e-06, + "loss": 0.6325, + "step": 4256 + }, + { + "epoch": 0.9919993785925121, + "grad_norm": 0.3441314101219177, + "learning_rate": 4.68280568206321e-06, + "loss": 0.6418, + "step": 4257 + }, + { + "epoch": 0.9922324064004971, + "grad_norm": 0.3471652567386627, + "learning_rate": 4.682656370431965e-06, + "loss": 0.621, + "step": 4258 + }, + { + "epoch": 0.9924654342084822, + "grad_norm": 0.3431786298751831, + "learning_rate": 4.682507026048114e-06, + "loss": 0.6302, + "step": 4259 + }, + { + "epoch": 0.9926984620164673, + "grad_norm": 0.3489800989627838, + "learning_rate": 4.6823576489139e-06, + "loss": 0.6403, + "step": 4260 + }, + { + "epoch": 0.9929314898244523, + "grad_norm": 0.34170499444007874, + "learning_rate": 4.682208239031566e-06, + "loss": 0.6477, + "step": 4261 + }, + { + "epoch": 0.9931645176324375, + "grad_norm": 0.3614504635334015, + "learning_rate": 4.682058796403351e-06, + "loss": 0.6536, + "step": 4262 + }, + { + "epoch": 0.9933975454404226, + "grad_norm": 0.34629854559898376, + "learning_rate": 4.681909321031499e-06, + "loss": 0.6231, + "step": 4263 + }, + { + "epoch": 0.9936305732484076, + "grad_norm": 0.3496578633785248, + "learning_rate": 4.681759812918253e-06, + "loss": 0.6482, + "step": 4264 + }, + { + "epoch": 0.9938636010563927, + "grad_norm": 0.35924050211906433, + "learning_rate": 4.681610272065857e-06, + "loss": 0.6412, + "step": 4265 + }, + { + "epoch": 0.9940966288643778, + "grad_norm": 0.33870628476142883, + "learning_rate": 4.681460698476555e-06, + "loss": 0.6283, + "step": 4266 + }, + { + "epoch": 0.994329656672363, + "grad_norm": 0.3420523703098297, + "learning_rate": 4.681311092152589e-06, + "loss": 0.6289, + "step": 4267 + }, + { + "epoch": 0.994562684480348, + "grad_norm": 0.35048535466194153, + "learning_rate": 4.681161453096207e-06, + "loss": 0.6298, + "step": 4268 + }, + { + "epoch": 0.9947957122883331, + "grad_norm": 0.35372352600097656, + "learning_rate": 4.681011781309653e-06, + "loss": 0.6334, + "step": 4269 + }, + { + "epoch": 0.9950287400963181, + "grad_norm": 0.34511396288871765, + "learning_rate": 4.680862076795174e-06, + "loss": 0.6213, + "step": 4270 + }, + { + "epoch": 0.9952617679043032, + "grad_norm": 0.35686177015304565, + "learning_rate": 4.680712339555015e-06, + "loss": 0.616, + "step": 4271 + }, + { + "epoch": 0.9954947957122884, + "grad_norm": 0.33306941390037537, + "learning_rate": 4.680562569591424e-06, + "loss": 0.6342, + "step": 4272 + }, + { + "epoch": 0.9957278235202734, + "grad_norm": 0.3564325273036957, + "learning_rate": 4.680412766906648e-06, + "loss": 0.6262, + "step": 4273 + }, + { + "epoch": 0.9959608513282585, + "grad_norm": 0.3555409908294678, + "learning_rate": 4.680262931502935e-06, + "loss": 0.6263, + "step": 4274 + }, + { + "epoch": 0.9961938791362436, + "grad_norm": 0.3479321599006653, + "learning_rate": 4.680113063382534e-06, + "loss": 0.6303, + "step": 4275 + }, + { + "epoch": 0.9964269069442286, + "grad_norm": 0.3541272282600403, + "learning_rate": 4.6799631625476924e-06, + "loss": 0.62, + "step": 4276 + }, + { + "epoch": 0.9966599347522138, + "grad_norm": 0.3635050654411316, + "learning_rate": 4.679813229000662e-06, + "loss": 0.6223, + "step": 4277 + }, + { + "epoch": 0.9968929625601989, + "grad_norm": 0.33839425444602966, + "learning_rate": 4.679663262743689e-06, + "loss": 0.6172, + "step": 4278 + }, + { + "epoch": 0.997125990368184, + "grad_norm": 0.3511579930782318, + "learning_rate": 4.679513263779027e-06, + "loss": 0.6301, + "step": 4279 + }, + { + "epoch": 0.997359018176169, + "grad_norm": 0.3462288975715637, + "learning_rate": 4.679363232108927e-06, + "loss": 0.6399, + "step": 4280 + }, + { + "epoch": 0.9975920459841541, + "grad_norm": 0.34921541810035706, + "learning_rate": 4.679213167735638e-06, + "loss": 0.6325, + "step": 4281 + }, + { + "epoch": 0.9978250737921392, + "grad_norm": 0.36695635318756104, + "learning_rate": 4.679063070661412e-06, + "loss": 0.6731, + "step": 4282 + }, + { + "epoch": 0.9980581016001243, + "grad_norm": 0.3593139946460724, + "learning_rate": 4.678912940888504e-06, + "loss": 0.6545, + "step": 4283 + }, + { + "epoch": 0.9982911294081094, + "grad_norm": 0.34419116377830505, + "learning_rate": 4.678762778419165e-06, + "loss": 0.6052, + "step": 4284 + }, + { + "epoch": 0.9985241572160944, + "grad_norm": 0.32752856612205505, + "learning_rate": 4.678612583255648e-06, + "loss": 0.6189, + "step": 4285 + }, + { + "epoch": 0.9987571850240795, + "grad_norm": 0.3528313934803009, + "learning_rate": 4.678462355400207e-06, + "loss": 0.6503, + "step": 4286 + }, + { + "epoch": 0.9989902128320646, + "grad_norm": 0.34713804721832275, + "learning_rate": 4.678312094855097e-06, + "loss": 0.6351, + "step": 4287 + }, + { + "epoch": 0.9992232406400497, + "grad_norm": 0.3487328588962555, + "learning_rate": 4.678161801622574e-06, + "loss": 0.665, + "step": 4288 + }, + { + "epoch": 0.9994562684480348, + "grad_norm": 0.3459736704826355, + "learning_rate": 4.678011475704889e-06, + "loss": 0.6243, + "step": 4289 + }, + { + "epoch": 0.9996892962560199, + "grad_norm": 0.3487725555896759, + "learning_rate": 4.677861117104302e-06, + "loss": 0.6206, + "step": 4290 + }, + { + "epoch": 0.9999223240640049, + "grad_norm": 0.33903583884239197, + "learning_rate": 4.677710725823067e-06, + "loss": 0.6177, + "step": 4291 + } + ], + "logging_steps": 1, + "max_steps": 25746, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 4291, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.4031134651428897e+19, + "train_batch_size": 10, + "trial_name": null, + "trial_params": null +}