diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3886 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1098, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00546448087431694, + "grad_norm": 1.573991060256958, + "learning_rate": 4.545454545454545e-07, + "loss": 2.660639762878418, + "step": 2 + }, + { + "epoch": 0.01092896174863388, + "grad_norm": 0.6165518760681152, + "learning_rate": 1.3636363636363636e-06, + "loss": 1.9370158910751343, + "step": 4 + }, + { + "epoch": 0.01639344262295082, + "grad_norm": 0.5032806992530823, + "learning_rate": 2.2727272727272728e-06, + "loss": 1.883617877960205, + "step": 6 + }, + { + "epoch": 0.02185792349726776, + "grad_norm": 0.5399006605148315, + "learning_rate": 3.1818181818181817e-06, + "loss": 1.7472549676895142, + "step": 8 + }, + { + "epoch": 0.0273224043715847, + "grad_norm": 0.4401944577693939, + "learning_rate": 4.0909090909090915e-06, + "loss": 1.7204209566116333, + "step": 10 + }, + { + "epoch": 0.03278688524590164, + "grad_norm": 0.4825930595397949, + "learning_rate": 5e-06, + "loss": 1.56802499294281, + "step": 12 + }, + { + "epoch": 0.03825136612021858, + "grad_norm": 0.3861444294452667, + "learning_rate": 5.909090909090909e-06, + "loss": 1.478776216506958, + "step": 14 + }, + { + "epoch": 0.04371584699453552, + "grad_norm": 0.29860642552375793, + "learning_rate": 6.818181818181818e-06, + "loss": 1.215264916419983, + "step": 16 + }, + { + "epoch": 0.04918032786885246, + "grad_norm": 0.3623839318752289, + "learning_rate": 7.727272727272727e-06, + "loss": 1.1711786985397339, + "step": 18 + }, + { + "epoch": 0.0546448087431694, + "grad_norm": 0.3934680223464966, + "learning_rate": 8.636363636363637e-06, + "loss": 1.4052964448928833, + "step": 20 + }, + { + "epoch": 0.060109289617486336, + "grad_norm": 0.3248288929462433, + "learning_rate": 9.545454545454547e-06, + "loss": 1.337301254272461, + "step": 22 + }, + { + "epoch": 0.06557377049180328, + "grad_norm": 0.41455528140068054, + "learning_rate": 1.0454545454545455e-05, + "loss": 1.4091546535491943, + "step": 24 + }, + { + "epoch": 0.07103825136612021, + "grad_norm": 0.7718392610549927, + "learning_rate": 1.1363636363636365e-05, + "loss": 1.608378529548645, + "step": 26 + }, + { + "epoch": 0.07650273224043716, + "grad_norm": 0.3031645715236664, + "learning_rate": 1.2272727272727273e-05, + "loss": 1.3645198345184326, + "step": 28 + }, + { + "epoch": 0.08196721311475409, + "grad_norm": 0.3317704200744629, + "learning_rate": 1.318181818181818e-05, + "loss": 1.2910865545272827, + "step": 30 + }, + { + "epoch": 0.08743169398907104, + "grad_norm": 0.3077561855316162, + "learning_rate": 1.409090909090909e-05, + "loss": 1.2850395441055298, + "step": 32 + }, + { + "epoch": 0.09289617486338798, + "grad_norm": 0.6236030459403992, + "learning_rate": 1.5e-05, + "loss": 1.011098861694336, + "step": 34 + }, + { + "epoch": 0.09836065573770492, + "grad_norm": 0.3117682635784149, + "learning_rate": 1.590909090909091e-05, + "loss": 1.1125837564468384, + "step": 36 + }, + { + "epoch": 0.10382513661202186, + "grad_norm": 0.3884047269821167, + "learning_rate": 1.6818181818181818e-05, + "loss": 1.3625156879425049, + "step": 38 + }, + { + "epoch": 0.1092896174863388, + "grad_norm": 0.21100102365016937, + "learning_rate": 1.772727272727273e-05, + "loss": 1.3091150522232056, + "step": 40 + }, + { + "epoch": 0.11475409836065574, + "grad_norm": 0.7283101677894592, + "learning_rate": 1.8636363636363638e-05, + "loss": 0.9641504883766174, + "step": 42 + }, + { + "epoch": 0.12021857923497267, + "grad_norm": 0.27201205492019653, + "learning_rate": 1.9545454545454546e-05, + "loss": 0.8841361403465271, + "step": 44 + }, + { + "epoch": 0.12568306010928962, + "grad_norm": 0.2947981059551239, + "learning_rate": 2.0454545454545457e-05, + "loss": 1.3351988792419434, + "step": 46 + }, + { + "epoch": 0.13114754098360656, + "grad_norm": 0.25678518414497375, + "learning_rate": 2.1363636363636362e-05, + "loss": 1.305159568786621, + "step": 48 + }, + { + "epoch": 0.1366120218579235, + "grad_norm": 0.3346308767795563, + "learning_rate": 2.2272727272727274e-05, + "loss": 1.2757641077041626, + "step": 50 + }, + { + "epoch": 0.14207650273224043, + "grad_norm": 0.39588257670402527, + "learning_rate": 2.318181818181818e-05, + "loss": 1.2430144548416138, + "step": 52 + }, + { + "epoch": 0.14754098360655737, + "grad_norm": 1.0091103315353394, + "learning_rate": 2.4090909090909093e-05, + "loss": 0.886474609375, + "step": 54 + }, + { + "epoch": 0.15300546448087432, + "grad_norm": 0.38303902745246887, + "learning_rate": 2.5e-05, + "loss": 1.45668363571167, + "step": 56 + }, + { + "epoch": 0.15846994535519127, + "grad_norm": 0.5787215828895569, + "learning_rate": 2.4999795867415925e-05, + "loss": 1.4211903810501099, + "step": 58 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 0.33087924122810364, + "learning_rate": 2.499918347707172e-05, + "loss": 1.3224607706069946, + "step": 60 + }, + { + "epoch": 0.16939890710382513, + "grad_norm": 0.3203301727771759, + "learning_rate": 2.499816285119117e-05, + "loss": 1.5149167776107788, + "step": 62 + }, + { + "epoch": 0.17486338797814208, + "grad_norm": 0.8560717701911926, + "learning_rate": 2.499673402681304e-05, + "loss": 1.1875461339950562, + "step": 64 + }, + { + "epoch": 0.18032786885245902, + "grad_norm": 0.779531717300415, + "learning_rate": 2.49948970557897e-05, + "loss": 1.3079862594604492, + "step": 66 + }, + { + "epoch": 0.18579234972677597, + "grad_norm": 0.3134159743785858, + "learning_rate": 2.499265200478526e-05, + "loss": 1.2827211618423462, + "step": 68 + }, + { + "epoch": 0.1912568306010929, + "grad_norm": 0.5278278589248657, + "learning_rate": 2.498999895527316e-05, + "loss": 1.3154817819595337, + "step": 70 + }, + { + "epoch": 0.19672131147540983, + "grad_norm": 0.34715163707733154, + "learning_rate": 2.4986938003533195e-05, + "loss": 1.1282734870910645, + "step": 72 + }, + { + "epoch": 0.20218579234972678, + "grad_norm": 0.2873562276363373, + "learning_rate": 2.498346926064803e-05, + "loss": 1.2482017278671265, + "step": 74 + }, + { + "epoch": 0.20765027322404372, + "grad_norm": 0.40799790620803833, + "learning_rate": 2.497959285249916e-05, + "loss": 1.3365082740783691, + "step": 76 + }, + { + "epoch": 0.21311475409836064, + "grad_norm": 2.886547565460205, + "learning_rate": 2.497530891976237e-05, + "loss": 0.8918291926383972, + "step": 78 + }, + { + "epoch": 0.2185792349726776, + "grad_norm": 0.35245203971862793, + "learning_rate": 2.497061761790258e-05, + "loss": 0.6639876961708069, + "step": 80 + }, + { + "epoch": 0.22404371584699453, + "grad_norm": 0.48221555352211, + "learning_rate": 2.4965519117168257e-05, + "loss": 1.4380748271942139, + "step": 82 + }, + { + "epoch": 0.22950819672131148, + "grad_norm": 0.51203852891922, + "learning_rate": 2.49600136025852e-05, + "loss": 0.7898932695388794, + "step": 84 + }, + { + "epoch": 0.23497267759562843, + "grad_norm": 0.27620550990104675, + "learning_rate": 2.495410127394983e-05, + "loss": 1.3038958311080933, + "step": 86 + }, + { + "epoch": 0.24043715846994534, + "grad_norm": 0.3855900764465332, + "learning_rate": 2.494778234582197e-05, + "loss": 1.272139072418213, + "step": 88 + }, + { + "epoch": 0.2459016393442623, + "grad_norm": 2.9909884929656982, + "learning_rate": 2.494105704751701e-05, + "loss": 0.8982883095741272, + "step": 90 + }, + { + "epoch": 0.25136612021857924, + "grad_norm": 0.8854891061782837, + "learning_rate": 2.4933925623097626e-05, + "loss": 1.3485952615737915, + "step": 92 + }, + { + "epoch": 0.2568306010928962, + "grad_norm": 0.3434160053730011, + "learning_rate": 2.49263883313649e-05, + "loss": 1.1437045335769653, + "step": 94 + }, + { + "epoch": 0.26229508196721313, + "grad_norm": 0.5682878494262695, + "learning_rate": 2.4918445445848933e-05, + "loss": 1.5587941408157349, + "step": 96 + }, + { + "epoch": 0.2677595628415301, + "grad_norm": 0.2681918740272522, + "learning_rate": 2.491009725479893e-05, + "loss": 0.9907023906707764, + "step": 98 + }, + { + "epoch": 0.273224043715847, + "grad_norm": 0.32420745491981506, + "learning_rate": 2.4901344061172718e-05, + "loss": 1.0972590446472168, + "step": 100 + }, + { + "epoch": 0.2786885245901639, + "grad_norm": 0.3802216053009033, + "learning_rate": 2.489218618262578e-05, + "loss": 1.2862308025360107, + "step": 102 + }, + { + "epoch": 0.28415300546448086, + "grad_norm": 0.9653081297874451, + "learning_rate": 2.4882623951499687e-05, + "loss": 1.0500785112380981, + "step": 104 + }, + { + "epoch": 0.2896174863387978, + "grad_norm": 0.2368626743555069, + "learning_rate": 2.4872657714810083e-05, + "loss": 1.2645362615585327, + "step": 106 + }, + { + "epoch": 0.29508196721311475, + "grad_norm": 0.29273951053619385, + "learning_rate": 2.4862287834234058e-05, + "loss": 1.4147664308547974, + "step": 108 + }, + { + "epoch": 0.3005464480874317, + "grad_norm": 0.3903352618217468, + "learning_rate": 2.4851514686097042e-05, + "loss": 1.2646139860153198, + "step": 110 + }, + { + "epoch": 0.30601092896174864, + "grad_norm": 0.25432300567626953, + "learning_rate": 2.4840338661359128e-05, + "loss": 1.2783875465393066, + "step": 112 + }, + { + "epoch": 0.3114754098360656, + "grad_norm": 0.6913565397262573, + "learning_rate": 2.4828760165600914e-05, + "loss": 1.4046285152435303, + "step": 114 + }, + { + "epoch": 0.31693989071038253, + "grad_norm": 0.7609861493110657, + "learning_rate": 2.481677961900875e-05, + "loss": 1.2646381855010986, + "step": 116 + }, + { + "epoch": 0.3224043715846995, + "grad_norm": 0.3743007481098175, + "learning_rate": 2.4804397456359513e-05, + "loss": 1.031684160232544, + "step": 118 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 0.79277104139328, + "learning_rate": 2.4791614127004825e-05, + "loss": 1.2975627183914185, + "step": 120 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.8501270413398743, + "learning_rate": 2.477843009485474e-05, + "loss": 1.358191728591919, + "step": 122 + }, + { + "epoch": 0.33879781420765026, + "grad_norm": 0.3265473246574402, + "learning_rate": 2.4764845838360912e-05, + "loss": 1.051877498626709, + "step": 124 + }, + { + "epoch": 0.3442622950819672, + "grad_norm": 0.35509005188941956, + "learning_rate": 2.475086185049923e-05, + "loss": 1.0525556802749634, + "step": 126 + }, + { + "epoch": 0.34972677595628415, + "grad_norm": 0.3058815896511078, + "learning_rate": 2.473647863875193e-05, + "loss": 1.325781226158142, + "step": 128 + }, + { + "epoch": 0.3551912568306011, + "grad_norm": 0.3684760332107544, + "learning_rate": 2.472169672508918e-05, + "loss": 1.2639888525009155, + "step": 130 + }, + { + "epoch": 0.36065573770491804, + "grad_norm": 0.31321296095848083, + "learning_rate": 2.4706516645950126e-05, + "loss": 1.2196840047836304, + "step": 132 + }, + { + "epoch": 0.366120218579235, + "grad_norm": 0.6234083771705627, + "learning_rate": 2.4690938952223446e-05, + "loss": 1.122013807296753, + "step": 134 + }, + { + "epoch": 0.37158469945355194, + "grad_norm": 0.5376752614974976, + "learning_rate": 2.4674964209227338e-05, + "loss": 1.0269572734832764, + "step": 136 + }, + { + "epoch": 0.3770491803278688, + "grad_norm": 0.3545861542224884, + "learning_rate": 2.4658592996689018e-05, + "loss": 0.9930480718612671, + "step": 138 + }, + { + "epoch": 0.3825136612021858, + "grad_norm": 0.2623893618583679, + "learning_rate": 2.464182590872366e-05, + "loss": 0.6596786379814148, + "step": 140 + }, + { + "epoch": 0.3879781420765027, + "grad_norm": 0.2950592339038849, + "learning_rate": 2.4624663553812876e-05, + "loss": 1.2530113458633423, + "step": 142 + }, + { + "epoch": 0.39344262295081966, + "grad_norm": 0.36303240060806274, + "learning_rate": 2.4607106554782592e-05, + "loss": 1.278784155845642, + "step": 144 + }, + { + "epoch": 0.3989071038251366, + "grad_norm": 0.28947484493255615, + "learning_rate": 2.4589155548780468e-05, + "loss": 1.2354241609573364, + "step": 146 + }, + { + "epoch": 0.40437158469945356, + "grad_norm": 0.32500797510147095, + "learning_rate": 2.4570811187252763e-05, + "loss": 1.230383038520813, + "step": 148 + }, + { + "epoch": 0.4098360655737705, + "grad_norm": 0.27029216289520264, + "learning_rate": 2.4552074135920718e-05, + "loss": 1.1657707691192627, + "step": 150 + }, + { + "epoch": 0.41530054644808745, + "grad_norm": 0.7339510321617126, + "learning_rate": 2.4532945074756356e-05, + "loss": 1.271276831626892, + "step": 152 + }, + { + "epoch": 0.4207650273224044, + "grad_norm": 0.29380178451538086, + "learning_rate": 2.4513424697957853e-05, + "loss": 1.2509087324142456, + "step": 154 + }, + { + "epoch": 0.4262295081967213, + "grad_norm": 0.3937924802303314, + "learning_rate": 2.4493513713924312e-05, + "loss": 1.2946242094039917, + "step": 156 + }, + { + "epoch": 0.43169398907103823, + "grad_norm": 0.4068252146244049, + "learning_rate": 2.4473212845230066e-05, + "loss": 1.028969168663025, + "step": 158 + }, + { + "epoch": 0.4371584699453552, + "grad_norm": 0.2621578276157379, + "learning_rate": 2.4452522828598448e-05, + "loss": 1.2205129861831665, + "step": 160 + }, + { + "epoch": 0.4426229508196721, + "grad_norm": 0.4484550654888153, + "learning_rate": 2.4431444414875076e-05, + "loss": 0.7462490797042847, + "step": 162 + }, + { + "epoch": 0.44808743169398907, + "grad_norm": 0.3683895170688629, + "learning_rate": 2.440997836900058e-05, + "loss": 1.6338239908218384, + "step": 164 + }, + { + "epoch": 0.453551912568306, + "grad_norm": 3.4501283168792725, + "learning_rate": 2.438812546998286e-05, + "loss": 0.7732384204864502, + "step": 166 + }, + { + "epoch": 0.45901639344262296, + "grad_norm": 0.5550245046615601, + "learning_rate": 2.4365886510868798e-05, + "loss": 1.1818565130233765, + "step": 168 + }, + { + "epoch": 0.4644808743169399, + "grad_norm": 0.39989838004112244, + "learning_rate": 2.4343262298715504e-05, + "loss": 1.2723463773727417, + "step": 170 + }, + { + "epoch": 0.46994535519125685, + "grad_norm": 0.49861252307891846, + "learning_rate": 2.4320253654560986e-05, + "loss": 1.3423465490341187, + "step": 172 + }, + { + "epoch": 0.47540983606557374, + "grad_norm": 0.4895351231098175, + "learning_rate": 2.429686141339441e-05, + "loss": 1.4451054334640503, + "step": 174 + }, + { + "epoch": 0.4808743169398907, + "grad_norm": 0.27367469668388367, + "learning_rate": 2.427308642412576e-05, + "loss": 1.2423328161239624, + "step": 176 + }, + { + "epoch": 0.48633879781420764, + "grad_norm": 0.6150447726249695, + "learning_rate": 2.4248929549555024e-05, + "loss": 1.208847165107727, + "step": 178 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 0.21229158341884613, + "learning_rate": 2.422439166634091e-05, + "loss": 1.2032520771026611, + "step": 180 + }, + { + "epoch": 0.4972677595628415, + "grad_norm": 0.38285210728645325, + "learning_rate": 2.4199473664969033e-05, + "loss": 1.1992058753967285, + "step": 182 + }, + { + "epoch": 0.5027322404371585, + "grad_norm": 1.3906806707382202, + "learning_rate": 2.4174176449719557e-05, + "loss": 1.3618383407592773, + "step": 184 + }, + { + "epoch": 0.5081967213114754, + "grad_norm": 1.3885375261306763, + "learning_rate": 2.4148500938634432e-05, + "loss": 1.2250099182128906, + "step": 186 + }, + { + "epoch": 0.5136612021857924, + "grad_norm": 0.5974783897399902, + "learning_rate": 2.412244806348404e-05, + "loss": 1.208520770072937, + "step": 188 + }, + { + "epoch": 0.5191256830601093, + "grad_norm": 0.2837812900543213, + "learning_rate": 2.4096018769733402e-05, + "loss": 1.256629467010498, + "step": 190 + }, + { + "epoch": 0.5245901639344263, + "grad_norm": 0.5171183943748474, + "learning_rate": 2.4069214016507857e-05, + "loss": 1.2277578115463257, + "step": 192 + }, + { + "epoch": 0.5300546448087432, + "grad_norm": 0.5090808868408203, + "learning_rate": 2.4042034776558252e-05, + "loss": 1.2597812414169312, + "step": 194 + }, + { + "epoch": 0.5355191256830601, + "grad_norm": 0.277024507522583, + "learning_rate": 2.4014482036225657e-05, + "loss": 1.3096250295639038, + "step": 196 + }, + { + "epoch": 0.5409836065573771, + "grad_norm": 0.28094005584716797, + "learning_rate": 2.398655679540555e-05, + "loss": 1.1250698566436768, + "step": 198 + }, + { + "epoch": 0.546448087431694, + "grad_norm": 0.49337470531463623, + "learning_rate": 2.3958260067511546e-05, + "loss": 1.2817038297653198, + "step": 200 + }, + { + "epoch": 0.5519125683060109, + "grad_norm": 0.2434162199497223, + "learning_rate": 2.392959287943861e-05, + "loss": 1.095908761024475, + "step": 202 + }, + { + "epoch": 0.5573770491803278, + "grad_norm": 0.8104755282402039, + "learning_rate": 2.390055627152579e-05, + "loss": 0.9623884558677673, + "step": 204 + }, + { + "epoch": 0.5628415300546448, + "grad_norm": 0.2845552861690521, + "learning_rate": 2.3871151297518478e-05, + "loss": 1.2727272510528564, + "step": 206 + }, + { + "epoch": 0.5683060109289617, + "grad_norm": 0.30428346991539, + "learning_rate": 2.384137902453016e-05, + "loss": 1.2006559371948242, + "step": 208 + }, + { + "epoch": 0.5737704918032787, + "grad_norm": 0.5667380690574646, + "learning_rate": 2.381124053300368e-05, + "loss": 1.2453666925430298, + "step": 210 + }, + { + "epoch": 0.5792349726775956, + "grad_norm": 0.2638285458087921, + "learning_rate": 2.378073691667204e-05, + "loss": 1.5382798910140991, + "step": 212 + }, + { + "epoch": 0.5846994535519126, + "grad_norm": 0.2258531153202057, + "learning_rate": 2.3749869282518718e-05, + "loss": 1.2467749118804932, + "step": 214 + }, + { + "epoch": 0.5901639344262295, + "grad_norm": 0.4023035168647766, + "learning_rate": 2.3718638750737472e-05, + "loss": 1.6013109683990479, + "step": 216 + }, + { + "epoch": 0.5956284153005464, + "grad_norm": 0.2880134582519531, + "learning_rate": 2.3687046454691712e-05, + "loss": 1.2340466976165771, + "step": 218 + }, + { + "epoch": 0.6010928961748634, + "grad_norm": 0.21783682703971863, + "learning_rate": 2.3655093540873353e-05, + "loss": 1.3127397298812866, + "step": 220 + }, + { + "epoch": 0.6065573770491803, + "grad_norm": 0.23105667531490326, + "learning_rate": 2.3622781168861214e-05, + "loss": 1.1359162330627441, + "step": 222 + }, + { + "epoch": 0.6120218579234973, + "grad_norm": 1.055073857307434, + "learning_rate": 2.3590110511278933e-05, + "loss": 1.0895743370056152, + "step": 224 + }, + { + "epoch": 0.6174863387978142, + "grad_norm": 0.22074124217033386, + "learning_rate": 2.3557082753752437e-05, + "loss": 1.2820183038711548, + "step": 226 + }, + { + "epoch": 0.6229508196721312, + "grad_norm": 2.2928848266601562, + "learning_rate": 2.352369909486687e-05, + "loss": 1.2790377140045166, + "step": 228 + }, + { + "epoch": 0.6284153005464481, + "grad_norm": 0.1916349232196808, + "learning_rate": 2.3489960746123128e-05, + "loss": 0.7690463662147522, + "step": 230 + }, + { + "epoch": 0.6338797814207651, + "grad_norm": 0.24763409793376923, + "learning_rate": 2.3455868931893894e-05, + "loss": 1.2869762182235718, + "step": 232 + }, + { + "epoch": 0.639344262295082, + "grad_norm": 1.6581783294677734, + "learning_rate": 2.342142488937919e-05, + "loss": 0.9537703990936279, + "step": 234 + }, + { + "epoch": 0.644808743169399, + "grad_norm": 0.7167282104492188, + "learning_rate": 2.3386629868561492e-05, + "loss": 0.891618549823761, + "step": 236 + }, + { + "epoch": 0.6502732240437158, + "grad_norm": 0.4610043168067932, + "learning_rate": 2.3351485132160346e-05, + "loss": 1.3490716218948364, + "step": 238 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 0.44225654006004333, + "learning_rate": 2.331599195558659e-05, + "loss": 1.2553770542144775, + "step": 240 + }, + { + "epoch": 0.6612021857923497, + "grad_norm": 0.5151908993721008, + "learning_rate": 2.3280151626896003e-05, + "loss": 1.2040798664093018, + "step": 242 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.2373773753643036, + "learning_rate": 2.324396544674262e-05, + "loss": 1.0659421682357788, + "step": 244 + }, + { + "epoch": 0.6721311475409836, + "grad_norm": 0.2626818120479584, + "learning_rate": 2.3207434728331502e-05, + "loss": 1.2204513549804688, + "step": 246 + }, + { + "epoch": 0.6775956284153005, + "grad_norm": 0.29700711369514465, + "learning_rate": 2.317056079737108e-05, + "loss": 1.2991636991500854, + "step": 248 + }, + { + "epoch": 0.6830601092896175, + "grad_norm": 0.3610213100910187, + "learning_rate": 2.3133344992025057e-05, + "loss": 1.2486889362335205, + "step": 250 + }, + { + "epoch": 0.6885245901639344, + "grad_norm": 0.2309371531009674, + "learning_rate": 2.3095788662863838e-05, + "loss": 1.1427106857299805, + "step": 252 + }, + { + "epoch": 0.6939890710382514, + "grad_norm": 0.1426355540752411, + "learning_rate": 2.305789317281551e-05, + "loss": 1.2058444023132324, + "step": 254 + }, + { + "epoch": 0.6994535519125683, + "grad_norm": 0.19725173711776733, + "learning_rate": 2.30196598971164e-05, + "loss": 1.1971604824066162, + "step": 256 + }, + { + "epoch": 0.7049180327868853, + "grad_norm": 0.26061874628067017, + "learning_rate": 2.2981090223261146e-05, + "loss": 1.1320452690124512, + "step": 258 + }, + { + "epoch": 0.7103825136612022, + "grad_norm": 0.1859847754240036, + "learning_rate": 2.294218555095236e-05, + "loss": 1.1934643983840942, + "step": 260 + }, + { + "epoch": 0.7158469945355191, + "grad_norm": 0.21825049817562103, + "learning_rate": 2.2902947292049827e-05, + "loss": 1.2934924364089966, + "step": 262 + }, + { + "epoch": 0.7213114754098361, + "grad_norm": 0.14852340519428253, + "learning_rate": 2.2863376870519273e-05, + "loss": 1.2320027351379395, + "step": 264 + }, + { + "epoch": 0.726775956284153, + "grad_norm": 0.22215086221694946, + "learning_rate": 2.282347572238068e-05, + "loss": 1.0896164178848267, + "step": 266 + }, + { + "epoch": 0.73224043715847, + "grad_norm": 0.2927921414375305, + "learning_rate": 2.278324529565617e-05, + "loss": 1.2266433238983154, + "step": 268 + }, + { + "epoch": 0.7377049180327869, + "grad_norm": 0.19128933548927307, + "learning_rate": 2.274268705031748e-05, + "loss": 1.2257647514343262, + "step": 270 + }, + { + "epoch": 0.7431693989071039, + "grad_norm": 0.18742051720619202, + "learning_rate": 2.2701802458232945e-05, + "loss": 1.1814370155334473, + "step": 272 + }, + { + "epoch": 0.7486338797814208, + "grad_norm": 0.3515142500400543, + "learning_rate": 2.266059300311411e-05, + "loss": 1.204115867614746, + "step": 274 + }, + { + "epoch": 0.7540983606557377, + "grad_norm": 0.19052213430404663, + "learning_rate": 2.2619060180461873e-05, + "loss": 1.253179907798767, + "step": 276 + }, + { + "epoch": 0.7595628415300546, + "grad_norm": 0.21446406841278076, + "learning_rate": 2.2577205497512227e-05, + "loss": 1.221806526184082, + "step": 278 + }, + { + "epoch": 0.7650273224043715, + "grad_norm": 0.4379148483276367, + "learning_rate": 2.253503047318153e-05, + "loss": 0.8980993628501892, + "step": 280 + }, + { + "epoch": 0.7704918032786885, + "grad_norm": 0.6354475021362305, + "learning_rate": 2.249253663801143e-05, + "loss": 1.2315393686294556, + "step": 282 + }, + { + "epoch": 0.7759562841530054, + "grad_norm": 0.16127297282218933, + "learning_rate": 2.244972553411328e-05, + "loss": 1.2106963396072388, + "step": 284 + }, + { + "epoch": 0.7814207650273224, + "grad_norm": 0.16545647382736206, + "learning_rate": 2.2406598715112188e-05, + "loss": 1.2198878526687622, + "step": 286 + }, + { + "epoch": 0.7868852459016393, + "grad_norm": 0.17703773081302643, + "learning_rate": 2.2363157746090647e-05, + "loss": 1.1894257068634033, + "step": 288 + }, + { + "epoch": 0.7923497267759563, + "grad_norm": 0.18015217781066895, + "learning_rate": 2.231940420353173e-05, + "loss": 1.162126064300537, + "step": 290 + }, + { + "epoch": 0.7978142076502732, + "grad_norm": 0.971755862236023, + "learning_rate": 2.2275339675261865e-05, + "loss": 1.1871088743209839, + "step": 292 + }, + { + "epoch": 0.8032786885245902, + "grad_norm": 0.2369270771741867, + "learning_rate": 2.2230965760393242e-05, + "loss": 1.187207818031311, + "step": 294 + }, + { + "epoch": 0.8087431693989071, + "grad_norm": 0.521456778049469, + "learning_rate": 2.218628406926575e-05, + "loss": 1.5364131927490234, + "step": 296 + }, + { + "epoch": 0.8142076502732241, + "grad_norm": 0.5664535164833069, + "learning_rate": 2.214129622338856e-05, + "loss": 0.925621509552002, + "step": 298 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 0.24125301837921143, + "learning_rate": 2.2096003855381268e-05, + "loss": 1.6008453369140625, + "step": 300 + }, + { + "epoch": 0.825136612021858, + "grad_norm": 0.25689688324928284, + "learning_rate": 2.2050408608914652e-05, + "loss": 1.223463773727417, + "step": 302 + }, + { + "epoch": 0.8306010928961749, + "grad_norm": 0.4135361909866333, + "learning_rate": 2.200451213865102e-05, + "loss": 0.9032531380653381, + "step": 304 + }, + { + "epoch": 0.8360655737704918, + "grad_norm": 0.4481862783432007, + "learning_rate": 2.195831611018416e-05, + "loss": 1.557176113128662, + "step": 306 + }, + { + "epoch": 0.8415300546448088, + "grad_norm": 0.598118245601654, + "learning_rate": 2.1911822199978905e-05, + "loss": 1.0001596212387085, + "step": 308 + }, + { + "epoch": 0.8469945355191257, + "grad_norm": 0.1991797536611557, + "learning_rate": 2.186503209531028e-05, + "loss": 1.2186784744262695, + "step": 310 + }, + { + "epoch": 0.8524590163934426, + "grad_norm": 0.23143993318080902, + "learning_rate": 2.1817947494202283e-05, + "loss": 1.2285571098327637, + "step": 312 + }, + { + "epoch": 0.8579234972677595, + "grad_norm": 0.2992916405200958, + "learning_rate": 2.1770570105366253e-05, + "loss": 1.2401832342147827, + "step": 314 + }, + { + "epoch": 0.8633879781420765, + "grad_norm": 0.3111695945262909, + "learning_rate": 2.1722901648138872e-05, + "loss": 1.2467424869537354, + "step": 316 + }, + { + "epoch": 0.8688524590163934, + "grad_norm": 0.29361769556999207, + "learning_rate": 2.1674943852419754e-05, + "loss": 1.1437691450119019, + "step": 318 + }, + { + "epoch": 0.8743169398907104, + "grad_norm": 0.2712153196334839, + "learning_rate": 2.1626698458608678e-05, + "loss": 1.3648308515548706, + "step": 320 + }, + { + "epoch": 0.8797814207650273, + "grad_norm": 0.36124688386917114, + "learning_rate": 2.1578167217542434e-05, + "loss": 1.098549485206604, + "step": 322 + }, + { + "epoch": 0.8852459016393442, + "grad_norm": 0.38840731978416443, + "learning_rate": 2.1529351890431267e-05, + "loss": 0.7575273513793945, + "step": 324 + }, + { + "epoch": 0.8907103825136612, + "grad_norm": 0.26621684432029724, + "learning_rate": 2.1480254248794972e-05, + "loss": 1.25328528881073, + "step": 326 + }, + { + "epoch": 0.8961748633879781, + "grad_norm": 0.3533453643321991, + "learning_rate": 2.1430876074398613e-05, + "loss": 1.0998711585998535, + "step": 328 + }, + { + "epoch": 0.9016393442622951, + "grad_norm": 0.2502342164516449, + "learning_rate": 2.138121915918785e-05, + "loss": 1.2683922052383423, + "step": 330 + }, + { + "epoch": 0.907103825136612, + "grad_norm": 1.4214491844177246, + "learning_rate": 2.133128530522391e-05, + "loss": 1.303632140159607, + "step": 332 + }, + { + "epoch": 0.912568306010929, + "grad_norm": 0.4628271162509918, + "learning_rate": 2.1281076324618197e-05, + "loss": 1.1815130710601807, + "step": 334 + }, + { + "epoch": 0.9180327868852459, + "grad_norm": 0.29728227853775024, + "learning_rate": 2.123059403946653e-05, + "loss": 1.2302957773208618, + "step": 336 + }, + { + "epoch": 0.9234972677595629, + "grad_norm": 0.26023662090301514, + "learning_rate": 2.1179840281783002e-05, + "loss": 1.2583401203155518, + "step": 338 + }, + { + "epoch": 0.9289617486338798, + "grad_norm": 0.20131556689739227, + "learning_rate": 2.1128816893433524e-05, + "loss": 1.2529937028884888, + "step": 340 + }, + { + "epoch": 0.9344262295081968, + "grad_norm": 0.2651193141937256, + "learning_rate": 2.107752572606895e-05, + "loss": 1.2728800773620605, + "step": 342 + }, + { + "epoch": 0.9398907103825137, + "grad_norm": 1.3046387434005737, + "learning_rate": 2.1025968641057917e-05, + "loss": 1.2606099843978882, + "step": 344 + }, + { + "epoch": 0.9453551912568307, + "grad_norm": 0.23248107731342316, + "learning_rate": 2.097414750941927e-05, + "loss": 1.6241867542266846, + "step": 346 + }, + { + "epoch": 0.9508196721311475, + "grad_norm": 0.457072913646698, + "learning_rate": 2.092206421175416e-05, + "loss": 1.0590146780014038, + "step": 348 + }, + { + "epoch": 0.9562841530054644, + "grad_norm": 0.19466765224933624, + "learning_rate": 2.0869720638177817e-05, + "loss": 0.9096658825874329, + "step": 350 + }, + { + "epoch": 0.9617486338797814, + "grad_norm": 0.2271135449409485, + "learning_rate": 2.0817118688250933e-05, + "loss": 1.2334853410720825, + "step": 352 + }, + { + "epoch": 0.9672131147540983, + "grad_norm": 0.2870445251464844, + "learning_rate": 2.0764260270910753e-05, + "loss": 0.9327258467674255, + "step": 354 + }, + { + "epoch": 0.9726775956284153, + "grad_norm": 0.3647417724132538, + "learning_rate": 2.0711147304401775e-05, + "loss": 1.1043840646743774, + "step": 356 + }, + { + "epoch": 0.9781420765027322, + "grad_norm": 0.2910922169685364, + "learning_rate": 2.0657781716206156e-05, + "loss": 0.7837404608726501, + "step": 358 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 0.26856181025505066, + "learning_rate": 2.0604165442973738e-05, + "loss": 1.2202802896499634, + "step": 360 + }, + { + "epoch": 0.9890710382513661, + "grad_norm": 0.7940663695335388, + "learning_rate": 2.0550300430451805e-05, + "loss": 0.9926665425300598, + "step": 362 + }, + { + "epoch": 0.994535519125683, + "grad_norm": 0.3847670257091522, + "learning_rate": 2.049618863341443e-05, + "loss": 1.2039496898651123, + "step": 364 + }, + { + "epoch": 1.0, + "grad_norm": 0.4092676043510437, + "learning_rate": 2.0441832015591565e-05, + "loss": 1.1870145797729492, + "step": 366 + }, + { + "epoch": 1.005464480874317, + "grad_norm": 0.2678527235984802, + "learning_rate": 2.038723254959775e-05, + "loss": 1.052931785583496, + "step": 368 + }, + { + "epoch": 1.010928961748634, + "grad_norm": 0.3561493158340454, + "learning_rate": 2.033239221686057e-05, + "loss": 0.9613900780677795, + "step": 370 + }, + { + "epoch": 1.0163934426229508, + "grad_norm": 0.3432145416736603, + "learning_rate": 2.0277313007548702e-05, + "loss": 0.9641975164413452, + "step": 372 + }, + { + "epoch": 1.0218579234972678, + "grad_norm": 0.5496553778648376, + "learning_rate": 2.022199692049971e-05, + "loss": 0.9910390377044678, + "step": 374 + }, + { + "epoch": 1.0273224043715847, + "grad_norm": 1.1139615774154663, + "learning_rate": 2.0166445963147514e-05, + "loss": 0.8240501284599304, + "step": 376 + }, + { + "epoch": 1.0327868852459017, + "grad_norm": 0.38149014115333557, + "learning_rate": 2.0110662151449538e-05, + "loss": 0.9968246817588806, + "step": 378 + }, + { + "epoch": 1.0382513661202186, + "grad_norm": 0.41972488164901733, + "learning_rate": 2.0054647509813533e-05, + "loss": 0.705604612827301, + "step": 380 + }, + { + "epoch": 1.0437158469945356, + "grad_norm": 0.7368504405021667, + "learning_rate": 1.9998404071024132e-05, + "loss": 0.833368718624115, + "step": 382 + }, + { + "epoch": 1.0491803278688525, + "grad_norm": 0.2574571967124939, + "learning_rate": 1.9941933876169064e-05, + "loss": 0.9591267704963684, + "step": 384 + }, + { + "epoch": 1.0546448087431695, + "grad_norm": 0.6520720720291138, + "learning_rate": 1.9885238974565092e-05, + "loss": 0.7256029844284058, + "step": 386 + }, + { + "epoch": 1.0601092896174864, + "grad_norm": 0.43294209241867065, + "learning_rate": 1.982832142368365e-05, + "loss": 0.6323433518409729, + "step": 388 + }, + { + "epoch": 1.0655737704918034, + "grad_norm": 0.29659733176231384, + "learning_rate": 1.9771183289076148e-05, + "loss": 1.1004457473754883, + "step": 390 + }, + { + "epoch": 1.0710382513661203, + "grad_norm": 0.19339755177497864, + "learning_rate": 1.9713826644299056e-05, + "loss": 0.9760780334472656, + "step": 392 + }, + { + "epoch": 1.0765027322404372, + "grad_norm": 0.2182750552892685, + "learning_rate": 1.96562535708386e-05, + "loss": 0.6805109977722168, + "step": 394 + }, + { + "epoch": 1.0819672131147542, + "grad_norm": 0.26215946674346924, + "learning_rate": 1.959846615803528e-05, + "loss": 1.012329339981079, + "step": 396 + }, + { + "epoch": 1.0874316939890711, + "grad_norm": 0.220313161611557, + "learning_rate": 1.954046650300801e-05, + "loss": 0.9814585447311401, + "step": 398 + }, + { + "epoch": 1.092896174863388, + "grad_norm": 0.339661568403244, + "learning_rate": 1.9482256710578028e-05, + "loss": 1.0675204992294312, + "step": 400 + }, + { + "epoch": 1.098360655737705, + "grad_norm": 0.5715194344520569, + "learning_rate": 1.94238388931925e-05, + "loss": 1.002265214920044, + "step": 402 + }, + { + "epoch": 1.1038251366120218, + "grad_norm": 0.326412171125412, + "learning_rate": 1.9365215170847887e-05, + "loss": 1.1493186950683594, + "step": 404 + }, + { + "epoch": 1.1092896174863387, + "grad_norm": 0.781463086605072, + "learning_rate": 1.9306387671012965e-05, + "loss": 0.9932349920272827, + "step": 406 + }, + { + "epoch": 1.1147540983606556, + "grad_norm": 0.7584837675094604, + "learning_rate": 1.9247358528551662e-05, + "loss": 0.6940405368804932, + "step": 408 + }, + { + "epoch": 1.1202185792349726, + "grad_norm": 0.2473216950893402, + "learning_rate": 1.918812988564555e-05, + "loss": 1.1692583560943604, + "step": 410 + }, + { + "epoch": 1.1256830601092895, + "grad_norm": 0.4508194923400879, + "learning_rate": 1.9128703891716138e-05, + "loss": 0.7730159759521484, + "step": 412 + }, + { + "epoch": 1.1311475409836065, + "grad_norm": 0.20886428654193878, + "learning_rate": 1.906908270334683e-05, + "loss": 0.8129177689552307, + "step": 414 + }, + { + "epoch": 1.1366120218579234, + "grad_norm": 1.0284196138381958, + "learning_rate": 1.90092684842047e-05, + "loss": 0.7486385703086853, + "step": 416 + }, + { + "epoch": 1.1420765027322404, + "grad_norm": 0.29529622197151184, + "learning_rate": 1.8949263404961936e-05, + "loss": 0.9710283279418945, + "step": 418 + }, + { + "epoch": 1.1475409836065573, + "grad_norm": 0.19345751404762268, + "learning_rate": 1.888906964321711e-05, + "loss": 0.9194430112838745, + "step": 420 + }, + { + "epoch": 1.1530054644808743, + "grad_norm": 0.1850510686635971, + "learning_rate": 1.88286893834161e-05, + "loss": 1.015109896659851, + "step": 422 + }, + { + "epoch": 1.1584699453551912, + "grad_norm": 1.9381911754608154, + "learning_rate": 1.8768124816772854e-05, + "loss": 0.7446098327636719, + "step": 424 + }, + { + "epoch": 1.1639344262295082, + "grad_norm": 0.8182835578918457, + "learning_rate": 1.870737814118987e-05, + "loss": 0.7042106986045837, + "step": 426 + }, + { + "epoch": 1.169398907103825, + "grad_norm": 0.3277008831501007, + "learning_rate": 1.86464515611784e-05, + "loss": 1.1486928462982178, + "step": 428 + }, + { + "epoch": 1.174863387978142, + "grad_norm": 0.23618929088115692, + "learning_rate": 1.8585347287778493e-05, + "loss": 0.9731719493865967, + "step": 430 + }, + { + "epoch": 1.180327868852459, + "grad_norm": 0.2742961645126343, + "learning_rate": 1.8524067538478713e-05, + "loss": 1.040266752243042, + "step": 432 + }, + { + "epoch": 1.185792349726776, + "grad_norm": 0.27158892154693604, + "learning_rate": 1.8462614537135708e-05, + "loss": 0.8597835898399353, + "step": 434 + }, + { + "epoch": 1.1912568306010929, + "grad_norm": 0.2809169292449951, + "learning_rate": 1.840099051389346e-05, + "loss": 1.0985636711120605, + "step": 436 + }, + { + "epoch": 1.1967213114754098, + "grad_norm": 0.27836382389068604, + "learning_rate": 1.8339197705102396e-05, + "loss": 0.9655686616897583, + "step": 438 + }, + { + "epoch": 1.2021857923497268, + "grad_norm": 0.2705060839653015, + "learning_rate": 1.8277238353238193e-05, + "loss": 0.8749003410339355, + "step": 440 + }, + { + "epoch": 1.2076502732240437, + "grad_norm": 0.40282824635505676, + "learning_rate": 1.8215114706820435e-05, + "loss": 0.9861440062522888, + "step": 442 + }, + { + "epoch": 1.2131147540983607, + "grad_norm": 0.28667566180229187, + "learning_rate": 1.8152829020330975e-05, + "loss": 0.9603111743927002, + "step": 444 + }, + { + "epoch": 1.2185792349726776, + "grad_norm": 0.20321297645568848, + "learning_rate": 1.8090383554132158e-05, + "loss": 0.6063272953033447, + "step": 446 + }, + { + "epoch": 1.2240437158469946, + "grad_norm": 0.2519320845603943, + "learning_rate": 1.8027780574384753e-05, + "loss": 1.0045597553253174, + "step": 448 + }, + { + "epoch": 1.2295081967213115, + "grad_norm": 0.2760215401649475, + "learning_rate": 1.796502235296575e-05, + "loss": 0.9750694036483765, + "step": 450 + }, + { + "epoch": 1.2349726775956285, + "grad_norm": 1.3805330991744995, + "learning_rate": 1.7902111167385883e-05, + "loss": 0.9757618308067322, + "step": 452 + }, + { + "epoch": 1.2404371584699454, + "grad_norm": 0.41420966386795044, + "learning_rate": 1.7839049300707003e-05, + "loss": 1.1883511543273926, + "step": 454 + }, + { + "epoch": 1.2459016393442623, + "grad_norm": 3.457296371459961, + "learning_rate": 1.7775839041459203e-05, + "loss": 0.8075215816497803, + "step": 456 + }, + { + "epoch": 1.2513661202185793, + "grad_norm": 0.41424238681793213, + "learning_rate": 1.7712482683557778e-05, + "loss": 0.7205969095230103, + "step": 458 + }, + { + "epoch": 1.2568306010928962, + "grad_norm": 0.23676064610481262, + "learning_rate": 1.7648982526219995e-05, + "loss": 0.9795306324958801, + "step": 460 + }, + { + "epoch": 1.2622950819672132, + "grad_norm": 0.5828472375869751, + "learning_rate": 1.7585340873881617e-05, + "loss": 0.7240006923675537, + "step": 462 + }, + { + "epoch": 1.2677595628415301, + "grad_norm": 0.41735684871673584, + "learning_rate": 1.7521560036113303e-05, + "loss": 0.9175815582275391, + "step": 464 + }, + { + "epoch": 1.273224043715847, + "grad_norm": 1.274187684059143, + "learning_rate": 1.7457642327536777e-05, + "loss": 1.003617763519287, + "step": 466 + }, + { + "epoch": 1.278688524590164, + "grad_norm": 0.19330422580242157, + "learning_rate": 1.7393590067740852e-05, + "loss": 0.9662733674049377, + "step": 468 + }, + { + "epoch": 1.2841530054644807, + "grad_norm": 0.4729224443435669, + "learning_rate": 1.732940558119722e-05, + "loss": 1.0254093408584595, + "step": 470 + }, + { + "epoch": 1.289617486338798, + "grad_norm": 0.2658669352531433, + "learning_rate": 1.726509119717612e-05, + "loss": 1.1315693855285645, + "step": 472 + }, + { + "epoch": 1.2950819672131146, + "grad_norm": 0.17870910465717316, + "learning_rate": 1.7200649249661806e-05, + "loss": 1.0432623624801636, + "step": 474 + }, + { + "epoch": 1.3005464480874318, + "grad_norm": 0.6331385970115662, + "learning_rate": 1.7136082077267837e-05, + "loss": 0.4694063067436218, + "step": 476 + }, + { + "epoch": 1.3060109289617485, + "grad_norm": 0.24067234992980957, + "learning_rate": 1.7071392023152208e-05, + "loss": 0.9866682291030884, + "step": 478 + }, + { + "epoch": 1.3114754098360657, + "grad_norm": 0.2517263889312744, + "learning_rate": 1.700658143493233e-05, + "loss": 1.1108297109603882, + "step": 480 + }, + { + "epoch": 1.3169398907103824, + "grad_norm": 0.15475492179393768, + "learning_rate": 1.6941652664599814e-05, + "loss": 0.8292356133460999, + "step": 482 + }, + { + "epoch": 1.3224043715846996, + "grad_norm": 0.412447988986969, + "learning_rate": 1.687660806843514e-05, + "loss": 0.6224348545074463, + "step": 484 + }, + { + "epoch": 1.3278688524590163, + "grad_norm": 0.20403409004211426, + "learning_rate": 1.681145000692213e-05, + "loss": 0.9716439843177795, + "step": 486 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.32221364974975586, + "learning_rate": 1.6746180844662286e-05, + "loss": 0.9632929563522339, + "step": 488 + }, + { + "epoch": 1.3387978142076502, + "grad_norm": 0.4680832028388977, + "learning_rate": 1.6680802950288988e-05, + "loss": 0.678726851940155, + "step": 490 + }, + { + "epoch": 1.3442622950819672, + "grad_norm": 0.24255001544952393, + "learning_rate": 1.661531869638153e-05, + "loss": 1.0519957542419434, + "step": 492 + }, + { + "epoch": 1.349726775956284, + "grad_norm": 0.14533624053001404, + "learning_rate": 1.6549730459379012e-05, + "loss": 0.7603930234909058, + "step": 494 + }, + { + "epoch": 1.355191256830601, + "grad_norm": 0.29373615980148315, + "learning_rate": 1.648404061949412e-05, + "loss": 1.014536738395691, + "step": 496 + }, + { + "epoch": 1.360655737704918, + "grad_norm": 0.8250491619110107, + "learning_rate": 1.641825156062671e-05, + "loss": 0.9662138223648071, + "step": 498 + }, + { + "epoch": 1.366120218579235, + "grad_norm": 0.2744878828525543, + "learning_rate": 1.635236567027734e-05, + "loss": 1.1449735164642334, + "step": 500 + }, + { + "epoch": 1.3715846994535519, + "grad_norm": 0.3034563362598419, + "learning_rate": 1.6286385339460596e-05, + "loss": 0.9222993850708008, + "step": 502 + }, + { + "epoch": 1.3770491803278688, + "grad_norm": 0.2692282795906067, + "learning_rate": 1.6220312962618332e-05, + "loss": 0.6134414076805115, + "step": 504 + }, + { + "epoch": 1.3825136612021858, + "grad_norm": 0.5227485299110413, + "learning_rate": 1.6154150937532775e-05, + "loss": 0.9774447679519653, + "step": 506 + }, + { + "epoch": 1.3879781420765027, + "grad_norm": 0.2803296148777008, + "learning_rate": 1.6087901665239502e-05, + "loss": 0.9835152626037598, + "step": 508 + }, + { + "epoch": 1.3934426229508197, + "grad_norm": 0.3903689980506897, + "learning_rate": 1.602156754994032e-05, + "loss": 1.0372285842895508, + "step": 510 + }, + { + "epoch": 1.3989071038251366, + "grad_norm": 0.18825502693653107, + "learning_rate": 1.5955150998916003e-05, + "loss": 0.9928014874458313, + "step": 512 + }, + { + "epoch": 1.4043715846994536, + "grad_norm": 0.1467607021331787, + "learning_rate": 1.5888654422438946e-05, + "loss": 0.793111264705658, + "step": 514 + }, + { + "epoch": 1.4098360655737705, + "grad_norm": 0.1626003533601761, + "learning_rate": 1.5822080233685678e-05, + "loss": 0.6302416324615479, + "step": 516 + }, + { + "epoch": 1.4153005464480874, + "grad_norm": 0.21208561956882477, + "learning_rate": 1.575543084864929e-05, + "loss": 1.056746006011963, + "step": 518 + }, + { + "epoch": 1.4207650273224044, + "grad_norm": 0.5272570252418518, + "learning_rate": 1.5688708686051784e-05, + "loss": 0.6418699026107788, + "step": 520 + }, + { + "epoch": 1.4262295081967213, + "grad_norm": 0.2039785534143448, + "learning_rate": 1.5621916167256255e-05, + "loss": 0.9852989315986633, + "step": 522 + }, + { + "epoch": 1.4316939890710383, + "grad_norm": 2.6706042289733887, + "learning_rate": 1.555505571617906e-05, + "loss": 0.786631166934967, + "step": 524 + }, + { + "epoch": 1.4371584699453552, + "grad_norm": 0.8656861186027527, + "learning_rate": 1.5488129759201822e-05, + "loss": 0.9886066913604736, + "step": 526 + }, + { + "epoch": 1.4426229508196722, + "grad_norm": 0.3004356920719147, + "learning_rate": 1.5421140725083388e-05, + "loss": 0.6762040257453918, + "step": 528 + }, + { + "epoch": 1.4480874316939891, + "grad_norm": 0.6082632541656494, + "learning_rate": 1.53540910448717e-05, + "loss": 0.7938519716262817, + "step": 530 + }, + { + "epoch": 1.453551912568306, + "grad_norm": 0.4222017824649811, + "learning_rate": 1.5286983151815553e-05, + "loss": 0.5967254042625427, + "step": 532 + }, + { + "epoch": 1.459016393442623, + "grad_norm": 0.20243442058563232, + "learning_rate": 1.52198194812763e-05, + "loss": 0.9870072603225708, + "step": 534 + }, + { + "epoch": 1.46448087431694, + "grad_norm": 0.21171127259731293, + "learning_rate": 1.515260247063948e-05, + "loss": 0.9821056127548218, + "step": 536 + }, + { + "epoch": 1.469945355191257, + "grad_norm": 0.19740687310695648, + "learning_rate": 1.5085334559226342e-05, + "loss": 0.9811551570892334, + "step": 538 + }, + { + "epoch": 1.4754098360655736, + "grad_norm": 0.1451021134853363, + "learning_rate": 1.5018018188205354e-05, + "loss": 1.0408740043640137, + "step": 540 + }, + { + "epoch": 1.4808743169398908, + "grad_norm": 0.2504618763923645, + "learning_rate": 1.4950655800503577e-05, + "loss": 0.967536985874176, + "step": 542 + }, + { + "epoch": 1.4863387978142075, + "grad_norm": 0.2872159481048584, + "learning_rate": 1.4883249840718045e-05, + "loss": 0.8747035264968872, + "step": 544 + }, + { + "epoch": 1.4918032786885247, + "grad_norm": 0.7941830158233643, + "learning_rate": 1.4815802755027019e-05, + "loss": 1.1690365076065063, + "step": 546 + }, + { + "epoch": 1.4972677595628414, + "grad_norm": 0.818136990070343, + "learning_rate": 1.474831699110124e-05, + "loss": 0.5261512994766235, + "step": 548 + }, + { + "epoch": 1.5027322404371586, + "grad_norm": 0.25710561871528625, + "learning_rate": 1.4680794998015081e-05, + "loss": 0.9896012544631958, + "step": 550 + }, + { + "epoch": 1.5081967213114753, + "grad_norm": 0.2098042219877243, + "learning_rate": 1.4613239226157693e-05, + "loss": 0.9484928250312805, + "step": 552 + }, + { + "epoch": 1.5136612021857925, + "grad_norm": 0.21850721538066864, + "learning_rate": 1.4545652127144052e-05, + "loss": 0.9997845888137817, + "step": 554 + }, + { + "epoch": 1.5191256830601092, + "grad_norm": 0.2467595338821411, + "learning_rate": 1.4478036153726013e-05, + "loss": 0.9743095636367798, + "step": 556 + }, + { + "epoch": 1.5245901639344264, + "grad_norm": 0.23051407933235168, + "learning_rate": 1.4410393759703286e-05, + "loss": 0.7760718464851379, + "step": 558 + }, + { + "epoch": 1.530054644808743, + "grad_norm": 0.29818111658096313, + "learning_rate": 1.4342727399834396e-05, + "loss": 0.6354138851165771, + "step": 560 + }, + { + "epoch": 1.5355191256830603, + "grad_norm": 0.5161439180374146, + "learning_rate": 1.4275039529747592e-05, + "loss": 0.893290638923645, + "step": 562 + }, + { + "epoch": 1.540983606557377, + "grad_norm": 0.8079479336738586, + "learning_rate": 1.4207332605851725e-05, + "loss": 0.7297835946083069, + "step": 564 + }, + { + "epoch": 1.5464480874316942, + "grad_norm": 0.19712623953819275, + "learning_rate": 1.413960908524713e-05, + "loss": 0.9552266597747803, + "step": 566 + }, + { + "epoch": 1.5519125683060109, + "grad_norm": 0.35008788108825684, + "learning_rate": 1.407187142563643e-05, + "loss": 0.7161392569541931, + "step": 568 + }, + { + "epoch": 1.5573770491803278, + "grad_norm": 0.30580294132232666, + "learning_rate": 1.4004122085235349e-05, + "loss": 1.1108880043029785, + "step": 570 + }, + { + "epoch": 1.5628415300546448, + "grad_norm": 0.28407377004623413, + "learning_rate": 1.3936363522683532e-05, + "loss": 0.9953396916389465, + "step": 572 + }, + { + "epoch": 1.5683060109289617, + "grad_norm": 0.3069007396697998, + "learning_rate": 1.3868598196955273e-05, + "loss": 0.9776367545127869, + "step": 574 + }, + { + "epoch": 1.5737704918032787, + "grad_norm": 0.22995546460151672, + "learning_rate": 1.3800828567270318e-05, + "loss": 0.7785615921020508, + "step": 576 + }, + { + "epoch": 1.5792349726775956, + "grad_norm": 0.33251991868019104, + "learning_rate": 1.3733057093004605e-05, + "loss": 1.0668448209762573, + "step": 578 + }, + { + "epoch": 1.5846994535519126, + "grad_norm": 0.46238404512405396, + "learning_rate": 1.3665286233601002e-05, + "loss": 0.577606201171875, + "step": 580 + }, + { + "epoch": 1.5901639344262295, + "grad_norm": 0.29064756631851196, + "learning_rate": 1.3597518448480075e-05, + "loss": 0.6106272339820862, + "step": 582 + }, + { + "epoch": 1.5956284153005464, + "grad_norm": 0.39371079206466675, + "learning_rate": 1.3529756196950815e-05, + "loss": 0.741959810256958, + "step": 584 + }, + { + "epoch": 1.6010928961748634, + "grad_norm": 0.19503989815711975, + "learning_rate": 1.3462001938121405e-05, + "loss": 1.024315595626831, + "step": 586 + }, + { + "epoch": 1.6065573770491803, + "grad_norm": 0.222434401512146, + "learning_rate": 1.3394258130809961e-05, + "loss": 1.0157414674758911, + "step": 588 + }, + { + "epoch": 1.6120218579234973, + "grad_norm": 0.2086457908153534, + "learning_rate": 1.332652723345533e-05, + "loss": 0.7172755002975464, + "step": 590 + }, + { + "epoch": 1.6174863387978142, + "grad_norm": 0.25153470039367676, + "learning_rate": 1.3258811704027823e-05, + "loss": 0.9605967998504639, + "step": 592 + }, + { + "epoch": 1.6229508196721312, + "grad_norm": 1.1914056539535522, + "learning_rate": 1.3191113999940072e-05, + "loss": 0.7792953252792358, + "step": 594 + }, + { + "epoch": 1.6284153005464481, + "grad_norm": 0.3395598828792572, + "learning_rate": 1.3123436577957814e-05, + "loss": 0.39568331837654114, + "step": 596 + }, + { + "epoch": 1.633879781420765, + "grad_norm": 0.21833889186382294, + "learning_rate": 1.305578189411075e-05, + "loss": 0.997237503528595, + "step": 598 + }, + { + "epoch": 1.639344262295082, + "grad_norm": 0.4959961771965027, + "learning_rate": 1.2988152403603387e-05, + "loss": 1.0844365358352661, + "step": 600 + }, + { + "epoch": 1.644808743169399, + "grad_norm": 0.17567996680736542, + "learning_rate": 1.2920550560725985e-05, + "loss": 1.0519412755966187, + "step": 602 + }, + { + "epoch": 1.650273224043716, + "grad_norm": 0.2537671625614166, + "learning_rate": 1.2852978818765457e-05, + "loss": 1.0641611814498901, + "step": 604 + }, + { + "epoch": 1.6557377049180326, + "grad_norm": 0.272575318813324, + "learning_rate": 1.2785439629916346e-05, + "loss": 1.0360894203186035, + "step": 606 + }, + { + "epoch": 1.6612021857923498, + "grad_norm": 0.22028231620788574, + "learning_rate": 1.2717935445191823e-05, + "loss": 0.8390608429908752, + "step": 608 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.2603016793727875, + "learning_rate": 1.2650468714334764e-05, + "loss": 1.0364290475845337, + "step": 610 + }, + { + "epoch": 1.6721311475409837, + "grad_norm": 0.9713470339775085, + "learning_rate": 1.2583041885728841e-05, + "loss": 0.6826381087303162, + "step": 612 + }, + { + "epoch": 1.6775956284153004, + "grad_norm": 0.23464682698249817, + "learning_rate": 1.2515657406309653e-05, + "loss": 0.9781528115272522, + "step": 614 + }, + { + "epoch": 1.6830601092896176, + "grad_norm": 0.2752858102321625, + "learning_rate": 1.2448317721475938e-05, + "loss": 1.009163737297058, + "step": 616 + }, + { + "epoch": 1.6885245901639343, + "grad_norm": 0.19153419137001038, + "learning_rate": 1.2381025275000826e-05, + "loss": 0.5738469362258911, + "step": 618 + }, + { + "epoch": 1.6939890710382515, + "grad_norm": 0.2596787214279175, + "learning_rate": 1.2313782508943165e-05, + "loss": 0.9725096225738525, + "step": 620 + }, + { + "epoch": 1.6994535519125682, + "grad_norm": 0.36354807019233704, + "learning_rate": 1.224659186355888e-05, + "loss": 1.1703816652297974, + "step": 622 + }, + { + "epoch": 1.7049180327868854, + "grad_norm": 0.2569591701030731, + "learning_rate": 1.2179455777212426e-05, + "loss": 0.6586411595344543, + "step": 624 + }, + { + "epoch": 1.710382513661202, + "grad_norm": 0.14563089609146118, + "learning_rate": 1.2112376686288302e-05, + "loss": 0.5979112386703491, + "step": 626 + }, + { + "epoch": 1.7158469945355193, + "grad_norm": 0.17862805724143982, + "learning_rate": 1.204535702510263e-05, + "loss": 1.0485451221466064, + "step": 628 + }, + { + "epoch": 1.721311475409836, + "grad_norm": 0.46438074111938477, + "learning_rate": 1.1978399225814802e-05, + "loss": 0.7404160499572754, + "step": 630 + }, + { + "epoch": 1.7267759562841531, + "grad_norm": 0.16041846573352814, + "learning_rate": 1.1911505718339238e-05, + "loss": 0.9989252686500549, + "step": 632 + }, + { + "epoch": 1.7322404371584699, + "grad_norm": 0.16997864842414856, + "learning_rate": 1.1844678930257189e-05, + "loss": 1.0865825414657593, + "step": 634 + }, + { + "epoch": 1.737704918032787, + "grad_norm": 0.19891749322414398, + "learning_rate": 1.1777921286728649e-05, + "loss": 0.8886318206787109, + "step": 636 + }, + { + "epoch": 1.7431693989071038, + "grad_norm": 0.1822405755519867, + "learning_rate": 1.1711235210404322e-05, + "loss": 0.9987983107566833, + "step": 638 + }, + { + "epoch": 1.748633879781421, + "grad_norm": 0.5658647418022156, + "learning_rate": 1.1644623121337744e-05, + "loss": 0.8494293689727783, + "step": 640 + }, + { + "epoch": 1.7540983606557377, + "grad_norm": 0.2527749836444855, + "learning_rate": 1.1578087436897425e-05, + "loss": 1.0923233032226562, + "step": 642 + }, + { + "epoch": 1.7595628415300546, + "grad_norm": 0.17464260756969452, + "learning_rate": 1.1511630571679133e-05, + "loss": 0.9967983961105347, + "step": 644 + }, + { + "epoch": 1.7650273224043715, + "grad_norm": 0.18370413780212402, + "learning_rate": 1.1445254937418262e-05, + "loss": 0.6361693739891052, + "step": 646 + }, + { + "epoch": 1.7704918032786885, + "grad_norm": 0.277452290058136, + "learning_rate": 1.1378962942902325e-05, + "loss": 0.9944519996643066, + "step": 648 + }, + { + "epoch": 1.7759562841530054, + "grad_norm": 0.17650683224201202, + "learning_rate": 1.1312756993883528e-05, + "loss": 0.9504789113998413, + "step": 650 + }, + { + "epoch": 1.7814207650273224, + "grad_norm": 0.1316922903060913, + "learning_rate": 1.1246639492991464e-05, + "loss": 0.9381383657455444, + "step": 652 + }, + { + "epoch": 1.7868852459016393, + "grad_norm": 0.44887641072273254, + "learning_rate": 1.1180612839645918e-05, + "loss": 0.9580968022346497, + "step": 654 + }, + { + "epoch": 1.7923497267759563, + "grad_norm": 0.5374517440795898, + "learning_rate": 1.1114679429969799e-05, + "loss": 0.8842057585716248, + "step": 656 + }, + { + "epoch": 1.7978142076502732, + "grad_norm": 0.2197987288236618, + "learning_rate": 1.1048841656702183e-05, + "loss": 0.6668030023574829, + "step": 658 + }, + { + "epoch": 1.8032786885245902, + "grad_norm": 0.2299073338508606, + "learning_rate": 1.0983101909111476e-05, + "loss": 0.9139745831489563, + "step": 660 + }, + { + "epoch": 1.8087431693989071, + "grad_norm": 0.17643967270851135, + "learning_rate": 1.0917462572908709e-05, + "loss": 0.9805459976196289, + "step": 662 + }, + { + "epoch": 1.814207650273224, + "grad_norm": 0.23675832152366638, + "learning_rate": 1.0851926030160964e-05, + "loss": 1.0229063034057617, + "step": 664 + }, + { + "epoch": 1.819672131147541, + "grad_norm": 0.3587989807128906, + "learning_rate": 1.0786494659204919e-05, + "loss": 0.9302806854248047, + "step": 666 + }, + { + "epoch": 1.825136612021858, + "grad_norm": 0.18514367938041687, + "learning_rate": 1.0721170834560539e-05, + "loss": 0.9443346261978149, + "step": 668 + }, + { + "epoch": 1.830601092896175, + "grad_norm": 0.1954764574766159, + "learning_rate": 1.0655956926844917e-05, + "loss": 0.6162158250808716, + "step": 670 + }, + { + "epoch": 1.8360655737704918, + "grad_norm": 0.6150888800621033, + "learning_rate": 1.0590855302686232e-05, + "loss": 0.864729642868042, + "step": 672 + }, + { + "epoch": 1.8415300546448088, + "grad_norm": 0.15662595629692078, + "learning_rate": 1.0525868324637867e-05, + "loss": 0.9829941391944885, + "step": 674 + }, + { + "epoch": 1.8469945355191257, + "grad_norm": 0.1537102609872818, + "learning_rate": 1.046099835109266e-05, + "loss": 0.9522998332977295, + "step": 676 + }, + { + "epoch": 1.8524590163934427, + "grad_norm": 0.13544432818889618, + "learning_rate": 1.0396247736197342e-05, + "loss": 0.22472639381885529, + "step": 678 + }, + { + "epoch": 1.8579234972677594, + "grad_norm": 0.214643195271492, + "learning_rate": 1.0331618829767084e-05, + "loss": 0.9816657900810242, + "step": 680 + }, + { + "epoch": 1.8633879781420766, + "grad_norm": 0.14030401408672333, + "learning_rate": 1.026711397720023e-05, + "loss": 0.9638572931289673, + "step": 682 + }, + { + "epoch": 1.8688524590163933, + "grad_norm": 0.23839998245239258, + "learning_rate": 1.0202735519393172e-05, + "loss": 0.8905544281005859, + "step": 684 + }, + { + "epoch": 1.8743169398907105, + "grad_norm": 0.6069282293319702, + "learning_rate": 1.013848579265542e-05, + "loss": 0.7191258668899536, + "step": 686 + }, + { + "epoch": 1.8797814207650272, + "grad_norm": 0.18707877397537231, + "learning_rate": 1.0074367128624787e-05, + "loss": 0.9472928047180176, + "step": 688 + }, + { + "epoch": 1.8852459016393444, + "grad_norm": 0.1760343462228775, + "learning_rate": 1.0010381854182808e-05, + "loss": 1.0247843265533447, + "step": 690 + }, + { + "epoch": 1.890710382513661, + "grad_norm": 0.2688332796096802, + "learning_rate": 9.946532291370258e-06, + "loss": 0.7216252684593201, + "step": 692 + }, + { + "epoch": 1.8961748633879782, + "grad_norm": 0.2513716518878937, + "learning_rate": 9.882820757302921e-06, + "loss": 0.9491530060768127, + "step": 694 + }, + { + "epoch": 1.901639344262295, + "grad_norm": 0.1897820681333542, + "learning_rate": 9.819249564087476e-06, + "loss": 0.9903871417045593, + "step": 696 + }, + { + "epoch": 1.9071038251366121, + "grad_norm": 0.2507593631744385, + "learning_rate": 9.755821018737602e-06, + "loss": 0.739215075969696, + "step": 698 + }, + { + "epoch": 1.9125683060109289, + "grad_norm": 0.2772381901741028, + "learning_rate": 9.692537423090254e-06, + "loss": 0.9631948471069336, + "step": 700 + }, + { + "epoch": 1.918032786885246, + "grad_norm": 0.1794651299715042, + "learning_rate": 9.629401073722121e-06, + "loss": 0.9130382537841797, + "step": 702 + }, + { + "epoch": 1.9234972677595628, + "grad_norm": 0.18744076788425446, + "learning_rate": 9.566414261866308e-06, + "loss": 0.9499990940093994, + "step": 704 + }, + { + "epoch": 1.92896174863388, + "grad_norm": 0.21617577970027924, + "learning_rate": 9.503579273329149e-06, + "loss": 1.0123318433761597, + "step": 706 + }, + { + "epoch": 1.9344262295081966, + "grad_norm": 0.17715895175933838, + "learning_rate": 9.440898388407277e-06, + "loss": 0.9562976360321045, + "step": 708 + }, + { + "epoch": 1.9398907103825138, + "grad_norm": 0.1489608734846115, + "learning_rate": 9.378373881804876e-06, + "loss": 0.6913546323776245, + "step": 710 + }, + { + "epoch": 1.9453551912568305, + "grad_norm": 0.24274346232414246, + "learning_rate": 9.316008022551121e-06, + "loss": 0.5943477749824524, + "step": 712 + }, + { + "epoch": 1.9508196721311475, + "grad_norm": 0.25395557284355164, + "learning_rate": 9.253803073917827e-06, + "loss": 1.011484146118164, + "step": 714 + }, + { + "epoch": 1.9562841530054644, + "grad_norm": 0.38927656412124634, + "learning_rate": 9.19176129333734e-06, + "loss": 0.6682248115539551, + "step": 716 + }, + { + "epoch": 1.9617486338797814, + "grad_norm": 0.25566795468330383, + "learning_rate": 9.129884932320593e-06, + "loss": 0.9508749842643738, + "step": 718 + }, + { + "epoch": 1.9672131147540983, + "grad_norm": 0.4887275993824005, + "learning_rate": 9.0681762363754e-06, + "loss": 1.0459179878234863, + "step": 720 + }, + { + "epoch": 1.9726775956284153, + "grad_norm": 0.3566785454750061, + "learning_rate": 9.006637444924973e-06, + "loss": 0.9785248041152954, + "step": 722 + }, + { + "epoch": 1.9781420765027322, + "grad_norm": 0.5626527667045593, + "learning_rate": 8.945270791226646e-06, + "loss": 0.945759117603302, + "step": 724 + }, + { + "epoch": 1.9836065573770492, + "grad_norm": 5.221266269683838, + "learning_rate": 8.884078502290838e-06, + "loss": 0.9256299138069153, + "step": 726 + }, + { + "epoch": 1.989071038251366, + "grad_norm": 0.2847398519515991, + "learning_rate": 8.823062798800233e-06, + "loss": 0.9720381498336792, + "step": 728 + }, + { + "epoch": 1.994535519125683, + "grad_norm": 0.2270623743534088, + "learning_rate": 8.762225895029169e-06, + "loss": 0.8780955672264099, + "step": 730 + }, + { + "epoch": 2.0, + "grad_norm": 0.18520908057689667, + "learning_rate": 8.701569998763323e-06, + "loss": 0.9250549674034119, + "step": 732 + }, + { + "epoch": 2.0054644808743167, + "grad_norm": 0.17245544493198395, + "learning_rate": 8.64109731121955e-06, + "loss": 0.41599372029304504, + "step": 734 + }, + { + "epoch": 2.010928961748634, + "grad_norm": 0.25467920303344727, + "learning_rate": 8.580810026966034e-06, + "loss": 0.7172941565513611, + "step": 736 + }, + { + "epoch": 2.0163934426229506, + "grad_norm": 0.24227584898471832, + "learning_rate": 8.520710333842605e-06, + "loss": 0.747187614440918, + "step": 738 + }, + { + "epoch": 2.021857923497268, + "grad_norm": 0.24147503077983856, + "learning_rate": 8.460800412881386e-06, + "loss": 0.6775767207145691, + "step": 740 + }, + { + "epoch": 2.0273224043715845, + "grad_norm": 0.31402096152305603, + "learning_rate": 8.401082438227617e-06, + "loss": 0.402444452047348, + "step": 742 + }, + { + "epoch": 2.0327868852459017, + "grad_norm": 0.42537474632263184, + "learning_rate": 8.341558577060769e-06, + "loss": 0.6484034657478333, + "step": 744 + }, + { + "epoch": 2.0382513661202184, + "grad_norm": 0.3523010313510895, + "learning_rate": 8.282230989515865e-06, + "loss": 0.4699830114841461, + "step": 746 + }, + { + "epoch": 2.0437158469945356, + "grad_norm": 0.5600184202194214, + "learning_rate": 8.223101828605141e-06, + "loss": 0.6734252572059631, + "step": 748 + }, + { + "epoch": 2.0491803278688523, + "grad_norm": 0.15328167378902435, + "learning_rate": 8.164173240139865e-06, + "loss": 0.45088091492652893, + "step": 750 + }, + { + "epoch": 2.0546448087431695, + "grad_norm": 0.6332978010177612, + "learning_rate": 8.105447362652492e-06, + "loss": 0.44899141788482666, + "step": 752 + }, + { + "epoch": 2.060109289617486, + "grad_norm": 1.7822338342666626, + "learning_rate": 8.046926327319037e-06, + "loss": 0.5379037857055664, + "step": 754 + }, + { + "epoch": 2.0655737704918034, + "grad_norm": 0.3468593657016754, + "learning_rate": 7.988612257881757e-06, + "loss": 0.6135660409927368, + "step": 756 + }, + { + "epoch": 2.07103825136612, + "grad_norm": 0.4308558702468872, + "learning_rate": 7.930507270572065e-06, + "loss": 0.6905173659324646, + "step": 758 + }, + { + "epoch": 2.0765027322404372, + "grad_norm": 0.214769646525383, + "learning_rate": 7.872613474033722e-06, + "loss": 0.6643854379653931, + "step": 760 + }, + { + "epoch": 2.081967213114754, + "grad_norm": 0.2445564866065979, + "learning_rate": 7.814932969246342e-06, + "loss": 0.5414201021194458, + "step": 762 + }, + { + "epoch": 2.087431693989071, + "grad_norm": 0.2596500515937805, + "learning_rate": 7.75746784944912e-06, + "loss": 0.6696488261222839, + "step": 764 + }, + { + "epoch": 2.092896174863388, + "grad_norm": 0.30743348598480225, + "learning_rate": 7.700220200064889e-06, + "loss": 0.6852096319198608, + "step": 766 + }, + { + "epoch": 2.098360655737705, + "grad_norm": 0.9081697463989258, + "learning_rate": 7.643192098624413e-06, + "loss": 0.665625810623169, + "step": 768 + }, + { + "epoch": 2.1038251366120218, + "grad_norm": 0.2047584056854248, + "learning_rate": 7.586385614691024e-06, + "loss": 0.6999115943908691, + "step": 770 + }, + { + "epoch": 2.109289617486339, + "grad_norm": 0.5113670229911804, + "learning_rate": 7.529802809785497e-06, + "loss": 0.6937403082847595, + "step": 772 + }, + { + "epoch": 2.1147540983606556, + "grad_norm": 0.19943810999393463, + "learning_rate": 7.473445737311244e-06, + "loss": 0.7570751309394836, + "step": 774 + }, + { + "epoch": 2.120218579234973, + "grad_norm": 0.24206914007663727, + "learning_rate": 7.417316442479783e-06, + "loss": 0.4566923677921295, + "step": 776 + }, + { + "epoch": 2.1256830601092895, + "grad_norm": 0.5807485580444336, + "learning_rate": 7.361416962236542e-06, + "loss": 0.617829442024231, + "step": 778 + }, + { + "epoch": 2.1311475409836067, + "grad_norm": 0.3329077959060669, + "learning_rate": 7.305749325186914e-06, + "loss": 0.6804491877555847, + "step": 780 + }, + { + "epoch": 2.1366120218579234, + "grad_norm": 0.20504486560821533, + "learning_rate": 7.2503155515226575e-06, + "loss": 0.7602362632751465, + "step": 782 + }, + { + "epoch": 2.1420765027322406, + "grad_norm": 0.1894470453262329, + "learning_rate": 7.195117652948555e-06, + "loss": 0.6853156089782715, + "step": 784 + }, + { + "epoch": 2.1475409836065573, + "grad_norm": 0.11648036539554596, + "learning_rate": 7.14015763260944e-06, + "loss": 0.3979906439781189, + "step": 786 + }, + { + "epoch": 2.1530054644808745, + "grad_norm": 1.0007914304733276, + "learning_rate": 7.085437485017487e-06, + "loss": 0.640531063079834, + "step": 788 + }, + { + "epoch": 2.158469945355191, + "grad_norm": 0.1968093067407608, + "learning_rate": 7.030959195979833e-06, + "loss": 0.6763147115707397, + "step": 790 + }, + { + "epoch": 2.1639344262295084, + "grad_norm": 0.2067585587501526, + "learning_rate": 6.976724742526503e-06, + "loss": 0.5175256729125977, + "step": 792 + }, + { + "epoch": 2.169398907103825, + "grad_norm": 0.7652968764305115, + "learning_rate": 6.92273609283868e-06, + "loss": 0.4226246774196625, + "step": 794 + }, + { + "epoch": 2.1748633879781423, + "grad_norm": 0.21701058745384216, + "learning_rate": 6.868995206177264e-06, + "loss": 0.6554322838783264, + "step": 796 + }, + { + "epoch": 2.180327868852459, + "grad_norm": 1.6271648406982422, + "learning_rate": 6.815504032811775e-06, + "loss": 0.6646580100059509, + "step": 798 + }, + { + "epoch": 2.185792349726776, + "grad_norm": 0.4442613422870636, + "learning_rate": 6.7622645139495815e-06, + "loss": 0.585219144821167, + "step": 800 + }, + { + "epoch": 2.191256830601093, + "grad_norm": 0.23071300983428955, + "learning_rate": 6.7092785816654515e-06, + "loss": 0.6361981630325317, + "step": 802 + }, + { + "epoch": 2.19672131147541, + "grad_norm": 0.15861168503761292, + "learning_rate": 6.656548158831436e-06, + "loss": 0.7302151322364807, + "step": 804 + }, + { + "epoch": 2.202185792349727, + "grad_norm": 0.2113688737154007, + "learning_rate": 6.6040751590470825e-06, + "loss": 0.6704022884368896, + "step": 806 + }, + { + "epoch": 2.2076502732240435, + "grad_norm": 0.13283978402614594, + "learning_rate": 6.551861486569997e-06, + "loss": 0.38798680901527405, + "step": 808 + }, + { + "epoch": 2.2131147540983607, + "grad_norm": 0.23720614612102509, + "learning_rate": 6.499909036246738e-06, + "loss": 0.37715965509414673, + "step": 810 + }, + { + "epoch": 2.2185792349726774, + "grad_norm": 0.23791414499282837, + "learning_rate": 6.448219693444056e-06, + "loss": 0.7323361039161682, + "step": 812 + }, + { + "epoch": 2.2240437158469946, + "grad_norm": 0.24081940948963165, + "learning_rate": 6.39679533398045e-06, + "loss": 0.5640745759010315, + "step": 814 + }, + { + "epoch": 2.2295081967213113, + "grad_norm": 0.3527999222278595, + "learning_rate": 6.345637824058126e-06, + "loss": 0.673088788986206, + "step": 816 + }, + { + "epoch": 2.2349726775956285, + "grad_norm": 0.5025823712348938, + "learning_rate": 6.294749020195253e-06, + "loss": 0.6976668834686279, + "step": 818 + }, + { + "epoch": 2.240437158469945, + "grad_norm": 0.2786237895488739, + "learning_rate": 6.244130769158594e-06, + "loss": 0.6512521505355835, + "step": 820 + }, + { + "epoch": 2.2459016393442623, + "grad_norm": 1.141295313835144, + "learning_rate": 6.193784907896478e-06, + "loss": 0.6717444062232971, + "step": 822 + }, + { + "epoch": 2.251366120218579, + "grad_norm": 0.32836222648620605, + "learning_rate": 6.143713263472156e-06, + "loss": 0.6926584839820862, + "step": 824 + }, + { + "epoch": 2.2568306010928962, + "grad_norm": 0.4723096489906311, + "learning_rate": 6.0939176529974765e-06, + "loss": 0.46854105591773987, + "step": 826 + }, + { + "epoch": 2.262295081967213, + "grad_norm": 0.2536696791648865, + "learning_rate": 6.044399883566959e-06, + "loss": 0.4778412878513336, + "step": 828 + }, + { + "epoch": 2.26775956284153, + "grad_norm": 0.19949211180210114, + "learning_rate": 5.995161752192192e-06, + "loss": 0.7425835132598877, + "step": 830 + }, + { + "epoch": 2.273224043715847, + "grad_norm": 0.6780716180801392, + "learning_rate": 5.9462050457366446e-06, + "loss": 0.3689644932746887, + "step": 832 + }, + { + "epoch": 2.278688524590164, + "grad_norm": 0.22472506761550903, + "learning_rate": 5.897531540850807e-06, + "loss": 0.3652177155017853, + "step": 834 + }, + { + "epoch": 2.2841530054644807, + "grad_norm": 0.26355841755867004, + "learning_rate": 5.849143003907717e-06, + "loss": 0.632552444934845, + "step": 836 + }, + { + "epoch": 2.289617486338798, + "grad_norm": 0.6640907526016235, + "learning_rate": 5.801041190938849e-06, + "loss": 0.7073798775672913, + "step": 838 + }, + { + "epoch": 2.2950819672131146, + "grad_norm": 0.2682071626186371, + "learning_rate": 5.75322784757041e-06, + "loss": 0.4572494328022003, + "step": 840 + }, + { + "epoch": 2.300546448087432, + "grad_norm": 0.23095370829105377, + "learning_rate": 5.705704708959968e-06, + "loss": 0.7449085712432861, + "step": 842 + }, + { + "epoch": 2.3060109289617485, + "grad_norm": 0.19461746513843536, + "learning_rate": 5.658473499733499e-06, + "loss": 0.6545416712760925, + "step": 844 + }, + { + "epoch": 2.3114754098360657, + "grad_norm": 0.1823868453502655, + "learning_rate": 5.611535933922781e-06, + "loss": 0.29822948575019836, + "step": 846 + }, + { + "epoch": 2.3169398907103824, + "grad_norm": 0.19124025106430054, + "learning_rate": 5.564893714903211e-06, + "loss": 0.7034991383552551, + "step": 848 + }, + { + "epoch": 2.3224043715846996, + "grad_norm": 0.19907920062541962, + "learning_rate": 5.518548535331984e-06, + "loss": 0.6916401386260986, + "step": 850 + }, + { + "epoch": 2.3278688524590163, + "grad_norm": 0.3887382447719574, + "learning_rate": 5.472502077086653e-06, + "loss": 0.6322124004364014, + "step": 852 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.23771852254867554, + "learning_rate": 5.426756011204108e-06, + "loss": 0.6052520871162415, + "step": 854 + }, + { + "epoch": 2.33879781420765, + "grad_norm": 0.14829999208450317, + "learning_rate": 5.381311997819933e-06, + "loss": 0.7813782691955566, + "step": 856 + }, + { + "epoch": 2.3442622950819674, + "grad_norm": 0.26474112272262573, + "learning_rate": 5.336171686108153e-06, + "loss": 0.5027655959129333, + "step": 858 + }, + { + "epoch": 2.349726775956284, + "grad_norm": 0.2717822194099426, + "learning_rate": 5.291336714221382e-06, + "loss": 0.7000398635864258, + "step": 860 + }, + { + "epoch": 2.3551912568306013, + "grad_norm": 0.24665828049182892, + "learning_rate": 5.246808709231382e-06, + "loss": 0.7338890433311462, + "step": 862 + }, + { + "epoch": 2.360655737704918, + "grad_norm": 1.1350178718566895, + "learning_rate": 5.202589287070019e-06, + "loss": 0.5282155275344849, + "step": 864 + }, + { + "epoch": 2.366120218579235, + "grad_norm": 1.0712934732437134, + "learning_rate": 5.158680052470609e-06, + "loss": 0.5917450785636902, + "step": 866 + }, + { + "epoch": 2.371584699453552, + "grad_norm": 0.17483752965927124, + "learning_rate": 5.115082598909689e-06, + "loss": 0.7042443156242371, + "step": 868 + }, + { + "epoch": 2.3770491803278686, + "grad_norm": 0.24676814675331116, + "learning_rate": 5.071798508549189e-06, + "loss": 0.715234100818634, + "step": 870 + }, + { + "epoch": 2.3825136612021858, + "grad_norm": 0.22551953792572021, + "learning_rate": 5.028829352179017e-06, + "loss": 0.804095983505249, + "step": 872 + }, + { + "epoch": 2.387978142076503, + "grad_norm": 0.18146005272865295, + "learning_rate": 4.986176689160042e-06, + "loss": 0.7026334404945374, + "step": 874 + }, + { + "epoch": 2.3934426229508197, + "grad_norm": 0.5449501872062683, + "learning_rate": 4.943842067367525e-06, + "loss": 0.49338310956954956, + "step": 876 + }, + { + "epoch": 2.3989071038251364, + "grad_norm": 0.1813872754573822, + "learning_rate": 4.901827023134931e-06, + "loss": 0.7016088366508484, + "step": 878 + }, + { + "epoch": 2.4043715846994536, + "grad_norm": 0.15302123129367828, + "learning_rate": 4.86013308119818e-06, + "loss": 0.6298723816871643, + "step": 880 + }, + { + "epoch": 2.4098360655737707, + "grad_norm": 0.18854597210884094, + "learning_rate": 4.818761754640316e-06, + "loss": 0.43687549233436584, + "step": 882 + }, + { + "epoch": 2.4153005464480874, + "grad_norm": 0.22853216528892517, + "learning_rate": 4.777714544836585e-06, + "loss": 0.7133804559707642, + "step": 884 + }, + { + "epoch": 2.420765027322404, + "grad_norm": 0.19461189210414886, + "learning_rate": 4.736992941399972e-06, + "loss": 0.7002368569374084, + "step": 886 + }, + { + "epoch": 2.4262295081967213, + "grad_norm": 0.2936843931674957, + "learning_rate": 4.696598422127119e-06, + "loss": 0.3858260214328766, + "step": 888 + }, + { + "epoch": 2.431693989071038, + "grad_norm": 0.25543907284736633, + "learning_rate": 4.656532452944716e-06, + "loss": 0.449078232049942, + "step": 890 + }, + { + "epoch": 2.4371584699453552, + "grad_norm": 0.46111661195755005, + "learning_rate": 4.616796487856282e-06, + "loss": 0.641425371170044, + "step": 892 + }, + { + "epoch": 2.442622950819672, + "grad_norm": 0.1983172744512558, + "learning_rate": 4.577391968889416e-06, + "loss": 0.3780648708343506, + "step": 894 + }, + { + "epoch": 2.448087431693989, + "grad_norm": 0.28097936511039734, + "learning_rate": 4.53832032604346e-06, + "loss": 0.6762884259223938, + "step": 896 + }, + { + "epoch": 2.453551912568306, + "grad_norm": 0.234109029173851, + "learning_rate": 4.499582977237594e-06, + "loss": 0.7201648950576782, + "step": 898 + }, + { + "epoch": 2.459016393442623, + "grad_norm": 0.18416042625904083, + "learning_rate": 4.461181328259397e-06, + "loss": 0.6271026134490967, + "step": 900 + }, + { + "epoch": 2.4644808743169397, + "grad_norm": 0.17045891284942627, + "learning_rate": 4.423116772713817e-06, + "loss": 0.7007759809494019, + "step": 902 + }, + { + "epoch": 2.469945355191257, + "grad_norm": 0.1790081411600113, + "learning_rate": 4.3853906919726045e-06, + "loss": 0.710627019405365, + "step": 904 + }, + { + "epoch": 2.4754098360655736, + "grad_norm": 0.8877317309379578, + "learning_rate": 4.348004455124173e-06, + "loss": 0.4587891399860382, + "step": 906 + }, + { + "epoch": 2.480874316939891, + "grad_norm": 0.07845756411552429, + "learning_rate": 4.310959418923926e-06, + "loss": 0.13992244005203247, + "step": 908 + }, + { + "epoch": 2.4863387978142075, + "grad_norm": 0.10895147174596786, + "learning_rate": 4.274256927745015e-06, + "loss": 0.49745655059814453, + "step": 910 + }, + { + "epoch": 2.4918032786885247, + "grad_norm": 0.1617729514837265, + "learning_rate": 4.23789831352955e-06, + "loss": 0.4288228750228882, + "step": 912 + }, + { + "epoch": 2.4972677595628414, + "grad_norm": 0.21319684386253357, + "learning_rate": 4.2018848957402605e-06, + "loss": 0.6953483819961548, + "step": 914 + }, + { + "epoch": 2.5027322404371586, + "grad_norm": 0.2012702375650406, + "learning_rate": 4.166217981312621e-06, + "loss": 0.5614168047904968, + "step": 916 + }, + { + "epoch": 2.5081967213114753, + "grad_norm": 0.20277954638004303, + "learning_rate": 4.130898864607417e-06, + "loss": 0.6851552724838257, + "step": 918 + }, + { + "epoch": 2.5136612021857925, + "grad_norm": 0.2765226662158966, + "learning_rate": 4.09592882736377e-06, + "loss": 0.6657923460006714, + "step": 920 + }, + { + "epoch": 2.519125683060109, + "grad_norm": 0.4667165279388428, + "learning_rate": 4.061309138652622e-06, + "loss": 0.7339661717414856, + "step": 922 + }, + { + "epoch": 2.5245901639344264, + "grad_norm": 0.16822604835033417, + "learning_rate": 4.027041054830689e-06, + "loss": 0.6094578504562378, + "step": 924 + }, + { + "epoch": 2.530054644808743, + "grad_norm": 0.14607980847358704, + "learning_rate": 3.993125819494862e-06, + "loss": 0.34630584716796875, + "step": 926 + }, + { + "epoch": 2.5355191256830603, + "grad_norm": 0.15208321809768677, + "learning_rate": 3.9595646634370745e-06, + "loss": 0.4717685878276825, + "step": 928 + }, + { + "epoch": 2.540983606557377, + "grad_norm": 0.2327331304550171, + "learning_rate": 3.926358804599636e-06, + "loss": 0.6192865371704102, + "step": 930 + }, + { + "epoch": 2.546448087431694, + "grad_norm": 0.757671594619751, + "learning_rate": 3.8935094480310425e-06, + "loss": 0.5656976103782654, + "step": 932 + }, + { + "epoch": 2.551912568306011, + "grad_norm": 0.22793398797512054, + "learning_rate": 3.861017785842234e-06, + "loss": 0.7342063188552856, + "step": 934 + }, + { + "epoch": 2.557377049180328, + "grad_norm": 0.9295087456703186, + "learning_rate": 3.828884997163339e-06, + "loss": 0.5990333557128906, + "step": 936 + }, + { + "epoch": 2.5628415300546448, + "grad_norm": 2.140310764312744, + "learning_rate": 3.7971122481008757e-06, + "loss": 0.5494145154953003, + "step": 938 + }, + { + "epoch": 2.5683060109289615, + "grad_norm": 0.28645941615104675, + "learning_rate": 3.7657006916954453e-06, + "loss": 0.6415644288063049, + "step": 940 + }, + { + "epoch": 2.5737704918032787, + "grad_norm": 0.38164955377578735, + "learning_rate": 3.734651467879878e-06, + "loss": 0.43504810333251953, + "step": 942 + }, + { + "epoch": 2.579234972677596, + "grad_norm": 0.34789642691612244, + "learning_rate": 3.7039657034378726e-06, + "loss": 0.6228368282318115, + "step": 944 + }, + { + "epoch": 2.5846994535519126, + "grad_norm": 0.07762913405895233, + "learning_rate": 3.6736445119630964e-06, + "loss": 0.16022926568984985, + "step": 946 + }, + { + "epoch": 2.5901639344262293, + "grad_norm": 0.28069254755973816, + "learning_rate": 3.6436889938187833e-06, + "loss": 0.6584885716438293, + "step": 948 + }, + { + "epoch": 2.5956284153005464, + "grad_norm": 0.24518746137619019, + "learning_rate": 3.6141002360977864e-06, + "loss": 0.43174296617507935, + "step": 950 + }, + { + "epoch": 2.6010928961748636, + "grad_norm": 0.39224839210510254, + "learning_rate": 3.584879312583141e-06, + "loss": 0.5712751746177673, + "step": 952 + }, + { + "epoch": 2.6065573770491803, + "grad_norm": 0.24317054450511932, + "learning_rate": 3.5560272837090935e-06, + "loss": 0.37820327281951904, + "step": 954 + }, + { + "epoch": 2.612021857923497, + "grad_norm": 0.15435853600502014, + "learning_rate": 3.527545196522615e-06, + "loss": 0.17984823882579803, + "step": 956 + }, + { + "epoch": 2.6174863387978142, + "grad_norm": 0.32159680128097534, + "learning_rate": 3.499434084645407e-06, + "loss": 0.4029221534729004, + "step": 958 + }, + { + "epoch": 2.6229508196721314, + "grad_norm": 0.3094979226589203, + "learning_rate": 3.4716949682363803e-06, + "loss": 0.739897608757019, + "step": 960 + }, + { + "epoch": 2.628415300546448, + "grad_norm": 0.425645112991333, + "learning_rate": 3.4443288539546508e-06, + "loss": 0.4608995020389557, + "step": 962 + }, + { + "epoch": 2.633879781420765, + "grad_norm": 0.22378240525722504, + "learning_rate": 3.417336734922998e-06, + "loss": 0.6960005760192871, + "step": 964 + }, + { + "epoch": 2.639344262295082, + "grad_norm": 0.18555636703968048, + "learning_rate": 3.3907195906918236e-06, + "loss": 0.7077355980873108, + "step": 966 + }, + { + "epoch": 2.644808743169399, + "grad_norm": 0.2145896553993225, + "learning_rate": 3.364478387203602e-06, + "loss": 0.5513492226600647, + "step": 968 + }, + { + "epoch": 2.650273224043716, + "grad_norm": 0.7693909406661987, + "learning_rate": 3.338614076757832e-06, + "loss": 0.5141048431396484, + "step": 970 + }, + { + "epoch": 2.6557377049180326, + "grad_norm": 0.3477981388568878, + "learning_rate": 3.3131275979764746e-06, + "loss": 0.6895625591278076, + "step": 972 + }, + { + "epoch": 2.66120218579235, + "grad_norm": 0.19012540578842163, + "learning_rate": 3.288019875769889e-06, + "loss": 0.39245468378067017, + "step": 974 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.24051906168460846, + "learning_rate": 3.2632918213032687e-06, + "loss": 0.6571274995803833, + "step": 976 + }, + { + "epoch": 2.6721311475409837, + "grad_norm": 0.14302131533622742, + "learning_rate": 3.2389443319635762e-06, + "loss": 0.6791136860847473, + "step": 978 + }, + { + "epoch": 2.6775956284153004, + "grad_norm": 0.21270255744457245, + "learning_rate": 3.2149782913269715e-06, + "loss": 0.7463234066963196, + "step": 980 + }, + { + "epoch": 2.6830601092896176, + "grad_norm": 0.20450647175312042, + "learning_rate": 3.1913945691267574e-06, + "loss": 0.46346530318260193, + "step": 982 + }, + { + "epoch": 2.6885245901639343, + "grad_norm": 0.18693040311336517, + "learning_rate": 3.1681940212218013e-06, + "loss": 0.3435611426830292, + "step": 984 + }, + { + "epoch": 2.6939890710382515, + "grad_norm": 0.20872755348682404, + "learning_rate": 3.145377489565492e-06, + "loss": 0.6869267225265503, + "step": 986 + }, + { + "epoch": 2.699453551912568, + "grad_norm": 0.19473455846309662, + "learning_rate": 3.1229458021751754e-06, + "loss": 0.7176718711853027, + "step": 988 + }, + { + "epoch": 2.7049180327868854, + "grad_norm": 0.17841266095638275, + "learning_rate": 3.100899773102103e-06, + "loss": 0.618594229221344, + "step": 990 + }, + { + "epoch": 2.710382513661202, + "grad_norm": 0.19311846792697906, + "learning_rate": 3.0792402024019028e-06, + "loss": 0.34453269839286804, + "step": 992 + }, + { + "epoch": 2.7158469945355193, + "grad_norm": 0.22827599942684174, + "learning_rate": 3.0579678761055294e-06, + "loss": 0.6793984770774841, + "step": 994 + }, + { + "epoch": 2.721311475409836, + "grad_norm": 0.1737447828054428, + "learning_rate": 3.0370835661907522e-06, + "loss": 0.7369482517242432, + "step": 996 + }, + { + "epoch": 2.726775956284153, + "grad_norm": 0.17720820009708405, + "learning_rate": 3.016588030554128e-06, + "loss": 0.6935803294181824, + "step": 998 + }, + { + "epoch": 2.73224043715847, + "grad_norm": 0.650160551071167, + "learning_rate": 2.996482012983508e-06, + "loss": 0.5621395707130432, + "step": 1000 + }, + { + "epoch": 2.737704918032787, + "grad_norm": 0.23146776854991913, + "learning_rate": 2.9767662431310373e-06, + "loss": 0.6701301336288452, + "step": 1002 + }, + { + "epoch": 2.7431693989071038, + "grad_norm": 0.45669713616371155, + "learning_rate": 2.957441436486682e-06, + "loss": 0.7368155121803284, + "step": 1004 + }, + { + "epoch": 2.748633879781421, + "grad_norm": 0.2950546145439148, + "learning_rate": 2.9385082943522553e-06, + "loss": 0.2910221815109253, + "step": 1006 + }, + { + "epoch": 2.7540983606557377, + "grad_norm": 0.19097362458705902, + "learning_rate": 2.9199675038159764e-06, + "loss": 0.7412728667259216, + "step": 1008 + }, + { + "epoch": 2.7595628415300544, + "grad_norm": 0.24510665237903595, + "learning_rate": 2.9018197377275334e-06, + "loss": 0.6697684526443481, + "step": 1010 + }, + { + "epoch": 2.7650273224043715, + "grad_norm": 0.14479011297225952, + "learning_rate": 2.8840656546736607e-06, + "loss": 0.5396718978881836, + "step": 1012 + }, + { + "epoch": 2.7704918032786887, + "grad_norm": 0.23086193203926086, + "learning_rate": 2.866705898954241e-06, + "loss": 0.6256304383277893, + "step": 1014 + }, + { + "epoch": 2.7759562841530054, + "grad_norm": 0.29304757714271545, + "learning_rate": 2.8497411005589283e-06, + "loss": 0.6394587755203247, + "step": 1016 + }, + { + "epoch": 2.781420765027322, + "grad_norm": 0.3104848265647888, + "learning_rate": 2.8331718751442793e-06, + "loss": 0.6255086064338684, + "step": 1018 + }, + { + "epoch": 2.7868852459016393, + "grad_norm": 0.14495858550071716, + "learning_rate": 2.8169988240114145e-06, + "loss": 0.40636876225471497, + "step": 1020 + }, + { + "epoch": 2.7923497267759565, + "grad_norm": 0.29641368985176086, + "learning_rate": 2.801222534084196e-06, + "loss": 0.6926835179328918, + "step": 1022 + }, + { + "epoch": 2.797814207650273, + "grad_norm": 0.2795805037021637, + "learning_rate": 2.785843577887926e-06, + "loss": 0.3881547749042511, + "step": 1024 + }, + { + "epoch": 2.80327868852459, + "grad_norm": 0.45699605345726013, + "learning_rate": 2.7708625135285717e-06, + "loss": 0.7579897046089172, + "step": 1026 + }, + { + "epoch": 2.808743169398907, + "grad_norm": 1.0389068126678467, + "learning_rate": 2.7562798846725133e-06, + "loss": 0.2406935691833496, + "step": 1028 + }, + { + "epoch": 2.8142076502732243, + "grad_norm": 0.3375071585178375, + "learning_rate": 2.742096220526808e-06, + "loss": 0.686618447303772, + "step": 1030 + }, + { + "epoch": 2.819672131147541, + "grad_norm": 0.18238572776317596, + "learning_rate": 2.728312035819995e-06, + "loss": 0.667717695236206, + "step": 1032 + }, + { + "epoch": 2.8251366120218577, + "grad_norm": 0.49168893694877625, + "learning_rate": 2.7149278307834047e-06, + "loss": 0.5261977910995483, + "step": 1034 + }, + { + "epoch": 2.830601092896175, + "grad_norm": 0.24831274151802063, + "learning_rate": 2.701944091133011e-06, + "loss": 0.7198750972747803, + "step": 1036 + }, + { + "epoch": 2.836065573770492, + "grad_norm": 0.22962301969528198, + "learning_rate": 2.6893612880518064e-06, + "loss": 0.737044095993042, + "step": 1038 + }, + { + "epoch": 2.841530054644809, + "grad_norm": 0.19474509358406067, + "learning_rate": 2.677179878172699e-06, + "loss": 0.7108073830604553, + "step": 1040 + }, + { + "epoch": 2.8469945355191255, + "grad_norm": 0.20618607103824615, + "learning_rate": 2.6654003035619427e-06, + "loss": 0.6671370267868042, + "step": 1042 + }, + { + "epoch": 2.8524590163934427, + "grad_norm": 0.1941603571176529, + "learning_rate": 2.654022991703093e-06, + "loss": 0.5263208746910095, + "step": 1044 + }, + { + "epoch": 2.8579234972677594, + "grad_norm": 0.4619889259338379, + "learning_rate": 2.6430483554814956e-06, + "loss": 0.6518206596374512, + "step": 1046 + }, + { + "epoch": 2.8633879781420766, + "grad_norm": 0.21917857229709625, + "learning_rate": 2.632476793169303e-06, + "loss": 0.3260215222835541, + "step": 1048 + }, + { + "epoch": 2.8688524590163933, + "grad_norm": 0.17283010482788086, + "learning_rate": 2.622308688411019e-06, + "loss": 0.6257282495498657, + "step": 1050 + }, + { + "epoch": 2.8743169398907105, + "grad_norm": 0.23165258765220642, + "learning_rate": 2.612544410209574e-06, + "loss": 0.6545224189758301, + "step": 1052 + }, + { + "epoch": 2.879781420765027, + "grad_norm": 0.2500731945037842, + "learning_rate": 2.6031843129129425e-06, + "loss": 0.6312705278396606, + "step": 1054 + }, + { + "epoch": 2.8852459016393444, + "grad_norm": 0.7894412875175476, + "learning_rate": 2.594228736201274e-06, + "loss": 0.6379542946815491, + "step": 1056 + }, + { + "epoch": 2.890710382513661, + "grad_norm": 1.0066219568252563, + "learning_rate": 2.5856780050745726e-06, + "loss": 0.44013917446136475, + "step": 1058 + }, + { + "epoch": 2.8961748633879782, + "grad_norm": 0.7618535757064819, + "learning_rate": 2.577532429840896e-06, + "loss": 0.6751360297203064, + "step": 1060 + }, + { + "epoch": 2.901639344262295, + "grad_norm": 0.2618259787559509, + "learning_rate": 2.5697923061051056e-06, + "loss": 0.7184084057807922, + "step": 1062 + }, + { + "epoch": 2.907103825136612, + "grad_norm": 0.17739877104759216, + "learning_rate": 2.5624579147581233e-06, + "loss": 0.8313037753105164, + "step": 1064 + }, + { + "epoch": 2.912568306010929, + "grad_norm": 0.09571345150470734, + "learning_rate": 2.555529521966754e-06, + "loss": 0.3473111391067505, + "step": 1066 + }, + { + "epoch": 2.918032786885246, + "grad_norm": 0.21488875150680542, + "learning_rate": 2.5490073791640125e-06, + "loss": 0.45797085762023926, + "step": 1068 + }, + { + "epoch": 2.9234972677595628, + "grad_norm": 0.27381402254104614, + "learning_rate": 2.542891723040009e-06, + "loss": 0.5820063948631287, + "step": 1070 + }, + { + "epoch": 2.92896174863388, + "grad_norm": 0.19229131937026978, + "learning_rate": 2.5371827755333562e-06, + "loss": 0.7217463254928589, + "step": 1072 + }, + { + "epoch": 2.9344262295081966, + "grad_norm": 0.21169181168079376, + "learning_rate": 2.5318807438231114e-06, + "loss": 0.35460832715034485, + "step": 1074 + }, + { + "epoch": 2.939890710382514, + "grad_norm": 0.20496554672718048, + "learning_rate": 2.5269858203212634e-06, + "loss": 0.3624781668186188, + "step": 1076 + }, + { + "epoch": 2.9453551912568305, + "grad_norm": 0.267284095287323, + "learning_rate": 2.522498182665746e-06, + "loss": 0.4492642879486084, + "step": 1078 + }, + { + "epoch": 2.9508196721311473, + "grad_norm": 0.286236971616745, + "learning_rate": 2.5184179937139957e-06, + "loss": 0.6190025210380554, + "step": 1080 + }, + { + "epoch": 2.9562841530054644, + "grad_norm": 0.2303326278924942, + "learning_rate": 2.5147454015370368e-06, + "loss": 0.5356917977333069, + "step": 1082 + }, + { + "epoch": 2.9617486338797816, + "grad_norm": 0.1802782416343689, + "learning_rate": 2.511480539414109e-06, + "loss": 0.6177518367767334, + "step": 1084 + }, + { + "epoch": 2.9672131147540983, + "grad_norm": 0.15038588643074036, + "learning_rate": 2.508623525827835e-06, + "loss": 0.32314690947532654, + "step": 1086 + }, + { + "epoch": 2.972677595628415, + "grad_norm": 0.20698142051696777, + "learning_rate": 2.5061744644599134e-06, + "loss": 0.6555347442626953, + "step": 1088 + }, + { + "epoch": 2.978142076502732, + "grad_norm": 0.1626245528459549, + "learning_rate": 2.504133444187364e-06, + "loss": 0.5033056139945984, + "step": 1090 + }, + { + "epoch": 2.9836065573770494, + "grad_norm": 0.20192566514015198, + "learning_rate": 2.5025005390792964e-06, + "loss": 0.6249701380729675, + "step": 1092 + }, + { + "epoch": 2.989071038251366, + "grad_norm": 0.17979055643081665, + "learning_rate": 2.5012758083942227e-06, + "loss": 0.5733636021614075, + "step": 1094 + }, + { + "epoch": 2.994535519125683, + "grad_norm": 0.41940733790397644, + "learning_rate": 2.500459296577912e-06, + "loss": 0.4294215440750122, + "step": 1096 + }, + { + "epoch": 3.0, + "grad_norm": 0.2482825517654419, + "learning_rate": 2.5000510332617706e-06, + "loss": 0.6288962364196777, + "step": 1098 + }, + { + "epoch": 3.0, + "step": 1098, + "total_flos": 4.931331991116186e+18, + "train_loss": 0.9040751279160407, + "train_runtime": 12420.2984, + "train_samples_per_second": 5.304, + "train_steps_per_second": 0.088 + } + ], + "logging_steps": 2, + "max_steps": 1098, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 99999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.931331991116186e+18, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +}