diff --git "a/SFT/trainer_state.json" "b/SFT/trainer_state.json" new file mode 100644--- /dev/null +++ "b/SFT/trainer_state.json" @@ -0,0 +1,7815 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 2500, + "global_step": 5540, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.009025270758122744, + "grad_norm": 9.466440213077052, + "learning_rate": 1.9985559566787006e-05, + "loss": 2.4118, + "step": 5 + }, + { + "epoch": 0.018050541516245487, + "grad_norm": 5.092031681557354, + "learning_rate": 1.996750902527076e-05, + "loss": 0.8115, + "step": 10 + }, + { + "epoch": 0.02707581227436823, + "grad_norm": 3.581292467477243, + "learning_rate": 1.9949458483754514e-05, + "loss": 0.7566, + "step": 15 + }, + { + "epoch": 0.036101083032490974, + "grad_norm": 3.0664426091496577, + "learning_rate": 1.9931407942238267e-05, + "loss": 0.7238, + "step": 20 + }, + { + "epoch": 0.04512635379061372, + "grad_norm": 2.953202771996899, + "learning_rate": 1.9913357400722025e-05, + "loss": 0.7088, + "step": 25 + }, + { + "epoch": 0.05415162454873646, + "grad_norm": 3.2107443547649233, + "learning_rate": 1.989530685920578e-05, + "loss": 0.6899, + "step": 30 + }, + { + "epoch": 0.0631768953068592, + "grad_norm": 2.606024610125901, + "learning_rate": 1.9877256317689532e-05, + "loss": 0.6928, + "step": 35 + }, + { + "epoch": 0.07220216606498195, + "grad_norm": 3.2049516084300245, + "learning_rate": 1.9859205776173286e-05, + "loss": 0.69, + "step": 40 + }, + { + "epoch": 0.0812274368231047, + "grad_norm": 2.761288783636034, + "learning_rate": 1.984115523465704e-05, + "loss": 0.6867, + "step": 45 + }, + { + "epoch": 0.09025270758122744, + "grad_norm": 2.996941813764781, + "learning_rate": 1.9823104693140797e-05, + "loss": 0.6675, + "step": 50 + }, + { + "epoch": 0.09927797833935018, + "grad_norm": 2.7492945960977617, + "learning_rate": 1.980505415162455e-05, + "loss": 0.6852, + "step": 55 + }, + { + "epoch": 0.10830324909747292, + "grad_norm": 3.1658110710465124, + "learning_rate": 1.9787003610108305e-05, + "loss": 0.6447, + "step": 60 + }, + { + "epoch": 0.11732851985559567, + "grad_norm": 2.5627271539534777, + "learning_rate": 1.976895306859206e-05, + "loss": 0.6562, + "step": 65 + }, + { + "epoch": 0.1263537906137184, + "grad_norm": 2.752341860058262, + "learning_rate": 1.9750902527075816e-05, + "loss": 0.6771, + "step": 70 + }, + { + "epoch": 0.13537906137184116, + "grad_norm": 3.047967974028345, + "learning_rate": 1.973285198555957e-05, + "loss": 0.6656, + "step": 75 + }, + { + "epoch": 0.1444043321299639, + "grad_norm": 2.9706224809053916, + "learning_rate": 1.9714801444043323e-05, + "loss": 0.6635, + "step": 80 + }, + { + "epoch": 0.15342960288808663, + "grad_norm": 2.4472232682052253, + "learning_rate": 1.9696750902527077e-05, + "loss": 0.6474, + "step": 85 + }, + { + "epoch": 0.1624548736462094, + "grad_norm": 2.701915790395711, + "learning_rate": 1.967870036101083e-05, + "loss": 0.6641, + "step": 90 + }, + { + "epoch": 0.17148014440433212, + "grad_norm": 2.4548463645426946, + "learning_rate": 1.9660649819494585e-05, + "loss": 0.6509, + "step": 95 + }, + { + "epoch": 0.18050541516245489, + "grad_norm": 12.210856636716178, + "learning_rate": 1.964259927797834e-05, + "loss": 0.6646, + "step": 100 + }, + { + "epoch": 0.18953068592057762, + "grad_norm": 2.603197676839273, + "learning_rate": 1.9624548736462096e-05, + "loss": 0.6975, + "step": 105 + }, + { + "epoch": 0.19855595667870035, + "grad_norm": 2.414541801668063, + "learning_rate": 1.960649819494585e-05, + "loss": 0.6412, + "step": 110 + }, + { + "epoch": 0.2075812274368231, + "grad_norm": 2.5004150715560787, + "learning_rate": 1.9588447653429607e-05, + "loss": 0.6477, + "step": 115 + }, + { + "epoch": 0.21660649819494585, + "grad_norm": 2.65590491408136, + "learning_rate": 1.957039711191336e-05, + "loss": 0.6328, + "step": 120 + }, + { + "epoch": 0.22563176895306858, + "grad_norm": 2.8094738107146107, + "learning_rate": 1.9552346570397115e-05, + "loss": 0.643, + "step": 125 + }, + { + "epoch": 0.23465703971119134, + "grad_norm": 2.4628672221036787, + "learning_rate": 1.953429602888087e-05, + "loss": 0.6295, + "step": 130 + }, + { + "epoch": 0.24368231046931407, + "grad_norm": 2.6028074637701315, + "learning_rate": 1.9516245487364622e-05, + "loss": 0.6231, + "step": 135 + }, + { + "epoch": 0.2527075812274368, + "grad_norm": 2.6627654496834112, + "learning_rate": 1.9498194945848376e-05, + "loss": 0.6164, + "step": 140 + }, + { + "epoch": 0.26173285198555957, + "grad_norm": 2.428314569670895, + "learning_rate": 1.948014440433213e-05, + "loss": 0.6137, + "step": 145 + }, + { + "epoch": 0.27075812274368233, + "grad_norm": 2.145860067780341, + "learning_rate": 1.9462093862815884e-05, + "loss": 0.6162, + "step": 150 + }, + { + "epoch": 0.27978339350180503, + "grad_norm": 2.6330932206114865, + "learning_rate": 1.944404332129964e-05, + "loss": 0.6152, + "step": 155 + }, + { + "epoch": 0.2888086642599278, + "grad_norm": 1.9933015700312968, + "learning_rate": 1.9425992779783395e-05, + "loss": 0.6333, + "step": 160 + }, + { + "epoch": 0.29783393501805056, + "grad_norm": 2.532862933716995, + "learning_rate": 1.940794223826715e-05, + "loss": 0.6284, + "step": 165 + }, + { + "epoch": 0.30685920577617326, + "grad_norm": 2.1667192528112755, + "learning_rate": 1.9389891696750906e-05, + "loss": 0.6237, + "step": 170 + }, + { + "epoch": 0.315884476534296, + "grad_norm": 2.1701228962066295, + "learning_rate": 1.937184115523466e-05, + "loss": 0.6336, + "step": 175 + }, + { + "epoch": 0.3249097472924188, + "grad_norm": 2.273883822591775, + "learning_rate": 1.9353790613718413e-05, + "loss": 0.6209, + "step": 180 + }, + { + "epoch": 0.33393501805054154, + "grad_norm": 2.29978243530194, + "learning_rate": 1.9335740072202167e-05, + "loss": 0.6318, + "step": 185 + }, + { + "epoch": 0.34296028880866425, + "grad_norm": 2.411589937459258, + "learning_rate": 1.931768953068592e-05, + "loss": 0.6107, + "step": 190 + }, + { + "epoch": 0.351985559566787, + "grad_norm": 2.175980233783614, + "learning_rate": 1.9299638989169675e-05, + "loss": 0.6034, + "step": 195 + }, + { + "epoch": 0.36101083032490977, + "grad_norm": 2.3182975023815566, + "learning_rate": 1.9281588447653432e-05, + "loss": 0.6054, + "step": 200 + }, + { + "epoch": 0.3700361010830325, + "grad_norm": 2.301500912368193, + "learning_rate": 1.9263537906137186e-05, + "loss": 0.608, + "step": 205 + }, + { + "epoch": 0.37906137184115524, + "grad_norm": 2.3411097096245146, + "learning_rate": 1.924548736462094e-05, + "loss": 0.6096, + "step": 210 + }, + { + "epoch": 0.388086642599278, + "grad_norm": 2.3477579646460267, + "learning_rate": 1.9227436823104693e-05, + "loss": 0.5981, + "step": 215 + }, + { + "epoch": 0.3971119133574007, + "grad_norm": 2.574256614403349, + "learning_rate": 1.920938628158845e-05, + "loss": 0.6032, + "step": 220 + }, + { + "epoch": 0.40613718411552346, + "grad_norm": 2.30445852375371, + "learning_rate": 1.9191335740072204e-05, + "loss": 0.6263, + "step": 225 + }, + { + "epoch": 0.4151624548736462, + "grad_norm": 1.9463153842646943, + "learning_rate": 1.9173285198555958e-05, + "loss": 0.5888, + "step": 230 + }, + { + "epoch": 0.42418772563176893, + "grad_norm": 2.2957992259075692, + "learning_rate": 1.9155234657039712e-05, + "loss": 0.5919, + "step": 235 + }, + { + "epoch": 0.4332129963898917, + "grad_norm": 2.4850310058268397, + "learning_rate": 1.913718411552347e-05, + "loss": 0.5982, + "step": 240 + }, + { + "epoch": 0.44223826714801445, + "grad_norm": 2.1484807254693665, + "learning_rate": 1.9119133574007223e-05, + "loss": 0.5922, + "step": 245 + }, + { + "epoch": 0.45126353790613716, + "grad_norm": 2.117521935012365, + "learning_rate": 1.9101083032490977e-05, + "loss": 0.614, + "step": 250 + }, + { + "epoch": 0.4602888086642599, + "grad_norm": 2.129059383234727, + "learning_rate": 1.908303249097473e-05, + "loss": 0.6065, + "step": 255 + }, + { + "epoch": 0.4693140794223827, + "grad_norm": 2.1304018405195904, + "learning_rate": 1.9064981949458485e-05, + "loss": 0.5776, + "step": 260 + }, + { + "epoch": 0.47833935018050544, + "grad_norm": 2.2992759766146973, + "learning_rate": 1.904693140794224e-05, + "loss": 0.5832, + "step": 265 + }, + { + "epoch": 0.48736462093862815, + "grad_norm": 2.0969841001906704, + "learning_rate": 1.9028880866425992e-05, + "loss": 0.6178, + "step": 270 + }, + { + "epoch": 0.4963898916967509, + "grad_norm": 2.2377624378205834, + "learning_rate": 1.901083032490975e-05, + "loss": 0.5877, + "step": 275 + }, + { + "epoch": 0.5054151624548736, + "grad_norm": 2.2091730376930308, + "learning_rate": 1.8992779783393503e-05, + "loss": 0.6093, + "step": 280 + }, + { + "epoch": 0.5144404332129964, + "grad_norm": 2.0821252876149274, + "learning_rate": 1.897472924187726e-05, + "loss": 0.5971, + "step": 285 + }, + { + "epoch": 0.5234657039711191, + "grad_norm": 2.4846088029201923, + "learning_rate": 1.8956678700361014e-05, + "loss": 0.5894, + "step": 290 + }, + { + "epoch": 0.5324909747292419, + "grad_norm": 2.253250469898687, + "learning_rate": 1.8938628158844768e-05, + "loss": 0.6035, + "step": 295 + }, + { + "epoch": 0.5415162454873647, + "grad_norm": 2.1890406664703237, + "learning_rate": 1.8920577617328522e-05, + "loss": 0.5905, + "step": 300 + }, + { + "epoch": 0.5505415162454874, + "grad_norm": 1.830902528715668, + "learning_rate": 1.8902527075812276e-05, + "loss": 0.5666, + "step": 305 + }, + { + "epoch": 0.5595667870036101, + "grad_norm": 2.2712137897785216, + "learning_rate": 1.888447653429603e-05, + "loss": 0.5794, + "step": 310 + }, + { + "epoch": 0.5685920577617328, + "grad_norm": 2.064463681190312, + "learning_rate": 1.8866425992779783e-05, + "loss": 0.5813, + "step": 315 + }, + { + "epoch": 0.5776173285198556, + "grad_norm": 2.0670626484908046, + "learning_rate": 1.8848375451263537e-05, + "loss": 0.5989, + "step": 320 + }, + { + "epoch": 0.5866425992779783, + "grad_norm": 2.4382023992648563, + "learning_rate": 1.8830324909747294e-05, + "loss": 0.5844, + "step": 325 + }, + { + "epoch": 0.5956678700361011, + "grad_norm": 2.3559788464517935, + "learning_rate": 1.8812274368231048e-05, + "loss": 0.5751, + "step": 330 + }, + { + "epoch": 0.6046931407942239, + "grad_norm": 2.0417255214726655, + "learning_rate": 1.8794223826714802e-05, + "loss": 0.582, + "step": 335 + }, + { + "epoch": 0.6137184115523465, + "grad_norm": 2.0606597313764174, + "learning_rate": 1.877617328519856e-05, + "loss": 0.5563, + "step": 340 + }, + { + "epoch": 0.6227436823104693, + "grad_norm": 2.1279001628524714, + "learning_rate": 1.8758122743682313e-05, + "loss": 0.5646, + "step": 345 + }, + { + "epoch": 0.631768953068592, + "grad_norm": 2.1228151514711318, + "learning_rate": 1.8740072202166067e-05, + "loss": 0.5687, + "step": 350 + }, + { + "epoch": 0.6407942238267148, + "grad_norm": 2.1178045046697997, + "learning_rate": 1.872202166064982e-05, + "loss": 0.5808, + "step": 355 + }, + { + "epoch": 0.6498194945848376, + "grad_norm": 2.110658120249641, + "learning_rate": 1.8703971119133574e-05, + "loss": 0.5576, + "step": 360 + }, + { + "epoch": 0.6588447653429603, + "grad_norm": 2.0620796847472835, + "learning_rate": 1.8685920577617328e-05, + "loss": 0.5817, + "step": 365 + }, + { + "epoch": 0.6678700361010831, + "grad_norm": 2.2036387339163217, + "learning_rate": 1.8667870036101086e-05, + "loss": 0.583, + "step": 370 + }, + { + "epoch": 0.6768953068592057, + "grad_norm": 2.169248099645488, + "learning_rate": 1.864981949458484e-05, + "loss": 0.5711, + "step": 375 + }, + { + "epoch": 0.6859205776173285, + "grad_norm": 2.0070917491256237, + "learning_rate": 1.8631768953068593e-05, + "loss": 0.5714, + "step": 380 + }, + { + "epoch": 0.6949458483754513, + "grad_norm": 2.412092286853898, + "learning_rate": 1.8613718411552347e-05, + "loss": 0.558, + "step": 385 + }, + { + "epoch": 0.703971119133574, + "grad_norm": 2.1573044881131636, + "learning_rate": 1.8595667870036104e-05, + "loss": 0.5605, + "step": 390 + }, + { + "epoch": 0.7129963898916968, + "grad_norm": 2.0060925615864234, + "learning_rate": 1.8577617328519858e-05, + "loss": 0.5646, + "step": 395 + }, + { + "epoch": 0.7220216606498195, + "grad_norm": 2.0721270781349856, + "learning_rate": 1.8559566787003612e-05, + "loss": 0.5591, + "step": 400 + }, + { + "epoch": 0.7310469314079422, + "grad_norm": 1.9740854158027745, + "learning_rate": 1.8541516245487366e-05, + "loss": 0.5749, + "step": 405 + }, + { + "epoch": 0.740072202166065, + "grad_norm": 2.1666939121131343, + "learning_rate": 1.852346570397112e-05, + "loss": 0.5559, + "step": 410 + }, + { + "epoch": 0.7490974729241877, + "grad_norm": 2.006153861447105, + "learning_rate": 1.8505415162454877e-05, + "loss": 0.5579, + "step": 415 + }, + { + "epoch": 0.7581227436823105, + "grad_norm": 2.1562521533225896, + "learning_rate": 1.848736462093863e-05, + "loss": 0.5641, + "step": 420 + }, + { + "epoch": 0.7671480144404332, + "grad_norm": 2.4213982076348386, + "learning_rate": 1.8469314079422384e-05, + "loss": 0.5697, + "step": 425 + }, + { + "epoch": 0.776173285198556, + "grad_norm": 2.021492521441799, + "learning_rate": 1.8451263537906138e-05, + "loss": 0.5512, + "step": 430 + }, + { + "epoch": 0.7851985559566786, + "grad_norm": 2.2387067716748903, + "learning_rate": 1.8433212996389892e-05, + "loss": 0.5567, + "step": 435 + }, + { + "epoch": 0.7942238267148014, + "grad_norm": 2.1182127501600654, + "learning_rate": 1.8415162454873646e-05, + "loss": 0.5529, + "step": 440 + }, + { + "epoch": 0.8032490974729242, + "grad_norm": 1.8809686502549097, + "learning_rate": 1.8397111913357403e-05, + "loss": 0.549, + "step": 445 + }, + { + "epoch": 0.8122743682310469, + "grad_norm": 1.9559681017618193, + "learning_rate": 1.8379061371841157e-05, + "loss": 0.546, + "step": 450 + }, + { + "epoch": 0.8212996389891697, + "grad_norm": 1.8843502716508203, + "learning_rate": 1.836101083032491e-05, + "loss": 0.5281, + "step": 455 + }, + { + "epoch": 0.8303249097472925, + "grad_norm": 2.0782130238317365, + "learning_rate": 1.8342960288808668e-05, + "loss": 0.5355, + "step": 460 + }, + { + "epoch": 0.8393501805054152, + "grad_norm": 2.028491033091686, + "learning_rate": 1.832490974729242e-05, + "loss": 0.5573, + "step": 465 + }, + { + "epoch": 0.8483754512635379, + "grad_norm": 1.997600040747785, + "learning_rate": 1.8306859205776175e-05, + "loss": 0.5503, + "step": 470 + }, + { + "epoch": 0.8574007220216606, + "grad_norm": 2.086622795789731, + "learning_rate": 1.828880866425993e-05, + "loss": 0.5271, + "step": 475 + }, + { + "epoch": 0.8664259927797834, + "grad_norm": 2.2126356409155603, + "learning_rate": 1.8270758122743683e-05, + "loss": 0.5295, + "step": 480 + }, + { + "epoch": 0.8754512635379061, + "grad_norm": 1.9420226465771289, + "learning_rate": 1.8252707581227437e-05, + "loss": 0.5229, + "step": 485 + }, + { + "epoch": 0.8844765342960289, + "grad_norm": 2.3845783258984605, + "learning_rate": 1.823465703971119e-05, + "loss": 0.5302, + "step": 490 + }, + { + "epoch": 0.8935018050541517, + "grad_norm": 1.990644749395453, + "learning_rate": 1.8216606498194948e-05, + "loss": 0.5327, + "step": 495 + }, + { + "epoch": 0.9025270758122743, + "grad_norm": 1.9296515724126473, + "learning_rate": 1.81985559566787e-05, + "loss": 0.5368, + "step": 500 + }, + { + "epoch": 0.9115523465703971, + "grad_norm": 2.08354639136672, + "learning_rate": 1.8180505415162456e-05, + "loss": 0.537, + "step": 505 + }, + { + "epoch": 0.9205776173285198, + "grad_norm": 1.8810121127412915, + "learning_rate": 1.8162454873646213e-05, + "loss": 0.5233, + "step": 510 + }, + { + "epoch": 0.9296028880866426, + "grad_norm": 2.1865339910493, + "learning_rate": 1.8144404332129967e-05, + "loss": 0.5229, + "step": 515 + }, + { + "epoch": 0.9386281588447654, + "grad_norm": 1.8530594783548222, + "learning_rate": 1.812635379061372e-05, + "loss": 0.5489, + "step": 520 + }, + { + "epoch": 0.9476534296028881, + "grad_norm": 1.8045811881947107, + "learning_rate": 1.8108303249097474e-05, + "loss": 0.5216, + "step": 525 + }, + { + "epoch": 0.9566787003610109, + "grad_norm": 2.0216105196649465, + "learning_rate": 1.8090252707581228e-05, + "loss": 0.5327, + "step": 530 + }, + { + "epoch": 0.9657039711191335, + "grad_norm": 2.070828240474061, + "learning_rate": 1.8072202166064982e-05, + "loss": 0.5341, + "step": 535 + }, + { + "epoch": 0.9747292418772563, + "grad_norm": 2.118337434473814, + "learning_rate": 1.8054151624548736e-05, + "loss": 0.5206, + "step": 540 + }, + { + "epoch": 0.983754512635379, + "grad_norm": 1.9327007846480317, + "learning_rate": 1.8036101083032493e-05, + "loss": 0.5223, + "step": 545 + }, + { + "epoch": 0.9927797833935018, + "grad_norm": 1.8102667165760828, + "learning_rate": 1.8018050541516247e-05, + "loss": 0.5183, + "step": 550 + }, + { + "epoch": 1.0018050541516246, + "grad_norm": 1.7565629809184389, + "learning_rate": 1.8e-05, + "loss": 0.5008, + "step": 555 + }, + { + "epoch": 1.0108303249097472, + "grad_norm": 1.7812661895325377, + "learning_rate": 1.7981949458483758e-05, + "loss": 0.403, + "step": 560 + }, + { + "epoch": 1.01985559566787, + "grad_norm": 1.9167006814291294, + "learning_rate": 1.796389891696751e-05, + "loss": 0.412, + "step": 565 + }, + { + "epoch": 1.0288808664259927, + "grad_norm": 1.9923575509001286, + "learning_rate": 1.7945848375451265e-05, + "loss": 0.3926, + "step": 570 + }, + { + "epoch": 1.0379061371841156, + "grad_norm": 1.7192659741327438, + "learning_rate": 1.792779783393502e-05, + "loss": 0.4072, + "step": 575 + }, + { + "epoch": 1.0469314079422383, + "grad_norm": 1.8513603883344656, + "learning_rate": 1.7909747292418773e-05, + "loss": 0.4031, + "step": 580 + }, + { + "epoch": 1.055956678700361, + "grad_norm": 1.828939711963778, + "learning_rate": 1.7891696750902527e-05, + "loss": 0.3984, + "step": 585 + }, + { + "epoch": 1.0649819494584838, + "grad_norm": 1.789210627896697, + "learning_rate": 1.7873646209386284e-05, + "loss": 0.4054, + "step": 590 + }, + { + "epoch": 1.0740072202166064, + "grad_norm": 1.8182359140012072, + "learning_rate": 1.7855595667870038e-05, + "loss": 0.4016, + "step": 595 + }, + { + "epoch": 1.0830324909747293, + "grad_norm": 1.8738010102389915, + "learning_rate": 1.783754512635379e-05, + "loss": 0.3916, + "step": 600 + }, + { + "epoch": 1.092057761732852, + "grad_norm": 1.7056071281982275, + "learning_rate": 1.7819494584837545e-05, + "loss": 0.4005, + "step": 605 + }, + { + "epoch": 1.1010830324909748, + "grad_norm": 1.7631642754877512, + "learning_rate": 1.7801444043321303e-05, + "loss": 0.4044, + "step": 610 + }, + { + "epoch": 1.1101083032490975, + "grad_norm": 1.9174467447157293, + "learning_rate": 1.7783393501805056e-05, + "loss": 0.3994, + "step": 615 + }, + { + "epoch": 1.1191335740072201, + "grad_norm": 1.9504105068703066, + "learning_rate": 1.776534296028881e-05, + "loss": 0.413, + "step": 620 + }, + { + "epoch": 1.128158844765343, + "grad_norm": 1.7274668622868, + "learning_rate": 1.7747292418772564e-05, + "loss": 0.4035, + "step": 625 + }, + { + "epoch": 1.1371841155234657, + "grad_norm": 1.8575861518758143, + "learning_rate": 1.772924187725632e-05, + "loss": 0.4098, + "step": 630 + }, + { + "epoch": 1.1462093862815885, + "grad_norm": 1.834605129701837, + "learning_rate": 1.7711191335740075e-05, + "loss": 0.4107, + "step": 635 + }, + { + "epoch": 1.1552346570397112, + "grad_norm": 1.6992491657802056, + "learning_rate": 1.769314079422383e-05, + "loss": 0.4104, + "step": 640 + }, + { + "epoch": 1.164259927797834, + "grad_norm": 1.6426546320530329, + "learning_rate": 1.7675090252707583e-05, + "loss": 0.4078, + "step": 645 + }, + { + "epoch": 1.1732851985559567, + "grad_norm": 1.8449792200452055, + "learning_rate": 1.7657039711191337e-05, + "loss": 0.4138, + "step": 650 + }, + { + "epoch": 1.1823104693140793, + "grad_norm": 1.7495204322144347, + "learning_rate": 1.763898916967509e-05, + "loss": 0.4084, + "step": 655 + }, + { + "epoch": 1.1913357400722022, + "grad_norm": 1.6260627217547752, + "learning_rate": 1.7620938628158844e-05, + "loss": 0.4197, + "step": 660 + }, + { + "epoch": 1.2003610108303249, + "grad_norm": 1.803828249935731, + "learning_rate": 1.76028880866426e-05, + "loss": 0.4157, + "step": 665 + }, + { + "epoch": 1.2093862815884477, + "grad_norm": 1.6698584933682465, + "learning_rate": 1.7584837545126355e-05, + "loss": 0.4108, + "step": 670 + }, + { + "epoch": 1.2184115523465704, + "grad_norm": 1.8547896347467827, + "learning_rate": 1.756678700361011e-05, + "loss": 0.4049, + "step": 675 + }, + { + "epoch": 1.2274368231046933, + "grad_norm": 1.9793660391728152, + "learning_rate": 1.7548736462093866e-05, + "loss": 0.4086, + "step": 680 + }, + { + "epoch": 1.236462093862816, + "grad_norm": 1.9481106812224744, + "learning_rate": 1.753068592057762e-05, + "loss": 0.4217, + "step": 685 + }, + { + "epoch": 1.2454873646209386, + "grad_norm": 1.8589145184235236, + "learning_rate": 1.7512635379061374e-05, + "loss": 0.4077, + "step": 690 + }, + { + "epoch": 1.2545126353790614, + "grad_norm": 1.7491717689464192, + "learning_rate": 1.7494584837545128e-05, + "loss": 0.4041, + "step": 695 + }, + { + "epoch": 1.263537906137184, + "grad_norm": 1.8301618024998683, + "learning_rate": 1.747653429602888e-05, + "loss": 0.4156, + "step": 700 + }, + { + "epoch": 1.2725631768953067, + "grad_norm": 1.8157533205183023, + "learning_rate": 1.7458483754512635e-05, + "loss": 0.4066, + "step": 705 + }, + { + "epoch": 1.2815884476534296, + "grad_norm": 1.618452105653501, + "learning_rate": 1.744043321299639e-05, + "loss": 0.4045, + "step": 710 + }, + { + "epoch": 1.2906137184115525, + "grad_norm": 1.801416162193692, + "learning_rate": 1.7422382671480146e-05, + "loss": 0.4114, + "step": 715 + }, + { + "epoch": 1.2996389891696751, + "grad_norm": 1.9120529810373577, + "learning_rate": 1.74043321299639e-05, + "loss": 0.4053, + "step": 720 + }, + { + "epoch": 1.3086642599277978, + "grad_norm": 1.8649971513309, + "learning_rate": 1.7386281588447654e-05, + "loss": 0.3994, + "step": 725 + }, + { + "epoch": 1.3176895306859207, + "grad_norm": 1.5823876746717431, + "learning_rate": 1.736823104693141e-05, + "loss": 0.405, + "step": 730 + }, + { + "epoch": 1.3267148014440433, + "grad_norm": 1.6565144465938595, + "learning_rate": 1.7350180505415165e-05, + "loss": 0.3956, + "step": 735 + }, + { + "epoch": 1.335740072202166, + "grad_norm": 1.7349944474589856, + "learning_rate": 1.733212996389892e-05, + "loss": 0.4004, + "step": 740 + }, + { + "epoch": 1.3447653429602888, + "grad_norm": 1.8413333458623544, + "learning_rate": 1.7314079422382673e-05, + "loss": 0.3933, + "step": 745 + }, + { + "epoch": 1.3537906137184115, + "grad_norm": 1.5956337270311123, + "learning_rate": 1.7296028880866426e-05, + "loss": 0.4027, + "step": 750 + }, + { + "epoch": 1.3628158844765343, + "grad_norm": 1.7395021140734128, + "learning_rate": 1.727797833935018e-05, + "loss": 0.4071, + "step": 755 + }, + { + "epoch": 1.371841155234657, + "grad_norm": 1.7983719467870525, + "learning_rate": 1.7259927797833937e-05, + "loss": 0.404, + "step": 760 + }, + { + "epoch": 1.3808664259927799, + "grad_norm": 1.7263325979359656, + "learning_rate": 1.724187725631769e-05, + "loss": 0.3933, + "step": 765 + }, + { + "epoch": 1.3898916967509025, + "grad_norm": 1.7378004534277407, + "learning_rate": 1.7223826714801445e-05, + "loss": 0.3839, + "step": 770 + }, + { + "epoch": 1.3989169675090252, + "grad_norm": 1.8567819478825005, + "learning_rate": 1.72057761732852e-05, + "loss": 0.4032, + "step": 775 + }, + { + "epoch": 1.407942238267148, + "grad_norm": 1.752933416288969, + "learning_rate": 1.7187725631768956e-05, + "loss": 0.3914, + "step": 780 + }, + { + "epoch": 1.4169675090252707, + "grad_norm": 1.7945976502850631, + "learning_rate": 1.716967509025271e-05, + "loss": 0.4052, + "step": 785 + }, + { + "epoch": 1.4259927797833936, + "grad_norm": 1.8440958122894435, + "learning_rate": 1.7151624548736464e-05, + "loss": 0.3889, + "step": 790 + }, + { + "epoch": 1.4350180505415162, + "grad_norm": 1.9541732635364462, + "learning_rate": 1.7133574007220218e-05, + "loss": 0.3897, + "step": 795 + }, + { + "epoch": 1.444043321299639, + "grad_norm": 1.7128039074068433, + "learning_rate": 1.711552346570397e-05, + "loss": 0.3949, + "step": 800 + }, + { + "epoch": 1.4530685920577617, + "grad_norm": 1.7805663890604293, + "learning_rate": 1.709747292418773e-05, + "loss": 0.4046, + "step": 805 + }, + { + "epoch": 1.4620938628158844, + "grad_norm": 1.7176479703166927, + "learning_rate": 1.7079422382671482e-05, + "loss": 0.3948, + "step": 810 + }, + { + "epoch": 1.4711191335740073, + "grad_norm": 1.7476594687945506, + "learning_rate": 1.7061371841155236e-05, + "loss": 0.3987, + "step": 815 + }, + { + "epoch": 1.48014440433213, + "grad_norm": 1.7097737410315963, + "learning_rate": 1.704332129963899e-05, + "loss": 0.3971, + "step": 820 + }, + { + "epoch": 1.4891696750902528, + "grad_norm": 1.6154261828372305, + "learning_rate": 1.7025270758122744e-05, + "loss": 0.3965, + "step": 825 + }, + { + "epoch": 1.4981949458483754, + "grad_norm": 1.6439021077041545, + "learning_rate": 1.7007220216606498e-05, + "loss": 0.3864, + "step": 830 + }, + { + "epoch": 1.5072202166064983, + "grad_norm": 1.7335388992906235, + "learning_rate": 1.6989169675090255e-05, + "loss": 0.3904, + "step": 835 + }, + { + "epoch": 1.516245487364621, + "grad_norm": 1.9280976394130103, + "learning_rate": 1.697111913357401e-05, + "loss": 0.3982, + "step": 840 + }, + { + "epoch": 1.5252707581227436, + "grad_norm": 1.6397519533163014, + "learning_rate": 1.6953068592057766e-05, + "loss": 0.4071, + "step": 845 + }, + { + "epoch": 1.5342960288808665, + "grad_norm": 1.7403299732030928, + "learning_rate": 1.693501805054152e-05, + "loss": 0.396, + "step": 850 + }, + { + "epoch": 1.5433212996389891, + "grad_norm": 1.856917104233432, + "learning_rate": 1.6916967509025274e-05, + "loss": 0.3965, + "step": 855 + }, + { + "epoch": 1.5523465703971118, + "grad_norm": 2.0561648148768246, + "learning_rate": 1.6898916967509027e-05, + "loss": 0.4011, + "step": 860 + }, + { + "epoch": 1.5613718411552346, + "grad_norm": 1.5934247442993748, + "learning_rate": 1.688086642599278e-05, + "loss": 0.4047, + "step": 865 + }, + { + "epoch": 1.5703971119133575, + "grad_norm": 1.6742097942966434, + "learning_rate": 1.6862815884476535e-05, + "loss": 0.3768, + "step": 870 + }, + { + "epoch": 1.5794223826714802, + "grad_norm": 1.6344395357039199, + "learning_rate": 1.684476534296029e-05, + "loss": 0.3994, + "step": 875 + }, + { + "epoch": 1.5884476534296028, + "grad_norm": 1.947535712414049, + "learning_rate": 1.6826714801444043e-05, + "loss": 0.3859, + "step": 880 + }, + { + "epoch": 1.5974729241877257, + "grad_norm": 1.8081716367634284, + "learning_rate": 1.68086642599278e-05, + "loss": 0.3971, + "step": 885 + }, + { + "epoch": 1.6064981949458483, + "grad_norm": 1.7555061804990388, + "learning_rate": 1.6790613718411554e-05, + "loss": 0.4033, + "step": 890 + }, + { + "epoch": 1.615523465703971, + "grad_norm": 1.7977625864566746, + "learning_rate": 1.6772563176895307e-05, + "loss": 0.3869, + "step": 895 + }, + { + "epoch": 1.6245487364620939, + "grad_norm": 2.0151295696094156, + "learning_rate": 1.6754512635379065e-05, + "loss": 0.3896, + "step": 900 + }, + { + "epoch": 1.6335740072202167, + "grad_norm": 1.7117350496546804, + "learning_rate": 1.673646209386282e-05, + "loss": 0.3931, + "step": 905 + }, + { + "epoch": 1.6425992779783394, + "grad_norm": 1.5367121125433267, + "learning_rate": 1.6718411552346572e-05, + "loss": 0.3925, + "step": 910 + }, + { + "epoch": 1.651624548736462, + "grad_norm": 1.7252709384729956, + "learning_rate": 1.6700361010830326e-05, + "loss": 0.3813, + "step": 915 + }, + { + "epoch": 1.660649819494585, + "grad_norm": 1.6424146369682562, + "learning_rate": 1.668231046931408e-05, + "loss": 0.3959, + "step": 920 + }, + { + "epoch": 1.6696750902527075, + "grad_norm": 1.7074460801427431, + "learning_rate": 1.6664259927797834e-05, + "loss": 0.3821, + "step": 925 + }, + { + "epoch": 1.6787003610108302, + "grad_norm": 1.7779369602208115, + "learning_rate": 1.6646209386281588e-05, + "loss": 0.387, + "step": 930 + }, + { + "epoch": 1.687725631768953, + "grad_norm": 1.7587322948042428, + "learning_rate": 1.6628158844765345e-05, + "loss": 0.3824, + "step": 935 + }, + { + "epoch": 1.696750902527076, + "grad_norm": 1.719570186327862, + "learning_rate": 1.66101083032491e-05, + "loss": 0.3968, + "step": 940 + }, + { + "epoch": 1.7057761732851986, + "grad_norm": 1.758936196148823, + "learning_rate": 1.6592057761732852e-05, + "loss": 0.4, + "step": 945 + }, + { + "epoch": 1.7148014440433212, + "grad_norm": 1.7037689600288302, + "learning_rate": 1.657400722021661e-05, + "loss": 0.3824, + "step": 950 + }, + { + "epoch": 1.7238267148014441, + "grad_norm": 1.6704621391960432, + "learning_rate": 1.6555956678700363e-05, + "loss": 0.3904, + "step": 955 + }, + { + "epoch": 1.7328519855595668, + "grad_norm": 1.6276171413365443, + "learning_rate": 1.6537906137184117e-05, + "loss": 0.3936, + "step": 960 + }, + { + "epoch": 1.7418772563176894, + "grad_norm": 1.4915457367830292, + "learning_rate": 1.651985559566787e-05, + "loss": 0.3878, + "step": 965 + }, + { + "epoch": 1.7509025270758123, + "grad_norm": 1.6998121121079899, + "learning_rate": 1.6501805054151625e-05, + "loss": 0.3851, + "step": 970 + }, + { + "epoch": 1.7599277978339352, + "grad_norm": 1.7868333554788238, + "learning_rate": 1.6483754512635382e-05, + "loss": 0.4016, + "step": 975 + }, + { + "epoch": 1.7689530685920578, + "grad_norm": 1.9509112307215477, + "learning_rate": 1.6465703971119136e-05, + "loss": 0.3947, + "step": 980 + }, + { + "epoch": 1.7779783393501805, + "grad_norm": 1.8313430184945898, + "learning_rate": 1.644765342960289e-05, + "loss": 0.3883, + "step": 985 + }, + { + "epoch": 1.7870036101083033, + "grad_norm": 1.69745773450961, + "learning_rate": 1.6429602888086644e-05, + "loss": 0.3927, + "step": 990 + }, + { + "epoch": 1.796028880866426, + "grad_norm": 1.5706439056988484, + "learning_rate": 1.6411552346570397e-05, + "loss": 0.3885, + "step": 995 + }, + { + "epoch": 1.8050541516245486, + "grad_norm": 1.9749233060644407, + "learning_rate": 1.639350180505415e-05, + "loss": 0.3911, + "step": 1000 + }, + { + "epoch": 1.8140794223826715, + "grad_norm": 1.835030597426223, + "learning_rate": 1.637545126353791e-05, + "loss": 0.3942, + "step": 1005 + }, + { + "epoch": 1.8231046931407944, + "grad_norm": 1.5738451037571743, + "learning_rate": 1.6357400722021662e-05, + "loss": 0.3812, + "step": 1010 + }, + { + "epoch": 1.8321299638989168, + "grad_norm": 1.751135780497672, + "learning_rate": 1.6339350180505416e-05, + "loss": 0.3862, + "step": 1015 + }, + { + "epoch": 1.8411552346570397, + "grad_norm": 1.864136563331331, + "learning_rate": 1.6321299638989173e-05, + "loss": 0.3942, + "step": 1020 + }, + { + "epoch": 1.8501805054151625, + "grad_norm": 1.5420248188805685, + "learning_rate": 1.6303249097472927e-05, + "loss": 0.3828, + "step": 1025 + }, + { + "epoch": 1.8592057761732852, + "grad_norm": 1.6307207463776452, + "learning_rate": 1.628519855595668e-05, + "loss": 0.3766, + "step": 1030 + }, + { + "epoch": 1.8682310469314078, + "grad_norm": 1.7022785037029124, + "learning_rate": 1.6267148014440435e-05, + "loss": 0.3847, + "step": 1035 + }, + { + "epoch": 1.8772563176895307, + "grad_norm": 1.6865169590583908, + "learning_rate": 1.624909747292419e-05, + "loss": 0.3855, + "step": 1040 + }, + { + "epoch": 1.8862815884476536, + "grad_norm": 1.772735245585654, + "learning_rate": 1.6231046931407942e-05, + "loss": 0.3766, + "step": 1045 + }, + { + "epoch": 1.895306859205776, + "grad_norm": 1.6414783417710321, + "learning_rate": 1.6212996389891696e-05, + "loss": 0.3769, + "step": 1050 + }, + { + "epoch": 1.904332129963899, + "grad_norm": 1.638546064732281, + "learning_rate": 1.6194945848375453e-05, + "loss": 0.3811, + "step": 1055 + }, + { + "epoch": 1.9133574007220218, + "grad_norm": 1.7273768032341619, + "learning_rate": 1.6176895306859207e-05, + "loss": 0.3787, + "step": 1060 + }, + { + "epoch": 1.9223826714801444, + "grad_norm": 1.7610887591542017, + "learning_rate": 1.615884476534296e-05, + "loss": 0.3901, + "step": 1065 + }, + { + "epoch": 1.931407942238267, + "grad_norm": 1.7492183570516289, + "learning_rate": 1.6140794223826718e-05, + "loss": 0.385, + "step": 1070 + }, + { + "epoch": 1.94043321299639, + "grad_norm": 1.6697391799649597, + "learning_rate": 1.6122743682310472e-05, + "loss": 0.3865, + "step": 1075 + }, + { + "epoch": 1.9494584837545126, + "grad_norm": 1.5675976217251384, + "learning_rate": 1.6104693140794226e-05, + "loss": 0.3757, + "step": 1080 + }, + { + "epoch": 1.9584837545126352, + "grad_norm": 1.6558779934861987, + "learning_rate": 1.608664259927798e-05, + "loss": 0.379, + "step": 1085 + }, + { + "epoch": 1.967509025270758, + "grad_norm": 1.6873028676221205, + "learning_rate": 1.6068592057761733e-05, + "loss": 0.3811, + "step": 1090 + }, + { + "epoch": 1.976534296028881, + "grad_norm": 1.6240908295441967, + "learning_rate": 1.6050541516245487e-05, + "loss": 0.3654, + "step": 1095 + }, + { + "epoch": 1.9855595667870036, + "grad_norm": 1.6615813580574832, + "learning_rate": 1.603249097472924e-05, + "loss": 0.3796, + "step": 1100 + }, + { + "epoch": 1.9945848375451263, + "grad_norm": 1.686361226213432, + "learning_rate": 1.6014440433212998e-05, + "loss": 0.3837, + "step": 1105 + }, + { + "epoch": 2.003610108303249, + "grad_norm": 1.4089105927337982, + "learning_rate": 1.5996389891696752e-05, + "loss": 0.3506, + "step": 1110 + }, + { + "epoch": 2.012635379061372, + "grad_norm": 1.375915950771055, + "learning_rate": 1.5978339350180506e-05, + "loss": 0.2759, + "step": 1115 + }, + { + "epoch": 2.0216606498194944, + "grad_norm": 1.5521739291638614, + "learning_rate": 1.5960288808664263e-05, + "loss": 0.2756, + "step": 1120 + }, + { + "epoch": 2.0306859205776173, + "grad_norm": 1.6072972838350046, + "learning_rate": 1.5942238267148017e-05, + "loss": 0.2673, + "step": 1125 + }, + { + "epoch": 2.03971119133574, + "grad_norm": 1.6652650820556896, + "learning_rate": 1.592418772563177e-05, + "loss": 0.27, + "step": 1130 + }, + { + "epoch": 2.0487364620938626, + "grad_norm": 1.4210627524187658, + "learning_rate": 1.5906137184115525e-05, + "loss": 0.2706, + "step": 1135 + }, + { + "epoch": 2.0577617328519855, + "grad_norm": 1.6471286272058623, + "learning_rate": 1.588808664259928e-05, + "loss": 0.2762, + "step": 1140 + }, + { + "epoch": 2.0667870036101084, + "grad_norm": 1.467229273738129, + "learning_rate": 1.5870036101083032e-05, + "loss": 0.2828, + "step": 1145 + }, + { + "epoch": 2.0758122743682312, + "grad_norm": 1.4064489033689571, + "learning_rate": 1.585198555956679e-05, + "loss": 0.2751, + "step": 1150 + }, + { + "epoch": 2.0848375451263537, + "grad_norm": 1.4888952106078595, + "learning_rate": 1.5833935018050543e-05, + "loss": 0.2785, + "step": 1155 + }, + { + "epoch": 2.0938628158844765, + "grad_norm": 1.5202276448854608, + "learning_rate": 1.5815884476534297e-05, + "loss": 0.2728, + "step": 1160 + }, + { + "epoch": 2.1028880866425994, + "grad_norm": 1.4911272606877095, + "learning_rate": 1.579783393501805e-05, + "loss": 0.2735, + "step": 1165 + }, + { + "epoch": 2.111913357400722, + "grad_norm": 1.5345612677088643, + "learning_rate": 1.5779783393501805e-05, + "loss": 0.2751, + "step": 1170 + }, + { + "epoch": 2.1209386281588447, + "grad_norm": 1.470603227078021, + "learning_rate": 1.5761732851985562e-05, + "loss": 0.2799, + "step": 1175 + }, + { + "epoch": 2.1299638989169676, + "grad_norm": 1.6228746622020027, + "learning_rate": 1.5743682310469316e-05, + "loss": 0.2751, + "step": 1180 + }, + { + "epoch": 2.1389891696750905, + "grad_norm": 1.5092478918219132, + "learning_rate": 1.572563176895307e-05, + "loss": 0.2837, + "step": 1185 + }, + { + "epoch": 2.148014440433213, + "grad_norm": 1.5022801045910057, + "learning_rate": 1.5707581227436823e-05, + "loss": 0.2806, + "step": 1190 + }, + { + "epoch": 2.1570397111913358, + "grad_norm": 1.5037599452460795, + "learning_rate": 1.568953068592058e-05, + "loss": 0.2767, + "step": 1195 + }, + { + "epoch": 2.1660649819494586, + "grad_norm": 1.579515606540816, + "learning_rate": 1.5671480144404334e-05, + "loss": 0.2849, + "step": 1200 + }, + { + "epoch": 2.175090252707581, + "grad_norm": 1.4754220940034453, + "learning_rate": 1.5653429602888088e-05, + "loss": 0.2791, + "step": 1205 + }, + { + "epoch": 2.184115523465704, + "grad_norm": 1.3959522376935156, + "learning_rate": 1.5635379061371842e-05, + "loss": 0.2881, + "step": 1210 + }, + { + "epoch": 2.193140794223827, + "grad_norm": 1.5444043202318694, + "learning_rate": 1.5617328519855596e-05, + "loss": 0.2786, + "step": 1215 + }, + { + "epoch": 2.2021660649819497, + "grad_norm": 1.4479480310561215, + "learning_rate": 1.559927797833935e-05, + "loss": 0.2797, + "step": 1220 + }, + { + "epoch": 2.211191335740072, + "grad_norm": 1.3937802391741012, + "learning_rate": 1.5581227436823107e-05, + "loss": 0.2799, + "step": 1225 + }, + { + "epoch": 2.220216606498195, + "grad_norm": 1.4393031837130634, + "learning_rate": 1.556317689530686e-05, + "loss": 0.2823, + "step": 1230 + }, + { + "epoch": 2.229241877256318, + "grad_norm": 1.6426238576078906, + "learning_rate": 1.5545126353790614e-05, + "loss": 0.2784, + "step": 1235 + }, + { + "epoch": 2.2382671480144403, + "grad_norm": 1.6072066180424502, + "learning_rate": 1.552707581227437e-05, + "loss": 0.2872, + "step": 1240 + }, + { + "epoch": 2.247292418772563, + "grad_norm": 1.3689820360945464, + "learning_rate": 1.5509025270758125e-05, + "loss": 0.2812, + "step": 1245 + }, + { + "epoch": 2.256317689530686, + "grad_norm": 1.363835208538375, + "learning_rate": 1.549097472924188e-05, + "loss": 0.283, + "step": 1250 + }, + { + "epoch": 2.265342960288809, + "grad_norm": 1.6184406483983944, + "learning_rate": 1.5472924187725633e-05, + "loss": 0.2819, + "step": 1255 + }, + { + "epoch": 2.2743682310469313, + "grad_norm": 1.452212492826692, + "learning_rate": 1.5454873646209387e-05, + "loss": 0.2886, + "step": 1260 + }, + { + "epoch": 2.283393501805054, + "grad_norm": 1.3645649562334174, + "learning_rate": 1.543682310469314e-05, + "loss": 0.2817, + "step": 1265 + }, + { + "epoch": 2.292418772563177, + "grad_norm": 1.503443086552838, + "learning_rate": 1.5418772563176895e-05, + "loss": 0.2816, + "step": 1270 + }, + { + "epoch": 2.3014440433212995, + "grad_norm": 1.4467084497351497, + "learning_rate": 1.5400722021660652e-05, + "loss": 0.2857, + "step": 1275 + }, + { + "epoch": 2.3104693140794224, + "grad_norm": 1.3905500897215815, + "learning_rate": 1.5382671480144406e-05, + "loss": 0.2858, + "step": 1280 + }, + { + "epoch": 2.3194945848375452, + "grad_norm": 1.5392545695875637, + "learning_rate": 1.536462093862816e-05, + "loss": 0.2828, + "step": 1285 + }, + { + "epoch": 2.328519855595668, + "grad_norm": 1.5670788162913514, + "learning_rate": 1.5346570397111917e-05, + "loss": 0.2819, + "step": 1290 + }, + { + "epoch": 2.3375451263537905, + "grad_norm": 1.4159881393523641, + "learning_rate": 1.532851985559567e-05, + "loss": 0.2824, + "step": 1295 + }, + { + "epoch": 2.3465703971119134, + "grad_norm": 1.4831615682921662, + "learning_rate": 1.5310469314079424e-05, + "loss": 0.2865, + "step": 1300 + }, + { + "epoch": 2.3555956678700363, + "grad_norm": 1.4219138664036257, + "learning_rate": 1.5292418772563178e-05, + "loss": 0.2868, + "step": 1305 + }, + { + "epoch": 2.3646209386281587, + "grad_norm": 1.512943638670528, + "learning_rate": 1.5274368231046932e-05, + "loss": 0.2808, + "step": 1310 + }, + { + "epoch": 2.3736462093862816, + "grad_norm": 1.5368411160330724, + "learning_rate": 1.5256317689530686e-05, + "loss": 0.2818, + "step": 1315 + }, + { + "epoch": 2.3826714801444044, + "grad_norm": 1.4589365443912974, + "learning_rate": 1.5238267148014441e-05, + "loss": 0.2858, + "step": 1320 + }, + { + "epoch": 2.3916967509025273, + "grad_norm": 4.062153022713927, + "learning_rate": 1.5220216606498197e-05, + "loss": 0.2859, + "step": 1325 + }, + { + "epoch": 2.4007220216606497, + "grad_norm": 1.5707167014303813, + "learning_rate": 1.520216606498195e-05, + "loss": 0.2848, + "step": 1330 + }, + { + "epoch": 2.4097472924187726, + "grad_norm": 1.435828117165283, + "learning_rate": 1.5184115523465706e-05, + "loss": 0.2882, + "step": 1335 + }, + { + "epoch": 2.4187725631768955, + "grad_norm": 1.4275643368334596, + "learning_rate": 1.516606498194946e-05, + "loss": 0.2867, + "step": 1340 + }, + { + "epoch": 2.427797833935018, + "grad_norm": 1.5650882814331575, + "learning_rate": 1.5148014440433214e-05, + "loss": 0.2802, + "step": 1345 + }, + { + "epoch": 2.436823104693141, + "grad_norm": 1.6485980287400839, + "learning_rate": 1.512996389891697e-05, + "loss": 0.2876, + "step": 1350 + }, + { + "epoch": 2.4458483754512637, + "grad_norm": 1.7648222520273187, + "learning_rate": 1.5111913357400723e-05, + "loss": 0.2918, + "step": 1355 + }, + { + "epoch": 2.4548736462093865, + "grad_norm": 1.3897537891471694, + "learning_rate": 1.5093862815884477e-05, + "loss": 0.2827, + "step": 1360 + }, + { + "epoch": 2.463898916967509, + "grad_norm": 1.363449811555752, + "learning_rate": 1.5075812274368234e-05, + "loss": 0.2755, + "step": 1365 + }, + { + "epoch": 2.472924187725632, + "grad_norm": 1.5054780686538907, + "learning_rate": 1.5057761732851988e-05, + "loss": 0.2818, + "step": 1370 + }, + { + "epoch": 2.4819494584837547, + "grad_norm": 1.4402872956422859, + "learning_rate": 1.5039711191335742e-05, + "loss": 0.2806, + "step": 1375 + }, + { + "epoch": 2.490974729241877, + "grad_norm": 1.531753267929379, + "learning_rate": 1.5021660649819495e-05, + "loss": 0.281, + "step": 1380 + }, + { + "epoch": 2.5, + "grad_norm": 1.2873311123663773, + "learning_rate": 1.5003610108303251e-05, + "loss": 0.2816, + "step": 1385 + }, + { + "epoch": 2.509025270758123, + "grad_norm": 1.3443259078812042, + "learning_rate": 1.4985559566787005e-05, + "loss": 0.2816, + "step": 1390 + }, + { + "epoch": 2.5180505415162457, + "grad_norm": 1.5434631176695652, + "learning_rate": 1.4967509025270759e-05, + "loss": 0.2819, + "step": 1395 + }, + { + "epoch": 2.527075812274368, + "grad_norm": 1.362978897038253, + "learning_rate": 1.4949458483754512e-05, + "loss": 0.2794, + "step": 1400 + }, + { + "epoch": 2.536101083032491, + "grad_norm": 1.3819832377162544, + "learning_rate": 1.4931407942238268e-05, + "loss": 0.2868, + "step": 1405 + }, + { + "epoch": 2.5451263537906135, + "grad_norm": 1.6555554545565292, + "learning_rate": 1.4913357400722023e-05, + "loss": 0.2794, + "step": 1410 + }, + { + "epoch": 2.5541516245487363, + "grad_norm": 1.3799356893593522, + "learning_rate": 1.4895306859205779e-05, + "loss": 0.2895, + "step": 1415 + }, + { + "epoch": 2.563176895306859, + "grad_norm": 1.3978197779714834, + "learning_rate": 1.4877256317689533e-05, + "loss": 0.2823, + "step": 1420 + }, + { + "epoch": 2.572202166064982, + "grad_norm": 1.5752923039032896, + "learning_rate": 1.4859205776173287e-05, + "loss": 0.2864, + "step": 1425 + }, + { + "epoch": 2.581227436823105, + "grad_norm": 1.5198542385453242, + "learning_rate": 1.484115523465704e-05, + "loss": 0.2895, + "step": 1430 + }, + { + "epoch": 2.5902527075812274, + "grad_norm": 1.5294363816313867, + "learning_rate": 1.4823104693140796e-05, + "loss": 0.2841, + "step": 1435 + }, + { + "epoch": 2.5992779783393503, + "grad_norm": 1.6460098424826168, + "learning_rate": 1.480505415162455e-05, + "loss": 0.2905, + "step": 1440 + }, + { + "epoch": 2.6083032490974727, + "grad_norm": 1.4396643499754782, + "learning_rate": 1.4787003610108304e-05, + "loss": 0.2839, + "step": 1445 + }, + { + "epoch": 2.6173285198555956, + "grad_norm": 1.4770734853739884, + "learning_rate": 1.4768953068592057e-05, + "loss": 0.285, + "step": 1450 + }, + { + "epoch": 2.6263537906137184, + "grad_norm": 1.5925936800627583, + "learning_rate": 1.4750902527075815e-05, + "loss": 0.2939, + "step": 1455 + }, + { + "epoch": 2.6353790613718413, + "grad_norm": 1.3085799141153367, + "learning_rate": 1.4732851985559568e-05, + "loss": 0.2755, + "step": 1460 + }, + { + "epoch": 2.644404332129964, + "grad_norm": 1.3774680740883536, + "learning_rate": 1.4714801444043322e-05, + "loss": 0.2808, + "step": 1465 + }, + { + "epoch": 2.6534296028880866, + "grad_norm": 1.4223794368692813, + "learning_rate": 1.4696750902527078e-05, + "loss": 0.2767, + "step": 1470 + }, + { + "epoch": 2.6624548736462095, + "grad_norm": 1.579637524626807, + "learning_rate": 1.4678700361010832e-05, + "loss": 0.2849, + "step": 1475 + }, + { + "epoch": 2.671480144404332, + "grad_norm": 1.393710344111409, + "learning_rate": 1.4660649819494585e-05, + "loss": 0.2768, + "step": 1480 + }, + { + "epoch": 2.6805054151624548, + "grad_norm": 1.36136102500023, + "learning_rate": 1.464259927797834e-05, + "loss": 0.2816, + "step": 1485 + }, + { + "epoch": 2.6895306859205776, + "grad_norm": 1.4211824221206752, + "learning_rate": 1.4624548736462095e-05, + "loss": 0.2856, + "step": 1490 + }, + { + "epoch": 2.6985559566787005, + "grad_norm": 1.4156029806705734, + "learning_rate": 1.460649819494585e-05, + "loss": 0.2783, + "step": 1495 + }, + { + "epoch": 2.707581227436823, + "grad_norm": 1.523501338124956, + "learning_rate": 1.4588447653429606e-05, + "loss": 0.2872, + "step": 1500 + }, + { + "epoch": 2.716606498194946, + "grad_norm": 1.472818222499458, + "learning_rate": 1.457039711191336e-05, + "loss": 0.2806, + "step": 1505 + }, + { + "epoch": 2.7256317689530687, + "grad_norm": 1.3839972224563968, + "learning_rate": 1.4552346570397113e-05, + "loss": 0.2887, + "step": 1510 + }, + { + "epoch": 2.734657039711191, + "grad_norm": 1.4093341174867682, + "learning_rate": 1.4534296028880867e-05, + "loss": 0.2812, + "step": 1515 + }, + { + "epoch": 2.743682310469314, + "grad_norm": 1.4227505403261873, + "learning_rate": 1.4516245487364623e-05, + "loss": 0.2873, + "step": 1520 + }, + { + "epoch": 2.752707581227437, + "grad_norm": 1.532809014546811, + "learning_rate": 1.4498194945848376e-05, + "loss": 0.2804, + "step": 1525 + }, + { + "epoch": 2.7617328519855597, + "grad_norm": 1.5838121158952596, + "learning_rate": 1.448014440433213e-05, + "loss": 0.2841, + "step": 1530 + }, + { + "epoch": 2.770758122743682, + "grad_norm": 1.463955244062951, + "learning_rate": 1.4462093862815884e-05, + "loss": 0.2833, + "step": 1535 + }, + { + "epoch": 2.779783393501805, + "grad_norm": 1.5918158548066845, + "learning_rate": 1.4444043321299641e-05, + "loss": 0.2815, + "step": 1540 + }, + { + "epoch": 2.788808664259928, + "grad_norm": 1.4805453499007415, + "learning_rate": 1.4425992779783395e-05, + "loss": 0.2811, + "step": 1545 + }, + { + "epoch": 2.7978339350180503, + "grad_norm": 1.5502318521124943, + "learning_rate": 1.4407942238267149e-05, + "loss": 0.28, + "step": 1550 + }, + { + "epoch": 2.806859205776173, + "grad_norm": 1.4288682445201122, + "learning_rate": 1.4389891696750904e-05, + "loss": 0.2868, + "step": 1555 + }, + { + "epoch": 2.815884476534296, + "grad_norm": 1.4010775386857144, + "learning_rate": 1.4371841155234658e-05, + "loss": 0.2889, + "step": 1560 + }, + { + "epoch": 2.824909747292419, + "grad_norm": 1.4238966947086382, + "learning_rate": 1.4353790613718412e-05, + "loss": 0.2797, + "step": 1565 + }, + { + "epoch": 2.8339350180505414, + "grad_norm": 1.6701165033755396, + "learning_rate": 1.4335740072202166e-05, + "loss": 0.2821, + "step": 1570 + }, + { + "epoch": 2.8429602888086642, + "grad_norm": 1.52168185280516, + "learning_rate": 1.4317689530685921e-05, + "loss": 0.2836, + "step": 1575 + }, + { + "epoch": 2.851985559566787, + "grad_norm": 1.4368504165059217, + "learning_rate": 1.4299638989169675e-05, + "loss": 0.2821, + "step": 1580 + }, + { + "epoch": 2.8610108303249095, + "grad_norm": 1.5537901610258407, + "learning_rate": 1.4281588447653432e-05, + "loss": 0.2767, + "step": 1585 + }, + { + "epoch": 2.8700361010830324, + "grad_norm": 1.5418670823823388, + "learning_rate": 1.4263537906137186e-05, + "loss": 0.286, + "step": 1590 + }, + { + "epoch": 2.8790613718411553, + "grad_norm": 1.346173892816451, + "learning_rate": 1.424548736462094e-05, + "loss": 0.2855, + "step": 1595 + }, + { + "epoch": 2.888086642599278, + "grad_norm": 1.4152264182563925, + "learning_rate": 1.4227436823104694e-05, + "loss": 0.2815, + "step": 1600 + }, + { + "epoch": 2.8971119133574006, + "grad_norm": 1.3391376662233245, + "learning_rate": 1.420938628158845e-05, + "loss": 0.2851, + "step": 1605 + }, + { + "epoch": 2.9061371841155235, + "grad_norm": 1.4435874024341668, + "learning_rate": 1.4191335740072203e-05, + "loss": 0.2796, + "step": 1610 + }, + { + "epoch": 2.9151624548736463, + "grad_norm": 1.3751003396664772, + "learning_rate": 1.4173285198555957e-05, + "loss": 0.2825, + "step": 1615 + }, + { + "epoch": 2.9241877256317688, + "grad_norm": 1.5204342697256372, + "learning_rate": 1.4155234657039711e-05, + "loss": 0.2841, + "step": 1620 + }, + { + "epoch": 2.9332129963898916, + "grad_norm": 1.6012862613909276, + "learning_rate": 1.4137184115523468e-05, + "loss": 0.2772, + "step": 1625 + }, + { + "epoch": 2.9422382671480145, + "grad_norm": 1.40233355498611, + "learning_rate": 1.4119133574007222e-05, + "loss": 0.2813, + "step": 1630 + }, + { + "epoch": 2.9512635379061374, + "grad_norm": 1.4678056138681723, + "learning_rate": 1.4101083032490976e-05, + "loss": 0.2833, + "step": 1635 + }, + { + "epoch": 2.96028880866426, + "grad_norm": 1.5256510122152633, + "learning_rate": 1.4083032490974731e-05, + "loss": 0.2875, + "step": 1640 + }, + { + "epoch": 2.9693140794223827, + "grad_norm": 1.5409003183834475, + "learning_rate": 1.4064981949458485e-05, + "loss": 0.2907, + "step": 1645 + }, + { + "epoch": 2.9783393501805056, + "grad_norm": 1.5329424357705386, + "learning_rate": 1.4046931407942239e-05, + "loss": 0.2795, + "step": 1650 + }, + { + "epoch": 2.987364620938628, + "grad_norm": 1.4730310617789872, + "learning_rate": 1.4028880866425993e-05, + "loss": 0.2818, + "step": 1655 + }, + { + "epoch": 2.996389891696751, + "grad_norm": 1.51332909920422, + "learning_rate": 1.4010830324909748e-05, + "loss": 0.2813, + "step": 1660 + }, + { + "epoch": 3.0054151624548737, + "grad_norm": 1.1148635775562197, + "learning_rate": 1.3992779783393502e-05, + "loss": 0.2436, + "step": 1665 + }, + { + "epoch": 3.0144404332129966, + "grad_norm": 1.358530663003378, + "learning_rate": 1.397472924187726e-05, + "loss": 0.2095, + "step": 1670 + }, + { + "epoch": 3.023465703971119, + "grad_norm": 1.3975749965062991, + "learning_rate": 1.3956678700361013e-05, + "loss": 0.2111, + "step": 1675 + }, + { + "epoch": 3.032490974729242, + "grad_norm": 1.2464319048586523, + "learning_rate": 1.3938628158844767e-05, + "loss": 0.2168, + "step": 1680 + }, + { + "epoch": 3.0415162454873648, + "grad_norm": 1.2891229687458905, + "learning_rate": 1.392057761732852e-05, + "loss": 0.209, + "step": 1685 + }, + { + "epoch": 3.050541516245487, + "grad_norm": 1.4037828122476248, + "learning_rate": 1.3902527075812276e-05, + "loss": 0.2153, + "step": 1690 + }, + { + "epoch": 3.05956678700361, + "grad_norm": 1.2235806246519516, + "learning_rate": 1.388447653429603e-05, + "loss": 0.2096, + "step": 1695 + }, + { + "epoch": 3.068592057761733, + "grad_norm": 1.378724047451379, + "learning_rate": 1.3866425992779784e-05, + "loss": 0.2085, + "step": 1700 + }, + { + "epoch": 3.077617328519856, + "grad_norm": 1.382858335186212, + "learning_rate": 1.3848375451263538e-05, + "loss": 0.2204, + "step": 1705 + }, + { + "epoch": 3.0866425992779782, + "grad_norm": 1.3137128844249182, + "learning_rate": 1.3830324909747293e-05, + "loss": 0.2162, + "step": 1710 + }, + { + "epoch": 3.095667870036101, + "grad_norm": 1.2029503152905936, + "learning_rate": 1.3812274368231049e-05, + "loss": 0.212, + "step": 1715 + }, + { + "epoch": 3.104693140794224, + "grad_norm": 1.3679968447380255, + "learning_rate": 1.3794223826714802e-05, + "loss": 0.2112, + "step": 1720 + }, + { + "epoch": 3.1137184115523464, + "grad_norm": 1.4087458694349797, + "learning_rate": 1.3776173285198558e-05, + "loss": 0.2124, + "step": 1725 + }, + { + "epoch": 3.1227436823104693, + "grad_norm": 1.2653578136284922, + "learning_rate": 1.3758122743682312e-05, + "loss": 0.2138, + "step": 1730 + }, + { + "epoch": 3.131768953068592, + "grad_norm": 1.3112584499411382, + "learning_rate": 1.3740072202166066e-05, + "loss": 0.2163, + "step": 1735 + }, + { + "epoch": 3.140794223826715, + "grad_norm": 1.4058232641289103, + "learning_rate": 1.372202166064982e-05, + "loss": 0.2159, + "step": 1740 + }, + { + "epoch": 3.1498194945848375, + "grad_norm": 1.41030881061776, + "learning_rate": 1.3703971119133575e-05, + "loss": 0.2145, + "step": 1745 + }, + { + "epoch": 3.1588447653429603, + "grad_norm": 1.5104671605422084, + "learning_rate": 1.3685920577617329e-05, + "loss": 0.2139, + "step": 1750 + }, + { + "epoch": 3.167870036101083, + "grad_norm": 1.2630974507680133, + "learning_rate": 1.3667870036101086e-05, + "loss": 0.2157, + "step": 1755 + }, + { + "epoch": 3.1768953068592056, + "grad_norm": 1.272128771203331, + "learning_rate": 1.364981949458484e-05, + "loss": 0.2161, + "step": 1760 + }, + { + "epoch": 3.1859205776173285, + "grad_norm": 1.3478004143294164, + "learning_rate": 1.3631768953068594e-05, + "loss": 0.2163, + "step": 1765 + }, + { + "epoch": 3.1949458483754514, + "grad_norm": 1.450785422963031, + "learning_rate": 1.3613718411552347e-05, + "loss": 0.2073, + "step": 1770 + }, + { + "epoch": 3.2039711191335742, + "grad_norm": 1.4737969733507372, + "learning_rate": 1.3595667870036103e-05, + "loss": 0.214, + "step": 1775 + }, + { + "epoch": 3.2129963898916967, + "grad_norm": 1.55495138656805, + "learning_rate": 1.3577617328519857e-05, + "loss": 0.217, + "step": 1780 + }, + { + "epoch": 3.2220216606498195, + "grad_norm": 1.301565356658425, + "learning_rate": 1.355956678700361e-05, + "loss": 0.2125, + "step": 1785 + }, + { + "epoch": 3.2310469314079424, + "grad_norm": 1.3526575231198374, + "learning_rate": 1.3541516245487364e-05, + "loss": 0.2139, + "step": 1790 + }, + { + "epoch": 3.240072202166065, + "grad_norm": 1.3480835110678375, + "learning_rate": 1.352346570397112e-05, + "loss": 0.2163, + "step": 1795 + }, + { + "epoch": 3.2490974729241877, + "grad_norm": 1.585271243780268, + "learning_rate": 1.3505415162454875e-05, + "loss": 0.2172, + "step": 1800 + }, + { + "epoch": 3.2581227436823106, + "grad_norm": 1.3914377175448838, + "learning_rate": 1.348736462093863e-05, + "loss": 0.2175, + "step": 1805 + }, + { + "epoch": 3.2671480144404335, + "grad_norm": 1.3096937764073042, + "learning_rate": 1.3469314079422385e-05, + "loss": 0.2161, + "step": 1810 + }, + { + "epoch": 3.276173285198556, + "grad_norm": 1.4025247600726756, + "learning_rate": 1.3451263537906139e-05, + "loss": 0.2192, + "step": 1815 + }, + { + "epoch": 3.2851985559566788, + "grad_norm": 1.341953244519878, + "learning_rate": 1.3433212996389892e-05, + "loss": 0.2193, + "step": 1820 + }, + { + "epoch": 3.2942238267148016, + "grad_norm": 1.1736050435468526, + "learning_rate": 1.3415162454873646e-05, + "loss": 0.2169, + "step": 1825 + }, + { + "epoch": 3.303249097472924, + "grad_norm": 7.343497355928463, + "learning_rate": 1.3397111913357402e-05, + "loss": 0.2221, + "step": 1830 + }, + { + "epoch": 3.312274368231047, + "grad_norm": 1.371834881522216, + "learning_rate": 1.3379061371841155e-05, + "loss": 0.2199, + "step": 1835 + }, + { + "epoch": 3.32129963898917, + "grad_norm": 1.4365047454230295, + "learning_rate": 1.336101083032491e-05, + "loss": 0.2132, + "step": 1840 + }, + { + "epoch": 3.3303249097472922, + "grad_norm": 1.272421653704436, + "learning_rate": 1.3342960288808667e-05, + "loss": 0.2184, + "step": 1845 + }, + { + "epoch": 3.339350180505415, + "grad_norm": 1.4316028096472446, + "learning_rate": 1.332490974729242e-05, + "loss": 0.2132, + "step": 1850 + }, + { + "epoch": 3.348375451263538, + "grad_norm": 1.2759638728894496, + "learning_rate": 1.3306859205776174e-05, + "loss": 0.2171, + "step": 1855 + }, + { + "epoch": 3.357400722021661, + "grad_norm": 1.3243009878900587, + "learning_rate": 1.328880866425993e-05, + "loss": 0.2187, + "step": 1860 + }, + { + "epoch": 3.3664259927797833, + "grad_norm": 1.357548066354826, + "learning_rate": 1.3270758122743683e-05, + "loss": 0.218, + "step": 1865 + }, + { + "epoch": 3.375451263537906, + "grad_norm": 1.3425006805058106, + "learning_rate": 1.3252707581227437e-05, + "loss": 0.217, + "step": 1870 + }, + { + "epoch": 3.384476534296029, + "grad_norm": 1.285802529815462, + "learning_rate": 1.3234657039711191e-05, + "loss": 0.216, + "step": 1875 + }, + { + "epoch": 3.3935018050541514, + "grad_norm": 1.2778875094894446, + "learning_rate": 1.3216606498194947e-05, + "loss": 0.2193, + "step": 1880 + }, + { + "epoch": 3.4025270758122743, + "grad_norm": 1.1908353789550035, + "learning_rate": 1.3198555956678702e-05, + "loss": 0.2161, + "step": 1885 + }, + { + "epoch": 3.411552346570397, + "grad_norm": 1.34603474003137, + "learning_rate": 1.3180505415162456e-05, + "loss": 0.2141, + "step": 1890 + }, + { + "epoch": 3.4205776173285196, + "grad_norm": 1.4297727153398665, + "learning_rate": 1.3162454873646211e-05, + "loss": 0.2237, + "step": 1895 + }, + { + "epoch": 3.4296028880866425, + "grad_norm": 1.3837512629017574, + "learning_rate": 1.3144404332129965e-05, + "loss": 0.2214, + "step": 1900 + }, + { + "epoch": 3.4386281588447654, + "grad_norm": 1.4387141423605057, + "learning_rate": 1.3126353790613719e-05, + "loss": 0.2187, + "step": 1905 + }, + { + "epoch": 3.4476534296028882, + "grad_norm": 1.2799805992130007, + "learning_rate": 1.3108303249097475e-05, + "loss": 0.2179, + "step": 1910 + }, + { + "epoch": 3.4566787003610107, + "grad_norm": 1.4835446521559619, + "learning_rate": 1.3090252707581228e-05, + "loss": 0.2203, + "step": 1915 + }, + { + "epoch": 3.4657039711191335, + "grad_norm": 1.341999213127749, + "learning_rate": 1.3072202166064982e-05, + "loss": 0.2168, + "step": 1920 + }, + { + "epoch": 3.4747292418772564, + "grad_norm": 1.3763087376806546, + "learning_rate": 1.3054151624548736e-05, + "loss": 0.2149, + "step": 1925 + }, + { + "epoch": 3.483754512635379, + "grad_norm": 1.4007971114232958, + "learning_rate": 1.3036101083032493e-05, + "loss": 0.2182, + "step": 1930 + }, + { + "epoch": 3.4927797833935017, + "grad_norm": 1.4572957861270215, + "learning_rate": 1.3018050541516247e-05, + "loss": 0.2147, + "step": 1935 + }, + { + "epoch": 3.5018050541516246, + "grad_norm": 1.3722288775763722, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.2198, + "step": 1940 + }, + { + "epoch": 3.5108303249097474, + "grad_norm": 1.3099638901670316, + "learning_rate": 1.2981949458483756e-05, + "loss": 0.2195, + "step": 1945 + }, + { + "epoch": 3.51985559566787, + "grad_norm": 1.2794841153864642, + "learning_rate": 1.296389891696751e-05, + "loss": 0.2181, + "step": 1950 + }, + { + "epoch": 3.5288808664259927, + "grad_norm": 1.4143673780025412, + "learning_rate": 1.2945848375451264e-05, + "loss": 0.2159, + "step": 1955 + }, + { + "epoch": 3.5379061371841156, + "grad_norm": 1.2691083266416614, + "learning_rate": 1.2927797833935018e-05, + "loss": 0.2145, + "step": 1960 + }, + { + "epoch": 3.546931407942238, + "grad_norm": 1.6855255358173022, + "learning_rate": 1.2909747292418773e-05, + "loss": 0.2142, + "step": 1965 + }, + { + "epoch": 3.555956678700361, + "grad_norm": 1.2577832543255076, + "learning_rate": 1.2891696750902527e-05, + "loss": 0.2185, + "step": 1970 + }, + { + "epoch": 3.564981949458484, + "grad_norm": 1.4260534179211517, + "learning_rate": 1.2873646209386283e-05, + "loss": 0.2227, + "step": 1975 + }, + { + "epoch": 3.5740072202166067, + "grad_norm": 1.2525086568956194, + "learning_rate": 1.2855595667870038e-05, + "loss": 0.2193, + "step": 1980 + }, + { + "epoch": 3.583032490974729, + "grad_norm": 1.2589927076038325, + "learning_rate": 1.2837545126353792e-05, + "loss": 0.2203, + "step": 1985 + }, + { + "epoch": 3.592057761732852, + "grad_norm": 1.417146294874885, + "learning_rate": 1.2819494584837546e-05, + "loss": 0.2172, + "step": 1990 + }, + { + "epoch": 3.601083032490975, + "grad_norm": 1.201542111015426, + "learning_rate": 1.2801444043321301e-05, + "loss": 0.2175, + "step": 1995 + }, + { + "epoch": 3.6101083032490973, + "grad_norm": 1.3003616222573477, + "learning_rate": 1.2783393501805055e-05, + "loss": 0.2218, + "step": 2000 + }, + { + "epoch": 3.61913357400722, + "grad_norm": 1.2526661517801678, + "learning_rate": 1.2765342960288809e-05, + "loss": 0.2211, + "step": 2005 + }, + { + "epoch": 3.628158844765343, + "grad_norm": 1.3562427529698038, + "learning_rate": 1.2747292418772563e-05, + "loss": 0.2202, + "step": 2010 + }, + { + "epoch": 3.637184115523466, + "grad_norm": 1.337359110529062, + "learning_rate": 1.272924187725632e-05, + "loss": 0.2192, + "step": 2015 + }, + { + "epoch": 3.6462093862815883, + "grad_norm": 1.3247434076106055, + "learning_rate": 1.2711191335740074e-05, + "loss": 0.2175, + "step": 2020 + }, + { + "epoch": 3.655234657039711, + "grad_norm": 1.401401481932495, + "learning_rate": 1.2693140794223828e-05, + "loss": 0.2176, + "step": 2025 + }, + { + "epoch": 3.664259927797834, + "grad_norm": 1.3989599012921654, + "learning_rate": 1.2675090252707583e-05, + "loss": 0.2145, + "step": 2030 + }, + { + "epoch": 3.6732851985559565, + "grad_norm": 1.37942404901644, + "learning_rate": 1.2657039711191337e-05, + "loss": 0.2176, + "step": 2035 + }, + { + "epoch": 3.6823104693140793, + "grad_norm": 1.3768859929806074, + "learning_rate": 1.263898916967509e-05, + "loss": 0.2216, + "step": 2040 + }, + { + "epoch": 3.691335740072202, + "grad_norm": 1.4119126477817214, + "learning_rate": 1.2620938628158845e-05, + "loss": 0.2176, + "step": 2045 + }, + { + "epoch": 3.700361010830325, + "grad_norm": 1.521657968464524, + "learning_rate": 1.26028880866426e-05, + "loss": 0.2215, + "step": 2050 + }, + { + "epoch": 3.7093862815884475, + "grad_norm": 1.1673555154434911, + "learning_rate": 1.2584837545126354e-05, + "loss": 0.2207, + "step": 2055 + }, + { + "epoch": 3.7184115523465704, + "grad_norm": 1.3165831513135962, + "learning_rate": 1.256678700361011e-05, + "loss": 0.2199, + "step": 2060 + }, + { + "epoch": 3.7274368231046933, + "grad_norm": 1.2771969687673677, + "learning_rate": 1.2548736462093865e-05, + "loss": 0.2174, + "step": 2065 + }, + { + "epoch": 3.7364620938628157, + "grad_norm": 1.3460833673268793, + "learning_rate": 1.2530685920577619e-05, + "loss": 0.2176, + "step": 2070 + }, + { + "epoch": 3.7454873646209386, + "grad_norm": 1.321801578603406, + "learning_rate": 1.2512635379061373e-05, + "loss": 0.2181, + "step": 2075 + }, + { + "epoch": 3.7545126353790614, + "grad_norm": 1.296115804685079, + "learning_rate": 1.2494584837545128e-05, + "loss": 0.2174, + "step": 2080 + }, + { + "epoch": 3.7635379061371843, + "grad_norm": 1.611992163605043, + "learning_rate": 1.2476534296028882e-05, + "loss": 0.2218, + "step": 2085 + }, + { + "epoch": 3.7725631768953067, + "grad_norm": 1.333178205245191, + "learning_rate": 1.2458483754512636e-05, + "loss": 0.2191, + "step": 2090 + }, + { + "epoch": 3.7815884476534296, + "grad_norm": 1.3160245659773944, + "learning_rate": 1.244043321299639e-05, + "loss": 0.2103, + "step": 2095 + }, + { + "epoch": 3.7906137184115525, + "grad_norm": 1.229821228240215, + "learning_rate": 1.2422382671480145e-05, + "loss": 0.2211, + "step": 2100 + }, + { + "epoch": 3.799638989169675, + "grad_norm": 1.4668142178779533, + "learning_rate": 1.24043321299639e-05, + "loss": 0.2157, + "step": 2105 + }, + { + "epoch": 3.808664259927798, + "grad_norm": 1.2767987004558847, + "learning_rate": 1.2386281588447654e-05, + "loss": 0.2139, + "step": 2110 + }, + { + "epoch": 3.8176895306859207, + "grad_norm": 1.3419257818618695, + "learning_rate": 1.236823104693141e-05, + "loss": 0.2177, + "step": 2115 + }, + { + "epoch": 3.8267148014440435, + "grad_norm": 1.3106254825745933, + "learning_rate": 1.2350180505415164e-05, + "loss": 0.2188, + "step": 2120 + }, + { + "epoch": 3.835740072202166, + "grad_norm": 1.291543676501794, + "learning_rate": 1.2332129963898918e-05, + "loss": 0.2211, + "step": 2125 + }, + { + "epoch": 3.844765342960289, + "grad_norm": 1.4261998506808886, + "learning_rate": 1.2314079422382671e-05, + "loss": 0.2197, + "step": 2130 + }, + { + "epoch": 3.8537906137184117, + "grad_norm": 1.358333414575488, + "learning_rate": 1.2296028880866427e-05, + "loss": 0.2159, + "step": 2135 + }, + { + "epoch": 3.862815884476534, + "grad_norm": 1.474955377700595, + "learning_rate": 1.227797833935018e-05, + "loss": 0.2218, + "step": 2140 + }, + { + "epoch": 3.871841155234657, + "grad_norm": 1.3720904636423812, + "learning_rate": 1.2259927797833938e-05, + "loss": 0.2177, + "step": 2145 + }, + { + "epoch": 3.88086642599278, + "grad_norm": 1.5460881931560433, + "learning_rate": 1.2241877256317692e-05, + "loss": 0.2202, + "step": 2150 + }, + { + "epoch": 3.8898916967509027, + "grad_norm": 1.372535903476554, + "learning_rate": 1.2223826714801446e-05, + "loss": 0.2133, + "step": 2155 + }, + { + "epoch": 3.898916967509025, + "grad_norm": 1.3380599019325592, + "learning_rate": 1.22057761732852e-05, + "loss": 0.2156, + "step": 2160 + }, + { + "epoch": 3.907942238267148, + "grad_norm": 1.5215368016858366, + "learning_rate": 1.2187725631768955e-05, + "loss": 0.2209, + "step": 2165 + }, + { + "epoch": 3.916967509025271, + "grad_norm": 1.4103734495459146, + "learning_rate": 1.2169675090252709e-05, + "loss": 0.2156, + "step": 2170 + }, + { + "epoch": 3.9259927797833933, + "grad_norm": 1.3438329909754207, + "learning_rate": 1.2151624548736462e-05, + "loss": 0.2199, + "step": 2175 + }, + { + "epoch": 3.935018050541516, + "grad_norm": 1.2636238619595532, + "learning_rate": 1.2133574007220216e-05, + "loss": 0.2197, + "step": 2180 + }, + { + "epoch": 3.944043321299639, + "grad_norm": 1.1840357472659475, + "learning_rate": 1.2115523465703972e-05, + "loss": 0.2122, + "step": 2185 + }, + { + "epoch": 3.953068592057762, + "grad_norm": 1.2040616851708978, + "learning_rate": 1.2097472924187727e-05, + "loss": 0.2185, + "step": 2190 + }, + { + "epoch": 3.9620938628158844, + "grad_norm": 1.358963573245228, + "learning_rate": 1.2079422382671481e-05, + "loss": 0.2153, + "step": 2195 + }, + { + "epoch": 3.9711191335740073, + "grad_norm": 1.2933453928461196, + "learning_rate": 1.2061371841155237e-05, + "loss": 0.2157, + "step": 2200 + }, + { + "epoch": 3.98014440433213, + "grad_norm": 1.207608106082354, + "learning_rate": 1.204332129963899e-05, + "loss": 0.2199, + "step": 2205 + }, + { + "epoch": 3.9891696750902526, + "grad_norm": 1.3581034288222624, + "learning_rate": 1.2025270758122744e-05, + "loss": 0.2196, + "step": 2210 + }, + { + "epoch": 3.9981949458483754, + "grad_norm": 1.508352838653625, + "learning_rate": 1.2007220216606498e-05, + "loss": 0.2219, + "step": 2215 + }, + { + "epoch": 4.007220216606498, + "grad_norm": 1.1525808259183898, + "learning_rate": 1.1989169675090254e-05, + "loss": 0.172, + "step": 2220 + }, + { + "epoch": 4.016245487364621, + "grad_norm": 1.6837096670325025, + "learning_rate": 1.1971119133574007e-05, + "loss": 0.1473, + "step": 2225 + }, + { + "epoch": 4.025270758122744, + "grad_norm": 1.3622634803237335, + "learning_rate": 1.1953068592057765e-05, + "loss": 0.1424, + "step": 2230 + }, + { + "epoch": 4.034296028880866, + "grad_norm": 1.167482488838041, + "learning_rate": 1.1935018050541518e-05, + "loss": 0.1428, + "step": 2235 + }, + { + "epoch": 4.043321299638989, + "grad_norm": 1.370361726176911, + "learning_rate": 1.1916967509025272e-05, + "loss": 0.1385, + "step": 2240 + }, + { + "epoch": 4.052346570397112, + "grad_norm": 1.2641058446971232, + "learning_rate": 1.1898916967509026e-05, + "loss": 0.1358, + "step": 2245 + }, + { + "epoch": 4.061371841155235, + "grad_norm": 1.4838069133238292, + "learning_rate": 1.1880866425992782e-05, + "loss": 0.1369, + "step": 2250 + }, + { + "epoch": 4.0703971119133575, + "grad_norm": 1.411880125524948, + "learning_rate": 1.1862815884476535e-05, + "loss": 0.1376, + "step": 2255 + }, + { + "epoch": 4.07942238267148, + "grad_norm": 1.532801179696419, + "learning_rate": 1.184476534296029e-05, + "loss": 0.144, + "step": 2260 + }, + { + "epoch": 4.088447653429603, + "grad_norm": 1.2722280030975663, + "learning_rate": 1.1826714801444043e-05, + "loss": 0.1377, + "step": 2265 + }, + { + "epoch": 4.097472924187725, + "grad_norm": 1.257371634404589, + "learning_rate": 1.1808664259927799e-05, + "loss": 0.1376, + "step": 2270 + }, + { + "epoch": 4.106498194945848, + "grad_norm": 1.2867994012584483, + "learning_rate": 1.1790613718411554e-05, + "loss": 0.1438, + "step": 2275 + }, + { + "epoch": 4.115523465703971, + "grad_norm": 1.3235714594660852, + "learning_rate": 1.1772563176895308e-05, + "loss": 0.1375, + "step": 2280 + }, + { + "epoch": 4.124548736462094, + "grad_norm": 1.3538854718074433, + "learning_rate": 1.1754512635379063e-05, + "loss": 0.1434, + "step": 2285 + }, + { + "epoch": 4.133574007220217, + "grad_norm": 1.3387220340603523, + "learning_rate": 1.1736462093862817e-05, + "loss": 0.1439, + "step": 2290 + }, + { + "epoch": 4.14259927797834, + "grad_norm": 1.3342275390499512, + "learning_rate": 1.1718411552346571e-05, + "loss": 0.1423, + "step": 2295 + }, + { + "epoch": 4.1516245487364625, + "grad_norm": 1.3172292559591101, + "learning_rate": 1.1700361010830325e-05, + "loss": 0.1397, + "step": 2300 + }, + { + "epoch": 4.1606498194945845, + "grad_norm": 1.5089183076261925, + "learning_rate": 1.168231046931408e-05, + "loss": 0.1478, + "step": 2305 + }, + { + "epoch": 4.169675090252707, + "grad_norm": 1.2882037882410693, + "learning_rate": 1.1664259927797834e-05, + "loss": 0.1441, + "step": 2310 + }, + { + "epoch": 4.17870036101083, + "grad_norm": 1.3275271715737593, + "learning_rate": 1.1646209386281588e-05, + "loss": 0.1467, + "step": 2315 + }, + { + "epoch": 4.187725631768953, + "grad_norm": 1.26669118474737, + "learning_rate": 1.1628158844765345e-05, + "loss": 0.1448, + "step": 2320 + }, + { + "epoch": 4.196750902527076, + "grad_norm": 1.287627972081866, + "learning_rate": 1.1610108303249099e-05, + "loss": 0.1463, + "step": 2325 + }, + { + "epoch": 4.205776173285199, + "grad_norm": 1.4698351462515338, + "learning_rate": 1.1592057761732853e-05, + "loss": 0.1418, + "step": 2330 + }, + { + "epoch": 4.214801444043322, + "grad_norm": 1.214815481124232, + "learning_rate": 1.1574007220216608e-05, + "loss": 0.1414, + "step": 2335 + }, + { + "epoch": 4.223826714801444, + "grad_norm": 1.3143493372944082, + "learning_rate": 1.1555956678700362e-05, + "loss": 0.1434, + "step": 2340 + }, + { + "epoch": 4.2328519855595665, + "grad_norm": 1.312607955074974, + "learning_rate": 1.1537906137184116e-05, + "loss": 0.144, + "step": 2345 + }, + { + "epoch": 4.241877256317689, + "grad_norm": 1.2190888161416562, + "learning_rate": 1.151985559566787e-05, + "loss": 0.1426, + "step": 2350 + }, + { + "epoch": 4.250902527075812, + "grad_norm": 1.3594194563402886, + "learning_rate": 1.1501805054151625e-05, + "loss": 0.1435, + "step": 2355 + }, + { + "epoch": 4.259927797833935, + "grad_norm": 1.4782948938985976, + "learning_rate": 1.148375451263538e-05, + "loss": 0.1454, + "step": 2360 + }, + { + "epoch": 4.268953068592058, + "grad_norm": 1.336811201239736, + "learning_rate": 1.1465703971119135e-05, + "loss": 0.1454, + "step": 2365 + }, + { + "epoch": 4.277978339350181, + "grad_norm": 1.2482577868629918, + "learning_rate": 1.144765342960289e-05, + "loss": 0.1411, + "step": 2370 + }, + { + "epoch": 4.287003610108303, + "grad_norm": 1.2732430927693474, + "learning_rate": 1.1429602888086644e-05, + "loss": 0.1417, + "step": 2375 + }, + { + "epoch": 4.296028880866426, + "grad_norm": 1.5099576743037075, + "learning_rate": 1.1411552346570398e-05, + "loss": 0.1448, + "step": 2380 + }, + { + "epoch": 4.305054151624549, + "grad_norm": 1.289221803177811, + "learning_rate": 1.1393501805054152e-05, + "loss": 0.147, + "step": 2385 + }, + { + "epoch": 4.3140794223826715, + "grad_norm": 1.3263143595170082, + "learning_rate": 1.1375451263537907e-05, + "loss": 0.146, + "step": 2390 + }, + { + "epoch": 4.323104693140794, + "grad_norm": 1.2413770324582891, + "learning_rate": 1.1357400722021661e-05, + "loss": 0.1457, + "step": 2395 + }, + { + "epoch": 4.332129963898917, + "grad_norm": 1.2639388626402195, + "learning_rate": 1.1339350180505415e-05, + "loss": 0.1507, + "step": 2400 + }, + { + "epoch": 4.34115523465704, + "grad_norm": 1.264633141154961, + "learning_rate": 1.1321299638989172e-05, + "loss": 0.1449, + "step": 2405 + }, + { + "epoch": 4.350180505415162, + "grad_norm": 1.436154267765744, + "learning_rate": 1.1303249097472926e-05, + "loss": 0.1507, + "step": 2410 + }, + { + "epoch": 4.359205776173285, + "grad_norm": 1.1582422344120165, + "learning_rate": 1.128519855595668e-05, + "loss": 0.1484, + "step": 2415 + }, + { + "epoch": 4.368231046931408, + "grad_norm": 1.279883119070738, + "learning_rate": 1.1267148014440435e-05, + "loss": 0.1427, + "step": 2420 + }, + { + "epoch": 4.377256317689531, + "grad_norm": 1.3274819211696616, + "learning_rate": 1.1249097472924189e-05, + "loss": 0.1452, + "step": 2425 + }, + { + "epoch": 4.386281588447654, + "grad_norm": 1.3298646533007747, + "learning_rate": 1.1231046931407943e-05, + "loss": 0.1491, + "step": 2430 + }, + { + "epoch": 4.3953068592057765, + "grad_norm": 1.2549287331161112, + "learning_rate": 1.1212996389891697e-05, + "loss": 0.1489, + "step": 2435 + }, + { + "epoch": 4.404332129963899, + "grad_norm": 1.400198206798889, + "learning_rate": 1.1194945848375452e-05, + "loss": 0.1451, + "step": 2440 + }, + { + "epoch": 4.413357400722021, + "grad_norm": 1.3684946718616282, + "learning_rate": 1.1176895306859206e-05, + "loss": 0.1521, + "step": 2445 + }, + { + "epoch": 4.422382671480144, + "grad_norm": 1.2087420140469356, + "learning_rate": 1.1158844765342961e-05, + "loss": 0.1454, + "step": 2450 + }, + { + "epoch": 4.431407942238267, + "grad_norm": 1.2910589959993624, + "learning_rate": 1.1140794223826717e-05, + "loss": 0.1449, + "step": 2455 + }, + { + "epoch": 4.44043321299639, + "grad_norm": 1.4447075027189171, + "learning_rate": 1.112274368231047e-05, + "loss": 0.1443, + "step": 2460 + }, + { + "epoch": 4.449458483754513, + "grad_norm": 1.3740604349535315, + "learning_rate": 1.1104693140794225e-05, + "loss": 0.1413, + "step": 2465 + }, + { + "epoch": 4.458483754512636, + "grad_norm": 1.4050751545666154, + "learning_rate": 1.1086642599277978e-05, + "loss": 0.1483, + "step": 2470 + }, + { + "epoch": 4.467509025270758, + "grad_norm": 1.2385281470710559, + "learning_rate": 1.1068592057761734e-05, + "loss": 0.1485, + "step": 2475 + }, + { + "epoch": 4.4765342960288805, + "grad_norm": 1.2281174369886623, + "learning_rate": 1.1050541516245488e-05, + "loss": 0.1485, + "step": 2480 + }, + { + "epoch": 4.485559566787003, + "grad_norm": 1.2151464275592043, + "learning_rate": 1.1032490974729241e-05, + "loss": 0.1418, + "step": 2485 + }, + { + "epoch": 4.494584837545126, + "grad_norm": 1.3791171874873505, + "learning_rate": 1.1014440433212999e-05, + "loss": 0.1448, + "step": 2490 + }, + { + "epoch": 4.503610108303249, + "grad_norm": 1.2082141878609713, + "learning_rate": 1.0996389891696753e-05, + "loss": 0.142, + "step": 2495 + }, + { + "epoch": 4.512635379061372, + "grad_norm": 1.2284432308372542, + "learning_rate": 1.0978339350180506e-05, + "loss": 0.1462, + "step": 2500 + }, + { + "epoch": 4.512635379061372, + "eval_loss": 0.11392025649547577, + "eval_runtime": 768.0942, + "eval_samples_per_second": 17.309, + "eval_steps_per_second": 0.721, + "step": 2500 + }, + { + "epoch": 4.521660649819495, + "grad_norm": 1.230111612515777, + "learning_rate": 1.0960288808664262e-05, + "loss": 0.1425, + "step": 2505 + }, + { + "epoch": 4.530685920577618, + "grad_norm": 1.3673654833429962, + "learning_rate": 1.0942238267148016e-05, + "loss": 0.146, + "step": 2510 + }, + { + "epoch": 4.53971119133574, + "grad_norm": 1.470735178829857, + "learning_rate": 1.092418772563177e-05, + "loss": 0.1512, + "step": 2515 + }, + { + "epoch": 4.548736462093863, + "grad_norm": 1.431415105533255, + "learning_rate": 1.0906137184115523e-05, + "loss": 0.146, + "step": 2520 + }, + { + "epoch": 4.5577617328519855, + "grad_norm": 1.304958395632544, + "learning_rate": 1.0888086642599279e-05, + "loss": 0.1405, + "step": 2525 + }, + { + "epoch": 4.566787003610108, + "grad_norm": 1.202244774918958, + "learning_rate": 1.0870036101083033e-05, + "loss": 0.1494, + "step": 2530 + }, + { + "epoch": 4.575812274368231, + "grad_norm": 1.2061190438112863, + "learning_rate": 1.0851985559566788e-05, + "loss": 0.1454, + "step": 2535 + }, + { + "epoch": 4.584837545126354, + "grad_norm": 1.2303141902787755, + "learning_rate": 1.0833935018050544e-05, + "loss": 0.1464, + "step": 2540 + }, + { + "epoch": 4.593862815884476, + "grad_norm": 1.1973084706979935, + "learning_rate": 1.0815884476534297e-05, + "loss": 0.1525, + "step": 2545 + }, + { + "epoch": 4.602888086642599, + "grad_norm": 1.265181420660065, + "learning_rate": 1.0797833935018051e-05, + "loss": 0.1467, + "step": 2550 + }, + { + "epoch": 4.611913357400722, + "grad_norm": 1.3348119006510952, + "learning_rate": 1.0779783393501805e-05, + "loss": 0.1503, + "step": 2555 + }, + { + "epoch": 4.620938628158845, + "grad_norm": 1.3586496560954744, + "learning_rate": 1.076173285198556e-05, + "loss": 0.1478, + "step": 2560 + }, + { + "epoch": 4.629963898916968, + "grad_norm": 1.2646064815159468, + "learning_rate": 1.0743682310469314e-05, + "loss": 0.1506, + "step": 2565 + }, + { + "epoch": 4.6389891696750905, + "grad_norm": 1.2208996441560755, + "learning_rate": 1.0725631768953068e-05, + "loss": 0.1489, + "step": 2570 + }, + { + "epoch": 4.648014440433213, + "grad_norm": 1.2249820178968749, + "learning_rate": 1.0707581227436824e-05, + "loss": 0.1491, + "step": 2575 + }, + { + "epoch": 4.657039711191336, + "grad_norm": 1.2788771690689633, + "learning_rate": 1.068953068592058e-05, + "loss": 0.1468, + "step": 2580 + }, + { + "epoch": 4.666064981949458, + "grad_norm": 1.235614926954878, + "learning_rate": 1.0671480144404333e-05, + "loss": 0.1448, + "step": 2585 + }, + { + "epoch": 4.675090252707581, + "grad_norm": 1.4835623380890686, + "learning_rate": 1.0653429602888089e-05, + "loss": 0.1473, + "step": 2590 + }, + { + "epoch": 4.684115523465704, + "grad_norm": 1.3282902129276972, + "learning_rate": 1.0635379061371842e-05, + "loss": 0.1513, + "step": 2595 + }, + { + "epoch": 4.693140794223827, + "grad_norm": 1.3495078303520642, + "learning_rate": 1.0617328519855596e-05, + "loss": 0.1493, + "step": 2600 + }, + { + "epoch": 4.70216606498195, + "grad_norm": 1.3624724519254527, + "learning_rate": 1.059927797833935e-05, + "loss": 0.1464, + "step": 2605 + }, + { + "epoch": 4.7111913357400725, + "grad_norm": 1.5149872464672054, + "learning_rate": 1.0581227436823106e-05, + "loss": 0.1476, + "step": 2610 + }, + { + "epoch": 4.7202166064981945, + "grad_norm": 1.4140631499929084, + "learning_rate": 1.056317689530686e-05, + "loss": 0.1467, + "step": 2615 + }, + { + "epoch": 4.729241877256317, + "grad_norm": 1.3773886206332047, + "learning_rate": 1.0545126353790615e-05, + "loss": 0.1478, + "step": 2620 + }, + { + "epoch": 4.73826714801444, + "grad_norm": 1.360291667120557, + "learning_rate": 1.052707581227437e-05, + "loss": 0.1511, + "step": 2625 + }, + { + "epoch": 4.747292418772563, + "grad_norm": 2.0327952709868455, + "learning_rate": 1.0509025270758124e-05, + "loss": 0.1494, + "step": 2630 + }, + { + "epoch": 4.756317689530686, + "grad_norm": 1.1791100232489107, + "learning_rate": 1.0490974729241878e-05, + "loss": 0.1489, + "step": 2635 + }, + { + "epoch": 4.765342960288809, + "grad_norm": 1.3854796935196865, + "learning_rate": 1.0472924187725632e-05, + "loss": 0.1474, + "step": 2640 + }, + { + "epoch": 4.774368231046932, + "grad_norm": 1.195069413636359, + "learning_rate": 1.0454873646209387e-05, + "loss": 0.1473, + "step": 2645 + }, + { + "epoch": 4.783393501805055, + "grad_norm": 1.1422586625889126, + "learning_rate": 1.0436823104693141e-05, + "loss": 0.1492, + "step": 2650 + }, + { + "epoch": 4.792418772563177, + "grad_norm": 1.307147062583015, + "learning_rate": 1.0418772563176895e-05, + "loss": 0.1512, + "step": 2655 + }, + { + "epoch": 4.8014440433212995, + "grad_norm": 1.3584746863088188, + "learning_rate": 1.040072202166065e-05, + "loss": 0.1457, + "step": 2660 + }, + { + "epoch": 4.810469314079422, + "grad_norm": 1.3606216007098004, + "learning_rate": 1.0382671480144406e-05, + "loss": 0.1523, + "step": 2665 + }, + { + "epoch": 4.819494584837545, + "grad_norm": 1.2830676853953231, + "learning_rate": 1.036462093862816e-05, + "loss": 0.1468, + "step": 2670 + }, + { + "epoch": 4.828519855595668, + "grad_norm": 1.4025094885250153, + "learning_rate": 1.0346570397111915e-05, + "loss": 0.1531, + "step": 2675 + }, + { + "epoch": 4.837545126353791, + "grad_norm": 1.2538328025809764, + "learning_rate": 1.0328519855595669e-05, + "loss": 0.1511, + "step": 2680 + }, + { + "epoch": 4.846570397111913, + "grad_norm": 1.2447983687959376, + "learning_rate": 1.0310469314079423e-05, + "loss": 0.1482, + "step": 2685 + }, + { + "epoch": 4.855595667870036, + "grad_norm": 1.259843451978081, + "learning_rate": 1.0292418772563177e-05, + "loss": 0.1488, + "step": 2690 + }, + { + "epoch": 4.864620938628159, + "grad_norm": 1.2570447719051847, + "learning_rate": 1.0274368231046932e-05, + "loss": 0.1486, + "step": 2695 + }, + { + "epoch": 4.873646209386282, + "grad_norm": 1.329856783565704, + "learning_rate": 1.0256317689530686e-05, + "loss": 0.1529, + "step": 2700 + }, + { + "epoch": 4.882671480144404, + "grad_norm": 1.2638883131188237, + "learning_rate": 1.023826714801444e-05, + "loss": 0.1491, + "step": 2705 + }, + { + "epoch": 4.891696750902527, + "grad_norm": 1.308091101546035, + "learning_rate": 1.0220216606498197e-05, + "loss": 0.1444, + "step": 2710 + }, + { + "epoch": 4.90072202166065, + "grad_norm": 1.401420044812705, + "learning_rate": 1.0202166064981951e-05, + "loss": 0.1496, + "step": 2715 + }, + { + "epoch": 4.909747292418773, + "grad_norm": 1.1802217133046131, + "learning_rate": 1.0184115523465705e-05, + "loss": 0.1447, + "step": 2720 + }, + { + "epoch": 4.918772563176895, + "grad_norm": 1.418350121725291, + "learning_rate": 1.016606498194946e-05, + "loss": 0.1507, + "step": 2725 + }, + { + "epoch": 4.927797833935018, + "grad_norm": 1.3201116853524673, + "learning_rate": 1.0148014440433214e-05, + "loss": 0.1486, + "step": 2730 + }, + { + "epoch": 4.936823104693141, + "grad_norm": 1.3651415755311056, + "learning_rate": 1.0129963898916968e-05, + "loss": 0.1482, + "step": 2735 + }, + { + "epoch": 4.945848375451264, + "grad_norm": 1.2638304079285558, + "learning_rate": 1.0111913357400722e-05, + "loss": 0.1473, + "step": 2740 + }, + { + "epoch": 4.9548736462093865, + "grad_norm": 1.1922484627616543, + "learning_rate": 1.0093862815884477e-05, + "loss": 0.1515, + "step": 2745 + }, + { + "epoch": 4.963898916967509, + "grad_norm": 1.32172317810071, + "learning_rate": 1.0075812274368233e-05, + "loss": 0.151, + "step": 2750 + }, + { + "epoch": 4.972924187725631, + "grad_norm": 1.3124037260863468, + "learning_rate": 1.0057761732851987e-05, + "loss": 0.1458, + "step": 2755 + }, + { + "epoch": 4.981949458483754, + "grad_norm": 1.308966677769924, + "learning_rate": 1.0039711191335742e-05, + "loss": 0.1466, + "step": 2760 + }, + { + "epoch": 4.990974729241877, + "grad_norm": 1.3394659011449825, + "learning_rate": 1.0021660649819496e-05, + "loss": 0.1488, + "step": 2765 + }, + { + "epoch": 5.0, + "grad_norm": 1.1211076532163786, + "learning_rate": 1.000361010830325e-05, + "loss": 0.1451, + "step": 2770 + }, + { + "epoch": 5.009025270758123, + "grad_norm": 0.8762321663139153, + "learning_rate": 9.985559566787004e-06, + "loss": 0.0838, + "step": 2775 + }, + { + "epoch": 5.018050541516246, + "grad_norm": 1.1947630523435986, + "learning_rate": 9.967509025270759e-06, + "loss": 0.0791, + "step": 2780 + }, + { + "epoch": 5.027075812274369, + "grad_norm": 1.1957279068028621, + "learning_rate": 9.949458483754515e-06, + "loss": 0.0762, + "step": 2785 + }, + { + "epoch": 5.036101083032491, + "grad_norm": 1.0406919344632632, + "learning_rate": 9.931407942238268e-06, + "loss": 0.0789, + "step": 2790 + }, + { + "epoch": 5.0451263537906135, + "grad_norm": 1.0243376437328686, + "learning_rate": 9.913357400722022e-06, + "loss": 0.0782, + "step": 2795 + }, + { + "epoch": 5.054151624548736, + "grad_norm": 1.1064381648099426, + "learning_rate": 9.895306859205776e-06, + "loss": 0.0777, + "step": 2800 + }, + { + "epoch": 5.063176895306859, + "grad_norm": 0.9669535500625326, + "learning_rate": 9.877256317689532e-06, + "loss": 0.0756, + "step": 2805 + }, + { + "epoch": 5.072202166064982, + "grad_norm": 1.0367193436042796, + "learning_rate": 9.859205776173287e-06, + "loss": 0.0759, + "step": 2810 + }, + { + "epoch": 5.081227436823105, + "grad_norm": 1.0786119261971507, + "learning_rate": 9.84115523465704e-06, + "loss": 0.0759, + "step": 2815 + }, + { + "epoch": 5.090252707581228, + "grad_norm": 1.000167422197771, + "learning_rate": 9.823104693140795e-06, + "loss": 0.0785, + "step": 2820 + }, + { + "epoch": 5.09927797833935, + "grad_norm": 0.9542524209315384, + "learning_rate": 9.805054151624548e-06, + "loss": 0.0773, + "step": 2825 + }, + { + "epoch": 5.108303249097473, + "grad_norm": 0.9683260308018407, + "learning_rate": 9.787003610108304e-06, + "loss": 0.0773, + "step": 2830 + }, + { + "epoch": 5.117328519855596, + "grad_norm": 1.0346987353569643, + "learning_rate": 9.768953068592058e-06, + "loss": 0.0748, + "step": 2835 + }, + { + "epoch": 5.126353790613718, + "grad_norm": 1.1793270223432109, + "learning_rate": 9.750902527075813e-06, + "loss": 0.078, + "step": 2840 + }, + { + "epoch": 5.135379061371841, + "grad_norm": 0.8930953181531934, + "learning_rate": 9.732851985559567e-06, + "loss": 0.0772, + "step": 2845 + }, + { + "epoch": 5.144404332129964, + "grad_norm": 1.023338270678483, + "learning_rate": 9.714801444043323e-06, + "loss": 0.0781, + "step": 2850 + }, + { + "epoch": 5.153429602888087, + "grad_norm": 0.946465899056401, + "learning_rate": 9.696750902527076e-06, + "loss": 0.0766, + "step": 2855 + }, + { + "epoch": 5.162454873646209, + "grad_norm": 0.9856919879257939, + "learning_rate": 9.67870036101083e-06, + "loss": 0.078, + "step": 2860 + }, + { + "epoch": 5.171480144404332, + "grad_norm": 0.9817835099066485, + "learning_rate": 9.660649819494586e-06, + "loss": 0.0778, + "step": 2865 + }, + { + "epoch": 5.180505415162455, + "grad_norm": 0.9867957619852264, + "learning_rate": 9.642599277978341e-06, + "loss": 0.0794, + "step": 2870 + }, + { + "epoch": 5.189530685920578, + "grad_norm": 0.9281749841484341, + "learning_rate": 9.624548736462095e-06, + "loss": 0.079, + "step": 2875 + }, + { + "epoch": 5.1985559566787005, + "grad_norm": 1.1418877678658672, + "learning_rate": 9.606498194945849e-06, + "loss": 0.0816, + "step": 2880 + }, + { + "epoch": 5.207581227436823, + "grad_norm": 1.2801357990196476, + "learning_rate": 9.588447653429603e-06, + "loss": 0.0802, + "step": 2885 + }, + { + "epoch": 5.216606498194946, + "grad_norm": 1.08414475669863, + "learning_rate": 9.570397111913358e-06, + "loss": 0.0826, + "step": 2890 + }, + { + "epoch": 5.225631768953068, + "grad_norm": 1.0307284905851641, + "learning_rate": 9.552346570397114e-06, + "loss": 0.082, + "step": 2895 + }, + { + "epoch": 5.234657039711191, + "grad_norm": 1.0409677907783568, + "learning_rate": 9.534296028880868e-06, + "loss": 0.0794, + "step": 2900 + }, + { + "epoch": 5.243682310469314, + "grad_norm": 1.0620698441964218, + "learning_rate": 9.516245487364621e-06, + "loss": 0.0805, + "step": 2905 + }, + { + "epoch": 5.252707581227437, + "grad_norm": 1.074425732291813, + "learning_rate": 9.498194945848375e-06, + "loss": 0.078, + "step": 2910 + }, + { + "epoch": 5.26173285198556, + "grad_norm": 1.026175971232318, + "learning_rate": 9.48014440433213e-06, + "loss": 0.0791, + "step": 2915 + }, + { + "epoch": 5.270758122743683, + "grad_norm": 1.067615156649838, + "learning_rate": 9.462093862815885e-06, + "loss": 0.0801, + "step": 2920 + }, + { + "epoch": 5.2797833935018055, + "grad_norm": 1.0009258120547495, + "learning_rate": 9.44404332129964e-06, + "loss": 0.0799, + "step": 2925 + }, + { + "epoch": 5.2888086642599275, + "grad_norm": 0.997000627917853, + "learning_rate": 9.425992779783394e-06, + "loss": 0.0801, + "step": 2930 + }, + { + "epoch": 5.29783393501805, + "grad_norm": 1.0650332846055963, + "learning_rate": 9.40794223826715e-06, + "loss": 0.083, + "step": 2935 + }, + { + "epoch": 5.306859205776173, + "grad_norm": 1.0505624675010425, + "learning_rate": 9.389891696750903e-06, + "loss": 0.0806, + "step": 2940 + }, + { + "epoch": 5.315884476534296, + "grad_norm": 1.0721048636152144, + "learning_rate": 9.371841155234657e-06, + "loss": 0.0835, + "step": 2945 + }, + { + "epoch": 5.324909747292419, + "grad_norm": 1.133446760779764, + "learning_rate": 9.353790613718413e-06, + "loss": 0.0832, + "step": 2950 + }, + { + "epoch": 5.333935018050542, + "grad_norm": 0.9772797348599754, + "learning_rate": 9.335740072202168e-06, + "loss": 0.0829, + "step": 2955 + }, + { + "epoch": 5.342960288808664, + "grad_norm": 1.1072279559914036, + "learning_rate": 9.317689530685922e-06, + "loss": 0.0849, + "step": 2960 + }, + { + "epoch": 5.351985559566787, + "grad_norm": 1.1628299433020886, + "learning_rate": 9.299638989169676e-06, + "loss": 0.0786, + "step": 2965 + }, + { + "epoch": 5.3610108303249095, + "grad_norm": 1.098783237460958, + "learning_rate": 9.28158844765343e-06, + "loss": 0.0811, + "step": 2970 + }, + { + "epoch": 5.370036101083032, + "grad_norm": 1.0282918449980682, + "learning_rate": 9.263537906137185e-06, + "loss": 0.0807, + "step": 2975 + }, + { + "epoch": 5.379061371841155, + "grad_norm": 0.9904645159012198, + "learning_rate": 9.24548736462094e-06, + "loss": 0.0826, + "step": 2980 + }, + { + "epoch": 5.388086642599278, + "grad_norm": 0.9548744748187918, + "learning_rate": 9.227436823104694e-06, + "loss": 0.0848, + "step": 2985 + }, + { + "epoch": 5.397111913357401, + "grad_norm": 0.9743172447638601, + "learning_rate": 9.209386281588448e-06, + "loss": 0.0811, + "step": 2990 + }, + { + "epoch": 5.406137184115524, + "grad_norm": 1.0785358862799732, + "learning_rate": 9.191335740072202e-06, + "loss": 0.0799, + "step": 2995 + }, + { + "epoch": 5.415162454873646, + "grad_norm": 1.1287010196459963, + "learning_rate": 9.173285198555957e-06, + "loss": 0.0848, + "step": 3000 + }, + { + "epoch": 5.424187725631769, + "grad_norm": 1.2004903366809976, + "learning_rate": 9.155234657039711e-06, + "loss": 0.0829, + "step": 3005 + }, + { + "epoch": 5.433212996389892, + "grad_norm": 1.0193224933306848, + "learning_rate": 9.137184115523467e-06, + "loss": 0.0845, + "step": 3010 + }, + { + "epoch": 5.4422382671480145, + "grad_norm": 1.0644266825718822, + "learning_rate": 9.11913357400722e-06, + "loss": 0.0839, + "step": 3015 + }, + { + "epoch": 5.451263537906137, + "grad_norm": 0.8258848911520923, + "learning_rate": 9.101083032490976e-06, + "loss": 0.0822, + "step": 3020 + }, + { + "epoch": 5.46028880866426, + "grad_norm": 0.9412075089089998, + "learning_rate": 9.08303249097473e-06, + "loss": 0.0796, + "step": 3025 + }, + { + "epoch": 5.469314079422382, + "grad_norm": 1.1565753144937303, + "learning_rate": 9.064981949458484e-06, + "loss": 0.084, + "step": 3030 + }, + { + "epoch": 5.478339350180505, + "grad_norm": 1.131138454580635, + "learning_rate": 9.04693140794224e-06, + "loss": 0.0838, + "step": 3035 + }, + { + "epoch": 5.487364620938628, + "grad_norm": 0.9492827416842319, + "learning_rate": 9.028880866425993e-06, + "loss": 0.0813, + "step": 3040 + }, + { + "epoch": 5.496389891696751, + "grad_norm": 0.9559475828227638, + "learning_rate": 9.010830324909749e-06, + "loss": 0.0816, + "step": 3045 + }, + { + "epoch": 5.505415162454874, + "grad_norm": 0.9227409169217686, + "learning_rate": 8.992779783393502e-06, + "loss": 0.0831, + "step": 3050 + }, + { + "epoch": 5.514440433212997, + "grad_norm": 1.0194412684846976, + "learning_rate": 8.974729241877256e-06, + "loss": 0.0832, + "step": 3055 + }, + { + "epoch": 5.5234657039711195, + "grad_norm": 1.0830453157290356, + "learning_rate": 8.956678700361012e-06, + "loss": 0.0834, + "step": 3060 + }, + { + "epoch": 5.532490974729242, + "grad_norm": 1.2132106015297117, + "learning_rate": 8.938628158844767e-06, + "loss": 0.0851, + "step": 3065 + }, + { + "epoch": 5.541516245487364, + "grad_norm": 1.0126228133692035, + "learning_rate": 8.920577617328521e-06, + "loss": 0.085, + "step": 3070 + }, + { + "epoch": 5.550541516245487, + "grad_norm": 1.2031735728430073, + "learning_rate": 8.902527075812275e-06, + "loss": 0.0822, + "step": 3075 + }, + { + "epoch": 5.55956678700361, + "grad_norm": 1.0701261920334157, + "learning_rate": 8.884476534296029e-06, + "loss": 0.0832, + "step": 3080 + }, + { + "epoch": 5.568592057761733, + "grad_norm": 1.0580816761482588, + "learning_rate": 8.866425992779784e-06, + "loss": 0.0796, + "step": 3085 + }, + { + "epoch": 5.577617328519856, + "grad_norm": 1.1940482533499739, + "learning_rate": 8.84837545126354e-06, + "loss": 0.0811, + "step": 3090 + }, + { + "epoch": 5.586642599277979, + "grad_norm": 1.165235309648367, + "learning_rate": 8.830324909747294e-06, + "loss": 0.0863, + "step": 3095 + }, + { + "epoch": 5.595667870036101, + "grad_norm": 0.9249917161400244, + "learning_rate": 8.812274368231047e-06, + "loss": 0.0826, + "step": 3100 + }, + { + "epoch": 5.6046931407942235, + "grad_norm": 0.9717526930385568, + "learning_rate": 8.794223826714801e-06, + "loss": 0.0831, + "step": 3105 + }, + { + "epoch": 5.613718411552346, + "grad_norm": 0.9703434417026183, + "learning_rate": 8.776173285198557e-06, + "loss": 0.0825, + "step": 3110 + }, + { + "epoch": 5.622743682310469, + "grad_norm": 1.1028319154775599, + "learning_rate": 8.75812274368231e-06, + "loss": 0.0836, + "step": 3115 + }, + { + "epoch": 5.631768953068592, + "grad_norm": 1.0334242813746233, + "learning_rate": 8.740072202166066e-06, + "loss": 0.0839, + "step": 3120 + }, + { + "epoch": 5.640794223826715, + "grad_norm": 0.952976186597617, + "learning_rate": 8.72202166064982e-06, + "loss": 0.0829, + "step": 3125 + }, + { + "epoch": 5.649819494584838, + "grad_norm": 1.1742650529657839, + "learning_rate": 8.703971119133575e-06, + "loss": 0.0851, + "step": 3130 + }, + { + "epoch": 5.658844765342961, + "grad_norm": 1.0818057279178166, + "learning_rate": 8.68592057761733e-06, + "loss": 0.0843, + "step": 3135 + }, + { + "epoch": 5.667870036101083, + "grad_norm": 1.0218551853468192, + "learning_rate": 8.667870036101083e-06, + "loss": 0.0835, + "step": 3140 + }, + { + "epoch": 5.676895306859206, + "grad_norm": 0.9122852593914, + "learning_rate": 8.649819494584839e-06, + "loss": 0.0823, + "step": 3145 + }, + { + "epoch": 5.6859205776173285, + "grad_norm": 0.9335847424506468, + "learning_rate": 8.631768953068594e-06, + "loss": 0.0816, + "step": 3150 + }, + { + "epoch": 5.694945848375451, + "grad_norm": 1.0829119759659693, + "learning_rate": 8.613718411552348e-06, + "loss": 0.082, + "step": 3155 + }, + { + "epoch": 5.703971119133574, + "grad_norm": 0.8918149119424851, + "learning_rate": 8.595667870036102e-06, + "loss": 0.0827, + "step": 3160 + }, + { + "epoch": 5.712996389891697, + "grad_norm": 0.9986993160612925, + "learning_rate": 8.577617328519855e-06, + "loss": 0.0821, + "step": 3165 + }, + { + "epoch": 5.722021660649819, + "grad_norm": 1.0234360957137028, + "learning_rate": 8.559566787003611e-06, + "loss": 0.0835, + "step": 3170 + }, + { + "epoch": 5.731046931407942, + "grad_norm": 1.0744486767713939, + "learning_rate": 8.541516245487366e-06, + "loss": 0.0861, + "step": 3175 + }, + { + "epoch": 5.740072202166065, + "grad_norm": 1.140075564988142, + "learning_rate": 8.52346570397112e-06, + "loss": 0.0869, + "step": 3180 + }, + { + "epoch": 5.749097472924188, + "grad_norm": 1.088662389971363, + "learning_rate": 8.505415162454874e-06, + "loss": 0.0828, + "step": 3185 + }, + { + "epoch": 5.758122743682311, + "grad_norm": 1.0811807303803094, + "learning_rate": 8.487364620938628e-06, + "loss": 0.0847, + "step": 3190 + }, + { + "epoch": 5.7671480144404335, + "grad_norm": 0.9808903181684402, + "learning_rate": 8.469314079422383e-06, + "loss": 0.0814, + "step": 3195 + }, + { + "epoch": 5.776173285198556, + "grad_norm": 0.9528918043651443, + "learning_rate": 8.451263537906137e-06, + "loss": 0.0842, + "step": 3200 + }, + { + "epoch": 5.785198555956678, + "grad_norm": 1.0415470416861194, + "learning_rate": 8.433212996389893e-06, + "loss": 0.0852, + "step": 3205 + }, + { + "epoch": 5.794223826714801, + "grad_norm": 0.973061380539213, + "learning_rate": 8.415162454873647e-06, + "loss": 0.0823, + "step": 3210 + }, + { + "epoch": 5.803249097472924, + "grad_norm": 0.9918845339236286, + "learning_rate": 8.397111913357402e-06, + "loss": 0.0855, + "step": 3215 + }, + { + "epoch": 5.812274368231047, + "grad_norm": 1.0771655009336334, + "learning_rate": 8.379061371841156e-06, + "loss": 0.0834, + "step": 3220 + }, + { + "epoch": 5.82129963898917, + "grad_norm": 1.2069079266806046, + "learning_rate": 8.36101083032491e-06, + "loss": 0.086, + "step": 3225 + }, + { + "epoch": 5.830324909747293, + "grad_norm": 1.1089608320239726, + "learning_rate": 8.342960288808665e-06, + "loss": 0.0853, + "step": 3230 + }, + { + "epoch": 5.8393501805054155, + "grad_norm": 1.0591489304507902, + "learning_rate": 8.324909747292419e-06, + "loss": 0.0851, + "step": 3235 + }, + { + "epoch": 5.8483754512635375, + "grad_norm": 0.9769104219070994, + "learning_rate": 8.306859205776175e-06, + "loss": 0.0854, + "step": 3240 + }, + { + "epoch": 5.85740072202166, + "grad_norm": 1.0564740427808548, + "learning_rate": 8.288808664259928e-06, + "loss": 0.0848, + "step": 3245 + }, + { + "epoch": 5.866425992779783, + "grad_norm": 1.1856181761239502, + "learning_rate": 8.270758122743682e-06, + "loss": 0.0844, + "step": 3250 + }, + { + "epoch": 5.875451263537906, + "grad_norm": 1.1377866343272012, + "learning_rate": 8.252707581227438e-06, + "loss": 0.083, + "step": 3255 + }, + { + "epoch": 5.884476534296029, + "grad_norm": 0.960741526303003, + "learning_rate": 8.234657039711193e-06, + "loss": 0.0866, + "step": 3260 + }, + { + "epoch": 5.893501805054152, + "grad_norm": 1.042073925207735, + "learning_rate": 8.216606498194947e-06, + "loss": 0.0823, + "step": 3265 + }, + { + "epoch": 5.902527075812275, + "grad_norm": 1.1046114797773166, + "learning_rate": 8.198555956678701e-06, + "loss": 0.0849, + "step": 3270 + }, + { + "epoch": 5.911552346570397, + "grad_norm": 0.9693673702377407, + "learning_rate": 8.180505415162455e-06, + "loss": 0.0853, + "step": 3275 + }, + { + "epoch": 5.92057761732852, + "grad_norm": 0.9935728695450117, + "learning_rate": 8.16245487364621e-06, + "loss": 0.0821, + "step": 3280 + }, + { + "epoch": 5.9296028880866425, + "grad_norm": 1.0804926319490278, + "learning_rate": 8.144404332129964e-06, + "loss": 0.0835, + "step": 3285 + }, + { + "epoch": 5.938628158844765, + "grad_norm": 1.064140844986384, + "learning_rate": 8.12635379061372e-06, + "loss": 0.0844, + "step": 3290 + }, + { + "epoch": 5.947653429602888, + "grad_norm": 1.0950234731974577, + "learning_rate": 8.108303249097473e-06, + "loss": 0.0856, + "step": 3295 + }, + { + "epoch": 5.956678700361011, + "grad_norm": 0.8673519333585892, + "learning_rate": 8.090252707581227e-06, + "loss": 0.083, + "step": 3300 + }, + { + "epoch": 5.965703971119133, + "grad_norm": 1.0637539730176666, + "learning_rate": 8.072202166064983e-06, + "loss": 0.0876, + "step": 3305 + }, + { + "epoch": 5.974729241877256, + "grad_norm": 1.1094983157547202, + "learning_rate": 8.054151624548736e-06, + "loss": 0.0824, + "step": 3310 + }, + { + "epoch": 5.983754512635379, + "grad_norm": 1.0059308649225067, + "learning_rate": 8.036101083032492e-06, + "loss": 0.083, + "step": 3315 + }, + { + "epoch": 5.992779783393502, + "grad_norm": 1.0366600463191211, + "learning_rate": 8.018050541516246e-06, + "loss": 0.0823, + "step": 3320 + }, + { + "epoch": 6.001805054151625, + "grad_norm": 0.8077933037227134, + "learning_rate": 8.000000000000001e-06, + "loss": 0.0794, + "step": 3325 + }, + { + "epoch": 6.0108303249097474, + "grad_norm": 0.6607413873878891, + "learning_rate": 7.981949458483755e-06, + "loss": 0.0592, + "step": 3330 + }, + { + "epoch": 6.01985559566787, + "grad_norm": 0.8591048311669046, + "learning_rate": 7.963898916967509e-06, + "loss": 0.0581, + "step": 3335 + }, + { + "epoch": 6.028880866425993, + "grad_norm": 0.70812448877148, + "learning_rate": 7.945848375451264e-06, + "loss": 0.0557, + "step": 3340 + }, + { + "epoch": 6.037906137184115, + "grad_norm": 0.7669486224905381, + "learning_rate": 7.92779783393502e-06, + "loss": 0.0556, + "step": 3345 + }, + { + "epoch": 6.046931407942238, + "grad_norm": 0.8323807855804514, + "learning_rate": 7.909747292418774e-06, + "loss": 0.0556, + "step": 3350 + }, + { + "epoch": 6.055956678700361, + "grad_norm": 0.7520520480227791, + "learning_rate": 7.891696750902528e-06, + "loss": 0.0565, + "step": 3355 + }, + { + "epoch": 6.064981949458484, + "grad_norm": 0.7627341484745689, + "learning_rate": 7.873646209386281e-06, + "loss": 0.0577, + "step": 3360 + }, + { + "epoch": 6.074007220216607, + "grad_norm": 0.6284399763085183, + "learning_rate": 7.855595667870037e-06, + "loss": 0.0559, + "step": 3365 + }, + { + "epoch": 6.0830324909747295, + "grad_norm": 0.6933321046620422, + "learning_rate": 7.83754512635379e-06, + "loss": 0.057, + "step": 3370 + }, + { + "epoch": 6.092057761732852, + "grad_norm": 0.7124947102302089, + "learning_rate": 7.819494584837546e-06, + "loss": 0.057, + "step": 3375 + }, + { + "epoch": 6.101083032490974, + "grad_norm": 0.6774361061625748, + "learning_rate": 7.8014440433213e-06, + "loss": 0.0558, + "step": 3380 + }, + { + "epoch": 6.110108303249097, + "grad_norm": 0.6796571497120569, + "learning_rate": 7.783393501805054e-06, + "loss": 0.0552, + "step": 3385 + }, + { + "epoch": 6.11913357400722, + "grad_norm": 0.7096655971296874, + "learning_rate": 7.76534296028881e-06, + "loss": 0.0566, + "step": 3390 + }, + { + "epoch": 6.128158844765343, + "grad_norm": 0.6734447095516544, + "learning_rate": 7.747292418772563e-06, + "loss": 0.0561, + "step": 3395 + }, + { + "epoch": 6.137184115523466, + "grad_norm": 0.7849122123015055, + "learning_rate": 7.729241877256319e-06, + "loss": 0.0571, + "step": 3400 + }, + { + "epoch": 6.146209386281589, + "grad_norm": 0.8010538023816386, + "learning_rate": 7.711191335740073e-06, + "loss": 0.0559, + "step": 3405 + }, + { + "epoch": 6.155234657039712, + "grad_norm": 0.7974825006602668, + "learning_rate": 7.693140794223828e-06, + "loss": 0.0558, + "step": 3410 + }, + { + "epoch": 6.164259927797834, + "grad_norm": 0.738675744252563, + "learning_rate": 7.675090252707582e-06, + "loss": 0.0589, + "step": 3415 + }, + { + "epoch": 6.1732851985559565, + "grad_norm": 0.7350542546764174, + "learning_rate": 7.657039711191336e-06, + "loss": 0.0579, + "step": 3420 + }, + { + "epoch": 6.182310469314079, + "grad_norm": 0.7619517736710059, + "learning_rate": 7.638989169675091e-06, + "loss": 0.0583, + "step": 3425 + }, + { + "epoch": 6.191335740072202, + "grad_norm": 0.7031956779944963, + "learning_rate": 7.620938628158845e-06, + "loss": 0.0573, + "step": 3430 + }, + { + "epoch": 6.200361010830325, + "grad_norm": 0.7075380742871121, + "learning_rate": 7.6028880866426006e-06, + "loss": 0.0578, + "step": 3435 + }, + { + "epoch": 6.209386281588448, + "grad_norm": 0.7290441002589563, + "learning_rate": 7.584837545126354e-06, + "loss": 0.0575, + "step": 3440 + }, + { + "epoch": 6.21841155234657, + "grad_norm": 0.7207806459527201, + "learning_rate": 7.566787003610109e-06, + "loss": 0.0567, + "step": 3445 + }, + { + "epoch": 6.227436823104693, + "grad_norm": 0.6314466279187362, + "learning_rate": 7.548736462093863e-06, + "loss": 0.0571, + "step": 3450 + }, + { + "epoch": 6.236462093862816, + "grad_norm": 0.6887561278193176, + "learning_rate": 7.530685920577618e-06, + "loss": 0.0593, + "step": 3455 + }, + { + "epoch": 6.245487364620939, + "grad_norm": 0.6206250853652553, + "learning_rate": 7.512635379061373e-06, + "loss": 0.0577, + "step": 3460 + }, + { + "epoch": 6.254512635379061, + "grad_norm": 0.6522866377346127, + "learning_rate": 7.494584837545127e-06, + "loss": 0.0566, + "step": 3465 + }, + { + "epoch": 6.263537906137184, + "grad_norm": 0.8267081288646022, + "learning_rate": 7.4765342960288815e-06, + "loss": 0.058, + "step": 3470 + }, + { + "epoch": 6.272563176895307, + "grad_norm": 0.7317740303239466, + "learning_rate": 7.458483754512636e-06, + "loss": 0.0578, + "step": 3475 + }, + { + "epoch": 6.28158844765343, + "grad_norm": 0.7147086385332849, + "learning_rate": 7.440433212996391e-06, + "loss": 0.0581, + "step": 3480 + }, + { + "epoch": 6.290613718411552, + "grad_norm": 0.8128890990175704, + "learning_rate": 7.422382671480145e-06, + "loss": 0.0582, + "step": 3485 + }, + { + "epoch": 6.299638989169675, + "grad_norm": 0.733774553010267, + "learning_rate": 7.404332129963899e-06, + "loss": 0.0589, + "step": 3490 + }, + { + "epoch": 6.308664259927798, + "grad_norm": 0.6543459576191735, + "learning_rate": 7.386281588447653e-06, + "loss": 0.0586, + "step": 3495 + }, + { + "epoch": 6.317689530685921, + "grad_norm": 0.7327266175801325, + "learning_rate": 7.368231046931409e-06, + "loss": 0.0568, + "step": 3500 + }, + { + "epoch": 6.3267148014440435, + "grad_norm": 0.6332768442841004, + "learning_rate": 7.350180505415163e-06, + "loss": 0.0588, + "step": 3505 + }, + { + "epoch": 6.335740072202166, + "grad_norm": 0.7514342787787303, + "learning_rate": 7.332129963898917e-06, + "loss": 0.0561, + "step": 3510 + }, + { + "epoch": 6.344765342960288, + "grad_norm": 0.9126804192240373, + "learning_rate": 7.314079422382672e-06, + "loss": 0.06, + "step": 3515 + }, + { + "epoch": 6.353790613718411, + "grad_norm": 0.7488940592798972, + "learning_rate": 7.296028880866427e-06, + "loss": 0.0589, + "step": 3520 + }, + { + "epoch": 6.362815884476534, + "grad_norm": 0.7822412626745566, + "learning_rate": 7.277978339350181e-06, + "loss": 0.057, + "step": 3525 + }, + { + "epoch": 6.371841155234657, + "grad_norm": 0.641895512790488, + "learning_rate": 7.259927797833936e-06, + "loss": 0.0566, + "step": 3530 + }, + { + "epoch": 6.38086642599278, + "grad_norm": 0.610560993405513, + "learning_rate": 7.24187725631769e-06, + "loss": 0.0568, + "step": 3535 + }, + { + "epoch": 6.389891696750903, + "grad_norm": 0.6706724032290002, + "learning_rate": 7.223826714801445e-06, + "loss": 0.0606, + "step": 3540 + }, + { + "epoch": 6.398916967509026, + "grad_norm": 0.658753045191665, + "learning_rate": 7.2057761732852e-06, + "loss": 0.0593, + "step": 3545 + }, + { + "epoch": 6.4079422382671485, + "grad_norm": 0.6654660097704281, + "learning_rate": 7.187725631768954e-06, + "loss": 0.0585, + "step": 3550 + }, + { + "epoch": 6.4169675090252705, + "grad_norm": 0.7183651068540791, + "learning_rate": 7.169675090252708e-06, + "loss": 0.0583, + "step": 3555 + }, + { + "epoch": 6.425992779783393, + "grad_norm": 0.8088470667398641, + "learning_rate": 7.151624548736462e-06, + "loss": 0.0588, + "step": 3560 + }, + { + "epoch": 6.435018050541516, + "grad_norm": 0.8263381826886101, + "learning_rate": 7.133574007220218e-06, + "loss": 0.0593, + "step": 3565 + }, + { + "epoch": 6.444043321299639, + "grad_norm": 0.7405826430937665, + "learning_rate": 7.115523465703971e-06, + "loss": 0.0578, + "step": 3570 + }, + { + "epoch": 6.453068592057762, + "grad_norm": 0.7185281820450516, + "learning_rate": 7.097472924187726e-06, + "loss": 0.0579, + "step": 3575 + }, + { + "epoch": 6.462093862815885, + "grad_norm": 0.7382936760799543, + "learning_rate": 7.07942238267148e-06, + "loss": 0.0575, + "step": 3580 + }, + { + "epoch": 6.471119133574007, + "grad_norm": 0.6877004421197578, + "learning_rate": 7.061371841155235e-06, + "loss": 0.0573, + "step": 3585 + }, + { + "epoch": 6.48014440433213, + "grad_norm": 0.6861100161163733, + "learning_rate": 7.04332129963899e-06, + "loss": 0.0599, + "step": 3590 + }, + { + "epoch": 6.4891696750902526, + "grad_norm": 0.6794004024109824, + "learning_rate": 7.025270758122744e-06, + "loss": 0.059, + "step": 3595 + }, + { + "epoch": 6.498194945848375, + "grad_norm": 0.8104554628255708, + "learning_rate": 7.0072202166064985e-06, + "loss": 0.0587, + "step": 3600 + }, + { + "epoch": 6.507220216606498, + "grad_norm": 0.8305032099557856, + "learning_rate": 6.989169675090254e-06, + "loss": 0.0585, + "step": 3605 + }, + { + "epoch": 6.516245487364621, + "grad_norm": 0.8023646626140253, + "learning_rate": 6.971119133574008e-06, + "loss": 0.0595, + "step": 3610 + }, + { + "epoch": 6.525270758122744, + "grad_norm": 0.765073355799892, + "learning_rate": 6.9530685920577625e-06, + "loss": 0.0584, + "step": 3615 + }, + { + "epoch": 6.534296028880867, + "grad_norm": 0.7404552154431031, + "learning_rate": 6.935018050541516e-06, + "loss": 0.0577, + "step": 3620 + }, + { + "epoch": 6.543321299638989, + "grad_norm": 0.7405781286739906, + "learning_rate": 6.916967509025271e-06, + "loss": 0.0574, + "step": 3625 + }, + { + "epoch": 6.552346570397112, + "grad_norm": 0.7793217854667968, + "learning_rate": 6.8989169675090265e-06, + "loss": 0.0605, + "step": 3630 + }, + { + "epoch": 6.561371841155235, + "grad_norm": 0.5845887643829276, + "learning_rate": 6.88086642599278e-06, + "loss": 0.0599, + "step": 3635 + }, + { + "epoch": 6.5703971119133575, + "grad_norm": 0.7145137796672513, + "learning_rate": 6.862815884476535e-06, + "loss": 0.0585, + "step": 3640 + }, + { + "epoch": 6.57942238267148, + "grad_norm": 0.7417399070524405, + "learning_rate": 6.844765342960289e-06, + "loss": 0.0587, + "step": 3645 + }, + { + "epoch": 6.588447653429603, + "grad_norm": 0.6846077356552561, + "learning_rate": 6.826714801444044e-06, + "loss": 0.0588, + "step": 3650 + }, + { + "epoch": 6.597472924187725, + "grad_norm": 0.6999561458466752, + "learning_rate": 6.808664259927798e-06, + "loss": 0.0595, + "step": 3655 + }, + { + "epoch": 6.606498194945848, + "grad_norm": 0.6567498870286701, + "learning_rate": 6.790613718411553e-06, + "loss": 0.0581, + "step": 3660 + }, + { + "epoch": 6.615523465703971, + "grad_norm": 0.7350564747052517, + "learning_rate": 6.7725631768953075e-06, + "loss": 0.0596, + "step": 3665 + }, + { + "epoch": 6.624548736462094, + "grad_norm": 0.6574574010183412, + "learning_rate": 6.754512635379062e-06, + "loss": 0.0584, + "step": 3670 + }, + { + "epoch": 6.633574007220217, + "grad_norm": 0.754103084530502, + "learning_rate": 6.736462093862817e-06, + "loss": 0.0597, + "step": 3675 + }, + { + "epoch": 6.64259927797834, + "grad_norm": 0.6204823250912029, + "learning_rate": 6.718411552346571e-06, + "loss": 0.0594, + "step": 3680 + }, + { + "epoch": 6.6516245487364625, + "grad_norm": 0.6526250315655968, + "learning_rate": 6.700361010830325e-06, + "loss": 0.0586, + "step": 3685 + }, + { + "epoch": 6.6606498194945845, + "grad_norm": 0.6759108282695475, + "learning_rate": 6.682310469314079e-06, + "loss": 0.0598, + "step": 3690 + }, + { + "epoch": 6.669675090252707, + "grad_norm": 0.6281015940025027, + "learning_rate": 6.664259927797835e-06, + "loss": 0.0583, + "step": 3695 + }, + { + "epoch": 6.67870036101083, + "grad_norm": 0.8112328410048235, + "learning_rate": 6.646209386281589e-06, + "loss": 0.0581, + "step": 3700 + }, + { + "epoch": 6.687725631768953, + "grad_norm": 0.8294177729044503, + "learning_rate": 6.628158844765343e-06, + "loss": 0.0588, + "step": 3705 + }, + { + "epoch": 6.696750902527076, + "grad_norm": 0.6837556241925122, + "learning_rate": 6.610108303249098e-06, + "loss": 0.0584, + "step": 3710 + }, + { + "epoch": 6.705776173285199, + "grad_norm": 0.6977698116292954, + "learning_rate": 6.592057761732853e-06, + "loss": 0.0607, + "step": 3715 + }, + { + "epoch": 6.714801444043322, + "grad_norm": 0.8094312645469506, + "learning_rate": 6.574007220216607e-06, + "loss": 0.0583, + "step": 3720 + }, + { + "epoch": 6.723826714801444, + "grad_norm": 0.6015350505674695, + "learning_rate": 6.555956678700362e-06, + "loss": 0.0584, + "step": 3725 + }, + { + "epoch": 6.7328519855595665, + "grad_norm": 0.7144030665488617, + "learning_rate": 6.5379061371841156e-06, + "loss": 0.0586, + "step": 3730 + }, + { + "epoch": 6.741877256317689, + "grad_norm": 0.7376393032962922, + "learning_rate": 6.519855595667871e-06, + "loss": 0.0591, + "step": 3735 + }, + { + "epoch": 6.750902527075812, + "grad_norm": 0.8824089248862236, + "learning_rate": 6.501805054151626e-06, + "loss": 0.0613, + "step": 3740 + }, + { + "epoch": 6.759927797833935, + "grad_norm": 0.6572712036921975, + "learning_rate": 6.4837545126353796e-06, + "loss": 0.0584, + "step": 3745 + }, + { + "epoch": 6.768953068592058, + "grad_norm": 0.6525445173798352, + "learning_rate": 6.465703971119134e-06, + "loss": 0.0599, + "step": 3750 + }, + { + "epoch": 6.777978339350181, + "grad_norm": 0.7744924607467405, + "learning_rate": 6.447653429602888e-06, + "loss": 0.0603, + "step": 3755 + }, + { + "epoch": 6.787003610108303, + "grad_norm": 0.7489114246161626, + "learning_rate": 6.4296028880866436e-06, + "loss": 0.0593, + "step": 3760 + }, + { + "epoch": 6.796028880866426, + "grad_norm": 0.645441657942143, + "learning_rate": 6.411552346570397e-06, + "loss": 0.0601, + "step": 3765 + }, + { + "epoch": 6.805054151624549, + "grad_norm": 0.7628853557562308, + "learning_rate": 6.393501805054152e-06, + "loss": 0.059, + "step": 3770 + }, + { + "epoch": 6.8140794223826715, + "grad_norm": 0.6961794188939842, + "learning_rate": 6.375451263537906e-06, + "loss": 0.0589, + "step": 3775 + }, + { + "epoch": 6.823104693140794, + "grad_norm": 0.732051818736521, + "learning_rate": 6.357400722021661e-06, + "loss": 0.059, + "step": 3780 + }, + { + "epoch": 6.832129963898917, + "grad_norm": 0.6559311687860191, + "learning_rate": 6.339350180505416e-06, + "loss": 0.0583, + "step": 3785 + }, + { + "epoch": 6.841155234657039, + "grad_norm": 0.6593139916118618, + "learning_rate": 6.32129963898917e-06, + "loss": 0.0584, + "step": 3790 + }, + { + "epoch": 6.850180505415162, + "grad_norm": 0.6976834804145813, + "learning_rate": 6.3032490974729245e-06, + "loss": 0.0592, + "step": 3795 + }, + { + "epoch": 6.859205776173285, + "grad_norm": 0.9247025540887917, + "learning_rate": 6.28519855595668e-06, + "loss": 0.0579, + "step": 3800 + }, + { + "epoch": 6.868231046931408, + "grad_norm": 0.8326401902247998, + "learning_rate": 6.267148014440434e-06, + "loss": 0.0604, + "step": 3805 + }, + { + "epoch": 6.877256317689531, + "grad_norm": 0.5853865156076931, + "learning_rate": 6.2490974729241885e-06, + "loss": 0.0594, + "step": 3810 + }, + { + "epoch": 6.886281588447654, + "grad_norm": 0.6292032096557062, + "learning_rate": 6.231046931407942e-06, + "loss": 0.0573, + "step": 3815 + }, + { + "epoch": 6.8953068592057765, + "grad_norm": 0.7744076548642933, + "learning_rate": 6.212996389891697e-06, + "loss": 0.0607, + "step": 3820 + }, + { + "epoch": 6.904332129963899, + "grad_norm": 0.7914463992742894, + "learning_rate": 6.1949458483754525e-06, + "loss": 0.0606, + "step": 3825 + }, + { + "epoch": 6.913357400722021, + "grad_norm": 0.701945083370995, + "learning_rate": 6.176895306859206e-06, + "loss": 0.0587, + "step": 3830 + }, + { + "epoch": 6.922382671480144, + "grad_norm": 0.6696484276570565, + "learning_rate": 6.158844765342961e-06, + "loss": 0.0608, + "step": 3835 + }, + { + "epoch": 6.931407942238267, + "grad_norm": 0.7132432488001278, + "learning_rate": 6.140794223826715e-06, + "loss": 0.0576, + "step": 3840 + }, + { + "epoch": 6.94043321299639, + "grad_norm": 0.6745043956786805, + "learning_rate": 6.12274368231047e-06, + "loss": 0.0597, + "step": 3845 + }, + { + "epoch": 6.949458483754513, + "grad_norm": 0.7735109912206191, + "learning_rate": 6.104693140794224e-06, + "loss": 0.058, + "step": 3850 + }, + { + "epoch": 6.958483754512636, + "grad_norm": 0.8204235477419153, + "learning_rate": 6.086642599277979e-06, + "loss": 0.0611, + "step": 3855 + }, + { + "epoch": 6.967509025270758, + "grad_norm": 0.6940007350939205, + "learning_rate": 6.068592057761733e-06, + "loss": 0.0572, + "step": 3860 + }, + { + "epoch": 6.9765342960288805, + "grad_norm": 0.7293292682342528, + "learning_rate": 6.050541516245488e-06, + "loss": 0.0595, + "step": 3865 + }, + { + "epoch": 6.985559566787003, + "grad_norm": 0.7122810615357342, + "learning_rate": 6.032490974729243e-06, + "loss": 0.0581, + "step": 3870 + }, + { + "epoch": 6.994584837545126, + "grad_norm": 0.6537994119990527, + "learning_rate": 6.014440433212997e-06, + "loss": 0.0598, + "step": 3875 + }, + { + "epoch": 7.003610108303249, + "grad_norm": 0.45575554548627367, + "learning_rate": 5.996389891696751e-06, + "loss": 0.0554, + "step": 3880 + }, + { + "epoch": 7.012635379061372, + "grad_norm": 0.5070177525024102, + "learning_rate": 5.978339350180505e-06, + "loss": 0.0472, + "step": 3885 + }, + { + "epoch": 7.021660649819495, + "grad_norm": 0.5588320341624766, + "learning_rate": 5.960288808664261e-06, + "loss": 0.0476, + "step": 3890 + }, + { + "epoch": 7.030685920577618, + "grad_norm": 0.4730456078169544, + "learning_rate": 5.942238267148015e-06, + "loss": 0.0465, + "step": 3895 + }, + { + "epoch": 7.03971119133574, + "grad_norm": 0.4802501819942531, + "learning_rate": 5.924187725631769e-06, + "loss": 0.0467, + "step": 3900 + }, + { + "epoch": 7.048736462093863, + "grad_norm": 0.43376711913832905, + "learning_rate": 5.906137184115524e-06, + "loss": 0.046, + "step": 3905 + }, + { + "epoch": 7.0577617328519855, + "grad_norm": 0.5467463303002804, + "learning_rate": 5.888086642599279e-06, + "loss": 0.0472, + "step": 3910 + }, + { + "epoch": 7.066787003610108, + "grad_norm": 0.5482465563214539, + "learning_rate": 5.870036101083033e-06, + "loss": 0.0484, + "step": 3915 + }, + { + "epoch": 7.075812274368231, + "grad_norm": 0.47912292874862494, + "learning_rate": 5.851985559566788e-06, + "loss": 0.0464, + "step": 3920 + }, + { + "epoch": 7.084837545126354, + "grad_norm": 0.5614426575228361, + "learning_rate": 5.8339350180505415e-06, + "loss": 0.0473, + "step": 3925 + }, + { + "epoch": 7.093862815884476, + "grad_norm": 0.45529148018104104, + "learning_rate": 5.815884476534297e-06, + "loss": 0.0475, + "step": 3930 + }, + { + "epoch": 7.102888086642599, + "grad_norm": 0.4788344545919543, + "learning_rate": 5.797833935018051e-06, + "loss": 0.0479, + "step": 3935 + }, + { + "epoch": 7.111913357400722, + "grad_norm": 0.46173140924255596, + "learning_rate": 5.7797833935018055e-06, + "loss": 0.0473, + "step": 3940 + }, + { + "epoch": 7.120938628158845, + "grad_norm": 0.5390906257209109, + "learning_rate": 5.761732851985559e-06, + "loss": 0.0485, + "step": 3945 + }, + { + "epoch": 7.129963898916968, + "grad_norm": 0.5227509249629044, + "learning_rate": 5.743682310469314e-06, + "loss": 0.0467, + "step": 3950 + }, + { + "epoch": 7.1389891696750905, + "grad_norm": 0.48947576478003363, + "learning_rate": 5.7256317689530695e-06, + "loss": 0.0472, + "step": 3955 + }, + { + "epoch": 7.148014440433213, + "grad_norm": 0.49244408024963654, + "learning_rate": 5.707581227436823e-06, + "loss": 0.0483, + "step": 3960 + }, + { + "epoch": 7.157039711191336, + "grad_norm": 0.5222815328881174, + "learning_rate": 5.689530685920578e-06, + "loss": 0.048, + "step": 3965 + }, + { + "epoch": 7.166064981949458, + "grad_norm": 0.5817714524678111, + "learning_rate": 5.671480144404332e-06, + "loss": 0.0473, + "step": 3970 + }, + { + "epoch": 7.175090252707581, + "grad_norm": 0.5925655806700871, + "learning_rate": 5.653429602888087e-06, + "loss": 0.0497, + "step": 3975 + }, + { + "epoch": 7.184115523465704, + "grad_norm": 0.49153223210393315, + "learning_rate": 5.635379061371842e-06, + "loss": 0.0488, + "step": 3980 + }, + { + "epoch": 7.193140794223827, + "grad_norm": 0.47623000392936515, + "learning_rate": 5.617328519855596e-06, + "loss": 0.0483, + "step": 3985 + }, + { + "epoch": 7.20216606498195, + "grad_norm": 0.5429876236829767, + "learning_rate": 5.5992779783393505e-06, + "loss": 0.0475, + "step": 3990 + }, + { + "epoch": 7.2111913357400725, + "grad_norm": 0.5392076103305925, + "learning_rate": 5.581227436823106e-06, + "loss": 0.0473, + "step": 3995 + }, + { + "epoch": 7.2202166064981945, + "grad_norm": 0.4442145984166065, + "learning_rate": 5.56317689530686e-06, + "loss": 0.0477, + "step": 4000 + }, + { + "epoch": 7.229241877256317, + "grad_norm": 0.4474040313759183, + "learning_rate": 5.5451263537906145e-06, + "loss": 0.0478, + "step": 4005 + }, + { + "epoch": 7.23826714801444, + "grad_norm": 0.49719158373571726, + "learning_rate": 5.527075812274368e-06, + "loss": 0.0471, + "step": 4010 + }, + { + "epoch": 7.247292418772563, + "grad_norm": 0.5110277695420385, + "learning_rate": 5.509025270758123e-06, + "loss": 0.0472, + "step": 4015 + }, + { + "epoch": 7.256317689530686, + "grad_norm": 0.551234875347394, + "learning_rate": 5.490974729241878e-06, + "loss": 0.0482, + "step": 4020 + }, + { + "epoch": 7.265342960288809, + "grad_norm": 0.5164595334266263, + "learning_rate": 5.472924187725632e-06, + "loss": 0.0483, + "step": 4025 + }, + { + "epoch": 7.274368231046932, + "grad_norm": 0.5120155623092252, + "learning_rate": 5.454873646209387e-06, + "loss": 0.0473, + "step": 4030 + }, + { + "epoch": 7.283393501805054, + "grad_norm": 0.7138533885415025, + "learning_rate": 5.436823104693141e-06, + "loss": 0.0491, + "step": 4035 + }, + { + "epoch": 7.292418772563177, + "grad_norm": 0.5256186511567283, + "learning_rate": 5.418772563176896e-06, + "loss": 0.0482, + "step": 4040 + }, + { + "epoch": 7.3014440433212995, + "grad_norm": 0.5308368446047028, + "learning_rate": 5.40072202166065e-06, + "loss": 0.0483, + "step": 4045 + }, + { + "epoch": 7.310469314079422, + "grad_norm": 0.6340076056397649, + "learning_rate": 5.382671480144405e-06, + "loss": 0.0497, + "step": 4050 + }, + { + "epoch": 7.319494584837545, + "grad_norm": 0.5515989056962669, + "learning_rate": 5.3646209386281586e-06, + "loss": 0.0485, + "step": 4055 + }, + { + "epoch": 7.328519855595668, + "grad_norm": 0.5670979301767023, + "learning_rate": 5.346570397111914e-06, + "loss": 0.049, + "step": 4060 + }, + { + "epoch": 7.337545126353791, + "grad_norm": 0.5155332846058096, + "learning_rate": 5.328519855595669e-06, + "loss": 0.0491, + "step": 4065 + }, + { + "epoch": 7.346570397111913, + "grad_norm": 0.4824925439743894, + "learning_rate": 5.3104693140794226e-06, + "loss": 0.0478, + "step": 4070 + }, + { + "epoch": 7.355595667870036, + "grad_norm": 0.5632294248317786, + "learning_rate": 5.292418772563177e-06, + "loss": 0.0487, + "step": 4075 + }, + { + "epoch": 7.364620938628159, + "grad_norm": 0.5912408794489219, + "learning_rate": 5.274368231046931e-06, + "loss": 0.0494, + "step": 4080 + }, + { + "epoch": 7.373646209386282, + "grad_norm": 0.6610618876210643, + "learning_rate": 5.2563176895306866e-06, + "loss": 0.0484, + "step": 4085 + }, + { + "epoch": 7.382671480144404, + "grad_norm": 0.5715318430573592, + "learning_rate": 5.238267148014441e-06, + "loss": 0.0484, + "step": 4090 + }, + { + "epoch": 7.391696750902527, + "grad_norm": 0.5661862605906709, + "learning_rate": 5.220216606498195e-06, + "loss": 0.0486, + "step": 4095 + }, + { + "epoch": 7.40072202166065, + "grad_norm": 0.9029651147919706, + "learning_rate": 5.20216606498195e-06, + "loss": 0.0508, + "step": 4100 + }, + { + "epoch": 7.409747292418772, + "grad_norm": 0.5165409614311138, + "learning_rate": 5.184115523465705e-06, + "loss": 0.0495, + "step": 4105 + }, + { + "epoch": 7.418772563176895, + "grad_norm": 0.5306583449154315, + "learning_rate": 5.166064981949459e-06, + "loss": 0.0491, + "step": 4110 + }, + { + "epoch": 7.427797833935018, + "grad_norm": 0.450162478445713, + "learning_rate": 5.148014440433214e-06, + "loss": 0.0481, + "step": 4115 + }, + { + "epoch": 7.436823104693141, + "grad_norm": 0.5832168434645013, + "learning_rate": 5.1299638989169675e-06, + "loss": 0.0484, + "step": 4120 + }, + { + "epoch": 7.445848375451264, + "grad_norm": 0.6015879512932737, + "learning_rate": 5.111913357400723e-06, + "loss": 0.0483, + "step": 4125 + }, + { + "epoch": 7.4548736462093865, + "grad_norm": 0.516158408533585, + "learning_rate": 5.093862815884477e-06, + "loss": 0.0502, + "step": 4130 + }, + { + "epoch": 7.463898916967509, + "grad_norm": 0.5739166956674056, + "learning_rate": 5.0758122743682315e-06, + "loss": 0.049, + "step": 4135 + }, + { + "epoch": 7.472924187725631, + "grad_norm": 0.5545335828483756, + "learning_rate": 5.057761732851985e-06, + "loss": 0.0493, + "step": 4140 + }, + { + "epoch": 7.481949458483754, + "grad_norm": 0.47988775413231505, + "learning_rate": 5.03971119133574e-06, + "loss": 0.0481, + "step": 4145 + }, + { + "epoch": 7.490974729241877, + "grad_norm": 0.4135154941397547, + "learning_rate": 5.0216606498194955e-06, + "loss": 0.0485, + "step": 4150 + }, + { + "epoch": 7.5, + "grad_norm": 0.584778293406958, + "learning_rate": 5.003610108303249e-06, + "loss": 0.0496, + "step": 4155 + }, + { + "epoch": 7.509025270758123, + "grad_norm": 0.6596870903880235, + "learning_rate": 4.985559566787004e-06, + "loss": 0.0509, + "step": 4160 + }, + { + "epoch": 7.518050541516246, + "grad_norm": 0.5258493182778312, + "learning_rate": 4.967509025270759e-06, + "loss": 0.0479, + "step": 4165 + }, + { + "epoch": 7.527075812274369, + "grad_norm": 0.5240346323436935, + "learning_rate": 4.949458483754513e-06, + "loss": 0.0486, + "step": 4170 + }, + { + "epoch": 7.536101083032491, + "grad_norm": 0.5698373146810196, + "learning_rate": 4.931407942238268e-06, + "loss": 0.0498, + "step": 4175 + }, + { + "epoch": 7.5451263537906135, + "grad_norm": 0.5168274629904006, + "learning_rate": 4.913357400722022e-06, + "loss": 0.0484, + "step": 4180 + }, + { + "epoch": 7.554151624548736, + "grad_norm": 0.4967245847978526, + "learning_rate": 4.8953068592057764e-06, + "loss": 0.0494, + "step": 4185 + }, + { + "epoch": 7.563176895306859, + "grad_norm": 0.5296559595656957, + "learning_rate": 4.877256317689531e-06, + "loss": 0.0487, + "step": 4190 + }, + { + "epoch": 7.572202166064982, + "grad_norm": 0.4681797361268003, + "learning_rate": 4.859205776173286e-06, + "loss": 0.0492, + "step": 4195 + }, + { + "epoch": 7.581227436823105, + "grad_norm": 0.5837080132512812, + "learning_rate": 4.8411552346570404e-06, + "loss": 0.0505, + "step": 4200 + }, + { + "epoch": 7.590252707581227, + "grad_norm": 0.6465877829891032, + "learning_rate": 4.823104693140795e-06, + "loss": 0.0491, + "step": 4205 + }, + { + "epoch": 7.59927797833935, + "grad_norm": 0.5542060145314779, + "learning_rate": 4.805054151624549e-06, + "loss": 0.0502, + "step": 4210 + }, + { + "epoch": 7.608303249097473, + "grad_norm": 0.544169659697146, + "learning_rate": 4.787003610108304e-06, + "loss": 0.049, + "step": 4215 + }, + { + "epoch": 7.617328519855596, + "grad_norm": 0.4766240683764691, + "learning_rate": 4.768953068592058e-06, + "loss": 0.0488, + "step": 4220 + }, + { + "epoch": 7.626353790613718, + "grad_norm": 0.5857310047242031, + "learning_rate": 4.750902527075812e-06, + "loss": 0.0484, + "step": 4225 + }, + { + "epoch": 7.635379061371841, + "grad_norm": 0.6584297092461749, + "learning_rate": 4.7328519855595676e-06, + "loss": 0.0504, + "step": 4230 + }, + { + "epoch": 7.644404332129964, + "grad_norm": 0.6165622003595669, + "learning_rate": 4.714801444043321e-06, + "loss": 0.0497, + "step": 4235 + }, + { + "epoch": 7.653429602888087, + "grad_norm": 0.5411935348913237, + "learning_rate": 4.696750902527076e-06, + "loss": 0.0493, + "step": 4240 + }, + { + "epoch": 7.662454873646209, + "grad_norm": 0.5757635354988099, + "learning_rate": 4.678700361010831e-06, + "loss": 0.0487, + "step": 4245 + }, + { + "epoch": 7.671480144404332, + "grad_norm": 0.4789345734543781, + "learning_rate": 4.660649819494585e-06, + "loss": 0.0493, + "step": 4250 + }, + { + "epoch": 7.680505415162455, + "grad_norm": 0.5729021031172669, + "learning_rate": 4.64259927797834e-06, + "loss": 0.0494, + "step": 4255 + }, + { + "epoch": 7.689530685920578, + "grad_norm": 0.5131076676913804, + "learning_rate": 4.624548736462095e-06, + "loss": 0.0486, + "step": 4260 + }, + { + "epoch": 7.6985559566787005, + "grad_norm": 0.5950116178035108, + "learning_rate": 4.6064981949458485e-06, + "loss": 0.0492, + "step": 4265 + }, + { + "epoch": 7.707581227436823, + "grad_norm": 0.5524155549870725, + "learning_rate": 4.588447653429603e-06, + "loss": 0.049, + "step": 4270 + }, + { + "epoch": 7.716606498194945, + "grad_norm": 0.5270091994399845, + "learning_rate": 4.570397111913358e-06, + "loss": 0.0488, + "step": 4275 + }, + { + "epoch": 7.725631768953068, + "grad_norm": 0.46706887147243975, + "learning_rate": 4.552346570397112e-06, + "loss": 0.0499, + "step": 4280 + }, + { + "epoch": 7.734657039711191, + "grad_norm": 0.598860629124346, + "learning_rate": 4.534296028880867e-06, + "loss": 0.049, + "step": 4285 + }, + { + "epoch": 7.743682310469314, + "grad_norm": 0.4792268975334193, + "learning_rate": 4.516245487364621e-06, + "loss": 0.0504, + "step": 4290 + }, + { + "epoch": 7.752707581227437, + "grad_norm": 0.5423376274444353, + "learning_rate": 4.498194945848376e-06, + "loss": 0.0487, + "step": 4295 + }, + { + "epoch": 7.76173285198556, + "grad_norm": 0.5483231957576472, + "learning_rate": 4.48014440433213e-06, + "loss": 0.0492, + "step": 4300 + }, + { + "epoch": 7.770758122743683, + "grad_norm": 0.5272610680544871, + "learning_rate": 4.462093862815885e-06, + "loss": 0.0498, + "step": 4305 + }, + { + "epoch": 7.7797833935018055, + "grad_norm": 0.5746187590999563, + "learning_rate": 4.444043321299639e-06, + "loss": 0.0476, + "step": 4310 + }, + { + "epoch": 7.7888086642599275, + "grad_norm": 0.5034631208906547, + "learning_rate": 4.425992779783394e-06, + "loss": 0.0499, + "step": 4315 + }, + { + "epoch": 7.79783393501805, + "grad_norm": 0.4802523837756993, + "learning_rate": 4.407942238267148e-06, + "loss": 0.0491, + "step": 4320 + }, + { + "epoch": 7.806859205776173, + "grad_norm": 0.5995873657958978, + "learning_rate": 4.389891696750903e-06, + "loss": 0.0503, + "step": 4325 + }, + { + "epoch": 7.815884476534296, + "grad_norm": 0.5877452980751235, + "learning_rate": 4.3718411552346575e-06, + "loss": 0.0484, + "step": 4330 + }, + { + "epoch": 7.824909747292419, + "grad_norm": 0.5912813010558485, + "learning_rate": 4.353790613718412e-06, + "loss": 0.0506, + "step": 4335 + }, + { + "epoch": 7.833935018050542, + "grad_norm": 0.5241916120291473, + "learning_rate": 4.335740072202167e-06, + "loss": 0.0498, + "step": 4340 + }, + { + "epoch": 7.842960288808664, + "grad_norm": 0.6235491325166662, + "learning_rate": 4.317689530685921e-06, + "loss": 0.0493, + "step": 4345 + }, + { + "epoch": 7.851985559566787, + "grad_norm": 0.4918990115478395, + "learning_rate": 4.299638989169675e-06, + "loss": 0.0488, + "step": 4350 + }, + { + "epoch": 7.8610108303249095, + "grad_norm": 0.5015566746050467, + "learning_rate": 4.28158844765343e-06, + "loss": 0.0492, + "step": 4355 + }, + { + "epoch": 7.870036101083032, + "grad_norm": 0.5897991332222726, + "learning_rate": 4.263537906137185e-06, + "loss": 0.0499, + "step": 4360 + }, + { + "epoch": 7.879061371841155, + "grad_norm": 0.5044541687620931, + "learning_rate": 4.245487364620938e-06, + "loss": 0.0496, + "step": 4365 + }, + { + "epoch": 7.888086642599278, + "grad_norm": 0.5298007896462054, + "learning_rate": 4.227436823104694e-06, + "loss": 0.0499, + "step": 4370 + }, + { + "epoch": 7.897111913357401, + "grad_norm": 0.6508152344907792, + "learning_rate": 4.209386281588448e-06, + "loss": 0.0499, + "step": 4375 + }, + { + "epoch": 7.906137184115524, + "grad_norm": 0.4952820870375711, + "learning_rate": 4.191335740072202e-06, + "loss": 0.0486, + "step": 4380 + }, + { + "epoch": 7.915162454873646, + "grad_norm": 0.534088933198461, + "learning_rate": 4.173285198555957e-06, + "loss": 0.0497, + "step": 4385 + }, + { + "epoch": 7.924187725631769, + "grad_norm": 0.5456898452361849, + "learning_rate": 4.155234657039712e-06, + "loss": 0.0498, + "step": 4390 + }, + { + "epoch": 7.933212996389892, + "grad_norm": 0.5206741989861254, + "learning_rate": 4.137184115523466e-06, + "loss": 0.0503, + "step": 4395 + }, + { + "epoch": 7.9422382671480145, + "grad_norm": 0.5130632228899183, + "learning_rate": 4.119133574007221e-06, + "loss": 0.0503, + "step": 4400 + }, + { + "epoch": 7.951263537906137, + "grad_norm": 0.5076154969319394, + "learning_rate": 4.101083032490975e-06, + "loss": 0.0497, + "step": 4405 + }, + { + "epoch": 7.96028880866426, + "grad_norm": 0.4532751901805038, + "learning_rate": 4.0830324909747296e-06, + "loss": 0.0486, + "step": 4410 + }, + { + "epoch": 7.969314079422382, + "grad_norm": 0.5569018909973789, + "learning_rate": 4.064981949458484e-06, + "loss": 0.0495, + "step": 4415 + }, + { + "epoch": 7.978339350180505, + "grad_norm": 0.5691379588931518, + "learning_rate": 4.046931407942238e-06, + "loss": 0.0491, + "step": 4420 + }, + { + "epoch": 7.987364620938628, + "grad_norm": 0.5087632373984906, + "learning_rate": 4.0288808664259935e-06, + "loss": 0.0507, + "step": 4425 + }, + { + "epoch": 7.996389891696751, + "grad_norm": 0.5438187646707812, + "learning_rate": 4.010830324909747e-06, + "loss": 0.0509, + "step": 4430 + }, + { + "epoch": 8.005415162454874, + "grad_norm": 0.46810462572343003, + "learning_rate": 3.992779783393502e-06, + "loss": 0.0467, + "step": 4435 + }, + { + "epoch": 8.014440433212997, + "grad_norm": 0.5110390059676326, + "learning_rate": 3.974729241877257e-06, + "loss": 0.0439, + "step": 4440 + }, + { + "epoch": 8.02346570397112, + "grad_norm": 0.5350833689826751, + "learning_rate": 3.956678700361011e-06, + "loss": 0.0443, + "step": 4445 + }, + { + "epoch": 8.032490974729242, + "grad_norm": 0.5246123933567166, + "learning_rate": 3.938628158844765e-06, + "loss": 0.0436, + "step": 4450 + }, + { + "epoch": 8.041516245487365, + "grad_norm": 0.5240110675258801, + "learning_rate": 3.920577617328521e-06, + "loss": 0.0438, + "step": 4455 + }, + { + "epoch": 8.050541516245488, + "grad_norm": 0.4508238887199392, + "learning_rate": 3.9025270758122745e-06, + "loss": 0.0434, + "step": 4460 + }, + { + "epoch": 8.059566787003611, + "grad_norm": 0.4587378398564632, + "learning_rate": 3.884476534296029e-06, + "loss": 0.0433, + "step": 4465 + }, + { + "epoch": 8.068592057761732, + "grad_norm": 0.4536888628696346, + "learning_rate": 3.866425992779784e-06, + "loss": 0.044, + "step": 4470 + }, + { + "epoch": 8.077617328519855, + "grad_norm": 0.489763842575816, + "learning_rate": 3.848375451263538e-06, + "loss": 0.0439, + "step": 4475 + }, + { + "epoch": 8.086642599277978, + "grad_norm": 0.4841182353897231, + "learning_rate": 3.830324909747293e-06, + "loss": 0.0447, + "step": 4480 + }, + { + "epoch": 8.0956678700361, + "grad_norm": 0.43276031492594386, + "learning_rate": 3.812274368231047e-06, + "loss": 0.0436, + "step": 4485 + }, + { + "epoch": 8.104693140794224, + "grad_norm": 0.43236829754729605, + "learning_rate": 3.7942238267148016e-06, + "loss": 0.0442, + "step": 4490 + }, + { + "epoch": 8.113718411552346, + "grad_norm": 0.5146544780130509, + "learning_rate": 3.776173285198556e-06, + "loss": 0.0444, + "step": 4495 + }, + { + "epoch": 8.12274368231047, + "grad_norm": 0.5155356346240472, + "learning_rate": 3.758122743682311e-06, + "loss": 0.0447, + "step": 4500 + }, + { + "epoch": 8.131768953068592, + "grad_norm": 0.4360981669075826, + "learning_rate": 3.740072202166065e-06, + "loss": 0.0444, + "step": 4505 + }, + { + "epoch": 8.140794223826715, + "grad_norm": 0.5047195091481486, + "learning_rate": 3.72202166064982e-06, + "loss": 0.0453, + "step": 4510 + }, + { + "epoch": 8.149819494584838, + "grad_norm": 0.4206990761101632, + "learning_rate": 3.703971119133574e-06, + "loss": 0.0455, + "step": 4515 + }, + { + "epoch": 8.15884476534296, + "grad_norm": 0.4082036544428832, + "learning_rate": 3.685920577617329e-06, + "loss": 0.0446, + "step": 4520 + }, + { + "epoch": 8.167870036101084, + "grad_norm": 0.4584532743829287, + "learning_rate": 3.6678700361010834e-06, + "loss": 0.0437, + "step": 4525 + }, + { + "epoch": 8.176895306859207, + "grad_norm": 0.5033458410420543, + "learning_rate": 3.649819494584838e-06, + "loss": 0.0443, + "step": 4530 + }, + { + "epoch": 8.18592057761733, + "grad_norm": 0.4791798460117816, + "learning_rate": 3.6317689530685923e-06, + "loss": 0.0445, + "step": 4535 + }, + { + "epoch": 8.19494584837545, + "grad_norm": 0.4196203455463231, + "learning_rate": 3.6137184115523466e-06, + "loss": 0.0442, + "step": 4540 + }, + { + "epoch": 8.203971119133573, + "grad_norm": 0.41063069847051686, + "learning_rate": 3.5956678700361012e-06, + "loss": 0.0445, + "step": 4545 + }, + { + "epoch": 8.212996389891696, + "grad_norm": 0.45170926636173403, + "learning_rate": 3.5776173285198555e-06, + "loss": 0.0445, + "step": 4550 + }, + { + "epoch": 8.222021660649819, + "grad_norm": 0.4365171717286736, + "learning_rate": 3.5595667870036106e-06, + "loss": 0.0439, + "step": 4555 + }, + { + "epoch": 8.231046931407942, + "grad_norm": 0.44704281997750017, + "learning_rate": 3.541516245487365e-06, + "loss": 0.0451, + "step": 4560 + }, + { + "epoch": 8.240072202166065, + "grad_norm": 0.44891926809808913, + "learning_rate": 3.5234657039711195e-06, + "loss": 0.0444, + "step": 4565 + }, + { + "epoch": 8.249097472924188, + "grad_norm": 0.4965534347892108, + "learning_rate": 3.5054151624548737e-06, + "loss": 0.045, + "step": 4570 + }, + { + "epoch": 8.25812274368231, + "grad_norm": 0.4926232662322707, + "learning_rate": 3.487364620938629e-06, + "loss": 0.0458, + "step": 4575 + }, + { + "epoch": 8.267148014440433, + "grad_norm": 0.4655262777572625, + "learning_rate": 3.469314079422383e-06, + "loss": 0.0447, + "step": 4580 + }, + { + "epoch": 8.276173285198556, + "grad_norm": 0.42335405356873596, + "learning_rate": 3.4512635379061377e-06, + "loss": 0.0448, + "step": 4585 + }, + { + "epoch": 8.28519855595668, + "grad_norm": 0.4222745585103908, + "learning_rate": 3.433212996389892e-06, + "loss": 0.0447, + "step": 4590 + }, + { + "epoch": 8.294223826714802, + "grad_norm": 0.5301226189403145, + "learning_rate": 3.4151624548736466e-06, + "loss": 0.0446, + "step": 4595 + }, + { + "epoch": 8.303249097472925, + "grad_norm": 0.42819391239433835, + "learning_rate": 3.397111913357401e-06, + "loss": 0.0452, + "step": 4600 + }, + { + "epoch": 8.312274368231048, + "grad_norm": 0.514730540749935, + "learning_rate": 3.379061371841155e-06, + "loss": 0.0448, + "step": 4605 + }, + { + "epoch": 8.321299638989169, + "grad_norm": 0.3997116268939232, + "learning_rate": 3.36101083032491e-06, + "loss": 0.046, + "step": 4610 + }, + { + "epoch": 8.330324909747292, + "grad_norm": 0.5325835981272248, + "learning_rate": 3.3429602888086644e-06, + "loss": 0.0445, + "step": 4615 + }, + { + "epoch": 8.339350180505415, + "grad_norm": 0.4871783345754918, + "learning_rate": 3.324909747292419e-06, + "loss": 0.0436, + "step": 4620 + }, + { + "epoch": 8.348375451263538, + "grad_norm": 0.44483140119486997, + "learning_rate": 3.3068592057761733e-06, + "loss": 0.0457, + "step": 4625 + }, + { + "epoch": 8.35740072202166, + "grad_norm": 0.4988117583696254, + "learning_rate": 3.288808664259928e-06, + "loss": 0.0451, + "step": 4630 + }, + { + "epoch": 8.366425992779783, + "grad_norm": 0.498516790859318, + "learning_rate": 3.2707581227436822e-06, + "loss": 0.0453, + "step": 4635 + }, + { + "epoch": 8.375451263537906, + "grad_norm": 0.4871818511915015, + "learning_rate": 3.2527075812274373e-06, + "loss": 0.0449, + "step": 4640 + }, + { + "epoch": 8.384476534296029, + "grad_norm": 0.4644645285652418, + "learning_rate": 3.2346570397111916e-06, + "loss": 0.045, + "step": 4645 + }, + { + "epoch": 8.393501805054152, + "grad_norm": 0.48904484199047815, + "learning_rate": 3.2166064981949462e-06, + "loss": 0.0458, + "step": 4650 + }, + { + "epoch": 8.402527075812275, + "grad_norm": 0.49191409648820944, + "learning_rate": 3.1985559566787005e-06, + "loss": 0.0449, + "step": 4655 + }, + { + "epoch": 8.411552346570398, + "grad_norm": 0.42381190140416636, + "learning_rate": 3.1805054151624556e-06, + "loss": 0.0451, + "step": 4660 + }, + { + "epoch": 8.42057761732852, + "grad_norm": 0.5141446233668461, + "learning_rate": 3.16245487364621e-06, + "loss": 0.0448, + "step": 4665 + }, + { + "epoch": 8.429602888086643, + "grad_norm": 0.43988353370949224, + "learning_rate": 3.1444043321299645e-06, + "loss": 0.0449, + "step": 4670 + }, + { + "epoch": 8.438628158844764, + "grad_norm": 0.5124332256581936, + "learning_rate": 3.1263537906137187e-06, + "loss": 0.046, + "step": 4675 + }, + { + "epoch": 8.447653429602887, + "grad_norm": 0.44304572226308614, + "learning_rate": 3.108303249097473e-06, + "loss": 0.0464, + "step": 4680 + }, + { + "epoch": 8.45667870036101, + "grad_norm": 0.5470624111508224, + "learning_rate": 3.0902527075812276e-06, + "loss": 0.0454, + "step": 4685 + }, + { + "epoch": 8.465703971119133, + "grad_norm": 0.4030173134885733, + "learning_rate": 3.072202166064982e-06, + "loss": 0.0453, + "step": 4690 + }, + { + "epoch": 8.474729241877256, + "grad_norm": 0.48989039284185976, + "learning_rate": 3.054151624548737e-06, + "loss": 0.0456, + "step": 4695 + }, + { + "epoch": 8.483754512635379, + "grad_norm": 0.44373432641009264, + "learning_rate": 3.036101083032491e-06, + "loss": 0.0458, + "step": 4700 + }, + { + "epoch": 8.492779783393502, + "grad_norm": 0.4167509709185524, + "learning_rate": 3.018050541516246e-06, + "loss": 0.0445, + "step": 4705 + }, + { + "epoch": 8.501805054151625, + "grad_norm": 0.5383282538033065, + "learning_rate": 3e-06, + "loss": 0.0467, + "step": 4710 + }, + { + "epoch": 8.510830324909747, + "grad_norm": 0.528737608029945, + "learning_rate": 2.9819494584837547e-06, + "loss": 0.0457, + "step": 4715 + }, + { + "epoch": 8.51985559566787, + "grad_norm": 0.508286793685187, + "learning_rate": 2.9638989169675094e-06, + "loss": 0.0466, + "step": 4720 + }, + { + "epoch": 8.528880866425993, + "grad_norm": 0.43030142221157835, + "learning_rate": 2.945848375451264e-06, + "loss": 0.0458, + "step": 4725 + }, + { + "epoch": 8.537906137184116, + "grad_norm": 0.5345650309751631, + "learning_rate": 2.9277978339350183e-06, + "loss": 0.0451, + "step": 4730 + }, + { + "epoch": 8.546931407942239, + "grad_norm": 0.4211221353833985, + "learning_rate": 2.909747292418773e-06, + "loss": 0.0453, + "step": 4735 + }, + { + "epoch": 8.555956678700362, + "grad_norm": 0.45522058199277166, + "learning_rate": 2.8916967509025272e-06, + "loss": 0.0446, + "step": 4740 + }, + { + "epoch": 8.564981949458485, + "grad_norm": 0.44570872798295774, + "learning_rate": 2.8736462093862815e-06, + "loss": 0.0455, + "step": 4745 + }, + { + "epoch": 8.574007220216606, + "grad_norm": 0.4552227500869339, + "learning_rate": 2.8555956678700365e-06, + "loss": 0.0449, + "step": 4750 + }, + { + "epoch": 8.583032490974729, + "grad_norm": 0.5005592666887394, + "learning_rate": 2.8375451263537908e-06, + "loss": 0.0463, + "step": 4755 + }, + { + "epoch": 8.592057761732852, + "grad_norm": 0.42833298285006977, + "learning_rate": 2.8194945848375454e-06, + "loss": 0.0457, + "step": 4760 + }, + { + "epoch": 8.601083032490974, + "grad_norm": 0.5140646854966842, + "learning_rate": 2.8014440433212997e-06, + "loss": 0.0471, + "step": 4765 + }, + { + "epoch": 8.610108303249097, + "grad_norm": 0.5892042283245157, + "learning_rate": 2.7833935018050544e-06, + "loss": 0.0452, + "step": 4770 + }, + { + "epoch": 8.61913357400722, + "grad_norm": 0.452012962570369, + "learning_rate": 2.7653429602888086e-06, + "loss": 0.0448, + "step": 4775 + }, + { + "epoch": 8.628158844765343, + "grad_norm": 0.4396464198294571, + "learning_rate": 2.7472924187725637e-06, + "loss": 0.0461, + "step": 4780 + }, + { + "epoch": 8.637184115523466, + "grad_norm": 0.41842531603143573, + "learning_rate": 2.729241877256318e-06, + "loss": 0.0451, + "step": 4785 + }, + { + "epoch": 8.646209386281589, + "grad_norm": 0.41898202699131626, + "learning_rate": 2.7111913357400726e-06, + "loss": 0.0457, + "step": 4790 + }, + { + "epoch": 8.655234657039712, + "grad_norm": 0.5196610269445232, + "learning_rate": 2.693140794223827e-06, + "loss": 0.0452, + "step": 4795 + }, + { + "epoch": 8.664259927797834, + "grad_norm": 0.46936311363162797, + "learning_rate": 2.675090252707582e-06, + "loss": 0.0455, + "step": 4800 + }, + { + "epoch": 8.673285198555957, + "grad_norm": 0.4740159197363855, + "learning_rate": 2.657039711191336e-06, + "loss": 0.0447, + "step": 4805 + }, + { + "epoch": 8.68231046931408, + "grad_norm": 0.43074612764563486, + "learning_rate": 2.6389891696750904e-06, + "loss": 0.0456, + "step": 4810 + }, + { + "epoch": 8.691335740072201, + "grad_norm": 0.5425044634950232, + "learning_rate": 2.620938628158845e-06, + "loss": 0.0466, + "step": 4815 + }, + { + "epoch": 8.700361010830324, + "grad_norm": 0.5081339659232526, + "learning_rate": 2.6028880866425993e-06, + "loss": 0.046, + "step": 4820 + }, + { + "epoch": 8.709386281588447, + "grad_norm": 0.4651038570514386, + "learning_rate": 2.584837545126354e-06, + "loss": 0.0463, + "step": 4825 + }, + { + "epoch": 8.71841155234657, + "grad_norm": 0.49355797519718886, + "learning_rate": 2.566787003610108e-06, + "loss": 0.0457, + "step": 4830 + }, + { + "epoch": 8.727436823104693, + "grad_norm": 0.46746434525998604, + "learning_rate": 2.5487364620938633e-06, + "loss": 0.0461, + "step": 4835 + }, + { + "epoch": 8.736462093862816, + "grad_norm": 0.4210937803108333, + "learning_rate": 2.5306859205776175e-06, + "loss": 0.0449, + "step": 4840 + }, + { + "epoch": 8.745487364620939, + "grad_norm": 0.45489349719230204, + "learning_rate": 2.512635379061372e-06, + "loss": 0.0451, + "step": 4845 + }, + { + "epoch": 8.754512635379061, + "grad_norm": 0.49590457857103976, + "learning_rate": 2.4945848375451264e-06, + "loss": 0.0462, + "step": 4850 + }, + { + "epoch": 8.763537906137184, + "grad_norm": 0.41135841968863646, + "learning_rate": 2.476534296028881e-06, + "loss": 0.0459, + "step": 4855 + }, + { + "epoch": 8.772563176895307, + "grad_norm": 0.5862358329269642, + "learning_rate": 2.4584837545126353e-06, + "loss": 0.046, + "step": 4860 + }, + { + "epoch": 8.78158844765343, + "grad_norm": 0.4869407752123959, + "learning_rate": 2.44043321299639e-06, + "loss": 0.046, + "step": 4865 + }, + { + "epoch": 8.790613718411553, + "grad_norm": 0.4541073046578143, + "learning_rate": 2.4223826714801447e-06, + "loss": 0.0452, + "step": 4870 + }, + { + "epoch": 8.799638989169676, + "grad_norm": 0.506564617854228, + "learning_rate": 2.4043321299638993e-06, + "loss": 0.0467, + "step": 4875 + }, + { + "epoch": 8.808664259927799, + "grad_norm": 0.5487992800276584, + "learning_rate": 2.3862815884476536e-06, + "loss": 0.0461, + "step": 4880 + }, + { + "epoch": 8.81768953068592, + "grad_norm": 0.5911995859863417, + "learning_rate": 2.3682310469314082e-06, + "loss": 0.046, + "step": 4885 + }, + { + "epoch": 8.826714801444043, + "grad_norm": 0.49900273240686943, + "learning_rate": 2.350180505415163e-06, + "loss": 0.046, + "step": 4890 + }, + { + "epoch": 8.835740072202166, + "grad_norm": 0.48948946203921534, + "learning_rate": 2.332129963898917e-06, + "loss": 0.0458, + "step": 4895 + }, + { + "epoch": 8.844765342960288, + "grad_norm": 0.4148924662584737, + "learning_rate": 2.314079422382672e-06, + "loss": 0.0461, + "step": 4900 + }, + { + "epoch": 8.853790613718411, + "grad_norm": 0.5404388584683619, + "learning_rate": 2.296028880866426e-06, + "loss": 0.046, + "step": 4905 + }, + { + "epoch": 8.862815884476534, + "grad_norm": 0.4321254910131908, + "learning_rate": 2.2779783393501807e-06, + "loss": 0.0454, + "step": 4910 + }, + { + "epoch": 8.871841155234657, + "grad_norm": 0.5949655513323762, + "learning_rate": 2.259927797833935e-06, + "loss": 0.046, + "step": 4915 + }, + { + "epoch": 8.88086642599278, + "grad_norm": 0.4378486480608059, + "learning_rate": 2.2418772563176896e-06, + "loss": 0.0452, + "step": 4920 + }, + { + "epoch": 8.889891696750903, + "grad_norm": 0.5870449555902121, + "learning_rate": 2.2238267148014443e-06, + "loss": 0.0459, + "step": 4925 + }, + { + "epoch": 8.898916967509026, + "grad_norm": 0.5683809822588126, + "learning_rate": 2.2057761732851985e-06, + "loss": 0.0462, + "step": 4930 + }, + { + "epoch": 8.907942238267148, + "grad_norm": 0.46521601312012895, + "learning_rate": 2.187725631768953e-06, + "loss": 0.0467, + "step": 4935 + }, + { + "epoch": 8.916967509025271, + "grad_norm": 0.47477683194093706, + "learning_rate": 2.169675090252708e-06, + "loss": 0.0462, + "step": 4940 + }, + { + "epoch": 8.925992779783394, + "grad_norm": 0.4804957099911855, + "learning_rate": 2.1516245487364625e-06, + "loss": 0.0454, + "step": 4945 + }, + { + "epoch": 8.935018050541515, + "grad_norm": 0.4571767863844299, + "learning_rate": 2.1335740072202168e-06, + "loss": 0.0453, + "step": 4950 + }, + { + "epoch": 8.944043321299638, + "grad_norm": 0.5601651443669259, + "learning_rate": 2.1155234657039714e-06, + "loss": 0.0463, + "step": 4955 + }, + { + "epoch": 8.953068592057761, + "grad_norm": 0.5500455060473453, + "learning_rate": 2.097472924187726e-06, + "loss": 0.0463, + "step": 4960 + }, + { + "epoch": 8.962093862815884, + "grad_norm": 0.5232043738599569, + "learning_rate": 2.0794223826714803e-06, + "loss": 0.0464, + "step": 4965 + }, + { + "epoch": 8.971119133574007, + "grad_norm": 0.4969509637225851, + "learning_rate": 2.0613718411552346e-06, + "loss": 0.0467, + "step": 4970 + }, + { + "epoch": 8.98014440433213, + "grad_norm": 0.46415374324212516, + "learning_rate": 2.0433212996389892e-06, + "loss": 0.0464, + "step": 4975 + }, + { + "epoch": 8.989169675090253, + "grad_norm": 0.495736189400118, + "learning_rate": 2.025270758122744e-06, + "loss": 0.0459, + "step": 4980 + }, + { + "epoch": 8.998194945848375, + "grad_norm": 0.5020092748082658, + "learning_rate": 2.007220216606498e-06, + "loss": 0.0458, + "step": 4985 + }, + { + "epoch": 9.007220216606498, + "grad_norm": 0.4534175561356577, + "learning_rate": 1.989169675090253e-06, + "loss": 0.0428, + "step": 4990 + }, + { + "epoch": 9.016245487364621, + "grad_norm": 0.39468719173310207, + "learning_rate": 1.9711191335740075e-06, + "loss": 0.0425, + "step": 4995 + }, + { + "epoch": 9.025270758122744, + "grad_norm": 0.5163608939567661, + "learning_rate": 1.9530685920577617e-06, + "loss": 0.0423, + "step": 5000 + }, + { + "epoch": 9.025270758122744, + "eval_loss": 0.04193449020385742, + "eval_runtime": 759.583, + "eval_samples_per_second": 17.503, + "eval_steps_per_second": 0.729, + "step": 5000 + }, + { + "epoch": 9.034296028880867, + "grad_norm": 0.4599758437234501, + "learning_rate": 1.9350180505415164e-06, + "loss": 0.0422, + "step": 5005 + }, + { + "epoch": 9.04332129963899, + "grad_norm": 0.49964626450423844, + "learning_rate": 1.916967509025271e-06, + "loss": 0.0427, + "step": 5010 + }, + { + "epoch": 9.052346570397113, + "grad_norm": 0.4178907774622966, + "learning_rate": 1.8989169675090255e-06, + "loss": 0.0417, + "step": 5015 + }, + { + "epoch": 9.061371841155236, + "grad_norm": 0.39048192594633196, + "learning_rate": 1.88086642599278e-06, + "loss": 0.042, + "step": 5020 + }, + { + "epoch": 9.070397111913357, + "grad_norm": 0.4791548976955057, + "learning_rate": 1.8628158844765346e-06, + "loss": 0.0412, + "step": 5025 + }, + { + "epoch": 9.07942238267148, + "grad_norm": 0.4638926347317585, + "learning_rate": 1.844765342960289e-06, + "loss": 0.0423, + "step": 5030 + }, + { + "epoch": 9.088447653429602, + "grad_norm": 0.5289808337845963, + "learning_rate": 1.8267148014440433e-06, + "loss": 0.0421, + "step": 5035 + }, + { + "epoch": 9.097472924187725, + "grad_norm": 0.48767336384018667, + "learning_rate": 1.808664259927798e-06, + "loss": 0.0421, + "step": 5040 + }, + { + "epoch": 9.106498194945848, + "grad_norm": 0.4589026757647638, + "learning_rate": 1.7906137184115524e-06, + "loss": 0.0427, + "step": 5045 + }, + { + "epoch": 9.115523465703971, + "grad_norm": 0.46264566303543775, + "learning_rate": 1.7725631768953069e-06, + "loss": 0.0422, + "step": 5050 + }, + { + "epoch": 9.124548736462094, + "grad_norm": 0.4410983802446593, + "learning_rate": 1.7545126353790615e-06, + "loss": 0.0412, + "step": 5055 + }, + { + "epoch": 9.133574007220217, + "grad_norm": 0.4202311860157241, + "learning_rate": 1.736462093862816e-06, + "loss": 0.0421, + "step": 5060 + }, + { + "epoch": 9.14259927797834, + "grad_norm": 0.41239420931705556, + "learning_rate": 1.7184115523465706e-06, + "loss": 0.0426, + "step": 5065 + }, + { + "epoch": 9.151624548736462, + "grad_norm": 0.48539351529631136, + "learning_rate": 1.700361010830325e-06, + "loss": 0.0424, + "step": 5070 + }, + { + "epoch": 9.160649819494585, + "grad_norm": 0.4427031394702614, + "learning_rate": 1.6823104693140795e-06, + "loss": 0.0426, + "step": 5075 + }, + { + "epoch": 9.169675090252708, + "grad_norm": 0.4752210878563376, + "learning_rate": 1.6642599277978342e-06, + "loss": 0.0421, + "step": 5080 + }, + { + "epoch": 9.178700361010831, + "grad_norm": 0.5234039982082958, + "learning_rate": 1.6462093862815887e-06, + "loss": 0.0433, + "step": 5085 + }, + { + "epoch": 9.187725631768952, + "grad_norm": 0.4515971877704036, + "learning_rate": 1.6281588447653431e-06, + "loss": 0.0433, + "step": 5090 + }, + { + "epoch": 9.196750902527075, + "grad_norm": 0.6669560841097782, + "learning_rate": 1.6101083032490978e-06, + "loss": 0.0426, + "step": 5095 + }, + { + "epoch": 9.205776173285198, + "grad_norm": 0.4598456495040656, + "learning_rate": 1.592057761732852e-06, + "loss": 0.0423, + "step": 5100 + }, + { + "epoch": 9.21480144404332, + "grad_norm": 0.4965666374110849, + "learning_rate": 1.5740072202166065e-06, + "loss": 0.0428, + "step": 5105 + }, + { + "epoch": 9.223826714801444, + "grad_norm": 0.37975291809275413, + "learning_rate": 1.5559566787003611e-06, + "loss": 0.0429, + "step": 5110 + }, + { + "epoch": 9.232851985559567, + "grad_norm": 0.4716411802408934, + "learning_rate": 1.5379061371841156e-06, + "loss": 0.0428, + "step": 5115 + }, + { + "epoch": 9.24187725631769, + "grad_norm": 0.4327838862415194, + "learning_rate": 1.51985559566787e-06, + "loss": 0.0433, + "step": 5120 + }, + { + "epoch": 9.250902527075812, + "grad_norm": 0.5003874549105873, + "learning_rate": 1.5018050541516247e-06, + "loss": 0.0422, + "step": 5125 + }, + { + "epoch": 9.259927797833935, + "grad_norm": 0.49004225732796214, + "learning_rate": 1.4837545126353792e-06, + "loss": 0.0431, + "step": 5130 + }, + { + "epoch": 9.268953068592058, + "grad_norm": 0.41580487545033434, + "learning_rate": 1.4657039711191336e-06, + "loss": 0.0423, + "step": 5135 + }, + { + "epoch": 9.277978339350181, + "grad_norm": 0.6027983626577104, + "learning_rate": 1.4476534296028883e-06, + "loss": 0.0439, + "step": 5140 + }, + { + "epoch": 9.287003610108304, + "grad_norm": 0.4598712206248585, + "learning_rate": 1.4296028880866427e-06, + "loss": 0.0429, + "step": 5145 + }, + { + "epoch": 9.296028880866427, + "grad_norm": 0.49342736984840635, + "learning_rate": 1.4115523465703974e-06, + "loss": 0.0426, + "step": 5150 + }, + { + "epoch": 9.30505415162455, + "grad_norm": 0.49176684451178787, + "learning_rate": 1.3935018050541518e-06, + "loss": 0.0427, + "step": 5155 + }, + { + "epoch": 9.314079422382672, + "grad_norm": 0.45945014096327996, + "learning_rate": 1.3754512635379063e-06, + "loss": 0.0429, + "step": 5160 + }, + { + "epoch": 9.323104693140793, + "grad_norm": 0.4114132503638751, + "learning_rate": 1.357400722021661e-06, + "loss": 0.0428, + "step": 5165 + }, + { + "epoch": 9.332129963898916, + "grad_norm": 0.42473969764225256, + "learning_rate": 1.3393501805054152e-06, + "loss": 0.0431, + "step": 5170 + }, + { + "epoch": 9.34115523465704, + "grad_norm": 0.48677947222160384, + "learning_rate": 1.3212996389891696e-06, + "loss": 0.0427, + "step": 5175 + }, + { + "epoch": 9.350180505415162, + "grad_norm": 0.46858607602832114, + "learning_rate": 1.3032490974729243e-06, + "loss": 0.043, + "step": 5180 + }, + { + "epoch": 9.359205776173285, + "grad_norm": 0.42070686139078084, + "learning_rate": 1.2851985559566788e-06, + "loss": 0.043, + "step": 5185 + }, + { + "epoch": 9.368231046931408, + "grad_norm": 0.5082683923945276, + "learning_rate": 1.2671480144404332e-06, + "loss": 0.0432, + "step": 5190 + }, + { + "epoch": 9.37725631768953, + "grad_norm": 0.49146765440804957, + "learning_rate": 1.2490974729241879e-06, + "loss": 0.0433, + "step": 5195 + }, + { + "epoch": 9.386281588447654, + "grad_norm": 0.4942629872457007, + "learning_rate": 1.2310469314079423e-06, + "loss": 0.0432, + "step": 5200 + }, + { + "epoch": 9.395306859205776, + "grad_norm": 0.5421985341404022, + "learning_rate": 1.2129963898916968e-06, + "loss": 0.0434, + "step": 5205 + }, + { + "epoch": 9.4043321299639, + "grad_norm": 0.5216299465972287, + "learning_rate": 1.1949458483754514e-06, + "loss": 0.0427, + "step": 5210 + }, + { + "epoch": 9.413357400722022, + "grad_norm": 0.41742965195734916, + "learning_rate": 1.176895306859206e-06, + "loss": 0.0435, + "step": 5215 + }, + { + "epoch": 9.422382671480145, + "grad_norm": 0.50664802662452, + "learning_rate": 1.1588447653429604e-06, + "loss": 0.0427, + "step": 5220 + }, + { + "epoch": 9.431407942238268, + "grad_norm": 0.44313965879945716, + "learning_rate": 1.1407942238267148e-06, + "loss": 0.0435, + "step": 5225 + }, + { + "epoch": 9.440433212996389, + "grad_norm": 0.5131645452817757, + "learning_rate": 1.1227436823104695e-06, + "loss": 0.0434, + "step": 5230 + }, + { + "epoch": 9.449458483754512, + "grad_norm": 0.4474063447070478, + "learning_rate": 1.104693140794224e-06, + "loss": 0.0428, + "step": 5235 + }, + { + "epoch": 9.458483754512635, + "grad_norm": 0.4729265352475666, + "learning_rate": 1.0866425992779784e-06, + "loss": 0.0434, + "step": 5240 + }, + { + "epoch": 9.467509025270758, + "grad_norm": 0.45958551411695614, + "learning_rate": 1.068592057761733e-06, + "loss": 0.0432, + "step": 5245 + }, + { + "epoch": 9.47653429602888, + "grad_norm": 0.41039837943400087, + "learning_rate": 1.0505415162454875e-06, + "loss": 0.0439, + "step": 5250 + }, + { + "epoch": 9.485559566787003, + "grad_norm": 0.44753217417346286, + "learning_rate": 1.032490974729242e-06, + "loss": 0.042, + "step": 5255 + }, + { + "epoch": 9.494584837545126, + "grad_norm": 0.4139276562020777, + "learning_rate": 1.0144404332129964e-06, + "loss": 0.0427, + "step": 5260 + }, + { + "epoch": 9.50361010830325, + "grad_norm": 0.6251275698046171, + "learning_rate": 9.96389891696751e-07, + "loss": 0.0436, + "step": 5265 + }, + { + "epoch": 9.512635379061372, + "grad_norm": 0.4501260605118548, + "learning_rate": 9.783393501805055e-07, + "loss": 0.0431, + "step": 5270 + }, + { + "epoch": 9.521660649819495, + "grad_norm": 0.3980490709756831, + "learning_rate": 9.6028880866426e-07, + "loss": 0.0435, + "step": 5275 + }, + { + "epoch": 9.530685920577618, + "grad_norm": 0.46391281584553307, + "learning_rate": 9.422382671480146e-07, + "loss": 0.0436, + "step": 5280 + }, + { + "epoch": 9.53971119133574, + "grad_norm": 0.4127369030638665, + "learning_rate": 9.24187725631769e-07, + "loss": 0.043, + "step": 5285 + }, + { + "epoch": 9.548736462093864, + "grad_norm": 0.40324535228510755, + "learning_rate": 9.061371841155235e-07, + "loss": 0.0424, + "step": 5290 + }, + { + "epoch": 9.557761732851986, + "grad_norm": 0.4319791954661702, + "learning_rate": 8.880866425992781e-07, + "loss": 0.0435, + "step": 5295 + }, + { + "epoch": 9.566787003610107, + "grad_norm": 0.46271459456155417, + "learning_rate": 8.700361010830325e-07, + "loss": 0.0437, + "step": 5300 + }, + { + "epoch": 9.57581227436823, + "grad_norm": 0.427126926635068, + "learning_rate": 8.519855595667871e-07, + "loss": 0.0434, + "step": 5305 + }, + { + "epoch": 9.584837545126353, + "grad_norm": 0.4691427501429953, + "learning_rate": 8.339350180505417e-07, + "loss": 0.0437, + "step": 5310 + }, + { + "epoch": 9.593862815884476, + "grad_norm": 0.45679217355753204, + "learning_rate": 8.15884476534296e-07, + "loss": 0.0435, + "step": 5315 + }, + { + "epoch": 9.602888086642599, + "grad_norm": 0.48200222966829426, + "learning_rate": 7.978339350180506e-07, + "loss": 0.0437, + "step": 5320 + }, + { + "epoch": 9.611913357400722, + "grad_norm": 0.4901938411136157, + "learning_rate": 7.797833935018051e-07, + "loss": 0.044, + "step": 5325 + }, + { + "epoch": 9.620938628158845, + "grad_norm": 0.5165751415632741, + "learning_rate": 7.617328519855597e-07, + "loss": 0.0438, + "step": 5330 + }, + { + "epoch": 9.629963898916968, + "grad_norm": 0.5120078231958552, + "learning_rate": 7.436823104693141e-07, + "loss": 0.0427, + "step": 5335 + }, + { + "epoch": 9.63898916967509, + "grad_norm": 0.40973988656758753, + "learning_rate": 7.256317689530687e-07, + "loss": 0.0431, + "step": 5340 + }, + { + "epoch": 9.648014440433213, + "grad_norm": 0.4582389620066886, + "learning_rate": 7.075812274368232e-07, + "loss": 0.0433, + "step": 5345 + }, + { + "epoch": 9.657039711191336, + "grad_norm": 0.552631347988738, + "learning_rate": 6.895306859205776e-07, + "loss": 0.044, + "step": 5350 + }, + { + "epoch": 9.666064981949459, + "grad_norm": 0.3783877672799792, + "learning_rate": 6.714801444043322e-07, + "loss": 0.0435, + "step": 5355 + }, + { + "epoch": 9.675090252707582, + "grad_norm": 0.48138412749053733, + "learning_rate": 6.534296028880867e-07, + "loss": 0.0432, + "step": 5360 + }, + { + "epoch": 9.684115523465703, + "grad_norm": 0.4338898356243751, + "learning_rate": 6.353790613718413e-07, + "loss": 0.0433, + "step": 5365 + }, + { + "epoch": 9.693140794223826, + "grad_norm": 0.5225550209175859, + "learning_rate": 6.173285198555957e-07, + "loss": 0.0437, + "step": 5370 + }, + { + "epoch": 9.702166064981949, + "grad_norm": 0.5167671561018974, + "learning_rate": 5.992779783393502e-07, + "loss": 0.0434, + "step": 5375 + }, + { + "epoch": 9.711191335740072, + "grad_norm": 0.4115427231842469, + "learning_rate": 5.812274368231047e-07, + "loss": 0.0439, + "step": 5380 + }, + { + "epoch": 9.720216606498195, + "grad_norm": 0.45107850138578787, + "learning_rate": 5.631768953068593e-07, + "loss": 0.0439, + "step": 5385 + }, + { + "epoch": 9.729241877256317, + "grad_norm": 0.4342115174653207, + "learning_rate": 5.451263537906137e-07, + "loss": 0.0429, + "step": 5390 + }, + { + "epoch": 9.73826714801444, + "grad_norm": 0.4398151357748213, + "learning_rate": 5.270758122743683e-07, + "loss": 0.0432, + "step": 5395 + }, + { + "epoch": 9.747292418772563, + "grad_norm": 0.4242422950084841, + "learning_rate": 5.090252707581228e-07, + "loss": 0.0428, + "step": 5400 + }, + { + "epoch": 9.756317689530686, + "grad_norm": 0.46470583012028227, + "learning_rate": 4.909747292418773e-07, + "loss": 0.0431, + "step": 5405 + }, + { + "epoch": 9.765342960288809, + "grad_norm": 0.41760352928012284, + "learning_rate": 4.729241877256318e-07, + "loss": 0.0439, + "step": 5410 + }, + { + "epoch": 9.774368231046932, + "grad_norm": 0.43617126444221704, + "learning_rate": 4.548736462093863e-07, + "loss": 0.0429, + "step": 5415 + }, + { + "epoch": 9.783393501805055, + "grad_norm": 0.44372565623267207, + "learning_rate": 4.368231046931409e-07, + "loss": 0.0435, + "step": 5420 + }, + { + "epoch": 9.792418772563177, + "grad_norm": 0.4428259601085559, + "learning_rate": 4.1877256317689533e-07, + "loss": 0.0436, + "step": 5425 + }, + { + "epoch": 9.8014440433213, + "grad_norm": 0.5278115547439892, + "learning_rate": 4.0072202166064984e-07, + "loss": 0.0433, + "step": 5430 + }, + { + "epoch": 9.810469314079423, + "grad_norm": 0.4527700640650528, + "learning_rate": 3.826714801444044e-07, + "loss": 0.043, + "step": 5435 + }, + { + "epoch": 9.819494584837544, + "grad_norm": 0.401226496888323, + "learning_rate": 3.6462093862815885e-07, + "loss": 0.0428, + "step": 5440 + }, + { + "epoch": 9.828519855595667, + "grad_norm": 0.49440429063887187, + "learning_rate": 3.465703971119134e-07, + "loss": 0.0428, + "step": 5445 + }, + { + "epoch": 9.83754512635379, + "grad_norm": 0.5076032586052756, + "learning_rate": 3.285198555956679e-07, + "loss": 0.044, + "step": 5450 + }, + { + "epoch": 9.846570397111913, + "grad_norm": 0.43280796148087763, + "learning_rate": 3.104693140794224e-07, + "loss": 0.0432, + "step": 5455 + }, + { + "epoch": 9.855595667870036, + "grad_norm": 0.5015245155705008, + "learning_rate": 2.924187725631769e-07, + "loss": 0.0438, + "step": 5460 + }, + { + "epoch": 9.864620938628159, + "grad_norm": 0.5286579403439075, + "learning_rate": 2.743682310469314e-07, + "loss": 0.0434, + "step": 5465 + }, + { + "epoch": 9.873646209386282, + "grad_norm": 0.4224487204668749, + "learning_rate": 2.5631768953068593e-07, + "loss": 0.043, + "step": 5470 + }, + { + "epoch": 9.882671480144404, + "grad_norm": 0.45900406854255277, + "learning_rate": 2.3826714801444044e-07, + "loss": 0.0437, + "step": 5475 + }, + { + "epoch": 9.891696750902527, + "grad_norm": 0.45858140540922404, + "learning_rate": 2.2021660649819497e-07, + "loss": 0.0433, + "step": 5480 + }, + { + "epoch": 9.90072202166065, + "grad_norm": 0.530002579562083, + "learning_rate": 2.0216606498194947e-07, + "loss": 0.0434, + "step": 5485 + }, + { + "epoch": 9.909747292418773, + "grad_norm": 0.5370134546660643, + "learning_rate": 1.84115523465704e-07, + "loss": 0.0437, + "step": 5490 + }, + { + "epoch": 9.918772563176896, + "grad_norm": 0.4902526885586167, + "learning_rate": 1.660649819494585e-07, + "loss": 0.0439, + "step": 5495 + }, + { + "epoch": 9.927797833935019, + "grad_norm": 0.4649182165247798, + "learning_rate": 1.4801444043321301e-07, + "loss": 0.0444, + "step": 5500 + }, + { + "epoch": 9.93682310469314, + "grad_norm": 0.5575325439198506, + "learning_rate": 1.2996389891696752e-07, + "loss": 0.0424, + "step": 5505 + }, + { + "epoch": 9.945848375451263, + "grad_norm": 0.45452667158004884, + "learning_rate": 1.1191335740072203e-07, + "loss": 0.0437, + "step": 5510 + }, + { + "epoch": 9.954873646209386, + "grad_norm": 0.5203515708243794, + "learning_rate": 9.386281588447654e-08, + "loss": 0.0439, + "step": 5515 + }, + { + "epoch": 9.963898916967509, + "grad_norm": 0.45178261726772573, + "learning_rate": 7.581227436823105e-08, + "loss": 0.0432, + "step": 5520 + }, + { + "epoch": 9.972924187725631, + "grad_norm": 0.4688905694517184, + "learning_rate": 5.776173285198556e-08, + "loss": 0.0434, + "step": 5525 + }, + { + "epoch": 9.981949458483754, + "grad_norm": 0.43677959065978206, + "learning_rate": 3.971119133574008e-08, + "loss": 0.0437, + "step": 5530 + }, + { + "epoch": 9.990974729241877, + "grad_norm": 0.49453568586672814, + "learning_rate": 2.1660649819494588e-08, + "loss": 0.0437, + "step": 5535 + }, + { + "epoch": 10.0, + "grad_norm": 0.4332227379502704, + "learning_rate": 3.6101083032490975e-09, + "loss": 0.0433, + "step": 5540 + }, + { + "epoch": 10.0, + "step": 5540, + "total_flos": 613623860887552.0, + "train_loss": 0.19332283668838685, + "train_runtime": 35523.5631, + "train_samples_per_second": 3.743, + "train_steps_per_second": 0.156 + } + ], + "logging_steps": 5, + "max_steps": 5540, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 613623860887552.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}