{ "best_metric": 0.11874233186244965, "best_model_checkpoint": "./weights/OurNewMoleculeModel-v1/checkpoint-256125", "epoch": 25.0, "eval_steps": 500, "global_step": 256125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04880429477794046, "grad_norm": 1.3488572835922241, "learning_rate": 4.99998163439129e-05, "loss": 2.2985, "step": 500 }, { "epoch": 0.09760858955588092, "grad_norm": 0.4086189568042755, "learning_rate": 4.999926537834994e-05, "loss": 2.0404, "step": 1000 }, { "epoch": 0.14641288433382138, "grad_norm": 0.5561855435371399, "learning_rate": 4.999834711140619e-05, "loss": 2.0324, "step": 1500 }, { "epoch": 0.19521717911176184, "grad_norm": 0.2902628779411316, "learning_rate": 4.999706155657327e-05, "loss": 2.0287, "step": 2000 }, { "epoch": 0.2440214738897023, "grad_norm": 0.7554148435592651, "learning_rate": 4.999540873273918e-05, "loss": 2.0277, "step": 2500 }, { "epoch": 0.29282576866764276, "grad_norm": 0.34928998351097107, "learning_rate": 4.999338866418801e-05, "loss": 2.0227, "step": 3000 }, { "epoch": 0.3416300634455832, "grad_norm": 0.5614811182022095, "learning_rate": 4.999100138059959e-05, "loss": 2.0122, "step": 3500 }, { "epoch": 0.3904343582235237, "grad_norm": 0.5667726993560791, "learning_rate": 4.998824691704905e-05, "loss": 1.9914, "step": 4000 }, { "epoch": 0.43923865300146414, "grad_norm": 1.2578080892562866, "learning_rate": 4.998512531400633e-05, "loss": 1.9431, "step": 4500 }, { "epoch": 0.4880429477794046, "grad_norm": 1.1142494678497314, "learning_rate": 4.9981636617335516e-05, "loss": 1.578, "step": 5000 }, { "epoch": 0.5368472425573451, "grad_norm": 0.9630743861198425, "learning_rate": 4.997778087829424e-05, "loss": 1.2667, "step": 5500 }, { "epoch": 0.5856515373352855, "grad_norm": 0.7279083132743835, "learning_rate": 4.9973558153532925e-05, "loss": 1.0208, "step": 6000 }, { "epoch": 0.634455832113226, "grad_norm": 0.8263267874717712, "learning_rate": 4.996896850509387e-05, "loss": 0.885, "step": 6500 }, { "epoch": 0.6832601268911664, "grad_norm": 0.7792947292327881, "learning_rate": 4.996401200041044e-05, "loss": 0.8054, "step": 7000 }, { "epoch": 0.7320644216691069, "grad_norm": 0.6826034188270569, "learning_rate": 4.9958688712306015e-05, "loss": 0.7463, "step": 7500 }, { "epoch": 0.7808687164470474, "grad_norm": 0.7101658582687378, "learning_rate": 4.995299871899292e-05, "loss": 0.6952, "step": 8000 }, { "epoch": 0.8296730112249878, "grad_norm": 0.5552261471748352, "learning_rate": 4.994694210407133e-05, "loss": 0.6516, "step": 8500 }, { "epoch": 0.8784773060029283, "grad_norm": 0.5594379305839539, "learning_rate": 4.994051895652797e-05, "loss": 0.6156, "step": 9000 }, { "epoch": 0.9272816007808687, "grad_norm": 0.7451700568199158, "learning_rate": 4.993372937073485e-05, "loss": 0.5843, "step": 9500 }, { "epoch": 0.9760858955588092, "grad_norm": 0.5584864020347595, "learning_rate": 4.9926573446447875e-05, "loss": 0.5583, "step": 10000 }, { "epoch": 1.0, "eval_loss": 0.4663134217262268, "eval_runtime": 27.1679, "eval_samples_per_second": 289.901, "eval_steps_per_second": 0.405, "step": 10245 }, { "epoch": 1.0248901903367496, "grad_norm": 0.527858555316925, "learning_rate": 4.9919051288805364e-05, "loss": 0.5353, "step": 10500 }, { "epoch": 1.0736944851146901, "grad_norm": 0.5612876415252686, "learning_rate": 4.9911163008326527e-05, "loss": 0.5154, "step": 11000 }, { "epoch": 1.1224987798926305, "grad_norm": 0.4924549460411072, "learning_rate": 4.990290872090982e-05, "loss": 0.4931, "step": 11500 }, { "epoch": 1.171303074670571, "grad_norm": 0.4243695139884949, "learning_rate": 4.9894288547831245e-05, "loss": 0.476, "step": 12000 }, { "epoch": 1.2201073694485114, "grad_norm": 0.5059812068939209, "learning_rate": 4.98853026157426e-05, "loss": 0.4609, "step": 12500 }, { "epoch": 1.268911664226452, "grad_norm": 0.4593505263328552, "learning_rate": 4.987595105666956e-05, "loss": 0.4468, "step": 13000 }, { "epoch": 1.3177159590043923, "grad_norm": 0.46688178181648254, "learning_rate": 4.9866234008009794e-05, "loss": 0.434, "step": 13500 }, { "epoch": 1.3665202537823329, "grad_norm": 0.4821254312992096, "learning_rate": 4.9856151612530905e-05, "loss": 0.4218, "step": 14000 }, { "epoch": 1.4153245485602732, "grad_norm": 0.4354498088359833, "learning_rate": 4.9845704018368364e-05, "loss": 0.4105, "step": 14500 }, { "epoch": 1.4641288433382138, "grad_norm": 0.4537793695926666, "learning_rate": 4.9834891379023305e-05, "loss": 0.3998, "step": 15000 }, { "epoch": 1.5129331381161544, "grad_norm": 0.37507402896881104, "learning_rate": 4.9823713853360294e-05, "loss": 0.3899, "step": 15500 }, { "epoch": 1.5617374328940947, "grad_norm": 0.40271782875061035, "learning_rate": 4.981217160560499e-05, "loss": 0.3812, "step": 16000 }, { "epoch": 1.610541727672035, "grad_norm": 0.3701293170452118, "learning_rate": 4.9800264805341694e-05, "loss": 0.373, "step": 16500 }, { "epoch": 1.6593460224499756, "grad_norm": 0.41362902522087097, "learning_rate": 4.978799362751094e-05, "loss": 0.3654, "step": 17000 }, { "epoch": 1.7081503172279162, "grad_norm": 0.3652186989784241, "learning_rate": 4.9775358252406836e-05, "loss": 0.3581, "step": 17500 }, { "epoch": 1.7569546120058566, "grad_norm": 0.366926908493042, "learning_rate": 4.9762358865674464e-05, "loss": 0.3515, "step": 18000 }, { "epoch": 1.805758906783797, "grad_norm": 0.4293728470802307, "learning_rate": 4.974899565830715e-05, "loss": 0.3449, "step": 18500 }, { "epoch": 1.8545632015617375, "grad_norm": 0.37214261293411255, "learning_rate": 4.973526882664364e-05, "loss": 0.3394, "step": 19000 }, { "epoch": 1.903367496339678, "grad_norm": 0.4047256112098694, "learning_rate": 4.9721178572365235e-05, "loss": 0.3337, "step": 19500 }, { "epoch": 1.9521717911176184, "grad_norm": 0.34720858931541443, "learning_rate": 4.9706725102492814e-05, "loss": 0.3287, "step": 20000 }, { "epoch": 2.0, "eval_loss": 0.28212064504623413, "eval_runtime": 23.8844, "eval_samples_per_second": 329.755, "eval_steps_per_second": 0.461, "step": 20490 }, { "epoch": 2.0009760858955588, "grad_norm": 0.37098678946495056, "learning_rate": 4.969190862938378e-05, "loss": 0.3237, "step": 20500 }, { "epoch": 2.049780380673499, "grad_norm": 0.3951970040798187, "learning_rate": 4.967672937072898e-05, "loss": 0.3191, "step": 21000 }, { "epoch": 2.09858467545144, "grad_norm": 0.3509838581085205, "learning_rate": 4.9661187549549476e-05, "loss": 0.3144, "step": 21500 }, { "epoch": 2.1473889702293802, "grad_norm": 0.35936230421066284, "learning_rate": 4.9645283394193274e-05, "loss": 0.3099, "step": 22000 }, { "epoch": 2.1961932650073206, "grad_norm": 0.3251510560512543, "learning_rate": 4.962901713833197e-05, "loss": 0.3063, "step": 22500 }, { "epoch": 2.244997559785261, "grad_norm": 0.33518901467323303, "learning_rate": 4.9612389020957306e-05, "loss": 0.3023, "step": 23000 }, { "epoch": 2.2938018545632017, "grad_norm": 0.3487328886985779, "learning_rate": 4.9595399286377686e-05, "loss": 0.2985, "step": 23500 }, { "epoch": 2.342606149341142, "grad_norm": 0.34018632769584656, "learning_rate": 4.9578048184214565e-05, "loss": 0.2952, "step": 24000 }, { "epoch": 2.3914104441190824, "grad_norm": 0.34304648637771606, "learning_rate": 4.956033596939879e-05, "loss": 0.2915, "step": 24500 }, { "epoch": 2.440214738897023, "grad_norm": 0.34716567397117615, "learning_rate": 4.9542262902166834e-05, "loss": 0.2883, "step": 25000 }, { "epoch": 2.4890190336749636, "grad_norm": 0.3204454481601715, "learning_rate": 4.952382924805702e-05, "loss": 0.2853, "step": 25500 }, { "epoch": 2.537823328452904, "grad_norm": 0.3337819278240204, "learning_rate": 4.950503527790555e-05, "loss": 0.2821, "step": 26000 }, { "epoch": 2.5866276232308443, "grad_norm": 0.3394376039505005, "learning_rate": 4.948588126784261e-05, "loss": 0.2793, "step": 26500 }, { "epoch": 2.6354319180087846, "grad_norm": 0.3065101206302643, "learning_rate": 4.9466367499288213e-05, "loss": 0.2767, "step": 27000 }, { "epoch": 2.6842362127867254, "grad_norm": 0.30751967430114746, "learning_rate": 4.9446494258948176e-05, "loss": 0.2736, "step": 27500 }, { "epoch": 2.7330405075646658, "grad_norm": 0.31060898303985596, "learning_rate": 4.942626183880981e-05, "loss": 0.2712, "step": 28000 }, { "epoch": 2.781844802342606, "grad_norm": 0.38574928045272827, "learning_rate": 4.940567053613768e-05, "loss": 0.2688, "step": 28500 }, { "epoch": 2.8306490971205465, "grad_norm": 0.31712907552719116, "learning_rate": 4.938472065346925e-05, "loss": 0.2669, "step": 29000 }, { "epoch": 2.879453391898487, "grad_norm": 0.2964314818382263, "learning_rate": 4.9363412498610385e-05, "loss": 0.2641, "step": 29500 }, { "epoch": 2.9282576866764276, "grad_norm": 0.30216559767723083, "learning_rate": 4.934174638463087e-05, "loss": 0.2616, "step": 30000 }, { "epoch": 2.977061981454368, "grad_norm": 0.2843080461025238, "learning_rate": 4.9319722629859813e-05, "loss": 0.2598, "step": 30500 }, { "epoch": 3.0, "eval_loss": 0.2259799689054489, "eval_runtime": 24.7473, "eval_samples_per_second": 318.256, "eval_steps_per_second": 0.444, "step": 30735 }, { "epoch": 3.0258662762323083, "grad_norm": 0.3090941905975342, "learning_rate": 4.9297341557880936e-05, "loss": 0.2577, "step": 31000 }, { "epoch": 3.074670571010249, "grad_norm": 0.29751360416412354, "learning_rate": 4.927460349752785e-05, "loss": 0.2554, "step": 31500 }, { "epoch": 3.1234748657881894, "grad_norm": 0.2908008396625519, "learning_rate": 4.925150878287921e-05, "loss": 0.2537, "step": 32000 }, { "epoch": 3.17227916056613, "grad_norm": 0.29090872406959534, "learning_rate": 4.92280577532538e-05, "loss": 0.2518, "step": 32500 }, { "epoch": 3.22108345534407, "grad_norm": 0.301048219203949, "learning_rate": 4.9204250753205585e-05, "loss": 0.2503, "step": 33000 }, { "epoch": 3.2698877501220105, "grad_norm": 0.2861855924129486, "learning_rate": 4.91800881325186e-05, "loss": 0.2482, "step": 33500 }, { "epoch": 3.3186920448999513, "grad_norm": 0.28286224603652954, "learning_rate": 4.915557024620183e-05, "loss": 0.2466, "step": 34000 }, { "epoch": 3.3674963396778916, "grad_norm": 0.3069954514503479, "learning_rate": 4.913069745448399e-05, "loss": 0.2451, "step": 34500 }, { "epoch": 3.416300634455832, "grad_norm": 0.2962004542350769, "learning_rate": 4.910547012280827e-05, "loss": 0.2436, "step": 35000 }, { "epoch": 3.465104929233773, "grad_norm": 0.2845563590526581, "learning_rate": 4.907988862182689e-05, "loss": 0.2421, "step": 35500 }, { "epoch": 3.513909224011713, "grad_norm": 0.26839151978492737, "learning_rate": 4.905395332739574e-05, "loss": 0.2406, "step": 36000 }, { "epoch": 3.5627135187896535, "grad_norm": 0.27475783228874207, "learning_rate": 4.902766462056877e-05, "loss": 0.2389, "step": 36500 }, { "epoch": 3.611517813567594, "grad_norm": 0.26468226313591003, "learning_rate": 4.900102288759249e-05, "loss": 0.2374, "step": 37000 }, { "epoch": 3.660322108345534, "grad_norm": 0.276924729347229, "learning_rate": 4.89740285199002e-05, "loss": 0.2361, "step": 37500 }, { "epoch": 3.709126403123475, "grad_norm": 0.2739529609680176, "learning_rate": 4.894668191410629e-05, "loss": 0.2348, "step": 38000 }, { "epoch": 3.7579306979014153, "grad_norm": 0.26919183135032654, "learning_rate": 4.8918983472000433e-05, "loss": 0.2336, "step": 38500 }, { "epoch": 3.8067349926793557, "grad_norm": 0.29099541902542114, "learning_rate": 4.88909336005416e-05, "loss": 0.2323, "step": 39000 }, { "epoch": 3.8555392874572965, "grad_norm": 0.2892494797706604, "learning_rate": 4.8862532711852184e-05, "loss": 0.2308, "step": 39500 }, { "epoch": 3.904343582235237, "grad_norm": 0.29746654629707336, "learning_rate": 4.883378122321186e-05, "loss": 0.2292, "step": 40000 }, { "epoch": 3.953147877013177, "grad_norm": 0.26809337735176086, "learning_rate": 4.8804679557051495e-05, "loss": 0.2283, "step": 40500 }, { "epoch": 4.0, "eval_loss": 0.19832605123519897, "eval_runtime": 25.8272, "eval_samples_per_second": 304.95, "eval_steps_per_second": 0.426, "step": 40980 }, { "epoch": 4.0019521717911175, "grad_norm": 0.2542949616909027, "learning_rate": 4.877522814094696e-05, "loss": 0.2272, "step": 41000 }, { "epoch": 4.050756466569058, "grad_norm": 0.2937975525856018, "learning_rate": 4.8745427407612776e-05, "loss": 0.2258, "step": 41500 }, { "epoch": 4.099560761346998, "grad_norm": 0.2632514536380768, "learning_rate": 4.8715277794895855e-05, "loss": 0.2256, "step": 42000 }, { "epoch": 4.148365056124939, "grad_norm": 0.2573137879371643, "learning_rate": 4.8684779745768974e-05, "loss": 0.2237, "step": 42500 }, { "epoch": 4.19716935090288, "grad_norm": 0.2653585970401764, "learning_rate": 4.8653933708324325e-05, "loss": 0.223, "step": 43000 }, { "epoch": 4.24597364568082, "grad_norm": 0.25552433729171753, "learning_rate": 4.862274013576691e-05, "loss": 0.2218, "step": 43500 }, { "epoch": 4.2947779404587605, "grad_norm": 0.2834942936897278, "learning_rate": 4.859119948640789e-05, "loss": 0.2211, "step": 44000 }, { "epoch": 4.343582235236701, "grad_norm": 0.2516108751296997, "learning_rate": 4.855931222365784e-05, "loss": 0.2202, "step": 44500 }, { "epoch": 4.392386530014641, "grad_norm": 0.301641583442688, "learning_rate": 4.852707881601996e-05, "loss": 0.2188, "step": 45000 }, { "epoch": 4.4411908247925815, "grad_norm": 0.26468151807785034, "learning_rate": 4.849449973708316e-05, "loss": 0.2176, "step": 45500 }, { "epoch": 4.489995119570522, "grad_norm": 0.274828165769577, "learning_rate": 4.846157546551516e-05, "loss": 0.2171, "step": 46000 }, { "epoch": 4.538799414348462, "grad_norm": 0.27979806065559387, "learning_rate": 4.842830648505535e-05, "loss": 0.2161, "step": 46500 }, { "epoch": 4.5876037091264035, "grad_norm": 0.26616737246513367, "learning_rate": 4.839469328450783e-05, "loss": 0.2149, "step": 47000 }, { "epoch": 4.636408003904344, "grad_norm": 0.24560213088989258, "learning_rate": 4.8360736357734083e-05, "loss": 0.2145, "step": 47500 }, { "epoch": 4.685212298682284, "grad_norm": 0.25653526186943054, "learning_rate": 4.8326436203645833e-05, "loss": 0.213, "step": 48000 }, { "epoch": 4.7340165934602245, "grad_norm": 0.2549044191837311, "learning_rate": 4.829179332619763e-05, "loss": 0.2124, "step": 48500 }, { "epoch": 4.782820888238165, "grad_norm": 0.24373945593833923, "learning_rate": 4.8256808234379516e-05, "loss": 0.2115, "step": 49000 }, { "epoch": 4.831625183016105, "grad_norm": 0.24188542366027832, "learning_rate": 4.822148144220948e-05, "loss": 0.2104, "step": 49500 }, { "epoch": 4.880429477794046, "grad_norm": 0.2541993260383606, "learning_rate": 4.8185813468725974e-05, "loss": 0.2102, "step": 50000 }, { "epoch": 4.929233772571987, "grad_norm": 0.266525536775589, "learning_rate": 4.814980483798022e-05, "loss": 0.2092, "step": 50500 }, { "epoch": 4.978038067349927, "grad_norm": 0.24894855916500092, "learning_rate": 4.811345607902855e-05, "loss": 0.2084, "step": 51000 }, { "epoch": 5.0, "eval_loss": 0.18130482733249664, "eval_runtime": 23.9311, "eval_samples_per_second": 329.111, "eval_steps_per_second": 0.46, "step": 51225 }, { "epoch": 5.0268423621278675, "grad_norm": 0.23973380029201508, "learning_rate": 4.8076767725924654e-05, "loss": 0.2076, "step": 51500 }, { "epoch": 5.075646656905808, "grad_norm": 0.23818284273147583, "learning_rate": 4.803974031771166e-05, "loss": 0.2067, "step": 52000 }, { "epoch": 5.124450951683748, "grad_norm": 0.23774628341197968, "learning_rate": 4.8002374398414295e-05, "loss": 0.2061, "step": 52500 }, { "epoch": 5.1732552464616886, "grad_norm": 0.2544199824333191, "learning_rate": 4.796467051703083e-05, "loss": 0.2051, "step": 53000 }, { "epoch": 5.222059541239629, "grad_norm": 0.24035200476646423, "learning_rate": 4.7926629227525066e-05, "loss": 0.2042, "step": 53500 }, { "epoch": 5.270863836017569, "grad_norm": 0.25180783867836, "learning_rate": 4.788825108881814e-05, "loss": 0.2037, "step": 54000 }, { "epoch": 5.31966813079551, "grad_norm": 0.25087398290634155, "learning_rate": 4.7849536664780346e-05, "loss": 0.2032, "step": 54500 }, { "epoch": 5.368472425573451, "grad_norm": 0.2356226146221161, "learning_rate": 4.7810486524222885e-05, "loss": 0.2024, "step": 55000 }, { "epoch": 5.417276720351391, "grad_norm": 0.25190770626068115, "learning_rate": 4.777110124088942e-05, "loss": 0.2019, "step": 55500 }, { "epoch": 5.4660810151293315, "grad_norm": 0.24268530309200287, "learning_rate": 4.77313813934477e-05, "loss": 0.2011, "step": 56000 }, { "epoch": 5.514885309907272, "grad_norm": 0.23932518064975739, "learning_rate": 4.7691327565481095e-05, "loss": 0.2005, "step": 56500 }, { "epoch": 5.563689604685212, "grad_norm": 0.23731377720832825, "learning_rate": 4.765094034547992e-05, "loss": 0.1996, "step": 57000 }, { "epoch": 5.612493899463153, "grad_norm": 0.2333805412054062, "learning_rate": 4.76102203268329e-05, "loss": 0.1989, "step": 57500 }, { "epoch": 5.661298194241093, "grad_norm": 0.24407994747161865, "learning_rate": 4.756916810781838e-05, "loss": 0.1987, "step": 58000 }, { "epoch": 5.710102489019034, "grad_norm": 0.23789800703525543, "learning_rate": 4.752778429159554e-05, "loss": 0.1979, "step": 58500 }, { "epoch": 5.7589067837969745, "grad_norm": 0.24565084278583527, "learning_rate": 4.7486069486195564e-05, "loss": 0.1969, "step": 59000 }, { "epoch": 5.807711078574915, "grad_norm": 0.26797595620155334, "learning_rate": 4.744402430451269e-05, "loss": 0.1965, "step": 59500 }, { "epoch": 5.856515373352855, "grad_norm": 0.25408676266670227, "learning_rate": 4.74016493642952e-05, "loss": 0.1955, "step": 60000 }, { "epoch": 5.905319668130796, "grad_norm": 0.23447421193122864, "learning_rate": 4.7358945288136344e-05, "loss": 0.1949, "step": 60500 }, { "epoch": 5.954123962908736, "grad_norm": 0.2329121083021164, "learning_rate": 4.7315912703465225e-05, "loss": 0.1948, "step": 61000 }, { "epoch": 6.0, "eval_loss": 0.1711394339799881, "eval_runtime": 27.596, "eval_samples_per_second": 285.403, "eval_steps_per_second": 0.399, "step": 61470 }, { "epoch": 6.002928257686676, "grad_norm": 0.2361510992050171, "learning_rate": 4.727255224253751e-05, "loss": 0.1941, "step": 61500 }, { "epoch": 6.051732552464617, "grad_norm": 0.23526506125926971, "learning_rate": 4.7228864542426224e-05, "loss": 0.1934, "step": 62000 }, { "epoch": 6.100536847242557, "grad_norm": 0.24888668954372406, "learning_rate": 4.7184850245012316e-05, "loss": 0.1928, "step": 62500 }, { "epoch": 6.149341142020498, "grad_norm": 0.24024108052253723, "learning_rate": 4.714050999697528e-05, "loss": 0.1924, "step": 63000 }, { "epoch": 6.1981454367984385, "grad_norm": 0.24707584083080292, "learning_rate": 4.709584444978364e-05, "loss": 0.192, "step": 63500 }, { "epoch": 6.246949731576379, "grad_norm": 0.2352433204650879, "learning_rate": 4.705085425968536e-05, "loss": 0.1915, "step": 64000 }, { "epoch": 6.295754026354319, "grad_norm": 0.24224288761615753, "learning_rate": 4.700554008769823e-05, "loss": 0.1907, "step": 64500 }, { "epoch": 6.34455832113226, "grad_norm": 0.2216614931821823, "learning_rate": 4.6959902599600125e-05, "loss": 0.1902, "step": 65000 }, { "epoch": 6.3933626159102, "grad_norm": 0.22495177388191223, "learning_rate": 4.691394246591925e-05, "loss": 0.1899, "step": 65500 }, { "epoch": 6.44216691068814, "grad_norm": 0.22609297931194305, "learning_rate": 4.686766036192426e-05, "loss": 0.1891, "step": 66000 }, { "epoch": 6.490971205466081, "grad_norm": 0.24654404819011688, "learning_rate": 4.682105696761436e-05, "loss": 0.1889, "step": 66500 }, { "epoch": 6.539775500244021, "grad_norm": 0.2228369563817978, "learning_rate": 4.6774132967709336e-05, "loss": 0.1881, "step": 67000 }, { "epoch": 6.588579795021962, "grad_norm": 0.21981576085090637, "learning_rate": 4.6726889051639436e-05, "loss": 0.1878, "step": 67500 }, { "epoch": 6.637384089799903, "grad_norm": 0.22510704398155212, "learning_rate": 4.6679325913535266e-05, "loss": 0.1871, "step": 68000 }, { "epoch": 6.686188384577843, "grad_norm": 0.24267421662807465, "learning_rate": 4.663144425221763e-05, "loss": 0.1867, "step": 68500 }, { "epoch": 6.734992679355783, "grad_norm": 0.2170720249414444, "learning_rate": 4.65832447711872e-05, "loss": 0.1862, "step": 69000 }, { "epoch": 6.783796974133724, "grad_norm": 0.25550180673599243, "learning_rate": 4.653472817861425e-05, "loss": 0.1857, "step": 69500 }, { "epoch": 6.832601268911664, "grad_norm": 0.23408746719360352, "learning_rate": 4.648589518732815e-05, "loss": 0.1853, "step": 70000 }, { "epoch": 6.881405563689604, "grad_norm": 0.26076194643974304, "learning_rate": 4.6436746514807e-05, "loss": 0.1849, "step": 70500 }, { "epoch": 6.930209858467546, "grad_norm": 0.21694616973400116, "learning_rate": 4.638728288316704e-05, "loss": 0.184, "step": 71000 }, { "epoch": 6.979014153245486, "grad_norm": 0.21888791024684906, "learning_rate": 4.633750501915203e-05, "loss": 0.184, "step": 71500 }, { "epoch": 7.0, "eval_loss": 0.16187380254268646, "eval_runtime": 26.1, "eval_samples_per_second": 301.762, "eval_steps_per_second": 0.421, "step": 71715 }, { "epoch": 7.027818448023426, "grad_norm": 0.22506974637508392, "learning_rate": 4.628741365412258e-05, "loss": 0.1836, "step": 72000 }, { "epoch": 7.076622742801367, "grad_norm": 0.21344968676567078, "learning_rate": 4.623700952404542e-05, "loss": 0.1832, "step": 72500 }, { "epoch": 7.125427037579307, "grad_norm": 0.22301891446113586, "learning_rate": 4.618629336948258e-05, "loss": 0.1826, "step": 73000 }, { "epoch": 7.174231332357247, "grad_norm": 0.2228812873363495, "learning_rate": 4.6135265935580494e-05, "loss": 0.182, "step": 73500 }, { "epoch": 7.223035627135188, "grad_norm": 0.24568694829940796, "learning_rate": 4.6083927972059084e-05, "loss": 0.1814, "step": 74000 }, { "epoch": 7.271839921913128, "grad_norm": 0.23808668553829193, "learning_rate": 4.603228023320069e-05, "loss": 0.1816, "step": 74500 }, { "epoch": 7.320644216691068, "grad_norm": 0.21967822313308716, "learning_rate": 4.598032347783905e-05, "loss": 0.1809, "step": 75000 }, { "epoch": 7.36944851146901, "grad_norm": 0.20847086608409882, "learning_rate": 4.5928058469348115e-05, "loss": 0.1806, "step": 75500 }, { "epoch": 7.41825280624695, "grad_norm": 0.22811928391456604, "learning_rate": 4.587548597563084e-05, "loss": 0.18, "step": 76000 }, { "epoch": 7.46705710102489, "grad_norm": 0.22424574196338654, "learning_rate": 4.582260676910791e-05, "loss": 0.1794, "step": 76500 }, { "epoch": 7.515861395802831, "grad_norm": 0.22317995131015778, "learning_rate": 4.5769421626706376e-05, "loss": 0.1793, "step": 77000 }, { "epoch": 7.564665690580771, "grad_norm": 0.21519626677036285, "learning_rate": 4.571593132984825e-05, "loss": 0.1789, "step": 77500 }, { "epoch": 7.613469985358711, "grad_norm": 0.2195836454629898, "learning_rate": 4.566213666443901e-05, "loss": 0.1784, "step": 78000 }, { "epoch": 7.662274280136652, "grad_norm": 0.23087261617183685, "learning_rate": 4.56080384208561e-05, "loss": 0.1778, "step": 78500 }, { "epoch": 7.711078574914593, "grad_norm": 0.2173396646976471, "learning_rate": 4.5553637393937234e-05, "loss": 0.1777, "step": 79000 }, { "epoch": 7.759882869692533, "grad_norm": 0.22740761935710907, "learning_rate": 4.54989343829688e-05, "loss": 0.1774, "step": 79500 }, { "epoch": 7.808687164470474, "grad_norm": 0.20074845850467682, "learning_rate": 4.544393019167408e-05, "loss": 0.1768, "step": 80000 }, { "epoch": 7.857491459248414, "grad_norm": 0.21903088688850403, "learning_rate": 4.538862562820143e-05, "loss": 0.1766, "step": 80500 }, { "epoch": 7.906295754026354, "grad_norm": 0.21944737434387207, "learning_rate": 4.533302150511243e-05, "loss": 0.1763, "step": 81000 }, { "epoch": 7.955100048804295, "grad_norm": 0.22298942506313324, "learning_rate": 4.5277118639369935e-05, "loss": 0.1758, "step": 81500 }, { "epoch": 8.0, "eval_loss": 0.15350797772407532, "eval_runtime": 25.6287, "eval_samples_per_second": 307.312, "eval_steps_per_second": 0.429, "step": 81960 }, { "epoch": 8.003904343582235, "grad_norm": 0.22214815020561218, "learning_rate": 4.5220917852326076e-05, "loss": 0.1758, "step": 82000 }, { "epoch": 8.052708638360176, "grad_norm": 0.22404730319976807, "learning_rate": 4.516441996971018e-05, "loss": 0.1751, "step": 82500 }, { "epoch": 8.101512933138116, "grad_norm": 0.21983228623867035, "learning_rate": 4.510762582161664e-05, "loss": 0.1747, "step": 83000 }, { "epoch": 8.150317227916057, "grad_norm": 0.23077502846717834, "learning_rate": 4.5050536242492756e-05, "loss": 0.1745, "step": 83500 }, { "epoch": 8.199121522693996, "grad_norm": 0.21954509615898132, "learning_rate": 4.4993152071126424e-05, "loss": 0.174, "step": 84000 }, { "epoch": 8.247925817471938, "grad_norm": 0.2204139679670334, "learning_rate": 4.493547415063382e-05, "loss": 0.1739, "step": 84500 }, { "epoch": 8.296730112249879, "grad_norm": 0.2210853546857834, "learning_rate": 4.487750332844704e-05, "loss": 0.1736, "step": 85000 }, { "epoch": 8.345534407027818, "grad_norm": 0.21140769124031067, "learning_rate": 4.4819240456301645e-05, "loss": 0.1732, "step": 85500 }, { "epoch": 8.39433870180576, "grad_norm": 0.22270390391349792, "learning_rate": 4.476068639022412e-05, "loss": 0.1726, "step": 86000 }, { "epoch": 8.443142996583699, "grad_norm": 0.2249361127614975, "learning_rate": 4.4701841990519324e-05, "loss": 0.1724, "step": 86500 }, { "epoch": 8.49194729136164, "grad_norm": 0.21904852986335754, "learning_rate": 4.4642708121757815e-05, "loss": 0.1723, "step": 87000 }, { "epoch": 8.54075158613958, "grad_norm": 0.21276357769966125, "learning_rate": 4.45832856527632e-05, "loss": 0.1717, "step": 87500 }, { "epoch": 8.589555880917521, "grad_norm": 0.21569614112377167, "learning_rate": 4.452357545659934e-05, "loss": 0.1714, "step": 88000 }, { "epoch": 8.63836017569546, "grad_norm": 0.21162466704845428, "learning_rate": 4.446357841055749e-05, "loss": 0.171, "step": 88500 }, { "epoch": 8.687164470473402, "grad_norm": 0.2211264669895172, "learning_rate": 4.4403295396143495e-05, "loss": 0.1709, "step": 89000 }, { "epoch": 8.735968765251343, "grad_norm": 0.20906701683998108, "learning_rate": 4.434272729906475e-05, "loss": 0.1707, "step": 89500 }, { "epoch": 8.784773060029282, "grad_norm": 0.2192634642124176, "learning_rate": 4.428187500921721e-05, "loss": 0.1701, "step": 90000 }, { "epoch": 8.833577354807224, "grad_norm": 0.2148887813091278, "learning_rate": 4.4220739420672376e-05, "loss": 0.1697, "step": 90500 }, { "epoch": 8.882381649585163, "grad_norm": 0.20213574171066284, "learning_rate": 4.4159321431664084e-05, "loss": 0.1695, "step": 91000 }, { "epoch": 8.931185944363104, "grad_norm": 0.21166318655014038, "learning_rate": 4.4097621944575324e-05, "loss": 0.1695, "step": 91500 }, { "epoch": 8.979990239141044, "grad_norm": 0.2028771936893463, "learning_rate": 4.4035641865925015e-05, "loss": 0.1693, "step": 92000 }, { "epoch": 9.0, "eval_loss": 0.15039320290088654, "eval_runtime": 24.9603, "eval_samples_per_second": 315.541, "eval_steps_per_second": 0.441, "step": 92205 }, { "epoch": 9.028794533918985, "grad_norm": 0.20694176852703094, "learning_rate": 4.3973382106354655e-05, "loss": 0.1686, "step": 92500 }, { "epoch": 9.077598828696924, "grad_norm": 0.21907255053520203, "learning_rate": 4.391084358061494e-05, "loss": 0.1684, "step": 93000 }, { "epoch": 9.126403123474866, "grad_norm": 0.21821749210357666, "learning_rate": 4.3848027207552364e-05, "loss": 0.1683, "step": 93500 }, { "epoch": 9.175207418252807, "grad_norm": 0.20274536311626434, "learning_rate": 4.3784933910095646e-05, "loss": 0.1677, "step": 94000 }, { "epoch": 9.224011713030746, "grad_norm": 0.20460249483585358, "learning_rate": 4.372156461524226e-05, "loss": 0.1676, "step": 94500 }, { "epoch": 9.272816007808688, "grad_norm": 0.21497923135757446, "learning_rate": 4.3657920254044726e-05, "loss": 0.1673, "step": 95000 }, { "epoch": 9.321620302586627, "grad_norm": 0.20720575749874115, "learning_rate": 4.3594001761597e-05, "loss": 0.1673, "step": 95500 }, { "epoch": 9.370424597364568, "grad_norm": 0.22322164475917816, "learning_rate": 4.352981007702071e-05, "loss": 0.1668, "step": 96000 }, { "epoch": 9.419228892142508, "grad_norm": 0.20235677063465118, "learning_rate": 4.346534614345132e-05, "loss": 0.1665, "step": 96500 }, { "epoch": 9.468033186920449, "grad_norm": 0.20581580698490143, "learning_rate": 4.340061090802436e-05, "loss": 0.1663, "step": 97000 }, { "epoch": 9.51683748169839, "grad_norm": 0.2083093822002411, "learning_rate": 4.333560532186142e-05, "loss": 0.166, "step": 97500 }, { "epoch": 9.56564177647633, "grad_norm": 0.20584595203399658, "learning_rate": 4.327033034005622e-05, "loss": 0.1657, "step": 98000 }, { "epoch": 9.614446071254271, "grad_norm": 0.20942457020282745, "learning_rate": 4.320478692166059e-05, "loss": 0.1656, "step": 98500 }, { "epoch": 9.66325036603221, "grad_norm": 0.20925435423851013, "learning_rate": 4.313897602967034e-05, "loss": 0.1654, "step": 99000 }, { "epoch": 9.712054660810152, "grad_norm": 0.22049732506275177, "learning_rate": 4.307289863101116e-05, "loss": 0.165, "step": 99500 }, { "epoch": 9.760858955588091, "grad_norm": 0.20315922796726227, "learning_rate": 4.300655569652437e-05, "loss": 0.1646, "step": 100000 }, { "epoch": 9.809663250366032, "grad_norm": 0.20489932596683502, "learning_rate": 4.293994820095264e-05, "loss": 0.1643, "step": 100500 }, { "epoch": 9.858467545143974, "grad_norm": 0.218128502368927, "learning_rate": 4.287307712292576e-05, "loss": 0.1643, "step": 101000 }, { "epoch": 9.907271839921913, "grad_norm": 0.20896770060062408, "learning_rate": 4.280594344494617e-05, "loss": 0.164, "step": 101500 }, { "epoch": 9.956076134699854, "grad_norm": 0.20507818460464478, "learning_rate": 4.273854815337455e-05, "loss": 0.1636, "step": 102000 }, { "epoch": 10.0, "eval_loss": 0.14604029059410095, "eval_runtime": 23.6994, "eval_samples_per_second": 332.329, "eval_steps_per_second": 0.464, "step": 102450 }, { "epoch": 10.004880429477794, "grad_norm": 0.20058345794677734, "learning_rate": 4.267089223841534e-05, "loss": 0.1636, "step": 102500 }, { "epoch": 10.053684724255735, "grad_norm": 0.2024383842945099, "learning_rate": 4.2602976694102205e-05, "loss": 0.1632, "step": 103000 }, { "epoch": 10.102489019033674, "grad_norm": 0.21127928793430328, "learning_rate": 4.253480251828337e-05, "loss": 0.1629, "step": 103500 }, { "epoch": 10.151293313811616, "grad_norm": 0.19965404272079468, "learning_rate": 4.246637071260705e-05, "loss": 0.1629, "step": 104000 }, { "epoch": 10.200097608589555, "grad_norm": 0.20860032737255096, "learning_rate": 4.239768228250664e-05, "loss": 0.1624, "step": 104500 }, { "epoch": 10.248901903367496, "grad_norm": 0.21451444923877716, "learning_rate": 4.232873823718602e-05, "loss": 0.1624, "step": 105000 }, { "epoch": 10.297706198145438, "grad_norm": 0.21074171364307404, "learning_rate": 4.225953958960466e-05, "loss": 0.1623, "step": 105500 }, { "epoch": 10.346510492923377, "grad_norm": 0.21716845035552979, "learning_rate": 4.21900873564628e-05, "loss": 0.1617, "step": 106000 }, { "epoch": 10.395314787701318, "grad_norm": 0.21059440076351166, "learning_rate": 4.2120382558186474e-05, "loss": 0.1617, "step": 106500 }, { "epoch": 10.444119082479258, "grad_norm": 0.22244805097579956, "learning_rate": 4.205042621891251e-05, "loss": 0.1614, "step": 107000 }, { "epoch": 10.492923377257199, "grad_norm": 0.21420615911483765, "learning_rate": 4.1980219366473514e-05, "loss": 0.1611, "step": 107500 }, { "epoch": 10.541727672035138, "grad_norm": 0.2058490365743637, "learning_rate": 4.1909763032382756e-05, "loss": 0.161, "step": 108000 }, { "epoch": 10.59053196681308, "grad_norm": 0.20425471663475037, "learning_rate": 4.1839058251819e-05, "loss": 0.1609, "step": 108500 }, { "epoch": 10.63933626159102, "grad_norm": 0.20022732019424438, "learning_rate": 4.176810606361132e-05, "loss": 0.1606, "step": 109000 }, { "epoch": 10.68814055636896, "grad_norm": 0.20972158014774323, "learning_rate": 4.169690751022382e-05, "loss": 0.1604, "step": 109500 }, { "epoch": 10.736944851146902, "grad_norm": 0.20773041248321533, "learning_rate": 4.1625463637740297e-05, "loss": 0.1602, "step": 110000 }, { "epoch": 10.785749145924841, "grad_norm": 0.2000124752521515, "learning_rate": 4.1553775495848934e-05, "loss": 0.1601, "step": 110500 }, { "epoch": 10.834553440702782, "grad_norm": 0.21309678256511688, "learning_rate": 4.148184413782682e-05, "loss": 0.1597, "step": 111000 }, { "epoch": 10.883357735480722, "grad_norm": 0.2132243663072586, "learning_rate": 4.14096706205245e-05, "loss": 0.1597, "step": 111500 }, { "epoch": 10.932162030258663, "grad_norm": 0.20744946599006653, "learning_rate": 4.133725600435042e-05, "loss": 0.1596, "step": 112000 }, { "epoch": 10.980966325036603, "grad_norm": 0.20575416088104248, "learning_rate": 4.12646013532554e-05, "loss": 0.159, "step": 112500 }, { "epoch": 11.0, "eval_loss": 0.13835683465003967, "eval_runtime": 27.3854, "eval_samples_per_second": 287.598, "eval_steps_per_second": 0.402, "step": 112695 }, { "epoch": 11.029770619814544, "grad_norm": 0.2070922553539276, "learning_rate": 4.119170773471695e-05, "loss": 0.1589, "step": 113000 }, { "epoch": 11.078574914592485, "grad_norm": 0.20478574931621552, "learning_rate": 4.11185762197236e-05, "loss": 0.1586, "step": 113500 }, { "epoch": 11.127379209370424, "grad_norm": 0.1970217078924179, "learning_rate": 4.104520788275921e-05, "loss": 0.1586, "step": 114000 }, { "epoch": 11.176183504148366, "grad_norm": 0.19945302605628967, "learning_rate": 4.097160380178707e-05, "loss": 0.1582, "step": 114500 }, { "epoch": 11.224987798926305, "grad_norm": 0.19257070124149323, "learning_rate": 4.0897765058234224e-05, "loss": 0.1581, "step": 115000 }, { "epoch": 11.273792093704246, "grad_norm": 0.2013574242591858, "learning_rate": 4.082369273697542e-05, "loss": 0.158, "step": 115500 }, { "epoch": 11.322596388482186, "grad_norm": 0.21071788668632507, "learning_rate": 4.0749387926317295e-05, "loss": 0.1575, "step": 116000 }, { "epoch": 11.371400683260127, "grad_norm": 0.2010817974805832, "learning_rate": 4.0674851717982286e-05, "loss": 0.1574, "step": 116500 }, { "epoch": 11.420204978038067, "grad_norm": 0.20782026648521423, "learning_rate": 4.0600085207092695e-05, "loss": 0.1573, "step": 117000 }, { "epoch": 11.469009272816008, "grad_norm": 0.2070448100566864, "learning_rate": 4.052508949215447e-05, "loss": 0.1573, "step": 117500 }, { "epoch": 11.517813567593949, "grad_norm": 0.2066112607717514, "learning_rate": 4.044986567504121e-05, "loss": 0.1571, "step": 118000 }, { "epoch": 11.566617862371888, "grad_norm": 0.20482249557971954, "learning_rate": 4.037441486097785e-05, "loss": 0.1568, "step": 118500 }, { "epoch": 11.61542215714983, "grad_norm": 0.20141823589801788, "learning_rate": 4.02987381585245e-05, "loss": 0.1568, "step": 119000 }, { "epoch": 11.66422645192777, "grad_norm": 0.20818044245243073, "learning_rate": 4.02228366795601e-05, "loss": 0.1565, "step": 119500 }, { "epoch": 11.71303074670571, "grad_norm": 0.20303422212600708, "learning_rate": 4.014671153926619e-05, "loss": 0.1562, "step": 120000 }, { "epoch": 11.76183504148365, "grad_norm": 0.19013996422290802, "learning_rate": 4.007036385611036e-05, "loss": 0.156, "step": 120500 }, { "epoch": 11.810639336261591, "grad_norm": 0.20407438278198242, "learning_rate": 3.999379475182996e-05, "loss": 0.1562, "step": 121000 }, { "epoch": 11.859443631039532, "grad_norm": 0.1977386772632599, "learning_rate": 3.991700535141556e-05, "loss": 0.1556, "step": 121500 }, { "epoch": 11.908247925817472, "grad_norm": 0.19012510776519775, "learning_rate": 3.9839996783094435e-05, "loss": 0.1555, "step": 122000 }, { "epoch": 11.957052220595413, "grad_norm": 0.20828774571418762, "learning_rate": 3.976277017831396e-05, "loss": 0.1553, "step": 122500 }, { "epoch": 12.0, "eval_loss": 0.13950450718402863, "eval_runtime": 28.1831, "eval_samples_per_second": 279.458, "eval_steps_per_second": 0.39, "step": 122940 }, { "epoch": 12.005856515373353, "grad_norm": 0.19804109632968903, "learning_rate": 3.968532667172501e-05, "loss": 0.1552, "step": 123000 }, { "epoch": 12.054660810151294, "grad_norm": 0.2035941481590271, "learning_rate": 3.960766740116531e-05, "loss": 0.1549, "step": 123500 }, { "epoch": 12.103465104929233, "grad_norm": 0.20041148364543915, "learning_rate": 3.952979350764268e-05, "loss": 0.1547, "step": 124000 }, { "epoch": 12.152269399707174, "grad_norm": 0.19230812788009644, "learning_rate": 3.945170613531828e-05, "loss": 0.1548, "step": 124500 }, { "epoch": 12.201073694485114, "grad_norm": 0.2065581977367401, "learning_rate": 3.9373406431489826e-05, "loss": 0.1544, "step": 125000 }, { "epoch": 12.249877989263055, "grad_norm": 0.19001494348049164, "learning_rate": 3.929489554657466e-05, "loss": 0.1543, "step": 125500 }, { "epoch": 12.298682284040996, "grad_norm": 0.20618636906147003, "learning_rate": 3.921617463409298e-05, "loss": 0.1537, "step": 126000 }, { "epoch": 12.347486578818936, "grad_norm": 0.1987367868423462, "learning_rate": 3.913724485065074e-05, "loss": 0.1542, "step": 126500 }, { "epoch": 12.396290873596877, "grad_norm": 0.1950555443763733, "learning_rate": 3.905810735592276e-05, "loss": 0.1537, "step": 127000 }, { "epoch": 12.445095168374817, "grad_norm": 0.20843225717544556, "learning_rate": 3.8978763312635645e-05, "loss": 0.1535, "step": 127500 }, { "epoch": 12.493899463152758, "grad_norm": 0.19434267282485962, "learning_rate": 3.889921388655073e-05, "loss": 0.1535, "step": 128000 }, { "epoch": 12.542703757930697, "grad_norm": 0.19898554682731628, "learning_rate": 3.881946024644691e-05, "loss": 0.1533, "step": 128500 }, { "epoch": 12.591508052708638, "grad_norm": 0.19874414801597595, "learning_rate": 3.873950356410352e-05, "loss": 0.1534, "step": 129000 }, { "epoch": 12.640312347486578, "grad_norm": 0.19424794614315033, "learning_rate": 3.865934501428304e-05, "loss": 0.1528, "step": 129500 }, { "epoch": 12.68911664226452, "grad_norm": 0.19256962835788727, "learning_rate": 3.8578985774713955e-05, "loss": 0.153, "step": 130000 }, { "epoch": 12.73792093704246, "grad_norm": 0.21424148976802826, "learning_rate": 3.8498427026073325e-05, "loss": 0.1527, "step": 130500 }, { "epoch": 12.7867252318204, "grad_norm": 0.20375344157218933, "learning_rate": 3.841766995196951e-05, "loss": 0.1526, "step": 131000 }, { "epoch": 12.835529526598341, "grad_norm": 0.2020910084247589, "learning_rate": 3.8336715738924787e-05, "loss": 0.1522, "step": 131500 }, { "epoch": 12.88433382137628, "grad_norm": 0.21570877730846405, "learning_rate": 3.825556557635787e-05, "loss": 0.1522, "step": 132000 }, { "epoch": 12.933138116154222, "grad_norm": 0.202886700630188, "learning_rate": 3.817422065656645e-05, "loss": 0.1522, "step": 132500 }, { "epoch": 12.981942410932161, "grad_norm": 0.19793546199798584, "learning_rate": 3.809268217470971e-05, "loss": 0.1519, "step": 133000 }, { "epoch": 13.0, "eval_loss": 0.13297139108181, "eval_runtime": 27.6372, "eval_samples_per_second": 284.978, "eval_steps_per_second": 0.398, "step": 133185 }, { "epoch": 13.030746705710103, "grad_norm": 0.19757746160030365, "learning_rate": 3.8010951328790745e-05, "loss": 0.1519, "step": 133500 }, { "epoch": 13.079551000488044, "grad_norm": 0.1974940001964569, "learning_rate": 3.792902931963893e-05, "loss": 0.1515, "step": 134000 }, { "epoch": 13.128355295265983, "grad_norm": 0.19320930540561676, "learning_rate": 3.784691735089232e-05, "loss": 0.1517, "step": 134500 }, { "epoch": 13.177159590043924, "grad_norm": 0.2007361203432083, "learning_rate": 3.776461662897995e-05, "loss": 0.1513, "step": 135000 }, { "epoch": 13.225963884821864, "grad_norm": 0.1926342397928238, "learning_rate": 3.76821283631041e-05, "loss": 0.1514, "step": 135500 }, { "epoch": 13.274768179599805, "grad_norm": 0.18830719590187073, "learning_rate": 3.759945376522254e-05, "loss": 0.1512, "step": 136000 }, { "epoch": 13.323572474377745, "grad_norm": 0.1940852552652359, "learning_rate": 3.7516594050030715e-05, "loss": 0.151, "step": 136500 }, { "epoch": 13.372376769155686, "grad_norm": 0.1951226443052292, "learning_rate": 3.7433550434943934e-05, "loss": 0.1508, "step": 137000 }, { "epoch": 13.421181063933625, "grad_norm": 0.18908989429473877, "learning_rate": 3.735032414007941e-05, "loss": 0.1505, "step": 137500 }, { "epoch": 13.469985358711567, "grad_norm": 0.19911529123783112, "learning_rate": 3.7266916388238396e-05, "loss": 0.1503, "step": 138000 }, { "epoch": 13.518789653489508, "grad_norm": 0.20053178071975708, "learning_rate": 3.718332840488821e-05, "loss": 0.1504, "step": 138500 }, { "epoch": 13.567593948267447, "grad_norm": 0.19537031650543213, "learning_rate": 3.70995614181442e-05, "loss": 0.1502, "step": 139000 }, { "epoch": 13.616398243045388, "grad_norm": 0.19510440528392792, "learning_rate": 3.7015616658751715e-05, "loss": 0.1503, "step": 139500 }, { "epoch": 13.665202537823328, "grad_norm": 0.196214497089386, "learning_rate": 3.693149536006807e-05, "loss": 0.1499, "step": 140000 }, { "epoch": 13.71400683260127, "grad_norm": 0.1952546089887619, "learning_rate": 3.6847198758044326e-05, "loss": 0.1499, "step": 140500 }, { "epoch": 13.762811127379209, "grad_norm": 0.19812558591365814, "learning_rate": 3.6762728091207216e-05, "loss": 0.1498, "step": 141000 }, { "epoch": 13.81161542215715, "grad_norm": 0.18906739354133606, "learning_rate": 3.66780846006409e-05, "loss": 0.1493, "step": 141500 }, { "epoch": 13.860419716935091, "grad_norm": 0.20462313294410706, "learning_rate": 3.659326952996879e-05, "loss": 0.1494, "step": 142000 }, { "epoch": 13.90922401171303, "grad_norm": 0.1982060968875885, "learning_rate": 3.650828412533519e-05, "loss": 0.1493, "step": 142500 }, { "epoch": 13.958028306490972, "grad_norm": 0.19797129929065704, "learning_rate": 3.6423129635387033e-05, "loss": 0.1494, "step": 143000 }, { "epoch": 14.0, "eval_loss": 0.13158732652664185, "eval_runtime": 27.7401, "eval_samples_per_second": 283.921, "eval_steps_per_second": 0.397, "step": 143430 }, { "epoch": 14.006832601268911, "grad_norm": 0.19103878736495972, "learning_rate": 3.6337807311255574e-05, "loss": 0.149, "step": 143500 }, { "epoch": 14.055636896046853, "grad_norm": 0.19477146863937378, "learning_rate": 3.625231840653794e-05, "loss": 0.1488, "step": 144000 }, { "epoch": 14.104441190824792, "grad_norm": 0.1984102576971054, "learning_rate": 3.616666417727875e-05, "loss": 0.1487, "step": 144500 }, { "epoch": 14.153245485602733, "grad_norm": 0.20152725279331207, "learning_rate": 3.608084588195166e-05, "loss": 0.1488, "step": 145000 }, { "epoch": 14.202049780380673, "grad_norm": 0.1842581033706665, "learning_rate": 3.599486478144085e-05, "loss": 0.1486, "step": 145500 }, { "epoch": 14.250854075158614, "grad_norm": 0.20297376811504364, "learning_rate": 3.590872213902252e-05, "loss": 0.1483, "step": 146000 }, { "epoch": 14.299658369936555, "grad_norm": 0.1883450597524643, "learning_rate": 3.582241922034631e-05, "loss": 0.1482, "step": 146500 }, { "epoch": 14.348462664714495, "grad_norm": 0.18912336230278015, "learning_rate": 3.573595729341675e-05, "loss": 0.1482, "step": 147000 }, { "epoch": 14.397266959492436, "grad_norm": 0.1913149505853653, "learning_rate": 3.564933762857454e-05, "loss": 0.1478, "step": 147500 }, { "epoch": 14.446071254270375, "grad_norm": 0.19658420979976654, "learning_rate": 3.556256149847801e-05, "loss": 0.1479, "step": 148000 }, { "epoch": 14.494875549048317, "grad_norm": 0.1880834996700287, "learning_rate": 3.547563017808432e-05, "loss": 0.1478, "step": 148500 }, { "epoch": 14.543679843826256, "grad_norm": 0.1877063512802124, "learning_rate": 3.538854494463074e-05, "loss": 0.1478, "step": 149000 }, { "epoch": 14.592484138604197, "grad_norm": 0.19691213965415955, "learning_rate": 3.530130707761594e-05, "loss": 0.1474, "step": 149500 }, { "epoch": 14.641288433382137, "grad_norm": 0.19889949262142181, "learning_rate": 3.521391785878114e-05, "loss": 0.1472, "step": 150000 }, { "epoch": 14.690092728160078, "grad_norm": 0.1987435221672058, "learning_rate": 3.512637857209131e-05, "loss": 0.1471, "step": 150500 }, { "epoch": 14.73889702293802, "grad_norm": 0.20512694120407104, "learning_rate": 3.503869050371626e-05, "loss": 0.1471, "step": 151000 }, { "epoch": 14.787701317715959, "grad_norm": 0.19599127769470215, "learning_rate": 3.4950854942011814e-05, "loss": 0.1471, "step": 151500 }, { "epoch": 14.8365056124939, "grad_norm": 0.1986822932958603, "learning_rate": 3.4862873177500796e-05, "loss": 0.1467, "step": 152000 }, { "epoch": 14.88530990727184, "grad_norm": 0.18663661181926727, "learning_rate": 3.4774746502854164e-05, "loss": 0.1469, "step": 152500 }, { "epoch": 14.93411420204978, "grad_norm": 0.1881023645401001, "learning_rate": 3.46864762128719e-05, "loss": 0.1467, "step": 153000 }, { "epoch": 14.98291849682772, "grad_norm": 0.1909170150756836, "learning_rate": 3.4598063604464106e-05, "loss": 0.1465, "step": 153500 }, { "epoch": 15.0, "eval_loss": 0.1301085352897644, "eval_runtime": 23.6471, "eval_samples_per_second": 333.064, "eval_steps_per_second": 0.465, "step": 153675 }, { "epoch": 15.031722791605661, "grad_norm": 0.19014447927474976, "learning_rate": 3.450950997663189e-05, "loss": 0.1461, "step": 154000 }, { "epoch": 15.080527086383603, "grad_norm": 0.20832829177379608, "learning_rate": 3.442081663044827e-05, "loss": 0.1463, "step": 154500 }, { "epoch": 15.129331381161542, "grad_norm": 0.19706888496875763, "learning_rate": 3.433198486903906e-05, "loss": 0.1461, "step": 155000 }, { "epoch": 15.178135675939483, "grad_norm": 0.2018064558506012, "learning_rate": 3.424301599756378e-05, "loss": 0.1463, "step": 155500 }, { "epoch": 15.226939970717423, "grad_norm": 0.19212935864925385, "learning_rate": 3.41539113231964e-05, "loss": 0.1464, "step": 156000 }, { "epoch": 15.275744265495364, "grad_norm": 0.20076821744441986, "learning_rate": 3.406467215510619e-05, "loss": 0.1459, "step": 156500 }, { "epoch": 15.324548560273303, "grad_norm": 0.19215160608291626, "learning_rate": 3.3975299804438476e-05, "loss": 0.1456, "step": 157000 }, { "epoch": 15.373352855051245, "grad_norm": 0.19090279936790466, "learning_rate": 3.388579558429534e-05, "loss": 0.1458, "step": 157500 }, { "epoch": 15.422157149829186, "grad_norm": 0.19182687997817993, "learning_rate": 3.3796160809716386e-05, "loss": 0.1454, "step": 158000 }, { "epoch": 15.470961444607125, "grad_norm": 0.18930520117282867, "learning_rate": 3.370639679765936e-05, "loss": 0.1452, "step": 158500 }, { "epoch": 15.519765739385067, "grad_norm": 0.20811304450035095, "learning_rate": 3.3616504866980834e-05, "loss": 0.1452, "step": 159000 }, { "epoch": 15.568570034163006, "grad_norm": 0.18808256089687347, "learning_rate": 3.3526486338416835e-05, "loss": 0.1453, "step": 159500 }, { "epoch": 15.617374328940947, "grad_norm": 0.18801531195640564, "learning_rate": 3.343634253456343e-05, "loss": 0.1451, "step": 160000 }, { "epoch": 15.666178623718887, "grad_norm": 0.19010472297668457, "learning_rate": 3.334607477985727e-05, "loss": 0.145, "step": 160500 }, { "epoch": 15.714982918496828, "grad_norm": 0.20773784816265106, "learning_rate": 3.3255684400556165e-05, "loss": 0.1449, "step": 161000 }, { "epoch": 15.763787213274767, "grad_norm": 0.1926048994064331, "learning_rate": 3.316517272471959e-05, "loss": 0.1445, "step": 161500 }, { "epoch": 15.812591508052709, "grad_norm": 0.20847058296203613, "learning_rate": 3.307454108218916e-05, "loss": 0.1448, "step": 162000 }, { "epoch": 15.86139580283065, "grad_norm": 0.18687431514263153, "learning_rate": 3.2983790804569105e-05, "loss": 0.1445, "step": 162500 }, { "epoch": 15.91020009760859, "grad_norm": 0.19642353057861328, "learning_rate": 3.2892923225206695e-05, "loss": 0.1443, "step": 163000 }, { "epoch": 15.95900439238653, "grad_norm": 0.19062745571136475, "learning_rate": 3.280193967917265e-05, "loss": 0.1444, "step": 163500 }, { "epoch": 16.0, "eval_loss": 0.1288023591041565, "eval_runtime": 27.3979, "eval_samples_per_second": 287.468, "eval_steps_per_second": 0.401, "step": 163920 }, { "epoch": 16.00780868716447, "grad_norm": 0.2008381485939026, "learning_rate": 3.271084150324154e-05, "loss": 0.1441, "step": 164000 }, { "epoch": 16.05661298194241, "grad_norm": 0.1929151713848114, "learning_rate": 3.261963003587214e-05, "loss": 0.1443, "step": 164500 }, { "epoch": 16.105417276720353, "grad_norm": 0.19287170469760895, "learning_rate": 3.252830661718772e-05, "loss": 0.144, "step": 165000 }, { "epoch": 16.15422157149829, "grad_norm": 0.19933773577213287, "learning_rate": 3.243687258895643e-05, "loss": 0.1439, "step": 165500 }, { "epoch": 16.20302586627623, "grad_norm": 0.2010374516248703, "learning_rate": 3.234532929457155e-05, "loss": 0.1439, "step": 166000 }, { "epoch": 16.251830161054173, "grad_norm": 0.19827648997306824, "learning_rate": 3.2253678079031724e-05, "loss": 0.1439, "step": 166500 }, { "epoch": 16.300634455832114, "grad_norm": 0.1934526264667511, "learning_rate": 3.2161920288921254e-05, "loss": 0.1438, "step": 167000 }, { "epoch": 16.349438750610055, "grad_norm": 0.20245911180973053, "learning_rate": 3.2070057272390263e-05, "loss": 0.1436, "step": 167500 }, { "epoch": 16.398243045387993, "grad_norm": 0.1878873109817505, "learning_rate": 3.197809037913493e-05, "loss": 0.1433, "step": 168000 }, { "epoch": 16.447047340165934, "grad_norm": 0.19571448862552643, "learning_rate": 3.188602096037764e-05, "loss": 0.1435, "step": 168500 }, { "epoch": 16.495851634943875, "grad_norm": 0.19554303586483002, "learning_rate": 3.179385036884712e-05, "loss": 0.1433, "step": 169000 }, { "epoch": 16.544655929721817, "grad_norm": 0.18918287754058838, "learning_rate": 3.170157995875859e-05, "loss": 0.1435, "step": 169500 }, { "epoch": 16.593460224499758, "grad_norm": 0.19676432013511658, "learning_rate": 3.160921108579385e-05, "loss": 0.1432, "step": 170000 }, { "epoch": 16.642264519277695, "grad_norm": 0.20606379210948944, "learning_rate": 3.151674510708136e-05, "loss": 0.1431, "step": 170500 }, { "epoch": 16.691068814055637, "grad_norm": 0.18640325963497162, "learning_rate": 3.142418338117631e-05, "loss": 0.1428, "step": 171000 }, { "epoch": 16.739873108833578, "grad_norm": 0.18933062255382538, "learning_rate": 3.1331527268040646e-05, "loss": 0.1431, "step": 171500 }, { "epoch": 16.78867740361152, "grad_norm": 0.1884533166885376, "learning_rate": 3.12387781290231e-05, "loss": 0.1427, "step": 172000 }, { "epoch": 16.837481698389457, "grad_norm": 0.19676893949508667, "learning_rate": 3.11459373268392e-05, "loss": 0.1426, "step": 172500 }, { "epoch": 16.886285993167398, "grad_norm": 0.18692608177661896, "learning_rate": 3.105300622555122e-05, "loss": 0.1429, "step": 173000 }, { "epoch": 16.93509028794534, "grad_norm": 0.2095184326171875, "learning_rate": 3.095998619054813e-05, "loss": 0.1425, "step": 173500 }, { "epoch": 16.98389458272328, "grad_norm": 0.19869489967823029, "learning_rate": 3.086687858852562e-05, "loss": 0.1425, "step": 174000 }, { "epoch": 17.0, "eval_loss": 0.12800458073616028, "eval_runtime": 25.827, "eval_samples_per_second": 304.952, "eval_steps_per_second": 0.426, "step": 174165 }, { "epoch": 17.032698877501222, "grad_norm": 0.18050076067447662, "learning_rate": 3.077368478746591e-05, "loss": 0.142, "step": 174500 }, { "epoch": 17.08150317227916, "grad_norm": 0.19508038461208344, "learning_rate": 3.068040615661768e-05, "loss": 0.1422, "step": 175000 }, { "epoch": 17.1303074670571, "grad_norm": 0.19345000386238098, "learning_rate": 3.0587044066476024e-05, "loss": 0.142, "step": 175500 }, { "epoch": 17.179111761835042, "grad_norm": 0.18671298027038574, "learning_rate": 3.0493599888762235e-05, "loss": 0.1417, "step": 176000 }, { "epoch": 17.227916056612983, "grad_norm": 0.18719059228897095, "learning_rate": 3.0400074996403666e-05, "loss": 0.1419, "step": 176500 }, { "epoch": 17.27672035139092, "grad_norm": 0.192045196890831, "learning_rate": 3.0306470763513584e-05, "loss": 0.142, "step": 177000 }, { "epoch": 17.325524646168862, "grad_norm": 0.18663588166236877, "learning_rate": 3.0212788565370952e-05, "loss": 0.1419, "step": 177500 }, { "epoch": 17.374328940946803, "grad_norm": 0.19223402440547943, "learning_rate": 3.0119029778400266e-05, "loss": 0.1416, "step": 178000 }, { "epoch": 17.423133235724745, "grad_norm": 0.20375187695026398, "learning_rate": 3.002519578015126e-05, "loss": 0.1417, "step": 178500 }, { "epoch": 17.471937530502686, "grad_norm": 0.19722655415534973, "learning_rate": 2.9931287949278752e-05, "loss": 0.1413, "step": 179000 }, { "epoch": 17.520741825280624, "grad_norm": 0.20561105012893677, "learning_rate": 2.9837307665522297e-05, "loss": 0.1412, "step": 179500 }, { "epoch": 17.569546120058565, "grad_norm": 0.1842418909072876, "learning_rate": 2.9743256309686013e-05, "loss": 0.1413, "step": 180000 }, { "epoch": 17.618350414836506, "grad_norm": 0.19416528940200806, "learning_rate": 2.9649135263618205e-05, "loss": 0.1414, "step": 180500 }, { "epoch": 17.667154709614447, "grad_norm": 0.18883706629276276, "learning_rate": 2.9554945910191122e-05, "loss": 0.1414, "step": 181000 }, { "epoch": 17.715959004392385, "grad_norm": 0.18695645034313202, "learning_rate": 2.9460689633280613e-05, "loss": 0.1413, "step": 181500 }, { "epoch": 17.764763299170326, "grad_norm": 0.1854555606842041, "learning_rate": 2.9366367817745794e-05, "loss": 0.1411, "step": 182000 }, { "epoch": 17.813567593948267, "grad_norm": 0.1904602348804474, "learning_rate": 2.927198184940872e-05, "loss": 0.1411, "step": 182500 }, { "epoch": 17.86237188872621, "grad_norm": 0.1872331202030182, "learning_rate": 2.917753311503399e-05, "loss": 0.1409, "step": 183000 }, { "epoch": 17.91117618350415, "grad_norm": 0.19253146648406982, "learning_rate": 2.90830230023084e-05, "loss": 0.1409, "step": 183500 }, { "epoch": 17.959980478282088, "grad_norm": 0.18223468959331512, "learning_rate": 2.8988452899820563e-05, "loss": 0.1407, "step": 184000 }, { "epoch": 18.0, "eval_loss": 0.12518393993377686, "eval_runtime": 25.7207, "eval_samples_per_second": 306.212, "eval_steps_per_second": 0.428, "step": 184410 }, { "epoch": 18.00878477306003, "grad_norm": 0.19220831990242004, "learning_rate": 2.889382419704047e-05, "loss": 0.1408, "step": 184500 }, { "epoch": 18.05758906783797, "grad_norm": 0.19996266067028046, "learning_rate": 2.8799138284299105e-05, "loss": 0.1406, "step": 185000 }, { "epoch": 18.10639336261591, "grad_norm": 0.192152738571167, "learning_rate": 2.8704396552767997e-05, "loss": 0.1405, "step": 185500 }, { "epoch": 18.15519765739385, "grad_norm": 0.19583114981651306, "learning_rate": 2.8609600394438816e-05, "loss": 0.1404, "step": 186000 }, { "epoch": 18.20400195217179, "grad_norm": 0.1908300369977951, "learning_rate": 2.851475120210289e-05, "loss": 0.1405, "step": 186500 }, { "epoch": 18.25280624694973, "grad_norm": 0.19682295620441437, "learning_rate": 2.8419850369330714e-05, "loss": 0.14, "step": 187000 }, { "epoch": 18.301610541727673, "grad_norm": 0.18878893554210663, "learning_rate": 2.8324899290451556e-05, "loss": 0.1403, "step": 187500 }, { "epoch": 18.350414836505614, "grad_norm": 0.19927945733070374, "learning_rate": 2.822989936053291e-05, "loss": 0.1402, "step": 188000 }, { "epoch": 18.39921913128355, "grad_norm": 0.18962599337100983, "learning_rate": 2.8134851975359994e-05, "loss": 0.1399, "step": 188500 }, { "epoch": 18.448023426061493, "grad_norm": 0.19572696089744568, "learning_rate": 2.8039758531415278e-05, "loss": 0.1399, "step": 189000 }, { "epoch": 18.496827720839434, "grad_norm": 0.19577118754386902, "learning_rate": 2.7944620425857952e-05, "loss": 0.14, "step": 189500 }, { "epoch": 18.545632015617375, "grad_norm": 0.1974543035030365, "learning_rate": 2.78494390565034e-05, "loss": 0.1398, "step": 190000 }, { "epoch": 18.594436310395317, "grad_norm": 0.19602327048778534, "learning_rate": 2.775421582180263e-05, "loss": 0.1397, "step": 190500 }, { "epoch": 18.643240605173254, "grad_norm": 0.18849612772464752, "learning_rate": 2.7658952120821802e-05, "loss": 0.1396, "step": 191000 }, { "epoch": 18.692044899951195, "grad_norm": 0.19312690198421478, "learning_rate": 2.756364935322158e-05, "loss": 0.1395, "step": 191500 }, { "epoch": 18.740849194729137, "grad_norm": 0.18100771307945251, "learning_rate": 2.7468308919236652e-05, "loss": 0.1394, "step": 192000 }, { "epoch": 18.789653489507078, "grad_norm": 0.20045186579227448, "learning_rate": 2.737293221965509e-05, "loss": 0.1394, "step": 192500 }, { "epoch": 18.838457784285016, "grad_norm": 0.1846308708190918, "learning_rate": 2.7277520655797816e-05, "loss": 0.1393, "step": 193000 }, { "epoch": 18.887262079062957, "grad_norm": 0.18819710612297058, "learning_rate": 2.7182075629497976e-05, "loss": 0.1394, "step": 193500 }, { "epoch": 18.936066373840898, "grad_norm": 0.18752720952033997, "learning_rate": 2.7086598543080392e-05, "loss": 0.1391, "step": 194000 }, { "epoch": 18.98487066861884, "grad_norm": 0.19363176822662354, "learning_rate": 2.6991090799340905e-05, "loss": 0.1391, "step": 194500 }, { "epoch": 19.0, "eval_loss": 0.1259300708770752, "eval_runtime": 25.5672, "eval_samples_per_second": 308.051, "eval_steps_per_second": 0.43, "step": 194655 }, { "epoch": 19.03367496339678, "grad_norm": 0.19123421609401703, "learning_rate": 2.6895553801525803e-05, "loss": 0.1391, "step": 195000 }, { "epoch": 19.08247925817472, "grad_norm": 0.19878804683685303, "learning_rate": 2.6799988953311162e-05, "loss": 0.1389, "step": 195500 }, { "epoch": 19.13128355295266, "grad_norm": 0.19207318127155304, "learning_rate": 2.6704397658782283e-05, "loss": 0.1391, "step": 196000 }, { "epoch": 19.1800878477306, "grad_norm": 0.18511444330215454, "learning_rate": 2.6608781322413018e-05, "loss": 0.1389, "step": 196500 }, { "epoch": 19.228892142508542, "grad_norm": 0.19707535207271576, "learning_rate": 2.651314134904514e-05, "loss": 0.1389, "step": 197000 }, { "epoch": 19.27769643728648, "grad_norm": 0.1916116625070572, "learning_rate": 2.6417479143867697e-05, "loss": 0.1387, "step": 197500 }, { "epoch": 19.32650073206442, "grad_norm": 0.18978238105773926, "learning_rate": 2.632179611239642e-05, "loss": 0.1387, "step": 198000 }, { "epoch": 19.375305026842362, "grad_norm": 0.1835888773202896, "learning_rate": 2.6226093660452982e-05, "loss": 0.1385, "step": 198500 }, { "epoch": 19.424109321620303, "grad_norm": 0.18811723589897156, "learning_rate": 2.613037319414441e-05, "loss": 0.1387, "step": 199000 }, { "epoch": 19.472913616398245, "grad_norm": 0.1998414546251297, "learning_rate": 2.6034636119842414e-05, "loss": 0.1385, "step": 199500 }, { "epoch": 19.521717911176182, "grad_norm": 0.18518772721290588, "learning_rate": 2.5938883844162715e-05, "loss": 0.1382, "step": 200000 }, { "epoch": 19.570522205954124, "grad_norm": 0.19242486357688904, "learning_rate": 2.584311777394437e-05, "loss": 0.1384, "step": 200500 }, { "epoch": 19.619326500732065, "grad_norm": 0.2028750330209732, "learning_rate": 2.574733931622912e-05, "loss": 0.1384, "step": 201000 }, { "epoch": 19.668130795510006, "grad_norm": 0.18917541205883026, "learning_rate": 2.5651549878240694e-05, "loss": 0.1381, "step": 201500 }, { "epoch": 19.716935090287944, "grad_norm": 0.19596756994724274, "learning_rate": 2.5555750867364188e-05, "loss": 0.138, "step": 202000 }, { "epoch": 19.765739385065885, "grad_norm": 0.19332247972488403, "learning_rate": 2.5459943691125292e-05, "loss": 0.1381, "step": 202500 }, { "epoch": 19.814543679843826, "grad_norm": 0.19187049567699432, "learning_rate": 2.536412975716972e-05, "loss": 0.1381, "step": 203000 }, { "epoch": 19.863347974621767, "grad_norm": 0.19392500817775726, "learning_rate": 2.5268310473242424e-05, "loss": 0.1378, "step": 203500 }, { "epoch": 19.91215226939971, "grad_norm": 0.19194450974464417, "learning_rate": 2.517248724716701e-05, "loss": 0.1377, "step": 204000 }, { "epoch": 19.960956564177646, "grad_norm": 0.20554892718791962, "learning_rate": 2.5076661486824953e-05, "loss": 0.1379, "step": 204500 }, { "epoch": 20.0, "eval_loss": 0.1231779009103775, "eval_runtime": 29.7434, "eval_samples_per_second": 264.798, "eval_steps_per_second": 0.37, "step": 204900 }, { "epoch": 20.009760858955588, "grad_norm": 0.19533833861351013, "learning_rate": 2.4980834600135006e-05, "loss": 0.1377, "step": 205000 }, { "epoch": 20.05856515373353, "grad_norm": 0.18907921016216278, "learning_rate": 2.488500799503244e-05, "loss": 0.1377, "step": 205500 }, { "epoch": 20.10736944851147, "grad_norm": 0.1802392452955246, "learning_rate": 2.4789183079448417e-05, "loss": 0.1378, "step": 206000 }, { "epoch": 20.156173743289408, "grad_norm": 0.19577832520008087, "learning_rate": 2.4693361261289247e-05, "loss": 0.1375, "step": 206500 }, { "epoch": 20.20497803806735, "grad_norm": 0.20748840272426605, "learning_rate": 2.4597543948415748e-05, "loss": 0.1376, "step": 207000 }, { "epoch": 20.25378233284529, "grad_norm": 0.19364304840564728, "learning_rate": 2.4501732548622546e-05, "loss": 0.1375, "step": 207500 }, { "epoch": 20.30258662762323, "grad_norm": 0.1987764686346054, "learning_rate": 2.440592846961738e-05, "loss": 0.1373, "step": 208000 }, { "epoch": 20.351390922401173, "grad_norm": 0.1924201399087906, "learning_rate": 2.4310133119000438e-05, "loss": 0.1376, "step": 208500 }, { "epoch": 20.40019521717911, "grad_norm": 0.19483359158039093, "learning_rate": 2.4214347904243644e-05, "loss": 0.1374, "step": 209000 }, { "epoch": 20.44899951195705, "grad_norm": 0.19892901182174683, "learning_rate": 2.4118574232670025e-05, "loss": 0.1372, "step": 209500 }, { "epoch": 20.497803806734993, "grad_norm": 0.18968260288238525, "learning_rate": 2.4022813511433027e-05, "loss": 0.137, "step": 210000 }, { "epoch": 20.546608101512934, "grad_norm": 0.19339485466480255, "learning_rate": 2.3927067147495765e-05, "loss": 0.1372, "step": 210500 }, { "epoch": 20.595412396290875, "grad_norm": 0.19323968887329102, "learning_rate": 2.383133654761045e-05, "loss": 0.137, "step": 211000 }, { "epoch": 20.644216691068813, "grad_norm": 0.18963748216629028, "learning_rate": 2.3735623118297692e-05, "loss": 0.1369, "step": 211500 }, { "epoch": 20.693020985846754, "grad_norm": 0.190143883228302, "learning_rate": 2.3639928265825783e-05, "loss": 0.1369, "step": 212000 }, { "epoch": 20.741825280624695, "grad_norm": 0.19597776234149933, "learning_rate": 2.3544253396190112e-05, "loss": 0.1369, "step": 212500 }, { "epoch": 20.790629575402637, "grad_norm": 0.18973353505134583, "learning_rate": 2.3448599915092443e-05, "loss": 0.1366, "step": 213000 }, { "epoch": 20.839433870180574, "grad_norm": 0.20242229104042053, "learning_rate": 2.3352969227920303e-05, "loss": 0.1368, "step": 213500 }, { "epoch": 20.888238164958516, "grad_norm": 0.19486981630325317, "learning_rate": 2.325736273972633e-05, "loss": 0.1368, "step": 214000 }, { "epoch": 20.937042459736457, "grad_norm": 0.18778111040592194, "learning_rate": 2.3161781855207575e-05, "loss": 0.1365, "step": 214500 }, { "epoch": 20.985846754514398, "grad_norm": 0.19285354018211365, "learning_rate": 2.3066227978684964e-05, "loss": 0.1363, "step": 215000 }, { "epoch": 21.0, "eval_loss": 0.12139205634593964, "eval_runtime": 26.2565, "eval_samples_per_second": 299.963, "eval_steps_per_second": 0.419, "step": 215145 }, { "epoch": 21.03465104929234, "grad_norm": 0.1933123618364334, "learning_rate": 2.297070251408259e-05, "loss": 0.1364, "step": 215500 }, { "epoch": 21.083455344070277, "grad_norm": 0.18427444994449615, "learning_rate": 2.287520686490707e-05, "loss": 0.1365, "step": 216000 }, { "epoch": 21.13225963884822, "grad_norm": 0.17762655019760132, "learning_rate": 2.2779742434227005e-05, "loss": 0.1363, "step": 216500 }, { "epoch": 21.18106393362616, "grad_norm": 0.18944330513477325, "learning_rate": 2.2684310624652287e-05, "loss": 0.1363, "step": 217000 }, { "epoch": 21.2298682284041, "grad_norm": 0.19393311440944672, "learning_rate": 2.2588912838313535e-05, "loss": 0.1363, "step": 217500 }, { "epoch": 21.27867252318204, "grad_norm": 0.1875392496585846, "learning_rate": 2.2493550476841495e-05, "loss": 0.1363, "step": 218000 }, { "epoch": 21.32747681795998, "grad_norm": 0.19635601341724396, "learning_rate": 2.2398224941346408e-05, "loss": 0.1362, "step": 218500 }, { "epoch": 21.37628111273792, "grad_norm": 0.19351017475128174, "learning_rate": 2.2302937632397462e-05, "loss": 0.1359, "step": 219000 }, { "epoch": 21.425085407515862, "grad_norm": 0.18472112715244293, "learning_rate": 2.2207689950002213e-05, "loss": 0.1362, "step": 219500 }, { "epoch": 21.473889702293803, "grad_norm": 0.192471444606781, "learning_rate": 2.211248329358598e-05, "loss": 0.1359, "step": 220000 }, { "epoch": 21.52269399707174, "grad_norm": 0.19304192066192627, "learning_rate": 2.2017319061971338e-05, "loss": 0.1362, "step": 220500 }, { "epoch": 21.571498291849682, "grad_norm": 0.18912473320960999, "learning_rate": 2.1922198653357498e-05, "loss": 0.1362, "step": 221000 }, { "epoch": 21.620302586627623, "grad_norm": 0.19801722466945648, "learning_rate": 2.182712346529983e-05, "loss": 0.1363, "step": 221500 }, { "epoch": 21.669106881405565, "grad_norm": 0.18331073224544525, "learning_rate": 2.1732094894689313e-05, "loss": 0.136, "step": 222000 }, { "epoch": 21.717911176183506, "grad_norm": 0.1763552576303482, "learning_rate": 2.1637114337731967e-05, "loss": 0.1356, "step": 222500 }, { "epoch": 21.766715470961444, "grad_norm": 0.1820065975189209, "learning_rate": 2.1542183189928387e-05, "loss": 0.1356, "step": 223000 }, { "epoch": 21.815519765739385, "grad_norm": 0.18830101191997528, "learning_rate": 2.1447302846053234e-05, "loss": 0.1358, "step": 223500 }, { "epoch": 21.864324060517326, "grad_norm": 0.19416014850139618, "learning_rate": 2.135247470013471e-05, "loss": 0.1354, "step": 224000 }, { "epoch": 21.913128355295267, "grad_norm": 0.1934524029493332, "learning_rate": 2.1257700145434132e-05, "loss": 0.1356, "step": 224500 }, { "epoch": 21.961932650073205, "grad_norm": 0.19462282955646515, "learning_rate": 2.116298057442539e-05, "loss": 0.1357, "step": 225000 }, { "epoch": 22.0, "eval_loss": 0.12161369621753693, "eval_runtime": 26.6058, "eval_samples_per_second": 296.025, "eval_steps_per_second": 0.413, "step": 225390 }, { "epoch": 22.010736944851146, "grad_norm": 0.18952177464962006, "learning_rate": 2.106831737877456e-05, "loss": 0.1354, "step": 225500 }, { "epoch": 22.059541239629088, "grad_norm": 0.2017366886138916, "learning_rate": 2.0973711949319415e-05, "loss": 0.1355, "step": 226000 }, { "epoch": 22.10834553440703, "grad_norm": 0.19085553288459778, "learning_rate": 2.087916567604897e-05, "loss": 0.1353, "step": 226500 }, { "epoch": 22.15714982918497, "grad_norm": 0.20396627485752106, "learning_rate": 2.0784679948083138e-05, "loss": 0.1352, "step": 227000 }, { "epoch": 22.205954123962908, "grad_norm": 0.19046179950237274, "learning_rate": 2.0690256153652248e-05, "loss": 0.1353, "step": 227500 }, { "epoch": 22.25475841874085, "grad_norm": 0.19359087944030762, "learning_rate": 2.0595895680076645e-05, "loss": 0.1353, "step": 228000 }, { "epoch": 22.30356271351879, "grad_norm": 0.186729297041893, "learning_rate": 2.0501599913746374e-05, "loss": 0.1351, "step": 228500 }, { "epoch": 22.35236700829673, "grad_norm": 0.1899571716785431, "learning_rate": 2.0407370240100747e-05, "loss": 0.1352, "step": 229000 }, { "epoch": 22.40117130307467, "grad_norm": 0.1941409856081009, "learning_rate": 2.0313208043608017e-05, "loss": 0.1351, "step": 229500 }, { "epoch": 22.44997559785261, "grad_norm": 0.20220808684825897, "learning_rate": 2.021911470774504e-05, "loss": 0.1352, "step": 230000 }, { "epoch": 22.49877989263055, "grad_norm": 0.1803186982870102, "learning_rate": 2.0125091614976908e-05, "loss": 0.1348, "step": 230500 }, { "epoch": 22.547584187408493, "grad_norm": 0.19634583592414856, "learning_rate": 2.0031140146736696e-05, "loss": 0.1351, "step": 231000 }, { "epoch": 22.596388482186434, "grad_norm": 0.19138526916503906, "learning_rate": 1.9937261683405135e-05, "loss": 0.1351, "step": 231500 }, { "epoch": 22.64519277696437, "grad_norm": 0.18439550697803497, "learning_rate": 1.9843457604290306e-05, "loss": 0.1348, "step": 232000 }, { "epoch": 22.693997071742313, "grad_norm": 0.1892482042312622, "learning_rate": 1.974972928760744e-05, "loss": 0.1347, "step": 232500 }, { "epoch": 22.742801366520254, "grad_norm": 0.18848678469657898, "learning_rate": 1.9656078110458585e-05, "loss": 0.1347, "step": 233000 }, { "epoch": 22.791605661298195, "grad_norm": 0.1945199817419052, "learning_rate": 1.9562505448812453e-05, "loss": 0.1346, "step": 233500 }, { "epoch": 22.840409956076133, "grad_norm": 0.1922951489686966, "learning_rate": 1.946901267748417e-05, "loss": 0.1346, "step": 234000 }, { "epoch": 22.889214250854074, "grad_norm": 0.18175315856933594, "learning_rate": 1.937560117011504e-05, "loss": 0.1347, "step": 234500 }, { "epoch": 22.938018545632016, "grad_norm": 0.18632791936397552, "learning_rate": 1.9282272299152416e-05, "loss": 0.1344, "step": 235000 }, { "epoch": 22.986822840409957, "grad_norm": 0.1905319094657898, "learning_rate": 1.9189027435829533e-05, "loss": 0.1344, "step": 235500 }, { "epoch": 23.0, "eval_loss": 0.11993886530399323, "eval_runtime": 27.3709, "eval_samples_per_second": 287.75, "eval_steps_per_second": 0.402, "step": 235635 }, { "epoch": 23.035627135187898, "grad_norm": 0.19479139149188995, "learning_rate": 1.909586795014532e-05, "loss": 0.1343, "step": 236000 }, { "epoch": 23.084431429965836, "grad_norm": 0.1937461495399475, "learning_rate": 1.9002795210844315e-05, "loss": 0.1341, "step": 236500 }, { "epoch": 23.133235724743777, "grad_norm": 0.1967599093914032, "learning_rate": 1.890981058539652e-05, "loss": 0.1342, "step": 237000 }, { "epoch": 23.182040019521718, "grad_norm": 0.1805768460035324, "learning_rate": 1.8816915439977333e-05, "loss": 0.1342, "step": 237500 }, { "epoch": 23.23084431429966, "grad_norm": 0.1909085512161255, "learning_rate": 1.8724111139447474e-05, "loss": 0.1342, "step": 238000 }, { "epoch": 23.279648609077597, "grad_norm": 0.18977640569210052, "learning_rate": 1.863139904733291e-05, "loss": 0.134, "step": 238500 }, { "epoch": 23.32845290385554, "grad_norm": 0.20453977584838867, "learning_rate": 1.853878052580485e-05, "loss": 0.1341, "step": 239000 }, { "epoch": 23.37725719863348, "grad_norm": 0.18761217594146729, "learning_rate": 1.8446256935659725e-05, "loss": 0.1341, "step": 239500 }, { "epoch": 23.42606149341142, "grad_norm": 0.19060368835926056, "learning_rate": 1.835382963629916e-05, "loss": 0.1341, "step": 240000 }, { "epoch": 23.474865788189362, "grad_norm": 0.18985587358474731, "learning_rate": 1.8261499985710057e-05, "loss": 0.1341, "step": 240500 }, { "epoch": 23.5236700829673, "grad_norm": 0.18299199640750885, "learning_rate": 1.81692693404446e-05, "loss": 0.1339, "step": 241000 }, { "epoch": 23.57247437774524, "grad_norm": 0.19359715282917023, "learning_rate": 1.807713905560034e-05, "loss": 0.1337, "step": 241500 }, { "epoch": 23.621278672523182, "grad_norm": 0.19137471914291382, "learning_rate": 1.79851104848003e-05, "loss": 0.1342, "step": 242000 }, { "epoch": 23.670082967301123, "grad_norm": 0.19408565759658813, "learning_rate": 1.7893184980173038e-05, "loss": 0.134, "step": 242500 }, { "epoch": 23.718887262079065, "grad_norm": 0.19133317470550537, "learning_rate": 1.7801363892332846e-05, "loss": 0.1339, "step": 243000 }, { "epoch": 23.767691556857002, "grad_norm": 0.18910160660743713, "learning_rate": 1.770964857035986e-05, "loss": 0.1338, "step": 243500 }, { "epoch": 23.816495851634944, "grad_norm": 0.1919185370206833, "learning_rate": 1.7618040361780246e-05, "loss": 0.134, "step": 244000 }, { "epoch": 23.865300146412885, "grad_norm": 0.19021070003509521, "learning_rate": 1.7526540612546433e-05, "loss": 0.1336, "step": 244500 }, { "epoch": 23.914104441190826, "grad_norm": 0.1913536936044693, "learning_rate": 1.743515066701726e-05, "loss": 0.1338, "step": 245000 }, { "epoch": 23.962908735968764, "grad_norm": 0.19384372234344482, "learning_rate": 1.734387186793834e-05, "loss": 0.1336, "step": 245500 }, { "epoch": 24.0, "eval_loss": 0.12013557553291321, "eval_runtime": 25.8494, "eval_samples_per_second": 304.688, "eval_steps_per_second": 0.426, "step": 245880 }, { "epoch": 24.011713030746705, "grad_norm": 0.1967899650335312, "learning_rate": 1.7252705556422237e-05, "loss": 0.1337, "step": 246000 }, { "epoch": 24.060517325524646, "grad_norm": 0.19760966300964355, "learning_rate": 1.7161653071928774e-05, "loss": 0.1335, "step": 246500 }, { "epoch": 24.109321620302588, "grad_norm": 0.20642146468162537, "learning_rate": 1.707071575224541e-05, "loss": 0.1334, "step": 247000 }, { "epoch": 24.15812591508053, "grad_norm": 0.1874382495880127, "learning_rate": 1.6979894933467533e-05, "loss": 0.1334, "step": 247500 }, { "epoch": 24.206930209858466, "grad_norm": 0.1970881223678589, "learning_rate": 1.6889191949978827e-05, "loss": 0.1336, "step": 248000 }, { "epoch": 24.255734504636408, "grad_norm": 0.1981177181005478, "learning_rate": 1.6798608134431705e-05, "loss": 0.1335, "step": 248500 }, { "epoch": 24.30453879941435, "grad_norm": 0.1884346753358841, "learning_rate": 1.6708144817727685e-05, "loss": 0.1331, "step": 249000 }, { "epoch": 24.35334309419229, "grad_norm": 0.20689553022384644, "learning_rate": 1.6617803328997877e-05, "loss": 0.1336, "step": 249500 }, { "epoch": 24.402147388970228, "grad_norm": 0.21094737946987152, "learning_rate": 1.6527584995583428e-05, "loss": 0.1334, "step": 250000 }, { "epoch": 24.45095168374817, "grad_norm": 0.19117839634418488, "learning_rate": 1.643749114301602e-05, "loss": 0.133, "step": 250500 }, { "epoch": 24.49975597852611, "grad_norm": 0.1867818832397461, "learning_rate": 1.6347523094998413e-05, "loss": 0.1333, "step": 251000 }, { "epoch": 24.54856027330405, "grad_norm": 0.20169509947299957, "learning_rate": 1.6257682173384987e-05, "loss": 0.1332, "step": 251500 }, { "epoch": 24.597364568081993, "grad_norm": 0.20369745790958405, "learning_rate": 1.616796969816229e-05, "loss": 0.1332, "step": 252000 }, { "epoch": 24.64616886285993, "grad_norm": 0.19687892496585846, "learning_rate": 1.607838698742972e-05, "loss": 0.1334, "step": 252500 }, { "epoch": 24.69497315763787, "grad_norm": 0.20486027002334595, "learning_rate": 1.5988935357380068e-05, "loss": 0.1331, "step": 253000 }, { "epoch": 24.743777452415813, "grad_norm": 0.18960419297218323, "learning_rate": 1.5899616122280248e-05, "loss": 0.1329, "step": 253500 }, { "epoch": 24.792581747193754, "grad_norm": 0.18826229870319366, "learning_rate": 1.581043059445197e-05, "loss": 0.1331, "step": 254000 }, { "epoch": 24.841386041971692, "grad_norm": 0.18491852283477783, "learning_rate": 1.572138008425242e-05, "loss": 0.1329, "step": 254500 }, { "epoch": 24.890190336749633, "grad_norm": 0.19611585140228271, "learning_rate": 1.5632465900055073e-05, "loss": 0.1329, "step": 255000 }, { "epoch": 24.938994631527574, "grad_norm": 0.1943751573562622, "learning_rate": 1.5543689348230415e-05, "loss": 0.1329, "step": 255500 }, { "epoch": 24.987798926305516, "grad_norm": 0.1918455809354782, "learning_rate": 1.545505173312678e-05, "loss": 0.1328, "step": 256000 }, { "epoch": 25.0, "eval_loss": 0.11874233186244965, "eval_runtime": 24.4939, "eval_samples_per_second": 321.55, "eval_steps_per_second": 0.449, "step": 256125 } ], "logging_steps": 500, "max_steps": 409800, "num_input_tokens_seen": 0, "num_train_epochs": 40, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.917586395968569e+18, "train_batch_size": 384, "trial_name": null, "trial_params": null }