| { | |
| "best_metric": 0.11874233186244965, | |
| "best_model_checkpoint": "./weights/OurNewMoleculeModel-v1/checkpoint-256125", | |
| "epoch": 25.0, | |
| "eval_steps": 500, | |
| "global_step": 256125, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.04880429477794046, | |
| "grad_norm": 1.3488572835922241, | |
| "learning_rate": 4.99998163439129e-05, | |
| "loss": 2.2985, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.09760858955588092, | |
| "grad_norm": 0.4086189568042755, | |
| "learning_rate": 4.999926537834994e-05, | |
| "loss": 2.0404, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.14641288433382138, | |
| "grad_norm": 0.5561855435371399, | |
| "learning_rate": 4.999834711140619e-05, | |
| "loss": 2.0324, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.19521717911176184, | |
| "grad_norm": 0.2902628779411316, | |
| "learning_rate": 4.999706155657327e-05, | |
| "loss": 2.0287, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.2440214738897023, | |
| "grad_norm": 0.7554148435592651, | |
| "learning_rate": 4.999540873273918e-05, | |
| "loss": 2.0277, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.29282576866764276, | |
| "grad_norm": 0.34928998351097107, | |
| "learning_rate": 4.999338866418801e-05, | |
| "loss": 2.0227, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.3416300634455832, | |
| "grad_norm": 0.5614811182022095, | |
| "learning_rate": 4.999100138059959e-05, | |
| "loss": 2.0122, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.3904343582235237, | |
| "grad_norm": 0.5667726993560791, | |
| "learning_rate": 4.998824691704905e-05, | |
| "loss": 1.9914, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.43923865300146414, | |
| "grad_norm": 1.2578080892562866, | |
| "learning_rate": 4.998512531400633e-05, | |
| "loss": 1.9431, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.4880429477794046, | |
| "grad_norm": 1.1142494678497314, | |
| "learning_rate": 4.9981636617335516e-05, | |
| "loss": 1.578, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.5368472425573451, | |
| "grad_norm": 0.9630743861198425, | |
| "learning_rate": 4.997778087829424e-05, | |
| "loss": 1.2667, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.5856515373352855, | |
| "grad_norm": 0.7279083132743835, | |
| "learning_rate": 4.9973558153532925e-05, | |
| "loss": 1.0208, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.634455832113226, | |
| "grad_norm": 0.8263267874717712, | |
| "learning_rate": 4.996896850509387e-05, | |
| "loss": 0.885, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.6832601268911664, | |
| "grad_norm": 0.7792947292327881, | |
| "learning_rate": 4.996401200041044e-05, | |
| "loss": 0.8054, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.7320644216691069, | |
| "grad_norm": 0.6826034188270569, | |
| "learning_rate": 4.9958688712306015e-05, | |
| "loss": 0.7463, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.7808687164470474, | |
| "grad_norm": 0.7101658582687378, | |
| "learning_rate": 4.995299871899292e-05, | |
| "loss": 0.6952, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.8296730112249878, | |
| "grad_norm": 0.5552261471748352, | |
| "learning_rate": 4.994694210407133e-05, | |
| "loss": 0.6516, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.8784773060029283, | |
| "grad_norm": 0.5594379305839539, | |
| "learning_rate": 4.994051895652797e-05, | |
| "loss": 0.6156, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.9272816007808687, | |
| "grad_norm": 0.7451700568199158, | |
| "learning_rate": 4.993372937073485e-05, | |
| "loss": 0.5843, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.9760858955588092, | |
| "grad_norm": 0.5584864020347595, | |
| "learning_rate": 4.9926573446447875e-05, | |
| "loss": 0.5583, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.4663134217262268, | |
| "eval_runtime": 27.1679, | |
| "eval_samples_per_second": 289.901, | |
| "eval_steps_per_second": 0.405, | |
| "step": 10245 | |
| }, | |
| { | |
| "epoch": 1.0248901903367496, | |
| "grad_norm": 0.527858555316925, | |
| "learning_rate": 4.9919051288805364e-05, | |
| "loss": 0.5353, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.0736944851146901, | |
| "grad_norm": 0.5612876415252686, | |
| "learning_rate": 4.9911163008326527e-05, | |
| "loss": 0.5154, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.1224987798926305, | |
| "grad_norm": 0.4924549460411072, | |
| "learning_rate": 4.990290872090982e-05, | |
| "loss": 0.4931, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.171303074670571, | |
| "grad_norm": 0.4243695139884949, | |
| "learning_rate": 4.9894288547831245e-05, | |
| "loss": 0.476, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.2201073694485114, | |
| "grad_norm": 0.5059812068939209, | |
| "learning_rate": 4.98853026157426e-05, | |
| "loss": 0.4609, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 1.268911664226452, | |
| "grad_norm": 0.4593505263328552, | |
| "learning_rate": 4.987595105666956e-05, | |
| "loss": 0.4468, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.3177159590043923, | |
| "grad_norm": 0.46688178181648254, | |
| "learning_rate": 4.9866234008009794e-05, | |
| "loss": 0.434, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 1.3665202537823329, | |
| "grad_norm": 0.4821254312992096, | |
| "learning_rate": 4.9856151612530905e-05, | |
| "loss": 0.4218, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.4153245485602732, | |
| "grad_norm": 0.4354498088359833, | |
| "learning_rate": 4.9845704018368364e-05, | |
| "loss": 0.4105, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.4641288433382138, | |
| "grad_norm": 0.4537793695926666, | |
| "learning_rate": 4.9834891379023305e-05, | |
| "loss": 0.3998, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.5129331381161544, | |
| "grad_norm": 0.37507402896881104, | |
| "learning_rate": 4.9823713853360294e-05, | |
| "loss": 0.3899, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.5617374328940947, | |
| "grad_norm": 0.40271782875061035, | |
| "learning_rate": 4.981217160560499e-05, | |
| "loss": 0.3812, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.610541727672035, | |
| "grad_norm": 0.3701293170452118, | |
| "learning_rate": 4.9800264805341694e-05, | |
| "loss": 0.373, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.6593460224499756, | |
| "grad_norm": 0.41362902522087097, | |
| "learning_rate": 4.978799362751094e-05, | |
| "loss": 0.3654, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.7081503172279162, | |
| "grad_norm": 0.3652186989784241, | |
| "learning_rate": 4.9775358252406836e-05, | |
| "loss": 0.3581, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.7569546120058566, | |
| "grad_norm": 0.366926908493042, | |
| "learning_rate": 4.9762358865674464e-05, | |
| "loss": 0.3515, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.805758906783797, | |
| "grad_norm": 0.4293728470802307, | |
| "learning_rate": 4.974899565830715e-05, | |
| "loss": 0.3449, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.8545632015617375, | |
| "grad_norm": 0.37214261293411255, | |
| "learning_rate": 4.973526882664364e-05, | |
| "loss": 0.3394, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.903367496339678, | |
| "grad_norm": 0.4047256112098694, | |
| "learning_rate": 4.9721178572365235e-05, | |
| "loss": 0.3337, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 1.9521717911176184, | |
| "grad_norm": 0.34720858931541443, | |
| "learning_rate": 4.9706725102492814e-05, | |
| "loss": 0.3287, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.28212064504623413, | |
| "eval_runtime": 23.8844, | |
| "eval_samples_per_second": 329.755, | |
| "eval_steps_per_second": 0.461, | |
| "step": 20490 | |
| }, | |
| { | |
| "epoch": 2.0009760858955588, | |
| "grad_norm": 0.37098678946495056, | |
| "learning_rate": 4.969190862938378e-05, | |
| "loss": 0.3237, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 2.049780380673499, | |
| "grad_norm": 0.3951970040798187, | |
| "learning_rate": 4.967672937072898e-05, | |
| "loss": 0.3191, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 2.09858467545144, | |
| "grad_norm": 0.3509838581085205, | |
| "learning_rate": 4.9661187549549476e-05, | |
| "loss": 0.3144, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 2.1473889702293802, | |
| "grad_norm": 0.35936230421066284, | |
| "learning_rate": 4.9645283394193274e-05, | |
| "loss": 0.3099, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 2.1961932650073206, | |
| "grad_norm": 0.3251510560512543, | |
| "learning_rate": 4.962901713833197e-05, | |
| "loss": 0.3063, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 2.244997559785261, | |
| "grad_norm": 0.33518901467323303, | |
| "learning_rate": 4.9612389020957306e-05, | |
| "loss": 0.3023, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 2.2938018545632017, | |
| "grad_norm": 0.3487328886985779, | |
| "learning_rate": 4.9595399286377686e-05, | |
| "loss": 0.2985, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 2.342606149341142, | |
| "grad_norm": 0.34018632769584656, | |
| "learning_rate": 4.9578048184214565e-05, | |
| "loss": 0.2952, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 2.3914104441190824, | |
| "grad_norm": 0.34304648637771606, | |
| "learning_rate": 4.956033596939879e-05, | |
| "loss": 0.2915, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 2.440214738897023, | |
| "grad_norm": 0.34716567397117615, | |
| "learning_rate": 4.9542262902166834e-05, | |
| "loss": 0.2883, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 2.4890190336749636, | |
| "grad_norm": 0.3204454481601715, | |
| "learning_rate": 4.952382924805702e-05, | |
| "loss": 0.2853, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 2.537823328452904, | |
| "grad_norm": 0.3337819278240204, | |
| "learning_rate": 4.950503527790555e-05, | |
| "loss": 0.2821, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 2.5866276232308443, | |
| "grad_norm": 0.3394376039505005, | |
| "learning_rate": 4.948588126784261e-05, | |
| "loss": 0.2793, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 2.6354319180087846, | |
| "grad_norm": 0.3065101206302643, | |
| "learning_rate": 4.9466367499288213e-05, | |
| "loss": 0.2767, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 2.6842362127867254, | |
| "grad_norm": 0.30751967430114746, | |
| "learning_rate": 4.9446494258948176e-05, | |
| "loss": 0.2736, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 2.7330405075646658, | |
| "grad_norm": 0.31060898303985596, | |
| "learning_rate": 4.942626183880981e-05, | |
| "loss": 0.2712, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 2.781844802342606, | |
| "grad_norm": 0.38574928045272827, | |
| "learning_rate": 4.940567053613768e-05, | |
| "loss": 0.2688, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 2.8306490971205465, | |
| "grad_norm": 0.31712907552719116, | |
| "learning_rate": 4.938472065346925e-05, | |
| "loss": 0.2669, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 2.879453391898487, | |
| "grad_norm": 0.2964314818382263, | |
| "learning_rate": 4.9363412498610385e-05, | |
| "loss": 0.2641, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 2.9282576866764276, | |
| "grad_norm": 0.30216559767723083, | |
| "learning_rate": 4.934174638463087e-05, | |
| "loss": 0.2616, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 2.977061981454368, | |
| "grad_norm": 0.2843080461025238, | |
| "learning_rate": 4.9319722629859813e-05, | |
| "loss": 0.2598, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.2259799689054489, | |
| "eval_runtime": 24.7473, | |
| "eval_samples_per_second": 318.256, | |
| "eval_steps_per_second": 0.444, | |
| "step": 30735 | |
| }, | |
| { | |
| "epoch": 3.0258662762323083, | |
| "grad_norm": 0.3090941905975342, | |
| "learning_rate": 4.9297341557880936e-05, | |
| "loss": 0.2577, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 3.074670571010249, | |
| "grad_norm": 0.29751360416412354, | |
| "learning_rate": 4.927460349752785e-05, | |
| "loss": 0.2554, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 3.1234748657881894, | |
| "grad_norm": 0.2908008396625519, | |
| "learning_rate": 4.925150878287921e-05, | |
| "loss": 0.2537, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 3.17227916056613, | |
| "grad_norm": 0.29090872406959534, | |
| "learning_rate": 4.92280577532538e-05, | |
| "loss": 0.2518, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 3.22108345534407, | |
| "grad_norm": 0.301048219203949, | |
| "learning_rate": 4.9204250753205585e-05, | |
| "loss": 0.2503, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 3.2698877501220105, | |
| "grad_norm": 0.2861855924129486, | |
| "learning_rate": 4.91800881325186e-05, | |
| "loss": 0.2482, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 3.3186920448999513, | |
| "grad_norm": 0.28286224603652954, | |
| "learning_rate": 4.915557024620183e-05, | |
| "loss": 0.2466, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 3.3674963396778916, | |
| "grad_norm": 0.3069954514503479, | |
| "learning_rate": 4.913069745448399e-05, | |
| "loss": 0.2451, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 3.416300634455832, | |
| "grad_norm": 0.2962004542350769, | |
| "learning_rate": 4.910547012280827e-05, | |
| "loss": 0.2436, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 3.465104929233773, | |
| "grad_norm": 0.2845563590526581, | |
| "learning_rate": 4.907988862182689e-05, | |
| "loss": 0.2421, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 3.513909224011713, | |
| "grad_norm": 0.26839151978492737, | |
| "learning_rate": 4.905395332739574e-05, | |
| "loss": 0.2406, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 3.5627135187896535, | |
| "grad_norm": 0.27475783228874207, | |
| "learning_rate": 4.902766462056877e-05, | |
| "loss": 0.2389, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 3.611517813567594, | |
| "grad_norm": 0.26468226313591003, | |
| "learning_rate": 4.900102288759249e-05, | |
| "loss": 0.2374, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 3.660322108345534, | |
| "grad_norm": 0.276924729347229, | |
| "learning_rate": 4.89740285199002e-05, | |
| "loss": 0.2361, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 3.709126403123475, | |
| "grad_norm": 0.2739529609680176, | |
| "learning_rate": 4.894668191410629e-05, | |
| "loss": 0.2348, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 3.7579306979014153, | |
| "grad_norm": 0.26919183135032654, | |
| "learning_rate": 4.8918983472000433e-05, | |
| "loss": 0.2336, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 3.8067349926793557, | |
| "grad_norm": 0.29099541902542114, | |
| "learning_rate": 4.88909336005416e-05, | |
| "loss": 0.2323, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 3.8555392874572965, | |
| "grad_norm": 0.2892494797706604, | |
| "learning_rate": 4.8862532711852184e-05, | |
| "loss": 0.2308, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 3.904343582235237, | |
| "grad_norm": 0.29746654629707336, | |
| "learning_rate": 4.883378122321186e-05, | |
| "loss": 0.2292, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 3.953147877013177, | |
| "grad_norm": 0.26809337735176086, | |
| "learning_rate": 4.8804679557051495e-05, | |
| "loss": 0.2283, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.19832605123519897, | |
| "eval_runtime": 25.8272, | |
| "eval_samples_per_second": 304.95, | |
| "eval_steps_per_second": 0.426, | |
| "step": 40980 | |
| }, | |
| { | |
| "epoch": 4.0019521717911175, | |
| "grad_norm": 0.2542949616909027, | |
| "learning_rate": 4.877522814094696e-05, | |
| "loss": 0.2272, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 4.050756466569058, | |
| "grad_norm": 0.2937975525856018, | |
| "learning_rate": 4.8745427407612776e-05, | |
| "loss": 0.2258, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 4.099560761346998, | |
| "grad_norm": 0.2632514536380768, | |
| "learning_rate": 4.8715277794895855e-05, | |
| "loss": 0.2256, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 4.148365056124939, | |
| "grad_norm": 0.2573137879371643, | |
| "learning_rate": 4.8684779745768974e-05, | |
| "loss": 0.2237, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 4.19716935090288, | |
| "grad_norm": 0.2653585970401764, | |
| "learning_rate": 4.8653933708324325e-05, | |
| "loss": 0.223, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 4.24597364568082, | |
| "grad_norm": 0.25552433729171753, | |
| "learning_rate": 4.862274013576691e-05, | |
| "loss": 0.2218, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 4.2947779404587605, | |
| "grad_norm": 0.2834942936897278, | |
| "learning_rate": 4.859119948640789e-05, | |
| "loss": 0.2211, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 4.343582235236701, | |
| "grad_norm": 0.2516108751296997, | |
| "learning_rate": 4.855931222365784e-05, | |
| "loss": 0.2202, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 4.392386530014641, | |
| "grad_norm": 0.301641583442688, | |
| "learning_rate": 4.852707881601996e-05, | |
| "loss": 0.2188, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 4.4411908247925815, | |
| "grad_norm": 0.26468151807785034, | |
| "learning_rate": 4.849449973708316e-05, | |
| "loss": 0.2176, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 4.489995119570522, | |
| "grad_norm": 0.274828165769577, | |
| "learning_rate": 4.846157546551516e-05, | |
| "loss": 0.2171, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 4.538799414348462, | |
| "grad_norm": 0.27979806065559387, | |
| "learning_rate": 4.842830648505535e-05, | |
| "loss": 0.2161, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 4.5876037091264035, | |
| "grad_norm": 0.26616737246513367, | |
| "learning_rate": 4.839469328450783e-05, | |
| "loss": 0.2149, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 4.636408003904344, | |
| "grad_norm": 0.24560213088989258, | |
| "learning_rate": 4.8360736357734083e-05, | |
| "loss": 0.2145, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 4.685212298682284, | |
| "grad_norm": 0.25653526186943054, | |
| "learning_rate": 4.8326436203645833e-05, | |
| "loss": 0.213, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 4.7340165934602245, | |
| "grad_norm": 0.2549044191837311, | |
| "learning_rate": 4.829179332619763e-05, | |
| "loss": 0.2124, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 4.782820888238165, | |
| "grad_norm": 0.24373945593833923, | |
| "learning_rate": 4.8256808234379516e-05, | |
| "loss": 0.2115, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 4.831625183016105, | |
| "grad_norm": 0.24188542366027832, | |
| "learning_rate": 4.822148144220948e-05, | |
| "loss": 0.2104, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 4.880429477794046, | |
| "grad_norm": 0.2541993260383606, | |
| "learning_rate": 4.8185813468725974e-05, | |
| "loss": 0.2102, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 4.929233772571987, | |
| "grad_norm": 0.266525536775589, | |
| "learning_rate": 4.814980483798022e-05, | |
| "loss": 0.2092, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 4.978038067349927, | |
| "grad_norm": 0.24894855916500092, | |
| "learning_rate": 4.811345607902855e-05, | |
| "loss": 0.2084, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.18130482733249664, | |
| "eval_runtime": 23.9311, | |
| "eval_samples_per_second": 329.111, | |
| "eval_steps_per_second": 0.46, | |
| "step": 51225 | |
| }, | |
| { | |
| "epoch": 5.0268423621278675, | |
| "grad_norm": 0.23973380029201508, | |
| "learning_rate": 4.8076767725924654e-05, | |
| "loss": 0.2076, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 5.075646656905808, | |
| "grad_norm": 0.23818284273147583, | |
| "learning_rate": 4.803974031771166e-05, | |
| "loss": 0.2067, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 5.124450951683748, | |
| "grad_norm": 0.23774628341197968, | |
| "learning_rate": 4.8002374398414295e-05, | |
| "loss": 0.2061, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 5.1732552464616886, | |
| "grad_norm": 0.2544199824333191, | |
| "learning_rate": 4.796467051703083e-05, | |
| "loss": 0.2051, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 5.222059541239629, | |
| "grad_norm": 0.24035200476646423, | |
| "learning_rate": 4.7926629227525066e-05, | |
| "loss": 0.2042, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 5.270863836017569, | |
| "grad_norm": 0.25180783867836, | |
| "learning_rate": 4.788825108881814e-05, | |
| "loss": 0.2037, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 5.31966813079551, | |
| "grad_norm": 0.25087398290634155, | |
| "learning_rate": 4.7849536664780346e-05, | |
| "loss": 0.2032, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 5.368472425573451, | |
| "grad_norm": 0.2356226146221161, | |
| "learning_rate": 4.7810486524222885e-05, | |
| "loss": 0.2024, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 5.417276720351391, | |
| "grad_norm": 0.25190770626068115, | |
| "learning_rate": 4.777110124088942e-05, | |
| "loss": 0.2019, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 5.4660810151293315, | |
| "grad_norm": 0.24268530309200287, | |
| "learning_rate": 4.77313813934477e-05, | |
| "loss": 0.2011, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 5.514885309907272, | |
| "grad_norm": 0.23932518064975739, | |
| "learning_rate": 4.7691327565481095e-05, | |
| "loss": 0.2005, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 5.563689604685212, | |
| "grad_norm": 0.23731377720832825, | |
| "learning_rate": 4.765094034547992e-05, | |
| "loss": 0.1996, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 5.612493899463153, | |
| "grad_norm": 0.2333805412054062, | |
| "learning_rate": 4.76102203268329e-05, | |
| "loss": 0.1989, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 5.661298194241093, | |
| "grad_norm": 0.24407994747161865, | |
| "learning_rate": 4.756916810781838e-05, | |
| "loss": 0.1987, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 5.710102489019034, | |
| "grad_norm": 0.23789800703525543, | |
| "learning_rate": 4.752778429159554e-05, | |
| "loss": 0.1979, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 5.7589067837969745, | |
| "grad_norm": 0.24565084278583527, | |
| "learning_rate": 4.7486069486195564e-05, | |
| "loss": 0.1969, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 5.807711078574915, | |
| "grad_norm": 0.26797595620155334, | |
| "learning_rate": 4.744402430451269e-05, | |
| "loss": 0.1965, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 5.856515373352855, | |
| "grad_norm": 0.25408676266670227, | |
| "learning_rate": 4.74016493642952e-05, | |
| "loss": 0.1955, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 5.905319668130796, | |
| "grad_norm": 0.23447421193122864, | |
| "learning_rate": 4.7358945288136344e-05, | |
| "loss": 0.1949, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 5.954123962908736, | |
| "grad_norm": 0.2329121083021164, | |
| "learning_rate": 4.7315912703465225e-05, | |
| "loss": 0.1948, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 0.1711394339799881, | |
| "eval_runtime": 27.596, | |
| "eval_samples_per_second": 285.403, | |
| "eval_steps_per_second": 0.399, | |
| "step": 61470 | |
| }, | |
| { | |
| "epoch": 6.002928257686676, | |
| "grad_norm": 0.2361510992050171, | |
| "learning_rate": 4.727255224253751e-05, | |
| "loss": 0.1941, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 6.051732552464617, | |
| "grad_norm": 0.23526506125926971, | |
| "learning_rate": 4.7228864542426224e-05, | |
| "loss": 0.1934, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 6.100536847242557, | |
| "grad_norm": 0.24888668954372406, | |
| "learning_rate": 4.7184850245012316e-05, | |
| "loss": 0.1928, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 6.149341142020498, | |
| "grad_norm": 0.24024108052253723, | |
| "learning_rate": 4.714050999697528e-05, | |
| "loss": 0.1924, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 6.1981454367984385, | |
| "grad_norm": 0.24707584083080292, | |
| "learning_rate": 4.709584444978364e-05, | |
| "loss": 0.192, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 6.246949731576379, | |
| "grad_norm": 0.2352433204650879, | |
| "learning_rate": 4.705085425968536e-05, | |
| "loss": 0.1915, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 6.295754026354319, | |
| "grad_norm": 0.24224288761615753, | |
| "learning_rate": 4.700554008769823e-05, | |
| "loss": 0.1907, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 6.34455832113226, | |
| "grad_norm": 0.2216614931821823, | |
| "learning_rate": 4.6959902599600125e-05, | |
| "loss": 0.1902, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 6.3933626159102, | |
| "grad_norm": 0.22495177388191223, | |
| "learning_rate": 4.691394246591925e-05, | |
| "loss": 0.1899, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 6.44216691068814, | |
| "grad_norm": 0.22609297931194305, | |
| "learning_rate": 4.686766036192426e-05, | |
| "loss": 0.1891, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 6.490971205466081, | |
| "grad_norm": 0.24654404819011688, | |
| "learning_rate": 4.682105696761436e-05, | |
| "loss": 0.1889, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 6.539775500244021, | |
| "grad_norm": 0.2228369563817978, | |
| "learning_rate": 4.6774132967709336e-05, | |
| "loss": 0.1881, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 6.588579795021962, | |
| "grad_norm": 0.21981576085090637, | |
| "learning_rate": 4.6726889051639436e-05, | |
| "loss": 0.1878, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 6.637384089799903, | |
| "grad_norm": 0.22510704398155212, | |
| "learning_rate": 4.6679325913535266e-05, | |
| "loss": 0.1871, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 6.686188384577843, | |
| "grad_norm": 0.24267421662807465, | |
| "learning_rate": 4.663144425221763e-05, | |
| "loss": 0.1867, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 6.734992679355783, | |
| "grad_norm": 0.2170720249414444, | |
| "learning_rate": 4.65832447711872e-05, | |
| "loss": 0.1862, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 6.783796974133724, | |
| "grad_norm": 0.25550180673599243, | |
| "learning_rate": 4.653472817861425e-05, | |
| "loss": 0.1857, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 6.832601268911664, | |
| "grad_norm": 0.23408746719360352, | |
| "learning_rate": 4.648589518732815e-05, | |
| "loss": 0.1853, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 6.881405563689604, | |
| "grad_norm": 0.26076194643974304, | |
| "learning_rate": 4.6436746514807e-05, | |
| "loss": 0.1849, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 6.930209858467546, | |
| "grad_norm": 0.21694616973400116, | |
| "learning_rate": 4.638728288316704e-05, | |
| "loss": 0.184, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 6.979014153245486, | |
| "grad_norm": 0.21888791024684906, | |
| "learning_rate": 4.633750501915203e-05, | |
| "loss": 0.184, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 0.16187380254268646, | |
| "eval_runtime": 26.1, | |
| "eval_samples_per_second": 301.762, | |
| "eval_steps_per_second": 0.421, | |
| "step": 71715 | |
| }, | |
| { | |
| "epoch": 7.027818448023426, | |
| "grad_norm": 0.22506974637508392, | |
| "learning_rate": 4.628741365412258e-05, | |
| "loss": 0.1836, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 7.076622742801367, | |
| "grad_norm": 0.21344968676567078, | |
| "learning_rate": 4.623700952404542e-05, | |
| "loss": 0.1832, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 7.125427037579307, | |
| "grad_norm": 0.22301891446113586, | |
| "learning_rate": 4.618629336948258e-05, | |
| "loss": 0.1826, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 7.174231332357247, | |
| "grad_norm": 0.2228812873363495, | |
| "learning_rate": 4.6135265935580494e-05, | |
| "loss": 0.182, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 7.223035627135188, | |
| "grad_norm": 0.24568694829940796, | |
| "learning_rate": 4.6083927972059084e-05, | |
| "loss": 0.1814, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 7.271839921913128, | |
| "grad_norm": 0.23808668553829193, | |
| "learning_rate": 4.603228023320069e-05, | |
| "loss": 0.1816, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 7.320644216691068, | |
| "grad_norm": 0.21967822313308716, | |
| "learning_rate": 4.598032347783905e-05, | |
| "loss": 0.1809, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 7.36944851146901, | |
| "grad_norm": 0.20847086608409882, | |
| "learning_rate": 4.5928058469348115e-05, | |
| "loss": 0.1806, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 7.41825280624695, | |
| "grad_norm": 0.22811928391456604, | |
| "learning_rate": 4.587548597563084e-05, | |
| "loss": 0.18, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 7.46705710102489, | |
| "grad_norm": 0.22424574196338654, | |
| "learning_rate": 4.582260676910791e-05, | |
| "loss": 0.1794, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 7.515861395802831, | |
| "grad_norm": 0.22317995131015778, | |
| "learning_rate": 4.5769421626706376e-05, | |
| "loss": 0.1793, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 7.564665690580771, | |
| "grad_norm": 0.21519626677036285, | |
| "learning_rate": 4.571593132984825e-05, | |
| "loss": 0.1789, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 7.613469985358711, | |
| "grad_norm": 0.2195836454629898, | |
| "learning_rate": 4.566213666443901e-05, | |
| "loss": 0.1784, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 7.662274280136652, | |
| "grad_norm": 0.23087261617183685, | |
| "learning_rate": 4.56080384208561e-05, | |
| "loss": 0.1778, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 7.711078574914593, | |
| "grad_norm": 0.2173396646976471, | |
| "learning_rate": 4.5553637393937234e-05, | |
| "loss": 0.1777, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 7.759882869692533, | |
| "grad_norm": 0.22740761935710907, | |
| "learning_rate": 4.54989343829688e-05, | |
| "loss": 0.1774, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 7.808687164470474, | |
| "grad_norm": 0.20074845850467682, | |
| "learning_rate": 4.544393019167408e-05, | |
| "loss": 0.1768, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 7.857491459248414, | |
| "grad_norm": 0.21903088688850403, | |
| "learning_rate": 4.538862562820143e-05, | |
| "loss": 0.1766, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 7.906295754026354, | |
| "grad_norm": 0.21944737434387207, | |
| "learning_rate": 4.533302150511243e-05, | |
| "loss": 0.1763, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 7.955100048804295, | |
| "grad_norm": 0.22298942506313324, | |
| "learning_rate": 4.5277118639369935e-05, | |
| "loss": 0.1758, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 0.15350797772407532, | |
| "eval_runtime": 25.6287, | |
| "eval_samples_per_second": 307.312, | |
| "eval_steps_per_second": 0.429, | |
| "step": 81960 | |
| }, | |
| { | |
| "epoch": 8.003904343582235, | |
| "grad_norm": 0.22214815020561218, | |
| "learning_rate": 4.5220917852326076e-05, | |
| "loss": 0.1758, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 8.052708638360176, | |
| "grad_norm": 0.22404730319976807, | |
| "learning_rate": 4.516441996971018e-05, | |
| "loss": 0.1751, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 8.101512933138116, | |
| "grad_norm": 0.21983228623867035, | |
| "learning_rate": 4.510762582161664e-05, | |
| "loss": 0.1747, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 8.150317227916057, | |
| "grad_norm": 0.23077502846717834, | |
| "learning_rate": 4.5050536242492756e-05, | |
| "loss": 0.1745, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 8.199121522693996, | |
| "grad_norm": 0.21954509615898132, | |
| "learning_rate": 4.4993152071126424e-05, | |
| "loss": 0.174, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 8.247925817471938, | |
| "grad_norm": 0.2204139679670334, | |
| "learning_rate": 4.493547415063382e-05, | |
| "loss": 0.1739, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 8.296730112249879, | |
| "grad_norm": 0.2210853546857834, | |
| "learning_rate": 4.487750332844704e-05, | |
| "loss": 0.1736, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 8.345534407027818, | |
| "grad_norm": 0.21140769124031067, | |
| "learning_rate": 4.4819240456301645e-05, | |
| "loss": 0.1732, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 8.39433870180576, | |
| "grad_norm": 0.22270390391349792, | |
| "learning_rate": 4.476068639022412e-05, | |
| "loss": 0.1726, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 8.443142996583699, | |
| "grad_norm": 0.2249361127614975, | |
| "learning_rate": 4.4701841990519324e-05, | |
| "loss": 0.1724, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 8.49194729136164, | |
| "grad_norm": 0.21904852986335754, | |
| "learning_rate": 4.4642708121757815e-05, | |
| "loss": 0.1723, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 8.54075158613958, | |
| "grad_norm": 0.21276357769966125, | |
| "learning_rate": 4.45832856527632e-05, | |
| "loss": 0.1717, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 8.589555880917521, | |
| "grad_norm": 0.21569614112377167, | |
| "learning_rate": 4.452357545659934e-05, | |
| "loss": 0.1714, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 8.63836017569546, | |
| "grad_norm": 0.21162466704845428, | |
| "learning_rate": 4.446357841055749e-05, | |
| "loss": 0.171, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 8.687164470473402, | |
| "grad_norm": 0.2211264669895172, | |
| "learning_rate": 4.4403295396143495e-05, | |
| "loss": 0.1709, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 8.735968765251343, | |
| "grad_norm": 0.20906701683998108, | |
| "learning_rate": 4.434272729906475e-05, | |
| "loss": 0.1707, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 8.784773060029282, | |
| "grad_norm": 0.2192634642124176, | |
| "learning_rate": 4.428187500921721e-05, | |
| "loss": 0.1701, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 8.833577354807224, | |
| "grad_norm": 0.2148887813091278, | |
| "learning_rate": 4.4220739420672376e-05, | |
| "loss": 0.1697, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 8.882381649585163, | |
| "grad_norm": 0.20213574171066284, | |
| "learning_rate": 4.4159321431664084e-05, | |
| "loss": 0.1695, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 8.931185944363104, | |
| "grad_norm": 0.21166318655014038, | |
| "learning_rate": 4.4097621944575324e-05, | |
| "loss": 0.1695, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 8.979990239141044, | |
| "grad_norm": 0.2028771936893463, | |
| "learning_rate": 4.4035641865925015e-05, | |
| "loss": 0.1693, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 0.15039320290088654, | |
| "eval_runtime": 24.9603, | |
| "eval_samples_per_second": 315.541, | |
| "eval_steps_per_second": 0.441, | |
| "step": 92205 | |
| }, | |
| { | |
| "epoch": 9.028794533918985, | |
| "grad_norm": 0.20694176852703094, | |
| "learning_rate": 4.3973382106354655e-05, | |
| "loss": 0.1686, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 9.077598828696924, | |
| "grad_norm": 0.21907255053520203, | |
| "learning_rate": 4.391084358061494e-05, | |
| "loss": 0.1684, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 9.126403123474866, | |
| "grad_norm": 0.21821749210357666, | |
| "learning_rate": 4.3848027207552364e-05, | |
| "loss": 0.1683, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 9.175207418252807, | |
| "grad_norm": 0.20274536311626434, | |
| "learning_rate": 4.3784933910095646e-05, | |
| "loss": 0.1677, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 9.224011713030746, | |
| "grad_norm": 0.20460249483585358, | |
| "learning_rate": 4.372156461524226e-05, | |
| "loss": 0.1676, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 9.272816007808688, | |
| "grad_norm": 0.21497923135757446, | |
| "learning_rate": 4.3657920254044726e-05, | |
| "loss": 0.1673, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 9.321620302586627, | |
| "grad_norm": 0.20720575749874115, | |
| "learning_rate": 4.3594001761597e-05, | |
| "loss": 0.1673, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 9.370424597364568, | |
| "grad_norm": 0.22322164475917816, | |
| "learning_rate": 4.352981007702071e-05, | |
| "loss": 0.1668, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 9.419228892142508, | |
| "grad_norm": 0.20235677063465118, | |
| "learning_rate": 4.346534614345132e-05, | |
| "loss": 0.1665, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 9.468033186920449, | |
| "grad_norm": 0.20581580698490143, | |
| "learning_rate": 4.340061090802436e-05, | |
| "loss": 0.1663, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 9.51683748169839, | |
| "grad_norm": 0.2083093822002411, | |
| "learning_rate": 4.333560532186142e-05, | |
| "loss": 0.166, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 9.56564177647633, | |
| "grad_norm": 0.20584595203399658, | |
| "learning_rate": 4.327033034005622e-05, | |
| "loss": 0.1657, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 9.614446071254271, | |
| "grad_norm": 0.20942457020282745, | |
| "learning_rate": 4.320478692166059e-05, | |
| "loss": 0.1656, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 9.66325036603221, | |
| "grad_norm": 0.20925435423851013, | |
| "learning_rate": 4.313897602967034e-05, | |
| "loss": 0.1654, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 9.712054660810152, | |
| "grad_norm": 0.22049732506275177, | |
| "learning_rate": 4.307289863101116e-05, | |
| "loss": 0.165, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 9.760858955588091, | |
| "grad_norm": 0.20315922796726227, | |
| "learning_rate": 4.300655569652437e-05, | |
| "loss": 0.1646, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 9.809663250366032, | |
| "grad_norm": 0.20489932596683502, | |
| "learning_rate": 4.293994820095264e-05, | |
| "loss": 0.1643, | |
| "step": 100500 | |
| }, | |
| { | |
| "epoch": 9.858467545143974, | |
| "grad_norm": 0.218128502368927, | |
| "learning_rate": 4.287307712292576e-05, | |
| "loss": 0.1643, | |
| "step": 101000 | |
| }, | |
| { | |
| "epoch": 9.907271839921913, | |
| "grad_norm": 0.20896770060062408, | |
| "learning_rate": 4.280594344494617e-05, | |
| "loss": 0.164, | |
| "step": 101500 | |
| }, | |
| { | |
| "epoch": 9.956076134699854, | |
| "grad_norm": 0.20507818460464478, | |
| "learning_rate": 4.273854815337455e-05, | |
| "loss": 0.1636, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 0.14604029059410095, | |
| "eval_runtime": 23.6994, | |
| "eval_samples_per_second": 332.329, | |
| "eval_steps_per_second": 0.464, | |
| "step": 102450 | |
| }, | |
| { | |
| "epoch": 10.004880429477794, | |
| "grad_norm": 0.20058345794677734, | |
| "learning_rate": 4.267089223841534e-05, | |
| "loss": 0.1636, | |
| "step": 102500 | |
| }, | |
| { | |
| "epoch": 10.053684724255735, | |
| "grad_norm": 0.2024383842945099, | |
| "learning_rate": 4.2602976694102205e-05, | |
| "loss": 0.1632, | |
| "step": 103000 | |
| }, | |
| { | |
| "epoch": 10.102489019033674, | |
| "grad_norm": 0.21127928793430328, | |
| "learning_rate": 4.253480251828337e-05, | |
| "loss": 0.1629, | |
| "step": 103500 | |
| }, | |
| { | |
| "epoch": 10.151293313811616, | |
| "grad_norm": 0.19965404272079468, | |
| "learning_rate": 4.246637071260705e-05, | |
| "loss": 0.1629, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 10.200097608589555, | |
| "grad_norm": 0.20860032737255096, | |
| "learning_rate": 4.239768228250664e-05, | |
| "loss": 0.1624, | |
| "step": 104500 | |
| }, | |
| { | |
| "epoch": 10.248901903367496, | |
| "grad_norm": 0.21451444923877716, | |
| "learning_rate": 4.232873823718602e-05, | |
| "loss": 0.1624, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 10.297706198145438, | |
| "grad_norm": 0.21074171364307404, | |
| "learning_rate": 4.225953958960466e-05, | |
| "loss": 0.1623, | |
| "step": 105500 | |
| }, | |
| { | |
| "epoch": 10.346510492923377, | |
| "grad_norm": 0.21716845035552979, | |
| "learning_rate": 4.21900873564628e-05, | |
| "loss": 0.1617, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 10.395314787701318, | |
| "grad_norm": 0.21059440076351166, | |
| "learning_rate": 4.2120382558186474e-05, | |
| "loss": 0.1617, | |
| "step": 106500 | |
| }, | |
| { | |
| "epoch": 10.444119082479258, | |
| "grad_norm": 0.22244805097579956, | |
| "learning_rate": 4.205042621891251e-05, | |
| "loss": 0.1614, | |
| "step": 107000 | |
| }, | |
| { | |
| "epoch": 10.492923377257199, | |
| "grad_norm": 0.21420615911483765, | |
| "learning_rate": 4.1980219366473514e-05, | |
| "loss": 0.1611, | |
| "step": 107500 | |
| }, | |
| { | |
| "epoch": 10.541727672035138, | |
| "grad_norm": 0.2058490365743637, | |
| "learning_rate": 4.1909763032382756e-05, | |
| "loss": 0.161, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 10.59053196681308, | |
| "grad_norm": 0.20425471663475037, | |
| "learning_rate": 4.1839058251819e-05, | |
| "loss": 0.1609, | |
| "step": 108500 | |
| }, | |
| { | |
| "epoch": 10.63933626159102, | |
| "grad_norm": 0.20022732019424438, | |
| "learning_rate": 4.176810606361132e-05, | |
| "loss": 0.1606, | |
| "step": 109000 | |
| }, | |
| { | |
| "epoch": 10.68814055636896, | |
| "grad_norm": 0.20972158014774323, | |
| "learning_rate": 4.169690751022382e-05, | |
| "loss": 0.1604, | |
| "step": 109500 | |
| }, | |
| { | |
| "epoch": 10.736944851146902, | |
| "grad_norm": 0.20773041248321533, | |
| "learning_rate": 4.1625463637740297e-05, | |
| "loss": 0.1602, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 10.785749145924841, | |
| "grad_norm": 0.2000124752521515, | |
| "learning_rate": 4.1553775495848934e-05, | |
| "loss": 0.1601, | |
| "step": 110500 | |
| }, | |
| { | |
| "epoch": 10.834553440702782, | |
| "grad_norm": 0.21309678256511688, | |
| "learning_rate": 4.148184413782682e-05, | |
| "loss": 0.1597, | |
| "step": 111000 | |
| }, | |
| { | |
| "epoch": 10.883357735480722, | |
| "grad_norm": 0.2132243663072586, | |
| "learning_rate": 4.14096706205245e-05, | |
| "loss": 0.1597, | |
| "step": 111500 | |
| }, | |
| { | |
| "epoch": 10.932162030258663, | |
| "grad_norm": 0.20744946599006653, | |
| "learning_rate": 4.133725600435042e-05, | |
| "loss": 0.1596, | |
| "step": 112000 | |
| }, | |
| { | |
| "epoch": 10.980966325036603, | |
| "grad_norm": 0.20575416088104248, | |
| "learning_rate": 4.12646013532554e-05, | |
| "loss": 0.159, | |
| "step": 112500 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_loss": 0.13835683465003967, | |
| "eval_runtime": 27.3854, | |
| "eval_samples_per_second": 287.598, | |
| "eval_steps_per_second": 0.402, | |
| "step": 112695 | |
| }, | |
| { | |
| "epoch": 11.029770619814544, | |
| "grad_norm": 0.2070922553539276, | |
| "learning_rate": 4.119170773471695e-05, | |
| "loss": 0.1589, | |
| "step": 113000 | |
| }, | |
| { | |
| "epoch": 11.078574914592485, | |
| "grad_norm": 0.20478574931621552, | |
| "learning_rate": 4.11185762197236e-05, | |
| "loss": 0.1586, | |
| "step": 113500 | |
| }, | |
| { | |
| "epoch": 11.127379209370424, | |
| "grad_norm": 0.1970217078924179, | |
| "learning_rate": 4.104520788275921e-05, | |
| "loss": 0.1586, | |
| "step": 114000 | |
| }, | |
| { | |
| "epoch": 11.176183504148366, | |
| "grad_norm": 0.19945302605628967, | |
| "learning_rate": 4.097160380178707e-05, | |
| "loss": 0.1582, | |
| "step": 114500 | |
| }, | |
| { | |
| "epoch": 11.224987798926305, | |
| "grad_norm": 0.19257070124149323, | |
| "learning_rate": 4.0897765058234224e-05, | |
| "loss": 0.1581, | |
| "step": 115000 | |
| }, | |
| { | |
| "epoch": 11.273792093704246, | |
| "grad_norm": 0.2013574242591858, | |
| "learning_rate": 4.082369273697542e-05, | |
| "loss": 0.158, | |
| "step": 115500 | |
| }, | |
| { | |
| "epoch": 11.322596388482186, | |
| "grad_norm": 0.21071788668632507, | |
| "learning_rate": 4.0749387926317295e-05, | |
| "loss": 0.1575, | |
| "step": 116000 | |
| }, | |
| { | |
| "epoch": 11.371400683260127, | |
| "grad_norm": 0.2010817974805832, | |
| "learning_rate": 4.0674851717982286e-05, | |
| "loss": 0.1574, | |
| "step": 116500 | |
| }, | |
| { | |
| "epoch": 11.420204978038067, | |
| "grad_norm": 0.20782026648521423, | |
| "learning_rate": 4.0600085207092695e-05, | |
| "loss": 0.1573, | |
| "step": 117000 | |
| }, | |
| { | |
| "epoch": 11.469009272816008, | |
| "grad_norm": 0.2070448100566864, | |
| "learning_rate": 4.052508949215447e-05, | |
| "loss": 0.1573, | |
| "step": 117500 | |
| }, | |
| { | |
| "epoch": 11.517813567593949, | |
| "grad_norm": 0.2066112607717514, | |
| "learning_rate": 4.044986567504121e-05, | |
| "loss": 0.1571, | |
| "step": 118000 | |
| }, | |
| { | |
| "epoch": 11.566617862371888, | |
| "grad_norm": 0.20482249557971954, | |
| "learning_rate": 4.037441486097785e-05, | |
| "loss": 0.1568, | |
| "step": 118500 | |
| }, | |
| { | |
| "epoch": 11.61542215714983, | |
| "grad_norm": 0.20141823589801788, | |
| "learning_rate": 4.02987381585245e-05, | |
| "loss": 0.1568, | |
| "step": 119000 | |
| }, | |
| { | |
| "epoch": 11.66422645192777, | |
| "grad_norm": 0.20818044245243073, | |
| "learning_rate": 4.02228366795601e-05, | |
| "loss": 0.1565, | |
| "step": 119500 | |
| }, | |
| { | |
| "epoch": 11.71303074670571, | |
| "grad_norm": 0.20303422212600708, | |
| "learning_rate": 4.014671153926619e-05, | |
| "loss": 0.1562, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 11.76183504148365, | |
| "grad_norm": 0.19013996422290802, | |
| "learning_rate": 4.007036385611036e-05, | |
| "loss": 0.156, | |
| "step": 120500 | |
| }, | |
| { | |
| "epoch": 11.810639336261591, | |
| "grad_norm": 0.20407438278198242, | |
| "learning_rate": 3.999379475182996e-05, | |
| "loss": 0.1562, | |
| "step": 121000 | |
| }, | |
| { | |
| "epoch": 11.859443631039532, | |
| "grad_norm": 0.1977386772632599, | |
| "learning_rate": 3.991700535141556e-05, | |
| "loss": 0.1556, | |
| "step": 121500 | |
| }, | |
| { | |
| "epoch": 11.908247925817472, | |
| "grad_norm": 0.19012510776519775, | |
| "learning_rate": 3.9839996783094435e-05, | |
| "loss": 0.1555, | |
| "step": 122000 | |
| }, | |
| { | |
| "epoch": 11.957052220595413, | |
| "grad_norm": 0.20828774571418762, | |
| "learning_rate": 3.976277017831396e-05, | |
| "loss": 0.1553, | |
| "step": 122500 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 0.13950450718402863, | |
| "eval_runtime": 28.1831, | |
| "eval_samples_per_second": 279.458, | |
| "eval_steps_per_second": 0.39, | |
| "step": 122940 | |
| }, | |
| { | |
| "epoch": 12.005856515373353, | |
| "grad_norm": 0.19804109632968903, | |
| "learning_rate": 3.968532667172501e-05, | |
| "loss": 0.1552, | |
| "step": 123000 | |
| }, | |
| { | |
| "epoch": 12.054660810151294, | |
| "grad_norm": 0.2035941481590271, | |
| "learning_rate": 3.960766740116531e-05, | |
| "loss": 0.1549, | |
| "step": 123500 | |
| }, | |
| { | |
| "epoch": 12.103465104929233, | |
| "grad_norm": 0.20041148364543915, | |
| "learning_rate": 3.952979350764268e-05, | |
| "loss": 0.1547, | |
| "step": 124000 | |
| }, | |
| { | |
| "epoch": 12.152269399707174, | |
| "grad_norm": 0.19230812788009644, | |
| "learning_rate": 3.945170613531828e-05, | |
| "loss": 0.1548, | |
| "step": 124500 | |
| }, | |
| { | |
| "epoch": 12.201073694485114, | |
| "grad_norm": 0.2065581977367401, | |
| "learning_rate": 3.9373406431489826e-05, | |
| "loss": 0.1544, | |
| "step": 125000 | |
| }, | |
| { | |
| "epoch": 12.249877989263055, | |
| "grad_norm": 0.19001494348049164, | |
| "learning_rate": 3.929489554657466e-05, | |
| "loss": 0.1543, | |
| "step": 125500 | |
| }, | |
| { | |
| "epoch": 12.298682284040996, | |
| "grad_norm": 0.20618636906147003, | |
| "learning_rate": 3.921617463409298e-05, | |
| "loss": 0.1537, | |
| "step": 126000 | |
| }, | |
| { | |
| "epoch": 12.347486578818936, | |
| "grad_norm": 0.1987367868423462, | |
| "learning_rate": 3.913724485065074e-05, | |
| "loss": 0.1542, | |
| "step": 126500 | |
| }, | |
| { | |
| "epoch": 12.396290873596877, | |
| "grad_norm": 0.1950555443763733, | |
| "learning_rate": 3.905810735592276e-05, | |
| "loss": 0.1537, | |
| "step": 127000 | |
| }, | |
| { | |
| "epoch": 12.445095168374817, | |
| "grad_norm": 0.20843225717544556, | |
| "learning_rate": 3.8978763312635645e-05, | |
| "loss": 0.1535, | |
| "step": 127500 | |
| }, | |
| { | |
| "epoch": 12.493899463152758, | |
| "grad_norm": 0.19434267282485962, | |
| "learning_rate": 3.889921388655073e-05, | |
| "loss": 0.1535, | |
| "step": 128000 | |
| }, | |
| { | |
| "epoch": 12.542703757930697, | |
| "grad_norm": 0.19898554682731628, | |
| "learning_rate": 3.881946024644691e-05, | |
| "loss": 0.1533, | |
| "step": 128500 | |
| }, | |
| { | |
| "epoch": 12.591508052708638, | |
| "grad_norm": 0.19874414801597595, | |
| "learning_rate": 3.873950356410352e-05, | |
| "loss": 0.1534, | |
| "step": 129000 | |
| }, | |
| { | |
| "epoch": 12.640312347486578, | |
| "grad_norm": 0.19424794614315033, | |
| "learning_rate": 3.865934501428304e-05, | |
| "loss": 0.1528, | |
| "step": 129500 | |
| }, | |
| { | |
| "epoch": 12.68911664226452, | |
| "grad_norm": 0.19256962835788727, | |
| "learning_rate": 3.8578985774713955e-05, | |
| "loss": 0.153, | |
| "step": 130000 | |
| }, | |
| { | |
| "epoch": 12.73792093704246, | |
| "grad_norm": 0.21424148976802826, | |
| "learning_rate": 3.8498427026073325e-05, | |
| "loss": 0.1527, | |
| "step": 130500 | |
| }, | |
| { | |
| "epoch": 12.7867252318204, | |
| "grad_norm": 0.20375344157218933, | |
| "learning_rate": 3.841766995196951e-05, | |
| "loss": 0.1526, | |
| "step": 131000 | |
| }, | |
| { | |
| "epoch": 12.835529526598341, | |
| "grad_norm": 0.2020910084247589, | |
| "learning_rate": 3.8336715738924787e-05, | |
| "loss": 0.1522, | |
| "step": 131500 | |
| }, | |
| { | |
| "epoch": 12.88433382137628, | |
| "grad_norm": 0.21570877730846405, | |
| "learning_rate": 3.825556557635787e-05, | |
| "loss": 0.1522, | |
| "step": 132000 | |
| }, | |
| { | |
| "epoch": 12.933138116154222, | |
| "grad_norm": 0.202886700630188, | |
| "learning_rate": 3.817422065656645e-05, | |
| "loss": 0.1522, | |
| "step": 132500 | |
| }, | |
| { | |
| "epoch": 12.981942410932161, | |
| "grad_norm": 0.19793546199798584, | |
| "learning_rate": 3.809268217470971e-05, | |
| "loss": 0.1519, | |
| "step": 133000 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_loss": 0.13297139108181, | |
| "eval_runtime": 27.6372, | |
| "eval_samples_per_second": 284.978, | |
| "eval_steps_per_second": 0.398, | |
| "step": 133185 | |
| }, | |
| { | |
| "epoch": 13.030746705710103, | |
| "grad_norm": 0.19757746160030365, | |
| "learning_rate": 3.8010951328790745e-05, | |
| "loss": 0.1519, | |
| "step": 133500 | |
| }, | |
| { | |
| "epoch": 13.079551000488044, | |
| "grad_norm": 0.1974940001964569, | |
| "learning_rate": 3.792902931963893e-05, | |
| "loss": 0.1515, | |
| "step": 134000 | |
| }, | |
| { | |
| "epoch": 13.128355295265983, | |
| "grad_norm": 0.19320930540561676, | |
| "learning_rate": 3.784691735089232e-05, | |
| "loss": 0.1517, | |
| "step": 134500 | |
| }, | |
| { | |
| "epoch": 13.177159590043924, | |
| "grad_norm": 0.2007361203432083, | |
| "learning_rate": 3.776461662897995e-05, | |
| "loss": 0.1513, | |
| "step": 135000 | |
| }, | |
| { | |
| "epoch": 13.225963884821864, | |
| "grad_norm": 0.1926342397928238, | |
| "learning_rate": 3.76821283631041e-05, | |
| "loss": 0.1514, | |
| "step": 135500 | |
| }, | |
| { | |
| "epoch": 13.274768179599805, | |
| "grad_norm": 0.18830719590187073, | |
| "learning_rate": 3.759945376522254e-05, | |
| "loss": 0.1512, | |
| "step": 136000 | |
| }, | |
| { | |
| "epoch": 13.323572474377745, | |
| "grad_norm": 0.1940852552652359, | |
| "learning_rate": 3.7516594050030715e-05, | |
| "loss": 0.151, | |
| "step": 136500 | |
| }, | |
| { | |
| "epoch": 13.372376769155686, | |
| "grad_norm": 0.1951226443052292, | |
| "learning_rate": 3.7433550434943934e-05, | |
| "loss": 0.1508, | |
| "step": 137000 | |
| }, | |
| { | |
| "epoch": 13.421181063933625, | |
| "grad_norm": 0.18908989429473877, | |
| "learning_rate": 3.735032414007941e-05, | |
| "loss": 0.1505, | |
| "step": 137500 | |
| }, | |
| { | |
| "epoch": 13.469985358711567, | |
| "grad_norm": 0.19911529123783112, | |
| "learning_rate": 3.7266916388238396e-05, | |
| "loss": 0.1503, | |
| "step": 138000 | |
| }, | |
| { | |
| "epoch": 13.518789653489508, | |
| "grad_norm": 0.20053178071975708, | |
| "learning_rate": 3.718332840488821e-05, | |
| "loss": 0.1504, | |
| "step": 138500 | |
| }, | |
| { | |
| "epoch": 13.567593948267447, | |
| "grad_norm": 0.19537031650543213, | |
| "learning_rate": 3.70995614181442e-05, | |
| "loss": 0.1502, | |
| "step": 139000 | |
| }, | |
| { | |
| "epoch": 13.616398243045388, | |
| "grad_norm": 0.19510440528392792, | |
| "learning_rate": 3.7015616658751715e-05, | |
| "loss": 0.1503, | |
| "step": 139500 | |
| }, | |
| { | |
| "epoch": 13.665202537823328, | |
| "grad_norm": 0.196214497089386, | |
| "learning_rate": 3.693149536006807e-05, | |
| "loss": 0.1499, | |
| "step": 140000 | |
| }, | |
| { | |
| "epoch": 13.71400683260127, | |
| "grad_norm": 0.1952546089887619, | |
| "learning_rate": 3.6847198758044326e-05, | |
| "loss": 0.1499, | |
| "step": 140500 | |
| }, | |
| { | |
| "epoch": 13.762811127379209, | |
| "grad_norm": 0.19812558591365814, | |
| "learning_rate": 3.6762728091207216e-05, | |
| "loss": 0.1498, | |
| "step": 141000 | |
| }, | |
| { | |
| "epoch": 13.81161542215715, | |
| "grad_norm": 0.18906739354133606, | |
| "learning_rate": 3.66780846006409e-05, | |
| "loss": 0.1493, | |
| "step": 141500 | |
| }, | |
| { | |
| "epoch": 13.860419716935091, | |
| "grad_norm": 0.20462313294410706, | |
| "learning_rate": 3.659326952996879e-05, | |
| "loss": 0.1494, | |
| "step": 142000 | |
| }, | |
| { | |
| "epoch": 13.90922401171303, | |
| "grad_norm": 0.1982060968875885, | |
| "learning_rate": 3.650828412533519e-05, | |
| "loss": 0.1493, | |
| "step": 142500 | |
| }, | |
| { | |
| "epoch": 13.958028306490972, | |
| "grad_norm": 0.19797129929065704, | |
| "learning_rate": 3.6423129635387033e-05, | |
| "loss": 0.1494, | |
| "step": 143000 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_loss": 0.13158732652664185, | |
| "eval_runtime": 27.7401, | |
| "eval_samples_per_second": 283.921, | |
| "eval_steps_per_second": 0.397, | |
| "step": 143430 | |
| }, | |
| { | |
| "epoch": 14.006832601268911, | |
| "grad_norm": 0.19103878736495972, | |
| "learning_rate": 3.6337807311255574e-05, | |
| "loss": 0.149, | |
| "step": 143500 | |
| }, | |
| { | |
| "epoch": 14.055636896046853, | |
| "grad_norm": 0.19477146863937378, | |
| "learning_rate": 3.625231840653794e-05, | |
| "loss": 0.1488, | |
| "step": 144000 | |
| }, | |
| { | |
| "epoch": 14.104441190824792, | |
| "grad_norm": 0.1984102576971054, | |
| "learning_rate": 3.616666417727875e-05, | |
| "loss": 0.1487, | |
| "step": 144500 | |
| }, | |
| { | |
| "epoch": 14.153245485602733, | |
| "grad_norm": 0.20152725279331207, | |
| "learning_rate": 3.608084588195166e-05, | |
| "loss": 0.1488, | |
| "step": 145000 | |
| }, | |
| { | |
| "epoch": 14.202049780380673, | |
| "grad_norm": 0.1842581033706665, | |
| "learning_rate": 3.599486478144085e-05, | |
| "loss": 0.1486, | |
| "step": 145500 | |
| }, | |
| { | |
| "epoch": 14.250854075158614, | |
| "grad_norm": 0.20297376811504364, | |
| "learning_rate": 3.590872213902252e-05, | |
| "loss": 0.1483, | |
| "step": 146000 | |
| }, | |
| { | |
| "epoch": 14.299658369936555, | |
| "grad_norm": 0.1883450597524643, | |
| "learning_rate": 3.582241922034631e-05, | |
| "loss": 0.1482, | |
| "step": 146500 | |
| }, | |
| { | |
| "epoch": 14.348462664714495, | |
| "grad_norm": 0.18912336230278015, | |
| "learning_rate": 3.573595729341675e-05, | |
| "loss": 0.1482, | |
| "step": 147000 | |
| }, | |
| { | |
| "epoch": 14.397266959492436, | |
| "grad_norm": 0.1913149505853653, | |
| "learning_rate": 3.564933762857454e-05, | |
| "loss": 0.1478, | |
| "step": 147500 | |
| }, | |
| { | |
| "epoch": 14.446071254270375, | |
| "grad_norm": 0.19658420979976654, | |
| "learning_rate": 3.556256149847801e-05, | |
| "loss": 0.1479, | |
| "step": 148000 | |
| }, | |
| { | |
| "epoch": 14.494875549048317, | |
| "grad_norm": 0.1880834996700287, | |
| "learning_rate": 3.547563017808432e-05, | |
| "loss": 0.1478, | |
| "step": 148500 | |
| }, | |
| { | |
| "epoch": 14.543679843826256, | |
| "grad_norm": 0.1877063512802124, | |
| "learning_rate": 3.538854494463074e-05, | |
| "loss": 0.1478, | |
| "step": 149000 | |
| }, | |
| { | |
| "epoch": 14.592484138604197, | |
| "grad_norm": 0.19691213965415955, | |
| "learning_rate": 3.530130707761594e-05, | |
| "loss": 0.1474, | |
| "step": 149500 | |
| }, | |
| { | |
| "epoch": 14.641288433382137, | |
| "grad_norm": 0.19889949262142181, | |
| "learning_rate": 3.521391785878114e-05, | |
| "loss": 0.1472, | |
| "step": 150000 | |
| }, | |
| { | |
| "epoch": 14.690092728160078, | |
| "grad_norm": 0.1987435221672058, | |
| "learning_rate": 3.512637857209131e-05, | |
| "loss": 0.1471, | |
| "step": 150500 | |
| }, | |
| { | |
| "epoch": 14.73889702293802, | |
| "grad_norm": 0.20512694120407104, | |
| "learning_rate": 3.503869050371626e-05, | |
| "loss": 0.1471, | |
| "step": 151000 | |
| }, | |
| { | |
| "epoch": 14.787701317715959, | |
| "grad_norm": 0.19599127769470215, | |
| "learning_rate": 3.4950854942011814e-05, | |
| "loss": 0.1471, | |
| "step": 151500 | |
| }, | |
| { | |
| "epoch": 14.8365056124939, | |
| "grad_norm": 0.1986822932958603, | |
| "learning_rate": 3.4862873177500796e-05, | |
| "loss": 0.1467, | |
| "step": 152000 | |
| }, | |
| { | |
| "epoch": 14.88530990727184, | |
| "grad_norm": 0.18663661181926727, | |
| "learning_rate": 3.4774746502854164e-05, | |
| "loss": 0.1469, | |
| "step": 152500 | |
| }, | |
| { | |
| "epoch": 14.93411420204978, | |
| "grad_norm": 0.1881023645401001, | |
| "learning_rate": 3.46864762128719e-05, | |
| "loss": 0.1467, | |
| "step": 153000 | |
| }, | |
| { | |
| "epoch": 14.98291849682772, | |
| "grad_norm": 0.1909170150756836, | |
| "learning_rate": 3.4598063604464106e-05, | |
| "loss": 0.1465, | |
| "step": 153500 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 0.1301085352897644, | |
| "eval_runtime": 23.6471, | |
| "eval_samples_per_second": 333.064, | |
| "eval_steps_per_second": 0.465, | |
| "step": 153675 | |
| }, | |
| { | |
| "epoch": 15.031722791605661, | |
| "grad_norm": 0.19014447927474976, | |
| "learning_rate": 3.450950997663189e-05, | |
| "loss": 0.1461, | |
| "step": 154000 | |
| }, | |
| { | |
| "epoch": 15.080527086383603, | |
| "grad_norm": 0.20832829177379608, | |
| "learning_rate": 3.442081663044827e-05, | |
| "loss": 0.1463, | |
| "step": 154500 | |
| }, | |
| { | |
| "epoch": 15.129331381161542, | |
| "grad_norm": 0.19706888496875763, | |
| "learning_rate": 3.433198486903906e-05, | |
| "loss": 0.1461, | |
| "step": 155000 | |
| }, | |
| { | |
| "epoch": 15.178135675939483, | |
| "grad_norm": 0.2018064558506012, | |
| "learning_rate": 3.424301599756378e-05, | |
| "loss": 0.1463, | |
| "step": 155500 | |
| }, | |
| { | |
| "epoch": 15.226939970717423, | |
| "grad_norm": 0.19212935864925385, | |
| "learning_rate": 3.41539113231964e-05, | |
| "loss": 0.1464, | |
| "step": 156000 | |
| }, | |
| { | |
| "epoch": 15.275744265495364, | |
| "grad_norm": 0.20076821744441986, | |
| "learning_rate": 3.406467215510619e-05, | |
| "loss": 0.1459, | |
| "step": 156500 | |
| }, | |
| { | |
| "epoch": 15.324548560273303, | |
| "grad_norm": 0.19215160608291626, | |
| "learning_rate": 3.3975299804438476e-05, | |
| "loss": 0.1456, | |
| "step": 157000 | |
| }, | |
| { | |
| "epoch": 15.373352855051245, | |
| "grad_norm": 0.19090279936790466, | |
| "learning_rate": 3.388579558429534e-05, | |
| "loss": 0.1458, | |
| "step": 157500 | |
| }, | |
| { | |
| "epoch": 15.422157149829186, | |
| "grad_norm": 0.19182687997817993, | |
| "learning_rate": 3.3796160809716386e-05, | |
| "loss": 0.1454, | |
| "step": 158000 | |
| }, | |
| { | |
| "epoch": 15.470961444607125, | |
| "grad_norm": 0.18930520117282867, | |
| "learning_rate": 3.370639679765936e-05, | |
| "loss": 0.1452, | |
| "step": 158500 | |
| }, | |
| { | |
| "epoch": 15.519765739385067, | |
| "grad_norm": 0.20811304450035095, | |
| "learning_rate": 3.3616504866980834e-05, | |
| "loss": 0.1452, | |
| "step": 159000 | |
| }, | |
| { | |
| "epoch": 15.568570034163006, | |
| "grad_norm": 0.18808256089687347, | |
| "learning_rate": 3.3526486338416835e-05, | |
| "loss": 0.1453, | |
| "step": 159500 | |
| }, | |
| { | |
| "epoch": 15.617374328940947, | |
| "grad_norm": 0.18801531195640564, | |
| "learning_rate": 3.343634253456343e-05, | |
| "loss": 0.1451, | |
| "step": 160000 | |
| }, | |
| { | |
| "epoch": 15.666178623718887, | |
| "grad_norm": 0.19010472297668457, | |
| "learning_rate": 3.334607477985727e-05, | |
| "loss": 0.145, | |
| "step": 160500 | |
| }, | |
| { | |
| "epoch": 15.714982918496828, | |
| "grad_norm": 0.20773784816265106, | |
| "learning_rate": 3.3255684400556165e-05, | |
| "loss": 0.1449, | |
| "step": 161000 | |
| }, | |
| { | |
| "epoch": 15.763787213274767, | |
| "grad_norm": 0.1926048994064331, | |
| "learning_rate": 3.316517272471959e-05, | |
| "loss": 0.1445, | |
| "step": 161500 | |
| }, | |
| { | |
| "epoch": 15.812591508052709, | |
| "grad_norm": 0.20847058296203613, | |
| "learning_rate": 3.307454108218916e-05, | |
| "loss": 0.1448, | |
| "step": 162000 | |
| }, | |
| { | |
| "epoch": 15.86139580283065, | |
| "grad_norm": 0.18687431514263153, | |
| "learning_rate": 3.2983790804569105e-05, | |
| "loss": 0.1445, | |
| "step": 162500 | |
| }, | |
| { | |
| "epoch": 15.91020009760859, | |
| "grad_norm": 0.19642353057861328, | |
| "learning_rate": 3.2892923225206695e-05, | |
| "loss": 0.1443, | |
| "step": 163000 | |
| }, | |
| { | |
| "epoch": 15.95900439238653, | |
| "grad_norm": 0.19062745571136475, | |
| "learning_rate": 3.280193967917265e-05, | |
| "loss": 0.1444, | |
| "step": 163500 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_loss": 0.1288023591041565, | |
| "eval_runtime": 27.3979, | |
| "eval_samples_per_second": 287.468, | |
| "eval_steps_per_second": 0.401, | |
| "step": 163920 | |
| }, | |
| { | |
| "epoch": 16.00780868716447, | |
| "grad_norm": 0.2008381485939026, | |
| "learning_rate": 3.271084150324154e-05, | |
| "loss": 0.1441, | |
| "step": 164000 | |
| }, | |
| { | |
| "epoch": 16.05661298194241, | |
| "grad_norm": 0.1929151713848114, | |
| "learning_rate": 3.261963003587214e-05, | |
| "loss": 0.1443, | |
| "step": 164500 | |
| }, | |
| { | |
| "epoch": 16.105417276720353, | |
| "grad_norm": 0.19287170469760895, | |
| "learning_rate": 3.252830661718772e-05, | |
| "loss": 0.144, | |
| "step": 165000 | |
| }, | |
| { | |
| "epoch": 16.15422157149829, | |
| "grad_norm": 0.19933773577213287, | |
| "learning_rate": 3.243687258895643e-05, | |
| "loss": 0.1439, | |
| "step": 165500 | |
| }, | |
| { | |
| "epoch": 16.20302586627623, | |
| "grad_norm": 0.2010374516248703, | |
| "learning_rate": 3.234532929457155e-05, | |
| "loss": 0.1439, | |
| "step": 166000 | |
| }, | |
| { | |
| "epoch": 16.251830161054173, | |
| "grad_norm": 0.19827648997306824, | |
| "learning_rate": 3.2253678079031724e-05, | |
| "loss": 0.1439, | |
| "step": 166500 | |
| }, | |
| { | |
| "epoch": 16.300634455832114, | |
| "grad_norm": 0.1934526264667511, | |
| "learning_rate": 3.2161920288921254e-05, | |
| "loss": 0.1438, | |
| "step": 167000 | |
| }, | |
| { | |
| "epoch": 16.349438750610055, | |
| "grad_norm": 0.20245911180973053, | |
| "learning_rate": 3.2070057272390263e-05, | |
| "loss": 0.1436, | |
| "step": 167500 | |
| }, | |
| { | |
| "epoch": 16.398243045387993, | |
| "grad_norm": 0.1878873109817505, | |
| "learning_rate": 3.197809037913493e-05, | |
| "loss": 0.1433, | |
| "step": 168000 | |
| }, | |
| { | |
| "epoch": 16.447047340165934, | |
| "grad_norm": 0.19571448862552643, | |
| "learning_rate": 3.188602096037764e-05, | |
| "loss": 0.1435, | |
| "step": 168500 | |
| }, | |
| { | |
| "epoch": 16.495851634943875, | |
| "grad_norm": 0.19554303586483002, | |
| "learning_rate": 3.179385036884712e-05, | |
| "loss": 0.1433, | |
| "step": 169000 | |
| }, | |
| { | |
| "epoch": 16.544655929721817, | |
| "grad_norm": 0.18918287754058838, | |
| "learning_rate": 3.170157995875859e-05, | |
| "loss": 0.1435, | |
| "step": 169500 | |
| }, | |
| { | |
| "epoch": 16.593460224499758, | |
| "grad_norm": 0.19676432013511658, | |
| "learning_rate": 3.160921108579385e-05, | |
| "loss": 0.1432, | |
| "step": 170000 | |
| }, | |
| { | |
| "epoch": 16.642264519277695, | |
| "grad_norm": 0.20606379210948944, | |
| "learning_rate": 3.151674510708136e-05, | |
| "loss": 0.1431, | |
| "step": 170500 | |
| }, | |
| { | |
| "epoch": 16.691068814055637, | |
| "grad_norm": 0.18640325963497162, | |
| "learning_rate": 3.142418338117631e-05, | |
| "loss": 0.1428, | |
| "step": 171000 | |
| }, | |
| { | |
| "epoch": 16.739873108833578, | |
| "grad_norm": 0.18933062255382538, | |
| "learning_rate": 3.1331527268040646e-05, | |
| "loss": 0.1431, | |
| "step": 171500 | |
| }, | |
| { | |
| "epoch": 16.78867740361152, | |
| "grad_norm": 0.1884533166885376, | |
| "learning_rate": 3.12387781290231e-05, | |
| "loss": 0.1427, | |
| "step": 172000 | |
| }, | |
| { | |
| "epoch": 16.837481698389457, | |
| "grad_norm": 0.19676893949508667, | |
| "learning_rate": 3.11459373268392e-05, | |
| "loss": 0.1426, | |
| "step": 172500 | |
| }, | |
| { | |
| "epoch": 16.886285993167398, | |
| "grad_norm": 0.18692608177661896, | |
| "learning_rate": 3.105300622555122e-05, | |
| "loss": 0.1429, | |
| "step": 173000 | |
| }, | |
| { | |
| "epoch": 16.93509028794534, | |
| "grad_norm": 0.2095184326171875, | |
| "learning_rate": 3.095998619054813e-05, | |
| "loss": 0.1425, | |
| "step": 173500 | |
| }, | |
| { | |
| "epoch": 16.98389458272328, | |
| "grad_norm": 0.19869489967823029, | |
| "learning_rate": 3.086687858852562e-05, | |
| "loss": 0.1425, | |
| "step": 174000 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_loss": 0.12800458073616028, | |
| "eval_runtime": 25.827, | |
| "eval_samples_per_second": 304.952, | |
| "eval_steps_per_second": 0.426, | |
| "step": 174165 | |
| }, | |
| { | |
| "epoch": 17.032698877501222, | |
| "grad_norm": 0.18050076067447662, | |
| "learning_rate": 3.077368478746591e-05, | |
| "loss": 0.142, | |
| "step": 174500 | |
| }, | |
| { | |
| "epoch": 17.08150317227916, | |
| "grad_norm": 0.19508038461208344, | |
| "learning_rate": 3.068040615661768e-05, | |
| "loss": 0.1422, | |
| "step": 175000 | |
| }, | |
| { | |
| "epoch": 17.1303074670571, | |
| "grad_norm": 0.19345000386238098, | |
| "learning_rate": 3.0587044066476024e-05, | |
| "loss": 0.142, | |
| "step": 175500 | |
| }, | |
| { | |
| "epoch": 17.179111761835042, | |
| "grad_norm": 0.18671298027038574, | |
| "learning_rate": 3.0493599888762235e-05, | |
| "loss": 0.1417, | |
| "step": 176000 | |
| }, | |
| { | |
| "epoch": 17.227916056612983, | |
| "grad_norm": 0.18719059228897095, | |
| "learning_rate": 3.0400074996403666e-05, | |
| "loss": 0.1419, | |
| "step": 176500 | |
| }, | |
| { | |
| "epoch": 17.27672035139092, | |
| "grad_norm": 0.192045196890831, | |
| "learning_rate": 3.0306470763513584e-05, | |
| "loss": 0.142, | |
| "step": 177000 | |
| }, | |
| { | |
| "epoch": 17.325524646168862, | |
| "grad_norm": 0.18663588166236877, | |
| "learning_rate": 3.0212788565370952e-05, | |
| "loss": 0.1419, | |
| "step": 177500 | |
| }, | |
| { | |
| "epoch": 17.374328940946803, | |
| "grad_norm": 0.19223402440547943, | |
| "learning_rate": 3.0119029778400266e-05, | |
| "loss": 0.1416, | |
| "step": 178000 | |
| }, | |
| { | |
| "epoch": 17.423133235724745, | |
| "grad_norm": 0.20375187695026398, | |
| "learning_rate": 3.002519578015126e-05, | |
| "loss": 0.1417, | |
| "step": 178500 | |
| }, | |
| { | |
| "epoch": 17.471937530502686, | |
| "grad_norm": 0.19722655415534973, | |
| "learning_rate": 2.9931287949278752e-05, | |
| "loss": 0.1413, | |
| "step": 179000 | |
| }, | |
| { | |
| "epoch": 17.520741825280624, | |
| "grad_norm": 0.20561105012893677, | |
| "learning_rate": 2.9837307665522297e-05, | |
| "loss": 0.1412, | |
| "step": 179500 | |
| }, | |
| { | |
| "epoch": 17.569546120058565, | |
| "grad_norm": 0.1842418909072876, | |
| "learning_rate": 2.9743256309686013e-05, | |
| "loss": 0.1413, | |
| "step": 180000 | |
| }, | |
| { | |
| "epoch": 17.618350414836506, | |
| "grad_norm": 0.19416528940200806, | |
| "learning_rate": 2.9649135263618205e-05, | |
| "loss": 0.1414, | |
| "step": 180500 | |
| }, | |
| { | |
| "epoch": 17.667154709614447, | |
| "grad_norm": 0.18883706629276276, | |
| "learning_rate": 2.9554945910191122e-05, | |
| "loss": 0.1414, | |
| "step": 181000 | |
| }, | |
| { | |
| "epoch": 17.715959004392385, | |
| "grad_norm": 0.18695645034313202, | |
| "learning_rate": 2.9460689633280613e-05, | |
| "loss": 0.1413, | |
| "step": 181500 | |
| }, | |
| { | |
| "epoch": 17.764763299170326, | |
| "grad_norm": 0.1854555606842041, | |
| "learning_rate": 2.9366367817745794e-05, | |
| "loss": 0.1411, | |
| "step": 182000 | |
| }, | |
| { | |
| "epoch": 17.813567593948267, | |
| "grad_norm": 0.1904602348804474, | |
| "learning_rate": 2.927198184940872e-05, | |
| "loss": 0.1411, | |
| "step": 182500 | |
| }, | |
| { | |
| "epoch": 17.86237188872621, | |
| "grad_norm": 0.1872331202030182, | |
| "learning_rate": 2.917753311503399e-05, | |
| "loss": 0.1409, | |
| "step": 183000 | |
| }, | |
| { | |
| "epoch": 17.91117618350415, | |
| "grad_norm": 0.19253146648406982, | |
| "learning_rate": 2.90830230023084e-05, | |
| "loss": 0.1409, | |
| "step": 183500 | |
| }, | |
| { | |
| "epoch": 17.959980478282088, | |
| "grad_norm": 0.18223468959331512, | |
| "learning_rate": 2.8988452899820563e-05, | |
| "loss": 0.1407, | |
| "step": 184000 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_loss": 0.12518393993377686, | |
| "eval_runtime": 25.7207, | |
| "eval_samples_per_second": 306.212, | |
| "eval_steps_per_second": 0.428, | |
| "step": 184410 | |
| }, | |
| { | |
| "epoch": 18.00878477306003, | |
| "grad_norm": 0.19220831990242004, | |
| "learning_rate": 2.889382419704047e-05, | |
| "loss": 0.1408, | |
| "step": 184500 | |
| }, | |
| { | |
| "epoch": 18.05758906783797, | |
| "grad_norm": 0.19996266067028046, | |
| "learning_rate": 2.8799138284299105e-05, | |
| "loss": 0.1406, | |
| "step": 185000 | |
| }, | |
| { | |
| "epoch": 18.10639336261591, | |
| "grad_norm": 0.192152738571167, | |
| "learning_rate": 2.8704396552767997e-05, | |
| "loss": 0.1405, | |
| "step": 185500 | |
| }, | |
| { | |
| "epoch": 18.15519765739385, | |
| "grad_norm": 0.19583114981651306, | |
| "learning_rate": 2.8609600394438816e-05, | |
| "loss": 0.1404, | |
| "step": 186000 | |
| }, | |
| { | |
| "epoch": 18.20400195217179, | |
| "grad_norm": 0.1908300369977951, | |
| "learning_rate": 2.851475120210289e-05, | |
| "loss": 0.1405, | |
| "step": 186500 | |
| }, | |
| { | |
| "epoch": 18.25280624694973, | |
| "grad_norm": 0.19682295620441437, | |
| "learning_rate": 2.8419850369330714e-05, | |
| "loss": 0.14, | |
| "step": 187000 | |
| }, | |
| { | |
| "epoch": 18.301610541727673, | |
| "grad_norm": 0.18878893554210663, | |
| "learning_rate": 2.8324899290451556e-05, | |
| "loss": 0.1403, | |
| "step": 187500 | |
| }, | |
| { | |
| "epoch": 18.350414836505614, | |
| "grad_norm": 0.19927945733070374, | |
| "learning_rate": 2.822989936053291e-05, | |
| "loss": 0.1402, | |
| "step": 188000 | |
| }, | |
| { | |
| "epoch": 18.39921913128355, | |
| "grad_norm": 0.18962599337100983, | |
| "learning_rate": 2.8134851975359994e-05, | |
| "loss": 0.1399, | |
| "step": 188500 | |
| }, | |
| { | |
| "epoch": 18.448023426061493, | |
| "grad_norm": 0.19572696089744568, | |
| "learning_rate": 2.8039758531415278e-05, | |
| "loss": 0.1399, | |
| "step": 189000 | |
| }, | |
| { | |
| "epoch": 18.496827720839434, | |
| "grad_norm": 0.19577118754386902, | |
| "learning_rate": 2.7944620425857952e-05, | |
| "loss": 0.14, | |
| "step": 189500 | |
| }, | |
| { | |
| "epoch": 18.545632015617375, | |
| "grad_norm": 0.1974543035030365, | |
| "learning_rate": 2.78494390565034e-05, | |
| "loss": 0.1398, | |
| "step": 190000 | |
| }, | |
| { | |
| "epoch": 18.594436310395317, | |
| "grad_norm": 0.19602327048778534, | |
| "learning_rate": 2.775421582180263e-05, | |
| "loss": 0.1397, | |
| "step": 190500 | |
| }, | |
| { | |
| "epoch": 18.643240605173254, | |
| "grad_norm": 0.18849612772464752, | |
| "learning_rate": 2.7658952120821802e-05, | |
| "loss": 0.1396, | |
| "step": 191000 | |
| }, | |
| { | |
| "epoch": 18.692044899951195, | |
| "grad_norm": 0.19312690198421478, | |
| "learning_rate": 2.756364935322158e-05, | |
| "loss": 0.1395, | |
| "step": 191500 | |
| }, | |
| { | |
| "epoch": 18.740849194729137, | |
| "grad_norm": 0.18100771307945251, | |
| "learning_rate": 2.7468308919236652e-05, | |
| "loss": 0.1394, | |
| "step": 192000 | |
| }, | |
| { | |
| "epoch": 18.789653489507078, | |
| "grad_norm": 0.20045186579227448, | |
| "learning_rate": 2.737293221965509e-05, | |
| "loss": 0.1394, | |
| "step": 192500 | |
| }, | |
| { | |
| "epoch": 18.838457784285016, | |
| "grad_norm": 0.1846308708190918, | |
| "learning_rate": 2.7277520655797816e-05, | |
| "loss": 0.1393, | |
| "step": 193000 | |
| }, | |
| { | |
| "epoch": 18.887262079062957, | |
| "grad_norm": 0.18819710612297058, | |
| "learning_rate": 2.7182075629497976e-05, | |
| "loss": 0.1394, | |
| "step": 193500 | |
| }, | |
| { | |
| "epoch": 18.936066373840898, | |
| "grad_norm": 0.18752720952033997, | |
| "learning_rate": 2.7086598543080392e-05, | |
| "loss": 0.1391, | |
| "step": 194000 | |
| }, | |
| { | |
| "epoch": 18.98487066861884, | |
| "grad_norm": 0.19363176822662354, | |
| "learning_rate": 2.6991090799340905e-05, | |
| "loss": 0.1391, | |
| "step": 194500 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_loss": 0.1259300708770752, | |
| "eval_runtime": 25.5672, | |
| "eval_samples_per_second": 308.051, | |
| "eval_steps_per_second": 0.43, | |
| "step": 194655 | |
| }, | |
| { | |
| "epoch": 19.03367496339678, | |
| "grad_norm": 0.19123421609401703, | |
| "learning_rate": 2.6895553801525803e-05, | |
| "loss": 0.1391, | |
| "step": 195000 | |
| }, | |
| { | |
| "epoch": 19.08247925817472, | |
| "grad_norm": 0.19878804683685303, | |
| "learning_rate": 2.6799988953311162e-05, | |
| "loss": 0.1389, | |
| "step": 195500 | |
| }, | |
| { | |
| "epoch": 19.13128355295266, | |
| "grad_norm": 0.19207318127155304, | |
| "learning_rate": 2.6704397658782283e-05, | |
| "loss": 0.1391, | |
| "step": 196000 | |
| }, | |
| { | |
| "epoch": 19.1800878477306, | |
| "grad_norm": 0.18511444330215454, | |
| "learning_rate": 2.6608781322413018e-05, | |
| "loss": 0.1389, | |
| "step": 196500 | |
| }, | |
| { | |
| "epoch": 19.228892142508542, | |
| "grad_norm": 0.19707535207271576, | |
| "learning_rate": 2.651314134904514e-05, | |
| "loss": 0.1389, | |
| "step": 197000 | |
| }, | |
| { | |
| "epoch": 19.27769643728648, | |
| "grad_norm": 0.1916116625070572, | |
| "learning_rate": 2.6417479143867697e-05, | |
| "loss": 0.1387, | |
| "step": 197500 | |
| }, | |
| { | |
| "epoch": 19.32650073206442, | |
| "grad_norm": 0.18978238105773926, | |
| "learning_rate": 2.632179611239642e-05, | |
| "loss": 0.1387, | |
| "step": 198000 | |
| }, | |
| { | |
| "epoch": 19.375305026842362, | |
| "grad_norm": 0.1835888773202896, | |
| "learning_rate": 2.6226093660452982e-05, | |
| "loss": 0.1385, | |
| "step": 198500 | |
| }, | |
| { | |
| "epoch": 19.424109321620303, | |
| "grad_norm": 0.18811723589897156, | |
| "learning_rate": 2.613037319414441e-05, | |
| "loss": 0.1387, | |
| "step": 199000 | |
| }, | |
| { | |
| "epoch": 19.472913616398245, | |
| "grad_norm": 0.1998414546251297, | |
| "learning_rate": 2.6034636119842414e-05, | |
| "loss": 0.1385, | |
| "step": 199500 | |
| }, | |
| { | |
| "epoch": 19.521717911176182, | |
| "grad_norm": 0.18518772721290588, | |
| "learning_rate": 2.5938883844162715e-05, | |
| "loss": 0.1382, | |
| "step": 200000 | |
| }, | |
| { | |
| "epoch": 19.570522205954124, | |
| "grad_norm": 0.19242486357688904, | |
| "learning_rate": 2.584311777394437e-05, | |
| "loss": 0.1384, | |
| "step": 200500 | |
| }, | |
| { | |
| "epoch": 19.619326500732065, | |
| "grad_norm": 0.2028750330209732, | |
| "learning_rate": 2.574733931622912e-05, | |
| "loss": 0.1384, | |
| "step": 201000 | |
| }, | |
| { | |
| "epoch": 19.668130795510006, | |
| "grad_norm": 0.18917541205883026, | |
| "learning_rate": 2.5651549878240694e-05, | |
| "loss": 0.1381, | |
| "step": 201500 | |
| }, | |
| { | |
| "epoch": 19.716935090287944, | |
| "grad_norm": 0.19596756994724274, | |
| "learning_rate": 2.5555750867364188e-05, | |
| "loss": 0.138, | |
| "step": 202000 | |
| }, | |
| { | |
| "epoch": 19.765739385065885, | |
| "grad_norm": 0.19332247972488403, | |
| "learning_rate": 2.5459943691125292e-05, | |
| "loss": 0.1381, | |
| "step": 202500 | |
| }, | |
| { | |
| "epoch": 19.814543679843826, | |
| "grad_norm": 0.19187049567699432, | |
| "learning_rate": 2.536412975716972e-05, | |
| "loss": 0.1381, | |
| "step": 203000 | |
| }, | |
| { | |
| "epoch": 19.863347974621767, | |
| "grad_norm": 0.19392500817775726, | |
| "learning_rate": 2.5268310473242424e-05, | |
| "loss": 0.1378, | |
| "step": 203500 | |
| }, | |
| { | |
| "epoch": 19.91215226939971, | |
| "grad_norm": 0.19194450974464417, | |
| "learning_rate": 2.517248724716701e-05, | |
| "loss": 0.1377, | |
| "step": 204000 | |
| }, | |
| { | |
| "epoch": 19.960956564177646, | |
| "grad_norm": 0.20554892718791962, | |
| "learning_rate": 2.5076661486824953e-05, | |
| "loss": 0.1379, | |
| "step": 204500 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 0.1231779009103775, | |
| "eval_runtime": 29.7434, | |
| "eval_samples_per_second": 264.798, | |
| "eval_steps_per_second": 0.37, | |
| "step": 204900 | |
| }, | |
| { | |
| "epoch": 20.009760858955588, | |
| "grad_norm": 0.19533833861351013, | |
| "learning_rate": 2.4980834600135006e-05, | |
| "loss": 0.1377, | |
| "step": 205000 | |
| }, | |
| { | |
| "epoch": 20.05856515373353, | |
| "grad_norm": 0.18907921016216278, | |
| "learning_rate": 2.488500799503244e-05, | |
| "loss": 0.1377, | |
| "step": 205500 | |
| }, | |
| { | |
| "epoch": 20.10736944851147, | |
| "grad_norm": 0.1802392452955246, | |
| "learning_rate": 2.4789183079448417e-05, | |
| "loss": 0.1378, | |
| "step": 206000 | |
| }, | |
| { | |
| "epoch": 20.156173743289408, | |
| "grad_norm": 0.19577832520008087, | |
| "learning_rate": 2.4693361261289247e-05, | |
| "loss": 0.1375, | |
| "step": 206500 | |
| }, | |
| { | |
| "epoch": 20.20497803806735, | |
| "grad_norm": 0.20748840272426605, | |
| "learning_rate": 2.4597543948415748e-05, | |
| "loss": 0.1376, | |
| "step": 207000 | |
| }, | |
| { | |
| "epoch": 20.25378233284529, | |
| "grad_norm": 0.19364304840564728, | |
| "learning_rate": 2.4501732548622546e-05, | |
| "loss": 0.1375, | |
| "step": 207500 | |
| }, | |
| { | |
| "epoch": 20.30258662762323, | |
| "grad_norm": 0.1987764686346054, | |
| "learning_rate": 2.440592846961738e-05, | |
| "loss": 0.1373, | |
| "step": 208000 | |
| }, | |
| { | |
| "epoch": 20.351390922401173, | |
| "grad_norm": 0.1924201399087906, | |
| "learning_rate": 2.4310133119000438e-05, | |
| "loss": 0.1376, | |
| "step": 208500 | |
| }, | |
| { | |
| "epoch": 20.40019521717911, | |
| "grad_norm": 0.19483359158039093, | |
| "learning_rate": 2.4214347904243644e-05, | |
| "loss": 0.1374, | |
| "step": 209000 | |
| }, | |
| { | |
| "epoch": 20.44899951195705, | |
| "grad_norm": 0.19892901182174683, | |
| "learning_rate": 2.4118574232670025e-05, | |
| "loss": 0.1372, | |
| "step": 209500 | |
| }, | |
| { | |
| "epoch": 20.497803806734993, | |
| "grad_norm": 0.18968260288238525, | |
| "learning_rate": 2.4022813511433027e-05, | |
| "loss": 0.137, | |
| "step": 210000 | |
| }, | |
| { | |
| "epoch": 20.546608101512934, | |
| "grad_norm": 0.19339485466480255, | |
| "learning_rate": 2.3927067147495765e-05, | |
| "loss": 0.1372, | |
| "step": 210500 | |
| }, | |
| { | |
| "epoch": 20.595412396290875, | |
| "grad_norm": 0.19323968887329102, | |
| "learning_rate": 2.383133654761045e-05, | |
| "loss": 0.137, | |
| "step": 211000 | |
| }, | |
| { | |
| "epoch": 20.644216691068813, | |
| "grad_norm": 0.18963748216629028, | |
| "learning_rate": 2.3735623118297692e-05, | |
| "loss": 0.1369, | |
| "step": 211500 | |
| }, | |
| { | |
| "epoch": 20.693020985846754, | |
| "grad_norm": 0.190143883228302, | |
| "learning_rate": 2.3639928265825783e-05, | |
| "loss": 0.1369, | |
| "step": 212000 | |
| }, | |
| { | |
| "epoch": 20.741825280624695, | |
| "grad_norm": 0.19597776234149933, | |
| "learning_rate": 2.3544253396190112e-05, | |
| "loss": 0.1369, | |
| "step": 212500 | |
| }, | |
| { | |
| "epoch": 20.790629575402637, | |
| "grad_norm": 0.18973353505134583, | |
| "learning_rate": 2.3448599915092443e-05, | |
| "loss": 0.1366, | |
| "step": 213000 | |
| }, | |
| { | |
| "epoch": 20.839433870180574, | |
| "grad_norm": 0.20242229104042053, | |
| "learning_rate": 2.3352969227920303e-05, | |
| "loss": 0.1368, | |
| "step": 213500 | |
| }, | |
| { | |
| "epoch": 20.888238164958516, | |
| "grad_norm": 0.19486981630325317, | |
| "learning_rate": 2.325736273972633e-05, | |
| "loss": 0.1368, | |
| "step": 214000 | |
| }, | |
| { | |
| "epoch": 20.937042459736457, | |
| "grad_norm": 0.18778111040592194, | |
| "learning_rate": 2.3161781855207575e-05, | |
| "loss": 0.1365, | |
| "step": 214500 | |
| }, | |
| { | |
| "epoch": 20.985846754514398, | |
| "grad_norm": 0.19285354018211365, | |
| "learning_rate": 2.3066227978684964e-05, | |
| "loss": 0.1363, | |
| "step": 215000 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "eval_loss": 0.12139205634593964, | |
| "eval_runtime": 26.2565, | |
| "eval_samples_per_second": 299.963, | |
| "eval_steps_per_second": 0.419, | |
| "step": 215145 | |
| }, | |
| { | |
| "epoch": 21.03465104929234, | |
| "grad_norm": 0.1933123618364334, | |
| "learning_rate": 2.297070251408259e-05, | |
| "loss": 0.1364, | |
| "step": 215500 | |
| }, | |
| { | |
| "epoch": 21.083455344070277, | |
| "grad_norm": 0.18427444994449615, | |
| "learning_rate": 2.287520686490707e-05, | |
| "loss": 0.1365, | |
| "step": 216000 | |
| }, | |
| { | |
| "epoch": 21.13225963884822, | |
| "grad_norm": 0.17762655019760132, | |
| "learning_rate": 2.2779742434227005e-05, | |
| "loss": 0.1363, | |
| "step": 216500 | |
| }, | |
| { | |
| "epoch": 21.18106393362616, | |
| "grad_norm": 0.18944330513477325, | |
| "learning_rate": 2.2684310624652287e-05, | |
| "loss": 0.1363, | |
| "step": 217000 | |
| }, | |
| { | |
| "epoch": 21.2298682284041, | |
| "grad_norm": 0.19393311440944672, | |
| "learning_rate": 2.2588912838313535e-05, | |
| "loss": 0.1363, | |
| "step": 217500 | |
| }, | |
| { | |
| "epoch": 21.27867252318204, | |
| "grad_norm": 0.1875392496585846, | |
| "learning_rate": 2.2493550476841495e-05, | |
| "loss": 0.1363, | |
| "step": 218000 | |
| }, | |
| { | |
| "epoch": 21.32747681795998, | |
| "grad_norm": 0.19635601341724396, | |
| "learning_rate": 2.2398224941346408e-05, | |
| "loss": 0.1362, | |
| "step": 218500 | |
| }, | |
| { | |
| "epoch": 21.37628111273792, | |
| "grad_norm": 0.19351017475128174, | |
| "learning_rate": 2.2302937632397462e-05, | |
| "loss": 0.1359, | |
| "step": 219000 | |
| }, | |
| { | |
| "epoch": 21.425085407515862, | |
| "grad_norm": 0.18472112715244293, | |
| "learning_rate": 2.2207689950002213e-05, | |
| "loss": 0.1362, | |
| "step": 219500 | |
| }, | |
| { | |
| "epoch": 21.473889702293803, | |
| "grad_norm": 0.192471444606781, | |
| "learning_rate": 2.211248329358598e-05, | |
| "loss": 0.1359, | |
| "step": 220000 | |
| }, | |
| { | |
| "epoch": 21.52269399707174, | |
| "grad_norm": 0.19304192066192627, | |
| "learning_rate": 2.2017319061971338e-05, | |
| "loss": 0.1362, | |
| "step": 220500 | |
| }, | |
| { | |
| "epoch": 21.571498291849682, | |
| "grad_norm": 0.18912473320960999, | |
| "learning_rate": 2.1922198653357498e-05, | |
| "loss": 0.1362, | |
| "step": 221000 | |
| }, | |
| { | |
| "epoch": 21.620302586627623, | |
| "grad_norm": 0.19801722466945648, | |
| "learning_rate": 2.182712346529983e-05, | |
| "loss": 0.1363, | |
| "step": 221500 | |
| }, | |
| { | |
| "epoch": 21.669106881405565, | |
| "grad_norm": 0.18331073224544525, | |
| "learning_rate": 2.1732094894689313e-05, | |
| "loss": 0.136, | |
| "step": 222000 | |
| }, | |
| { | |
| "epoch": 21.717911176183506, | |
| "grad_norm": 0.1763552576303482, | |
| "learning_rate": 2.1637114337731967e-05, | |
| "loss": 0.1356, | |
| "step": 222500 | |
| }, | |
| { | |
| "epoch": 21.766715470961444, | |
| "grad_norm": 0.1820065975189209, | |
| "learning_rate": 2.1542183189928387e-05, | |
| "loss": 0.1356, | |
| "step": 223000 | |
| }, | |
| { | |
| "epoch": 21.815519765739385, | |
| "grad_norm": 0.18830101191997528, | |
| "learning_rate": 2.1447302846053234e-05, | |
| "loss": 0.1358, | |
| "step": 223500 | |
| }, | |
| { | |
| "epoch": 21.864324060517326, | |
| "grad_norm": 0.19416014850139618, | |
| "learning_rate": 2.135247470013471e-05, | |
| "loss": 0.1354, | |
| "step": 224000 | |
| }, | |
| { | |
| "epoch": 21.913128355295267, | |
| "grad_norm": 0.1934524029493332, | |
| "learning_rate": 2.1257700145434132e-05, | |
| "loss": 0.1356, | |
| "step": 224500 | |
| }, | |
| { | |
| "epoch": 21.961932650073205, | |
| "grad_norm": 0.19462282955646515, | |
| "learning_rate": 2.116298057442539e-05, | |
| "loss": 0.1357, | |
| "step": 225000 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "eval_loss": 0.12161369621753693, | |
| "eval_runtime": 26.6058, | |
| "eval_samples_per_second": 296.025, | |
| "eval_steps_per_second": 0.413, | |
| "step": 225390 | |
| }, | |
| { | |
| "epoch": 22.010736944851146, | |
| "grad_norm": 0.18952177464962006, | |
| "learning_rate": 2.106831737877456e-05, | |
| "loss": 0.1354, | |
| "step": 225500 | |
| }, | |
| { | |
| "epoch": 22.059541239629088, | |
| "grad_norm": 0.2017366886138916, | |
| "learning_rate": 2.0973711949319415e-05, | |
| "loss": 0.1355, | |
| "step": 226000 | |
| }, | |
| { | |
| "epoch": 22.10834553440703, | |
| "grad_norm": 0.19085553288459778, | |
| "learning_rate": 2.087916567604897e-05, | |
| "loss": 0.1353, | |
| "step": 226500 | |
| }, | |
| { | |
| "epoch": 22.15714982918497, | |
| "grad_norm": 0.20396627485752106, | |
| "learning_rate": 2.0784679948083138e-05, | |
| "loss": 0.1352, | |
| "step": 227000 | |
| }, | |
| { | |
| "epoch": 22.205954123962908, | |
| "grad_norm": 0.19046179950237274, | |
| "learning_rate": 2.0690256153652248e-05, | |
| "loss": 0.1353, | |
| "step": 227500 | |
| }, | |
| { | |
| "epoch": 22.25475841874085, | |
| "grad_norm": 0.19359087944030762, | |
| "learning_rate": 2.0595895680076645e-05, | |
| "loss": 0.1353, | |
| "step": 228000 | |
| }, | |
| { | |
| "epoch": 22.30356271351879, | |
| "grad_norm": 0.186729297041893, | |
| "learning_rate": 2.0501599913746374e-05, | |
| "loss": 0.1351, | |
| "step": 228500 | |
| }, | |
| { | |
| "epoch": 22.35236700829673, | |
| "grad_norm": 0.1899571716785431, | |
| "learning_rate": 2.0407370240100747e-05, | |
| "loss": 0.1352, | |
| "step": 229000 | |
| }, | |
| { | |
| "epoch": 22.40117130307467, | |
| "grad_norm": 0.1941409856081009, | |
| "learning_rate": 2.0313208043608017e-05, | |
| "loss": 0.1351, | |
| "step": 229500 | |
| }, | |
| { | |
| "epoch": 22.44997559785261, | |
| "grad_norm": 0.20220808684825897, | |
| "learning_rate": 2.021911470774504e-05, | |
| "loss": 0.1352, | |
| "step": 230000 | |
| }, | |
| { | |
| "epoch": 22.49877989263055, | |
| "grad_norm": 0.1803186982870102, | |
| "learning_rate": 2.0125091614976908e-05, | |
| "loss": 0.1348, | |
| "step": 230500 | |
| }, | |
| { | |
| "epoch": 22.547584187408493, | |
| "grad_norm": 0.19634583592414856, | |
| "learning_rate": 2.0031140146736696e-05, | |
| "loss": 0.1351, | |
| "step": 231000 | |
| }, | |
| { | |
| "epoch": 22.596388482186434, | |
| "grad_norm": 0.19138526916503906, | |
| "learning_rate": 1.9937261683405135e-05, | |
| "loss": 0.1351, | |
| "step": 231500 | |
| }, | |
| { | |
| "epoch": 22.64519277696437, | |
| "grad_norm": 0.18439550697803497, | |
| "learning_rate": 1.9843457604290306e-05, | |
| "loss": 0.1348, | |
| "step": 232000 | |
| }, | |
| { | |
| "epoch": 22.693997071742313, | |
| "grad_norm": 0.1892482042312622, | |
| "learning_rate": 1.974972928760744e-05, | |
| "loss": 0.1347, | |
| "step": 232500 | |
| }, | |
| { | |
| "epoch": 22.742801366520254, | |
| "grad_norm": 0.18848678469657898, | |
| "learning_rate": 1.9656078110458585e-05, | |
| "loss": 0.1347, | |
| "step": 233000 | |
| }, | |
| { | |
| "epoch": 22.791605661298195, | |
| "grad_norm": 0.1945199817419052, | |
| "learning_rate": 1.9562505448812453e-05, | |
| "loss": 0.1346, | |
| "step": 233500 | |
| }, | |
| { | |
| "epoch": 22.840409956076133, | |
| "grad_norm": 0.1922951489686966, | |
| "learning_rate": 1.946901267748417e-05, | |
| "loss": 0.1346, | |
| "step": 234000 | |
| }, | |
| { | |
| "epoch": 22.889214250854074, | |
| "grad_norm": 0.18175315856933594, | |
| "learning_rate": 1.937560117011504e-05, | |
| "loss": 0.1347, | |
| "step": 234500 | |
| }, | |
| { | |
| "epoch": 22.938018545632016, | |
| "grad_norm": 0.18632791936397552, | |
| "learning_rate": 1.9282272299152416e-05, | |
| "loss": 0.1344, | |
| "step": 235000 | |
| }, | |
| { | |
| "epoch": 22.986822840409957, | |
| "grad_norm": 0.1905319094657898, | |
| "learning_rate": 1.9189027435829533e-05, | |
| "loss": 0.1344, | |
| "step": 235500 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "eval_loss": 0.11993886530399323, | |
| "eval_runtime": 27.3709, | |
| "eval_samples_per_second": 287.75, | |
| "eval_steps_per_second": 0.402, | |
| "step": 235635 | |
| }, | |
| { | |
| "epoch": 23.035627135187898, | |
| "grad_norm": 0.19479139149188995, | |
| "learning_rate": 1.909586795014532e-05, | |
| "loss": 0.1343, | |
| "step": 236000 | |
| }, | |
| { | |
| "epoch": 23.084431429965836, | |
| "grad_norm": 0.1937461495399475, | |
| "learning_rate": 1.9002795210844315e-05, | |
| "loss": 0.1341, | |
| "step": 236500 | |
| }, | |
| { | |
| "epoch": 23.133235724743777, | |
| "grad_norm": 0.1967599093914032, | |
| "learning_rate": 1.890981058539652e-05, | |
| "loss": 0.1342, | |
| "step": 237000 | |
| }, | |
| { | |
| "epoch": 23.182040019521718, | |
| "grad_norm": 0.1805768460035324, | |
| "learning_rate": 1.8816915439977333e-05, | |
| "loss": 0.1342, | |
| "step": 237500 | |
| }, | |
| { | |
| "epoch": 23.23084431429966, | |
| "grad_norm": 0.1909085512161255, | |
| "learning_rate": 1.8724111139447474e-05, | |
| "loss": 0.1342, | |
| "step": 238000 | |
| }, | |
| { | |
| "epoch": 23.279648609077597, | |
| "grad_norm": 0.18977640569210052, | |
| "learning_rate": 1.863139904733291e-05, | |
| "loss": 0.134, | |
| "step": 238500 | |
| }, | |
| { | |
| "epoch": 23.32845290385554, | |
| "grad_norm": 0.20453977584838867, | |
| "learning_rate": 1.853878052580485e-05, | |
| "loss": 0.1341, | |
| "step": 239000 | |
| }, | |
| { | |
| "epoch": 23.37725719863348, | |
| "grad_norm": 0.18761217594146729, | |
| "learning_rate": 1.8446256935659725e-05, | |
| "loss": 0.1341, | |
| "step": 239500 | |
| }, | |
| { | |
| "epoch": 23.42606149341142, | |
| "grad_norm": 0.19060368835926056, | |
| "learning_rate": 1.835382963629916e-05, | |
| "loss": 0.1341, | |
| "step": 240000 | |
| }, | |
| { | |
| "epoch": 23.474865788189362, | |
| "grad_norm": 0.18985587358474731, | |
| "learning_rate": 1.8261499985710057e-05, | |
| "loss": 0.1341, | |
| "step": 240500 | |
| }, | |
| { | |
| "epoch": 23.5236700829673, | |
| "grad_norm": 0.18299199640750885, | |
| "learning_rate": 1.81692693404446e-05, | |
| "loss": 0.1339, | |
| "step": 241000 | |
| }, | |
| { | |
| "epoch": 23.57247437774524, | |
| "grad_norm": 0.19359715282917023, | |
| "learning_rate": 1.807713905560034e-05, | |
| "loss": 0.1337, | |
| "step": 241500 | |
| }, | |
| { | |
| "epoch": 23.621278672523182, | |
| "grad_norm": 0.19137471914291382, | |
| "learning_rate": 1.79851104848003e-05, | |
| "loss": 0.1342, | |
| "step": 242000 | |
| }, | |
| { | |
| "epoch": 23.670082967301123, | |
| "grad_norm": 0.19408565759658813, | |
| "learning_rate": 1.7893184980173038e-05, | |
| "loss": 0.134, | |
| "step": 242500 | |
| }, | |
| { | |
| "epoch": 23.718887262079065, | |
| "grad_norm": 0.19133317470550537, | |
| "learning_rate": 1.7801363892332846e-05, | |
| "loss": 0.1339, | |
| "step": 243000 | |
| }, | |
| { | |
| "epoch": 23.767691556857002, | |
| "grad_norm": 0.18910160660743713, | |
| "learning_rate": 1.770964857035986e-05, | |
| "loss": 0.1338, | |
| "step": 243500 | |
| }, | |
| { | |
| "epoch": 23.816495851634944, | |
| "grad_norm": 0.1919185370206833, | |
| "learning_rate": 1.7618040361780246e-05, | |
| "loss": 0.134, | |
| "step": 244000 | |
| }, | |
| { | |
| "epoch": 23.865300146412885, | |
| "grad_norm": 0.19021070003509521, | |
| "learning_rate": 1.7526540612546433e-05, | |
| "loss": 0.1336, | |
| "step": 244500 | |
| }, | |
| { | |
| "epoch": 23.914104441190826, | |
| "grad_norm": 0.1913536936044693, | |
| "learning_rate": 1.743515066701726e-05, | |
| "loss": 0.1338, | |
| "step": 245000 | |
| }, | |
| { | |
| "epoch": 23.962908735968764, | |
| "grad_norm": 0.19384372234344482, | |
| "learning_rate": 1.734387186793834e-05, | |
| "loss": 0.1336, | |
| "step": 245500 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "eval_loss": 0.12013557553291321, | |
| "eval_runtime": 25.8494, | |
| "eval_samples_per_second": 304.688, | |
| "eval_steps_per_second": 0.426, | |
| "step": 245880 | |
| }, | |
| { | |
| "epoch": 24.011713030746705, | |
| "grad_norm": 0.1967899650335312, | |
| "learning_rate": 1.7252705556422237e-05, | |
| "loss": 0.1337, | |
| "step": 246000 | |
| }, | |
| { | |
| "epoch": 24.060517325524646, | |
| "grad_norm": 0.19760966300964355, | |
| "learning_rate": 1.7161653071928774e-05, | |
| "loss": 0.1335, | |
| "step": 246500 | |
| }, | |
| { | |
| "epoch": 24.109321620302588, | |
| "grad_norm": 0.20642146468162537, | |
| "learning_rate": 1.707071575224541e-05, | |
| "loss": 0.1334, | |
| "step": 247000 | |
| }, | |
| { | |
| "epoch": 24.15812591508053, | |
| "grad_norm": 0.1874382495880127, | |
| "learning_rate": 1.6979894933467533e-05, | |
| "loss": 0.1334, | |
| "step": 247500 | |
| }, | |
| { | |
| "epoch": 24.206930209858466, | |
| "grad_norm": 0.1970881223678589, | |
| "learning_rate": 1.6889191949978827e-05, | |
| "loss": 0.1336, | |
| "step": 248000 | |
| }, | |
| { | |
| "epoch": 24.255734504636408, | |
| "grad_norm": 0.1981177181005478, | |
| "learning_rate": 1.6798608134431705e-05, | |
| "loss": 0.1335, | |
| "step": 248500 | |
| }, | |
| { | |
| "epoch": 24.30453879941435, | |
| "grad_norm": 0.1884346753358841, | |
| "learning_rate": 1.6708144817727685e-05, | |
| "loss": 0.1331, | |
| "step": 249000 | |
| }, | |
| { | |
| "epoch": 24.35334309419229, | |
| "grad_norm": 0.20689553022384644, | |
| "learning_rate": 1.6617803328997877e-05, | |
| "loss": 0.1336, | |
| "step": 249500 | |
| }, | |
| { | |
| "epoch": 24.402147388970228, | |
| "grad_norm": 0.21094737946987152, | |
| "learning_rate": 1.6527584995583428e-05, | |
| "loss": 0.1334, | |
| "step": 250000 | |
| }, | |
| { | |
| "epoch": 24.45095168374817, | |
| "grad_norm": 0.19117839634418488, | |
| "learning_rate": 1.643749114301602e-05, | |
| "loss": 0.133, | |
| "step": 250500 | |
| }, | |
| { | |
| "epoch": 24.49975597852611, | |
| "grad_norm": 0.1867818832397461, | |
| "learning_rate": 1.6347523094998413e-05, | |
| "loss": 0.1333, | |
| "step": 251000 | |
| }, | |
| { | |
| "epoch": 24.54856027330405, | |
| "grad_norm": 0.20169509947299957, | |
| "learning_rate": 1.6257682173384987e-05, | |
| "loss": 0.1332, | |
| "step": 251500 | |
| }, | |
| { | |
| "epoch": 24.597364568081993, | |
| "grad_norm": 0.20369745790958405, | |
| "learning_rate": 1.616796969816229e-05, | |
| "loss": 0.1332, | |
| "step": 252000 | |
| }, | |
| { | |
| "epoch": 24.64616886285993, | |
| "grad_norm": 0.19687892496585846, | |
| "learning_rate": 1.607838698742972e-05, | |
| "loss": 0.1334, | |
| "step": 252500 | |
| }, | |
| { | |
| "epoch": 24.69497315763787, | |
| "grad_norm": 0.20486027002334595, | |
| "learning_rate": 1.5988935357380068e-05, | |
| "loss": 0.1331, | |
| "step": 253000 | |
| }, | |
| { | |
| "epoch": 24.743777452415813, | |
| "grad_norm": 0.18960419297218323, | |
| "learning_rate": 1.5899616122280248e-05, | |
| "loss": 0.1329, | |
| "step": 253500 | |
| }, | |
| { | |
| "epoch": 24.792581747193754, | |
| "grad_norm": 0.18826229870319366, | |
| "learning_rate": 1.581043059445197e-05, | |
| "loss": 0.1331, | |
| "step": 254000 | |
| }, | |
| { | |
| "epoch": 24.841386041971692, | |
| "grad_norm": 0.18491852283477783, | |
| "learning_rate": 1.572138008425242e-05, | |
| "loss": 0.1329, | |
| "step": 254500 | |
| }, | |
| { | |
| "epoch": 24.890190336749633, | |
| "grad_norm": 0.19611585140228271, | |
| "learning_rate": 1.5632465900055073e-05, | |
| "loss": 0.1329, | |
| "step": 255000 | |
| }, | |
| { | |
| "epoch": 24.938994631527574, | |
| "grad_norm": 0.1943751573562622, | |
| "learning_rate": 1.5543689348230415e-05, | |
| "loss": 0.1329, | |
| "step": 255500 | |
| }, | |
| { | |
| "epoch": 24.987798926305516, | |
| "grad_norm": 0.1918455809354782, | |
| "learning_rate": 1.545505173312678e-05, | |
| "loss": 0.1328, | |
| "step": 256000 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "eval_loss": 0.11874233186244965, | |
| "eval_runtime": 24.4939, | |
| "eval_samples_per_second": 321.55, | |
| "eval_steps_per_second": 0.449, | |
| "step": 256125 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 409800, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 40, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.917586395968569e+18, | |
| "train_batch_size": 384, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |