{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.524030177821643, "eval_steps": 2500, "global_step": 97500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "learning_rate": 0.0005, "loss": 0.5631, "step": 500 }, { "epoch": 0.07, "learning_rate": 0.0004974874371859296, "loss": 0.14, "step": 1000 }, { "epoch": 0.1, "learning_rate": 0.0004949748743718593, "loss": 0.0804, "step": 1500 }, { "epoch": 0.13, "learning_rate": 0.0004924623115577889, "loss": 0.0597, "step": 2000 }, { "epoch": 0.17, "learning_rate": 0.0004899497487437187, "loss": 0.0501, "step": 2500 }, { "epoch": 0.17, "eval_loss": 0.033905673772096634, "eval_runtime": 8.0076, "eval_samples_per_second": 124.881, "eval_steps_per_second": 1.249, "step": 2500 }, { "epoch": 0.2, "learning_rate": 0.00048743718592964827, "loss": 0.0436, "step": 3000 }, { "epoch": 0.23, "learning_rate": 0.0004849246231155779, "loss": 0.0398, "step": 3500 }, { "epoch": 0.27, "learning_rate": 0.00048241206030150754, "loss": 0.0374, "step": 4000 }, { "epoch": 0.3, "learning_rate": 0.0004798994974874372, "loss": 0.035, "step": 4500 }, { "epoch": 0.33, "learning_rate": 0.00047738693467336686, "loss": 0.0331, "step": 5000 }, { "epoch": 0.33, "eval_loss": 0.023899264633655548, "eval_runtime": 7.9042, "eval_samples_per_second": 126.515, "eval_steps_per_second": 1.265, "step": 5000 }, { "epoch": 0.37, "learning_rate": 0.00047487437185929647, "loss": 0.0313, "step": 5500 }, { "epoch": 0.4, "learning_rate": 0.00047236180904522613, "loss": 0.03, "step": 6000 }, { "epoch": 0.43, "learning_rate": 0.0004698492462311558, "loss": 0.0287, "step": 6500 }, { "epoch": 0.47, "learning_rate": 0.00046733668341708545, "loss": 0.0277, "step": 7000 }, { "epoch": 0.5, "learning_rate": 0.0004648241206030151, "loss": 0.0271, "step": 7500 }, { "epoch": 0.5, "eval_loss": 0.01935673877596855, "eval_runtime": 8.4605, "eval_samples_per_second": 118.197, "eval_steps_per_second": 1.182, "step": 7500 }, { "epoch": 0.54, "learning_rate": 0.0004623115577889447, "loss": 0.0262, "step": 8000 }, { "epoch": 0.57, "learning_rate": 0.0004597989949748744, "loss": 0.0253, "step": 8500 }, { "epoch": 0.6, "learning_rate": 0.000457286432160804, "loss": 0.0244, "step": 9000 }, { "epoch": 0.64, "learning_rate": 0.0004547738693467337, "loss": 0.0239, "step": 9500 }, { "epoch": 0.67, "learning_rate": 0.0004522613065326633, "loss": 0.0238, "step": 10000 }, { "epoch": 0.67, "eval_loss": 0.017778538167476654, "eval_runtime": 7.6198, "eval_samples_per_second": 131.237, "eval_steps_per_second": 1.312, "step": 10000 }, { "epoch": 0.7, "learning_rate": 0.000449748743718593, "loss": 0.0229, "step": 10500 }, { "epoch": 0.74, "learning_rate": 0.0004472361809045226, "loss": 0.0223, "step": 11000 }, { "epoch": 0.77, "learning_rate": 0.00044472361809045225, "loss": 0.0219, "step": 11500 }, { "epoch": 0.8, "learning_rate": 0.00044221105527638197, "loss": 0.0216, "step": 12000 }, { "epoch": 0.84, "learning_rate": 0.0004396984924623116, "loss": 0.0212, "step": 12500 }, { "epoch": 0.84, "eval_loss": 0.014742564409971237, "eval_runtime": 7.6159, "eval_samples_per_second": 131.305, "eval_steps_per_second": 1.313, "step": 12500 }, { "epoch": 0.87, "learning_rate": 0.00043718592964824124, "loss": 0.0206, "step": 13000 }, { "epoch": 0.9, "learning_rate": 0.00043467336683417085, "loss": 0.0205, "step": 13500 }, { "epoch": 0.94, "learning_rate": 0.0004321608040201005, "loss": 0.0198, "step": 14000 }, { "epoch": 0.97, "learning_rate": 0.0004296482412060301, "loss": 0.0198, "step": 14500 }, { "epoch": 1.0, "learning_rate": 0.00042713567839195983, "loss": 0.0193, "step": 15000 }, { "epoch": 1.0, "eval_loss": 0.01463437918573618, "eval_runtime": 7.6369, "eval_samples_per_second": 130.944, "eval_steps_per_second": 1.309, "step": 15000 }, { "epoch": 1.04, "learning_rate": 0.00042462311557788944, "loss": 0.0178, "step": 15500 }, { "epoch": 1.07, "learning_rate": 0.0004221105527638191, "loss": 0.0177, "step": 16000 }, { "epoch": 1.1, "learning_rate": 0.00041959798994974876, "loss": 0.0172, "step": 16500 }, { "epoch": 1.14, "learning_rate": 0.00041708542713567837, "loss": 0.017, "step": 17000 }, { "epoch": 1.17, "learning_rate": 0.0004145728643216081, "loss": 0.0172, "step": 17500 }, { "epoch": 1.17, "eval_loss": 0.0131097212433815, "eval_runtime": 7.63, "eval_samples_per_second": 131.062, "eval_steps_per_second": 1.311, "step": 17500 }, { "epoch": 1.2, "learning_rate": 0.0004120603015075377, "loss": 0.0168, "step": 18000 }, { "epoch": 1.24, "learning_rate": 0.00040954773869346736, "loss": 0.017, "step": 18500 }, { "epoch": 1.27, "learning_rate": 0.00040703517587939697, "loss": 0.0165, "step": 19000 }, { "epoch": 1.3, "learning_rate": 0.00040452261306532663, "loss": 0.0163, "step": 19500 }, { "epoch": 1.34, "learning_rate": 0.0004020100502512563, "loss": 0.0163, "step": 20000 }, { "epoch": 1.34, "eval_loss": 0.012624073773622513, "eval_runtime": 7.6613, "eval_samples_per_second": 130.527, "eval_steps_per_second": 1.305, "step": 20000 }, { "epoch": 1.37, "learning_rate": 0.00039949748743718595, "loss": 0.0161, "step": 20500 }, { "epoch": 1.41, "learning_rate": 0.00039698492462311556, "loss": 0.0162, "step": 21000 }, { "epoch": 1.44, "learning_rate": 0.0003944723618090452, "loss": 0.0162, "step": 21500 }, { "epoch": 1.47, "learning_rate": 0.0003919597989949749, "loss": 0.0158, "step": 22000 }, { "epoch": 1.51, "learning_rate": 0.00038944723618090455, "loss": 0.0157, "step": 22500 }, { "epoch": 1.51, "eval_loss": 0.0122187789529562, "eval_runtime": 7.6253, "eval_samples_per_second": 131.143, "eval_steps_per_second": 1.311, "step": 22500 }, { "epoch": 1.54, "learning_rate": 0.0003869346733668342, "loss": 0.0289, "step": 23000 }, { "epoch": 1.57, "learning_rate": 0.0003844221105527638, "loss": 0.0157, "step": 23500 }, { "epoch": 1.61, "learning_rate": 0.0003819095477386935, "loss": 0.0148, "step": 24000 }, { "epoch": 1.64, "learning_rate": 0.0003793969849246231, "loss": 0.0148, "step": 24500 }, { "epoch": 1.67, "learning_rate": 0.0003768844221105528, "loss": 0.0147, "step": 25000 }, { "epoch": 1.67, "eval_loss": 0.011391976848244667, "eval_runtime": 7.5953, "eval_samples_per_second": 131.66, "eval_steps_per_second": 1.317, "step": 25000 }, { "epoch": 1.71, "learning_rate": 0.0003743718592964824, "loss": 0.0148, "step": 25500 }, { "epoch": 1.74, "learning_rate": 0.00037185929648241207, "loss": 0.0146, "step": 26000 }, { "epoch": 1.77, "learning_rate": 0.0003693467336683417, "loss": 0.0145, "step": 26500 }, { "epoch": 1.81, "learning_rate": 0.00036683417085427134, "loss": 0.0143, "step": 27000 }, { "epoch": 1.84, "learning_rate": 0.00036432160804020106, "loss": 0.0144, "step": 27500 }, { "epoch": 1.84, "eval_loss": 0.011031309142708778, "eval_runtime": 7.6444, "eval_samples_per_second": 130.814, "eval_steps_per_second": 1.308, "step": 27500 }, { "epoch": 1.87, "learning_rate": 0.00036180904522613067, "loss": 0.0142, "step": 28000 }, { "epoch": 1.91, "learning_rate": 0.00035929648241206033, "loss": 0.014, "step": 28500 }, { "epoch": 1.94, "learning_rate": 0.00035678391959798994, "loss": 0.0143, "step": 29000 }, { "epoch": 1.97, "learning_rate": 0.0003542713567839196, "loss": 0.0144, "step": 29500 }, { "epoch": 2.01, "learning_rate": 0.00035175879396984926, "loss": 0.0136, "step": 30000 }, { "epoch": 2.01, "eval_loss": 0.010939952917397022, "eval_runtime": 7.6117, "eval_samples_per_second": 131.376, "eval_steps_per_second": 1.314, "step": 30000 }, { "epoch": 2.04, "learning_rate": 0.0003492462311557789, "loss": 0.0121, "step": 30500 }, { "epoch": 2.07, "learning_rate": 0.00034673366834170853, "loss": 0.0121, "step": 31000 }, { "epoch": 2.11, "learning_rate": 0.0003442211055276382, "loss": 0.0122, "step": 31500 }, { "epoch": 2.14, "learning_rate": 0.0003417085427135678, "loss": 0.012, "step": 32000 }, { "epoch": 2.17, "learning_rate": 0.0003391959798994975, "loss": 0.0121, "step": 32500 }, { "epoch": 2.17, "eval_loss": 0.010623764246702194, "eval_runtime": 7.623, "eval_samples_per_second": 131.182, "eval_steps_per_second": 1.312, "step": 32500 }, { "epoch": 2.21, "learning_rate": 0.0003366834170854272, "loss": 0.012, "step": 33000 }, { "epoch": 2.24, "learning_rate": 0.0003341708542713568, "loss": 0.0119, "step": 33500 }, { "epoch": 2.28, "learning_rate": 0.00033165829145728645, "loss": 0.0119, "step": 34000 }, { "epoch": 2.31, "learning_rate": 0.00032914572864321606, "loss": 0.0118, "step": 34500 }, { "epoch": 2.34, "learning_rate": 0.00032663316582914577, "loss": 0.0118, "step": 35000 }, { "epoch": 2.34, "eval_loss": 0.01009445358067751, "eval_runtime": 7.8168, "eval_samples_per_second": 127.93, "eval_steps_per_second": 1.279, "step": 35000 }, { "epoch": 2.38, "learning_rate": 0.0003241206030150754, "loss": 0.0119, "step": 35500 }, { "epoch": 2.41, "learning_rate": 0.00032160804020100504, "loss": 0.0119, "step": 36000 }, { "epoch": 2.44, "learning_rate": 0.00031909547738693465, "loss": 0.0117, "step": 36500 }, { "epoch": 2.48, "learning_rate": 0.0003165829145728643, "loss": 0.0117, "step": 37000 }, { "epoch": 2.51, "learning_rate": 0.00031407035175879403, "loss": 0.0115, "step": 37500 }, { "epoch": 2.51, "eval_loss": 0.010385086759924889, "eval_runtime": 8.0071, "eval_samples_per_second": 124.889, "eval_steps_per_second": 1.249, "step": 37500 }, { "epoch": 2.54, "learning_rate": 0.00031155778894472364, "loss": 0.0116, "step": 38000 }, { "epoch": 2.58, "learning_rate": 0.0003090452261306533, "loss": 0.0115, "step": 38500 }, { "epoch": 2.61, "learning_rate": 0.0003065326633165829, "loss": 0.0115, "step": 39000 }, { "epoch": 2.64, "learning_rate": 0.00030402010050251257, "loss": 0.0117, "step": 39500 }, { "epoch": 2.68, "learning_rate": 0.0003015075376884422, "loss": 0.0116, "step": 40000 }, { "epoch": 2.68, "eval_loss": 0.010146565735340118, "eval_runtime": 7.8031, "eval_samples_per_second": 128.154, "eval_steps_per_second": 1.282, "step": 40000 }, { "epoch": 2.71, "learning_rate": 0.0002989949748743719, "loss": 0.0115, "step": 40500 }, { "epoch": 2.74, "learning_rate": 0.0002964824120603015, "loss": 0.0113, "step": 41000 }, { "epoch": 2.78, "learning_rate": 0.00029396984924623116, "loss": 0.0112, "step": 41500 }, { "epoch": 2.81, "learning_rate": 0.00029145728643216077, "loss": 0.0113, "step": 42000 }, { "epoch": 2.84, "learning_rate": 0.00028894472361809043, "loss": 0.0113, "step": 42500 }, { "epoch": 2.84, "eval_loss": 0.009525712579488754, "eval_runtime": 7.6057, "eval_samples_per_second": 131.48, "eval_steps_per_second": 1.315, "step": 42500 }, { "epoch": 2.88, "learning_rate": 0.00028643216080402015, "loss": 0.0111, "step": 43000 }, { "epoch": 2.91, "learning_rate": 0.00028391959798994976, "loss": 0.0111, "step": 43500 }, { "epoch": 2.94, "learning_rate": 0.0002814070351758794, "loss": 0.011, "step": 44000 }, { "epoch": 2.98, "learning_rate": 0.000278894472361809, "loss": 0.0109, "step": 44500 }, { "epoch": 3.01, "learning_rate": 0.0002763819095477387, "loss": 0.0106, "step": 45000 }, { "epoch": 3.01, "eval_loss": 0.009837072342634201, "eval_runtime": 7.6063, "eval_samples_per_second": 131.469, "eval_steps_per_second": 1.315, "step": 45000 }, { "epoch": 3.04, "learning_rate": 0.00027386934673366835, "loss": 0.0091, "step": 45500 }, { "epoch": 3.08, "learning_rate": 0.000271356783919598, "loss": 0.009, "step": 46000 }, { "epoch": 3.11, "learning_rate": 0.0002688442211055276, "loss": 0.0088, "step": 46500 }, { "epoch": 3.14, "learning_rate": 0.0002663316582914573, "loss": 0.0091, "step": 47000 }, { "epoch": 3.18, "learning_rate": 0.0002638190954773869, "loss": 0.0092, "step": 47500 }, { "epoch": 3.18, "eval_loss": 0.009356345981359482, "eval_runtime": 7.6201, "eval_samples_per_second": 131.232, "eval_steps_per_second": 1.312, "step": 47500 }, { "epoch": 3.21, "learning_rate": 0.0002613065326633166, "loss": 0.0092, "step": 48000 }, { "epoch": 3.25, "learning_rate": 0.00025879396984924627, "loss": 0.009, "step": 48500 }, { "epoch": 3.28, "learning_rate": 0.0002562814070351759, "loss": 0.0089, "step": 49000 }, { "epoch": 3.31, "learning_rate": 0.00025376884422110554, "loss": 0.0092, "step": 49500 }, { "epoch": 3.35, "learning_rate": 0.00025125628140703515, "loss": 0.0091, "step": 50000 }, { "epoch": 3.35, "eval_loss": 0.009372641332447529, "eval_runtime": 7.5969, "eval_samples_per_second": 131.633, "eval_steps_per_second": 1.316, "step": 50000 }, { "epoch": 3.38, "learning_rate": 0.0002487437185929648, "loss": 0.0091, "step": 50500 }, { "epoch": 3.41, "learning_rate": 0.00024623115577889447, "loss": 0.009, "step": 51000 }, { "epoch": 3.45, "learning_rate": 0.00024371859296482413, "loss": 0.0093, "step": 51500 }, { "epoch": 3.48, "learning_rate": 0.00024120603015075377, "loss": 0.0092, "step": 52000 }, { "epoch": 3.51, "learning_rate": 0.00023869346733668343, "loss": 0.009, "step": 52500 }, { "epoch": 3.51, "eval_loss": 0.009138910099864006, "eval_runtime": 7.5893, "eval_samples_per_second": 131.765, "eval_steps_per_second": 1.318, "step": 52500 }, { "epoch": 3.55, "learning_rate": 0.00023618090452261307, "loss": 0.0091, "step": 53000 }, { "epoch": 3.58, "learning_rate": 0.00023366834170854273, "loss": 0.0089, "step": 53500 }, { "epoch": 3.61, "learning_rate": 0.00023115577889447236, "loss": 0.009, "step": 54000 }, { "epoch": 3.65, "learning_rate": 0.000228643216080402, "loss": 0.0092, "step": 54500 }, { "epoch": 3.68, "learning_rate": 0.00022613065326633166, "loss": 0.0091, "step": 55000 }, { "epoch": 3.68, "eval_loss": 0.00910147000104189, "eval_runtime": 7.6234, "eval_samples_per_second": 131.175, "eval_steps_per_second": 1.312, "step": 55000 }, { "epoch": 3.71, "learning_rate": 0.0002236180904522613, "loss": 0.0089, "step": 55500 }, { "epoch": 3.75, "learning_rate": 0.00022110552763819098, "loss": 0.0089, "step": 56000 }, { "epoch": 3.78, "learning_rate": 0.00021859296482412062, "loss": 0.009, "step": 56500 }, { "epoch": 3.81, "learning_rate": 0.00021608040201005025, "loss": 0.0091, "step": 57000 }, { "epoch": 3.85, "learning_rate": 0.00021356783919597992, "loss": 0.0089, "step": 57500 }, { "epoch": 3.85, "eval_loss": 0.009140917100012302, "eval_runtime": 7.5801, "eval_samples_per_second": 131.925, "eval_steps_per_second": 1.319, "step": 57500 }, { "epoch": 3.88, "learning_rate": 0.00021105527638190955, "loss": 0.0089, "step": 58000 }, { "epoch": 3.91, "learning_rate": 0.00020854271356783919, "loss": 0.0088, "step": 58500 }, { "epoch": 3.95, "learning_rate": 0.00020603015075376885, "loss": 0.0089, "step": 59000 }, { "epoch": 3.98, "learning_rate": 0.00020351758793969848, "loss": 0.0088, "step": 59500 }, { "epoch": 4.01, "learning_rate": 0.00020100502512562814, "loss": 0.0081, "step": 60000 }, { "epoch": 4.01, "eval_loss": 0.00933513417840004, "eval_runtime": 7.577, "eval_samples_per_second": 131.978, "eval_steps_per_second": 1.32, "step": 60000 }, { "epoch": 4.05, "learning_rate": 0.00019849246231155778, "loss": 0.0068, "step": 60500 }, { "epoch": 4.08, "learning_rate": 0.00019597989949748744, "loss": 0.0068, "step": 61000 }, { "epoch": 4.12, "learning_rate": 0.0001934673366834171, "loss": 0.007, "step": 61500 }, { "epoch": 4.15, "learning_rate": 0.00019095477386934674, "loss": 0.0068, "step": 62000 }, { "epoch": 4.18, "learning_rate": 0.0001884422110552764, "loss": 0.0069, "step": 62500 }, { "epoch": 4.18, "eval_loss": 0.009106965735554695, "eval_runtime": 7.6103, "eval_samples_per_second": 131.401, "eval_steps_per_second": 1.314, "step": 62500 }, { "epoch": 4.22, "learning_rate": 0.00018592964824120604, "loss": 0.0068, "step": 63000 }, { "epoch": 4.25, "learning_rate": 0.00018341708542713567, "loss": 0.007, "step": 63500 }, { "epoch": 4.28, "learning_rate": 0.00018090452261306533, "loss": 0.007, "step": 64000 }, { "epoch": 4.32, "learning_rate": 0.00017839195979899497, "loss": 0.0069, "step": 64500 }, { "epoch": 4.35, "learning_rate": 0.00017587939698492463, "loss": 0.0069, "step": 65000 }, { "epoch": 4.35, "eval_loss": 0.009035669267177582, "eval_runtime": 7.7828, "eval_samples_per_second": 128.488, "eval_steps_per_second": 1.285, "step": 65000 }, { "epoch": 4.38, "learning_rate": 0.00017336683417085427, "loss": 0.007, "step": 65500 }, { "epoch": 4.42, "learning_rate": 0.0001708542713567839, "loss": 0.0069, "step": 66000 }, { "epoch": 4.45, "learning_rate": 0.0001683417085427136, "loss": 0.0069, "step": 66500 }, { "epoch": 4.48, "learning_rate": 0.00016582914572864322, "loss": 0.0068, "step": 67000 }, { "epoch": 4.52, "learning_rate": 0.00016331658291457289, "loss": 0.0071, "step": 67500 }, { "epoch": 4.52, "eval_loss": 0.009078802540898323, "eval_runtime": 7.5849, "eval_samples_per_second": 131.841, "eval_steps_per_second": 1.318, "step": 67500 }, { "epoch": 4.55, "learning_rate": 0.00016080402010050252, "loss": 0.007, "step": 68000 }, { "epoch": 4.58, "learning_rate": 0.00015829145728643216, "loss": 0.0071, "step": 68500 }, { "epoch": 4.62, "learning_rate": 0.00015577889447236182, "loss": 0.0071, "step": 69000 }, { "epoch": 4.65, "learning_rate": 0.00015326633165829145, "loss": 0.0072, "step": 69500 }, { "epoch": 4.68, "learning_rate": 0.0001507537688442211, "loss": 0.0069, "step": 70000 }, { "epoch": 4.68, "eval_loss": 0.008746389299631119, "eval_runtime": 7.5714, "eval_samples_per_second": 132.075, "eval_steps_per_second": 1.321, "step": 70000 }, { "epoch": 4.72, "learning_rate": 0.00014824120603015075, "loss": 0.0071, "step": 70500 }, { "epoch": 4.75, "learning_rate": 0.00014572864321608039, "loss": 0.007, "step": 71000 }, { "epoch": 4.78, "learning_rate": 0.00014321608040201007, "loss": 0.007, "step": 71500 }, { "epoch": 4.82, "learning_rate": 0.0001407035175879397, "loss": 0.007, "step": 72000 }, { "epoch": 4.85, "learning_rate": 0.00013819095477386934, "loss": 0.0069, "step": 72500 }, { "epoch": 4.85, "eval_loss": 0.008641643449664116, "eval_runtime": 7.59, "eval_samples_per_second": 131.752, "eval_steps_per_second": 1.318, "step": 72500 }, { "epoch": 4.88, "learning_rate": 0.000135678391959799, "loss": 0.007, "step": 73000 }, { "epoch": 4.92, "learning_rate": 0.00013316582914572864, "loss": 0.007, "step": 73500 }, { "epoch": 4.95, "learning_rate": 0.0001306532663316583, "loss": 0.0069, "step": 74000 }, { "epoch": 4.99, "learning_rate": 0.00012814070351758794, "loss": 0.0071, "step": 74500 }, { "epoch": 5.02, "learning_rate": 0.00012562814070351757, "loss": 0.0058, "step": 75000 }, { "epoch": 5.02, "eval_loss": 0.00928833894431591, "eval_runtime": 7.8485, "eval_samples_per_second": 127.413, "eval_steps_per_second": 1.274, "step": 75000 }, { "epoch": 5.05, "learning_rate": 0.00012311557788944724, "loss": 0.005, "step": 75500 }, { "epoch": 5.09, "learning_rate": 0.00012060301507537688, "loss": 0.0051, "step": 76000 }, { "epoch": 5.12, "learning_rate": 0.00011809045226130653, "loss": 0.0053, "step": 76500 }, { "epoch": 5.15, "learning_rate": 0.00011557788944723618, "loss": 0.0052, "step": 77000 }, { "epoch": 5.19, "learning_rate": 0.00011306532663316583, "loss": 0.0053, "step": 77500 }, { "epoch": 5.19, "eval_loss": 0.009319030679762363, "eval_runtime": 7.5874, "eval_samples_per_second": 131.797, "eval_steps_per_second": 1.318, "step": 77500 }, { "epoch": 5.22, "learning_rate": 0.00011055276381909549, "loss": 0.0053, "step": 78000 }, { "epoch": 5.25, "learning_rate": 0.00010804020100502513, "loss": 0.0054, "step": 78500 }, { "epoch": 5.29, "learning_rate": 0.00010552763819095478, "loss": 0.0052, "step": 79000 }, { "epoch": 5.32, "learning_rate": 0.00010301507537688442, "loss": 0.0053, "step": 79500 }, { "epoch": 5.35, "learning_rate": 0.00010050251256281407, "loss": 0.0052, "step": 80000 }, { "epoch": 5.35, "eval_loss": 0.008985009975731373, "eval_runtime": 7.8274, "eval_samples_per_second": 127.757, "eval_steps_per_second": 1.278, "step": 80000 }, { "epoch": 5.39, "learning_rate": 9.798994974874372e-05, "loss": 0.0053, "step": 80500 }, { "epoch": 5.42, "learning_rate": 9.547738693467337e-05, "loss": 0.0053, "step": 81000 }, { "epoch": 5.45, "learning_rate": 9.296482412060302e-05, "loss": 0.0052, "step": 81500 }, { "epoch": 5.49, "learning_rate": 9.045226130653267e-05, "loss": 0.0053, "step": 82000 }, { "epoch": 5.52, "learning_rate": 8.793969849246232e-05, "loss": 0.0052, "step": 82500 }, { "epoch": 5.52, "eval_loss": 0.008979461155831814, "eval_runtime": 7.6451, "eval_samples_per_second": 130.803, "eval_steps_per_second": 1.308, "step": 82500 }, { "epoch": 5.55, "learning_rate": 8.542713567839195e-05, "loss": 0.0051, "step": 83000 }, { "epoch": 5.59, "learning_rate": 8.291457286432161e-05, "loss": 0.0051, "step": 83500 }, { "epoch": 5.62, "learning_rate": 8.040201005025126e-05, "loss": 0.0053, "step": 84000 }, { "epoch": 5.65, "learning_rate": 7.788944723618091e-05, "loss": 0.0053, "step": 84500 }, { "epoch": 5.69, "learning_rate": 7.537688442211054e-05, "loss": 0.0052, "step": 85000 }, { "epoch": 5.69, "eval_loss": 0.008833312429487705, "eval_runtime": 7.5954, "eval_samples_per_second": 131.659, "eval_steps_per_second": 1.317, "step": 85000 }, { "epoch": 5.72, "learning_rate": 7.286432160804019e-05, "loss": 0.0053, "step": 85500 }, { "epoch": 5.75, "learning_rate": 7.035175879396985e-05, "loss": 0.0052, "step": 86000 }, { "epoch": 5.79, "learning_rate": 6.78391959798995e-05, "loss": 0.0052, "step": 86500 }, { "epoch": 5.82, "learning_rate": 6.532663316582915e-05, "loss": 0.0052, "step": 87000 }, { "epoch": 5.85, "learning_rate": 6.281407035175879e-05, "loss": 0.0052, "step": 87500 }, { "epoch": 5.85, "eval_loss": 0.00872196163982153, "eval_runtime": 7.5919, "eval_samples_per_second": 131.72, "eval_steps_per_second": 1.317, "step": 87500 }, { "epoch": 5.89, "learning_rate": 6.030150753768844e-05, "loss": 0.0051, "step": 88000 }, { "epoch": 5.92, "learning_rate": 5.778894472361809e-05, "loss": 0.0052, "step": 88500 }, { "epoch": 5.96, "learning_rate": 5.5276381909547746e-05, "loss": 0.0053, "step": 89000 }, { "epoch": 5.99, "learning_rate": 5.276381909547739e-05, "loss": 0.0052, "step": 89500 }, { "epoch": 6.02, "learning_rate": 5.0251256281407036e-05, "loss": 0.0043, "step": 90000 }, { "epoch": 6.02, "eval_loss": 0.009319793432950974, "eval_runtime": 7.6253, "eval_samples_per_second": 131.143, "eval_steps_per_second": 1.311, "step": 90000 }, { "epoch": 6.06, "learning_rate": 4.7738693467336685e-05, "loss": 0.0039, "step": 90500 }, { "epoch": 6.09, "learning_rate": 4.522613065326633e-05, "loss": 0.0039, "step": 91000 }, { "epoch": 6.12, "learning_rate": 4.2713567839195975e-05, "loss": 0.004, "step": 91500 }, { "epoch": 6.16, "learning_rate": 4.020100502512563e-05, "loss": 0.004, "step": 92000 }, { "epoch": 6.19, "learning_rate": 3.768844221105527e-05, "loss": 0.0038, "step": 92500 }, { "epoch": 6.19, "eval_loss": 0.009469099342823029, "eval_runtime": 7.9167, "eval_samples_per_second": 126.315, "eval_steps_per_second": 1.263, "step": 92500 }, { "epoch": 6.22, "learning_rate": 3.517587939698493e-05, "loss": 0.004, "step": 93000 }, { "epoch": 6.26, "learning_rate": 3.2663316582914576e-05, "loss": 0.0039, "step": 93500 }, { "epoch": 6.29, "learning_rate": 3.015075376884422e-05, "loss": 0.0039, "step": 94000 }, { "epoch": 6.32, "learning_rate": 2.7638190954773873e-05, "loss": 0.004, "step": 94500 }, { "epoch": 6.36, "learning_rate": 2.5125628140703518e-05, "loss": 0.0039, "step": 95000 }, { "epoch": 6.36, "eval_loss": 0.009595912881195545, "eval_runtime": 7.607, "eval_samples_per_second": 131.458, "eval_steps_per_second": 1.315, "step": 95000 }, { "epoch": 6.39, "learning_rate": 2.2613065326633167e-05, "loss": 0.004, "step": 95500 }, { "epoch": 6.42, "learning_rate": 2.0100502512562815e-05, "loss": 0.0038, "step": 96000 }, { "epoch": 6.46, "learning_rate": 1.7587939698492464e-05, "loss": 0.0039, "step": 96500 }, { "epoch": 6.49, "learning_rate": 1.507537688442211e-05, "loss": 0.004, "step": 97000 }, { "epoch": 6.52, "learning_rate": 1.2562814070351759e-05, "loss": 0.0039, "step": 97500 }, { "epoch": 6.52, "eval_loss": 0.009522764943540096, "eval_runtime": 7.7116, "eval_samples_per_second": 129.674, "eval_steps_per_second": 1.297, "step": 97500 } ], "logging_steps": 500, "max_steps": 100000, "num_train_epochs": 7, "save_steps": 500, "total_flos": 6.676282499901161e+18, "trial_name": null, "trial_params": null }