| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 6.524030177821643, | |
| "eval_steps": 2500, | |
| "global_step": 97500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.03, | |
| "learning_rate": 0.0005, | |
| "loss": 0.5631, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "learning_rate": 0.0004974874371859296, | |
| "loss": 0.14, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "learning_rate": 0.0004949748743718593, | |
| "loss": 0.0804, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "learning_rate": 0.0004924623115577889, | |
| "loss": 0.0597, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "learning_rate": 0.0004899497487437187, | |
| "loss": 0.0501, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "eval_loss": 0.033905673772096634, | |
| "eval_runtime": 8.0076, | |
| "eval_samples_per_second": 124.881, | |
| "eval_steps_per_second": 1.249, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "learning_rate": 0.00048743718592964827, | |
| "loss": 0.0436, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "learning_rate": 0.0004849246231155779, | |
| "loss": 0.0398, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "learning_rate": 0.00048241206030150754, | |
| "loss": 0.0374, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "learning_rate": 0.0004798994974874372, | |
| "loss": 0.035, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "learning_rate": 0.00047738693467336686, | |
| "loss": 0.0331, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "eval_loss": 0.023899264633655548, | |
| "eval_runtime": 7.9042, | |
| "eval_samples_per_second": 126.515, | |
| "eval_steps_per_second": 1.265, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "learning_rate": 0.00047487437185929647, | |
| "loss": 0.0313, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "learning_rate": 0.00047236180904522613, | |
| "loss": 0.03, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "learning_rate": 0.0004698492462311558, | |
| "loss": 0.0287, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "learning_rate": 0.00046733668341708545, | |
| "loss": 0.0277, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "learning_rate": 0.0004648241206030151, | |
| "loss": 0.0271, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "eval_loss": 0.01935673877596855, | |
| "eval_runtime": 8.4605, | |
| "eval_samples_per_second": 118.197, | |
| "eval_steps_per_second": 1.182, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "learning_rate": 0.0004623115577889447, | |
| "loss": 0.0262, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "learning_rate": 0.0004597989949748744, | |
| "loss": 0.0253, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "learning_rate": 0.000457286432160804, | |
| "loss": 0.0244, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "learning_rate": 0.0004547738693467337, | |
| "loss": 0.0239, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "learning_rate": 0.0004522613065326633, | |
| "loss": 0.0238, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "eval_loss": 0.017778538167476654, | |
| "eval_runtime": 7.6198, | |
| "eval_samples_per_second": 131.237, | |
| "eval_steps_per_second": 1.312, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "learning_rate": 0.000449748743718593, | |
| "loss": 0.0229, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "learning_rate": 0.0004472361809045226, | |
| "loss": 0.0223, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "learning_rate": 0.00044472361809045225, | |
| "loss": 0.0219, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "learning_rate": 0.00044221105527638197, | |
| "loss": 0.0216, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "learning_rate": 0.0004396984924623116, | |
| "loss": 0.0212, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "eval_loss": 0.014742564409971237, | |
| "eval_runtime": 7.6159, | |
| "eval_samples_per_second": 131.305, | |
| "eval_steps_per_second": 1.313, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "learning_rate": 0.00043718592964824124, | |
| "loss": 0.0206, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "learning_rate": 0.00043467336683417085, | |
| "loss": 0.0205, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "learning_rate": 0.0004321608040201005, | |
| "loss": 0.0198, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "learning_rate": 0.0004296482412060301, | |
| "loss": 0.0198, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "learning_rate": 0.00042713567839195983, | |
| "loss": 0.0193, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.01463437918573618, | |
| "eval_runtime": 7.6369, | |
| "eval_samples_per_second": 130.944, | |
| "eval_steps_per_second": 1.309, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "learning_rate": 0.00042462311557788944, | |
| "loss": 0.0178, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "learning_rate": 0.0004221105527638191, | |
| "loss": 0.0177, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "learning_rate": 0.00041959798994974876, | |
| "loss": 0.0172, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "learning_rate": 0.00041708542713567837, | |
| "loss": 0.017, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "learning_rate": 0.0004145728643216081, | |
| "loss": 0.0172, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "eval_loss": 0.0131097212433815, | |
| "eval_runtime": 7.63, | |
| "eval_samples_per_second": 131.062, | |
| "eval_steps_per_second": 1.311, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "learning_rate": 0.0004120603015075377, | |
| "loss": 0.0168, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "learning_rate": 0.00040954773869346736, | |
| "loss": 0.017, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "learning_rate": 0.00040703517587939697, | |
| "loss": 0.0165, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "learning_rate": 0.00040452261306532663, | |
| "loss": 0.0163, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "learning_rate": 0.0004020100502512563, | |
| "loss": 0.0163, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "eval_loss": 0.012624073773622513, | |
| "eval_runtime": 7.6613, | |
| "eval_samples_per_second": 130.527, | |
| "eval_steps_per_second": 1.305, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "learning_rate": 0.00039949748743718595, | |
| "loss": 0.0161, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "learning_rate": 0.00039698492462311556, | |
| "loss": 0.0162, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "learning_rate": 0.0003944723618090452, | |
| "loss": 0.0162, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "learning_rate": 0.0003919597989949749, | |
| "loss": 0.0158, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "learning_rate": 0.00038944723618090455, | |
| "loss": 0.0157, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "eval_loss": 0.0122187789529562, | |
| "eval_runtime": 7.6253, | |
| "eval_samples_per_second": 131.143, | |
| "eval_steps_per_second": 1.311, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "learning_rate": 0.0003869346733668342, | |
| "loss": 0.0289, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "learning_rate": 0.0003844221105527638, | |
| "loss": 0.0157, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "learning_rate": 0.0003819095477386935, | |
| "loss": 0.0148, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "learning_rate": 0.0003793969849246231, | |
| "loss": 0.0148, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "learning_rate": 0.0003768844221105528, | |
| "loss": 0.0147, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "eval_loss": 0.011391976848244667, | |
| "eval_runtime": 7.5953, | |
| "eval_samples_per_second": 131.66, | |
| "eval_steps_per_second": 1.317, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "learning_rate": 0.0003743718592964824, | |
| "loss": 0.0148, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "learning_rate": 0.00037185929648241207, | |
| "loss": 0.0146, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "learning_rate": 0.0003693467336683417, | |
| "loss": 0.0145, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "learning_rate": 0.00036683417085427134, | |
| "loss": 0.0143, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "learning_rate": 0.00036432160804020106, | |
| "loss": 0.0144, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "eval_loss": 0.011031309142708778, | |
| "eval_runtime": 7.6444, | |
| "eval_samples_per_second": 130.814, | |
| "eval_steps_per_second": 1.308, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "learning_rate": 0.00036180904522613067, | |
| "loss": 0.0142, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "learning_rate": 0.00035929648241206033, | |
| "loss": 0.014, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "learning_rate": 0.00035678391959798994, | |
| "loss": 0.0143, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "learning_rate": 0.0003542713567839196, | |
| "loss": 0.0144, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "learning_rate": 0.00035175879396984926, | |
| "loss": 0.0136, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "eval_loss": 0.010939952917397022, | |
| "eval_runtime": 7.6117, | |
| "eval_samples_per_second": 131.376, | |
| "eval_steps_per_second": 1.314, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "learning_rate": 0.0003492462311557789, | |
| "loss": 0.0121, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "learning_rate": 0.00034673366834170853, | |
| "loss": 0.0121, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "learning_rate": 0.0003442211055276382, | |
| "loss": 0.0122, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "learning_rate": 0.0003417085427135678, | |
| "loss": 0.012, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "learning_rate": 0.0003391959798994975, | |
| "loss": 0.0121, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "eval_loss": 0.010623764246702194, | |
| "eval_runtime": 7.623, | |
| "eval_samples_per_second": 131.182, | |
| "eval_steps_per_second": 1.312, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "learning_rate": 0.0003366834170854272, | |
| "loss": 0.012, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "learning_rate": 0.0003341708542713568, | |
| "loss": 0.0119, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "learning_rate": 0.00033165829145728645, | |
| "loss": 0.0119, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "learning_rate": 0.00032914572864321606, | |
| "loss": 0.0118, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "learning_rate": 0.00032663316582914577, | |
| "loss": 0.0118, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "eval_loss": 0.01009445358067751, | |
| "eval_runtime": 7.8168, | |
| "eval_samples_per_second": 127.93, | |
| "eval_steps_per_second": 1.279, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "learning_rate": 0.0003241206030150754, | |
| "loss": 0.0119, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "learning_rate": 0.00032160804020100504, | |
| "loss": 0.0119, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "learning_rate": 0.00031909547738693465, | |
| "loss": 0.0117, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "learning_rate": 0.0003165829145728643, | |
| "loss": 0.0117, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "learning_rate": 0.00031407035175879403, | |
| "loss": 0.0115, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "eval_loss": 0.010385086759924889, | |
| "eval_runtime": 8.0071, | |
| "eval_samples_per_second": 124.889, | |
| "eval_steps_per_second": 1.249, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "learning_rate": 0.00031155778894472364, | |
| "loss": 0.0116, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "learning_rate": 0.0003090452261306533, | |
| "loss": 0.0115, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "learning_rate": 0.0003065326633165829, | |
| "loss": 0.0115, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "learning_rate": 0.00030402010050251257, | |
| "loss": 0.0117, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "learning_rate": 0.0003015075376884422, | |
| "loss": 0.0116, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "eval_loss": 0.010146565735340118, | |
| "eval_runtime": 7.8031, | |
| "eval_samples_per_second": 128.154, | |
| "eval_steps_per_second": 1.282, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "learning_rate": 0.0002989949748743719, | |
| "loss": 0.0115, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "learning_rate": 0.0002964824120603015, | |
| "loss": 0.0113, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "learning_rate": 0.00029396984924623116, | |
| "loss": 0.0112, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "learning_rate": 0.00029145728643216077, | |
| "loss": 0.0113, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "learning_rate": 0.00028894472361809043, | |
| "loss": 0.0113, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "eval_loss": 0.009525712579488754, | |
| "eval_runtime": 7.6057, | |
| "eval_samples_per_second": 131.48, | |
| "eval_steps_per_second": 1.315, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "learning_rate": 0.00028643216080402015, | |
| "loss": 0.0111, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "learning_rate": 0.00028391959798994976, | |
| "loss": 0.0111, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "learning_rate": 0.0002814070351758794, | |
| "loss": 0.011, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "learning_rate": 0.000278894472361809, | |
| "loss": 0.0109, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 3.01, | |
| "learning_rate": 0.0002763819095477387, | |
| "loss": 0.0106, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 3.01, | |
| "eval_loss": 0.009837072342634201, | |
| "eval_runtime": 7.6063, | |
| "eval_samples_per_second": 131.469, | |
| "eval_steps_per_second": 1.315, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "learning_rate": 0.00027386934673366835, | |
| "loss": 0.0091, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "learning_rate": 0.000271356783919598, | |
| "loss": 0.009, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 3.11, | |
| "learning_rate": 0.0002688442211055276, | |
| "loss": 0.0088, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 3.14, | |
| "learning_rate": 0.0002663316582914573, | |
| "loss": 0.0091, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 3.18, | |
| "learning_rate": 0.0002638190954773869, | |
| "loss": 0.0092, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 3.18, | |
| "eval_loss": 0.009356345981359482, | |
| "eval_runtime": 7.6201, | |
| "eval_samples_per_second": 131.232, | |
| "eval_steps_per_second": 1.312, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 3.21, | |
| "learning_rate": 0.0002613065326633166, | |
| "loss": 0.0092, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "learning_rate": 0.00025879396984924627, | |
| "loss": 0.009, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 3.28, | |
| "learning_rate": 0.0002562814070351759, | |
| "loss": 0.0089, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 3.31, | |
| "learning_rate": 0.00025376884422110554, | |
| "loss": 0.0092, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 3.35, | |
| "learning_rate": 0.00025125628140703515, | |
| "loss": 0.0091, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 3.35, | |
| "eval_loss": 0.009372641332447529, | |
| "eval_runtime": 7.5969, | |
| "eval_samples_per_second": 131.633, | |
| "eval_steps_per_second": 1.316, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 3.38, | |
| "learning_rate": 0.0002487437185929648, | |
| "loss": 0.0091, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 3.41, | |
| "learning_rate": 0.00024623115577889447, | |
| "loss": 0.009, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 3.45, | |
| "learning_rate": 0.00024371859296482413, | |
| "loss": 0.0093, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "learning_rate": 0.00024120603015075377, | |
| "loss": 0.0092, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 3.51, | |
| "learning_rate": 0.00023869346733668343, | |
| "loss": 0.009, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 3.51, | |
| "eval_loss": 0.009138910099864006, | |
| "eval_runtime": 7.5893, | |
| "eval_samples_per_second": 131.765, | |
| "eval_steps_per_second": 1.318, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 3.55, | |
| "learning_rate": 0.00023618090452261307, | |
| "loss": 0.0091, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "learning_rate": 0.00023366834170854273, | |
| "loss": 0.0089, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 3.61, | |
| "learning_rate": 0.00023115577889447236, | |
| "loss": 0.009, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 3.65, | |
| "learning_rate": 0.000228643216080402, | |
| "loss": 0.0092, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "learning_rate": 0.00022613065326633166, | |
| "loss": 0.0091, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "eval_loss": 0.00910147000104189, | |
| "eval_runtime": 7.6234, | |
| "eval_samples_per_second": 131.175, | |
| "eval_steps_per_second": 1.312, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 3.71, | |
| "learning_rate": 0.0002236180904522613, | |
| "loss": 0.0089, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "learning_rate": 0.00022110552763819098, | |
| "loss": 0.0089, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 3.78, | |
| "learning_rate": 0.00021859296482412062, | |
| "loss": 0.009, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 3.81, | |
| "learning_rate": 0.00021608040201005025, | |
| "loss": 0.0091, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 3.85, | |
| "learning_rate": 0.00021356783919597992, | |
| "loss": 0.0089, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 3.85, | |
| "eval_loss": 0.009140917100012302, | |
| "eval_runtime": 7.5801, | |
| "eval_samples_per_second": 131.925, | |
| "eval_steps_per_second": 1.319, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "learning_rate": 0.00021105527638190955, | |
| "loss": 0.0089, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 3.91, | |
| "learning_rate": 0.00020854271356783919, | |
| "loss": 0.0088, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 3.95, | |
| "learning_rate": 0.00020603015075376885, | |
| "loss": 0.0089, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 3.98, | |
| "learning_rate": 0.00020351758793969848, | |
| "loss": 0.0088, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 4.01, | |
| "learning_rate": 0.00020100502512562814, | |
| "loss": 0.0081, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 4.01, | |
| "eval_loss": 0.00933513417840004, | |
| "eval_runtime": 7.577, | |
| "eval_samples_per_second": 131.978, | |
| "eval_steps_per_second": 1.32, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 4.05, | |
| "learning_rate": 0.00019849246231155778, | |
| "loss": 0.0068, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "learning_rate": 0.00019597989949748744, | |
| "loss": 0.0068, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 4.12, | |
| "learning_rate": 0.0001934673366834171, | |
| "loss": 0.007, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 4.15, | |
| "learning_rate": 0.00019095477386934674, | |
| "loss": 0.0068, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 4.18, | |
| "learning_rate": 0.0001884422110552764, | |
| "loss": 0.0069, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 4.18, | |
| "eval_loss": 0.009106965735554695, | |
| "eval_runtime": 7.6103, | |
| "eval_samples_per_second": 131.401, | |
| "eval_steps_per_second": 1.314, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 4.22, | |
| "learning_rate": 0.00018592964824120604, | |
| "loss": 0.0068, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 4.25, | |
| "learning_rate": 0.00018341708542713567, | |
| "loss": 0.007, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 4.28, | |
| "learning_rate": 0.00018090452261306533, | |
| "loss": 0.007, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 4.32, | |
| "learning_rate": 0.00017839195979899497, | |
| "loss": 0.0069, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 4.35, | |
| "learning_rate": 0.00017587939698492463, | |
| "loss": 0.0069, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 4.35, | |
| "eval_loss": 0.009035669267177582, | |
| "eval_runtime": 7.7828, | |
| "eval_samples_per_second": 128.488, | |
| "eval_steps_per_second": 1.285, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 4.38, | |
| "learning_rate": 0.00017336683417085427, | |
| "loss": 0.007, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 4.42, | |
| "learning_rate": 0.0001708542713567839, | |
| "loss": 0.0069, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 4.45, | |
| "learning_rate": 0.0001683417085427136, | |
| "loss": 0.0069, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 4.48, | |
| "learning_rate": 0.00016582914572864322, | |
| "loss": 0.0068, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 4.52, | |
| "learning_rate": 0.00016331658291457289, | |
| "loss": 0.0071, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 4.52, | |
| "eval_loss": 0.009078802540898323, | |
| "eval_runtime": 7.5849, | |
| "eval_samples_per_second": 131.841, | |
| "eval_steps_per_second": 1.318, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 4.55, | |
| "learning_rate": 0.00016080402010050252, | |
| "loss": 0.007, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 4.58, | |
| "learning_rate": 0.00015829145728643216, | |
| "loss": 0.0071, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 4.62, | |
| "learning_rate": 0.00015577889447236182, | |
| "loss": 0.0071, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 4.65, | |
| "learning_rate": 0.00015326633165829145, | |
| "loss": 0.0072, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 4.68, | |
| "learning_rate": 0.0001507537688442211, | |
| "loss": 0.0069, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 4.68, | |
| "eval_loss": 0.008746389299631119, | |
| "eval_runtime": 7.5714, | |
| "eval_samples_per_second": 132.075, | |
| "eval_steps_per_second": 1.321, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 4.72, | |
| "learning_rate": 0.00014824120603015075, | |
| "loss": 0.0071, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "learning_rate": 0.00014572864321608039, | |
| "loss": 0.007, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 4.78, | |
| "learning_rate": 0.00014321608040201007, | |
| "loss": 0.007, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 4.82, | |
| "learning_rate": 0.0001407035175879397, | |
| "loss": 0.007, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 4.85, | |
| "learning_rate": 0.00013819095477386934, | |
| "loss": 0.0069, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 4.85, | |
| "eval_loss": 0.008641643449664116, | |
| "eval_runtime": 7.59, | |
| "eval_samples_per_second": 131.752, | |
| "eval_steps_per_second": 1.318, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 4.88, | |
| "learning_rate": 0.000135678391959799, | |
| "loss": 0.007, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 4.92, | |
| "learning_rate": 0.00013316582914572864, | |
| "loss": 0.007, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 4.95, | |
| "learning_rate": 0.0001306532663316583, | |
| "loss": 0.0069, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 4.99, | |
| "learning_rate": 0.00012814070351758794, | |
| "loss": 0.0071, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 5.02, | |
| "learning_rate": 0.00012562814070351757, | |
| "loss": 0.0058, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 5.02, | |
| "eval_loss": 0.00928833894431591, | |
| "eval_runtime": 7.8485, | |
| "eval_samples_per_second": 127.413, | |
| "eval_steps_per_second": 1.274, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 5.05, | |
| "learning_rate": 0.00012311557788944724, | |
| "loss": 0.005, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 5.09, | |
| "learning_rate": 0.00012060301507537688, | |
| "loss": 0.0051, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 5.12, | |
| "learning_rate": 0.00011809045226130653, | |
| "loss": 0.0053, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 5.15, | |
| "learning_rate": 0.00011557788944723618, | |
| "loss": 0.0052, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 5.19, | |
| "learning_rate": 0.00011306532663316583, | |
| "loss": 0.0053, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 5.19, | |
| "eval_loss": 0.009319030679762363, | |
| "eval_runtime": 7.5874, | |
| "eval_samples_per_second": 131.797, | |
| "eval_steps_per_second": 1.318, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 5.22, | |
| "learning_rate": 0.00011055276381909549, | |
| "loss": 0.0053, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 5.25, | |
| "learning_rate": 0.00010804020100502513, | |
| "loss": 0.0054, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 5.29, | |
| "learning_rate": 0.00010552763819095478, | |
| "loss": 0.0052, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 5.32, | |
| "learning_rate": 0.00010301507537688442, | |
| "loss": 0.0053, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 5.35, | |
| "learning_rate": 0.00010050251256281407, | |
| "loss": 0.0052, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 5.35, | |
| "eval_loss": 0.008985009975731373, | |
| "eval_runtime": 7.8274, | |
| "eval_samples_per_second": 127.757, | |
| "eval_steps_per_second": 1.278, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 5.39, | |
| "learning_rate": 9.798994974874372e-05, | |
| "loss": 0.0053, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 5.42, | |
| "learning_rate": 9.547738693467337e-05, | |
| "loss": 0.0053, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 5.45, | |
| "learning_rate": 9.296482412060302e-05, | |
| "loss": 0.0052, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 5.49, | |
| "learning_rate": 9.045226130653267e-05, | |
| "loss": 0.0053, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 5.52, | |
| "learning_rate": 8.793969849246232e-05, | |
| "loss": 0.0052, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 5.52, | |
| "eval_loss": 0.008979461155831814, | |
| "eval_runtime": 7.6451, | |
| "eval_samples_per_second": 130.803, | |
| "eval_steps_per_second": 1.308, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 5.55, | |
| "learning_rate": 8.542713567839195e-05, | |
| "loss": 0.0051, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 5.59, | |
| "learning_rate": 8.291457286432161e-05, | |
| "loss": 0.0051, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 5.62, | |
| "learning_rate": 8.040201005025126e-05, | |
| "loss": 0.0053, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 5.65, | |
| "learning_rate": 7.788944723618091e-05, | |
| "loss": 0.0053, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 5.69, | |
| "learning_rate": 7.537688442211054e-05, | |
| "loss": 0.0052, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 5.69, | |
| "eval_loss": 0.008833312429487705, | |
| "eval_runtime": 7.5954, | |
| "eval_samples_per_second": 131.659, | |
| "eval_steps_per_second": 1.317, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 5.72, | |
| "learning_rate": 7.286432160804019e-05, | |
| "loss": 0.0053, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 5.75, | |
| "learning_rate": 7.035175879396985e-05, | |
| "loss": 0.0052, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 5.79, | |
| "learning_rate": 6.78391959798995e-05, | |
| "loss": 0.0052, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 5.82, | |
| "learning_rate": 6.532663316582915e-05, | |
| "loss": 0.0052, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 5.85, | |
| "learning_rate": 6.281407035175879e-05, | |
| "loss": 0.0052, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 5.85, | |
| "eval_loss": 0.00872196163982153, | |
| "eval_runtime": 7.5919, | |
| "eval_samples_per_second": 131.72, | |
| "eval_steps_per_second": 1.317, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 5.89, | |
| "learning_rate": 6.030150753768844e-05, | |
| "loss": 0.0051, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 5.92, | |
| "learning_rate": 5.778894472361809e-05, | |
| "loss": 0.0052, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 5.96, | |
| "learning_rate": 5.5276381909547746e-05, | |
| "loss": 0.0053, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 5.99, | |
| "learning_rate": 5.276381909547739e-05, | |
| "loss": 0.0052, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 6.02, | |
| "learning_rate": 5.0251256281407036e-05, | |
| "loss": 0.0043, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 6.02, | |
| "eval_loss": 0.009319793432950974, | |
| "eval_runtime": 7.6253, | |
| "eval_samples_per_second": 131.143, | |
| "eval_steps_per_second": 1.311, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 6.06, | |
| "learning_rate": 4.7738693467336685e-05, | |
| "loss": 0.0039, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 6.09, | |
| "learning_rate": 4.522613065326633e-05, | |
| "loss": 0.0039, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 6.12, | |
| "learning_rate": 4.2713567839195975e-05, | |
| "loss": 0.004, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 6.16, | |
| "learning_rate": 4.020100502512563e-05, | |
| "loss": 0.004, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 6.19, | |
| "learning_rate": 3.768844221105527e-05, | |
| "loss": 0.0038, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 6.19, | |
| "eval_loss": 0.009469099342823029, | |
| "eval_runtime": 7.9167, | |
| "eval_samples_per_second": 126.315, | |
| "eval_steps_per_second": 1.263, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 6.22, | |
| "learning_rate": 3.517587939698493e-05, | |
| "loss": 0.004, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 6.26, | |
| "learning_rate": 3.2663316582914576e-05, | |
| "loss": 0.0039, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 6.29, | |
| "learning_rate": 3.015075376884422e-05, | |
| "loss": 0.0039, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 6.32, | |
| "learning_rate": 2.7638190954773873e-05, | |
| "loss": 0.004, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 6.36, | |
| "learning_rate": 2.5125628140703518e-05, | |
| "loss": 0.0039, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 6.36, | |
| "eval_loss": 0.009595912881195545, | |
| "eval_runtime": 7.607, | |
| "eval_samples_per_second": 131.458, | |
| "eval_steps_per_second": 1.315, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 6.39, | |
| "learning_rate": 2.2613065326633167e-05, | |
| "loss": 0.004, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 6.42, | |
| "learning_rate": 2.0100502512562815e-05, | |
| "loss": 0.0038, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 6.46, | |
| "learning_rate": 1.7587939698492464e-05, | |
| "loss": 0.0039, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 6.49, | |
| "learning_rate": 1.507537688442211e-05, | |
| "loss": 0.004, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 6.52, | |
| "learning_rate": 1.2562814070351759e-05, | |
| "loss": 0.0039, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 6.52, | |
| "eval_loss": 0.009522764943540096, | |
| "eval_runtime": 7.7116, | |
| "eval_samples_per_second": 129.674, | |
| "eval_steps_per_second": 1.297, | |
| "step": 97500 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 100000, | |
| "num_train_epochs": 7, | |
| "save_steps": 500, | |
| "total_flos": 6.676282499901161e+18, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |