| { | |
| "best_global_step": 25500, | |
| "best_metric": 0.21131116151809692, | |
| "best_model_checkpoint": "reverse_model/checkpoint-25500", | |
| "epoch": 3.7155762785953663, | |
| "eval_steps": 500, | |
| "global_step": 25500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.07285443683520326, | |
| "grad_norm": 0.7519411444664001, | |
| "learning_rate": 0.00019818228180096167, | |
| "loss": 0.974, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.07285443683520326, | |
| "eval_loss": 0.6119500398635864, | |
| "eval_runtime": 0.5563, | |
| "eval_samples_per_second": 179.775, | |
| "eval_steps_per_second": 23.371, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.14570887367040652, | |
| "grad_norm": 0.8617602586746216, | |
| "learning_rate": 0.0001963609208800816, | |
| "loss": 0.6234, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.14570887367040652, | |
| "eval_loss": 0.5182287096977234, | |
| "eval_runtime": 0.5766, | |
| "eval_samples_per_second": 173.435, | |
| "eval_steps_per_second": 22.546, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.2185633105056098, | |
| "grad_norm": 0.4657430648803711, | |
| "learning_rate": 0.0001945395599592015, | |
| "loss": 0.5373, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.2185633105056098, | |
| "eval_loss": 0.4636226296424866, | |
| "eval_runtime": 0.5903, | |
| "eval_samples_per_second": 169.403, | |
| "eval_steps_per_second": 22.022, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.29141774734081305, | |
| "grad_norm": 0.5456737875938416, | |
| "learning_rate": 0.00019271819903832145, | |
| "loss": 0.4952, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.29141774734081305, | |
| "eval_loss": 0.4364851415157318, | |
| "eval_runtime": 0.5613, | |
| "eval_samples_per_second": 178.171, | |
| "eval_steps_per_second": 23.162, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.3642721841760163, | |
| "grad_norm": 0.8705533146858215, | |
| "learning_rate": 0.00019089683811744136, | |
| "loss": 0.4634, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.3642721841760163, | |
| "eval_loss": 0.4068869650363922, | |
| "eval_runtime": 0.5819, | |
| "eval_samples_per_second": 171.863, | |
| "eval_steps_per_second": 22.342, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.4371266210112196, | |
| "grad_norm": 0.45825353264808655, | |
| "learning_rate": 0.0001890754771965613, | |
| "loss": 0.4429, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.4371266210112196, | |
| "eval_loss": 0.395874947309494, | |
| "eval_runtime": 0.5825, | |
| "eval_samples_per_second": 171.678, | |
| "eval_steps_per_second": 22.318, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.5099810578464229, | |
| "grad_norm": 0.5927444100379944, | |
| "learning_rate": 0.0001872541162756812, | |
| "loss": 0.4223, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.5099810578464229, | |
| "eval_loss": 0.36890122294425964, | |
| "eval_runtime": 0.5584, | |
| "eval_samples_per_second": 179.096, | |
| "eval_steps_per_second": 23.282, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.5828354946816261, | |
| "grad_norm": 0.6521668434143066, | |
| "learning_rate": 0.0001854327553548011, | |
| "loss": 0.4066, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.5828354946816261, | |
| "eval_loss": 0.3580659031867981, | |
| "eval_runtime": 0.5849, | |
| "eval_samples_per_second": 170.956, | |
| "eval_steps_per_second": 22.224, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.6556899315168294, | |
| "grad_norm": 0.5500112175941467, | |
| "learning_rate": 0.00018361139443392105, | |
| "loss": 0.3909, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.6556899315168294, | |
| "eval_loss": 0.3474609851837158, | |
| "eval_runtime": 0.569, | |
| "eval_samples_per_second": 175.738, | |
| "eval_steps_per_second": 22.846, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.7285443683520326, | |
| "grad_norm": 0.703709065914154, | |
| "learning_rate": 0.00018179003351304095, | |
| "loss": 0.3806, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.7285443683520326, | |
| "eval_loss": 0.33880433440208435, | |
| "eval_runtime": 0.5577, | |
| "eval_samples_per_second": 179.313, | |
| "eval_steps_per_second": 23.311, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.8013988051872359, | |
| "grad_norm": 0.568647027015686, | |
| "learning_rate": 0.0001799686725921609, | |
| "loss": 0.3731, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.8013988051872359, | |
| "eval_loss": 0.33678069710731506, | |
| "eval_runtime": 0.5788, | |
| "eval_samples_per_second": 172.771, | |
| "eval_steps_per_second": 22.46, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.8742532420224391, | |
| "grad_norm": 0.6542627811431885, | |
| "learning_rate": 0.00017814731167128077, | |
| "loss": 0.3597, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.8742532420224391, | |
| "eval_loss": 0.3265901207923889, | |
| "eval_runtime": 0.5582, | |
| "eval_samples_per_second": 179.158, | |
| "eval_steps_per_second": 23.291, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.9471076788576425, | |
| "grad_norm": 0.6129189729690552, | |
| "learning_rate": 0.0001763259507504007, | |
| "loss": 0.3555, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.9471076788576425, | |
| "eval_loss": 0.3173937499523163, | |
| "eval_runtime": 0.5933, | |
| "eval_samples_per_second": 168.543, | |
| "eval_steps_per_second": 21.911, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.0199621156928458, | |
| "grad_norm": 0.46865177154541016, | |
| "learning_rate": 0.00017450458982952062, | |
| "loss": 0.3471, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.0199621156928458, | |
| "eval_loss": 0.3176809549331665, | |
| "eval_runtime": 0.5602, | |
| "eval_samples_per_second": 178.494, | |
| "eval_steps_per_second": 23.204, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.0928165525280489, | |
| "grad_norm": 0.5402314066886902, | |
| "learning_rate": 0.00017268322890864055, | |
| "loss": 0.3377, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.0928165525280489, | |
| "eval_loss": 0.30362746119499207, | |
| "eval_runtime": 0.5623, | |
| "eval_samples_per_second": 177.838, | |
| "eval_steps_per_second": 23.119, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.1656709893632522, | |
| "grad_norm": 0.4397026300430298, | |
| "learning_rate": 0.00017086186798776046, | |
| "loss": 0.3327, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.1656709893632522, | |
| "eval_loss": 0.2984870970249176, | |
| "eval_runtime": 0.5634, | |
| "eval_samples_per_second": 177.483, | |
| "eval_steps_per_second": 23.073, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.2385254261984555, | |
| "grad_norm": 0.4877306818962097, | |
| "learning_rate": 0.00016904050706688037, | |
| "loss": 0.3257, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.2385254261984555, | |
| "eval_loss": 0.29171615839004517, | |
| "eval_runtime": 0.579, | |
| "eval_samples_per_second": 172.707, | |
| "eval_steps_per_second": 22.452, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.3113798630336588, | |
| "grad_norm": 1.0982270240783691, | |
| "learning_rate": 0.0001672191461460003, | |
| "loss": 0.32, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.3113798630336588, | |
| "eval_loss": 0.2923184633255005, | |
| "eval_runtime": 0.558, | |
| "eval_samples_per_second": 179.22, | |
| "eval_steps_per_second": 23.299, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.384234299868862, | |
| "grad_norm": 0.6584481000900269, | |
| "learning_rate": 0.0001653977852251202, | |
| "loss": 0.3129, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.384234299868862, | |
| "eval_loss": 0.2869529128074646, | |
| "eval_runtime": 0.5651, | |
| "eval_samples_per_second": 176.96, | |
| "eval_steps_per_second": 23.005, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.4570887367040652, | |
| "grad_norm": 0.5571127533912659, | |
| "learning_rate": 0.00016357642430424015, | |
| "loss": 0.3178, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.4570887367040652, | |
| "eval_loss": 0.2796230614185333, | |
| "eval_runtime": 0.5645, | |
| "eval_samples_per_second": 177.159, | |
| "eval_steps_per_second": 23.031, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.5299431735392686, | |
| "grad_norm": 0.33073556423187256, | |
| "learning_rate": 0.00016175506338336006, | |
| "loss": 0.3054, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.5299431735392686, | |
| "eval_loss": 0.27576252818107605, | |
| "eval_runtime": 0.5776, | |
| "eval_samples_per_second": 173.121, | |
| "eval_steps_per_second": 22.506, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.6027976103744717, | |
| "grad_norm": 0.45740246772766113, | |
| "learning_rate": 0.00015993370246247996, | |
| "loss": 0.307, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.6027976103744717, | |
| "eval_loss": 0.27131548523902893, | |
| "eval_runtime": 0.5666, | |
| "eval_samples_per_second": 176.496, | |
| "eval_steps_per_second": 22.945, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.6756520472096752, | |
| "grad_norm": 0.45748448371887207, | |
| "learning_rate": 0.00015811234154159987, | |
| "loss": 0.3015, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.6756520472096752, | |
| "eval_loss": 0.26808932423591614, | |
| "eval_runtime": 0.5676, | |
| "eval_samples_per_second": 176.195, | |
| "eval_steps_per_second": 22.905, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.7485064840448783, | |
| "grad_norm": 0.4469503164291382, | |
| "learning_rate": 0.0001562909806207198, | |
| "loss": 0.301, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.7485064840448783, | |
| "eval_loss": 0.2650498151779175, | |
| "eval_runtime": 0.5572, | |
| "eval_samples_per_second": 179.466, | |
| "eval_steps_per_second": 23.331, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.8213609208800816, | |
| "grad_norm": 0.6140857338905334, | |
| "learning_rate": 0.00015446961969983972, | |
| "loss": 0.2948, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 1.8213609208800816, | |
| "eval_loss": 0.2665635645389557, | |
| "eval_runtime": 0.5602, | |
| "eval_samples_per_second": 178.493, | |
| "eval_steps_per_second": 23.204, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 1.894215357715285, | |
| "grad_norm": 0.4431038200855255, | |
| "learning_rate": 0.00015264825877895965, | |
| "loss": 0.2929, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.894215357715285, | |
| "eval_loss": 0.26208823919296265, | |
| "eval_runtime": 0.562, | |
| "eval_samples_per_second": 177.925, | |
| "eval_steps_per_second": 23.13, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.967069794550488, | |
| "grad_norm": 0.5034199953079224, | |
| "learning_rate": 0.0001508268978580796, | |
| "loss": 0.2844, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 1.967069794550488, | |
| "eval_loss": 0.2574382722377777, | |
| "eval_runtime": 0.5819, | |
| "eval_samples_per_second": 171.837, | |
| "eval_steps_per_second": 22.339, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 2.0399242313856916, | |
| "grad_norm": 0.5565065741539001, | |
| "learning_rate": 0.00014900553693719947, | |
| "loss": 0.2807, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.0399242313856916, | |
| "eval_loss": 0.25828686356544495, | |
| "eval_runtime": 0.5827, | |
| "eval_samples_per_second": 171.603, | |
| "eval_steps_per_second": 22.308, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.1127786682208947, | |
| "grad_norm": 0.5862753987312317, | |
| "learning_rate": 0.0001471841760163194, | |
| "loss": 0.2806, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 2.1127786682208947, | |
| "eval_loss": 0.2531309425830841, | |
| "eval_runtime": 0.5905, | |
| "eval_samples_per_second": 169.362, | |
| "eval_steps_per_second": 22.017, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 2.1856331050560978, | |
| "grad_norm": 0.7702651619911194, | |
| "learning_rate": 0.0001453628150954393, | |
| "loss": 0.2776, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 2.1856331050560978, | |
| "eval_loss": 0.24827995896339417, | |
| "eval_runtime": 0.5626, | |
| "eval_samples_per_second": 177.742, | |
| "eval_steps_per_second": 23.106, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 2.2584875418913013, | |
| "grad_norm": 0.4496975541114807, | |
| "learning_rate": 0.00014354145417455925, | |
| "loss": 0.2729, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 2.2584875418913013, | |
| "eval_loss": 0.24621199071407318, | |
| "eval_runtime": 0.5772, | |
| "eval_samples_per_second": 173.237, | |
| "eval_steps_per_second": 22.521, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 2.3313419787265044, | |
| "grad_norm": 0.5896193981170654, | |
| "learning_rate": 0.00014172009325367916, | |
| "loss": 0.2718, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.3313419787265044, | |
| "eval_loss": 0.24523521959781647, | |
| "eval_runtime": 0.5819, | |
| "eval_samples_per_second": 171.859, | |
| "eval_steps_per_second": 22.342, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.4041964155617075, | |
| "grad_norm": 0.411600798368454, | |
| "learning_rate": 0.00013989873233279907, | |
| "loss": 0.2718, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 2.4041964155617075, | |
| "eval_loss": 0.24163128435611725, | |
| "eval_runtime": 0.5597, | |
| "eval_samples_per_second": 178.657, | |
| "eval_steps_per_second": 23.225, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 2.477050852396911, | |
| "grad_norm": 0.5009840130805969, | |
| "learning_rate": 0.00013807737141191897, | |
| "loss": 0.2652, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 2.477050852396911, | |
| "eval_loss": 0.24147550761699677, | |
| "eval_runtime": 0.5809, | |
| "eval_samples_per_second": 172.154, | |
| "eval_steps_per_second": 22.38, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 2.549905289232114, | |
| "grad_norm": 0.5353007912635803, | |
| "learning_rate": 0.0001362560104910389, | |
| "loss": 0.2628, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 2.549905289232114, | |
| "eval_loss": 0.2369464635848999, | |
| "eval_runtime": 0.5912, | |
| "eval_samples_per_second": 169.158, | |
| "eval_steps_per_second": 21.991, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 2.6227597260673177, | |
| "grad_norm": 0.4573606848716736, | |
| "learning_rate": 0.00013443464957015885, | |
| "loss": 0.2609, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 2.6227597260673177, | |
| "eval_loss": 0.23909151554107666, | |
| "eval_runtime": 0.5596, | |
| "eval_samples_per_second": 178.696, | |
| "eval_steps_per_second": 23.23, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 2.6956141629025208, | |
| "grad_norm": 0.4674642086029053, | |
| "learning_rate": 0.00013261328864927875, | |
| "loss": 0.257, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 2.6956141629025208, | |
| "eval_loss": 0.2337809056043625, | |
| "eval_runtime": 0.5605, | |
| "eval_samples_per_second": 178.402, | |
| "eval_steps_per_second": 23.192, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 2.768468599737724, | |
| "grad_norm": 0.43507474660873413, | |
| "learning_rate": 0.00013079192772839866, | |
| "loss": 0.2566, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 2.768468599737724, | |
| "eval_loss": 0.2308950424194336, | |
| "eval_runtime": 0.561, | |
| "eval_samples_per_second": 178.259, | |
| "eval_steps_per_second": 23.174, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 2.8413230365729274, | |
| "grad_norm": 0.4606495797634125, | |
| "learning_rate": 0.00012897056680751857, | |
| "loss": 0.2607, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 2.8413230365729274, | |
| "eval_loss": 0.23102878034114838, | |
| "eval_runtime": 0.5745, | |
| "eval_samples_per_second": 174.057, | |
| "eval_steps_per_second": 22.627, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 2.9141774734081305, | |
| "grad_norm": 0.686039388179779, | |
| "learning_rate": 0.0001271492058866385, | |
| "loss": 0.2562, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.9141774734081305, | |
| "eval_loss": 0.22536581754684448, | |
| "eval_runtime": 0.5619, | |
| "eval_samples_per_second": 177.974, | |
| "eval_steps_per_second": 23.137, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.987031910243334, | |
| "grad_norm": 0.48106732964515686, | |
| "learning_rate": 0.00012532784496575841, | |
| "loss": 0.2524, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 2.987031910243334, | |
| "eval_loss": 0.22328069806098938, | |
| "eval_runtime": 0.562, | |
| "eval_samples_per_second": 177.946, | |
| "eval_steps_per_second": 23.133, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 3.059886347078537, | |
| "grad_norm": 0.44265252351760864, | |
| "learning_rate": 0.00012350648404487835, | |
| "loss": 0.2469, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 3.059886347078537, | |
| "eval_loss": 0.22362683713436127, | |
| "eval_runtime": 0.5835, | |
| "eval_samples_per_second": 171.391, | |
| "eval_steps_per_second": 22.281, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 3.1327407839137402, | |
| "grad_norm": 0.4698319435119629, | |
| "learning_rate": 0.00012168512312399827, | |
| "loss": 0.2521, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 3.1327407839137402, | |
| "eval_loss": 0.22086407244205475, | |
| "eval_runtime": 0.5707, | |
| "eval_samples_per_second": 175.226, | |
| "eval_steps_per_second": 22.779, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 3.2055952207489438, | |
| "grad_norm": 0.4953310191631317, | |
| "learning_rate": 0.00011986376220311817, | |
| "loss": 0.2478, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 3.2055952207489438, | |
| "eval_loss": 0.22542959451675415, | |
| "eval_runtime": 0.5593, | |
| "eval_samples_per_second": 178.801, | |
| "eval_steps_per_second": 23.244, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 3.278449657584147, | |
| "grad_norm": 0.921518087387085, | |
| "learning_rate": 0.00011804240128223809, | |
| "loss": 0.2447, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 3.278449657584147, | |
| "eval_loss": 0.22003282606601715, | |
| "eval_runtime": 0.5616, | |
| "eval_samples_per_second": 178.07, | |
| "eval_steps_per_second": 23.149, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 3.35130409441935, | |
| "grad_norm": 0.5857972502708435, | |
| "learning_rate": 0.00011622104036135801, | |
| "loss": 0.2432, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 3.35130409441935, | |
| "eval_loss": 0.21950890123844147, | |
| "eval_runtime": 0.5587, | |
| "eval_samples_per_second": 178.975, | |
| "eval_steps_per_second": 23.267, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 3.4241585312545535, | |
| "grad_norm": 0.5046322345733643, | |
| "learning_rate": 0.00011439967944047793, | |
| "loss": 0.2394, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 3.4241585312545535, | |
| "eval_loss": 0.21759502589702606, | |
| "eval_runtime": 0.5757, | |
| "eval_samples_per_second": 173.711, | |
| "eval_steps_per_second": 22.582, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 3.4970129680897566, | |
| "grad_norm": 0.6441205739974976, | |
| "learning_rate": 0.00011257831851959786, | |
| "loss": 0.2371, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 3.4970129680897566, | |
| "eval_loss": 0.21436944603919983, | |
| "eval_runtime": 0.5779, | |
| "eval_samples_per_second": 173.027, | |
| "eval_steps_per_second": 22.494, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 3.5698674049249597, | |
| "grad_norm": 0.5626524090766907, | |
| "learning_rate": 0.00011075695759871776, | |
| "loss": 0.2391, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 3.5698674049249597, | |
| "eval_loss": 0.21467125415802002, | |
| "eval_runtime": 0.5829, | |
| "eval_samples_per_second": 171.542, | |
| "eval_steps_per_second": 22.3, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 3.6427218417601632, | |
| "grad_norm": 0.6218989491462708, | |
| "learning_rate": 0.00010893559667783769, | |
| "loss": 0.235, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 3.6427218417601632, | |
| "eval_loss": 0.21365521848201752, | |
| "eval_runtime": 0.5617, | |
| "eval_samples_per_second": 178.033, | |
| "eval_steps_per_second": 23.144, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 3.7155762785953663, | |
| "grad_norm": 0.39173460006713867, | |
| "learning_rate": 0.00010711423575695761, | |
| "loss": 0.2364, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 3.7155762785953663, | |
| "eval_loss": 0.21131116151809692, | |
| "eval_runtime": 0.5528, | |
| "eval_samples_per_second": 180.905, | |
| "eval_steps_per_second": 23.518, | |
| "step": 25500 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 54904, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 8, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 3, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6902431875072000.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |