{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.3297872340425532,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013297872340425532,
      "grad_norm": 0.00017850878066383302,
      "learning_rate": 2.9999069195872345e-05,
      "loss": 1.5526,
      "num_input_tokens_seen": 22912,
      "step": 5,
      "train_runtime": 15.9018,
      "train_tokens_per_second": 1440.846
    },
    {
      "epoch": 0.026595744680851064,
      "grad_norm": 0.00022191159951034933,
      "learning_rate": 2.9995288002087968e-05,
      "loss": 0.1475,
      "num_input_tokens_seen": 47104,
      "step": 10,
      "train_runtime": 23.1749,
      "train_tokens_per_second": 2032.547
    },
    {
      "epoch": 0.0398936170212766,
      "grad_norm": 0.00020294415298849344,
      "learning_rate": 2.9988598976060308e-05,
      "loss": 0.153,
      "num_input_tokens_seen": 73920,
      "step": 15,
      "train_runtime": 31.9246,
      "train_tokens_per_second": 2315.457
    },
    {
      "epoch": 0.05319148936170213,
      "grad_norm": 7.007523527136073e-05,
      "learning_rate": 2.9979003414901197e-05,
      "loss": 0.1529,
      "num_input_tokens_seen": 99360,
      "step": 20,
      "train_runtime": 39.4019,
      "train_tokens_per_second": 2521.709
    },
    {
      "epoch": 0.06648936170212766,
      "grad_norm": 0.00010616348299663514,
      "learning_rate": 2.99665031793473e-05,
      "loss": 0.1295,
      "num_input_tokens_seen": 124192,
      "step": 25,
      "train_runtime": 46.8336,
      "train_tokens_per_second": 2651.77
    },
    {
      "epoch": 0.0797872340425532,
      "grad_norm": 5.6807843066053465e-05,
      "learning_rate": 2.995110069339927e-05,
      "loss": 0.1431,
      "num_input_tokens_seen": 151456,
      "step": 30,
      "train_runtime": 54.8766,
      "train_tokens_per_second": 2759.94
    },
    {
      "epoch": 0.09308510638297872,
      "grad_norm": 8.653431723359972e-05,
      "learning_rate": 2.993279894385171e-05,
      "loss": 0.1003,
      "num_input_tokens_seen": 177344,
      "step": 35,
      "train_runtime": 62.5606,
      "train_tokens_per_second": 2834.754
    },
    {
      "epoch": 0.10638297872340426,
      "grad_norm": 4.548930155579001e-05,
      "learning_rate": 2.9911601479713985e-05,
      "loss": 0.1126,
      "num_input_tokens_seen": 205952,
      "step": 40,
      "train_runtime": 70.8371,
      "train_tokens_per_second": 2907.403
    },
    {
      "epoch": 0.1196808510638298,
      "grad_norm": 0.000141258497023955,
      "learning_rate": 2.988751241152199e-05,
      "loss": 0.1204,
      "num_input_tokens_seen": 237920,
      "step": 45,
      "train_runtime": 79.888,
      "train_tokens_per_second": 2978.17
    },
    {
      "epoch": 0.13297872340425532,
      "grad_norm": 4.336608981247991e-05,
      "learning_rate": 2.9860536410541076e-05,
      "loss": 0.069,
      "num_input_tokens_seen": 264128,
      "step": 50,
      "train_runtime": 87.681,
      "train_tokens_per_second": 3012.373
    },
    {
      "epoch": 0.14627659574468085,
      "grad_norm": 7.17395669198595e-05,
      "learning_rate": 2.983067870786019e-05,
      "loss": 0.0447,
      "num_input_tokens_seen": 288896,
      "step": 55,
      "train_runtime": 95.0758,
      "train_tokens_per_second": 3038.587
    },
    {
      "epoch": 0.1595744680851064,
      "grad_norm": 4.3858930439455435e-05,
      "learning_rate": 2.9797945093377513e-05,
      "loss": 0.07,
      "num_input_tokens_seen": 311680,
      "step": 60,
      "train_runtime": 102.0665,
      "train_tokens_per_second": 3053.696
    },
    {
      "epoch": 0.17287234042553193,
      "grad_norm": 8.529757906217128e-05,
      "learning_rate": 2.976234191467767e-05,
      "loss": 0.0789,
      "num_input_tokens_seen": 334976,
      "step": 65,
      "train_runtime": 109.0924,
      "train_tokens_per_second": 3070.572
    },
    {
      "epoch": 0.18617021276595744,
      "grad_norm": 6.885492621222511e-05,
      "learning_rate": 2.9723876075800846e-05,
      "loss": 0.083,
      "num_input_tokens_seen": 360480,
      "step": 70,
      "train_runtime": 116.6544,
      "train_tokens_per_second": 3090.154
    },
    {
      "epoch": 0.19946808510638298,
      "grad_norm": 7.440832268912345e-05,
      "learning_rate": 2.968255503590398e-05,
      "loss": 0.0511,
      "num_input_tokens_seen": 384768,
      "step": 75,
      "train_runtime": 123.9537,
      "train_tokens_per_second": 3104.126
    },
    {
      "epoch": 0.2127659574468085,
      "grad_norm": 7.101365190465003e-05,
      "learning_rate": 2.963838680781431e-05,
      "loss": 0.0788,
      "num_input_tokens_seen": 410304,
      "step": 80,
      "train_runtime": 131.6092,
      "train_tokens_per_second": 3117.593
    },
    {
      "epoch": 0.22606382978723405,
      "grad_norm": 1.87977020686958e-05,
      "learning_rate": 2.959137995647556e-05,
      "loss": 0.0467,
      "num_input_tokens_seen": 437888,
      "step": 85,
      "train_runtime": 139.7728,
      "train_tokens_per_second": 3132.856
    },
    {
      "epoch": 0.2393617021276596,
      "grad_norm": 0.0001294072571909055,
      "learning_rate": 2.9541543597287034e-05,
      "loss": 0.053,
      "num_input_tokens_seen": 462976,
      "step": 90,
      "train_runtime": 147.3061,
      "train_tokens_per_second": 3142.952
    },
    {
      "epoch": 0.2526595744680851,
      "grad_norm": 9.718466753838584e-05,
      "learning_rate": 2.9488887394336025e-05,
      "loss": 0.0345,
      "num_input_tokens_seen": 485280,
      "step": 95,
      "train_runtime": 154.2543,
      "train_tokens_per_second": 3145.973
    },
    {
      "epoch": 0.26595744680851063,
      "grad_norm": 0.00011859676305903122,
      "learning_rate": 2.9433421558523767e-05,
      "loss": 0.0716,
      "num_input_tokens_seen": 509856,
      "step": 100,
      "train_runtime": 161.6729,
      "train_tokens_per_second": 3153.627
    },
    {
      "epoch": 0.27925531914893614,
      "grad_norm": 6.369210314005613e-05,
      "learning_rate": 2.9375156845585374e-05,
      "loss": 0.0562,
      "num_input_tokens_seen": 535264,
      "step": 105,
      "train_runtime": 170.0802,
      "train_tokens_per_second": 3147.127
    },
    {
      "epoch": 0.2925531914893617,
      "grad_norm": 4.727758641820401e-05,
      "learning_rate": 2.9314104554004137e-05,
      "loss": 0.0371,
      "num_input_tokens_seen": 562912,
      "step": 110,
      "train_runtime": 178.2035,
      "train_tokens_per_second": 3158.815
    },
    {
      "epoch": 0.3058510638297872,
      "grad_norm": 0.00010592794569674879,
      "learning_rate": 2.925027652282056e-05,
      "loss": 0.0586,
      "num_input_tokens_seen": 585280,
      "step": 115,
      "train_runtime": 185.0929,
      "train_tokens_per_second": 3162.088
    },
    {
      "epoch": 0.3191489361702128,
      "grad_norm": 2.9270680897752754e-05,
      "learning_rate": 2.918368512933657e-05,
      "loss": 0.0633,
      "num_input_tokens_seen": 612224,
      "step": 120,
      "train_runtime": 192.9777,
      "train_tokens_per_second": 3172.512
    },
    {
      "epoch": 0.3324468085106383,
      "grad_norm": 0.00010040538472821936,
      "learning_rate": 2.911434328671536e-05,
      "loss": 0.0751,
      "num_input_tokens_seen": 639264,
      "step": 125,
      "train_runtime": 201.0075,
      "train_tokens_per_second": 3180.299
    },
    {
      "epoch": 0.34574468085106386,
      "grad_norm": 0.0001042517542373389,
      "learning_rate": 2.904226444147732e-05,
      "loss": 0.0677,
      "num_input_tokens_seen": 665280,
      "step": 130,
      "train_runtime": 208.7729,
      "train_tokens_per_second": 3186.621
    },
    {
      "epoch": 0.35904255319148937,
      "grad_norm": 7.185106369433925e-05,
      "learning_rate": 2.896746257089251e-05,
      "loss": 0.0587,
      "num_input_tokens_seen": 689216,
      "step": 135,
      "train_runtime": 216.0407,
      "train_tokens_per_second": 3190.214
    },
    {
      "epoch": 0.3723404255319149,
      "grad_norm": 5.872031761100516e-05,
      "learning_rate": 2.8889952180270287e-05,
      "loss": 0.0605,
      "num_input_tokens_seen": 714880,
      "step": 140,
      "train_runtime": 223.7009,
      "train_tokens_per_second": 3195.695
    },
    {
      "epoch": 0.38563829787234044,
      "grad_norm": 2.933590076281689e-05,
      "learning_rate": 2.880974830014643e-05,
      "loss": 0.1054,
      "num_input_tokens_seen": 739904,
      "step": 145,
      "train_runtime": 231.1836,
      "train_tokens_per_second": 3200.504
    },
    {
      "epoch": 0.39893617021276595,
      "grad_norm": 0.00012435043754521757,
      "learning_rate": 2.872686648336853e-05,
      "loss": 0.0479,
      "num_input_tokens_seen": 765824,
      "step": 150,
      "train_runtime": 238.8742,
      "train_tokens_per_second": 3205.972
    },
    {
      "epoch": 0.4122340425531915,
      "grad_norm": 8.882827387424186e-05,
      "learning_rate": 2.8641322802079984e-05,
      "loss": 0.0508,
      "num_input_tokens_seen": 797952,
      "step": 155,
      "train_runtime": 248.022,
      "train_tokens_per_second": 3217.263
    },
    {
      "epoch": 0.425531914893617,
      "grad_norm": 9.789071918930858e-05,
      "learning_rate": 2.8553133844603382e-05,
      "loss": 0.0399,
      "num_input_tokens_seen": 823264,
      "step": 160,
      "train_runtime": 255.6112,
      "train_tokens_per_second": 3220.766
    },
    {
      "epoch": 0.43882978723404253,
      "grad_norm": 4.716894181910902e-05,
      "learning_rate": 2.846231671222374e-05,
      "loss": 0.062,
      "num_input_tokens_seen": 849216,
      "step": 165,
      "train_runtime": 263.3712,
      "train_tokens_per_second": 3224.408
    },
    {
      "epoch": 0.4521276595744681,
      "grad_norm": 8.95522753125988e-05,
      "learning_rate": 2.836888901587229e-05,
      "loss": 0.1292,
      "num_input_tokens_seen": 874208,
      "step": 170,
      "train_runtime": 270.894,
      "train_tokens_per_second": 3227.122
    },
    {
      "epoch": 0.4654255319148936,
      "grad_norm": 3.6886351153953e-05,
      "learning_rate": 2.827286887271143e-05,
      "loss": 0.0558,
      "num_input_tokens_seen": 898624,
      "step": 175,
      "train_runtime": 278.2599,
      "train_tokens_per_second": 3229.441
    },
    {
      "epoch": 0.4787234042553192,
      "grad_norm": 7.180378452176228e-05,
      "learning_rate": 2.8174274902621495e-05,
      "loss": 0.0506,
      "num_input_tokens_seen": 921728,
      "step": 180,
      "train_runtime": 285.3501,
      "train_tokens_per_second": 3230.166
    },
    {
      "epoch": 0.4920212765957447,
      "grad_norm": 2.529071207391098e-05,
      "learning_rate": 2.8073126224590073e-05,
      "loss": 0.0713,
      "num_input_tokens_seen": 948160,
      "step": 185,
      "train_runtime": 293.1898,
      "train_tokens_per_second": 3233.946
    },
    {
      "epoch": 0.5053191489361702,
      "grad_norm": 2.971558387798723e-05,
      "learning_rate": 2.7969442453004525e-05,
      "loss": 0.0423,
      "num_input_tokens_seen": 974688,
      "step": 190,
      "train_runtime": 301.0219,
      "train_tokens_per_second": 3237.93
    },
    {
      "epoch": 0.5186170212765957,
      "grad_norm": 1.3908083019487094e-05,
      "learning_rate": 2.786324369384841e-05,
      "loss": 0.0376,
      "num_input_tokens_seen": 999232,
      "step": 195,
      "train_runtime": 308.373,
      "train_tokens_per_second": 3240.336
    },
    {
      "epoch": 0.5319148936170213,
      "grad_norm": 8.287108357762918e-05,
      "learning_rate": 2.7754550540802632e-05,
      "loss": 0.0505,
      "num_input_tokens_seen": 1024352,
      "step": 200,
      "train_runtime": 315.9074,
      "train_tokens_per_second": 3242.57
    },
    {
      "epoch": 0.5452127659574468,
      "grad_norm": 7.783296314300969e-05,
      "learning_rate": 2.7643384071251957e-05,
      "loss": 0.0347,
      "num_input_tokens_seen": 1049088,
      "step": 205,
      "train_runtime": 324.1076,
      "train_tokens_per_second": 3236.851
    },
    {
      "epoch": 0.5585106382978723,
      "grad_norm": 0.0001195428121718578,
      "learning_rate": 2.7529765842197798e-05,
      "loss": 0.0386,
      "num_input_tokens_seen": 1073024,
      "step": 210,
      "train_runtime": 331.3284,
      "train_tokens_per_second": 3238.552
    },
    {
      "epoch": 0.5718085106382979,
      "grad_norm": 4.606168658938259e-05,
      "learning_rate": 2.741371788607793e-05,
      "loss": 0.0616,
      "num_input_tokens_seen": 1098880,
      "step": 215,
      "train_runtime": 339.0001,
      "train_tokens_per_second": 3241.533
    },
    {
      "epoch": 0.5851063829787234,
      "grad_norm": 0.00013229926116764545,
      "learning_rate": 2.729526270649405e-05,
      "loss": 0.0821,
      "num_input_tokens_seen": 1127328,
      "step": 220,
      "train_runtime": 347.2586,
      "train_tokens_per_second": 3246.364
    },
    {
      "epoch": 0.598404255319149,
      "grad_norm": 8.632720710011199e-05,
      "learning_rate": 2.7174423273847966e-05,
      "loss": 0.0685,
      "num_input_tokens_seen": 1151584,
      "step": 225,
      "train_runtime": 354.5073,
      "train_tokens_per_second": 3248.407
    },
    {
      "epoch": 0.6117021276595744,
      "grad_norm": 4.496889596339315e-05,
      "learning_rate": 2.705122302088725e-05,
      "loss": 0.0667,
      "num_input_tokens_seen": 1180544,
      "step": 230,
      "train_runtime": 363.022,
      "train_tokens_per_second": 3251.991
    },
    {
      "epoch": 0.625,
      "grad_norm": 1.9521097783581354e-05,
      "learning_rate": 2.6925685838161247e-05,
      "loss": 0.035,
      "num_input_tokens_seen": 1206080,
      "step": 235,
      "train_runtime": 370.6153,
      "train_tokens_per_second": 3254.264
    },
    {
      "epoch": 0.6382978723404256,
      "grad_norm": 4.637776146410033e-05,
      "learning_rate": 2.67978360693883e-05,
      "loss": 0.0604,
      "num_input_tokens_seen": 1230304,
      "step": 240,
      "train_runtime": 377.9559,
      "train_tokens_per_second": 3255.153
    },
    {
      "epoch": 0.651595744680851,
      "grad_norm": 3.3805175917223096e-05,
      "learning_rate": 2.6667698506735113e-05,
      "loss": 0.0556,
      "num_input_tokens_seen": 1256640,
      "step": 245,
      "train_runtime": 385.7509,
      "train_tokens_per_second": 3257.646
    },
    {
      "epoch": 0.6648936170212766,
      "grad_norm": 0.00010089632996823639,
      "learning_rate": 2.6535298386009144e-05,
      "loss": 0.0487,
      "num_input_tokens_seen": 1280064,
      "step": 250,
      "train_runtime": 392.8672,
      "train_tokens_per_second": 3258.262
    },
    {
      "epoch": 0.6781914893617021,
      "grad_norm": 3.6058525438420475e-05,
      "learning_rate": 2.6400661381764962e-05,
      "loss": 0.0702,
      "num_input_tokens_seen": 1305984,
      "step": 255,
      "train_runtime": 400.5999,
      "train_tokens_per_second": 3260.071
    },
    {
      "epoch": 0.6914893617021277,
      "grad_norm": 1.9650842659757473e-05,
      "learning_rate": 2.6263813602325525e-05,
      "loss": 0.0422,
      "num_input_tokens_seen": 1333088,
      "step": 260,
      "train_runtime": 408.608,
      "train_tokens_per_second": 3262.511
    },
    {
      "epoch": 0.7047872340425532,
      "grad_norm": 2.503952600818593e-05,
      "learning_rate": 2.6124781584719365e-05,
      "loss": 0.0674,
      "num_input_tokens_seen": 1357728,
      "step": 265,
      "train_runtime": 416.0446,
      "train_tokens_per_second": 3263.419
    },
    {
      "epoch": 0.7180851063829787,
      "grad_norm": 3.540120815159753e-05,
      "learning_rate": 2.5983592289534602e-05,
      "loss": 0.0446,
      "num_input_tokens_seen": 1383104,
      "step": 270,
      "train_runtime": 423.6735,
      "train_tokens_per_second": 3264.552
    },
    {
      "epoch": 0.7313829787234043,
      "grad_norm": 5.61477463634219e-05,
      "learning_rate": 2.584027309569086e-05,
      "loss": 0.0382,
      "num_input_tokens_seen": 1408096,
      "step": 275,
      "train_runtime": 431.1736,
      "train_tokens_per_second": 3265.729
    },
    {
      "epoch": 0.7446808510638298,
      "grad_norm": 1.1481101864774246e-05,
      "learning_rate": 2.5694851795130044e-05,
      "loss": 0.0189,
      "num_input_tokens_seen": 1434048,
      "step": 280,
      "train_runtime": 438.8402,
      "train_tokens_per_second": 3267.813
    },
    {
      "epoch": 0.7579787234042553,
      "grad_norm": 0.0001053257001331076,
      "learning_rate": 2.5547356587427017e-05,
      "loss": 0.0246,
      "num_input_tokens_seen": 1457856,
      "step": 285,
      "train_runtime": 446.036,
      "train_tokens_per_second": 3268.471
    },
    {
      "epoch": 0.7712765957446809,
      "grad_norm": 5.1625109335873276e-05,
      "learning_rate": 2.539781607432125e-05,
      "loss": 0.0624,
      "num_input_tokens_seen": 1481120,
      "step": 290,
      "train_runtime": 453.1392,
      "train_tokens_per_second": 3268.576
    },
    {
      "epoch": 0.7845744680851063,
      "grad_norm": 6.952533112780657e-06,
      "learning_rate": 2.5246259254170464e-05,
      "loss": 0.0346,
      "num_input_tokens_seen": 1506176,
      "step": 295,
      "train_runtime": 460.6884,
      "train_tokens_per_second": 3269.403
    },
    {
      "epoch": 0.7978723404255319,
      "grad_norm": 9.527090878691524e-05,
      "learning_rate": 2.5092715516327384e-05,
      "loss": 0.075,
      "num_input_tokens_seen": 1529824,
      "step": 300,
      "train_runtime": 467.9003,
      "train_tokens_per_second": 3269.551
    },
    {
      "epoch": 0.8111702127659575,
      "grad_norm": 2.904631219280418e-05,
      "learning_rate": 2.4937214635440665e-05,
      "loss": 0.0361,
      "num_input_tokens_seen": 1552384,
      "step": 305,
      "train_runtime": 475.6103,
      "train_tokens_per_second": 3263.983
    },
    {
      "epoch": 0.824468085106383,
      "grad_norm": 4.446757884579711e-05,
      "learning_rate": 2.4779786765681082e-05,
      "loss": 0.0367,
      "num_input_tokens_seen": 1579072,
      "step": 310,
      "train_runtime": 483.4588,
      "train_tokens_per_second": 3266.198
    },
    {
      "epoch": 0.8377659574468085,
      "grad_norm": 6.499775918200612e-05,
      "learning_rate": 2.4620462434894158e-05,
      "loss": 0.0503,
      "num_input_tokens_seen": 1603744,
      "step": 315,
      "train_runtime": 490.8348,
      "train_tokens_per_second": 3267.381
    },
    {
      "epoch": 0.851063829787234,
      "grad_norm": 2.785153810691554e-05,
      "learning_rate": 2.4459272538680308e-05,
      "loss": 0.0371,
      "num_input_tokens_seen": 1627712,
      "step": 320,
      "train_runtime": 498.0766,
      "train_tokens_per_second": 3267.995
    },
    {
      "epoch": 0.8643617021276596,
      "grad_norm": 6.219661008799449e-05,
      "learning_rate": 2.4296248334403672e-05,
      "loss": 0.0635,
      "num_input_tokens_seen": 1653600,
      "step": 325,
      "train_runtime": 505.7239,
      "train_tokens_per_second": 3269.768
    },
    {
      "epoch": 0.8776595744680851,
      "grad_norm": 4.4950455048820004e-05,
      "learning_rate": 2.413142143513081e-05,
      "loss": 0.0597,
      "num_input_tokens_seen": 1676928,
      "step": 330,
      "train_runtime": 512.8025,
      "train_tokens_per_second": 3270.125
    },
    {
      "epoch": 0.8909574468085106,
      "grad_norm": 3.27678098983597e-05,
      "learning_rate": 2.3964823803500395e-05,
      "loss": 0.052,
      "num_input_tokens_seen": 1707808,
      "step": 335,
      "train_runtime": 521.6471,
      "train_tokens_per_second": 3273.876
    },
    {
      "epoch": 0.9042553191489362,
      "grad_norm": 4.62313364550937e-05,
      "learning_rate": 2.3796487745525145e-05,
      "loss": 0.048,
      "num_input_tokens_seen": 1732576,
      "step": 340,
      "train_runtime": 529.0667,
      "train_tokens_per_second": 3274.778
    },
    {
      "epoch": 0.9175531914893617,
      "grad_norm": 3.923915573977865e-05,
      "learning_rate": 2.3626445904327155e-05,
      "loss": 0.0205,
      "num_input_tokens_seen": 1758016,
      "step": 345,
      "train_runtime": 536.6618,
      "train_tokens_per_second": 3275.836
    },
    {
      "epoch": 0.9308510638297872,
      "grad_norm": 6.78059086567373e-06,
      "learning_rate": 2.3454731253807862e-05,
      "loss": 0.0232,
      "num_input_tokens_seen": 1783872,
      "step": 350,
      "train_runtime": 544.3484,
      "train_tokens_per_second": 3277.078
    },
    {
      "epoch": 0.9441489361702128,
      "grad_norm": 2.773117921606172e-05,
      "learning_rate": 2.328137709225385e-05,
      "loss": 0.0152,
      "num_input_tokens_seen": 1807008,
      "step": 355,
      "train_runtime": 551.3669,
      "train_tokens_per_second": 3277.324
    },
    {
      "epoch": 0.9574468085106383,
      "grad_norm": 9.006850450532511e-05,
      "learning_rate": 2.3106417035879797e-05,
      "loss": 0.0517,
      "num_input_tokens_seen": 1834048,
      "step": 360,
      "train_runtime": 559.3162,
      "train_tokens_per_second": 3279.089
    },
    {
      "epoch": 0.9707446808510638,
      "grad_norm": 5.6452212447766215e-05,
      "learning_rate": 2.2929885012309697e-05,
      "loss": 0.0419,
      "num_input_tokens_seen": 1861728,
      "step": 365,
      "train_runtime": 567.4458,
      "train_tokens_per_second": 3280.891
    },
    {
      "epoch": 0.9840425531914894,
      "grad_norm": 2.901201980876067e-07,
      "learning_rate": 2.2751815253997783e-05,
      "loss": 0.0186,
      "num_input_tokens_seen": 1885376,
      "step": 370,
      "train_runtime": 574.6516,
      "train_tokens_per_second": 3280.903
    },
    {
      "epoch": 0.9973404255319149,
      "grad_norm": 1.5018988506199094e-06,
      "learning_rate": 2.2572242291590264e-05,
      "loss": 0.0293,
      "num_input_tokens_seen": 1908128,
      "step": 375,
      "train_runtime": 581.6252,
      "train_tokens_per_second": 3280.683
    },
    {
      "epoch": 1.0106382978723405,
      "grad_norm": 0.00014393814490176737,
      "learning_rate": 2.239120094722926e-05,
      "loss": 0.2291,
      "num_input_tokens_seen": 1932528,
      "step": 380,
      "train_runtime": 589.9529,
      "train_tokens_per_second": 3275.733
    },
    {
      "epoch": 1.023936170212766,
      "grad_norm": 0.00014409016876015812,
      "learning_rate": 2.2208726327800257e-05,
      "loss": 1.028,
      "num_input_tokens_seen": 1957648,
      "step": 385,
      "train_runtime": 597.4479,
      "train_tokens_per_second": 3276.684
    },
    {
      "epoch": 1.0372340425531914,
      "grad_norm": 0.00011712688865372911,
      "learning_rate": 2.202485381812426e-05,
      "loss": 1.012,
      "num_input_tokens_seen": 1985392,
      "step": 390,
      "train_runtime": 605.572,
      "train_tokens_per_second": 3278.54
    },
    {
      "epoch": 1.050531914893617,
      "grad_norm": 9.847906039794907e-05,
      "learning_rate": 2.1839619074096117e-05,
      "loss": 1.1113,
      "num_input_tokens_seen": 2014320,
      "step": 395,
      "train_runtime": 614.0367,
      "train_tokens_per_second": 3280.455
    },
    {
      "epoch": 1.0638297872340425,
      "grad_norm": 0.00011392939632060006,
      "learning_rate": 2.1653058015770262e-05,
      "loss": 1.0173,
      "num_input_tokens_seen": 2041328,
      "step": 400,
      "train_runtime": 622.0201,
      "train_tokens_per_second": 3281.772
    },
    {
      "epoch": 1.077127659574468,
      "grad_norm": 8.545993478037417e-05,
      "learning_rate": 2.146520682039522e-05,
      "loss": 0.7919,
      "num_input_tokens_seen": 2068848,
      "step": 405,
      "train_runtime": 630.9237,
      "train_tokens_per_second": 3279.078
    },
    {
      "epoch": 1.0904255319148937,
      "grad_norm": 8.802056254353374e-05,
      "learning_rate": 2.127610191539825e-05,
      "loss": 0.696,
      "num_input_tokens_seen": 2094352,
      "step": 410,
      "train_runtime": 638.5178,
      "train_tokens_per_second": 3280.022
    },
    {
      "epoch": 1.1037234042553192,
      "grad_norm": 6.343067070702091e-05,
      "learning_rate": 2.1085779971321456e-05,
      "loss": 0.5359,
      "num_input_tokens_seen": 2120592,
      "step": 415,
      "train_runtime": 646.3244,
      "train_tokens_per_second": 3281.003
    },
    {
      "epoch": 1.1170212765957448,
      "grad_norm": 0.0001089554643840529,
      "learning_rate": 2.089427789471078e-05,
      "loss": 0.4819,
      "num_input_tokens_seen": 2147376,
      "step": 420,
      "train_runtime": 654.2112,
      "train_tokens_per_second": 3282.39
    },
    {
      "epoch": 1.1303191489361701,
      "grad_norm": 5.21246729476843e-05,
      "learning_rate": 2.0701632820959223e-05,
      "loss": 0.3732,
      "num_input_tokens_seen": 2170992,
      "step": 425,
      "train_runtime": 661.38,
      "train_tokens_per_second": 3282.518
    },
    {
      "epoch": 1.1436170212765957,
      "grad_norm": 6.755034701200202e-05,
      "learning_rate": 2.0507882107105664e-05,
      "loss": 0.3435,
      "num_input_tokens_seen": 2199216,
      "step": 430,
      "train_runtime": 669.64,
      "train_tokens_per_second": 3284.177
    },
    {
      "epoch": 1.1569148936170213,
      "grad_norm": 9.302370017394423e-05,
      "learning_rate": 2.0313063324590736e-05,
      "loss": 0.2404,
      "num_input_tokens_seen": 2223120,
      "step": 435,
      "train_runtime": 676.8858,
      "train_tokens_per_second": 3284.335
    },
    {
      "epoch": 1.1702127659574468,
      "grad_norm": 7.59345421101898e-05,
      "learning_rate": 2.0117214251971088e-05,
      "loss": 0.2588,
      "num_input_tokens_seen": 2246128,
      "step": 440,
      "train_runtime": 683.9358,
      "train_tokens_per_second": 3284.121
    },
    {
      "epoch": 1.1835106382978724,
      "grad_norm": 0.00014472956536337733,
      "learning_rate": 1.9920372867593537e-05,
      "loss": 0.1375,
      "num_input_tokens_seen": 2274448,
      "step": 445,
      "train_runtime": 692.2182,
      "train_tokens_per_second": 3285.738
    },
    {
      "epoch": 1.196808510638298,
      "grad_norm": 6.864719762234017e-05,
      "learning_rate": 1.9722577342230408e-05,
      "loss": 0.1394,
      "num_input_tokens_seen": 2298736,
      "step": 450,
      "train_runtime": 699.5691,
      "train_tokens_per_second": 3285.931
    },
    {
      "epoch": 1.2101063829787235,
      "grad_norm": 7.647907477803528e-05,
      "learning_rate": 1.9523866031677607e-05,
      "loss": 0.1386,
      "num_input_tokens_seen": 2326192,
      "step": 455,
      "train_runtime": 707.7177,
      "train_tokens_per_second": 3286.893
    },
    {
      "epoch": 1.2234042553191489,
      "grad_norm": 5.32688463863451e-05,
      "learning_rate": 1.9324277469316807e-05,
      "loss": 0.1507,
      "num_input_tokens_seen": 2354992,
      "step": 460,
      "train_runtime": 716.0771,
      "train_tokens_per_second": 3288.741
    },
    {
      "epoch": 1.2367021276595744,
      "grad_norm": 6.799784750910476e-05,
      "learning_rate": 1.9123850358643208e-05,
      "loss": 0.1407,
      "num_input_tokens_seen": 2378736,
      "step": 465,
      "train_runtime": 723.2828,
      "train_tokens_per_second": 3288.805
    },
    {
      "epoch": 1.25,
      "grad_norm": 6.61658777971752e-05,
      "learning_rate": 1.8922623565760255e-05,
      "loss": 0.1241,
      "num_input_tokens_seen": 2402928,
      "step": 470,
      "train_runtime": 730.5847,
      "train_tokens_per_second": 3289.048
    },
    {
      "epoch": 1.2632978723404256,
      "grad_norm": 6.660693179583177e-05,
      "learning_rate": 1.87206361118429e-05,
      "loss": 0.1393,
      "num_input_tokens_seen": 2427536,
      "step": 475,
      "train_runtime": 737.9838,
      "train_tokens_per_second": 3289.417
    },
    {
      "epoch": 1.2765957446808511,
      "grad_norm": 5.5432989029213786e-05,
      "learning_rate": 1.8517927165570745e-05,
      "loss": 0.1068,
      "num_input_tokens_seen": 2451952,
      "step": 480,
      "train_runtime": 745.3344,
      "train_tokens_per_second": 3289.734
    },
    {
      "epoch": 1.2898936170212765,
      "grad_norm": 5.3888677939539775e-05,
      "learning_rate": 1.831453603553259e-05,
      "loss": 0.1255,
      "num_input_tokens_seen": 2480912,
      "step": 485,
      "train_runtime": 753.8109,
      "train_tokens_per_second": 3291.16
    },
    {
      "epoch": 1.3031914893617023,
      "grad_norm": 6.483653123723343e-05,
      "learning_rate": 1.811050216260385e-05,
      "loss": 0.0855,
      "num_input_tokens_seen": 2505744,
      "step": 490,
      "train_runtime": 761.3174,
      "train_tokens_per_second": 3291.326
    },
    {
      "epoch": 1.3164893617021276,
      "grad_norm": 5.622122625936754e-05,
      "learning_rate": 1.790586511229832e-05,
      "loss": 0.1123,
      "num_input_tokens_seen": 2528720,
      "step": 495,
      "train_runtime": 768.3834,
      "train_tokens_per_second": 3290.961
    },
    {
      "epoch": 1.3297872340425532,
      "grad_norm": 4.249440462444909e-05,
      "learning_rate": 1.7700664567095788e-05,
      "loss": 0.0643,
      "num_input_tokens_seen": 2551760,
      "step": 500,
      "train_runtime": 775.3677,
      "train_tokens_per_second": 3291.032
    }
  ],
  "logging_steps": 5,
  "max_steps": 1128,
  "num_input_tokens_seen": 2551760,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0923743661195264e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}