{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.3297872340425532, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013297872340425532, "grad_norm": 0.00017850878066383302, "learning_rate": 2.9999069195872345e-05, "loss": 1.5526, "num_input_tokens_seen": 22912, "step": 5, "train_runtime": 15.9018, "train_tokens_per_second": 1440.846 }, { "epoch": 0.026595744680851064, "grad_norm": 0.00022191159951034933, "learning_rate": 2.9995288002087968e-05, "loss": 0.1475, "num_input_tokens_seen": 47104, "step": 10, "train_runtime": 23.1749, "train_tokens_per_second": 2032.547 }, { "epoch": 0.0398936170212766, "grad_norm": 0.00020294415298849344, "learning_rate": 2.9988598976060308e-05, "loss": 0.153, "num_input_tokens_seen": 73920, "step": 15, "train_runtime": 31.9246, "train_tokens_per_second": 2315.457 }, { "epoch": 0.05319148936170213, "grad_norm": 7.007523527136073e-05, "learning_rate": 2.9979003414901197e-05, "loss": 0.1529, "num_input_tokens_seen": 99360, "step": 20, "train_runtime": 39.4019, "train_tokens_per_second": 2521.709 }, { "epoch": 0.06648936170212766, "grad_norm": 0.00010616348299663514, "learning_rate": 2.99665031793473e-05, "loss": 0.1295, "num_input_tokens_seen": 124192, "step": 25, "train_runtime": 46.8336, "train_tokens_per_second": 2651.77 }, { "epoch": 0.0797872340425532, "grad_norm": 5.6807843066053465e-05, "learning_rate": 2.995110069339927e-05, "loss": 0.1431, "num_input_tokens_seen": 151456, "step": 30, "train_runtime": 54.8766, "train_tokens_per_second": 2759.94 }, { "epoch": 0.09308510638297872, "grad_norm": 8.653431723359972e-05, "learning_rate": 2.993279894385171e-05, "loss": 0.1003, "num_input_tokens_seen": 177344, "step": 35, "train_runtime": 62.5606, "train_tokens_per_second": 2834.754 }, { "epoch": 0.10638297872340426, "grad_norm": 4.548930155579001e-05, "learning_rate": 2.9911601479713985e-05, "loss": 0.1126, "num_input_tokens_seen": 205952, "step": 40, "train_runtime": 70.8371, "train_tokens_per_second": 2907.403 }, { "epoch": 0.1196808510638298, "grad_norm": 0.000141258497023955, "learning_rate": 2.988751241152199e-05, "loss": 0.1204, "num_input_tokens_seen": 237920, "step": 45, "train_runtime": 79.888, "train_tokens_per_second": 2978.17 }, { "epoch": 0.13297872340425532, "grad_norm": 4.336608981247991e-05, "learning_rate": 2.9860536410541076e-05, "loss": 0.069, "num_input_tokens_seen": 264128, "step": 50, "train_runtime": 87.681, "train_tokens_per_second": 3012.373 }, { "epoch": 0.14627659574468085, "grad_norm": 7.17395669198595e-05, "learning_rate": 2.983067870786019e-05, "loss": 0.0447, "num_input_tokens_seen": 288896, "step": 55, "train_runtime": 95.0758, "train_tokens_per_second": 3038.587 }, { "epoch": 0.1595744680851064, "grad_norm": 4.3858930439455435e-05, "learning_rate": 2.9797945093377513e-05, "loss": 0.07, "num_input_tokens_seen": 311680, "step": 60, "train_runtime": 102.0665, "train_tokens_per_second": 3053.696 }, { "epoch": 0.17287234042553193, "grad_norm": 8.529757906217128e-05, "learning_rate": 2.976234191467767e-05, "loss": 0.0789, "num_input_tokens_seen": 334976, "step": 65, "train_runtime": 109.0924, "train_tokens_per_second": 3070.572 }, { "epoch": 0.18617021276595744, "grad_norm": 6.885492621222511e-05, "learning_rate": 2.9723876075800846e-05, "loss": 0.083, "num_input_tokens_seen": 360480, "step": 70, "train_runtime": 116.6544, "train_tokens_per_second": 3090.154 }, { "epoch": 0.19946808510638298, "grad_norm": 7.440832268912345e-05, "learning_rate": 2.968255503590398e-05, "loss": 0.0511, "num_input_tokens_seen": 384768, "step": 75, "train_runtime": 123.9537, "train_tokens_per_second": 3104.126 }, { "epoch": 0.2127659574468085, "grad_norm": 7.101365190465003e-05, "learning_rate": 2.963838680781431e-05, "loss": 0.0788, "num_input_tokens_seen": 410304, "step": 80, "train_runtime": 131.6092, "train_tokens_per_second": 3117.593 }, { "epoch": 0.22606382978723405, "grad_norm": 1.87977020686958e-05, "learning_rate": 2.959137995647556e-05, "loss": 0.0467, "num_input_tokens_seen": 437888, "step": 85, "train_runtime": 139.7728, "train_tokens_per_second": 3132.856 }, { "epoch": 0.2393617021276596, "grad_norm": 0.0001294072571909055, "learning_rate": 2.9541543597287034e-05, "loss": 0.053, "num_input_tokens_seen": 462976, "step": 90, "train_runtime": 147.3061, "train_tokens_per_second": 3142.952 }, { "epoch": 0.2526595744680851, "grad_norm": 9.718466753838584e-05, "learning_rate": 2.9488887394336025e-05, "loss": 0.0345, "num_input_tokens_seen": 485280, "step": 95, "train_runtime": 154.2543, "train_tokens_per_second": 3145.973 }, { "epoch": 0.26595744680851063, "grad_norm": 0.00011859676305903122, "learning_rate": 2.9433421558523767e-05, "loss": 0.0716, "num_input_tokens_seen": 509856, "step": 100, "train_runtime": 161.6729, "train_tokens_per_second": 3153.627 }, { "epoch": 0.27925531914893614, "grad_norm": 6.369210314005613e-05, "learning_rate": 2.9375156845585374e-05, "loss": 0.0562, "num_input_tokens_seen": 535264, "step": 105, "train_runtime": 170.0802, "train_tokens_per_second": 3147.127 }, { "epoch": 0.2925531914893617, "grad_norm": 4.727758641820401e-05, "learning_rate": 2.9314104554004137e-05, "loss": 0.0371, "num_input_tokens_seen": 562912, "step": 110, "train_runtime": 178.2035, "train_tokens_per_second": 3158.815 }, { "epoch": 0.3058510638297872, "grad_norm": 0.00010592794569674879, "learning_rate": 2.925027652282056e-05, "loss": 0.0586, "num_input_tokens_seen": 585280, "step": 115, "train_runtime": 185.0929, "train_tokens_per_second": 3162.088 }, { "epoch": 0.3191489361702128, "grad_norm": 2.9270680897752754e-05, "learning_rate": 2.918368512933657e-05, "loss": 0.0633, "num_input_tokens_seen": 612224, "step": 120, "train_runtime": 192.9777, "train_tokens_per_second": 3172.512 }, { "epoch": 0.3324468085106383, "grad_norm": 0.00010040538472821936, "learning_rate": 2.911434328671536e-05, "loss": 0.0751, "num_input_tokens_seen": 639264, "step": 125, "train_runtime": 201.0075, "train_tokens_per_second": 3180.299 }, { "epoch": 0.34574468085106386, "grad_norm": 0.0001042517542373389, "learning_rate": 2.904226444147732e-05, "loss": 0.0677, "num_input_tokens_seen": 665280, "step": 130, "train_runtime": 208.7729, "train_tokens_per_second": 3186.621 }, { "epoch": 0.35904255319148937, "grad_norm": 7.185106369433925e-05, "learning_rate": 2.896746257089251e-05, "loss": 0.0587, "num_input_tokens_seen": 689216, "step": 135, "train_runtime": 216.0407, "train_tokens_per_second": 3190.214 }, { "epoch": 0.3723404255319149, "grad_norm": 5.872031761100516e-05, "learning_rate": 2.8889952180270287e-05, "loss": 0.0605, "num_input_tokens_seen": 714880, "step": 140, "train_runtime": 223.7009, "train_tokens_per_second": 3195.695 }, { "epoch": 0.38563829787234044, "grad_norm": 2.933590076281689e-05, "learning_rate": 2.880974830014643e-05, "loss": 0.1054, "num_input_tokens_seen": 739904, "step": 145, "train_runtime": 231.1836, "train_tokens_per_second": 3200.504 }, { "epoch": 0.39893617021276595, "grad_norm": 0.00012435043754521757, "learning_rate": 2.872686648336853e-05, "loss": 0.0479, "num_input_tokens_seen": 765824, "step": 150, "train_runtime": 238.8742, "train_tokens_per_second": 3205.972 }, { "epoch": 0.4122340425531915, "grad_norm": 8.882827387424186e-05, "learning_rate": 2.8641322802079984e-05, "loss": 0.0508, "num_input_tokens_seen": 797952, "step": 155, "train_runtime": 248.022, "train_tokens_per_second": 3217.263 }, { "epoch": 0.425531914893617, "grad_norm": 9.789071918930858e-05, "learning_rate": 2.8553133844603382e-05, "loss": 0.0399, "num_input_tokens_seen": 823264, "step": 160, "train_runtime": 255.6112, "train_tokens_per_second": 3220.766 }, { "epoch": 0.43882978723404253, "grad_norm": 4.716894181910902e-05, "learning_rate": 2.846231671222374e-05, "loss": 0.062, "num_input_tokens_seen": 849216, "step": 165, "train_runtime": 263.3712, "train_tokens_per_second": 3224.408 }, { "epoch": 0.4521276595744681, "grad_norm": 8.95522753125988e-05, "learning_rate": 2.836888901587229e-05, "loss": 0.1292, "num_input_tokens_seen": 874208, "step": 170, "train_runtime": 270.894, "train_tokens_per_second": 3227.122 }, { "epoch": 0.4654255319148936, "grad_norm": 3.6886351153953e-05, "learning_rate": 2.827286887271143e-05, "loss": 0.0558, "num_input_tokens_seen": 898624, "step": 175, "train_runtime": 278.2599, "train_tokens_per_second": 3229.441 }, { "epoch": 0.4787234042553192, "grad_norm": 7.180378452176228e-05, "learning_rate": 2.8174274902621495e-05, "loss": 0.0506, "num_input_tokens_seen": 921728, "step": 180, "train_runtime": 285.3501, "train_tokens_per_second": 3230.166 }, { "epoch": 0.4920212765957447, "grad_norm": 2.529071207391098e-05, "learning_rate": 2.8073126224590073e-05, "loss": 0.0713, "num_input_tokens_seen": 948160, "step": 185, "train_runtime": 293.1898, "train_tokens_per_second": 3233.946 }, { "epoch": 0.5053191489361702, "grad_norm": 2.971558387798723e-05, "learning_rate": 2.7969442453004525e-05, "loss": 0.0423, "num_input_tokens_seen": 974688, "step": 190, "train_runtime": 301.0219, "train_tokens_per_second": 3237.93 }, { "epoch": 0.5186170212765957, "grad_norm": 1.3908083019487094e-05, "learning_rate": 2.786324369384841e-05, "loss": 0.0376, "num_input_tokens_seen": 999232, "step": 195, "train_runtime": 308.373, "train_tokens_per_second": 3240.336 }, { "epoch": 0.5319148936170213, "grad_norm": 8.287108357762918e-05, "learning_rate": 2.7754550540802632e-05, "loss": 0.0505, "num_input_tokens_seen": 1024352, "step": 200, "train_runtime": 315.9074, "train_tokens_per_second": 3242.57 }, { "epoch": 0.5452127659574468, "grad_norm": 7.783296314300969e-05, "learning_rate": 2.7643384071251957e-05, "loss": 0.0347, "num_input_tokens_seen": 1049088, "step": 205, "train_runtime": 324.1076, "train_tokens_per_second": 3236.851 }, { "epoch": 0.5585106382978723, "grad_norm": 0.0001195428121718578, "learning_rate": 2.7529765842197798e-05, "loss": 0.0386, "num_input_tokens_seen": 1073024, "step": 210, "train_runtime": 331.3284, "train_tokens_per_second": 3238.552 }, { "epoch": 0.5718085106382979, "grad_norm": 4.606168658938259e-05, "learning_rate": 2.741371788607793e-05, "loss": 0.0616, "num_input_tokens_seen": 1098880, "step": 215, "train_runtime": 339.0001, "train_tokens_per_second": 3241.533 }, { "epoch": 0.5851063829787234, "grad_norm": 0.00013229926116764545, "learning_rate": 2.729526270649405e-05, "loss": 0.0821, "num_input_tokens_seen": 1127328, "step": 220, "train_runtime": 347.2586, "train_tokens_per_second": 3246.364 }, { "epoch": 0.598404255319149, "grad_norm": 8.632720710011199e-05, "learning_rate": 2.7174423273847966e-05, "loss": 0.0685, "num_input_tokens_seen": 1151584, "step": 225, "train_runtime": 354.5073, "train_tokens_per_second": 3248.407 }, { "epoch": 0.6117021276595744, "grad_norm": 4.496889596339315e-05, "learning_rate": 2.705122302088725e-05, "loss": 0.0667, "num_input_tokens_seen": 1180544, "step": 230, "train_runtime": 363.022, "train_tokens_per_second": 3251.991 }, { "epoch": 0.625, "grad_norm": 1.9521097783581354e-05, "learning_rate": 2.6925685838161247e-05, "loss": 0.035, "num_input_tokens_seen": 1206080, "step": 235, "train_runtime": 370.6153, "train_tokens_per_second": 3254.264 }, { "epoch": 0.6382978723404256, "grad_norm": 4.637776146410033e-05, "learning_rate": 2.67978360693883e-05, "loss": 0.0604, "num_input_tokens_seen": 1230304, "step": 240, "train_runtime": 377.9559, "train_tokens_per_second": 3255.153 }, { "epoch": 0.651595744680851, "grad_norm": 3.3805175917223096e-05, "learning_rate": 2.6667698506735113e-05, "loss": 0.0556, "num_input_tokens_seen": 1256640, "step": 245, "train_runtime": 385.7509, "train_tokens_per_second": 3257.646 }, { "epoch": 0.6648936170212766, "grad_norm": 0.00010089632996823639, "learning_rate": 2.6535298386009144e-05, "loss": 0.0487, "num_input_tokens_seen": 1280064, "step": 250, "train_runtime": 392.8672, "train_tokens_per_second": 3258.262 }, { "epoch": 0.6781914893617021, "grad_norm": 3.6058525438420475e-05, "learning_rate": 2.6400661381764962e-05, "loss": 0.0702, "num_input_tokens_seen": 1305984, "step": 255, "train_runtime": 400.5999, "train_tokens_per_second": 3260.071 }, { "epoch": 0.6914893617021277, "grad_norm": 1.9650842659757473e-05, "learning_rate": 2.6263813602325525e-05, "loss": 0.0422, "num_input_tokens_seen": 1333088, "step": 260, "train_runtime": 408.608, "train_tokens_per_second": 3262.511 }, { "epoch": 0.7047872340425532, "grad_norm": 2.503952600818593e-05, "learning_rate": 2.6124781584719365e-05, "loss": 0.0674, "num_input_tokens_seen": 1357728, "step": 265, "train_runtime": 416.0446, "train_tokens_per_second": 3263.419 }, { "epoch": 0.7180851063829787, "grad_norm": 3.540120815159753e-05, "learning_rate": 2.5983592289534602e-05, "loss": 0.0446, "num_input_tokens_seen": 1383104, "step": 270, "train_runtime": 423.6735, "train_tokens_per_second": 3264.552 }, { "epoch": 0.7313829787234043, "grad_norm": 5.61477463634219e-05, "learning_rate": 2.584027309569086e-05, "loss": 0.0382, "num_input_tokens_seen": 1408096, "step": 275, "train_runtime": 431.1736, "train_tokens_per_second": 3265.729 }, { "epoch": 0.7446808510638298, "grad_norm": 1.1481101864774246e-05, "learning_rate": 2.5694851795130044e-05, "loss": 0.0189, "num_input_tokens_seen": 1434048, "step": 280, "train_runtime": 438.8402, "train_tokens_per_second": 3267.813 }, { "epoch": 0.7579787234042553, "grad_norm": 0.0001053257001331076, "learning_rate": 2.5547356587427017e-05, "loss": 0.0246, "num_input_tokens_seen": 1457856, "step": 285, "train_runtime": 446.036, "train_tokens_per_second": 3268.471 }, { "epoch": 0.7712765957446809, "grad_norm": 5.1625109335873276e-05, "learning_rate": 2.539781607432125e-05, "loss": 0.0624, "num_input_tokens_seen": 1481120, "step": 290, "train_runtime": 453.1392, "train_tokens_per_second": 3268.576 }, { "epoch": 0.7845744680851063, "grad_norm": 6.952533112780657e-06, "learning_rate": 2.5246259254170464e-05, "loss": 0.0346, "num_input_tokens_seen": 1506176, "step": 295, "train_runtime": 460.6884, "train_tokens_per_second": 3269.403 }, { "epoch": 0.7978723404255319, "grad_norm": 9.527090878691524e-05, "learning_rate": 2.5092715516327384e-05, "loss": 0.075, "num_input_tokens_seen": 1529824, "step": 300, "train_runtime": 467.9003, "train_tokens_per_second": 3269.551 }, { "epoch": 0.8111702127659575, "grad_norm": 2.904631219280418e-05, "learning_rate": 2.4937214635440665e-05, "loss": 0.0361, "num_input_tokens_seen": 1552384, "step": 305, "train_runtime": 475.6103, "train_tokens_per_second": 3263.983 }, { "epoch": 0.824468085106383, "grad_norm": 4.446757884579711e-05, "learning_rate": 2.4779786765681082e-05, "loss": 0.0367, "num_input_tokens_seen": 1579072, "step": 310, "train_runtime": 483.4588, "train_tokens_per_second": 3266.198 }, { "epoch": 0.8377659574468085, "grad_norm": 6.499775918200612e-05, "learning_rate": 2.4620462434894158e-05, "loss": 0.0503, "num_input_tokens_seen": 1603744, "step": 315, "train_runtime": 490.8348, "train_tokens_per_second": 3267.381 }, { "epoch": 0.851063829787234, "grad_norm": 2.785153810691554e-05, "learning_rate": 2.4459272538680308e-05, "loss": 0.0371, "num_input_tokens_seen": 1627712, "step": 320, "train_runtime": 498.0766, "train_tokens_per_second": 3267.995 }, { "epoch": 0.8643617021276596, "grad_norm": 6.219661008799449e-05, "learning_rate": 2.4296248334403672e-05, "loss": 0.0635, "num_input_tokens_seen": 1653600, "step": 325, "train_runtime": 505.7239, "train_tokens_per_second": 3269.768 }, { "epoch": 0.8776595744680851, "grad_norm": 4.4950455048820004e-05, "learning_rate": 2.413142143513081e-05, "loss": 0.0597, "num_input_tokens_seen": 1676928, "step": 330, "train_runtime": 512.8025, "train_tokens_per_second": 3270.125 }, { "epoch": 0.8909574468085106, "grad_norm": 3.27678098983597e-05, "learning_rate": 2.3964823803500395e-05, "loss": 0.052, "num_input_tokens_seen": 1707808, "step": 335, "train_runtime": 521.6471, "train_tokens_per_second": 3273.876 }, { "epoch": 0.9042553191489362, "grad_norm": 4.62313364550937e-05, "learning_rate": 2.3796487745525145e-05, "loss": 0.048, "num_input_tokens_seen": 1732576, "step": 340, "train_runtime": 529.0667, "train_tokens_per_second": 3274.778 }, { "epoch": 0.9175531914893617, "grad_norm": 3.923915573977865e-05, "learning_rate": 2.3626445904327155e-05, "loss": 0.0205, "num_input_tokens_seen": 1758016, "step": 345, "train_runtime": 536.6618, "train_tokens_per_second": 3275.836 }, { "epoch": 0.9308510638297872, "grad_norm": 6.78059086567373e-06, "learning_rate": 2.3454731253807862e-05, "loss": 0.0232, "num_input_tokens_seen": 1783872, "step": 350, "train_runtime": 544.3484, "train_tokens_per_second": 3277.078 }, { "epoch": 0.9441489361702128, "grad_norm": 2.773117921606172e-05, "learning_rate": 2.328137709225385e-05, "loss": 0.0152, "num_input_tokens_seen": 1807008, "step": 355, "train_runtime": 551.3669, "train_tokens_per_second": 3277.324 }, { "epoch": 0.9574468085106383, "grad_norm": 9.006850450532511e-05, "learning_rate": 2.3106417035879797e-05, "loss": 0.0517, "num_input_tokens_seen": 1834048, "step": 360, "train_runtime": 559.3162, "train_tokens_per_second": 3279.089 }, { "epoch": 0.9707446808510638, "grad_norm": 5.6452212447766215e-05, "learning_rate": 2.2929885012309697e-05, "loss": 0.0419, "num_input_tokens_seen": 1861728, "step": 365, "train_runtime": 567.4458, "train_tokens_per_second": 3280.891 }, { "epoch": 0.9840425531914894, "grad_norm": 2.901201980876067e-07, "learning_rate": 2.2751815253997783e-05, "loss": 0.0186, "num_input_tokens_seen": 1885376, "step": 370, "train_runtime": 574.6516, "train_tokens_per_second": 3280.903 }, { "epoch": 0.9973404255319149, "grad_norm": 1.5018988506199094e-06, "learning_rate": 2.2572242291590264e-05, "loss": 0.0293, "num_input_tokens_seen": 1908128, "step": 375, "train_runtime": 581.6252, "train_tokens_per_second": 3280.683 }, { "epoch": 1.0106382978723405, "grad_norm": 0.00014393814490176737, "learning_rate": 2.239120094722926e-05, "loss": 0.2291, "num_input_tokens_seen": 1932528, "step": 380, "train_runtime": 589.9529, "train_tokens_per_second": 3275.733 }, { "epoch": 1.023936170212766, "grad_norm": 0.00014409016876015812, "learning_rate": 2.2208726327800257e-05, "loss": 1.028, "num_input_tokens_seen": 1957648, "step": 385, "train_runtime": 597.4479, "train_tokens_per_second": 3276.684 }, { "epoch": 1.0372340425531914, "grad_norm": 0.00011712688865372911, "learning_rate": 2.202485381812426e-05, "loss": 1.012, "num_input_tokens_seen": 1985392, "step": 390, "train_runtime": 605.572, "train_tokens_per_second": 3278.54 }, { "epoch": 1.050531914893617, "grad_norm": 9.847906039794907e-05, "learning_rate": 2.1839619074096117e-05, "loss": 1.1113, "num_input_tokens_seen": 2014320, "step": 395, "train_runtime": 614.0367, "train_tokens_per_second": 3280.455 }, { "epoch": 1.0638297872340425, "grad_norm": 0.00011392939632060006, "learning_rate": 2.1653058015770262e-05, "loss": 1.0173, "num_input_tokens_seen": 2041328, "step": 400, "train_runtime": 622.0201, "train_tokens_per_second": 3281.772 }, { "epoch": 1.077127659574468, "grad_norm": 8.545993478037417e-05, "learning_rate": 2.146520682039522e-05, "loss": 0.7919, "num_input_tokens_seen": 2068848, "step": 405, "train_runtime": 630.9237, "train_tokens_per_second": 3279.078 }, { "epoch": 1.0904255319148937, "grad_norm": 8.802056254353374e-05, "learning_rate": 2.127610191539825e-05, "loss": 0.696, "num_input_tokens_seen": 2094352, "step": 410, "train_runtime": 638.5178, "train_tokens_per_second": 3280.022 }, { "epoch": 1.1037234042553192, "grad_norm": 6.343067070702091e-05, "learning_rate": 2.1085779971321456e-05, "loss": 0.5359, "num_input_tokens_seen": 2120592, "step": 415, "train_runtime": 646.3244, "train_tokens_per_second": 3281.003 }, { "epoch": 1.1170212765957448, "grad_norm": 0.0001089554643840529, "learning_rate": 2.089427789471078e-05, "loss": 0.4819, "num_input_tokens_seen": 2147376, "step": 420, "train_runtime": 654.2112, "train_tokens_per_second": 3282.39 }, { "epoch": 1.1303191489361701, "grad_norm": 5.21246729476843e-05, "learning_rate": 2.0701632820959223e-05, "loss": 0.3732, "num_input_tokens_seen": 2170992, "step": 425, "train_runtime": 661.38, "train_tokens_per_second": 3282.518 }, { "epoch": 1.1436170212765957, "grad_norm": 6.755034701200202e-05, "learning_rate": 2.0507882107105664e-05, "loss": 0.3435, "num_input_tokens_seen": 2199216, "step": 430, "train_runtime": 669.64, "train_tokens_per_second": 3284.177 }, { "epoch": 1.1569148936170213, "grad_norm": 9.302370017394423e-05, "learning_rate": 2.0313063324590736e-05, "loss": 0.2404, "num_input_tokens_seen": 2223120, "step": 435, "train_runtime": 676.8858, "train_tokens_per_second": 3284.335 }, { "epoch": 1.1702127659574468, "grad_norm": 7.59345421101898e-05, "learning_rate": 2.0117214251971088e-05, "loss": 0.2588, "num_input_tokens_seen": 2246128, "step": 440, "train_runtime": 683.9358, "train_tokens_per_second": 3284.121 }, { "epoch": 1.1835106382978724, "grad_norm": 0.00014472956536337733, "learning_rate": 1.9920372867593537e-05, "loss": 0.1375, "num_input_tokens_seen": 2274448, "step": 445, "train_runtime": 692.2182, "train_tokens_per_second": 3285.738 }, { "epoch": 1.196808510638298, "grad_norm": 6.864719762234017e-05, "learning_rate": 1.9722577342230408e-05, "loss": 0.1394, "num_input_tokens_seen": 2298736, "step": 450, "train_runtime": 699.5691, "train_tokens_per_second": 3285.931 }, { "epoch": 1.2101063829787235, "grad_norm": 7.647907477803528e-05, "learning_rate": 1.9523866031677607e-05, "loss": 0.1386, "num_input_tokens_seen": 2326192, "step": 455, "train_runtime": 707.7177, "train_tokens_per_second": 3286.893 }, { "epoch": 1.2234042553191489, "grad_norm": 5.32688463863451e-05, "learning_rate": 1.9324277469316807e-05, "loss": 0.1507, "num_input_tokens_seen": 2354992, "step": 460, "train_runtime": 716.0771, "train_tokens_per_second": 3288.741 }, { "epoch": 1.2367021276595744, "grad_norm": 6.799784750910476e-05, "learning_rate": 1.9123850358643208e-05, "loss": 0.1407, "num_input_tokens_seen": 2378736, "step": 465, "train_runtime": 723.2828, "train_tokens_per_second": 3288.805 }, { "epoch": 1.25, "grad_norm": 6.61658777971752e-05, "learning_rate": 1.8922623565760255e-05, "loss": 0.1241, "num_input_tokens_seen": 2402928, "step": 470, "train_runtime": 730.5847, "train_tokens_per_second": 3289.048 }, { "epoch": 1.2632978723404256, "grad_norm": 6.660693179583177e-05, "learning_rate": 1.87206361118429e-05, "loss": 0.1393, "num_input_tokens_seen": 2427536, "step": 475, "train_runtime": 737.9838, "train_tokens_per_second": 3289.417 }, { "epoch": 1.2765957446808511, "grad_norm": 5.5432989029213786e-05, "learning_rate": 1.8517927165570745e-05, "loss": 0.1068, "num_input_tokens_seen": 2451952, "step": 480, "train_runtime": 745.3344, "train_tokens_per_second": 3289.734 }, { "epoch": 1.2898936170212765, "grad_norm": 5.3888677939539775e-05, "learning_rate": 1.831453603553259e-05, "loss": 0.1255, "num_input_tokens_seen": 2480912, "step": 485, "train_runtime": 753.8109, "train_tokens_per_second": 3291.16 }, { "epoch": 1.3031914893617023, "grad_norm": 6.483653123723343e-05, "learning_rate": 1.811050216260385e-05, "loss": 0.0855, "num_input_tokens_seen": 2505744, "step": 490, "train_runtime": 761.3174, "train_tokens_per_second": 3291.326 }, { "epoch": 1.3164893617021276, "grad_norm": 5.622122625936754e-05, "learning_rate": 1.790586511229832e-05, "loss": 0.1123, "num_input_tokens_seen": 2528720, "step": 495, "train_runtime": 768.3834, "train_tokens_per_second": 3290.961 }, { "epoch": 1.3297872340425532, "grad_norm": 4.249440462444909e-05, "learning_rate": 1.7700664567095788e-05, "loss": 0.0643, "num_input_tokens_seen": 2551760, "step": 500, "train_runtime": 775.3677, "train_tokens_per_second": 3291.032 } ], "logging_steps": 5, "max_steps": 1128, "num_input_tokens_seen": 2551760, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0923743661195264e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }