| { |
| "best_global_step": 2500, |
| "best_metric": 2.1584246158599854, |
| "best_model_checkpoint": "/kaggle/working/checkpoints/checkpoint-2500", |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 2691, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0011157601115760112, |
| "grad_norm": 255.8717498779297, |
| "learning_rate": 0.0, |
| "loss": 108.23062133789062, |
| "num_input_tokens_seen": 8192, |
| "step": 1, |
| "train_runtime": 9.3992, |
| "train_tokens_per_second": 871.567 |
| }, |
| { |
| "epoch": 0.05578800557880056, |
| "grad_norm": 115.45601654052734, |
| "learning_rate": 0.000362962962962963, |
| "loss": 93.04728356186224, |
| "num_input_tokens_seen": 409600, |
| "step": 50, |
| "train_runtime": 173.2384, |
| "train_tokens_per_second": 2364.372 |
| }, |
| { |
| "epoch": 0.11157601115760112, |
| "grad_norm": 1.7184170484542847, |
| "learning_rate": 0.0007333333333333333, |
| "loss": 36.475849609375, |
| "num_input_tokens_seen": 819200, |
| "step": 100, |
| "train_runtime": 351.2989, |
| "train_tokens_per_second": 2331.918 |
| }, |
| { |
| "epoch": 0.16736401673640167, |
| "grad_norm": 0.7904146909713745, |
| "learning_rate": 0.0009945226917057904, |
| "loss": 17.753541259765626, |
| "num_input_tokens_seen": 1228800, |
| "step": 150, |
| "train_runtime": 530.547, |
| "train_tokens_per_second": 2316.1 |
| }, |
| { |
| "epoch": 0.22315202231520223, |
| "grad_norm": 0.6032889485359192, |
| "learning_rate": 0.000974960876369327, |
| "loss": 14.870999755859375, |
| "num_input_tokens_seen": 1638400, |
| "step": 200, |
| "train_runtime": 709.2031, |
| "train_tokens_per_second": 2310.199 |
| }, |
| { |
| "epoch": 0.2789400278940028, |
| "grad_norm": 0.5387754440307617, |
| "learning_rate": 0.0009553990610328639, |
| "loss": 14.170311279296875, |
| "num_input_tokens_seen": 2048000, |
| "step": 250, |
| "train_runtime": 887.3381, |
| "train_tokens_per_second": 2308.027 |
| }, |
| { |
| "epoch": 0.33472803347280333, |
| "grad_norm": 0.5034986734390259, |
| "learning_rate": 0.0009358372456964006, |
| "loss": 13.64815673828125, |
| "num_input_tokens_seen": 2457600, |
| "step": 300, |
| "train_runtime": 1065.8215, |
| "train_tokens_per_second": 2305.827 |
| }, |
| { |
| "epoch": 0.3905160390516039, |
| "grad_norm": 0.637877345085144, |
| "learning_rate": 0.0009162754303599374, |
| "loss": 13.331446533203126, |
| "num_input_tokens_seen": 2867200, |
| "step": 350, |
| "train_runtime": 1244.2112, |
| "train_tokens_per_second": 2304.432 |
| }, |
| { |
| "epoch": 0.44630404463040446, |
| "grad_norm": 0.5915430188179016, |
| "learning_rate": 0.0008967136150234741, |
| "loss": 12.95500732421875, |
| "num_input_tokens_seen": 3276800, |
| "step": 400, |
| "train_runtime": 1422.4179, |
| "train_tokens_per_second": 2303.683 |
| }, |
| { |
| "epoch": 0.502092050209205, |
| "grad_norm": 0.5109351873397827, |
| "learning_rate": 0.000877151799687011, |
| "loss": 12.745074462890624, |
| "num_input_tokens_seen": 3686400, |
| "step": 450, |
| "train_runtime": 1600.7409, |
| "train_tokens_per_second": 2302.934 |
| }, |
| { |
| "epoch": 0.5578800557880056, |
| "grad_norm": 0.5278257727622986, |
| "learning_rate": 0.0008575899843505478, |
| "loss": 12.500452880859376, |
| "num_input_tokens_seen": 4096000, |
| "step": 500, |
| "train_runtime": 1779.1422, |
| "train_tokens_per_second": 2302.233 |
| }, |
| { |
| "epoch": 0.5578800557880056, |
| "eval_loss": 2.4494383335113525, |
| "eval_runtime": 10.7867, |
| "eval_samples_per_second": 53.77, |
| "eval_steps_per_second": 6.768, |
| "num_input_tokens_seen": 4096000, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.6136680613668062, |
| "grad_norm": 0.5274862051010132, |
| "learning_rate": 0.0008380281690140845, |
| "loss": 12.42565185546875, |
| "num_input_tokens_seen": 4505600, |
| "step": 550, |
| "train_runtime": 2008.4152, |
| "train_tokens_per_second": 2243.361 |
| }, |
| { |
| "epoch": 0.6694560669456067, |
| "grad_norm": 0.5122476816177368, |
| "learning_rate": 0.0008184663536776214, |
| "loss": 12.244449462890625, |
| "num_input_tokens_seen": 4915200, |
| "step": 600, |
| "train_runtime": 2186.6759, |
| "train_tokens_per_second": 2247.795 |
| }, |
| { |
| "epoch": 0.7252440725244073, |
| "grad_norm": 0.48029351234436035, |
| "learning_rate": 0.000798904538341158, |
| "loss": 12.14824951171875, |
| "num_input_tokens_seen": 5324800, |
| "step": 650, |
| "train_runtime": 2365.4468, |
| "train_tokens_per_second": 2251.076 |
| }, |
| { |
| "epoch": 0.7810320781032078, |
| "grad_norm": 0.5422759652137756, |
| "learning_rate": 0.0007793427230046949, |
| "loss": 12.082501220703126, |
| "num_input_tokens_seen": 5734400, |
| "step": 700, |
| "train_runtime": 2543.5192, |
| "train_tokens_per_second": 2254.514 |
| }, |
| { |
| "epoch": 0.8368200836820083, |
| "grad_norm": 0.5309758186340332, |
| "learning_rate": 0.0007597809076682316, |
| "loss": 11.98714599609375, |
| "num_input_tokens_seen": 6144000, |
| "step": 750, |
| "train_runtime": 2721.8962, |
| "train_tokens_per_second": 2257.25 |
| }, |
| { |
| "epoch": 0.8926080892608089, |
| "grad_norm": 0.5811660289764404, |
| "learning_rate": 0.0007402190923317684, |
| "loss": 11.878330078125, |
| "num_input_tokens_seen": 6553600, |
| "step": 800, |
| "train_runtime": 2899.6508, |
| "train_tokens_per_second": 2260.134 |
| }, |
| { |
| "epoch": 0.9483960948396095, |
| "grad_norm": 0.5693972706794739, |
| "learning_rate": 0.0007206572769953051, |
| "loss": 11.66504638671875, |
| "num_input_tokens_seen": 6963200, |
| "step": 850, |
| "train_runtime": 3077.6825, |
| "train_tokens_per_second": 2262.482 |
| }, |
| { |
| "epoch": 1.003347280334728, |
| "grad_norm": 0.5016735792160034, |
| "learning_rate": 0.000701095461658842, |
| "loss": 11.572359619140625, |
| "num_input_tokens_seen": 7366656, |
| "step": 900, |
| "train_runtime": 3253.1428, |
| "train_tokens_per_second": 2264.474 |
| }, |
| { |
| "epoch": 1.0591352859135286, |
| "grad_norm": 0.5279722809791565, |
| "learning_rate": 0.0006815336463223787, |
| "loss": 11.42222412109375, |
| "num_input_tokens_seen": 7776256, |
| "step": 950, |
| "train_runtime": 3431.1574, |
| "train_tokens_per_second": 2266.365 |
| }, |
| { |
| "epoch": 1.114923291492329, |
| "grad_norm": 0.5057054758071899, |
| "learning_rate": 0.0006619718309859155, |
| "loss": 11.35930419921875, |
| "num_input_tokens_seen": 8185856, |
| "step": 1000, |
| "train_runtime": 3609.1142, |
| "train_tokens_per_second": 2268.107 |
| }, |
| { |
| "epoch": 1.114923291492329, |
| "eval_loss": 2.3056819438934326, |
| "eval_runtime": 10.8358, |
| "eval_samples_per_second": 53.527, |
| "eval_steps_per_second": 6.737, |
| "num_input_tokens_seen": 8185856, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.1707112970711298, |
| "grad_norm": 0.4898532032966614, |
| "learning_rate": 0.0006424100156494523, |
| "loss": 11.516380615234375, |
| "num_input_tokens_seen": 8595456, |
| "step": 1050, |
| "train_runtime": 3839.7669, |
| "train_tokens_per_second": 2238.536 |
| }, |
| { |
| "epoch": 1.2264993026499302, |
| "grad_norm": 0.48597925901412964, |
| "learning_rate": 0.000622848200312989, |
| "loss": 11.34823974609375, |
| "num_input_tokens_seen": 9005056, |
| "step": 1100, |
| "train_runtime": 4018.0046, |
| "train_tokens_per_second": 2241.176 |
| }, |
| { |
| "epoch": 1.2822873082287307, |
| "grad_norm": 0.48646438121795654, |
| "learning_rate": 0.0006032863849765259, |
| "loss": 11.37316650390625, |
| "num_input_tokens_seen": 9414656, |
| "step": 1150, |
| "train_runtime": 4195.7027, |
| "train_tokens_per_second": 2243.881 |
| }, |
| { |
| "epoch": 1.3380753138075314, |
| "grad_norm": 0.5371856689453125, |
| "learning_rate": 0.0005837245696400626, |
| "loss": 11.138577880859375, |
| "num_input_tokens_seen": 9824256, |
| "step": 1200, |
| "train_runtime": 4373.778, |
| "train_tokens_per_second": 2246.172 |
| }, |
| { |
| "epoch": 1.393863319386332, |
| "grad_norm": 0.5028601884841919, |
| "learning_rate": 0.0005641627543035994, |
| "loss": 11.108404541015625, |
| "num_input_tokens_seen": 10233856, |
| "step": 1250, |
| "train_runtime": 4552.0261, |
| "train_tokens_per_second": 2248.198 |
| }, |
| { |
| "epoch": 1.4496513249651324, |
| "grad_norm": 0.5344403386116028, |
| "learning_rate": 0.0005446009389671362, |
| "loss": 11.08916015625, |
| "num_input_tokens_seen": 10643456, |
| "step": 1300, |
| "train_runtime": 4730.315, |
| "train_tokens_per_second": 2250.052 |
| }, |
| { |
| "epoch": 1.505439330543933, |
| "grad_norm": 0.4930146038532257, |
| "learning_rate": 0.000525039123630673, |
| "loss": 11.0640771484375, |
| "num_input_tokens_seen": 11053056, |
| "step": 1350, |
| "train_runtime": 4908.43, |
| "train_tokens_per_second": 2251.852 |
| }, |
| { |
| "epoch": 1.5612273361227336, |
| "grad_norm": 0.6248587965965271, |
| "learning_rate": 0.0005054773082942097, |
| "loss": 11.053743896484375, |
| "num_input_tokens_seen": 11462656, |
| "step": 1400, |
| "train_runtime": 5086.4814, |
| "train_tokens_per_second": 2253.553 |
| }, |
| { |
| "epoch": 1.617015341701534, |
| "grad_norm": 0.5878036618232727, |
| "learning_rate": 0.0004859154929577465, |
| "loss": 11.075479736328125, |
| "num_input_tokens_seen": 11872256, |
| "step": 1450, |
| "train_runtime": 5264.6386, |
| "train_tokens_per_second": 2255.094 |
| }, |
| { |
| "epoch": 1.6728033472803348, |
| "grad_norm": 0.5579462647438049, |
| "learning_rate": 0.0004663536776212833, |
| "loss": 11.019775390625, |
| "num_input_tokens_seen": 12281856, |
| "step": 1500, |
| "train_runtime": 5442.3024, |
| "train_tokens_per_second": 2256.739 |
| }, |
| { |
| "epoch": 1.6728033472803348, |
| "eval_loss": 2.2273507118225098, |
| "eval_runtime": 10.872, |
| "eval_samples_per_second": 53.348, |
| "eval_steps_per_second": 6.714, |
| "num_input_tokens_seen": 12281856, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.7285913528591352, |
| "grad_norm": 0.5162424445152283, |
| "learning_rate": 0.00044679186228482003, |
| "loss": 11.0057470703125, |
| "num_input_tokens_seen": 12691456, |
| "step": 1550, |
| "train_runtime": 5677.3627, |
| "train_tokens_per_second": 2235.449 |
| }, |
| { |
| "epoch": 1.7843793584379357, |
| "grad_norm": 0.5111306309700012, |
| "learning_rate": 0.00042723004694835684, |
| "loss": 10.911416015625, |
| "num_input_tokens_seen": 13101056, |
| "step": 1600, |
| "train_runtime": 5855.7999, |
| "train_tokens_per_second": 2237.279 |
| }, |
| { |
| "epoch": 1.8401673640167364, |
| "grad_norm": 0.5066443681716919, |
| "learning_rate": 0.0004076682316118936, |
| "loss": 10.815118408203125, |
| "num_input_tokens_seen": 13510656, |
| "step": 1650, |
| "train_runtime": 6033.7829, |
| "train_tokens_per_second": 2239.168 |
| }, |
| { |
| "epoch": 1.8959553695955371, |
| "grad_norm": 0.4963262677192688, |
| "learning_rate": 0.00038810641627543035, |
| "loss": 10.912720947265624, |
| "num_input_tokens_seen": 13920256, |
| "step": 1700, |
| "train_runtime": 6211.8611, |
| "train_tokens_per_second": 2240.916 |
| }, |
| { |
| "epoch": 1.9517433751743374, |
| "grad_norm": 0.5284143090248108, |
| "learning_rate": 0.00036854460093896715, |
| "loss": 10.824471435546876, |
| "num_input_tokens_seen": 14329856, |
| "step": 1750, |
| "train_runtime": 6389.8997, |
| "train_tokens_per_second": 2242.579 |
| }, |
| { |
| "epoch": 2.006694560669456, |
| "grad_norm": 0.5001941323280334, |
| "learning_rate": 0.0003489827856025039, |
| "loss": 10.686629638671874, |
| "num_input_tokens_seen": 14733312, |
| "step": 1800, |
| "train_runtime": 6565.3838, |
| "train_tokens_per_second": 2244.09 |
| }, |
| { |
| "epoch": 2.0624825662482564, |
| "grad_norm": 0.5786783695220947, |
| "learning_rate": 0.00032942097026604066, |
| "loss": 10.69493896484375, |
| "num_input_tokens_seen": 15142912, |
| "step": 1850, |
| "train_runtime": 6743.0758, |
| "train_tokens_per_second": 2245.698 |
| }, |
| { |
| "epoch": 2.118270571827057, |
| "grad_norm": 0.5400704145431519, |
| "learning_rate": 0.00030985915492957747, |
| "loss": 10.6007958984375, |
| "num_input_tokens_seen": 15552512, |
| "step": 1900, |
| "train_runtime": 6921.2091, |
| "train_tokens_per_second": 2247.08 |
| }, |
| { |
| "epoch": 2.174058577405858, |
| "grad_norm": 0.5565312504768372, |
| "learning_rate": 0.0002902973395931143, |
| "loss": 10.623209228515625, |
| "num_input_tokens_seen": 15962112, |
| "step": 1950, |
| "train_runtime": 7099.0243, |
| "train_tokens_per_second": 2248.494 |
| }, |
| { |
| "epoch": 2.229846582984658, |
| "grad_norm": 0.5031603574752808, |
| "learning_rate": 0.00027073552425665103, |
| "loss": 10.5346484375, |
| "num_input_tokens_seen": 16371712, |
| "step": 2000, |
| "train_runtime": 7277.2957, |
| "train_tokens_per_second": 2249.697 |
| }, |
| { |
| "epoch": 2.229846582984658, |
| "eval_loss": 2.1852738857269287, |
| "eval_runtime": 10.833, |
| "eval_samples_per_second": 53.54, |
| "eval_steps_per_second": 6.739, |
| "num_input_tokens_seen": 16371712, |
| "step": 2000 |
| }, |
| { |
| "epoch": 2.285634588563459, |
| "grad_norm": 0.5220429301261902, |
| "learning_rate": 0.00025117370892018784, |
| "loss": 10.5833935546875, |
| "num_input_tokens_seen": 16781312, |
| "step": 2050, |
| "train_runtime": 7514.4606, |
| "train_tokens_per_second": 2233.202 |
| }, |
| { |
| "epoch": 2.3414225941422595, |
| "grad_norm": 0.5597192049026489, |
| "learning_rate": 0.0002316118935837246, |
| "loss": 10.598099365234376, |
| "num_input_tokens_seen": 17190912, |
| "step": 2100, |
| "train_runtime": 7692.7208, |
| "train_tokens_per_second": 2234.699 |
| }, |
| { |
| "epoch": 2.3972105997210598, |
| "grad_norm": 0.5263229012489319, |
| "learning_rate": 0.00021205007824726135, |
| "loss": 10.5710595703125, |
| "num_input_tokens_seen": 17600512, |
| "step": 2150, |
| "train_runtime": 7870.818, |
| "train_tokens_per_second": 2236.173 |
| }, |
| { |
| "epoch": 2.4529986052998605, |
| "grad_norm": 0.5467224717140198, |
| "learning_rate": 0.00019248826291079813, |
| "loss": 10.567000732421874, |
| "num_input_tokens_seen": 18010112, |
| "step": 2200, |
| "train_runtime": 8049.3853, |
| "train_tokens_per_second": 2237.452 |
| }, |
| { |
| "epoch": 2.508786610878661, |
| "grad_norm": 0.5116831660270691, |
| "learning_rate": 0.0001729264475743349, |
| "loss": 10.55864013671875, |
| "num_input_tokens_seen": 18419712, |
| "step": 2250, |
| "train_runtime": 8227.2353, |
| "train_tokens_per_second": 2238.87 |
| }, |
| { |
| "epoch": 2.5645746164574614, |
| "grad_norm": 0.4988791048526764, |
| "learning_rate": 0.00015336463223787167, |
| "loss": 10.57171142578125, |
| "num_input_tokens_seen": 18829312, |
| "step": 2300, |
| "train_runtime": 8405.3452, |
| "train_tokens_per_second": 2240.159 |
| }, |
| { |
| "epoch": 2.620362622036262, |
| "grad_norm": 0.5179631114006042, |
| "learning_rate": 0.00013380281690140845, |
| "loss": 10.552501220703125, |
| "num_input_tokens_seen": 19238912, |
| "step": 2350, |
| "train_runtime": 8583.3621, |
| "train_tokens_per_second": 2241.419 |
| }, |
| { |
| "epoch": 2.676150627615063, |
| "grad_norm": 0.5133325457572937, |
| "learning_rate": 0.00011424100156494523, |
| "loss": 10.537681884765625, |
| "num_input_tokens_seen": 19648512, |
| "step": 2400, |
| "train_runtime": 8761.4066, |
| "train_tokens_per_second": 2242.621 |
| }, |
| { |
| "epoch": 2.731938633193863, |
| "grad_norm": 0.5093114972114563, |
| "learning_rate": 9.467918622848201e-05, |
| "loss": 10.546854248046875, |
| "num_input_tokens_seen": 20058112, |
| "step": 2450, |
| "train_runtime": 8939.3652, |
| "train_tokens_per_second": 2243.796 |
| }, |
| { |
| "epoch": 2.787726638772664, |
| "grad_norm": 0.534430742263794, |
| "learning_rate": 7.511737089201878e-05, |
| "loss": 10.509217529296874, |
| "num_input_tokens_seen": 20467712, |
| "step": 2500, |
| "train_runtime": 9117.1673, |
| "train_tokens_per_second": 2244.964 |
| }, |
| { |
| "epoch": 2.787726638772664, |
| "eval_loss": 2.1584246158599854, |
| "eval_runtime": 10.8364, |
| "eval_samples_per_second": 53.523, |
| "eval_steps_per_second": 6.737, |
| "num_input_tokens_seen": 20467712, |
| "step": 2500 |
| }, |
| { |
| "epoch": 2.8435146443514645, |
| "grad_norm": 0.5242642760276794, |
| "learning_rate": 5.555555555555555e-05, |
| "loss": 10.4308740234375, |
| "num_input_tokens_seen": 20877312, |
| "step": 2550, |
| "train_runtime": 9348.7798, |
| "train_tokens_per_second": 2233.159 |
| }, |
| { |
| "epoch": 2.8993026499302648, |
| "grad_norm": 0.5218517184257507, |
| "learning_rate": 3.599374021909233e-05, |
| "loss": 10.400145263671876, |
| "num_input_tokens_seen": 21286912, |
| "step": 2600, |
| "train_runtime": 9526.9521, |
| "train_tokens_per_second": 2234.388 |
| }, |
| { |
| "epoch": 2.9550906555090655, |
| "grad_norm": 0.5121810436248779, |
| "learning_rate": 1.643192488262911e-05, |
| "loss": 10.519163818359376, |
| "num_input_tokens_seen": 21696512, |
| "step": 2650, |
| "train_runtime": 9704.9471, |
| "train_tokens_per_second": 2235.614 |
| }, |
| { |
| "epoch": 3.0, |
| "num_input_tokens_seen": 22026240, |
| "step": 2691, |
| "total_flos": 3.99160296603648e+16, |
| "train_loss": 13.479616094991115, |
| "train_runtime": 9892.7445, |
| "train_samples_per_second": 17.395, |
| "train_steps_per_second": 0.272 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 2691, |
| "num_input_tokens_seen": 22026240, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.99160296603648e+16, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|