{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9092975676290066,
  "eval_steps": 10,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.009092975676290065,
      "grad_norm": 1.0088555812835693,
      "learning_rate": 0.00019800000000000002,
      "loss": 2.2722,
      "step": 5
    },
    {
      "epoch": 0.01818595135258013,
      "grad_norm": 0.9372844099998474,
      "learning_rate": 0.000196,
      "loss": 1.6351,
      "step": 10
    },
    {
      "epoch": 0.01818595135258013,
      "eval_loss": 1.5618833303451538,
      "eval_runtime": 7.6668,
      "eval_samples_per_second": 3.913,
      "eval_steps_per_second": 1.956,
      "step": 10
    },
    {
      "epoch": 0.0272789270288702,
      "grad_norm": 1.3525443077087402,
      "learning_rate": 0.000194,
      "loss": 1.5293,
      "step": 15
    },
    {
      "epoch": 0.03637190270516026,
      "grad_norm": 0.8991140723228455,
      "learning_rate": 0.000192,
      "loss": 1.4111,
      "step": 20
    },
    {
      "epoch": 0.03637190270516026,
      "eval_loss": 1.3718944787979126,
      "eval_runtime": 7.8196,
      "eval_samples_per_second": 3.837,
      "eval_steps_per_second": 1.918,
      "step": 20
    },
    {
      "epoch": 0.04546487838145033,
      "grad_norm": 1.98069429397583,
      "learning_rate": 0.00019,
      "loss": 1.3139,
      "step": 25
    },
    {
      "epoch": 0.0545578540577404,
      "grad_norm": 0.6621396541595459,
      "learning_rate": 0.000188,
      "loss": 1.4428,
      "step": 30
    },
    {
      "epoch": 0.0545578540577404,
      "eval_loss": 1.2937129735946655,
      "eval_runtime": 7.4563,
      "eval_samples_per_second": 4.023,
      "eval_steps_per_second": 2.012,
      "step": 30
    },
    {
      "epoch": 0.06365082973403047,
      "grad_norm": 0.896124005317688,
      "learning_rate": 0.00018600000000000002,
      "loss": 1.3239,
      "step": 35
    },
    {
      "epoch": 0.07274380541032052,
      "grad_norm": 1.9882720708847046,
      "learning_rate": 0.00018400000000000003,
      "loss": 1.279,
      "step": 40
    },
    {
      "epoch": 0.07274380541032052,
      "eval_loss": 1.2551789283752441,
      "eval_runtime": 7.9884,
      "eval_samples_per_second": 3.755,
      "eval_steps_per_second": 1.878,
      "step": 40
    },
    {
      "epoch": 0.08183678108661059,
      "grad_norm": 0.7292985320091248,
      "learning_rate": 0.000182,
      "loss": 1.2615,
      "step": 45
    },
    {
      "epoch": 0.09092975676290066,
      "grad_norm": 0.7677621245384216,
      "learning_rate": 0.00018,
      "loss": 1.2903,
      "step": 50
    },
    {
      "epoch": 0.09092975676290066,
      "eval_loss": 1.2139209508895874,
      "eval_runtime": 7.6731,
      "eval_samples_per_second": 3.91,
      "eval_steps_per_second": 1.955,
      "step": 50
    },
    {
      "epoch": 0.10002273243919073,
      "grad_norm": 0.781851589679718,
      "learning_rate": 0.00017800000000000002,
      "loss": 1.1273,
      "step": 55
    },
    {
      "epoch": 0.1091157081154808,
      "grad_norm": 0.7166887521743774,
      "learning_rate": 0.00017600000000000002,
      "loss": 1.3067,
      "step": 60
    },
    {
      "epoch": 0.1091157081154808,
      "eval_loss": 1.2027504444122314,
      "eval_runtime": 7.6999,
      "eval_samples_per_second": 3.896,
      "eval_steps_per_second": 1.948,
      "step": 60
    },
    {
      "epoch": 0.11820868379177085,
      "grad_norm": 0.7799960970878601,
      "learning_rate": 0.000174,
      "loss": 1.1987,
      "step": 65
    },
    {
      "epoch": 0.12730165946806093,
      "grad_norm": 0.6864632964134216,
      "learning_rate": 0.000172,
      "loss": 1.2013,
      "step": 70
    },
    {
      "epoch": 0.12730165946806093,
      "eval_loss": 1.1920855045318604,
      "eval_runtime": 7.8738,
      "eval_samples_per_second": 3.81,
      "eval_steps_per_second": 1.905,
      "step": 70
    },
    {
      "epoch": 0.136394635144351,
      "grad_norm": 0.774085283279419,
      "learning_rate": 0.00017,
      "loss": 1.1184,
      "step": 75
    },
    {
      "epoch": 0.14548761082064104,
      "grad_norm": 0.6681156158447266,
      "learning_rate": 0.000168,
      "loss": 1.2931,
      "step": 80
    },
    {
      "epoch": 0.14548761082064104,
      "eval_loss": 1.1745468378067017,
      "eval_runtime": 7.4956,
      "eval_samples_per_second": 4.002,
      "eval_steps_per_second": 2.001,
      "step": 80
    },
    {
      "epoch": 0.15458058649693113,
      "grad_norm": 0.7310240864753723,
      "learning_rate": 0.000166,
      "loss": 1.1426,
      "step": 85
    },
    {
      "epoch": 0.16367356217322118,
      "grad_norm": 0.8338828682899475,
      "learning_rate": 0.000164,
      "loss": 1.1719,
      "step": 90
    },
    {
      "epoch": 0.16367356217322118,
      "eval_loss": 1.1653213500976562,
      "eval_runtime": 7.8929,
      "eval_samples_per_second": 3.801,
      "eval_steps_per_second": 1.9,
      "step": 90
    },
    {
      "epoch": 0.17276653784951126,
      "grad_norm": 0.732770562171936,
      "learning_rate": 0.000162,
      "loss": 1.2321,
      "step": 95
    },
    {
      "epoch": 0.18185951352580132,
      "grad_norm": 0.7523607611656189,
      "learning_rate": 0.00016,
      "loss": 1.2331,
      "step": 100
    },
    {
      "epoch": 0.18185951352580132,
      "eval_loss": 1.1490192413330078,
      "eval_runtime": 7.6199,
      "eval_samples_per_second": 3.937,
      "eval_steps_per_second": 1.969,
      "step": 100
    },
    {
      "epoch": 0.19095248920209137,
      "grad_norm": 0.7681267261505127,
      "learning_rate": 0.00015800000000000002,
      "loss": 1.1277,
      "step": 105
    },
    {
      "epoch": 0.20004546487838146,
      "grad_norm": 0.7249591946601868,
      "learning_rate": 0.00015600000000000002,
      "loss": 1.142,
      "step": 110
    },
    {
      "epoch": 0.20004546487838146,
      "eval_loss": 1.137698769569397,
      "eval_runtime": 7.8436,
      "eval_samples_per_second": 3.825,
      "eval_steps_per_second": 1.912,
      "step": 110
    },
    {
      "epoch": 0.2091384405546715,
      "grad_norm": 0.6904309391975403,
      "learning_rate": 0.000154,
      "loss": 1.2033,
      "step": 115
    },
    {
      "epoch": 0.2182314162309616,
      "grad_norm": 0.7456697821617126,
      "learning_rate": 0.000152,
      "loss": 1.1777,
      "step": 120
    },
    {
      "epoch": 0.2182314162309616,
      "eval_loss": 1.1293922662734985,
      "eval_runtime": 7.2963,
      "eval_samples_per_second": 4.112,
      "eval_steps_per_second": 2.056,
      "step": 120
    },
    {
      "epoch": 0.22732439190725165,
      "grad_norm": 0.6743273735046387,
      "learning_rate": 0.00015000000000000001,
      "loss": 1.1582,
      "step": 125
    },
    {
      "epoch": 0.2364173675835417,
      "grad_norm": 0.6429440379142761,
      "learning_rate": 0.000148,
      "loss": 1.1064,
      "step": 130
    },
    {
      "epoch": 0.2364173675835417,
      "eval_loss": 1.119972825050354,
      "eval_runtime": 7.7787,
      "eval_samples_per_second": 3.857,
      "eval_steps_per_second": 1.928,
      "step": 130
    },
    {
      "epoch": 0.2455103432598318,
      "grad_norm": 0.6626828908920288,
      "learning_rate": 0.000146,
      "loss": 1.1741,
      "step": 135
    },
    {
      "epoch": 0.25460331893612187,
      "grad_norm": 0.8786306381225586,
      "learning_rate": 0.000144,
      "loss": 0.9836,
      "step": 140
    },
    {
      "epoch": 0.25460331893612187,
      "eval_loss": 1.1226236820220947,
      "eval_runtime": 7.3222,
      "eval_samples_per_second": 4.097,
      "eval_steps_per_second": 2.049,
      "step": 140
    },
    {
      "epoch": 0.2636962946124119,
      "grad_norm": 0.7686639428138733,
      "learning_rate": 0.000142,
      "loss": 1.0945,
      "step": 145
    },
    {
      "epoch": 0.272789270288702,
      "grad_norm": 0.795609712600708,
      "learning_rate": 0.00014,
      "loss": 0.9761,
      "step": 150
    },
    {
      "epoch": 0.272789270288702,
      "eval_loss": 1.0910608768463135,
      "eval_runtime": 7.8761,
      "eval_samples_per_second": 3.809,
      "eval_steps_per_second": 1.905,
      "step": 150
    },
    {
      "epoch": 0.28188224596499206,
      "grad_norm": 0.8161769509315491,
      "learning_rate": 0.000138,
      "loss": 1.0516,
      "step": 155
    },
    {
      "epoch": 0.2909752216412821,
      "grad_norm": 0.7441025972366333,
      "learning_rate": 0.00013600000000000003,
      "loss": 1.0843,
      "step": 160
    },
    {
      "epoch": 0.2909752216412821,
      "eval_loss": 1.0994905233383179,
      "eval_runtime": 7.3248,
      "eval_samples_per_second": 4.096,
      "eval_steps_per_second": 2.048,
      "step": 160
    },
    {
      "epoch": 0.30006819731757217,
      "grad_norm": 0.8015936613082886,
      "learning_rate": 0.000134,
      "loss": 1.2283,
      "step": 165
    },
    {
      "epoch": 0.30916117299386225,
      "grad_norm": 0.7653372287750244,
      "learning_rate": 0.000132,
      "loss": 1.0927,
      "step": 170
    },
    {
      "epoch": 0.30916117299386225,
      "eval_loss": 1.0781885385513306,
      "eval_runtime": 7.7433,
      "eval_samples_per_second": 3.874,
      "eval_steps_per_second": 1.937,
      "step": 170
    },
    {
      "epoch": 0.3182541486701523,
      "grad_norm": 0.7825664281845093,
      "learning_rate": 0.00013000000000000002,
      "loss": 1.106,
      "step": 175
    },
    {
      "epoch": 0.32734712434644236,
      "grad_norm": 0.7554489970207214,
      "learning_rate": 0.00012800000000000002,
      "loss": 1.0999,
      "step": 180
    },
    {
      "epoch": 0.32734712434644236,
      "eval_loss": 1.0733944177627563,
      "eval_runtime": 7.5964,
      "eval_samples_per_second": 3.949,
      "eval_steps_per_second": 1.975,
      "step": 180
    },
    {
      "epoch": 0.33644010002273245,
      "grad_norm": 0.8089460730552673,
      "learning_rate": 0.000126,
      "loss": 1.2226,
      "step": 185
    },
    {
      "epoch": 0.34553307569902253,
      "grad_norm": 0.7402002215385437,
      "learning_rate": 0.000124,
      "loss": 1.1182,
      "step": 190
    },
    {
      "epoch": 0.34553307569902253,
      "eval_loss": 1.0658830404281616,
      "eval_runtime": 7.8865,
      "eval_samples_per_second": 3.804,
      "eval_steps_per_second": 1.902,
      "step": 190
    },
    {
      "epoch": 0.35462605137531256,
      "grad_norm": 0.6649179458618164,
      "learning_rate": 0.000122,
      "loss": 1.0671,
      "step": 195
    },
    {
      "epoch": 0.36371902705160264,
      "grad_norm": 0.7573872804641724,
      "learning_rate": 0.00012,
      "loss": 1.0291,
      "step": 200
    },
    {
      "epoch": 0.36371902705160264,
      "eval_loss": 1.0471783876419067,
      "eval_runtime": 7.9526,
      "eval_samples_per_second": 3.772,
      "eval_steps_per_second": 1.886,
      "step": 200
    },
    {
      "epoch": 0.3728120027278927,
      "grad_norm": 0.8243398666381836,
      "learning_rate": 0.000118,
      "loss": 1.1096,
      "step": 205
    },
    {
      "epoch": 0.38190497840418275,
      "grad_norm": 0.721502423286438,
      "learning_rate": 0.000116,
      "loss": 1.2158,
      "step": 210
    },
    {
      "epoch": 0.38190497840418275,
      "eval_loss": 1.0554709434509277,
      "eval_runtime": 7.3409,
      "eval_samples_per_second": 4.087,
      "eval_steps_per_second": 2.043,
      "step": 210
    },
    {
      "epoch": 0.39099795408047283,
      "grad_norm": 0.7591432332992554,
      "learning_rate": 0.00011399999999999999,
      "loss": 1.0817,
      "step": 215
    },
    {
      "epoch": 0.4000909297567629,
      "grad_norm": 0.7596343755722046,
      "learning_rate": 0.00011200000000000001,
      "loss": 1.0873,
      "step": 220
    },
    {
      "epoch": 0.4000909297567629,
      "eval_loss": 1.0482908487319946,
      "eval_runtime": 7.8536,
      "eval_samples_per_second": 3.82,
      "eval_steps_per_second": 1.91,
      "step": 220
    },
    {
      "epoch": 0.40918390543305294,
      "grad_norm": 0.8296840190887451,
      "learning_rate": 0.00011000000000000002,
      "loss": 1.0252,
      "step": 225
    },
    {
      "epoch": 0.418276881109343,
      "grad_norm": 0.9094285368919373,
      "learning_rate": 0.00010800000000000001,
      "loss": 1.0978,
      "step": 230
    },
    {
      "epoch": 0.418276881109343,
      "eval_loss": 1.046170711517334,
      "eval_runtime": 7.4472,
      "eval_samples_per_second": 4.028,
      "eval_steps_per_second": 2.014,
      "step": 230
    },
    {
      "epoch": 0.4273698567856331,
      "grad_norm": 0.8471206426620483,
      "learning_rate": 0.00010600000000000002,
      "loss": 1.0371,
      "step": 235
    },
    {
      "epoch": 0.4364628324619232,
      "grad_norm": 0.8168342113494873,
      "learning_rate": 0.00010400000000000001,
      "loss": 1.0352,
      "step": 240
    },
    {
      "epoch": 0.4364628324619232,
      "eval_loss": 1.0409115552902222,
      "eval_runtime": 7.8502,
      "eval_samples_per_second": 3.822,
      "eval_steps_per_second": 1.911,
      "step": 240
    },
    {
      "epoch": 0.4455558081382132,
      "grad_norm": 0.7482770681381226,
      "learning_rate": 0.00010200000000000001,
      "loss": 1.0812,
      "step": 245
    },
    {
      "epoch": 0.4546487838145033,
      "grad_norm": 0.7300863862037659,
      "learning_rate": 0.0001,
      "loss": 1.1762,
      "step": 250
    },
    {
      "epoch": 0.4546487838145033,
      "eval_loss": 1.0410172939300537,
      "eval_runtime": 7.4872,
      "eval_samples_per_second": 4.007,
      "eval_steps_per_second": 2.003,
      "step": 250
    },
    {
      "epoch": 0.4637417594907934,
      "grad_norm": 0.7066290378570557,
      "learning_rate": 9.8e-05,
      "loss": 1.1054,
      "step": 255
    },
    {
      "epoch": 0.4728347351670834,
      "grad_norm": 0.8214625716209412,
      "learning_rate": 9.6e-05,
      "loss": 1.0563,
      "step": 260
    },
    {
      "epoch": 0.4728347351670834,
      "eval_loss": 1.03702974319458,
      "eval_runtime": 7.8723,
      "eval_samples_per_second": 3.811,
      "eval_steps_per_second": 1.905,
      "step": 260
    },
    {
      "epoch": 0.4819277108433735,
      "grad_norm": 0.8834312558174133,
      "learning_rate": 9.4e-05,
      "loss": 1.1071,
      "step": 265
    },
    {
      "epoch": 0.4910206865196636,
      "grad_norm": 0.768332302570343,
      "learning_rate": 9.200000000000001e-05,
      "loss": 1.0537,
      "step": 270
    },
    {
      "epoch": 0.4910206865196636,
      "eval_loss": 1.033887267112732,
      "eval_runtime": 7.7503,
      "eval_samples_per_second": 3.871,
      "eval_steps_per_second": 1.935,
      "step": 270
    },
    {
      "epoch": 0.5001136621959537,
      "grad_norm": 0.805924654006958,
      "learning_rate": 9e-05,
      "loss": 1.1193,
      "step": 275
    },
    {
      "epoch": 0.5092066378722437,
      "grad_norm": 0.8571528792381287,
      "learning_rate": 8.800000000000001e-05,
      "loss": 1.0951,
      "step": 280
    },
    {
      "epoch": 0.5092066378722437,
      "eval_loss": 1.0283806324005127,
      "eval_runtime": 7.6361,
      "eval_samples_per_second": 3.929,
      "eval_steps_per_second": 1.964,
      "step": 280
    },
    {
      "epoch": 0.5182996135485337,
      "grad_norm": 0.8743025064468384,
      "learning_rate": 8.6e-05,
      "loss": 0.9861,
      "step": 285
    },
    {
      "epoch": 0.5273925892248238,
      "grad_norm": 0.8119250535964966,
      "learning_rate": 8.4e-05,
      "loss": 1.0458,
      "step": 290
    },
    {
      "epoch": 0.5273925892248238,
      "eval_loss": 1.0257965326309204,
      "eval_runtime": 7.8945,
      "eval_samples_per_second": 3.8,
      "eval_steps_per_second": 1.9,
      "step": 290
    },
    {
      "epoch": 0.5364855649011139,
      "grad_norm": 0.9032679796218872,
      "learning_rate": 8.2e-05,
      "loss": 1.0145,
      "step": 295
    },
    {
      "epoch": 0.545578540577404,
      "grad_norm": 0.8125148415565491,
      "learning_rate": 8e-05,
      "loss": 1.0212,
      "step": 300
    },
    {
      "epoch": 0.545578540577404,
      "eval_loss": 1.018557071685791,
      "eval_runtime": 7.5438,
      "eval_samples_per_second": 3.977,
      "eval_steps_per_second": 1.988,
      "step": 300
    },
    {
      "epoch": 0.554671516253694,
      "grad_norm": 0.77150958776474,
      "learning_rate": 7.800000000000001e-05,
      "loss": 1.0901,
      "step": 305
    },
    {
      "epoch": 0.5637644919299841,
      "grad_norm": 0.8303976058959961,
      "learning_rate": 7.6e-05,
      "loss": 1.0535,
      "step": 310
    },
    {
      "epoch": 0.5637644919299841,
      "eval_loss": 1.019250750541687,
      "eval_runtime": 7.9264,
      "eval_samples_per_second": 3.785,
      "eval_steps_per_second": 1.892,
      "step": 310
    },
    {
      "epoch": 0.5728574676062742,
      "grad_norm": 0.8433631658554077,
      "learning_rate": 7.4e-05,
      "loss": 1.1187,
      "step": 315
    },
    {
      "epoch": 0.5819504432825642,
      "grad_norm": 0.8279653787612915,
      "learning_rate": 7.2e-05,
      "loss": 1.1483,
      "step": 320
    },
    {
      "epoch": 0.5819504432825642,
      "eval_loss": 1.0166659355163574,
      "eval_runtime": 7.3093,
      "eval_samples_per_second": 4.104,
      "eval_steps_per_second": 2.052,
      "step": 320
    },
    {
      "epoch": 0.5910434189588543,
      "grad_norm": 0.6873704791069031,
      "learning_rate": 7e-05,
      "loss": 1.0573,
      "step": 325
    },
    {
      "epoch": 0.6001363946351443,
      "grad_norm": 0.7217792868614197,
      "learning_rate": 6.800000000000001e-05,
      "loss": 1.0225,
      "step": 330
    },
    {
      "epoch": 0.6001363946351443,
      "eval_loss": 1.0203421115875244,
      "eval_runtime": 7.9938,
      "eval_samples_per_second": 3.753,
      "eval_steps_per_second": 1.876,
      "step": 330
    },
    {
      "epoch": 0.6092293703114344,
      "grad_norm": 0.828619122505188,
      "learning_rate": 6.6e-05,
      "loss": 1.0272,
      "step": 335
    },
    {
      "epoch": 0.6183223459877245,
      "grad_norm": 0.7822660207748413,
      "learning_rate": 6.400000000000001e-05,
      "loss": 0.9776,
      "step": 340
    },
    {
      "epoch": 0.6183223459877245,
      "eval_loss": 1.0186898708343506,
      "eval_runtime": 7.3434,
      "eval_samples_per_second": 4.085,
      "eval_steps_per_second": 2.043,
      "step": 340
    },
    {
      "epoch": 0.6274153216640146,
      "grad_norm": 0.7307916283607483,
      "learning_rate": 6.2e-05,
      "loss": 1.0637,
      "step": 345
    },
    {
      "epoch": 0.6365082973403046,
      "grad_norm": 0.8595789670944214,
      "learning_rate": 6e-05,
      "loss": 1.0571,
      "step": 350
    },
    {
      "epoch": 0.6365082973403046,
      "eval_loss": 1.008802056312561,
      "eval_runtime": 7.6422,
      "eval_samples_per_second": 3.926,
      "eval_steps_per_second": 1.963,
      "step": 350
    },
    {
      "epoch": 0.6456012730165946,
      "grad_norm": 1.0007542371749878,
      "learning_rate": 5.8e-05,
      "loss": 1.1277,
      "step": 355
    },
    {
      "epoch": 0.6546942486928847,
      "grad_norm": 0.8014799356460571,
      "learning_rate": 5.6000000000000006e-05,
      "loss": 1.2342,
      "step": 360
    },
    {
      "epoch": 0.6546942486928847,
      "eval_loss": 1.0054609775543213,
      "eval_runtime": 7.5443,
      "eval_samples_per_second": 3.977,
      "eval_steps_per_second": 1.988,
      "step": 360
    },
    {
      "epoch": 0.6637872243691748,
      "grad_norm": 0.8301798105239868,
      "learning_rate": 5.4000000000000005e-05,
      "loss": 1.0886,
      "step": 365
    },
    {
      "epoch": 0.6728802000454649,
      "grad_norm": 0.8582270741462708,
      "learning_rate": 5.2000000000000004e-05,
      "loss": 1.0834,
      "step": 370
    },
    {
      "epoch": 0.6728802000454649,
      "eval_loss": 0.9980356693267822,
      "eval_runtime": 7.3182,
      "eval_samples_per_second": 4.099,
      "eval_steps_per_second": 2.05,
      "step": 370
    },
    {
      "epoch": 0.681973175721755,
      "grad_norm": 0.9084227085113525,
      "learning_rate": 5e-05,
      "loss": 1.0517,
      "step": 375
    },
    {
      "epoch": 0.6910661513980451,
      "grad_norm": 0.8120643496513367,
      "learning_rate": 4.8e-05,
      "loss": 1.0931,
      "step": 380
    },
    {
      "epoch": 0.6910661513980451,
      "eval_loss": 0.9912369847297668,
      "eval_runtime": 7.8444,
      "eval_samples_per_second": 3.824,
      "eval_steps_per_second": 1.912,
      "step": 380
    },
    {
      "epoch": 0.700159127074335,
      "grad_norm": 0.8523077964782715,
      "learning_rate": 4.600000000000001e-05,
      "loss": 1.0883,
      "step": 385
    },
    {
      "epoch": 0.7092521027506251,
      "grad_norm": 0.8379296660423279,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 1.1041,
      "step": 390
    },
    {
      "epoch": 0.7092521027506251,
      "eval_loss": 0.9924930334091187,
      "eval_runtime": 7.364,
      "eval_samples_per_second": 4.074,
      "eval_steps_per_second": 2.037,
      "step": 390
    },
    {
      "epoch": 0.7183450784269152,
      "grad_norm": 0.9272042512893677,
      "learning_rate": 4.2e-05,
      "loss": 1.0839,
      "step": 395
    },
    {
      "epoch": 0.7274380541032053,
      "grad_norm": 0.8774125576019287,
      "learning_rate": 4e-05,
      "loss": 0.9889,
      "step": 400
    },
    {
      "epoch": 0.7274380541032053,
      "eval_loss": 0.9954690337181091,
      "eval_runtime": 7.8404,
      "eval_samples_per_second": 3.826,
      "eval_steps_per_second": 1.913,
      "step": 400
    },
    {
      "epoch": 0.7365310297794954,
      "grad_norm": 0.7553389072418213,
      "learning_rate": 3.8e-05,
      "loss": 1.0906,
      "step": 405
    },
    {
      "epoch": 0.7456240054557854,
      "grad_norm": 0.7866451740264893,
      "learning_rate": 3.6e-05,
      "loss": 1.0219,
      "step": 410
    },
    {
      "epoch": 0.7456240054557854,
      "eval_loss": 0.994717001914978,
      "eval_runtime": 7.8266,
      "eval_samples_per_second": 3.833,
      "eval_steps_per_second": 1.917,
      "step": 410
    },
    {
      "epoch": 0.7547169811320755,
      "grad_norm": 0.8554181456565857,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 1.0598,
      "step": 415
    },
    {
      "epoch": 0.7638099568083655,
      "grad_norm": 0.9773761034011841,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 1.033,
      "step": 420
    },
    {
      "epoch": 0.7638099568083655,
      "eval_loss": 0.9926409125328064,
      "eval_runtime": 7.2819,
      "eval_samples_per_second": 4.12,
      "eval_steps_per_second": 2.06,
      "step": 420
    },
    {
      "epoch": 0.7729029324846556,
      "grad_norm": 0.8768495917320251,
      "learning_rate": 3e-05,
      "loss": 1.054,
      "step": 425
    },
    {
      "epoch": 0.7819959081609457,
      "grad_norm": 0.787002682685852,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 1.0548,
      "step": 430
    },
    {
      "epoch": 0.7819959081609457,
      "eval_loss": 0.9910202622413635,
      "eval_runtime": 7.8704,
      "eval_samples_per_second": 3.812,
      "eval_steps_per_second": 1.906,
      "step": 430
    },
    {
      "epoch": 0.7910888838372357,
      "grad_norm": 0.843839704990387,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 1.0936,
      "step": 435
    },
    {
      "epoch": 0.8001818595135258,
      "grad_norm": 0.9202592968940735,
      "learning_rate": 2.4e-05,
      "loss": 1.0684,
      "step": 440
    },
    {
      "epoch": 0.8001818595135258,
      "eval_loss": 0.9879806637763977,
      "eval_runtime": 7.292,
      "eval_samples_per_second": 4.114,
      "eval_steps_per_second": 2.057,
      "step": 440
    },
    {
      "epoch": 0.8092748351898159,
      "grad_norm": 0.8747548460960388,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 1.0185,
      "step": 445
    },
    {
      "epoch": 0.8183678108661059,
      "grad_norm": 0.8311501145362854,
      "learning_rate": 2e-05,
      "loss": 1.0874,
      "step": 450
    },
    {
      "epoch": 0.8183678108661059,
      "eval_loss": 0.9860556125640869,
      "eval_runtime": 7.7936,
      "eval_samples_per_second": 3.849,
      "eval_steps_per_second": 1.925,
      "step": 450
    },
    {
      "epoch": 0.827460786542396,
      "grad_norm": 0.8813076615333557,
      "learning_rate": 1.8e-05,
      "loss": 1.0209,
      "step": 455
    },
    {
      "epoch": 0.836553762218686,
      "grad_norm": 0.9480300545692444,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 1.0878,
      "step": 460
    },
    {
      "epoch": 0.836553762218686,
      "eval_loss": 0.9852551817893982,
      "eval_runtime": 7.2978,
      "eval_samples_per_second": 4.111,
      "eval_steps_per_second": 2.055,
      "step": 460
    },
    {
      "epoch": 0.8456467378949761,
      "grad_norm": 0.8942534923553467,
      "learning_rate": 1.4000000000000001e-05,
      "loss": 0.9746,
      "step": 465
    },
    {
      "epoch": 0.8547397135712662,
      "grad_norm": 0.9491382837295532,
      "learning_rate": 1.2e-05,
      "loss": 0.9443,
      "step": 470
    },
    {
      "epoch": 0.8547397135712662,
      "eval_loss": 0.9845015406608582,
      "eval_runtime": 7.7967,
      "eval_samples_per_second": 3.848,
      "eval_steps_per_second": 1.924,
      "step": 470
    },
    {
      "epoch": 0.8638326892475563,
      "grad_norm": 0.9191480278968811,
      "learning_rate": 1e-05,
      "loss": 1.0311,
      "step": 475
    },
    {
      "epoch": 0.8729256649238464,
      "grad_norm": 0.8474745750427246,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.1006,
      "step": 480
    },
    {
      "epoch": 0.8729256649238464,
      "eval_loss": 0.9836694002151489,
      "eval_runtime": 7.3154,
      "eval_samples_per_second": 4.101,
      "eval_steps_per_second": 2.05,
      "step": 480
    },
    {
      "epoch": 0.8820186406001363,
      "grad_norm": 0.8463994860649109,
      "learning_rate": 6e-06,
      "loss": 1.0196,
      "step": 485
    },
    {
      "epoch": 0.8911116162764264,
      "grad_norm": 0.8902223706245422,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.0447,
      "step": 490
    },
    {
      "epoch": 0.8911116162764264,
      "eval_loss": 0.9838915467262268,
      "eval_runtime": 7.7764,
      "eval_samples_per_second": 3.858,
      "eval_steps_per_second": 1.929,
      "step": 490
    },
    {
      "epoch": 0.9002045919527165,
      "grad_norm": 0.8993239998817444,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.0981,
      "step": 495
    },
    {
      "epoch": 0.9092975676290066,
      "grad_norm": 0.8118588924407959,
      "learning_rate": 0.0,
      "loss": 1.0078,
      "step": 500
    },
    {
      "epoch": 0.9092975676290066,
      "eval_loss": 0.9837616086006165,
      "eval_runtime": 7.456,
      "eval_samples_per_second": 4.024,
      "eval_steps_per_second": 2.012,
      "step": 500
    }
  ],
  "logging_steps": 5,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 20,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.068977650017795e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}