| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.9983102399459276, |
| "eval_steps": 500, |
| "global_step": 2217, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.013518080432578574, |
| "grad_norm": 3.0995258158495695, |
| "learning_rate": 5e-06, |
| "loss": 0.845, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.027036160865157147, |
| "grad_norm": 2.4721955543947596, |
| "learning_rate": 5e-06, |
| "loss": 0.6931, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.04055424129773572, |
| "grad_norm": 2.8908078365820544, |
| "learning_rate": 5e-06, |
| "loss": 0.6512, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.054072321730314295, |
| "grad_norm": 2.128304353555308, |
| "learning_rate": 5e-06, |
| "loss": 0.6546, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.06759040216289287, |
| "grad_norm": 1.8583110367716618, |
| "learning_rate": 5e-06, |
| "loss": 0.6376, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.08110848259547145, |
| "grad_norm": 1.9572074992289452, |
| "learning_rate": 5e-06, |
| "loss": 0.6293, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.09462656302805002, |
| "grad_norm": 2.1439310974732906, |
| "learning_rate": 5e-06, |
| "loss": 0.6303, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.10814464346062859, |
| "grad_norm": 2.1027938324907276, |
| "learning_rate": 5e-06, |
| "loss": 0.6263, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.12166272389320716, |
| "grad_norm": 2.0994621637558613, |
| "learning_rate": 5e-06, |
| "loss": 0.6197, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.13518080432578575, |
| "grad_norm": 2.3558311705586434, |
| "learning_rate": 5e-06, |
| "loss": 0.6137, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.14869888475836432, |
| "grad_norm": 3.6078661884191936, |
| "learning_rate": 5e-06, |
| "loss": 0.6143, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.1622169651909429, |
| "grad_norm": 2.0098953064632252, |
| "learning_rate": 5e-06, |
| "loss": 0.6119, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.17573504562352146, |
| "grad_norm": 1.7892961602329431, |
| "learning_rate": 5e-06, |
| "loss": 0.6082, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.18925312605610004, |
| "grad_norm": 2.495233222827848, |
| "learning_rate": 5e-06, |
| "loss": 0.6105, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.2027712064886786, |
| "grad_norm": 2.73669002831843, |
| "learning_rate": 5e-06, |
| "loss": 0.6005, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.21628928692125718, |
| "grad_norm": 2.25755467972834, |
| "learning_rate": 5e-06, |
| "loss": 0.6041, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.22980736735383575, |
| "grad_norm": 2.2899488859758317, |
| "learning_rate": 5e-06, |
| "loss": 0.6042, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.24332544778641432, |
| "grad_norm": 1.9924897200956881, |
| "learning_rate": 5e-06, |
| "loss": 0.5942, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.2568435282189929, |
| "grad_norm": 2.0832289534715596, |
| "learning_rate": 5e-06, |
| "loss": 0.6008, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.2703616086515715, |
| "grad_norm": 1.9923009880039881, |
| "learning_rate": 5e-06, |
| "loss": 0.6039, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.28387968908415007, |
| "grad_norm": 1.6840960209729652, |
| "learning_rate": 5e-06, |
| "loss": 0.6056, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.29739776951672864, |
| "grad_norm": 1.644653139908748, |
| "learning_rate": 5e-06, |
| "loss": 0.5976, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.3109158499493072, |
| "grad_norm": 1.815796136594004, |
| "learning_rate": 5e-06, |
| "loss": 0.5983, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.3244339303818858, |
| "grad_norm": 2.0476220349655736, |
| "learning_rate": 5e-06, |
| "loss": 0.5985, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.33795201081446435, |
| "grad_norm": 1.826188487172147, |
| "learning_rate": 5e-06, |
| "loss": 0.5991, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.3514700912470429, |
| "grad_norm": 1.8779821757544928, |
| "learning_rate": 5e-06, |
| "loss": 0.6011, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.3649881716796215, |
| "grad_norm": 1.546297402705703, |
| "learning_rate": 5e-06, |
| "loss": 0.5991, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.37850625211220007, |
| "grad_norm": 1.9100992643242412, |
| "learning_rate": 5e-06, |
| "loss": 0.6016, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.39202433254477864, |
| "grad_norm": 1.633188543050389, |
| "learning_rate": 5e-06, |
| "loss": 0.5931, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.4055424129773572, |
| "grad_norm": 1.7612843300351624, |
| "learning_rate": 5e-06, |
| "loss": 0.5912, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.4190604934099358, |
| "grad_norm": 1.5609583873755621, |
| "learning_rate": 5e-06, |
| "loss": 0.5983, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.43257857384251436, |
| "grad_norm": 1.6780024100547228, |
| "learning_rate": 5e-06, |
| "loss": 0.5899, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.44609665427509293, |
| "grad_norm": 2.142609724069825, |
| "learning_rate": 5e-06, |
| "loss": 0.5905, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.4596147347076715, |
| "grad_norm": 1.7302040609097082, |
| "learning_rate": 5e-06, |
| "loss": 0.5977, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.4731328151402501, |
| "grad_norm": 1.6276169982986228, |
| "learning_rate": 5e-06, |
| "loss": 0.5883, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.48665089557282865, |
| "grad_norm": 1.6710934957701402, |
| "learning_rate": 5e-06, |
| "loss": 0.5974, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.5001689760054072, |
| "grad_norm": 1.767516047841104, |
| "learning_rate": 5e-06, |
| "loss": 0.5912, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.5136870564379858, |
| "grad_norm": 1.6695942153323693, |
| "learning_rate": 5e-06, |
| "loss": 0.5891, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.5272051368705644, |
| "grad_norm": 1.7527726211773795, |
| "learning_rate": 5e-06, |
| "loss": 0.5908, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.540723217303143, |
| "grad_norm": 1.651084341663377, |
| "learning_rate": 5e-06, |
| "loss": 0.5858, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.5542412977357215, |
| "grad_norm": 1.7390619949343342, |
| "learning_rate": 5e-06, |
| "loss": 0.595, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.5677593781683001, |
| "grad_norm": 1.4988408072021968, |
| "learning_rate": 5e-06, |
| "loss": 0.5846, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.5812774586008786, |
| "grad_norm": 1.573068476680162, |
| "learning_rate": 5e-06, |
| "loss": 0.5814, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.5947955390334573, |
| "grad_norm": 1.6120372463584451, |
| "learning_rate": 5e-06, |
| "loss": 0.5899, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.6083136194660358, |
| "grad_norm": 1.6274467183788262, |
| "learning_rate": 5e-06, |
| "loss": 0.59, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.6218316998986144, |
| "grad_norm": 1.7252328462432982, |
| "learning_rate": 5e-06, |
| "loss": 0.5854, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.6353497803311929, |
| "grad_norm": 1.737699797028079, |
| "learning_rate": 5e-06, |
| "loss": 0.5806, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.6488678607637716, |
| "grad_norm": 1.5370682500918078, |
| "learning_rate": 5e-06, |
| "loss": 0.5881, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.6623859411963501, |
| "grad_norm": 1.7740843576068728, |
| "learning_rate": 5e-06, |
| "loss": 0.5806, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.6759040216289287, |
| "grad_norm": 1.5652522580101986, |
| "learning_rate": 5e-06, |
| "loss": 0.572, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.6894221020615072, |
| "grad_norm": 1.5568021887696903, |
| "learning_rate": 5e-06, |
| "loss": 0.58, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.7029401824940859, |
| "grad_norm": 1.5175784957924134, |
| "learning_rate": 5e-06, |
| "loss": 0.579, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.7164582629266644, |
| "grad_norm": 1.4295771272533815, |
| "learning_rate": 5e-06, |
| "loss": 0.5871, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.729976343359243, |
| "grad_norm": 1.7204826587382944, |
| "learning_rate": 5e-06, |
| "loss": 0.5817, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.7434944237918215, |
| "grad_norm": 1.577838279477792, |
| "learning_rate": 5e-06, |
| "loss": 0.5779, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.7570125042244001, |
| "grad_norm": 1.6528962259307287, |
| "learning_rate": 5e-06, |
| "loss": 0.5812, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.7705305846569787, |
| "grad_norm": 1.6042870663862332, |
| "learning_rate": 5e-06, |
| "loss": 0.5786, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.7840486650895573, |
| "grad_norm": 1.709986377305198, |
| "learning_rate": 5e-06, |
| "loss": 0.5793, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.7975667455221359, |
| "grad_norm": 1.7924690518428852, |
| "learning_rate": 5e-06, |
| "loss": 0.5803, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.8110848259547144, |
| "grad_norm": 1.5055046913979044, |
| "learning_rate": 5e-06, |
| "loss": 0.5768, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.824602906387293, |
| "grad_norm": 1.4964520058827533, |
| "learning_rate": 5e-06, |
| "loss": 0.5759, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.8381209868198716, |
| "grad_norm": 1.6720625396672169, |
| "learning_rate": 5e-06, |
| "loss": 0.5754, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.8516390672524502, |
| "grad_norm": 1.5463769829403606, |
| "learning_rate": 5e-06, |
| "loss": 0.5786, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.8651571476850287, |
| "grad_norm": 1.4669224733461368, |
| "learning_rate": 5e-06, |
| "loss": 0.5708, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.8786752281176073, |
| "grad_norm": 1.520619999962285, |
| "learning_rate": 5e-06, |
| "loss": 0.574, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.8921933085501859, |
| "grad_norm": 1.5914892508820981, |
| "learning_rate": 5e-06, |
| "loss": 0.5783, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.9057113889827645, |
| "grad_norm": 1.8577143073111821, |
| "learning_rate": 5e-06, |
| "loss": 0.5829, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.919229469415343, |
| "grad_norm": 1.5962074409528684, |
| "learning_rate": 5e-06, |
| "loss": 0.5838, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.9327475498479216, |
| "grad_norm": 1.446689597838186, |
| "learning_rate": 5e-06, |
| "loss": 0.5756, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.9462656302805001, |
| "grad_norm": 1.510508928886103, |
| "learning_rate": 5e-06, |
| "loss": 0.5738, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.9597837107130788, |
| "grad_norm": 1.4607211586070719, |
| "learning_rate": 5e-06, |
| "loss": 0.5757, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.9733017911456573, |
| "grad_norm": 1.4221599670931588, |
| "learning_rate": 5e-06, |
| "loss": 0.5735, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.9868198715782359, |
| "grad_norm": 1.484579352264901, |
| "learning_rate": 5e-06, |
| "loss": 0.5772, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.9989861439675566, |
| "eval_loss": 0.1434517651796341, |
| "eval_runtime": 379.3166, |
| "eval_samples_per_second": 26.271, |
| "eval_steps_per_second": 0.411, |
| "step": 739 |
| }, |
| { |
| "epoch": 1.0010138560324433, |
| "grad_norm": 2.7421566702088738, |
| "learning_rate": 5e-06, |
| "loss": 0.5708, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.014531936465022, |
| "grad_norm": 1.8529103798230466, |
| "learning_rate": 5e-06, |
| "loss": 0.4629, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.0280500168976006, |
| "grad_norm": 1.67580460086375, |
| "learning_rate": 5e-06, |
| "loss": 0.46, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.041568097330179, |
| "grad_norm": 1.640470300013924, |
| "learning_rate": 5e-06, |
| "loss": 0.4562, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.0550861777627576, |
| "grad_norm": 1.6699136465078195, |
| "learning_rate": 5e-06, |
| "loss": 0.4536, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.0686042581953363, |
| "grad_norm": 1.7807842688633373, |
| "learning_rate": 5e-06, |
| "loss": 0.4662, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.0821223386279148, |
| "grad_norm": 1.6145188030618127, |
| "learning_rate": 5e-06, |
| "loss": 0.4587, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.0956404190604934, |
| "grad_norm": 1.6719771276304467, |
| "learning_rate": 5e-06, |
| "loss": 0.4581, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.1091584994930719, |
| "grad_norm": 1.479688368315656, |
| "learning_rate": 5e-06, |
| "loss": 0.456, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.1226765799256506, |
| "grad_norm": 1.5743910150201328, |
| "learning_rate": 5e-06, |
| "loss": 0.4604, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.1361946603582291, |
| "grad_norm": 1.5023364194883289, |
| "learning_rate": 5e-06, |
| "loss": 0.4652, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.1497127407908077, |
| "grad_norm": 1.7167217243377628, |
| "learning_rate": 5e-06, |
| "loss": 0.472, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.1632308212233862, |
| "grad_norm": 1.5221498447986286, |
| "learning_rate": 5e-06, |
| "loss": 0.4665, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.176748901655965, |
| "grad_norm": 1.7809772273502287, |
| "learning_rate": 5e-06, |
| "loss": 0.4656, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.1902669820885434, |
| "grad_norm": 1.7250751373229314, |
| "learning_rate": 5e-06, |
| "loss": 0.4581, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.203785062521122, |
| "grad_norm": 1.861343520779267, |
| "learning_rate": 5e-06, |
| "loss": 0.4636, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.2173031429537007, |
| "grad_norm": 2.0625018676728715, |
| "learning_rate": 5e-06, |
| "loss": 0.4659, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.2308212233862792, |
| "grad_norm": 1.551721464148321, |
| "learning_rate": 5e-06, |
| "loss": 0.4629, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.2443393038188577, |
| "grad_norm": 1.7884538726855719, |
| "learning_rate": 5e-06, |
| "loss": 0.4707, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.2578573842514362, |
| "grad_norm": 1.6748195476683634, |
| "learning_rate": 5e-06, |
| "loss": 0.4711, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.2713754646840147, |
| "grad_norm": 1.7623689240351639, |
| "learning_rate": 5e-06, |
| "loss": 0.4649, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.2848935451165935, |
| "grad_norm": 1.7011947665040341, |
| "learning_rate": 5e-06, |
| "loss": 0.4634, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.298411625549172, |
| "grad_norm": 1.5930792336293036, |
| "learning_rate": 5e-06, |
| "loss": 0.4646, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.3119297059817505, |
| "grad_norm": 1.727734986402252, |
| "learning_rate": 5e-06, |
| "loss": 0.4593, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.3254477864143293, |
| "grad_norm": 1.5431155855105338, |
| "learning_rate": 5e-06, |
| "loss": 0.4686, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.3389658668469078, |
| "grad_norm": 1.701221393426559, |
| "learning_rate": 5e-06, |
| "loss": 0.4647, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.3524839472794863, |
| "grad_norm": 1.5946440906826853, |
| "learning_rate": 5e-06, |
| "loss": 0.474, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.3660020277120648, |
| "grad_norm": 1.570462820909487, |
| "learning_rate": 5e-06, |
| "loss": 0.4715, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.3795201081446435, |
| "grad_norm": 1.5515085427705777, |
| "learning_rate": 5e-06, |
| "loss": 0.4658, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.393038188577222, |
| "grad_norm": 1.612110437850543, |
| "learning_rate": 5e-06, |
| "loss": 0.475, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.4065562690098006, |
| "grad_norm": 1.48793819280309, |
| "learning_rate": 5e-06, |
| "loss": 0.4711, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.420074349442379, |
| "grad_norm": 1.5900270531941814, |
| "learning_rate": 5e-06, |
| "loss": 0.466, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.4335924298749578, |
| "grad_norm": 1.625411671224631, |
| "learning_rate": 5e-06, |
| "loss": 0.4655, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.4471105103075363, |
| "grad_norm": 1.7287593338346183, |
| "learning_rate": 5e-06, |
| "loss": 0.4718, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.4606285907401149, |
| "grad_norm": 1.5782208709005707, |
| "learning_rate": 5e-06, |
| "loss": 0.4711, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.4741466711726936, |
| "grad_norm": 1.7240525193349314, |
| "learning_rate": 5e-06, |
| "loss": 0.4683, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.4876647516052721, |
| "grad_norm": 1.7400342420780646, |
| "learning_rate": 5e-06, |
| "loss": 0.4695, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.5011828320378506, |
| "grad_norm": 1.5363707121370902, |
| "learning_rate": 5e-06, |
| "loss": 0.4724, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.5147009124704292, |
| "grad_norm": 1.7266066662849726, |
| "learning_rate": 5e-06, |
| "loss": 0.4709, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.5282189929030077, |
| "grad_norm": 1.7190911907364863, |
| "learning_rate": 5e-06, |
| "loss": 0.4645, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.5417370733355864, |
| "grad_norm": 1.606217517626092, |
| "learning_rate": 5e-06, |
| "loss": 0.4702, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.555255153768165, |
| "grad_norm": 1.766144995012523, |
| "learning_rate": 5e-06, |
| "loss": 0.4739, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.5687732342007434, |
| "grad_norm": 1.5926253751713118, |
| "learning_rate": 5e-06, |
| "loss": 0.4672, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.5822913146333222, |
| "grad_norm": 1.709203191355986, |
| "learning_rate": 5e-06, |
| "loss": 0.4731, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.5958093950659007, |
| "grad_norm": 1.5574946245332464, |
| "learning_rate": 5e-06, |
| "loss": 0.4703, |
| "step": 1180 |
| }, |
| { |
| "epoch": 1.6093274754984792, |
| "grad_norm": 1.584669174527815, |
| "learning_rate": 5e-06, |
| "loss": 0.4705, |
| "step": 1190 |
| }, |
| { |
| "epoch": 1.622845555931058, |
| "grad_norm": 1.5042749804205873, |
| "learning_rate": 5e-06, |
| "loss": 0.47, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.6363636363636362, |
| "grad_norm": 1.6609143139694778, |
| "learning_rate": 5e-06, |
| "loss": 0.4677, |
| "step": 1210 |
| }, |
| { |
| "epoch": 1.649881716796215, |
| "grad_norm": 1.7891835221599115, |
| "learning_rate": 5e-06, |
| "loss": 0.4719, |
| "step": 1220 |
| }, |
| { |
| "epoch": 1.6633997972287935, |
| "grad_norm": 1.5476583418714311, |
| "learning_rate": 5e-06, |
| "loss": 0.4721, |
| "step": 1230 |
| }, |
| { |
| "epoch": 1.676917877661372, |
| "grad_norm": 1.6549785438500684, |
| "learning_rate": 5e-06, |
| "loss": 0.4751, |
| "step": 1240 |
| }, |
| { |
| "epoch": 1.6904359580939508, |
| "grad_norm": 1.6039460840462256, |
| "learning_rate": 5e-06, |
| "loss": 0.4845, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.7039540385265293, |
| "grad_norm": 1.6896161846830133, |
| "learning_rate": 5e-06, |
| "loss": 0.4729, |
| "step": 1260 |
| }, |
| { |
| "epoch": 1.7174721189591078, |
| "grad_norm": 1.5618414410556232, |
| "learning_rate": 5e-06, |
| "loss": 0.4776, |
| "step": 1270 |
| }, |
| { |
| "epoch": 1.7309901993916865, |
| "grad_norm": 1.5952861695756622, |
| "learning_rate": 5e-06, |
| "loss": 0.4779, |
| "step": 1280 |
| }, |
| { |
| "epoch": 1.7445082798242648, |
| "grad_norm": 1.699598571695824, |
| "learning_rate": 5e-06, |
| "loss": 0.4751, |
| "step": 1290 |
| }, |
| { |
| "epoch": 1.7580263602568436, |
| "grad_norm": 1.809834320799901, |
| "learning_rate": 5e-06, |
| "loss": 0.4739, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.771544440689422, |
| "grad_norm": 1.6219739662720212, |
| "learning_rate": 5e-06, |
| "loss": 0.473, |
| "step": 1310 |
| }, |
| { |
| "epoch": 1.7850625211220006, |
| "grad_norm": 1.6762754772871185, |
| "learning_rate": 5e-06, |
| "loss": 0.4721, |
| "step": 1320 |
| }, |
| { |
| "epoch": 1.7985806015545793, |
| "grad_norm": 1.6645833857767827, |
| "learning_rate": 5e-06, |
| "loss": 0.4755, |
| "step": 1330 |
| }, |
| { |
| "epoch": 1.8120986819871578, |
| "grad_norm": 1.5125797777292427, |
| "learning_rate": 5e-06, |
| "loss": 0.4721, |
| "step": 1340 |
| }, |
| { |
| "epoch": 1.8256167624197364, |
| "grad_norm": 1.4872727963351564, |
| "learning_rate": 5e-06, |
| "loss": 0.477, |
| "step": 1350 |
| }, |
| { |
| "epoch": 1.839134842852315, |
| "grad_norm": 1.5871439872439899, |
| "learning_rate": 5e-06, |
| "loss": 0.4765, |
| "step": 1360 |
| }, |
| { |
| "epoch": 1.8526529232848934, |
| "grad_norm": 1.592006128597771, |
| "learning_rate": 5e-06, |
| "loss": 0.4746, |
| "step": 1370 |
| }, |
| { |
| "epoch": 1.8661710037174721, |
| "grad_norm": 1.598997771987625, |
| "learning_rate": 5e-06, |
| "loss": 0.4781, |
| "step": 1380 |
| }, |
| { |
| "epoch": 1.8796890841500506, |
| "grad_norm": 1.5928725680036628, |
| "learning_rate": 5e-06, |
| "loss": 0.4803, |
| "step": 1390 |
| }, |
| { |
| "epoch": 1.8932071645826292, |
| "grad_norm": 1.623893547015882, |
| "learning_rate": 5e-06, |
| "loss": 0.4752, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.906725245015208, |
| "grad_norm": 1.7940114937618843, |
| "learning_rate": 5e-06, |
| "loss": 0.4836, |
| "step": 1410 |
| }, |
| { |
| "epoch": 1.9202433254477864, |
| "grad_norm": 1.5205185910384997, |
| "learning_rate": 5e-06, |
| "loss": 0.4809, |
| "step": 1420 |
| }, |
| { |
| "epoch": 1.933761405880365, |
| "grad_norm": 1.6080341693065385, |
| "learning_rate": 5e-06, |
| "loss": 0.481, |
| "step": 1430 |
| }, |
| { |
| "epoch": 1.9472794863129437, |
| "grad_norm": 1.5206917589511617, |
| "learning_rate": 5e-06, |
| "loss": 0.4711, |
| "step": 1440 |
| }, |
| { |
| "epoch": 1.9607975667455222, |
| "grad_norm": 1.6175809292420489, |
| "learning_rate": 5e-06, |
| "loss": 0.4759, |
| "step": 1450 |
| }, |
| { |
| "epoch": 1.9743156471781007, |
| "grad_norm": 1.5775797663419633, |
| "learning_rate": 5e-06, |
| "loss": 0.4699, |
| "step": 1460 |
| }, |
| { |
| "epoch": 1.9878337276106794, |
| "grad_norm": 1.6961433441398912, |
| "learning_rate": 5e-06, |
| "loss": 0.4737, |
| "step": 1470 |
| }, |
| { |
| "epoch": 1.9986481919567423, |
| "eval_loss": 0.14539429545402527, |
| "eval_runtime": 380.8674, |
| "eval_samples_per_second": 26.164, |
| "eval_steps_per_second": 0.41, |
| "step": 1478 |
| }, |
| { |
| "epoch": 2.0020277120648866, |
| "grad_norm": 3.8413511660769193, |
| "learning_rate": 5e-06, |
| "loss": 0.4622, |
| "step": 1480 |
| }, |
| { |
| "epoch": 2.0155457924974653, |
| "grad_norm": 2.289274121046122, |
| "learning_rate": 5e-06, |
| "loss": 0.3511, |
| "step": 1490 |
| }, |
| { |
| "epoch": 2.029063872930044, |
| "grad_norm": 1.9108409569758165, |
| "learning_rate": 5e-06, |
| "loss": 0.3338, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.0425819533626224, |
| "grad_norm": 1.7936328744279362, |
| "learning_rate": 5e-06, |
| "loss": 0.3366, |
| "step": 1510 |
| }, |
| { |
| "epoch": 2.056100033795201, |
| "grad_norm": 1.9158569170297433, |
| "learning_rate": 5e-06, |
| "loss": 0.3439, |
| "step": 1520 |
| }, |
| { |
| "epoch": 2.06961811422778, |
| "grad_norm": 1.791129292519035, |
| "learning_rate": 5e-06, |
| "loss": 0.3392, |
| "step": 1530 |
| }, |
| { |
| "epoch": 2.083136194660358, |
| "grad_norm": 2.0098239614151026, |
| "learning_rate": 5e-06, |
| "loss": 0.3392, |
| "step": 1540 |
| }, |
| { |
| "epoch": 2.096654275092937, |
| "grad_norm": 1.7766198397906945, |
| "learning_rate": 5e-06, |
| "loss": 0.3389, |
| "step": 1550 |
| }, |
| { |
| "epoch": 2.110172355525515, |
| "grad_norm": 2.08318641104621, |
| "learning_rate": 5e-06, |
| "loss": 0.3431, |
| "step": 1560 |
| }, |
| { |
| "epoch": 2.123690435958094, |
| "grad_norm": 2.2228561770595667, |
| "learning_rate": 5e-06, |
| "loss": 0.3395, |
| "step": 1570 |
| }, |
| { |
| "epoch": 2.1372085163906727, |
| "grad_norm": 1.9214094936994222, |
| "learning_rate": 5e-06, |
| "loss": 0.3366, |
| "step": 1580 |
| }, |
| { |
| "epoch": 2.150726596823251, |
| "grad_norm": 1.8535859994144672, |
| "learning_rate": 5e-06, |
| "loss": 0.3451, |
| "step": 1590 |
| }, |
| { |
| "epoch": 2.1642446772558297, |
| "grad_norm": 1.8759119457837454, |
| "learning_rate": 5e-06, |
| "loss": 0.3469, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.1777627576884084, |
| "grad_norm": 1.9517825046828854, |
| "learning_rate": 5e-06, |
| "loss": 0.3455, |
| "step": 1610 |
| }, |
| { |
| "epoch": 2.1912808381209867, |
| "grad_norm": 1.951414687627, |
| "learning_rate": 5e-06, |
| "loss": 0.3433, |
| "step": 1620 |
| }, |
| { |
| "epoch": 2.2047989185535655, |
| "grad_norm": 1.8390753103711273, |
| "learning_rate": 5e-06, |
| "loss": 0.3394, |
| "step": 1630 |
| }, |
| { |
| "epoch": 2.2183169989861438, |
| "grad_norm": 1.852336735941585, |
| "learning_rate": 5e-06, |
| "loss": 0.3482, |
| "step": 1640 |
| }, |
| { |
| "epoch": 2.2318350794187225, |
| "grad_norm": 2.104974336142616, |
| "learning_rate": 5e-06, |
| "loss": 0.3546, |
| "step": 1650 |
| }, |
| { |
| "epoch": 2.2453531598513012, |
| "grad_norm": 1.9022953700727612, |
| "learning_rate": 5e-06, |
| "loss": 0.344, |
| "step": 1660 |
| }, |
| { |
| "epoch": 2.2588712402838795, |
| "grad_norm": 1.912675084485768, |
| "learning_rate": 5e-06, |
| "loss": 0.3491, |
| "step": 1670 |
| }, |
| { |
| "epoch": 2.2723893207164583, |
| "grad_norm": 1.766645490672379, |
| "learning_rate": 5e-06, |
| "loss": 0.3531, |
| "step": 1680 |
| }, |
| { |
| "epoch": 2.285907401149037, |
| "grad_norm": 1.864712721003459, |
| "learning_rate": 5e-06, |
| "loss": 0.346, |
| "step": 1690 |
| }, |
| { |
| "epoch": 2.2994254815816153, |
| "grad_norm": 2.007832977417507, |
| "learning_rate": 5e-06, |
| "loss": 0.3478, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.312943562014194, |
| "grad_norm": 1.948790808762209, |
| "learning_rate": 5e-06, |
| "loss": 0.3485, |
| "step": 1710 |
| }, |
| { |
| "epoch": 2.3264616424467723, |
| "grad_norm": 2.0217128757867293, |
| "learning_rate": 5e-06, |
| "loss": 0.3517, |
| "step": 1720 |
| }, |
| { |
| "epoch": 2.339979722879351, |
| "grad_norm": 2.144727796810822, |
| "learning_rate": 5e-06, |
| "loss": 0.3555, |
| "step": 1730 |
| }, |
| { |
| "epoch": 2.35349780331193, |
| "grad_norm": 1.8087253456008405, |
| "learning_rate": 5e-06, |
| "loss": 0.3559, |
| "step": 1740 |
| }, |
| { |
| "epoch": 2.367015883744508, |
| "grad_norm": 1.838290701763884, |
| "learning_rate": 5e-06, |
| "loss": 0.3546, |
| "step": 1750 |
| }, |
| { |
| "epoch": 2.380533964177087, |
| "grad_norm": 1.775768613839116, |
| "learning_rate": 5e-06, |
| "loss": 0.3551, |
| "step": 1760 |
| }, |
| { |
| "epoch": 2.3940520446096656, |
| "grad_norm": 1.912762130191922, |
| "learning_rate": 5e-06, |
| "loss": 0.3547, |
| "step": 1770 |
| }, |
| { |
| "epoch": 2.407570125042244, |
| "grad_norm": 1.7693808777695688, |
| "learning_rate": 5e-06, |
| "loss": 0.3554, |
| "step": 1780 |
| }, |
| { |
| "epoch": 2.4210882054748226, |
| "grad_norm": 2.062101118297791, |
| "learning_rate": 5e-06, |
| "loss": 0.3539, |
| "step": 1790 |
| }, |
| { |
| "epoch": 2.4346062859074014, |
| "grad_norm": 1.7936240593829114, |
| "learning_rate": 5e-06, |
| "loss": 0.3565, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.4481243663399797, |
| "grad_norm": 1.8990616257107005, |
| "learning_rate": 5e-06, |
| "loss": 0.3506, |
| "step": 1810 |
| }, |
| { |
| "epoch": 2.4616424467725584, |
| "grad_norm": 1.8897003912341879, |
| "learning_rate": 5e-06, |
| "loss": 0.3539, |
| "step": 1820 |
| }, |
| { |
| "epoch": 2.4751605272051367, |
| "grad_norm": 1.9638380799816073, |
| "learning_rate": 5e-06, |
| "loss": 0.3531, |
| "step": 1830 |
| }, |
| { |
| "epoch": 2.4886786076377154, |
| "grad_norm": 1.7974113469484045, |
| "learning_rate": 5e-06, |
| "loss": 0.354, |
| "step": 1840 |
| }, |
| { |
| "epoch": 2.502196688070294, |
| "grad_norm": 1.825033766196877, |
| "learning_rate": 5e-06, |
| "loss": 0.3575, |
| "step": 1850 |
| }, |
| { |
| "epoch": 2.5157147685028725, |
| "grad_norm": 1.8251675734409782, |
| "learning_rate": 5e-06, |
| "loss": 0.3518, |
| "step": 1860 |
| }, |
| { |
| "epoch": 2.529232848935451, |
| "grad_norm": 2.0787239988149397, |
| "learning_rate": 5e-06, |
| "loss": 0.3633, |
| "step": 1870 |
| }, |
| { |
| "epoch": 2.5427509293680295, |
| "grad_norm": 1.7869028110036567, |
| "learning_rate": 5e-06, |
| "loss": 0.3576, |
| "step": 1880 |
| }, |
| { |
| "epoch": 2.5562690098006082, |
| "grad_norm": 1.7932422035286009, |
| "learning_rate": 5e-06, |
| "loss": 0.3589, |
| "step": 1890 |
| }, |
| { |
| "epoch": 2.569787090233187, |
| "grad_norm": 1.8125739600297648, |
| "learning_rate": 5e-06, |
| "loss": 0.3571, |
| "step": 1900 |
| }, |
| { |
| "epoch": 2.5833051706657653, |
| "grad_norm": 1.8667683506155952, |
| "learning_rate": 5e-06, |
| "loss": 0.3612, |
| "step": 1910 |
| }, |
| { |
| "epoch": 2.596823251098344, |
| "grad_norm": 1.8639012162902293, |
| "learning_rate": 5e-06, |
| "loss": 0.3598, |
| "step": 1920 |
| }, |
| { |
| "epoch": 2.6103413315309227, |
| "grad_norm": 2.0202670438057924, |
| "learning_rate": 5e-06, |
| "loss": 0.3612, |
| "step": 1930 |
| }, |
| { |
| "epoch": 2.623859411963501, |
| "grad_norm": 1.8193852600474405, |
| "learning_rate": 5e-06, |
| "loss": 0.3604, |
| "step": 1940 |
| }, |
| { |
| "epoch": 2.6373774923960798, |
| "grad_norm": 1.8662033138623173, |
| "learning_rate": 5e-06, |
| "loss": 0.3646, |
| "step": 1950 |
| }, |
| { |
| "epoch": 2.6508955728286585, |
| "grad_norm": 1.9796354345767144, |
| "learning_rate": 5e-06, |
| "loss": 0.3633, |
| "step": 1960 |
| }, |
| { |
| "epoch": 2.664413653261237, |
| "grad_norm": 1.9455704972640486, |
| "learning_rate": 5e-06, |
| "loss": 0.3622, |
| "step": 1970 |
| }, |
| { |
| "epoch": 2.6779317336938155, |
| "grad_norm": 1.6977674417388293, |
| "learning_rate": 5e-06, |
| "loss": 0.3661, |
| "step": 1980 |
| }, |
| { |
| "epoch": 2.6914498141263943, |
| "grad_norm": 1.7297060050258417, |
| "learning_rate": 5e-06, |
| "loss": 0.353, |
| "step": 1990 |
| }, |
| { |
| "epoch": 2.7049678945589726, |
| "grad_norm": 1.982039686598582, |
| "learning_rate": 5e-06, |
| "loss": 0.3616, |
| "step": 2000 |
| }, |
| { |
| "epoch": 2.7184859749915513, |
| "grad_norm": 1.9841967443753195, |
| "learning_rate": 5e-06, |
| "loss": 0.3626, |
| "step": 2010 |
| }, |
| { |
| "epoch": 2.7320040554241296, |
| "grad_norm": 1.9857692866519607, |
| "learning_rate": 5e-06, |
| "loss": 0.3634, |
| "step": 2020 |
| }, |
| { |
| "epoch": 2.7455221358567083, |
| "grad_norm": 1.9757117083253184, |
| "learning_rate": 5e-06, |
| "loss": 0.3697, |
| "step": 2030 |
| }, |
| { |
| "epoch": 2.759040216289287, |
| "grad_norm": 1.8782922508939865, |
| "learning_rate": 5e-06, |
| "loss": 0.3652, |
| "step": 2040 |
| }, |
| { |
| "epoch": 2.7725582967218654, |
| "grad_norm": 1.9035541906463247, |
| "learning_rate": 5e-06, |
| "loss": 0.3599, |
| "step": 2050 |
| }, |
| { |
| "epoch": 2.786076377154444, |
| "grad_norm": 1.951655639577415, |
| "learning_rate": 5e-06, |
| "loss": 0.369, |
| "step": 2060 |
| }, |
| { |
| "epoch": 2.7995944575870224, |
| "grad_norm": 1.7412018691630233, |
| "learning_rate": 5e-06, |
| "loss": 0.3638, |
| "step": 2070 |
| }, |
| { |
| "epoch": 2.813112538019601, |
| "grad_norm": 1.8554265727819574, |
| "learning_rate": 5e-06, |
| "loss": 0.3679, |
| "step": 2080 |
| }, |
| { |
| "epoch": 2.82663061845218, |
| "grad_norm": 1.9678242485081974, |
| "learning_rate": 5e-06, |
| "loss": 0.3637, |
| "step": 2090 |
| }, |
| { |
| "epoch": 2.840148698884758, |
| "grad_norm": 1.9578034139617126, |
| "learning_rate": 5e-06, |
| "loss": 0.3653, |
| "step": 2100 |
| }, |
| { |
| "epoch": 2.853666779317337, |
| "grad_norm": 2.016045119436882, |
| "learning_rate": 5e-06, |
| "loss": 0.3616, |
| "step": 2110 |
| }, |
| { |
| "epoch": 2.8671848597499157, |
| "grad_norm": 1.9377074982514926, |
| "learning_rate": 5e-06, |
| "loss": 0.3615, |
| "step": 2120 |
| }, |
| { |
| "epoch": 2.880702940182494, |
| "grad_norm": 2.005341596622271, |
| "learning_rate": 5e-06, |
| "loss": 0.3693, |
| "step": 2130 |
| }, |
| { |
| "epoch": 2.8942210206150727, |
| "grad_norm": 1.823360962532199, |
| "learning_rate": 5e-06, |
| "loss": 0.3686, |
| "step": 2140 |
| }, |
| { |
| "epoch": 2.9077391010476514, |
| "grad_norm": 1.7689506770881627, |
| "learning_rate": 5e-06, |
| "loss": 0.3667, |
| "step": 2150 |
| }, |
| { |
| "epoch": 2.9212571814802297, |
| "grad_norm": 1.899310672863797, |
| "learning_rate": 5e-06, |
| "loss": 0.3645, |
| "step": 2160 |
| }, |
| { |
| "epoch": 2.9347752619128085, |
| "grad_norm": 1.8348517557022244, |
| "learning_rate": 5e-06, |
| "loss": 0.3646, |
| "step": 2170 |
| }, |
| { |
| "epoch": 2.948293342345387, |
| "grad_norm": 1.8637043447496588, |
| "learning_rate": 5e-06, |
| "loss": 0.3702, |
| "step": 2180 |
| }, |
| { |
| "epoch": 2.9618114227779655, |
| "grad_norm": 1.8308571241634983, |
| "learning_rate": 5e-06, |
| "loss": 0.3634, |
| "step": 2190 |
| }, |
| { |
| "epoch": 2.9753295032105442, |
| "grad_norm": 1.9515551374723294, |
| "learning_rate": 5e-06, |
| "loss": 0.3733, |
| "step": 2200 |
| }, |
| { |
| "epoch": 2.9888475836431225, |
| "grad_norm": 1.9021705828666453, |
| "learning_rate": 5e-06, |
| "loss": 0.3691, |
| "step": 2210 |
| }, |
| { |
| "epoch": 2.9983102399459276, |
| "eval_loss": 0.16083495318889618, |
| "eval_runtime": 379.909, |
| "eval_samples_per_second": 26.23, |
| "eval_steps_per_second": 0.411, |
| "step": 2217 |
| }, |
| { |
| "epoch": 2.9983102399459276, |
| "step": 2217, |
| "total_flos": 1856569406914560.0, |
| "train_loss": 0.47475808259329927, |
| "train_runtime": 63259.7159, |
| "train_samples_per_second": 8.979, |
| "train_steps_per_second": 0.035 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 2217, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1856569406914560.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|