{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9994683678894205,
  "eval_steps": 500,
  "global_step": 940,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01063264221158958,
      "grad_norm": 5.609270095825195,
      "learning_rate": 0.00019808510638297873,
      "loss": 2.4434,
      "step": 10
    },
    {
      "epoch": 0.02126528442317916,
      "grad_norm": 4.589075088500977,
      "learning_rate": 0.00019595744680851065,
      "loss": 1.6902,
      "step": 20
    },
    {
      "epoch": 0.03189792663476874,
      "grad_norm": 3.7465291023254395,
      "learning_rate": 0.00019382978723404257,
      "loss": 1.3148,
      "step": 30
    },
    {
      "epoch": 0.04253056884635832,
      "grad_norm": 3.543064594268799,
      "learning_rate": 0.00019170212765957448,
      "loss": 1.4302,
      "step": 40
    },
    {
      "epoch": 0.0531632110579479,
      "grad_norm": 2.68544340133667,
      "learning_rate": 0.0001895744680851064,
      "loss": 1.3222,
      "step": 50
    },
    {
      "epoch": 0.06379585326953748,
      "grad_norm": 2.752901792526245,
      "learning_rate": 0.00018744680851063832,
      "loss": 1.2792,
      "step": 60
    },
    {
      "epoch": 0.07442849548112707,
      "grad_norm": 2.7944841384887695,
      "learning_rate": 0.0001853191489361702,
      "loss": 1.3764,
      "step": 70
    },
    {
      "epoch": 0.08506113769271664,
      "grad_norm": 3.0340654850006104,
      "learning_rate": 0.00018319148936170215,
      "loss": 1.2255,
      "step": 80
    },
    {
      "epoch": 0.09569377990430622,
      "grad_norm": 2.5017054080963135,
      "learning_rate": 0.00018106382978723404,
      "loss": 1.1689,
      "step": 90
    },
    {
      "epoch": 0.1063264221158958,
      "grad_norm": 4.572251319885254,
      "learning_rate": 0.00017893617021276596,
      "loss": 1.1418,
      "step": 100
    },
    {
      "epoch": 0.11695906432748537,
      "grad_norm": 3.354853630065918,
      "learning_rate": 0.00017680851063829787,
      "loss": 1.3103,
      "step": 110
    },
    {
      "epoch": 0.12759170653907495,
      "grad_norm": 2.387272834777832,
      "learning_rate": 0.0001746808510638298,
      "loss": 1.2848,
      "step": 120
    },
    {
      "epoch": 0.13822434875066453,
      "grad_norm": 2.579465627670288,
      "learning_rate": 0.0001725531914893617,
      "loss": 1.2395,
      "step": 130
    },
    {
      "epoch": 0.14885699096225413,
      "grad_norm": 2.9512267112731934,
      "learning_rate": 0.00017042553191489362,
      "loss": 1.3716,
      "step": 140
    },
    {
      "epoch": 0.1594896331738437,
      "grad_norm": 2.6200809478759766,
      "learning_rate": 0.00016829787234042554,
      "loss": 1.2647,
      "step": 150
    },
    {
      "epoch": 0.17012227538543329,
      "grad_norm": 2.7764666080474854,
      "learning_rate": 0.00016617021276595746,
      "loss": 1.0862,
      "step": 160
    },
    {
      "epoch": 0.18075491759702286,
      "grad_norm": 2.454061269760132,
      "learning_rate": 0.00016404255319148937,
      "loss": 1.2213,
      "step": 170
    },
    {
      "epoch": 0.19138755980861244,
      "grad_norm": 2.483651876449585,
      "learning_rate": 0.0001619148936170213,
      "loss": 1.1895,
      "step": 180
    },
    {
      "epoch": 0.20202020202020202,
      "grad_norm": 3.5856575965881348,
      "learning_rate": 0.0001597872340425532,
      "loss": 1.2322,
      "step": 190
    },
    {
      "epoch": 0.2126528442317916,
      "grad_norm": 2.1436448097229004,
      "learning_rate": 0.00015765957446808512,
      "loss": 1.2782,
      "step": 200
    },
    {
      "epoch": 0.22328548644338117,
      "grad_norm": 2.569831609725952,
      "learning_rate": 0.00015553191489361701,
      "loss": 1.171,
      "step": 210
    },
    {
      "epoch": 0.23391812865497075,
      "grad_norm": 2.5455546379089355,
      "learning_rate": 0.00015340425531914896,
      "loss": 1.3055,
      "step": 220
    },
    {
      "epoch": 0.24455077086656035,
      "grad_norm": 1.7153586149215698,
      "learning_rate": 0.00015127659574468085,
      "loss": 1.0822,
      "step": 230
    },
    {
      "epoch": 0.2551834130781499,
      "grad_norm": 2.549631357192993,
      "learning_rate": 0.00014914893617021276,
      "loss": 1.3206,
      "step": 240
    },
    {
      "epoch": 0.2658160552897395,
      "grad_norm": 2.53717041015625,
      "learning_rate": 0.00014702127659574468,
      "loss": 1.1995,
      "step": 250
    },
    {
      "epoch": 0.27644869750132905,
      "grad_norm": 2.331685781478882,
      "learning_rate": 0.0001448936170212766,
      "loss": 1.2426,
      "step": 260
    },
    {
      "epoch": 0.28708133971291866,
      "grad_norm": 2.6866092681884766,
      "learning_rate": 0.00014276595744680851,
      "loss": 1.1314,
      "step": 270
    },
    {
      "epoch": 0.29771398192450826,
      "grad_norm": 2.107909679412842,
      "learning_rate": 0.00014063829787234043,
      "loss": 1.1542,
      "step": 280
    },
    {
      "epoch": 0.3083466241360978,
      "grad_norm": 1.8758138418197632,
      "learning_rate": 0.00013851063829787235,
      "loss": 1.1377,
      "step": 290
    },
    {
      "epoch": 0.3189792663476874,
      "grad_norm": 1.647929072380066,
      "learning_rate": 0.00013638297872340427,
      "loss": 1.0096,
      "step": 300
    },
    {
      "epoch": 0.32961190855927697,
      "grad_norm": 2.186124563217163,
      "learning_rate": 0.00013425531914893618,
      "loss": 1.1746,
      "step": 310
    },
    {
      "epoch": 0.34024455077086657,
      "grad_norm": 2.4536380767822266,
      "learning_rate": 0.0001321276595744681,
      "loss": 1.1833,
      "step": 320
    },
    {
      "epoch": 0.3508771929824561,
      "grad_norm": 1.8024215698242188,
      "learning_rate": 0.00013000000000000002,
      "loss": 0.9309,
      "step": 330
    },
    {
      "epoch": 0.3615098351940457,
      "grad_norm": 3.0355000495910645,
      "learning_rate": 0.0001278723404255319,
      "loss": 1.0863,
      "step": 340
    },
    {
      "epoch": 0.3721424774056353,
      "grad_norm": 1.9415550231933594,
      "learning_rate": 0.00012574468085106382,
      "loss": 1.0507,
      "step": 350
    },
    {
      "epoch": 0.3827751196172249,
      "grad_norm": 2.327995538711548,
      "learning_rate": 0.00012361702127659577,
      "loss": 1.2524,
      "step": 360
    },
    {
      "epoch": 0.3934077618288145,
      "grad_norm": 2.001037120819092,
      "learning_rate": 0.00012148936170212766,
      "loss": 1.0437,
      "step": 370
    },
    {
      "epoch": 0.40404040404040403,
      "grad_norm": 2.1419551372528076,
      "learning_rate": 0.00011936170212765959,
      "loss": 1.0968,
      "step": 380
    },
    {
      "epoch": 0.41467304625199364,
      "grad_norm": 2.3085482120513916,
      "learning_rate": 0.0001172340425531915,
      "loss": 1.1706,
      "step": 390
    },
    {
      "epoch": 0.4253056884635832,
      "grad_norm": 4.618401050567627,
      "learning_rate": 0.0001151063829787234,
      "loss": 1.0685,
      "step": 400
    },
    {
      "epoch": 0.4359383306751728,
      "grad_norm": 2.421363115310669,
      "learning_rate": 0.00011297872340425532,
      "loss": 1.0373,
      "step": 410
    },
    {
      "epoch": 0.44657097288676234,
      "grad_norm": 1.6373859643936157,
      "learning_rate": 0.00011085106382978725,
      "loss": 1.0669,
      "step": 420
    },
    {
      "epoch": 0.45720361509835195,
      "grad_norm": 2.3031554222106934,
      "learning_rate": 0.00010872340425531916,
      "loss": 1.1394,
      "step": 430
    },
    {
      "epoch": 0.4678362573099415,
      "grad_norm": 1.9488067626953125,
      "learning_rate": 0.00010659574468085107,
      "loss": 1.0347,
      "step": 440
    },
    {
      "epoch": 0.4784688995215311,
      "grad_norm": 1.8650946617126465,
      "learning_rate": 0.00010446808510638298,
      "loss": 1.1159,
      "step": 450
    },
    {
      "epoch": 0.4891015417331207,
      "grad_norm": 1.8462837934494019,
      "learning_rate": 0.0001023404255319149,
      "loss": 1.0389,
      "step": 460
    },
    {
      "epoch": 0.49973418394471025,
      "grad_norm": 2.5941386222839355,
      "learning_rate": 0.00010021276595744682,
      "loss": 1.1587,
      "step": 470
    },
    {
      "epoch": 0.5103668261562998,
      "grad_norm": 1.34873366355896,
      "learning_rate": 9.808510638297873e-05,
      "loss": 1.1095,
      "step": 480
    },
    {
      "epoch": 0.5209994683678895,
      "grad_norm": 2.2580478191375732,
      "learning_rate": 9.595744680851064e-05,
      "loss": 1.1268,
      "step": 490
    },
    {
      "epoch": 0.531632110579479,
      "grad_norm": 2.389127731323242,
      "learning_rate": 9.382978723404256e-05,
      "loss": 1.2718,
      "step": 500
    },
    {
      "epoch": 0.5422647527910686,
      "grad_norm": 2.1379384994506836,
      "learning_rate": 9.170212765957448e-05,
      "loss": 1.0394,
      "step": 510
    },
    {
      "epoch": 0.5528973950026581,
      "grad_norm": 2.5996925830841064,
      "learning_rate": 8.95744680851064e-05,
      "loss": 1.0508,
      "step": 520
    },
    {
      "epoch": 0.5635300372142478,
      "grad_norm": 2.143913984298706,
      "learning_rate": 8.74468085106383e-05,
      "loss": 1.0038,
      "step": 530
    },
    {
      "epoch": 0.5741626794258373,
      "grad_norm": 2.285888910293579,
      "learning_rate": 8.531914893617021e-05,
      "loss": 1.2064,
      "step": 540
    },
    {
      "epoch": 0.5847953216374269,
      "grad_norm": 2.3260293006896973,
      "learning_rate": 8.319148936170213e-05,
      "loss": 1.0499,
      "step": 550
    },
    {
      "epoch": 0.5954279638490165,
      "grad_norm": 2.3417248725891113,
      "learning_rate": 8.106382978723405e-05,
      "loss": 1.1371,
      "step": 560
    },
    {
      "epoch": 0.6060606060606061,
      "grad_norm": 2.194345474243164,
      "learning_rate": 7.893617021276596e-05,
      "loss": 1.0571,
      "step": 570
    },
    {
      "epoch": 0.6166932482721956,
      "grad_norm": 2.3759639263153076,
      "learning_rate": 7.680851063829788e-05,
      "loss": 0.9709,
      "step": 580
    },
    {
      "epoch": 0.6273258904837852,
      "grad_norm": 1.7851307392120361,
      "learning_rate": 7.46808510638298e-05,
      "loss": 1.0751,
      "step": 590
    },
    {
      "epoch": 0.6379585326953748,
      "grad_norm": 2.1073718070983887,
      "learning_rate": 7.25531914893617e-05,
      "loss": 1.0453,
      "step": 600
    },
    {
      "epoch": 0.6485911749069644,
      "grad_norm": 3.0715222358703613,
      "learning_rate": 7.042553191489362e-05,
      "loss": 1.019,
      "step": 610
    },
    {
      "epoch": 0.6592238171185539,
      "grad_norm": 2.7208268642425537,
      "learning_rate": 6.829787234042554e-05,
      "loss": 0.919,
      "step": 620
    },
    {
      "epoch": 0.6698564593301436,
      "grad_norm": 1.7897045612335205,
      "learning_rate": 6.617021276595745e-05,
      "loss": 0.9964,
      "step": 630
    },
    {
      "epoch": 0.6804891015417331,
      "grad_norm": 2.317929744720459,
      "learning_rate": 6.404255319148937e-05,
      "loss": 1.1598,
      "step": 640
    },
    {
      "epoch": 0.6911217437533227,
      "grad_norm": 1.826894760131836,
      "learning_rate": 6.191489361702127e-05,
      "loss": 1.117,
      "step": 650
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 2.0165112018585205,
      "learning_rate": 5.9787234042553196e-05,
      "loss": 1.0511,
      "step": 660
    },
    {
      "epoch": 0.7123870281765019,
      "grad_norm": 1.6636179685592651,
      "learning_rate": 5.7659574468085106e-05,
      "loss": 1.0488,
      "step": 670
    },
    {
      "epoch": 0.7230196703880915,
      "grad_norm": 2.3491950035095215,
      "learning_rate": 5.553191489361702e-05,
      "loss": 1.2297,
      "step": 680
    },
    {
      "epoch": 0.733652312599681,
      "grad_norm": 2.28796124458313,
      "learning_rate": 5.3404255319148946e-05,
      "loss": 1.1457,
      "step": 690
    },
    {
      "epoch": 0.7442849548112705,
      "grad_norm": 2.550320863723755,
      "learning_rate": 5.1276595744680856e-05,
      "loss": 1.069,
      "step": 700
    },
    {
      "epoch": 0.7549175970228602,
      "grad_norm": 1.5172102451324463,
      "learning_rate": 4.9148936170212766e-05,
      "loss": 0.849,
      "step": 710
    },
    {
      "epoch": 0.7655502392344498,
      "grad_norm": 1.7714675664901733,
      "learning_rate": 4.702127659574468e-05,
      "loss": 1.1457,
      "step": 720
    },
    {
      "epoch": 0.7761828814460393,
      "grad_norm": 1.587169885635376,
      "learning_rate": 4.489361702127659e-05,
      "loss": 0.9438,
      "step": 730
    },
    {
      "epoch": 0.786815523657629,
      "grad_norm": 2.464047908782959,
      "learning_rate": 4.276595744680851e-05,
      "loss": 1.0606,
      "step": 740
    },
    {
      "epoch": 0.7974481658692185,
      "grad_norm": 1.6491392850875854,
      "learning_rate": 4.063829787234043e-05,
      "loss": 1.0333,
      "step": 750
    },
    {
      "epoch": 0.8080808080808081,
      "grad_norm": 2.159282684326172,
      "learning_rate": 3.8510638297872344e-05,
      "loss": 0.9192,
      "step": 760
    },
    {
      "epoch": 0.8187134502923976,
      "grad_norm": 1.6473966836929321,
      "learning_rate": 3.638297872340426e-05,
      "loss": 1.0218,
      "step": 770
    },
    {
      "epoch": 0.8293460925039873,
      "grad_norm": 2.5140249729156494,
      "learning_rate": 3.425531914893617e-05,
      "loss": 1.1425,
      "step": 780
    },
    {
      "epoch": 0.8399787347155768,
      "grad_norm": 1.8191956281661987,
      "learning_rate": 3.212765957446809e-05,
      "loss": 1.0179,
      "step": 790
    },
    {
      "epoch": 0.8506113769271664,
      "grad_norm": 1.570918083190918,
      "learning_rate": 3e-05,
      "loss": 1.0624,
      "step": 800
    },
    {
      "epoch": 0.861244019138756,
      "grad_norm": 2.4648308753967285,
      "learning_rate": 2.7872340425531918e-05,
      "loss": 1.0768,
      "step": 810
    },
    {
      "epoch": 0.8718766613503456,
      "grad_norm": 2.4284791946411133,
      "learning_rate": 2.574468085106383e-05,
      "loss": 1.0748,
      "step": 820
    },
    {
      "epoch": 0.8825093035619351,
      "grad_norm": 2.543541193008423,
      "learning_rate": 2.3617021276595748e-05,
      "loss": 1.051,
      "step": 830
    },
    {
      "epoch": 0.8931419457735247,
      "grad_norm": 2.0287232398986816,
      "learning_rate": 2.148936170212766e-05,
      "loss": 0.9987,
      "step": 840
    },
    {
      "epoch": 0.9037745879851143,
      "grad_norm": 2.2504048347473145,
      "learning_rate": 1.9361702127659575e-05,
      "loss": 0.9468,
      "step": 850
    },
    {
      "epoch": 0.9144072301967039,
      "grad_norm": 1.889223337173462,
      "learning_rate": 1.723404255319149e-05,
      "loss": 1.1471,
      "step": 860
    },
    {
      "epoch": 0.9250398724082934,
      "grad_norm": 2.414099931716919,
      "learning_rate": 1.5106382978723405e-05,
      "loss": 1.0941,
      "step": 870
    },
    {
      "epoch": 0.935672514619883,
      "grad_norm": 1.7655644416809082,
      "learning_rate": 1.2978723404255318e-05,
      "loss": 1.0504,
      "step": 880
    },
    {
      "epoch": 0.9463051568314726,
      "grad_norm": 1.6641113758087158,
      "learning_rate": 1.0851063829787235e-05,
      "loss": 1.1144,
      "step": 890
    },
    {
      "epoch": 0.9569377990430622,
      "grad_norm": 2.2806735038757324,
      "learning_rate": 8.72340425531915e-06,
      "loss": 1.0884,
      "step": 900
    },
    {
      "epoch": 0.9675704412546517,
      "grad_norm": 2.1201162338256836,
      "learning_rate": 6.595744680851064e-06,
      "loss": 1.0405,
      "step": 910
    },
    {
      "epoch": 0.9782030834662414,
      "grad_norm": 1.651154637336731,
      "learning_rate": 4.468085106382979e-06,
      "loss": 0.9533,
      "step": 920
    },
    {
      "epoch": 0.988835725677831,
      "grad_norm": 2.6276893615722656,
      "learning_rate": 2.3404255319148935e-06,
      "loss": 1.0675,
      "step": 930
    },
    {
      "epoch": 0.9994683678894205,
      "grad_norm": 1.7685601711273193,
      "learning_rate": 2.1276595744680852e-07,
      "loss": 0.8962,
      "step": 940
    }
  ],
  "logging_steps": 10,
  "max_steps": 940,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2872735048949760.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}