{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.2953367875647668, "eval_steps": 10000000, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012953367875647668, "grad_norm": 33.1711293280208, "learning_rate": 6.476683937823834e-09, "loss": 3.2052, "step": 10 }, { "epoch": 0.025906735751295335, "grad_norm": 32.37649814316274, "learning_rate": 1.2953367875647667e-08, "loss": 3.0656, "step": 20 }, { "epoch": 0.038860103626943004, "grad_norm": 31.56149230496282, "learning_rate": 1.9430051813471502e-08, "loss": 3.1704, "step": 30 }, { "epoch": 0.05181347150259067, "grad_norm": 31.942868827719224, "learning_rate": 2.5906735751295334e-08, "loss": 3.1568, "step": 40 }, { "epoch": 0.06476683937823834, "grad_norm": 32.143099670749734, "learning_rate": 3.238341968911917e-08, "loss": 3.1613, "step": 50 }, { "epoch": 0.07772020725388601, "grad_norm": 32.24118466383636, "learning_rate": 3.8860103626943005e-08, "loss": 3.1164, "step": 60 }, { "epoch": 0.09067357512953368, "grad_norm": 31.650055464493875, "learning_rate": 4.533678756476684e-08, "loss": 3.1494, "step": 70 }, { "epoch": 0.10362694300518134, "grad_norm": 28.899262199995725, "learning_rate": 5.181347150259067e-08, "loss": 3.0801, "step": 80 }, { "epoch": 0.11658031088082901, "grad_norm": 28.89484749420223, "learning_rate": 5.8290155440414504e-08, "loss": 3.0142, "step": 90 }, { "epoch": 0.12953367875647667, "grad_norm": 28.383181515972513, "learning_rate": 6.476683937823834e-08, "loss": 2.9967, "step": 100 }, { "epoch": 0.14248704663212436, "grad_norm": 27.801929424192455, "learning_rate": 7.124352331606218e-08, "loss": 2.984, "step": 110 }, { "epoch": 0.15544041450777202, "grad_norm": 21.516965224010576, "learning_rate": 7.772020725388601e-08, "loss": 2.822, "step": 120 }, { "epoch": 0.16839378238341968, "grad_norm": 19.65232033324181, "learning_rate": 8.419689119170984e-08, "loss": 2.7653, "step": 130 }, { "epoch": 0.18134715025906736, "grad_norm": 19.201263343869776, "learning_rate": 9.067357512953368e-08, "loss": 2.6609, "step": 140 }, { "epoch": 0.19430051813471502, "grad_norm": 14.459530975729104, "learning_rate": 9.715025906735751e-08, "loss": 2.5682, "step": 150 }, { "epoch": 0.20725388601036268, "grad_norm": 9.12934345966672, "learning_rate": 1.0362694300518134e-07, "loss": 2.432, "step": 160 }, { "epoch": 0.22020725388601037, "grad_norm": 7.256496950927029, "learning_rate": 1.1010362694300518e-07, "loss": 2.3733, "step": 170 }, { "epoch": 0.23316062176165803, "grad_norm": 6.662098845634381, "learning_rate": 1.1658031088082901e-07, "loss": 2.3659, "step": 180 }, { "epoch": 0.24611398963730569, "grad_norm": 6.095993583987254, "learning_rate": 1.2305699481865284e-07, "loss": 2.3295, "step": 190 }, { "epoch": 0.25906735751295334, "grad_norm": 5.693722335575032, "learning_rate": 1.2953367875647668e-07, "loss": 2.3366, "step": 200 }, { "epoch": 0.27202072538860106, "grad_norm": 5.047232489514463, "learning_rate": 1.3601036269430052e-07, "loss": 2.2315, "step": 210 }, { "epoch": 0.2849740932642487, "grad_norm": 4.927414652144195, "learning_rate": 1.4248704663212436e-07, "loss": 2.2668, "step": 220 }, { "epoch": 0.2979274611398964, "grad_norm": 4.835355533686139, "learning_rate": 1.4896373056994818e-07, "loss": 2.2649, "step": 230 }, { "epoch": 0.31088082901554404, "grad_norm": 4.87421791743561, "learning_rate": 1.5544041450777202e-07, "loss": 2.2932, "step": 240 }, { "epoch": 0.3238341968911917, "grad_norm": 4.820276947385723, "learning_rate": 1.6191709844559583e-07, "loss": 2.2502, "step": 250 }, { "epoch": 0.33678756476683935, "grad_norm": 4.516053705523844, "learning_rate": 1.6839378238341968e-07, "loss": 2.2293, "step": 260 }, { "epoch": 0.34974093264248707, "grad_norm": 4.496107225758641, "learning_rate": 1.7487046632124352e-07, "loss": 2.2073, "step": 270 }, { "epoch": 0.3626943005181347, "grad_norm": 4.349690749010343, "learning_rate": 1.8134715025906736e-07, "loss": 2.2241, "step": 280 }, { "epoch": 0.3756476683937824, "grad_norm": 4.556595681247454, "learning_rate": 1.8782383419689118e-07, "loss": 2.2485, "step": 290 }, { "epoch": 0.38860103626943004, "grad_norm": 4.209430201616371, "learning_rate": 1.9430051813471502e-07, "loss": 2.2237, "step": 300 }, { "epoch": 0.4015544041450777, "grad_norm": 4.489040941077934, "learning_rate": 2.0077720207253883e-07, "loss": 2.2653, "step": 310 }, { "epoch": 0.41450777202072536, "grad_norm": 4.803060461126722, "learning_rate": 2.0725388601036267e-07, "loss": 2.1946, "step": 320 }, { "epoch": 0.4274611398963731, "grad_norm": 4.280584588616054, "learning_rate": 2.1373056994818652e-07, "loss": 2.2146, "step": 330 }, { "epoch": 0.44041450777202074, "grad_norm": 4.619244786650026, "learning_rate": 2.2020725388601036e-07, "loss": 2.2021, "step": 340 }, { "epoch": 0.4533678756476684, "grad_norm": 4.4382532701001995, "learning_rate": 2.2668393782383417e-07, "loss": 2.1895, "step": 350 }, { "epoch": 0.46632124352331605, "grad_norm": 4.2716622332872145, "learning_rate": 2.3316062176165802e-07, "loss": 2.1829, "step": 360 }, { "epoch": 0.4792746113989637, "grad_norm": 4.273910475216059, "learning_rate": 2.3963730569948183e-07, "loss": 2.2089, "step": 370 }, { "epoch": 0.49222797927461137, "grad_norm": 4.478306118940495, "learning_rate": 2.4611398963730567e-07, "loss": 2.1644, "step": 380 }, { "epoch": 0.5051813471502591, "grad_norm": 4.4518158510772485, "learning_rate": 2.525906735751295e-07, "loss": 2.1981, "step": 390 }, { "epoch": 0.5181347150259067, "grad_norm": 4.288110327620116, "learning_rate": 2.5906735751295336e-07, "loss": 2.1446, "step": 400 }, { "epoch": 0.5310880829015544, "grad_norm": 4.176981515512014, "learning_rate": 2.655440414507772e-07, "loss": 2.1793, "step": 410 }, { "epoch": 0.5440414507772021, "grad_norm": 4.341974723955389, "learning_rate": 2.7202072538860104e-07, "loss": 2.1744, "step": 420 }, { "epoch": 0.5569948186528497, "grad_norm": 4.283471991855415, "learning_rate": 2.7849740932642483e-07, "loss": 2.1733, "step": 430 }, { "epoch": 0.5699481865284974, "grad_norm": 4.373337793890535, "learning_rate": 2.849740932642487e-07, "loss": 2.2163, "step": 440 }, { "epoch": 0.582901554404145, "grad_norm": 4.192319713165312, "learning_rate": 2.914507772020725e-07, "loss": 2.1688, "step": 450 }, { "epoch": 0.5958549222797928, "grad_norm": 4.431919073381032, "learning_rate": 2.9792746113989635e-07, "loss": 2.1808, "step": 460 }, { "epoch": 0.6088082901554405, "grad_norm": 4.4889229042752845, "learning_rate": 3.044041450777202e-07, "loss": 2.1981, "step": 470 }, { "epoch": 0.6217616580310881, "grad_norm": 4.222020097797262, "learning_rate": 3.1088082901554404e-07, "loss": 2.1798, "step": 480 }, { "epoch": 0.6347150259067358, "grad_norm": 4.216176333681839, "learning_rate": 3.173575129533679e-07, "loss": 2.1864, "step": 490 }, { "epoch": 0.6476683937823834, "grad_norm": 4.366482754156596, "learning_rate": 3.2383419689119167e-07, "loss": 2.1344, "step": 500 }, { "epoch": 0.6606217616580311, "grad_norm": 4.357005078373983, "learning_rate": 3.303108808290155e-07, "loss": 2.1424, "step": 510 }, { "epoch": 0.6735751295336787, "grad_norm": 4.541933842125955, "learning_rate": 3.3678756476683935e-07, "loss": 2.1517, "step": 520 }, { "epoch": 0.6865284974093264, "grad_norm": 4.161277678712947, "learning_rate": 3.432642487046632e-07, "loss": 2.1546, "step": 530 }, { "epoch": 0.6994818652849741, "grad_norm": 4.236717017318247, "learning_rate": 3.4974093264248704e-07, "loss": 2.1327, "step": 540 }, { "epoch": 0.7124352331606217, "grad_norm": 4.300988527799866, "learning_rate": 3.562176165803109e-07, "loss": 2.1501, "step": 550 }, { "epoch": 0.7253886010362695, "grad_norm": 4.223261260043241, "learning_rate": 3.626943005181347e-07, "loss": 2.1615, "step": 560 }, { "epoch": 0.7383419689119171, "grad_norm": 4.085308096354535, "learning_rate": 3.691709844559585e-07, "loss": 2.144, "step": 570 }, { "epoch": 0.7512953367875648, "grad_norm": 4.31015677001362, "learning_rate": 3.7564766839378235e-07, "loss": 2.1859, "step": 580 }, { "epoch": 0.7642487046632125, "grad_norm": 4.231574714412857, "learning_rate": 3.8212435233160625e-07, "loss": 2.1476, "step": 590 }, { "epoch": 0.7772020725388601, "grad_norm": 4.206477374687759, "learning_rate": 3.8860103626943004e-07, "loss": 2.1658, "step": 600 }, { "epoch": 0.7901554404145078, "grad_norm": 4.354390663140942, "learning_rate": 3.950777202072539e-07, "loss": 2.1599, "step": 610 }, { "epoch": 0.8031088082901554, "grad_norm": 4.110842381635348, "learning_rate": 4.0155440414507767e-07, "loss": 2.1431, "step": 620 }, { "epoch": 0.8160621761658031, "grad_norm": 4.237764332245077, "learning_rate": 4.0803108808290156e-07, "loss": 2.1543, "step": 630 }, { "epoch": 0.8290155440414507, "grad_norm": 3.905536571258385, "learning_rate": 4.1450777202072535e-07, "loss": 2.1106, "step": 640 }, { "epoch": 0.8419689119170984, "grad_norm": 4.393170487432548, "learning_rate": 4.209844559585492e-07, "loss": 2.1392, "step": 650 }, { "epoch": 0.8549222797927462, "grad_norm": 4.251449853594785, "learning_rate": 4.2746113989637303e-07, "loss": 2.1434, "step": 660 }, { "epoch": 0.8678756476683938, "grad_norm": 4.230410652383188, "learning_rate": 4.339378238341969e-07, "loss": 2.1033, "step": 670 }, { "epoch": 0.8808290155440415, "grad_norm": 4.191297726929567, "learning_rate": 4.404145077720207e-07, "loss": 2.1668, "step": 680 }, { "epoch": 0.8937823834196891, "grad_norm": 4.157886217693691, "learning_rate": 4.468911917098445e-07, "loss": 2.1379, "step": 690 }, { "epoch": 0.9067357512953368, "grad_norm": 4.485713372256864, "learning_rate": 4.5336787564766835e-07, "loss": 2.1352, "step": 700 }, { "epoch": 0.9196891191709845, "grad_norm": 3.9736562746500805, "learning_rate": 4.5984455958549224e-07, "loss": 2.1097, "step": 710 }, { "epoch": 0.9326424870466321, "grad_norm": 4.118058611454383, "learning_rate": 4.6632124352331603e-07, "loss": 2.1121, "step": 720 }, { "epoch": 0.9455958549222798, "grad_norm": 4.059747364924617, "learning_rate": 4.7279792746113987e-07, "loss": 2.0727, "step": 730 }, { "epoch": 0.9585492227979274, "grad_norm": 3.9437377004412997, "learning_rate": 4.792746113989637e-07, "loss": 2.0997, "step": 740 }, { "epoch": 0.9715025906735751, "grad_norm": 4.038836900317155, "learning_rate": 4.857512953367875e-07, "loss": 2.1516, "step": 750 }, { "epoch": 0.9844559585492227, "grad_norm": 4.416716033210665, "learning_rate": 4.922279792746113e-07, "loss": 2.1124, "step": 760 }, { "epoch": 0.9974093264248705, "grad_norm": 4.138231942142784, "learning_rate": 4.987046632124352e-07, "loss": 2.0725, "step": 770 }, { "epoch": 1.0103626943005182, "grad_norm": 4.29950033055081, "learning_rate": 5.05181347150259e-07, "loss": 2.0996, "step": 780 }, { "epoch": 1.0233160621761659, "grad_norm": 4.142376447802417, "learning_rate": 5.116580310880829e-07, "loss": 2.1324, "step": 790 }, { "epoch": 1.0362694300518134, "grad_norm": 3.917371610743461, "learning_rate": 5.181347150259067e-07, "loss": 2.1006, "step": 800 }, { "epoch": 1.049222797927461, "grad_norm": 4.038077523537081, "learning_rate": 5.246113989637306e-07, "loss": 2.1275, "step": 810 }, { "epoch": 1.0621761658031088, "grad_norm": 4.318310745831879, "learning_rate": 5.310880829015544e-07, "loss": 2.0793, "step": 820 }, { "epoch": 1.0751295336787565, "grad_norm": 4.26330338898587, "learning_rate": 5.375647668393782e-07, "loss": 2.0955, "step": 830 }, { "epoch": 1.0880829015544042, "grad_norm": 3.8965234945979663, "learning_rate": 5.440414507772021e-07, "loss": 2.0684, "step": 840 }, { "epoch": 1.1010362694300517, "grad_norm": 4.288695103356495, "learning_rate": 5.505181347150258e-07, "loss": 2.1247, "step": 850 }, { "epoch": 1.1139896373056994, "grad_norm": 4.035234944690109, "learning_rate": 5.569948186528497e-07, "loss": 2.1666, "step": 860 }, { "epoch": 1.1269430051813472, "grad_norm": 4.091744197400346, "learning_rate": 5.634715025906735e-07, "loss": 2.1217, "step": 870 }, { "epoch": 1.1398963730569949, "grad_norm": 3.927555977045572, "learning_rate": 5.699481865284974e-07, "loss": 2.1349, "step": 880 }, { "epoch": 1.1528497409326426, "grad_norm": 4.16752707585748, "learning_rate": 5.764248704663213e-07, "loss": 2.1126, "step": 890 }, { "epoch": 1.16580310880829, "grad_norm": 4.099847144482344, "learning_rate": 5.82901554404145e-07, "loss": 2.1144, "step": 900 }, { "epoch": 1.1787564766839378, "grad_norm": 4.174988920130071, "learning_rate": 5.893782383419689e-07, "loss": 2.0872, "step": 910 }, { "epoch": 1.1917098445595855, "grad_norm": 4.109961957930782, "learning_rate": 5.958549222797927e-07, "loss": 2.0567, "step": 920 }, { "epoch": 1.2046632124352332, "grad_norm": 4.180647847650424, "learning_rate": 6.023316062176166e-07, "loss": 2.1202, "step": 930 }, { "epoch": 1.2176165803108807, "grad_norm": 3.98575411050178, "learning_rate": 6.088082901554404e-07, "loss": 2.1152, "step": 940 }, { "epoch": 1.2305699481865284, "grad_norm": 3.9239892073269997, "learning_rate": 6.152849740932642e-07, "loss": 2.0765, "step": 950 }, { "epoch": 1.2435233160621761, "grad_norm": 4.266583390376126, "learning_rate": 6.217616580310881e-07, "loss": 2.099, "step": 960 }, { "epoch": 1.2564766839378239, "grad_norm": 3.958906112705802, "learning_rate": 6.282383419689119e-07, "loss": 2.0834, "step": 970 }, { "epoch": 1.2694300518134716, "grad_norm": 4.050771101367249, "learning_rate": 6.347150259067358e-07, "loss": 2.1043, "step": 980 }, { "epoch": 1.2823834196891193, "grad_norm": 4.145392853886648, "learning_rate": 6.411917098445595e-07, "loss": 2.0901, "step": 990 }, { "epoch": 1.2953367875647668, "grad_norm": 4.279480677144505, "learning_rate": 6.476683937823833e-07, "loss": 2.084, "step": 1000 } ], "logging_steps": 10, "max_steps": 15440, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 21117560389632.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }