{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 23.272232304900182,
  "eval_steps": 100,
  "global_step": 1600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.14519056261343014,
      "grad_norm": 22.5987491607666,
      "learning_rate": 5.882352941176471e-07,
      "loss": 9.9337,
      "step": 10
    },
    {
      "epoch": 0.29038112522686027,
      "grad_norm": 16.331647872924805,
      "learning_rate": 1.1764705882352942e-06,
      "loss": 9.311,
      "step": 20
    },
    {
      "epoch": 0.4355716878402904,
      "grad_norm": 7.719615936279297,
      "learning_rate": 1.7647058823529414e-06,
      "loss": 8.3271,
      "step": 30
    },
    {
      "epoch": 0.5807622504537205,
      "grad_norm": 4.961643695831299,
      "learning_rate": 2.3529411764705885e-06,
      "loss": 7.6964,
      "step": 40
    },
    {
      "epoch": 0.7259528130671506,
      "grad_norm": 3.091538190841675,
      "learning_rate": 2.9411764705882355e-06,
      "loss": 7.2891,
      "step": 50
    },
    {
      "epoch": 0.8711433756805808,
      "grad_norm": 2.1008360385894775,
      "learning_rate": 3.529411764705883e-06,
      "loss": 6.9609,
      "step": 60
    },
    {
      "epoch": 1.0181488203266789,
      "grad_norm": 1.4959540367126465,
      "learning_rate": 4.11764705882353e-06,
      "loss": 6.8303,
      "step": 70
    },
    {
      "epoch": 1.1633393829401089,
      "grad_norm": 1.1464451551437378,
      "learning_rate": 4.705882352941177e-06,
      "loss": 6.4138,
      "step": 80
    },
    {
      "epoch": 1.308529945553539,
      "grad_norm": 1.0414376258850098,
      "learning_rate": 5.294117647058824e-06,
      "loss": 6.1729,
      "step": 90
    },
    {
      "epoch": 1.453720508166969,
      "grad_norm": 1.088547945022583,
      "learning_rate": 5.882352941176471e-06,
      "loss": 5.9844,
      "step": 100
    },
    {
      "epoch": 1.453720508166969,
      "eval_loss": 5.881684303283691,
      "eval_runtime": 14.889,
      "eval_samples_per_second": 191.417,
      "eval_steps_per_second": 6.045,
      "step": 100
    },
    {
      "epoch": 1.5989110707803993,
      "grad_norm": 1.8722983598709106,
      "learning_rate": 6.470588235294119e-06,
      "loss": 5.8047,
      "step": 110
    },
    {
      "epoch": 1.7441016333938295,
      "grad_norm": 2.358875036239624,
      "learning_rate": 7.058823529411766e-06,
      "loss": 5.6561,
      "step": 120
    },
    {
      "epoch": 1.8892921960072595,
      "grad_norm": 2.1306469440460205,
      "learning_rate": 7.647058823529411e-06,
      "loss": 5.4932,
      "step": 130
    },
    {
      "epoch": 2.0362976406533577,
      "grad_norm": 1.8775111436843872,
      "learning_rate": 8.23529411764706e-06,
      "loss": 5.4843,
      "step": 140
    },
    {
      "epoch": 2.1814882032667877,
      "grad_norm": 2.393399477005005,
      "learning_rate": 8.823529411764707e-06,
      "loss": 5.203,
      "step": 150
    },
    {
      "epoch": 2.3266787658802177,
      "grad_norm": 1.8098454475402832,
      "learning_rate": 9.411764705882354e-06,
      "loss": 5.0835,
      "step": 160
    },
    {
      "epoch": 2.471869328493648,
      "grad_norm": 1.7228336334228516,
      "learning_rate": 1e-05,
      "loss": 4.9627,
      "step": 170
    },
    {
      "epoch": 2.617059891107078,
      "grad_norm": 2.191887378692627,
      "learning_rate": 1.0588235294117648e-05,
      "loss": 4.8249,
      "step": 180
    },
    {
      "epoch": 2.762250453720508,
      "grad_norm": 2.229368209838867,
      "learning_rate": 1.1176470588235295e-05,
      "loss": 4.7276,
      "step": 190
    },
    {
      "epoch": 2.907441016333938,
      "grad_norm": 2.7427608966827393,
      "learning_rate": 1.1764705882352942e-05,
      "loss": 4.6183,
      "step": 200
    },
    {
      "epoch": 2.907441016333938,
      "eval_loss": 4.551872253417969,
      "eval_runtime": 14.875,
      "eval_samples_per_second": 191.597,
      "eval_steps_per_second": 6.05,
      "step": 200
    },
    {
      "epoch": 3.0544464609800364,
      "grad_norm": 1.6583671569824219,
      "learning_rate": 1.235294117647059e-05,
      "loss": 4.6092,
      "step": 210
    },
    {
      "epoch": 3.1996370235934664,
      "grad_norm": 1.6920286417007446,
      "learning_rate": 1.2941176470588238e-05,
      "loss": 4.4222,
      "step": 220
    },
    {
      "epoch": 3.344827586206897,
      "grad_norm": 2.430650472640991,
      "learning_rate": 1.3529411764705885e-05,
      "loss": 4.3312,
      "step": 230
    },
    {
      "epoch": 3.490018148820327,
      "grad_norm": 1.9790210723876953,
      "learning_rate": 1.4117647058823532e-05,
      "loss": 4.2582,
      "step": 240
    },
    {
      "epoch": 3.635208711433757,
      "grad_norm": 2.1671459674835205,
      "learning_rate": 1.4705882352941179e-05,
      "loss": 4.1759,
      "step": 250
    },
    {
      "epoch": 3.780399274047187,
      "grad_norm": 2.1711926460266113,
      "learning_rate": 1.5294117647058822e-05,
      "loss": 4.1042,
      "step": 260
    },
    {
      "epoch": 3.925589836660617,
      "grad_norm": 3.5840203762054443,
      "learning_rate": 1.5882352941176473e-05,
      "loss": 4.0326,
      "step": 270
    },
    {
      "epoch": 4.072595281306715,
      "grad_norm": 3.1725780963897705,
      "learning_rate": 1.647058823529412e-05,
      "loss": 4.0802,
      "step": 280
    },
    {
      "epoch": 4.217785843920145,
      "grad_norm": 3.8986566066741943,
      "learning_rate": 1.7058823529411767e-05,
      "loss": 3.9111,
      "step": 290
    },
    {
      "epoch": 4.362976406533575,
      "grad_norm": 3.374891996383667,
      "learning_rate": 1.7647058823529414e-05,
      "loss": 3.8626,
      "step": 300
    },
    {
      "epoch": 4.362976406533575,
      "eval_loss": 3.839364767074585,
      "eval_runtime": 14.9144,
      "eval_samples_per_second": 191.091,
      "eval_steps_per_second": 6.034,
      "step": 300
    },
    {
      "epoch": 4.508166969147005,
      "grad_norm": 5.283336639404297,
      "learning_rate": 1.823529411764706e-05,
      "loss": 3.8031,
      "step": 310
    },
    {
      "epoch": 4.653357531760435,
      "grad_norm": 4.989195346832275,
      "learning_rate": 1.8823529411764708e-05,
      "loss": 3.7691,
      "step": 320
    },
    {
      "epoch": 4.798548094373865,
      "grad_norm": 4.338991641998291,
      "learning_rate": 1.9411764705882355e-05,
      "loss": 3.7185,
      "step": 330
    },
    {
      "epoch": 4.943738656987296,
      "grad_norm": 5.136618614196777,
      "learning_rate": 2e-05,
      "loss": 3.6737,
      "step": 340
    },
    {
      "epoch": 5.090744101633394,
      "grad_norm": 4.936671733856201,
      "learning_rate": 1.9999472984871734e-05,
      "loss": 3.7175,
      "step": 350
    },
    {
      "epoch": 5.235934664246824,
      "grad_norm": 5.321089744567871,
      "learning_rate": 1.9997891995035914e-05,
      "loss": 3.5934,
      "step": 360
    },
    {
      "epoch": 5.381125226860254,
      "grad_norm": 5.364513874053955,
      "learning_rate": 1.999525719713366e-05,
      "loss": 3.5399,
      "step": 370
    },
    {
      "epoch": 5.526315789473684,
      "grad_norm": 4.728522300720215,
      "learning_rate": 1.999156886888064e-05,
      "loss": 3.4985,
      "step": 380
    },
    {
      "epoch": 5.671506352087114,
      "grad_norm": 3.8172831535339355,
      "learning_rate": 1.998682739903781e-05,
      "loss": 3.4783,
      "step": 390
    },
    {
      "epoch": 5.816696914700545,
      "grad_norm": 5.241791725158691,
      "learning_rate": 1.9981033287370443e-05,
      "loss": 3.4375,
      "step": 400
    },
    {
      "epoch": 5.816696914700545,
      "eval_loss": 3.425196647644043,
      "eval_runtime": 14.8876,
      "eval_samples_per_second": 191.435,
      "eval_steps_per_second": 6.045,
      "step": 400
    },
    {
      "epoch": 5.961887477313975,
      "grad_norm": 5.1964569091796875,
      "learning_rate": 1.9974187144595433e-05,
      "loss": 3.4051,
      "step": 410
    },
    {
      "epoch": 6.108892921960073,
      "grad_norm": 4.528454780578613,
      "learning_rate": 1.9966289692316944e-05,
      "loss": 3.4574,
      "step": 420
    },
    {
      "epoch": 6.254083484573503,
      "grad_norm": 5.369091987609863,
      "learning_rate": 1.9957341762950346e-05,
      "loss": 3.3329,
      "step": 430
    },
    {
      "epoch": 6.399274047186933,
      "grad_norm": 4.877899169921875,
      "learning_rate": 1.9947344299634464e-05,
      "loss": 3.297,
      "step": 440
    },
    {
      "epoch": 6.544464609800363,
      "grad_norm": 4.93041467666626,
      "learning_rate": 1.993629835613218e-05,
      "loss": 3.265,
      "step": 450
    },
    {
      "epoch": 6.689655172413794,
      "grad_norm": 4.630007743835449,
      "learning_rate": 1.992420509671936e-05,
      "loss": 3.2344,
      "step": 460
    },
    {
      "epoch": 6.834845735027224,
      "grad_norm": 5.94407844543457,
      "learning_rate": 1.9911065796062137e-05,
      "loss": 3.1948,
      "step": 470
    },
    {
      "epoch": 6.980036297640654,
      "grad_norm": 5.310027599334717,
      "learning_rate": 1.9896881839082554e-05,
      "loss": 3.1593,
      "step": 480
    },
    {
      "epoch": 7.127041742286751,
      "grad_norm": 4.374192714691162,
      "learning_rate": 1.9881654720812594e-05,
      "loss": 3.2061,
      "step": 490
    },
    {
      "epoch": 7.272232304900181,
      "grad_norm": 5.397394180297852,
      "learning_rate": 1.9865386046236597e-05,
      "loss": 3.1139,
      "step": 500
    },
    {
      "epoch": 7.272232304900181,
      "eval_loss": 3.0933759212493896,
      "eval_runtime": 14.9156,
      "eval_samples_per_second": 191.074,
      "eval_steps_per_second": 6.034,
      "step": 500
    },
    {
      "epoch": 7.417422867513611,
      "grad_norm": 5.458749294281006,
      "learning_rate": 1.9848077530122083e-05,
      "loss": 3.079,
      "step": 510
    },
    {
      "epoch": 7.562613430127042,
      "grad_norm": 5.029440402984619,
      "learning_rate": 1.982973099683902e-05,
      "loss": 3.0268,
      "step": 520
    },
    {
      "epoch": 7.707803992740472,
      "grad_norm": 6.583869457244873,
      "learning_rate": 1.9810348380167527e-05,
      "loss": 3.0157,
      "step": 530
    },
    {
      "epoch": 7.852994555353902,
      "grad_norm": 4.824792861938477,
      "learning_rate": 1.9789931723094046e-05,
      "loss": 2.9801,
      "step": 540
    },
    {
      "epoch": 7.998185117967332,
      "grad_norm": 6.213143348693848,
      "learning_rate": 1.9768483177596008e-05,
      "loss": 3.0539,
      "step": 550
    },
    {
      "epoch": 8.14519056261343,
      "grad_norm": 4.015461444854736,
      "learning_rate": 1.9746005004415004e-05,
      "loss": 2.932,
      "step": 560
    },
    {
      "epoch": 8.29038112522686,
      "grad_norm": 4.033381462097168,
      "learning_rate": 1.9722499572818496e-05,
      "loss": 2.9166,
      "step": 570
    },
    {
      "epoch": 8.43557168784029,
      "grad_norm": 3.8710265159606934,
      "learning_rate": 1.9697969360350098e-05,
      "loss": 2.8946,
      "step": 580
    },
    {
      "epoch": 8.58076225045372,
      "grad_norm": 3.5882229804992676,
      "learning_rate": 1.9672416952568416e-05,
      "loss": 2.8624,
      "step": 590
    },
    {
      "epoch": 8.72595281306715,
      "grad_norm": 4.0660858154296875,
      "learning_rate": 1.9645845042774555e-05,
      "loss": 2.8595,
      "step": 600
    },
    {
      "epoch": 8.72595281306715,
      "eval_loss": 2.8510355949401855,
      "eval_runtime": 14.8889,
      "eval_samples_per_second": 191.417,
      "eval_steps_per_second": 6.045,
      "step": 600
    },
    {
      "epoch": 8.87114337568058,
      "grad_norm": 2.8742496967315674,
      "learning_rate": 1.961825643172819e-05,
      "loss": 2.8184,
      "step": 610
    },
    {
      "epoch": 9.01814882032668,
      "grad_norm": 3.7376978397369385,
      "learning_rate": 1.9589654027352412e-05,
      "loss": 2.8907,
      "step": 620
    },
    {
      "epoch": 9.163339382940109,
      "grad_norm": 3.3460233211517334,
      "learning_rate": 1.956004084442718e-05,
      "loss": 2.7884,
      "step": 630
    },
    {
      "epoch": 9.30852994555354,
      "grad_norm": 3.148989200592041,
      "learning_rate": 1.9529420004271568e-05,
      "loss": 2.7853,
      "step": 640
    },
    {
      "epoch": 9.453720508166969,
      "grad_norm": 3.648300886154175,
      "learning_rate": 1.9497794734414782e-05,
      "loss": 2.7481,
      "step": 650
    },
    {
      "epoch": 9.5989110707804,
      "grad_norm": 3.3044703006744385,
      "learning_rate": 1.9465168368255946e-05,
      "loss": 2.7365,
      "step": 660
    },
    {
      "epoch": 9.744101633393829,
      "grad_norm": 3.034123659133911,
      "learning_rate": 1.9431544344712776e-05,
      "loss": 2.7177,
      "step": 670
    },
    {
      "epoch": 9.88929219600726,
      "grad_norm": 3.021785259246826,
      "learning_rate": 1.9396926207859085e-05,
      "loss": 2.6953,
      "step": 680
    },
    {
      "epoch": 10.036297640653357,
      "grad_norm": 3.7900452613830566,
      "learning_rate": 1.936131760655124e-05,
      "loss": 2.7471,
      "step": 690
    },
    {
      "epoch": 10.181488203266788,
      "grad_norm": 2.3373985290527344,
      "learning_rate": 1.932472229404356e-05,
      "loss": 2.671,
      "step": 700
    },
    {
      "epoch": 10.181488203266788,
      "eval_loss": 2.6855897903442383,
      "eval_runtime": 15.1054,
      "eval_samples_per_second": 188.674,
      "eval_steps_per_second": 5.958,
      "step": 700
    },
    {
      "epoch": 10.326678765880217,
      "grad_norm": 3.666839599609375,
      "learning_rate": 1.9287144127592704e-05,
      "loss": 2.6597,
      "step": 710
    },
    {
      "epoch": 10.471869328493648,
      "grad_norm": 2.6193573474884033,
      "learning_rate": 1.924858706805112e-05,
      "loss": 2.6424,
      "step": 720
    },
    {
      "epoch": 10.617059891107077,
      "grad_norm": 3.3366763591766357,
      "learning_rate": 1.920905517944954e-05,
      "loss": 2.6206,
      "step": 730
    },
    {
      "epoch": 10.762250453720508,
      "grad_norm": 2.5702297687530518,
      "learning_rate": 1.9168552628568632e-05,
      "loss": 2.6068,
      "step": 740
    },
    {
      "epoch": 10.907441016333939,
      "grad_norm": 4.158718109130859,
      "learning_rate": 1.9127083684499805e-05,
      "loss": 2.6103,
      "step": 750
    },
    {
      "epoch": 11.054446460980037,
      "grad_norm": 2.952253818511963,
      "learning_rate": 1.9084652718195237e-05,
      "loss": 2.666,
      "step": 760
    },
    {
      "epoch": 11.199637023593466,
      "grad_norm": 3.0563578605651855,
      "learning_rate": 1.9041264202007158e-05,
      "loss": 2.5752,
      "step": 770
    },
    {
      "epoch": 11.344827586206897,
      "grad_norm": 3.01377272605896,
      "learning_rate": 1.8996922709216456e-05,
      "loss": 2.5711,
      "step": 780
    },
    {
      "epoch": 11.490018148820326,
      "grad_norm": 2.676931142807007,
      "learning_rate": 1.8951632913550625e-05,
      "loss": 2.5482,
      "step": 790
    },
    {
      "epoch": 11.635208711433757,
      "grad_norm": 2.5490481853485107,
      "learning_rate": 1.8905399588691165e-05,
      "loss": 2.5373,
      "step": 800
    },
    {
      "epoch": 11.635208711433757,
      "eval_loss": 2.5524280071258545,
      "eval_runtime": 14.8872,
      "eval_samples_per_second": 191.439,
      "eval_steps_per_second": 6.045,
      "step": 800
    },
    {
      "epoch": 11.780399274047188,
      "grad_norm": 3.0705935955047607,
      "learning_rate": 1.8858227607770398e-05,
      "loss": 2.528,
      "step": 810
    },
    {
      "epoch": 11.925589836660617,
      "grad_norm": 2.3048040866851807,
      "learning_rate": 1.8810121942857848e-05,
      "loss": 2.517,
      "step": 820
    },
    {
      "epoch": 12.072595281306715,
      "grad_norm": 2.1240074634552,
      "learning_rate": 1.8761087664436137e-05,
      "loss": 2.5742,
      "step": 830
    },
    {
      "epoch": 12.217785843920145,
      "grad_norm": 3.5343940258026123,
      "learning_rate": 1.8711129940866577e-05,
      "loss": 2.5046,
      "step": 840
    },
    {
      "epoch": 12.362976406533575,
      "grad_norm": 2.78930926322937,
      "learning_rate": 1.866025403784439e-05,
      "loss": 2.4962,
      "step": 850
    },
    {
      "epoch": 12.508166969147005,
      "grad_norm": 2.7689406871795654,
      "learning_rate": 1.860846531784368e-05,
      "loss": 2.4728,
      "step": 860
    },
    {
      "epoch": 12.653357531760436,
      "grad_norm": 2.76344895362854,
      "learning_rate": 1.8555769239552232e-05,
      "loss": 2.4663,
      "step": 870
    },
    {
      "epoch": 12.798548094373865,
      "grad_norm": 2.473503828048706,
      "learning_rate": 1.8502171357296144e-05,
      "loss": 2.4568,
      "step": 880
    },
    {
      "epoch": 12.943738656987296,
      "grad_norm": 3.0922844409942627,
      "learning_rate": 1.8447677320454367e-05,
      "loss": 2.4562,
      "step": 890
    },
    {
      "epoch": 13.090744101633394,
      "grad_norm": 3.1442785263061523,
      "learning_rate": 1.839229287286327e-05,
      "loss": 2.5069,
      "step": 900
    },
    {
      "epoch": 13.090744101633394,
      "eval_loss": 2.458808183670044,
      "eval_runtime": 14.8914,
      "eval_samples_per_second": 191.385,
      "eval_steps_per_second": 6.044,
      "step": 900
    },
    {
      "epoch": 13.235934664246823,
      "grad_norm": 2.646357536315918,
      "learning_rate": 1.8336023852211197e-05,
      "loss": 2.4391,
      "step": 910
    },
    {
      "epoch": 13.381125226860254,
      "grad_norm": 2.668268918991089,
      "learning_rate": 1.827887618942318e-05,
      "loss": 2.4319,
      "step": 920
    },
    {
      "epoch": 13.526315789473685,
      "grad_norm": 2.6600754261016846,
      "learning_rate": 1.8220855908035783e-05,
      "loss": 2.4136,
      "step": 930
    },
    {
      "epoch": 13.671506352087114,
      "grad_norm": 3.3083410263061523,
      "learning_rate": 1.816196912356222e-05,
      "loss": 2.4137,
      "step": 940
    },
    {
      "epoch": 13.816696914700545,
      "grad_norm": 3.797403573989868,
      "learning_rate": 1.8102222042847735e-05,
      "loss": 2.4063,
      "step": 950
    },
    {
      "epoch": 13.961887477313974,
      "grad_norm": 3.599175214767456,
      "learning_rate": 1.8041620963415418e-05,
      "loss": 2.3988,
      "step": 960
    },
    {
      "epoch": 14.108892921960072,
      "grad_norm": 4.192938327789307,
      "learning_rate": 1.7980172272802398e-05,
      "loss": 2.452,
      "step": 970
    },
    {
      "epoch": 14.254083484573503,
      "grad_norm": 4.492245197296143,
      "learning_rate": 1.7917882447886585e-05,
      "loss": 2.3884,
      "step": 980
    },
    {
      "epoch": 14.399274047186934,
      "grad_norm": 3.3529608249664307,
      "learning_rate": 1.785475805420399e-05,
      "loss": 2.379,
      "step": 990
    },
    {
      "epoch": 14.544464609800363,
      "grad_norm": 3.170184850692749,
      "learning_rate": 1.7790805745256703e-05,
      "loss": 2.3716,
      "step": 1000
    },
    {
      "epoch": 14.544464609800363,
      "eval_loss": 2.3836851119995117,
      "eval_runtime": 14.94,
      "eval_samples_per_second": 190.763,
      "eval_steps_per_second": 6.024,
      "step": 1000
    },
    {
      "epoch": 14.689655172413794,
      "grad_norm": 2.9663002490997314,
      "learning_rate": 1.772603226181159e-05,
      "loss": 2.3473,
      "step": 1010
    },
    {
      "epoch": 14.834845735027223,
      "grad_norm": 4.165375232696533,
      "learning_rate": 1.766044443118978e-05,
      "loss": 2.3599,
      "step": 1020
    },
    {
      "epoch": 14.980036297640654,
      "grad_norm": 4.470517158508301,
      "learning_rate": 1.7594049166547073e-05,
      "loss": 2.3509,
      "step": 1030
    },
    {
      "epoch": 15.127041742286751,
      "grad_norm": 4.556570529937744,
      "learning_rate": 1.7526853466145248e-05,
      "loss": 2.408,
      "step": 1040
    },
    {
      "epoch": 15.272232304900182,
      "grad_norm": 5.301922798156738,
      "learning_rate": 1.7458864412614436e-05,
      "loss": 2.3295,
      "step": 1050
    },
    {
      "epoch": 15.417422867513611,
      "grad_norm": 4.635037422180176,
      "learning_rate": 1.7390089172206594e-05,
      "loss": 2.3307,
      "step": 1060
    },
    {
      "epoch": 15.562613430127042,
      "grad_norm": 4.69865083694458,
      "learning_rate": 1.7320534994040148e-05,
      "loss": 2.3145,
      "step": 1070
    },
    {
      "epoch": 15.707803992740471,
      "grad_norm": 3.920423984527588,
      "learning_rate": 1.725020920933593e-05,
      "loss": 2.3157,
      "step": 1080
    },
    {
      "epoch": 15.852994555353902,
      "grad_norm": 4.361971378326416,
      "learning_rate": 1.717911923064442e-05,
      "loss": 2.3078,
      "step": 1090
    },
    {
      "epoch": 15.998185117967331,
      "grad_norm": 7.068411350250244,
      "learning_rate": 1.710727255106447e-05,
      "loss": 2.3586,
      "step": 1100
    },
    {
      "epoch": 15.998185117967331,
      "eval_loss": 2.3172919750213623,
      "eval_runtime": 14.8891,
      "eval_samples_per_second": 191.415,
      "eval_steps_per_second": 6.045,
      "step": 1100
    },
    {
      "epoch": 16.14519056261343,
      "grad_norm": 4.1981329917907715,
      "learning_rate": 1.70346767434535e-05,
      "loss": 2.3117,
      "step": 1110
    },
    {
      "epoch": 16.29038112522686,
      "grad_norm": 3.4976508617401123,
      "learning_rate": 1.696133945962927e-05,
      "loss": 2.305,
      "step": 1120
    },
    {
      "epoch": 16.43557168784029,
      "grad_norm": 4.435906410217285,
      "learning_rate": 1.6887268429563387e-05,
      "loss": 2.2863,
      "step": 1130
    },
    {
      "epoch": 16.58076225045372,
      "grad_norm": 4.972898483276367,
      "learning_rate": 1.681247146056654e-05,
      "loss": 2.2743,
      "step": 1140
    },
    {
      "epoch": 16.72595281306715,
      "grad_norm": 4.645566940307617,
      "learning_rate": 1.6736956436465573e-05,
      "loss": 2.27,
      "step": 1150
    },
    {
      "epoch": 16.87114337568058,
      "grad_norm": 3.4692468643188477,
      "learning_rate": 1.6660731316772503e-05,
      "loss": 2.2665,
      "step": 1160
    },
    {
      "epoch": 17.01814882032668,
      "grad_norm": 5.320245265960693,
      "learning_rate": 1.6583804135845582e-05,
      "loss": 2.3195,
      "step": 1170
    },
    {
      "epoch": 17.16333938294011,
      "grad_norm": 4.651059150695801,
      "learning_rate": 1.650618300204242e-05,
      "loss": 2.2719,
      "step": 1180
    },
    {
      "epoch": 17.308529945553538,
      "grad_norm": 3.385852098464966,
      "learning_rate": 1.6427876096865394e-05,
      "loss": 2.2618,
      "step": 1190
    },
    {
      "epoch": 17.45372050816697,
      "grad_norm": 4.4722771644592285,
      "learning_rate": 1.634889167409923e-05,
      "loss": 2.2455,
      "step": 1200
    },
    {
      "epoch": 17.45372050816697,
      "eval_loss": 2.266216516494751,
      "eval_runtime": 14.8832,
      "eval_samples_per_second": 191.491,
      "eval_steps_per_second": 6.047,
      "step": 1200
    },
    {
      "epoch": 17.5989110707804,
      "grad_norm": 4.48843240737915,
      "learning_rate": 1.626923805894107e-05,
      "loss": 2.2325,
      "step": 1210
    },
    {
      "epoch": 17.74410163339383,
      "grad_norm": 4.738080978393555,
      "learning_rate": 1.6188923647122946e-05,
      "loss": 2.2342,
      "step": 1220
    },
    {
      "epoch": 17.88929219600726,
      "grad_norm": 3.4513983726501465,
      "learning_rate": 1.610795690402688e-05,
      "loss": 2.2307,
      "step": 1230
    },
    {
      "epoch": 18.03629764065336,
      "grad_norm": 5.09975528717041,
      "learning_rate": 1.6026346363792565e-05,
      "loss": 2.2732,
      "step": 1240
    },
    {
      "epoch": 18.181488203266788,
      "grad_norm": 4.678542613983154,
      "learning_rate": 1.594410062841787e-05,
      "loss": 2.2163,
      "step": 1250
    },
    {
      "epoch": 18.326678765880217,
      "grad_norm": 4.759681701660156,
      "learning_rate": 1.5861228366852148e-05,
      "loss": 2.2162,
      "step": 1260
    },
    {
      "epoch": 18.471869328493646,
      "grad_norm": 4.229626178741455,
      "learning_rate": 1.5777738314082514e-05,
      "loss": 2.2111,
      "step": 1270
    },
    {
      "epoch": 18.61705989110708,
      "grad_norm": 3.9783313274383545,
      "learning_rate": 1.5693639270213138e-05,
      "loss": 2.2148,
      "step": 1280
    },
    {
      "epoch": 18.762250453720508,
      "grad_norm": 5.122952461242676,
      "learning_rate": 1.56089400995377e-05,
      "loss": 2.1918,
      "step": 1290
    },
    {
      "epoch": 18.907441016333937,
      "grad_norm": 4.5258893966674805,
      "learning_rate": 1.552364972960506e-05,
      "loss": 2.1982,
      "step": 1300
    },
    {
      "epoch": 18.907441016333937,
      "eval_loss": 2.218618392944336,
      "eval_runtime": 14.9173,
      "eval_samples_per_second": 191.053,
      "eval_steps_per_second": 6.033,
      "step": 1300
    },
    {
      "epoch": 19.054446460980035,
      "grad_norm": 3.5518476963043213,
      "learning_rate": 1.5437777150278268e-05,
      "loss": 2.241,
      "step": 1310
    },
    {
      "epoch": 19.199637023593468,
      "grad_norm": 3.2547390460968018,
      "learning_rate": 1.5351331412787004e-05,
      "loss": 2.1769,
      "step": 1320
    },
    {
      "epoch": 19.344827586206897,
      "grad_norm": 4.707066535949707,
      "learning_rate": 1.526432162877356e-05,
      "loss": 2.1888,
      "step": 1330
    },
    {
      "epoch": 19.490018148820326,
      "grad_norm": 3.84458327293396,
      "learning_rate": 1.5176756969332428e-05,
      "loss": 2.1845,
      "step": 1340
    },
    {
      "epoch": 19.635208711433755,
      "grad_norm": 3.817499876022339,
      "learning_rate": 1.5088646664043652e-05,
      "loss": 2.1694,
      "step": 1350
    },
    {
      "epoch": 19.780399274047188,
      "grad_norm": 3.213623285293579,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 2.1732,
      "step": 1360
    },
    {
      "epoch": 19.925589836660617,
      "grad_norm": 3.957732677459717,
      "learning_rate": 1.4910826320828085e-05,
      "loss": 2.1611,
      "step": 1370
    },
    {
      "epoch": 20.072595281306715,
      "grad_norm": 4.165735244750977,
      "learning_rate": 1.4821135025703491e-05,
      "loss": 2.2287,
      "step": 1380
    },
    {
      "epoch": 20.217785843920144,
      "grad_norm": 4.221045017242432,
      "learning_rate": 1.4730935568360103e-05,
      "loss": 2.1643,
      "step": 1390
    },
    {
      "epoch": 20.362976406533576,
      "grad_norm": 3.8451671600341797,
      "learning_rate": 1.4640237456093636e-05,
      "loss": 2.1391,
      "step": 1400
    },
    {
      "epoch": 20.362976406533576,
      "eval_loss": 2.1760454177856445,
      "eval_runtime": 14.8981,
      "eval_samples_per_second": 191.3,
      "eval_steps_per_second": 6.041,
      "step": 1400
    },
    {
      "epoch": 20.508166969147005,
      "grad_norm": 3.5534346103668213,
      "learning_rate": 1.4549050248759546e-05,
      "loss": 2.1447,
      "step": 1410
    },
    {
      "epoch": 20.653357531760435,
      "grad_norm": 3.4696710109710693,
      "learning_rate": 1.4457383557765385e-05,
      "loss": 2.1429,
      "step": 1420
    },
    {
      "epoch": 20.798548094373867,
      "grad_norm": 3.365741491317749,
      "learning_rate": 1.4365247045057732e-05,
      "loss": 2.1362,
      "step": 1430
    },
    {
      "epoch": 20.943738656987296,
      "grad_norm": 3.3320400714874268,
      "learning_rate": 1.427265042210381e-05,
      "loss": 2.1331,
      "step": 1440
    },
    {
      "epoch": 21.090744101633394,
      "grad_norm": 3.987398624420166,
      "learning_rate": 1.4179603448867836e-05,
      "loss": 2.1908,
      "step": 1450
    },
    {
      "epoch": 21.235934664246823,
      "grad_norm": 2.9857592582702637,
      "learning_rate": 1.4086115932782316e-05,
      "loss": 2.116,
      "step": 1460
    },
    {
      "epoch": 21.381125226860256,
      "grad_norm": 3.6675405502319336,
      "learning_rate": 1.399219772771431e-05,
      "loss": 2.1191,
      "step": 1470
    },
    {
      "epoch": 21.526315789473685,
      "grad_norm": 3.398852586746216,
      "learning_rate": 1.3897858732926794e-05,
      "loss": 2.1145,
      "step": 1480
    },
    {
      "epoch": 21.671506352087114,
      "grad_norm": 4.064932346343994,
      "learning_rate": 1.3803108892035259e-05,
      "loss": 2.1189,
      "step": 1490
    },
    {
      "epoch": 21.816696914700543,
      "grad_norm": 4.461515426635742,
      "learning_rate": 1.3707958191959609e-05,
      "loss": 2.118,
      "step": 1500
    },
    {
      "epoch": 21.816696914700543,
      "eval_loss": 2.1411948204040527,
      "eval_runtime": 14.8907,
      "eval_samples_per_second": 191.395,
      "eval_steps_per_second": 6.044,
      "step": 1500
    },
    {
      "epoch": 21.961887477313976,
      "grad_norm": 4.643894195556641,
      "learning_rate": 1.3612416661871532e-05,
      "loss": 2.1139,
      "step": 1510
    },
    {
      "epoch": 22.108892921960074,
      "grad_norm": 3.617164134979248,
      "learning_rate": 1.3516494372137368e-05,
      "loss": 2.1598,
      "step": 1520
    },
    {
      "epoch": 22.254083484573503,
      "grad_norm": 4.053228855133057,
      "learning_rate": 1.342020143325669e-05,
      "loss": 2.1049,
      "step": 1530
    },
    {
      "epoch": 22.399274047186932,
      "grad_norm": 3.701639175415039,
      "learning_rate": 1.3323547994796597e-05,
      "loss": 2.0949,
      "step": 1540
    },
    {
      "epoch": 22.544464609800364,
      "grad_norm": 4.151967525482178,
      "learning_rate": 1.322654424432195e-05,
      "loss": 2.0972,
      "step": 1550
    },
    {
      "epoch": 22.689655172413794,
      "grad_norm": 3.0913383960723877,
      "learning_rate": 1.3129200406321545e-05,
      "loss": 2.0929,
      "step": 1560
    },
    {
      "epoch": 22.834845735027223,
      "grad_norm": 3.61426043510437,
      "learning_rate": 1.3031526741130435e-05,
      "loss": 2.0796,
      "step": 1570
    },
    {
      "epoch": 22.980036297640652,
      "grad_norm": 2.2186896800994873,
      "learning_rate": 1.2933533543848462e-05,
      "loss": 2.0845,
      "step": 1580
    },
    {
      "epoch": 23.12704174228675,
      "grad_norm": 3.799440860748291,
      "learning_rate": 1.283523114325511e-05,
      "loss": 2.1387,
      "step": 1590
    },
    {
      "epoch": 23.272232304900182,
      "grad_norm": 4.307620048522949,
      "learning_rate": 1.2736629900720832e-05,
      "loss": 2.0764,
      "step": 1600
    },
    {
      "epoch": 23.272232304900182,
      "eval_loss": 2.099868059158325,
      "eval_runtime": 14.8905,
      "eval_samples_per_second": 191.397,
      "eval_steps_per_second": 6.044,
      "step": 1600
    }
  ],
  "logging_steps": 10,
  "max_steps": 3400,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.967214202780385e+19,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}