| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.918032786885246, | |
| "eval_steps": 500, | |
| "global_step": 90, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.03278688524590164, | |
| "grad_norm": 8.54790985307395, | |
| "learning_rate": 5.555555555555556e-06, | |
| "loss": 0.8862, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.06557377049180328, | |
| "grad_norm": 8.667266551400829, | |
| "learning_rate": 1.1111111111111112e-05, | |
| "loss": 0.8865, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.09836065573770492, | |
| "grad_norm": 4.253953257140578, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 0.7819, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.13114754098360656, | |
| "grad_norm": 4.571455990663048, | |
| "learning_rate": 2.2222222222222223e-05, | |
| "loss": 0.8968, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.16393442622950818, | |
| "grad_norm": 2.556736840356925, | |
| "learning_rate": 2.777777777777778e-05, | |
| "loss": 0.736, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.19672131147540983, | |
| "grad_norm": 1.1763049443066929, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 0.6517, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.22950819672131148, | |
| "grad_norm": 3.878296683574531, | |
| "learning_rate": 3.888888888888889e-05, | |
| "loss": 0.6656, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.26229508196721313, | |
| "grad_norm": 1.3430144344783912, | |
| "learning_rate": 4.4444444444444447e-05, | |
| "loss": 0.6455, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.29508196721311475, | |
| "grad_norm": 1.7928730960517045, | |
| "learning_rate": 5e-05, | |
| "loss": 0.6658, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.32786885245901637, | |
| "grad_norm": 1.7860417495479892, | |
| "learning_rate": 4.938271604938271e-05, | |
| "loss": 0.6959, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.36065573770491804, | |
| "grad_norm": 2.038418980975677, | |
| "learning_rate": 4.876543209876544e-05, | |
| "loss": 0.8413, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.39344262295081966, | |
| "grad_norm": 1.63503431407122, | |
| "learning_rate": 4.814814814814815e-05, | |
| "loss": 0.7398, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.4262295081967213, | |
| "grad_norm": 3.4570180698446102, | |
| "learning_rate": 4.7530864197530866e-05, | |
| "loss": 0.643, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.45901639344262296, | |
| "grad_norm": 1.909653231704531, | |
| "learning_rate": 4.691358024691358e-05, | |
| "loss": 0.6473, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.4918032786885246, | |
| "grad_norm": 1.1021784492331361, | |
| "learning_rate": 4.62962962962963e-05, | |
| "loss": 0.5783, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.5245901639344263, | |
| "grad_norm": 1.0905128650989067, | |
| "learning_rate": 4.567901234567901e-05, | |
| "loss": 0.6608, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.5573770491803278, | |
| "grad_norm": 0.8230839314496673, | |
| "learning_rate": 4.506172839506173e-05, | |
| "loss": 0.5281, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.5901639344262295, | |
| "grad_norm": 0.7979512406007738, | |
| "learning_rate": 4.4444444444444447e-05, | |
| "loss": 0.6013, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.6229508196721312, | |
| "grad_norm": 0.8214685227306615, | |
| "learning_rate": 4.3827160493827164e-05, | |
| "loss": 0.6052, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.6557377049180327, | |
| "grad_norm": 0.7411224875292269, | |
| "learning_rate": 4.3209876543209875e-05, | |
| "loss": 0.5937, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.6885245901639344, | |
| "grad_norm": 0.8701752151136424, | |
| "learning_rate": 4.259259259259259e-05, | |
| "loss": 0.6406, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.7213114754098361, | |
| "grad_norm": 0.8401050888517467, | |
| "learning_rate": 4.197530864197531e-05, | |
| "loss": 0.6058, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.7540983606557377, | |
| "grad_norm": 0.7417247270584298, | |
| "learning_rate": 4.135802469135803e-05, | |
| "loss": 0.6038, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.7868852459016393, | |
| "grad_norm": 0.9114029519760943, | |
| "learning_rate": 4.074074074074074e-05, | |
| "loss": 0.6365, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.819672131147541, | |
| "grad_norm": 0.5932383057667643, | |
| "learning_rate": 4.012345679012346e-05, | |
| "loss": 0.5592, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.8524590163934426, | |
| "grad_norm": 0.8875300854470327, | |
| "learning_rate": 3.950617283950617e-05, | |
| "loss": 0.58, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.8852459016393442, | |
| "grad_norm": 0.698119949220337, | |
| "learning_rate": 3.888888888888889e-05, | |
| "loss": 0.5508, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.9180327868852459, | |
| "grad_norm": 0.6873879524652156, | |
| "learning_rate": 3.82716049382716e-05, | |
| "loss": 0.5604, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.9508196721311475, | |
| "grad_norm": 0.7513945877189441, | |
| "learning_rate": 3.7654320987654326e-05, | |
| "loss": 0.6177, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.9836065573770492, | |
| "grad_norm": 0.7621130594480049, | |
| "learning_rate": 3.7037037037037037e-05, | |
| "loss": 0.6052, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.7621130594480049, | |
| "learning_rate": 3.6419753086419754e-05, | |
| "loss": 0.6161, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 1.0327868852459017, | |
| "grad_norm": 1.1602966706383506, | |
| "learning_rate": 3.580246913580247e-05, | |
| "loss": 0.4707, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 1.0655737704918034, | |
| "grad_norm": 0.9862650911000476, | |
| "learning_rate": 3.518518518518519e-05, | |
| "loss": 0.4504, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 1.098360655737705, | |
| "grad_norm": 0.6543075581779948, | |
| "learning_rate": 3.45679012345679e-05, | |
| "loss": 0.4378, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 1.1311475409836065, | |
| "grad_norm": 1.0708955865869483, | |
| "learning_rate": 3.395061728395062e-05, | |
| "loss": 0.3739, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 1.1639344262295082, | |
| "grad_norm": 0.7920736941610336, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 0.5279, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 1.1967213114754098, | |
| "grad_norm": 1.0147951574632021, | |
| "learning_rate": 3.271604938271605e-05, | |
| "loss": 0.4744, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 1.2295081967213115, | |
| "grad_norm": 0.8011306267325417, | |
| "learning_rate": 3.209876543209876e-05, | |
| "loss": 0.3643, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 1.2622950819672132, | |
| "grad_norm": 0.882826657634216, | |
| "learning_rate": 3.148148148148148e-05, | |
| "loss": 0.4572, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 1.2950819672131146, | |
| "grad_norm": 0.6594002473409742, | |
| "learning_rate": 3.08641975308642e-05, | |
| "loss": 0.4189, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 1.3278688524590163, | |
| "grad_norm": 0.9197411839924926, | |
| "learning_rate": 3.0246913580246916e-05, | |
| "loss": 0.4134, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 1.360655737704918, | |
| "grad_norm": 0.5937970209705666, | |
| "learning_rate": 2.962962962962963e-05, | |
| "loss": 0.4167, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 1.3934426229508197, | |
| "grad_norm": 0.7598027273233191, | |
| "learning_rate": 2.9012345679012347e-05, | |
| "loss": 0.3958, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 1.4262295081967213, | |
| "grad_norm": 0.6359617986206775, | |
| "learning_rate": 2.839506172839506e-05, | |
| "loss": 0.3343, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 1.459016393442623, | |
| "grad_norm": 0.5718147137591809, | |
| "learning_rate": 2.777777777777778e-05, | |
| "loss": 0.3004, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 1.4918032786885247, | |
| "grad_norm": 0.7654980285571443, | |
| "learning_rate": 2.7160493827160493e-05, | |
| "loss": 0.5256, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 1.5245901639344264, | |
| "grad_norm": 0.7363197942697386, | |
| "learning_rate": 2.654320987654321e-05, | |
| "loss": 0.393, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 1.5573770491803278, | |
| "grad_norm": 0.7665387351515982, | |
| "learning_rate": 2.5925925925925925e-05, | |
| "loss": 0.5149, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 1.5901639344262295, | |
| "grad_norm": 0.7066469851509634, | |
| "learning_rate": 2.5308641975308646e-05, | |
| "loss": 0.4941, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 1.6229508196721312, | |
| "grad_norm": 0.7648936464359298, | |
| "learning_rate": 2.4691358024691357e-05, | |
| "loss": 0.3753, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 1.6557377049180326, | |
| "grad_norm": 0.6448719303500821, | |
| "learning_rate": 2.4074074074074074e-05, | |
| "loss": 0.3928, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 1.6885245901639343, | |
| "grad_norm": 0.6218169475713733, | |
| "learning_rate": 2.345679012345679e-05, | |
| "loss": 0.3762, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 1.721311475409836, | |
| "grad_norm": 0.6081189880821058, | |
| "learning_rate": 2.2839506172839506e-05, | |
| "loss": 0.4177, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 1.7540983606557377, | |
| "grad_norm": 0.6742635601106441, | |
| "learning_rate": 2.2222222222222223e-05, | |
| "loss": 0.4148, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 1.7868852459016393, | |
| "grad_norm": 0.6227360966755229, | |
| "learning_rate": 2.1604938271604937e-05, | |
| "loss": 0.4026, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 1.819672131147541, | |
| "grad_norm": 0.5496512265959002, | |
| "learning_rate": 2.0987654320987655e-05, | |
| "loss": 0.4264, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 1.8524590163934427, | |
| "grad_norm": 0.5607326273950837, | |
| "learning_rate": 2.037037037037037e-05, | |
| "loss": 0.4166, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 1.8852459016393444, | |
| "grad_norm": 0.5981631088299048, | |
| "learning_rate": 1.9753086419753087e-05, | |
| "loss": 0.4428, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 1.918032786885246, | |
| "grad_norm": 0.5886417861863633, | |
| "learning_rate": 1.91358024691358e-05, | |
| "loss": 0.376, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 1.9508196721311475, | |
| "grad_norm": 0.7324371338816704, | |
| "learning_rate": 1.8518518518518518e-05, | |
| "loss": 0.4152, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 1.9836065573770492, | |
| "grad_norm": 0.5908566699889206, | |
| "learning_rate": 1.7901234567901236e-05, | |
| "loss": 0.407, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.856651059424739, | |
| "learning_rate": 1.728395061728395e-05, | |
| "loss": 0.4554, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 2.0327868852459017, | |
| "grad_norm": 1.0363682630325586, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 0.2502, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 2.0655737704918034, | |
| "grad_norm": 0.8199508819054775, | |
| "learning_rate": 1.604938271604938e-05, | |
| "loss": 0.2407, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 2.098360655737705, | |
| "grad_norm": 0.915069806532844, | |
| "learning_rate": 1.54320987654321e-05, | |
| "loss": 0.2736, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 2.1311475409836067, | |
| "grad_norm": 0.7930134395978335, | |
| "learning_rate": 1.4814814814814815e-05, | |
| "loss": 0.2334, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 2.1639344262295084, | |
| "grad_norm": 0.7952610694726575, | |
| "learning_rate": 1.419753086419753e-05, | |
| "loss": 0.3251, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 2.19672131147541, | |
| "grad_norm": 0.8797285975271935, | |
| "learning_rate": 1.3580246913580247e-05, | |
| "loss": 0.2774, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 2.2295081967213113, | |
| "grad_norm": 0.8766915390470155, | |
| "learning_rate": 1.2962962962962962e-05, | |
| "loss": 0.2677, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 2.262295081967213, | |
| "grad_norm": 0.8203118457855438, | |
| "learning_rate": 1.2345679012345678e-05, | |
| "loss": 0.2663, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 2.2950819672131146, | |
| "grad_norm": 0.7395948247582571, | |
| "learning_rate": 1.1728395061728396e-05, | |
| "loss": 0.2948, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 2.3278688524590163, | |
| "grad_norm": 0.6468513279523844, | |
| "learning_rate": 1.1111111111111112e-05, | |
| "loss": 0.2915, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 2.360655737704918, | |
| "grad_norm": 0.7131007135911857, | |
| "learning_rate": 1.0493827160493827e-05, | |
| "loss": 0.2789, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 2.3934426229508197, | |
| "grad_norm": 0.5947845671986667, | |
| "learning_rate": 9.876543209876543e-06, | |
| "loss": 0.2271, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 2.4262295081967213, | |
| "grad_norm": 0.585559196088907, | |
| "learning_rate": 9.259259259259259e-06, | |
| "loss": 0.2598, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 2.459016393442623, | |
| "grad_norm": 0.5903298722902863, | |
| "learning_rate": 8.641975308641975e-06, | |
| "loss": 0.2434, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 2.4918032786885247, | |
| "grad_norm": 0.5111285726002718, | |
| "learning_rate": 8.02469135802469e-06, | |
| "loss": 0.1805, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 2.5245901639344264, | |
| "grad_norm": 0.6602400571727327, | |
| "learning_rate": 7.4074074074074075e-06, | |
| "loss": 0.2585, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 2.557377049180328, | |
| "grad_norm": 0.5576606188458177, | |
| "learning_rate": 6.790123456790123e-06, | |
| "loss": 0.2444, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 2.5901639344262293, | |
| "grad_norm": 0.5636514910004302, | |
| "learning_rate": 6.172839506172839e-06, | |
| "loss": 0.1957, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 2.6229508196721314, | |
| "grad_norm": 0.5494326562606712, | |
| "learning_rate": 5.555555555555556e-06, | |
| "loss": 0.2118, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 2.6557377049180326, | |
| "grad_norm": 0.6306209914601519, | |
| "learning_rate": 4.938271604938272e-06, | |
| "loss": 0.2355, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 2.6885245901639343, | |
| "grad_norm": 0.5282030410948224, | |
| "learning_rate": 4.3209876543209875e-06, | |
| "loss": 0.2291, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 2.721311475409836, | |
| "grad_norm": 0.5951652057378445, | |
| "learning_rate": 3.7037037037037037e-06, | |
| "loss": 0.2143, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 2.7540983606557377, | |
| "grad_norm": 0.6232143958744484, | |
| "learning_rate": 3.0864197530864196e-06, | |
| "loss": 0.261, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 2.7868852459016393, | |
| "grad_norm": 0.5718059519402111, | |
| "learning_rate": 2.469135802469136e-06, | |
| "loss": 0.3096, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 2.819672131147541, | |
| "grad_norm": 0.5749143241777619, | |
| "learning_rate": 1.8518518518518519e-06, | |
| "loss": 0.2421, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 2.8524590163934427, | |
| "grad_norm": 0.5732337864471968, | |
| "learning_rate": 1.234567901234568e-06, | |
| "loss": 0.244, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 2.8852459016393444, | |
| "grad_norm": 0.6137567991367067, | |
| "learning_rate": 6.17283950617284e-07, | |
| "loss": 0.2497, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 2.918032786885246, | |
| "grad_norm": 0.5468536373030446, | |
| "learning_rate": 0.0, | |
| "loss": 0.2038, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 2.918032786885246, | |
| "step": 90, | |
| "total_flos": 43494039879680.0, | |
| "train_loss": 0.45132542418109045, | |
| "train_runtime": 3156.2675, | |
| "train_samples_per_second": 0.464, | |
| "train_steps_per_second": 0.029 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 90, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 43494039879680.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |