{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.961038961038961,
  "eval_steps": 500,
  "global_step": 114,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03,
      "learning_rate": 5e-06,
      "loss": 8.853,
      "step": 1
    },
    {
      "epoch": 0.05,
      "learning_rate": 1e-05,
      "loss": 8.3278,
      "step": 2
    },
    {
      "epoch": 0.08,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 5.9423,
      "step": 3
    },
    {
      "epoch": 0.1,
      "learning_rate": 2e-05,
      "loss": 2.6977,
      "step": 4
    },
    {
      "epoch": 0.13,
      "learning_rate": 1.9995921928281893e-05,
      "loss": 2.0802,
      "step": 5
    },
    {
      "epoch": 0.16,
      "learning_rate": 1.9983691039261358e-05,
      "loss": 1.6523,
      "step": 6
    },
    {
      "epoch": 0.18,
      "learning_rate": 1.9963317308626916e-05,
      "loss": 1.0093,
      "step": 7
    },
    {
      "epoch": 0.21,
      "learning_rate": 1.99348173534855e-05,
      "loss": 1.2197,
      "step": 8
    },
    {
      "epoch": 0.23,
      "learning_rate": 1.989821441880933e-05,
      "loss": 0.9122,
      "step": 9
    },
    {
      "epoch": 0.26,
      "learning_rate": 1.9853538358476933e-05,
      "loss": 0.7479,
      "step": 10
    },
    {
      "epoch": 0.29,
      "learning_rate": 1.9800825610923937e-05,
      "loss": 0.927,
      "step": 11
    },
    {
      "epoch": 0.31,
      "learning_rate": 1.9740119169423337e-05,
      "loss": 0.5945,
      "step": 12
    },
    {
      "epoch": 0.34,
      "learning_rate": 1.9671468547019575e-05,
      "loss": 0.6373,
      "step": 13
    },
    {
      "epoch": 0.36,
      "learning_rate": 1.9594929736144978e-05,
      "loss": 0.5594,
      "step": 14
    },
    {
      "epoch": 0.39,
      "learning_rate": 1.9510565162951538e-05,
      "loss": 0.61,
      "step": 15
    },
    {
      "epoch": 0.42,
      "learning_rate": 1.941844363639525e-05,
      "loss": 0.5487,
      "step": 16
    },
    {
      "epoch": 0.44,
      "learning_rate": 1.9318640292114526e-05,
      "loss": 0.5727,
      "step": 17
    },
    {
      "epoch": 0.47,
      "learning_rate": 1.92112365311485e-05,
      "loss": 0.4994,
      "step": 18
    },
    {
      "epoch": 0.49,
      "learning_rate": 1.9096319953545186e-05,
      "loss": 0.6802,
      "step": 19
    },
    {
      "epoch": 0.52,
      "learning_rate": 1.8973984286913584e-05,
      "loss": 0.5732,
      "step": 20
    },
    {
      "epoch": 0.55,
      "learning_rate": 1.8844329309978146e-05,
      "loss": 0.4517,
      "step": 21
    },
    {
      "epoch": 0.57,
      "learning_rate": 1.8707460771197773e-05,
      "loss": 0.4944,
      "step": 22
    },
    {
      "epoch": 0.6,
      "learning_rate": 1.856349030251589e-05,
      "loss": 0.4534,
      "step": 23
    },
    {
      "epoch": 0.62,
      "learning_rate": 1.8412535328311813e-05,
      "loss": 0.4643,
      "step": 24
    },
    {
      "epoch": 0.65,
      "learning_rate": 1.825471896962774e-05,
      "loss": 0.3858,
      "step": 25
    },
    {
      "epoch": 0.68,
      "learning_rate": 1.8090169943749477e-05,
      "loss": 0.3925,
      "step": 26
    },
    {
      "epoch": 0.7,
      "learning_rate": 1.7919022459222754e-05,
      "loss": 0.4107,
      "step": 27
    },
    {
      "epoch": 0.73,
      "learning_rate": 1.7741416106390828e-05,
      "loss": 0.4385,
      "step": 28
    },
    {
      "epoch": 0.75,
      "learning_rate": 1.7557495743542586e-05,
      "loss": 0.3052,
      "step": 29
    },
    {
      "epoch": 0.78,
      "learning_rate": 1.736741137876405e-05,
      "loss": 0.2988,
      "step": 30
    },
    {
      "epoch": 0.81,
      "learning_rate": 1.7171318047589637e-05,
      "loss": 0.3424,
      "step": 31
    },
    {
      "epoch": 0.83,
      "learning_rate": 1.696937568655294e-05,
      "loss": 0.4287,
      "step": 32
    },
    {
      "epoch": 0.86,
      "learning_rate": 1.6761749002740195e-05,
      "loss": 0.394,
      "step": 33
    },
    {
      "epoch": 0.88,
      "learning_rate": 1.6548607339452853e-05,
      "loss": 0.4674,
      "step": 34
    },
    {
      "epoch": 0.91,
      "learning_rate": 1.6330124538088705e-05,
      "loss": 0.3405,
      "step": 35
    },
    {
      "epoch": 0.94,
      "learning_rate": 1.6106478796354382e-05,
      "loss": 0.3606,
      "step": 36
    },
    {
      "epoch": 0.96,
      "learning_rate": 1.5877852522924733e-05,
      "loss": 0.2745,
      "step": 37
    },
    {
      "epoch": 0.99,
      "learning_rate": 1.5644432188667695e-05,
      "loss": 0.3599,
      "step": 38
    },
    {
      "epoch": 1.01,
      "learning_rate": 1.5406408174555978e-05,
      "loss": 0.2998,
      "step": 39
    },
    {
      "epoch": 1.04,
      "learning_rate": 1.5163974616389621e-05,
      "loss": 0.2837,
      "step": 40
    },
    {
      "epoch": 1.06,
      "learning_rate": 1.491732924645604e-05,
      "loss": 0.204,
      "step": 41
    },
    {
      "epoch": 1.09,
      "learning_rate": 1.4666673232256738e-05,
      "loss": 0.2637,
      "step": 42
    },
    {
      "epoch": 1.12,
      "learning_rate": 1.4412211012432213e-05,
      "loss": 0.2783,
      "step": 43
    },
    {
      "epoch": 1.14,
      "learning_rate": 1.4154150130018867e-05,
      "loss": 0.2416,
      "step": 44
    },
    {
      "epoch": 1.17,
      "learning_rate": 1.3892701063173917e-05,
      "loss": 0.2795,
      "step": 45
    },
    {
      "epoch": 1.19,
      "learning_rate": 1.362807705350641e-05,
      "loss": 0.2432,
      "step": 46
    },
    {
      "epoch": 1.22,
      "learning_rate": 1.3360493932154301e-05,
      "loss": 0.2224,
      "step": 47
    },
    {
      "epoch": 1.25,
      "learning_rate": 1.3090169943749475e-05,
      "loss": 0.2528,
      "step": 48
    },
    {
      "epoch": 1.27,
      "learning_rate": 1.2817325568414299e-05,
      "loss": 0.2461,
      "step": 49
    },
    {
      "epoch": 1.3,
      "learning_rate": 1.2542183341934873e-05,
      "loss": 0.2683,
      "step": 50
    },
    {
      "epoch": 1.32,
      "learning_rate": 1.2264967674257647e-05,
      "loss": 0.2396,
      "step": 51
    },
    {
      "epoch": 1.35,
      "learning_rate": 1.1985904666457455e-05,
      "loss": 0.2429,
      "step": 52
    },
    {
      "epoch": 1.38,
      "learning_rate": 1.170522192632624e-05,
      "loss": 0.2568,
      "step": 53
    },
    {
      "epoch": 1.4,
      "learning_rate": 1.1423148382732854e-05,
      "loss": 0.2612,
      "step": 54
    },
    {
      "epoch": 1.43,
      "learning_rate": 1.1139914098905406e-05,
      "loss": 0.2221,
      "step": 55
    },
    {
      "epoch": 1.45,
      "learning_rate": 1.08557500847884e-05,
      "loss": 0.2526,
      "step": 56
    },
    {
      "epoch": 1.48,
      "learning_rate": 1.0570888108627682e-05,
      "loss": 0.2413,
      "step": 57
    },
    {
      "epoch": 1.51,
      "learning_rate": 1.0285560507936962e-05,
      "loss": 0.1982,
      "step": 58
    },
    {
      "epoch": 1.53,
      "learning_rate": 1e-05,
      "loss": 0.1877,
      "step": 59
    },
    {
      "epoch": 1.56,
      "learning_rate": 9.71443949206304e-06,
      "loss": 0.2348,
      "step": 60
    },
    {
      "epoch": 1.58,
      "learning_rate": 9.42911189137232e-06,
      "loss": 0.2131,
      "step": 61
    },
    {
      "epoch": 1.61,
      "learning_rate": 9.144249915211605e-06,
      "loss": 0.2035,
      "step": 62
    },
    {
      "epoch": 1.64,
      "learning_rate": 8.860085901094595e-06,
      "loss": 0.2275,
      "step": 63
    },
    {
      "epoch": 1.66,
      "learning_rate": 8.576851617267151e-06,
      "loss": 0.2132,
      "step": 64
    },
    {
      "epoch": 1.69,
      "learning_rate": 8.294778073673762e-06,
      "loss": 0.2492,
      "step": 65
    },
    {
      "epoch": 1.71,
      "learning_rate": 8.014095333542548e-06,
      "loss": 0.2203,
      "step": 66
    },
    {
      "epoch": 1.74,
      "learning_rate": 7.735032325742355e-06,
      "loss": 0.212,
      "step": 67
    },
    {
      "epoch": 1.77,
      "learning_rate": 7.4578166580651335e-06,
      "loss": 0.2255,
      "step": 68
    },
    {
      "epoch": 1.79,
      "learning_rate": 7.182674431585703e-06,
      "loss": 0.2004,
      "step": 69
    },
    {
      "epoch": 1.82,
      "learning_rate": 6.909830056250527e-06,
      "loss": 0.2618,
      "step": 70
    },
    {
      "epoch": 1.84,
      "learning_rate": 6.639506067845698e-06,
      "loss": 0.205,
      "step": 71
    },
    {
      "epoch": 1.87,
      "learning_rate": 6.3719229464935915e-06,
      "loss": 0.1988,
      "step": 72
    },
    {
      "epoch": 1.9,
      "learning_rate": 6.107298936826086e-06,
      "loss": 0.2342,
      "step": 73
    },
    {
      "epoch": 1.92,
      "learning_rate": 5.845849869981137e-06,
      "loss": 0.2351,
      "step": 74
    },
    {
      "epoch": 1.95,
      "learning_rate": 5.587788987567785e-06,
      "loss": 0.2241,
      "step": 75
    },
    {
      "epoch": 1.97,
      "learning_rate": 5.333326767743263e-06,
      "loss": 0.2031,
      "step": 76
    },
    {
      "epoch": 2.0,
      "learning_rate": 5.082670753543961e-06,
      "loss": 0.1787,
      "step": 77
    },
    {
      "epoch": 2.03,
      "learning_rate": 4.836025383610382e-06,
      "loss": 0.1162,
      "step": 78
    },
    {
      "epoch": 2.05,
      "learning_rate": 4.593591825444028e-06,
      "loss": 0.1533,
      "step": 79
    },
    {
      "epoch": 2.08,
      "learning_rate": 4.355567811332311e-06,
      "loss": 0.1156,
      "step": 80
    },
    {
      "epoch": 2.1,
      "learning_rate": 4.12214747707527e-06,
      "loss": 0.1691,
      "step": 81
    },
    {
      "epoch": 2.13,
      "learning_rate": 3.893521203645618e-06,
      "loss": 0.1713,
      "step": 82
    },
    {
      "epoch": 2.16,
      "learning_rate": 3.6698754619112974e-06,
      "loss": 0.1608,
      "step": 83
    },
    {
      "epoch": 2.18,
      "learning_rate": 3.4513926605471504e-06,
      "loss": 0.1553,
      "step": 84
    },
    {
      "epoch": 2.21,
      "learning_rate": 3.2382509972598087e-06,
      "loss": 0.1298,
      "step": 85
    },
    {
      "epoch": 2.23,
      "learning_rate": 3.0306243134470668e-06,
      "loss": 0.1794,
      "step": 86
    },
    {
      "epoch": 2.26,
      "learning_rate": 2.8286819524103657e-06,
      "loss": 0.1235,
      "step": 87
    },
    {
      "epoch": 2.29,
      "learning_rate": 2.6325886212359496e-06,
      "loss": 0.1465,
      "step": 88
    },
    {
      "epoch": 2.31,
      "learning_rate": 2.4425042564574186e-06,
      "loss": 0.1605,
      "step": 89
    },
    {
      "epoch": 2.34,
      "learning_rate": 2.2585838936091753e-06,
      "loss": 0.1318,
      "step": 90
    },
    {
      "epoch": 2.36,
      "learning_rate": 2.0809775407772505e-06,
      "loss": 0.158,
      "step": 91
    },
    {
      "epoch": 2.39,
      "learning_rate": 1.9098300562505266e-06,
      "loss": 0.1413,
      "step": 92
    },
    {
      "epoch": 2.42,
      "learning_rate": 1.74528103037226e-06,
      "loss": 0.2054,
      "step": 93
    },
    {
      "epoch": 2.44,
      "learning_rate": 1.587464671688187e-06,
      "loss": 0.1365,
      "step": 94
    },
    {
      "epoch": 2.47,
      "learning_rate": 1.436509697484111e-06,
      "loss": 0.133,
      "step": 95
    },
    {
      "epoch": 2.49,
      "learning_rate": 1.2925392288022299e-06,
      "loss": 0.154,
      "step": 96
    },
    {
      "epoch": 2.52,
      "learning_rate": 1.1556706900218572e-06,
      "loss": 0.191,
      "step": 97
    },
    {
      "epoch": 2.55,
      "learning_rate": 1.0260157130864178e-06,
      "loss": 0.1685,
      "step": 98
    },
    {
      "epoch": 2.57,
      "learning_rate": 9.036800464548157e-07,
      "loss": 0.1512,
      "step": 99
    },
    {
      "epoch": 2.6,
      "learning_rate": 7.887634688515e-07,
      "loss": 0.1472,
      "step": 100
    },
    {
      "epoch": 2.62,
      "learning_rate": 6.813597078854772e-07,
      "loss": 0.1433,
      "step": 101
    },
    {
      "epoch": 2.65,
      "learning_rate": 5.815563636047539e-07,
      "loss": 0.1774,
      "step": 102
    },
    {
      "epoch": 2.68,
      "learning_rate": 4.894348370484648e-07,
      "loss": 0.161,
      "step": 103
    },
    {
      "epoch": 2.7,
      "learning_rate": 4.0507026385502747e-07,
      "loss": 0.1705,
      "step": 104
    },
    {
      "epoch": 2.73,
      "learning_rate": 3.2853145298042954e-07,
      "loss": 0.1515,
      "step": 105
    },
    {
      "epoch": 2.75,
      "learning_rate": 2.5988083057666534e-07,
      "loss": 0.165,
      "step": 106
    },
    {
      "epoch": 2.78,
      "learning_rate": 1.9917438907606556e-07,
      "loss": 0.1492,
      "step": 107
    },
    {
      "epoch": 2.81,
      "learning_rate": 1.464616415230702e-07,
      "loss": 0.1548,
      "step": 108
    },
    {
      "epoch": 2.83,
      "learning_rate": 1.0178558119067316e-07,
      "loss": 0.1562,
      "step": 109
    },
    {
      "epoch": 2.86,
      "learning_rate": 6.51826465144978e-08,
      "loss": 0.1532,
      "step": 110
    },
    {
      "epoch": 2.88,
      "learning_rate": 3.668269137308666e-08,
      "loss": 0.1501,
      "step": 111
    },
    {
      "epoch": 2.91,
      "learning_rate": 1.630896073864352e-08,
      "loss": 0.1639,
      "step": 112
    },
    {
      "epoch": 2.94,
      "learning_rate": 4.0780717181077015e-09,
      "loss": 0.1539,
      "step": 113
    },
    {
      "epoch": 2.96,
      "learning_rate": 0.0,
      "loss": 0.1866,
      "step": 114
    },
    {
      "epoch": 2.96,
      "step": 114,
      "total_flos": 118111256969216.0,
      "train_loss": 0.5400874392505277,
      "train_runtime": 1384.0957,
      "train_samples_per_second": 21.317,
      "train_steps_per_second": 0.082
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 114,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "total_flos": 118111256969216.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}