{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 6.612048192771084,
  "eval_steps": 500,
  "global_step": 343,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0963855421686747,
      "grad_norm": 1.7634485960006714,
      "learning_rate": 4.998814299283415e-05,
      "loss": 0.8996,
      "num_input_tokens_seen": 78528,
      "step": 5
    },
    {
      "epoch": 0.1927710843373494,
      "grad_norm": 1.3068124055862427,
      "learning_rate": 4.995258321842611e-05,
      "loss": 0.6806,
      "num_input_tokens_seen": 159120,
      "step": 10
    },
    {
      "epoch": 0.2891566265060241,
      "grad_norm": 1.2104840278625488,
      "learning_rate": 4.989335440737586e-05,
      "loss": 0.618,
      "num_input_tokens_seen": 223552,
      "step": 15
    },
    {
      "epoch": 0.3855421686746988,
      "grad_norm": 1.4112542867660522,
      "learning_rate": 4.98105127417984e-05,
      "loss": 0.5594,
      "num_input_tokens_seen": 290944,
      "step": 20
    },
    {
      "epoch": 0.4819277108433735,
      "grad_norm": 0.9026587605476379,
      "learning_rate": 4.9704136802031485e-05,
      "loss": 0.5253,
      "num_input_tokens_seen": 364064,
      "step": 25
    },
    {
      "epoch": 0.5783132530120482,
      "grad_norm": 0.9427546858787537,
      "learning_rate": 4.957432749209755e-05,
      "loss": 0.4794,
      "num_input_tokens_seen": 440176,
      "step": 30
    },
    {
      "epoch": 0.6746987951807228,
      "grad_norm": 1.0594468116760254,
      "learning_rate": 4.942120794399002e-05,
      "loss": 0.4546,
      "num_input_tokens_seen": 517184,
      "step": 35
    },
    {
      "epoch": 0.7710843373493976,
      "grad_norm": 0.9458279013633728,
      "learning_rate": 4.9244923400875245e-05,
      "loss": 0.4703,
      "num_input_tokens_seen": 591424,
      "step": 40
    },
    {
      "epoch": 0.8674698795180723,
      "grad_norm": 1.1610336303710938,
      "learning_rate": 4.9045641079320484e-05,
      "loss": 0.4407,
      "num_input_tokens_seen": 662784,
      "step": 45
    },
    {
      "epoch": 0.963855421686747,
      "grad_norm": 1.0153354406356812,
      "learning_rate": 4.882355001067892e-05,
      "loss": 0.4425,
      "num_input_tokens_seen": 734784,
      "step": 50
    },
    {
      "epoch": 1.0602409638554218,
      "grad_norm": 1.0889695882797241,
      "learning_rate": 4.857886086178194e-05,
      "loss": 0.4081,
      "num_input_tokens_seen": 808336,
      "step": 55
    },
    {
      "epoch": 1.1566265060240963,
      "grad_norm": 0.9168598055839539,
      "learning_rate": 4.8311805735108894e-05,
      "loss": 0.4002,
      "num_input_tokens_seen": 882672,
      "step": 60
    },
    {
      "epoch": 1.2530120481927711,
      "grad_norm": 0.8168660998344421,
      "learning_rate": 4.802263794862385e-05,
      "loss": 0.3587,
      "num_input_tokens_seen": 947680,
      "step": 65
    },
    {
      "epoch": 1.3493975903614457,
      "grad_norm": 1.0652003288269043,
      "learning_rate": 4.7711631795488096e-05,
      "loss": 0.356,
      "num_input_tokens_seen": 1022112,
      "step": 70
    },
    {
      "epoch": 1.4457831325301205,
      "grad_norm": 1.1781517267227173,
      "learning_rate": 4.7379082283876566e-05,
      "loss": 0.3639,
      "num_input_tokens_seen": 1091744,
      "step": 75
    },
    {
      "epoch": 1.5421686746987953,
      "grad_norm": 1.0550976991653442,
      "learning_rate": 4.702530485714461e-05,
      "loss": 0.3288,
      "num_input_tokens_seen": 1163728,
      "step": 80
    },
    {
      "epoch": 1.6385542168674698,
      "grad_norm": 1.3946661949157715,
      "learning_rate": 4.665063509461097e-05,
      "loss": 0.3563,
      "num_input_tokens_seen": 1245728,
      "step": 85
    },
    {
      "epoch": 1.7349397590361446,
      "grad_norm": 1.1458536386489868,
      "learning_rate": 4.625542839324036e-05,
      "loss": 0.3642,
      "num_input_tokens_seen": 1315056,
      "step": 90
    },
    {
      "epoch": 1.8313253012048194,
      "grad_norm": 1.0227209329605103,
      "learning_rate": 4.584005963052799e-05,
      "loss": 0.3407,
      "num_input_tokens_seen": 1392224,
      "step": 95
    },
    {
      "epoch": 1.927710843373494,
      "grad_norm": 1.0699985027313232,
      "learning_rate": 4.540492280890555e-05,
      "loss": 0.3216,
      "num_input_tokens_seen": 1471008,
      "step": 100
    },
    {
      "epoch": 2.0240963855421685,
      "grad_norm": 0.8573477268218994,
      "learning_rate": 4.4950430682006e-05,
      "loss": 0.3197,
      "num_input_tokens_seen": 1546912,
      "step": 105
    },
    {
      "epoch": 2.1204819277108435,
      "grad_norm": 1.1516242027282715,
      "learning_rate": 4.447701436314176e-05,
      "loss": 0.2904,
      "num_input_tokens_seen": 1611328,
      "step": 110
    },
    {
      "epoch": 2.216867469879518,
      "grad_norm": 1.0890793800354004,
      "learning_rate": 4.398512291636768e-05,
      "loss": 0.2498,
      "num_input_tokens_seen": 1682528,
      "step": 115
    },
    {
      "epoch": 2.3132530120481927,
      "grad_norm": 1.3621636629104614,
      "learning_rate": 4.347522293051648e-05,
      "loss": 0.269,
      "num_input_tokens_seen": 1751856,
      "step": 120
    },
    {
      "epoch": 2.4096385542168672,
      "grad_norm": 1.338083028793335,
      "learning_rate": 4.294779807661105e-05,
      "loss": 0.2838,
      "num_input_tokens_seen": 1830288,
      "step": 125
    },
    {
      "epoch": 2.5060240963855422,
      "grad_norm": 1.2083592414855957,
      "learning_rate": 4.2403348649073174e-05,
      "loss": 0.2466,
      "num_input_tokens_seen": 1905296,
      "step": 130
    },
    {
      "epoch": 2.602409638554217,
      "grad_norm": 1.35024094581604,
      "learning_rate": 4.184239109116393e-05,
      "loss": 0.2272,
      "num_input_tokens_seen": 1974464,
      "step": 135
    },
    {
      "epoch": 2.6987951807228914,
      "grad_norm": 1.3738912343978882,
      "learning_rate": 4.126545750510605e-05,
      "loss": 0.2484,
      "num_input_tokens_seen": 2058176,
      "step": 140
    },
    {
      "epoch": 2.7951807228915664,
      "grad_norm": 1.5877448320388794,
      "learning_rate": 4.067309514735267e-05,
      "loss": 0.2339,
      "num_input_tokens_seen": 2124912,
      "step": 145
    },
    {
      "epoch": 2.891566265060241,
      "grad_norm": 1.3735121488571167,
      "learning_rate": 4.0065865909481417e-05,
      "loss": 0.2597,
      "num_input_tokens_seen": 2213456,
      "step": 150
    },
    {
      "epoch": 2.9879518072289155,
      "grad_norm": 1.6480368375778198,
      "learning_rate": 3.9444345785206285e-05,
      "loss": 0.2525,
      "num_input_tokens_seen": 2281680,
      "step": 155
    },
    {
      "epoch": 3.0843373493975905,
      "grad_norm": 1.2931358814239502,
      "learning_rate": 3.880912432401265e-05,
      "loss": 0.1832,
      "num_input_tokens_seen": 2349408,
      "step": 160
    },
    {
      "epoch": 3.180722891566265,
      "grad_norm": 1.4131468534469604,
      "learning_rate": 3.81608040719339e-05,
      "loss": 0.1519,
      "num_input_tokens_seen": 2425456,
      "step": 165
    },
    {
      "epoch": 3.2771084337349397,
      "grad_norm": 1.6228159666061401,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 0.1707,
      "num_input_tokens_seen": 2494064,
      "step": 170
    },
    {
      "epoch": 3.3734939759036147,
      "grad_norm": 1.1356842517852783,
      "learning_rate": 3.6827338920900254e-05,
      "loss": 0.1603,
      "num_input_tokens_seen": 2573616,
      "step": 175
    },
    {
      "epoch": 3.4698795180722892,
      "grad_norm": 1.3535553216934204,
      "learning_rate": 3.6143458894413465e-05,
      "loss": 0.1683,
      "num_input_tokens_seen": 2657744,
      "step": 180
    },
    {
      "epoch": 3.566265060240964,
      "grad_norm": 1.3832409381866455,
      "learning_rate": 3.544900862216959e-05,
      "loss": 0.1734,
      "num_input_tokens_seen": 2721200,
      "step": 185
    },
    {
      "epoch": 3.662650602409639,
      "grad_norm": 1.6430705785751343,
      "learning_rate": 3.474464683231698e-05,
      "loss": 0.1543,
      "num_input_tokens_seen": 2798320,
      "step": 190
    },
    {
      "epoch": 3.7590361445783134,
      "grad_norm": 1.7706836462020874,
      "learning_rate": 3.403104165467883e-05,
      "loss": 0.1601,
      "num_input_tokens_seen": 2879200,
      "step": 195
    },
    {
      "epoch": 3.855421686746988,
      "grad_norm": 1.7721610069274902,
      "learning_rate": 3.330886998699149e-05,
      "loss": 0.1911,
      "num_input_tokens_seen": 2947024,
      "step": 200
    },
    {
      "epoch": 3.9518072289156625,
      "grad_norm": 1.666278600692749,
      "learning_rate": 3.257881685282609e-05,
      "loss": 0.1741,
      "num_input_tokens_seen": 3016656,
      "step": 205
    },
    {
      "epoch": 4.048192771084337,
      "grad_norm": 1.099639892578125,
      "learning_rate": 3.1841574751802076e-05,
      "loss": 0.1334,
      "num_input_tokens_seen": 3084416,
      "step": 210
    },
    {
      "epoch": 4.144578313253012,
      "grad_norm": 1.5020925998687744,
      "learning_rate": 3.109784300270943e-05,
      "loss": 0.1027,
      "num_input_tokens_seen": 3166784,
      "step": 215
    },
    {
      "epoch": 4.240963855421687,
      "grad_norm": 2.203794240951538,
      "learning_rate": 3.0348327080162435e-05,
      "loss": 0.0955,
      "num_input_tokens_seen": 3239584,
      "step": 220
    },
    {
      "epoch": 4.337349397590361,
      "grad_norm": 1.7183223962783813,
      "learning_rate": 2.9593737945414264e-05,
      "loss": 0.1006,
      "num_input_tokens_seen": 3313360,
      "step": 225
    },
    {
      "epoch": 4.433734939759036,
      "grad_norm": 1.4102908372879028,
      "learning_rate": 2.8834791371967142e-05,
      "loss": 0.1007,
      "num_input_tokens_seen": 3377840,
      "step": 230
    },
    {
      "epoch": 4.530120481927711,
      "grad_norm": 1.214020013809204,
      "learning_rate": 2.8072207266617855e-05,
      "loss": 0.1033,
      "num_input_tokens_seen": 3455904,
      "step": 235
    },
    {
      "epoch": 4.626506024096385,
      "grad_norm": 1.5255635976791382,
      "learning_rate": 2.7306708986582553e-05,
      "loss": 0.1023,
      "num_input_tokens_seen": 3529360,
      "step": 240
    },
    {
      "epoch": 4.72289156626506,
      "grad_norm": 1.6624009609222412,
      "learning_rate": 2.653902265334858e-05,
      "loss": 0.1121,
      "num_input_tokens_seen": 3605344,
      "step": 245
    },
    {
      "epoch": 4.8192771084337345,
      "grad_norm": 1.7999521493911743,
      "learning_rate": 2.5769876463904265e-05,
      "loss": 0.1028,
      "num_input_tokens_seen": 3678352,
      "step": 250
    },
    {
      "epoch": 4.9156626506024095,
      "grad_norm": 2.1297786235809326,
      "learning_rate": 2.5e-05,
      "loss": 0.1055,
      "num_input_tokens_seen": 3752608,
      "step": 255
    },
    {
      "epoch": 5.0120481927710845,
      "grad_norm": 1.215146780014038,
      "learning_rate": 2.4230123536095748e-05,
      "loss": 0.1037,
      "num_input_tokens_seen": 3819744,
      "step": 260
    },
    {
      "epoch": 5.108433734939759,
      "grad_norm": 1.448801040649414,
      "learning_rate": 2.346097734665143e-05,
      "loss": 0.0633,
      "num_input_tokens_seen": 3896592,
      "step": 265
    },
    {
      "epoch": 5.204819277108434,
      "grad_norm": 1.220989465713501,
      "learning_rate": 2.2693291013417453e-05,
      "loss": 0.0521,
      "num_input_tokens_seen": 3970976,
      "step": 270
    },
    {
      "epoch": 5.301204819277109,
      "grad_norm": 1.3077821731567383,
      "learning_rate": 2.192779273338215e-05,
      "loss": 0.0625,
      "num_input_tokens_seen": 4051760,
      "step": 275
    },
    {
      "epoch": 5.397590361445783,
      "grad_norm": 2.02695369720459,
      "learning_rate": 2.116520862803286e-05,
      "loss": 0.059,
      "num_input_tokens_seen": 4124096,
      "step": 280
    },
    {
      "epoch": 5.493975903614458,
      "grad_norm": 1.6377320289611816,
      "learning_rate": 2.0406262054585738e-05,
      "loss": 0.0648,
      "num_input_tokens_seen": 4188448,
      "step": 285
    },
    {
      "epoch": 5.590361445783133,
      "grad_norm": 1.6187361478805542,
      "learning_rate": 1.965167291983757e-05,
      "loss": 0.0709,
      "num_input_tokens_seen": 4261056,
      "step": 290
    },
    {
      "epoch": 5.686746987951807,
      "grad_norm": 1.4855268001556396,
      "learning_rate": 1.890215699729057e-05,
      "loss": 0.0641,
      "num_input_tokens_seen": 4329024,
      "step": 295
    },
    {
      "epoch": 5.783132530120482,
      "grad_norm": 1.4216831922531128,
      "learning_rate": 1.815842524819793e-05,
      "loss": 0.0689,
      "num_input_tokens_seen": 4406624,
      "step": 300
    },
    {
      "epoch": 5.879518072289157,
      "grad_norm": 1.7383759021759033,
      "learning_rate": 1.7421183147173915e-05,
      "loss": 0.055,
      "num_input_tokens_seen": 4480352,
      "step": 305
    },
    {
      "epoch": 5.975903614457831,
      "grad_norm": 1.5599803924560547,
      "learning_rate": 1.6691130013008514e-05,
      "loss": 0.0626,
      "num_input_tokens_seen": 4554080,
      "step": 310
    },
    {
      "epoch": 6.072289156626506,
      "grad_norm": 1.028124213218689,
      "learning_rate": 1.5968958345321178e-05,
      "loss": 0.0465,
      "num_input_tokens_seen": 4628576,
      "step": 315
    },
    {
      "epoch": 6.168674698795181,
      "grad_norm": 1.4686311483383179,
      "learning_rate": 1.5255353167683017e-05,
      "loss": 0.0421,
      "num_input_tokens_seen": 4704512,
      "step": 320
    },
    {
      "epoch": 6.265060240963855,
      "grad_norm": 1.1644634008407593,
      "learning_rate": 1.4550991377830426e-05,
      "loss": 0.0303,
      "num_input_tokens_seen": 4776912,
      "step": 325
    },
    {
      "epoch": 6.36144578313253,
      "grad_norm": 1.090997338294983,
      "learning_rate": 1.3856541105586545e-05,
      "loss": 0.0337,
      "num_input_tokens_seen": 4855600,
      "step": 330
    },
    {
      "epoch": 6.457831325301205,
      "grad_norm": 1.4336110353469849,
      "learning_rate": 1.3172661079099752e-05,
      "loss": 0.0333,
      "num_input_tokens_seen": 4927600,
      "step": 335
    },
    {
      "epoch": 6.554216867469879,
      "grad_norm": 1.3488271236419678,
      "learning_rate": 1.2500000000000006e-05,
      "loss": 0.039,
      "num_input_tokens_seen": 5003184,
      "step": 340
    },
    {
      "epoch": 6.612048192771084,
      "num_input_tokens_seen": 5052448,
      "step": 343,
      "total_flos": 2.28822660837802e+17,
      "train_loss": 0.2300173058541106,
      "train_runtime": 12675.9679,
      "train_samples_per_second": 0.655,
      "train_steps_per_second": 0.04
    }
  ],
  "logging_steps": 5,
  "max_steps": 510,
  "num_input_tokens_seen": 5052448,
  "num_train_epochs": 10,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.28822660837802e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}