| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.991304347826087, |
| "eval_steps": 500, |
| "global_step": 76, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.013043478260869565, |
| "grad_norm": 30.641691207885742, |
| "learning_rate": 5.0000000000000004e-08, |
| "loss": 2.1951, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.02608695652173913, |
| "grad_norm": 31.937227249145508, |
| "learning_rate": 1.0000000000000001e-07, |
| "loss": 2.274, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.0391304347826087, |
| "grad_norm": 30.902198791503906, |
| "learning_rate": 1.5000000000000002e-07, |
| "loss": 2.1994, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.05217391304347826, |
| "grad_norm": 31.316129684448242, |
| "learning_rate": 2.0000000000000002e-07, |
| "loss": 2.1975, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.06521739130434782, |
| "grad_norm": 30.678176879882812, |
| "learning_rate": 2.5000000000000004e-07, |
| "loss": 2.1836, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.0782608695652174, |
| "grad_norm": 31.651874542236328, |
| "learning_rate": 3.0000000000000004e-07, |
| "loss": 2.2658, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.09130434782608696, |
| "grad_norm": 31.49991226196289, |
| "learning_rate": 3.5000000000000004e-07, |
| "loss": 2.2324, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.10434782608695652, |
| "grad_norm": 31.158674240112305, |
| "learning_rate": 4.0000000000000003e-07, |
| "loss": 2.1922, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.11739130434782609, |
| "grad_norm": 30.786540985107422, |
| "learning_rate": 4.5000000000000003e-07, |
| "loss": 2.2302, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.13043478260869565, |
| "grad_norm": 30.784502029418945, |
| "learning_rate": 5.000000000000001e-07, |
| "loss": 2.1845, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.14347826086956522, |
| "grad_norm": 30.878541946411133, |
| "learning_rate": 5.5e-07, |
| "loss": 2.1471, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.1565217391304348, |
| "grad_norm": 30.616840362548828, |
| "learning_rate": 6.000000000000001e-07, |
| "loss": 2.086, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.16956521739130434, |
| "grad_norm": 29.676923751831055, |
| "learning_rate": 6.5e-07, |
| "loss": 2.0468, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.1826086956521739, |
| "grad_norm": 30.126493453979492, |
| "learning_rate": 7.000000000000001e-07, |
| "loss": 1.9897, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.1956521739130435, |
| "grad_norm": 30.251602172851562, |
| "learning_rate": 7.5e-07, |
| "loss": 1.9232, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.20869565217391303, |
| "grad_norm": 30.116798400878906, |
| "learning_rate": 8.000000000000001e-07, |
| "loss": 1.8406, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.2217391304347826, |
| "grad_norm": 30.881160736083984, |
| "learning_rate": 8.500000000000001e-07, |
| "loss": 1.7809, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.23478260869565218, |
| "grad_norm": 31.951828002929688, |
| "learning_rate": 9.000000000000001e-07, |
| "loss": 1.7001, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.24782608695652175, |
| "grad_norm": 31.537166595458984, |
| "learning_rate": 9.500000000000001e-07, |
| "loss": 1.5591, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.2608695652173913, |
| "grad_norm": 32.81351852416992, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 1.4578, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.27391304347826084, |
| "grad_norm": 31.268814086914062, |
| "learning_rate": 1.0500000000000001e-06, |
| "loss": 1.3437, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.28695652173913044, |
| "grad_norm": 29.851335525512695, |
| "learning_rate": 1.1e-06, |
| "loss": 1.2298, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 25.662965774536133, |
| "learning_rate": 1.1500000000000002e-06, |
| "loss": 1.0528, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.3130434782608696, |
| "grad_norm": 26.398494720458984, |
| "learning_rate": 1.2000000000000002e-06, |
| "loss": 0.9624, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.32608695652173914, |
| "grad_norm": 26.3111629486084, |
| "learning_rate": 1.25e-06, |
| "loss": 0.7787, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.3391304347826087, |
| "grad_norm": 28.78609275817871, |
| "learning_rate": 1.3e-06, |
| "loss": 0.6537, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.3521739130434783, |
| "grad_norm": 26.990314483642578, |
| "learning_rate": 1.3500000000000002e-06, |
| "loss": 0.4982, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.3652173913043478, |
| "grad_norm": 23.103364944458008, |
| "learning_rate": 1.4000000000000001e-06, |
| "loss": 0.3761, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.3782608695652174, |
| "grad_norm": 20.322370529174805, |
| "learning_rate": 1.45e-06, |
| "loss": 0.2816, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.391304347826087, |
| "grad_norm": 18.134693145751953, |
| "learning_rate": 1.5e-06, |
| "loss": 0.194, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.4043478260869565, |
| "grad_norm": 10.411937713623047, |
| "learning_rate": 1.5500000000000002e-06, |
| "loss": 0.1208, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.41739130434782606, |
| "grad_norm": 3.981872081756592, |
| "learning_rate": 1.6000000000000001e-06, |
| "loss": 0.0885, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.43043478260869567, |
| "grad_norm": 2.513179063796997, |
| "learning_rate": 1.6500000000000003e-06, |
| "loss": 0.0827, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.4434782608695652, |
| "grad_norm": 1.3037446737289429, |
| "learning_rate": 1.7000000000000002e-06, |
| "loss": 0.0738, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.45652173913043476, |
| "grad_norm": 1.5666987895965576, |
| "learning_rate": 1.75e-06, |
| "loss": 0.0737, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.46956521739130436, |
| "grad_norm": 1.8724833726882935, |
| "learning_rate": 1.8000000000000001e-06, |
| "loss": 0.0716, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.4826086956521739, |
| "grad_norm": 1.8552911281585693, |
| "learning_rate": 1.85e-06, |
| "loss": 0.0703, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.4956521739130435, |
| "grad_norm": 1.165390968322754, |
| "learning_rate": 1.9000000000000002e-06, |
| "loss": 0.0665, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.508695652173913, |
| "grad_norm": 1.1661189794540405, |
| "learning_rate": 1.9500000000000004e-06, |
| "loss": 0.0654, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.5217391304347826, |
| "grad_norm": 1.3230024576187134, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.0693, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.5347826086956522, |
| "grad_norm": 0.7362420558929443, |
| "learning_rate": 2.05e-06, |
| "loss": 0.0665, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.5478260869565217, |
| "grad_norm": 0.6983249187469482, |
| "learning_rate": 2.1000000000000002e-06, |
| "loss": 0.0695, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.5608695652173913, |
| "grad_norm": 1.134478211402893, |
| "learning_rate": 2.15e-06, |
| "loss": 0.0638, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.5739130434782609, |
| "grad_norm": 1.1987451314926147, |
| "learning_rate": 2.2e-06, |
| "loss": 0.0538, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.5869565217391305, |
| "grad_norm": 0.8745909929275513, |
| "learning_rate": 2.25e-06, |
| "loss": 0.0618, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 1.0686204433441162, |
| "learning_rate": 2.3000000000000004e-06, |
| "loss": 0.0662, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.6130434782608696, |
| "grad_norm": 0.6556565165519714, |
| "learning_rate": 2.35e-06, |
| "loss": 0.056, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.6260869565217392, |
| "grad_norm": 0.7634578943252563, |
| "learning_rate": 2.4000000000000003e-06, |
| "loss": 0.0594, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.6391304347826087, |
| "grad_norm": 0.8059038519859314, |
| "learning_rate": 2.4500000000000003e-06, |
| "loss": 0.0615, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.6521739130434783, |
| "grad_norm": 0.5894052982330322, |
| "learning_rate": 2.5e-06, |
| "loss": 0.058, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.6652173913043479, |
| "grad_norm": 0.5712239146232605, |
| "learning_rate": 2.55e-06, |
| "loss": 0.0589, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.6782608695652174, |
| "grad_norm": 0.6876189708709717, |
| "learning_rate": 2.6e-06, |
| "loss": 0.0607, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.691304347826087, |
| "grad_norm": 0.623663067817688, |
| "learning_rate": 2.6500000000000005e-06, |
| "loss": 0.0576, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.7043478260869566, |
| "grad_norm": 0.6075452566146851, |
| "learning_rate": 2.7000000000000004e-06, |
| "loss": 0.0612, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.717391304347826, |
| "grad_norm": 0.7780712842941284, |
| "learning_rate": 2.7500000000000004e-06, |
| "loss": 0.0566, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.7304347826086957, |
| "grad_norm": 0.6400704383850098, |
| "learning_rate": 2.8000000000000003e-06, |
| "loss": 0.0603, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.7434782608695653, |
| "grad_norm": 0.6284154653549194, |
| "learning_rate": 2.85e-06, |
| "loss": 0.0547, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.7565217391304347, |
| "grad_norm": 0.5499616265296936, |
| "learning_rate": 2.9e-06, |
| "loss": 0.0571, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.7695652173913043, |
| "grad_norm": 0.6163158416748047, |
| "learning_rate": 2.95e-06, |
| "loss": 0.0587, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.782608695652174, |
| "grad_norm": 0.684912919998169, |
| "learning_rate": 3e-06, |
| "loss": 0.0596, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.7956521739130434, |
| "grad_norm": 0.7246200442314148, |
| "learning_rate": 3.05e-06, |
| "loss": 0.0493, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.808695652173913, |
| "grad_norm": 0.7146164774894714, |
| "learning_rate": 3.1000000000000004e-06, |
| "loss": 0.0559, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.8217391304347826, |
| "grad_norm": 0.6179472804069519, |
| "learning_rate": 3.1500000000000003e-06, |
| "loss": 0.0604, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.8347826086956521, |
| "grad_norm": 0.7352550029754639, |
| "learning_rate": 3.2000000000000003e-06, |
| "loss": 0.0498, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.8478260869565217, |
| "grad_norm": 0.5682029128074646, |
| "learning_rate": 3.2500000000000002e-06, |
| "loss": 0.0562, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.8608695652173913, |
| "grad_norm": 0.4977523982524872, |
| "learning_rate": 3.3000000000000006e-06, |
| "loss": 0.0534, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.8739130434782608, |
| "grad_norm": 0.5390787124633789, |
| "learning_rate": 3.3500000000000005e-06, |
| "loss": 0.0534, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.8869565217391304, |
| "grad_norm": 0.712452232837677, |
| "learning_rate": 3.4000000000000005e-06, |
| "loss": 0.0528, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 0.5136511921882629, |
| "learning_rate": 3.45e-06, |
| "loss": 0.0529, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.9130434782608695, |
| "grad_norm": 0.5339029431343079, |
| "learning_rate": 3.5e-06, |
| "loss": 0.0489, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.9260869565217391, |
| "grad_norm": 0.647507905960083, |
| "learning_rate": 3.5500000000000003e-06, |
| "loss": 0.0477, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.9391304347826087, |
| "grad_norm": 0.5351842641830444, |
| "learning_rate": 3.6000000000000003e-06, |
| "loss": 0.0491, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.9521739130434783, |
| "grad_norm": 0.620066225528717, |
| "learning_rate": 3.65e-06, |
| "loss": 0.0477, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.9652173913043478, |
| "grad_norm": 0.5559766292572021, |
| "learning_rate": 3.7e-06, |
| "loss": 0.0444, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.9782608695652174, |
| "grad_norm": 0.5821895599365234, |
| "learning_rate": 3.7500000000000005e-06, |
| "loss": 0.0484, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.991304347826087, |
| "grad_norm": 0.6146248579025269, |
| "learning_rate": 3.8000000000000005e-06, |
| "loss": 0.046, |
| "step": 76 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 456, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 6, |
| "save_steps": 76, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.9197878895378432e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |