{
  "best_metric": 0.26356959342956543,
  "best_model_checkpoint": "./ryan_model3272024/checkpoint-1000",
  "epoch": 0.6496519721577726,
  "eval_steps": 100,
  "global_step": 1400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01,
      "grad_norm": 0.38699468970298767,
      "learning_rate": 0.0001994199535962877,
      "loss": 0.4038,
      "step": 25
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.6787680387496948,
      "learning_rate": 0.00019883990719257543,
      "loss": 0.4003,
      "step": 50
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.5743306279182434,
      "learning_rate": 0.00019825986078886312,
      "loss": 0.3591,
      "step": 75
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.41705068945884705,
      "learning_rate": 0.00019767981438515082,
      "loss": 0.3524,
      "step": 100
    },
    {
      "epoch": 0.05,
      "eval_loss": 0.339992493391037,
      "eval_na_accuracy": 0.7586872577667236,
      "eval_ordinal_accuracy": 0.38746026158332825,
      "eval_ordinal_mae": 0.8904515504837036,
      "eval_runtime": 335.205,
      "eval_samples_per_second": 11.87,
      "eval_steps_per_second": 1.486,
      "step": 100
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.36200761795043945,
      "learning_rate": 0.0001970997679814385,
      "loss": 0.3071,
      "step": 125
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.24589791893959045,
      "learning_rate": 0.00019651972157772623,
      "loss": 0.3475,
      "step": 150
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.6089735627174377,
      "learning_rate": 0.00019593967517401393,
      "loss": 0.3072,
      "step": 175
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.5671761631965637,
      "learning_rate": 0.00019535962877030162,
      "loss": 0.2683,
      "step": 200
    },
    {
      "epoch": 0.09,
      "eval_loss": 0.36712726950645447,
      "eval_na_accuracy": 0.623552143573761,
      "eval_ordinal_accuracy": 0.48916497826576233,
      "eval_ordinal_mae": 0.7306416630744934,
      "eval_runtime": 155.9343,
      "eval_samples_per_second": 25.517,
      "eval_steps_per_second": 3.194,
      "step": 200
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.2764167785644531,
      "learning_rate": 0.00019477958236658932,
      "loss": 0.2953,
      "step": 225
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.9076497554779053,
      "learning_rate": 0.00019419953596287704,
      "loss": 0.3382,
      "step": 250
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.2747127115726471,
      "learning_rate": 0.00019361948955916474,
      "loss": 0.2752,
      "step": 275
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.9448749423027039,
      "learning_rate": 0.00019303944315545243,
      "loss": 0.3314,
      "step": 300
    },
    {
      "epoch": 0.14,
      "eval_loss": 0.3450469672679901,
      "eval_na_accuracy": 0.6969112157821655,
      "eval_ordinal_accuracy": 0.4013291001319885,
      "eval_ordinal_mae": 0.8077224493026733,
      "eval_runtime": 156.2328,
      "eval_samples_per_second": 25.468,
      "eval_steps_per_second": 3.188,
      "step": 300
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.2589721083641052,
      "learning_rate": 0.00019245939675174015,
      "loss": 0.3486,
      "step": 325
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.44286003708839417,
      "learning_rate": 0.00019187935034802785,
      "loss": 0.3386,
      "step": 350
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.3215602934360504,
      "learning_rate": 0.00019129930394431554,
      "loss": 0.3056,
      "step": 375
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.9510051012039185,
      "learning_rate": 0.00019071925754060324,
      "loss": 0.2747,
      "step": 400
    },
    {
      "epoch": 0.19,
      "eval_loss": 0.28132036328315735,
      "eval_na_accuracy": 0.7895752787590027,
      "eval_ordinal_accuracy": 0.5423288345336914,
      "eval_ordinal_mae": 0.6105712056159973,
      "eval_runtime": 155.1965,
      "eval_samples_per_second": 25.638,
      "eval_steps_per_second": 3.209,
      "step": 400
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.5417093634605408,
      "learning_rate": 0.00019013921113689096,
      "loss": 0.2522,
      "step": 425
    },
    {
      "epoch": 0.21,
      "grad_norm": 1.405881643295288,
      "learning_rate": 0.00018955916473317868,
      "loss": 0.3589,
      "step": 450
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.8319898843765259,
      "learning_rate": 0.00018897911832946638,
      "loss": 0.2991,
      "step": 475
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.9455621242523193,
      "learning_rate": 0.00018839907192575407,
      "loss": 0.3247,
      "step": 500
    },
    {
      "epoch": 0.23,
      "eval_loss": 0.3143959045410156,
      "eval_na_accuracy": 0.7104247212409973,
      "eval_ordinal_accuracy": 0.4524703919887543,
      "eval_ordinal_mae": 0.7256373763084412,
      "eval_runtime": 157.1141,
      "eval_samples_per_second": 25.326,
      "eval_steps_per_second": 3.17,
      "step": 500
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.6339251399040222,
      "learning_rate": 0.00018781902552204177,
      "loss": 0.303,
      "step": 525
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.3713740408420563,
      "learning_rate": 0.0001872389791183295,
      "loss": 0.3035,
      "step": 550
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.7050974369049072,
      "learning_rate": 0.00018665893271461718,
      "loss": 0.2609,
      "step": 575
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.791477620601654,
      "learning_rate": 0.00018607888631090488,
      "loss": 0.3612,
      "step": 600
    },
    {
      "epoch": 0.28,
      "eval_loss": 0.3074879050254822,
      "eval_na_accuracy": 0.7586872577667236,
      "eval_ordinal_accuracy": 0.4984108507633209,
      "eval_ordinal_mae": 0.6415887475013733,
      "eval_runtime": 154.2538,
      "eval_samples_per_second": 25.795,
      "eval_steps_per_second": 3.228,
      "step": 600
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.39196524024009705,
      "learning_rate": 0.0001854988399071926,
      "loss": 0.31,
      "step": 625
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.0753191709518433,
      "learning_rate": 0.0001849187935034803,
      "loss": 0.2722,
      "step": 650
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.8922611474990845,
      "learning_rate": 0.000184338747099768,
      "loss": 0.3132,
      "step": 675
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.6866246461868286,
      "learning_rate": 0.0001837587006960557,
      "loss": 0.3031,
      "step": 700
    },
    {
      "epoch": 0.32,
      "eval_loss": 0.2784635126590729,
      "eval_na_accuracy": 0.7895752787590027,
      "eval_ordinal_accuracy": 0.5556197762489319,
      "eval_ordinal_mae": 0.5720168352127075,
      "eval_runtime": 154.421,
      "eval_samples_per_second": 25.767,
      "eval_steps_per_second": 3.225,
      "step": 700
    },
    {
      "epoch": 0.34,
      "grad_norm": 1.713051676750183,
      "learning_rate": 0.0001831786542923434,
      "loss": 0.337,
      "step": 725
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.0872548818588257,
      "learning_rate": 0.0001825986078886311,
      "loss": 0.2918,
      "step": 750
    },
    {
      "epoch": 0.36,
      "grad_norm": 1.5099256038665771,
      "learning_rate": 0.0001820185614849188,
      "loss": 0.2509,
      "step": 775
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.5774210691452026,
      "learning_rate": 0.0001814385150812065,
      "loss": 0.2866,
      "step": 800
    },
    {
      "epoch": 0.37,
      "eval_loss": 0.28780511021614075,
      "eval_na_accuracy": 0.7335907220840454,
      "eval_ordinal_accuracy": 0.5775787234306335,
      "eval_ordinal_mae": 0.5347856879234314,
      "eval_runtime": 154.6062,
      "eval_samples_per_second": 25.736,
      "eval_steps_per_second": 3.221,
      "step": 800
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.33059367537498474,
      "learning_rate": 0.00018085846867749422,
      "loss": 0.2626,
      "step": 825
    },
    {
      "epoch": 0.39,
      "grad_norm": 1.45087730884552,
      "learning_rate": 0.0001802784222737819,
      "loss": 0.3485,
      "step": 850
    },
    {
      "epoch": 0.41,
      "grad_norm": 1.195901870727539,
      "learning_rate": 0.0001796983758700696,
      "loss": 0.3007,
      "step": 875
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.26779890060424805,
      "learning_rate": 0.00017911832946635733,
      "loss": 0.2927,
      "step": 900
    },
    {
      "epoch": 0.42,
      "eval_loss": 0.2688673734664917,
      "eval_na_accuracy": 0.7972972989082336,
      "eval_ordinal_accuracy": 0.5573533773422241,
      "eval_ordinal_mae": 0.5855077505111694,
      "eval_runtime": 154.5178,
      "eval_samples_per_second": 25.751,
      "eval_steps_per_second": 3.223,
      "step": 900
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.5635965466499329,
      "learning_rate": 0.00017853828306264502,
      "loss": 0.269,
      "step": 925
    },
    {
      "epoch": 0.44,
      "grad_norm": 2.8135786056518555,
      "learning_rate": 0.00017795823665893272,
      "loss": 0.2677,
      "step": 950
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.49396631121635437,
      "learning_rate": 0.0001773781902552204,
      "loss": 0.3069,
      "step": 975
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.3267723321914673,
      "learning_rate": 0.00017679814385150814,
      "loss": 0.3003,
      "step": 1000
    },
    {
      "epoch": 0.46,
      "eval_loss": 0.26356959342956543,
      "eval_na_accuracy": 0.7915058135986328,
      "eval_ordinal_accuracy": 0.581045925617218,
      "eval_ordinal_mae": 0.5543876886367798,
      "eval_runtime": 157.946,
      "eval_samples_per_second": 25.192,
      "eval_steps_per_second": 3.153,
      "step": 1000
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.9938157200813293,
      "learning_rate": 0.00017621809744779583,
      "loss": 0.2521,
      "step": 1025
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.45715010166168213,
      "learning_rate": 0.00017563805104408353,
      "loss": 0.2926,
      "step": 1050
    },
    {
      "epoch": 0.5,
      "grad_norm": 2.9666409492492676,
      "learning_rate": 0.00017505800464037122,
      "loss": 0.2581,
      "step": 1075
    },
    {
      "epoch": 0.51,
      "grad_norm": 2.5301055908203125,
      "learning_rate": 0.00017447795823665894,
      "loss": 0.2522,
      "step": 1100
    },
    {
      "epoch": 0.51,
      "eval_loss": 0.3009192943572998,
      "eval_na_accuracy": 0.8571428656578064,
      "eval_ordinal_accuracy": 0.54435133934021,
      "eval_ordinal_mae": 0.5650931596755981,
      "eval_runtime": 159.1216,
      "eval_samples_per_second": 25.006,
      "eval_steps_per_second": 3.13,
      "step": 1100
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.8192782998085022,
      "learning_rate": 0.00017389791183294664,
      "loss": 0.3584,
      "step": 1125
    },
    {
      "epoch": 0.53,
      "grad_norm": 2.0657265186309814,
      "learning_rate": 0.00017331786542923433,
      "loss": 0.2547,
      "step": 1150
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.5887840390205383,
      "learning_rate": 0.00017273781902552203,
      "loss": 0.2335,
      "step": 1175
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.8169906735420227,
      "learning_rate": 0.00017215777262180975,
      "loss": 0.262,
      "step": 1200
    },
    {
      "epoch": 0.56,
      "eval_loss": 0.279022216796875,
      "eval_na_accuracy": 0.8301158547401428,
      "eval_ordinal_accuracy": 0.5801791548728943,
      "eval_ordinal_mae": 0.5203233361244202,
      "eval_runtime": 159.9167,
      "eval_samples_per_second": 24.882,
      "eval_steps_per_second": 3.114,
      "step": 1200
    },
    {
      "epoch": 0.57,
      "grad_norm": 2.5461835861206055,
      "learning_rate": 0.00017157772621809744,
      "loss": 0.2387,
      "step": 1225
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.7304142117500305,
      "learning_rate": 0.00017099767981438517,
      "loss": 0.2366,
      "step": 1250
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.3845186233520508,
      "learning_rate": 0.00017041763341067286,
      "loss": 0.2309,
      "step": 1275
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.5202885270118713,
      "learning_rate": 0.00016983758700696058,
      "loss": 0.2139,
      "step": 1300
    },
    {
      "epoch": 0.6,
      "eval_loss": 0.2653418481349945,
      "eval_na_accuracy": 0.7509652376174927,
      "eval_ordinal_accuracy": 0.5492632389068604,
      "eval_ordinal_mae": 0.562603771686554,
      "eval_runtime": 158.9921,
      "eval_samples_per_second": 25.026,
      "eval_steps_per_second": 3.132,
      "step": 1300
    },
    {
      "epoch": 0.61,
      "grad_norm": 1.6506483554840088,
      "learning_rate": 0.00016925754060324828,
      "loss": 0.3071,
      "step": 1325
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.5789369940757751,
      "learning_rate": 0.00016867749419953597,
      "loss": 0.2689,
      "step": 1350
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.5665389895439148,
      "learning_rate": 0.00016809744779582367,
      "loss": 0.2598,
      "step": 1375
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.6937847137451172,
      "learning_rate": 0.0001675174013921114,
      "loss": 0.2655,
      "step": 1400
    },
    {
      "epoch": 0.65,
      "eval_loss": 0.2760397493839264,
      "eval_na_accuracy": 0.7123551964759827,
      "eval_ordinal_accuracy": 0.5426177382469177,
      "eval_ordinal_mae": 0.6106911897659302,
      "eval_runtime": 160.1635,
      "eval_samples_per_second": 24.843,
      "eval_steps_per_second": 3.109,
      "step": 1400
    },
    {
      "epoch": 0.65,
      "step": 1400,
      "total_flos": 1.735882797809664e+18,
      "train_loss": 0.29669314997536794,
      "train_runtime": 4786.838,
      "train_samples_per_second": 28.807,
      "train_steps_per_second": 1.801
    }
  ],
  "logging_steps": 25,
  "max_steps": 8620,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 100,
  "total_flos": 1.735882797809664e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}