{
  "best_global_step": 20000,
  "best_metric": 0.14082255959510803,
  "best_model_checkpoint": "/media/user/Expansion1/deberta-v3-base-zyda-2-v2-text-quality-v3/checkpoint-20000",
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 100000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05,
      "grad_norm": 11.949178695678711,
      "learning_rate": 4.97505e-05,
      "loss": 0.3835,
      "num_input_tokens_seen": 512000,
      "step": 500
    },
    {
      "epoch": 0.1,
      "grad_norm": 3.623898506164551,
      "learning_rate": 4.95005e-05,
      "loss": 0.2484,
      "num_input_tokens_seen": 1024000,
      "step": 1000
    },
    {
      "epoch": 0.15,
      "grad_norm": 3.0655770301818848,
      "learning_rate": 4.9250500000000006e-05,
      "loss": 0.2332,
      "num_input_tokens_seen": 1536000,
      "step": 1500
    },
    {
      "epoch": 0.2,
      "grad_norm": 8.194499015808105,
      "learning_rate": 4.9000500000000006e-05,
      "loss": 0.2097,
      "num_input_tokens_seen": 2048000,
      "step": 2000
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.8507510423660278,
      "learning_rate": 4.875050000000001e-05,
      "loss": 0.1988,
      "num_input_tokens_seen": 2560000,
      "step": 2500
    },
    {
      "epoch": 0.3,
      "grad_norm": 2.8679802417755127,
      "learning_rate": 4.85005e-05,
      "loss": 0.1957,
      "num_input_tokens_seen": 3072000,
      "step": 3000
    },
    {
      "epoch": 0.35,
      "grad_norm": 3.2234883308410645,
      "learning_rate": 4.82505e-05,
      "loss": 0.1793,
      "num_input_tokens_seen": 3584000,
      "step": 3500
    },
    {
      "epoch": 0.4,
      "grad_norm": 4.360517978668213,
      "learning_rate": 4.80005e-05,
      "loss": 0.1816,
      "num_input_tokens_seen": 4096000,
      "step": 4000
    },
    {
      "epoch": 0.45,
      "grad_norm": 5.652502536773682,
      "learning_rate": 4.77505e-05,
      "loss": 0.1855,
      "num_input_tokens_seen": 4608000,
      "step": 4500
    },
    {
      "epoch": 0.5,
      "grad_norm": 3.757875919342041,
      "learning_rate": 4.7500500000000004e-05,
      "loss": 0.1751,
      "num_input_tokens_seen": 5120000,
      "step": 5000
    },
    {
      "epoch": 0.55,
      "grad_norm": 3.0092484951019287,
      "learning_rate": 4.7250500000000004e-05,
      "loss": 0.1785,
      "num_input_tokens_seen": 5632000,
      "step": 5500
    },
    {
      "epoch": 0.6,
      "grad_norm": 7.830347061157227,
      "learning_rate": 4.7000500000000005e-05,
      "loss": 0.1711,
      "num_input_tokens_seen": 6144000,
      "step": 6000
    },
    {
      "epoch": 0.65,
      "grad_norm": 2.926468849182129,
      "learning_rate": 4.6750500000000006e-05,
      "loss": 0.168,
      "num_input_tokens_seen": 6656000,
      "step": 6500
    },
    {
      "epoch": 0.7,
      "grad_norm": 3.43612003326416,
      "learning_rate": 4.65005e-05,
      "loss": 0.1772,
      "num_input_tokens_seen": 7168000,
      "step": 7000
    },
    {
      "epoch": 0.75,
      "grad_norm": 2.3997323513031006,
      "learning_rate": 4.62505e-05,
      "loss": 0.1632,
      "num_input_tokens_seen": 7680000,
      "step": 7500
    },
    {
      "epoch": 0.8,
      "grad_norm": 12.628423690795898,
      "learning_rate": 4.60005e-05,
      "loss": 0.1714,
      "num_input_tokens_seen": 8192000,
      "step": 8000
    },
    {
      "epoch": 0.85,
      "grad_norm": 1.8220003843307495,
      "learning_rate": 4.57505e-05,
      "loss": 0.1613,
      "num_input_tokens_seen": 8704000,
      "step": 8500
    },
    {
      "epoch": 0.9,
      "grad_norm": 2.2584903240203857,
      "learning_rate": 4.55005e-05,
      "loss": 0.1547,
      "num_input_tokens_seen": 9216000,
      "step": 9000
    },
    {
      "epoch": 0.95,
      "grad_norm": 1.5416566133499146,
      "learning_rate": 4.52505e-05,
      "loss": 0.1594,
      "num_input_tokens_seen": 9728000,
      "step": 9500
    },
    {
      "epoch": 1.0,
      "grad_norm": 2.472825288772583,
      "learning_rate": 4.5000500000000004e-05,
      "loss": 0.1635,
      "num_input_tokens_seen": 10239872,
      "step": 10000
    },
    {
      "epoch": 1.0,
      "eval_combined_score": 0.18538867612314003,
      "eval_loss": 0.18538866937160492,
      "eval_mse": 0.18538868287467514,
      "eval_runtime": 29.5714,
      "eval_samples_per_second": 676.329,
      "eval_steps_per_second": 84.541,
      "num_input_tokens_seen": 10239872,
      "step": 10000
    },
    {
      "epoch": 1.05,
      "grad_norm": 2.986963987350464,
      "learning_rate": 4.47505e-05,
      "loss": 0.1226,
      "num_input_tokens_seen": 10751872,
      "step": 10500
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.7588199973106384,
      "learning_rate": 4.45005e-05,
      "loss": 0.1172,
      "num_input_tokens_seen": 11263872,
      "step": 11000
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.8432678580284119,
      "learning_rate": 4.42505e-05,
      "loss": 0.1186,
      "num_input_tokens_seen": 11775872,
      "step": 11500
    },
    {
      "epoch": 1.2,
      "grad_norm": 12.563228607177734,
      "learning_rate": 4.40005e-05,
      "loss": 0.1139,
      "num_input_tokens_seen": 12287872,
      "step": 12000
    },
    {
      "epoch": 1.25,
      "grad_norm": 2.207587242126465,
      "learning_rate": 4.37505e-05,
      "loss": 0.121,
      "num_input_tokens_seen": 12799872,
      "step": 12500
    },
    {
      "epoch": 1.3,
      "grad_norm": 1.978637456893921,
      "learning_rate": 4.35005e-05,
      "loss": 0.1114,
      "num_input_tokens_seen": 13311872,
      "step": 13000
    },
    {
      "epoch": 1.35,
      "grad_norm": 5.6478729248046875,
      "learning_rate": 4.32505e-05,
      "loss": 0.1182,
      "num_input_tokens_seen": 13823872,
      "step": 13500
    },
    {
      "epoch": 1.4,
      "grad_norm": 3.0157413482666016,
      "learning_rate": 4.30005e-05,
      "loss": 0.1099,
      "num_input_tokens_seen": 14335872,
      "step": 14000
    },
    {
      "epoch": 1.45,
      "grad_norm": 2.2837512493133545,
      "learning_rate": 4.2750500000000003e-05,
      "loss": 0.1154,
      "num_input_tokens_seen": 14847872,
      "step": 14500
    },
    {
      "epoch": 1.5,
      "grad_norm": 2.124837875366211,
      "learning_rate": 4.2500500000000004e-05,
      "loss": 0.1163,
      "num_input_tokens_seen": 15359872,
      "step": 15000
    },
    {
      "epoch": 1.55,
      "grad_norm": 1.8782966136932373,
      "learning_rate": 4.2250500000000005e-05,
      "loss": 0.1167,
      "num_input_tokens_seen": 15871872,
      "step": 15500
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.085688591003418,
      "learning_rate": 4.2000500000000006e-05,
      "loss": 0.1156,
      "num_input_tokens_seen": 16383872,
      "step": 16000
    },
    {
      "epoch": 1.65,
      "grad_norm": 1.9874955415725708,
      "learning_rate": 4.1750500000000006e-05,
      "loss": 0.1183,
      "num_input_tokens_seen": 16895872,
      "step": 16500
    },
    {
      "epoch": 1.7,
      "grad_norm": 2.6902706623077393,
      "learning_rate": 4.15005e-05,
      "loss": 0.1112,
      "num_input_tokens_seen": 17407872,
      "step": 17000
    },
    {
      "epoch": 1.75,
      "grad_norm": 3.0735440254211426,
      "learning_rate": 4.12505e-05,
      "loss": 0.1159,
      "num_input_tokens_seen": 17919872,
      "step": 17500
    },
    {
      "epoch": 1.8,
      "grad_norm": 2.936267614364624,
      "learning_rate": 4.10005e-05,
      "loss": 0.1187,
      "num_input_tokens_seen": 18431872,
      "step": 18000
    },
    {
      "epoch": 1.85,
      "grad_norm": 3.598895311355591,
      "learning_rate": 4.07505e-05,
      "loss": 0.1147,
      "num_input_tokens_seen": 18943872,
      "step": 18500
    },
    {
      "epoch": 1.9,
      "grad_norm": 3.655381917953491,
      "learning_rate": 4.05005e-05,
      "loss": 0.1387,
      "num_input_tokens_seen": 19455872,
      "step": 19000
    },
    {
      "epoch": 1.95,
      "grad_norm": 9.855778694152832,
      "learning_rate": 4.0250500000000004e-05,
      "loss": 0.1238,
      "num_input_tokens_seen": 19967872,
      "step": 19500
    },
    {
      "epoch": 2.0,
      "grad_norm": 2.558746337890625,
      "learning_rate": 4.0000500000000004e-05,
      "loss": 0.1241,
      "num_input_tokens_seen": 20479744,
      "step": 20000
    },
    {
      "epoch": 2.0,
      "eval_combined_score": 0.14082256163832602,
      "eval_loss": 0.14082255959510803,
      "eval_mse": 0.14082256368154403,
      "eval_runtime": 30.1283,
      "eval_samples_per_second": 663.828,
      "eval_steps_per_second": 82.979,
      "num_input_tokens_seen": 20479744,
      "step": 20000
    },
    {
      "epoch": 2.05,
      "grad_norm": 1.3755764961242676,
      "learning_rate": 3.97505e-05,
      "loss": 0.0804,
      "num_input_tokens_seen": 20991744,
      "step": 20500
    },
    {
      "epoch": 2.1,
      "grad_norm": 3.242955207824707,
      "learning_rate": 3.95005e-05,
      "loss": 0.0795,
      "num_input_tokens_seen": 21503744,
      "step": 21000
    },
    {
      "epoch": 2.15,
      "grad_norm": 2.4045000076293945,
      "learning_rate": 3.92505e-05,
      "loss": 0.0814,
      "num_input_tokens_seen": 22015744,
      "step": 21500
    },
    {
      "epoch": 2.2,
      "grad_norm": 2.5508718490600586,
      "learning_rate": 3.90005e-05,
      "loss": 0.0848,
      "num_input_tokens_seen": 22527744,
      "step": 22000
    },
    {
      "epoch": 2.25,
      "grad_norm": 2.8529911041259766,
      "learning_rate": 3.87505e-05,
      "loss": 0.081,
      "num_input_tokens_seen": 23039744,
      "step": 22500
    },
    {
      "epoch": 2.3,
      "grad_norm": 10.657905578613281,
      "learning_rate": 3.85005e-05,
      "loss": 0.0786,
      "num_input_tokens_seen": 23551744,
      "step": 23000
    },
    {
      "epoch": 2.35,
      "grad_norm": 2.378411293029785,
      "learning_rate": 3.82505e-05,
      "loss": 0.0823,
      "num_input_tokens_seen": 24063744,
      "step": 23500
    },
    {
      "epoch": 2.4,
      "grad_norm": 2.6125261783599854,
      "learning_rate": 3.80005e-05,
      "loss": 0.0787,
      "num_input_tokens_seen": 24575744,
      "step": 24000
    },
    {
      "epoch": 2.45,
      "grad_norm": 1.3133174180984497,
      "learning_rate": 3.77505e-05,
      "loss": 0.0761,
      "num_input_tokens_seen": 25087744,
      "step": 24500
    },
    {
      "epoch": 2.5,
      "grad_norm": 3.3419981002807617,
      "learning_rate": 3.75005e-05,
      "loss": 0.0775,
      "num_input_tokens_seen": 25599744,
      "step": 25000
    },
    {
      "epoch": 2.55,
      "grad_norm": 2.1734654903411865,
      "learning_rate": 3.72505e-05,
      "loss": 0.0846,
      "num_input_tokens_seen": 26111744,
      "step": 25500
    },
    {
      "epoch": 2.6,
      "grad_norm": 3.2352869510650635,
      "learning_rate": 3.70005e-05,
      "loss": 0.0817,
      "num_input_tokens_seen": 26623744,
      "step": 26000
    },
    {
      "epoch": 2.65,
      "grad_norm": 3.37646746635437,
      "learning_rate": 3.675050000000001e-05,
      "loss": 0.0816,
      "num_input_tokens_seen": 27135744,
      "step": 26500
    },
    {
      "epoch": 2.7,
      "grad_norm": 2.5875842571258545,
      "learning_rate": 3.650050000000001e-05,
      "loss": 0.0843,
      "num_input_tokens_seen": 27647744,
      "step": 27000
    },
    {
      "epoch": 2.75,
      "grad_norm": 7.768916606903076,
      "learning_rate": 3.62505e-05,
      "loss": 0.089,
      "num_input_tokens_seen": 28159744,
      "step": 27500
    },
    {
      "epoch": 2.8,
      "grad_norm": 2.6333940029144287,
      "learning_rate": 3.60005e-05,
      "loss": 0.1209,
      "num_input_tokens_seen": 28671744,
      "step": 28000
    },
    {
      "epoch": 2.85,
      "grad_norm": 3.4022088050842285,
      "learning_rate": 3.57505e-05,
      "loss": 0.082,
      "num_input_tokens_seen": 29183744,
      "step": 28500
    },
    {
      "epoch": 2.9,
      "grad_norm": 1.5310307741165161,
      "learning_rate": 3.5500500000000003e-05,
      "loss": 0.0813,
      "num_input_tokens_seen": 29695744,
      "step": 29000
    },
    {
      "epoch": 2.95,
      "grad_norm": 3.3515617847442627,
      "learning_rate": 3.5250500000000004e-05,
      "loss": 0.0856,
      "num_input_tokens_seen": 30207744,
      "step": 29500
    },
    {
      "epoch": 3.0,
      "grad_norm": 1.5893547534942627,
      "learning_rate": 3.5000500000000005e-05,
      "loss": 0.0882,
      "num_input_tokens_seen": 30719616,
      "step": 30000
    },
    {
      "epoch": 3.0,
      "eval_combined_score": 0.1746896443902683,
      "eval_loss": 0.1746896207332611,
      "eval_mse": 0.17468963824495307,
      "eval_runtime": 29.4701,
      "eval_samples_per_second": 678.654,
      "eval_steps_per_second": 84.832,
      "num_input_tokens_seen": 30719616,
      "step": 30000
    },
    {
      "epoch": 3.05,
      "grad_norm": 1.333294153213501,
      "learning_rate": 3.4750500000000006e-05,
      "loss": 0.0562,
      "num_input_tokens_seen": 31231616,
      "step": 30500
    },
    {
      "epoch": 3.1,
      "grad_norm": 0.8254738450050354,
      "learning_rate": 3.45005e-05,
      "loss": 0.053,
      "num_input_tokens_seen": 31743616,
      "step": 31000
    },
    {
      "epoch": 3.15,
      "grad_norm": 1.7611359357833862,
      "learning_rate": 3.42505e-05,
      "loss": 0.0533,
      "num_input_tokens_seen": 32255616,
      "step": 31500
    },
    {
      "epoch": 3.2,
      "grad_norm": 1.1055493354797363,
      "learning_rate": 3.40005e-05,
      "loss": 0.0557,
      "num_input_tokens_seen": 32767616,
      "step": 32000
    },
    {
      "epoch": 3.25,
      "grad_norm": 1.6912920475006104,
      "learning_rate": 3.37505e-05,
      "loss": 0.0557,
      "num_input_tokens_seen": 33279616,
      "step": 32500
    },
    {
      "epoch": 3.3,
      "grad_norm": 2.5604867935180664,
      "learning_rate": 3.35005e-05,
      "loss": 0.0619,
      "num_input_tokens_seen": 33791616,
      "step": 33000
    },
    {
      "epoch": 3.35,
      "grad_norm": 1.7852438688278198,
      "learning_rate": 3.32505e-05,
      "loss": 0.0564,
      "num_input_tokens_seen": 34303616,
      "step": 33500
    },
    {
      "epoch": 3.4,
      "grad_norm": 2.84659481048584,
      "learning_rate": 3.3000500000000004e-05,
      "loss": 0.0563,
      "num_input_tokens_seen": 34815616,
      "step": 34000
    },
    {
      "epoch": 3.45,
      "grad_norm": 2.0315301418304443,
      "learning_rate": 3.27505e-05,
      "loss": 0.054,
      "num_input_tokens_seen": 35327616,
      "step": 34500
    },
    {
      "epoch": 3.5,
      "grad_norm": 1.9043070077896118,
      "learning_rate": 3.25005e-05,
      "loss": 0.0583,
      "num_input_tokens_seen": 35839616,
      "step": 35000
    },
    {
      "epoch": 3.55,
      "grad_norm": 1.7389405965805054,
      "learning_rate": 3.22505e-05,
      "loss": 0.0544,
      "num_input_tokens_seen": 36351616,
      "step": 35500
    },
    {
      "epoch": 3.6,
      "grad_norm": 0.8132746815681458,
      "learning_rate": 3.20005e-05,
      "loss": 0.055,
      "num_input_tokens_seen": 36863616,
      "step": 36000
    },
    {
      "epoch": 3.65,
      "grad_norm": 1.8100671768188477,
      "learning_rate": 3.17505e-05,
      "loss": 0.0558,
      "num_input_tokens_seen": 37375616,
      "step": 36500
    },
    {
      "epoch": 3.7,
      "grad_norm": 10.433902740478516,
      "learning_rate": 3.15005e-05,
      "loss": 0.0568,
      "num_input_tokens_seen": 37887616,
      "step": 37000
    },
    {
      "epoch": 3.75,
      "grad_norm": 0.7512624263763428,
      "learning_rate": 3.12505e-05,
      "loss": 0.0541,
      "num_input_tokens_seen": 38399616,
      "step": 37500
    },
    {
      "epoch": 3.8,
      "grad_norm": 1.3957535028457642,
      "learning_rate": 3.1000499999999996e-05,
      "loss": 0.0546,
      "num_input_tokens_seen": 38911616,
      "step": 38000
    },
    {
      "epoch": 3.85,
      "grad_norm": 1.069032073020935,
      "learning_rate": 3.0750499999999996e-05,
      "loss": 0.0547,
      "num_input_tokens_seen": 39423616,
      "step": 38500
    },
    {
      "epoch": 3.9,
      "grad_norm": 3.4046223163604736,
      "learning_rate": 3.0500500000000004e-05,
      "loss": 0.0567,
      "num_input_tokens_seen": 39935616,
      "step": 39000
    },
    {
      "epoch": 3.95,
      "grad_norm": 1.5711253881454468,
      "learning_rate": 3.0250500000000005e-05,
      "loss": 0.0571,
      "num_input_tokens_seen": 40447616,
      "step": 39500
    },
    {
      "epoch": 4.0,
      "grad_norm": 2.8568646907806396,
      "learning_rate": 3.0000500000000005e-05,
      "loss": 0.054,
      "num_input_tokens_seen": 40959488,
      "step": 40000
    },
    {
      "epoch": 4.0,
      "eval_combined_score": 0.1528494923779644,
      "eval_loss": 0.152849480509758,
      "eval_mse": 0.15284948934500966,
      "eval_runtime": 29.495,
      "eval_samples_per_second": 678.08,
      "eval_steps_per_second": 84.76,
      "num_input_tokens_seen": 40959488,
      "step": 40000
    },
    {
      "epoch": 4.05,
      "grad_norm": 1.1214642524719238,
      "learning_rate": 2.9750500000000003e-05,
      "loss": 0.0365,
      "num_input_tokens_seen": 41471488,
      "step": 40500
    },
    {
      "epoch": 4.1,
      "grad_norm": 2.6408936977386475,
      "learning_rate": 2.9500500000000003e-05,
      "loss": 0.0361,
      "num_input_tokens_seen": 41983488,
      "step": 41000
    },
    {
      "epoch": 4.15,
      "grad_norm": 1.0093015432357788,
      "learning_rate": 2.9250500000000004e-05,
      "loss": 0.0361,
      "num_input_tokens_seen": 42495488,
      "step": 41500
    },
    {
      "epoch": 4.2,
      "grad_norm": 2.0412521362304688,
      "learning_rate": 2.90005e-05,
      "loss": 0.0377,
      "num_input_tokens_seen": 43007488,
      "step": 42000
    },
    {
      "epoch": 4.25,
      "grad_norm": 2.0059244632720947,
      "learning_rate": 2.8750500000000002e-05,
      "loss": 0.0354,
      "num_input_tokens_seen": 43519488,
      "step": 42500
    },
    {
      "epoch": 4.3,
      "grad_norm": 3.214423179626465,
      "learning_rate": 2.8500500000000003e-05,
      "loss": 0.0373,
      "num_input_tokens_seen": 44031488,
      "step": 43000
    },
    {
      "epoch": 4.35,
      "grad_norm": 2.101541519165039,
      "learning_rate": 2.8250500000000003e-05,
      "loss": 0.0381,
      "num_input_tokens_seen": 44543488,
      "step": 43500
    },
    {
      "epoch": 4.4,
      "grad_norm": 0.8797721862792969,
      "learning_rate": 2.80005e-05,
      "loss": 0.0381,
      "num_input_tokens_seen": 45055488,
      "step": 44000
    },
    {
      "epoch": 4.45,
      "grad_norm": 2.0589728355407715,
      "learning_rate": 2.77505e-05,
      "loss": 0.036,
      "num_input_tokens_seen": 45567488,
      "step": 44500
    },
    {
      "epoch": 4.5,
      "grad_norm": 2.5758140087127686,
      "learning_rate": 2.7500500000000002e-05,
      "loss": 0.0372,
      "num_input_tokens_seen": 46079488,
      "step": 45000
    },
    {
      "epoch": 4.55,
      "grad_norm": 1.531252145767212,
      "learning_rate": 2.72505e-05,
      "loss": 0.0381,
      "num_input_tokens_seen": 46591488,
      "step": 45500
    },
    {
      "epoch": 4.6,
      "grad_norm": 1.053691029548645,
      "learning_rate": 2.70005e-05,
      "loss": 0.0396,
      "num_input_tokens_seen": 47103488,
      "step": 46000
    },
    {
      "epoch": 4.65,
      "grad_norm": 1.031100869178772,
      "learning_rate": 2.67505e-05,
      "loss": 0.0376,
      "num_input_tokens_seen": 47615488,
      "step": 46500
    },
    {
      "epoch": 4.7,
      "grad_norm": 0.8592771887779236,
      "learning_rate": 2.65005e-05,
      "loss": 0.0381,
      "num_input_tokens_seen": 48127488,
      "step": 47000
    },
    {
      "epoch": 4.75,
      "grad_norm": 3.529454231262207,
      "learning_rate": 2.62505e-05,
      "loss": 0.0406,
      "num_input_tokens_seen": 48639488,
      "step": 47500
    },
    {
      "epoch": 4.8,
      "grad_norm": 1.2595094442367554,
      "learning_rate": 2.60005e-05,
      "loss": 0.044,
      "num_input_tokens_seen": 49151488,
      "step": 48000
    },
    {
      "epoch": 4.85,
      "grad_norm": 1.0460163354873657,
      "learning_rate": 2.57505e-05,
      "loss": 0.0411,
      "num_input_tokens_seen": 49663488,
      "step": 48500
    },
    {
      "epoch": 4.9,
      "grad_norm": 0.7415432333946228,
      "learning_rate": 2.55005e-05,
      "loss": 0.0376,
      "num_input_tokens_seen": 50175488,
      "step": 49000
    },
    {
      "epoch": 4.95,
      "grad_norm": 0.9863350987434387,
      "learning_rate": 2.5250499999999998e-05,
      "loss": 0.039,
      "num_input_tokens_seen": 50687488,
      "step": 49500
    },
    {
      "epoch": 5.0,
      "grad_norm": 2.2840659618377686,
      "learning_rate": 2.50005e-05,
      "loss": 0.0372,
      "num_input_tokens_seen": 51199360,
      "step": 50000
    },
    {
      "epoch": 5.0,
      "eval_combined_score": 0.14802570643204935,
      "eval_loss": 0.14802570641040802,
      "eval_mse": 0.14802570645369068,
      "eval_runtime": 29.5199,
      "eval_samples_per_second": 677.508,
      "eval_steps_per_second": 84.689,
      "num_input_tokens_seen": 51199360,
      "step": 50000
    },
    {
      "epoch": 5.05,
      "grad_norm": 1.0202912092208862,
      "learning_rate": 2.4750500000000003e-05,
      "loss": 0.0253,
      "num_input_tokens_seen": 51711360,
      "step": 50500
    },
    {
      "epoch": 5.1,
      "grad_norm": 1.1298741102218628,
      "learning_rate": 2.45005e-05,
      "loss": 0.0269,
      "num_input_tokens_seen": 52223360,
      "step": 51000
    },
    {
      "epoch": 5.15,
      "grad_norm": 1.2378206253051758,
      "learning_rate": 2.42505e-05,
      "loss": 0.0258,
      "num_input_tokens_seen": 52735360,
      "step": 51500
    },
    {
      "epoch": 5.2,
      "grad_norm": 1.6293431520462036,
      "learning_rate": 2.4000500000000002e-05,
      "loss": 0.0272,
      "num_input_tokens_seen": 53247360,
      "step": 52000
    },
    {
      "epoch": 5.25,
      "grad_norm": 3.9734299182891846,
      "learning_rate": 2.37505e-05,
      "loss": 0.0272,
      "num_input_tokens_seen": 53759360,
      "step": 52500
    },
    {
      "epoch": 5.3,
      "grad_norm": 0.6598159074783325,
      "learning_rate": 2.35005e-05,
      "loss": 0.0262,
      "num_input_tokens_seen": 54271360,
      "step": 53000
    },
    {
      "epoch": 5.35,
      "grad_norm": 0.6012576818466187,
      "learning_rate": 2.32505e-05,
      "loss": 0.027,
      "num_input_tokens_seen": 54783360,
      "step": 53500
    },
    {
      "epoch": 5.4,
      "grad_norm": 2.462887763977051,
      "learning_rate": 2.30005e-05,
      "loss": 0.0268,
      "num_input_tokens_seen": 55295360,
      "step": 54000
    },
    {
      "epoch": 5.45,
      "grad_norm": 2.0268304347991943,
      "learning_rate": 2.2750500000000002e-05,
      "loss": 0.0263,
      "num_input_tokens_seen": 55807360,
      "step": 54500
    },
    {
      "epoch": 5.5,
      "grad_norm": 0.8000567555427551,
      "learning_rate": 2.2500500000000003e-05,
      "loss": 0.0282,
      "num_input_tokens_seen": 56319360,
      "step": 55000
    },
    {
      "epoch": 5.55,
      "grad_norm": 1.5781893730163574,
      "learning_rate": 2.2250500000000003e-05,
      "loss": 0.0265,
      "num_input_tokens_seen": 56831360,
      "step": 55500
    },
    {
      "epoch": 5.6,
      "grad_norm": 1.2630614042282104,
      "learning_rate": 2.20005e-05,
      "loss": 0.0257,
      "num_input_tokens_seen": 57343360,
      "step": 56000
    },
    {
      "epoch": 5.65,
      "grad_norm": 1.3778091669082642,
      "learning_rate": 2.17505e-05,
      "loss": 0.0271,
      "num_input_tokens_seen": 57855360,
      "step": 56500
    },
    {
      "epoch": 5.7,
      "grad_norm": 1.0909324884414673,
      "learning_rate": 2.1500500000000002e-05,
      "loss": 0.026,
      "num_input_tokens_seen": 58367360,
      "step": 57000
    },
    {
      "epoch": 5.75,
      "grad_norm": 3.5209500789642334,
      "learning_rate": 2.1250500000000003e-05,
      "loss": 0.0264,
      "num_input_tokens_seen": 58879360,
      "step": 57500
    },
    {
      "epoch": 5.8,
      "grad_norm": 1.4671865701675415,
      "learning_rate": 2.10005e-05,
      "loss": 0.0265,
      "num_input_tokens_seen": 59391360,
      "step": 58000
    },
    {
      "epoch": 5.85,
      "grad_norm": 1.6409125328063965,
      "learning_rate": 2.07505e-05,
      "loss": 0.0268,
      "num_input_tokens_seen": 59903360,
      "step": 58500
    },
    {
      "epoch": 5.9,
      "grad_norm": 1.4418998956680298,
      "learning_rate": 2.05005e-05,
      "loss": 0.0262,
      "num_input_tokens_seen": 60415360,
      "step": 59000
    },
    {
      "epoch": 5.95,
      "grad_norm": 1.3441293239593506,
      "learning_rate": 2.02505e-05,
      "loss": 0.0257,
      "num_input_tokens_seen": 60927360,
      "step": 59500
    },
    {
      "epoch": 6.0,
      "grad_norm": 0.8778462409973145,
      "learning_rate": 2.00005e-05,
      "loss": 0.0263,
      "num_input_tokens_seen": 61439232,
      "step": 60000
    },
    {
      "epoch": 6.0,
      "eval_combined_score": 0.15236617343673117,
      "eval_loss": 0.15236616134643555,
      "eval_mse": 0.1523661706258656,
      "eval_runtime": 29.4149,
      "eval_samples_per_second": 679.926,
      "eval_steps_per_second": 84.991,
      "num_input_tokens_seen": 61439232,
      "step": 60000
    },
    {
      "epoch": 6.05,
      "grad_norm": 1.4335697889328003,
      "learning_rate": 1.97505e-05,
      "loss": 0.0181,
      "num_input_tokens_seen": 61951232,
      "step": 60500
    },
    {
      "epoch": 6.1,
      "grad_norm": 1.1381551027297974,
      "learning_rate": 1.95005e-05,
      "loss": 0.0197,
      "num_input_tokens_seen": 62463232,
      "step": 61000
    },
    {
      "epoch": 6.15,
      "grad_norm": 0.7046132683753967,
      "learning_rate": 1.92505e-05,
      "loss": 0.0186,
      "num_input_tokens_seen": 62975232,
      "step": 61500
    },
    {
      "epoch": 6.2,
      "grad_norm": 1.008306860923767,
      "learning_rate": 1.9000500000000002e-05,
      "loss": 0.0192,
      "num_input_tokens_seen": 63487232,
      "step": 62000
    },
    {
      "epoch": 6.25,
      "grad_norm": 2.0765221118927,
      "learning_rate": 1.8750500000000003e-05,
      "loss": 0.0185,
      "num_input_tokens_seen": 63999232,
      "step": 62500
    },
    {
      "epoch": 6.3,
      "grad_norm": 1.2361551523208618,
      "learning_rate": 1.85005e-05,
      "loss": 0.0179,
      "num_input_tokens_seen": 64511232,
      "step": 63000
    },
    {
      "epoch": 6.35,
      "grad_norm": 0.7231354117393494,
      "learning_rate": 1.82505e-05,
      "loss": 0.0194,
      "num_input_tokens_seen": 65023232,
      "step": 63500
    },
    {
      "epoch": 6.4,
      "grad_norm": 0.779230535030365,
      "learning_rate": 1.80005e-05,
      "loss": 0.0198,
      "num_input_tokens_seen": 65535232,
      "step": 64000
    },
    {
      "epoch": 6.45,
      "grad_norm": 0.7320069074630737,
      "learning_rate": 1.77505e-05,
      "loss": 0.0187,
      "num_input_tokens_seen": 66047232,
      "step": 64500
    },
    {
      "epoch": 6.5,
      "grad_norm": 0.8597579598426819,
      "learning_rate": 1.75005e-05,
      "loss": 0.0191,
      "num_input_tokens_seen": 66559232,
      "step": 65000
    },
    {
      "epoch": 6.55,
      "grad_norm": 1.4109529256820679,
      "learning_rate": 1.72505e-05,
      "loss": 0.0192,
      "num_input_tokens_seen": 67071232,
      "step": 65500
    },
    {
      "epoch": 6.6,
      "grad_norm": 1.4900848865509033,
      "learning_rate": 1.70005e-05,
      "loss": 0.0173,
      "num_input_tokens_seen": 67583232,
      "step": 66000
    },
    {
      "epoch": 6.65,
      "grad_norm": 1.3828743696212769,
      "learning_rate": 1.6750499999999998e-05,
      "loss": 0.0176,
      "num_input_tokens_seen": 68095232,
      "step": 66500
    },
    {
      "epoch": 6.7,
      "grad_norm": 0.6733376383781433,
      "learning_rate": 1.6500500000000002e-05,
      "loss": 0.019,
      "num_input_tokens_seen": 68607232,
      "step": 67000
    },
    {
      "epoch": 6.75,
      "grad_norm": 0.4570697546005249,
      "learning_rate": 1.6250500000000003e-05,
      "loss": 0.0181,
      "num_input_tokens_seen": 69119232,
      "step": 67500
    },
    {
      "epoch": 6.8,
      "grad_norm": 0.9463149309158325,
      "learning_rate": 1.60005e-05,
      "loss": 0.0174,
      "num_input_tokens_seen": 69631232,
      "step": 68000
    },
    {
      "epoch": 6.85,
      "grad_norm": 0.9304377436637878,
      "learning_rate": 1.57505e-05,
      "loss": 0.0185,
      "num_input_tokens_seen": 70143232,
      "step": 68500
    },
    {
      "epoch": 6.9,
      "grad_norm": 0.8526313304901123,
      "learning_rate": 1.5500500000000002e-05,
      "loss": 0.0185,
      "num_input_tokens_seen": 70655232,
      "step": 69000
    },
    {
      "epoch": 6.95,
      "grad_norm": 1.6793274879455566,
      "learning_rate": 1.52505e-05,
      "loss": 0.0194,
      "num_input_tokens_seen": 71167232,
      "step": 69500
    },
    {
      "epoch": 7.0,
      "grad_norm": 1.2873644828796387,
      "learning_rate": 1.5000500000000001e-05,
      "loss": 0.0203,
      "num_input_tokens_seen": 71679104,
      "step": 70000
    },
    {
      "epoch": 7.0,
      "eval_combined_score": 0.1494929350818927,
      "eval_loss": 0.14949294924736023,
      "eval_mse": 0.14949293581758635,
      "eval_runtime": 29.5374,
      "eval_samples_per_second": 677.107,
      "eval_steps_per_second": 84.638,
      "num_input_tokens_seen": 71679104,
      "step": 70000
    },
    {
      "epoch": 7.05,
      "grad_norm": 0.8493014574050903,
      "learning_rate": 1.47505e-05,
      "loss": 0.014,
      "num_input_tokens_seen": 72191104,
      "step": 70500
    },
    {
      "epoch": 7.1,
      "grad_norm": 0.6162556409835815,
      "learning_rate": 1.45005e-05,
      "loss": 0.0145,
      "num_input_tokens_seen": 72703104,
      "step": 71000
    },
    {
      "epoch": 7.15,
      "grad_norm": 0.6198768019676208,
      "learning_rate": 1.42505e-05,
      "loss": 0.0135,
      "num_input_tokens_seen": 73215104,
      "step": 71500
    },
    {
      "epoch": 7.2,
      "grad_norm": 0.6122292876243591,
      "learning_rate": 1.40005e-05,
      "loss": 0.0138,
      "num_input_tokens_seen": 73727104,
      "step": 72000
    },
    {
      "epoch": 7.25,
      "grad_norm": 0.8132468461990356,
      "learning_rate": 1.37505e-05,
      "loss": 0.0136,
      "num_input_tokens_seen": 74239104,
      "step": 72500
    },
    {
      "epoch": 7.3,
      "grad_norm": 0.791746973991394,
      "learning_rate": 1.3500499999999999e-05,
      "loss": 0.0136,
      "num_input_tokens_seen": 74751104,
      "step": 73000
    },
    {
      "epoch": 7.35,
      "grad_norm": 1.6126739978790283,
      "learning_rate": 1.3250500000000001e-05,
      "loss": 0.0139,
      "num_input_tokens_seen": 75263104,
      "step": 73500
    },
    {
      "epoch": 7.4,
      "grad_norm": 1.348046898841858,
      "learning_rate": 1.3000500000000002e-05,
      "loss": 0.0149,
      "num_input_tokens_seen": 75775104,
      "step": 74000
    },
    {
      "epoch": 7.45,
      "grad_norm": 1.5154032707214355,
      "learning_rate": 1.2750500000000001e-05,
      "loss": 0.0133,
      "num_input_tokens_seen": 76287104,
      "step": 74500
    },
    {
      "epoch": 7.5,
      "grad_norm": 1.3086836338043213,
      "learning_rate": 1.2500500000000002e-05,
      "loss": 0.0134,
      "num_input_tokens_seen": 76799104,
      "step": 75000
    },
    {
      "epoch": 7.55,
      "grad_norm": 1.3077424764633179,
      "learning_rate": 1.22505e-05,
      "loss": 0.0134,
      "num_input_tokens_seen": 77311104,
      "step": 75500
    },
    {
      "epoch": 7.6,
      "grad_norm": 1.377185344696045,
      "learning_rate": 1.2000500000000001e-05,
      "loss": 0.0128,
      "num_input_tokens_seen": 77823104,
      "step": 76000
    },
    {
      "epoch": 7.65,
      "grad_norm": 1.2250688076019287,
      "learning_rate": 1.17505e-05,
      "loss": 0.0146,
      "num_input_tokens_seen": 78335104,
      "step": 76500
    },
    {
      "epoch": 7.7,
      "grad_norm": 0.8044687509536743,
      "learning_rate": 1.15005e-05,
      "loss": 0.0132,
      "num_input_tokens_seen": 78847104,
      "step": 77000
    },
    {
      "epoch": 7.75,
      "grad_norm": 0.8126741647720337,
      "learning_rate": 1.12505e-05,
      "loss": 0.0134,
      "num_input_tokens_seen": 79359104,
      "step": 77500
    },
    {
      "epoch": 7.8,
      "grad_norm": 0.6075248122215271,
      "learning_rate": 1.10005e-05,
      "loss": 0.0131,
      "num_input_tokens_seen": 79871104,
      "step": 78000
    },
    {
      "epoch": 7.85,
      "grad_norm": 1.874189853668213,
      "learning_rate": 1.0750500000000002e-05,
      "loss": 0.0134,
      "num_input_tokens_seen": 80383104,
      "step": 78500
    },
    {
      "epoch": 7.9,
      "grad_norm": 0.5488854646682739,
| "learning_rate": 1.05005e-05, | |
| "loss": 0.0137, | |
| "num_input_tokens_seen": 80895104, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 7.95, | |
| "grad_norm": 1.5739060640335083, | |
| "learning_rate": 1.0250500000000001e-05, | |
| "loss": 0.0131, | |
| "num_input_tokens_seen": 81407104, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 1.897755742073059, | |
| "learning_rate": 1.00005e-05, | |
| "loss": 0.0135, | |
| "num_input_tokens_seen": 81918976, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_combined_score": 0.1482119562218898, | |
| "eval_loss": 0.14821195602416992, | |
| "eval_mse": 0.14821195641960966, | |
| "eval_runtime": 29.5069, | |
| "eval_samples_per_second": 677.807, | |
| "eval_steps_per_second": 84.726, | |
| "num_input_tokens_seen": 81918976, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 8.05, | |
| "grad_norm": 0.39859962463378906, | |
| "learning_rate": 9.7505e-06, | |
| "loss": 0.0107, | |
| "num_input_tokens_seen": 82430976, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 8.1, | |
| "grad_norm": 1.8892147541046143, | |
| "learning_rate": 9.500500000000002e-06, | |
| "loss": 0.01, | |
| "num_input_tokens_seen": 82942976, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 8.15, | |
| "grad_norm": 0.7789964079856873, | |
| "learning_rate": 9.2505e-06, | |
| "loss": 0.0111, | |
| "num_input_tokens_seen": 83454976, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 8.2, | |
| "grad_norm": 0.6423227787017822, | |
| "learning_rate": 9.000500000000001e-06, | |
| "loss": 0.011, | |
| "num_input_tokens_seen": 83966976, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 8.25, | |
| "grad_norm": 0.6862022876739502, | |
| "learning_rate": 8.7505e-06, | |
| "loss": 0.0105, | |
| "num_input_tokens_seen": 84478976, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 8.3, | |
| "grad_norm": 0.6521459817886353, | |
| "learning_rate": 8.5005e-06, | |
| "loss": 0.011, | |
| "num_input_tokens_seen": 84990976, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 8.35, | |
| "grad_norm": 1.0782101154327393, | |
| "learning_rate": 8.2505e-06, | |
| "loss": 0.01, | |
| "num_input_tokens_seen": 85502976, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 8.4, | |
| "grad_norm": 0.32573211193084717, | |
| "learning_rate": 8.0005e-06, | |
| "loss": 0.0102, | |
| "num_input_tokens_seen": 86014976, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 8.45, | |
| "grad_norm": 0.4790741205215454, | |
| "learning_rate": 7.750500000000001e-06, | |
| "loss": 0.0097, | |
| "num_input_tokens_seen": 86526976, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 8.5, | |
| "grad_norm": 5.938267230987549, | |
| "learning_rate": 7.5005000000000004e-06, | |
| "loss": 0.0099, | |
| "num_input_tokens_seen": 87038976, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 8.55, | |
| "grad_norm": 0.3625955283641815, | |
| "learning_rate": 7.2505e-06, | |
| "loss": 0.0101, | |
| "num_input_tokens_seen": 87550976, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 8.6, | |
| "grad_norm": 1.664149522781372, | |
| "learning_rate": 7.0005e-06, | |
| "loss": 0.0103, | |
| "num_input_tokens_seen": 88062976, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 8.65, | |
| "grad_norm": 0.35580164194107056, | |
| "learning_rate": 6.7505e-06, | |
| "loss": 0.0097, | |
| "num_input_tokens_seen": 88574976, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 8.7, | |
| "grad_norm": 0.814786434173584, | |
| "learning_rate": 6.5005e-06, | |
| "loss": 0.0099, | |
| "num_input_tokens_seen": 89086976, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 8.75, | |
| "grad_norm": 0.479640930891037, | |
| "learning_rate": 6.2505000000000005e-06, | |
| "loss": 0.0101, | |
| "num_input_tokens_seen": 89598976, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 8.8, | |
| "grad_norm": 0.4606671929359436, | |
| "learning_rate": 6.0005e-06, | |
| "loss": 0.0094, | |
| "num_input_tokens_seen": 90110976, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 8.85, | |
| "grad_norm": 2.0643467903137207, | |
| "learning_rate": 5.7505e-06, | |
| "loss": 0.0099, | |
| "num_input_tokens_seen": 90622976, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 8.9, | |
| "grad_norm": 0.6785427331924438, | |
| "learning_rate": 5.5005e-06, | |
| "loss": 0.0103, | |
| "num_input_tokens_seen": 91134976, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 8.95, | |
| "grad_norm": 0.6333959102630615, | |
| "learning_rate": 5.250500000000001e-06, | |
| "loss": 0.01, | |
| "num_input_tokens_seen": 91646976, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "grad_norm": 0.8463544249534607, | |
| "learning_rate": 5.000500000000001e-06, | |
| "loss": 0.0098, | |
| "num_input_tokens_seen": 92158848, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_combined_score": 0.14495953552467267, | |
| "eval_loss": 0.14495953917503357, | |
| "eval_mse": 0.1449595318743118, | |
| "eval_runtime": 29.5073, | |
| "eval_samples_per_second": 677.799, | |
| "eval_steps_per_second": 84.725, | |
| "num_input_tokens_seen": 92158848, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 9.05, | |
| "grad_norm": 0.39637425541877747, | |
| "learning_rate": 4.7505000000000005e-06, | |
| "loss": 0.0082, | |
| "num_input_tokens_seen": 92670848, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 9.1, | |
| "grad_norm": 0.7424957752227783, | |
| "learning_rate": 4.5005e-06, | |
| "loss": 0.0085, | |
| "num_input_tokens_seen": 93182848, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 9.15, | |
| "grad_norm": 0.8151483535766602, | |
| "learning_rate": 4.2505e-06, | |
| "loss": 0.008, | |
| "num_input_tokens_seen": 93694848, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 9.2, | |
| "grad_norm": 1.604078769683838, | |
| "learning_rate": 4.0005e-06, | |
| "loss": 0.0086, | |
| "num_input_tokens_seen": 94206848, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 9.25, | |
| "grad_norm": 0.42909368872642517, | |
| "learning_rate": 3.7505e-06, | |
| "loss": 0.0084, | |
| "num_input_tokens_seen": 94718848, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 9.3, | |
| "grad_norm": 0.6759423017501831, | |
| "learning_rate": 3.5005e-06, | |
| "loss": 0.0077, | |
| "num_input_tokens_seen": 95230848, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 9.35, | |
| "grad_norm": 0.5954917669296265, | |
| "learning_rate": 3.2505e-06, | |
| "loss": 0.0081, | |
| "num_input_tokens_seen": 95742848, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 9.4, | |
| "grad_norm": 0.6435306072235107, | |
| "learning_rate": 3.0005000000000003e-06, | |
| "loss": 0.0079, | |
| "num_input_tokens_seen": 96254848, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 9.45, | |
| "grad_norm": 0.8906601071357727, | |
| "learning_rate": 2.7505e-06, | |
| "loss": 0.008, | |
| "num_input_tokens_seen": 96766848, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 9.5, | |
| "grad_norm": 1.4101794958114624, | |
| "learning_rate": 2.5005e-06, | |
| "loss": 0.0075, | |
| "num_input_tokens_seen": 97278848, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 9.55, | |
| "grad_norm": 0.7406792044639587, | |
| "learning_rate": 2.2505000000000003e-06, | |
| "loss": 0.0078, | |
| "num_input_tokens_seen": 97790848, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 9.6, | |
| "grad_norm": 1.437361240386963, | |
| "learning_rate": 2.0004999999999997e-06, | |
| "loss": 0.0077, | |
| "num_input_tokens_seen": 98302848, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 9.65, | |
| "grad_norm": 0.4781911373138428, | |
| "learning_rate": 1.7505e-06, | |
| "loss": 0.0078, | |
| "num_input_tokens_seen": 98814848, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 9.7, | |
| "grad_norm": 0.5876700282096863, | |
| "learning_rate": 1.5005e-06, | |
| "loss": 0.0075, | |
| "num_input_tokens_seen": 99326848, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 9.75, | |
| "grad_norm": 0.933368980884552, | |
| "learning_rate": 1.2505000000000001e-06, | |
| "loss": 0.008, | |
| "num_input_tokens_seen": 99838848, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 9.8, | |
| "grad_norm": 0.7791544198989868, | |
| "learning_rate": 1.0005e-06, | |
| "loss": 0.0075, | |
| "num_input_tokens_seen": 100350848, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 9.85, | |
| "grad_norm": 0.45317134261131287, | |
| "learning_rate": 7.505000000000001e-07, | |
| "loss": 0.0078, | |
| "num_input_tokens_seen": 100862848, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 9.9, | |
| "grad_norm": 1.5439448356628418, | |
| "learning_rate": 5.005e-07, | |
| "loss": 0.0074, | |
| "num_input_tokens_seen": 101374848, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 9.95, | |
| "grad_norm": 0.5587248206138611, | |
| "learning_rate": 2.5049999999999997e-07, | |
| "loss": 0.0079, | |
| "num_input_tokens_seen": 101886848, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.6633381247520447, | |
| "learning_rate": 5e-10, | |
| "loss": 0.0073, | |
| "num_input_tokens_seen": 102398720, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_combined_score": 0.14527450438803524, | |
| "eval_loss": 0.14527450501918793, | |
| "eval_mse": 0.14527450375688256, | |
| "eval_runtime": 29.5752, | |
| "eval_samples_per_second": 676.241, | |
| "eval_steps_per_second": 84.53, | |
| "num_input_tokens_seen": 102398720, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "num_input_tokens_seen": 102398720, | |
| "step": 100000, | |
| "total_flos": 5.262202453327104e+16, | |
| "train_loss": 0.056572345192432406, | |
| "train_runtime": 7202.8043, | |
| "train_samples_per_second": 111.066, | |
| "train_steps_per_second": 13.883, | |
| "train_tokens_per_second": 14216.507 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 100000, | |
| "num_input_tokens_seen": 102398720, | |
| "num_train_epochs": 10, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.262202453327104e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |