| { | |
| "best_metric": 0.08158940076828003, | |
| "best_model_checkpoint": "./fine-tuned/checkpoint-12500", | |
| "epoch": 2.64, | |
| "eval_steps": 500, | |
| "global_step": 16500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": 14499.107421875, | |
| "learning_rate": 2.9919999999999998e-05, | |
| "loss": 0.3351, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 9562.1748046875, | |
| "learning_rate": 2.9840000000000002e-05, | |
| "loss": 0.0964, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "grad_norm": 11098.59375, | |
| "learning_rate": 2.976e-05, | |
| "loss": 0.0895, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 9281.0146484375, | |
| "learning_rate": 2.968e-05, | |
| "loss": 0.0797, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 10050.3623046875, | |
| "learning_rate": 2.96e-05, | |
| "loss": 0.0812, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 7611.0849609375, | |
| "learning_rate": 2.9520000000000002e-05, | |
| "loss": 0.0755, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "grad_norm": 9915.1259765625, | |
| "learning_rate": 2.944e-05, | |
| "loss": 0.0793, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 10182.263671875, | |
| "learning_rate": 2.936e-05, | |
| "loss": 0.0775, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "grad_norm": 11287.8271484375, | |
| "learning_rate": 2.928e-05, | |
| "loss": 0.0782, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 6672.08251953125, | |
| "learning_rate": 2.92e-05, | |
| "loss": 0.0811, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "eval_loss": 0.09235642850399017, | |
| "eval_runtime": 116.7651, | |
| "eval_samples_per_second": 17.128, | |
| "eval_steps_per_second": 2.141, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "grad_norm": 6587.6513671875, | |
| "learning_rate": 2.9120000000000002e-05, | |
| "loss": 0.0815, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 6632.0947265625, | |
| "learning_rate": 2.904e-05, | |
| "loss": 0.0794, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "grad_norm": 9301.228515625, | |
| "learning_rate": 2.896e-05, | |
| "loss": 0.076, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 10575.0791015625, | |
| "learning_rate": 2.888e-05, | |
| "loss": 0.0791, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 8609.86328125, | |
| "learning_rate": 2.88e-05, | |
| "loss": 0.0799, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 11379.4423828125, | |
| "learning_rate": 2.8720000000000003e-05, | |
| "loss": 0.0759, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.136, | |
| "grad_norm": 8489.6904296875, | |
| "learning_rate": 2.864e-05, | |
| "loss": 0.0753, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 12353.6279296875, | |
| "learning_rate": 2.856e-05, | |
| "loss": 0.075, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.152, | |
| "grad_norm": 11535.3994140625, | |
| "learning_rate": 2.8480000000000002e-05, | |
| "loss": 0.0757, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 8291.939453125, | |
| "learning_rate": 2.84e-05, | |
| "loss": 0.0753, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_loss": 0.08949962258338928, | |
| "eval_runtime": 116.7407, | |
| "eval_samples_per_second": 17.132, | |
| "eval_steps_per_second": 2.141, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.168, | |
| "grad_norm": 8266.658203125, | |
| "learning_rate": 2.832e-05, | |
| "loss": 0.0767, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 6160.548828125, | |
| "learning_rate": 2.824e-05, | |
| "loss": 0.067, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.184, | |
| "grad_norm": 7343.408203125, | |
| "learning_rate": 2.816e-05, | |
| "loss": 0.0717, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 5661.76318359375, | |
| "learning_rate": 2.8080000000000002e-05, | |
| "loss": 0.0733, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 8678.46484375, | |
| "learning_rate": 2.8e-05, | |
| "loss": 0.0737, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 6331.21533203125, | |
| "learning_rate": 2.792e-05, | |
| "loss": 0.0696, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.216, | |
| "grad_norm": 10563.5400390625, | |
| "learning_rate": 2.784e-05, | |
| "loss": 0.0747, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 7221.74365234375, | |
| "learning_rate": 2.7760000000000002e-05, | |
| "loss": 0.0716, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.232, | |
| "grad_norm": 6486.46142578125, | |
| "learning_rate": 2.768e-05, | |
| "loss": 0.0711, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 6838.505859375, | |
| "learning_rate": 2.7600000000000003e-05, | |
| "loss": 0.0703, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "eval_loss": 0.08808805048465729, | |
| "eval_runtime": 116.8722, | |
| "eval_samples_per_second": 17.113, | |
| "eval_steps_per_second": 2.139, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.248, | |
| "grad_norm": 6751.6494140625, | |
| "learning_rate": 2.752e-05, | |
| "loss": 0.0781, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 5040.9033203125, | |
| "learning_rate": 2.7439999999999998e-05, | |
| "loss": 0.0686, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.264, | |
| "grad_norm": 8748.07421875, | |
| "learning_rate": 2.7360000000000002e-05, | |
| "loss": 0.0689, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 5971.705078125, | |
| "learning_rate": 2.728e-05, | |
| "loss": 0.0671, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 10833.1357421875, | |
| "learning_rate": 2.72e-05, | |
| "loss": 0.0734, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 10036.919921875, | |
| "learning_rate": 2.712e-05, | |
| "loss": 0.0715, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.296, | |
| "grad_norm": 7755.1669921875, | |
| "learning_rate": 2.704e-05, | |
| "loss": 0.0669, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 7584.822265625, | |
| "learning_rate": 2.696e-05, | |
| "loss": 0.0699, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.312, | |
| "grad_norm": 10103.142578125, | |
| "learning_rate": 2.688e-05, | |
| "loss": 0.07, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5768.24267578125, | |
| "learning_rate": 2.68e-05, | |
| "loss": 0.0709, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_loss": 0.08704760670661926, | |
| "eval_runtime": 116.8362, | |
| "eval_samples_per_second": 17.118, | |
| "eval_steps_per_second": 2.14, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.328, | |
| "grad_norm": 6016.46826171875, | |
| "learning_rate": 2.672e-05, | |
| "loss": 0.0663, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 6869.53076171875, | |
| "learning_rate": 2.6640000000000002e-05, | |
| "loss": 0.073, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.344, | |
| "grad_norm": 6099.595703125, | |
| "learning_rate": 2.656e-05, | |
| "loss": 0.0667, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 6923.919921875, | |
| "learning_rate": 2.648e-05, | |
| "loss": 0.0653, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 8005.85595703125, | |
| "learning_rate": 2.64e-05, | |
| "loss": 0.0685, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 6473.466796875, | |
| "learning_rate": 2.632e-05, | |
| "loss": 0.0678, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.376, | |
| "grad_norm": 7177.6328125, | |
| "learning_rate": 2.6240000000000003e-05, | |
| "loss": 0.0637, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 5574.75439453125, | |
| "learning_rate": 2.616e-05, | |
| "loss": 0.0698, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.392, | |
| "grad_norm": 6910.39599609375, | |
| "learning_rate": 2.608e-05, | |
| "loss": 0.0645, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 5913.9775390625, | |
| "learning_rate": 2.6000000000000002e-05, | |
| "loss": 0.068, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_loss": 0.08615937829017639, | |
| "eval_runtime": 116.9591, | |
| "eval_samples_per_second": 17.1, | |
| "eval_steps_per_second": 2.137, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.408, | |
| "grad_norm": 7447.5625, | |
| "learning_rate": 2.592e-05, | |
| "loss": 0.0672, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 7057.10009765625, | |
| "learning_rate": 2.584e-05, | |
| "loss": 0.0683, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.424, | |
| "grad_norm": 8279.7392578125, | |
| "learning_rate": 2.576e-05, | |
| "loss": 0.0631, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 7663.275390625, | |
| "learning_rate": 2.568e-05, | |
| "loss": 0.0698, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 7116.74609375, | |
| "learning_rate": 2.5600000000000002e-05, | |
| "loss": 0.0703, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 8839.5986328125, | |
| "learning_rate": 2.552e-05, | |
| "loss": 0.0654, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.456, | |
| "grad_norm": 7157.17333984375, | |
| "learning_rate": 2.544e-05, | |
| "loss": 0.0628, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 7690.267578125, | |
| "learning_rate": 2.536e-05, | |
| "loss": 0.0694, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.472, | |
| "grad_norm": 5030.39501953125, | |
| "learning_rate": 2.5280000000000002e-05, | |
| "loss": 0.0654, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 7269.51171875, | |
| "learning_rate": 2.52e-05, | |
| "loss": 0.0732, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_loss": 0.08551913499832153, | |
| "eval_runtime": 116.545, | |
| "eval_samples_per_second": 17.161, | |
| "eval_steps_per_second": 2.145, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.488, | |
| "grad_norm": 7060.21826171875, | |
| "learning_rate": 2.5120000000000003e-05, | |
| "loss": 0.0684, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 7841.55322265625, | |
| "learning_rate": 2.504e-05, | |
| "loss": 0.0653, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.504, | |
| "grad_norm": 5290.3271484375, | |
| "learning_rate": 2.4959999999999998e-05, | |
| "loss": 0.0668, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 6200.4853515625, | |
| "learning_rate": 2.4880000000000002e-05, | |
| "loss": 0.0665, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 6859.83544921875, | |
| "learning_rate": 2.48e-05, | |
| "loss": 0.0678, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 7718.70068359375, | |
| "learning_rate": 2.472e-05, | |
| "loss": 0.0679, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.536, | |
| "grad_norm": 10752.4873046875, | |
| "learning_rate": 2.464e-05, | |
| "loss": 0.062, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 6991.5087890625, | |
| "learning_rate": 2.456e-05, | |
| "loss": 0.0659, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.552, | |
| "grad_norm": 6204.99658203125, | |
| "learning_rate": 2.448e-05, | |
| "loss": 0.0636, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 13521.5908203125, | |
| "learning_rate": 2.44e-05, | |
| "loss": 0.0671, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "eval_loss": 0.08540560305118561, | |
| "eval_runtime": 116.9131, | |
| "eval_samples_per_second": 17.107, | |
| "eval_steps_per_second": 2.138, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.568, | |
| "grad_norm": 6408.47265625, | |
| "learning_rate": 2.432e-05, | |
| "loss": 0.0652, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 5537.69287109375, | |
| "learning_rate": 2.4240000000000002e-05, | |
| "loss": 0.0633, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.584, | |
| "grad_norm": 7664.20703125, | |
| "learning_rate": 2.4160000000000002e-05, | |
| "loss": 0.0652, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 5726.9697265625, | |
| "learning_rate": 2.408e-05, | |
| "loss": 0.0667, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 6898.275390625, | |
| "learning_rate": 2.4e-05, | |
| "loss": 0.0675, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 9309.822265625, | |
| "learning_rate": 2.392e-05, | |
| "loss": 0.0668, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.616, | |
| "grad_norm": 8566.080078125, | |
| "learning_rate": 2.384e-05, | |
| "loss": 0.064, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 5729.54833984375, | |
| "learning_rate": 2.3760000000000003e-05, | |
| "loss": 0.0635, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.632, | |
| "grad_norm": 9562.8701171875, | |
| "learning_rate": 2.368e-05, | |
| "loss": 0.0643, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 4704.76025390625, | |
| "learning_rate": 2.3599999999999998e-05, | |
| "loss": 0.0649, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_loss": 0.08466340601444244, | |
| "eval_runtime": 116.6411, | |
| "eval_samples_per_second": 17.147, | |
| "eval_steps_per_second": 2.143, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.648, | |
| "grad_norm": 7243.01611328125, | |
| "learning_rate": 2.3520000000000002e-05, | |
| "loss": 0.0622, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 7986.32568359375, | |
| "learning_rate": 2.344e-05, | |
| "loss": 0.0678, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.664, | |
| "grad_norm": 9114.8974609375, | |
| "learning_rate": 2.336e-05, | |
| "loss": 0.0671, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 8830.62109375, | |
| "learning_rate": 2.328e-05, | |
| "loss": 0.0679, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 9311.2412109375, | |
| "learning_rate": 2.32e-05, | |
| "loss": 0.063, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 31307.103515625, | |
| "learning_rate": 2.3120000000000002e-05, | |
| "loss": 0.0649, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.696, | |
| "grad_norm": 9040.0126953125, | |
| "learning_rate": 2.304e-05, | |
| "loss": 0.0633, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 7183.91650390625, | |
| "learning_rate": 2.296e-05, | |
| "loss": 0.0582, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.712, | |
| "grad_norm": 6460.2998046875, | |
| "learning_rate": 2.288e-05, | |
| "loss": 0.0672, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 6104.8671875, | |
| "learning_rate": 2.2800000000000002e-05, | |
| "loss": 0.0597, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "eval_loss": 0.0842796117067337, | |
| "eval_runtime": 116.9361, | |
| "eval_samples_per_second": 17.103, | |
| "eval_steps_per_second": 2.138, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.728, | |
| "grad_norm": 7553.5556640625, | |
| "learning_rate": 2.272e-05, | |
| "loss": 0.063, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 7194.16162109375, | |
| "learning_rate": 2.2640000000000003e-05, | |
| "loss": 0.0597, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.744, | |
| "grad_norm": 7578.23583984375, | |
| "learning_rate": 2.256e-05, | |
| "loss": 0.0627, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 7874.51904296875, | |
| "learning_rate": 2.2479999999999998e-05, | |
| "loss": 0.0628, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 6014.06640625, | |
| "learning_rate": 2.2400000000000002e-05, | |
| "loss": 0.0651, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 7170.10400390625, | |
| "learning_rate": 2.232e-05, | |
| "loss": 0.0656, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.776, | |
| "grad_norm": 7596.84326171875, | |
| "learning_rate": 2.224e-05, | |
| "loss": 0.0598, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 7802.14990234375, | |
| "learning_rate": 2.216e-05, | |
| "loss": 0.0605, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.792, | |
| "grad_norm": 5468.1845703125, | |
| "learning_rate": 2.208e-05, | |
| "loss": 0.0594, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 5185.58642578125, | |
| "learning_rate": 2.2e-05, | |
| "loss": 0.0586, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_loss": 0.08396206796169281, | |
| "eval_runtime": 116.8224, | |
| "eval_samples_per_second": 17.12, | |
| "eval_steps_per_second": 2.14, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.808, | |
| "grad_norm": 6047.43359375, | |
| "learning_rate": 2.192e-05, | |
| "loss": 0.0673, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 6286.21484375, | |
| "learning_rate": 2.184e-05, | |
| "loss": 0.0609, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.824, | |
| "grad_norm": 6187.03369140625, | |
| "learning_rate": 2.1760000000000002e-05, | |
| "loss": 0.0628, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 4476.73095703125, | |
| "learning_rate": 2.1680000000000002e-05, | |
| "loss": 0.0626, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 6180.27490234375, | |
| "learning_rate": 2.16e-05, | |
| "loss": 0.061, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 8477.626953125, | |
| "learning_rate": 2.152e-05, | |
| "loss": 0.0638, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.856, | |
| "grad_norm": 11541.119140625, | |
| "learning_rate": 2.144e-05, | |
| "loss": 0.0602, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 6183.49609375, | |
| "learning_rate": 2.136e-05, | |
| "loss": 0.0645, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.872, | |
| "grad_norm": 7597.5810546875, | |
| "learning_rate": 2.1280000000000003e-05, | |
| "loss": 0.067, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 8438.478515625, | |
| "learning_rate": 2.12e-05, | |
| "loss": 0.0628, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "eval_loss": 0.08360794186592102, | |
| "eval_runtime": 116.6576, | |
| "eval_samples_per_second": 17.144, | |
| "eval_steps_per_second": 2.143, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.888, | |
| "grad_norm": 8200.35546875, | |
| "learning_rate": 2.1119999999999998e-05, | |
| "loss": 0.0676, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 8816.8076171875, | |
| "learning_rate": 2.1040000000000002e-05, | |
| "loss": 0.0626, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.904, | |
| "grad_norm": 8886.630859375, | |
| "learning_rate": 2.096e-05, | |
| "loss": 0.0657, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 8212.525390625, | |
| "learning_rate": 2.088e-05, | |
| "loss": 0.0619, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 5723.00439453125, | |
| "learning_rate": 2.08e-05, | |
| "loss": 0.0623, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 8616.3349609375, | |
| "learning_rate": 2.072e-05, | |
| "loss": 0.063, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.936, | |
| "grad_norm": 7717.373046875, | |
| "learning_rate": 2.064e-05, | |
| "loss": 0.063, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 6325.8193359375, | |
| "learning_rate": 2.056e-05, | |
| "loss": 0.0628, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.952, | |
| "grad_norm": 6938.89111328125, | |
| "learning_rate": 2.048e-05, | |
| "loss": 0.0585, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 8704.166015625, | |
| "learning_rate": 2.04e-05, | |
| "loss": 0.0634, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_loss": 0.08321517705917358, | |
| "eval_runtime": 116.6701, | |
| "eval_samples_per_second": 17.142, | |
| "eval_steps_per_second": 2.143, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.968, | |
| "grad_norm": 5835.19189453125, | |
| "learning_rate": 2.0320000000000002e-05, | |
| "loss": 0.0643, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": 5896.76318359375, | |
| "learning_rate": 2.024e-05, | |
| "loss": 0.0625, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.984, | |
| "grad_norm": 6958.45751953125, | |
| "learning_rate": 2.016e-05, | |
| "loss": 0.0657, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 4680.04736328125, | |
| "learning_rate": 2.008e-05, | |
| "loss": 0.0632, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 8230.8056640625, | |
| "learning_rate": 1.9999999999999998e-05, | |
| "loss": 0.0603, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.008, | |
| "grad_norm": 5693.77001953125, | |
| "learning_rate": 1.9920000000000002e-05, | |
| "loss": 0.0574, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.016, | |
| "grad_norm": 14030.3583984375, | |
| "learning_rate": 1.984e-05, | |
| "loss": 0.0563, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.024, | |
| "grad_norm": 11693.09375, | |
| "learning_rate": 1.976e-05, | |
| "loss": 0.0558, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.032, | |
| "grad_norm": 5772.1845703125, | |
| "learning_rate": 1.968e-05, | |
| "loss": 0.0544, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 8641.919921875, | |
| "learning_rate": 1.96e-05, | |
| "loss": 0.0606, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "eval_loss": 0.08356834203004837, | |
| "eval_runtime": 116.7914, | |
| "eval_samples_per_second": 17.125, | |
| "eval_steps_per_second": 2.141, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.048, | |
| "grad_norm": 6437.4033203125, | |
| "learning_rate": 1.952e-05, | |
| "loss": 0.0567, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.056, | |
| "grad_norm": 5099.38330078125, | |
| "learning_rate": 1.944e-05, | |
| "loss": 0.0553, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.064, | |
| "grad_norm": 5254.07275390625, | |
| "learning_rate": 1.936e-05, | |
| "loss": 0.0564, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 1.072, | |
| "grad_norm": 7453.3330078125, | |
| "learning_rate": 1.9280000000000002e-05, | |
| "loss": 0.0573, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 3853.006103515625, | |
| "learning_rate": 1.9200000000000003e-05, | |
| "loss": 0.0607, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 1.088, | |
| "grad_norm": 8804.1083984375, | |
| "learning_rate": 1.912e-05, | |
| "loss": 0.0578, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.096, | |
| "grad_norm": 5899.22021484375, | |
| "learning_rate": 1.904e-05, | |
| "loss": 0.0555, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 1.104, | |
| "grad_norm": 8429.76171875, | |
| "learning_rate": 1.896e-05, | |
| "loss": 0.0539, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.112, | |
| "grad_norm": 9160.4794921875, | |
| "learning_rate": 1.888e-05, | |
| "loss": 0.0572, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 4707.27099609375, | |
| "learning_rate": 1.8800000000000003e-05, | |
| "loss": 0.0563, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_loss": 0.08350159972906113, | |
| "eval_runtime": 116.6938, | |
| "eval_samples_per_second": 17.139, | |
| "eval_steps_per_second": 2.142, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.1280000000000001, | |
| "grad_norm": 5663.18603515625, | |
| "learning_rate": 1.872e-05, | |
| "loss": 0.0537, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 1.1360000000000001, | |
| "grad_norm": 9569.765625, | |
| "learning_rate": 1.8639999999999998e-05, | |
| "loss": 0.0607, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 1.144, | |
| "grad_norm": 7370.98046875, | |
| "learning_rate": 1.8560000000000002e-05, | |
| "loss": 0.0607, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 1.152, | |
| "grad_norm": 5951.6533203125, | |
| "learning_rate": 1.848e-05, | |
| "loss": 0.0547, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 8285.0830078125, | |
| "learning_rate": 1.84e-05, | |
| "loss": 0.0589, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 1.168, | |
| "grad_norm": 7549.8271484375, | |
| "learning_rate": 1.832e-05, | |
| "loss": 0.0587, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 1.176, | |
| "grad_norm": 7480.25927734375, | |
| "learning_rate": 1.824e-05, | |
| "loss": 0.058, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 1.184, | |
| "grad_norm": 35994.15234375, | |
| "learning_rate": 1.816e-05, | |
| "loss": 0.0585, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 1.192, | |
| "grad_norm": 7489.05859375, | |
| "learning_rate": 1.808e-05, | |
| "loss": 0.0616, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 6134.80126953125, | |
| "learning_rate": 1.8e-05, | |
| "loss": 0.0572, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "eval_loss": 0.08285799622535706, | |
| "eval_runtime": 116.9169, | |
| "eval_samples_per_second": 17.106, | |
| "eval_steps_per_second": 2.138, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.208, | |
| "grad_norm": 4982.9521484375, | |
| "learning_rate": 1.792e-05, | |
| "loss": 0.0569, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 1.216, | |
| "grad_norm": 5407.9384765625, | |
| "learning_rate": 1.7840000000000002e-05, | |
| "loss": 0.0579, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 1.224, | |
| "grad_norm": 6399.041015625, | |
| "learning_rate": 1.776e-05, | |
| "loss": 0.0569, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 1.232, | |
| "grad_norm": 6688.9658203125, | |
| "learning_rate": 1.768e-05, | |
| "loss": 0.0598, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 10116.4990234375, | |
| "learning_rate": 1.76e-05, | |
| "loss": 0.0538, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 1.248, | |
| "grad_norm": 7761.98876953125, | |
| "learning_rate": 1.7519999999999998e-05, | |
| "loss": 0.0549, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.256, | |
| "grad_norm": 5940.802734375, | |
| "learning_rate": 1.7440000000000002e-05, | |
| "loss": 0.0537, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 1.264, | |
| "grad_norm": 7946.06787109375, | |
| "learning_rate": 1.736e-05, | |
| "loss": 0.0548, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.272, | |
| "grad_norm": 8282.916015625, | |
| "learning_rate": 1.728e-05, | |
| "loss": 0.0539, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 6786.72509765625, | |
| "learning_rate": 1.72e-05, | |
| "loss": 0.0573, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_loss": 0.08285758644342422, | |
| "eval_runtime": 116.7577, | |
| "eval_samples_per_second": 17.129, | |
| "eval_steps_per_second": 2.141, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.288, | |
| "grad_norm": 6129.27783203125, | |
| "learning_rate": 1.712e-05, | |
| "loss": 0.0578, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 1.296, | |
| "grad_norm": 6502.31298828125, | |
| "learning_rate": 1.704e-05, | |
| "loss": 0.0513, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.304, | |
| "grad_norm": 10347.439453125, | |
| "learning_rate": 1.696e-05, | |
| "loss": 0.0527, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 1.312, | |
| "grad_norm": 7870.1796875, | |
| "learning_rate": 1.688e-05, | |
| "loss": 0.0565, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 7197.3447265625, | |
| "learning_rate": 1.6800000000000002e-05, | |
| "loss": 0.0538, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 1.328, | |
| "grad_norm": 5525.79931640625, | |
| "learning_rate": 1.672e-05, | |
| "loss": 0.0579, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.336, | |
| "grad_norm": 5812.7490234375, | |
| "learning_rate": 1.664e-05, | |
| "loss": 0.0543, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 5728.1904296875, | |
| "learning_rate": 1.656e-05, | |
| "loss": 0.0572, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.3519999999999999, | |
| "grad_norm": 6965.53759765625, | |
| "learning_rate": 1.648e-05, | |
| "loss": 0.0535, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 6986.52783203125, | |
| "learning_rate": 1.64e-05, | |
| "loss": 0.0549, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "eval_loss": 0.08279111981391907, | |
| "eval_runtime": 116.6629, | |
| "eval_samples_per_second": 17.143, | |
| "eval_steps_per_second": 2.143, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.3679999999999999, | |
| "grad_norm": 6076.61865234375, | |
| "learning_rate": 1.6320000000000003e-05, | |
| "loss": 0.0566, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 1.376, | |
| "grad_norm": 6356.2578125, | |
| "learning_rate": 1.624e-05, | |
| "loss": 0.0527, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.384, | |
| "grad_norm": 8593.4482421875, | |
| "learning_rate": 1.6159999999999998e-05, | |
| "loss": 0.0611, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 1.392, | |
| "grad_norm": 6525.712890625, | |
| "learning_rate": 1.6080000000000002e-05, | |
| "loss": 0.0508, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 6376.82177734375, | |
| "learning_rate": 1.6e-05, | |
| "loss": 0.0554, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 1.408, | |
| "grad_norm": 7890.4990234375, | |
| "learning_rate": 1.592e-05, | |
| "loss": 0.0546, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.416, | |
| "grad_norm": 5426.74267578125, | |
| "learning_rate": 1.584e-05, | |
| "loss": 0.0558, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 1.424, | |
| "grad_norm": 8708.7294921875, | |
| "learning_rate": 1.576e-05, | |
| "loss": 0.0597, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.432, | |
| "grad_norm": 7744.2490234375, | |
| "learning_rate": 1.568e-05, | |
| "loss": 0.0553, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 4320.080078125, | |
| "learning_rate": 1.56e-05, | |
| "loss": 0.0602, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_loss": 0.08268450945615768, | |
| "eval_runtime": 116.8196, | |
| "eval_samples_per_second": 17.12, | |
| "eval_steps_per_second": 2.14, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.448, | |
| "grad_norm": 5681.900390625, | |
| "learning_rate": 1.552e-05, | |
| "loss": 0.0549, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 1.456, | |
| "grad_norm": 5498.50048828125, | |
| "learning_rate": 1.544e-05, | |
| "loss": 0.0551, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.464, | |
| "grad_norm": 7044.8017578125, | |
| "learning_rate": 1.5360000000000002e-05, | |
| "loss": 0.0557, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 1.472, | |
| "grad_norm": 8311.8076171875, | |
| "learning_rate": 1.528e-05, | |
| "loss": 0.0559, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 10259.4189453125, | |
| "learning_rate": 1.5200000000000002e-05, | |
| "loss": 0.057, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 1.488, | |
| "grad_norm": 7944.630859375, | |
| "learning_rate": 1.5120000000000001e-05, | |
| "loss": 0.0541, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.496, | |
| "grad_norm": 9513.1875, | |
| "learning_rate": 1.504e-05, | |
| "loss": 0.0558, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 1.504, | |
| "grad_norm": 6013.54296875, | |
| "learning_rate": 1.4959999999999999e-05, | |
| "loss": 0.0532, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.512, | |
| "grad_norm": 7162.22314453125, | |
| "learning_rate": 1.488e-05, | |
| "loss": 0.0553, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 6351.9833984375, | |
| "learning_rate": 1.48e-05, | |
| "loss": 0.0548, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "eval_loss": 0.08252418041229248, | |
| "eval_runtime": 116.7082, | |
| "eval_samples_per_second": 17.137, | |
| "eval_steps_per_second": 2.142, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.528, | |
| "grad_norm": 6762.00244140625, | |
| "learning_rate": 1.472e-05, | |
| "loss": 0.0529, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 1.536, | |
| "grad_norm": 7704.66748046875, | |
| "learning_rate": 1.464e-05, | |
| "loss": 0.0576, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.544, | |
| "grad_norm": 5400.18798828125, | |
| "learning_rate": 1.4560000000000001e-05, | |
| "loss": 0.0556, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 1.552, | |
| "grad_norm": 6167.47216796875, | |
| "learning_rate": 1.448e-05, | |
| "loss": 0.0547, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 5073.39892578125, | |
| "learning_rate": 1.44e-05, | |
| "loss": 0.0544, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 1.568, | |
| "grad_norm": 6849.08447265625, | |
| "learning_rate": 1.432e-05, | |
| "loss": 0.0571, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.576, | |
| "grad_norm": 6866.765625, | |
| "learning_rate": 1.4240000000000001e-05, | |
| "loss": 0.0518, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 1.584, | |
| "grad_norm": 8185.33740234375, | |
| "learning_rate": 1.416e-05, | |
| "loss": 0.0605, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 1.592, | |
| "grad_norm": 7759.45361328125, | |
| "learning_rate": 1.408e-05, | |
| "loss": 0.0581, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 5736.8740234375, | |
| "learning_rate": 1.4e-05, | |
| "loss": 0.0582, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_loss": 0.08249519765377045, | |
| "eval_runtime": 116.9496, | |
| "eval_samples_per_second": 17.101, | |
| "eval_steps_per_second": 2.138, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.608, | |
| "grad_norm": 5240.361328125, | |
| "learning_rate": 1.392e-05, | |
| "loss": 0.0546, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 1.616, | |
| "grad_norm": 7000.00927734375, | |
| "learning_rate": 1.384e-05, | |
| "loss": 0.0535, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 1.624, | |
| "grad_norm": 8141.75048828125, | |
| "learning_rate": 1.376e-05, | |
| "loss": 0.0555, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": 6566.3662109375, | |
| "learning_rate": 1.3680000000000001e-05, | |
| "loss": 0.0518, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 1.6400000000000001, | |
| "grad_norm": 7028.8935546875, | |
| "learning_rate": 1.36e-05, | |
| "loss": 0.0572, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 1.6480000000000001, | |
| "grad_norm": 13007.5703125, | |
| "learning_rate": 1.352e-05, | |
| "loss": 0.0567, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 1.6560000000000001, | |
| "grad_norm": 6286.06640625, | |
| "learning_rate": 1.344e-05, | |
| "loss": 0.0529, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": 6360.68408203125, | |
| "learning_rate": 1.336e-05, | |
| "loss": 0.054, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 1.6720000000000002, | |
| "grad_norm": 8098.84228515625, | |
| "learning_rate": 1.328e-05, | |
| "loss": 0.0592, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 6886.65283203125, | |
| "learning_rate": 1.32e-05, | |
| "loss": 0.0524, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.6800000000000002, | |
| "eval_loss": 0.08225961029529572, | |
| "eval_runtime": 116.8647, | |
| "eval_samples_per_second": 17.114, | |
| "eval_steps_per_second": 2.139, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.688, | |
| "grad_norm": 5443.7119140625, | |
| "learning_rate": 1.3120000000000001e-05, | |
| "loss": 0.0554, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 1.696, | |
| "grad_norm": 6497.8408203125, | |
| "learning_rate": 1.304e-05, | |
| "loss": 0.057, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 1.704, | |
| "grad_norm": 5618.49853515625, | |
| "learning_rate": 1.296e-05, | |
| "loss": 0.0498, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 1.712, | |
| "grad_norm": 7447.96728515625, | |
| "learning_rate": 1.288e-05, | |
| "loss": 0.0568, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 8283.306640625, | |
| "learning_rate": 1.2800000000000001e-05, | |
| "loss": 0.0566, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 1.728, | |
| "grad_norm": 7497.0419921875, | |
| "learning_rate": 1.272e-05, | |
| "loss": 0.0502, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 1.736, | |
| "grad_norm": 8445.2421875, | |
| "learning_rate": 1.2640000000000001e-05, | |
| "loss": 0.0562, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 1.744, | |
| "grad_norm": 15980.0498046875, | |
| "learning_rate": 1.2560000000000002e-05, | |
| "loss": 0.0588, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 1.752, | |
| "grad_norm": 5444.55615234375, | |
| "learning_rate": 1.2479999999999999e-05, | |
| "loss": 0.0564, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 7009.3037109375, | |
| "learning_rate": 1.24e-05, | |
| "loss": 0.0549, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "eval_loss": 0.08220627158880234, | |
| "eval_runtime": 116.957, | |
| "eval_samples_per_second": 17.1, | |
| "eval_steps_per_second": 2.138, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.768, | |
| "grad_norm": 5123.0029296875, | |
| "learning_rate": 1.232e-05, | |
| "loss": 0.0562, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 1.776, | |
| "grad_norm": 7975.41064453125, | |
| "learning_rate": 1.224e-05, | |
| "loss": 0.0515, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 1.784, | |
| "grad_norm": 5846.47705078125, | |
| "learning_rate": 1.216e-05, | |
| "loss": 0.054, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 1.792, | |
| "grad_norm": 7158.12109375, | |
| "learning_rate": 1.2080000000000001e-05, | |
| "loss": 0.0577, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 5405.5224609375, | |
| "learning_rate": 1.2e-05, | |
| "loss": 0.0538, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 1.808, | |
| "grad_norm": 7155.9677734375, | |
| "learning_rate": 1.192e-05, | |
| "loss": 0.0539, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 1.8159999999999998, | |
| "grad_norm": 6886.369140625, | |
| "learning_rate": 1.184e-05, | |
| "loss": 0.0565, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": 7139.15283203125, | |
| "learning_rate": 1.1760000000000001e-05, | |
| "loss": 0.0539, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 1.8319999999999999, | |
| "grad_norm": 5965.82666015625, | |
| "learning_rate": 1.168e-05, | |
| "loss": 0.0587, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 6557.6708984375, | |
| "learning_rate": 1.16e-05, | |
| "loss": 0.0552, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.8399999999999999, | |
| "eval_loss": 0.08207839727401733, | |
| "eval_runtime": 116.751, | |
| "eval_samples_per_second": 17.13, | |
| "eval_steps_per_second": 2.141, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.8479999999999999, | |
| "grad_norm": 5619.83984375, | |
| "learning_rate": 1.152e-05, | |
| "loss": 0.0563, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": 92426.8046875, | |
| "learning_rate": 1.144e-05, | |
| "loss": 0.0588, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 1.8639999999999999, | |
| "grad_norm": 7583.005859375, | |
| "learning_rate": 1.136e-05, | |
| "loss": 0.0559, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 1.8719999999999999, | |
| "grad_norm": 6395.92578125, | |
| "learning_rate": 1.128e-05, | |
| "loss": 0.0552, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 9939.912109375, | |
| "learning_rate": 1.1200000000000001e-05, | |
| "loss": 0.0523, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 1.888, | |
| "grad_norm": 5679.93212890625, | |
| "learning_rate": 1.112e-05, | |
| "loss": 0.0585, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 1.896, | |
| "grad_norm": 6536.05419921875, | |
| "learning_rate": 1.104e-05, | |
| "loss": 0.0533, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 1.904, | |
| "grad_norm": 7333.63330078125, | |
| "learning_rate": 1.096e-05, | |
| "loss": 0.0566, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 1.912, | |
| "grad_norm": 7345.85009765625, | |
| "learning_rate": 1.0880000000000001e-05, | |
| "loss": 0.0555, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 21337.044921875, | |
| "learning_rate": 1.08e-05, | |
| "loss": 0.0576, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_loss": 0.08194975554943085, | |
| "eval_runtime": 116.8029, | |
| "eval_samples_per_second": 17.123, | |
| "eval_steps_per_second": 2.14, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.928, | |
| "grad_norm": 6469.14306640625, | |
| "learning_rate": 1.072e-05, | |
| "loss": 0.0584, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 1.936, | |
| "grad_norm": 7579.2998046875, | |
| "learning_rate": 1.0640000000000001e-05, | |
| "loss": 0.0573, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 1.944, | |
| "grad_norm": 8114.94921875, | |
| "learning_rate": 1.0559999999999999e-05, | |
| "loss": 0.0523, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 1.952, | |
| "grad_norm": 7263.44384765625, | |
| "learning_rate": 1.048e-05, | |
| "loss": 0.0517, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 8325.9580078125, | |
| "learning_rate": 1.04e-05, | |
| "loss": 0.0524, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 1.968, | |
| "grad_norm": 6577.01318359375, | |
| "learning_rate": 1.032e-05, | |
| "loss": 0.0533, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 1.976, | |
| "grad_norm": 6278.1826171875, | |
| "learning_rate": 1.024e-05, | |
| "loss": 0.0532, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 1.984, | |
| "grad_norm": 7769.2333984375, | |
| "learning_rate": 1.0160000000000001e-05, | |
| "loss": 0.0532, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 1.992, | |
| "grad_norm": 10089.91796875, | |
| "learning_rate": 1.008e-05, | |
| "loss": 0.0539, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 9177.8115234375, | |
| "learning_rate": 9.999999999999999e-06, | |
| "loss": 0.0588, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.08158940076828003, | |
| "eval_runtime": 116.7903, | |
| "eval_samples_per_second": 17.125, | |
| "eval_steps_per_second": 2.141, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 2.008, | |
| "grad_norm": 6336.53076171875, | |
| "learning_rate": 9.92e-06, | |
| "loss": 0.0466, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 2.016, | |
| "grad_norm": 4880.88330078125, | |
| "learning_rate": 9.84e-06, | |
| "loss": 0.0531, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 2.024, | |
| "grad_norm": 6478.1640625, | |
| "learning_rate": 9.76e-06, | |
| "loss": 0.0516, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 2.032, | |
| "grad_norm": 6105.318359375, | |
| "learning_rate": 9.68e-06, | |
| "loss": 0.0492, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 6270.1318359375, | |
| "learning_rate": 9.600000000000001e-06, | |
| "loss": 0.0511, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 2.048, | |
| "grad_norm": 5914.5458984375, | |
| "learning_rate": 9.52e-06, | |
| "loss": 0.0522, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 2.056, | |
| "grad_norm": 6194.03076171875, | |
| "learning_rate": 9.44e-06, | |
| "loss": 0.0535, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 2.064, | |
| "grad_norm": 7986.248046875, | |
| "learning_rate": 9.36e-06, | |
| "loss": 0.0529, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 2.072, | |
| "grad_norm": 10384.2099609375, | |
| "learning_rate": 9.280000000000001e-06, | |
| "loss": 0.0471, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 8849.5703125, | |
| "learning_rate": 9.2e-06, | |
| "loss": 0.0502, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "eval_loss": 0.08202869445085526, | |
| "eval_runtime": 117.0019, | |
| "eval_samples_per_second": 17.094, | |
| "eval_steps_per_second": 2.137, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 2.088, | |
| "grad_norm": 7875.97900390625, | |
| "learning_rate": 9.12e-06, | |
| "loss": 0.049, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 2.096, | |
| "grad_norm": 6825.78076171875, | |
| "learning_rate": 9.04e-06, | |
| "loss": 0.0465, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 2.104, | |
| "grad_norm": 5515.30322265625, | |
| "learning_rate": 8.96e-06, | |
| "loss": 0.0535, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 2.112, | |
| "grad_norm": 8940.48828125, | |
| "learning_rate": 8.88e-06, | |
| "loss": 0.0564, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 5110.7119140625, | |
| "learning_rate": 8.8e-06, | |
| "loss": 0.0509, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 2.128, | |
| "grad_norm": 8984.7353515625, | |
| "learning_rate": 8.720000000000001e-06, | |
| "loss": 0.0479, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 2.136, | |
| "grad_norm": 8438.55078125, | |
| "learning_rate": 8.64e-06, | |
| "loss": 0.0502, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 2.144, | |
| "grad_norm": 5724.0849609375, | |
| "learning_rate": 8.56e-06, | |
| "loss": 0.0501, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 2.152, | |
| "grad_norm": 7649.28955078125, | |
| "learning_rate": 8.48e-06, | |
| "loss": 0.0569, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 8429.0166015625, | |
| "learning_rate": 8.400000000000001e-06, | |
| "loss": 0.053, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "eval_loss": 0.08213882148265839, | |
| "eval_runtime": 116.6956, | |
| "eval_samples_per_second": 17.139, | |
| "eval_steps_per_second": 2.142, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 2.168, | |
| "grad_norm": 4402.388671875, | |
| "learning_rate": 8.32e-06, | |
| "loss": 0.0499, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 2.176, | |
| "grad_norm": 9858.970703125, | |
| "learning_rate": 8.24e-06, | |
| "loss": 0.0506, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 2.184, | |
| "grad_norm": 6748.5732421875, | |
| "learning_rate": 8.160000000000001e-06, | |
| "loss": 0.05, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 2.192, | |
| "grad_norm": 7720.3994140625, | |
| "learning_rate": 8.079999999999999e-06, | |
| "loss": 0.0504, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 5066.37060546875, | |
| "learning_rate": 8e-06, | |
| "loss": 0.0533, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 2.208, | |
| "grad_norm": 7975.1376953125, | |
| "learning_rate": 7.92e-06, | |
| "loss": 0.0482, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 2.216, | |
| "grad_norm": 6690.85302734375, | |
| "learning_rate": 7.84e-06, | |
| "loss": 0.0518, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 2.224, | |
| "grad_norm": 8501.337890625, | |
| "learning_rate": 7.76e-06, | |
| "loss": 0.0534, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 2.232, | |
| "grad_norm": 15215.427734375, | |
| "learning_rate": 7.680000000000001e-06, | |
| "loss": 0.0488, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 6265.7568359375, | |
| "learning_rate": 7.600000000000001e-06, | |
| "loss": 0.0468, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_loss": 0.08207998424768448, | |
| "eval_runtime": 116.7104, | |
| "eval_samples_per_second": 17.136, | |
| "eval_steps_per_second": 2.142, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.248, | |
| "grad_norm": 5661.556640625, | |
| "learning_rate": 7.52e-06, | |
| "loss": 0.0516, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 2.2560000000000002, | |
| "grad_norm": 6117.46728515625, | |
| "learning_rate": 7.44e-06, | |
| "loss": 0.0535, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 2.2640000000000002, | |
| "grad_norm": 5083.50634765625, | |
| "learning_rate": 7.36e-06, | |
| "loss": 0.0514, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 2.2720000000000002, | |
| "grad_norm": 6597.24365234375, | |
| "learning_rate": 7.280000000000001e-06, | |
| "loss": 0.0566, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 2.2800000000000002, | |
| "grad_norm": 7306.90185546875, | |
| "learning_rate": 7.2e-06, | |
| "loss": 0.0523, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 2.288, | |
| "grad_norm": 6694.41552734375, | |
| "learning_rate": 7.1200000000000004e-06, | |
| "loss": 0.0475, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 2.296, | |
| "grad_norm": 3753.303466796875, | |
| "learning_rate": 7.04e-06, | |
| "loss": 0.0501, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 2.304, | |
| "grad_norm": 5714.30078125, | |
| "learning_rate": 6.96e-06, | |
| "loss": 0.0485, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 2.312, | |
| "grad_norm": 7579.119140625, | |
| "learning_rate": 6.88e-06, | |
| "loss": 0.0504, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 6103.64599609375, | |
| "learning_rate": 6.8e-06, | |
| "loss": 0.0531, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "eval_loss": 0.08199251443147659, | |
| "eval_runtime": 116.661, | |
| "eval_samples_per_second": 17.144, | |
| "eval_steps_per_second": 2.143, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 2.328, | |
| "grad_norm": 7419.63623046875, | |
| "learning_rate": 6.72e-06, | |
| "loss": 0.0527, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 2.336, | |
| "grad_norm": 6152.6513671875, | |
| "learning_rate": 6.64e-06, | |
| "loss": 0.048, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 2.344, | |
| "grad_norm": 6703.68994140625, | |
| "learning_rate": 6.560000000000001e-06, | |
| "loss": 0.0537, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 2.352, | |
| "grad_norm": 8612.31640625, | |
| "learning_rate": 6.48e-06, | |
| "loss": 0.0512, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 6183.3798828125, | |
| "learning_rate": 6.4000000000000006e-06, | |
| "loss": 0.0499, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 2.368, | |
| "grad_norm": 7795.396484375, | |
| "learning_rate": 6.3200000000000005e-06, | |
| "loss": 0.0525, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 2.376, | |
| "grad_norm": 6911.2099609375, | |
| "learning_rate": 6.2399999999999995e-06, | |
| "loss": 0.0503, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 2.384, | |
| "grad_norm": 9744.9267578125, | |
| "learning_rate": 6.16e-06, | |
| "loss": 0.0509, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 2.392, | |
| "grad_norm": 4487.8115234375, | |
| "learning_rate": 6.08e-06, | |
| "loss": 0.0504, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 6276.47607421875, | |
| "learning_rate": 6e-06, | |
| "loss": 0.0505, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_loss": 0.08178989589214325, | |
| "eval_runtime": 116.6529, | |
| "eval_samples_per_second": 17.145, | |
| "eval_steps_per_second": 2.143, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 2.408, | |
| "grad_norm": 7706.4375, | |
| "learning_rate": 5.92e-06, | |
| "loss": 0.0513, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 2.416, | |
| "grad_norm": 6188.396484375, | |
| "learning_rate": 5.84e-06, | |
| "loss": 0.0511, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 2.424, | |
| "grad_norm": 6621.79345703125, | |
| "learning_rate": 5.76e-06, | |
| "loss": 0.0506, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 2.432, | |
| "grad_norm": 5284.65185546875, | |
| "learning_rate": 5.68e-06, | |
| "loss": 0.0486, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 6653.84716796875, | |
| "learning_rate": 5.600000000000001e-06, | |
| "loss": 0.053, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 2.448, | |
| "grad_norm": 6338.93505859375, | |
| "learning_rate": 5.52e-06, | |
| "loss": 0.0517, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 2.456, | |
| "grad_norm": 6020.87548828125, | |
| "learning_rate": 5.4400000000000004e-06, | |
| "loss": 0.0524, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 2.464, | |
| "grad_norm": 7275.64697265625, | |
| "learning_rate": 5.36e-06, | |
| "loss": 0.0516, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 2.472, | |
| "grad_norm": 5086.87744140625, | |
| "learning_rate": 5.279999999999999e-06, | |
| "loss": 0.0514, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 4989.05078125, | |
| "learning_rate": 5.2e-06, | |
| "loss": 0.0526, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "eval_loss": 0.08169461041688919, | |
| "eval_runtime": 116.7302, | |
| "eval_samples_per_second": 17.134, | |
| "eval_steps_per_second": 2.142, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 2.488, | |
| "grad_norm": 6472.25537109375, | |
| "learning_rate": 5.12e-06, | |
| "loss": 0.0496, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 2.496, | |
| "grad_norm": 6369.4833984375, | |
| "learning_rate": 5.04e-06, | |
| "loss": 0.0518, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 2.504, | |
| "grad_norm": 8784.1083984375, | |
| "learning_rate": 4.96e-06, | |
| "loss": 0.0547, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 2.512, | |
| "grad_norm": 8509.6650390625, | |
| "learning_rate": 4.88e-06, | |
| "loss": 0.0555, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 7856.84716796875, | |
| "learning_rate": 4.800000000000001e-06, | |
| "loss": 0.0513, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 2.528, | |
| "grad_norm": 6816.51123046875, | |
| "learning_rate": 4.72e-06, | |
| "loss": 0.0493, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 2.536, | |
| "grad_norm": 6773.06884765625, | |
| "learning_rate": 4.6400000000000005e-06, | |
| "loss": 0.05, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 2.544, | |
| "grad_norm": 9726.3818359375, | |
| "learning_rate": 4.56e-06, | |
| "loss": 0.0518, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 2.552, | |
| "grad_norm": 8707.6591796875, | |
| "learning_rate": 4.48e-06, | |
| "loss": 0.0499, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 4772.958984375, | |
| "learning_rate": 4.4e-06, | |
| "loss": 0.0522, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_loss": 0.08175843954086304, | |
| "eval_runtime": 116.8011, | |
| "eval_samples_per_second": 17.123, | |
| "eval_steps_per_second": 2.14, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.568, | |
| "grad_norm": 5753.2734375, | |
| "learning_rate": 4.32e-06, | |
| "loss": 0.0474, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 2.576, | |
| "grad_norm": 8457.216796875, | |
| "learning_rate": 4.24e-06, | |
| "loss": 0.0511, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 2.584, | |
| "grad_norm": 8808.5458984375, | |
| "learning_rate": 4.16e-06, | |
| "loss": 0.0497, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 2.592, | |
| "grad_norm": 7374.8994140625, | |
| "learning_rate": 4.080000000000001e-06, | |
| "loss": 0.0469, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 8480.7109375, | |
| "learning_rate": 4e-06, | |
| "loss": 0.0527, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 2.608, | |
| "grad_norm": 9268.271484375, | |
| "learning_rate": 3.92e-06, | |
| "loss": 0.0514, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 2.616, | |
| "grad_norm": 7013.30810546875, | |
| "learning_rate": 3.8400000000000005e-06, | |
| "loss": 0.0529, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 2.624, | |
| "grad_norm": 5726.31298828125, | |
| "learning_rate": 3.76e-06, | |
| "loss": 0.0532, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 2.632, | |
| "grad_norm": 6278.4267578125, | |
| "learning_rate": 3.68e-06, | |
| "loss": 0.0508, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 7901.65576171875, | |
| "learning_rate": 3.6e-06, | |
| "loss": 0.0489, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "eval_loss": 0.08172949403524399, | |
| "eval_runtime": 116.6616, | |
| "eval_samples_per_second": 17.144, | |
| "eval_steps_per_second": 2.143, | |
| "step": 16500 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 18750, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 8.038244155392e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |