{
  "best_metric": 0.06056738272309303,
  "best_model_checkpoint": "./vit-4-veggies/checkpoint-600",
  "epoch": 8.0,
  "eval_steps": 100,
  "global_step": 1232,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.06493506493506493,
      "grad_norm": 98652.796875,
      "learning_rate": 4.959415584415585e-05,
      "loss": 1.4476,
      "step": 10
    },
    {
      "epoch": 0.12987012987012986,
      "grad_norm": 94436.625,
      "learning_rate": 4.918831168831169e-05,
      "loss": 1.136,
      "step": 20
    },
    {
      "epoch": 0.19480519480519481,
      "grad_norm": 101840.984375,
      "learning_rate": 4.8782467532467536e-05,
      "loss": 0.9244,
      "step": 30
    },
    {
      "epoch": 0.2597402597402597,
      "grad_norm": 114599.859375,
      "learning_rate": 4.8376623376623384e-05,
      "loss": 0.6615,
      "step": 40
    },
    {
      "epoch": 0.3246753246753247,
      "grad_norm": 83704.421875,
      "learning_rate": 4.797077922077922e-05,
      "loss": 0.6185,
      "step": 50
    },
    {
      "epoch": 0.38961038961038963,
      "grad_norm": 78849.09375,
      "learning_rate": 4.756493506493507e-05,
      "loss": 0.4871,
      "step": 60
    },
    {
      "epoch": 0.45454545454545453,
      "grad_norm": 99447.1015625,
      "learning_rate": 4.715909090909091e-05,
      "loss": 0.4844,
      "step": 70
    },
    {
      "epoch": 0.5194805194805194,
      "grad_norm": 157846.703125,
      "learning_rate": 4.675324675324675e-05,
      "loss": 0.4082,
      "step": 80
    },
    {
      "epoch": 0.5844155844155844,
      "grad_norm": 131451.96875,
      "learning_rate": 4.63474025974026e-05,
      "loss": 0.3779,
      "step": 90
    },
    {
      "epoch": 0.6493506493506493,
      "grad_norm": 107052.734375,
      "learning_rate": 4.5941558441558444e-05,
      "loss": 0.3154,
      "step": 100
    },
    {
      "epoch": 0.6493506493506493,
      "eval_accuracy": 0.9434571890145396,
      "eval_loss": 0.3097687065601349,
      "eval_runtime": 12.2238,
      "eval_samples_per_second": 101.278,
      "eval_steps_per_second": 6.381,
      "step": 100
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 383554.3125,
      "learning_rate": 4.5535714285714286e-05,
      "loss": 0.2848,
      "step": 110
    },
    {
      "epoch": 0.7792207792207793,
      "grad_norm": 261821.75,
      "learning_rate": 4.5129870129870135e-05,
      "loss": 0.2766,
      "step": 120
    },
    {
      "epoch": 0.8441558441558441,
      "grad_norm": 116962.8203125,
      "learning_rate": 4.472402597402598e-05,
      "loss": 0.2735,
      "step": 130
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 126778.890625,
      "learning_rate": 4.431818181818182e-05,
      "loss": 0.2426,
      "step": 140
    },
    {
      "epoch": 0.974025974025974,
      "grad_norm": 41615.3984375,
      "learning_rate": 4.391233766233767e-05,
      "loss": 0.2355,
      "step": 150
    },
    {
      "epoch": 1.0389610389610389,
      "grad_norm": 41465.8828125,
      "learning_rate": 4.3506493506493503e-05,
      "loss": 0.2035,
      "step": 160
    },
    {
      "epoch": 1.103896103896104,
      "grad_norm": 336415.25,
      "learning_rate": 4.310064935064935e-05,
      "loss": 0.2156,
      "step": 170
    },
    {
      "epoch": 1.1688311688311688,
      "grad_norm": 49056.53125,
      "learning_rate": 4.26948051948052e-05,
      "loss": 0.2044,
      "step": 180
    },
    {
      "epoch": 1.2337662337662338,
      "grad_norm": 122884.6171875,
      "learning_rate": 4.228896103896104e-05,
      "loss": 0.2539,
      "step": 190
    },
    {
      "epoch": 1.2987012987012987,
      "grad_norm": 128831.859375,
      "learning_rate": 4.1883116883116886e-05,
      "loss": 0.1446,
      "step": 200
    },
    {
      "epoch": 1.2987012987012987,
      "eval_accuracy": 0.9434571890145396,
      "eval_loss": 0.2216598242521286,
      "eval_runtime": 12.6893,
      "eval_samples_per_second": 97.563,
      "eval_steps_per_second": 6.147,
      "step": 200
    },
    {
      "epoch": 1.3636363636363638,
      "grad_norm": 46098.05078125,
      "learning_rate": 4.1477272727272734e-05,
      "loss": 0.1041,
      "step": 210
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 81511.2109375,
      "learning_rate": 4.107142857142857e-05,
      "loss": 0.1112,
      "step": 220
    },
    {
      "epoch": 1.4935064935064934,
      "grad_norm": 324834.40625,
      "learning_rate": 4.066558441558442e-05,
      "loss": 0.14,
      "step": 230
    },
    {
      "epoch": 1.5584415584415585,
      "grad_norm": 143048.796875,
      "learning_rate": 4.025974025974026e-05,
      "loss": 0.2534,
      "step": 240
    },
    {
      "epoch": 1.6233766233766234,
      "grad_norm": 39697.8828125,
      "learning_rate": 3.98538961038961e-05,
      "loss": 0.1718,
      "step": 250
    },
    {
      "epoch": 1.6883116883116882,
      "grad_norm": 32256.904296875,
      "learning_rate": 3.944805194805195e-05,
      "loss": 0.1571,
      "step": 260
    },
    {
      "epoch": 1.7532467532467533,
      "grad_norm": 19842.701171875,
      "learning_rate": 3.9042207792207794e-05,
      "loss": 0.0853,
      "step": 270
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 12634.58984375,
      "learning_rate": 3.8636363636363636e-05,
      "loss": 0.0671,
      "step": 280
    },
    {
      "epoch": 1.883116883116883,
      "grad_norm": 37113.8984375,
      "learning_rate": 3.8230519480519485e-05,
      "loss": 0.0523,
      "step": 290
    },
    {
      "epoch": 1.948051948051948,
      "grad_norm": 11151.8271484375,
      "learning_rate": 3.782467532467533e-05,
      "loss": 0.0814,
      "step": 300
    },
    {
      "epoch": 1.948051948051948,
      "eval_accuracy": 0.9717285945072698,
      "eval_loss": 0.1310429573059082,
      "eval_runtime": 12.6419,
      "eval_samples_per_second": 97.928,
      "eval_steps_per_second": 6.17,
      "step": 300
    },
    {
      "epoch": 2.012987012987013,
      "grad_norm": 22527.939453125,
      "learning_rate": 3.741883116883117e-05,
      "loss": 0.0838,
      "step": 310
    },
    {
      "epoch": 2.0779220779220777,
      "grad_norm": 56426.72265625,
      "learning_rate": 3.701298701298702e-05,
      "loss": 0.0836,
      "step": 320
    },
    {
      "epoch": 2.142857142857143,
      "grad_norm": 175356.546875,
      "learning_rate": 3.6607142857142853e-05,
      "loss": 0.0688,
      "step": 330
    },
    {
      "epoch": 2.207792207792208,
      "grad_norm": 347378.875,
      "learning_rate": 3.62012987012987e-05,
      "loss": 0.07,
      "step": 340
    },
    {
      "epoch": 2.2727272727272725,
      "grad_norm": 12468.73828125,
      "learning_rate": 3.579545454545455e-05,
      "loss": 0.0364,
      "step": 350
    },
    {
      "epoch": 2.3376623376623376,
      "grad_norm": 6203.49267578125,
      "learning_rate": 3.5389610389610387e-05,
      "loss": 0.0303,
      "step": 360
    },
    {
      "epoch": 2.4025974025974026,
      "grad_norm": 5669.07568359375,
      "learning_rate": 3.4983766233766235e-05,
      "loss": 0.0258,
      "step": 370
    },
    {
      "epoch": 2.4675324675324677,
      "grad_norm": 6123.68603515625,
      "learning_rate": 3.4577922077922084e-05,
      "loss": 0.0664,
      "step": 380
    },
    {
      "epoch": 2.5324675324675323,
      "grad_norm": 4741.46435546875,
      "learning_rate": 3.417207792207792e-05,
      "loss": 0.0371,
      "step": 390
    },
    {
      "epoch": 2.5974025974025974,
      "grad_norm": 5111.3154296875,
      "learning_rate": 3.376623376623377e-05,
      "loss": 0.0438,
      "step": 400
    },
    {
      "epoch": 2.5974025974025974,
      "eval_accuracy": 0.9830371567043619,
      "eval_loss": 0.08745533972978592,
      "eval_runtime": 12.6006,
      "eval_samples_per_second": 98.25,
      "eval_steps_per_second": 6.19,
      "step": 400
    },
    {
      "epoch": 2.6623376623376624,
      "grad_norm": 6704.41015625,
      "learning_rate": 3.336038961038961e-05,
      "loss": 0.0687,
      "step": 410
    },
    {
      "epoch": 2.7272727272727275,
      "grad_norm": 5477.58544921875,
      "learning_rate": 3.295454545454545e-05,
      "loss": 0.0178,
      "step": 420
    },
    {
      "epoch": 2.792207792207792,
      "grad_norm": 3572.594970703125,
      "learning_rate": 3.25487012987013e-05,
      "loss": 0.0581,
      "step": 430
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 3537.077880859375,
      "learning_rate": 3.2142857142857144e-05,
      "loss": 0.0337,
      "step": 440
    },
    {
      "epoch": 2.9220779220779223,
      "grad_norm": 247302.34375,
      "learning_rate": 3.1737012987012986e-05,
      "loss": 0.032,
      "step": 450
    },
    {
      "epoch": 2.987012987012987,
      "grad_norm": 3149.987548828125,
      "learning_rate": 3.1331168831168835e-05,
      "loss": 0.0395,
      "step": 460
    },
    {
      "epoch": 3.051948051948052,
      "grad_norm": 2695.8544921875,
      "learning_rate": 3.092532467532468e-05,
      "loss": 0.0112,
      "step": 470
    },
    {
      "epoch": 3.116883116883117,
      "grad_norm": 3298.885986328125,
      "learning_rate": 3.051948051948052e-05,
      "loss": 0.0131,
      "step": 480
    },
    {
      "epoch": 3.1818181818181817,
      "grad_norm": 2481.77685546875,
      "learning_rate": 3.0113636363636365e-05,
      "loss": 0.0201,
      "step": 490
    },
    {
      "epoch": 3.2467532467532467,
      "grad_norm": 1988.6038818359375,
      "learning_rate": 2.9707792207792207e-05,
      "loss": 0.0212,
      "step": 500
    },
    {
      "epoch": 3.2467532467532467,
      "eval_accuracy": 0.9765751211631664,
      "eval_loss": 0.11985456198453903,
      "eval_runtime": 12.9037,
      "eval_samples_per_second": 95.941,
      "eval_steps_per_second": 6.045,
      "step": 500
    },
    {
      "epoch": 3.311688311688312,
      "grad_norm": 2029.4908447265625,
      "learning_rate": 2.9301948051948052e-05,
      "loss": 0.0123,
      "step": 510
    },
    {
      "epoch": 3.3766233766233764,
      "grad_norm": 1813.953125,
      "learning_rate": 2.8896103896103898e-05,
      "loss": 0.0072,
      "step": 520
    },
    {
      "epoch": 3.4415584415584415,
      "grad_norm": 1632.518310546875,
      "learning_rate": 2.849025974025974e-05,
      "loss": 0.0187,
      "step": 530
    },
    {
      "epoch": 3.5064935064935066,
      "grad_norm": 11811.1875,
      "learning_rate": 2.8084415584415585e-05,
      "loss": 0.0063,
      "step": 540
    },
    {
      "epoch": 3.571428571428571,
      "grad_norm": 21330.248046875,
      "learning_rate": 2.767857142857143e-05,
      "loss": 0.0273,
      "step": 550
    },
    {
      "epoch": 3.6363636363636362,
      "grad_norm": 4902.02587890625,
      "learning_rate": 2.7272727272727273e-05,
      "loss": 0.0261,
      "step": 560
    },
    {
      "epoch": 3.7012987012987013,
      "grad_norm": 1297.66162109375,
      "learning_rate": 2.686688311688312e-05,
      "loss": 0.0051,
      "step": 570
    },
    {
      "epoch": 3.7662337662337664,
      "grad_norm": 1216.2420654296875,
      "learning_rate": 2.6461038961038964e-05,
      "loss": 0.0046,
      "step": 580
    },
    {
      "epoch": 3.8311688311688314,
      "grad_norm": 997.738525390625,
      "learning_rate": 2.6055194805194806e-05,
      "loss": 0.004,
      "step": 590
    },
    {
      "epoch": 3.896103896103896,
      "grad_norm": 998.6089477539062,
      "learning_rate": 2.5649350649350652e-05,
      "loss": 0.0212,
      "step": 600
    },
    {
      "epoch": 3.896103896103896,
      "eval_accuracy": 0.9878836833602584,
      "eval_loss": 0.06056738272309303,
      "eval_runtime": 12.5878,
      "eval_samples_per_second": 98.349,
      "eval_steps_per_second": 6.196,
      "step": 600
    },
    {
      "epoch": 3.961038961038961,
      "grad_norm": 1652.7265625,
      "learning_rate": 2.5243506493506497e-05,
      "loss": 0.0088,
      "step": 610
    },
    {
      "epoch": 4.025974025974026,
      "grad_norm": 921.3099365234375,
      "learning_rate": 2.483766233766234e-05,
      "loss": 0.026,
      "step": 620
    },
    {
      "epoch": 4.090909090909091,
      "grad_norm": 873.7997436523438,
      "learning_rate": 2.4431818181818185e-05,
      "loss": 0.0226,
      "step": 630
    },
    {
      "epoch": 4.1558441558441555,
      "grad_norm": 843.581298828125,
      "learning_rate": 2.4025974025974027e-05,
      "loss": 0.0033,
      "step": 640
    },
    {
      "epoch": 4.220779220779221,
      "grad_norm": 681.9098510742188,
      "learning_rate": 2.362012987012987e-05,
      "loss": 0.003,
      "step": 650
    },
    {
      "epoch": 4.285714285714286,
      "grad_norm": 644.044921875,
      "learning_rate": 2.3214285714285715e-05,
      "loss": 0.0028,
      "step": 660
    },
    {
      "epoch": 4.35064935064935,
      "grad_norm": 717.3087158203125,
      "learning_rate": 2.280844155844156e-05,
      "loss": 0.0026,
      "step": 670
    },
    {
      "epoch": 4.415584415584416,
      "grad_norm": 587.276611328125,
      "learning_rate": 2.2402597402597402e-05,
      "loss": 0.0023,
      "step": 680
    },
    {
      "epoch": 4.48051948051948,
      "grad_norm": 533.31494140625,
      "learning_rate": 2.1996753246753248e-05,
      "loss": 0.0021,
      "step": 690
    },
    {
      "epoch": 4.545454545454545,
      "grad_norm": 525.0987548828125,
      "learning_rate": 2.1590909090909093e-05,
      "loss": 0.002,
      "step": 700
    },
    {
      "epoch": 4.545454545454545,
      "eval_accuracy": 0.9862681744749596,
      "eval_loss": 0.08030502498149872,
      "eval_runtime": 12.9251,
      "eval_samples_per_second": 95.782,
      "eval_steps_per_second": 6.035,
      "step": 700
    },
    {
      "epoch": 4.6103896103896105,
      "grad_norm": 519.3194580078125,
      "learning_rate": 2.1185064935064935e-05,
      "loss": 0.0257,
      "step": 710
    },
    {
      "epoch": 4.675324675324675,
      "grad_norm": 507.4796142578125,
      "learning_rate": 2.077922077922078e-05,
      "loss": 0.0018,
      "step": 720
    },
    {
      "epoch": 4.740259740259741,
      "grad_norm": 425.1520080566406,
      "learning_rate": 2.0373376623376626e-05,
      "loss": 0.0017,
      "step": 730
    },
    {
      "epoch": 4.805194805194805,
      "grad_norm": 393.7642517089844,
      "learning_rate": 1.996753246753247e-05,
      "loss": 0.0015,
      "step": 740
    },
    {
      "epoch": 4.87012987012987,
      "grad_norm": 377.76678466796875,
      "learning_rate": 1.956168831168831e-05,
      "loss": 0.0016,
      "step": 750
    },
    {
      "epoch": 4.935064935064935,
      "grad_norm": 422.4974670410156,
      "learning_rate": 1.9155844155844156e-05,
      "loss": 0.026,
      "step": 760
    },
    {
      "epoch": 5.0,
      "grad_norm": 381.5065002441406,
      "learning_rate": 1.8750000000000002e-05,
      "loss": 0.0014,
      "step": 770
    },
    {
      "epoch": 5.064935064935065,
      "grad_norm": 336.5712890625,
      "learning_rate": 1.8344155844155844e-05,
      "loss": 0.0012,
      "step": 780
    },
    {
      "epoch": 5.12987012987013,
      "grad_norm": 320.25439453125,
      "learning_rate": 1.793831168831169e-05,
      "loss": 0.0012,
      "step": 790
    },
    {
      "epoch": 5.194805194805195,
      "grad_norm": 326.00421142578125,
      "learning_rate": 1.7532467532467535e-05,
      "loss": 0.0011,
      "step": 800
    },
    {
      "epoch": 5.194805194805195,
      "eval_accuracy": 0.9870759289176091,
      "eval_loss": 0.07453276216983795,
      "eval_runtime": 12.6647,
      "eval_samples_per_second": 97.752,
      "eval_steps_per_second": 6.159,
      "step": 800
    },
    {
      "epoch": 5.259740259740259,
      "grad_norm": 403.37353515625,
      "learning_rate": 1.7126623376623377e-05,
      "loss": 0.0012,
      "step": 810
    },
    {
      "epoch": 5.324675324675325,
      "grad_norm": 306.0226745605469,
      "learning_rate": 1.672077922077922e-05,
      "loss": 0.0011,
      "step": 820
    },
    {
      "epoch": 5.3896103896103895,
      "grad_norm": 282.32855224609375,
      "learning_rate": 1.6314935064935065e-05,
      "loss": 0.001,
      "step": 830
    },
    {
      "epoch": 5.454545454545454,
      "grad_norm": 67814.4609375,
      "learning_rate": 1.590909090909091e-05,
      "loss": 0.0255,
      "step": 840
    },
    {
      "epoch": 5.51948051948052,
      "grad_norm": 242.10775756835938,
      "learning_rate": 1.5503246753246752e-05,
      "loss": 0.001,
      "step": 850
    },
    {
      "epoch": 5.584415584415584,
      "grad_norm": 226.62974548339844,
      "learning_rate": 1.50974025974026e-05,
      "loss": 0.0432,
      "step": 860
    },
    {
      "epoch": 5.64935064935065,
      "grad_norm": 225.47959899902344,
      "learning_rate": 1.4691558441558442e-05,
      "loss": 0.0008,
      "step": 870
    },
    {
      "epoch": 5.714285714285714,
      "grad_norm": 216.57928466796875,
      "learning_rate": 1.4285714285714285e-05,
      "loss": 0.0008,
      "step": 880
    },
    {
      "epoch": 5.779220779220779,
      "grad_norm": 239.439208984375,
      "learning_rate": 1.3879870129870131e-05,
      "loss": 0.0008,
      "step": 890
    },
    {
      "epoch": 5.8441558441558445,
      "grad_norm": 247.78924560546875,
      "learning_rate": 1.3474025974025975e-05,
      "loss": 0.0008,
      "step": 900
    },
    {
      "epoch": 5.8441558441558445,
      "eval_accuracy": 0.9878836833602584,
      "eval_loss": 0.08090992271900177,
      "eval_runtime": 12.58,
      "eval_samples_per_second": 98.41,
      "eval_steps_per_second": 6.2,
      "step": 900
    },
    {
      "epoch": 5.909090909090909,
      "grad_norm": 208.6429901123047,
      "learning_rate": 1.3068181818181819e-05,
      "loss": 0.0008,
      "step": 910
    },
    {
      "epoch": 5.974025974025974,
      "grad_norm": 235.62033081054688,
      "learning_rate": 1.2662337662337662e-05,
      "loss": 0.0007,
      "step": 920
    },
    {
      "epoch": 6.038961038961039,
      "grad_norm": 183.11407470703125,
      "learning_rate": 1.2256493506493508e-05,
      "loss": 0.0007,
      "step": 930
    },
    {
      "epoch": 6.103896103896104,
      "grad_norm": 161.49224853515625,
      "learning_rate": 1.1850649350649352e-05,
      "loss": 0.0104,
      "step": 940
    },
    {
      "epoch": 6.1688311688311686,
      "grad_norm": 180.62030029296875,
      "learning_rate": 1.1444805194805196e-05,
      "loss": 0.0007,
      "step": 950
    },
    {
      "epoch": 6.233766233766234,
      "grad_norm": 197.23057556152344,
      "learning_rate": 1.103896103896104e-05,
      "loss": 0.0006,
      "step": 960
    },
    {
      "epoch": 6.298701298701299,
      "grad_norm": 148.50794982910156,
      "learning_rate": 1.0633116883116883e-05,
      "loss": 0.0206,
      "step": 970
    },
    {
      "epoch": 6.363636363636363,
      "grad_norm": 142.77064514160156,
      "learning_rate": 1.0227272727272729e-05,
      "loss": 0.0006,
      "step": 980
    },
    {
      "epoch": 6.428571428571429,
      "grad_norm": 655451.4375,
      "learning_rate": 9.821428571428573e-06,
      "loss": 0.0027,
      "step": 990
    },
    {
      "epoch": 6.4935064935064934,
      "grad_norm": 140.66160583496094,
      "learning_rate": 9.415584415584416e-06,
      "loss": 0.0005,
      "step": 1000
    },
    {
      "epoch": 6.4935064935064934,
      "eval_accuracy": 0.9886914378029079,
      "eval_loss": 0.0861237570643425,
      "eval_runtime": 12.6203,
      "eval_samples_per_second": 98.096,
      "eval_steps_per_second": 6.181,
      "step": 1000
    },
    {
      "epoch": 6.558441558441558,
      "grad_norm": 159.91189575195312,
      "learning_rate": 9.00974025974026e-06,
      "loss": 0.0006,
      "step": 1010
    },
    {
      "epoch": 6.623376623376624,
      "grad_norm": 141.53871154785156,
      "learning_rate": 8.603896103896104e-06,
      "loss": 0.0005,
      "step": 1020
    },
    {
      "epoch": 6.688311688311688,
      "grad_norm": 156.2162322998047,
      "learning_rate": 8.19805194805195e-06,
      "loss": 0.0005,
      "step": 1030
    },
    {
      "epoch": 6.753246753246753,
      "grad_norm": 141.82945251464844,
      "learning_rate": 7.792207792207792e-06,
      "loss": 0.0005,
      "step": 1040
    },
    {
      "epoch": 6.818181818181818,
      "grad_norm": 172.6562957763672,
      "learning_rate": 7.386363636363637e-06,
      "loss": 0.0005,
      "step": 1050
    },
    {
      "epoch": 6.883116883116883,
      "grad_norm": 118.13848876953125,
      "learning_rate": 6.98051948051948e-06,
      "loss": 0.0005,
      "step": 1060
    },
    {
      "epoch": 6.948051948051948,
      "grad_norm": 119.30269622802734,
      "learning_rate": 6.574675324675325e-06,
      "loss": 0.0005,
      "step": 1070
    },
    {
      "epoch": 7.012987012987013,
      "grad_norm": 120.57817840576172,
      "learning_rate": 6.168831168831169e-06,
      "loss": 0.0004,
      "step": 1080
    },
    {
      "epoch": 7.077922077922078,
      "grad_norm": 117.14134979248047,
      "learning_rate": 5.762987012987013e-06,
      "loss": 0.0005,
      "step": 1090
    },
    {
      "epoch": 7.142857142857143,
      "grad_norm": 130.8657989501953,
      "learning_rate": 5.357142857142857e-06,
      "loss": 0.0005,
      "step": 1100
    },
    {
      "epoch": 7.142857142857143,
      "eval_accuracy": 0.9878836833602584,
      "eval_loss": 0.08649223297834396,
      "eval_runtime": 12.5975,
      "eval_samples_per_second": 98.273,
      "eval_steps_per_second": 6.192,
      "step": 1100
    },
    {
      "epoch": 7.207792207792208,
      "grad_norm": 134.7167510986328,
      "learning_rate": 4.951298701298702e-06,
      "loss": 0.0005,
      "step": 1110
    },
    {
      "epoch": 7.2727272727272725,
      "grad_norm": 117.66633605957031,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 0.0004,
      "step": 1120
    },
    {
      "epoch": 7.337662337662338,
      "grad_norm": 129.73902893066406,
      "learning_rate": 4.13961038961039e-06,
      "loss": 0.0004,
      "step": 1130
    },
    {
      "epoch": 7.402597402597403,
      "grad_norm": 112.13995361328125,
      "learning_rate": 3.733766233766234e-06,
      "loss": 0.0004,
      "step": 1140
    },
    {
      "epoch": 7.467532467532467,
      "grad_norm": 135.0137939453125,
      "learning_rate": 3.327922077922078e-06,
      "loss": 0.0005,
      "step": 1150
    },
    {
      "epoch": 7.532467532467533,
      "grad_norm": 115.70610046386719,
      "learning_rate": 2.922077922077922e-06,
      "loss": 0.0004,
      "step": 1160
    },
    {
      "epoch": 7.597402597402597,
      "grad_norm": 136.4860076904297,
      "learning_rate": 2.5162337662337663e-06,
      "loss": 0.0004,
      "step": 1170
    },
    {
      "epoch": 7.662337662337662,
      "grad_norm": 107.2647705078125,
      "learning_rate": 2.1103896103896105e-06,
      "loss": 0.0004,
      "step": 1180
    },
    {
      "epoch": 7.7272727272727275,
      "grad_norm": 121.89676666259766,
      "learning_rate": 1.7045454545454546e-06,
      "loss": 0.0005,
      "step": 1190
    },
    {
      "epoch": 7.792207792207792,
      "grad_norm": 149.9101104736328,
      "learning_rate": 1.2987012987012988e-06,
      "loss": 0.0004,
      "step": 1200
    },
    {
      "epoch": 7.792207792207792,
      "eval_accuracy": 0.9878836833602584,
      "eval_loss": 0.07882251590490341,
      "eval_runtime": 12.5777,
      "eval_samples_per_second": 98.428,
      "eval_steps_per_second": 6.201,
      "step": 1200
    },
    {
      "epoch": 7.857142857142857,
      "grad_norm": 118.61868286132812,
      "learning_rate": 8.928571428571428e-07,
      "loss": 0.0004,
      "step": 1210
    },
    {
      "epoch": 7.922077922077922,
      "grad_norm": 108.26444244384766,
      "learning_rate": 4.87012987012987e-07,
      "loss": 0.0004,
      "step": 1220
    },
    {
      "epoch": 7.987012987012987,
      "grad_norm": 101.61443328857422,
      "learning_rate": 8.116883116883118e-08,
      "loss": 0.0004,
      "step": 1230
    },
    {
      "epoch": 8.0,
      "step": 1232,
      "total_flos": 3.0452071013776097e+18,
      "train_loss": 0.09478302804823865,
      "train_runtime": 1093.0198,
      "train_samples_per_second": 35.952,
      "train_steps_per_second": 1.127
    }
  ],
  "logging_steps": 10,
  "max_steps": 1232,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 8,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.0452071013776097e+18,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}