{
  "best_metric": 1.0969369411468506,
  "best_model_checkpoint": "./vit-base-melSpecImagesCREMA/checkpoint-400",
  "epoch": 10.0,
  "eval_steps": 100,
  "global_step": 1310,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08,
      "grad_norm": 1.503008246421814,
      "learning_rate": 0.0001984732824427481,
      "loss": 1.7751,
      "step": 10
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.1629976034164429,
      "learning_rate": 0.0001969465648854962,
      "loss": 1.6558,
      "step": 20
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.205520749092102,
      "learning_rate": 0.00019541984732824428,
      "loss": 1.6062,
      "step": 30
    },
    {
      "epoch": 0.31,
      "grad_norm": 1.4008492231369019,
      "learning_rate": 0.00019389312977099237,
      "loss": 1.6114,
      "step": 40
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.3961032629013062,
      "learning_rate": 0.00019236641221374049,
      "loss": 1.5886,
      "step": 50
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.31687331199646,
      "learning_rate": 0.00019083969465648857,
      "loss": 1.544,
      "step": 60
    },
    {
      "epoch": 0.53,
      "grad_norm": 1.3430259227752686,
      "learning_rate": 0.00018931297709923666,
      "loss": 1.493,
      "step": 70
    },
    {
      "epoch": 0.61,
      "grad_norm": 2.1518867015838623,
      "learning_rate": 0.00018778625954198475,
      "loss": 1.5494,
      "step": 80
    },
    {
      "epoch": 0.69,
      "grad_norm": 1.066909909248352,
      "learning_rate": 0.00018625954198473284,
      "loss": 1.4324,
      "step": 90
    },
    {
      "epoch": 0.76,
      "grad_norm": 1.1103651523590088,
      "learning_rate": 0.00018473282442748093,
      "loss": 1.5606,
      "step": 100
    },
    {
      "epoch": 0.76,
      "eval_accuracy": 0.40786948176583493,
      "eval_loss": 1.4423701763153076,
      "eval_runtime": 10.9463,
      "eval_samples_per_second": 95.192,
      "eval_steps_per_second": 11.967,
      "step": 100
    },
    {
      "epoch": 0.84,
      "grad_norm": 1.20200777053833,
      "learning_rate": 0.00018320610687022902,
      "loss": 1.429,
      "step": 110
    },
    {
      "epoch": 0.92,
      "grad_norm": 1.1876991987228394,
      "learning_rate": 0.0001816793893129771,
      "loss": 1.4265,
      "step": 120
    },
    {
      "epoch": 0.99,
      "grad_norm": 1.1246036291122437,
      "learning_rate": 0.00018015267175572518,
      "loss": 1.4526,
      "step": 130
    },
    {
      "epoch": 1.07,
      "grad_norm": 1.8578764200210571,
      "learning_rate": 0.0001786259541984733,
      "loss": 1.349,
      "step": 140
    },
    {
      "epoch": 1.15,
      "grad_norm": 1.8486779928207397,
      "learning_rate": 0.00017709923664122138,
      "loss": 1.3172,
      "step": 150
    },
    {
      "epoch": 1.22,
      "grad_norm": 1.775232195854187,
      "learning_rate": 0.00017557251908396947,
      "loss": 1.2735,
      "step": 160
    },
    {
      "epoch": 1.3,
      "grad_norm": 1.7306294441223145,
      "learning_rate": 0.00017404580152671756,
      "loss": 1.3665,
      "step": 170
    },
    {
      "epoch": 1.37,
      "grad_norm": 1.8813070058822632,
      "learning_rate": 0.00017251908396946565,
      "loss": 1.322,
      "step": 180
    },
    {
      "epoch": 1.45,
      "grad_norm": 1.4844669103622437,
      "learning_rate": 0.00017099236641221374,
      "loss": 1.324,
      "step": 190
    },
    {
      "epoch": 1.53,
      "grad_norm": 2.489121913909912,
      "learning_rate": 0.00016946564885496183,
      "loss": 1.2841,
      "step": 200
    },
    {
      "epoch": 1.53,
      "eval_accuracy": 0.3694817658349328,
      "eval_loss": 1.498081088066101,
      "eval_runtime": 11.2582,
      "eval_samples_per_second": 92.555,
      "eval_steps_per_second": 11.636,
      "step": 200
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.9091380834579468,
      "learning_rate": 0.00016793893129770992,
      "loss": 1.2919,
      "step": 210
    },
    {
      "epoch": 1.68,
      "grad_norm": 1.8453233242034912,
      "learning_rate": 0.00016641221374045804,
      "loss": 1.2998,
      "step": 220
    },
    {
      "epoch": 1.76,
      "grad_norm": 2.699350595474243,
      "learning_rate": 0.00016488549618320613,
      "loss": 1.3153,
      "step": 230
    },
    {
      "epoch": 1.83,
      "grad_norm": 2.409592628479004,
      "learning_rate": 0.00016335877862595422,
      "loss": 1.1943,
      "step": 240
    },
    {
      "epoch": 1.91,
      "grad_norm": 1.59845769405365,
      "learning_rate": 0.0001618320610687023,
      "loss": 1.2302,
      "step": 250
    },
    {
      "epoch": 1.98,
      "grad_norm": 2.2508938312530518,
      "learning_rate": 0.0001603053435114504,
      "loss": 1.4177,
      "step": 260
    },
    {
      "epoch": 2.06,
      "grad_norm": 1.6891452074050903,
      "learning_rate": 0.00015877862595419848,
      "loss": 1.0828,
      "step": 270
    },
    {
      "epoch": 2.14,
      "grad_norm": 3.317949056625366,
      "learning_rate": 0.00015725190839694657,
      "loss": 1.1312,
      "step": 280
    },
    {
      "epoch": 2.21,
      "grad_norm": 3.0603318214416504,
      "learning_rate": 0.00015572519083969466,
      "loss": 1.0713,
      "step": 290
    },
    {
      "epoch": 2.29,
      "grad_norm": 2.0383269786834717,
      "learning_rate": 0.00015419847328244275,
      "loss": 1.0159,
      "step": 300
    },
    {
      "epoch": 2.29,
      "eval_accuracy": 0.5518234165067178,
      "eval_loss": 1.1692744493484497,
      "eval_runtime": 11.2045,
      "eval_samples_per_second": 92.998,
      "eval_steps_per_second": 11.692,
      "step": 300
    },
    {
      "epoch": 2.37,
      "grad_norm": 1.9367152452468872,
      "learning_rate": 0.00015267175572519084,
      "loss": 0.9986,
      "step": 310
    },
    {
      "epoch": 2.44,
      "grad_norm": 2.3003132343292236,
      "learning_rate": 0.00015114503816793893,
      "loss": 1.0508,
      "step": 320
    },
    {
      "epoch": 2.52,
      "grad_norm": 3.84384822845459,
      "learning_rate": 0.00014961832061068702,
      "loss": 1.0623,
      "step": 330
    },
    {
      "epoch": 2.6,
      "grad_norm": 1.7760237455368042,
      "learning_rate": 0.0001480916030534351,
      "loss": 1.0425,
      "step": 340
    },
    {
      "epoch": 2.67,
      "grad_norm": 2.6222035884857178,
      "learning_rate": 0.0001465648854961832,
      "loss": 1.0366,
      "step": 350
    },
    {
      "epoch": 2.75,
      "grad_norm": 2.4612748622894287,
      "learning_rate": 0.0001450381679389313,
      "loss": 1.0451,
      "step": 360
    },
    {
      "epoch": 2.82,
      "grad_norm": 2.3277204036712646,
      "learning_rate": 0.00014351145038167938,
      "loss": 1.1176,
      "step": 370
    },
    {
      "epoch": 2.9,
      "grad_norm": 2.187209129333496,
      "learning_rate": 0.00014198473282442747,
      "loss": 0.9908,
      "step": 380
    },
    {
      "epoch": 2.98,
      "grad_norm": 3.229308843612671,
      "learning_rate": 0.00014045801526717556,
      "loss": 1.0181,
      "step": 390
    },
    {
      "epoch": 3.05,
      "grad_norm": 1.847447156906128,
      "learning_rate": 0.00013893129770992368,
      "loss": 0.9868,
      "step": 400
    },
    {
      "epoch": 3.05,
      "eval_accuracy": 0.5930902111324377,
      "eval_loss": 1.0969369411468506,
      "eval_runtime": 11.1277,
      "eval_samples_per_second": 93.64,
      "eval_steps_per_second": 11.772,
      "step": 400
    },
    {
      "epoch": 3.13,
      "grad_norm": 2.6871042251586914,
      "learning_rate": 0.00013740458015267177,
      "loss": 0.8034,
      "step": 410
    },
    {
      "epoch": 3.21,
      "grad_norm": 3.8769664764404297,
      "learning_rate": 0.00013587786259541986,
      "loss": 0.8141,
      "step": 420
    },
    {
      "epoch": 3.28,
      "grad_norm": 2.8785126209259033,
      "learning_rate": 0.00013435114503816795,
      "loss": 0.7343,
      "step": 430
    },
    {
      "epoch": 3.36,
      "grad_norm": 2.4167301654815674,
      "learning_rate": 0.00013282442748091604,
      "loss": 0.8251,
      "step": 440
    },
    {
      "epoch": 3.44,
      "grad_norm": 2.921082019805908,
      "learning_rate": 0.00013129770992366413,
      "loss": 0.7511,
      "step": 450
    },
    {
      "epoch": 3.51,
      "grad_norm": 2.847332000732422,
      "learning_rate": 0.00012977099236641222,
      "loss": 0.7392,
      "step": 460
    },
    {
      "epoch": 3.59,
      "grad_norm": 3.5761501789093018,
      "learning_rate": 0.0001282442748091603,
      "loss": 0.6463,
      "step": 470
    },
    {
      "epoch": 3.66,
      "grad_norm": 3.0666024684906006,
      "learning_rate": 0.0001267175572519084,
      "loss": 0.7027,
      "step": 480
    },
    {
      "epoch": 3.74,
      "grad_norm": 1.9951330423355103,
      "learning_rate": 0.0001251908396946565,
      "loss": 0.7666,
      "step": 490
    },
    {
      "epoch": 3.82,
      "grad_norm": 3.5129895210266113,
      "learning_rate": 0.0001236641221374046,
      "loss": 0.8477,
      "step": 500
    },
    {
      "epoch": 3.82,
      "eval_accuracy": 0.5796545105566219,
      "eval_loss": 1.1718865633010864,
      "eval_runtime": 11.143,
      "eval_samples_per_second": 93.511,
      "eval_steps_per_second": 11.756,
      "step": 500
    },
    {
      "epoch": 3.89,
      "grad_norm": 2.490349054336548,
      "learning_rate": 0.0001221374045801527,
      "loss": 0.6279,
      "step": 510
    },
    {
      "epoch": 3.97,
      "grad_norm": 4.751981258392334,
      "learning_rate": 0.00012061068702290077,
      "loss": 0.6861,
      "step": 520
    },
    {
      "epoch": 4.05,
      "grad_norm": 1.843367338180542,
      "learning_rate": 0.00011908396946564886,
      "loss": 0.6348,
      "step": 530
    },
    {
      "epoch": 4.12,
      "grad_norm": 3.823432445526123,
      "learning_rate": 0.00011755725190839695,
      "loss": 0.4487,
      "step": 540
    },
    {
      "epoch": 4.2,
      "grad_norm": 1.805336833000183,
      "learning_rate": 0.00011603053435114504,
      "loss": 0.4393,
      "step": 550
    },
    {
      "epoch": 4.27,
      "grad_norm": 2.604633331298828,
      "learning_rate": 0.00011450381679389313,
      "loss": 0.4098,
      "step": 560
    },
    {
      "epoch": 4.35,
      "grad_norm": 3.3479952812194824,
      "learning_rate": 0.00011297709923664124,
      "loss": 0.5057,
      "step": 570
    },
    {
      "epoch": 4.43,
      "grad_norm": 2.961186408996582,
      "learning_rate": 0.00011145038167938933,
      "loss": 0.5446,
      "step": 580
    },
    {
      "epoch": 4.5,
      "grad_norm": 4.528743267059326,
      "learning_rate": 0.00010992366412213742,
      "loss": 0.5032,
      "step": 590
    },
    {
      "epoch": 4.58,
      "grad_norm": 3.593770742416382,
      "learning_rate": 0.0001083969465648855,
      "loss": 0.5495,
      "step": 600
    },
    {
      "epoch": 4.58,
      "eval_accuracy": 0.5806142034548945,
      "eval_loss": 1.234790563583374,
      "eval_runtime": 11.1508,
      "eval_samples_per_second": 93.446,
      "eval_steps_per_second": 11.748,
      "step": 600
    },
    {
      "epoch": 4.66,
      "grad_norm": 2.7948455810546875,
      "learning_rate": 0.00010687022900763359,
      "loss": 0.519,
      "step": 610
    },
    {
      "epoch": 4.73,
      "grad_norm": 5.143344879150391,
      "learning_rate": 0.00010534351145038168,
      "loss": 0.5493,
      "step": 620
    },
    {
      "epoch": 4.81,
      "grad_norm": 4.292489528656006,
      "learning_rate": 0.00010381679389312977,
      "loss": 0.5947,
      "step": 630
    },
    {
      "epoch": 4.89,
      "grad_norm": 3.135993480682373,
      "learning_rate": 0.00010229007633587786,
      "loss": 0.4873,
      "step": 640
    },
    {
      "epoch": 4.96,
      "grad_norm": 2.5363755226135254,
      "learning_rate": 0.00010076335877862595,
      "loss": 0.4813,
      "step": 650
    },
    {
      "epoch": 5.04,
      "grad_norm": 2.4878172874450684,
      "learning_rate": 9.923664122137405e-05,
      "loss": 0.4249,
      "step": 660
    },
    {
      "epoch": 5.11,
      "grad_norm": 2.6882851123809814,
      "learning_rate": 9.770992366412214e-05,
      "loss": 0.2691,
      "step": 670
    },
    {
      "epoch": 5.19,
      "grad_norm": 2.534364700317383,
      "learning_rate": 9.618320610687024e-05,
      "loss": 0.3055,
      "step": 680
    },
    {
      "epoch": 5.27,
      "grad_norm": 4.340401649475098,
      "learning_rate": 9.465648854961833e-05,
      "loss": 0.2444,
      "step": 690
    },
    {
      "epoch": 5.34,
      "grad_norm": 5.553411483764648,
      "learning_rate": 9.312977099236642e-05,
      "loss": 0.2671,
      "step": 700
    },
    {
      "epoch": 5.34,
      "eval_accuracy": 0.5854126679462572,
      "eval_loss": 1.3457223176956177,
      "eval_runtime": 11.154,
      "eval_samples_per_second": 93.419,
      "eval_steps_per_second": 11.745,
      "step": 700
    },
    {
      "epoch": 5.42,
      "grad_norm": 2.335097551345825,
      "learning_rate": 9.160305343511451e-05,
      "loss": 0.3027,
      "step": 710
    },
    {
      "epoch": 5.5,
      "grad_norm": 6.312303066253662,
      "learning_rate": 9.007633587786259e-05,
      "loss": 0.3205,
      "step": 720
    },
    {
      "epoch": 5.57,
      "grad_norm": 7.102092266082764,
      "learning_rate": 8.854961832061069e-05,
      "loss": 0.2796,
      "step": 730
    },
    {
      "epoch": 5.65,
      "grad_norm": 4.718852996826172,
      "learning_rate": 8.702290076335878e-05,
      "loss": 0.3254,
      "step": 740
    },
    {
      "epoch": 5.73,
      "grad_norm": 5.368557453155518,
      "learning_rate": 8.549618320610687e-05,
      "loss": 0.329,
      "step": 750
    },
    {
      "epoch": 5.8,
      "grad_norm": 1.7768850326538086,
      "learning_rate": 8.396946564885496e-05,
      "loss": 0.2239,
      "step": 760
    },
    {
      "epoch": 5.88,
      "grad_norm": 1.6067969799041748,
      "learning_rate": 8.244274809160306e-05,
      "loss": 0.2289,
      "step": 770
    },
    {
      "epoch": 5.95,
      "grad_norm": 0.8688230514526367,
      "learning_rate": 8.091603053435115e-05,
      "loss": 0.2715,
      "step": 780
    },
    {
      "epoch": 6.03,
      "grad_norm": 2.5685417652130127,
      "learning_rate": 7.938931297709924e-05,
      "loss": 0.2098,
      "step": 790
    },
    {
      "epoch": 6.11,
      "grad_norm": 1.9992344379425049,
      "learning_rate": 7.786259541984733e-05,
      "loss": 0.1388,
      "step": 800
    },
    {
      "epoch": 6.11,
      "eval_accuracy": 0.5786948176583493,
      "eval_loss": 1.389073371887207,
      "eval_runtime": 11.1211,
      "eval_samples_per_second": 93.696,
      "eval_steps_per_second": 11.779,
      "step": 800
    },
    {
      "epoch": 6.18,
      "grad_norm": 2.335876703262329,
      "learning_rate": 7.633587786259542e-05,
      "loss": 0.1762,
      "step": 810
    },
    {
      "epoch": 6.26,
      "grad_norm": 4.345489025115967,
      "learning_rate": 7.480916030534351e-05,
      "loss": 0.1829,
      "step": 820
    },
    {
      "epoch": 6.34,
      "grad_norm": 1.5818490982055664,
      "learning_rate": 7.32824427480916e-05,
      "loss": 0.2543,
      "step": 830
    },
    {
      "epoch": 6.41,
      "grad_norm": 2.3892974853515625,
      "learning_rate": 7.175572519083969e-05,
      "loss": 0.2035,
      "step": 840
    },
    {
      "epoch": 6.49,
      "grad_norm": 0.9355500936508179,
      "learning_rate": 7.022900763358778e-05,
      "loss": 0.2287,
      "step": 850
    },
    {
      "epoch": 6.56,
      "grad_norm": 3.3484275341033936,
      "learning_rate": 6.870229007633588e-05,
      "loss": 0.1433,
      "step": 860
    },
    {
      "epoch": 6.64,
      "grad_norm": 1.1796901226043701,
      "learning_rate": 6.717557251908397e-05,
      "loss": 0.1943,
      "step": 870
    },
    {
      "epoch": 6.72,
      "grad_norm": 3.2419047355651855,
      "learning_rate": 6.564885496183206e-05,
      "loss": 0.2079,
      "step": 880
    },
    {
      "epoch": 6.79,
      "grad_norm": 5.841382026672363,
      "learning_rate": 6.412213740458015e-05,
      "loss": 0.2108,
      "step": 890
    },
    {
      "epoch": 6.87,
      "grad_norm": 1.1594499349594116,
      "learning_rate": 6.259541984732826e-05,
      "loss": 0.1548,
      "step": 900
    },
    {
      "epoch": 6.87,
      "eval_accuracy": 0.5978886756238004,
      "eval_loss": 1.4216477870941162,
      "eval_runtime": 11.0828,
      "eval_samples_per_second": 94.019,
      "eval_steps_per_second": 11.82,
      "step": 900
    },
    {
      "epoch": 6.95,
      "grad_norm": 1.131686806678772,
      "learning_rate": 6.106870229007635e-05,
      "loss": 0.1264,
      "step": 910
    },
    {
      "epoch": 7.02,
      "grad_norm": 0.8231382966041565,
      "learning_rate": 5.954198473282443e-05,
      "loss": 0.1726,
      "step": 920
    },
    {
      "epoch": 7.1,
      "grad_norm": 0.48267778754234314,
      "learning_rate": 5.801526717557252e-05,
      "loss": 0.1198,
      "step": 930
    },
    {
      "epoch": 7.18,
      "grad_norm": 2.5278027057647705,
      "learning_rate": 5.648854961832062e-05,
      "loss": 0.1702,
      "step": 940
    },
    {
      "epoch": 7.25,
      "grad_norm": 2.183692455291748,
      "learning_rate": 5.496183206106871e-05,
      "loss": 0.0854,
      "step": 950
    },
    {
      "epoch": 7.33,
      "grad_norm": 0.40628916025161743,
      "learning_rate": 5.3435114503816794e-05,
      "loss": 0.1078,
      "step": 960
    },
    {
      "epoch": 7.4,
      "grad_norm": 0.19366604089736938,
      "learning_rate": 5.1908396946564884e-05,
      "loss": 0.0979,
      "step": 970
    },
    {
      "epoch": 7.48,
      "grad_norm": 0.4136432707309723,
      "learning_rate": 5.038167938931297e-05,
      "loss": 0.0954,
      "step": 980
    },
    {
      "epoch": 7.56,
      "grad_norm": 0.17071671783924103,
      "learning_rate": 4.885496183206107e-05,
      "loss": 0.0432,
      "step": 990
    },
    {
      "epoch": 7.63,
      "grad_norm": 0.9517059326171875,
      "learning_rate": 4.7328244274809166e-05,
      "loss": 0.0906,
      "step": 1000
    },
    {
      "epoch": 7.63,
      "eval_accuracy": 0.564299424184261,
      "eval_loss": 1.640116810798645,
      "eval_runtime": 11.0958,
      "eval_samples_per_second": 93.91,
      "eval_steps_per_second": 11.806,
      "step": 1000
    },
    {
      "epoch": 7.71,
      "grad_norm": 0.1622392237186432,
      "learning_rate": 4.5801526717557256e-05,
      "loss": 0.1284,
      "step": 1010
    },
    {
      "epoch": 7.79,
      "grad_norm": 6.343606948852539,
      "learning_rate": 4.4274809160305345e-05,
      "loss": 0.0985,
      "step": 1020
    },
    {
      "epoch": 7.86,
      "grad_norm": 0.7874533534049988,
      "learning_rate": 4.2748091603053435e-05,
      "loss": 0.1168,
      "step": 1030
    },
    {
      "epoch": 7.94,
      "grad_norm": 0.31581443548202515,
      "learning_rate": 4.122137404580153e-05,
      "loss": 0.1668,
      "step": 1040
    },
    {
      "epoch": 8.02,
      "grad_norm": 0.3773091435432434,
      "learning_rate": 3.969465648854962e-05,
      "loss": 0.1027,
      "step": 1050
    },
    {
      "epoch": 8.09,
      "grad_norm": 0.09075737744569778,
      "learning_rate": 3.816793893129771e-05,
      "loss": 0.0301,
      "step": 1060
    },
    {
      "epoch": 8.17,
      "grad_norm": 0.1646617352962494,
      "learning_rate": 3.66412213740458e-05,
      "loss": 0.077,
      "step": 1070
    },
    {
      "epoch": 8.24,
      "grad_norm": 1.9873558282852173,
      "learning_rate": 3.511450381679389e-05,
      "loss": 0.0443,
      "step": 1080
    },
    {
      "epoch": 8.32,
      "grad_norm": 0.13392330706119537,
      "learning_rate": 3.358778625954199e-05,
      "loss": 0.0823,
      "step": 1090
    },
    {
      "epoch": 8.4,
      "grad_norm": 1.0865620374679565,
      "learning_rate": 3.2061068702290076e-05,
      "loss": 0.1047,
      "step": 1100
    },
    {
      "epoch": 8.4,
      "eval_accuracy": 0.5873320537428023,
      "eval_loss": 1.6780017614364624,
      "eval_runtime": 11.1335,
      "eval_samples_per_second": 93.591,
      "eval_steps_per_second": 11.766,
      "step": 1100
    },
    {
      "epoch": 8.47,
      "grad_norm": 0.10025681555271149,
      "learning_rate": 3.053435114503817e-05,
      "loss": 0.0592,
      "step": 1110
    },
    {
      "epoch": 8.55,
      "grad_norm": 0.14586412906646729,
      "learning_rate": 2.900763358778626e-05,
      "loss": 0.0393,
      "step": 1120
    },
    {
      "epoch": 8.63,
      "grad_norm": 0.5182372331619263,
      "learning_rate": 2.7480916030534355e-05,
      "loss": 0.0802,
      "step": 1130
    },
    {
      "epoch": 8.7,
      "grad_norm": 0.3386911153793335,
      "learning_rate": 2.5954198473282442e-05,
      "loss": 0.0748,
      "step": 1140
    },
    {
      "epoch": 8.78,
      "grad_norm": 0.10998167097568512,
      "learning_rate": 2.4427480916030535e-05,
      "loss": 0.0366,
      "step": 1150
    },
    {
      "epoch": 8.85,
      "grad_norm": 0.12017246335744858,
      "learning_rate": 2.2900763358778628e-05,
      "loss": 0.0461,
      "step": 1160
    },
    {
      "epoch": 8.93,
      "grad_norm": 0.09363219887018204,
      "learning_rate": 2.1374045801526718e-05,
      "loss": 0.056,
      "step": 1170
    },
    {
      "epoch": 9.01,
      "grad_norm": 0.08617076277732849,
      "learning_rate": 1.984732824427481e-05,
      "loss": 0.0407,
      "step": 1180
    },
    {
      "epoch": 9.08,
      "grad_norm": 0.07992846518754959,
      "learning_rate": 1.83206106870229e-05,
      "loss": 0.0225,
      "step": 1190
    },
    {
      "epoch": 9.16,
      "grad_norm": 0.5971510410308838,
      "learning_rate": 1.6793893129770993e-05,
      "loss": 0.0583,
      "step": 1200
    },
    {
      "epoch": 9.16,
      "eval_accuracy": 0.5767754318618042,
      "eval_loss": 1.6794880628585815,
      "eval_runtime": 11.1041,
      "eval_samples_per_second": 93.839,
      "eval_steps_per_second": 11.797,
      "step": 1200
    },
    {
      "epoch": 9.24,
      "grad_norm": 0.06741204857826233,
      "learning_rate": 1.5267175572519086e-05,
      "loss": 0.0221,
      "step": 1210
    },
    {
      "epoch": 9.31,
      "grad_norm": 0.08024097234010696,
      "learning_rate": 1.3740458015267178e-05,
      "loss": 0.0357,
      "step": 1220
    },
    {
      "epoch": 9.39,
      "grad_norm": 0.17311328649520874,
      "learning_rate": 1.2213740458015267e-05,
      "loss": 0.0336,
      "step": 1230
    },
    {
      "epoch": 9.47,
      "grad_norm": 0.09014473855495453,
      "learning_rate": 1.0687022900763359e-05,
      "loss": 0.0514,
      "step": 1240
    },
    {
      "epoch": 9.54,
      "grad_norm": 0.07394669950008392,
      "learning_rate": 9.16030534351145e-06,
      "loss": 0.0534,
      "step": 1250
    },
    {
      "epoch": 9.62,
      "grad_norm": 0.11295180022716522,
      "learning_rate": 7.633587786259543e-06,
      "loss": 0.0254,
      "step": 1260
    },
    {
      "epoch": 9.69,
      "grad_norm": 1.9013243913650513,
      "learning_rate": 6.106870229007634e-06,
      "loss": 0.0919,
      "step": 1270
    },
    {
      "epoch": 9.77,
      "grad_norm": 0.08759523928165436,
      "learning_rate": 4.580152671755725e-06,
      "loss": 0.0476,
      "step": 1280
    },
    {
      "epoch": 9.85,
      "grad_norm": 0.07936747372150421,
      "learning_rate": 3.053435114503817e-06,
      "loss": 0.0365,
      "step": 1290
    },
    {
      "epoch": 9.92,
      "grad_norm": 0.13975408673286438,
      "learning_rate": 1.5267175572519084e-06,
      "loss": 0.0228,
      "step": 1300
    },
    {
      "epoch": 9.92,
      "eval_accuracy": 0.5882917466410749,
      "eval_loss": 1.6925907135009766,
      "eval_runtime": 11.1309,
      "eval_samples_per_second": 93.613,
      "eval_steps_per_second": 11.769,
      "step": 1300
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.3255839943885803,
      "learning_rate": 0.0,
      "loss": 0.0364,
      "step": 1310
    },
    {
      "epoch": 10.0,
      "step": 1310,
      "total_flos": 3.229206972532531e+18,
      "train_loss": 0.583979975267221,
      "train_runtime": 954.0647,
      "train_samples_per_second": 43.676,
      "train_steps_per_second": 1.373
    }
  ],
  "logging_steps": 10,
  "max_steps": 1310,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 100,
  "total_flos": 3.229206972532531e+18,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}