| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.3431232091690544, | |
| "eval_steps": 2500, | |
| "global_step": 60000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01119269340974212, | |
| "grad_norm": 2.299727201461792, | |
| "learning_rate": 4.981345510983763e-05, | |
| "loss": 1.8848, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.02238538681948424, | |
| "grad_norm": 1.9952893257141113, | |
| "learning_rate": 4.962691021967526e-05, | |
| "loss": 1.7595, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.03357808022922636, | |
| "grad_norm": 2.1056811809539795, | |
| "learning_rate": 4.944036532951289e-05, | |
| "loss": 1.6994, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.04477077363896848, | |
| "grad_norm": 2.0474352836608887, | |
| "learning_rate": 4.925382043935053e-05, | |
| "loss": 1.6629, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.0559634670487106, | |
| "grad_norm": 1.9989269971847534, | |
| "learning_rate": 4.906727554918816e-05, | |
| "loss": 1.6236, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.0559634670487106, | |
| "eval_accuracy": 0.5569847646608951, | |
| "eval_loss": 2.425182342529297, | |
| "eval_runtime": 707.8445, | |
| "eval_samples_per_second": 91.796, | |
| "eval_steps_per_second": 3.826, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.06715616045845273, | |
| "grad_norm": 1.879557490348816, | |
| "learning_rate": 4.888073065902579e-05, | |
| "loss": 1.5991, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.07834885386819485, | |
| "grad_norm": 1.9889895915985107, | |
| "learning_rate": 4.869418576886342e-05, | |
| "loss": 1.5751, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.08954154727793696, | |
| "grad_norm": 1.910925269126892, | |
| "learning_rate": 4.8507640878701055e-05, | |
| "loss": 1.5587, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.10073424068767908, | |
| "grad_norm": 1.9268312454223633, | |
| "learning_rate": 4.8321095988538685e-05, | |
| "loss": 1.546, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.1119269340974212, | |
| "grad_norm": 1.8074718713760376, | |
| "learning_rate": 4.8134551098376315e-05, | |
| "loss": 1.5301, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.1119269340974212, | |
| "eval_accuracy": 0.566450867740456, | |
| "eval_loss": 2.3531110286712646, | |
| "eval_runtime": 716.3757, | |
| "eval_samples_per_second": 90.702, | |
| "eval_steps_per_second": 3.78, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.12311962750716332, | |
| "grad_norm": 1.8723756074905396, | |
| "learning_rate": 4.7948006208213945e-05, | |
| "loss": 1.5153, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.13431232091690545, | |
| "grad_norm": 1.8938133716583252, | |
| "learning_rate": 4.7761461318051575e-05, | |
| "loss": 1.5051, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.14550501432664756, | |
| "grad_norm": 1.8093421459197998, | |
| "learning_rate": 4.757491642788921e-05, | |
| "loss": 1.4922, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.1566977077363897, | |
| "grad_norm": 1.8811379671096802, | |
| "learning_rate": 4.738837153772684e-05, | |
| "loss": 1.4841, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.1678904011461318, | |
| "grad_norm": 1.8162873983383179, | |
| "learning_rate": 4.720182664756447e-05, | |
| "loss": 1.4664, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.1678904011461318, | |
| "eval_accuracy": 0.5726688422748262, | |
| "eval_loss": 2.2988929748535156, | |
| "eval_runtime": 706.3059, | |
| "eval_samples_per_second": 91.996, | |
| "eval_steps_per_second": 3.834, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.17908309455587393, | |
| "grad_norm": 1.861790418624878, | |
| "learning_rate": 4.70152817574021e-05, | |
| "loss": 1.4613, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.19027578796561603, | |
| "grad_norm": 1.7351659536361694, | |
| "learning_rate": 4.682873686723974e-05, | |
| "loss": 1.4554, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.20146848137535817, | |
| "grad_norm": 1.796727180480957, | |
| "learning_rate": 4.664219197707737e-05, | |
| "loss": 1.4469, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.2126611747851003, | |
| "grad_norm": 1.751111388206482, | |
| "learning_rate": 4.6455647086915e-05, | |
| "loss": 1.4405, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.2238538681948424, | |
| "grad_norm": 1.793644905090332, | |
| "learning_rate": 4.626910219675263e-05, | |
| "loss": 1.4314, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.2238538681948424, | |
| "eval_accuracy": 0.5781162212828304, | |
| "eval_loss": 2.257195472717285, | |
| "eval_runtime": 709.7465, | |
| "eval_samples_per_second": 91.55, | |
| "eval_steps_per_second": 3.815, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.23504656160458454, | |
| "grad_norm": 1.7030937671661377, | |
| "learning_rate": 4.6082557306590264e-05, | |
| "loss": 1.425, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.24623925501432664, | |
| "grad_norm": 1.7245328426361084, | |
| "learning_rate": 4.5896012416427894e-05, | |
| "loss": 1.4206, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.25743194842406875, | |
| "grad_norm": 1.7355397939682007, | |
| "learning_rate": 4.570946752626552e-05, | |
| "loss": 1.409, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.2686246418338109, | |
| "grad_norm": 1.7283306121826172, | |
| "learning_rate": 4.5522922636103154e-05, | |
| "loss": 1.4086, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.279817335243553, | |
| "grad_norm": 1.7133527994155884, | |
| "learning_rate": 4.5336377745940784e-05, | |
| "loss": 1.4042, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.279817335243553, | |
| "eval_accuracy": 0.5822483255357088, | |
| "eval_loss": 2.2244207859039307, | |
| "eval_runtime": 708.1859, | |
| "eval_samples_per_second": 91.751, | |
| "eval_steps_per_second": 3.824, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.2910100286532951, | |
| "grad_norm": 1.688602328300476, | |
| "learning_rate": 4.514983285577842e-05, | |
| "loss": 1.3952, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.3022027220630373, | |
| "grad_norm": 1.6839321851730347, | |
| "learning_rate": 4.4963287965616043e-05, | |
| "loss": 1.3932, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.3133954154727794, | |
| "grad_norm": 1.7225844860076904, | |
| "learning_rate": 4.477674307545368e-05, | |
| "loss": 1.3839, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.3245881088825215, | |
| "grad_norm": 1.6329905986785889, | |
| "learning_rate": 4.459019818529131e-05, | |
| "loss": 1.3856, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.3357808022922636, | |
| "grad_norm": 1.7012953758239746, | |
| "learning_rate": 4.440365329512895e-05, | |
| "loss": 1.3771, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.3357808022922636, | |
| "eval_accuracy": 0.586269614225024, | |
| "eval_loss": 2.1954798698425293, | |
| "eval_runtime": 718.0126, | |
| "eval_samples_per_second": 90.496, | |
| "eval_steps_per_second": 3.772, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.34697349570200575, | |
| "grad_norm": 1.6593496799468994, | |
| "learning_rate": 4.421710840496657e-05, | |
| "loss": 1.376, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.35816618911174786, | |
| "grad_norm": 1.6412550210952759, | |
| "learning_rate": 4.4030563514804206e-05, | |
| "loss": 1.3712, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.36935888252148996, | |
| "grad_norm": 1.6455302238464355, | |
| "learning_rate": 4.3844018624641836e-05, | |
| "loss": 1.3699, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.38055157593123207, | |
| "grad_norm": 1.6210881471633911, | |
| "learning_rate": 4.3657473734479466e-05, | |
| "loss": 1.3618, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.3917442693409742, | |
| "grad_norm": 1.6821410655975342, | |
| "learning_rate": 4.3470928844317096e-05, | |
| "loss": 1.3563, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.3917442693409742, | |
| "eval_accuracy": 0.589598096204646, | |
| "eval_loss": 2.168947219848633, | |
| "eval_runtime": 707.8628, | |
| "eval_samples_per_second": 91.793, | |
| "eval_steps_per_second": 3.826, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.40293696275071633, | |
| "grad_norm": 1.719738245010376, | |
| "learning_rate": 4.3284383954154726e-05, | |
| "loss": 1.3585, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.41412965616045844, | |
| "grad_norm": 1.660507321357727, | |
| "learning_rate": 4.309783906399236e-05, | |
| "loss": 1.3502, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.4253223495702006, | |
| "grad_norm": 1.7758148908615112, | |
| "learning_rate": 4.291129417382999e-05, | |
| "loss": 1.3459, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.4365150429799427, | |
| "grad_norm": 1.6665699481964111, | |
| "learning_rate": 4.272474928366762e-05, | |
| "loss": 1.3435, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.4477077363896848, | |
| "grad_norm": 1.6364027261734009, | |
| "learning_rate": 4.253820439350525e-05, | |
| "loss": 1.3401, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.4477077363896848, | |
| "eval_accuracy": 0.5922608205511055, | |
| "eval_loss": 2.1485562324523926, | |
| "eval_runtime": 704.3891, | |
| "eval_samples_per_second": 92.246, | |
| "eval_steps_per_second": 3.844, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.4589004297994269, | |
| "grad_norm": 9.470758438110352, | |
| "learning_rate": 4.426374462750716e-05, | |
| "loss": 8.0235, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.4700931232091691, | |
| "grad_norm": 9.91232967376709, | |
| "learning_rate": 4.412383595988539e-05, | |
| "loss": 7.9603, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.4812858166189112, | |
| "grad_norm": 9.734143257141113, | |
| "learning_rate": 4.398392729226361e-05, | |
| "loss": 7.9793, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.4924785100286533, | |
| "grad_norm": 9.574400901794434, | |
| "learning_rate": 4.3844018624641836e-05, | |
| "loss": 7.9731, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.5036712034383954, | |
| "grad_norm": 10.017444610595703, | |
| "learning_rate": 4.370410995702006e-05, | |
| "loss": 7.9335, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.5036712034383954, | |
| "eval_accuracy": 0.5952892349509148, | |
| "eval_loss": 2.1270551681518555, | |
| "eval_runtime": 525.2336, | |
| "eval_samples_per_second": 123.711, | |
| "eval_steps_per_second": 2.578, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.5148638968481375, | |
| "grad_norm": 9.66054916381836, | |
| "learning_rate": 4.356420128939828e-05, | |
| "loss": 7.9224, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.5260565902578797, | |
| "grad_norm": 9.483991622924805, | |
| "learning_rate": 4.342429262177651e-05, | |
| "loss": 7.9197, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.5372492836676218, | |
| "grad_norm": 9.803547859191895, | |
| "learning_rate": 4.3284383954154726e-05, | |
| "loss": 7.8932, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.5484419770773639, | |
| "grad_norm": 9.597293853759766, | |
| "learning_rate": 4.3144475286532955e-05, | |
| "loss": 7.8622, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.559634670487106, | |
| "grad_norm": 9.679096221923828, | |
| "learning_rate": 4.300456661891118e-05, | |
| "loss": 7.8644, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.559634670487106, | |
| "eval_accuracy": 0.5979848217689566, | |
| "eval_loss": 2.106226682662964, | |
| "eval_runtime": 528.4389, | |
| "eval_samples_per_second": 122.96, | |
| "eval_steps_per_second": 2.562, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.5708273638968482, | |
| "grad_norm": 9.878997802734375, | |
| "learning_rate": 4.28646579512894e-05, | |
| "loss": 7.8388, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.5820200573065902, | |
| "grad_norm": 9.320840835571289, | |
| "learning_rate": 4.272474928366762e-05, | |
| "loss": 7.8199, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.5932127507163324, | |
| "grad_norm": 9.581457138061523, | |
| "learning_rate": 4.2584840616045845e-05, | |
| "loss": 7.8194, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.6044054441260746, | |
| "grad_norm": 9.690735816955566, | |
| "learning_rate": 4.2444931948424074e-05, | |
| "loss": 7.8147, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.6155981375358166, | |
| "grad_norm": 9.55455207824707, | |
| "learning_rate": 4.230502328080229e-05, | |
| "loss": 7.7927, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.6155981375358166, | |
| "eval_accuracy": 0.599545230267029, | |
| "eval_loss": 2.09478497505188, | |
| "eval_runtime": 531.729, | |
| "eval_samples_per_second": 122.199, | |
| "eval_steps_per_second": 2.546, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.6267908309455588, | |
| "grad_norm": 9.352036476135254, | |
| "learning_rate": 4.216511461318052e-05, | |
| "loss": 7.7711, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.6379835243553008, | |
| "grad_norm": 9.413168907165527, | |
| "learning_rate": 4.202520594555874e-05, | |
| "loss": 7.7733, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.649176217765043, | |
| "grad_norm": 9.420402526855469, | |
| "learning_rate": 4.1885297277936964e-05, | |
| "loss": 7.74, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.6603689111747851, | |
| "grad_norm": 9.579030990600586, | |
| "learning_rate": 4.1745388610315186e-05, | |
| "loss": 7.7237, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 0.6715616045845272, | |
| "grad_norm": 12.816407203674316, | |
| "learning_rate": 4.160547994269341e-05, | |
| "loss": 7.7401, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.6715616045845272, | |
| "eval_accuracy": 0.6018761556694041, | |
| "eval_loss": 2.079362630844116, | |
| "eval_runtime": 531.1941, | |
| "eval_samples_per_second": 122.323, | |
| "eval_steps_per_second": 2.549, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.6827542979942693, | |
| "grad_norm": 9.477621078491211, | |
| "learning_rate": 4.146557127507164e-05, | |
| "loss": 7.717, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.6939469914040115, | |
| "grad_norm": 9.8326416015625, | |
| "learning_rate": 4.132566260744986e-05, | |
| "loss": 7.7148, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.7051396848137536, | |
| "grad_norm": 9.668205261230469, | |
| "learning_rate": 4.118575393982808e-05, | |
| "loss": 7.6845, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 0.7163323782234957, | |
| "grad_norm": 9.344961166381836, | |
| "learning_rate": 4.1045845272206305e-05, | |
| "loss": 7.673, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.7275250716332379, | |
| "grad_norm": 12.754666328430176, | |
| "learning_rate": 4.090593660458453e-05, | |
| "loss": 7.646, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.7275250716332379, | |
| "eval_accuracy": 0.6036969981017439, | |
| "eval_loss": 2.0638949871063232, | |
| "eval_runtime": 534.6839, | |
| "eval_samples_per_second": 121.524, | |
| "eval_steps_per_second": 2.532, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.7387177650429799, | |
| "grad_norm": 9.269234657287598, | |
| "learning_rate": 4.076602793696275e-05, | |
| "loss": 7.6452, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.7499104584527221, | |
| "grad_norm": 9.59334659576416, | |
| "learning_rate": 4.062611926934098e-05, | |
| "loss": 7.6369, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 0.7611031518624641, | |
| "grad_norm": 9.979016304016113, | |
| "learning_rate": 4.04862106017192e-05, | |
| "loss": 7.6306, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.7722958452722063, | |
| "grad_norm": 9.395634651184082, | |
| "learning_rate": 4.0346301934097424e-05, | |
| "loss": 7.6083, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 0.7834885386819485, | |
| "grad_norm": 9.377208709716797, | |
| "learning_rate": 4.0206393266475646e-05, | |
| "loss": 7.6113, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.7834885386819485, | |
| "eval_accuracy": 0.6060486530458662, | |
| "eval_loss": 2.046678066253662, | |
| "eval_runtime": 527.2553, | |
| "eval_samples_per_second": 123.236, | |
| "eval_steps_per_second": 2.568, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.7946812320916905, | |
| "grad_norm": 9.33324146270752, | |
| "learning_rate": 4.006648459885387e-05, | |
| "loss": 7.596, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 0.8058739255014327, | |
| "grad_norm": 10.012749671936035, | |
| "learning_rate": 3.992657593123209e-05, | |
| "loss": 7.5944, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.8170666189111748, | |
| "grad_norm": 9.17791748046875, | |
| "learning_rate": 3.9786667263610314e-05, | |
| "loss": 7.5724, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 0.8282593123209169, | |
| "grad_norm": 9.714068412780762, | |
| "learning_rate": 3.964675859598854e-05, | |
| "loss": 7.5716, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.839452005730659, | |
| "grad_norm": 9.122146606445312, | |
| "learning_rate": 3.9506849928366765e-05, | |
| "loss": 7.5428, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 0.839452005730659, | |
| "eval_accuracy": 0.6080310471813272, | |
| "eval_loss": 2.0341005325317383, | |
| "eval_runtime": 534.9624, | |
| "eval_samples_per_second": 121.461, | |
| "eval_steps_per_second": 2.531, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 0.8506446991404012, | |
| "grad_norm": 8.890284538269043, | |
| "learning_rate": 3.936694126074499e-05, | |
| "loss": 7.5108, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.8618373925501432, | |
| "grad_norm": 9.258638381958008, | |
| "learning_rate": 3.922703259312321e-05, | |
| "loss": 7.5283, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 0.8730300859598854, | |
| "grad_norm": 9.524474143981934, | |
| "learning_rate": 3.908712392550143e-05, | |
| "loss": 7.5168, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.8842227793696275, | |
| "grad_norm": 9.608149528503418, | |
| "learning_rate": 3.894721525787966e-05, | |
| "loss": 7.5206, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 0.8954154727793696, | |
| "grad_norm": 9.405288696289062, | |
| "learning_rate": 3.880730659025788e-05, | |
| "loss": 7.5039, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.8954154727793696, | |
| "eval_accuracy": 0.6094788673634718, | |
| "eval_loss": 2.0253567695617676, | |
| "eval_runtime": 535.0948, | |
| "eval_samples_per_second": 121.431, | |
| "eval_steps_per_second": 2.53, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.9066081661891118, | |
| "grad_norm": 8.706295013427734, | |
| "learning_rate": 3.8667397922636107e-05, | |
| "loss": 7.4819, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 0.9178008595988538, | |
| "grad_norm": 9.542219161987305, | |
| "learning_rate": 3.852748925501433e-05, | |
| "loss": 7.4888, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 0.928993553008596, | |
| "grad_norm": 9.111319541931152, | |
| "learning_rate": 3.838758058739255e-05, | |
| "loss": 7.4749, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 0.9401862464183381, | |
| "grad_norm": 9.335123062133789, | |
| "learning_rate": 3.824767191977078e-05, | |
| "loss": 7.4591, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.9513789398280802, | |
| "grad_norm": 9.537328720092773, | |
| "learning_rate": 3.8107763252148996e-05, | |
| "loss": 7.4533, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 0.9513789398280802, | |
| "eval_accuracy": 0.6107361281876699, | |
| "eval_loss": 2.0133583545684814, | |
| "eval_runtime": 530.2968, | |
| "eval_samples_per_second": 122.529, | |
| "eval_steps_per_second": 2.553, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 0.9625716332378224, | |
| "grad_norm": 9.227987289428711, | |
| "learning_rate": 3.7967854584527225e-05, | |
| "loss": 7.4506, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 0.9737643266475645, | |
| "grad_norm": 9.076460838317871, | |
| "learning_rate": 3.782794591690544e-05, | |
| "loss": 7.4557, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 0.9849570200573066, | |
| "grad_norm": 9.841446876525879, | |
| "learning_rate": 3.768803724928367e-05, | |
| "loss": 7.4319, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 0.9961497134670487, | |
| "grad_norm": 9.169388771057129, | |
| "learning_rate": 3.754812858166189e-05, | |
| "loss": 7.4453, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 1.0073424068767909, | |
| "grad_norm": 9.200368881225586, | |
| "learning_rate": 3.7408219914040115e-05, | |
| "loss": 7.4149, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 1.0073424068767909, | |
| "eval_accuracy": 0.6120696404224526, | |
| "eval_loss": 2.0035457611083984, | |
| "eval_runtime": 532.3092, | |
| "eval_samples_per_second": 122.066, | |
| "eval_steps_per_second": 2.544, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 1.018535100286533, | |
| "grad_norm": 9.189336776733398, | |
| "learning_rate": 3.7268311246418344e-05, | |
| "loss": 7.3981, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 1.029727793696275, | |
| "grad_norm": 9.504659652709961, | |
| "learning_rate": 3.712840257879656e-05, | |
| "loss": 7.4031, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 1.0409204871060171, | |
| "grad_norm": 9.516868591308594, | |
| "learning_rate": 3.698849391117479e-05, | |
| "loss": 7.3822, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 1.0521131805157593, | |
| "grad_norm": 9.417741775512695, | |
| "learning_rate": 3.6848585243553005e-05, | |
| "loss": 7.3887, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 1.0633058739255015, | |
| "grad_norm": 9.202630043029785, | |
| "learning_rate": 3.6708676575931234e-05, | |
| "loss": 7.379, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 1.0633058739255015, | |
| "eval_accuracy": 0.6134326919026999, | |
| "eval_loss": 1.9946683645248413, | |
| "eval_runtime": 529.4253, | |
| "eval_samples_per_second": 122.731, | |
| "eval_steps_per_second": 2.557, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 1.0744985673352436, | |
| "grad_norm": 9.18812084197998, | |
| "learning_rate": 3.6568767908309456e-05, | |
| "loss": 7.3648, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 1.0856912607449858, | |
| "grad_norm": 9.317421913146973, | |
| "learning_rate": 3.642885924068768e-05, | |
| "loss": 7.3581, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 1.0968839541547277, | |
| "grad_norm": 9.30117130279541, | |
| "learning_rate": 3.628895057306591e-05, | |
| "loss": 7.3242, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 1.1080766475644699, | |
| "grad_norm": 9.295071601867676, | |
| "learning_rate": 3.6149041905444124e-05, | |
| "loss": 7.3343, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 1.119269340974212, | |
| "grad_norm": 9.372967720031738, | |
| "learning_rate": 3.600913323782235e-05, | |
| "loss": 7.324, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 1.119269340974212, | |
| "eval_accuracy": 0.6151487162989436, | |
| "eval_loss": 1.9853588342666626, | |
| "eval_runtime": 528.6238, | |
| "eval_samples_per_second": 122.917, | |
| "eval_steps_per_second": 2.561, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 1.1304620343839542, | |
| "grad_norm": 10.693807601928711, | |
| "learning_rate": 3.5869224570200575e-05, | |
| "loss": 7.3164, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 1.1416547277936964, | |
| "grad_norm": 9.047393798828125, | |
| "learning_rate": 3.57293159025788e-05, | |
| "loss": 7.3028, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 1.1528474212034383, | |
| "grad_norm": 9.055428504943848, | |
| "learning_rate": 3.558940723495702e-05, | |
| "loss": 7.316, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 1.1640401146131805, | |
| "grad_norm": 8.821599960327148, | |
| "learning_rate": 3.544949856733524e-05, | |
| "loss": 7.2759, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 1.1752328080229226, | |
| "grad_norm": 8.971498489379883, | |
| "learning_rate": 3.530958989971347e-05, | |
| "loss": 7.3041, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 1.1752328080229226, | |
| "eval_accuracy": 0.6162336395081003, | |
| "eval_loss": 1.9736484289169312, | |
| "eval_runtime": 526.2473, | |
| "eval_samples_per_second": 123.472, | |
| "eval_steps_per_second": 2.573, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 1.1864255014326648, | |
| "grad_norm": 9.30490779876709, | |
| "learning_rate": 3.5169681232091694e-05, | |
| "loss": 7.2966, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 1.197618194842407, | |
| "grad_norm": 9.367337226867676, | |
| "learning_rate": 3.5029772564469917e-05, | |
| "loss": 7.2862, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 1.2088108882521489, | |
| "grad_norm": 9.002731323242188, | |
| "learning_rate": 3.488986389684814e-05, | |
| "loss": 7.2858, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 1.220003581661891, | |
| "grad_norm": 9.070691108703613, | |
| "learning_rate": 3.474995522922636e-05, | |
| "loss": 7.2692, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 1.2311962750716332, | |
| "grad_norm": 9.154426574707031, | |
| "learning_rate": 3.4610046561604584e-05, | |
| "loss": 7.262, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 1.2311962750716332, | |
| "eval_accuracy": 0.6174860367511424, | |
| "eval_loss": 1.9672149419784546, | |
| "eval_runtime": 529.8804, | |
| "eval_samples_per_second": 122.626, | |
| "eval_steps_per_second": 2.555, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 1.2423889684813754, | |
| "grad_norm": 9.364106178283691, | |
| "learning_rate": 3.447013789398281e-05, | |
| "loss": 7.2489, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 1.2535816618911175, | |
| "grad_norm": 9.267243385314941, | |
| "learning_rate": 3.4330229226361035e-05, | |
| "loss": 7.2664, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 1.2647743553008595, | |
| "grad_norm": 9.162137031555176, | |
| "learning_rate": 3.419032055873926e-05, | |
| "loss": 7.2475, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 1.2759670487106018, | |
| "grad_norm": 9.292202949523926, | |
| "learning_rate": 3.405041189111748e-05, | |
| "loss": 7.2357, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 1.2871597421203438, | |
| "grad_norm": 9.280839920043945, | |
| "learning_rate": 3.39105032234957e-05, | |
| "loss": 7.2169, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 1.2871597421203438, | |
| "eval_accuracy": 0.6184868881174969, | |
| "eval_loss": 1.9653985500335693, | |
| "eval_runtime": 531.1309, | |
| "eval_samples_per_second": 122.337, | |
| "eval_steps_per_second": 2.549, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 1.298352435530086, | |
| "grad_norm": 8.75936222076416, | |
| "learning_rate": 3.3770594555873925e-05, | |
| "loss": 7.2052, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 1.309545128939828, | |
| "grad_norm": 8.891804695129395, | |
| "learning_rate": 3.363068588825215e-05, | |
| "loss": 7.2258, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 1.3207378223495703, | |
| "grad_norm": 8.931051254272461, | |
| "learning_rate": 3.349077722063038e-05, | |
| "loss": 7.1899, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 1.3319305157593124, | |
| "grad_norm": 9.616579055786133, | |
| "learning_rate": 3.33508685530086e-05, | |
| "loss": 7.2068, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 1.3431232091690544, | |
| "grad_norm": 8.981892585754395, | |
| "learning_rate": 3.321095988538682e-05, | |
| "loss": 7.2084, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 1.3431232091690544, | |
| "eval_accuracy": 0.6190439375486008, | |
| "eval_loss": 1.9544332027435303, | |
| "eval_runtime": 528.8193, | |
| "eval_samples_per_second": 122.872, | |
| "eval_steps_per_second": 2.56, | |
| "step": 60000 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 178688, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 2500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.2799139827941786e+18, | |
| "train_batch_size": 24, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |