| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 500, | |
| "global_step": 782, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0025575447570332483, | |
| "grad_norm": 21.953563303736964, | |
| "learning_rate": 9.999959651660741e-06, | |
| "loss": 1.3458, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.005115089514066497, | |
| "grad_norm": 11.652464724143648, | |
| "learning_rate": 9.999838607294157e-06, | |
| "loss": 1.293, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0076726342710997444, | |
| "grad_norm": 14.373069724258055, | |
| "learning_rate": 9.999636868853824e-06, | |
| "loss": 0.9527, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.010230179028132993, | |
| "grad_norm": 9.875205922264723, | |
| "learning_rate": 9.999354439595668e-06, | |
| "loss": 0.9394, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.01278772378516624, | |
| "grad_norm": 15.116196776322745, | |
| "learning_rate": 9.998991324077906e-06, | |
| "loss": 0.8439, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.015345268542199489, | |
| "grad_norm": 11.057152894995923, | |
| "learning_rate": 9.998547528160987e-06, | |
| "loss": 0.8405, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.017902813299232736, | |
| "grad_norm": 9.365688071554938, | |
| "learning_rate": 9.998023059007477e-06, | |
| "loss": 0.8734, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.020460358056265986, | |
| "grad_norm": 8.392619519229692, | |
| "learning_rate": 9.997417925081963e-06, | |
| "loss": 0.7449, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.023017902813299233, | |
| "grad_norm": 9.340778471253389, | |
| "learning_rate": 9.996732136150902e-06, | |
| "loss": 0.7876, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.02557544757033248, | |
| "grad_norm": 8.20102806123342, | |
| "learning_rate": 9.995965703282472e-06, | |
| "loss": 0.7954, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.028132992327365727, | |
| "grad_norm": 8.76971572309029, | |
| "learning_rate": 9.995118638846394e-06, | |
| "loss": 0.8333, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.030690537084398978, | |
| "grad_norm": 7.557620850007077, | |
| "learning_rate": 9.99419095651372e-06, | |
| "loss": 0.8554, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.03324808184143223, | |
| "grad_norm": 7.743714182477662, | |
| "learning_rate": 9.993182671256633e-06, | |
| "loss": 0.8637, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.03580562659846547, | |
| "grad_norm": 7.32084774886768, | |
| "learning_rate": 9.992093799348182e-06, | |
| "loss": 0.8621, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.03836317135549872, | |
| "grad_norm": 7.6800585762630655, | |
| "learning_rate": 9.990924358362037e-06, | |
| "loss": 0.7075, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.04092071611253197, | |
| "grad_norm": 7.053712401961599, | |
| "learning_rate": 9.9896743671722e-06, | |
| "loss": 0.8078, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.043478260869565216, | |
| "grad_norm": 7.957894711517596, | |
| "learning_rate": 9.988343845952697e-06, | |
| "loss": 0.8534, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.04603580562659847, | |
| "grad_norm": 7.77708191899406, | |
| "learning_rate": 9.986932816177258e-06, | |
| "loss": 0.7898, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.04859335038363171, | |
| "grad_norm": 8.86563729131372, | |
| "learning_rate": 9.985441300618966e-06, | |
| "loss": 0.7867, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.05115089514066496, | |
| "grad_norm": 7.9582904003704655, | |
| "learning_rate": 9.98386932334989e-06, | |
| "loss": 0.7112, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.05370843989769821, | |
| "grad_norm": 7.46439913319433, | |
| "learning_rate": 9.982216909740703e-06, | |
| "loss": 0.7207, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.056265984654731455, | |
| "grad_norm": 7.502417267747896, | |
| "learning_rate": 9.980484086460258e-06, | |
| "loss": 0.7635, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.058823529411764705, | |
| "grad_norm": 8.077410576519545, | |
| "learning_rate": 9.978670881475173e-06, | |
| "loss": 0.7568, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.061381074168797956, | |
| "grad_norm": 8.530645418517887, | |
| "learning_rate": 9.976777324049374e-06, | |
| "loss": 0.8006, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.0639386189258312, | |
| "grad_norm": 7.884014258298183, | |
| "learning_rate": 9.974803444743617e-06, | |
| "loss": 0.741, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.06649616368286446, | |
| "grad_norm": 6.917821196530823, | |
| "learning_rate": 9.972749275415005e-06, | |
| "loss": 0.6668, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.06905370843989769, | |
| "grad_norm": 6.951997744847425, | |
| "learning_rate": 9.970614849216465e-06, | |
| "loss": 0.7619, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.07161125319693094, | |
| "grad_norm": 6.541689929654185, | |
| "learning_rate": 9.96840020059622e-06, | |
| "loss": 0.7667, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.0741687979539642, | |
| "grad_norm": 7.022380585505507, | |
| "learning_rate": 9.966105365297226e-06, | |
| "loss": 0.7176, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.07672634271099744, | |
| "grad_norm": 6.912790106638039, | |
| "learning_rate": 9.963730380356599e-06, | |
| "loss": 0.7199, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0792838874680307, | |
| "grad_norm": 6.0140784283728275, | |
| "learning_rate": 9.96127528410502e-06, | |
| "loss": 0.7356, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.08184143222506395, | |
| "grad_norm": 6.50099127837641, | |
| "learning_rate": 9.958740116166113e-06, | |
| "loss": 0.6741, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.08439897698209718, | |
| "grad_norm": 6.196159967289502, | |
| "learning_rate": 9.9561249174558e-06, | |
| "loss": 0.6453, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.08695652173913043, | |
| "grad_norm": 7.115083432855918, | |
| "learning_rate": 9.953429730181653e-06, | |
| "loss": 0.6921, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.08951406649616368, | |
| "grad_norm": 8.373221612447116, | |
| "learning_rate": 9.950654597842209e-06, | |
| "loss": 0.6904, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.09207161125319693, | |
| "grad_norm": 7.708793853244428, | |
| "learning_rate": 9.947799565226253e-06, | |
| "loss": 0.7295, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.09462915601023018, | |
| "grad_norm": 8.81215615740888, | |
| "learning_rate": 9.944864678412118e-06, | |
| "loss": 0.6806, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.09718670076726342, | |
| "grad_norm": 7.456995220385941, | |
| "learning_rate": 9.94184998476693e-06, | |
| "loss": 0.6659, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.09974424552429667, | |
| "grad_norm": 7.636846126721769, | |
| "learning_rate": 9.938755532945838e-06, | |
| "loss": 0.7073, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.10230179028132992, | |
| "grad_norm": 8.018319876671674, | |
| "learning_rate": 9.93558137289124e-06, | |
| "loss": 0.8063, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.10485933503836317, | |
| "grad_norm": 6.429546682427428, | |
| "learning_rate": 9.932327555831972e-06, | |
| "loss": 0.5498, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.10741687979539642, | |
| "grad_norm": 6.106558471191083, | |
| "learning_rate": 9.928994134282477e-06, | |
| "loss": 0.6125, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.10997442455242967, | |
| "grad_norm": 7.220242425194578, | |
| "learning_rate": 9.925581162041967e-06, | |
| "loss": 0.6553, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.11253196930946291, | |
| "grad_norm": 7.406518370609504, | |
| "learning_rate": 9.922088694193546e-06, | |
| "loss": 0.7235, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.11508951406649616, | |
| "grad_norm": 6.232863127816167, | |
| "learning_rate": 9.918516787103322e-06, | |
| "loss": 0.6302, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.11764705882352941, | |
| "grad_norm": 6.960585716404723, | |
| "learning_rate": 9.91486549841951e-06, | |
| "loss": 0.6631, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.12020460358056266, | |
| "grad_norm": 7.014565196822424, | |
| "learning_rate": 9.911134887071477e-06, | |
| "loss": 0.6013, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.12276214833759591, | |
| "grad_norm": 6.747724971317866, | |
| "learning_rate": 9.907325013268816e-06, | |
| "loss": 0.7311, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.12531969309462915, | |
| "grad_norm": 6.343001355283702, | |
| "learning_rate": 9.903435938500356e-06, | |
| "loss": 0.5662, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.1278772378516624, | |
| "grad_norm": 6.68715844293194, | |
| "learning_rate": 9.899467725533181e-06, | |
| "loss": 0.6746, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.13043478260869565, | |
| "grad_norm": 6.670242029824649, | |
| "learning_rate": 9.895420438411616e-06, | |
| "loss": 0.5636, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.1329923273657289, | |
| "grad_norm": 7.6565238394007, | |
| "learning_rate": 9.89129414245618e-06, | |
| "loss": 0.6959, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.13554987212276215, | |
| "grad_norm": 6.322260761537018, | |
| "learning_rate": 9.887088904262557e-06, | |
| "loss": 0.6475, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.13810741687979539, | |
| "grad_norm": 7.424958009645482, | |
| "learning_rate": 9.882804791700488e-06, | |
| "loss": 0.7381, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.14066496163682865, | |
| "grad_norm": 6.484080185567617, | |
| "learning_rate": 9.878441873912712e-06, | |
| "loss": 0.5978, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.1432225063938619, | |
| "grad_norm": 7.904610864584768, | |
| "learning_rate": 9.87400022131382e-06, | |
| "loss": 0.7156, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.14578005115089515, | |
| "grad_norm": 6.091768577876145, | |
| "learning_rate": 9.869479905589136e-06, | |
| "loss": 0.5674, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.1483375959079284, | |
| "grad_norm": 6.338206394229405, | |
| "learning_rate": 9.864880999693551e-06, | |
| "loss": 0.5511, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.15089514066496162, | |
| "grad_norm": 7.393436860670507, | |
| "learning_rate": 9.860203577850353e-06, | |
| "loss": 0.6305, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.1534526854219949, | |
| "grad_norm": 6.520098900315275, | |
| "learning_rate": 9.855447715550024e-06, | |
| "loss": 0.6191, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.15601023017902813, | |
| "grad_norm": 6.415362809418058, | |
| "learning_rate": 9.850613489549018e-06, | |
| "loss": 0.6296, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.1585677749360614, | |
| "grad_norm": 7.9580764103093635, | |
| "learning_rate": 9.845700977868536e-06, | |
| "loss": 0.6384, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.16112531969309463, | |
| "grad_norm": 7.1082899489573315, | |
| "learning_rate": 9.840710259793251e-06, | |
| "loss": 0.6021, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.1636828644501279, | |
| "grad_norm": 6.386504712691233, | |
| "learning_rate": 9.835641415870038e-06, | |
| "loss": 0.6873, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.16624040920716113, | |
| "grad_norm": 7.4797007816006555, | |
| "learning_rate": 9.830494527906671e-06, | |
| "loss": 0.5648, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.16879795396419436, | |
| "grad_norm": 6.748679766187366, | |
| "learning_rate": 9.825269678970502e-06, | |
| "loss": 0.566, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.17135549872122763, | |
| "grad_norm": 6.56170099270124, | |
| "learning_rate": 9.819966953387122e-06, | |
| "loss": 0.6398, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.17391304347826086, | |
| "grad_norm": 5.941005918686002, | |
| "learning_rate": 9.814586436738998e-06, | |
| "loss": 0.6658, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.17647058823529413, | |
| "grad_norm": 5.942719440981736, | |
| "learning_rate": 9.809128215864096e-06, | |
| "loss": 0.5971, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.17902813299232737, | |
| "grad_norm": 6.4968806279365925, | |
| "learning_rate": 9.803592378854476e-06, | |
| "loss": 0.6047, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.1815856777493606, | |
| "grad_norm": 5.577555703482892, | |
| "learning_rate": 9.797979015054868e-06, | |
| "loss": 0.5534, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.18414322250639387, | |
| "grad_norm": 6.262423254220968, | |
| "learning_rate": 9.792288215061237e-06, | |
| "loss": 0.5755, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.1867007672634271, | |
| "grad_norm": 7.17658874763877, | |
| "learning_rate": 9.786520070719313e-06, | |
| "loss": 0.5511, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.18925831202046037, | |
| "grad_norm": 5.532267867303149, | |
| "learning_rate": 9.780674675123113e-06, | |
| "loss": 0.4965, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.1918158567774936, | |
| "grad_norm": 7.040268722404183, | |
| "learning_rate": 9.77475212261344e-06, | |
| "loss": 0.6644, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.19437340153452684, | |
| "grad_norm": 6.774497918030386, | |
| "learning_rate": 9.768752508776358e-06, | |
| "loss": 0.491, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.1969309462915601, | |
| "grad_norm": 5.776934238941505, | |
| "learning_rate": 9.762675930441647e-06, | |
| "loss": 0.4861, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.19948849104859334, | |
| "grad_norm": 6.47046232974907, | |
| "learning_rate": 9.756522485681247e-06, | |
| "loss": 0.6273, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.2020460358056266, | |
| "grad_norm": 6.712438744552012, | |
| "learning_rate": 9.750292273807666e-06, | |
| "loss": 0.7368, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.20460358056265984, | |
| "grad_norm": 8.615585919768426, | |
| "learning_rate": 9.743985395372387e-06, | |
| "loss": 0.6335, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.2071611253196931, | |
| "grad_norm": 8.071799145378323, | |
| "learning_rate": 9.737601952164238e-06, | |
| "loss": 0.6612, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.20971867007672634, | |
| "grad_norm": 6.823667160150434, | |
| "learning_rate": 9.73114204720775e-06, | |
| "loss": 0.5456, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.21227621483375958, | |
| "grad_norm": 5.888063194216176, | |
| "learning_rate": 9.724605784761501e-06, | |
| "loss": 0.5959, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.21483375959079284, | |
| "grad_norm": 7.210449720464657, | |
| "learning_rate": 9.717993270316421e-06, | |
| "loss": 0.5919, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.21739130434782608, | |
| "grad_norm": 7.06901541191221, | |
| "learning_rate": 9.711304610594104e-06, | |
| "loss": 0.5882, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.21994884910485935, | |
| "grad_norm": 6.374122555190542, | |
| "learning_rate": 9.704539913545073e-06, | |
| "loss": 0.6133, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.22250639386189258, | |
| "grad_norm": 6.630925756972144, | |
| "learning_rate": 9.697699288347043e-06, | |
| "loss": 0.5664, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.22506393861892582, | |
| "grad_norm": 5.966658672241774, | |
| "learning_rate": 9.690782845403164e-06, | |
| "loss": 0.6244, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.22762148337595908, | |
| "grad_norm": 5.5770246123900264, | |
| "learning_rate": 9.683790696340229e-06, | |
| "loss": 0.5334, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.23017902813299232, | |
| "grad_norm": 6.522499319244262, | |
| "learning_rate": 9.676722954006878e-06, | |
| "loss": 0.6739, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.23273657289002558, | |
| "grad_norm": 6.4235805218363025, | |
| "learning_rate": 9.669579732471779e-06, | |
| "loss": 0.6595, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.23529411764705882, | |
| "grad_norm": 6.3431696131235284, | |
| "learning_rate": 9.66236114702178e-06, | |
| "loss": 0.6023, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.23785166240409208, | |
| "grad_norm": 6.429883886786425, | |
| "learning_rate": 9.655067314160058e-06, | |
| "loss": 0.5986, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.24040920716112532, | |
| "grad_norm": 6.701368521252926, | |
| "learning_rate": 9.647698351604227e-06, | |
| "loss": 0.6569, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.24296675191815856, | |
| "grad_norm": 6.626338500498468, | |
| "learning_rate": 9.640254378284447e-06, | |
| "loss": 0.5552, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.24552429667519182, | |
| "grad_norm": 6.872036104522696, | |
| "learning_rate": 9.632735514341508e-06, | |
| "loss": 0.5384, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.24808184143222506, | |
| "grad_norm": 7.976561364452108, | |
| "learning_rate": 9.625141881124874e-06, | |
| "loss": 0.6225, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.2506393861892583, | |
| "grad_norm": 6.979721775572128, | |
| "learning_rate": 9.617473601190743e-06, | |
| "loss": 0.5937, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.2531969309462916, | |
| "grad_norm": 7.519173375318072, | |
| "learning_rate": 9.609730798300056e-06, | |
| "loss": 0.5673, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.2557544757033248, | |
| "grad_norm": 6.501376080036557, | |
| "learning_rate": 9.601913597416513e-06, | |
| "loss": 0.6167, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.25831202046035806, | |
| "grad_norm": 7.232025532728033, | |
| "learning_rate": 9.594022124704541e-06, | |
| "loss": 0.6528, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.2608695652173913, | |
| "grad_norm": 5.821804513422366, | |
| "learning_rate": 9.586056507527266e-06, | |
| "loss": 0.5839, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.26342710997442453, | |
| "grad_norm": 7.447246210872055, | |
| "learning_rate": 9.578016874444459e-06, | |
| "loss": 0.5425, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.2659846547314578, | |
| "grad_norm": 5.023721202880345, | |
| "learning_rate": 9.569903355210457e-06, | |
| "loss": 0.4649, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.26854219948849106, | |
| "grad_norm": 5.3855220410063165, | |
| "learning_rate": 9.561716080772072e-06, | |
| "loss": 0.5362, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.2710997442455243, | |
| "grad_norm": 5.755012965635693, | |
| "learning_rate": 9.55345518326647e-06, | |
| "loss": 0.637, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.27365728900255754, | |
| "grad_norm": 6.317278618265475, | |
| "learning_rate": 9.545120796019056e-06, | |
| "loss": 0.6073, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.27621483375959077, | |
| "grad_norm": 6.556750652969424, | |
| "learning_rate": 9.5367130535413e-06, | |
| "loss": 0.6106, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.27877237851662406, | |
| "grad_norm": 7.234204791297877, | |
| "learning_rate": 9.528232091528578e-06, | |
| "loss": 0.5537, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.2813299232736573, | |
| "grad_norm": 6.44863975829238, | |
| "learning_rate": 9.519678046857987e-06, | |
| "loss": 0.6654, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.28388746803069054, | |
| "grad_norm": 5.882964477582141, | |
| "learning_rate": 9.511051057586125e-06, | |
| "loss": 0.5723, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.2864450127877238, | |
| "grad_norm": 5.682881601819309, | |
| "learning_rate": 9.502351262946865e-06, | |
| "loss": 0.5325, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.289002557544757, | |
| "grad_norm": 5.907234833224297, | |
| "learning_rate": 9.493578803349117e-06, | |
| "loss": 0.6238, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.2915601023017903, | |
| "grad_norm": 6.539157912120955, | |
| "learning_rate": 9.48473382037455e-06, | |
| "loss": 0.6228, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.29411764705882354, | |
| "grad_norm": 6.263387218751874, | |
| "learning_rate": 9.475816456775313e-06, | |
| "loss": 0.5954, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.2966751918158568, | |
| "grad_norm": 5.93110985413684, | |
| "learning_rate": 9.466826856471728e-06, | |
| "loss": 0.6008, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.29923273657289, | |
| "grad_norm": 6.326093043591936, | |
| "learning_rate": 9.457765164549979e-06, | |
| "loss": 0.5834, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.30179028132992325, | |
| "grad_norm": 8.333855109469555, | |
| "learning_rate": 9.448631527259749e-06, | |
| "loss": 0.7357, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.30434782608695654, | |
| "grad_norm": 7.352912958641833, | |
| "learning_rate": 9.439426092011877e-06, | |
| "loss": 0.5621, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.3069053708439898, | |
| "grad_norm": 6.812130929920315, | |
| "learning_rate": 9.430149007375974e-06, | |
| "loss": 0.6281, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.309462915601023, | |
| "grad_norm": 6.244862089177942, | |
| "learning_rate": 9.42080042307802e-06, | |
| "loss": 0.6083, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.31202046035805625, | |
| "grad_norm": 5.91824427054237, | |
| "learning_rate": 9.411380489997962e-06, | |
| "loss": 0.5141, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.3145780051150895, | |
| "grad_norm": 5.9138584985365075, | |
| "learning_rate": 9.401889360167256e-06, | |
| "loss": 0.5525, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.3171355498721228, | |
| "grad_norm": 6.264682424037789, | |
| "learning_rate": 9.392327186766434e-06, | |
| "loss": 0.5049, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.319693094629156, | |
| "grad_norm": 5.838854049045799, | |
| "learning_rate": 9.382694124122624e-06, | |
| "loss": 0.5835, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.32225063938618925, | |
| "grad_norm": 6.39551091760669, | |
| "learning_rate": 9.372990327707057e-06, | |
| "loss": 0.5132, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.3248081841432225, | |
| "grad_norm": 6.716488574379217, | |
| "learning_rate": 9.36321595413256e-06, | |
| "loss": 0.5372, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.3273657289002558, | |
| "grad_norm": 5.742568960781894, | |
| "learning_rate": 9.353371161151032e-06, | |
| "loss": 0.6203, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.329923273657289, | |
| "grad_norm": 6.772868273583815, | |
| "learning_rate": 9.34345610765089e-06, | |
| "loss": 0.4926, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.33248081841432225, | |
| "grad_norm": 7.185477633140159, | |
| "learning_rate": 9.333470953654513e-06, | |
| "loss": 0.6842, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.3350383631713555, | |
| "grad_norm": 6.296716433927121, | |
| "learning_rate": 9.32341586031565e-06, | |
| "loss": 0.5163, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.3375959079283887, | |
| "grad_norm": 8.081152679238958, | |
| "learning_rate": 9.31329098991683e-06, | |
| "loss": 0.7902, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.340153452685422, | |
| "grad_norm": 6.861874747525335, | |
| "learning_rate": 9.303096505866734e-06, | |
| "loss": 0.5645, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.34271099744245526, | |
| "grad_norm": 6.171268132782227, | |
| "learning_rate": 9.292832572697566e-06, | |
| "loss": 0.6114, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.3452685421994885, | |
| "grad_norm": 6.552747924757749, | |
| "learning_rate": 9.282499356062385e-06, | |
| "loss": 0.5954, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.34782608695652173, | |
| "grad_norm": 6.847724657541522, | |
| "learning_rate": 9.272097022732444e-06, | |
| "loss": 0.6485, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.35038363171355497, | |
| "grad_norm": 6.507209809740027, | |
| "learning_rate": 9.261625740594494e-06, | |
| "loss": 0.6159, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.35294117647058826, | |
| "grad_norm": 7.252605300424873, | |
| "learning_rate": 9.251085678648072e-06, | |
| "loss": 0.5576, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.3554987212276215, | |
| "grad_norm": 7.119115054572448, | |
| "learning_rate": 9.240477007002777e-06, | |
| "loss": 0.7135, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.35805626598465473, | |
| "grad_norm": 7.349847252060697, | |
| "learning_rate": 9.22979989687552e-06, | |
| "loss": 0.6444, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.36061381074168797, | |
| "grad_norm": 5.765325529890724, | |
| "learning_rate": 9.219054520587766e-06, | |
| "loss": 0.4233, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.3631713554987212, | |
| "grad_norm": 5.112678313504443, | |
| "learning_rate": 9.208241051562753e-06, | |
| "loss": 0.5447, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.3657289002557545, | |
| "grad_norm": 7.144208807574378, | |
| "learning_rate": 9.197359664322684e-06, | |
| "loss": 0.5891, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.36828644501278773, | |
| "grad_norm": 6.278104080681577, | |
| "learning_rate": 9.186410534485924e-06, | |
| "loss": 0.5701, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.37084398976982097, | |
| "grad_norm": 6.887145963626663, | |
| "learning_rate": 9.175393838764153e-06, | |
| "loss": 0.5502, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.3734015345268542, | |
| "grad_norm": 6.023815922626531, | |
| "learning_rate": 9.164309754959523e-06, | |
| "loss": 0.5286, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.37595907928388744, | |
| "grad_norm": 5.517318940161725, | |
| "learning_rate": 9.153158461961782e-06, | |
| "loss": 0.4433, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.37851662404092073, | |
| "grad_norm": 6.019318968473694, | |
| "learning_rate": 9.14194013974539e-06, | |
| "loss": 0.5065, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.38107416879795397, | |
| "grad_norm": 5.6444749821227385, | |
| "learning_rate": 9.130654969366619e-06, | |
| "loss": 0.501, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.3836317135549872, | |
| "grad_norm": 6.559732586020477, | |
| "learning_rate": 9.11930313296062e-06, | |
| "loss": 0.6101, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.38618925831202044, | |
| "grad_norm": 6.2279396393587705, | |
| "learning_rate": 9.107884813738492e-06, | |
| "loss": 0.5938, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.3887468030690537, | |
| "grad_norm": 7.319750378815782, | |
| "learning_rate": 9.096400195984322e-06, | |
| "loss": 0.4252, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.391304347826087, | |
| "grad_norm": 7.088600256578154, | |
| "learning_rate": 9.08484946505221e-06, | |
| "loss": 0.5793, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.3938618925831202, | |
| "grad_norm": 6.573409174775933, | |
| "learning_rate": 9.073232807363283e-06, | |
| "loss": 0.5026, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.39641943734015345, | |
| "grad_norm": 7.980317521312881, | |
| "learning_rate": 9.061550410402677e-06, | |
| "loss": 0.6736, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.3989769820971867, | |
| "grad_norm": 6.87810163307716, | |
| "learning_rate": 9.049802462716521e-06, | |
| "loss": 0.493, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.40153452685422, | |
| "grad_norm": 7.407535434490462, | |
| "learning_rate": 9.037989153908882e-06, | |
| "loss": 0.5762, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.4040920716112532, | |
| "grad_norm": 6.310545687817972, | |
| "learning_rate": 9.026110674638722e-06, | |
| "loss": 0.5802, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.40664961636828645, | |
| "grad_norm": 6.538278944739297, | |
| "learning_rate": 9.0141672166168e-06, | |
| "loss": 0.4665, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.4092071611253197, | |
| "grad_norm": 6.25525186329276, | |
| "learning_rate": 9.002158972602599e-06, | |
| "loss": 0.65, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.4117647058823529, | |
| "grad_norm": 7.134439997933502, | |
| "learning_rate": 8.990086136401199e-06, | |
| "loss": 0.6436, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.4143222506393862, | |
| "grad_norm": 6.907730334313879, | |
| "learning_rate": 8.977948902860154e-06, | |
| "loss": 0.6688, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.41687979539641945, | |
| "grad_norm": 5.60949076779962, | |
| "learning_rate": 8.965747467866355e-06, | |
| "loss": 0.4263, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.4194373401534527, | |
| "grad_norm": 7.153866044006984, | |
| "learning_rate": 8.953482028342853e-06, | |
| "loss": 0.707, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.4219948849104859, | |
| "grad_norm": 5.749873195369449, | |
| "learning_rate": 8.9411527822457e-06, | |
| "loss": 0.5522, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.42455242966751916, | |
| "grad_norm": 6.450113679037296, | |
| "learning_rate": 8.92875992856073e-06, | |
| "loss": 0.491, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.42710997442455245, | |
| "grad_norm": 7.10664387732353, | |
| "learning_rate": 8.916303667300373e-06, | |
| "loss": 0.5526, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.4296675191815857, | |
| "grad_norm": 5.908129226489756, | |
| "learning_rate": 8.903784199500412e-06, | |
| "loss": 0.4589, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.4322250639386189, | |
| "grad_norm": 6.036965674381112, | |
| "learning_rate": 8.89120172721674e-06, | |
| "loss": 0.5393, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.43478260869565216, | |
| "grad_norm": 6.557422143271899, | |
| "learning_rate": 8.8785564535221e-06, | |
| "loss": 0.4947, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.4373401534526854, | |
| "grad_norm": 7.091654604179938, | |
| "learning_rate": 8.86584858250281e-06, | |
| "loss": 0.5744, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.4398976982097187, | |
| "grad_norm": 6.839942726126082, | |
| "learning_rate": 8.853078319255466e-06, | |
| "loss": 0.5621, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.4424552429667519, | |
| "grad_norm": 6.705305452145543, | |
| "learning_rate": 8.840245869883635e-06, | |
| "loss": 0.6277, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.44501278772378516, | |
| "grad_norm": 6.410914353629197, | |
| "learning_rate": 8.827351441494525e-06, | |
| "loss": 0.5795, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.4475703324808184, | |
| "grad_norm": 6.178508928681474, | |
| "learning_rate": 8.814395242195642e-06, | |
| "loss": 0.5039, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.45012787723785164, | |
| "grad_norm": 5.283454576226612, | |
| "learning_rate": 8.80137748109144e-06, | |
| "loss": 0.4565, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.45268542199488493, | |
| "grad_norm": 5.78397157032685, | |
| "learning_rate": 8.78829836827993e-06, | |
| "loss": 0.5435, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.45524296675191817, | |
| "grad_norm": 5.014711664858047, | |
| "learning_rate": 8.77515811484931e-06, | |
| "loss": 0.4744, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.4578005115089514, | |
| "grad_norm": 5.673473816390766, | |
| "learning_rate": 8.761956932874539e-06, | |
| "loss": 0.4794, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.46035805626598464, | |
| "grad_norm": 6.548843702355434, | |
| "learning_rate": 8.748695035413925e-06, | |
| "loss": 0.5124, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.4629156010230179, | |
| "grad_norm": 5.294743068866496, | |
| "learning_rate": 8.735372636505681e-06, | |
| "loss": 0.4964, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.46547314578005117, | |
| "grad_norm": 5.307479046129796, | |
| "learning_rate": 8.72198995116448e-06, | |
| "loss": 0.4848, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.4680306905370844, | |
| "grad_norm": 5.862290813329295, | |
| "learning_rate": 8.708547195377968e-06, | |
| "loss": 0.6168, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.47058823529411764, | |
| "grad_norm": 6.660472406940894, | |
| "learning_rate": 8.695044586103297e-06, | |
| "loss": 0.6317, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.4731457800511509, | |
| "grad_norm": 6.239173715990654, | |
| "learning_rate": 8.68148234126361e-06, | |
| "loss": 0.5712, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.47570332480818417, | |
| "grad_norm": 6.855169313192307, | |
| "learning_rate": 8.667860679744529e-06, | |
| "loss": 0.4569, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.4782608695652174, | |
| "grad_norm": 6.098286103768463, | |
| "learning_rate": 8.65417982139062e-06, | |
| "loss": 0.5377, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.48081841432225064, | |
| "grad_norm": 6.687927077404218, | |
| "learning_rate": 8.640439987001855e-06, | |
| "loss": 0.5466, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.4833759590792839, | |
| "grad_norm": 5.06986215439764, | |
| "learning_rate": 8.626641398330027e-06, | |
| "loss": 0.4597, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.4859335038363171, | |
| "grad_norm": 6.615207228778788, | |
| "learning_rate": 8.612784278075195e-06, | |
| "loss": 0.6486, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.4884910485933504, | |
| "grad_norm": 8.206032710195597, | |
| "learning_rate": 8.598868849882074e-06, | |
| "loss": 0.53, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.49104859335038364, | |
| "grad_norm": 6.512201203410748, | |
| "learning_rate": 8.58489533833643e-06, | |
| "loss": 0.5075, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.4936061381074169, | |
| "grad_norm": 6.623708653660542, | |
| "learning_rate": 8.570863968961456e-06, | |
| "loss": 0.4697, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.4961636828644501, | |
| "grad_norm": 4.93093459028815, | |
| "learning_rate": 8.556774968214134e-06, | |
| "loss": 0.5169, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.49872122762148335, | |
| "grad_norm": 5.787452319450779, | |
| "learning_rate": 8.542628563481577e-06, | |
| "loss": 0.5482, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.5012787723785166, | |
| "grad_norm": 5.88850708880366, | |
| "learning_rate": 8.52842498307736e-06, | |
| "loss": 0.6134, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.5038363171355499, | |
| "grad_norm": 6.696557687225988, | |
| "learning_rate": 8.514164456237835e-06, | |
| "loss": 0.6447, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.5063938618925832, | |
| "grad_norm": 7.195386414110228, | |
| "learning_rate": 8.499847213118431e-06, | |
| "loss": 0.5117, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.5089514066496164, | |
| "grad_norm": 5.825208905742397, | |
| "learning_rate": 8.485473484789944e-06, | |
| "loss": 0.5152, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.5115089514066496, | |
| "grad_norm": 6.118462869888847, | |
| "learning_rate": 8.471043503234796e-06, | |
| "loss": 0.6536, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5140664961636828, | |
| "grad_norm": 7.551332041886624, | |
| "learning_rate": 8.45655750134331e-06, | |
| "loss": 0.6084, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.5166240409207161, | |
| "grad_norm": 5.225118027592022, | |
| "learning_rate": 8.442015712909926e-06, | |
| "loss": 0.4555, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.5191815856777494, | |
| "grad_norm": 5.3592959184851265, | |
| "learning_rate": 8.427418372629456e-06, | |
| "loss": 0.4821, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.5217391304347826, | |
| "grad_norm": 6.212770160868918, | |
| "learning_rate": 8.412765716093273e-06, | |
| "loss": 0.5149, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.5242966751918159, | |
| "grad_norm": 5.519259201547804, | |
| "learning_rate": 8.398057979785515e-06, | |
| "loss": 0.4876, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.5268542199488491, | |
| "grad_norm": 6.847844140759948, | |
| "learning_rate": 8.383295401079284e-06, | |
| "loss": 0.5245, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.5294117647058824, | |
| "grad_norm": 6.473882379231715, | |
| "learning_rate": 8.368478218232787e-06, | |
| "loss": 0.5319, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.5319693094629157, | |
| "grad_norm": 4.995240865011453, | |
| "learning_rate": 8.353606670385514e-06, | |
| "loss": 0.4201, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.5345268542199488, | |
| "grad_norm": 5.272197527549254, | |
| "learning_rate": 8.338680997554372e-06, | |
| "loss": 0.4832, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.5370843989769821, | |
| "grad_norm": 5.585738561949535, | |
| "learning_rate": 8.3237014406298e-06, | |
| "loss": 0.4929, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.5396419437340153, | |
| "grad_norm": 6.239332915949274, | |
| "learning_rate": 8.308668241371897e-06, | |
| "loss": 0.4171, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.5421994884910486, | |
| "grad_norm": 5.322513595323884, | |
| "learning_rate": 8.293581642406517e-06, | |
| "loss": 0.4073, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.5447570332480819, | |
| "grad_norm": 7.46575629890418, | |
| "learning_rate": 8.278441887221338e-06, | |
| "loss": 0.6626, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.5473145780051151, | |
| "grad_norm": 6.052661000824651, | |
| "learning_rate": 8.263249220161957e-06, | |
| "loss": 0.5068, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.5498721227621484, | |
| "grad_norm": 6.708569886061961, | |
| "learning_rate": 8.248003886427927e-06, | |
| "loss": 0.4966, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.5524296675191815, | |
| "grad_norm": 5.68545952396897, | |
| "learning_rate": 8.232706132068806e-06, | |
| "loss": 0.3861, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.5549872122762148, | |
| "grad_norm": 5.421452942064916, | |
| "learning_rate": 8.217356203980187e-06, | |
| "loss": 0.3885, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.5575447570332481, | |
| "grad_norm": 6.028892220556533, | |
| "learning_rate": 8.201954349899712e-06, | |
| "loss": 0.5848, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.5601023017902813, | |
| "grad_norm": 5.85865954700368, | |
| "learning_rate": 8.186500818403076e-06, | |
| "loss": 0.4014, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.5626598465473146, | |
| "grad_norm": 7.224336653003557, | |
| "learning_rate": 8.17099585890001e-06, | |
| "loss": 0.6191, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.5652173913043478, | |
| "grad_norm": 6.30772616724702, | |
| "learning_rate": 8.155439721630265e-06, | |
| "loss": 0.4756, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.5677749360613811, | |
| "grad_norm": 6.236289350658551, | |
| "learning_rate": 8.139832657659557e-06, | |
| "loss": 0.4964, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.5703324808184144, | |
| "grad_norm": 6.209791638515158, | |
| "learning_rate": 8.124174918875532e-06, | |
| "loss": 0.5958, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.5728900255754475, | |
| "grad_norm": 6.377139101398714, | |
| "learning_rate": 8.108466757983695e-06, | |
| "loss": 0.3906, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.5754475703324808, | |
| "grad_norm": 5.907378705805276, | |
| "learning_rate": 8.092708428503324e-06, | |
| "loss": 0.5376, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.578005115089514, | |
| "grad_norm": 6.743429798340147, | |
| "learning_rate": 8.076900184763394e-06, | |
| "loss": 0.4802, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.5805626598465473, | |
| "grad_norm": 5.376964104341389, | |
| "learning_rate": 8.061042281898453e-06, | |
| "loss": 0.4509, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.5831202046035806, | |
| "grad_norm": 6.511105613676549, | |
| "learning_rate": 8.04513497584452e-06, | |
| "loss": 0.4214, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.5856777493606138, | |
| "grad_norm": 6.852114152108356, | |
| "learning_rate": 8.02917852333495e-06, | |
| "loss": 0.6038, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.5882352941176471, | |
| "grad_norm": 5.572255565814968, | |
| "learning_rate": 8.013173181896283e-06, | |
| "loss": 0.5224, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.5907928388746803, | |
| "grad_norm": 7.0760399259778435, | |
| "learning_rate": 7.9971192098441e-06, | |
| "loss": 0.4502, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.5933503836317136, | |
| "grad_norm": 5.207560218712082, | |
| "learning_rate": 7.981016866278843e-06, | |
| "loss": 0.4027, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.5959079283887468, | |
| "grad_norm": 5.796635616276233, | |
| "learning_rate": 7.964866411081645e-06, | |
| "loss": 0.5675, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.59846547314578, | |
| "grad_norm": 5.708164631421739, | |
| "learning_rate": 7.94866810491012e-06, | |
| "loss": 0.4437, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.6010230179028133, | |
| "grad_norm": 6.436431203161013, | |
| "learning_rate": 7.93242220919417e-06, | |
| "loss": 0.5474, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.6035805626598465, | |
| "grad_norm": 5.574144310350591, | |
| "learning_rate": 7.916128986131761e-06, | |
| "loss": 0.5439, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.6061381074168798, | |
| "grad_norm": 4.8240456797819835, | |
| "learning_rate": 7.899788698684687e-06, | |
| "loss": 0.4686, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.6086956521739131, | |
| "grad_norm": 6.5914849870729055, | |
| "learning_rate": 7.883401610574338e-06, | |
| "loss": 0.5512, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.6112531969309463, | |
| "grad_norm": 4.739332602957458, | |
| "learning_rate": 7.866967986277423e-06, | |
| "loss": 0.4204, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.6138107416879796, | |
| "grad_norm": 6.116120073202256, | |
| "learning_rate": 7.850488091021726e-06, | |
| "loss": 0.5596, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.6163682864450127, | |
| "grad_norm": 6.299579832647148, | |
| "learning_rate": 7.833962190781809e-06, | |
| "loss": 0.5729, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.618925831202046, | |
| "grad_norm": 5.77832842987742, | |
| "learning_rate": 7.817390552274721e-06, | |
| "loss": 0.4062, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.6214833759590793, | |
| "grad_norm": 5.6928424134185365, | |
| "learning_rate": 7.800773442955703e-06, | |
| "loss": 0.562, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.6240409207161125, | |
| "grad_norm": 5.754032663780959, | |
| "learning_rate": 7.784111131013858e-06, | |
| "loss": 0.4763, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.6265984654731458, | |
| "grad_norm": 6.1254790208347, | |
| "learning_rate": 7.767403885367832e-06, | |
| "loss": 0.4931, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.629156010230179, | |
| "grad_norm": 6.088136454995643, | |
| "learning_rate": 7.750651975661471e-06, | |
| "loss": 0.5366, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.6317135549872123, | |
| "grad_norm": 6.525801581028963, | |
| "learning_rate": 7.733855672259472e-06, | |
| "loss": 0.5869, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.6342710997442456, | |
| "grad_norm": 5.142226783902718, | |
| "learning_rate": 7.717015246243012e-06, | |
| "loss": 0.4107, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.6368286445012787, | |
| "grad_norm": 5.884475685733821, | |
| "learning_rate": 7.700130969405377e-06, | |
| "loss": 0.5575, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.639386189258312, | |
| "grad_norm": 5.430956350007929, | |
| "learning_rate": 7.683203114247587e-06, | |
| "loss": 0.4316, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.6419437340153452, | |
| "grad_norm": 5.852470007876826, | |
| "learning_rate": 7.66623195397397e-06, | |
| "loss": 0.5228, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.6445012787723785, | |
| "grad_norm": 6.316931409524609, | |
| "learning_rate": 7.649217762487786e-06, | |
| "loss": 0.6069, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.6470588235294118, | |
| "grad_norm": 5.981156306158716, | |
| "learning_rate": 7.63216081438678e-06, | |
| "loss": 0.4525, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.649616368286445, | |
| "grad_norm": 6.193124213697377, | |
| "learning_rate": 7.615061384958764e-06, | |
| "loss": 0.6367, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.6521739130434783, | |
| "grad_norm": 6.145394633019291, | |
| "learning_rate": 7.597919750177168e-06, | |
| "loss": 0.5622, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.6547314578005116, | |
| "grad_norm": 6.076895232152138, | |
| "learning_rate": 7.580736186696593e-06, | |
| "loss": 0.5016, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.6572890025575447, | |
| "grad_norm": 5.562852949209647, | |
| "learning_rate": 7.563510971848339e-06, | |
| "loss": 0.5739, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.659846547314578, | |
| "grad_norm": 6.358261854476947, | |
| "learning_rate": 7.546244383635929e-06, | |
| "loss": 0.5783, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.6624040920716112, | |
| "grad_norm": 5.61873313563532, | |
| "learning_rate": 7.528936700730627e-06, | |
| "loss": 0.5671, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.6649616368286445, | |
| "grad_norm": 5.007471092297137, | |
| "learning_rate": 7.5115882024669375e-06, | |
| "loss": 0.4238, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.6675191815856778, | |
| "grad_norm": 5.02718344671977, | |
| "learning_rate": 7.494199168838099e-06, | |
| "loss": 0.431, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.670076726342711, | |
| "grad_norm": 5.822025275525143, | |
| "learning_rate": 7.476769880491561e-06, | |
| "loss": 0.555, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.6726342710997443, | |
| "grad_norm": 5.852672525450696, | |
| "learning_rate": 7.459300618724462e-06, | |
| "loss": 0.4537, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.6751918158567775, | |
| "grad_norm": 6.36830749484907, | |
| "learning_rate": 7.44179166547908e-06, | |
| "loss": 0.5466, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.6777493606138107, | |
| "grad_norm": 5.187641704740303, | |
| "learning_rate": 7.42424330333829e-06, | |
| "loss": 0.4966, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.680306905370844, | |
| "grad_norm": 5.862839321803861, | |
| "learning_rate": 7.406655815520998e-06, | |
| "loss": 0.4902, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.6828644501278772, | |
| "grad_norm": 6.529161660718858, | |
| "learning_rate": 7.389029485877577e-06, | |
| "loss": 0.493, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.6854219948849105, | |
| "grad_norm": 5.732050686572585, | |
| "learning_rate": 7.371364598885276e-06, | |
| "loss": 0.4744, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.6879795396419437, | |
| "grad_norm": 5.533174363200175, | |
| "learning_rate": 7.353661439643638e-06, | |
| "loss": 0.3833, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.690537084398977, | |
| "grad_norm": 5.768168615527615, | |
| "learning_rate": 7.335920293869891e-06, | |
| "loss": 0.423, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.6930946291560103, | |
| "grad_norm": 5.852266644103708, | |
| "learning_rate": 7.318141447894344e-06, | |
| "loss": 0.3371, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.6956521739130435, | |
| "grad_norm": 6.038563114564619, | |
| "learning_rate": 7.300325188655762e-06, | |
| "loss": 0.4891, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.6982097186700768, | |
| "grad_norm": 6.280803826327464, | |
| "learning_rate": 7.28247180369673e-06, | |
| "loss": 0.5385, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.7007672634271099, | |
| "grad_norm": 7.292365659382516, | |
| "learning_rate": 7.264581581159024e-06, | |
| "loss": 0.6148, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.7033248081841432, | |
| "grad_norm": 6.763240999324924, | |
| "learning_rate": 7.246654809778951e-06, | |
| "loss": 0.5272, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.7058823529411765, | |
| "grad_norm": 6.444401975777849, | |
| "learning_rate": 7.2286917788826926e-06, | |
| "loss": 0.4879, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.7084398976982097, | |
| "grad_norm": 5.037923525497081, | |
| "learning_rate": 7.210692778381634e-06, | |
| "loss": 0.5377, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.710997442455243, | |
| "grad_norm": 6.327806611970394, | |
| "learning_rate": 7.192658098767686e-06, | |
| "loss": 0.4654, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.7135549872122762, | |
| "grad_norm": 5.832786135763086, | |
| "learning_rate": 7.174588031108598e-06, | |
| "loss": 0.5921, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.7161125319693095, | |
| "grad_norm": 6.032098832742715, | |
| "learning_rate": 7.1564828670432595e-06, | |
| "loss": 0.5032, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.7186700767263428, | |
| "grad_norm": 7.528807903355475, | |
| "learning_rate": 7.138342898776989e-06, | |
| "loss": 0.5143, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.7212276214833759, | |
| "grad_norm": 5.662236290695636, | |
| "learning_rate": 7.120168419076825e-06, | |
| "loss": 0.5752, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.7237851662404092, | |
| "grad_norm": 5.922836277812778, | |
| "learning_rate": 7.101959721266798e-06, | |
| "loss": 0.5907, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.7263427109974424, | |
| "grad_norm": 6.258012070363337, | |
| "learning_rate": 7.083717099223192e-06, | |
| "loss": 0.5447, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.7289002557544757, | |
| "grad_norm": 5.261480296532744, | |
| "learning_rate": 7.0654408473698084e-06, | |
| "loss": 0.4521, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.731457800511509, | |
| "grad_norm": 5.918110722172615, | |
| "learning_rate": 7.047131260673214e-06, | |
| "loss": 0.4637, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.7340153452685422, | |
| "grad_norm": 5.741282290810403, | |
| "learning_rate": 7.0287886346379755e-06, | |
| "loss": 0.4131, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.7365728900255755, | |
| "grad_norm": 5.943919434881143, | |
| "learning_rate": 7.010413265301888e-06, | |
| "loss": 0.4712, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.7391304347826086, | |
| "grad_norm": 6.1059644383499885, | |
| "learning_rate": 6.9920054492312086e-06, | |
| "loss": 0.6022, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.7416879795396419, | |
| "grad_norm": 6.884474367848085, | |
| "learning_rate": 6.97356548351586e-06, | |
| "loss": 0.5212, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.7442455242966752, | |
| "grad_norm": 5.758493578440039, | |
| "learning_rate": 6.9550936657646386e-06, | |
| "loss": 0.507, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.7468030690537084, | |
| "grad_norm": 5.5122177192122415, | |
| "learning_rate": 6.936590294100414e-06, | |
| "loss": 0.4096, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.7493606138107417, | |
| "grad_norm": 6.529147733060143, | |
| "learning_rate": 6.918055667155311e-06, | |
| "loss": 0.4668, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.7519181585677749, | |
| "grad_norm": 5.580434227838566, | |
| "learning_rate": 6.899490084065897e-06, | |
| "loss": 0.4825, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.7544757033248082, | |
| "grad_norm": 6.141771398723171, | |
| "learning_rate": 6.8808938444683505e-06, | |
| "loss": 0.5189, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.7570332480818415, | |
| "grad_norm": 6.258445455734687, | |
| "learning_rate": 6.862267248493624e-06, | |
| "loss": 0.4217, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.7595907928388747, | |
| "grad_norm": 5.577447249480196, | |
| "learning_rate": 6.843610596762606e-06, | |
| "loss": 0.4574, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.7621483375959079, | |
| "grad_norm": 6.508949986966596, | |
| "learning_rate": 6.824924190381257e-06, | |
| "loss": 0.4512, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.7647058823529411, | |
| "grad_norm": 6.487835617711066, | |
| "learning_rate": 6.806208330935766e-06, | |
| "loss": 0.4817, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.7672634271099744, | |
| "grad_norm": 6.733508290939032, | |
| "learning_rate": 6.7874633204876705e-06, | |
| "loss": 0.4648, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7698209718670077, | |
| "grad_norm": 6.6391089444926195, | |
| "learning_rate": 6.768689461568987e-06, | |
| "loss": 0.4959, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.7723785166240409, | |
| "grad_norm": 5.426573235588597, | |
| "learning_rate": 6.7498870571773275e-06, | |
| "loss": 0.4101, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.7749360613810742, | |
| "grad_norm": 5.27403325499086, | |
| "learning_rate": 6.731056410771008e-06, | |
| "loss": 0.4183, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.7774936061381074, | |
| "grad_norm": 5.602097858442588, | |
| "learning_rate": 6.712197826264154e-06, | |
| "loss": 0.4712, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.7800511508951407, | |
| "grad_norm": 5.602023911663575, | |
| "learning_rate": 6.69331160802179e-06, | |
| "loss": 0.376, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.782608695652174, | |
| "grad_norm": 6.057331404811353, | |
| "learning_rate": 6.674398060854931e-06, | |
| "loss": 0.3333, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.7851662404092071, | |
| "grad_norm": 5.453910778706793, | |
| "learning_rate": 6.655457490015667e-06, | |
| "loss": 0.5251, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.7877237851662404, | |
| "grad_norm": 6.209463244054028, | |
| "learning_rate": 6.636490201192229e-06, | |
| "loss": 0.5256, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.7902813299232737, | |
| "grad_norm": 5.6125006489249145, | |
| "learning_rate": 6.617496500504056e-06, | |
| "loss": 0.35, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.7928388746803069, | |
| "grad_norm": 6.414498410153366, | |
| "learning_rate": 6.5984766944968636e-06, | |
| "loss": 0.5181, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.7953964194373402, | |
| "grad_norm": 6.101747662704522, | |
| "learning_rate": 6.579431090137681e-06, | |
| "loss": 0.4106, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.7979539641943734, | |
| "grad_norm": 6.462777333488606, | |
| "learning_rate": 6.560359994809916e-06, | |
| "loss": 0.6125, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.8005115089514067, | |
| "grad_norm": 5.9920053305051875, | |
| "learning_rate": 6.541263716308375e-06, | |
| "loss": 0.4968, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.80306905370844, | |
| "grad_norm": 6.671005371719509, | |
| "learning_rate": 6.522142562834307e-06, | |
| "loss": 0.5637, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.8056265984654731, | |
| "grad_norm": 5.361336122168199, | |
| "learning_rate": 6.502996842990431e-06, | |
| "loss": 0.4208, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.8081841432225064, | |
| "grad_norm": 5.670064103939166, | |
| "learning_rate": 6.483826865775941e-06, | |
| "loss": 0.5278, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.8107416879795396, | |
| "grad_norm": 5.5103998057715105, | |
| "learning_rate": 6.46463294058154e-06, | |
| "loss": 0.4007, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.8132992327365729, | |
| "grad_norm": 5.659095784663181, | |
| "learning_rate": 6.445415377184427e-06, | |
| "loss": 0.4742, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.8158567774936062, | |
| "grad_norm": 6.30132561670194, | |
| "learning_rate": 6.426174485743309e-06, | |
| "loss": 0.4078, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.8184143222506394, | |
| "grad_norm": 5.643268096385628, | |
| "learning_rate": 6.4069105767933944e-06, | |
| "loss": 0.46, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.8209718670076727, | |
| "grad_norm": 7.788725418859061, | |
| "learning_rate": 6.387623961241375e-06, | |
| "loss": 0.6119, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.8235294117647058, | |
| "grad_norm": 5.927896020719375, | |
| "learning_rate": 6.368314950360416e-06, | |
| "loss": 0.5225, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.8260869565217391, | |
| "grad_norm": 5.296955151964955, | |
| "learning_rate": 6.348983855785122e-06, | |
| "loss": 0.3126, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.8286445012787724, | |
| "grad_norm": 5.095733276738074, | |
| "learning_rate": 6.3296309895065215e-06, | |
| "loss": 0.3639, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.8312020460358056, | |
| "grad_norm": 6.080988913298908, | |
| "learning_rate": 6.310256663867019e-06, | |
| "loss": 0.5063, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.8337595907928389, | |
| "grad_norm": 7.036245894709906, | |
| "learning_rate": 6.290861191555359e-06, | |
| "loss": 0.4578, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.8363171355498721, | |
| "grad_norm": 5.580633409599807, | |
| "learning_rate": 6.271444885601583e-06, | |
| "loss": 0.4639, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.8388746803069054, | |
| "grad_norm": 6.523213064272758, | |
| "learning_rate": 6.252008059371968e-06, | |
| "loss": 0.4699, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.8414322250639387, | |
| "grad_norm": 4.85798591447732, | |
| "learning_rate": 6.2325510265639785e-06, | |
| "loss": 0.2973, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.8439897698209718, | |
| "grad_norm": 6.805564012992218, | |
| "learning_rate": 6.213074101201202e-06, | |
| "loss": 0.4894, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.8465473145780051, | |
| "grad_norm": 6.052583253032932, | |
| "learning_rate": 6.193577597628268e-06, | |
| "loss": 0.5193, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.8491048593350383, | |
| "grad_norm": 6.230894069829904, | |
| "learning_rate": 6.174061830505801e-06, | |
| "loss": 0.5028, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.8516624040920716, | |
| "grad_norm": 7.201079594756455, | |
| "learning_rate": 6.154527114805312e-06, | |
| "loss": 0.618, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.8542199488491049, | |
| "grad_norm": 6.600993850416883, | |
| "learning_rate": 6.1349737658041385e-06, | |
| "loss": 0.5133, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.8567774936061381, | |
| "grad_norm": 5.724822321191247, | |
| "learning_rate": 6.115402099080345e-06, | |
| "loss": 0.4838, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.8593350383631714, | |
| "grad_norm": 5.613108225355487, | |
| "learning_rate": 6.095812430507627e-06, | |
| "loss": 0.3442, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.8618925831202046, | |
| "grad_norm": 5.965477408637899, | |
| "learning_rate": 6.076205076250227e-06, | |
| "loss": 0.5109, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.8644501278772379, | |
| "grad_norm": 5.9857901067552, | |
| "learning_rate": 6.056580352757813e-06, | |
| "loss": 0.4511, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.8670076726342711, | |
| "grad_norm": 6.174131332105638, | |
| "learning_rate": 6.036938576760388e-06, | |
| "loss": 0.4419, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.8695652173913043, | |
| "grad_norm": 5.075847963553367, | |
| "learning_rate": 6.0172800652631706e-06, | |
| "loss": 0.3777, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.8721227621483376, | |
| "grad_norm": 7.190594951132575, | |
| "learning_rate": 5.997605135541472e-06, | |
| "loss": 0.5106, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.8746803069053708, | |
| "grad_norm": 6.747196680979683, | |
| "learning_rate": 5.977914105135594e-06, | |
| "loss": 0.4762, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.8772378516624041, | |
| "grad_norm": 5.714370912906624, | |
| "learning_rate": 5.9582072918456805e-06, | |
| "loss": 0.362, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.8797953964194374, | |
| "grad_norm": 5.236876408531075, | |
| "learning_rate": 5.938485013726612e-06, | |
| "loss": 0.3947, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.8823529411764706, | |
| "grad_norm": 5.116217278468624, | |
| "learning_rate": 5.918747589082853e-06, | |
| "loss": 0.4747, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.8849104859335039, | |
| "grad_norm": 4.749747087957306, | |
| "learning_rate": 5.898995336463326e-06, | |
| "loss": 0.4274, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.887468030690537, | |
| "grad_norm": 5.230418676152823, | |
| "learning_rate": 5.879228574656269e-06, | |
| "loss": 0.3441, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.8900255754475703, | |
| "grad_norm": 6.303242756857959, | |
| "learning_rate": 5.859447622684084e-06, | |
| "loss": 0.5131, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.8925831202046036, | |
| "grad_norm": 5.4435933577635645, | |
| "learning_rate": 5.839652799798197e-06, | |
| "loss": 0.4243, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.8951406649616368, | |
| "grad_norm": 6.921408500128556, | |
| "learning_rate": 5.819844425473899e-06, | |
| "loss": 0.5549, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.8976982097186701, | |
| "grad_norm": 6.635487149449039, | |
| "learning_rate": 5.800022819405194e-06, | |
| "loss": 0.5061, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.9002557544757033, | |
| "grad_norm": 6.3209563326259515, | |
| "learning_rate": 5.780188301499636e-06, | |
| "loss": 0.5999, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.9028132992327366, | |
| "grad_norm": 6.077422261762329, | |
| "learning_rate": 5.760341191873167e-06, | |
| "loss": 0.5111, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.9053708439897699, | |
| "grad_norm": 5.867259188193717, | |
| "learning_rate": 5.740481810844952e-06, | |
| "loss": 0.4771, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.907928388746803, | |
| "grad_norm": 4.777893746653604, | |
| "learning_rate": 5.720610478932211e-06, | |
| "loss": 0.3242, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.9104859335038363, | |
| "grad_norm": 5.764419825204445, | |
| "learning_rate": 5.700727516845038e-06, | |
| "loss": 0.3306, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.9130434782608695, | |
| "grad_norm": 5.510318712985209, | |
| "learning_rate": 5.680833245481234e-06, | |
| "loss": 0.4642, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.9156010230179028, | |
| "grad_norm": 6.53182819796998, | |
| "learning_rate": 5.660927985921122e-06, | |
| "loss": 0.51, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.9181585677749361, | |
| "grad_norm": 6.086318246788371, | |
| "learning_rate": 5.641012059422369e-06, | |
| "loss": 0.5472, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.9207161125319693, | |
| "grad_norm": 5.544550425971534, | |
| "learning_rate": 5.621085787414799e-06, | |
| "loss": 0.4603, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.9232736572890026, | |
| "grad_norm": 6.1043161575819616, | |
| "learning_rate": 5.601149491495206e-06, | |
| "loss": 0.485, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.9258312020460358, | |
| "grad_norm": 5.827986489165051, | |
| "learning_rate": 5.581203493422161e-06, | |
| "loss": 0.5864, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.928388746803069, | |
| "grad_norm": 5.172049016763376, | |
| "learning_rate": 5.561248115110822e-06, | |
| "loss": 0.4517, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.9309462915601023, | |
| "grad_norm": 6.640832021653832, | |
| "learning_rate": 5.541283678627742e-06, | |
| "loss": 0.3703, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.9335038363171355, | |
| "grad_norm": 5.361432485515485, | |
| "learning_rate": 5.521310506185661e-06, | |
| "loss": 0.4262, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.9360613810741688, | |
| "grad_norm": 5.808037599792696, | |
| "learning_rate": 5.501328920138314e-06, | |
| "loss": 0.544, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.9386189258312021, | |
| "grad_norm": 6.0541662716251095, | |
| "learning_rate": 5.481339242975227e-06, | |
| "loss": 0.4024, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.9411764705882353, | |
| "grad_norm": 6.232047697753629, | |
| "learning_rate": 5.46134179731651e-06, | |
| "loss": 0.4862, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.9437340153452686, | |
| "grad_norm": 6.000932213910604, | |
| "learning_rate": 5.441336905907653e-06, | |
| "loss": 0.4635, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.9462915601023018, | |
| "grad_norm": 7.611546241496742, | |
| "learning_rate": 5.421324891614312e-06, | |
| "loss": 0.4135, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.948849104859335, | |
| "grad_norm": 5.362330490202002, | |
| "learning_rate": 5.4013060774171055e-06, | |
| "loss": 0.4506, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.9514066496163683, | |
| "grad_norm": 5.346120483245015, | |
| "learning_rate": 5.3812807864063946e-06, | |
| "loss": 0.4576, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.9539641943734015, | |
| "grad_norm": 6.100965821026688, | |
| "learning_rate": 5.361249341777075e-06, | |
| "loss": 0.5165, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.9565217391304348, | |
| "grad_norm": 4.676613084116823, | |
| "learning_rate": 5.341212066823356e-06, | |
| "loss": 0.4383, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.959079283887468, | |
| "grad_norm": 6.564693244827939, | |
| "learning_rate": 5.321169284933543e-06, | |
| "loss": 0.5044, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.9616368286445013, | |
| "grad_norm": 4.752286756978166, | |
| "learning_rate": 5.3011213195848245e-06, | |
| "loss": 0.5422, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.9641943734015346, | |
| "grad_norm": 6.298216680263071, | |
| "learning_rate": 5.281068494338039e-06, | |
| "loss": 0.3751, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.9667519181585678, | |
| "grad_norm": 5.041453892793781, | |
| "learning_rate": 5.26101113283247e-06, | |
| "loss": 0.3732, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.969309462915601, | |
| "grad_norm": 6.072922237394072, | |
| "learning_rate": 5.240949558780605e-06, | |
| "loss": 0.4873, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.9718670076726342, | |
| "grad_norm": 5.712375997105542, | |
| "learning_rate": 5.220884095962924e-06, | |
| "loss": 0.4877, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.9744245524296675, | |
| "grad_norm": 5.5897510763688585, | |
| "learning_rate": 5.200815068222666e-06, | |
| "loss": 0.386, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.9769820971867008, | |
| "grad_norm": 6.160592176666778, | |
| "learning_rate": 5.1807427994606065e-06, | |
| "loss": 0.369, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.979539641943734, | |
| "grad_norm": 5.482848722330923, | |
| "learning_rate": 5.1606676136298305e-06, | |
| "loss": 0.4618, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.9820971867007673, | |
| "grad_norm": 5.96411837712334, | |
| "learning_rate": 5.140589834730503e-06, | |
| "loss": 0.4286, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.9846547314578005, | |
| "grad_norm": 6.021924443213883, | |
| "learning_rate": 5.120509786804635e-06, | |
| "loss": 0.4545, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.9872122762148338, | |
| "grad_norm": 4.860619712046072, | |
| "learning_rate": 5.100427793930862e-06, | |
| "loss": 0.4847, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.989769820971867, | |
| "grad_norm": 6.16249371180202, | |
| "learning_rate": 5.08034418021921e-06, | |
| "loss": 0.4119, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.9923273657289002, | |
| "grad_norm": 5.190813395638479, | |
| "learning_rate": 5.06025926980586e-06, | |
| "loss": 0.3609, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.9948849104859335, | |
| "grad_norm": 5.3092155597782025, | |
| "learning_rate": 5.040173386847926e-06, | |
| "loss": 0.3655, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.9974424552429667, | |
| "grad_norm": 5.7321767766000935, | |
| "learning_rate": 5.0200868555182155e-06, | |
| "loss": 0.5465, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 5.719257228559791, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4347, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 1.0025575447570332, | |
| "grad_norm": 4.2391486408050145, | |
| "learning_rate": 4.979913144481785e-06, | |
| "loss": 0.2179, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 1.0051150895140666, | |
| "grad_norm": 4.604106412801713, | |
| "learning_rate": 4.959826613152074e-06, | |
| "loss": 0.2675, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 1.0076726342710998, | |
| "grad_norm": 4.113700302207306, | |
| "learning_rate": 4.939740730194141e-06, | |
| "loss": 0.2586, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 1.010230179028133, | |
| "grad_norm": 4.894518417900452, | |
| "learning_rate": 4.919655819780792e-06, | |
| "loss": 0.278, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.0127877237851663, | |
| "grad_norm": 4.3903147870940815, | |
| "learning_rate": 4.899572206069138e-06, | |
| "loss": 0.2175, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 1.0153452685421995, | |
| "grad_norm": 4.395292367085451, | |
| "learning_rate": 4.879490213195366e-06, | |
| "loss": 0.2597, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 1.0179028132992327, | |
| "grad_norm": 4.399892144315872, | |
| "learning_rate": 4.8594101652694996e-06, | |
| "loss": 0.2806, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 1.020460358056266, | |
| "grad_norm": 5.345621237626395, | |
| "learning_rate": 4.839332386370171e-06, | |
| "loss": 0.2571, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 1.0230179028132993, | |
| "grad_norm": 4.533667138931473, | |
| "learning_rate": 4.819257200539394e-06, | |
| "loss": 0.2646, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.0255754475703325, | |
| "grad_norm": 5.037019368108468, | |
| "learning_rate": 4.799184931777337e-06, | |
| "loss": 0.1862, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 1.0281329923273657, | |
| "grad_norm": 5.628048303078724, | |
| "learning_rate": 4.779115904037079e-06, | |
| "loss": 0.239, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 1.030690537084399, | |
| "grad_norm": 6.657437275552853, | |
| "learning_rate": 4.759050441219395e-06, | |
| "loss": 0.2032, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 1.0332480818414322, | |
| "grad_norm": 7.066364647253674, | |
| "learning_rate": 4.738988867167531e-06, | |
| "loss": 0.2686, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 1.0358056265984654, | |
| "grad_norm": 6.242504066160711, | |
| "learning_rate": 4.718931505661961e-06, | |
| "loss": 0.2039, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.0383631713554988, | |
| "grad_norm": 7.053347520596888, | |
| "learning_rate": 4.698878680415176e-06, | |
| "loss": 0.2677, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 1.040920716112532, | |
| "grad_norm": 6.980982091790442, | |
| "learning_rate": 4.678830715066458e-06, | |
| "loss": 0.2986, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 1.0434782608695652, | |
| "grad_norm": 7.1587957747958715, | |
| "learning_rate": 4.6587879331766465e-06, | |
| "loss": 0.2742, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 1.0460358056265984, | |
| "grad_norm": 6.506842088570043, | |
| "learning_rate": 4.638750658222927e-06, | |
| "loss": 0.327, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 1.0485933503836318, | |
| "grad_norm": 5.343374587345889, | |
| "learning_rate": 4.618719213593605e-06, | |
| "loss": 0.2399, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.051150895140665, | |
| "grad_norm": 5.952604432226821, | |
| "learning_rate": 4.598693922582896e-06, | |
| "loss": 0.3362, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 1.0537084398976981, | |
| "grad_norm": 5.292017124731555, | |
| "learning_rate": 4.5786751083856895e-06, | |
| "loss": 0.2229, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 1.0562659846547315, | |
| "grad_norm": 5.7674633387114, | |
| "learning_rate": 4.558663094092348e-06, | |
| "loss": 0.2716, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 1.0588235294117647, | |
| "grad_norm": 4.654675739900228, | |
| "learning_rate": 4.53865820268349e-06, | |
| "loss": 0.2439, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 1.061381074168798, | |
| "grad_norm": 6.404992169914481, | |
| "learning_rate": 4.518660757024774e-06, | |
| "loss": 0.2713, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.0639386189258313, | |
| "grad_norm": 4.940754639892674, | |
| "learning_rate": 4.498671079861686e-06, | |
| "loss": 0.2225, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 1.0664961636828645, | |
| "grad_norm": 4.76891987704093, | |
| "learning_rate": 4.478689493814341e-06, | |
| "loss": 0.2676, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 1.0690537084398977, | |
| "grad_norm": 5.023886841031966, | |
| "learning_rate": 4.4587163213722595e-06, | |
| "loss": 0.2594, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 1.0716112531969308, | |
| "grad_norm": 4.184649476108494, | |
| "learning_rate": 4.438751884889179e-06, | |
| "loss": 0.1791, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 1.0741687979539642, | |
| "grad_norm": 5.029459043202698, | |
| "learning_rate": 4.41879650657784e-06, | |
| "loss": 0.2588, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.0767263427109974, | |
| "grad_norm": 4.795405784853634, | |
| "learning_rate": 4.398850508504795e-06, | |
| "loss": 0.2273, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 1.0792838874680306, | |
| "grad_norm": 4.137020254719259, | |
| "learning_rate": 4.3789142125852015e-06, | |
| "loss": 0.2682, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 1.081841432225064, | |
| "grad_norm": 5.222698810349911, | |
| "learning_rate": 4.358987940577631e-06, | |
| "loss": 0.2186, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 1.0843989769820972, | |
| "grad_norm": 5.743293194926402, | |
| "learning_rate": 4.339072014078879e-06, | |
| "loss": 0.1999, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 1.0869565217391304, | |
| "grad_norm": 5.052413496789277, | |
| "learning_rate": 4.319166754518768e-06, | |
| "loss": 0.2047, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.0895140664961638, | |
| "grad_norm": 6.345187477594701, | |
| "learning_rate": 4.299272483154963e-06, | |
| "loss": 0.314, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 1.092071611253197, | |
| "grad_norm": 5.440791188116355, | |
| "learning_rate": 4.27938952106779e-06, | |
| "loss": 0.1874, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 1.0946291560102301, | |
| "grad_norm": 4.59572793131487, | |
| "learning_rate": 4.259518189155049e-06, | |
| "loss": 0.2259, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 1.0971867007672633, | |
| "grad_norm": 7.1276467094843925, | |
| "learning_rate": 4.2396588081268355e-06, | |
| "loss": 0.2556, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 1.0997442455242967, | |
| "grad_norm": 6.0356348861027564, | |
| "learning_rate": 4.219811698500365e-06, | |
| "loss": 0.3379, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.10230179028133, | |
| "grad_norm": 5.364328953584096, | |
| "learning_rate": 4.199977180594807e-06, | |
| "loss": 0.1789, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 1.104859335038363, | |
| "grad_norm": 6.395865991308588, | |
| "learning_rate": 4.1801555745261025e-06, | |
| "loss": 0.3364, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 1.1074168797953965, | |
| "grad_norm": 5.307623822753343, | |
| "learning_rate": 4.160347200201804e-06, | |
| "loss": 0.3123, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 1.1099744245524297, | |
| "grad_norm": 5.8960456968886055, | |
| "learning_rate": 4.140552377315918e-06, | |
| "loss": 0.3115, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 1.1125319693094629, | |
| "grad_norm": 5.5625443910431915, | |
| "learning_rate": 4.120771425343733e-06, | |
| "loss": 0.2276, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.1150895140664963, | |
| "grad_norm": 5.922218273593377, | |
| "learning_rate": 4.101004663536675e-06, | |
| "loss": 0.2569, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 1.1176470588235294, | |
| "grad_norm": 5.263570887474491, | |
| "learning_rate": 4.081252410917148e-06, | |
| "loss": 0.2444, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 1.1202046035805626, | |
| "grad_norm": 6.181574590354267, | |
| "learning_rate": 4.061514986273391e-06, | |
| "loss": 0.251, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 1.1227621483375958, | |
| "grad_norm": 6.1580046071843935, | |
| "learning_rate": 4.041792708154321e-06, | |
| "loss": 0.2596, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 1.1253196930946292, | |
| "grad_norm": 5.458696870991704, | |
| "learning_rate": 4.022085894864408e-06, | |
| "loss": 0.2237, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.1278772378516624, | |
| "grad_norm": 4.70048293838936, | |
| "learning_rate": 4.0023948644585294e-06, | |
| "loss": 0.2476, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 1.1304347826086956, | |
| "grad_norm": 4.466773072234314, | |
| "learning_rate": 3.982719934736832e-06, | |
| "loss": 0.2, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 1.132992327365729, | |
| "grad_norm": 4.27013775097264, | |
| "learning_rate": 3.963061423239612e-06, | |
| "loss": 0.1978, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 1.1355498721227621, | |
| "grad_norm": 5.689677983167152, | |
| "learning_rate": 3.943419647242189e-06, | |
| "loss": 0.3034, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 1.1381074168797953, | |
| "grad_norm": 4.25975811296343, | |
| "learning_rate": 3.923794923749775e-06, | |
| "loss": 0.2187, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.1406649616368287, | |
| "grad_norm": 4.876551747060212, | |
| "learning_rate": 3.904187569492373e-06, | |
| "loss": 0.257, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 1.143222506393862, | |
| "grad_norm": 7.4635507464539765, | |
| "learning_rate": 3.884597900919656e-06, | |
| "loss": 0.2295, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 1.145780051150895, | |
| "grad_norm": 5.715365590596455, | |
| "learning_rate": 3.865026234195863e-06, | |
| "loss": 0.2771, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 1.1483375959079285, | |
| "grad_norm": 5.275576356708176, | |
| "learning_rate": 3.8454728851946885e-06, | |
| "loss": 0.1754, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 1.1508951406649617, | |
| "grad_norm": 5.062656319020689, | |
| "learning_rate": 3.8259381694942e-06, | |
| "loss": 0.2382, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.1534526854219949, | |
| "grad_norm": 5.798164920877668, | |
| "learning_rate": 3.806422402371733e-06, | |
| "loss": 0.288, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 1.156010230179028, | |
| "grad_norm": 5.272021205396943, | |
| "learning_rate": 3.786925898798801e-06, | |
| "loss": 0.2654, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 1.1585677749360614, | |
| "grad_norm": 4.739102069258541, | |
| "learning_rate": 3.767448973436021e-06, | |
| "loss": 0.1996, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 1.1611253196930946, | |
| "grad_norm": 5.933924764643274, | |
| "learning_rate": 3.7479919406280334e-06, | |
| "loss": 0.3014, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 1.1636828644501278, | |
| "grad_norm": 5.230108631129741, | |
| "learning_rate": 3.728555114398419e-06, | |
| "loss": 0.2306, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 1.1662404092071612, | |
| "grad_norm": 5.1311748950507905, | |
| "learning_rate": 3.709138808444641e-06, | |
| "loss": 0.2426, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 1.1687979539641944, | |
| "grad_norm": 4.8518342992156365, | |
| "learning_rate": 3.689743336132982e-06, | |
| "loss": 0.148, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 1.1713554987212276, | |
| "grad_norm": 4.141242492696908, | |
| "learning_rate": 3.6703690104934806e-06, | |
| "loss": 0.1969, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 1.1739130434782608, | |
| "grad_norm": 5.575504964227216, | |
| "learning_rate": 3.6510161442148783e-06, | |
| "loss": 0.2847, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 1.1764705882352942, | |
| "grad_norm": 5.205832366429269, | |
| "learning_rate": 3.6316850496395863e-06, | |
| "loss": 0.3003, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.1790281329923273, | |
| "grad_norm": 5.640735065432005, | |
| "learning_rate": 3.6123760387586265e-06, | |
| "loss": 0.2454, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 1.1815856777493605, | |
| "grad_norm": 5.129282050106555, | |
| "learning_rate": 3.5930894232066072e-06, | |
| "loss": 0.1335, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 1.184143222506394, | |
| "grad_norm": 5.640092025988574, | |
| "learning_rate": 3.5738255142566912e-06, | |
| "loss": 0.2181, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 1.186700767263427, | |
| "grad_norm": 5.386833673551597, | |
| "learning_rate": 3.5545846228155743e-06, | |
| "loss": 0.2176, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 1.1892583120204603, | |
| "grad_norm": 5.704105112383357, | |
| "learning_rate": 3.5353670594184623e-06, | |
| "loss": 0.2497, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.1918158567774937, | |
| "grad_norm": 5.93251011789455, | |
| "learning_rate": 3.516173134224059e-06, | |
| "loss": 0.3096, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 1.1943734015345269, | |
| "grad_norm": 5.135070353719214, | |
| "learning_rate": 3.4970031570095707e-06, | |
| "loss": 0.2201, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 1.19693094629156, | |
| "grad_norm": 5.865866790423223, | |
| "learning_rate": 3.477857437165694e-06, | |
| "loss": 0.2694, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 1.1994884910485935, | |
| "grad_norm": 6.268703077398429, | |
| "learning_rate": 3.458736283691626e-06, | |
| "loss": 0.2555, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 1.2020460358056266, | |
| "grad_norm": 6.588443786412173, | |
| "learning_rate": 3.4396400051900846e-06, | |
| "loss": 0.2546, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.2046035805626598, | |
| "grad_norm": 5.327149282563148, | |
| "learning_rate": 3.4205689098623195e-06, | |
| "loss": 0.1783, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 1.207161125319693, | |
| "grad_norm": 4.958795590477734, | |
| "learning_rate": 3.401523305503139e-06, | |
| "loss": 0.1755, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 1.2097186700767264, | |
| "grad_norm": 5.8814512868036735, | |
| "learning_rate": 3.3825034994959445e-06, | |
| "loss": 0.2751, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 1.2122762148337596, | |
| "grad_norm": 6.346180998183863, | |
| "learning_rate": 3.3635097988077724e-06, | |
| "loss": 0.2825, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 1.2148337595907928, | |
| "grad_norm": 4.824141112150008, | |
| "learning_rate": 3.3445425099843343e-06, | |
| "loss": 0.193, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 1.2173913043478262, | |
| "grad_norm": 5.212763506921242, | |
| "learning_rate": 3.3256019391450696e-06, | |
| "loss": 0.2539, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 1.2199488491048593, | |
| "grad_norm": 5.731843763443905, | |
| "learning_rate": 3.3066883919782116e-06, | |
| "loss": 0.2647, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 1.2225063938618925, | |
| "grad_norm": 5.069916401820976, | |
| "learning_rate": 3.287802173735848e-06, | |
| "loss": 0.2466, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 1.2250639386189257, | |
| "grad_norm": 5.351892499599417, | |
| "learning_rate": 3.268943589228992e-06, | |
| "loss": 0.2369, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 1.227621483375959, | |
| "grad_norm": 5.226527301587174, | |
| "learning_rate": 3.250112942822673e-06, | |
| "loss": 0.2063, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.2301790281329923, | |
| "grad_norm": 5.599367983952576, | |
| "learning_rate": 3.231310538431015e-06, | |
| "loss": 0.3108, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 1.2327365728900257, | |
| "grad_norm": 4.949232012109662, | |
| "learning_rate": 3.212536679512332e-06, | |
| "loss": 0.2778, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 1.2352941176470589, | |
| "grad_norm": 4.692407945943108, | |
| "learning_rate": 3.1937916690642356e-06, | |
| "loss": 0.2972, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 1.237851662404092, | |
| "grad_norm": 4.946087662063809, | |
| "learning_rate": 3.1750758096187446e-06, | |
| "loss": 0.2155, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 1.2404092071611252, | |
| "grad_norm": 4.8219004232287, | |
| "learning_rate": 3.1563894032373977e-06, | |
| "loss": 0.22, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 1.2429667519181586, | |
| "grad_norm": 5.621400027096903, | |
| "learning_rate": 3.137732751506376e-06, | |
| "loss": 0.2286, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 1.2455242966751918, | |
| "grad_norm": 5.479705903908969, | |
| "learning_rate": 3.1191061555316503e-06, | |
| "loss": 0.2534, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 1.248081841432225, | |
| "grad_norm": 5.794553272942662, | |
| "learning_rate": 3.1005099159341044e-06, | |
| "loss": 0.2618, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 1.2506393861892584, | |
| "grad_norm": 6.017425638082386, | |
| "learning_rate": 3.08194433284469e-06, | |
| "loss": 0.2843, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 1.2531969309462916, | |
| "grad_norm": 4.5326844807976805, | |
| "learning_rate": 3.0634097058995877e-06, | |
| "loss": 0.1928, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.2557544757033248, | |
| "grad_norm": 6.4463104104564115, | |
| "learning_rate": 3.0449063342353635e-06, | |
| "loss": 0.2478, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 1.258312020460358, | |
| "grad_norm": 4.5887061422293565, | |
| "learning_rate": 3.0264345164841426e-06, | |
| "loss": 0.204, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 1.2608695652173914, | |
| "grad_norm": 5.508160093203401, | |
| "learning_rate": 3.007994550768793e-06, | |
| "loss": 0.2159, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 1.2634271099744245, | |
| "grad_norm": 4.737111003577314, | |
| "learning_rate": 2.989586734698113e-06, | |
| "loss": 0.2093, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 1.265984654731458, | |
| "grad_norm": 4.947584431847608, | |
| "learning_rate": 2.971211365362028e-06, | |
| "loss": 0.1984, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 1.2685421994884911, | |
| "grad_norm": 4.800447464091687, | |
| "learning_rate": 2.9528687393267865e-06, | |
| "loss": 0.2396, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 1.2710997442455243, | |
| "grad_norm": 5.424287981614177, | |
| "learning_rate": 2.934559152630192e-06, | |
| "loss": 0.2752, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 1.2736572890025575, | |
| "grad_norm": 4.885472933623559, | |
| "learning_rate": 2.9162829007768103e-06, | |
| "loss": 0.2778, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 1.2762148337595907, | |
| "grad_norm": 5.051334757075423, | |
| "learning_rate": 2.898040278733203e-06, | |
| "loss": 0.2728, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 1.278772378516624, | |
| "grad_norm": 5.103121044913984, | |
| "learning_rate": 2.879831580923176e-06, | |
| "loss": 0.2276, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.278772378516624, | |
| "eval_loss": 0.4383206367492676, | |
| "eval_runtime": 0.9493, | |
| "eval_samples_per_second": 33.71, | |
| "eval_steps_per_second": 8.428, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.2813299232736572, | |
| "grad_norm": 6.302964407184529, | |
| "learning_rate": 2.8616571012230134e-06, | |
| "loss": 0.324, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 1.2838874680306906, | |
| "grad_norm": 4.570682847412226, | |
| "learning_rate": 2.843517132956742e-06, | |
| "loss": 0.2638, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 1.2864450127877238, | |
| "grad_norm": 5.1557886110649065, | |
| "learning_rate": 2.8254119688914017e-06, | |
| "loss": 0.2901, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 1.289002557544757, | |
| "grad_norm": 4.59205694943921, | |
| "learning_rate": 2.8073419012323154e-06, | |
| "loss": 0.246, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 1.2915601023017902, | |
| "grad_norm": 5.220862830802117, | |
| "learning_rate": 2.789307221618369e-06, | |
| "loss": 0.2665, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 1.2941176470588236, | |
| "grad_norm": 5.6445669568957895, | |
| "learning_rate": 2.771308221117309e-06, | |
| "loss": 0.2565, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 1.2966751918158568, | |
| "grad_norm": 5.578642462958587, | |
| "learning_rate": 2.7533451902210512e-06, | |
| "loss": 0.2805, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 1.29923273657289, | |
| "grad_norm": 5.9960066898200814, | |
| "learning_rate": 2.7354184188409773e-06, | |
| "loss": 0.3228, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 1.3017902813299234, | |
| "grad_norm": 4.257760920803555, | |
| "learning_rate": 2.71752819630327e-06, | |
| "loss": 0.1833, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 1.3043478260869565, | |
| "grad_norm": 6.1683068408323605, | |
| "learning_rate": 2.6996748113442397e-06, | |
| "loss": 0.185, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.3069053708439897, | |
| "grad_norm": 5.042423224754131, | |
| "learning_rate": 2.6818585521056573e-06, | |
| "loss": 0.241, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 1.309462915601023, | |
| "grad_norm": 4.727233078424547, | |
| "learning_rate": 2.66407970613011e-06, | |
| "loss": 0.1559, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 1.3120204603580563, | |
| "grad_norm": 5.0241427635079505, | |
| "learning_rate": 2.646338560356363e-06, | |
| "loss": 0.1877, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 1.3145780051150895, | |
| "grad_norm": 6.126061319891909, | |
| "learning_rate": 2.6286354011147252e-06, | |
| "loss": 0.2004, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 1.317135549872123, | |
| "grad_norm": 5.24912663925928, | |
| "learning_rate": 2.6109705141224255e-06, | |
| "loss": 0.2364, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 1.319693094629156, | |
| "grad_norm": 6.034124083029447, | |
| "learning_rate": 2.593344184479003e-06, | |
| "loss": 0.2788, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 1.3222506393861893, | |
| "grad_norm": 6.608163697606231, | |
| "learning_rate": 2.575756696661713e-06, | |
| "loss": 0.2416, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 1.3248081841432224, | |
| "grad_norm": 4.758894846786754, | |
| "learning_rate": 2.5582083345209217e-06, | |
| "loss": 0.2124, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 1.3273657289002558, | |
| "grad_norm": 5.719504905312417, | |
| "learning_rate": 2.540699381275539e-06, | |
| "loss": 0.297, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 1.329923273657289, | |
| "grad_norm": 5.86724061218003, | |
| "learning_rate": 2.5232301195084395e-06, | |
| "loss": 0.3234, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.3324808184143222, | |
| "grad_norm": 5.7644650842566945, | |
| "learning_rate": 2.5058008311619035e-06, | |
| "loss": 0.2615, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 1.3350383631713556, | |
| "grad_norm": 5.745064713338971, | |
| "learning_rate": 2.488411797533064e-06, | |
| "loss": 0.2118, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 1.3375959079283888, | |
| "grad_norm": 4.576869292927031, | |
| "learning_rate": 2.4710632992693737e-06, | |
| "loss": 0.2278, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 1.340153452685422, | |
| "grad_norm": 5.037272698300459, | |
| "learning_rate": 2.4537556163640726e-06, | |
| "loss": 0.236, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 1.3427109974424551, | |
| "grad_norm": 5.421366472711992, | |
| "learning_rate": 2.4364890281516633e-06, | |
| "loss": 0.2577, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 1.3452685421994885, | |
| "grad_norm": 4.866078798724115, | |
| "learning_rate": 2.4192638133034074e-06, | |
| "loss": 0.2318, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 1.3478260869565217, | |
| "grad_norm": 4.7964708165794265, | |
| "learning_rate": 2.4020802498228333e-06, | |
| "loss": 0.2054, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 1.350383631713555, | |
| "grad_norm": 4.928572310449122, | |
| "learning_rate": 2.384938615041238e-06, | |
| "loss": 0.2681, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 1.3529411764705883, | |
| "grad_norm": 4.737745386088544, | |
| "learning_rate": 2.3678391856132203e-06, | |
| "loss": 0.23, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 1.3554987212276215, | |
| "grad_norm": 4.546792180996154, | |
| "learning_rate": 2.350782237512215e-06, | |
| "loss": 0.2158, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.3580562659846547, | |
| "grad_norm": 5.14316611376246, | |
| "learning_rate": 2.3337680460260314e-06, | |
| "loss": 0.196, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 1.3606138107416879, | |
| "grad_norm": 5.834646238531157, | |
| "learning_rate": 2.316796885752415e-06, | |
| "loss": 0.2587, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 1.3631713554987213, | |
| "grad_norm": 5.780876751105411, | |
| "learning_rate": 2.299869030594622e-06, | |
| "loss": 0.1942, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 1.3657289002557544, | |
| "grad_norm": 4.548997507254711, | |
| "learning_rate": 2.2829847537569904e-06, | |
| "loss": 0.176, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 1.3682864450127878, | |
| "grad_norm": 5.731302369481431, | |
| "learning_rate": 2.266144327740531e-06, | |
| "loss": 0.3019, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 1.370843989769821, | |
| "grad_norm": 5.073679478403773, | |
| "learning_rate": 2.2493480243385298e-06, | |
| "loss": 0.2065, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 1.3734015345268542, | |
| "grad_norm": 5.850299158046723, | |
| "learning_rate": 2.2325961146321683e-06, | |
| "loss": 0.2714, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 1.3759590792838874, | |
| "grad_norm": 5.7360121373777035, | |
| "learning_rate": 2.2158888689861434e-06, | |
| "loss": 0.2249, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 1.3785166240409208, | |
| "grad_norm": 4.828788116522103, | |
| "learning_rate": 2.1992265570442974e-06, | |
| "loss": 0.2304, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 1.381074168797954, | |
| "grad_norm": 5.160394779683274, | |
| "learning_rate": 2.182609447725279e-06, | |
| "loss": 0.2259, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.3836317135549872, | |
| "grad_norm": 5.290912122436337, | |
| "learning_rate": 2.1660378092181935e-06, | |
| "loss": 0.2104, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 1.3861892583120206, | |
| "grad_norm": 5.983793408233051, | |
| "learning_rate": 2.149511908978275e-06, | |
| "loss": 0.2372, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 1.3887468030690537, | |
| "grad_norm": 5.146327357469625, | |
| "learning_rate": 2.1330320137225773e-06, | |
| "loss": 0.2594, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 1.391304347826087, | |
| "grad_norm": 6.1200616170554465, | |
| "learning_rate": 2.1165983894256647e-06, | |
| "loss": 0.2858, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 1.39386189258312, | |
| "grad_norm": 5.18913029029246, | |
| "learning_rate": 2.100211301315315e-06, | |
| "loss": 0.1924, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 1.3964194373401535, | |
| "grad_norm": 5.568160243169466, | |
| "learning_rate": 2.0838710138682412e-06, | |
| "loss": 0.193, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 1.3989769820971867, | |
| "grad_norm": 4.998024067501082, | |
| "learning_rate": 2.0675777908058307e-06, | |
| "loss": 0.2753, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 1.40153452685422, | |
| "grad_norm": 5.644184048080268, | |
| "learning_rate": 2.051331895089882e-06, | |
| "loss": 0.2865, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 1.4040920716112533, | |
| "grad_norm": 5.40264585527916, | |
| "learning_rate": 2.035133588918356e-06, | |
| "loss": 0.183, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 1.4066496163682864, | |
| "grad_norm": 4.477185903868405, | |
| "learning_rate": 2.0189831337211573e-06, | |
| "loss": 0.1957, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.4092071611253196, | |
| "grad_norm": 5.199329102721851, | |
| "learning_rate": 2.0028807901559027e-06, | |
| "loss": 0.2067, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 1.4117647058823528, | |
| "grad_norm": 4.040859440248337, | |
| "learning_rate": 1.9868268181037186e-06, | |
| "loss": 0.188, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 1.4143222506393862, | |
| "grad_norm": 4.554963712348274, | |
| "learning_rate": 1.970821476665051e-06, | |
| "loss": 0.1872, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 1.4168797953964194, | |
| "grad_norm": 5.430125296226688, | |
| "learning_rate": 1.9548650241554812e-06, | |
| "loss": 0.2154, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 1.4194373401534528, | |
| "grad_norm": 5.328931412586536, | |
| "learning_rate": 1.9389577181015496e-06, | |
| "loss": 0.1755, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 1.421994884910486, | |
| "grad_norm": 4.249854605239616, | |
| "learning_rate": 1.923099815236608e-06, | |
| "loss": 0.1593, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 1.4245524296675192, | |
| "grad_norm": 3.907795738528054, | |
| "learning_rate": 1.9072915714966761e-06, | |
| "loss": 0.1251, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 1.4271099744245523, | |
| "grad_norm": 5.529382370073762, | |
| "learning_rate": 1.8915332420163074e-06, | |
| "loss": 0.1817, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 1.4296675191815857, | |
| "grad_norm": 5.035709320601147, | |
| "learning_rate": 1.8758250811244682e-06, | |
| "loss": 0.2079, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 1.432225063938619, | |
| "grad_norm": 4.581843899498873, | |
| "learning_rate": 1.8601673423404449e-06, | |
| "loss": 0.2037, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.434782608695652, | |
| "grad_norm": 6.5976247360019045, | |
| "learning_rate": 1.8445602783697375e-06, | |
| "loss": 0.2877, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 1.4373401534526855, | |
| "grad_norm": 4.959100730308061, | |
| "learning_rate": 1.8290041410999893e-06, | |
| "loss": 0.1957, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 1.4398976982097187, | |
| "grad_norm": 5.169599476739708, | |
| "learning_rate": 1.8134991815969238e-06, | |
| "loss": 0.2358, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 1.4424552429667519, | |
| "grad_norm": 5.894324327311631, | |
| "learning_rate": 1.798045650100289e-06, | |
| "loss": 0.224, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 1.445012787723785, | |
| "grad_norm": 5.802424880479901, | |
| "learning_rate": 1.782643796019814e-06, | |
| "loss": 0.2429, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 1.4475703324808185, | |
| "grad_norm": 4.922016237101965, | |
| "learning_rate": 1.7672938679311957e-06, | |
| "loss": 0.2266, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 1.4501278772378516, | |
| "grad_norm": 5.755816715889569, | |
| "learning_rate": 1.7519961135720737e-06, | |
| "loss": 0.2376, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 1.452685421994885, | |
| "grad_norm": 5.220912348487513, | |
| "learning_rate": 1.736750779838044e-06, | |
| "loss": 0.2216, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 1.4552429667519182, | |
| "grad_norm": 5.210521733834325, | |
| "learning_rate": 1.7215581127786624e-06, | |
| "loss": 0.2615, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 1.4578005115089514, | |
| "grad_norm": 5.082754952247903, | |
| "learning_rate": 1.7064183575934856e-06, | |
| "loss": 0.2395, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.4603580562659846, | |
| "grad_norm": 4.668118713570096, | |
| "learning_rate": 1.6913317586281048e-06, | |
| "loss": 0.1761, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 1.4629156010230178, | |
| "grad_norm": 5.582283863667842, | |
| "learning_rate": 1.676298559370202e-06, | |
| "loss": 0.2342, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 1.4654731457800512, | |
| "grad_norm": 5.88971284160264, | |
| "learning_rate": 1.6613190024456293e-06, | |
| "loss": 0.3086, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 1.4680306905370843, | |
| "grad_norm": 5.053648614417428, | |
| "learning_rate": 1.6463933296144863e-06, | |
| "loss": 0.2169, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 1.4705882352941178, | |
| "grad_norm": 5.3893999941494055, | |
| "learning_rate": 1.6315217817672142e-06, | |
| "loss": 0.2483, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 1.473145780051151, | |
| "grad_norm": 6.003001211927369, | |
| "learning_rate": 1.6167045989207185e-06, | |
| "loss": 0.2488, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 1.4757033248081841, | |
| "grad_norm": 5.8557134317975965, | |
| "learning_rate": 1.6019420202144853e-06, | |
| "loss": 0.2349, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 1.4782608695652173, | |
| "grad_norm": 5.263808302802599, | |
| "learning_rate": 1.5872342839067305e-06, | |
| "loss": 0.1857, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 1.4808184143222507, | |
| "grad_norm": 4.433242818261617, | |
| "learning_rate": 1.5725816273705453e-06, | |
| "loss": 0.1888, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 1.4833759590792839, | |
| "grad_norm": 4.993905414062928, | |
| "learning_rate": 1.5579842870900746e-06, | |
| "loss": 0.2154, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.485933503836317, | |
| "grad_norm": 4.381712629412114, | |
| "learning_rate": 1.5434424986566938e-06, | |
| "loss": 0.222, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 1.4884910485933505, | |
| "grad_norm": 4.174610676856918, | |
| "learning_rate": 1.5289564967652033e-06, | |
| "loss": 0.1991, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 1.4910485933503836, | |
| "grad_norm": 4.819672292661032, | |
| "learning_rate": 1.5145265152100574e-06, | |
| "loss": 0.2425, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 1.4936061381074168, | |
| "grad_norm": 5.297609781727035, | |
| "learning_rate": 1.5001527868815702e-06, | |
| "loss": 0.3006, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 1.49616368286445, | |
| "grad_norm": 5.79687615069299, | |
| "learning_rate": 1.4858355437621663e-06, | |
| "loss": 0.27, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 1.4987212276214834, | |
| "grad_norm": 4.762605653912011, | |
| "learning_rate": 1.4715750169226417e-06, | |
| "loss": 0.2548, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 1.5012787723785166, | |
| "grad_norm": 5.7198048630752725, | |
| "learning_rate": 1.457371436518424e-06, | |
| "loss": 0.2594, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 1.50383631713555, | |
| "grad_norm": 4.802403477803473, | |
| "learning_rate": 1.4432250317858675e-06, | |
| "loss": 0.19, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 1.5063938618925832, | |
| "grad_norm": 6.635420914435053, | |
| "learning_rate": 1.4291360310385455e-06, | |
| "loss": 0.2405, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 1.5089514066496164, | |
| "grad_norm": 4.539889740139929, | |
| "learning_rate": 1.4151046616635727e-06, | |
| "loss": 0.1389, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.5115089514066495, | |
| "grad_norm": 5.38811895384659, | |
| "learning_rate": 1.4011311501179287e-06, | |
| "loss": 0.2662, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 1.5140664961636827, | |
| "grad_norm": 5.109717895825474, | |
| "learning_rate": 1.3872157219248045e-06, | |
| "loss": 0.2043, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 1.5166240409207161, | |
| "grad_norm": 5.0959884186276625, | |
| "learning_rate": 1.373358601669973e-06, | |
| "loss": 0.1719, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 1.5191815856777495, | |
| "grad_norm": 5.469942867350689, | |
| "learning_rate": 1.3595600129981469e-06, | |
| "loss": 0.1644, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 1.5217391304347827, | |
| "grad_norm": 3.8395720981267885, | |
| "learning_rate": 1.3458201786093795e-06, | |
| "loss": 0.1706, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 1.5242966751918159, | |
| "grad_norm": 5.518424328264906, | |
| "learning_rate": 1.3321393202554739e-06, | |
| "loss": 0.23, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 1.526854219948849, | |
| "grad_norm": 6.325750802451315, | |
| "learning_rate": 1.3185176587363919e-06, | |
| "loss": 0.3093, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 1.5294117647058822, | |
| "grad_norm": 5.306860678214338, | |
| "learning_rate": 1.3049554138967052e-06, | |
| "loss": 0.1669, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 1.5319693094629157, | |
| "grad_norm": 5.531828559483773, | |
| "learning_rate": 1.2914528046220332e-06, | |
| "loss": 0.2521, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 1.5345268542199488, | |
| "grad_norm": 4.928627869404515, | |
| "learning_rate": 1.278010048835523e-06, | |
| "loss": 0.1606, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.5370843989769822, | |
| "grad_norm": 4.736176509824429, | |
| "learning_rate": 1.2646273634943195e-06, | |
| "loss": 0.2382, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 1.5396419437340154, | |
| "grad_norm": 5.273813096512506, | |
| "learning_rate": 1.2513049645860759e-06, | |
| "loss": 0.1809, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 1.5421994884910486, | |
| "grad_norm": 5.824236440729651, | |
| "learning_rate": 1.2380430671254618e-06, | |
| "loss": 0.2194, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 1.5447570332480818, | |
| "grad_norm": 5.0892118395960395, | |
| "learning_rate": 1.224841885150691e-06, | |
| "loss": 0.2326, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 1.547314578005115, | |
| "grad_norm": 4.776070837603081, | |
| "learning_rate": 1.2117016317200702e-06, | |
| "loss": 0.202, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 1.5498721227621484, | |
| "grad_norm": 4.827682184305226, | |
| "learning_rate": 1.1986225189085627e-06, | |
| "loss": 0.2047, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 1.5524296675191815, | |
| "grad_norm": 6.354947289868109, | |
| "learning_rate": 1.185604757804359e-06, | |
| "loss": 0.2489, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 1.554987212276215, | |
| "grad_norm": 6.389932140184679, | |
| "learning_rate": 1.172648558505477e-06, | |
| "loss": 0.2039, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 1.5575447570332481, | |
| "grad_norm": 5.4707664838796815, | |
| "learning_rate": 1.1597541301163655e-06, | |
| "loss": 0.2242, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 1.5601023017902813, | |
| "grad_norm": 5.852672190013367, | |
| "learning_rate": 1.1469216807445348e-06, | |
| "loss": 0.1804, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.5626598465473145, | |
| "grad_norm": 6.351885487364482, | |
| "learning_rate": 1.1341514174971907e-06, | |
| "loss": 0.2128, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 1.5652173913043477, | |
| "grad_norm": 5.1770067311650605, | |
| "learning_rate": 1.1214435464779006e-06, | |
| "loss": 0.2993, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 1.567774936061381, | |
| "grad_norm": 5.447984746676171, | |
| "learning_rate": 1.1087982727832613e-06, | |
| "loss": 0.307, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 1.5703324808184145, | |
| "grad_norm": 5.182727825447981, | |
| "learning_rate": 1.0962158004995893e-06, | |
| "loss": 0.2687, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 1.5728900255754477, | |
| "grad_norm": 4.64809564738005, | |
| "learning_rate": 1.083696332699628e-06, | |
| "loss": 0.1572, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 1.5754475703324808, | |
| "grad_norm": 4.7325713746238876, | |
| "learning_rate": 1.0712400714392723e-06, | |
| "loss": 0.18, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 1.578005115089514, | |
| "grad_norm": 5.269623806320352, | |
| "learning_rate": 1.058847217754303e-06, | |
| "loss": 0.2037, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 1.5805626598465472, | |
| "grad_norm": 5.304757901803767, | |
| "learning_rate": 1.0465179716571467e-06, | |
| "loss": 0.1971, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 1.5831202046035806, | |
| "grad_norm": 6.728413029180757, | |
| "learning_rate": 1.034252532133646e-06, | |
| "loss": 0.2494, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 1.5856777493606138, | |
| "grad_norm": 3.4223539307887454, | |
| "learning_rate": 1.0220510971398473e-06, | |
| "loss": 0.1613, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.5882352941176472, | |
| "grad_norm": 5.305411837383943, | |
| "learning_rate": 1.0099138635988026e-06, | |
| "loss": 0.2349, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 1.5907928388746804, | |
| "grad_norm": 4.856914484373969, | |
| "learning_rate": 9.978410273974015e-07, | |
| "loss": 0.1723, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 1.5933503836317136, | |
| "grad_norm": 5.358684756870008, | |
| "learning_rate": 9.858327833832004e-07, | |
| "loss": 0.2593, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 1.5959079283887467, | |
| "grad_norm": 4.542731858430077, | |
| "learning_rate": 9.738893253612808e-07, | |
| "loss": 0.1939, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 1.59846547314578, | |
| "grad_norm": 4.5148886714925345, | |
| "learning_rate": 9.620108460911181e-07, | |
| "loss": 0.1901, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 1.6010230179028133, | |
| "grad_norm": 5.107745783747687, | |
| "learning_rate": 9.50197537283481e-07, | |
| "loss": 0.2062, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 1.6035805626598465, | |
| "grad_norm": 6.591147405424452, | |
| "learning_rate": 9.384495895973227e-07, | |
| "loss": 0.2293, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 1.60613810741688, | |
| "grad_norm": 5.607830445145212, | |
| "learning_rate": 9.267671926367166e-07, | |
| "loss": 0.2449, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 1.608695652173913, | |
| "grad_norm": 5.457362074950791, | |
| "learning_rate": 9.151505349477901e-07, | |
| "loss": 0.2638, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 1.6112531969309463, | |
| "grad_norm": 4.821051497667833, | |
| "learning_rate": 9.035998040156801e-07, | |
| "loss": 0.1902, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.6138107416879794, | |
| "grad_norm": 5.171122943753494, | |
| "learning_rate": 8.921151862615091e-07, | |
| "loss": 0.1422, | |
| "step": 631 | |
| }, | |
| { | |
| "epoch": 1.6163682864450126, | |
| "grad_norm": 6.259146518225325, | |
| "learning_rate": 8.806968670393801e-07, | |
| "loss": 0.315, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 1.618925831202046, | |
| "grad_norm": 5.397528592846895, | |
| "learning_rate": 8.693450306333818e-07, | |
| "loss": 0.21, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 1.6214833759590794, | |
| "grad_norm": 5.180249332098257, | |
| "learning_rate": 8.580598602546109e-07, | |
| "loss": 0.2556, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 1.6240409207161126, | |
| "grad_norm": 4.562634677645485, | |
| "learning_rate": 8.4684153803822e-07, | |
| "loss": 0.2216, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 1.6265984654731458, | |
| "grad_norm": 5.390722528674729, | |
| "learning_rate": 8.356902450404792e-07, | |
| "loss": 0.233, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 1.629156010230179, | |
| "grad_norm": 4.775191692612564, | |
| "learning_rate": 8.246061612358475e-07, | |
| "loss": 0.2287, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 1.6317135549872122, | |
| "grad_norm": 4.852828279434425, | |
| "learning_rate": 8.135894655140758e-07, | |
| "loss": 0.2191, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 1.6342710997442456, | |
| "grad_norm": 5.704310246599364, | |
| "learning_rate": 8.026403356773161e-07, | |
| "loss": 0.2047, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 1.6368286445012787, | |
| "grad_norm": 4.793807479675851, | |
| "learning_rate": 7.91758948437249e-07, | |
| "loss": 0.1618, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.6393861892583121, | |
| "grad_norm": 4.754502329271982, | |
| "learning_rate": 7.809454794122346e-07, | |
| "loss": 0.2781, | |
| "step": 641 | |
| }, | |
| { | |
| "epoch": 1.6419437340153453, | |
| "grad_norm": 7.018986504016533, | |
| "learning_rate": 7.702001031244816e-07, | |
| "loss": 0.2729, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 1.6445012787723785, | |
| "grad_norm": 4.41916704046766, | |
| "learning_rate": 7.595229929972253e-07, | |
| "loss": 0.2092, | |
| "step": 643 | |
| }, | |
| { | |
| "epoch": 1.6470588235294117, | |
| "grad_norm": 5.336932050022912, | |
| "learning_rate": 7.489143213519301e-07, | |
| "loss": 0.2267, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 1.6496163682864449, | |
| "grad_norm": 4.654218849496965, | |
| "learning_rate": 7.383742594055077e-07, | |
| "loss": 0.2136, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 1.6521739130434783, | |
| "grad_norm": 5.1623224835718515, | |
| "learning_rate": 7.279029772675572e-07, | |
| "loss": 0.221, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 1.6547314578005117, | |
| "grad_norm": 5.686897560893601, | |
| "learning_rate": 7.17500643937617e-07, | |
| "loss": 0.243, | |
| "step": 647 | |
| }, | |
| { | |
| "epoch": 1.6572890025575449, | |
| "grad_norm": 4.738802336926236, | |
| "learning_rate": 7.071674273024353e-07, | |
| "loss": 0.2059, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 1.659846547314578, | |
| "grad_norm": 6.305944976355385, | |
| "learning_rate": 6.969034941332664e-07, | |
| "loss": 0.2147, | |
| "step": 649 | |
| }, | |
| { | |
| "epoch": 1.6624040920716112, | |
| "grad_norm": 5.190380929928794, | |
| "learning_rate": 6.86709010083172e-07, | |
| "loss": 0.1909, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.6649616368286444, | |
| "grad_norm": 5.659990773064172, | |
| "learning_rate": 6.765841396843514e-07, | |
| "loss": 0.1913, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 1.6675191815856778, | |
| "grad_norm": 4.841853616494702, | |
| "learning_rate": 6.665290463454882e-07, | |
| "loss": 0.201, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 1.670076726342711, | |
| "grad_norm": 6.0589020041416575, | |
| "learning_rate": 6.565438923491102e-07, | |
| "loss": 0.2908, | |
| "step": 653 | |
| }, | |
| { | |
| "epoch": 1.6726342710997444, | |
| "grad_norm": 4.716079545501844, | |
| "learning_rate": 6.466288388489689e-07, | |
| "loss": 0.2169, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 1.6751918158567776, | |
| "grad_norm": 4.268196527173102, | |
| "learning_rate": 6.367840458674401e-07, | |
| "loss": 0.202, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 1.6777493606138107, | |
| "grad_norm": 5.8944147568821625, | |
| "learning_rate": 6.270096722929442e-07, | |
| "loss": 0.2586, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 1.680306905370844, | |
| "grad_norm": 5.564876452450793, | |
| "learning_rate": 6.173058758773775e-07, | |
| "loss": 0.2391, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 1.682864450127877, | |
| "grad_norm": 5.003498689191179, | |
| "learning_rate": 6.076728132335669e-07, | |
| "loss": 0.1756, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 1.6854219948849105, | |
| "grad_norm": 4.993122576064924, | |
| "learning_rate": 5.981106398327463e-07, | |
| "loss": 0.1984, | |
| "step": 659 | |
| }, | |
| { | |
| "epoch": 1.6879795396419437, | |
| "grad_norm": 5.118160492207756, | |
| "learning_rate": 5.886195100020408e-07, | |
| "loss": 0.2879, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.690537084398977, | |
| "grad_norm": 5.396969385626712, | |
| "learning_rate": 5.7919957692198e-07, | |
| "loss": 0.1767, | |
| "step": 661 | |
| }, | |
| { | |
| "epoch": 1.6930946291560103, | |
| "grad_norm": 4.709841482974692, | |
| "learning_rate": 5.698509926240275e-07, | |
| "loss": 0.2265, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 1.6956521739130435, | |
| "grad_norm": 5.74378925311425, | |
| "learning_rate": 5.60573907988124e-07, | |
| "loss": 0.2585, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 1.6982097186700766, | |
| "grad_norm": 5.340843111211026, | |
| "learning_rate": 5.513684727402529e-07, | |
| "loss": 0.1713, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 1.7007672634271098, | |
| "grad_norm": 5.095292121289202, | |
| "learning_rate": 5.422348354500217e-07, | |
| "loss": 0.1969, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 1.7033248081841432, | |
| "grad_norm": 6.38491199555891, | |
| "learning_rate": 5.331731435282705e-07, | |
| "loss": 0.1764, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 1.7058823529411766, | |
| "grad_norm": 6.33773326776292, | |
| "learning_rate": 5.241835432246888e-07, | |
| "loss": 0.2176, | |
| "step": 667 | |
| }, | |
| { | |
| "epoch": 1.7084398976982098, | |
| "grad_norm": 4.747290198021267, | |
| "learning_rate": 5.152661796254505e-07, | |
| "loss": 0.2194, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 1.710997442455243, | |
| "grad_norm": 5.180305167036278, | |
| "learning_rate": 5.064211966508837e-07, | |
| "loss": 0.1838, | |
| "step": 669 | |
| }, | |
| { | |
| "epoch": 1.7135549872122762, | |
| "grad_norm": 5.158569886057061, | |
| "learning_rate": 4.976487370531352e-07, | |
| "loss": 0.1714, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.7161125319693094, | |
| "grad_norm": 6.7699503709640405, | |
| "learning_rate": 4.88948942413876e-07, | |
| "loss": 0.2794, | |
| "step": 671 | |
| }, | |
| { | |
| "epoch": 1.7186700767263428, | |
| "grad_norm": 4.330269645663795, | |
| "learning_rate": 4.803219531420128e-07, | |
| "loss": 0.1567, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 1.721227621483376, | |
| "grad_norm": 6.071163454166977, | |
| "learning_rate": 4.717679084714222e-07, | |
| "loss": 0.2268, | |
| "step": 673 | |
| }, | |
| { | |
| "epoch": 1.7237851662404093, | |
| "grad_norm": 4.455492690019572, | |
| "learning_rate": 4.6328694645870254e-07, | |
| "loss": 0.2092, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 1.7263427109974425, | |
| "grad_norm": 5.5297942207932165, | |
| "learning_rate": 4.5487920398094465e-07, | |
| "loss": 0.2037, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 1.7289002557544757, | |
| "grad_norm": 4.8818412705137115, | |
| "learning_rate": 4.46544816733529e-07, | |
| "loss": 0.2475, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 1.7314578005115089, | |
| "grad_norm": 5.627569483357689, | |
| "learning_rate": 4.382839192279303e-07, | |
| "loss": 0.2585, | |
| "step": 677 | |
| }, | |
| { | |
| "epoch": 1.734015345268542, | |
| "grad_norm": 5.1192641037653965, | |
| "learning_rate": 4.3009664478954384e-07, | |
| "loss": 0.2449, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 1.7365728900255755, | |
| "grad_norm": 5.286870696275029, | |
| "learning_rate": 4.219831255555423e-07, | |
| "loss": 0.2004, | |
| "step": 679 | |
| }, | |
| { | |
| "epoch": 1.7391304347826086, | |
| "grad_norm": 5.5327277136832675, | |
| "learning_rate": 4.139434924727359e-07, | |
| "loss": 0.1921, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.741687979539642, | |
| "grad_norm": 4.552608688952828, | |
| "learning_rate": 4.059778752954607e-07, | |
| "loss": 0.1432, | |
| "step": 681 | |
| }, | |
| { | |
| "epoch": 1.7442455242966752, | |
| "grad_norm": 5.216152442361206, | |
| "learning_rate": 3.9808640258348686e-07, | |
| "loss": 0.1754, | |
| "step": 682 | |
| }, | |
| { | |
| "epoch": 1.7468030690537084, | |
| "grad_norm": 5.756818223369158, | |
| "learning_rate": 3.9026920169994374e-07, | |
| "loss": 0.21, | |
| "step": 683 | |
| }, | |
| { | |
| "epoch": 1.7493606138107416, | |
| "grad_norm": 5.29491810051506, | |
| "learning_rate": 3.825263988092587e-07, | |
| "loss": 0.2228, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 1.7519181585677748, | |
| "grad_norm": 5.164127220528273, | |
| "learning_rate": 3.7485811887512714e-07, | |
| "loss": 0.2144, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 1.7544757033248082, | |
| "grad_norm": 5.314203439099788, | |
| "learning_rate": 3.672644856584928e-07, | |
| "loss": 0.2911, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 1.7570332480818416, | |
| "grad_norm": 4.972659541709094, | |
| "learning_rate": 3.597456217155526e-07, | |
| "loss": 0.1519, | |
| "step": 687 | |
| }, | |
| { | |
| "epoch": 1.7595907928388748, | |
| "grad_norm": 5.199722512215739, | |
| "learning_rate": 3.523016483957742e-07, | |
| "loss": 0.2739, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 1.762148337595908, | |
| "grad_norm": 5.21999031799005, | |
| "learning_rate": 3.4493268583994434e-07, | |
| "loss": 0.201, | |
| "step": 689 | |
| }, | |
| { | |
| "epoch": 1.7647058823529411, | |
| "grad_norm": 4.364821712930802, | |
| "learning_rate": 3.3763885297822153e-07, | |
| "loss": 0.1881, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.7672634271099743, | |
| "grad_norm": 5.202401016558692, | |
| "learning_rate": 3.3042026752822254e-07, | |
| "loss": 0.2125, | |
| "step": 691 | |
| }, | |
| { | |
| "epoch": 1.7698209718670077, | |
| "grad_norm": 5.684342025889591, | |
| "learning_rate": 3.2327704599312283e-07, | |
| "loss": 0.2636, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 1.772378516624041, | |
| "grad_norm": 5.59338748722469, | |
| "learning_rate": 3.16209303659773e-07, | |
| "loss": 0.2489, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 1.7749360613810743, | |
| "grad_norm": 4.23116707170702, | |
| "learning_rate": 3.0921715459683753e-07, | |
| "loss": 0.153, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 1.7774936061381075, | |
| "grad_norm": 5.558061912478462, | |
| "learning_rate": 3.0230071165295804e-07, | |
| "loss": 0.2291, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 1.7800511508951407, | |
| "grad_norm": 5.197951332859319, | |
| "learning_rate": 2.95460086454929e-07, | |
| "loss": 0.2267, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 1.7826086956521738, | |
| "grad_norm": 5.668569655261014, | |
| "learning_rate": 2.88695389405898e-07, | |
| "loss": 0.2191, | |
| "step": 697 | |
| }, | |
| { | |
| "epoch": 1.785166240409207, | |
| "grad_norm": 4.00218617616878, | |
| "learning_rate": 2.820067296835799e-07, | |
| "loss": 0.1415, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 1.7877237851662404, | |
| "grad_norm": 5.580094964915473, | |
| "learning_rate": 2.753942152385014e-07, | |
| "loss": 0.2282, | |
| "step": 699 | |
| }, | |
| { | |
| "epoch": 1.7902813299232738, | |
| "grad_norm": 5.157143860228644, | |
| "learning_rate": 2.688579527922514e-07, | |
| "loss": 0.228, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.792838874680307, | |
| "grad_norm": 6.375062738221611, | |
| "learning_rate": 2.6239804783576294e-07, | |
| "loss": 0.227, | |
| "step": 701 | |
| }, | |
| { | |
| "epoch": 1.7953964194373402, | |
| "grad_norm": 5.7437796904243115, | |
| "learning_rate": 2.560146046276135e-07, | |
| "loss": 0.2823, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 1.7979539641943734, | |
| "grad_norm": 5.624690347371044, | |
| "learning_rate": 2.4970772619233475e-07, | |
| "loss": 0.2042, | |
| "step": 703 | |
| }, | |
| { | |
| "epoch": 1.8005115089514065, | |
| "grad_norm": 6.498262506148002, | |
| "learning_rate": 2.4347751431875453e-07, | |
| "loss": 0.2598, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 1.80306905370844, | |
| "grad_norm": 6.186981848018963, | |
| "learning_rate": 2.373240695583534e-07, | |
| "loss": 0.2505, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 1.8056265984654731, | |
| "grad_norm": 4.425992548953008, | |
| "learning_rate": 2.3124749122364286e-07, | |
| "loss": 0.2145, | |
| "step": 706 | |
| }, | |
| { | |
| "epoch": 1.8081841432225065, | |
| "grad_norm": 4.690718811893992, | |
| "learning_rate": 2.2524787738656073e-07, | |
| "loss": 0.2106, | |
| "step": 707 | |
| }, | |
| { | |
| "epoch": 1.8107416879795397, | |
| "grad_norm": 6.156573679807675, | |
| "learning_rate": 2.1932532487688784e-07, | |
| "loss": 0.222, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 1.813299232736573, | |
| "grad_norm": 5.586951769770138, | |
| "learning_rate": 2.1347992928068884e-07, | |
| "loss": 0.2469, | |
| "step": 709 | |
| }, | |
| { | |
| "epoch": 1.815856777493606, | |
| "grad_norm": 5.817151417919622, | |
| "learning_rate": 2.0771178493876387e-07, | |
| "loss": 0.2514, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.8184143222506393, | |
| "grad_norm": 5.920190459702125, | |
| "learning_rate": 2.0202098494513157e-07, | |
| "loss": 0.2034, | |
| "step": 711 | |
| }, | |
| { | |
| "epoch": 1.8209718670076727, | |
| "grad_norm": 5.8230410453933015, | |
| "learning_rate": 1.964076211455246e-07, | |
| "loss": 0.2364, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 1.8235294117647058, | |
| "grad_norm": 5.7392724174161405, | |
| "learning_rate": 1.908717841359048e-07, | |
| "loss": 0.2135, | |
| "step": 713 | |
| }, | |
| { | |
| "epoch": 1.8260869565217392, | |
| "grad_norm": 5.775722433349359, | |
| "learning_rate": 1.8541356326100436e-07, | |
| "loss": 0.2462, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 1.8286445012787724, | |
| "grad_norm": 5.591670023546011, | |
| "learning_rate": 1.800330466128808e-07, | |
| "loss": 0.228, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 1.8312020460358056, | |
| "grad_norm": 4.711360255327199, | |
| "learning_rate": 1.7473032102949983e-07, | |
| "loss": 0.2075, | |
| "step": 716 | |
| }, | |
| { | |
| "epoch": 1.8337595907928388, | |
| "grad_norm": 5.929377308741077, | |
| "learning_rate": 1.695054720933309e-07, | |
| "loss": 0.257, | |
| "step": 717 | |
| }, | |
| { | |
| "epoch": 1.836317135549872, | |
| "grad_norm": 5.279542039345601, | |
| "learning_rate": 1.6435858412996275e-07, | |
| "loss": 0.1529, | |
| "step": 718 | |
| }, | |
| { | |
| "epoch": 1.8388746803069054, | |
| "grad_norm": 5.1397676431266595, | |
| "learning_rate": 1.5928974020674947e-07, | |
| "loss": 0.1491, | |
| "step": 719 | |
| }, | |
| { | |
| "epoch": 1.8414322250639388, | |
| "grad_norm": 5.512326623165319, | |
| "learning_rate": 1.542990221314644e-07, | |
| "loss": 0.1774, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.843989769820972, | |
| "grad_norm": 5.66627970312488, | |
| "learning_rate": 1.4938651045098174e-07, | |
| "loss": 0.2278, | |
| "step": 721 | |
| }, | |
| { | |
| "epoch": 1.8465473145780051, | |
| "grad_norm": 4.424031839884274, | |
| "learning_rate": 1.445522844499775e-07, | |
| "loss": 0.1713, | |
| "step": 722 | |
| }, | |
| { | |
| "epoch": 1.8491048593350383, | |
| "grad_norm": 6.625716850892438, | |
| "learning_rate": 1.3979642214964728e-07, | |
| "loss": 0.254, | |
| "step": 723 | |
| }, | |
| { | |
| "epoch": 1.8516624040920715, | |
| "grad_norm": 5.181446281414914, | |
| "learning_rate": 1.3511900030644954e-07, | |
| "loss": 0.1753, | |
| "step": 724 | |
| }, | |
| { | |
| "epoch": 1.854219948849105, | |
| "grad_norm": 6.694348870428919, | |
| "learning_rate": 1.3052009441086533e-07, | |
| "loss": 0.2535, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 1.856777493606138, | |
| "grad_norm": 6.09347774518519, | |
| "learning_rate": 1.2599977868618052e-07, | |
| "loss": 0.2617, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 1.8593350383631715, | |
| "grad_norm": 6.126918278593983, | |
| "learning_rate": 1.215581260872889e-07, | |
| "loss": 0.1734, | |
| "step": 727 | |
| }, | |
| { | |
| "epoch": 1.8618925831202047, | |
| "grad_norm": 4.9394061667025175, | |
| "learning_rate": 1.1719520829951203e-07, | |
| "loss": 0.1727, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 1.8644501278772379, | |
| "grad_norm": 5.679604325861919, | |
| "learning_rate": 1.1291109573744574e-07, | |
| "loss": 0.2195, | |
| "step": 729 | |
| }, | |
| { | |
| "epoch": 1.867007672634271, | |
| "grad_norm": 5.5914554870990525, | |
| "learning_rate": 1.087058575438199e-07, | |
| "loss": 0.2528, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.8695652173913042, | |
| "grad_norm": 5.877663743949046, | |
| "learning_rate": 1.0457956158838545e-07, | |
| "loss": 0.2491, | |
| "step": 731 | |
| }, | |
| { | |
| "epoch": 1.8721227621483376, | |
| "grad_norm": 3.8851223721617663, | |
| "learning_rate": 1.0053227446681912e-07, | |
| "loss": 0.1317, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 1.8746803069053708, | |
| "grad_norm": 5.412598503950858, | |
| "learning_rate": 9.656406149964548e-08, | |
| "loss": 0.2314, | |
| "step": 733 | |
| }, | |
| { | |
| "epoch": 1.8772378516624042, | |
| "grad_norm": 4.937142526302332, | |
| "learning_rate": 9.267498673118547e-08, | |
| "loss": 0.1964, | |
| "step": 734 | |
| }, | |
| { | |
| "epoch": 1.8797953964194374, | |
| "grad_norm": 5.6093925489434735, | |
| "learning_rate": 8.886511292852395e-08, | |
| "loss": 0.2169, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 1.8823529411764706, | |
| "grad_norm": 5.544589584646933, | |
| "learning_rate": 8.513450158049109e-08, | |
| "loss": 0.2262, | |
| "step": 736 | |
| }, | |
| { | |
| "epoch": 1.8849104859335037, | |
| "grad_norm": 4.522441136690872, | |
| "learning_rate": 8.148321289667749e-08, | |
| "loss": 0.1836, | |
| "step": 737 | |
| }, | |
| { | |
| "epoch": 1.887468030690537, | |
| "grad_norm": 4.735747110712485, | |
| "learning_rate": 7.791130580645623e-08, | |
| "loss": 0.2037, | |
| "step": 738 | |
| }, | |
| { | |
| "epoch": 1.8900255754475703, | |
| "grad_norm": 5.424105506041224, | |
| "learning_rate": 7.441883795803462e-08, | |
| "loss": 0.181, | |
| "step": 739 | |
| }, | |
| { | |
| "epoch": 1.8925831202046037, | |
| "grad_norm": 5.57291250707421, | |
| "learning_rate": 7.100586571752444e-08, | |
| "loss": 0.2009, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.895140664961637, | |
| "grad_norm": 4.723852414529119, | |
| "learning_rate": 6.767244416802988e-08, | |
| "loss": 0.2135, | |
| "step": 741 | |
| }, | |
| { | |
| "epoch": 1.89769820971867, | |
| "grad_norm": 4.625168893146648, | |
| "learning_rate": 6.441862710876102e-08, | |
| "loss": 0.182, | |
| "step": 742 | |
| }, | |
| { | |
| "epoch": 1.9002557544757033, | |
| "grad_norm": 5.742248230826967, | |
| "learning_rate": 6.124446705416343e-08, | |
| "loss": 0.2256, | |
| "step": 743 | |
| }, | |
| { | |
| "epoch": 1.9028132992327365, | |
| "grad_norm": 5.446320051746138, | |
| "learning_rate": 5.815001523307162e-08, | |
| "loss": 0.1844, | |
| "step": 744 | |
| }, | |
| { | |
| "epoch": 1.9053708439897699, | |
| "grad_norm": 5.776856594363444, | |
| "learning_rate": 5.513532158788193e-08, | |
| "loss": 0.2793, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 1.907928388746803, | |
| "grad_norm": 6.11898381729892, | |
| "learning_rate": 5.220043477374759e-08, | |
| "loss": 0.228, | |
| "step": 746 | |
| }, | |
| { | |
| "epoch": 1.9104859335038364, | |
| "grad_norm": 5.485214100078597, | |
| "learning_rate": 4.934540215779271e-08, | |
| "loss": 0.2315, | |
| "step": 747 | |
| }, | |
| { | |
| "epoch": 1.9130434782608696, | |
| "grad_norm": 5.1671494114407155, | |
| "learning_rate": 4.657026981834623e-08, | |
| "loss": 0.2191, | |
| "step": 748 | |
| }, | |
| { | |
| "epoch": 1.9156010230179028, | |
| "grad_norm": 5.544282249484376, | |
| "learning_rate": 4.3875082544201364e-08, | |
| "loss": 0.2446, | |
| "step": 749 | |
| }, | |
| { | |
| "epoch": 1.918158567774936, | |
| "grad_norm": 4.828585643362734, | |
| "learning_rate": 4.125988383388957e-08, | |
| "loss": 0.2293, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.9207161125319692, | |
| "grad_norm": 6.32383213055493, | |
| "learning_rate": 3.87247158949805e-08, | |
| "loss": 0.2332, | |
| "step": 751 | |
| }, | |
| { | |
| "epoch": 1.9232736572890026, | |
| "grad_norm": 5.560416321895529, | |
| "learning_rate": 3.626961964340203e-08, | |
| "loss": 0.2383, | |
| "step": 752 | |
| }, | |
| { | |
| "epoch": 1.9258312020460358, | |
| "grad_norm": 5.865352141528642, | |
| "learning_rate": 3.389463470277576e-08, | |
| "loss": 0.2104, | |
| "step": 753 | |
| }, | |
| { | |
| "epoch": 1.9283887468030692, | |
| "grad_norm": 5.341153765939635, | |
| "learning_rate": 3.159979940378088e-08, | |
| "loss": 0.2128, | |
| "step": 754 | |
| }, | |
| { | |
| "epoch": 1.9309462915601023, | |
| "grad_norm": 5.291758252142984, | |
| "learning_rate": 2.938515078353521e-08, | |
| "loss": 0.1886, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 1.9335038363171355, | |
| "grad_norm": 5.163447364959534, | |
| "learning_rate": 2.725072458499567e-08, | |
| "loss": 0.2996, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 1.9360613810741687, | |
| "grad_norm": 6.676816256652347, | |
| "learning_rate": 2.519655525638376e-08, | |
| "loss": 0.2902, | |
| "step": 757 | |
| }, | |
| { | |
| "epoch": 1.938618925831202, | |
| "grad_norm": 6.29139443707462, | |
| "learning_rate": 2.3222675950627106e-08, | |
| "loss": 0.1572, | |
| "step": 758 | |
| }, | |
| { | |
| "epoch": 1.9411764705882353, | |
| "grad_norm": 4.037989400305309, | |
| "learning_rate": 2.1329118524827662e-08, | |
| "loss": 0.1417, | |
| "step": 759 | |
| }, | |
| { | |
| "epoch": 1.9437340153452687, | |
| "grad_norm": 4.5423783674836695, | |
| "learning_rate": 1.9515913539743247e-08, | |
| "loss": 0.233, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.9462915601023019, | |
| "grad_norm": 3.859294316006792, | |
| "learning_rate": 1.7783090259297918e-08, | |
| "loss": 0.1918, | |
| "step": 761 | |
| }, | |
| { | |
| "epoch": 1.948849104859335, | |
| "grad_norm": 5.265098117553532, | |
| "learning_rate": 1.613067665010959e-08, | |
| "loss": 0.1965, | |
| "step": 762 | |
| }, | |
| { | |
| "epoch": 1.9514066496163682, | |
| "grad_norm": 4.559779542352908, | |
| "learning_rate": 1.4558699381034825e-08, | |
| "loss": 0.1888, | |
| "step": 763 | |
| }, | |
| { | |
| "epoch": 1.9539641943734014, | |
| "grad_norm": 6.603926656306259, | |
| "learning_rate": 1.3067183822742525e-08, | |
| "loss": 0.2729, | |
| "step": 764 | |
| }, | |
| { | |
| "epoch": 1.9565217391304348, | |
| "grad_norm": 5.966454788112439, | |
| "learning_rate": 1.1656154047303691e-08, | |
| "loss": 0.1994, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 1.959079283887468, | |
| "grad_norm": 5.135081772282031, | |
| "learning_rate": 1.0325632827801745e-08, | |
| "loss": 0.1875, | |
| "step": 766 | |
| }, | |
| { | |
| "epoch": 1.9616368286445014, | |
| "grad_norm": 4.6463930984383275, | |
| "learning_rate": 9.075641637964483e-09, | |
| "loss": 0.213, | |
| "step": 767 | |
| }, | |
| { | |
| "epoch": 1.9641943734015346, | |
| "grad_norm": 5.19696586284704, | |
| "learning_rate": 7.906200651819907e-09, | |
| "loss": 0.188, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 1.9667519181585678, | |
| "grad_norm": 4.1586996620802585, | |
| "learning_rate": 6.817328743368712e-09, | |
| "loss": 0.1314, | |
| "step": 769 | |
| }, | |
| { | |
| "epoch": 1.969309462915601, | |
| "grad_norm": 6.428796482081383, | |
| "learning_rate": 5.809043486279531e-09, | |
| "loss": 0.265, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.9718670076726341, | |
| "grad_norm": 6.071118019652648, | |
| "learning_rate": 4.881361153606934e-09, | |
| "loss": 0.1938, | |
| "step": 771 | |
| }, | |
| { | |
| "epoch": 1.9744245524296675, | |
| "grad_norm": 6.004256284137117, | |
| "learning_rate": 4.034296717527752e-09, | |
| "loss": 0.258, | |
| "step": 772 | |
| }, | |
| { | |
| "epoch": 1.976982097186701, | |
| "grad_norm": 4.424170682035935, | |
| "learning_rate": 3.2678638490996064e-09, | |
| "loss": 0.1944, | |
| "step": 773 | |
| }, | |
| { | |
| "epoch": 1.979539641943734, | |
| "grad_norm": 4.5443816733282025, | |
| "learning_rate": 2.5820749180388573e-09, | |
| "loss": 0.225, | |
| "step": 774 | |
| }, | |
| { | |
| "epoch": 1.9820971867007673, | |
| "grad_norm": 6.813889752181503, | |
| "learning_rate": 1.976940992523546e-09, | |
| "loss": 0.242, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 1.9846547314578005, | |
| "grad_norm": 6.943362317683093, | |
| "learning_rate": 1.4524718390140913e-09, | |
| "loss": 0.3044, | |
| "step": 776 | |
| }, | |
| { | |
| "epoch": 1.9872122762148337, | |
| "grad_norm": 5.440758095117894, | |
| "learning_rate": 1.0086759220934162e-09, | |
| "loss": 0.2411, | |
| "step": 777 | |
| }, | |
| { | |
| "epoch": 1.989769820971867, | |
| "grad_norm": 3.8628110410030914, | |
| "learning_rate": 6.455604043331676e-10, | |
| "loss": 0.1659, | |
| "step": 778 | |
| }, | |
| { | |
| "epoch": 1.9923273657289002, | |
| "grad_norm": 5.170928482570995, | |
| "learning_rate": 3.631311461765874e-10, | |
| "loss": 0.1393, | |
| "step": 779 | |
| }, | |
| { | |
| "epoch": 1.9948849104859336, | |
| "grad_norm": 5.769015016748028, | |
| "learning_rate": 1.6139270584358823e-10, | |
| "loss": 0.178, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.9974424552429668, | |
| "grad_norm": 5.070011448256778, | |
| "learning_rate": 4.034833925969928e-11, | |
| "loss": 0.2705, | |
| "step": 781 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 6.222117398419603, | |
| "learning_rate": 0.0, | |
| "loss": 0.1879, | |
| "step": 782 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "step": 782, | |
| "total_flos": 850857689088.0, | |
| "train_loss": 0.39088617330012115, | |
| "train_runtime": 416.9902, | |
| "train_samples_per_second": 15.003, | |
| "train_steps_per_second": 1.875 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 782, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 70000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 850857689088.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
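
The JSON above matches the `trainer_state.json` that the Hugging Face `transformers` Trainer writes next to a checkpoint. A minimal sketch of reading the per-step `log_history` back and plotting the loss curve, assuming the file is saved under that conventional name and that `matplotlib` is available (neither is stated in the state file itself):

```python
# Minimal sketch: load a saved trainer_state.json and plot training loss per step.
# Assumptions (not part of the state file): the file path below and matplotlib.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only per-step records; the final summary entry reports "train_loss"
# rather than "loss", so it is filtered out here.
records = [r for r in state["log_history"] if "loss" in r and "step" in r]

steps = [r["step"] for r in records]
losses = [r["loss"] for r in records]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title(f"{len(records)} logged steps over {state['num_train_epochs']} epochs")
plt.savefig("loss_curve.png")
```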