| { | |
| "best_metric": 0.7238953709602356, | |
| "best_model_checkpoint": "outputs/checkpoint-450", | |
| "epoch": 0.2815139192993431, | |
| "eval_steps": 25, | |
| "global_step": 450, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0006255864873318737, | |
| "grad_norm": 3.302262783050537, | |
| "learning_rate": 4e-05, | |
| "loss": 1.7639, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0012511729746637473, | |
| "grad_norm": 3.7728819847106934, | |
| "learning_rate": 8e-05, | |
| "loss": 2.3471, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.001876759461995621, | |
| "grad_norm": 3.575211763381958, | |
| "learning_rate": 0.00012, | |
| "loss": 1.274, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.0025023459493274947, | |
| "grad_norm": 4.3921918869018555, | |
| "learning_rate": 0.00016, | |
| "loss": 1.8361, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.0031279324366593683, | |
| "grad_norm": 3.215696096420288, | |
| "learning_rate": 0.0002, | |
| "loss": 2.8766, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.003753518923991242, | |
| "grad_norm": 4.060017108917236, | |
| "learning_rate": 0.0001996638655462185, | |
| "loss": 1.4329, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.004379105411323116, | |
| "grad_norm": 2.7935523986816406, | |
| "learning_rate": 0.00019932773109243698, | |
| "loss": 1.2844, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.005004691898654989, | |
| "grad_norm": 2.312218189239502, | |
| "learning_rate": 0.00019899159663865548, | |
| "loss": 1.8112, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.005630278385986863, | |
| "grad_norm": 3.5389914512634277, | |
| "learning_rate": 0.00019865546218487395, | |
| "loss": 2.0504, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.006255864873318737, | |
| "grad_norm": 2.913029432296753, | |
| "learning_rate": 0.00019831932773109245, | |
| "loss": 1.9101, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.00688145136065061, | |
| "grad_norm": 3.6916606426239014, | |
| "learning_rate": 0.00019798319327731095, | |
| "loss": 1.9899, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.007507037847982484, | |
| "grad_norm": 3.002810478210449, | |
| "learning_rate": 0.00019764705882352942, | |
| "loss": 1.3224, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.008132624335314358, | |
| "grad_norm": 1.657835602760315, | |
| "learning_rate": 0.00019731092436974792, | |
| "loss": 1.2208, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.008758210822646231, | |
| "grad_norm": 2.414161443710327, | |
| "learning_rate": 0.00019697478991596642, | |
| "loss": 1.5375, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.009383797309978105, | |
| "grad_norm": 1.9695100784301758, | |
| "learning_rate": 0.00019663865546218486, | |
| "loss": 0.9218, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.010009383797309979, | |
| "grad_norm": 3.9755845069885254, | |
| "learning_rate": 0.00019630252100840336, | |
| "loss": 1.3608, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.010634970284641852, | |
| "grad_norm": 6.843455791473389, | |
| "learning_rate": 0.00019596638655462186, | |
| "loss": 1.2168, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.011260556771973726, | |
| "grad_norm": 3.8736443519592285, | |
| "learning_rate": 0.00019563025210084033, | |
| "loss": 0.7392, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.0118861432593056, | |
| "grad_norm": 1.7369539737701416, | |
| "learning_rate": 0.00019529411764705883, | |
| "loss": 1.0495, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.012511729746637473, | |
| "grad_norm": 1.1708225011825562, | |
| "learning_rate": 0.0001949579831932773, | |
| "loss": 1.2266, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.013137316233969347, | |
| "grad_norm": 1.4693603515625, | |
| "learning_rate": 0.0001946218487394958, | |
| "loss": 1.1364, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.01376290272130122, | |
| "grad_norm": 0.8484959602355957, | |
| "learning_rate": 0.0001942857142857143, | |
| "loss": 0.6253, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.014388489208633094, | |
| "grad_norm": 2.7237887382507324, | |
| "learning_rate": 0.00019394957983193278, | |
| "loss": 1.2932, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.015014075695964968, | |
| "grad_norm": 1.1654947996139526, | |
| "learning_rate": 0.00019361344537815127, | |
| "loss": 0.5659, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.01563966218329684, | |
| "grad_norm": 1.7193485498428345, | |
| "learning_rate": 0.00019327731092436975, | |
| "loss": 1.3627, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.01563966218329684, | |
| "eval_loss": 1.0974555015563965, | |
| "eval_runtime": 46.8133, | |
| "eval_samples_per_second": 5.469, | |
| "eval_steps_per_second": 2.734, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.016265248670628715, | |
| "grad_norm": 2.883988380432129, | |
| "learning_rate": 0.00019294117647058825, | |
| "loss": 0.6257, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.01689083515796059, | |
| "grad_norm": 1.4707483053207397, | |
| "learning_rate": 0.00019260504201680674, | |
| "loss": 0.879, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.017516421645292463, | |
| "grad_norm": 1.3346422910690308, | |
| "learning_rate": 0.00019226890756302522, | |
| "loss": 1.0058, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.018142008132624336, | |
| "grad_norm": 0.5815519094467163, | |
| "learning_rate": 0.00019193277310924372, | |
| "loss": 0.3475, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.01876759461995621, | |
| "grad_norm": 0.8800593018531799, | |
| "learning_rate": 0.00019159663865546221, | |
| "loss": 0.5426, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.019393181107288084, | |
| "grad_norm": 8.196944236755371, | |
| "learning_rate": 0.0001912605042016807, | |
| "loss": 1.0088, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.020018767594619957, | |
| "grad_norm": 3.264193296432495, | |
| "learning_rate": 0.00019092436974789919, | |
| "loss": 0.9319, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.02064435408195183, | |
| "grad_norm": 1.1047834157943726, | |
| "learning_rate": 0.00019058823529411766, | |
| "loss": 0.9262, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.021269940569283705, | |
| "grad_norm": 1.982783555984497, | |
| "learning_rate": 0.00019025210084033613, | |
| "loss": 1.2904, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.021895527056615578, | |
| "grad_norm": 2.6765289306640625, | |
| "learning_rate": 0.00018991596638655463, | |
| "loss": 1.0785, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.022521113543947452, | |
| "grad_norm": 4.674818992614746, | |
| "learning_rate": 0.0001895798319327731, | |
| "loss": 0.9822, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.023146700031279326, | |
| "grad_norm": 1.6232353448867798, | |
| "learning_rate": 0.0001892436974789916, | |
| "loss": 0.6441, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.0237722865186112, | |
| "grad_norm": 2.623237371444702, | |
| "learning_rate": 0.0001889075630252101, | |
| "loss": 0.8874, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.024397873005943073, | |
| "grad_norm": 1.4366761445999146, | |
| "learning_rate": 0.00018857142857142857, | |
| "loss": 0.4596, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.025023459493274947, | |
| "grad_norm": 1.8809682130813599, | |
| "learning_rate": 0.00018823529411764707, | |
| "loss": 0.887, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.02564904598060682, | |
| "grad_norm": 1.081438660621643, | |
| "learning_rate": 0.00018789915966386554, | |
| "loss": 0.4735, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.026274632467938694, | |
| "grad_norm": 2.1302649974823, | |
| "learning_rate": 0.00018756302521008404, | |
| "loss": 0.795, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.026900218955270568, | |
| "grad_norm": 2.005425453186035, | |
| "learning_rate": 0.00018722689075630254, | |
| "loss": 0.8891, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.02752580544260244, | |
| "grad_norm": 1.7256505489349365, | |
| "learning_rate": 0.000186890756302521, | |
| "loss": 0.5993, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.028151391929934315, | |
| "grad_norm": 0.927653968334198, | |
| "learning_rate": 0.0001865546218487395, | |
| "loss": 0.6185, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.02877697841726619, | |
| "grad_norm": 1.5710850954055786, | |
| "learning_rate": 0.000186218487394958, | |
| "loss": 0.5258, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.029402564904598062, | |
| "grad_norm": 1.8794296979904175, | |
| "learning_rate": 0.00018588235294117648, | |
| "loss": 0.8168, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.030028151391929936, | |
| "grad_norm": 0.9695333242416382, | |
| "learning_rate": 0.00018554621848739498, | |
| "loss": 0.5458, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.03065373787926181, | |
| "grad_norm": 3.7846665382385254, | |
| "learning_rate": 0.00018521008403361345, | |
| "loss": 0.943, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.03127932436659368, | |
| "grad_norm": 1.9213052988052368, | |
| "learning_rate": 0.00018487394957983195, | |
| "loss": 0.5069, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.03127932436659368, | |
| "eval_loss": 0.9765783548355103, | |
| "eval_runtime": 43.502, | |
| "eval_samples_per_second": 5.885, | |
| "eval_steps_per_second": 2.942, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.03190491085392556, | |
| "grad_norm": 2.0580382347106934, | |
| "learning_rate": 0.00018453781512605045, | |
| "loss": 0.9423, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.03253049734125743, | |
| "grad_norm": 2.063591957092285, | |
| "learning_rate": 0.0001842016806722689, | |
| "loss": 0.7054, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.033156083828589304, | |
| "grad_norm": 1.2656595706939697, | |
| "learning_rate": 0.0001838655462184874, | |
| "loss": 0.401, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.03378167031592118, | |
| "grad_norm": 1.2392399311065674, | |
| "learning_rate": 0.0001835294117647059, | |
| "loss": 0.6077, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.03440725680325305, | |
| "grad_norm": 0.99504154920578, | |
| "learning_rate": 0.00018319327731092437, | |
| "loss": 0.6313, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.035032843290584925, | |
| "grad_norm": 2.0478012561798096, | |
| "learning_rate": 0.00018285714285714286, | |
| "loss": 1.2652, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.0356584297779168, | |
| "grad_norm": 0.9636131525039673, | |
| "learning_rate": 0.00018252100840336134, | |
| "loss": 0.7561, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.03628401626524867, | |
| "grad_norm": 0.874576210975647, | |
| "learning_rate": 0.00018218487394957984, | |
| "loss": 0.7461, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.036909602752580546, | |
| "grad_norm": 1.3745896816253662, | |
| "learning_rate": 0.00018184873949579833, | |
| "loss": 1.2856, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.03753518923991242, | |
| "grad_norm": 2.4839162826538086, | |
| "learning_rate": 0.0001815126050420168, | |
| "loss": 1.0574, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.038160775727244294, | |
| "grad_norm": 1.2671383619308472, | |
| "learning_rate": 0.0001811764705882353, | |
| "loss": 0.6177, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.03878636221457617, | |
| "grad_norm": 1.1862553358078003, | |
| "learning_rate": 0.0001808403361344538, | |
| "loss": 1.1169, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.03941194870190804, | |
| "grad_norm": 1.1347297430038452, | |
| "learning_rate": 0.00018050420168067228, | |
| "loss": 1.3303, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.040037535189239915, | |
| "grad_norm": 2.1583523750305176, | |
| "learning_rate": 0.00018016806722689078, | |
| "loss": 0.7941, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.04066312167657179, | |
| "grad_norm": 1.2432655096054077, | |
| "learning_rate": 0.00017983193277310925, | |
| "loss": 0.7848, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.04128870816390366, | |
| "grad_norm": 1.3345468044281006, | |
| "learning_rate": 0.00017949579831932775, | |
| "loss": 0.8953, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.041914294651235535, | |
| "grad_norm": 0.6861767768859863, | |
| "learning_rate": 0.00017915966386554625, | |
| "loss": 0.4162, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.04253988113856741, | |
| "grad_norm": 0.85309898853302, | |
| "learning_rate": 0.00017882352941176472, | |
| "loss": 0.6606, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.04316546762589928, | |
| "grad_norm": 1.0247780084609985, | |
| "learning_rate": 0.00017848739495798322, | |
| "loss": 0.5271, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.043791054113231156, | |
| "grad_norm": 1.3019441366195679, | |
| "learning_rate": 0.0001781512605042017, | |
| "loss": 0.5605, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.04441664060056303, | |
| "grad_norm": 1.1024900674819946, | |
| "learning_rate": 0.00017781512605042016, | |
| "loss": 0.9303, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.045042227087894904, | |
| "grad_norm": 1.079655408859253, | |
| "learning_rate": 0.00017747899159663866, | |
| "loss": 1.0138, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.04566781357522678, | |
| "grad_norm": 1.1078468561172485, | |
| "learning_rate": 0.00017714285714285713, | |
| "loss": 0.9861, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.04629340006255865, | |
| "grad_norm": 1.8648931980133057, | |
| "learning_rate": 0.00017680672268907563, | |
| "loss": 0.6756, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.046918986549890525, | |
| "grad_norm": 0.8588104248046875, | |
| "learning_rate": 0.00017647058823529413, | |
| "loss": 0.4867, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.046918986549890525, | |
| "eval_loss": 0.9139823913574219, | |
| "eval_runtime": 43.5635, | |
| "eval_samples_per_second": 5.876, | |
| "eval_steps_per_second": 2.938, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.0475445730372224, | |
| "grad_norm": 1.6970480680465698, | |
| "learning_rate": 0.0001761344537815126, | |
| "loss": 0.5523, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.04817015952455427, | |
| "grad_norm": 0.8562026023864746, | |
| "learning_rate": 0.0001757983193277311, | |
| "loss": 0.4084, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.048795746011886146, | |
| "grad_norm": 0.9487925171852112, | |
| "learning_rate": 0.0001754621848739496, | |
| "loss": 0.6204, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.04942133249921802, | |
| "grad_norm": 11.929024696350098, | |
| "learning_rate": 0.00017512605042016807, | |
| "loss": 1.1662, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.05004691898654989, | |
| "grad_norm": 1.3468140363693237, | |
| "learning_rate": 0.00017478991596638657, | |
| "loss": 0.8037, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.05067250547388177, | |
| "grad_norm": 0.7379503846168518, | |
| "learning_rate": 0.00017445378151260504, | |
| "loss": 0.6564, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.05129809196121364, | |
| "grad_norm": 1.0315027236938477, | |
| "learning_rate": 0.00017411764705882354, | |
| "loss": 0.6377, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.051923678448545514, | |
| "grad_norm": 0.5900093913078308, | |
| "learning_rate": 0.00017378151260504204, | |
| "loss": 0.5122, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.05254926493587739, | |
| "grad_norm": 1.5138239860534668, | |
| "learning_rate": 0.0001734453781512605, | |
| "loss": 0.4769, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.05317485142320926, | |
| "grad_norm": 1.016790747642517, | |
| "learning_rate": 0.000173109243697479, | |
| "loss": 0.6654, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.053800437910541135, | |
| "grad_norm": 1.1964718103408813, | |
| "learning_rate": 0.00017277310924369748, | |
| "loss": 0.6334, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.05442602439787301, | |
| "grad_norm": 1.102842092514038, | |
| "learning_rate": 0.00017243697478991598, | |
| "loss": 0.832, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.05505161088520488, | |
| "grad_norm": 6.609305381774902, | |
| "learning_rate": 0.00017210084033613448, | |
| "loss": 0.6112, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.055677197372536756, | |
| "grad_norm": 2.6627745628356934, | |
| "learning_rate": 0.00017176470588235293, | |
| "loss": 1.032, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.05630278385986863, | |
| "grad_norm": 2.114955425262451, | |
| "learning_rate": 0.00017142857142857143, | |
| "loss": 0.6116, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.0569283703472005, | |
| "grad_norm": 1.7707552909851074, | |
| "learning_rate": 0.00017109243697478992, | |
| "loss": 0.4766, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.05755395683453238, | |
| "grad_norm": 0.9983264803886414, | |
| "learning_rate": 0.0001707563025210084, | |
| "loss": 0.5397, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.05817954332186425, | |
| "grad_norm": 8.190524101257324, | |
| "learning_rate": 0.0001704201680672269, | |
| "loss": 0.9531, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.058805129809196124, | |
| "grad_norm": 1.9920661449432373, | |
| "learning_rate": 0.0001700840336134454, | |
| "loss": 1.3801, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.059430716296528, | |
| "grad_norm": 0.8791856169700623, | |
| "learning_rate": 0.00016974789915966387, | |
| "loss": 0.6218, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.06005630278385987, | |
| "grad_norm": 1.0745537281036377, | |
| "learning_rate": 0.00016941176470588237, | |
| "loss": 0.5578, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.060681889271191745, | |
| "grad_norm": 1.4266705513000488, | |
| "learning_rate": 0.00016907563025210084, | |
| "loss": 1.5821, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.06130747575852362, | |
| "grad_norm": 1.1001832485198975, | |
| "learning_rate": 0.00016873949579831934, | |
| "loss": 0.5972, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.06193306224585549, | |
| "grad_norm": 1.3168463706970215, | |
| "learning_rate": 0.00016840336134453784, | |
| "loss": 0.5794, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.06255864873318737, | |
| "grad_norm": 1.0342196226119995, | |
| "learning_rate": 0.0001680672268907563, | |
| "loss": 0.6827, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.06255864873318737, | |
| "eval_loss": 0.8885337114334106, | |
| "eval_runtime": 43.4886, | |
| "eval_samples_per_second": 5.887, | |
| "eval_steps_per_second": 2.943, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.06318423522051923, | |
| "grad_norm": 2.2497031688690186, | |
| "learning_rate": 0.0001677310924369748, | |
| "loss": 0.6468, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.06380982170785111, | |
| "grad_norm": 0.8061516284942627, | |
| "learning_rate": 0.00016739495798319328, | |
| "loss": 0.5388, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.06443540819518298, | |
| "grad_norm": 0.6954531669616699, | |
| "learning_rate": 0.00016705882352941178, | |
| "loss": 0.3191, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.06506099468251486, | |
| "grad_norm": 1.3721911907196045, | |
| "learning_rate": 0.00016672268907563028, | |
| "loss": 0.9, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.06568658116984673, | |
| "grad_norm": 1.084492564201355, | |
| "learning_rate": 0.00016638655462184875, | |
| "loss": 0.6144, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.06631216765717861, | |
| "grad_norm": 3.317697525024414, | |
| "learning_rate": 0.00016605042016806725, | |
| "loss": 0.634, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.06693775414451048, | |
| "grad_norm": 2.5598530769348145, | |
| "learning_rate": 0.00016571428571428575, | |
| "loss": 0.8931, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.06756334063184236, | |
| "grad_norm": 3.6414177417755127, | |
| "learning_rate": 0.0001653781512605042, | |
| "loss": 0.7226, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.06818892711917422, | |
| "grad_norm": 2.2443768978118896, | |
| "learning_rate": 0.0001650420168067227, | |
| "loss": 0.8862, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.0688145136065061, | |
| "grad_norm": 0.6285691857337952, | |
| "learning_rate": 0.0001647058823529412, | |
| "loss": 0.3766, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.06944010009383797, | |
| "grad_norm": 0.6171959042549133, | |
| "learning_rate": 0.00016436974789915966, | |
| "loss": 0.2821, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.07006568658116985, | |
| "grad_norm": 1.0057804584503174, | |
| "learning_rate": 0.00016403361344537816, | |
| "loss": 0.6293, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.07069127306850172, | |
| "grad_norm": 1.3190034627914429, | |
| "learning_rate": 0.00016369747899159663, | |
| "loss": 0.5547, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.0713168595558336, | |
| "grad_norm": 0.518517017364502, | |
| "learning_rate": 0.00016336134453781513, | |
| "loss": 0.1951, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.07194244604316546, | |
| "grad_norm": 0.848175048828125, | |
| "learning_rate": 0.00016302521008403363, | |
| "loss": 0.5091, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.07256803253049735, | |
| "grad_norm": 0.7387409806251526, | |
| "learning_rate": 0.0001626890756302521, | |
| "loss": 0.3872, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.07319361901782921, | |
| "grad_norm": 2.828091859817505, | |
| "learning_rate": 0.0001623529411764706, | |
| "loss": 1.2046, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.07381920550516109, | |
| "grad_norm": 1.7653822898864746, | |
| "learning_rate": 0.00016201680672268907, | |
| "loss": 1.8133, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.07444479199249296, | |
| "grad_norm": 3.5097360610961914, | |
| "learning_rate": 0.00016168067226890757, | |
| "loss": 0.6837, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.07507037847982484, | |
| "grad_norm": 1.3884797096252441, | |
| "learning_rate": 0.00016134453781512607, | |
| "loss": 0.8846, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.0756959649671567, | |
| "grad_norm": 22.705190658569336, | |
| "learning_rate": 0.00016100840336134454, | |
| "loss": 0.7281, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.07632155145448859, | |
| "grad_norm": 3.1223599910736084, | |
| "learning_rate": 0.00016067226890756304, | |
| "loss": 0.6254, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.07694713794182045, | |
| "grad_norm": 0.530583381652832, | |
| "learning_rate": 0.00016033613445378154, | |
| "loss": 0.3292, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.07757272442915233, | |
| "grad_norm": 1.4720183610916138, | |
| "learning_rate": 0.00016, | |
| "loss": 0.8192, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.0781983109164842, | |
| "grad_norm": 0.6448870301246643, | |
| "learning_rate": 0.0001596638655462185, | |
| "loss": 0.2431, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.0781983109164842, | |
| "eval_loss": 0.890012800693512, | |
| "eval_runtime": 43.5059, | |
| "eval_samples_per_second": 5.884, | |
| "eval_steps_per_second": 2.942, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.07882389740381608, | |
| "grad_norm": 1.803906798362732, | |
| "learning_rate": 0.00015932773109243698, | |
| "loss": 0.8937, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.07944948389114795, | |
| "grad_norm": 2.2447054386138916, | |
| "learning_rate": 0.00015899159663865546, | |
| "loss": 0.6993, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.08007507037847983, | |
| "grad_norm": 0.6667381525039673, | |
| "learning_rate": 0.00015865546218487396, | |
| "loss": 0.4266, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.0807006568658117, | |
| "grad_norm": 1.1449408531188965, | |
| "learning_rate": 0.00015831932773109243, | |
| "loss": 0.5557, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.08132624335314358, | |
| "grad_norm": 1.399849534034729, | |
| "learning_rate": 0.00015798319327731093, | |
| "loss": 0.6761, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.08195182984047544, | |
| "grad_norm": 0.745627760887146, | |
| "learning_rate": 0.00015764705882352943, | |
| "loss": 0.5323, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.08257741632780732, | |
| "grad_norm": 1.162428379058838, | |
| "learning_rate": 0.0001573109243697479, | |
| "loss": 0.8231, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.08320300281513919, | |
| "grad_norm": 1.0329734086990356, | |
| "learning_rate": 0.0001569747899159664, | |
| "loss": 0.6179, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.08382858930247107, | |
| "grad_norm": 0.5739912986755371, | |
| "learning_rate": 0.00015663865546218487, | |
| "loss": 0.2515, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.08445417578980294, | |
| "grad_norm": 1.2065409421920776, | |
| "learning_rate": 0.00015630252100840337, | |
| "loss": 0.6161, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.08507976227713482, | |
| "grad_norm": 1.1025582551956177, | |
| "learning_rate": 0.00015596638655462187, | |
| "loss": 0.5926, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.08570534876446669, | |
| "grad_norm": 0.78680020570755, | |
| "learning_rate": 0.00015563025210084034, | |
| "loss": 0.9987, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.08633093525179857, | |
| "grad_norm": 0.6232782006263733, | |
| "learning_rate": 0.00015529411764705884, | |
| "loss": 0.4952, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.08695652173913043, | |
| "grad_norm": 3.347989559173584, | |
| "learning_rate": 0.00015495798319327734, | |
| "loss": 1.0787, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.08758210822646231, | |
| "grad_norm": 0.9020625352859497, | |
| "learning_rate": 0.0001546218487394958, | |
| "loss": 0.354, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.08820769471379418, | |
| "grad_norm": 1.8955539464950562, | |
| "learning_rate": 0.0001542857142857143, | |
| "loss": 0.5515, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.08883328120112606, | |
| "grad_norm": 5.194116115570068, | |
| "learning_rate": 0.00015394957983193278, | |
| "loss": 0.6843, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.08945886768845793, | |
| "grad_norm": 1.4467953443527222, | |
| "learning_rate": 0.00015361344537815128, | |
| "loss": 0.4236, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.09008445417578981, | |
| "grad_norm": 0.523921012878418, | |
| "learning_rate": 0.00015327731092436978, | |
| "loss": 0.2165, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.09071004066312167, | |
| "grad_norm": 1.653648018836975, | |
| "learning_rate": 0.00015294117647058822, | |
| "loss": 1.0643, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.09133562715045355, | |
| "grad_norm": 0.6991509199142456, | |
| "learning_rate": 0.00015260504201680672, | |
| "loss": 0.4398, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.09196121363778542, | |
| "grad_norm": 1.3986660242080688, | |
| "learning_rate": 0.00015226890756302522, | |
| "loss": 0.8488, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.0925868001251173, | |
| "grad_norm": 1.2424954175949097, | |
| "learning_rate": 0.0001519327731092437, | |
| "loss": 0.9516, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.09321238661244917, | |
| "grad_norm": 0.8900560140609741, | |
| "learning_rate": 0.0001515966386554622, | |
| "loss": 0.767, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.09383797309978105, | |
| "grad_norm": 40.042503356933594, | |
| "learning_rate": 0.00015126050420168066, | |
| "loss": 0.9691, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.09383797309978105, | |
| "eval_loss": 0.8660734295845032, | |
| "eval_runtime": 43.5102, | |
| "eval_samples_per_second": 5.884, | |
| "eval_steps_per_second": 2.942, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.09446355958711292, | |
| "grad_norm": 2.816359519958496, | |
| "learning_rate": 0.00015092436974789916, | |
| "loss": 1.4959, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.0950891460744448, | |
| "grad_norm": 1.9332157373428345, | |
| "learning_rate": 0.00015058823529411766, | |
| "loss": 0.6786, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.09571473256177666, | |
| "grad_norm": 1.2608965635299683, | |
| "learning_rate": 0.00015025210084033613, | |
| "loss": 1.1282, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.09634031904910854, | |
| "grad_norm": 1.0167793035507202, | |
| "learning_rate": 0.00014991596638655463, | |
| "loss": 0.4932, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.09696590553644041, | |
| "grad_norm": 1.6121408939361572, | |
| "learning_rate": 0.00014957983193277313, | |
| "loss": 0.7193, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.09759149202377229, | |
| "grad_norm": 2.4104394912719727, | |
| "learning_rate": 0.0001492436974789916, | |
| "loss": 0.4472, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.09821707851110416, | |
| "grad_norm": 1.1095707416534424, | |
| "learning_rate": 0.0001489075630252101, | |
| "loss": 0.7595, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.09884266499843604, | |
| "grad_norm": 1.686458945274353, | |
| "learning_rate": 0.00014857142857142857, | |
| "loss": 0.5686, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.0994682514857679, | |
| "grad_norm": 3.2238378524780273, | |
| "learning_rate": 0.00014823529411764707, | |
| "loss": 0.4236, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.10009383797309979, | |
| "grad_norm": 1.800552248954773, | |
| "learning_rate": 0.00014789915966386557, | |
| "loss": 0.9519, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.10071942446043165, | |
| "grad_norm": 0.6441445350646973, | |
| "learning_rate": 0.00014756302521008404, | |
| "loss": 0.4119, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.10134501094776353, | |
| "grad_norm": 0.5892903804779053, | |
| "learning_rate": 0.00014722689075630254, | |
| "loss": 0.2956, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.1019705974350954, | |
| "grad_norm": 0.8733301758766174, | |
| "learning_rate": 0.00014689075630252101, | |
| "loss": 0.5749, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.10259618392242728, | |
| "grad_norm": 1.0460662841796875, | |
| "learning_rate": 0.0001465546218487395, | |
| "loss": 0.8167, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.10322177040975915, | |
| "grad_norm": 0.8178017735481262, | |
| "learning_rate": 0.00014621848739495799, | |
| "loss": 0.9027, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.10384735689709103, | |
| "grad_norm": 0.5698068737983704, | |
| "learning_rate": 0.00014588235294117646, | |
| "loss": 0.1829, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.1044729433844229, | |
| "grad_norm": 1.0011018514633179, | |
| "learning_rate": 0.00014554621848739496, | |
| "loss": 0.8985, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.10509852987175478, | |
| "grad_norm": 1.189772367477417, | |
| "learning_rate": 0.00014521008403361346, | |
| "loss": 0.5547, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.10572411635908664, | |
| "grad_norm": 0.7990069389343262, | |
| "learning_rate": 0.00014487394957983193, | |
| "loss": 0.6222, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.10634970284641852, | |
| "grad_norm": 0.6419771313667297, | |
| "learning_rate": 0.00014453781512605043, | |
| "loss": 0.3225, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.10697528933375039, | |
| "grad_norm": 0.8978354930877686, | |
| "learning_rate": 0.00014420168067226893, | |
| "loss": 0.4567, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.10760087582108227, | |
| "grad_norm": 0.7193794250488281, | |
| "learning_rate": 0.0001438655462184874, | |
| "loss": 0.4793, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.10822646230841414, | |
| "grad_norm": 0.9533759355545044, | |
| "learning_rate": 0.0001435294117647059, | |
| "loss": 1.4397, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.10885204879574602, | |
| "grad_norm": 0.48348739743232727, | |
| "learning_rate": 0.00014319327731092437, | |
| "loss": 0.3398, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.10947763528307788, | |
| "grad_norm": 0.7699019312858582, | |
| "learning_rate": 0.00014285714285714287, | |
| "loss": 0.7491, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.10947763528307788, | |
| "eval_loss": 0.8425782322883606, | |
| "eval_runtime": 43.5013, | |
| "eval_samples_per_second": 5.885, | |
| "eval_steps_per_second": 2.942, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.11010322177040976, | |
| "grad_norm": 0.9201186895370483, | |
| "learning_rate": 0.00014252100840336137, | |
| "loss": 0.6919, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.11072880825774163, | |
| "grad_norm": 0.8190593123435974, | |
| "learning_rate": 0.00014218487394957984, | |
| "loss": 0.6262, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.11135439474507351, | |
| "grad_norm": 0.9715782403945923, | |
| "learning_rate": 0.00014184873949579834, | |
| "loss": 0.8364, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.11197998123240538, | |
| "grad_norm": 0.6699782609939575, | |
| "learning_rate": 0.0001415126050420168, | |
| "loss": 0.4898, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.11260556771973726, | |
| "grad_norm": 1.8386518955230713, | |
| "learning_rate": 0.0001411764705882353, | |
| "loss": 0.7812, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.11323115420706913, | |
| "grad_norm": 0.7240263819694519, | |
| "learning_rate": 0.0001408403361344538, | |
| "loss": 0.5508, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.113856740694401, | |
| "grad_norm": 0.6068630814552307, | |
| "learning_rate": 0.00014050420168067225, | |
| "loss": 0.5151, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.11448232718173287, | |
| "grad_norm": 1.6705517768859863, | |
| "learning_rate": 0.00014016806722689075, | |
| "loss": 1.2281, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.11510791366906475, | |
| "grad_norm": 1.6179956197738647, | |
| "learning_rate": 0.00013983193277310925, | |
| "loss": 0.7365, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.11573350015639662, | |
| "grad_norm": 1.5741758346557617, | |
| "learning_rate": 0.00013949579831932772, | |
| "loss": 1.0039, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.1163590866437285, | |
| "grad_norm": 0.9270511865615845, | |
| "learning_rate": 0.00013915966386554622, | |
| "loss": 0.5768, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.11698467313106037, | |
| "grad_norm": 1.3651914596557617, | |
| "learning_rate": 0.00013882352941176472, | |
| "loss": 0.7715, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.11761025961839225, | |
| "grad_norm": 1.4330601692199707, | |
| "learning_rate": 0.0001384873949579832, | |
| "loss": 0.4462, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.11823584610572412, | |
| "grad_norm": 0.9181672930717468, | |
| "learning_rate": 0.0001381512605042017, | |
| "loss": 0.3901, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.118861432593056, | |
| "grad_norm": 0.5304622650146484, | |
| "learning_rate": 0.00013781512605042016, | |
| "loss": 0.1718, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.11948701908038786, | |
| "grad_norm": 0.7475191354751587, | |
| "learning_rate": 0.00013747899159663866, | |
| "loss": 0.3602, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.12011260556771974, | |
| "grad_norm": 1.2558002471923828, | |
| "learning_rate": 0.00013714285714285716, | |
| "loss": 0.8558, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.12073819205505161, | |
| "grad_norm": 0.9859037399291992, | |
| "learning_rate": 0.00013680672268907563, | |
| "loss": 0.7155, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.12136377854238349, | |
| "grad_norm": 0.6028466820716858, | |
| "learning_rate": 0.00013647058823529413, | |
| "loss": 0.9596, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.12198936502971536, | |
| "grad_norm": 0.5713469386100769, | |
| "learning_rate": 0.0001361344537815126, | |
| "loss": 0.3442, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.12261495151704724, | |
| "grad_norm": 1.0781211853027344, | |
| "learning_rate": 0.0001357983193277311, | |
| "loss": 0.5569, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.1232405380043791, | |
| "grad_norm": 0.7850176095962524, | |
| "learning_rate": 0.0001354621848739496, | |
| "loss": 0.5853, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.12386612449171099, | |
| "grad_norm": 0.8100555539131165, | |
| "learning_rate": 0.00013512605042016807, | |
| "loss": 0.8285, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.12449171097904285, | |
| "grad_norm": 1.106834888458252, | |
| "learning_rate": 0.00013478991596638657, | |
| "loss": 0.9521, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.12511729746637473, | |
| "grad_norm": 1.4412230253219604, | |
| "learning_rate": 0.00013445378151260507, | |
| "loss": 0.6478, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.12511729746637473, | |
| "eval_loss": 0.8300326466560364, | |
| "eval_runtime": 43.5102, | |
| "eval_samples_per_second": 5.884, | |
| "eval_steps_per_second": 2.942, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.1257428839537066, | |
| "grad_norm": 1.7852795124053955, | |
| "learning_rate": 0.00013411764705882352, | |
| "loss": 0.5687, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.12636847044103847, | |
| "grad_norm": 2.423583745956421, | |
| "learning_rate": 0.00013378151260504202, | |
| "loss": 0.9082, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.12699405692837035, | |
| "grad_norm": 1.538001298904419, | |
| "learning_rate": 0.00013344537815126052, | |
| "loss": 0.7143, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.12761964341570223, | |
| "grad_norm": 1.7380592823028564, | |
| "learning_rate": 0.000133109243697479, | |
| "loss": 0.8296, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.1282452299030341, | |
| "grad_norm": 0.8279218673706055, | |
| "learning_rate": 0.0001327731092436975, | |
| "loss": 0.6719, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.12887081639036596, | |
| "grad_norm": 0.7059926986694336, | |
| "learning_rate": 0.00013243697478991596, | |
| "loss": 0.4785, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.12949640287769784, | |
| "grad_norm": 0.6946935653686523, | |
| "learning_rate": 0.00013210084033613446, | |
| "loss": 0.4578, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.13012198936502972, | |
| "grad_norm": 0.9800712466239929, | |
| "learning_rate": 0.00013176470588235296, | |
| "loss": 1.4369, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.1307475758523616, | |
| "grad_norm": 0.708831787109375, | |
| "learning_rate": 0.00013142857142857143, | |
| "loss": 0.5071, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.13137316233969346, | |
| "grad_norm": 1.0098780393600464, | |
| "learning_rate": 0.00013109243697478993, | |
| "loss": 0.9155, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.13199874882702534, | |
| "grad_norm": 1.1598243713378906, | |
| "learning_rate": 0.0001307563025210084, | |
| "loss": 0.3757, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.13262433531435722, | |
| "grad_norm": 0.7583935260772705, | |
| "learning_rate": 0.0001304201680672269, | |
| "loss": 0.3365, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.1332499218016891, | |
| "grad_norm": 1.0866564512252808, | |
| "learning_rate": 0.0001300840336134454, | |
| "loss": 0.6398, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.13387550828902095, | |
| "grad_norm": 1.4322006702423096, | |
| "learning_rate": 0.00012974789915966387, | |
| "loss": 0.6427, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.13450109477635283, | |
| "grad_norm": 1.600325345993042, | |
| "learning_rate": 0.00012941176470588237, | |
| "loss": 0.6884, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.1351266812636847, | |
| "grad_norm": 1.0634167194366455, | |
| "learning_rate": 0.00012907563025210087, | |
| "loss": 1.0343, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.13575226775101656, | |
| "grad_norm": 0.9889366626739502, | |
| "learning_rate": 0.00012873949579831934, | |
| "loss": 0.717, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.13637785423834844, | |
| "grad_norm": 2.0635392665863037, | |
| "learning_rate": 0.00012840336134453784, | |
| "loss": 0.5965, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.13700344072568033, | |
| "grad_norm": 0.8937773704528809, | |
| "learning_rate": 0.0001280672268907563, | |
| "loss": 0.7281, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.1376290272130122, | |
| "grad_norm": 0.9768427014350891, | |
| "learning_rate": 0.00012773109243697478, | |
| "loss": 0.5687, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.13825461370034406, | |
| "grad_norm": 1.3913767337799072, | |
| "learning_rate": 0.00012739495798319328, | |
| "loss": 0.3984, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.13888020018767594, | |
| "grad_norm": 1.4933342933654785, | |
| "learning_rate": 0.00012705882352941175, | |
| "loss": 1.2441, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.13950578667500782, | |
| "grad_norm": 1.0846196413040161, | |
| "learning_rate": 0.00012672268907563025, | |
| "loss": 0.9013, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.1401313731623397, | |
| "grad_norm": 0.7788563370704651, | |
| "learning_rate": 0.00012638655462184875, | |
| "loss": 0.4674, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.14075695964967155, | |
| "grad_norm": 0.7341142296791077, | |
| "learning_rate": 0.00012605042016806722, | |
| "loss": 1.3271, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.14075695964967155, | |
| "eval_loss": 0.8179877996444702, | |
| "eval_runtime": 43.5514, | |
| "eval_samples_per_second": 5.878, | |
| "eval_steps_per_second": 2.939, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.14138254613700343, | |
| "grad_norm": 6.473598480224609, | |
| "learning_rate": 0.00012571428571428572, | |
| "loss": 0.6219, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.14200813262433531, | |
| "grad_norm": 0.9846400022506714, | |
| "learning_rate": 0.0001253781512605042, | |
| "loss": 0.4407, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.1426337191116672, | |
| "grad_norm": 0.7880604267120361, | |
| "learning_rate": 0.0001250420168067227, | |
| "loss": 0.3927, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.14325930559899905, | |
| "grad_norm": 1.5999399423599243, | |
| "learning_rate": 0.0001247058823529412, | |
| "loss": 0.6917, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.14388489208633093, | |
| "grad_norm": 0.8072729110717773, | |
| "learning_rate": 0.00012436974789915966, | |
| "loss": 0.4909, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.1445104785736628, | |
| "grad_norm": 2.2560601234436035, | |
| "learning_rate": 0.00012403361344537816, | |
| "loss": 0.3355, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.1451360650609947, | |
| "grad_norm": 0.9964832663536072, | |
| "learning_rate": 0.00012369747899159666, | |
| "loss": 0.4436, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.14576165154832654, | |
| "grad_norm": 1.1081007719039917, | |
| "learning_rate": 0.00012336134453781513, | |
| "loss": 0.6582, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.14638723803565842, | |
| "grad_norm": 0.9722908735275269, | |
| "learning_rate": 0.00012302521008403363, | |
| "loss": 0.7412, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.1470128245229903, | |
| "grad_norm": 0.7456592917442322, | |
| "learning_rate": 0.0001226890756302521, | |
| "loss": 0.4303, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.14763841101032218, | |
| "grad_norm": 1.0428457260131836, | |
| "learning_rate": 0.0001223529411764706, | |
| "loss": 1.0538, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.14826399749765404, | |
| "grad_norm": 0.9209719896316528, | |
| "learning_rate": 0.00012201680672268909, | |
| "loss": 0.5864, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.14888958398498592, | |
| "grad_norm": 0.990292489528656, | |
| "learning_rate": 0.00012168067226890756, | |
| "loss": 0.5929, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.1495151704723178, | |
| "grad_norm": 0.6086494326591492, | |
| "learning_rate": 0.00012134453781512605, | |
| "loss": 0.4436, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.15014075695964968, | |
| "grad_norm": 1.429149866104126, | |
| "learning_rate": 0.00012100840336134453, | |
| "loss": 0.246, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.15076634344698153, | |
| "grad_norm": 1.8170491456985474, | |
| "learning_rate": 0.00012067226890756302, | |
| "loss": 0.6574, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.1513919299343134, | |
| "grad_norm": 1.1577768325805664, | |
| "learning_rate": 0.00012033613445378152, | |
| "loss": 0.5706, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.1520175164216453, | |
| "grad_norm": 0.7442137598991394, | |
| "learning_rate": 0.00012, | |
| "loss": 0.2772, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.15264310290897717, | |
| "grad_norm": 1.1375997066497803, | |
| "learning_rate": 0.00011966386554621849, | |
| "loss": 0.397, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.15326868939630903, | |
| "grad_norm": 0.8451513648033142, | |
| "learning_rate": 0.00011932773109243697, | |
| "loss": 0.5425, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.1538942758836409, | |
| "grad_norm": 0.7176560163497925, | |
| "learning_rate": 0.00011899159663865547, | |
| "loss": 0.4398, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.1545198623709728, | |
| "grad_norm": 1.049872875213623, | |
| "learning_rate": 0.00011865546218487396, | |
| "loss": 0.6479, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.15514544885830467, | |
| "grad_norm": 0.6093642115592957, | |
| "learning_rate": 0.00011831932773109244, | |
| "loss": 0.6125, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.15577103534563652, | |
| "grad_norm": 0.9963379502296448, | |
| "learning_rate": 0.00011798319327731093, | |
| "loss": 0.3768, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.1563966218329684, | |
| "grad_norm": 3.4668896198272705, | |
| "learning_rate": 0.00011764705882352942, | |
| "loss": 0.3744, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.1563966218329684, | |
| "eval_loss": 0.8456696271896362, | |
| "eval_runtime": 43.5223, | |
| "eval_samples_per_second": 5.882, | |
| "eval_steps_per_second": 2.941, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.15702220832030028, | |
| "grad_norm": 0.6826130747795105, | |
| "learning_rate": 0.00011731092436974791, | |
| "loss": 0.4877, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.15764779480763216, | |
| "grad_norm": 1.8045300245285034, | |
| "learning_rate": 0.0001169747899159664, | |
| "loss": 0.9699, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.15827338129496402, | |
| "grad_norm": 0.7311923503875732, | |
| "learning_rate": 0.00011663865546218489, | |
| "loss": 0.4648, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.1588989677822959, | |
| "grad_norm": 1.7481943368911743, | |
| "learning_rate": 0.00011630252100840337, | |
| "loss": 0.8871, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.15952455426962778, | |
| "grad_norm": 2.6331326961517334, | |
| "learning_rate": 0.00011596638655462187, | |
| "loss": 0.8109, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.16015014075695966, | |
| "grad_norm": 0.899364709854126, | |
| "learning_rate": 0.00011563025210084036, | |
| "loss": 0.5021, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.1607757272442915, | |
| "grad_norm": 0.922218918800354, | |
| "learning_rate": 0.00011529411764705881, | |
| "loss": 0.5741, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.1614013137316234, | |
| "grad_norm": 5.335756301879883, | |
| "learning_rate": 0.00011495798319327731, | |
| "loss": 0.842, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.16202690021895527, | |
| "grad_norm": 0.8632665872573853, | |
| "learning_rate": 0.0001146218487394958, | |
| "loss": 0.4208, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.16265248670628715, | |
| "grad_norm": 4.576591968536377, | |
| "learning_rate": 0.00011428571428571428, | |
| "loss": 0.8813, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.163278073193619, | |
| "grad_norm": 0.907714307308197, | |
| "learning_rate": 0.00011394957983193277, | |
| "loss": 0.7204, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.16390365968095089, | |
| "grad_norm": 0.8328534960746765, | |
| "learning_rate": 0.00011361344537815127, | |
| "loss": 0.7552, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.16452924616828277, | |
| "grad_norm": 1.0882028341293335, | |
| "learning_rate": 0.00011327731092436975, | |
| "loss": 0.9079, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.16515483265561465, | |
| "grad_norm": 1.0093358755111694, | |
| "learning_rate": 0.00011294117647058824, | |
| "loss": 0.6284, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.1657804191429465, | |
| "grad_norm": 0.853907585144043, | |
| "learning_rate": 0.00011260504201680672, | |
| "loss": 0.508, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.16640600563027838, | |
| "grad_norm": 1.0016460418701172, | |
| "learning_rate": 0.00011226890756302521, | |
| "loss": 0.597, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.16703159211761026, | |
| "grad_norm": 1.0138968229293823, | |
| "learning_rate": 0.00011193277310924371, | |
| "loss": 0.9238, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.16765717860494214, | |
| "grad_norm": 1.1728049516677856, | |
| "learning_rate": 0.0001115966386554622, | |
| "loss": 0.9152, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.168282765092274, | |
| "grad_norm": 1.2228264808654785, | |
| "learning_rate": 0.00011126050420168068, | |
| "loss": 0.7483, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.16890835157960588, | |
| "grad_norm": 0.6260212659835815, | |
| "learning_rate": 0.00011092436974789917, | |
| "loss": 0.5566, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.16953393806693776, | |
| "grad_norm": 0.7589625716209412, | |
| "learning_rate": 0.00011058823529411766, | |
| "loss": 0.6242, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.17015952455426964, | |
| "grad_norm": 1.1016935110092163, | |
| "learning_rate": 0.00011025210084033615, | |
| "loss": 0.4419, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.1707851110416015, | |
| "grad_norm": 0.8092851042747498, | |
| "learning_rate": 0.00010991596638655464, | |
| "loss": 0.5168, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.17141069752893337, | |
| "grad_norm": 1.012885332107544, | |
| "learning_rate": 0.00010957983193277312, | |
| "loss": 0.4334, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.17203628401626525, | |
| "grad_norm": 2.6073336601257324, | |
| "learning_rate": 0.00010924369747899159, | |
| "loss": 0.5262, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.17203628401626525, | |
| "eval_loss": 0.8115787506103516, | |
| "eval_runtime": 43.4931, | |
| "eval_samples_per_second": 5.886, | |
| "eval_steps_per_second": 2.943, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.17266187050359713, | |
| "grad_norm": 5.577237606048584, | |
| "learning_rate": 0.00010890756302521008, | |
| "loss": 1.0595, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.17328745699092898, | |
| "grad_norm": 1.1434190273284912, | |
| "learning_rate": 0.00010857142857142856, | |
| "loss": 0.4401, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.17391304347826086, | |
| "grad_norm": 0.951992928981781, | |
| "learning_rate": 0.00010823529411764706, | |
| "loss": 0.4393, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.17453862996559275, | |
| "grad_norm": 0.6695138216018677, | |
| "learning_rate": 0.00010789915966386555, | |
| "loss": 0.314, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.17516421645292463, | |
| "grad_norm": 0.40990278124809265, | |
| "learning_rate": 0.00010756302521008403, | |
| "loss": 0.192, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.17578980294025648, | |
| "grad_norm": 0.9555610418319702, | |
| "learning_rate": 0.00010722689075630252, | |
| "loss": 0.3646, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.17641538942758836, | |
| "grad_norm": 0.7370548844337463, | |
| "learning_rate": 0.000106890756302521, | |
| "loss": 0.8997, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.17704097591492024, | |
| "grad_norm": 1.0178982019424438, | |
| "learning_rate": 0.0001065546218487395, | |
| "loss": 0.986, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.17766656240225212, | |
| "grad_norm": 0.41388389468193054, | |
| "learning_rate": 0.00010621848739495799, | |
| "loss": 0.2069, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.17829214888958397, | |
| "grad_norm": 0.7140624523162842, | |
| "learning_rate": 0.00010588235294117647, | |
| "loss": 0.4852, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.17891773537691585, | |
| "grad_norm": 0.7758356332778931, | |
| "learning_rate": 0.00010554621848739496, | |
| "loss": 0.3943, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.17954332186424773, | |
| "grad_norm": 1.4193260669708252, | |
| "learning_rate": 0.00010521008403361346, | |
| "loss": 0.6412, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.18016890835157962, | |
| "grad_norm": 0.7264838814735413, | |
| "learning_rate": 0.00010487394957983194, | |
| "loss": 0.7834, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.18079449483891147, | |
| "grad_norm": 2.4300973415374756, | |
| "learning_rate": 0.00010453781512605043, | |
| "loss": 0.7462, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.18142008132624335, | |
| "grad_norm": 1.033916711807251, | |
| "learning_rate": 0.00010420168067226892, | |
| "loss": 0.5241, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.18204566781357523, | |
| "grad_norm": 0.5583767294883728, | |
| "learning_rate": 0.00010386554621848741, | |
| "loss": 0.7815, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.1826712543009071, | |
| "grad_norm": 0.7440481781959534, | |
| "learning_rate": 0.0001035294117647059, | |
| "loss": 0.4674, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.18329684078823896, | |
| "grad_norm": 4.230656147003174, | |
| "learning_rate": 0.00010319327731092439, | |
| "loss": 0.5219, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.18392242727557084, | |
| "grad_norm": 0.6165269017219543, | |
| "learning_rate": 0.00010285714285714286, | |
| "loss": 0.3274, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.18454801376290272, | |
| "grad_norm": 0.5844498872756958, | |
| "learning_rate": 0.00010252100840336134, | |
| "loss": 0.3719, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.1851736002502346, | |
| "grad_norm": 0.9936206936836243, | |
| "learning_rate": 0.00010218487394957983, | |
| "loss": 1.0453, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.18579918673756646, | |
| "grad_norm": 1.749831199645996, | |
| "learning_rate": 0.00010184873949579831, | |
| "loss": 0.6634, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.18642477322489834, | |
| "grad_norm": 0.4740132689476013, | |
| "learning_rate": 0.0001015126050420168, | |
| "loss": 0.2901, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.18705035971223022, | |
| "grad_norm": 0.664300262928009, | |
| "learning_rate": 0.0001011764705882353, | |
| "loss": 0.5869, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.1876759461995621, | |
| "grad_norm": 0.7400941252708435, | |
| "learning_rate": 0.00010084033613445378, | |
| "loss": 0.7881, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.1876759461995621, | |
| "eval_loss": 0.7877693772315979, | |
| "eval_runtime": 43.5162, | |
| "eval_samples_per_second": 5.883, | |
| "eval_steps_per_second": 2.941, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.18830153268689395, | |
| "grad_norm": 0.6142858862876892, | |
| "learning_rate": 0.00010050420168067227, | |
| "loss": 0.3808, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.18892711917422583, | |
| "grad_norm": 1.991969347000122, | |
| "learning_rate": 0.00010016806722689076, | |
| "loss": 0.7035, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.1895527056615577, | |
| "grad_norm": 0.6220730543136597, | |
| "learning_rate": 9.983193277310925e-05, | |
| "loss": 0.2548, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.1901782921488896, | |
| "grad_norm": 0.6476833820343018, | |
| "learning_rate": 9.949579831932774e-05, | |
| "loss": 0.3569, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.19080387863622145, | |
| "grad_norm": 0.7133951783180237, | |
| "learning_rate": 9.915966386554623e-05, | |
| "loss": 0.4744, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.19142946512355333, | |
| "grad_norm": 0.6500736474990845, | |
| "learning_rate": 9.882352941176471e-05, | |
| "loss": 0.4653, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.1920550516108852, | |
| "grad_norm": 1.1231927871704102, | |
| "learning_rate": 9.848739495798321e-05, | |
| "loss": 0.818, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.1926806380982171, | |
| "grad_norm": 0.8654798865318298, | |
| "learning_rate": 9.815126050420168e-05, | |
| "loss": 0.7065, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.19330622458554894, | |
| "grad_norm": 0.45660969614982605, | |
| "learning_rate": 9.781512605042017e-05, | |
| "loss": 0.2412, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.19393181107288082, | |
| "grad_norm": 0.9538519978523254, | |
| "learning_rate": 9.747899159663865e-05, | |
| "loss": 1.3428, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.1945573975602127, | |
| "grad_norm": 0.596633791923523, | |
| "learning_rate": 9.714285714285715e-05, | |
| "loss": 0.5119, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.19518298404754458, | |
| "grad_norm": 0.5247074365615845, | |
| "learning_rate": 9.680672268907564e-05, | |
| "loss": 0.6413, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.19580857053487644, | |
| "grad_norm": 0.7713050246238708, | |
| "learning_rate": 9.647058823529412e-05, | |
| "loss": 0.49, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.19643415702220832, | |
| "grad_norm": 0.6971513628959656, | |
| "learning_rate": 9.613445378151261e-05, | |
| "loss": 0.6505, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.1970597435095402, | |
| "grad_norm": 0.5454917550086975, | |
| "learning_rate": 9.579831932773111e-05, | |
| "loss": 0.7018, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.19768532999687208, | |
| "grad_norm": 0.8349499702453613, | |
| "learning_rate": 9.546218487394959e-05, | |
| "loss": 0.3179, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.19831091648420393, | |
| "grad_norm": 0.5682560801506042, | |
| "learning_rate": 9.512605042016806e-05, | |
| "loss": 0.4003, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.1989365029715358, | |
| "grad_norm": 0.5094739198684692, | |
| "learning_rate": 9.478991596638655e-05, | |
| "loss": 0.313, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.1995620894588677, | |
| "grad_norm": 1.7074236869812012, | |
| "learning_rate": 9.445378151260505e-05, | |
| "loss": 0.9912, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.20018767594619957, | |
| "grad_norm": 1.1477283239364624, | |
| "learning_rate": 9.411764705882353e-05, | |
| "loss": 0.851, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.20081326243353143, | |
| "grad_norm": 0.6616579294204712, | |
| "learning_rate": 9.378151260504202e-05, | |
| "loss": 0.4844, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.2014388489208633, | |
| "grad_norm": 1.0401920080184937, | |
| "learning_rate": 9.34453781512605e-05, | |
| "loss": 0.5421, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.2020644354081952, | |
| "grad_norm": 0.729664146900177, | |
| "learning_rate": 9.3109243697479e-05, | |
| "loss": 0.6632, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.20269002189552707, | |
| "grad_norm": 0.6752575635910034, | |
| "learning_rate": 9.277310924369749e-05, | |
| "loss": 0.4352, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.20331560838285892, | |
| "grad_norm": 0.7963948249816895, | |
| "learning_rate": 9.243697478991598e-05, | |
| "loss": 0.7614, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.20331560838285892, | |
| "eval_loss": 0.771190881729126, | |
| "eval_runtime": 43.551, | |
| "eval_samples_per_second": 5.878, | |
| "eval_steps_per_second": 2.939, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.2039411948701908, | |
| "grad_norm": 0.7778791189193726, | |
| "learning_rate": 9.210084033613445e-05, | |
| "loss": 0.7251, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.20456678135752268, | |
| "grad_norm": 3.0929737091064453, | |
| "learning_rate": 9.176470588235295e-05, | |
| "loss": 0.5375, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.20519236784485456, | |
| "grad_norm": 0.6188391447067261, | |
| "learning_rate": 9.142857142857143e-05, | |
| "loss": 0.4007, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.20581795433218641, | |
| "grad_norm": 0.9423925876617432, | |
| "learning_rate": 9.109243697478992e-05, | |
| "loss": 0.5059, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.2064435408195183, | |
| "grad_norm": 0.506572425365448, | |
| "learning_rate": 9.07563025210084e-05, | |
| "loss": 0.2794, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.20706912730685018, | |
| "grad_norm": 1.7139545679092407, | |
| "learning_rate": 9.04201680672269e-05, | |
| "loss": 0.5984, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.20769471379418206, | |
| "grad_norm": 0.5540574789047241, | |
| "learning_rate": 9.008403361344539e-05, | |
| "loss": 0.323, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.2083203002815139, | |
| "grad_norm": 0.6909454464912415, | |
| "learning_rate": 8.974789915966387e-05, | |
| "loss": 0.5399, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.2089458867688458, | |
| "grad_norm": 0.7409022450447083, | |
| "learning_rate": 8.941176470588236e-05, | |
| "loss": 0.4251, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.20957147325617767, | |
| "grad_norm": 0.6636312007904053, | |
| "learning_rate": 8.907563025210084e-05, | |
| "loss": 0.4021, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.21019705974350955, | |
| "grad_norm": 0.5426271557807922, | |
| "learning_rate": 8.873949579831933e-05, | |
| "loss": 0.2095, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.2108226462308414, | |
| "grad_norm": 0.8870647549629211, | |
| "learning_rate": 8.840336134453782e-05, | |
| "loss": 0.5773, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.21144823271817328, | |
| "grad_norm": 0.5508524179458618, | |
| "learning_rate": 8.80672268907563e-05, | |
| "loss": 0.6744, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.21207381920550517, | |
| "grad_norm": 1.6577738523483276, | |
| "learning_rate": 8.77310924369748e-05, | |
| "loss": 1.1134, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.21269940569283705, | |
| "grad_norm": 3.218395233154297, | |
| "learning_rate": 8.739495798319329e-05, | |
| "loss": 0.5932, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.2133249921801689, | |
| "grad_norm": 0.5119672417640686, | |
| "learning_rate": 8.705882352941177e-05, | |
| "loss": 0.1831, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.21395057866750078, | |
| "grad_norm": 0.4874535799026489, | |
| "learning_rate": 8.672268907563026e-05, | |
| "loss": 0.485, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.21457616515483266, | |
| "grad_norm": 0.6597093939781189, | |
| "learning_rate": 8.638655462184874e-05, | |
| "loss": 0.3588, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.21520175164216454, | |
| "grad_norm": 1.1764620542526245, | |
| "learning_rate": 8.605042016806724e-05, | |
| "loss": 1.8765, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.2158273381294964, | |
| "grad_norm": 0.6894935369491577, | |
| "learning_rate": 8.571428571428571e-05, | |
| "loss": 0.5355, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.21645292461682827, | |
| "grad_norm": 0.5896294116973877, | |
| "learning_rate": 8.53781512605042e-05, | |
| "loss": 0.494, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.21707851110416015, | |
| "grad_norm": 0.6212694048881531, | |
| "learning_rate": 8.50420168067227e-05, | |
| "loss": 0.5721, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.21770409759149204, | |
| "grad_norm": 0.5058571100234985, | |
| "learning_rate": 8.470588235294118e-05, | |
| "loss": 0.5051, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.2183296840788239, | |
| "grad_norm": 0.5089401006698608, | |
| "learning_rate": 8.436974789915967e-05, | |
| "loss": 0.3794, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.21895527056615577, | |
| "grad_norm": 6.416032314300537, | |
| "learning_rate": 8.403361344537815e-05, | |
| "loss": 0.5026, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.21895527056615577, | |
| "eval_loss": 0.7647964954376221, | |
| "eval_runtime": 43.4854, | |
| "eval_samples_per_second": 5.887, | |
| "eval_steps_per_second": 2.944, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.21958085705348765, | |
| "grad_norm": 0.8862031698226929, | |
| "learning_rate": 8.369747899159664e-05, | |
| "loss": 0.5003, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.22020644354081953, | |
| "grad_norm": 1.3196977376937866, | |
| "learning_rate": 8.336134453781514e-05, | |
| "loss": 0.4573, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.22083203002815138, | |
| "grad_norm": 0.4763634204864502, | |
| "learning_rate": 8.302521008403362e-05, | |
| "loss": 0.3987, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.22145761651548326, | |
| "grad_norm": 0.45634883642196655, | |
| "learning_rate": 8.26890756302521e-05, | |
| "loss": 0.2064, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.22208320300281514, | |
| "grad_norm": 0.443393737077713, | |
| "learning_rate": 8.23529411764706e-05, | |
| "loss": 0.2308, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.22270878949014702, | |
| "grad_norm": 1.135941505432129, | |
| "learning_rate": 8.201680672268908e-05, | |
| "loss": 0.8731, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.22333437597747888, | |
| "grad_norm": 0.6853610873222351, | |
| "learning_rate": 8.168067226890757e-05, | |
| "loss": 0.5563, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.22395996246481076, | |
| "grad_norm": 0.6356902718544006, | |
| "learning_rate": 8.134453781512605e-05, | |
| "loss": 0.4265, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.22458554895214264, | |
| "grad_norm": 0.6331340074539185, | |
| "learning_rate": 8.100840336134454e-05, | |
| "loss": 0.3293, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.22521113543947452, | |
| "grad_norm": 0.8068905472755432, | |
| "learning_rate": 8.067226890756304e-05, | |
| "loss": 1.2136, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.22583672192680637, | |
| "grad_norm": 0.6827020049095154, | |
| "learning_rate": 8.033613445378152e-05, | |
| "loss": 0.8519, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.22646230841413825, | |
| "grad_norm": 0.829730749130249, | |
| "learning_rate": 8e-05, | |
| "loss": 0.6237, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.22708789490147013, | |
| "grad_norm": 0.5221096873283386, | |
| "learning_rate": 7.966386554621849e-05, | |
| "loss": 0.3099, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.227713481388802, | |
| "grad_norm": 0.6234191060066223, | |
| "learning_rate": 7.932773109243698e-05, | |
| "loss": 0.9356, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.22833906787613387, | |
| "grad_norm": 0.5766564607620239, | |
| "learning_rate": 7.899159663865546e-05, | |
| "loss": 0.2529, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.22896465436346575, | |
| "grad_norm": 0.758171558380127, | |
| "learning_rate": 7.865546218487395e-05, | |
| "loss": 0.7577, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.22959024085079763, | |
| "grad_norm": 0.6313957571983337, | |
| "learning_rate": 7.831932773109243e-05, | |
| "loss": 0.6219, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.2302158273381295, | |
| "grad_norm": 0.7843011617660522, | |
| "learning_rate": 7.798319327731093e-05, | |
| "loss": 0.6617, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.23084141382546136, | |
| "grad_norm": 0.9671229124069214, | |
| "learning_rate": 7.764705882352942e-05, | |
| "loss": 0.7156, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.23146700031279324, | |
| "grad_norm": 0.663546085357666, | |
| "learning_rate": 7.73109243697479e-05, | |
| "loss": 0.4462, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.23209258680012512, | |
| "grad_norm": 0.6233255863189697, | |
| "learning_rate": 7.697478991596639e-05, | |
| "loss": 0.5534, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.232718173287457, | |
| "grad_norm": 2.0895440578460693, | |
| "learning_rate": 7.663865546218489e-05, | |
| "loss": 0.7289, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.23334375977478886, | |
| "grad_norm": 0.6122156381607056, | |
| "learning_rate": 7.630252100840336e-05, | |
| "loss": 0.3891, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.23396934626212074, | |
| "grad_norm": 0.5940058827400208, | |
| "learning_rate": 7.596638655462185e-05, | |
| "loss": 0.3038, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.23459493274945262, | |
| "grad_norm": 0.35755977034568787, | |
| "learning_rate": 7.563025210084033e-05, | |
| "loss": 0.2848, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.23459493274945262, | |
| "eval_loss": 0.7521110773086548, | |
| "eval_runtime": 43.5299, | |
| "eval_samples_per_second": 5.881, | |
| "eval_steps_per_second": 2.941, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.2352205192367845, | |
| "grad_norm": 0.8450719118118286, | |
| "learning_rate": 7.529411764705883e-05, | |
| "loss": 0.5348, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.23584610572411635, | |
| "grad_norm": 0.9100202918052673, | |
| "learning_rate": 7.495798319327732e-05, | |
| "loss": 0.8178, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.23647169221144823, | |
| "grad_norm": 0.5748711228370667, | |
| "learning_rate": 7.46218487394958e-05, | |
| "loss": 0.778, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.2370972786987801, | |
| "grad_norm": 0.5675060153007507, | |
| "learning_rate": 7.428571428571429e-05, | |
| "loss": 0.8042, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.237722865186112, | |
| "grad_norm": 6.2747392654418945, | |
| "learning_rate": 7.394957983193279e-05, | |
| "loss": 1.4173, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.23834845167344385, | |
| "grad_norm": 0.6252509355545044, | |
| "learning_rate": 7.361344537815127e-05, | |
| "loss": 0.5095, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.23897403816077573, | |
| "grad_norm": 1.0525410175323486, | |
| "learning_rate": 7.327731092436974e-05, | |
| "loss": 1.1774, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.2395996246481076, | |
| "grad_norm": 0.505670428276062, | |
| "learning_rate": 7.294117647058823e-05, | |
| "loss": 0.7603, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.2402252111354395, | |
| "grad_norm": 0.5476568341255188, | |
| "learning_rate": 7.260504201680673e-05, | |
| "loss": 0.3354, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.24085079762277134, | |
| "grad_norm": 0.687854528427124, | |
| "learning_rate": 7.226890756302521e-05, | |
| "loss": 0.6306, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.24147638411010322, | |
| "grad_norm": 1.3373991250991821, | |
| "learning_rate": 7.19327731092437e-05, | |
| "loss": 0.5736, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.2421019705974351, | |
| "grad_norm": 0.5465985536575317, | |
| "learning_rate": 7.159663865546218e-05, | |
| "loss": 0.2859, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.24272755708476698, | |
| "grad_norm": 0.6637946963310242, | |
| "learning_rate": 7.126050420168068e-05, | |
| "loss": 0.5948, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.24335314357209883, | |
| "grad_norm": 0.637915313243866, | |
| "learning_rate": 7.092436974789917e-05, | |
| "loss": 0.3947, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.24397873005943072, | |
| "grad_norm": 0.8073198795318604, | |
| "learning_rate": 7.058823529411765e-05, | |
| "loss": 0.5006, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.2446043165467626, | |
| "grad_norm": 0.7423315644264221, | |
| "learning_rate": 7.025210084033613e-05, | |
| "loss": 0.5209, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.24522990303409448, | |
| "grad_norm": 0.6418082118034363, | |
| "learning_rate": 6.991596638655463e-05, | |
| "loss": 0.3793, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.24585548952142633, | |
| "grad_norm": 1.072240948677063, | |
| "learning_rate": 6.957983193277311e-05, | |
| "loss": 0.3656, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.2464810760087582, | |
| "grad_norm": 1.3167545795440674, | |
| "learning_rate": 6.92436974789916e-05, | |
| "loss": 0.9145, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.2471066624960901, | |
| "grad_norm": 0.6734040379524231, | |
| "learning_rate": 6.890756302521008e-05, | |
| "loss": 0.4063, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.24773224898342197, | |
| "grad_norm": 0.48195910453796387, | |
| "learning_rate": 6.857142857142858e-05, | |
| "loss": 0.4146, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.24835783547075382, | |
| "grad_norm": 1.2620956897735596, | |
| "learning_rate": 6.823529411764707e-05, | |
| "loss": 0.4289, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.2489834219580857, | |
| "grad_norm": 0.6438835859298706, | |
| "learning_rate": 6.789915966386555e-05, | |
| "loss": 0.6204, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.24960900844541759, | |
| "grad_norm": 1.6006457805633545, | |
| "learning_rate": 6.756302521008404e-05, | |
| "loss": 0.8828, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.25023459493274947, | |
| "grad_norm": 3.7350921630859375, | |
| "learning_rate": 6.722689075630254e-05, | |
| "loss": 0.5608, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.25023459493274947, | |
| "eval_loss": 0.7404398322105408, | |
| "eval_runtime": 43.5267, | |
| "eval_samples_per_second": 5.881, | |
| "eval_steps_per_second": 2.941, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.2508601814200813, | |
| "grad_norm": 0.776977002620697, | |
| "learning_rate": 6.689075630252101e-05, | |
| "loss": 0.709, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.2514857679074132, | |
| "grad_norm": 0.547192394733429, | |
| "learning_rate": 6.65546218487395e-05, | |
| "loss": 0.2832, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.2521113543947451, | |
| "grad_norm": 1.2148370742797852, | |
| "learning_rate": 6.621848739495798e-05, | |
| "loss": 0.9248, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.25273694088207693, | |
| "grad_norm": 0.5215961337089539, | |
| "learning_rate": 6.588235294117648e-05, | |
| "loss": 0.5407, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.25336252736940884, | |
| "grad_norm": 0.32982224225997925, | |
| "learning_rate": 6.554621848739496e-05, | |
| "loss": 0.1891, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.2539881138567407, | |
| "grad_norm": 0.707619309425354, | |
| "learning_rate": 6.521008403361345e-05, | |
| "loss": 0.6929, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.25461370034407255, | |
| "grad_norm": 1.87132728099823, | |
| "learning_rate": 6.487394957983193e-05, | |
| "loss": 0.6022, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.25523928683140445, | |
| "grad_norm": 0.5033402442932129, | |
| "learning_rate": 6.453781512605043e-05, | |
| "loss": 0.2833, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.2558648733187363, | |
| "grad_norm": 1.0010263919830322, | |
| "learning_rate": 6.420168067226892e-05, | |
| "loss": 0.2809, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.2564904598060682, | |
| "grad_norm": 0.9624127745628357, | |
| "learning_rate": 6.386554621848739e-05, | |
| "loss": 0.5303, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.25711604629340007, | |
| "grad_norm": 1.2495983839035034, | |
| "learning_rate": 6.352941176470588e-05, | |
| "loss": 0.5994, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.2577416327807319, | |
| "grad_norm": 0.7493329048156738, | |
| "learning_rate": 6.319327731092438e-05, | |
| "loss": 0.4795, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.25836721926806383, | |
| "grad_norm": 1.0783026218414307, | |
| "learning_rate": 6.285714285714286e-05, | |
| "loss": 1.13, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.2589928057553957, | |
| "grad_norm": 0.6462905406951904, | |
| "learning_rate": 6.252100840336135e-05, | |
| "loss": 0.4111, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.25961839224272754, | |
| "grad_norm": 0.4357486665248871, | |
| "learning_rate": 6.218487394957983e-05, | |
| "loss": 0.2122, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.26024397873005944, | |
| "grad_norm": 0.42553481459617615, | |
| "learning_rate": 6.184873949579833e-05, | |
| "loss": 0.2509, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.2608695652173913, | |
| "grad_norm": 0.8176494836807251, | |
| "learning_rate": 6.151260504201682e-05, | |
| "loss": 1.157, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.2614951517047232, | |
| "grad_norm": 0.527748703956604, | |
| "learning_rate": 6.11764705882353e-05, | |
| "loss": 0.3804, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.26212073819205506, | |
| "grad_norm": 0.9033327102661133, | |
| "learning_rate": 6.084033613445378e-05, | |
| "loss": 0.6555, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.2627463246793869, | |
| "grad_norm": 0.7106732130050659, | |
| "learning_rate": 6.0504201680672267e-05, | |
| "loss": 0.4358, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.2633719111667188, | |
| "grad_norm": 1.1655712127685547, | |
| "learning_rate": 6.016806722689076e-05, | |
| "loss": 0.5233, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.2639974976540507, | |
| "grad_norm": 0.7053611874580383, | |
| "learning_rate": 5.9831932773109244e-05, | |
| "loss": 0.428, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.2646230841413825, | |
| "grad_norm": 0.7588666677474976, | |
| "learning_rate": 5.9495798319327737e-05, | |
| "loss": 0.5904, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.26524867062871443, | |
| "grad_norm": 0.6778993010520935, | |
| "learning_rate": 5.915966386554622e-05, | |
| "loss": 0.7834, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.2658742571160463, | |
| "grad_norm": 0.5685262084007263, | |
| "learning_rate": 5.882352941176471e-05, | |
| "loss": 0.7958, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.2658742571160463, | |
| "eval_loss": 0.7321073412895203, | |
| "eval_runtime": 43.5595, | |
| "eval_samples_per_second": 5.877, | |
| "eval_steps_per_second": 2.939, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.2664998436033782, | |
| "grad_norm": 0.41137516498565674, | |
| "learning_rate": 5.84873949579832e-05, | |
| "loss": 0.255, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.26712543009071005, | |
| "grad_norm": 0.48806631565093994, | |
| "learning_rate": 5.8151260504201685e-05, | |
| "loss": 0.2586, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.2677510165780419, | |
| "grad_norm": 0.877154529094696, | |
| "learning_rate": 5.781512605042018e-05, | |
| "loss": 0.5605, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.2683766030653738, | |
| "grad_norm": 1.1426063776016235, | |
| "learning_rate": 5.7478991596638656e-05, | |
| "loss": 0.6767, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.26900218955270566, | |
| "grad_norm": 0.7325838208198547, | |
| "learning_rate": 5.714285714285714e-05, | |
| "loss": 0.4384, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.2696277760400375, | |
| "grad_norm": 0.815000593662262, | |
| "learning_rate": 5.6806722689075634e-05, | |
| "loss": 0.5198, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.2702533625273694, | |
| "grad_norm": 0.582699716091156, | |
| "learning_rate": 5.647058823529412e-05, | |
| "loss": 0.5345, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.2708789490147013, | |
| "grad_norm": 0.6257805228233337, | |
| "learning_rate": 5.6134453781512605e-05, | |
| "loss": 0.4172, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.27150453550203313, | |
| "grad_norm": 0.8166823983192444, | |
| "learning_rate": 5.57983193277311e-05, | |
| "loss": 0.7274, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.27213012198936504, | |
| "grad_norm": 0.6732988953590393, | |
| "learning_rate": 5.546218487394958e-05, | |
| "loss": 0.781, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.2727557084766969, | |
| "grad_norm": 0.6230109930038452, | |
| "learning_rate": 5.5126050420168075e-05, | |
| "loss": 0.6356, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.2733812949640288, | |
| "grad_norm": 0.6590014696121216, | |
| "learning_rate": 5.478991596638656e-05, | |
| "loss": 0.9665, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.27400688145136065, | |
| "grad_norm": 0.3651019036769867, | |
| "learning_rate": 5.445378151260504e-05, | |
| "loss": 0.3858, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.2746324679386925, | |
| "grad_norm": 0.6834749579429626, | |
| "learning_rate": 5.411764705882353e-05, | |
| "loss": 0.5685, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.2752580544260244, | |
| "grad_norm": 0.46671655774116516, | |
| "learning_rate": 5.378151260504202e-05, | |
| "loss": 0.4953, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.27588364091335627, | |
| "grad_norm": 0.6245185732841492, | |
| "learning_rate": 5.34453781512605e-05, | |
| "loss": 0.7836, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.2765092274006881, | |
| "grad_norm": 0.5942935943603516, | |
| "learning_rate": 5.3109243697478995e-05, | |
| "loss": 0.3441, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.27713481388802, | |
| "grad_norm": 0.7539409399032593, | |
| "learning_rate": 5.277310924369748e-05, | |
| "loss": 0.7533, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.2777604003753519, | |
| "grad_norm": 0.40587514638900757, | |
| "learning_rate": 5.243697478991597e-05, | |
| "loss": 0.3298, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.2783859868626838, | |
| "grad_norm": 0.5237724184989929, | |
| "learning_rate": 5.210084033613446e-05, | |
| "loss": 0.65, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.27901157335001564, | |
| "grad_norm": 0.6571043133735657, | |
| "learning_rate": 5.176470588235295e-05, | |
| "loss": 0.5741, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.2796371598373475, | |
| "grad_norm": 0.4717683792114258, | |
| "learning_rate": 5.142857142857143e-05, | |
| "loss": 0.3073, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.2802627463246794, | |
| "grad_norm": 0.4331720173358917, | |
| "learning_rate": 5.1092436974789914e-05, | |
| "loss": 0.401, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.28088833281201125, | |
| "grad_norm": 0.6984372138977051, | |
| "learning_rate": 5.07563025210084e-05, | |
| "loss": 0.5573, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.2815139192993431, | |
| "grad_norm": 0.556936502456665, | |
| "learning_rate": 5.042016806722689e-05, | |
| "loss": 0.4267, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.2815139192993431, | |
| "eval_loss": 0.7238953709602356, | |
| "eval_runtime": 43.5441, | |
| "eval_samples_per_second": 5.879, | |
| "eval_steps_per_second": 2.94, | |
| "step": 450 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 600, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.838116962892513e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |