| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 3125, |
| "global_step": 9375, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0, |
| "grad_norm": 126.07254791259766, |
| "learning_rate": 5.330490405117271e-09, |
| "loss": 5.5721, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 50.644527435302734, |
| "learning_rate": 1.7057569296375268e-07, |
| "loss": 4.7319, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 50.18523406982422, |
| "learning_rate": 3.4115138592750537e-07, |
| "loss": 4.2388, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 22.55028533935547, |
| "learning_rate": 5.11727078891258e-07, |
| "loss": 4.1004, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 17.249303817749023, |
| "learning_rate": 6.823027718550107e-07, |
| "loss": 3.8056, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 27.913209915161133, |
| "learning_rate": 8.528784648187634e-07, |
| "loss": 3.7559, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 31.13373374938965, |
| "learning_rate": 1.023454157782516e-06, |
| "loss": 3.5802, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 51.599185943603516, |
| "learning_rate": 1.1940298507462686e-06, |
| "loss": 3.4202, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 26.857385635375977, |
| "learning_rate": 1.3646055437100215e-06, |
| "loss": 3.1577, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 36.01700210571289, |
| "learning_rate": 1.5351812366737743e-06, |
| "loss": 2.9155, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 48.35042953491211, |
| "learning_rate": 1.7057569296375267e-06, |
| "loss": 2.4688, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 32.770713806152344, |
| "learning_rate": 1.8763326226012796e-06, |
| "loss": 2.0288, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 28.91162872314453, |
| "learning_rate": 2.046908315565032e-06, |
| "loss": 1.713, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 28.609853744506836, |
| "learning_rate": 2.217484008528785e-06, |
| "loss": 1.2517, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 23.230613708496094, |
| "learning_rate": 2.3880597014925373e-06, |
| "loss": 0.8838, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 20.77098846435547, |
| "learning_rate": 2.55863539445629e-06, |
| "loss": 0.6982, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 8.70417594909668, |
| "learning_rate": 2.729211087420043e-06, |
| "loss": 0.5794, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 6.534135341644287, |
| "learning_rate": 2.8997867803837954e-06, |
| "loss": 0.5319, |
| "step": 544 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 4.264636039733887, |
| "learning_rate": 3.0703624733475486e-06, |
| "loss": 0.5251, |
| "step": 576 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 4.235575199127197, |
| "learning_rate": 3.240938166311301e-06, |
| "loss": 0.5002, |
| "step": 608 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 3.1073174476623535, |
| "learning_rate": 3.4115138592750535e-06, |
| "loss": 0.4929, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 3.024670362472534, |
| "learning_rate": 3.582089552238806e-06, |
| "loss": 0.5191, |
| "step": 672 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 2.9338278770446777, |
| "learning_rate": 3.752665245202559e-06, |
| "loss": 0.4784, |
| "step": 704 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 2.67437481880188, |
| "learning_rate": 3.9232409381663116e-06, |
| "loss": 0.4685, |
| "step": 736 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 2.5493340492248535, |
| "learning_rate": 4.093816631130064e-06, |
| "loss": 0.4812, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 2.4631426334381104, |
| "learning_rate": 4.264392324093816e-06, |
| "loss": 0.4695, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 2.305046319961548, |
| "learning_rate": 4.43496801705757e-06, |
| "loss": 0.4799, |
| "step": 832 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 2.070432186126709, |
| "learning_rate": 4.605543710021322e-06, |
| "loss": 0.4713, |
| "step": 864 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 2.4328596591949463, |
| "learning_rate": 4.7761194029850745e-06, |
| "loss": 0.4581, |
| "step": 896 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 1.923413872718811, |
| "learning_rate": 4.946695095948828e-06, |
| "loss": 0.4723, |
| "step": 928 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 2.141838312149048, |
| "learning_rate": 4.999916116490299e-06, |
| "loss": 0.454, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 1.8363310098648071, |
| "learning_rate": 4.999494633386398e-06, |
| "loss": 0.4541, |
| "step": 992 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 1.8952796459197998, |
| "learning_rate": 4.998718279148715e-06, |
| "loss": 0.4404, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 2.261046886444092, |
| "learning_rate": 4.997587164001815e-06, |
| "loss": 0.4578, |
| "step": 1056 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 1.8481628894805908, |
| "learning_rate": 4.996101448538208e-06, |
| "loss": 0.4567, |
| "step": 1088 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 1.8773612976074219, |
| "learning_rate": 4.994261343695546e-06, |
| "loss": 0.4448, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 2.0779101848602295, |
| "learning_rate": 4.992067110726676e-06, |
| "loss": 0.4654, |
| "step": 1152 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 1.975841999053955, |
| "learning_rate": 4.989519061162551e-06, |
| "loss": 0.451, |
| "step": 1184 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 2.04510498046875, |
| "learning_rate": 4.986617556767996e-06, |
| "loss": 0.4494, |
| "step": 1216 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 2.2312591075897217, |
| "learning_rate": 4.983363009490345e-06, |
| "loss": 0.4491, |
| "step": 1248 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 2.15928053855896, |
| "learning_rate": 4.979755881400958e-06, |
| "loss": 0.4469, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 1.8935317993164062, |
| "learning_rate": 4.975796684629615e-06, |
| "loss": 0.4407, |
| "step": 1312 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 1.8919423818588257, |
| "learning_rate": 4.9714859812918025e-06, |
| "loss": 0.4378, |
| "step": 1344 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 1.8667327165603638, |
| "learning_rate": 4.966824383408912e-06, |
| "loss": 0.4461, |
| "step": 1376 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 1.7683241367340088, |
| "learning_rate": 4.961812552821344e-06, |
| "loss": 0.4673, |
| "step": 1408 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 2.1038663387298584, |
| "learning_rate": 4.9564512010945376e-06, |
| "loss": 0.445, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 1.9102988243103027, |
| "learning_rate": 4.950741089417953e-06, |
| "loss": 0.4387, |
| "step": 1472 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 1.906335473060608, |
| "learning_rate": 4.9446830284969925e-06, |
| "loss": 0.4451, |
| "step": 1504 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 1.8324521780014038, |
| "learning_rate": 4.9382778784379036e-06, |
| "loss": 0.4239, |
| "step": 1536 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 1.6944546699523926, |
| "learning_rate": 4.93152654862566e-06, |
| "loss": 0.4323, |
| "step": 1568 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 1.6708574295043945, |
| "learning_rate": 4.924429997594853e-06, |
| "loss": 0.4358, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 1.7164026498794556, |
| "learning_rate": 4.916989232893599e-06, |
| "loss": 0.4464, |
| "step": 1632 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 1.9400017261505127, |
| "learning_rate": 4.9092053109404915e-06, |
| "loss": 0.4439, |
| "step": 1664 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 2.053682804107666, |
| "learning_rate": 4.901079336874613e-06, |
| "loss": 0.4232, |
| "step": 1696 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 1.9536739587783813, |
| "learning_rate": 4.892612464398635e-06, |
| "loss": 0.4548, |
| "step": 1728 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 1.9369513988494873, |
| "learning_rate": 4.883805895615012e-06, |
| "loss": 0.4334, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 1.754228115081787, |
| "learning_rate": 4.874660880855312e-06, |
| "loss": 0.4525, |
| "step": 1792 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 2.147170305252075, |
| "learning_rate": 4.865178718502702e-06, |
| "loss": 0.4281, |
| "step": 1824 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 1.8635765314102173, |
| "learning_rate": 4.855360754807605e-06, |
| "loss": 0.4354, |
| "step": 1856 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 1.63584303855896, |
| "learning_rate": 4.845208383696562e-06, |
| "loss": 0.4423, |
| "step": 1888 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 1.874233365058899, |
| "learning_rate": 4.834723046574325e-06, |
| "loss": 0.4265, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.62, |
| "grad_norm": 2.0124189853668213, |
| "learning_rate": 4.823906232119217e-06, |
| "loss": 0.4336, |
| "step": 1952 |
| }, |
| { |
| "epoch": 0.63, |
| "grad_norm": 1.8331832885742188, |
| "learning_rate": 4.812759476071763e-06, |
| "loss": 0.4129, |
| "step": 1984 |
| }, |
| { |
| "epoch": 0.65, |
| "grad_norm": 2.133513927459717, |
| "learning_rate": 4.801284361016662e-06, |
| "loss": 0.4443, |
| "step": 2016 |
| }, |
| { |
| "epoch": 0.66, |
| "grad_norm": 1.4341892004013062, |
| "learning_rate": 4.7894825161580895e-06, |
| "loss": 0.4205, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.67, |
| "grad_norm": 2.1529791355133057, |
| "learning_rate": 4.777355617088385e-06, |
| "loss": 0.4233, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 1.9409756660461426, |
| "learning_rate": 4.764905385550162e-06, |
| "loss": 0.4468, |
| "step": 2112 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 2.3252675533294678, |
| "learning_rate": 4.752133589191858e-06, |
| "loss": 0.439, |
| "step": 2144 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 2.023167133331299, |
| "learning_rate": 4.739042041316768e-06, |
| "loss": 0.4336, |
| "step": 2176 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 1.8779696226119995, |
| "learning_rate": 4.725632600625596e-06, |
| "loss": 0.4262, |
| "step": 2208 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 1.8494573831558228, |
| "learning_rate": 4.711907170952566e-06, |
| "loss": 0.4192, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 2.6154637336730957, |
| "learning_rate": 4.697867700995114e-06, |
| "loss": 0.4195, |
| "step": 2272 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 1.9525200128555298, |
| "learning_rate": 4.6835161840372275e-06, |
| "loss": 0.4277, |
| "step": 2304 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 2.0012083053588867, |
| "learning_rate": 4.668854657666433e-06, |
| "loss": 0.4279, |
| "step": 2336 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 1.5427674055099487, |
| "learning_rate": 4.653885203484516e-06, |
| "loss": 0.4291, |
| "step": 2368 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 1.8067190647125244, |
| "learning_rate": 4.638609946811972e-06, |
| "loss": 0.4493, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 1.7905209064483643, |
| "learning_rate": 4.623031056386266e-06, |
| "loss": 0.4218, |
| "step": 2432 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 2.0493154525756836, |
| "learning_rate": 4.60715074405392e-06, |
| "loss": 0.4241, |
| "step": 2464 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 1.927325963973999, |
| "learning_rate": 4.5909712644564785e-06, |
| "loss": 0.4361, |
| "step": 2496 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 2.2927017211914062, |
| "learning_rate": 4.574494914710402e-06, |
| "loss": 0.4257, |
| "step": 2528 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 1.3847366571426392, |
| "learning_rate": 4.557724034080933e-06, |
| "loss": 0.4221, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 1.9423835277557373, |
| "learning_rate": 4.540661003649969e-06, |
| "loss": 0.4194, |
| "step": 2592 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 1.7483317852020264, |
| "learning_rate": 4.523308245978002e-06, |
| "loss": 0.4231, |
| "step": 2624 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 1.8706562519073486, |
| "learning_rate": 4.505668224760177e-06, |
| "loss": 0.4361, |
| "step": 2656 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 1.5217187404632568, |
| "learning_rate": 4.487743444476497e-06, |
| "loss": 0.4293, |
| "step": 2688 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 1.5683813095092773, |
| "learning_rate": 4.4695364500362505e-06, |
| "loss": 0.4191, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 1.9017343521118164, |
| "learning_rate": 4.451049826416682e-06, |
| "loss": 0.425, |
| "step": 2752 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 1.5698572397232056, |
| "learning_rate": 4.432286198295998e-06, |
| "loss": 0.4189, |
| "step": 2784 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 1.894131064414978, |
| "learning_rate": 4.41324822968071e-06, |
| "loss": 0.4193, |
| "step": 2816 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 1.7698094844818115, |
| "learning_rate": 4.393938623527417e-06, |
| "loss": 0.4211, |
| "step": 2848 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 1.4975591897964478, |
| "learning_rate": 4.374360121359038e-06, |
| "loss": 0.4104, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 1.7062549591064453, |
| "learning_rate": 4.3545155028755865e-06, |
| "loss": 0.4363, |
| "step": 2912 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 1.5995006561279297, |
| "learning_rate": 4.33440758555951e-06, |
| "loss": 0.4153, |
| "step": 2944 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 2.1322031021118164, |
| "learning_rate": 4.3140392242756776e-06, |
| "loss": 0.4211, |
| "step": 2976 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 1.7106413841247559, |
| "learning_rate": 4.293413310866049e-06, |
| "loss": 0.4285, |
| "step": 3008 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 1.458028793334961, |
| "learning_rate": 4.272532773739104e-06, |
| "loss": 0.4393, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 1.8825474977493286, |
| "learning_rate": 4.251400577454071e-06, |
| "loss": 0.4252, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 1.8799755573272705, |
| "learning_rate": 4.230019722300031e-06, |
| "loss": 0.4077, |
| "step": 3104 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 2.2173070907592773, |
| "learning_rate": 4.208393243869944e-06, |
| "loss": 0.3926, |
| "step": 3136 |
| }, |
| { |
| "epoch": 1.01, |
| "grad_norm": 2.1596415042877197, |
| "learning_rate": 4.1865242126296595e-06, |
| "loss": 0.3684, |
| "step": 3168 |
| }, |
| { |
| "epoch": 1.02, |
| "grad_norm": 1.7839455604553223, |
| "learning_rate": 4.16441573348199e-06, |
| "loss": 0.3653, |
| "step": 3200 |
| }, |
| { |
| "epoch": 1.03, |
| "grad_norm": 2.041905403137207, |
| "learning_rate": 4.142070945325877e-06, |
| "loss": 0.3662, |
| "step": 3232 |
| }, |
| { |
| "epoch": 1.04, |
| "grad_norm": 1.9254001379013062, |
| "learning_rate": 4.119493020610743e-06, |
| "loss": 0.3666, |
| "step": 3264 |
| }, |
| { |
| "epoch": 1.05, |
| "grad_norm": 1.974266767501831, |
| "learning_rate": 4.096685164886077e-06, |
| "loss": 0.3741, |
| "step": 3296 |
| }, |
| { |
| "epoch": 1.06, |
| "grad_norm": 1.8142021894454956, |
| "learning_rate": 4.073650616346317e-06, |
| "loss": 0.3611, |
| "step": 3328 |
| }, |
| { |
| "epoch": 1.08, |
| "grad_norm": 2.006136417388916, |
| "learning_rate": 4.050392645371101e-06, |
| "loss": 0.3704, |
| "step": 3360 |
| }, |
| { |
| "epoch": 1.09, |
| "grad_norm": 1.5683622360229492, |
| "learning_rate": 4.02691455406095e-06, |
| "loss": 0.3533, |
| "step": 3392 |
| }, |
| { |
| "epoch": 1.1, |
| "grad_norm": 2.074063777923584, |
| "learning_rate": 4.003219675768442e-06, |
| "loss": 0.373, |
| "step": 3424 |
| }, |
| { |
| "epoch": 1.11, |
| "grad_norm": 1.7841862440109253, |
| "learning_rate": 3.9793113746249554e-06, |
| "loss": 0.3542, |
| "step": 3456 |
| }, |
| { |
| "epoch": 1.12, |
| "grad_norm": 1.8422183990478516, |
| "learning_rate": 3.955193045063038e-06, |
| "loss": 0.356, |
| "step": 3488 |
| }, |
| { |
| "epoch": 1.13, |
| "grad_norm": 1.7018710374832153, |
| "learning_rate": 3.930868111334471e-06, |
| "loss": 0.3623, |
| "step": 3520 |
| }, |
| { |
| "epoch": 1.14, |
| "grad_norm": 1.9030427932739258, |
| "learning_rate": 3.9063400270241114e-06, |
| "loss": 0.375, |
| "step": 3552 |
| }, |
| { |
| "epoch": 1.15, |
| "grad_norm": 2.2064173221588135, |
| "learning_rate": 3.8816122745595556e-06, |
| "loss": 0.3621, |
| "step": 3584 |
| }, |
| { |
| "epoch": 1.16, |
| "grad_norm": 1.7391672134399414, |
| "learning_rate": 3.856688364716715e-06, |
| "loss": 0.3675, |
| "step": 3616 |
| }, |
| { |
| "epoch": 1.17, |
| "grad_norm": 2.20231032371521, |
| "learning_rate": 3.8315718361213694e-06, |
| "loss": 0.3637, |
| "step": 3648 |
| }, |
| { |
| "epoch": 1.18, |
| "grad_norm": 1.6645841598510742, |
| "learning_rate": 3.8062662547467604e-06, |
| "loss": 0.355, |
| "step": 3680 |
| }, |
| { |
| "epoch": 1.19, |
| "grad_norm": 1.9098048210144043, |
| "learning_rate": 3.780775213407305e-06, |
| "loss": 0.3672, |
| "step": 3712 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 1.7326850891113281, |
| "learning_rate": 3.755102331248497e-06, |
| "loss": 0.3653, |
| "step": 3744 |
| }, |
| { |
| "epoch": 1.21, |
| "grad_norm": 1.9420820474624634, |
| "learning_rate": 3.729251253233073e-06, |
| "loss": 0.3704, |
| "step": 3776 |
| }, |
| { |
| "epoch": 1.22, |
| "grad_norm": 1.9405999183654785, |
| "learning_rate": 3.703225649623511e-06, |
| "loss": 0.3897, |
| "step": 3808 |
| }, |
| { |
| "epoch": 1.23, |
| "grad_norm": 2.333876371383667, |
| "learning_rate": 3.677029215460935e-06, |
| "loss": 0.3575, |
| "step": 3840 |
| }, |
| { |
| "epoch": 1.24, |
| "grad_norm": 2.2306437492370605, |
| "learning_rate": 3.6506656700405045e-06, |
| "loss": 0.3566, |
| "step": 3872 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 2.2080612182617188, |
| "learning_rate": 3.624138756383361e-06, |
| "loss": 0.3779, |
| "step": 3904 |
| }, |
| { |
| "epoch": 1.26, |
| "grad_norm": 1.5209364891052246, |
| "learning_rate": 3.5974522407052013e-06, |
| "loss": 0.3659, |
| "step": 3936 |
| }, |
| { |
| "epoch": 1.27, |
| "grad_norm": 1.905605435371399, |
| "learning_rate": 3.570609911881566e-06, |
| "loss": 0.3704, |
| "step": 3968 |
| }, |
| { |
| "epoch": 1.28, |
| "grad_norm": 1.9363412857055664, |
| "learning_rate": 3.543615580909898e-06, |
| "loss": 0.3752, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.29, |
| "grad_norm": 1.6711021661758423, |
| "learning_rate": 3.516473080368478e-06, |
| "loss": 0.3706, |
| "step": 4032 |
| }, |
| { |
| "epoch": 1.3, |
| "grad_norm": 2.128994941711426, |
| "learning_rate": 3.489186263872275e-06, |
| "loss": 0.3695, |
| "step": 4064 |
| }, |
| { |
| "epoch": 1.31, |
| "grad_norm": 1.7105140686035156, |
| "learning_rate": 3.461759005525831e-06, |
| "loss": 0.3701, |
| "step": 4096 |
| }, |
| { |
| "epoch": 1.32, |
| "grad_norm": 1.8307222127914429, |
| "learning_rate": 3.43419519937322e-06, |
| "loss": 0.3577, |
| "step": 4128 |
| }, |
| { |
| "epoch": 1.33, |
| "grad_norm": 1.8442600965499878, |
| "learning_rate": 3.406498758845184e-06, |
| "loss": 0.3706, |
| "step": 4160 |
| }, |
| { |
| "epoch": 1.34, |
| "grad_norm": 1.8921831846237183, |
| "learning_rate": 3.3786736162035187e-06, |
| "loss": 0.364, |
| "step": 4192 |
| }, |
| { |
| "epoch": 1.35, |
| "grad_norm": 1.7511910200119019, |
| "learning_rate": 3.3507237219827784e-06, |
| "loss": 0.3488, |
| "step": 4224 |
| }, |
| { |
| "epoch": 1.36, |
| "grad_norm": 1.3822124004364014, |
| "learning_rate": 3.3226530444293893e-06, |
| "loss": 0.3628, |
| "step": 4256 |
| }, |
| { |
| "epoch": 1.37, |
| "grad_norm": 2.302302837371826, |
| "learning_rate": 3.2944655689382554e-06, |
| "loss": 0.36, |
| "step": 4288 |
| }, |
| { |
| "epoch": 1.38, |
| "grad_norm": 1.6606963872909546, |
| "learning_rate": 3.2661652974869164e-06, |
| "loss": 0.3487, |
| "step": 4320 |
| }, |
| { |
| "epoch": 1.39, |
| "grad_norm": 2.009692668914795, |
| "learning_rate": 3.2377562480673623e-06, |
| "loss": 0.3789, |
| "step": 4352 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 2.011293649673462, |
| "learning_rate": 3.20924245411557e-06, |
| "loss": 0.3669, |
| "step": 4384 |
| }, |
| { |
| "epoch": 1.41, |
| "grad_norm": 2.3706395626068115, |
| "learning_rate": 3.180627963938847e-06, |
| "loss": 0.3698, |
| "step": 4416 |
| }, |
| { |
| "epoch": 1.42, |
| "grad_norm": 2.0138208866119385, |
| "learning_rate": 3.1519168401410627e-06, |
| "loss": 0.355, |
| "step": 4448 |
| }, |
| { |
| "epoch": 1.43, |
| "grad_norm": 1.6110373735427856, |
| "learning_rate": 3.123113159045854e-06, |
| "loss": 0.3519, |
| "step": 4480 |
| }, |
| { |
| "epoch": 1.44, |
| "grad_norm": 1.9039591550827026, |
| "learning_rate": 3.09422101011788e-06, |
| "loss": 0.3677, |
| "step": 4512 |
| }, |
| { |
| "epoch": 1.45, |
| "grad_norm": 1.9409842491149902, |
| "learning_rate": 3.0652444953822097e-06, |
| "loss": 0.3476, |
| "step": 4544 |
| }, |
| { |
| "epoch": 1.46, |
| "grad_norm": 1.5499393939971924, |
| "learning_rate": 3.0361877288419306e-06, |
| "loss": 0.3642, |
| "step": 4576 |
| }, |
| { |
| "epoch": 1.47, |
| "grad_norm": 1.5498781204223633, |
| "learning_rate": 3.0070548358940523e-06, |
| "loss": 0.3811, |
| "step": 4608 |
| }, |
| { |
| "epoch": 1.48, |
| "grad_norm": 1.8059883117675781, |
| "learning_rate": 2.9778499527437932e-06, |
| "loss": 0.3741, |
| "step": 4640 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 1.4482682943344116, |
| "learning_rate": 2.9485772258173405e-06, |
| "loss": 0.3674, |
| "step": 4672 |
| }, |
| { |
| "epoch": 1.51, |
| "grad_norm": 2.086606502532959, |
| "learning_rate": 2.919240811173143e-06, |
| "loss": 0.3621, |
| "step": 4704 |
| }, |
| { |
| "epoch": 1.52, |
| "grad_norm": 1.6022703647613525, |
| "learning_rate": 2.8898448739118533e-06, |
| "loss": 0.3651, |
| "step": 4736 |
| }, |
| { |
| "epoch": 1.53, |
| "grad_norm": 1.6419217586517334, |
| "learning_rate": 2.8603935875849744e-06, |
| "loss": 0.3709, |
| "step": 4768 |
| }, |
| { |
| "epoch": 1.54, |
| "grad_norm": 1.490946888923645, |
| "learning_rate": 2.830891133602311e-06, |
| "loss": 0.3408, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.55, |
| "grad_norm": 1.644801378250122, |
| "learning_rate": 2.8013417006383078e-06, |
| "loss": 0.3408, |
| "step": 4832 |
| }, |
| { |
| "epoch": 1.56, |
| "grad_norm": 1.872612714767456, |
| "learning_rate": 2.771749484037352e-06, |
| "loss": 0.3633, |
| "step": 4864 |
| }, |
| { |
| "epoch": 1.57, |
| "grad_norm": 2.0236546993255615, |
| "learning_rate": 2.7421186852181282e-06, |
| "loss": 0.3504, |
| "step": 4896 |
| }, |
| { |
| "epoch": 1.58, |
| "grad_norm": 1.8779181241989136, |
| "learning_rate": 2.7124535110771155e-06, |
| "loss": 0.3757, |
| "step": 4928 |
| }, |
| { |
| "epoch": 1.59, |
| "grad_norm": 1.8941439390182495, |
| "learning_rate": 2.6827581733913027e-06, |
| "loss": 0.3581, |
| "step": 4960 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 1.814450740814209, |
| "learning_rate": 2.6530368882202127e-06, |
| "loss": 0.3623, |
| "step": 4992 |
| }, |
| { |
| "epoch": 1.61, |
| "grad_norm": 2.031270980834961, |
| "learning_rate": 2.623293875307319e-06, |
| "loss": 0.3712, |
| "step": 5024 |
| }, |
| { |
| "epoch": 1.62, |
| "grad_norm": 1.736070156097412, |
| "learning_rate": 2.5935333574809385e-06, |
| "loss": 0.3663, |
| "step": 5056 |
| }, |
| { |
| "epoch": 1.63, |
| "grad_norm": 1.778670072555542, |
| "learning_rate": 2.5637595600546855e-06, |
| "loss": 0.3683, |
| "step": 5088 |
| }, |
| { |
| "epoch": 1.64, |
| "grad_norm": 1.6772242784500122, |
| "learning_rate": 2.533976710227574e-06, |
| "loss": 0.3687, |
| "step": 5120 |
| }, |
| { |
| "epoch": 1.65, |
| "grad_norm": 1.56125807762146, |
| "learning_rate": 2.504189036483851e-06, |
| "loss": 0.3556, |
| "step": 5152 |
| }, |
| { |
| "epoch": 1.66, |
| "grad_norm": 1.861116647720337, |
| "learning_rate": 2.4744007679926514e-06, |
| "loss": 0.3567, |
| "step": 5184 |
| }, |
| { |
| "epoch": 1.67, |
| "grad_norm": 2.2606282234191895, |
| "learning_rate": 2.444616134007549e-06, |
| "loss": 0.3358, |
| "step": 5216 |
| }, |
| { |
| "epoch": 1.68, |
| "grad_norm": 2.367314100265503, |
| "learning_rate": 2.4148393632661033e-06, |
| "loss": 0.3554, |
| "step": 5248 |
| }, |
| { |
| "epoch": 1.69, |
| "grad_norm": 2.0420873165130615, |
| "learning_rate": 2.385074683389469e-06, |
| "loss": 0.3627, |
| "step": 5280 |
| }, |
| { |
| "epoch": 1.7, |
| "grad_norm": 2.024829864501953, |
| "learning_rate": 2.3553263202821775e-06, |
| "loss": 0.3581, |
| "step": 5312 |
| }, |
| { |
| "epoch": 1.71, |
| "grad_norm": 2.053925037384033, |
| "learning_rate": 2.3255984975321503e-06, |
| "loss": 0.3531, |
| "step": 5344 |
| }, |
| { |
| "epoch": 1.72, |
| "grad_norm": 1.7462561130523682, |
| "learning_rate": 2.2958954358110467e-06, |
| "loss": 0.3574, |
| "step": 5376 |
| }, |
| { |
| "epoch": 1.73, |
| "grad_norm": 1.6682531833648682, |
| "learning_rate": 2.266221352275029e-06, |
| "loss": 0.3675, |
| "step": 5408 |
| }, |
| { |
| "epoch": 1.74, |
| "grad_norm": 1.618286371231079, |
| "learning_rate": 2.2365804599660147e-06, |
| "loss": 0.361, |
| "step": 5440 |
| }, |
| { |
| "epoch": 1.75, |
| "grad_norm": 2.0060200691223145, |
| "learning_rate": 2.2069769672135283e-06, |
| "loss": 0.3491, |
| "step": 5472 |
| }, |
| { |
| "epoch": 1.76, |
| "grad_norm": 1.9721282720565796, |
| "learning_rate": 2.1774150770372106e-06, |
| "loss": 0.3621, |
| "step": 5504 |
| }, |
| { |
| "epoch": 1.77, |
| "grad_norm": 1.5395954847335815, |
| "learning_rate": 2.147898986550087e-06, |
| "loss": 0.3621, |
| "step": 5536 |
| }, |
| { |
| "epoch": 1.78, |
| "grad_norm": 1.6971800327301025, |
| "learning_rate": 2.1184328863626754e-06, |
| "loss": 0.3637, |
| "step": 5568 |
| }, |
| { |
| "epoch": 1.79, |
| "grad_norm": 1.3535592555999756, |
| "learning_rate": 2.089020959988009e-06, |
| "loss": 0.3536, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 1.4577397108078003, |
| "learning_rate": 2.059667383247683e-06, |
| "loss": 0.3368, |
| "step": 5632 |
| }, |
| { |
| "epoch": 1.81, |
| "grad_norm": 1.9268568754196167, |
| "learning_rate": 2.0303763236789717e-06, |
| "loss": 0.3653, |
| "step": 5664 |
| }, |
| { |
| "epoch": 1.82, |
| "grad_norm": 1.6601406335830688, |
| "learning_rate": 2.001151939943144e-06, |
| "loss": 0.3661, |
| "step": 5696 |
| }, |
| { |
| "epoch": 1.83, |
| "grad_norm": 1.9766888618469238, |
| "learning_rate": 1.9719983812350193e-06, |
| "loss": 0.369, |
| "step": 5728 |
| }, |
| { |
| "epoch": 1.84, |
| "grad_norm": 1.538190484046936, |
| "learning_rate": 1.942919786693886e-06, |
| "loss": 0.3582, |
| "step": 5760 |
| }, |
| { |
| "epoch": 1.85, |
| "grad_norm": 2.4176807403564453, |
| "learning_rate": 1.913920284815831e-06, |
| "loss": 0.3556, |
| "step": 5792 |
| }, |
| { |
| "epoch": 1.86, |
| "grad_norm": 2.0340559482574463, |
| "learning_rate": 1.8850039928675898e-06, |
| "loss": 0.3705, |
| "step": 5824 |
| }, |
| { |
| "epoch": 1.87, |
| "grad_norm": 1.6979023218154907, |
| "learning_rate": 1.8561750163019896e-06, |
| "loss": 0.3571, |
| "step": 5856 |
| }, |
| { |
| "epoch": 1.88, |
| "grad_norm": 1.6448854207992554, |
| "learning_rate": 1.8274374481750662e-06, |
| "loss": 0.3598, |
| "step": 5888 |
| }, |
| { |
| "epoch": 1.89, |
| "grad_norm": 1.9256216287612915, |
| "learning_rate": 1.7987953685649485e-06, |
| "loss": 0.3704, |
| "step": 5920 |
| }, |
| { |
| "epoch": 1.9, |
| "grad_norm": 1.91543710231781, |
| "learning_rate": 1.7702528439925767e-06, |
| "loss": 0.3599, |
| "step": 5952 |
| }, |
| { |
| "epoch": 1.91, |
| "grad_norm": 1.8443219661712646, |
| "learning_rate": 1.7418139268443482e-06, |
| "loss": 0.3557, |
| "step": 5984 |
| }, |
| { |
| "epoch": 1.93, |
| "grad_norm": 1.9858685731887817, |
| "learning_rate": 1.7134826547967757e-06, |
| "loss": 0.3562, |
| "step": 6016 |
| }, |
| { |
| "epoch": 1.94, |
| "grad_norm": 1.5424166917800903, |
| "learning_rate": 1.6852630502432238e-06, |
| "loss": 0.3554, |
| "step": 6048 |
| }, |
| { |
| "epoch": 1.95, |
| "grad_norm": 1.9284968376159668, |
| "learning_rate": 1.6571591197228285e-06, |
| "loss": 0.3618, |
| "step": 6080 |
| }, |
| { |
| "epoch": 1.96, |
| "grad_norm": 1.8226544857025146, |
| "learning_rate": 1.629174853351651e-06, |
| "loss": 0.3528, |
| "step": 6112 |
| }, |
| { |
| "epoch": 1.97, |
| "grad_norm": 1.8396884202957153, |
| "learning_rate": 1.6013142242561813e-06, |
| "loss": 0.3507, |
| "step": 6144 |
| }, |
| { |
| "epoch": 1.98, |
| "grad_norm": 1.8388822078704834, |
| "learning_rate": 1.5735811880092394e-06, |
| "loss": 0.3611, |
| "step": 6176 |
| }, |
| { |
| "epoch": 1.99, |
| "grad_norm": 1.725791573524475, |
| "learning_rate": 1.5459796820683737e-06, |
| "loss": 0.3411, |
| "step": 6208 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 2.2812933921813965, |
| "learning_rate": 1.518513625216838e-06, |
| "loss": 0.3445, |
| "step": 6240 |
| }, |
| { |
| "epoch": 2.01, |
| "grad_norm": 1.6806988716125488, |
| "learning_rate": 1.491186917007206e-06, |
| "loss": 0.3137, |
| "step": 6272 |
| }, |
| { |
| "epoch": 2.02, |
| "grad_norm": 1.8829107284545898, |
| "learning_rate": 1.4640034372077322e-06, |
| "loss": 0.2856, |
| "step": 6304 |
| }, |
| { |
| "epoch": 2.03, |
| "grad_norm": 1.9766901731491089, |
| "learning_rate": 1.4369670452515044e-06, |
| "loss": 0.2604, |
| "step": 6336 |
| }, |
| { |
| "epoch": 2.04, |
| "grad_norm": 2.055413007736206, |
| "learning_rate": 1.4100815796884998e-06, |
| "loss": 0.2694, |
| "step": 6368 |
| }, |
| { |
| "epoch": 2.05, |
| "grad_norm": 1.782673954963684, |
| "learning_rate": 1.3833508576405974e-06, |
| "loss": 0.2694, |
| "step": 6400 |
| }, |
| { |
| "epoch": 2.06, |
| "grad_norm": 2.202526807785034, |
| "learning_rate": 1.3567786742596283e-06, |
| "loss": 0.2621, |
| "step": 6432 |
| }, |
| { |
| "epoch": 2.07, |
| "grad_norm": 2.1951637268066406, |
| "learning_rate": 1.3303688021885575e-06, |
| "loss": 0.2789, |
| "step": 6464 |
| }, |
| { |
| "epoch": 2.08, |
| "grad_norm": 1.7499464750289917, |
| "learning_rate": 1.304124991025852e-06, |
| "loss": 0.2621, |
| "step": 6496 |
| }, |
| { |
| "epoch": 2.09, |
| "grad_norm": 1.8003720045089722, |
| "learning_rate": 1.2780509667931217e-06, |
| "loss": 0.2715, |
| "step": 6528 |
| }, |
| { |
| "epoch": 2.1, |
| "grad_norm": 1.9713287353515625, |
| "learning_rate": 1.2521504314061173e-06, |
| "loss": 0.2717, |
| "step": 6560 |
| }, |
| { |
| "epoch": 2.11, |
| "grad_norm": 1.9567084312438965, |
| "learning_rate": 1.2264270621491286e-06, |
| "loss": 0.274, |
| "step": 6592 |
| }, |
| { |
| "epoch": 2.12, |
| "grad_norm": 2.075596809387207, |
| "learning_rate": 1.2008845111529088e-06, |
| "loss": 0.2612, |
| "step": 6624 |
| }, |
| { |
| "epoch": 2.13, |
| "grad_norm": 2.1604607105255127, |
| "learning_rate": 1.1755264048761464e-06, |
| "loss": 0.2704, |
| "step": 6656 |
| }, |
| { |
| "epoch": 2.14, |
| "grad_norm": 2.0567729473114014, |
| "learning_rate": 1.1503563435905943e-06, |
| "loss": 0.2711, |
| "step": 6688 |
| }, |
| { |
| "epoch": 2.15, |
| "grad_norm": 2.0198721885681152, |
| "learning_rate": 1.1253779008699131e-06, |
| "loss": 0.2699, |
| "step": 6720 |
| }, |
| { |
| "epoch": 2.16, |
| "grad_norm": 2.6587815284729004, |
| "learning_rate": 1.100594623082303e-06, |
| "loss": 0.2647, |
| "step": 6752 |
| }, |
| { |
| "epoch": 2.17, |
| "grad_norm": 1.6927788257598877, |
| "learning_rate": 1.0760100288870077e-06, |
| "loss": 0.2648, |
| "step": 6784 |
| }, |
| { |
| "epoch": 2.18, |
| "grad_norm": 2.316779613494873, |
| "learning_rate": 1.051627608734733e-06, |
| "loss": 0.276, |
| "step": 6816 |
| }, |
| { |
| "epoch": 2.19, |
| "grad_norm": 1.5877448320388794, |
| "learning_rate": 1.027450824372094e-06, |
| "loss": 0.2622, |
| "step": 6848 |
| }, |
| { |
| "epoch": 2.2, |
| "grad_norm": 2.0945749282836914, |
| "learning_rate": 1.0034831083501206e-06, |
| "loss": 0.2597, |
| "step": 6880 |
| }, |
| { |
| "epoch": 2.21, |
| "grad_norm": 2.4265644550323486, |
| "learning_rate": 9.797278635369137e-07, |
| "loss": 0.258, |
| "step": 6912 |
| }, |
| { |
| "epoch": 2.22, |
| "grad_norm": 1.770364761352539, |
| "learning_rate": 9.561884626345206e-07, |
| "loss": 0.2624, |
| "step": 6944 |
| }, |
| { |
| "epoch": 2.23, |
| "grad_norm": 2.046229839324951, |
| "learning_rate": 9.328682477000789e-07, |
| "loss": 0.2618, |
| "step": 6976 |
| }, |
| { |
| "epoch": 2.24, |
| "grad_norm": 2.9548141956329346, |
| "learning_rate": 9.097705296713297e-07, |
| "loss": 0.2697, |
| "step": 7008 |
| }, |
| { |
| "epoch": 2.25, |
| "grad_norm": 2.126011848449707, |
| "learning_rate": 8.868985878965366e-07, |
| "loss": 0.2661, |
| "step": 7040 |
| }, |
| { |
| "epoch": 2.26, |
| "grad_norm": 2.0025265216827393, |
| "learning_rate": 8.642556696688922e-07, |
| "loss": 0.2533, |
| "step": 7072 |
| }, |
| { |
| "epoch": 2.27, |
| "grad_norm": 1.7288347482681274, |
| "learning_rate": 8.41844989765479e-07, |
| "loss": 0.2644, |
| "step": 7104 |
| }, |
| { |
| "epoch": 2.28, |
| "grad_norm": 1.7986013889312744, |
| "learning_rate": 8.196697299908424e-07, |
| "loss": 0.2684, |
| "step": 7136 |
| }, |
| { |
| "epoch": 2.29, |
| "grad_norm": 2.160757541656494, |
| "learning_rate": 7.977330387252477e-07, |
| "loss": 0.2807, |
| "step": 7168 |
| }, |
| { |
| "epoch": 2.3, |
| "grad_norm": 2.1614770889282227, |
| "learning_rate": 7.760380304776832e-07, |
| "loss": 0.2801, |
| "step": 7200 |
| }, |
| { |
| "epoch": 2.31, |
| "grad_norm": 2.361065149307251, |
| "learning_rate": 7.545877854436698e-07, |
| "loss": 0.2604, |
| "step": 7232 |
| }, |
| { |
| "epoch": 2.32, |
| "grad_norm": 1.8524858951568604, |
| "learning_rate": 7.333853490679435e-07, |
| "loss": 0.2776, |
| "step": 7264 |
| }, |
| { |
| "epoch": 2.33, |
| "grad_norm": 1.9294536113739014, |
| "learning_rate": 7.124337316120735e-07, |
| "loss": 0.2795, |
| "step": 7296 |
| }, |
| { |
| "epoch": 2.34, |
| "grad_norm": 1.9421364068984985, |
| "learning_rate": 6.917359077270716e-07, |
| "loss": 0.2635, |
| "step": 7328 |
| }, |
| { |
| "epoch": 2.36, |
| "grad_norm": 1.9388601779937744, |
| "learning_rate": 6.712948160310612e-07, |
| "loss": 0.2571, |
| "step": 7360 |
| }, |
| { |
| "epoch": 2.37, |
| "grad_norm": 1.906515121459961, |
| "learning_rate": 6.511133586920601e-07, |
| "loss": 0.2634, |
| "step": 7392 |
| }, |
| { |
| "epoch": 2.38, |
| "grad_norm": 1.894484043121338, |
| "learning_rate": 6.311944010159394e-07, |
| "loss": 0.2725, |
| "step": 7424 |
| }, |
| { |
| "epoch": 2.39, |
| "grad_norm": 2.0004079341888428, |
| "learning_rate": 6.115407710396145e-07, |
| "loss": 0.2565, |
| "step": 7456 |
| }, |
| { |
| "epoch": 2.4, |
| "grad_norm": 2.3763046264648438, |
| "learning_rate": 5.921552591295304e-07, |
| "loss": 0.2634, |
| "step": 7488 |
| }, |
| { |
| "epoch": 2.41, |
| "grad_norm": 2.0313122272491455, |
| "learning_rate": 5.730406175854908e-07, |
| "loss": 0.2668, |
| "step": 7520 |
| }, |
| { |
| "epoch": 2.42, |
| "grad_norm": 2.0953962802886963, |
| "learning_rate": 5.54199560249897e-07, |
| "loss": 0.2708, |
| "step": 7552 |
| }, |
| { |
| "epoch": 2.43, |
| "grad_norm": 2.3282768726348877, |
| "learning_rate": 5.35634762122442e-07, |
| "loss": 0.2639, |
| "step": 7584 |
| }, |
| { |
| "epoch": 2.44, |
| "grad_norm": 2.0609822273254395, |
| "learning_rate": 5.173488589803238e-07, |
| "loss": 0.2691, |
| "step": 7616 |
| }, |
| { |
| "epoch": 2.45, |
| "grad_norm": 1.9901596307754517, |
| "learning_rate": 4.993444470040234e-07, |
| "loss": 0.2646, |
| "step": 7648 |
| }, |
| { |
| "epoch": 2.46, |
| "grad_norm": 1.9838128089904785, |
| "learning_rate": 4.816240824087076e-07, |
| "loss": 0.2665, |
| "step": 7680 |
| }, |
| { |
| "epoch": 2.47, |
| "grad_norm": 2.1872060298919678, |
| "learning_rate": 4.6419028108130456e-07, |
| "loss": 0.2665, |
| "step": 7712 |
| }, |
| { |
| "epoch": 2.48, |
| "grad_norm": 1.5998047590255737, |
| "learning_rate": 4.470455182233052e-07, |
| "loss": 0.2539, |
| "step": 7744 |
| }, |
| { |
| "epoch": 2.49, |
| "grad_norm": 1.842804193496704, |
| "learning_rate": 4.3019222799934117e-07, |
| "loss": 0.2562, |
| "step": 7776 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 2.4707839488983154, |
| "learning_rate": 4.1363280319158925e-07, |
| "loss": 0.2625, |
| "step": 7808 |
| }, |
| { |
| "epoch": 2.51, |
| "grad_norm": 2.171933174133301, |
| "learning_rate": 3.973695948600512e-07, |
| "loss": 0.2663, |
| "step": 7840 |
| }, |
| { |
| "epoch": 2.52, |
| "grad_norm": 2.098424196243286, |
| "learning_rate": 3.8140491200875567e-07, |
| "loss": 0.266, |
| "step": 7872 |
| }, |
| { |
| "epoch": 2.53, |
| "grad_norm": 2.3341708183288574, |
| "learning_rate": 3.6574102125793433e-07, |
| "loss": 0.2568, |
| "step": 7904 |
| }, |
| { |
| "epoch": 2.54, |
| "grad_norm": 2.1890530586242676, |
| "learning_rate": 3.50380146522212e-07, |
| "loss": 0.2735, |
| "step": 7936 |
| }, |
| { |
| "epoch": 2.55, |
| "grad_norm": 1.7215001583099365, |
| "learning_rate": 3.3532446869486255e-07, |
| "loss": 0.2553, |
| "step": 7968 |
| }, |
| { |
| "epoch": 2.56, |
| "grad_norm": 2.284882068634033, |
| "learning_rate": 3.205761253381706e-07, |
| "loss": 0.2622, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.57, |
| "grad_norm": 2.0944039821624756, |
| "learning_rate": 3.061372103799487e-07, |
| "loss": 0.2687, |
| "step": 8032 |
| }, |
| { |
| "epoch": 2.58, |
| "grad_norm": 1.7873029708862305, |
| "learning_rate": 2.920097738162453e-07, |
| "loss": 0.2609, |
| "step": 8064 |
| }, |
| { |
| "epoch": 2.59, |
| "grad_norm": 1.9296663999557495, |
| "learning_rate": 2.781958214202918e-07, |
| "loss": 0.2711, |
| "step": 8096 |
| }, |
| { |
| "epoch": 2.6, |
| "grad_norm": 2.2704007625579834, |
| "learning_rate": 2.646973144577325e-07, |
| "loss": 0.2591, |
| "step": 8128 |
| }, |
| { |
| "epoch": 2.61, |
| "grad_norm": 2.158012628555298, |
| "learning_rate": 2.515161694081647e-07, |
| "loss": 0.2572, |
| "step": 8160 |
| }, |
| { |
| "epoch": 2.62, |
| "grad_norm": 2.1333682537078857, |
| "learning_rate": 2.386542576930456e-07, |
| "loss": 0.2707, |
| "step": 8192 |
| }, |
| { |
| "epoch": 2.63, |
| "grad_norm": 2.117429733276367, |
| "learning_rate": 2.261134054099917e-07, |
| "loss": 0.2548, |
| "step": 8224 |
| }, |
| { |
| "epoch": 2.64, |
| "grad_norm": 1.9128592014312744, |
| "learning_rate": 2.1389539307351547e-07, |
| "loss": 0.2694, |
| "step": 8256 |
| }, |
| { |
| "epoch": 2.65, |
| "grad_norm": 2.2764246463775635, |
| "learning_rate": 2.0200195536223267e-07, |
| "loss": 0.2689, |
| "step": 8288 |
| }, |
| { |
| "epoch": 2.66, |
| "grad_norm": 1.794533371925354, |
| "learning_rate": 1.9043478087257623e-07, |
| "loss": 0.2625, |
| "step": 8320 |
| }, |
| { |
| "epoch": 2.67, |
| "grad_norm": 1.9093414545059204, |
| "learning_rate": 1.7919551187905837e-07, |
| "loss": 0.2612, |
| "step": 8352 |
| }, |
| { |
| "epoch": 2.68, |
| "grad_norm": 1.827714443206787, |
| "learning_rate": 1.6828574410110016e-07, |
| "loss": 0.2603, |
| "step": 8384 |
| }, |
| { |
| "epoch": 2.69, |
| "grad_norm": 2.5951650142669678, |
| "learning_rate": 1.5770702647647823e-07, |
| "loss": 0.2654, |
| "step": 8416 |
| }, |
| { |
| "epoch": 2.7, |
| "grad_norm": 2.659973621368408, |
| "learning_rate": 1.474608609414113e-07, |
| "loss": 0.2644, |
| "step": 8448 |
| }, |
| { |
| "epoch": 2.71, |
| "grad_norm": 2.4667563438415527, |
| "learning_rate": 1.3754870221731775e-07, |
| "loss": 0.2583, |
| "step": 8480 |
| }, |
| { |
| "epoch": 2.72, |
| "grad_norm": 1.7934099435806274, |
| "learning_rate": 1.2797195760428093e-07, |
| "loss": 0.2624, |
| "step": 8512 |
| }, |
| { |
| "epoch": 2.73, |
| "grad_norm": 2.117466926574707, |
| "learning_rate": 1.1873198678124309e-07, |
| "loss": 0.2452, |
| "step": 8544 |
| }, |
| { |
| "epoch": 2.74, |
| "grad_norm": 1.7996597290039062, |
| "learning_rate": 1.0983010161296215e-07, |
| "loss": 0.2512, |
| "step": 8576 |
| }, |
| { |
| "epoch": 2.75, |
| "grad_norm": 1.920240879058838, |
| "learning_rate": 1.0126756596375687e-07, |
| "loss": 0.2631, |
| "step": 8608 |
| }, |
| { |
| "epoch": 2.76, |
| "grad_norm": 2.1029436588287354, |
| "learning_rate": 9.304559551806675e-08, |
| "loss": 0.2703, |
| "step": 8640 |
| }, |
| { |
| "epoch": 2.78, |
| "grad_norm": 2.1418569087982178, |
| "learning_rate": 8.516535760785455e-08, |
| "loss": 0.2621, |
| "step": 8672 |
| }, |
| { |
| "epoch": 2.79, |
| "grad_norm": 2.1141951084136963, |
| "learning_rate": 7.762797104686858e-08, |
| "loss": 0.248, |
| "step": 8704 |
| }, |
| { |
| "epoch": 2.8, |
| "grad_norm": 2.1870038509368896, |
| "learning_rate": 7.043450597179979e-08, |
| "loss": 0.2679, |
| "step": 8736 |
| }, |
| { |
| "epoch": 2.81, |
| "grad_norm": 2.4698402881622314, |
| "learning_rate": 6.358598369034518e-08, |
| "loss": 0.2649, |
| "step": 8768 |
| }, |
| { |
| "epoch": 2.82, |
| "grad_norm": 1.8242698907852173, |
| "learning_rate": 5.7083376536204436e-08, |
| "loss": 0.268, |
| "step": 8800 |
| }, |
| { |
| "epoch": 2.83, |
| "grad_norm": 2.2052414417266846, |
| "learning_rate": 5.092760773103417e-08, |
| "loss": 0.2625, |
| "step": 8832 |
| }, |
| { |
| "epoch": 2.84, |
| "grad_norm": 2.050403118133545, |
| "learning_rate": 4.511955125336726e-08, |
| "loss": 0.2668, |
| "step": 8864 |
| }, |
| { |
| "epoch": 2.85, |
| "grad_norm": 1.8949859142303467, |
| "learning_rate": 3.966003171453181e-08, |
| "loss": 0.2615, |
| "step": 8896 |
| }, |
| { |
| "epoch": 2.86, |
| "grad_norm": 1.904829740524292, |
| "learning_rate": 3.4549824241572326e-08, |
| "loss": 0.2522, |
| "step": 8928 |
| }, |
| { |
| "epoch": 2.87, |
| "grad_norm": 1.9748365879058838, |
| "learning_rate": 2.9789654367200492e-08, |
| "loss": 0.2637, |
| "step": 8960 |
| }, |
| { |
| "epoch": 2.88, |
| "grad_norm": 1.9434069395065308, |
| "learning_rate": 2.538019792678703e-08, |
| "loss": 0.2555, |
| "step": 8992 |
| }, |
| { |
| "epoch": 2.89, |
| "grad_norm": 1.8021697998046875, |
| "learning_rate": 2.1322080962405677e-08, |
| "loss": 0.2479, |
| "step": 9024 |
| }, |
| { |
| "epoch": 2.9, |
| "grad_norm": 1.9932126998901367, |
| "learning_rate": 1.7615879633953724e-08, |
| "loss": 0.2724, |
| "step": 9056 |
| }, |
| { |
| "epoch": 2.91, |
| "grad_norm": 2.393543243408203, |
| "learning_rate": 1.4262120137345791e-08, |
| "loss": 0.2662, |
| "step": 9088 |
| }, |
| { |
| "epoch": 2.92, |
| "grad_norm": 2.3378348350524902, |
| "learning_rate": 1.1261278629810246e-08, |
| "loss": 0.2719, |
| "step": 9120 |
| }, |
| { |
| "epoch": 2.93, |
| "grad_norm": 1.788970947265625, |
| "learning_rate": 8.613781162282731e-09, |
| "loss": 0.2573, |
| "step": 9152 |
| }, |
| { |
| "epoch": 2.94, |
| "grad_norm": 1.7528036832809448, |
| "learning_rate": 6.32000361891788e-09, |
| "loss": 0.2645, |
| "step": 9184 |
| }, |
| { |
| "epoch": 2.95, |
| "grad_norm": 2.0878617763519287, |
| "learning_rate": 4.380271663723401e-09, |
| "loss": 0.2552, |
| "step": 9216 |
| }, |
| { |
| "epoch": 2.96, |
| "grad_norm": 1.8262449502944946, |
| "learning_rate": 2.794860694320389e-09, |
| "loss": 0.2565, |
| "step": 9248 |
| }, |
| { |
| "epoch": 2.97, |
| "grad_norm": 1.9307301044464111, |
| "learning_rate": 1.5639958028462742e-09, |
| "loss": 0.2743, |
| "step": 9280 |
| }, |
| { |
| "epoch": 2.98, |
| "grad_norm": 1.9267100095748901, |
| "learning_rate": 6.878517439948274e-10, |
| "loss": 0.2629, |
| "step": 9312 |
| }, |
| { |
| "epoch": 2.99, |
| "grad_norm": 1.860356330871582, |
| "learning_rate": 1.6655291020573062e-10, |
| "loss": 0.2593, |
| "step": 9344 |
| } |
| ], |
| "logging_steps": 32, |
| "max_steps": 9375, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 3125, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |