| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.0, | |
| "eval_steps": 500, | |
| "global_step": 625, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": 3.0057201385498047, | |
| "learning_rate": 0.0, | |
| "loss": 0.7767, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 4.027021884918213, | |
| "learning_rate": 1.7543859649122808e-07, | |
| "loss": 0.9309, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "grad_norm": 4.203588485717773, | |
| "learning_rate": 3.5087719298245616e-07, | |
| "loss": 1.0352, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 3.4434900283813477, | |
| "learning_rate": 5.263157894736843e-07, | |
| "loss": 0.8864, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 3.71036958694458, | |
| "learning_rate": 7.017543859649123e-07, | |
| "loss": 0.9448, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 3.9288582801818848, | |
| "learning_rate": 8.771929824561404e-07, | |
| "loss": 0.9452, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "grad_norm": 3.28279447555542, | |
| "learning_rate": 1.0526315789473685e-06, | |
| "loss": 0.9235, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 3.7405591011047363, | |
| "learning_rate": 1.2280701754385965e-06, | |
| "loss": 0.9624, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "grad_norm": 3.152585983276367, | |
| "learning_rate": 1.4035087719298246e-06, | |
| "loss": 0.9113, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 2.855647325515747, | |
| "learning_rate": 1.5789473684210526e-06, | |
| "loss": 0.9915, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "grad_norm": 2.4473962783813477, | |
| "learning_rate": 1.7543859649122807e-06, | |
| "loss": 0.9369, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 1.9605332612991333, | |
| "learning_rate": 1.929824561403509e-06, | |
| "loss": 0.8725, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "grad_norm": 1.6995816230773926, | |
| "learning_rate": 2.105263157894737e-06, | |
| "loss": 0.8757, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 1.5501030683517456, | |
| "learning_rate": 2.280701754385965e-06, | |
| "loss": 0.8101, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 1.3763302564620972, | |
| "learning_rate": 2.456140350877193e-06, | |
| "loss": 0.7746, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 1.3676273822784424, | |
| "learning_rate": 2.631578947368421e-06, | |
| "loss": 0.8405, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.136, | |
| "grad_norm": 1.2178465127944946, | |
| "learning_rate": 2.8070175438596493e-06, | |
| "loss": 0.8295, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 1.5485527515411377, | |
| "learning_rate": 2.9824561403508774e-06, | |
| "loss": 0.9355, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.152, | |
| "grad_norm": 1.5414535999298096, | |
| "learning_rate": 3.157894736842105e-06, | |
| "loss": 0.9135, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 1.202146053314209, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 0.7358, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.168, | |
| "grad_norm": 1.4990830421447754, | |
| "learning_rate": 3.5087719298245615e-06, | |
| "loss": 0.8796, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 1.2759506702423096, | |
| "learning_rate": 3.6842105263157896e-06, | |
| "loss": 0.8877, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.184, | |
| "grad_norm": 1.2119123935699463, | |
| "learning_rate": 3.859649122807018e-06, | |
| "loss": 0.8834, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 1.2157747745513916, | |
| "learning_rate": 4.035087719298246e-06, | |
| "loss": 0.7249, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 1.2723734378814697, | |
| "learning_rate": 4.210526315789474e-06, | |
| "loss": 0.8532, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 1.1948084831237793, | |
| "learning_rate": 4.385964912280702e-06, | |
| "loss": 0.8083, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.216, | |
| "grad_norm": 1.4162577390670776, | |
| "learning_rate": 4.56140350877193e-06, | |
| "loss": 0.8205, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 1.314074993133545, | |
| "learning_rate": 4.736842105263158e-06, | |
| "loss": 0.9847, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.232, | |
| "grad_norm": 1.282394289970398, | |
| "learning_rate": 4.912280701754386e-06, | |
| "loss": 0.7886, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 1.1445820331573486, | |
| "learning_rate": 5.087719298245615e-06, | |
| "loss": 0.7637, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.248, | |
| "grad_norm": 1.146834373474121, | |
| "learning_rate": 5.263157894736842e-06, | |
| "loss": 0.7831, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 1.5468993186950684, | |
| "learning_rate": 5.438596491228071e-06, | |
| "loss": 0.8063, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.264, | |
| "grad_norm": 1.1543419361114502, | |
| "learning_rate": 5.6140350877192985e-06, | |
| "loss": 0.6723, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 0.9877017736434937, | |
| "learning_rate": 5.789473684210527e-06, | |
| "loss": 0.7602, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 1.2613970041275024, | |
| "learning_rate": 5.964912280701755e-06, | |
| "loss": 0.876, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 1.065625786781311, | |
| "learning_rate": 6.140350877192983e-06, | |
| "loss": 0.8201, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.296, | |
| "grad_norm": 1.5083690881729126, | |
| "learning_rate": 6.31578947368421e-06, | |
| "loss": 0.8454, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 1.3322983980178833, | |
| "learning_rate": 6.491228070175439e-06, | |
| "loss": 0.7553, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.312, | |
| "grad_norm": 0.8651496171951294, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 0.6321, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 1.0781185626983643, | |
| "learning_rate": 6.842105263157896e-06, | |
| "loss": 0.7641, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.328, | |
| "grad_norm": 1.300404667854309, | |
| "learning_rate": 7.017543859649123e-06, | |
| "loss": 0.8532, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 0.9824493527412415, | |
| "learning_rate": 7.192982456140352e-06, | |
| "loss": 0.708, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.344, | |
| "grad_norm": 0.9752010107040405, | |
| "learning_rate": 7.368421052631579e-06, | |
| "loss": 0.7339, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 0.9740676879882812, | |
| "learning_rate": 7.5438596491228074e-06, | |
| "loss": 0.8256, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.8943621516227722, | |
| "learning_rate": 7.719298245614036e-06, | |
| "loss": 0.7857, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 0.9791707396507263, | |
| "learning_rate": 7.894736842105265e-06, | |
| "loss": 0.6158, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.376, | |
| "grad_norm": 1.1584774255752563, | |
| "learning_rate": 8.070175438596492e-06, | |
| "loss": 0.7807, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 1.1623049974441528, | |
| "learning_rate": 8.24561403508772e-06, | |
| "loss": 0.863, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.392, | |
| "grad_norm": 0.944733738899231, | |
| "learning_rate": 8.421052631578948e-06, | |
| "loss": 0.6933, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 1.1078664064407349, | |
| "learning_rate": 8.596491228070176e-06, | |
| "loss": 0.7738, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.408, | |
| "grad_norm": 1.0056620836257935, | |
| "learning_rate": 8.771929824561405e-06, | |
| "loss": 0.8474, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 1.0359532833099365, | |
| "learning_rate": 8.947368421052632e-06, | |
| "loss": 0.7872, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.424, | |
| "grad_norm": 1.0367261171340942, | |
| "learning_rate": 9.12280701754386e-06, | |
| "loss": 0.7232, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 1.2011809349060059, | |
| "learning_rate": 9.298245614035088e-06, | |
| "loss": 0.7479, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 1.2602508068084717, | |
| "learning_rate": 9.473684210526315e-06, | |
| "loss": 0.8371, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 1.0027590990066528, | |
| "learning_rate": 9.649122807017545e-06, | |
| "loss": 0.7311, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.456, | |
| "grad_norm": 0.968899130821228, | |
| "learning_rate": 9.824561403508772e-06, | |
| "loss": 0.6074, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 1.0579102039337158, | |
| "learning_rate": 1e-05, | |
| "loss": 0.8013, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.472, | |
| "grad_norm": 1.0420589447021484, | |
| "learning_rate": 9.999978367986988e-06, | |
| "loss": 0.6804, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.1043133735656738, | |
| "learning_rate": 9.999913472135126e-06, | |
| "loss": 0.6541, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.488, | |
| "grad_norm": 1.1697230339050293, | |
| "learning_rate": 9.999805313005946e-06, | |
| "loss": 0.8051, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 1.1130088567733765, | |
| "learning_rate": 9.99965389153533e-06, | |
| "loss": 0.8385, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.504, | |
| "grad_norm": 1.0213102102279663, | |
| "learning_rate": 9.999459209033495e-06, | |
| "loss": 0.8227, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 1.2667725086212158, | |
| "learning_rate": 9.999221267184993e-06, | |
| "loss": 0.7983, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 1.3060858249664307, | |
| "learning_rate": 9.998940068048688e-06, | |
| "loss": 0.8821, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 1.2459371089935303, | |
| "learning_rate": 9.998615614057743e-06, | |
| "loss": 0.7995, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.536, | |
| "grad_norm": 1.1216330528259277, | |
| "learning_rate": 9.998247908019594e-06, | |
| "loss": 0.7072, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 1.0714335441589355, | |
| "learning_rate": 9.997836953115927e-06, | |
| "loss": 0.8306, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.552, | |
| "grad_norm": 1.1212742328643799, | |
| "learning_rate": 9.997382752902658e-06, | |
| "loss": 0.6879, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 1.4561703205108643, | |
| "learning_rate": 9.996885311309892e-06, | |
| "loss": 0.7157, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.568, | |
| "grad_norm": 1.0812970399856567, | |
| "learning_rate": 9.996344632641895e-06, | |
| "loss": 0.7786, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 1.0658122301101685, | |
| "learning_rate": 9.995760721577053e-06, | |
| "loss": 0.7545, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.584, | |
| "grad_norm": 1.0522650480270386, | |
| "learning_rate": 9.995133583167833e-06, | |
| "loss": 0.7943, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 1.0689505338668823, | |
| "learning_rate": 9.994463222840748e-06, | |
| "loss": 0.8227, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 1.0530729293823242, | |
| "learning_rate": 9.993749646396286e-06, | |
| "loss": 0.67, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 1.1894912719726562, | |
| "learning_rate": 9.992992860008893e-06, | |
| "loss": 0.7187, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.616, | |
| "grad_norm": 1.136382818222046, | |
| "learning_rate": 9.99219287022689e-06, | |
| "loss": 0.8166, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 1.302037000656128, | |
| "learning_rate": 9.991349683972435e-06, | |
| "loss": 0.6921, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.632, | |
| "grad_norm": 1.112160325050354, | |
| "learning_rate": 9.990463308541452e-06, | |
| "loss": 0.6257, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.0237510204315186, | |
| "learning_rate": 9.989533751603578e-06, | |
| "loss": 0.7005, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.648, | |
| "grad_norm": 1.159624695777893, | |
| "learning_rate": 9.988561021202083e-06, | |
| "loss": 0.7886, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 0.8578418493270874, | |
| "learning_rate": 9.987545125753818e-06, | |
| "loss": 0.6552, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.664, | |
| "grad_norm": 1.2471290826797485, | |
| "learning_rate": 9.986486074049131e-06, | |
| "loss": 0.7921, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 1.5150797367095947, | |
| "learning_rate": 9.985383875251783e-06, | |
| "loss": 0.6081, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.8957585692405701, | |
| "learning_rate": 9.98423853889889e-06, | |
| "loss": 0.7087, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 1.407145619392395, | |
| "learning_rate": 9.983050074900824e-06, | |
| "loss": 0.7093, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.696, | |
| "grad_norm": 1.2437479496002197, | |
| "learning_rate": 9.98181849354113e-06, | |
| "loss": 0.6716, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 1.0245378017425537, | |
| "learning_rate": 9.980543805476447e-06, | |
| "loss": 0.7025, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.712, | |
| "grad_norm": 0.9500531554222107, | |
| "learning_rate": 9.979226021736396e-06, | |
| "loss": 0.6935, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 1.0900883674621582, | |
| "learning_rate": 9.977865153723508e-06, | |
| "loss": 0.7676, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.728, | |
| "grad_norm": 1.1064441204071045, | |
| "learning_rate": 9.976461213213104e-06, | |
| "loss": 0.7307, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 1.0186768770217896, | |
| "learning_rate": 9.975014212353212e-06, | |
| "loss": 0.7, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.744, | |
| "grad_norm": 1.1086044311523438, | |
| "learning_rate": 9.973524163664447e-06, | |
| "loss": 0.7366, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 1.0784200429916382, | |
| "learning_rate": 9.971991080039912e-06, | |
| "loss": 0.8675, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 1.015350580215454, | |
| "learning_rate": 9.970414974745077e-06, | |
| "loss": 0.7899, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 1.0211460590362549, | |
| "learning_rate": 9.968795861417676e-06, | |
| "loss": 0.6649, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.776, | |
| "grad_norm": 1.1149851083755493, | |
| "learning_rate": 9.967133754067581e-06, | |
| "loss": 0.7986, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 1.219916820526123, | |
| "learning_rate": 9.965428667076687e-06, | |
| "loss": 0.6227, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.792, | |
| "grad_norm": 0.9323999881744385, | |
| "learning_rate": 9.963680615198774e-06, | |
| "loss": 0.7846, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.994877278804779, | |
| "learning_rate": 9.961889613559396e-06, | |
| "loss": 0.6859, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.808, | |
| "grad_norm": 1.0774526596069336, | |
| "learning_rate": 9.960055677655743e-06, | |
| "loss": 0.7118, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 1.0737630128860474, | |
| "learning_rate": 9.958178823356503e-06, | |
| "loss": 0.7428, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.824, | |
| "grad_norm": 1.024340271949768, | |
| "learning_rate": 9.956259066901733e-06, | |
| "loss": 0.74, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 0.9770359396934509, | |
| "learning_rate": 9.954296424902709e-06, | |
| "loss": 0.7257, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 1.2270859479904175, | |
| "learning_rate": 9.95229091434179e-06, | |
| "loss": 0.725, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 1.1405079364776611, | |
| "learning_rate": 9.950242552572272e-06, | |
| "loss": 0.8309, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.856, | |
| "grad_norm": 0.9354670643806458, | |
| "learning_rate": 9.948151357318228e-06, | |
| "loss": 0.6561, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 1.0575851202011108, | |
| "learning_rate": 9.946017346674362e-06, | |
| "loss": 0.8447, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.872, | |
| "grad_norm": 1.2344043254852295, | |
| "learning_rate": 9.943840539105853e-06, | |
| "loss": 0.7488, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 1.1389329433441162, | |
| "learning_rate": 9.941620953448195e-06, | |
| "loss": 0.777, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.888, | |
| "grad_norm": 1.1365277767181396, | |
| "learning_rate": 9.939358608907026e-06, | |
| "loss": 0.7491, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 0.9578616619110107, | |
| "learning_rate": 9.937053525057977e-06, | |
| "loss": 0.7264, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.904, | |
| "grad_norm": 1.0343176126480103, | |
| "learning_rate": 9.934705721846487e-06, | |
| "loss": 0.6402, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 1.0238369703292847, | |
| "learning_rate": 9.932315219587641e-06, | |
| "loss": 0.6726, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.9988969564437866, | |
| "learning_rate": 9.92988203896599e-06, | |
| "loss": 0.6303, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 1.1477857828140259, | |
| "learning_rate": 9.927406201035368e-06, | |
| "loss": 0.6896, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.936, | |
| "grad_norm": 1.7063319683074951, | |
| "learning_rate": 9.924887727218724e-06, | |
| "loss": 0.6522, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 1.0718915462493896, | |
| "learning_rate": 9.922326639307918e-06, | |
| "loss": 0.7174, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.952, | |
| "grad_norm": 1.186877965927124, | |
| "learning_rate": 9.919722959463545e-06, | |
| "loss": 0.7575, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.8924572467803955, | |
| "learning_rate": 9.917076710214739e-06, | |
| "loss": 0.6954, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.968, | |
| "grad_norm": 1.1163474321365356, | |
| "learning_rate": 9.914387914458983e-06, | |
| "loss": 0.6926, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": 0.9361369013786316, | |
| "learning_rate": 9.911656595461899e-06, | |
| "loss": 0.6776, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.984, | |
| "grad_norm": 1.3585964441299438, | |
| "learning_rate": 9.908882776857057e-06, | |
| "loss": 0.7793, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 1.1526217460632324, | |
| "learning_rate": 9.906066482645774e-06, | |
| "loss": 0.7857, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.0375585556030273, | |
| "learning_rate": 9.903207737196892e-06, | |
| "loss": 0.6332, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.008, | |
| "grad_norm": 1.5819244384765625, | |
| "learning_rate": 9.900306565246579e-06, | |
| "loss": 0.6019, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.016, | |
| "grad_norm": 0.9700779914855957, | |
| "learning_rate": 9.89736299189811e-06, | |
| "loss": 0.6471, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 1.024, | |
| "grad_norm": 1.0393579006195068, | |
| "learning_rate": 9.894377042621654e-06, | |
| "loss": 0.594, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 1.032, | |
| "grad_norm": 1.0894830226898193, | |
| "learning_rate": 9.891348743254046e-06, | |
| "loss": 0.8139, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 1.1021678447723389, | |
| "learning_rate": 9.888278119998573e-06, | |
| "loss": 0.5949, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.048, | |
| "grad_norm": 1.0352238416671753, | |
| "learning_rate": 9.885165199424738e-06, | |
| "loss": 0.7143, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 1.056, | |
| "grad_norm": 1.0344549417495728, | |
| "learning_rate": 9.882010008468038e-06, | |
| "loss": 0.6757, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 1.064, | |
| "grad_norm": 1.0013649463653564, | |
| "learning_rate": 9.878812574429722e-06, | |
| "loss": 0.6091, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 1.072, | |
| "grad_norm": 0.8968698978424072, | |
| "learning_rate": 9.875572924976568e-06, | |
| "loss": 0.5219, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 0.9471760988235474, | |
| "learning_rate": 9.87229108814063e-06, | |
| "loss": 0.4959, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.088, | |
| "grad_norm": 1.2148710489273071, | |
| "learning_rate": 9.868967092319003e-06, | |
| "loss": 0.6285, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 1.096, | |
| "grad_norm": 3.3314623832702637, | |
| "learning_rate": 9.865600966273576e-06, | |
| "loss": 0.5498, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 1.104, | |
| "grad_norm": 0.9996878504753113, | |
| "learning_rate": 9.86219273913078e-06, | |
| "loss": 0.597, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.112, | |
| "grad_norm": 1.085577368736267, | |
| "learning_rate": 9.858742440381343e-06, | |
| "loss": 0.6418, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 0.9825400710105896, | |
| "learning_rate": 9.855250099880026e-06, | |
| "loss": 0.5821, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.1280000000000001, | |
| "grad_norm": 1.9832978248596191, | |
| "learning_rate": 9.851715747845372e-06, | |
| "loss": 0.5783, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 1.1360000000000001, | |
| "grad_norm": 1.178113579750061, | |
| "learning_rate": 9.848139414859441e-06, | |
| "loss": 0.6737, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 1.144, | |
| "grad_norm": 1.0589377880096436, | |
| "learning_rate": 9.844521131867546e-06, | |
| "loss": 0.6187, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 1.152, | |
| "grad_norm": 0.8780454397201538, | |
| "learning_rate": 9.840860930177984e-06, | |
| "loss": 0.5028, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 0.9216269850730896, | |
| "learning_rate": 9.837158841461767e-06, | |
| "loss": 0.6243, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.168, | |
| "grad_norm": 0.894197940826416, | |
| "learning_rate": 9.833414897752346e-06, | |
| "loss": 0.5588, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 1.176, | |
| "grad_norm": 0.8651962876319885, | |
| "learning_rate": 9.829629131445342e-06, | |
| "loss": 0.6252, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 1.184, | |
| "grad_norm": 1.1386091709136963, | |
| "learning_rate": 9.825801575298248e-06, | |
| "loss": 0.5398, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 1.192, | |
| "grad_norm": 1.014466643333435, | |
| "learning_rate": 9.821932262430164e-06, | |
| "loss": 0.6413, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 1.0604788064956665, | |
| "learning_rate": 9.818021226321502e-06, | |
| "loss": 0.545, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.208, | |
| "grad_norm": 1.0483622550964355, | |
| "learning_rate": 9.814068500813692e-06, | |
| "loss": 0.5889, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 1.216, | |
| "grad_norm": 1.0369048118591309, | |
| "learning_rate": 9.8100741201089e-06, | |
| "loss": 0.5274, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 1.224, | |
| "grad_norm": 1.0958386659622192, | |
| "learning_rate": 9.806038118769724e-06, | |
| "loss": 0.7628, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 1.232, | |
| "grad_norm": 1.6769359111785889, | |
| "learning_rate": 9.801960531718898e-06, | |
| "loss": 0.6351, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 0.9066492319107056, | |
| "learning_rate": 9.797841394238987e-06, | |
| "loss": 0.5705, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.248, | |
| "grad_norm": 1.1000174283981323, | |
| "learning_rate": 9.793680741972084e-06, | |
| "loss": 0.6308, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.256, | |
| "grad_norm": 1.5397871732711792, | |
| "learning_rate": 9.789478610919508e-06, | |
| "loss": 0.677, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 1.264, | |
| "grad_norm": 0.9126397371292114, | |
| "learning_rate": 9.785235037441473e-06, | |
| "loss": 0.6686, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.272, | |
| "grad_norm": 0.8568590879440308, | |
| "learning_rate": 9.780950058256802e-06, | |
| "loss": 0.6127, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 1.0696728229522705, | |
| "learning_rate": 9.77662371044258e-06, | |
| "loss": 0.7597, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.288, | |
| "grad_norm": 0.9905379414558411, | |
| "learning_rate": 9.77225603143385e-06, | |
| "loss": 0.6321, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 1.296, | |
| "grad_norm": 0.9721628427505493, | |
| "learning_rate": 9.767847059023292e-06, | |
| "loss": 0.6696, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.304, | |
| "grad_norm": 0.9839254021644592, | |
| "learning_rate": 9.763396831360884e-06, | |
| "loss": 0.6746, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 1.312, | |
| "grad_norm": 1.0584723949432373, | |
| "learning_rate": 9.75890538695358e-06, | |
| "loss": 0.7859, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 1.1244181394577026, | |
| "learning_rate": 9.75437276466497e-06, | |
| "loss": 0.7305, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.328, | |
| "grad_norm": 0.8547583222389221, | |
| "learning_rate": 9.749799003714954e-06, | |
| "loss": 0.633, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.336, | |
| "grad_norm": 0.8881062269210815, | |
| "learning_rate": 9.745184143679398e-06, | |
| "loss": 0.4783, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 0.8600384593009949, | |
| "learning_rate": 9.74052822448978e-06, | |
| "loss": 0.6902, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.3519999999999999, | |
| "grad_norm": 1.004089117050171, | |
| "learning_rate": 9.735831286432869e-06, | |
| "loss": 0.6668, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 0.8583224415779114, | |
| "learning_rate": 9.731093370150349e-06, | |
| "loss": 0.4957, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.3679999999999999, | |
| "grad_norm": 1.9749398231506348, | |
| "learning_rate": 9.72631451663849e-06, | |
| "loss": 0.6108, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 1.376, | |
| "grad_norm": 0.9652974605560303, | |
| "learning_rate": 9.721494767247779e-06, | |
| "loss": 0.6986, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 1.384, | |
| "grad_norm": 0.994115948677063, | |
| "learning_rate": 9.71663416368257e-06, | |
| "loss": 0.622, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 1.392, | |
| "grad_norm": 0.7899606823921204, | |
| "learning_rate": 9.71173274800072e-06, | |
| "loss": 0.5757, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 1.004468560218811, | |
| "learning_rate": 9.70679056261322e-06, | |
| "loss": 0.6448, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.408, | |
| "grad_norm": 0.967308759689331, | |
| "learning_rate": 9.70180765028384e-06, | |
| "loss": 0.5976, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 1.416, | |
| "grad_norm": 0.9774156212806702, | |
| "learning_rate": 9.696784054128749e-06, | |
| "loss": 0.7471, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 1.424, | |
| "grad_norm": 1.028783917427063, | |
| "learning_rate": 9.691719817616148e-06, | |
| "loss": 0.6287, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 1.432, | |
| "grad_norm": 0.9413847327232361, | |
| "learning_rate": 9.686614984565888e-06, | |
| "loss": 0.6325, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 0.9678646922111511, | |
| "learning_rate": 9.681469599149093e-06, | |
| "loss": 0.7941, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.448, | |
| "grad_norm": 1.135720133781433, | |
| "learning_rate": 9.676283705887783e-06, | |
| "loss": 0.6387, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 1.456, | |
| "grad_norm": 1.1593883037567139, | |
| "learning_rate": 9.671057349654481e-06, | |
| "loss": 0.6043, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 1.464, | |
| "grad_norm": 0.8413971662521362, | |
| "learning_rate": 9.66579057567183e-06, | |
| "loss": 0.5406, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 1.472, | |
| "grad_norm": 0.9617403745651245, | |
| "learning_rate": 9.660483429512198e-06, | |
| "loss": 0.6032, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 1.1562846899032593, | |
| "learning_rate": 9.65513595709729e-06, | |
| "loss": 0.6027, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.488, | |
| "grad_norm": 0.9406894445419312, | |
| "learning_rate": 9.649748204697741e-06, | |
| "loss": 0.5774, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 1.496, | |
| "grad_norm": 0.7794913649559021, | |
| "learning_rate": 9.644320218932723e-06, | |
| "loss": 0.5726, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 1.504, | |
| "grad_norm": 0.8694751858711243, | |
| "learning_rate": 9.63885204676954e-06, | |
| "loss": 0.5695, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 1.512, | |
| "grad_norm": 0.9629790186882019, | |
| "learning_rate": 9.63334373552322e-06, | |
| "loss": 0.6327, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 1.1740370988845825, | |
| "learning_rate": 9.627795332856107e-06, | |
| "loss": 0.6442, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.528, | |
| "grad_norm": 1.0454487800598145, | |
| "learning_rate": 9.622206886777448e-06, | |
| "loss": 0.6519, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 1.536, | |
| "grad_norm": 0.9646772146224976, | |
| "learning_rate": 9.616578445642982e-06, | |
| "loss": 0.5082, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.544, | |
| "grad_norm": 1.2303582429885864, | |
| "learning_rate": 9.61091005815451e-06, | |
| "loss": 0.6097, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 1.552, | |
| "grad_norm": 0.8123118877410889, | |
| "learning_rate": 9.605201773359485e-06, | |
| "loss": 0.5543, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 0.7796357274055481, | |
| "learning_rate": 9.599453640650585e-06, | |
| "loss": 0.6042, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.568, | |
| "grad_norm": 0.8437601923942566, | |
| "learning_rate": 9.59366570976528e-06, | |
| "loss": 0.529, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 1.576, | |
| "grad_norm": 1.154600739479065, | |
| "learning_rate": 9.587838030785413e-06, | |
| "loss": 0.6697, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 1.584, | |
| "grad_norm": 0.9317930936813354, | |
| "learning_rate": 9.581970654136752e-06, | |
| "loss": 0.5722, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 1.592, | |
| "grad_norm": 1.0552774667739868, | |
| "learning_rate": 9.576063630588563e-06, | |
| "loss": 0.6131, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.0051699876785278, | |
| "learning_rate": 9.570117011253173e-06, | |
| "loss": 0.5859, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.608, | |
| "grad_norm": 2.1757352352142334, | |
| "learning_rate": 9.56413084758552e-06, | |
| "loss": 0.6157, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 1.616, | |
| "grad_norm": 0.9021430015563965, | |
| "learning_rate": 9.55810519138271e-06, | |
| "loss": 0.6015, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 1.624, | |
| "grad_norm": 1.2466237545013428, | |
| "learning_rate": 9.552040094783575e-06, | |
| "loss": 0.6462, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": 0.959256649017334, | |
| "learning_rate": 9.545935610268213e-06, | |
| "loss": 0.5971, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 1.6400000000000001, | |
| "grad_norm": 0.9433544874191284, | |
| "learning_rate": 9.53979179065754e-06, | |
| "loss": 0.6192, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.6480000000000001, | |
| "grad_norm": 1.0719369649887085, | |
| "learning_rate": 9.533608689112827e-06, | |
| "loss": 0.5879, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 1.6560000000000001, | |
| "grad_norm": 0.9381573796272278, | |
| "learning_rate": 9.527386359135254e-06, | |
| "loss": 0.6502, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": 1.6748526096343994, | |
| "learning_rate": 9.521124854565425e-06, | |
| "loss": 0.5498, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 1.6720000000000002, | |
| "grad_norm": 0.9933590888977051, | |
| "learning_rate": 9.514824229582922e-06, | |
| "loss": 0.6914, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 0.9754785299301147, | |
| "learning_rate": 9.508484538705823e-06, | |
| "loss": 0.5698, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.688, | |
| "grad_norm": 0.8225585222244263, | |
| "learning_rate": 9.50210583679024e-06, | |
| "loss": 0.6338, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 1.696, | |
| "grad_norm": 1.2640197277069092, | |
| "learning_rate": 9.495688179029838e-06, | |
| "loss": 0.5747, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 1.704, | |
| "grad_norm": 0.9488126635551453, | |
| "learning_rate": 9.48923162095536e-06, | |
| "loss": 0.476, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 1.712, | |
| "grad_norm": 1.0105180740356445, | |
| "learning_rate": 9.482736218434144e-06, | |
| "loss": 0.6627, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 0.8411703705787659, | |
| "learning_rate": 9.476202027669644e-06, | |
| "loss": 0.6187, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.728, | |
| "grad_norm": 0.7924178838729858, | |
| "learning_rate": 9.469629105200937e-06, | |
| "loss": 0.582, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 1.736, | |
| "grad_norm": 0.9977278709411621, | |
| "learning_rate": 9.463017507902245e-06, | |
| "loss": 0.6525, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 1.744, | |
| "grad_norm": 0.9288497567176819, | |
| "learning_rate": 9.45636729298243e-06, | |
| "loss": 0.6054, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 1.752, | |
| "grad_norm": 0.8257185220718384, | |
| "learning_rate": 9.449678517984503e-06, | |
| "loss": 0.5828, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 0.8198689222335815, | |
| "learning_rate": 9.442951240785135e-06, | |
| "loss": 0.5183, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.768, | |
| "grad_norm": 0.9914250373840332, | |
| "learning_rate": 9.436185519594145e-06, | |
| "loss": 0.6419, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 1.776, | |
| "grad_norm": 0.8594644665718079, | |
| "learning_rate": 9.429381412954e-06, | |
| "loss": 0.5427, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 1.784, | |
| "grad_norm": 1.18206787109375, | |
| "learning_rate": 9.422538979739307e-06, | |
| "loss": 0.7432, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 1.792, | |
| "grad_norm": 0.9081814885139465, | |
| "learning_rate": 9.415658279156312e-06, | |
| "loss": 0.6059, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 1.1395457983016968, | |
| "learning_rate": 9.408739370742372e-06, | |
| "loss": 0.6052, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.808, | |
| "grad_norm": 0.8333037495613098, | |
| "learning_rate": 9.401782314365458e-06, | |
| "loss": 0.5763, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.8159999999999998, | |
| "grad_norm": 0.9968838691711426, | |
| "learning_rate": 9.39478717022362e-06, | |
| "loss": 0.6014, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": 0.8430538177490234, | |
| "learning_rate": 9.387753998844482e-06, | |
| "loss": 0.552, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 1.8319999999999999, | |
| "grad_norm": 0.8073670864105225, | |
| "learning_rate": 9.380682861084703e-06, | |
| "loss": 0.5531, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 0.7828116416931152, | |
| "learning_rate": 9.37357381812946e-06, | |
| "loss": 0.565, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.8479999999999999, | |
| "grad_norm": 0.8722400069236755, | |
| "learning_rate": 9.366426931491917e-06, | |
| "loss": 0.6368, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": 0.9940326809883118, | |
| "learning_rate": 9.359242263012693e-06, | |
| "loss": 0.6373, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.8639999999999999, | |
| "grad_norm": 0.9354084730148315, | |
| "learning_rate": 9.352019874859326e-06, | |
| "loss": 0.5976, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 1.8719999999999999, | |
| "grad_norm": 0.9175680875778198, | |
| "learning_rate": 9.344759829525734e-06, | |
| "loss": 0.5601, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 1.1721409559249878, | |
| "learning_rate": 9.33746218983167e-06, | |
| "loss": 0.5834, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.888, | |
| "grad_norm": 0.8633074760437012, | |
| "learning_rate": 9.330127018922195e-06, | |
| "loss": 0.6287, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.896, | |
| "grad_norm": 0.8795122504234314, | |
| "learning_rate": 9.32275438026711e-06, | |
| "loss": 0.5676, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 1.904, | |
| "grad_norm": 1.5167433023452759, | |
| "learning_rate": 9.315344337660422e-06, | |
| "loss": 0.7062, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.912, | |
| "grad_norm": 0.8232209086418152, | |
| "learning_rate": 9.307896955219787e-06, | |
| "loss": 0.6113, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 1.4300435781478882, | |
| "learning_rate": 9.300412297385954e-06, | |
| "loss": 0.6826, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.928, | |
| "grad_norm": 1.0260647535324097, | |
| "learning_rate": 9.29289042892221e-06, | |
| "loss": 0.6348, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 1.936, | |
| "grad_norm": 0.788696825504303, | |
| "learning_rate": 9.285331414913816e-06, | |
| "loss": 0.6163, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.944, | |
| "grad_norm": 0.8610551357269287, | |
| "learning_rate": 9.277735320767449e-06, | |
| "loss": 0.53, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 1.952, | |
| "grad_norm": 1.0100200176239014, | |
| "learning_rate": 9.270102212210632e-06, | |
| "loss": 0.5481, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 0.886954665184021, | |
| "learning_rate": 9.262432155291167e-06, | |
| "loss": 0.4982, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.968, | |
| "grad_norm": 1.0647324323654175, | |
| "learning_rate": 9.254725216376562e-06, | |
| "loss": 0.5961, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.976, | |
| "grad_norm": 0.8342650532722473, | |
| "learning_rate": 9.246981462153456e-06, | |
| "loss": 0.5485, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.984, | |
| "grad_norm": 0.8798630833625793, | |
| "learning_rate": 9.239200959627048e-06, | |
| "loss": 0.6088, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.992, | |
| "grad_norm": 0.7577885389328003, | |
| "learning_rate": 9.231383776120512e-06, | |
| "loss": 0.4636, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.8578501343727112, | |
| "learning_rate": 9.223529979274411e-06, | |
| "loss": 0.6752, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.008, | |
| "grad_norm": 0.8716771006584167, | |
| "learning_rate": 9.215639637046121e-06, | |
| "loss": 0.4821, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 2.016, | |
| "grad_norm": 0.9138606190681458, | |
| "learning_rate": 9.207712817709237e-06, | |
| "loss": 0.4309, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 2.024, | |
| "grad_norm": 1.0174994468688965, | |
| "learning_rate": 9.19974958985298e-06, | |
| "loss": 0.5563, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 2.032, | |
| "grad_norm": 0.9415088295936584, | |
| "learning_rate": 9.191750022381613e-06, | |
| "loss": 0.4935, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 0.8819617629051208, | |
| "learning_rate": 9.183714184513832e-06, | |
| "loss": 0.4942, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 2.048, | |
| "grad_norm": 0.930507242679596, | |
| "learning_rate": 9.175642145782179e-06, | |
| "loss": 0.4656, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 2.056, | |
| "grad_norm": 0.9761205315589905, | |
| "learning_rate": 9.16753397603243e-06, | |
| "loss": 0.5395, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 2.064, | |
| "grad_norm": 0.7865678071975708, | |
| "learning_rate": 9.159389745423003e-06, | |
| "loss": 0.454, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 2.072, | |
| "grad_norm": 0.9103251099586487, | |
| "learning_rate": 9.151209524424333e-06, | |
| "loss": 0.3648, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 0.8915781378746033, | |
| "learning_rate": 9.142993383818284e-06, | |
| "loss": 0.6086, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.088, | |
| "grad_norm": 0.859764575958252, | |
| "learning_rate": 9.134741394697517e-06, | |
| "loss": 0.4295, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 2.096, | |
| "grad_norm": 0.8280050158500671, | |
| "learning_rate": 9.126453628464889e-06, | |
| "loss": 0.4515, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 2.104, | |
| "grad_norm": 1.0346128940582275, | |
| "learning_rate": 9.118130156832823e-06, | |
| "loss": 0.5544, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 2.112, | |
| "grad_norm": 0.792113184928894, | |
| "learning_rate": 9.109771051822702e-06, | |
| "loss": 0.3673, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 0.8384730219841003, | |
| "learning_rate": 9.10137638576423e-06, | |
| "loss": 0.4647, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 2.128, | |
| "grad_norm": 0.7523196935653687, | |
| "learning_rate": 9.09294623129482e-06, | |
| "loss": 0.4786, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 2.136, | |
| "grad_norm": 0.9915981888771057, | |
| "learning_rate": 9.084480661358954e-06, | |
| "loss": 0.5301, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 2.144, | |
| "grad_norm": 0.980383574962616, | |
| "learning_rate": 9.07597974920756e-06, | |
| "loss": 0.4275, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 2.152, | |
| "grad_norm": 0.8446120619773865, | |
| "learning_rate": 9.067443568397378e-06, | |
| "loss": 0.4136, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 0.8461412787437439, | |
| "learning_rate": 9.058872192790314e-06, | |
| "loss": 0.4638, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.168, | |
| "grad_norm": 0.8193701505661011, | |
| "learning_rate": 9.05026569655281e-06, | |
| "loss": 0.5051, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 2.176, | |
| "grad_norm": 0.9443950653076172, | |
| "learning_rate": 9.041624154155208e-06, | |
| "loss": 0.4844, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 2.184, | |
| "grad_norm": 0.791313648223877, | |
| "learning_rate": 9.032947640371086e-06, | |
| "loss": 0.416, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 2.192, | |
| "grad_norm": 1.0709561109542847, | |
| "learning_rate": 9.02423623027663e-06, | |
| "loss": 0.5036, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 0.8591241836547852, | |
| "learning_rate": 9.01548999924997e-06, | |
| "loss": 0.3976, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 2.208, | |
| "grad_norm": 0.7817285060882568, | |
| "learning_rate": 9.006709022970547e-06, | |
| "loss": 0.3538, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 2.216, | |
| "grad_norm": 0.9098508954048157, | |
| "learning_rate": 8.997893377418432e-06, | |
| "loss": 0.5238, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 2.224, | |
| "grad_norm": 0.9865612387657166, | |
| "learning_rate": 8.98904313887369e-06, | |
| "loss": 0.5565, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 2.232, | |
| "grad_norm": 0.8226122856140137, | |
| "learning_rate": 8.980158383915714e-06, | |
| "loss": 0.452, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 0.7830989956855774, | |
| "learning_rate": 8.971239189422555e-06, | |
| "loss": 0.4309, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.248, | |
| "grad_norm": 1.0743530988693237, | |
| "learning_rate": 8.962285632570266e-06, | |
| "loss": 0.4097, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 2.2560000000000002, | |
| "grad_norm": 1.0063495635986328, | |
| "learning_rate": 8.953297790832231e-06, | |
| "loss": 0.5654, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 2.2640000000000002, | |
| "grad_norm": 1.0220483541488647, | |
| "learning_rate": 8.944275741978495e-06, | |
| "loss": 0.4384, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 2.2720000000000002, | |
| "grad_norm": 0.8661972880363464, | |
| "learning_rate": 8.935219564075087e-06, | |
| "loss": 0.4436, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 2.2800000000000002, | |
| "grad_norm": 1.028968334197998, | |
| "learning_rate": 8.92612933548335e-06, | |
| "loss": 0.4704, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 2.288, | |
| "grad_norm": 0.848927915096283, | |
| "learning_rate": 8.917005134859263e-06, | |
| "loss": 0.5136, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 2.296, | |
| "grad_norm": 0.7315829396247864, | |
| "learning_rate": 8.907847041152757e-06, | |
| "loss": 0.4477, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 2.304, | |
| "grad_norm": 0.8626575469970703, | |
| "learning_rate": 8.89865513360703e-06, | |
| "loss": 0.4097, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 2.312, | |
| "grad_norm": 0.9252862334251404, | |
| "learning_rate": 8.889429491757872e-06, | |
| "loss": 0.5281, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 0.8084948658943176, | |
| "learning_rate": 8.88017019543296e-06, | |
| "loss": 0.4739, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.328, | |
| "grad_norm": 1.0392563343048096, | |
| "learning_rate": 8.870877324751186e-06, | |
| "loss": 0.4135, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 2.336, | |
| "grad_norm": 0.795489490032196, | |
| "learning_rate": 8.861550960121946e-06, | |
| "loss": 0.3648, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 2.344, | |
| "grad_norm": 0.8194026350975037, | |
| "learning_rate": 8.852191182244456e-06, | |
| "loss": 0.4356, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 2.352, | |
| "grad_norm": 0.7702086567878723, | |
| "learning_rate": 8.842798072107055e-06, | |
| "loss": 0.386, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 0.7251138687133789, | |
| "learning_rate": 8.833371710986493e-06, | |
| "loss": 0.3638, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 2.368, | |
| "grad_norm": 1.0485248565673828, | |
| "learning_rate": 8.823912180447237e-06, | |
| "loss": 0.4637, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 2.376, | |
| "grad_norm": 0.890069305896759, | |
| "learning_rate": 8.81441956234076e-06, | |
| "loss": 0.5069, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 2.384, | |
| "grad_norm": 0.8273584842681885, | |
| "learning_rate": 8.804893938804839e-06, | |
| "loss": 0.512, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 2.392, | |
| "grad_norm": 0.8748934268951416, | |
| "learning_rate": 8.795335392262841e-06, | |
| "loss": 0.432, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 0.8324396014213562, | |
| "learning_rate": 8.785744005423003e-06, | |
| "loss": 0.4667, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.408, | |
| "grad_norm": 0.8544197678565979, | |
| "learning_rate": 8.77611986127773e-06, | |
| "loss": 0.4673, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 2.416, | |
| "grad_norm": 1.2245566844940186, | |
| "learning_rate": 8.766463043102864e-06, | |
| "loss": 0.6687, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 2.424, | |
| "grad_norm": 0.8746636509895325, | |
| "learning_rate": 8.756773634456975e-06, | |
| "loss": 0.448, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 2.432, | |
| "grad_norm": 0.9743396639823914, | |
| "learning_rate": 8.747051719180626e-06, | |
| "loss": 0.5006, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 0.8210893273353577, | |
| "learning_rate": 8.737297381395657e-06, | |
| "loss": 0.4388, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 2.448, | |
| "grad_norm": 0.801645040512085, | |
| "learning_rate": 8.727510705504453e-06, | |
| "loss": 0.4047, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 2.456, | |
| "grad_norm": 0.8253946900367737, | |
| "learning_rate": 8.717691776189214e-06, | |
| "loss": 0.4908, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 2.464, | |
| "grad_norm": 0.8477187752723694, | |
| "learning_rate": 8.707840678411223e-06, | |
| "loss": 0.4314, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 2.472, | |
| "grad_norm": 0.7261312007904053, | |
| "learning_rate": 8.69795749741011e-06, | |
| "loss": 0.3766, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 0.9027365446090698, | |
| "learning_rate": 8.688042318703111e-06, | |
| "loss": 0.4756, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.488, | |
| "grad_norm": 0.9057645201683044, | |
| "learning_rate": 8.678095228084343e-06, | |
| "loss": 0.5247, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 2.496, | |
| "grad_norm": 0.8955289125442505, | |
| "learning_rate": 8.66811631162404e-06, | |
| "loss": 0.492, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 2.504, | |
| "grad_norm": 0.7744578719139099, | |
| "learning_rate": 8.65810565566782e-06, | |
| "loss": 0.4576, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 2.512, | |
| "grad_norm": 0.8038774132728577, | |
| "learning_rate": 8.648063346835943e-06, | |
| "loss": 0.4527, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 0.9276174306869507, | |
| "learning_rate": 8.637989472022548e-06, | |
| "loss": 0.4586, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 2.528, | |
| "grad_norm": 0.9310397505760193, | |
| "learning_rate": 8.627884118394913e-06, | |
| "loss": 0.4612, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 2.536, | |
| "grad_norm": 0.8855887055397034, | |
| "learning_rate": 8.617747373392697e-06, | |
| "loss": 0.5321, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 2.544, | |
| "grad_norm": 0.7462752461433411, | |
| "learning_rate": 8.607579324727175e-06, | |
| "loss": 0.4002, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 2.552, | |
| "grad_norm": 1.0667074918746948, | |
| "learning_rate": 8.597380060380493e-06, | |
| "loss": 0.4636, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 0.9851921796798706, | |
| "learning_rate": 8.5871496686049e-06, | |
| "loss": 0.4762, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.568, | |
| "grad_norm": 1.001611590385437, | |
| "learning_rate": 8.576888237921983e-06, | |
| "loss": 0.4729, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 2.576, | |
| "grad_norm": 0.8335314989089966, | |
| "learning_rate": 8.566595857121902e-06, | |
| "loss": 0.4509, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 2.584, | |
| "grad_norm": 0.9434210062026978, | |
| "learning_rate": 8.556272615262623e-06, | |
| "loss": 0.482, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 2.592, | |
| "grad_norm": 0.9304957389831543, | |
| "learning_rate": 8.545918601669147e-06, | |
| "loss": 0.4813, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 0.7728573083877563, | |
| "learning_rate": 8.535533905932739e-06, | |
| "loss": 0.4834, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 2.608, | |
| "grad_norm": 0.9087052345275879, | |
| "learning_rate": 8.525118617910144e-06, | |
| "loss": 0.4356, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 2.616, | |
| "grad_norm": 0.8022294044494629, | |
| "learning_rate": 8.514672827722824e-06, | |
| "loss": 0.4399, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 2.624, | |
| "grad_norm": 0.7249376773834229, | |
| "learning_rate": 8.504196625756166e-06, | |
| "loss": 0.4411, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 2.632, | |
| "grad_norm": 0.6998037695884705, | |
| "learning_rate": 8.493690102658703e-06, | |
| "loss": 0.4215, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 0.7693272829055786, | |
| "learning_rate": 8.483153349341336e-06, | |
| "loss": 0.4296, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.648, | |
| "grad_norm": 0.874349057674408, | |
| "learning_rate": 8.472586456976534e-06, | |
| "loss": 0.4325, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 2.656, | |
| "grad_norm": 0.6986884474754333, | |
| "learning_rate": 8.461989516997565e-06, | |
| "loss": 0.3688, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 2.664, | |
| "grad_norm": 0.8365640044212341, | |
| "learning_rate": 8.45136262109768e-06, | |
| "loss": 0.5117, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 2.672, | |
| "grad_norm": 0.7621400952339172, | |
| "learning_rate": 8.440705861229344e-06, | |
| "loss": 0.4379, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 0.8883290886878967, | |
| "learning_rate": 8.430019329603423e-06, | |
| "loss": 0.4712, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 2.6879999999999997, | |
| "grad_norm": 0.8937128186225891, | |
| "learning_rate": 8.41930311868839e-06, | |
| "loss": 0.4865, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 2.6959999999999997, | |
| "grad_norm": 0.7254724502563477, | |
| "learning_rate": 8.408557321209534e-06, | |
| "loss": 0.3977, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 2.7039999999999997, | |
| "grad_norm": 0.8835223317146301, | |
| "learning_rate": 8.397782030148147e-06, | |
| "loss": 0.5018, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 2.7119999999999997, | |
| "grad_norm": 0.7353959679603577, | |
| "learning_rate": 8.386977338740724e-06, | |
| "loss": 0.4522, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": 0.9096419811248779, | |
| "learning_rate": 8.376143340478153e-06, | |
| "loss": 0.4349, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.7279999999999998, | |
| "grad_norm": 0.8415313959121704, | |
| "learning_rate": 8.365280129104912e-06, | |
| "loss": 0.3908, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 2.7359999999999998, | |
| "grad_norm": 0.7993387579917908, | |
| "learning_rate": 8.354387798618254e-06, | |
| "loss": 0.4132, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 2.7439999999999998, | |
| "grad_norm": 0.7036588788032532, | |
| "learning_rate": 8.34346644326739e-06, | |
| "loss": 0.4435, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 2.752, | |
| "grad_norm": 0.7259690761566162, | |
| "learning_rate": 8.332516157552684e-06, | |
| "loss": 0.408, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 1.0898205041885376, | |
| "learning_rate": 8.321537036224822e-06, | |
| "loss": 0.5556, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 2.768, | |
| "grad_norm": 0.7409061789512634, | |
| "learning_rate": 8.310529174284004e-06, | |
| "loss": 0.435, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 2.776, | |
| "grad_norm": 0.8712742924690247, | |
| "learning_rate": 8.299492666979114e-06, | |
| "loss": 0.4701, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 2.784, | |
| "grad_norm": 0.8046022653579712, | |
| "learning_rate": 8.288427609806899e-06, | |
| "loss": 0.528, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 2.792, | |
| "grad_norm": 0.7399017214775085, | |
| "learning_rate": 8.277334098511147e-06, | |
| "loss": 0.374, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 0.8569490909576416, | |
| "learning_rate": 8.266212229081846e-06, | |
| "loss": 0.4363, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.808, | |
| "grad_norm": 0.7750512957572937, | |
| "learning_rate": 8.255062097754371e-06, | |
| "loss": 0.3865, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 2.816, | |
| "grad_norm": 0.8413577675819397, | |
| "learning_rate": 8.243883801008632e-06, | |
| "loss": 0.4495, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 2.824, | |
| "grad_norm": 0.7431395053863525, | |
| "learning_rate": 8.232677435568252e-06, | |
| "loss": 0.4077, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 2.832, | |
| "grad_norm": 0.9745064377784729, | |
| "learning_rate": 8.221443098399733e-06, | |
| "loss": 0.5273, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 0.9550256729125977, | |
| "learning_rate": 8.210180886711603e-06, | |
| "loss": 0.4009, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 2.848, | |
| "grad_norm": 0.8951454758644104, | |
| "learning_rate": 8.198890897953586e-06, | |
| "loss": 0.5218, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 2.856, | |
| "grad_norm": 0.7028780579566956, | |
| "learning_rate": 8.187573229815757e-06, | |
| "loss": 0.4437, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 2.864, | |
| "grad_norm": 0.7699680328369141, | |
| "learning_rate": 8.176227980227693e-06, | |
| "loss": 0.4422, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 2.872, | |
| "grad_norm": 1.3268731832504272, | |
| "learning_rate": 8.164855247357628e-06, | |
| "loss": 0.4985, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 0.801872968673706, | |
| "learning_rate": 8.153455129611605e-06, | |
| "loss": 0.4028, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.888, | |
| "grad_norm": 0.7494660019874573, | |
| "learning_rate": 8.142027725632622e-06, | |
| "loss": 0.5131, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 2.896, | |
| "grad_norm": 0.9350019693374634, | |
| "learning_rate": 8.130573134299782e-06, | |
| "loss": 0.4778, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 2.904, | |
| "grad_norm": 0.9649576544761658, | |
| "learning_rate": 8.119091454727427e-06, | |
| "loss": 0.5837, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 2.912, | |
| "grad_norm": 0.7435542941093445, | |
| "learning_rate": 8.107582786264299e-06, | |
| "loss": 0.4237, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 0.8181478977203369, | |
| "learning_rate": 8.09604722849266e-06, | |
| "loss": 0.4325, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 2.928, | |
| "grad_norm": 0.9543557167053223, | |
| "learning_rate": 8.084484881227449e-06, | |
| "loss": 0.4458, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 2.936, | |
| "grad_norm": 0.807310938835144, | |
| "learning_rate": 8.072895844515398e-06, | |
| "loss": 0.5369, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 2.944, | |
| "grad_norm": 0.6983502507209778, | |
| "learning_rate": 8.061280218634192e-06, | |
| "loss": 0.4961, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 2.952, | |
| "grad_norm": 0.8616600036621094, | |
| "learning_rate": 8.049638104091575e-06, | |
| "loss": 0.469, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 0.8262357115745544, | |
| "learning_rate": 8.037969601624495e-06, | |
| "loss": 0.4619, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 2.968, | |
| "grad_norm": 0.7818635702133179, | |
| "learning_rate": 8.026274812198235e-06, | |
| "loss": 0.4968, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 2.976, | |
| "grad_norm": 0.884961724281311, | |
| "learning_rate": 8.014553837005527e-06, | |
| "loss": 0.5882, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 2.984, | |
| "grad_norm": 0.7759022116661072, | |
| "learning_rate": 8.002806777465685e-06, | |
| "loss": 0.3439, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 2.992, | |
| "grad_norm": 1.4017070531845093, | |
| "learning_rate": 7.99103373522373e-06, | |
| "loss": 0.4397, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.8649992942810059, | |
| "learning_rate": 7.9792348121495e-06, | |
| "loss": 0.3618, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 3.008, | |
| "grad_norm": 0.8995476365089417, | |
| "learning_rate": 7.967410110336782e-06, | |
| "loss": 0.2888, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 3.016, | |
| "grad_norm": 0.8667290806770325, | |
| "learning_rate": 7.955559732102414e-06, | |
| "loss": 0.318, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 3.024, | |
| "grad_norm": 0.9534152150154114, | |
| "learning_rate": 7.943683779985412e-06, | |
| "loss": 0.3992, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 3.032, | |
| "grad_norm": 2.839970827102661, | |
| "learning_rate": 7.931782356746076e-06, | |
| "loss": 0.2688, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 1.6292753219604492, | |
| "learning_rate": 7.919855565365102e-06, | |
| "loss": 0.3406, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.048, | |
| "grad_norm": 1.801590085029602, | |
| "learning_rate": 7.907903509042696e-06, | |
| "loss": 0.4699, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 3.056, | |
| "grad_norm": 1.0775017738342285, | |
| "learning_rate": 7.895926291197667e-06, | |
| "loss": 0.2951, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 3.064, | |
| "grad_norm": 0.7743735909461975, | |
| "learning_rate": 7.883924015466554e-06, | |
| "loss": 0.3324, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 3.072, | |
| "grad_norm": 0.8310099244117737, | |
| "learning_rate": 7.871896785702707e-06, | |
| "loss": 0.3172, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 0.8860973715782166, | |
| "learning_rate": 7.859844705975405e-06, | |
| "loss": 0.3087, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 3.088, | |
| "grad_norm": 3.5202293395996094, | |
| "learning_rate": 7.847767880568944e-06, | |
| "loss": 0.2741, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 3.096, | |
| "grad_norm": 0.7423680424690247, | |
| "learning_rate": 7.835666413981744e-06, | |
| "loss": 0.253, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 3.104, | |
| "grad_norm": 0.9558984637260437, | |
| "learning_rate": 7.823540410925434e-06, | |
| "loss": 0.3973, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 3.112, | |
| "grad_norm": 0.9960572719573975, | |
| "learning_rate": 7.811389976323963e-06, | |
| "loss": 0.3157, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 0.8729398250579834, | |
| "learning_rate": 7.799215215312667e-06, | |
| "loss": 0.2538, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.128, | |
| "grad_norm": 1.4660489559173584, | |
| "learning_rate": 7.787016233237387e-06, | |
| "loss": 0.3225, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 3.136, | |
| "grad_norm": 0.9435684084892273, | |
| "learning_rate": 7.774793135653537e-06, | |
| "loss": 0.297, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 3.144, | |
| "grad_norm": 0.8412906527519226, | |
| "learning_rate": 7.7625460283252e-06, | |
| "loss": 0.3133, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 3.152, | |
| "grad_norm": 0.9741681218147278, | |
| "learning_rate": 7.750275017224208e-06, | |
| "loss": 0.4216, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 3.16, | |
| "grad_norm": 0.8716290593147278, | |
| "learning_rate": 7.737980208529232e-06, | |
| "loss": 0.322, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 3.168, | |
| "grad_norm": 0.7589024305343628, | |
| "learning_rate": 7.725661708624855e-06, | |
| "loss": 0.3232, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 3.176, | |
| "grad_norm": 0.7157604098320007, | |
| "learning_rate": 7.713319624100657e-06, | |
| "loss": 0.26, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 3.184, | |
| "grad_norm": 0.8383468985557556, | |
| "learning_rate": 7.700954061750295e-06, | |
| "loss": 0.2799, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 3.192, | |
| "grad_norm": 0.868595540523529, | |
| "learning_rate": 7.688565128570564e-06, | |
| "loss": 0.3485, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 0.8890668153762817, | |
| "learning_rate": 7.676152931760496e-06, | |
| "loss": 0.2581, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.208, | |
| "grad_norm": 0.8658837080001831, | |
| "learning_rate": 7.663717578720412e-06, | |
| "loss": 0.3895, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 3.216, | |
| "grad_norm": 0.8102222084999084, | |
| "learning_rate": 7.651259177050996e-06, | |
| "loss": 0.3544, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 3.224, | |
| "grad_norm": 0.8114266395568848, | |
| "learning_rate": 7.638777834552372e-06, | |
| "loss": 0.2496, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 3.232, | |
| "grad_norm": 0.8970929980278015, | |
| "learning_rate": 7.626273659223166e-06, | |
| "loss": 0.278, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 3.24, | |
| "grad_norm": 0.9272769689559937, | |
| "learning_rate": 7.61374675925957e-06, | |
| "loss": 0.3057, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 3.248, | |
| "grad_norm": 0.8910090327262878, | |
| "learning_rate": 7.601197243054411e-06, | |
| "loss": 0.3428, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 3.2560000000000002, | |
| "grad_norm": 0.8541823029518127, | |
| "learning_rate": 7.588625219196208e-06, | |
| "loss": 0.3088, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 3.2640000000000002, | |
| "grad_norm": 1.1296759843826294, | |
| "learning_rate": 7.576030796468233e-06, | |
| "loss": 0.3217, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 3.2720000000000002, | |
| "grad_norm": 0.7654091119766235, | |
| "learning_rate": 7.563414083847573e-06, | |
| "loss": 0.2941, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 3.2800000000000002, | |
| "grad_norm": 0.7795118689537048, | |
| "learning_rate": 7.5507751905041885e-06, | |
| "loss": 0.2943, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 3.288, | |
| "grad_norm": 0.7350049018859863, | |
| "learning_rate": 7.538114225799955e-06, | |
| "loss": 0.2193, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 3.296, | |
| "grad_norm": 0.692876398563385, | |
| "learning_rate": 7.525431299287737e-06, | |
| "loss": 0.2663, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 3.304, | |
| "grad_norm": 0.8777008056640625, | |
| "learning_rate": 7.512726520710429e-06, | |
| "loss": 0.2506, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 3.312, | |
| "grad_norm": 0.8459540605545044, | |
| "learning_rate": 7.500000000000001e-06, | |
| "loss": 0.3189, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 3.32, | |
| "grad_norm": 0.8410331010818481, | |
| "learning_rate": 7.4872518472765594e-06, | |
| "loss": 0.3986, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 3.328, | |
| "grad_norm": 0.8244580030441284, | |
| "learning_rate": 7.474482172847391e-06, | |
| "loss": 0.3109, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 3.336, | |
| "grad_norm": 0.852331817150116, | |
| "learning_rate": 7.461691087205993e-06, | |
| "loss": 0.2063, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 3.344, | |
| "grad_norm": 0.95198655128479, | |
| "learning_rate": 7.4488787010311425e-06, | |
| "loss": 0.2647, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 3.352, | |
| "grad_norm": 1.0169159173965454, | |
| "learning_rate": 7.436045125185923e-06, | |
| "loss": 0.3935, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 0.8539062142372131, | |
| "learning_rate": 7.423190470716761e-06, | |
| "loss": 0.2898, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 3.368, | |
| "grad_norm": 0.7559723854064941, | |
| "learning_rate": 7.4103148488524824e-06, | |
| "loss": 0.2889, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 3.376, | |
| "grad_norm": 0.7408546805381775, | |
| "learning_rate": 7.3974183710033334e-06, | |
| "loss": 0.2623, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 3.384, | |
| "grad_norm": 0.7821682095527649, | |
| "learning_rate": 7.384501148760024e-06, | |
| "loss": 0.3472, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 3.392, | |
| "grad_norm": 0.6960223913192749, | |
| "learning_rate": 7.371563293892761e-06, | |
| "loss": 0.3226, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 0.9443512558937073, | |
| "learning_rate": 7.3586049183502875e-06, | |
| "loss": 0.3578, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 3.408, | |
| "grad_norm": 0.8014100193977356, | |
| "learning_rate": 7.345626134258897e-06, | |
| "loss": 0.3144, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 3.416, | |
| "grad_norm": 0.9299169182777405, | |
| "learning_rate": 7.3326270539214826e-06, | |
| "loss": 0.3547, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 3.424, | |
| "grad_norm": 1.090511441230774, | |
| "learning_rate": 7.319607789816555e-06, | |
| "loss": 0.3809, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 3.432, | |
| "grad_norm": 0.8724474310874939, | |
| "learning_rate": 7.306568454597269e-06, | |
| "loss": 0.3196, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 0.7956423759460449, | |
| "learning_rate": 7.293509161090453e-06, | |
| "loss": 0.2524, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 3.448, | |
| "grad_norm": 0.7545970678329468, | |
| "learning_rate": 7.28043002229563e-06, | |
| "loss": 0.262, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 3.456, | |
| "grad_norm": 0.7623492479324341, | |
| "learning_rate": 7.2673311513840395e-06, | |
| "loss": 0.3246, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 3.464, | |
| "grad_norm": 0.8377344608306885, | |
| "learning_rate": 7.2542126616976596e-06, | |
| "loss": 0.2918, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 3.472, | |
| "grad_norm": 1.0835148096084595, | |
| "learning_rate": 7.241074666748228e-06, | |
| "loss": 0.3353, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 0.7926676273345947, | |
| "learning_rate": 7.227917280216254e-06, | |
| "loss": 0.2968, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 3.488, | |
| "grad_norm": 0.8346993327140808, | |
| "learning_rate": 7.214740615950041e-06, | |
| "loss": 0.331, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 3.496, | |
| "grad_norm": 0.7735843658447266, | |
| "learning_rate": 7.201544787964698e-06, | |
| "loss": 0.214, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 3.504, | |
| "grad_norm": 0.7349847555160522, | |
| "learning_rate": 7.188329910441154e-06, | |
| "loss": 0.2201, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 3.512, | |
| "grad_norm": 0.8358851671218872, | |
| "learning_rate": 7.175096097725169e-06, | |
| "loss": 0.3334, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 0.7020286321640015, | |
| "learning_rate": 7.161843464326349e-06, | |
| "loss": 0.2884, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 3.528, | |
| "grad_norm": 0.9368682503700256, | |
| "learning_rate": 7.148572124917148e-06, | |
| "loss": 0.2915, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 3.536, | |
| "grad_norm": 0.7516599297523499, | |
| "learning_rate": 7.135282194331881e-06, | |
| "loss": 0.2376, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 3.544, | |
| "grad_norm": 0.7513339519500732, | |
| "learning_rate": 7.121973787565727e-06, | |
| "loss": 0.2542, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 3.552, | |
| "grad_norm": 0.816264808177948, | |
| "learning_rate": 7.1086470197737405e-06, | |
| "loss": 0.3151, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 0.8877426385879517, | |
| "learning_rate": 7.095302006269842e-06, | |
| "loss": 0.3091, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 3.568, | |
| "grad_norm": 0.7619266510009766, | |
| "learning_rate": 7.0819388625258385e-06, | |
| "loss": 0.3661, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 3.576, | |
| "grad_norm": 0.7122573256492615, | |
| "learning_rate": 7.06855770417041e-06, | |
| "loss": 0.2344, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 3.584, | |
| "grad_norm": 0.6627663969993591, | |
| "learning_rate": 7.05515864698811e-06, | |
| "loss": 0.278, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 3.592, | |
| "grad_norm": 0.632908821105957, | |
| "learning_rate": 7.041741806918372e-06, | |
| "loss": 0.3164, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 0.7511745095252991, | |
| "learning_rate": 7.028307300054499e-06, | |
| "loss": 0.2744, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 3.608, | |
| "grad_norm": 1.4944278001785278, | |
| "learning_rate": 7.014855242642662e-06, | |
| "loss": 0.5377, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 3.616, | |
| "grad_norm": 0.7006387114524841, | |
| "learning_rate": 7.0013857510808934e-06, | |
| "loss": 0.2977, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 3.624, | |
| "grad_norm": 1.0094897747039795, | |
| "learning_rate": 6.987898941918082e-06, | |
| "loss": 0.3555, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 3.632, | |
| "grad_norm": 0.7829846739768982, | |
| "learning_rate": 6.974394931852957e-06, | |
| "loss": 0.2524, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 3.64, | |
| "grad_norm": 0.6909668445587158, | |
| "learning_rate": 6.960873837733089e-06, | |
| "loss": 0.349, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 3.648, | |
| "grad_norm": 1.0253453254699707, | |
| "learning_rate": 6.94733577655387e-06, | |
| "loss": 0.4412, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 3.656, | |
| "grad_norm": 0.8452252149581909, | |
| "learning_rate": 6.933780865457508e-06, | |
| "loss": 0.2807, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 3.664, | |
| "grad_norm": 0.7776572704315186, | |
| "learning_rate": 6.920209221732007e-06, | |
| "loss": 0.2807, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 3.672, | |
| "grad_norm": 0.8717998266220093, | |
| "learning_rate": 6.90662096281016e-06, | |
| "loss": 0.3892, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 0.7512979507446289, | |
| "learning_rate": 6.893016206268518e-06, | |
| "loss": 0.2935, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 3.6879999999999997, | |
| "grad_norm": 0.802406370639801, | |
| "learning_rate": 6.879395069826394e-06, | |
| "loss": 0.2944, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 3.6959999999999997, | |
| "grad_norm": 0.7089901566505432, | |
| "learning_rate": 6.865757671344827e-06, | |
| "loss": 0.3191, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 3.7039999999999997, | |
| "grad_norm": 0.8097975850105286, | |
| "learning_rate": 6.85210412882557e-06, | |
| "loss": 0.3193, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 3.7119999999999997, | |
| "grad_norm": 0.7090730667114258, | |
| "learning_rate": 6.838434560410064e-06, | |
| "loss": 0.3095, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 3.7199999999999998, | |
| "grad_norm": 0.87074875831604, | |
| "learning_rate": 6.824749084378428e-06, | |
| "loss": 0.32, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 3.7279999999999998, | |
| "grad_norm": 0.7040944695472717, | |
| "learning_rate": 6.811047819148413e-06, | |
| "loss": 0.2874, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 3.7359999999999998, | |
| "grad_norm": 0.7431588172912598, | |
| "learning_rate": 6.7973308832744035e-06, | |
| "loss": 0.3269, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 3.7439999999999998, | |
| "grad_norm": 0.7199814915657043, | |
| "learning_rate": 6.783598395446371e-06, | |
| "loss": 0.3722, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 3.752, | |
| "grad_norm": 0.6959889531135559, | |
| "learning_rate": 6.769850474488859e-06, | |
| "loss": 0.2722, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "grad_norm": 0.7497812509536743, | |
| "learning_rate": 6.756087239359948e-06, | |
| "loss": 0.2221, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 3.768, | |
| "grad_norm": 0.7543243169784546, | |
| "learning_rate": 6.742308809150232e-06, | |
| "loss": 0.2754, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 3.776, | |
| "grad_norm": 0.8410171270370483, | |
| "learning_rate": 6.728515303081782e-06, | |
| "loss": 0.2823, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 3.784, | |
| "grad_norm": 0.8377023935317993, | |
| "learning_rate": 6.714706840507122e-06, | |
| "loss": 0.3318, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 3.792, | |
| "grad_norm": 0.8734577298164368, | |
| "learning_rate": 6.700883540908185e-06, | |
| "loss": 0.3444, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 0.838463306427002, | |
| "learning_rate": 6.687045523895292e-06, | |
| "loss": 0.3606, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 3.808, | |
| "grad_norm": 0.9711816906929016, | |
| "learning_rate": 6.673192909206109e-06, | |
| "loss": 0.4177, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 3.816, | |
| "grad_norm": 0.7916324138641357, | |
| "learning_rate": 6.6593258167046115e-06, | |
| "loss": 0.2866, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 3.824, | |
| "grad_norm": 0.9043160676956177, | |
| "learning_rate": 6.64544436638005e-06, | |
| "loss": 0.3816, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 3.832, | |
| "grad_norm": 0.8380642533302307, | |
| "learning_rate": 6.63154867834591e-06, | |
| "loss": 0.2881, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 0.8009607791900635, | |
| "learning_rate": 6.617638872838874e-06, | |
| "loss": 0.2978, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 3.848, | |
| "grad_norm": 0.8324043154716492, | |
| "learning_rate": 6.603715070217779e-06, | |
| "loss": 0.3916, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 3.856, | |
| "grad_norm": 0.9089515805244446, | |
| "learning_rate": 6.589777390962575e-06, | |
| "loss": 0.3845, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 3.864, | |
| "grad_norm": 0.8030752539634705, | |
| "learning_rate": 6.5758259556732896e-06, | |
| "loss": 0.3043, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 3.872, | |
| "grad_norm": 0.8114820718765259, | |
| "learning_rate": 6.561860885068972e-06, | |
| "loss": 0.2985, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 0.7960605025291443, | |
| "learning_rate": 6.547882299986658e-06, | |
| "loss": 0.2702, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 3.888, | |
| "grad_norm": 0.8797109127044678, | |
| "learning_rate": 6.53389032138032e-06, | |
| "loss": 0.3504, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 3.896, | |
| "grad_norm": 0.7048487067222595, | |
| "learning_rate": 6.519885070319827e-06, | |
| "loss": 0.2763, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 3.904, | |
| "grad_norm": 0.7555437684059143, | |
| "learning_rate": 6.505866667989884e-06, | |
| "loss": 0.3933, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 3.912, | |
| "grad_norm": 0.6869528293609619, | |
| "learning_rate": 6.491835235688999e-06, | |
| "loss": 0.2569, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 0.8202469348907471, | |
| "learning_rate": 6.477790894828422e-06, | |
| "loss": 0.2525, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 3.928, | |
| "grad_norm": 0.7586043477058411, | |
| "learning_rate": 6.463733766931096e-06, | |
| "loss": 0.3043, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 3.936, | |
| "grad_norm": 0.9517699480056763, | |
| "learning_rate": 6.449663973630613e-06, | |
| "loss": 0.3379, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 3.944, | |
| "grad_norm": 0.723063051700592, | |
| "learning_rate": 6.435581636670154e-06, | |
| "loss": 0.3467, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 3.952, | |
| "grad_norm": 0.8022458553314209, | |
| "learning_rate": 6.421486877901436e-06, | |
| "loss": 0.2657, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 3.96, | |
| "grad_norm": 1.3096660375595093, | |
| "learning_rate": 6.407379819283661e-06, | |
| "loss": 0.4588, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 3.968, | |
| "grad_norm": 0.871299147605896, | |
| "learning_rate": 6.393260582882462e-06, | |
| "loss": 0.3651, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 3.976, | |
| "grad_norm": 0.7607028484344482, | |
| "learning_rate": 6.379129290868837e-06, | |
| "loss": 0.3678, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 3.984, | |
| "grad_norm": 0.7246721386909485, | |
| "learning_rate": 6.364986065518106e-06, | |
| "loss": 0.3548, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 3.992, | |
| "grad_norm": 0.8004239797592163, | |
| "learning_rate": 6.350831029208844e-06, | |
| "loss": 0.3142, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.7275088429450989, | |
| "learning_rate": 6.336664304421818e-06, | |
| "loss": 0.2845, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 4.008, | |
| "grad_norm": 1.2236182689666748, | |
| "learning_rate": 6.322486013738942e-06, | |
| "loss": 0.2159, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 4.016, | |
| "grad_norm": 1.1026666164398193, | |
| "learning_rate": 6.308296279842204e-06, | |
| "loss": 0.2046, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 4.024, | |
| "grad_norm": 0.8165322542190552, | |
| "learning_rate": 6.294095225512604e-06, | |
| "loss": 0.1917, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 4.032, | |
| "grad_norm": 0.8382667899131775, | |
| "learning_rate": 6.279882973629101e-06, | |
| "loss": 0.1884, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 4.04, | |
| "grad_norm": 1.0882318019866943, | |
| "learning_rate": 6.265659647167542e-06, | |
| "loss": 0.1639, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 4.048, | |
| "grad_norm": 1.1926336288452148, | |
| "learning_rate": 6.2514253691996e-06, | |
| "loss": 0.2134, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 4.056, | |
| "grad_norm": 1.2375679016113281, | |
| "learning_rate": 6.237180262891709e-06, | |
| "loss": 0.1433, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 4.064, | |
| "grad_norm": 1.3838156461715698, | |
| "learning_rate": 6.222924451504001e-06, | |
| "loss": 0.1706, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 4.072, | |
| "grad_norm": 1.3754796981811523, | |
| "learning_rate": 6.208658058389232e-06, | |
| "loss": 0.1796, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "grad_norm": 0.8128661513328552, | |
| "learning_rate": 6.194381206991723e-06, | |
| "loss": 0.2034, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 4.088, | |
| "grad_norm": 0.8433014154434204, | |
| "learning_rate": 6.180094020846291e-06, | |
| "loss": 0.1747, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 4.096, | |
| "grad_norm": 0.9003710150718689, | |
| "learning_rate": 6.165796623577171e-06, | |
| "loss": 0.2375, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 4.104, | |
| "grad_norm": 0.8863418102264404, | |
| "learning_rate": 6.15148913889696e-06, | |
| "loss": 0.1481, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 4.112, | |
| "grad_norm": 0.8291362524032593, | |
| "learning_rate": 6.1371716906055336e-06, | |
| "loss": 0.1336, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 4.12, | |
| "grad_norm": 0.7721700072288513, | |
| "learning_rate": 6.122844402588982e-06, | |
| "loss": 0.1624, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 4.128, | |
| "grad_norm": 0.9002088308334351, | |
| "learning_rate": 6.10850739881854e-06, | |
| "loss": 0.1546, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 4.136, | |
| "grad_norm": 0.8927168846130371, | |
| "learning_rate": 6.094160803349508e-06, | |
| "loss": 0.1147, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 4.144, | |
| "grad_norm": 0.6976279616355896, | |
| "learning_rate": 6.079804740320181e-06, | |
| "loss": 0.1584, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 4.152, | |
| "grad_norm": 0.8408224582672119, | |
| "learning_rate": 6.065439333950776e-06, | |
| "loss": 0.1629, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 4.16, | |
| "grad_norm": 0.6547120213508606, | |
| "learning_rate": 6.051064708542357e-06, | |
| "loss": 0.1409, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 4.168, | |
| "grad_norm": 0.8906222581863403, | |
| "learning_rate": 6.036680988475756e-06, | |
| "loss": 0.1433, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 4.176, | |
| "grad_norm": 0.8860357403755188, | |
| "learning_rate": 6.022288298210502e-06, | |
| "loss": 0.1817, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 4.184, | |
| "grad_norm": 0.7497817277908325, | |
| "learning_rate": 6.00788676228374e-06, | |
| "loss": 0.1566, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 4.192, | |
| "grad_norm": 0.8113112449645996, | |
| "learning_rate": 5.993476505309154e-06, | |
| "loss": 0.1461, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 4.2, | |
| "grad_norm": 0.8568198084831238, | |
| "learning_rate": 5.979057651975893e-06, | |
| "loss": 0.1837, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 4.208, | |
| "grad_norm": 1.1512551307678223, | |
| "learning_rate": 5.964630327047485e-06, | |
| "loss": 0.196, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 4.216, | |
| "grad_norm": 0.7884339690208435, | |
| "learning_rate": 5.9501946553607615e-06, | |
| "loss": 0.1393, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 4.224, | |
| "grad_norm": 0.7390790581703186, | |
| "learning_rate": 5.935750761824777e-06, | |
| "loss": 0.1381, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 4.232, | |
| "grad_norm": 0.9154812693595886, | |
| "learning_rate": 5.921298771419731e-06, | |
| "loss": 0.1841, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 4.24, | |
| "grad_norm": 0.6464409232139587, | |
| "learning_rate": 5.906838809195879e-06, | |
| "loss": 0.1328, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 4.248, | |
| "grad_norm": 0.8564907312393188, | |
| "learning_rate": 5.8923710002724595e-06, | |
| "loss": 0.1769, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 4.256, | |
| "grad_norm": 0.6951649188995361, | |
| "learning_rate": 5.877895469836604e-06, | |
| "loss": 0.1047, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 4.264, | |
| "grad_norm": 0.682344377040863, | |
| "learning_rate": 5.863412343142258e-06, | |
| "loss": 0.182, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 4.272, | |
| "grad_norm": 0.7463601231575012, | |
| "learning_rate": 5.848921745509094e-06, | |
| "loss": 0.122, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 4.28, | |
| "grad_norm": 0.8343256115913391, | |
| "learning_rate": 5.8344238023214305e-06, | |
| "loss": 0.2197, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 4.288, | |
| "grad_norm": 0.7566097378730774, | |
| "learning_rate": 5.819918639027149e-06, | |
| "loss": 0.1342, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 4.296, | |
| "grad_norm": 0.771939218044281, | |
| "learning_rate": 5.805406381136598e-06, | |
| "loss": 0.2226, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 4.304, | |
| "grad_norm": 0.9192513823509216, | |
| "learning_rate": 5.790887154221521e-06, | |
| "loss": 0.1733, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 4.312, | |
| "grad_norm": 0.877711296081543, | |
| "learning_rate": 5.776361083913959e-06, | |
| "loss": 0.1946, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 4.32, | |
| "grad_norm": 1.1210631132125854, | |
| "learning_rate": 5.7618282959051685e-06, | |
| "loss": 0.2014, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 4.328, | |
| "grad_norm": 0.8137388825416565, | |
| "learning_rate": 5.747288915944533e-06, | |
| "loss": 0.161, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 4.336, | |
| "grad_norm": 0.8118594288825989, | |
| "learning_rate": 5.7327430698384775e-06, | |
| "loss": 0.105, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 4.344, | |
| "grad_norm": 0.7466629147529602, | |
| "learning_rate": 5.718190883449373e-06, | |
| "loss": 0.1797, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 4.352, | |
| "grad_norm": 0.7677578330039978, | |
| "learning_rate": 5.703632482694453e-06, | |
| "loss": 0.2377, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 4.36, | |
| "grad_norm": 0.6646220088005066, | |
| "learning_rate": 5.689067993544726e-06, | |
| "loss": 0.1571, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 4.368, | |
| "grad_norm": 0.8208298087120056, | |
| "learning_rate": 5.674497542023875e-06, | |
| "loss": 0.1324, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 4.376, | |
| "grad_norm": 0.8292826414108276, | |
| "learning_rate": 5.659921254207183e-06, | |
| "loss": 0.1344, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 4.384, | |
| "grad_norm": 0.9105590581893921, | |
| "learning_rate": 5.645339256220427e-06, | |
| "loss": 0.2105, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 4.392, | |
| "grad_norm": 0.6421738266944885, | |
| "learning_rate": 5.630751674238796e-06, | |
| "loss": 0.1674, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 4.4, | |
| "grad_norm": 0.7151204943656921, | |
| "learning_rate": 5.616158634485793e-06, | |
| "loss": 0.1985, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 4.408, | |
| "grad_norm": 0.8210756778717041, | |
| "learning_rate": 5.601560263232153e-06, | |
| "loss": 0.2164, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 4.416, | |
| "grad_norm": 0.84303218126297, | |
| "learning_rate": 5.5869566867947344e-06, | |
| "loss": 0.1591, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 4.424, | |
| "grad_norm": 0.8301178812980652, | |
| "learning_rate": 5.572348031535442e-06, | |
| "loss": 0.2021, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 4.432, | |
| "grad_norm": 1.613991618156433, | |
| "learning_rate": 5.557734423860122e-06, | |
| "loss": 0.1589, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 4.44, | |
| "grad_norm": 1.2904980182647705, | |
| "learning_rate": 5.543115990217478e-06, | |
| "loss": 0.2438, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 4.448, | |
| "grad_norm": 0.7553536295890808, | |
| "learning_rate": 5.528492857097966e-06, | |
| "loss": 0.1959, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 4.456, | |
| "grad_norm": 0.641670286655426, | |
| "learning_rate": 5.513865151032709e-06, | |
| "loss": 0.1752, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 4.464, | |
| "grad_norm": 0.9965618848800659, | |
| "learning_rate": 5.499232998592399e-06, | |
| "loss": 0.1465, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 4.4719999999999995, | |
| "grad_norm": 0.8387920260429382, | |
| "learning_rate": 5.484596526386198e-06, | |
| "loss": 0.1797, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 4.48, | |
| "grad_norm": 0.9162147641181946, | |
| "learning_rate": 5.469955861060653e-06, | |
| "loss": 0.171, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 4.4879999999999995, | |
| "grad_norm": 0.6971352100372314, | |
| "learning_rate": 5.455311129298586e-06, | |
| "loss": 0.1301, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 4.496, | |
| "grad_norm": 0.8540926575660706, | |
| "learning_rate": 5.44066245781801e-06, | |
| "loss": 0.2179, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 4.504, | |
| "grad_norm": 0.7464815378189087, | |
| "learning_rate": 5.426009973371026e-06, | |
| "loss": 0.2847, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 4.5120000000000005, | |
| "grad_norm": 0.8111969828605652, | |
| "learning_rate": 5.4113538027427245e-06, | |
| "loss": 0.1934, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 4.52, | |
| "grad_norm": 0.8497796058654785, | |
| "learning_rate": 5.396694072750099e-06, | |
| "loss": 0.1585, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 4.5280000000000005, | |
| "grad_norm": 0.8559319376945496, | |
| "learning_rate": 5.382030910240936e-06, | |
| "loss": 0.1683, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 4.536, | |
| "grad_norm": 0.805315375328064, | |
| "learning_rate": 5.367364442092724e-06, | |
| "loss": 0.1577, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 4.5440000000000005, | |
| "grad_norm": 0.7732157111167908, | |
| "learning_rate": 5.352694795211555e-06, | |
| "loss": 0.1681, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 4.552, | |
| "grad_norm": 0.742019772529602, | |
| "learning_rate": 5.338022096531028e-06, | |
| "loss": 0.1332, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 4.5600000000000005, | |
| "grad_norm": 0.8582367897033691, | |
| "learning_rate": 5.3233464730111426e-06, | |
| "loss": 0.1247, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 4.568, | |
| "grad_norm": 0.730681836605072, | |
| "learning_rate": 5.308668051637213e-06, | |
| "loss": 0.1475, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 4.576, | |
| "grad_norm": 0.8676419854164124, | |
| "learning_rate": 5.29398695941876e-06, | |
| "loss": 0.1614, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 4.584, | |
| "grad_norm": 0.6681349873542786, | |
| "learning_rate": 5.279303323388413e-06, | |
| "loss": 0.1392, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 4.592, | |
| "grad_norm": 0.826603889465332, | |
| "learning_rate": 5.2646172706008154e-06, | |
| "loss": 0.209, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 4.6, | |
| "grad_norm": 0.5670231580734253, | |
| "learning_rate": 5.249928928131523e-06, | |
| "loss": 0.0939, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 4.608, | |
| "grad_norm": 0.7469101548194885, | |
| "learning_rate": 5.235238423075899e-06, | |
| "loss": 0.0982, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 4.616, | |
| "grad_norm": 0.782953679561615, | |
| "learning_rate": 5.220545882548024e-06, | |
| "loss": 0.1826, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 4.624, | |
| "grad_norm": 0.8022812008857727, | |
| "learning_rate": 5.20585143367959e-06, | |
| "loss": 0.1971, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 4.632, | |
| "grad_norm": 0.7650404572486877, | |
| "learning_rate": 5.191155203618796e-06, | |
| "loss": 0.1458, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 4.64, | |
| "grad_norm": 0.8026061058044434, | |
| "learning_rate": 5.176457319529264e-06, | |
| "loss": 0.1165, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 4.648, | |
| "grad_norm": 0.8590072989463806, | |
| "learning_rate": 5.161757908588917e-06, | |
| "loss": 0.1812, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 4.656, | |
| "grad_norm": 0.6927671432495117, | |
| "learning_rate": 5.147057097988898e-06, | |
| "loss": 0.1598, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 4.664, | |
| "grad_norm": 0.793268084526062, | |
| "learning_rate": 5.132355014932455e-06, | |
| "loss": 0.1656, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 4.672, | |
| "grad_norm": 0.7395486831665039, | |
| "learning_rate": 5.1176517866338495e-06, | |
| "loss": 0.1475, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 4.68, | |
| "grad_norm": 0.7657336592674255, | |
| "learning_rate": 5.102947540317254e-06, | |
| "loss": 0.1537, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 4.688, | |
| "grad_norm": 0.7089317440986633, | |
| "learning_rate": 5.088242403215644e-06, | |
| "loss": 0.1515, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 4.696, | |
| "grad_norm": 0.9448972344398499, | |
| "learning_rate": 5.073536502569708e-06, | |
| "loss": 0.142, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 4.704, | |
| "grad_norm": 0.8224512338638306, | |
| "learning_rate": 5.058829965626742e-06, | |
| "loss": 0.1793, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 4.712, | |
| "grad_norm": 0.8339155316352844, | |
| "learning_rate": 5.0441229196395416e-06, | |
| "loss": 0.1887, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 4.72, | |
| "grad_norm": 0.7287314534187317, | |
| "learning_rate": 5.029415491865311e-06, | |
| "loss": 0.1872, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 4.728, | |
| "grad_norm": 0.7033903002738953, | |
| "learning_rate": 5.014707809564562e-06, | |
| "loss": 0.1936, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 4.736, | |
| "grad_norm": 0.7430479526519775, | |
| "learning_rate": 5e-06, | |
| "loss": 0.2697, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 4.744, | |
| "grad_norm": 0.8038540482521057, | |
| "learning_rate": 4.98529219043544e-06, | |
| "loss": 0.1888, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 4.752, | |
| "grad_norm": 0.7803244590759277, | |
| "learning_rate": 4.97058450813469e-06, | |
| "loss": 0.1786, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 4.76, | |
| "grad_norm": 0.746216356754303, | |
| "learning_rate": 4.955877080360462e-06, | |
| "loss": 0.1814, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 4.768, | |
| "grad_norm": 0.7220593690872192, | |
| "learning_rate": 4.94117003437326e-06, | |
| "loss": 0.0849, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 4.776, | |
| "grad_norm": 0.814316987991333, | |
| "learning_rate": 4.926463497430293e-06, | |
| "loss": 0.1596, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 4.784, | |
| "grad_norm": 0.8442027568817139, | |
| "learning_rate": 4.911757596784358e-06, | |
| "loss": 0.1341, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 4.792, | |
| "grad_norm": 0.9273790121078491, | |
| "learning_rate": 4.897052459682749e-06, | |
| "loss": 0.1949, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 0.7030894160270691, | |
| "learning_rate": 4.882348213366152e-06, | |
| "loss": 0.184, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 4.808, | |
| "grad_norm": 0.7360998392105103, | |
| "learning_rate": 4.867644985067548e-06, | |
| "loss": 0.1367, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 4.816, | |
| "grad_norm": 0.9348264336585999, | |
| "learning_rate": 4.8529429020111035e-06, | |
| "loss": 0.3853, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 4.824, | |
| "grad_norm": 1.1637816429138184, | |
| "learning_rate": 4.838242091411085e-06, | |
| "loss": 0.1667, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 4.832, | |
| "grad_norm": 0.786730945110321, | |
| "learning_rate": 4.823542680470738e-06, | |
| "loss": 0.1395, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 4.84, | |
| "grad_norm": 0.7474973797798157, | |
| "learning_rate": 4.808844796381205e-06, | |
| "loss": 0.1013, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 4.848, | |
| "grad_norm": 0.7067230343818665, | |
| "learning_rate": 4.794148566320412e-06, | |
| "loss": 0.1399, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 4.856, | |
| "grad_norm": 0.7749925851821899, | |
| "learning_rate": 4.779454117451978e-06, | |
| "loss": 0.1227, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 4.864, | |
| "grad_norm": 0.7210198044776917, | |
| "learning_rate": 4.7647615769241e-06, | |
| "loss": 0.1358, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 4.872, | |
| "grad_norm": 0.6654530763626099, | |
| "learning_rate": 4.750071071868478e-06, | |
| "loss": 0.1565, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 4.88, | |
| "grad_norm": 0.7312431335449219, | |
| "learning_rate": 4.7353827293991845e-06, | |
| "loss": 0.1653, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 4.888, | |
| "grad_norm": 0.8637939095497131, | |
| "learning_rate": 4.720696676611589e-06, | |
| "loss": 0.1731, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 4.896, | |
| "grad_norm": 0.7614461779594421, | |
| "learning_rate": 4.706013040581242e-06, | |
| "loss": 0.1226, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 4.904, | |
| "grad_norm": 0.8063651919364929, | |
| "learning_rate": 4.691331948362789e-06, | |
| "loss": 0.1936, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 4.912, | |
| "grad_norm": 0.807110607624054, | |
| "learning_rate": 4.676653526988858e-06, | |
| "loss": 0.1201, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 4.92, | |
| "grad_norm": 0.7894400358200073, | |
| "learning_rate": 4.661977903468974e-06, | |
| "loss": 0.1306, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 4.928, | |
| "grad_norm": 0.7502027153968811, | |
| "learning_rate": 4.647305204788445e-06, | |
| "loss": 0.1266, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 4.936, | |
| "grad_norm": 0.8045099377632141, | |
| "learning_rate": 4.632635557907277e-06, | |
| "loss": 0.1481, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 4.944, | |
| "grad_norm": 0.7186449766159058, | |
| "learning_rate": 4.617969089759066e-06, | |
| "loss": 0.1482, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 4.952, | |
| "grad_norm": 0.7988418340682983, | |
| "learning_rate": 4.603305927249902e-06, | |
| "loss": 0.1462, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "grad_norm": 0.800707995891571, | |
| "learning_rate": 4.588646197257278e-06, | |
| "loss": 0.1404, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 4.968, | |
| "grad_norm": 0.8178343176841736, | |
| "learning_rate": 4.573990026628976e-06, | |
| "loss": 0.1128, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 4.976, | |
| "grad_norm": 0.8397066593170166, | |
| "learning_rate": 4.559337542181993e-06, | |
| "loss": 0.1585, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 4.984, | |
| "grad_norm": 0.8154752254486084, | |
| "learning_rate": 4.544688870701416e-06, | |
| "loss": 0.1791, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 4.992, | |
| "grad_norm": 0.8033022284507751, | |
| "learning_rate": 4.53004413893935e-06, | |
| "loss": 0.1467, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.7062161564826965, | |
| "learning_rate": 4.5154034736138035e-06, | |
| "loss": 0.0788, | |
| "step": 625 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 1125, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6.697570719603098e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |