| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.9423076923076925, | |
| "eval_steps": 500, | |
| "global_step": 468, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 26.249136076838056, | |
| "learning_rate": 5.000000000000001e-07, | |
| "loss": 1.1102, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 26.110264576346324, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": 1.0997, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 25.94374396489923, | |
| "learning_rate": 1.5e-06, | |
| "loss": 1.1239, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 22.37475129701464, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 1.1059, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 16.25191401266457, | |
| "learning_rate": 2.5e-06, | |
| "loss": 1.0405, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 22.679130338310745, | |
| "learning_rate": 3e-06, | |
| "loss": 1.0213, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 15.916959043580418, | |
| "learning_rate": 3.5e-06, | |
| "loss": 1.0104, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 9.482985220910942, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.9937, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 6.965472899289267, | |
| "learning_rate": 4.5e-06, | |
| "loss": 0.9693, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 12.298531856803667, | |
| "learning_rate": 5e-06, | |
| "loss": 0.962, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 6.569727331935135, | |
| "learning_rate": 4.999941186489917e-06, | |
| "loss": 0.9374, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 5.377638735854951, | |
| "learning_rate": 4.999764748726891e-06, | |
| "loss": 0.9315, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.601213320961148, | |
| "learning_rate": 4.999470695012462e-06, | |
| "loss": 0.9317, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 6.018739227425751, | |
| "learning_rate": 4.999059039182093e-06, | |
| "loss": 0.9246, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 5.719715824372754, | |
| "learning_rate": 4.998529800604525e-06, | |
| "loss": 0.9136, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 3.9341753543248505, | |
| "learning_rate": 4.99788300418086e-06, | |
| "loss": 0.9221, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.060460637107745, | |
| "learning_rate": 4.997118680343392e-06, | |
| "loss": 0.9021, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 3.576152525840241, | |
| "learning_rate": 4.996236865054177e-06, | |
| "loss": 0.8804, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 3.2384463037464157, | |
| "learning_rate": 4.995237599803336e-06, | |
| "loss": 0.8864, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 2.961735655562835, | |
| "learning_rate": 4.994120931607106e-06, | |
| "loss": 0.884, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 3.715236432103334, | |
| "learning_rate": 4.992886913005628e-06, | |
| "loss": 0.8803, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 2.526746110251263, | |
| "learning_rate": 4.991535602060475e-06, | |
| "loss": 0.8856, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 2.663980149991846, | |
| "learning_rate": 4.9900670623519185e-06, | |
| "loss": 0.8613, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 2.4521755603079787, | |
| "learning_rate": 4.988481362975939e-06, | |
| "loss": 0.8692, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 2.414349491574281, | |
| "learning_rate": 4.986778578540973e-06, | |
| "loss": 0.8602, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 2.8508832900360477, | |
| "learning_rate": 4.984958789164404e-06, | |
| "loss": 0.8526, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 3.196516977267481, | |
| "learning_rate": 4.983022080468794e-06, | |
| "loss": 0.8416, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 3.6095238921057553, | |
| "learning_rate": 4.980968543577849e-06, | |
| "loss": 0.8477, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 2.2205063108737506, | |
| "learning_rate": 4.978798275112142e-06, | |
| "loss": 0.8502, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 4.453254154090838, | |
| "learning_rate": 4.976511377184557e-06, | |
| "loss": 0.8468, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 2.9249988861975766, | |
| "learning_rate": 4.97410795739549e-06, | |
| "loss": 0.8391, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 4.462696000038617, | |
| "learning_rate": 4.971588128827783e-06, | |
| "loss": 0.8436, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 3.587837288732032, | |
| "learning_rate": 4.968952010041408e-06, | |
| "loss": 0.8564, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 4.2433648029783635, | |
| "learning_rate": 4.966199725067883e-06, | |
| "loss": 0.8501, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 3.55480062205993, | |
| "learning_rate": 4.96333140340444e-06, | |
| "loss": 0.8474, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 3.430958497804354, | |
| "learning_rate": 4.960347180007932e-06, | |
| "loss": 0.8343, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 3.0559248054083428, | |
| "learning_rate": 4.957247195288479e-06, | |
| "loss": 0.8358, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 3.2032433802521147, | |
| "learning_rate": 4.9540315951028695e-06, | |
| "loss": 0.8538, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 2.757473867230571, | |
| "learning_rate": 4.9507005307476894e-06, | |
| "loss": 0.8488, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 3.045305839090202, | |
| "learning_rate": 4.947254158952209e-06, | |
| "loss": 0.8463, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 2.9629285351308554, | |
| "learning_rate": 4.943692641871005e-06, | |
| "loss": 0.828, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 2.7499225905634037, | |
| "learning_rate": 4.940016147076337e-06, | |
| "loss": 0.835, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 2.606092501631258, | |
| "learning_rate": 4.9362248475502515e-06, | |
| "loss": 0.8269, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 2.674180949875197, | |
| "learning_rate": 4.932318921676458e-06, | |
| "loss": 0.8417, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 2.7285602808097336, | |
| "learning_rate": 4.928298553231924e-06, | |
| "loss": 0.8142, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 2.7015909165553387, | |
| "learning_rate": 4.924163931378233e-06, | |
| "loss": 0.8323, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 2.5415943230444498, | |
| "learning_rate": 4.919915250652686e-06, | |
| "loss": 0.8244, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 2.776100967798618, | |
| "learning_rate": 4.9155527109591435e-06, | |
| "loss": 0.8516, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 2.711552634387251, | |
| "learning_rate": 4.911076517558623e-06, | |
| "loss": 0.8313, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 2.905341213972799, | |
| "learning_rate": 4.906486881059641e-06, | |
| "loss": 0.827, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 2.6533492618702206, | |
| "learning_rate": 4.901784017408303e-06, | |
| "loss": 0.8298, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 2.3477688431170414, | |
| "learning_rate": 4.896968147878146e-06, | |
| "loss": 0.8014, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 2.8384104093830587, | |
| "learning_rate": 4.892039499059721e-06, | |
| "loss": 0.8116, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 2.432850021289229, | |
| "learning_rate": 4.886998302849938e-06, | |
| "loss": 0.8156, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 2.804790940572451, | |
| "learning_rate": 4.881844796441153e-06, | |
| "loss": 0.8159, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 2.3348703819353926, | |
| "learning_rate": 4.876579222310007e-06, | |
| "loss": 0.8096, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 3.1901608706880134, | |
| "learning_rate": 4.8712018282060165e-06, | |
| "loss": 0.811, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 2.660868627279477, | |
| "learning_rate": 4.86571286713992e-06, | |
| "loss": 0.818, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 2.7869506219362514, | |
| "learning_rate": 4.860112597371772e-06, | |
| "loss": 0.8267, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 2.3781426665180727, | |
| "learning_rate": 4.85440128239879e-06, | |
| "loss": 0.8112, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 3.015345568347538, | |
| "learning_rate": 4.8485791909429575e-06, | |
| "loss": 0.8151, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 2.4423049063633546, | |
| "learning_rate": 4.842646596938383e-06, | |
| "loss": 0.8282, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 2.6311610579830345, | |
| "learning_rate": 4.8366037795184086e-06, | |
| "loss": 0.8224, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 2.2899705695712282, | |
| "learning_rate": 4.830451023002477e-06, | |
| "loss": 0.8249, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 2.734019031576332, | |
| "learning_rate": 4.824188616882754e-06, | |
| "loss": 0.8136, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 2.0962707496285153, | |
| "learning_rate": 4.817816855810507e-06, | |
| "loss": 0.8184, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 2.9665475220491966, | |
| "learning_rate": 4.811336039582244e-06, | |
| "loss": 0.817, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 2.3700279096809824, | |
| "learning_rate": 4.804746473125605e-06, | |
| "loss": 0.81, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 3.027450051648286, | |
| "learning_rate": 4.798048466485018e-06, | |
| "loss": 0.8219, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 2.752180087601986, | |
| "learning_rate": 4.791242334807106e-06, | |
| "loss": 0.8101, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 2.573031098084336, | |
| "learning_rate": 4.784328398325866e-06, | |
| "loss": 0.8099, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 2.7123062400387576, | |
| "learning_rate": 4.7773069823475945e-06, | |
| "loss": 0.8088, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 2.474407154924412, | |
| "learning_rate": 4.770178417235589e-06, | |
| "loss": 0.8232, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 2.4442790557938965, | |
| "learning_rate": 4.762943038394597e-06, | |
| "loss": 0.8051, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 2.1553005222889583, | |
| "learning_rate": 4.755601186255041e-06, | |
| "loss": 0.825, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 2.312089317706644, | |
| "learning_rate": 4.7481532062569945e-06, | |
| "loss": 0.8168, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 2.2731409969922693, | |
| "learning_rate": 4.7405994488339375e-06, | |
| "loss": 0.8095, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 2.444830999943097, | |
| "learning_rate": 4.732940269396259e-06, | |
| "loss": 0.8108, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 2.315315922207351, | |
| "learning_rate": 4.725176028314541e-06, | |
| "loss": 0.8072, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 2.231139324893734, | |
| "learning_rate": 4.7173070909026015e-06, | |
| "loss": 0.8093, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 2.3706185711850956, | |
| "learning_rate": 4.7093338274003035e-06, | |
| "loss": 0.8011, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 2.2002695782644905, | |
| "learning_rate": 4.701256612956137e-06, | |
| "loss": 0.8136, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 2.704162588410486, | |
| "learning_rate": 4.693075827609569e-06, | |
| "loss": 0.8161, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 1.9269420606373808, | |
| "learning_rate": 4.684791856273161e-06, | |
| "loss": 0.8023, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 2.4468707403860037, | |
| "learning_rate": 4.676405088714458e-06, | |
| "loss": 0.8019, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 2.07255184003701, | |
| "learning_rate": 4.667915919537651e-06, | |
| "loss": 0.8155, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 2.2036583211359746, | |
| "learning_rate": 4.6593247481650105e-06, | |
| "loss": 0.8175, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 1.8140377607105893, | |
| "learning_rate": 4.65063197881809e-06, | |
| "loss": 0.8047, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 2.1515853560068243, | |
| "learning_rate": 4.641838020498713e-06, | |
| "loss": 0.8185, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 2.422760543528869, | |
| "learning_rate": 4.632943286969724e-06, | |
| "loss": 0.8053, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 2.0829660459092776, | |
| "learning_rate": 4.6239481967355226e-06, | |
| "loss": 0.8016, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 2.1797861749114857, | |
| "learning_rate": 4.614853173022374e-06, | |
| "loss": 0.8068, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 2.6076944835409135, | |
| "learning_rate": 4.605658643758492e-06, | |
| "loss": 0.8121, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.8970468662695665, | |
| "learning_rate": 4.59636504155391e-06, | |
| "loss": 0.8146, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 2.195985971319255, | |
| "learning_rate": 4.586972803680119e-06, | |
| "loss": 0.7956, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 2.7905344729119324, | |
| "learning_rate": 4.577482372049503e-06, | |
| "loss": 0.7953, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 1.940825819357636, | |
| "learning_rate": 4.567894193194538e-06, | |
| "loss": 0.8047, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 1.7851389603760988, | |
| "learning_rate": 4.558208718246787e-06, | |
| "loss": 0.8105, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 2.4544053049998884, | |
| "learning_rate": 4.548426402915674e-06, | |
| "loss": 0.8007, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 2.1082597379545165, | |
| "learning_rate": 4.538547707467038e-06, | |
| "loss": 0.805, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 1.9122304736142002, | |
| "learning_rate": 4.528573096701484e-06, | |
| "loss": 0.8067, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 1.990095091929317, | |
| "learning_rate": 4.5185030399325085e-06, | |
| "loss": 0.8025, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 2.029120797101969, | |
| "learning_rate": 4.508338010964419e-06, | |
| "loss": 0.8051, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 1.9232157433946324, | |
| "learning_rate": 4.498078488070044e-06, | |
| "loss": 0.8078, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 2.026639315850326, | |
| "learning_rate": 4.4877249539682235e-06, | |
| "loss": 0.7957, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 2.321397841878117, | |
| "learning_rate": 4.477277895801105e-06, | |
| "loss": 0.8039, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 2.1058946605532656, | |
| "learning_rate": 4.466737805111218e-06, | |
| "loss": 0.7921, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 2.9367272096486814, | |
| "learning_rate": 4.456105177818345e-06, | |
| "loss": 0.7922, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 2.5699051489574605, | |
| "learning_rate": 4.445380514196192e-06, | |
| "loss": 0.7984, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 2.0519065594178003, | |
| "learning_rate": 4.434564318848851e-06, | |
| "loss": 0.784, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 2.809445653695483, | |
| "learning_rate": 4.423657100687051e-06, | |
| "loss": 0.7835, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 2.170793808104346, | |
| "learning_rate": 4.41265937290422e-06, | |
| "loss": 0.8039, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 2.612499453872831, | |
| "learning_rate": 4.401571652952338e-06, | |
| "loss": 0.8099, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 2.1820432279807718, | |
| "learning_rate": 4.390394462517589e-06, | |
| "loss": 0.7929, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 2.335653384387635, | |
| "learning_rate": 4.379128327495813e-06, | |
| "loss": 0.7999, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 1.9923005143051236, | |
| "learning_rate": 4.367773777967769e-06, | |
| "loss": 0.8123, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 2.1550829328561787, | |
| "learning_rate": 4.3563313481741855e-06, | |
| "loss": 0.7905, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 1.8608335678809116, | |
| "learning_rate": 4.344801576490631e-06, | |
| "loss": 0.8051, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 2.1245561127414923, | |
| "learning_rate": 4.3331850054021806e-06, | |
| "loss": 0.7979, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 3.0733225632670127, | |
| "learning_rate": 4.321482181477891e-06, | |
| "loss": 0.7712, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 3.2998040554493366, | |
| "learning_rate": 4.309693655345084e-06, | |
| "loss": 0.7523, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 3.0325218749275624, | |
| "learning_rate": 4.29781998166344e-06, | |
| "loss": 0.7591, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 3.696913587445061, | |
| "learning_rate": 4.2858617190989e-06, | |
| "loss": 0.7447, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 2.852432566856283, | |
| "learning_rate": 4.273819430297382e-06, | |
| "loss": 0.7557, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 4.684715882065128, | |
| "learning_rate": 4.261693681858306e-06, | |
| "loss": 0.7596, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 3.5092399399988343, | |
| "learning_rate": 4.2494850443079305e-06, | |
| "loss": 0.7467, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 3.851985488385657, | |
| "learning_rate": 4.237194092072521e-06, | |
| "loss": 0.7475, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 3.328297635562109, | |
| "learning_rate": 4.2248214034513114e-06, | |
| "loss": 0.7442, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 3.9102386032965373, | |
| "learning_rate": 4.212367560589299e-06, | |
| "loss": 0.7651, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 3.06457511968988, | |
| "learning_rate": 4.199833149449853e-06, | |
| "loss": 0.7418, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 3.2168853165359175, | |
| "learning_rate": 4.187218759787148e-06, | |
| "loss": 0.743, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 2.787806998778304, | |
| "learning_rate": 4.174524985118411e-06, | |
| "loss": 0.7583, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 3.9052093067187443, | |
| "learning_rate": 4.161752422695995e-06, | |
| "loss": 0.7657, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 3.0182785590623173, | |
| "learning_rate": 4.148901673479285e-06, | |
| "loss": 0.7362, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 3.344873463157497, | |
| "learning_rate": 4.135973342106416e-06, | |
| "loss": 0.7558, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 2.859789074330451, | |
| "learning_rate": 4.122968036865827e-06, | |
| "loss": 0.7486, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 3.228835212527119, | |
| "learning_rate": 4.109886369667636e-06, | |
| "loss": 0.7655, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 2.731179147211052, | |
| "learning_rate": 4.096728956014857e-06, | |
| "loss": 0.7528, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 2.652548225590943, | |
| "learning_rate": 4.083496414974434e-06, | |
| "loss": 0.7448, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 2.318475213880663, | |
| "learning_rate": 4.070189369148117e-06, | |
| "loss": 0.7577, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 2.63738412652813, | |
| "learning_rate": 4.056808444643163e-06, | |
| "loss": 0.7475, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 2.557947896210419, | |
| "learning_rate": 4.043354271042884e-06, | |
| "loss": 0.7368, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 2.555902551567185, | |
| "learning_rate": 4.02982748137702e-06, | |
| "loss": 0.7434, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 2.331216526916747, | |
| "learning_rate": 4.0162287120919545e-06, | |
| "loss": 0.7592, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 2.298359286001758, | |
| "learning_rate": 4.002558603020772e-06, | |
| "loss": 0.7407, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 2.2941501459555806, | |
| "learning_rate": 3.988817797353149e-06, | |
| "loss": 0.7534, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 2.195437955425272, | |
| "learning_rate": 3.975006941605099e-06, | |
| "loss": 0.7501, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 2.2354482392933583, | |
| "learning_rate": 3.961126685588541e-06, | |
| "loss": 0.7628, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 2.271851901257072, | |
| "learning_rate": 3.947177682380738e-06, | |
| "loss": 0.7559, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 2.06190688358819, | |
| "learning_rate": 3.933160588293564e-06, | |
| "loss": 0.7381, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 2.1082422406253145, | |
| "learning_rate": 3.9190760628426225e-06, | |
| "loss": 0.761, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 2.257445076263947, | |
| "learning_rate": 3.904924768716216e-06, | |
| "loss": 0.7339, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 2.0575089260504065, | |
| "learning_rate": 3.890707371744169e-06, | |
| "loss": 0.7481, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 2.2929887801738205, | |
| "learning_rate": 3.8764245408664964e-06, | |
| "loss": 0.7378, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 2.0516588845869244, | |
| "learning_rate": 3.862076948101934e-06, | |
| "loss": 0.7565, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 2.265288203536537, | |
| "learning_rate": 3.847665268516314e-06, | |
| "loss": 0.7489, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 2.3008837342067174, | |
| "learning_rate": 3.833190180190808e-06, | |
| "loss": 0.7387, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 2.303244798174658, | |
| "learning_rate": 3.818652364190018e-06, | |
| "loss": 0.7456, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 2.266318666547602, | |
| "learning_rate": 3.8040525045299337e-06, | |
| "loss": 0.7574, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 2.12834726687523, | |
| "learning_rate": 3.7893912881457505e-06, | |
| "loss": 0.7444, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 2.3349048004783968, | |
| "learning_rate": 3.7746694048595458e-06, | |
| "loss": 0.7576, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 2.3189890726535025, | |
| "learning_rate": 3.759887547347825e-06, | |
| "loss": 0.7661, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 2.4796894123137703, | |
| "learning_rate": 3.745046411108928e-06, | |
| "loss": 0.75, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 2.3870417794754446, | |
| "learning_rate": 3.730146694430308e-06, | |
| "loss": 0.7553, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 2.0568742731615752, | |
| "learning_rate": 3.7151890983556747e-06, | |
| "loss": 0.7493, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 2.4078734682098344, | |
| "learning_rate": 3.700174326652011e-06, | |
| "loss": 0.7413, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 2.1293229041387853, | |
| "learning_rate": 3.685103085776457e-06, | |
| "loss": 0.7467, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 2.117068360526989, | |
| "learning_rate": 3.6699760848430753e-06, | |
| "loss": 0.7396, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 1.9883016666638709, | |
| "learning_rate": 3.654794035589484e-06, | |
| "loss": 0.7491, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 2.2563683164117534, | |
| "learning_rate": 3.6395576523433672e-06, | |
| "loss": 0.7518, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 2.3064779080171744, | |
| "learning_rate": 3.6242676519888693e-06, | |
| "loss": 0.7565, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 1.9222922356745977, | |
| "learning_rate": 3.608924753932862e-06, | |
| "loss": 0.7353, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 2.2598801622305547, | |
| "learning_rate": 3.593529680071097e-06, | |
| "loss": 0.7466, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 2.0105134715596082, | |
| "learning_rate": 3.578083154754241e-06, | |
| "loss": 0.7427, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 2.482181765049527, | |
| "learning_rate": 3.5625859047537904e-06, | |
| "loss": 0.7531, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 1.989275137459208, | |
| "learning_rate": 3.547038659227881e-06, | |
| "loss": 0.7458, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 2.375903859138452, | |
| "learning_rate": 3.5314421496869777e-06, | |
| "loss": 0.7497, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 1.818443971455023, | |
| "learning_rate": 3.515797109959458e-06, | |
| "loss": 0.7416, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 2.1798526485495127, | |
| "learning_rate": 3.500104276157083e-06, | |
| "loss": 0.7477, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 1.8954837732640064, | |
| "learning_rate": 3.484364386640365e-06, | |
| "loss": 0.7511, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 2.1614293408819023, | |
| "learning_rate": 3.4685781819838233e-06, | |
| "loss": 0.7517, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 2.1506646824589497, | |
| "learning_rate": 3.452746404941143e-06, | |
| "loss": 0.7355, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 2.169292733116705, | |
| "learning_rate": 3.4368698004102284e-06, | |
| "loss": 0.74, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 2.418141938844683, | |
| "learning_rate": 3.420949115398151e-06, | |
| "loss": 0.7503, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 2.0599479007715886, | |
| "learning_rate": 3.404985098986007e-06, | |
| "loss": 0.7569, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 1.9525184362879087, | |
| "learning_rate": 3.388978502293666e-06, | |
| "loss": 0.7354, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 2.639288273379809, | |
| "learning_rate": 3.372930078444439e-06, | |
| "loss": 0.7567, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 2.0510480212606232, | |
| "learning_rate": 3.3568405825296355e-06, | |
| "loss": 0.7433, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 2.1578817124425735, | |
| "learning_rate": 3.34071077157304e-06, | |
| "loss": 0.7417, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 2.5268212959295893, | |
| "learning_rate": 3.3245414044952927e-06, | |
| "loss": 0.7485, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 2.159749201380735, | |
| "learning_rate": 3.308333242078179e-06, | |
| "loss": 0.7507, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 2.308409227797682, | |
| "learning_rate": 3.292087046928838e-06, | |
| "loss": 0.752, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 2.6229465595278616, | |
| "learning_rate": 3.2758035834438804e-06, | |
| "loss": 0.7348, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 2.055730524359484, | |
| "learning_rate": 3.2594836177734208e-06, | |
| "loss": 0.7421, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 2.0176570132314837, | |
| "learning_rate": 3.2431279177850317e-06, | |
| "loss": 0.7536, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 2.403883281288115, | |
| "learning_rate": 3.226737253027614e-06, | |
| "loss": 0.7279, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 2.000280638267734, | |
| "learning_rate": 3.210312394695189e-06, | |
| "loss": 0.7437, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 2.220127089726233, | |
| "learning_rate": 3.1938541155906146e-06, | |
| "loss": 0.7477, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 2.1887426336139875, | |
| "learning_rate": 3.177363190089221e-06, | |
| "loss": 0.7474, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 1.981302078482481, | |
| "learning_rate": 3.1608403941023793e-06, | |
| "loss": 0.7396, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 2.264086468679179, | |
| "learning_rate": 3.144286505040992e-06, | |
| "loss": 0.7391, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 2.125610702261824, | |
| "learning_rate": 3.1277023017789166e-06, | |
| "loss": 0.7449, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 2.167346271778115, | |
| "learning_rate": 3.111088564616317e-06, | |
| "loss": 0.741, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 2.375363723765046, | |
| "learning_rate": 3.094446075242952e-06, | |
| "loss": 0.7469, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 1.9942074170938597, | |
| "learning_rate": 3.0777756167013946e-06, | |
| "loss": 0.7431, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 2.146743351563113, | |
| "learning_rate": 3.0610779733501904e-06, | |
| "loss": 0.752, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 2.1525241003649174, | |
| "learning_rate": 3.044353930826952e-06, | |
| "loss": 0.7345, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 1.8638029080202694, | |
| "learning_rate": 3.0276042760113937e-06, | |
| "loss": 0.7343, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 1.8356317644401954, | |
| "learning_rate": 3.0108297969883105e-06, | |
| "loss": 0.7336, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 2.0787011804537583, | |
| "learning_rate": 2.9940312830104936e-06, | |
| "loss": 0.7505, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 1.8672332467108457, | |
| "learning_rate": 2.977209524461601e-06, | |
| "loss": 0.7479, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 2.0236026898201063, | |
| "learning_rate": 2.960365312818967e-06, | |
| "loss": 0.7563, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 1.9813232623431425, | |
| "learning_rate": 2.9434994406163574e-06, | |
| "loss": 0.7405, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 2.009874379662123, | |
| "learning_rate": 2.9266127014066905e-06, | |
| "loss": 0.7576, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 1.9182181733436605, | |
| "learning_rate": 2.9097058897246904e-06, | |
| "loss": 0.732, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 2.2713302836846836, | |
| "learning_rate": 2.8927798010495095e-06, | |
| "loss": 0.7557, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 1.8499354567551494, | |
| "learning_rate": 2.875835231767297e-06, | |
| "loss": 0.746, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 2.056857824508339, | |
| "learning_rate": 2.8588729791337298e-06, | |
| "loss": 0.7327, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 2.0366392068105768, | |
| "learning_rate": 2.8418938412365016e-06, | |
| "loss": 0.7649, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 2.075283194301643, | |
| "learning_rate": 2.8248986169577697e-06, | |
| "loss": 0.7498, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 1.8137465567608382, | |
| "learning_rate": 2.807888105936571e-06, | |
| "loss": 0.7426, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 1.9713775183042563, | |
| "learning_rate": 2.7908631085311933e-06, | |
| "loss": 0.7573, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 1.9347350716569554, | |
| "learning_rate": 2.7738244257815234e-06, | |
| "loss": 0.739, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 2.04529529738096, | |
| "learning_rate": 2.756772859371351e-06, | |
| "loss": 0.743, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 2.1971388123428754, | |
| "learning_rate": 2.7397092115906554e-06, | |
| "loss": 0.7471, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 1.7934500775315345, | |
| "learning_rate": 2.7226342852978542e-06, | |
| "loss": 0.7443, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 2.268705081511345, | |
| "learning_rate": 2.7055488838820266e-06, | |
| "loss": 0.7414, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 1.9139342117735205, | |
| "learning_rate": 2.6884538112251147e-06, | |
| "loss": 0.7406, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 2.3588112901077714, | |
| "learning_rate": 2.6713498716641017e-06, | |
| "loss": 0.7575, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 1.7941800288489695, | |
| "learning_rate": 2.6542378699531645e-06, | |
| "loss": 0.7459, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 2.5891645782274955, | |
| "learning_rate": 2.6371186112258118e-06, | |
| "loss": 0.7472, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 1.8268045992021926, | |
| "learning_rate": 2.6199929009570003e-06, | |
| "loss": 0.7489, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 2.703784770062622, | |
| "learning_rate": 2.602861544925236e-06, | |
| "loss": 0.7272, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 2.00621810320303, | |
| "learning_rate": 2.5857253491746646e-06, | |
| "loss": 0.7434, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 3.0770514249566507, | |
| "learning_rate": 2.568585119977142e-06, | |
| "loss": 0.7547, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 2.199087929149206, | |
| "learning_rate": 2.551441663794304e-06, | |
| "loss": 0.7362, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 2.609728798751417, | |
| "learning_rate": 2.5342957872396156e-06, | |
| "loss": 0.7387, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 2.4167509521439583, | |
| "learning_rate": 2.5171482970404244e-06, | |
| "loss": 0.7291, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 2.272061225763508, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.7054, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 2.824272633709913, | |
| "learning_rate": 2.482851702959577e-06, | |
| "loss": 0.6944, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 3.5666822575384316, | |
| "learning_rate": 2.4657042127603853e-06, | |
| "loss": 0.6993, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 2.3931138116624724, | |
| "learning_rate": 2.4485583362056975e-06, | |
| "loss": 0.7047, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 2.9320600754571795, | |
| "learning_rate": 2.4314148800228584e-06, | |
| "loss": 0.6925, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 2.8973919990299133, | |
| "learning_rate": 2.4142746508253367e-06, | |
| "loss": 0.6965, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 2.625815226251557, | |
| "learning_rate": 2.3971384550747644e-06, | |
| "loss": 0.683, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 2.4155664260482714, | |
| "learning_rate": 2.3800070990430006e-06, | |
| "loss": 0.6994, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 2.411022689524766, | |
| "learning_rate": 2.3628813887741882e-06, | |
| "loss": 0.6894, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 2.28077143295534, | |
| "learning_rate": 2.345762130046836e-06, | |
| "loss": 0.7023, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 2.401968274277045, | |
| "learning_rate": 2.3286501283358987e-06, | |
| "loss": 0.6759, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 2.378798265143013, | |
| "learning_rate": 2.311546188774886e-06, | |
| "loss": 0.6958, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 2.3673961258869904, | |
| "learning_rate": 2.2944511161179743e-06, | |
| "loss": 0.6838, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 2.516008730553284, | |
| "learning_rate": 2.2773657147021466e-06, | |
| "loss": 0.6909, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 2.2281402280211466, | |
| "learning_rate": 2.2602907884093454e-06, | |
| "loss": 0.6811, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 2.1439630603489315, | |
| "learning_rate": 2.24322714062865e-06, | |
| "loss": 0.6952, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 2.1634818483654077, | |
| "learning_rate": 2.2261755742184783e-06, | |
| "loss": 0.698, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 2.196698584094087, | |
| "learning_rate": 2.2091368914688067e-06, | |
| "loss": 0.6791, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 2.146209315784236, | |
| "learning_rate": 2.19211189406343e-06, | |
| "loss": 0.7023, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 2.1405757699513766, | |
| "learning_rate": 2.1751013830422303e-06, | |
| "loss": 0.6977, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 1.9616924770190525, | |
| "learning_rate": 2.1581061587634992e-06, | |
| "loss": 0.6812, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 2.23295032851048, | |
| "learning_rate": 2.14112702086627e-06, | |
| "loss": 0.685, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 2.279103072547858, | |
| "learning_rate": 2.1241647682327037e-06, | |
| "loss": 0.6954, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 2.1829812242717317, | |
| "learning_rate": 2.1072201989504914e-06, | |
| "loss": 0.6934, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 2.2223700306607537, | |
| "learning_rate": 2.09029411027531e-06, | |
| "loss": 0.6886, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 2.0785442261456337, | |
| "learning_rate": 2.073387298593311e-06, | |
| "loss": 0.6994, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 2.266610094629974, | |
| "learning_rate": 2.0565005593836434e-06, | |
| "loss": 0.6969, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 2.132447543935174, | |
| "learning_rate": 2.0396346871810347e-06, | |
| "loss": 0.6773, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 2.0884173463715228, | |
| "learning_rate": 2.0227904755383985e-06, | |
| "loss": 0.6945, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 2.2085913230241396, | |
| "learning_rate": 2.005968716989507e-06, | |
| "loss": 0.6949, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 2.0268180328155507, | |
| "learning_rate": 1.98917020301169e-06, | |
| "loss": 0.7027, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 2.4747960228822037, | |
| "learning_rate": 1.9723957239886067e-06, | |
| "loss": 0.6794, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 2.095701576576396, | |
| "learning_rate": 1.955646069173048e-06, | |
| "loss": 0.6961, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 2.0823665858025313, | |
| "learning_rate": 1.93892202664981e-06, | |
| "loss": 0.6786, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 2.245218571211113, | |
| "learning_rate": 1.922224383298606e-06, | |
| "loss": 0.6974, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 2.158681117909884, | |
| "learning_rate": 1.905553924757049e-06, | |
| "loss": 0.7002, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 2.146126951984283, | |
| "learning_rate": 1.888911435383684e-06, | |
| "loss": 0.6843, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 2.1238895111610048, | |
| "learning_rate": 1.8722976982210845e-06, | |
| "loss": 0.684, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 2.1081813807297984, | |
| "learning_rate": 1.8557134949590087e-06, | |
| "loss": 0.6868, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 2.0759325520644096, | |
| "learning_rate": 1.8391596058976214e-06, | |
| "loss": 0.69, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 2.087216621474724, | |
| "learning_rate": 1.8226368099107793e-06, | |
| "loss": 0.6923, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 2.4018313648831113, | |
| "learning_rate": 1.806145884409386e-06, | |
| "loss": 0.6931, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 2.0013309281300216, | |
| "learning_rate": 1.7896876053048112e-06, | |
| "loss": 0.6893, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 2.059546352986111, | |
| "learning_rate": 1.7732627469723868e-06, | |
| "loss": 0.6867, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 2.000325940508461, | |
| "learning_rate": 1.756872082214969e-06, | |
| "loss": 0.6914, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 2.379361502001129, | |
| "learning_rate": 1.7405163822265803e-06, | |
| "loss": 0.6906, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 1.8960887672873148, | |
| "learning_rate": 1.7241964165561204e-06, | |
| "loss": 0.6673, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 2.172181466809878, | |
| "learning_rate": 1.707912953071163e-06, | |
| "loss": 0.6781, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 2.015019267745016, | |
| "learning_rate": 1.6916667579218216e-06, | |
| "loss": 0.6963, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 2.0355415165674846, | |
| "learning_rate": 1.6754585955047081e-06, | |
| "loss": 0.6779, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 2.1086130667226977, | |
| "learning_rate": 1.6592892284269597e-06, | |
| "loss": 0.6998, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 2.059886916939569, | |
| "learning_rate": 1.6431594174703647e-06, | |
| "loss": 0.6802, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 2.0641741782118332, | |
| "learning_rate": 1.6270699215555608e-06, | |
| "loss": 0.6854, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 2.1806834552623444, | |
| "learning_rate": 1.6110214977063345e-06, | |
| "loss": 0.6987, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 2.158353312239409, | |
| "learning_rate": 1.5950149010139938e-06, | |
| "loss": 0.6823, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 1.921092711707764, | |
| "learning_rate": 1.5790508846018493e-06, | |
| "loss": 0.6941, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 2.3977437347525594, | |
| "learning_rate": 1.563130199589773e-06, | |
| "loss": 0.6915, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 2.0303949213498167, | |
| "learning_rate": 1.5472535950588575e-06, | |
| "loss": 0.6971, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 2.176759402835286, | |
| "learning_rate": 1.5314218180161783e-06, | |
| "loss": 0.6809, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 1.9729876407553733, | |
| "learning_rate": 1.5156356133596356e-06, | |
| "loss": 0.6933, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 2.168577615246608, | |
| "learning_rate": 1.4998957238429173e-06, | |
| "loss": 0.6873, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 2.0839917041722704, | |
| "learning_rate": 1.4842028900405422e-06, | |
| "loss": 0.6984, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 1.893739609530612, | |
| "learning_rate": 1.4685578503130227e-06, | |
| "loss": 0.6922, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 2.0857317964888193, | |
| "learning_rate": 1.4529613407721193e-06, | |
| "loss": 0.6908, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 2.0026826285563564, | |
| "learning_rate": 1.4374140952462109e-06, | |
| "loss": 0.6752, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 1.914005315845231, | |
| "learning_rate": 1.4219168452457593e-06, | |
| "loss": 0.6988, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 2.0223609197099552, | |
| "learning_rate": 1.4064703199289038e-06, | |
| "loss": 0.6842, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 2.1096738426378407, | |
| "learning_rate": 1.391075246067139e-06, | |
| "loss": 0.6823, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 2.0768301164703438, | |
| "learning_rate": 1.375732348011132e-06, | |
| "loss": 0.6898, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 2.123519591919823, | |
| "learning_rate": 1.3604423476566342e-06, | |
| "loss": 0.6732, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 2.1761224742449934, | |
| "learning_rate": 1.3452059644105174e-06, | |
| "loss": 0.6915, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 2.1481136545780246, | |
| "learning_rate": 1.3300239151569251e-06, | |
| "loss": 0.6942, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 2.194600040469437, | |
| "learning_rate": 1.3148969142235436e-06, | |
| "loss": 0.6788, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 2.1089291541456223, | |
| "learning_rate": 1.2998256733479896e-06, | |
| "loss": 0.7013, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 2.19121425222603, | |
| "learning_rate": 1.2848109016443255e-06, | |
| "loss": 0.6897, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 1.9319788219662473, | |
| "learning_rate": 1.2698533055696926e-06, | |
| "loss": 0.6976, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 2.275808654157748, | |
| "learning_rate": 1.254953588891073e-06, | |
| "loss": 0.6839, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 2.198386215375118, | |
| "learning_rate": 1.2401124526521763e-06, | |
| "loss": 0.6946, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 1.8790492694109773, | |
| "learning_rate": 1.225330595140455e-06, | |
| "loss": 0.676, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 1.9926892667726497, | |
| "learning_rate": 1.2106087118542504e-06, | |
| "loss": 0.6824, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 2.0144848585141206, | |
| "learning_rate": 1.1959474954700667e-06, | |
| "loss": 0.6906, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 2.0949703627379446, | |
| "learning_rate": 1.1813476358099824e-06, | |
| "loss": 0.6952, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 1.878706080567921, | |
| "learning_rate": 1.166809819809192e-06, | |
| "loss": 0.6846, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 2.106975261880749, | |
| "learning_rate": 1.1523347314836857e-06, | |
| "loss": 0.6916, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 2.0193667358417486, | |
| "learning_rate": 1.1379230518980663e-06, | |
| "loss": 0.695, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 1.8829454822076184, | |
| "learning_rate": 1.123575459133504e-06, | |
| "loss": 0.6856, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 1.9885667241669744, | |
| "learning_rate": 1.109292628255832e-06, | |
| "loss": 0.6849, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 1.9038960696049037, | |
| "learning_rate": 1.0950752312837846e-06, | |
| "loss": 0.6901, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 2.0104842271889467, | |
| "learning_rate": 1.0809239371573779e-06, | |
| "loss": 0.7014, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 1.9066116631636623, | |
| "learning_rate": 1.0668394117064365e-06, | |
| "loss": 0.6798, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 1.9713999361623535, | |
| "learning_rate": 1.0528223176192618e-06, | |
| "loss": 0.6979, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 1.8465153504391632, | |
| "learning_rate": 1.0388733144114605e-06, | |
| "loss": 0.6892, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 1.9408714006937027, | |
| "learning_rate": 1.024993058394902e-06, | |
| "loss": 0.6985, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 2.030993015395332, | |
| "learning_rate": 1.0111822026468515e-06, | |
| "loss": 0.6925, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 1.811976430858568, | |
| "learning_rate": 9.974413969792285e-07, | |
| "loss": 0.6805, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 2.025426310321446, | |
| "learning_rate": 9.837712879080464e-07, | |
| "loss": 0.6884, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 1.8699504401283087, | |
| "learning_rate": 9.701725186229801e-07, | |
| "loss": 0.6766, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 1.9813729971640541, | |
| "learning_rate": 9.56645728957117e-07, | |
| "loss": 0.6816, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 1.857568380571694, | |
| "learning_rate": 9.431915553568374e-07, | |
| "loss": 0.6941, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 1.8075501016131494, | |
| "learning_rate": 9.298106308518847e-07, | |
| "loss": 0.6915, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 1.896748082277053, | |
| "learning_rate": 9.165035850255672e-07, | |
| "loss": 0.6965, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 1.903236161607879, | |
| "learning_rate": 9.032710439851444e-07, | |
| "loss": 0.6942, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 2.0473184895907344, | |
| "learning_rate": 8.901136303323654e-07, | |
| "loss": 0.6868, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 1.9225358657320613, | |
| "learning_rate": 8.770319631341745e-07, | |
| "loss": 0.6833, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 1.8842364675717973, | |
| "learning_rate": 8.640266578935841e-07, | |
| "loss": 0.7059, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 1.9601955634309354, | |
| "learning_rate": 8.510983265207152e-07, | |
| "loss": 0.6996, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 1.8136997160021915, | |
| "learning_rate": 8.382475773040055e-07, | |
| "loss": 0.6836, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 1.9111490776903417, | |
| "learning_rate": 8.254750148815893e-07, | |
| "loss": 0.6996, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 1.8878734449529964, | |
| "learning_rate": 8.127812402128521e-07, | |
| "loss": 0.6932, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 1.7623368894317115, | |
| "learning_rate": 8.001668505501464e-07, | |
| "loss": 0.696, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 1.9509975903694705, | |
| "learning_rate": 7.876324394107018e-07, | |
| "loss": 0.6886, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 1.8836926534531768, | |
| "learning_rate": 7.751785965486894e-07, | |
| "loss": 0.6898, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 1.9384474477733897, | |
| "learning_rate": 7.628059079274793e-07, | |
| "loss": 0.6829, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 1.8215374593231801, | |
| "learning_rate": 7.505149556920698e-07, | |
| "loss": 0.6908, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 1.8093873518769943, | |
| "learning_rate": 7.383063181416955e-07, | |
| "loss": 0.6983, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 1.9915437999230632, | |
| "learning_rate": 7.261805697026178e-07, | |
| "loss": 0.7005, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 1.89611825729105, | |
| "learning_rate": 7.141382809010999e-07, | |
| "loss": 0.6931, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 1.8365953198306064, | |
| "learning_rate": 7.021800183365607e-07, | |
| "loss": 0.6817, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 3.01, | |
| "grad_norm": 1.8887825422099398, | |
| "learning_rate": 6.903063446549166e-07, | |
| "loss": 0.6796, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 2.0505162217401396, | |
| "learning_rate": 6.785178185221095e-07, | |
| "loss": 0.6823, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 3.01, | |
| "grad_norm": 2.8780046222752, | |
| "learning_rate": 6.668149945978203e-07, | |
| "loss": 0.6598, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 3.02, | |
| "grad_norm": 2.320474085762604, | |
| "learning_rate": 6.551984235093692e-07, | |
| "loss": 0.6646, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 3.03, | |
| "grad_norm": 3.148494101628221, | |
| "learning_rate": 6.436686518258156e-07, | |
| "loss": 0.6521, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 2.9894322407930707, | |
| "learning_rate": 6.322262220322314e-07, | |
| "loss": 0.6497, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 2.0905514911758116, | |
| "learning_rate": 6.208716725041869e-07, | |
| "loss": 0.6729, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 3.05, | |
| "grad_norm": 2.5235937968654882, | |
| "learning_rate": 6.096055374824117e-07, | |
| "loss": 0.6536, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 3.06, | |
| "grad_norm": 2.7164252624114953, | |
| "learning_rate": 5.984283470476621e-07, | |
| "loss": 0.6557, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 3.07, | |
| "grad_norm": 2.1414297977553134, | |
| "learning_rate": 5.873406270957804e-07, | |
| "loss": 0.6517, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 2.087954653292254, | |
| "learning_rate": 5.763428993129499e-07, | |
| "loss": 0.6535, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 3.09, | |
| "grad_norm": 2.5430489969699166, | |
| "learning_rate": 5.654356811511494e-07, | |
| "loss": 0.6594, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "grad_norm": 2.4893194798160425, | |
| "learning_rate": 5.546194858038073e-07, | |
| "loss": 0.6702, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "grad_norm": 1.9260382585512938, | |
| "learning_rate": 5.438948221816559e-07, | |
| "loss": 0.6629, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 3.11, | |
| "grad_norm": 2.057039216215999, | |
| "learning_rate": 5.332621948887823e-07, | |
| "loss": 0.6583, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 2.182074257751017, | |
| "learning_rate": 5.227221041988955e-07, | |
| "loss": 0.6602, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 3.13, | |
| "grad_norm": 1.9356067875549532, | |
| "learning_rate": 5.122750460317768e-07, | |
| "loss": 0.6621, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 3.14, | |
| "grad_norm": 1.9075744893117703, | |
| "learning_rate": 5.019215119299578e-07, | |
| "loss": 0.6673, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 2.0600626341053028, | |
| "learning_rate": 4.916619890355812e-07, | |
| "loss": 0.6577, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 3.16, | |
| "grad_norm": 1.847664227547946, | |
| "learning_rate": 4.814969600674926e-07, | |
| "loss": 0.6566, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 3.16, | |
| "grad_norm": 1.9200825550285445, | |
| "learning_rate": 4.714269032985161e-07, | |
| "loss": 0.6531, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 3.17, | |
| "grad_norm": 1.945604786921752, | |
| "learning_rate": 4.614522925329626e-07, | |
| "loss": 0.6577, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 3.18, | |
| "grad_norm": 1.9471196049311694, | |
| "learning_rate": 4.515735970843263e-07, | |
| "loss": 0.6659, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 3.19, | |
| "grad_norm": 1.8278961360694248, | |
| "learning_rate": 4.417912817532133e-07, | |
| "loss": 0.6554, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 1.88830260098924, | |
| "learning_rate": 4.321058068054626e-07, | |
| "loss": 0.6563, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 3.21, | |
| "grad_norm": 1.9149749844100774, | |
| "learning_rate": 4.225176279504975e-07, | |
| "loss": 0.6571, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 3.22, | |
| "grad_norm": 1.8814221934773716, | |
| "learning_rate": 4.130271963198815e-07, | |
| "loss": 0.6572, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 3.22, | |
| "grad_norm": 1.8849419819366298, | |
| "learning_rate": 4.0363495844609134e-07, | |
| "loss": 0.6604, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 3.23, | |
| "grad_norm": 1.9046094115295815, | |
| "learning_rate": 3.9434135624150854e-07, | |
| "loss": 0.6652, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 3.24, | |
| "grad_norm": 1.944275827853693, | |
| "learning_rate": 3.8514682697762706e-07, | |
| "loss": 0.6572, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 1.8699721288071858, | |
| "learning_rate": 3.7605180326447806e-07, | |
| "loss": 0.6401, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 3.26, | |
| "grad_norm": 1.775035768873695, | |
| "learning_rate": 3.6705671303027687e-07, | |
| "loss": 0.6523, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 3.27, | |
| "grad_norm": 1.7843895394177849, | |
| "learning_rate": 3.581619795012875e-07, | |
| "loss": 0.6516, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 3.28, | |
| "grad_norm": 1.919359950542867, | |
| "learning_rate": 3.493680211819103e-07, | |
| "loss": 0.6607, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.28, | |
| "grad_norm": 1.8576252034229292, | |
| "learning_rate": 3.4067525183499013e-07, | |
| "loss": 0.6663, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 3.29, | |
| "grad_norm": 1.7764574523914607, | |
| "learning_rate": 3.3208408046234904e-07, | |
| "loss": 0.6576, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 3.3, | |
| "grad_norm": 1.8446907169053142, | |
| "learning_rate": 3.2359491128554214e-07, | |
| "loss": 0.6582, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 3.31, | |
| "grad_norm": 1.7493689583147616, | |
| "learning_rate": 3.152081437268398e-07, | |
| "loss": 0.6548, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 3.32, | |
| "grad_norm": 1.816698197495291, | |
| "learning_rate": 3.069241723904318e-07, | |
| "loss": 0.6636, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 1.790271464078186, | |
| "learning_rate": 2.987433870438641e-07, | |
| "loss": 0.657, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 3.34, | |
| "grad_norm": 1.743131857961643, | |
| "learning_rate": 2.906661725996976e-07, | |
| "loss": 0.6652, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 3.34, | |
| "grad_norm": 1.7977795864445705, | |
| "learning_rate": 2.82692909097399e-07, | |
| "loss": 0.6455, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 3.35, | |
| "grad_norm": 1.777376679638967, | |
| "learning_rate": 2.7482397168545895e-07, | |
| "loss": 0.6592, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 1.806389217351911, | |
| "learning_rate": 2.670597306037412e-07, | |
| "loss": 0.6606, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.37, | |
| "grad_norm": 1.773333434653589, | |
| "learning_rate": 2.59400551166063e-07, | |
| "loss": 0.6576, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 3.38, | |
| "grad_norm": 1.7728777287155046, | |
| "learning_rate": 2.5184679374300553e-07, | |
| "loss": 0.6606, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 3.39, | |
| "grad_norm": 1.83343142007096, | |
| "learning_rate": 2.4439881374496016e-07, | |
| "loss": 0.6713, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 1.8119712073997163, | |
| "learning_rate": 2.3705696160540303e-07, | |
| "loss": 0.6596, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 1.7575305127120062, | |
| "learning_rate": 2.298215827644118e-07, | |
| "loss": 0.6582, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 3.41, | |
| "grad_norm": 1.80965570055429, | |
| "learning_rate": 2.2269301765240558e-07, | |
| "loss": 0.6508, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 3.42, | |
| "grad_norm": 1.8329895956407685, | |
| "learning_rate": 2.1567160167413503e-07, | |
| "loss": 0.6657, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 3.43, | |
| "grad_norm": 1.8295154972235375, | |
| "learning_rate": 2.0875766519289436e-07, | |
| "loss": 0.6602, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 1.778360539334375, | |
| "learning_rate": 2.0195153351498325e-07, | |
| "loss": 0.6672, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 3.45, | |
| "grad_norm": 1.8281360399477038, | |
| "learning_rate": 1.9525352687439548e-07, | |
| "loss": 0.6713, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 3.46, | |
| "grad_norm": 1.798281276385492, | |
| "learning_rate": 1.886639604177573e-07, | |
| "loss": 0.6589, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 3.46, | |
| "grad_norm": 1.8101646090365584, | |
| "learning_rate": 1.821831441894939e-07, | |
| "loss": 0.6576, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 3.47, | |
| "grad_norm": 1.8163930084993238, | |
| "learning_rate": 1.7581138311724754e-07, | |
| "loss": 0.6509, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 1.7889849857989786, | |
| "learning_rate": 1.6954897699752394e-07, | |
| "loss": 0.6654, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 3.49, | |
| "grad_norm": 1.7753176697331132, | |
| "learning_rate": 1.6339622048159198e-07, | |
| "loss": 0.6555, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 1.758833276503715, | |
| "learning_rate": 1.5735340306161752e-07, | |
| "loss": 0.665, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 3.51, | |
| "grad_norm": 1.7863343585516815, | |
| "learning_rate": 1.514208090570432e-07, | |
| "loss": 0.6484, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 3.51, | |
| "grad_norm": 1.7763079205782726, | |
| "learning_rate": 1.4559871760121108e-07, | |
| "loss": 0.6562, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 1.8490593873759873, | |
| "learning_rate": 1.3988740262822847e-07, | |
| "loss": 0.6497, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 3.53, | |
| "grad_norm": 1.7753823119901868, | |
| "learning_rate": 1.3428713286008005e-07, | |
| "loss": 0.6534, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 3.54, | |
| "grad_norm": 1.7671712087628604, | |
| "learning_rate": 1.2879817179398375e-07, | |
| "loss": 0.6519, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 3.55, | |
| "grad_norm": 1.7594428378082356, | |
| "learning_rate": 1.2342077768999372e-07, | |
| "loss": 0.6519, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 1.767897963166057, | |
| "learning_rate": 1.1815520355884679e-07, | |
| "loss": 0.6528, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 3.57, | |
| "grad_norm": 1.7463739318936164, | |
| "learning_rate": 1.130016971500622e-07, | |
| "loss": 0.6582, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 3.57, | |
| "grad_norm": 1.747840277010472, | |
| "learning_rate": 1.0796050094027954e-07, | |
| "loss": 0.6661, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "grad_norm": 1.8160480622182698, | |
| "learning_rate": 1.0303185212185485e-07, | |
| "loss": 0.646, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 3.59, | |
| "grad_norm": 1.7568873705777095, | |
| "learning_rate": 9.821598259169729e-08, | |
| "loss": 0.6554, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 1.733832059747267, | |
| "learning_rate": 9.351311894036014e-08, | |
| "loss": 0.6632, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 3.61, | |
| "grad_norm": 1.804637277135235, | |
| "learning_rate": 8.892348244137788e-08, | |
| "loss": 0.66, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 3.62, | |
| "grad_norm": 1.767868039735343, | |
| "learning_rate": 8.444728904085737e-08, | |
| "loss": 0.659, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 3.63, | |
| "grad_norm": 1.770931658466082, | |
| "learning_rate": 8.008474934731447e-08, | |
| "loss": 0.668, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 3.63, | |
| "grad_norm": 1.7732670135950312, | |
| "learning_rate": 7.583606862176713e-08, | |
| "loss": 0.6548, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 3.64, | |
| "grad_norm": 1.7259860505689657, | |
| "learning_rate": 7.170144676807683e-08, | |
| "loss": 0.6318, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 3.65, | |
| "grad_norm": 1.7392331188224266, | |
| "learning_rate": 6.768107832354292e-08, | |
| "loss": 0.6636, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 3.66, | |
| "grad_norm": 1.7732212376542704, | |
| "learning_rate": 6.377515244974903e-08, | |
| "loss": 0.6626, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 3.67, | |
| "grad_norm": 1.7335582830409095, | |
| "learning_rate": 5.99838529236646e-08, | |
| "loss": 0.668, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 1.7716856700895114, | |
| "learning_rate": 5.6307358128994685e-08, | |
| "loss": 0.667, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 3.69, | |
| "grad_norm": 1.7617896255786891, | |
| "learning_rate": 5.274584104779157e-08, | |
| "loss": 0.6538, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 3.69, | |
| "grad_norm": 1.7528019015815823, | |
| "learning_rate": 4.929946925231077e-08, | |
| "loss": 0.6534, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 3.7, | |
| "grad_norm": 1.8117296265464948, | |
| "learning_rate": 4.5968404897130944e-08, | |
| "loss": 0.6674, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 3.71, | |
| "grad_norm": 1.749044793771054, | |
| "learning_rate": 4.27528047115211e-08, | |
| "loss": 0.6682, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 3.72, | |
| "grad_norm": 1.7454523412078409, | |
| "learning_rate": 3.965281999206899e-08, | |
| "loss": 0.6601, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "grad_norm": 1.7598878691389603, | |
| "learning_rate": 3.666859659556016e-08, | |
| "loss": 0.6603, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 3.74, | |
| "grad_norm": 1.7046387508749583, | |
| "learning_rate": 3.3800274932117294e-08, | |
| "loss": 0.6518, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 1.7163795248428233, | |
| "learning_rate": 3.1047989958592203e-08, | |
| "loss": 0.6651, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 1.777257207147479, | |
| "learning_rate": 2.841187117221672e-08, | |
| "loss": 0.6558, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "grad_norm": 1.8219202465976836, | |
| "learning_rate": 2.5892042604510614e-08, | |
| "loss": 0.6508, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 3.77, | |
| "grad_norm": 1.7767451714812037, | |
| "learning_rate": 2.348862281544323e-08, | |
| "loss": 0.6509, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 3.78, | |
| "grad_norm": 1.7465806936718902, | |
| "learning_rate": 2.1201724887858488e-08, | |
| "loss": 0.6523, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 3.79, | |
| "grad_norm": 1.7329527459099043, | |
| "learning_rate": 1.9031456422151374e-08, | |
| "loss": 0.6404, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 1.7965434015907633, | |
| "learning_rate": 1.6977919531207533e-08, | |
| "loss": 0.6603, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 3.81, | |
| "grad_norm": 1.7618638033364344, | |
| "learning_rate": 1.5041210835596288e-08, | |
| "loss": 0.6421, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 3.81, | |
| "grad_norm": 1.7717100092665263, | |
| "learning_rate": 1.3221421459027329e-08, | |
| "loss": 0.6656, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 3.82, | |
| "grad_norm": 1.7605568107436471, | |
| "learning_rate": 1.1518637024061086e-08, | |
| "loss": 0.6668, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 3.83, | |
| "grad_norm": 1.751907548134551, | |
| "learning_rate": 9.932937648081397e-09, | |
| "loss": 0.6579, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 1.7386986707922565, | |
| "learning_rate": 8.464397939524915e-09, | |
| "loss": 0.6703, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 3.85, | |
| "grad_norm": 1.7643987709822369, | |
| "learning_rate": 7.113086994372242e-09, | |
| "loss": 0.666, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 3.86, | |
| "grad_norm": 1.7296344516569304, | |
| "learning_rate": 5.879068392894427e-09, | |
| "loss": 0.6522, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 3.87, | |
| "grad_norm": 1.7593863922129787, | |
| "learning_rate": 4.762400196664518e-09, | |
| "loss": 0.6586, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 3.87, | |
| "grad_norm": 1.7608500271319567, | |
| "learning_rate": 3.763134945823088e-09, | |
| "loss": 0.6689, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 1.7248718265179743, | |
| "learning_rate": 2.8813196566079836e-09, | |
| "loss": 0.6476, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 3.89, | |
| "grad_norm": 1.7430786741620756, | |
| "learning_rate": 2.116995819140821e-09, | |
| "loss": 0.6636, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "grad_norm": 1.7541283862977322, | |
| "learning_rate": 1.4701993954760462e-09, | |
| "loss": 0.6639, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 3.91, | |
| "grad_norm": 1.7203096912350941, | |
| "learning_rate": 9.409608179078433e-10, | |
| "loss": 0.6475, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 1.7295283175572225, | |
| "learning_rate": 5.293049875393363e-10, | |
| "loss": 0.6589, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 3.93, | |
| "grad_norm": 1.7037779795629253, | |
| "learning_rate": 2.3525127310936035e-10, | |
| "loss": 0.6521, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 3.93, | |
| "grad_norm": 1.7513315633630457, | |
| "learning_rate": 5.88135100831888e-11, | |
| "loss": 0.6556, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 3.94, | |
| "grad_norm": 1.7159722803516417, | |
| "learning_rate": 0.0, | |
| "loss": 0.6477, | |
| "step": 468 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 468, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 117, | |
| "total_flos": 783498671554560.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |