{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.8,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008,
      "grad_norm": 34.44804382324219,
      "learning_rate": 1e-05,
      "loss": 13.0101,
      "mean_token_accuracy": 0.4696590006351471,
      "step": 1
    },
    {
      "epoch": 0.016,
      "grad_norm": 30.779788970947266,
      "learning_rate": 2e-05,
      "loss": 12.3851,
      "mean_token_accuracy": 0.47303473204374313,
      "step": 2
    },
    {
      "epoch": 0.024,
      "grad_norm": 29.67559242248535,
      "learning_rate": 3e-05,
      "loss": 12.3488,
      "mean_token_accuracy": 0.49709559231996536,
      "step": 3
    },
    {
      "epoch": 0.032,
      "grad_norm": 26.862010955810547,
      "learning_rate": 4e-05,
      "loss": 11.6596,
      "mean_token_accuracy": 0.5584611147642136,
      "step": 4
    },
    {
      "epoch": 0.04,
      "grad_norm": 22.10072135925293,
      "learning_rate": 5e-05,
      "loss": 10.1384,
      "mean_token_accuracy": 0.5924926251173019,
      "step": 5
    },
    {
      "epoch": 0.048,
      "grad_norm": 20.171361923217773,
      "learning_rate": 4.9473684210526315e-05,
      "loss": 9.5421,
      "mean_token_accuracy": 0.5888276249170303,
      "step": 6
    },
    {
      "epoch": 0.056,
      "grad_norm": 16.452842712402344,
      "learning_rate": 4.8947368421052635e-05,
      "loss": 8.4344,
      "mean_token_accuracy": 0.632336363196373,
      "step": 7
    },
    {
      "epoch": 0.064,
      "grad_norm": 12.61925983428955,
      "learning_rate": 4.842105263157895e-05,
      "loss": 7.7786,
      "mean_token_accuracy": 0.6633019298315048,
      "step": 8
    },
    {
      "epoch": 0.072,
      "grad_norm": 11.540078163146973,
      "learning_rate": 4.789473684210526e-05,
      "loss": 7.5607,
      "mean_token_accuracy": 0.6721473336219788,
      "step": 9
    },
    {
      "epoch": 0.08,
      "grad_norm": 9.264492988586426,
      "learning_rate": 4.736842105263158e-05,
      "loss": 7.5589,
      "mean_token_accuracy": 0.6924590468406677,
      "step": 10
    },
    {
      "epoch": 0.088,
      "grad_norm": 9.118257522583008,
      "learning_rate": 4.68421052631579e-05,
      "loss": 7.2038,
      "mean_token_accuracy": 0.6936477273702621,
      "step": 11
    },
    {
      "epoch": 0.096,
      "grad_norm": 9.140292167663574,
      "learning_rate": 4.6315789473684214e-05,
      "loss": 6.7882,
      "mean_token_accuracy": 0.7149988412857056,
      "step": 12
    },
    {
      "epoch": 0.104,
      "grad_norm": 8.944159507751465,
      "learning_rate": 4.5789473684210527e-05,
      "loss": 6.7822,
      "mean_token_accuracy": 0.7038442492485046,
      "step": 13
    },
    {
      "epoch": 0.112,
      "grad_norm": 8.406500816345215,
      "learning_rate": 4.5263157894736846e-05,
      "loss": 6.6675,
      "mean_token_accuracy": 0.6912005394697189,
      "step": 14
    },
    {
      "epoch": 0.12,
      "grad_norm": 7.81226921081543,
      "learning_rate": 4.473684210526316e-05,
      "loss": 6.156,
      "mean_token_accuracy": 0.7322177290916443,
      "step": 15
    },
    {
      "epoch": 0.128,
      "grad_norm": 10.26094913482666,
      "learning_rate": 4.421052631578947e-05,
      "loss": 6.3945,
      "mean_token_accuracy": 0.7067171931266785,
      "step": 16
    },
    {
      "epoch": 0.136,
      "grad_norm": 9.275789260864258,
      "learning_rate": 4.368421052631579e-05,
      "loss": 6.0611,
      "mean_token_accuracy": 0.7207391560077667,
      "step": 17
    },
    {
      "epoch": 0.144,
      "grad_norm": 9.354033470153809,
      "learning_rate": 4.3157894736842105e-05,
      "loss": 6.1713,
      "mean_token_accuracy": 0.7210505157709122,
      "step": 18
    },
    {
      "epoch": 0.152,
      "grad_norm": 7.362099647521973,
      "learning_rate": 4.2631578947368425e-05,
      "loss": 6.4072,
      "mean_token_accuracy": 0.7182824611663818,
      "step": 19
    },
    {
      "epoch": 0.16,
      "grad_norm": 7.947159290313721,
      "learning_rate": 4.210526315789474e-05,
      "loss": 6.2321,
      "mean_token_accuracy": 0.7235151827335358,
      "step": 20
    },
    {
      "epoch": 0.168,
      "grad_norm": 9.023740768432617,
      "learning_rate": 4.157894736842106e-05,
      "loss": 6.0462,
      "mean_token_accuracy": 0.7328377515077591,
      "step": 21
    },
    {
      "epoch": 0.176,
      "grad_norm": 7.097681999206543,
      "learning_rate": 4.105263157894737e-05,
      "loss": 5.1996,
      "mean_token_accuracy": 0.7640947103500366,
      "step": 22
    },
    {
      "epoch": 0.184,
      "grad_norm": 7.417032718658447,
      "learning_rate": 4.0526315789473684e-05,
      "loss": 6.2611,
      "mean_token_accuracy": 0.7178521305322647,
      "step": 23
    },
    {
      "epoch": 0.192,
      "grad_norm": 6.950761318206787,
      "learning_rate": 4e-05,
      "loss": 5.5961,
      "mean_token_accuracy": 0.7444904744625092,
      "step": 24
    },
    {
      "epoch": 0.2,
      "grad_norm": 11.129012107849121,
      "learning_rate": 3.9473684210526316e-05,
      "loss": 5.3222,
      "mean_token_accuracy": 0.75727978348732,
      "step": 25
    },
    {
      "epoch": 0.208,
      "grad_norm": 7.0950469970703125,
      "learning_rate": 3.894736842105263e-05,
      "loss": 5.4028,
      "mean_token_accuracy": 0.7472832798957825,
      "step": 26
    },
    {
      "epoch": 0.216,
      "grad_norm": 9.279951095581055,
      "learning_rate": 3.842105263157895e-05,
      "loss": 5.6822,
      "mean_token_accuracy": 0.7470583468675613,
      "step": 27
    },
    {
      "epoch": 0.224,
      "grad_norm": 7.986962795257568,
      "learning_rate": 3.789473684210527e-05,
      "loss": 5.5314,
      "mean_token_accuracy": 0.7577493786811829,
      "step": 28
    },
    {
      "epoch": 0.232,
      "grad_norm": 7.804550647735596,
      "learning_rate": 3.736842105263158e-05,
      "loss": 5.6165,
      "mean_token_accuracy": 0.7538002282381058,
      "step": 29
    },
    {
      "epoch": 0.24,
      "grad_norm": 7.608943939208984,
      "learning_rate": 3.6842105263157895e-05,
      "loss": 5.8104,
      "mean_token_accuracy": 0.7309564054012299,
      "step": 30
    },
    {
      "epoch": 0.248,
      "grad_norm": 7.622748374938965,
      "learning_rate": 3.6315789473684214e-05,
      "loss": 6.0722,
      "mean_token_accuracy": 0.740648090839386,
      "step": 31
    },
    {
      "epoch": 0.256,
      "grad_norm": 7.36374044418335,
      "learning_rate": 3.578947368421053e-05,
      "loss": 5.1848,
      "mean_token_accuracy": 0.7709122896194458,
      "step": 32
    },
    {
      "epoch": 0.264,
      "grad_norm": 6.820411682128906,
      "learning_rate": 3.526315789473684e-05,
      "loss": 5.0621,
      "mean_token_accuracy": 0.7692567408084869,
      "step": 33
    },
    {
      "epoch": 0.272,
      "grad_norm": 6.473299980163574,
      "learning_rate": 3.473684210526316e-05,
      "loss": 5.3879,
      "mean_token_accuracy": 0.7631748914718628,
      "step": 34
    },
    {
      "epoch": 0.28,
      "grad_norm": 7.249595642089844,
      "learning_rate": 3.421052631578947e-05,
      "loss": 5.7251,
      "mean_token_accuracy": 0.7535725235939026,
      "step": 35
    },
    {
      "epoch": 0.288,
      "grad_norm": 6.396822929382324,
      "learning_rate": 3.368421052631579e-05,
      "loss": 5.5203,
      "mean_token_accuracy": 0.7570009678602219,
      "step": 36
    },
    {
      "epoch": 0.296,
      "grad_norm": 5.999199390411377,
      "learning_rate": 3.3157894736842106e-05,
      "loss": 4.8803,
      "mean_token_accuracy": 0.7936893254518509,
      "step": 37
    },
    {
      "epoch": 0.304,
      "grad_norm": 6.992559909820557,
      "learning_rate": 3.2631578947368426e-05,
      "loss": 5.3993,
      "mean_token_accuracy": 0.7533106952905655,
      "step": 38
    },
    {
      "epoch": 0.312,
      "grad_norm": 7.724341869354248,
      "learning_rate": 3.210526315789474e-05,
      "loss": 5.6051,
      "mean_token_accuracy": 0.7366128116846085,
      "step": 39
    },
    {
      "epoch": 0.32,
      "grad_norm": 6.947137832641602,
      "learning_rate": 3.157894736842105e-05,
      "loss": 5.5221,
      "mean_token_accuracy": 0.7558678537607193,
      "step": 40
    },
    {
      "epoch": 0.328,
      "grad_norm": 7.61790657043457,
      "learning_rate": 3.105263157894737e-05,
      "loss": 5.0442,
      "mean_token_accuracy": 0.7664971798658371,
      "step": 41
    },
    {
      "epoch": 0.336,
      "grad_norm": 6.234894275665283,
      "learning_rate": 3.0526315789473684e-05,
      "loss": 5.1083,
      "mean_token_accuracy": 0.7650484591722488,
      "step": 42
    },
    {
      "epoch": 0.344,
      "grad_norm": 6.606740951538086,
      "learning_rate": 3e-05,
      "loss": 4.7578,
      "mean_token_accuracy": 0.7714228928089142,
      "step": 43
    },
    {
      "epoch": 0.352,
      "grad_norm": 7.6674957275390625,
      "learning_rate": 2.9473684210526314e-05,
      "loss": 4.8684,
      "mean_token_accuracy": 0.7777758836746216,
      "step": 44
    },
    {
      "epoch": 0.36,
      "grad_norm": 6.467043399810791,
      "learning_rate": 2.8947368421052634e-05,
      "loss": 4.8455,
      "mean_token_accuracy": 0.7818402796983719,
      "step": 45
    },
    {
      "epoch": 0.368,
      "grad_norm": 8.154311180114746,
      "learning_rate": 2.842105263157895e-05,
      "loss": 4.2716,
      "mean_token_accuracy": 0.7975698858499527,
      "step": 46
    },
    {
      "epoch": 0.376,
      "grad_norm": 6.493333339691162,
      "learning_rate": 2.7894736842105263e-05,
      "loss": 4.8411,
      "mean_token_accuracy": 0.7828859686851501,
      "step": 47
    },
    {
      "epoch": 0.384,
      "grad_norm": 6.82868766784668,
      "learning_rate": 2.7368421052631583e-05,
      "loss": 5.1772,
      "mean_token_accuracy": 0.7642232775688171,
      "step": 48
    },
    {
      "epoch": 0.392,
      "grad_norm": 7.6515793800354,
      "learning_rate": 2.6842105263157896e-05,
      "loss": 5.3152,
      "mean_token_accuracy": 0.7572789639234543,
      "step": 49
    },
    {
      "epoch": 0.4,
      "grad_norm": 7.436869144439697,
      "learning_rate": 2.6315789473684212e-05,
      "loss": 5.6628,
      "mean_token_accuracy": 0.7437437325716019,
      "step": 50
    },
    {
      "epoch": 0.408,
      "grad_norm": 7.494150161743164,
      "learning_rate": 2.578947368421053e-05,
      "loss": 4.7967,
      "mean_token_accuracy": 0.7860404402017593,
      "step": 51
    },
    {
      "epoch": 0.416,
      "grad_norm": 7.129732131958008,
      "learning_rate": 2.5263157894736845e-05,
      "loss": 4.1962,
      "mean_token_accuracy": 0.8091482371091843,
      "step": 52
    },
    {
      "epoch": 0.424,
      "grad_norm": 6.54312801361084,
      "learning_rate": 2.4736842105263158e-05,
      "loss": 5.1673,
      "mean_token_accuracy": 0.7621930986642838,
      "step": 53
    },
    {
      "epoch": 0.432,
      "grad_norm": 7.091205596923828,
      "learning_rate": 2.4210526315789474e-05,
      "loss": 4.9484,
      "mean_token_accuracy": 0.7637773156166077,
      "step": 54
    },
    {
      "epoch": 0.44,
      "grad_norm": 6.452225685119629,
      "learning_rate": 2.368421052631579e-05,
      "loss": 5.2121,
      "mean_token_accuracy": 0.7751423120498657,
      "step": 55
    },
    {
      "epoch": 0.448,
      "grad_norm": 7.079778671264648,
      "learning_rate": 2.3157894736842107e-05,
      "loss": 4.4913,
      "mean_token_accuracy": 0.7925348579883575,
      "step": 56
    },
    {
      "epoch": 0.456,
      "grad_norm": 6.021428108215332,
      "learning_rate": 2.2631578947368423e-05,
      "loss": 5.1743,
      "mean_token_accuracy": 0.7735058069229126,
      "step": 57
    },
    {
      "epoch": 0.464,
      "grad_norm": 6.639064788818359,
      "learning_rate": 2.2105263157894736e-05,
      "loss": 4.2683,
      "mean_token_accuracy": 0.7983661592006683,
      "step": 58
    },
    {
      "epoch": 0.472,
      "grad_norm": 7.0270256996154785,
      "learning_rate": 2.1578947368421053e-05,
      "loss": 4.7973,
      "mean_token_accuracy": 0.7833641171455383,
      "step": 59
    },
    {
      "epoch": 0.48,
      "grad_norm": 6.48342752456665,
      "learning_rate": 2.105263157894737e-05,
      "loss": 4.7573,
      "mean_token_accuracy": 0.7896635830402374,
      "step": 60
    },
    {
      "epoch": 0.488,
      "grad_norm": 7.281738758087158,
      "learning_rate": 2.0526315789473685e-05,
      "loss": 5.3465,
      "mean_token_accuracy": 0.7754404097795486,
      "step": 61
    },
    {
      "epoch": 0.496,
      "grad_norm": 7.372372627258301,
      "learning_rate": 2e-05,
      "loss": 4.3753,
      "mean_token_accuracy": 0.7940351366996765,
      "step": 62
    },
    {
      "epoch": 0.504,
      "grad_norm": 7.551279544830322,
      "learning_rate": 1.9473684210526315e-05,
      "loss": 4.7479,
      "mean_token_accuracy": 0.7812730073928833,
      "step": 63
    },
    {
      "epoch": 0.512,
      "grad_norm": 6.71967077255249,
      "learning_rate": 1.8947368421052634e-05,
      "loss": 4.7599,
      "mean_token_accuracy": 0.7793499678373337,
      "step": 64
    },
    {
      "epoch": 0.52,
      "grad_norm": 7.2533063888549805,
      "learning_rate": 1.8421052631578947e-05,
      "loss": 4.8565,
      "mean_token_accuracy": 0.7772107869386673,
      "step": 65
    },
    {
      "epoch": 0.528,
      "grad_norm": 6.089155197143555,
      "learning_rate": 1.7894736842105264e-05,
      "loss": 4.4817,
      "mean_token_accuracy": 0.7843845635652542,
      "step": 66
    },
    {
      "epoch": 0.536,
      "grad_norm": 7.141996383666992,
      "learning_rate": 1.736842105263158e-05,
      "loss": 4.8889,
      "mean_token_accuracy": 0.7912164181470871,
      "step": 67
    },
    {
      "epoch": 0.544,
      "grad_norm": 6.36722993850708,
      "learning_rate": 1.6842105263157896e-05,
      "loss": 5.663,
      "mean_token_accuracy": 0.7410993576049805,
      "step": 68
    },
    {
      "epoch": 0.552,
      "grad_norm": 6.695736885070801,
      "learning_rate": 1.6315789473684213e-05,
      "loss": 5.4292,
      "mean_token_accuracy": 0.7530855089426041,
      "step": 69
    },
    {
      "epoch": 0.56,
      "grad_norm": 6.8679518699646,
      "learning_rate": 1.5789473684210526e-05,
      "loss": 4.6897,
      "mean_token_accuracy": 0.7747458964586258,
      "step": 70
    },
    {
      "epoch": 0.568,
      "grad_norm": 7.445884704589844,
      "learning_rate": 1.5263157894736842e-05,
      "loss": 4.9695,
      "mean_token_accuracy": 0.782129317522049,
      "step": 71
    },
    {
      "epoch": 0.576,
      "grad_norm": 6.467252254486084,
      "learning_rate": 1.4736842105263157e-05,
      "loss": 4.2316,
      "mean_token_accuracy": 0.8010141849517822,
      "step": 72
    },
    {
      "epoch": 0.584,
      "grad_norm": 6.673328876495361,
      "learning_rate": 1.4210526315789475e-05,
      "loss": 5.5348,
      "mean_token_accuracy": 0.7492043673992157,
      "step": 73
    },
    {
      "epoch": 0.592,
      "grad_norm": 6.37332820892334,
      "learning_rate": 1.3684210526315791e-05,
      "loss": 4.1118,
      "mean_token_accuracy": 0.8063640743494034,
      "step": 74
    },
    {
      "epoch": 0.6,
      "grad_norm": 6.5554327964782715,
      "learning_rate": 1.3157894736842106e-05,
      "loss": 4.3587,
      "mean_token_accuracy": 0.7785357385873795,
      "step": 75
    },
    {
      "epoch": 0.608,
      "grad_norm": 7.242586135864258,
      "learning_rate": 1.2631578947368422e-05,
      "loss": 4.7259,
      "mean_token_accuracy": 0.7722227722406387,
      "step": 76
    },
    {
      "epoch": 0.616,
      "grad_norm": 6.499831676483154,
      "learning_rate": 1.2105263157894737e-05,
      "loss": 4.582,
      "mean_token_accuracy": 0.8019344508647919,
      "step": 77
    },
    {
      "epoch": 0.624,
      "grad_norm": 6.212286949157715,
      "learning_rate": 1.1578947368421053e-05,
      "loss": 4.3784,
      "mean_token_accuracy": 0.7915610671043396,
      "step": 78
    },
    {
      "epoch": 0.632,
      "grad_norm": 6.525815963745117,
      "learning_rate": 1.1052631578947368e-05,
      "loss": 4.6062,
      "mean_token_accuracy": 0.7760037034749985,
      "step": 79
    },
    {
      "epoch": 0.64,
      "grad_norm": 6.437106609344482,
      "learning_rate": 1.0526315789473684e-05,
      "loss": 4.3811,
      "mean_token_accuracy": 0.7914712876081467,
      "step": 80
    },
    {
      "epoch": 0.648,
      "grad_norm": 6.544358730316162,
      "learning_rate": 1e-05,
      "loss": 4.1778,
      "mean_token_accuracy": 0.813565582036972,
      "step": 81
    },
    {
      "epoch": 0.656,
      "grad_norm": 5.799524307250977,
      "learning_rate": 9.473684210526317e-06,
      "loss": 4.5642,
      "mean_token_accuracy": 0.7896921932697296,
      "step": 82
    },
    {
      "epoch": 0.664,
      "grad_norm": 6.26992130279541,
      "learning_rate": 8.947368421052632e-06,
      "loss": 4.4728,
      "mean_token_accuracy": 0.7932559102773666,
      "step": 83
    },
    {
      "epoch": 0.672,
      "grad_norm": 7.898167133331299,
      "learning_rate": 8.421052631578948e-06,
      "loss": 4.7496,
      "mean_token_accuracy": 0.7824009358882904,
      "step": 84
    },
    {
      "epoch": 0.68,
      "grad_norm": 5.840052127838135,
      "learning_rate": 7.894736842105263e-06,
      "loss": 4.4144,
      "mean_token_accuracy": 0.8065742999315262,
      "step": 85
    },
    {
      "epoch": 0.688,
      "grad_norm": 5.884693145751953,
      "learning_rate": 7.3684210526315784e-06,
      "loss": 4.0572,
      "mean_token_accuracy": 0.8163146525621414,
      "step": 86
    },
    {
      "epoch": 0.696,
      "grad_norm": 6.5094380378723145,
      "learning_rate": 6.842105263157896e-06,
      "loss": 4.692,
      "mean_token_accuracy": 0.7821700870990753,
      "step": 87
    },
    {
      "epoch": 0.704,
      "grad_norm": 6.171222686767578,
      "learning_rate": 6.315789473684211e-06,
      "loss": 4.7373,
      "mean_token_accuracy": 0.7865629494190216,
      "step": 88
    },
    {
      "epoch": 0.712,
      "grad_norm": 6.430420875549316,
      "learning_rate": 5.789473684210527e-06,
      "loss": 5.1443,
      "mean_token_accuracy": 0.7733548134565353,
      "step": 89
    },
    {
      "epoch": 0.72,
      "grad_norm": 6.6331987380981445,
      "learning_rate": 5.263157894736842e-06,
      "loss": 4.8172,
      "mean_token_accuracy": 0.7886734008789062,
      "step": 90
    },
    {
      "epoch": 0.728,
      "grad_norm": 6.18582820892334,
      "learning_rate": 4.736842105263159e-06,
      "loss": 4.0192,
      "mean_token_accuracy": 0.8108067065477371,
      "step": 91
    },
    {
      "epoch": 0.736,
      "grad_norm": 6.787937641143799,
      "learning_rate": 4.210526315789474e-06,
      "loss": 4.3634,
      "mean_token_accuracy": 0.7937927544116974,
      "step": 92
    },
    {
      "epoch": 0.744,
      "grad_norm": 6.101965427398682,
      "learning_rate": 3.6842105263157892e-06,
      "loss": 4.7228,
      "mean_token_accuracy": 0.7906162887811661,
      "step": 93
    },
    {
      "epoch": 0.752,
      "grad_norm": 6.77475118637085,
      "learning_rate": 3.1578947368421056e-06,
      "loss": 4.4398,
      "mean_token_accuracy": 0.7847026586532593,
      "step": 94
    },
    {
      "epoch": 0.76,
      "grad_norm": 6.1204023361206055,
      "learning_rate": 2.631578947368421e-06,
      "loss": 4.3872,
      "mean_token_accuracy": 0.7921232283115387,
      "step": 95
    },
    {
      "epoch": 0.768,
      "grad_norm": 6.262160301208496,
      "learning_rate": 2.105263157894737e-06,
      "loss": 4.2201,
      "mean_token_accuracy": 0.7995105534791946,
      "step": 96
    },
    {
      "epoch": 0.776,
      "grad_norm": 6.433416843414307,
      "learning_rate": 1.5789473684210528e-06,
      "loss": 4.2633,
      "mean_token_accuracy": 0.782690092921257,
      "step": 97
    },
    {
      "epoch": 0.784,
      "grad_norm": 6.012047290802002,
      "learning_rate": 1.0526315789473685e-06,
      "loss": 4.2714,
      "mean_token_accuracy": 0.791715532541275,
      "step": 98
    },
    {
      "epoch": 0.792,
      "grad_norm": 6.2551445960998535,
      "learning_rate": 5.263157894736843e-07,
      "loss": 4.4735,
      "mean_token_accuracy": 0.7887705862522125,
      "step": 99
    },
    {
      "epoch": 0.8,
      "grad_norm": 6.641271591186523,
      "learning_rate": 0.0,
      "loss": 4.7771,
      "mean_token_accuracy": 0.7853755056858063,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1546494935040000.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}