{
  "best_metric": 3.30434513092041,
  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_1208/checkpoint-90000",
  "epoch": 10.0,
  "eval_steps": 1000,
  "global_step": 92910,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005381552039608223,
      "grad_norm": 1.3027253150939941,
      "learning_rate": 0.000294,
      "loss": 8.5174,
      "step": 50
    },
    {
      "epoch": 0.010763104079216447,
      "grad_norm": 1.869943618774414,
      "learning_rate": 0.0005939999999999999,
      "loss": 6.9458,
      "step": 100
    },
    {
      "epoch": 0.01614465611882467,
      "grad_norm": 2.063448190689087,
      "learning_rate": 0.0005996832237905398,
      "loss": 6.473,
      "step": 150
    },
    {
      "epoch": 0.021526208158432893,
      "grad_norm": 1.8779255151748657,
      "learning_rate": 0.0005993599827604783,
      "loss": 6.2586,
      "step": 200
    },
    {
      "epoch": 0.026907760198041114,
      "grad_norm": 0.8987168669700623,
      "learning_rate": 0.0005990367417304169,
      "loss": 6.097,
      "step": 250
    },
    {
      "epoch": 0.03228931223764934,
      "grad_norm": 2.6893460750579834,
      "learning_rate": 0.0005987135007003555,
      "loss": 5.9796,
      "step": 300
    },
    {
      "epoch": 0.03767086427725756,
      "grad_norm": 2.125455379486084,
      "learning_rate": 0.0005983902596702941,
      "loss": 5.8614,
      "step": 350
    },
    {
      "epoch": 0.04305241631686579,
      "grad_norm": 1.041877031326294,
      "learning_rate": 0.0005980670186402327,
      "loss": 5.8053,
      "step": 400
    },
    {
      "epoch": 0.048433968356474004,
      "grad_norm": 0.9991294145584106,
      "learning_rate": 0.0005977437776101713,
      "loss": 5.7346,
      "step": 450
    },
    {
      "epoch": 0.05381552039608223,
      "grad_norm": 0.9753233790397644,
      "learning_rate": 0.0005974205365801099,
      "loss": 5.6336,
      "step": 500
    },
    {
      "epoch": 0.05919707243569045,
      "grad_norm": 1.0149271488189697,
      "learning_rate": 0.0005970972955500484,
      "loss": 5.5819,
      "step": 550
    },
    {
      "epoch": 0.06457862447529868,
      "grad_norm": 0.9937843084335327,
      "learning_rate": 0.0005967740545199869,
      "loss": 5.5192,
      "step": 600
    },
    {
      "epoch": 0.0699601765149069,
      "grad_norm": 1.5022847652435303,
      "learning_rate": 0.0005964508134899256,
      "loss": 5.4219,
      "step": 650
    },
    {
      "epoch": 0.07534172855451512,
      "grad_norm": 1.5764541625976562,
      "learning_rate": 0.0005961275724598642,
      "loss": 5.3884,
      "step": 700
    },
    {
      "epoch": 0.08072328059412334,
      "grad_norm": 1.1085480451583862,
      "learning_rate": 0.0005958043314298028,
      "loss": 5.3324,
      "step": 750
    },
    {
      "epoch": 0.08610483263373157,
      "grad_norm": 1.2022193670272827,
      "learning_rate": 0.0005954810903997413,
      "loss": 5.2821,
      "step": 800
    },
    {
      "epoch": 0.09148638467333979,
      "grad_norm": 1.2098697423934937,
      "learning_rate": 0.00059515784936968,
      "loss": 5.2259,
      "step": 850
    },
    {
      "epoch": 0.09686793671294801,
      "grad_norm": 1.3306306600570679,
      "learning_rate": 0.0005948346083396185,
      "loss": 5.136,
      "step": 900
    },
    {
      "epoch": 0.10224948875255624,
      "grad_norm": 0.839387059211731,
      "learning_rate": 0.0005945113673095572,
      "loss": 5.1285,
      "step": 950
    },
    {
      "epoch": 0.10763104079216446,
      "grad_norm": 1.0255858898162842,
      "learning_rate": 0.0005941881262794957,
      "loss": 5.0836,
      "step": 1000
    },
    {
      "epoch": 0.10763104079216446,
      "eval_accuracy": 0.22717450012337537,
      "eval_loss": 5.0228166580200195,
      "eval_runtime": 185.5233,
      "eval_samples_per_second": 97.082,
      "eval_steps_per_second": 6.069,
      "step": 1000
    },
    {
      "epoch": 0.11301259283177269,
      "grad_norm": 1.2033151388168335,
      "learning_rate": 0.0005938648852494342,
      "loss": 5.0406,
      "step": 1050
    },
    {
      "epoch": 0.1183941448713809,
      "grad_norm": 1.600353717803955,
      "learning_rate": 0.0005935416442193729,
      "loss": 5.0267,
      "step": 1100
    },
    {
      "epoch": 0.12377569691098914,
      "grad_norm": 1.2065142393112183,
      "learning_rate": 0.0005932184031893114,
      "loss": 4.9908,
      "step": 1150
    },
    {
      "epoch": 0.12915724895059735,
      "grad_norm": 1.0816049575805664,
      "learning_rate": 0.0005928951621592501,
      "loss": 4.9785,
      "step": 1200
    },
    {
      "epoch": 0.13453880099020557,
      "grad_norm": 1.2168177366256714,
      "learning_rate": 0.0005925719211291886,
      "loss": 4.9248,
      "step": 1250
    },
    {
      "epoch": 0.1399203530298138,
      "grad_norm": 1.3226429224014282,
      "learning_rate": 0.0005922486800991272,
      "loss": 4.8858,
      "step": 1300
    },
    {
      "epoch": 0.14530190506942203,
      "grad_norm": 0.9506095051765442,
      "learning_rate": 0.0005919254390690658,
      "loss": 4.8763,
      "step": 1350
    },
    {
      "epoch": 0.15068345710903025,
      "grad_norm": 0.8765633702278137,
      "learning_rate": 0.0005916021980390043,
      "loss": 4.8708,
      "step": 1400
    },
    {
      "epoch": 0.15606500914863847,
      "grad_norm": 0.890556812286377,
      "learning_rate": 0.0005912789570089429,
      "loss": 4.839,
      "step": 1450
    },
    {
      "epoch": 0.16144656118824668,
      "grad_norm": 1.441969394683838,
      "learning_rate": 0.0005909557159788815,
      "loss": 4.7959,
      "step": 1500
    },
    {
      "epoch": 0.1668281132278549,
      "grad_norm": 1.151434302330017,
      "learning_rate": 0.0005906324749488202,
      "loss": 4.7874,
      "step": 1550
    },
    {
      "epoch": 0.17220966526746315,
      "grad_norm": 0.8059516549110413,
      "learning_rate": 0.0005903092339187587,
      "loss": 4.755,
      "step": 1600
    },
    {
      "epoch": 0.17759121730707136,
      "grad_norm": 0.9665142893791199,
      "learning_rate": 0.0005899859928886973,
      "loss": 4.7388,
      "step": 1650
    },
    {
      "epoch": 0.18297276934667958,
      "grad_norm": 1.0503524541854858,
      "learning_rate": 0.0005896627518586358,
      "loss": 4.6922,
      "step": 1700
    },
    {
      "epoch": 0.1883543213862878,
      "grad_norm": 0.9099863171577454,
      "learning_rate": 0.0005893395108285745,
      "loss": 4.6958,
      "step": 1750
    },
    {
      "epoch": 0.19373587342589602,
      "grad_norm": 0.9622862935066223,
      "learning_rate": 0.0005890162697985131,
      "loss": 4.6703,
      "step": 1800
    },
    {
      "epoch": 0.19911742546550426,
      "grad_norm": 0.9242944717407227,
      "learning_rate": 0.0005886930287684516,
      "loss": 4.6431,
      "step": 1850
    },
    {
      "epoch": 0.20449897750511248,
      "grad_norm": 1.066117525100708,
      "learning_rate": 0.0005883697877383902,
      "loss": 4.6229,
      "step": 1900
    },
    {
      "epoch": 0.2098805295447207,
      "grad_norm": 0.7566308975219727,
      "learning_rate": 0.0005880465467083287,
      "loss": 4.608,
      "step": 1950
    },
    {
      "epoch": 0.2152620815843289,
      "grad_norm": 0.9950522184371948,
      "learning_rate": 0.0005877233056782674,
      "loss": 4.5783,
      "step": 2000
    },
    {
      "epoch": 0.2152620815843289,
      "eval_accuracy": 0.2709776945400077,
      "eval_loss": 4.502764701843262,
      "eval_runtime": 184.3265,
      "eval_samples_per_second": 97.712,
      "eval_steps_per_second": 6.109,
      "step": 2000
    },
    {
      "epoch": 0.22064363362393713,
      "grad_norm": 0.7409501671791077,
      "learning_rate": 0.000587400064648206,
      "loss": 4.5489,
      "step": 2050
    },
    {
      "epoch": 0.22602518566354537,
      "grad_norm": 1.014644742012024,
      "learning_rate": 0.0005870768236181446,
      "loss": 4.5494,
      "step": 2100
    },
    {
      "epoch": 0.2314067377031536,
      "grad_norm": 1.1158607006072998,
      "learning_rate": 0.0005867535825880831,
      "loss": 4.5381,
      "step": 2150
    },
    {
      "epoch": 0.2367882897427618,
      "grad_norm": 0.9258712530136108,
      "learning_rate": 0.0005864303415580218,
      "loss": 4.5231,
      "step": 2200
    },
    {
      "epoch": 0.24216984178237003,
      "grad_norm": 1.1612284183502197,
      "learning_rate": 0.0005861071005279603,
      "loss": 4.4892,
      "step": 2250
    },
    {
      "epoch": 0.24755139382197827,
      "grad_norm": 0.8133726119995117,
      "learning_rate": 0.0005857838594978988,
      "loss": 4.4964,
      "step": 2300
    },
    {
      "epoch": 0.2529329458615865,
      "grad_norm": 0.9338302612304688,
      "learning_rate": 0.0005854606184678375,
      "loss": 4.4625,
      "step": 2350
    },
    {
      "epoch": 0.2583144979011947,
      "grad_norm": 0.8687306642532349,
      "learning_rate": 0.000585137377437776,
      "loss": 4.4476,
      "step": 2400
    },
    {
      "epoch": 0.2636960499408029,
      "grad_norm": 1.1344246864318848,
      "learning_rate": 0.0005848141364077147,
      "loss": 4.4284,
      "step": 2450
    },
    {
      "epoch": 0.26907760198041114,
      "grad_norm": 0.8330696225166321,
      "learning_rate": 0.0005844908953776532,
      "loss": 4.4127,
      "step": 2500
    },
    {
      "epoch": 0.27445915402001936,
      "grad_norm": 0.9070584774017334,
      "learning_rate": 0.0005841676543475918,
      "loss": 4.396,
      "step": 2550
    },
    {
      "epoch": 0.2798407060596276,
      "grad_norm": 0.9120801687240601,
      "learning_rate": 0.0005838444133175304,
      "loss": 4.3979,
      "step": 2600
    },
    {
      "epoch": 0.2852222580992358,
      "grad_norm": 0.9401586055755615,
      "learning_rate": 0.0005835211722874689,
      "loss": 4.3884,
      "step": 2650
    },
    {
      "epoch": 0.29060381013884407,
      "grad_norm": 0.7761553525924683,
      "learning_rate": 0.0005831979312574076,
      "loss": 4.368,
      "step": 2700
    },
    {
      "epoch": 0.2959853621784523,
      "grad_norm": 0.7783112525939941,
      "learning_rate": 0.0005828746902273461,
      "loss": 4.3638,
      "step": 2750
    },
    {
      "epoch": 0.3013669142180605,
      "grad_norm": 0.8098959922790527,
      "learning_rate": 0.0005825514491972847,
      "loss": 4.3602,
      "step": 2800
    },
    {
      "epoch": 0.3067484662576687,
      "grad_norm": 0.9200159311294556,
      "learning_rate": 0.0005822282081672233,
      "loss": 4.3382,
      "step": 2850
    },
    {
      "epoch": 0.31213001829727693,
      "grad_norm": 0.6858677268028259,
      "learning_rate": 0.000581904967137162,
      "loss": 4.3457,
      "step": 2900
    },
    {
      "epoch": 0.31751157033688515,
      "grad_norm": 0.6915244460105896,
      "learning_rate": 0.0005815817261071005,
      "loss": 4.3249,
      "step": 2950
    },
    {
      "epoch": 0.32289312237649337,
      "grad_norm": 0.7182185649871826,
      "learning_rate": 0.000581258485077039,
      "loss": 4.31,
      "step": 3000
    },
    {
      "epoch": 0.32289312237649337,
      "eval_accuracy": 0.2989587467802075,
      "eval_loss": 4.229890823364258,
      "eval_runtime": 184.4312,
      "eval_samples_per_second": 97.657,
      "eval_steps_per_second": 6.105,
      "step": 3000
    },
    {
      "epoch": 0.3282746744161016,
      "grad_norm": 0.7634304165840149,
      "learning_rate": 0.0005809352440469776,
      "loss": 4.2854,
      "step": 3050
    },
    {
      "epoch": 0.3336562264557098,
      "grad_norm": 1.2449687719345093,
      "learning_rate": 0.0005806120030169162,
      "loss": 4.3057,
      "step": 3100
    },
    {
      "epoch": 0.3390377784953181,
      "grad_norm": 0.6714697480201721,
      "learning_rate": 0.0005802887619868548,
      "loss": 4.2843,
      "step": 3150
    },
    {
      "epoch": 0.3444193305349263,
      "grad_norm": 0.9516893029212952,
      "learning_rate": 0.0005799655209567934,
      "loss": 4.2684,
      "step": 3200
    },
    {
      "epoch": 0.3498008825745345,
      "grad_norm": 0.783854067325592,
      "learning_rate": 0.000579642279926732,
      "loss": 4.2611,
      "step": 3250
    },
    {
      "epoch": 0.35518243461414273,
      "grad_norm": 0.6821441054344177,
      "learning_rate": 0.0005793190388966706,
      "loss": 4.2546,
      "step": 3300
    },
    {
      "epoch": 0.36056398665375095,
      "grad_norm": 0.7214898467063904,
      "learning_rate": 0.0005789957978666092,
      "loss": 4.2497,
      "step": 3350
    },
    {
      "epoch": 0.36594553869335916,
      "grad_norm": 0.6362515091896057,
      "learning_rate": 0.0005786725568365477,
      "loss": 4.2347,
      "step": 3400
    },
    {
      "epoch": 0.3713270907329674,
      "grad_norm": 2.6991851329803467,
      "learning_rate": 0.0005783493158064862,
      "loss": 4.2186,
      "step": 3450
    },
    {
      "epoch": 0.3767086427725756,
      "grad_norm": 1.0031359195709229,
      "learning_rate": 0.0005780260747764249,
      "loss": 4.2248,
      "step": 3500
    },
    {
      "epoch": 0.3820901948121838,
      "grad_norm": 0.8442318439483643,
      "learning_rate": 0.0005777028337463635,
      "loss": 4.2146,
      "step": 3550
    },
    {
      "epoch": 0.38747174685179203,
      "grad_norm": 0.7066633701324463,
      "learning_rate": 0.0005773795927163021,
      "loss": 4.2087,
      "step": 3600
    },
    {
      "epoch": 0.3928532988914003,
      "grad_norm": 0.6780686378479004,
      "learning_rate": 0.0005770563516862406,
      "loss": 4.2092,
      "step": 3650
    },
    {
      "epoch": 0.3982348509310085,
      "grad_norm": 0.7969300746917725,
      "learning_rate": 0.0005767331106561793,
      "loss": 4.1909,
      "step": 3700
    },
    {
      "epoch": 0.40361640297061674,
      "grad_norm": 0.7795552611351013,
      "learning_rate": 0.0005764098696261178,
      "loss": 4.1996,
      "step": 3750
    },
    {
      "epoch": 0.40899795501022496,
      "grad_norm": 0.6940560936927795,
      "learning_rate": 0.0005760866285960565,
      "loss": 4.1672,
      "step": 3800
    },
    {
      "epoch": 0.4143795070498332,
      "grad_norm": 0.8106645345687866,
      "learning_rate": 0.000575763387565995,
      "loss": 4.1588,
      "step": 3850
    },
    {
      "epoch": 0.4197610590894414,
      "grad_norm": 0.8771141171455383,
      "learning_rate": 0.0005754401465359335,
      "loss": 4.1567,
      "step": 3900
    },
    {
      "epoch": 0.4251426111290496,
      "grad_norm": 0.6984492540359497,
      "learning_rate": 0.0005751169055058722,
      "loss": 4.1593,
      "step": 3950
    },
    {
      "epoch": 0.4305241631686578,
      "grad_norm": 0.6983693838119507,
      "learning_rate": 0.0005747936644758107,
      "loss": 4.1786,
      "step": 4000
    },
    {
      "epoch": 0.4305241631686578,
      "eval_accuracy": 0.3122335220018295,
      "eval_loss": 4.096415996551514,
      "eval_runtime": 184.3549,
      "eval_samples_per_second": 97.697,
      "eval_steps_per_second": 6.108,
      "step": 4000
    },
    {
      "epoch": 0.43590571520826604,
      "grad_norm": 0.6588996052742004,
      "learning_rate": 0.0005744704234457494,
      "loss": 4.1513,
      "step": 4050
    },
    {
      "epoch": 0.44128726724787426,
      "grad_norm": 0.7628340721130371,
      "learning_rate": 0.0005741471824156879,
      "loss": 4.1355,
      "step": 4100
    },
    {
      "epoch": 0.44666881928748253,
      "grad_norm": 0.726026177406311,
      "learning_rate": 0.0005738239413856265,
      "loss": 4.1351,
      "step": 4150
    },
    {
      "epoch": 0.45205037132709075,
      "grad_norm": 0.6202034950256348,
      "learning_rate": 0.0005735007003555651,
      "loss": 4.1324,
      "step": 4200
    },
    {
      "epoch": 0.45743192336669897,
      "grad_norm": 0.7028141021728516,
      "learning_rate": 0.0005731774593255036,
      "loss": 4.1229,
      "step": 4250
    },
    {
      "epoch": 0.4628134754063072,
      "grad_norm": 0.5698866248130798,
      "learning_rate": 0.0005728542182954422,
      "loss": 4.1281,
      "step": 4300
    },
    {
      "epoch": 0.4681950274459154,
      "grad_norm": 0.6480181813240051,
      "learning_rate": 0.0005725309772653808,
      "loss": 4.1226,
      "step": 4350
    },
    {
      "epoch": 0.4735765794855236,
      "grad_norm": 0.749788224697113,
      "learning_rate": 0.0005722077362353195,
      "loss": 4.1099,
      "step": 4400
    },
    {
      "epoch": 0.47895813152513184,
      "grad_norm": 0.6271551251411438,
      "learning_rate": 0.000571884495205258,
      "loss": 4.1148,
      "step": 4450
    },
    {
      "epoch": 0.48433968356474005,
      "grad_norm": 0.7270399332046509,
      "learning_rate": 0.0005715612541751966,
      "loss": 4.112,
      "step": 4500
    },
    {
      "epoch": 0.48972123560434827,
      "grad_norm": 0.5946004986763,
      "learning_rate": 0.0005712380131451351,
      "loss": 4.1068,
      "step": 4550
    },
    {
      "epoch": 0.49510278764395654,
      "grad_norm": 0.6538162231445312,
      "learning_rate": 0.0005709147721150738,
      "loss": 4.0989,
      "step": 4600
    },
    {
      "epoch": 0.5004843396835648,
      "grad_norm": 0.6393954157829285,
      "learning_rate": 0.0005705915310850124,
      "loss": 4.1014,
      "step": 4650
    },
    {
      "epoch": 0.505865891723173,
      "grad_norm": 0.6512112617492676,
      "learning_rate": 0.000570268290054951,
      "loss": 4.0907,
      "step": 4700
    },
    {
      "epoch": 0.5112474437627812,
      "grad_norm": 0.6690767407417297,
      "learning_rate": 0.0005699450490248895,
      "loss": 4.0755,
      "step": 4750
    },
    {
      "epoch": 0.5166289958023894,
      "grad_norm": 0.8351621627807617,
      "learning_rate": 0.0005696218079948281,
      "loss": 4.0729,
      "step": 4800
    },
    {
      "epoch": 0.5220105478419976,
      "grad_norm": 0.5802532434463501,
      "learning_rate": 0.0005692985669647667,
      "loss": 4.0563,
      "step": 4850
    },
    {
      "epoch": 0.5273920998816058,
      "grad_norm": 0.5896292328834534,
      "learning_rate": 0.0005689753259347053,
      "loss": 4.067,
      "step": 4900
    },
    {
      "epoch": 0.5327736519212141,
      "grad_norm": 0.636608362197876,
      "learning_rate": 0.0005686520849046439,
      "loss": 4.0503,
      "step": 4950
    },
    {
      "epoch": 0.5381552039608223,
      "grad_norm": 0.6797523498535156,
      "learning_rate": 0.0005683288438745824,
      "loss": 4.0536,
      "step": 5000
    },
    {
      "epoch": 0.5381552039608223,
      "eval_accuracy": 0.3220743227040748,
      "eval_loss": 3.988776922225952,
      "eval_runtime": 184.5323,
      "eval_samples_per_second": 97.604,
      "eval_steps_per_second": 6.102,
      "step": 5000
    },
    {
      "epoch": 0.5435367560004305,
      "grad_norm": 0.6974573135375977,
      "learning_rate": 0.0005680056028445211,
      "loss": 4.0728,
      "step": 5050
    },
    {
      "epoch": 0.5489183080400387,
      "grad_norm": 0.6960166096687317,
      "learning_rate": 0.0005676823618144596,
      "loss": 4.0342,
      "step": 5100
    },
    {
      "epoch": 0.5542998600796469,
      "grad_norm": 0.6508740782737732,
      "learning_rate": 0.0005673591207843981,
      "loss": 4.0413,
      "step": 5150
    },
    {
      "epoch": 0.5596814121192552,
      "grad_norm": 0.5421522259712219,
      "learning_rate": 0.0005670358797543368,
      "loss": 4.0522,
      "step": 5200
    },
    {
      "epoch": 0.5650629641588634,
      "grad_norm": 0.6057594418525696,
      "learning_rate": 0.0005667126387242753,
      "loss": 4.0246,
      "step": 5250
    },
    {
      "epoch": 0.5704445161984716,
      "grad_norm": 0.6709496974945068,
      "learning_rate": 0.000566389397694214,
      "loss": 4.0335,
      "step": 5300
    },
    {
      "epoch": 0.5758260682380799,
      "grad_norm": 0.7465163469314575,
      "learning_rate": 0.0005660661566641525,
      "loss": 4.0373,
      "step": 5350
    },
    {
      "epoch": 0.5812076202776881,
      "grad_norm": 0.6498463749885559,
      "learning_rate": 0.0005657429156340911,
      "loss": 4.0334,
      "step": 5400
    },
    {
      "epoch": 0.5865891723172963,
      "grad_norm": 0.6239168047904968,
      "learning_rate": 0.0005654196746040297,
      "loss": 4.0123,
      "step": 5450
    },
    {
      "epoch": 0.5919707243569046,
      "grad_norm": 0.702239453792572,
      "learning_rate": 0.0005650964335739684,
      "loss": 4.0362,
      "step": 5500
    },
    {
      "epoch": 0.5973522763965128,
      "grad_norm": 0.7050323486328125,
      "learning_rate": 0.0005647731925439069,
      "loss": 4.0221,
      "step": 5550
    },
    {
      "epoch": 0.602733828436121,
      "grad_norm": 0.6412404179573059,
      "learning_rate": 0.0005644499515138454,
      "loss": 4.0115,
      "step": 5600
    },
    {
      "epoch": 0.6081153804757292,
      "grad_norm": 0.5876803398132324,
      "learning_rate": 0.000564126710483784,
      "loss": 4.019,
      "step": 5650
    },
    {
      "epoch": 0.6134969325153374,
      "grad_norm": 0.7241882085800171,
      "learning_rate": 0.0005638034694537226,
      "loss": 4.0124,
      "step": 5700
    },
    {
      "epoch": 0.6188784845549457,
      "grad_norm": 0.7000322937965393,
      "learning_rate": 0.0005634802284236612,
      "loss": 3.985,
      "step": 5750
    },
    {
      "epoch": 0.6242600365945539,
      "grad_norm": 0.6904421448707581,
      "learning_rate": 0.0005631569873935998,
      "loss": 4.0069,
      "step": 5800
    },
    {
      "epoch": 0.6296415886341621,
      "grad_norm": 0.6670873761177063,
      "learning_rate": 0.0005628337463635384,
      "loss": 3.9828,
      "step": 5850
    },
    {
      "epoch": 0.6350231406737703,
      "grad_norm": 0.7039216160774231,
      "learning_rate": 0.0005625105053334769,
      "loss": 3.9863,
      "step": 5900
    },
    {
      "epoch": 0.6404046927133785,
      "grad_norm": 0.6507138609886169,
      "learning_rate": 0.0005621872643034155,
      "loss": 3.9983,
      "step": 5950
    },
    {
      "epoch": 0.6457862447529867,
      "grad_norm": 0.5310302972793579,
      "learning_rate": 0.0005618640232733541,
      "loss": 3.9918,
      "step": 6000
    },
    {
      "epoch": 0.6457862447529867,
      "eval_accuracy": 0.3281852870734247,
      "eval_loss": 3.9185032844543457,
      "eval_runtime": 184.1254,
      "eval_samples_per_second": 97.819,
      "eval_steps_per_second": 6.115,
      "step": 6000
    },
    {
      "epoch": 0.651167796792595,
      "grad_norm": 0.5807756185531616,
      "learning_rate": 0.0005615407822432927,
      "loss": 3.9858,
      "step": 6050
    },
    {
      "epoch": 0.6565493488322032,
      "grad_norm": 0.6004651188850403,
      "learning_rate": 0.0005612175412132313,
      "loss": 3.9621,
      "step": 6100
    },
    {
      "epoch": 0.6619309008718114,
      "grad_norm": 0.780788004398346,
      "learning_rate": 0.0005608943001831699,
      "loss": 4.0063,
      "step": 6150
    },
    {
      "epoch": 0.6673124529114196,
      "grad_norm": 0.6192348003387451,
      "learning_rate": 0.0005605710591531085,
      "loss": 3.9821,
      "step": 6200
    },
    {
      "epoch": 0.6726940049510278,
      "grad_norm": 0.5568573474884033,
      "learning_rate": 0.000560247818123047,
      "loss": 3.9722,
      "step": 6250
    },
    {
      "epoch": 0.6780755569906362,
      "grad_norm": 0.6049451231956482,
      "learning_rate": 0.0005599245770929855,
      "loss": 3.9517,
      "step": 6300
    },
    {
      "epoch": 0.6834571090302444,
      "grad_norm": 0.6511033177375793,
      "learning_rate": 0.0005596013360629242,
      "loss": 3.9879,
      "step": 6350
    },
    {
      "epoch": 0.6888386610698526,
      "grad_norm": 0.5751333236694336,
      "learning_rate": 0.0005592780950328628,
      "loss": 3.9719,
      "step": 6400
    },
    {
      "epoch": 0.6942202131094608,
      "grad_norm": 0.5952321290969849,
      "learning_rate": 0.0005589548540028014,
      "loss": 3.9546,
      "step": 6450
    },
    {
      "epoch": 0.699601765149069,
      "grad_norm": 0.5651475787162781,
      "learning_rate": 0.0005586316129727399,
      "loss": 3.9592,
      "step": 6500
    },
    {
      "epoch": 0.7049833171886772,
      "grad_norm": 0.6752631068229675,
      "learning_rate": 0.0005583083719426786,
      "loss": 3.9475,
      "step": 6550
    },
    {
      "epoch": 0.7103648692282855,
      "grad_norm": 0.6644642949104309,
      "learning_rate": 0.0005579851309126171,
      "loss": 3.9202,
      "step": 6600
    },
    {
      "epoch": 0.7157464212678937,
      "grad_norm": 0.6549872159957886,
      "learning_rate": 0.0005576618898825558,
      "loss": 3.9537,
      "step": 6650
    },
    {
      "epoch": 0.7211279733075019,
      "grad_norm": 0.7222427129745483,
      "learning_rate": 0.0005573386488524943,
      "loss": 3.9412,
      "step": 6700
    },
    {
      "epoch": 0.7265095253471101,
      "grad_norm": 0.5681380033493042,
      "learning_rate": 0.0005570154078224328,
      "loss": 3.9501,
      "step": 6750
    },
    {
      "epoch": 0.7318910773867183,
      "grad_norm": 0.7449719905853271,
      "learning_rate": 0.0005566921667923715,
      "loss": 3.9301,
      "step": 6800
    },
    {
      "epoch": 0.7372726294263265,
      "grad_norm": 0.6220486760139465,
      "learning_rate": 0.00055636892576231,
      "loss": 3.9138,
      "step": 6850
    },
    {
      "epoch": 0.7426541814659348,
      "grad_norm": 0.5239010453224182,
      "learning_rate": 0.0005560456847322487,
      "loss": 3.9303,
      "step": 6900
    },
    {
      "epoch": 0.748035733505543,
      "grad_norm": 0.5946761965751648,
      "learning_rate": 0.0005557224437021872,
      "loss": 3.9261,
      "step": 6950
    },
    {
      "epoch": 0.7534172855451512,
      "grad_norm": 0.6462786793708801,
      "learning_rate": 0.0005553992026721258,
      "loss": 3.9177,
      "step": 7000
    },
    {
      "epoch": 0.7534172855451512,
      "eval_accuracy": 0.33319483711899917,
      "eval_loss": 3.859447956085205,
      "eval_runtime": 184.513,
      "eval_samples_per_second": 97.614,
      "eval_steps_per_second": 6.103,
      "step": 7000
    },
    {
      "epoch": 0.7587988375847594,
      "grad_norm": 0.6271085143089294,
      "learning_rate": 0.0005550759616420644,
      "loss": 3.9277,
      "step": 7050
    },
    {
      "epoch": 0.7641803896243676,
      "grad_norm": 0.5990097522735596,
      "learning_rate": 0.000554752720612003,
      "loss": 3.9301,
      "step": 7100
    },
    {
      "epoch": 0.7695619416639758,
      "grad_norm": 0.6081097722053528,
      "learning_rate": 0.0005544294795819415,
      "loss": 3.9422,
      "step": 7150
    },
    {
      "epoch": 0.7749434937035841,
      "grad_norm": 0.5346890091896057,
      "learning_rate": 0.0005541062385518801,
      "loss": 3.9186,
      "step": 7200
    },
    {
      "epoch": 0.7803250457431924,
      "grad_norm": 0.5819073915481567,
      "learning_rate": 0.0005537829975218188,
      "loss": 3.8904,
      "step": 7250
    },
    {
      "epoch": 0.7857065977828006,
      "grad_norm": 0.6600393652915955,
      "learning_rate": 0.0005534597564917573,
      "loss": 3.9178,
      "step": 7300
    },
    {
      "epoch": 0.7910881498224088,
      "grad_norm": 0.5617551207542419,
      "learning_rate": 0.0005531365154616959,
      "loss": 3.9553,
      "step": 7350
    },
    {
      "epoch": 0.796469701862017,
      "grad_norm": 0.5963563323020935,
      "learning_rate": 0.0005528132744316344,
      "loss": 3.9078,
      "step": 7400
    },
    {
      "epoch": 0.8018512539016253,
      "grad_norm": 0.575839102268219,
      "learning_rate": 0.0005524900334015731,
      "loss": 3.8958,
      "step": 7450
    },
    {
      "epoch": 0.8072328059412335,
      "grad_norm": 0.5678157210350037,
      "learning_rate": 0.0005521667923715117,
      "loss": 3.9134,
      "step": 7500
    },
    {
      "epoch": 0.8126143579808417,
      "grad_norm": 0.6379591226577759,
      "learning_rate": 0.0005518435513414502,
      "loss": 3.8998,
      "step": 7550
    },
    {
      "epoch": 0.8179959100204499,
      "grad_norm": 0.5917794704437256,
      "learning_rate": 0.0005515203103113888,
      "loss": 3.9019,
      "step": 7600
    },
    {
      "epoch": 0.8233774620600581,
      "grad_norm": 0.5240581035614014,
      "learning_rate": 0.0005511970692813274,
      "loss": 3.8809,
      "step": 7650
    },
    {
      "epoch": 0.8287590140996663,
      "grad_norm": 0.5748676061630249,
      "learning_rate": 0.000550873828251266,
      "loss": 3.9109,
      "step": 7700
    },
    {
      "epoch": 0.8341405661392746,
      "grad_norm": 0.6109129190444946,
      "learning_rate": 0.0005505505872212045,
      "loss": 3.882,
      "step": 7750
    },
    {
      "epoch": 0.8395221181788828,
      "grad_norm": 0.6101870536804199,
      "learning_rate": 0.0005502273461911432,
      "loss": 3.873,
      "step": 7800
    },
    {
      "epoch": 0.844903670218491,
      "grad_norm": 0.6166355013847351,
      "learning_rate": 0.0005499041051610817,
      "loss": 3.8915,
      "step": 7850
    },
    {
      "epoch": 0.8502852222580992,
      "grad_norm": 0.5955120325088501,
      "learning_rate": 0.0005495808641310204,
      "loss": 3.8613,
      "step": 7900
    },
    {
      "epoch": 0.8556667742977074,
      "grad_norm": 0.6824830770492554,
      "learning_rate": 0.0005492576231009589,
      "loss": 3.8744,
      "step": 7950
    },
    {
      "epoch": 0.8610483263373157,
      "grad_norm": 0.674412727355957,
      "learning_rate": 0.0005489343820708974,
      "loss": 3.8897,
      "step": 8000
    },
    {
      "epoch": 0.8610483263373157,
      "eval_accuracy": 0.33778368318295593,
      "eval_loss": 3.8161725997924805,
      "eval_runtime": 184.3308,
      "eval_samples_per_second": 97.71,
      "eval_steps_per_second": 6.109,
      "step": 8000
    },
    {
      "epoch": 0.8664298783769239,
      "grad_norm": 0.5796586871147156,
      "learning_rate": 0.0005486176058614372,
      "loss": 3.8789,
      "step": 8050
    },
    {
      "epoch": 0.8718114304165321,
      "grad_norm": 0.5682729482650757,
      "learning_rate": 0.0005482943648313759,
      "loss": 3.8718,
      "step": 8100
    },
    {
      "epoch": 0.8771929824561403,
      "grad_norm": 0.5278633236885071,
      "learning_rate": 0.0005479711238013145,
      "loss": 3.8675,
      "step": 8150
    },
    {
      "epoch": 0.8825745344957485,
      "grad_norm": 0.5991818904876709,
      "learning_rate": 0.0005476478827712531,
      "loss": 3.8915,
      "step": 8200
    },
    {
      "epoch": 0.8879560865353568,
      "grad_norm": 0.5216752886772156,
      "learning_rate": 0.0005473246417411916,
      "loss": 3.8704,
      "step": 8250
    },
    {
      "epoch": 0.8933376385749651,
      "grad_norm": 0.6296066045761108,
      "learning_rate": 0.0005470014007111302,
      "loss": 3.8679,
      "step": 8300
    },
    {
      "epoch": 0.8987191906145733,
      "grad_norm": 0.6122331023216248,
      "learning_rate": 0.0005466781596810688,
      "loss": 3.874,
      "step": 8350
    },
    {
      "epoch": 0.9041007426541815,
      "grad_norm": 0.5540187358856201,
      "learning_rate": 0.0005463549186510073,
      "loss": 3.8623,
      "step": 8400
    },
    {
      "epoch": 0.9094822946937897,
      "grad_norm": 0.5371411442756653,
      "learning_rate": 0.000546031677620946,
      "loss": 3.8693,
      "step": 8450
    },
    {
      "epoch": 0.9148638467333979,
      "grad_norm": 0.5804259181022644,
      "learning_rate": 0.0005457084365908845,
      "loss": 3.8625,
      "step": 8500
    },
    {
      "epoch": 0.9202453987730062,
      "grad_norm": 0.6279832720756531,
      "learning_rate": 0.0005453851955608232,
      "loss": 3.8759,
      "step": 8550
    },
    {
      "epoch": 0.9256269508126144,
      "grad_norm": 0.6223065853118896,
      "learning_rate": 0.0005450619545307617,
      "loss": 3.8436,
      "step": 8600
    },
    {
      "epoch": 0.9310085028522226,
      "grad_norm": 0.529935359954834,
      "learning_rate": 0.0005447387135007003,
      "loss": 3.8584,
      "step": 8650
    },
    {
      "epoch": 0.9363900548918308,
      "grad_norm": 0.6262571811676025,
      "learning_rate": 0.0005444154724706389,
      "loss": 3.8652,
      "step": 8700
    },
    {
      "epoch": 0.941771606931439,
      "grad_norm": 0.6989444494247437,
      "learning_rate": 0.0005440922314405775,
      "loss": 3.8577,
      "step": 8750
    },
    {
      "epoch": 0.9471531589710472,
      "grad_norm": 0.5584626197814941,
      "learning_rate": 0.0005437689904105161,
      "loss": 3.8512,
      "step": 8800
    },
    {
      "epoch": 0.9525347110106555,
      "grad_norm": 0.5014194250106812,
      "learning_rate": 0.0005434457493804546,
      "loss": 3.8494,
      "step": 8850
    },
| { | |
| "epoch": 0.9579162630502637, | |
| "grad_norm": 0.5900559425354004, | |
| "learning_rate": 0.0005431225083503932, | |
| "loss": 3.8554, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.9632978150898719, | |
| "grad_norm": 0.5138096213340759, | |
| "learning_rate": 0.0005427992673203318, | |
| "loss": 3.8558, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 0.9686793671294801, | |
| "grad_norm": 0.6263041496276855, | |
| "learning_rate": 0.0005424760262902704, | |
| "loss": 3.8345, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.9686793671294801, | |
| "eval_accuracy": 0.34189869400315376, | |
| "eval_loss": 3.7795462608337402, | |
| "eval_runtime": 184.0848, | |
| "eval_samples_per_second": 97.841, | |
| "eval_steps_per_second": 6.117, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.9740609191690883, | |
| "grad_norm": 0.5770505666732788, | |
| "learning_rate": 0.000542152785260209, | |
| "loss": 3.8348, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 0.9794424712086965, | |
| "grad_norm": 0.5001862049102783, | |
| "learning_rate": 0.0005418295442301476, | |
| "loss": 3.8579, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.9848240232483048, | |
| "grad_norm": 0.6384955048561096, | |
| "learning_rate": 0.0005415063032000861, | |
| "loss": 3.8485, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 0.9902055752879131, | |
| "grad_norm": 0.516613245010376, | |
| "learning_rate": 0.0005411830621700248, | |
| "loss": 3.8487, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.9955871273275213, | |
| "grad_norm": 0.6885047554969788, | |
| "learning_rate": 0.0005408598211399633, | |
| "loss": 3.8337, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 1.0009686793671295, | |
| "grad_norm": 0.632448673248291, | |
| "learning_rate": 0.0005405365801099019, | |
| "loss": 3.8242, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.0063502314067376, | |
| "grad_norm": 0.5753522515296936, | |
| "learning_rate": 0.0005402133390798405, | |
| "loss": 3.7651, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 1.011731783446346, | |
| "grad_norm": 0.575958251953125, | |
| "learning_rate": 0.000539890098049779, | |
| "loss": 3.7743, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.017113335485954, | |
| "grad_norm": 0.5731710195541382, | |
| "learning_rate": 0.0005395668570197177, | |
| "loss": 3.7657, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 1.0224948875255624, | |
| "grad_norm": 0.5972917079925537, | |
| "learning_rate": 0.0005392436159896562, | |
| "loss": 3.7855, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.0278764395651705, | |
| "grad_norm": 0.579627275466919, | |
| "learning_rate": 0.0005389203749595948, | |
| "loss": 3.7698, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 1.0332579916047788, | |
| "grad_norm": 0.5724299550056458, | |
| "learning_rate": 0.0005385971339295334, | |
| "loss": 3.7752, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.0386395436443872, | |
| "grad_norm": 0.6032650470733643, | |
| "learning_rate": 0.000538273892899472, | |
| "loss": 3.7569, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 1.0440210956839953, | |
| "grad_norm": 0.5845139622688293, | |
| "learning_rate": 0.0005379506518694106, | |
| "loss": 3.7737, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.0494026477236036, | |
| "grad_norm": 0.6454305052757263, | |
| "learning_rate": 0.0005376274108393491, | |
| "loss": 3.754, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 1.0547841997632117, | |
| "grad_norm": 0.5765638947486877, | |
| "learning_rate": 0.0005373041698092877, | |
| "loss": 3.7711, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.06016575180282, | |
| "grad_norm": 0.5599635243415833, | |
| "learning_rate": 0.0005369873935998276, | |
| "loss": 3.7606, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 1.0655473038424281, | |
| "grad_norm": 0.5297030210494995, | |
| "learning_rate": 0.0005366641525697661, | |
| "loss": 3.7679, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 1.0709288558820365, | |
| "grad_norm": 0.512566328048706, | |
| "learning_rate": 0.0005363409115397048, | |
| "loss": 3.774, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 1.0763104079216446, | |
| "grad_norm": 0.5814263820648193, | |
| "learning_rate": 0.0005360176705096433, | |
| "loss": 3.7649, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.0763104079216446, | |
| "eval_accuracy": 0.34471595473129546, | |
| "eval_loss": 3.7472047805786133, | |
| "eval_runtime": 184.3941, | |
| "eval_samples_per_second": 97.677, | |
| "eval_steps_per_second": 6.106, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.081691959961253, | |
| "grad_norm": 0.6061851978302002, | |
| "learning_rate": 0.0005356944294795819, | |
| "loss": 3.7888, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 1.087073512000861, | |
| "grad_norm": 0.6160200834274292, | |
| "learning_rate": 0.0005353711884495205, | |
| "loss": 3.7649, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 1.0924550640404693, | |
| "grad_norm": 0.6028212308883667, | |
| "learning_rate": 0.000535047947419459, | |
| "loss": 3.7416, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 1.0978366160800774, | |
| "grad_norm": 0.5839086771011353, | |
| "learning_rate": 0.0005347247063893976, | |
| "loss": 3.7718, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 1.1032181681196858, | |
| "grad_norm": 0.5984543561935425, | |
| "learning_rate": 0.0005344014653593362, | |
| "loss": 3.7682, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 1.1085997201592939, | |
| "grad_norm": 0.5810673832893372, | |
| "learning_rate": 0.0005340782243292748, | |
| "loss": 3.7759, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 1.1139812721989022, | |
| "grad_norm": 0.580169677734375, | |
| "learning_rate": 0.0005337549832992134, | |
| "loss": 3.7368, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 1.1193628242385103, | |
| "grad_norm": 0.7361278533935547, | |
| "learning_rate": 0.000533431742269152, | |
| "loss": 3.7562, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 1.1247443762781186, | |
| "grad_norm": 0.5884946584701538, | |
| "learning_rate": 0.0005331085012390905, | |
| "loss": 3.7512, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 1.1301259283177267, | |
| "grad_norm": 0.5699751377105713, | |
| "learning_rate": 0.0005327852602090292, | |
| "loss": 3.7216, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.135507480357335, | |
| "grad_norm": 0.6556951999664307, | |
| "learning_rate": 0.0005324620191789678, | |
| "loss": 3.7621, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 1.1408890323969434, | |
| "grad_norm": 0.6037649512290955, | |
| "learning_rate": 0.0005321387781489063, | |
| "loss": 3.7559, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 1.1462705844365515, | |
| "grad_norm": 0.5747148394584656, | |
| "learning_rate": 0.0005318155371188449, | |
| "loss": 3.7759, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 1.1516521364761596, | |
| "grad_norm": 0.6342859268188477, | |
| "learning_rate": 0.0005314922960887834, | |
| "loss": 3.7659, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 1.157033688515768, | |
| "grad_norm": 0.6004108786582947, | |
| "learning_rate": 0.0005311690550587221, | |
| "loss": 3.7477, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 1.1624152405553763, | |
| "grad_norm": 0.6157592535018921, | |
| "learning_rate": 0.0005308458140286607, | |
| "loss": 3.7601, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 1.1677967925949844, | |
| "grad_norm": 0.6026566624641418, | |
| "learning_rate": 0.0005305225729985993, | |
| "loss": 3.7366, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 1.1731783446345927, | |
| "grad_norm": 0.6203866600990295, | |
| "learning_rate": 0.0005301993319685378, | |
| "loss": 3.7546, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 1.1785598966742008, | |
| "grad_norm": 0.5511950254440308, | |
| "learning_rate": 0.0005298760909384765, | |
| "loss": 3.7499, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 1.1839414487138091, | |
| "grad_norm": 0.5800632834434509, | |
| "learning_rate": 0.000529552849908415, | |
| "loss": 3.7704, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.1839414487138091, | |
| "eval_accuracy": 0.3470900203300418, | |
| "eval_loss": 3.7228450775146484, | |
| "eval_runtime": 184.2292, | |
| "eval_samples_per_second": 97.764, | |
| "eval_steps_per_second": 6.112, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.1893230007534172, | |
| "grad_norm": 0.5778889060020447, | |
| "learning_rate": 0.0005292296088783535, | |
| "loss": 3.7438, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 1.1947045527930256, | |
| "grad_norm": 0.5566492080688477, | |
| "learning_rate": 0.0005289063678482922, | |
| "loss": 3.7394, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 1.2000861048326337, | |
| "grad_norm": 0.5893499255180359, | |
| "learning_rate": 0.0005285831268182307, | |
| "loss": 3.7474, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 1.205467656872242, | |
| "grad_norm": 0.6097339987754822, | |
| "learning_rate": 0.0005282598857881694, | |
| "loss": 3.7637, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 1.21084920891185, | |
| "grad_norm": 0.6325443387031555, | |
| "learning_rate": 0.0005279366447581079, | |
| "loss": 3.7378, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 1.2162307609514584, | |
| "grad_norm": 0.580863356590271, | |
| "learning_rate": 0.0005276134037280465, | |
| "loss": 3.7388, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 1.2216123129910665, | |
| "grad_norm": 0.6277652382850647, | |
| "learning_rate": 0.0005272901626979851, | |
| "loss": 3.7391, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 1.2269938650306749, | |
| "grad_norm": 0.5761270523071289, | |
| "learning_rate": 0.0005269669216679236, | |
| "loss": 3.742, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 1.232375417070283, | |
| "grad_norm": 0.6242594718933105, | |
| "learning_rate": 0.0005266436806378623, | |
| "loss": 3.7412, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 1.2377569691098913, | |
| "grad_norm": 0.5723513960838318, | |
| "learning_rate": 0.0005263204396078008, | |
| "loss": 3.7559, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.2431385211494996, | |
| "grad_norm": 0.647551417350769, | |
| "learning_rate": 0.0005259971985777394, | |
| "loss": 3.7353, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 1.2485200731891077, | |
| "grad_norm": 0.5628076195716858, | |
| "learning_rate": 0.000525673957547678, | |
| "loss": 3.7361, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 1.2539016252287158, | |
| "grad_norm": 0.6053175926208496, | |
| "learning_rate": 0.0005253507165176167, | |
| "loss": 3.7354, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 1.2592831772683242, | |
| "grad_norm": 0.6818082332611084, | |
| "learning_rate": 0.0005250274754875552, | |
| "loss": 3.7576, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 1.2646647293079325, | |
| "grad_norm": 0.5190131664276123, | |
| "learning_rate": 0.0005247042344574938, | |
| "loss": 3.7252, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 1.2700462813475406, | |
| "grad_norm": 0.5695263743400574, | |
| "learning_rate": 0.0005243809934274323, | |
| "loss": 3.7287, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 1.275427833387149, | |
| "grad_norm": 0.5233474969863892, | |
| "learning_rate": 0.0005240577523973709, | |
| "loss": 3.7249, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 1.280809385426757, | |
| "grad_norm": 0.6039193868637085, | |
| "learning_rate": 0.0005237345113673095, | |
| "loss": 3.7472, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 1.2861909374663654, | |
| "grad_norm": 0.6351351141929626, | |
| "learning_rate": 0.0005234112703372481, | |
| "loss": 3.7272, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 1.2915724895059735, | |
| "grad_norm": 0.5825541019439697, | |
| "learning_rate": 0.0005230880293071867, | |
| "loss": 3.7211, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.2915724895059735, | |
| "eval_accuracy": 0.3495334064712139, | |
| "eval_loss": 3.6967616081237793, | |
| "eval_runtime": 184.0792, | |
| "eval_samples_per_second": 97.844, | |
| "eval_steps_per_second": 6.117, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.2969540415455818, | |
| "grad_norm": 0.5748907923698425, | |
| "learning_rate": 0.0005227647882771253, | |
| "loss": 3.7198, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 1.30233559358519, | |
| "grad_norm": 0.6389493942260742, | |
| "learning_rate": 0.0005224415472470639, | |
| "loss": 3.75, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 1.3077171456247982, | |
| "grad_norm": 0.5892848372459412, | |
| "learning_rate": 0.0005221183062170024, | |
| "loss": 3.7315, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 1.3130986976644063, | |
| "grad_norm": 0.5801231265068054, | |
| "learning_rate": 0.0005217950651869409, | |
| "loss": 3.7343, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 1.3184802497040147, | |
| "grad_norm": 0.6174342632293701, | |
| "learning_rate": 0.0005214718241568796, | |
| "loss": 3.7349, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 1.3238618017436228, | |
| "grad_norm": 0.6601594090461731, | |
| "learning_rate": 0.0005211485831268182, | |
| "loss": 3.7232, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 1.329243353783231, | |
| "grad_norm": 0.6473212838172913, | |
| "learning_rate": 0.0005208253420967568, | |
| "loss": 3.7249, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 1.3346249058228392, | |
| "grad_norm": 0.629569411277771, | |
| "learning_rate": 0.0005205021010666953, | |
| "loss": 3.7134, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 1.3400064578624475, | |
| "grad_norm": 0.6184583902359009, | |
| "learning_rate": 0.0005201788600366339, | |
| "loss": 3.7016, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 1.3453880099020559, | |
| "grad_norm": 0.5057381391525269, | |
| "learning_rate": 0.0005198556190065725, | |
| "loss": 3.726, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 1.350769561941664, | |
| "grad_norm": 0.606020987033844, | |
| "learning_rate": 0.0005195323779765112, | |
| "loss": 3.7269, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 1.356151113981272, | |
| "grad_norm": 0.5790770053863525, | |
| "learning_rate": 0.0005192091369464497, | |
| "loss": 3.7095, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 1.3615326660208804, | |
| "grad_norm": 0.5820352435112, | |
| "learning_rate": 0.0005188858959163882, | |
| "loss": 3.7182, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 1.3669142180604887, | |
| "grad_norm": 0.5658197402954102, | |
| "learning_rate": 0.0005185626548863269, | |
| "loss": 3.7244, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 1.3722957701000968, | |
| "grad_norm": 0.5616994500160217, | |
| "learning_rate": 0.0005182394138562654, | |
| "loss": 3.7267, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 1.3776773221397052, | |
| "grad_norm": 0.6056994795799255, | |
| "learning_rate": 0.0005179161728262041, | |
| "loss": 3.7119, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 1.3830588741793133, | |
| "grad_norm": 0.5663125514984131, | |
| "learning_rate": 0.0005175929317961426, | |
| "loss": 3.7249, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 1.3884404262189216, | |
| "grad_norm": 0.5755977630615234, | |
| "learning_rate": 0.0005172696907660812, | |
| "loss": 3.7163, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 1.3938219782585297, | |
| "grad_norm": 0.6310080289840698, | |
| "learning_rate": 0.0005169464497360198, | |
| "loss": 3.7186, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 1.399203530298138, | |
| "grad_norm": 0.5720473527908325, | |
| "learning_rate": 0.0005166232087059583, | |
| "loss": 3.7043, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.399203530298138, | |
| "eval_accuracy": 0.35189302123588095, | |
| "eval_loss": 3.675204038619995, | |
| "eval_runtime": 184.4822, | |
| "eval_samples_per_second": 97.63, | |
| "eval_steps_per_second": 6.104, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.4045850823377461, | |
| "grad_norm": 0.6183189749717712, | |
| "learning_rate": 0.0005162999676758969, | |
| "loss": 3.7147, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 1.4099666343773545, | |
| "grad_norm": 0.6030387878417969, | |
| "learning_rate": 0.0005159767266458355, | |
| "loss": 3.7161, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 1.4153481864169626, | |
| "grad_norm": 0.5624415874481201, | |
| "learning_rate": 0.0005156534856157741, | |
| "loss": 3.7097, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 1.420729738456571, | |
| "grad_norm": 0.6489253044128418, | |
| "learning_rate": 0.0005153302445857127, | |
| "loss": 3.6855, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 1.426111290496179, | |
| "grad_norm": 0.586063802242279, | |
| "learning_rate": 0.0005150070035556513, | |
| "loss": 3.7143, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 1.4314928425357873, | |
| "grad_norm": 0.6068193912506104, | |
| "learning_rate": 0.0005146837625255898, | |
| "loss": 3.6917, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 1.4368743945753955, | |
| "grad_norm": 0.5106357336044312, | |
| "learning_rate": 0.0005143605214955285, | |
| "loss": 3.6907, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 1.4422559466150038, | |
| "grad_norm": 0.6621003746986389, | |
| "learning_rate": 0.0005140372804654671, | |
| "loss": 3.6986, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 1.447637498654612, | |
| "grad_norm": 0.5909392237663269, | |
| "learning_rate": 0.0005137140394354056, | |
| "loss": 3.7168, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 1.4530190506942202, | |
| "grad_norm": 0.5878072381019592, | |
| "learning_rate": 0.0005133907984053442, | |
| "loss": 3.6998, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 1.4584006027338283, | |
| "grad_norm": 0.598780632019043, | |
| "learning_rate": 0.0005130675573752827, | |
| "loss": 3.7068, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 1.4637821547734367, | |
| "grad_norm": 0.5767022967338562, | |
| "learning_rate": 0.0005127443163452214, | |
| "loss": 3.7203, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 1.469163706813045, | |
| "grad_norm": 0.5619738101959229, | |
| "learning_rate": 0.00051242107531516, | |
| "loss": 3.7009, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 1.474545258852653, | |
| "grad_norm": 0.5568773150444031, | |
| "learning_rate": 0.0005120978342850986, | |
| "loss": 3.7009, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 1.4799268108922612, | |
| "grad_norm": 0.573401927947998, | |
| "learning_rate": 0.0005117745932550371, | |
| "loss": 3.7032, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 1.4853083629318695, | |
| "grad_norm": 0.6453996300697327, | |
| "learning_rate": 0.0005114513522249758, | |
| "loss": 3.7009, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 1.4906899149714778, | |
| "grad_norm": 0.610974907875061, | |
| "learning_rate": 0.0005111281111949143, | |
| "loss": 3.6982, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 1.496071467011086, | |
| "grad_norm": 0.5626153349876404, | |
| "learning_rate": 0.0005108048701648528, | |
| "loss": 3.6936, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 1.501453019050694, | |
| "grad_norm": 0.5994746685028076, | |
| "learning_rate": 0.0005104816291347915, | |
| "loss": 3.7043, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 1.5068345710903024, | |
| "grad_norm": 0.5882098078727722, | |
| "learning_rate": 0.00051015838810473, | |
| "loss": 3.704, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.5068345710903024, | |
| "eval_accuracy": 0.3537855372358336, | |
| "eval_loss": 3.654278516769409, | |
| "eval_runtime": 184.1956, | |
| "eval_samples_per_second": 97.782, | |
| "eval_steps_per_second": 6.113, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.5122161231299107, | |
| "grad_norm": 0.5625395774841309, | |
| "learning_rate": 0.0005098416118952699, | |
| "loss": 3.6777, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 1.5175976751695188, | |
| "grad_norm": 0.6187540888786316, | |
| "learning_rate": 0.0005095183708652085, | |
| "loss": 3.7096, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 1.5229792272091272, | |
| "grad_norm": 0.6217636466026306, | |
| "learning_rate": 0.000509195129835147, | |
| "loss": 3.6766, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 1.5283607792487355, | |
| "grad_norm": 0.5671772956848145, | |
| "learning_rate": 0.0005088718888050856, | |
| "loss": 3.7046, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 1.5337423312883436, | |
| "grad_norm": 0.5394829511642456, | |
| "learning_rate": 0.0005085486477750242, | |
| "loss": 3.6966, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 1.5391238833279517, | |
| "grad_norm": 0.5557177662849426, | |
| "learning_rate": 0.0005082254067449629, | |
| "loss": 3.6946, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 1.54450543536756, | |
| "grad_norm": 0.5696578025817871, | |
| "learning_rate": 0.0005079021657149014, | |
| "loss": 3.6839, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 1.5498869874071683, | |
| "grad_norm": 0.6024355292320251, | |
| "learning_rate": 0.0005075789246848399, | |
| "loss": 3.6854, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 1.5552685394467765, | |
| "grad_norm": 0.5560182929039001, | |
| "learning_rate": 0.0005072556836547785, | |
| "loss": 3.6859, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 1.5606500914863846, | |
| "grad_norm": 0.6246845126152039, | |
| "learning_rate": 0.0005069324426247171, | |
| "loss": 3.7011, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.566031643525993, | |
| "grad_norm": 0.670316219329834, | |
| "learning_rate": 0.0005066092015946557, | |
| "loss": 3.6898, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 1.5714131955656012, | |
| "grad_norm": 0.5871966481208801, | |
| "learning_rate": 0.0005062859605645943, | |
| "loss": 3.68, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 1.5767947476052093, | |
| "grad_norm": 0.5870825052261353, | |
| "learning_rate": 0.0005059627195345329, | |
| "loss": 3.6844, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 1.5821762996448174, | |
| "grad_norm": 0.550830602645874, | |
| "learning_rate": 0.0005056394785044715, | |
| "loss": 3.6753, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 1.5875578516844258, | |
| "grad_norm": 0.6592797636985779, | |
| "learning_rate": 0.00050531623747441, | |
| "loss": 3.6651, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 1.592939403724034, | |
| "grad_norm": 0.5714665055274963, | |
| "learning_rate": 0.0005049929964443486, | |
| "loss": 3.6914, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 1.5983209557636422, | |
| "grad_norm": 0.6148173213005066, | |
| "learning_rate": 0.0005046697554142871, | |
| "loss": 3.6846, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 1.6037025078032503, | |
| "grad_norm": 0.6103314757347107, | |
| "learning_rate": 0.0005043465143842258, | |
| "loss": 3.677, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 1.6090840598428586, | |
| "grad_norm": 0.604094386100769, | |
| "learning_rate": 0.0005040232733541644, | |
| "loss": 3.6856, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 1.614465611882467, | |
| "grad_norm": 0.5953848361968994, | |
| "learning_rate": 0.000503700032324103, | |
| "loss": 3.6575, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.614465611882467, | |
| "eval_accuracy": 0.35560482118939923, | |
| "eval_loss": 3.636918783187866, | |
| "eval_runtime": 184.2382, | |
| "eval_samples_per_second": 97.759, | |
| "eval_steps_per_second": 6.112, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.619847163922075, | |
| "grad_norm": 0.63779616355896, | |
| "learning_rate": 0.0005033832561146428, | |
| "loss": 3.6775, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 1.6252287159616834, | |
| "grad_norm": 0.5981197953224182, | |
| "learning_rate": 0.0005030600150845813, | |
| "loss": 3.6823, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 1.6306102680012917, | |
| "grad_norm": 0.5513173341751099, | |
| "learning_rate": 0.00050273677405452, | |
| "loss": 3.665, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 1.6359918200408998, | |
| "grad_norm": 0.5226863622665405, | |
| "learning_rate": 0.0005024135330244585, | |
| "loss": 3.6834, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 1.641373372080508, | |
| "grad_norm": 0.5921908617019653, | |
| "learning_rate": 0.0005020902919943972, | |
| "loss": 3.6746, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 1.6467549241201163, | |
| "grad_norm": 0.5408695340156555, | |
| "learning_rate": 0.0005017670509643357, | |
| "loss": 3.6716, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 1.6521364761597246, | |
| "grad_norm": 0.5570070743560791, | |
| "learning_rate": 0.0005014438099342743, | |
| "loss": 3.6705, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 1.6575180281993327, | |
| "grad_norm": 0.5434333086013794, | |
| "learning_rate": 0.0005011205689042129, | |
| "loss": 3.6641, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 1.6628995802389408, | |
| "grad_norm": 0.5776414275169373, | |
| "learning_rate": 0.0005007973278741514, | |
| "loss": 3.6835, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 1.6682811322785491, | |
| "grad_norm": 0.5947062969207764, | |
| "learning_rate": 0.00050047408684409, | |
| "loss": 3.6673, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.6736626843181575, | |
| "grad_norm": 0.5519924759864807, | |
| "learning_rate": 0.0005001508458140286, | |
| "loss": 3.6608, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 1.6790442363577656, | |
| "grad_norm": 0.5676537752151489, | |
| "learning_rate": 0.0004998276047839673, | |
| "loss": 3.6529, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 1.6844257883973737, | |
| "grad_norm": 0.6070230007171631, | |
| "learning_rate": 0.0004995043637539058, | |
| "loss": 3.6634, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 1.689807340436982, | |
| "grad_norm": 0.5815381407737732, | |
| "learning_rate": 0.0004991811227238443, | |
| "loss": 3.6705, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 1.6951888924765903, | |
| "grad_norm": 0.6373358368873596, | |
| "learning_rate": 0.0004988578816937829, | |
| "loss": 3.6826, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 1.7005704445161984, | |
| "grad_norm": 0.5782313942909241, | |
| "learning_rate": 0.0004985346406637215, | |
| "loss": 3.678, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 1.7059519965558065, | |
| "grad_norm": 0.6094913482666016, | |
| "learning_rate": 0.0004982113996336602, | |
| "loss": 3.68, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 1.7113335485954149, | |
| "grad_norm": 0.5883018970489502, | |
| "learning_rate": 0.0004978881586035987, | |
| "loss": 3.6775, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 1.7167151006350232, | |
| "grad_norm": 0.6484796404838562, | |
| "learning_rate": 0.0004975649175735373, | |
| "loss": 3.6624, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 1.7220966526746313, | |
| "grad_norm": 0.5902780890464783, | |
| "learning_rate": 0.0004972416765434759, | |
| "loss": 3.6752, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.7220966526746313, | |
| "eval_accuracy": 0.3573156695609261, | |
| "eval_loss": 3.617367744445801, | |
| "eval_runtime": 184.3087, | |
| "eval_samples_per_second": 97.722, | |
| "eval_steps_per_second": 6.109, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.7274782047142396, | |
| "grad_norm": 0.5780335664749146, | |
| "learning_rate": 0.0004969184355134145, | |
| "loss": 3.6807, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 1.732859756753848, | |
| "grad_norm": 0.5813971757888794, | |
| "learning_rate": 0.0004965951944833531, | |
| "loss": 3.6819, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 1.738241308793456, | |
| "grad_norm": 0.5852022171020508, | |
| "learning_rate": 0.0004962719534532916, | |
| "loss": 3.647, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 1.7436228608330642, | |
| "grad_norm": 0.5813935399055481, | |
| "learning_rate": 0.0004959487124232302, | |
| "loss": 3.6538, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 1.7490044128726725, | |
| "grad_norm": 0.5969535112380981, | |
| "learning_rate": 0.0004956254713931688, | |
| "loss": 3.6764, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 1.7543859649122808, | |
| "grad_norm": 0.566013514995575, | |
| "learning_rate": 0.0004953022303631074, | |
| "loss": 3.6599, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 1.759767516951889, | |
| "grad_norm": 0.5533055067062378, | |
| "learning_rate": 0.0004949789893330459, | |
| "loss": 3.6518, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 1.765149068991497, | |
| "grad_norm": 0.5435376167297363, | |
| "learning_rate": 0.0004946557483029846, | |
| "loss": 3.6667, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 1.7705306210311054, | |
| "grad_norm": 0.5930315852165222, | |
| "learning_rate": 0.0004943325072729231, | |
| "loss": 3.6586, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 1.7759121730707137, | |
| "grad_norm": 0.628606379032135, | |
| "learning_rate": 0.0004940092662428617, | |
| "loss": 3.6611, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.7812937251103218, | |
| "grad_norm": 0.5725036263465881, | |
| "learning_rate": 0.0004936860252128003, | |
| "loss": 3.6475, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 1.78667527714993, | |
| "grad_norm": 0.5788425207138062, | |
| "learning_rate": 0.0004933627841827388, | |
| "loss": 3.6625, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 1.7920568291895382, | |
| "grad_norm": 0.5657828450202942, | |
| "learning_rate": 0.0004930395431526775, | |
| "loss": 3.667, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 1.7974383812291466, | |
| "grad_norm": 0.5956182479858398, | |
| "learning_rate": 0.0004927163021226161, | |
| "loss": 3.6467, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 1.8028199332687547, | |
| "grad_norm": 0.5671237707138062, | |
| "learning_rate": 0.0004923930610925547, | |
| "loss": 3.6535, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 1.8082014853083628, | |
| "grad_norm": 0.5978695154190063, | |
| "learning_rate": 0.0004920698200624932, | |
| "loss": 3.6343, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 1.813583037347971, | |
| "grad_norm": 0.6054092645645142, | |
| "learning_rate": 0.0004917465790324317, | |
| "loss": 3.6545, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 1.8189645893875794, | |
| "grad_norm": 0.5926100015640259, | |
| "learning_rate": 0.0004914233380023704, | |
| "loss": 3.6525, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 1.8243461414271875, | |
| "grad_norm": 0.5705035924911499, | |
| "learning_rate": 0.0004911000969723089, | |
| "loss": 3.6582, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 1.8297276934667959, | |
| "grad_norm": 0.5524888038635254, | |
| "learning_rate": 0.0004907768559422476, | |
| "loss": 3.6393, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.8297276934667959, | |
| "eval_accuracy": 0.3585524653829183, | |
| "eval_loss": 3.603188991546631, | |
| "eval_runtime": 184.4615, | |
| "eval_samples_per_second": 97.641, | |
| "eval_steps_per_second": 6.104, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.8351092455064042, | |
| "grad_norm": 0.6277946829795837, | |
| "learning_rate": 0.0004904536149121861, | |
| "loss": 3.6574, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 1.8404907975460123, | |
| "grad_norm": 0.6016470789909363, | |
| "learning_rate": 0.0004901303738821248, | |
| "loss": 3.6467, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 1.8458723495856204, | |
| "grad_norm": 0.6743981838226318, | |
| "learning_rate": 0.0004898071328520633, | |
| "loss": 3.6369, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 1.8512539016252287, | |
| "grad_norm": 0.622292160987854, | |
| "learning_rate": 0.0004894838918220019, | |
| "loss": 3.6435, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 1.856635453664837, | |
| "grad_norm": 0.5475077033042908, | |
| "learning_rate": 0.0004891606507919405, | |
| "loss": 3.6357, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 1.8620170057044452, | |
| "grad_norm": 0.5939141511917114, | |
| "learning_rate": 0.000488837409761879, | |
| "loss": 3.6559, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 1.8673985577440533, | |
| "grad_norm": 0.5346246957778931, | |
| "learning_rate": 0.0004885141687318177, | |
| "loss": 3.663, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 1.8727801097836616, | |
| "grad_norm": 0.5727187395095825, | |
| "learning_rate": 0.00048819092770175623, | |
| "loss": 3.6476, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 1.87816166182327, | |
| "grad_norm": 0.6204800605773926, | |
| "learning_rate": 0.0004878676866716948, | |
| "loss": 3.6445, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 1.883543213862878, | |
| "grad_norm": 0.5652103424072266, | |
| "learning_rate": 0.00048754444564163337, | |
| "loss": 3.652, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.8889247659024861, | |
| "grad_norm": 0.550086259841919, | |
| "learning_rate": 0.000487221204611572, | |
| "loss": 3.6597, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 1.8943063179420945, | |
| "grad_norm": 0.5764421224594116, | |
| "learning_rate": 0.00048689796358151056, | |
| "loss": 3.6409, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 1.8996878699817028, | |
| "grad_norm": 0.5565707087516785, | |
| "learning_rate": 0.00048657472255144915, | |
| "loss": 3.6391, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 1.905069422021311, | |
| "grad_norm": 0.542223334312439, | |
| "learning_rate": 0.00048625148152138775, | |
| "loss": 3.6273, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 1.910450974060919, | |
| "grad_norm": 0.6438802480697632, | |
| "learning_rate": 0.0004859282404913263, | |
| "loss": 3.6469, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 1.9158325261005273, | |
| "grad_norm": 0.6026858687400818, | |
| "learning_rate": 0.0004856049994612649, | |
| "loss": 3.636, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 1.9212140781401357, | |
| "grad_norm": 0.5909935832023621, | |
| "learning_rate": 0.00048528175843120353, | |
| "loss": 3.6383, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 1.9265956301797438, | |
| "grad_norm": 0.6021227240562439, | |
| "learning_rate": 0.0004849585174011421, | |
| "loss": 3.6338, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 1.931977182219352, | |
| "grad_norm": 0.6149671673774719, | |
| "learning_rate": 0.00048463527637108067, | |
| "loss": 3.655, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 1.9373587342589604, | |
| "grad_norm": 0.6580572128295898, | |
| "learning_rate": 0.0004843120353410192, | |
| "loss": 3.6456, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.9373587342589604, | |
| "eval_accuracy": 0.3600008083774853, | |
| "eval_loss": 3.5912606716156006, | |
| "eval_runtime": 184.3004, | |
| "eval_samples_per_second": 97.726, | |
| "eval_steps_per_second": 6.11, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.9427402862985685, | |
| "grad_norm": 0.5607840418815613, | |
| "learning_rate": 0.0004839887943109578, | |
| "loss": 3.632, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 1.9481218383381766, | |
| "grad_norm": 0.5700309872627258, | |
| "learning_rate": 0.00048366555328089645, | |
| "loss": 3.6422, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 1.953503390377785, | |
| "grad_norm": 0.5553288459777832, | |
| "learning_rate": 0.000483342312250835, | |
| "loss": 3.6275, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 1.9588849424173933, | |
| "grad_norm": 0.6393088102340698, | |
| "learning_rate": 0.0004830190712207736, | |
| "loss": 3.6327, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 1.9642664944570014, | |
| "grad_norm": 0.5855211019515991, | |
| "learning_rate": 0.0004826958301907122, | |
| "loss": 3.6175, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 1.9696480464966095, | |
| "grad_norm": 0.6120786070823669, | |
| "learning_rate": 0.0004823725891606507, | |
| "loss": 3.6458, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 1.9750295985362178, | |
| "grad_norm": 0.6420284509658813, | |
| "learning_rate": 0.0004820493481305893, | |
| "loss": 3.6347, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 1.9804111505758262, | |
| "grad_norm": 0.5363466739654541, | |
| "learning_rate": 0.00048172610710052797, | |
| "loss": 3.6235, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 1.9857927026154343, | |
| "grad_norm": 0.5807979702949524, | |
| "learning_rate": 0.0004814028660704665, | |
| "loss": 3.6403, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 1.9911742546550424, | |
| "grad_norm": 0.5786207318305969, | |
| "learning_rate": 0.0004810796250404051, | |
| "loss": 3.6512, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.9965558066946507, | |
| "grad_norm": 0.5900049209594727, | |
| "learning_rate": 0.00048075638401034364, | |
| "loss": 3.6497, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 2.001937358734259, | |
| "grad_norm": 0.587768018245697, | |
| "learning_rate": 0.00048043314298028224, | |
| "loss": 3.6017, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 2.007318910773867, | |
| "grad_norm": 0.6221120953559875, | |
| "learning_rate": 0.00048010990195022083, | |
| "loss": 3.5595, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 2.0127004628134753, | |
| "grad_norm": 0.591773271560669, | |
| "learning_rate": 0.0004797866609201594, | |
| "loss": 3.5379, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 2.018082014853084, | |
| "grad_norm": 0.5833855867385864, | |
| "learning_rate": 0.000479463419890098, | |
| "loss": 3.5557, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 2.023463566892692, | |
| "grad_norm": 0.6240286231040955, | |
| "learning_rate": 0.0004791401788600366, | |
| "loss": 3.549, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 2.0288451189323, | |
| "grad_norm": 0.6125996708869934, | |
| "learning_rate": 0.00047881693782997515, | |
| "loss": 3.5324, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 2.034226670971908, | |
| "grad_norm": 0.6184468269348145, | |
| "learning_rate": 0.00047849369679991375, | |
| "loss": 3.5473, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 2.0396082230115167, | |
| "grad_norm": 0.6058159470558167, | |
| "learning_rate": 0.0004781704557698523, | |
| "loss": 3.5531, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 2.044989775051125, | |
| "grad_norm": 0.6288303732872009, | |
| "learning_rate": 0.00047784721473979094, | |
| "loss": 3.5613, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 2.044989775051125, | |
| "eval_accuracy": 0.36149511154359787, | |
| "eval_loss": 3.5811712741851807, | |
| "eval_runtime": 184.1955, | |
| "eval_samples_per_second": 97.782, | |
| "eval_steps_per_second": 6.113, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 2.050371327090733, | |
| "grad_norm": 0.5850680470466614, | |
| "learning_rate": 0.00047752397370972953, | |
| "loss": 3.5547, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 2.055752879130341, | |
| "grad_norm": 0.607171356678009, | |
| "learning_rate": 0.0004772007326796681, | |
| "loss": 3.5435, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 2.0611344311699495, | |
| "grad_norm": 0.6126758456230164, | |
| "learning_rate": 0.00047687749164960667, | |
| "loss": 3.539, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 2.0665159832095576, | |
| "grad_norm": 0.6476554274559021, | |
| "learning_rate": 0.0004765607154401465, | |
| "loss": 3.5559, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 2.0718975352491658, | |
| "grad_norm": 0.5856466889381409, | |
| "learning_rate": 0.00047623747441008507, | |
| "loss": 3.5714, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 2.0772790872887743, | |
| "grad_norm": 0.5796750783920288, | |
| "learning_rate": 0.0004759142333800236, | |
| "loss": 3.5467, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 2.0826606393283824, | |
| "grad_norm": 0.5950219035148621, | |
| "learning_rate": 0.00047559099234996226, | |
| "loss": 3.5663, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 2.0880421913679905, | |
| "grad_norm": 0.5547754168510437, | |
| "learning_rate": 0.00047526775131990085, | |
| "loss": 3.5419, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 2.0934237434075986, | |
| "grad_norm": 0.6345862746238708, | |
| "learning_rate": 0.0004749445102898394, | |
| "loss": 3.5469, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 2.098805295447207, | |
| "grad_norm": 0.5701510310173035, | |
| "learning_rate": 0.000474621269259778, | |
| "loss": 3.5691, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 2.1041868474868153, | |
| "grad_norm": 0.6009135246276855, | |
| "learning_rate": 0.0004742980282297166, | |
| "loss": 3.5445, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 2.1095683995264234, | |
| "grad_norm": 0.6103658676147461, | |
| "learning_rate": 0.0004739747871996551, | |
| "loss": 3.5591, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 2.1149499515660315, | |
| "grad_norm": 0.5732374787330627, | |
| "learning_rate": 0.00047365154616959377, | |
| "loss": 3.5341, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 2.12033150360564, | |
| "grad_norm": 0.5666860342025757, | |
| "learning_rate": 0.00047332830513953237, | |
| "loss": 3.5662, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 2.125713055645248, | |
| "grad_norm": 0.5759153962135315, | |
| "learning_rate": 0.0004730050641094709, | |
| "loss": 3.536, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 2.1310946076848563, | |
| "grad_norm": 0.5631008744239807, | |
| "learning_rate": 0.0004726818230794095, | |
| "loss": 3.5459, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 2.1364761597244644, | |
| "grad_norm": 0.584092915058136, | |
| "learning_rate": 0.00047235858204934804, | |
| "loss": 3.5598, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 2.141857711764073, | |
| "grad_norm": 0.5826037526130676, | |
| "learning_rate": 0.0004720353410192867, | |
| "loss": 3.5701, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 2.147239263803681, | |
| "grad_norm": 0.6279293894767761, | |
| "learning_rate": 0.0004717120999892253, | |
| "loss": 3.5717, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 2.152620815843289, | |
| "grad_norm": 0.624784529209137, | |
| "learning_rate": 0.0004713888589591638, | |
| "loss": 3.5371, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.152620815843289, | |
| "eval_accuracy": 0.36282893439440844, | |
| "eval_loss": 3.5696587562561035, | |
| "eval_runtime": 184.3013, | |
| "eval_samples_per_second": 97.726, | |
| "eval_steps_per_second": 6.11, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.1580023678828972, | |
| "grad_norm": 0.5869525671005249, | |
| "learning_rate": 0.0004710656179291024, | |
| "loss": 3.5439, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 2.163383919922506, | |
| "grad_norm": 0.5409868955612183, | |
| "learning_rate": 0.000470742376899041, | |
| "loss": 3.5553, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 2.168765471962114, | |
| "grad_norm": 0.5911038517951965, | |
| "learning_rate": 0.00047041913586897956, | |
| "loss": 3.5464, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 2.174147024001722, | |
| "grad_norm": 0.5998333096504211, | |
| "learning_rate": 0.0004700958948389182, | |
| "loss": 3.5672, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 2.1795285760413305, | |
| "grad_norm": 0.5834320187568665, | |
| "learning_rate": 0.0004697726538088568, | |
| "loss": 3.541, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 2.1849101280809387, | |
| "grad_norm": 0.5647948384284973, | |
| "learning_rate": 0.00046944941277879534, | |
| "loss": 3.5666, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 2.1902916801205468, | |
| "grad_norm": 0.6270108819007874, | |
| "learning_rate": 0.00046912617174873394, | |
| "loss": 3.5462, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 2.195673232160155, | |
| "grad_norm": 0.5561853647232056, | |
| "learning_rate": 0.0004688029307186725, | |
| "loss": 3.5507, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 2.2010547841997634, | |
| "grad_norm": 0.6108466982841492, | |
| "learning_rate": 0.00046847968968861107, | |
| "loss": 3.5621, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 2.2064363362393715, | |
| "grad_norm": 0.57036954164505, | |
| "learning_rate": 0.0004681564486585497, | |
| "loss": 3.5569, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 2.2118178882789796, | |
| "grad_norm": 0.6032209396362305, | |
| "learning_rate": 0.00046783320762848826, | |
| "loss": 3.5647, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 2.2171994403185877, | |
| "grad_norm": 0.6014056205749512, | |
| "learning_rate": 0.00046750996659842685, | |
| "loss": 3.5581, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 2.2225809923581963, | |
| "grad_norm": 0.5592637062072754, | |
| "learning_rate": 0.00046718672556836545, | |
| "loss": 3.5436, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 2.2279625443978044, | |
| "grad_norm": 0.5877309441566467, | |
| "learning_rate": 0.000466863484538304, | |
| "loss": 3.5606, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 2.2333440964374125, | |
| "grad_norm": 0.6111421585083008, | |
| "learning_rate": 0.0004665402435082426, | |
| "loss": 3.581, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 2.2387256484770206, | |
| "grad_norm": 0.5187894701957703, | |
| "learning_rate": 0.00046621700247818123, | |
| "loss": 3.5543, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 2.244107200516629, | |
| "grad_norm": 0.6091694831848145, | |
| "learning_rate": 0.0004658937614481198, | |
| "loss": 3.5487, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 2.2494887525562373, | |
| "grad_norm": 0.6413667798042297, | |
| "learning_rate": 0.00046557052041805837, | |
| "loss": 3.5557, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 2.2548703045958454, | |
| "grad_norm": 0.5929499268531799, | |
| "learning_rate": 0.0004652472793879969, | |
| "loss": 3.5565, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 2.2602518566354535, | |
| "grad_norm": 0.6386003494262695, | |
| "learning_rate": 0.0004649240383579355, | |
| "loss": 3.5621, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 2.2602518566354535, | |
| "eval_accuracy": 0.36359341611307117, | |
| "eval_loss": 3.5591483116149902, | |
| "eval_runtime": 183.9319, | |
| "eval_samples_per_second": 97.922, | |
| "eval_steps_per_second": 6.122, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 2.265633408675062, | |
| "grad_norm": 0.5843613743782043, | |
| "learning_rate": 0.00046460079732787415, | |
| "loss": 3.5686, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 2.27101496071467, | |
| "grad_norm": 0.5554518699645996, | |
| "learning_rate": 0.0004642775562978127, | |
| "loss": 3.5453, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 2.2763965127542782, | |
| "grad_norm": 0.5896638631820679, | |
| "learning_rate": 0.0004639543152677513, | |
| "loss": 3.5483, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 2.281778064793887, | |
| "grad_norm": 0.6211512088775635, | |
| "learning_rate": 0.0004636310742376899, | |
| "loss": 3.5357, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 2.287159616833495, | |
| "grad_norm": 0.6356164216995239, | |
| "learning_rate": 0.0004633142980282297, | |
| "loss": 3.5411, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 2.292541168873103, | |
| "grad_norm": 0.5573554635047913, | |
| "learning_rate": 0.00046299105699816823, | |
| "loss": 3.558, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 2.297922720912711, | |
| "grad_norm": 0.5714161992073059, | |
| "learning_rate": 0.0004626678159681068, | |
| "loss": 3.5582, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 2.303304272952319, | |
| "grad_norm": 0.5942437052726746, | |
| "learning_rate": 0.0004623445749380454, | |
| "loss": 3.5571, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 2.3086858249919278, | |
| "grad_norm": 0.5505504012107849, | |
| "learning_rate": 0.000462021333907984, | |
| "loss": 3.5575, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 2.314067377031536, | |
| "grad_norm": 0.6126779913902283, | |
| "learning_rate": 0.0004616980928779226, | |
| "loss": 3.534, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 2.319448929071144, | |
| "grad_norm": 0.664038360118866, | |
| "learning_rate": 0.0004613748518478612, | |
| "loss": 3.5549, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 2.3248304811107525, | |
| "grad_norm": 0.6307573318481445, | |
| "learning_rate": 0.00046105161081779974, | |
| "loss": 3.5503, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 2.3302120331503606, | |
| "grad_norm": 0.5964935421943665, | |
| "learning_rate": 0.00046072836978773834, | |
| "loss": 3.5477, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 2.3355935851899687, | |
| "grad_norm": 0.7227281928062439, | |
| "learning_rate": 0.000460405128757677, | |
| "loss": 3.561, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 2.340975137229577, | |
| "grad_norm": 0.5851664543151855, | |
| "learning_rate": 0.0004600818877276155, | |
| "loss": 3.5566, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 2.3463566892691854, | |
| "grad_norm": 0.615311324596405, | |
| "learning_rate": 0.0004597586466975541, | |
| "loss": 3.5538, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 2.3517382413087935, | |
| "grad_norm": 0.5875931978225708, | |
| "learning_rate": 0.00045943540566749266, | |
| "loss": 3.5542, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 2.3571197933484016, | |
| "grad_norm": 0.5925188660621643, | |
| "learning_rate": 0.00045911216463743126, | |
| "loss": 3.5604, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 2.3625013453880097, | |
| "grad_norm": 0.5988454818725586, | |
| "learning_rate": 0.00045878892360736985, | |
| "loss": 3.5432, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 2.3678828974276183, | |
| "grad_norm": 0.6129398941993713, | |
| "learning_rate": 0.00045846568257730845, | |
| "loss": 3.558, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 2.3678828974276183, | |
| "eval_accuracy": 0.36470232748610576, | |
| "eval_loss": 3.5517067909240723, | |
| "eval_runtime": 184.545, | |
| "eval_samples_per_second": 97.597, | |
| "eval_steps_per_second": 6.101, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 2.3732644494672264, | |
| "grad_norm": 0.5788018703460693, | |
| "learning_rate": 0.00045814244154724704, | |
| "loss": 3.5436, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 2.3786460015068345, | |
| "grad_norm": 0.6360171437263489, | |
| "learning_rate": 0.00045781920051718563, | |
| "loss": 3.5381, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 2.384027553546443, | |
| "grad_norm": 0.5815111994743347, | |
| "learning_rate": 0.0004574959594871242, | |
| "loss": 3.5559, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 2.389409105586051, | |
| "grad_norm": 0.6702762842178345, | |
| "learning_rate": 0.00045717271845706277, | |
| "loss": 3.5402, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 2.3947906576256592, | |
| "grad_norm": 0.5692477226257324, | |
| "learning_rate": 0.0004568494774270013, | |
| "loss": 3.5557, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 2.4001722096652673, | |
| "grad_norm": 0.6554452776908875, | |
| "learning_rate": 0.00045652623639693996, | |
| "loss": 3.5385, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 2.4055537617048754, | |
| "grad_norm": 0.6034250259399414, | |
| "learning_rate": 0.00045620299536687855, | |
| "loss": 3.5647, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 2.410935313744484, | |
| "grad_norm": 0.5821554064750671, | |
| "learning_rate": 0.0004558797543368171, | |
| "loss": 3.5636, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 2.416316865784092, | |
| "grad_norm": 0.6203377842903137, | |
| "learning_rate": 0.0004555565133067557, | |
| "loss": 3.5547, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 2.4216984178237, | |
| "grad_norm": 0.6128560900688171, | |
| "learning_rate": 0.0004552332722766943, | |
| "loss": 3.5712, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 2.4270799698633088, | |
| "grad_norm": 0.5825649499893188, | |
| "learning_rate": 0.0004549100312466328, | |
| "loss": 3.547, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 2.432461521902917, | |
| "grad_norm": 0.7408450841903687, | |
| "learning_rate": 0.0004545867902165715, | |
| "loss": 3.5563, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 2.437843073942525, | |
| "grad_norm": 0.6568633317947388, | |
| "learning_rate": 0.00045426354918651007, | |
| "loss": 3.5675, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 2.443224625982133, | |
| "grad_norm": 0.5571654438972473, | |
| "learning_rate": 0.0004539403081564486, | |
| "loss": 3.5478, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 2.4486061780217416, | |
| "grad_norm": 0.6605640649795532, | |
| "learning_rate": 0.0004536170671263872, | |
| "loss": 3.5322, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 2.4539877300613497, | |
| "grad_norm": 0.6819828748703003, | |
| "learning_rate": 0.00045329382609632574, | |
| "loss": 3.5611, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 2.459369282100958, | |
| "grad_norm": 0.5603277087211609, | |
| "learning_rate": 0.0004529705850662644, | |
| "loss": 3.5442, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 2.464750834140566, | |
| "grad_norm": 0.6082057952880859, | |
| "learning_rate": 0.000452647344036203, | |
| "loss": 3.5527, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 2.4701323861801745, | |
| "grad_norm": 0.6479313373565674, | |
| "learning_rate": 0.00045232410300614153, | |
| "loss": 3.553, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 2.4755139382197826, | |
| "grad_norm": 0.6584368348121643, | |
| "learning_rate": 0.0004520008619760801, | |
| "loss": 3.5628, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 2.4755139382197826, | |
| "eval_accuracy": 0.3660629876002065, | |
| "eval_loss": 3.5401203632354736, | |
| "eval_runtime": 184.0797, | |
| "eval_samples_per_second": 97.843, | |
| "eval_steps_per_second": 6.117, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 2.4808954902593907, | |
| "grad_norm": 0.5972318053245544, | |
| "learning_rate": 0.0004516776209460187, | |
| "loss": 3.5309, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 2.4862770422989993, | |
| "grad_norm": 0.5607774257659912, | |
| "learning_rate": 0.00045135437991595726, | |
| "loss": 3.5419, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 2.4916585943386074, | |
| "grad_norm": 0.6000484824180603, | |
| "learning_rate": 0.0004510311388858959, | |
| "loss": 3.5581, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 2.4970401463782155, | |
| "grad_norm": 0.6345840692520142, | |
| "learning_rate": 0.0004507078978558345, | |
| "loss": 3.5502, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 2.5024216984178236, | |
| "grad_norm": 0.5603790283203125, | |
| "learning_rate": 0.00045038465682577304, | |
| "loss": 3.5513, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 2.5078032504574317, | |
| "grad_norm": 0.5980100035667419, | |
| "learning_rate": 0.00045006141579571164, | |
| "loss": 3.5528, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 2.5131848024970402, | |
| "grad_norm": 0.5831683874130249, | |
| "learning_rate": 0.0004497381747656502, | |
| "loss": 3.5515, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 2.5185663545366483, | |
| "grad_norm": 0.5655375123023987, | |
| "learning_rate": 0.00044942139855619004, | |
| "loss": 3.5402, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 2.5239479065762565, | |
| "grad_norm": 0.6101444363594055, | |
| "learning_rate": 0.0004490981575261286, | |
| "loss": 3.5329, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 2.529329458615865, | |
| "grad_norm": 0.6579620242118835, | |
| "learning_rate": 0.0004487749164960672, | |
| "loss": 3.5455, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 2.534711010655473, | |
| "grad_norm": 0.5922878980636597, | |
| "learning_rate": 0.0004484516754660058, | |
| "loss": 3.5537, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 2.540092562695081, | |
| "grad_norm": 0.6091205477714539, | |
| "learning_rate": 0.00044812843443594436, | |
| "loss": 3.5536, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 2.5454741147346893, | |
| "grad_norm": 0.5624716281890869, | |
| "learning_rate": 0.00044780519340588296, | |
| "loss": 3.5327, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 2.550855666774298, | |
| "grad_norm": 0.5864114165306091, | |
| "learning_rate": 0.0004474819523758215, | |
| "loss": 3.5433, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 2.556237218813906, | |
| "grad_norm": 0.6021702885627747, | |
| "learning_rate": 0.0004471587113457601, | |
| "loss": 3.5506, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 2.561618770853514, | |
| "grad_norm": 0.5519259572029114, | |
| "learning_rate": 0.00044683547031569874, | |
| "loss": 3.536, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 2.567000322893122, | |
| "grad_norm": 0.624495804309845, | |
| "learning_rate": 0.0004465122292856373, | |
| "loss": 3.536, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 2.5723818749327307, | |
| "grad_norm": 0.5957192182540894, | |
| "learning_rate": 0.0004461889882555759, | |
| "loss": 3.5582, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 2.577763426972339, | |
| "grad_norm": 0.5890944004058838, | |
| "learning_rate": 0.00044586574722551447, | |
| "loss": 3.5589, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 2.583144979011947, | |
| "grad_norm": 0.6432296633720398, | |
| "learning_rate": 0.000445542506195453, | |
| "loss": 3.5542, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 2.583144979011947, | |
| "eval_accuracy": 0.3664736955161452, | |
| "eval_loss": 3.5323314666748047, | |
| "eval_runtime": 184.3491, | |
| "eval_samples_per_second": 97.7, | |
| "eval_steps_per_second": 6.108, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 2.5885265310515555, | |
| "grad_norm": 0.57781583070755, | |
| "learning_rate": 0.0004452192651653916, | |
| "loss": 3.5414, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 2.5939080830911636, | |
| "grad_norm": 0.5868206024169922, | |
| "learning_rate": 0.00044489602413533025, | |
| "loss": 3.5449, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 2.5992896351307717, | |
| "grad_norm": 0.5909056067466736, | |
| "learning_rate": 0.0004445727831052688, | |
| "loss": 3.5556, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 2.60467118717038, | |
| "grad_norm": 0.5784221291542053, | |
| "learning_rate": 0.0004442495420752074, | |
| "loss": 3.5357, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 2.610052739209988, | |
| "grad_norm": 0.5711833238601685, | |
| "learning_rate": 0.00044392630104514593, | |
| "loss": 3.5492, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 2.6154342912495965, | |
| "grad_norm": 0.6024125814437866, | |
| "learning_rate": 0.0004436030600150845, | |
| "loss": 3.5404, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 2.6208158432892046, | |
| "grad_norm": 0.5955642461776733, | |
| "learning_rate": 0.0004432798189850231, | |
| "loss": 3.5498, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 2.6261973953288127, | |
| "grad_norm": 0.6499757766723633, | |
| "learning_rate": 0.0004429565779549617, | |
| "loss": 3.5437, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 2.6315789473684212, | |
| "grad_norm": 0.618604838848114, | |
| "learning_rate": 0.0004426333369249003, | |
| "loss": 3.533, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 2.6369604994080293, | |
| "grad_norm": 0.6137416362762451, | |
| "learning_rate": 0.0004423100958948389, | |
| "loss": 3.5525, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 2.6423420514476375, | |
| "grad_norm": 0.5660908222198486, | |
| "learning_rate": 0.00044198685486477744, | |
| "loss": 3.538, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 2.6477236034872456, | |
| "grad_norm": 0.5895227789878845, | |
| "learning_rate": 0.00044166361383471604, | |
| "loss": 3.5476, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 2.653105155526854, | |
| "grad_norm": 0.639029860496521, | |
| "learning_rate": 0.0004413403728046547, | |
| "loss": 3.5235, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 2.658486707566462, | |
| "grad_norm": 0.6285099387168884, | |
| "learning_rate": 0.0004410171317745932, | |
| "loss": 3.542, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 2.6638682596060703, | |
| "grad_norm": 0.6055117845535278, | |
| "learning_rate": 0.0004406938907445318, | |
| "loss": 3.5338, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 2.6692498116456784, | |
| "grad_norm": 0.5412980914115906, | |
| "learning_rate": 0.00044037064971447036, | |
| "loss": 3.5385, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 2.674631363685287, | |
| "grad_norm": 0.6035342216491699, | |
| "learning_rate": 0.00044004740868440896, | |
| "loss": 3.5469, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 2.680012915724895, | |
| "grad_norm": 0.5654745101928711, | |
| "learning_rate": 0.00043972416765434755, | |
| "loss": 3.528, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 2.685394467764503, | |
| "grad_norm": 0.6008833050727844, | |
| "learning_rate": 0.00043940092662428615, | |
| "loss": 3.5475, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 2.6907760198041117, | |
| "grad_norm": 0.5768605470657349, | |
| "learning_rate": 0.00043907768559422474, | |
| "loss": 3.5234, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 2.6907760198041117, | |
| "eval_accuracy": 0.36766844267055326, | |
| "eval_loss": 3.519519090652466, | |
| "eval_runtime": 184.2393, | |
| "eval_samples_per_second": 97.759, | |
| "eval_steps_per_second": 6.112, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 2.69615757184372, | |
| "grad_norm": 0.6297202706336975, | |
| "learning_rate": 0.00043875444456416334, | |
| "loss": 3.5581, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 2.701539123883328, | |
| "grad_norm": 0.5893862843513489, | |
| "learning_rate": 0.0004384312035341019, | |
| "loss": 3.5378, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 2.706920675922936, | |
| "grad_norm": 0.6063299775123596, | |
| "learning_rate": 0.00043810796250404047, | |
| "loss": 3.535, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 2.712302227962544, | |
| "grad_norm": 0.6746584177017212, | |
| "learning_rate": 0.000437784721473979, | |
| "loss": 3.5498, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 2.7176837800021527, | |
| "grad_norm": 0.5552766919136047, | |
| "learning_rate": 0.00043746148044391766, | |
| "loss": 3.5503, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 2.723065332041761, | |
| "grad_norm": 0.5598071217536926, | |
| "learning_rate": 0.00043713823941385625, | |
| "loss": 3.5285, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 2.728446884081369, | |
| "grad_norm": 0.5776399970054626, | |
| "learning_rate": 0.0004368149983837948, | |
| "loss": 3.5618, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 2.7338284361209775, | |
| "grad_norm": 0.5596196055412292, | |
| "learning_rate": 0.00043649822217433466, | |
| "loss": 3.535, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 2.7392099881605856, | |
| "grad_norm": 0.5944263339042664, | |
| "learning_rate": 0.0004361749811442732, | |
| "loss": 3.5311, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 2.7445915402001937, | |
| "grad_norm": 0.560356855392456, | |
| "learning_rate": 0.0004358517401142118, | |
| "loss": 3.5544, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 2.749973092239802, | |
| "grad_norm": 0.6242508888244629, | |
| "learning_rate": 0.00043552849908415033, | |
| "loss": 3.548, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 2.7553546442794103, | |
| "grad_norm": 0.6270253658294678, | |
| "learning_rate": 0.000435205258054089, | |
| "loss": 3.5441, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 2.7607361963190185, | |
| "grad_norm": 0.6369729042053223, | |
| "learning_rate": 0.0004348820170240276, | |
| "loss": 3.5349, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 2.7661177483586266, | |
| "grad_norm": 0.6286373734474182, | |
| "learning_rate": 0.0004345587759939661, | |
| "loss": 3.5473, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 2.7714993003982347, | |
| "grad_norm": 0.6675140261650085, | |
| "learning_rate": 0.0004342355349639047, | |
| "loss": 3.549, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 2.776880852437843, | |
| "grad_norm": 0.5758721828460693, | |
| "learning_rate": 0.0004339122939338433, | |
| "loss": 3.5219, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 2.7822624044774513, | |
| "grad_norm": 0.5658400654792786, | |
| "learning_rate": 0.00043358905290378184, | |
| "loss": 3.5397, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 2.7876439565170594, | |
| "grad_norm": 0.6118866205215454, | |
| "learning_rate": 0.0004332658118737205, | |
| "loss": 3.5471, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 2.793025508556668, | |
| "grad_norm": 0.5958803296089172, | |
| "learning_rate": 0.0004329425708436591, | |
| "loss": 3.5447, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 2.798407060596276, | |
| "grad_norm": 0.5699183940887451, | |
| "learning_rate": 0.00043261932981359763, | |
| "loss": 3.5435, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 2.798407060596276, | |
| "eval_accuracy": 0.3682564720994052, | |
| "eval_loss": 3.5144779682159424, | |
| "eval_runtime": 184.444, | |
| "eval_samples_per_second": 97.65, | |
| "eval_steps_per_second": 6.105, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 2.803788612635884, | |
| "grad_norm": 0.5818027257919312, | |
| "learning_rate": 0.0004322960887835362, | |
| "loss": 3.5262, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 2.8091701646754923, | |
| "grad_norm": 0.569835364818573, | |
| "learning_rate": 0.00043197284775347476, | |
| "loss": 3.5344, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 2.8145517167151004, | |
| "grad_norm": 0.6536638140678406, | |
| "learning_rate": 0.00043164960672341336, | |
| "loss": 3.5414, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 2.819933268754709, | |
| "grad_norm": 0.7015666365623474, | |
| "learning_rate": 0.000431326365693352, | |
| "loss": 3.542, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 2.825314820794317, | |
| "grad_norm": 0.5483152270317078, | |
| "learning_rate": 0.00043100312466329055, | |
| "loss": 3.538, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 2.830696372833925, | |
| "grad_norm": 0.5531379580497742, | |
| "learning_rate": 0.00043067988363322914, | |
| "loss": 3.5229, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 2.8360779248735337, | |
| "grad_norm": 0.5912043452262878, | |
| "learning_rate": 0.00043035664260316774, | |
| "loss": 3.5388, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 2.841459476913142, | |
| "grad_norm": 0.6138274669647217, | |
| "learning_rate": 0.0004300334015731063, | |
| "loss": 3.5357, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 2.84684102895275, | |
| "grad_norm": 0.5332802534103394, | |
| "learning_rate": 0.0004297101605430449, | |
| "loss": 3.5257, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 2.852222580992358, | |
| "grad_norm": 0.6212345361709595, | |
| "learning_rate": 0.0004293869195129835, | |
| "loss": 3.517, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 2.857604133031966, | |
| "grad_norm": 0.6443509459495544, | |
| "learning_rate": 0.00042906367848292206, | |
| "loss": 3.5264, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 2.8629856850715747, | |
| "grad_norm": 0.6516796946525574, | |
| "learning_rate": 0.00042874043745286066, | |
| "loss": 3.5297, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 2.868367237111183, | |
| "grad_norm": 0.5682997107505798, | |
| "learning_rate": 0.0004284171964227992, | |
| "loss": 3.5414, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 2.873748789150791, | |
| "grad_norm": 0.5739607810974121, | |
| "learning_rate": 0.0004280939553927378, | |
| "loss": 3.5115, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 2.8791303411903995, | |
| "grad_norm": 0.5442869663238525, | |
| "learning_rate": 0.00042777071436267644, | |
| "loss": 3.5261, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 2.8845118932300076, | |
| "grad_norm": 0.6176914572715759, | |
| "learning_rate": 0.000427447473332615, | |
| "loss": 3.5328, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 2.8898934452696157, | |
| "grad_norm": 0.6462579965591431, | |
| "learning_rate": 0.0004271242323025536, | |
| "loss": 3.5073, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 2.895274997309224, | |
| "grad_norm": 0.5656293630599976, | |
| "learning_rate": 0.00042680099127249217, | |
| "loss": 3.5401, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 2.9006565493488323, | |
| "grad_norm": 0.6997568011283875, | |
| "learning_rate": 0.0004264777502424307, | |
| "loss": 3.5345, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 2.9060381013884404, | |
| "grad_norm": 0.5928633213043213, | |
| "learning_rate": 0.0004261545092123693, | |
| "loss": 3.5322, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 2.9060381013884404, | |
| "eval_accuracy": 0.3696151764615253, | |
| "eval_loss": 3.5033304691314697, | |
| "eval_runtime": 184.2571, | |
| "eval_samples_per_second": 97.749, | |
| "eval_steps_per_second": 6.111, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 2.9114196534280485, | |
| "grad_norm": 0.5869461894035339, | |
| "learning_rate": 0.00042583126818230795, | |
| "loss": 3.5216, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 2.9168012054676566, | |
| "grad_norm": 0.5901129245758057, | |
| "learning_rate": 0.0004255080271522465, | |
| "loss": 3.5241, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 2.922182757507265, | |
| "grad_norm": 0.6579878330230713, | |
| "learning_rate": 0.0004251847861221851, | |
| "loss": 3.5215, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 2.9275643095468733, | |
| "grad_norm": 0.6109535694122314, | |
| "learning_rate": 0.00042486154509212363, | |
| "loss": 3.5239, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 2.9329458615864814, | |
| "grad_norm": 0.6221215724945068, | |
| "learning_rate": 0.0004245383040620622, | |
| "loss": 3.5325, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 2.93832741362609, | |
| "grad_norm": 0.571694552898407, | |
| "learning_rate": 0.0004242150630320009, | |
| "loss": 3.5265, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 2.943708965665698, | |
| "grad_norm": 0.6387854218482971, | |
| "learning_rate": 0.0004238918220019394, | |
| "loss": 3.544, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 2.949090517705306, | |
| "grad_norm": 0.6187459230422974, | |
| "learning_rate": 0.000423568580971878, | |
| "loss": 3.5198, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 2.9544720697449143, | |
| "grad_norm": 0.5934954881668091, | |
| "learning_rate": 0.00042324533994181655, | |
| "loss": 3.5083, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 2.9598536217845224, | |
| "grad_norm": 0.63029545545578, | |
| "learning_rate": 0.00042292209891175514, | |
| "loss": 3.5482, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 2.965235173824131, | |
| "grad_norm": 0.5702903270721436, | |
| "learning_rate": 0.00042259885788169374, | |
| "loss": 3.5337, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 2.970616725863739, | |
| "grad_norm": 0.6199389696121216, | |
| "learning_rate": 0.00042227561685163233, | |
| "loss": 3.5171, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 2.975998277903347, | |
| "grad_norm": 0.5664713978767395, | |
| "learning_rate": 0.00042195237582157093, | |
| "loss": 3.5048, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 2.9813798299429557, | |
| "grad_norm": 0.6108223795890808, | |
| "learning_rate": 0.00042163559961211073, | |
| "loss": 3.5295, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 2.986761381982564, | |
| "grad_norm": 0.6058242917060852, | |
| "learning_rate": 0.00042131235858204933, | |
| "loss": 3.539, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 2.992142934022172, | |
| "grad_norm": 0.5966590046882629, | |
| "learning_rate": 0.0004209891175519879, | |
| "loss": 3.5526, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 2.9975244860617805, | |
| "grad_norm": 0.5852935910224915, | |
| "learning_rate": 0.00042066587652192646, | |
| "loss": 3.5124, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 3.0029060381013886, | |
| "grad_norm": 0.6247649192810059, | |
| "learning_rate": 0.00042034263549186506, | |
| "loss": 3.478, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 3.0082875901409967, | |
| "grad_norm": 0.6025030016899109, | |
| "learning_rate": 0.0004200193944618036, | |
| "loss": 3.4115, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 3.0136691421806048, | |
| "grad_norm": 0.5869174003601074, | |
| "learning_rate": 0.00041969615343174225, | |
| "loss": 3.4226, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 3.0136691421806048, | |
| "eval_accuracy": 0.3702457978224006, | |
| "eval_loss": 3.4976563453674316, | |
| "eval_runtime": 184.0262, | |
| "eval_samples_per_second": 97.872, | |
| "eval_steps_per_second": 6.119, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 3.0190506942202133, | |
| "grad_norm": 0.5895264148712158, | |
| "learning_rate": 0.00041937291240168084, | |
| "loss": 3.4277, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 3.0244322462598214, | |
| "grad_norm": 0.6447648406028748, | |
| "learning_rate": 0.0004190496713716194, | |
| "loss": 3.4232, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 3.0298137982994295, | |
| "grad_norm": 0.6442786455154419, | |
| "learning_rate": 0.000418726430341558, | |
| "loss": 3.4341, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 3.0351953503390376, | |
| "grad_norm": 0.5691360831260681, | |
| "learning_rate": 0.00041840318931149657, | |
| "loss": 3.4111, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 3.040576902378646, | |
| "grad_norm": 0.6102415323257446, | |
| "learning_rate": 0.00041807994828143517, | |
| "loss": 3.4265, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 3.0459584544182543, | |
| "grad_norm": 0.6060606837272644, | |
| "learning_rate": 0.00041775670725137376, | |
| "loss": 3.4497, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 3.0513400064578624, | |
| "grad_norm": 0.580539345741272, | |
| "learning_rate": 0.00041743346622131236, | |
| "loss": 3.4643, | |
| "step": 28350 | |
| }, | |
| { | |
| "epoch": 3.0567215584974705, | |
| "grad_norm": 0.6304591298103333, | |
| "learning_rate": 0.0004171102251912509, | |
| "loss": 3.4302, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 3.062103110537079, | |
| "grad_norm": 0.6201354265213013, | |
| "learning_rate": 0.0004167869841611895, | |
| "loss": 3.4378, | |
| "step": 28450 | |
| }, | |
| { | |
| "epoch": 3.067484662576687, | |
| "grad_norm": 0.6007677316665649, | |
| "learning_rate": 0.00041646374313112803, | |
| "loss": 3.4577, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 3.0728662146162953, | |
| "grad_norm": 0.5703549981117249, | |
| "learning_rate": 0.0004161405021010667, | |
| "loss": 3.4305, | |
| "step": 28550 | |
| }, | |
| { | |
| "epoch": 3.0782477666559034, | |
| "grad_norm": 0.618215024471283, | |
| "learning_rate": 0.0004158172610710053, | |
| "loss": 3.4434, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 3.083629318695512, | |
| "grad_norm": 0.6170952320098877, | |
| "learning_rate": 0.0004154940200409438, | |
| "loss": 3.4533, | |
| "step": 28650 | |
| }, | |
| { | |
| "epoch": 3.08901087073512, | |
| "grad_norm": 0.5749800801277161, | |
| "learning_rate": 0.0004151707790108824, | |
| "loss": 3.4502, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 3.094392422774728, | |
| "grad_norm": 0.6872437596321106, | |
| "learning_rate": 0.00041484753798082095, | |
| "loss": 3.4246, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 3.0997739748143363, | |
| "grad_norm": 0.6020567417144775, | |
| "learning_rate": 0.00041452429695075955, | |
| "loss": 3.4634, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 3.105155526853945, | |
| "grad_norm": 0.6059355735778809, | |
| "learning_rate": 0.0004142010559206982, | |
| "loss": 3.4406, | |
| "step": 28850 | |
| }, | |
| { | |
| "epoch": 3.110537078893553, | |
| "grad_norm": 0.6008569002151489, | |
| "learning_rate": 0.00041387781489063673, | |
| "loss": 3.4548, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 3.115918630933161, | |
| "grad_norm": 0.6149502992630005, | |
| "learning_rate": 0.00041355457386057533, | |
| "loss": 3.4469, | |
| "step": 28950 | |
| }, | |
| { | |
| "epoch": 3.121300182972769, | |
| "grad_norm": 0.5984567999839783, | |
| "learning_rate": 0.0004132313328305139, | |
| "loss": 3.4413, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 3.121300182972769, | |
| "eval_accuracy": 0.3708215494749295, | |
| "eval_loss": 3.495443820953369, | |
| "eval_runtime": 184.593, | |
| "eval_samples_per_second": 97.571, | |
| "eval_steps_per_second": 6.1, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 3.1266817350123777, | |
| "grad_norm": 0.6125328540802002, | |
| "learning_rate": 0.00041290809180045246, | |
| "loss": 3.4512, | |
| "step": 29050 | |
| }, | |
| { | |
| "epoch": 3.132063287051986, | |
| "grad_norm": 0.6699221134185791, | |
| "learning_rate": 0.0004125848507703911, | |
| "loss": 3.4447, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 3.137444839091594, | |
| "grad_norm": 0.5934370160102844, | |
| "learning_rate": 0.0004122616097403297, | |
| "loss": 3.4446, | |
| "step": 29150 | |
| }, | |
| { | |
| "epoch": 3.1428263911312024, | |
| "grad_norm": 0.6435014009475708, | |
| "learning_rate": 0.00041193836871026825, | |
| "loss": 3.4587, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 3.1482079431708105, | |
| "grad_norm": 0.6182610392570496, | |
| "learning_rate": 0.00041161512768020684, | |
| "loss": 3.4422, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 3.1535894952104186, | |
| "grad_norm": 0.6269465088844299, | |
| "learning_rate": 0.0004112918866501454, | |
| "loss": 3.4524, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 3.1589710472500268, | |
| "grad_norm": 0.6223031878471375, | |
| "learning_rate": 0.000410968645620084, | |
| "loss": 3.4615, | |
| "step": 29350 | |
| }, | |
| { | |
| "epoch": 3.1643525992896353, | |
| "grad_norm": 0.5721325874328613, | |
| "learning_rate": 0.0004106454045900226, | |
| "loss": 3.4607, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 3.1697341513292434, | |
| "grad_norm": 0.6590650081634521, | |
| "learning_rate": 0.00041032216355996117, | |
| "loss": 3.4552, | |
| "step": 29450 | |
| }, | |
| { | |
| "epoch": 3.1751157033688515, | |
| "grad_norm": 0.5947334170341492, | |
| "learning_rate": 0.00040999892252989976, | |
| "loss": 3.4323, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 3.1804972554084596, | |
| "grad_norm": 0.5962185859680176, | |
| "learning_rate": 0.00040967568149983836, | |
| "loss": 3.437, | |
| "step": 29550 | |
| }, | |
| { | |
| "epoch": 3.185878807448068, | |
| "grad_norm": 0.6300623416900635, | |
| "learning_rate": 0.0004093524404697769, | |
| "loss": 3.4554, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 3.1912603594876763, | |
| "grad_norm": 0.5935133695602417, | |
| "learning_rate": 0.0004090291994397155, | |
| "loss": 3.4556, | |
| "step": 29650 | |
| }, | |
| { | |
| "epoch": 3.1966419115272844, | |
| "grad_norm": 0.5989574193954468, | |
| "learning_rate": 0.0004087124232302553, | |
| "loss": 3.461, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 3.2020234635668925, | |
| "grad_norm": 0.6037615537643433, | |
| "learning_rate": 0.0004083891822001939, | |
| "loss": 3.4526, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 3.207405015606501, | |
| "grad_norm": 0.644180178642273, | |
| "learning_rate": 0.00040806594117013254, | |
| "loss": 3.4623, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 3.212786567646109, | |
| "grad_norm": 0.5938394665718079, | |
| "learning_rate": 0.0004077427001400711, | |
| "loss": 3.4498, | |
| "step": 29850 | |
| }, | |
| { | |
| "epoch": 3.2181681196857173, | |
| "grad_norm": 0.592949628829956, | |
| "learning_rate": 0.0004074194591100097, | |
| "loss": 3.4375, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 3.2235496717253254, | |
| "grad_norm": 0.5955378413200378, | |
| "learning_rate": 0.0004070962180799482, | |
| "loss": 3.4436, | |
| "step": 29950 | |
| }, | |
| { | |
| "epoch": 3.228931223764934, | |
| "grad_norm": 0.6777014136314392, | |
| "learning_rate": 0.0004067729770498868, | |
| "loss": 3.4657, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 3.228931223764934, | |
| "eval_accuracy": 0.3714358729026326, | |
| "eval_loss": 3.4908430576324463, | |
| "eval_runtime": 184.0934, | |
| "eval_samples_per_second": 97.836, | |
| "eval_steps_per_second": 6.116, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 3.234312775804542, | |
| "grad_norm": 0.6666581034660339, | |
| "learning_rate": 0.00040644973601982546, | |
| "loss": 3.4779, | |
| "step": 30050 | |
| }, | |
| { | |
| "epoch": 3.23969432784415, | |
| "grad_norm": 0.6749293208122253, | |
| "learning_rate": 0.000406126494989764, | |
| "loss": 3.4398, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 3.2450758798837587, | |
| "grad_norm": 0.6298037767410278, | |
| "learning_rate": 0.0004058032539597026, | |
| "loss": 3.4731, | |
| "step": 30150 | |
| }, | |
| { | |
| "epoch": 3.250457431923367, | |
| "grad_norm": 0.6505112648010254, | |
| "learning_rate": 0.00040548001292964114, | |
| "loss": 3.4701, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 3.255838983962975, | |
| "grad_norm": 0.5902146697044373, | |
| "learning_rate": 0.00040515677189957973, | |
| "loss": 3.4507, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 3.261220536002583, | |
| "grad_norm": 0.5966035723686218, | |
| "learning_rate": 0.0004048335308695183, | |
| "loss": 3.4517, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 3.2666020880421915, | |
| "grad_norm": 0.6362846493721008, | |
| "learning_rate": 0.0004045102898394569, | |
| "loss": 3.4426, | |
| "step": 30350 | |
| }, | |
| { | |
| "epoch": 3.2719836400817996, | |
| "grad_norm": 0.6061277985572815, | |
| "learning_rate": 0.0004041870488093955, | |
| "loss": 3.4499, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 3.2773651921214078, | |
| "grad_norm": 0.5965515971183777, | |
| "learning_rate": 0.0004038638077793341, | |
| "loss": 3.4513, | |
| "step": 30450 | |
| }, | |
| { | |
| "epoch": 3.282746744161016, | |
| "grad_norm": 0.6205342411994934, | |
| "learning_rate": 0.00040354056674927265, | |
| "loss": 3.4627, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 3.2881282962006244, | |
| "grad_norm": 0.6279780864715576, | |
| "learning_rate": 0.00040321732571921124, | |
| "loss": 3.4423, | |
| "step": 30550 | |
| }, | |
| { | |
| "epoch": 3.2935098482402325, | |
| "grad_norm": 0.6254119277000427, | |
| "learning_rate": 0.0004028940846891498, | |
| "loss": 3.4628, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 3.2988914002798406, | |
| "grad_norm": 0.6423187851905823, | |
| "learning_rate": 0.00040257084365908843, | |
| "loss": 3.4666, | |
| "step": 30650 | |
| }, | |
| { | |
| "epoch": 3.304272952319449, | |
| "grad_norm": 0.6249606609344482, | |
| "learning_rate": 0.00040224760262902703, | |
| "loss": 3.4677, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 3.3096545043590573, | |
| "grad_norm": 0.5912461876869202, | |
| "learning_rate": 0.00040192436159896557, | |
| "loss": 3.4457, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 3.3150360563986654, | |
| "grad_norm": 0.6003895998001099, | |
| "learning_rate": 0.00040160112056890416, | |
| "loss": 3.458, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 3.3204176084382735, | |
| "grad_norm": 0.6189260482788086, | |
| "learning_rate": 0.00040127787953884276, | |
| "loss": 3.4547, | |
| "step": 30850 | |
| }, | |
| { | |
| "epoch": 3.3257991604778816, | |
| "grad_norm": 0.574907124042511, | |
| "learning_rate": 0.00040095463850878135, | |
| "loss": 3.4408, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 3.33118071251749, | |
| "grad_norm": 0.6639299988746643, | |
| "learning_rate": 0.00040063139747871995, | |
| "loss": 3.4682, | |
| "step": 30950 | |
| }, | |
| { | |
| "epoch": 3.3365622645570983, | |
| "grad_norm": 0.6120445728302002, | |
| "learning_rate": 0.00040030815644865854, | |
| "loss": 3.4539, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 3.3365622645570983, | |
| "eval_accuracy": 0.3720266186536799, | |
| "eval_loss": 3.485776901245117, | |
| "eval_runtime": 184.2468, | |
| "eval_samples_per_second": 97.755, | |
| "eval_steps_per_second": 6.111, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 3.3419438165967064, | |
| "grad_norm": 0.5764549374580383, | |
| "learning_rate": 0.0003999849154185971, | |
| "loss": 3.4518, | |
| "step": 31050 | |
| }, | |
| { | |
| "epoch": 3.347325368636315, | |
| "grad_norm": 0.6696987152099609, | |
| "learning_rate": 0.0003996616743885357, | |
| "loss": 3.4551, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 3.352706920675923, | |
| "grad_norm": 0.6358708143234253, | |
| "learning_rate": 0.0003993384333584742, | |
| "loss": 3.4639, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 3.358088472715531, | |
| "grad_norm": 0.613065242767334, | |
| "learning_rate": 0.00039901519232841287, | |
| "loss": 3.467, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 3.3634700247551392, | |
| "grad_norm": 0.6027072072029114, | |
| "learning_rate": 0.00039869195129835146, | |
| "loss": 3.4519, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 3.368851576794748, | |
| "grad_norm": 0.6306042671203613, | |
| "learning_rate": 0.00039836871026829, | |
| "loss": 3.4532, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 3.374233128834356, | |
| "grad_norm": 0.6109300255775452, | |
| "learning_rate": 0.0003980454692382286, | |
| "loss": 3.4616, | |
| "step": 31350 | |
| }, | |
| { | |
| "epoch": 3.379614680873964, | |
| "grad_norm": 0.553433358669281, | |
| "learning_rate": 0.0003977222282081672, | |
| "loss": 3.4389, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 3.384996232913572, | |
| "grad_norm": 0.6606940031051636, | |
| "learning_rate": 0.00039739898717810573, | |
| "loss": 3.4711, | |
| "step": 31450 | |
| }, | |
| { | |
| "epoch": 3.3903777849531807, | |
| "grad_norm": 0.5818373560905457, | |
| "learning_rate": 0.0003970757461480444, | |
| "loss": 3.4743, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 3.3957593369927888, | |
| "grad_norm": 0.6499767303466797, | |
| "learning_rate": 0.000396752505117983, | |
| "loss": 3.4631, | |
| "step": 31550 | |
| }, | |
| { | |
| "epoch": 3.401140889032397, | |
| "grad_norm": 0.6842148900032043, | |
| "learning_rate": 0.0003964292640879215, | |
| "loss": 3.4525, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 3.4065224410720054, | |
| "grad_norm": 0.6853141784667969, | |
| "learning_rate": 0.0003961060230578601, | |
| "loss": 3.466, | |
| "step": 31650 | |
| }, | |
| { | |
| "epoch": 3.4119039931116135, | |
| "grad_norm": 0.5986456274986267, | |
| "learning_rate": 0.00039578278202779865, | |
| "loss": 3.4655, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 3.4172855451512216, | |
| "grad_norm": 0.6357953548431396, | |
| "learning_rate": 0.0003954660058183385, | |
| "loss": 3.4552, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 3.4226670971908297, | |
| "grad_norm": 0.5822431445121765, | |
| "learning_rate": 0.00039514276478827705, | |
| "loss": 3.4714, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 3.428048649230438, | |
| "grad_norm": 0.5881465077400208, | |
| "learning_rate": 0.0003948195237582157, | |
| "loss": 3.4785, | |
| "step": 31850 | |
| }, | |
| { | |
| "epoch": 3.4334302012700464, | |
| "grad_norm": 0.658664345741272, | |
| "learning_rate": 0.0003944962827281543, | |
| "loss": 3.4438, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 3.4388117533096545, | |
| "grad_norm": 0.6199131011962891, | |
| "learning_rate": 0.00039417304169809284, | |
| "loss": 3.4477, | |
| "step": 31950 | |
| }, | |
| { | |
| "epoch": 3.4441933053492626, | |
| "grad_norm": 0.5990483164787292, | |
| "learning_rate": 0.00039384980066803143, | |
| "loss": 3.4536, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 3.4441933053492626, | |
| "eval_accuracy": 0.3732962276477921, | |
| "eval_loss": 3.474824905395508, | |
| "eval_runtime": 184.3175, | |
| "eval_samples_per_second": 97.717, | |
| "eval_steps_per_second": 6.109, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 3.449574857388871, | |
| "grad_norm": 0.6051961779594421, | |
| "learning_rate": 0.00039352655963796997, | |
| "loss": 3.4445, | |
| "step": 32050 | |
| }, | |
| { | |
| "epoch": 3.4549564094284793, | |
| "grad_norm": 0.6444359421730042, | |
| "learning_rate": 0.00039320331860790857, | |
| "loss": 3.4698, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 3.4603379614680874, | |
| "grad_norm": 0.6289820671081543, | |
| "learning_rate": 0.0003928800775778472, | |
| "loss": 3.476, | |
| "step": 32150 | |
| }, | |
| { | |
| "epoch": 3.4657195135076955, | |
| "grad_norm": 0.629337728023529, | |
| "learning_rate": 0.00039255683654778576, | |
| "loss": 3.4475, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 3.471101065547304, | |
| "grad_norm": 0.7152823805809021, | |
| "learning_rate": 0.00039223359551772435, | |
| "loss": 3.442, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 3.476482617586912, | |
| "grad_norm": 0.6605930924415588, | |
| "learning_rate": 0.00039191035448766294, | |
| "loss": 3.4607, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 3.4818641696265202, | |
| "grad_norm": 0.6295269131660461, | |
| "learning_rate": 0.0003915871134576015, | |
| "loss": 3.4762, | |
| "step": 32350 | |
| }, | |
| { | |
| "epoch": 3.4872457216661283, | |
| "grad_norm": 0.6065512895584106, | |
| "learning_rate": 0.0003912638724275401, | |
| "loss": 3.4594, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 3.492627273705737, | |
| "grad_norm": 0.6755715608596802, | |
| "learning_rate": 0.00039094063139747873, | |
| "loss": 3.4502, | |
| "step": 32450 | |
| }, | |
| { | |
| "epoch": 3.498008825745345, | |
| "grad_norm": 0.6711599230766296, | |
| "learning_rate": 0.00039061739036741727, | |
| "loss": 3.4669, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 3.503390377784953, | |
| "grad_norm": 0.627836287021637, | |
| "learning_rate": 0.00039029414933735586, | |
| "loss": 3.472, | |
| "step": 32550 | |
| }, | |
| { | |
| "epoch": 3.5087719298245617, | |
| "grad_norm": 0.5943620204925537, | |
| "learning_rate": 0.0003899709083072944, | |
| "loss": 3.4326, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 3.5141534818641698, | |
| "grad_norm": 0.6491729617118835, | |
| "learning_rate": 0.000389647667277233, | |
| "loss": 3.4653, | |
| "step": 32650 | |
| }, | |
| { | |
| "epoch": 3.519535033903778, | |
| "grad_norm": 0.6225029230117798, | |
| "learning_rate": 0.00038932442624717165, | |
| "loss": 3.4569, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 3.524916585943386, | |
| "grad_norm": 0.6100902557373047, | |
| "learning_rate": 0.0003890011852171102, | |
| "loss": 3.4583, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 3.530298137982994, | |
| "grad_norm": 0.632154643535614, | |
| "learning_rate": 0.0003886779441870488, | |
| "loss": 3.4669, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 3.5356796900226026, | |
| "grad_norm": 0.6418779492378235, | |
| "learning_rate": 0.0003883547031569874, | |
| "loss": 3.4561, | |
| "step": 32850 | |
| }, | |
| { | |
| "epoch": 3.5410612420622107, | |
| "grad_norm": 0.607397735118866, | |
| "learning_rate": 0.0003880314621269259, | |
| "loss": 3.4731, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 3.546442794101819, | |
| "grad_norm": 0.596447765827179, | |
| "learning_rate": 0.0003877082210968645, | |
| "loss": 3.4619, | |
| "step": 32950 | |
| }, | |
| { | |
| "epoch": 3.5518243461414274, | |
| "grad_norm": 0.5973241329193115, | |
| "learning_rate": 0.00038738498006680316, | |
| "loss": 3.4635, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 3.5518243461414274, | |
| "eval_accuracy": 0.37345877236796254, | |
| "eval_loss": 3.470689296722412, | |
| "eval_runtime": 184.1157, | |
| "eval_samples_per_second": 97.824, | |
| "eval_steps_per_second": 6.116, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 3.5572058981810355, | |
| "grad_norm": 0.6128191947937012, | |
| "learning_rate": 0.0003870617390367417, | |
| "loss": 3.4471, | |
| "step": 33050 | |
| }, | |
| { | |
| "epoch": 3.5625874502206436, | |
| "grad_norm": 0.6342182755470276, | |
| "learning_rate": 0.0003867384980066803, | |
| "loss": 3.4383, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 3.5679690022602517, | |
| "grad_norm": 0.6245155334472656, | |
| "learning_rate": 0.00038641525697661884, | |
| "loss": 3.4491, | |
| "step": 33150 | |
| }, | |
| { | |
| "epoch": 3.57335055429986, | |
| "grad_norm": 0.6510711312294006, | |
| "learning_rate": 0.00038609201594655743, | |
| "loss": 3.4695, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 3.5787321063394684, | |
| "grad_norm": 0.5803980231285095, | |
| "learning_rate": 0.000385768774916496, | |
| "loss": 3.4374, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 3.5841136583790765, | |
| "grad_norm": 0.6799480319023132, | |
| "learning_rate": 0.0003854455338864346, | |
| "loss": 3.4564, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 3.5894952104186846, | |
| "grad_norm": 0.5653195381164551, | |
| "learning_rate": 0.0003851287576769744, | |
| "loss": 3.4536, | |
| "step": 33350 | |
| }, | |
| { | |
| "epoch": 3.594876762458293, | |
| "grad_norm": 0.5928677916526794, | |
| "learning_rate": 0.000384805516646913, | |
| "loss": 3.4688, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 3.6002583144979012, | |
| "grad_norm": 0.6252151727676392, | |
| "learning_rate": 0.0003844822756168516, | |
| "loss": 3.4489, | |
| "step": 33450 | |
| }, | |
| { | |
| "epoch": 3.6056398665375093, | |
| "grad_norm": 0.6133690476417542, | |
| "learning_rate": 0.00038415903458679016, | |
| "loss": 3.452, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 3.611021418577118, | |
| "grad_norm": 0.6196562647819519, | |
| "learning_rate": 0.00038383579355672875, | |
| "loss": 3.4762, | |
| "step": 33550 | |
| }, | |
| { | |
| "epoch": 3.616402970616726, | |
| "grad_norm": 0.5981706380844116, | |
| "learning_rate": 0.00038351255252666735, | |
| "loss": 3.4517, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 3.621784522656334, | |
| "grad_norm": 0.6419304609298706, | |
| "learning_rate": 0.00038318931149660594, | |
| "loss": 3.4487, | |
| "step": 33650 | |
| }, | |
| { | |
| "epoch": 3.627166074695942, | |
| "grad_norm": 0.6731728315353394, | |
| "learning_rate": 0.00038286607046654454, | |
| "loss": 3.4649, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 3.6325476267355503, | |
| "grad_norm": 0.6518445014953613, | |
| "learning_rate": 0.00038254282943648313, | |
| "loss": 3.4609, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 3.637929178775159, | |
| "grad_norm": 0.6324836015701294, | |
| "learning_rate": 0.00038221958840642167, | |
| "loss": 3.4645, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 3.643310730814767, | |
| "grad_norm": 0.6253151297569275, | |
| "learning_rate": 0.00038189634737636027, | |
| "loss": 3.4673, | |
| "step": 33850 | |
| }, | |
| { | |
| "epoch": 3.648692282854375, | |
| "grad_norm": 0.6527857184410095, | |
| "learning_rate": 0.0003815731063462988, | |
| "loss": 3.4524, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 3.6540738348939836, | |
| "grad_norm": 0.6473141312599182, | |
| "learning_rate": 0.00038124986531623745, | |
| "loss": 3.4641, | |
| "step": 33950 | |
| }, | |
| { | |
| "epoch": 3.6594553869335917, | |
| "grad_norm": 0.6557619571685791, | |
| "learning_rate": 0.00038092662428617605, | |
| "loss": 3.4613, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 3.6594553869335917, | |
| "eval_accuracy": 0.3743220195616486, | |
| "eval_loss": 3.46305513381958, | |
| "eval_runtime": 184.4765, | |
| "eval_samples_per_second": 97.633, | |
| "eval_steps_per_second": 6.104, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 3.6648369389732, | |
| "grad_norm": 0.6722961664199829, | |
| "learning_rate": 0.0003806033832561146, | |
| "loss": 3.4783, | |
| "step": 34050 | |
| }, | |
| { | |
| "epoch": 3.670218491012808, | |
| "grad_norm": 0.6513126492500305, | |
| "learning_rate": 0.0003802801422260532, | |
| "loss": 3.4677, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 3.675600043052416, | |
| "grad_norm": 0.6127039790153503, | |
| "learning_rate": 0.0003799569011959918, | |
| "loss": 3.4632, | |
| "step": 34150 | |
| }, | |
| { | |
| "epoch": 3.6809815950920246, | |
| "grad_norm": 0.6399204730987549, | |
| "learning_rate": 0.0003796336601659303, | |
| "loss": 3.4421, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 3.6863631471316327, | |
| "grad_norm": 0.6492198705673218, | |
| "learning_rate": 0.00037931041913586897, | |
| "loss": 3.4609, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 3.691744699171241, | |
| "grad_norm": 0.5957018136978149, | |
| "learning_rate": 0.00037898717810580756, | |
| "loss": 3.4562, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 3.6971262512108494, | |
| "grad_norm": 0.6067306399345398, | |
| "learning_rate": 0.0003786639370757461, | |
| "loss": 3.436, | |
| "step": 34350 | |
| }, | |
| { | |
| "epoch": 3.7025078032504575, | |
| "grad_norm": 0.6163427829742432, | |
| "learning_rate": 0.0003783406960456847, | |
| "loss": 3.4668, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 3.7078893552900656, | |
| "grad_norm": 0.6470581889152527, | |
| "learning_rate": 0.00037801745501562324, | |
| "loss": 3.4397, | |
| "step": 34450 | |
| }, | |
| { | |
| "epoch": 3.713270907329674, | |
| "grad_norm": 0.6045754551887512, | |
| "learning_rate": 0.0003776942139855619, | |
| "loss": 3.4499, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 3.7186524593692822, | |
| "grad_norm": 0.6307573914527893, | |
| "learning_rate": 0.0003773709729555005, | |
| "loss": 3.4591, | |
| "step": 34550 | |
| }, | |
| { | |
| "epoch": 3.7240340114088903, | |
| "grad_norm": 0.6075461506843567, | |
| "learning_rate": 0.000377047731925439, | |
| "loss": 3.4555, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 3.7294155634484984, | |
| "grad_norm": 0.6341610550880432, | |
| "learning_rate": 0.0003767244908953776, | |
| "loss": 3.4448, | |
| "step": 34650 | |
| }, | |
| { | |
| "epoch": 3.7347971154881066, | |
| "grad_norm": 0.6407119035720825, | |
| "learning_rate": 0.0003764012498653162, | |
| "loss": 3.4506, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 3.740178667527715, | |
| "grad_norm": 0.6549863815307617, | |
| "learning_rate": 0.00037607800883525475, | |
| "loss": 3.4556, | |
| "step": 34750 | |
| }, | |
| { | |
| "epoch": 3.745560219567323, | |
| "grad_norm": 0.6298230290412903, | |
| "learning_rate": 0.0003757547678051934, | |
| "loss": 3.4343, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 3.7509417716069313, | |
| "grad_norm": 0.6664312481880188, | |
| "learning_rate": 0.00037543799159573315, | |
| "loss": 3.453, | |
| "step": 34850 | |
| }, | |
| { | |
| "epoch": 3.75632332364654, | |
| "grad_norm": 0.6711027026176453, | |
| "learning_rate": 0.0003751147505656718, | |
| "loss": 3.4654, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 3.761704875686148, | |
| "grad_norm": 0.6025902032852173, | |
| "learning_rate": 0.00037479150953561034, | |
| "loss": 3.4561, | |
| "step": 34950 | |
| }, | |
| { | |
| "epoch": 3.767086427725756, | |
| "grad_norm": 0.5972911715507507, | |
| "learning_rate": 0.00037446826850554894, | |
| "loss": 3.4663, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 3.767086427725756, | |
| "eval_accuracy": 0.3749463390550306, | |
| "eval_loss": 3.4594063758850098, | |
| "eval_runtime": 184.468, | |
| "eval_samples_per_second": 97.638, | |
| "eval_steps_per_second": 6.104, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 3.772467979765364, | |
| "grad_norm": 0.6819879412651062, | |
| "learning_rate": 0.00037414502747548753, | |
| "loss": 3.4476, | |
| "step": 35050 | |
| }, | |
| { | |
| "epoch": 3.7778495318049723, | |
| "grad_norm": 0.6131329536437988, | |
| "learning_rate": 0.00037382178644542607, | |
| "loss": 3.4533, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 3.783231083844581, | |
| "grad_norm": 0.661630392074585, | |
| "learning_rate": 0.00037349854541536467, | |
| "loss": 3.4407, | |
| "step": 35150 | |
| }, | |
| { | |
| "epoch": 3.788612635884189, | |
| "grad_norm": 0.6314149498939514, | |
| "learning_rate": 0.0003731753043853033, | |
| "loss": 3.4782, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 3.793994187923797, | |
| "grad_norm": 0.5939237475395203, | |
| "learning_rate": 0.00037285206335524186, | |
| "loss": 3.4645, | |
| "step": 35250 | |
| }, | |
| { | |
| "epoch": 3.7993757399634056, | |
| "grad_norm": 0.6490049958229065, | |
| "learning_rate": 0.00037252882232518045, | |
| "loss": 3.4433, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 3.8047572920030137, | |
| "grad_norm": 0.5956017374992371, | |
| "learning_rate": 0.000372205581295119, | |
| "loss": 3.4523, | |
| "step": 35350 | |
| }, | |
| { | |
| "epoch": 3.810138844042622, | |
| "grad_norm": 0.5964668989181519, | |
| "learning_rate": 0.0003718823402650576, | |
| "loss": 3.4486, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 3.8155203960822304, | |
| "grad_norm": 0.6286461353302002, | |
| "learning_rate": 0.00037155909923499624, | |
| "loss": 3.4649, | |
| "step": 35450 | |
| }, | |
| { | |
| "epoch": 3.8209019481218385, | |
| "grad_norm": 0.700835108757019, | |
| "learning_rate": 0.0003712358582049348, | |
| "loss": 3.4379, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 3.8262835001614466, | |
| "grad_norm": 0.6382637023925781, | |
| "learning_rate": 0.00037091261717487337, | |
| "loss": 3.4416, | |
| "step": 35550 | |
| }, | |
| { | |
| "epoch": 3.8316650522010547, | |
| "grad_norm": 0.6375047564506531, | |
| "learning_rate": 0.00037058937614481197, | |
| "loss": 3.4508, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 3.837046604240663, | |
| "grad_norm": 0.6712459325790405, | |
| "learning_rate": 0.0003702661351147505, | |
| "loss": 3.4532, | |
| "step": 35650 | |
| }, | |
| { | |
| "epoch": 3.8424281562802713, | |
| "grad_norm": 0.6061450839042664, | |
| "learning_rate": 0.0003699428940846891, | |
| "loss": 3.4675, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 3.8478097083198795, | |
| "grad_norm": 0.5931434035301208, | |
| "learning_rate": 0.00036961965305462775, | |
| "loss": 3.4356, | |
| "step": 35750 | |
| }, | |
| { | |
| "epoch": 3.8531912603594876, | |
| "grad_norm": 0.6473008394241333, | |
| "learning_rate": 0.0003692964120245663, | |
| "loss": 3.4576, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 3.858572812399096, | |
| "grad_norm": 0.6027951240539551, | |
| "learning_rate": 0.0003689731709945049, | |
| "loss": 3.4624, | |
| "step": 35850 | |
| }, | |
| { | |
| "epoch": 3.863954364438704, | |
| "grad_norm": 0.644424557685852, | |
| "learning_rate": 0.0003686499299644434, | |
| "loss": 3.4491, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 3.8693359164783123, | |
| "grad_norm": 0.6382813453674316, | |
| "learning_rate": 0.000368326688934382, | |
| "loss": 3.4447, | |
| "step": 35950 | |
| }, | |
| { | |
| "epoch": 3.8747174685179204, | |
| "grad_norm": 0.6464617848396301, | |
| "learning_rate": 0.0003680034479043206, | |
| "loss": 3.4611, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 3.8747174685179204, | |
| "eval_accuracy": 0.3756499751565172, | |
| "eval_loss": 3.4547743797302246, | |
| "eval_runtime": 184.1244, | |
| "eval_samples_per_second": 97.82, | |
| "eval_steps_per_second": 6.115, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 3.8800990205575285, | |
| "grad_norm": 0.6092334985733032, | |
| "learning_rate": 0.0003676802068742592, | |
| "loss": 3.4632, | |
| "step": 36050 | |
| }, | |
| { | |
| "epoch": 3.885480572597137, | |
| "grad_norm": 0.6260703206062317, | |
| "learning_rate": 0.0003673569658441978, | |
| "loss": 3.4586, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 3.890862124636745, | |
| "grad_norm": 0.6519879102706909, | |
| "learning_rate": 0.0003670337248141364, | |
| "loss": 3.4441, | |
| "step": 36150 | |
| }, | |
| { | |
| "epoch": 3.8962436766763533, | |
| "grad_norm": 0.6434177756309509, | |
| "learning_rate": 0.00036671048378407494, | |
| "loss": 3.47, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 3.901625228715962, | |
| "grad_norm": 0.7166149020195007, | |
| "learning_rate": 0.00036638724275401353, | |
| "loss": 3.4527, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 3.90700678075557, | |
| "grad_norm": 0.6426181197166443, | |
| "learning_rate": 0.0003660640017239522, | |
| "loss": 3.4641, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 3.912388332795178, | |
| "grad_norm": 0.6303426027297974, | |
| "learning_rate": 0.0003657407606938907, | |
| "loss": 3.4504, | |
| "step": 36350 | |
| }, | |
| { | |
| "epoch": 3.9177698848347866, | |
| "grad_norm": 0.6063511967658997, | |
| "learning_rate": 0.0003654175196638293, | |
| "loss": 3.4319, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 3.9231514368743947, | |
| "grad_norm": 0.6871627569198608, | |
| "learning_rate": 0.00036509427863376786, | |
| "loss": 3.4317, | |
| "step": 36450 | |
| }, | |
| { | |
| "epoch": 3.928532988914003, | |
| "grad_norm": 0.5857812762260437, | |
| "learning_rate": 0.00036477103760370645, | |
| "loss": 3.455, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 3.933914540953611, | |
| "grad_norm": 0.6200795769691467, | |
| "learning_rate": 0.00036444779657364505, | |
| "loss": 3.4462, | |
| "step": 36550 | |
| }, | |
| { | |
| "epoch": 3.939296092993219, | |
| "grad_norm": 0.5892250537872314, | |
| "learning_rate": 0.00036412455554358364, | |
| "loss": 3.4636, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 3.9446776450328276, | |
| "grad_norm": 0.5856645107269287, | |
| "learning_rate": 0.00036380131451352224, | |
| "loss": 3.435, | |
| "step": 36650 | |
| }, | |
| { | |
| "epoch": 3.9500591970724357, | |
| "grad_norm": 0.6464253664016724, | |
| "learning_rate": 0.00036347807348346083, | |
| "loss": 3.4561, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 3.955440749112044, | |
| "grad_norm": 0.6302794218063354, | |
| "learning_rate": 0.00036315483245339937, | |
| "loss": 3.4688, | |
| "step": 36750 | |
| }, | |
| { | |
| "epoch": 3.9608223011516523, | |
| "grad_norm": 0.6558413505554199, | |
| "learning_rate": 0.00036283159142333797, | |
| "loss": 3.4551, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 3.9662038531912605, | |
| "grad_norm": 0.659332811832428, | |
| "learning_rate": 0.0003625083503932765, | |
| "loss": 3.4508, | |
| "step": 36850 | |
| }, | |
| { | |
| "epoch": 3.9715854052308686, | |
| "grad_norm": 0.6148266792297363, | |
| "learning_rate": 0.00036218510936321516, | |
| "loss": 3.4569, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 3.9769669572704767, | |
| "grad_norm": 0.6297675967216492, | |
| "learning_rate": 0.00036186186833315375, | |
| "loss": 3.4486, | |
| "step": 36950 | |
| }, | |
| { | |
| "epoch": 3.9823485093100848, | |
| "grad_norm": 0.653570294380188, | |
| "learning_rate": 0.0003615386273030923, | |
| "loss": 3.4587, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 3.9823485093100848, | |
| "eval_accuracy": 0.375846310924798, | |
| "eval_loss": 3.4468870162963867, | |
| "eval_runtime": 184.3986, | |
| "eval_samples_per_second": 97.674, | |
| "eval_steps_per_second": 6.106, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 3.9877300613496933, | |
| "grad_norm": 0.6133513450622559, | |
| "learning_rate": 0.0003612153862730309, | |
| "loss": 3.4321, | |
| "step": 37050 | |
| }, | |
| { | |
| "epoch": 3.9931116133893014, | |
| "grad_norm": 0.6079823970794678, | |
| "learning_rate": 0.0003608921452429695, | |
| "loss": 3.4477, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 3.9984931654289095, | |
| "grad_norm": 0.6801696419715881, | |
| "learning_rate": 0.000360568904212908, | |
| "loss": 3.4466, | |
| "step": 37150 | |
| }, | |
| { | |
| "epoch": 4.003874717468518, | |
| "grad_norm": 0.68626469373703, | |
| "learning_rate": 0.00036024566318284667, | |
| "loss": 3.3731, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 4.009256269508126, | |
| "grad_norm": 0.6593034863471985, | |
| "learning_rate": 0.00035992242215278526, | |
| "loss": 3.3572, | |
| "step": 37250 | |
| }, | |
| { | |
| "epoch": 4.014637821547734, | |
| "grad_norm": 0.6391949653625488, | |
| "learning_rate": 0.0003595991811227238, | |
| "loss": 3.3597, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 4.020019373587343, | |
| "grad_norm": 0.59788578748703, | |
| "learning_rate": 0.0003592759400926624, | |
| "loss": 3.3676, | |
| "step": 37350 | |
| }, | |
| { | |
| "epoch": 4.0254009256269505, | |
| "grad_norm": 0.6144354939460754, | |
| "learning_rate": 0.00035895269906260094, | |
| "loss": 3.3513, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 4.030782477666559, | |
| "grad_norm": 0.6309810280799866, | |
| "learning_rate": 0.0003586294580325396, | |
| "loss": 3.3733, | |
| "step": 37450 | |
| }, | |
| { | |
| "epoch": 4.036164029706168, | |
| "grad_norm": 0.6031246185302734, | |
| "learning_rate": 0.0003583062170024782, | |
| "loss": 3.3727, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 4.041545581745775, | |
| "grad_norm": 0.6295172572135925, | |
| "learning_rate": 0.0003579829759724167, | |
| "loss": 3.3594, | |
| "step": 37550 | |
| }, | |
| { | |
| "epoch": 4.046927133785384, | |
| "grad_norm": 0.6725473403930664, | |
| "learning_rate": 0.0003576597349423553, | |
| "loss": 3.3678, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 4.0523086858249915, | |
| "grad_norm": 0.667570948600769, | |
| "learning_rate": 0.0003573364939122939, | |
| "loss": 3.3745, | |
| "step": 37650 | |
| }, | |
| { | |
| "epoch": 4.0576902378646, | |
| "grad_norm": 0.7284749746322632, | |
| "learning_rate": 0.00035701325288223245, | |
| "loss": 3.3634, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 4.063071789904209, | |
| "grad_norm": 0.6347604990005493, | |
| "learning_rate": 0.0003566900118521711, | |
| "loss": 3.3548, | |
| "step": 37750 | |
| }, | |
| { | |
| "epoch": 4.068453341943816, | |
| "grad_norm": 0.6340035200119019, | |
| "learning_rate": 0.0003563667708221097, | |
| "loss": 3.336, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 4.073834893983425, | |
| "grad_norm": 0.6336855292320251, | |
| "learning_rate": 0.00035604352979204824, | |
| "loss": 3.3733, | |
| "step": 37850 | |
| }, | |
| { | |
| "epoch": 4.079216446023033, | |
| "grad_norm": 0.6538589596748352, | |
| "learning_rate": 0.00035572028876198683, | |
| "loss": 3.3753, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 4.084597998062641, | |
| "grad_norm": 0.6248359084129333, | |
| "learning_rate": 0.00035539704773192537, | |
| "loss": 3.38, | |
| "step": 37950 | |
| }, | |
| { | |
| "epoch": 4.08997955010225, | |
| "grad_norm": 0.7011412978172302, | |
| "learning_rate": 0.00035507380670186397, | |
| "loss": 3.3805, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 4.08997955010225, | |
| "eval_accuracy": 0.3764695438893018, | |
| "eval_loss": 3.4512176513671875, | |
| "eval_runtime": 184.2744, | |
| "eval_samples_per_second": 97.74, | |
| "eval_steps_per_second": 6.11, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 4.095361102141858, | |
| "grad_norm": 0.6123966574668884, | |
| "learning_rate": 0.0003547505656718026, | |
| "loss": 3.3756, | |
| "step": 38050 | |
| }, | |
| { | |
| "epoch": 4.100742654181466, | |
| "grad_norm": 0.6666425466537476, | |
| "learning_rate": 0.00035442732464174116, | |
| "loss": 3.3918, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 4.106124206221074, | |
| "grad_norm": 0.6175438761711121, | |
| "learning_rate": 0.00035410408361167975, | |
| "loss": 3.3734, | |
| "step": 38150 | |
| }, | |
| { | |
| "epoch": 4.111505758260682, | |
| "grad_norm": 0.6974527835845947, | |
| "learning_rate": 0.0003537808425816183, | |
| "loss": 3.3812, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 4.1168873103002905, | |
| "grad_norm": 0.615986168384552, | |
| "learning_rate": 0.0003534576015515569, | |
| "loss": 3.3619, | |
| "step": 38250 | |
| }, | |
| { | |
| "epoch": 4.122268862339899, | |
| "grad_norm": 0.6702946424484253, | |
| "learning_rate": 0.00035313436052149553, | |
| "loss": 3.3741, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 4.127650414379507, | |
| "grad_norm": 0.6737379431724548, | |
| "learning_rate": 0.0003528111194914341, | |
| "loss": 3.3716, | |
| "step": 38350 | |
| }, | |
| { | |
| "epoch": 4.133031966419115, | |
| "grad_norm": 0.6457754969596863, | |
| "learning_rate": 0.00035248787846137267, | |
| "loss": 3.3738, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 4.138413518458724, | |
| "grad_norm": 0.7151730060577393, | |
| "learning_rate": 0.00035216463743131126, | |
| "loss": 3.3621, | |
| "step": 38450 | |
| }, | |
| { | |
| "epoch": 4.1437950704983315, | |
| "grad_norm": 0.6836617588996887, | |
| "learning_rate": 0.0003518413964012498, | |
| "loss": 3.3906, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 4.14917662253794, | |
| "grad_norm": 0.6597222685813904, | |
| "learning_rate": 0.0003515181553711884, | |
| "loss": 3.3895, | |
| "step": 38550 | |
| }, | |
| { | |
| "epoch": 4.154558174577549, | |
| "grad_norm": 0.6961684823036194, | |
| "learning_rate": 0.00035119491434112705, | |
| "loss": 3.3872, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 4.159939726617156, | |
| "grad_norm": 0.7248278856277466, | |
| "learning_rate": 0.0003508716733110656, | |
| "loss": 3.375, | |
| "step": 38650 | |
| }, | |
| { | |
| "epoch": 4.165321278656765, | |
| "grad_norm": 0.6473908424377441, | |
| "learning_rate": 0.0003505484322810042, | |
| "loss": 3.3844, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 4.1707028306963725, | |
| "grad_norm": 0.6644169688224792, | |
| "learning_rate": 0.0003502251912509427, | |
| "loss": 3.3946, | |
| "step": 38750 | |
| }, | |
| { | |
| "epoch": 4.176084382735981, | |
| "grad_norm": 0.6361338496208191, | |
| "learning_rate": 0.0003499019502208813, | |
| "loss": 3.3738, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 4.18146593477559, | |
| "grad_norm": 0.6182463765144348, | |
| "learning_rate": 0.0003495787091908199, | |
| "loss": 3.3672, | |
| "step": 38850 | |
| }, | |
| { | |
| "epoch": 4.186847486815197, | |
| "grad_norm": 0.691481351852417, | |
| "learning_rate": 0.0003492554681607585, | |
| "loss": 3.3662, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 4.192229038854806, | |
| "grad_norm": 0.6630625128746033, | |
| "learning_rate": 0.0003489322271306971, | |
| "loss": 3.3683, | |
| "step": 38950 | |
| }, | |
| { | |
| "epoch": 4.197610590894414, | |
| "grad_norm": 0.6156312823295593, | |
| "learning_rate": 0.0003486089861006357, | |
| "loss": 3.3817, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 4.197610590894414, | |
| "eval_accuracy": 0.3773192094720111, | |
| "eval_loss": 3.443570613861084, | |
| "eval_runtime": 184.3057, | |
| "eval_samples_per_second": 97.724, | |
| "eval_steps_per_second": 6.109, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 4.202992142934022, | |
| "grad_norm": 0.6532960534095764, | |
| "learning_rate": 0.00034828574507057424, | |
| "loss": 3.3671, | |
| "step": 39050 | |
| }, | |
| { | |
| "epoch": 4.208373694973631, | |
| "grad_norm": 0.7191522121429443, | |
| "learning_rate": 0.00034796250404051283, | |
| "loss": 3.3788, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 4.213755247013238, | |
| "grad_norm": 0.6334662437438965, | |
| "learning_rate": 0.00034763926301045137, | |
| "loss": 3.3782, | |
| "step": 39150 | |
| }, | |
| { | |
| "epoch": 4.219136799052847, | |
| "grad_norm": 0.6378034949302673, | |
| "learning_rate": 0.00034731602198039, | |
| "loss": 3.3868, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 4.224518351092455, | |
| "grad_norm": 0.706544816493988, | |
| "learning_rate": 0.0003469927809503286, | |
| "loss": 3.3835, | |
| "step": 39250 | |
| }, | |
| { | |
| "epoch": 4.229899903132063, | |
| "grad_norm": 0.6315046548843384, | |
| "learning_rate": 0.00034666953992026716, | |
| "loss": 3.3966, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 4.2352814551716715, | |
| "grad_norm": 0.6523990035057068, | |
| "learning_rate": 0.00034634629889020575, | |
| "loss": 3.3841, | |
| "step": 39350 | |
| }, | |
| { | |
| "epoch": 4.24066300721128, | |
| "grad_norm": 0.693922758102417, | |
| "learning_rate": 0.00034602305786014435, | |
| "loss": 3.3804, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 4.246044559250888, | |
| "grad_norm": 0.61918044090271, | |
| "learning_rate": 0.00034569981683008294, | |
| "loss": 3.3935, | |
| "step": 39450 | |
| }, | |
| { | |
| "epoch": 4.251426111290496, | |
| "grad_norm": 0.6147217154502869, | |
| "learning_rate": 0.00034537657580002154, | |
| "loss": 3.3864, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 4.256807663330104, | |
| "grad_norm": 0.6341997981071472, | |
| "learning_rate": 0.00034505333476996013, | |
| "loss": 3.3893, | |
| "step": 39550 | |
| }, | |
| { | |
| "epoch": 4.2621892153697125, | |
| "grad_norm": 0.7464113831520081, | |
| "learning_rate": 0.00034473009373989867, | |
| "loss": 3.3762, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 4.267570767409321, | |
| "grad_norm": 0.6500197052955627, | |
| "learning_rate": 0.00034440685270983727, | |
| "loss": 3.387, | |
| "step": 39650 | |
| }, | |
| { | |
| "epoch": 4.272952319448929, | |
| "grad_norm": 0.7259834408760071, | |
| "learning_rate": 0.0003440836116797758, | |
| "loss": 3.3841, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 4.278333871488537, | |
| "grad_norm": 0.5935174226760864, | |
| "learning_rate": 0.00034376037064971445, | |
| "loss": 3.3802, | |
| "step": 39750 | |
| }, | |
| { | |
| "epoch": 4.283715423528146, | |
| "grad_norm": 0.6306664943695068, | |
| "learning_rate": 0.00034343712961965305, | |
| "loss": 3.391, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 4.2890969755677535, | |
| "grad_norm": 0.6982831358909607, | |
| "learning_rate": 0.0003431138885895916, | |
| "loss": 3.3919, | |
| "step": 39850 | |
| }, | |
| { | |
| "epoch": 4.294478527607362, | |
| "grad_norm": 0.7150142192840576, | |
| "learning_rate": 0.0003427906475595302, | |
| "loss": 3.3797, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 4.299860079646971, | |
| "grad_norm": 0.7036126255989075, | |
| "learning_rate": 0.0003424674065294688, | |
| "loss": 3.388, | |
| "step": 39950 | |
| }, | |
| { | |
| "epoch": 4.305241631686578, | |
| "grad_norm": 0.6505343914031982, | |
| "learning_rate": 0.0003421441654994073, | |
| "loss": 3.4009, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 4.305241631686578, | |
| "eval_accuracy": 0.3774062404351505, | |
| "eval_loss": 3.44183349609375, | |
| "eval_runtime": 184.1548, | |
| "eval_samples_per_second": 97.804, | |
| "eval_steps_per_second": 6.114, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 4.310623183726187, | |
| "grad_norm": 0.6343244314193726, | |
| "learning_rate": 0.00034182092446934597, | |
| "loss": 3.3809, | |
| "step": 40050 | |
| }, | |
| { | |
| "epoch": 4.3160047357657945, | |
| "grad_norm": 0.6463423371315002, | |
| "learning_rate": 0.00034149768343928456, | |
| "loss": 3.398, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 4.321386287805403, | |
| "grad_norm": 0.6768094301223755, | |
| "learning_rate": 0.0003411744424092231, | |
| "loss": 3.3764, | |
| "step": 40150 | |
| }, | |
| { | |
| "epoch": 4.326767839845012, | |
| "grad_norm": 0.6386173963546753, | |
| "learning_rate": 0.0003408512013791617, | |
| "loss": 3.3993, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 4.332149391884619, | |
| "grad_norm": 0.6519148945808411, | |
| "learning_rate": 0.00034052796034910024, | |
| "loss": 3.3753, | |
| "step": 40250 | |
| }, | |
| { | |
| "epoch": 4.337530943924228, | |
| "grad_norm": 0.6257440447807312, | |
| "learning_rate": 0.0003402047193190389, | |
| "loss": 3.3695, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 4.342912495963836, | |
| "grad_norm": 0.6440525054931641, | |
| "learning_rate": 0.0003398814782889775, | |
| "loss": 3.3875, | |
| "step": 40350 | |
| }, | |
| { | |
| "epoch": 4.348294048003444, | |
| "grad_norm": 0.6732410788536072, | |
| "learning_rate": 0.000339558237258916, | |
| "loss": 3.367, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 4.3536756000430525, | |
| "grad_norm": 0.6767770051956177, | |
| "learning_rate": 0.0003392349962288546, | |
| "loss": 3.3783, | |
| "step": 40450 | |
| }, | |
| { | |
| "epoch": 4.359057152082661, | |
| "grad_norm": 0.6681703329086304, | |
| "learning_rate": 0.0003389117551987932, | |
| "loss": 3.3883, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 4.364438704122269, | |
| "grad_norm": 0.6140384078025818, | |
| "learning_rate": 0.00033858851416873175, | |
| "loss": 3.3996, | |
| "step": 40550 | |
| }, | |
| { | |
| "epoch": 4.369820256161877, | |
| "grad_norm": 0.6840130686759949, | |
| "learning_rate": 0.0003382652731386704, | |
| "loss": 3.3791, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 4.375201808201485, | |
| "grad_norm": 0.6633166074752808, | |
| "learning_rate": 0.000337942032108609, | |
| "loss": 3.3833, | |
| "step": 40650 | |
| }, | |
| { | |
| "epoch": 4.3805833602410935, | |
| "grad_norm": 0.6062387824058533, | |
| "learning_rate": 0.00033761879107854754, | |
| "loss": 3.3969, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 4.385964912280702, | |
| "grad_norm": 0.6649901866912842, | |
| "learning_rate": 0.00033729555004848613, | |
| "loss": 3.3882, | |
| "step": 40750 | |
| }, | |
| { | |
| "epoch": 4.39134646432031, | |
| "grad_norm": 0.6902977824211121, | |
| "learning_rate": 0.00033697230901842467, | |
| "loss": 3.3906, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 4.396728016359918, | |
| "grad_norm": 0.6805328130722046, | |
| "learning_rate": 0.00033665553280896453, | |
| "loss": 3.3986, | |
| "step": 40850 | |
| }, | |
| { | |
| "epoch": 4.402109568399527, | |
| "grad_norm": 0.6568860411643982, | |
| "learning_rate": 0.00033633229177890307, | |
| "loss": 3.3903, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 4.4074911204391345, | |
| "grad_norm": 0.6838862895965576, | |
| "learning_rate": 0.00033600905074884167, | |
| "loss": 3.3801, | |
| "step": 40950 | |
| }, | |
| { | |
| "epoch": 4.412872672478743, | |
| "grad_norm": 0.6646157503128052, | |
| "learning_rate": 0.0003356858097187803, | |
| "loss": 3.3963, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 4.412872672478743, | |
| "eval_accuracy": 0.3778497615232092, | |
| "eval_loss": 3.4354708194732666, | |
| "eval_runtime": 184.3844, | |
| "eval_samples_per_second": 97.682, | |
| "eval_steps_per_second": 6.107, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 4.418254224518351, | |
| "grad_norm": 0.6933935284614563, | |
| "learning_rate": 0.00033536256868871886, | |
| "loss": 3.4114, | |
| "step": 41050 | |
| }, | |
| { | |
| "epoch": 4.423635776557959, | |
| "grad_norm": 0.721962571144104, | |
| "learning_rate": 0.00033503932765865745, | |
| "loss": 3.4128, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 4.429017328597568, | |
| "grad_norm": 0.6755461692810059, | |
| "learning_rate": 0.000334716086628596, | |
| "loss": 3.3924, | |
| "step": 41150 | |
| }, | |
| { | |
| "epoch": 4.4343988806371755, | |
| "grad_norm": 0.6381179094314575, | |
| "learning_rate": 0.0003343928455985346, | |
| "loss": 3.383, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 4.439780432676784, | |
| "grad_norm": 0.7221701145172119, | |
| "learning_rate": 0.00033406960456847324, | |
| "loss": 3.387, | |
| "step": 41250 | |
| }, | |
| { | |
| "epoch": 4.445161984716393, | |
| "grad_norm": 0.6238223910331726, | |
| "learning_rate": 0.0003337463635384118, | |
| "loss": 3.3925, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 4.450543536756, | |
| "grad_norm": 0.6577396392822266, | |
| "learning_rate": 0.00033342312250835037, | |
| "loss": 3.398, | |
| "step": 41350 | |
| }, | |
| { | |
| "epoch": 4.455925088795609, | |
| "grad_norm": 0.6841791272163391, | |
| "learning_rate": 0.00033309988147828896, | |
| "loss": 3.4063, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 4.461306640835216, | |
| "grad_norm": 0.6639859080314636, | |
| "learning_rate": 0.0003327766404482275, | |
| "loss": 3.4045, | |
| "step": 41450 | |
| }, | |
| { | |
| "epoch": 4.466688192874825, | |
| "grad_norm": 0.6243983507156372, | |
| "learning_rate": 0.0003324533994181661, | |
| "loss": 3.3933, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 4.4720697449144335, | |
| "grad_norm": 0.6617377400398254, | |
| "learning_rate": 0.00033213015838810475, | |
| "loss": 3.3984, | |
| "step": 41550 | |
| }, | |
| { | |
| "epoch": 4.477451296954041, | |
| "grad_norm": 0.6659533977508545, | |
| "learning_rate": 0.0003318069173580433, | |
| "loss": 3.389, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 4.48283284899365, | |
| "grad_norm": 0.6686351895332336, | |
| "learning_rate": 0.0003314836763279819, | |
| "loss": 3.3932, | |
| "step": 41650 | |
| }, | |
| { | |
| "epoch": 4.488214401033258, | |
| "grad_norm": 0.6653340458869934, | |
| "learning_rate": 0.0003311604352979204, | |
| "loss": 3.3835, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 4.493595953072866, | |
| "grad_norm": 0.7052305340766907, | |
| "learning_rate": 0.000330837194267859, | |
| "loss": 3.3838, | |
| "step": 41750 | |
| }, | |
| { | |
| "epoch": 4.4989775051124745, | |
| "grad_norm": 0.62774658203125, | |
| "learning_rate": 0.0003305139532377976, | |
| "loss": 3.3961, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 4.504359057152083, | |
| "grad_norm": 0.6671816110610962, | |
| "learning_rate": 0.0003301907122077362, | |
| "loss": 3.3838, | |
| "step": 41850 | |
| }, | |
| { | |
| "epoch": 4.509740609191691, | |
| "grad_norm": 0.6851453185081482, | |
| "learning_rate": 0.0003298674711776748, | |
| "loss": 3.3973, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 4.515122161231299, | |
| "grad_norm": 0.6708073019981384, | |
| "learning_rate": 0.0003295442301476134, | |
| "loss": 3.3784, | |
| "step": 41950 | |
| }, | |
| { | |
| "epoch": 4.520503713270907, | |
| "grad_norm": 0.6731506586074829, | |
| "learning_rate": 0.00032922098911755194, | |
| "loss": 3.3697, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 4.520503713270907, | |
| "eval_accuracy": 0.37830425655293715, | |
| "eval_loss": 3.4301443099975586, | |
| "eval_runtime": 184.382, | |
| "eval_samples_per_second": 97.683, | |
| "eval_steps_per_second": 6.107, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 4.5258852653105155, | |
| "grad_norm": 0.671728789806366, | |
| "learning_rate": 0.00032889774808749053, | |
| "loss": 3.3964, | |
| "step": 42050 | |
| }, | |
| { | |
| "epoch": 4.531266817350124, | |
| "grad_norm": 0.6927101016044617, | |
| "learning_rate": 0.0003285745070574292, | |
| "loss": 3.4106, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 4.536648369389732, | |
| "grad_norm": 0.6305346488952637, | |
| "learning_rate": 0.0003282512660273677, | |
| "loss": 3.3722, | |
| "step": 42150 | |
| }, | |
| { | |
| "epoch": 4.54202992142934, | |
| "grad_norm": 0.6529112458229065, | |
| "learning_rate": 0.0003279280249973063, | |
| "loss": 3.4013, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 4.547411473468949, | |
| "grad_norm": 0.6353604197502136, | |
| "learning_rate": 0.00032760478396724486, | |
| "loss": 3.3874, | |
| "step": 42250 | |
| }, | |
| { | |
| "epoch": 4.5527930255085565, | |
| "grad_norm": 0.6560521125793457, | |
| "learning_rate": 0.00032728154293718345, | |
| "loss": 3.3811, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 4.558174577548165, | |
| "grad_norm": 0.7130979299545288, | |
| "learning_rate": 0.00032696476672772326, | |
| "loss": 3.4099, | |
| "step": 42350 | |
| }, | |
| { | |
| "epoch": 4.563556129587774, | |
| "grad_norm": 0.6741908192634583, | |
| "learning_rate": 0.00032664152569766185, | |
| "loss": 3.3978, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 4.568937681627381, | |
| "grad_norm": 0.6519588232040405, | |
| "learning_rate": 0.0003263182846676004, | |
| "loss": 3.3865, | |
| "step": 42450 | |
| }, | |
| { | |
| "epoch": 4.57431923366699, | |
| "grad_norm": 0.6607858538627625, | |
| "learning_rate": 0.00032599504363753904, | |
| "loss": 3.3917, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 4.579700785706597, | |
| "grad_norm": 0.6284041404724121, | |
| "learning_rate": 0.00032567180260747764, | |
| "loss": 3.4014, | |
| "step": 42550 | |
| }, | |
| { | |
| "epoch": 4.585082337746206, | |
| "grad_norm": 0.7115404009819031, | |
| "learning_rate": 0.0003253485615774162, | |
| "loss": 3.3902, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 4.5904638897858145, | |
| "grad_norm": 0.722709596157074, | |
| "learning_rate": 0.00032502532054735477, | |
| "loss": 3.3849, | |
| "step": 42650 | |
| }, | |
| { | |
| "epoch": 4.595845441825422, | |
| "grad_norm": 0.6816657781600952, | |
| "learning_rate": 0.00032470207951729337, | |
| "loss": 3.3918, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 4.601226993865031, | |
| "grad_norm": 0.6426047682762146, | |
| "learning_rate": 0.0003243788384872319, | |
| "loss": 3.3803, | |
| "step": 42750 | |
| }, | |
| { | |
| "epoch": 4.606608545904638, | |
| "grad_norm": 0.6596842408180237, | |
| "learning_rate": 0.00032405559745717056, | |
| "loss": 3.3904, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 4.611990097944247, | |
| "grad_norm": 0.668118417263031, | |
| "learning_rate": 0.00032373235642710915, | |
| "loss": 3.3983, | |
| "step": 42850 | |
| }, | |
| { | |
| "epoch": 4.6173716499838555, | |
| "grad_norm": 0.6379391551017761, | |
| "learning_rate": 0.0003234091153970477, | |
| "loss": 3.4012, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 4.622753202023463, | |
| "grad_norm": 0.6547034978866577, | |
| "learning_rate": 0.0003230858743669863, | |
| "loss": 3.3822, | |
| "step": 42950 | |
| }, | |
| { | |
| "epoch": 4.628134754063072, | |
| "grad_norm": 0.6577107310295105, | |
| "learning_rate": 0.0003227626333369248, | |
| "loss": 3.3999, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 4.628134754063072, | |
| "eval_accuracy": 0.37914512125173344, | |
| "eval_loss": 3.4248054027557373, | |
| "eval_runtime": 184.3395, | |
| "eval_samples_per_second": 97.706, | |
| "eval_steps_per_second": 6.108, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 4.63351630610268, | |
| "grad_norm": 0.6792987585067749, | |
| "learning_rate": 0.0003224393923068635, | |
| "loss": 3.3997, | |
| "step": 43050 | |
| }, | |
| { | |
| "epoch": 4.638897858142288, | |
| "grad_norm": 0.6531506776809692, | |
| "learning_rate": 0.00032211615127680207, | |
| "loss": 3.3926, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 4.6442794101818965, | |
| "grad_norm": 0.7270342707633972, | |
| "learning_rate": 0.0003217929102467406, | |
| "loss": 3.3733, | |
| "step": 43150 | |
| }, | |
| { | |
| "epoch": 4.649660962221505, | |
| "grad_norm": 0.7006716728210449, | |
| "learning_rate": 0.0003214696692166792, | |
| "loss": 3.3878, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 4.655042514261113, | |
| "grad_norm": 0.6774951815605164, | |
| "learning_rate": 0.0003211464281866178, | |
| "loss": 3.4109, | |
| "step": 43250 | |
| }, | |
| { | |
| "epoch": 4.660424066300721, | |
| "grad_norm": 0.7012584805488586, | |
| "learning_rate": 0.00032082318715655634, | |
| "loss": 3.4, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 4.665805618340329, | |
| "grad_norm": 0.6669802665710449, | |
| "learning_rate": 0.000320499946126495, | |
| "loss": 3.4022, | |
| "step": 43350 | |
| }, | |
| { | |
| "epoch": 4.6711871703799375, | |
| "grad_norm": 0.6756395697593689, | |
| "learning_rate": 0.0003201767050964336, | |
| "loss": 3.3843, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 4.676568722419546, | |
| "grad_norm": 0.6468850374221802, | |
| "learning_rate": 0.0003198534640663721, | |
| "loss": 3.3928, | |
| "step": 43450 | |
| }, | |
| { | |
| "epoch": 4.681950274459154, | |
| "grad_norm": 0.677679717540741, | |
| "learning_rate": 0.0003195302230363107, | |
| "loss": 3.3808, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 4.687331826498762, | |
| "grad_norm": 0.6484048962593079, | |
| "learning_rate": 0.00031920698200624926, | |
| "loss": 3.3898, | |
| "step": 43550 | |
| }, | |
| { | |
| "epoch": 4.692713378538371, | |
| "grad_norm": 0.6454549431800842, | |
| "learning_rate": 0.00031888374097618785, | |
| "loss": 3.3837, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 4.6980949305779784, | |
| "grad_norm": 0.6815332770347595, | |
| "learning_rate": 0.0003185604999461265, | |
| "loss": 3.3866, | |
| "step": 43650 | |
| }, | |
| { | |
| "epoch": 4.703476482617587, | |
| "grad_norm": 0.6679681539535522, | |
| "learning_rate": 0.00031823725891606504, | |
| "loss": 3.3745, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 4.7088580346571955, | |
| "grad_norm": 0.6820075511932373, | |
| "learning_rate": 0.00031791401788600364, | |
| "loss": 3.396, | |
| "step": 43750 | |
| }, | |
| { | |
| "epoch": 4.714239586696803, | |
| "grad_norm": 0.6664008498191833, | |
| "learning_rate": 0.00031759077685594223, | |
| "loss": 3.3971, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 4.719621138736412, | |
| "grad_norm": 0.6687772274017334, | |
| "learning_rate": 0.00031726753582588077, | |
| "loss": 3.3814, | |
| "step": 43850 | |
| }, | |
| { | |
| "epoch": 4.725002690776019, | |
| "grad_norm": 0.6516989469528198, | |
| "learning_rate": 0.0003169442947958194, | |
| "loss": 3.3893, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 4.730384242815628, | |
| "grad_norm": 0.6796718239784241, | |
| "learning_rate": 0.000316621053765758, | |
| "loss": 3.395, | |
| "step": 43950 | |
| }, | |
| { | |
| "epoch": 4.7357657948552365, | |
| "grad_norm": 0.6910555958747864, | |
| "learning_rate": 0.00031629781273569656, | |
| "loss": 3.3977, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 4.7357657948552365, | |
| "eval_accuracy": 0.379433486015993, | |
| "eval_loss": 3.4196970462799072, | |
| "eval_runtime": 184.5952, | |
| "eval_samples_per_second": 97.57, | |
| "eval_steps_per_second": 6.1, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 4.741147346894844, | |
| "grad_norm": 0.6522223949432373, | |
| "learning_rate": 0.00031597457170563515, | |
| "loss": 3.3942, | |
| "step": 44050 | |
| }, | |
| { | |
| "epoch": 4.746528898934453, | |
| "grad_norm": 0.6282691955566406, | |
| "learning_rate": 0.0003156513306755737, | |
| "loss": 3.3846, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 4.751910450974061, | |
| "grad_norm": 0.6662289500236511, | |
| "learning_rate": 0.0003153280896455123, | |
| "loss": 3.3674, | |
| "step": 44150 | |
| }, | |
| { | |
| "epoch": 4.757292003013669, | |
| "grad_norm": 0.811521589756012, | |
| "learning_rate": 0.00031500484861545094, | |
| "loss": 3.3871, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 4.7626735550532775, | |
| "grad_norm": 0.7855165600776672, | |
| "learning_rate": 0.0003146816075853895, | |
| "loss": 3.4034, | |
| "step": 44250 | |
| }, | |
| { | |
| "epoch": 4.768055107092886, | |
| "grad_norm": 0.6359848380088806, | |
| "learning_rate": 0.00031435836655532807, | |
| "loss": 3.3817, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 4.773436659132494, | |
| "grad_norm": 0.7628109455108643, | |
| "learning_rate": 0.00031403512552526667, | |
| "loss": 3.3712, | |
| "step": 44350 | |
| }, | |
| { | |
| "epoch": 4.778818211172102, | |
| "grad_norm": 0.6852911710739136, | |
| "learning_rate": 0.0003137118844952052, | |
| "loss": 3.3905, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 4.78419976321171, | |
| "grad_norm": 0.696208655834198, | |
| "learning_rate": 0.0003133886434651438, | |
| "loss": 3.3841, | |
| "step": 44450 | |
| }, | |
| { | |
| "epoch": 4.7895813152513185, | |
| "grad_norm": 0.6593944430351257, | |
| "learning_rate": 0.00031306540243508245, | |
| "loss": 3.3912, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 4.794962867290927, | |
| "grad_norm": 0.6796725988388062, | |
| "learning_rate": 0.000312742161405021, | |
| "loss": 3.3688, | |
| "step": 44550 | |
| }, | |
| { | |
| "epoch": 4.800344419330535, | |
| "grad_norm": 0.7254936695098877, | |
| "learning_rate": 0.0003124189203749596, | |
| "loss": 3.3725, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 4.805725971370143, | |
| "grad_norm": 0.6678683757781982, | |
| "learning_rate": 0.0003120956793448981, | |
| "loss": 3.4062, | |
| "step": 44650 | |
| }, | |
| { | |
| "epoch": 4.811107523409751, | |
| "grad_norm": 0.7855321764945984, | |
| "learning_rate": 0.0003117724383148367, | |
| "loss": 3.3827, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 4.8164890754493594, | |
| "grad_norm": 0.6922734975814819, | |
| "learning_rate": 0.00031144919728477526, | |
| "loss": 3.3846, | |
| "step": 44750 | |
| }, | |
| { | |
| "epoch": 4.821870627488968, | |
| "grad_norm": 0.6983393430709839, | |
| "learning_rate": 0.0003111259562547139, | |
| "loss": 3.3818, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 4.827252179528576, | |
| "grad_norm": 0.6777713298797607, | |
| "learning_rate": 0.0003108027152246525, | |
| "loss": 3.38, | |
| "step": 44850 | |
| }, | |
| { | |
| "epoch": 4.832633731568184, | |
| "grad_norm": 0.6608052253723145, | |
| "learning_rate": 0.0003104859390151923, | |
| "loss": 3.3875, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 4.838015283607793, | |
| "grad_norm": 0.6794652342796326, | |
| "learning_rate": 0.0003101626979851309, | |
| "loss": 3.3907, | |
| "step": 44950 | |
| }, | |
| { | |
| "epoch": 4.8433968356474, | |
| "grad_norm": 0.682873010635376, | |
| "learning_rate": 0.00030983945695506945, | |
| "loss": 3.3768, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 4.8433968356474, | |
| "eval_accuracy": 0.3800040223299069, | |
| "eval_loss": 3.415503978729248, | |
| "eval_runtime": 184.4239, | |
| "eval_samples_per_second": 97.661, | |
| "eval_steps_per_second": 6.105, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 4.848778387687009, | |
| "grad_norm": 0.6700130701065063, | |
| "learning_rate": 0.00030951621592500804, | |
| "loss": 3.3961, | |
| "step": 45050 | |
| }, | |
| { | |
| "epoch": 4.8541599397266175, | |
| "grad_norm": 0.6327756643295288, | |
| "learning_rate": 0.00030919297489494663, | |
| "loss": 3.3819, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 4.859541491766225, | |
| "grad_norm": 0.652381956577301, | |
| "learning_rate": 0.00030886973386488523, | |
| "loss": 3.3899, | |
| "step": 45150 | |
| }, | |
| { | |
| "epoch": 4.864923043805834, | |
| "grad_norm": 0.6288101077079773, | |
| "learning_rate": 0.0003085464928348238, | |
| "loss": 3.395, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 4.870304595845441, | |
| "grad_norm": 0.6799596548080444, | |
| "learning_rate": 0.0003082232518047624, | |
| "loss": 3.389, | |
| "step": 45250 | |
| }, | |
| { | |
| "epoch": 4.87568614788505, | |
| "grad_norm": 0.7153991460800171, | |
| "learning_rate": 0.00030790001077470096, | |
| "loss": 3.3965, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 4.8810676999246585, | |
| "grad_norm": 0.6825360655784607, | |
| "learning_rate": 0.00030757676974463955, | |
| "loss": 3.3976, | |
| "step": 45350 | |
| }, | |
| { | |
| "epoch": 4.886449251964266, | |
| "grad_norm": 0.6581000685691833, | |
| "learning_rate": 0.0003072535287145781, | |
| "loss": 3.3977, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 4.891830804003875, | |
| "grad_norm": 0.6416078805923462, | |
| "learning_rate": 0.00030693028768451674, | |
| "loss": 3.3901, | |
| "step": 45450 | |
| }, | |
| { | |
| "epoch": 4.897212356043483, | |
| "grad_norm": 0.6281269788742065, | |
| "learning_rate": 0.00030660704665445534, | |
| "loss": 3.3916, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 4.902593908083091, | |
| "grad_norm": 0.6486085057258606, | |
| "learning_rate": 0.0003062838056243939, | |
| "loss": 3.3974, | |
| "step": 45550 | |
| }, | |
| { | |
| "epoch": 4.9079754601226995, | |
| "grad_norm": 0.6486194133758545, | |
| "learning_rate": 0.00030596056459433247, | |
| "loss": 3.3997, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 4.913357012162308, | |
| "grad_norm": 0.7123668789863586, | |
| "learning_rate": 0.00030563732356427107, | |
| "loss": 3.3929, | |
| "step": 45650 | |
| }, | |
| { | |
| "epoch": 4.918738564201916, | |
| "grad_norm": 0.6686903834342957, | |
| "learning_rate": 0.00030531408253420966, | |
| "loss": 3.3778, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 4.924120116241524, | |
| "grad_norm": 0.7370855212211609, | |
| "learning_rate": 0.00030499084150414826, | |
| "loss": 3.3901, | |
| "step": 45750 | |
| }, | |
| { | |
| "epoch": 4.929501668281132, | |
| "grad_norm": 0.6912970542907715, | |
| "learning_rate": 0.00030466760047408685, | |
| "loss": 3.3995, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 4.9348832203207404, | |
| "grad_norm": 0.6770774722099304, | |
| "learning_rate": 0.0003043443594440254, | |
| "loss": 3.4027, | |
| "step": 45850 | |
| }, | |
| { | |
| "epoch": 4.940264772360349, | |
| "grad_norm": 0.683289110660553, | |
| "learning_rate": 0.000304021118413964, | |
| "loss": 3.4135, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 4.945646324399957, | |
| "grad_norm": 0.676683247089386, | |
| "learning_rate": 0.0003036978773839025, | |
| "loss": 3.4119, | |
| "step": 45950 | |
| }, | |
| { | |
| "epoch": 4.951027876439565, | |
| "grad_norm": 0.6402007937431335, | |
| "learning_rate": 0.0003033746363538412, | |
| "loss": 3.3817, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 4.951027876439565, | |
| "eval_accuracy": 0.3802043782550368, | |
| "eval_loss": 3.4119012355804443, | |
| "eval_runtime": 184.8341, | |
| "eval_samples_per_second": 97.444, | |
| "eval_steps_per_second": 6.092, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 4.956409428479174, | |
| "grad_norm": 0.6698101758956909, | |
| "learning_rate": 0.00030305139532377977, | |
| "loss": 3.381, | |
| "step": 46050 | |
| }, | |
| { | |
| "epoch": 4.961790980518781, | |
| "grad_norm": 0.6488439440727234, | |
| "learning_rate": 0.0003027281542937183, | |
| "loss": 3.3643, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 4.96717253255839, | |
| "grad_norm": 0.6902374029159546, | |
| "learning_rate": 0.0003024049132636569, | |
| "loss": 3.3951, | |
| "step": 46150 | |
| }, | |
| { | |
| "epoch": 4.9725540845979985, | |
| "grad_norm": 0.662213146686554, | |
| "learning_rate": 0.0003020816722335955, | |
| "loss": 3.393, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 4.977935636637606, | |
| "grad_norm": 0.6803269982337952, | |
| "learning_rate": 0.00030175843120353404, | |
| "loss": 3.3851, | |
| "step": 46250 | |
| }, | |
| { | |
| "epoch": 4.983317188677215, | |
| "grad_norm": 0.6812626719474792, | |
| "learning_rate": 0.00030144165499407385, | |
| "loss": 3.3742, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 4.988698740716822, | |
| "grad_norm": 0.6587677001953125, | |
| "learning_rate": 0.00030111841396401244, | |
| "loss": 3.3903, | |
| "step": 46350 | |
| }, | |
| { | |
| "epoch": 4.994080292756431, | |
| "grad_norm": 0.6358218789100647, | |
| "learning_rate": 0.0003007951729339511, | |
| "loss": 3.3916, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 4.9994618447960395, | |
| "grad_norm": 0.7453263998031616, | |
| "learning_rate": 0.00030047193190388963, | |
| "loss": 3.3938, | |
| "step": 46450 | |
| }, | |
| { | |
| "epoch": 5.004843396835647, | |
| "grad_norm": 0.6780865788459778, | |
| "learning_rate": 0.0003001486908738282, | |
| "loss": 3.3087, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 5.010224948875256, | |
| "grad_norm": 0.6883599162101746, | |
| "learning_rate": 0.0002998254498437668, | |
| "loss": 3.2893, | |
| "step": 46550 | |
| }, | |
| { | |
| "epoch": 5.015606500914864, | |
| "grad_norm": 0.6944405436515808, | |
| "learning_rate": 0.0002995022088137054, | |
| "loss": 3.2771, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 5.020988052954472, | |
| "grad_norm": 0.683914303779602, | |
| "learning_rate": 0.00029917896778364396, | |
| "loss": 3.3025, | |
| "step": 46650 | |
| }, | |
| { | |
| "epoch": 5.0263696049940805, | |
| "grad_norm": 0.6876422166824341, | |
| "learning_rate": 0.00029885572675358255, | |
| "loss": 3.3214, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 5.031751157033688, | |
| "grad_norm": 0.8835436701774597, | |
| "learning_rate": 0.00029853248572352114, | |
| "loss": 3.2975, | |
| "step": 46750 | |
| }, | |
| { | |
| "epoch": 5.037132709073297, | |
| "grad_norm": 0.6763400435447693, | |
| "learning_rate": 0.00029820924469345974, | |
| "loss": 3.302, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 5.042514261112905, | |
| "grad_norm": 0.6604331731796265, | |
| "learning_rate": 0.0002978860036633983, | |
| "loss": 3.2919, | |
| "step": 46850 | |
| }, | |
| { | |
| "epoch": 5.047895813152513, | |
| "grad_norm": 0.7357029318809509, | |
| "learning_rate": 0.00029756276263333693, | |
| "loss": 3.2974, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 5.0532773651921215, | |
| "grad_norm": 0.6902467012405396, | |
| "learning_rate": 0.00029723952160327547, | |
| "loss": 3.3091, | |
| "step": 46950 | |
| }, | |
| { | |
| "epoch": 5.05865891723173, | |
| "grad_norm": 0.6801806688308716, | |
| "learning_rate": 0.00029691628057321406, | |
| "loss": 3.3069, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 5.05865891723173, | |
| "eval_accuracy": 0.38042059750178764, | |
| "eval_loss": 3.413647413253784, | |
| "eval_runtime": 184.2622, | |
| "eval_samples_per_second": 97.747, | |
| "eval_steps_per_second": 6.111, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 5.064040469271338, | |
| "grad_norm": 0.6707634329795837, | |
| "learning_rate": 0.00029659303954315266, | |
| "loss": 3.3099, | |
| "step": 47050 | |
| }, | |
| { | |
| "epoch": 5.069422021310946, | |
| "grad_norm": 0.7099149823188782, | |
| "learning_rate": 0.00029626979851309125, | |
| "loss": 3.3124, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 5.074803573350554, | |
| "grad_norm": 0.7230901122093201, | |
| "learning_rate": 0.00029594655748302985, | |
| "loss": 3.2958, | |
| "step": 47150 | |
| }, | |
| { | |
| "epoch": 5.080185125390162, | |
| "grad_norm": 0.7420896887779236, | |
| "learning_rate": 0.0002956233164529684, | |
| "loss": 3.318, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 5.085566677429771, | |
| "grad_norm": 0.6557525396347046, | |
| "learning_rate": 0.000295300075422907, | |
| "loss": 3.3007, | |
| "step": 47250 | |
| }, | |
| { | |
| "epoch": 5.090948229469379, | |
| "grad_norm": 0.6699824929237366, | |
| "learning_rate": 0.0002949768343928456, | |
| "loss": 3.3163, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 5.096329781508987, | |
| "grad_norm": 0.6804993152618408, | |
| "learning_rate": 0.00029465359336278417, | |
| "loss": 3.3055, | |
| "step": 47350 | |
| }, | |
| { | |
| "epoch": 5.101711333548596, | |
| "grad_norm": 0.7249795794487, | |
| "learning_rate": 0.0002943303523327227, | |
| "loss": 3.3088, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 5.107092885588203, | |
| "grad_norm": 0.7169092893600464, | |
| "learning_rate": 0.00029400711130266136, | |
| "loss": 3.3096, | |
| "step": 47450 | |
| }, | |
| { | |
| "epoch": 5.112474437627812, | |
| "grad_norm": 0.723460853099823, | |
| "learning_rate": 0.0002936838702725999, | |
| "loss": 3.3081, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 5.1178559896674205, | |
| "grad_norm": 0.8058868050575256, | |
| "learning_rate": 0.0002933606292425385, | |
| "loss": 3.2892, | |
| "step": 47550 | |
| }, | |
| { | |
| "epoch": 5.123237541707028, | |
| "grad_norm": 0.7276614308357239, | |
| "learning_rate": 0.0002930373882124771, | |
| "loss": 3.3306, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 5.128619093746637, | |
| "grad_norm": 0.6905969381332397, | |
| "learning_rate": 0.0002927141471824157, | |
| "loss": 3.3312, | |
| "step": 47650 | |
| }, | |
| { | |
| "epoch": 5.134000645786244, | |
| "grad_norm": 0.720950186252594, | |
| "learning_rate": 0.0002923909061523542, | |
| "loss": 3.3273, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 5.139382197825853, | |
| "grad_norm": 0.6645218133926392, | |
| "learning_rate": 0.0002920676651222928, | |
| "loss": 3.3218, | |
| "step": 47750 | |
| }, | |
| { | |
| "epoch": 5.1447637498654615, | |
| "grad_norm": 0.7677816152572632, | |
| "learning_rate": 0.0002917444240922314, | |
| "loss": 3.3128, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 5.150145301905069, | |
| "grad_norm": 0.6742658019065857, | |
| "learning_rate": 0.00029142118306216996, | |
| "loss": 3.3294, | |
| "step": 47850 | |
| }, | |
| { | |
| "epoch": 5.155526853944678, | |
| "grad_norm": 0.7500579357147217, | |
| "learning_rate": 0.0002910979420321086, | |
| "loss": 3.3281, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 5.160908405984286, | |
| "grad_norm": 0.6635573506355286, | |
| "learning_rate": 0.00029077470100204715, | |
| "loss": 3.3289, | |
| "step": 47950 | |
| }, | |
| { | |
| "epoch": 5.166289958023894, | |
| "grad_norm": 0.7364498376846313, | |
| "learning_rate": 0.00029045145997198574, | |
| "loss": 3.2946, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 5.166289958023894, | |
| "eval_accuracy": 0.3809652744284016, | |
| "eval_loss": 3.4089674949645996, | |
| "eval_runtime": 184.5255, | |
| "eval_samples_per_second": 97.607, | |
| "eval_steps_per_second": 6.102, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 5.1716715100635025, | |
| "grad_norm": 0.673941433429718, | |
| "learning_rate": 0.00029012821894192433, | |
| "loss": 3.3105, | |
| "step": 48050 | |
| }, | |
| { | |
| "epoch": 5.17705306210311, | |
| "grad_norm": 0.6794301867485046, | |
| "learning_rate": 0.00028980497791186293, | |
| "loss": 3.2917, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 5.182434614142719, | |
| "grad_norm": 0.6894687414169312, | |
| "learning_rate": 0.0002894817368818015, | |
| "loss": 3.318, | |
| "step": 48150 | |
| }, | |
| { | |
| "epoch": 5.187816166182327, | |
| "grad_norm": 0.6699786186218262, | |
| "learning_rate": 0.00028915849585174006, | |
| "loss": 3.3131, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 5.193197718221935, | |
| "grad_norm": 0.6765483617782593, | |
| "learning_rate": 0.00028883525482167866, | |
| "loss": 3.3264, | |
| "step": 48250 | |
| }, | |
| { | |
| "epoch": 5.198579270261543, | |
| "grad_norm": 0.7597613334655762, | |
| "learning_rate": 0.00028851201379161725, | |
| "loss": 3.3273, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 5.203960822301152, | |
| "grad_norm": 0.6809649467468262, | |
| "learning_rate": 0.00028818877276155585, | |
| "loss": 3.3295, | |
| "step": 48350 | |
| }, | |
| { | |
| "epoch": 5.20934237434076, | |
| "grad_norm": 0.6863871216773987, | |
| "learning_rate": 0.0002878655317314944, | |
| "loss": 3.3336, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 5.214723926380368, | |
| "grad_norm": 0.7177680134773254, | |
| "learning_rate": 0.00028754229070143304, | |
| "loss": 3.3289, | |
| "step": 48450 | |
| }, | |
| { | |
| "epoch": 5.220105478419977, | |
| "grad_norm": 0.680456817150116, | |
| "learning_rate": 0.0002872190496713716, | |
| "loss": 3.3252, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 5.225487030459584, | |
| "grad_norm": 0.6903363466262817, | |
| "learning_rate": 0.0002868958086413102, | |
| "loss": 3.3231, | |
| "step": 48550 | |
| }, | |
| { | |
| "epoch": 5.230868582499193, | |
| "grad_norm": 0.6907674074172974, | |
| "learning_rate": 0.00028657256761124877, | |
| "loss": 3.3412, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 5.236250134538801, | |
| "grad_norm": 0.6623774170875549, | |
| "learning_rate": 0.00028624932658118736, | |
| "loss": 3.341, | |
| "step": 48650 | |
| }, | |
| { | |
| "epoch": 5.241631686578409, | |
| "grad_norm": 0.6799823641777039, | |
| "learning_rate": 0.0002859260855511259, | |
| "loss": 3.3378, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 5.247013238618018, | |
| "grad_norm": 0.6796805262565613, | |
| "learning_rate": 0.0002856028445210645, | |
| "loss": 3.327, | |
| "step": 48750 | |
| }, | |
| { | |
| "epoch": 5.252394790657625, | |
| "grad_norm": 0.6830520033836365, | |
| "learning_rate": 0.0002852796034910031, | |
| "loss": 3.3288, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 5.257776342697234, | |
| "grad_norm": 0.6735912561416626, | |
| "learning_rate": 0.0002849563624609417, | |
| "loss": 3.3375, | |
| "step": 48850 | |
| }, | |
| { | |
| "epoch": 5.2631578947368425, | |
| "grad_norm": 0.7199484705924988, | |
| "learning_rate": 0.0002846331214308803, | |
| "loss": 3.2972, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 5.26853944677645, | |
| "grad_norm": 0.7499407529830933, | |
| "learning_rate": 0.0002843098804008188, | |
| "loss": 3.3332, | |
| "step": 48950 | |
| }, | |
| { | |
| "epoch": 5.273920998816059, | |
| "grad_norm": 0.6806166768074036, | |
| "learning_rate": 0.00028398663937075747, | |
| "loss": 3.3324, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 5.273920998816059, | |
| "eval_accuracy": 0.3814164012186073, | |
| "eval_loss": 3.4085781574249268, | |
| "eval_runtime": 184.4159, | |
| "eval_samples_per_second": 97.665, | |
| "eval_steps_per_second": 6.106, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 5.279302550855666, | |
| "grad_norm": 0.7061271667480469, | |
| "learning_rate": 0.000283663398340696, | |
| "loss": 3.3311, | |
| "step": 49050 | |
| }, | |
| { | |
| "epoch": 5.284684102895275, | |
| "grad_norm": 0.7735579609870911, | |
| "learning_rate": 0.0002833401573106346, | |
| "loss": 3.329, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 5.2900656549348835, | |
| "grad_norm": 0.7523805499076843, | |
| "learning_rate": 0.0002830169162805732, | |
| "loss": 3.3216, | |
| "step": 49150 | |
| }, | |
| { | |
| "epoch": 5.295447206974491, | |
| "grad_norm": 0.7363016605377197, | |
| "learning_rate": 0.0002826936752505118, | |
| "loss": 3.3267, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 5.3008287590141, | |
| "grad_norm": 0.6788080334663391, | |
| "learning_rate": 0.00028237043422045034, | |
| "loss": 3.3243, | |
| "step": 49250 | |
| }, | |
| { | |
| "epoch": 5.306210311053708, | |
| "grad_norm": 0.6680963039398193, | |
| "learning_rate": 0.00028204719319038893, | |
| "loss": 3.3386, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 5.311591863093316, | |
| "grad_norm": 0.6916529536247253, | |
| "learning_rate": 0.0002817239521603275, | |
| "loss": 3.3257, | |
| "step": 49350 | |
| }, | |
| { | |
| "epoch": 5.316973415132924, | |
| "grad_norm": 0.6715468764305115, | |
| "learning_rate": 0.0002814007111302661, | |
| "loss": 3.3161, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 5.322354967172533, | |
| "grad_norm": 0.6650114059448242, | |
| "learning_rate": 0.0002810774701002047, | |
| "loss": 3.3352, | |
| "step": 49450 | |
| }, | |
| { | |
| "epoch": 5.327736519212141, | |
| "grad_norm": 0.6942517757415771, | |
| "learning_rate": 0.00028075422907014325, | |
| "loss": 3.3284, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 5.333118071251749, | |
| "grad_norm": 0.7156750559806824, | |
| "learning_rate": 0.00028043098804008185, | |
| "loss": 3.3411, | |
| "step": 49550 | |
| }, | |
| { | |
| "epoch": 5.338499623291357, | |
| "grad_norm": 0.6560901999473572, | |
| "learning_rate": 0.00028010774701002044, | |
| "loss": 3.3299, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 5.343881175330965, | |
| "grad_norm": 0.6979596018791199, | |
| "learning_rate": 0.00027978450597995904, | |
| "loss": 3.3344, | |
| "step": 49650 | |
| }, | |
| { | |
| "epoch": 5.349262727370574, | |
| "grad_norm": 0.7661385536193848, | |
| "learning_rate": 0.0002794612649498976, | |
| "loss": 3.3308, | |
| "step": 49700 | |
| }, | |
| { | |
| "epoch": 5.354644279410182, | |
| "grad_norm": 0.6681206226348877, | |
| "learning_rate": 0.00027913802391983623, | |
| "loss": 3.3102, | |
| "step": 49750 | |
| }, | |
| { | |
| "epoch": 5.36002583144979, | |
| "grad_norm": 0.7179029583930969, | |
| "learning_rate": 0.00027881478288977477, | |
| "loss": 3.333, | |
| "step": 49800 | |
| }, | |
| { | |
| "epoch": 5.365407383489399, | |
| "grad_norm": 0.6773339509963989, | |
| "learning_rate": 0.00027849154185971336, | |
| "loss": 3.3383, | |
| "step": 49850 | |
| }, | |
| { | |
| "epoch": 5.370788935529006, | |
| "grad_norm": 0.7313709259033203, | |
| "learning_rate": 0.00027816830082965196, | |
| "loss": 3.339, | |
| "step": 49900 | |
| }, | |
| { | |
| "epoch": 5.376170487568615, | |
| "grad_norm": 0.7196881175041199, | |
| "learning_rate": 0.00027784505979959055, | |
| "loss": 3.3444, | |
| "step": 49950 | |
| }, | |
| { | |
| "epoch": 5.3815520396082235, | |
| "grad_norm": 0.784724235534668, | |
| "learning_rate": 0.00027752181876952915, | |
| "loss": 3.3176, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 5.3815520396082235, | |
| "eval_accuracy": 0.3815070177270446, | |
| "eval_loss": 3.4042985439300537, | |
| "eval_runtime": 184.3894, | |
| "eval_samples_per_second": 97.679, | |
| "eval_steps_per_second": 6.107, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 5.386933591647831, | |
| "grad_norm": 0.6928351521492004, | |
| "learning_rate": 0.0002771985777394677, | |
| "loss": 3.3233, | |
| "step": 50050 | |
| }, | |
| { | |
| "epoch": 5.39231514368744, | |
| "grad_norm": 0.7170709371566772, | |
| "learning_rate": 0.0002768753367094063, | |
| "loss": 3.346, | |
| "step": 50100 | |
| }, | |
| { | |
| "epoch": 5.397696695727047, | |
| "grad_norm": 0.7356758117675781, | |
| "learning_rate": 0.0002765520956793449, | |
| "loss": 3.2966, | |
| "step": 50150 | |
| }, | |
| { | |
| "epoch": 5.403078247766656, | |
| "grad_norm": 0.7231123447418213, | |
| "learning_rate": 0.00027622885464928347, | |
| "loss": 3.3227, | |
| "step": 50200 | |
| }, | |
| { | |
| "epoch": 5.4084597998062645, | |
| "grad_norm": 0.6928778886795044, | |
| "learning_rate": 0.000275905613619222, | |
| "loss": 3.3545, | |
| "step": 50250 | |
| }, | |
| { | |
| "epoch": 5.413841351845872, | |
| "grad_norm": 0.6765695214271545, | |
| "learning_rate": 0.00027558237258916066, | |
| "loss": 3.3454, | |
| "step": 50300 | |
| }, | |
| { | |
| "epoch": 5.419222903885481, | |
| "grad_norm": 0.7140141129493713, | |
| "learning_rate": 0.0002752591315590992, | |
| "loss": 3.319, | |
| "step": 50350 | |
| }, | |
| { | |
| "epoch": 5.424604455925088, | |
| "grad_norm": 0.7007126212120056, | |
| "learning_rate": 0.0002749358905290378, | |
| "loss": 3.3465, | |
| "step": 50400 | |
| }, | |
| { | |
| "epoch": 5.429986007964697, | |
| "grad_norm": 0.7254543900489807, | |
| "learning_rate": 0.0002746126494989764, | |
| "loss": 3.3339, | |
| "step": 50450 | |
| }, | |
| { | |
| "epoch": 5.435367560004305, | |
| "grad_norm": 0.6623748540878296, | |
| "learning_rate": 0.000274289408468915, | |
| "loss": 3.3197, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 5.440749112043913, | |
| "grad_norm": 0.7220912575721741, | |
| "learning_rate": 0.0002739661674388535, | |
| "loss": 3.3158, | |
| "step": 50550 | |
| }, | |
| { | |
| "epoch": 5.446130664083522, | |
| "grad_norm": 0.752842128276825, | |
| "learning_rate": 0.0002736429264087921, | |
| "loss": 3.3559, | |
| "step": 50600 | |
| }, | |
| { | |
| "epoch": 5.45151221612313, | |
| "grad_norm": 0.7269429564476013, | |
| "learning_rate": 0.0002733196853787307, | |
| "loss": 3.3266, | |
| "step": 50650 | |
| }, | |
| { | |
| "epoch": 5.456893768162738, | |
| "grad_norm": 0.6927263140678406, | |
| "learning_rate": 0.0002729964443486693, | |
| "loss": 3.3375, | |
| "step": 50700 | |
| }, | |
| { | |
| "epoch": 5.462275320202346, | |
| "grad_norm": 0.7431906461715698, | |
| "learning_rate": 0.0002726732033186079, | |
| "loss": 3.3347, | |
| "step": 50750 | |
| }, | |
| { | |
| "epoch": 5.467656872241955, | |
| "grad_norm": 0.7084429264068604, | |
| "learning_rate": 0.00027234996228854644, | |
| "loss": 3.3443, | |
| "step": 50800 | |
| }, | |
| { | |
| "epoch": 5.473038424281563, | |
| "grad_norm": 0.6668190956115723, | |
| "learning_rate": 0.0002720267212584851, | |
| "loss": 3.3278, | |
| "step": 50850 | |
| }, | |
| { | |
| "epoch": 5.478419976321171, | |
| "grad_norm": 0.6932339668273926, | |
| "learning_rate": 0.00027170348022842363, | |
| "loss": 3.3292, | |
| "step": 50900 | |
| }, | |
| { | |
| "epoch": 5.483801528360779, | |
| "grad_norm": 0.7017044425010681, | |
| "learning_rate": 0.00027138023919836223, | |
| "loss": 3.3141, | |
| "step": 50950 | |
| }, | |
| { | |
| "epoch": 5.489183080400387, | |
| "grad_norm": 0.681545078754425, | |
| "learning_rate": 0.0002710569981683008, | |
| "loss": 3.336, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 5.489183080400387, | |
| "eval_accuracy": 0.3822459660170709, | |
| "eval_loss": 3.3982532024383545, | |
| "eval_runtime": 184.4528, | |
| "eval_samples_per_second": 97.646, | |
| "eval_steps_per_second": 6.105, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 5.494564632439996, | |
| "grad_norm": 0.7238340377807617, | |
| "learning_rate": 0.0002707337571382394, | |
| "loss": 3.3537, | |
| "step": 51050 | |
| }, | |
| { | |
| "epoch": 5.499946184479604, | |
| "grad_norm": 0.6586156487464905, | |
| "learning_rate": 0.00027041051610817796, | |
| "loss": 3.3475, | |
| "step": 51100 | |
| }, | |
| { | |
| "epoch": 5.505327736519212, | |
| "grad_norm": 0.7239208817481995, | |
| "learning_rate": 0.00027009373989871776, | |
| "loss": 3.3281, | |
| "step": 51150 | |
| }, | |
| { | |
| "epoch": 5.510709288558821, | |
| "grad_norm": 0.7013527750968933, | |
| "learning_rate": 0.00026977049886865636, | |
| "loss": 3.3309, | |
| "step": 51200 | |
| }, | |
| { | |
| "epoch": 5.516090840598428, | |
| "grad_norm": 0.7006890773773193, | |
| "learning_rate": 0.00026944725783859495, | |
| "loss": 3.3243, | |
| "step": 51250 | |
| }, | |
| { | |
| "epoch": 5.521472392638037, | |
| "grad_norm": 0.7042725682258606, | |
| "learning_rate": 0.00026912401680853355, | |
| "loss": 3.3298, | |
| "step": 51300 | |
| }, | |
| { | |
| "epoch": 5.5268539446776455, | |
| "grad_norm": 0.6982862949371338, | |
| "learning_rate": 0.0002688007757784721, | |
| "loss": 3.3553, | |
| "step": 51350 | |
| }, | |
| { | |
| "epoch": 5.532235496717253, | |
| "grad_norm": 0.7178354859352112, | |
| "learning_rate": 0.00026847753474841074, | |
| "loss": 3.3368, | |
| "step": 51400 | |
| }, | |
| { | |
| "epoch": 5.537617048756862, | |
| "grad_norm": 0.7167358994483948, | |
| "learning_rate": 0.0002681542937183493, | |
| "loss": 3.3238, | |
| "step": 51450 | |
| }, | |
| { | |
| "epoch": 5.542998600796469, | |
| "grad_norm": 0.7084592580795288, | |
| "learning_rate": 0.0002678310526882879, | |
| "loss": 3.3357, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 5.548380152836078, | |
| "grad_norm": 0.6898399591445923, | |
| "learning_rate": 0.00026750781165822647, | |
| "loss": 3.3362, | |
| "step": 51550 | |
| }, | |
| { | |
| "epoch": 5.553761704875686, | |
| "grad_norm": 0.6914436221122742, | |
| "learning_rate": 0.0002671910354487663, | |
| "loss": 3.3538, | |
| "step": 51600 | |
| }, | |
| { | |
| "epoch": 5.559143256915294, | |
| "grad_norm": 0.6864462494850159, | |
| "learning_rate": 0.00026686779441870487, | |
| "loss": 3.3335, | |
| "step": 51650 | |
| }, | |
| { | |
| "epoch": 5.564524808954903, | |
| "grad_norm": 0.7406713366508484, | |
| "learning_rate": 0.0002665445533886434, | |
| "loss": 3.3269, | |
| "step": 51700 | |
| }, | |
| { | |
| "epoch": 5.569906360994511, | |
| "grad_norm": 0.6506957411766052, | |
| "learning_rate": 0.00026622131235858206, | |
| "loss": 3.3449, | |
| "step": 51750 | |
| }, | |
| { | |
| "epoch": 5.575287913034119, | |
| "grad_norm": 0.651504397392273, | |
| "learning_rate": 0.0002658980713285206, | |
| "loss": 3.3319, | |
| "step": 51800 | |
| }, | |
| { | |
| "epoch": 5.580669465073727, | |
| "grad_norm": 0.7113884687423706, | |
| "learning_rate": 0.0002655748302984592, | |
| "loss": 3.3368, | |
| "step": 51850 | |
| }, | |
| { | |
| "epoch": 5.586051017113336, | |
| "grad_norm": 0.6775544285774231, | |
| "learning_rate": 0.0002652515892683978, | |
| "loss": 3.3318, | |
| "step": 51900 | |
| }, | |
| { | |
| "epoch": 5.591432569152944, | |
| "grad_norm": 0.6846035718917847, | |
| "learning_rate": 0.0002649283482383364, | |
| "loss": 3.3263, | |
| "step": 51950 | |
| }, | |
| { | |
| "epoch": 5.596814121192552, | |
| "grad_norm": 0.6773697137832642, | |
| "learning_rate": 0.0002646051072082749, | |
| "loss": 3.3309, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 5.596814121192552, | |
| "eval_accuracy": 0.38265015475974073, | |
| "eval_loss": 3.3931753635406494, | |
| "eval_runtime": 184.5806, | |
| "eval_samples_per_second": 97.578, | |
| "eval_steps_per_second": 6.1, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 5.60219567323216, | |
| "grad_norm": 0.7283803820610046, | |
| "learning_rate": 0.0002642818661782135, | |
| "loss": 3.3354, | |
| "step": 52050 | |
| }, | |
| { | |
| "epoch": 5.607577225271768, | |
| "grad_norm": 0.6806847453117371, | |
| "learning_rate": 0.0002639586251481521, | |
| "loss": 3.3427, | |
| "step": 52100 | |
| }, | |
| { | |
| "epoch": 5.612958777311377, | |
| "grad_norm": 0.7113904356956482, | |
| "learning_rate": 0.0002636353841180907, | |
| "loss": 3.3385, | |
| "step": 52150 | |
| }, | |
| { | |
| "epoch": 5.618340329350985, | |
| "grad_norm": 0.7080471515655518, | |
| "learning_rate": 0.0002633121430880293, | |
| "loss": 3.3374, | |
| "step": 52200 | |
| }, | |
| { | |
| "epoch": 5.623721881390593, | |
| "grad_norm": 0.7069630026817322, | |
| "learning_rate": 0.00026298890205796784, | |
| "loss": 3.3373, | |
| "step": 52250 | |
| }, | |
| { | |
| "epoch": 5.629103433430201, | |
| "grad_norm": 0.66731196641922, | |
| "learning_rate": 0.00026266566102790644, | |
| "loss": 3.3431, | |
| "step": 52300 | |
| }, | |
| { | |
| "epoch": 5.634484985469809, | |
| "grad_norm": 0.7217484712600708, | |
| "learning_rate": 0.00026234241999784503, | |
| "loss": 3.3434, | |
| "step": 52350 | |
| }, | |
| { | |
| "epoch": 5.639866537509418, | |
| "grad_norm": 0.7056784629821777, | |
| "learning_rate": 0.0002620191789677836, | |
| "loss": 3.3239, | |
| "step": 52400 | |
| }, | |
| { | |
| "epoch": 5.645248089549026, | |
| "grad_norm": 0.7574684023857117, | |
| "learning_rate": 0.00026169593793772217, | |
| "loss": 3.33, | |
| "step": 52450 | |
| }, | |
| { | |
| "epoch": 5.650629641588634, | |
| "grad_norm": 0.6346601843833923, | |
| "learning_rate": 0.0002613726969076608, | |
| "loss": 3.3388, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 5.656011193628243, | |
| "grad_norm": 0.6737771034240723, | |
| "learning_rate": 0.00026104945587759936, | |
| "loss": 3.3355, | |
| "step": 52550 | |
| }, | |
| { | |
| "epoch": 5.66139274566785, | |
| "grad_norm": 0.719514012336731, | |
| "learning_rate": 0.00026072621484753795, | |
| "loss": 3.3449, | |
| "step": 52600 | |
| }, | |
| { | |
| "epoch": 5.666774297707459, | |
| "grad_norm": 0.7287180423736572, | |
| "learning_rate": 0.00026040297381747655, | |
| "loss": 3.3309, | |
| "step": 52650 | |
| }, | |
| { | |
| "epoch": 5.672155849747067, | |
| "grad_norm": 0.6821631193161011, | |
| "learning_rate": 0.00026007973278741514, | |
| "loss": 3.3392, | |
| "step": 52700 | |
| }, | |
| { | |
| "epoch": 5.677537401786675, | |
| "grad_norm": 0.6680901646614075, | |
| "learning_rate": 0.00025975649175735373, | |
| "loss": 3.3216, | |
| "step": 52750 | |
| }, | |
| { | |
| "epoch": 5.682918953826284, | |
| "grad_norm": 0.725506067276001, | |
| "learning_rate": 0.0002594332507272923, | |
| "loss": 3.3445, | |
| "step": 52800 | |
| }, | |
| { | |
| "epoch": 5.688300505865891, | |
| "grad_norm": 0.6824221611022949, | |
| "learning_rate": 0.00025911000969723087, | |
| "loss": 3.3382, | |
| "step": 52850 | |
| }, | |
| { | |
| "epoch": 5.6936820579055, | |
| "grad_norm": 0.6922152638435364, | |
| "learning_rate": 0.00025878676866716946, | |
| "loss": 3.3261, | |
| "step": 52900 | |
| }, | |
| { | |
| "epoch": 5.699063609945108, | |
| "grad_norm": 0.6897504329681396, | |
| "learning_rate": 0.00025846352763710806, | |
| "loss": 3.3328, | |
| "step": 52950 | |
| }, | |
| { | |
| "epoch": 5.704445161984716, | |
| "grad_norm": 0.7494200468063354, | |
| "learning_rate": 0.0002581402866070466, | |
| "loss": 3.3284, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 5.704445161984716, | |
| "eval_accuracy": 0.38330065959908605, | |
| "eval_loss": 3.3890981674194336, | |
| "eval_runtime": 184.3011, | |
| "eval_samples_per_second": 97.726, | |
| "eval_steps_per_second": 6.11, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 5.709826714024325, | |
| "grad_norm": 0.6961687207221985, | |
| "learning_rate": 0.00025781704557698525, | |
| "loss": 3.345, | |
| "step": 53050 | |
| }, | |
| { | |
| "epoch": 5.715208266063933, | |
| "grad_norm": 0.6829299926757812, | |
| "learning_rate": 0.0002574938045469238, | |
| "loss": 3.3334, | |
| "step": 53100 | |
| }, | |
| { | |
| "epoch": 5.720589818103541, | |
| "grad_norm": 0.690844714641571, | |
| "learning_rate": 0.0002571705635168624, | |
| "loss": 3.355, | |
| "step": 53150 | |
| }, | |
| { | |
| "epoch": 5.725971370143149, | |
| "grad_norm": 0.6903976202011108, | |
| "learning_rate": 0.000256847322486801, | |
| "loss": 3.3509, | |
| "step": 53200 | |
| }, | |
| { | |
| "epoch": 5.731352922182758, | |
| "grad_norm": 0.6853659152984619, | |
| "learning_rate": 0.0002565240814567396, | |
| "loss": 3.3413, | |
| "step": 53250 | |
| }, | |
| { | |
| "epoch": 5.736734474222366, | |
| "grad_norm": 0.7064983248710632, | |
| "learning_rate": 0.0002562008404266781, | |
| "loss": 3.3314, | |
| "step": 53300 | |
| }, | |
| { | |
| "epoch": 5.742116026261974, | |
| "grad_norm": 0.7611440420150757, | |
| "learning_rate": 0.0002558775993966167, | |
| "loss": 3.3322, | |
| "step": 53350 | |
| }, | |
| { | |
| "epoch": 5.747497578301582, | |
| "grad_norm": 0.7700773477554321, | |
| "learning_rate": 0.0002555543583665553, | |
| "loss": 3.338, | |
| "step": 53400 | |
| }, | |
| { | |
| "epoch": 5.75287913034119, | |
| "grad_norm": 0.8143061399459839, | |
| "learning_rate": 0.0002552311173364939, | |
| "loss": 3.3355, | |
| "step": 53450 | |
| }, | |
| { | |
| "epoch": 5.758260682380799, | |
| "grad_norm": 0.6784583926200867, | |
| "learning_rate": 0.0002549078763064325, | |
| "loss": 3.3549, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 5.763642234420407, | |
| "grad_norm": 0.7153463363647461, | |
| "learning_rate": 0.00025458463527637103, | |
| "loss": 3.3405, | |
| "step": 53550 | |
| }, | |
| { | |
| "epoch": 5.769023786460015, | |
| "grad_norm": 0.7094969153404236, | |
| "learning_rate": 0.0002542613942463097, | |
| "loss": 3.344, | |
| "step": 53600 | |
| }, | |
| { | |
| "epoch": 5.774405338499624, | |
| "grad_norm": 0.6838723421096802, | |
| "learning_rate": 0.0002539381532162482, | |
| "loss": 3.318, | |
| "step": 53650 | |
| }, | |
| { | |
| "epoch": 5.779786890539231, | |
| "grad_norm": 0.7074011564254761, | |
| "learning_rate": 0.0002536149121861868, | |
| "loss": 3.3362, | |
| "step": 53700 | |
| }, | |
| { | |
| "epoch": 5.78516844257884, | |
| "grad_norm": 0.7460276484489441, | |
| "learning_rate": 0.0002532916711561254, | |
| "loss": 3.355, | |
| "step": 53750 | |
| }, | |
| { | |
| "epoch": 5.790549994618448, | |
| "grad_norm": 0.6998981237411499, | |
| "learning_rate": 0.000252968430126064, | |
| "loss": 3.3113, | |
| "step": 53800 | |
| }, | |
| { | |
| "epoch": 5.795931546658056, | |
| "grad_norm": 0.7156438231468201, | |
| "learning_rate": 0.00025264518909600255, | |
| "loss": 3.3243, | |
| "step": 53850 | |
| }, | |
| { | |
| "epoch": 5.801313098697665, | |
| "grad_norm": 0.7122145891189575, | |
| "learning_rate": 0.00025232194806594114, | |
| "loss": 3.329, | |
| "step": 53900 | |
| }, | |
| { | |
| "epoch": 5.806694650737272, | |
| "grad_norm": 0.6809146404266357, | |
| "learning_rate": 0.00025199870703587974, | |
| "loss": 3.3572, | |
| "step": 53950 | |
| }, | |
| { | |
| "epoch": 5.812076202776881, | |
| "grad_norm": 0.7085328698158264, | |
| "learning_rate": 0.00025167546600581833, | |
| "loss": 3.3387, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 5.812076202776881, | |
| "eval_accuracy": 0.3836084732302645, | |
| "eval_loss": 3.3847055435180664, | |
| "eval_runtime": 184.9912, | |
| "eval_samples_per_second": 97.361, | |
| "eval_steps_per_second": 6.087, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 5.817457754816489, | |
| "grad_norm": 0.7111963629722595, | |
| "learning_rate": 0.0002513522249757569, | |
| "loss": 3.3469, | |
| "step": 54050 | |
| }, | |
| { | |
| "epoch": 5.822839306856097, | |
| "grad_norm": 0.7095236778259277, | |
| "learning_rate": 0.00025102898394569547, | |
| "loss": 3.3262, | |
| "step": 54100 | |
| }, | |
| { | |
| "epoch": 5.828220858895706, | |
| "grad_norm": 0.6420708894729614, | |
| "learning_rate": 0.00025070574291563406, | |
| "loss": 3.3279, | |
| "step": 54150 | |
| }, | |
| { | |
| "epoch": 5.833602410935313, | |
| "grad_norm": 0.760785698890686, | |
| "learning_rate": 0.00025038250188557265, | |
| "loss": 3.3192, | |
| "step": 54200 | |
| }, | |
| { | |
| "epoch": 5.838983962974922, | |
| "grad_norm": 0.7175909876823425, | |
| "learning_rate": 0.00025005926085551125, | |
| "loss": 3.3305, | |
| "step": 54250 | |
| }, | |
| { | |
| "epoch": 5.84436551501453, | |
| "grad_norm": 0.6923112869262695, | |
| "learning_rate": 0.0002497360198254498, | |
| "loss": 3.351, | |
| "step": 54300 | |
| }, | |
| { | |
| "epoch": 5.849747067054138, | |
| "grad_norm": 0.7294127941131592, | |
| "learning_rate": 0.00024941277879538844, | |
| "loss": 3.3391, | |
| "step": 54350 | |
| }, | |
| { | |
| "epoch": 5.855128619093747, | |
| "grad_norm": 0.692392885684967, | |
| "learning_rate": 0.000249089537765327, | |
| "loss": 3.3468, | |
| "step": 54400 | |
| }, | |
| { | |
| "epoch": 5.860510171133355, | |
| "grad_norm": 0.7060067653656006, | |
| "learning_rate": 0.0002487662967352656, | |
| "loss": 3.3305, | |
| "step": 54450 | |
| }, | |
| { | |
| "epoch": 5.865891723172963, | |
| "grad_norm": 0.7050901651382446, | |
| "learning_rate": 0.00024844305570520417, | |
| "loss": 3.3267, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 5.871273275212571, | |
| "grad_norm": 0.7604856491088867, | |
| "learning_rate": 0.00024811981467514276, | |
| "loss": 3.3238, | |
| "step": 54550 | |
| }, | |
| { | |
| "epoch": 5.87665482725218, | |
| "grad_norm": 0.7352104783058167, | |
| "learning_rate": 0.00024779657364508136, | |
| "loss": 3.3494, | |
| "step": 54600 | |
| }, | |
| { | |
| "epoch": 5.882036379291788, | |
| "grad_norm": 0.6886479258537292, | |
| "learning_rate": 0.0002474733326150199, | |
| "loss": 3.3413, | |
| "step": 54650 | |
| }, | |
| { | |
| "epoch": 5.887417931331396, | |
| "grad_norm": 0.715201199054718, | |
| "learning_rate": 0.0002471500915849585, | |
| "loss": 3.3259, | |
| "step": 54700 | |
| }, | |
| { | |
| "epoch": 5.892799483371004, | |
| "grad_norm": 0.7124782800674438, | |
| "learning_rate": 0.0002468268505548971, | |
| "loss": 3.327, | |
| "step": 54750 | |
| }, | |
| { | |
| "epoch": 5.898181035410612, | |
| "grad_norm": 0.7112532258033752, | |
| "learning_rate": 0.0002465036095248357, | |
| "loss": 3.3208, | |
| "step": 54800 | |
| }, | |
| { | |
| "epoch": 5.903562587450221, | |
| "grad_norm": 0.6896623373031616, | |
| "learning_rate": 0.0002461803684947742, | |
| "loss": 3.3351, | |
| "step": 54850 | |
| }, | |
| { | |
| "epoch": 5.9089441394898286, | |
| "grad_norm": 0.7776743769645691, | |
| "learning_rate": 0.0002458571274647128, | |
| "loss": 3.3279, | |
| "step": 54900 | |
| }, | |
| { | |
| "epoch": 5.914325691529437, | |
| "grad_norm": 0.66485196352005, | |
| "learning_rate": 0.0002455338864346514, | |
| "loss": 3.3363, | |
| "step": 54950 | |
| }, | |
| { | |
| "epoch": 5.919707243569046, | |
| "grad_norm": 0.7536632418632507, | |
| "learning_rate": 0.00024521064540459, | |
| "loss": 3.3288, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 5.919707243569046, | |
| "eval_accuracy": 0.3838439240381585, | |
| "eval_loss": 3.380603551864624, | |
| "eval_runtime": 184.5212, | |
| "eval_samples_per_second": 97.609, | |
| "eval_steps_per_second": 6.102, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 5.925088795608653, | |
| "grad_norm": 0.7313013076782227, | |
| "learning_rate": 0.0002448874043745286, | |
| "loss": 3.352, | |
| "step": 55050 | |
| }, | |
| { | |
| "epoch": 5.930470347648262, | |
| "grad_norm": 0.7326631546020508, | |
| "learning_rate": 0.00024456416334446714, | |
| "loss": 3.3198, | |
| "step": 55100 | |
| }, | |
| { | |
| "epoch": 5.93585189968787, | |
| "grad_norm": 0.708411693572998, | |
| "learning_rate": 0.00024424092231440574, | |
| "loss": 3.3285, | |
| "step": 55150 | |
| }, | |
| { | |
| "epoch": 5.941233451727478, | |
| "grad_norm": 0.7135109305381775, | |
| "learning_rate": 0.00024391768128434436, | |
| "loss": 3.3456, | |
| "step": 55200 | |
| }, | |
| { | |
| "epoch": 5.946615003767087, | |
| "grad_norm": 0.6980443000793457, | |
| "learning_rate": 0.00024359444025428293, | |
| "loss": 3.3312, | |
| "step": 55250 | |
| }, | |
| { | |
| "epoch": 5.951996555806694, | |
| "grad_norm": 0.7981253266334534, | |
| "learning_rate": 0.0002432711992242215, | |
| "loss": 3.3442, | |
| "step": 55300 | |
| }, | |
| { | |
| "epoch": 5.957378107846303, | |
| "grad_norm": 0.7348515391349792, | |
| "learning_rate": 0.0002429479581941601, | |
| "loss": 3.3211, | |
| "step": 55350 | |
| }, | |
| { | |
| "epoch": 5.962759659885911, | |
| "grad_norm": 0.6746955513954163, | |
| "learning_rate": 0.00024262471716409868, | |
| "loss": 3.3398, | |
| "step": 55400 | |
| }, | |
| { | |
| "epoch": 5.968141211925519, | |
| "grad_norm": 0.7755898237228394, | |
| "learning_rate": 0.00024230147613403728, | |
| "loss": 3.3418, | |
| "step": 55450 | |
| }, | |
| { | |
| "epoch": 5.973522763965128, | |
| "grad_norm": 0.6907931566238403, | |
| "learning_rate": 0.00024197823510397584, | |
| "loss": 3.3452, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 5.978904316004736, | |
| "grad_norm": 0.7837631702423096, | |
| "learning_rate": 0.0002416549940739144, | |
| "loss": 3.3322, | |
| "step": 55550 | |
| }, | |
| { | |
| "epoch": 5.984285868044344, | |
| "grad_norm": 0.720840573310852, | |
| "learning_rate": 0.00024133175304385303, | |
| "loss": 3.3389, | |
| "step": 55600 | |
| }, | |
| { | |
| "epoch": 5.989667420083952, | |
| "grad_norm": 0.6716418862342834, | |
| "learning_rate": 0.0002410085120137916, | |
| "loss": 3.3443, | |
| "step": 55650 | |
| }, | |
| { | |
| "epoch": 5.995048972123561, | |
| "grad_norm": 0.6989961266517639, | |
| "learning_rate": 0.00024068527098373017, | |
| "loss": 3.3217, | |
| "step": 55700 | |
| }, | |
| { | |
| "epoch": 6.000430524163169, | |
| "grad_norm": 0.7186738848686218, | |
| "learning_rate": 0.0002403620299536688, | |
| "loss": 3.3256, | |
| "step": 55750 | |
| }, | |
| { | |
| "epoch": 6.005812076202777, | |
| "grad_norm": 0.7037482261657715, | |
| "learning_rate": 0.00024003878892360736, | |
| "loss": 3.2369, | |
| "step": 55800 | |
| }, | |
| { | |
| "epoch": 6.011193628242385, | |
| "grad_norm": 0.7382490038871765, | |
| "learning_rate": 0.00023971554789354593, | |
| "loss": 3.2544, | |
| "step": 55850 | |
| }, | |
| { | |
| "epoch": 6.016575180281993, | |
| "grad_norm": 0.7183423042297363, | |
| "learning_rate": 0.00023939230686348452, | |
| "loss": 3.2488, | |
| "step": 55900 | |
| }, | |
| { | |
| "epoch": 6.021956732321602, | |
| "grad_norm": 0.6994176506996155, | |
| "learning_rate": 0.0002390690658334231, | |
| "loss": 3.2376, | |
| "step": 55950 | |
| }, | |
| { | |
| "epoch": 6.0273382843612096, | |
| "grad_norm": 0.7932013273239136, | |
| "learning_rate": 0.00023874582480336168, | |
| "loss": 3.2596, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 6.0273382843612096, | |
| "eval_accuracy": 0.38380991568427253, | |
| "eval_loss": 3.385104179382324, | |
| "eval_runtime": 185.0498, | |
| "eval_samples_per_second": 97.331, | |
| "eval_steps_per_second": 6.085, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 6.032719836400818, | |
| "grad_norm": 0.6959599256515503, | |
| "learning_rate": 0.00023842258377330028, | |
| "loss": 3.2471, | |
| "step": 56050 | |
| }, | |
| { | |
| "epoch": 6.038101388440427, | |
| "grad_norm": 0.6982143521308899, | |
| "learning_rate": 0.00023809934274323885, | |
| "loss": 3.2574, | |
| "step": 56100 | |
| }, | |
| { | |
| "epoch": 6.043482940480034, | |
| "grad_norm": 0.6901403069496155, | |
| "learning_rate": 0.0002377761017131774, | |
| "loss": 3.2574, | |
| "step": 56150 | |
| }, | |
| { | |
| "epoch": 6.048864492519643, | |
| "grad_norm": 0.7211006283760071, | |
| "learning_rate": 0.00023745932550371725, | |
| "loss": 3.2582, | |
| "step": 56200 | |
| }, | |
| { | |
| "epoch": 6.0542460445592505, | |
| "grad_norm": 0.6988502740859985, | |
| "learning_rate": 0.00023713608447365584, | |
| "loss": 3.2451, | |
| "step": 56250 | |
| }, | |
| { | |
| "epoch": 6.059627596598859, | |
| "grad_norm": 0.7439178824424744, | |
| "learning_rate": 0.00023681284344359444, | |
| "loss": 3.2421, | |
| "step": 56300 | |
| }, | |
| { | |
| "epoch": 6.065009148638468, | |
| "grad_norm": 0.7897803783416748, | |
| "learning_rate": 0.000236489602413533, | |
| "loss": 3.2458, | |
| "step": 56350 | |
| }, | |
| { | |
| "epoch": 6.070390700678075, | |
| "grad_norm": 0.7665706276893616, | |
| "learning_rate": 0.0002361663613834716, | |
| "loss": 3.2681, | |
| "step": 56400 | |
| }, | |
| { | |
| "epoch": 6.075772252717684, | |
| "grad_norm": 0.7175304889678955, | |
| "learning_rate": 0.00023584312035341017, | |
| "loss": 3.2519, | |
| "step": 56450 | |
| }, | |
| { | |
| "epoch": 6.081153804757292, | |
| "grad_norm": 0.6955448985099792, | |
| "learning_rate": 0.00023551987932334876, | |
| "loss": 3.2435, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 6.0865353567969, | |
| "grad_norm": 0.7103674411773682, | |
| "learning_rate": 0.00023519663829328735, | |
| "loss": 3.2499, | |
| "step": 56550 | |
| }, | |
| { | |
| "epoch": 6.091916908836509, | |
| "grad_norm": 0.6995223760604858, | |
| "learning_rate": 0.00023487339726322592, | |
| "loss": 3.2548, | |
| "step": 56600 | |
| }, | |
| { | |
| "epoch": 6.097298460876116, | |
| "grad_norm": 0.7171559929847717, | |
| "learning_rate": 0.0002345501562331645, | |
| "loss": 3.2703, | |
| "step": 56650 | |
| }, | |
| { | |
| "epoch": 6.102680012915725, | |
| "grad_norm": 0.6809073090553284, | |
| "learning_rate": 0.0002342269152031031, | |
| "loss": 3.2562, | |
| "step": 56700 | |
| }, | |
| { | |
| "epoch": 6.108061564955333, | |
| "grad_norm": 0.7592636346817017, | |
| "learning_rate": 0.00023390367417304168, | |
| "loss": 3.2539, | |
| "step": 56750 | |
| }, | |
| { | |
| "epoch": 6.113443116994941, | |
| "grad_norm": 0.7267977595329285, | |
| "learning_rate": 0.00023358043314298025, | |
| "loss": 3.2519, | |
| "step": 56800 | |
| }, | |
| { | |
| "epoch": 6.11882466903455, | |
| "grad_norm": 0.7413089275360107, | |
| "learning_rate": 0.00023325719211291887, | |
| "loss": 3.274, | |
| "step": 56850 | |
| }, | |
| { | |
| "epoch": 6.124206221074158, | |
| "grad_norm": 0.73122638463974, | |
| "learning_rate": 0.00023293395108285744, | |
| "loss": 3.2802, | |
| "step": 56900 | |
| }, | |
| { | |
| "epoch": 6.129587773113766, | |
| "grad_norm": 0.7569130659103394, | |
| "learning_rate": 0.000232610710052796, | |
| "loss": 3.2575, | |
| "step": 56950 | |
| }, | |
| { | |
| "epoch": 6.134969325153374, | |
| "grad_norm": 0.7256159782409668, | |
| "learning_rate": 0.0002322874690227346, | |
| "loss": 3.2702, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 6.134969325153374, | |
| "eval_accuracy": 0.38442326123598525, | |
| "eval_loss": 3.3839449882507324, | |
| "eval_runtime": 184.2132, | |
| "eval_samples_per_second": 97.773, | |
| "eval_steps_per_second": 6.112, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 6.140350877192983, | |
| "grad_norm": 0.7238855957984924, | |
| "learning_rate": 0.0002319642279926732, | |
| "loss": 3.2678, | |
| "step": 57050 | |
| }, | |
| { | |
| "epoch": 6.1457324292325906, | |
| "grad_norm": 0.7548224329948425, | |
| "learning_rate": 0.00023164098696261176, | |
| "loss": 3.2434, | |
| "step": 57100 | |
| }, | |
| { | |
| "epoch": 6.151113981272199, | |
| "grad_norm": 0.718875527381897, | |
| "learning_rate": 0.00023131774593255036, | |
| "loss": 3.2457, | |
| "step": 57150 | |
| }, | |
| { | |
| "epoch": 6.156495533311807, | |
| "grad_norm": 0.7122008204460144, | |
| "learning_rate": 0.00023099450490248892, | |
| "loss": 3.2702, | |
| "step": 57200 | |
| }, | |
| { | |
| "epoch": 6.161877085351415, | |
| "grad_norm": 0.743674635887146, | |
| "learning_rate": 0.00023067126387242754, | |
| "loss": 3.2538, | |
| "step": 57250 | |
| }, | |
| { | |
| "epoch": 6.167258637391024, | |
| "grad_norm": 0.7488346695899963, | |
| "learning_rate": 0.0002303480228423661, | |
| "loss": 3.2706, | |
| "step": 57300 | |
| }, | |
| { | |
| "epoch": 6.1726401894306315, | |
| "grad_norm": 0.7008352279663086, | |
| "learning_rate": 0.00023002478181230468, | |
| "loss": 3.2688, | |
| "step": 57350 | |
| }, | |
| { | |
| "epoch": 6.17802174147024, | |
| "grad_norm": 0.7453217506408691, | |
| "learning_rate": 0.00022970154078224327, | |
| "loss": 3.2469, | |
| "step": 57400 | |
| }, | |
| { | |
| "epoch": 6.183403293509849, | |
| "grad_norm": 0.7243471145629883, | |
| "learning_rate": 0.00022937829975218187, | |
| "loss": 3.2547, | |
| "step": 57450 | |
| }, | |
| { | |
| "epoch": 6.188784845549456, | |
| "grad_norm": 0.7291977405548096, | |
| "learning_rate": 0.00022905505872212044, | |
| "loss": 3.2612, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 6.194166397589065, | |
| "grad_norm": 0.7475267648696899, | |
| "learning_rate": 0.00022873181769205903, | |
| "loss": 3.2817, | |
| "step": 57550 | |
| }, | |
| { | |
| "epoch": 6.1995479496286725, | |
| "grad_norm": 0.7287805676460266, | |
| "learning_rate": 0.0002284085766619976, | |
| "loss": 3.2536, | |
| "step": 57600 | |
| }, | |
| { | |
| "epoch": 6.204929501668281, | |
| "grad_norm": 0.7033033967018127, | |
| "learning_rate": 0.0002280853356319362, | |
| "loss": 3.268, | |
| "step": 57650 | |
| }, | |
| { | |
| "epoch": 6.21031105370789, | |
| "grad_norm": 0.7168211340904236, | |
| "learning_rate": 0.0002277620946018748, | |
| "loss": 3.2749, | |
| "step": 57700 | |
| }, | |
| { | |
| "epoch": 6.215692605747497, | |
| "grad_norm": 0.7199757695198059, | |
| "learning_rate": 0.00022743885357181336, | |
| "loss": 3.2675, | |
| "step": 57750 | |
| }, | |
| { | |
| "epoch": 6.221074157787106, | |
| "grad_norm": 0.7445018887519836, | |
| "learning_rate": 0.00022711561254175192, | |
| "loss": 3.2799, | |
| "step": 57800 | |
| }, | |
| { | |
| "epoch": 6.226455709826714, | |
| "grad_norm": 0.7764657735824585, | |
| "learning_rate": 0.00022679237151169054, | |
| "loss": 3.274, | |
| "step": 57850 | |
| }, | |
| { | |
| "epoch": 6.231837261866322, | |
| "grad_norm": 0.807851254940033, | |
| "learning_rate": 0.0002264691304816291, | |
| "loss": 3.2702, | |
| "step": 57900 | |
| }, | |
| { | |
| "epoch": 6.237218813905931, | |
| "grad_norm": 0.6823807954788208, | |
| "learning_rate": 0.00022614588945156768, | |
| "loss": 3.2565, | |
| "step": 57950 | |
| }, | |
| { | |
| "epoch": 6.242600365945538, | |
| "grad_norm": 0.7420631051063538, | |
| "learning_rate": 0.0002258226484215063, | |
| "loss": 3.2469, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 6.242600365945538, | |
| "eval_accuracy": 0.384593846269854, | |
| "eval_loss": 3.382420063018799, | |
| "eval_runtime": 184.5323, | |
| "eval_samples_per_second": 97.603, | |
| "eval_steps_per_second": 6.102, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 6.247981917985147, | |
| "grad_norm": 0.7086026072502136, | |
| "learning_rate": 0.00022549940739144487, | |
| "loss": 3.2669, | |
| "step": 58050 | |
| }, | |
| { | |
| "epoch": 6.253363470024755, | |
| "grad_norm": 0.7496986985206604, | |
| "learning_rate": 0.00022517616636138344, | |
| "loss": 3.2706, | |
| "step": 58100 | |
| }, | |
| { | |
| "epoch": 6.258745022064363, | |
| "grad_norm": 0.8295875191688538, | |
| "learning_rate": 0.00022485292533132203, | |
| "loss": 3.2842, | |
| "step": 58150 | |
| }, | |
| { | |
| "epoch": 6.264126574103972, | |
| "grad_norm": 0.7018703818321228, | |
| "learning_rate": 0.00022452968430126063, | |
| "loss": 3.2483, | |
| "step": 58200 | |
| }, | |
| { | |
| "epoch": 6.26950812614358, | |
| "grad_norm": 0.7856804132461548, | |
| "learning_rate": 0.00022420644327119922, | |
| "loss": 3.2631, | |
| "step": 58250 | |
| }, | |
| { | |
| "epoch": 6.274889678183188, | |
| "grad_norm": 0.742798388004303, | |
| "learning_rate": 0.0002238832022411378, | |
| "loss": 3.2942, | |
| "step": 58300 | |
| }, | |
| { | |
| "epoch": 6.280271230222796, | |
| "grad_norm": 0.7351896166801453, | |
| "learning_rate": 0.00022356642603167762, | |
| "loss": 3.2751, | |
| "step": 58350 | |
| }, | |
| { | |
| "epoch": 6.285652782262405, | |
| "grad_norm": 0.7923677563667297, | |
| "learning_rate": 0.0002232431850016162, | |
| "loss": 3.2643, | |
| "step": 58400 | |
| }, | |
| { | |
| "epoch": 6.2910343343020125, | |
| "grad_norm": 0.731157124042511, | |
| "learning_rate": 0.00022291994397155476, | |
| "loss": 3.2858, | |
| "step": 58450 | |
| }, | |
| { | |
| "epoch": 6.296415886341621, | |
| "grad_norm": 0.6895765662193298, | |
| "learning_rate": 0.00022259670294149338, | |
| "loss": 3.2637, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 6.301797438381229, | |
| "grad_norm": 0.7152901291847229, | |
| "learning_rate": 0.00022227346191143195, | |
| "loss": 3.282, | |
| "step": 58550 | |
| }, | |
| { | |
| "epoch": 6.307178990420837, | |
| "grad_norm": 0.7318597435951233, | |
| "learning_rate": 0.00022195022088137051, | |
| "loss": 3.287, | |
| "step": 58600 | |
| }, | |
| { | |
| "epoch": 6.312560542460446, | |
| "grad_norm": 0.7311955690383911, | |
| "learning_rate": 0.0002216269798513091, | |
| "loss": 3.2738, | |
| "step": 58650 | |
| }, | |
| { | |
| "epoch": 6.3179420945000535, | |
| "grad_norm": 0.7657797932624817, | |
| "learning_rate": 0.00022130373882124768, | |
| "loss": 3.2812, | |
| "step": 58700 | |
| }, | |
| { | |
| "epoch": 6.323323646539662, | |
| "grad_norm": 0.7154713273048401, | |
| "learning_rate": 0.00022098049779118627, | |
| "loss": 3.2792, | |
| "step": 58750 | |
| }, | |
| { | |
| "epoch": 6.328705198579271, | |
| "grad_norm": 0.6902388334274292, | |
| "learning_rate": 0.00022065725676112487, | |
| "loss": 3.298, | |
| "step": 58800 | |
| }, | |
| { | |
| "epoch": 6.334086750618878, | |
| "grad_norm": 0.7189655900001526, | |
| "learning_rate": 0.00022033401573106343, | |
| "loss": 3.2837, | |
| "step": 58850 | |
| }, | |
| { | |
| "epoch": 6.339468302658487, | |
| "grad_norm": 0.768619954586029, | |
| "learning_rate": 0.000220010774701002, | |
| "loss": 3.3002, | |
| "step": 58900 | |
| }, | |
| { | |
| "epoch": 6.344849854698095, | |
| "grad_norm": 0.7521898746490479, | |
| "learning_rate": 0.00021968753367094062, | |
| "loss": 3.2853, | |
| "step": 58950 | |
| }, | |
| { | |
| "epoch": 6.350231406737703, | |
| "grad_norm": 0.7037254571914673, | |
| "learning_rate": 0.0002193642926408792, | |
| "loss": 3.27, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 6.350231406737703, | |
| "eval_accuracy": 0.3849088309916282, | |
| "eval_loss": 3.378408193588257, | |
| "eval_runtime": 184.7166, | |
| "eval_samples_per_second": 97.506, | |
| "eval_steps_per_second": 6.096, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 6.355612958777312, | |
| "grad_norm": 0.7696385383605957, | |
| "learning_rate": 0.00021904105161081778, | |
| "loss": 3.2782, | |
| "step": 59050 | |
| }, | |
| { | |
| "epoch": 6.360994510816919, | |
| "grad_norm": 0.7132013440132141, | |
| "learning_rate": 0.00021871781058075638, | |
| "loss": 3.2695, | |
| "step": 59100 | |
| }, | |
| { | |
| "epoch": 6.366376062856528, | |
| "grad_norm": 0.732195258140564, | |
| "learning_rate": 0.00021839456955069495, | |
| "loss": 3.2862, | |
| "step": 59150 | |
| }, | |
| { | |
| "epoch": 6.371757614896136, | |
| "grad_norm": 0.7322704195976257, | |
| "learning_rate": 0.00021807132852063354, | |
| "loss": 3.2794, | |
| "step": 59200 | |
| }, | |
| { | |
| "epoch": 6.377139166935744, | |
| "grad_norm": 0.7418326735496521, | |
| "learning_rate": 0.0002177480874905721, | |
| "loss": 3.2946, | |
| "step": 59250 | |
| }, | |
| { | |
| "epoch": 6.382520718975353, | |
| "grad_norm": 0.7401996850967407, | |
| "learning_rate": 0.0002174248464605107, | |
| "loss": 3.2738, | |
| "step": 59300 | |
| }, | |
| { | |
| "epoch": 6.387902271014961, | |
| "grad_norm": 0.7222006320953369, | |
| "learning_rate": 0.0002171016054304493, | |
| "loss": 3.2722, | |
| "step": 59350 | |
| }, | |
| { | |
| "epoch": 6.393283823054569, | |
| "grad_norm": 0.750238299369812, | |
| "learning_rate": 0.00021677836440038787, | |
| "loss": 3.2873, | |
| "step": 59400 | |
| }, | |
| { | |
| "epoch": 6.398665375094177, | |
| "grad_norm": 0.7619665861129761, | |
| "learning_rate": 0.00021645512337032643, | |
| "loss": 3.2865, | |
| "step": 59450 | |
| }, | |
| { | |
| "epoch": 6.404046927133785, | |
| "grad_norm": 0.7211326956748962, | |
| "learning_rate": 0.00021613188234026506, | |
| "loss": 3.2845, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 6.4094284791733935, | |
| "grad_norm": 0.750061571598053, | |
| "learning_rate": 0.00021580864131020362, | |
| "loss": 3.31, | |
| "step": 59550 | |
| }, | |
| { | |
| "epoch": 6.414810031213002, | |
| "grad_norm": 0.726005494594574, | |
| "learning_rate": 0.0002154854002801422, | |
| "loss": 3.2918, | |
| "step": 59600 | |
| }, | |
| { | |
| "epoch": 6.42019158325261, | |
| "grad_norm": 0.7573758959770203, | |
| "learning_rate": 0.0002151621592500808, | |
| "loss": 3.2917, | |
| "step": 59650 | |
| }, | |
| { | |
| "epoch": 6.425573135292218, | |
| "grad_norm": 0.7391150593757629, | |
| "learning_rate": 0.00021483891822001938, | |
| "loss": 3.2791, | |
| "step": 59700 | |
| }, | |
| { | |
| "epoch": 6.430954687331827, | |
| "grad_norm": 0.7519822716712952, | |
| "learning_rate": 0.00021451567718995795, | |
| "loss": 3.3105, | |
| "step": 59750 | |
| }, | |
| { | |
| "epoch": 6.4363362393714345, | |
| "grad_norm": 0.7713526487350464, | |
| "learning_rate": 0.00021419243615989654, | |
| "loss": 3.2881, | |
| "step": 59800 | |
| }, | |
| { | |
| "epoch": 6.441717791411043, | |
| "grad_norm": 0.7575291991233826, | |
| "learning_rate": 0.00021386919512983514, | |
| "loss": 3.2936, | |
| "step": 59850 | |
| }, | |
| { | |
| "epoch": 6.447099343450651, | |
| "grad_norm": 0.7623528838157654, | |
| "learning_rate": 0.0002135459540997737, | |
| "loss": 3.2876, | |
| "step": 59900 | |
| }, | |
| { | |
| "epoch": 6.452480895490259, | |
| "grad_norm": 0.6868704557418823, | |
| "learning_rate": 0.0002132227130697123, | |
| "loss": 3.2962, | |
| "step": 59950 | |
| }, | |
| { | |
| "epoch": 6.457862447529868, | |
| "grad_norm": 0.757573664188385, | |
| "learning_rate": 0.00021289947203965087, | |
| "loss": 3.2823, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 6.457862447529868, | |
| "eval_accuracy": 0.38533246860121684, | |
| "eval_loss": 3.3746187686920166, | |
| "eval_runtime": 184.5347, | |
| "eval_samples_per_second": 97.602, | |
| "eval_steps_per_second": 6.102, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 6.4632439995694755, | |
| "grad_norm": 0.7375765442848206, | |
| "learning_rate": 0.0002125762310095895, | |
| "loss": 3.2661, | |
| "step": 60050 | |
| }, | |
| { | |
| "epoch": 6.468625551609084, | |
| "grad_norm": 0.7561099529266357, | |
| "learning_rate": 0.00021225298997952806, | |
| "loss": 3.2906, | |
| "step": 60100 | |
| }, | |
| { | |
| "epoch": 6.474007103648693, | |
| "grad_norm": 0.6815261244773865, | |
| "learning_rate": 0.00021192974894946662, | |
| "loss": 3.2807, | |
| "step": 60150 | |
| }, | |
| { | |
| "epoch": 6.4793886556883, | |
| "grad_norm": 0.7357954382896423, | |
| "learning_rate": 0.00021160650791940524, | |
| "loss": 3.2714, | |
| "step": 60200 | |
| }, | |
| { | |
| "epoch": 6.484770207727909, | |
| "grad_norm": 0.7838602066040039, | |
| "learning_rate": 0.0002112832668893438, | |
| "loss": 3.2868, | |
| "step": 60250 | |
| }, | |
| { | |
| "epoch": 6.490151759767517, | |
| "grad_norm": 0.7356141805648804, | |
| "learning_rate": 0.00021096002585928238, | |
| "loss": 3.283, | |
| "step": 60300 | |
| }, | |
| { | |
| "epoch": 6.495533311807125, | |
| "grad_norm": 0.7910470962524414, | |
| "learning_rate": 0.00021063678482922097, | |
| "loss": 3.2661, | |
| "step": 60350 | |
| }, | |
| { | |
| "epoch": 6.500914863846734, | |
| "grad_norm": 0.7709822654724121, | |
| "learning_rate": 0.00021031354379915957, | |
| "loss": 3.2817, | |
| "step": 60400 | |
| }, | |
| { | |
| "epoch": 6.506296415886341, | |
| "grad_norm": 0.7653742432594299, | |
| "learning_rate": 0.00020999030276909814, | |
| "loss": 3.2721, | |
| "step": 60450 | |
| }, | |
| { | |
| "epoch": 6.51167796792595, | |
| "grad_norm": 0.8064974546432495, | |
| "learning_rate": 0.00020966706173903673, | |
| "loss": 3.2936, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 6.517059519965558, | |
| "grad_norm": 0.7694202661514282, | |
| "learning_rate": 0.0002093438207089753, | |
| "loss": 3.2643, | |
| "step": 60550 | |
| }, | |
| { | |
| "epoch": 6.522441072005166, | |
| "grad_norm": 0.7296129465103149, | |
| "learning_rate": 0.00020902057967891387, | |
| "loss": 3.2766, | |
| "step": 60600 | |
| }, | |
| { | |
| "epoch": 6.5278226240447745, | |
| "grad_norm": 0.7111927270889282, | |
| "learning_rate": 0.0002087038034694537, | |
| "loss": 3.2881, | |
| "step": 60650 | |
| }, | |
| { | |
| "epoch": 6.533204176084383, | |
| "grad_norm": 0.7283411026000977, | |
| "learning_rate": 0.00020838056243939227, | |
| "loss": 3.2934, | |
| "step": 60700 | |
| }, | |
| { | |
| "epoch": 6.538585728123991, | |
| "grad_norm": 0.7556711435317993, | |
| "learning_rate": 0.0002080573214093309, | |
| "loss": 3.2895, | |
| "step": 60750 | |
| }, | |
| { | |
| "epoch": 6.543967280163599, | |
| "grad_norm": 0.7352252006530762, | |
| "learning_rate": 0.00020773408037926946, | |
| "loss": 3.2778, | |
| "step": 60800 | |
| }, | |
| { | |
| "epoch": 6.549348832203208, | |
| "grad_norm": 0.79709792137146, | |
| "learning_rate": 0.00020741083934920805, | |
| "loss": 3.2836, | |
| "step": 60850 | |
| }, | |
| { | |
| "epoch": 6.5547303842428155, | |
| "grad_norm": 0.7675243020057678, | |
| "learning_rate": 0.00020708759831914662, | |
| "loss": 3.3018, | |
| "step": 60900 | |
| }, | |
| { | |
| "epoch": 6.560111936282424, | |
| "grad_norm": 0.7569796442985535, | |
| "learning_rate": 0.00020676435728908521, | |
| "loss": 3.2825, | |
| "step": 60950 | |
| }, | |
| { | |
| "epoch": 6.565493488322032, | |
| "grad_norm": 0.7302240133285522, | |
| "learning_rate": 0.0002064411162590238, | |
| "loss": 3.2891, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 6.565493488322032, | |
| "eval_accuracy": 0.38546676357055554, | |
| "eval_loss": 3.3698201179504395, | |
| "eval_runtime": 184.5506, | |
| "eval_samples_per_second": 97.594, | |
| "eval_steps_per_second": 6.101, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 6.57087504036164, | |
| "grad_norm": 0.8027206659317017, | |
| "learning_rate": 0.00020611787522896238, | |
| "loss": 3.2821, | |
| "step": 61050 | |
| }, | |
| { | |
| "epoch": 6.576256592401249, | |
| "grad_norm": 0.7942132949829102, | |
| "learning_rate": 0.00020579463419890094, | |
| "loss": 3.2906, | |
| "step": 61100 | |
| }, | |
| { | |
| "epoch": 6.5816381444408565, | |
| "grad_norm": 0.7035049796104431, | |
| "learning_rate": 0.00020547139316883957, | |
| "loss": 3.2796, | |
| "step": 61150 | |
| }, | |
| { | |
| "epoch": 6.587019696480465, | |
| "grad_norm": 0.7181252837181091, | |
| "learning_rate": 0.00020514815213877813, | |
| "loss": 3.2802, | |
| "step": 61200 | |
| }, | |
| { | |
| "epoch": 6.592401248520073, | |
| "grad_norm": 0.7472237348556519, | |
| "learning_rate": 0.0002048249111087167, | |
| "loss": 3.2996, | |
| "step": 61250 | |
| }, | |
| { | |
| "epoch": 6.597782800559681, | |
| "grad_norm": 0.7235519886016846, | |
| "learning_rate": 0.00020450167007865532, | |
| "loss": 3.2763, | |
| "step": 61300 | |
| }, | |
| { | |
| "epoch": 6.60316435259929, | |
| "grad_norm": 0.7864811420440674, | |
| "learning_rate": 0.0002041784290485939, | |
| "loss": 3.2803, | |
| "step": 61350 | |
| }, | |
| { | |
| "epoch": 6.608545904638898, | |
| "grad_norm": 0.7728503346443176, | |
| "learning_rate": 0.00020385518801853246, | |
| "loss": 3.2628, | |
| "step": 61400 | |
| }, | |
| { | |
| "epoch": 6.613927456678506, | |
| "grad_norm": 0.7238863110542297, | |
| "learning_rate": 0.00020353194698847105, | |
| "loss": 3.2929, | |
| "step": 61450 | |
| }, | |
| { | |
| "epoch": 6.619309008718115, | |
| "grad_norm": 0.7301209568977356, | |
| "learning_rate": 0.00020320870595840965, | |
| "loss": 3.2925, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 6.624690560757722, | |
| "grad_norm": 0.7258849740028381, | |
| "learning_rate": 0.00020288546492834821, | |
| "loss": 3.2566, | |
| "step": 61550 | |
| }, | |
| { | |
| "epoch": 6.630072112797331, | |
| "grad_norm": 0.7997481226921082, | |
| "learning_rate": 0.0002025622238982868, | |
| "loss": 3.2918, | |
| "step": 61600 | |
| }, | |
| { | |
| "epoch": 6.635453664836939, | |
| "grad_norm": 0.7105148434638977, | |
| "learning_rate": 0.00020223898286822538, | |
| "loss": 3.2829, | |
| "step": 61650 | |
| }, | |
| { | |
| "epoch": 6.640835216876547, | |
| "grad_norm": 0.724937915802002, | |
| "learning_rate": 0.00020191574183816397, | |
| "loss": 3.2826, | |
| "step": 61700 | |
| }, | |
| { | |
| "epoch": 6.6462167689161555, | |
| "grad_norm": 0.7699351906776428, | |
| "learning_rate": 0.00020159250080810257, | |
| "loss": 3.2718, | |
| "step": 61750 | |
| }, | |
| { | |
| "epoch": 6.651598320955763, | |
| "grad_norm": 0.6951019167900085, | |
| "learning_rate": 0.00020126925977804113, | |
| "loss": 3.2886, | |
| "step": 61800 | |
| }, | |
| { | |
| "epoch": 6.656979872995372, | |
| "grad_norm": 0.7419074773788452, | |
| "learning_rate": 0.00020094601874797976, | |
| "loss": 3.3004, | |
| "step": 61850 | |
| }, | |
| { | |
| "epoch": 6.66236142503498, | |
| "grad_norm": 0.7189041972160339, | |
| "learning_rate": 0.00020062277771791832, | |
| "loss": 3.2838, | |
| "step": 61900 | |
| }, | |
| { | |
| "epoch": 6.667742977074588, | |
| "grad_norm": 0.8087723255157471, | |
| "learning_rate": 0.0002002995366878569, | |
| "loss": 3.2995, | |
| "step": 61950 | |
| }, | |
| { | |
| "epoch": 6.6731245291141965, | |
| "grad_norm": 0.763596773147583, | |
| "learning_rate": 0.00019997629565779548, | |
| "loss": 3.3042, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 6.6731245291141965, | |
| "eval_accuracy": 0.3860081809105351, | |
| "eval_loss": 3.3655924797058105, | |
| "eval_runtime": 184.5192, | |
| "eval_samples_per_second": 97.61, | |
| "eval_steps_per_second": 6.102, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 6.678506081153805, | |
| "grad_norm": 0.7524381875991821, | |
| "learning_rate": 0.00019965305462773405, | |
| "loss": 3.2681, | |
| "step": 62050 | |
| }, | |
| { | |
| "epoch": 6.683887633193413, | |
| "grad_norm": 0.7397823929786682, | |
| "learning_rate": 0.00019932981359767265, | |
| "loss": 3.2809, | |
| "step": 62100 | |
| }, | |
| { | |
| "epoch": 6.689269185233021, | |
| "grad_norm": 0.791314423084259, | |
| "learning_rate": 0.00019900657256761124, | |
| "loss": 3.29, | |
| "step": 62150 | |
| }, | |
| { | |
| "epoch": 6.69465073727263, | |
| "grad_norm": 0.7627555727958679, | |
| "learning_rate": 0.0001986833315375498, | |
| "loss": 3.279, | |
| "step": 62200 | |
| }, | |
| { | |
| "epoch": 6.7000322893122375, | |
| "grad_norm": 0.7323765158653259, | |
| "learning_rate": 0.00019836009050748838, | |
| "loss": 3.2983, | |
| "step": 62250 | |
| }, | |
| { | |
| "epoch": 6.705413841351846, | |
| "grad_norm": 0.75467449426651, | |
| "learning_rate": 0.000198036849477427, | |
| "loss": 3.2881, | |
| "step": 62300 | |
| }, | |
| { | |
| "epoch": 6.710795393391454, | |
| "grad_norm": 0.7454742193222046, | |
| "learning_rate": 0.00019771360844736557, | |
| "loss": 3.2927, | |
| "step": 62350 | |
| }, | |
| { | |
| "epoch": 6.716176945431062, | |
| "grad_norm": 0.7409285306930542, | |
| "learning_rate": 0.00019739036741730413, | |
| "loss": 3.2686, | |
| "step": 62400 | |
| }, | |
| { | |
| "epoch": 6.721558497470671, | |
| "grad_norm": 0.7461663484573364, | |
| "learning_rate": 0.00019706712638724276, | |
| "loss": 3.2757, | |
| "step": 62450 | |
| }, | |
| { | |
| "epoch": 6.7269400495102785, | |
| "grad_norm": 0.7160119414329529, | |
| "learning_rate": 0.00019674388535718132, | |
| "loss": 3.2942, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 6.732321601549887, | |
| "grad_norm": 0.7318922281265259, | |
| "learning_rate": 0.0001964206443271199, | |
| "loss": 3.2803, | |
| "step": 62550 | |
| }, | |
| { | |
| "epoch": 6.737703153589496, | |
| "grad_norm": 0.7417627573013306, | |
| "learning_rate": 0.00019609740329705849, | |
| "loss": 3.2786, | |
| "step": 62600 | |
| }, | |
| { | |
| "epoch": 6.743084705629103, | |
| "grad_norm": 0.7505048513412476, | |
| "learning_rate": 0.00019577416226699708, | |
| "loss": 3.2866, | |
| "step": 62650 | |
| }, | |
| { | |
| "epoch": 6.748466257668712, | |
| "grad_norm": 0.7610653638839722, | |
| "learning_rate": 0.00019545092123693565, | |
| "loss": 3.2964, | |
| "step": 62700 | |
| }, | |
| { | |
| "epoch": 6.75384780970832, | |
| "grad_norm": 0.8253541588783264, | |
| "learning_rate": 0.00019512768020687424, | |
| "loss": 3.2805, | |
| "step": 62750 | |
| }, | |
| { | |
| "epoch": 6.759229361747928, | |
| "grad_norm": 0.7113492488861084, | |
| "learning_rate": 0.0001948044391768128, | |
| "loss": 3.2761, | |
| "step": 62800 | |
| }, | |
| { | |
| "epoch": 6.7646109137875365, | |
| "grad_norm": 0.7965245842933655, | |
| "learning_rate": 0.00019448119814675143, | |
| "loss": 3.2815, | |
| "step": 62850 | |
| }, | |
| { | |
| "epoch": 6.769992465827144, | |
| "grad_norm": 0.7471493482589722, | |
| "learning_rate": 0.00019415795711669, | |
| "loss": 3.2948, | |
| "step": 62900 | |
| }, | |
| { | |
| "epoch": 6.775374017866753, | |
| "grad_norm": 0.7140371203422546, | |
| "learning_rate": 0.00019383471608662857, | |
| "loss": 3.2705, | |
| "step": 62950 | |
| }, | |
| { | |
| "epoch": 6.780755569906361, | |
| "grad_norm": 0.7177582383155823, | |
| "learning_rate": 0.0001935114750565672, | |
| "loss": 3.2623, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 6.780755569906361, | |
| "eval_accuracy": 0.38672713706920353, | |
| "eval_loss": 3.3607940673828125, | |
| "eval_runtime": 184.6782, | |
| "eval_samples_per_second": 97.526, | |
| "eval_steps_per_second": 6.097, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 6.786137121945969, | |
| "grad_norm": 0.7676652669906616, | |
| "learning_rate": 0.00019318823402650576, | |
| "loss": 3.2935, | |
| "step": 63050 | |
| }, | |
| { | |
| "epoch": 6.7915186739855775, | |
| "grad_norm": 0.7786064743995667, | |
| "learning_rate": 0.00019286499299644432, | |
| "loss": 3.2795, | |
| "step": 63100 | |
| }, | |
| { | |
| "epoch": 6.796900226025185, | |
| "grad_norm": 0.7760289311408997, | |
| "learning_rate": 0.00019254175196638292, | |
| "loss": 3.2892, | |
| "step": 63150 | |
| }, | |
| { | |
| "epoch": 6.802281778064794, | |
| "grad_norm": 0.7892428040504456, | |
| "learning_rate": 0.0001922185109363215, | |
| "loss": 3.2816, | |
| "step": 63200 | |
| }, | |
| { | |
| "epoch": 6.807663330104402, | |
| "grad_norm": 0.8190674781799316, | |
| "learning_rate": 0.00019189526990626008, | |
| "loss": 3.2927, | |
| "step": 63250 | |
| }, | |
| { | |
| "epoch": 6.813044882144011, | |
| "grad_norm": 0.8174397349357605, | |
| "learning_rate": 0.00019157202887619867, | |
| "loss": 3.2842, | |
| "step": 63300 | |
| }, | |
| { | |
| "epoch": 6.8184264341836185, | |
| "grad_norm": 0.7673400640487671, | |
| "learning_rate": 0.00019124878784613724, | |
| "loss": 3.2732, | |
| "step": 63350 | |
| }, | |
| { | |
| "epoch": 6.823807986223227, | |
| "grad_norm": 0.7255118489265442, | |
| "learning_rate": 0.00019092554681607584, | |
| "loss": 3.2817, | |
| "step": 63400 | |
| }, | |
| { | |
| "epoch": 6.829189538262835, | |
| "grad_norm": 0.8077763915061951, | |
| "learning_rate": 0.00019060230578601443, | |
| "loss": 3.2905, | |
| "step": 63450 | |
| }, | |
| { | |
| "epoch": 6.834571090302443, | |
| "grad_norm": 0.7471402287483215, | |
| "learning_rate": 0.000190279064755953, | |
| "loss": 3.2887, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 6.839952642342052, | |
| "grad_norm": 0.7530843615531921, | |
| "learning_rate": 0.00018995582372589157, | |
| "loss": 3.2994, | |
| "step": 63550 | |
| }, | |
| { | |
| "epoch": 6.8453341943816595, | |
| "grad_norm": 0.7443686127662659, | |
| "learning_rate": 0.0001896325826958302, | |
| "loss": 3.2946, | |
| "step": 63600 | |
| }, | |
| { | |
| "epoch": 6.850715746421268, | |
| "grad_norm": 0.7804056406021118, | |
| "learning_rate": 0.00018930934166576876, | |
| "loss": 3.2926, | |
| "step": 63650 | |
| }, | |
| { | |
| "epoch": 6.856097298460876, | |
| "grad_norm": 0.7758466005325317, | |
| "learning_rate": 0.00018898610063570732, | |
| "loss": 3.2731, | |
| "step": 63700 | |
| }, | |
| { | |
| "epoch": 6.861478850500484, | |
| "grad_norm": 0.782230019569397, | |
| "learning_rate": 0.00018866285960564595, | |
| "loss": 3.287, | |
| "step": 63750 | |
| }, | |
| { | |
| "epoch": 6.866860402540093, | |
| "grad_norm": 0.7680057883262634, | |
| "learning_rate": 0.0001883396185755845, | |
| "loss": 3.302, | |
| "step": 63800 | |
| }, | |
| { | |
| "epoch": 6.8722419545797, | |
| "grad_norm": 0.7489615082740784, | |
| "learning_rate": 0.0001880163775455231, | |
| "loss": 3.2765, | |
| "step": 63850 | |
| }, | |
| { | |
| "epoch": 6.877623506619309, | |
| "grad_norm": 0.7689592838287354, | |
| "learning_rate": 0.00018769313651546168, | |
| "loss": 3.2741, | |
| "step": 63900 | |
| }, | |
| { | |
| "epoch": 6.8830050586589175, | |
| "grad_norm": 0.7587409615516663, | |
| "learning_rate": 0.00018736989548540027, | |
| "loss": 3.2811, | |
| "step": 63950 | |
| }, | |
| { | |
| "epoch": 6.888386610698525, | |
| "grad_norm": 0.7683374881744385, | |
| "learning_rate": 0.00018704665445533886, | |
| "loss": 3.2891, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 6.888386610698525, | |
| "eval_accuracy": 0.38712567586170704, | |
| "eval_loss": 3.356752395629883, | |
| "eval_runtime": 184.2201, | |
| "eval_samples_per_second": 97.769, | |
| "eval_steps_per_second": 6.112, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 6.893768162738134, | |
| "grad_norm": 0.7452059984207153, | |
| "learning_rate": 0.00018672341342527743, | |
| "loss": 3.2998, | |
| "step": 64050 | |
| }, | |
| { | |
| "epoch": 6.899149714777742, | |
| "grad_norm": 0.7655795216560364, | |
| "learning_rate": 0.000186400172395216, | |
| "loss": 3.2836, | |
| "step": 64100 | |
| }, | |
| { | |
| "epoch": 6.90453126681735, | |
| "grad_norm": 0.773313045501709, | |
| "learning_rate": 0.00018607693136515462, | |
| "loss": 3.27, | |
| "step": 64150 | |
| }, | |
| { | |
| "epoch": 6.9099128188569585, | |
| "grad_norm": 0.7528076171875, | |
| "learning_rate": 0.0001857536903350932, | |
| "loss": 3.2664, | |
| "step": 64200 | |
| }, | |
| { | |
| "epoch": 6.915294370896566, | |
| "grad_norm": 0.7117979526519775, | |
| "learning_rate": 0.00018543044930503176, | |
| "loss": 3.2996, | |
| "step": 64250 | |
| }, | |
| { | |
| "epoch": 6.920675922936175, | |
| "grad_norm": 0.7476879358291626, | |
| "learning_rate": 0.00018510720827497035, | |
| "loss": 3.2838, | |
| "step": 64300 | |
| }, | |
| { | |
| "epoch": 6.926057474975783, | |
| "grad_norm": 0.7241749167442322, | |
| "learning_rate": 0.00018478396724490895, | |
| "loss": 3.2912, | |
| "step": 64350 | |
| }, | |
| { | |
| "epoch": 6.931439027015391, | |
| "grad_norm": 0.7778185606002808, | |
| "learning_rate": 0.00018446719103544875, | |
| "loss": 3.2872, | |
| "step": 64400 | |
| }, | |
| { | |
| "epoch": 6.9368205790549995, | |
| "grad_norm": 0.7526164650917053, | |
| "learning_rate": 0.00018414395000538732, | |
| "loss": 3.2891, | |
| "step": 64450 | |
| }, | |
| { | |
| "epoch": 6.942202131094608, | |
| "grad_norm": 0.764832079410553, | |
| "learning_rate": 0.00018382070897532591, | |
| "loss": 3.2921, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 6.947583683134216, | |
| "grad_norm": 0.7514047026634216, | |
| "learning_rate": 0.0001834974679452645, | |
| "loss": 3.2665, | |
| "step": 64550 | |
| }, | |
| { | |
| "epoch": 6.952965235173824, | |
| "grad_norm": 0.8263497948646545, | |
| "learning_rate": 0.00018317422691520308, | |
| "loss": 3.2744, | |
| "step": 64600 | |
| }, | |
| { | |
| "epoch": 6.958346787213433, | |
| "grad_norm": 0.7777371406555176, | |
| "learning_rate": 0.0001828509858851417, | |
| "loss": 3.308, | |
| "step": 64650 | |
| }, | |
| { | |
| "epoch": 6.9637283392530405, | |
| "grad_norm": 0.7499234080314636, | |
| "learning_rate": 0.00018252774485508027, | |
| "loss": 3.3022, | |
| "step": 64700 | |
| }, | |
| { | |
| "epoch": 6.969109891292649, | |
| "grad_norm": 0.7568396925926208, | |
| "learning_rate": 0.00018220450382501883, | |
| "loss": 3.2816, | |
| "step": 64750 | |
| }, | |
| { | |
| "epoch": 6.974491443332257, | |
| "grad_norm": 0.7589066028594971, | |
| "learning_rate": 0.00018188126279495743, | |
| "loss": 3.292, | |
| "step": 64800 | |
| }, | |
| { | |
| "epoch": 6.979872995371865, | |
| "grad_norm": 0.7187942862510681, | |
| "learning_rate": 0.00018155802176489602, | |
| "loss": 3.2778, | |
| "step": 64850 | |
| }, | |
| { | |
| "epoch": 6.985254547411474, | |
| "grad_norm": 0.7738951444625854, | |
| "learning_rate": 0.0001812347807348346, | |
| "loss": 3.2777, | |
| "step": 64900 | |
| }, | |
| { | |
| "epoch": 6.990636099451081, | |
| "grad_norm": 0.7671989798545837, | |
| "learning_rate": 0.00018091153970477319, | |
| "loss": 3.3004, | |
| "step": 64950 | |
| }, | |
| { | |
| "epoch": 6.99601765149069, | |
| "grad_norm": 0.7247815728187561, | |
| "learning_rate": 0.00018058829867471175, | |
| "loss": 3.2861, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 6.99601765149069, | |
| "eval_accuracy": 0.3873069088785816, | |
| "eval_loss": 3.353672742843628, | |
| "eval_runtime": 184.4733, | |
| "eval_samples_per_second": 97.635, | |
| "eval_steps_per_second": 6.104, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 7.0013992035302985, | |
| "grad_norm": 0.7607365846633911, | |
| "learning_rate": 0.00018026505764465035, | |
| "loss": 3.2605, | |
| "step": 65050 | |
| }, | |
| { | |
| "epoch": 7.006780755569906, | |
| "grad_norm": 0.7479583621025085, | |
| "learning_rate": 0.00017994181661458894, | |
| "loss": 3.1962, | |
| "step": 65100 | |
| }, | |
| { | |
| "epoch": 7.012162307609515, | |
| "grad_norm": 0.8074206113815308, | |
| "learning_rate": 0.0001796185755845275, | |
| "loss": 3.2052, | |
| "step": 65150 | |
| }, | |
| { | |
| "epoch": 7.017543859649122, | |
| "grad_norm": 0.7758211493492126, | |
| "learning_rate": 0.00017929533455446608, | |
| "loss": 3.1925, | |
| "step": 65200 | |
| }, | |
| { | |
| "epoch": 7.022925411688731, | |
| "grad_norm": 0.7739306092262268, | |
| "learning_rate": 0.0001789720935244047, | |
| "loss": 3.2098, | |
| "step": 65250 | |
| }, | |
| { | |
| "epoch": 7.0283069637283395, | |
| "grad_norm": 0.8387571573257446, | |
| "learning_rate": 0.00017864885249434327, | |
| "loss": 3.2035, | |
| "step": 65300 | |
| }, | |
| { | |
| "epoch": 7.033688515767947, | |
| "grad_norm": 0.7665441632270813, | |
| "learning_rate": 0.00017832561146428183, | |
| "loss": 3.2247, | |
| "step": 65350 | |
| }, | |
| { | |
| "epoch": 7.039070067807556, | |
| "grad_norm": 0.7938107848167419, | |
| "learning_rate": 0.00017800237043422046, | |
| "loss": 3.2211, | |
| "step": 65400 | |
| }, | |
| { | |
| "epoch": 7.044451619847164, | |
| "grad_norm": 0.7679534554481506, | |
| "learning_rate": 0.00017767912940415902, | |
| "loss": 3.19, | |
| "step": 65450 | |
| }, | |
| { | |
| "epoch": 7.049833171886772, | |
| "grad_norm": 0.7663267254829407, | |
| "learning_rate": 0.0001773558883740976, | |
| "loss": 3.217, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 7.0552147239263805, | |
| "grad_norm": 0.7496455311775208, | |
| "learning_rate": 0.00017703264734403619, | |
| "loss": 3.2282, | |
| "step": 65550 | |
| }, | |
| { | |
| "epoch": 7.060596275965988, | |
| "grad_norm": 0.7909696698188782, | |
| "learning_rate": 0.00017670940631397475, | |
| "loss": 3.205, | |
| "step": 65600 | |
| }, | |
| { | |
| "epoch": 7.065977828005597, | |
| "grad_norm": 0.8413503766059875, | |
| "learning_rate": 0.00017638616528391337, | |
| "loss": 3.2066, | |
| "step": 65650 | |
| }, | |
| { | |
| "epoch": 7.071359380045205, | |
| "grad_norm": 0.8082650303840637, | |
| "learning_rate": 0.00017606292425385194, | |
| "loss": 3.2173, | |
| "step": 65700 | |
| }, | |
| { | |
| "epoch": 7.076740932084813, | |
| "grad_norm": 0.7603335380554199, | |
| "learning_rate": 0.0001757396832237905, | |
| "loss": 3.1987, | |
| "step": 65750 | |
| }, | |
| { | |
| "epoch": 7.0821224841244215, | |
| "grad_norm": 0.8148004412651062, | |
| "learning_rate": 0.00017541644219372913, | |
| "loss": 3.2047, | |
| "step": 65800 | |
| }, | |
| { | |
| "epoch": 7.08750403616403, | |
| "grad_norm": 0.7317112684249878, | |
| "learning_rate": 0.0001750932011636677, | |
| "loss": 3.2157, | |
| "step": 65850 | |
| }, | |
| { | |
| "epoch": 7.092885588203638, | |
| "grad_norm": 0.7178577780723572, | |
| "learning_rate": 0.00017476996013360627, | |
| "loss": 3.212, | |
| "step": 65900 | |
| }, | |
| { | |
| "epoch": 7.098267140243246, | |
| "grad_norm": 0.762653648853302, | |
| "learning_rate": 0.00017444671910354486, | |
| "loss": 3.2236, | |
| "step": 65950 | |
| }, | |
| { | |
| "epoch": 7.103648692282855, | |
| "grad_norm": 0.804692804813385, | |
| "learning_rate": 0.00017412347807348346, | |
| "loss": 3.2228, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 7.103648692282855, | |
| "eval_accuracy": 0.3870136547343703, | |
| "eval_loss": 3.359973907470703, | |
| "eval_runtime": 184.5015, | |
| "eval_samples_per_second": 97.62, | |
| "eval_steps_per_second": 6.103, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 7.109030244322462, | |
| "grad_norm": 0.7459787130355835, | |
| "learning_rate": 0.00017380023704342202, | |
| "loss": 3.2159, | |
| "step": 66050 | |
| }, | |
| { | |
| "epoch": 7.114411796362071, | |
| "grad_norm": 0.8122513294219971, | |
| "learning_rate": 0.00017347699601336062, | |
| "loss": 3.2056, | |
| "step": 66100 | |
| }, | |
| { | |
| "epoch": 7.119793348401679, | |
| "grad_norm": 0.7792388796806335, | |
| "learning_rate": 0.00017315375498329919, | |
| "loss": 3.2337, | |
| "step": 66150 | |
| }, | |
| { | |
| "epoch": 7.125174900441287, | |
| "grad_norm": 0.7569959759712219, | |
| "learning_rate": 0.00017283051395323778, | |
| "loss": 3.2079, | |
| "step": 66200 | |
| }, | |
| { | |
| "epoch": 7.130556452480896, | |
| "grad_norm": 0.7915155291557312, | |
| "learning_rate": 0.00017250727292317638, | |
| "loss": 3.2235, | |
| "step": 66250 | |
| }, | |
| { | |
| "epoch": 7.135938004520503, | |
| "grad_norm": 0.7481163740158081, | |
| "learning_rate": 0.00017218403189311494, | |
| "loss": 3.2076, | |
| "step": 66300 | |
| }, | |
| { | |
| "epoch": 7.141319556560112, | |
| "grad_norm": 0.8158993721008301, | |
| "learning_rate": 0.0001718607908630535, | |
| "loss": 3.2174, | |
| "step": 66350 | |
| }, | |
| { | |
| "epoch": 7.1467011085997205, | |
| "grad_norm": 0.7769084572792053, | |
| "learning_rate": 0.00017153754983299213, | |
| "loss": 3.2079, | |
| "step": 66400 | |
| }, | |
| { | |
| "epoch": 7.152082660639328, | |
| "grad_norm": 0.776822030544281, | |
| "learning_rate": 0.0001712143088029307, | |
| "loss": 3.219, | |
| "step": 66450 | |
| }, | |
| { | |
| "epoch": 7.157464212678937, | |
| "grad_norm": 0.7852200865745544, | |
| "learning_rate": 0.00017089106777286927, | |
| "loss": 3.1996, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 7.162845764718545, | |
| "grad_norm": 0.8139584064483643, | |
| "learning_rate": 0.0001705678267428079, | |
| "loss": 3.2271, | |
| "step": 66550 | |
| }, | |
| { | |
| "epoch": 7.168227316758153, | |
| "grad_norm": 0.8104858994483948, | |
| "learning_rate": 0.00017024458571274646, | |
| "loss": 3.2187, | |
| "step": 66600 | |
| }, | |
| { | |
| "epoch": 7.1736088687977615, | |
| "grad_norm": 0.7869734168052673, | |
| "learning_rate": 0.00016992134468268505, | |
| "loss": 3.2282, | |
| "step": 66650 | |
| }, | |
| { | |
| "epoch": 7.178990420837369, | |
| "grad_norm": 0.780508816242218, | |
| "learning_rate": 0.00016959810365262362, | |
| "loss": 3.2097, | |
| "step": 66700 | |
| }, | |
| { | |
| "epoch": 7.184371972876978, | |
| "grad_norm": 0.784691572189331, | |
| "learning_rate": 0.0001692748626225622, | |
| "loss": 3.2214, | |
| "step": 66750 | |
| }, | |
| { | |
| "epoch": 7.189753524916586, | |
| "grad_norm": 0.7956085205078125, | |
| "learning_rate": 0.0001689516215925008, | |
| "loss": 3.2162, | |
| "step": 66800 | |
| }, | |
| { | |
| "epoch": 7.195135076956194, | |
| "grad_norm": 0.7606350183486938, | |
| "learning_rate": 0.00016862838056243938, | |
| "loss": 3.2187, | |
| "step": 66850 | |
| }, | |
| { | |
| "epoch": 7.2005166289958025, | |
| "grad_norm": 0.8778959512710571, | |
| "learning_rate": 0.00016830513953237794, | |
| "loss": 3.2145, | |
| "step": 66900 | |
| }, | |
| { | |
| "epoch": 7.205898181035411, | |
| "grad_norm": 0.7992659211158752, | |
| "learning_rate": 0.00016798189850231657, | |
| "loss": 3.2176, | |
| "step": 66950 | |
| }, | |
| { | |
| "epoch": 7.211279733075019, | |
| "grad_norm": 0.7710304260253906, | |
| "learning_rate": 0.00016765865747225513, | |
| "loss": 3.213, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 7.211279733075019, | |
| "eval_accuracy": 0.3876768719615899, | |
| "eval_loss": 3.356171131134033, | |
| "eval_runtime": 184.2825, | |
| "eval_samples_per_second": 97.736, | |
| "eval_steps_per_second": 6.11, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 7.216661285114627, | |
| "grad_norm": 0.834844172000885, | |
| "learning_rate": 0.0001673354164421937, | |
| "loss": 3.2286, | |
| "step": 67050 | |
| }, | |
| { | |
| "epoch": 7.222042837154235, | |
| "grad_norm": 0.7779362201690674, | |
| "learning_rate": 0.00016701217541213232, | |
| "loss": 3.2254, | |
| "step": 67100 | |
| }, | |
| { | |
| "epoch": 7.2274243891938434, | |
| "grad_norm": 0.7590049505233765, | |
| "learning_rate": 0.0001666889343820709, | |
| "loss": 3.2229, | |
| "step": 67150 | |
| }, | |
| { | |
| "epoch": 7.232805941233452, | |
| "grad_norm": 0.7776985168457031, | |
| "learning_rate": 0.00016636569335200946, | |
| "loss": 3.2246, | |
| "step": 67200 | |
| }, | |
| { | |
| "epoch": 7.23818749327306, | |
| "grad_norm": 0.7713861465454102, | |
| "learning_rate": 0.00016604245232194805, | |
| "loss": 3.2196, | |
| "step": 67250 | |
| }, | |
| { | |
| "epoch": 7.243569045312668, | |
| "grad_norm": 0.8110638856887817, | |
| "learning_rate": 0.00016571921129188665, | |
| "loss": 3.2214, | |
| "step": 67300 | |
| }, | |
| { | |
| "epoch": 7.248950597352277, | |
| "grad_norm": 0.8662999868392944, | |
| "learning_rate": 0.00016539597026182521, | |
| "loss": 3.2173, | |
| "step": 67350 | |
| }, | |
| { | |
| "epoch": 7.254332149391884, | |
| "grad_norm": 0.8214209675788879, | |
| "learning_rate": 0.0001650727292317638, | |
| "loss": 3.2314, | |
| "step": 67400 | |
| }, | |
| { | |
| "epoch": 7.259713701431493, | |
| "grad_norm": 0.7896443009376526, | |
| "learning_rate": 0.00016474948820170238, | |
| "loss": 3.2362, | |
| "step": 67450 | |
| }, | |
| { | |
| "epoch": 7.265095253471101, | |
| "grad_norm": 0.7559022903442383, | |
| "learning_rate": 0.00016442624717164094, | |
| "loss": 3.2217, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 7.270476805510709, | |
| "grad_norm": 0.8077674508094788, | |
| "learning_rate": 0.00016410300614157957, | |
| "loss": 3.2308, | |
| "step": 67550 | |
| }, | |
| { | |
| "epoch": 7.275858357550318, | |
| "grad_norm": 0.7656305432319641, | |
| "learning_rate": 0.00016377976511151813, | |
| "loss": 3.225, | |
| "step": 67600 | |
| }, | |
| { | |
| "epoch": 7.281239909589925, | |
| "grad_norm": 0.7853976488113403, | |
| "learning_rate": 0.00016345652408145675, | |
| "loss": 3.2388, | |
| "step": 67650 | |
| }, | |
| { | |
| "epoch": 7.286621461629534, | |
| "grad_norm": 0.7327544689178467, | |
| "learning_rate": 0.00016313328305139532, | |
| "loss": 3.243, | |
| "step": 67700 | |
| }, | |
| { | |
| "epoch": 7.2920030136691425, | |
| "grad_norm": 0.8078761696815491, | |
| "learning_rate": 0.0001628100420213339, | |
| "loss": 3.2254, | |
| "step": 67750 | |
| }, | |
| { | |
| "epoch": 7.29738456570875, | |
| "grad_norm": 0.8278520107269287, | |
| "learning_rate": 0.00016248680099127248, | |
| "loss": 3.223, | |
| "step": 67800 | |
| }, | |
| { | |
| "epoch": 7.302766117748359, | |
| "grad_norm": 0.8349838256835938, | |
| "learning_rate": 0.00016216355996121105, | |
| "loss": 3.2515, | |
| "step": 67850 | |
| }, | |
| { | |
| "epoch": 7.308147669787967, | |
| "grad_norm": 0.800133228302002, | |
| "learning_rate": 0.00016184031893114965, | |
| "loss": 3.2447, | |
| "step": 67900 | |
| }, | |
| { | |
| "epoch": 7.313529221827575, | |
| "grad_norm": 0.7627754807472229, | |
| "learning_rate": 0.00016151707790108824, | |
| "loss": 3.2176, | |
| "step": 67950 | |
| }, | |
| { | |
| "epoch": 7.3189107738671835, | |
| "grad_norm": 0.8355843424797058, | |
| "learning_rate": 0.0001611938368710268, | |
| "loss": 3.2305, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 7.3189107738671835, | |
| "eval_accuracy": 0.38772902534774084, | |
| "eval_loss": 3.355006694793701, | |
| "eval_runtime": 184.4328, | |
| "eval_samples_per_second": 97.656, | |
| "eval_steps_per_second": 6.105, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 7.324292325906791, | |
| "grad_norm": 0.7886375784873962, | |
| "learning_rate": 0.00016087059584096538, | |
| "loss": 3.2153, | |
| "step": 68050 | |
| }, | |
| { | |
| "epoch": 7.3296738779464, | |
| "grad_norm": 0.8135440349578857, | |
| "learning_rate": 0.000160547354810904, | |
| "loss": 3.2493, | |
| "step": 68100 | |
| }, | |
| { | |
| "epoch": 7.335055429986008, | |
| "grad_norm": 0.7818495035171509, | |
| "learning_rate": 0.00016022411378084257, | |
| "loss": 3.2047, | |
| "step": 68150 | |
| }, | |
| { | |
| "epoch": 7.340436982025616, | |
| "grad_norm": 0.7987973690032959, | |
| "learning_rate": 0.00015990087275078113, | |
| "loss": 3.2378, | |
| "step": 68200 | |
| }, | |
| { | |
| "epoch": 7.3458185340652244, | |
| "grad_norm": 0.7804546356201172, | |
| "learning_rate": 0.00015957763172071976, | |
| "loss": 3.2426, | |
| "step": 68250 | |
| }, | |
| { | |
| "epoch": 7.351200086104833, | |
| "grad_norm": 0.820387601852417, | |
| "learning_rate": 0.00015925439069065832, | |
| "loss": 3.2207, | |
| "step": 68300 | |
| }, | |
| { | |
| "epoch": 7.356581638144441, | |
| "grad_norm": 0.8470367789268494, | |
| "learning_rate": 0.0001589311496605969, | |
| "loss": 3.2296, | |
| "step": 68350 | |
| }, | |
| { | |
| "epoch": 7.361963190184049, | |
| "grad_norm": 0.789986789226532, | |
| "learning_rate": 0.00015860790863053548, | |
| "loss": 3.2171, | |
| "step": 68400 | |
| }, | |
| { | |
| "epoch": 7.367344742223658, | |
| "grad_norm": 0.7953157424926758, | |
| "learning_rate": 0.00015829113242107532, | |
| "loss": 3.2279, | |
| "step": 68450 | |
| }, | |
| { | |
| "epoch": 7.372726294263265, | |
| "grad_norm": 0.7469373941421509, | |
| "learning_rate": 0.00015796789139101389, | |
| "loss": 3.2222, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 7.378107846302874, | |
| "grad_norm": 0.8377979397773743, | |
| "learning_rate": 0.00015764465036095245, | |
| "loss": 3.228, | |
| "step": 68550 | |
| }, | |
| { | |
| "epoch": 7.383489398342482, | |
| "grad_norm": 0.7953019738197327, | |
| "learning_rate": 0.00015732140933089108, | |
| "loss": 3.2008, | |
| "step": 68600 | |
| }, | |
| { | |
| "epoch": 7.38887095038209, | |
| "grad_norm": 0.8219582438468933, | |
| "learning_rate": 0.00015699816830082964, | |
| "loss": 3.2191, | |
| "step": 68650 | |
| }, | |
| { | |
| "epoch": 7.394252502421699, | |
| "grad_norm": 0.8549178838729858, | |
| "learning_rate": 0.0001566749272707682, | |
| "loss": 3.2092, | |
| "step": 68700 | |
| }, | |
| { | |
| "epoch": 7.399634054461306, | |
| "grad_norm": 0.7702128887176514, | |
| "learning_rate": 0.00015635168624070683, | |
| "loss": 3.22, | |
| "step": 68750 | |
| }, | |
| { | |
| "epoch": 7.405015606500915, | |
| "grad_norm": 0.843408465385437, | |
| "learning_rate": 0.0001560284452106454, | |
| "loss": 3.2458, | |
| "step": 68800 | |
| }, | |
| { | |
| "epoch": 7.4103971585405235, | |
| "grad_norm": 0.8221510052680969, | |
| "learning_rate": 0.00015570520418058397, | |
| "loss": 3.2327, | |
| "step": 68850 | |
| }, | |
| { | |
| "epoch": 7.415778710580131, | |
| "grad_norm": 0.8469644784927368, | |
| "learning_rate": 0.00015538196315052256, | |
| "loss": 3.2238, | |
| "step": 68900 | |
| }, | |
| { | |
| "epoch": 7.42116026261974, | |
| "grad_norm": 0.7789564728736877, | |
| "learning_rate": 0.00015505872212046113, | |
| "loss": 3.2411, | |
| "step": 68950 | |
| }, | |
| { | |
| "epoch": 7.426541814659347, | |
| "grad_norm": 0.8157601356506348, | |
| "learning_rate": 0.00015473548109039972, | |
| "loss": 3.2288, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 7.426541814659347, | |
| "eval_accuracy": 0.38810018361251514, | |
| "eval_loss": 3.349553346633911, | |
| "eval_runtime": 184.6865, | |
| "eval_samples_per_second": 97.522, | |
| "eval_steps_per_second": 6.097, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 7.431923366698956, | |
| "grad_norm": 0.8120962381362915, | |
| "learning_rate": 0.00015441224006033832, | |
| "loss": 3.2178, | |
| "step": 69050 | |
| }, | |
| { | |
| "epoch": 7.4373049187385645, | |
| "grad_norm": 0.7830744981765747, | |
| "learning_rate": 0.00015408899903027689, | |
| "loss": 3.218, | |
| "step": 69100 | |
| }, | |
| { | |
| "epoch": 7.442686470778172, | |
| "grad_norm": 0.8280989527702332, | |
| "learning_rate": 0.00015376575800021545, | |
| "loss": 3.2391, | |
| "step": 69150 | |
| }, | |
| { | |
| "epoch": 7.448068022817781, | |
| "grad_norm": 0.8267955780029297, | |
| "learning_rate": 0.00015344251697015408, | |
| "loss": 3.2389, | |
| "step": 69200 | |
| }, | |
| { | |
| "epoch": 7.453449574857389, | |
| "grad_norm": 0.8615748882293701, | |
| "learning_rate": 0.00015311927594009264, | |
| "loss": 3.2351, | |
| "step": 69250 | |
| }, | |
| { | |
| "epoch": 7.458831126896997, | |
| "grad_norm": 0.7550056576728821, | |
| "learning_rate": 0.0001527960349100312, | |
| "loss": 3.2423, | |
| "step": 69300 | |
| }, | |
| { | |
| "epoch": 7.4642126789366054, | |
| "grad_norm": 0.8129693865776062, | |
| "learning_rate": 0.00015247279387996983, | |
| "loss": 3.2548, | |
| "step": 69350 | |
| }, | |
| { | |
| "epoch": 7.469594230976213, | |
| "grad_norm": 0.7872095108032227, | |
| "learning_rate": 0.0001521495528499084, | |
| "loss": 3.2227, | |
| "step": 69400 | |
| }, | |
| { | |
| "epoch": 7.474975783015822, | |
| "grad_norm": 0.7631450891494751, | |
| "learning_rate": 0.000151826311819847, | |
| "loss": 3.2189, | |
| "step": 69450 | |
| }, | |
| { | |
| "epoch": 7.48035733505543, | |
| "grad_norm": 0.7999120950698853, | |
| "learning_rate": 0.00015150307078978556, | |
| "loss": 3.2383, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 7.485738887095038, | |
| "grad_norm": 0.8011081218719482, | |
| "learning_rate": 0.00015117982975972416, | |
| "loss": 3.2253, | |
| "step": 69550 | |
| }, | |
| { | |
| "epoch": 7.491120439134646, | |
| "grad_norm": 0.8270715475082397, | |
| "learning_rate": 0.00015085658872966275, | |
| "loss": 3.2216, | |
| "step": 69600 | |
| }, | |
| { | |
| "epoch": 7.496501991174255, | |
| "grad_norm": 0.8640902638435364, | |
| "learning_rate": 0.00015053334769960132, | |
| "loss": 3.2372, | |
| "step": 69650 | |
| }, | |
| { | |
| "epoch": 7.501883543213863, | |
| "grad_norm": 0.8011231422424316, | |
| "learning_rate": 0.0001502101066695399, | |
| "loss": 3.2338, | |
| "step": 69700 | |
| }, | |
| { | |
| "epoch": 7.507265095253471, | |
| "grad_norm": 0.8384730219841003, | |
| "learning_rate": 0.00014989333046007972, | |
| "loss": 3.2315, | |
| "step": 69750 | |
| }, | |
| { | |
| "epoch": 7.51264664729308, | |
| "grad_norm": 0.8214177489280701, | |
| "learning_rate": 0.00014957008943001832, | |
| "loss": 3.2196, | |
| "step": 69800 | |
| }, | |
| { | |
| "epoch": 7.518028199332687, | |
| "grad_norm": 0.8249951601028442, | |
| "learning_rate": 0.00014924684839995688, | |
| "loss": 3.2374, | |
| "step": 69850 | |
| }, | |
| { | |
| "epoch": 7.523409751372296, | |
| "grad_norm": 0.8308032155036926, | |
| "learning_rate": 0.00014892360736989548, | |
| "loss": 3.2311, | |
| "step": 69900 | |
| }, | |
| { | |
| "epoch": 7.528791303411904, | |
| "grad_norm": 0.8367502093315125, | |
| "learning_rate": 0.00014860036633983407, | |
| "loss": 3.2348, | |
| "step": 69950 | |
| }, | |
| { | |
| "epoch": 7.534172855451512, | |
| "grad_norm": 0.807752788066864, | |
| "learning_rate": 0.00014827712530977264, | |
| "loss": 3.2313, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 7.534172855451512, | |
| "eval_accuracy": 0.3886321481512548, | |
| "eval_loss": 3.346137762069702, | |
| "eval_runtime": 184.2366, | |
| "eval_samples_per_second": 97.76, | |
| "eval_steps_per_second": 6.112, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 7.539554407491121, | |
| "grad_norm": 0.8112996816635132, | |
| "learning_rate": 0.00014795388427971123, | |
| "loss": 3.2335, | |
| "step": 70050 | |
| }, | |
| { | |
| "epoch": 7.544935959530728, | |
| "grad_norm": 0.7886261940002441, | |
| "learning_rate": 0.0001476306432496498, | |
| "loss": 3.2534, | |
| "step": 70100 | |
| }, | |
| { | |
| "epoch": 7.550317511570337, | |
| "grad_norm": 0.8170931935310364, | |
| "learning_rate": 0.0001473074022195884, | |
| "loss": 3.2409, | |
| "step": 70150 | |
| }, | |
| { | |
| "epoch": 7.5556990636099455, | |
| "grad_norm": 0.8506706953048706, | |
| "learning_rate": 0.00014698416118952696, | |
| "loss": 3.212, | |
| "step": 70200 | |
| }, | |
| { | |
| "epoch": 7.561080615649553, | |
| "grad_norm": 0.894822359085083, | |
| "learning_rate": 0.00014666092015946556, | |
| "loss": 3.2461, | |
| "step": 70250 | |
| }, | |
| { | |
| "epoch": 7.566462167689162, | |
| "grad_norm": 0.8242021799087524, | |
| "learning_rate": 0.00014633767912940415, | |
| "loss": 3.2416, | |
| "step": 70300 | |
| }, | |
| { | |
| "epoch": 7.57184371972877, | |
| "grad_norm": 0.7750385403633118, | |
| "learning_rate": 0.00014601443809934272, | |
| "loss": 3.2266, | |
| "step": 70350 | |
| }, | |
| { | |
| "epoch": 7.577225271768378, | |
| "grad_norm": 0.8312146663665771, | |
| "learning_rate": 0.00014569119706928132, | |
| "loss": 3.2444, | |
| "step": 70400 | |
| }, | |
| { | |
| "epoch": 7.5826068238079865, | |
| "grad_norm": 0.7865069508552551, | |
| "learning_rate": 0.0001453679560392199, | |
| "loss": 3.2235, | |
| "step": 70450 | |
| }, | |
| { | |
| "epoch": 7.587988375847594, | |
| "grad_norm": 0.8098903298377991, | |
| "learning_rate": 0.00014504471500915848, | |
| "loss": 3.2274, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 7.593369927887203, | |
| "grad_norm": 0.8226689696311951, | |
| "learning_rate": 0.00014472147397909707, | |
| "loss": 3.2269, | |
| "step": 70550 | |
| }, | |
| { | |
| "epoch": 7.598751479926811, | |
| "grad_norm": 0.8115870356559753, | |
| "learning_rate": 0.00014439823294903564, | |
| "loss": 3.2301, | |
| "step": 70600 | |
| }, | |
| { | |
| "epoch": 7.604133031966419, | |
| "grad_norm": 0.8107540607452393, | |
| "learning_rate": 0.00014407499191897423, | |
| "loss": 3.2209, | |
| "step": 70650 | |
| }, | |
| { | |
| "epoch": 7.609514584006027, | |
| "grad_norm": 0.816052258014679, | |
| "learning_rate": 0.0001437517508889128, | |
| "loss": 3.2312, | |
| "step": 70700 | |
| }, | |
| { | |
| "epoch": 7.614896136045635, | |
| "grad_norm": 0.7707471251487732, | |
| "learning_rate": 0.0001434285098588514, | |
| "loss": 3.2305, | |
| "step": 70750 | |
| }, | |
| { | |
| "epoch": 7.620277688085244, | |
| "grad_norm": 0.793674647808075, | |
| "learning_rate": 0.00014310526882879, | |
| "loss": 3.2473, | |
| "step": 70800 | |
| }, | |
| { | |
| "epoch": 7.625659240124852, | |
| "grad_norm": 0.9196163415908813, | |
| "learning_rate": 0.00014278202779872856, | |
| "loss": 3.2371, | |
| "step": 70850 | |
| }, | |
| { | |
| "epoch": 7.63104079216446, | |
| "grad_norm": 0.7878997921943665, | |
| "learning_rate": 0.00014245878676866715, | |
| "loss": 3.2358, | |
| "step": 70900 | |
| }, | |
| { | |
| "epoch": 7.636422344204068, | |
| "grad_norm": 0.843461811542511, | |
| "learning_rate": 0.00014213554573860575, | |
| "loss": 3.2483, | |
| "step": 70950 | |
| }, | |
| { | |
| "epoch": 7.641803896243677, | |
| "grad_norm": 0.8309678435325623, | |
| "learning_rate": 0.00014181230470854434, | |
| "loss": 3.2305, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 7.641803896243677, | |
| "eval_accuracy": 0.3890313388610852, | |
| "eval_loss": 3.3432910442352295, | |
| "eval_runtime": 184.235, | |
| "eval_samples_per_second": 97.761, | |
| "eval_steps_per_second": 6.112, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 7.647185448283285, | |
| "grad_norm": 0.7841317057609558, | |
| "learning_rate": 0.0001414890636784829, | |
| "loss": 3.2462, | |
| "step": 71050 | |
| }, | |
| { | |
| "epoch": 7.652567000322893, | |
| "grad_norm": 0.7946882247924805, | |
| "learning_rate": 0.0001411658226484215, | |
| "loss": 3.2419, | |
| "step": 71100 | |
| }, | |
| { | |
| "epoch": 7.657948552362502, | |
| "grad_norm": 0.8070786595344543, | |
| "learning_rate": 0.00014084258161836007, | |
| "loss": 3.2466, | |
| "step": 71150 | |
| }, | |
| { | |
| "epoch": 7.663330104402109, | |
| "grad_norm": 0.8690606951713562, | |
| "learning_rate": 0.00014051934058829867, | |
| "loss": 3.2453, | |
| "step": 71200 | |
| }, | |
| { | |
| "epoch": 7.668711656441718, | |
| "grad_norm": 0.8302962779998779, | |
| "learning_rate": 0.00014019609955823723, | |
| "loss": 3.2364, | |
| "step": 71250 | |
| }, | |
| { | |
| "epoch": 7.674093208481326, | |
| "grad_norm": 0.8969767093658447, | |
| "learning_rate": 0.00013987285852817583, | |
| "loss": 3.2421, | |
| "step": 71300 | |
| }, | |
| { | |
| "epoch": 7.679474760520934, | |
| "grad_norm": 0.789718747138977, | |
| "learning_rate": 0.0001395496174981144, | |
| "loss": 3.2396, | |
| "step": 71350 | |
| }, | |
| { | |
| "epoch": 7.684856312560543, | |
| "grad_norm": 0.8139830231666565, | |
| "learning_rate": 0.000139226376468053, | |
| "loss": 3.2435, | |
| "step": 71400 | |
| }, | |
| { | |
| "epoch": 7.69023786460015, | |
| "grad_norm": 0.8262346386909485, | |
| "learning_rate": 0.00013890313543799159, | |
| "loss": 3.2516, | |
| "step": 71450 | |
| }, | |
| { | |
| "epoch": 7.695619416639759, | |
| "grad_norm": 0.7663363218307495, | |
| "learning_rate": 0.00013857989440793018, | |
| "loss": 3.2447, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 7.7010009686793675, | |
| "grad_norm": 0.9069011807441711, | |
| "learning_rate": 0.00013825665337786875, | |
| "loss": 3.2388, | |
| "step": 71550 | |
| }, | |
| { | |
| "epoch": 7.706382520718975, | |
| "grad_norm": 0.8168088793754578, | |
| "learning_rate": 0.00013793341234780734, | |
| "loss": 3.2439, | |
| "step": 71600 | |
| }, | |
| { | |
| "epoch": 7.711764072758584, | |
| "grad_norm": 0.8117689490318298, | |
| "learning_rate": 0.00013761017131774594, | |
| "loss": 3.2352, | |
| "step": 71650 | |
| }, | |
| { | |
| "epoch": 7.717145624798192, | |
| "grad_norm": 0.7903016209602356, | |
| "learning_rate": 0.0001372869302876845, | |
| "loss": 3.2315, | |
| "step": 71700 | |
| }, | |
| { | |
| "epoch": 7.7225271768378, | |
| "grad_norm": 0.7806147336959839, | |
| "learning_rate": 0.0001369636892576231, | |
| "loss": 3.2229, | |
| "step": 71750 | |
| }, | |
| { | |
| "epoch": 7.727908728877408, | |
| "grad_norm": 0.8229786157608032, | |
| "learning_rate": 0.00013664044822756167, | |
| "loss": 3.225, | |
| "step": 71800 | |
| }, | |
| { | |
| "epoch": 7.733290280917016, | |
| "grad_norm": 0.7852742671966553, | |
| "learning_rate": 0.00013631720719750026, | |
| "loss": 3.2338, | |
| "step": 71850 | |
| }, | |
| { | |
| "epoch": 7.738671832956625, | |
| "grad_norm": 0.8520014882087708, | |
| "learning_rate": 0.00013599396616743883, | |
| "loss": 3.2297, | |
| "step": 71900 | |
| }, | |
| { | |
| "epoch": 7.744053384996233, | |
| "grad_norm": 0.8504642248153687, | |
| "learning_rate": 0.00013567072513737742, | |
| "loss": 3.2343, | |
| "step": 71950 | |
| }, | |
| { | |
| "epoch": 7.749434937035841, | |
| "grad_norm": 0.8431692719459534, | |
| "learning_rate": 0.00013534748410731602, | |
| "loss": 3.2307, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 7.749434937035841, | |
| "eval_accuracy": 0.3895777542339042, | |
| "eval_loss": 3.3391544818878174, | |
| "eval_runtime": 184.6027, | |
| "eval_samples_per_second": 97.566, | |
| "eval_steps_per_second": 6.1, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 7.754816489075449, | |
| "grad_norm": 0.8284783363342285, | |
| "learning_rate": 0.0001350242430772546, | |
| "loss": 3.2289, | |
| "step": 72050 | |
| }, | |
| { | |
| "epoch": 7.760198041115058, | |
| "grad_norm": 0.87309730052948, | |
| "learning_rate": 0.00013470100204719318, | |
| "loss": 3.2234, | |
| "step": 72100 | |
| }, | |
| { | |
| "epoch": 7.765579593154666, | |
| "grad_norm": 0.8297306299209595, | |
| "learning_rate": 0.00013437776101713178, | |
| "loss": 3.2388, | |
| "step": 72150 | |
| }, | |
| { | |
| "epoch": 7.770961145194274, | |
| "grad_norm": 0.8693126440048218, | |
| "learning_rate": 0.00013405451998707034, | |
| "loss": 3.2477, | |
| "step": 72200 | |
| }, | |
| { | |
| "epoch": 7.776342697233883, | |
| "grad_norm": 0.7984379529953003, | |
| "learning_rate": 0.00013373127895700894, | |
| "loss": 3.2352, | |
| "step": 72250 | |
| }, | |
| { | |
| "epoch": 7.78172424927349, | |
| "grad_norm": 0.7747134566307068, | |
| "learning_rate": 0.00013340803792694753, | |
| "loss": 3.2416, | |
| "step": 72300 | |
| }, | |
| { | |
| "epoch": 7.787105801313099, | |
| "grad_norm": 0.8343895673751831, | |
| "learning_rate": 0.0001330847968968861, | |
| "loss": 3.2254, | |
| "step": 72350 | |
| }, | |
| { | |
| "epoch": 7.792487353352707, | |
| "grad_norm": 0.8807649612426758, | |
| "learning_rate": 0.00013276155586682467, | |
| "loss": 3.2497, | |
| "step": 72400 | |
| }, | |
| { | |
| "epoch": 7.797868905392315, | |
| "grad_norm": 0.8675557374954224, | |
| "learning_rate": 0.00013243831483676326, | |
| "loss": 3.2235, | |
| "step": 72450 | |
| }, | |
| { | |
| "epoch": 7.803250457431924, | |
| "grad_norm": 0.8026169538497925, | |
| "learning_rate": 0.00013211507380670186, | |
| "loss": 3.2313, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 7.808632009471531, | |
| "grad_norm": 0.7819876670837402, | |
| "learning_rate": 0.00013179183277664042, | |
| "loss": 3.2311, | |
| "step": 72550 | |
| }, | |
| { | |
| "epoch": 7.81401356151114, | |
| "grad_norm": 0.8406408429145813, | |
| "learning_rate": 0.00013146859174657902, | |
| "loss": 3.2473, | |
| "step": 72600 | |
| }, | |
| { | |
| "epoch": 7.819395113550748, | |
| "grad_norm": 0.803548276424408, | |
| "learning_rate": 0.00013114535071651761, | |
| "loss": 3.2152, | |
| "step": 72650 | |
| }, | |
| { | |
| "epoch": 7.824776665590356, | |
| "grad_norm": 0.8550993800163269, | |
| "learning_rate": 0.00013082210968645618, | |
| "loss": 3.2351, | |
| "step": 72700 | |
| }, | |
| { | |
| "epoch": 7.830158217629965, | |
| "grad_norm": 0.8060100078582764, | |
| "learning_rate": 0.00013049886865639478, | |
| "loss": 3.2257, | |
| "step": 72750 | |
| }, | |
| { | |
| "epoch": 7.835539769669572, | |
| "grad_norm": 0.8339954614639282, | |
| "learning_rate": 0.00013017562762633337, | |
| "loss": 3.2484, | |
| "step": 72800 | |
| }, | |
| { | |
| "epoch": 7.840921321709181, | |
| "grad_norm": 0.8514379858970642, | |
| "learning_rate": 0.00012985238659627194, | |
| "loss": 3.2455, | |
| "step": 72850 | |
| }, | |
| { | |
| "epoch": 7.846302873748789, | |
| "grad_norm": 0.851406991481781, | |
| "learning_rate": 0.00012952914556621053, | |
| "loss": 3.2255, | |
| "step": 72900 | |
| }, | |
| { | |
| "epoch": 7.851684425788397, | |
| "grad_norm": 0.816084086894989, | |
| "learning_rate": 0.0001292059045361491, | |
| "loss": 3.239, | |
| "step": 72950 | |
| }, | |
| { | |
| "epoch": 7.857065977828006, | |
| "grad_norm": 0.89252769947052, | |
| "learning_rate": 0.0001288826635060877, | |
| "loss": 3.2462, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 7.857065977828006, | |
| "eval_accuracy": 0.38954309396269143, | |
| "eval_loss": 3.3360230922698975, | |
| "eval_runtime": 184.1802, | |
| "eval_samples_per_second": 97.79, | |
| "eval_steps_per_second": 6.114, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 7.862447529867614, | |
| "grad_norm": 0.8219316601753235, | |
| "learning_rate": 0.00012855942247602626, | |
| "loss": 3.2336, | |
| "step": 73050 | |
| }, | |
| { | |
| "epoch": 7.867829081907222, | |
| "grad_norm": 0.866523802280426, | |
| "learning_rate": 0.00012823618144596486, | |
| "loss": 3.245, | |
| "step": 73100 | |
| }, | |
| { | |
| "epoch": 7.87321063394683, | |
| "grad_norm": 0.8043298125267029, | |
| "learning_rate": 0.00012791294041590345, | |
| "loss": 3.2226, | |
| "step": 73150 | |
| }, | |
| { | |
| "epoch": 7.878592185986438, | |
| "grad_norm": 0.8249024152755737, | |
| "learning_rate": 0.00012758969938584202, | |
| "loss": 3.2326, | |
| "step": 73200 | |
| }, | |
| { | |
| "epoch": 7.883973738026047, | |
| "grad_norm": 0.8488306403160095, | |
| "learning_rate": 0.00012726645835578061, | |
| "loss": 3.2453, | |
| "step": 73250 | |
| }, | |
| { | |
| "epoch": 7.889355290065655, | |
| "grad_norm": 0.8137925267219543, | |
| "learning_rate": 0.0001269432173257192, | |
| "loss": 3.2201, | |
| "step": 73300 | |
| }, | |
| { | |
| "epoch": 7.894736842105263, | |
| "grad_norm": 0.8148021101951599, | |
| "learning_rate": 0.00012661997629565778, | |
| "loss": 3.2198, | |
| "step": 73350 | |
| }, | |
| { | |
| "epoch": 7.900118394144871, | |
| "grad_norm": 0.8085612654685974, | |
| "learning_rate": 0.00012629673526559637, | |
| "loss": 3.2318, | |
| "step": 73400 | |
| }, | |
| { | |
| "epoch": 7.90549994618448, | |
| "grad_norm": 0.809008002281189, | |
| "learning_rate": 0.00012597349423553497, | |
| "loss": 3.2332, | |
| "step": 73450 | |
| }, | |
| { | |
| "epoch": 7.910881498224088, | |
| "grad_norm": 0.8529590964317322, | |
| "learning_rate": 0.00012565025320547353, | |
| "loss": 3.2444, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 7.916263050263696, | |
| "grad_norm": 0.8345469832420349, | |
| "learning_rate": 0.00012532701217541213, | |
| "loss": 3.2307, | |
| "step": 73550 | |
| }, | |
| { | |
| "epoch": 7.921644602303305, | |
| "grad_norm": 0.7679382562637329, | |
| "learning_rate": 0.0001250037711453507, | |
| "loss": 3.2446, | |
| "step": 73600 | |
| }, | |
| { | |
| "epoch": 7.927026154342912, | |
| "grad_norm": 0.8335257172584534, | |
| "learning_rate": 0.0001246805301152893, | |
| "loss": 3.2245, | |
| "step": 73650 | |
| }, | |
| { | |
| "epoch": 7.932407706382521, | |
| "grad_norm": 0.8356488943099976, | |
| "learning_rate": 0.00012435728908522786, | |
| "loss": 3.2501, | |
| "step": 73700 | |
| }, | |
| { | |
| "epoch": 7.937789258422129, | |
| "grad_norm": 0.8207045793533325, | |
| "learning_rate": 0.00012403404805516645, | |
| "loss": 3.2376, | |
| "step": 73750 | |
| }, | |
| { | |
| "epoch": 7.943170810461737, | |
| "grad_norm": 0.8445755839347839, | |
| "learning_rate": 0.00012371727184570629, | |
| "loss": 3.2414, | |
| "step": 73800 | |
| }, | |
| { | |
| "epoch": 7.948552362501346, | |
| "grad_norm": 0.7874829769134521, | |
| "learning_rate": 0.0001234004956362461, | |
| "loss": 3.2445, | |
| "step": 73850 | |
| }, | |
| { | |
| "epoch": 7.953933914540953, | |
| "grad_norm": 0.7683655619621277, | |
| "learning_rate": 0.00012307725460618466, | |
| "loss": 3.2392, | |
| "step": 73900 | |
| }, | |
| { | |
| "epoch": 7.959315466580562, | |
| "grad_norm": 0.8490731120109558, | |
| "learning_rate": 0.00012275401357612326, | |
| "loss": 3.2255, | |
| "step": 73950 | |
| }, | |
| { | |
| "epoch": 7.96469701862017, | |
| "grad_norm": 0.8062489032745361, | |
| "learning_rate": 0.00012243077254606182, | |
| "loss": 3.2259, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 7.96469701862017, | |
| "eval_accuracy": 0.3901544837624235, | |
| "eval_loss": 3.331286668777466, | |
| "eval_runtime": 184.3879, | |
| "eval_samples_per_second": 97.68, | |
| "eval_steps_per_second": 6.107, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 7.970078570659778, | |
| "grad_norm": 0.7976067662239075, | |
| "learning_rate": 0.00012210753151600042, | |
| "loss": 3.2255, | |
| "step": 74050 | |
| }, | |
| { | |
| "epoch": 7.975460122699387, | |
| "grad_norm": 0.8020706176757812, | |
| "learning_rate": 0.00012178429048593901, | |
| "loss": 3.2332, | |
| "step": 74100 | |
| }, | |
| { | |
| "epoch": 7.980841674738995, | |
| "grad_norm": 0.7868686318397522, | |
| "learning_rate": 0.00012146104945587758, | |
| "loss": 3.2325, | |
| "step": 74150 | |
| }, | |
| { | |
| "epoch": 7.986223226778603, | |
| "grad_norm": 0.8337668776512146, | |
| "learning_rate": 0.00012113780842581617, | |
| "loss": 3.2419, | |
| "step": 74200 | |
| }, | |
| { | |
| "epoch": 7.991604778818211, | |
| "grad_norm": 0.8602197766304016, | |
| "learning_rate": 0.00012081456739575477, | |
| "loss": 3.2313, | |
| "step": 74250 | |
| }, | |
| { | |
| "epoch": 7.996986330857819, | |
| "grad_norm": 0.8567992448806763, | |
| "learning_rate": 0.00012049132636569334, | |
| "loss": 3.2328, | |
| "step": 74300 | |
| }, | |
| { | |
| "epoch": 8.002367882897428, | |
| "grad_norm": 0.8783753514289856, | |
| "learning_rate": 0.00012016808533563193, | |
| "loss": 3.2019, | |
| "step": 74350 | |
| }, | |
| { | |
| "epoch": 8.007749434937036, | |
| "grad_norm": 0.8364815711975098, | |
| "learning_rate": 0.00011984484430557051, | |
| "loss": 3.1519, | |
| "step": 74400 | |
| }, | |
| { | |
| "epoch": 8.013130986976645, | |
| "grad_norm": 0.8052390813827515, | |
| "learning_rate": 0.0001195216032755091, | |
| "loss": 3.1529, | |
| "step": 74450 | |
| }, | |
| { | |
| "epoch": 8.018512539016251, | |
| "grad_norm": 0.8049619793891907, | |
| "learning_rate": 0.00011919836224544767, | |
| "loss": 3.154, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 8.02389409105586, | |
| "grad_norm": 0.8000898957252502, | |
| "learning_rate": 0.00011887512121538627, | |
| "loss": 3.1562, | |
| "step": 74550 | |
| }, | |
| { | |
| "epoch": 8.029275643095469, | |
| "grad_norm": 0.8549275994300842, | |
| "learning_rate": 0.00011855188018532485, | |
| "loss": 3.1382, | |
| "step": 74600 | |
| }, | |
| { | |
| "epoch": 8.034657195135077, | |
| "grad_norm": 0.8691955208778381, | |
| "learning_rate": 0.00011822863915526343, | |
| "loss": 3.1614, | |
| "step": 74650 | |
| }, | |
| { | |
| "epoch": 8.040038747174686, | |
| "grad_norm": 0.8241603970527649, | |
| "learning_rate": 0.00011790539812520201, | |
| "loss": 3.1545, | |
| "step": 74700 | |
| }, | |
| { | |
| "epoch": 8.045420299214294, | |
| "grad_norm": 0.7827221751213074, | |
| "learning_rate": 0.00011758215709514061, | |
| "loss": 3.1564, | |
| "step": 74750 | |
| }, | |
| { | |
| "epoch": 8.050801851253901, | |
| "grad_norm": 0.8604016304016113, | |
| "learning_rate": 0.00011725891606507917, | |
| "loss": 3.1516, | |
| "step": 74800 | |
| }, | |
| { | |
| "epoch": 8.05618340329351, | |
| "grad_norm": 0.8160719871520996, | |
| "learning_rate": 0.00011693567503501777, | |
| "loss": 3.1621, | |
| "step": 74850 | |
| }, | |
| { | |
| "epoch": 8.061564955333118, | |
| "grad_norm": 0.8329173922538757, | |
| "learning_rate": 0.00011661243400495635, | |
| "loss": 3.162, | |
| "step": 74900 | |
| }, | |
| { | |
| "epoch": 8.066946507372727, | |
| "grad_norm": 0.8375145792961121, | |
| "learning_rate": 0.00011628919297489493, | |
| "loss": 3.1807, | |
| "step": 74950 | |
| }, | |
| { | |
| "epoch": 8.072328059412335, | |
| "grad_norm": 0.8866572976112366, | |
| "learning_rate": 0.00011596595194483351, | |
| "loss": 3.1789, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 8.072328059412335, | |
| "eval_accuracy": 0.38998748427385266, | |
| "eval_loss": 3.337472677230835, | |
| "eval_runtime": 184.7219, | |
| "eval_samples_per_second": 97.503, | |
| "eval_steps_per_second": 6.096, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 8.077709611451942, | |
| "grad_norm": 0.8435813784599304, | |
| "learning_rate": 0.00011564271091477211, | |
| "loss": 3.167, | |
| "step": 75050 | |
| }, | |
| { | |
| "epoch": 8.08309116349155, | |
| "grad_norm": 0.845312237739563, | |
| "learning_rate": 0.0001153194698847107, | |
| "loss": 3.1702, | |
| "step": 75100 | |
| }, | |
| { | |
| "epoch": 8.088472715531159, | |
| "grad_norm": 0.8060571551322937, | |
| "learning_rate": 0.00011499622885464927, | |
| "loss": 3.1735, | |
| "step": 75150 | |
| }, | |
| { | |
| "epoch": 8.093854267570768, | |
| "grad_norm": 0.876544177532196, | |
| "learning_rate": 0.00011467298782458786, | |
| "loss": 3.1761, | |
| "step": 75200 | |
| }, | |
| { | |
| "epoch": 8.099235819610376, | |
| "grad_norm": 0.8017421364784241, | |
| "learning_rate": 0.00011434974679452645, | |
| "loss": 3.1735, | |
| "step": 75250 | |
| }, | |
| { | |
| "epoch": 8.104617371649983, | |
| "grad_norm": 0.8665487170219421, | |
| "learning_rate": 0.00011402650576446503, | |
| "loss": 3.1671, | |
| "step": 75300 | |
| }, | |
| { | |
| "epoch": 8.109998923689592, | |
| "grad_norm": 0.864019513130188, | |
| "learning_rate": 0.00011370326473440361, | |
| "loss": 3.1754, | |
| "step": 75350 | |
| }, | |
| { | |
| "epoch": 8.1153804757292, | |
| "grad_norm": 0.7847714424133301, | |
| "learning_rate": 0.0001133800237043422, | |
| "loss": 3.1637, | |
| "step": 75400 | |
| }, | |
| { | |
| "epoch": 8.120762027768809, | |
| "grad_norm": 0.7971537113189697, | |
| "learning_rate": 0.00011305678267428077, | |
| "loss": 3.159, | |
| "step": 75450 | |
| }, | |
| { | |
| "epoch": 8.126143579808417, | |
| "grad_norm": 0.8206765055656433, | |
| "learning_rate": 0.00011273354164421936, | |
| "loss": 3.1894, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 8.131525131848026, | |
| "grad_norm": 0.8835159540176392, | |
| "learning_rate": 0.00011241030061415795, | |
| "loss": 3.1863, | |
| "step": 75550 | |
| }, | |
| { | |
| "epoch": 8.136906683887632, | |
| "grad_norm": 0.8609153628349304, | |
| "learning_rate": 0.00011208705958409654, | |
| "loss": 3.1728, | |
| "step": 75600 | |
| }, | |
| { | |
| "epoch": 8.142288235927241, | |
| "grad_norm": 0.8374049663543701, | |
| "learning_rate": 0.00011176381855403511, | |
| "loss": 3.1752, | |
| "step": 75650 | |
| }, | |
| { | |
| "epoch": 8.14766978796685, | |
| "grad_norm": 0.8300270438194275, | |
| "learning_rate": 0.0001114405775239737, | |
| "loss": 3.1822, | |
| "step": 75700 | |
| }, | |
| { | |
| "epoch": 8.153051340006458, | |
| "grad_norm": 0.8436737060546875, | |
| "learning_rate": 0.00011111733649391228, | |
| "loss": 3.1726, | |
| "step": 75750 | |
| }, | |
| { | |
| "epoch": 8.158432892046067, | |
| "grad_norm": 0.8542490601539612, | |
| "learning_rate": 0.00011079409546385086, | |
| "loss": 3.1713, | |
| "step": 75800 | |
| }, | |
| { | |
| "epoch": 8.163814444085673, | |
| "grad_norm": 0.8978158831596375, | |
| "learning_rate": 0.00011047085443378945, | |
| "loss": 3.1836, | |
| "step": 75850 | |
| }, | |
| { | |
| "epoch": 8.169195996125282, | |
| "grad_norm": 0.8399717211723328, | |
| "learning_rate": 0.00011014761340372804, | |
| "loss": 3.1693, | |
| "step": 75900 | |
| }, | |
| { | |
| "epoch": 8.17457754816489, | |
| "grad_norm": 0.8642532825469971, | |
| "learning_rate": 0.00010982437237366661, | |
| "loss": 3.1777, | |
| "step": 75950 | |
| }, | |
| { | |
| "epoch": 8.1799591002045, | |
| "grad_norm": null, | |
| "learning_rate": 0.00010950759616420644, | |
| "loss": 3.1772, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 8.1799591002045, | |
| "eval_accuracy": 0.3901858844470019, | |
| "eval_loss": 3.334829330444336, | |
| "eval_runtime": 184.1053, | |
| "eval_samples_per_second": 97.83, | |
| "eval_steps_per_second": 6.116, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 8.185340652244108, | |
| "grad_norm": 0.8661144971847534, | |
| "learning_rate": 0.00010918435513414502, | |
| "loss": 3.1709, | |
| "step": 76050 | |
| }, | |
| { | |
| "epoch": 8.190722204283716, | |
| "grad_norm": 0.8771913051605225, | |
| "learning_rate": 0.00010886111410408359, | |
| "loss": 3.1775, | |
| "step": 76100 | |
| }, | |
| { | |
| "epoch": 8.196103756323323, | |
| "grad_norm": 0.8441982269287109, | |
| "learning_rate": 0.00010853787307402218, | |
| "loss": 3.1796, | |
| "step": 76150 | |
| }, | |
| { | |
| "epoch": 8.201485308362932, | |
| "grad_norm": 0.8596539497375488, | |
| "learning_rate": 0.00010821463204396078, | |
| "loss": 3.1848, | |
| "step": 76200 | |
| }, | |
| { | |
| "epoch": 8.20686686040254, | |
| "grad_norm": 0.8797788023948669, | |
| "learning_rate": 0.00010789139101389935, | |
| "loss": 3.1779, | |
| "step": 76250 | |
| }, | |
| { | |
| "epoch": 8.212248412442149, | |
| "grad_norm": 0.8350202441215515, | |
| "learning_rate": 0.00010756814998383794, | |
| "loss": 3.1791, | |
| "step": 76300 | |
| }, | |
| { | |
| "epoch": 8.217629964481757, | |
| "grad_norm": 0.9758611917495728, | |
| "learning_rate": 0.00010724490895377652, | |
| "loss": 3.1655, | |
| "step": 76350 | |
| }, | |
| { | |
| "epoch": 8.223011516521364, | |
| "grad_norm": 0.8383360505104065, | |
| "learning_rate": 0.00010692166792371512, | |
| "loss": 3.1832, | |
| "step": 76400 | |
| }, | |
| { | |
| "epoch": 8.228393068560973, | |
| "grad_norm": 0.8440639972686768, | |
| "learning_rate": 0.00010659842689365368, | |
| "loss": 3.1878, | |
| "step": 76450 | |
| }, | |
| { | |
| "epoch": 8.233774620600581, | |
| "grad_norm": 0.8401709794998169, | |
| "learning_rate": 0.00010627518586359228, | |
| "loss": 3.1736, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 8.23915617264019, | |
| "grad_norm": 0.8579102158546448, | |
| "learning_rate": 0.00010595194483353086, | |
| "loss": 3.1638, | |
| "step": 76550 | |
| }, | |
| { | |
| "epoch": 8.244537724679798, | |
| "grad_norm": 0.8385379314422607, | |
| "learning_rate": 0.00010562870380346944, | |
| "loss": 3.1753, | |
| "step": 76600 | |
| }, | |
| { | |
| "epoch": 8.249919276719407, | |
| "grad_norm": 0.8203363418579102, | |
| "learning_rate": 0.00010530546277340802, | |
| "loss": 3.1707, | |
| "step": 76650 | |
| }, | |
| { | |
| "epoch": 8.255300828759013, | |
| "grad_norm": 0.8243541121482849, | |
| "learning_rate": 0.00010498222174334662, | |
| "loss": 3.1716, | |
| "step": 76700 | |
| }, | |
| { | |
| "epoch": 8.260682380798622, | |
| "grad_norm": 0.8422791361808777, | |
| "learning_rate": 0.00010465898071328519, | |
| "loss": 3.1836, | |
| "step": 76750 | |
| }, | |
| { | |
| "epoch": 8.26606393283823, | |
| "grad_norm": 0.8471134305000305, | |
| "learning_rate": 0.00010433573968322378, | |
| "loss": 3.1814, | |
| "step": 76800 | |
| }, | |
| { | |
| "epoch": 8.27144548487784, | |
| "grad_norm": 0.8101161122322083, | |
| "learning_rate": 0.00010401249865316237, | |
| "loss": 3.1779, | |
| "step": 76850 | |
| }, | |
| { | |
| "epoch": 8.276827036917448, | |
| "grad_norm": 0.896263599395752, | |
| "learning_rate": 0.00010368925762310096, | |
| "loss": 3.1831, | |
| "step": 76900 | |
| }, | |
| { | |
| "epoch": 8.282208588957054, | |
| "grad_norm": 0.8486348390579224, | |
| "learning_rate": 0.00010336601659303954, | |
| "loss": 3.1806, | |
| "step": 76950 | |
| }, | |
| { | |
| "epoch": 8.287590140996663, | |
| "grad_norm": 0.8380976915359497, | |
| "learning_rate": 0.00010304277556297812, | |
| "loss": 3.1732, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 8.287590140996663, | |
| "eval_accuracy": 0.39076543895060434, | |
| "eval_loss": 3.3328659534454346, | |
| "eval_runtime": 184.2535, | |
| "eval_samples_per_second": 97.751, | |
| "eval_steps_per_second": 6.111, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 8.292971693036272, | |
| "grad_norm": 0.7985622882843018, | |
| "learning_rate": 0.00010271953453291671, | |
| "loss": 3.1663, | |
| "step": 77050 | |
| }, | |
| { | |
| "epoch": 8.29835324507588, | |
| "grad_norm": 0.8832119107246399, | |
| "learning_rate": 0.00010239629350285528, | |
| "loss": 3.192, | |
| "step": 77100 | |
| }, | |
| { | |
| "epoch": 8.303734797115489, | |
| "grad_norm": 0.8704482316970825, | |
| "learning_rate": 0.00010207305247279387, | |
| "loss": 3.1871, | |
| "step": 77150 | |
| }, | |
| { | |
| "epoch": 8.309116349155097, | |
| "grad_norm": 0.8640622496604919, | |
| "learning_rate": 0.00010174981144273246, | |
| "loss": 3.1888, | |
| "step": 77200 | |
| }, | |
| { | |
| "epoch": 8.314497901194704, | |
| "grad_norm": 0.8418446779251099, | |
| "learning_rate": 0.00010142657041267104, | |
| "loss": 3.1915, | |
| "step": 77250 | |
| }, | |
| { | |
| "epoch": 8.319879453234313, | |
| "grad_norm": 0.8596892952919006, | |
| "learning_rate": 0.00010110332938260962, | |
| "loss": 3.1899, | |
| "step": 77300 | |
| }, | |
| { | |
| "epoch": 8.325261005273921, | |
| "grad_norm": 0.8529751896858215, | |
| "learning_rate": 0.00010078008835254821, | |
| "loss": 3.1842, | |
| "step": 77350 | |
| }, | |
| { | |
| "epoch": 8.33064255731353, | |
| "grad_norm": 0.8708468079566956, | |
| "learning_rate": 0.0001004568473224868, | |
| "loss": 3.1842, | |
| "step": 77400 | |
| }, | |
| { | |
| "epoch": 8.336024109353138, | |
| "grad_norm": 0.8727350234985352, | |
| "learning_rate": 0.00010013360629242537, | |
| "loss": 3.1902, | |
| "step": 77450 | |
| }, | |
| { | |
| "epoch": 8.341405661392745, | |
| "grad_norm": 0.8403820395469666, | |
| "learning_rate": 9.981036526236396e-05, | |
| "loss": 3.1683, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 8.346787213432354, | |
| "grad_norm": 0.7990129590034485, | |
| "learning_rate": 9.948712423230255e-05, | |
| "loss": 3.1668, | |
| "step": 77550 | |
| }, | |
| { | |
| "epoch": 8.352168765471962, | |
| "grad_norm": 0.8051156401634216, | |
| "learning_rate": 9.916388320224112e-05, | |
| "loss": 3.1875, | |
| "step": 77600 | |
| }, | |
| { | |
| "epoch": 8.35755031751157, | |
| "grad_norm": 1.0316652059555054, | |
| "learning_rate": 9.884064217217971e-05, | |
| "loss": 3.1986, | |
| "step": 77650 | |
| }, | |
| { | |
| "epoch": 8.36293186955118, | |
| "grad_norm": 0.9079576730728149, | |
| "learning_rate": 9.851740114211831e-05, | |
| "loss": 3.18, | |
| "step": 77700 | |
| }, | |
| { | |
| "epoch": 8.368313421590786, | |
| "grad_norm": 0.9003082513809204, | |
| "learning_rate": 9.819416011205688e-05, | |
| "loss": 3.1652, | |
| "step": 77750 | |
| }, | |
| { | |
| "epoch": 8.373694973630395, | |
| "grad_norm": 0.8261138796806335, | |
| "learning_rate": 9.787091908199547e-05, | |
| "loss": 3.1786, | |
| "step": 77800 | |
| }, | |
| { | |
| "epoch": 8.379076525670003, | |
| "grad_norm": 0.857865035533905, | |
| "learning_rate": 9.754767805193405e-05, | |
| "loss": 3.1672, | |
| "step": 77850 | |
| }, | |
| { | |
| "epoch": 8.384458077709612, | |
| "grad_norm": 0.8354493975639343, | |
| "learning_rate": 9.722443702187265e-05, | |
| "loss": 3.1921, | |
| "step": 77900 | |
| }, | |
| { | |
| "epoch": 8.38983962974922, | |
| "grad_norm": 0.7907045483589172, | |
| "learning_rate": 9.690119599181121e-05, | |
| "loss": 3.1879, | |
| "step": 77950 | |
| }, | |
| { | |
| "epoch": 8.395221181788829, | |
| "grad_norm": 0.8590234518051147, | |
| "learning_rate": 9.657795496174981e-05, | |
| "loss": 3.1881, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 8.395221181788829, | |
| "eval_accuracy": 0.3911622392969028, | |
| "eval_loss": 3.328979253768921, | |
| "eval_runtime": 184.3554, | |
| "eval_samples_per_second": 97.697, | |
| "eval_steps_per_second": 6.108, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 8.400602733828435, | |
| "grad_norm": 0.8273560404777527, | |
| "learning_rate": 9.625471393168839e-05, | |
| "loss": 3.1692, | |
| "step": 78050 | |
| }, | |
| { | |
| "epoch": 8.405984285868044, | |
| "grad_norm": 0.8984537720680237, | |
| "learning_rate": 9.593147290162697e-05, | |
| "loss": 3.1826, | |
| "step": 78100 | |
| }, | |
| { | |
| "epoch": 8.411365837907653, | |
| "grad_norm": 0.8866853713989258, | |
| "learning_rate": 9.560823187156555e-05, | |
| "loss": 3.1882, | |
| "step": 78150 | |
| }, | |
| { | |
| "epoch": 8.416747389947261, | |
| "grad_norm": 0.8914728760719299, | |
| "learning_rate": 9.528499084150415e-05, | |
| "loss": 3.1835, | |
| "step": 78200 | |
| }, | |
| { | |
| "epoch": 8.42212894198687, | |
| "grad_norm": 0.8183653950691223, | |
| "learning_rate": 9.496174981144271e-05, | |
| "loss": 3.1826, | |
| "step": 78250 | |
| }, | |
| { | |
| "epoch": 8.427510494026476, | |
| "grad_norm": 0.8232743144035339, | |
| "learning_rate": 9.463850878138131e-05, | |
| "loss": 3.1944, | |
| "step": 78300 | |
| }, | |
| { | |
| "epoch": 8.432892046066085, | |
| "grad_norm": 0.8727222681045532, | |
| "learning_rate": 9.431526775131989e-05, | |
| "loss": 3.2018, | |
| "step": 78350 | |
| }, | |
| { | |
| "epoch": 8.438273598105694, | |
| "grad_norm": 0.8639195561408997, | |
| "learning_rate": 9.399202672125848e-05, | |
| "loss": 3.1838, | |
| "step": 78400 | |
| }, | |
| { | |
| "epoch": 8.443655150145302, | |
| "grad_norm": 0.8310608267784119, | |
| "learning_rate": 9.366878569119705e-05, | |
| "loss": 3.1739, | |
| "step": 78450 | |
| }, | |
| { | |
| "epoch": 8.44903670218491, | |
| "grad_norm": 0.849603533744812, | |
| "learning_rate": 9.334554466113565e-05, | |
| "loss": 3.174, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 8.45441825422452, | |
| "grad_norm": 0.8781525492668152, | |
| "learning_rate": 9.302230363107424e-05, | |
| "loss": 3.1771, | |
| "step": 78550 | |
| }, | |
| { | |
| "epoch": 8.459799806264126, | |
| "grad_norm": 0.9027126431465149, | |
| "learning_rate": 9.269906260101281e-05, | |
| "loss": 3.1938, | |
| "step": 78600 | |
| }, | |
| { | |
| "epoch": 8.465181358303735, | |
| "grad_norm": 0.8226575255393982, | |
| "learning_rate": 9.23758215709514e-05, | |
| "loss": 3.1688, | |
| "step": 78650 | |
| }, | |
| { | |
| "epoch": 8.470562910343343, | |
| "grad_norm": 0.8545637130737305, | |
| "learning_rate": 9.205258054088998e-05, | |
| "loss": 3.1884, | |
| "step": 78700 | |
| }, | |
| { | |
| "epoch": 8.475944462382952, | |
| "grad_norm": 0.8180684447288513, | |
| "learning_rate": 9.172933951082856e-05, | |
| "loss": 3.1945, | |
| "step": 78750 | |
| }, | |
| { | |
| "epoch": 8.48132601442256, | |
| "grad_norm": 0.8815638422966003, | |
| "learning_rate": 9.140609848076715e-05, | |
| "loss": 3.1881, | |
| "step": 78800 | |
| }, | |
| { | |
| "epoch": 8.486707566462167, | |
| "grad_norm": 0.8755268454551697, | |
| "learning_rate": 9.108285745070574e-05, | |
| "loss": 3.1853, | |
| "step": 78850 | |
| }, | |
| { | |
| "epoch": 8.492089118501776, | |
| "grad_norm": 0.8428160548210144, | |
| "learning_rate": 9.075961642064432e-05, | |
| "loss": 3.195, | |
| "step": 78900 | |
| }, | |
| { | |
| "epoch": 8.497470670541384, | |
| "grad_norm": 0.8920302987098694, | |
| "learning_rate": 9.04363753905829e-05, | |
| "loss": 3.193, | |
| "step": 78950 | |
| }, | |
| { | |
| "epoch": 8.502852222580993, | |
| "grad_norm": 0.8534532189369202, | |
| "learning_rate": 9.011313436052148e-05, | |
| "loss": 3.1839, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 8.502852222580993, | |
| "eval_accuracy": 0.3912604615074871, | |
| "eval_loss": 3.3262176513671875, | |
| "eval_runtime": 184.6415, | |
| "eval_samples_per_second": 97.546, | |
| "eval_steps_per_second": 6.098, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 8.508233774620601, | |
| "grad_norm": 0.9027853012084961, | |
| "learning_rate": 8.978989333046008e-05, | |
| "loss": 3.1808, | |
| "step": 79050 | |
| }, | |
| { | |
| "epoch": 8.513615326660208, | |
| "grad_norm": 0.8264344334602356, | |
| "learning_rate": 8.946665230039865e-05, | |
| "loss": 3.1587, | |
| "step": 79100 | |
| }, | |
| { | |
| "epoch": 8.518996878699816, | |
| "grad_norm": 0.8871257305145264, | |
| "learning_rate": 8.914341127033724e-05, | |
| "loss": 3.1913, | |
| "step": 79150 | |
| }, | |
| { | |
| "epoch": 8.524378430739425, | |
| "grad_norm": 0.835419237613678, | |
| "learning_rate": 8.882017024027582e-05, | |
| "loss": 3.1695, | |
| "step": 79200 | |
| }, | |
| { | |
| "epoch": 8.529759982779034, | |
| "grad_norm": 0.9073612093925476, | |
| "learning_rate": 8.84969292102144e-05, | |
| "loss": 3.1817, | |
| "step": 79250 | |
| }, | |
| { | |
| "epoch": 8.535141534818642, | |
| "grad_norm": 0.8366214036941528, | |
| "learning_rate": 8.817368818015298e-05, | |
| "loss": 3.1713, | |
| "step": 79300 | |
| }, | |
| { | |
| "epoch": 8.54052308685825, | |
| "grad_norm": 0.8558536767959595, | |
| "learning_rate": 8.785044715009158e-05, | |
| "loss": 3.1971, | |
| "step": 79350 | |
| }, | |
| { | |
| "epoch": 8.545904638897857, | |
| "grad_norm": 0.8308901190757751, | |
| "learning_rate": 8.753367094063139e-05, | |
| "loss": 3.1882, | |
| "step": 79400 | |
| }, | |
| { | |
| "epoch": 8.551286190937466, | |
| "grad_norm": 0.8245387673377991, | |
| "learning_rate": 8.721042991056998e-05, | |
| "loss": 3.1859, | |
| "step": 79450 | |
| }, | |
| { | |
| "epoch": 8.556667742977075, | |
| "grad_norm": 0.9523900747299194, | |
| "learning_rate": 8.688718888050856e-05, | |
| "loss": 3.1975, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 8.562049295016683, | |
| "grad_norm": 0.8246482014656067, | |
| "learning_rate": 8.656394785044713e-05, | |
| "loss": 3.1653, | |
| "step": 79550 | |
| }, | |
| { | |
| "epoch": 8.567430847056292, | |
| "grad_norm": 0.8551926612854004, | |
| "learning_rate": 8.624070682038572e-05, | |
| "loss": 3.1805, | |
| "step": 79600 | |
| }, | |
| { | |
| "epoch": 8.572812399095898, | |
| "grad_norm": 0.8453466892242432, | |
| "learning_rate": 8.591746579032432e-05, | |
| "loss": 3.2024, | |
| "step": 79650 | |
| }, | |
| { | |
| "epoch": 8.578193951135507, | |
| "grad_norm": 0.8227813839912415, | |
| "learning_rate": 8.55942247602629e-05, | |
| "loss": 3.1821, | |
| "step": 79700 | |
| }, | |
| { | |
| "epoch": 8.583575503175116, | |
| "grad_norm": 0.8279109001159668, | |
| "learning_rate": 8.527098373020148e-05, | |
| "loss": 3.171, | |
| "step": 79750 | |
| }, | |
| { | |
| "epoch": 8.588957055214724, | |
| "grad_norm": 0.8841160535812378, | |
| "learning_rate": 8.494774270014006e-05, | |
| "loss": 3.1634, | |
| "step": 79800 | |
| }, | |
| { | |
| "epoch": 8.594338607254333, | |
| "grad_norm": 0.8176218271255493, | |
| "learning_rate": 8.462450167007866e-05, | |
| "loss": 3.2067, | |
| "step": 79850 | |
| }, | |
| { | |
| "epoch": 8.599720159293941, | |
| "grad_norm": 0.8833402395248413, | |
| "learning_rate": 8.430126064001722e-05, | |
| "loss": 3.185, | |
| "step": 79900 | |
| }, | |
| { | |
| "epoch": 8.605101711333548, | |
| "grad_norm": 0.8307499289512634, | |
| "learning_rate": 8.397801960995582e-05, | |
| "loss": 3.1787, | |
| "step": 79950 | |
| }, | |
| { | |
| "epoch": 8.610483263373157, | |
| "grad_norm": 0.9109451770782471, | |
| "learning_rate": 8.36547785798944e-05, | |
| "loss": 3.1876, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 8.610483263373157, | |
| "eval_accuracy": 0.3915649068991433, | |
| "eval_loss": 3.3241310119628906, | |
| "eval_runtime": 184.3032, | |
| "eval_samples_per_second": 97.725, | |
| "eval_steps_per_second": 6.109, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 8.615864815412765, | |
| "grad_norm": 0.9592380523681641, | |
| "learning_rate": 8.333153754983298e-05, | |
| "loss": 3.201, | |
| "step": 80050 | |
| }, | |
| { | |
| "epoch": 8.621246367452374, | |
| "grad_norm": 0.838151216506958, | |
| "learning_rate": 8.300829651977156e-05, | |
| "loss": 3.1579, | |
| "step": 80100 | |
| }, | |
| { | |
| "epoch": 8.626627919491982, | |
| "grad_norm": 0.8300402760505676, | |
| "learning_rate": 8.268505548971016e-05, | |
| "loss": 3.1736, | |
| "step": 80150 | |
| }, | |
| { | |
| "epoch": 8.632009471531589, | |
| "grad_norm": 0.8234220147132874, | |
| "learning_rate": 8.236181445964875e-05, | |
| "loss": 3.1975, | |
| "step": 80200 | |
| }, | |
| { | |
| "epoch": 8.637391023571197, | |
| "grad_norm": 0.876625657081604, | |
| "learning_rate": 8.203857342958732e-05, | |
| "loss": 3.1827, | |
| "step": 80250 | |
| }, | |
| { | |
| "epoch": 8.642772575610806, | |
| "grad_norm": 0.8351073861122131, | |
| "learning_rate": 8.171533239952591e-05, | |
| "loss": 3.1816, | |
| "step": 80300 | |
| }, | |
| { | |
| "epoch": 8.648154127650415, | |
| "grad_norm": 0.8206344842910767, | |
| "learning_rate": 8.13920913694645e-05, | |
| "loss": 3.1766, | |
| "step": 80350 | |
| }, | |
| { | |
| "epoch": 8.653535679690023, | |
| "grad_norm": 0.8717277646064758, | |
| "learning_rate": 8.106885033940308e-05, | |
| "loss": 3.1884, | |
| "step": 80400 | |
| }, | |
| { | |
| "epoch": 8.658917231729632, | |
| "grad_norm": 0.8345118761062622, | |
| "learning_rate": 8.074560930934166e-05, | |
| "loss": 3.1898, | |
| "step": 80450 | |
| }, | |
| { | |
| "epoch": 8.664298783769238, | |
| "grad_norm": 0.8316090703010559, | |
| "learning_rate": 8.042236827928025e-05, | |
| "loss": 3.1869, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 8.669680335808847, | |
| "grad_norm": 0.856178343296051, | |
| "learning_rate": 8.009912724921882e-05, | |
| "loss": 3.1743, | |
| "step": 80550 | |
| }, | |
| { | |
| "epoch": 8.675061887848456, | |
| "grad_norm": 0.8580847382545471, | |
| "learning_rate": 7.977588621915741e-05, | |
| "loss": 3.1797, | |
| "step": 80600 | |
| }, | |
| { | |
| "epoch": 8.680443439888064, | |
| "grad_norm": 0.8349714875221252, | |
| "learning_rate": 7.9452645189096e-05, | |
| "loss": 3.192, | |
| "step": 80650 | |
| }, | |
| { | |
| "epoch": 8.685824991927673, | |
| "grad_norm": 0.8190581798553467, | |
| "learning_rate": 7.912940415903459e-05, | |
| "loss": 3.1809, | |
| "step": 80700 | |
| }, | |
| { | |
| "epoch": 8.69120654396728, | |
| "grad_norm": 0.8722666501998901, | |
| "learning_rate": 7.880616312897316e-05, | |
| "loss": 3.2007, | |
| "step": 80750 | |
| }, | |
| { | |
| "epoch": 8.696588096006888, | |
| "grad_norm": 0.8494431972503662, | |
| "learning_rate": 7.848292209891175e-05, | |
| "loss": 3.194, | |
| "step": 80800 | |
| }, | |
| { | |
| "epoch": 8.701969648046497, | |
| "grad_norm": 0.8651822209358215, | |
| "learning_rate": 7.815968106885033e-05, | |
| "loss": 3.188, | |
| "step": 80850 | |
| }, | |
| { | |
| "epoch": 8.707351200086105, | |
| "grad_norm": 0.908804178237915, | |
| "learning_rate": 7.783644003878891e-05, | |
| "loss": 3.1798, | |
| "step": 80900 | |
| }, | |
| { | |
| "epoch": 8.712732752125714, | |
| "grad_norm": 0.8358641862869263, | |
| "learning_rate": 7.75131990087275e-05, | |
| "loss": 3.1709, | |
| "step": 80950 | |
| }, | |
| { | |
| "epoch": 8.718114304165322, | |
| "grad_norm": 0.8248346447944641, | |
| "learning_rate": 7.718995797866609e-05, | |
| "loss": 3.1846, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 8.718114304165322, | |
| "eval_accuracy": 0.39203428737450186, | |
| "eval_loss": 3.3206233978271484, | |
| "eval_runtime": 184.2159, | |
| "eval_samples_per_second": 97.771, | |
| "eval_steps_per_second": 6.112, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 8.723495856204929, | |
| "grad_norm": 0.8361005187034607, | |
| "learning_rate": 7.686671694860466e-05, | |
| "loss": 3.1901, | |
| "step": 81050 | |
| }, | |
| { | |
| "epoch": 8.728877408244538, | |
| "grad_norm": 0.8511915802955627, | |
| "learning_rate": 7.654347591854325e-05, | |
| "loss": 3.197, | |
| "step": 81100 | |
| }, | |
| { | |
| "epoch": 8.734258960284146, | |
| "grad_norm": 0.875729501247406, | |
| "learning_rate": 7.622023488848185e-05, | |
| "loss": 3.2046, | |
| "step": 81150 | |
| }, | |
| { | |
| "epoch": 8.739640512323755, | |
| "grad_norm": 0.9036592841148376, | |
| "learning_rate": 7.589699385842043e-05, | |
| "loss": 3.1766, | |
| "step": 81200 | |
| }, | |
| { | |
| "epoch": 8.745022064363363, | |
| "grad_norm": 0.9133176207542419, | |
| "learning_rate": 7.557375282835901e-05, | |
| "loss": 3.1987, | |
| "step": 81250 | |
| }, | |
| { | |
| "epoch": 8.75040361640297, | |
| "grad_norm": 0.8762437105178833, | |
| "learning_rate": 7.525051179829759e-05, | |
| "loss": 3.1848, | |
| "step": 81300 | |
| }, | |
| { | |
| "epoch": 8.755785168442578, | |
| "grad_norm": 0.890551745891571, | |
| "learning_rate": 7.492727076823617e-05, | |
| "loss": 3.1773, | |
| "step": 81350 | |
| }, | |
| { | |
| "epoch": 8.761166720482187, | |
| "grad_norm": 0.8716076016426086, | |
| "learning_rate": 7.460402973817477e-05, | |
| "loss": 3.1866, | |
| "step": 81400 | |
| }, | |
| { | |
| "epoch": 8.766548272521796, | |
| "grad_norm": 0.8556268215179443, | |
| "learning_rate": 7.428078870811335e-05, | |
| "loss": 3.1767, | |
| "step": 81450 | |
| }, | |
| { | |
| "epoch": 8.771929824561404, | |
| "grad_norm": 0.8896015286445618, | |
| "learning_rate": 7.395754767805193e-05, | |
| "loss": 3.1841, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 8.777311376601011, | |
| "grad_norm": 0.8343814611434937, | |
| "learning_rate": 7.363430664799051e-05, | |
| "loss": 3.1673, | |
| "step": 81550 | |
| }, | |
| { | |
| "epoch": 8.78269292864062, | |
| "grad_norm": 0.8750051856040955, | |
| "learning_rate": 7.331106561792909e-05, | |
| "loss": 3.1828, | |
| "step": 81600 | |
| }, | |
| { | |
| "epoch": 8.788074480680228, | |
| "grad_norm": 0.8982149362564087, | |
| "learning_rate": 7.298782458786768e-05, | |
| "loss": 3.1911, | |
| "step": 81650 | |
| }, | |
| { | |
| "epoch": 8.793456032719837, | |
| "grad_norm": 0.7928913831710815, | |
| "learning_rate": 7.266458355780627e-05, | |
| "loss": 3.1609, | |
| "step": 81700 | |
| }, | |
| { | |
| "epoch": 8.798837584759445, | |
| "grad_norm": 0.8926288485527039, | |
| "learning_rate": 7.234134252774485e-05, | |
| "loss": 3.1732, | |
| "step": 81750 | |
| }, | |
| { | |
| "epoch": 8.804219136799054, | |
| "grad_norm": 0.8625380992889404, | |
| "learning_rate": 7.201810149768343e-05, | |
| "loss": 3.1757, | |
| "step": 81800 | |
| }, | |
| { | |
| "epoch": 8.80960068883866, | |
| "grad_norm": 0.9233858585357666, | |
| "learning_rate": 7.169486046762201e-05, | |
| "loss": 3.1739, | |
| "step": 81850 | |
| }, | |
| { | |
| "epoch": 8.814982240878269, | |
| "grad_norm": 0.8412981629371643, | |
| "learning_rate": 7.13716194375606e-05, | |
| "loss": 3.1728, | |
| "step": 81900 | |
| }, | |
| { | |
| "epoch": 8.820363792917878, | |
| "grad_norm": 0.8994260430335999, | |
| "learning_rate": 7.104837840749918e-05, | |
| "loss": 3.1718, | |
| "step": 81950 | |
| }, | |
| { | |
| "epoch": 8.825745344957486, | |
| "grad_norm": 0.8829771280288696, | |
| "learning_rate": 7.072513737743778e-05, | |
| "loss": 3.1852, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 8.825745344957486, | |
| "eval_accuracy": 0.39260819192793794, | |
| "eval_loss": 3.3162174224853516, | |
| "eval_runtime": 184.347, | |
| "eval_samples_per_second": 97.702, | |
| "eval_steps_per_second": 6.108, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 8.831126896997095, | |
| "grad_norm": 0.8612265586853027, | |
| "learning_rate": 7.040189634737636e-05, | |
| "loss": 3.2016, | |
| "step": 82050 | |
| }, | |
| { | |
| "epoch": 8.836508449036701, | |
| "grad_norm": 0.8780418038368225, | |
| "learning_rate": 7.007865531731494e-05, | |
| "loss": 3.1763, | |
| "step": 82100 | |
| }, | |
| { | |
| "epoch": 8.84189000107631, | |
| "grad_norm": 0.8199126720428467, | |
| "learning_rate": 6.975541428725352e-05, | |
| "loss": 3.1766, | |
| "step": 82150 | |
| }, | |
| { | |
| "epoch": 8.847271553115919, | |
| "grad_norm": 0.9061501622200012, | |
| "learning_rate": 6.94321732571921e-05, | |
| "loss": 3.1808, | |
| "step": 82200 | |
| }, | |
| { | |
| "epoch": 8.852653105155527, | |
| "grad_norm": 0.8966909646987915, | |
| "learning_rate": 6.91089322271307e-05, | |
| "loss": 3.1829, | |
| "step": 82250 | |
| }, | |
| { | |
| "epoch": 8.858034657195136, | |
| "grad_norm": 0.8570286631584167, | |
| "learning_rate": 6.878569119706928e-05, | |
| "loss": 3.1946, | |
| "step": 82300 | |
| }, | |
| { | |
| "epoch": 8.863416209234742, | |
| "grad_norm": 0.874578595161438, | |
| "learning_rate": 6.846245016700786e-05, | |
| "loss": 3.1889, | |
| "step": 82350 | |
| }, | |
| { | |
| "epoch": 8.868797761274351, | |
| "grad_norm": 0.8451172113418579, | |
| "learning_rate": 6.813920913694644e-05, | |
| "loss": 3.1807, | |
| "step": 82400 | |
| }, | |
| { | |
| "epoch": 8.87417931331396, | |
| "grad_norm": 0.8931718468666077, | |
| "learning_rate": 6.781596810688502e-05, | |
| "loss": 3.1799, | |
| "step": 82450 | |
| }, | |
| { | |
| "epoch": 8.879560865353568, | |
| "grad_norm": 0.8435003757476807, | |
| "learning_rate": 6.749272707682362e-05, | |
| "loss": 3.1653, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 8.884942417393177, | |
| "grad_norm": 0.8534196019172668, | |
| "learning_rate": 6.71694860467622e-05, | |
| "loss": 3.1826, | |
| "step": 82550 | |
| }, | |
| { | |
| "epoch": 8.890323969432785, | |
| "grad_norm": 0.851076066493988, | |
| "learning_rate": 6.684624501670078e-05, | |
| "loss": 3.1898, | |
| "step": 82600 | |
| }, | |
| { | |
| "epoch": 8.895705521472392, | |
| "grad_norm": 0.8538959622383118, | |
| "learning_rate": 6.652300398663936e-05, | |
| "loss": 3.1634, | |
| "step": 82650 | |
| }, | |
| { | |
| "epoch": 8.901087073512, | |
| "grad_norm": 0.8814782500267029, | |
| "learning_rate": 6.619976295657794e-05, | |
| "loss": 3.1896, | |
| "step": 82700 | |
| }, | |
| { | |
| "epoch": 8.906468625551609, | |
| "grad_norm": 0.8407677412033081, | |
| "learning_rate": 6.587652192651654e-05, | |
| "loss": 3.188, | |
| "step": 82750 | |
| }, | |
| { | |
| "epoch": 8.911850177591218, | |
| "grad_norm": 0.8978355526924133, | |
| "learning_rate": 6.555328089645512e-05, | |
| "loss": 3.1864, | |
| "step": 82800 | |
| }, | |
| { | |
| "epoch": 8.917231729630826, | |
| "grad_norm": 0.8800509572029114, | |
| "learning_rate": 6.52300398663937e-05, | |
| "loss": 3.1936, | |
| "step": 82850 | |
| }, | |
| { | |
| "epoch": 8.922613281670433, | |
| "grad_norm": 0.9059144854545593, | |
| "learning_rate": 6.490679883633229e-05, | |
| "loss": 3.1688, | |
| "step": 82900 | |
| }, | |
| { | |
| "epoch": 8.927994833710041, | |
| "grad_norm": 0.9003951549530029, | |
| "learning_rate": 6.458355780627087e-05, | |
| "loss": 3.1929, | |
| "step": 82950 | |
| }, | |
| { | |
| "epoch": 8.93337638574965, | |
| "grad_norm": 0.8630614280700684, | |
| "learning_rate": 6.426031677620946e-05, | |
| "loss": 3.1742, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 8.93337638574965, | |
| "eval_accuracy": 0.392769215507679, | |
| "eval_loss": 3.312757730484009, | |
| "eval_runtime": 184.5097, | |
| "eval_samples_per_second": 97.615, | |
| "eval_steps_per_second": 6.103, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 8.938757937789259, | |
| "grad_norm": 0.8457894325256348, | |
| "learning_rate": 6.393707574614804e-05, | |
| "loss": 3.1898, | |
| "step": 83050 | |
| }, | |
| { | |
| "epoch": 8.944139489828867, | |
| "grad_norm": 0.8665552139282227, | |
| "learning_rate": 6.361383471608662e-05, | |
| "loss": 3.183, | |
| "step": 83100 | |
| }, | |
| { | |
| "epoch": 8.949521041868476, | |
| "grad_norm": 0.8568403720855713, | |
| "learning_rate": 6.329059368602521e-05, | |
| "loss": 3.1694, | |
| "step": 83150 | |
| }, | |
| { | |
| "epoch": 8.954902593908082, | |
| "grad_norm": 0.8542162775993347, | |
| "learning_rate": 6.29673526559638e-05, | |
| "loss": 3.1816, | |
| "step": 83200 | |
| }, | |
| { | |
| "epoch": 8.960284145947691, | |
| "grad_norm": 0.8801798820495605, | |
| "learning_rate": 6.264411162590237e-05, | |
| "loss": 3.1772, | |
| "step": 83250 | |
| }, | |
| { | |
| "epoch": 8.9656656979873, | |
| "grad_norm": 0.9118631482124329, | |
| "learning_rate": 6.232087059584096e-05, | |
| "loss": 3.185, | |
| "step": 83300 | |
| }, | |
| { | |
| "epoch": 8.971047250026908, | |
| "grad_norm": 0.8372429013252258, | |
| "learning_rate": 6.199762956577954e-05, | |
| "loss": 3.2102, | |
| "step": 83350 | |
| }, | |
| { | |
| "epoch": 8.976428802066517, | |
| "grad_norm": 0.8432099223136902, | |
| "learning_rate": 6.168085335631936e-05, | |
| "loss": 3.1847, | |
| "step": 83400 | |
| }, | |
| { | |
| "epoch": 8.981810354106123, | |
| "grad_norm": 0.8292369842529297, | |
| "learning_rate": 6.135761232625794e-05, | |
| "loss": 3.1843, | |
| "step": 83450 | |
| }, | |
| { | |
| "epoch": 8.987191906145732, | |
| "grad_norm": 0.8080323338508606, | |
| "learning_rate": 6.1034371296196526e-05, | |
| "loss": 3.1968, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 8.99257345818534, | |
| "grad_norm": 0.8511005640029907, | |
| "learning_rate": 6.0711130266135114e-05, | |
| "loss": 3.1874, | |
| "step": 83550 | |
| }, | |
| { | |
| "epoch": 8.997955010224949, | |
| "grad_norm": 0.8977751135826111, | |
| "learning_rate": 6.0387889236073695e-05, | |
| "loss": 3.1862, | |
| "step": 83600 | |
| }, | |
| { | |
| "epoch": 9.003336562264558, | |
| "grad_norm": 0.8723348379135132, | |
| "learning_rate": 6.0064648206012276e-05, | |
| "loss": 3.1498, | |
| "step": 83650 | |
| }, | |
| { | |
| "epoch": 9.008718114304166, | |
| "grad_norm": 0.8234129548072815, | |
| "learning_rate": 5.9741407175950864e-05, | |
| "loss": 3.124, | |
| "step": 83700 | |
| }, | |
| { | |
| "epoch": 9.014099666343773, | |
| "grad_norm": 0.8998926877975464, | |
| "learning_rate": 5.942463096649068e-05, | |
| "loss": 3.114, | |
| "step": 83750 | |
| }, | |
| { | |
| "epoch": 9.019481218383381, | |
| "grad_norm": 0.8508299589157104, | |
| "learning_rate": 5.910138993642926e-05, | |
| "loss": 3.1173, | |
| "step": 83800 | |
| }, | |
| { | |
| "epoch": 9.02486277042299, | |
| "grad_norm": 0.8853340744972229, | |
| "learning_rate": 5.877814890636784e-05, | |
| "loss": 3.1361, | |
| "step": 83850 | |
| }, | |
| { | |
| "epoch": 9.030244322462599, | |
| "grad_norm": 0.9189099073410034, | |
| "learning_rate": 5.845490787630643e-05, | |
| "loss": 3.1291, | |
| "step": 83900 | |
| }, | |
| { | |
| "epoch": 9.035625874502207, | |
| "grad_norm": 0.8015260100364685, | |
| "learning_rate": 5.813166684624501e-05, | |
| "loss": 3.0993, | |
| "step": 83950 | |
| }, | |
| { | |
| "epoch": 9.041007426541814, | |
| "grad_norm": 0.80156409740448, | |
| "learning_rate": 5.7808425816183596e-05, | |
| "loss": 3.1159, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 9.041007426541814, | |
| "eval_accuracy": 0.39277214913565, | |
| "eval_loss": 3.3150827884674072, | |
| "eval_runtime": 184.1087, | |
| "eval_samples_per_second": 97.828, | |
| "eval_steps_per_second": 6.116, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 9.046388978581422, | |
| "grad_norm": 0.8560525178909302, | |
| "learning_rate": 5.748518478612218e-05, | |
| "loss": 3.1227, | |
| "step": 84050 | |
| }, | |
| { | |
| "epoch": 9.051770530621031, | |
| "grad_norm": 0.8749635815620422, | |
| "learning_rate": 5.716194375606076e-05, | |
| "loss": 3.1184, | |
| "step": 84100 | |
| }, | |
| { | |
| "epoch": 9.05715208266064, | |
| "grad_norm": 0.8701488971710205, | |
| "learning_rate": 5.6838702725999346e-05, | |
| "loss": 3.1153, | |
| "step": 84150 | |
| }, | |
| { | |
| "epoch": 9.062533634700248, | |
| "grad_norm": 0.8524916172027588, | |
| "learning_rate": 5.651546169593793e-05, | |
| "loss": 3.1306, | |
| "step": 84200 | |
| }, | |
| { | |
| "epoch": 9.067915186739857, | |
| "grad_norm": 0.9289652109146118, | |
| "learning_rate": 5.619868548647774e-05, | |
| "loss": 3.1385, | |
| "step": 84250 | |
| }, | |
| { | |
| "epoch": 9.073296738779463, | |
| "grad_norm": 0.8488720655441284, | |
| "learning_rate": 5.5875444456416335e-05, | |
| "loss": 3.128, | |
| "step": 84300 | |
| }, | |
| { | |
| "epoch": 9.078678290819072, | |
| "grad_norm": 0.8681465983390808, | |
| "learning_rate": 5.5552203426354916e-05, | |
| "loss": 3.1178, | |
| "step": 84350 | |
| }, | |
| { | |
| "epoch": 9.08405984285868, | |
| "grad_norm": 0.8536664247512817, | |
| "learning_rate": 5.52289623962935e-05, | |
| "loss": 3.1278, | |
| "step": 84400 | |
| }, | |
| { | |
| "epoch": 9.089441394898289, | |
| "grad_norm": 0.865757405757904, | |
| "learning_rate": 5.4905721366232085e-05, | |
| "loss": 3.126, | |
| "step": 84450 | |
| }, | |
| { | |
| "epoch": 9.094822946937898, | |
| "grad_norm": 0.8798834681510925, | |
| "learning_rate": 5.4582480336170666e-05, | |
| "loss": 3.1296, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 9.100204498977504, | |
| "grad_norm": 0.8854317665100098, | |
| "learning_rate": 5.4259239306109254e-05, | |
| "loss": 3.1397, | |
| "step": 84550 | |
| }, | |
| { | |
| "epoch": 9.105586051017113, | |
| "grad_norm": 0.9418936371803284, | |
| "learning_rate": 5.3935998276047835e-05, | |
| "loss": 3.1341, | |
| "step": 84600 | |
| }, | |
| { | |
| "epoch": 9.110967603056721, | |
| "grad_norm": 0.8872946500778198, | |
| "learning_rate": 5.3612757245986416e-05, | |
| "loss": 3.1232, | |
| "step": 84650 | |
| }, | |
| { | |
| "epoch": 9.11634915509633, | |
| "grad_norm": 0.8650902509689331, | |
| "learning_rate": 5.3289516215925004e-05, | |
| "loss": 3.1424, | |
| "step": 84700 | |
| }, | |
| { | |
| "epoch": 9.121730707135939, | |
| "grad_norm": 0.8672756552696228, | |
| "learning_rate": 5.2966275185863585e-05, | |
| "loss": 3.1227, | |
| "step": 84750 | |
| }, | |
| { | |
| "epoch": 9.127112259175545, | |
| "grad_norm": 0.8339865803718567, | |
| "learning_rate": 5.264303415580217e-05, | |
| "loss": 3.1347, | |
| "step": 84800 | |
| }, | |
| { | |
| "epoch": 9.132493811215154, | |
| "grad_norm": 0.8523164987564087, | |
| "learning_rate": 5.2319793125740754e-05, | |
| "loss": 3.1382, | |
| "step": 84850 | |
| }, | |
| { | |
| "epoch": 9.137875363254762, | |
| "grad_norm": 0.9534077644348145, | |
| "learning_rate": 5.1996552095679336e-05, | |
| "loss": 3.1449, | |
| "step": 84900 | |
| }, | |
| { | |
| "epoch": 9.143256915294371, | |
| "grad_norm": 0.8613294959068298, | |
| "learning_rate": 5.1673311065617923e-05, | |
| "loss": 3.123, | |
| "step": 84950 | |
| }, | |
| { | |
| "epoch": 9.14863846733398, | |
| "grad_norm": 0.8886215686798096, | |
| "learning_rate": 5.1350070035556505e-05, | |
| "loss": 3.1207, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 9.14863846733398, | |
| "eval_accuracy": 0.3929050316174471, | |
| "eval_loss": 3.315538167953491, | |
| "eval_runtime": 184.3337, | |
| "eval_samples_per_second": 97.709, | |
| "eval_steps_per_second": 6.108, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 9.154020019373588, | |
| "grad_norm": 0.8625528216362, | |
| "learning_rate": 5.102682900549509e-05, | |
| "loss": 3.1354, | |
| "step": 85050 | |
| }, | |
| { | |
| "epoch": 9.159401571413195, | |
| "grad_norm": 0.8478276133537292, | |
| "learning_rate": 5.0703587975433674e-05, | |
| "loss": 3.1324, | |
| "step": 85100 | |
| }, | |
| { | |
| "epoch": 9.164783123452803, | |
| "grad_norm": 0.8763461709022522, | |
| "learning_rate": 5.0380346945372255e-05, | |
| "loss": 3.139, | |
| "step": 85150 | |
| }, | |
| { | |
| "epoch": 9.170164675492412, | |
| "grad_norm": 0.8898596167564392, | |
| "learning_rate": 5.005710591531085e-05, | |
| "loss": 3.1253, | |
| "step": 85200 | |
| }, | |
| { | |
| "epoch": 9.17554622753202, | |
| "grad_norm": 0.8577936291694641, | |
| "learning_rate": 4.973386488524943e-05, | |
| "loss": 3.1247, | |
| "step": 85250 | |
| }, | |
| { | |
| "epoch": 9.180927779571629, | |
| "grad_norm": 0.8351694345474243, | |
| "learning_rate": 4.941062385518802e-05, | |
| "loss": 3.1322, | |
| "step": 85300 | |
| }, | |
| { | |
| "epoch": 9.186309331611236, | |
| "grad_norm": 0.8371776938438416, | |
| "learning_rate": 4.90873828251266e-05, | |
| "loss": 3.134, | |
| "step": 85350 | |
| }, | |
| { | |
| "epoch": 9.191690883650844, | |
| "grad_norm": 0.8292602896690369, | |
| "learning_rate": 4.876414179506518e-05, | |
| "loss": 3.1191, | |
| "step": 85400 | |
| }, | |
| { | |
| "epoch": 9.197072435690453, | |
| "grad_norm": 0.8805650472640991, | |
| "learning_rate": 4.844090076500377e-05, | |
| "loss": 3.1384, | |
| "step": 85450 | |
| }, | |
| { | |
| "epoch": 9.202453987730062, | |
| "grad_norm": 0.8351313471794128, | |
| "learning_rate": 4.811765973494235e-05, | |
| "loss": 3.1286, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 9.20783553976967, | |
| "grad_norm": 0.8334101438522339, | |
| "learning_rate": 4.779441870488094e-05, | |
| "loss": 3.138, | |
| "step": 85550 | |
| }, | |
| { | |
| "epoch": 9.213217091809279, | |
| "grad_norm": 0.871560275554657, | |
| "learning_rate": 4.747117767481952e-05, | |
| "loss": 3.1333, | |
| "step": 85600 | |
| }, | |
| { | |
| "epoch": 9.218598643848885, | |
| "grad_norm": 0.9008331894874573, | |
| "learning_rate": 4.71479366447581e-05, | |
| "loss": 3.1517, | |
| "step": 85650 | |
| }, | |
| { | |
| "epoch": 9.223980195888494, | |
| "grad_norm": 0.8196895718574524, | |
| "learning_rate": 4.682469561469669e-05, | |
| "loss": 3.1241, | |
| "step": 85700 | |
| }, | |
| { | |
| "epoch": 9.229361747928102, | |
| "grad_norm": 0.8557183742523193, | |
| "learning_rate": 4.650145458463527e-05, | |
| "loss": 3.1314, | |
| "step": 85750 | |
| }, | |
| { | |
| "epoch": 9.234743299967711, | |
| "grad_norm": 0.8415449261665344, | |
| "learning_rate": 4.6178213554573856e-05, | |
| "loss": 3.1192, | |
| "step": 85800 | |
| }, | |
| { | |
| "epoch": 9.24012485200732, | |
| "grad_norm": 0.8404421210289001, | |
| "learning_rate": 4.585497252451244e-05, | |
| "loss": 3.1095, | |
| "step": 85850 | |
| }, | |
| { | |
| "epoch": 9.245506404046926, | |
| "grad_norm": 0.9417492151260376, | |
| "learning_rate": 4.553173149445102e-05, | |
| "loss": 3.1309, | |
| "step": 85900 | |
| }, | |
| { | |
| "epoch": 9.250887956086535, | |
| "grad_norm": 0.8479632139205933, | |
| "learning_rate": 4.520849046438961e-05, | |
| "loss": 3.1308, | |
| "step": 85950 | |
| }, | |
| { | |
| "epoch": 9.256269508126143, | |
| "grad_norm": 0.8539721369743347, | |
| "learning_rate": 4.488524943432819e-05, | |
| "loss": 3.1319, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 9.256269508126143, | |
| "eval_accuracy": 0.3930026019107045, | |
| "eval_loss": 3.3137943744659424, | |
| "eval_runtime": 184.4287, | |
| "eval_samples_per_second": 97.658, | |
| "eval_steps_per_second": 6.105, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 9.261651060165752, | |
| "grad_norm": 0.885164737701416, | |
| "learning_rate": 4.456200840426678e-05, | |
| "loss": 3.1171, | |
| "step": 86050 | |
| }, | |
| { | |
| "epoch": 9.26703261220536, | |
| "grad_norm": 0.90435791015625, | |
| "learning_rate": 4.4238767374205363e-05, | |
| "loss": 3.1469, | |
| "step": 86100 | |
| }, | |
| { | |
| "epoch": 9.272414164244967, | |
| "grad_norm": 0.8895688056945801, | |
| "learning_rate": 4.3915526344143945e-05, | |
| "loss": 3.1335, | |
| "step": 86150 | |
| }, | |
| { | |
| "epoch": 9.277795716284576, | |
| "grad_norm": 0.929306149482727, | |
| "learning_rate": 4.359228531408253e-05, | |
| "loss": 3.1421, | |
| "step": 86200 | |
| }, | |
| { | |
| "epoch": 9.283177268324184, | |
| "grad_norm": 0.8387036919593811, | |
| "learning_rate": 4.3269044284021114e-05, | |
| "loss": 3.1357, | |
| "step": 86250 | |
| }, | |
| { | |
| "epoch": 9.288558820363793, | |
| "grad_norm": 0.856637716293335, | |
| "learning_rate": 4.29458032539597e-05, | |
| "loss": 3.1339, | |
| "step": 86300 | |
| }, | |
| { | |
| "epoch": 9.293940372403402, | |
| "grad_norm": 0.8665279150009155, | |
| "learning_rate": 4.262256222389828e-05, | |
| "loss": 3.1129, | |
| "step": 86350 | |
| }, | |
| { | |
| "epoch": 9.29932192444301, | |
| "grad_norm": 0.8598510026931763, | |
| "learning_rate": 4.229932119383687e-05, | |
| "loss": 3.1295, | |
| "step": 86400 | |
| }, | |
| { | |
| "epoch": 9.304703476482617, | |
| "grad_norm": 0.8456576466560364, | |
| "learning_rate": 4.197608016377545e-05, | |
| "loss": 3.1304, | |
| "step": 86450 | |
| }, | |
| { | |
| "epoch": 9.310085028522225, | |
| "grad_norm": 0.856044590473175, | |
| "learning_rate": 4.165283913371403e-05, | |
| "loss": 3.1488, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 9.315466580561834, | |
| "grad_norm": 0.868798017501831, | |
| "learning_rate": 4.132959810365262e-05, | |
| "loss": 3.1286, | |
| "step": 86550 | |
| }, | |
| { | |
| "epoch": 9.320848132601443, | |
| "grad_norm": 0.8525956869125366, | |
| "learning_rate": 4.10063570735912e-05, | |
| "loss": 3.1158, | |
| "step": 86600 | |
| }, | |
| { | |
| "epoch": 9.326229684641051, | |
| "grad_norm": 0.8735033869743347, | |
| "learning_rate": 4.068311604352979e-05, | |
| "loss": 3.1108, | |
| "step": 86650 | |
| }, | |
| { | |
| "epoch": 9.331611236680658, | |
| "grad_norm": 0.8608236312866211, | |
| "learning_rate": 4.035987501346837e-05, | |
| "loss": 3.136, | |
| "step": 86700 | |
| }, | |
| { | |
| "epoch": 9.336992788720266, | |
| "grad_norm": 0.8945574760437012, | |
| "learning_rate": 4.003663398340695e-05, | |
| "loss": 3.1444, | |
| "step": 86750 | |
| }, | |
| { | |
| "epoch": 9.342374340759875, | |
| "grad_norm": 0.8776229023933411, | |
| "learning_rate": 3.9713392953345546e-05, | |
| "loss": 3.1236, | |
| "step": 86800 | |
| }, | |
| { | |
| "epoch": 9.347755892799483, | |
| "grad_norm": 0.8870150446891785, | |
| "learning_rate": 3.939015192328413e-05, | |
| "loss": 3.1294, | |
| "step": 86850 | |
| }, | |
| { | |
| "epoch": 9.353137444839092, | |
| "grad_norm": 0.8757968544960022, | |
| "learning_rate": 3.9066910893222715e-05, | |
| "loss": 3.1108, | |
| "step": 86900 | |
| }, | |
| { | |
| "epoch": 9.3585189968787, | |
| "grad_norm": 0.8697242140769958, | |
| "learning_rate": 3.8743669863161296e-05, | |
| "loss": 3.1186, | |
| "step": 86950 | |
| }, | |
| { | |
| "epoch": 9.363900548918307, | |
| "grad_norm": 0.9357397556304932, | |
| "learning_rate": 3.842042883309988e-05, | |
| "loss": 3.1304, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 9.363900548918307, | |
| "eval_accuracy": 0.3935234838548871, | |
| "eval_loss": 3.3103060722351074, | |
| "eval_runtime": 184.3264, | |
| "eval_samples_per_second": 97.713, | |
| "eval_steps_per_second": 6.109, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 9.369282100957916, | |
| "grad_norm": 0.8500669598579407, | |
| "learning_rate": 3.8097187803038465e-05, | |
| "loss": 3.1281, | |
| "step": 87050 | |
| }, | |
| { | |
| "epoch": 9.374663652997524, | |
| "grad_norm": 0.938818097114563, | |
| "learning_rate": 3.7773946772977047e-05, | |
| "loss": 3.1266, | |
| "step": 87100 | |
| }, | |
| { | |
| "epoch": 9.380045205037133, | |
| "grad_norm": 0.8869884610176086, | |
| "learning_rate": 3.745070574291563e-05, | |
| "loss": 3.1197, | |
| "step": 87150 | |
| }, | |
| { | |
| "epoch": 9.385426757076742, | |
| "grad_norm": 0.8740733861923218, | |
| "learning_rate": 3.7127464712854216e-05, | |
| "loss": 3.1127, | |
| "step": 87200 | |
| }, | |
| { | |
| "epoch": 9.390808309116348, | |
| "grad_norm": 0.8429086804389954, | |
| "learning_rate": 3.6804223682792803e-05, | |
| "loss": 3.1368, | |
| "step": 87250 | |
| }, | |
| { | |
| "epoch": 9.396189861155957, | |
| "grad_norm": 0.8436853885650635, | |
| "learning_rate": 3.6480982652731385e-05, | |
| "loss": 3.1329, | |
| "step": 87300 | |
| }, | |
| { | |
| "epoch": 9.401571413195565, | |
| "grad_norm": 0.8691253662109375, | |
| "learning_rate": 3.6157741622669966e-05, | |
| "loss": 3.1343, | |
| "step": 87350 | |
| }, | |
| { | |
| "epoch": 9.406952965235174, | |
| "grad_norm": 0.8493805527687073, | |
| "learning_rate": 3.5834500592608554e-05, | |
| "loss": 3.1322, | |
| "step": 87400 | |
| }, | |
| { | |
| "epoch": 9.412334517274783, | |
| "grad_norm": 0.8372256755828857, | |
| "learning_rate": 3.5511259562547135e-05, | |
| "loss": 3.1282, | |
| "step": 87450 | |
| }, | |
| { | |
| "epoch": 9.417716069314391, | |
| "grad_norm": 0.9369255304336548, | |
| "learning_rate": 3.518801853248572e-05, | |
| "loss": 3.1355, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 9.423097621353998, | |
| "grad_norm": 0.8417009711265564, | |
| "learning_rate": 3.4864777502424304e-05, | |
| "loss": 3.1429, | |
| "step": 87550 | |
| }, | |
| { | |
| "epoch": 9.428479173393606, | |
| "grad_norm": 0.8512555360794067, | |
| "learning_rate": 3.4541536472362885e-05, | |
| "loss": 3.1554, | |
| "step": 87600 | |
| }, | |
| { | |
| "epoch": 9.433860725433215, | |
| "grad_norm": 0.8206108808517456, | |
| "learning_rate": 3.421829544230147e-05, | |
| "loss": 3.15, | |
| "step": 87650 | |
| }, | |
| { | |
| "epoch": 9.439242277472824, | |
| "grad_norm": 0.8449435234069824, | |
| "learning_rate": 3.389505441224006e-05, | |
| "loss": 3.1204, | |
| "step": 87700 | |
| }, | |
| { | |
| "epoch": 9.444623829512432, | |
| "grad_norm": 0.8863608837127686, | |
| "learning_rate": 3.357181338217864e-05, | |
| "loss": 3.1457, | |
| "step": 87750 | |
| }, | |
| { | |
| "epoch": 9.450005381552039, | |
| "grad_norm": 0.8533875346183777, | |
| "learning_rate": 3.324857235211722e-05, | |
| "loss": 3.1373, | |
| "step": 87800 | |
| }, | |
| { | |
| "epoch": 9.455386933591647, | |
| "grad_norm": 0.8496246337890625, | |
| "learning_rate": 3.292533132205581e-05, | |
| "loss": 3.1229, | |
| "step": 87850 | |
| }, | |
| { | |
| "epoch": 9.460768485631256, | |
| "grad_norm": 0.8718200922012329, | |
| "learning_rate": 3.260209029199439e-05, | |
| "loss": 3.1419, | |
| "step": 87900 | |
| }, | |
| { | |
| "epoch": 9.466150037670864, | |
| "grad_norm": 0.8778585195541382, | |
| "learning_rate": 3.227884926193298e-05, | |
| "loss": 3.1365, | |
| "step": 87950 | |
| }, | |
| { | |
| "epoch": 9.471531589710473, | |
| "grad_norm": 0.8406819105148315, | |
| "learning_rate": 3.195560823187157e-05, | |
| "loss": 3.1322, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 9.471531589710473, | |
| "eval_accuracy": 0.3937116706565818, | |
| "eval_loss": 3.308526039123535, | |
| "eval_runtime": 184.3146, | |
| "eval_samples_per_second": 97.719, | |
| "eval_steps_per_second": 6.109, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 9.476913141750082, | |
| "grad_norm": 0.8434910178184509, | |
| "learning_rate": 3.163236720181014e-05, | |
| "loss": 3.1451, | |
| "step": 88050 | |
| }, | |
| { | |
| "epoch": 9.482294693789688, | |
| "grad_norm": 0.8757779598236084, | |
| "learning_rate": 3.130912617174873e-05, | |
| "loss": 3.1069, | |
| "step": 88100 | |
| }, | |
| { | |
| "epoch": 9.487676245829297, | |
| "grad_norm": 0.8904576301574707, | |
| "learning_rate": 3.098588514168732e-05, | |
| "loss": 3.1308, | |
| "step": 88150 | |
| }, | |
| { | |
| "epoch": 9.493057797868905, | |
| "grad_norm": 0.8869066834449768, | |
| "learning_rate": 3.06626441116259e-05, | |
| "loss": 3.1355, | |
| "step": 88200 | |
| }, | |
| { | |
| "epoch": 9.498439349908514, | |
| "grad_norm": 0.9309498071670532, | |
| "learning_rate": 3.0339403081564487e-05, | |
| "loss": 3.1306, | |
| "step": 88250 | |
| }, | |
| { | |
| "epoch": 9.503820901948123, | |
| "grad_norm": 0.8875665664672852, | |
| "learning_rate": 3.0016162051503068e-05, | |
| "loss": 3.1287, | |
| "step": 88300 | |
| }, | |
| { | |
| "epoch": 9.50920245398773, | |
| "grad_norm": 0.8742145895957947, | |
| "learning_rate": 2.9692921021441652e-05, | |
| "loss": 3.1505, | |
| "step": 88350 | |
| }, | |
| { | |
| "epoch": 9.514584006027338, | |
| "grad_norm": 0.8632606863975525, | |
| "learning_rate": 2.9369679991380237e-05, | |
| "loss": 3.1422, | |
| "step": 88400 | |
| }, | |
| { | |
| "epoch": 9.519965558066946, | |
| "grad_norm": 0.9001516103744507, | |
| "learning_rate": 2.904643896131882e-05, | |
| "loss": 3.1404, | |
| "step": 88450 | |
| }, | |
| { | |
| "epoch": 9.525347110106555, | |
| "grad_norm": 0.8595061898231506, | |
| "learning_rate": 2.8723197931257406e-05, | |
| "loss": 3.1312, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 9.530728662146164, | |
| "grad_norm": 0.8811789751052856, | |
| "learning_rate": 2.8399956901195987e-05, | |
| "loss": 3.131, | |
| "step": 88550 | |
| }, | |
| { | |
| "epoch": 9.536110214185772, | |
| "grad_norm": 0.8497152924537659, | |
| "learning_rate": 2.807671587113457e-05, | |
| "loss": 3.1318, | |
| "step": 88600 | |
| }, | |
| { | |
| "epoch": 9.541491766225379, | |
| "grad_norm": 0.8783244490623474, | |
| "learning_rate": 2.775347484107316e-05, | |
| "loss": 3.1358, | |
| "step": 88650 | |
| }, | |
| { | |
| "epoch": 9.546873318264987, | |
| "grad_norm": 0.8505585193634033, | |
| "learning_rate": 2.7430233811011744e-05, | |
| "loss": 3.1534, | |
| "step": 88700 | |
| }, | |
| { | |
| "epoch": 9.552254870304596, | |
| "grad_norm": 0.8915456533432007, | |
| "learning_rate": 2.7106992780950328e-05, | |
| "loss": 3.1278, | |
| "step": 88750 | |
| }, | |
| { | |
| "epoch": 9.557636422344205, | |
| "grad_norm": 0.821374237537384, | |
| "learning_rate": 2.678375175088891e-05, | |
| "loss": 3.1401, | |
| "step": 88800 | |
| }, | |
| { | |
| "epoch": 9.563017974383813, | |
| "grad_norm": 0.8537230491638184, | |
| "learning_rate": 2.6460510720827494e-05, | |
| "loss": 3.12, | |
| "step": 88850 | |
| }, | |
| { | |
| "epoch": 9.56839952642342, | |
| "grad_norm": 0.8617140054702759, | |
| "learning_rate": 2.6137269690766078e-05, | |
| "loss": 3.1276, | |
| "step": 88900 | |
| }, | |
| { | |
| "epoch": 9.573781078463028, | |
| "grad_norm": 0.8495514988899231, | |
| "learning_rate": 2.5814028660704663e-05, | |
| "loss": 3.1407, | |
| "step": 88950 | |
| }, | |
| { | |
| "epoch": 9.579162630502637, | |
| "grad_norm": 0.8824370503425598, | |
| "learning_rate": 2.5490787630643247e-05, | |
| "loss": 3.1383, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 9.579162630502637, | |
| "eval_accuracy": 0.39395125027421274, | |
| "eval_loss": 3.306459426879883, | |
| "eval_runtime": 184.351, | |
| "eval_samples_per_second": 97.7, | |
| "eval_steps_per_second": 6.108, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 9.584544182542245, | |
| "grad_norm": 0.9219626784324646, | |
| "learning_rate": 2.5167546600581828e-05, | |
| "loss": 3.1377, | |
| "step": 89050 | |
| }, | |
| { | |
| "epoch": 9.589925734581854, | |
| "grad_norm": 0.8579556941986084, | |
| "learning_rate": 2.4844305570520416e-05, | |
| "loss": 3.1249, | |
| "step": 89100 | |
| }, | |
| { | |
| "epoch": 9.59530728662146, | |
| "grad_norm": 0.8856773972511292, | |
| "learning_rate": 2.4521064540459e-05, | |
| "loss": 3.1391, | |
| "step": 89150 | |
| }, | |
| { | |
| "epoch": 9.60068883866107, | |
| "grad_norm": 0.9169157147407532, | |
| "learning_rate": 2.4197823510397585e-05, | |
| "loss": 3.1445, | |
| "step": 89200 | |
| }, | |
| { | |
| "epoch": 9.606070390700678, | |
| "grad_norm": 0.8753631711006165, | |
| "learning_rate": 2.387458248033617e-05, | |
| "loss": 3.1216, | |
| "step": 89250 | |
| }, | |
| { | |
| "epoch": 9.611451942740286, | |
| "grad_norm": 0.8575186133384705, | |
| "learning_rate": 2.355134145027475e-05, | |
| "loss": 3.1357, | |
| "step": 89300 | |
| }, | |
| { | |
| "epoch": 9.616833494779895, | |
| "grad_norm": 0.8461126089096069, | |
| "learning_rate": 2.3228100420213335e-05, | |
| "loss": 3.1182, | |
| "step": 89350 | |
| }, | |
| { | |
| "epoch": 9.622215046819504, | |
| "grad_norm": 0.8273517489433289, | |
| "learning_rate": 2.290485939015192e-05, | |
| "loss": 3.1433, | |
| "step": 89400 | |
| }, | |
| { | |
| "epoch": 9.62759659885911, | |
| "grad_norm": 0.8721249103546143, | |
| "learning_rate": 2.2581618360090508e-05, | |
| "loss": 3.1381, | |
| "step": 89450 | |
| }, | |
| { | |
| "epoch": 9.632978150898719, | |
| "grad_norm": 0.8490434288978577, | |
| "learning_rate": 2.2258377330029092e-05, | |
| "loss": 3.1333, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 9.638359702938327, | |
| "grad_norm": 0.877873420715332, | |
| "learning_rate": 2.1935136299967673e-05, | |
| "loss": 3.119, | |
| "step": 89550 | |
| }, | |
| { | |
| "epoch": 9.643741254977936, | |
| "grad_norm": 0.8854243755340576, | |
| "learning_rate": 2.1611895269906258e-05, | |
| "loss": 3.1336, | |
| "step": 89600 | |
| }, | |
| { | |
| "epoch": 9.649122807017545, | |
| "grad_norm": 0.883566677570343, | |
| "learning_rate": 2.1288654239844842e-05, | |
| "loss": 3.1233, | |
| "step": 89650 | |
| }, | |
| { | |
| "epoch": 9.654504359057151, | |
| "grad_norm": 0.9156243205070496, | |
| "learning_rate": 2.0965413209783427e-05, | |
| "loss": 3.1349, | |
| "step": 89700 | |
| }, | |
| { | |
| "epoch": 9.65988591109676, | |
| "grad_norm": 0.8827773332595825, | |
| "learning_rate": 2.064217217972201e-05, | |
| "loss": 3.1359, | |
| "step": 89750 | |
| }, | |
| { | |
| "epoch": 9.665267463136368, | |
| "grad_norm": 0.8543577194213867, | |
| "learning_rate": 2.0318931149660596e-05, | |
| "loss": 3.1209, | |
| "step": 89800 | |
| }, | |
| { | |
| "epoch": 9.670649015175977, | |
| "grad_norm": 0.8862645626068115, | |
| "learning_rate": 1.9995690119599177e-05, | |
| "loss": 3.1338, | |
| "step": 89850 | |
| }, | |
| { | |
| "epoch": 9.676030567215586, | |
| "grad_norm": 0.8666056990623474, | |
| "learning_rate": 1.9672449089537765e-05, | |
| "loss": 3.1261, | |
| "step": 89900 | |
| }, | |
| { | |
| "epoch": 9.681412119255192, | |
| "grad_norm": 0.8972306847572327, | |
| "learning_rate": 1.934920805947635e-05, | |
| "loss": 3.1226, | |
| "step": 89950 | |
| }, | |
| { | |
| "epoch": 9.6867936712948, | |
| "grad_norm": 0.859221339225769, | |
| "learning_rate": 1.9025967029414934e-05, | |
| "loss": 3.1251, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 9.6867936712948, | |
| "eval_accuracy": 0.3943069798289173, | |
| "eval_loss": 3.30434513092041, | |
| "eval_runtime": 184.4405, | |
| "eval_samples_per_second": 97.652, | |
| "eval_steps_per_second": 6.105, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 9.69217522333441, | |
| "grad_norm": 0.8957327604293823, | |
| "learning_rate": 1.8702725999353515e-05, | |
| "loss": 3.1281, | |
| "step": 90050 | |
| }, | |
| { | |
| "epoch": 9.697556775374018, | |
| "grad_norm": 0.8474478125572205, | |
| "learning_rate": 1.8379484969292103e-05, | |
| "loss": 3.14, | |
| "step": 90100 | |
| }, | |
| { | |
| "epoch": 9.702938327413626, | |
| "grad_norm": 0.8843562006950378, | |
| "learning_rate": 1.8056243939230684e-05, | |
| "loss": 3.1401, | |
| "step": 90150 | |
| }, | |
| { | |
| "epoch": 9.708319879453235, | |
| "grad_norm": 0.8579201698303223, | |
| "learning_rate": 1.7733002909169268e-05, | |
| "loss": 3.1439, | |
| "step": 90200 | |
| }, | |
| { | |
| "epoch": 9.713701431492842, | |
| "grad_norm": 0.8406257629394531, | |
| "learning_rate": 1.741622669970908e-05, | |
| "loss": 3.1415, | |
| "step": 90250 | |
| }, | |
| { | |
| "epoch": 9.71908298353245, | |
| "grad_norm": 0.8735593557357788, | |
| "learning_rate": 1.7092985669647666e-05, | |
| "loss": 3.1417, | |
| "step": 90300 | |
| }, | |
| { | |
| "epoch": 9.724464535572059, | |
| "grad_norm": 0.8594205975532532, | |
| "learning_rate": 1.676974463958625e-05, | |
| "loss": 3.1401, | |
| "step": 90350 | |
| }, | |
| { | |
| "epoch": 9.729846087611667, | |
| "grad_norm": 0.8785849213600159, | |
| "learning_rate": 1.6446503609524835e-05, | |
| "loss": 3.1289, | |
| "step": 90400 | |
| }, | |
| { | |
| "epoch": 9.735227639651276, | |
| "grad_norm": 0.8445061445236206, | |
| "learning_rate": 1.612326257946342e-05, | |
| "loss": 3.1322, | |
| "step": 90450 | |
| }, | |
| { | |
| "epoch": 9.740609191690883, | |
| "grad_norm": 0.8652383089065552, | |
| "learning_rate": 1.5800021549402004e-05, | |
| "loss": 3.1351, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 9.745990743730491, | |
| "grad_norm": 0.8727036118507385, | |
| "learning_rate": 1.5476780519340585e-05, | |
| "loss": 3.1276, | |
| "step": 90550 | |
| }, | |
| { | |
| "epoch": 9.7513722957701, | |
| "grad_norm": 0.9171848893165588, | |
| "learning_rate": 1.5153539489279171e-05, | |
| "loss": 3.1363, | |
| "step": 90600 | |
| }, | |
| { | |
| "epoch": 9.756753847809708, | |
| "grad_norm": 0.8866875171661377, | |
| "learning_rate": 1.4830298459217756e-05, | |
| "loss": 3.1437, | |
| "step": 90650 | |
| }, | |
| { | |
| "epoch": 9.762135399849317, | |
| "grad_norm": 0.8701785802841187, | |
| "learning_rate": 1.4507057429156339e-05, | |
| "loss": 3.1413, | |
| "step": 90700 | |
| }, | |
| { | |
| "epoch": 9.767516951888926, | |
| "grad_norm": 0.9043261408805847, | |
| "learning_rate": 1.4183816399094925e-05, | |
| "loss": 3.1366, | |
| "step": 90750 | |
| }, | |
| { | |
| "epoch": 9.772898503928532, | |
| "grad_norm": 0.8844500780105591, | |
| "learning_rate": 1.3860575369033508e-05, | |
| "loss": 3.1355, | |
| "step": 90800 | |
| }, | |
| { | |
| "epoch": 9.77828005596814, | |
| "grad_norm": 0.8667038083076477, | |
| "learning_rate": 1.3537334338972092e-05, | |
| "loss": 3.1278, | |
| "step": 90850 | |
| }, | |
| { | |
| "epoch": 9.78366160800775, | |
| "grad_norm": 0.9178418517112732, | |
| "learning_rate": 1.3214093308910678e-05, | |
| "loss": 3.128, | |
| "step": 90900 | |
| }, | |
| { | |
| "epoch": 9.789043160047358, | |
| "grad_norm": 0.8584386706352234, | |
| "learning_rate": 1.2890852278849261e-05, | |
| "loss": 3.12, | |
| "step": 90950 | |
| }, | |
| { | |
| "epoch": 9.794424712086967, | |
| "grad_norm": 0.857905387878418, | |
| "learning_rate": 1.2567611248787846e-05, | |
| "loss": 3.1278, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 9.794424712086967, | |
| "eval_accuracy": 0.3944417094098073, | |
| "eval_loss": 3.302494525909424, | |
| "eval_runtime": 184.3867, | |
| "eval_samples_per_second": 97.681, | |
| "eval_steps_per_second": 6.107, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 9.799806264126573, | |
| "grad_norm": 0.8387991786003113, | |
| "learning_rate": 1.2244370218726428e-05, | |
| "loss": 3.1491, | |
| "step": 91050 | |
| }, | |
| { | |
| "epoch": 9.805187816166182, | |
| "grad_norm": 0.8886919021606445, | |
| "learning_rate": 1.1921129188665013e-05, | |
| "loss": 3.127, | |
| "step": 91100 | |
| }, | |
| { | |
| "epoch": 9.81056936820579, | |
| "grad_norm": 0.8610341548919678, | |
| "learning_rate": 1.1597888158603599e-05, | |
| "loss": 3.1222, | |
| "step": 91150 | |
| }, | |
| { | |
| "epoch": 9.815950920245399, | |
| "grad_norm": 0.9010535478591919, | |
| "learning_rate": 1.1274647128542182e-05, | |
| "loss": 3.1197, | |
| "step": 91200 | |
| }, | |
| { | |
| "epoch": 9.821332472285007, | |
| "grad_norm": 0.889415442943573, | |
| "learning_rate": 1.0951406098480766e-05, | |
| "loss": 3.129, | |
| "step": 91250 | |
| }, | |
| { | |
| "epoch": 9.826714024324616, | |
| "grad_norm": 0.8866837024688721, | |
| "learning_rate": 1.0628165068419349e-05, | |
| "loss": 3.1362, | |
| "step": 91300 | |
| }, | |
| { | |
| "epoch": 9.832095576364223, | |
| "grad_norm": 0.8579146265983582, | |
| "learning_rate": 1.0304924038357935e-05, | |
| "loss": 3.1482, | |
| "step": 91350 | |
| }, | |
| { | |
| "epoch": 9.837477128403831, | |
| "grad_norm": 0.8629161715507507, | |
| "learning_rate": 9.98168300829652e-06, | |
| "loss": 3.1269, | |
| "step": 91400 | |
| }, | |
| { | |
| "epoch": 9.84285868044344, | |
| "grad_norm": 0.8659911751747131, | |
| "learning_rate": 9.658441978235103e-06, | |
| "loss": 3.1422, | |
| "step": 91450 | |
| }, | |
| { | |
| "epoch": 9.848240232483048, | |
| "grad_norm": 0.8958485722541809, | |
| "learning_rate": 9.335200948173687e-06, | |
| "loss": 3.1349, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 9.853621784522657, | |
| "grad_norm": 0.8604736328125, | |
| "learning_rate": 9.011959918112272e-06, | |
| "loss": 3.1448, | |
| "step": 91550 | |
| }, | |
| { | |
| "epoch": 9.859003336562264, | |
| "grad_norm": 0.8661377429962158, | |
| "learning_rate": 8.688718888050856e-06, | |
| "loss": 3.1482, | |
| "step": 91600 | |
| }, | |
| { | |
| "epoch": 9.864384888601872, | |
| "grad_norm": 0.8428304195404053, | |
| "learning_rate": 8.36547785798944e-06, | |
| "loss": 3.1223, | |
| "step": 91650 | |
| }, | |
| { | |
| "epoch": 9.869766440641481, | |
| "grad_norm": 0.8337749242782593, | |
| "learning_rate": 8.042236827928023e-06, | |
| "loss": 3.1227, | |
| "step": 91700 | |
| }, | |
| { | |
| "epoch": 9.87514799268109, | |
| "grad_norm": 0.8688568472862244, | |
| "learning_rate": 7.71899579786661e-06, | |
| "loss": 3.1225, | |
| "step": 91750 | |
| }, | |
| { | |
| "epoch": 9.880529544720698, | |
| "grad_norm": 0.8873223662376404, | |
| "learning_rate": 7.395754767805193e-06, | |
| "loss": 3.1411, | |
| "step": 91800 | |
| }, | |
| { | |
| "epoch": 9.885911096760307, | |
| "grad_norm": 0.8440225124359131, | |
| "learning_rate": 7.072513737743777e-06, | |
| "loss": 3.1443, | |
| "step": 91850 | |
| }, | |
| { | |
| "epoch": 9.891292648799913, | |
| "grad_norm": 0.8451166152954102, | |
| "learning_rate": 6.749272707682361e-06, | |
| "loss": 3.1333, | |
| "step": 91900 | |
| }, | |
| { | |
| "epoch": 9.896674200839522, | |
| "grad_norm": 0.9271588921546936, | |
| "learning_rate": 6.426031677620945e-06, | |
| "loss": 3.1229, | |
| "step": 91950 | |
| }, | |
| { | |
| "epoch": 9.90205575287913, | |
| "grad_norm": 0.9085114598274231, | |
| "learning_rate": 6.10279064755953e-06, | |
| "loss": 3.1259, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 9.90205575287913, | |
| "eval_accuracy": 0.3946223991622428, | |
| "eval_loss": 3.3004517555236816, | |
| "eval_runtime": 184.2564, | |
| "eval_samples_per_second": 97.75, | |
| "eval_steps_per_second": 6.111, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 9.907437304918739, | |
| "grad_norm": 0.9208228588104248, | |
| "learning_rate": 5.779549617498114e-06, | |
| "loss": 3.1325, | |
| "step": 92050 | |
| }, | |
| { | |
| "epoch": 9.912818856958348, | |
| "grad_norm": 0.8710732460021973, | |
| "learning_rate": 5.4563085874366985e-06, | |
| "loss": 3.128, | |
| "step": 92100 | |
| }, | |
| { | |
| "epoch": 9.918200408997954, | |
| "grad_norm": 0.8572892546653748, | |
| "learning_rate": 5.133067557375282e-06, | |
| "loss": 3.1317, | |
| "step": 92150 | |
| }, | |
| { | |
| "epoch": 9.923581961037563, | |
| "grad_norm": 0.8705772757530212, | |
| "learning_rate": 4.809826527313866e-06, | |
| "loss": 3.1121, | |
| "step": 92200 | |
| }, | |
| { | |
| "epoch": 9.928963513077171, | |
| "grad_norm": 0.8528627753257751, | |
| "learning_rate": 4.48658549725245e-06, | |
| "loss": 3.1055, | |
| "step": 92250 | |
| }, | |
| { | |
| "epoch": 9.93434506511678, | |
| "grad_norm": 0.8808982968330383, | |
| "learning_rate": 4.169809287792264e-06, | |
| "loss": 3.1417, | |
| "step": 92300 | |
| }, | |
| { | |
| "epoch": 9.939726617156388, | |
| "grad_norm": 0.8688636422157288, | |
| "learning_rate": 3.846568257730847e-06, | |
| "loss": 3.1351, | |
| "step": 92350 | |
| }, | |
| { | |
| "epoch": 9.945108169195997, | |
| "grad_norm": 0.8640137314796448, | |
| "learning_rate": 3.523327227669432e-06, | |
| "loss": 3.124, | |
| "step": 92400 | |
| }, | |
| { | |
| "epoch": 9.950489721235604, | |
| "grad_norm": 0.8745116591453552, | |
| "learning_rate": 3.2000861976080162e-06, | |
| "loss": 3.1238, | |
| "step": 92450 | |
| }, | |
| { | |
| "epoch": 9.955871273275212, | |
| "grad_norm": 0.8759902119636536, | |
| "learning_rate": 2.8768451675466007e-06, | |
| "loss": 3.1328, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 9.961252825314821, | |
| "grad_norm": 0.8939502239227295, | |
| "learning_rate": 2.5536041374851848e-06, | |
| "loss": 3.1505, | |
| "step": 92550 | |
| }, | |
| { | |
| "epoch": 9.96663437735443, | |
| "grad_norm": 0.849872887134552, | |
| "learning_rate": 2.230363107423769e-06, | |
| "loss": 3.11, | |
| "step": 92600 | |
| }, | |
| { | |
| "epoch": 9.972015929394038, | |
| "grad_norm": 0.8717160820960999, | |
| "learning_rate": 1.9071220773623531e-06, | |
| "loss": 3.1217, | |
| "step": 92650 | |
| }, | |
| { | |
| "epoch": 9.977397481433645, | |
| "grad_norm": 0.9214881658554077, | |
| "learning_rate": 1.5838810473009372e-06, | |
| "loss": 3.1166, | |
| "step": 92700 | |
| }, | |
| { | |
| "epoch": 9.982779033473253, | |
| "grad_norm": 0.8742483258247375, | |
| "learning_rate": 1.2606400172395215e-06, | |
| "loss": 3.1271, | |
| "step": 92750 | |
| }, | |
| { | |
| "epoch": 9.988160585512862, | |
| "grad_norm": 0.8844699859619141, | |
| "learning_rate": 9.373989871781058e-07, | |
| "loss": 3.1413, | |
| "step": 92800 | |
| }, | |
| { | |
| "epoch": 9.99354213755247, | |
| "grad_norm": 0.8351616263389587, | |
| "learning_rate": 6.1415795711669e-07, | |
| "loss": 3.1251, | |
| "step": 92850 | |
| }, | |
| { | |
| "epoch": 9.998923689592079, | |
| "grad_norm": 0.8443599939346313, | |
| "learning_rate": 2.909169270552742e-07, | |
| "loss": 3.1367, | |
| "step": 92900 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "step": 92910, | |
| "total_flos": 7.7681859821568e+17, | |
| "train_loss": 3.4559342654807845, | |
| "train_runtime": 80439.6826, | |
| "train_samples_per_second": 36.959, | |
| "train_steps_per_second": 1.155 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 92910, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 10000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.7681859821568e+17, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |