| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9493463695685385, | |
| "eval_steps": 500, | |
| "global_step": 43500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 0.7957815527915955, | |
| "learning_rate": 9.97817594552716e-06, | |
| "loss": 5.9209, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 0.7227272391319275, | |
| "learning_rate": 9.956351891054321e-06, | |
| "loss": 5.8363, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.6150572896003723, | |
| "learning_rate": 9.93452783658148e-06, | |
| "loss": 5.7212, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.4721869230270386, | |
| "learning_rate": 9.912703782108642e-06, | |
| "loss": 5.5998, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.4436299204826355, | |
| "learning_rate": 9.890879727635801e-06, | |
| "loss": 5.4923, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.39484986662864685, | |
| "learning_rate": 9.86905567316296e-06, | |
| "loss": 5.3936, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.336157888174057, | |
| "learning_rate": 9.847231618690121e-06, | |
| "loss": 5.3086, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.28207510709762573, | |
| "learning_rate": 9.82540756421728e-06, | |
| "loss": 5.2389, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.21387213468551636, | |
| "learning_rate": 9.80358350974444e-06, | |
| "loss": 5.1929, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.19382569193840027, | |
| "learning_rate": 9.781759455271601e-06, | |
| "loss": 5.1608, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.18469111621379852, | |
| "learning_rate": 9.75993540079876e-06, | |
| "loss": 5.1402, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.16184094548225403, | |
| "learning_rate": 9.738111346325922e-06, | |
| "loss": 5.121, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.16706742346286774, | |
| "learning_rate": 9.716287291853081e-06, | |
| "loss": 5.1087, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.16692855954170227, | |
| "learning_rate": 9.694463237380242e-06, | |
| "loss": 5.0948, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.15095993876457214, | |
| "learning_rate": 9.672639182907401e-06, | |
| "loss": 5.0854, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.1381816416978836, | |
| "learning_rate": 9.65081512843456e-06, | |
| "loss": 5.0749, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.14653028547763824, | |
| "learning_rate": 9.62899107396172e-06, | |
| "loss": 5.0654, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.15313006937503815, | |
| "learning_rate": 9.607167019488881e-06, | |
| "loss": 5.0578, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.14888273179531097, | |
| "learning_rate": 9.58534296501604e-06, | |
| "loss": 5.0526, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.16502290964126587, | |
| "learning_rate": 9.563518910543202e-06, | |
| "loss": 5.0386, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.15124522149562836, | |
| "learning_rate": 9.541694856070361e-06, | |
| "loss": 5.0313, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.1460607498884201, | |
| "learning_rate": 9.519870801597522e-06, | |
| "loss": 5.0253, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.15850311517715454, | |
| "learning_rate": 9.498046747124681e-06, | |
| "loss": 5.0167, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.21040180325508118, | |
| "learning_rate": 9.476222692651842e-06, | |
| "loss": 5.0112, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.1923857480287552, | |
| "learning_rate": 9.454398638179002e-06, | |
| "loss": 5.002, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.1804654449224472, | |
| "learning_rate": 9.432574583706163e-06, | |
| "loss": 5.0006, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.18370215594768524, | |
| "learning_rate": 9.41075052923332e-06, | |
| "loss": 4.9894, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.22127391397953033, | |
| "learning_rate": 9.388926474760482e-06, | |
| "loss": 4.9866, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.2355722337961197, | |
| "learning_rate": 9.367102420287641e-06, | |
| "loss": 4.9807, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.2686903178691864, | |
| "learning_rate": 9.345278365814802e-06, | |
| "loss": 4.9762, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.2140015810728073, | |
| "learning_rate": 9.323454311341961e-06, | |
| "loss": 4.9703, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.2413332760334015, | |
| "learning_rate": 9.301630256869122e-06, | |
| "loss": 4.9677, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.36390700936317444, | |
| "learning_rate": 9.279806202396282e-06, | |
| "loss": 4.9618, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.20959585905075073, | |
| "learning_rate": 9.257982147923443e-06, | |
| "loss": 4.9577, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.2816881239414215, | |
| "learning_rate": 9.236158093450602e-06, | |
| "loss": 4.9579, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.565319299697876, | |
| "learning_rate": 9.214334038977763e-06, | |
| "loss": 4.9513, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.6039319634437561, | |
| "learning_rate": 9.192509984504923e-06, | |
| "loss": 4.9505, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.18052706122398376, | |
| "learning_rate": 9.170685930032082e-06, | |
| "loss": 4.9481, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.22476325929164886, | |
| "learning_rate": 9.148861875559241e-06, | |
| "loss": 4.945, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.21381278336048126, | |
| "learning_rate": 9.127037821086402e-06, | |
| "loss": 4.942, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.1974494308233261, | |
| "learning_rate": 9.105213766613562e-06, | |
| "loss": 4.9382, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.41070911288261414, | |
| "learning_rate": 9.083389712140723e-06, | |
| "loss": 4.9366, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.1987224966287613, | |
| "learning_rate": 9.061565657667882e-06, | |
| "loss": 4.9356, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.24896694719791412, | |
| "learning_rate": 9.039741603195043e-06, | |
| "loss": 4.934, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.2133394479751587, | |
| "learning_rate": 9.017917548722203e-06, | |
| "loss": 4.9317, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.24069222807884216, | |
| "learning_rate": 8.996093494249362e-06, | |
| "loss": 4.9308, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.30186885595321655, | |
| "learning_rate": 8.974269439776523e-06, | |
| "loss": 4.9294, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.22445006668567657, | |
| "learning_rate": 8.952445385303682e-06, | |
| "loss": 4.9279, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.5278885960578918, | |
| "learning_rate": 8.930621330830842e-06, | |
| "loss": 4.9219, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.34161177277565, | |
| "learning_rate": 8.908797276358003e-06, | |
| "loss": 4.9185, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.46790817379951477, | |
| "learning_rate": 8.886973221885162e-06, | |
| "loss": 4.9219, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.24288785457611084, | |
| "learning_rate": 8.865149167412323e-06, | |
| "loss": 4.9184, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 1.0053044557571411, | |
| "learning_rate": 8.843325112939482e-06, | |
| "loss": 4.9202, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.2430969476699829, | |
| "learning_rate": 8.821501058466644e-06, | |
| "loss": 4.9138, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.4104827642440796, | |
| "learning_rate": 8.799677003993803e-06, | |
| "loss": 4.9119, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.3076772093772888, | |
| "learning_rate": 8.777852949520962e-06, | |
| "loss": 4.9135, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.2760007679462433, | |
| "learning_rate": 8.756028895048123e-06, | |
| "loss": 4.915, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.35674023628234863, | |
| "learning_rate": 8.734204840575283e-06, | |
| "loss": 4.908, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.370185911655426, | |
| "learning_rate": 8.712380786102442e-06, | |
| "loss": 4.9129, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.3864850699901581, | |
| "learning_rate": 8.690556731629603e-06, | |
| "loss": 4.9084, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.5277231335639954, | |
| "learning_rate": 8.668732677156762e-06, | |
| "loss": 4.91, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.4505026936531067, | |
| "learning_rate": 8.646908622683924e-06, | |
| "loss": 4.9074, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.9321696758270264, | |
| "learning_rate": 8.625084568211083e-06, | |
| "loss": 4.905, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.7220796346664429, | |
| "learning_rate": 8.603260513738242e-06, | |
| "loss": 4.907, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.26252031326293945, | |
| "learning_rate": 8.581436459265403e-06, | |
| "loss": 4.9033, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.2740485966205597, | |
| "learning_rate": 8.559612404792563e-06, | |
| "loss": 4.9014, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.5640104413032532, | |
| "learning_rate": 8.537788350319724e-06, | |
| "loss": 4.9019, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.3341173827648163, | |
| "learning_rate": 8.515964295846883e-06, | |
| "loss": 4.8988, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.45002481341362, | |
| "learning_rate": 8.494140241374044e-06, | |
| "loss": 4.8982, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.3300448954105377, | |
| "learning_rate": 8.472316186901203e-06, | |
| "loss": 4.8983, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.38954102993011475, | |
| "learning_rate": 8.450492132428363e-06, | |
| "loss": 4.8994, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.29785066843032837, | |
| "learning_rate": 8.428668077955522e-06, | |
| "loss": 4.8946, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.3793606758117676, | |
| "learning_rate": 8.406844023482683e-06, | |
| "loss": 4.893, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.3125133216381073, | |
| "learning_rate": 8.385019969009843e-06, | |
| "loss": 4.893, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.30539774894714355, | |
| "learning_rate": 8.363195914537004e-06, | |
| "loss": 4.8938, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.36112794280052185, | |
| "learning_rate": 8.341371860064163e-06, | |
| "loss": 4.8952, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.29357707500457764, | |
| "learning_rate": 8.319547805591324e-06, | |
| "loss": 4.8939, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.6816242933273315, | |
| "learning_rate": 8.297723751118483e-06, | |
| "loss": 4.8941, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.46635544300079346, | |
| "learning_rate": 8.275899696645645e-06, | |
| "loss": 4.8914, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.4570920467376709, | |
| "learning_rate": 8.254075642172802e-06, | |
| "loss": 4.8897, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 1.0053654909133911, | |
| "learning_rate": 8.232251587699963e-06, | |
| "loss": 4.8932, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 1.5443705320358276, | |
| "learning_rate": 8.210427533227123e-06, | |
| "loss": 4.8904, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.9791301488876343, | |
| "learning_rate": 8.188603478754284e-06, | |
| "loss": 4.8887, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.41778409481048584, | |
| "learning_rate": 8.166779424281443e-06, | |
| "loss": 4.8881, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.3111547529697418, | |
| "learning_rate": 8.144955369808604e-06, | |
| "loss": 4.888, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.8998810052871704, | |
| "learning_rate": 8.123131315335763e-06, | |
| "loss": 4.8882, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 2.0568580627441406, | |
| "learning_rate": 8.101307260862924e-06, | |
| "loss": 4.8836, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.4567314088344574, | |
| "learning_rate": 8.079483206390084e-06, | |
| "loss": 4.8872, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.8183225393295288, | |
| "learning_rate": 8.057659151917245e-06, | |
| "loss": 4.8832, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.41399893164634705, | |
| "learning_rate": 8.035835097444404e-06, | |
| "loss": 4.8831, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 1.7039474248886108, | |
| "learning_rate": 8.014011042971564e-06, | |
| "loss": 4.8866, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.2951236367225647, | |
| "learning_rate": 7.992186988498723e-06, | |
| "loss": 4.8823, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 2.30894136428833, | |
| "learning_rate": 7.970362934025884e-06, | |
| "loss": 4.8847, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 1.0861660242080688, | |
| "learning_rate": 7.948538879553043e-06, | |
| "loss": 4.8818, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.6845309138298035, | |
| "learning_rate": 7.926714825080204e-06, | |
| "loss": 4.8821, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.36056384444236755, | |
| "learning_rate": 7.904890770607364e-06, | |
| "loss": 4.8786, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.4470602571964264, | |
| "learning_rate": 7.883066716134525e-06, | |
| "loss": 4.8825, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 1.1526983976364136, | |
| "learning_rate": 7.861242661661684e-06, | |
| "loss": 4.8832, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 1.4809672832489014, | |
| "learning_rate": 7.839418607188845e-06, | |
| "loss": 4.88, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.5240287780761719, | |
| "learning_rate": 7.817594552716005e-06, | |
| "loss": 4.8799, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.7026857733726501, | |
| "learning_rate": 7.795770498243164e-06, | |
| "loss": 4.878, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 1.1413593292236328, | |
| "learning_rate": 7.773946443770323e-06, | |
| "loss": 4.879, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 1.347445011138916, | |
| "learning_rate": 7.752122389297484e-06, | |
| "loss": 4.876, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 1.108039379119873, | |
| "learning_rate": 7.730298334824644e-06, | |
| "loss": 4.875, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 1.1911160945892334, | |
| "learning_rate": 7.708474280351805e-06, | |
| "loss": 4.8781, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 1.3719279766082764, | |
| "learning_rate": 7.686650225878964e-06, | |
| "loss": 4.8771, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.9556494355201721, | |
| "learning_rate": 7.664826171406125e-06, | |
| "loss": 4.8763, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.42252323031425476, | |
| "learning_rate": 7.643002116933285e-06, | |
| "loss": 4.8751, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.9392989873886108, | |
| "learning_rate": 7.621178062460445e-06, | |
| "loss": 4.8751, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.5632112622261047, | |
| "learning_rate": 7.599354007987605e-06, | |
| "loss": 4.873, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.8829624056816101, | |
| "learning_rate": 7.577529953514765e-06, | |
| "loss": 4.874, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 1.4162335395812988, | |
| "learning_rate": 7.555705899041924e-06, | |
| "loss": 4.8713, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.8155901432037354, | |
| "learning_rate": 7.533881844569084e-06, | |
| "loss": 4.8728, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 1.4502034187316895, | |
| "learning_rate": 7.512057790096244e-06, | |
| "loss": 4.8763, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.4868534803390503, | |
| "learning_rate": 7.490233735623404e-06, | |
| "loss": 4.8731, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 1.3518391847610474, | |
| "learning_rate": 7.4684096811505646e-06, | |
| "loss": 4.8702, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.5818581581115723, | |
| "learning_rate": 7.446585626677725e-06, | |
| "loss": 4.8709, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 1.434921383857727, | |
| "learning_rate": 7.424761572204885e-06, | |
| "loss": 4.8708, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 1.8794093132019043, | |
| "learning_rate": 7.402937517732045e-06, | |
| "loss": 4.8684, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.3370536267757416, | |
| "learning_rate": 7.381113463259205e-06, | |
| "loss": 4.8711, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 1.7272424697875977, | |
| "learning_rate": 7.359289408786366e-06, | |
| "loss": 4.875, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.3709563612937927, | |
| "learning_rate": 7.337465354313526e-06, | |
| "loss": 4.8682, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.8434191346168518, | |
| "learning_rate": 7.315641299840684e-06, | |
| "loss": 4.8693, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 1.244709849357605, | |
| "learning_rate": 7.2938172453678445e-06, | |
| "loss": 4.8681, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.7124190926551819, | |
| "learning_rate": 7.271993190895005e-06, | |
| "loss": 4.8702, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.8966384530067444, | |
| "learning_rate": 7.250169136422165e-06, | |
| "loss": 4.8706, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.42346426844596863, | |
| "learning_rate": 7.228345081949325e-06, | |
| "loss": 4.8679, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 1.0583099126815796, | |
| "learning_rate": 7.206521027476485e-06, | |
| "loss": 4.8669, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.9891448020935059, | |
| "learning_rate": 7.1846969730036456e-06, | |
| "loss": 4.8688, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.387790322303772, | |
| "learning_rate": 7.162872918530806e-06, | |
| "loss": 4.8668, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 1.7412809133529663, | |
| "learning_rate": 7.141048864057966e-06, | |
| "loss": 4.8658, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.7861388921737671, | |
| "learning_rate": 7.119224809585125e-06, | |
| "loss": 4.8651, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 3.033806085586548, | |
| "learning_rate": 7.0974007551122855e-06, | |
| "loss": 4.8655, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 1.232498049736023, | |
| "learning_rate": 7.075576700639445e-06, | |
| "loss": 4.8633, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.7451385259628296, | |
| "learning_rate": 7.053752646166605e-06, | |
| "loss": 4.8668, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.5436673760414124, | |
| "learning_rate": 7.031928591693765e-06, | |
| "loss": 4.867, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 1.891558289527893, | |
| "learning_rate": 7.0101045372209255e-06, | |
| "loss": 4.8652, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.3846653401851654, | |
| "learning_rate": 6.988280482748086e-06, | |
| "loss": 4.8668, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 1.1685656309127808, | |
| "learning_rate": 6.966456428275246e-06, | |
| "loss": 4.8661, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.38460877537727356, | |
| "learning_rate": 6.944632373802406e-06, | |
| "loss": 4.8651, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.9353694915771484, | |
| "learning_rate": 6.9228083193295655e-06, | |
| "loss": 4.8635, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.7546364068984985, | |
| "learning_rate": 6.900984264856726e-06, | |
| "loss": 4.8639, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 1.9310697317123413, | |
| "learning_rate": 6.879160210383886e-06, | |
| "loss": 4.862, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.5370875597000122, | |
| "learning_rate": 6.857336155911045e-06, | |
| "loss": 4.8633, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.8054449558258057, | |
| "learning_rate": 6.8355121014382055e-06, | |
| "loss": 4.866, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.40611472725868225, | |
| "learning_rate": 6.813688046965366e-06, | |
| "loss": 4.8638, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 1.6744693517684937, | |
| "learning_rate": 6.791863992492526e-06, | |
| "loss": 4.8639, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 2.0651392936706543, | |
| "learning_rate": 6.770039938019686e-06, | |
| "loss": 4.8626, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 1.0164268016815186, | |
| "learning_rate": 6.7482158835468455e-06, | |
| "loss": 4.8646, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 1.5757657289505005, | |
| "learning_rate": 6.726391829074006e-06, | |
| "loss": 4.8642, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.9722656607627869, | |
| "learning_rate": 6.704567774601166e-06, | |
| "loss": 4.8647, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 1.559166431427002, | |
| "learning_rate": 6.682743720128326e-06, | |
| "loss": 4.8623, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 1.8523690700531006, | |
| "learning_rate": 6.660919665655486e-06, | |
| "loss": 4.8643, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 2.1013731956481934, | |
| "learning_rate": 6.6390956111826465e-06, | |
| "loss": 4.8615, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 1.3530488014221191, | |
| "learning_rate": 6.617271556709806e-06, | |
| "loss": 4.8617, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 1.8760623931884766, | |
| "learning_rate": 6.595447502236966e-06, | |
| "loss": 4.8604, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 2.709780216217041, | |
| "learning_rate": 6.5736234477641254e-06, | |
| "loss": 4.86, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 2.935454845428467, | |
| "learning_rate": 6.551799393291286e-06, | |
| "loss": 4.8588, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.8519152402877808, | |
| "learning_rate": 6.529975338818446e-06, | |
| "loss": 4.86, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.5556691288948059, | |
| "learning_rate": 6.508151284345606e-06, | |
| "loss": 4.8596, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.9867918491363525, | |
| "learning_rate": 6.486327229872766e-06, | |
| "loss": 4.8569, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.39979785680770874, | |
| "learning_rate": 6.4645031753999265e-06, | |
| "loss": 4.8594, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.42257916927337646, | |
| "learning_rate": 6.442679120927087e-06, | |
| "loss": 4.8604, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.39576584100723267, | |
| "learning_rate": 6.420855066454247e-06, | |
| "loss": 4.8595, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 1.216848373413086, | |
| "learning_rate": 6.399031011981407e-06, | |
| "loss": 4.8599, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.709771990776062, | |
| "learning_rate": 6.377206957508566e-06, | |
| "loss": 4.8571, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.4081980884075165, | |
| "learning_rate": 6.355382903035726e-06, | |
| "loss": 4.857, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.7303671836853027, | |
| "learning_rate": 6.333558848562886e-06, | |
| "loss": 4.8584, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 1.0993214845657349, | |
| "learning_rate": 6.311734794090046e-06, | |
| "loss": 4.859, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.951707661151886, | |
| "learning_rate": 6.2899107396172064e-06, | |
| "loss": 4.856, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 2.3436036109924316, | |
| "learning_rate": 6.268086685144367e-06, | |
| "loss": 4.8559, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.7691673040390015, | |
| "learning_rate": 6.246262630671527e-06, | |
| "loss": 4.8595, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 1.3263872861862183, | |
| "learning_rate": 6.224438576198687e-06, | |
| "loss": 4.8562, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.6155282258987427, | |
| "learning_rate": 6.202614521725847e-06, | |
| "loss": 4.8578, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 1.459848403930664, | |
| "learning_rate": 6.1807904672530075e-06, | |
| "loss": 4.8536, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 1.0362757444381714, | |
| "learning_rate": 6.158966412780166e-06, | |
| "loss": 4.8559, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.5081543922424316, | |
| "learning_rate": 6.137142358307326e-06, | |
| "loss": 4.8576, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 1.0799150466918945, | |
| "learning_rate": 6.115318303834486e-06, | |
| "loss": 4.8553, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.4601239562034607, | |
| "learning_rate": 6.093494249361647e-06, | |
| "loss": 4.8552, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.4647897779941559, | |
| "learning_rate": 6.071670194888807e-06, | |
| "loss": 4.8515, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.550953209400177, | |
| "learning_rate": 6.049846140415967e-06, | |
| "loss": 4.8559, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 1.748518705368042, | |
| "learning_rate": 6.028022085943127e-06, | |
| "loss": 4.8543, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 2.6062777042388916, | |
| "learning_rate": 6.0061980314702874e-06, | |
| "loss": 4.8571, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.8579190373420715, | |
| "learning_rate": 5.984373976997448e-06, | |
| "loss": 4.8566, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.8633508682250977, | |
| "learning_rate": 5.962549922524608e-06, | |
| "loss": 4.8528, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.5768360495567322, | |
| "learning_rate": 5.940725868051768e-06, | |
| "loss": 4.8559, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 1.7455744743347168, | |
| "learning_rate": 5.918901813578927e-06, | |
| "loss": 4.8577, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.3996860682964325, | |
| "learning_rate": 5.897077759106087e-06, | |
| "loss": 4.8547, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 2.049177885055542, | |
| "learning_rate": 5.875253704633247e-06, | |
| "loss": 4.8549, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 1.0341905355453491, | |
| "learning_rate": 5.853429650160407e-06, | |
| "loss": 4.8506, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.4852238893508911, | |
| "learning_rate": 5.831605595687567e-06, | |
| "loss": 4.8561, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 2.2729053497314453, | |
| "learning_rate": 5.809781541214728e-06, | |
| "loss": 4.8552, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 2.259981393814087, | |
| "learning_rate": 5.787957486741888e-06, | |
| "loss": 4.8506, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 2.0674850940704346, | |
| "learning_rate": 5.766133432269048e-06, | |
| "loss": 4.8543, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 1.3431077003479004, | |
| "learning_rate": 5.744309377796207e-06, | |
| "loss": 4.8522, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.9442102313041687, | |
| "learning_rate": 5.722485323323368e-06, | |
| "loss": 4.8511, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.453003466129303, | |
| "learning_rate": 5.700661268850528e-06, | |
| "loss": 4.8512, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.39568933844566345, | |
| "learning_rate": 5.678837214377687e-06, | |
| "loss": 4.8527, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 1.3336150646209717, | |
| "learning_rate": 5.657013159904847e-06, | |
| "loss": 4.852, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 3.6281139850616455, | |
| "learning_rate": 5.635189105432008e-06, | |
| "loss": 4.853, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.4694347679615021, | |
| "learning_rate": 5.613365050959168e-06, | |
| "loss": 4.8529, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.7866901159286499, | |
| "learning_rate": 5.591540996486328e-06, | |
| "loss": 4.851, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.5806153416633606, | |
| "learning_rate": 5.569716942013487e-06, | |
| "loss": 4.8539, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.5267885327339172, | |
| "learning_rate": 5.5478928875406476e-06, | |
| "loss": 4.8559, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 2.056804656982422, | |
| "learning_rate": 5.526068833067808e-06, | |
| "loss": 4.855, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 3.2893295288085938, | |
| "learning_rate": 5.504244778594968e-06, | |
| "loss": 4.8512, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.9322779774665833, | |
| "learning_rate": 5.482420724122128e-06, | |
| "loss": 4.8525, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.505246639251709, | |
| "learning_rate": 5.4605966696492876e-06, | |
| "loss": 4.8499, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 2.040572166442871, | |
| "learning_rate": 5.438772615176448e-06, | |
| "loss": 4.8509, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.8444960713386536, | |
| "learning_rate": 5.416948560703608e-06, | |
| "loss": 4.8496, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 1.4201112985610962, | |
| "learning_rate": 5.395124506230768e-06, | |
| "loss": 4.8505, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 1.2699060440063477, | |
| "learning_rate": 5.3733004517579275e-06, | |
| "loss": 4.8498, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.7760267853736877, | |
| "learning_rate": 5.351476397285088e-06, | |
| "loss": 4.8506, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.49700111150741577, | |
| "learning_rate": 5.329652342812248e-06, | |
| "loss": 4.8513, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 1.6455672979354858, | |
| "learning_rate": 5.307828288339408e-06, | |
| "loss": 4.8514, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.6039340496063232, | |
| "learning_rate": 5.286004233866568e-06, | |
| "loss": 4.8486, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.9197044968605042, | |
| "learning_rate": 5.2641801793937286e-06, | |
| "loss": 4.8515, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.3925398290157318, | |
| "learning_rate": 5.242356124920889e-06, | |
| "loss": 4.8506, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.0743471384048462, | |
| "learning_rate": 5.220532070448048e-06, | |
| "loss": 4.853, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.4220006763935089, | |
| "learning_rate": 5.1987080159752075e-06, | |
| "loss": 4.8493, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.6162962913513184, | |
| "learning_rate": 5.176883961502368e-06, | |
| "loss": 4.8518, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.6012248992919922, | |
| "learning_rate": 5.155059907029528e-06, | |
| "loss": 4.8468, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.838961124420166, | |
| "learning_rate": 5.133235852556688e-06, | |
| "loss": 4.8511, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 1.9484496116638184, | |
| "learning_rate": 5.111411798083848e-06, | |
| "loss": 4.8523, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 1.7750368118286133, | |
| "learning_rate": 5.0895877436110085e-06, | |
| "loss": 4.8467, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.7409512996673584, | |
| "learning_rate": 5.067763689138169e-06, | |
| "loss": 4.8465, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.6893154382705688, | |
| "learning_rate": 5.045939634665329e-06, | |
| "loss": 4.8507, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.4250375032424927, | |
| "learning_rate": 5.024115580192489e-06, | |
| "loss": 4.8471, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 1.5553233623504639, | |
| "learning_rate": 5.002291525719649e-06, | |
| "loss": 4.847, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.42157289385795593, | |
| "learning_rate": 4.980467471246809e-06, | |
| "loss": 4.8465, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 1.3181480169296265, | |
| "learning_rate": 4.958643416773969e-06, | |
| "loss": 4.8471, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 1.3415127992630005, | |
| "learning_rate": 4.936819362301128e-06, | |
| "loss": 4.8476, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 1.485499382019043, | |
| "learning_rate": 4.9149953078282885e-06, | |
| "loss": 4.8487, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 1.6069031953811646, | |
| "learning_rate": 4.893171253355449e-06, | |
| "loss": 4.8501, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.8160085678100586, | |
| "learning_rate": 4.871347198882609e-06, | |
| "loss": 4.8474, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 2.411562204360962, | |
| "learning_rate": 4.849523144409769e-06, | |
| "loss": 4.8437, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 2.40973162651062, | |
| "learning_rate": 4.827699089936929e-06, | |
| "loss": 4.8459, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.4108150005340576, | |
| "learning_rate": 4.805875035464089e-06, | |
| "loss": 4.8472, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.5337820649147034, | |
| "learning_rate": 4.784050980991249e-06, | |
| "loss": 4.8472, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.6120834350585938, | |
| "learning_rate": 4.762226926518409e-06, | |
| "loss": 4.8471, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 1.2425861358642578, | |
| "learning_rate": 4.740402872045569e-06, | |
| "loss": 4.8476, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 1.7485963106155396, | |
| "learning_rate": 4.7185788175727295e-06, | |
| "loss": 4.8519, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.4650036692619324, | |
| "learning_rate": 4.696754763099889e-06, | |
| "loss": 4.8474, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.4887036681175232, | |
| "learning_rate": 4.674930708627049e-06, | |
| "loss": 4.8464, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 1.113826036453247, | |
| "learning_rate": 4.653106654154209e-06, | |
| "loss": 4.8492, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 1.9913270473480225, | |
| "learning_rate": 4.6312825996813695e-06, | |
| "loss": 4.8456, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 1.8120296001434326, | |
| "learning_rate": 4.60945854520853e-06, | |
| "loss": 4.8465, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 2.4769647121429443, | |
| "learning_rate": 4.587634490735689e-06, | |
| "loss": 4.8434, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.9758397340774536, | |
| "learning_rate": 4.565810436262849e-06, | |
| "loss": 4.8425, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.4331335723400116, | |
| "learning_rate": 4.5439863817900095e-06, | |
| "loss": 4.8479, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.6201393604278564, | |
| "learning_rate": 4.52216232731717e-06, | |
| "loss": 4.848, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 3.273059129714966, | |
| "learning_rate": 4.50033827284433e-06, | |
| "loss": 4.846, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 2.6748781204223633, | |
| "learning_rate": 4.47851421837149e-06, | |
| "loss": 4.8451, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 1.6629786491394043, | |
| "learning_rate": 4.4566901638986495e-06, | |
| "loss": 4.8424, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.5216906070709229, | |
| "learning_rate": 4.43486610942581e-06, | |
| "loss": 4.8483, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.6084491014480591, | |
| "learning_rate": 4.41304205495297e-06, | |
| "loss": 4.8445, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.6803710460662842, | |
| "learning_rate": 4.39121800048013e-06, | |
| "loss": 4.8436, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.8140966892242432, | |
| "learning_rate": 4.3693939460072895e-06, | |
| "loss": 4.8457, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.5089240670204163, | |
| "learning_rate": 4.34756989153445e-06, | |
| "loss": 4.848, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.5802351236343384, | |
| "learning_rate": 4.32574583706161e-06, | |
| "loss": 4.8451, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.4696178734302521, | |
| "learning_rate": 4.30392178258877e-06, | |
| "loss": 4.8445, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.432464063167572, | |
| "learning_rate": 4.2820977281159294e-06, | |
| "loss": 4.8469, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 1.313357949256897, | |
| "learning_rate": 4.26027367364309e-06, | |
| "loss": 4.8451, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.1689280271530151, | |
| "learning_rate": 4.23844961917025e-06, | |
| "loss": 4.8441, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.7955917119979858, | |
| "learning_rate": 4.21662556469741e-06, | |
| "loss": 4.8446, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.5141814947128296, | |
| "learning_rate": 4.1948015102245694e-06, | |
| "loss": 4.848, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.512915849685669, | |
| "learning_rate": 4.17297745575173e-06, | |
| "loss": 4.8441, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 3.6091654300689697, | |
| "learning_rate": 4.15115340127889e-06, | |
| "loss": 4.8445, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.6325381994247437, | |
| "learning_rate": 4.12932934680605e-06, | |
| "loss": 4.8447, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 1.8864593505859375, | |
| "learning_rate": 4.107505292333209e-06, | |
| "loss": 4.8451, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 2.2388365268707275, | |
| "learning_rate": 4.08568123786037e-06, | |
| "loss": 4.8419, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 1.5139906406402588, | |
| "learning_rate": 4.06385718338753e-06, | |
| "loss": 4.8436, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.44464704394340515, | |
| "learning_rate": 4.04203312891469e-06, | |
| "loss": 4.8455, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.5341111421585083, | |
| "learning_rate": 4.02020907444185e-06, | |
| "loss": 4.842, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.6707934141159058, | |
| "learning_rate": 3.99838501996901e-06, | |
| "loss": 4.8473, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 1.842087984085083, | |
| "learning_rate": 3.97656096549617e-06, | |
| "loss": 4.845, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.575089693069458, | |
| "learning_rate": 3.95473691102333e-06, | |
| "loss": 4.8432, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.7443206310272217, | |
| "learning_rate": 3.93291285655049e-06, | |
| "loss": 4.8426, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 3.0586435794830322, | |
| "learning_rate": 3.9110888020776504e-06, | |
| "loss": 4.8458, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.6179745197296143, | |
| "learning_rate": 3.88926474760481e-06, | |
| "loss": 4.8441, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.7903715372085571, | |
| "learning_rate": 3.86744069313197e-06, | |
| "loss": 4.8412, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 2.2073774337768555, | |
| "learning_rate": 3.84561663865913e-06, | |
| "loss": 4.8416, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.7796311974525452, | |
| "learning_rate": 3.82379258418629e-06, | |
| "loss": 4.8427, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 1.6196668148040771, | |
| "learning_rate": 3.8019685297134506e-06, | |
| "loss": 4.8421, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.8816477656364441, | |
| "learning_rate": 3.7801444752406104e-06, | |
| "loss": 4.8424, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 1.104554295539856, | |
| "learning_rate": 3.75832042076777e-06, | |
| "loss": 4.8429, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.6221787333488464, | |
| "learning_rate": 3.7364963662949304e-06, | |
| "loss": 4.8439, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.6664981842041016, | |
| "learning_rate": 3.7146723118220906e-06, | |
| "loss": 4.8443, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 2.061434268951416, | |
| "learning_rate": 3.692848257349251e-06, | |
| "loss": 4.8422, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.919090747833252, | |
| "learning_rate": 3.671024202876411e-06, | |
| "loss": 4.841, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.7256101369857788, | |
| "learning_rate": 3.6492001484035704e-06, | |
| "loss": 4.8413, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 2.418320894241333, | |
| "learning_rate": 3.6273760939307306e-06, | |
| "loss": 4.8434, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.3488261699676514, | |
| "learning_rate": 3.605552039457891e-06, | |
| "loss": 4.8436, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.9698038697242737, | |
| "learning_rate": 3.583727984985051e-06, | |
| "loss": 4.8434, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.685869812965393, | |
| "learning_rate": 3.561903930512211e-06, | |
| "loss": 4.8398, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.8971433639526367, | |
| "learning_rate": 3.5400798760393706e-06, | |
| "loss": 4.841, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 2.2868356704711914, | |
| "learning_rate": 3.5182558215665308e-06, | |
| "loss": 4.8409, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.8014007210731506, | |
| "learning_rate": 3.496431767093691e-06, | |
| "loss": 4.841, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.7061727643013, | |
| "learning_rate": 3.474607712620851e-06, | |
| "loss": 4.8433, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.7472312450408936, | |
| "learning_rate": 3.4527836581480114e-06, | |
| "loss": 4.8423, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 2.389944076538086, | |
| "learning_rate": 3.4309596036751708e-06, | |
| "loss": 4.8404, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 2.251760959625244, | |
| "learning_rate": 3.409135549202331e-06, | |
| "loss": 4.8452, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 1.1334567070007324, | |
| "learning_rate": 3.387311494729491e-06, | |
| "loss": 4.8425, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 1.1134765148162842, | |
| "learning_rate": 3.3654874402566514e-06, | |
| "loss": 4.8393, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.4619927704334259, | |
| "learning_rate": 3.343663385783811e-06, | |
| "loss": 4.8415, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 2.933467388153076, | |
| "learning_rate": 3.3218393313109714e-06, | |
| "loss": 4.8399, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 1.7002770900726318, | |
| "learning_rate": 3.300015276838131e-06, | |
| "loss": 4.8417, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.8919426798820496, | |
| "learning_rate": 3.2781912223652914e-06, | |
| "loss": 4.8388, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 1.6836445331573486, | |
| "learning_rate": 3.256367167892451e-06, | |
| "loss": 4.8386, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.4594995975494385, | |
| "learning_rate": 3.2345431134196114e-06, | |
| "loss": 4.8399, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.7272505164146423, | |
| "learning_rate": 3.2127190589467716e-06, | |
| "loss": 4.8397, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 1.1747781038284302, | |
| "learning_rate": 3.1908950044739313e-06, | |
| "loss": 4.8405, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.6341750621795654, | |
| "learning_rate": 3.169070950001091e-06, | |
| "loss": 4.8415, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.8783094882965088, | |
| "learning_rate": 3.1472468955282513e-06, | |
| "loss": 4.8351, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.6762621402740479, | |
| "learning_rate": 3.1254228410554115e-06, | |
| "loss": 4.8446, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 1.1475865840911865, | |
| "learning_rate": 3.1035987865825718e-06, | |
| "loss": 4.8397, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 1.4153202772140503, | |
| "learning_rate": 3.0817747321097315e-06, | |
| "loss": 4.84, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 3.4596126079559326, | |
| "learning_rate": 3.0599506776368913e-06, | |
| "loss": 4.8424, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 1.4196875095367432, | |
| "learning_rate": 3.0381266231640515e-06, | |
| "loss": 4.8418, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 3.9644806385040283, | |
| "learning_rate": 3.0163025686912117e-06, | |
| "loss": 4.8414, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.6097564101219177, | |
| "learning_rate": 2.994478514218372e-06, | |
| "loss": 4.8401, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 1.8151260614395142, | |
| "learning_rate": 2.972654459745532e-06, | |
| "loss": 4.8427, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 2.938615083694458, | |
| "learning_rate": 2.9508304052726915e-06, | |
| "loss": 4.8408, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 3.4757254123687744, | |
| "learning_rate": 2.9290063507998517e-06, | |
| "loss": 4.8429, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 1.0553146600723267, | |
| "learning_rate": 2.907182296327012e-06, | |
| "loss": 4.8412, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 2.4443321228027344, | |
| "learning_rate": 2.885358241854172e-06, | |
| "loss": 4.8405, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 1.0173979997634888, | |
| "learning_rate": 2.8635341873813323e-06, | |
| "loss": 4.8367, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 3.2443580627441406, | |
| "learning_rate": 2.8417101329084917e-06, | |
| "loss": 4.8393, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.9010906219482422, | |
| "learning_rate": 2.819886078435652e-06, | |
| "loss": 4.8391, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 3.2947003841400146, | |
| "learning_rate": 2.798062023962812e-06, | |
| "loss": 4.8401, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.9409279227256775, | |
| "learning_rate": 2.7762379694899723e-06, | |
| "loss": 4.8389, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.5245996713638306, | |
| "learning_rate": 2.754413915017132e-06, | |
| "loss": 4.8392, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 1.031678318977356, | |
| "learning_rate": 2.732589860544292e-06, | |
| "loss": 4.8406, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 1.3470673561096191, | |
| "learning_rate": 2.710765806071452e-06, | |
| "loss": 4.8404, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 2.0202999114990234, | |
| "learning_rate": 2.6889417515986123e-06, | |
| "loss": 4.8409, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 1.7307863235473633, | |
| "learning_rate": 2.667117697125772e-06, | |
| "loss": 4.8378, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 1.679755687713623, | |
| "learning_rate": 2.6452936426529323e-06, | |
| "loss": 4.8388, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 2.951936721801758, | |
| "learning_rate": 2.6234695881800925e-06, | |
| "loss": 4.8403, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 1.8046462535858154, | |
| "learning_rate": 2.6016455337072523e-06, | |
| "loss": 4.838, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.8763813972473145, | |
| "learning_rate": 2.579821479234412e-06, | |
| "loss": 4.8393, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 1.4633421897888184, | |
| "learning_rate": 2.5579974247615723e-06, | |
| "loss": 4.8403, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 1.8257120847702026, | |
| "learning_rate": 2.5361733702887325e-06, | |
| "loss": 4.8364, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 1.8002654314041138, | |
| "learning_rate": 2.5143493158158927e-06, | |
| "loss": 4.8422, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 1.5233999490737915, | |
| "learning_rate": 2.4925252613430525e-06, | |
| "loss": 4.838, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 1.0047553777694702, | |
| "learning_rate": 2.4707012068702123e-06, | |
| "loss": 4.8406, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.5430443286895752, | |
| "learning_rate": 2.4488771523973725e-06, | |
| "loss": 4.8385, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 1.5413551330566406, | |
| "learning_rate": 2.4270530979245327e-06, | |
| "loss": 4.8377, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.4709758162498474, | |
| "learning_rate": 2.4052290434516925e-06, | |
| "loss": 4.8364, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.9328405857086182, | |
| "learning_rate": 2.3834049889788527e-06, | |
| "loss": 4.8389, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.6147871017456055, | |
| "learning_rate": 2.361580934506013e-06, | |
| "loss": 4.8382, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 2.017819404602051, | |
| "learning_rate": 2.3397568800331727e-06, | |
| "loss": 4.838, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 2.1005585193634033, | |
| "learning_rate": 2.317932825560333e-06, | |
| "loss": 4.8383, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 1.5119142532348633, | |
| "learning_rate": 2.2961087710874927e-06, | |
| "loss": 4.8354, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 1.8534014225006104, | |
| "learning_rate": 2.274284716614653e-06, | |
| "loss": 4.8387, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 1.2567205429077148, | |
| "learning_rate": 2.252460662141813e-06, | |
| "loss": 4.8392, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 2.021512508392334, | |
| "learning_rate": 2.230636607668973e-06, | |
| "loss": 4.8365, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.7630836367607117, | |
| "learning_rate": 2.208812553196133e-06, | |
| "loss": 4.8394, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.742551326751709, | |
| "learning_rate": 2.1869884987232933e-06, | |
| "loss": 4.8398, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 3.5478837490081787, | |
| "learning_rate": 2.165164444250453e-06, | |
| "loss": 4.8369, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 3.5186381340026855, | |
| "learning_rate": 2.1433403897776133e-06, | |
| "loss": 4.8407, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 2.9799013137817383, | |
| "learning_rate": 2.121516335304773e-06, | |
| "loss": 4.8404, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.6745467782020569, | |
| "learning_rate": 2.0996922808319333e-06, | |
| "loss": 4.8361, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 2.257988452911377, | |
| "learning_rate": 2.077868226359093e-06, | |
| "loss": 4.8391, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 1.193894863128662, | |
| "learning_rate": 2.0560441718862532e-06, | |
| "loss": 4.8372, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.0707862377166748, | |
| "learning_rate": 2.0342201174134134e-06, | |
| "loss": 4.8386, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.415719747543335, | |
| "learning_rate": 2.0123960629405732e-06, | |
| "loss": 4.8397, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.7595218420028687, | |
| "learning_rate": 1.9905720084677334e-06, | |
| "loss": 4.8362, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.733424186706543, | |
| "learning_rate": 1.9687479539948932e-06, | |
| "loss": 4.8387, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 2.846148729324341, | |
| "learning_rate": 1.9469238995220534e-06, | |
| "loss": 4.8351, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 3.0567848682403564, | |
| "learning_rate": 1.9250998450492132e-06, | |
| "loss": 4.8384, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.4598780870437622, | |
| "learning_rate": 1.9032757905763734e-06, | |
| "loss": 4.8391, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 1.080013394355774, | |
| "learning_rate": 1.8814517361035334e-06, | |
| "loss": 4.839, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.5742350816726685, | |
| "learning_rate": 1.8596276816306934e-06, | |
| "loss": 4.8415, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.509831428527832, | |
| "learning_rate": 1.8378036271578536e-06, | |
| "loss": 4.8383, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.7902163863182068, | |
| "learning_rate": 1.8159795726850136e-06, | |
| "loss": 4.8383, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 1.2146787643432617, | |
| "learning_rate": 1.7941555182121736e-06, | |
| "loss": 4.839, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.5992825031280518, | |
| "learning_rate": 1.7723314637393336e-06, | |
| "loss": 4.8342, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.7323722839355469, | |
| "learning_rate": 1.7505074092664936e-06, | |
| "loss": 4.8384, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.4978944659233093, | |
| "learning_rate": 1.7286833547936538e-06, | |
| "loss": 4.838, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.597199022769928, | |
| "learning_rate": 1.7068593003208136e-06, | |
| "loss": 4.8345, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 2.2732481956481934, | |
| "learning_rate": 1.6850352458479738e-06, | |
| "loss": 4.8382, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.6661024689674377, | |
| "learning_rate": 1.663211191375134e-06, | |
| "loss": 4.8395, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 1.767288088798523, | |
| "learning_rate": 1.6413871369022938e-06, | |
| "loss": 4.838, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.47860872745513916, | |
| "learning_rate": 1.619563082429454e-06, | |
| "loss": 4.839, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.5148721933364868, | |
| "learning_rate": 1.5977390279566138e-06, | |
| "loss": 4.839, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 1.728582739830017, | |
| "learning_rate": 1.575914973483774e-06, | |
| "loss": 4.8347, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.9544780254364014, | |
| "learning_rate": 1.554090919010934e-06, | |
| "loss": 4.8369, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.7922559976577759, | |
| "learning_rate": 1.532266864538094e-06, | |
| "loss": 4.8385, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.432121217250824, | |
| "learning_rate": 1.510442810065254e-06, | |
| "loss": 4.8376, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.711795449256897, | |
| "learning_rate": 1.4886187555924142e-06, | |
| "loss": 4.8391, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.8019421696662903, | |
| "learning_rate": 1.466794701119574e-06, | |
| "loss": 4.8375, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.8278266191482544, | |
| "learning_rate": 1.4449706466467342e-06, | |
| "loss": 4.8342, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.6380483508110046, | |
| "learning_rate": 1.423146592173894e-06, | |
| "loss": 4.838, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.9113682508468628, | |
| "learning_rate": 1.4013225377010542e-06, | |
| "loss": 4.8362, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.5136518478393555, | |
| "learning_rate": 1.3794984832282144e-06, | |
| "loss": 4.838, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 1.0487759113311768, | |
| "learning_rate": 1.3576744287553742e-06, | |
| "loss": 4.8389, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.9537123441696167, | |
| "learning_rate": 1.3358503742825344e-06, | |
| "loss": 4.8338, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 1.4224497079849243, | |
| "learning_rate": 1.3140263198096944e-06, | |
| "loss": 4.8358, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 1.1593801975250244, | |
| "learning_rate": 1.2922022653368544e-06, | |
| "loss": 4.8331, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 1.5574233531951904, | |
| "learning_rate": 1.2703782108640146e-06, | |
| "loss": 4.8351, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 1.2431851625442505, | |
| "learning_rate": 1.2485541563911746e-06, | |
| "loss": 4.8391, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.5098065137863159, | |
| "learning_rate": 1.2267301019183346e-06, | |
| "loss": 4.8389, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.690344512462616, | |
| "learning_rate": 1.2049060474454946e-06, | |
| "loss": 4.8387, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 1.661180019378662, | |
| "learning_rate": 1.1830819929726546e-06, | |
| "loss": 4.8373, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 1.4016964435577393, | |
| "learning_rate": 1.1612579384998145e-06, | |
| "loss": 4.8373, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 1.7542119026184082, | |
| "learning_rate": 1.1394338840269745e-06, | |
| "loss": 4.8359, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.7012521624565125, | |
| "learning_rate": 1.1176098295541345e-06, | |
| "loss": 4.8367, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.8252130746841431, | |
| "learning_rate": 1.0957857750812947e-06, | |
| "loss": 4.8377, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 1.0341030359268188, | |
| "learning_rate": 1.0739617206084547e-06, | |
| "loss": 4.8374, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 1.5905879735946655, | |
| "learning_rate": 1.0521376661356147e-06, | |
| "loss": 4.84, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 1.8270330429077148, | |
| "learning_rate": 1.0303136116627747e-06, | |
| "loss": 4.8343, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 1.5131927728652954, | |
| "learning_rate": 1.008489557189935e-06, | |
| "loss": 4.8367, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.9526300430297852, | |
| "learning_rate": 9.86665502717095e-07, | |
| "loss": 4.8365, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.43970614671707153, | |
| "learning_rate": 9.64841448244255e-07, | |
| "loss": 4.8353, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.5485687255859375, | |
| "learning_rate": 9.430173937714149e-07, | |
| "loss": 4.8375, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 1.4453191757202148, | |
| "learning_rate": 9.21193339298575e-07, | |
| "loss": 4.8366, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.5337852835655212, | |
| "learning_rate": 8.99369284825735e-07, | |
| "loss": 4.8376, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 1.0073989629745483, | |
| "learning_rate": 8.77545230352895e-07, | |
| "loss": 4.8382, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.8360026478767395, | |
| "learning_rate": 8.55721175880055e-07, | |
| "loss": 4.838, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 2.3463079929351807, | |
| "learning_rate": 8.338971214072151e-07, | |
| "loss": 4.8379, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.6060824990272522, | |
| "learning_rate": 8.120730669343751e-07, | |
| "loss": 4.8386, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.7597670555114746, | |
| "learning_rate": 7.902490124615351e-07, | |
| "loss": 4.8373, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 1.566273808479309, | |
| "learning_rate": 7.684249579886951e-07, | |
| "loss": 4.8366, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.5708392262458801, | |
| "learning_rate": 7.466009035158553e-07, | |
| "loss": 4.8363, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 1.2547303438186646, | |
| "learning_rate": 7.247768490430153e-07, | |
| "loss": 4.8382, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.6009268760681152, | |
| "learning_rate": 7.029527945701753e-07, | |
| "loss": 4.8361, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 1.8468096256256104, | |
| "learning_rate": 6.811287400973353e-07, | |
| "loss": 4.8372, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 2.3321032524108887, | |
| "learning_rate": 6.593046856244954e-07, | |
| "loss": 4.8369, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.5124238729476929, | |
| "learning_rate": 6.374806311516554e-07, | |
| "loss": 4.8382, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.6034313440322876, | |
| "learning_rate": 6.156565766788154e-07, | |
| "loss": 4.8393, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 3.0424869060516357, | |
| "learning_rate": 5.938325222059755e-07, | |
| "loss": 4.8362, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.9956300854682922, | |
| "learning_rate": 5.720084677331355e-07, | |
| "loss": 4.8358, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 1.2779983282089233, | |
| "learning_rate": 5.501844132602956e-07, | |
| "loss": 4.8366, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.522408664226532, | |
| "learning_rate": 5.283603587874556e-07, | |
| "loss": 4.8345, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.5073395371437073, | |
| "learning_rate": 5.065363043146156e-07, | |
| "loss": 4.8362, | |
| "step": 43500 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 45821, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "total_flos": 4.0839050005128806e+17, | |
| "train_batch_size": 256, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |