| { | |
| "best_metric": 0.018521126359701157, | |
| "best_model_checkpoint": "/home/paperspace/Data/models/Goavanto/llm3br256-v/checkpoint-400", | |
| "epoch": 5.198019801980198, | |
| "eval_steps": 25, | |
| "global_step": 525, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.009900990099009901, | |
| "grad_norm": 0.2780945897102356, | |
| "learning_rate": 3.952569169960474e-07, | |
| "loss": 0.1412, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.019801980198019802, | |
| "grad_norm": 0.2978728413581848, | |
| "learning_rate": 7.905138339920948e-07, | |
| "loss": 0.1399, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0297029702970297, | |
| "grad_norm": 0.30333277583122253, | |
| "learning_rate": 1.1857707509881422e-06, | |
| "loss": 0.14, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.039603960396039604, | |
| "grad_norm": 0.2764367461204529, | |
| "learning_rate": 1.5810276679841897e-06, | |
| "loss": 0.1333, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.04950495049504951, | |
| "grad_norm": 0.31314605474472046, | |
| "learning_rate": 1.9762845849802374e-06, | |
| "loss": 0.1377, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0594059405940594, | |
| "grad_norm": 0.2985245883464813, | |
| "learning_rate": 2.3715415019762844e-06, | |
| "loss": 0.1872, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.06930693069306931, | |
| "grad_norm": 0.314704567193985, | |
| "learning_rate": 2.7667984189723323e-06, | |
| "loss": 0.1336, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.07920792079207921, | |
| "grad_norm": 0.31308513879776, | |
| "learning_rate": 3.1620553359683794e-06, | |
| "loss": 0.156, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.0891089108910891, | |
| "grad_norm": 0.29465916752815247, | |
| "learning_rate": 3.5573122529644273e-06, | |
| "loss": 0.1615, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.09900990099009901, | |
| "grad_norm": 0.258452445268631, | |
| "learning_rate": 3.952569169960475e-06, | |
| "loss": 0.1277, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.10891089108910891, | |
| "grad_norm": 0.27227145433425903, | |
| "learning_rate": 4.347826086956522e-06, | |
| "loss": 0.156, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.1188118811881188, | |
| "grad_norm": 0.256413072347641, | |
| "learning_rate": 4.743083003952569e-06, | |
| "loss": 0.1454, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.12871287128712872, | |
| "grad_norm": 0.18560951948165894, | |
| "learning_rate": 5.138339920948617e-06, | |
| "loss": 0.1014, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.13861386138613863, | |
| "grad_norm": 0.21946443617343903, | |
| "learning_rate": 5.533596837944665e-06, | |
| "loss": 0.1573, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.1485148514851485, | |
| "grad_norm": 0.20054368674755096, | |
| "learning_rate": 5.928853754940711e-06, | |
| "loss": 0.1043, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.15841584158415842, | |
| "grad_norm": 0.17403173446655273, | |
| "learning_rate": 6.324110671936759e-06, | |
| "loss": 0.1517, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.16831683168316833, | |
| "grad_norm": 0.1680103838443756, | |
| "learning_rate": 6.719367588932807e-06, | |
| "loss": 0.1285, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.1782178217821782, | |
| "grad_norm": 0.16556352376937866, | |
| "learning_rate": 7.1146245059288545e-06, | |
| "loss": 0.1036, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.18811881188118812, | |
| "grad_norm": 0.1210479736328125, | |
| "learning_rate": 7.509881422924901e-06, | |
| "loss": 0.1052, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.19801980198019803, | |
| "grad_norm": 0.14721368253231049, | |
| "learning_rate": 7.90513833992095e-06, | |
| "loss": 0.0883, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.2079207920792079, | |
| "grad_norm": 0.14860320091247559, | |
| "learning_rate": 8.300395256916998e-06, | |
| "loss": 0.1041, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.21782178217821782, | |
| "grad_norm": 0.14374814927577972, | |
| "learning_rate": 8.695652173913044e-06, | |
| "loss": 0.0877, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.22772277227722773, | |
| "grad_norm": 0.13484112918376923, | |
| "learning_rate": 9.090909090909091e-06, | |
| "loss": 0.1186, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.2376237623762376, | |
| "grad_norm": 0.13147170841693878, | |
| "learning_rate": 9.486166007905138e-06, | |
| "loss": 0.1149, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.24752475247524752, | |
| "grad_norm": 0.12901976704597473, | |
| "learning_rate": 9.881422924901186e-06, | |
| "loss": 0.1227, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.24752475247524752, | |
| "eval_loss": 0.10737022757530212, | |
| "eval_runtime": 22.0248, | |
| "eval_samples_per_second": 4.54, | |
| "eval_steps_per_second": 0.136, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.25742574257425743, | |
| "grad_norm": 0.12279005348682404, | |
| "learning_rate": 1.0276679841897234e-05, | |
| "loss": 0.0986, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.26732673267326734, | |
| "grad_norm": 0.11221862584352493, | |
| "learning_rate": 1.0671936758893281e-05, | |
| "loss": 0.0932, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.27722772277227725, | |
| "grad_norm": 0.11220777034759521, | |
| "learning_rate": 1.106719367588933e-05, | |
| "loss": 0.091, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.2871287128712871, | |
| "grad_norm": 0.1216827854514122, | |
| "learning_rate": 1.1462450592885376e-05, | |
| "loss": 0.095, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.297029702970297, | |
| "grad_norm": 0.0984376072883606, | |
| "learning_rate": 1.1857707509881423e-05, | |
| "loss": 0.0587, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.3069306930693069, | |
| "grad_norm": 0.09776771813631058, | |
| "learning_rate": 1.225296442687747e-05, | |
| "loss": 0.0788, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.31683168316831684, | |
| "grad_norm": 0.10127560794353485, | |
| "learning_rate": 1.2648221343873517e-05, | |
| "loss": 0.0967, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.32673267326732675, | |
| "grad_norm": 0.10297820717096329, | |
| "learning_rate": 1.3043478260869566e-05, | |
| "loss": 0.0857, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.33663366336633666, | |
| "grad_norm": 0.10922635346651077, | |
| "learning_rate": 1.3438735177865614e-05, | |
| "loss": 0.1421, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.3465346534653465, | |
| "grad_norm": 0.09746473282575607, | |
| "learning_rate": 1.383399209486166e-05, | |
| "loss": 0.0773, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.3564356435643564, | |
| "grad_norm": 0.0974055677652359, | |
| "learning_rate": 1.4229249011857709e-05, | |
| "loss": 0.0613, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.36633663366336633, | |
| "grad_norm": 0.092307910323143, | |
| "learning_rate": 1.4624505928853754e-05, | |
| "loss": 0.0902, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.37623762376237624, | |
| "grad_norm": 0.1032821536064148, | |
| "learning_rate": 1.5019762845849802e-05, | |
| "loss": 0.0907, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.38613861386138615, | |
| "grad_norm": 0.08956699073314667, | |
| "learning_rate": 1.541501976284585e-05, | |
| "loss": 0.0667, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.39603960396039606, | |
| "grad_norm": 0.08857525885105133, | |
| "learning_rate": 1.58102766798419e-05, | |
| "loss": 0.0501, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.40594059405940597, | |
| "grad_norm": 0.0948619470000267, | |
| "learning_rate": 1.6205533596837947e-05, | |
| "loss": 0.074, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.4158415841584158, | |
| "grad_norm": 0.09170933812856674, | |
| "learning_rate": 1.6600790513833996e-05, | |
| "loss": 0.0791, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.42574257425742573, | |
| "grad_norm": 0.0803118497133255, | |
| "learning_rate": 1.699604743083004e-05, | |
| "loss": 0.075, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.43564356435643564, | |
| "grad_norm": 0.08512408286333084, | |
| "learning_rate": 1.739130434782609e-05, | |
| "loss": 0.0611, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.44554455445544555, | |
| "grad_norm": 0.08622541278600693, | |
| "learning_rate": 1.7786561264822134e-05, | |
| "loss": 0.0449, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.45544554455445546, | |
| "grad_norm": 0.08092786371707916, | |
| "learning_rate": 1.8181818181818182e-05, | |
| "loss": 0.038, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.46534653465346537, | |
| "grad_norm": 0.09300900250673294, | |
| "learning_rate": 1.857707509881423e-05, | |
| "loss": 0.1017, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.4752475247524752, | |
| "grad_norm": 0.10565974563360214, | |
| "learning_rate": 1.8972332015810275e-05, | |
| "loss": 0.0709, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.48514851485148514, | |
| "grad_norm": 0.09708713740110397, | |
| "learning_rate": 1.9367588932806324e-05, | |
| "loss": 0.0581, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.49504950495049505, | |
| "grad_norm": 0.07682275772094727, | |
| "learning_rate": 1.9762845849802372e-05, | |
| "loss": 0.0553, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.49504950495049505, | |
| "eval_loss": 0.055061474442481995, | |
| "eval_runtime": 20.9273, | |
| "eval_samples_per_second": 4.778, | |
| "eval_steps_per_second": 0.143, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.504950495049505, | |
| "grad_norm": 0.08843517303466797, | |
| "learning_rate": 2.015810276679842e-05, | |
| "loss": 0.085, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.5148514851485149, | |
| "grad_norm": 0.07925857603549957, | |
| "learning_rate": 2.055335968379447e-05, | |
| "loss": 0.0466, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.5247524752475248, | |
| "grad_norm": 0.07153014093637466, | |
| "learning_rate": 2.0948616600790517e-05, | |
| "loss": 0.037, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.5346534653465347, | |
| "grad_norm": 0.07946132123470306, | |
| "learning_rate": 2.1343873517786562e-05, | |
| "loss": 0.0542, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.5445544554455446, | |
| "grad_norm": 0.08954454213380814, | |
| "learning_rate": 2.173913043478261e-05, | |
| "loss": 0.0335, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.5544554455445545, | |
| "grad_norm": 0.07380135357379913, | |
| "learning_rate": 2.213438735177866e-05, | |
| "loss": 0.0348, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.5643564356435643, | |
| "grad_norm": 0.0993795096874237, | |
| "learning_rate": 2.2529644268774703e-05, | |
| "loss": 0.0568, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.5742574257425742, | |
| "grad_norm": 0.07377209514379501, | |
| "learning_rate": 2.2924901185770752e-05, | |
| "loss": 0.0365, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.5841584158415841, | |
| "grad_norm": 0.06827972829341888, | |
| "learning_rate": 2.33201581027668e-05, | |
| "loss": 0.0464, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.594059405940594, | |
| "grad_norm": 0.06968732923269272, | |
| "learning_rate": 2.3715415019762845e-05, | |
| "loss": 0.0548, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.6039603960396039, | |
| "grad_norm": 0.07616455107927322, | |
| "learning_rate": 2.4110671936758893e-05, | |
| "loss": 0.0431, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.6138613861386139, | |
| "grad_norm": 0.06691834330558777, | |
| "learning_rate": 2.450592885375494e-05, | |
| "loss": 0.0459, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.6237623762376238, | |
| "grad_norm": 0.07361909747123718, | |
| "learning_rate": 2.490118577075099e-05, | |
| "loss": 0.0364, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.6336633663366337, | |
| "grad_norm": 0.07976940274238586, | |
| "learning_rate": 2.5296442687747035e-05, | |
| "loss": 0.0618, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.6435643564356436, | |
| "grad_norm": 0.08242682367563248, | |
| "learning_rate": 2.5691699604743087e-05, | |
| "loss": 0.0401, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.6534653465346535, | |
| "grad_norm": 0.05510717257857323, | |
| "learning_rate": 2.608695652173913e-05, | |
| "loss": 0.0238, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.6633663366336634, | |
| "grad_norm": 0.08613371849060059, | |
| "learning_rate": 2.6482213438735183e-05, | |
| "loss": 0.0494, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.6732673267326733, | |
| "grad_norm": 0.07181484997272491, | |
| "learning_rate": 2.6877470355731228e-05, | |
| "loss": 0.0373, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.6831683168316832, | |
| "grad_norm": 0.07495307922363281, | |
| "learning_rate": 2.7272727272727273e-05, | |
| "loss": 0.0346, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.693069306930693, | |
| "grad_norm": 0.08683991432189941, | |
| "learning_rate": 2.766798418972332e-05, | |
| "loss": 0.0278, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.7029702970297029, | |
| "grad_norm": 0.08579660952091217, | |
| "learning_rate": 2.8063241106719366e-05, | |
| "loss": 0.0383, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.7128712871287128, | |
| "grad_norm": 0.10032771527767181, | |
| "learning_rate": 2.8458498023715418e-05, | |
| "loss": 0.0565, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.7227722772277227, | |
| "grad_norm": 0.09024665504693985, | |
| "learning_rate": 2.8853754940711463e-05, | |
| "loss": 0.0449, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.7326732673267327, | |
| "grad_norm": 0.09194691479206085, | |
| "learning_rate": 2.9249011857707508e-05, | |
| "loss": 0.0378, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.7425742574257426, | |
| "grad_norm": 0.07428496330976486, | |
| "learning_rate": 2.964426877470356e-05, | |
| "loss": 0.0338, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.7425742574257426, | |
| "eval_loss": 0.03502385690808296, | |
| "eval_runtime": 20.9125, | |
| "eval_samples_per_second": 4.782, | |
| "eval_steps_per_second": 0.143, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.7524752475247525, | |
| "grad_norm": 0.06667866557836533, | |
| "learning_rate": 3.0039525691699605e-05, | |
| "loss": 0.0317, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.7623762376237624, | |
| "grad_norm": 0.07727614790201187, | |
| "learning_rate": 3.0434782608695656e-05, | |
| "loss": 0.0396, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.7722772277227723, | |
| "grad_norm": 0.10815678536891937, | |
| "learning_rate": 3.08300395256917e-05, | |
| "loss": 0.0297, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.7821782178217822, | |
| "grad_norm": 0.07847210019826889, | |
| "learning_rate": 3.1225296442687746e-05, | |
| "loss": 0.0414, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.7920792079207921, | |
| "grad_norm": 0.09983616322278976, | |
| "learning_rate": 3.16205533596838e-05, | |
| "loss": 0.0453, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.801980198019802, | |
| "grad_norm": 0.06763055920600891, | |
| "learning_rate": 3.201581027667984e-05, | |
| "loss": 0.0269, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.8118811881188119, | |
| "grad_norm": 0.07611255347728729, | |
| "learning_rate": 3.2411067193675894e-05, | |
| "loss": 0.0364, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.8217821782178217, | |
| "grad_norm": 0.06326984614133835, | |
| "learning_rate": 3.280632411067194e-05, | |
| "loss": 0.0285, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.8316831683168316, | |
| "grad_norm": 0.06751732528209686, | |
| "learning_rate": 3.320158102766799e-05, | |
| "loss": 0.0244, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.8415841584158416, | |
| "grad_norm": 0.06836965680122375, | |
| "learning_rate": 3.3596837944664036e-05, | |
| "loss": 0.0293, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.8514851485148515, | |
| "grad_norm": 0.06516125798225403, | |
| "learning_rate": 3.399209486166008e-05, | |
| "loss": 0.0378, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.8613861386138614, | |
| "grad_norm": 0.06571876257658005, | |
| "learning_rate": 3.438735177865613e-05, | |
| "loss": 0.0265, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.8712871287128713, | |
| "grad_norm": 0.06418901681900024, | |
| "learning_rate": 3.478260869565218e-05, | |
| "loss": 0.0363, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.8811881188118812, | |
| "grad_norm": 0.09142803400754929, | |
| "learning_rate": 3.517786561264822e-05, | |
| "loss": 0.0351, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.8910891089108911, | |
| "grad_norm": 0.09272617101669312, | |
| "learning_rate": 3.557312252964427e-05, | |
| "loss": 0.0314, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.900990099009901, | |
| "grad_norm": 0.08047693967819214, | |
| "learning_rate": 3.596837944664031e-05, | |
| "loss": 0.0277, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.9108910891089109, | |
| "grad_norm": 0.07349320501089096, | |
| "learning_rate": 3.6363636363636364e-05, | |
| "loss": 0.025, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.9207920792079208, | |
| "grad_norm": 0.08519802987575531, | |
| "learning_rate": 3.675889328063241e-05, | |
| "loss": 0.0406, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.9306930693069307, | |
| "grad_norm": 0.06724913418292999, | |
| "learning_rate": 3.715415019762846e-05, | |
| "loss": 0.0317, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.9405940594059405, | |
| "grad_norm": 0.06739809364080429, | |
| "learning_rate": 3.7549407114624506e-05, | |
| "loss": 0.0363, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.9504950495049505, | |
| "grad_norm": 0.07618068903684616, | |
| "learning_rate": 3.794466403162055e-05, | |
| "loss": 0.0181, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.9603960396039604, | |
| "grad_norm": 0.061489593237638474, | |
| "learning_rate": 3.83399209486166e-05, | |
| "loss": 0.0296, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.9702970297029703, | |
| "grad_norm": 0.0637054592370987, | |
| "learning_rate": 3.873517786561265e-05, | |
| "loss": 0.0213, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.9801980198019802, | |
| "grad_norm": 0.0836983397603035, | |
| "learning_rate": 3.91304347826087e-05, | |
| "loss": 0.0294, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.9900990099009901, | |
| "grad_norm": 0.06744949519634247, | |
| "learning_rate": 3.9525691699604744e-05, | |
| "loss": 0.0302, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.9900990099009901, | |
| "eval_loss": 0.027701813727617264, | |
| "eval_runtime": 20.9769, | |
| "eval_samples_per_second": 4.767, | |
| "eval_steps_per_second": 0.143, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.11184727400541306, | |
| "learning_rate": 3.9920948616600796e-05, | |
| "loss": 0.0226, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 1.00990099009901, | |
| "grad_norm": 0.056078869849443436, | |
| "learning_rate": 4.031620553359684e-05, | |
| "loss": 0.023, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 1.0198019801980198, | |
| "grad_norm": 0.06441275775432587, | |
| "learning_rate": 4.0711462450592886e-05, | |
| "loss": 0.0296, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 1.0297029702970297, | |
| "grad_norm": 0.07957013696432114, | |
| "learning_rate": 4.110671936758894e-05, | |
| "loss": 0.0203, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 1.0396039603960396, | |
| "grad_norm": 0.05331544578075409, | |
| "learning_rate": 4.150197628458498e-05, | |
| "loss": 0.0235, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 1.0495049504950495, | |
| "grad_norm": 0.06107131764292717, | |
| "learning_rate": 4.1897233201581034e-05, | |
| "loss": 0.0296, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 1.0594059405940595, | |
| "grad_norm": 0.05904534086585045, | |
| "learning_rate": 4.229249011857708e-05, | |
| "loss": 0.0272, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 1.0693069306930694, | |
| "grad_norm": 0.07586849480867386, | |
| "learning_rate": 4.2687747035573124e-05, | |
| "loss": 0.0341, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 1.0792079207920793, | |
| "grad_norm": 0.08533436059951782, | |
| "learning_rate": 4.3083003952569175e-05, | |
| "loss": 0.0355, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 1.0891089108910892, | |
| "grad_norm": 0.05731397494673729, | |
| "learning_rate": 4.347826086956522e-05, | |
| "loss": 0.03, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.099009900990099, | |
| "grad_norm": 0.048471707850694656, | |
| "learning_rate": 4.387351778656127e-05, | |
| "loss": 0.0196, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 1.108910891089109, | |
| "grad_norm": 0.053837962448596954, | |
| "learning_rate": 4.426877470355732e-05, | |
| "loss": 0.0168, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 1.118811881188119, | |
| "grad_norm": 0.06376203894615173, | |
| "learning_rate": 4.466403162055336e-05, | |
| "loss": 0.0251, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 1.1287128712871288, | |
| "grad_norm": 0.06253273785114288, | |
| "learning_rate": 4.505928853754941e-05, | |
| "loss": 0.0244, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 1.1386138613861387, | |
| "grad_norm": 0.05787895992398262, | |
| "learning_rate": 4.545454545454546e-05, | |
| "loss": 0.0213, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 1.1485148514851484, | |
| "grad_norm": 0.07152873277664185, | |
| "learning_rate": 4.5849802371541504e-05, | |
| "loss": 0.0206, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 1.1584158415841583, | |
| "grad_norm": 0.07703561335802078, | |
| "learning_rate": 4.624505928853755e-05, | |
| "loss": 0.0346, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 1.1683168316831682, | |
| "grad_norm": 0.05837339907884598, | |
| "learning_rate": 4.66403162055336e-05, | |
| "loss": 0.0259, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 1.1782178217821782, | |
| "grad_norm": 0.07039913535118103, | |
| "learning_rate": 4.7035573122529645e-05, | |
| "loss": 0.0248, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 1.188118811881188, | |
| "grad_norm": 0.06346864998340607, | |
| "learning_rate": 4.743083003952569e-05, | |
| "loss": 0.0243, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.198019801980198, | |
| "grad_norm": 0.07102694362401962, | |
| "learning_rate": 4.782608695652174e-05, | |
| "loss": 0.0211, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 1.2079207920792079, | |
| "grad_norm": 0.08494184911251068, | |
| "learning_rate": 4.822134387351779e-05, | |
| "loss": 0.0395, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 1.2178217821782178, | |
| "grad_norm": 0.07663221657276154, | |
| "learning_rate": 4.861660079051384e-05, | |
| "loss": 0.0343, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 1.2277227722772277, | |
| "grad_norm": 0.0614091195166111, | |
| "learning_rate": 4.901185770750988e-05, | |
| "loss": 0.0269, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 1.2376237623762376, | |
| "grad_norm": 0.08366970717906952, | |
| "learning_rate": 4.940711462450593e-05, | |
| "loss": 0.0373, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.2376237623762376, | |
| "eval_loss": 0.025583934038877487, | |
| "eval_runtime": 20.9445, | |
| "eval_samples_per_second": 4.775, | |
| "eval_steps_per_second": 0.143, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.2475247524752475, | |
| "grad_norm": 0.07697997242212296, | |
| "learning_rate": 4.980237154150198e-05, | |
| "loss": 0.0305, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.2574257425742574, | |
| "grad_norm": 0.05984925106167793, | |
| "learning_rate": 5.0197628458498025e-05, | |
| "loss": 0.0171, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 1.2673267326732673, | |
| "grad_norm": 0.08026100695133209, | |
| "learning_rate": 5.059288537549407e-05, | |
| "loss": 0.0238, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 1.2772277227722773, | |
| "grad_norm": 0.05464643985033035, | |
| "learning_rate": 5.098814229249013e-05, | |
| "loss": 0.0296, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 1.2871287128712872, | |
| "grad_norm": 0.07287590205669403, | |
| "learning_rate": 5.138339920948617e-05, | |
| "loss": 0.0246, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.297029702970297, | |
| "grad_norm": 0.08635013550519943, | |
| "learning_rate": 5.177865612648222e-05, | |
| "loss": 0.0339, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 1.306930693069307, | |
| "grad_norm": 0.06489317864179611, | |
| "learning_rate": 5.217391304347826e-05, | |
| "loss": 0.0181, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 1.316831683168317, | |
| "grad_norm": 0.37230196595191956, | |
| "learning_rate": 5.256916996047431e-05, | |
| "loss": 0.0307, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 1.3267326732673268, | |
| "grad_norm": 0.06730218231678009, | |
| "learning_rate": 5.2964426877470366e-05, | |
| "loss": 0.0215, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 1.3366336633663367, | |
| "grad_norm": 0.09615404158830643, | |
| "learning_rate": 5.335968379446641e-05, | |
| "loss": 0.0252, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.3465346534653464, | |
| "grad_norm": 0.08481566607952118, | |
| "learning_rate": 5.3754940711462456e-05, | |
| "loss": 0.0284, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 1.3564356435643563, | |
| "grad_norm": 0.11659260839223862, | |
| "learning_rate": 5.41501976284585e-05, | |
| "loss": 0.0336, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 1.3663366336633662, | |
| "grad_norm": 0.0634186789393425, | |
| "learning_rate": 5.4545454545454546e-05, | |
| "loss": 0.0241, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.3762376237623761, | |
| "grad_norm": 0.08782602101564407, | |
| "learning_rate": 5.49407114624506e-05, | |
| "loss": 0.0239, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 1.386138613861386, | |
| "grad_norm": 0.10084912180900574, | |
| "learning_rate": 5.533596837944664e-05, | |
| "loss": 0.0342, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.396039603960396, | |
| "grad_norm": 0.07874171435832977, | |
| "learning_rate": 5.573122529644269e-05, | |
| "loss": 0.0186, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 1.4059405940594059, | |
| "grad_norm": 0.06874096393585205, | |
| "learning_rate": 5.612648221343873e-05, | |
| "loss": 0.0244, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 1.4158415841584158, | |
| "grad_norm": 0.060906875878572464, | |
| "learning_rate": 5.652173913043478e-05, | |
| "loss": 0.0179, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 1.4257425742574257, | |
| "grad_norm": 0.061183176934719086, | |
| "learning_rate": 5.6916996047430836e-05, | |
| "loss": 0.0216, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.4356435643564356, | |
| "grad_norm": 0.055084019899368286, | |
| "learning_rate": 5.731225296442688e-05, | |
| "loss": 0.0171, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.4455445544554455, | |
| "grad_norm": 0.05983046442270279, | |
| "learning_rate": 5.7707509881422926e-05, | |
| "loss": 0.0231, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 1.4554455445544554, | |
| "grad_norm": 0.060153719037771225, | |
| "learning_rate": 5.810276679841897e-05, | |
| "loss": 0.0147, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 1.4653465346534653, | |
| "grad_norm": 0.046402279287576675, | |
| "learning_rate": 5.8498023715415016e-05, | |
| "loss": 0.0273, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 1.4752475247524752, | |
| "grad_norm": 0.06791272014379501, | |
| "learning_rate": 5.8893280632411074e-05, | |
| "loss": 0.0185, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 1.4851485148514851, | |
| "grad_norm": 0.06259552389383316, | |
| "learning_rate": 5.928853754940712e-05, | |
| "loss": 0.0321, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.4851485148514851, | |
| "eval_loss": 0.025141000747680664, | |
| "eval_runtime": 20.9532, | |
| "eval_samples_per_second": 4.773, | |
| "eval_steps_per_second": 0.143, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.495049504950495, | |
| "grad_norm": 0.05741759389638901, | |
| "learning_rate": 5.9683794466403164e-05, | |
| "loss": 0.0227, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 1.504950495049505, | |
| "grad_norm": 0.07616166770458221, | |
| "learning_rate": 6.007905138339921e-05, | |
| "loss": 0.0175, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 1.5148514851485149, | |
| "grad_norm": 0.05826897919178009, | |
| "learning_rate": 6.0474308300395254e-05, | |
| "loss": 0.0227, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 1.5247524752475248, | |
| "grad_norm": 0.055770143866539, | |
| "learning_rate": 6.086956521739131e-05, | |
| "loss": 0.0183, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.5346534653465347, | |
| "grad_norm": 0.055019546300172806, | |
| "learning_rate": 6.126482213438736e-05, | |
| "loss": 0.02, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.5445544554455446, | |
| "grad_norm": 0.053829967975616455, | |
| "learning_rate": 6.16600790513834e-05, | |
| "loss": 0.0157, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.5544554455445545, | |
| "grad_norm": 0.051947206258773804, | |
| "learning_rate": 6.205533596837945e-05, | |
| "loss": 0.0265, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 1.5643564356435644, | |
| "grad_norm": 0.0410446897149086, | |
| "learning_rate": 6.245059288537549e-05, | |
| "loss": 0.014, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.5742574257425743, | |
| "grad_norm": 0.06168140098452568, | |
| "learning_rate": 6.284584980237155e-05, | |
| "loss": 0.0264, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 1.5841584158415842, | |
| "grad_norm": 0.045245859771966934, | |
| "learning_rate": 6.32411067193676e-05, | |
| "loss": 0.0153, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.5940594059405941, | |
| "grad_norm": 0.0567997582256794, | |
| "learning_rate": 6.363636363636364e-05, | |
| "loss": 0.0172, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 1.603960396039604, | |
| "grad_norm": 0.04940714314579964, | |
| "learning_rate": 6.403162055335969e-05, | |
| "loss": 0.023, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.613861386138614, | |
| "grad_norm": 0.05434204638004303, | |
| "learning_rate": 6.442687747035574e-05, | |
| "loss": 0.0388, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 1.6237623762376239, | |
| "grad_norm": 0.056556135416030884, | |
| "learning_rate": 6.482213438735179e-05, | |
| "loss": 0.0198, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.6336633663366338, | |
| "grad_norm": 0.048558373004198074, | |
| "learning_rate": 6.521739130434783e-05, | |
| "loss": 0.0303, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.6435643564356437, | |
| "grad_norm": 0.05275854095816612, | |
| "learning_rate": 6.561264822134388e-05, | |
| "loss": 0.0157, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.6534653465346536, | |
| "grad_norm": 0.04988724738359451, | |
| "learning_rate": 6.600790513833992e-05, | |
| "loss": 0.0157, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 1.6633663366336635, | |
| "grad_norm": 0.04950392246246338, | |
| "learning_rate": 6.640316205533598e-05, | |
| "loss": 0.0288, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.6732673267326734, | |
| "grad_norm": 0.06501295417547226, | |
| "learning_rate": 6.679841897233203e-05, | |
| "loss": 0.0344, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 1.6831683168316833, | |
| "grad_norm": 0.05085508152842522, | |
| "learning_rate": 6.719367588932807e-05, | |
| "loss": 0.0315, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.693069306930693, | |
| "grad_norm": 0.055841974914073944, | |
| "learning_rate": 6.758893280632412e-05, | |
| "loss": 0.0219, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 1.702970297029703, | |
| "grad_norm": 0.050552286207675934, | |
| "learning_rate": 6.798418972332016e-05, | |
| "loss": 0.0309, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 1.7128712871287128, | |
| "grad_norm": 0.052804503589868546, | |
| "learning_rate": 6.837944664031622e-05, | |
| "loss": 0.0303, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 1.7227722772277227, | |
| "grad_norm": 0.055684514343738556, | |
| "learning_rate": 6.877470355731227e-05, | |
| "loss": 0.0289, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 1.7326732673267327, | |
| "grad_norm": 0.04727136716246605, | |
| "learning_rate": 6.916996047430831e-05, | |
| "loss": 0.026, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.7326732673267327, | |
| "eval_loss": 0.022802595049142838, | |
| "eval_runtime": 20.961, | |
| "eval_samples_per_second": 4.771, | |
| "eval_steps_per_second": 0.143, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.7425742574257426, | |
| "grad_norm": 0.05808348208665848, | |
| "learning_rate": 6.956521739130436e-05, | |
| "loss": 0.0296, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 1.7524752475247525, | |
| "grad_norm": 0.047991879284381866, | |
| "learning_rate": 6.99604743083004e-05, | |
| "loss": 0.0203, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 1.7623762376237624, | |
| "grad_norm": 0.04677740857005119, | |
| "learning_rate": 7.035573122529645e-05, | |
| "loss": 0.016, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 1.7722772277227723, | |
| "grad_norm": 0.0436997190117836, | |
| "learning_rate": 7.075098814229249e-05, | |
| "loss": 0.0161, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 1.7821782178217822, | |
| "grad_norm": 0.05496394634246826, | |
| "learning_rate": 7.114624505928854e-05, | |
| "loss": 0.0351, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.7920792079207921, | |
| "grad_norm": 0.044135309755802155, | |
| "learning_rate": 7.154150197628458e-05, | |
| "loss": 0.0198, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 1.801980198019802, | |
| "grad_norm": 0.04966725409030914, | |
| "learning_rate": 7.193675889328062e-05, | |
| "loss": 0.0214, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 1.811881188118812, | |
| "grad_norm": 0.04237734153866768, | |
| "learning_rate": 7.233201581027668e-05, | |
| "loss": 0.0211, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 1.8217821782178216, | |
| "grad_norm": 0.04926709085702896, | |
| "learning_rate": 7.272727272727273e-05, | |
| "loss": 0.0291, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 1.8316831683168315, | |
| "grad_norm": 0.053725503385066986, | |
| "learning_rate": 7.312252964426877e-05, | |
| "loss": 0.0151, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.8415841584158414, | |
| "grad_norm": 0.039645515382289886, | |
| "learning_rate": 7.351778656126482e-05, | |
| "loss": 0.0227, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 1.8514851485148514, | |
| "grad_norm": 0.0530475378036499, | |
| "learning_rate": 7.391304347826086e-05, | |
| "loss": 0.0335, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 1.8613861386138613, | |
| "grad_norm": 0.04808201268315315, | |
| "learning_rate": 7.430830039525692e-05, | |
| "loss": 0.0272, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 1.8712871287128712, | |
| "grad_norm": 0.05676993727684021, | |
| "learning_rate": 7.470355731225297e-05, | |
| "loss": 0.0244, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 1.881188118811881, | |
| "grad_norm": 0.04248756170272827, | |
| "learning_rate": 7.509881422924901e-05, | |
| "loss": 0.0139, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.891089108910891, | |
| "grad_norm": 0.04386115446686745, | |
| "learning_rate": 7.549407114624506e-05, | |
| "loss": 0.0161, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 1.900990099009901, | |
| "grad_norm": 0.04502849653363228, | |
| "learning_rate": 7.58893280632411e-05, | |
| "loss": 0.0246, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.9108910891089108, | |
| "grad_norm": 0.053589943796396255, | |
| "learning_rate": 7.628458498023716e-05, | |
| "loss": 0.0207, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 1.9207920792079207, | |
| "grad_norm": 0.04484814032912254, | |
| "learning_rate": 7.66798418972332e-05, | |
| "loss": 0.0219, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 1.9306930693069306, | |
| "grad_norm": 0.06994163244962692, | |
| "learning_rate": 7.707509881422925e-05, | |
| "loss": 0.0279, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.9405940594059405, | |
| "grad_norm": 0.044320590794086456, | |
| "learning_rate": 7.74703557312253e-05, | |
| "loss": 0.0284, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 1.9504950495049505, | |
| "grad_norm": 0.04095998406410217, | |
| "learning_rate": 7.786561264822134e-05, | |
| "loss": 0.022, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 1.9603960396039604, | |
| "grad_norm": 0.043975040316581726, | |
| "learning_rate": 7.82608695652174e-05, | |
| "loss": 0.0189, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 1.9702970297029703, | |
| "grad_norm": 0.048833929002285004, | |
| "learning_rate": 7.865612648221344e-05, | |
| "loss": 0.0279, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 1.9801980198019802, | |
| "grad_norm": 0.05329909175634384, | |
| "learning_rate": 7.905138339920949e-05, | |
| "loss": 0.029, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.9801980198019802, | |
| "eval_loss": 0.021168576553463936, | |
| "eval_runtime": 20.9649, | |
| "eval_samples_per_second": 4.77, | |
| "eval_steps_per_second": 0.143, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.99009900990099, | |
| "grad_norm": 0.04758258908987045, | |
| "learning_rate": 7.944664031620553e-05, | |
| "loss": 0.0174, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.07869398593902588, | |
| "learning_rate": 7.984189723320159e-05, | |
| "loss": 0.0162, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 2.00990099009901, | |
| "grad_norm": 0.0439910814166069, | |
| "learning_rate": 8.023715415019764e-05, | |
| "loss": 0.017, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 2.01980198019802, | |
| "grad_norm": 0.04175785556435585, | |
| "learning_rate": 8.063241106719368e-05, | |
| "loss": 0.0108, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 2.0297029702970297, | |
| "grad_norm": 0.037400927394628525, | |
| "learning_rate": 8.102766798418973e-05, | |
| "loss": 0.0129, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 2.0396039603960396, | |
| "grad_norm": 0.09054706990718842, | |
| "learning_rate": 8.142292490118577e-05, | |
| "loss": 0.0497, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 2.0495049504950495, | |
| "grad_norm": 0.032963309437036514, | |
| "learning_rate": 8.181818181818183e-05, | |
| "loss": 0.0132, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 2.0594059405940595, | |
| "grad_norm": 0.045048072934150696, | |
| "learning_rate": 8.221343873517787e-05, | |
| "loss": 0.0139, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 2.0693069306930694, | |
| "grad_norm": 0.04151559993624687, | |
| "learning_rate": 8.260869565217392e-05, | |
| "loss": 0.0186, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 2.0792079207920793, | |
| "grad_norm": 0.04299120604991913, | |
| "learning_rate": 8.300395256916996e-05, | |
| "loss": 0.0146, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.089108910891089, | |
| "grad_norm": 0.05108791962265968, | |
| "learning_rate": 8.339920948616601e-05, | |
| "loss": 0.0234, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 2.099009900990099, | |
| "grad_norm": 0.044956084340810776, | |
| "learning_rate": 8.379446640316207e-05, | |
| "loss": 0.0266, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 2.108910891089109, | |
| "grad_norm": 0.04504725709557533, | |
| "learning_rate": 8.418972332015811e-05, | |
| "loss": 0.0201, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 2.118811881188119, | |
| "grad_norm": 0.06300884485244751, | |
| "learning_rate": 8.458498023715416e-05, | |
| "loss": 0.0464, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 2.128712871287129, | |
| "grad_norm": 0.04494632035493851, | |
| "learning_rate": 8.49802371541502e-05, | |
| "loss": 0.0117, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 2.1386138613861387, | |
| "grad_norm": 0.0645475760102272, | |
| "learning_rate": 8.537549407114625e-05, | |
| "loss": 0.0311, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 2.1485148514851486, | |
| "grad_norm": 0.03961382433772087, | |
| "learning_rate": 8.57707509881423e-05, | |
| "loss": 0.01, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 2.1584158415841586, | |
| "grad_norm": 0.04515567421913147, | |
| "learning_rate": 8.616600790513835e-05, | |
| "loss": 0.0141, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 2.1683168316831685, | |
| "grad_norm": 0.06631825864315033, | |
| "learning_rate": 8.65612648221344e-05, | |
| "loss": 0.0175, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 2.1782178217821784, | |
| "grad_norm": 0.04718014970421791, | |
| "learning_rate": 8.695652173913044e-05, | |
| "loss": 0.0201, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.1881188118811883, | |
| "grad_norm": 0.06612230092287064, | |
| "learning_rate": 8.735177865612649e-05, | |
| "loss": 0.0317, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 2.198019801980198, | |
| "grad_norm": 0.05043951794505119, | |
| "learning_rate": 8.774703557312254e-05, | |
| "loss": 0.02, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 2.207920792079208, | |
| "grad_norm": 0.03858309984207153, | |
| "learning_rate": 8.814229249011859e-05, | |
| "loss": 0.0142, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 2.217821782178218, | |
| "grad_norm": 0.06211186945438385, | |
| "learning_rate": 8.853754940711463e-05, | |
| "loss": 0.0398, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 2.227722772277228, | |
| "grad_norm": 0.04007279500365257, | |
| "learning_rate": 8.893280632411068e-05, | |
| "loss": 0.0152, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 2.227722772277228, | |
| "eval_loss": 0.021625498309731483, | |
| "eval_runtime": 21.7529, | |
| "eval_samples_per_second": 4.597, | |
| "eval_steps_per_second": 0.138, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 2.237623762376238, | |
| "grad_norm": 0.042071305215358734, | |
| "learning_rate": 8.932806324110672e-05, | |
| "loss": 0.0153, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 2.2475247524752477, | |
| "grad_norm": 0.04820103570818901, | |
| "learning_rate": 8.972332015810277e-05, | |
| "loss": 0.0143, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 2.2574257425742577, | |
| "grad_norm": 0.047800008207559586, | |
| "learning_rate": 9.011857707509881e-05, | |
| "loss": 0.0144, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 2.2673267326732676, | |
| "grad_norm": 0.03672846406698227, | |
| "learning_rate": 9.051383399209486e-05, | |
| "loss": 0.0204, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 2.2772277227722775, | |
| "grad_norm": 0.044539038091897964, | |
| "learning_rate": 9.090909090909092e-05, | |
| "loss": 0.0241, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.287128712871287, | |
| "grad_norm": 0.03706245496869087, | |
| "learning_rate": 9.130434782608696e-05, | |
| "loss": 0.0189, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 2.297029702970297, | |
| "grad_norm": 0.038041867315769196, | |
| "learning_rate": 9.169960474308301e-05, | |
| "loss": 0.0168, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 2.3069306930693068, | |
| "grad_norm": 0.05145088583230972, | |
| "learning_rate": 9.209486166007905e-05, | |
| "loss": 0.0173, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 2.3168316831683167, | |
| "grad_norm": 0.03916524723172188, | |
| "learning_rate": 9.24901185770751e-05, | |
| "loss": 0.0186, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 2.3267326732673266, | |
| "grad_norm": 0.03896910697221756, | |
| "learning_rate": 9.288537549407114e-05, | |
| "loss": 0.0166, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 2.3366336633663365, | |
| "grad_norm": 0.04472080245614052, | |
| "learning_rate": 9.32806324110672e-05, | |
| "loss": 0.0258, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 2.3465346534653464, | |
| "grad_norm": 0.0344708114862442, | |
| "learning_rate": 9.367588932806325e-05, | |
| "loss": 0.0165, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 2.3564356435643563, | |
| "grad_norm": 0.03994801267981529, | |
| "learning_rate": 9.407114624505929e-05, | |
| "loss": 0.0288, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 2.366336633663366, | |
| "grad_norm": 0.03858465701341629, | |
| "learning_rate": 9.446640316205534e-05, | |
| "loss": 0.0216, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 2.376237623762376, | |
| "grad_norm": 0.0324896015226841, | |
| "learning_rate": 9.486166007905138e-05, | |
| "loss": 0.0133, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.386138613861386, | |
| "grad_norm": 0.04445994645357132, | |
| "learning_rate": 9.525691699604744e-05, | |
| "loss": 0.026, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 2.396039603960396, | |
| "grad_norm": 0.046353645622730255, | |
| "learning_rate": 9.565217391304348e-05, | |
| "loss": 0.0218, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 2.405940594059406, | |
| "grad_norm": 0.03423113003373146, | |
| "learning_rate": 9.604743083003953e-05, | |
| "loss": 0.0099, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 2.4158415841584158, | |
| "grad_norm": 0.04123309999704361, | |
| "learning_rate": 9.644268774703557e-05, | |
| "loss": 0.0202, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 2.4257425742574257, | |
| "grad_norm": 0.039391182363033295, | |
| "learning_rate": 9.683794466403162e-05, | |
| "loss": 0.0203, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 2.4356435643564356, | |
| "grad_norm": 0.045452989637851715, | |
| "learning_rate": 9.723320158102768e-05, | |
| "loss": 0.0299, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 2.4455445544554455, | |
| "grad_norm": 0.04196856915950775, | |
| "learning_rate": 9.762845849802372e-05, | |
| "loss": 0.0209, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 2.4554455445544554, | |
| "grad_norm": 0.0465984083712101, | |
| "learning_rate": 9.802371541501977e-05, | |
| "loss": 0.0264, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 2.4653465346534653, | |
| "grad_norm": 0.036751143634319305, | |
| "learning_rate": 9.841897233201581e-05, | |
| "loss": 0.0132, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 2.4752475247524752, | |
| "grad_norm": 0.03972426801919937, | |
| "learning_rate": 9.881422924901186e-05, | |
| "loss": 0.011, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.4752475247524752, | |
| "eval_loss": 0.02048182487487793, | |
| "eval_runtime": 20.9235, | |
| "eval_samples_per_second": 4.779, | |
| "eval_steps_per_second": 0.143, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.485148514851485, | |
| "grad_norm": 0.03993945196270943, | |
| "learning_rate": 9.920948616600791e-05, | |
| "loss": 0.0112, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 2.495049504950495, | |
| "grad_norm": 0.03752398490905762, | |
| "learning_rate": 9.960474308300396e-05, | |
| "loss": 0.0172, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 2.504950495049505, | |
| "grad_norm": 0.031341757625341415, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0135, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 2.514851485148515, | |
| "grad_norm": 0.04680801182985306, | |
| "learning_rate": 9.999995220053555e-05, | |
| "loss": 0.0175, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 2.5247524752475248, | |
| "grad_norm": 0.04588630795478821, | |
| "learning_rate": 9.999980880223359e-05, | |
| "loss": 0.0305, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 2.5346534653465347, | |
| "grad_norm": 0.035120490938425064, | |
| "learning_rate": 9.999956980536828e-05, | |
| "loss": 0.0246, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 2.5445544554455446, | |
| "grad_norm": 0.041602976620197296, | |
| "learning_rate": 9.999923521039659e-05, | |
| "loss": 0.0109, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 2.5544554455445545, | |
| "grad_norm": 0.04591014236211777, | |
| "learning_rate": 9.999880501795826e-05, | |
| "loss": 0.0138, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 2.5643564356435644, | |
| "grad_norm": 0.030721504241228104, | |
| "learning_rate": 9.99982792288758e-05, | |
| "loss": 0.0099, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 2.5742574257425743, | |
| "grad_norm": 0.047804564237594604, | |
| "learning_rate": 9.999765784415451e-05, | |
| "loss": 0.0192, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.5841584158415842, | |
| "grad_norm": 0.04211296886205673, | |
| "learning_rate": 9.999694086498248e-05, | |
| "loss": 0.0211, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 2.594059405940594, | |
| "grad_norm": 0.05463152006268501, | |
| "learning_rate": 9.999612829273053e-05, | |
| "loss": 0.0125, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 2.603960396039604, | |
| "grad_norm": 0.03957533463835716, | |
| "learning_rate": 9.99952201289523e-05, | |
| "loss": 0.0237, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 2.613861386138614, | |
| "grad_norm": 0.04941708594560623, | |
| "learning_rate": 9.999421637538418e-05, | |
| "loss": 0.0156, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 2.623762376237624, | |
| "grad_norm": 0.035674627870321274, | |
| "learning_rate": 9.999311703394532e-05, | |
| "loss": 0.0175, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 2.633663366336634, | |
| "grad_norm": 0.04425996169447899, | |
| "learning_rate": 9.999192210673762e-05, | |
| "loss": 0.0301, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 2.6435643564356437, | |
| "grad_norm": 0.03343842178583145, | |
| "learning_rate": 9.999063159604579e-05, | |
| "loss": 0.0178, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 2.6534653465346536, | |
| "grad_norm": 0.028746318072080612, | |
| "learning_rate": 9.998924550433723e-05, | |
| "loss": 0.0161, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 2.6633663366336635, | |
| "grad_norm": 0.03598029911518097, | |
| "learning_rate": 9.998776383426216e-05, | |
| "loss": 0.024, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 2.6732673267326734, | |
| "grad_norm": 0.03354136273264885, | |
| "learning_rate": 9.998618658865344e-05, | |
| "loss": 0.0125, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.6831683168316833, | |
| "grad_norm": 0.03313228860497475, | |
| "learning_rate": 9.998451377052678e-05, | |
| "loss": 0.0214, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 2.693069306930693, | |
| "grad_norm": 0.04074874147772789, | |
| "learning_rate": 9.998274538308054e-05, | |
| "loss": 0.0182, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 2.7029702970297027, | |
| "grad_norm": 0.03895655646920204, | |
| "learning_rate": 9.998088142969586e-05, | |
| "loss": 0.0268, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 2.7128712871287126, | |
| "grad_norm": 0.029414329677820206, | |
| "learning_rate": 9.997892191393657e-05, | |
| "loss": 0.0088, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 2.7227722772277225, | |
| "grad_norm": 0.0380302369594574, | |
| "learning_rate": 9.997686683954923e-05, | |
| "loss": 0.0154, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 2.7227722772277225, | |
| "eval_loss": 0.019408825784921646, | |
| "eval_runtime": 20.9386, | |
| "eval_samples_per_second": 4.776, | |
| "eval_steps_per_second": 0.143, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 2.7326732673267324, | |
| "grad_norm": 0.04297591373324394, | |
| "learning_rate": 9.997471621046308e-05, | |
| "loss": 0.0172, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 2.7425742574257423, | |
| "grad_norm": 0.043008070439100266, | |
| "learning_rate": 9.997247003079008e-05, | |
| "loss": 0.0317, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 2.7524752475247523, | |
| "grad_norm": 0.03837353363633156, | |
| "learning_rate": 9.997012830482491e-05, | |
| "loss": 0.0246, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 2.762376237623762, | |
| "grad_norm": 0.03948510065674782, | |
| "learning_rate": 9.996769103704486e-05, | |
| "loss": 0.0183, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 2.772277227722772, | |
| "grad_norm": 0.029715241864323616, | |
| "learning_rate": 9.996515823210997e-05, | |
| "loss": 0.0219, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.782178217821782, | |
| "grad_norm": 0.031051289290189743, | |
| "learning_rate": 9.996252989486287e-05, | |
| "loss": 0.0143, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 2.792079207920792, | |
| "grad_norm": 0.03561589494347572, | |
| "learning_rate": 9.99598060303289e-05, | |
| "loss": 0.0178, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 2.801980198019802, | |
| "grad_norm": 0.03417595103383064, | |
| "learning_rate": 9.995698664371604e-05, | |
| "loss": 0.0114, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 2.8118811881188117, | |
| "grad_norm": 0.03924238681793213, | |
| "learning_rate": 9.995407174041489e-05, | |
| "loss": 0.019, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 2.8217821782178216, | |
| "grad_norm": 0.03989838808774948, | |
| "learning_rate": 9.995106132599869e-05, | |
| "loss": 0.0114, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 2.8316831683168315, | |
| "grad_norm": 0.030943676829338074, | |
| "learning_rate": 9.994795540622328e-05, | |
| "loss": 0.0121, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 2.8415841584158414, | |
| "grad_norm": 0.03534318134188652, | |
| "learning_rate": 9.99447539870271e-05, | |
| "loss": 0.0133, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 2.8514851485148514, | |
| "grad_norm": 0.03818822652101517, | |
| "learning_rate": 9.99414570745312e-05, | |
| "loss": 0.0198, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 2.8613861386138613, | |
| "grad_norm": 0.04481323063373566, | |
| "learning_rate": 9.993806467503923e-05, | |
| "loss": 0.0169, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 2.871287128712871, | |
| "grad_norm": 0.03227623179554939, | |
| "learning_rate": 9.993457679503737e-05, | |
| "loss": 0.0214, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.881188118811881, | |
| "grad_norm": 0.033491671085357666, | |
| "learning_rate": 9.993099344119437e-05, | |
| "loss": 0.0104, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 2.891089108910891, | |
| "grad_norm": 0.03478483110666275, | |
| "learning_rate": 9.992731462036152e-05, | |
| "loss": 0.0183, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 2.900990099009901, | |
| "grad_norm": 0.0348794087767601, | |
| "learning_rate": 9.992354033957266e-05, | |
| "loss": 0.0095, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 2.910891089108911, | |
| "grad_norm": 0.058163851499557495, | |
| "learning_rate": 9.991967060604413e-05, | |
| "loss": 0.0197, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 2.9207920792079207, | |
| "grad_norm": 0.05041566491127014, | |
| "learning_rate": 9.991570542717477e-05, | |
| "loss": 0.0219, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 2.9306930693069306, | |
| "grad_norm": 0.031357474625110626, | |
| "learning_rate": 9.991164481054591e-05, | |
| "loss": 0.0127, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 2.9405940594059405, | |
| "grad_norm": 0.052746862173080444, | |
| "learning_rate": 9.99074887639214e-05, | |
| "loss": 0.0161, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 2.9504950495049505, | |
| "grad_norm": 0.04254617169499397, | |
| "learning_rate": 9.990323729524747e-05, | |
| "loss": 0.019, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 2.9603960396039604, | |
| "grad_norm": 0.05648938938975334, | |
| "learning_rate": 9.989889041265286e-05, | |
| "loss": 0.0149, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 2.9702970297029703, | |
| "grad_norm": 0.0423521064221859, | |
| "learning_rate": 9.98944481244487e-05, | |
| "loss": 0.021, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.9702970297029703, | |
| "eval_loss": 0.019180113449692726, | |
| "eval_runtime": 20.9272, | |
| "eval_samples_per_second": 4.778, | |
| "eval_steps_per_second": 0.143, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.98019801980198, | |
| "grad_norm": 0.0332060270011425, | |
| "learning_rate": 9.988991043912857e-05, | |
| "loss": 0.01, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 2.99009900990099, | |
| "grad_norm": 0.05181276798248291, | |
| "learning_rate": 9.988527736536841e-05, | |
| "loss": 0.0326, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.037442926317453384, | |
| "learning_rate": 9.988054891202656e-05, | |
| "loss": 0.0052, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 3.00990099009901, | |
| "grad_norm": 0.032612621784210205, | |
| "learning_rate": 9.987572508814372e-05, | |
| "loss": 0.0093, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 3.01980198019802, | |
| "grad_norm": 0.040106579661369324, | |
| "learning_rate": 9.987080590294295e-05, | |
| "loss": 0.0293, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 3.0297029702970297, | |
| "grad_norm": 0.059703320264816284, | |
| "learning_rate": 9.986579136582963e-05, | |
| "loss": 0.0273, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 3.0396039603960396, | |
| "grad_norm": 0.045052703469991684, | |
| "learning_rate": 9.986068148639143e-05, | |
| "loss": 0.0079, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 3.0495049504950495, | |
| "grad_norm": 0.035572804510593414, | |
| "learning_rate": 9.985547627439835e-05, | |
| "loss": 0.0139, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 3.0594059405940595, | |
| "grad_norm": 0.04415634274482727, | |
| "learning_rate": 9.985017573980262e-05, | |
| "loss": 0.0094, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 3.0693069306930694, | |
| "grad_norm": 0.0477583073079586, | |
| "learning_rate": 9.984477989273876e-05, | |
| "loss": 0.0156, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 3.0792079207920793, | |
| "grad_norm": 0.03828883171081543, | |
| "learning_rate": 9.983928874352352e-05, | |
| "loss": 0.0069, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 3.089108910891089, | |
| "grad_norm": 0.04859181493520737, | |
| "learning_rate": 9.983370230265585e-05, | |
| "loss": 0.0234, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 3.099009900990099, | |
| "grad_norm": 0.03425335884094238, | |
| "learning_rate": 9.982802058081691e-05, | |
| "loss": 0.0133, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 3.108910891089109, | |
| "grad_norm": 0.03466329351067543, | |
| "learning_rate": 9.982224358887003e-05, | |
| "loss": 0.0225, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 3.118811881188119, | |
| "grad_norm": 0.040790338069200516, | |
| "learning_rate": 9.981637133786071e-05, | |
| "loss": 0.01, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 3.128712871287129, | |
| "grad_norm": 0.028947383165359497, | |
| "learning_rate": 9.981040383901652e-05, | |
| "loss": 0.0071, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 3.1386138613861387, | |
| "grad_norm": 0.03497043997049332, | |
| "learning_rate": 9.980434110374724e-05, | |
| "loss": 0.023, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 3.1485148514851486, | |
| "grad_norm": 0.03491488844156265, | |
| "learning_rate": 9.979818314364468e-05, | |
| "loss": 0.0118, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 3.1584158415841586, | |
| "grad_norm": 0.027231434360146523, | |
| "learning_rate": 9.979192997048271e-05, | |
| "loss": 0.0089, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 3.1683168316831685, | |
| "grad_norm": 0.030446205288171768, | |
| "learning_rate": 9.978558159621728e-05, | |
| "loss": 0.0135, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 3.1782178217821784, | |
| "grad_norm": 0.03138475492596626, | |
| "learning_rate": 9.977913803298633e-05, | |
| "loss": 0.012, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 3.1881188118811883, | |
| "grad_norm": 0.03834952786564827, | |
| "learning_rate": 9.977259929310985e-05, | |
| "loss": 0.0089, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 3.198019801980198, | |
| "grad_norm": 0.035104572772979736, | |
| "learning_rate": 9.97659653890897e-05, | |
| "loss": 0.0088, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 3.207920792079208, | |
| "grad_norm": 0.035660646855831146, | |
| "learning_rate": 9.975923633360985e-05, | |
| "loss": 0.022, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 3.217821782178218, | |
| "grad_norm": 0.04493353143334389, | |
| "learning_rate": 9.975241213953606e-05, | |
| "loss": 0.0282, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 3.217821782178218, | |
| "eval_loss": 0.018639273941516876, | |
| "eval_runtime": 20.9758, | |
| "eval_samples_per_second": 4.767, | |
| "eval_steps_per_second": 0.143, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 3.227722772277228, | |
| "grad_norm": 0.042079996317625046, | |
| "learning_rate": 9.974549281991603e-05, | |
| "loss": 0.0072, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 3.237623762376238, | |
| "grad_norm": 0.04238072782754898, | |
| "learning_rate": 9.973847838797939e-05, | |
| "loss": 0.0199, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 3.2475247524752477, | |
| "grad_norm": 0.03974232077598572, | |
| "learning_rate": 9.973136885713754e-05, | |
| "loss": 0.0132, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 3.2574257425742577, | |
| "grad_norm": 0.03601127117872238, | |
| "learning_rate": 9.972416424098379e-05, | |
| "loss": 0.0157, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 3.2673267326732676, | |
| "grad_norm": 0.03373201563954353, | |
| "learning_rate": 9.971686455329319e-05, | |
| "loss": 0.0165, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 3.2772277227722775, | |
| "grad_norm": 0.03458605706691742, | |
| "learning_rate": 9.970946980802261e-05, | |
| "loss": 0.0147, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 3.287128712871287, | |
| "grad_norm": 0.02960544265806675, | |
| "learning_rate": 9.970198001931062e-05, | |
| "loss": 0.0088, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 3.297029702970297, | |
| "grad_norm": 0.03317386284470558, | |
| "learning_rate": 9.969439520147754e-05, | |
| "loss": 0.0169, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 3.3069306930693068, | |
| "grad_norm": 0.037376001477241516, | |
| "learning_rate": 9.968671536902539e-05, | |
| "loss": 0.0176, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 3.3168316831683167, | |
| "grad_norm": 0.03728969767689705, | |
| "learning_rate": 9.967894053663782e-05, | |
| "loss": 0.0128, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 3.3267326732673266, | |
| "grad_norm": 0.033844251185655594, | |
| "learning_rate": 9.967107071918019e-05, | |
| "loss": 0.0124, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 3.3366336633663365, | |
| "grad_norm": 0.04100586473941803, | |
| "learning_rate": 9.966310593169936e-05, | |
| "loss": 0.0266, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 3.3465346534653464, | |
| "grad_norm": 0.03216198459267616, | |
| "learning_rate": 9.965504618942389e-05, | |
| "loss": 0.0078, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 3.3564356435643563, | |
| "grad_norm": 0.03028401918709278, | |
| "learning_rate": 9.96468915077638e-05, | |
| "loss": 0.0163, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 3.366336633663366, | |
| "grad_norm": 0.0313386544585228, | |
| "learning_rate": 9.963864190231067e-05, | |
| "loss": 0.0212, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 3.376237623762376, | |
| "grad_norm": 0.04166916385293007, | |
| "learning_rate": 9.96302973888376e-05, | |
| "loss": 0.0315, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 3.386138613861386, | |
| "grad_norm": 0.028147250413894653, | |
| "learning_rate": 9.962185798329909e-05, | |
| "loss": 0.0111, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 3.396039603960396, | |
| "grad_norm": 0.028218761086463928, | |
| "learning_rate": 9.96133237018311e-05, | |
| "loss": 0.0107, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 3.405940594059406, | |
| "grad_norm": 0.03172610327601433, | |
| "learning_rate": 9.960469456075099e-05, | |
| "loss": 0.0166, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 3.4158415841584158, | |
| "grad_norm": 0.0334465391933918, | |
| "learning_rate": 9.959597057655753e-05, | |
| "loss": 0.0215, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 3.4257425742574257, | |
| "grad_norm": 0.036227963864803314, | |
| "learning_rate": 9.958715176593076e-05, | |
| "loss": 0.0139, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 3.4356435643564356, | |
| "grad_norm": 0.03319300711154938, | |
| "learning_rate": 9.957823814573206e-05, | |
| "loss": 0.0096, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 3.4455445544554455, | |
| "grad_norm": 0.02689778245985508, | |
| "learning_rate": 9.956922973300409e-05, | |
| "loss": 0.0129, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 3.4554455445544554, | |
| "grad_norm": 0.030777866020798683, | |
| "learning_rate": 9.956012654497074e-05, | |
| "loss": 0.016, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 3.4653465346534653, | |
| "grad_norm": 0.03275037929415703, | |
| "learning_rate": 9.95509285990371e-05, | |
| "loss": 0.007, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 3.4653465346534653, | |
| "eval_loss": 0.018103841692209244, | |
| "eval_runtime": 20.9432, | |
| "eval_samples_per_second": 4.775, | |
| "eval_steps_per_second": 0.143, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 3.4752475247524752, | |
| "grad_norm": 0.0414116345345974, | |
| "learning_rate": 9.954163591278945e-05, | |
| "loss": 0.0322, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 3.485148514851485, | |
| "grad_norm": 0.039487581700086594, | |
| "learning_rate": 9.953224850399524e-05, | |
| "loss": 0.0091, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 3.495049504950495, | |
| "grad_norm": 0.04589547961950302, | |
| "learning_rate": 9.952276639060292e-05, | |
| "loss": 0.0298, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 3.504950495049505, | |
| "grad_norm": 0.038935884833335876, | |
| "learning_rate": 9.951318959074216e-05, | |
| "loss": 0.009, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 3.514851485148515, | |
| "grad_norm": 0.035445939749479294, | |
| "learning_rate": 9.950351812272356e-05, | |
| "loss": 0.0081, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 3.5247524752475248, | |
| "grad_norm": 0.04069902002811432, | |
| "learning_rate": 9.949375200503877e-05, | |
| "loss": 0.0086, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 3.5346534653465347, | |
| "grad_norm": 0.046185996383428574, | |
| "learning_rate": 9.948389125636039e-05, | |
| "loss": 0.0201, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 3.5445544554455446, | |
| "grad_norm": 0.02967759221792221, | |
| "learning_rate": 9.947393589554197e-05, | |
| "loss": 0.015, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 3.5544554455445545, | |
| "grad_norm": 0.02880985476076603, | |
| "learning_rate": 9.946388594161795e-05, | |
| "loss": 0.0081, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 3.5643564356435644, | |
| "grad_norm": 0.03159020096063614, | |
| "learning_rate": 9.945374141380361e-05, | |
| "loss": 0.0153, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 3.5742574257425743, | |
| "grad_norm": 0.034859638661146164, | |
| "learning_rate": 9.944350233149509e-05, | |
| "loss": 0.0128, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 3.5841584158415842, | |
| "grad_norm": 0.04179144650697708, | |
| "learning_rate": 9.943316871426929e-05, | |
| "loss": 0.0368, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 3.594059405940594, | |
| "grad_norm": 0.03368615359067917, | |
| "learning_rate": 9.942274058188383e-05, | |
| "loss": 0.0245, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 3.603960396039604, | |
| "grad_norm": 0.03103993646800518, | |
| "learning_rate": 9.941221795427713e-05, | |
| "loss": 0.0119, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 3.613861386138614, | |
| "grad_norm": 0.024007977917790413, | |
| "learning_rate": 9.94016008515682e-05, | |
| "loss": 0.007, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 3.623762376237624, | |
| "grad_norm": 0.031901855021715164, | |
| "learning_rate": 9.939088929405674e-05, | |
| "loss": 0.0124, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 3.633663366336634, | |
| "grad_norm": 0.026758648455142975, | |
| "learning_rate": 9.938008330222296e-05, | |
| "loss": 0.0089, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 3.6435643564356437, | |
| "grad_norm": 0.0313340462744236, | |
| "learning_rate": 9.936918289672774e-05, | |
| "loss": 0.0101, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 3.6534653465346536, | |
| "grad_norm": 0.028882941231131554, | |
| "learning_rate": 9.93581880984124e-05, | |
| "loss": 0.0093, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 3.6633663366336635, | |
| "grad_norm": 0.033211950212717056, | |
| "learning_rate": 9.934709892829875e-05, | |
| "loss": 0.0143, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 3.6732673267326734, | |
| "grad_norm": 0.04559667780995369, | |
| "learning_rate": 9.933591540758907e-05, | |
| "loss": 0.0403, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 3.6831683168316833, | |
| "grad_norm": 0.0350009985268116, | |
| "learning_rate": 9.932463755766599e-05, | |
| "loss": 0.0297, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 3.693069306930693, | |
| "grad_norm": 0.03305431827902794, | |
| "learning_rate": 9.931326540009253e-05, | |
| "loss": 0.0228, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 3.7029702970297027, | |
| "grad_norm": 0.025855455547571182, | |
| "learning_rate": 9.930179895661201e-05, | |
| "loss": 0.007, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 3.7128712871287126, | |
| "grad_norm": 0.03375259414315224, | |
| "learning_rate": 9.929023824914802e-05, | |
| "loss": 0.017, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 3.7128712871287126, | |
| "eval_loss": 0.018772218376398087, | |
| "eval_runtime": 20.9276, | |
| "eval_samples_per_second": 4.778, | |
| "eval_steps_per_second": 0.143, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 3.7227722772277225, | |
| "grad_norm": 0.031592901796102524, | |
| "learning_rate": 9.927858329980439e-05, | |
| "loss": 0.0177, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 3.7326732673267324, | |
| "grad_norm": 0.03559165447950363, | |
| "learning_rate": 9.926683413086514e-05, | |
| "loss": 0.0229, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 3.7425742574257423, | |
| "grad_norm": 0.03730347007513046, | |
| "learning_rate": 9.925499076479441e-05, | |
| "loss": 0.0169, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 3.7524752475247523, | |
| "grad_norm": 0.04660610854625702, | |
| "learning_rate": 9.924305322423649e-05, | |
| "loss": 0.034, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 3.762376237623762, | |
| "grad_norm": 0.023445578292012215, | |
| "learning_rate": 9.923102153201566e-05, | |
| "loss": 0.0098, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.772277227722772, | |
| "grad_norm": 0.026974381878972054, | |
| "learning_rate": 9.921889571113628e-05, | |
| "loss": 0.0123, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 3.782178217821782, | |
| "grad_norm": 0.029063357040286064, | |
| "learning_rate": 9.920667578478268e-05, | |
| "loss": 0.0202, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 3.792079207920792, | |
| "grad_norm": 0.04441928118467331, | |
| "learning_rate": 9.919436177631907e-05, | |
| "loss": 0.0096, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 3.801980198019802, | |
| "grad_norm": 0.03253911808133125, | |
| "learning_rate": 9.918195370928957e-05, | |
| "loss": 0.017, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 3.8118811881188117, | |
| "grad_norm": 0.0321219228208065, | |
| "learning_rate": 9.916945160741817e-05, | |
| "loss": 0.007, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 3.8217821782178216, | |
| "grad_norm": 0.029211685061454773, | |
| "learning_rate": 9.915685549460861e-05, | |
| "loss": 0.0091, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 3.8316831683168315, | |
| "grad_norm": 0.028232336044311523, | |
| "learning_rate": 9.914416539494435e-05, | |
| "loss": 0.009, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 3.8415841584158414, | |
| "grad_norm": 0.03041473776102066, | |
| "learning_rate": 9.913138133268862e-05, | |
| "loss": 0.0082, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 3.8514851485148514, | |
| "grad_norm": 0.026392994448542595, | |
| "learning_rate": 9.911850333228427e-05, | |
| "loss": 0.0159, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 3.8613861386138613, | |
| "grad_norm": 0.027819879353046417, | |
| "learning_rate": 9.910553141835376e-05, | |
| "loss": 0.0179, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.871287128712871, | |
| "grad_norm": 0.032292746007442474, | |
| "learning_rate": 9.909246561569912e-05, | |
| "loss": 0.0192, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 3.881188118811881, | |
| "grad_norm": 0.02765706181526184, | |
| "learning_rate": 9.907930594930185e-05, | |
| "loss": 0.0233, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 3.891089108910891, | |
| "grad_norm": 0.027257481589913368, | |
| "learning_rate": 9.9066052444323e-05, | |
| "loss": 0.012, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 3.900990099009901, | |
| "grad_norm": 0.03747186064720154, | |
| "learning_rate": 9.905270512610296e-05, | |
| "loss": 0.0279, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 3.910891089108911, | |
| "grad_norm": 0.022682279348373413, | |
| "learning_rate": 9.903926402016153e-05, | |
| "loss": 0.0069, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 3.9207920792079207, | |
| "grad_norm": 0.02796732820570469, | |
| "learning_rate": 9.902572915219779e-05, | |
| "loss": 0.0073, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 3.9306930693069306, | |
| "grad_norm": 0.034723684191703796, | |
| "learning_rate": 9.901210054809015e-05, | |
| "loss": 0.0181, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 3.9405940594059405, | |
| "grad_norm": 0.033110056072473526, | |
| "learning_rate": 9.899837823389618e-05, | |
| "loss": 0.0078, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 3.9504950495049505, | |
| "grad_norm": 0.03528871014714241, | |
| "learning_rate": 9.898456223585267e-05, | |
| "loss": 0.0132, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 3.9603960396039604, | |
| "grad_norm": 0.056491754949092865, | |
| "learning_rate": 9.897065258037552e-05, | |
| "loss": 0.0315, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.9603960396039604, | |
| "eval_loss": 0.018521126359701157, | |
| "eval_runtime": 20.9582, | |
| "eval_samples_per_second": 4.771, | |
| "eval_steps_per_second": 0.143, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.9702970297029703, | |
| "grad_norm": 0.03693537786602974, | |
| "learning_rate": 9.895664929405966e-05, | |
| "loss": 0.0164, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 3.98019801980198, | |
| "grad_norm": 0.034034907817840576, | |
| "learning_rate": 9.89425524036791e-05, | |
| "loss": 0.015, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 3.99009900990099, | |
| "grad_norm": 0.03214896842837334, | |
| "learning_rate": 9.892836193618679e-05, | |
| "loss": 0.012, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.049190979450941086, | |
| "learning_rate": 9.89140779187146e-05, | |
| "loss": 0.009, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 4.00990099009901, | |
| "grad_norm": 0.02638743631541729, | |
| "learning_rate": 9.889970037857324e-05, | |
| "loss": 0.0169, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 4.01980198019802, | |
| "grad_norm": 0.02798108384013176, | |
| "learning_rate": 9.88852293432523e-05, | |
| "loss": 0.0201, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 4.02970297029703, | |
| "grad_norm": 0.02562817744910717, | |
| "learning_rate": 9.887066484042007e-05, | |
| "loss": 0.0134, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 4.03960396039604, | |
| "grad_norm": 0.025396671146154404, | |
| "learning_rate": 9.885600689792356e-05, | |
| "loss": 0.0063, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 4.0495049504950495, | |
| "grad_norm": 0.024238383397459984, | |
| "learning_rate": 9.884125554378845e-05, | |
| "loss": 0.0112, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 4.0594059405940595, | |
| "grad_norm": 0.035111505538225174, | |
| "learning_rate": 9.882641080621902e-05, | |
| "loss": 0.0265, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 4.069306930693069, | |
| "grad_norm": 0.03213327378034592, | |
| "learning_rate": 9.881147271359807e-05, | |
| "loss": 0.0141, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 4.079207920792079, | |
| "grad_norm": 0.03519801050424576, | |
| "learning_rate": 9.879644129448694e-05, | |
| "loss": 0.0072, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 4.089108910891089, | |
| "grad_norm": 0.03529242426156998, | |
| "learning_rate": 9.878131657762535e-05, | |
| "loss": 0.0112, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 4.099009900990099, | |
| "grad_norm": 0.023639151826500893, | |
| "learning_rate": 9.876609859193145e-05, | |
| "loss": 0.0057, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 4.108910891089109, | |
| "grad_norm": 0.02800002135336399, | |
| "learning_rate": 9.875078736650172e-05, | |
| "loss": 0.0183, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 4.118811881188119, | |
| "grad_norm": 0.02918158285319805, | |
| "learning_rate": 9.873538293061087e-05, | |
| "loss": 0.0148, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 4.128712871287129, | |
| "grad_norm": 0.03694870322942734, | |
| "learning_rate": 9.871988531371185e-05, | |
| "loss": 0.006, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 4.138613861386139, | |
| "grad_norm": 0.03968213126063347, | |
| "learning_rate": 9.870429454543582e-05, | |
| "loss": 0.0083, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 4.148514851485149, | |
| "grad_norm": 0.036222394555807114, | |
| "learning_rate": 9.868861065559192e-05, | |
| "loss": 0.0157, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 4.158415841584159, | |
| "grad_norm": 0.03854987397789955, | |
| "learning_rate": 9.867283367416747e-05, | |
| "loss": 0.0066, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 4.1683168316831685, | |
| "grad_norm": 0.03001275099813938, | |
| "learning_rate": 9.865696363132769e-05, | |
| "loss": 0.0144, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 4.178217821782178, | |
| "grad_norm": 0.023233724758028984, | |
| "learning_rate": 9.864100055741576e-05, | |
| "loss": 0.005, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 4.188118811881188, | |
| "grad_norm": 0.027578100562095642, | |
| "learning_rate": 9.862494448295277e-05, | |
| "loss": 0.0066, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 4.198019801980198, | |
| "grad_norm": 0.03242870792746544, | |
| "learning_rate": 9.860879543863755e-05, | |
| "loss": 0.0182, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 4.207920792079208, | |
| "grad_norm": 0.02438744716346264, | |
| "learning_rate": 9.859255345534675e-05, | |
| "loss": 0.0156, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 4.207920792079208, | |
| "eval_loss": 0.019263744354248047, | |
| "eval_runtime": 20.9857, | |
| "eval_samples_per_second": 4.765, | |
| "eval_steps_per_second": 0.143, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 4.217821782178218, | |
| "grad_norm": 0.027921592816710472, | |
| "learning_rate": 9.857621856413469e-05, | |
| "loss": 0.0052, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 4.227722772277228, | |
| "grad_norm": 0.031431883573532104, | |
| "learning_rate": 9.855979079623332e-05, | |
| "loss": 0.0118, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 4.237623762376238, | |
| "grad_norm": 0.03149943798780441, | |
| "learning_rate": 9.85432701830522e-05, | |
| "loss": 0.0093, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 4.247524752475248, | |
| "grad_norm": 0.029634475708007812, | |
| "learning_rate": 9.852665675617837e-05, | |
| "loss": 0.0174, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 4.257425742574258, | |
| "grad_norm": 0.03414541855454445, | |
| "learning_rate": 9.850995054737637e-05, | |
| "loss": 0.0062, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 4.267326732673268, | |
| "grad_norm": 0.0337190143764019, | |
| "learning_rate": 9.849315158858807e-05, | |
| "loss": 0.0067, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 4.2772277227722775, | |
| "grad_norm": 0.050311364233493805, | |
| "learning_rate": 9.847625991193277e-05, | |
| "loss": 0.019, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 4.287128712871287, | |
| "grad_norm": 0.03678404539823532, | |
| "learning_rate": 9.845927554970698e-05, | |
| "loss": 0.0275, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 4.297029702970297, | |
| "grad_norm": 0.03602934628725052, | |
| "learning_rate": 9.84421985343844e-05, | |
| "loss": 0.0263, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 4.306930693069307, | |
| "grad_norm": 0.0423722043633461, | |
| "learning_rate": 9.842502889861596e-05, | |
| "loss": 0.0063, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 4.316831683168317, | |
| "grad_norm": 0.026815395802259445, | |
| "learning_rate": 9.840776667522962e-05, | |
| "loss": 0.0104, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 4.326732673267327, | |
| "grad_norm": 0.04021260514855385, | |
| "learning_rate": 9.839041189723039e-05, | |
| "loss": 0.0113, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 4.336633663366337, | |
| "grad_norm": 0.03357745334506035, | |
| "learning_rate": 9.837296459780022e-05, | |
| "loss": 0.0277, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 4.346534653465347, | |
| "grad_norm": 0.03575926274061203, | |
| "learning_rate": 9.835542481029798e-05, | |
| "loss": 0.014, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 4.356435643564357, | |
| "grad_norm": 0.03378582373261452, | |
| "learning_rate": 9.833779256825937e-05, | |
| "loss": 0.014, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 4.366336633663367, | |
| "grad_norm": 0.029247019439935684, | |
| "learning_rate": 9.832006790539685e-05, | |
| "loss": 0.0075, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 4.376237623762377, | |
| "grad_norm": 0.0354682132601738, | |
| "learning_rate": 9.830225085559961e-05, | |
| "loss": 0.0106, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 4.3861386138613865, | |
| "grad_norm": 0.04129399359226227, | |
| "learning_rate": 9.828434145293346e-05, | |
| "loss": 0.02, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 4.396039603960396, | |
| "grad_norm": 0.026085088029503822, | |
| "learning_rate": 9.826633973164079e-05, | |
| "loss": 0.0122, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 4.405940594059406, | |
| "grad_norm": 0.038606371730566025, | |
| "learning_rate": 9.824824572614051e-05, | |
| "loss": 0.0134, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 4.415841584158416, | |
| "grad_norm": 0.03461376577615738, | |
| "learning_rate": 9.823005947102797e-05, | |
| "loss": 0.0056, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 4.425742574257426, | |
| "grad_norm": 0.029777944087982178, | |
| "learning_rate": 9.821178100107489e-05, | |
| "loss": 0.0211, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 4.435643564356436, | |
| "grad_norm": 0.025174397975206375, | |
| "learning_rate": 9.819341035122933e-05, | |
| "loss": 0.0065, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 4.445544554455446, | |
| "grad_norm": 0.02835717424750328, | |
| "learning_rate": 9.817494755661558e-05, | |
| "loss": 0.0164, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 4.455445544554456, | |
| "grad_norm": 0.02822643145918846, | |
| "learning_rate": 9.815639265253409e-05, | |
| "loss": 0.0059, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 4.455445544554456, | |
| "eval_loss": 0.01967804692685604, | |
| "eval_runtime": 20.9324, | |
| "eval_samples_per_second": 4.777, | |
| "eval_steps_per_second": 0.143, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 4.465346534653466, | |
| "grad_norm": 0.03281950205564499, | |
| "learning_rate": 9.813774567446145e-05, | |
| "loss": 0.0064, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 4.475247524752476, | |
| "grad_norm": 0.027459366247057915, | |
| "learning_rate": 9.811900665805029e-05, | |
| "loss": 0.0167, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 4.485148514851485, | |
| "grad_norm": 0.03079644776880741, | |
| "learning_rate": 9.81001756391292e-05, | |
| "loss": 0.0062, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 4.4950495049504955, | |
| "grad_norm": 0.02574409544467926, | |
| "learning_rate": 9.808125265370269e-05, | |
| "loss": 0.0082, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 4.5049504950495045, | |
| "grad_norm": 0.04239686205983162, | |
| "learning_rate": 9.806223773795108e-05, | |
| "loss": 0.0083, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 4.514851485148515, | |
| "grad_norm": 0.027789965271949768, | |
| "learning_rate": 9.804313092823049e-05, | |
| "loss": 0.0131, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 4.524752475247524, | |
| "grad_norm": 0.024712661281228065, | |
| "learning_rate": 9.802393226107278e-05, | |
| "loss": 0.0086, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 4.534653465346535, | |
| "grad_norm": 0.028639836236834526, | |
| "learning_rate": 9.800464177318531e-05, | |
| "loss": 0.0131, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 4.544554455445544, | |
| "grad_norm": 0.023141171783208847, | |
| "learning_rate": 9.798525950145115e-05, | |
| "loss": 0.0056, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 4.554455445544555, | |
| "grad_norm": 0.029834497720003128, | |
| "learning_rate": 9.796578548292874e-05, | |
| "loss": 0.0113, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 4.564356435643564, | |
| "grad_norm": 0.029348380863666534, | |
| "learning_rate": 9.794621975485201e-05, | |
| "loss": 0.0081, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 4.574257425742574, | |
| "grad_norm": 0.029745472595095634, | |
| "learning_rate": 9.79265623546302e-05, | |
| "loss": 0.0091, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 4.584158415841584, | |
| "grad_norm": 0.04096932336688042, | |
| "learning_rate": 9.790681331984785e-05, | |
| "loss": 0.0217, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 4.594059405940594, | |
| "grad_norm": 0.037326063960790634, | |
| "learning_rate": 9.78869726882647e-05, | |
| "loss": 0.0067, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 4.603960396039604, | |
| "grad_norm": 0.029236366972327232, | |
| "learning_rate": 9.786704049781558e-05, | |
| "loss": 0.017, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 4.6138613861386135, | |
| "grad_norm": 0.027106214314699173, | |
| "learning_rate": 9.784701678661045e-05, | |
| "loss": 0.006, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 4.623762376237623, | |
| "grad_norm": 0.041205085813999176, | |
| "learning_rate": 9.782690159293419e-05, | |
| "loss": 0.0214, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 4.633663366336633, | |
| "grad_norm": 0.02912343665957451, | |
| "learning_rate": 9.780669495524661e-05, | |
| "loss": 0.0068, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 4.643564356435643, | |
| "grad_norm": 0.02972271479666233, | |
| "learning_rate": 9.77863969121824e-05, | |
| "loss": 0.0132, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 4.653465346534653, | |
| "grad_norm": 0.03316974267363548, | |
| "learning_rate": 9.776600750255099e-05, | |
| "loss": 0.0104, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 4.663366336633663, | |
| "grad_norm": 0.025208162143826485, | |
| "learning_rate": 9.774552676533644e-05, | |
| "loss": 0.019, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 4.673267326732673, | |
| "grad_norm": 0.029168223962187767, | |
| "learning_rate": 9.772495473969751e-05, | |
| "loss": 0.0185, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 4.683168316831683, | |
| "grad_norm": 0.027234015986323357, | |
| "learning_rate": 9.77042914649675e-05, | |
| "loss": 0.013, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 4.693069306930693, | |
| "grad_norm": 0.030596833676099777, | |
| "learning_rate": 9.768353698065412e-05, | |
| "loss": 0.0111, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 4.702970297029703, | |
| "grad_norm": 0.027127567678689957, | |
| "learning_rate": 9.766269132643951e-05, | |
| "loss": 0.0136, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 4.702970297029703, | |
| "eval_loss": 0.019799688830971718, | |
| "eval_runtime": 20.9596, | |
| "eval_samples_per_second": 4.771, | |
| "eval_steps_per_second": 0.143, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 4.712871287128713, | |
| "grad_norm": 0.031133603304624557, | |
| "learning_rate": 9.76417545421801e-05, | |
| "loss": 0.0163, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 4.7227722772277225, | |
| "grad_norm": 0.034247223287820816, | |
| "learning_rate": 9.762072666790658e-05, | |
| "loss": 0.0245, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 4.732673267326732, | |
| "grad_norm": 0.0344618484377861, | |
| "learning_rate": 9.75996077438238e-05, | |
| "loss": 0.0142, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 4.742574257425742, | |
| "grad_norm": 0.03324158489704132, | |
| "learning_rate": 9.757839781031069e-05, | |
| "loss": 0.0114, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 4.752475247524752, | |
| "grad_norm": 0.025920778512954712, | |
| "learning_rate": 9.755709690792017e-05, | |
| "loss": 0.0095, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 4.762376237623762, | |
| "grad_norm": 0.03353674337267876, | |
| "learning_rate": 9.753570507737914e-05, | |
| "loss": 0.0223, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 4.772277227722772, | |
| "grad_norm": 0.035666413605213165, | |
| "learning_rate": 9.751422235958829e-05, | |
| "loss": 0.0196, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 4.782178217821782, | |
| "grad_norm": 0.03323884680867195, | |
| "learning_rate": 9.749264879562216e-05, | |
| "loss": 0.0275, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 4.792079207920792, | |
| "grad_norm": 0.024832090362906456, | |
| "learning_rate": 9.74709844267289e-05, | |
| "loss": 0.0197, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 4.801980198019802, | |
| "grad_norm": 0.025249451398849487, | |
| "learning_rate": 9.744922929433033e-05, | |
| "loss": 0.018, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 4.811881188118812, | |
| "grad_norm": 0.027032941579818726, | |
| "learning_rate": 9.74273834400218e-05, | |
| "loss": 0.0185, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 4.821782178217822, | |
| "grad_norm": 0.03405793011188507, | |
| "learning_rate": 9.740544690557213e-05, | |
| "loss": 0.0284, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 4.8316831683168315, | |
| "grad_norm": 0.033517688512802124, | |
| "learning_rate": 9.738341973292349e-05, | |
| "loss": 0.0319, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 4.841584158415841, | |
| "grad_norm": 0.027846721932291985, | |
| "learning_rate": 9.736130196419135e-05, | |
| "loss": 0.0139, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 4.851485148514851, | |
| "grad_norm": 0.027481894940137863, | |
| "learning_rate": 9.733909364166442e-05, | |
| "loss": 0.0078, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 4.861386138613861, | |
| "grad_norm": 0.02963847480714321, | |
| "learning_rate": 9.731679480780456e-05, | |
| "loss": 0.0175, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 4.871287128712871, | |
| "grad_norm": 0.022556889802217484, | |
| "learning_rate": 9.729440550524664e-05, | |
| "loss": 0.0065, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 4.881188118811881, | |
| "grad_norm": 0.030938800424337387, | |
| "learning_rate": 9.727192577679851e-05, | |
| "loss": 0.0126, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 4.891089108910891, | |
| "grad_norm": 0.05100259929895401, | |
| "learning_rate": 9.724935566544098e-05, | |
| "loss": 0.0365, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 4.900990099009901, | |
| "grad_norm": 0.03517129644751549, | |
| "learning_rate": 9.722669521432757e-05, | |
| "loss": 0.0219, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 4.910891089108911, | |
| "grad_norm": 0.02679610438644886, | |
| "learning_rate": 9.720394446678458e-05, | |
| "loss": 0.0147, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 4.920792079207921, | |
| "grad_norm": 0.027280865237116814, | |
| "learning_rate": 9.718110346631099e-05, | |
| "loss": 0.0152, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 4.930693069306931, | |
| "grad_norm": 0.029038678854703903, | |
| "learning_rate": 9.715817225657827e-05, | |
| "loss": 0.0117, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 4.9405940594059405, | |
| "grad_norm": 0.02707214094698429, | |
| "learning_rate": 9.713515088143042e-05, | |
| "loss": 0.0099, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 4.9504950495049505, | |
| "grad_norm": 0.028160203248262405, | |
| "learning_rate": 9.71120393848838e-05, | |
| "loss": 0.0092, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 4.9504950495049505, | |
| "eval_loss": 0.021699432283639908, | |
| "eval_runtime": 20.9339, | |
| "eval_samples_per_second": 4.777, | |
| "eval_steps_per_second": 0.143, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 4.96039603960396, | |
| "grad_norm": 0.028015898540616035, | |
| "learning_rate": 9.70888378111271e-05, | |
| "loss": 0.0138, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 4.97029702970297, | |
| "grad_norm": 0.02438513934612274, | |
| "learning_rate": 9.706554620452125e-05, | |
| "loss": 0.0065, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 4.98019801980198, | |
| "grad_norm": 0.03049881011247635, | |
| "learning_rate": 9.704216460959929e-05, | |
| "loss": 0.013, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 4.99009900990099, | |
| "grad_norm": 0.02331537939608097, | |
| "learning_rate": 9.701869307106633e-05, | |
| "loss": 0.0208, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.11598920822143555, | |
| "learning_rate": 9.699513163379943e-05, | |
| "loss": 0.0146, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 5.00990099009901, | |
| "grad_norm": 0.019882583990693092, | |
| "learning_rate": 9.697148034284759e-05, | |
| "loss": 0.0117, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 5.01980198019802, | |
| "grad_norm": 0.03078438714146614, | |
| "learning_rate": 9.694773924343154e-05, | |
| "loss": 0.0052, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 5.02970297029703, | |
| "grad_norm": 0.02752445451915264, | |
| "learning_rate": 9.692390838094377e-05, | |
| "loss": 0.0121, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 5.03960396039604, | |
| "grad_norm": 0.027121013030409813, | |
| "learning_rate": 9.689998780094837e-05, | |
| "loss": 0.0261, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 5.0495049504950495, | |
| "grad_norm": 0.0361727774143219, | |
| "learning_rate": 9.687597754918099e-05, | |
| "loss": 0.006, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 5.0594059405940595, | |
| "grad_norm": 0.03587155416607857, | |
| "learning_rate": 9.68518776715487e-05, | |
| "loss": 0.0067, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 5.069306930693069, | |
| "grad_norm": 0.032527755945920944, | |
| "learning_rate": 9.682768821412997e-05, | |
| "loss": 0.0067, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 5.079207920792079, | |
| "grad_norm": 0.03142087161540985, | |
| "learning_rate": 9.68034092231745e-05, | |
| "loss": 0.0079, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 5.089108910891089, | |
| "grad_norm": 0.03114234283566475, | |
| "learning_rate": 9.677904074510322e-05, | |
| "loss": 0.0199, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 5.099009900990099, | |
| "grad_norm": 0.026284724473953247, | |
| "learning_rate": 9.675458282650813e-05, | |
| "loss": 0.0121, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 5.108910891089109, | |
| "grad_norm": 0.0318143405020237, | |
| "learning_rate": 9.673003551415224e-05, | |
| "loss": 0.0063, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 5.118811881188119, | |
| "grad_norm": 0.035465918481349945, | |
| "learning_rate": 9.67053988549695e-05, | |
| "loss": 0.0207, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 5.128712871287129, | |
| "grad_norm": 0.0312662236392498, | |
| "learning_rate": 9.668067289606466e-05, | |
| "loss": 0.012, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 5.138613861386139, | |
| "grad_norm": 0.03460320085287094, | |
| "learning_rate": 9.665585768471324e-05, | |
| "loss": 0.0052, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 5.148514851485149, | |
| "grad_norm": 0.03635144233703613, | |
| "learning_rate": 9.663095326836138e-05, | |
| "loss": 0.0226, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 5.158415841584159, | |
| "grad_norm": 0.03495261073112488, | |
| "learning_rate": 9.660595969462578e-05, | |
| "loss": 0.0157, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 5.1683168316831685, | |
| "grad_norm": 0.029816031455993652, | |
| "learning_rate": 9.658087701129365e-05, | |
| "loss": 0.0052, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 5.178217821782178, | |
| "grad_norm": 0.027445781975984573, | |
| "learning_rate": 9.655570526632252e-05, | |
| "loss": 0.0129, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 5.188118811881188, | |
| "grad_norm": 0.03341228514909744, | |
| "learning_rate": 9.653044450784023e-05, | |
| "loss": 0.0177, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 5.198019801980198, | |
| "grad_norm": 0.036780472844839096, | |
| "learning_rate": 9.650509478414482e-05, | |
| "loss": 0.008, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 5.198019801980198, | |
| "eval_loss": 0.01887640729546547, | |
| "eval_runtime": 20.9336, | |
| "eval_samples_per_second": 4.777, | |
| "eval_steps_per_second": 0.143, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 5.198019801980198, | |
| "step": 525, | |
| "total_flos": 1.9740066929914675e+18, | |
| "train_loss": 0.027972995166977245, | |
| "train_runtime": 14286.9421, | |
| "train_samples_per_second": 8.427, | |
| "train_steps_per_second": 0.177 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 2525, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 25, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 5, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 4 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.9740066929914675e+18, | |
| "train_batch_size": 48, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |