| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.0, |
| "eval_steps": 500, |
| "global_step": 250, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.02, |
| "grad_norm": 2.7934532165527344, |
| "learning_rate": 0.0002, |
| "loss": 2.3404, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 1.5168005228042603, |
| "learning_rate": 0.0002, |
| "loss": 2.0804, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 1.047807216644287, |
| "learning_rate": 0.0002, |
| "loss": 1.9184, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 1.041599988937378, |
| "learning_rate": 0.0002, |
| "loss": 2.0393, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.8074644804000854, |
| "learning_rate": 0.0002, |
| "loss": 2.1779, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.7784727811813354, |
| "learning_rate": 0.0002, |
| "loss": 2.1583, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.8535248637199402, |
| "learning_rate": 0.0002, |
| "loss": 2.0766, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.8507911562919617, |
| "learning_rate": 0.0002, |
| "loss": 2.1175, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 0.8497746586799622, |
| "learning_rate": 0.0002, |
| "loss": 2.1156, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.9205197691917419, |
| "learning_rate": 0.0002, |
| "loss": 2.1782, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 1.2332911491394043, |
| "learning_rate": 0.0002, |
| "loss": 2.0536, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 1.3457396030426025, |
| "learning_rate": 0.0002, |
| "loss": 2.2186, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 0.6494730114936829, |
| "learning_rate": 0.0002, |
| "loss": 1.8011, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 0.6363134980201721, |
| "learning_rate": 0.0002, |
| "loss": 1.8819, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.7927612662315369, |
| "learning_rate": 0.0002, |
| "loss": 1.8845, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.7082176804542542, |
| "learning_rate": 0.0002, |
| "loss": 1.8748, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 0.861709713935852, |
| "learning_rate": 0.0002, |
| "loss": 1.8777, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 0.7901681661605835, |
| "learning_rate": 0.0002, |
| "loss": 1.8811, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 0.7719288468360901, |
| "learning_rate": 0.0002, |
| "loss": 1.8898, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 1.027469277381897, |
| "learning_rate": 0.0002, |
| "loss": 2.0089, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 0.9486727714538574, |
| "learning_rate": 0.0002, |
| "loss": 1.968, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 0.9629890322685242, |
| "learning_rate": 0.0002, |
| "loss": 2.0568, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 1.033793568611145, |
| "learning_rate": 0.0002, |
| "loss": 1.8401, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 1.3298218250274658, |
| "learning_rate": 0.0002, |
| "loss": 1.9165, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.6936089396476746, |
| "learning_rate": 0.0002, |
| "loss": 1.6614, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.6136096119880676, |
| "learning_rate": 0.0002, |
| "loss": 1.8167, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 0.6043046712875366, |
| "learning_rate": 0.0002, |
| "loss": 1.7217, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.6395452618598938, |
| "learning_rate": 0.0002, |
| "loss": 1.7353, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 0.6829009056091309, |
| "learning_rate": 0.0002, |
| "loss": 1.7708, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.8561712503433228, |
| "learning_rate": 0.0002, |
| "loss": 1.774, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.62, |
| "grad_norm": 0.7594190239906311, |
| "learning_rate": 0.0002, |
| "loss": 1.8788, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.867341160774231, |
| "learning_rate": 0.0002, |
| "loss": 1.8708, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.66, |
| "grad_norm": 0.9393973350524902, |
| "learning_rate": 0.0002, |
| "loss": 1.9839, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 1.0540133714675903, |
| "learning_rate": 0.0002, |
| "loss": 1.7637, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 1.2020256519317627, |
| "learning_rate": 0.0002, |
| "loss": 1.9187, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 1.7588919401168823, |
| "learning_rate": 0.0002, |
| "loss": 1.5851, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 0.9404975175857544, |
| "learning_rate": 0.0002, |
| "loss": 1.7903, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 0.7744253873825073, |
| "learning_rate": 0.0002, |
| "loss": 1.7476, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 0.7260447144508362, |
| "learning_rate": 0.0002, |
| "loss": 1.5547, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 0.9214150905609131, |
| "learning_rate": 0.0002, |
| "loss": 1.6342, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 0.834932267665863, |
| "learning_rate": 0.0002, |
| "loss": 1.6577, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 0.8663449883460999, |
| "learning_rate": 0.0002, |
| "loss": 1.7133, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 0.9534509181976318, |
| "learning_rate": 0.0002, |
| "loss": 1.7999, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 1.058899164199829, |
| "learning_rate": 0.0002, |
| "loss": 1.5964, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 1.1835004091262817, |
| "learning_rate": 0.0002, |
| "loss": 1.7596, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 1.2041128873825073, |
| "learning_rate": 0.0002, |
| "loss": 1.6529, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 1.5300588607788086, |
| "learning_rate": 0.0002, |
| "loss": 1.7852, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 1.6037429571151733, |
| "learning_rate": 0.0002, |
| "loss": 1.6873, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 0.6437931656837463, |
| "learning_rate": 0.0002, |
| "loss": 1.5729, |
| "step": 49 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 1.1996123790740967, |
| "learning_rate": 0.0002, |
| "loss": 1.5609, |
| "step": 50 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 1.6186010837554932, |
| "eval_runtime": 565.3333, |
| "eval_samples_per_second": 0.708, |
| "eval_steps_per_second": 0.177, |
| "step": 50 |
| }, |
| { |
| "epoch": 1.02, |
| "grad_norm": 0.6480159163475037, |
| "learning_rate": 0.0002, |
| "loss": 1.5754, |
| "step": 51 |
| }, |
| { |
| "epoch": 1.04, |
| "grad_norm": 0.8269457817077637, |
| "learning_rate": 0.0002, |
| "loss": 1.4743, |
| "step": 52 |
| }, |
| { |
| "epoch": 1.06, |
| "grad_norm": 1.1054280996322632, |
| "learning_rate": 0.0002, |
| "loss": 1.3447, |
| "step": 53 |
| }, |
| { |
| "epoch": 1.08, |
| "grad_norm": 0.9144314527511597, |
| "learning_rate": 0.0002, |
| "loss": 1.3652, |
| "step": 54 |
| }, |
| { |
| "epoch": 1.1, |
| "grad_norm": 0.8429620862007141, |
| "learning_rate": 0.0002, |
| "loss": 1.4696, |
| "step": 55 |
| }, |
| { |
| "epoch": 1.12, |
| "grad_norm": 1.3091776371002197, |
| "learning_rate": 0.0002, |
| "loss": 1.3109, |
| "step": 56 |
| }, |
| { |
| "epoch": 1.1400000000000001, |
| "grad_norm": 1.2086460590362549, |
| "learning_rate": 0.0002, |
| "loss": 1.3424, |
| "step": 57 |
| }, |
| { |
| "epoch": 1.16, |
| "grad_norm": 1.1823766231536865, |
| "learning_rate": 0.0002, |
| "loss": 1.213, |
| "step": 58 |
| }, |
| { |
| "epoch": 1.18, |
| "grad_norm": 1.817803144454956, |
| "learning_rate": 0.0002, |
| "loss": 1.2911, |
| "step": 59 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 1.2870073318481445, |
| "learning_rate": 0.0002, |
| "loss": 1.2712, |
| "step": 60 |
| }, |
| { |
| "epoch": 1.22, |
| "grad_norm": 1.2424544095993042, |
| "learning_rate": 0.0002, |
| "loss": 1.2292, |
| "step": 61 |
| }, |
| { |
| "epoch": 1.24, |
| "grad_norm": 1.4258471727371216, |
| "learning_rate": 0.0002, |
| "loss": 1.0814, |
| "step": 62 |
| }, |
| { |
| "epoch": 1.26, |
| "grad_norm": 1.1297271251678467, |
| "learning_rate": 0.0002, |
| "loss": 1.5303, |
| "step": 63 |
| }, |
| { |
| "epoch": 1.28, |
| "grad_norm": 0.8728504776954651, |
| "learning_rate": 0.0002, |
| "loss": 1.4354, |
| "step": 64 |
| }, |
| { |
| "epoch": 1.3, |
| "grad_norm": 0.7809789776802063, |
| "learning_rate": 0.0002, |
| "loss": 1.4398, |
| "step": 65 |
| }, |
| { |
| "epoch": 1.32, |
| "grad_norm": 0.844166100025177, |
| "learning_rate": 0.0002, |
| "loss": 1.2171, |
| "step": 66 |
| }, |
| { |
| "epoch": 1.34, |
| "grad_norm": 0.8636218905448914, |
| "learning_rate": 0.0002, |
| "loss": 1.23, |
| "step": 67 |
| }, |
| { |
| "epoch": 1.3599999999999999, |
| "grad_norm": 0.9831591248512268, |
| "learning_rate": 0.0002, |
| "loss": 1.2496, |
| "step": 68 |
| }, |
| { |
| "epoch": 1.38, |
| "grad_norm": 1.4268325567245483, |
| "learning_rate": 0.0002, |
| "loss": 1.2354, |
| "step": 69 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 1.6133723258972168, |
| "learning_rate": 0.0002, |
| "loss": 1.2123, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.42, |
| "grad_norm": 1.5462720394134521, |
| "learning_rate": 0.0002, |
| "loss": 1.0258, |
| "step": 71 |
| }, |
| { |
| "epoch": 1.44, |
| "grad_norm": 1.1962395906448364, |
| "learning_rate": 0.0002, |
| "loss": 1.0858, |
| "step": 72 |
| }, |
| { |
| "epoch": 1.46, |
| "grad_norm": 1.413921594619751, |
| "learning_rate": 0.0002, |
| "loss": 1.0169, |
| "step": 73 |
| }, |
| { |
| "epoch": 1.48, |
| "grad_norm": 1.442657470703125, |
| "learning_rate": 0.0002, |
| "loss": 0.9553, |
| "step": 74 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 0.9919085502624512, |
| "learning_rate": 0.0002, |
| "loss": 1.2394, |
| "step": 75 |
| }, |
| { |
| "epoch": 1.52, |
| "grad_norm": 0.988468587398529, |
| "learning_rate": 0.0002, |
| "loss": 1.3642, |
| "step": 76 |
| }, |
| { |
| "epoch": 1.54, |
| "grad_norm": 0.9793186187744141, |
| "learning_rate": 0.0002, |
| "loss": 1.2818, |
| "step": 77 |
| }, |
| { |
| "epoch": 1.56, |
| "grad_norm": 0.7799855470657349, |
| "learning_rate": 0.0002, |
| "loss": 1.1705, |
| "step": 78 |
| }, |
| { |
| "epoch": 1.58, |
| "grad_norm": 0.8288784027099609, |
| "learning_rate": 0.0002, |
| "loss": 1.0484, |
| "step": 79 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 1.064773440361023, |
| "learning_rate": 0.0002, |
| "loss": 1.1608, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.62, |
| "grad_norm": 1.0099600553512573, |
| "learning_rate": 0.0002, |
| "loss": 1.1873, |
| "step": 81 |
| }, |
| { |
| "epoch": 1.6400000000000001, |
| "grad_norm": 1.9040124416351318, |
| "learning_rate": 0.0002, |
| "loss": 0.9739, |
| "step": 82 |
| }, |
| { |
| "epoch": 1.6600000000000001, |
| "grad_norm": 1.2448644638061523, |
| "learning_rate": 0.0002, |
| "loss": 0.9418, |
| "step": 83 |
| }, |
| { |
| "epoch": 1.6800000000000002, |
| "grad_norm": 1.2129086256027222, |
| "learning_rate": 0.0002, |
| "loss": 0.8821, |
| "step": 84 |
| }, |
| { |
| "epoch": 1.7, |
| "grad_norm": 1.6727265119552612, |
| "learning_rate": 0.0002, |
| "loss": 0.9965, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.72, |
| "grad_norm": 1.6569440364837646, |
| "learning_rate": 0.0002, |
| "loss": 0.9182, |
| "step": 86 |
| }, |
| { |
| "epoch": 1.74, |
| "grad_norm": 0.8596146702766418, |
| "learning_rate": 0.0002, |
| "loss": 1.3188, |
| "step": 87 |
| }, |
| { |
| "epoch": 1.76, |
| "grad_norm": 0.8928490281105042, |
| "learning_rate": 0.0002, |
| "loss": 1.3601, |
| "step": 88 |
| }, |
| { |
| "epoch": 1.78, |
| "grad_norm": 0.7409713268280029, |
| "learning_rate": 0.0002, |
| "loss": 1.1212, |
| "step": 89 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 0.8979334831237793, |
| "learning_rate": 0.0002, |
| "loss": 1.4162, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.8199999999999998, |
| "grad_norm": 0.979978621006012, |
| "learning_rate": 0.0002, |
| "loss": 1.1969, |
| "step": 91 |
| }, |
| { |
| "epoch": 1.8399999999999999, |
| "grad_norm": 0.9733594059944153, |
| "learning_rate": 0.0002, |
| "loss": 1.0468, |
| "step": 92 |
| }, |
| { |
| "epoch": 1.8599999999999999, |
| "grad_norm": 0.9226842522621155, |
| "learning_rate": 0.0002, |
| "loss": 1.1807, |
| "step": 93 |
| }, |
| { |
| "epoch": 1.88, |
| "grad_norm": 1.1638745069503784, |
| "learning_rate": 0.0002, |
| "loss": 1.139, |
| "step": 94 |
| }, |
| { |
| "epoch": 1.9, |
| "grad_norm": 1.5604937076568604, |
| "learning_rate": 0.0002, |
| "loss": 1.1872, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.92, |
| "grad_norm": 1.3674428462982178, |
| "learning_rate": 0.0002, |
| "loss": 1.1865, |
| "step": 96 |
| }, |
| { |
| "epoch": 1.94, |
| "grad_norm": 1.8469598293304443, |
| "learning_rate": 0.0002, |
| "loss": 1.0469, |
| "step": 97 |
| }, |
| { |
| "epoch": 1.96, |
| "grad_norm": 1.3148952722549438, |
| "learning_rate": 0.0002, |
| "loss": 0.9915, |
| "step": 98 |
| }, |
| { |
| "epoch": 1.98, |
| "grad_norm": 1.599141001701355, |
| "learning_rate": 0.0002, |
| "loss": 1.2296, |
| "step": 99 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 1.3382114171981812, |
| "learning_rate": 0.0002, |
| "loss": 1.1813, |
| "step": 100 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 1.409305453300476, |
| "eval_runtime": 565.9517, |
| "eval_samples_per_second": 0.707, |
| "eval_steps_per_second": 0.177, |
| "step": 100 |
| }, |
| { |
| "epoch": 2.02, |
| "grad_norm": 1.0162380933761597, |
| "learning_rate": 0.0002, |
| "loss": 1.1481, |
| "step": 101 |
| }, |
| { |
| "epoch": 2.04, |
| "grad_norm": 0.7402092814445496, |
| "learning_rate": 0.0002, |
| "loss": 1.0086, |
| "step": 102 |
| }, |
| { |
| "epoch": 2.06, |
| "grad_norm": 0.8824872970581055, |
| "learning_rate": 0.0002, |
| "loss": 1.0588, |
| "step": 103 |
| }, |
| { |
| "epoch": 2.08, |
| "grad_norm": 0.7582442760467529, |
| "learning_rate": 0.0002, |
| "loss": 0.8181, |
| "step": 104 |
| }, |
| { |
| "epoch": 2.1, |
| "grad_norm": 1.0200812816619873, |
| "learning_rate": 0.0002, |
| "loss": 0.9041, |
| "step": 105 |
| }, |
| { |
| "epoch": 2.12, |
| "grad_norm": 1.08174467086792, |
| "learning_rate": 0.0002, |
| "loss": 0.8479, |
| "step": 106 |
| }, |
| { |
| "epoch": 2.14, |
| "grad_norm": 1.01225745677948, |
| "learning_rate": 0.0002, |
| "loss": 0.7659, |
| "step": 107 |
| }, |
| { |
| "epoch": 2.16, |
| "grad_norm": 1.2194840908050537, |
| "learning_rate": 0.0002, |
| "loss": 0.7926, |
| "step": 108 |
| }, |
| { |
| "epoch": 2.18, |
| "grad_norm": 1.0519524812698364, |
| "learning_rate": 0.0002, |
| "loss": 0.6604, |
| "step": 109 |
| }, |
| { |
| "epoch": 2.2, |
| "grad_norm": 1.2860150337219238, |
| "learning_rate": 0.0002, |
| "loss": 0.663, |
| "step": 110 |
| }, |
| { |
| "epoch": 2.22, |
| "grad_norm": 1.5521994829177856, |
| "learning_rate": 0.0002, |
| "loss": 0.7791, |
| "step": 111 |
| }, |
| { |
| "epoch": 2.24, |
| "grad_norm": 1.455283284187317, |
| "learning_rate": 0.0002, |
| "loss": 0.5112, |
| "step": 112 |
| }, |
| { |
| "epoch": 2.26, |
| "grad_norm": 1.7097278833389282, |
| "learning_rate": 0.0002, |
| "loss": 1.2219, |
| "step": 113 |
| }, |
| { |
| "epoch": 2.2800000000000002, |
| "grad_norm": 1.5385531187057495, |
| "learning_rate": 0.0002, |
| "loss": 1.1261, |
| "step": 114 |
| }, |
| { |
| "epoch": 2.3, |
| "grad_norm": 1.0525436401367188, |
| "learning_rate": 0.0002, |
| "loss": 0.86, |
| "step": 115 |
| }, |
| { |
| "epoch": 2.32, |
| "grad_norm": 1.0388120412826538, |
| "learning_rate": 0.0002, |
| "loss": 0.9022, |
| "step": 116 |
| }, |
| { |
| "epoch": 2.34, |
| "grad_norm": 1.060497760772705, |
| "learning_rate": 0.0002, |
| "loss": 0.9265, |
| "step": 117 |
| }, |
| { |
| "epoch": 2.36, |
| "grad_norm": 1.0629950761795044, |
| "learning_rate": 0.0002, |
| "loss": 0.7222, |
| "step": 118 |
| }, |
| { |
| "epoch": 2.38, |
| "grad_norm": 1.2574018239974976, |
| "learning_rate": 0.0002, |
| "loss": 0.7952, |
| "step": 119 |
| }, |
| { |
| "epoch": 2.4, |
| "grad_norm": 1.0951610803604126, |
| "learning_rate": 0.0002, |
| "loss": 0.6647, |
| "step": 120 |
| }, |
| { |
| "epoch": 2.42, |
| "grad_norm": 1.46285879611969, |
| "learning_rate": 0.0002, |
| "loss": 0.7845, |
| "step": 121 |
| }, |
| { |
| "epoch": 2.44, |
| "grad_norm": 1.3611388206481934, |
| "learning_rate": 0.0002, |
| "loss": 0.7084, |
| "step": 122 |
| }, |
| { |
| "epoch": 2.46, |
| "grad_norm": 1.6670907735824585, |
| "learning_rate": 0.0002, |
| "loss": 0.6594, |
| "step": 123 |
| }, |
| { |
| "epoch": 2.48, |
| "grad_norm": 2.1525955200195312, |
| "learning_rate": 0.0002, |
| "loss": 0.6401, |
| "step": 124 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 2.5126793384552, |
| "learning_rate": 0.0002, |
| "loss": 0.926, |
| "step": 125 |
| }, |
| { |
| "epoch": 2.52, |
| "grad_norm": 1.800521969795227, |
| "learning_rate": 0.0002, |
| "loss": 1.0936, |
| "step": 126 |
| }, |
| { |
| "epoch": 2.54, |
| "grad_norm": 1.0617576837539673, |
| "learning_rate": 0.0002, |
| "loss": 0.8052, |
| "step": 127 |
| }, |
| { |
| "epoch": 2.56, |
| "grad_norm": 1.0823312997817993, |
| "learning_rate": 0.0002, |
| "loss": 0.9443, |
| "step": 128 |
| }, |
| { |
| "epoch": 2.58, |
| "grad_norm": 1.2193264961242676, |
| "learning_rate": 0.0002, |
| "loss": 0.7955, |
| "step": 129 |
| }, |
| { |
| "epoch": 2.6, |
| "grad_norm": 1.0502954721450806, |
| "learning_rate": 0.0002, |
| "loss": 0.8365, |
| "step": 130 |
| }, |
| { |
| "epoch": 2.62, |
| "grad_norm": 1.1898560523986816, |
| "learning_rate": 0.0002, |
| "loss": 0.9706, |
| "step": 131 |
| }, |
| { |
| "epoch": 2.64, |
| "grad_norm": 1.1076680421829224, |
| "learning_rate": 0.0002, |
| "loss": 0.7529, |
| "step": 132 |
| }, |
| { |
| "epoch": 2.66, |
| "grad_norm": 1.3826709985733032, |
| "learning_rate": 0.0002, |
| "loss": 0.7474, |
| "step": 133 |
| }, |
| { |
| "epoch": 2.68, |
| "grad_norm": 1.2504832744598389, |
| "learning_rate": 0.0002, |
| "loss": 0.7086, |
| "step": 134 |
| }, |
| { |
| "epoch": 2.7, |
| "grad_norm": 1.6292765140533447, |
| "learning_rate": 0.0002, |
| "loss": 0.6305, |
| "step": 135 |
| }, |
| { |
| "epoch": 2.7199999999999998, |
| "grad_norm": 1.9603074789047241, |
| "learning_rate": 0.0002, |
| "loss": 0.6834, |
| "step": 136 |
| }, |
| { |
| "epoch": 2.74, |
| "grad_norm": 2.202030897140503, |
| "learning_rate": 0.0002, |
| "loss": 1.1712, |
| "step": 137 |
| }, |
| { |
| "epoch": 2.76, |
| "grad_norm": 1.6344685554504395, |
| "learning_rate": 0.0002, |
| "loss": 1.0772, |
| "step": 138 |
| }, |
| { |
| "epoch": 2.7800000000000002, |
| "grad_norm": 1.3579537868499756, |
| "learning_rate": 0.0002, |
| "loss": 0.8803, |
| "step": 139 |
| }, |
| { |
| "epoch": 2.8, |
| "grad_norm": 1.0554553270339966, |
| "learning_rate": 0.0002, |
| "loss": 0.9222, |
| "step": 140 |
| }, |
| { |
| "epoch": 2.82, |
| "grad_norm": 0.9431642889976501, |
| "learning_rate": 0.0002, |
| "loss": 0.8031, |
| "step": 141 |
| }, |
| { |
| "epoch": 2.84, |
| "grad_norm": 1.0826098918914795, |
| "learning_rate": 0.0002, |
| "loss": 0.8259, |
| "step": 142 |
| }, |
| { |
| "epoch": 2.86, |
| "grad_norm": 1.24959135055542, |
| "learning_rate": 0.0002, |
| "loss": 0.7957, |
| "step": 143 |
| }, |
| { |
| "epoch": 2.88, |
| "grad_norm": 1.1057368516921997, |
| "learning_rate": 0.0002, |
| "loss": 0.7079, |
| "step": 144 |
| }, |
| { |
| "epoch": 2.9, |
| "grad_norm": 1.144061803817749, |
| "learning_rate": 0.0002, |
| "loss": 0.7165, |
| "step": 145 |
| }, |
| { |
| "epoch": 2.92, |
| "grad_norm": 1.0690631866455078, |
| "learning_rate": 0.0002, |
| "loss": 0.601, |
| "step": 146 |
| }, |
| { |
| "epoch": 2.94, |
| "grad_norm": 1.292758584022522, |
| "learning_rate": 0.0002, |
| "loss": 0.7191, |
| "step": 147 |
| }, |
| { |
| "epoch": 2.96, |
| "grad_norm": 1.729408860206604, |
| "learning_rate": 0.0002, |
| "loss": 0.5851, |
| "step": 148 |
| }, |
| { |
| "epoch": 2.98, |
| "grad_norm": 2.078197717666626, |
| "learning_rate": 0.0002, |
| "loss": 0.8942, |
| "step": 149 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 2.0007128715515137, |
| "learning_rate": 0.0002, |
| "loss": 0.7139, |
| "step": 150 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 1.4874461889266968, |
| "eval_runtime": 566.1506, |
| "eval_samples_per_second": 0.707, |
| "eval_steps_per_second": 0.177, |
| "step": 150 |
| }, |
| { |
| "epoch": 3.02, |
| "grad_norm": 1.024670958518982, |
| "learning_rate": 0.0002, |
| "loss": 0.8969, |
| "step": 151 |
| }, |
| { |
| "epoch": 3.04, |
| "grad_norm": 0.9738882184028625, |
| "learning_rate": 0.0002, |
| "loss": 0.9056, |
| "step": 152 |
| }, |
| { |
| "epoch": 3.06, |
| "grad_norm": 0.9969688653945923, |
| "learning_rate": 0.0002, |
| "loss": 0.6676, |
| "step": 153 |
| }, |
| { |
| "epoch": 3.08, |
| "grad_norm": 1.12136971950531, |
| "learning_rate": 0.0002, |
| "loss": 0.5916, |
| "step": 154 |
| }, |
| { |
| "epoch": 3.1, |
| "grad_norm": 1.3517699241638184, |
| "learning_rate": 0.0002, |
| "loss": 0.5908, |
| "step": 155 |
| }, |
| { |
| "epoch": 3.12, |
| "grad_norm": 1.5965360403060913, |
| "learning_rate": 0.0002, |
| "loss": 0.6148, |
| "step": 156 |
| }, |
| { |
| "epoch": 3.14, |
| "grad_norm": 1.3009252548217773, |
| "learning_rate": 0.0002, |
| "loss": 0.4902, |
| "step": 157 |
| }, |
| { |
| "epoch": 3.16, |
| "grad_norm": 1.2742400169372559, |
| "learning_rate": 0.0002, |
| "loss": 0.4515, |
| "step": 158 |
| }, |
| { |
| "epoch": 3.18, |
| "grad_norm": 1.2994771003723145, |
| "learning_rate": 0.0002, |
| "loss": 0.416, |
| "step": 159 |
| }, |
| { |
| "epoch": 3.2, |
| "grad_norm": 1.3306324481964111, |
| "learning_rate": 0.0002, |
| "loss": 0.4144, |
| "step": 160 |
| }, |
| { |
| "epoch": 3.22, |
| "grad_norm": 1.5406475067138672, |
| "learning_rate": 0.0002, |
| "loss": 0.4256, |
| "step": 161 |
| }, |
| { |
| "epoch": 3.24, |
| "grad_norm": 1.584506630897522, |
| "learning_rate": 0.0002, |
| "loss": 0.4511, |
| "step": 162 |
| }, |
| { |
| "epoch": 3.26, |
| "grad_norm": 1.6618622541427612, |
| "learning_rate": 0.0002, |
| "loss": 0.8865, |
| "step": 163 |
| }, |
| { |
| "epoch": 3.2800000000000002, |
| "grad_norm": 1.6019847393035889, |
| "learning_rate": 0.0002, |
| "loss": 0.7339, |
| "step": 164 |
| }, |
| { |
| "epoch": 3.3, |
| "grad_norm": 1.1740251779556274, |
| "learning_rate": 0.0002, |
| "loss": 0.6945, |
| "step": 165 |
| }, |
| { |
| "epoch": 3.32, |
| "grad_norm": 1.1268410682678223, |
| "learning_rate": 0.0002, |
| "loss": 0.579, |
| "step": 166 |
| }, |
| { |
| "epoch": 3.34, |
| "grad_norm": 1.3038002252578735, |
| "learning_rate": 0.0002, |
| "loss": 0.5217, |
| "step": 167 |
| }, |
| { |
| "epoch": 3.36, |
| "grad_norm": 1.112185001373291, |
| "learning_rate": 0.0002, |
| "loss": 0.4766, |
| "step": 168 |
| }, |
| { |
| "epoch": 3.38, |
| "grad_norm": 1.3828542232513428, |
| "learning_rate": 0.0002, |
| "loss": 0.4781, |
| "step": 169 |
| }, |
| { |
| "epoch": 3.4, |
| "grad_norm": 1.1456600427627563, |
| "learning_rate": 0.0002, |
| "loss": 0.4056, |
| "step": 170 |
| }, |
| { |
| "epoch": 3.42, |
| "grad_norm": 1.2479093074798584, |
| "learning_rate": 0.0002, |
| "loss": 0.4447, |
| "step": 171 |
| }, |
| { |
| "epoch": 3.44, |
| "grad_norm": 1.4044010639190674, |
| "learning_rate": 0.0002, |
| "loss": 0.3814, |
| "step": 172 |
| }, |
| { |
| "epoch": 3.46, |
| "grad_norm": 1.565138339996338, |
| "learning_rate": 0.0002, |
| "loss": 0.3982, |
| "step": 173 |
| }, |
| { |
| "epoch": 3.48, |
| "grad_norm": 1.4442418813705444, |
| "learning_rate": 0.0002, |
| "loss": 0.4262, |
| "step": 174 |
| }, |
| { |
| "epoch": 3.5, |
| "grad_norm": 1.1203701496124268, |
| "learning_rate": 0.0002, |
| "loss": 0.8025, |
| "step": 175 |
| }, |
| { |
| "epoch": 3.52, |
| "grad_norm": 1.3620504140853882, |
| "learning_rate": 0.0002, |
| "loss": 0.9045, |
| "step": 176 |
| }, |
| { |
| "epoch": 3.54, |
| "grad_norm": 1.5145343542099, |
| "learning_rate": 0.0002, |
| "loss": 0.703, |
| "step": 177 |
| }, |
| { |
| "epoch": 3.56, |
| "grad_norm": 1.333682656288147, |
| "learning_rate": 0.0002, |
| "loss": 0.6285, |
| "step": 178 |
| }, |
| { |
| "epoch": 3.58, |
| "grad_norm": 1.4228661060333252, |
| "learning_rate": 0.0002, |
| "loss": 0.6004, |
| "step": 179 |
| }, |
| { |
| "epoch": 3.6, |
| "grad_norm": 1.2111386060714722, |
| "learning_rate": 0.0002, |
| "loss": 0.4754, |
| "step": 180 |
| }, |
| { |
| "epoch": 3.62, |
| "grad_norm": 1.410719394683838, |
| "learning_rate": 0.0002, |
| "loss": 0.5324, |
| "step": 181 |
| }, |
| { |
| "epoch": 3.64, |
| "grad_norm": 1.4157259464263916, |
| "learning_rate": 0.0002, |
| "loss": 0.4556, |
| "step": 182 |
| }, |
| { |
| "epoch": 3.66, |
| "grad_norm": 1.3982216119766235, |
| "learning_rate": 0.0002, |
| "loss": 0.4465, |
| "step": 183 |
| }, |
| { |
| "epoch": 3.68, |
| "grad_norm": 1.4364334344863892, |
| "learning_rate": 0.0002, |
| "loss": 0.4313, |
| "step": 184 |
| }, |
| { |
| "epoch": 3.7, |
| "grad_norm": 1.5408861637115479, |
| "learning_rate": 0.0002, |
| "loss": 0.4108, |
| "step": 185 |
| }, |
| { |
| "epoch": 3.7199999999999998, |
| "grad_norm": 1.5500551462173462, |
| "learning_rate": 0.0002, |
| "loss": 0.4664, |
| "step": 186 |
| }, |
| { |
| "epoch": 3.74, |
| "grad_norm": 1.1150060892105103, |
| "learning_rate": 0.0002, |
| "loss": 0.8951, |
| "step": 187 |
| }, |
| { |
| "epoch": 3.76, |
| "grad_norm": 1.0168464183807373, |
| "learning_rate": 0.0002, |
| "loss": 0.8619, |
| "step": 188 |
| }, |
| { |
| "epoch": 3.7800000000000002, |
| "grad_norm": 1.2093026638031006, |
| "learning_rate": 0.0002, |
| "loss": 0.5782, |
| "step": 189 |
| }, |
| { |
| "epoch": 3.8, |
| "grad_norm": 1.3905984163284302, |
| "learning_rate": 0.0002, |
| "loss": 0.6929, |
| "step": 190 |
| }, |
| { |
| "epoch": 3.82, |
| "grad_norm": 1.3665902614593506, |
| "learning_rate": 0.0002, |
| "loss": 0.5701, |
| "step": 191 |
| }, |
| { |
| "epoch": 3.84, |
| "grad_norm": 1.1478445529937744, |
| "learning_rate": 0.0002, |
| "loss": 0.4515, |
| "step": 192 |
| }, |
| { |
| "epoch": 3.86, |
| "grad_norm": 1.2758458852767944, |
| "learning_rate": 0.0002, |
| "loss": 0.5819, |
| "step": 193 |
| }, |
| { |
| "epoch": 3.88, |
| "grad_norm": 1.0731329917907715, |
| "learning_rate": 0.0002, |
| "loss": 0.4448, |
| "step": 194 |
| }, |
| { |
| "epoch": 3.9, |
| "grad_norm": 1.20659339427948, |
| "learning_rate": 0.0002, |
| "loss": 0.4295, |
| "step": 195 |
| }, |
| { |
| "epoch": 3.92, |
| "grad_norm": 1.3976835012435913, |
| "learning_rate": 0.0002, |
| "loss": 0.4566, |
| "step": 196 |
| }, |
| { |
| "epoch": 3.94, |
| "grad_norm": 1.617711067199707, |
| "learning_rate": 0.0002, |
| "loss": 0.4303, |
| "step": 197 |
| }, |
| { |
| "epoch": 3.96, |
| "grad_norm": 1.707471489906311, |
| "learning_rate": 0.0002, |
| "loss": 0.4539, |
| "step": 198 |
| }, |
| { |
| "epoch": 3.98, |
| "grad_norm": 1.2962028980255127, |
| "learning_rate": 0.0002, |
| "loss": 0.5971, |
| "step": 199 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 1.8809109926223755, |
| "learning_rate": 0.0002, |
| "loss": 0.4973, |
| "step": 200 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 1.5414550304412842, |
| "eval_runtime": 565.7272, |
| "eval_samples_per_second": 0.707, |
| "eval_steps_per_second": 0.177, |
| "step": 200 |
| }, |
| { |
| "epoch": 4.02, |
| "grad_norm": 0.9540490508079529, |
| "learning_rate": 0.0002, |
| "loss": 0.56, |
| "step": 201 |
| }, |
| { |
| "epoch": 4.04, |
| "grad_norm": 1.0443426370620728, |
| "learning_rate": 0.0002, |
| "loss": 0.6562, |
| "step": 202 |
| }, |
| { |
| "epoch": 4.06, |
| "grad_norm": 1.020203948020935, |
| "learning_rate": 0.0002, |
| "loss": 0.4533, |
| "step": 203 |
| }, |
| { |
| "epoch": 4.08, |
| "grad_norm": 1.5309128761291504, |
| "learning_rate": 0.0002, |
| "loss": 0.3575, |
| "step": 204 |
| }, |
| { |
| "epoch": 4.1, |
| "grad_norm": 1.7135676145553589, |
| "learning_rate": 0.0002, |
| "loss": 0.3286, |
| "step": 205 |
| }, |
| { |
| "epoch": 4.12, |
| "grad_norm": 1.602728247642517, |
| "learning_rate": 0.0002, |
| "loss": 0.2556, |
| "step": 206 |
| }, |
| { |
| "epoch": 4.14, |
| "grad_norm": 1.8623350858688354, |
| "learning_rate": 0.0002, |
| "loss": 0.314, |
| "step": 207 |
| }, |
| { |
| "epoch": 4.16, |
| "grad_norm": 1.5630223751068115, |
| "learning_rate": 0.0002, |
| "loss": 0.2716, |
| "step": 208 |
| }, |
| { |
| "epoch": 4.18, |
| "grad_norm": 1.3671077489852905, |
| "learning_rate": 0.0002, |
| "loss": 0.2506, |
| "step": 209 |
| }, |
| { |
| "epoch": 4.2, |
| "grad_norm": 1.0884723663330078, |
| "learning_rate": 0.0002, |
| "loss": 0.2473, |
| "step": 210 |
| }, |
| { |
| "epoch": 4.22, |
| "grad_norm": 1.193832516670227, |
| "learning_rate": 0.0002, |
| "loss": 0.2836, |
| "step": 211 |
| }, |
| { |
| "epoch": 4.24, |
| "grad_norm": 1.0041422843933105, |
| "learning_rate": 0.0002, |
| "loss": 0.391, |
| "step": 212 |
| }, |
| { |
| "epoch": 4.26, |
| "grad_norm": 1.013597846031189, |
| "learning_rate": 0.0002, |
| "loss": 0.628, |
| "step": 213 |
| }, |
| { |
| "epoch": 4.28, |
| "grad_norm": 0.9650751948356628, |
| "learning_rate": 0.0002, |
| "loss": 0.5202, |
| "step": 214 |
| }, |
| { |
| "epoch": 4.3, |
| "grad_norm": 1.0781069993972778, |
| "learning_rate": 0.0002, |
| "loss": 0.4967, |
| "step": 215 |
| }, |
| { |
| "epoch": 4.32, |
| "grad_norm": 1.1297317743301392, |
| "learning_rate": 0.0002, |
| "loss": 0.4154, |
| "step": 216 |
| }, |
| { |
| "epoch": 4.34, |
| "grad_norm": 1.2913479804992676, |
| "learning_rate": 0.0002, |
| "loss": 0.3014, |
| "step": 217 |
| }, |
| { |
| "epoch": 4.36, |
| "grad_norm": 1.4399878978729248, |
| "learning_rate": 0.0002, |
| "loss": 0.3344, |
| "step": 218 |
| }, |
| { |
| "epoch": 4.38, |
| "grad_norm": 1.4960243701934814, |
| "learning_rate": 0.0002, |
| "loss": 0.2894, |
| "step": 219 |
| }, |
| { |
| "epoch": 4.4, |
| "grad_norm": 1.925826072692871, |
| "learning_rate": 0.0002, |
| "loss": 0.281, |
| "step": 220 |
| }, |
| { |
| "epoch": 4.42, |
| "grad_norm": 1.6930102109909058, |
| "learning_rate": 0.0002, |
| "loss": 0.2512, |
| "step": 221 |
| }, |
| { |
| "epoch": 4.44, |
| "grad_norm": 1.6776522397994995, |
| "learning_rate": 0.0002, |
| "loss": 0.2744, |
| "step": 222 |
| }, |
| { |
| "epoch": 4.46, |
| "grad_norm": 1.3323974609375, |
| "learning_rate": 0.0002, |
| "loss": 0.2951, |
| "step": 223 |
| }, |
| { |
| "epoch": 4.48, |
| "grad_norm": 1.2120009660720825, |
| "learning_rate": 0.0002, |
| "loss": 0.3707, |
| "step": 224 |
| }, |
| { |
| "epoch": 4.5, |
| "grad_norm": 1.2817238569259644, |
| "learning_rate": 0.0002, |
| "loss": 0.6035, |
| "step": 225 |
| }, |
| { |
| "epoch": 4.52, |
| "grad_norm": 1.1797271966934204, |
| "learning_rate": 0.0002, |
| "loss": 0.5878, |
| "step": 226 |
| }, |
| { |
| "epoch": 4.54, |
| "grad_norm": 0.9533390402793884, |
| "learning_rate": 0.0002, |
| "loss": 0.3458, |
| "step": 227 |
| }, |
| { |
| "epoch": 4.5600000000000005, |
| "grad_norm": 1.0915648937225342, |
| "learning_rate": 0.0002, |
| "loss": 0.3617, |
| "step": 228 |
| }, |
| { |
| "epoch": 4.58, |
| "grad_norm": 1.3463889360427856, |
| "learning_rate": 0.0002, |
| "loss": 0.3853, |
| "step": 229 |
| }, |
| { |
| "epoch": 4.6, |
| "grad_norm": 1.457556128501892, |
| "learning_rate": 0.0002, |
| "loss": 0.3343, |
| "step": 230 |
| }, |
| { |
| "epoch": 4.62, |
| "grad_norm": 1.681526780128479, |
| "learning_rate": 0.0002, |
| "loss": 0.3132, |
| "step": 231 |
| }, |
| { |
| "epoch": 4.64, |
| "grad_norm": 1.7101032733917236, |
| "learning_rate": 0.0002, |
| "loss": 0.2913, |
| "step": 232 |
| }, |
| { |
| "epoch": 4.66, |
| "grad_norm": 2.1125667095184326, |
| "learning_rate": 0.0002, |
| "loss": 0.304, |
| "step": 233 |
| }, |
| { |
| "epoch": 4.68, |
| "grad_norm": 1.5824134349822998, |
| "learning_rate": 0.0002, |
| "loss": 0.2878, |
| "step": 234 |
| }, |
| { |
| "epoch": 4.7, |
| "grad_norm": 1.5257948637008667, |
| "learning_rate": 0.0002, |
| "loss": 0.3049, |
| "step": 235 |
| }, |
| { |
| "epoch": 4.72, |
| "grad_norm": 1.1413626670837402, |
| "learning_rate": 0.0002, |
| "loss": 0.3875, |
| "step": 236 |
| }, |
| { |
| "epoch": 4.74, |
| "grad_norm": 1.4785950183868408, |
| "learning_rate": 0.0002, |
| "loss": 0.7915, |
| "step": 237 |
| }, |
| { |
| "epoch": 4.76, |
| "grad_norm": 1.0445829629898071, |
| "learning_rate": 0.0002, |
| "loss": 0.4765, |
| "step": 238 |
| }, |
| { |
| "epoch": 4.78, |
| "grad_norm": 1.0932363271713257, |
| "learning_rate": 0.0002, |
| "loss": 0.4081, |
| "step": 239 |
| }, |
| { |
| "epoch": 4.8, |
| "grad_norm": 1.313068151473999, |
| "learning_rate": 0.0002, |
| "loss": 0.4278, |
| "step": 240 |
| }, |
| { |
| "epoch": 4.82, |
| "grad_norm": 1.2771199941635132, |
| "learning_rate": 0.0002, |
| "loss": 0.3481, |
| "step": 241 |
| }, |
| { |
| "epoch": 4.84, |
| "grad_norm": 1.3306118249893188, |
| "learning_rate": 0.0002, |
| "loss": 0.3301, |
| "step": 242 |
| }, |
| { |
| "epoch": 4.86, |
| "grad_norm": 1.2204334735870361, |
| "learning_rate": 0.0002, |
| "loss": 0.2974, |
| "step": 243 |
| }, |
| { |
| "epoch": 4.88, |
| "grad_norm": 1.1585593223571777, |
| "learning_rate": 0.0002, |
| "loss": 0.2404, |
| "step": 244 |
| }, |
| { |
| "epoch": 4.9, |
| "grad_norm": 1.6888794898986816, |
| "learning_rate": 0.0002, |
| "loss": 0.2817, |
| "step": 245 |
| }, |
| { |
| "epoch": 4.92, |
| "grad_norm": 1.4956034421920776, |
| "learning_rate": 0.0002, |
| "loss": 0.2583, |
| "step": 246 |
| }, |
| { |
| "epoch": 4.9399999999999995, |
| "grad_norm": 1.6130638122558594, |
| "learning_rate": 0.0002, |
| "loss": 0.2888, |
| "step": 247 |
| }, |
| { |
| "epoch": 4.96, |
| "grad_norm": 2.280722141265869, |
| "learning_rate": 0.0002, |
| "loss": 0.3732, |
| "step": 248 |
| }, |
| { |
| "epoch": 4.98, |
| "grad_norm": 1.8177040815353394, |
| "learning_rate": 0.0002, |
| "loss": 0.6086, |
| "step": 249 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 1.674232840538025, |
| "learning_rate": 0.0002, |
| "loss": 0.3232, |
| "step": 250 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 1.6614739894866943, |
| "eval_runtime": 565.5774, |
| "eval_samples_per_second": 0.707, |
| "eval_steps_per_second": 0.177, |
| "step": 250 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 300, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 6, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 9.433614153351168e+16, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|