| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.2814258911819887, | |
| "eval_steps": 800, | |
| "global_step": 30400, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.004803001876172608, | |
| "grad_norm": 3.330345630645752, | |
| "learning_rate": 0.0001113525, | |
| "loss": 4.3152, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.009606003752345216, | |
| "grad_norm": 2.5987207889556885, | |
| "learning_rate": 0.00022447249999999998, | |
| "loss": 3.5908, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.014409005628517824, | |
| "grad_norm": 10.176867485046387, | |
| "learning_rate": 0.00033759249999999996, | |
| "loss": 3.3927, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.01921200750469043, | |
| "grad_norm": 6.534875869750977, | |
| "learning_rate": 0.00045071249999999993, | |
| "loss": 3.3333, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.02401500938086304, | |
| "grad_norm": 6.088456630706787, | |
| "learning_rate": 0.0005638325, | |
| "loss": 3.2928, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.028818011257035647, | |
| "grad_norm": 6.937580108642578, | |
| "learning_rate": 0.0006769524999999999, | |
| "loss": 3.2901, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.033621013133208255, | |
| "grad_norm": 6.744969844818115, | |
| "learning_rate": 0.0007900724999999999, | |
| "loss": 3.2589, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.03842401500938086, | |
| "grad_norm": 2.2261719703674316, | |
| "learning_rate": 0.0009031925, | |
| "loss": 3.231, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.04322701688555347, | |
| "grad_norm": 1.030404806137085, | |
| "learning_rate": 0.0010163124999999999, | |
| "loss": 3.2278, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 0.04803001876172608, | |
| "grad_norm": 1.036293387413025, | |
| "learning_rate": 0.0011294324999999998, | |
| "loss": 3.272, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.05283302063789869, | |
| "grad_norm": 1.1835274696350098, | |
| "learning_rate": 0.0012425525, | |
| "loss": 3.256, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 0.057636022514071295, | |
| "grad_norm": 0.8378634452819824, | |
| "learning_rate": 0.0013556724999999998, | |
| "loss": 3.27, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 0.0624390243902439, | |
| "grad_norm": 0.7602612972259521, | |
| "learning_rate": 0.0014687925, | |
| "loss": 3.2261, | |
| "step": 832 | |
| }, | |
| { | |
| "epoch": 0.06724202626641651, | |
| "grad_norm": 0.6387987732887268, | |
| "learning_rate": 0.0015819124999999997, | |
| "loss": 3.2153, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 0.07204502814258912, | |
| "grad_norm": 0.4422095715999603, | |
| "learning_rate": 0.0016950325, | |
| "loss": 3.1975, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.07684803001876173, | |
| "grad_norm": 0.39002183079719543, | |
| "learning_rate": 0.0018081524999999999, | |
| "loss": 3.1983, | |
| "step": 1024 | |
| }, | |
| { | |
| "epoch": 0.08165103189493433, | |
| "grad_norm": 5.926162242889404, | |
| "learning_rate": 0.0019212724999999996, | |
| "loss": 3.1763, | |
| "step": 1088 | |
| }, | |
| { | |
| "epoch": 0.08645403377110694, | |
| "grad_norm": 0.4173193871974945, | |
| "learning_rate": 0.0020343924999999996, | |
| "loss": 3.1833, | |
| "step": 1152 | |
| }, | |
| { | |
| "epoch": 0.09125703564727955, | |
| "grad_norm": 0.4136042594909668, | |
| "learning_rate": 0.0021475125, | |
| "loss": 3.1846, | |
| "step": 1216 | |
| }, | |
| { | |
| "epoch": 0.09606003752345216, | |
| "grad_norm": 0.39301183819770813, | |
| "learning_rate": 0.0022606324999999996, | |
| "loss": 3.1739, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.10086303939962477, | |
| "grad_norm": 0.4910842776298523, | |
| "learning_rate": 0.0023737525, | |
| "loss": 3.1614, | |
| "step": 1344 | |
| }, | |
| { | |
| "epoch": 0.10566604127579737, | |
| "grad_norm": 0.4039038121700287, | |
| "learning_rate": 0.0024868725, | |
| "loss": 3.1577, | |
| "step": 1408 | |
| }, | |
| { | |
| "epoch": 0.11046904315196998, | |
| "grad_norm": 0.3286585211753845, | |
| "learning_rate": 0.0025999925, | |
| "loss": 3.1458, | |
| "step": 1472 | |
| }, | |
| { | |
| "epoch": 0.11527204502814259, | |
| "grad_norm": 0.44095373153686523, | |
| "learning_rate": 0.0027131125, | |
| "loss": 3.155, | |
| "step": 1536 | |
| }, | |
| { | |
| "epoch": 0.1200750469043152, | |
| "grad_norm": 0.40613290667533875, | |
| "learning_rate": 0.0028262325, | |
| "loss": 3.1469, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.1248780487804878, | |
| "grad_norm": 0.4613141417503357, | |
| "learning_rate": 0.002828, | |
| "loss": 3.1392, | |
| "step": 1664 | |
| }, | |
| { | |
| "epoch": 0.1296810506566604, | |
| "grad_norm": 0.3758493661880493, | |
| "learning_rate": 0.002828, | |
| "loss": 3.1298, | |
| "step": 1728 | |
| }, | |
| { | |
| "epoch": 0.13448405253283302, | |
| "grad_norm": 0.32609787583351135, | |
| "learning_rate": 0.002828, | |
| "loss": 3.123, | |
| "step": 1792 | |
| }, | |
| { | |
| "epoch": 0.13928705440900563, | |
| "grad_norm": 0.4221761226654053, | |
| "learning_rate": 0.002828, | |
| "loss": 3.1076, | |
| "step": 1856 | |
| }, | |
| { | |
| "epoch": 0.14409005628517824, | |
| "grad_norm": 0.4372267425060272, | |
| "learning_rate": 0.002828, | |
| "loss": 3.098, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.14889305816135084, | |
| "grad_norm": 0.36804404854774475, | |
| "learning_rate": 0.002828, | |
| "loss": 3.0952, | |
| "step": 1984 | |
| }, | |
| { | |
| "epoch": 0.15369606003752345, | |
| "grad_norm": 0.314120888710022, | |
| "learning_rate": 0.002828, | |
| "loss": 3.0751, | |
| "step": 2048 | |
| }, | |
| { | |
| "epoch": 0.15849906191369606, | |
| "grad_norm": 0.3158409297466278, | |
| "learning_rate": 0.002828, | |
| "loss": 3.0574, | |
| "step": 2112 | |
| }, | |
| { | |
| "epoch": 0.16330206378986867, | |
| "grad_norm": 0.35668376088142395, | |
| "learning_rate": 0.002828, | |
| "loss": 3.0598, | |
| "step": 2176 | |
| }, | |
| { | |
| "epoch": 0.16810506566604128, | |
| "grad_norm": 0.3429064452648163, | |
| "learning_rate": 0.002828, | |
| "loss": 3.0554, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.17290806754221388, | |
| "grad_norm": 0.37981563806533813, | |
| "learning_rate": 0.002828, | |
| "loss": 3.0439, | |
| "step": 2304 | |
| }, | |
| { | |
| "epoch": 0.1777110694183865, | |
| "grad_norm": 0.45046043395996094, | |
| "learning_rate": 0.002828, | |
| "loss": 3.034, | |
| "step": 2368 | |
| }, | |
| { | |
| "epoch": 0.1825140712945591, | |
| "grad_norm": 0.30424681305885315, | |
| "learning_rate": 0.002828, | |
| "loss": 3.0408, | |
| "step": 2432 | |
| }, | |
| { | |
| "epoch": 0.1873170731707317, | |
| "grad_norm": 0.4374525845050812, | |
| "learning_rate": 0.002828, | |
| "loss": 3.0289, | |
| "step": 2496 | |
| }, | |
| { | |
| "epoch": 0.19212007504690432, | |
| "grad_norm": 0.4312361776828766, | |
| "learning_rate": 0.002828, | |
| "loss": 3.0252, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.19692307692307692, | |
| "grad_norm": 0.33109021186828613, | |
| "learning_rate": 0.002828, | |
| "loss": 3.0094, | |
| "step": 2624 | |
| }, | |
| { | |
| "epoch": 0.20172607879924953, | |
| "grad_norm": 0.4393901228904724, | |
| "learning_rate": 0.002828, | |
| "loss": 3.0021, | |
| "step": 2688 | |
| }, | |
| { | |
| "epoch": 0.20652908067542214, | |
| "grad_norm": 0.44241341948509216, | |
| "learning_rate": 0.002828, | |
| "loss": 3.0005, | |
| "step": 2752 | |
| }, | |
| { | |
| "epoch": 0.21133208255159475, | |
| "grad_norm": 0.36241745948791504, | |
| "learning_rate": 0.002828, | |
| "loss": 2.9939, | |
| "step": 2816 | |
| }, | |
| { | |
| "epoch": 0.21613508442776735, | |
| "grad_norm": 0.40780672430992126, | |
| "learning_rate": 0.002828, | |
| "loss": 2.9788, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.22093808630393996, | |
| "grad_norm": 0.3944590389728546, | |
| "learning_rate": 0.002828, | |
| "loss": 2.9854, | |
| "step": 2944 | |
| }, | |
| { | |
| "epoch": 0.22574108818011257, | |
| "grad_norm": 0.40449267625808716, | |
| "learning_rate": 0.002828, | |
| "loss": 2.9819, | |
| "step": 3008 | |
| }, | |
| { | |
| "epoch": 0.23054409005628518, | |
| "grad_norm": 0.37247487902641296, | |
| "learning_rate": 0.002828, | |
| "loss": 2.9827, | |
| "step": 3072 | |
| }, | |
| { | |
| "epoch": 0.23534709193245779, | |
| "grad_norm": 0.3732891082763672, | |
| "learning_rate": 0.002828, | |
| "loss": 2.9714, | |
| "step": 3136 | |
| }, | |
| { | |
| "epoch": 0.2401500938086304, | |
| "grad_norm": 0.3168690800666809, | |
| "learning_rate": 0.002828, | |
| "loss": 2.9649, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.244953095684803, | |
| "grad_norm": 0.32185083627700806, | |
| "learning_rate": 0.002828, | |
| "loss": 2.9607, | |
| "step": 3264 | |
| }, | |
| { | |
| "epoch": 0.2497560975609756, | |
| "grad_norm": 0.3293335437774658, | |
| "learning_rate": 0.002828, | |
| "loss": 2.9464, | |
| "step": 3328 | |
| }, | |
| { | |
| "epoch": 0.2545590994371482, | |
| "grad_norm": 0.39153945446014404, | |
| "learning_rate": 0.002828, | |
| "loss": 2.9513, | |
| "step": 3392 | |
| }, | |
| { | |
| "epoch": 0.2593621013133208, | |
| "grad_norm": 0.36884990334510803, | |
| "learning_rate": 0.002828, | |
| "loss": 2.9418, | |
| "step": 3456 | |
| }, | |
| { | |
| "epoch": 0.26416510318949343, | |
| "grad_norm": 0.39196011424064636, | |
| "learning_rate": 0.002828, | |
| "loss": 2.9407, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.26896810506566604, | |
| "grad_norm": 0.36011603474617004, | |
| "learning_rate": 0.002828, | |
| "loss": 2.9461, | |
| "step": 3584 | |
| }, | |
| { | |
| "epoch": 0.27377110694183865, | |
| "grad_norm": 0.3608081638813019, | |
| "learning_rate": 0.002828, | |
| "loss": 2.937, | |
| "step": 3648 | |
| }, | |
| { | |
| "epoch": 0.27857410881801126, | |
| "grad_norm": 0.3833774924278259, | |
| "learning_rate": 0.002828, | |
| "loss": 2.9254, | |
| "step": 3712 | |
| }, | |
| { | |
| "epoch": 0.28337711069418386, | |
| "grad_norm": 0.35225459933280945, | |
| "learning_rate": 0.002828, | |
| "loss": 2.9165, | |
| "step": 3776 | |
| }, | |
| { | |
| "epoch": 0.2881801125703565, | |
| "grad_norm": 0.39832860231399536, | |
| "learning_rate": 0.002828, | |
| "loss": 2.9259, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.2929831144465291, | |
| "grad_norm": 0.36834558844566345, | |
| "learning_rate": 0.002828, | |
| "loss": 2.9186, | |
| "step": 3904 | |
| }, | |
| { | |
| "epoch": 0.2977861163227017, | |
| "grad_norm": 0.3877101540565491, | |
| "learning_rate": 0.002828, | |
| "loss": 2.9107, | |
| "step": 3968 | |
| }, | |
| { | |
| "epoch": 0.3025891181988743, | |
| "grad_norm": 0.40037983655929565, | |
| "learning_rate": 0.002828, | |
| "loss": 2.9086, | |
| "step": 4032 | |
| }, | |
| { | |
| "epoch": 0.3073921200750469, | |
| "grad_norm": 0.35432353615760803, | |
| "learning_rate": 0.002828, | |
| "loss": 2.9039, | |
| "step": 4096 | |
| }, | |
| { | |
| "epoch": 0.3121951219512195, | |
| "grad_norm": 0.3740752935409546, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8973, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.3169981238273921, | |
| "grad_norm": 0.3972289264202118, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8868, | |
| "step": 4224 | |
| }, | |
| { | |
| "epoch": 0.3218011257035647, | |
| "grad_norm": 0.3818065822124481, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8916, | |
| "step": 4288 | |
| }, | |
| { | |
| "epoch": 0.32660412757973734, | |
| "grad_norm": 0.31802886724472046, | |
| "learning_rate": 0.002828, | |
| "loss": 2.895, | |
| "step": 4352 | |
| }, | |
| { | |
| "epoch": 0.33140712945590994, | |
| "grad_norm": 0.3920498192310333, | |
| "learning_rate": 0.002828, | |
| "loss": 2.896, | |
| "step": 4416 | |
| }, | |
| { | |
| "epoch": 0.33621013133208255, | |
| "grad_norm": 0.42001602053642273, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8757, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.34101313320825516, | |
| "grad_norm": 0.38037222623825073, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8812, | |
| "step": 4544 | |
| }, | |
| { | |
| "epoch": 0.34581613508442777, | |
| "grad_norm": 0.6402748823165894, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8741, | |
| "step": 4608 | |
| }, | |
| { | |
| "epoch": 0.3506191369606004, | |
| "grad_norm": 0.3265625536441803, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8659, | |
| "step": 4672 | |
| }, | |
| { | |
| "epoch": 0.355422138836773, | |
| "grad_norm": 0.3389698565006256, | |
| "learning_rate": 0.002828, | |
| "loss": 2.863, | |
| "step": 4736 | |
| }, | |
| { | |
| "epoch": 0.3602251407129456, | |
| "grad_norm": 0.34922096133232117, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8555, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.3650281425891182, | |
| "grad_norm": 0.370980441570282, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8624, | |
| "step": 4864 | |
| }, | |
| { | |
| "epoch": 0.3698311444652908, | |
| "grad_norm": 0.3553221821784973, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8573, | |
| "step": 4928 | |
| }, | |
| { | |
| "epoch": 0.3746341463414634, | |
| "grad_norm": 0.36796537041664124, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8567, | |
| "step": 4992 | |
| }, | |
| { | |
| "epoch": 0.379437148217636, | |
| "grad_norm": 0.3615240752696991, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8444, | |
| "step": 5056 | |
| }, | |
| { | |
| "epoch": 0.38424015009380863, | |
| "grad_norm": 0.4196101427078247, | |
| "learning_rate": 0.002828, | |
| "loss": 2.845, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.38904315196998124, | |
| "grad_norm": 0.334185928106308, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8376, | |
| "step": 5184 | |
| }, | |
| { | |
| "epoch": 0.39384615384615385, | |
| "grad_norm": 0.30093756318092346, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8302, | |
| "step": 5248 | |
| }, | |
| { | |
| "epoch": 0.39864915572232645, | |
| "grad_norm": 0.41615140438079834, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8365, | |
| "step": 5312 | |
| }, | |
| { | |
| "epoch": 0.40345215759849906, | |
| "grad_norm": 0.38547712564468384, | |
| "learning_rate": 0.002828, | |
| "loss": 2.833, | |
| "step": 5376 | |
| }, | |
| { | |
| "epoch": 0.40825515947467167, | |
| "grad_norm": 0.336453378200531, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8289, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 0.4130581613508443, | |
| "grad_norm": 0.33043336868286133, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8154, | |
| "step": 5504 | |
| }, | |
| { | |
| "epoch": 0.4178611632270169, | |
| "grad_norm": 0.33151519298553467, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8267, | |
| "step": 5568 | |
| }, | |
| { | |
| "epoch": 0.4226641651031895, | |
| "grad_norm": 0.29418498277664185, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8167, | |
| "step": 5632 | |
| }, | |
| { | |
| "epoch": 0.4274671669793621, | |
| "grad_norm": 0.3507523536682129, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8227, | |
| "step": 5696 | |
| }, | |
| { | |
| "epoch": 0.4322701688555347, | |
| "grad_norm": 0.36976736783981323, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8087, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 0.4370731707317073, | |
| "grad_norm": 0.4142448604106903, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8191, | |
| "step": 5824 | |
| }, | |
| { | |
| "epoch": 0.4418761726078799, | |
| "grad_norm": 0.3893688917160034, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8032, | |
| "step": 5888 | |
| }, | |
| { | |
| "epoch": 0.44667917448405253, | |
| "grad_norm": 0.3025995194911957, | |
| "learning_rate": 0.002828, | |
| "loss": 2.8049, | |
| "step": 5952 | |
| }, | |
| { | |
| "epoch": 0.45148217636022514, | |
| "grad_norm": 0.3676198422908783, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7976, | |
| "step": 6016 | |
| }, | |
| { | |
| "epoch": 0.45628517823639775, | |
| "grad_norm": 0.39022454619407654, | |
| "learning_rate": 0.002828, | |
| "loss": 2.796, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 0.46108818011257036, | |
| "grad_norm": 0.38986560702323914, | |
| "learning_rate": 0.002828, | |
| "loss": 2.791, | |
| "step": 6144 | |
| }, | |
| { | |
| "epoch": 0.46589118198874296, | |
| "grad_norm": 0.35879769921302795, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7949, | |
| "step": 6208 | |
| }, | |
| { | |
| "epoch": 0.47069418386491557, | |
| "grad_norm": 0.44419315457344055, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7862, | |
| "step": 6272 | |
| }, | |
| { | |
| "epoch": 0.4754971857410882, | |
| "grad_norm": 0.30884304642677307, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7864, | |
| "step": 6336 | |
| }, | |
| { | |
| "epoch": 0.4803001876172608, | |
| "grad_norm": 0.542960524559021, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7842, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.4851031894934334, | |
| "grad_norm": 0.39032405614852905, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7798, | |
| "step": 6464 | |
| }, | |
| { | |
| "epoch": 0.489906191369606, | |
| "grad_norm": 0.3760650157928467, | |
| "learning_rate": 0.002828, | |
| "loss": 2.78, | |
| "step": 6528 | |
| }, | |
| { | |
| "epoch": 0.4947091932457786, | |
| "grad_norm": 0.33309632539749146, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7741, | |
| "step": 6592 | |
| }, | |
| { | |
| "epoch": 0.4995121951219512, | |
| "grad_norm": 0.37640711665153503, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7795, | |
| "step": 6656 | |
| }, | |
| { | |
| "epoch": 0.5043151969981239, | |
| "grad_norm": 0.36830273270606995, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7596, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 0.5091181988742964, | |
| "grad_norm": 0.3751394748687744, | |
| "learning_rate": 0.002828, | |
| "loss": 2.761, | |
| "step": 6784 | |
| }, | |
| { | |
| "epoch": 0.5139212007504691, | |
| "grad_norm": 0.3472868800163269, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7567, | |
| "step": 6848 | |
| }, | |
| { | |
| "epoch": 0.5187242026266417, | |
| "grad_norm": 0.3749905526638031, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7654, | |
| "step": 6912 | |
| }, | |
| { | |
| "epoch": 0.5235272045028143, | |
| "grad_norm": 0.4672335982322693, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7467, | |
| "step": 6976 | |
| }, | |
| { | |
| "epoch": 0.5283302063789869, | |
| "grad_norm": 0.30083194375038147, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7596, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 0.5331332082551595, | |
| "grad_norm": 0.34232673048973083, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7425, | |
| "step": 7104 | |
| }, | |
| { | |
| "epoch": 0.5379362101313321, | |
| "grad_norm": 0.42222973704338074, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7486, | |
| "step": 7168 | |
| }, | |
| { | |
| "epoch": 0.5427392120075047, | |
| "grad_norm": 0.36008650064468384, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7451, | |
| "step": 7232 | |
| }, | |
| { | |
| "epoch": 0.5475422138836773, | |
| "grad_norm": 0.34359127283096313, | |
| "learning_rate": 0.002828, | |
| "loss": 2.734, | |
| "step": 7296 | |
| }, | |
| { | |
| "epoch": 0.55234521575985, | |
| "grad_norm": 0.3953745663166046, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7397, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 0.5571482176360225, | |
| "grad_norm": 0.36703094840049744, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7313, | |
| "step": 7424 | |
| }, | |
| { | |
| "epoch": 0.5619512195121952, | |
| "grad_norm": 0.31787919998168945, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7363, | |
| "step": 7488 | |
| }, | |
| { | |
| "epoch": 0.5667542213883677, | |
| "grad_norm": 0.31179967522621155, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7236, | |
| "step": 7552 | |
| }, | |
| { | |
| "epoch": 0.5715572232645404, | |
| "grad_norm": 0.3990299105644226, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7191, | |
| "step": 7616 | |
| }, | |
| { | |
| "epoch": 0.576360225140713, | |
| "grad_norm": 0.3776848018169403, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7244, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 0.5811632270168856, | |
| "grad_norm": 0.36117562651634216, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7131, | |
| "step": 7744 | |
| }, | |
| { | |
| "epoch": 0.5859662288930582, | |
| "grad_norm": 0.3219313323497772, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7202, | |
| "step": 7808 | |
| }, | |
| { | |
| "epoch": 0.5907692307692308, | |
| "grad_norm": 0.4501495957374573, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7115, | |
| "step": 7872 | |
| }, | |
| { | |
| "epoch": 0.5955722326454034, | |
| "grad_norm": 0.3939913809299469, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7076, | |
| "step": 7936 | |
| }, | |
| { | |
| "epoch": 0.600375234521576, | |
| "grad_norm": 0.3244933485984802, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7047, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.6051782363977486, | |
| "grad_norm": 0.3094891607761383, | |
| "learning_rate": 0.002828, | |
| "loss": 2.698, | |
| "step": 8064 | |
| }, | |
| { | |
| "epoch": 0.6099812382739213, | |
| "grad_norm": 0.3525580167770386, | |
| "learning_rate": 0.002828, | |
| "loss": 2.7056, | |
| "step": 8128 | |
| }, | |
| { | |
| "epoch": 0.6147842401500938, | |
| "grad_norm": 0.3058718144893646, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6937, | |
| "step": 8192 | |
| }, | |
| { | |
| "epoch": 0.6195872420262665, | |
| "grad_norm": 0.31864726543426514, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6935, | |
| "step": 8256 | |
| }, | |
| { | |
| "epoch": 0.624390243902439, | |
| "grad_norm": 0.3197256028652191, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6981, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 0.6291932457786117, | |
| "grad_norm": 0.30954182147979736, | |
| "learning_rate": 0.002828, | |
| "loss": 2.705, | |
| "step": 8384 | |
| }, | |
| { | |
| "epoch": 0.6339962476547842, | |
| "grad_norm": 0.4144911468029022, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6832, | |
| "step": 8448 | |
| }, | |
| { | |
| "epoch": 0.6387992495309569, | |
| "grad_norm": 0.34720951318740845, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6858, | |
| "step": 8512 | |
| }, | |
| { | |
| "epoch": 0.6436022514071295, | |
| "grad_norm": 0.30545172095298767, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6758, | |
| "step": 8576 | |
| }, | |
| { | |
| "epoch": 0.6484052532833021, | |
| "grad_norm": 0.3341416120529175, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6673, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 0.6532082551594747, | |
| "grad_norm": 0.5191973447799683, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6798, | |
| "step": 8704 | |
| }, | |
| { | |
| "epoch": 0.6580112570356473, | |
| "grad_norm": 0.44382575154304504, | |
| "learning_rate": 0.002828, | |
| "loss": 2.683, | |
| "step": 8768 | |
| }, | |
| { | |
| "epoch": 0.6628142589118199, | |
| "grad_norm": 0.45676809549331665, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6731, | |
| "step": 8832 | |
| }, | |
| { | |
| "epoch": 0.6676172607879926, | |
| "grad_norm": 0.3542475700378418, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6813, | |
| "step": 8896 | |
| }, | |
| { | |
| "epoch": 0.6724202626641651, | |
| "grad_norm": 0.3976110517978668, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6714, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 0.6772232645403378, | |
| "grad_norm": 0.37194061279296875, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6646, | |
| "step": 9024 | |
| }, | |
| { | |
| "epoch": 0.6820262664165103, | |
| "grad_norm": 0.4080849289894104, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6638, | |
| "step": 9088 | |
| }, | |
| { | |
| "epoch": 0.686829268292683, | |
| "grad_norm": 0.3275296986103058, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6643, | |
| "step": 9152 | |
| }, | |
| { | |
| "epoch": 0.6916322701688555, | |
| "grad_norm": 0.4300732910633087, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6545, | |
| "step": 9216 | |
| }, | |
| { | |
| "epoch": 0.6964352720450282, | |
| "grad_norm": 0.528816282749176, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6639, | |
| "step": 9280 | |
| }, | |
| { | |
| "epoch": 0.7012382739212007, | |
| "grad_norm": 0.39729437232017517, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6669, | |
| "step": 9344 | |
| }, | |
| { | |
| "epoch": 0.7060412757973734, | |
| "grad_norm": 0.36177024245262146, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6429, | |
| "step": 9408 | |
| }, | |
| { | |
| "epoch": 0.710844277673546, | |
| "grad_norm": 0.3488599956035614, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6409, | |
| "step": 9472 | |
| }, | |
| { | |
| "epoch": 0.7156472795497186, | |
| "grad_norm": 0.361208438873291, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6354, | |
| "step": 9536 | |
| }, | |
| { | |
| "epoch": 0.7204502814258912, | |
| "grad_norm": 0.3307696282863617, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6398, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.7252532833020638, | |
| "grad_norm": 0.47409588098526, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6899, | |
| "step": 9664 | |
| }, | |
| { | |
| "epoch": 0.7300562851782364, | |
| "grad_norm": 0.43482983112335205, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6675, | |
| "step": 9728 | |
| }, | |
| { | |
| "epoch": 0.7348592870544091, | |
| "grad_norm": 0.43177512288093567, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6703, | |
| "step": 9792 | |
| }, | |
| { | |
| "epoch": 0.7396622889305816, | |
| "grad_norm": 0.5830815434455872, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6698, | |
| "step": 9856 | |
| }, | |
| { | |
| "epoch": 0.7444652908067543, | |
| "grad_norm": 0.42559024691581726, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6687, | |
| "step": 9920 | |
| }, | |
| { | |
| "epoch": 0.7492682926829268, | |
| "grad_norm": 0.36572182178497314, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6602, | |
| "step": 9984 | |
| }, | |
| { | |
| "epoch": 0.7540712945590995, | |
| "grad_norm": 0.42863738536834717, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6684, | |
| "step": 10048 | |
| }, | |
| { | |
| "epoch": 0.758874296435272, | |
| "grad_norm": 0.34681934118270874, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6618, | |
| "step": 10112 | |
| }, | |
| { | |
| "epoch": 0.7636772983114447, | |
| "grad_norm": 0.40332967042922974, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6523, | |
| "step": 10176 | |
| }, | |
| { | |
| "epoch": 0.7684803001876173, | |
| "grad_norm": 0.47137463092803955, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6543, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 0.7732833020637899, | |
| "grad_norm": 0.3324384093284607, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6444, | |
| "step": 10304 | |
| }, | |
| { | |
| "epoch": 0.7780863039399625, | |
| "grad_norm": 0.3714103698730469, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6466, | |
| "step": 10368 | |
| }, | |
| { | |
| "epoch": 0.7828893058161351, | |
| "grad_norm": 0.3684547543525696, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6497, | |
| "step": 10432 | |
| }, | |
| { | |
| "epoch": 0.7876923076923077, | |
| "grad_norm": 0.3580617606639862, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6428, | |
| "step": 10496 | |
| }, | |
| { | |
| "epoch": 0.7924953095684804, | |
| "grad_norm": 0.4132176339626312, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6407, | |
| "step": 10560 | |
| }, | |
| { | |
| "epoch": 0.7972983114446529, | |
| "grad_norm": 0.4079800546169281, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6374, | |
| "step": 10624 | |
| }, | |
| { | |
| "epoch": 0.8021013133208256, | |
| "grad_norm": 0.40170854330062866, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6319, | |
| "step": 10688 | |
| }, | |
| { | |
| "epoch": 0.8069043151969981, | |
| "grad_norm": 0.4748755097389221, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6489, | |
| "step": 10752 | |
| }, | |
| { | |
| "epoch": 0.8117073170731708, | |
| "grad_norm": 0.3806183338165283, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6363, | |
| "step": 10816 | |
| }, | |
| { | |
| "epoch": 0.8165103189493433, | |
| "grad_norm": 0.32777532935142517, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6386, | |
| "step": 10880 | |
| }, | |
| { | |
| "epoch": 0.821313320825516, | |
| "grad_norm": 0.4884773790836334, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6293, | |
| "step": 10944 | |
| }, | |
| { | |
| "epoch": 0.8261163227016886, | |
| "grad_norm": 0.43175649642944336, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6351, | |
| "step": 11008 | |
| }, | |
| { | |
| "epoch": 0.8309193245778612, | |
| "grad_norm": 0.44375500082969666, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6272, | |
| "step": 11072 | |
| }, | |
| { | |
| "epoch": 0.8357223264540338, | |
| "grad_norm": 0.36503469944000244, | |
| "learning_rate": 0.002828, | |
| "loss": 2.628, | |
| "step": 11136 | |
| }, | |
| { | |
| "epoch": 0.8405253283302064, | |
| "grad_norm": 0.3493196368217468, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6238, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.845328330206379, | |
| "grad_norm": 0.3593812584877014, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6161, | |
| "step": 11264 | |
| }, | |
| { | |
| "epoch": 0.8501313320825516, | |
| "grad_norm": 0.4043927788734436, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6248, | |
| "step": 11328 | |
| }, | |
| { | |
| "epoch": 0.8549343339587242, | |
| "grad_norm": 0.3805730938911438, | |
| "learning_rate": 0.002828, | |
| "loss": 2.619, | |
| "step": 11392 | |
| }, | |
| { | |
| "epoch": 0.8597373358348969, | |
| "grad_norm": 0.40822461247444153, | |
| "learning_rate": 0.002828, | |
| "loss": 2.619, | |
| "step": 11456 | |
| }, | |
| { | |
| "epoch": 0.8645403377110694, | |
| "grad_norm": 0.3430253565311432, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6162, | |
| "step": 11520 | |
| }, | |
| { | |
| "epoch": 0.8693433395872421, | |
| "grad_norm": 0.3665921688079834, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6083, | |
| "step": 11584 | |
| }, | |
| { | |
| "epoch": 0.8741463414634146, | |
| "grad_norm": 0.3768637776374817, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6085, | |
| "step": 11648 | |
| }, | |
| { | |
| "epoch": 0.8789493433395873, | |
| "grad_norm": 0.6709098219871521, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6067, | |
| "step": 11712 | |
| }, | |
| { | |
| "epoch": 0.8837523452157598, | |
| "grad_norm": 0.37109729647636414, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5975, | |
| "step": 11776 | |
| }, | |
| { | |
| "epoch": 0.8885553470919325, | |
| "grad_norm": 0.35545358061790466, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6086, | |
| "step": 11840 | |
| }, | |
| { | |
| "epoch": 0.8933583489681051, | |
| "grad_norm": 0.34493309259414673, | |
| "learning_rate": 0.002828, | |
| "loss": 2.6009, | |
| "step": 11904 | |
| }, | |
| { | |
| "epoch": 0.8981613508442777, | |
| "grad_norm": 0.35226738452911377, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5909, | |
| "step": 11968 | |
| }, | |
| { | |
| "epoch": 0.9029643527204503, | |
| "grad_norm": 0.3626823425292969, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5954, | |
| "step": 12032 | |
| }, | |
| { | |
| "epoch": 0.9077673545966229, | |
| "grad_norm": 0.4639281630516052, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5976, | |
| "step": 12096 | |
| }, | |
| { | |
| "epoch": 0.9125703564727955, | |
| "grad_norm": 0.425073504447937, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5846, | |
| "step": 12160 | |
| }, | |
| { | |
| "epoch": 0.9173733583489682, | |
| "grad_norm": 0.4849206507205963, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5851, | |
| "step": 12224 | |
| }, | |
| { | |
| "epoch": 0.9221763602251407, | |
| "grad_norm": 0.3517647385597229, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5832, | |
| "step": 12288 | |
| }, | |
| { | |
| "epoch": 0.9269793621013134, | |
| "grad_norm": 0.4217440187931061, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5777, | |
| "step": 12352 | |
| }, | |
| { | |
| "epoch": 0.9317823639774859, | |
| "grad_norm": 0.3862438499927521, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5769, | |
| "step": 12416 | |
| }, | |
| { | |
| "epoch": 0.9365853658536586, | |
| "grad_norm": 0.4026007056236267, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5802, | |
| "step": 12480 | |
| }, | |
| { | |
| "epoch": 0.9413883677298311, | |
| "grad_norm": 0.3353049159049988, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5741, | |
| "step": 12544 | |
| }, | |
| { | |
| "epoch": 0.9461913696060038, | |
| "grad_norm": 0.35357797145843506, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5723, | |
| "step": 12608 | |
| }, | |
| { | |
| "epoch": 0.9509943714821764, | |
| "grad_norm": 0.35685861110687256, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5801, | |
| "step": 12672 | |
| }, | |
| { | |
| "epoch": 0.955797373358349, | |
| "grad_norm": 0.36265361309051514, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5784, | |
| "step": 12736 | |
| }, | |
| { | |
| "epoch": 0.9606003752345216, | |
| "grad_norm": 0.4119773805141449, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5646, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.9654033771106942, | |
| "grad_norm": 0.3662680387496948, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5693, | |
| "step": 12864 | |
| }, | |
| { | |
| "epoch": 0.9702063789868668, | |
| "grad_norm": 0.3822716772556305, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5643, | |
| "step": 12928 | |
| }, | |
| { | |
| "epoch": 0.9750093808630395, | |
| "grad_norm": 0.3412950038909912, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5646, | |
| "step": 12992 | |
| }, | |
| { | |
| "epoch": 0.979812382739212, | |
| "grad_norm": 0.373353511095047, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5614, | |
| "step": 13056 | |
| }, | |
| { | |
| "epoch": 0.9846153846153847, | |
| "grad_norm": 10.112526893615723, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5578, | |
| "step": 13120 | |
| }, | |
| { | |
| "epoch": 0.9894183864915572, | |
| "grad_norm": 0.36393383145332336, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5696, | |
| "step": 13184 | |
| }, | |
| { | |
| "epoch": 0.9942213883677299, | |
| "grad_norm": 0.4176023006439209, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5533, | |
| "step": 13248 | |
| }, | |
| { | |
| "epoch": 0.9990243902439024, | |
| "grad_norm": 0.4248984456062317, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5569, | |
| "step": 13312 | |
| }, | |
| { | |
| "epoch": 1.003827392120075, | |
| "grad_norm": 0.3931824564933777, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5246, | |
| "step": 13376 | |
| }, | |
| { | |
| "epoch": 1.0086303939962478, | |
| "grad_norm": 0.3742982745170593, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5104, | |
| "step": 13440 | |
| }, | |
| { | |
| "epoch": 1.0134333958724202, | |
| "grad_norm": 0.4388613998889923, | |
| "learning_rate": 0.002828, | |
| "loss": 2.519, | |
| "step": 13504 | |
| }, | |
| { | |
| "epoch": 1.0182363977485929, | |
| "grad_norm": 0.41458427906036377, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5162, | |
| "step": 13568 | |
| }, | |
| { | |
| "epoch": 1.0230393996247655, | |
| "grad_norm": 0.3841855227947235, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5129, | |
| "step": 13632 | |
| }, | |
| { | |
| "epoch": 1.027842401500938, | |
| "grad_norm": 0.43930500745773315, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5179, | |
| "step": 13696 | |
| }, | |
| { | |
| "epoch": 1.0326454033771106, | |
| "grad_norm": 0.3687760531902313, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5006, | |
| "step": 13760 | |
| }, | |
| { | |
| "epoch": 1.0374484052532833, | |
| "grad_norm": 0.3823833465576172, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5039, | |
| "step": 13824 | |
| }, | |
| { | |
| "epoch": 1.042251407129456, | |
| "grad_norm": 0.40025222301483154, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5155, | |
| "step": 13888 | |
| }, | |
| { | |
| "epoch": 1.0470544090056286, | |
| "grad_norm": 0.40790122747421265, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5064, | |
| "step": 13952 | |
| }, | |
| { | |
| "epoch": 1.051857410881801, | |
| "grad_norm": 0.42718634009361267, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5095, | |
| "step": 14016 | |
| }, | |
| { | |
| "epoch": 1.0566604127579737, | |
| "grad_norm": 0.3305782079696655, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5, | |
| "step": 14080 | |
| }, | |
| { | |
| "epoch": 1.0614634146341464, | |
| "grad_norm": 0.37126559019088745, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5099, | |
| "step": 14144 | |
| }, | |
| { | |
| "epoch": 1.0662664165103188, | |
| "grad_norm": 0.414987176656723, | |
| "learning_rate": 0.002828, | |
| "loss": 2.501, | |
| "step": 14208 | |
| }, | |
| { | |
| "epoch": 1.0710694183864915, | |
| "grad_norm": 0.45917075872421265, | |
| "learning_rate": 0.002828, | |
| "loss": 2.5062, | |
| "step": 14272 | |
| }, | |
| { | |
| "epoch": 1.0758724202626642, | |
| "grad_norm": 0.4362465739250183, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4949, | |
| "step": 14336 | |
| }, | |
| { | |
| "epoch": 1.0806754221388368, | |
| "grad_norm": 0.40015289187431335, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4957, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 1.0854784240150095, | |
| "grad_norm": 0.3781159818172455, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4979, | |
| "step": 14464 | |
| }, | |
| { | |
| "epoch": 1.090281425891182, | |
| "grad_norm": 0.4165579676628113, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4913, | |
| "step": 14528 | |
| }, | |
| { | |
| "epoch": 1.0950844277673546, | |
| "grad_norm": 0.4100767970085144, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4978, | |
| "step": 14592 | |
| }, | |
| { | |
| "epoch": 1.0998874296435273, | |
| "grad_norm": 0.4211256504058838, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4871, | |
| "step": 14656 | |
| }, | |
| { | |
| "epoch": 1.1046904315196997, | |
| "grad_norm": 0.390396386384964, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4933, | |
| "step": 14720 | |
| }, | |
| { | |
| "epoch": 1.1094934333958724, | |
| "grad_norm": 0.3585355281829834, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4811, | |
| "step": 14784 | |
| }, | |
| { | |
| "epoch": 1.114296435272045, | |
| "grad_norm": 0.5148431062698364, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4888, | |
| "step": 14848 | |
| }, | |
| { | |
| "epoch": 1.1190994371482177, | |
| "grad_norm": 0.44254639744758606, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4821, | |
| "step": 14912 | |
| }, | |
| { | |
| "epoch": 1.1239024390243904, | |
| "grad_norm": 0.3710468113422394, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4819, | |
| "step": 14976 | |
| }, | |
| { | |
| "epoch": 1.1287054409005628, | |
| "grad_norm": 0.41197285056114197, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4842, | |
| "step": 15040 | |
| }, | |
| { | |
| "epoch": 1.1335084427767355, | |
| "grad_norm": 0.37512508034706116, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4776, | |
| "step": 15104 | |
| }, | |
| { | |
| "epoch": 1.1383114446529081, | |
| "grad_norm": 0.4286038279533386, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4748, | |
| "step": 15168 | |
| }, | |
| { | |
| "epoch": 1.1431144465290806, | |
| "grad_norm": 0.37446776032447815, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4727, | |
| "step": 15232 | |
| }, | |
| { | |
| "epoch": 1.1479174484052532, | |
| "grad_norm": 0.4537597894668579, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4663, | |
| "step": 15296 | |
| }, | |
| { | |
| "epoch": 1.152720450281426, | |
| "grad_norm": 0.36247050762176514, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4699, | |
| "step": 15360 | |
| }, | |
| { | |
| "epoch": 1.1575234521575986, | |
| "grad_norm": 0.3772297501564026, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4734, | |
| "step": 15424 | |
| }, | |
| { | |
| "epoch": 1.1623264540337712, | |
| "grad_norm": 0.3789200186729431, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4696, | |
| "step": 15488 | |
| }, | |
| { | |
| "epoch": 1.1671294559099437, | |
| "grad_norm": 0.36870113015174866, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4671, | |
| "step": 15552 | |
| }, | |
| { | |
| "epoch": 1.1719324577861163, | |
| "grad_norm": 0.37448298931121826, | |
| "learning_rate": 0.002828, | |
| "loss": 2.462, | |
| "step": 15616 | |
| }, | |
| { | |
| "epoch": 1.176735459662289, | |
| "grad_norm": 0.4384878873825073, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4648, | |
| "step": 15680 | |
| }, | |
| { | |
| "epoch": 1.1815384615384614, | |
| "grad_norm": 0.37811148166656494, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4598, | |
| "step": 15744 | |
| }, | |
| { | |
| "epoch": 1.186341463414634, | |
| "grad_norm": 0.4190385341644287, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4643, | |
| "step": 15808 | |
| }, | |
| { | |
| "epoch": 1.1911444652908068, | |
| "grad_norm": 0.48885485529899597, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4564, | |
| "step": 15872 | |
| }, | |
| { | |
| "epoch": 1.1959474671669794, | |
| "grad_norm": 0.42267611622810364, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4671, | |
| "step": 15936 | |
| }, | |
| { | |
| "epoch": 1.200750469043152, | |
| "grad_norm": 0.3886626064777374, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4715, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.2055534709193245, | |
| "grad_norm": 0.40871456265449524, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4558, | |
| "step": 16064 | |
| }, | |
| { | |
| "epoch": 1.2103564727954972, | |
| "grad_norm": 0.46952739357948303, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4497, | |
| "step": 16128 | |
| }, | |
| { | |
| "epoch": 1.2151594746716698, | |
| "grad_norm": 0.41340023279190063, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4402, | |
| "step": 16192 | |
| }, | |
| { | |
| "epoch": 1.2199624765478423, | |
| "grad_norm": 0.36176440119743347, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4473, | |
| "step": 16256 | |
| }, | |
| { | |
| "epoch": 1.224765478424015, | |
| "grad_norm": 0.4117899239063263, | |
| "learning_rate": 0.002828, | |
| "loss": 2.443, | |
| "step": 16320 | |
| }, | |
| { | |
| "epoch": 1.2295684803001876, | |
| "grad_norm": 0.5039286613464355, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4557, | |
| "step": 16384 | |
| }, | |
| { | |
| "epoch": 1.2343714821763603, | |
| "grad_norm": 0.3716677129268646, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4522, | |
| "step": 16448 | |
| }, | |
| { | |
| "epoch": 1.239174484052533, | |
| "grad_norm": 0.42316168546676636, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4424, | |
| "step": 16512 | |
| }, | |
| { | |
| "epoch": 1.2439774859287054, | |
| "grad_norm": 0.5081620216369629, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4325, | |
| "step": 16576 | |
| }, | |
| { | |
| "epoch": 1.248780487804878, | |
| "grad_norm": 0.39409589767456055, | |
| "learning_rate": 0.002828, | |
| "loss": 2.435, | |
| "step": 16640 | |
| }, | |
| { | |
| "epoch": 1.2535834896810507, | |
| "grad_norm": 0.38638824224472046, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4363, | |
| "step": 16704 | |
| }, | |
| { | |
| "epoch": 1.2583864915572232, | |
| "grad_norm": 0.41918718814849854, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4404, | |
| "step": 16768 | |
| }, | |
| { | |
| "epoch": 1.2631894934333958, | |
| "grad_norm": 0.3932395279407501, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4403, | |
| "step": 16832 | |
| }, | |
| { | |
| "epoch": 1.2679924953095685, | |
| "grad_norm": 0.3787371814250946, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4386, | |
| "step": 16896 | |
| }, | |
| { | |
| "epoch": 1.2727954971857411, | |
| "grad_norm": 0.40612953901290894, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4219, | |
| "step": 16960 | |
| }, | |
| { | |
| "epoch": 1.2775984990619138, | |
| "grad_norm": 0.4243071675300598, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4261, | |
| "step": 17024 | |
| }, | |
| { | |
| "epoch": 1.2824015009380862, | |
| "grad_norm": 0.4240303039550781, | |
| "learning_rate": 0.002828, | |
| "loss": 2.444, | |
| "step": 17088 | |
| }, | |
| { | |
| "epoch": 1.287204502814259, | |
| "grad_norm": 0.4888259470462799, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4344, | |
| "step": 17152 | |
| }, | |
| { | |
| "epoch": 1.2920075046904316, | |
| "grad_norm": 0.4678399860858917, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4306, | |
| "step": 17216 | |
| }, | |
| { | |
| "epoch": 1.296810506566604, | |
| "grad_norm": 0.38733649253845215, | |
| "learning_rate": 0.002828, | |
| "loss": 2.431, | |
| "step": 17280 | |
| }, | |
| { | |
| "epoch": 1.3016135084427767, | |
| "grad_norm": 0.38587358593940735, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4205, | |
| "step": 17344 | |
| }, | |
| { | |
| "epoch": 1.3064165103189493, | |
| "grad_norm": 0.39998751878738403, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4336, | |
| "step": 17408 | |
| }, | |
| { | |
| "epoch": 1.311219512195122, | |
| "grad_norm": 0.36294978857040405, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4238, | |
| "step": 17472 | |
| }, | |
| { | |
| "epoch": 1.3160225140712947, | |
| "grad_norm": 0.3924562633037567, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4197, | |
| "step": 17536 | |
| }, | |
| { | |
| "epoch": 1.320825515947467, | |
| "grad_norm": 0.3837553560733795, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4243, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 1.3256285178236398, | |
| "grad_norm": 0.38875913619995117, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4172, | |
| "step": 17664 | |
| }, | |
| { | |
| "epoch": 1.3304315196998124, | |
| "grad_norm": 0.41738125681877136, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4225, | |
| "step": 17728 | |
| }, | |
| { | |
| "epoch": 1.3352345215759849, | |
| "grad_norm": 0.3645491898059845, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4151, | |
| "step": 17792 | |
| }, | |
| { | |
| "epoch": 1.3400375234521575, | |
| "grad_norm": 0.43829870223999023, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4099, | |
| "step": 17856 | |
| }, | |
| { | |
| "epoch": 1.3448405253283302, | |
| "grad_norm": 0.3851640820503235, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4168, | |
| "step": 17920 | |
| }, | |
| { | |
| "epoch": 1.3496435272045029, | |
| "grad_norm": 0.36147060990333557, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4085, | |
| "step": 17984 | |
| }, | |
| { | |
| "epoch": 1.3544465290806755, | |
| "grad_norm": 0.42050638794898987, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4121, | |
| "step": 18048 | |
| }, | |
| { | |
| "epoch": 1.359249530956848, | |
| "grad_norm": 0.3830699920654297, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4095, | |
| "step": 18112 | |
| }, | |
| { | |
| "epoch": 1.3640525328330206, | |
| "grad_norm": 0.3830968737602234, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4077, | |
| "step": 18176 | |
| }, | |
| { | |
| "epoch": 1.3688555347091933, | |
| "grad_norm": 0.3880060017108917, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4124, | |
| "step": 18240 | |
| }, | |
| { | |
| "epoch": 1.3736585365853657, | |
| "grad_norm": 0.45445796847343445, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4014, | |
| "step": 18304 | |
| }, | |
| { | |
| "epoch": 1.3784615384615384, | |
| "grad_norm": 0.3750540316104889, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4003, | |
| "step": 18368 | |
| }, | |
| { | |
| "epoch": 1.383264540337711, | |
| "grad_norm": 0.3783455193042755, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3983, | |
| "step": 18432 | |
| }, | |
| { | |
| "epoch": 1.3880675422138837, | |
| "grad_norm": 0.40336528420448303, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4105, | |
| "step": 18496 | |
| }, | |
| { | |
| "epoch": 1.3928705440900564, | |
| "grad_norm": 0.43220385909080505, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4018, | |
| "step": 18560 | |
| }, | |
| { | |
| "epoch": 1.3976735459662288, | |
| "grad_norm": 0.4069630205631256, | |
| "learning_rate": 0.002828, | |
| "loss": 2.4049, | |
| "step": 18624 | |
| }, | |
| { | |
| "epoch": 1.4024765478424015, | |
| "grad_norm": 0.3866819441318512, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3917, | |
| "step": 18688 | |
| }, | |
| { | |
| "epoch": 1.4072795497185742, | |
| "grad_norm": 0.3699668347835541, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3908, | |
| "step": 18752 | |
| }, | |
| { | |
| "epoch": 1.4120825515947466, | |
| "grad_norm": 0.377645879983902, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3957, | |
| "step": 18816 | |
| }, | |
| { | |
| "epoch": 1.4168855534709193, | |
| "grad_norm": 0.36612892150878906, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3973, | |
| "step": 18880 | |
| }, | |
| { | |
| "epoch": 1.421688555347092, | |
| "grad_norm": 0.385735422372818, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3952, | |
| "step": 18944 | |
| }, | |
| { | |
| "epoch": 1.4264915572232646, | |
| "grad_norm": 0.4026818871498108, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3908, | |
| "step": 19008 | |
| }, | |
| { | |
| "epoch": 1.4312945590994373, | |
| "grad_norm": 0.39212891459465027, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3923, | |
| "step": 19072 | |
| }, | |
| { | |
| "epoch": 1.4360975609756097, | |
| "grad_norm": 0.43533411622047424, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3918, | |
| "step": 19136 | |
| }, | |
| { | |
| "epoch": 1.4409005628517824, | |
| "grad_norm": 0.4136466383934021, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3885, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 1.445703564727955, | |
| "grad_norm": 0.38349345326423645, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3891, | |
| "step": 19264 | |
| }, | |
| { | |
| "epoch": 1.4505065666041275, | |
| "grad_norm": 0.42666760087013245, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3725, | |
| "step": 19328 | |
| }, | |
| { | |
| "epoch": 1.4553095684803001, | |
| "grad_norm": 0.3926577866077423, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3885, | |
| "step": 19392 | |
| }, | |
| { | |
| "epoch": 1.4601125703564728, | |
| "grad_norm": 0.3736414611339569, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3855, | |
| "step": 19456 | |
| }, | |
| { | |
| "epoch": 1.4649155722326455, | |
| "grad_norm": 0.36343908309936523, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3773, | |
| "step": 19520 | |
| }, | |
| { | |
| "epoch": 1.4697185741088181, | |
| "grad_norm": 0.380211740732193, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3809, | |
| "step": 19584 | |
| }, | |
| { | |
| "epoch": 1.4745215759849906, | |
| "grad_norm": 0.40481454133987427, | |
| "learning_rate": 0.002828, | |
| "loss": 2.375, | |
| "step": 19648 | |
| }, | |
| { | |
| "epoch": 1.4793245778611632, | |
| "grad_norm": 0.45368635654449463, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3707, | |
| "step": 19712 | |
| }, | |
| { | |
| "epoch": 1.484127579737336, | |
| "grad_norm": 0.4029395580291748, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3733, | |
| "step": 19776 | |
| }, | |
| { | |
| "epoch": 1.4889305816135083, | |
| "grad_norm": 0.3748946785926819, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3739, | |
| "step": 19840 | |
| }, | |
| { | |
| "epoch": 1.493733583489681, | |
| "grad_norm": 0.36640551686286926, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3652, | |
| "step": 19904 | |
| }, | |
| { | |
| "epoch": 1.4985365853658537, | |
| "grad_norm": 0.4150533676147461, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3709, | |
| "step": 19968 | |
| }, | |
| { | |
| "epoch": 1.5033395872420263, | |
| "grad_norm": 0.49730879068374634, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3668, | |
| "step": 20032 | |
| }, | |
| { | |
| "epoch": 1.508142589118199, | |
| "grad_norm": 0.37675461173057556, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3695, | |
| "step": 20096 | |
| }, | |
| { | |
| "epoch": 1.5129455909943714, | |
| "grad_norm": 0.3647516965866089, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3733, | |
| "step": 20160 | |
| }, | |
| { | |
| "epoch": 1.517748592870544, | |
| "grad_norm": 1.2981253862380981, | |
| "learning_rate": 0.002828, | |
| "loss": 2.369, | |
| "step": 20224 | |
| }, | |
| { | |
| "epoch": 1.5225515947467168, | |
| "grad_norm": 0.5044511556625366, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3578, | |
| "step": 20288 | |
| }, | |
| { | |
| "epoch": 1.5273545966228892, | |
| "grad_norm": 0.3651883006095886, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3601, | |
| "step": 20352 | |
| }, | |
| { | |
| "epoch": 1.532157598499062, | |
| "grad_norm": 0.4419403076171875, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3607, | |
| "step": 20416 | |
| }, | |
| { | |
| "epoch": 1.5369606003752345, | |
| "grad_norm": 0.38631224632263184, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3619, | |
| "step": 20480 | |
| }, | |
| { | |
| "epoch": 1.5417636022514072, | |
| "grad_norm": 0.34725359082221985, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3573, | |
| "step": 20544 | |
| }, | |
| { | |
| "epoch": 1.5465666041275798, | |
| "grad_norm": 0.3991786241531372, | |
| "learning_rate": 0.002828, | |
| "loss": 2.357, | |
| "step": 20608 | |
| }, | |
| { | |
| "epoch": 1.5513696060037523, | |
| "grad_norm": 0.3595084846019745, | |
| "learning_rate": 0.002828, | |
| "loss": 2.357, | |
| "step": 20672 | |
| }, | |
| { | |
| "epoch": 1.556172607879925, | |
| "grad_norm": 0.4021853804588318, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3537, | |
| "step": 20736 | |
| }, | |
| { | |
| "epoch": 1.5609756097560976, | |
| "grad_norm": 0.3939075767993927, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3594, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 1.56577861163227, | |
| "grad_norm": 0.3889540135860443, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3573, | |
| "step": 20864 | |
| }, | |
| { | |
| "epoch": 1.570581613508443, | |
| "grad_norm": 0.41366517543792725, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3442, | |
| "step": 20928 | |
| }, | |
| { | |
| "epoch": 1.5753846153846154, | |
| "grad_norm": 0.37127187848091125, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3457, | |
| "step": 20992 | |
| }, | |
| { | |
| "epoch": 1.580187617260788, | |
| "grad_norm": 0.4014946520328522, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3457, | |
| "step": 21056 | |
| }, | |
| { | |
| "epoch": 1.5849906191369607, | |
| "grad_norm": 0.35794708132743835, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3508, | |
| "step": 21120 | |
| }, | |
| { | |
| "epoch": 1.5897936210131332, | |
| "grad_norm": 0.3924767076969147, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3451, | |
| "step": 21184 | |
| }, | |
| { | |
| "epoch": 1.5945966228893058, | |
| "grad_norm": 0.34789031744003296, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3444, | |
| "step": 21248 | |
| }, | |
| { | |
| "epoch": 1.5993996247654785, | |
| "grad_norm": 0.37461933493614197, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3385, | |
| "step": 21312 | |
| }, | |
| { | |
| "epoch": 1.604202626641651, | |
| "grad_norm": 0.40146076679229736, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3406, | |
| "step": 21376 | |
| }, | |
| { | |
| "epoch": 1.6090056285178238, | |
| "grad_norm": 0.4080921411514282, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3423, | |
| "step": 21440 | |
| }, | |
| { | |
| "epoch": 1.6138086303939962, | |
| "grad_norm": 0.40802744030952454, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3377, | |
| "step": 21504 | |
| }, | |
| { | |
| "epoch": 1.618611632270169, | |
| "grad_norm": 0.420188307762146, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3399, | |
| "step": 21568 | |
| }, | |
| { | |
| "epoch": 1.6234146341463416, | |
| "grad_norm": 0.40739214420318604, | |
| "learning_rate": 0.002828, | |
| "loss": 2.34, | |
| "step": 21632 | |
| }, | |
| { | |
| "epoch": 1.628217636022514, | |
| "grad_norm": 0.41674676537513733, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3326, | |
| "step": 21696 | |
| }, | |
| { | |
| "epoch": 1.6330206378986867, | |
| "grad_norm": 0.41856762766838074, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3366, | |
| "step": 21760 | |
| }, | |
| { | |
| "epoch": 1.6378236397748593, | |
| "grad_norm": 0.39763346314430237, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3339, | |
| "step": 21824 | |
| }, | |
| { | |
| "epoch": 1.6426266416510318, | |
| "grad_norm": 0.3777034282684326, | |
| "learning_rate": 0.002828, | |
| "loss": 2.343, | |
| "step": 21888 | |
| }, | |
| { | |
| "epoch": 1.6474296435272047, | |
| "grad_norm": 0.3617188036441803, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3341, | |
| "step": 21952 | |
| }, | |
| { | |
| "epoch": 1.652232645403377, | |
| "grad_norm": 0.4504718482494354, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3295, | |
| "step": 22016 | |
| }, | |
| { | |
| "epoch": 1.6570356472795498, | |
| "grad_norm": 0.37388357520103455, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3408, | |
| "step": 22080 | |
| }, | |
| { | |
| "epoch": 1.6618386491557224, | |
| "grad_norm": 0.3807313144207001, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3249, | |
| "step": 22144 | |
| }, | |
| { | |
| "epoch": 1.6666416510318949, | |
| "grad_norm": 0.4428509771823883, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3347, | |
| "step": 22208 | |
| }, | |
| { | |
| "epoch": 1.6714446529080675, | |
| "grad_norm": 0.39028382301330566, | |
| "learning_rate": 0.002828, | |
| "loss": 2.336, | |
| "step": 22272 | |
| }, | |
| { | |
| "epoch": 1.6762476547842402, | |
| "grad_norm": 0.482424259185791, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3212, | |
| "step": 22336 | |
| }, | |
| { | |
| "epoch": 1.6810506566604126, | |
| "grad_norm": 0.39801299571990967, | |
| "learning_rate": 0.002828, | |
| "loss": 2.314, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 1.6858536585365855, | |
| "grad_norm": 0.4351527988910675, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3223, | |
| "step": 22464 | |
| }, | |
| { | |
| "epoch": 1.690656660412758, | |
| "grad_norm": 0.4509490430355072, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3246, | |
| "step": 22528 | |
| }, | |
| { | |
| "epoch": 1.6954596622889306, | |
| "grad_norm": 0.35885152220726013, | |
| "learning_rate": 0.002828, | |
| "loss": 2.319, | |
| "step": 22592 | |
| }, | |
| { | |
| "epoch": 1.7002626641651033, | |
| "grad_norm": 0.4146900177001953, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3214, | |
| "step": 22656 | |
| }, | |
| { | |
| "epoch": 1.7050656660412757, | |
| "grad_norm": 0.40194573998451233, | |
| "learning_rate": 0.002828, | |
| "loss": 2.322, | |
| "step": 22720 | |
| }, | |
| { | |
| "epoch": 1.7098686679174484, | |
| "grad_norm": 0.43570390343666077, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3241, | |
| "step": 22784 | |
| }, | |
| { | |
| "epoch": 1.714671669793621, | |
| "grad_norm": 0.35558512806892395, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3193, | |
| "step": 22848 | |
| }, | |
| { | |
| "epoch": 1.7194746716697935, | |
| "grad_norm": 0.3700902760028839, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3219, | |
| "step": 22912 | |
| }, | |
| { | |
| "epoch": 1.7242776735459664, | |
| "grad_norm": 0.4287453591823578, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3078, | |
| "step": 22976 | |
| }, | |
| { | |
| "epoch": 1.7290806754221388, | |
| "grad_norm": 0.41843536496162415, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3103, | |
| "step": 23040 | |
| }, | |
| { | |
| "epoch": 1.7338836772983115, | |
| "grad_norm": 0.3938317596912384, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3176, | |
| "step": 23104 | |
| }, | |
| { | |
| "epoch": 1.7386866791744842, | |
| "grad_norm": 0.44625958800315857, | |
| "learning_rate": 0.002828, | |
| "loss": 2.307, | |
| "step": 23168 | |
| }, | |
| { | |
| "epoch": 1.7434896810506566, | |
| "grad_norm": 0.4598078727722168, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2958, | |
| "step": 23232 | |
| }, | |
| { | |
| "epoch": 1.7482926829268293, | |
| "grad_norm": 0.4126788377761841, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3094, | |
| "step": 23296 | |
| }, | |
| { | |
| "epoch": 1.753095684803002, | |
| "grad_norm": 0.3801914155483246, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3048, | |
| "step": 23360 | |
| }, | |
| { | |
| "epoch": 1.7578986866791744, | |
| "grad_norm": 0.4619985818862915, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3069, | |
| "step": 23424 | |
| }, | |
| { | |
| "epoch": 1.7627016885553473, | |
| "grad_norm": 0.4068593680858612, | |
| "learning_rate": 0.002828, | |
| "loss": 2.299, | |
| "step": 23488 | |
| }, | |
| { | |
| "epoch": 1.7675046904315197, | |
| "grad_norm": 0.36146870255470276, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3042, | |
| "step": 23552 | |
| }, | |
| { | |
| "epoch": 1.7723076923076924, | |
| "grad_norm": 0.3995908200740814, | |
| "learning_rate": 0.002828, | |
| "loss": 2.3006, | |
| "step": 23616 | |
| }, | |
| { | |
| "epoch": 1.777110694183865, | |
| "grad_norm": 0.3970596492290497, | |
| "learning_rate": 0.002828, | |
| "loss": 2.313, | |
| "step": 23680 | |
| }, | |
| { | |
| "epoch": 1.7819136960600375, | |
| "grad_norm": 0.4287073612213135, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2974, | |
| "step": 23744 | |
| }, | |
| { | |
| "epoch": 1.7867166979362101, | |
| "grad_norm": 0.41250482201576233, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2937, | |
| "step": 23808 | |
| }, | |
| { | |
| "epoch": 1.7915196998123828, | |
| "grad_norm": 0.411668062210083, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2994, | |
| "step": 23872 | |
| }, | |
| { | |
| "epoch": 1.7963227016885552, | |
| "grad_norm": 0.4834740459918976, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2895, | |
| "step": 23936 | |
| }, | |
| { | |
| "epoch": 1.8011257035647281, | |
| "grad_norm": 0.3624022603034973, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2888, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.8059287054409006, | |
| "grad_norm": 0.36700454354286194, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2917, | |
| "step": 24064 | |
| }, | |
| { | |
| "epoch": 1.8107317073170732, | |
| "grad_norm": 0.3666454255580902, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2896, | |
| "step": 24128 | |
| }, | |
| { | |
| "epoch": 1.8155347091932459, | |
| "grad_norm": 0.4110506474971771, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2947, | |
| "step": 24192 | |
| }, | |
| { | |
| "epoch": 1.8203377110694183, | |
| "grad_norm": 0.3604464530944824, | |
| "learning_rate": 0.002828, | |
| "loss": 2.289, | |
| "step": 24256 | |
| }, | |
| { | |
| "epoch": 1.825140712945591, | |
| "grad_norm": 0.40807706117630005, | |
| "learning_rate": 0.002828, | |
| "loss": 2.288, | |
| "step": 24320 | |
| }, | |
| { | |
| "epoch": 1.8299437148217637, | |
| "grad_norm": 0.3632533848285675, | |
| "learning_rate": 0.002828, | |
| "loss": 2.29, | |
| "step": 24384 | |
| }, | |
| { | |
| "epoch": 1.834746716697936, | |
| "grad_norm": 0.38520562648773193, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2805, | |
| "step": 24448 | |
| }, | |
| { | |
| "epoch": 1.839549718574109, | |
| "grad_norm": 0.4228810966014862, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2842, | |
| "step": 24512 | |
| }, | |
| { | |
| "epoch": 1.8443527204502814, | |
| "grad_norm": 0.4542325735092163, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2781, | |
| "step": 24576 | |
| }, | |
| { | |
| "epoch": 1.849155722326454, | |
| "grad_norm": 0.37316882610321045, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2829, | |
| "step": 24640 | |
| }, | |
| { | |
| "epoch": 1.8539587242026268, | |
| "grad_norm": 0.5505624413490295, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2942, | |
| "step": 24704 | |
| }, | |
| { | |
| "epoch": 1.8587617260787992, | |
| "grad_norm": 0.4269484281539917, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2793, | |
| "step": 24768 | |
| }, | |
| { | |
| "epoch": 1.8635647279549719, | |
| "grad_norm": 0.407760888338089, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2803, | |
| "step": 24832 | |
| }, | |
| { | |
| "epoch": 1.8683677298311445, | |
| "grad_norm": 0.4192192554473877, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2818, | |
| "step": 24896 | |
| }, | |
| { | |
| "epoch": 1.873170731707317, | |
| "grad_norm": 0.3924838602542877, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2757, | |
| "step": 24960 | |
| }, | |
| { | |
| "epoch": 1.8779737335834898, | |
| "grad_norm": 0.3799656629562378, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2695, | |
| "step": 25024 | |
| }, | |
| { | |
| "epoch": 1.8827767354596623, | |
| "grad_norm": 0.40570494532585144, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2801, | |
| "step": 25088 | |
| }, | |
| { | |
| "epoch": 1.887579737335835, | |
| "grad_norm": 0.3898228704929352, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2709, | |
| "step": 25152 | |
| }, | |
| { | |
| "epoch": 1.8923827392120076, | |
| "grad_norm": 0.393216073513031, | |
| "learning_rate": 0.002828, | |
| "loss": 2.282, | |
| "step": 25216 | |
| }, | |
| { | |
| "epoch": 1.89718574108818, | |
| "grad_norm": 0.4247749149799347, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2776, | |
| "step": 25280 | |
| }, | |
| { | |
| "epoch": 1.9019887429643527, | |
| "grad_norm": 0.4670035243034363, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2754, | |
| "step": 25344 | |
| }, | |
| { | |
| "epoch": 1.9067917448405254, | |
| "grad_norm": 0.40336644649505615, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2768, | |
| "step": 25408 | |
| }, | |
| { | |
| "epoch": 1.9115947467166978, | |
| "grad_norm": 0.48462921380996704, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2634, | |
| "step": 25472 | |
| }, | |
| { | |
| "epoch": 1.9163977485928707, | |
| "grad_norm": 0.44047805666923523, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2674, | |
| "step": 25536 | |
| }, | |
| { | |
| "epoch": 1.9212007504690432, | |
| "grad_norm": 0.4221409261226654, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2724, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 1.9260037523452158, | |
| "grad_norm": 0.4272362291812897, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2683, | |
| "step": 25664 | |
| }, | |
| { | |
| "epoch": 1.9308067542213885, | |
| "grad_norm": 0.4309645891189575, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2612, | |
| "step": 25728 | |
| }, | |
| { | |
| "epoch": 1.935609756097561, | |
| "grad_norm": 0.4220867156982422, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2665, | |
| "step": 25792 | |
| }, | |
| { | |
| "epoch": 1.9404127579737336, | |
| "grad_norm": 0.3765920102596283, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2652, | |
| "step": 25856 | |
| }, | |
| { | |
| "epoch": 1.9452157598499062, | |
| "grad_norm": 0.44643986225128174, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2627, | |
| "step": 25920 | |
| }, | |
| { | |
| "epoch": 1.9500187617260787, | |
| "grad_norm": 0.4022061824798584, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2665, | |
| "step": 25984 | |
| }, | |
| { | |
| "epoch": 1.9548217636022516, | |
| "grad_norm": 0.3935778737068176, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2585, | |
| "step": 26048 | |
| }, | |
| { | |
| "epoch": 1.959624765478424, | |
| "grad_norm": 0.3877500295639038, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2629, | |
| "step": 26112 | |
| }, | |
| { | |
| "epoch": 1.9644277673545967, | |
| "grad_norm": 0.3891729712486267, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2594, | |
| "step": 26176 | |
| }, | |
| { | |
| "epoch": 1.9692307692307693, | |
| "grad_norm": 0.3616099953651428, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2601, | |
| "step": 26240 | |
| }, | |
| { | |
| "epoch": 1.9740337711069418, | |
| "grad_norm": 0.3855280578136444, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2677, | |
| "step": 26304 | |
| }, | |
| { | |
| "epoch": 1.9788367729831144, | |
| "grad_norm": 0.44039493799209595, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2641, | |
| "step": 26368 | |
| }, | |
| { | |
| "epoch": 1.983639774859287, | |
| "grad_norm": 0.37217262387275696, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2603, | |
| "step": 26432 | |
| }, | |
| { | |
| "epoch": 1.9884427767354595, | |
| "grad_norm": 0.3942553997039795, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2508, | |
| "step": 26496 | |
| }, | |
| { | |
| "epoch": 1.9932457786116324, | |
| "grad_norm": 0.3975297808647156, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2466, | |
| "step": 26560 | |
| }, | |
| { | |
| "epoch": 1.9980487804878049, | |
| "grad_norm": 0.39197394251823425, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2515, | |
| "step": 26624 | |
| }, | |
| { | |
| "epoch": 2.0028517823639773, | |
| "grad_norm": 0.38722801208496094, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2354, | |
| "step": 26688 | |
| }, | |
| { | |
| "epoch": 2.00765478424015, | |
| "grad_norm": 0.38619640469551086, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2152, | |
| "step": 26752 | |
| }, | |
| { | |
| "epoch": 2.0124577861163226, | |
| "grad_norm": 0.49529945850372314, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2167, | |
| "step": 26816 | |
| }, | |
| { | |
| "epoch": 2.0172607879924955, | |
| "grad_norm": 0.4199656844139099, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2092, | |
| "step": 26880 | |
| }, | |
| { | |
| "epoch": 2.022063789868668, | |
| "grad_norm": 0.45820868015289307, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2249, | |
| "step": 26944 | |
| }, | |
| { | |
| "epoch": 2.0268667917448404, | |
| "grad_norm": 0.4006725251674652, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2165, | |
| "step": 27008 | |
| }, | |
| { | |
| "epoch": 2.0316697936210133, | |
| "grad_norm": 0.4596467614173889, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2154, | |
| "step": 27072 | |
| }, | |
| { | |
| "epoch": 2.0364727954971857, | |
| "grad_norm": 0.38660213351249695, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2062, | |
| "step": 27136 | |
| }, | |
| { | |
| "epoch": 2.041275797373358, | |
| "grad_norm": 0.44082361459732056, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2124, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 2.046078799249531, | |
| "grad_norm": 0.3886605203151703, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2182, | |
| "step": 27264 | |
| }, | |
| { | |
| "epoch": 2.0508818011257035, | |
| "grad_norm": 0.41386017203330994, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2168, | |
| "step": 27328 | |
| }, | |
| { | |
| "epoch": 2.055684803001876, | |
| "grad_norm": 0.411478191614151, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2092, | |
| "step": 27392 | |
| }, | |
| { | |
| "epoch": 2.060487804878049, | |
| "grad_norm": 0.47288912534713745, | |
| "learning_rate": 0.002828, | |
| "loss": 2.21, | |
| "step": 27456 | |
| }, | |
| { | |
| "epoch": 2.0652908067542213, | |
| "grad_norm": 0.36384883522987366, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2095, | |
| "step": 27520 | |
| }, | |
| { | |
| "epoch": 2.070093808630394, | |
| "grad_norm": 0.40636852383613586, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2092, | |
| "step": 27584 | |
| }, | |
| { | |
| "epoch": 2.0748968105065666, | |
| "grad_norm": 0.4425170421600342, | |
| "learning_rate": 0.002828, | |
| "loss": 2.212, | |
| "step": 27648 | |
| }, | |
| { | |
| "epoch": 2.079699812382739, | |
| "grad_norm": 0.48468896746635437, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2078, | |
| "step": 27712 | |
| }, | |
| { | |
| "epoch": 2.084502814258912, | |
| "grad_norm": 0.40420570969581604, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2158, | |
| "step": 27776 | |
| }, | |
| { | |
| "epoch": 2.0893058161350844, | |
| "grad_norm": 0.44314709305763245, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2121, | |
| "step": 27840 | |
| }, | |
| { | |
| "epoch": 2.0941088180112573, | |
| "grad_norm": 0.5187743306159973, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2138, | |
| "step": 27904 | |
| }, | |
| { | |
| "epoch": 2.0989118198874297, | |
| "grad_norm": 0.4796048104763031, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2027, | |
| "step": 27968 | |
| }, | |
| { | |
| "epoch": 2.103714821763602, | |
| "grad_norm": 0.43605130910873413, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2029, | |
| "step": 28032 | |
| }, | |
| { | |
| "epoch": 2.108517823639775, | |
| "grad_norm": 0.4523628056049347, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2038, | |
| "step": 28096 | |
| }, | |
| { | |
| "epoch": 2.1133208255159475, | |
| "grad_norm": 0.4183247983455658, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2026, | |
| "step": 28160 | |
| }, | |
| { | |
| "epoch": 2.11812382739212, | |
| "grad_norm": 0.5113268494606018, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1998, | |
| "step": 28224 | |
| }, | |
| { | |
| "epoch": 2.122926829268293, | |
| "grad_norm": 0.40837016701698303, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2054, | |
| "step": 28288 | |
| }, | |
| { | |
| "epoch": 2.1277298311444652, | |
| "grad_norm": 0.40093889832496643, | |
| "learning_rate": 0.002828, | |
| "loss": 2.208, | |
| "step": 28352 | |
| }, | |
| { | |
| "epoch": 2.1325328330206377, | |
| "grad_norm": 0.3988894820213318, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2028, | |
| "step": 28416 | |
| }, | |
| { | |
| "epoch": 2.1373358348968106, | |
| "grad_norm": 0.42024731636047363, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1952, | |
| "step": 28480 | |
| }, | |
| { | |
| "epoch": 2.142138836772983, | |
| "grad_norm": 0.38691264390945435, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2035, | |
| "step": 28544 | |
| }, | |
| { | |
| "epoch": 2.146941838649156, | |
| "grad_norm": 0.41956332325935364, | |
| "learning_rate": 0.002828, | |
| "loss": 2.196, | |
| "step": 28608 | |
| }, | |
| { | |
| "epoch": 2.1517448405253283, | |
| "grad_norm": 0.4035188555717468, | |
| "learning_rate": 0.002828, | |
| "loss": 2.2038, | |
| "step": 28672 | |
| }, | |
| { | |
| "epoch": 2.1565478424015008, | |
| "grad_norm": 0.35282230377197266, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1835, | |
| "step": 28736 | |
| }, | |
| { | |
| "epoch": 2.1613508442776737, | |
| "grad_norm": 0.43618568778038025, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1946, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 2.166153846153846, | |
| "grad_norm": 0.4310976564884186, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1873, | |
| "step": 28864 | |
| }, | |
| { | |
| "epoch": 2.170956848030019, | |
| "grad_norm": 0.4475420415401459, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1946, | |
| "step": 28928 | |
| }, | |
| { | |
| "epoch": 2.1757598499061914, | |
| "grad_norm": 0.4384845197200775, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1935, | |
| "step": 28992 | |
| }, | |
| { | |
| "epoch": 2.180562851782364, | |
| "grad_norm": 0.40141811966896057, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1925, | |
| "step": 29056 | |
| }, | |
| { | |
| "epoch": 2.1853658536585368, | |
| "grad_norm": 0.3754780888557434, | |
| "learning_rate": 0.002828, | |
| "loss": 2.19, | |
| "step": 29120 | |
| }, | |
| { | |
| "epoch": 2.190168855534709, | |
| "grad_norm": 0.40471306443214417, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1915, | |
| "step": 29184 | |
| }, | |
| { | |
| "epoch": 2.1949718574108816, | |
| "grad_norm": 1.464024543762207, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1909, | |
| "step": 29248 | |
| }, | |
| { | |
| "epoch": 2.1997748592870545, | |
| "grad_norm": 0.3818819522857666, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1915, | |
| "step": 29312 | |
| }, | |
| { | |
| "epoch": 2.204577861163227, | |
| "grad_norm": 0.3688436448574066, | |
| "learning_rate": 0.002828, | |
| "loss": 2.184, | |
| "step": 29376 | |
| }, | |
| { | |
| "epoch": 2.2093808630393994, | |
| "grad_norm": 0.4367921054363251, | |
| "learning_rate": 0.002828, | |
| "loss": 2.185, | |
| "step": 29440 | |
| }, | |
| { | |
| "epoch": 2.2141838649155723, | |
| "grad_norm": 0.3566763401031494, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1871, | |
| "step": 29504 | |
| }, | |
| { | |
| "epoch": 2.2189868667917447, | |
| "grad_norm": 0.4481133222579956, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1826, | |
| "step": 29568 | |
| }, | |
| { | |
| "epoch": 2.2237898686679176, | |
| "grad_norm": 0.44622039794921875, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1885, | |
| "step": 29632 | |
| }, | |
| { | |
| "epoch": 2.22859287054409, | |
| "grad_norm": 0.4857657253742218, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1784, | |
| "step": 29696 | |
| }, | |
| { | |
| "epoch": 2.2333958724202625, | |
| "grad_norm": 0.41923773288726807, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1807, | |
| "step": 29760 | |
| }, | |
| { | |
| "epoch": 2.2381988742964354, | |
| "grad_norm": 0.4176802933216095, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1798, | |
| "step": 29824 | |
| }, | |
| { | |
| "epoch": 2.243001876172608, | |
| "grad_norm": 0.4086935520172119, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1739, | |
| "step": 29888 | |
| }, | |
| { | |
| "epoch": 2.2478048780487807, | |
| "grad_norm": 0.40138566493988037, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1857, | |
| "step": 29952 | |
| }, | |
| { | |
| "epoch": 2.252607879924953, | |
| "grad_norm": 0.393996000289917, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1818, | |
| "step": 30016 | |
| }, | |
| { | |
| "epoch": 2.2574108818011256, | |
| "grad_norm": 0.3962005078792572, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1724, | |
| "step": 30080 | |
| }, | |
| { | |
| "epoch": 2.2622138836772985, | |
| "grad_norm": 0.41648438572883606, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1835, | |
| "step": 30144 | |
| }, | |
| { | |
| "epoch": 2.267016885553471, | |
| "grad_norm": 0.3810112774372101, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1736, | |
| "step": 30208 | |
| }, | |
| { | |
| "epoch": 2.2718198874296434, | |
| "grad_norm": 0.4520975649356842, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1793, | |
| "step": 30272 | |
| }, | |
| { | |
| "epoch": 2.2766228893058162, | |
| "grad_norm": 0.4406943917274475, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1732, | |
| "step": 30336 | |
| }, | |
| { | |
| "epoch": 2.2814258911819887, | |
| "grad_norm": 0.4186633825302124, | |
| "learning_rate": 0.002828, | |
| "loss": 2.1774, | |
| "step": 30400 | |
| } | |
| ], | |
| "logging_steps": 64, | |
| "max_steps": 333125, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 25, | |
| "save_steps": 320, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.993193875177472e+17, | |
| "train_batch_size": 200, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |