{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 248, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004032258064516129, "grad_norm": 23.183565139770508, "learning_rate": 0.0, "loss": 1.0275, "step": 1 }, { "epoch": 0.008064516129032258, "grad_norm": 23.37139320373535, "learning_rate": 6.666666666666668e-06, "loss": 1.0547, "step": 2 }, { "epoch": 0.012096774193548387, "grad_norm": 22.907167434692383, "learning_rate": 1.0566416671474378e-05, "loss": 1.0348, "step": 3 }, { "epoch": 0.016129032258064516, "grad_norm": 15.111400604248047, "learning_rate": 1.3333333333333337e-05, "loss": 0.6538, "step": 4 }, { "epoch": 0.020161290322580645, "grad_norm": 6.891965389251709, "learning_rate": 1.5479520632582417e-05, "loss": 0.253, "step": 5 }, { "epoch": 0.024193548387096774, "grad_norm": 5.94376802444458, "learning_rate": 1.7233083338141044e-05, "loss": 0.3932, "step": 6 }, { "epoch": 0.028225806451612902, "grad_norm": 2.6044232845306396, "learning_rate": 1.8715699480384028e-05, "loss": 0.2468, "step": 7 }, { "epoch": 0.03225806451612903, "grad_norm": 5.8222126960754395, "learning_rate": 2e-05, "loss": 0.3847, "step": 8 }, { "epoch": 0.036290322580645164, "grad_norm": 2.5366413593292236, "learning_rate": 2e-05, "loss": 0.306, "step": 9 }, { "epoch": 0.04032258064516129, "grad_norm": 1.6535553932189941, "learning_rate": 2e-05, "loss": 0.2243, "step": 10 }, { "epoch": 0.04435483870967742, "grad_norm": 1.9359959363937378, "learning_rate": 2e-05, "loss": 0.2214, "step": 11 }, { "epoch": 0.04838709677419355, "grad_norm": 2.255415201187134, "learning_rate": 2e-05, "loss": 0.2351, "step": 12 }, { "epoch": 0.05241935483870968, "grad_norm": 1.2205442190170288, "learning_rate": 2e-05, "loss": 0.1992, "step": 13 }, { "epoch": 0.056451612903225805, "grad_norm": 2.05025053024292, "learning_rate": 2e-05, "loss": 0.2232, "step": 14 }, { "epoch": 0.06048387096774194, "grad_norm": 1.5063331127166748, "learning_rate": 2e-05, "loss": 0.2107, "step": 15 }, { "epoch": 0.06451612903225806, "grad_norm": 1.1306530237197876, "learning_rate": 2e-05, "loss": 0.2015, "step": 16 }, { "epoch": 0.06854838709677419, "grad_norm": 0.8307346701622009, "learning_rate": 2e-05, "loss": 0.191, "step": 17 }, { "epoch": 0.07258064516129033, "grad_norm": 0.6416309475898743, "learning_rate": 2e-05, "loss": 0.184, "step": 18 }, { "epoch": 0.07661290322580645, "grad_norm": 1.3595116138458252, "learning_rate": 2e-05, "loss": 0.2, "step": 19 }, { "epoch": 0.08064516129032258, "grad_norm": 0.6382694840431213, "learning_rate": 2e-05, "loss": 0.1793, "step": 20 }, { "epoch": 0.0846774193548387, "grad_norm": 1.1086512804031372, "learning_rate": 2e-05, "loss": 0.1925, "step": 21 }, { "epoch": 0.08870967741935484, "grad_norm": 0.6290364861488342, "learning_rate": 2e-05, "loss": 0.1828, "step": 22 }, { "epoch": 0.09274193548387097, "grad_norm": 1.621184229850769, "learning_rate": 2e-05, "loss": 0.1849, "step": 23 }, { "epoch": 0.0967741935483871, "grad_norm": 1.1934734582901, "learning_rate": 2e-05, "loss": 0.1978, "step": 24 }, { "epoch": 0.10080645161290322, "grad_norm": 0.5095123648643494, "learning_rate": 2e-05, "loss": 0.1823, "step": 25 }, { "epoch": 0.10483870967741936, "grad_norm": 0.9218118786811829, "learning_rate": 2e-05, "loss": 0.191, "step": 26 }, { "epoch": 0.10887096774193548, "grad_norm": 0.7978827357292175, "learning_rate": 2e-05, "loss": 0.1766, "step": 27 }, { "epoch": 0.11290322580645161, "grad_norm": 0.8179630637168884, "learning_rate": 2e-05, "loss": 0.1868, "step": 28 }, { "epoch": 0.11693548387096774, "grad_norm": 0.5699785351753235, "learning_rate": 2e-05, "loss": 0.1869, "step": 29 }, { "epoch": 0.12096774193548387, "grad_norm": 0.7835913300514221, "learning_rate": 2e-05, "loss": 0.1973, "step": 30 }, { "epoch": 0.125, "grad_norm": 0.5692117810249329, "learning_rate": 2e-05, "loss": 0.1759, "step": 31 }, { "epoch": 0.12903225806451613, "grad_norm": 0.803184449672699, "learning_rate": 2e-05, "loss": 0.1907, "step": 32 }, { "epoch": 0.13306451612903225, "grad_norm": 0.5545604228973389, "learning_rate": 2e-05, "loss": 0.1676, "step": 33 }, { "epoch": 0.13709677419354838, "grad_norm": 0.3685874342918396, "learning_rate": 2e-05, "loss": 0.1766, "step": 34 }, { "epoch": 0.14112903225806453, "grad_norm": 0.6002175807952881, "learning_rate": 2e-05, "loss": 0.1836, "step": 35 }, { "epoch": 0.14516129032258066, "grad_norm": 0.5726589560508728, "learning_rate": 2e-05, "loss": 0.1757, "step": 36 }, { "epoch": 0.14919354838709678, "grad_norm": 0.5990753769874573, "learning_rate": 2e-05, "loss": 0.1814, "step": 37 }, { "epoch": 0.1532258064516129, "grad_norm": 0.5180577039718628, "learning_rate": 2e-05, "loss": 0.1725, "step": 38 }, { "epoch": 0.15725806451612903, "grad_norm": 0.6645565629005432, "learning_rate": 2e-05, "loss": 0.1856, "step": 39 }, { "epoch": 0.16129032258064516, "grad_norm": 0.7106342911720276, "learning_rate": 2e-05, "loss": 0.1679, "step": 40 }, { "epoch": 0.16532258064516128, "grad_norm": 0.8131007552146912, "learning_rate": 2e-05, "loss": 0.1806, "step": 41 }, { "epoch": 0.1693548387096774, "grad_norm": 0.8143223524093628, "learning_rate": 2e-05, "loss": 0.1689, "step": 42 }, { "epoch": 0.17338709677419356, "grad_norm": 0.6998667120933533, "learning_rate": 2e-05, "loss": 0.1681, "step": 43 }, { "epoch": 0.1774193548387097, "grad_norm": 0.9026826620101929, "learning_rate": 2e-05, "loss": 0.1597, "step": 44 }, { "epoch": 0.1814516129032258, "grad_norm": 1.212770700454712, "learning_rate": 2e-05, "loss": 0.1707, "step": 45 }, { "epoch": 0.18548387096774194, "grad_norm": 0.8376269936561584, "learning_rate": 2e-05, "loss": 0.1552, "step": 46 }, { "epoch": 0.18951612903225806, "grad_norm": 0.9025837182998657, "learning_rate": 2e-05, "loss": 0.1633, "step": 47 }, { "epoch": 0.1935483870967742, "grad_norm": 0.9542744159698486, "learning_rate": 2e-05, "loss": 0.1749, "step": 48 }, { "epoch": 0.1975806451612903, "grad_norm": 0.7638697624206543, "learning_rate": 2e-05, "loss": 0.158, "step": 49 }, { "epoch": 0.20161290322580644, "grad_norm": 0.5949487090110779, "learning_rate": 2e-05, "loss": 0.1487, "step": 50 }, { "epoch": 0.2056451612903226, "grad_norm": 1.0373241901397705, "learning_rate": 2e-05, "loss": 0.1519, "step": 51 }, { "epoch": 0.20967741935483872, "grad_norm": 0.5316594243049622, "learning_rate": 2e-05, "loss": 0.1364, "step": 52 }, { "epoch": 0.21370967741935484, "grad_norm": 0.624768078327179, "learning_rate": 2e-05, "loss": 0.1449, "step": 53 }, { "epoch": 0.21774193548387097, "grad_norm": 0.7126561403274536, "learning_rate": 2e-05, "loss": 0.1426, "step": 54 }, { "epoch": 0.2217741935483871, "grad_norm": 1.0926051139831543, "learning_rate": 2e-05, "loss": 0.142, "step": 55 }, { "epoch": 0.22580645161290322, "grad_norm": 1.873496651649475, "learning_rate": 2e-05, "loss": 0.1578, "step": 56 }, { "epoch": 0.22983870967741934, "grad_norm": 0.8202502727508545, "learning_rate": 2e-05, "loss": 0.1462, "step": 57 }, { "epoch": 0.23387096774193547, "grad_norm": 0.6349180936813354, "learning_rate": 2e-05, "loss": 0.1326, "step": 58 }, { "epoch": 0.23790322580645162, "grad_norm": 1.0204631090164185, "learning_rate": 2e-05, "loss": 0.1385, "step": 59 }, { "epoch": 0.24193548387096775, "grad_norm": 0.8092764616012573, "learning_rate": 2e-05, "loss": 0.1341, "step": 60 }, { "epoch": 0.24596774193548387, "grad_norm": 1.0302892923355103, "learning_rate": 2e-05, "loss": 0.144, "step": 61 }, { "epoch": 0.25, "grad_norm": 1.2825901508331299, "learning_rate": 2e-05, "loss": 0.1391, "step": 62 }, { "epoch": 0.2540322580645161, "grad_norm": 0.873502790927887, "learning_rate": 2e-05, "loss": 0.1331, "step": 63 }, { "epoch": 0.25806451612903225, "grad_norm": 0.8886832594871521, "learning_rate": 2e-05, "loss": 0.1248, "step": 64 }, { "epoch": 0.2620967741935484, "grad_norm": 0.7013624906539917, "learning_rate": 2e-05, "loss": 0.1164, "step": 65 }, { "epoch": 0.2661290322580645, "grad_norm": 0.7485561966896057, "learning_rate": 2e-05, "loss": 0.118, "step": 66 }, { "epoch": 0.2701612903225806, "grad_norm": 0.7874916791915894, "learning_rate": 2e-05, "loss": 0.1116, "step": 67 }, { "epoch": 0.27419354838709675, "grad_norm": 0.8042868375778198, "learning_rate": 2e-05, "loss": 0.1215, "step": 68 }, { "epoch": 0.2782258064516129, "grad_norm": 0.604430615901947, "learning_rate": 2e-05, "loss": 0.1109, "step": 69 }, { "epoch": 0.28225806451612906, "grad_norm": 0.976264476776123, "learning_rate": 2e-05, "loss": 0.124, "step": 70 }, { "epoch": 0.2862903225806452, "grad_norm": 1.0005311965942383, "learning_rate": 2e-05, "loss": 0.1112, "step": 71 }, { "epoch": 0.2903225806451613, "grad_norm": 0.6228518486022949, "learning_rate": 2e-05, "loss": 0.1049, "step": 72 }, { "epoch": 0.29435483870967744, "grad_norm": 0.7674490809440613, "learning_rate": 2e-05, "loss": 0.1091, "step": 73 }, { "epoch": 0.29838709677419356, "grad_norm": 1.027273416519165, "learning_rate": 2e-05, "loss": 0.1147, "step": 74 }, { "epoch": 0.3024193548387097, "grad_norm": 0.6840062737464905, "learning_rate": 2e-05, "loss": 0.0962, "step": 75 }, { "epoch": 0.3064516129032258, "grad_norm": 0.5666499137878418, "learning_rate": 2e-05, "loss": 0.0956, "step": 76 }, { "epoch": 0.31048387096774194, "grad_norm": 0.594052791595459, "learning_rate": 2e-05, "loss": 0.0968, "step": 77 }, { "epoch": 0.31451612903225806, "grad_norm": 0.7595533132553101, "learning_rate": 2e-05, "loss": 0.0927, "step": 78 }, { "epoch": 0.3185483870967742, "grad_norm": 0.7487107515335083, "learning_rate": 2e-05, "loss": 0.1027, "step": 79 }, { "epoch": 0.3225806451612903, "grad_norm": 0.5936404466629028, "learning_rate": 2e-05, "loss": 0.0884, "step": 80 }, { "epoch": 0.32661290322580644, "grad_norm": 0.4667339622974396, "learning_rate": 2e-05, "loss": 0.0946, "step": 81 }, { "epoch": 0.33064516129032256, "grad_norm": 0.8685793280601501, "learning_rate": 2e-05, "loss": 0.1011, "step": 82 }, { "epoch": 0.3346774193548387, "grad_norm": 0.7018740177154541, "learning_rate": 2e-05, "loss": 0.0872, "step": 83 }, { "epoch": 0.3387096774193548, "grad_norm": 1.0336928367614746, "learning_rate": 2e-05, "loss": 0.1042, "step": 84 }, { "epoch": 0.34274193548387094, "grad_norm": 0.6793813705444336, "learning_rate": 2e-05, "loss": 0.0924, "step": 85 }, { "epoch": 0.3467741935483871, "grad_norm": 1.0464022159576416, "learning_rate": 2e-05, "loss": 0.0955, "step": 86 }, { "epoch": 0.35080645161290325, "grad_norm": 0.7015179991722107, "learning_rate": 2e-05, "loss": 0.0928, "step": 87 }, { "epoch": 0.3548387096774194, "grad_norm": 0.7370674014091492, "learning_rate": 2e-05, "loss": 0.0895, "step": 88 }, { "epoch": 0.3588709677419355, "grad_norm": 0.5556283593177795, "learning_rate": 2e-05, "loss": 0.0895, "step": 89 }, { "epoch": 0.3629032258064516, "grad_norm": 0.6646509766578674, "learning_rate": 2e-05, "loss": 0.0871, "step": 90 }, { "epoch": 0.36693548387096775, "grad_norm": 0.7947157025337219, "learning_rate": 2e-05, "loss": 0.0939, "step": 91 }, { "epoch": 0.3709677419354839, "grad_norm": 0.6044544577598572, "learning_rate": 2e-05, "loss": 0.0857, "step": 92 }, { "epoch": 0.375, "grad_norm": 0.7086596488952637, "learning_rate": 2e-05, "loss": 0.0919, "step": 93 }, { "epoch": 0.3790322580645161, "grad_norm": 0.6559664011001587, "learning_rate": 2e-05, "loss": 0.0856, "step": 94 }, { "epoch": 0.38306451612903225, "grad_norm": 0.784209132194519, "learning_rate": 2e-05, "loss": 0.0836, "step": 95 }, { "epoch": 0.3870967741935484, "grad_norm": 0.5902017951011658, "learning_rate": 2e-05, "loss": 0.0747, "step": 96 }, { "epoch": 0.3911290322580645, "grad_norm": 0.697828471660614, "learning_rate": 2e-05, "loss": 0.0776, "step": 97 }, { "epoch": 0.3951612903225806, "grad_norm": 0.5101798176765442, "learning_rate": 2e-05, "loss": 0.0777, "step": 98 }, { "epoch": 0.39919354838709675, "grad_norm": 0.8497079610824585, "learning_rate": 2e-05, "loss": 0.0832, "step": 99 }, { "epoch": 0.4032258064516129, "grad_norm": 0.5198425054550171, "learning_rate": 2e-05, "loss": 0.0786, "step": 100 }, { "epoch": 0.40725806451612906, "grad_norm": 0.6342234015464783, "learning_rate": 2e-05, "loss": 0.0801, "step": 101 }, { "epoch": 0.4112903225806452, "grad_norm": 0.4612491726875305, "learning_rate": 2e-05, "loss": 0.0805, "step": 102 }, { "epoch": 0.4153225806451613, "grad_norm": 0.8742281198501587, "learning_rate": 2e-05, "loss": 0.0882, "step": 103 }, { "epoch": 0.41935483870967744, "grad_norm": 0.3069051206111908, "learning_rate": 2e-05, "loss": 0.0762, "step": 104 }, { "epoch": 0.42338709677419356, "grad_norm": 0.7006452083587646, "learning_rate": 2e-05, "loss": 0.0744, "step": 105 }, { "epoch": 0.4274193548387097, "grad_norm": 0.514578640460968, "learning_rate": 2e-05, "loss": 0.0827, "step": 106 }, { "epoch": 0.4314516129032258, "grad_norm": 1.0400453805923462, "learning_rate": 2e-05, "loss": 0.0819, "step": 107 }, { "epoch": 0.43548387096774194, "grad_norm": 0.7127644419670105, "learning_rate": 2e-05, "loss": 0.083, "step": 108 }, { "epoch": 0.43951612903225806, "grad_norm": 0.617011308670044, "learning_rate": 2e-05, "loss": 0.0705, "step": 109 }, { "epoch": 0.4435483870967742, "grad_norm": 0.5836071968078613, "learning_rate": 2e-05, "loss": 0.0777, "step": 110 }, { "epoch": 0.4475806451612903, "grad_norm": 0.6622437238693237, "learning_rate": 2e-05, "loss": 0.0747, "step": 111 }, { "epoch": 0.45161290322580644, "grad_norm": 0.7056003212928772, "learning_rate": 2e-05, "loss": 0.0715, "step": 112 }, { "epoch": 0.45564516129032256, "grad_norm": 0.6626383662223816, "learning_rate": 2e-05, "loss": 0.0776, "step": 113 }, { "epoch": 0.4596774193548387, "grad_norm": 0.7465190291404724, "learning_rate": 2e-05, "loss": 0.0856, "step": 114 }, { "epoch": 0.4637096774193548, "grad_norm": 0.5531803369522095, "learning_rate": 2e-05, "loss": 0.0679, "step": 115 }, { "epoch": 0.46774193548387094, "grad_norm": 0.4788318872451782, "learning_rate": 2e-05, "loss": 0.0648, "step": 116 }, { "epoch": 0.4717741935483871, "grad_norm": 0.6184081435203552, "learning_rate": 2e-05, "loss": 0.0801, "step": 117 }, { "epoch": 0.47580645161290325, "grad_norm": 0.6424548029899597, "learning_rate": 2e-05, "loss": 0.0789, "step": 118 }, { "epoch": 0.4798387096774194, "grad_norm": 0.7118510007858276, "learning_rate": 2e-05, "loss": 0.0735, "step": 119 }, { "epoch": 0.4838709677419355, "grad_norm": 0.4841958284378052, "learning_rate": 2e-05, "loss": 0.061, "step": 120 }, { "epoch": 0.4879032258064516, "grad_norm": 0.8846139311790466, "learning_rate": 2e-05, "loss": 0.0747, "step": 121 }, { "epoch": 0.49193548387096775, "grad_norm": 0.5449007153511047, "learning_rate": 2e-05, "loss": 0.0711, "step": 122 }, { "epoch": 0.4959677419354839, "grad_norm": 0.767926037311554, "learning_rate": 2e-05, "loss": 0.0736, "step": 123 }, { "epoch": 0.5, "grad_norm": 0.5696377158164978, "learning_rate": 2e-05, "loss": 0.0671, "step": 124 }, { "epoch": 0.5040322580645161, "grad_norm": 0.6430863738059998, "learning_rate": 2e-05, "loss": 0.0679, "step": 125 }, { "epoch": 0.5080645161290323, "grad_norm": 0.7779257893562317, "learning_rate": 2e-05, "loss": 0.0713, "step": 126 }, { "epoch": 0.5120967741935484, "grad_norm": 0.7092922329902649, "learning_rate": 2e-05, "loss": 0.0765, "step": 127 }, { "epoch": 0.5161290322580645, "grad_norm": 0.5975173711776733, "learning_rate": 2e-05, "loss": 0.066, "step": 128 }, { "epoch": 0.5201612903225806, "grad_norm": 0.5376009941101074, "learning_rate": 2e-05, "loss": 0.0642, "step": 129 }, { "epoch": 0.5241935483870968, "grad_norm": 0.4406221807003021, "learning_rate": 2e-05, "loss": 0.0594, "step": 130 }, { "epoch": 0.5282258064516129, "grad_norm": 0.530074954032898, "learning_rate": 2e-05, "loss": 0.0731, "step": 131 }, { "epoch": 0.532258064516129, "grad_norm": 0.5786536335945129, "learning_rate": 2e-05, "loss": 0.06, "step": 132 }, { "epoch": 0.5362903225806451, "grad_norm": 0.5356053113937378, "learning_rate": 2e-05, "loss": 0.0659, "step": 133 }, { "epoch": 0.5403225806451613, "grad_norm": 0.3962647318840027, "learning_rate": 2e-05, "loss": 0.0618, "step": 134 }, { "epoch": 0.5443548387096774, "grad_norm": 0.3608771860599518, "learning_rate": 2e-05, "loss": 0.0643, "step": 135 }, { "epoch": 0.5483870967741935, "grad_norm": 0.5634734034538269, "learning_rate": 2e-05, "loss": 0.0615, "step": 136 }, { "epoch": 0.5524193548387096, "grad_norm": 0.5571008324623108, "learning_rate": 2e-05, "loss": 0.0663, "step": 137 }, { "epoch": 0.5564516129032258, "grad_norm": 0.5018740296363831, "learning_rate": 2e-05, "loss": 0.0613, "step": 138 }, { "epoch": 0.5604838709677419, "grad_norm": 0.664557933807373, "learning_rate": 2e-05, "loss": 0.067, "step": 139 }, { "epoch": 0.5645161290322581, "grad_norm": 0.6537980437278748, "learning_rate": 2e-05, "loss": 0.0593, "step": 140 }, { "epoch": 0.5685483870967742, "grad_norm": 0.8715218901634216, "learning_rate": 2e-05, "loss": 0.0693, "step": 141 }, { "epoch": 0.5725806451612904, "grad_norm": 0.5582900047302246, "learning_rate": 2e-05, "loss": 0.0605, "step": 142 }, { "epoch": 0.5766129032258065, "grad_norm": 0.4657461941242218, "learning_rate": 2e-05, "loss": 0.0594, "step": 143 }, { "epoch": 0.5806451612903226, "grad_norm": 0.5373775959014893, "learning_rate": 2e-05, "loss": 0.07, "step": 144 }, { "epoch": 0.5846774193548387, "grad_norm": 0.4283169209957123, "learning_rate": 2e-05, "loss": 0.0536, "step": 145 }, { "epoch": 0.5887096774193549, "grad_norm": 0.6403968930244446, "learning_rate": 2e-05, "loss": 0.0667, "step": 146 }, { "epoch": 0.592741935483871, "grad_norm": 0.32464203238487244, "learning_rate": 2e-05, "loss": 0.0548, "step": 147 }, { "epoch": 0.5967741935483871, "grad_norm": 0.648133397102356, "learning_rate": 2e-05, "loss": 0.0612, "step": 148 }, { "epoch": 0.6008064516129032, "grad_norm": 0.47770267724990845, "learning_rate": 2e-05, "loss": 0.0544, "step": 149 }, { "epoch": 0.6048387096774194, "grad_norm": 0.9105427861213684, "learning_rate": 2e-05, "loss": 0.0684, "step": 150 }, { "epoch": 0.6088709677419355, "grad_norm": 0.6342010498046875, "learning_rate": 2e-05, "loss": 0.0601, "step": 151 }, { "epoch": 0.6129032258064516, "grad_norm": 0.8317110538482666, "learning_rate": 2e-05, "loss": 0.0584, "step": 152 }, { "epoch": 0.6169354838709677, "grad_norm": 0.57545405626297, "learning_rate": 2e-05, "loss": 0.059, "step": 153 }, { "epoch": 0.6209677419354839, "grad_norm": 0.46788084506988525, "learning_rate": 2e-05, "loss": 0.0552, "step": 154 }, { "epoch": 0.625, "grad_norm": 0.5528416633605957, "learning_rate": 2e-05, "loss": 0.0579, "step": 155 }, { "epoch": 0.6290322580645161, "grad_norm": 0.45801204442977905, "learning_rate": 2e-05, "loss": 0.0539, "step": 156 }, { "epoch": 0.6330645161290323, "grad_norm": 0.47493261098861694, "learning_rate": 2e-05, "loss": 0.0585, "step": 157 }, { "epoch": 0.6370967741935484, "grad_norm": 0.46749451756477356, "learning_rate": 2e-05, "loss": 0.0537, "step": 158 }, { "epoch": 0.6411290322580645, "grad_norm": 0.5712094306945801, "learning_rate": 2e-05, "loss": 0.0636, "step": 159 }, { "epoch": 0.6451612903225806, "grad_norm": 0.474437952041626, "learning_rate": 2e-05, "loss": 0.0539, "step": 160 }, { "epoch": 0.6491935483870968, "grad_norm": 0.5955020785331726, "learning_rate": 2e-05, "loss": 0.0686, "step": 161 }, { "epoch": 0.6532258064516129, "grad_norm": 0.5444841980934143, "learning_rate": 2e-05, "loss": 0.0514, "step": 162 }, { "epoch": 0.657258064516129, "grad_norm": 0.585702657699585, "learning_rate": 2e-05, "loss": 0.057, "step": 163 }, { "epoch": 0.6612903225806451, "grad_norm": 0.6098143458366394, "learning_rate": 2e-05, "loss": 0.0624, "step": 164 }, { "epoch": 0.6653225806451613, "grad_norm": 0.5105492472648621, "learning_rate": 2e-05, "loss": 0.0524, "step": 165 }, { "epoch": 0.6693548387096774, "grad_norm": 0.3543269634246826, "learning_rate": 2e-05, "loss": 0.0427, "step": 166 }, { "epoch": 0.6733870967741935, "grad_norm": 0.40186411142349243, "learning_rate": 2e-05, "loss": 0.0513, "step": 167 }, { "epoch": 0.6774193548387096, "grad_norm": 0.4863409101963043, "learning_rate": 2e-05, "loss": 0.0615, "step": 168 }, { "epoch": 0.6814516129032258, "grad_norm": 0.35418546199798584, "learning_rate": 2e-05, "loss": 0.0534, "step": 169 }, { "epoch": 0.6854838709677419, "grad_norm": 0.4265013039112091, "learning_rate": 2e-05, "loss": 0.0424, "step": 170 }, { "epoch": 0.6895161290322581, "grad_norm": 0.4792309105396271, "learning_rate": 2e-05, "loss": 0.0534, "step": 171 }, { "epoch": 0.6935483870967742, "grad_norm": 0.9275990724563599, "learning_rate": 2e-05, "loss": 0.0605, "step": 172 }, { "epoch": 0.6975806451612904, "grad_norm": 0.5802022218704224, "learning_rate": 2e-05, "loss": 0.0541, "step": 173 }, { "epoch": 0.7016129032258065, "grad_norm": 0.8620706796646118, "learning_rate": 2e-05, "loss": 0.0617, "step": 174 }, { "epoch": 0.7056451612903226, "grad_norm": 0.6036432981491089, "learning_rate": 2e-05, "loss": 0.0626, "step": 175 }, { "epoch": 0.7096774193548387, "grad_norm": 0.5247609615325928, "learning_rate": 2e-05, "loss": 0.0553, "step": 176 }, { "epoch": 0.7137096774193549, "grad_norm": 0.5166157484054565, "learning_rate": 2e-05, "loss": 0.0549, "step": 177 }, { "epoch": 0.717741935483871, "grad_norm": 0.4395121932029724, "learning_rate": 2e-05, "loss": 0.0526, "step": 178 }, { "epoch": 0.7217741935483871, "grad_norm": 0.47025758028030396, "learning_rate": 2e-05, "loss": 0.0558, "step": 179 }, { "epoch": 0.7258064516129032, "grad_norm": 0.5386791229248047, "learning_rate": 2e-05, "loss": 0.0539, "step": 180 }, { "epoch": 0.7298387096774194, "grad_norm": 0.5612148642539978, "learning_rate": 2e-05, "loss": 0.0591, "step": 181 }, { "epoch": 0.7338709677419355, "grad_norm": 0.4585655927658081, "learning_rate": 2e-05, "loss": 0.0546, "step": 182 }, { "epoch": 0.7379032258064516, "grad_norm": 0.5998373627662659, "learning_rate": 2e-05, "loss": 0.0583, "step": 183 }, { "epoch": 0.7419354838709677, "grad_norm": 0.38647782802581787, "learning_rate": 2e-05, "loss": 0.0494, "step": 184 }, { "epoch": 0.7459677419354839, "grad_norm": 0.567383348941803, "learning_rate": 2e-05, "loss": 0.0487, "step": 185 }, { "epoch": 0.75, "grad_norm": 0.5236309766769409, "learning_rate": 2e-05, "loss": 0.0552, "step": 186 }, { "epoch": 0.7540322580645161, "grad_norm": 0.3990425765514374, "learning_rate": 2e-05, "loss": 0.0512, "step": 187 }, { "epoch": 0.7580645161290323, "grad_norm": 0.5519928336143494, "learning_rate": 2e-05, "loss": 0.0449, "step": 188 }, { "epoch": 0.7620967741935484, "grad_norm": 0.43356701731681824, "learning_rate": 2e-05, "loss": 0.0413, "step": 189 }, { "epoch": 0.7661290322580645, "grad_norm": 0.46121910214424133, "learning_rate": 2e-05, "loss": 0.0441, "step": 190 }, { "epoch": 0.7701612903225806, "grad_norm": 0.5286686420440674, "learning_rate": 2e-05, "loss": 0.0506, "step": 191 }, { "epoch": 0.7741935483870968, "grad_norm": 0.6215876340866089, "learning_rate": 2e-05, "loss": 0.0538, "step": 192 }, { "epoch": 0.7782258064516129, "grad_norm": 0.7031762003898621, "learning_rate": 2e-05, "loss": 0.0507, "step": 193 }, { "epoch": 0.782258064516129, "grad_norm": 0.4998103678226471, "learning_rate": 2e-05, "loss": 0.055, "step": 194 }, { "epoch": 0.7862903225806451, "grad_norm": 0.4593054950237274, "learning_rate": 2e-05, "loss": 0.0468, "step": 195 }, { "epoch": 0.7903225806451613, "grad_norm": 0.6475517749786377, "learning_rate": 2e-05, "loss": 0.0559, "step": 196 }, { "epoch": 0.7943548387096774, "grad_norm": 0.523537278175354, "learning_rate": 2e-05, "loss": 0.0464, "step": 197 }, { "epoch": 0.7983870967741935, "grad_norm": 0.6223071813583374, "learning_rate": 2e-05, "loss": 0.0464, "step": 198 }, { "epoch": 0.8024193548387096, "grad_norm": 0.40836507081985474, "learning_rate": 2e-05, "loss": 0.049, "step": 199 }, { "epoch": 0.8064516129032258, "grad_norm": 0.6119136810302734, "learning_rate": 2e-05, "loss": 0.0536, "step": 200 }, { "epoch": 0.8104838709677419, "grad_norm": 0.4265545904636383, "learning_rate": 2e-05, "loss": 0.0502, "step": 201 }, { "epoch": 0.8145161290322581, "grad_norm": 0.44581177830696106, "learning_rate": 2e-05, "loss": 0.0471, "step": 202 }, { "epoch": 0.8185483870967742, "grad_norm": 0.4306443929672241, "learning_rate": 2e-05, "loss": 0.0444, "step": 203 }, { "epoch": 0.8225806451612904, "grad_norm": 0.402327299118042, "learning_rate": 2e-05, "loss": 0.0415, "step": 204 }, { "epoch": 0.8266129032258065, "grad_norm": 0.4216252863407135, "learning_rate": 2e-05, "loss": 0.0465, "step": 205 }, { "epoch": 0.8306451612903226, "grad_norm": 0.3738255202770233, "learning_rate": 2e-05, "loss": 0.0415, "step": 206 }, { "epoch": 0.8346774193548387, "grad_norm": 0.5387892723083496, "learning_rate": 2e-05, "loss": 0.0543, "step": 207 }, { "epoch": 0.8387096774193549, "grad_norm": 0.5584475994110107, "learning_rate": 2e-05, "loss": 0.0457, "step": 208 }, { "epoch": 0.842741935483871, "grad_norm": 0.5456405878067017, "learning_rate": 2e-05, "loss": 0.048, "step": 209 }, { "epoch": 0.8467741935483871, "grad_norm": 0.5054622888565063, "learning_rate": 2e-05, "loss": 0.0476, "step": 210 }, { "epoch": 0.8508064516129032, "grad_norm": 0.41379377245903015, "learning_rate": 2e-05, "loss": 0.0436, "step": 211 }, { "epoch": 0.8548387096774194, "grad_norm": 0.3779892921447754, "learning_rate": 2e-05, "loss": 0.0478, "step": 212 }, { "epoch": 0.8588709677419355, "grad_norm": 0.4135122001171112, "learning_rate": 2e-05, "loss": 0.0422, "step": 213 }, { "epoch": 0.8629032258064516, "grad_norm": 0.5435640215873718, "learning_rate": 2e-05, "loss": 0.0484, "step": 214 }, { "epoch": 0.8669354838709677, "grad_norm": 0.5836952924728394, "learning_rate": 2e-05, "loss": 0.0493, "step": 215 }, { "epoch": 0.8709677419354839, "grad_norm": 0.4919867515563965, "learning_rate": 2e-05, "loss": 0.0503, "step": 216 }, { "epoch": 0.875, "grad_norm": 0.4889490008354187, "learning_rate": 2e-05, "loss": 0.0475, "step": 217 }, { "epoch": 0.8790322580645161, "grad_norm": 0.4471587538719177, "learning_rate": 2e-05, "loss": 0.0381, "step": 218 }, { "epoch": 0.8830645161290323, "grad_norm": 0.40294429659843445, "learning_rate": 2e-05, "loss": 0.0438, "step": 219 }, { "epoch": 0.8870967741935484, "grad_norm": 0.46678218245506287, "learning_rate": 2e-05, "loss": 0.0442, "step": 220 }, { "epoch": 0.8911290322580645, "grad_norm": 0.622652530670166, "learning_rate": 2e-05, "loss": 0.0412, "step": 221 }, { "epoch": 0.8951612903225806, "grad_norm": 0.41154831647872925, "learning_rate": 2e-05, "loss": 0.0451, "step": 222 }, { "epoch": 0.8991935483870968, "grad_norm": 0.36561766266822815, "learning_rate": 2e-05, "loss": 0.0453, "step": 223 }, { "epoch": 0.9032258064516129, "grad_norm": 0.619911789894104, "learning_rate": 2e-05, "loss": 0.0481, "step": 224 }, { "epoch": 0.907258064516129, "grad_norm": 0.543843686580658, "learning_rate": 2e-05, "loss": 0.043, "step": 225 }, { "epoch": 0.9112903225806451, "grad_norm": 0.546393871307373, "learning_rate": 2e-05, "loss": 0.0435, "step": 226 }, { "epoch": 0.9153225806451613, "grad_norm": 0.3940606713294983, "learning_rate": 2e-05, "loss": 0.0406, "step": 227 }, { "epoch": 0.9193548387096774, "grad_norm": 0.31918397545814514, "learning_rate": 2e-05, "loss": 0.0384, "step": 228 }, { "epoch": 0.9233870967741935, "grad_norm": 0.35918116569519043, "learning_rate": 2e-05, "loss": 0.0366, "step": 229 }, { "epoch": 0.9274193548387096, "grad_norm": 0.39295467734336853, "learning_rate": 2e-05, "loss": 0.0395, "step": 230 }, { "epoch": 0.9314516129032258, "grad_norm": 0.34643733501434326, "learning_rate": 2e-05, "loss": 0.0405, "step": 231 }, { "epoch": 0.9354838709677419, "grad_norm": 0.3488601744174957, "learning_rate": 2e-05, "loss": 0.0435, "step": 232 }, { "epoch": 0.9395161290322581, "grad_norm": 0.4448557496070862, "learning_rate": 2e-05, "loss": 0.0459, "step": 233 }, { "epoch": 0.9435483870967742, "grad_norm": 0.4407562017440796, "learning_rate": 2e-05, "loss": 0.0447, "step": 234 }, { "epoch": 0.9475806451612904, "grad_norm": 0.5757035613059998, "learning_rate": 2e-05, "loss": 0.0458, "step": 235 }, { "epoch": 0.9516129032258065, "grad_norm": 0.29268836975097656, "learning_rate": 2e-05, "loss": 0.0441, "step": 236 }, { "epoch": 0.9556451612903226, "grad_norm": 0.39647752046585083, "learning_rate": 2e-05, "loss": 0.0382, "step": 237 }, { "epoch": 0.9596774193548387, "grad_norm": 0.4112660884857178, "learning_rate": 2e-05, "loss": 0.0382, "step": 238 }, { "epoch": 0.9637096774193549, "grad_norm": 0.4475345313549042, "learning_rate": 2e-05, "loss": 0.0458, "step": 239 }, { "epoch": 0.967741935483871, "grad_norm": 0.26978054642677307, "learning_rate": 2e-05, "loss": 0.0419, "step": 240 }, { "epoch": 0.9717741935483871, "grad_norm": 0.4117030203342438, "learning_rate": 2e-05, "loss": 0.043, "step": 241 }, { "epoch": 0.9758064516129032, "grad_norm": 0.28733769059181213, "learning_rate": 2e-05, "loss": 0.0387, "step": 242 }, { "epoch": 0.9798387096774194, "grad_norm": 0.32847997546195984, "learning_rate": 2e-05, "loss": 0.0407, "step": 243 }, { "epoch": 0.9838709677419355, "grad_norm": 0.4303770661354065, "learning_rate": 2e-05, "loss": 0.0417, "step": 244 }, { "epoch": 0.9879032258064516, "grad_norm": 0.36009445786476135, "learning_rate": 2e-05, "loss": 0.0391, "step": 245 }, { "epoch": 0.9919354838709677, "grad_norm": 0.46317991614341736, "learning_rate": 2e-05, "loss": 0.0477, "step": 246 }, { "epoch": 0.9959677419354839, "grad_norm": 0.48081448674201965, "learning_rate": 2e-05, "loss": 0.035, "step": 247 }, { "epoch": 1.0, "grad_norm": 0.3577556908130646, "learning_rate": 2e-05, "loss": 0.0435, "step": 248 } ], "logging_steps": 1.0, "max_steps": 248, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 284755222462464.0, "train_batch_size": 48, "trial_name": null, "trial_params": null }