{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.733952049497293,
"eval_steps": 500,
"global_step": 30000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.025779840164990978,
"grad_norm": 0.5347822308540344,
"learning_rate": 0.0002,
"loss": 1.4509,
"step": 100
},
{
"epoch": 0.051559680329981955,
"grad_norm": 0.4712078273296356,
"learning_rate": 0.0002,
"loss": 1.1744,
"step": 200
},
{
"epoch": 0.07733952049497293,
"grad_norm": 0.5031601786613464,
"learning_rate": 0.0002,
"loss": 1.096,
"step": 300
},
{
"epoch": 0.10311936065996391,
"grad_norm": 0.49241065979003906,
"learning_rate": 0.0002,
"loss": 0.9847,
"step": 400
},
{
"epoch": 0.12889920082495487,
"grad_norm": 0.9957050681114197,
"learning_rate": 0.0002,
"loss": 0.9928,
"step": 500
},
{
"epoch": 0.15467904098994587,
"grad_norm": 0.38163048028945923,
"learning_rate": 0.0002,
"loss": 0.9008,
"step": 600
},
{
"epoch": 0.18045888115493683,
"grad_norm": 0.4322434663772583,
"learning_rate": 0.0002,
"loss": 0.9108,
"step": 700
},
{
"epoch": 0.20623872131992782,
"grad_norm": 0.4072737395763397,
"learning_rate": 0.0002,
"loss": 0.8713,
"step": 800
},
{
"epoch": 0.23201856148491878,
"grad_norm": 0.5637839436531067,
"learning_rate": 0.0002,
"loss": 0.8538,
"step": 900
},
{
"epoch": 0.25779840164990975,
"grad_norm": 0.6094131469726562,
"learning_rate": 0.0002,
"loss": 0.8154,
"step": 1000
},
{
"epoch": 0.28357824181490077,
"grad_norm": 0.4212701618671417,
"learning_rate": 0.0002,
"loss": 0.7897,
"step": 1100
},
{
"epoch": 0.30935808197989173,
"grad_norm": 0.4663824737071991,
"learning_rate": 0.0002,
"loss": 0.8021,
"step": 1200
},
{
"epoch": 0.3351379221448827,
"grad_norm": 0.3774861693382263,
"learning_rate": 0.0002,
"loss": 0.7452,
"step": 1300
},
{
"epoch": 0.36091776230987366,
"grad_norm": 0.19446992874145508,
"learning_rate": 0.0002,
"loss": 0.737,
"step": 1400
},
{
"epoch": 0.3866976024748647,
"grad_norm": 0.25984033942222595,
"learning_rate": 0.0002,
"loss": 0.6966,
"step": 1500
},
{
"epoch": 0.41247744263985564,
"grad_norm": 0.3495163023471832,
"learning_rate": 0.0002,
"loss": 0.7179,
"step": 1600
},
{
"epoch": 0.4382572828048466,
"grad_norm": 0.5092929601669312,
"learning_rate": 0.0002,
"loss": 0.7132,
"step": 1700
},
{
"epoch": 0.46403712296983757,
"grad_norm": 0.16095790266990662,
"learning_rate": 0.0002,
"loss": 0.6652,
"step": 1800
},
{
"epoch": 0.4898169631348286,
"grad_norm": 0.38502034544944763,
"learning_rate": 0.0002,
"loss": 0.6564,
"step": 1900
},
{
"epoch": 0.5155968032998195,
"grad_norm": 0.3100506067276001,
"learning_rate": 0.0002,
"loss": 0.6082,
"step": 2000
},
{
"epoch": 0.5413766434648105,
"grad_norm": 0.4585016965866089,
"learning_rate": 0.0002,
"loss": 0.6491,
"step": 2100
},
{
"epoch": 0.5671564836298015,
"grad_norm": 0.35394927859306335,
"learning_rate": 0.0002,
"loss": 0.6136,
"step": 2200
},
{
"epoch": 0.5929363237947924,
"grad_norm": 0.4828909933567047,
"learning_rate": 0.0002,
"loss": 0.5639,
"step": 2300
},
{
"epoch": 0.6187161639597835,
"grad_norm": 0.7377568483352661,
"learning_rate": 0.0002,
"loss": 0.5998,
"step": 2400
},
{
"epoch": 0.6444960041247745,
"grad_norm": 0.33992356061935425,
"learning_rate": 0.0002,
"loss": 0.5535,
"step": 2500
},
{
"epoch": 0.6702758442897654,
"grad_norm": 0.40880173444747925,
"learning_rate": 0.0002,
"loss": 0.5839,
"step": 2600
},
{
"epoch": 0.6960556844547564,
"grad_norm": 0.6135886907577515,
"learning_rate": 0.0002,
"loss": 0.5697,
"step": 2700
},
{
"epoch": 0.7218355246197473,
"grad_norm": 0.14242181181907654,
"learning_rate": 0.0002,
"loss": 0.562,
"step": 2800
},
{
"epoch": 0.7476153647847383,
"grad_norm": 0.1636349856853485,
"learning_rate": 0.0002,
"loss": 0.5301,
"step": 2900
},
{
"epoch": 0.7733952049497294,
"grad_norm": 0.5300703644752502,
"learning_rate": 0.0002,
"loss": 0.5428,
"step": 3000
},
{
"epoch": 0.7991750451147203,
"grad_norm": 0.2816906273365021,
"learning_rate": 0.0002,
"loss": 0.5319,
"step": 3100
},
{
"epoch": 0.8249548852797113,
"grad_norm": 0.4165875315666199,
"learning_rate": 0.0002,
"loss": 0.5073,
"step": 3200
},
{
"epoch": 0.8507347254447022,
"grad_norm": 0.46957316994667053,
"learning_rate": 0.0002,
"loss": 0.4973,
"step": 3300
},
{
"epoch": 0.8765145656096932,
"grad_norm": 0.22382797300815582,
"learning_rate": 0.0002,
"loss": 0.5091,
"step": 3400
},
{
"epoch": 0.9022944057746842,
"grad_norm": 0.517814576625824,
"learning_rate": 0.0002,
"loss": 0.4879,
"step": 3500
},
{
"epoch": 0.9280742459396751,
"grad_norm": 0.44171011447906494,
"learning_rate": 0.0002,
"loss": 0.4711,
"step": 3600
},
{
"epoch": 0.9538540861046662,
"grad_norm": 0.3107047379016876,
"learning_rate": 0.0002,
"loss": 0.465,
"step": 3700
},
{
"epoch": 0.9796339262696572,
"grad_norm": 0.09984863549470901,
"learning_rate": 0.0002,
"loss": 0.4485,
"step": 3800
},
{
"epoch": 1.005413766434648,
"grad_norm": 0.43100592494010925,
"learning_rate": 0.0002,
"loss": 0.4752,
"step": 3900
},
{
"epoch": 1.031193606599639,
"grad_norm": 0.5259262919425964,
"learning_rate": 0.0002,
"loss": 0.3621,
"step": 4000
},
{
"epoch": 1.0569734467646301,
"grad_norm": 0.47033509612083435,
"learning_rate": 0.0002,
"loss": 0.3569,
"step": 4100
},
{
"epoch": 1.082753286929621,
"grad_norm": 0.5318751931190491,
"learning_rate": 0.0002,
"loss": 0.3512,
"step": 4200
},
{
"epoch": 1.108533127094612,
"grad_norm": 0.5434057116508484,
"learning_rate": 0.0002,
"loss": 0.3504,
"step": 4300
},
{
"epoch": 1.134312967259603,
"grad_norm": 0.47843560576438904,
"learning_rate": 0.0002,
"loss": 0.3712,
"step": 4400
},
{
"epoch": 1.160092807424594,
"grad_norm": 0.5956776142120361,
"learning_rate": 0.0002,
"loss": 0.3511,
"step": 4500
},
{
"epoch": 1.1858726475895849,
"grad_norm": 0.5072950720787048,
"learning_rate": 0.0002,
"loss": 0.3445,
"step": 4600
},
{
"epoch": 1.211652487754576,
"grad_norm": 0.5608052611351013,
"learning_rate": 0.0002,
"loss": 0.3377,
"step": 4700
},
{
"epoch": 1.237432327919567,
"grad_norm": 0.474223256111145,
"learning_rate": 0.0002,
"loss": 0.3276,
"step": 4800
},
{
"epoch": 1.2632121680845578,
"grad_norm": 0.5215118527412415,
"learning_rate": 0.0002,
"loss": 0.3375,
"step": 4900
},
{
"epoch": 1.288992008249549,
"grad_norm": 0.3922516405582428,
"learning_rate": 0.0002,
"loss": 0.342,
"step": 5000
},
{
"epoch": 1.3147718484145399,
"grad_norm": 0.4958643615245819,
"learning_rate": 0.0002,
"loss": 0.3553,
"step": 5100
},
{
"epoch": 1.3405516885795308,
"grad_norm": 0.564983069896698,
"learning_rate": 0.0002,
"loss": 0.3389,
"step": 5200
},
{
"epoch": 1.3663315287445217,
"grad_norm": 0.5662856698036194,
"learning_rate": 0.0002,
"loss": 0.3382,
"step": 5300
},
{
"epoch": 1.3921113689095128,
"grad_norm": 0.5040738582611084,
"learning_rate": 0.0002,
"loss": 0.3408,
"step": 5400
},
{
"epoch": 1.4178912090745037,
"grad_norm": 0.27346768975257874,
"learning_rate": 0.0002,
"loss": 0.3266,
"step": 5500
},
{
"epoch": 1.4436710492394949,
"grad_norm": 0.5055024027824402,
"learning_rate": 0.0002,
"loss": 0.3561,
"step": 5600
},
{
"epoch": 1.4694508894044858,
"grad_norm": 0.5442714691162109,
"learning_rate": 0.0002,
"loss": 0.3241,
"step": 5700
},
{
"epoch": 1.4952307295694767,
"grad_norm": 0.4862806499004364,
"learning_rate": 0.0002,
"loss": 0.344,
"step": 5800
},
{
"epoch": 1.5210105697344676,
"grad_norm": 0.6346714496612549,
"learning_rate": 0.0002,
"loss": 0.3195,
"step": 5900
},
{
"epoch": 1.5467904098994585,
"grad_norm": 0.5846338272094727,
"learning_rate": 0.0002,
"loss": 0.3232,
"step": 6000
},
{
"epoch": 1.5725702500644496,
"grad_norm": 0.41255345940589905,
"learning_rate": 0.0002,
"loss": 0.3379,
"step": 6100
},
{
"epoch": 1.5983500902294405,
"grad_norm": 0.6396617293357849,
"learning_rate": 0.0002,
"loss": 0.3099,
"step": 6200
},
{
"epoch": 1.6241299303944317,
"grad_norm": 0.3450670540332794,
"learning_rate": 0.0002,
"loss": 0.3129,
"step": 6300
},
{
"epoch": 1.6499097705594226,
"grad_norm": 0.30461055040359497,
"learning_rate": 0.0002,
"loss": 0.2978,
"step": 6400
},
{
"epoch": 1.6756896107244135,
"grad_norm": 0.4209739863872528,
"learning_rate": 0.0002,
"loss": 0.3323,
"step": 6500
},
{
"epoch": 1.7014694508894044,
"grad_norm": 0.3296062648296356,
"learning_rate": 0.0002,
"loss": 0.3047,
"step": 6600
},
{
"epoch": 1.7272492910543955,
"grad_norm": 0.9009484648704529,
"learning_rate": 0.0002,
"loss": 0.3046,
"step": 6700
},
{
"epoch": 1.7530291312193864,
"grad_norm": 0.7505986094474792,
"learning_rate": 0.0002,
"loss": 0.3123,
"step": 6800
},
{
"epoch": 1.7788089713843775,
"grad_norm": 0.3542492389678955,
"learning_rate": 0.0002,
"loss": 0.3259,
"step": 6900
},
{
"epoch": 1.8045888115493685,
"grad_norm": 0.4935378432273865,
"learning_rate": 0.0002,
"loss": 0.3262,
"step": 7000
},
{
"epoch": 1.8303686517143594,
"grad_norm": 0.3000539541244507,
"learning_rate": 0.0002,
"loss": 0.2887,
"step": 7100
},
{
"epoch": 1.8561484918793503,
"grad_norm": 0.2680779695510864,
"learning_rate": 0.0002,
"loss": 0.3108,
"step": 7200
},
{
"epoch": 1.8819283320443412,
"grad_norm": 0.5922934412956238,
"learning_rate": 0.0002,
"loss": 0.3211,
"step": 7300
},
{
"epoch": 1.9077081722093323,
"grad_norm": 0.38349688053131104,
"learning_rate": 0.0002,
"loss": 0.316,
"step": 7400
},
{
"epoch": 1.9334880123743234,
"grad_norm": 0.7654793858528137,
"learning_rate": 0.0002,
"loss": 0.3111,
"step": 7500
},
{
"epoch": 1.9592678525393143,
"grad_norm": 0.2399352639913559,
"learning_rate": 0.0002,
"loss": 0.3042,
"step": 7600
},
{
"epoch": 1.9850476927043053,
"grad_norm": 0.42787912487983704,
"learning_rate": 0.0002,
"loss": 0.2928,
"step": 7700
},
{
"epoch": 2.010827532869296,
"grad_norm": 0.4771544933319092,
"learning_rate": 0.0002,
"loss": 0.2487,
"step": 7800
},
{
"epoch": 2.036607373034287,
"grad_norm": 0.6133277416229248,
"learning_rate": 0.0002,
"loss": 0.2219,
"step": 7900
},
{
"epoch": 2.062387213199278,
"grad_norm": 0.43137651681900024,
"learning_rate": 0.0002,
"loss": 0.2158,
"step": 8000
},
{
"epoch": 2.0881670533642693,
"grad_norm": 0.41038885712623596,
"learning_rate": 0.0002,
"loss": 0.2127,
"step": 8100
},
{
"epoch": 2.1139468935292602,
"grad_norm": 0.351235568523407,
"learning_rate": 0.0002,
"loss": 0.2185,
"step": 8200
},
{
"epoch": 2.139726733694251,
"grad_norm": 0.41089433431625366,
"learning_rate": 0.0002,
"loss": 0.2346,
"step": 8300
},
{
"epoch": 2.165506573859242,
"grad_norm": 0.3464137613773346,
"learning_rate": 0.0002,
"loss": 0.2273,
"step": 8400
},
{
"epoch": 2.191286414024233,
"grad_norm": 0.2753762900829315,
"learning_rate": 0.0002,
"loss": 0.2359,
"step": 8500
},
{
"epoch": 2.217066254189224,
"grad_norm": 0.3630015552043915,
"learning_rate": 0.0002,
"loss": 0.2351,
"step": 8600
},
{
"epoch": 2.2428460943542152,
"grad_norm": 0.5501378178596497,
"learning_rate": 0.0002,
"loss": 0.2273,
"step": 8700
},
{
"epoch": 2.268625934519206,
"grad_norm": 0.31958362460136414,
"learning_rate": 0.0002,
"loss": 0.2306,
"step": 8800
},
{
"epoch": 2.294405774684197,
"grad_norm": 0.4495809078216553,
"learning_rate": 0.0002,
"loss": 0.2283,
"step": 8900
},
{
"epoch": 2.320185614849188,
"grad_norm": 0.45789313316345215,
"learning_rate": 0.0002,
"loss": 0.2191,
"step": 9000
},
{
"epoch": 2.345965455014179,
"grad_norm": 0.2430783361196518,
"learning_rate": 0.0002,
"loss": 0.2266,
"step": 9100
},
{
"epoch": 2.3717452951791698,
"grad_norm": 0.512585461139679,
"learning_rate": 0.0002,
"loss": 0.2293,
"step": 9200
},
{
"epoch": 2.3975251353441607,
"grad_norm": 0.42088598012924194,
"learning_rate": 0.0002,
"loss": 0.2388,
"step": 9300
},
{
"epoch": 2.423304975509152,
"grad_norm": 0.4196650981903076,
"learning_rate": 0.0002,
"loss": 0.2305,
"step": 9400
},
{
"epoch": 2.449084815674143,
"grad_norm": 0.45856234431266785,
"learning_rate": 0.0002,
"loss": 0.2294,
"step": 9500
},
{
"epoch": 2.474864655839134,
"grad_norm": 0.5690295100212097,
"learning_rate": 0.0002,
"loss": 0.2237,
"step": 9600
},
{
"epoch": 2.5006444960041248,
"grad_norm": 0.5325428247451782,
"learning_rate": 0.0002,
"loss": 0.2125,
"step": 9700
},
{
"epoch": 2.5264243361691157,
"grad_norm": 0.4254339933395386,
"learning_rate": 0.0002,
"loss": 0.2335,
"step": 9800
},
{
"epoch": 2.5522041763341066,
"grad_norm": 0.44463545083999634,
"learning_rate": 0.0002,
"loss": 0.2247,
"step": 9900
},
{
"epoch": 2.577984016499098,
"grad_norm": 0.4192294776439667,
"learning_rate": 0.0002,
"loss": 0.2328,
"step": 10000
},
{
"epoch": 2.603763856664089,
"grad_norm": 0.39080777764320374,
"learning_rate": 0.0002,
"loss": 0.2229,
"step": 10100
},
{
"epoch": 2.6295436968290797,
"grad_norm": 0.3375299870967865,
"learning_rate": 0.0002,
"loss": 0.2374,
"step": 10200
},
{
"epoch": 2.6553235369940706,
"grad_norm": 0.6126553416252136,
"learning_rate": 0.0002,
"loss": 0.2283,
"step": 10300
},
{
"epoch": 2.6811033771590616,
"grad_norm": 0.21654823422431946,
"learning_rate": 0.0002,
"loss": 0.2265,
"step": 10400
},
{
"epoch": 2.7068832173240525,
"grad_norm": 0.41668832302093506,
"learning_rate": 0.0002,
"loss": 0.2267,
"step": 10500
},
{
"epoch": 2.7326630574890434,
"grad_norm": 0.5655872225761414,
"learning_rate": 0.0002,
"loss": 0.2331,
"step": 10600
},
{
"epoch": 2.7584428976540343,
"grad_norm": 0.49956533312797546,
"learning_rate": 0.0002,
"loss": 0.2323,
"step": 10700
},
{
"epoch": 2.7842227378190256,
"grad_norm": 0.4230547547340393,
"learning_rate": 0.0002,
"loss": 0.2157,
"step": 10800
},
{
"epoch": 2.8100025779840165,
"grad_norm": 0.5253151655197144,
"learning_rate": 0.0002,
"loss": 0.2189,
"step": 10900
},
{
"epoch": 2.8357824181490074,
"grad_norm": 0.3807348906993866,
"learning_rate": 0.0002,
"loss": 0.2285,
"step": 11000
},
{
"epoch": 2.8615622583139984,
"grad_norm": 0.6454833149909973,
"learning_rate": 0.0002,
"loss": 0.228,
"step": 11100
},
{
"epoch": 2.8873420984789897,
"grad_norm": 0.2508118450641632,
"learning_rate": 0.0002,
"loss": 0.2139,
"step": 11200
},
{
"epoch": 2.9131219386439806,
"grad_norm": 0.32768428325653076,
"learning_rate": 0.0002,
"loss": 0.2206,
"step": 11300
},
{
"epoch": 2.9389017788089715,
"grad_norm": 0.4850573241710663,
"learning_rate": 0.0002,
"loss": 0.2235,
"step": 11400
},
{
"epoch": 2.9646816189739624,
"grad_norm": 0.6089478135108948,
"learning_rate": 0.0002,
"loss": 0.2081,
"step": 11500
},
{
"epoch": 2.9904614591389533,
"grad_norm": 0.47153401374816895,
"learning_rate": 0.0002,
"loss": 0.2463,
"step": 11600
},
{
"epoch": 3.0162412993039442,
"grad_norm": 0.3843853771686554,
"learning_rate": 0.0002,
"loss": 0.1911,
"step": 11700
},
{
"epoch": 3.042021139468935,
"grad_norm": 0.21224769949913025,
"learning_rate": 0.0002,
"loss": 0.1753,
"step": 11800
},
{
"epoch": 3.067800979633926,
"grad_norm": 0.3223534822463989,
"learning_rate": 0.0002,
"loss": 0.1799,
"step": 11900
},
{
"epoch": 3.0935808197989174,
"grad_norm": 0.399443656206131,
"learning_rate": 0.0002,
"loss": 0.1755,
"step": 12000
},
{
"epoch": 3.1193606599639083,
"grad_norm": 0.253034770488739,
"learning_rate": 0.0002,
"loss": 0.177,
"step": 12100
},
{
"epoch": 3.1451405001288992,
"grad_norm": 0.318568617105484,
"learning_rate": 0.0002,
"loss": 0.1772,
"step": 12200
},
{
"epoch": 3.17092034029389,
"grad_norm": 0.2624630928039551,
"learning_rate": 0.0002,
"loss": 0.1876,
"step": 12300
},
{
"epoch": 3.196700180458881,
"grad_norm": 0.46422523260116577,
"learning_rate": 0.0002,
"loss": 0.1717,
"step": 12400
},
{
"epoch": 3.222480020623872,
"grad_norm": 0.4504973888397217,
"learning_rate": 0.0002,
"loss": 0.1862,
"step": 12500
},
{
"epoch": 3.2482598607888633,
"grad_norm": 0.44676682353019714,
"learning_rate": 0.0002,
"loss": 0.1865,
"step": 12600
},
{
"epoch": 3.274039700953854,
"grad_norm": 0.44682949781417847,
"learning_rate": 0.0002,
"loss": 0.1797,
"step": 12700
},
{
"epoch": 3.299819541118845,
"grad_norm": 0.22240401804447174,
"learning_rate": 0.0002,
"loss": 0.1823,
"step": 12800
},
{
"epoch": 3.325599381283836,
"grad_norm": 0.3457636535167694,
"learning_rate": 0.0002,
"loss": 0.1839,
"step": 12900
},
{
"epoch": 3.351379221448827,
"grad_norm": 0.5065191388130188,
"learning_rate": 0.0002,
"loss": 0.1823,
"step": 13000
},
{
"epoch": 3.377159061613818,
"grad_norm": 0.516930341720581,
"learning_rate": 0.0002,
"loss": 0.1812,
"step": 13100
},
{
"epoch": 3.4029389017788088,
"grad_norm": 0.5823391079902649,
"learning_rate": 0.0002,
"loss": 0.1851,
"step": 13200
},
{
"epoch": 3.4287187419438,
"grad_norm": 0.4604497253894806,
"learning_rate": 0.0002,
"loss": 0.1897,
"step": 13300
},
{
"epoch": 3.454498582108791,
"grad_norm": 0.3871957063674927,
"learning_rate": 0.0002,
"loss": 0.1778,
"step": 13400
},
{
"epoch": 3.480278422273782,
"grad_norm": 0.40806278586387634,
"learning_rate": 0.0002,
"loss": 0.1854,
"step": 13500
},
{
"epoch": 3.506058262438773,
"grad_norm": 0.24849525094032288,
"learning_rate": 0.0002,
"loss": 0.1825,
"step": 13600
},
{
"epoch": 3.5318381026037637,
"grad_norm": 0.28265008330345154,
"learning_rate": 0.0002,
"loss": 0.1914,
"step": 13700
},
{
"epoch": 3.557617942768755,
"grad_norm": 0.18643364310264587,
"learning_rate": 0.0002,
"loss": 0.1728,
"step": 13800
},
{
"epoch": 3.583397782933746,
"grad_norm": 0.36125150322914124,
"learning_rate": 0.0002,
"loss": 0.184,
"step": 13900
},
{
"epoch": 3.609177623098737,
"grad_norm": 0.35003572702407837,
"learning_rate": 0.0002,
"loss": 0.1834,
"step": 14000
},
{
"epoch": 3.634957463263728,
"grad_norm": 0.29175901412963867,
"learning_rate": 0.0002,
"loss": 0.1845,
"step": 14100
},
{
"epoch": 3.6607373034287187,
"grad_norm": 0.37868496775627136,
"learning_rate": 0.0002,
"loss": 0.1893,
"step": 14200
},
{
"epoch": 3.6865171435937096,
"grad_norm": 0.3279033899307251,
"learning_rate": 0.0002,
"loss": 0.1908,
"step": 14300
},
{
"epoch": 3.7122969837587005,
"grad_norm": 0.31007370352745056,
"learning_rate": 0.0002,
"loss": 0.1832,
"step": 14400
},
{
"epoch": 3.7380768239236914,
"grad_norm": 0.298289030790329,
"learning_rate": 0.0002,
"loss": 0.1948,
"step": 14500
},
{
"epoch": 3.763856664088683,
"grad_norm": 0.6039551496505737,
"learning_rate": 0.0002,
"loss": 0.1828,
"step": 14600
},
{
"epoch": 3.7896365042536737,
"grad_norm": 0.449587345123291,
"learning_rate": 0.0002,
"loss": 0.1891,
"step": 14700
},
{
"epoch": 3.8154163444186646,
"grad_norm": 0.6465901136398315,
"learning_rate": 0.0002,
"loss": 0.1895,
"step": 14800
},
{
"epoch": 3.8411961845836555,
"grad_norm": 0.5226249098777771,
"learning_rate": 0.0002,
"loss": 0.1767,
"step": 14900
},
{
"epoch": 3.8669760247486464,
"grad_norm": 0.29470816254615784,
"learning_rate": 0.0002,
"loss": 0.1958,
"step": 15000
},
{
"epoch": 3.892755864913638,
"grad_norm": 0.4997386336326599,
"learning_rate": 0.0002,
"loss": 0.1984,
"step": 15100
},
{
"epoch": 3.9185357050786287,
"grad_norm": 0.35381177067756653,
"learning_rate": 0.0002,
"loss": 0.1839,
"step": 15200
},
{
"epoch": 3.9443155452436196,
"grad_norm": 0.29231759905815125,
"learning_rate": 0.0002,
"loss": 0.1812,
"step": 15300
},
{
"epoch": 3.9700953854086105,
"grad_norm": 0.40497833490371704,
"learning_rate": 0.0002,
"loss": 0.1798,
"step": 15400
},
{
"epoch": 3.9958752255736014,
"grad_norm": 0.1775328516960144,
"learning_rate": 0.0002,
"loss": 0.1931,
"step": 15500
},
{
"epoch": 4.021655065738592,
"grad_norm": 0.2625548243522644,
"learning_rate": 0.0002,
"loss": 0.1513,
"step": 15600
},
{
"epoch": 4.047434905903583,
"grad_norm": 0.47476592659950256,
"learning_rate": 0.0002,
"loss": 0.1607,
"step": 15700
},
{
"epoch": 4.073214746068574,
"grad_norm": 0.4454491138458252,
"learning_rate": 0.0002,
"loss": 0.1529,
"step": 15800
},
{
"epoch": 4.098994586233565,
"grad_norm": 0.12239188700914383,
"learning_rate": 0.0002,
"loss": 0.1539,
"step": 15900
},
{
"epoch": 4.124774426398556,
"grad_norm": 0.2339598536491394,
"learning_rate": 0.0002,
"loss": 0.1572,
"step": 16000
},
{
"epoch": 4.150554266563548,
"grad_norm": 0.19658803939819336,
"learning_rate": 0.0002,
"loss": 0.1571,
"step": 16100
},
{
"epoch": 4.176334106728539,
"grad_norm": 0.25842776894569397,
"learning_rate": 0.0002,
"loss": 0.155,
"step": 16200
},
{
"epoch": 4.20211394689353,
"grad_norm": 0.4655442535877228,
"learning_rate": 0.0002,
"loss": 0.1584,
"step": 16300
},
{
"epoch": 4.2278937870585205,
"grad_norm": 0.3778013586997986,
"learning_rate": 0.0002,
"loss": 0.1587,
"step": 16400
},
{
"epoch": 4.253673627223511,
"grad_norm": 0.22199797630310059,
"learning_rate": 0.0002,
"loss": 0.1573,
"step": 16500
},
{
"epoch": 4.279453467388502,
"grad_norm": 0.23724961280822754,
"learning_rate": 0.0002,
"loss": 0.1649,
"step": 16600
},
{
"epoch": 4.305233307553493,
"grad_norm": 0.4558769166469574,
"learning_rate": 0.0002,
"loss": 0.1633,
"step": 16700
},
{
"epoch": 4.331013147718484,
"grad_norm": 0.27720391750335693,
"learning_rate": 0.0002,
"loss": 0.1613,
"step": 16800
},
{
"epoch": 4.356792987883475,
"grad_norm": 0.3628349304199219,
"learning_rate": 0.0002,
"loss": 0.16,
"step": 16900
},
{
"epoch": 4.382572828048466,
"grad_norm": 0.6290438175201416,
"learning_rate": 0.0002,
"loss": 0.1658,
"step": 17000
},
{
"epoch": 4.408352668213457,
"grad_norm": 0.14983007311820984,
"learning_rate": 0.0002,
"loss": 0.1629,
"step": 17100
},
{
"epoch": 4.434132508378448,
"grad_norm": 0.30865323543548584,
"learning_rate": 0.0002,
"loss": 0.1603,
"step": 17200
},
{
"epoch": 4.459912348543439,
"grad_norm": 0.5674950480461121,
"learning_rate": 0.0002,
"loss": 0.1674,
"step": 17300
},
{
"epoch": 4.4856921887084305,
"grad_norm": 0.40429455041885376,
"learning_rate": 0.0002,
"loss": 0.1677,
"step": 17400
},
{
"epoch": 4.511472028873421,
"grad_norm": 0.27213749289512634,
"learning_rate": 0.0002,
"loss": 0.1642,
"step": 17500
},
{
"epoch": 4.537251869038412,
"grad_norm": 0.40964949131011963,
"learning_rate": 0.0002,
"loss": 0.1626,
"step": 17600
},
{
"epoch": 4.563031709203403,
"grad_norm": 0.3955250382423401,
"learning_rate": 0.0002,
"loss": 0.1564,
"step": 17700
},
{
"epoch": 4.588811549368394,
"grad_norm": 0.3900775611400604,
"learning_rate": 0.0002,
"loss": 0.1605,
"step": 17800
},
{
"epoch": 4.614591389533385,
"grad_norm": 0.2436327487230301,
"learning_rate": 0.0002,
"loss": 0.1603,
"step": 17900
},
{
"epoch": 4.640371229698376,
"grad_norm": 0.4188991189002991,
"learning_rate": 0.0002,
"loss": 0.163,
"step": 18000
},
{
"epoch": 4.666151069863367,
"grad_norm": 0.15686850249767303,
"learning_rate": 0.0002,
"loss": 0.1656,
"step": 18100
},
{
"epoch": 4.691930910028358,
"grad_norm": 0.30334389209747314,
"learning_rate": 0.0002,
"loss": 0.1612,
"step": 18200
},
{
"epoch": 4.717710750193349,
"grad_norm": 0.33619073033332825,
"learning_rate": 0.0002,
"loss": 0.1626,
"step": 18300
},
{
"epoch": 4.7434905903583395,
"grad_norm": 0.20497629046440125,
"learning_rate": 0.0002,
"loss": 0.1647,
"step": 18400
},
{
"epoch": 4.76927043052333,
"grad_norm": 0.20428726077079773,
"learning_rate": 0.0002,
"loss": 0.1726,
"step": 18500
},
{
"epoch": 4.795050270688321,
"grad_norm": 0.3606746196746826,
"learning_rate": 0.0002,
"loss": 0.1638,
"step": 18600
},
{
"epoch": 4.820830110853313,
"grad_norm": 0.3441687226295471,
"learning_rate": 0.0002,
"loss": 0.1676,
"step": 18700
},
{
"epoch": 4.846609951018304,
"grad_norm": 0.3479159474372864,
"learning_rate": 0.0002,
"loss": 0.1654,
"step": 18800
},
{
"epoch": 4.872389791183295,
"grad_norm": 0.39751461148262024,
"learning_rate": 0.0002,
"loss": 0.1592,
"step": 18900
},
{
"epoch": 4.898169631348286,
"grad_norm": 0.1793346256017685,
"learning_rate": 0.0002,
"loss": 0.1683,
"step": 19000
},
{
"epoch": 4.923949471513277,
"grad_norm": 0.100714772939682,
"learning_rate": 0.0002,
"loss": 0.1592,
"step": 19100
},
{
"epoch": 4.949729311678268,
"grad_norm": 0.6268895864486694,
"learning_rate": 0.0002,
"loss": 0.1667,
"step": 19200
},
{
"epoch": 4.975509151843259,
"grad_norm": 0.32232895493507385,
"learning_rate": 0.0002,
"loss": 0.1615,
"step": 19300
},
{
"epoch": 5.0012889920082495,
"grad_norm": 0.3094789683818817,
"learning_rate": 0.0002,
"loss": 0.1648,
"step": 19400
},
{
"epoch": 5.02706883217324,
"grad_norm": 0.3806459307670593,
"learning_rate": 0.0002,
"loss": 0.149,
"step": 19500
},
{
"epoch": 5.052848672338231,
"grad_norm": 0.28195375204086304,
"learning_rate": 0.0002,
"loss": 0.1409,
"step": 19600
},
{
"epoch": 5.078628512503222,
"grad_norm": 0.1819002479314804,
"learning_rate": 0.0002,
"loss": 0.1403,
"step": 19700
},
{
"epoch": 5.104408352668213,
"grad_norm": 0.27728572487831116,
"learning_rate": 0.0002,
"loss": 0.1426,
"step": 19800
},
{
"epoch": 5.130188192833204,
"grad_norm": 0.21889761090278625,
"learning_rate": 0.0002,
"loss": 0.1499,
"step": 19900
},
{
"epoch": 5.155968032998196,
"grad_norm": 0.3974555432796478,
"learning_rate": 0.0002,
"loss": 0.1427,
"step": 20000
},
{
"epoch": 5.181747873163187,
"grad_norm": 0.48159608244895935,
"learning_rate": 0.0002,
"loss": 0.1477,
"step": 20100
},
{
"epoch": 5.207527713328178,
"grad_norm": 0.3865210711956024,
"learning_rate": 0.0002,
"loss": 0.1424,
"step": 20200
},
{
"epoch": 5.233307553493169,
"grad_norm": 0.26485195755958557,
"learning_rate": 0.0002,
"loss": 0.1486,
"step": 20300
},
{
"epoch": 5.2590873936581595,
"grad_norm": 0.41939619183540344,
"learning_rate": 0.0002,
"loss": 0.151,
"step": 20400
},
{
"epoch": 5.28486723382315,
"grad_norm": 0.3483380973339081,
"learning_rate": 0.0002,
"loss": 0.1475,
"step": 20500
},
{
"epoch": 5.310647073988141,
"grad_norm": 0.40975695848464966,
"learning_rate": 0.0002,
"loss": 0.1461,
"step": 20600
},
{
"epoch": 5.336426914153132,
"grad_norm": 0.27101436257362366,
"learning_rate": 0.0002,
"loss": 0.1528,
"step": 20700
},
{
"epoch": 5.362206754318123,
"grad_norm": 0.27852606773376465,
"learning_rate": 0.0002,
"loss": 0.1484,
"step": 20800
},
{
"epoch": 5.387986594483114,
"grad_norm": 0.4176689684391022,
"learning_rate": 0.0002,
"loss": 0.1485,
"step": 20900
},
{
"epoch": 5.413766434648105,
"grad_norm": 0.4901387691497803,
"learning_rate": 0.0002,
"loss": 0.1479,
"step": 21000
},
{
"epoch": 5.439546274813096,
"grad_norm": 0.33768975734710693,
"learning_rate": 0.0002,
"loss": 0.15,
"step": 21100
},
{
"epoch": 5.465326114978087,
"grad_norm": 0.5349870324134827,
"learning_rate": 0.0002,
"loss": 0.1485,
"step": 21200
},
{
"epoch": 5.4911059551430785,
"grad_norm": 0.24405865371227264,
"learning_rate": 0.0002,
"loss": 0.146,
"step": 21300
},
{
"epoch": 5.516885795308069,
"grad_norm": 0.2870001494884491,
"learning_rate": 0.0002,
"loss": 0.1482,
"step": 21400
},
{
"epoch": 5.54266563547306,
"grad_norm": 0.34606364369392395,
"learning_rate": 0.0002,
"loss": 0.1535,
"step": 21500
},
{
"epoch": 5.568445475638051,
"grad_norm": 0.4999238848686218,
"learning_rate": 0.0002,
"loss": 0.1523,
"step": 21600
},
{
"epoch": 5.594225315803042,
"grad_norm": 0.2526559829711914,
"learning_rate": 0.0002,
"loss": 0.1524,
"step": 21700
},
{
"epoch": 5.620005155968033,
"grad_norm": 0.270786315202713,
"learning_rate": 0.0002,
"loss": 0.1511,
"step": 21800
},
{
"epoch": 5.645784996133024,
"grad_norm": 0.4440493881702423,
"learning_rate": 0.0002,
"loss": 0.1539,
"step": 21900
},
{
"epoch": 5.671564836298015,
"grad_norm": 0.4871107041835785,
"learning_rate": 0.0002,
"loss": 0.1505,
"step": 22000
},
{
"epoch": 5.697344676463006,
"grad_norm": 0.40973493456840515,
"learning_rate": 0.0002,
"loss": 0.1553,
"step": 22100
},
{
"epoch": 5.723124516627997,
"grad_norm": 0.4365851581096649,
"learning_rate": 0.0002,
"loss": 0.1502,
"step": 22200
},
{
"epoch": 5.748904356792988,
"grad_norm": 0.5478639602661133,
"learning_rate": 0.0002,
"loss": 0.1611,
"step": 22300
},
{
"epoch": 5.7746841969579785,
"grad_norm": 0.29485803842544556,
"learning_rate": 0.0002,
"loss": 0.157,
"step": 22400
},
{
"epoch": 5.800464037122969,
"grad_norm": 0.20778502523899078,
"learning_rate": 0.0002,
"loss": 0.1489,
"step": 22500
},
{
"epoch": 5.826243877287961,
"grad_norm": 0.1795939952135086,
"learning_rate": 0.0002,
"loss": 0.1517,
"step": 22600
},
{
"epoch": 5.852023717452952,
"grad_norm": 0.4165894687175751,
"learning_rate": 0.0002,
"loss": 0.1464,
"step": 22700
},
{
"epoch": 5.877803557617943,
"grad_norm": 0.35076722502708435,
"learning_rate": 0.0002,
"loss": 0.1499,
"step": 22800
},
{
"epoch": 5.903583397782934,
"grad_norm": 0.3190014362335205,
"learning_rate": 0.0002,
"loss": 0.1474,
"step": 22900
},
{
"epoch": 5.929363237947925,
"grad_norm": 0.6232258081436157,
"learning_rate": 0.0002,
"loss": 0.1521,
"step": 23000
},
{
"epoch": 5.955143078112916,
"grad_norm": 0.41889217495918274,
"learning_rate": 0.0002,
"loss": 0.1553,
"step": 23100
},
{
"epoch": 5.980922918277907,
"grad_norm": 0.4977259635925293,
"learning_rate": 0.0002,
"loss": 0.1543,
"step": 23200
},
{
"epoch": 6.006702758442898,
"grad_norm": 0.3092762231826782,
"learning_rate": 0.0002,
"loss": 0.145,
"step": 23300
},
{
"epoch": 6.0324825986078885,
"grad_norm": 0.15745452046394348,
"learning_rate": 0.0002,
"loss": 0.138,
"step": 23400
},
{
"epoch": 6.058262438772879,
"grad_norm": 0.10685788840055466,
"learning_rate": 0.0002,
"loss": 0.1345,
"step": 23500
},
{
"epoch": 6.08404227893787,
"grad_norm": 0.41699907183647156,
"learning_rate": 0.0002,
"loss": 0.1379,
"step": 23600
},
{
"epoch": 6.109822119102861,
"grad_norm": 0.18783129751682281,
"learning_rate": 0.0002,
"loss": 0.1306,
"step": 23700
},
{
"epoch": 6.135601959267852,
"grad_norm": 0.15569710731506348,
"learning_rate": 0.0002,
"loss": 0.1372,
"step": 23800
},
{
"epoch": 6.161381799432844,
"grad_norm": 0.4492259919643402,
"learning_rate": 0.0002,
"loss": 0.1414,
"step": 23900
},
{
"epoch": 6.187161639597835,
"grad_norm": 0.1448894888162613,
"learning_rate": 0.0002,
"loss": 0.1376,
"step": 24000
},
{
"epoch": 6.212941479762826,
"grad_norm": 0.2028491050004959,
"learning_rate": 0.0002,
"loss": 0.1349,
"step": 24100
},
{
"epoch": 6.238721319927817,
"grad_norm": 0.19205012917518616,
"learning_rate": 0.0002,
"loss": 0.1396,
"step": 24200
},
{
"epoch": 6.2645011600928076,
"grad_norm": 0.29885369539260864,
"learning_rate": 0.0002,
"loss": 0.1449,
"step": 24300
},
{
"epoch": 6.2902810002577985,
"grad_norm": 0.15814617276191711,
"learning_rate": 0.0002,
"loss": 0.1438,
"step": 24400
},
{
"epoch": 6.316060840422789,
"grad_norm": 0.2691551148891449,
"learning_rate": 0.0002,
"loss": 0.1406,
"step": 24500
},
{
"epoch": 6.34184068058778,
"grad_norm": 0.543335497379303,
"learning_rate": 0.0002,
"loss": 0.1389,
"step": 24600
},
{
"epoch": 6.367620520752771,
"grad_norm": 0.33116665482521057,
"learning_rate": 0.0002,
"loss": 0.1403,
"step": 24700
},
{
"epoch": 6.393400360917762,
"grad_norm": 0.5159612894058228,
"learning_rate": 0.0002,
"loss": 0.1408,
"step": 24800
},
{
"epoch": 6.419180201082753,
"grad_norm": 0.30205056071281433,
"learning_rate": 0.0002,
"loss": 0.1409,
"step": 24900
},
{
"epoch": 6.444960041247744,
"grad_norm": 0.44916966557502747,
"learning_rate": 0.0002,
"loss": 0.1432,
"step": 25000
},
{
"epoch": 6.470739881412735,
"grad_norm": 0.18665899336338043,
"learning_rate": 0.0002,
"loss": 0.1434,
"step": 25100
},
{
"epoch": 6.496519721577727,
"grad_norm": 0.4078758656978607,
"learning_rate": 0.0002,
"loss": 0.1411,
"step": 25200
},
{
"epoch": 6.5222995617427175,
"grad_norm": 0.39813536405563354,
"learning_rate": 0.0002,
"loss": 0.1445,
"step": 25300
},
{
"epoch": 6.548079401907708,
"grad_norm": 0.2587377727031708,
"learning_rate": 0.0002,
"loss": 0.1463,
"step": 25400
},
{
"epoch": 6.573859242072699,
"grad_norm": 0.41181057691574097,
"learning_rate": 0.0002,
"loss": 0.1487,
"step": 25500
},
{
"epoch": 6.59963908223769,
"grad_norm": 0.3136518597602844,
"learning_rate": 0.0002,
"loss": 0.1414,
"step": 25600
},
{
"epoch": 6.625418922402681,
"grad_norm": 0.4114777445793152,
"learning_rate": 0.0002,
"loss": 0.1434,
"step": 25700
},
{
"epoch": 6.651198762567672,
"grad_norm": 0.17142866551876068,
"learning_rate": 0.0002,
"loss": 0.1411,
"step": 25800
},
{
"epoch": 6.676978602732663,
"grad_norm": 0.5585296750068665,
"learning_rate": 0.0002,
"loss": 0.148,
"step": 25900
},
{
"epoch": 6.702758442897654,
"grad_norm": 0.23773185908794403,
"learning_rate": 0.0002,
"loss": 0.1468,
"step": 26000
},
{
"epoch": 6.728538283062645,
"grad_norm": 0.38246840238571167,
"learning_rate": 0.0002,
"loss": 0.1426,
"step": 26100
},
{
"epoch": 6.754318123227636,
"grad_norm": 0.5393186807632446,
"learning_rate": 0.0002,
"loss": 0.1456,
"step": 26200
},
{
"epoch": 6.780097963392627,
"grad_norm": 0.21433015167713165,
"learning_rate": 0.0002,
"loss": 0.1456,
"step": 26300
},
{
"epoch": 6.8058778035576175,
"grad_norm": 0.4375258982181549,
"learning_rate": 0.0002,
"loss": 0.1461,
"step": 26400
},
{
"epoch": 6.831657643722609,
"grad_norm": 0.515832781791687,
"learning_rate": 0.0002,
"loss": 0.1484,
"step": 26500
},
{
"epoch": 6.8574374838876,
"grad_norm": 0.496559739112854,
"learning_rate": 0.0002,
"loss": 0.1461,
"step": 26600
},
{
"epoch": 6.883217324052591,
"grad_norm": 0.30182015895843506,
"learning_rate": 0.0002,
"loss": 0.1471,
"step": 26700
},
{
"epoch": 6.908997164217582,
"grad_norm": 0.3858971893787384,
"learning_rate": 0.0002,
"loss": 0.1469,
"step": 26800
},
{
"epoch": 6.934777004382573,
"grad_norm": 0.30368533730506897,
"learning_rate": 0.0002,
"loss": 0.1466,
"step": 26900
},
{
"epoch": 6.960556844547564,
"grad_norm": 0.29557520151138306,
"learning_rate": 0.0002,
"loss": 0.1446,
"step": 27000
},
{
"epoch": 6.986336684712555,
"grad_norm": 0.34702664613723755,
"learning_rate": 0.0002,
"loss": 0.143,
"step": 27100
},
{
"epoch": 7.012116524877546,
"grad_norm": 0.18182627856731415,
"learning_rate": 0.0002,
"loss": 0.1467,
"step": 27200
},
{
"epoch": 7.037896365042537,
"grad_norm": 0.48641154170036316,
"learning_rate": 0.0002,
"loss": 0.1337,
"step": 27300
},
{
"epoch": 7.0636762052075275,
"grad_norm": 0.5797538757324219,
"learning_rate": 0.0002,
"loss": 0.1291,
"step": 27400
},
{
"epoch": 7.089456045372518,
"grad_norm": 0.20399855077266693,
"learning_rate": 0.0002,
"loss": 0.1372,
"step": 27500
},
{
"epoch": 7.115235885537509,
"grad_norm": 0.12141354382038116,
"learning_rate": 0.0002,
"loss": 0.1359,
"step": 27600
},
{
"epoch": 7.1410157257025,
"grad_norm": 0.13764117658138275,
"learning_rate": 0.0002,
"loss": 0.1276,
"step": 27700
},
{
"epoch": 7.166795565867492,
"grad_norm": 0.21888123452663422,
"learning_rate": 0.0002,
"loss": 0.1337,
"step": 27800
},
{
"epoch": 7.192575406032483,
"grad_norm": 0.1562834531068802,
"learning_rate": 0.0002,
"loss": 0.133,
"step": 27900
},
{
"epoch": 7.218355246197474,
"grad_norm": 0.3367880880832672,
"learning_rate": 0.0002,
"loss": 0.1335,
"step": 28000
},
{
"epoch": 7.244135086362465,
"grad_norm": 0.1075579896569252,
"learning_rate": 0.0002,
"loss": 0.1334,
"step": 28100
},
{
"epoch": 7.269914926527456,
"grad_norm": 0.11283877491950989,
"learning_rate": 0.0002,
"loss": 0.1356,
"step": 28200
},
{
"epoch": 7.2956947666924465,
"grad_norm": 0.24768362939357758,
"learning_rate": 0.0002,
"loss": 0.1374,
"step": 28300
},
{
"epoch": 7.3214746068574375,
"grad_norm": 0.22776305675506592,
"learning_rate": 0.0002,
"loss": 0.1307,
"step": 28400
},
{
"epoch": 7.347254447022428,
"grad_norm": 0.13827867805957794,
"learning_rate": 0.0002,
"loss": 0.1396,
"step": 28500
},
{
"epoch": 7.373034287187419,
"grad_norm": 0.2935916781425476,
"learning_rate": 0.0002,
"loss": 0.1355,
"step": 28600
},
{
"epoch": 7.39881412735241,
"grad_norm": 0.10991048812866211,
"learning_rate": 0.0002,
"loss": 0.1349,
"step": 28700
},
{
"epoch": 7.424593967517401,
"grad_norm": 0.30149704217910767,
"learning_rate": 0.0002,
"loss": 0.1374,
"step": 28800
},
{
"epoch": 7.450373807682392,
"grad_norm": 0.13918708264827728,
"learning_rate": 0.0002,
"loss": 0.141,
"step": 28900
},
{
"epoch": 7.476153647847383,
"grad_norm": 0.13292869925498962,
"learning_rate": 0.0002,
"loss": 0.1386,
"step": 29000
},
{
"epoch": 7.501933488012375,
"grad_norm": 0.5602275729179382,
"learning_rate": 0.0002,
"loss": 0.1421,
"step": 29100
},
{
"epoch": 7.527713328177366,
"grad_norm": 0.12204320728778839,
"learning_rate": 0.0002,
"loss": 0.1334,
"step": 29200
},
{
"epoch": 7.5534931683423565,
"grad_norm": 0.17424637079238892,
"learning_rate": 0.0002,
"loss": 0.1372,
"step": 29300
},
{
"epoch": 7.579273008507347,
"grad_norm": 0.4190254509449005,
"learning_rate": 0.0002,
"loss": 0.1458,
"step": 29400
},
{
"epoch": 7.605052848672338,
"grad_norm": 0.13242638111114502,
"learning_rate": 0.0002,
"loss": 0.1421,
"step": 29500
},
{
"epoch": 7.630832688837329,
"grad_norm": 0.23242244124412537,
"learning_rate": 0.0002,
"loss": 0.1429,
"step": 29600
},
{
"epoch": 7.65661252900232,
"grad_norm": 0.4323575794696808,
"learning_rate": 0.0002,
"loss": 0.1402,
"step": 29700
},
{
"epoch": 7.682392369167311,
"grad_norm": 0.1595413088798523,
"learning_rate": 0.0002,
"loss": 0.1403,
"step": 29800
},
{
"epoch": 7.708172209332302,
"grad_norm": 0.1448589414358139,
"learning_rate": 0.0002,
"loss": 0.136,
"step": 29900
},
{
"epoch": 7.733952049497293,
"grad_norm": 0.5433810353279114,
"learning_rate": 0.0002,
"loss": 0.139,
"step": 30000
}
],
"logging_steps": 100,
"max_steps": 31032,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 3000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.186321886206116e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}