Invalid JSON: Unexpected token 'I', ..."_metric": Infinity,
"... is not valid JSON
| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 100, | |
| "global_step": 2382, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0012594458438287153, | |
| "eval_loss": null, | |
| "eval_runtime": 24.0728, | |
| "eval_samples_per_second": 1.745, | |
| "eval_steps_per_second": 1.745, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.012594458438287154, | |
| "grad_norm": 10.215713500976562, | |
| "learning_rate": 0.0001, | |
| "loss": 4.9227, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.02518891687657431, | |
| "grad_norm": 11.19245433807373, | |
| "learning_rate": 0.0002, | |
| "loss": 3.6568, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.037783375314861464, | |
| "grad_norm": 17.468347549438477, | |
| "learning_rate": 0.0003, | |
| "loss": 3.4374, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.05037783375314862, | |
| "grad_norm": 111.16232299804688, | |
| "learning_rate": 0.0004, | |
| "loss": 7.2007, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.06297229219143577, | |
| "grad_norm": 174.66659545898438, | |
| "learning_rate": 0.0005, | |
| "loss": 8.4586, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.07556675062972293, | |
| "grad_norm": 600.462890625, | |
| "learning_rate": 0.0006, | |
| "loss": 8.467, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.08816120906801007, | |
| "grad_norm": 24.518911361694336, | |
| "learning_rate": 0.0007, | |
| "loss": 8.2763, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.10075566750629723, | |
| "grad_norm": 8.935935974121094, | |
| "learning_rate": 0.0008, | |
| "loss": 9.2561, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.11335012594458438, | |
| "grad_norm": 27.04026985168457, | |
| "learning_rate": 0.0009000000000000001, | |
| "loss": 7.731, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.12594458438287154, | |
| "grad_norm": 56.256690979003906, | |
| "learning_rate": 0.001, | |
| "loss": 7.8286, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.12594458438287154, | |
| "eval_loss": null, | |
| "eval_runtime": 15.4265, | |
| "eval_samples_per_second": 2.723, | |
| "eval_steps_per_second": 2.723, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.1385390428211587, | |
| "grad_norm": 27.969837188720703, | |
| "learning_rate": 0.000999953093091659, | |
| "loss": 8.0009, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.15113350125944586, | |
| "grad_norm": 20.14385223388672, | |
| "learning_rate": 0.0009998123812565674, | |
| "loss": 7.8523, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.163727959697733, | |
| "grad_norm": 40.78677749633789, | |
| "learning_rate": 0.000999577891162835, | |
| "loss": 7.9675, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.17632241813602015, | |
| "grad_norm": 27.47247886657715, | |
| "learning_rate": 0.000999249667251695, | |
| "loss": 7.6457, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.1889168765743073, | |
| "grad_norm": 16.2076473236084, | |
| "learning_rate": 0.000998827771729083, | |
| "loss": 7.8101, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.20151133501259447, | |
| "grad_norm": 22.394704818725586, | |
| "learning_rate": 0.0009983122845538459, | |
| "loss": 7.2711, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.2141057934508816, | |
| "grad_norm": 37.36329650878906, | |
| "learning_rate": 0.0009977033034225882, | |
| "loss": 7.0109, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.22670025188916876, | |
| "grad_norm": 17.58085823059082, | |
| "learning_rate": 0.0009970009437511567, | |
| "loss": 7.5939, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.23929471032745592, | |
| "grad_norm": 44.42656326293945, | |
| "learning_rate": 0.0009962053386527666, | |
| "loss": 7.5965, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.2518891687657431, | |
| "grad_norm": 13.50401782989502, | |
| "learning_rate": 0.0009953166389127734, | |
| "loss": 7.2486, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.2518891687657431, | |
| "eval_loss": null, | |
| "eval_runtime": 15.2724, | |
| "eval_samples_per_second": 2.75, | |
| "eval_steps_per_second": 2.75, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.26448362720403024, | |
| "grad_norm": 22.640573501586914, | |
| "learning_rate": 0.000994335012960095, | |
| "loss": 6.7046, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.2770780856423174, | |
| "grad_norm": 29.585573196411133, | |
| "learning_rate": 0.0009932606468352912, | |
| "loss": 8.0273, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.28967254408060455, | |
| "grad_norm": 28.875736236572266, | |
| "learning_rate": 0.0009920937441553052, | |
| "loss": 7.3677, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.3022670025188917, | |
| "grad_norm": 35.41697311401367, | |
| "learning_rate": 0.0009908345260748724, | |
| "loss": 7.3579, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.3148614609571788, | |
| "grad_norm": 29.97820472717285, | |
| "learning_rate": 0.000989483231244607, | |
| "loss": 7.0202, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.327455919395466, | |
| "grad_norm": 13.438394546508789, | |
| "learning_rate": 0.000988040115765773, | |
| "loss": 6.742, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.34005037783375314, | |
| "grad_norm": 24.55386734008789, | |
| "learning_rate": 0.000986505453141746, | |
| "loss": 6.7159, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.3526448362720403, | |
| "grad_norm": 31.250648498535156, | |
| "learning_rate": 0.0009848795342261784, | |
| "loss": 7.6011, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.36523929471032746, | |
| "grad_norm": 17.08733558654785, | |
| "learning_rate": 0.000983162667167877, | |
| "loss": 7.5208, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.3778337531486146, | |
| "grad_norm": 22.433013916015625, | |
| "learning_rate": 0.0009813551773523999, | |
| "loss": 7.2601, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.3778337531486146, | |
| "eval_loss": null, | |
| "eval_runtime": 15.4322, | |
| "eval_samples_per_second": 2.722, | |
| "eval_steps_per_second": 2.722, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.3904282115869018, | |
| "grad_norm": 22.491186141967773, | |
| "learning_rate": 0.00097945740734039, | |
| "loss": 7.1465, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.40302267002518893, | |
| "grad_norm": 24.347196578979492, | |
| "learning_rate": 0.0009774697168026514, | |
| "loss": 7.1475, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.4156171284634761, | |
| "grad_norm": 10.394509315490723, | |
| "learning_rate": 0.0009753924824519837, | |
| "loss": 7.0914, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.4282115869017632, | |
| "grad_norm": 24.985843658447266, | |
| "learning_rate": 0.0009732260979717857, | |
| "loss": 6.985, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.44080604534005036, | |
| "grad_norm": 20.668533325195312, | |
| "learning_rate": 0.0009709709739414448, | |
| "loss": 6.2001, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.4534005037783375, | |
| "grad_norm": 15.814973831176758, | |
| "learning_rate": 0.0009686275377585208, | |
| "loss": 7.0025, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.4659949622166247, | |
| "grad_norm": 14.465486526489258, | |
| "learning_rate": 0.0009661962335577461, | |
| "loss": 6.8131, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.47858942065491183, | |
| "grad_norm": 22.262863159179688, | |
| "learning_rate": 0.0009636775221268514, | |
| "loss": 6.7339, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.491183879093199, | |
| "grad_norm": 13.883723258972168, | |
| "learning_rate": 0.0009610718808192354, | |
| "loss": 7.1253, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.5037783375314862, | |
| "grad_norm": 10.253217697143555, | |
| "learning_rate": 0.0009583798034634963, | |
| "loss": 8.2142, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5037783375314862, | |
| "eval_loss": null, | |
| "eval_runtime": 15.6368, | |
| "eval_samples_per_second": 2.686, | |
| "eval_steps_per_second": 2.686, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5163727959697733, | |
| "grad_norm": 18.610076904296875, | |
| "learning_rate": 0.0009556018002698389, | |
| "loss": 7.7783, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.5289672544080605, | |
| "grad_norm": 18.9676513671875, | |
| "learning_rate": 0.0009527383977333791, | |
| "loss": 7.3047, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.5415617128463476, | |
| "grad_norm": 6.478220462799072, | |
| "learning_rate": 0.0009497901385343608, | |
| "loss": 7.6647, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.5541561712846348, | |
| "grad_norm": 15.899592399597168, | |
| "learning_rate": 0.0009467575814353052, | |
| "loss": 7.2044, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.5667506297229219, | |
| "grad_norm": 7.312307834625244, | |
| "learning_rate": 0.0009436413011751129, | |
| "loss": 6.7378, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.5793450881612091, | |
| "grad_norm": 9.657133102416992, | |
| "learning_rate": 0.0009404418883601375, | |
| "loss": 7.3231, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.5919395465994962, | |
| "grad_norm": 6.748540878295898, | |
| "learning_rate": 0.000937159949352252, | |
| "loss": 7.4876, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.6045340050377834, | |
| "grad_norm": 5.967790603637695, | |
| "learning_rate": 0.0009337961061539308, | |
| "loss": 7.5902, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.6171284634760705, | |
| "grad_norm": 3.245502233505249, | |
| "learning_rate": 0.0009303509962903634, | |
| "loss": 7.1788, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.6297229219143576, | |
| "grad_norm": 7.3367919921875, | |
| "learning_rate": 0.0009268252726886314, | |
| "loss": 7.1902, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.6297229219143576, | |
| "eval_loss": null, | |
| "eval_runtime": 15.4155, | |
| "eval_samples_per_second": 2.725, | |
| "eval_steps_per_second": 2.725, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.6423173803526449, | |
| "grad_norm": 2.903081178665161, | |
| "learning_rate": 0.0009232196035539623, | |
| "loss": 7.236, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.654911838790932, | |
| "grad_norm": 3.9663569927215576, | |
| "learning_rate": 0.0009195346722430898, | |
| "loss": 7.3749, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.6675062972292192, | |
| "grad_norm": 6.435197830200195, | |
| "learning_rate": 0.0009157711771347421, | |
| "loss": 7.5964, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.6801007556675063, | |
| "grad_norm": 4.062136173248291, | |
| "learning_rate": 0.0009119298314972834, | |
| "loss": 6.7231, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.6926952141057935, | |
| "grad_norm": 3.391726493835449, | |
| "learning_rate": 0.0009080113633535325, | |
| "loss": 6.9205, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.7052896725440806, | |
| "grad_norm": 3.272814989089966, | |
| "learning_rate": 0.0009040165153427869, | |
| "loss": 6.6434, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.7178841309823678, | |
| "grad_norm": 1.8474681377410889, | |
| "learning_rate": 0.0008999460445800744, | |
| "loss": 7.1821, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.7304785894206549, | |
| "grad_norm": 1.8579058647155762, | |
| "learning_rate": 0.0008958007225126639, | |
| "loss": 7.408, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.743073047858942, | |
| "grad_norm": 2.2286558151245117, | |
| "learning_rate": 0.0008915813347738575, | |
| "loss": 6.8997, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.7556675062972292, | |
| "grad_norm": 2.628117322921753, | |
| "learning_rate": 0.0008872886810340951, | |
| "loss": 6.3799, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.7556675062972292, | |
| "eval_loss": null, | |
| "eval_runtime": 15.1353, | |
| "eval_samples_per_second": 2.775, | |
| "eval_steps_per_second": 2.775, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.7682619647355163, | |
| "grad_norm": 1.8156355619430542, | |
| "learning_rate": 0.000882923574849399, | |
| "loss": 7.1042, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.7808564231738035, | |
| "grad_norm": 2.4351723194122314, | |
| "learning_rate": 0.0008784868435071861, | |
| "loss": 6.9231, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.7934508816120907, | |
| "grad_norm": 2.3676507472991943, | |
| "learning_rate": 0.0008739793278694784, | |
| "loss": 7.0717, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.8060453400503779, | |
| "grad_norm": 2.2056849002838135, | |
| "learning_rate": 0.0008694018822135399, | |
| "loss": 6.9117, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.818639798488665, | |
| "grad_norm": 1.623586893081665, | |
| "learning_rate": 0.000864755374069972, | |
| "loss": 6.6581, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.8312342569269522, | |
| "grad_norm": 2.0398213863372803, | |
| "learning_rate": 0.0008600406840582961, | |
| "loss": 7.1913, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.8438287153652393, | |
| "grad_norm": 1.6448144912719727, | |
| "learning_rate": 0.0008552587057200568, | |
| "loss": 6.8668, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.8564231738035264, | |
| "grad_norm": 1.1676232814788818, | |
| "learning_rate": 0.0008504103453494745, | |
| "loss": 6.5517, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.8690176322418136, | |
| "grad_norm": 19.695444107055664, | |
| "learning_rate": 0.0008454965218216828, | |
| "loss": 6.0138, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.8816120906801007, | |
| "grad_norm": 1.936806082725525, | |
| "learning_rate": 0.0008405181664185797, | |
| "loss": 6.7115, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.8816120906801007, | |
| "eval_loss": null, | |
| "eval_runtime": 15.3262, | |
| "eval_samples_per_second": 2.74, | |
| "eval_steps_per_second": 2.74, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.8942065491183879, | |
| "grad_norm": 1.0898339748382568, | |
| "learning_rate": 0.0008354762226523293, | |
| "loss": 7.2575, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.906801007556675, | |
| "grad_norm": 1.8871369361877441, | |
| "learning_rate": 0.0008303716460865444, | |
| "loss": 7.1923, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.9193954659949622, | |
| "grad_norm": 1.402613639831543, | |
| "learning_rate": 0.0008252054041551846, | |
| "loss": 6.6392, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.9319899244332494, | |
| "grad_norm": 2.2640938758850098, | |
| "learning_rate": 0.0008199784759792064, | |
| "loss": 7.136, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.9445843828715366, | |
| "grad_norm": 0.9886698722839355, | |
| "learning_rate": 0.0008146918521809975, | |
| "loss": 6.6487, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.9571788413098237, | |
| "grad_norm": 1.0600054264068604, | |
| "learning_rate": 0.0008093465346966299, | |
| "loss": 6.9025, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.9697732997481109, | |
| "grad_norm": 1.1450759172439575, | |
| "learning_rate": 0.0008039435365859717, | |
| "loss": 7.0955, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.982367758186398, | |
| "grad_norm": 1.5099146366119385, | |
| "learning_rate": 0.0007984838818406879, | |
| "loss": 6.3033, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.9949622166246851, | |
| "grad_norm": 0.9716188311576843, | |
| "learning_rate": 0.000792968605190172, | |
| "loss": 6.8283, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.0075566750629723, | |
| "grad_norm": 2.8636345863342285, | |
| "learning_rate": 0.0007873987519054385, | |
| "loss": 6.0414, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.0075566750629723, | |
| "eval_loss": null, | |
| "eval_runtime": 15.4227, | |
| "eval_samples_per_second": 2.723, | |
| "eval_steps_per_second": 2.723, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.0201511335012594, | |
| "grad_norm": 2.396373748779297, | |
| "learning_rate": 0.0007817753776010232, | |
| "loss": 6.3219, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.0327455919395465, | |
| "grad_norm": 1.2031290531158447, | |
| "learning_rate": 0.0007760995480349184, | |
| "loss": 6.2492, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.0453400503778338, | |
| "grad_norm": 3.029849052429199, | |
| "learning_rate": 0.0007703723389065873, | |
| "loss": 6.4878, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.057934508816121, | |
| "grad_norm": 3.050990104675293, | |
| "learning_rate": 0.0007645948356530953, | |
| "loss": 5.9352, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.070528967254408, | |
| "grad_norm": 1.3113600015640259, | |
| "learning_rate": 0.000758768133243394, | |
| "loss": 6.3298, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.0831234256926952, | |
| "grad_norm": 1.306564450263977, | |
| "learning_rate": 0.0007528933359708001, | |
| "loss": 6.4415, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.0957178841309823, | |
| "grad_norm": 1.9978339672088623, | |
| "learning_rate": 0.0007469715572437062, | |
| "loss": 6.4877, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.1083123425692696, | |
| "grad_norm": 1.6242040395736694, | |
| "learning_rate": 0.0007410039193745646, | |
| "loss": 6.7378, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.1209068010075567, | |
| "grad_norm": 1.457687258720398, | |
| "learning_rate": 0.0007349915533671839, | |
| "loss": 5.9593, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.1335012594458438, | |
| "grad_norm": 12.794464111328125, | |
| "learning_rate": 0.0007289355987023768, | |
| "loss": 6.428, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.1335012594458438, | |
| "eval_loss": null, | |
| "eval_runtime": 15.3011, | |
| "eval_samples_per_second": 2.745, | |
| "eval_steps_per_second": 2.745, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.146095717884131, | |
| "grad_norm": 1.6018197536468506, | |
| "learning_rate": 0.0007228372031220035, | |
| "loss": 6.0791, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.1586901763224182, | |
| "grad_norm": 1.2083125114440918, | |
| "learning_rate": 0.000716697522411448, | |
| "loss": 6.1847, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.1712846347607053, | |
| "grad_norm": 1.2754706144332886, | |
| "learning_rate": 0.0007105177201805702, | |
| "loss": 6.5466, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.1838790931989924, | |
| "grad_norm": 2407.5634765625, | |
| "learning_rate": 0.0007042989676431752, | |
| "loss": 6.0645, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.1964735516372795, | |
| "grad_norm": 3.1770615577697754, | |
| "learning_rate": 0.0006980424433950421, | |
| "loss": 13.3282, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.2090680100755669, | |
| "grad_norm": 4.1725993156433105, | |
| "learning_rate": 0.0006917493331905523, | |
| "loss": 6.1844, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.221662468513854, | |
| "grad_norm": 2.3469631671905518, | |
| "learning_rate": 0.0006854208297179626, | |
| "loss": 6.1732, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.234256926952141, | |
| "grad_norm": 4.253868579864502, | |
| "learning_rate": 0.0006790581323733633, | |
| "loss": 6.6594, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.2468513853904282, | |
| "grad_norm": 0.4837864637374878, | |
| "learning_rate": 0.000672662447033365, | |
| "loss": 6.4401, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.2594458438287153, | |
| "grad_norm": 0.912817120552063, | |
| "learning_rate": 0.0006662349858265576, | |
| "loss": 6.3167, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.2594458438287153, | |
| "eval_loss": null, | |
| "eval_runtime": 15.4775, | |
| "eval_samples_per_second": 2.714, | |
| "eval_steps_per_second": 2.714, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.2720403022670026, | |
| "grad_norm": 2.411555767059326, | |
| "learning_rate": 0.0006597769669037844, | |
| "loss": 6.043, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.2846347607052897, | |
| "grad_norm": 1.488756537437439, | |
| "learning_rate": 0.0006532896142072744, | |
| "loss": 6.5599, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.2972292191435768, | |
| "grad_norm": 1.906882643699646, | |
| "learning_rate": 0.0006467741572386773, | |
| "loss": 6.6445, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.309823677581864, | |
| "grad_norm": 1.155110478401184, | |
| "learning_rate": 0.0006402318308260451, | |
| "loss": 6.4565, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.322418136020151, | |
| "grad_norm": 1.057112693786621, | |
| "learning_rate": 0.0006336638748898033, | |
| "loss": 5.8222, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.3350125944584383, | |
| "grad_norm": 147.21507263183594, | |
| "learning_rate": 0.0006270715342077584, | |
| "loss": 6.1143, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.3476070528967254, | |
| "grad_norm": 1.0020054578781128, | |
| "learning_rate": 0.0006204560581791837, | |
| "loss": 6.625, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.3602015113350125, | |
| "grad_norm": 1.1355900764465332, | |
| "learning_rate": 0.0006138187005880293, | |
| "loss": 6.4284, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.3727959697732999, | |
| "grad_norm": 1.4387480020523071, | |
| "learning_rate": 0.0006071607193653024, | |
| "loss": 6.0397, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.385390428211587, | |
| "grad_norm": 0.8876716494560242, | |
| "learning_rate": 0.0006004833763506589, | |
| "loss": 6.0359, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.385390428211587, | |
| "eval_loss": null, | |
| "eval_runtime": 15.3917, | |
| "eval_samples_per_second": 2.729, | |
| "eval_steps_per_second": 2.729, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.397984886649874, | |
| "grad_norm": 10.183558464050293, | |
| "learning_rate": 0.0005937879370532575, | |
| "loss": 6.5227, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.4105793450881612, | |
| "grad_norm": 4.989817142486572, | |
| "learning_rate": 0.0005870756704119161, | |
| "loss": 5.7424, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.4231738035264483, | |
| "grad_norm": 0.8695787191390991, | |
| "learning_rate": 0.000580347848554619, | |
| "loss": 6.1259, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.4357682619647356, | |
| "grad_norm": 2.9313132762908936, | |
| "learning_rate": 0.0005736057465574194, | |
| "loss": 6.2371, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.4483627204030227, | |
| "grad_norm": 15.412851333618164, | |
| "learning_rate": 0.0005668506422027838, | |
| "loss": 6.4632, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.4609571788413098, | |
| "grad_norm": 1.714887022972107, | |
| "learning_rate": 0.0005600838157374224, | |
| "loss": 6.1565, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.473551637279597, | |
| "grad_norm": 7.269957542419434, | |
| "learning_rate": 0.0005533065496296534, | |
| "loss": 6.3067, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.486146095717884, | |
| "grad_norm": 7.1710591316223145, | |
| "learning_rate": 0.000546520128326346, | |
| "loss": 6.1889, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.4987405541561714, | |
| "grad_norm": 1.08518648147583, | |
| "learning_rate": 0.0005397258380094882, | |
| "loss": 6.0912, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.5113350125944585, | |
| "grad_norm": 1.5827178955078125, | |
| "learning_rate": 0.0005329249663524261, | |
| "loss": 6.3701, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.5113350125944585, | |
| "eval_loss": null, | |
| "eval_runtime": 15.1761, | |
| "eval_samples_per_second": 2.768, | |
| "eval_steps_per_second": 2.768, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.5239294710327456, | |
| "grad_norm": 2.0807712078094482, | |
| "learning_rate": 0.00052611880227582, | |
| "loss": 6.3354, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.536523929471033, | |
| "grad_norm": 0.9915687441825867, | |
| "learning_rate": 0.000519308635703365, | |
| "loss": 6.1947, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.5491183879093198, | |
| "grad_norm": 2.210996150970459, | |
| "learning_rate": 0.0005124957573173205, | |
| "loss": 6.4126, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.561712846347607, | |
| "grad_norm": 0.8593767285346985, | |
| "learning_rate": 0.0005056814583138962, | |
| "loss": 6.0815, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.5743073047858942, | |
| "grad_norm": 6.98859167098999, | |
| "learning_rate": 0.0004988670301585422, | |
| "loss": 5.6936, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.5869017632241813, | |
| "grad_norm": 1.7452690601348877, | |
| "learning_rate": 0.0004920537643411849, | |
| "loss": 6.5348, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.5994962216624686, | |
| "grad_norm": 2.5241520404815674, | |
| "learning_rate": 0.0004852429521314616, | |
| "loss": 6.0232, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.6120906801007555, | |
| "grad_norm": 0.8746917247772217, | |
| "learning_rate": 0.0004784358843339947, | |
| "loss": 6.1688, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.6246851385390428, | |
| "grad_norm": 1.0631097555160522, | |
| "learning_rate": 0.0004716338510437549, | |
| "loss": 6.3, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.63727959697733, | |
| "grad_norm": 6.525661945343018, | |
| "learning_rate": 0.00046483814140155867, | |
| "loss": 6.9225, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.63727959697733, | |
| "eval_loss": null, | |
| "eval_runtime": 15.3269, | |
| "eval_samples_per_second": 2.74, | |
| "eval_steps_per_second": 2.74, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.649874055415617, | |
| "grad_norm": 0.913987398147583, | |
| "learning_rate": 0.0004580500433497467, | |
| "loss": 6.1362, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.6624685138539044, | |
| "grad_norm": 1.5815681219100952, | |
| "learning_rate": 0.00045127084338808906, | |
| "loss": 6.4123, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.6750629722921915, | |
| "grad_norm": 1.3284748792648315, | |
| "learning_rate": 0.00044450182632996445, | |
| "loss": 6.1109, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.6876574307304786, | |
| "grad_norm": 1.746523380279541, | |
| "learning_rate": 0.0004377442750588584, | |
| "loss": 6.3474, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.700251889168766, | |
| "grad_norm": 6.772982597351074, | |
| "learning_rate": 0.0004309994702852283, | |
| "loss": 6.0853, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.7128463476070528, | |
| "grad_norm": 1.758326768875122, | |
| "learning_rate": 0.00042426869030377795, | |
| "loss": 6.4546, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.7254408060453401, | |
| "grad_norm": 19.387592315673828, | |
| "learning_rate": 0.00041755321075119306, | |
| "loss": 6.0784, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.7380352644836272, | |
| "grad_norm": 0.7060778737068176, | |
| "learning_rate": 0.0004108543043643778, | |
| "loss": 6.5413, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.7506297229219143, | |
| "grad_norm": 22.071979522705078, | |
| "learning_rate": 0.000404173240739243, | |
| "loss": 6.1817, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.7632241813602016, | |
| "grad_norm": 6.522192001342773, | |
| "learning_rate": 0.0003975112860900878, | |
| "loss": 6.5807, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.7632241813602016, | |
| "eval_loss": null, | |
| "eval_runtime": 15.3374, | |
| "eval_samples_per_second": 2.738, | |
| "eval_steps_per_second": 2.738, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.7758186397984885, | |
| "grad_norm": 0.940758228302002, | |
| "learning_rate": 0.0003908697030096239, | |
| "loss": 6.1871, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.7884130982367759, | |
| "grad_norm": 2.718416929244995, | |
| "learning_rate": 0.00038424975022968456, | |
| "loss": 6.3521, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.801007556675063, | |
| "grad_norm": 13.035350799560547, | |
| "learning_rate": 0.0003776526823826671, | |
| "loss": 6.4073, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.81360201511335, | |
| "grad_norm": 16.264127731323242, | |
| "learning_rate": 0.00037107974976375023, | |
| "loss": 6.0222, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.8261964735516374, | |
| "grad_norm": 9.861279487609863, | |
| "learning_rate": 0.000364532198093936, | |
| "loss": 5.5407, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.8387909319899243, | |
| "grad_norm": 6.148726463317871, | |
| "learning_rate": 0.0003580112682839554, | |
| "loss": 6.162, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.8513853904282116, | |
| "grad_norm": 1.1691336631774902, | |
| "learning_rate": 0.0003515181961990892, | |
| "loss": 6.2415, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.8639798488664987, | |
| "grad_norm": 5.119348526000977, | |
| "learning_rate": 0.00034505421242494186, | |
| "loss": 6.3736, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.8765743073047858, | |
| "grad_norm": 4.946280479431152, | |
| "learning_rate": 0.00033862054203421795, | |
| "loss": 6.3236, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.8891687657430731, | |
| "grad_norm": 1.3196860551834106, | |
| "learning_rate": 0.0003322184043545431, | |
| "loss": 6.8649, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.8891687657430731, | |
| "eval_loss": null, | |
| "eval_runtime": 15.3118, | |
| "eval_samples_per_second": 2.743, | |
| "eval_steps_per_second": 2.743, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.9017632241813602, | |
| "grad_norm": 43.375179290771484, | |
| "learning_rate": 0.0003258490127373731, | |
| "loss": 5.9977, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.9143576826196473, | |
| "grad_norm": 2.4827260971069336, | |
| "learning_rate": 0.00031951357432803636, | |
| "loss": 6.5355, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.9269521410579347, | |
| "grad_norm": 1.779510736465454, | |
| "learning_rate": 0.00031321328983695225, | |
| "loss": 5.9877, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.9395465994962215, | |
| "grad_norm": 9.966399192810059, | |
| "learning_rate": 0.00030694935331206867, | |
| "loss": 5.8464, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.9521410579345089, | |
| "grad_norm": 1.7442153692245483, | |
| "learning_rate": 0.0003007229519125633, | |
| "loss": 6.3703, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.964735516372796, | |
| "grad_norm": 12.200019836425781, | |
| "learning_rate": 0.0002945352656838491, | |
| "loss": 6.3838, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.977329974811083, | |
| "grad_norm": 1.6653344631195068, | |
| "learning_rate": 0.0002883874673339291, | |
| "loss": 6.4439, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.9899244332493704, | |
| "grad_norm": 2.319791078567505, | |
| "learning_rate": 0.00028228072201114156, | |
| "loss": 6.5633, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 2.0025188916876573, | |
| "grad_norm": 14.965546607971191, | |
| "learning_rate": 0.00027621618708333746, | |
| "loss": 6.3788, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 2.0151133501259446, | |
| "grad_norm": 37.10917663574219, | |
| "learning_rate": 0.0002701950119185332, | |
| "loss": 6.1397, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.0151133501259446, | |
| "eval_loss": null, | |
| "eval_runtime": 15.2229, | |
| "eval_samples_per_second": 2.759, | |
| "eval_steps_per_second": 2.759, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.027707808564232, | |
| "grad_norm": 21.792129516601562, | |
| "learning_rate": 0.00026421833766707804, | |
| "loss": 6.0449, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 2.040302267002519, | |
| "grad_norm": 1.4153228998184204, | |
| "learning_rate": 0.0002582872970453817, | |
| "loss": 6.1041, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 2.052896725440806, | |
| "grad_norm": 19.016847610473633, | |
| "learning_rate": 0.0002524030141212369, | |
| "loss": 5.3807, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 2.065491183879093, | |
| "grad_norm": 2.5831799507141113, | |
| "learning_rate": 0.0002465666041007844, | |
| "loss": 5.7776, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 2.0780856423173804, | |
| "grad_norm": 5.981635093688965, | |
| "learning_rate": 0.00024077917311715518, | |
| "loss": 6.3134, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.0906801007556677, | |
| "grad_norm": 4.322854995727539, | |
| "learning_rate": 0.0002350418180208324, | |
| "loss": 6.4723, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 2.1032745591939546, | |
| "grad_norm": 0.8821796178817749, | |
| "learning_rate": 0.00022935562617177452, | |
| "loss": 6.219, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.115869017632242, | |
| "grad_norm": 1.473185420036316, | |
| "learning_rate": 0.0002237216752333342, | |
| "loss": 6.3348, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.1284634760705288, | |
| "grad_norm": 45.26899719238281, | |
| "learning_rate": 0.00021814103296801765, | |
| "loss": 6.497, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 2.141057934508816, | |
| "grad_norm": 1.3694355487823486, | |
| "learning_rate": 0.00021261475703511886, | |
| "loss": 5.7675, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.141057934508816, | |
| "eval_loss": null, | |
| "eval_runtime": 15.2608, | |
| "eval_samples_per_second": 2.752, | |
| "eval_steps_per_second": 2.752, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.1536523929471034, | |
| "grad_norm": 3.0793488025665283, | |
| "learning_rate": 0.00020714389479026886, | |
| "loss": 5.9433, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.1662468513853903, | |
| "grad_norm": 6.340664863586426, | |
| "learning_rate": 0.00020172948308693747, | |
| "loss": 5.8344, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 2.1788413098236776, | |
| "grad_norm": 10.123492240905762, | |
| "learning_rate": 0.00019637254807992693, | |
| "loss": 5.8357, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 2.1914357682619645, | |
| "grad_norm": 1.9418385028839111, | |
| "learning_rate": 0.000191074105030891, | |
| "loss": 6.1829, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.204030226700252, | |
| "grad_norm": 4.6294732093811035, | |
| "learning_rate": 0.00018583515811592066, | |
| "loss": 6.0258, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.216624685138539, | |
| "grad_norm": 2.012091636657715, | |
| "learning_rate": 0.00018065670023522873, | |
| "loss": 5.9511, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 2.229219143576826, | |
| "grad_norm": 13.326998710632324, | |
| "learning_rate": 0.00017553971282497377, | |
| "loss": 6.4601, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.2418136020151134, | |
| "grad_norm": 65.63765716552734, | |
| "learning_rate": 0.00017048516567125406, | |
| "loss": 6.0541, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 2.2544080604534007, | |
| "grad_norm": 1.0482648611068726, | |
| "learning_rate": 0.0001654940167263127, | |
| "loss": 6.3912, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 2.2670025188916876, | |
| "grad_norm": 1.324476957321167, | |
| "learning_rate": 0.00016056721192698185, | |
| "loss": 6.2605, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.2670025188916876, | |
| "eval_loss": null, | |
| "eval_runtime": 15.2431, | |
| "eval_samples_per_second": 2.755, | |
| "eval_steps_per_second": 2.755, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.279596977329975, | |
| "grad_norm": 1.1805267333984375, | |
| "learning_rate": 0.0001557056850154078, | |
| "loss": 5.8732, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 2.292191435768262, | |
| "grad_norm": 3.1749534606933594, | |
| "learning_rate": 0.00015091035736208437, | |
| "loss": 6.3552, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 2.304785894206549, | |
| "grad_norm": 8.554312705993652, | |
| "learning_rate": 0.00014618213779123268, | |
| "loss": 6.2407, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.3173803526448364, | |
| "grad_norm": 5.832612991333008, | |
| "learning_rate": 0.00014152192240855774, | |
| "loss": 6.2874, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 2.3299748110831233, | |
| "grad_norm": 7.480172157287598, | |
| "learning_rate": 0.00013693059443141648, | |
| "loss": 5.5811, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 2.3425692695214106, | |
| "grad_norm": 17.8604793548584, | |
| "learning_rate": 0.0001324090240214272, | |
| "loss": 6.0511, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.355163727959698, | |
| "grad_norm": 73.90760803222656, | |
| "learning_rate": 0.00012795806811955496, | |
| "loss": 5.7841, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 2.367758186397985, | |
| "grad_norm": 83.39054870605469, | |
| "learning_rate": 0.00012357857028370055, | |
| "loss": 6.2784, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 2.380352644836272, | |
| "grad_norm": 268.36724853515625, | |
| "learning_rate": 0.00011927136052882816, | |
| "loss": 6.1363, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.392947103274559, | |
| "grad_norm": 3.740187406539917, | |
| "learning_rate": 0.00011503725516965741, | |
| "loss": 5.8788, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.392947103274559, | |
| "eval_loss": null, | |
| "eval_runtime": 15.446, | |
| "eval_samples_per_second": 2.719, | |
| "eval_steps_per_second": 2.719, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.4055415617128464, | |
| "grad_norm": 28.20012092590332, | |
| "learning_rate": 0.00011087705666595408, | |
| "loss": 6.2386, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 2.4181360201511337, | |
| "grad_norm": 303.59649658203125, | |
| "learning_rate": 0.00010679155347044514, | |
| "loss": 6.1873, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.4307304785894206, | |
| "grad_norm": 9.479316711425781, | |
| "learning_rate": 0.00010278151987938858, | |
| "loss": 5.6748, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 2.443324937027708, | |
| "grad_norm": 11.061058044433594, | |
| "learning_rate": 9.884771588582731e-05, | |
| "loss": 5.9298, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 2.455919395465995, | |
| "grad_norm": 3.6306285858154297, | |
| "learning_rate": 9.499088703555286e-05, | |
| "loss": 6.2918, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.468513853904282, | |
| "grad_norm": 0.8030872344970703, | |
| "learning_rate": 9.121176428580683e-05, | |
| "loss": 6.0907, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 2.4811083123425695, | |
| "grad_norm": 12.907964706420898, | |
| "learning_rate": 8.751106386674875e-05, | |
| "loss": 5.7241, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 2.4937027707808563, | |
| "grad_norm": 7.6929450035095215, | |
| "learning_rate": 8.388948714571339e-05, | |
| "loss": 6.2008, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.5062972292191437, | |
| "grad_norm": 18.33698844909668, | |
| "learning_rate": 8.03477204942856e-05, | |
| "loss": 5.8969, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 2.5188916876574305, | |
| "grad_norm": 12.14580249786377, | |
| "learning_rate": 7.688643515821794e-05, | |
| "loss": 6.0279, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.5188916876574305, | |
| "eval_loss": null, | |
| "eval_runtime": 15.4275, | |
| "eval_samples_per_second": 2.722, | |
| "eval_steps_per_second": 2.722, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.531486146095718, | |
| "grad_norm": 18.05765724182129, | |
| "learning_rate": 7.350628713021318e-05, | |
| "loss": 5.5725, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 2.544080604534005, | |
| "grad_norm": 1.635270595550537, | |
| "learning_rate": 7.020791702559972e-05, | |
| "loss": 6.1687, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 2.556675062972292, | |
| "grad_norm": 3.140681266784668, | |
| "learning_rate": 6.699194996091947e-05, | |
| "loss": 6.0122, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 2.5692695214105794, | |
| "grad_norm": 109.08464050292969, | |
| "learning_rate": 6.385899543545474e-05, | |
| "loss": 5.7281, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 2.5818639798488663, | |
| "grad_norm": 26.377573013305664, | |
| "learning_rate": 6.0809647215713464e-05, | |
| "loss": 6.0238, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 2.5944584382871536, | |
| "grad_norm": 1.1812814474105835, | |
| "learning_rate": 5.784448322289772e-05, | |
| "loss": 5.5657, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 2.607052896725441, | |
| "grad_norm": 21.708324432373047, | |
| "learning_rate": 5.496406542337348e-05, | |
| "loss": 5.9767, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 2.619647355163728, | |
| "grad_norm": 23.635461807250977, | |
| "learning_rate": 5.2168939722165906e-05, | |
| "loss": 6.0625, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 2.632241813602015, | |
| "grad_norm": 5.608989715576172, | |
| "learning_rate": 4.945963585949741e-05, | |
| "loss": 6.1439, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 2.644836272040302, | |
| "grad_norm": 9.033554077148438, | |
| "learning_rate": 4.6836667310390156e-05, | |
| "loss": 6.3911, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.644836272040302, | |
| "eval_loss": null, | |
| "eval_runtime": 15.3923, | |
| "eval_samples_per_second": 2.729, | |
| "eval_steps_per_second": 2.729, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.6574307304785894, | |
| "grad_norm": 1.7041218280792236, | |
| "learning_rate": 4.430053118735028e-05, | |
| "loss": 6.0198, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 2.6700251889168767, | |
| "grad_norm": 5.222975254058838, | |
| "learning_rate": 4.1851708146154423e-05, | |
| "loss": 5.8544, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 2.682619647355164, | |
| "grad_norm": 30.490474700927734, | |
| "learning_rate": 3.949066229475363e-05, | |
| "loss": 6.2353, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 2.695214105793451, | |
| "grad_norm": 9.03661060333252, | |
| "learning_rate": 3.72178411053151e-05, | |
| "loss": 6.1697, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 2.7078085642317378, | |
| "grad_norm": 3.526606798171997, | |
| "learning_rate": 3.503367532941533e-05, | |
| "loss": 5.9258, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 2.720403022670025, | |
| "grad_norm": 38.84684371948242, | |
| "learning_rate": 3.293857891640318e-05, | |
| "loss": 5.9434, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 2.7329974811083124, | |
| "grad_norm": 1.9702175855636597, | |
| "learning_rate": 3.093294893494711e-05, | |
| "loss": 6.1222, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 2.7455919395465997, | |
| "grad_norm": 2.135200023651123, | |
| "learning_rate": 2.9017165497781322e-05, | |
| "loss": 5.8755, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 2.7581863979848866, | |
| "grad_norm": 35.42985534667969, | |
| "learning_rate": 2.7191591689665536e-05, | |
| "loss": 6.1048, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 2.770780856423174, | |
| "grad_norm": 1.0068978071212769, | |
| "learning_rate": 2.545657349857288e-05, | |
| "loss": 6.0412, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.770780856423174, | |
| "eval_loss": null, | |
| "eval_runtime": 15.2361, | |
| "eval_samples_per_second": 2.757, | |
| "eval_steps_per_second": 2.757, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.783375314861461, | |
| "grad_norm": 2.2877144813537598, | |
| "learning_rate": 2.3812439750116348e-05, | |
| "loss": 6.1917, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 2.795969773299748, | |
| "grad_norm": 24.73290252685547, | |
| "learning_rate": 2.2259502045229487e-05, | |
| "loss": 5.9619, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 2.8085642317380355, | |
| "grad_norm": 8.858285903930664, | |
| "learning_rate": 2.0798054701110625e-05, | |
| "loss": 5.543, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 2.8211586901763224, | |
| "grad_norm": 3.0846030712127686, | |
| "learning_rate": 1.9428374695443077e-05, | |
| "loss": 5.6675, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 2.8337531486146097, | |
| "grad_norm": 3.8137497901916504, | |
| "learning_rate": 1.8150721613901507e-05, | |
| "loss": 6.2949, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.8463476070528966, | |
| "grad_norm": 7.545982360839844, | |
| "learning_rate": 1.6965337600954483e-05, | |
| "loss": 6.0377, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 2.858942065491184, | |
| "grad_norm": 11.194375038146973, | |
| "learning_rate": 1.5872447313972775e-05, | |
| "loss": 5.8728, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 2.8715365239294712, | |
| "grad_norm": 1.9697824716567993, | |
| "learning_rate": 1.487225788065136e-05, | |
| "loss": 5.8687, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.884130982367758, | |
| "grad_norm": 1.6085684299468994, | |
| "learning_rate": 1.396495885975419e-05, | |
| "loss": 6.2602, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 2.8967254408060454, | |
| "grad_norm": 52.61852264404297, | |
| "learning_rate": 1.3150722205188401e-05, | |
| "loss": 6.0862, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.8967254408060454, | |
| "eval_loss": null, | |
| "eval_runtime": 15.3223, | |
| "eval_samples_per_second": 2.741, | |
| "eval_steps_per_second": 2.741, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.9093198992443323, | |
| "grad_norm": 1.3086152076721191, | |
| "learning_rate": 1.2429702233415039e-05, | |
| "loss": 5.7933, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.9219143576826196, | |
| "grad_norm": 1.3068674802780151, | |
| "learning_rate": 1.180203559420273e-05, | |
| "loss": 5.5988, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.934508816120907, | |
| "grad_norm": 6.930998802185059, | |
| "learning_rate": 1.126784124472937e-05, | |
| "loss": 6.211, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.947103274559194, | |
| "grad_norm": 101.07465362548828, | |
| "learning_rate": 1.0827220427036787e-05, | |
| "loss": 6.2951, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.959697732997481, | |
| "grad_norm": 7.183104038238525, | |
| "learning_rate": 1.048025664884356e-05, | |
| "loss": 6.2843, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.972292191435768, | |
| "grad_norm": 1.7374407052993774, | |
| "learning_rate": 1.0227015667717848e-05, | |
| "loss": 6.1675, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.9848866498740554, | |
| "grad_norm": 4.074841022491455, | |
| "learning_rate": 1.0067545478615162e-05, | |
| "loss": 6.2533, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.9974811083123427, | |
| "grad_norm": 4.090394020080566, | |
| "learning_rate": 1.000187630478211e-05, | |
| "loss": 6.5554, | |
| "step": 2380 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2382, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.3798117919463834e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |