| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9998574709003089, | |
| "eval_steps": 1000, | |
| "global_step": 3946, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.002533850661176657, | |
| "grad_norm": 0.2532438337802887, | |
| "learning_rate": 3.1645569620253167e-06, | |
| "loss": 1.4823, | |
| "num_input_tokens_seen": 930032, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.005067701322353314, | |
| "grad_norm": 0.2576071619987488, | |
| "learning_rate": 6.329113924050633e-06, | |
| "loss": 1.45, | |
| "num_input_tokens_seen": 1834624, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.007601551983529971, | |
| "grad_norm": 0.27260521054267883, | |
| "learning_rate": 9.49367088607595e-06, | |
| "loss": 1.44, | |
| "num_input_tokens_seen": 2731480, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.010135402644706628, | |
| "grad_norm": 0.2555162012577057, | |
| "learning_rate": 1.2658227848101267e-05, | |
| "loss": 1.4811, | |
| "num_input_tokens_seen": 3620696, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.012669253305883284, | |
| "grad_norm": 0.24727804958820343, | |
| "learning_rate": 1.5822784810126583e-05, | |
| "loss": 1.4547, | |
| "num_input_tokens_seen": 4537164, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.015203103967059942, | |
| "grad_norm": 0.26695573329925537, | |
| "learning_rate": 1.89873417721519e-05, | |
| "loss": 1.4288, | |
| "num_input_tokens_seen": 5457344, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.017736954628236597, | |
| "grad_norm": 0.2801561653614044, | |
| "learning_rate": 2.2151898734177217e-05, | |
| "loss": 1.4569, | |
| "num_input_tokens_seen": 6349292, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.020270805289413257, | |
| "grad_norm": 0.22158554196357727, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4942, | |
| "num_input_tokens_seen": 7245840, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.022804655950589912, | |
| "grad_norm": 0.26374679803848267, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4492, | |
| "num_input_tokens_seen": 8171092, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.025338506611766568, | |
| "grad_norm": 0.23668645322322845, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4677, | |
| "num_input_tokens_seen": 9093156, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.027872357272943227, | |
| "grad_norm": 0.25576356053352356, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4109, | |
| "num_input_tokens_seen": 9976384, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.030406207934119883, | |
| "grad_norm": 0.2770518660545349, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4304, | |
| "num_input_tokens_seen": 10906092, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.03294005859529654, | |
| "grad_norm": 0.2333258092403412, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4142, | |
| "num_input_tokens_seen": 11818744, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.035473909256473195, | |
| "grad_norm": 0.24696557223796844, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4246, | |
| "num_input_tokens_seen": 12743780, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.038007759917649854, | |
| "grad_norm": 0.2408542037010193, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4475, | |
| "num_input_tokens_seen": 13674048, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.04054161057882651, | |
| "grad_norm": 0.2496064305305481, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4529, | |
| "num_input_tokens_seen": 14538524, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.043075461240003166, | |
| "grad_norm": 0.2827187478542328, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4542, | |
| "num_input_tokens_seen": 15470540, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.045609311901179825, | |
| "grad_norm": 0.25148963928222656, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4361, | |
| "num_input_tokens_seen": 16422976, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.048143162562356484, | |
| "grad_norm": 0.24195212125778198, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4775, | |
| "num_input_tokens_seen": 17344648, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.050677013223533136, | |
| "grad_norm": 0.3068198263645172, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4231, | |
| "num_input_tokens_seen": 18245048, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.053210863884709796, | |
| "grad_norm": 0.24267973005771637, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4354, | |
| "num_input_tokens_seen": 19169384, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.055744714545886455, | |
| "grad_norm": 0.21026775240898132, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4153, | |
| "num_input_tokens_seen": 20096992, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.05827856520706311, | |
| "grad_norm": 0.21877512335777283, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3941, | |
| "num_input_tokens_seen": 21025604, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.060812415868239766, | |
| "grad_norm": 0.24055704474449158, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4348, | |
| "num_input_tokens_seen": 21935180, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.06334626652941643, | |
| "grad_norm": 0.24673806130886078, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3719, | |
| "num_input_tokens_seen": 22857776, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.06588011719059308, | |
| "grad_norm": 0.21661491692066193, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4707, | |
| "num_input_tokens_seen": 23805840, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.06841396785176973, | |
| "grad_norm": 0.2766810357570648, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4558, | |
| "num_input_tokens_seen": 24694772, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.07094781851294639, | |
| "grad_norm": 0.2665688097476959, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4026, | |
| "num_input_tokens_seen": 25637024, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.07348166917412305, | |
| "grad_norm": 0.2424854040145874, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3998, | |
| "num_input_tokens_seen": 26530332, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.07601551983529971, | |
| "grad_norm": 0.23512804508209229, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4188, | |
| "num_input_tokens_seen": 27449020, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.07854937049647637, | |
| "grad_norm": 0.23620112240314484, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.39, | |
| "num_input_tokens_seen": 28367404, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.08108322115765303, | |
| "grad_norm": 0.2523897588253021, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4487, | |
| "num_input_tokens_seen": 29277896, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.08361707181882967, | |
| "grad_norm": 0.24064438045024872, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4214, | |
| "num_input_tokens_seen": 30200812, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.08615092248000633, | |
| "grad_norm": 0.2440669983625412, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.444, | |
| "num_input_tokens_seen": 31158760, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.08868477314118299, | |
| "grad_norm": 0.22009992599487305, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4127, | |
| "num_input_tokens_seen": 32069424, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.09121862380235965, | |
| "grad_norm": 0.29601845145225525, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4437, | |
| "num_input_tokens_seen": 33009436, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.09375247446353631, | |
| "grad_norm": 0.2240906059741974, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3871, | |
| "num_input_tokens_seen": 33933612, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.09628632512471297, | |
| "grad_norm": 0.23164159059524536, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4149, | |
| "num_input_tokens_seen": 34839560, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.09882017578588961, | |
| "grad_norm": 0.335622638463974, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.427, | |
| "num_input_tokens_seen": 35748032, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.10135402644706627, | |
| "grad_norm": 0.22885636985301971, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4521, | |
| "num_input_tokens_seen": 36672280, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.10388787710824293, | |
| "grad_norm": 0.2555045783519745, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4175, | |
| "num_input_tokens_seen": 37599516, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.10642172776941959, | |
| "grad_norm": 0.24946229159832, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4276, | |
| "num_input_tokens_seen": 38529556, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.10895557843059625, | |
| "grad_norm": 0.24785666167736053, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4007, | |
| "num_input_tokens_seen": 39460044, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.11148942909177291, | |
| "grad_norm": 0.22006012499332428, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4238, | |
| "num_input_tokens_seen": 40369364, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.11402327975294956, | |
| "grad_norm": 0.26216018199920654, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4318, | |
| "num_input_tokens_seen": 41307640, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.11655713041412621, | |
| "grad_norm": 0.23494452238082886, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.417, | |
| "num_input_tokens_seen": 42200280, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.11909098107530287, | |
| "grad_norm": 0.23429952561855316, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4277, | |
| "num_input_tokens_seen": 43112444, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.12162483173647953, | |
| "grad_norm": 0.2510409355163574, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3853, | |
| "num_input_tokens_seen": 44021860, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.12415868239765619, | |
| "grad_norm": 0.2570734918117523, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4056, | |
| "num_input_tokens_seen": 44938384, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.12669253305883285, | |
| "grad_norm": 0.23910905420780182, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4303, | |
| "num_input_tokens_seen": 45871544, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.1292263837200095, | |
| "grad_norm": 0.2258525788784027, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4243, | |
| "num_input_tokens_seen": 46798524, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.13176023438118617, | |
| "grad_norm": 0.21156556904315948, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3965, | |
| "num_input_tokens_seen": 47696192, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.13429408504236282, | |
| "grad_norm": 0.2665134370326996, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.393, | |
| "num_input_tokens_seen": 48669228, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.13682793570353946, | |
| "grad_norm": 0.2551543414592743, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4184, | |
| "num_input_tokens_seen": 49616616, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.13936178636471613, | |
| "grad_norm": 0.2285103052854538, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3964, | |
| "num_input_tokens_seen": 50540636, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.14189563702589278, | |
| "grad_norm": 0.23576393723487854, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4187, | |
| "num_input_tokens_seen": 51440464, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.14442948768706945, | |
| "grad_norm": 0.22209148108959198, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.403, | |
| "num_input_tokens_seen": 52315124, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.1469633383482461, | |
| "grad_norm": 0.23545274138450623, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4313, | |
| "num_input_tokens_seen": 53261804, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.14949718900942277, | |
| "grad_norm": 0.25153088569641113, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3798, | |
| "num_input_tokens_seen": 54106436, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.15203103967059942, | |
| "grad_norm": 0.23856191337108612, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3679, | |
| "num_input_tokens_seen": 55035052, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.15456489033177606, | |
| "grad_norm": 0.23667120933532715, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4191, | |
| "num_input_tokens_seen": 55935200, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.15709874099295273, | |
| "grad_norm": 0.26784512400627136, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3684, | |
| "num_input_tokens_seen": 56843340, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.15963259165412938, | |
| "grad_norm": 0.22612795233726501, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.436, | |
| "num_input_tokens_seen": 57720808, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.16216644231530605, | |
| "grad_norm": 0.24946410953998566, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.426, | |
| "num_input_tokens_seen": 58575924, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.1647002929764827, | |
| "grad_norm": 0.2528791129589081, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4191, | |
| "num_input_tokens_seen": 59484056, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.16723414363765934, | |
| "grad_norm": 0.21960842609405518, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.443, | |
| "num_input_tokens_seen": 60382860, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.16976799429883602, | |
| "grad_norm": 0.2500540018081665, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4284, | |
| "num_input_tokens_seen": 61291764, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.17230184496001266, | |
| "grad_norm": 0.27140355110168457, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3909, | |
| "num_input_tokens_seen": 62183556, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.17483569562118934, | |
| "grad_norm": 0.22307205200195312, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3682, | |
| "num_input_tokens_seen": 63098340, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.17736954628236598, | |
| "grad_norm": 0.24494685232639313, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3903, | |
| "num_input_tokens_seen": 64000524, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.17990339694354263, | |
| "grad_norm": 0.2667907476425171, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4569, | |
| "num_input_tokens_seen": 64937424, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.1824372476047193, | |
| "grad_norm": 0.22164462506771088, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3806, | |
| "num_input_tokens_seen": 65822472, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.18497109826589594, | |
| "grad_norm": 0.23859019577503204, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4003, | |
| "num_input_tokens_seen": 66691752, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.18750494892707262, | |
| "grad_norm": 0.28847405314445496, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4076, | |
| "num_input_tokens_seen": 67658948, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.19003879958824926, | |
| "grad_norm": 0.2571374177932739, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3924, | |
| "num_input_tokens_seen": 68572048, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.19257265024942594, | |
| "grad_norm": 0.24991680681705475, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4164, | |
| "num_input_tokens_seen": 69502808, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.19510650091060258, | |
| "grad_norm": 0.23006725311279297, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4019, | |
| "num_input_tokens_seen": 70423124, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.19764035157177923, | |
| "grad_norm": 0.2484099566936493, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3565, | |
| "num_input_tokens_seen": 71271484, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.2001742022329559, | |
| "grad_norm": 0.2604601979255676, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4098, | |
| "num_input_tokens_seen": 72179604, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.20270805289413255, | |
| "grad_norm": 0.2681257724761963, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4152, | |
| "num_input_tokens_seen": 73085296, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.20524190355530922, | |
| "grad_norm": 0.20966367423534393, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4006, | |
| "num_input_tokens_seen": 74003640, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.20777575421648586, | |
| "grad_norm": 0.2371470183134079, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3651, | |
| "num_input_tokens_seen": 74957748, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.2103096048776625, | |
| "grad_norm": 0.24214884638786316, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3969, | |
| "num_input_tokens_seen": 75841664, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.21284345553883918, | |
| "grad_norm": 0.24258075654506683, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4356, | |
| "num_input_tokens_seen": 76765412, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.21537730620001583, | |
| "grad_norm": 0.25199827551841736, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4171, | |
| "num_input_tokens_seen": 77675892, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.2179111568611925, | |
| "grad_norm": 0.219390869140625, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3713, | |
| "num_input_tokens_seen": 78646236, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.22044500752236915, | |
| "grad_norm": 0.2546541690826416, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4154, | |
| "num_input_tokens_seen": 79594216, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.22297885818354582, | |
| "grad_norm": 0.28596746921539307, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3981, | |
| "num_input_tokens_seen": 80523804, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.22551270884472246, | |
| "grad_norm": 0.21436405181884766, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3889, | |
| "num_input_tokens_seen": 81405376, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.2280465595058991, | |
| "grad_norm": 0.2508715093135834, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3682, | |
| "num_input_tokens_seen": 82260336, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.23058041016707578, | |
| "grad_norm": 0.24959874153137207, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3651, | |
| "num_input_tokens_seen": 83190224, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.23311426082825243, | |
| "grad_norm": 0.27335524559020996, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4221, | |
| "num_input_tokens_seen": 84107372, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.2356481114894291, | |
| "grad_norm": 0.2550046443939209, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4029, | |
| "num_input_tokens_seen": 85024192, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.23818196215060575, | |
| "grad_norm": 0.23554718494415283, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4045, | |
| "num_input_tokens_seen": 85956220, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.2407158128117824, | |
| "grad_norm": 0.21662922203540802, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3908, | |
| "num_input_tokens_seen": 86858100, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.24324966347295907, | |
| "grad_norm": 0.22381572425365448, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4337, | |
| "num_input_tokens_seen": 87771400, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.2457835141341357, | |
| "grad_norm": 0.2680582106113434, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4325, | |
| "num_input_tokens_seen": 88675708, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.24831736479531238, | |
| "grad_norm": 0.22555038332939148, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3741, | |
| "num_input_tokens_seen": 89561964, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.25085121545648903, | |
| "grad_norm": 0.2812931537628174, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4104, | |
| "num_input_tokens_seen": 90488048, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.2533850661176657, | |
| "grad_norm": 0.23613446950912476, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4008, | |
| "num_input_tokens_seen": 91375832, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.2533850661176657, | |
| "eval_loss": 1.4020060300827026, | |
| "eval_runtime": 2.9465, | |
| "eval_samples_per_second": 50.908, | |
| "eval_steps_per_second": 6.448, | |
| "num_input_tokens_seen": 91375832, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.2559189167788423, | |
| "grad_norm": 0.2325298935174942, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3544, | |
| "num_input_tokens_seen": 92347240, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.258452767440019, | |
| "grad_norm": 0.24142597615718842, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3706, | |
| "num_input_tokens_seen": 93237456, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.26098661810119567, | |
| "grad_norm": 0.2356724739074707, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3998, | |
| "num_input_tokens_seen": 94145764, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.26352046876237234, | |
| "grad_norm": 0.243470698595047, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4013, | |
| "num_input_tokens_seen": 95055692, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.26605431942354896, | |
| "grad_norm": 0.2412971556186676, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.373, | |
| "num_input_tokens_seen": 95921656, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.26858817008472563, | |
| "grad_norm": 0.2889567017555237, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3898, | |
| "num_input_tokens_seen": 96821452, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.2711220207459023, | |
| "grad_norm": 0.23939931392669678, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4088, | |
| "num_input_tokens_seen": 97727612, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.2736558714070789, | |
| "grad_norm": 0.25132742524147034, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3853, | |
| "num_input_tokens_seen": 98677952, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.2761897220682556, | |
| "grad_norm": 0.2225540727376938, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4041, | |
| "num_input_tokens_seen": 99640748, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.27872357272943227, | |
| "grad_norm": 0.24503560364246368, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3719, | |
| "num_input_tokens_seen": 100557008, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.28125742339060894, | |
| "grad_norm": 0.2348717302083969, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3937, | |
| "num_input_tokens_seen": 101442164, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.28379127405178556, | |
| "grad_norm": 0.24240590631961823, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3641, | |
| "num_input_tokens_seen": 102366056, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.28632512471296223, | |
| "grad_norm": 0.2246118187904358, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3631, | |
| "num_input_tokens_seen": 103261480, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.2888589753741389, | |
| "grad_norm": 0.2967662513256073, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3883, | |
| "num_input_tokens_seen": 104163484, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.2913928260353155, | |
| "grad_norm": 0.24722802639007568, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4444, | |
| "num_input_tokens_seen": 105077064, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.2939266766964922, | |
| "grad_norm": 0.2221587598323822, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3809, | |
| "num_input_tokens_seen": 105968728, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.29646052735766887, | |
| "grad_norm": 0.23813994228839874, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3941, | |
| "num_input_tokens_seen": 106838388, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.29899437801884554, | |
| "grad_norm": 0.24747894704341888, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3812, | |
| "num_input_tokens_seen": 107764200, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.30152822868002216, | |
| "grad_norm": 0.26802727580070496, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3717, | |
| "num_input_tokens_seen": 108683176, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.30406207934119883, | |
| "grad_norm": 0.27138280868530273, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.367, | |
| "num_input_tokens_seen": 109606364, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.3065959300023755, | |
| "grad_norm": 0.24378275871276855, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3762, | |
| "num_input_tokens_seen": 110518296, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.3091297806635521, | |
| "grad_norm": 0.261106938123703, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4227, | |
| "num_input_tokens_seen": 111436828, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.3116636313247288, | |
| "grad_norm": 0.2597008943557739, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3847, | |
| "num_input_tokens_seen": 112334112, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.31419748198590547, | |
| "grad_norm": 0.24535202980041504, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3706, | |
| "num_input_tokens_seen": 113211652, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.3167313326470821, | |
| "grad_norm": 0.2770673632621765, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3975, | |
| "num_input_tokens_seen": 114117744, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.31926518330825876, | |
| "grad_norm": 0.21976234018802643, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4036, | |
| "num_input_tokens_seen": 115002568, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.32179903396943543, | |
| "grad_norm": 0.22749099135398865, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3625, | |
| "num_input_tokens_seen": 115904964, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.3243328846306121, | |
| "grad_norm": 0.22470030188560486, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3905, | |
| "num_input_tokens_seen": 116843732, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.3268667352917887, | |
| "grad_norm": 0.2671917974948883, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3839, | |
| "num_input_tokens_seen": 117752200, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.3294005859529654, | |
| "grad_norm": 0.24347306787967682, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.36, | |
| "num_input_tokens_seen": 118656912, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.33193443661414207, | |
| "grad_norm": 0.22786876559257507, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.361, | |
| "num_input_tokens_seen": 119561700, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.3344682872753187, | |
| "grad_norm": 0.22891202569007874, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3916, | |
| "num_input_tokens_seen": 120537120, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.33700213793649536, | |
| "grad_norm": 0.2579503357410431, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4077, | |
| "num_input_tokens_seen": 121473416, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.33953598859767203, | |
| "grad_norm": 0.24670307338237762, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4055, | |
| "num_input_tokens_seen": 122383356, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.3420698392588487, | |
| "grad_norm": 0.2923058569431305, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3875, | |
| "num_input_tokens_seen": 123309020, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.3446036899200253, | |
| "grad_norm": 0.2256019562482834, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3872, | |
| "num_input_tokens_seen": 124234924, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.347137540581202, | |
| "grad_norm": 0.2368822544813156, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3969, | |
| "num_input_tokens_seen": 125162100, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.34967139124237867, | |
| "grad_norm": 0.2430727332830429, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3638, | |
| "num_input_tokens_seen": 126113704, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.3522052419035553, | |
| "grad_norm": 0.23543952405452728, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3642, | |
| "num_input_tokens_seen": 127052976, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.35473909256473196, | |
| "grad_norm": 0.24988651275634766, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3784, | |
| "num_input_tokens_seen": 127996892, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.35727294322590863, | |
| "grad_norm": 0.2787221670150757, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4052, | |
| "num_input_tokens_seen": 128935380, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.35980679388708525, | |
| "grad_norm": 0.24997858703136444, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3791, | |
| "num_input_tokens_seen": 129871964, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.3623406445482619, | |
| "grad_norm": 0.24547652900218964, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.395, | |
| "num_input_tokens_seen": 130767084, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.3648744952094386, | |
| "grad_norm": 0.23068061470985413, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3677, | |
| "num_input_tokens_seen": 131674508, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.36740834587061527, | |
| "grad_norm": 0.23524820804595947, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4161, | |
| "num_input_tokens_seen": 132602416, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.3699421965317919, | |
| "grad_norm": 0.23469901084899902, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3721, | |
| "num_input_tokens_seen": 133506196, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.37247604719296856, | |
| "grad_norm": 0.24987129867076874, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4049, | |
| "num_input_tokens_seen": 134427152, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.37500989785414524, | |
| "grad_norm": 0.24462181329727173, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3797, | |
| "num_input_tokens_seen": 135314244, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.37754374851532185, | |
| "grad_norm": 0.2653500437736511, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3503, | |
| "num_input_tokens_seen": 136230948, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.3800775991764985, | |
| "grad_norm": 0.2400883287191391, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3957, | |
| "num_input_tokens_seen": 137179452, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.3826114498376752, | |
| "grad_norm": 0.2289241999387741, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3529, | |
| "num_input_tokens_seen": 138078404, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.3851453004988519, | |
| "grad_norm": 0.26289331912994385, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4398, | |
| "num_input_tokens_seen": 138991724, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.3876791511600285, | |
| "grad_norm": 0.2165287286043167, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.41, | |
| "num_input_tokens_seen": 139933240, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.39021300182120516, | |
| "grad_norm": 0.29837462306022644, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3662, | |
| "num_input_tokens_seen": 140836772, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.39274685248238184, | |
| "grad_norm": 0.24651922285556793, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3412, | |
| "num_input_tokens_seen": 141744576, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.39528070314355845, | |
| "grad_norm": 0.29952993988990784, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3907, | |
| "num_input_tokens_seen": 142624188, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.3978145538047351, | |
| "grad_norm": 0.2563650608062744, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3858, | |
| "num_input_tokens_seen": 143554872, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.4003484044659118, | |
| "grad_norm": 0.2565977871417999, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3732, | |
| "num_input_tokens_seen": 144477588, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.4028822551270885, | |
| "grad_norm": 0.2879079282283783, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3692, | |
| "num_input_tokens_seen": 145354620, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.4054161057882651, | |
| "grad_norm": 0.2640700936317444, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3909, | |
| "num_input_tokens_seen": 146266280, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.40794995644944176, | |
| "grad_norm": 0.26872700452804565, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4051, | |
| "num_input_tokens_seen": 147165620, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.41048380711061844, | |
| "grad_norm": 0.2187357246875763, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.38, | |
| "num_input_tokens_seen": 148098344, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.41301765777179505, | |
| "grad_norm": 0.24293020367622375, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3915, | |
| "num_input_tokens_seen": 149043924, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.41555150843297173, | |
| "grad_norm": 0.23092688620090485, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4019, | |
| "num_input_tokens_seen": 149996036, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.4180853590941484, | |
| "grad_norm": 0.27063265442848206, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3723, | |
| "num_input_tokens_seen": 150869152, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.420619209755325, | |
| "grad_norm": 0.25822359323501587, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3682, | |
| "num_input_tokens_seen": 151783488, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.4231530604165017, | |
| "grad_norm": 0.269724041223526, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3592, | |
| "num_input_tokens_seen": 152700960, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.42568691107767836, | |
| "grad_norm": 0.23563367128372192, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3679, | |
| "num_input_tokens_seen": 153634040, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.42822076173885504, | |
| "grad_norm": 0.23306426405906677, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3585, | |
| "num_input_tokens_seen": 154569656, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.43075461240003166, | |
| "grad_norm": 0.23761169612407684, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3413, | |
| "num_input_tokens_seen": 155491724, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.43328846306120833, | |
| "grad_norm": 0.23138809204101562, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3892, | |
| "num_input_tokens_seen": 156437340, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.435822313722385, | |
| "grad_norm": 0.24864792823791504, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.387, | |
| "num_input_tokens_seen": 157343056, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.4383561643835616, | |
| "grad_norm": 0.24503816664218903, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3544, | |
| "num_input_tokens_seen": 158211084, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.4408900150447383, | |
| "grad_norm": 0.23860155045986176, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3947, | |
| "num_input_tokens_seen": 159127644, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.44342386570591497, | |
| "grad_norm": 0.23359131813049316, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3333, | |
| "num_input_tokens_seen": 160056144, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.44595771636709164, | |
| "grad_norm": 0.23289762437343597, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.4039, | |
| "num_input_tokens_seen": 161001352, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.44849156702826826, | |
| "grad_norm": 0.23038776218891144, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3872, | |
| "num_input_tokens_seen": 161931048, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.45102541768944493, | |
| "grad_norm": 0.26440566778182983, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.372, | |
| "num_input_tokens_seen": 162861292, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.4535592683506216, | |
| "grad_norm": 0.2498098909854889, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3287, | |
| "num_input_tokens_seen": 163797388, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.4560931190117982, | |
| "grad_norm": 0.2095261961221695, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3778, | |
| "num_input_tokens_seen": 164671840, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.4586269696729749, | |
| "grad_norm": 0.2577464282512665, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3821, | |
| "num_input_tokens_seen": 165619284, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.46116082033415157, | |
| "grad_norm": 0.23324383795261383, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3921, | |
| "num_input_tokens_seen": 166521872, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.46369467099532824, | |
| "grad_norm": 0.23413369059562683, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.391, | |
| "num_input_tokens_seen": 167446436, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.46622852165650486, | |
| "grad_norm": 0.2720430791378021, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.359, | |
| "num_input_tokens_seen": 168356260, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.46876237231768153, | |
| "grad_norm": 0.2760706841945648, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3498, | |
| "num_input_tokens_seen": 169262844, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.4712962229788582, | |
| "grad_norm": 0.27992355823516846, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3984, | |
| "num_input_tokens_seen": 170164272, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.4738300736400348, | |
| "grad_norm": 0.23402582108974457, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3667, | |
| "num_input_tokens_seen": 171067864, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.4763639243012115, | |
| "grad_norm": 0.29928284883499146, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.335, | |
| "num_input_tokens_seen": 172005232, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.47889777496238817, | |
| "grad_norm": 0.25357866287231445, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3802, | |
| "num_input_tokens_seen": 172915708, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.4814316256235648, | |
| "grad_norm": 0.29246291518211365, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3513, | |
| "num_input_tokens_seen": 173820476, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.48396547628474146, | |
| "grad_norm": 0.2792080342769623, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3939, | |
| "num_input_tokens_seen": 174740920, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.48649932694591813, | |
| "grad_norm": 0.3099055588245392, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3693, | |
| "num_input_tokens_seen": 175635720, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.4890331776070948, | |
| "grad_norm": 0.2375776320695877, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3829, | |
| "num_input_tokens_seen": 176538688, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.4915670282682714, | |
| "grad_norm": 0.2295093983411789, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3691, | |
| "num_input_tokens_seen": 177468420, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.4941008789294481, | |
| "grad_norm": 0.21639369428157806, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3509, | |
| "num_input_tokens_seen": 178388296, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.49663472959062477, | |
| "grad_norm": 0.26756080985069275, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3761, | |
| "num_input_tokens_seen": 179341380, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.4991685802518014, | |
| "grad_norm": 0.21319729089736938, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3803, | |
| "num_input_tokens_seen": 180256564, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.5017024309129781, | |
| "grad_norm": 0.2565974295139313, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3763, | |
| "num_input_tokens_seen": 181117020, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.5042362815741547, | |
| "grad_norm": 0.30257830023765564, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3671, | |
| "num_input_tokens_seen": 182027528, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.5067701322353314, | |
| "grad_norm": 0.23474013805389404, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3456, | |
| "num_input_tokens_seen": 182939052, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5067701322353314, | |
| "eval_loss": 1.3669419288635254, | |
| "eval_runtime": 2.8409, | |
| "eval_samples_per_second": 52.801, | |
| "eval_steps_per_second": 6.688, | |
| "num_input_tokens_seen": 182939052, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.509303982896508, | |
| "grad_norm": 0.2144283950328827, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.37, | |
| "num_input_tokens_seen": 183841188, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.5118378335576846, | |
| "grad_norm": 0.2299591451883316, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3436, | |
| "num_input_tokens_seen": 184804372, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.5143716842188614, | |
| "grad_norm": 0.2291470170021057, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.38, | |
| "num_input_tokens_seen": 185696628, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.516905534880038, | |
| "grad_norm": 0.25624164938926697, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3741, | |
| "num_input_tokens_seen": 186584108, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.5194393855412147, | |
| "grad_norm": 0.2826102077960968, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3786, | |
| "num_input_tokens_seen": 187491532, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.5219732362023913, | |
| "grad_norm": 0.23644354939460754, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3119, | |
| "num_input_tokens_seen": 188398308, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.524507086863568, | |
| "grad_norm": 0.2631579041481018, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3596, | |
| "num_input_tokens_seen": 189270772, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.5270409375247447, | |
| "grad_norm": 0.24663548171520233, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3833, | |
| "num_input_tokens_seen": 190188192, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.5295747881859213, | |
| "grad_norm": 0.21753673255443573, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3746, | |
| "num_input_tokens_seen": 191125784, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.5321086388470979, | |
| "grad_norm": 0.2312672883272171, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3558, | |
| "num_input_tokens_seen": 192010984, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.5346424895082746, | |
| "grad_norm": 0.2641030251979828, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3436, | |
| "num_input_tokens_seen": 192947832, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.5371763401694513, | |
| "grad_norm": 0.2314285784959793, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3889, | |
| "num_input_tokens_seen": 193836096, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.5397101908306279, | |
| "grad_norm": 0.2117050439119339, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3636, | |
| "num_input_tokens_seen": 194752188, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.5422440414918046, | |
| "grad_norm": 0.24790892004966736, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3577, | |
| "num_input_tokens_seen": 195659416, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.5447778921529812, | |
| "grad_norm": 0.253757119178772, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3767, | |
| "num_input_tokens_seen": 196584176, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.5473117428141578, | |
| "grad_norm": 0.2629224359989166, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3771, | |
| "num_input_tokens_seen": 197456816, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.5498455934753346, | |
| "grad_norm": 0.2274072915315628, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3633, | |
| "num_input_tokens_seen": 198358444, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.5523794441365112, | |
| "grad_norm": 0.2630630135536194, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3702, | |
| "num_input_tokens_seen": 199246040, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.5549132947976878, | |
| "grad_norm": 0.24167053401470184, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3785, | |
| "num_input_tokens_seen": 200167412, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.5574471454588645, | |
| "grad_norm": 0.2560918927192688, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3757, | |
| "num_input_tokens_seen": 201090512, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.5599809961200412, | |
| "grad_norm": 0.23884332180023193, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3642, | |
| "num_input_tokens_seen": 202070196, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.5625148467812179, | |
| "grad_norm": 0.25141972303390503, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3669, | |
| "num_input_tokens_seen": 203015232, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.5650486974423945, | |
| "grad_norm": 0.20563028752803802, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3622, | |
| "num_input_tokens_seen": 203955992, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.5675825481035711, | |
| "grad_norm": 0.26771050691604614, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3551, | |
| "num_input_tokens_seen": 204867084, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.5701163987647478, | |
| "grad_norm": 0.2185191512107849, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3574, | |
| "num_input_tokens_seen": 205818444, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.5726502494259245, | |
| "grad_norm": 0.23736274242401123, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3478, | |
| "num_input_tokens_seen": 206727340, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.5751841000871011, | |
| "grad_norm": 0.2208438366651535, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3576, | |
| "num_input_tokens_seen": 207682956, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.5777179507482778, | |
| "grad_norm": 0.215751051902771, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3105, | |
| "num_input_tokens_seen": 208613224, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.5802518014094544, | |
| "grad_norm": 0.24414047598838806, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3637, | |
| "num_input_tokens_seen": 209480700, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.582785652070631, | |
| "grad_norm": 0.27234476804733276, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3648, | |
| "num_input_tokens_seen": 210380616, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.5853195027318078, | |
| "grad_norm": 0.23880694806575775, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3452, | |
| "num_input_tokens_seen": 211323472, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.5878533533929844, | |
| "grad_norm": 0.24618738889694214, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3357, | |
| "num_input_tokens_seen": 212269424, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.590387204054161, | |
| "grad_norm": 0.2280731499195099, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3808, | |
| "num_input_tokens_seen": 213236052, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.5929210547153377, | |
| "grad_norm": 0.2641889452934265, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3635, | |
| "num_input_tokens_seen": 214193180, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.5954549053765144, | |
| "grad_norm": 0.24398839473724365, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3157, | |
| "num_input_tokens_seen": 215145888, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.5979887560376911, | |
| "grad_norm": 0.29194214940071106, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3809, | |
| "num_input_tokens_seen": 216076328, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.6005226066988677, | |
| "grad_norm": 0.23668240010738373, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3723, | |
| "num_input_tokens_seen": 216957792, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.6030564573600443, | |
| "grad_norm": 0.2053728848695755, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3106, | |
| "num_input_tokens_seen": 217923088, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.605590308021221, | |
| "grad_norm": 0.2571648061275482, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3636, | |
| "num_input_tokens_seen": 218831976, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.6081241586823977, | |
| "grad_norm": 0.25352680683135986, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3448, | |
| "num_input_tokens_seen": 219756636, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.6106580093435743, | |
| "grad_norm": 0.23342467844486237, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3908, | |
| "num_input_tokens_seen": 220660172, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.613191860004751, | |
| "grad_norm": 0.24378784000873566, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3631, | |
| "num_input_tokens_seen": 221559444, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.6157257106659276, | |
| "grad_norm": 0.23902441561222076, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3389, | |
| "num_input_tokens_seen": 222484304, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.6182595613271042, | |
| "grad_norm": 0.24430356919765472, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3741, | |
| "num_input_tokens_seen": 223424636, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.620793411988281, | |
| "grad_norm": 0.22024385631084442, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3173, | |
| "num_input_tokens_seen": 224336328, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.6233272626494576, | |
| "grad_norm": 0.2540358304977417, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3551, | |
| "num_input_tokens_seen": 225268812, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.6258611133106342, | |
| "grad_norm": 0.30823466181755066, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3315, | |
| "num_input_tokens_seen": 226203392, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.6283949639718109, | |
| "grad_norm": 0.22996842861175537, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3647, | |
| "num_input_tokens_seen": 227073928, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.6309288146329876, | |
| "grad_norm": 0.22297543287277222, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3673, | |
| "num_input_tokens_seen": 227988144, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.6334626652941642, | |
| "grad_norm": 0.2600548267364502, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3417, | |
| "num_input_tokens_seen": 228908304, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.6359965159553409, | |
| "grad_norm": 0.27056604623794556, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.2998, | |
| "num_input_tokens_seen": 229859596, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.6385303666165175, | |
| "grad_norm": 0.22515636682510376, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3605, | |
| "num_input_tokens_seen": 230760960, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.6410642172776942, | |
| "grad_norm": 0.33911067247390747, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3648, | |
| "num_input_tokens_seen": 231683832, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.6435980679388709, | |
| "grad_norm": 0.2713491916656494, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3581, | |
| "num_input_tokens_seen": 232586192, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.6461319186000475, | |
| "grad_norm": 0.22554545104503632, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3217, | |
| "num_input_tokens_seen": 233513620, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.6486657692612242, | |
| "grad_norm": 0.23459571599960327, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3185, | |
| "num_input_tokens_seen": 234405628, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.6511996199224008, | |
| "grad_norm": 0.22022689878940582, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3724, | |
| "num_input_tokens_seen": 235287208, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.6537334705835774, | |
| "grad_norm": 0.2207019031047821, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3871, | |
| "num_input_tokens_seen": 236206532, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.6562673212447542, | |
| "grad_norm": 0.286006897687912, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.338, | |
| "num_input_tokens_seen": 237132236, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.6588011719059308, | |
| "grad_norm": 0.24479633569717407, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3636, | |
| "num_input_tokens_seen": 238036544, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.6613350225671074, | |
| "grad_norm": 0.21694402396678925, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3711, | |
| "num_input_tokens_seen": 238978380, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.6638688732282841, | |
| "grad_norm": 0.22491593658924103, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3516, | |
| "num_input_tokens_seen": 239893524, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.6664027238894608, | |
| "grad_norm": 0.24287302792072296, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3333, | |
| "num_input_tokens_seen": 240753560, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.6689365745506374, | |
| "grad_norm": 0.24059581756591797, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3172, | |
| "num_input_tokens_seen": 241689616, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.6714704252118141, | |
| "grad_norm": 0.24688631296157837, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3377, | |
| "num_input_tokens_seen": 242618896, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.6740042758729907, | |
| "grad_norm": 0.2412404716014862, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3512, | |
| "num_input_tokens_seen": 243555264, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.6765381265341673, | |
| "grad_norm": 0.23944397270679474, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3806, | |
| "num_input_tokens_seen": 244450244, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.6790719771953441, | |
| "grad_norm": 0.24713559448719025, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3251, | |
| "num_input_tokens_seen": 245398672, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.6816058278565207, | |
| "grad_norm": 0.31667396426200867, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3642, | |
| "num_input_tokens_seen": 246320464, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.6841396785176974, | |
| "grad_norm": 0.250383585691452, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3329, | |
| "num_input_tokens_seen": 247248308, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.686673529178874, | |
| "grad_norm": 0.2263907939195633, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3281, | |
| "num_input_tokens_seen": 248202884, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.6892073798400506, | |
| "grad_norm": 0.24522219598293304, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3477, | |
| "num_input_tokens_seen": 249166112, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.6917412305012274, | |
| "grad_norm": 0.22159820795059204, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3192, | |
| "num_input_tokens_seen": 250077904, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.694275081162404, | |
| "grad_norm": 0.2300739735364914, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3336, | |
| "num_input_tokens_seen": 251012120, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.6968089318235806, | |
| "grad_norm": 0.22758354246616364, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3964, | |
| "num_input_tokens_seen": 251934920, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.6993427824847573, | |
| "grad_norm": 0.2598190903663635, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3311, | |
| "num_input_tokens_seen": 252877580, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.701876633145934, | |
| "grad_norm": 0.23178431391716003, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3453, | |
| "num_input_tokens_seen": 253792028, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.7044104838071106, | |
| "grad_norm": 0.26508447527885437, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3635, | |
| "num_input_tokens_seen": 254742856, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.7069443344682873, | |
| "grad_norm": 0.263509601354599, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3584, | |
| "num_input_tokens_seen": 255676980, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.7094781851294639, | |
| "grad_norm": 0.25076207518577576, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3654, | |
| "num_input_tokens_seen": 256607480, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.7120120357906405, | |
| "grad_norm": 0.3114246726036072, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3626, | |
| "num_input_tokens_seen": 257486156, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.7145458864518173, | |
| "grad_norm": 0.2184561789035797, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3481, | |
| "num_input_tokens_seen": 258406168, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.7170797371129939, | |
| "grad_norm": 0.27279725670814514, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3358, | |
| "num_input_tokens_seen": 259298936, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.7196135877741705, | |
| "grad_norm": 0.23473051190376282, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3157, | |
| "num_input_tokens_seen": 260214884, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.7221474384353472, | |
| "grad_norm": 0.2273094654083252, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3695, | |
| "num_input_tokens_seen": 261150656, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.7246812890965239, | |
| "grad_norm": 0.23328402638435364, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3491, | |
| "num_input_tokens_seen": 262090748, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.7272151397577006, | |
| "grad_norm": 0.27058523893356323, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3164, | |
| "num_input_tokens_seen": 263047956, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.7297489904188772, | |
| "grad_norm": 0.26919999718666077, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3429, | |
| "num_input_tokens_seen": 263952708, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.7322828410800538, | |
| "grad_norm": 0.2629719078540802, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3736, | |
| "num_input_tokens_seen": 264850904, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.7348166917412305, | |
| "grad_norm": 0.2600915729999542, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3179, | |
| "num_input_tokens_seen": 265795528, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.7373505424024072, | |
| "grad_norm": 0.29251357913017273, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3671, | |
| "num_input_tokens_seen": 266703240, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.7398843930635838, | |
| "grad_norm": 0.23803594708442688, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3632, | |
| "num_input_tokens_seen": 267637720, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.7424182437247605, | |
| "grad_norm": 0.24492381513118744, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3275, | |
| "num_input_tokens_seen": 268547588, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.7449520943859371, | |
| "grad_norm": 0.2277376800775528, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3058, | |
| "num_input_tokens_seen": 269503056, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.7474859450471137, | |
| "grad_norm": 0.22645527124404907, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3462, | |
| "num_input_tokens_seen": 270372524, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.7500197957082905, | |
| "grad_norm": 0.27738144993782043, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.2953, | |
| "num_input_tokens_seen": 271255520, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.7525536463694671, | |
| "grad_norm": 0.2460719496011734, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3291, | |
| "num_input_tokens_seen": 272173512, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.7550874970306437, | |
| "grad_norm": 0.23774035274982452, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3105, | |
| "num_input_tokens_seen": 273082396, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.7576213476918204, | |
| "grad_norm": 0.2344847470521927, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3379, | |
| "num_input_tokens_seen": 273951600, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.760155198352997, | |
| "grad_norm": 0.2422836273908615, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3437, | |
| "num_input_tokens_seen": 274855796, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.760155198352997, | |
| "eval_loss": 1.3378311395645142, | |
| "eval_runtime": 2.7862, | |
| "eval_samples_per_second": 53.837, | |
| "eval_steps_per_second": 6.819, | |
| "num_input_tokens_seen": 274855796, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.7626890490141738, | |
| "grad_norm": 0.2418714016675949, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3683, | |
| "num_input_tokens_seen": 275793364, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.7652228996753504, | |
| "grad_norm": 0.2433195561170578, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3397, | |
| "num_input_tokens_seen": 276766688, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.767756750336527, | |
| "grad_norm": 0.2531881034374237, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3069, | |
| "num_input_tokens_seen": 277692944, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.7702906009977037, | |
| "grad_norm": 0.228854700922966, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3467, | |
| "num_input_tokens_seen": 278633648, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.7728244516588804, | |
| "grad_norm": 0.21645446121692657, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.2949, | |
| "num_input_tokens_seen": 279542668, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.775358302320057, | |
| "grad_norm": 0.2668648362159729, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3272, | |
| "num_input_tokens_seen": 280474528, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.7778921529812337, | |
| "grad_norm": 0.26199036836624146, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3395, | |
| "num_input_tokens_seen": 281383776, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.7804260036424103, | |
| "grad_norm": 0.23948872089385986, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3534, | |
| "num_input_tokens_seen": 282297260, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.7829598543035869, | |
| "grad_norm": 0.2561713755130768, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3251, | |
| "num_input_tokens_seen": 283169516, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.7854937049647637, | |
| "grad_norm": 0.26099705696105957, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3394, | |
| "num_input_tokens_seen": 284109700, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.7880275556259403, | |
| "grad_norm": 0.23930218815803528, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3242, | |
| "num_input_tokens_seen": 285031264, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.7905614062871169, | |
| "grad_norm": 0.23478297889232635, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3647, | |
| "num_input_tokens_seen": 285943620, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.7930952569482936, | |
| "grad_norm": 0.24018226563930511, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3166, | |
| "num_input_tokens_seen": 286819840, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.7956291076094703, | |
| "grad_norm": 0.22437995672225952, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3418, | |
| "num_input_tokens_seen": 287731640, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.7981629582706469, | |
| "grad_norm": 0.2912137806415558, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3336, | |
| "num_input_tokens_seen": 288650768, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.8006968089318236, | |
| "grad_norm": 0.27003979682922363, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3094, | |
| "num_input_tokens_seen": 289579424, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.8032306595930002, | |
| "grad_norm": 0.24906513094902039, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3089, | |
| "num_input_tokens_seen": 290506080, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.805764510254177, | |
| "grad_norm": 0.2620064616203308, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3741, | |
| "num_input_tokens_seen": 291447632, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.8082983609153536, | |
| "grad_norm": 0.22881096601486206, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3601, | |
| "num_input_tokens_seen": 292382736, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.8108322115765302, | |
| "grad_norm": 0.23649707436561584, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3212, | |
| "num_input_tokens_seen": 293339376, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.8133660622377069, | |
| "grad_norm": 0.22773633897304535, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3124, | |
| "num_input_tokens_seen": 294273900, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.8158999128988835, | |
| "grad_norm": 0.23439520597457886, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3104, | |
| "num_input_tokens_seen": 295167620, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.8184337635600601, | |
| "grad_norm": 0.2587607800960541, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3378, | |
| "num_input_tokens_seen": 296070252, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.8209676142212369, | |
| "grad_norm": 0.2375950813293457, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3608, | |
| "num_input_tokens_seen": 296964880, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.8235014648824135, | |
| "grad_norm": 0.217642143368721, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3711, | |
| "num_input_tokens_seen": 297861584, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.8260353155435901, | |
| "grad_norm": 0.24903365969657898, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3759, | |
| "num_input_tokens_seen": 298763600, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.8285691662047668, | |
| "grad_norm": 0.25492629408836365, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.336, | |
| "num_input_tokens_seen": 299655852, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.8311030168659435, | |
| "grad_norm": 0.26514139771461487, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3294, | |
| "num_input_tokens_seen": 300539872, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.8336368675271201, | |
| "grad_norm": 0.23889844119548798, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3845, | |
| "num_input_tokens_seen": 301433356, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.8361707181882968, | |
| "grad_norm": 0.23075729608535767, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3359, | |
| "num_input_tokens_seen": 302358284, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.8387045688494734, | |
| "grad_norm": 0.28124797344207764, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3663, | |
| "num_input_tokens_seen": 303293764, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.84123841951065, | |
| "grad_norm": 0.30670827627182007, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.335, | |
| "num_input_tokens_seen": 304171336, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.8437722701718268, | |
| "grad_norm": 0.22578497231006622, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.327, | |
| "num_input_tokens_seen": 305091264, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.8463061208330034, | |
| "grad_norm": 0.22120265662670135, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3509, | |
| "num_input_tokens_seen": 306010588, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.8488399714941801, | |
| "grad_norm": 0.2477473020553589, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3565, | |
| "num_input_tokens_seen": 306940328, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.8513738221553567, | |
| "grad_norm": 0.2530181109905243, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.2936, | |
| "num_input_tokens_seen": 307838056, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.8539076728165333, | |
| "grad_norm": 0.2556324303150177, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3002, | |
| "num_input_tokens_seen": 308773220, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.8564415234777101, | |
| "grad_norm": 0.24870575964450836, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3086, | |
| "num_input_tokens_seen": 309713036, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.8589753741388867, | |
| "grad_norm": 0.22579419612884521, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3238, | |
| "num_input_tokens_seen": 310676544, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.8615092248000633, | |
| "grad_norm": 0.26896366477012634, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3518, | |
| "num_input_tokens_seen": 311609100, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.86404307546124, | |
| "grad_norm": 0.23491699993610382, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3478, | |
| "num_input_tokens_seen": 312541060, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.8665769261224167, | |
| "grad_norm": 0.21398873627185822, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.317, | |
| "num_input_tokens_seen": 313464680, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.8691107767835933, | |
| "grad_norm": 0.2201145589351654, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3203, | |
| "num_input_tokens_seen": 314362092, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.87164462744477, | |
| "grad_norm": 0.23937499523162842, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3594, | |
| "num_input_tokens_seen": 315286788, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.8741784781059466, | |
| "grad_norm": 0.2299693375825882, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.359, | |
| "num_input_tokens_seen": 316199708, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.8767123287671232, | |
| "grad_norm": 0.21679440140724182, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3372, | |
| "num_input_tokens_seen": 317082032, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.8792461794283, | |
| "grad_norm": 0.23869968950748444, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.2815, | |
| "num_input_tokens_seen": 317999160, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.8817800300894766, | |
| "grad_norm": 0.24342550337314606, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3222, | |
| "num_input_tokens_seen": 318945628, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.8843138807506532, | |
| "grad_norm": 0.23146317899227142, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3188, | |
| "num_input_tokens_seen": 319892264, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.8868477314118299, | |
| "grad_norm": 0.27557140588760376, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3065, | |
| "num_input_tokens_seen": 320815992, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.8893815820730065, | |
| "grad_norm": 0.24911952018737793, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3275, | |
| "num_input_tokens_seen": 321703172, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.8919154327341833, | |
| "grad_norm": 0.2727194130420685, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3297, | |
| "num_input_tokens_seen": 322642588, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.8944492833953599, | |
| "grad_norm": 0.242356538772583, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.2881, | |
| "num_input_tokens_seen": 323529188, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.8969831340565365, | |
| "grad_norm": 0.21331574022769928, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.2861, | |
| "num_input_tokens_seen": 324438984, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.8995169847177132, | |
| "grad_norm": 0.28540030121803284, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3361, | |
| "num_input_tokens_seen": 325302632, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.9020508353788899, | |
| "grad_norm": 0.2721042037010193, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3377, | |
| "num_input_tokens_seen": 326223312, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.9045846860400665, | |
| "grad_norm": 0.235883429646492, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3603, | |
| "num_input_tokens_seen": 327174992, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.9071185367012432, | |
| "grad_norm": 0.2746555507183075, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3497, | |
| "num_input_tokens_seen": 328087740, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.9096523873624198, | |
| "grad_norm": 0.21206247806549072, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3192, | |
| "num_input_tokens_seen": 329015496, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.9121862380235964, | |
| "grad_norm": 0.24580571055412292, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.2958, | |
| "num_input_tokens_seen": 329914504, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.9147200886847732, | |
| "grad_norm": 0.2298029512166977, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.2955, | |
| "num_input_tokens_seen": 330861412, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.9172539393459498, | |
| "grad_norm": 0.20944957435131073, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3413, | |
| "num_input_tokens_seen": 331705132, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.9197877900071264, | |
| "grad_norm": 0.26745468378067017, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3528, | |
| "num_input_tokens_seen": 332613612, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.9223216406683031, | |
| "grad_norm": 0.23441898822784424, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3125, | |
| "num_input_tokens_seen": 333546464, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.9248554913294798, | |
| "grad_norm": 0.25231051445007324, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3264, | |
| "num_input_tokens_seen": 334449860, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.9273893419906565, | |
| "grad_norm": 0.22412322461605072, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3159, | |
| "num_input_tokens_seen": 335390600, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.9299231926518331, | |
| "grad_norm": 0.23513691127300262, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3115, | |
| "num_input_tokens_seen": 336327464, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.9324570433130097, | |
| "grad_norm": 0.22470693290233612, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3214, | |
| "num_input_tokens_seen": 337241700, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.9349908939741864, | |
| "grad_norm": 0.24091310799121857, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3306, | |
| "num_input_tokens_seen": 338184552, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.9375247446353631, | |
| "grad_norm": 0.23601089417934418, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.2856, | |
| "num_input_tokens_seen": 339109296, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.9400585952965397, | |
| "grad_norm": 0.23559744656085968, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.293, | |
| "num_input_tokens_seen": 340010148, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.9425924459577164, | |
| "grad_norm": 0.2477143257856369, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3226, | |
| "num_input_tokens_seen": 340905016, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.945126296618893, | |
| "grad_norm": 0.2724590599536896, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3063, | |
| "num_input_tokens_seen": 341861552, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.9476601472800696, | |
| "grad_norm": 0.23112662136554718, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3099, | |
| "num_input_tokens_seen": 342806136, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.9501939979412464, | |
| "grad_norm": 0.2522134780883789, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.2874, | |
| "num_input_tokens_seen": 343741672, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.952727848602423, | |
| "grad_norm": 0.23056572675704956, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3069, | |
| "num_input_tokens_seen": 344641984, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.9552616992635996, | |
| "grad_norm": 0.2758452892303467, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.2951, | |
| "num_input_tokens_seen": 345553040, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.9577955499247763, | |
| "grad_norm": 0.2210364043712616, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.288, | |
| "num_input_tokens_seen": 346455716, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.960329400585953, | |
| "grad_norm": 0.24254508316516876, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3527, | |
| "num_input_tokens_seen": 347362188, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.9628632512471296, | |
| "grad_norm": 0.2317672073841095, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.2872, | |
| "num_input_tokens_seen": 348323636, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.9653971019083063, | |
| "grad_norm": 0.25921356678009033, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.326, | |
| "num_input_tokens_seen": 349187636, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.9679309525694829, | |
| "grad_norm": 0.24803981184959412, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.2919, | |
| "num_input_tokens_seen": 350146896, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.9704648032306596, | |
| "grad_norm": 0.27010080218315125, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3511, | |
| "num_input_tokens_seen": 351082648, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.9729986538918363, | |
| "grad_norm": 0.3154395520687103, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.328, | |
| "num_input_tokens_seen": 351973288, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.9755325045530129, | |
| "grad_norm": 0.27058759331703186, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.2797, | |
| "num_input_tokens_seen": 352899120, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.9780663552141896, | |
| "grad_norm": 0.22412972152233124, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3193, | |
| "num_input_tokens_seen": 353825356, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.9806002058753662, | |
| "grad_norm": 0.3295518755912781, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.324, | |
| "num_input_tokens_seen": 354778268, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.9831340565365428, | |
| "grad_norm": 0.20455938577651978, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3359, | |
| "num_input_tokens_seen": 355687292, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.9856679071977196, | |
| "grad_norm": 0.22574731707572937, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3081, | |
| "num_input_tokens_seen": 356581252, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.9882017578588962, | |
| "grad_norm": 0.25318706035614014, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3327, | |
| "num_input_tokens_seen": 357531400, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.9907356085200728, | |
| "grad_norm": 0.25423163175582886, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3269, | |
| "num_input_tokens_seen": 358429676, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.9932694591812495, | |
| "grad_norm": 0.23770791292190552, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.2942, | |
| "num_input_tokens_seen": 359328932, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.9958033098424262, | |
| "grad_norm": 0.23878265917301178, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3295, | |
| "num_input_tokens_seen": 360264552, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.9983371605036028, | |
| "grad_norm": 0.2264624685049057, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3224, | |
| "num_input_tokens_seen": 361179728, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.9998574709003089, | |
| "num_input_tokens_seen": 361724696, | |
| "step": 3946, | |
| "total_flos": 1.4115183327245763e+18, | |
| "train_loss": 1.3691888915302182, | |
| "train_runtime": 65409.2818, | |
| "train_samples_per_second": 15.446, | |
| "train_steps_per_second": 0.06 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3946, | |
| "num_input_tokens_seen": 361724696, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.4115183327245763e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |