{
  "best_global_step": 2200,
  "best_metric": 1.8803235292434692,
  "best_model_checkpoint": "/content/drive/MyDrive/hyperclova-deobfuscation-lora/checkpoint-2200",
  "epoch": 2.6666666666666665,
  "eval_steps": 200,
  "global_step": 3000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008888888888888889,
      "grad_norm": 3.629798412322998,
      "learning_rate": 1.8e-05,
      "loss": 4.1483,
      "mean_token_accuracy": 0.34797456339001653,
      "num_tokens": 11242.0,
      "step": 10
    },
    {
      "epoch": 0.017777777777777778,
      "grad_norm": 2.6125221252441406,
      "learning_rate": 3.8e-05,
      "loss": 3.7515,
      "mean_token_accuracy": 0.4058148756623268,
      "num_tokens": 22106.0,
      "step": 20
    },
    {
      "epoch": 0.02666666666666667,
      "grad_norm": 2.9313137531280518,
      "learning_rate": 5.8e-05,
      "loss": 3.3279,
      "mean_token_accuracy": 0.4703808955848217,
      "num_tokens": 33774.0,
      "step": 30
    },
    {
      "epoch": 0.035555555555555556,
      "grad_norm": 2.0496416091918945,
      "learning_rate": 7.800000000000001e-05,
      "loss": 2.9114,
      "mean_token_accuracy": 0.5239812344312668,
      "num_tokens": 44943.0,
      "step": 40
    },
    {
      "epoch": 0.044444444444444446,
      "grad_norm": 2.282668352127075,
      "learning_rate": 9.8e-05,
      "loss": 2.8468,
      "mean_token_accuracy": 0.534189497679472,
      "num_tokens": 56341.0,
      "step": 50
    },
    {
      "epoch": 0.05333333333333334,
      "grad_norm": 2.168651819229126,
      "learning_rate": 0.000118,
      "loss": 2.7785,
      "mean_token_accuracy": 0.5407359585165977,
      "num_tokens": 67397.0,
      "step": 60
    },
    {
      "epoch": 0.06222222222222222,
      "grad_norm": 2.289881467819214,
      "learning_rate": 0.000138,
      "loss": 2.736,
      "mean_token_accuracy": 0.5326176360249519,
      "num_tokens": 78482.0,
      "step": 70
    },
    {
      "epoch": 0.07111111111111111,
      "grad_norm": 2.1038105487823486,
      "learning_rate": 0.00015800000000000002,
      "loss": 2.5855,
      "mean_token_accuracy": 0.5618595249950886,
      "num_tokens": 89803.0,
      "step": 80
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.24312686920166,
      "learning_rate": 0.00017800000000000002,
      "loss": 2.5365,
      "mean_token_accuracy": 0.5661972932517528,
      "num_tokens": 101015.0,
      "step": 90
    },
    {
      "epoch": 0.08888888888888889,
      "grad_norm": 1.9482938051223755,
      "learning_rate": 0.00019800000000000002,
      "loss": 2.5634,
      "mean_token_accuracy": 0.5538406319916248,
      "num_tokens": 112364.0,
      "step": 100
    },
    {
      "epoch": 0.09777777777777778,
      "grad_norm": 1.86210298538208,
      "learning_rate": 0.00019945038167938932,
      "loss": 2.4629,
      "mean_token_accuracy": 0.5780388668179512,
      "num_tokens": 122882.0,
      "step": 110
    },
    {
      "epoch": 0.10666666666666667,
      "grad_norm": 1.8806918859481812,
      "learning_rate": 0.00019883969465648855,
      "loss": 2.5022,
      "mean_token_accuracy": 0.563551553338766,
      "num_tokens": 134028.0,
      "step": 120
    },
    {
      "epoch": 0.11555555555555555,
      "grad_norm": 2.3264434337615967,
      "learning_rate": 0.00019829007633587786,
      "loss": 2.4065,
      "mean_token_accuracy": 0.5807355619966984,
      "num_tokens": 145192.0,
      "step": 130
    },
    {
      "epoch": 0.12444444444444444,
      "grad_norm": 1.8537976741790771,
      "learning_rate": 0.00019767938931297712,
      "loss": 2.4838,
      "mean_token_accuracy": 0.566282794624567,
      "num_tokens": 156703.0,
      "step": 140
    },
    {
      "epoch": 0.13333333333333333,
      "grad_norm": 2.0960652828216553,
      "learning_rate": 0.00019706870229007636,
      "loss": 2.4119,
      "mean_token_accuracy": 0.5830203481018543,
      "num_tokens": 168041.0,
      "step": 150
    },
    {
      "epoch": 0.14222222222222222,
      "grad_norm": 2.2244813442230225,
      "learning_rate": 0.00019645801526717557,
      "loss": 2.3726,
      "mean_token_accuracy": 0.5844443172216416,
      "num_tokens": 178986.0,
      "step": 160
    },
    {
      "epoch": 0.1511111111111111,
      "grad_norm": 1.8238722085952759,
      "learning_rate": 0.0001958473282442748,
      "loss": 2.4419,
      "mean_token_accuracy": 0.5708602093160152,
      "num_tokens": 190391.0,
      "step": 170
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.7154136896133423,
      "learning_rate": 0.00019523664122137407,
      "loss": 2.4293,
      "mean_token_accuracy": 0.5748118035495281,
      "num_tokens": 201989.0,
      "step": 180
    },
    {
      "epoch": 0.1688888888888889,
      "grad_norm": 1.7582788467407227,
      "learning_rate": 0.0001946259541984733,
      "loss": 2.3577,
      "mean_token_accuracy": 0.5877166777849198,
      "num_tokens": 212914.0,
      "step": 190
    },
    {
      "epoch": 0.17777777777777778,
      "grad_norm": 1.8613263368606567,
      "learning_rate": 0.0001940152671755725,
      "loss": 2.3486,
      "mean_token_accuracy": 0.5889834299683571,
      "num_tokens": 223936.0,
      "step": 200
    },
    {
      "epoch": 0.17777777777777778,
      "eval_loss": 2.3320820331573486,
      "eval_mean_token_accuracy": 0.5868698905706405,
      "eval_num_tokens": 223936.0,
      "eval_runtime": 49.2429,
      "eval_samples_per_second": 20.307,
      "eval_steps_per_second": 10.154,
      "step": 200
    },
    {
      "epoch": 0.18666666666666668,
      "grad_norm": 1.8486477136611938,
      "learning_rate": 0.00019340458015267175,
      "loss": 2.3666,
      "mean_token_accuracy": 0.5847611322999,
      "num_tokens": 235036.0,
      "step": 210
    },
    {
      "epoch": 0.19555555555555557,
      "grad_norm": 2.018049478530884,
      "learning_rate": 0.000192793893129771,
      "loss": 2.2689,
      "mean_token_accuracy": 0.59971177354455,
      "num_tokens": 246101.0,
      "step": 220
    },
    {
      "epoch": 0.20444444444444446,
      "grad_norm": 1.7244890928268433,
      "learning_rate": 0.00019218320610687024,
      "loss": 2.3262,
      "mean_token_accuracy": 0.5855986528098583,
      "num_tokens": 257953.0,
      "step": 230
    },
    {
      "epoch": 0.21333333333333335,
      "grad_norm": 1.8928934335708618,
      "learning_rate": 0.00019157251908396948,
      "loss": 2.3318,
      "mean_token_accuracy": 0.5885626815259457,
      "num_tokens": 269187.0,
      "step": 240
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 1.7358920574188232,
      "learning_rate": 0.0001909618320610687,
      "loss": 2.2145,
      "mean_token_accuracy": 0.6092555984854698,
      "num_tokens": 279762.0,
      "step": 250
    },
    {
      "epoch": 0.2311111111111111,
      "grad_norm": 1.6779032945632935,
      "learning_rate": 0.00019035114503816795,
      "loss": 2.3152,
      "mean_token_accuracy": 0.584602715075016,
      "num_tokens": 291454.0,
      "step": 260
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.6310207843780518,
      "learning_rate": 0.0001897404580152672,
      "loss": 2.2669,
      "mean_token_accuracy": 0.5965895019471645,
      "num_tokens": 302969.0,
      "step": 270
    },
    {
      "epoch": 0.24888888888888888,
      "grad_norm": 1.6765615940093994,
      "learning_rate": 0.00018912977099236642,
      "loss": 2.269,
      "mean_token_accuracy": 0.5934441670775413,
      "num_tokens": 314204.0,
      "step": 280
    },
    {
      "epoch": 0.2577777777777778,
      "grad_norm": 1.793959617614746,
      "learning_rate": 0.00018851908396946566,
      "loss": 2.2554,
      "mean_token_accuracy": 0.600947193801403,
      "num_tokens": 325649.0,
      "step": 290
    },
    {
      "epoch": 0.26666666666666666,
      "grad_norm": 1.7492129802703857,
      "learning_rate": 0.0001879083969465649,
      "loss": 2.2157,
      "mean_token_accuracy": 0.6022505328059197,
      "num_tokens": 337167.0,
      "step": 300
    },
    {
      "epoch": 0.27555555555555555,
      "grad_norm": 1.803576946258545,
      "learning_rate": 0.00018729770992366413,
      "loss": 2.2854,
      "mean_token_accuracy": 0.5923042424023152,
      "num_tokens": 348621.0,
      "step": 310
    },
    {
      "epoch": 0.28444444444444444,
      "grad_norm": 1.9662351608276367,
      "learning_rate": 0.00018668702290076337,
      "loss": 2.2639,
      "mean_token_accuracy": 0.588193366676569,
      "num_tokens": 360272.0,
      "step": 320
    },
    {
      "epoch": 0.29333333333333333,
      "grad_norm": 1.6725891828536987,
      "learning_rate": 0.0001860763358778626,
      "loss": 2.2249,
      "mean_token_accuracy": 0.6054098337888718,
      "num_tokens": 371346.0,
      "step": 330
    },
    {
      "epoch": 0.3022222222222222,
      "grad_norm": 1.68416166305542,
      "learning_rate": 0.00018546564885496184,
      "loss": 2.1678,
      "mean_token_accuracy": 0.6146526508033275,
      "num_tokens": 382779.0,
      "step": 340
    },
    {
      "epoch": 0.3111111111111111,
      "grad_norm": 1.7218507528305054,
      "learning_rate": 0.00018485496183206108,
      "loss": 2.2011,
      "mean_token_accuracy": 0.6104303196072578,
      "num_tokens": 393823.0,
      "step": 350
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.6817256212234497,
      "learning_rate": 0.0001842442748091603,
      "loss": 2.2264,
      "mean_token_accuracy": 0.5987282857298851,
      "num_tokens": 405438.0,
      "step": 360
    },
    {
      "epoch": 0.3288888888888889,
      "grad_norm": 1.7454718351364136,
      "learning_rate": 0.00018363358778625955,
      "loss": 2.2712,
      "mean_token_accuracy": 0.5939777493476868,
      "num_tokens": 417299.0,
      "step": 370
    },
    {
      "epoch": 0.3377777777777778,
      "grad_norm": 2.011315107345581,
      "learning_rate": 0.00018302290076335878,
      "loss": 2.2247,
      "mean_token_accuracy": 0.6061037018895149,
      "num_tokens": 428660.0,
      "step": 380
    },
    {
      "epoch": 0.3466666666666667,
      "grad_norm": 1.6242053508758545,
      "learning_rate": 0.00018241221374045802,
      "loss": 2.232,
      "mean_token_accuracy": 0.6062197655439376,
      "num_tokens": 439768.0,
      "step": 390
    },
    {
      "epoch": 0.35555555555555557,
      "grad_norm": 1.9328559637069702,
      "learning_rate": 0.00018180152671755725,
      "loss": 2.1291,
      "mean_token_accuracy": 0.6168317429721355,
      "num_tokens": 450808.0,
      "step": 400
    },
    {
      "epoch": 0.35555555555555557,
      "eval_loss": 2.1662538051605225,
      "eval_mean_token_accuracy": 0.6099509916305542,
      "eval_num_tokens": 450808.0,
      "eval_runtime": 49.4213,
      "eval_samples_per_second": 20.234,
      "eval_steps_per_second": 10.117,
      "step": 400
    },
    {
      "epoch": 0.36444444444444446,
      "grad_norm": 1.8797143697738647,
      "learning_rate": 0.0001811908396946565,
      "loss": 2.2086,
      "mean_token_accuracy": 0.6012695133686066,
      "num_tokens": 461592.0,
      "step": 410
    },
    {
      "epoch": 0.37333333333333335,
      "grad_norm": 1.7558225393295288,
      "learning_rate": 0.00018058015267175575,
      "loss": 2.1771,
      "mean_token_accuracy": 0.6060668036341668,
      "num_tokens": 473434.0,
      "step": 420
    },
    {
      "epoch": 0.38222222222222224,
      "grad_norm": 1.845051884651184,
      "learning_rate": 0.00017996946564885496,
      "loss": 2.2576,
      "mean_token_accuracy": 0.5929104581475257,
      "num_tokens": 485130.0,
      "step": 430
    },
    {
      "epoch": 0.39111111111111113,
      "grad_norm": 1.6992298364639282,
      "learning_rate": 0.0001793587786259542,
      "loss": 2.1815,
      "mean_token_accuracy": 0.6100690707564353,
      "num_tokens": 496482.0,
      "step": 440
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.7239253520965576,
      "learning_rate": 0.00017874809160305343,
      "loss": 2.2082,
      "mean_token_accuracy": 0.6001435503363609,
      "num_tokens": 508218.0,
      "step": 450
    },
    {
      "epoch": 0.4088888888888889,
      "grad_norm": 1.7856336832046509,
      "learning_rate": 0.0001781374045801527,
      "loss": 2.1593,
      "mean_token_accuracy": 0.6118309393525123,
      "num_tokens": 519379.0,
      "step": 460
    },
    {
      "epoch": 0.4177777777777778,
      "grad_norm": 1.611831545829773,
      "learning_rate": 0.00017752671755725193,
      "loss": 2.1797,
      "mean_token_accuracy": 0.6033190444111824,
      "num_tokens": 530561.0,
      "step": 470
    },
    {
      "epoch": 0.4266666666666667,
      "grad_norm": 1.7420586347579956,
      "learning_rate": 0.00017691603053435114,
      "loss": 2.2027,
      "mean_token_accuracy": 0.6067790001630783,
      "num_tokens": 542631.0,
      "step": 480
    },
    {
      "epoch": 0.43555555555555553,
      "grad_norm": 1.948723316192627,
      "learning_rate": 0.00017630534351145038,
      "loss": 2.1753,
      "mean_token_accuracy": 0.6109650492668152,
      "num_tokens": 553477.0,
      "step": 490
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 1.7983819246292114,
      "learning_rate": 0.00017569465648854964,
      "loss": 2.158,
      "mean_token_accuracy": 0.5996212616562844,
      "num_tokens": 565400.0,
      "step": 500
    },
    {
      "epoch": 0.4533333333333333,
      "grad_norm": 1.842372179031372,
      "learning_rate": 0.00017508396946564888,
      "loss": 2.0825,
      "mean_token_accuracy": 0.6168116196990013,
      "num_tokens": 576953.0,
      "step": 510
    },
    {
      "epoch": 0.4622222222222222,
      "grad_norm": 1.91799795627594,
      "learning_rate": 0.00017447328244274809,
      "loss": 2.1022,
      "mean_token_accuracy": 0.6168905258178711,
      "num_tokens": 588003.0,
      "step": 520
    },
    {
      "epoch": 0.4711111111111111,
      "grad_norm": 1.7727124691009521,
      "learning_rate": 0.00017386259541984732,
      "loss": 2.1695,
      "mean_token_accuracy": 0.5997609972953797,
      "num_tokens": 600043.0,
      "step": 530
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.8602296113967896,
      "learning_rate": 0.00017325190839694658,
      "loss": 2.0849,
      "mean_token_accuracy": 0.6266478568315506,
      "num_tokens": 610974.0,
      "step": 540
    },
    {
      "epoch": 0.4888888888888889,
      "grad_norm": 1.545620083808899,
      "learning_rate": 0.00017264122137404582,
      "loss": 2.1824,
      "mean_token_accuracy": 0.6072694823145867,
      "num_tokens": 622632.0,
      "step": 550
    },
    {
      "epoch": 0.49777777777777776,
      "grad_norm": 1.7485988140106201,
      "learning_rate": 0.00017203053435114506,
      "loss": 2.1374,
      "mean_token_accuracy": 0.6164417043328285,
      "num_tokens": 634093.0,
      "step": 560
    },
    {
      "epoch": 0.5066666666666667,
      "grad_norm": 1.8591196537017822,
      "learning_rate": 0.00017141984732824426,
      "loss": 2.0928,
      "mean_token_accuracy": 0.6241554819047451,
      "num_tokens": 645226.0,
      "step": 570
    },
    {
      "epoch": 0.5155555555555555,
      "grad_norm": 1.8163517713546753,
      "learning_rate": 0.00017080916030534353,
      "loss": 2.0476,
      "mean_token_accuracy": 0.6285594403743744,
      "num_tokens": 656188.0,
      "step": 580
    },
    {
      "epoch": 0.5244444444444445,
      "grad_norm": 1.7729696035385132,
      "learning_rate": 0.00017019847328244276,
      "loss": 2.1036,
      "mean_token_accuracy": 0.6208315283060074,
      "num_tokens": 667642.0,
      "step": 590
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 1.7804032564163208,
      "learning_rate": 0.000169587786259542,
      "loss": 2.1174,
      "mean_token_accuracy": 0.6148250237107277,
      "num_tokens": 678769.0,
      "step": 600
    },
    {
      "epoch": 0.5333333333333333,
      "eval_loss": 2.0850696563720703,
      "eval_mean_token_accuracy": 0.6197466601729393,
      "eval_num_tokens": 678769.0,
      "eval_runtime": 49.7611,
      "eval_samples_per_second": 20.096,
      "eval_steps_per_second": 10.048,
      "step": 600
    },
    {
      "epoch": 0.5422222222222223,
      "grad_norm": 1.8643274307250977,
      "learning_rate": 0.00016897709923664124,
      "loss": 2.0485,
      "mean_token_accuracy": 0.6331146821379662,
      "num_tokens": 690014.0,
      "step": 610
    },
    {
      "epoch": 0.5511111111111111,
      "grad_norm": 1.8060939311981201,
      "learning_rate": 0.00016836641221374047,
      "loss": 2.1117,
      "mean_token_accuracy": 0.612041813135147,
      "num_tokens": 701734.0,
      "step": 620
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.7059085369110107,
      "learning_rate": 0.0001677557251908397,
      "loss": 2.0747,
      "mean_token_accuracy": 0.6174572542309761,
      "num_tokens": 713570.0,
      "step": 630
    },
    {
      "epoch": 0.5688888888888889,
      "grad_norm": 1.6600592136383057,
      "learning_rate": 0.00016714503816793894,
      "loss": 2.0685,
      "mean_token_accuracy": 0.6293445661664009,
      "num_tokens": 724815.0,
      "step": 640
    },
    {
      "epoch": 0.5777777777777777,
      "grad_norm": 1.6598913669586182,
      "learning_rate": 0.00016653435114503818,
      "loss": 2.0255,
      "mean_token_accuracy": 0.6309839904308319,
      "num_tokens": 735777.0,
      "step": 650
    },
    {
      "epoch": 0.5866666666666667,
      "grad_norm": 1.8306963443756104,
      "learning_rate": 0.00016592366412213741,
      "loss": 2.1249,
      "mean_token_accuracy": 0.6147443532943726,
      "num_tokens": 746903.0,
      "step": 660
    },
    {
      "epoch": 0.5955555555555555,
      "grad_norm": 1.626795768737793,
      "learning_rate": 0.00016531297709923665,
      "loss": 2.0694,
      "mean_token_accuracy": 0.6254988595843315,
      "num_tokens": 757881.0,
      "step": 670
    },
    {
      "epoch": 0.6044444444444445,
      "grad_norm": 1.710806131362915,
      "learning_rate": 0.00016470229007633589,
      "loss": 2.0397,
      "mean_token_accuracy": 0.6233279958367348,
      "num_tokens": 768982.0,
      "step": 680
    },
    {
      "epoch": 0.6133333333333333,
      "grad_norm": 1.7051280736923218,
      "learning_rate": 0.00016409160305343512,
      "loss": 2.116,
      "mean_token_accuracy": 0.6183760315179825,
      "num_tokens": 780072.0,
      "step": 690
    },
    {
      "epoch": 0.6222222222222222,
      "grad_norm": 1.607917070388794,
      "learning_rate": 0.00016348091603053436,
      "loss": 2.0478,
      "mean_token_accuracy": 0.6331974640488625,
      "num_tokens": 791061.0,
      "step": 700
    },
    {
      "epoch": 0.6311111111111111,
      "grad_norm": 1.7803592681884766,
      "learning_rate": 0.0001628702290076336,
      "loss": 2.0595,
      "mean_token_accuracy": 0.6249041527509689,
      "num_tokens": 801867.0,
      "step": 710
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.6132373809814453,
      "learning_rate": 0.00016225954198473283,
      "loss": 2.0789,
      "mean_token_accuracy": 0.6235784366726875,
      "num_tokens": 813112.0,
      "step": 720
    },
    {
      "epoch": 0.6488888888888888,
      "grad_norm": 1.790528655052185,
      "learning_rate": 0.00016164885496183207,
      "loss": 2.0632,
      "mean_token_accuracy": 0.6268924325704575,
      "num_tokens": 824133.0,
      "step": 730
    },
    {
      "epoch": 0.6577777777777778,
      "grad_norm": 2.0007362365722656,
      "learning_rate": 0.0001610381679389313,
      "loss": 2.0701,
      "mean_token_accuracy": 0.6189413338899612,
      "num_tokens": 835469.0,
      "step": 740
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 2.227158546447754,
      "learning_rate": 0.00016042748091603054,
      "loss": 2.0339,
      "mean_token_accuracy": 0.621903920173645,
      "num_tokens": 846572.0,
      "step": 750
    },
    {
      "epoch": 0.6755555555555556,
      "grad_norm": 1.80472731590271,
      "learning_rate": 0.00015981679389312977,
      "loss": 2.1285,
      "mean_token_accuracy": 0.604806374013424,
      "num_tokens": 857795.0,
      "step": 760
    },
    {
      "epoch": 0.6844444444444444,
      "grad_norm": 1.7893937826156616,
      "learning_rate": 0.000159206106870229,
      "loss": 2.0347,
      "mean_token_accuracy": 0.6292635962367058,
      "num_tokens": 868429.0,
      "step": 770
    },
    {
      "epoch": 0.6933333333333334,
      "grad_norm": 1.6761573553085327,
      "learning_rate": 0.00015859541984732824,
      "loss": 2.0591,
      "mean_token_accuracy": 0.6254431992769242,
      "num_tokens": 879659.0,
      "step": 780
    },
    {
      "epoch": 0.7022222222222222,
      "grad_norm": 1.803045630455017,
      "learning_rate": 0.0001579847328244275,
      "loss": 2.0293,
      "mean_token_accuracy": 0.6273573949933052,
      "num_tokens": 890911.0,
      "step": 790
    },
    {
      "epoch": 0.7111111111111111,
      "grad_norm": 1.7385220527648926,
      "learning_rate": 0.00015737404580152672,
      "loss": 2.0197,
      "mean_token_accuracy": 0.63025072067976,
      "num_tokens": 902240.0,
      "step": 800
    },
    {
      "epoch": 0.7111111111111111,
      "eval_loss": 2.0297935009002686,
      "eval_mean_token_accuracy": 0.628437293112278,
      "eval_num_tokens": 902240.0,
      "eval_runtime": 49.3011,
      "eval_samples_per_second": 20.284,
      "eval_steps_per_second": 10.142,
      "step": 800
    },
    {
      "epoch": 0.72,
      "grad_norm": 1.8906656503677368,
      "learning_rate": 0.00015676335877862595,
      "loss": 2.0806,
      "mean_token_accuracy": 0.619849094748497,
      "num_tokens": 914009.0,
      "step": 810
    },
    {
      "epoch": 0.7288888888888889,
      "grad_norm": 1.714268684387207,
      "learning_rate": 0.0001561526717557252,
      "loss": 2.0343,
      "mean_token_accuracy": 0.632188580930233,
      "num_tokens": 925091.0,
      "step": 820
    },
    {
      "epoch": 0.7377777777777778,
      "grad_norm": 1.833918809890747,
      "learning_rate": 0.00015554198473282445,
      "loss": 2.0747,
      "mean_token_accuracy": 0.6280180156230927,
      "num_tokens": 936675.0,
      "step": 830
    },
    {
      "epoch": 0.7466666666666667,
      "grad_norm": 1.9817575216293335,
      "learning_rate": 0.00015493129770992366,
      "loss": 2.0859,
      "mean_token_accuracy": 0.6128378361463547,
      "num_tokens": 948151.0,
      "step": 840
    },
    {
      "epoch": 0.7555555555555555,
      "grad_norm": 1.5982656478881836,
      "learning_rate": 0.0001543206106870229,
      "loss": 2.0455,
      "mean_token_accuracy": 0.6276382938027382,
      "num_tokens": 959266.0,
      "step": 850
    },
    {
      "epoch": 0.7644444444444445,
      "grad_norm": 1.7298970222473145,
      "learning_rate": 0.00015370992366412213,
      "loss": 1.9604,
      "mean_token_accuracy": 0.6377590849995614,
      "num_tokens": 970339.0,
      "step": 860
    },
    {
      "epoch": 0.7733333333333333,
      "grad_norm": 1.8064581155776978,
      "learning_rate": 0.0001530992366412214,
      "loss": 2.0698,
      "mean_token_accuracy": 0.6194617792963981,
      "num_tokens": 981805.0,
      "step": 870
    },
    {
      "epoch": 0.7822222222222223,
      "grad_norm": 1.5860410928726196,
      "learning_rate": 0.00015248854961832063,
      "loss": 2.0182,
      "mean_token_accuracy": 0.6292306095361709,
      "num_tokens": 993552.0,
      "step": 880
    },
    {
      "epoch": 0.7911111111111111,
      "grad_norm": 1.8761259317398071,
      "learning_rate": 0.00015187786259541984,
      "loss": 2.0335,
      "mean_token_accuracy": 0.6285651385784149,
      "num_tokens": 1004400.0,
      "step": 890
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.6973590850830078,
      "learning_rate": 0.00015126717557251908,
      "loss": 2.0927,
      "mean_token_accuracy": 0.6183614790439605,
      "num_tokens": 1015564.0,
      "step": 900
    },
    {
      "epoch": 0.8088888888888889,
      "grad_norm": 1.6477675437927246,
      "learning_rate": 0.00015065648854961834,
      "loss": 1.9187,
      "mean_token_accuracy": 0.6427812784910202,
      "num_tokens": 1026849.0,
      "step": 910
    },
    {
      "epoch": 0.8177777777777778,
      "grad_norm": 1.6942589282989502,
      "learning_rate": 0.00015004580152671757,
      "loss": 2.0139,
      "mean_token_accuracy": 0.6322552219033242,
      "num_tokens": 1037721.0,
      "step": 920
    },
    {
      "epoch": 0.8266666666666667,
      "grad_norm": 1.6394822597503662,
      "learning_rate": 0.0001494351145038168,
      "loss": 2.0392,
      "mean_token_accuracy": 0.6273665294051171,
      "num_tokens": 1048986.0,
      "step": 930
    },
    {
      "epoch": 0.8355555555555556,
      "grad_norm": 1.697804570198059,
      "learning_rate": 0.00014882442748091602,
      "loss": 2.0412,
      "mean_token_accuracy": 0.625536386668682,
      "num_tokens": 1060627.0,
      "step": 940
    },
    {
      "epoch": 0.8444444444444444,
      "grad_norm": 1.8058092594146729,
      "learning_rate": 0.00014821374045801528,
      "loss": 1.9737,
      "mean_token_accuracy": 0.6332821652293206,
      "num_tokens": 1071482.0,
      "step": 950
    },
    {
      "epoch": 0.8533333333333334,
      "grad_norm": 1.773294448852539,
      "learning_rate": 0.00014760305343511452,
      "loss": 2.054,
      "mean_token_accuracy": 0.6256278708577157,
      "num_tokens": 1082672.0,
      "step": 960
    },
    {
      "epoch": 0.8622222222222222,
      "grad_norm": 1.6936707496643066,
      "learning_rate": 0.00014699236641221375,
      "loss": 1.9957,
      "mean_token_accuracy": 0.6333451583981514,
      "num_tokens": 1093493.0,
      "step": 970
    },
    {
      "epoch": 0.8711111111111111,
      "grad_norm": 1.7029008865356445,
      "learning_rate": 0.000146381679389313,
      "loss": 2.0526,
      "mean_token_accuracy": 0.6244132176041604,
      "num_tokens": 1104857.0,
      "step": 980
    },
    {
      "epoch": 0.88,
      "grad_norm": 1.8421082496643066,
      "learning_rate": 0.00014577099236641223,
      "loss": 2.0311,
      "mean_token_accuracy": 0.6236826583743096,
      "num_tokens": 1116131.0,
      "step": 990
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 1.646053433418274,
      "learning_rate": 0.00014516030534351146,
      "loss": 1.9973,
      "mean_token_accuracy": 0.6274659112095833,
      "num_tokens": 1127612.0,
      "step": 1000
    },
    {
      "epoch": 0.8888888888888888,
      "eval_loss": 1.989682674407959,
      "eval_mean_token_accuracy": 0.633990108013153,
      "eval_num_tokens": 1127612.0,
      "eval_runtime": 49.3043,
      "eval_samples_per_second": 20.282,
      "eval_steps_per_second": 10.141,
      "step": 1000
    },
    {
      "epoch": 0.8977777777777778,
      "grad_norm": 1.5941271781921387,
      "learning_rate": 0.0001445496183206107,
      "loss": 2.0579,
      "mean_token_accuracy": 0.6256210282444954,
      "num_tokens": 1138866.0,
      "step": 1010
    },
    {
      "epoch": 0.9066666666666666,
      "grad_norm": 1.7826253175735474,
      "learning_rate": 0.00014393893129770993,
      "loss": 1.9866,
      "mean_token_accuracy": 0.6332772478461266,
      "num_tokens": 1150411.0,
      "step": 1020
    },
    {
      "epoch": 0.9155555555555556,
      "grad_norm": 1.8722221851348877,
      "learning_rate": 0.00014332824427480917,
      "loss": 2.0398,
      "mean_token_accuracy": 0.627329595386982,
      "num_tokens": 1161360.0,
      "step": 1030
    },
    {
      "epoch": 0.9244444444444444,
      "grad_norm": 1.6533294916152954,
      "learning_rate": 0.0001427175572519084,
      "loss": 2.0271,
      "mean_token_accuracy": 0.6259514302015304,
      "num_tokens": 1172683.0,
      "step": 1040
    },
    {
      "epoch": 0.9333333333333333,
      "grad_norm": 1.5746543407440186,
      "learning_rate": 0.00014210687022900764,
      "loss": 1.9634,
      "mean_token_accuracy": 0.6359310179948807,
      "num_tokens": 1183277.0,
      "step": 1050
    },
    {
      "epoch": 0.9422222222222222,
      "grad_norm": 1.6094276905059814,
      "learning_rate": 0.00014149618320610688,
      "loss": 1.9195,
      "mean_token_accuracy": 0.649330523610115,
      "num_tokens": 1194160.0,
      "step": 1060
    },
    {
      "epoch": 0.9511111111111111,
      "grad_norm": 1.9643882513046265,
      "learning_rate": 0.0001408854961832061,
      "loss": 2.0042,
      "mean_token_accuracy": 0.6356254667043686,
      "num_tokens": 1205308.0,
      "step": 1070
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.8238948583602905,
      "learning_rate": 0.00014027480916030535,
      "loss": 1.9172,
      "mean_token_accuracy": 0.6497033536434174,
      "num_tokens": 1215760.0,
      "step": 1080
    },
    {
      "epoch": 0.9688888888888889,
      "grad_norm": 1.7422380447387695,
      "learning_rate": 0.00013966412213740458,
      "loss": 2.0213,
      "mean_token_accuracy": 0.6309294819831848,
      "num_tokens": 1226775.0,
      "step": 1090
    },
    {
      "epoch": 0.9777777777777777,
      "grad_norm": 1.651795744895935,
      "learning_rate": 0.00013905343511450382,
      "loss": 2.033,
      "mean_token_accuracy": 0.6295390352606773,
      "num_tokens": 1238191.0,
      "step": 1100
    },
    {
      "epoch": 0.9866666666666667,
      "grad_norm": 1.673543095588684,
      "learning_rate": 0.00013844274809160308,
      "loss": 2.0085,
      "mean_token_accuracy": 0.6329691678285598,
      "num_tokens": 1249561.0,
      "step": 1110
    },
    {
      "epoch": 0.9955555555555555,
      "grad_norm": 1.7423163652420044,
      "learning_rate": 0.0001378320610687023,
      "loss": 1.9751,
      "mean_token_accuracy": 0.6307685926556588,
      "num_tokens": 1260429.0,
      "step": 1120
    },
    {
      "epoch": 1.0044444444444445,
      "grad_norm": 1.4878981113433838,
      "learning_rate": 0.00013722137404580153,
      "loss": 1.9171,
      "mean_token_accuracy": 0.644737622141838,
      "num_tokens": 1271111.0,
      "step": 1130
    },
    {
      "epoch": 1.0133333333333334,
      "grad_norm": 1.5343797206878662,
      "learning_rate": 0.00013661068702290076,
      "loss": 1.8544,
      "mean_token_accuracy": 0.6503374725580215,
      "num_tokens": 1282434.0,
      "step": 1140
    },
    {
      "epoch": 1.0222222222222221,
      "grad_norm": 1.5450340509414673,
      "learning_rate": 0.00013600000000000003,
      "loss": 1.828,
      "mean_token_accuracy": 0.6514182686805725,
      "num_tokens": 1294382.0,
      "step": 1150
    },
    {
      "epoch": 1.031111111111111,
      "grad_norm": 1.8313877582550049,
      "learning_rate": 0.00013538931297709923,
      "loss": 1.7704,
      "mean_token_accuracy": 0.6693721905350685,
      "num_tokens": 1305343.0,
      "step": 1160
    },
    {
      "epoch": 1.04,
      "grad_norm": 1.8418430089950562,
      "learning_rate": 0.00013477862595419847,
      "loss": 1.7591,
      "mean_token_accuracy": 0.67226582467556,
      "num_tokens": 1316558.0,
      "step": 1170
    },
    {
      "epoch": 1.048888888888889,
      "grad_norm": 1.6022825241088867,
      "learning_rate": 0.0001341679389312977,
      "loss": 1.8048,
      "mean_token_accuracy": 0.6629651457071304,
      "num_tokens": 1327938.0,
      "step": 1180
    },
    {
      "epoch": 1.0577777777777777,
      "grad_norm": 1.5888707637786865,
      "learning_rate": 0.00013355725190839697,
      "loss": 1.773,
      "mean_token_accuracy": 0.6730352655053139,
      "num_tokens": 1338732.0,
      "step": 1190
    },
    {
      "epoch": 1.0666666666666667,
      "grad_norm": 1.833946943283081,
      "learning_rate": 0.0001329465648854962,
      "loss": 1.7887,
      "mean_token_accuracy": 0.6616317644715309,
      "num_tokens": 1350096.0,
      "step": 1200
    },
    {
      "epoch": 1.0666666666666667,
      "eval_loss": 1.9697085618972778,
      "eval_mean_token_accuracy": 0.6378205664157868,
      "eval_num_tokens": 1350096.0,
      "eval_runtime": 49.9237,
      "eval_samples_per_second": 20.031,
      "eval_steps_per_second": 10.015,
      "step": 1200
    },
    {
      "epoch": 1.0755555555555556,
      "grad_norm": 1.6338160037994385,
      "learning_rate": 0.00013233587786259541,
      "loss": 1.7889,
      "mean_token_accuracy": 0.6668319672346115,
      "num_tokens": 1360771.0,
      "step": 1210
    },
    {
      "epoch": 1.0844444444444445,
      "grad_norm": 1.8737561702728271,
      "learning_rate": 0.00013172519083969465,
      "loss": 1.7997,
      "mean_token_accuracy": 0.6570939287543297,
      "num_tokens": 1372450.0,
      "step": 1220
    },
    {
      "epoch": 1.0933333333333333,
      "grad_norm": 1.758074402809143,
      "learning_rate": 0.0001311145038167939,
      "loss": 1.8457,
      "mean_token_accuracy": 0.653074924647808,
      "num_tokens": 1383711.0,
      "step": 1230
    },
    {
      "epoch": 1.1022222222222222,
      "grad_norm": 1.839158296585083,
      "learning_rate": 0.00013050381679389315,
      "loss": 1.8013,
      "mean_token_accuracy": 0.6608111187815666,
      "num_tokens": 1394856.0,
      "step": 1240
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 1.733567476272583,
      "learning_rate": 0.00012989312977099238,
      "loss": 1.7814,
      "mean_token_accuracy": 0.6655508041381836,
      "num_tokens": 1406193.0,
      "step": 1250
    },
    {
      "epoch": 1.12,
      "grad_norm": 1.6274900436401367,
      "learning_rate": 0.0001292824427480916,
      "loss": 1.858,
      "mean_token_accuracy": 0.6488608077168465,
      "num_tokens": 1417607.0,
      "step": 1260
    },
    {
      "epoch": 1.1288888888888888,
      "grad_norm": 1.690090537071228,
      "learning_rate": 0.00012867175572519086,
      "loss": 1.8256,
      "mean_token_accuracy": 0.6595686703920365,
      "num_tokens": 1429073.0,
      "step": 1270
    },
    {
      "epoch": 1.1377777777777778,
      "grad_norm": 1.6638071537017822,
      "learning_rate": 0.0001280610687022901,
      "loss": 1.8334,
      "mean_token_accuracy": 0.6580470725893974,
      "num_tokens": 1440194.0,
      "step": 1280
    },
    {
      "epoch": 1.1466666666666667,
      "grad_norm": 1.8339307308197021,
      "learning_rate": 0.00012745038167938933,
      "loss": 1.783,
      "mean_token_accuracy": 0.6632378786802292,
      "num_tokens": 1451221.0,
      "step": 1290
    },
    {
      "epoch": 1.1555555555555554,
      "grad_norm": 1.7621415853500366,
      "learning_rate": 0.00012683969465648854,
      "loss": 1.844,
      "mean_token_accuracy": 0.6506654173135757,
      "num_tokens": 1462493.0,
      "step": 1300
    },
    {
      "epoch": 1.1644444444444444,
      "grad_norm": 1.7811567783355713,
      "learning_rate": 0.00012622900763358777,
      "loss": 1.8235,
      "mean_token_accuracy": 0.6505810797214509,
      "num_tokens": 1473710.0,
      "step": 1310
    },
    {
      "epoch": 1.1733333333333333,
      "grad_norm": 1.9157836437225342,
      "learning_rate": 0.00012561832061068704,
      "loss": 1.8885,
      "mean_token_accuracy": 0.6459546625614166,
      "num_tokens": 1485215.0,
      "step": 1320
    },
    {
      "epoch": 1.1822222222222223,
      "grad_norm": 1.6572569608688354,
      "learning_rate": 0.00012500763358778627,
      "loss": 1.813,
      "mean_token_accuracy": 0.6597578257322312,
      "num_tokens": 1496371.0,
      "step": 1330
    },
    {
      "epoch": 1.1911111111111112,
      "grad_norm": 1.8602449893951416,
      "learning_rate": 0.0001243969465648855,
      "loss": 1.8179,
      "mean_token_accuracy": 0.6519266426563263,
      "num_tokens": 1508348.0,
      "step": 1340
    },
    {
      "epoch": 1.2,
      "grad_norm": 1.8736369609832764,
      "learning_rate": 0.00012378625954198472,
      "loss": 1.8029,
      "mean_token_accuracy": 0.6621162816882133,
      "num_tokens": 1519322.0,
      "step": 1350
    },
    {
      "epoch": 1.208888888888889,
      "grad_norm": 2.026744842529297,
      "learning_rate": 0.00012317557251908398,
      "loss": 1.8168,
      "mean_token_accuracy": 0.6635635286569596,
      "num_tokens": 1530183.0,
      "step": 1360
    },
    {
      "epoch": 1.2177777777777778,
      "grad_norm": 1.7360782623291016,
      "learning_rate": 0.00012256488549618322,
      "loss": 1.7521,
      "mean_token_accuracy": 0.6706348299980164,
      "num_tokens": 1540862.0,
      "step": 1370
    },
    {
      "epoch": 1.2266666666666666,
      "grad_norm": 1.9620578289031982,
      "learning_rate": 0.00012195419847328244,
      "loss": 1.8228,
      "mean_token_accuracy": 0.6569086670875549,
      "num_tokens": 1552212.0,
      "step": 1380
    },
    {
      "epoch": 1.2355555555555555,
      "grad_norm": 1.6294327974319458,
      "learning_rate": 0.00012134351145038167,
      "loss": 1.7654,
      "mean_token_accuracy": 0.6697377026081085,
      "num_tokens": 1563356.0,
      "step": 1390
    },
    {
      "epoch": 1.2444444444444445,
      "grad_norm": 1.7311524152755737,
      "learning_rate": 0.00012073282442748092,
      "loss": 1.9019,
      "mean_token_accuracy": 0.6457875579595566,
      "num_tokens": 1574569.0,
      "step": 1400
    },
    {
      "epoch": 1.2444444444444445,
      "eval_loss": 1.9411770105361938,
      "eval_mean_token_accuracy": 0.6407178282737732,
      "eval_num_tokens": 1574569.0,
      "eval_runtime": 48.3309,
      "eval_samples_per_second": 20.691,
      "eval_steps_per_second": 10.345,
      "step": 1400
    },
    {
      "epoch": 1.2533333333333334,
      "grad_norm": 1.8629728555679321,
      "learning_rate": 0.00012012213740458016,
      "loss": 1.7585,
      "mean_token_accuracy": 0.671015702188015,
      "num_tokens": 1585308.0,
      "step": 1410
    },
    {
      "epoch": 1.2622222222222224,
      "grad_norm": 1.958808183670044,
      "learning_rate": 0.0001195114503816794,
      "loss": 1.8479,
      "mean_token_accuracy": 0.6535898372530937,
      "num_tokens": 1596886.0,
      "step": 1420
    },
    {
      "epoch": 1.271111111111111,
      "grad_norm": 1.950421690940857,
      "learning_rate": 0.00011890076335877862,
      "loss": 1.8173,
      "mean_token_accuracy": 0.6655478686094284,
      "num_tokens": 1607683.0,
      "step": 1430
    },
    {
      "epoch": 1.28,
      "grad_norm": 1.8152872323989868,
      "learning_rate": 0.00011829007633587788,
      "loss": 1.8791,
      "mean_token_accuracy": 0.6531546950340271,
      "num_tokens": 1618906.0,
      "step": 1440
    },
    {
      "epoch": 1.2888888888888888,
      "grad_norm": 1.7857719659805298,
      "learning_rate": 0.0001176793893129771,
      "loss": 1.7887,
      "mean_token_accuracy": 0.6610255971550941,
      "num_tokens": 1629981.0,
      "step": 1450
    },
    {
      "epoch": 1.2977777777777777,
      "grad_norm": 1.8434971570968628,
      "learning_rate": 0.00011706870229007634,
      "loss": 1.8368,
      "mean_token_accuracy": 0.653369964659214,
      "num_tokens": 1641429.0,
      "step": 1460
    },
    {
      "epoch": 1.3066666666666666,
      "grad_norm": 1.8877320289611816,
      "learning_rate": 0.00011645801526717557,
      "loss": 1.7938,
      "mean_token_accuracy": 0.6639183640480042,
      "num_tokens": 1652601.0,
      "step": 1470
    },
    {
      "epoch": 1.3155555555555556,
      "grad_norm": 1.8121625185012817,
      "learning_rate": 0.00011584732824427482,
      "loss": 1.7862,
      "mean_token_accuracy": 0.661414910852909,
      "num_tokens": 1663837.0,
      "step": 1480
    },
    {
      "epoch": 1.3244444444444445,
      "grad_norm": 1.7919855117797852,
      "learning_rate": 0.00011523664122137406,
      "loss": 1.8148,
      "mean_token_accuracy": 0.6654411420226097,
      "num_tokens": 1675018.0,
      "step": 1490
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 1.828735589981079,
      "learning_rate": 0.00011462595419847328,
      "loss": 1.8456,
      "mean_token_accuracy": 0.6496043875813484,
      "num_tokens": 1686136.0,
      "step": 1500
    },
    {
      "epoch": 1.3422222222222222,
      "grad_norm": 1.9462794065475464,
      "learning_rate": 0.00011401526717557252,
      "loss": 1.8412,
      "mean_token_accuracy": 0.6603908941149712,
      "num_tokens": 1697160.0,
      "step": 1510
    },
    {
      "epoch": 1.3511111111111112,
      "grad_norm": 1.6794313192367554,
      "learning_rate": 0.00011340458015267177,
      "loss": 1.7774,
      "mean_token_accuracy": 0.6664682924747467,
      "num_tokens": 1707831.0,
      "step": 1520
    },
    {
      "epoch": 1.3599999999999999,
      "grad_norm": 1.8189337253570557,
      "learning_rate": 0.000112793893129771,
      "loss": 1.8031,
      "mean_token_accuracy": 0.6627006307244301,
      "num_tokens": 1719074.0,
      "step": 1530
    },
    {
      "epoch": 1.3688888888888888,
      "grad_norm": 2.073533296585083,
      "learning_rate": 0.00011218320610687022,
      "loss": 1.8657,
      "mean_token_accuracy": 0.6476830393075943,
      "num_tokens": 1730388.0,
      "step": 1540
    },
    {
      "epoch": 1.3777777777777778,
      "grad_norm": 2.1564207077026367,
      "learning_rate": 0.00011157251908396946,
      "loss": 1.8261,
      "mean_token_accuracy": 0.6567840203642845,
      "num_tokens": 1741806.0,
      "step": 1550
    },
    {
      "epoch": 1.3866666666666667,
      "grad_norm": 1.6113232374191284,
      "learning_rate": 0.00011096183206106871,
      "loss": 1.7753,
      "mean_token_accuracy": 0.6659888163208961,
      "num_tokens": 1753313.0,
      "step": 1560
    },
    {
      "epoch": 1.3955555555555557,
      "grad_norm": 1.8112174272537231,
      "learning_rate": 0.00011035114503816795,
      "loss": 1.8046,
      "mean_token_accuracy": 0.6593015149235726,
      "num_tokens": 1765144.0,
      "step": 1570
    },
    {
      "epoch": 1.4044444444444444,
      "grad_norm": 1.8377541303634644,
      "learning_rate": 0.00010974045801526718,
      "loss": 1.8848,
      "mean_token_accuracy": 0.6533517614006996,
      "num_tokens": 1776783.0,
      "step": 1580
    },
    {
      "epoch": 1.4133333333333333,
      "grad_norm": 1.8384325504302979,
      "learning_rate": 0.0001091297709923664,
      "loss": 1.7669,
      "mean_token_accuracy": 0.6613995045423507,
      "num_tokens": 1788274.0,
      "step": 1590
    },
    {
      "epoch": 1.4222222222222223,
      "grad_norm": 1.8124533891677856,
      "learning_rate": 0.00010851908396946567,
      "loss": 1.8164,
      "mean_token_accuracy": 0.6591159239411354,
      "num_tokens": 1799707.0,
      "step": 1600
    },
    {
      "epoch": 1.4222222222222223,
      "eval_loss": 1.9286668300628662,
      "eval_mean_token_accuracy": 0.6434953879117966,
      "eval_num_tokens": 1799707.0,
      "eval_runtime": 48.6198,
      "eval_samples_per_second": 20.568,
      "eval_steps_per_second": 10.284,
      "step": 1600
    },
    {
      "epoch": 1.431111111111111,
      "grad_norm": 1.6931661367416382,
      "learning_rate": 0.00010790839694656489,
      "loss": 1.7548,
      "mean_token_accuracy": 0.664087076485157,
      "num_tokens": 1810865.0,
      "step": 1610
    },
    {
      "epoch": 1.44,
      "grad_norm": 1.7501254081726074,
      "learning_rate": 0.00010729770992366413,
      "loss": 1.7652,
      "mean_token_accuracy": 0.6640020117163659,
      "num_tokens": 1821807.0,
      "step": 1620
    },
    {
      "epoch": 1.448888888888889,
      "grad_norm": 1.8411732912063599,
      "learning_rate": 0.00010668702290076336,
      "loss": 1.831,
      "mean_token_accuracy": 0.6564242169260979,
      "num_tokens": 1832886.0,
      "step": 1630
    },
    {
      "epoch": 1.4577777777777778,
      "grad_norm": 2.003892183303833,
      "learning_rate": 0.00010607633587786261,
      "loss": 1.7791,
      "mean_token_accuracy": 0.6632592365145683,
      "num_tokens": 1843989.0,
      "step": 1640
    },
    {
      "epoch": 1.4666666666666668,
      "grad_norm": 1.7987340688705444,
      "learning_rate": 0.00010546564885496185,
      "loss": 1.7627,
      "mean_token_accuracy": 0.6713873609900475,
      "num_tokens": 1855106.0,
      "step": 1650
    },
    {
      "epoch": 1.4755555555555555,
      "grad_norm": 1.931877851486206,
      "learning_rate": 0.00010485496183206107,
      "loss": 1.7976,
      "mean_token_accuracy": 0.6631382897496223,
      "num_tokens": 1866900.0,
      "step": 1660
    },
    {
      "epoch": 1.4844444444444445,
      "grad_norm": 1.7883687019348145,
      "learning_rate": 0.0001042442748091603,
      "loss": 1.7671,
      "mean_token_accuracy": 0.6675158813595772,
      "num_tokens": 1877911.0,
      "step": 1670
    },
    {
      "epoch": 1.4933333333333334,
      "grad_norm": 1.8195563554763794,
      "learning_rate": 0.00010363358778625955,
      "loss": 1.8346,
      "mean_token_accuracy": 0.652577318251133,
      "num_tokens": 1889580.0,
      "step": 1680
    },
    {
      "epoch": 1.5022222222222221,
      "grad_norm": 1.7439149618148804,
      "learning_rate": 0.00010302290076335879,
      "loss": 1.7476,
      "mean_token_accuracy": 0.6717594474554062,
      "num_tokens": 1901133.0,
      "step": 1690
    },
    {
      "epoch": 1.511111111111111,
      "grad_norm": 1.8155314922332764,
      "learning_rate": 0.00010241221374045801,
      "loss": 1.8044,
      "mean_token_accuracy": 0.6617274522781372,
      "num_tokens": 1911796.0,
      "step": 1700
    },
    {
      "epoch": 1.52,
      "grad_norm": 1.7685112953186035,
      "learning_rate": 0.00010180152671755725,
      "loss": 1.7727,
      "mean_token_accuracy": 0.665304908156395,
      "num_tokens": 1923217.0,
      "step": 1710
    },
    {
      "epoch": 1.528888888888889,
      "grad_norm": 1.737053632736206,
      "learning_rate": 0.0001011908396946565,
      "loss": 1.8345,
      "mean_token_accuracy": 0.6577870160341263,
      "num_tokens": 1934355.0,
      "step": 1720
    },
    {
      "epoch": 1.537777777777778,
      "grad_norm": 1.9686291217803955,
      "learning_rate": 0.00010058015267175573,
      "loss": 1.8165,
      "mean_token_accuracy": 0.6594037398695946,
      "num_tokens": 1945653.0,
      "step": 1730
    },
    {
      "epoch": 1.5466666666666666,
      "grad_norm": 1.844651699066162,
      "learning_rate": 9.996946564885497e-05,
      "loss": 1.8273,
      "mean_token_accuracy": 0.6566928923130035,
      "num_tokens": 1956891.0,
      "step": 1740
    },
    {
      "epoch": 1.5555555555555556,
      "grad_norm": 1.8607743978500366,
      "learning_rate": 9.93587786259542e-05,
      "loss": 1.785,
      "mean_token_accuracy": 0.6692357853055,
      "num_tokens": 1967789.0,
      "step": 1750
    },
    {
      "epoch": 1.5644444444444443,
      "grad_norm": 1.9204373359680176,
      "learning_rate": 9.874809160305344e-05,
      "loss": 1.8264,
      "mean_token_accuracy": 0.6549209818243981,
      "num_tokens": 1979224.0,
      "step": 1760
    },
    {
      "epoch": 1.5733333333333333,
      "grad_norm": 1.7754265069961548,
      "learning_rate": 9.813740458015268e-05,
      "loss": 1.7467,
      "mean_token_accuracy": 0.6670090600848197,
      "num_tokens": 1990255.0,
      "step": 1770
    },
    {
      "epoch": 1.5822222222222222,
      "grad_norm": 2.069091796875,
      "learning_rate": 9.752671755725191e-05,
      "loss": 1.7731,
      "mean_token_accuracy": 0.6609751120209694,
      "num_tokens": 2001606.0,
      "step": 1780
    },
    {
      "epoch": 1.5911111111111111,
      "grad_norm": 2.1375646591186523,
      "learning_rate": 9.691603053435115e-05,
      "loss": 1.8009,
      "mean_token_accuracy": 0.6624869346618653,
      "num_tokens": 2012912.0,
      "step": 1790
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.5623434782028198,
      "learning_rate": 9.630534351145038e-05,
      "loss": 1.7383,
      "mean_token_accuracy": 0.6694582119584084,
      "num_tokens": 2024571.0,
      "step": 1800
    },
    {
      "epoch": 1.6,
      "eval_loss": 1.90510892868042,
      "eval_mean_token_accuracy": 0.6464553346633911,
      "eval_num_tokens": 2024571.0,
      "eval_runtime": 48.9449,
      "eval_samples_per_second": 20.431,
      "eval_steps_per_second": 10.216,
      "step": 1800
    },
    {
      "epoch": 1.608888888888889,
      "grad_norm": 1.745969295501709,
      "learning_rate": 9.569465648854963e-05,
      "loss": 1.7552,
      "mean_token_accuracy": 0.6786300778388977,
      "num_tokens": 2035783.0,
      "step": 1810
    },
    {
      "epoch": 1.6177777777777778,
      "grad_norm": 1.7463303804397583,
      "learning_rate": 9.508396946564886e-05,
      "loss": 1.7495,
      "mean_token_accuracy": 0.6666959136724472,
      "num_tokens": 2047304.0,
      "step": 1820
    },
    {
      "epoch": 1.6266666666666667,
      "grad_norm": 1.9058139324188232,
      "learning_rate": 9.44732824427481e-05,
      "loss": 1.8365,
      "mean_token_accuracy": 0.6536470741033554,
      "num_tokens": 2058792.0,
      "step": 1830
    },
    {
      "epoch": 1.6355555555555554,
      "grad_norm": 2.065488576889038,
      "learning_rate": 9.386259541984733e-05,
      "loss": 1.7939,
      "mean_token_accuracy": 0.6519258007407188,
      "num_tokens": 2070175.0,
      "step": 1840
    },
    {
      "epoch": 1.6444444444444444,
      "grad_norm": 1.778023600578308,
      "learning_rate": 9.325190839694658e-05,
      "loss": 1.8155,
      "mean_token_accuracy": 0.655296416580677,
      "num_tokens": 2081343.0,
      "step": 1850
    },
    {
      "epoch": 1.6533333333333333,
      "grad_norm": 1.7437517642974854,
      "learning_rate": 9.26412213740458e-05,
      "loss": 1.7996,
      "mean_token_accuracy": 0.6618543311953544,
      "num_tokens": 2093074.0,
      "step": 1860
    },
    {
      "epoch": 1.6622222222222223,
      "grad_norm": 1.7666471004486084,
      "learning_rate": 9.203053435114505e-05,
      "loss": 1.7658,
      "mean_token_accuracy": 0.6631957843899727,
      "num_tokens": 2104640.0,
      "step": 1870
    },
    {
      "epoch": 1.6711111111111112,
      "grad_norm": 1.912842869758606,
      "learning_rate": 9.141984732824428e-05,
      "loss": 1.7996,
      "mean_token_accuracy": 0.6606781020760536,
      "num_tokens": 2115628.0,
      "step": 1880
    },
    {
      "epoch": 1.6800000000000002,
      "grad_norm": 1.7230331897735596,
      "learning_rate": 9.080916030534351e-05,
      "loss": 1.8042,
      "mean_token_accuracy": 0.6600380197167397,
      "num_tokens": 2126505.0,
      "step": 1890
    },
    {
      "epoch": 1.6888888888888889,
      "grad_norm": 1.7043401002883911,
      "learning_rate": 9.019847328244276e-05,
      "loss": 1.7993,
      "mean_token_accuracy": 0.6613149493932724,
      "num_tokens": 2138364.0,
      "step": 1900
    },
    {
      "epoch": 1.6977777777777778,
      "grad_norm": 1.9145572185516357,
      "learning_rate": 8.958778625954198e-05,
      "loss": 1.8046,
      "mean_token_accuracy": 0.662477345764637,
      "num_tokens": 2149425.0,
      "step": 1910
    },
    {
      "epoch": 1.7066666666666666,
      "grad_norm": 1.7448140382766724,
      "learning_rate": 8.897709923664123e-05,
      "loss": 1.8004,
      "mean_token_accuracy": 0.6539181426167489,
      "num_tokens": 2160843.0,
      "step": 1920
    },
    {
      "epoch": 1.7155555555555555,
      "grad_norm": 1.8304840326309204,
      "learning_rate": 8.836641221374045e-05,
      "loss": 1.8404,
      "mean_token_accuracy": 0.6593489304184914,
      "num_tokens": 2172044.0,
      "step": 1930
    },
    {
      "epoch": 1.7244444444444444,
      "grad_norm": 1.802331566810608,
      "learning_rate": 8.77557251908397e-05,
      "loss": 1.7995,
      "mean_token_accuracy": 0.6634193584322929,
      "num_tokens": 2182916.0,
      "step": 1940
    },
    {
      "epoch": 1.7333333333333334,
      "grad_norm": 1.9834682941436768,
      "learning_rate": 8.714503816793894e-05,
      "loss": 1.7525,
      "mean_token_accuracy": 0.6685526207089424,
      "num_tokens": 2194913.0,
      "step": 1950
    },
    {
      "epoch": 1.7422222222222223,
      "grad_norm": 1.8077235221862793,
      "learning_rate": 8.653435114503817e-05,
      "loss": 1.7612,
      "mean_token_accuracy": 0.6704939991235733,
      "num_tokens": 2205721.0,
      "step": 1960
    },
    {
      "epoch": 1.751111111111111,
      "grad_norm": 1.957993745803833,
      "learning_rate": 8.592366412213741e-05,
      "loss": 1.8059,
      "mean_token_accuracy": 0.6547697961330414,
      "num_tokens": 2217489.0,
      "step": 1970
    },
    {
      "epoch": 1.76,
      "grad_norm": 1.7215981483459473,
      "learning_rate": 8.531297709923664e-05,
      "loss": 1.7913,
      "mean_token_accuracy": 0.657075221836567,
      "num_tokens": 2228972.0,
      "step": 1980
    },
    {
      "epoch": 1.7688888888888887,
      "grad_norm": 1.8760231733322144,
      "learning_rate": 8.470229007633588e-05,
      "loss": 1.7923,
      "mean_token_accuracy": 0.6629065066576004,
      "num_tokens": 2240239.0,
      "step": 1990
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 2.092407703399658,
      "learning_rate": 8.409160305343512e-05,
      "loss": 1.7593,
      "mean_token_accuracy": 0.6686230883002281,
      "num_tokens": 2251436.0,
      "step": 2000
    },
    {
      "epoch": 1.7777777777777777,
      "eval_loss": 1.893255591392517,
      "eval_mean_token_accuracy": 0.6482590944766998,
      "eval_num_tokens": 2251436.0,
      "eval_runtime": 49.0676,
      "eval_samples_per_second": 20.38,
      "eval_steps_per_second": 10.19,
      "step": 2000
    },
    {
      "epoch": 1.7866666666666666,
      "grad_norm": 1.7836107015609741,
      "learning_rate": 8.348091603053435e-05,
      "loss": 1.8033,
      "mean_token_accuracy": 0.6598399996757507,
      "num_tokens": 2263069.0,
      "step": 2010
    },
    {
      "epoch": 1.7955555555555556,
      "grad_norm": 1.7955141067504883,
      "learning_rate": 8.287022900763359e-05,
      "loss": 1.7922,
      "mean_token_accuracy": 0.6619856491684913,
      "num_tokens": 2274050.0,
      "step": 2020
    },
    {
      "epoch": 1.8044444444444445,
      "grad_norm": 1.7887564897537231,
      "learning_rate": 8.225954198473282e-05,
      "loss": 1.8353,
      "mean_token_accuracy": 0.658150726556778,
      "num_tokens": 2285060.0,
      "step": 2030
    },
    {
      "epoch": 1.8133333333333335,
      "grad_norm": 1.8892567157745361,
      "learning_rate": 8.164885496183207e-05,
      "loss": 1.7266,
      "mean_token_accuracy": 0.6728688895702362,
      "num_tokens": 2296211.0,
      "step": 2040
    },
    {
      "epoch": 1.8222222222222222,
      "grad_norm": 1.9226106405258179,
      "learning_rate": 8.10381679389313e-05,
      "loss": 1.7243,
      "mean_token_accuracy": 0.6712497785687447,
      "num_tokens": 2307184.0,
      "step": 2050
    },
    {
      "epoch": 1.8311111111111111,
      "grad_norm": 1.735863208770752,
      "learning_rate": 8.042748091603054e-05,
      "loss": 1.7739,
      "mean_token_accuracy": 0.6621047109365463,
      "num_tokens": 2318602.0,
      "step": 2060
    },
    {
      "epoch": 1.8399999999999999,
      "grad_norm": 1.8361355066299438,
      "learning_rate": 7.981679389312977e-05,
      "loss": 1.8223,
      "mean_token_accuracy": 0.6560095950961113,
      "num_tokens": 2330193.0,
      "step": 2070
    },
    {
      "epoch": 1.8488888888888888,
      "grad_norm": 1.8159486055374146,
      "learning_rate": 7.920610687022902e-05,
      "loss": 1.7695,
      "mean_token_accuracy": 0.6657541528344154,
      "num_tokens": 2341442.0,
      "step": 2080
    },
    {
      "epoch": 1.8577777777777778,
      "grad_norm": 1.9189419746398926,
      "learning_rate": 7.859541984732824e-05,
      "loss": 1.8333,
      "mean_token_accuracy": 0.6628425523638726,
      "num_tokens": 2352479.0,
      "step": 2090
    },
    {
      "epoch": 1.8666666666666667,
      "grad_norm": 1.8809512853622437,
      "learning_rate": 7.798473282442749e-05,
      "loss": 1.7371,
      "mean_token_accuracy": 0.6683435723185539,
      "num_tokens": 2363642.0,
      "step": 2100
    },
    {
      "epoch": 1.8755555555555556,
      "grad_norm": 1.845886468887329,
      "learning_rate": 7.737404580152672e-05,
      "loss": 1.7774,
      "mean_token_accuracy": 0.6559944331645966,
      "num_tokens": 2375376.0,
      "step": 2110
    },
    {
      "epoch": 1.8844444444444446,
      "grad_norm": 1.7780894041061401,
      "learning_rate": 7.676335877862596e-05,
      "loss": 1.7823,
      "mean_token_accuracy": 0.6601730152964592,
      "num_tokens": 2386944.0,
      "step": 2120
    },
    {
      "epoch": 1.8933333333333333,
      "grad_norm": 1.9167022705078125,
      "learning_rate": 7.61526717557252e-05,
      "loss": 1.7869,
      "mean_token_accuracy": 0.6573449537158013,
      "num_tokens": 2398391.0,
      "step": 2130
    },
    {
      "epoch": 1.9022222222222223,
      "grad_norm": 2.037911891937256,
      "learning_rate": 7.554198473282443e-05,
      "loss": 1.7858,
      "mean_token_accuracy": 0.6593190267682075,
      "num_tokens": 2409837.0,
      "step": 2140
    },
    {
      "epoch": 1.911111111111111,
      "grad_norm": 1.7496647834777832,
      "learning_rate": 7.493129770992367e-05,
      "loss": 1.7241,
      "mean_token_accuracy": 0.6702290028333664,
      "num_tokens": 2421607.0,
      "step": 2150
    },
    {
      "epoch": 1.92,
      "grad_norm": 2.0227596759796143,
      "learning_rate": 7.43206106870229e-05,
      "loss": 1.7731,
      "mean_token_accuracy": 0.6679618924856185,
      "num_tokens": 2432376.0,
      "step": 2160
    },
    {
      "epoch": 1.9288888888888889,
      "grad_norm": 1.7401562929153442,
      "learning_rate": 7.370992366412214e-05,
      "loss": 1.7684,
      "mean_token_accuracy": 0.6676609605550766,
      "num_tokens": 2443683.0,
      "step": 2170
    },
    {
      "epoch": 1.9377777777777778,
      "grad_norm": 2.709106922149658,
      "learning_rate": 7.309923664122137e-05,
      "loss": 1.709,
      "mean_token_accuracy": 0.6738818466663361,
      "num_tokens": 2454757.0,
      "step": 2180
    },
    {
      "epoch": 1.9466666666666668,
      "grad_norm": 1.8504191637039185,
      "learning_rate": 7.248854961832061e-05,
      "loss": 1.7411,
      "mean_token_accuracy": 0.6681609645485878,
      "num_tokens": 2465562.0,
      "step": 2190
    },
    {
      "epoch": 1.9555555555555557,
      "grad_norm": 1.9488162994384766,
      "learning_rate": 7.187786259541986e-05,
      "loss": 1.7927,
      "mean_token_accuracy": 0.6587553441524505,
      "num_tokens": 2476869.0,
      "step": 2200
    },
    {
      "epoch": 1.9555555555555557,
      "eval_loss": 1.8803235292434692,
      "eval_mean_token_accuracy": 0.6499251070022583,
      "eval_num_tokens": 2476869.0,
      "eval_runtime": 47.7648,
      "eval_samples_per_second": 20.936,
      "eval_steps_per_second": 10.468,
      "step": 2200
    },
    {
      "epoch": 1.9644444444444444,
      "grad_norm": 1.9747337102890015,
      "learning_rate": 7.132824427480917e-05,
      "loss": 1.7689,
      "mean_token_accuracy": 0.666295376420021,
      "num_tokens": 2487704.0,
      "step": 2210
    },
    {
      "epoch": 1.9733333333333334,
      "grad_norm": 1.8904316425323486,
      "learning_rate": 7.071755725190839e-05,
      "loss": 1.7538,
      "mean_token_accuracy": 0.6645636394619941,
      "num_tokens": 2498918.0,
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.982222222222222, | |
| "grad_norm": 1.8791844844818115, | |
| "learning_rate": 7.010687022900764e-05, | |
| "loss": 1.7926, | |
| "mean_token_accuracy": 0.6631673067808151, | |
| "num_tokens": 2509728.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 1.991111111111111, | |
| "grad_norm": 1.9756606817245483, | |
| "learning_rate": 6.949618320610687e-05, | |
| "loss": 1.7863, | |
| "mean_token_accuracy": 0.6628521859645844, | |
| "num_tokens": 2521073.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.7894699573516846, | |
| "learning_rate": 6.888549618320611e-05, | |
| "loss": 1.7539, | |
| "mean_token_accuracy": 0.6728802308440208, | |
| "num_tokens": 2531820.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.008888888888889, | |
| "grad_norm": 1.702850341796875, | |
| "learning_rate": 6.827480916030535e-05, | |
| "loss": 1.4903, | |
| "mean_token_accuracy": 0.7138098135590554, | |
| "num_tokens": 2542512.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 2.017777777777778, | |
| "grad_norm": 1.7931528091430664, | |
| "learning_rate": 6.766412213740458e-05, | |
| "loss": 1.601, | |
| "mean_token_accuracy": 0.6894692406058311, | |
| "num_tokens": 2553338.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 2.026666666666667, | |
| "grad_norm": 2.228480339050293, | |
| "learning_rate": 6.705343511450382e-05, | |
| "loss": 1.609, | |
| "mean_token_accuracy": 0.6943154886364937, | |
| "num_tokens": 2564182.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.0355555555555553, | |
| "grad_norm": 1.9658042192459106, | |
| "learning_rate": 6.644274809160305e-05, | |
| "loss": 1.6545, | |
| "mean_token_accuracy": 0.6824306204915047, | |
| "num_tokens": 2575789.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 2.0444444444444443, | |
| "grad_norm": 1.7540594339370728, | |
| "learning_rate": 6.583206106870229e-05, | |
| "loss": 1.6229, | |
| "mean_token_accuracy": 0.6881745710968972, | |
| "num_tokens": 2587147.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.0533333333333332, | |
| "grad_norm": 1.799501895904541, | |
| "learning_rate": 6.522137404580153e-05, | |
| "loss": 1.6119, | |
| "mean_token_accuracy": 0.6896049126982688, | |
| "num_tokens": 2598282.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.062222222222222, | |
| "grad_norm": 1.7720867395401, | |
| "learning_rate": 6.461068702290076e-05, | |
| "loss": 1.5519, | |
| "mean_token_accuracy": 0.7038252353668213, | |
| "num_tokens": 2609125.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.071111111111111, | |
| "grad_norm": 1.994992971420288, | |
| "learning_rate": 6.400000000000001e-05, | |
| "loss": 1.5872, | |
| "mean_token_accuracy": 0.690100908279419, | |
| "num_tokens": 2620411.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 1.9283640384674072, | |
| "learning_rate": 6.338931297709923e-05, | |
| "loss": 1.5867, | |
| "mean_token_accuracy": 0.6923216238617897, | |
| "num_tokens": 2631795.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.088888888888889, | |
| "grad_norm": 1.9957973957061768, | |
| "learning_rate": 6.277862595419848e-05, | |
| "loss": 1.5996, | |
| "mean_token_accuracy": 0.6924369186162949, | |
| "num_tokens": 2643179.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.097777777777778, | |
| "grad_norm": 2.0207560062408447, | |
| "learning_rate": 6.21679389312977e-05, | |
| "loss": 1.515, | |
| "mean_token_accuracy": 0.7066755428910255, | |
| "num_tokens": 2654206.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.1066666666666665, | |
| "grad_norm": 1.8871878385543823, | |
| "learning_rate": 6.155725190839695e-05, | |
| "loss": 1.6139, | |
| "mean_token_accuracy": 0.687422800064087, | |
| "num_tokens": 2665582.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.1155555555555554, | |
| "grad_norm": 1.717610478401184, | |
| "learning_rate": 6.094656488549618e-05, | |
| "loss": 1.6388, | |
| "mean_token_accuracy": 0.6870575189590454, | |
| "num_tokens": 2677533.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.1244444444444444, | |
| "grad_norm": 1.8574187755584717, | |
| "learning_rate": 6.0335877862595426e-05, | |
| "loss": 1.557, | |
| "mean_token_accuracy": 0.6999430671334267, | |
| "num_tokens": 2688755.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.1333333333333333, | |
| "grad_norm": 1.9739580154418945, | |
| "learning_rate": 5.9725190839694655e-05, | |
| "loss": 1.6553, | |
| "mean_token_accuracy": 0.6819543272256852, | |
| "num_tokens": 2700558.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.1333333333333333, | |
| "eval_loss": 1.8970768451690674, | |
| "eval_mean_token_accuracy": 0.6490416256189346, | |
| "eval_num_tokens": 2700558.0, | |
| "eval_runtime": 47.6704, | |
| "eval_samples_per_second": 20.977, | |
| "eval_steps_per_second": 10.489, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.1422222222222222, | |
| "grad_norm": 1.893918514251709, | |
| "learning_rate": 5.91145038167939e-05, | |
| "loss": 1.5459, | |
| "mean_token_accuracy": 0.6963777393102646, | |
| "num_tokens": 2711713.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 2.151111111111111, | |
| "grad_norm": 1.9607445001602173, | |
| "learning_rate": 5.850381679389313e-05, | |
| "loss": 1.6373, | |
| "mean_token_accuracy": 0.6815788432955742, | |
| "num_tokens": 2723686.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 2.091732978820801, | |
| "learning_rate": 5.789312977099237e-05, | |
| "loss": 1.6422, | |
| "mean_token_accuracy": 0.6811213716864586, | |
| "num_tokens": 2735300.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.168888888888889, | |
| "grad_norm": 2.1138076782226562, | |
| "learning_rate": 5.7282442748091605e-05, | |
| "loss": 1.5848, | |
| "mean_token_accuracy": 0.6962573245167732, | |
| "num_tokens": 2746248.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.1777777777777776, | |
| "grad_norm": 2.1495392322540283, | |
| "learning_rate": 5.667175572519085e-05, | |
| "loss": 1.576, | |
| "mean_token_accuracy": 0.6990228727459907, | |
| "num_tokens": 2757259.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.1866666666666665, | |
| "grad_norm": 2.1444251537323, | |
| "learning_rate": 5.606106870229008e-05, | |
| "loss": 1.5979, | |
| "mean_token_accuracy": 0.6916472837328911, | |
| "num_tokens": 2768228.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.1955555555555555, | |
| "grad_norm": 1.945489525794983, | |
| "learning_rate": 5.545038167938932e-05, | |
| "loss": 1.5663, | |
| "mean_token_accuracy": 0.7005513325333595, | |
| "num_tokens": 2779254.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 2.2044444444444444, | |
| "grad_norm": 1.8256646394729614, | |
| "learning_rate": 5.483969465648855e-05, | |
| "loss": 1.5751, | |
| "mean_token_accuracy": 0.6961624413728714, | |
| "num_tokens": 2790326.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 2.2133333333333334, | |
| "grad_norm": 1.9541441202163696, | |
| "learning_rate": 5.422900763358779e-05, | |
| "loss": 1.6268, | |
| "mean_token_accuracy": 0.6893054991960526, | |
| "num_tokens": 2801625.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 2.2222222222222223, | |
| "grad_norm": 2.0127615928649902, | |
| "learning_rate": 5.361832061068702e-05, | |
| "loss": 1.6096, | |
| "mean_token_accuracy": 0.6923437744379044, | |
| "num_tokens": 2813010.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.2311111111111113, | |
| "grad_norm": 2.0325839519500732, | |
| "learning_rate": 5.300763358778626e-05, | |
| "loss": 1.5963, | |
| "mean_token_accuracy": 0.6913090571761131, | |
| "num_tokens": 2824021.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 2.1595821380615234, | |
| "learning_rate": 5.23969465648855e-05, | |
| "loss": 1.5617, | |
| "mean_token_accuracy": 0.7037980020046234, | |
| "num_tokens": 2835232.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 2.2488888888888887, | |
| "grad_norm": 2.11661958694458, | |
| "learning_rate": 5.178625954198474e-05, | |
| "loss": 1.6213, | |
| "mean_token_accuracy": 0.6836483731865883, | |
| "num_tokens": 2846524.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 2.2577777777777777, | |
| "grad_norm": 1.88747239112854, | |
| "learning_rate": 5.117557251908397e-05, | |
| "loss": 1.6408, | |
| "mean_token_accuracy": 0.6860729962587356, | |
| "num_tokens": 2857788.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 2.2666666666666666, | |
| "grad_norm": 1.9622093439102173, | |
| "learning_rate": 5.056488549618321e-05, | |
| "loss": 1.5519, | |
| "mean_token_accuracy": 0.7002682030200958, | |
| "num_tokens": 2868618.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.2755555555555556, | |
| "grad_norm": 1.9343371391296387, | |
| "learning_rate": 4.995419847328244e-05, | |
| "loss": 1.5795, | |
| "mean_token_accuracy": 0.6934511423110962, | |
| "num_tokens": 2879999.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 2.2844444444444445, | |
| "grad_norm": 1.9991627931594849, | |
| "learning_rate": 4.934351145038168e-05, | |
| "loss": 1.6183, | |
| "mean_token_accuracy": 0.6901679039001465, | |
| "num_tokens": 2891053.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 2.2933333333333334, | |
| "grad_norm": 1.9480003118515015, | |
| "learning_rate": 4.8732824427480914e-05, | |
| "loss": 1.5826, | |
| "mean_token_accuracy": 0.7007558569312096, | |
| "num_tokens": 2901905.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 2.3022222222222224, | |
| "grad_norm": 2.021207332611084, | |
| "learning_rate": 4.812213740458015e-05, | |
| "loss": 1.6348, | |
| "mean_token_accuracy": 0.6848765298724174, | |
| "num_tokens": 2913571.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 2.311111111111111, | |
| "grad_norm": 1.8385164737701416, | |
| "learning_rate": 4.751145038167939e-05, | |
| "loss": 1.5763, | |
| "mean_token_accuracy": 0.6912240386009216, | |
| "num_tokens": 2925533.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.311111111111111, | |
| "eval_loss": 1.8940143585205078, | |
| "eval_mean_token_accuracy": 0.6499911918640137, | |
| "eval_num_tokens": 2925533.0, | |
| "eval_runtime": 47.456, | |
| "eval_samples_per_second": 21.072, | |
| "eval_steps_per_second": 10.536, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 1.9455375671386719, | |
| "learning_rate": 4.690076335877863e-05, | |
| "loss": 1.598, | |
| "mean_token_accuracy": 0.6915700435638428, | |
| "num_tokens": 2936620.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 2.328888888888889, | |
| "grad_norm": 1.863487720489502, | |
| "learning_rate": 4.6290076335877864e-05, | |
| "loss": 1.5512, | |
| "mean_token_accuracy": 0.7025073647499085, | |
| "num_tokens": 2947753.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 2.3377777777777777, | |
| "grad_norm": 1.9756685495376587, | |
| "learning_rate": 4.56793893129771e-05, | |
| "loss": 1.5973, | |
| "mean_token_accuracy": 0.6870647758245468, | |
| "num_tokens": 2959635.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 2.3466666666666667, | |
| "grad_norm": 2.190765142440796, | |
| "learning_rate": 4.5068702290076336e-05, | |
| "loss": 1.5948, | |
| "mean_token_accuracy": 0.6888303905725479, | |
| "num_tokens": 2971675.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 2.3555555555555556, | |
| "grad_norm": 1.827318787574768, | |
| "learning_rate": 4.445801526717557e-05, | |
| "loss": 1.5682, | |
| "mean_token_accuracy": 0.6952902913093567, | |
| "num_tokens": 2982744.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.3644444444444446, | |
| "grad_norm": 2.11799693107605, | |
| "learning_rate": 4.384732824427481e-05, | |
| "loss": 1.6221, | |
| "mean_token_accuracy": 0.6794109031558037, | |
| "num_tokens": 2994347.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 2.3733333333333335, | |
| "grad_norm": 2.1472220420837402, | |
| "learning_rate": 4.3236641221374044e-05, | |
| "loss": 1.6353, | |
| "mean_token_accuracy": 0.6876759916543961, | |
| "num_tokens": 3005174.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 2.3822222222222225, | |
| "grad_norm": 1.9971054792404175, | |
| "learning_rate": 4.2625954198473286e-05, | |
| "loss": 1.5372, | |
| "mean_token_accuracy": 0.7059834420680999, | |
| "num_tokens": 3016492.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 2.391111111111111, | |
| "grad_norm": 2.067861318588257, | |
| "learning_rate": 4.201526717557252e-05, | |
| "loss": 1.572, | |
| "mean_token_accuracy": 0.6911077201366425, | |
| "num_tokens": 3027826.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 2.0372536182403564, | |
| "learning_rate": 4.140458015267176e-05, | |
| "loss": 1.5615, | |
| "mean_token_accuracy": 0.6972797185182571, | |
| "num_tokens": 3038770.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.408888888888889, | |
| "grad_norm": 2.15972638130188, | |
| "learning_rate": 4.0793893129770994e-05, | |
| "loss": 1.5806, | |
| "mean_token_accuracy": 0.6947444006800652, | |
| "num_tokens": 3050159.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 2.417777777777778, | |
| "grad_norm": 2.059760808944702, | |
| "learning_rate": 4.018320610687023e-05, | |
| "loss": 1.6167, | |
| "mean_token_accuracy": 0.6882677704095841, | |
| "num_tokens": 3061009.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 2.4266666666666667, | |
| "grad_norm": 1.9914629459381104, | |
| "learning_rate": 3.9572519083969466e-05, | |
| "loss": 1.5508, | |
| "mean_token_accuracy": 0.6985371947288513, | |
| "num_tokens": 3072232.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 2.4355555555555557, | |
| "grad_norm": 2.0151119232177734, | |
| "learning_rate": 3.89618320610687e-05, | |
| "loss": 1.663, | |
| "mean_token_accuracy": 0.6849021047353745, | |
| "num_tokens": 3083939.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 2.4444444444444446, | |
| "grad_norm": 2.02457332611084, | |
| "learning_rate": 3.835114503816794e-05, | |
| "loss": 1.6043, | |
| "mean_token_accuracy": 0.6891427770256996, | |
| "num_tokens": 3095354.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.453333333333333, | |
| "grad_norm": 1.930341362953186, | |
| "learning_rate": 3.774045801526718e-05, | |
| "loss": 1.5648, | |
| "mean_token_accuracy": 0.6962095096707344, | |
| "num_tokens": 3106679.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 2.462222222222222, | |
| "grad_norm": 2.1718850135803223, | |
| "learning_rate": 3.7129770992366416e-05, | |
| "loss": 1.5514, | |
| "mean_token_accuracy": 0.6997211873531342, | |
| "num_tokens": 3117440.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 2.471111111111111, | |
| "grad_norm": 1.89506196975708, | |
| "learning_rate": 3.651908396946565e-05, | |
| "loss": 1.6102, | |
| "mean_token_accuracy": 0.6865462198853493, | |
| "num_tokens": 3128685.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 2.1102652549743652, | |
| "learning_rate": 3.590839694656489e-05, | |
| "loss": 1.6092, | |
| "mean_token_accuracy": 0.6845578849315643, | |
| "num_tokens": 3140574.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 2.488888888888889, | |
| "grad_norm": 1.9541523456573486, | |
| "learning_rate": 3.5297709923664124e-05, | |
| "loss": 1.6245, | |
| "mean_token_accuracy": 0.6867643877863884, | |
| "num_tokens": 3151937.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.488888888888889, | |
| "eval_loss": 1.8869248628616333, | |
| "eval_mean_token_accuracy": 0.6508636207580566, | |
| "eval_num_tokens": 3151937.0, | |
| "eval_runtime": 46.9872, | |
| "eval_samples_per_second": 21.282, | |
| "eval_steps_per_second": 10.641, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.497777777777778, | |
| "grad_norm": 2.006448984146118, | |
| "learning_rate": 3.468702290076336e-05, | |
| "loss": 1.6458, | |
| "mean_token_accuracy": 0.6835160732269288, | |
| "num_tokens": 3163343.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 2.506666666666667, | |
| "grad_norm": 2.0644562244415283, | |
| "learning_rate": 3.4076335877862595e-05, | |
| "loss": 1.5841, | |
| "mean_token_accuracy": 0.699130979180336, | |
| "num_tokens": 3174278.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 2.5155555555555553, | |
| "grad_norm": 2.5352766513824463, | |
| "learning_rate": 3.346564885496183e-05, | |
| "loss": 1.6411, | |
| "mean_token_accuracy": 0.687686163187027, | |
| "num_tokens": 3185529.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 2.5244444444444447, | |
| "grad_norm": 2.2506706714630127, | |
| "learning_rate": 3.2854961832061074e-05, | |
| "loss": 1.5334, | |
| "mean_token_accuracy": 0.7042266175150871, | |
| "num_tokens": 3196422.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 2.533333333333333, | |
| "grad_norm": 2.038456439971924, | |
| "learning_rate": 3.224427480916031e-05, | |
| "loss": 1.5226, | |
| "mean_token_accuracy": 0.7002356797456741, | |
| "num_tokens": 3207640.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 2.542222222222222, | |
| "grad_norm": 2.0818448066711426, | |
| "learning_rate": 3.1633587786259545e-05, | |
| "loss": 1.5136, | |
| "mean_token_accuracy": 0.7040936380624772, | |
| "num_tokens": 3218742.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 2.551111111111111, | |
| "grad_norm": 1.9810820817947388, | |
| "learning_rate": 3.102290076335878e-05, | |
| "loss": 1.6515, | |
| "mean_token_accuracy": 0.6826088905334473, | |
| "num_tokens": 3230062.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 2.1830689907073975, | |
| "learning_rate": 3.0412213740458017e-05, | |
| "loss": 1.5792, | |
| "mean_token_accuracy": 0.699496129155159, | |
| "num_tokens": 3240533.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 2.568888888888889, | |
| "grad_norm": 2.101184368133545, | |
| "learning_rate": 2.9801526717557253e-05, | |
| "loss": 1.6538, | |
| "mean_token_accuracy": 0.6724523141980171, | |
| "num_tokens": 3252476.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 2.5777777777777775, | |
| "grad_norm": 2.021524429321289, | |
| "learning_rate": 2.9190839694656492e-05, | |
| "loss": 1.6146, | |
| "mean_token_accuracy": 0.6886414483189582, | |
| "num_tokens": 3263799.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.586666666666667, | |
| "grad_norm": 1.9668735265731812, | |
| "learning_rate": 2.8580152671755728e-05, | |
| "loss": 1.6477, | |
| "mean_token_accuracy": 0.678925508260727, | |
| "num_tokens": 3275511.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 2.5955555555555554, | |
| "grad_norm": 2.088491201400757, | |
| "learning_rate": 2.7969465648854964e-05, | |
| "loss": 1.6265, | |
| "mean_token_accuracy": 0.6857595339417457, | |
| "num_tokens": 3286752.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 2.6044444444444443, | |
| "grad_norm": 2.0536880493164062, | |
| "learning_rate": 2.73587786259542e-05, | |
| "loss": 1.66, | |
| "mean_token_accuracy": 0.681273227930069, | |
| "num_tokens": 3297945.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 2.6133333333333333, | |
| "grad_norm": 2.0063817501068115, | |
| "learning_rate": 2.674809160305344e-05, | |
| "loss": 1.5102, | |
| "mean_token_accuracy": 0.7025244757533073, | |
| "num_tokens": 3309112.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 2.6222222222222222, | |
| "grad_norm": 1.9980206489562988, | |
| "learning_rate": 2.6137404580152675e-05, | |
| "loss": 1.5142, | |
| "mean_token_accuracy": 0.7049572348594666, | |
| "num_tokens": 3320544.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 2.631111111111111, | |
| "grad_norm": 2.1506435871124268, | |
| "learning_rate": 2.552671755725191e-05, | |
| "loss": 1.5826, | |
| "mean_token_accuracy": 0.694467018544674, | |
| "num_tokens": 3331309.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 1.9890793561935425, | |
| "learning_rate": 2.4916030534351147e-05, | |
| "loss": 1.5631, | |
| "mean_token_accuracy": 0.6945617944002151, | |
| "num_tokens": 3343068.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 2.648888888888889, | |
| "grad_norm": 2.1102676391601562, | |
| "learning_rate": 2.4305343511450383e-05, | |
| "loss": 1.6145, | |
| "mean_token_accuracy": 0.6866093754768372, | |
| "num_tokens": 3354691.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 2.6577777777777776, | |
| "grad_norm": 2.2881674766540527, | |
| "learning_rate": 2.369465648854962e-05, | |
| "loss": 1.5796, | |
| "mean_token_accuracy": 0.6961612686514854, | |
| "num_tokens": 3365512.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 2.6666666666666665, | |
| "grad_norm": 1.973838210105896, | |
| "learning_rate": 2.3083969465648854e-05, | |
| "loss": 1.5456, | |
| "mean_token_accuracy": 0.703473174571991, | |
| "num_tokens": 3376406.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.6666666666666665, | |
| "eval_loss": 1.881131649017334, | |
| "eval_mean_token_accuracy": 0.6518214672803879, | |
| "eval_num_tokens": 3376406.0, | |
| "eval_runtime": 47.794, | |
| "eval_samples_per_second": 20.923, | |
| "eval_steps_per_second": 10.462, | |
| "step": 3000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3375, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.0707043350011904e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
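
The file above is pure log data, so a short sketch of how to consume it may help (a minimal example, assuming the JSON is saved as a standard `trainer_state.json`; the path is hypothetical). Each `log_history` entry is either a train record, keyed by `loss` and emitted every `logging_steps` = 10 optimizer steps, or an eval record, keyed by `eval_loss` and emitted every 200 steps, so recovering the loss curves and the best evaluation step takes only a few lines of Python:

```python
import json

# Hypothetical path: adjust to wherever this trainer_state.json was saved.
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history interleaves train records (keyed by "loss") with eval
# records (keyed by "eval_loss"); both carry the same "step" counter.
history = state["log_history"]
train = [(e["step"], e["loss"]) for e in history if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in history if "eval_loss" in e]

# The best checkpoint is the eval record with the lowest eval_loss.
best_step, best_eval = min(evals, key=lambda rec: rec[1])
print(f"{len(train)} train logs, {len(evals)} eval records")
print(f"lowest eval_loss {best_eval:.4f} at step {best_step}")
```

On the eval records shown above, this reports the lowest `eval_loss` (≈1.8803) at step 2200, despite the later evals at steps 2400 through 3000 hovering slightly higher (1.881 to 1.897).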