{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1383,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.021719948415122516,
      "grad_norm": 0.0,
      "learning_rate": 0.00019869848156182213,
      "loss": 6.0531,
      "mean_token_accuracy": 0.37881034857127815,
      "num_tokens": 78573.0,
      "step": 10
    },
    {
      "epoch": 0.04343989683024503,
      "grad_norm": 0.0,
      "learning_rate": 0.00019725234996384672,
      "loss": 5.9903,
      "mean_token_accuracy": 0.3887314551044255,
      "num_tokens": 157496.0,
      "step": 20
    },
    {
      "epoch": 0.06515984524536754,
      "grad_norm": 0.0,
      "learning_rate": 0.00019580621836587129,
      "loss": 5.9929,
      "mean_token_accuracy": 0.3850487937917933,
      "num_tokens": 236512.0,
      "step": 30
    },
    {
      "epoch": 0.08687979366049006,
      "grad_norm": 0.0,
      "learning_rate": 0.00019436008676789588,
      "loss": 5.986,
      "mean_token_accuracy": 0.3865805763518438,
      "num_tokens": 315608.0,
      "step": 40
    },
    {
      "epoch": 0.10859974207561257,
      "grad_norm": 0.0,
      "learning_rate": 0.00019291395516992047,
      "loss": 6.0318,
      "mean_token_accuracy": 0.38490854618139564,
      "num_tokens": 394791.0,
      "step": 50
    },
    {
      "epoch": 0.13031969049073508,
      "grad_norm": 0.0,
      "learning_rate": 0.00019146782357194506,
      "loss": 5.9875,
      "mean_token_accuracy": 0.38546840590424836,
      "num_tokens": 473456.0,
      "step": 60
    },
    {
      "epoch": 0.1520396389058576,
      "grad_norm": 0.0,
      "learning_rate": 0.00019002169197396965,
      "loss": 5.9773,
      "mean_token_accuracy": 0.3837947838823311,
      "num_tokens": 552247.0,
      "step": 70
    },
    {
      "epoch": 0.17375958732098012,
      "grad_norm": 0.0,
      "learning_rate": 0.0001885755603759942,
      "loss": 6.0197,
      "mean_token_accuracy": 0.38375892234034836,
      "num_tokens": 631096.0,
      "step": 80
    },
    {
      "epoch": 0.19547953573610263,
      "grad_norm": 0.0,
      "learning_rate": 0.0001871294287780188,
      "loss": 6.0014,
      "mean_token_accuracy": 0.38519658900331705,
      "num_tokens": 710275.0,
      "step": 90
    },
    {
      "epoch": 0.21719948415122514,
      "grad_norm": 0.0,
      "learning_rate": 0.0001856832971800434,
      "loss": 6.0217,
      "mean_token_accuracy": 0.3826989881345071,
      "num_tokens": 789174.0,
      "step": 100
    },
    {
      "epoch": 0.23891943256634765,
      "grad_norm": 0.0,
      "learning_rate": 0.00018423716558206799,
      "loss": 6.0294,
      "mean_token_accuracy": 0.3842650496866554,
      "num_tokens": 867922.0,
      "step": 110
    },
    {
      "epoch": 0.26063938098147016,
      "grad_norm": 0.0,
      "learning_rate": 0.00018279103398409255,
      "loss": 5.9548,
      "mean_token_accuracy": 0.3875111517496407,
      "num_tokens": 947073.0,
      "step": 120
    },
    {
      "epoch": 0.2823593293965927,
      "grad_norm": 0.0,
      "learning_rate": 0.00018134490238611714,
      "loss": 6.0071,
      "mean_token_accuracy": 0.38376112943515184,
      "num_tokens": 1026211.0,
      "step": 130
    },
    {
      "epoch": 0.3040792778117152,
      "grad_norm": 0.0,
      "learning_rate": 0.00017989877078814173,
      "loss": 6.0105,
      "mean_token_accuracy": 0.38463024909142407,
      "num_tokens": 1105147.0,
      "step": 140
    },
    {
      "epoch": 0.3257992262268377,
      "grad_norm": 0.0,
      "learning_rate": 0.0001784526391901663,
      "loss": 6.0449,
      "mean_token_accuracy": 0.38241248747799544,
      "num_tokens": 1183837.0,
      "step": 150
    },
    {
      "epoch": 0.34751917464196025,
      "grad_norm": 0.0,
      "learning_rate": 0.00017700650759219091,
      "loss": 5.9946,
      "mean_token_accuracy": 0.3821917780907825,
      "num_tokens": 1262676.0,
      "step": 160
    },
    {
      "epoch": 0.36923912305708273,
      "grad_norm": 0.0,
      "learning_rate": 0.00017556037599421548,
      "loss": 6.0049,
      "mean_token_accuracy": 0.3842250820598565,
      "num_tokens": 1341759.0,
      "step": 170
    },
    {
      "epoch": 0.39095907147220527,
      "grad_norm": 0.0,
      "learning_rate": 0.00017411424439624007,
      "loss": 5.9736,
      "mean_token_accuracy": 0.3861917880363762,
      "num_tokens": 1420987.0,
      "step": 180
    },
    {
      "epoch": 0.41267901988732775,
      "grad_norm": 0.0,
      "learning_rate": 0.00017266811279826466,
      "loss": 6.0431,
      "mean_token_accuracy": 0.38146835477091373,
      "num_tokens": 1499903.0,
      "step": 190
    },
    {
      "epoch": 0.4343989683024503,
      "grad_norm": 0.0,
      "learning_rate": 0.00017122198120028922,
      "loss": 6.0069,
      "mean_token_accuracy": 0.3861038032686338,
      "num_tokens": 1578880.0,
      "step": 200
    },
    {
      "epoch": 0.4561189167175728,
      "grad_norm": 0.0,
      "learning_rate": 0.00016977584960231381,
      "loss": 6.0091,
      "mean_token_accuracy": 0.3845075036631897,
      "num_tokens": 1657709.0,
      "step": 210
    },
    {
      "epoch": 0.4778388651326953,
      "grad_norm": 0.0,
      "learning_rate": 0.0001683297180043384,
      "loss": 6.0029,
      "mean_token_accuracy": 0.38288453239947556,
      "num_tokens": 1736620.0,
      "step": 220
    },
    {
      "epoch": 0.49955881354781784,
      "grad_norm": 0.0,
      "learning_rate": 0.000166883586406363,
      "loss": 5.9933,
      "mean_token_accuracy": 0.38531269936356694,
      "num_tokens": 1815478.0,
      "step": 230
    },
    {
      "epoch": 0.5212787619629403,
      "grad_norm": 0.0,
      "learning_rate": 0.00016543745480838756,
      "loss": 6.0334,
      "mean_token_accuracy": 0.3833939728909172,
      "num_tokens": 1894256.0,
      "step": 240
    },
    {
      "epoch": 0.5429987103780628,
      "grad_norm": 0.0,
      "learning_rate": 0.00016399132321041215,
      "loss": 6.0484,
      "mean_token_accuracy": 0.3841994108865038,
      "num_tokens": 1972997.0,
      "step": 250
    },
    {
      "epoch": 0.5647186587931854,
      "grad_norm": 0.0,
      "learning_rate": 0.00016254519161243674,
      "loss": 6.0285,
      "mean_token_accuracy": 0.38443261398933826,
      "num_tokens": 2051890.0,
      "step": 260
    },
    {
      "epoch": 0.5864386072083079,
      "grad_norm": 0.0,
      "learning_rate": 0.00016109906001446133,
      "loss": 5.983,
      "mean_token_accuracy": 0.385862308065407,
      "num_tokens": 2130746.0,
      "step": 270
    },
    {
      "epoch": 0.6081585556234304,
      "grad_norm": 0.0,
      "learning_rate": 0.00015965292841648592,
      "loss": 6.0046,
      "mean_token_accuracy": 0.3859730801777914,
      "num_tokens": 2209603.0,
      "step": 280
    },
    {
      "epoch": 0.6298785040385529,
      "grad_norm": 0.0,
      "learning_rate": 0.0001582067968185105,
      "loss": 5.9887,
      "mean_token_accuracy": 0.38499277490191164,
      "num_tokens": 2288743.0,
      "step": 290
    },
    {
      "epoch": 0.6515984524536754,
      "grad_norm": 0.0,
      "learning_rate": 0.00015676066522053508,
      "loss": 5.9788,
      "mean_token_accuracy": 0.3864025830756873,
      "num_tokens": 2367648.0,
      "step": 300
    },
    {
      "epoch": 0.6733184008687979,
      "grad_norm": 0.0,
      "learning_rate": 0.00015531453362255964,
      "loss": 6.0162,
      "mean_token_accuracy": 0.38221894631860776,
      "num_tokens": 2446256.0,
      "step": 310
    },
    {
      "epoch": 0.6950383492839205,
      "grad_norm": 0.0,
      "learning_rate": 0.00015386840202458423,
      "loss": 6.0017,
      "mean_token_accuracy": 0.38538019855041056,
      "num_tokens": 2525065.0,
      "step": 320
    },
    {
      "epoch": 0.716758297699043,
      "grad_norm": 0.0,
      "learning_rate": 0.00015242227042660883,
      "loss": 5.9569,
      "mean_token_accuracy": 0.3852926092222333,
      "num_tokens": 2604183.0,
      "step": 330
    },
    {
      "epoch": 0.7384782461141655,
      "grad_norm": 0.0,
      "learning_rate": 0.00015097613882863342,
      "loss": 6.0072,
      "mean_token_accuracy": 0.38616998645011336,
      "num_tokens": 2683084.0,
      "step": 340
    },
    {
      "epoch": 0.7601981945292879,
      "grad_norm": 0.0,
      "learning_rate": 0.000149530007230658,
      "loss": 6.0428,
      "mean_token_accuracy": 0.3831095602363348,
      "num_tokens": 2761824.0,
      "step": 350
    },
    {
      "epoch": 0.7819181429444105,
      "grad_norm": 0.0,
      "learning_rate": 0.00014808387563268257,
      "loss": 5.9781,
      "mean_token_accuracy": 0.3861894382047467,
      "num_tokens": 2840709.0,
      "step": 360
    },
    {
      "epoch": 0.803638091359533,
      "grad_norm": 0.0,
      "learning_rate": 0.00014663774403470716,
      "loss": 5.9898,
      "mean_token_accuracy": 0.3846066597965546,
      "num_tokens": 2919385.0,
      "step": 370
    },
    {
      "epoch": 0.8253580397746555,
      "grad_norm": 0.0,
      "learning_rate": 0.00014519161243673173,
      "loss": 6.0098,
      "mean_token_accuracy": 0.38357423364650456,
      "num_tokens": 2998308.0,
      "step": 380
    },
    {
      "epoch": 0.8470779881897781,
      "grad_norm": 0.0,
      "learning_rate": 0.00014374548083875634,
      "loss": 6.0349,
      "mean_token_accuracy": 0.3809299209038727,
      "num_tokens": 3076962.0,
      "step": 390
    },
    {
      "epoch": 0.8687979366049006,
      "grad_norm": 0.0,
      "learning_rate": 0.0001422993492407809,
      "loss": 6.0294,
      "mean_token_accuracy": 0.38217092433478683,
      "num_tokens": 3155983.0,
      "step": 400
    },
    {
      "epoch": 0.890517885020023,
      "grad_norm": 0.0,
      "learning_rate": 0.0001408532176428055,
      "loss": 5.9728,
      "mean_token_accuracy": 0.3870704318396747,
      "num_tokens": 3234960.0,
      "step": 410
    },
    {
      "epoch": 0.9122378334351456,
      "grad_norm": 0.0,
      "learning_rate": 0.0001394070860448301,
      "loss": 6.0541,
      "mean_token_accuracy": 0.38209462116938087,
      "num_tokens": 3313797.0,
      "step": 420
    },
    {
      "epoch": 0.9339577818502681,
      "grad_norm": 0.0,
      "learning_rate": 0.00013796095444685465,
      "loss": 5.9871,
      "mean_token_accuracy": 0.38401242352556436,
      "num_tokens": 3392880.0,
      "step": 430
    },
    {
      "epoch": 0.9556777302653906,
      "grad_norm": 0.0,
      "learning_rate": 0.00013651482284887927,
      "loss": 5.9901,
      "mean_token_accuracy": 0.38375150066567587,
      "num_tokens": 3471597.0,
      "step": 440
    },
    {
      "epoch": 0.9773976786805131,
      "grad_norm": 0.0,
      "learning_rate": 0.00013506869125090384,
      "loss": 5.9797,
      "mean_token_accuracy": 0.38728573790285736,
      "num_tokens": 3550697.0,
      "step": 450
    },
    {
      "epoch": 0.9991176270956357,
      "grad_norm": 0.0,
      "learning_rate": 0.00013362255965292843,
      "loss": 5.9971,
      "mean_token_accuracy": 0.38420643559657036,
      "num_tokens": 3629481.0,
      "step": 460
    },
    {
      "epoch": 1.0195479535736103,
      "grad_norm": 0.0,
      "learning_rate": 0.00013217642805495302,
      "loss": 6.0223,
      "mean_token_accuracy": 0.3833757105509308,
      "num_tokens": 3703746.0,
      "step": 470
    },
    {
      "epoch": 1.0412679019887328,
      "grad_norm": 0.0,
      "learning_rate": 0.00013073029645697758,
      "loss": 6.0146,
      "mean_token_accuracy": 0.38417146790307016,
      "num_tokens": 3782783.0,
      "step": 480
    },
    {
      "epoch": 1.0629878504038552,
      "grad_norm": 0.0,
      "learning_rate": 0.00012928416485900217,
      "loss": 5.9713,
      "mean_token_accuracy": 0.3863212176831439,
      "num_tokens": 3862048.0,
      "step": 490
    },
    {
      "epoch": 1.0847077988189777,
      "grad_norm": 0.0,
      "learning_rate": 0.00012783803326102676,
      "loss": 5.9944,
      "mean_token_accuracy": 0.38522618702845646,
      "num_tokens": 3941220.0,
      "step": 500
    },
    {
      "epoch": 1.1064277472341004,
      "grad_norm": 0.0,
      "learning_rate": 0.00012639190166305135,
      "loss": 5.9895,
      "mean_token_accuracy": 0.38548907284857703,
      "num_tokens": 4020089.0,
      "step": 510
    },
    {
      "epoch": 1.128147695649223,
      "grad_norm": 0.0,
      "learning_rate": 0.00012494577006507592,
      "loss": 5.9901,
      "mean_token_accuracy": 0.3841517207212746,
      "num_tokens": 4098981.0,
      "step": 520
    },
    {
      "epoch": 1.1498676440643454,
      "grad_norm": 0.0,
      "learning_rate": 0.0001234996384671005,
      "loss": 6.0125,
      "mean_token_accuracy": 0.38393973018974065,
      "num_tokens": 4177983.0,
      "step": 530
    },
    {
      "epoch": 1.1715875924794679,
      "grad_norm": 0.0,
      "learning_rate": 0.00012205350686912509,
      "loss": 5.9999,
      "mean_token_accuracy": 0.3848019931232557,
      "num_tokens": 4256818.0,
      "step": 540
    },
    {
      "epoch": 1.1933075408945903,
      "grad_norm": 0.0,
      "learning_rate": 0.00012060737527114966,
      "loss": 6.0146,
      "mean_token_accuracy": 0.3834400905063376,
      "num_tokens": 4335754.0,
      "step": 550
    },
    {
      "epoch": 1.2150274893097128,
      "grad_norm": 0.0,
      "learning_rate": 0.00011916124367317427,
      "loss": 5.9832,
      "mean_token_accuracy": 0.38411646473687144,
      "num_tokens": 4414480.0,
      "step": 560
    },
    {
      "epoch": 1.2367474377248353,
      "grad_norm": 0.0,
      "learning_rate": 0.00011771511207519885,
      "loss": 6.0393,
      "mean_token_accuracy": 0.3809544609161094,
      "num_tokens": 4493321.0,
      "step": 570
    },
    {
      "epoch": 1.2584673861399578,
      "grad_norm": 0.0,
      "learning_rate": 0.00011626898047722344,
      "loss": 5.9989,
      "mean_token_accuracy": 0.38565440035890786,
      "num_tokens": 4572297.0,
      "step": 580
    },
    {
      "epoch": 1.2801873345550805,
      "grad_norm": 0.0,
      "learning_rate": 0.00011482284887924801,
      "loss": 5.9983,
      "mean_token_accuracy": 0.3863610655302182,
      "num_tokens": 4651247.0,
      "step": 590
    },
    {
      "epoch": 1.301907282970203,
      "grad_norm": 0.0,
      "learning_rate": 0.00011337671728127259,
      "loss": 6.0116,
      "mean_token_accuracy": 0.3831069741398096,
      "num_tokens": 4730012.0,
      "step": 600
    },
    {
      "epoch": 1.3236272313853255,
      "grad_norm": 0.0,
      "learning_rate": 0.0001119305856832972,
      "loss": 6.0195,
      "mean_token_accuracy": 0.3857766560278833,
      "num_tokens": 4809013.0,
      "step": 610
    },
    {
      "epoch": 1.345347179800448,
      "grad_norm": 0.0,
      "learning_rate": 0.00011048445408532177,
      "loss": 6.0108,
      "mean_token_accuracy": 0.3835394912166521,
      "num_tokens": 4888027.0,
      "step": 620
    },
    {
      "epoch": 1.3670671282155704,
      "grad_norm": 0.0,
      "learning_rate": 0.00010903832248734635,
      "loss": 5.9558,
      "mean_token_accuracy": 0.3883327366434969,
      "num_tokens": 4966859.0,
      "step": 630
    },
    {
      "epoch": 1.3887870766306931,
      "grad_norm": 0.0,
      "learning_rate": 0.00010759219088937094,
      "loss": 6.0088,
      "mean_token_accuracy": 0.385754154715687,
      "num_tokens": 5045905.0,
      "step": 640
    },
    {
      "epoch": 1.4105070250458156,
      "grad_norm": 0.0,
      "learning_rate": 0.00010614605929139552,
      "loss": 6.0236,
      "mean_token_accuracy": 0.3829167880234309,
      "num_tokens": 5124413.0,
      "step": 650
    },
    {
      "epoch": 1.432226973460938,
      "grad_norm": 0.0,
      "learning_rate": 0.0001046999276934201,
      "loss": 6.0154,
      "mean_token_accuracy": 0.3834054367849603,
      "num_tokens": 5203273.0,
      "step": 660
    },
    {
      "epoch": 1.4539469218760606,
      "grad_norm": 0.0,
      "learning_rate": 0.0001032537960954447,
      "loss": 6.005,
      "mean_token_accuracy": 0.3848568681394681,
      "num_tokens": 5281984.0,
      "step": 670
    },
    {
      "epoch": 1.475666870291183,
      "grad_norm": 0.0,
      "learning_rate": 0.00010180766449746928,
      "loss": 5.9805,
      "mean_token_accuracy": 0.3862292483681813,
      "num_tokens": 5360846.0,
      "step": 680
    },
    {
      "epoch": 1.4973868187063055,
      "grad_norm": 0.0,
      "learning_rate": 0.00010036153289949386,
      "loss": 5.9871,
      "mean_token_accuracy": 0.3851381171494722,
      "num_tokens": 5439758.0,
      "step": 690
    },
    {
      "epoch": 1.519106767121428,
      "grad_norm": 0.0,
      "learning_rate": 9.891540130151843e-05,
      "loss": 6.0254,
      "mean_token_accuracy": 0.3831110308994539,
      "num_tokens": 5518768.0,
      "step": 700
    },
    {
      "epoch": 1.5408267155365505,
      "grad_norm": 0.0,
      "learning_rate": 9.746926970354303e-05,
      "loss": 6.0339,
      "mean_token_accuracy": 0.3808896674308926,
      "num_tokens": 5597672.0,
      "step": 710
    },
    {
      "epoch": 1.562546663951673,
      "grad_norm": 0.0,
      "learning_rate": 9.602313810556762e-05,
      "loss": 5.994,
      "mean_token_accuracy": 0.3865658549708314,
      "num_tokens": 5676393.0,
      "step": 720
    },
    {
      "epoch": 1.5842666123667957,
      "grad_norm": 0.0,
      "learning_rate": 9.45770065075922e-05,
      "loss": 6.0026,
      "mean_token_accuracy": 0.38676879862323404,
      "num_tokens": 5755416.0,
      "step": 730
    },
    {
      "epoch": 1.6059865607819181,
      "grad_norm": 0.0,
      "learning_rate": 9.313087490961678e-05,
      "loss": 6.0175,
      "mean_token_accuracy": 0.38389978763880206,
      "num_tokens": 5834322.0,
      "step": 740
    },
    {
      "epoch": 1.6277065091970406,
      "grad_norm": 0.0,
      "learning_rate": 9.168474331164136e-05,
      "loss": 6.0224,
      "mean_token_accuracy": 0.3840769061935134,
      "num_tokens": 5912925.0,
      "step": 750
    },
    {
      "epoch": 1.6494264576121633,
      "grad_norm": 0.0,
      "learning_rate": 9.023861171366594e-05,
      "loss": 5.9683,
      "mean_token_accuracy": 0.38740592543035746,
      "num_tokens": 5991913.0,
      "step": 760
    },
    {
      "epoch": 1.6711464060272858,
      "grad_norm": 0.0,
      "learning_rate": 8.879248011569053e-05,
      "loss": 6.0136,
      "mean_token_accuracy": 0.3827170055825263,
      "num_tokens": 6070546.0,
      "step": 770
    },
    {
      "epoch": 1.6928663544424083,
      "grad_norm": 0.0,
      "learning_rate": 8.734634851771512e-05,
      "loss": 5.9688,
      "mean_token_accuracy": 0.3871995336958207,
      "num_tokens": 6149623.0,
      "step": 780
    },
    {
      "epoch": 1.7145863028575308,
      "grad_norm": 0.0,
      "learning_rate": 8.59002169197397e-05,
      "loss": 6.0639,
      "mean_token_accuracy": 0.3800558194401674,
      "num_tokens": 6228396.0,
      "step": 790
    },
    {
      "epoch": 1.7363062512726533,
      "grad_norm": 0.0,
      "learning_rate": 8.445408532176429e-05,
      "loss": 5.9778,
      "mean_token_accuracy": 0.3854194077430293,
      "num_tokens": 6307375.0,
      "step": 800
    },
    {
      "epoch": 1.7580261996877757,
      "grad_norm": 0.0,
      "learning_rate": 8.300795372378887e-05,
      "loss": 6.0294,
      "mean_token_accuracy": 0.38350161886774004,
      "num_tokens": 6386536.0,
      "step": 810
    },
    {
      "epoch": 1.7797461481028982,
      "grad_norm": 0.0,
      "learning_rate": 8.156182212581345e-05,
      "loss": 5.9736,
      "mean_token_accuracy": 0.38414000715129076,
      "num_tokens": 6465708.0,
      "step": 820
    },
    {
      "epoch": 1.8014660965180207,
      "grad_norm": 0.0,
      "learning_rate": 8.011569052783804e-05,
      "loss": 5.9847,
      "mean_token_accuracy": 0.3877883184002712,
      "num_tokens": 6544782.0,
      "step": 830
    },
    {
      "epoch": 1.8231860449331432,
      "grad_norm": 0.0,
      "learning_rate": 7.866955892986261e-05,
      "loss": 6.043,
      "mean_token_accuracy": 0.38251175949117167,
      "num_tokens": 6623497.0,
      "step": 840
    },
    {
      "epoch": 1.8449059933482657,
      "grad_norm": 0.0,
      "learning_rate": 7.72234273318872e-05,
      "loss": 6.0297,
      "mean_token_accuracy": 0.38357739897910503,
      "num_tokens": 6702159.0,
      "step": 850
    },
    {
      "epoch": 1.8666259417633884,
      "grad_norm": 0.0,
      "learning_rate": 7.57772957339118e-05,
      "loss": 6.0335,
      "mean_token_accuracy": 0.38290644709486515,
      "num_tokens": 6780886.0,
      "step": 860
    },
    {
      "epoch": 1.8883458901785108,
      "grad_norm": 0.0,
      "learning_rate": 7.433116413593637e-05,
      "loss": 6.0273,
      "mean_token_accuracy": 0.38307004772359504,
      "num_tokens": 6859533.0,
      "step": 870
    },
    {
      "epoch": 1.9100658385936333,
      "grad_norm": 0.0,
      "learning_rate": 7.288503253796096e-05,
      "loss": 5.9827,
      "mean_token_accuracy": 0.385389854805544,
      "num_tokens": 6938490.0,
      "step": 880
    },
    {
      "epoch": 1.9317857870087558,
      "grad_norm": 0.0,
      "learning_rate": 7.143890093998554e-05,
      "loss": 6.0212,
      "mean_token_accuracy": 0.382453205762431,
      "num_tokens": 7017297.0,
      "step": 890
    },
    {
      "epoch": 1.9535057354238785,
      "grad_norm": 0.0,
      "learning_rate": 6.999276934201012e-05,
      "loss": 6.0043,
      "mean_token_accuracy": 0.38360455771908164,
      "num_tokens": 7096419.0,
      "step": 900
    },
    {
      "epoch": 1.975225683839001,
      "grad_norm": 0.0,
      "learning_rate": 6.854663774403471e-05,
      "loss": 5.985,
      "mean_token_accuracy": 0.38632277538999915,
      "num_tokens": 7175650.0,
      "step": 910
    },
    {
      "epoch": 1.9969456322541235,
      "grad_norm": 0.0,
      "learning_rate": 6.71005061460593e-05,
      "loss": 6.009,
      "mean_token_accuracy": 0.38311313565354793,
      "num_tokens": 7254378.0,
      "step": 920
    },
    {
      "epoch": 2.017375958732098,
      "grad_norm": 0.0,
      "learning_rate": 6.565437454808388e-05,
      "loss": 5.968,
      "mean_token_accuracy": 0.38770305797259674,
      "num_tokens": 7328545.0,
      "step": 930
    },
    {
      "epoch": 2.0390959071472206,
      "grad_norm": 0.0,
      "learning_rate": 6.420824295010847e-05,
      "loss": 5.9938,
      "mean_token_accuracy": 0.38461664781207217,
      "num_tokens": 7407672.0,
      "step": 940
    },
    {
      "epoch": 2.060815855562343,
      "grad_norm": 0.0,
      "learning_rate": 6.276211135213305e-05,
      "loss": 6.0073,
      "mean_token_accuracy": 0.3848728087265044,
      "num_tokens": 7486852.0,
      "step": 950
    },
    {
      "epoch": 2.0825358039774655,
      "grad_norm": 0.0,
      "learning_rate": 6.131597975415762e-05,
      "loss": 5.9564,
      "mean_token_accuracy": 0.3856648310320452,
      "num_tokens": 7565680.0,
      "step": 960
    },
    {
      "epoch": 2.104255752392588,
      "grad_norm": 0.0,
      "learning_rate": 5.9869848156182215e-05,
      "loss": 6.0092,
      "mean_token_accuracy": 0.38208459480665624,
      "num_tokens": 7644328.0,
      "step": 970
    },
    {
      "epoch": 2.1259757008077105,
      "grad_norm": 0.0,
      "learning_rate": 5.84237165582068e-05,
      "loss": 6.0331,
      "mean_token_accuracy": 0.38403523166198283,
      "num_tokens": 7723297.0,
      "step": 980
    },
    {
      "epoch": 2.147695649222833,
      "grad_norm": 0.0,
      "learning_rate": 5.697758496023138e-05,
      "loss": 6.0003,
      "mean_token_accuracy": 0.38371361922472713,
      "num_tokens": 7802228.0,
      "step": 990
    },
    {
      "epoch": 2.1694155976379554,
      "grad_norm": 0.0,
      "learning_rate": 5.553145336225597e-05,
      "loss": 5.9941,
      "mean_token_accuracy": 0.386648180836346,
      "num_tokens": 7881292.0,
      "step": 1000
    },
    {
      "epoch": 2.191135546053078,
      "grad_norm": 0.0,
      "learning_rate": 5.408532176428055e-05,
      "loss": 5.9629,
      "mean_token_accuracy": 0.3859988093841821,
      "num_tokens": 7960306.0,
      "step": 1010
    },
    {
      "epoch": 2.212855494468201,
      "grad_norm": 0.0,
      "learning_rate": 5.263919016630514e-05,
      "loss": 5.9615,
      "mean_token_accuracy": 0.3859686201903969,
      "num_tokens": 8039403.0,
      "step": 1020
    },
    {
      "epoch": 2.2345754428833233,
      "grad_norm": 0.0,
      "learning_rate": 5.119305856832972e-05,
      "loss": 5.9744,
      "mean_token_accuracy": 0.3849541787989438,
      "num_tokens": 8118810.0,
      "step": 1030
    },
    {
      "epoch": 2.256295391298446,
      "grad_norm": 0.0,
      "learning_rate": 4.9746926970354304e-05,
      "loss": 6.0192,
      "mean_token_accuracy": 0.3843318622326478,
      "num_tokens": 8197744.0,
      "step": 1040
    },
    {
      "epoch": 2.2780153397135683,
      "grad_norm": 0.0,
      "learning_rate": 4.830079537237889e-05,
      "loss": 6.0364,
      "mean_token_accuracy": 0.38417832332197577,
      "num_tokens": 8276673.0,
      "step": 1050
    },
    {
      "epoch": 2.2997352881286908,
      "grad_norm": 0.0,
      "learning_rate": 4.685466377440347e-05,
      "loss": 6.0178,
      "mean_token_accuracy": 0.38303852297831326,
      "num_tokens": 8355486.0,
      "step": 1060
    },
    {
      "epoch": 2.3214552365438132,
      "grad_norm": 0.0,
      "learning_rate": 4.540853217642806e-05,
      "loss": 5.9567,
      "mean_token_accuracy": 0.38857606865931305,
      "num_tokens": 8434638.0,
      "step": 1070
    },
    {
      "epoch": 2.3431751849589357,
      "grad_norm": 0.0,
      "learning_rate": 4.396240057845264e-05,
      "loss": 5.9893,
      "mean_token_accuracy": 0.38565383905079215,
      "num_tokens": 8513720.0,
      "step": 1080
    },
    {
      "epoch": 2.364895133374058,
      "grad_norm": 0.0,
      "learning_rate": 4.2516268980477226e-05,
      "loss": 5.9812,
      "mean_token_accuracy": 0.38872407528106123,
      "num_tokens": 8592514.0,
      "step": 1090
    },
    {
      "epoch": 2.3866150817891807,
      "grad_norm": 0.0,
      "learning_rate": 4.107013738250181e-05,
      "loss": 6.0234,
      "mean_token_accuracy": 0.3825469396775588,
      "num_tokens": 8671231.0,
      "step": 1100
    },
    {
      "epoch": 2.408335030204303,
      "grad_norm": 0.0,
      "learning_rate": 3.9624005784526394e-05,
      "loss": 6.0149,
      "mean_token_accuracy": 0.3845805463381112,
      "num_tokens": 8750288.0,
      "step": 1110
    },
    {
      "epoch": 2.4300549786194257,
      "grad_norm": 0.0,
      "learning_rate": 3.817787418655098e-05,
      "loss": 6.0056,
      "mean_token_accuracy": 0.38355012007523326,
      "num_tokens": 8829159.0,
      "step": 1120
    },
    {
      "epoch": 2.451774927034548,
      "grad_norm": 0.0,
      "learning_rate": 3.673174258857556e-05,
      "loss": 5.9945,
      "mean_token_accuracy": 0.38476743231294674,
      "num_tokens": 8907706.0,
      "step": 1130
    },
    {
      "epoch": 2.4734948754496706,
      "grad_norm": 0.0,
      "learning_rate": 3.5285610990600147e-05,
      "loss": 5.9712,
      "mean_token_accuracy": 0.3861268714419566,
      "num_tokens": 8986526.0,
      "step": 1140
    },
    {
      "epoch": 2.495214823864793,
      "grad_norm": 0.0,
      "learning_rate": 3.383947939262473e-05,
      "loss": 6.0243,
      "mean_token_accuracy": 0.3839300020830706,
      "num_tokens": 9065399.0,
      "step": 1150
    },
    {
      "epoch": 2.5169347722799156,
      "grad_norm": 0.0,
      "learning_rate": 3.2393347794649315e-05,
      "loss": 6.0197,
      "mean_token_accuracy": 0.3838517373194918,
      "num_tokens": 9144182.0,
      "step": 1160
    },
    {
      "epoch": 2.5386547206950385,
      "grad_norm": 0.0,
      "learning_rate": 3.09472161966739e-05,
      "loss": 6.0336,
      "mean_token_accuracy": 0.38345675652381034,
      "num_tokens": 9222743.0,
      "step": 1170
    },
    {
      "epoch": 2.560374669110161,
      "grad_norm": 0.0,
      "learning_rate": 2.950108459869848e-05,
      "loss": 6.0275,
      "mean_token_accuracy": 0.38377260488923637,
      "num_tokens": 9301686.0,
      "step": 1180
    },
    {
      "epoch": 2.5820946175252835,
      "grad_norm": 0.0,
      "learning_rate": 2.8054953000723068e-05,
      "loss": 6.0401,
      "mean_token_accuracy": 0.38272831091890114,
      "num_tokens": 9380398.0,
      "step": 1190
    },
    {
      "epoch": 2.603814565940406,
      "grad_norm": 0.0,
      "learning_rate": 2.6608821402747652e-05,
      "loss": 6.003,
      "mean_token_accuracy": 0.38574083771090956,
      "num_tokens": 9459228.0,
      "step": 1200
    },
    {
      "epoch": 2.6255345143555284,
      "grad_norm": 0.0,
      "learning_rate": 2.516268980477224e-05,
      "loss": 6.0233,
      "mean_token_accuracy": 0.3827782694483176,
      "num_tokens": 9538141.0,
      "step": 1210
    },
    {
      "epoch": 2.647254462770651,
      "grad_norm": 0.0,
      "learning_rate": 2.3716558206796817e-05,
      "loss": 6.0003,
      "mean_token_accuracy": 0.3853726448956877,
      "num_tokens": 9617091.0,
      "step": 1220
    },
    {
      "epoch": 2.6689744111857734,
      "grad_norm": 0.0,
      "learning_rate": 2.2270426608821404e-05,
      "loss": 6.0271,
      "mean_token_accuracy": 0.38530398234725,
      "num_tokens": 9696158.0,
      "step": 1230
    },
    {
      "epoch": 2.690694359600896,
      "grad_norm": 0.0,
      "learning_rate": 2.082429501084599e-05,
      "loss": 5.9816,
      "mean_token_accuracy": 0.38289073556661607,
      "num_tokens": 9775025.0,
      "step": 1240
    },
    {
      "epoch": 2.7124143080160183,
      "grad_norm": 0.0,
      "learning_rate": 1.9378163412870573e-05,
      "loss": 6.003,
      "mean_token_accuracy": 0.3852554757846519,
      "num_tokens": 9853991.0,
      "step": 1250
    },
    {
      "epoch": 2.734134256431141,
      "grad_norm": 0.0,
      "learning_rate": 1.7932031814895157e-05,
      "loss": 6.0387,
      "mean_token_accuracy": 0.38218798900488765,
      "num_tokens": 9932459.0,
      "step": 1260
    },
    {
      "epoch": 2.7558542048462638,
      "grad_norm": 0.0,
      "learning_rate": 1.648590021691974e-05,
      "loss": 6.0218,
      "mean_token_accuracy": 0.38361733745550736,
      "num_tokens": 10011147.0,
      "step": 1270
    },
    {
      "epoch": 2.7775741532613862,
      "grad_norm": 0.0,
      "learning_rate": 1.5039768618944326e-05,
      "loss": 6.0164,
      "mean_token_accuracy": 0.3833353664376773,
      "num_tokens": 10089867.0,
      "step": 1280
    },
    {
      "epoch": 2.7992941016765087,
      "grad_norm": 0.0,
      "learning_rate": 1.3593637020968908e-05,
      "loss": 6.0203,
      "mean_token_accuracy": 0.3844827389344573,
      "num_tokens": 10168558.0,
      "step": 1290
    },
    {
      "epoch": 2.821014050091631,
      "grad_norm": 0.0,
      "learning_rate": 1.2147505422993492e-05,
      "loss": 6.0223,
      "mean_token_accuracy": 0.38475555207114664,
      "num_tokens": 10247626.0,
      "step": 1300
    },
    {
      "epoch": 2.8427339985067537,
      "grad_norm": 0.0,
      "learning_rate": 1.0701373825018076e-05,
      "loss": 5.997,
      "mean_token_accuracy": 0.3833994207670912,
      "num_tokens": 10326786.0,
      "step": 1310
    },
    {
      "epoch": 2.864453946921876,
      "grad_norm": 0.0,
      "learning_rate": 9.25524222704266e-06,
      "loss": 6.0283,
      "mean_token_accuracy": 0.3812030048458837,
      "num_tokens": 10405619.0,
      "step": 1320
    },
    {
      "epoch": 2.8861738953369986,
      "grad_norm": 0.0,
      "learning_rate": 7.809110629067245e-06,
      "loss": 6.0422,
      "mean_token_accuracy": 0.381096905877348,
      "num_tokens": 10484466.0,
      "step": 1330
    },
    {
      "epoch": 2.907893843752121,
      "grad_norm": 0.0,
      "learning_rate": 6.36297903109183e-06,
      "loss": 5.9859,
      "mean_token_accuracy": 0.38405414449516684,
      "num_tokens": 10563350.0,
      "step": 1340
    },
    {
      "epoch": 2.9296137921672436,
      "grad_norm": 0.0,
      "learning_rate": 4.916847433116413e-06,
      "loss": 6.0283,
      "mean_token_accuracy": 0.38398846148047594,
      "num_tokens": 10642421.0,
      "step": 1350
    },
    {
      "epoch": 2.951333740582366,
      "grad_norm": 0.0,
      "learning_rate": 3.470715835140998e-06,
      "loss": 6.0132,
      "mean_token_accuracy": 0.3835720970411785,
      "num_tokens": 10721272.0,
      "step": 1360
    },
    {
      "epoch": 2.9730536889974886,
      "grad_norm": 0.0,
      "learning_rate": 2.024584237165582e-06,
      "loss": 6.0098,
      "mean_token_accuracy": 0.3844847684260458,
      "num_tokens": 10800144.0,
      "step": 1370
    },
    {
      "epoch": 2.994773637412611,
      "grad_norm": 0.0,
      "learning_rate": 5.784526391901663e-07,
      "loss": 5.962,
      "mean_token_accuracy": 0.3870904964976944,
      "num_tokens": 10879173.0,
      "step": 1380
    }
  ],
  "logging_steps": 10,
  "max_steps": 1383,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.6605276253782835e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}