{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.6258890469416785, "eval_steps": 500, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 4.104878902435303, "epoch": 0.000437684648210964, "grad_norm": 628.0, "learning_rate": 0.0, "loss": 19.4827, "loss_lm": 18.642059326171875, "loss_seg": 0.8406008183956146, "mean_token_accuracy": 0.020538671873509884, "num_tokens": 425478.0, "step": 1 }, { "entropy": 4.110010027885437, "epoch": 0.000875369296421928, "grad_norm": 624.0, "learning_rate": 8.746355685131196e-08, "loss": 19.5168, "loss_lm": 18.6736741065979, "loss_seg": 0.8431016504764557, "mean_token_accuracy": 0.02053798781707883, "num_tokens": 850343.0, "step": 2 }, { "entropy": 4.096648573875427, "epoch": 0.001313053944632892, "grad_norm": 628.0, "learning_rate": 1.7492711370262392e-07, "loss": 19.605, "loss_lm": 18.74001979827881, "loss_seg": 0.865015983581543, "mean_token_accuracy": 0.020309734158217907, "num_tokens": 1275358.0, "step": 3 }, { "entropy": 4.098791837692261, "epoch": 0.001750738592843856, "grad_norm": 632.0, "learning_rate": 2.623906705539359e-07, "loss": 19.5028, "loss_lm": 18.66270685195923, "loss_seg": 0.8401043862104416, "mean_token_accuracy": 0.02048569405451417, "num_tokens": 1700572.0, "step": 4 }, { "entropy": 4.139226913452148, "epoch": 0.00218842324105482, "grad_norm": 628.0, "learning_rate": 3.4985422740524783e-07, "loss": 19.4285, "loss_lm": 18.58917760848999, "loss_seg": 0.839279443025589, "mean_token_accuracy": 0.02073761261999607, "num_tokens": 2125932.0, "step": 5 }, { "entropy": 4.139833688735962, "epoch": 0.002626107889265784, "grad_norm": 628.0, "learning_rate": 4.373177842565598e-07, "loss": 19.4419, "loss_lm": 18.584145069122314, "loss_seg": 0.8577510565519333, "mean_token_accuracy": 0.020450078416615725, "num_tokens": 2550475.0, "step": 6 }, { "entropy": 4.133984088897705, "epoch": 0.003063792537476748, "grad_norm": 632.0, "learning_rate": 5.247813411078718e-07, "loss": 19.4605, "loss_lm": 18.598727226257324, "loss_seg": 0.8617883324623108, "mean_token_accuracy": 0.02044122898951173, "num_tokens": 2976003.0, "step": 7 }, { "entropy": 4.142183423042297, "epoch": 0.003501477185687712, "grad_norm": 648.0, "learning_rate": 6.122448979591837e-07, "loss": 19.4832, "loss_lm": 18.646227836608887, "loss_seg": 0.836944580078125, "mean_token_accuracy": 0.020664195995777845, "num_tokens": 3400650.0, "step": 8 }, { "entropy": 4.117235779762268, "epoch": 0.003939161833898676, "grad_norm": 632.0, "learning_rate": 6.997084548104957e-07, "loss": 19.4835, "loss_lm": 18.637729167938232, "loss_seg": 0.8458167910575867, "mean_token_accuracy": 0.0205116905272007, "num_tokens": 3825320.0, "step": 9 }, { "entropy": 4.138426065444946, "epoch": 0.00437684648210964, "grad_norm": 664.0, "learning_rate": 7.871720116618077e-07, "loss": 19.4455, "loss_lm": 18.595465660095215, "loss_seg": 0.8499866127967834, "mean_token_accuracy": 0.02081992756575346, "num_tokens": 4250622.0, "step": 10 }, { "entropy": 4.1646541357040405, "epoch": 0.004814531130320604, "grad_norm": 636.0, "learning_rate": 8.746355685131196e-07, "loss": 19.4057, "loss_lm": 18.539565563201904, "loss_seg": 0.8661729991436005, "mean_token_accuracy": 0.02072056569159031, "num_tokens": 4676286.0, "step": 11 }, { "entropy": 4.138994574546814, "epoch": 0.005252215778531568, "grad_norm": 628.0, "learning_rate": 9.620991253644314e-07, "loss": 19.4485, "loss_lm": 18.582157611846924, "loss_seg": 0.8663884401321411, "mean_token_accuracy": 0.020047332160174847, "num_tokens": 5101307.0, "step": 12 }, { "entropy": 4.1242780685424805, "epoch": 0.005689900426742532, "grad_norm": 624.0, "learning_rate": 1.0495626822157436e-06, "loss": 19.4427, "loss_lm": 18.60509204864502, "loss_seg": 0.8376338928937912, "mean_token_accuracy": 0.02113261353224516, "num_tokens": 5526204.0, "step": 13 }, { "entropy": 4.153870344161987, "epoch": 0.006127585074953496, "grad_norm": 644.0, "learning_rate": 1.1370262390670554e-06, "loss": 19.3921, "loss_lm": 18.543609619140625, "loss_seg": 0.848511815071106, "mean_token_accuracy": 0.020885376259684563, "num_tokens": 5951279.0, "step": 14 }, { "entropy": 4.16007924079895, "epoch": 0.00656526972316446, "grad_norm": 636.0, "learning_rate": 1.2244897959183673e-06, "loss": 19.2968, "loss_lm": 18.468181133270264, "loss_seg": 0.8286407589912415, "mean_token_accuracy": 0.02069214405491948, "num_tokens": 6376020.0, "step": 15 }, { "entropy": 4.18977677822113, "epoch": 0.007002954371375424, "grad_norm": 652.0, "learning_rate": 1.3119533527696792e-06, "loss": 19.3496, "loss_lm": 18.529414653778076, "loss_seg": 0.8202069252729416, "mean_token_accuracy": 0.020420115906745195, "num_tokens": 6801137.0, "step": 16 }, { "entropy": 4.208926677703857, "epoch": 0.007440639019586388, "grad_norm": 632.0, "learning_rate": 1.3994169096209913e-06, "loss": 19.3475, "loss_lm": 18.499176502227783, "loss_seg": 0.8482926338911057, "mean_token_accuracy": 0.02065203059464693, "num_tokens": 7226854.0, "step": 17 }, { "entropy": 4.211102604866028, "epoch": 0.007878323667797352, "grad_norm": 620.0, "learning_rate": 1.4868804664723032e-06, "loss": 19.3488, "loss_lm": 18.508638858795166, "loss_seg": 0.8401971608400345, "mean_token_accuracy": 0.02062120195478201, "num_tokens": 7652088.0, "step": 18 }, { "entropy": 4.234554648399353, "epoch": 0.008316008316008316, "grad_norm": 624.0, "learning_rate": 1.5743440233236153e-06, "loss": 19.2759, "loss_lm": 18.41702365875244, "loss_seg": 0.8588752299547195, "mean_token_accuracy": 0.020260877907276154, "num_tokens": 8077029.0, "step": 19 }, { "entropy": 4.258025288581848, "epoch": 0.00875369296421928, "grad_norm": 628.0, "learning_rate": 1.6618075801749272e-06, "loss": 19.236, "loss_lm": 18.354125499725342, "loss_seg": 0.8818369358778, "mean_token_accuracy": 0.020666799508035183, "num_tokens": 8502715.0, "step": 20 }, { "entropy": 4.2570120096206665, "epoch": 0.009191377612430244, "grad_norm": 624.0, "learning_rate": 1.7492711370262391e-06, "loss": 19.2707, "loss_lm": 18.425055980682373, "loss_seg": 0.8456433117389679, "mean_token_accuracy": 0.020707206334918737, "num_tokens": 8927542.0, "step": 21 }, { "entropy": 4.274366855621338, "epoch": 0.009629062260641207, "grad_norm": 620.0, "learning_rate": 1.836734693877551e-06, "loss": 19.2681, "loss_lm": 18.39394474029541, "loss_seg": 0.8741657733917236, "mean_token_accuracy": 0.021036023274064064, "num_tokens": 9352192.0, "step": 22 }, { "entropy": 4.297635793685913, "epoch": 0.010066746908852172, "grad_norm": 620.0, "learning_rate": 1.924198250728863e-06, "loss": 19.23, "loss_lm": 18.39939308166504, "loss_seg": 0.8305719941854477, "mean_token_accuracy": 0.020366990938782692, "num_tokens": 9776884.0, "step": 23 }, { "entropy": 4.318657875061035, "epoch": 0.010504431557063137, "grad_norm": 624.0, "learning_rate": 2.011661807580175e-06, "loss": 19.1035, "loss_lm": 18.262593746185303, "loss_seg": 0.8409073501825333, "mean_token_accuracy": 0.020451943390071392, "num_tokens": 10201371.0, "step": 24 }, { "entropy": 4.3708735704422, "epoch": 0.0109421162052741, "grad_norm": 620.0, "learning_rate": 2.099125364431487e-06, "loss": 19.0604, "loss_lm": 18.228548049926758, "loss_seg": 0.8318380117416382, "mean_token_accuracy": 0.020378126297146082, "num_tokens": 10626227.0, "step": 25 }, { "entropy": 4.418497562408447, "epoch": 0.011379800853485065, "grad_norm": 616.0, "learning_rate": 2.186588921282799e-06, "loss": 18.9946, "loss_lm": 18.15699338912964, "loss_seg": 0.8375954031944275, "mean_token_accuracy": 0.022820821031928062, "num_tokens": 11051915.0, "step": 26 }, { "entropy": 4.489338517189026, "epoch": 0.011817485501696028, "grad_norm": 632.0, "learning_rate": 2.274052478134111e-06, "loss": 18.8133, "loss_lm": 17.96775531768799, "loss_seg": 0.845514252781868, "mean_token_accuracy": 0.022121363785117865, "num_tokens": 11477232.0, "step": 27 }, { "entropy": 4.523482918739319, "epoch": 0.012255170149906992, "grad_norm": 612.0, "learning_rate": 2.361516034985423e-06, "loss": 18.7449, "loss_lm": 17.89174461364746, "loss_seg": 0.8531419187784195, "mean_token_accuracy": 0.02145737176761031, "num_tokens": 11902270.0, "step": 28 }, { "entropy": 4.573235273361206, "epoch": 0.012692854798117955, "grad_norm": 604.0, "learning_rate": 2.4489795918367347e-06, "loss": 18.6807, "loss_lm": 17.83618927001953, "loss_seg": 0.8445169031620026, "mean_token_accuracy": 0.022404072806239128, "num_tokens": 12326920.0, "step": 29 }, { "entropy": 4.616198897361755, "epoch": 0.01313053944632892, "grad_norm": 608.0, "learning_rate": 2.5364431486880463e-06, "loss": 18.6461, "loss_lm": 17.78361225128174, "loss_seg": 0.8625232726335526, "mean_token_accuracy": 0.0220795925706625, "num_tokens": 12751901.0, "step": 30 }, { "entropy": 4.684335708618164, "epoch": 0.013568224094539885, "grad_norm": 612.0, "learning_rate": 2.6239067055393585e-06, "loss": 18.624, "loss_lm": 17.757392406463623, "loss_seg": 0.8666472434997559, "mean_token_accuracy": 0.02265262883156538, "num_tokens": 13176384.0, "step": 31 }, { "entropy": 4.7467416524887085, "epoch": 0.014005908742750848, "grad_norm": 604.0, "learning_rate": 2.7113702623906706e-06, "loss": 18.3709, "loss_lm": 17.516743183135986, "loss_seg": 0.8541130572557449, "mean_token_accuracy": 0.022426888346672058, "num_tokens": 13601125.0, "step": 32 }, { "entropy": 4.8103309869766235, "epoch": 0.014443593390961813, "grad_norm": 596.0, "learning_rate": 2.7988338192419827e-06, "loss": 18.3897, "loss_lm": 17.506062507629395, "loss_seg": 0.8836417943239212, "mean_token_accuracy": 0.02253139577805996, "num_tokens": 14026371.0, "step": 33 }, { "entropy": 4.841110587120056, "epoch": 0.014881278039172776, "grad_norm": 600.0, "learning_rate": 2.8862973760932943e-06, "loss": 18.2551, "loss_lm": 17.421446800231934, "loss_seg": 0.8336228281259537, "mean_token_accuracy": 0.022542994003742933, "num_tokens": 14451708.0, "step": 34 }, { "entropy": 4.9447150230407715, "epoch": 0.01531896268738374, "grad_norm": 600.0, "learning_rate": 2.9737609329446064e-06, "loss": 18.1591, "loss_lm": 17.30117654800415, "loss_seg": 0.8579097539186478, "mean_token_accuracy": 0.02266804175451398, "num_tokens": 14877002.0, "step": 35 }, { "entropy": 4.96096396446228, "epoch": 0.015756647335594703, "grad_norm": 592.0, "learning_rate": 3.0612244897959185e-06, "loss": 18.0625, "loss_lm": 17.223594665527344, "loss_seg": 0.838898628950119, "mean_token_accuracy": 0.022727820090949535, "num_tokens": 15301842.0, "step": 36 }, { "entropy": 5.049310803413391, "epoch": 0.016194331983805668, "grad_norm": 592.0, "learning_rate": 3.1486880466472307e-06, "loss": 17.9739, "loss_lm": 17.104003429412842, "loss_seg": 0.8698806166648865, "mean_token_accuracy": 0.02251737006008625, "num_tokens": 15726784.0, "step": 37 }, { "entropy": 5.09979248046875, "epoch": 0.016632016632016633, "grad_norm": 584.0, "learning_rate": 3.2361516034985423e-06, "loss": 17.9249, "loss_lm": 17.07524824142456, "loss_seg": 0.8496032357215881, "mean_token_accuracy": 0.02247198298573494, "num_tokens": 16151917.0, "step": 38 }, { "entropy": 5.172423958778381, "epoch": 0.017069701280227598, "grad_norm": 588.0, "learning_rate": 3.3236151603498544e-06, "loss": 17.7249, "loss_lm": 16.88486623764038, "loss_seg": 0.840074211359024, "mean_token_accuracy": 0.022611237596720457, "num_tokens": 16576777.0, "step": 39 }, { "entropy": 5.272212028503418, "epoch": 0.01750738592843856, "grad_norm": 580.0, "learning_rate": 3.411078717201166e-06, "loss": 17.5753, "loss_lm": 16.728742122650146, "loss_seg": 0.8465760946273804, "mean_token_accuracy": 0.022696237079799175, "num_tokens": 17002246.0, "step": 40 }, { "entropy": 5.384192585945129, "epoch": 0.017945070576649524, "grad_norm": 576.0, "learning_rate": 3.4985422740524782e-06, "loss": 17.4006, "loss_lm": 16.571444511413574, "loss_seg": 0.8291645497083664, "mean_token_accuracy": 0.022919199895113707, "num_tokens": 17427462.0, "step": 41 }, { "entropy": 5.466962099075317, "epoch": 0.01838275522486049, "grad_norm": 584.0, "learning_rate": 3.58600583090379e-06, "loss": 17.2907, "loss_lm": 16.481374740600586, "loss_seg": 0.8093040734529495, "mean_token_accuracy": 0.022829109337180853, "num_tokens": 17853081.0, "step": 42 }, { "entropy": 5.632600665092468, "epoch": 0.018820439873071453, "grad_norm": 592.0, "learning_rate": 3.673469387755102e-06, "loss": 17.149, "loss_lm": 16.299394607543945, "loss_seg": 0.8496248573064804, "mean_token_accuracy": 0.02303947973996401, "num_tokens": 18277776.0, "step": 43 }, { "entropy": 5.7409446239471436, "epoch": 0.019258124521282415, "grad_norm": 572.0, "learning_rate": 3.7609329446064145e-06, "loss": 17.0027, "loss_lm": 16.17810869216919, "loss_seg": 0.8245757967233658, "mean_token_accuracy": 0.022922689095139503, "num_tokens": 18702635.0, "step": 44 }, { "entropy": 5.896754741668701, "epoch": 0.01969580916949338, "grad_norm": 572.0, "learning_rate": 3.848396501457726e-06, "loss": 16.7583, "loss_lm": 15.921148538589478, "loss_seg": 0.8371168971061707, "mean_token_accuracy": 0.022921018302440643, "num_tokens": 19127918.0, "step": 45 }, { "entropy": 6.113203644752502, "epoch": 0.020133493817704344, "grad_norm": 600.0, "learning_rate": 3.935860058309038e-06, "loss": 16.5465, "loss_lm": 15.699619770050049, "loss_seg": 0.8468835353851318, "mean_token_accuracy": 0.022550095804035664, "num_tokens": 19552111.0, "step": 46 }, { "entropy": 6.304031133651733, "epoch": 0.02057117846591531, "grad_norm": 600.0, "learning_rate": 4.02332361516035e-06, "loss": 16.1662, "loss_lm": 15.332337617874146, "loss_seg": 0.8338899612426758, "mean_token_accuracy": 0.02316479478031397, "num_tokens": 19977708.0, "step": 47 }, { "entropy": 6.566628575325012, "epoch": 0.021008863114126274, "grad_norm": 676.0, "learning_rate": 4.110787172011661e-06, "loss": 15.7529, "loss_lm": 14.921638011932373, "loss_seg": 0.8312242180109024, "mean_token_accuracy": 0.023348627611994743, "num_tokens": 20402746.0, "step": 48 }, { "entropy": 6.899168491363525, "epoch": 0.021446547762337235, "grad_norm": 772.0, "learning_rate": 4.198250728862974e-06, "loss": 15.3192, "loss_lm": 14.478381872177124, "loss_seg": 0.8408272415399551, "mean_token_accuracy": 0.023646533954888582, "num_tokens": 20827899.0, "step": 49 }, { "entropy": 7.224049210548401, "epoch": 0.0218842324105482, "grad_norm": 964.0, "learning_rate": 4.2857142857142855e-06, "loss": 14.6668, "loss_lm": 13.820450067520142, "loss_seg": 0.8463305830955505, "mean_token_accuracy": 0.02392322849482298, "num_tokens": 21253144.0, "step": 50 }, { "entropy": 7.57668137550354, "epoch": 0.022321917058759164, "grad_norm": 1168.0, "learning_rate": 4.373177842565598e-06, "loss": 13.8626, "loss_lm": 13.035579204559326, "loss_seg": 0.8270699828863144, "mean_token_accuracy": 0.024924433790147305, "num_tokens": 21678449.0, "step": 51 }, { "entropy": 7.9110424518585205, "epoch": 0.02275960170697013, "grad_norm": 1272.0, "learning_rate": 4.46064139941691e-06, "loss": 13.0226, "loss_lm": 12.205313205718994, "loss_seg": 0.8172555267810822, "mean_token_accuracy": 0.02448875503614545, "num_tokens": 22103638.0, "step": 52 }, { "entropy": 8.1587233543396, "epoch": 0.02319728635518109, "grad_norm": 1224.0, "learning_rate": 4.548104956268222e-06, "loss": 12.258, "loss_lm": 11.426699876785278, "loss_seg": 0.8312846422195435, "mean_token_accuracy": 0.024683883879333735, "num_tokens": 22529319.0, "step": 53 }, { "entropy": 8.319692134857178, "epoch": 0.023634971003392055, "grad_norm": 1004.0, "learning_rate": 4.635568513119534e-06, "loss": 11.4891, "loss_lm": 10.678548097610474, "loss_seg": 0.8105914741754532, "mean_token_accuracy": 0.02486988063901663, "num_tokens": 22953674.0, "step": 54 }, { "entropy": 8.375528335571289, "epoch": 0.02407265565160302, "grad_norm": 764.0, "learning_rate": 4.723032069970846e-06, "loss": 10.9868, "loss_lm": 10.151775360107422, "loss_seg": 0.8350031673908234, "mean_token_accuracy": 0.024908107705414295, "num_tokens": 23378493.0, "step": 55 }, { "entropy": 8.342453956604004, "epoch": 0.024510340299813985, "grad_norm": 656.0, "learning_rate": 4.810495626822157e-06, "loss": 10.6976, "loss_lm": 9.881311655044556, "loss_seg": 0.8163158297538757, "mean_token_accuracy": 0.02534069772809744, "num_tokens": 23803995.0, "step": 56 }, { "entropy": 8.288888692855835, "epoch": 0.02494802494802495, "grad_norm": 644.0, "learning_rate": 4.897959183673469e-06, "loss": 10.5106, "loss_lm": 9.683181524276733, "loss_seg": 0.8274053186178207, "mean_token_accuracy": 0.02570035634562373, "num_tokens": 24228181.0, "step": 57 }, { "entropy": 8.27146315574646, "epoch": 0.02538570959623591, "grad_norm": 656.0, "learning_rate": 4.9854227405247814e-06, "loss": 10.3904, "loss_lm": 9.54770541191101, "loss_seg": 0.8427385240793228, "mean_token_accuracy": 0.02557573840022087, "num_tokens": 24653772.0, "step": 58 }, { "entropy": 8.294415950775146, "epoch": 0.025823394244446875, "grad_norm": 656.0, "learning_rate": 5.072886297376093e-06, "loss": 10.2293, "loss_lm": 9.403592348098755, "loss_seg": 0.8256871104240417, "mean_token_accuracy": 0.02594029949977994, "num_tokens": 25078483.0, "step": 59 }, { "entropy": 8.341434955596924, "epoch": 0.02626107889265784, "grad_norm": 648.0, "learning_rate": 5.160349854227406e-06, "loss": 10.089, "loss_lm": 9.257452487945557, "loss_seg": 0.8315456658601761, "mean_token_accuracy": 0.02601619577035308, "num_tokens": 25503480.0, "step": 60 }, { "entropy": 8.400374174118042, "epoch": 0.026698763540868805, "grad_norm": 636.0, "learning_rate": 5.247813411078717e-06, "loss": 9.9336, "loss_lm": 9.10638427734375, "loss_seg": 0.8271730989217758, "mean_token_accuracy": 0.026183578185737133, "num_tokens": 25928903.0, "step": 61 }, { "entropy": 8.446983814239502, "epoch": 0.02713644818907977, "grad_norm": 624.0, "learning_rate": 5.33527696793003e-06, "loss": 9.7572, "loss_lm": 8.928802967071533, "loss_seg": 0.8283981382846832, "mean_token_accuracy": 0.0264985840767622, "num_tokens": 26353646.0, "step": 62 }, { "entropy": 8.484761714935303, "epoch": 0.02757413283729073, "grad_norm": 620.0, "learning_rate": 5.422740524781341e-06, "loss": 9.6194, "loss_lm": 8.808688879013062, "loss_seg": 0.8106957226991653, "mean_token_accuracy": 0.025984744541347027, "num_tokens": 26778459.0, "step": 63 }, { "entropy": 8.504687547683716, "epoch": 0.028011817485501696, "grad_norm": 616.0, "learning_rate": 5.510204081632653e-06, "loss": 9.4691, "loss_lm": 8.66999340057373, "loss_seg": 0.7991273552179337, "mean_token_accuracy": 0.026199035346508026, "num_tokens": 27203889.0, "step": 64 }, { "entropy": 8.510120630264282, "epoch": 0.02844950213371266, "grad_norm": 616.0, "learning_rate": 5.597667638483965e-06, "loss": 9.3298, "loss_lm": 8.546691656112671, "loss_seg": 0.783078134059906, "mean_token_accuracy": 0.026289382949471474, "num_tokens": 27628891.0, "step": 65 }, { "entropy": 8.50618839263916, "epoch": 0.028887186781923625, "grad_norm": 620.0, "learning_rate": 5.6851311953352774e-06, "loss": 9.2435, "loss_lm": 8.45438265800476, "loss_seg": 0.7891413271427155, "mean_token_accuracy": 0.026497644372284412, "num_tokens": 28053425.0, "step": 66 }, { "entropy": 8.497968912124634, "epoch": 0.029324871430134587, "grad_norm": 628.0, "learning_rate": 5.772594752186589e-06, "loss": 9.1173, "loss_lm": 8.350813627243042, "loss_seg": 0.7664579004049301, "mean_token_accuracy": 0.02671846654266119, "num_tokens": 28479297.0, "step": 67 }, { "entropy": 8.492021560668945, "epoch": 0.02976255607834555, "grad_norm": 636.0, "learning_rate": 5.860058309037901e-06, "loss": 9.0377, "loss_lm": 8.254661321640015, "loss_seg": 0.7830318510532379, "mean_token_accuracy": 0.02628872124478221, "num_tokens": 28904843.0, "step": 68 }, { "entropy": 8.476067781448364, "epoch": 0.030200240726556516, "grad_norm": 680.0, "learning_rate": 5.947521865889213e-06, "loss": 8.9633, "loss_lm": 8.194410800933838, "loss_seg": 0.7689334601163864, "mean_token_accuracy": 0.026128015480935574, "num_tokens": 29329920.0, "step": 69 }, { "entropy": 8.457318782806396, "epoch": 0.03063792537476748, "grad_norm": 648.0, "learning_rate": 6.034985422740525e-06, "loss": 8.8637, "loss_lm": 8.089261054992676, "loss_seg": 0.7744680792093277, "mean_token_accuracy": 0.026578488294035196, "num_tokens": 29755498.0, "step": 70 }, { "entropy": 8.445504903793335, "epoch": 0.031075610022978446, "grad_norm": 660.0, "learning_rate": 6.122448979591837e-06, "loss": 8.7909, "loss_lm": 8.023976564407349, "loss_seg": 0.7668972760438919, "mean_token_accuracy": 0.026761943474411964, "num_tokens": 30179983.0, "step": 71 }, { "entropy": 8.427841424942017, "epoch": 0.03151329467118941, "grad_norm": 668.0, "learning_rate": 6.209912536443148e-06, "loss": 8.7118, "loss_lm": 7.942078471183777, "loss_seg": 0.7696750462055206, "mean_token_accuracy": 0.026347942650318146, "num_tokens": 30605285.0, "step": 72 }, { "entropy": 8.41500449180603, "epoch": 0.03195097931940037, "grad_norm": 672.0, "learning_rate": 6.297376093294461e-06, "loss": 8.6611, "loss_lm": 7.898960590362549, "loss_seg": 0.7621441632509232, "mean_token_accuracy": 0.027080140076577663, "num_tokens": 31031184.0, "step": 73 }, { "entropy": 8.397810220718384, "epoch": 0.032388663967611336, "grad_norm": 680.0, "learning_rate": 6.3848396501457726e-06, "loss": 8.5519, "loss_lm": 7.7935779094696045, "loss_seg": 0.7582725435495377, "mean_token_accuracy": 0.027396641671657562, "num_tokens": 31455701.0, "step": 74 }, { "entropy": 8.389027833938599, "epoch": 0.0328263486158223, "grad_norm": 684.0, "learning_rate": 6.472303206997085e-06, "loss": 8.4984, "loss_lm": 7.731083989143372, "loss_seg": 0.7672709375619888, "mean_token_accuracy": 0.02717129373922944, "num_tokens": 31880242.0, "step": 75 }, { "entropy": 8.371859073638916, "epoch": 0.033264033264033266, "grad_norm": 688.0, "learning_rate": 6.559766763848397e-06, "loss": 8.4248, "loss_lm": 7.662654519081116, "loss_seg": 0.7621746212244034, "mean_token_accuracy": 0.02737838262692094, "num_tokens": 32304898.0, "step": 76 }, { "entropy": 8.3561532497406, "epoch": 0.03370171791224423, "grad_norm": 692.0, "learning_rate": 6.647230320699709e-06, "loss": 8.362, "loss_lm": 7.592698454856873, "loss_seg": 0.7693197876214981, "mean_token_accuracy": 0.027738881763070822, "num_tokens": 32729670.0, "step": 77 }, { "entropy": 8.344810485839844, "epoch": 0.034139402560455195, "grad_norm": 692.0, "learning_rate": 6.734693877551021e-06, "loss": 8.2839, "loss_lm": 7.518975257873535, "loss_seg": 0.7649504244327545, "mean_token_accuracy": 0.027500457130372524, "num_tokens": 33154737.0, "step": 78 }, { "entropy": 8.33609127998352, "epoch": 0.03457708720866615, "grad_norm": 696.0, "learning_rate": 6.822157434402332e-06, "loss": 8.2118, "loss_lm": 7.446925759315491, "loss_seg": 0.7648681849241257, "mean_token_accuracy": 0.027517647948116064, "num_tokens": 33579324.0, "step": 79 }, { "entropy": 8.320914506912231, "epoch": 0.03501477185687712, "grad_norm": 696.0, "learning_rate": 6.909620991253644e-06, "loss": 8.1627, "loss_lm": 7.383332014083862, "loss_seg": 0.7794023603200912, "mean_token_accuracy": 0.028133942745625973, "num_tokens": 34004429.0, "step": 80 }, { "entropy": 8.311233758926392, "epoch": 0.03545245650508808, "grad_norm": 700.0, "learning_rate": 6.9970845481049564e-06, "loss": 8.0586, "loss_lm": 7.311756253242493, "loss_seg": 0.7468635439872742, "mean_token_accuracy": 0.028137291315943003, "num_tokens": 34430227.0, "step": 81 }, { "entropy": 8.307392835617065, "epoch": 0.03589014115329905, "grad_norm": 704.0, "learning_rate": 7.0845481049562685e-06, "loss": 8.0055, "loss_lm": 7.249581933021545, "loss_seg": 0.7559671252965927, "mean_token_accuracy": 0.027938659768551588, "num_tokens": 34855396.0, "step": 82 }, { "entropy": 8.306962966918945, "epoch": 0.03632782580151001, "grad_norm": 704.0, "learning_rate": 7.17201166180758e-06, "loss": 7.915, "loss_lm": 7.173552513122559, "loss_seg": 0.7414920777082443, "mean_token_accuracy": 0.0275408155284822, "num_tokens": 35280245.0, "step": 83 }, { "entropy": 8.291202306747437, "epoch": 0.03676551044972098, "grad_norm": 704.0, "learning_rate": 7.259475218658893e-06, "loss": 7.8679, "loss_lm": 7.097670674324036, "loss_seg": 0.770256757736206, "mean_token_accuracy": 0.02834083652123809, "num_tokens": 35705811.0, "step": 84 }, { "entropy": 8.292526006698608, "epoch": 0.03720319509793194, "grad_norm": 704.0, "learning_rate": 7.346938775510204e-06, "loss": 7.7838, "loss_lm": 7.031568288803101, "loss_seg": 0.7522522211074829, "mean_token_accuracy": 0.028153620660305023, "num_tokens": 36130614.0, "step": 85 }, { "entropy": 8.292214393615723, "epoch": 0.037640879746142906, "grad_norm": 704.0, "learning_rate": 7.434402332361516e-06, "loss": 7.7359, "loss_lm": 6.957089304924011, "loss_seg": 0.7787846624851227, "mean_token_accuracy": 0.02827571891248226, "num_tokens": 36555038.0, "step": 86 }, { "entropy": 8.285071849822998, "epoch": 0.03807856439435387, "grad_norm": 708.0, "learning_rate": 7.521865889212829e-06, "loss": 7.6214, "loss_lm": 6.882170915603638, "loss_seg": 0.7392595410346985, "mean_token_accuracy": 0.029559827875345945, "num_tokens": 36979980.0, "step": 87 }, { "entropy": 8.293575763702393, "epoch": 0.03851624904256483, "grad_norm": 704.0, "learning_rate": 7.60932944606414e-06, "loss": 7.5689, "loss_lm": 6.799216985702515, "loss_seg": 0.7696768492460251, "mean_token_accuracy": 0.029225423466414213, "num_tokens": 37405293.0, "step": 88 }, { "entropy": 8.29737401008606, "epoch": 0.038953933690775794, "grad_norm": 704.0, "learning_rate": 7.696793002915452e-06, "loss": 7.5051, "loss_lm": 6.735197305679321, "loss_seg": 0.7699145674705505, "mean_token_accuracy": 0.029253679793328047, "num_tokens": 37830721.0, "step": 89 }, { "entropy": 8.297931671142578, "epoch": 0.03939161833898676, "grad_norm": 704.0, "learning_rate": 7.784256559766764e-06, "loss": 7.3903, "loss_lm": 6.648422479629517, "loss_seg": 0.7418956905603409, "mean_token_accuracy": 0.029206956271082163, "num_tokens": 38255278.0, "step": 90 }, { "entropy": 8.299255132675171, "epoch": 0.03982930298719772, "grad_norm": 704.0, "learning_rate": 7.871720116618076e-06, "loss": 7.3017, "loss_lm": 6.566730380058289, "loss_seg": 0.7349478453397751, "mean_token_accuracy": 0.029645076487213373, "num_tokens": 38680550.0, "step": 91 }, { "entropy": 8.306486129760742, "epoch": 0.04026698763540869, "grad_norm": 704.0, "learning_rate": 7.959183673469388e-06, "loss": 7.2411, "loss_lm": 6.495392560958862, "loss_seg": 0.7457234561443329, "mean_token_accuracy": 0.029195136856287718, "num_tokens": 39105014.0, "step": 92 }, { "entropy": 8.301843166351318, "epoch": 0.04070467228361965, "grad_norm": 708.0, "learning_rate": 8.0466472303207e-06, "loss": 7.1252, "loss_lm": 6.387027621269226, "loss_seg": 0.7381963580846786, "mean_token_accuracy": 0.0304398899897933, "num_tokens": 39530073.0, "step": 93 }, { "entropy": 8.304272890090942, "epoch": 0.04114235693183062, "grad_norm": 704.0, "learning_rate": 8.134110787172012e-06, "loss": 7.0593, "loss_lm": 6.316790461540222, "loss_seg": 0.7424654215574265, "mean_token_accuracy": 0.03249760903418064, "num_tokens": 39954734.0, "step": 94 }, { "entropy": 8.302245616912842, "epoch": 0.04158004158004158, "grad_norm": 704.0, "learning_rate": 8.221574344023323e-06, "loss": 6.9555, "loss_lm": 6.205297946929932, "loss_seg": 0.7502095252275467, "mean_token_accuracy": 0.03875854332000017, "num_tokens": 40379846.0, "step": 95 }, { "entropy": 8.307219505310059, "epoch": 0.04201772622825255, "grad_norm": 704.0, "learning_rate": 8.309037900874636e-06, "loss": 6.8496, "loss_lm": 6.120788812637329, "loss_seg": 0.728853166103363, "mean_token_accuracy": 0.05148617643862963, "num_tokens": 40804845.0, "step": 96 }, { "entropy": 8.312198877334595, "epoch": 0.042455410876463505, "grad_norm": 704.0, "learning_rate": 8.396501457725948e-06, "loss": 6.7447, "loss_lm": 6.02713942527771, "loss_seg": 0.717603400349617, "mean_token_accuracy": 0.07489134930074215, "num_tokens": 41230416.0, "step": 97 }, { "entropy": 8.310283660888672, "epoch": 0.04289309552467447, "grad_norm": 704.0, "learning_rate": 8.483965014577259e-06, "loss": 6.666, "loss_lm": 5.922601580619812, "loss_seg": 0.7434397786855698, "mean_token_accuracy": 0.11518201790750027, "num_tokens": 41656037.0, "step": 98 }, { "entropy": 8.310597658157349, "epoch": 0.043330780172885434, "grad_norm": 708.0, "learning_rate": 8.571428571428571e-06, "loss": 6.5688, "loss_lm": 5.808622121810913, "loss_seg": 0.7602132558822632, "mean_token_accuracy": 0.17919931188225746, "num_tokens": 42080972.0, "step": 99 }, { "entropy": 8.307950496673584, "epoch": 0.0437684648210964, "grad_norm": 700.0, "learning_rate": 8.658892128279885e-06, "loss": 6.4498, "loss_lm": 5.701167464256287, "loss_seg": 0.7486140578985214, "mean_token_accuracy": 0.26220742613077164, "num_tokens": 42506017.0, "step": 100 }, { "entropy": 8.308230876922607, "epoch": 0.044206149469307364, "grad_norm": 700.0, "learning_rate": 8.746355685131197e-06, "loss": 6.3223, "loss_lm": 5.589972138404846, "loss_seg": 0.7322929501533508, "mean_token_accuracy": 0.3493735045194626, "num_tokens": 42930700.0, "step": 101 }, { "entropy": 8.309202432632446, "epoch": 0.04464383411751833, "grad_norm": 700.0, "learning_rate": 8.833819241982507e-06, "loss": 6.2213, "loss_lm": 5.465463876724243, "loss_seg": 0.75588259100914, "mean_token_accuracy": 0.405391588807106, "num_tokens": 43355097.0, "step": 102 }, { "entropy": 8.313190937042236, "epoch": 0.04508151876572929, "grad_norm": 700.0, "learning_rate": 8.92128279883382e-06, "loss": 6.0661, "loss_lm": 5.3489837646484375, "loss_seg": 0.7171551287174225, "mean_token_accuracy": 0.5452592372894287, "num_tokens": 43779471.0, "step": 103 }, { "entropy": 8.311053037643433, "epoch": 0.04551920341394026, "grad_norm": 700.0, "learning_rate": 9.008746355685131e-06, "loss": 5.9435, "loss_lm": 5.2217185497283936, "loss_seg": 0.721745565533638, "mean_token_accuracy": 0.6293759942054749, "num_tokens": 44203646.0, "step": 104 }, { "entropy": 8.315284729003906, "epoch": 0.04595688806215122, "grad_norm": 700.0, "learning_rate": 9.096209912536444e-06, "loss": 5.8608, "loss_lm": 5.107338905334473, "loss_seg": 0.7534476071596146, "mean_token_accuracy": 0.6865324527025223, "num_tokens": 44628103.0, "step": 105 }, { "entropy": 8.30898642539978, "epoch": 0.04639457271036218, "grad_norm": 696.0, "learning_rate": 9.183673469387756e-06, "loss": 5.6913, "loss_lm": 4.9613178968429565, "loss_seg": 0.7300002574920654, "mean_token_accuracy": 0.6950210928916931, "num_tokens": 45053776.0, "step": 106 }, { "entropy": 8.308664560317993, "epoch": 0.046832257358573146, "grad_norm": 696.0, "learning_rate": 9.271137026239068e-06, "loss": 5.5594, "loss_lm": 4.8214908838272095, "loss_seg": 0.7378615885972977, "mean_token_accuracy": 0.7584503591060638, "num_tokens": 45479013.0, "step": 107 }, { "entropy": 8.303479671478271, "epoch": 0.04726994200678411, "grad_norm": 696.0, "learning_rate": 9.358600583090378e-06, "loss": 5.4503, "loss_lm": 4.702779650688171, "loss_seg": 0.7475201785564423, "mean_token_accuracy": 0.7868125736713409, "num_tokens": 45904247.0, "step": 108 }, { "entropy": 8.297297239303589, "epoch": 0.047707626654995075, "grad_norm": 692.0, "learning_rate": 9.446064139941692e-06, "loss": 5.2623, "loss_lm": 4.557962775230408, "loss_seg": 0.7043830752372742, "mean_token_accuracy": 0.8249813169240952, "num_tokens": 46328367.0, "step": 109 }, { "entropy": 8.292833089828491, "epoch": 0.04814531130320604, "grad_norm": 696.0, "learning_rate": 9.533527696793004e-06, "loss": 5.1472, "loss_lm": 4.398914813995361, "loss_seg": 0.7482606470584869, "mean_token_accuracy": 0.8782060295343399, "num_tokens": 46753962.0, "step": 110 }, { "entropy": 8.288044929504395, "epoch": 0.048582995951417005, "grad_norm": 692.0, "learning_rate": 9.620991253644314e-06, "loss": 4.9984, "loss_lm": 4.2658069133758545, "loss_seg": 0.7325713038444519, "mean_token_accuracy": 0.8967859745025635, "num_tokens": 47178609.0, "step": 111 }, { "entropy": 8.284547567367554, "epoch": 0.04902068059962797, "grad_norm": 692.0, "learning_rate": 9.708454810495627e-06, "loss": 4.828, "loss_lm": 4.114639401435852, "loss_seg": 0.7133878320455551, "mean_token_accuracy": 0.9225087761878967, "num_tokens": 47603460.0, "step": 112 }, { "entropy": 8.272825002670288, "epoch": 0.049458365247838934, "grad_norm": 688.0, "learning_rate": 9.795918367346939e-06, "loss": 4.6858, "loss_lm": 3.9581161737442017, "loss_seg": 0.72771355509758, "mean_token_accuracy": 0.9367158114910126, "num_tokens": 48028879.0, "step": 113 }, { "entropy": 8.262993097305298, "epoch": 0.0498960498960499, "grad_norm": 688.0, "learning_rate": 9.88338192419825e-06, "loss": 4.5314, "loss_lm": 3.813956379890442, "loss_seg": 0.7174829542636871, "mean_token_accuracy": 0.9456863850355148, "num_tokens": 48453869.0, "step": 114 }, { "entropy": 8.251443147659302, "epoch": 0.05033373454426086, "grad_norm": 688.0, "learning_rate": 9.970845481049563e-06, "loss": 4.4043, "loss_lm": 3.6631569266319275, "loss_seg": 0.7411067634820938, "mean_token_accuracy": 0.952153742313385, "num_tokens": 48878632.0, "step": 115 }, { "entropy": 8.227027416229248, "epoch": 0.05077141919247182, "grad_norm": 684.0, "learning_rate": 1.0058309037900875e-05, "loss": 4.2124, "loss_lm": 3.4950461387634277, "loss_seg": 0.71737901866436, "mean_token_accuracy": 0.9557310938835144, "num_tokens": 49303947.0, "step": 116 }, { "entropy": 8.206058740615845, "epoch": 0.051209103840682786, "grad_norm": 676.0, "learning_rate": 1.0145772594752185e-05, "loss": 4.0482, "loss_lm": 3.3453933000564575, "loss_seg": 0.7027963995933533, "mean_token_accuracy": 0.9585853666067123, "num_tokens": 49729133.0, "step": 117 }, { "entropy": 8.17612600326538, "epoch": 0.05164678848889375, "grad_norm": 672.0, "learning_rate": 1.02332361516035e-05, "loss": 3.9228, "loss_lm": 3.2019266486167908, "loss_seg": 0.7208628207445145, "mean_token_accuracy": 0.9596938788890839, "num_tokens": 50153770.0, "step": 118 }, { "entropy": 8.142533779144287, "epoch": 0.052084473137104716, "grad_norm": 668.0, "learning_rate": 1.0320699708454811e-05, "loss": 3.7778, "loss_lm": 3.0460426211357117, "loss_seg": 0.7317176312208176, "mean_token_accuracy": 0.9615858495235443, "num_tokens": 50579093.0, "step": 119 }, { "entropy": 8.097778558731079, "epoch": 0.05252215778531568, "grad_norm": 660.0, "learning_rate": 1.0408163265306123e-05, "loss": 3.579, "loss_lm": 2.8859935998916626, "loss_seg": 0.693017765879631, "mean_token_accuracy": 0.9627165198326111, "num_tokens": 51004271.0, "step": 120 }, { "entropy": 8.04852294921875, "epoch": 0.052959842433526645, "grad_norm": 652.0, "learning_rate": 1.0495626822157434e-05, "loss": 3.4482, "loss_lm": 2.734415113925934, "loss_seg": 0.7137662619352341, "mean_token_accuracy": 0.9648780524730682, "num_tokens": 51429634.0, "step": 121 }, { "entropy": 7.989368319511414, "epoch": 0.05339752708173761, "grad_norm": 644.0, "learning_rate": 1.0583090379008748e-05, "loss": 3.2987, "loss_lm": 2.5834702253341675, "loss_seg": 0.7152691781520844, "mean_token_accuracy": 0.966230645775795, "num_tokens": 51855255.0, "step": 122 }, { "entropy": 7.910717248916626, "epoch": 0.053835211729948575, "grad_norm": 636.0, "learning_rate": 1.067055393586006e-05, "loss": 3.1627, "loss_lm": 2.430658757686615, "loss_seg": 0.7320294231176376, "mean_token_accuracy": 0.9673756957054138, "num_tokens": 52280691.0, "step": 123 }, { "entropy": 7.814656138420105, "epoch": 0.05427289637815954, "grad_norm": 628.0, "learning_rate": 1.075801749271137e-05, "loss": 3.0089, "loss_lm": 2.275887668132782, "loss_seg": 0.7329722046852112, "mean_token_accuracy": 0.9683903455734253, "num_tokens": 52706350.0, "step": 124 }, { "entropy": 7.7042306661605835, "epoch": 0.0547105810263705, "grad_norm": 612.0, "learning_rate": 1.0845481049562682e-05, "loss": 2.8451, "loss_lm": 2.132622480392456, "loss_seg": 0.7124882340431213, "mean_token_accuracy": 0.9694004207849503, "num_tokens": 53131110.0, "step": 125 }, { "entropy": 7.562324047088623, "epoch": 0.05514826567458146, "grad_norm": 596.0, "learning_rate": 1.0932944606413994e-05, "loss": 2.7169, "loss_lm": 1.9873931109905243, "loss_seg": 0.7294735610485077, "mean_token_accuracy": 0.9700779467821121, "num_tokens": 53556265.0, "step": 126 }, { "entropy": 7.410821199417114, "epoch": 0.05558595032279243, "grad_norm": 584.0, "learning_rate": 1.1020408163265306e-05, "loss": 2.5863, "loss_lm": 1.8471620082855225, "loss_seg": 0.7391794174909592, "mean_token_accuracy": 0.9712062329053879, "num_tokens": 53981116.0, "step": 127 }, { "entropy": 7.238532662391663, "epoch": 0.05602363497100339, "grad_norm": 560.0, "learning_rate": 1.1107871720116619e-05, "loss": 2.4212, "loss_lm": 1.7108319699764252, "loss_seg": 0.7104092091321945, "mean_token_accuracy": 0.9716636836528778, "num_tokens": 54405971.0, "step": 128 }, { "entropy": 7.042970061302185, "epoch": 0.056461319619214356, "grad_norm": 548.0, "learning_rate": 1.119533527696793e-05, "loss": 2.2788, "loss_lm": 1.578994333744049, "loss_seg": 0.6998318731784821, "mean_token_accuracy": 0.9726728051900864, "num_tokens": 54830658.0, "step": 129 }, { "entropy": 6.810908675193787, "epoch": 0.05689900426742532, "grad_norm": 524.0, "learning_rate": 1.1282798833819241e-05, "loss": 2.1414, "loss_lm": 1.4493812024593353, "loss_seg": 0.6920540928840637, "mean_token_accuracy": 0.973005399107933, "num_tokens": 55256047.0, "step": 130 }, { "entropy": 6.547577977180481, "epoch": 0.057336688915636286, "grad_norm": 496.0, "learning_rate": 1.1370262390670555e-05, "loss": 2.0306, "loss_lm": 1.319274365901947, "loss_seg": 0.7112846821546555, "mean_token_accuracy": 0.9735391736030579, "num_tokens": 55680826.0, "step": 131 }, { "entropy": 6.254086852073669, "epoch": 0.05777437356384725, "grad_norm": 474.0, "learning_rate": 1.1457725947521867e-05, "loss": 1.9369, "loss_lm": 1.203002005815506, "loss_seg": 0.7338609844446182, "mean_token_accuracy": 0.9737337231636047, "num_tokens": 56105745.0, "step": 132 }, { "entropy": 5.951365828514099, "epoch": 0.058212058212058215, "grad_norm": 444.0, "learning_rate": 1.1545189504373177e-05, "loss": 1.7813, "loss_lm": 1.0970235764980316, "loss_seg": 0.6842426359653473, "mean_token_accuracy": 0.9742701500654221, "num_tokens": 56531830.0, "step": 133 }, { "entropy": 5.6269978284835815, "epoch": 0.05864974286026917, "grad_norm": 416.0, "learning_rate": 1.163265306122449e-05, "loss": 1.6903, "loss_lm": 0.9906508475542068, "loss_seg": 0.6996322125196457, "mean_token_accuracy": 0.9747433662414551, "num_tokens": 56956810.0, "step": 134 }, { "entropy": 5.275102376937866, "epoch": 0.05908742750848014, "grad_norm": 384.0, "learning_rate": 1.1720116618075802e-05, "loss": 1.5981, "loss_lm": 0.8951469659805298, "loss_seg": 0.7029356956481934, "mean_token_accuracy": 0.9754538089036942, "num_tokens": 57381945.0, "step": 135 }, { "entropy": 4.911625862121582, "epoch": 0.0595251121566911, "grad_norm": 352.0, "learning_rate": 1.1807580174927114e-05, "loss": 1.4865, "loss_lm": 0.8015758693218231, "loss_seg": 0.6848791688680649, "mean_token_accuracy": 0.9760065823793411, "num_tokens": 57807031.0, "step": 136 }, { "entropy": 4.550464987754822, "epoch": 0.05996279680490207, "grad_norm": 324.0, "learning_rate": 1.1895043731778426e-05, "loss": 1.4078, "loss_lm": 0.7152498066425323, "loss_seg": 0.6925084441900253, "mean_token_accuracy": 0.9766239821910858, "num_tokens": 58232724.0, "step": 137 }, { "entropy": 4.173449516296387, "epoch": 0.06040048145311303, "grad_norm": 298.0, "learning_rate": 1.1982507288629738e-05, "loss": 1.3183, "loss_lm": 0.6410547643899918, "loss_seg": 0.6772904843091965, "mean_token_accuracy": 0.9773548096418381, "num_tokens": 58657267.0, "step": 138 }, { "entropy": 3.8154897689819336, "epoch": 0.060838166101324, "grad_norm": 272.0, "learning_rate": 1.206997084548105e-05, "loss": 1.2384, "loss_lm": 0.5715755075216293, "loss_seg": 0.6668155789375305, "mean_token_accuracy": 0.9779858887195587, "num_tokens": 59081956.0, "step": 139 }, { "entropy": 3.482732594013214, "epoch": 0.06127585074953496, "grad_norm": 239.0, "learning_rate": 1.2157434402332362e-05, "loss": 1.1723, "loss_lm": 0.5076311528682709, "loss_seg": 0.6646937131881714, "mean_token_accuracy": 0.9789465367794037, "num_tokens": 59507068.0, "step": 140 }, { "entropy": 3.136595666408539, "epoch": 0.061713535397745926, "grad_norm": 217.0, "learning_rate": 1.2244897959183674e-05, "loss": 1.0987, "loss_lm": 0.4498852342367172, "loss_seg": 0.6487849056720734, "mean_token_accuracy": 0.9799670726060867, "num_tokens": 59931759.0, "step": 141 }, { "entropy": 2.8289828300476074, "epoch": 0.06215122004595689, "grad_norm": 444.0, "learning_rate": 1.2332361516034986e-05, "loss": 1.116, "loss_lm": 0.40316879004240036, "loss_seg": 0.7128484696149826, "mean_token_accuracy": 0.9812074601650238, "num_tokens": 60356401.0, "step": 142 }, { "entropy": 2.5970622301101685, "epoch": 0.06258890469416785, "grad_norm": 384.0, "learning_rate": 1.2419825072886297e-05, "loss": 1.0542, "loss_lm": 0.36266353726387024, "loss_seg": 0.6915423721075058, "mean_token_accuracy": 0.9824575185775757, "num_tokens": 60781193.0, "step": 143 }, { "entropy": 2.3382530212402344, "epoch": 0.06302658934237881, "grad_norm": 254.0, "learning_rate": 1.250728862973761e-05, "loss": 0.9671, "loss_lm": 0.32643603533506393, "loss_seg": 0.6406343281269073, "mean_token_accuracy": 0.9840810894966125, "num_tokens": 61205605.0, "step": 144 }, { "entropy": 2.3924869894981384, "epoch": 0.06346427399058978, "grad_norm": 740.0, "learning_rate": 1.2594752186588923e-05, "loss": 1.2143, "loss_lm": 0.3295247256755829, "loss_seg": 0.8847718089818954, "mean_token_accuracy": 0.9848604202270508, "num_tokens": 61630165.0, "step": 145 }, { "entropy": 2.1263375878334045, "epoch": 0.06390195863880074, "grad_norm": 536.0, "learning_rate": 1.2682215743440233e-05, "loss": 1.0711, "loss_lm": 0.29622088372707367, "loss_seg": 0.7748997956514359, "mean_token_accuracy": 0.9853166490793228, "num_tokens": 62055785.0, "step": 146 }, { "entropy": 1.9780555963516235, "epoch": 0.06433964328701171, "grad_norm": 184.0, "learning_rate": 1.2769679300291545e-05, "loss": 0.8731, "loss_lm": 0.2796431854367256, "loss_seg": 0.5934604555368423, "mean_token_accuracy": 0.985324427485466, "num_tokens": 62480955.0, "step": 147 }, { "entropy": 1.6652874946594238, "epoch": 0.06477732793522267, "grad_norm": 168.0, "learning_rate": 1.2857142857142857e-05, "loss": 0.8799, "loss_lm": 0.23832495138049126, "loss_seg": 0.6416177898645401, "mean_token_accuracy": 0.9858551323413849, "num_tokens": 62906292.0, "step": 148 }, { "entropy": 1.5599559247493744, "epoch": 0.06521501258343364, "grad_norm": 115.5, "learning_rate": 1.294460641399417e-05, "loss": 0.7736, "loss_lm": 0.22116832062602043, "loss_seg": 0.5524273216724396, "mean_token_accuracy": 0.9861900210380554, "num_tokens": 63331211.0, "step": 149 }, { "entropy": 1.428310364484787, "epoch": 0.0656526972316446, "grad_norm": 164.0, "learning_rate": 1.3032069970845481e-05, "loss": 0.7779, "loss_lm": 0.2060309387743473, "loss_seg": 0.5718290507793427, "mean_token_accuracy": 0.986465647816658, "num_tokens": 63757033.0, "step": 150 }, { "entropy": 1.2878648042678833, "epoch": 0.06609038187985557, "grad_norm": 108.5, "learning_rate": 1.3119533527696794e-05, "loss": 0.692, "loss_lm": 0.18597076833248138, "loss_seg": 0.5060615763068199, "mean_token_accuracy": 0.9865847676992416, "num_tokens": 64181336.0, "step": 151 }, { "entropy": 1.144720435142517, "epoch": 0.06652806652806653, "grad_norm": 84.0, "learning_rate": 1.3206997084548104e-05, "loss": 0.6467, "loss_lm": 0.17108895257115364, "loss_seg": 0.47564713656902313, "mean_token_accuracy": 0.9865793734788895, "num_tokens": 64606592.0, "step": 152 }, { "entropy": 1.0189109444618225, "epoch": 0.0669657511762775, "grad_norm": 104.0, "learning_rate": 1.3294460641399418e-05, "loss": 0.6229, "loss_lm": 0.1555614359676838, "loss_seg": 0.4673702046275139, "mean_token_accuracy": 0.9866470694541931, "num_tokens": 65031393.0, "step": 153 }, { "entropy": 0.9539327472448349, "epoch": 0.06740343582448846, "grad_norm": 196.0, "learning_rate": 1.338192419825073e-05, "loss": 0.6347, "loss_lm": 0.1459937021136284, "loss_seg": 0.4887368232011795, "mean_token_accuracy": 0.9866181463003159, "num_tokens": 65456379.0, "step": 154 }, { "entropy": 0.8544774353504181, "epoch": 0.06784112047269943, "grad_norm": 62.5, "learning_rate": 1.3469387755102042e-05, "loss": 0.5607, "loss_lm": 0.1336657591164112, "loss_seg": 0.426995649933815, "mean_token_accuracy": 0.9881652742624283, "num_tokens": 65882035.0, "step": 155 }, { "entropy": 0.7554172277450562, "epoch": 0.06827880512091039, "grad_norm": 113.5, "learning_rate": 1.3556851311953352e-05, "loss": 0.5252, "loss_lm": 0.1207257378846407, "loss_seg": 0.4044424071907997, "mean_token_accuracy": 0.9878551363945007, "num_tokens": 66306949.0, "step": 156 }, { "entropy": 0.6849589943885803, "epoch": 0.06871648976912134, "grad_norm": 60.75, "learning_rate": 1.3644314868804664e-05, "loss": 0.4489, "loss_lm": 0.11410892754793167, "loss_seg": 0.3347456306219101, "mean_token_accuracy": 0.9888307303190231, "num_tokens": 66732516.0, "step": 157 }, { "entropy": 0.6280005276203156, "epoch": 0.0691541744173323, "grad_norm": 46.25, "learning_rate": 1.3731778425655978e-05, "loss": 0.5437, "loss_lm": 0.11163310892879963, "loss_seg": 0.4320901930332184, "mean_token_accuracy": 0.9890357106924057, "num_tokens": 67157365.0, "step": 158 }, { "entropy": 0.568494126200676, "epoch": 0.06959185906554327, "grad_norm": 78.0, "learning_rate": 1.3819241982507289e-05, "loss": 0.478, "loss_lm": 0.10330026224255562, "loss_seg": 0.37468598037958145, "mean_token_accuracy": 0.9892352819442749, "num_tokens": 67582558.0, "step": 159 }, { "entropy": 0.515695109963417, "epoch": 0.07002954371375424, "grad_norm": 86.0, "learning_rate": 1.39067055393586e-05, "loss": 0.5072, "loss_lm": 0.09881024435162544, "loss_seg": 0.4084376245737076, "mean_token_accuracy": 0.989204466342926, "num_tokens": 68007661.0, "step": 160 }, { "entropy": 0.47703181207180023, "epoch": 0.0704672283619652, "grad_norm": 35.5, "learning_rate": 1.3994169096209913e-05, "loss": 0.417, "loss_lm": 0.09316840022802353, "loss_seg": 0.3238392770290375, "mean_token_accuracy": 0.989563062787056, "num_tokens": 68433179.0, "step": 161 }, { "entropy": 0.43524453788995743, "epoch": 0.07090491301017617, "grad_norm": 59.25, "learning_rate": 1.4081632653061225e-05, "loss": 0.4058, "loss_lm": 0.08815460279583931, "loss_seg": 0.3176855966448784, "mean_token_accuracy": 0.9896282404661179, "num_tokens": 68858278.0, "step": 162 }, { "entropy": 0.39853380620479584, "epoch": 0.07134259765838713, "grad_norm": 62.25, "learning_rate": 1.4169096209912537e-05, "loss": 0.3743, "loss_lm": 0.08382446505129337, "loss_seg": 0.29045093804597855, "mean_token_accuracy": 0.9898317456245422, "num_tokens": 69283892.0, "step": 163 }, { "entropy": 0.3696797415614128, "epoch": 0.0717802823065981, "grad_norm": 29.0, "learning_rate": 1.425655976676385e-05, "loss": 0.3562, "loss_lm": 0.08055675961077213, "loss_seg": 0.2756299264729023, "mean_token_accuracy": 0.9899949878454208, "num_tokens": 69708764.0, "step": 164 }, { "entropy": 0.3417136147618294, "epoch": 0.07221796695480906, "grad_norm": 47.75, "learning_rate": 1.434402332361516e-05, "loss": 0.4027, "loss_lm": 0.0768588799983263, "loss_seg": 0.32580865174531937, "mean_token_accuracy": 0.9900969415903091, "num_tokens": 70134686.0, "step": 165 }, { "entropy": 0.3153391554951668, "epoch": 0.07265565160302002, "grad_norm": 65.0, "learning_rate": 1.4431486880466473e-05, "loss": 0.3407, "loss_lm": 0.0720259491354227, "loss_seg": 0.2686995752155781, "mean_token_accuracy": 0.9902385324239731, "num_tokens": 70560024.0, "step": 166 }, { "entropy": 0.29562944918870926, "epoch": 0.07309333625123099, "grad_norm": 30.125, "learning_rate": 1.4518950437317786e-05, "loss": 0.4161, "loss_lm": 0.0713824424892664, "loss_seg": 0.3447478339076042, "mean_token_accuracy": 0.9901034533977509, "num_tokens": 70985313.0, "step": 167 }, { "entropy": 0.27714597433805466, "epoch": 0.07353102089944195, "grad_norm": 70.0, "learning_rate": 1.4606413994169096e-05, "loss": 0.3313, "loss_lm": 0.07234008610248566, "loss_seg": 0.258948128670454, "mean_token_accuracy": 0.9899743795394897, "num_tokens": 71410414.0, "step": 168 }, { "entropy": 0.2607324719429016, "epoch": 0.07396870554765292, "grad_norm": 40.0, "learning_rate": 1.4693877551020408e-05, "loss": 0.2906, "loss_lm": 0.06676917057484388, "loss_seg": 0.22380203008651733, "mean_token_accuracy": 0.9901368021965027, "num_tokens": 71835598.0, "step": 169 }, { "entropy": 0.24656307697296143, "epoch": 0.07440639019586388, "grad_norm": 182.0, "learning_rate": 1.478134110787172e-05, "loss": 0.397, "loss_lm": 0.06407919339835644, "loss_seg": 0.33289501816034317, "mean_token_accuracy": 0.9900158643722534, "num_tokens": 72261857.0, "step": 170 }, { "entropy": 0.23485206812620163, "epoch": 0.07484407484407485, "grad_norm": 188.0, "learning_rate": 1.4868804664723032e-05, "loss": 0.4344, "loss_lm": 0.06557092256844044, "loss_seg": 0.3688669502735138, "mean_token_accuracy": 0.9901726096868515, "num_tokens": 72687372.0, "step": 171 }, { "entropy": 0.22597737237811089, "epoch": 0.07528175949228581, "grad_norm": 45.25, "learning_rate": 1.4956268221574344e-05, "loss": 0.2943, "loss_lm": 0.06315459124743938, "loss_seg": 0.23118892684578896, "mean_token_accuracy": 0.9902133196592331, "num_tokens": 73112924.0, "step": 172 }, { "entropy": 0.22529231011867523, "epoch": 0.07571944414049678, "grad_norm": 111.0, "learning_rate": 1.5043731778425658e-05, "loss": 0.3707, "loss_lm": 0.06600452773272991, "loss_seg": 0.3047172874212265, "mean_token_accuracy": 0.9904746413230896, "num_tokens": 73537755.0, "step": 173 }, { "entropy": 0.22217413783073425, "epoch": 0.07615712878870774, "grad_norm": 110.5, "learning_rate": 1.513119533527697e-05, "loss": 0.416, "loss_lm": 0.06323215179145336, "loss_seg": 0.3527603819966316, "mean_token_accuracy": 0.9902990013360977, "num_tokens": 73962005.0, "step": 174 }, { "entropy": 0.21910391747951508, "epoch": 0.0765948134369187, "grad_norm": 59.5, "learning_rate": 1.521865889212828e-05, "loss": 0.3535, "loss_lm": 0.06404194235801697, "loss_seg": 0.2894756868481636, "mean_token_accuracy": 0.990134060382843, "num_tokens": 74387884.0, "step": 175 }, { "entropy": 0.2180839329957962, "epoch": 0.07703249808512966, "grad_norm": 41.5, "learning_rate": 1.530612244897959e-05, "loss": 0.3053, "loss_lm": 0.059730712324380875, "loss_seg": 0.24560797587037086, "mean_token_accuracy": 0.9903085380792618, "num_tokens": 74812453.0, "step": 176 }, { "entropy": 0.218543890863657, "epoch": 0.07747018273334062, "grad_norm": 64.5, "learning_rate": 1.5393586005830903e-05, "loss": 0.3792, "loss_lm": 0.06152031943202019, "loss_seg": 0.31767602264881134, "mean_token_accuracy": 0.9903183877468109, "num_tokens": 75237861.0, "step": 177 }, { "entropy": 0.21304816380143166, "epoch": 0.07790786738155159, "grad_norm": 42.25, "learning_rate": 1.5481049562682215e-05, "loss": 0.3511, "loss_lm": 0.059014635160565376, "loss_seg": 0.2921070456504822, "mean_token_accuracy": 0.9906405657529831, "num_tokens": 75663169.0, "step": 178 }, { "entropy": 0.210719782859087, "epoch": 0.07834555202976255, "grad_norm": 16.875, "learning_rate": 1.5568513119533527e-05, "loss": 0.3997, "loss_lm": 0.056192923337221146, "loss_seg": 0.3435175493359566, "mean_token_accuracy": 0.9904303401708603, "num_tokens": 76088309.0, "step": 179 }, { "entropy": 0.2065960206091404, "epoch": 0.07878323667797352, "grad_norm": 53.5, "learning_rate": 1.565597667638484e-05, "loss": 0.3515, "loss_lm": 0.058664034120738506, "loss_seg": 0.292827807366848, "mean_token_accuracy": 0.9902875125408173, "num_tokens": 76513281.0, "step": 180 }, { "entropy": 0.19940197095274925, "epoch": 0.07922092132618448, "grad_norm": 28.375, "learning_rate": 1.574344023323615e-05, "loss": 0.2927, "loss_lm": 0.05283829476684332, "loss_seg": 0.23981951549649239, "mean_token_accuracy": 0.9906515777111053, "num_tokens": 76937682.0, "step": 181 }, { "entropy": 0.19025618955492973, "epoch": 0.07965860597439545, "grad_norm": 58.75, "learning_rate": 1.5830903790087464e-05, "loss": 0.3044, "loss_lm": 0.050748951733112335, "loss_seg": 0.253634724766016, "mean_token_accuracy": 0.9913890808820724, "num_tokens": 77363691.0, "step": 182 }, { "entropy": 0.17979483306407928, "epoch": 0.08009629062260641, "grad_norm": 54.5, "learning_rate": 1.5918367346938776e-05, "loss": 0.2848, "loss_lm": 0.04735856130719185, "loss_seg": 0.23745909705758095, "mean_token_accuracy": 0.9915907382965088, "num_tokens": 77788231.0, "step": 183 }, { "entropy": 0.1663747802376747, "epoch": 0.08053397527081738, "grad_norm": 24.125, "learning_rate": 1.6005830903790088e-05, "loss": 0.2718, "loss_lm": 0.0496495608240366, "loss_seg": 0.2221066653728485, "mean_token_accuracy": 0.9915106296539307, "num_tokens": 78213124.0, "step": 184 }, { "entropy": 0.15975653752684593, "epoch": 0.08097165991902834, "grad_norm": 32.75, "learning_rate": 1.60932944606414e-05, "loss": 0.2682, "loss_lm": 0.0469332579523325, "loss_seg": 0.22124549746513367, "mean_token_accuracy": 0.9916643053293228, "num_tokens": 78638177.0, "step": 185 }, { "entropy": 0.15547459200024605, "epoch": 0.0814093445672393, "grad_norm": 42.5, "learning_rate": 1.6180758017492712e-05, "loss": 0.2798, "loss_lm": 0.048256141133606434, "loss_seg": 0.23152583464980125, "mean_token_accuracy": 0.9917011708021164, "num_tokens": 79063441.0, "step": 186 }, { "entropy": 0.1535099819302559, "epoch": 0.08184702921545027, "grad_norm": 23.75, "learning_rate": 1.6268221574344024e-05, "loss": 0.2651, "loss_lm": 0.045751660130918026, "loss_seg": 0.2193780392408371, "mean_token_accuracy": 0.9916572421789169, "num_tokens": 79488981.0, "step": 187 }, { "entropy": 0.15195325762033463, "epoch": 0.08228471386366124, "grad_norm": 54.25, "learning_rate": 1.6355685131195336e-05, "loss": 0.2898, "loss_lm": 0.04485444165766239, "loss_seg": 0.24496716260910034, "mean_token_accuracy": 0.9919828921556473, "num_tokens": 79913842.0, "step": 188 }, { "entropy": 0.15139055252075195, "epoch": 0.0827223985118722, "grad_norm": 28.875, "learning_rate": 1.6443148688046645e-05, "loss": 0.2648, "loss_lm": 0.04380385112017393, "loss_seg": 0.22101019322872162, "mean_token_accuracy": 0.9916555881500244, "num_tokens": 80339776.0, "step": 189 }, { "entropy": 0.14991848915815353, "epoch": 0.08316008316008316, "grad_norm": 18.125, "learning_rate": 1.6530612244897957e-05, "loss": 0.2465, "loss_lm": 0.046414404176175594, "loss_seg": 0.20004694163799286, "mean_token_accuracy": 0.9919411689043045, "num_tokens": 80764289.0, "step": 190 }, { "entropy": 0.14701230823993683, "epoch": 0.08359776780829413, "grad_norm": 34.75, "learning_rate": 1.6618075801749273e-05, "loss": 0.25, "loss_lm": 0.04420184995979071, "loss_seg": 0.2058185264468193, "mean_token_accuracy": 0.9919120520353317, "num_tokens": 81189354.0, "step": 191 }, { "entropy": 0.14474572986364365, "epoch": 0.0840354524565051, "grad_norm": 27.625, "learning_rate": 1.6705539358600585e-05, "loss": 0.3091, "loss_lm": 0.046339272521436214, "loss_seg": 0.26277070865035057, "mean_token_accuracy": 0.9916874319314957, "num_tokens": 81614940.0, "step": 192 }, { "entropy": 0.13964226096868515, "epoch": 0.08447313710471606, "grad_norm": 31.0, "learning_rate": 1.6793002915451897e-05, "loss": 0.2724, "loss_lm": 0.04501193203032017, "loss_seg": 0.22739733383059502, "mean_token_accuracy": 0.9914847314357758, "num_tokens": 82040201.0, "step": 193 }, { "entropy": 0.13515375927090645, "epoch": 0.08491082175292701, "grad_norm": 29.875, "learning_rate": 1.688046647230321e-05, "loss": 0.2953, "loss_lm": 0.04514308925718069, "loss_seg": 0.2501918748021126, "mean_token_accuracy": 0.9918311834335327, "num_tokens": 82464874.0, "step": 194 }, { "entropy": 0.13183031603693962, "epoch": 0.08534850640113797, "grad_norm": 30.625, "learning_rate": 1.6967930029154518e-05, "loss": 0.2935, "loss_lm": 0.04446984361857176, "loss_seg": 0.2490381971001625, "mean_token_accuracy": 0.9916888326406479, "num_tokens": 82890093.0, "step": 195 }, { "entropy": 0.1267959624528885, "epoch": 0.08578619104934894, "grad_norm": 28.75, "learning_rate": 1.705539358600583e-05, "loss": 0.3235, "loss_lm": 0.04120733682066202, "loss_seg": 0.2823127396404743, "mean_token_accuracy": 0.9918421357870102, "num_tokens": 83314852.0, "step": 196 }, { "entropy": 0.12396634370088577, "epoch": 0.0862238756975599, "grad_norm": 83.0, "learning_rate": 1.7142857142857142e-05, "loss": 0.275, "loss_lm": 0.04191784467548132, "loss_seg": 0.2330823428928852, "mean_token_accuracy": 0.9918895065784454, "num_tokens": 83739871.0, "step": 197 }, { "entropy": 0.12253537401556969, "epoch": 0.08666156034577087, "grad_norm": 33.0, "learning_rate": 1.7230320699708454e-05, "loss": 0.3375, "loss_lm": 0.04308417998254299, "loss_seg": 0.2943778522312641, "mean_token_accuracy": 0.9916523396968842, "num_tokens": 84164929.0, "step": 198 }, { "entropy": 0.12024684250354767, "epoch": 0.08709924499398183, "grad_norm": 85.5, "learning_rate": 1.731778425655977e-05, "loss": 0.2673, "loss_lm": 0.04198971204459667, "loss_seg": 0.2253146953880787, "mean_token_accuracy": 0.9919297844171524, "num_tokens": 84590543.0, "step": 199 }, { "entropy": 0.11992721632122993, "epoch": 0.0875369296421928, "grad_norm": 20.5, "learning_rate": 1.740524781341108e-05, "loss": 0.2532, "loss_lm": 0.03889067191630602, "loss_seg": 0.21434209868311882, "mean_token_accuracy": 0.9917973279953003, "num_tokens": 85015905.0, "step": 200 }, { "entropy": 0.11864635348320007, "epoch": 0.08797461429040376, "grad_norm": 22.125, "learning_rate": 1.7492711370262394e-05, "loss": 0.249, "loss_lm": 0.043304297141730785, "loss_seg": 0.20565522462129593, "mean_token_accuracy": 0.9918980151414871, "num_tokens": 85440994.0, "step": 201 }, { "entropy": 0.11740806885063648, "epoch": 0.08841229893861473, "grad_norm": 33.75, "learning_rate": 1.7580174927113702e-05, "loss": 0.2637, "loss_lm": 0.040762162767350674, "loss_seg": 0.22290831431746483, "mean_token_accuracy": 0.9917878806591034, "num_tokens": 85865761.0, "step": 202 }, { "entropy": 0.11425767838954926, "epoch": 0.08884998358682569, "grad_norm": 18.625, "learning_rate": 1.7667638483965014e-05, "loss": 0.301, "loss_lm": 0.040434930473566055, "loss_seg": 0.2605500966310501, "mean_token_accuracy": 0.9920035600662231, "num_tokens": 86290692.0, "step": 203 }, { "entropy": 0.11189706064760685, "epoch": 0.08928766823503666, "grad_norm": 20.5, "learning_rate": 1.7755102040816327e-05, "loss": 0.2637, "loss_lm": 0.0402750913053751, "loss_seg": 0.2234438695013523, "mean_token_accuracy": 0.9918722957372665, "num_tokens": 86716764.0, "step": 204 }, { "entropy": 0.10876678116619587, "epoch": 0.08972535288324762, "grad_norm": 22.75, "learning_rate": 1.784256559766764e-05, "loss": 0.2928, "loss_lm": 0.040833606384694576, "loss_seg": 0.25200600922107697, "mean_token_accuracy": 0.9920447766780853, "num_tokens": 87141657.0, "step": 205 }, { "entropy": 0.10602356307208538, "epoch": 0.09016303753145859, "grad_norm": 22.75, "learning_rate": 1.793002915451895e-05, "loss": 0.2353, "loss_lm": 0.0373578816652298, "loss_seg": 0.19798034243285656, "mean_token_accuracy": 0.9920905381441116, "num_tokens": 87566603.0, "step": 206 }, { "entropy": 0.1047910638153553, "epoch": 0.09060072217966955, "grad_norm": 25.875, "learning_rate": 1.8017492711370263e-05, "loss": 0.3316, "loss_lm": 0.03885718621313572, "loss_seg": 0.2926966920495033, "mean_token_accuracy": 0.9919590651988983, "num_tokens": 87991793.0, "step": 207 }, { "entropy": 0.10483965277671814, "epoch": 0.09103840682788052, "grad_norm": 25.875, "learning_rate": 1.8104956268221575e-05, "loss": 0.3105, "loss_lm": 0.037414791993796825, "loss_seg": 0.2730870805680752, "mean_token_accuracy": 0.991838738322258, "num_tokens": 88416995.0, "step": 208 }, { "entropy": 0.10331842675805092, "epoch": 0.09147609147609148, "grad_norm": 16.75, "learning_rate": 1.8192419825072887e-05, "loss": 0.3107, "loss_lm": 0.03982378263026476, "loss_seg": 0.2708519659936428, "mean_token_accuracy": 0.9920500069856644, "num_tokens": 88841901.0, "step": 209 }, { "entropy": 0.10273915342986584, "epoch": 0.09191377612430245, "grad_norm": 13.4375, "learning_rate": 1.82798833819242e-05, "loss": 0.2837, "loss_lm": 0.03604744002223015, "loss_seg": 0.2476348839700222, "mean_token_accuracy": 0.9919776618480682, "num_tokens": 89265950.0, "step": 210 }, { "entropy": 0.1014280878007412, "epoch": 0.09235146077251341, "grad_norm": 17.0, "learning_rate": 1.836734693877551e-05, "loss": 0.2473, "loss_lm": 0.03741787001490593, "loss_seg": 0.20983486622571945, "mean_token_accuracy": 0.9921051859855652, "num_tokens": 89691306.0, "step": 211 }, { "entropy": 0.1003858745098114, "epoch": 0.09278914542072436, "grad_norm": 15.5, "learning_rate": 1.8454810495626823e-05, "loss": 0.2511, "loss_lm": 0.039532470516860485, "loss_seg": 0.21160253882408142, "mean_token_accuracy": 0.9919550865888596, "num_tokens": 90116654.0, "step": 212 }, { "entropy": 0.0984532069414854, "epoch": 0.09322683006893533, "grad_norm": 21.25, "learning_rate": 1.8542274052478135e-05, "loss": 0.2284, "loss_lm": 0.03621168667450547, "loss_seg": 0.19215995073318481, "mean_token_accuracy": 0.9920945018529892, "num_tokens": 90542069.0, "step": 213 }, { "entropy": 0.09747209399938583, "epoch": 0.09366451471714629, "grad_norm": 15.8125, "learning_rate": 1.8629737609329444e-05, "loss": 0.247, "loss_lm": 0.03820406273007393, "loss_seg": 0.20882481709122658, "mean_token_accuracy": 0.9919828921556473, "num_tokens": 90966783.0, "step": 214 }, { "entropy": 0.0958875510841608, "epoch": 0.09410219936535726, "grad_norm": 24.125, "learning_rate": 1.8717201166180756e-05, "loss": 0.2396, "loss_lm": 0.03578350879251957, "loss_seg": 0.20385928452014923, "mean_token_accuracy": 0.9919743835926056, "num_tokens": 91391439.0, "step": 215 }, { "entropy": 0.09559732116758823, "epoch": 0.09453988401356822, "grad_norm": 23.375, "learning_rate": 1.880466472303207e-05, "loss": 0.2856, "loss_lm": 0.037378872744739056, "loss_seg": 0.2482263706624508, "mean_token_accuracy": 0.9920036643743515, "num_tokens": 91816740.0, "step": 216 }, { "entropy": 0.0937117226421833, "epoch": 0.09497756866177919, "grad_norm": 13.875, "learning_rate": 1.8892128279883384e-05, "loss": 0.3478, "loss_lm": 0.03573391307145357, "loss_seg": 0.3121104724705219, "mean_token_accuracy": 0.9921299517154694, "num_tokens": 92241883.0, "step": 217 }, { "entropy": 0.09140705689787865, "epoch": 0.09541525330999015, "grad_norm": 21.0, "learning_rate": 1.8979591836734696e-05, "loss": 0.2945, "loss_lm": 0.03404439985752106, "loss_seg": 0.2604537792503834, "mean_token_accuracy": 0.9922395944595337, "num_tokens": 92666608.0, "step": 218 }, { "entropy": 0.09102724492549896, "epoch": 0.09585293795820111, "grad_norm": 12.25, "learning_rate": 1.9067055393586008e-05, "loss": 0.2289, "loss_lm": 0.03578389249742031, "loss_seg": 0.19309165328741074, "mean_token_accuracy": 0.9922223687171936, "num_tokens": 93091279.0, "step": 219 }, { "entropy": 0.08963610976934433, "epoch": 0.09629062260641208, "grad_norm": 19.0, "learning_rate": 1.915451895043732e-05, "loss": 0.2368, "loss_lm": 0.03536381991580129, "loss_seg": 0.2013958916068077, "mean_token_accuracy": 0.9920923709869385, "num_tokens": 93516181.0, "step": 220 }, { "entropy": 0.0881015807390213, "epoch": 0.09672830725462304, "grad_norm": 15.9375, "learning_rate": 1.924198250728863e-05, "loss": 0.2896, "loss_lm": 0.03703663684427738, "loss_seg": 0.2526008151471615, "mean_token_accuracy": 0.9920089840888977, "num_tokens": 93941160.0, "step": 221 }, { "entropy": 0.08754240721464157, "epoch": 0.09716599190283401, "grad_norm": 19.75, "learning_rate": 1.932944606413994e-05, "loss": 0.2666, "loss_lm": 0.03573284251615405, "loss_seg": 0.2309064157307148, "mean_token_accuracy": 0.9920995831489563, "num_tokens": 94366079.0, "step": 222 }, { "entropy": 0.08563804440200329, "epoch": 0.09760367655104497, "grad_norm": 16.625, "learning_rate": 1.9416909620991253e-05, "loss": 0.3601, "loss_lm": 0.03671932686120272, "loss_seg": 0.32333529740571976, "mean_token_accuracy": 0.9921185970306396, "num_tokens": 94790735.0, "step": 223 }, { "entropy": 0.08498679660260677, "epoch": 0.09804136119925594, "grad_norm": 29.625, "learning_rate": 1.9504373177842565e-05, "loss": 0.2437, "loss_lm": 0.032647205516695976, "loss_seg": 0.21104460954666138, "mean_token_accuracy": 0.9920363873243332, "num_tokens": 95215781.0, "step": 224 }, { "entropy": 0.08345746994018555, "epoch": 0.0984790458474669, "grad_norm": 21.25, "learning_rate": 1.9591836734693877e-05, "loss": 0.2798, "loss_lm": 0.03494369797408581, "loss_seg": 0.24481582641601562, "mean_token_accuracy": 0.9922073483467102, "num_tokens": 95641324.0, "step": 225 }, { "entropy": 0.08320427313446999, "epoch": 0.09891673049567787, "grad_norm": 15.25, "learning_rate": 1.9679300291545193e-05, "loss": 0.3031, "loss_lm": 0.03304323274642229, "loss_seg": 0.27001311257481575, "mean_token_accuracy": 0.9923304319381714, "num_tokens": 96066893.0, "step": 226 }, { "entropy": 0.08361087180674076, "epoch": 0.09935441514388883, "grad_norm": 16.875, "learning_rate": 1.97667638483965e-05, "loss": 0.234, "loss_lm": 0.034308457747101784, "loss_seg": 0.1997404880821705, "mean_token_accuracy": 0.9920763820409775, "num_tokens": 96492086.0, "step": 227 }, { "entropy": 0.08391817845404148, "epoch": 0.0997920997920998, "grad_norm": 17.625, "learning_rate": 1.9854227405247814e-05, "loss": 0.2528, "loss_lm": 0.034968080930411816, "loss_seg": 0.21781085059046745, "mean_token_accuracy": 0.9920811802148819, "num_tokens": 96917170.0, "step": 228 }, { "entropy": 0.08265319652855396, "epoch": 0.10022978444031076, "grad_norm": 8.6875, "learning_rate": 1.9941690962099126e-05, "loss": 0.2292, "loss_lm": 0.034824141301214695, "loss_seg": 0.19433972612023354, "mean_token_accuracy": 0.9920964390039444, "num_tokens": 97341884.0, "step": 229 }, { "entropy": 0.08003023639321327, "epoch": 0.10066746908852171, "grad_norm": 20.75, "learning_rate": 2.0029154518950438e-05, "loss": 0.2189, "loss_lm": 0.03090816969051957, "loss_seg": 0.18799526244401932, "mean_token_accuracy": 0.9924476742744446, "num_tokens": 97766249.0, "step": 230 }, { "entropy": 0.07979049533605576, "epoch": 0.10110515373673268, "grad_norm": 21.5, "learning_rate": 2.011661807580175e-05, "loss": 0.2431, "loss_lm": 0.03257273510098457, "loss_seg": 0.2105584852397442, "mean_token_accuracy": 0.9922353327274323, "num_tokens": 98190475.0, "step": 231 }, { "entropy": 0.07942483760416508, "epoch": 0.10154283838494364, "grad_norm": 32.0, "learning_rate": 2.0204081632653062e-05, "loss": 0.2213, "loss_lm": 0.031902752816677094, "loss_seg": 0.1893521137535572, "mean_token_accuracy": 0.9921277463436127, "num_tokens": 98615515.0, "step": 232 }, { "entropy": 0.07938946224749088, "epoch": 0.10198052303315461, "grad_norm": 14.5625, "learning_rate": 2.029154518950437e-05, "loss": 0.2254, "loss_lm": 0.03469203971326351, "loss_seg": 0.19075194746255875, "mean_token_accuracy": 0.9921390265226364, "num_tokens": 99040652.0, "step": 233 }, { "entropy": 0.07894675992429256, "epoch": 0.10241820768136557, "grad_norm": 16.25, "learning_rate": 2.0379008746355683e-05, "loss": 0.3118, "loss_lm": 0.03172463970258832, "loss_seg": 0.28005543909966946, "mean_token_accuracy": 0.9923226088285446, "num_tokens": 99465270.0, "step": 234 }, { "entropy": 0.07690232060849667, "epoch": 0.10285589232957654, "grad_norm": 18.0, "learning_rate": 2.0466472303207e-05, "loss": 0.2584, "loss_lm": 0.030934734735637903, "loss_seg": 0.22749562188982964, "mean_token_accuracy": 0.9926306009292603, "num_tokens": 99889833.0, "step": 235 }, { "entropy": 0.07533525861799717, "epoch": 0.1032935769777875, "grad_norm": 10.4375, "learning_rate": 2.055393586005831e-05, "loss": 0.2014, "loss_lm": 0.030209789518266916, "loss_seg": 0.17123540490865707, "mean_token_accuracy": 0.9925380796194077, "num_tokens": 100314628.0, "step": 236 }, { "entropy": 0.07542312890291214, "epoch": 0.10373126162599847, "grad_norm": 28.5, "learning_rate": 2.0641399416909623e-05, "loss": 0.3043, "loss_lm": 0.033016486559063196, "loss_seg": 0.27130404114723206, "mean_token_accuracy": 0.9922485053539276, "num_tokens": 100739306.0, "step": 237 }, { "entropy": 0.0738108828663826, "epoch": 0.10416894627420943, "grad_norm": 18.125, "learning_rate": 2.0728862973760935e-05, "loss": 0.247, "loss_lm": 0.03164258552715182, "loss_seg": 0.21538680233061314, "mean_token_accuracy": 0.992300882935524, "num_tokens": 101164287.0, "step": 238 }, { "entropy": 0.07288684137165546, "epoch": 0.1046066309224204, "grad_norm": 17.25, "learning_rate": 2.0816326530612247e-05, "loss": 0.2305, "loss_lm": 0.03165686735883355, "loss_seg": 0.1987948678433895, "mean_token_accuracy": 0.9921325147151947, "num_tokens": 101589327.0, "step": 239 }, { "entropy": 0.07148643210530281, "epoch": 0.10504431557063136, "grad_norm": 21.5, "learning_rate": 2.0903790087463556e-05, "loss": 0.3141, "loss_lm": 0.03313578385859728, "loss_seg": 0.28093111887574196, "mean_token_accuracy": 0.9922860115766525, "num_tokens": 102013876.0, "step": 240 }, { "entropy": 0.0709297377616167, "epoch": 0.10548200021884233, "grad_norm": 18.625, "learning_rate": 2.0991253644314868e-05, "loss": 0.1751, "loss_lm": 0.03249536920338869, "loss_seg": 0.1426517255604267, "mean_token_accuracy": 0.9921884685754776, "num_tokens": 102439540.0, "step": 241 }, { "entropy": 0.06995382159948349, "epoch": 0.10591968486705329, "grad_norm": 16.75, "learning_rate": 2.107871720116618e-05, "loss": 0.237, "loss_lm": 0.03058341797441244, "loss_seg": 0.20637906342744827, "mean_token_accuracy": 0.9927704930305481, "num_tokens": 102865149.0, "step": 242 }, { "entropy": 0.06922151520848274, "epoch": 0.10635736951526426, "grad_norm": 20.875, "learning_rate": 2.1166180758017495e-05, "loss": 0.206, "loss_lm": 0.028587642591446638, "loss_seg": 0.17736544832587242, "mean_token_accuracy": 0.9928676784038544, "num_tokens": 103290221.0, "step": 243 }, { "entropy": 0.0692446380853653, "epoch": 0.10679505416347522, "grad_norm": 23.5, "learning_rate": 2.1253644314868807e-05, "loss": 0.1762, "loss_lm": 0.028059078380465508, "loss_seg": 0.14811402559280396, "mean_token_accuracy": 0.9929028898477554, "num_tokens": 103715554.0, "step": 244 }, { "entropy": 0.06828509457409382, "epoch": 0.10723273881168618, "grad_norm": 19.0, "learning_rate": 2.134110787172012e-05, "loss": 0.2565, "loss_lm": 0.02973545715212822, "loss_seg": 0.22673281654715538, "mean_token_accuracy": 0.9929305166006088, "num_tokens": 104140871.0, "step": 245 }, { "entropy": 0.06715320982038975, "epoch": 0.10767042345989715, "grad_norm": 24.125, "learning_rate": 2.1428571428571428e-05, "loss": 0.2757, "loss_lm": 0.028774145524948835, "loss_seg": 0.24693643301725388, "mean_token_accuracy": 0.9931659698486328, "num_tokens": 104565140.0, "step": 246 }, { "entropy": 0.0669881422072649, "epoch": 0.10810810810810811, "grad_norm": 14.4375, "learning_rate": 2.151603498542274e-05, "loss": 0.2147, "loss_lm": 0.031208641827106476, "loss_seg": 0.18345021456480026, "mean_token_accuracy": 0.9931394457817078, "num_tokens": 104989254.0, "step": 247 }, { "entropy": 0.0665444415062666, "epoch": 0.10854579275631908, "grad_norm": 19.75, "learning_rate": 2.1603498542274052e-05, "loss": 0.2476, "loss_lm": 0.029655043967068195, "loss_seg": 0.21796030923724174, "mean_token_accuracy": 0.993149995803833, "num_tokens": 105414158.0, "step": 248 }, { "entropy": 0.06600625440478325, "epoch": 0.10898347740453003, "grad_norm": 19.25, "learning_rate": 2.1690962099125364e-05, "loss": 0.2186, "loss_lm": 0.02813108405098319, "loss_seg": 0.1904673408716917, "mean_token_accuracy": 0.9932126998901367, "num_tokens": 105838864.0, "step": 249 }, { "entropy": 0.06632706709206104, "epoch": 0.109421162052741, "grad_norm": 14.75, "learning_rate": 2.1778425655976677e-05, "loss": 0.2606, "loss_lm": 0.03077856544405222, "loss_seg": 0.22982018068432808, "mean_token_accuracy": 0.9932132363319397, "num_tokens": 106263297.0, "step": 250 }, { "entropy": 0.06624413095414639, "epoch": 0.10985884670095196, "grad_norm": 17.25, "learning_rate": 2.186588921282799e-05, "loss": 0.1855, "loss_lm": 0.02929573319852352, "loss_seg": 0.15617971308529377, "mean_token_accuracy": 0.9930561482906342, "num_tokens": 106687648.0, "step": 251 }, { "entropy": 0.06593830324709415, "epoch": 0.11029653134916292, "grad_norm": 19.0, "learning_rate": 2.19533527696793e-05, "loss": 0.2769, "loss_lm": 0.03018098184838891, "loss_seg": 0.24667621403932571, "mean_token_accuracy": 0.9930168688297272, "num_tokens": 107112915.0, "step": 252 }, { "entropy": 0.06557515449821949, "epoch": 0.11073421599737389, "grad_norm": 18.75, "learning_rate": 2.2040816326530613e-05, "loss": 0.1985, "loss_lm": 0.02621437842026353, "loss_seg": 0.17233439721167088, "mean_token_accuracy": 0.9931248426437378, "num_tokens": 107537874.0, "step": 253 }, { "entropy": 0.06510423310101032, "epoch": 0.11117190064558485, "grad_norm": 9.9375, "learning_rate": 2.2128279883381925e-05, "loss": 0.2054, "loss_lm": 0.028361598029732704, "loss_seg": 0.17705762013792992, "mean_token_accuracy": 0.9933099299669266, "num_tokens": 107962591.0, "step": 254 }, { "entropy": 0.06520529836416245, "epoch": 0.11160958529379582, "grad_norm": 28.5, "learning_rate": 2.2215743440233237e-05, "loss": 0.2488, "loss_lm": 0.02922169305384159, "loss_seg": 0.21953444927930832, "mean_token_accuracy": 0.993139773607254, "num_tokens": 108387593.0, "step": 255 }, { "entropy": 0.06529794447124004, "epoch": 0.11204726994200678, "grad_norm": 14.5, "learning_rate": 2.230320699708455e-05, "loss": 0.2421, "loss_lm": 0.030059504322707653, "loss_seg": 0.2120879665017128, "mean_token_accuracy": 0.9930940419435501, "num_tokens": 108812684.0, "step": 256 }, { "entropy": 0.06414715573191643, "epoch": 0.11248495459021775, "grad_norm": 21.25, "learning_rate": 2.239067055393586e-05, "loss": 0.2342, "loss_lm": 0.029021381866186857, "loss_seg": 0.20518416166305542, "mean_token_accuracy": 0.9930280148983002, "num_tokens": 109237467.0, "step": 257 }, { "entropy": 0.06336754001677036, "epoch": 0.11292263923842871, "grad_norm": 15.4375, "learning_rate": 2.2478134110787173e-05, "loss": 0.2399, "loss_lm": 0.0282561588101089, "loss_seg": 0.21164682507514954, "mean_token_accuracy": 0.9930319339036942, "num_tokens": 109661926.0, "step": 258 }, { "entropy": 0.06313533522188663, "epoch": 0.11336032388663968, "grad_norm": 16.5, "learning_rate": 2.2565597667638482e-05, "loss": 0.2531, "loss_lm": 0.02792822988703847, "loss_seg": 0.22518540918827057, "mean_token_accuracy": 0.9931195229291916, "num_tokens": 110087279.0, "step": 259 }, { "entropy": 0.061713638715445995, "epoch": 0.11379800853485064, "grad_norm": 13.375, "learning_rate": 2.2653061224489794e-05, "loss": 0.233, "loss_lm": 0.027802810538560152, "loss_seg": 0.2051798738539219, "mean_token_accuracy": 0.9930887967348099, "num_tokens": 110512324.0, "step": 260 }, { "entropy": 0.061366464011371136, "epoch": 0.1142356931830616, "grad_norm": 12.8125, "learning_rate": 2.274052478134111e-05, "loss": 0.2088, "loss_lm": 0.02795277675613761, "loss_seg": 0.18085990101099014, "mean_token_accuracy": 0.9930106848478317, "num_tokens": 110938207.0, "step": 261 }, { "entropy": 0.060538435354828835, "epoch": 0.11467337783127257, "grad_norm": 16.25, "learning_rate": 2.2827988338192422e-05, "loss": 0.1675, "loss_lm": 0.027457003016024828, "loss_seg": 0.14002366550266743, "mean_token_accuracy": 0.9931232631206512, "num_tokens": 111364363.0, "step": 262 }, { "entropy": 0.06059168092906475, "epoch": 0.11511106247948354, "grad_norm": 20.125, "learning_rate": 2.2915451895043734e-05, "loss": 0.2377, "loss_lm": 0.02716338401660323, "loss_seg": 0.21054985746741295, "mean_token_accuracy": 0.9931427240371704, "num_tokens": 111788781.0, "step": 263 }, { "entropy": 0.06021826155483723, "epoch": 0.1155487471276945, "grad_norm": 14.875, "learning_rate": 2.3002915451895046e-05, "loss": 0.2659, "loss_lm": 0.027575161308050156, "loss_seg": 0.23833465576171875, "mean_token_accuracy": 0.9931525439023972, "num_tokens": 112213562.0, "step": 264 }, { "entropy": 0.060548423789441586, "epoch": 0.11598643177590547, "grad_norm": 15.8125, "learning_rate": 2.3090379008746355e-05, "loss": 0.253, "loss_lm": 0.02549680508673191, "loss_seg": 0.2275395318865776, "mean_token_accuracy": 0.9933312088251114, "num_tokens": 112638604.0, "step": 265 }, { "entropy": 0.061125935055315495, "epoch": 0.11642411642411643, "grad_norm": 12.4375, "learning_rate": 2.3177842565597667e-05, "loss": 0.2411, "loss_lm": 0.02866930654272437, "loss_seg": 0.21241534873843193, "mean_token_accuracy": 0.9930099248886108, "num_tokens": 113064306.0, "step": 266 }, { "entropy": 0.059963878244161606, "epoch": 0.11686180107232738, "grad_norm": 24.75, "learning_rate": 2.326530612244898e-05, "loss": 0.265, "loss_lm": 0.029880273155868053, "loss_seg": 0.23507126048207283, "mean_token_accuracy": 0.9931396096944809, "num_tokens": 113489443.0, "step": 267 }, { "entropy": 0.059581161476671696, "epoch": 0.11729948572053835, "grad_norm": 16.75, "learning_rate": 2.335276967930029e-05, "loss": 0.1832, "loss_lm": 0.024227865040302277, "loss_seg": 0.15896269865334034, "mean_token_accuracy": 0.9932297617197037, "num_tokens": 113914481.0, "step": 268 }, { "entropy": 0.05779876001179218, "epoch": 0.11773717036874931, "grad_norm": 16.75, "learning_rate": 2.3440233236151603e-05, "loss": 0.2559, "loss_lm": 0.02570533286780119, "loss_seg": 0.23014814406633377, "mean_token_accuracy": 0.9932477325201035, "num_tokens": 114339146.0, "step": 269 }, { "entropy": 0.057946437038481236, "epoch": 0.11817485501696028, "grad_norm": 15.125, "learning_rate": 2.352769679300292e-05, "loss": 0.169, "loss_lm": 0.0290740760974586, "loss_seg": 0.13991431519389153, "mean_token_accuracy": 0.9932016879320145, "num_tokens": 114763904.0, "step": 270 }, { "entropy": 0.05723962839692831, "epoch": 0.11861253966517124, "grad_norm": 15.625, "learning_rate": 2.3615160349854227e-05, "loss": 0.274, "loss_lm": 0.027455431409180164, "loss_seg": 0.24658027291297913, "mean_token_accuracy": 0.9931681305170059, "num_tokens": 115188329.0, "step": 271 }, { "entropy": 0.05868969485163689, "epoch": 0.1190502243133822, "grad_norm": 8.6875, "learning_rate": 2.370262390670554e-05, "loss": 0.2055, "loss_lm": 0.026620298624038696, "loss_seg": 0.17891667038202286, "mean_token_accuracy": 0.9930000603199005, "num_tokens": 115614015.0, "step": 272 }, { "entropy": 0.05697060842067003, "epoch": 0.11948790896159317, "grad_norm": 19.5, "learning_rate": 2.379008746355685e-05, "loss": 0.2486, "loss_lm": 0.027286783326417208, "loss_seg": 0.22127146646380424, "mean_token_accuracy": 0.9935166090726852, "num_tokens": 116038714.0, "step": 273 }, { "entropy": 0.05806905683130026, "epoch": 0.11992559360980413, "grad_norm": 18.375, "learning_rate": 2.3877551020408164e-05, "loss": 0.2273, "loss_lm": 0.028516526333987713, "loss_seg": 0.19880947470664978, "mean_token_accuracy": 0.9930490106344223, "num_tokens": 116464066.0, "step": 274 }, { "entropy": 0.056496658362448215, "epoch": 0.1203632782580151, "grad_norm": 11.3125, "learning_rate": 2.3965014577259476e-05, "loss": 0.2221, "loss_lm": 0.025552250910550356, "loss_seg": 0.1965104453265667, "mean_token_accuracy": 0.993060827255249, "num_tokens": 116889148.0, "step": 275 }, { "entropy": 0.055356889963150024, "epoch": 0.12080096290622606, "grad_norm": 17.25, "learning_rate": 2.4052478134110788e-05, "loss": 0.2462, "loss_lm": 0.02886811876669526, "loss_seg": 0.21730443835258484, "mean_token_accuracy": 0.9930636584758759, "num_tokens": 117314536.0, "step": 276 }, { "entropy": 0.05561610031872988, "epoch": 0.12123864755443703, "grad_norm": 21.25, "learning_rate": 2.41399416909621e-05, "loss": 0.2115, "loss_lm": 0.026869728229939938, "loss_seg": 0.18463006615638733, "mean_token_accuracy": 0.9930455237627029, "num_tokens": 117739374.0, "step": 277 }, { "entropy": 0.05577153991907835, "epoch": 0.121676332202648, "grad_norm": 14.1875, "learning_rate": 2.422740524781341e-05, "loss": 0.2307, "loss_lm": 0.026060300413519144, "loss_seg": 0.20466874912381172, "mean_token_accuracy": 0.9930930137634277, "num_tokens": 118163677.0, "step": 278 }, { "entropy": 0.05513996351510286, "epoch": 0.12211401685085896, "grad_norm": 18.0, "learning_rate": 2.4314868804664724e-05, "loss": 0.2047, "loss_lm": 0.026221629232168198, "loss_seg": 0.17848586291074753, "mean_token_accuracy": 0.9931143820285797, "num_tokens": 118588693.0, "step": 279 }, { "entropy": 0.05563806090503931, "epoch": 0.12255170149906992, "grad_norm": 22.625, "learning_rate": 2.4402332361516036e-05, "loss": 0.2551, "loss_lm": 0.026972012594342232, "loss_seg": 0.2280983105301857, "mean_token_accuracy": 0.9930647164583206, "num_tokens": 119013711.0, "step": 280 }, { "entropy": 0.05474123451858759, "epoch": 0.12298938614728089, "grad_norm": 15.9375, "learning_rate": 2.448979591836735e-05, "loss": 0.2648, "loss_lm": 0.025705956388264894, "loss_seg": 0.2390800602734089, "mean_token_accuracy": 0.9933337718248367, "num_tokens": 119439084.0, "step": 281 }, { "entropy": 0.05515671893954277, "epoch": 0.12342707079549185, "grad_norm": 19.875, "learning_rate": 2.457725947521866e-05, "loss": 0.1807, "loss_lm": 0.023937262129038572, "loss_seg": 0.1567753292620182, "mean_token_accuracy": 0.9930072575807571, "num_tokens": 119863330.0, "step": 282 }, { "entropy": 0.056078145280480385, "epoch": 0.12386475544370282, "grad_norm": 29.5, "learning_rate": 2.4664723032069973e-05, "loss": 0.2709, "loss_lm": 0.02707962691783905, "loss_seg": 0.24386881664395332, "mean_token_accuracy": 0.993037149310112, "num_tokens": 120288107.0, "step": 283 }, { "entropy": 0.05572961550205946, "epoch": 0.12430244009191378, "grad_norm": 20.875, "learning_rate": 2.475218658892128e-05, "loss": 0.2005, "loss_lm": 0.026438170112669468, "loss_seg": 0.17405939102172852, "mean_token_accuracy": 0.9932370483875275, "num_tokens": 120713593.0, "step": 284 }, { "entropy": 0.05569719895720482, "epoch": 0.12474012474012475, "grad_norm": 14.6875, "learning_rate": 2.4839650145772593e-05, "loss": 0.2928, "loss_lm": 0.023852833081036806, "loss_seg": 0.26891399919986725, "mean_token_accuracy": 0.99318927526474, "num_tokens": 121138464.0, "step": 285 }, { "entropy": 0.05377847980707884, "epoch": 0.1251778093883357, "grad_norm": 43.75, "learning_rate": 2.4927113702623906e-05, "loss": 0.1954, "loss_lm": 0.022919403854757547, "loss_seg": 0.17249960452318192, "mean_token_accuracy": 0.9932906031608582, "num_tokens": 121563462.0, "step": 286 }, { "entropy": 0.05418393388390541, "epoch": 0.12561549403654668, "grad_norm": 25.125, "learning_rate": 2.501457725947522e-05, "loss": 0.2446, "loss_lm": 0.025446749292314053, "loss_seg": 0.21910933405160904, "mean_token_accuracy": 0.9931625127792358, "num_tokens": 121988869.0, "step": 287 }, { "entropy": 0.05288207810372114, "epoch": 0.12605317868475763, "grad_norm": 56.0, "learning_rate": 2.5102040816326533e-05, "loss": 0.2348, "loss_lm": 0.02277090959250927, "loss_seg": 0.21200548112392426, "mean_token_accuracy": 0.9931175708770752, "num_tokens": 122413655.0, "step": 288 }, { "entropy": 0.052134424448013306, "epoch": 0.1264908633329686, "grad_norm": 16.875, "learning_rate": 2.5189504373177845e-05, "loss": 0.3124, "loss_lm": 0.023179039824754, "loss_seg": 0.28918375447392464, "mean_token_accuracy": 0.9931408762931824, "num_tokens": 122838587.0, "step": 289 }, { "entropy": 0.05215343181043863, "epoch": 0.12692854798117956, "grad_norm": 20.25, "learning_rate": 2.5276967930029154e-05, "loss": 0.2206, "loss_lm": 0.023020788561552763, "loss_seg": 0.19754371792078018, "mean_token_accuracy": 0.9931735247373581, "num_tokens": 123263644.0, "step": 290 }, { "entropy": 0.05178571119904518, "epoch": 0.12736623262939054, "grad_norm": 12.25, "learning_rate": 2.5364431486880466e-05, "loss": 0.2353, "loss_lm": 0.023629783652722836, "loss_seg": 0.2116333693265915, "mean_token_accuracy": 0.993143230676651, "num_tokens": 123688416.0, "step": 291 }, { "entropy": 0.05192090384662151, "epoch": 0.1278039172776015, "grad_norm": 16.25, "learning_rate": 2.5451895043731778e-05, "loss": 0.2529, "loss_lm": 0.023766778875142336, "loss_seg": 0.22917064279317856, "mean_token_accuracy": 0.9929702430963516, "num_tokens": 124113688.0, "step": 292 }, { "entropy": 0.05074975825846195, "epoch": 0.12824160192581247, "grad_norm": 9.9375, "learning_rate": 2.553935860058309e-05, "loss": 0.1989, "loss_lm": 0.02322889817878604, "loss_seg": 0.1756412386894226, "mean_token_accuracy": 0.9929863214492798, "num_tokens": 124538803.0, "step": 293 }, { "entropy": 0.04959113150835037, "epoch": 0.12867928657402342, "grad_norm": 31.75, "learning_rate": 2.5626822157434402e-05, "loss": 0.231, "loss_lm": 0.024404861498624086, "loss_seg": 0.20658806711435318, "mean_token_accuracy": 0.9933350384235382, "num_tokens": 124964389.0, "step": 294 }, { "entropy": 0.049376978538930416, "epoch": 0.12911697122223437, "grad_norm": 26.5, "learning_rate": 2.5714285714285714e-05, "loss": 0.2695, "loss_lm": 0.02401717472821474, "loss_seg": 0.24550940468907356, "mean_token_accuracy": 0.9932097643613815, "num_tokens": 125388908.0, "step": 295 }, { "entropy": 0.049677045084536076, "epoch": 0.12955465587044535, "grad_norm": 20.375, "learning_rate": 2.580174927113703e-05, "loss": 0.2199, "loss_lm": 0.023168053943663836, "loss_seg": 0.19675498083233833, "mean_token_accuracy": 0.9930374473333359, "num_tokens": 125813568.0, "step": 296 }, { "entropy": 0.0486662145704031, "epoch": 0.1299923405186563, "grad_norm": 11.75, "learning_rate": 2.588921282798834e-05, "loss": 0.1723, "loss_lm": 0.02200358873233199, "loss_seg": 0.1503014862537384, "mean_token_accuracy": 0.9931830018758774, "num_tokens": 126238742.0, "step": 297 }, { "entropy": 0.04872346669435501, "epoch": 0.13043002516686727, "grad_norm": 19.125, "learning_rate": 2.597667638483965e-05, "loss": 0.2067, "loss_lm": 0.021337371319532394, "loss_seg": 0.18537840619683266, "mean_token_accuracy": 0.9930865317583084, "num_tokens": 126663511.0, "step": 298 }, { "entropy": 0.048514013178646564, "epoch": 0.13086770981507823, "grad_norm": 40.0, "learning_rate": 2.6064139941690963e-05, "loss": 0.1724, "loss_lm": 0.023408849723637104, "loss_seg": 0.14896398596465588, "mean_token_accuracy": 0.9932144582271576, "num_tokens": 127088329.0, "step": 299 }, { "entropy": 0.048614502884447575, "epoch": 0.1313053944632892, "grad_norm": 14.875, "learning_rate": 2.6151603498542275e-05, "loss": 0.218, "loss_lm": 0.025182303972542286, "loss_seg": 0.19279003329575062, "mean_token_accuracy": 0.9931870251893997, "num_tokens": 127513361.0, "step": 300 }, { "entropy": 0.04864739719778299, "epoch": 0.13174307911150016, "grad_norm": 13.1875, "learning_rate": 2.6239067055393587e-05, "loss": 0.2138, "loss_lm": 0.02469164924696088, "loss_seg": 0.18906966969370842, "mean_token_accuracy": 0.9931953251361847, "num_tokens": 127937853.0, "step": 301 }, { "entropy": 0.048169150948524475, "epoch": 0.13218076375971113, "grad_norm": 17.375, "learning_rate": 2.63265306122449e-05, "loss": 0.2022, "loss_lm": 0.024169767275452614, "loss_seg": 0.17801300808787346, "mean_token_accuracy": 0.9930455684661865, "num_tokens": 128362859.0, "step": 302 }, { "entropy": 0.048113240860402584, "epoch": 0.13261844840792208, "grad_norm": 12.4375, "learning_rate": 2.6413994169096208e-05, "loss": 0.1999, "loss_lm": 0.02228311961516738, "loss_seg": 0.17765470035374165, "mean_token_accuracy": 0.9931648522615433, "num_tokens": 128787660.0, "step": 303 }, { "entropy": 0.04807461239397526, "epoch": 0.13305613305613306, "grad_norm": 12.5, "learning_rate": 2.650145772594752e-05, "loss": 0.2505, "loss_lm": 0.025046683847904205, "loss_seg": 0.2254452407360077, "mean_token_accuracy": 0.9942185431718826, "num_tokens": 129212519.0, "step": 304 }, { "entropy": 0.04967674892395735, "epoch": 0.13349381770434401, "grad_norm": 12.5625, "learning_rate": 2.6588921282798835e-05, "loss": 0.2266, "loss_lm": 0.024188322480767965, "loss_seg": 0.20238617435097694, "mean_token_accuracy": 0.9939436316490173, "num_tokens": 129638084.0, "step": 305 }, { "entropy": 0.05117738712579012, "epoch": 0.133931502352555, "grad_norm": 12.4375, "learning_rate": 2.6676384839650148e-05, "loss": 0.2623, "loss_lm": 0.02386250300332904, "loss_seg": 0.23843703046441078, "mean_token_accuracy": 0.9941371530294418, "num_tokens": 130062836.0, "step": 306 }, { "entropy": 0.04981319420039654, "epoch": 0.13436918700076594, "grad_norm": 13.0, "learning_rate": 2.676384839650146e-05, "loss": 0.1714, "loss_lm": 0.02319442108273506, "loss_seg": 0.1481863148510456, "mean_token_accuracy": 0.9934583753347397, "num_tokens": 130487722.0, "step": 307 }, { "entropy": 0.049369634129107, "epoch": 0.13480687164897692, "grad_norm": 9.5, "learning_rate": 2.6851311953352772e-05, "loss": 0.1855, "loss_lm": 0.023370465263724327, "loss_seg": 0.16215988621115685, "mean_token_accuracy": 0.9944552034139633, "num_tokens": 130913192.0, "step": 308 }, { "entropy": 0.05096688121557236, "epoch": 0.13524455629718787, "grad_norm": 19.5, "learning_rate": 2.6938775510204084e-05, "loss": 0.2633, "loss_lm": 0.022943210322409868, "loss_seg": 0.24031034857034683, "mean_token_accuracy": 0.9941465109586716, "num_tokens": 131338293.0, "step": 309 }, { "entropy": 0.04985793400555849, "epoch": 0.13568224094539885, "grad_norm": 18.625, "learning_rate": 2.7026239067055393e-05, "loss": 0.2275, "loss_lm": 0.024659883696585894, "loss_seg": 0.20281467400491238, "mean_token_accuracy": 0.9940734654664993, "num_tokens": 131762704.0, "step": 310 }, { "entropy": 0.04806922655552626, "epoch": 0.1361199255936098, "grad_norm": 15.9375, "learning_rate": 2.7113702623906705e-05, "loss": 0.2143, "loss_lm": 0.022648361511528492, "loss_seg": 0.1916104517877102, "mean_token_accuracy": 0.9941062182188034, "num_tokens": 132188104.0, "step": 311 }, { "entropy": 0.04622402135282755, "epoch": 0.13655761024182078, "grad_norm": 22.75, "learning_rate": 2.7201166180758017e-05, "loss": 0.1899, "loss_lm": 0.020604467717930675, "loss_seg": 0.16925404220819473, "mean_token_accuracy": 0.9941329956054688, "num_tokens": 132613228.0, "step": 312 }, { "entropy": 0.04517350159585476, "epoch": 0.13699529489003173, "grad_norm": 17.25, "learning_rate": 2.728862973760933e-05, "loss": 0.2171, "loss_lm": 0.02255088835954666, "loss_seg": 0.19452951848506927, "mean_token_accuracy": 0.9940963536500931, "num_tokens": 133037416.0, "step": 313 }, { "entropy": 0.044926440343260765, "epoch": 0.13743297953824268, "grad_norm": 22.625, "learning_rate": 2.7376093294460644e-05, "loss": 0.1931, "loss_lm": 0.023923044092953205, "loss_seg": 0.16914300248026848, "mean_token_accuracy": 0.9940461218357086, "num_tokens": 133462396.0, "step": 314 }, { "entropy": 0.04389751981943846, "epoch": 0.13787066418645366, "grad_norm": 20.25, "learning_rate": 2.7463556851311957e-05, "loss": 0.2522, "loss_lm": 0.023672665003687143, "loss_seg": 0.22856402769684792, "mean_token_accuracy": 0.994094043970108, "num_tokens": 133887264.0, "step": 315 }, { "entropy": 0.04366180393844843, "epoch": 0.1383083488346646, "grad_norm": 9.0, "learning_rate": 2.7551020408163265e-05, "loss": 0.1993, "loss_lm": 0.023563111200928688, "loss_seg": 0.17571890726685524, "mean_token_accuracy": 0.994193509221077, "num_tokens": 134312161.0, "step": 316 }, { "entropy": 0.043676198460161686, "epoch": 0.1387460334828756, "grad_norm": 11.75, "learning_rate": 2.7638483965014577e-05, "loss": 0.1754, "loss_lm": 0.020213995594531298, "loss_seg": 0.1551507618278265, "mean_token_accuracy": 0.9938732534646988, "num_tokens": 134737361.0, "step": 317 }, { "entropy": 0.04350227024406195, "epoch": 0.13918371813108654, "grad_norm": 27.25, "learning_rate": 2.772594752186589e-05, "loss": 0.2283, "loss_lm": 0.023058931343257427, "loss_seg": 0.20519806444644928, "mean_token_accuracy": 0.9938952177762985, "num_tokens": 135162800.0, "step": 318 }, { "entropy": 0.043521094135940075, "epoch": 0.13962140277929752, "grad_norm": 16.375, "learning_rate": 2.78134110787172e-05, "loss": 0.2395, "loss_lm": 0.022365999408066273, "loss_seg": 0.21713267266750336, "mean_token_accuracy": 0.9938099086284637, "num_tokens": 135587731.0, "step": 319 }, { "entropy": 0.04336182773113251, "epoch": 0.14005908742750847, "grad_norm": 18.875, "learning_rate": 2.7900874635568514e-05, "loss": 0.232, "loss_lm": 0.023408603388816118, "loss_seg": 0.20860851556062698, "mean_token_accuracy": 0.994069442152977, "num_tokens": 136012023.0, "step": 320 }, { "entropy": 0.04394508898258209, "epoch": 0.14049677207571945, "grad_norm": 15.9375, "learning_rate": 2.7988338192419826e-05, "loss": 0.2272, "loss_lm": 0.02037266083061695, "loss_seg": 0.2067890502512455, "mean_token_accuracy": 0.9942297488451004, "num_tokens": 136436979.0, "step": 321 }, { "entropy": 0.04398069437593222, "epoch": 0.1409344567239304, "grad_norm": 11.6875, "learning_rate": 2.8075801749271134e-05, "loss": 0.1913, "loss_lm": 0.02114772330969572, "loss_seg": 0.17013567313551903, "mean_token_accuracy": 0.9946213662624359, "num_tokens": 136861638.0, "step": 322 }, { "entropy": 0.04574378114193678, "epoch": 0.14137214137214138, "grad_norm": 18.25, "learning_rate": 2.816326530612245e-05, "loss": 0.2059, "loss_lm": 0.020328256767243147, "loss_seg": 0.18558678403496742, "mean_token_accuracy": 0.9947027117013931, "num_tokens": 137287713.0, "step": 323 }, { "entropy": 0.04546555411070585, "epoch": 0.14180982602035233, "grad_norm": 16.75, "learning_rate": 2.8250728862973762e-05, "loss": 0.2712, "loss_lm": 0.021633601747453213, "loss_seg": 0.24959101155400276, "mean_token_accuracy": 0.9949882924556732, "num_tokens": 137712107.0, "step": 324 }, { "entropy": 0.04459964111447334, "epoch": 0.1422475106685633, "grad_norm": 46.25, "learning_rate": 2.8338192419825074e-05, "loss": 0.1553, "loss_lm": 0.021689968649297953, "loss_seg": 0.13358097523450851, "mean_token_accuracy": 0.9950276166200638, "num_tokens": 138137030.0, "step": 325 }, { "entropy": 0.044682495296001434, "epoch": 0.14268519531677426, "grad_norm": 11.1875, "learning_rate": 2.8425655976676386e-05, "loss": 0.183, "loss_lm": 0.019185300450772047, "loss_seg": 0.1638251319527626, "mean_token_accuracy": 0.9950543642044067, "num_tokens": 138562221.0, "step": 326 }, { "entropy": 0.04389809723943472, "epoch": 0.14312287996498524, "grad_norm": 13.375, "learning_rate": 2.85131195335277e-05, "loss": 0.2176, "loss_lm": 0.021649810951203108, "loss_seg": 0.19597994163632393, "mean_token_accuracy": 0.9950762093067169, "num_tokens": 138986460.0, "step": 327 }, { "entropy": 0.044493066146969795, "epoch": 0.1435605646131962, "grad_norm": 16.5, "learning_rate": 2.860058309037901e-05, "loss": 0.1963, "loss_lm": 0.022224010434001684, "loss_seg": 0.17411047965288162, "mean_token_accuracy": 0.9949248284101486, "num_tokens": 139411472.0, "step": 328 }, { "entropy": 0.0428394703194499, "epoch": 0.14399824926140717, "grad_norm": 15.75, "learning_rate": 2.868804664723032e-05, "loss": 0.1769, "loss_lm": 0.018569847103208303, "loss_seg": 0.15829071402549744, "mean_token_accuracy": 0.9951105117797852, "num_tokens": 139836230.0, "step": 329 }, { "entropy": 0.04285167716443539, "epoch": 0.14443593390961812, "grad_norm": 9.25, "learning_rate": 2.877551020408163e-05, "loss": 0.2015, "loss_lm": 0.02264624135568738, "loss_seg": 0.17883751541376114, "mean_token_accuracy": 0.9949727058410645, "num_tokens": 140260903.0, "step": 330 }, { "entropy": 0.042921824380755424, "epoch": 0.14487361855782907, "grad_norm": 17.75, "learning_rate": 2.8862973760932947e-05, "loss": 0.2611, "loss_lm": 0.022580127231776714, "loss_seg": 0.2385510317981243, "mean_token_accuracy": 0.9947835803031921, "num_tokens": 140685904.0, "step": 331 }, { "entropy": 0.04201953113079071, "epoch": 0.14531130320604005, "grad_norm": 20.125, "learning_rate": 2.895043731778426e-05, "loss": 0.1949, "loss_lm": 0.02098609320819378, "loss_seg": 0.173897759988904, "mean_token_accuracy": 0.9950056225061417, "num_tokens": 141110967.0, "step": 332 }, { "entropy": 0.040944574400782585, "epoch": 0.145748987854251, "grad_norm": 15.875, "learning_rate": 2.903790087463557e-05, "loss": 0.222, "loss_lm": 0.019740642746910453, "loss_seg": 0.20227056741714478, "mean_token_accuracy": 0.9950513392686844, "num_tokens": 141535519.0, "step": 333 }, { "entropy": 0.04033851157873869, "epoch": 0.14618667250246198, "grad_norm": 23.625, "learning_rate": 2.9125364431486883e-05, "loss": 0.1912, "loss_lm": 0.020991507917642593, "loss_seg": 0.17023188434541225, "mean_token_accuracy": 0.9949212670326233, "num_tokens": 141960421.0, "step": 334 }, { "entropy": 0.04058212507516146, "epoch": 0.14662435715067293, "grad_norm": 13.0625, "learning_rate": 2.9212827988338192e-05, "loss": 0.2085, "loss_lm": 0.0220723794773221, "loss_seg": 0.18644642271101475, "mean_token_accuracy": 0.9946610480546951, "num_tokens": 142385774.0, "step": 335 }, { "entropy": 0.04004905931651592, "epoch": 0.1470620417988839, "grad_norm": 12.25, "learning_rate": 2.9300291545189504e-05, "loss": 0.2002, "loss_lm": 0.019985946360975504, "loss_seg": 0.18021722696721554, "mean_token_accuracy": 0.9949312061071396, "num_tokens": 142810509.0, "step": 336 }, { "entropy": 0.039338755421340466, "epoch": 0.14749972644709486, "grad_norm": 44.75, "learning_rate": 2.9387755102040816e-05, "loss": 0.2158, "loss_lm": 0.020504184532910585, "loss_seg": 0.19533441215753555, "mean_token_accuracy": 0.9949102401733398, "num_tokens": 143235824.0, "step": 337 }, { "entropy": 0.038737401366233826, "epoch": 0.14793741109530584, "grad_norm": 16.625, "learning_rate": 2.9475218658892128e-05, "loss": 0.2044, "loss_lm": 0.018392108846455812, "loss_seg": 0.18604294210672379, "mean_token_accuracy": 0.995047003030777, "num_tokens": 143661209.0, "step": 338 }, { "entropy": 0.039768610149621964, "epoch": 0.1483750957435168, "grad_norm": 14.625, "learning_rate": 2.956268221574344e-05, "loss": 0.2493, "loss_lm": 0.022094372659921646, "loss_seg": 0.22722340002655983, "mean_token_accuracy": 0.9947269707918167, "num_tokens": 144086394.0, "step": 339 }, { "entropy": 0.03825071360915899, "epoch": 0.14881278039172777, "grad_norm": 13.6875, "learning_rate": 2.9650145772594756e-05, "loss": 0.201, "loss_lm": 0.018523574341088533, "loss_seg": 0.18248979188501835, "mean_token_accuracy": 0.9950284510850906, "num_tokens": 144512191.0, "step": 340 }, { "entropy": 0.03857971262186766, "epoch": 0.14925046503993872, "grad_norm": 11.0625, "learning_rate": 2.9737609329446064e-05, "loss": 0.2217, "loss_lm": 0.019151676911860704, "loss_seg": 0.20256520807743073, "mean_token_accuracy": 0.9949617683887482, "num_tokens": 144937475.0, "step": 341 }, { "entropy": 0.03858028631657362, "epoch": 0.1496881496881497, "grad_norm": 15.8125, "learning_rate": 2.9825072886297377e-05, "loss": 0.1818, "loss_lm": 0.018814354203641415, "loss_seg": 0.1629536636173725, "mean_token_accuracy": 0.9949442893266678, "num_tokens": 145362098.0, "step": 342 }, { "entropy": 0.03847341239452362, "epoch": 0.15012583433636065, "grad_norm": 22.0, "learning_rate": 2.991253644314869e-05, "loss": 0.1755, "loss_lm": 0.019187675788998604, "loss_seg": 0.1562769766896963, "mean_token_accuracy": 0.9949893057346344, "num_tokens": 145786757.0, "step": 343 }, { "entropy": 0.038956429809331894, "epoch": 0.15056351898457163, "grad_norm": 18.25, "learning_rate": 3e-05, "loss": 0.1422, "loss_lm": 0.016115883365273476, "loss_seg": 0.1261178757995367, "mean_token_accuracy": 0.9949989169836044, "num_tokens": 146211864.0, "step": 344 }, { "entropy": 0.03829129599034786, "epoch": 0.15100120363278258, "grad_norm": 19.5, "learning_rate": 2.9997292907417434e-05, "loss": 0.2248, "loss_lm": 0.017233122838661075, "loss_seg": 0.20759603939950466, "mean_token_accuracy": 0.9950052499771118, "num_tokens": 146637269.0, "step": 345 }, { "entropy": 0.03974462021142244, "epoch": 0.15143888828099356, "grad_norm": 19.625, "learning_rate": 2.9994585814834868e-05, "loss": 0.1564, "loss_lm": 0.022756863152608275, "loss_seg": 0.13367177732288837, "mean_token_accuracy": 0.9947921484708786, "num_tokens": 147062785.0, "step": 346 }, { "entropy": 0.03849285840988159, "epoch": 0.1518765729292045, "grad_norm": 11.3125, "learning_rate": 2.9991878722252302e-05, "loss": 0.1883, "loss_lm": 0.019976760260760784, "loss_seg": 0.16836295649409294, "mean_token_accuracy": 0.9950983673334122, "num_tokens": 147487521.0, "step": 347 }, { "entropy": 0.03859160467982292, "epoch": 0.15231425757741548, "grad_norm": 14.9375, "learning_rate": 2.9989171629669736e-05, "loss": 0.1824, "loss_lm": 0.017107127700001, "loss_seg": 0.16531717963516712, "mean_token_accuracy": 0.9950363039970398, "num_tokens": 147912596.0, "step": 348 }, { "entropy": 0.03915556147694588, "epoch": 0.15275194222562644, "grad_norm": 13.4375, "learning_rate": 2.998646453708717e-05, "loss": 0.2188, "loss_lm": 0.02302352525293827, "loss_seg": 0.19574545696377754, "mean_token_accuracy": 0.9950244575738907, "num_tokens": 148338061.0, "step": 349 }, { "entropy": 0.0387055529281497, "epoch": 0.1531896268738374, "grad_norm": 13.6875, "learning_rate": 2.9983757444504603e-05, "loss": 0.3148, "loss_lm": 0.02072414569556713, "loss_seg": 0.29410816356539726, "mean_token_accuracy": 0.9950522035360336, "num_tokens": 148763076.0, "step": 350 }, { "entropy": 0.038581084460020065, "epoch": 0.15362731152204837, "grad_norm": 10.6875, "learning_rate": 2.9981050351922037e-05, "loss": 0.2141, "loss_lm": 0.018716621212661266, "loss_seg": 0.19534173607826233, "mean_token_accuracy": 0.9950888901948929, "num_tokens": 149188209.0, "step": 351 }, { "entropy": 0.038098632358014584, "epoch": 0.15406499617025932, "grad_norm": 18.75, "learning_rate": 2.997834325933947e-05, "loss": 0.2484, "loss_lm": 0.02014907728880644, "loss_seg": 0.22820836305618286, "mean_token_accuracy": 0.9949967116117477, "num_tokens": 149613314.0, "step": 352 }, { "entropy": 0.03687785565853119, "epoch": 0.1545026808184703, "grad_norm": 14.6875, "learning_rate": 2.9975636166756904e-05, "loss": 0.2115, "loss_lm": 0.019870325922966003, "loss_seg": 0.1916111223399639, "mean_token_accuracy": 0.9949712306261063, "num_tokens": 150038056.0, "step": 353 }, { "entropy": 0.03582651726901531, "epoch": 0.15494036546668125, "grad_norm": 14.25, "learning_rate": 2.9972929074174338e-05, "loss": 0.1989, "loss_lm": 0.01932408008724451, "loss_seg": 0.17957549542188644, "mean_token_accuracy": 0.995058998465538, "num_tokens": 150462848.0, "step": 354 }, { "entropy": 0.03626282885670662, "epoch": 0.15537805011489222, "grad_norm": 15.1875, "learning_rate": 2.9970221981591772e-05, "loss": 0.2091, "loss_lm": 0.01797245256602764, "loss_seg": 0.1911343540996313, "mean_token_accuracy": 0.9949584603309631, "num_tokens": 150888326.0, "step": 355 }, { "entropy": 0.03562141768634319, "epoch": 0.15581573476310318, "grad_norm": 50.75, "learning_rate": 2.9967514889009202e-05, "loss": 0.2061, "loss_lm": 0.018144378904253244, "loss_seg": 0.18797331117093563, "mean_token_accuracy": 0.9949967712163925, "num_tokens": 151313433.0, "step": 356 }, { "entropy": 0.03547684848308563, "epoch": 0.15625341941131415, "grad_norm": 13.25, "learning_rate": 2.996480779642664e-05, "loss": 0.1874, "loss_lm": 0.020047636702656746, "loss_seg": 0.1673144418746233, "mean_token_accuracy": 0.9951154142618179, "num_tokens": 151739025.0, "step": 357 }, { "entropy": 0.0350248571485281, "epoch": 0.1566911040595251, "grad_norm": 23.625, "learning_rate": 2.9962100703844073e-05, "loss": 0.2379, "loss_lm": 0.019547687377780676, "loss_seg": 0.21833801828324795, "mean_token_accuracy": 0.995070144534111, "num_tokens": 152163923.0, "step": 358 }, { "entropy": 0.03552420996129513, "epoch": 0.15712878870773608, "grad_norm": 32.25, "learning_rate": 2.9959393611261507e-05, "loss": 0.1664, "loss_lm": 0.021024852991104126, "loss_seg": 0.14541378989815712, "mean_token_accuracy": 0.9949065446853638, "num_tokens": 152589758.0, "step": 359 }, { "entropy": 0.03562209848314524, "epoch": 0.15756647335594703, "grad_norm": 10.4375, "learning_rate": 2.995668651867894e-05, "loss": 0.1955, "loss_lm": 0.019449044950306416, "loss_seg": 0.17603744566440582, "mean_token_accuracy": 0.9950518161058426, "num_tokens": 153015348.0, "step": 360 }, { "entropy": 0.035013774409890175, "epoch": 0.158004158004158, "grad_norm": 12.8125, "learning_rate": 2.995397942609637e-05, "loss": 0.2115, "loss_lm": 0.01995088066905737, "loss_seg": 0.1915212869644165, "mean_token_accuracy": 0.9951538145542145, "num_tokens": 153440427.0, "step": 361 }, { "entropy": 0.03536074422299862, "epoch": 0.15844184265236896, "grad_norm": 19.375, "learning_rate": 2.9951272333513804e-05, "loss": 0.2404, "loss_lm": 0.020542269572615623, "loss_seg": 0.2199038565158844, "mean_token_accuracy": 0.9951126575469971, "num_tokens": 153865559.0, "step": 362 }, { "entropy": 0.03531762771308422, "epoch": 0.15887952730057994, "grad_norm": 12.1875, "learning_rate": 2.994856524093124e-05, "loss": 0.229, "loss_lm": 0.0176125664729625, "loss_seg": 0.21139601431787014, "mean_token_accuracy": 0.9951614439487457, "num_tokens": 154291065.0, "step": 363 }, { "entropy": 0.03587163891643286, "epoch": 0.1593172119487909, "grad_norm": 15.75, "learning_rate": 2.9945858148348675e-05, "loss": 0.2307, "loss_lm": 0.021308451425284147, "loss_seg": 0.2093786858022213, "mean_token_accuracy": 0.9949047118425369, "num_tokens": 154716533.0, "step": 364 }, { "entropy": 0.034739477559924126, "epoch": 0.15975489659700187, "grad_norm": 10.25, "learning_rate": 2.994315105576611e-05, "loss": 0.3298, "loss_lm": 0.01971240993589163, "loss_seg": 0.310092456638813, "mean_token_accuracy": 0.9950085878372192, "num_tokens": 155141448.0, "step": 365 }, { "entropy": 0.034044754691421986, "epoch": 0.16019258124521282, "grad_norm": 14.25, "learning_rate": 2.994044396318354e-05, "loss": 0.1844, "loss_lm": 0.018974442966282368, "loss_seg": 0.16542435809969902, "mean_token_accuracy": 0.9950310438871384, "num_tokens": 155566656.0, "step": 366 }, { "entropy": 0.03420005273073912, "epoch": 0.1606302658934238, "grad_norm": 16.5, "learning_rate": 2.9937736870600973e-05, "loss": 0.2075, "loss_lm": 0.017305567394942045, "loss_seg": 0.19019532203674316, "mean_token_accuracy": 0.9949180632829666, "num_tokens": 155991854.0, "step": 367 }, { "entropy": 0.033474452793598175, "epoch": 0.16106795054163475, "grad_norm": 16.875, "learning_rate": 2.993502977801841e-05, "loss": 0.1885, "loss_lm": 0.019552127923816442, "loss_seg": 0.16891567967832088, "mean_token_accuracy": 0.9950813502073288, "num_tokens": 156415915.0, "step": 368 }, { "entropy": 0.03389654215425253, "epoch": 0.1615056351898457, "grad_norm": 18.0, "learning_rate": 2.9932322685435844e-05, "loss": 0.1983, "loss_lm": 0.01937525742687285, "loss_seg": 0.17894300445914268, "mean_token_accuracy": 0.9951227754354477, "num_tokens": 156840714.0, "step": 369 }, { "entropy": 0.03423374891281128, "epoch": 0.16194331983805668, "grad_norm": 17.75, "learning_rate": 2.9929615592853278e-05, "loss": 0.2252, "loss_lm": 0.019769553560763597, "loss_seg": 0.20541483163833618, "mean_token_accuracy": 0.9949528574943542, "num_tokens": 157265494.0, "step": 370 }, { "entropy": 0.03447528276592493, "epoch": 0.16238100448626763, "grad_norm": 19.375, "learning_rate": 2.9926908500270708e-05, "loss": 0.2175, "loss_lm": 0.02024385891854763, "loss_seg": 0.19726458564400673, "mean_token_accuracy": 0.9951126873493195, "num_tokens": 157690031.0, "step": 371 }, { "entropy": 0.0335792675614357, "epoch": 0.1628186891344786, "grad_norm": 14.6875, "learning_rate": 2.9924201407688142e-05, "loss": 0.1956, "loss_lm": 0.02146648708730936, "loss_seg": 0.17410903424024582, "mean_token_accuracy": 0.9950847774744034, "num_tokens": 158114607.0, "step": 372 }, { "entropy": 0.035011948086321354, "epoch": 0.16325637378268956, "grad_norm": 10.375, "learning_rate": 2.992149431510558e-05, "loss": 0.2076, "loss_lm": 0.01958113955333829, "loss_seg": 0.18803782761096954, "mean_token_accuracy": 0.9949024766683578, "num_tokens": 158538734.0, "step": 373 }, { "entropy": 0.03301496524363756, "epoch": 0.16369405843090054, "grad_norm": 15.875, "learning_rate": 2.9918787222523013e-05, "loss": 0.1772, "loss_lm": 0.016721834894269705, "loss_seg": 0.1604675780981779, "mean_token_accuracy": 0.9951984286308289, "num_tokens": 158963318.0, "step": 374 }, { "entropy": 0.03271962143480778, "epoch": 0.1641317430791115, "grad_norm": 19.5, "learning_rate": 2.9916080129940446e-05, "loss": 0.2288, "loss_lm": 0.0169794459361583, "loss_seg": 0.21182715520262718, "mean_token_accuracy": 0.9951697587966919, "num_tokens": 159387470.0, "step": 375 }, { "entropy": 0.032932667061686516, "epoch": 0.16456942772732247, "grad_norm": 16.375, "learning_rate": 2.9913373037357877e-05, "loss": 0.2162, "loss_lm": 0.020612596068531275, "loss_seg": 0.19560179859399796, "mean_token_accuracy": 0.9950146228075027, "num_tokens": 159812476.0, "step": 376 }, { "entropy": 0.033005254343152046, "epoch": 0.16500711237553342, "grad_norm": 20.0, "learning_rate": 2.991066594477531e-05, "loss": 0.206, "loss_lm": 0.018103254260495305, "loss_seg": 0.18791992217302322, "mean_token_accuracy": 0.9950088262557983, "num_tokens": 160237205.0, "step": 377 }, { "entropy": 0.03255915828049183, "epoch": 0.1654447970237444, "grad_norm": 40.5, "learning_rate": 2.9907958852192747e-05, "loss": 0.2064, "loss_lm": 0.019841278437525034, "loss_seg": 0.18652864173054695, "mean_token_accuracy": 0.9950245767831802, "num_tokens": 160662073.0, "step": 378 }, { "entropy": 0.03284250106662512, "epoch": 0.16588248167195535, "grad_norm": 10.875, "learning_rate": 2.990525175961018e-05, "loss": 0.2033, "loss_lm": 0.021962393075227737, "loss_seg": 0.1813563071191311, "mean_token_accuracy": 0.9951088279485703, "num_tokens": 161087495.0, "step": 379 }, { "entropy": 0.03212851379066706, "epoch": 0.16632016632016633, "grad_norm": 14.0, "learning_rate": 2.990254466702761e-05, "loss": 0.2104, "loss_lm": 0.020474249264225364, "loss_seg": 0.1899738498032093, "mean_token_accuracy": 0.9950738847255707, "num_tokens": 161512560.0, "step": 380 }, { "entropy": 0.031555878929793835, "epoch": 0.16675785096837728, "grad_norm": 22.0, "learning_rate": 2.9899837574445045e-05, "loss": 0.1995, "loss_lm": 0.018988156225532293, "loss_seg": 0.1805494762957096, "mean_token_accuracy": 0.9952140301465988, "num_tokens": 161936897.0, "step": 381 }, { "entropy": 0.03139236522838473, "epoch": 0.16719553561658826, "grad_norm": 7.5, "learning_rate": 2.989713048186248e-05, "loss": 0.1717, "loss_lm": 0.018058296758681536, "loss_seg": 0.15363793075084686, "mean_token_accuracy": 0.9951190054416656, "num_tokens": 162361388.0, "step": 382 }, { "entropy": 0.03209950402379036, "epoch": 0.1676332202647992, "grad_norm": 11.125, "learning_rate": 2.9894423389279916e-05, "loss": 0.2153, "loss_lm": 0.019605645444244146, "loss_seg": 0.195688184350729, "mean_token_accuracy": 0.9950139671564102, "num_tokens": 162786531.0, "step": 383 }, { "entropy": 0.03148052096366882, "epoch": 0.1680709049130102, "grad_norm": 11.3125, "learning_rate": 2.989171629669735e-05, "loss": 0.1525, "loss_lm": 0.01844488689675927, "loss_seg": 0.1340642999857664, "mean_token_accuracy": 0.9950131922960281, "num_tokens": 163211630.0, "step": 384 }, { "entropy": 0.031396444886922836, "epoch": 0.16850858956122114, "grad_norm": 23.75, "learning_rate": 2.988900920411478e-05, "loss": 0.1815, "loss_lm": 0.017259130720049143, "loss_seg": 0.16422992944717407, "mean_token_accuracy": 0.9950071722269058, "num_tokens": 163636626.0, "step": 385 }, { "entropy": 0.031023005954921246, "epoch": 0.16894627420943212, "grad_norm": 10.3125, "learning_rate": 2.9886302111532214e-05, "loss": 0.1941, "loss_lm": 0.01827775244601071, "loss_seg": 0.17585142515599728, "mean_token_accuracy": 0.99509397149086, "num_tokens": 164061585.0, "step": 386 }, { "entropy": 0.03137870039790869, "epoch": 0.16938395885764307, "grad_norm": 11.1875, "learning_rate": 2.9883595018949648e-05, "loss": 0.1619, "loss_lm": 0.01679023401811719, "loss_seg": 0.14510618522763252, "mean_token_accuracy": 0.9950006157159805, "num_tokens": 164486624.0, "step": 387 }, { "entropy": 0.031311911065131426, "epoch": 0.16982164350585402, "grad_norm": 11.25, "learning_rate": 2.9880887926367085e-05, "loss": 0.2078, "loss_lm": 0.017801897367462516, "loss_seg": 0.19001781940460205, "mean_token_accuracy": 0.9951049238443375, "num_tokens": 164911524.0, "step": 388 }, { "entropy": 0.03177269734442234, "epoch": 0.170259328154065, "grad_norm": 22.375, "learning_rate": 2.987818083378452e-05, "loss": 0.1792, "loss_lm": 0.017352089984342456, "loss_seg": 0.1618601270020008, "mean_token_accuracy": 0.9949445724487305, "num_tokens": 165337149.0, "step": 389 }, { "entropy": 0.0302529688924551, "epoch": 0.17069701280227595, "grad_norm": 10.9375, "learning_rate": 2.987547374120195e-05, "loss": 0.2093, "loss_lm": 0.017906054388731718, "loss_seg": 0.19142168760299683, "mean_token_accuracy": 0.995250329375267, "num_tokens": 165762180.0, "step": 390 }, { "entropy": 0.030232300516217947, "epoch": 0.17113469745048693, "grad_norm": 28.125, "learning_rate": 2.9872766648619382e-05, "loss": 0.2482, "loss_lm": 0.019496803171932697, "loss_seg": 0.228734340518713, "mean_token_accuracy": 0.9951015114784241, "num_tokens": 166186129.0, "step": 391 }, { "entropy": 0.031423448119312525, "epoch": 0.17157238209869788, "grad_norm": 11.9375, "learning_rate": 2.9870059556036816e-05, "loss": 0.2346, "loss_lm": 0.018818318378180265, "loss_seg": 0.2157513052225113, "mean_token_accuracy": 0.9948758482933044, "num_tokens": 166611558.0, "step": 392 }, { "entropy": 0.031015493907034397, "epoch": 0.17201006674690886, "grad_norm": 12.75, "learning_rate": 2.986735246345425e-05, "loss": 0.1615, "loss_lm": 0.018909971695393324, "loss_seg": 0.14260343089699745, "mean_token_accuracy": 0.9948815703392029, "num_tokens": 167036476.0, "step": 393 }, { "entropy": 0.030473208520561457, "epoch": 0.1724477513951198, "grad_norm": 11.5, "learning_rate": 2.9864645370871687e-05, "loss": 0.2039, "loss_lm": 0.01828971249051392, "loss_seg": 0.18564502336084843, "mean_token_accuracy": 0.9949809610843658, "num_tokens": 167461443.0, "step": 394 }, { "entropy": 0.030116192065179348, "epoch": 0.1728854360433308, "grad_norm": 12.5625, "learning_rate": 2.9861938278289117e-05, "loss": 0.2069, "loss_lm": 0.018552362453192472, "loss_seg": 0.18831505998969078, "mean_token_accuracy": 0.9950396418571472, "num_tokens": 167885403.0, "step": 395 }, { "entropy": 0.029960912186652422, "epoch": 0.17332312069154174, "grad_norm": 12.8125, "learning_rate": 2.985923118570655e-05, "loss": 0.1954, "loss_lm": 0.01710964972153306, "loss_seg": 0.17828438058495522, "mean_token_accuracy": 0.9951315522193909, "num_tokens": 168310163.0, "step": 396 }, { "entropy": 0.030242603737860918, "epoch": 0.17376080533975272, "grad_norm": 16.125, "learning_rate": 2.9856524093123985e-05, "loss": 0.1934, "loss_lm": 0.01956411637365818, "loss_seg": 0.17387208342552185, "mean_token_accuracy": 0.9949858486652374, "num_tokens": 168735534.0, "step": 397 }, { "entropy": 0.02998630004003644, "epoch": 0.17419848998796367, "grad_norm": 11.0, "learning_rate": 2.985381700054142e-05, "loss": 0.1769, "loss_lm": 0.019448031671345234, "loss_seg": 0.1574503853917122, "mean_token_accuracy": 0.9949344545602798, "num_tokens": 169160748.0, "step": 398 }, { "entropy": 0.029291301034390926, "epoch": 0.17463617463617465, "grad_norm": 13.125, "learning_rate": 2.9851109907958856e-05, "loss": 0.2387, "loss_lm": 0.01812267117202282, "loss_seg": 0.22057701274752617, "mean_token_accuracy": 0.9951706230640411, "num_tokens": 169585208.0, "step": 399 }, { "entropy": 0.028948036022484303, "epoch": 0.1750738592843856, "grad_norm": 14.875, "learning_rate": 2.9848402815376286e-05, "loss": 0.1923, "loss_lm": 0.016096462029963732, "loss_seg": 0.17616800591349602, "mean_token_accuracy": 0.9952720552682877, "num_tokens": 170010738.0, "step": 400 }, { "entropy": 0.030208085663616657, "epoch": 0.17551154393259658, "grad_norm": 11.0, "learning_rate": 2.984569572279372e-05, "loss": 0.1624, "loss_lm": 0.018268529325723648, "loss_seg": 0.1441155392676592, "mean_token_accuracy": 0.9948780536651611, "num_tokens": 170436146.0, "step": 401 }, { "entropy": 0.029348785057663918, "epoch": 0.17594922858080753, "grad_norm": 9.8125, "learning_rate": 2.9842988630211153e-05, "loss": 0.1756, "loss_lm": 0.01788363605737686, "loss_seg": 0.15774820744991302, "mean_token_accuracy": 0.9950522482395172, "num_tokens": 170861742.0, "step": 402 }, { "entropy": 0.02991746598854661, "epoch": 0.1763869132290185, "grad_norm": 17.125, "learning_rate": 2.9840281537628587e-05, "loss": 0.1836, "loss_lm": 0.017095545306801796, "loss_seg": 0.1664871610701084, "mean_token_accuracy": 0.9950747787952423, "num_tokens": 171286809.0, "step": 403 }, { "entropy": 0.02928477991372347, "epoch": 0.17682459787722946, "grad_norm": 10.3125, "learning_rate": 2.983757444504602e-05, "loss": 0.2628, "loss_lm": 0.01688672974705696, "loss_seg": 0.2458757348358631, "mean_token_accuracy": 0.9950176179409027, "num_tokens": 171711684.0, "step": 404 }, { "entropy": 0.02970478404313326, "epoch": 0.1772622825254404, "grad_norm": 9.375, "learning_rate": 2.9834867352463455e-05, "loss": 0.1709, "loss_lm": 0.020157197955995798, "loss_seg": 0.15075343661010265, "mean_token_accuracy": 0.9949806183576584, "num_tokens": 172136813.0, "step": 405 }, { "entropy": 0.02933089667931199, "epoch": 0.17769996717365139, "grad_norm": 10.6875, "learning_rate": 2.983216025988089e-05, "loss": 0.1311, "loss_lm": 0.017830198630690575, "loss_seg": 0.113241296261549, "mean_token_accuracy": 0.9949918389320374, "num_tokens": 172562083.0, "step": 406 }, { "entropy": 0.029065421782433987, "epoch": 0.17813765182186234, "grad_norm": 10.1875, "learning_rate": 2.9829453167298322e-05, "loss": 0.1489, "loss_lm": 0.01729930634610355, "loss_seg": 0.13164907693862915, "mean_token_accuracy": 0.995049238204956, "num_tokens": 172987022.0, "step": 407 }, { "entropy": 0.029057391919195652, "epoch": 0.17857533647007331, "grad_norm": 13.75, "learning_rate": 2.9826746074715756e-05, "loss": 0.1854, "loss_lm": 0.017457469599321485, "loss_seg": 0.16796662658452988, "mean_token_accuracy": 0.9950854480266571, "num_tokens": 173412874.0, "step": 408 }, { "entropy": 0.028739509638398886, "epoch": 0.17901302111828427, "grad_norm": 11.0625, "learning_rate": 2.982403898213319e-05, "loss": 0.2214, "loss_lm": 0.01849576225504279, "loss_seg": 0.2028703521937132, "mean_token_accuracy": 0.9951789528131485, "num_tokens": 173838466.0, "step": 409 }, { "entropy": 0.02837638556957245, "epoch": 0.17945070576649524, "grad_norm": 24.0, "learning_rate": 2.9821331889550623e-05, "loss": 0.2046, "loss_lm": 0.02180209569633007, "loss_seg": 0.18284295871853828, "mean_token_accuracy": 0.9951794147491455, "num_tokens": 174264066.0, "step": 410 }, { "entropy": 0.029401167295873165, "epoch": 0.1798883904147062, "grad_norm": 9.8125, "learning_rate": 2.9818624796968057e-05, "loss": 0.2055, "loss_lm": 0.020016676746308804, "loss_seg": 0.18553326278924942, "mean_token_accuracy": 0.994912326335907, "num_tokens": 174689407.0, "step": 411 }, { "entropy": 0.029244455508887768, "epoch": 0.18032607506291717, "grad_norm": 55.25, "learning_rate": 2.981591770438549e-05, "loss": 0.2057, "loss_lm": 0.01764343911781907, "loss_seg": 0.18805190920829773, "mean_token_accuracy": 0.9949567914009094, "num_tokens": 175114499.0, "step": 412 }, { "entropy": 0.029128138441592455, "epoch": 0.18076375971112812, "grad_norm": 14.6875, "learning_rate": 2.9813210611802924e-05, "loss": 0.2491, "loss_lm": 0.01869189366698265, "loss_seg": 0.23044283501803875, "mean_token_accuracy": 0.9949673563241959, "num_tokens": 175539690.0, "step": 413 }, { "entropy": 0.028826304711401463, "epoch": 0.1812014443593391, "grad_norm": 11.5, "learning_rate": 2.9810503519220358e-05, "loss": 0.2234, "loss_lm": 0.019453096203505993, "loss_seg": 0.2039712704718113, "mean_token_accuracy": 0.9950003921985626, "num_tokens": 175965049.0, "step": 414 }, { "entropy": 0.028177348896861076, "epoch": 0.18163912900755005, "grad_norm": 17.875, "learning_rate": 2.9807796426637792e-05, "loss": 0.1748, "loss_lm": 0.01657100347802043, "loss_seg": 0.15819650143384933, "mean_token_accuracy": 0.9951144754886627, "num_tokens": 176390155.0, "step": 415 }, { "entropy": 0.028241741936653852, "epoch": 0.18207681365576103, "grad_norm": 20.375, "learning_rate": 2.9805089334055226e-05, "loss": 0.2325, "loss_lm": 0.017193770268931985, "loss_seg": 0.2153029404580593, "mean_token_accuracy": 0.9950942546129227, "num_tokens": 176815133.0, "step": 416 }, { "entropy": 0.028280611149966717, "epoch": 0.18251449830397198, "grad_norm": 18.25, "learning_rate": 2.980238224147266e-05, "loss": 0.1591, "loss_lm": 0.01835136068984866, "loss_seg": 0.1407839022576809, "mean_token_accuracy": 0.995024248957634, "num_tokens": 177240776.0, "step": 417 }, { "entropy": 0.02826086711138487, "epoch": 0.18295218295218296, "grad_norm": 10.4375, "learning_rate": 2.9799675148890093e-05, "loss": 0.1708, "loss_lm": 0.01894219173118472, "loss_seg": 0.1518598571419716, "mean_token_accuracy": 0.9950684159994125, "num_tokens": 177665953.0, "step": 418 }, { "entropy": 0.028353999834507704, "epoch": 0.1833898676003939, "grad_norm": 14.5, "learning_rate": 2.9796968056307527e-05, "loss": 0.1607, "loss_lm": 0.017732006264850497, "loss_seg": 0.1430056244134903, "mean_token_accuracy": 0.9948750138282776, "num_tokens": 178091715.0, "step": 419 }, { "entropy": 0.026787316892296076, "epoch": 0.1838275522486049, "grad_norm": 15.9375, "learning_rate": 2.979426096372496e-05, "loss": 0.1987, "loss_lm": 0.017773106461390853, "loss_seg": 0.18095580115914345, "mean_token_accuracy": 0.9951706677675247, "num_tokens": 178516797.0, "step": 420 }, { "entropy": 0.02729091001674533, "epoch": 0.18426523689681584, "grad_norm": 13.6875, "learning_rate": 2.9791553871142394e-05, "loss": 0.213, "loss_lm": 0.01897671353071928, "loss_seg": 0.1940369065850973, "mean_token_accuracy": 0.9952114969491959, "num_tokens": 178941919.0, "step": 421 }, { "entropy": 0.02768686879426241, "epoch": 0.18470292154502682, "grad_norm": 24.875, "learning_rate": 2.9788846778559828e-05, "loss": 0.1896, "loss_lm": 0.017872720025479794, "loss_seg": 0.17177292332053185, "mean_token_accuracy": 0.9950226694345474, "num_tokens": 179367215.0, "step": 422 }, { "entropy": 0.02788735879585147, "epoch": 0.18514060619323777, "grad_norm": 8.5, "learning_rate": 2.9786139685977262e-05, "loss": 0.171, "loss_lm": 0.0161010785959661, "loss_seg": 0.15488631650805473, "mean_token_accuracy": 0.9950727671384811, "num_tokens": 179792565.0, "step": 423 }, { "entropy": 0.027110973373055458, "epoch": 0.18557829084144872, "grad_norm": 15.8125, "learning_rate": 2.9783432593394695e-05, "loss": 0.1748, "loss_lm": 0.01863219030201435, "loss_seg": 0.15612566284835339, "mean_token_accuracy": 0.9951549619436264, "num_tokens": 180217509.0, "step": 424 }, { "entropy": 0.02766592428088188, "epoch": 0.1860159754896597, "grad_norm": 17.625, "learning_rate": 2.978072550081213e-05, "loss": 0.1885, "loss_lm": 0.017415453912690282, "loss_seg": 0.17106617242097855, "mean_token_accuracy": 0.9949548095464706, "num_tokens": 180642676.0, "step": 425 }, { "entropy": 0.02708426071330905, "epoch": 0.18645366013787065, "grad_norm": 18.25, "learning_rate": 2.9778018408229563e-05, "loss": 0.2312, "loss_lm": 0.019764424534514546, "loss_seg": 0.21142630651593208, "mean_token_accuracy": 0.9951465427875519, "num_tokens": 181067918.0, "step": 426 }, { "entropy": 0.027664766646921635, "epoch": 0.18689134478608163, "grad_norm": 13.625, "learning_rate": 2.9775311315646997e-05, "loss": 0.2158, "loss_lm": 0.01588010462000966, "loss_seg": 0.1999109424650669, "mean_token_accuracy": 0.9949290454387665, "num_tokens": 181493463.0, "step": 427 }, { "entropy": 0.02671548817306757, "epoch": 0.18732902943429258, "grad_norm": 11.9375, "learning_rate": 2.9772604223064427e-05, "loss": 0.1844, "loss_lm": 0.01986208138987422, "loss_seg": 0.16456240601837635, "mean_token_accuracy": 0.9952501952648163, "num_tokens": 181918067.0, "step": 428 }, { "entropy": 0.027683306019753218, "epoch": 0.18776671408250356, "grad_norm": 9.0, "learning_rate": 2.976989713048186e-05, "loss": 0.2166, "loss_lm": 0.017225287621840835, "loss_seg": 0.1993332915008068, "mean_token_accuracy": 0.9949435293674469, "num_tokens": 182343050.0, "step": 429 }, { "entropy": 0.026995494030416012, "epoch": 0.1882043987307145, "grad_norm": 11.5, "learning_rate": 2.9767190037899298e-05, "loss": 0.1841, "loss_lm": 0.016869226237758994, "loss_seg": 0.1672421619296074, "mean_token_accuracy": 0.9951151758432388, "num_tokens": 182768200.0, "step": 430 }, { "entropy": 0.027409778907895088, "epoch": 0.1886420833789255, "grad_norm": 13.75, "learning_rate": 2.976448294531673e-05, "loss": 0.1645, "loss_lm": 0.01713972701691091, "loss_seg": 0.14733323827385902, "mean_token_accuracy": 0.995104506611824, "num_tokens": 183193076.0, "step": 431 }, { "entropy": 0.027692399453371763, "epoch": 0.18907976802713644, "grad_norm": 14.0, "learning_rate": 2.9761775852734165e-05, "loss": 0.2296, "loss_lm": 0.018275896552950144, "loss_seg": 0.21127891913056374, "mean_token_accuracy": 0.9950283020734787, "num_tokens": 183618273.0, "step": 432 }, { "entropy": 0.027298816479742527, "epoch": 0.18951745267534742, "grad_norm": 28.125, "learning_rate": 2.9759068760151596e-05, "loss": 0.2688, "loss_lm": 0.018500583013519645, "loss_seg": 0.2503487579524517, "mean_token_accuracy": 0.9952258914709091, "num_tokens": 184043448.0, "step": 433 }, { "entropy": 0.028064473532140255, "epoch": 0.18995513732355837, "grad_norm": 17.125, "learning_rate": 2.975636166756903e-05, "loss": 0.1632, "loss_lm": 0.017126570688560605, "loss_seg": 0.1460723765194416, "mean_token_accuracy": 0.9949002861976624, "num_tokens": 184468366.0, "step": 434 }, { "entropy": 0.027689995244145393, "epoch": 0.19039282197176935, "grad_norm": 13.875, "learning_rate": 2.9753654574986466e-05, "loss": 0.1889, "loss_lm": 0.015924200881272554, "loss_seg": 0.17299305647611618, "mean_token_accuracy": 0.9950888305902481, "num_tokens": 184893295.0, "step": 435 }, { "entropy": 0.02853596955537796, "epoch": 0.1908305066199803, "grad_norm": 25.875, "learning_rate": 2.97509474824039e-05, "loss": 0.1934, "loss_lm": 0.0209753573872149, "loss_seg": 0.17240547016263008, "mean_token_accuracy": 0.9949455708265305, "num_tokens": 185318653.0, "step": 436 }, { "entropy": 0.027598432265222073, "epoch": 0.19126819126819128, "grad_norm": 8.4375, "learning_rate": 2.9748240389821334e-05, "loss": 0.2141, "loss_lm": 0.01814459590241313, "loss_seg": 0.19593043997883797, "mean_token_accuracy": 0.9950426071882248, "num_tokens": 185743853.0, "step": 437 }, { "entropy": 0.027821162715554237, "epoch": 0.19170587591640223, "grad_norm": 13.625, "learning_rate": 2.9745533297238764e-05, "loss": 0.1804, "loss_lm": 0.019046641420572996, "loss_seg": 0.1613696739077568, "mean_token_accuracy": 0.9950989037752151, "num_tokens": 186169038.0, "step": 438 }, { "entropy": 0.02763694664463401, "epoch": 0.1921435605646132, "grad_norm": 40.75, "learning_rate": 2.9742826204656198e-05, "loss": 0.1961, "loss_lm": 0.017697892850264907, "loss_seg": 0.1783599965274334, "mean_token_accuracy": 0.995049238204956, "num_tokens": 186593380.0, "step": 439 }, { "entropy": 0.027932789642363787, "epoch": 0.19258124521282416, "grad_norm": 24.5, "learning_rate": 2.9740119112073635e-05, "loss": 0.1442, "loss_lm": 0.016979251289740205, "loss_seg": 0.12718561105430126, "mean_token_accuracy": 0.9948758631944656, "num_tokens": 187018618.0, "step": 440 }, { "entropy": 0.02866519009694457, "epoch": 0.19301892986103514, "grad_norm": 32.5, "learning_rate": 2.973741201949107e-05, "loss": 0.181, "loss_lm": 0.02035455359145999, "loss_seg": 0.16063130646944046, "mean_token_accuracy": 0.9947940707206726, "num_tokens": 187444100.0, "step": 441 }, { "entropy": 0.027783683966845274, "epoch": 0.1934566145092461, "grad_norm": 68.5, "learning_rate": 2.9734704926908503e-05, "loss": 0.2188, "loss_lm": 0.01856807339936495, "loss_seg": 0.20024745538830757, "mean_token_accuracy": 0.9950355589389801, "num_tokens": 187868488.0, "step": 442 }, { "entropy": 0.02673245593905449, "epoch": 0.19389429915745704, "grad_norm": 9.6875, "learning_rate": 2.9731997834325933e-05, "loss": 0.1727, "loss_lm": 0.01953325793147087, "loss_seg": 0.15316522493958473, "mean_token_accuracy": 0.995163083076477, "num_tokens": 188293705.0, "step": 443 }, { "entropy": 0.02730041742324829, "epoch": 0.19433198380566802, "grad_norm": 19.375, "learning_rate": 2.9729290741743367e-05, "loss": 0.2443, "loss_lm": 0.017008480383083224, "loss_seg": 0.22731963545084, "mean_token_accuracy": 0.9950699806213379, "num_tokens": 188718856.0, "step": 444 }, { "entropy": 0.026593342889100313, "epoch": 0.19476966845387897, "grad_norm": 9.8125, "learning_rate": 2.9726583649160804e-05, "loss": 0.1301, "loss_lm": 0.016796777956187725, "loss_seg": 0.11332544684410095, "mean_token_accuracy": 0.9951733648777008, "num_tokens": 189143941.0, "step": 445 }, { "entropy": 0.027890676632523537, "epoch": 0.19520735310208995, "grad_norm": 10.4375, "learning_rate": 2.9723876556578237e-05, "loss": 0.2153, "loss_lm": 0.02045314572751522, "loss_seg": 0.19487272948026657, "mean_token_accuracy": 0.994874581694603, "num_tokens": 189569103.0, "step": 446 }, { "entropy": 0.02717117080464959, "epoch": 0.1956450377503009, "grad_norm": 11.625, "learning_rate": 2.972116946399567e-05, "loss": 0.1867, "loss_lm": 0.018790626199916005, "loss_seg": 0.16794475726783276, "mean_token_accuracy": 0.995045468211174, "num_tokens": 189994167.0, "step": 447 }, { "entropy": 0.026573636569082737, "epoch": 0.19608272239851188, "grad_norm": 7.65625, "learning_rate": 2.97184623714131e-05, "loss": 0.1797, "loss_lm": 0.01600011601112783, "loss_seg": 0.16374944522976875, "mean_token_accuracy": 0.9952274411916733, "num_tokens": 190418623.0, "step": 448 }, { "entropy": 0.027727169450372458, "epoch": 0.19652040704672283, "grad_norm": 29.125, "learning_rate": 2.9715755278830535e-05, "loss": 0.1797, "loss_lm": 0.017979140859097242, "loss_seg": 0.16170133277773857, "mean_token_accuracy": 0.9948810487985611, "num_tokens": 190844480.0, "step": 449 }, { "entropy": 0.026645800098776817, "epoch": 0.1969580916949338, "grad_norm": 32.0, "learning_rate": 2.9713048186247972e-05, "loss": 0.1792, "loss_lm": 0.01701002474874258, "loss_seg": 0.16219790652394295, "mean_token_accuracy": 0.9951037913560867, "num_tokens": 191269647.0, "step": 450 }, { "entropy": 0.026766687631607056, "epoch": 0.19739577634314476, "grad_norm": 16.5, "learning_rate": 2.9710341093665406e-05, "loss": 0.1488, "loss_lm": 0.017386683262884617, "loss_seg": 0.13140122592449188, "mean_token_accuracy": 0.9950246214866638, "num_tokens": 191694314.0, "step": 451 }, { "entropy": 0.027146275620907545, "epoch": 0.19783346099135574, "grad_norm": 14.25, "learning_rate": 2.9707634001082836e-05, "loss": 0.2192, "loss_lm": 0.019979296484962106, "loss_seg": 0.19927046447992325, "mean_token_accuracy": 0.9948320090770721, "num_tokens": 192118461.0, "step": 452 }, { "entropy": 0.025890410412102938, "epoch": 0.1982711456395667, "grad_norm": 19.375, "learning_rate": 2.970492690850027e-05, "loss": 0.2203, "loss_lm": 0.01891688467003405, "loss_seg": 0.2013789266347885, "mean_token_accuracy": 0.9951604604721069, "num_tokens": 192543260.0, "step": 453 }, { "entropy": 0.02593234647065401, "epoch": 0.19870883028777767, "grad_norm": 19.0, "learning_rate": 2.9702219815917704e-05, "loss": 0.1675, "loss_lm": 0.01804658491164446, "loss_seg": 0.14949303306639194, "mean_token_accuracy": 0.9951607435941696, "num_tokens": 192968091.0, "step": 454 }, { "entropy": 0.026417152490466833, "epoch": 0.19914651493598862, "grad_norm": 18.375, "learning_rate": 2.969951272333514e-05, "loss": 0.2147, "loss_lm": 0.01808837172575295, "loss_seg": 0.19666007161140442, "mean_token_accuracy": 0.9951050728559494, "num_tokens": 193392988.0, "step": 455 }, { "entropy": 0.026017773430794477, "epoch": 0.1995841995841996, "grad_norm": 13.0625, "learning_rate": 2.9696805630752575e-05, "loss": 0.1998, "loss_lm": 0.01700029638595879, "loss_seg": 0.1828068494796753, "mean_token_accuracy": 0.9950993359088898, "num_tokens": 193817808.0, "step": 456 }, { "entropy": 0.026798737235367298, "epoch": 0.20002188423241055, "grad_norm": 12.5625, "learning_rate": 2.9694098538170005e-05, "loss": 0.1823, "loss_lm": 0.01835953793488443, "loss_seg": 0.16392433643341064, "mean_token_accuracy": 0.9949840754270554, "num_tokens": 194242024.0, "step": 457 }, { "entropy": 0.02677209721878171, "epoch": 0.20045956888062152, "grad_norm": 19.875, "learning_rate": 2.969139144558744e-05, "loss": 0.2333, "loss_lm": 0.020438782405108213, "loss_seg": 0.21281898766756058, "mean_token_accuracy": 0.9949019551277161, "num_tokens": 194667472.0, "step": 458 }, { "entropy": 0.026761106215417385, "epoch": 0.20089725352883248, "grad_norm": 18.0, "learning_rate": 2.9688684353004872e-05, "loss": 0.1844, "loss_lm": 0.015428267884999514, "loss_seg": 0.16892513260245323, "mean_token_accuracy": 0.994940459728241, "num_tokens": 195092011.0, "step": 459 }, { "entropy": 0.02620151173323393, "epoch": 0.20133493817704343, "grad_norm": 12.3125, "learning_rate": 2.9685977260422306e-05, "loss": 0.1529, "loss_lm": 0.014842558186501265, "loss_seg": 0.1380387358367443, "mean_token_accuracy": 0.9952540397644043, "num_tokens": 195516967.0, "step": 460 }, { "entropy": 0.026848596520721912, "epoch": 0.2017726228252544, "grad_norm": 10.0625, "learning_rate": 2.9683270167839743e-05, "loss": 0.1796, "loss_lm": 0.020482894498854876, "loss_seg": 0.15910447388887405, "mean_token_accuracy": 0.9949885606765747, "num_tokens": 195942357.0, "step": 461 }, { "entropy": 0.026518710888922215, "epoch": 0.20221030747346536, "grad_norm": 15.1875, "learning_rate": 2.9680563075257174e-05, "loss": 0.1485, "loss_lm": 0.018103762064129114, "loss_seg": 0.13035963475704193, "mean_token_accuracy": 0.9950963109731674, "num_tokens": 196366896.0, "step": 462 }, { "entropy": 0.026247555390000343, "epoch": 0.20264799212167633, "grad_norm": 11.5, "learning_rate": 2.9677855982674607e-05, "loss": 0.1608, "loss_lm": 0.01546968356706202, "loss_seg": 0.14537492580711842, "mean_token_accuracy": 0.995126485824585, "num_tokens": 196792019.0, "step": 463 }, { "entropy": 0.025957115460187197, "epoch": 0.20308567676988729, "grad_norm": 9.3125, "learning_rate": 2.967514889009204e-05, "loss": 0.1775, "loss_lm": 0.014867734862491488, "loss_seg": 0.16260599717497826, "mean_token_accuracy": 0.9952259957790375, "num_tokens": 197216793.0, "step": 464 }, { "entropy": 0.026554581709206104, "epoch": 0.20352336141809826, "grad_norm": 13.1875, "learning_rate": 2.9672441797509475e-05, "loss": 0.2191, "loss_lm": 0.018215002957731485, "loss_seg": 0.2008741870522499, "mean_token_accuracy": 0.9950898587703705, "num_tokens": 197641396.0, "step": 465 }, { "entropy": 0.027062344830483198, "epoch": 0.20396104606630922, "grad_norm": 15.9375, "learning_rate": 2.9669734704926912e-05, "loss": 0.2145, "loss_lm": 0.018846245715394616, "loss_seg": 0.1956169195473194, "mean_token_accuracy": 0.9950655549764633, "num_tokens": 198066321.0, "step": 466 }, { "entropy": 0.027229006867855787, "epoch": 0.2043987307145202, "grad_norm": 18.5, "learning_rate": 2.9667027612344342e-05, "loss": 0.2251, "loss_lm": 0.01782834902405739, "loss_seg": 0.20724662020802498, "mean_token_accuracy": 0.9951410442590714, "num_tokens": 198491448.0, "step": 467 }, { "entropy": 0.026817286852747202, "epoch": 0.20483641536273114, "grad_norm": 10.9375, "learning_rate": 2.9664320519761776e-05, "loss": 0.2869, "loss_lm": 0.016587401274591684, "loss_seg": 0.270290344953537, "mean_token_accuracy": 0.9950340837240219, "num_tokens": 198916120.0, "step": 468 }, { "entropy": 0.02669328637421131, "epoch": 0.20527410001094212, "grad_norm": 11.875, "learning_rate": 2.966161342717921e-05, "loss": 0.3117, "loss_lm": 0.018003653036430478, "loss_seg": 0.29365907050669193, "mean_token_accuracy": 0.9950888603925705, "num_tokens": 199341042.0, "step": 469 }, { "entropy": 0.026361124124377966, "epoch": 0.20571178465915307, "grad_norm": 12.5, "learning_rate": 2.9658906334596643e-05, "loss": 0.1975, "loss_lm": 0.016705274349078536, "loss_seg": 0.18084228411316872, "mean_token_accuracy": 0.9951383173465729, "num_tokens": 199766182.0, "step": 470 }, { "entropy": 0.026315534487366676, "epoch": 0.20614946930736405, "grad_norm": 11.125, "learning_rate": 2.965619924201408e-05, "loss": 0.2106, "loss_lm": 0.01845134189352393, "loss_seg": 0.19217592105269432, "mean_token_accuracy": 0.9951562136411667, "num_tokens": 200191668.0, "step": 471 }, { "entropy": 0.02761216787621379, "epoch": 0.206587153955575, "grad_norm": 12.125, "learning_rate": 2.965349214943151e-05, "loss": 0.1892, "loss_lm": 0.018316732486709952, "loss_seg": 0.17093230783939362, "mean_token_accuracy": 0.994753286242485, "num_tokens": 200617422.0, "step": 472 }, { "entropy": 0.0267298542894423, "epoch": 0.20702483860378598, "grad_norm": 13.125, "learning_rate": 2.9650785056848945e-05, "loss": 0.1354, "loss_lm": 0.017278319224715233, "loss_seg": 0.11810050718486309, "mean_token_accuracy": 0.9950340688228607, "num_tokens": 201042306.0, "step": 473 }, { "entropy": 0.026640201453119516, "epoch": 0.20746252325199693, "grad_norm": 12.125, "learning_rate": 2.964807796426638e-05, "loss": 0.213, "loss_lm": 0.018540012184530497, "loss_seg": 0.19444827362895012, "mean_token_accuracy": 0.9951659739017487, "num_tokens": 201467605.0, "step": 474 }, { "entropy": 0.02768407016992569, "epoch": 0.2079002079002079, "grad_norm": 17.0, "learning_rate": 2.9645370871683812e-05, "loss": 0.1992, "loss_lm": 0.018877195194363594, "loss_seg": 0.18036198429763317, "mean_token_accuracy": 0.9949134141206741, "num_tokens": 201892608.0, "step": 475 }, { "entropy": 0.026211443822830915, "epoch": 0.20833789254841886, "grad_norm": 10.375, "learning_rate": 2.9642663779101246e-05, "loss": 0.209, "loss_lm": 0.018823795253410935, "loss_seg": 0.19018888100981712, "mean_token_accuracy": 0.9951914995908737, "num_tokens": 202317462.0, "step": 476 }, { "entropy": 0.02656680904328823, "epoch": 0.20877557719662984, "grad_norm": 14.0, "learning_rate": 2.963995668651868e-05, "loss": 0.2094, "loss_lm": 0.01641870616003871, "loss_seg": 0.19297432526946068, "mean_token_accuracy": 0.9949952811002731, "num_tokens": 202742651.0, "step": 477 }, { "entropy": 0.02720764372497797, "epoch": 0.2092132618448408, "grad_norm": 10.125, "learning_rate": 2.9637249593936113e-05, "loss": 0.2082, "loss_lm": 0.018808207008987665, "loss_seg": 0.1893446147441864, "mean_token_accuracy": 0.9949870407581329, "num_tokens": 203167927.0, "step": 478 }, { "entropy": 0.026405273471027613, "epoch": 0.20965094649305174, "grad_norm": 6.53125, "learning_rate": 2.9634542501353547e-05, "loss": 0.15, "loss_lm": 0.019355395575985312, "loss_seg": 0.13064739853143692, "mean_token_accuracy": 0.9951191693544388, "num_tokens": 203593646.0, "step": 479 }, { "entropy": 0.025825410149991512, "epoch": 0.21008863114126272, "grad_norm": 8.625, "learning_rate": 2.963183540877098e-05, "loss": 0.1874, "loss_lm": 0.019391615875065327, "loss_seg": 0.1680542267858982, "mean_token_accuracy": 0.9951777905225754, "num_tokens": 204018510.0, "step": 480 }, { "entropy": 0.026297799311578274, "epoch": 0.21052631578947367, "grad_norm": 21.5, "learning_rate": 2.9629128316188414e-05, "loss": 0.2062, "loss_lm": 0.01854199543595314, "loss_seg": 0.18761701881885529, "mean_token_accuracy": 0.9950589537620544, "num_tokens": 204443490.0, "step": 481 }, { "entropy": 0.026390003971755505, "epoch": 0.21096400043768465, "grad_norm": 20.125, "learning_rate": 2.9626421223605848e-05, "loss": 0.1628, "loss_lm": 0.019575619138777256, "loss_seg": 0.14325229451060295, "mean_token_accuracy": 0.9950369298458099, "num_tokens": 204868799.0, "step": 482 }, { "entropy": 0.025821740739047527, "epoch": 0.2114016850858956, "grad_norm": 18.0, "learning_rate": 2.9623714131023282e-05, "loss": 0.2399, "loss_lm": 0.017037846846506, "loss_seg": 0.22283775359392166, "mean_token_accuracy": 0.9950796663761139, "num_tokens": 205293746.0, "step": 483 }, { "entropy": 0.026305614039301872, "epoch": 0.21183936973410658, "grad_norm": 7.0, "learning_rate": 2.9621007038440716e-05, "loss": 0.1918, "loss_lm": 0.020238970639184117, "loss_seg": 0.17152566462755203, "mean_token_accuracy": 0.9949326068162918, "num_tokens": 205719185.0, "step": 484 }, { "entropy": 0.025305184070020914, "epoch": 0.21227705438231753, "grad_norm": 7.0625, "learning_rate": 2.961829994585815e-05, "loss": 0.2325, "loss_lm": 0.01841004192829132, "loss_seg": 0.21405542083084583, "mean_token_accuracy": 0.9951383322477341, "num_tokens": 206144107.0, "step": 485 }, { "entropy": 0.02580512873828411, "epoch": 0.2127147390305285, "grad_norm": 12.4375, "learning_rate": 2.9615592853275583e-05, "loss": 0.2373, "loss_lm": 0.01806737598963082, "loss_seg": 0.2192077375948429, "mean_token_accuracy": 0.9950501024723053, "num_tokens": 206569724.0, "step": 486 }, { "entropy": 0.025618545711040497, "epoch": 0.21315242367873946, "grad_norm": 9.25, "learning_rate": 2.9612885760693017e-05, "loss": 0.168, "loss_lm": 0.018829626496881247, "loss_seg": 0.1491557639092207, "mean_token_accuracy": 0.9950786828994751, "num_tokens": 206994997.0, "step": 487 }, { "entropy": 0.025333139579743147, "epoch": 0.21359010832695044, "grad_norm": 9.0, "learning_rate": 2.961017866811045e-05, "loss": 0.181, "loss_lm": 0.019513788167387247, "loss_seg": 0.16143912635743618, "mean_token_accuracy": 0.9950737655162811, "num_tokens": 207419818.0, "step": 488 }, { "entropy": 0.025145765393972397, "epoch": 0.2140277929751614, "grad_norm": 11.6875, "learning_rate": 2.9607471575527884e-05, "loss": 0.1701, "loss_lm": 0.016873621847480536, "loss_seg": 0.15323060005903244, "mean_token_accuracy": 0.9951932579278946, "num_tokens": 207844769.0, "step": 489 }, { "entropy": 0.025116825476288795, "epoch": 0.21446547762337237, "grad_norm": 25.375, "learning_rate": 2.9604764482945318e-05, "loss": 0.2307, "loss_lm": 0.018722593318670988, "loss_seg": 0.21194185316562653, "mean_token_accuracy": 0.9951722919940948, "num_tokens": 208269788.0, "step": 490 }, { "entropy": 0.026102331466972828, "epoch": 0.21490316227158332, "grad_norm": 14.0625, "learning_rate": 2.9602057390362752e-05, "loss": 0.2343, "loss_lm": 0.019808434415608644, "loss_seg": 0.2144642397761345, "mean_token_accuracy": 0.9949634075164795, "num_tokens": 208694062.0, "step": 491 }, { "entropy": 0.02632714109495282, "epoch": 0.2153408469197943, "grad_norm": 10.1875, "learning_rate": 2.9599350297780185e-05, "loss": 0.2486, "loss_lm": 0.018681931775063276, "loss_seg": 0.22989097982645035, "mean_token_accuracy": 0.9949305951595306, "num_tokens": 209119346.0, "step": 492 }, { "entropy": 0.025748887564986944, "epoch": 0.21577853156800525, "grad_norm": 8.0625, "learning_rate": 2.959664320519762e-05, "loss": 0.2698, "loss_lm": 0.018818039447069168, "loss_seg": 0.25095391646027565, "mean_token_accuracy": 0.9951074868440628, "num_tokens": 209544648.0, "step": 493 }, { "entropy": 0.02670912817120552, "epoch": 0.21621621621621623, "grad_norm": 12.4375, "learning_rate": 2.9593936112615053e-05, "loss": 0.1967, "loss_lm": 0.02062403387390077, "loss_seg": 0.17611303180456161, "mean_token_accuracy": 0.9949854165315628, "num_tokens": 209969586.0, "step": 494 }, { "entropy": 0.026669688057154417, "epoch": 0.21665390086442718, "grad_norm": 12.125, "learning_rate": 2.9591229020032483e-05, "loss": 0.1576, "loss_lm": 0.01698905020020902, "loss_seg": 0.14059972949326038, "mean_token_accuracy": 0.9951277375221252, "num_tokens": 210394006.0, "step": 495 }, { "entropy": 0.02629974950104952, "epoch": 0.21709158551263816, "grad_norm": 8.75, "learning_rate": 2.9588521927449917e-05, "loss": 0.1768, "loss_lm": 0.020574014633893967, "loss_seg": 0.15623651444911957, "mean_token_accuracy": 0.995247408747673, "num_tokens": 210818791.0, "step": 496 }, { "entropy": 0.026343629229813814, "epoch": 0.2175292701608491, "grad_norm": 15.4375, "learning_rate": 2.9585814834867354e-05, "loss": 0.2275, "loss_lm": 0.01795199140906334, "loss_seg": 0.209506057202816, "mean_token_accuracy": 0.9950850009918213, "num_tokens": 211244385.0, "step": 497 }, { "entropy": 0.025698804296553135, "epoch": 0.21796695480906006, "grad_norm": 14.0625, "learning_rate": 2.9583107742284788e-05, "loss": 0.2221, "loss_lm": 0.016254385001957417, "loss_seg": 0.20585911720991135, "mean_token_accuracy": 0.9950823038816452, "num_tokens": 211668892.0, "step": 498 }, { "entropy": 0.0255874190479517, "epoch": 0.21840463945727104, "grad_norm": 8.0, "learning_rate": 2.958040064970222e-05, "loss": 0.1956, "loss_lm": 0.019253349862992764, "loss_seg": 0.1763761304318905, "mean_token_accuracy": 0.9950917959213257, "num_tokens": 212094073.0, "step": 499 }, { "entropy": 0.024882496800273657, "epoch": 0.218842324105482, "grad_norm": 10.4375, "learning_rate": 2.9577693557119652e-05, "loss": 0.1476, "loss_lm": 0.01658633048646152, "loss_seg": 0.13097364082932472, "mean_token_accuracy": 0.9951491355895996, "num_tokens": 212518892.0, "step": 500 }, { "entropy": 0.025194136891514063, "epoch": 0.21928000875369297, "grad_norm": 9.125, "learning_rate": 2.9574986464537086e-05, "loss": 0.1736, "loss_lm": 0.01735693495720625, "loss_seg": 0.15622863546013832, "mean_token_accuracy": 0.9952110350131989, "num_tokens": 212943580.0, "step": 501 }, { "entropy": 0.025457216426730156, "epoch": 0.21971769340190392, "grad_norm": 6.875, "learning_rate": 2.9572279371954523e-05, "loss": 0.1937, "loss_lm": 0.0180947775952518, "loss_seg": 0.17559592612087727, "mean_token_accuracy": 0.9951493293046951, "num_tokens": 213368850.0, "step": 502 }, { "entropy": 0.024795397650450468, "epoch": 0.2201553780501149, "grad_norm": 12.0, "learning_rate": 2.9569572279371956e-05, "loss": 0.1594, "loss_lm": 0.016334404470399022, "loss_seg": 0.14306028373539448, "mean_token_accuracy": 0.9952713996171951, "num_tokens": 213794110.0, "step": 503 }, { "entropy": 0.02569504687562585, "epoch": 0.22059306269832585, "grad_norm": 9.4375, "learning_rate": 2.956686518678939e-05, "loss": 0.198, "loss_lm": 0.017612150870263577, "loss_seg": 0.18040401488542557, "mean_token_accuracy": 0.9951404929161072, "num_tokens": 214219585.0, "step": 504 }, { "entropy": 0.026211058255285025, "epoch": 0.22103074734653683, "grad_norm": 7.90625, "learning_rate": 2.956415809420682e-05, "loss": 0.2108, "loss_lm": 0.019465302349999547, "loss_seg": 0.19130735471844673, "mean_token_accuracy": 0.9950664639472961, "num_tokens": 214645234.0, "step": 505 }, { "entropy": 0.02581868227571249, "epoch": 0.22146843199474778, "grad_norm": 11.6875, "learning_rate": 2.9561451001624254e-05, "loss": 0.1795, "loss_lm": 0.01623515342362225, "loss_seg": 0.1632585283368826, "mean_token_accuracy": 0.9949463903903961, "num_tokens": 215070474.0, "step": 506 }, { "entropy": 0.02561677759513259, "epoch": 0.22190611664295876, "grad_norm": 11.9375, "learning_rate": 2.955874390904169e-05, "loss": 0.205, "loss_lm": 0.017247363459318876, "loss_seg": 0.18779144808650017, "mean_token_accuracy": 0.9950262904167175, "num_tokens": 215494879.0, "step": 507 }, { "entropy": 0.025340169202536345, "epoch": 0.2223438012911697, "grad_norm": 12.0, "learning_rate": 2.9556036816459125e-05, "loss": 0.1491, "loss_lm": 0.01753318728879094, "loss_seg": 0.13154352456331253, "mean_token_accuracy": 0.9951101392507553, "num_tokens": 215920628.0, "step": 508 }, { "entropy": 0.02541325194761157, "epoch": 0.22278148593938069, "grad_norm": 16.0, "learning_rate": 2.955332972387656e-05, "loss": 0.1988, "loss_lm": 0.021075377240777016, "loss_seg": 0.17773282900452614, "mean_token_accuracy": 0.995013952255249, "num_tokens": 216346163.0, "step": 509 }, { "entropy": 0.025646645575761795, "epoch": 0.22321917058759164, "grad_norm": 10.75, "learning_rate": 2.955062263129399e-05, "loss": 0.1434, "loss_lm": 0.01574028842151165, "loss_seg": 0.12768089957535267, "mean_token_accuracy": 0.994828537106514, "num_tokens": 216770806.0, "step": 510 }, { "entropy": 0.02490712609142065, "epoch": 0.22365685523580262, "grad_norm": 12.125, "learning_rate": 2.9547915538711423e-05, "loss": 0.1658, "loss_lm": 0.019029746297746897, "loss_seg": 0.146758284419775, "mean_token_accuracy": 0.99517622590065, "num_tokens": 217195740.0, "step": 511 }, { "entropy": 0.02515697991475463, "epoch": 0.22409453988401357, "grad_norm": 9.5625, "learning_rate": 2.954520844612886e-05, "loss": 0.1901, "loss_lm": 0.0172179804649204, "loss_seg": 0.17285723239183426, "mean_token_accuracy": 0.9951075166463852, "num_tokens": 217621253.0, "step": 512 }, { "entropy": 0.024798331782221794, "epoch": 0.22453222453222454, "grad_norm": 7.46875, "learning_rate": 2.9542501353546294e-05, "loss": 0.1922, "loss_lm": 0.017021907959133387, "loss_seg": 0.17518125474452972, "mean_token_accuracy": 0.9951718896627426, "num_tokens": 218046631.0, "step": 513 }, { "entropy": 0.02563309110701084, "epoch": 0.2249699091804355, "grad_norm": 11.75, "learning_rate": 2.9539794260963727e-05, "loss": 0.1917, "loss_lm": 0.01972991693764925, "loss_seg": 0.17193631269037724, "mean_token_accuracy": 0.9948944300413132, "num_tokens": 218472434.0, "step": 514 }, { "entropy": 0.02481383178383112, "epoch": 0.22540759382864645, "grad_norm": 9.375, "learning_rate": 2.9537087168381158e-05, "loss": 0.1616, "loss_lm": 0.01650973316282034, "loss_seg": 0.14507020451128483, "mean_token_accuracy": 0.9952090084552765, "num_tokens": 218896940.0, "step": 515 }, { "entropy": 0.02560254093259573, "epoch": 0.22584527847685743, "grad_norm": 11.8125, "learning_rate": 2.953438007579859e-05, "loss": 0.2164, "loss_lm": 0.019088011234998703, "loss_seg": 0.19733688235282898, "mean_token_accuracy": 0.9948972165584564, "num_tokens": 219322000.0, "step": 516 }, { "entropy": 0.024403584189713, "epoch": 0.22628296312506838, "grad_norm": 7.46875, "learning_rate": 2.953167298321603e-05, "loss": 0.1686, "loss_lm": 0.015999488066881895, "loss_seg": 0.15258214809000492, "mean_token_accuracy": 0.9952554553747177, "num_tokens": 219746652.0, "step": 517 }, { "entropy": 0.025592682417482138, "epoch": 0.22672064777327935, "grad_norm": 11.0625, "learning_rate": 2.9528965890633462e-05, "loss": 0.1931, "loss_lm": 0.018649817211553454, "loss_seg": 0.17446828819811344, "mean_token_accuracy": 0.9949051439762115, "num_tokens": 220171190.0, "step": 518 }, { "entropy": 0.025457176379859447, "epoch": 0.2271583324214903, "grad_norm": 21.125, "learning_rate": 2.9526258798050893e-05, "loss": 0.2581, "loss_lm": 0.02158920280635357, "loss_seg": 0.23647217452526093, "mean_token_accuracy": 0.9950254112482071, "num_tokens": 220596546.0, "step": 519 }, { "entropy": 0.02476073894649744, "epoch": 0.22759601706970128, "grad_norm": 16.5, "learning_rate": 2.9523551705468326e-05, "loss": 0.2371, "loss_lm": 0.01668378966860473, "loss_seg": 0.22041629999876022, "mean_token_accuracy": 0.9951208382844925, "num_tokens": 221021384.0, "step": 520 }, { "entropy": 0.024626020342111588, "epoch": 0.22803370171791223, "grad_norm": 12.8125, "learning_rate": 2.952084461288576e-05, "loss": 0.211, "loss_lm": 0.017188548343256116, "loss_seg": 0.19381482526659966, "mean_token_accuracy": 0.995123490691185, "num_tokens": 221446043.0, "step": 521 }, { "entropy": 0.024522291962057352, "epoch": 0.2284713863661232, "grad_norm": 12.0, "learning_rate": 2.9518137520303197e-05, "loss": 0.2332, "loss_lm": 0.016742888605222106, "loss_seg": 0.21641577780246735, "mean_token_accuracy": 0.9950653612613678, "num_tokens": 221870985.0, "step": 522 }, { "entropy": 0.025335189420729876, "epoch": 0.22890907101433416, "grad_norm": 10.9375, "learning_rate": 2.951543042772063e-05, "loss": 0.1783, "loss_lm": 0.018687356263399124, "loss_seg": 0.15959056094288826, "mean_token_accuracy": 0.9949174523353577, "num_tokens": 222296548.0, "step": 523 }, { "entropy": 0.025316891260445118, "epoch": 0.22934675566254514, "grad_norm": 8.3125, "learning_rate": 2.951272333513806e-05, "loss": 0.1604, "loss_lm": 0.019185435259714723, "loss_seg": 0.14117233082652092, "mean_token_accuracy": 0.9948964864015579, "num_tokens": 222721710.0, "step": 524 }, { "entropy": 0.024506373330950737, "epoch": 0.2297844403107561, "grad_norm": 10.6875, "learning_rate": 2.9510016242555495e-05, "loss": 0.1839, "loss_lm": 0.01639516092836857, "loss_seg": 0.16748670674860477, "mean_token_accuracy": 0.9950292259454727, "num_tokens": 223147806.0, "step": 525 }, { "entropy": 0.025135952979326248, "epoch": 0.23022212495896707, "grad_norm": 9.375, "learning_rate": 2.950730914997293e-05, "loss": 0.2106, "loss_lm": 0.019327690126374364, "loss_seg": 0.1912490427494049, "mean_token_accuracy": 0.9948212057352066, "num_tokens": 223573160.0, "step": 526 }, { "entropy": 0.0247443038970232, "epoch": 0.23065980960717802, "grad_norm": 8.5625, "learning_rate": 2.9504602057390362e-05, "loss": 0.203, "loss_lm": 0.01824121014215052, "loss_seg": 0.18475939705967903, "mean_token_accuracy": 0.9951123297214508, "num_tokens": 223997267.0, "step": 527 }, { "entropy": 0.02486037090420723, "epoch": 0.231097494255389, "grad_norm": 12.4375, "learning_rate": 2.95018949648078e-05, "loss": 0.188, "loss_lm": 0.018753322772681713, "loss_seg": 0.16922978684306145, "mean_token_accuracy": 0.9949108213186264, "num_tokens": 224422281.0, "step": 528 }, { "entropy": 0.02444422570988536, "epoch": 0.23153517890359995, "grad_norm": 7.5625, "learning_rate": 2.949918787222523e-05, "loss": 0.1873, "loss_lm": 0.017032305477187037, "loss_seg": 0.17022673785686493, "mean_token_accuracy": 0.9951663315296173, "num_tokens": 224846995.0, "step": 529 }, { "entropy": 0.024728215765208006, "epoch": 0.23197286355181093, "grad_norm": 10.75, "learning_rate": 2.9496480779642664e-05, "loss": 0.1884, "loss_lm": 0.01621876028366387, "loss_seg": 0.17219862528145313, "mean_token_accuracy": 0.9950145781040192, "num_tokens": 225272024.0, "step": 530 }, { "entropy": 0.024147826712578535, "epoch": 0.23241054820002188, "grad_norm": 16.25, "learning_rate": 2.9493773687060097e-05, "loss": 0.2175, "loss_lm": 0.021853928454220295, "loss_seg": 0.1956302970647812, "mean_token_accuracy": 0.9950089901685715, "num_tokens": 225696769.0, "step": 531 }, { "entropy": 0.023755815345793962, "epoch": 0.23284823284823286, "grad_norm": 6.78125, "learning_rate": 2.949106659447753e-05, "loss": 0.2077, "loss_lm": 0.0184675264172256, "loss_seg": 0.1892213597893715, "mean_token_accuracy": 0.9951311349868774, "num_tokens": 226122291.0, "step": 532 }, { "entropy": 0.024783487897366285, "epoch": 0.2332859174964438, "grad_norm": 11.9375, "learning_rate": 2.9488359501894968e-05, "loss": 0.1812, "loss_lm": 0.018608642276376486, "loss_seg": 0.1625889204442501, "mean_token_accuracy": 0.9950028955936432, "num_tokens": 226547716.0, "step": 533 }, { "entropy": 0.024136213585734367, "epoch": 0.23372360214465476, "grad_norm": 13.3125, "learning_rate": 2.94856524093124e-05, "loss": 0.1781, "loss_lm": 0.015806567389518023, "loss_seg": 0.16225528717041016, "mean_token_accuracy": 0.9950843155384064, "num_tokens": 226972844.0, "step": 534 }, { "entropy": 0.02401077188551426, "epoch": 0.23416128679286574, "grad_norm": 8.625, "learning_rate": 2.9482945316729832e-05, "loss": 0.2091, "loss_lm": 0.016085418174043298, "loss_seg": 0.19300074502825737, "mean_token_accuracy": 0.995084136724472, "num_tokens": 227398580.0, "step": 535 }, { "entropy": 0.023152879904955626, "epoch": 0.2345989714410767, "grad_norm": 10.0, "learning_rate": 2.9480238224147266e-05, "loss": 0.2248, "loss_lm": 0.017071612644940615, "loss_seg": 0.2077261619269848, "mean_token_accuracy": 0.9952373206615448, "num_tokens": 227822254.0, "step": 536 }, { "entropy": 0.024509132839739323, "epoch": 0.23503665608928767, "grad_norm": 9.5, "learning_rate": 2.94775311315647e-05, "loss": 0.1811, "loss_lm": 0.017671722685918212, "loss_seg": 0.16340279951691628, "mean_token_accuracy": 0.9949207603931427, "num_tokens": 228246921.0, "step": 537 }, { "entropy": 0.0237965383566916, "epoch": 0.23547434073749862, "grad_norm": 11.4375, "learning_rate": 2.9474824038982137e-05, "loss": 0.1771, "loss_lm": 0.0149325558450073, "loss_seg": 0.1621437966823578, "mean_token_accuracy": 0.9951521903276443, "num_tokens": 228670990.0, "step": 538 }, { "entropy": 0.024223764427006245, "epoch": 0.2359120253857096, "grad_norm": 10.4375, "learning_rate": 2.9472116946399567e-05, "loss": 0.2108, "loss_lm": 0.018688868964090943, "loss_seg": 0.1921541504561901, "mean_token_accuracy": 0.995053842663765, "num_tokens": 229096345.0, "step": 539 }, { "entropy": 0.02409305050969124, "epoch": 0.23634971003392055, "grad_norm": 5.78125, "learning_rate": 2.9469409853817e-05, "loss": 0.2332, "loss_lm": 0.017662126570940018, "loss_seg": 0.21553605049848557, "mean_token_accuracy": 0.9950805455446243, "num_tokens": 229521988.0, "step": 540 }, { "entropy": 0.023441489785909653, "epoch": 0.23678739468213153, "grad_norm": 13.9375, "learning_rate": 2.9466702761234435e-05, "loss": 0.1188, "loss_lm": 0.01738453726284206, "loss_seg": 0.10140207782387733, "mean_token_accuracy": 0.9952477514743805, "num_tokens": 229947023.0, "step": 541 }, { "entropy": 0.02418115409091115, "epoch": 0.23722507933034248, "grad_norm": 17.125, "learning_rate": 2.946399566865187e-05, "loss": 0.194, "loss_lm": 0.01849177642725408, "loss_seg": 0.1754625029861927, "mean_token_accuracy": 0.9950447976589203, "num_tokens": 230372002.0, "step": 542 }, { "entropy": 0.02439744584262371, "epoch": 0.23766276397855346, "grad_norm": 9.75, "learning_rate": 2.9461288576069302e-05, "loss": 0.2001, "loss_lm": 0.021030144998803735, "loss_seg": 0.17904075235128403, "mean_token_accuracy": 0.9949372857809067, "num_tokens": 230797029.0, "step": 543 }, { "entropy": 0.024378865025937557, "epoch": 0.2381004486267644, "grad_norm": 10.625, "learning_rate": 2.9458581483486736e-05, "loss": 0.2513, "loss_lm": 0.021024833898991346, "loss_seg": 0.2302764542400837, "mean_token_accuracy": 0.9949104487895966, "num_tokens": 231221801.0, "step": 544 }, { "entropy": 0.023870328441262245, "epoch": 0.2385381332749754, "grad_norm": 11.6875, "learning_rate": 2.945587439090417e-05, "loss": 0.135, "loss_lm": 0.01901532243937254, "loss_seg": 0.11594058386981487, "mean_token_accuracy": 0.995104968547821, "num_tokens": 231646634.0, "step": 545 }, { "entropy": 0.023534548934549093, "epoch": 0.23897581792318634, "grad_norm": 9.625, "learning_rate": 2.9453167298321603e-05, "loss": 0.2024, "loss_lm": 0.01589152403175831, "loss_seg": 0.18649992160499096, "mean_token_accuracy": 0.9952166378498077, "num_tokens": 232070972.0, "step": 546 }, { "entropy": 0.024360076058655977, "epoch": 0.23941350257139732, "grad_norm": 10.4375, "learning_rate": 2.9450460205739037e-05, "loss": 0.1743, "loss_lm": 0.01679499656893313, "loss_seg": 0.15746259689331055, "mean_token_accuracy": 0.9950711578130722, "num_tokens": 232496600.0, "step": 547 }, { "entropy": 0.025340807624161243, "epoch": 0.23985118721960827, "grad_norm": 18.125, "learning_rate": 2.944775311315647e-05, "loss": 0.1643, "loss_lm": 0.01883407775312662, "loss_seg": 0.14544817432761192, "mean_token_accuracy": 0.9948838949203491, "num_tokens": 232921521.0, "step": 548 }, { "entropy": 0.024508086033165455, "epoch": 0.24028887186781925, "grad_norm": 10.6875, "learning_rate": 2.9445046020573904e-05, "loss": 0.1764, "loss_lm": 0.017949865432456136, "loss_seg": 0.15846813283860683, "mean_token_accuracy": 0.9949609786272049, "num_tokens": 233346743.0, "step": 549 }, { "entropy": 0.02461986616253853, "epoch": 0.2407265565160302, "grad_norm": 18.75, "learning_rate": 2.9442338927991338e-05, "loss": 0.1699, "loss_lm": 0.020810198038816452, "loss_seg": 0.14908289909362793, "mean_token_accuracy": 0.9949000626802444, "num_tokens": 233772033.0, "step": 550 }, { "entropy": 0.024310939013957977, "epoch": 0.24116424116424118, "grad_norm": 8.25, "learning_rate": 2.9439631835408772e-05, "loss": 0.1874, "loss_lm": 0.019519150257110596, "loss_seg": 0.16783862188458443, "mean_token_accuracy": 0.9950881600379944, "num_tokens": 234197287.0, "step": 551 }, { "entropy": 0.024341030046343803, "epoch": 0.24160192581245213, "grad_norm": 12.625, "learning_rate": 2.9436924742826206e-05, "loss": 0.2479, "loss_lm": 0.02010990772396326, "loss_seg": 0.22775911167263985, "mean_token_accuracy": 0.995009183883667, "num_tokens": 234622459.0, "step": 552 }, { "entropy": 0.02450489066541195, "epoch": 0.24203961046066308, "grad_norm": 13.0625, "learning_rate": 2.943421765024364e-05, "loss": 0.2475, "loss_lm": 0.016599260736256838, "loss_seg": 0.23089115507900715, "mean_token_accuracy": 0.9950418025255203, "num_tokens": 235046964.0, "step": 553 }, { "entropy": 0.023804196622222662, "epoch": 0.24247729510887406, "grad_norm": 12.0, "learning_rate": 2.9431510557661073e-05, "loss": 0.2038, "loss_lm": 0.017147993203252554, "loss_seg": 0.18664206564426422, "mean_token_accuracy": 0.9951944351196289, "num_tokens": 235471644.0, "step": 554 }, { "entropy": 0.023921103682368994, "epoch": 0.242914979757085, "grad_norm": 8.9375, "learning_rate": 2.9428803465078507e-05, "loss": 0.1572, "loss_lm": 0.017224167939275503, "loss_seg": 0.13996345922350883, "mean_token_accuracy": 0.9951624423265457, "num_tokens": 235896842.0, "step": 555 }, { "entropy": 0.023406905587762594, "epoch": 0.243352664405296, "grad_norm": 7.59375, "learning_rate": 2.942609637249594e-05, "loss": 0.1442, "loss_lm": 0.017070555360987782, "loss_seg": 0.12709687557071447, "mean_token_accuracy": 0.9951697587966919, "num_tokens": 236321601.0, "step": 556 }, { "entropy": 0.023512690793722868, "epoch": 0.24379034905350694, "grad_norm": 15.3125, "learning_rate": 2.9423389279913374e-05, "loss": 0.2124, "loss_lm": 0.018818584503605962, "loss_seg": 0.1935972012579441, "mean_token_accuracy": 0.9950921684503555, "num_tokens": 236746409.0, "step": 557 }, { "entropy": 0.024009275250136852, "epoch": 0.24422803370171792, "grad_norm": 9.3125, "learning_rate": 2.9420682187330808e-05, "loss": 0.2154, "loss_lm": 0.018175242934376, "loss_seg": 0.19718663021922112, "mean_token_accuracy": 0.9950221180915833, "num_tokens": 237171466.0, "step": 558 }, { "entropy": 0.023967329878360033, "epoch": 0.24466571834992887, "grad_norm": 10.9375, "learning_rate": 2.941797509474824e-05, "loss": 0.2153, "loss_lm": 0.019536982756108046, "loss_seg": 0.19577623903751373, "mean_token_accuracy": 0.9950368702411652, "num_tokens": 237596973.0, "step": 559 }, { "entropy": 0.023870645090937614, "epoch": 0.24510340299813985, "grad_norm": 12.375, "learning_rate": 2.9415268002165675e-05, "loss": 0.1873, "loss_lm": 0.016646328382194042, "loss_seg": 0.17070217244327068, "mean_token_accuracy": 0.9951527714729309, "num_tokens": 238021094.0, "step": 560 }, { "entropy": 0.023374248761683702, "epoch": 0.2455410876463508, "grad_norm": 11.625, "learning_rate": 2.941256090958311e-05, "loss": 0.2496, "loss_lm": 0.01894111162982881, "loss_seg": 0.2306918427348137, "mean_token_accuracy": 0.9950501620769501, "num_tokens": 238446712.0, "step": 561 }, { "entropy": 0.02335577132180333, "epoch": 0.24597877229456178, "grad_norm": 6.375, "learning_rate": 2.9409853817000543e-05, "loss": 0.1822, "loss_lm": 0.018553839763626456, "loss_seg": 0.16362452134490013, "mean_token_accuracy": 0.9952176511287689, "num_tokens": 238871591.0, "step": 562 }, { "entropy": 0.023760019801557064, "epoch": 0.24641645694277273, "grad_norm": 16.875, "learning_rate": 2.9407146724417973e-05, "loss": 0.1903, "loss_lm": 0.018120256951078773, "loss_seg": 0.1722297016531229, "mean_token_accuracy": 0.9950219392776489, "num_tokens": 239296840.0, "step": 563 }, { "entropy": 0.023272115737199783, "epoch": 0.2468541415909837, "grad_norm": 10.5, "learning_rate": 2.940443963183541e-05, "loss": 0.1166, "loss_lm": 0.015280786203220487, "loss_seg": 0.1013689637184143, "mean_token_accuracy": 0.9951379597187042, "num_tokens": 239722138.0, "step": 564 }, { "entropy": 0.02304071979597211, "epoch": 0.24729182623919466, "grad_norm": 8.9375, "learning_rate": 2.9401732539252844e-05, "loss": 0.1762, "loss_lm": 0.01708412542939186, "loss_seg": 0.15915543399751186, "mean_token_accuracy": 0.9952487498521805, "num_tokens": 240147876.0, "step": 565 }, { "entropy": 0.023223054595291615, "epoch": 0.24772951088740564, "grad_norm": 8.8125, "learning_rate": 2.9399025446670278e-05, "loss": 0.2113, "loss_lm": 0.018052953761070967, "loss_seg": 0.19321588799357414, "mean_token_accuracy": 0.9950773268938065, "num_tokens": 240572821.0, "step": 566 }, { "entropy": 0.023823156487196684, "epoch": 0.24816719553561659, "grad_norm": 18.625, "learning_rate": 2.9396318354087708e-05, "loss": 0.1891, "loss_lm": 0.019582398235797882, "loss_seg": 0.1695595309138298, "mean_token_accuracy": 0.9950233697891235, "num_tokens": 240997970.0, "step": 567 }, { "entropy": 0.023696333169937134, "epoch": 0.24860488018382756, "grad_norm": 9.1875, "learning_rate": 2.9393611261505142e-05, "loss": 0.1931, "loss_lm": 0.015576607547700405, "loss_seg": 0.17747585102915764, "mean_token_accuracy": 0.9950835257768631, "num_tokens": 241422625.0, "step": 568 }, { "entropy": 0.024225213564932346, "epoch": 0.24904256483203852, "grad_norm": 13.8125, "learning_rate": 2.939090416892258e-05, "loss": 0.1732, "loss_lm": 0.019611913710832596, "loss_seg": 0.15357330068945885, "mean_token_accuracy": 0.9948397725820541, "num_tokens": 241847794.0, "step": 569 }, { "entropy": 0.023303732741624117, "epoch": 0.2494802494802495, "grad_norm": 12.875, "learning_rate": 2.9388197076340013e-05, "loss": 0.1989, "loss_lm": 0.017329582944512367, "loss_seg": 0.1815519966185093, "mean_token_accuracy": 0.9951671957969666, "num_tokens": 242272788.0, "step": 570 }, { "entropy": 0.023039519786834717, "epoch": 0.24991793412846044, "grad_norm": 12.6875, "learning_rate": 2.9385489983757446e-05, "loss": 0.1991, "loss_lm": 0.01583169400691986, "loss_seg": 0.18330790102481842, "mean_token_accuracy": 0.9952578544616699, "num_tokens": 242698302.0, "step": 571 }, { "entropy": 0.02381030796095729, "epoch": 0.2503556187766714, "grad_norm": 9.0, "learning_rate": 2.9382782891174877e-05, "loss": 0.1551, "loss_lm": 0.01927150785923004, "loss_seg": 0.1357875969260931, "mean_token_accuracy": 0.9950026124715805, "num_tokens": 243123685.0, "step": 572 }, { "entropy": 0.023487033788114786, "epoch": 0.2507933034248824, "grad_norm": 5.90625, "learning_rate": 2.938007579859231e-05, "loss": 0.1543, "loss_lm": 0.017539660446345806, "loss_seg": 0.13673968985676765, "mean_token_accuracy": 0.995117798447609, "num_tokens": 243548870.0, "step": 573 }, { "entropy": 0.02260364033281803, "epoch": 0.25123098807309335, "grad_norm": 8.0625, "learning_rate": 2.9377368706009748e-05, "loss": 0.2212, "loss_lm": 0.01763667818158865, "loss_seg": 0.2035197652876377, "mean_token_accuracy": 0.9952974915504456, "num_tokens": 243973289.0, "step": 574 }, { "entropy": 0.022791238501667976, "epoch": 0.2516686727213043, "grad_norm": 11.125, "learning_rate": 2.937466161342718e-05, "loss": 0.1444, "loss_lm": 0.015149796614423394, "loss_seg": 0.12926005013287067, "mean_token_accuracy": 0.9951637834310532, "num_tokens": 244398390.0, "step": 575 }, { "entropy": 0.023256669752299786, "epoch": 0.25210635736951525, "grad_norm": 13.5625, "learning_rate": 2.9371954520844615e-05, "loss": 0.2461, "loss_lm": 0.01815605885349214, "loss_seg": 0.22791645675897598, "mean_token_accuracy": 0.9950995296239853, "num_tokens": 244822604.0, "step": 576 }, { "entropy": 0.022502969950437546, "epoch": 0.25254404201772623, "grad_norm": 10.5625, "learning_rate": 2.9369247428262045e-05, "loss": 0.204, "loss_lm": 0.016573916655033827, "loss_seg": 0.18743716180324554, "mean_token_accuracy": 0.99526646733284, "num_tokens": 245247419.0, "step": 577 }, { "entropy": 0.02393568493425846, "epoch": 0.2529817266659372, "grad_norm": 10.9375, "learning_rate": 2.936654033567948e-05, "loss": 0.1553, "loss_lm": 0.021887274459004402, "loss_seg": 0.13341915514320135, "mean_token_accuracy": 0.9949803501367569, "num_tokens": 245672482.0, "step": 578 }, { "entropy": 0.023259182460606098, "epoch": 0.25341941131414814, "grad_norm": 20.125, "learning_rate": 2.9363833243096916e-05, "loss": 0.1658, "loss_lm": 0.016432642238214612, "loss_seg": 0.14935682341456413, "mean_token_accuracy": 0.9951998442411423, "num_tokens": 246098670.0, "step": 579 }, { "entropy": 0.022988830227404833, "epoch": 0.2538570959623591, "grad_norm": 11.3125, "learning_rate": 2.936112615051435e-05, "loss": 0.1472, "loss_lm": 0.01683327602222562, "loss_seg": 0.1303722783923149, "mean_token_accuracy": 0.9950698465108871, "num_tokens": 246523153.0, "step": 580 }, { "entropy": 0.02263589669018984, "epoch": 0.2542947806105701, "grad_norm": 6.3125, "learning_rate": 2.9358419057931784e-05, "loss": 0.2214, "loss_lm": 0.017171377316117287, "loss_seg": 0.20426731556653976, "mean_token_accuracy": 0.9951275438070297, "num_tokens": 246947132.0, "step": 581 }, { "entropy": 0.022825120016932487, "epoch": 0.25473246525878107, "grad_norm": 9.5625, "learning_rate": 2.9355711965349214e-05, "loss": 0.1491, "loss_lm": 0.01615951070562005, "loss_seg": 0.13289815187454224, "mean_token_accuracy": 0.9951589852571487, "num_tokens": 247372213.0, "step": 582 }, { "entropy": 0.02324873022735119, "epoch": 0.255170149906992, "grad_norm": 15.5, "learning_rate": 2.9353004872766648e-05, "loss": 0.2225, "loss_lm": 0.018758831545710564, "loss_seg": 0.20376593619585037, "mean_token_accuracy": 0.9952298551797867, "num_tokens": 247797074.0, "step": 583 }, { "entropy": 0.023211820982396603, "epoch": 0.255607834555203, "grad_norm": 9.9375, "learning_rate": 2.9350297780184085e-05, "loss": 0.2057, "loss_lm": 0.015578970080241561, "loss_seg": 0.19014842621982098, "mean_token_accuracy": 0.9952033013105392, "num_tokens": 248222747.0, "step": 584 }, { "entropy": 0.023489306215196848, "epoch": 0.25604551920341395, "grad_norm": 6.65625, "learning_rate": 2.934759068760152e-05, "loss": 0.1917, "loss_lm": 0.017507408745586872, "loss_seg": 0.17416168935596943, "mean_token_accuracy": 0.995020791888237, "num_tokens": 248648238.0, "step": 585 }, { "entropy": 0.023790621664375067, "epoch": 0.25648320385162493, "grad_norm": 24.875, "learning_rate": 2.9344883595018952e-05, "loss": 0.188, "loss_lm": 0.01872386084869504, "loss_seg": 0.1693129576742649, "mean_token_accuracy": 0.995062068104744, "num_tokens": 249073280.0, "step": 586 }, { "entropy": 0.023451057728379965, "epoch": 0.25692088849983585, "grad_norm": 13.375, "learning_rate": 2.9342176502436383e-05, "loss": 0.1489, "loss_lm": 0.018826513085514307, "loss_seg": 0.13006752729415894, "mean_token_accuracy": 0.995148703455925, "num_tokens": 249499135.0, "step": 587 }, { "entropy": 0.023645215202122927, "epoch": 0.25735857314804683, "grad_norm": 33.75, "learning_rate": 2.9339469409853816e-05, "loss": 0.2113, "loss_lm": 0.014987727859988809, "loss_seg": 0.1963346116244793, "mean_token_accuracy": 0.9949295073747635, "num_tokens": 249924705.0, "step": 588 }, { "entropy": 0.023422122467309237, "epoch": 0.2577962577962578, "grad_norm": 12.0625, "learning_rate": 2.9336762317271253e-05, "loss": 0.1686, "loss_lm": 0.01888796128332615, "loss_seg": 0.14971864223480225, "mean_token_accuracy": 0.9950240403413773, "num_tokens": 250349925.0, "step": 589 }, { "entropy": 0.0230856710113585, "epoch": 0.25823394244446873, "grad_norm": 10.1875, "learning_rate": 2.9334055224688687e-05, "loss": 0.2317, "loss_lm": 0.021569815929979086, "loss_seg": 0.2100805751979351, "mean_token_accuracy": 0.9952144473791122, "num_tokens": 250773645.0, "step": 590 }, { "entropy": 0.0229633548296988, "epoch": 0.2586716270926797, "grad_norm": 9.9375, "learning_rate": 2.9331348132106118e-05, "loss": 0.2026, "loss_lm": 0.018767755944281816, "loss_seg": 0.1838458515703678, "mean_token_accuracy": 0.9951402395963669, "num_tokens": 251198740.0, "step": 591 }, { "entropy": 0.023228432051837444, "epoch": 0.2591093117408907, "grad_norm": 16.375, "learning_rate": 2.932864103952355e-05, "loss": 0.1682, "loss_lm": 0.01664877124130726, "loss_seg": 0.15158863365650177, "mean_token_accuracy": 0.9951690584421158, "num_tokens": 251624295.0, "step": 592 }, { "entropy": 0.02297613024711609, "epoch": 0.25954699638910167, "grad_norm": 9.1875, "learning_rate": 2.9325933946940985e-05, "loss": 0.1675, "loss_lm": 0.018275770358741283, "loss_seg": 0.14919861406087875, "mean_token_accuracy": 0.9950846880674362, "num_tokens": 252049848.0, "step": 593 }, { "entropy": 0.02252402203157544, "epoch": 0.2599846810373126, "grad_norm": 7.96875, "learning_rate": 2.932322685435842e-05, "loss": 0.2006, "loss_lm": 0.020312054082751274, "loss_seg": 0.1803131103515625, "mean_token_accuracy": 0.9952635914087296, "num_tokens": 252474599.0, "step": 594 }, { "entropy": 0.02324205543845892, "epoch": 0.26042236568552357, "grad_norm": 9.9375, "learning_rate": 2.9320519761775856e-05, "loss": 0.1179, "loss_lm": 0.016698748571798205, "loss_seg": 0.10116507858037949, "mean_token_accuracy": 0.9949585348367691, "num_tokens": 252899850.0, "step": 595 }, { "entropy": 0.023554982617497444, "epoch": 0.26086005033373455, "grad_norm": 12.9375, "learning_rate": 2.9317812669193286e-05, "loss": 0.1393, "loss_lm": 0.01902699563652277, "loss_seg": 0.1202947311103344, "mean_token_accuracy": 0.9948181211948395, "num_tokens": 253324990.0, "step": 596 }, { "entropy": 0.022814366966485977, "epoch": 0.26129773498194553, "grad_norm": 11.6875, "learning_rate": 2.931510557661072e-05, "loss": 0.2136, "loss_lm": 0.016203941078856587, "loss_seg": 0.1973632648587227, "mean_token_accuracy": 0.9950972497463226, "num_tokens": 253749415.0, "step": 597 }, { "entropy": 0.022377113346010447, "epoch": 0.26173541963015645, "grad_norm": 14.8125, "learning_rate": 2.9312398484028154e-05, "loss": 0.215, "loss_lm": 0.020153888035565615, "loss_seg": 0.19482707045972347, "mean_token_accuracy": 0.9952087551355362, "num_tokens": 254173706.0, "step": 598 }, { "entropy": 0.02290955651551485, "epoch": 0.26217310427836743, "grad_norm": 12.125, "learning_rate": 2.9309691391445587e-05, "loss": 0.2059, "loss_lm": 0.017597011290490627, "loss_seg": 0.1882961057126522, "mean_token_accuracy": 0.9950179755687714, "num_tokens": 254599223.0, "step": 599 }, { "entropy": 0.022710770834237337, "epoch": 0.2626107889265784, "grad_norm": 8.6875, "learning_rate": 2.9306984298863024e-05, "loss": 0.1689, "loss_lm": 0.01497908984310925, "loss_seg": 0.15396611206233501, "mean_token_accuracy": 0.9950969219207764, "num_tokens": 255024642.0, "step": 600 }, { "entropy": 0.022051696199923754, "epoch": 0.2630484735747894, "grad_norm": 23.875, "learning_rate": 2.9304277206280455e-05, "loss": 0.1603, "loss_lm": 0.01566334185190499, "loss_seg": 0.1446521356701851, "mean_token_accuracy": 0.995252400636673, "num_tokens": 255450058.0, "step": 601 }, { "entropy": 0.02253169985488057, "epoch": 0.2634861582230003, "grad_norm": 9.4375, "learning_rate": 2.930157011369789e-05, "loss": 0.1987, "loss_lm": 0.01571538532152772, "loss_seg": 0.18294399790465832, "mean_token_accuracy": 0.9952473193407059, "num_tokens": 255875462.0, "step": 602 }, { "entropy": 0.02228256082162261, "epoch": 0.2639238428712113, "grad_norm": 8.125, "learning_rate": 2.9298863021115322e-05, "loss": 0.1947, "loss_lm": 0.016442768275737762, "loss_seg": 0.1782997827976942, "mean_token_accuracy": 0.9952788203954697, "num_tokens": 256299246.0, "step": 603 }, { "entropy": 0.02255235379561782, "epoch": 0.26436152751942227, "grad_norm": 14.5625, "learning_rate": 2.9296155928532756e-05, "loss": 0.182, "loss_lm": 0.016196672571823, "loss_seg": 0.16584064811468124, "mean_token_accuracy": 0.9951975047588348, "num_tokens": 256725242.0, "step": 604 }, { "entropy": 0.022813405841588974, "epoch": 0.26479921216763325, "grad_norm": 10.5625, "learning_rate": 2.9293448835950193e-05, "loss": 0.1555, "loss_lm": 0.01669665426015854, "loss_seg": 0.13880806230008602, "mean_token_accuracy": 0.9949975609779358, "num_tokens": 257150216.0, "step": 605 }, { "entropy": 0.023222526535391808, "epoch": 0.26523689681584417, "grad_norm": 15.9375, "learning_rate": 2.9290741743367623e-05, "loss": 0.2034, "loss_lm": 0.021338910795748234, "loss_seg": 0.1820111684501171, "mean_token_accuracy": 0.9950002878904343, "num_tokens": 257575430.0, "step": 606 }, { "entropy": 0.022594839334487915, "epoch": 0.26567458146405515, "grad_norm": 9.375, "learning_rate": 2.9288034650785057e-05, "loss": 0.1573, "loss_lm": 0.017119472846388817, "loss_seg": 0.14013129845261574, "mean_token_accuracy": 0.995130866765976, "num_tokens": 257999662.0, "step": 607 }, { "entropy": 0.02298501692712307, "epoch": 0.2661122661122661, "grad_norm": 10.1875, "learning_rate": 2.928532755820249e-05, "loss": 0.2197, "loss_lm": 0.016418117098510265, "loss_seg": 0.20324559323489666, "mean_token_accuracy": 0.9951234459877014, "num_tokens": 258424933.0, "step": 608 }, { "entropy": 0.023025284986943007, "epoch": 0.26654995076047705, "grad_norm": 9.5625, "learning_rate": 2.9282620465619925e-05, "loss": 0.1676, "loss_lm": 0.01863309694454074, "loss_seg": 0.14894281327724457, "mean_token_accuracy": 0.9949889183044434, "num_tokens": 258849585.0, "step": 609 }, { "entropy": 0.02251780964434147, "epoch": 0.26698763540868803, "grad_norm": 16.25, "learning_rate": 2.9279913373037362e-05, "loss": 0.1702, "loss_lm": 0.017262737965211272, "loss_seg": 0.15292540937662125, "mean_token_accuracy": 0.9952356219291687, "num_tokens": 259274169.0, "step": 610 }, { "entropy": 0.02301188837736845, "epoch": 0.267425320056899, "grad_norm": 6.5625, "learning_rate": 2.9277206280454792e-05, "loss": 0.1879, "loss_lm": 0.018401710782200098, "loss_seg": 0.16954576969146729, "mean_token_accuracy": 0.9950579404830933, "num_tokens": 259699857.0, "step": 611 }, { "entropy": 0.0224439506419003, "epoch": 0.26786300470511, "grad_norm": 8.75, "learning_rate": 2.9274499187872226e-05, "loss": 0.2093, "loss_lm": 0.017835235688835382, "loss_seg": 0.1915074959397316, "mean_token_accuracy": 0.9952681362628937, "num_tokens": 260125026.0, "step": 612 }, { "entropy": 0.02195564517751336, "epoch": 0.2683006893533209, "grad_norm": 12.875, "learning_rate": 2.927179209528966e-05, "loss": 0.2235, "loss_lm": 0.01679299771785736, "loss_seg": 0.2066983263939619, "mean_token_accuracy": 0.9952960014343262, "num_tokens": 260550567.0, "step": 613 }, { "entropy": 0.022960909642279148, "epoch": 0.2687383740015319, "grad_norm": 20.5, "learning_rate": 2.9269085002707093e-05, "loss": 0.1677, "loss_lm": 0.014256829395890236, "loss_seg": 0.15342085249722004, "mean_token_accuracy": 0.9949604570865631, "num_tokens": 260974817.0, "step": 614 }, { "entropy": 0.021758376620709896, "epoch": 0.26917605864974287, "grad_norm": 11.1875, "learning_rate": 2.9266377910124527e-05, "loss": 0.1677, "loss_lm": 0.018361408729106188, "loss_seg": 0.14937179908156395, "mean_token_accuracy": 0.9953179955482483, "num_tokens": 261400232.0, "step": 615 }, { "entropy": 0.021876432932913303, "epoch": 0.26961374329795385, "grad_norm": 12.125, "learning_rate": 2.926367081754196e-05, "loss": 0.1683, "loss_lm": 0.016452837735414505, "loss_seg": 0.15187164768576622, "mean_token_accuracy": 0.9951593428850174, "num_tokens": 261825153.0, "step": 616 }, { "entropy": 0.023408294189721346, "epoch": 0.27005142794616477, "grad_norm": 9.75, "learning_rate": 2.9260963724959394e-05, "loss": 0.1688, "loss_lm": 0.0168170053511858, "loss_seg": 0.15198884531855583, "mean_token_accuracy": 0.9948347061872482, "num_tokens": 262250667.0, "step": 617 }, { "entropy": 0.02283572917804122, "epoch": 0.27048911259437575, "grad_norm": 13.375, "learning_rate": 2.9258256632376828e-05, "loss": 0.1272, "loss_lm": 0.017501406837254763, "loss_seg": 0.10970447212457657, "mean_token_accuracy": 0.9950944483280182, "num_tokens": 262675059.0, "step": 618 }, { "entropy": 0.02196045173332095, "epoch": 0.2709267972425867, "grad_norm": 12.125, "learning_rate": 2.9255549539794262e-05, "loss": 0.1242, "loss_lm": 0.01654084725305438, "loss_seg": 0.10762352123856544, "mean_token_accuracy": 0.9952043145895004, "num_tokens": 263099783.0, "step": 619 }, { "entropy": 0.02166660176590085, "epoch": 0.2713644818907977, "grad_norm": 10.125, "learning_rate": 2.9252842447211696e-05, "loss": 0.1269, "loss_lm": 0.016881871270015836, "loss_seg": 0.1100447028875351, "mean_token_accuracy": 0.9952165931463242, "num_tokens": 263524968.0, "step": 620 }, { "entropy": 0.021963307168334723, "epoch": 0.2718021665390086, "grad_norm": 21.375, "learning_rate": 2.925013535462913e-05, "loss": 0.1806, "loss_lm": 0.01763403811492026, "loss_seg": 0.16301263868808746, "mean_token_accuracy": 0.9952371418476105, "num_tokens": 263950489.0, "step": 621 }, { "entropy": 0.02265307866036892, "epoch": 0.2722398511872196, "grad_norm": 18.25, "learning_rate": 2.9247428262046563e-05, "loss": 0.2331, "loss_lm": 0.018802838400006294, "loss_seg": 0.21430488303303719, "mean_token_accuracy": 0.9950903058052063, "num_tokens": 264375332.0, "step": 622 }, { "entropy": 0.022182156331837177, "epoch": 0.2726775358354306, "grad_norm": 6.03125, "learning_rate": 2.9244721169463997e-05, "loss": 0.1645, "loss_lm": 0.017353077419102192, "loss_seg": 0.1471769642084837, "mean_token_accuracy": 0.9951628893613815, "num_tokens": 264800549.0, "step": 623 }, { "entropy": 0.022637874353677034, "epoch": 0.27311522048364156, "grad_norm": 41.0, "learning_rate": 2.924201407688143e-05, "loss": 0.1938, "loss_lm": 0.01780374674126506, "loss_seg": 0.1760280877351761, "mean_token_accuracy": 0.9950032234191895, "num_tokens": 265225175.0, "step": 624 }, { "entropy": 0.022295447997748852, "epoch": 0.2735529051318525, "grad_norm": 14.875, "learning_rate": 2.923930698429886e-05, "loss": 0.1816, "loss_lm": 0.016969831194728613, "loss_seg": 0.16458317451179028, "mean_token_accuracy": 0.9951911419630051, "num_tokens": 265649985.0, "step": 625 }, { "entropy": 0.022266047541052103, "epoch": 0.27399058978006346, "grad_norm": 11.8125, "learning_rate": 2.9236599891716298e-05, "loss": 0.192, "loss_lm": 0.01546034635975957, "loss_seg": 0.17653338238596916, "mean_token_accuracy": 0.9951686561107635, "num_tokens": 266074685.0, "step": 626 }, { "entropy": 0.021968921646475792, "epoch": 0.27442827442827444, "grad_norm": 11.9375, "learning_rate": 2.923389279913373e-05, "loss": 0.1921, "loss_lm": 0.01649228041060269, "loss_seg": 0.17558671720325947, "mean_token_accuracy": 0.9952634125947952, "num_tokens": 266499221.0, "step": 627 }, { "entropy": 0.02206144155934453, "epoch": 0.27486595907648537, "grad_norm": 8.375, "learning_rate": 2.9231185706551165e-05, "loss": 0.1806, "loss_lm": 0.017426148522645235, "loss_seg": 0.16320085898041725, "mean_token_accuracy": 0.9951926469802856, "num_tokens": 266923941.0, "step": 628 }, { "entropy": 0.022488984279334545, "epoch": 0.27530364372469635, "grad_norm": 11.1875, "learning_rate": 2.92284786139686e-05, "loss": 0.2074, "loss_lm": 0.018963076639920473, "loss_seg": 0.18846803531050682, "mean_token_accuracy": 0.9950586408376694, "num_tokens": 267348912.0, "step": 629 }, { "entropy": 0.022219244856387377, "epoch": 0.2757413283729073, "grad_norm": 14.125, "learning_rate": 2.922577152138603e-05, "loss": 0.209, "loss_lm": 0.018280646298080683, "loss_seg": 0.1906887013465166, "mean_token_accuracy": 0.9951969236135483, "num_tokens": 267773604.0, "step": 630 }, { "entropy": 0.02208399772644043, "epoch": 0.2761790130211183, "grad_norm": 10.0, "learning_rate": 2.9223064428803467e-05, "loss": 0.1564, "loss_lm": 0.018663568422198296, "loss_seg": 0.1377532221376896, "mean_token_accuracy": 0.9950009435415268, "num_tokens": 268198680.0, "step": 631 }, { "entropy": 0.022358245216310024, "epoch": 0.2766166976693292, "grad_norm": 9.625, "learning_rate": 2.92203573362209e-05, "loss": 0.1641, "loss_lm": 0.016046260483562946, "loss_seg": 0.14802229590713978, "mean_token_accuracy": 0.9951290041208267, "num_tokens": 268623633.0, "step": 632 }, { "entropy": 0.02188201155513525, "epoch": 0.2770543823175402, "grad_norm": 9.125, "learning_rate": 2.9217650243638334e-05, "loss": 0.2151, "loss_lm": 0.016263524536043406, "loss_seg": 0.1988377571105957, "mean_token_accuracy": 0.9951429218053818, "num_tokens": 269048747.0, "step": 633 }, { "entropy": 0.021976042538881302, "epoch": 0.2774920669657512, "grad_norm": 7.375, "learning_rate": 2.9214943151055764e-05, "loss": 0.2424, "loss_lm": 0.01633937656879425, "loss_seg": 0.226046621799469, "mean_token_accuracy": 0.9952097833156586, "num_tokens": 269473950.0, "step": 634 }, { "entropy": 0.022051603998988867, "epoch": 0.27792975161396216, "grad_norm": 13.125, "learning_rate": 2.9212236058473198e-05, "loss": 0.1612, "loss_lm": 0.01821007556281984, "loss_seg": 0.14302534237504005, "mean_token_accuracy": 0.9952223747968674, "num_tokens": 269898606.0, "step": 635 }, { "entropy": 0.022108348552137613, "epoch": 0.2783674362621731, "grad_norm": 13.9375, "learning_rate": 2.9209528965890635e-05, "loss": 0.1067, "loss_lm": 0.015312144765630364, "loss_seg": 0.09134986437857151, "mean_token_accuracy": 0.9951605051755905, "num_tokens": 270323428.0, "step": 636 }, { "entropy": 0.02204666333273053, "epoch": 0.27880512091038406, "grad_norm": 9.3125, "learning_rate": 2.920682187330807e-05, "loss": 0.247, "loss_lm": 0.018825853941962123, "loss_seg": 0.2281294632703066, "mean_token_accuracy": 0.995155081152916, "num_tokens": 270748167.0, "step": 637 }, { "entropy": 0.022968601435422897, "epoch": 0.27924280555859504, "grad_norm": 15.25, "learning_rate": 2.9204114780725503e-05, "loss": 0.1706, "loss_lm": 0.017161044059321284, "loss_seg": 0.15346709452569485, "mean_token_accuracy": 0.994946151971817, "num_tokens": 271172763.0, "step": 638 }, { "entropy": 0.022526861168444157, "epoch": 0.279680490206806, "grad_norm": 11.9375, "learning_rate": 2.9201407688142933e-05, "loss": 0.2113, "loss_lm": 0.01765883923508227, "loss_seg": 0.19368476793169975, "mean_token_accuracy": 0.9950684309005737, "num_tokens": 271598579.0, "step": 639 }, { "entropy": 0.02209475776180625, "epoch": 0.28011817485501694, "grad_norm": 18.125, "learning_rate": 2.9198700595560367e-05, "loss": 0.1851, "loss_lm": 0.014927820535376668, "loss_seg": 0.1702042557299137, "mean_token_accuracy": 0.9952916651964188, "num_tokens": 272023086.0, "step": 640 }, { "entropy": 0.02190723828971386, "epoch": 0.2805558595032279, "grad_norm": 6.90625, "learning_rate": 2.9195993502977804e-05, "loss": 0.2025, "loss_lm": 0.01724819396622479, "loss_seg": 0.18528410978615284, "mean_token_accuracy": 0.9952704757452011, "num_tokens": 272448458.0, "step": 641 }, { "entropy": 0.02292134240269661, "epoch": 0.2809935441514389, "grad_norm": 10.0625, "learning_rate": 2.9193286410395238e-05, "loss": 0.171, "loss_lm": 0.02058412879705429, "loss_seg": 0.15039334259927273, "mean_token_accuracy": 0.9950204491615295, "num_tokens": 272874167.0, "step": 642 }, { "entropy": 0.022699706722050905, "epoch": 0.2814312287996499, "grad_norm": 8.25, "learning_rate": 2.919057931781267e-05, "loss": 0.2422, "loss_lm": 0.019125587772578, "loss_seg": 0.22308050096035004, "mean_token_accuracy": 0.9949560910463333, "num_tokens": 273298818.0, "step": 643 }, { "entropy": 0.021776444744318724, "epoch": 0.2818689134478608, "grad_norm": 8.625, "learning_rate": 2.91878722252301e-05, "loss": 0.1897, "loss_lm": 0.015136783476918936, "loss_seg": 0.17452403530478477, "mean_token_accuracy": 0.995258092880249, "num_tokens": 273723931.0, "step": 644 }, { "entropy": 0.021798789035528898, "epoch": 0.2823065980960718, "grad_norm": 6.9375, "learning_rate": 2.9185165132647535e-05, "loss": 0.1795, "loss_lm": 0.01446553599089384, "loss_seg": 0.16503452882170677, "mean_token_accuracy": 0.9953828454017639, "num_tokens": 274149465.0, "step": 645 }, { "entropy": 0.02260319935157895, "epoch": 0.28274428274428276, "grad_norm": 14.75, "learning_rate": 2.9182458040064972e-05, "loss": 0.1711, "loss_lm": 0.017951970919966698, "loss_seg": 0.15314047038555145, "mean_token_accuracy": 0.9951075166463852, "num_tokens": 274574565.0, "step": 646 }, { "entropy": 0.02271709544584155, "epoch": 0.2831819673924937, "grad_norm": 13.875, "learning_rate": 2.9179750947482406e-05, "loss": 0.1702, "loss_lm": 0.019037841586396098, "loss_seg": 0.15119484718888998, "mean_token_accuracy": 0.9949644356966019, "num_tokens": 275000520.0, "step": 647 }, { "entropy": 0.02262835344299674, "epoch": 0.28361965204070466, "grad_norm": 12.0625, "learning_rate": 2.917704385489984e-05, "loss": 0.1659, "loss_lm": 0.016659746412187815, "loss_seg": 0.1492170635610819, "mean_token_accuracy": 0.9950819462537766, "num_tokens": 275426051.0, "step": 648 }, { "entropy": 0.023063411936163902, "epoch": 0.28405733668891564, "grad_norm": 24.625, "learning_rate": 2.917433676231727e-05, "loss": 0.1994, "loss_lm": 0.0180545502807945, "loss_seg": 0.181329982355237, "mean_token_accuracy": 0.9949983209371567, "num_tokens": 275851471.0, "step": 649 }, { "entropy": 0.022291335742920637, "epoch": 0.2844950213371266, "grad_norm": 13.3125, "learning_rate": 2.9171629669734704e-05, "loss": 0.1668, "loss_lm": 0.019044008571654558, "loss_seg": 0.1477748043835163, "mean_token_accuracy": 0.9951373934745789, "num_tokens": 276277140.0, "step": 650 }, { "entropy": 0.022571807261556387, "epoch": 0.28493270598533754, "grad_norm": 11.0, "learning_rate": 2.916892257715214e-05, "loss": 0.2136, "loss_lm": 0.015344776213169098, "loss_seg": 0.19821664690971375, "mean_token_accuracy": 0.9951476901769638, "num_tokens": 276702642.0, "step": 651 }, { "entropy": 0.02256446611136198, "epoch": 0.2853703906335485, "grad_norm": 8.9375, "learning_rate": 2.9166215484569575e-05, "loss": 0.2056, "loss_lm": 0.017713198671117425, "loss_seg": 0.1879071407020092, "mean_token_accuracy": 0.9950971007347107, "num_tokens": 277128062.0, "step": 652 }, { "entropy": 0.02310395147651434, "epoch": 0.2858080752817595, "grad_norm": 14.9375, "learning_rate": 2.916350839198701e-05, "loss": 0.1749, "loss_lm": 0.016701094340533018, "loss_seg": 0.15819884091615677, "mean_token_accuracy": 0.9948804080486298, "num_tokens": 277553666.0, "step": 653 }, { "entropy": 0.02234091004356742, "epoch": 0.2862457599299705, "grad_norm": 12.5625, "learning_rate": 2.916080129940444e-05, "loss": 0.1803, "loss_lm": 0.01788058690726757, "loss_seg": 0.1624690182507038, "mean_token_accuracy": 0.9951940625905991, "num_tokens": 277978312.0, "step": 654 }, { "entropy": 0.021970988251268864, "epoch": 0.2866834445781814, "grad_norm": 11.9375, "learning_rate": 2.9158094206821873e-05, "loss": 0.221, "loss_lm": 0.018990427954122424, "loss_seg": 0.20205258578062057, "mean_token_accuracy": 0.9952029287815094, "num_tokens": 278403329.0, "step": 655 }, { "entropy": 0.02256141835823655, "epoch": 0.2871211292263924, "grad_norm": 9.0625, "learning_rate": 2.915538711423931e-05, "loss": 0.1807, "loss_lm": 0.017383655067533255, "loss_seg": 0.16330083459615707, "mean_token_accuracy": 0.9951260685920715, "num_tokens": 278828618.0, "step": 656 }, { "entropy": 0.022480028215795755, "epoch": 0.28755881387460336, "grad_norm": 9.625, "learning_rate": 2.9152680021656743e-05, "loss": 0.2092, "loss_lm": 0.01622269698418677, "loss_seg": 0.19295678846538067, "mean_token_accuracy": 0.995056539773941, "num_tokens": 279253395.0, "step": 657 }, { "entropy": 0.02212803764268756, "epoch": 0.28799649852281434, "grad_norm": 9.0, "learning_rate": 2.9149972929074174e-05, "loss": 0.1996, "loss_lm": 0.018154525198042393, "loss_seg": 0.18148010224103928, "mean_token_accuracy": 0.995111882686615, "num_tokens": 279678689.0, "step": 658 }, { "entropy": 0.02174836117774248, "epoch": 0.28843418317102526, "grad_norm": 12.0625, "learning_rate": 2.9147265836491608e-05, "loss": 0.1523, "loss_lm": 0.012767281150445342, "loss_seg": 0.1395745389163494, "mean_token_accuracy": 0.9951783716678619, "num_tokens": 280104459.0, "step": 659 }, { "entropy": 0.021890243515372276, "epoch": 0.28887186781923624, "grad_norm": 11.375, "learning_rate": 2.914455874390904e-05, "loss": 0.1826, "loss_lm": 0.016599325463175774, "loss_seg": 0.16604450717568398, "mean_token_accuracy": 0.9952358454465866, "num_tokens": 280529683.0, "step": 660 }, { "entropy": 0.022370937280356884, "epoch": 0.2893095524674472, "grad_norm": 9.6875, "learning_rate": 2.9141851651326475e-05, "loss": 0.1229, "loss_lm": 0.014340804424136877, "loss_seg": 0.10857336781919003, "mean_token_accuracy": 0.9951362013816833, "num_tokens": 280953814.0, "step": 661 }, { "entropy": 0.022040574811398983, "epoch": 0.28974723711565814, "grad_norm": 8.5, "learning_rate": 2.9139144558743912e-05, "loss": 0.158, "loss_lm": 0.015202231472358108, "loss_seg": 0.14276399090886116, "mean_token_accuracy": 0.9951831847429276, "num_tokens": 281378331.0, "step": 662 }, { "entropy": 0.022318535950034857, "epoch": 0.2901849217638691, "grad_norm": 16.375, "learning_rate": 2.9136437466161342e-05, "loss": 0.2139, "loss_lm": 0.01803103880956769, "loss_seg": 0.19591589830815792, "mean_token_accuracy": 0.9952303618192673, "num_tokens": 281803282.0, "step": 663 }, { "entropy": 0.021887517534196377, "epoch": 0.2906226064120801, "grad_norm": 11.4375, "learning_rate": 2.9133730373578776e-05, "loss": 0.1309, "loss_lm": 0.01665573217906058, "loss_seg": 0.11424781195819378, "mean_token_accuracy": 0.9952828586101532, "num_tokens": 282227854.0, "step": 664 }, { "entropy": 0.022805076092481613, "epoch": 0.2910602910602911, "grad_norm": 19.0, "learning_rate": 2.913102328099621e-05, "loss": 0.1832, "loss_lm": 0.014869402861222625, "loss_seg": 0.16835396364331245, "mean_token_accuracy": 0.9950045049190521, "num_tokens": 282653235.0, "step": 665 }, { "entropy": 0.021857398562133312, "epoch": 0.291497975708502, "grad_norm": 7.25, "learning_rate": 2.9128316188413644e-05, "loss": 0.1833, "loss_lm": 0.0169189244043082, "loss_seg": 0.1663987785577774, "mean_token_accuracy": 0.9952373057603836, "num_tokens": 283077733.0, "step": 666 }, { "entropy": 0.022835263051092625, "epoch": 0.291935660356713, "grad_norm": 8.5, "learning_rate": 2.912560909583108e-05, "loss": 0.2324, "loss_lm": 0.018719967687502503, "loss_seg": 0.2136843018233776, "mean_token_accuracy": 0.9949897974729538, "num_tokens": 283503050.0, "step": 667 }, { "entropy": 0.022279794327914715, "epoch": 0.29237334500492396, "grad_norm": 15.4375, "learning_rate": 2.912290200324851e-05, "loss": 0.2289, "loss_lm": 0.018925239332020283, "loss_seg": 0.2099763285368681, "mean_token_accuracy": 0.9950622022151947, "num_tokens": 283928314.0, "step": 668 }, { "entropy": 0.022135685198009014, "epoch": 0.29281102965313494, "grad_norm": 8.8125, "learning_rate": 2.9120194910665945e-05, "loss": 0.1205, "loss_lm": 0.017214721767231822, "loss_seg": 0.10332681611180305, "mean_token_accuracy": 0.9950976818799973, "num_tokens": 284353790.0, "step": 669 }, { "entropy": 0.022351460065692663, "epoch": 0.29324871430134586, "grad_norm": 8.25, "learning_rate": 2.911748781808338e-05, "loss": 0.2457, "loss_lm": 0.01805261359550059, "loss_seg": 0.22768738120794296, "mean_token_accuracy": 0.9949973672628403, "num_tokens": 284778341.0, "step": 670 }, { "entropy": 0.022688166238367558, "epoch": 0.29368639894955684, "grad_norm": 7.1875, "learning_rate": 2.9114780725500812e-05, "loss": 0.1622, "loss_lm": 0.021526161581277847, "loss_seg": 0.14065217226743698, "mean_token_accuracy": 0.9949948638677597, "num_tokens": 285203095.0, "step": 671 }, { "entropy": 0.022575873881578445, "epoch": 0.2941240835977678, "grad_norm": 23.375, "learning_rate": 2.911207363291825e-05, "loss": 0.2265, "loss_lm": 0.01780715142376721, "loss_seg": 0.20870140939950943, "mean_token_accuracy": 0.9950410574674606, "num_tokens": 285628790.0, "step": 672 }, { "entropy": 0.022377200424671173, "epoch": 0.2945617682459788, "grad_norm": 7.40625, "learning_rate": 2.910936654033568e-05, "loss": 0.1926, "loss_lm": 0.01627283589914441, "loss_seg": 0.17633825913071632, "mean_token_accuracy": 0.9951706826686859, "num_tokens": 286054278.0, "step": 673 }, { "entropy": 0.022134121973067522, "epoch": 0.2949994528941897, "grad_norm": 8.75, "learning_rate": 2.9106659447753113e-05, "loss": 0.1574, "loss_lm": 0.01648090104572475, "loss_seg": 0.140877990052104, "mean_token_accuracy": 0.9951300919055939, "num_tokens": 286479092.0, "step": 674 }, { "entropy": 0.022140560671687126, "epoch": 0.2954371375424007, "grad_norm": 14.125, "learning_rate": 2.9103952355170547e-05, "loss": 0.1707, "loss_lm": 0.01560154091566801, "loss_seg": 0.15506883524358273, "mean_token_accuracy": 0.9951194822788239, "num_tokens": 286903798.0, "step": 675 }, { "entropy": 0.023143111262470484, "epoch": 0.2958748221906117, "grad_norm": 15.0625, "learning_rate": 2.910124526258798e-05, "loss": 0.2266, "loss_lm": 0.022517261561006308, "loss_seg": 0.20404911413788795, "mean_token_accuracy": 0.9948773980140686, "num_tokens": 287328955.0, "step": 676 }, { "entropy": 0.021608212031424046, "epoch": 0.29631250683882265, "grad_norm": 19.25, "learning_rate": 2.9098538170005418e-05, "loss": 0.1645, "loss_lm": 0.014267179882153869, "loss_seg": 0.15018873661756516, "mean_token_accuracy": 0.995267316699028, "num_tokens": 287754465.0, "step": 677 }, { "entropy": 0.02213652292266488, "epoch": 0.2967501914870336, "grad_norm": 12.125, "learning_rate": 2.9095831077422848e-05, "loss": 0.1439, "loss_lm": 0.01729557802900672, "loss_seg": 0.1265939436852932, "mean_token_accuracy": 0.9950750321149826, "num_tokens": 288178999.0, "step": 678 }, { "entropy": 0.02150840638205409, "epoch": 0.29718787613524456, "grad_norm": 8.75, "learning_rate": 2.9093123984840282e-05, "loss": 0.1522, "loss_lm": 0.016216106479987502, "loss_seg": 0.1359937284141779, "mean_token_accuracy": 0.9952163100242615, "num_tokens": 288603735.0, "step": 679 }, { "entropy": 0.022421573754400015, "epoch": 0.29762556078345553, "grad_norm": 8.0625, "learning_rate": 2.9090416892257716e-05, "loss": 0.199, "loss_lm": 0.01601706398651004, "loss_seg": 0.1829371601343155, "mean_token_accuracy": 0.9949462115764618, "num_tokens": 289028148.0, "step": 680 }, { "entropy": 0.02166616264730692, "epoch": 0.29806324543166646, "grad_norm": 13.75, "learning_rate": 2.908770979967515e-05, "loss": 0.2016, "loss_lm": 0.017137867398560047, "loss_seg": 0.18449820205569267, "mean_token_accuracy": 0.9951374679803848, "num_tokens": 289454030.0, "step": 681 }, { "entropy": 0.022509572096168995, "epoch": 0.29850093007987744, "grad_norm": 10.1875, "learning_rate": 2.9085002707092583e-05, "loss": 0.1969, "loss_lm": 0.016578247537836432, "loss_seg": 0.1802905574440956, "mean_token_accuracy": 0.9950785338878632, "num_tokens": 289878881.0, "step": 682 }, { "entropy": 0.02219712920486927, "epoch": 0.2989386147280884, "grad_norm": 9.375, "learning_rate": 2.9082295614510017e-05, "loss": 0.1691, "loss_lm": 0.016003972152248025, "loss_seg": 0.15310689061880112, "mean_token_accuracy": 0.9950642287731171, "num_tokens": 290304125.0, "step": 683 }, { "entropy": 0.022421827539801598, "epoch": 0.2993762993762994, "grad_norm": 7.5, "learning_rate": 2.907958852192745e-05, "loss": 0.1697, "loss_lm": 0.020118277985602617, "loss_seg": 0.14953742362558842, "mean_token_accuracy": 0.9949076175689697, "num_tokens": 290729294.0, "step": 684 }, { "entropy": 0.022217226680368185, "epoch": 0.2998139840245103, "grad_norm": 12.3125, "learning_rate": 2.9076881429344884e-05, "loss": 0.205, "loss_lm": 0.017126586521044374, "loss_seg": 0.1878612320870161, "mean_token_accuracy": 0.9952004998922348, "num_tokens": 291154291.0, "step": 685 }, { "entropy": 0.02209206623956561, "epoch": 0.3002516686727213, "grad_norm": 11.5625, "learning_rate": 2.9074174336762318e-05, "loss": 0.2283, "loss_lm": 0.015806706622242928, "loss_seg": 0.21246623247861862, "mean_token_accuracy": 0.995123490691185, "num_tokens": 291579161.0, "step": 686 }, { "entropy": 0.022209457587450743, "epoch": 0.3006893533209323, "grad_norm": 14.0625, "learning_rate": 2.9071467244179752e-05, "loss": 0.1502, "loss_lm": 0.018050425685942173, "loss_seg": 0.13216684758663177, "mean_token_accuracy": 0.9951023459434509, "num_tokens": 292003824.0, "step": 687 }, { "entropy": 0.021636548917740583, "epoch": 0.30112703796914325, "grad_norm": 18.0, "learning_rate": 2.9068760151597186e-05, "loss": 0.1244, "loss_lm": 0.017071059439331293, "loss_seg": 0.10733472183346748, "mean_token_accuracy": 0.9952855110168457, "num_tokens": 292428226.0, "step": 688 }, { "entropy": 0.022360426373779774, "epoch": 0.3015647226173542, "grad_norm": 11.9375, "learning_rate": 2.906605305901462e-05, "loss": 0.2169, "loss_lm": 0.01759148226119578, "loss_seg": 0.1992894746363163, "mean_token_accuracy": 0.995142474770546, "num_tokens": 292853528.0, "step": 689 }, { "entropy": 0.022460936568677425, "epoch": 0.30200240726556515, "grad_norm": 14.125, "learning_rate": 2.9063345966432053e-05, "loss": 0.1656, "loss_lm": 0.0181603676173836, "loss_seg": 0.14746537059545517, "mean_token_accuracy": 0.9950270354747772, "num_tokens": 293278999.0, "step": 690 }, { "entropy": 0.022412229795008898, "epoch": 0.30244009191377613, "grad_norm": 8.625, "learning_rate": 2.9060638873849487e-05, "loss": 0.1581, "loss_lm": 0.019167311489582062, "loss_seg": 0.13890409842133522, "mean_token_accuracy": 0.9950539171695709, "num_tokens": 293704159.0, "step": 691 }, { "entropy": 0.02190632838755846, "epoch": 0.3028777765619871, "grad_norm": 10.0, "learning_rate": 2.9057931781266917e-05, "loss": 0.1575, "loss_lm": 0.015864582266658545, "loss_seg": 0.14167124778032303, "mean_token_accuracy": 0.9952758699655533, "num_tokens": 294128331.0, "step": 692 }, { "entropy": 0.022737638093531132, "epoch": 0.30331546121019803, "grad_norm": 7.84375, "learning_rate": 2.9055224688684354e-05, "loss": 0.1594, "loss_lm": 0.015967827988788486, "loss_seg": 0.1434265486896038, "mean_token_accuracy": 0.9949520975351334, "num_tokens": 294553449.0, "step": 693 }, { "entropy": 0.02237292192876339, "epoch": 0.303753145858409, "grad_norm": 6.84375, "learning_rate": 2.9052517596101788e-05, "loss": 0.2152, "loss_lm": 0.015751355327665806, "loss_seg": 0.19947363249957561, "mean_token_accuracy": 0.9951204210519791, "num_tokens": 294978053.0, "step": 694 }, { "entropy": 0.02217175718396902, "epoch": 0.30419083050662, "grad_norm": 8.0625, "learning_rate": 2.904981050351922e-05, "loss": 0.1762, "loss_lm": 0.01586743420921266, "loss_seg": 0.16028784215450287, "mean_token_accuracy": 0.9949750602245331, "num_tokens": 295403129.0, "step": 695 }, { "entropy": 0.021554852835834026, "epoch": 0.30462851515483097, "grad_norm": 19.25, "learning_rate": 2.9047103410936655e-05, "loss": 0.1406, "loss_lm": 0.017649599350988865, "loss_seg": 0.12298215739428997, "mean_token_accuracy": 0.9950412660837173, "num_tokens": 295828010.0, "step": 696 }, { "entropy": 0.022770472802221775, "epoch": 0.3050661998030419, "grad_norm": 13.8125, "learning_rate": 2.9044396318354086e-05, "loss": 0.191, "loss_lm": 0.01872115395963192, "loss_seg": 0.17227913439273834, "mean_token_accuracy": 0.9948073774576187, "num_tokens": 296253429.0, "step": 697 }, { "entropy": 0.021719202864915133, "epoch": 0.30550388445125287, "grad_norm": 6.6875, "learning_rate": 2.9041689225771523e-05, "loss": 0.1392, "loss_lm": 0.01506071723997593, "loss_seg": 0.12412536516785622, "mean_token_accuracy": 0.9951939433813095, "num_tokens": 296678651.0, "step": 698 }, { "entropy": 0.02160958433523774, "epoch": 0.30594156909946385, "grad_norm": 8.5, "learning_rate": 2.9038982133188957e-05, "loss": 0.2121, "loss_lm": 0.01786481193266809, "loss_seg": 0.19423159211874008, "mean_token_accuracy": 0.9951166957616806, "num_tokens": 297103544.0, "step": 699 }, { "entropy": 0.021623254287987947, "epoch": 0.3063792537476748, "grad_norm": 14.25, "learning_rate": 2.903627504060639e-05, "loss": 0.1755, "loss_lm": 0.014525350416079164, "loss_seg": 0.16096514277160168, "mean_token_accuracy": 0.9951747059822083, "num_tokens": 297528325.0, "step": 700 }, { "entropy": 0.02193178841844201, "epoch": 0.30681693839588575, "grad_norm": 13.75, "learning_rate": 2.9033567948023824e-05, "loss": 0.194, "loss_lm": 0.01758095878176391, "loss_seg": 0.1764231026172638, "mean_token_accuracy": 0.9951479882001877, "num_tokens": 297953094.0, "step": 701 }, { "entropy": 0.02168344333767891, "epoch": 0.30725462304409673, "grad_norm": 13.125, "learning_rate": 2.9030860855441254e-05, "loss": 0.2313, "loss_lm": 0.020627507474273443, "loss_seg": 0.21068352088332176, "mean_token_accuracy": 0.995109036564827, "num_tokens": 298377944.0, "step": 702 }, { "entropy": 0.021096379961818457, "epoch": 0.3076923076923077, "grad_norm": 11.6875, "learning_rate": 2.902815376285869e-05, "loss": 0.1511, "loss_lm": 0.01732667931355536, "loss_seg": 0.1338222585618496, "mean_token_accuracy": 0.9952307641506195, "num_tokens": 298802501.0, "step": 703 }, { "entropy": 0.02232535555958748, "epoch": 0.30812999234051863, "grad_norm": 13.1875, "learning_rate": 2.9025446670276125e-05, "loss": 0.1773, "loss_lm": 0.017512971069663763, "loss_seg": 0.15976257622241974, "mean_token_accuracy": 0.9948889911174774, "num_tokens": 299226883.0, "step": 704 }, { "entropy": 0.022179158870130777, "epoch": 0.3085676769887296, "grad_norm": 11.75, "learning_rate": 2.902273957769356e-05, "loss": 0.1891, "loss_lm": 0.016998667735606432, "loss_seg": 0.17214906215667725, "mean_token_accuracy": 0.9950808435678482, "num_tokens": 299652160.0, "step": 705 }, { "entropy": 0.021763490978628397, "epoch": 0.3090053616369406, "grad_norm": 9.8125, "learning_rate": 2.902003248511099e-05, "loss": 0.1475, "loss_lm": 0.017736115027219057, "loss_seg": 0.12971742637455463, "mean_token_accuracy": 0.9952417612075806, "num_tokens": 300076664.0, "step": 706 }, { "entropy": 0.021827980410307646, "epoch": 0.30944304628515157, "grad_norm": 12.5625, "learning_rate": 2.9017325392528423e-05, "loss": 0.1936, "loss_lm": 0.016870769439265132, "loss_seg": 0.17670376040041447, "mean_token_accuracy": 0.9950532466173172, "num_tokens": 300501570.0, "step": 707 }, { "entropy": 0.021477341186255217, "epoch": 0.3098807309333625, "grad_norm": 9.4375, "learning_rate": 2.901461829994586e-05, "loss": 0.1583, "loss_lm": 0.015485659940168262, "loss_seg": 0.1428641937673092, "mean_token_accuracy": 0.9951112866401672, "num_tokens": 300926380.0, "step": 708 }, { "entropy": 0.02232177695259452, "epoch": 0.31031841558157347, "grad_norm": 11.125, "learning_rate": 2.9011911207363294e-05, "loss": 0.1939, "loss_lm": 0.017705983482301235, "loss_seg": 0.17615310847759247, "mean_token_accuracy": 0.995024099946022, "num_tokens": 301351404.0, "step": 709 }, { "entropy": 0.021781073417514563, "epoch": 0.31075610022978445, "grad_norm": 10.0625, "learning_rate": 2.9009204114780728e-05, "loss": 0.1629, "loss_lm": 0.01619209675118327, "loss_seg": 0.14669782854616642, "mean_token_accuracy": 0.9951379597187042, "num_tokens": 301776291.0, "step": 710 }, { "entropy": 0.021868258714675903, "epoch": 0.3111937848779954, "grad_norm": 17.25, "learning_rate": 2.9006497022198158e-05, "loss": 0.1874, "loss_lm": 0.018449044320732355, "loss_seg": 0.16898243501782417, "mean_token_accuracy": 0.9951075315475464, "num_tokens": 302201617.0, "step": 711 }, { "entropy": 0.02167790522798896, "epoch": 0.31163146952620635, "grad_norm": 10.8125, "learning_rate": 2.900378992961559e-05, "loss": 0.2208, "loss_lm": 0.016295496840029955, "loss_seg": 0.20447765290737152, "mean_token_accuracy": 0.9951798319816589, "num_tokens": 302626434.0, "step": 712 }, { "entropy": 0.021612341981381178, "epoch": 0.31206915417441733, "grad_norm": 8.3125, "learning_rate": 2.900108283703303e-05, "loss": 0.1721, "loss_lm": 0.01656471798196435, "loss_seg": 0.15558060258626938, "mean_token_accuracy": 0.9953102469444275, "num_tokens": 303052217.0, "step": 713 }, { "entropy": 0.02226055273786187, "epoch": 0.3125068388226283, "grad_norm": 6.3125, "learning_rate": 2.8998375744450462e-05, "loss": 0.1842, "loss_lm": 0.01826235745102167, "loss_seg": 0.1658894456923008, "mean_token_accuracy": 0.9949560612440109, "num_tokens": 303477220.0, "step": 714 }, { "entropy": 0.02183679211884737, "epoch": 0.3129445234708393, "grad_norm": 13.3125, "learning_rate": 2.8995668651867896e-05, "loss": 0.194, "loss_lm": 0.01669254247099161, "loss_seg": 0.17730369977653027, "mean_token_accuracy": 0.9952412992715836, "num_tokens": 303902090.0, "step": 715 }, { "entropy": 0.021493173204362392, "epoch": 0.3133822081190502, "grad_norm": 18.375, "learning_rate": 2.8992961559285327e-05, "loss": 0.1754, "loss_lm": 0.015790791250765324, "loss_seg": 0.15962099842727184, "mean_token_accuracy": 0.9952971190214157, "num_tokens": 304327743.0, "step": 716 }, { "entropy": 0.02198872622102499, "epoch": 0.3138198927672612, "grad_norm": 8.75, "learning_rate": 2.899025446670276e-05, "loss": 0.2072, "loss_lm": 0.016303604701533914, "loss_seg": 0.19091235473752022, "mean_token_accuracy": 0.9950989037752151, "num_tokens": 304752301.0, "step": 717 }, { "entropy": 0.022284673992544413, "epoch": 0.31425757741547217, "grad_norm": 16.5, "learning_rate": 2.8987547374120197e-05, "loss": 0.2113, "loss_lm": 0.020663601579144597, "loss_seg": 0.1906120330095291, "mean_token_accuracy": 0.9951329976320267, "num_tokens": 305177792.0, "step": 718 }, { "entropy": 0.021797365974634886, "epoch": 0.3146952620636831, "grad_norm": 9.125, "learning_rate": 2.898484028153763e-05, "loss": 0.17, "loss_lm": 0.01521780202165246, "loss_seg": 0.15480555221438408, "mean_token_accuracy": 0.9952433854341507, "num_tokens": 305602197.0, "step": 719 }, { "entropy": 0.022535153198987246, "epoch": 0.31513294671189407, "grad_norm": 8.25, "learning_rate": 2.8982133188955065e-05, "loss": 0.2279, "loss_lm": 0.017520108725875616, "loss_seg": 0.2103910706937313, "mean_token_accuracy": 0.9950757324695587, "num_tokens": 306027398.0, "step": 720 }, { "entropy": 0.022223451640456915, "epoch": 0.31557063136010505, "grad_norm": 17.125, "learning_rate": 2.8979426096372495e-05, "loss": 0.1431, "loss_lm": 0.017475439002737403, "loss_seg": 0.1256155502051115, "mean_token_accuracy": 0.9950257390737534, "num_tokens": 306452351.0, "step": 721 }, { "entropy": 0.022031696047633886, "epoch": 0.316008316008316, "grad_norm": 8.125, "learning_rate": 2.897671900378993e-05, "loss": 0.1425, "loss_lm": 0.015576799865812063, "loss_seg": 0.12689343467354774, "mean_token_accuracy": 0.9952307492494583, "num_tokens": 306877678.0, "step": 722 }, { "entropy": 0.021335383877158165, "epoch": 0.31644600065652695, "grad_norm": 13.6875, "learning_rate": 2.8974011911207363e-05, "loss": 0.1417, "loss_lm": 0.017111559631302953, "loss_seg": 0.12463472038507462, "mean_token_accuracy": 0.9952790439128876, "num_tokens": 307302326.0, "step": 723 }, { "entropy": 0.021974294912070036, "epoch": 0.3168836853047379, "grad_norm": 25.0, "learning_rate": 2.89713048186248e-05, "loss": 0.1744, "loss_lm": 0.015814642189070582, "loss_seg": 0.15861286967992783, "mean_token_accuracy": 0.99510557949543, "num_tokens": 307727677.0, "step": 724 }, { "entropy": 0.02204922866076231, "epoch": 0.3173213699529489, "grad_norm": 6.40625, "learning_rate": 2.8968597726042233e-05, "loss": 0.2072, "loss_lm": 0.017435021698474884, "loss_seg": 0.1897536050528288, "mean_token_accuracy": 0.995024248957634, "num_tokens": 308152702.0, "step": 725 }, { "entropy": 0.02164354408159852, "epoch": 0.3177590546011599, "grad_norm": 25.5, "learning_rate": 2.8965890633459664e-05, "loss": 0.1442, "loss_lm": 0.016038388945162296, "loss_seg": 0.1281755417585373, "mean_token_accuracy": 0.9949936270713806, "num_tokens": 308577947.0, "step": 726 }, { "entropy": 0.021090847440063953, "epoch": 0.3181967392493708, "grad_norm": 13.0, "learning_rate": 2.8963183540877097e-05, "loss": 0.236, "loss_lm": 0.017390863737091422, "loss_seg": 0.21861034259200096, "mean_token_accuracy": 0.9952103197574615, "num_tokens": 309002159.0, "step": 727 }, { "entropy": 0.02182012889534235, "epoch": 0.3186344238975818, "grad_norm": 6.375, "learning_rate": 2.896047644829453e-05, "loss": 0.1722, "loss_lm": 0.017615615855902433, "loss_seg": 0.1546101700514555, "mean_token_accuracy": 0.9951697140932083, "num_tokens": 309426309.0, "step": 728 }, { "entropy": 0.021501366514712572, "epoch": 0.31907210854579277, "grad_norm": 18.25, "learning_rate": 2.895776935571197e-05, "loss": 0.125, "loss_lm": 0.016495384508743882, "loss_seg": 0.10845602303743362, "mean_token_accuracy": 0.9951692372560501, "num_tokens": 309851446.0, "step": 729 }, { "entropy": 0.02084128325805068, "epoch": 0.31950979319400374, "grad_norm": 10.5625, "learning_rate": 2.89550622631294e-05, "loss": 0.2048, "loss_lm": 0.01760737970471382, "loss_seg": 0.18720362707972527, "mean_token_accuracy": 0.9950818568468094, "num_tokens": 310275756.0, "step": 730 }, { "entropy": 0.02242392487823963, "epoch": 0.31994747784221467, "grad_norm": 11.1875, "learning_rate": 2.8952355170546832e-05, "loss": 0.1918, "loss_lm": 0.017217516899108887, "loss_seg": 0.17456784658133984, "mean_token_accuracy": 0.9948736131191254, "num_tokens": 310700607.0, "step": 731 }, { "entropy": 0.022008344065397978, "epoch": 0.32038516249042565, "grad_norm": 8.1875, "learning_rate": 2.8949648077964266e-05, "loss": 0.1626, "loss_lm": 0.016820691991597414, "loss_seg": 0.1457490511238575, "mean_token_accuracy": 0.994914174079895, "num_tokens": 311125723.0, "step": 732 }, { "entropy": 0.02164075104519725, "epoch": 0.3208228471386366, "grad_norm": 13.375, "learning_rate": 2.89469409853817e-05, "loss": 0.1282, "loss_lm": 0.015411227708682418, "loss_seg": 0.11281058378517628, "mean_token_accuracy": 0.995188295841217, "num_tokens": 311550493.0, "step": 733 }, { "entropy": 0.02147928485646844, "epoch": 0.3212605317868476, "grad_norm": 8.5, "learning_rate": 2.8944233892799137e-05, "loss": 0.1577, "loss_lm": 0.016828162828460336, "loss_seg": 0.14091091975569725, "mean_token_accuracy": 0.9952398538589478, "num_tokens": 311975424.0, "step": 734 }, { "entropy": 0.02098705992102623, "epoch": 0.3216982164350585, "grad_norm": 15.5625, "learning_rate": 2.8941526800216567e-05, "loss": 0.2185, "loss_lm": 0.01996456948108971, "loss_seg": 0.19854597747325897, "mean_token_accuracy": 0.9951898604631424, "num_tokens": 312399687.0, "step": 735 }, { "entropy": 0.02143736556172371, "epoch": 0.3221359010832695, "grad_norm": 11.125, "learning_rate": 2.8938819707634e-05, "loss": 0.1743, "loss_lm": 0.01700361492112279, "loss_seg": 0.15728182531893253, "mean_token_accuracy": 0.9951024800539017, "num_tokens": 312825184.0, "step": 736 }, { "entropy": 0.02201146027073264, "epoch": 0.3225735857314805, "grad_norm": 13.75, "learning_rate": 2.8936112615051435e-05, "loss": 0.2111, "loss_lm": 0.01784383156336844, "loss_seg": 0.1932392753660679, "mean_token_accuracy": 0.9950991123914719, "num_tokens": 313250380.0, "step": 737 }, { "entropy": 0.021881186868995428, "epoch": 0.3230112703796914, "grad_norm": 13.875, "learning_rate": 2.893340552246887e-05, "loss": 0.1495, "loss_lm": 0.015792438993230462, "loss_seg": 0.13367114588618279, "mean_token_accuracy": 0.9949802607297897, "num_tokens": 313675836.0, "step": 738 }, { "entropy": 0.021316255442798138, "epoch": 0.3234489550279024, "grad_norm": 61.75, "learning_rate": 2.8930698429886306e-05, "loss": 0.158, "loss_lm": 0.017859215615317225, "loss_seg": 0.14009425416588783, "mean_token_accuracy": 0.9952014833688736, "num_tokens": 314101122.0, "step": 739 }, { "entropy": 0.022854462265968323, "epoch": 0.32388663967611336, "grad_norm": 13.875, "learning_rate": 2.8927991337303736e-05, "loss": 0.1912, "loss_lm": 0.020667225122451782, "loss_seg": 0.1704919897019863, "mean_token_accuracy": 0.994891494512558, "num_tokens": 314526093.0, "step": 740 }, { "entropy": 0.02144323894754052, "epoch": 0.32432432432432434, "grad_norm": 5.3125, "learning_rate": 2.892528424472117e-05, "loss": 0.1373, "loss_lm": 0.017685246653854847, "loss_seg": 0.11957521736621857, "mean_token_accuracy": 0.995190292596817, "num_tokens": 314951051.0, "step": 741 }, { "entropy": 0.022340254858136177, "epoch": 0.32476200897253527, "grad_norm": 18.75, "learning_rate": 2.8922577152138603e-05, "loss": 0.2361, "loss_lm": 0.018654840532690287, "loss_seg": 0.2174806147813797, "mean_token_accuracy": 0.9949797093868256, "num_tokens": 315376507.0, "step": 742 }, { "entropy": 0.021577860228717327, "epoch": 0.32519969362074624, "grad_norm": 14.875, "learning_rate": 2.8919870059556037e-05, "loss": 0.1166, "loss_lm": 0.018051019869744778, "loss_seg": 0.098522724583745, "mean_token_accuracy": 0.9951096475124359, "num_tokens": 315801795.0, "step": 743 }, { "entropy": 0.02166891749948263, "epoch": 0.3256373782689572, "grad_norm": 10.4375, "learning_rate": 2.8917162966973474e-05, "loss": 0.1782, "loss_lm": 0.01729036238975823, "loss_seg": 0.16092172265052795, "mean_token_accuracy": 0.9951266497373581, "num_tokens": 316227334.0, "step": 744 }, { "entropy": 0.02149716531857848, "epoch": 0.3260750629171682, "grad_norm": 8.9375, "learning_rate": 2.8914455874390905e-05, "loss": 0.1733, "loss_lm": 0.01632074359804392, "loss_seg": 0.15699739381670952, "mean_token_accuracy": 0.9950125217437744, "num_tokens": 316652151.0, "step": 745 }, { "entropy": 0.02190395863726735, "epoch": 0.3265127475653791, "grad_norm": 11.125, "learning_rate": 2.8911748781808338e-05, "loss": 0.2149, "loss_lm": 0.01721517974510789, "loss_seg": 0.1976974457502365, "mean_token_accuracy": 0.9949810653924942, "num_tokens": 317077113.0, "step": 746 }, { "entropy": 0.02096111560240388, "epoch": 0.3269504322135901, "grad_norm": 6.90625, "learning_rate": 2.8909041689225772e-05, "loss": 0.2063, "loss_lm": 0.018267655512318015, "loss_seg": 0.18799977749586105, "mean_token_accuracy": 0.9952853918075562, "num_tokens": 317502535.0, "step": 747 }, { "entropy": 0.02111729420721531, "epoch": 0.3273881168618011, "grad_norm": 12.875, "learning_rate": 2.8906334596643206e-05, "loss": 0.2057, "loss_lm": 0.015050037764012814, "loss_seg": 0.19063196703791618, "mean_token_accuracy": 0.9952007085084915, "num_tokens": 317927942.0, "step": 748 }, { "entropy": 0.02090467605739832, "epoch": 0.32782580151001206, "grad_norm": 10.875, "learning_rate": 2.8903627504060643e-05, "loss": 0.1747, "loss_lm": 0.01578804152086377, "loss_seg": 0.15888874232769012, "mean_token_accuracy": 0.9951747804880142, "num_tokens": 318352342.0, "step": 749 }, { "entropy": 0.02196875959634781, "epoch": 0.328263486158223, "grad_norm": 6.625, "learning_rate": 2.8900920411478073e-05, "loss": 0.1717, "loss_lm": 0.017577596241608262, "loss_seg": 0.15414551459252834, "mean_token_accuracy": 0.9949471205472946, "num_tokens": 318776830.0, "step": 750 }, { "entropy": 0.02144926507025957, "epoch": 0.32870117080643396, "grad_norm": 5.90625, "learning_rate": 2.8898213318895507e-05, "loss": 0.1933, "loss_lm": 0.0172141557559371, "loss_seg": 0.1760416403412819, "mean_token_accuracy": 0.9951989203691483, "num_tokens": 319201254.0, "step": 751 }, { "entropy": 0.021767642814666033, "epoch": 0.32913885545464494, "grad_norm": 21.25, "learning_rate": 2.889550622631294e-05, "loss": 0.1499, "loss_lm": 0.016910484991967678, "loss_seg": 0.1329619139432907, "mean_token_accuracy": 0.9950563311576843, "num_tokens": 319626326.0, "step": 752 }, { "entropy": 0.022326058242470026, "epoch": 0.3295765401028559, "grad_norm": 12.4375, "learning_rate": 2.8892799133730374e-05, "loss": 0.2369, "loss_lm": 0.018642413895577192, "loss_seg": 0.2182312924414873, "mean_token_accuracy": 0.9949728399515152, "num_tokens": 320051815.0, "step": 753 }, { "entropy": 0.02152580814436078, "epoch": 0.33001422475106684, "grad_norm": 6.875, "learning_rate": 2.8890092041147808e-05, "loss": 0.1222, "loss_lm": 0.01815558224916458, "loss_seg": 0.10406284220516682, "mean_token_accuracy": 0.9950953274965286, "num_tokens": 320476624.0, "step": 754 }, { "entropy": 0.021294564474374056, "epoch": 0.3304519093992778, "grad_norm": 10.9375, "learning_rate": 2.8887384948565242e-05, "loss": 0.2119, "loss_lm": 0.01683010021224618, "loss_seg": 0.19508281350135803, "mean_token_accuracy": 0.9950620085000992, "num_tokens": 320901271.0, "step": 755 }, { "entropy": 0.02150242915377021, "epoch": 0.3308895940474888, "grad_norm": 10.375, "learning_rate": 2.8884677855982676e-05, "loss": 0.1641, "loss_lm": 0.01654316345229745, "loss_seg": 0.14760189689695835, "mean_token_accuracy": 0.9950752407312393, "num_tokens": 321326836.0, "step": 756 }, { "entropy": 0.021415740251541138, "epoch": 0.3313272786956997, "grad_norm": 6.21875, "learning_rate": 2.888197076340011e-05, "loss": 0.144, "loss_lm": 0.015019996091723442, "loss_seg": 0.12896339781582355, "mean_token_accuracy": 0.9951150268316269, "num_tokens": 321751732.0, "step": 757 }, { "entropy": 0.021840366534888744, "epoch": 0.3317649633439107, "grad_norm": 14.125, "learning_rate": 2.8879263670817543e-05, "loss": 0.195, "loss_lm": 0.01694334694184363, "loss_seg": 0.17809657007455826, "mean_token_accuracy": 0.9951339364051819, "num_tokens": 322176664.0, "step": 758 }, { "entropy": 0.02137305773794651, "epoch": 0.3322026479921217, "grad_norm": 9.0625, "learning_rate": 2.8876556578234973e-05, "loss": 0.2014, "loss_lm": 0.01703737280331552, "loss_seg": 0.18441039696335793, "mean_token_accuracy": 0.9951358139514923, "num_tokens": 322602407.0, "step": 759 }, { "entropy": 0.02156794350594282, "epoch": 0.33264033264033266, "grad_norm": 17.875, "learning_rate": 2.887384948565241e-05, "loss": 0.1351, "loss_lm": 0.018258269876241684, "loss_seg": 0.11685924418270588, "mean_token_accuracy": 0.9951226115226746, "num_tokens": 323027811.0, "step": 760 }, { "entropy": 0.02106191124767065, "epoch": 0.3330780172885436, "grad_norm": 8.5625, "learning_rate": 2.8871142393069844e-05, "loss": 0.1837, "loss_lm": 0.01569415465928614, "loss_seg": 0.16796032711863518, "mean_token_accuracy": 0.9952089786529541, "num_tokens": 323452343.0, "step": 761 }, { "entropy": 0.021255973726511, "epoch": 0.33351570193675456, "grad_norm": 14.3125, "learning_rate": 2.8868435300487278e-05, "loss": 0.1508, "loss_lm": 0.017659400589764118, "loss_seg": 0.13312510773539543, "mean_token_accuracy": 0.9950284063816071, "num_tokens": 323877335.0, "step": 762 }, { "entropy": 0.021611311938613653, "epoch": 0.33395338658496554, "grad_norm": 24.0, "learning_rate": 2.886572820790471e-05, "loss": 0.1908, "loss_lm": 0.01693419087678194, "loss_seg": 0.1738244853913784, "mean_token_accuracy": 0.9951682090759277, "num_tokens": 324303234.0, "step": 763 }, { "entropy": 0.02154479641467333, "epoch": 0.3343910712331765, "grad_norm": 9.8125, "learning_rate": 2.8863021115322142e-05, "loss": 0.1455, "loss_lm": 0.017074143514037132, "loss_seg": 0.12839597649872303, "mean_token_accuracy": 0.9950418323278427, "num_tokens": 324728159.0, "step": 764 }, { "entropy": 0.021973556373268366, "epoch": 0.33482875588138744, "grad_norm": 10.875, "learning_rate": 2.886031402273958e-05, "loss": 0.1982, "loss_lm": 0.017446238547563553, "loss_seg": 0.18078458681702614, "mean_token_accuracy": 0.9950160682201385, "num_tokens": 325153270.0, "step": 765 }, { "entropy": 0.02105568116530776, "epoch": 0.3352664405295984, "grad_norm": 8.8125, "learning_rate": 2.8857606930157013e-05, "loss": 0.1655, "loss_lm": 0.014954695478081703, "loss_seg": 0.1505206674337387, "mean_token_accuracy": 0.9951411783695221, "num_tokens": 325578196.0, "step": 766 }, { "entropy": 0.021511619444936514, "epoch": 0.3357041251778094, "grad_norm": 7.8125, "learning_rate": 2.8854899837574447e-05, "loss": 0.2507, "loss_lm": 0.01710484316572547, "loss_seg": 0.23360098525881767, "mean_token_accuracy": 0.9950920343399048, "num_tokens": 326003619.0, "step": 767 }, { "entropy": 0.021717617753893137, "epoch": 0.3361418098260204, "grad_norm": 9.9375, "learning_rate": 2.885219274499188e-05, "loss": 0.2186, "loss_lm": 0.01677705137990415, "loss_seg": 0.2018546536564827, "mean_token_accuracy": 0.9951556473970413, "num_tokens": 326428610.0, "step": 768 }, { "entropy": 0.020982106681913137, "epoch": 0.3365794944742313, "grad_norm": 9.0625, "learning_rate": 2.884948565240931e-05, "loss": 0.1761, "loss_lm": 0.015609483700245619, "loss_seg": 0.16050662845373154, "mean_token_accuracy": 0.9952168166637421, "num_tokens": 326852741.0, "step": 769 }, { "entropy": 0.021159572061151266, "epoch": 0.3370171791224423, "grad_norm": 14.5, "learning_rate": 2.8846778559826748e-05, "loss": 0.1631, "loss_lm": 0.016324170865118504, "loss_seg": 0.14677609130740166, "mean_token_accuracy": 0.9952420890331268, "num_tokens": 327277899.0, "step": 770 }, { "entropy": 0.0209430493414402, "epoch": 0.33745486377065326, "grad_norm": 12.25, "learning_rate": 2.884407146724418e-05, "loss": 0.1667, "loss_lm": 0.017333357129245996, "loss_seg": 0.1493380069732666, "mean_token_accuracy": 0.995172455906868, "num_tokens": 327702708.0, "step": 771 }, { "entropy": 0.021695900708436966, "epoch": 0.33789254841886424, "grad_norm": 11.0, "learning_rate": 2.8841364374661615e-05, "loss": 0.1908, "loss_lm": 0.018272065790370107, "loss_seg": 0.1725231409072876, "mean_token_accuracy": 0.9950409233570099, "num_tokens": 328128769.0, "step": 772 }, { "entropy": 0.021449984051287174, "epoch": 0.33833023306707516, "grad_norm": 11.625, "learning_rate": 2.8838657282079045e-05, "loss": 0.1606, "loss_lm": 0.01633989321999252, "loss_seg": 0.14426504634320736, "mean_token_accuracy": 0.9950819462537766, "num_tokens": 328553899.0, "step": 773 }, { "entropy": 0.020886645652353764, "epoch": 0.33876791771528614, "grad_norm": 12.0625, "learning_rate": 2.883595018949648e-05, "loss": 0.1776, "loss_lm": 0.017440489027649164, "loss_seg": 0.1601265985518694, "mean_token_accuracy": 0.9953634589910507, "num_tokens": 328979162.0, "step": 774 }, { "entropy": 0.020747622475028038, "epoch": 0.3392056023634971, "grad_norm": 15.5625, "learning_rate": 2.8833243096913916e-05, "loss": 0.1472, "loss_lm": 0.015949809923768044, "loss_seg": 0.13124196138232946, "mean_token_accuracy": 0.9951514601707458, "num_tokens": 329404423.0, "step": 775 }, { "entropy": 0.021418120712041855, "epoch": 0.33964328701170804, "grad_norm": 7.78125, "learning_rate": 2.883053600433135e-05, "loss": 0.205, "loss_lm": 0.017153015825897455, "loss_seg": 0.1878892295062542, "mean_token_accuracy": 0.9950223863124847, "num_tokens": 329829297.0, "step": 776 }, { "entropy": 0.02090336289256811, "epoch": 0.340080971659919, "grad_norm": 18.625, "learning_rate": 2.8827828911748784e-05, "loss": 0.1762, "loss_lm": 0.017436851048842072, "loss_seg": 0.15876195020973682, "mean_token_accuracy": 0.9951933771371841, "num_tokens": 330254424.0, "step": 777 }, { "entropy": 0.02152075059711933, "epoch": 0.34051865630813, "grad_norm": 6.90625, "learning_rate": 2.8825121819166214e-05, "loss": 0.1213, "loss_lm": 0.01778985932469368, "loss_seg": 0.10355815943330526, "mean_token_accuracy": 0.9950484186410904, "num_tokens": 330679070.0, "step": 778 }, { "entropy": 0.02116960659623146, "epoch": 0.340956340956341, "grad_norm": 16.125, "learning_rate": 2.8822414726583648e-05, "loss": 0.1696, "loss_lm": 0.016195113072171807, "loss_seg": 0.1534504760056734, "mean_token_accuracy": 0.9950952082872391, "num_tokens": 331104763.0, "step": 779 }, { "entropy": 0.021356961224228144, "epoch": 0.3413940256045519, "grad_norm": 11.3125, "learning_rate": 2.8819707634001085e-05, "loss": 0.1882, "loss_lm": 0.017985223326832056, "loss_seg": 0.17019120790064335, "mean_token_accuracy": 0.9950701892375946, "num_tokens": 331529479.0, "step": 780 }, { "entropy": 0.021883662324398756, "epoch": 0.3418317102527629, "grad_norm": 9.5625, "learning_rate": 2.881700054141852e-05, "loss": 0.1699, "loss_lm": 0.018727201502770185, "loss_seg": 0.15117153525352478, "mean_token_accuracy": 0.9949696063995361, "num_tokens": 331955666.0, "step": 781 }, { "entropy": 0.021226214710623026, "epoch": 0.34226939490097386, "grad_norm": 9.625, "learning_rate": 2.8814293448835952e-05, "loss": 0.1449, "loss_lm": 0.01700368756428361, "loss_seg": 0.12791820243000984, "mean_token_accuracy": 0.9951314777135849, "num_tokens": 332380217.0, "step": 782 }, { "entropy": 0.021510903723537922, "epoch": 0.34270707954918483, "grad_norm": 10.0625, "learning_rate": 2.8811586356253383e-05, "loss": 0.1528, "loss_lm": 0.015591435134410858, "loss_seg": 0.1371616516262293, "mean_token_accuracy": 0.9950612187385559, "num_tokens": 332805398.0, "step": 783 }, { "entropy": 0.021186815109103918, "epoch": 0.34314476419739576, "grad_norm": 10.875, "learning_rate": 2.8808879263670816e-05, "loss": 0.1078, "loss_lm": 0.01659896271303296, "loss_seg": 0.09116755612194538, "mean_token_accuracy": 0.9951027482748032, "num_tokens": 333230714.0, "step": 784 }, { "entropy": 0.02112781908363104, "epoch": 0.34358244884560674, "grad_norm": 10.875, "learning_rate": 2.8806172171088254e-05, "loss": 0.2148, "loss_lm": 0.020215231459587812, "loss_seg": 0.19453896954655647, "mean_token_accuracy": 0.9950840771198273, "num_tokens": 333655660.0, "step": 785 }, { "entropy": 0.0209343945607543, "epoch": 0.3440201334938177, "grad_norm": 8.625, "learning_rate": 2.8803465078505687e-05, "loss": 0.1749, "loss_lm": 0.01819707080721855, "loss_seg": 0.1567173358052969, "mean_token_accuracy": 0.995162695646286, "num_tokens": 334080464.0, "step": 786 }, { "entropy": 0.021658243611454964, "epoch": 0.3444578181420287, "grad_norm": 11.5625, "learning_rate": 2.880075798592312e-05, "loss": 0.2172, "loss_lm": 0.0204621572047472, "loss_seg": 0.1967608779668808, "mean_token_accuracy": 0.995152160525322, "num_tokens": 334505354.0, "step": 787 }, { "entropy": 0.021117290947586298, "epoch": 0.3448955027902396, "grad_norm": 9.125, "learning_rate": 2.879805089334055e-05, "loss": 0.1528, "loss_lm": 0.01928791683167219, "loss_seg": 0.1334705725312233, "mean_token_accuracy": 0.9952545017004013, "num_tokens": 334930351.0, "step": 788 }, { "entropy": 0.020998318679630756, "epoch": 0.3453331874384506, "grad_norm": 8.625, "learning_rate": 2.8795343800757985e-05, "loss": 0.1836, "loss_lm": 0.016461455961689353, "loss_seg": 0.1671012993901968, "mean_token_accuracy": 0.995198056101799, "num_tokens": 335355586.0, "step": 789 }, { "entropy": 0.020246946718543768, "epoch": 0.3457708720866616, "grad_norm": 9.125, "learning_rate": 2.879263670817542e-05, "loss": 0.1559, "loss_lm": 0.01611707196570933, "loss_seg": 0.13978437706828117, "mean_token_accuracy": 0.9955461174249649, "num_tokens": 335779865.0, "step": 790 }, { "entropy": 0.02126017352566123, "epoch": 0.3462085567348725, "grad_norm": 7.78125, "learning_rate": 2.8789929615592856e-05, "loss": 0.1436, "loss_lm": 0.015424772631376982, "loss_seg": 0.12816014885902405, "mean_token_accuracy": 0.9950694739818573, "num_tokens": 336203947.0, "step": 791 }, { "entropy": 0.021222928073257208, "epoch": 0.3466462413830835, "grad_norm": 6.75, "learning_rate": 2.878722252301029e-05, "loss": 0.1088, "loss_lm": 0.01660913904197514, "loss_seg": 0.09219102934002876, "mean_token_accuracy": 0.9951791167259216, "num_tokens": 336628513.0, "step": 792 }, { "entropy": 0.020699166227132082, "epoch": 0.34708392603129445, "grad_norm": 9.1875, "learning_rate": 2.878451543042772e-05, "loss": 0.1942, "loss_lm": 0.015160394133999944, "loss_seg": 0.17900218069553375, "mean_token_accuracy": 0.9950940161943436, "num_tokens": 337053669.0, "step": 793 }, { "entropy": 0.02084845956414938, "epoch": 0.34752161067950543, "grad_norm": 9.0625, "learning_rate": 2.8781808337845154e-05, "loss": 0.2162, "loss_lm": 0.01710692560300231, "loss_seg": 0.19906053692102432, "mean_token_accuracy": 0.9950943291187286, "num_tokens": 337478633.0, "step": 794 }, { "entropy": 0.02122864918783307, "epoch": 0.34795929532771636, "grad_norm": 9.6875, "learning_rate": 2.8779101245262587e-05, "loss": 0.2205, "loss_lm": 0.019221846479922533, "loss_seg": 0.2012955415993929, "mean_token_accuracy": 0.9951269626617432, "num_tokens": 337903617.0, "step": 795 }, { "entropy": 0.021395509131252766, "epoch": 0.34839697997592733, "grad_norm": 8.375, "learning_rate": 2.8776394152680025e-05, "loss": 0.195, "loss_lm": 0.01671561342664063, "loss_seg": 0.17830216512084007, "mean_token_accuracy": 0.9952152669429779, "num_tokens": 338328250.0, "step": 796 }, { "entropy": 0.021452026907354593, "epoch": 0.3488346646241383, "grad_norm": 10.8125, "learning_rate": 2.8773687060097455e-05, "loss": 0.1577, "loss_lm": 0.0184651012532413, "loss_seg": 0.1392194889485836, "mean_token_accuracy": 0.9951780885457993, "num_tokens": 338753347.0, "step": 797 }, { "entropy": 0.021406935062259436, "epoch": 0.3492723492723493, "grad_norm": 8.3125, "learning_rate": 2.877097996751489e-05, "loss": 0.196, "loss_lm": 0.01640148274600506, "loss_seg": 0.17962809279561043, "mean_token_accuracy": 0.9952287971973419, "num_tokens": 339178783.0, "step": 798 }, { "entropy": 0.02088742796331644, "epoch": 0.3497100339205602, "grad_norm": 11.0625, "learning_rate": 2.8768272874932322e-05, "loss": 0.1675, "loss_lm": 0.016793069895356894, "loss_seg": 0.15067638829350471, "mean_token_accuracy": 0.9951830506324768, "num_tokens": 339603691.0, "step": 799 }, { "entropy": 0.0214505298063159, "epoch": 0.3501477185687712, "grad_norm": 8.1875, "learning_rate": 2.8765565782349756e-05, "loss": 0.2032, "loss_lm": 0.018109630327671766, "loss_seg": 0.18505368381738663, "mean_token_accuracy": 0.9950765669345856, "num_tokens": 340028364.0, "step": 800 }, { "entropy": 0.020728391595184803, "epoch": 0.35058540321698217, "grad_norm": 14.8125, "learning_rate": 2.8762858689767193e-05, "loss": 0.1569, "loss_lm": 0.016217962140217423, "loss_seg": 0.14072154369205236, "mean_token_accuracy": 0.9952548146247864, "num_tokens": 340453815.0, "step": 801 }, { "entropy": 0.02094144932925701, "epoch": 0.35102308786519315, "grad_norm": 6.8125, "learning_rate": 2.8760151597184624e-05, "loss": 0.178, "loss_lm": 0.015633628237992525, "loss_seg": 0.16237198375165462, "mean_token_accuracy": 0.9951301068067551, "num_tokens": 340878660.0, "step": 802 }, { "entropy": 0.020667174831032753, "epoch": 0.3514607725134041, "grad_norm": 5.9375, "learning_rate": 2.8757444504602057e-05, "loss": 0.2113, "loss_lm": 0.014866209356114268, "loss_seg": 0.19646304659545422, "mean_token_accuracy": 0.9952453821897507, "num_tokens": 341303058.0, "step": 803 }, { "entropy": 0.020966026466339827, "epoch": 0.35189845716161505, "grad_norm": 8.9375, "learning_rate": 2.875473741201949e-05, "loss": 0.1398, "loss_lm": 0.015689621213823557, "loss_seg": 0.1240807119756937, "mean_token_accuracy": 0.9950715601444244, "num_tokens": 341728086.0, "step": 804 }, { "entropy": 0.02133624767884612, "epoch": 0.35233614180982603, "grad_norm": 9.4375, "learning_rate": 2.8752030319436925e-05, "loss": 0.1649, "loss_lm": 0.018024656688794494, "loss_seg": 0.14691819064319134, "mean_token_accuracy": 0.9949420243501663, "num_tokens": 342152718.0, "step": 805 }, { "entropy": 0.021240750793367624, "epoch": 0.352773826458037, "grad_norm": 7.5625, "learning_rate": 2.8749323226854362e-05, "loss": 0.1718, "loss_lm": 0.018110443837940693, "loss_seg": 0.15369416773319244, "mean_token_accuracy": 0.9949983358383179, "num_tokens": 342578086.0, "step": 806 }, { "entropy": 0.020182155538350344, "epoch": 0.35321151110624793, "grad_norm": 13.625, "learning_rate": 2.8746616134271792e-05, "loss": 0.1648, "loss_lm": 0.014301509130746126, "loss_seg": 0.15045714005827904, "mean_token_accuracy": 0.9953377395868301, "num_tokens": 343002950.0, "step": 807 }, { "entropy": 0.02101207571104169, "epoch": 0.3536491957544589, "grad_norm": 8.5, "learning_rate": 2.8743909041689226e-05, "loss": 0.1773, "loss_lm": 0.019186338875442743, "loss_seg": 0.15807471796870232, "mean_token_accuracy": 0.9950046390295029, "num_tokens": 343428543.0, "step": 808 }, { "entropy": 0.021618171595036983, "epoch": 0.3540868804026699, "grad_norm": 9.5625, "learning_rate": 2.874120194910666e-05, "loss": 0.1446, "loss_lm": 0.01751340413466096, "loss_seg": 0.12711906619369984, "mean_token_accuracy": 0.9948561936616898, "num_tokens": 343853887.0, "step": 809 }, { "entropy": 0.020784355234354734, "epoch": 0.3545245650508808, "grad_norm": 21.125, "learning_rate": 2.8738494856524093e-05, "loss": 0.1721, "loss_lm": 0.01743126194924116, "loss_seg": 0.15468478947877884, "mean_token_accuracy": 0.9951532334089279, "num_tokens": 344278881.0, "step": 810 }, { "entropy": 0.021069715730845928, "epoch": 0.3549622496990918, "grad_norm": 9.1875, "learning_rate": 2.873578776394153e-05, "loss": 0.1564, "loss_lm": 0.019244908820837736, "loss_seg": 0.13717728853225708, "mean_token_accuracy": 0.995114654302597, "num_tokens": 344703809.0, "step": 811 }, { "entropy": 0.020456064958125353, "epoch": 0.35539993434730277, "grad_norm": 7.21875, "learning_rate": 2.873308067135896e-05, "loss": 0.1965, "loss_lm": 0.017144516576081514, "loss_seg": 0.1793847158551216, "mean_token_accuracy": 0.9951819032430649, "num_tokens": 345128826.0, "step": 812 }, { "entropy": 0.020621647592633963, "epoch": 0.35583761899551375, "grad_norm": 9.75, "learning_rate": 2.8730373578776395e-05, "loss": 0.1415, "loss_lm": 0.017585279885679483, "loss_seg": 0.12391890026628971, "mean_token_accuracy": 0.995196208357811, "num_tokens": 345553883.0, "step": 813 }, { "entropy": 0.021148609463125467, "epoch": 0.3562753036437247, "grad_norm": 10.0625, "learning_rate": 2.8727666486193828e-05, "loss": 0.1406, "loss_lm": 0.01722975284792483, "loss_seg": 0.12333769351243973, "mean_token_accuracy": 0.9950896799564362, "num_tokens": 345979500.0, "step": 814 }, { "entropy": 0.021631157025694847, "epoch": 0.35671298829193565, "grad_norm": 29.125, "learning_rate": 2.8724959393611262e-05, "loss": 0.1874, "loss_lm": 0.018300797790288925, "loss_seg": 0.16912599839270115, "mean_token_accuracy": 0.9948872774839401, "num_tokens": 346404914.0, "step": 815 }, { "entropy": 0.02082641888409853, "epoch": 0.35715067294014663, "grad_norm": 7.625, "learning_rate": 2.87222523010287e-05, "loss": 0.2074, "loss_lm": 0.01667218003422022, "loss_seg": 0.19076814129948616, "mean_token_accuracy": 0.9950489848852158, "num_tokens": 346830267.0, "step": 816 }, { "entropy": 0.021119427401572466, "epoch": 0.3575883575883576, "grad_norm": 8.875, "learning_rate": 2.871954520844613e-05, "loss": 0.2168, "loss_lm": 0.01722554676234722, "loss_seg": 0.19960340484976768, "mean_token_accuracy": 0.9950905442237854, "num_tokens": 347255952.0, "step": 817 }, { "entropy": 0.02153204008936882, "epoch": 0.35802604223656853, "grad_norm": 10.125, "learning_rate": 2.8716838115863563e-05, "loss": 0.1893, "loss_lm": 0.01730779348872602, "loss_seg": 0.17201275378465652, "mean_token_accuracy": 0.9949880391359329, "num_tokens": 347681319.0, "step": 818 }, { "entropy": 0.0217264867387712, "epoch": 0.3584637268847795, "grad_norm": 12.1875, "learning_rate": 2.8714131023280997e-05, "loss": 0.2199, "loss_lm": 0.01735667511820793, "loss_seg": 0.20254068076610565, "mean_token_accuracy": 0.9949690997600555, "num_tokens": 348106066.0, "step": 819 }, { "entropy": 0.021685502026230097, "epoch": 0.3589014115329905, "grad_norm": 13.6875, "learning_rate": 2.871142393069843e-05, "loss": 0.1836, "loss_lm": 0.016919113229960203, "loss_seg": 0.16665130481123924, "mean_token_accuracy": 0.9950376451015472, "num_tokens": 348530626.0, "step": 820 }, { "entropy": 0.020994343794882298, "epoch": 0.35933909618120147, "grad_norm": 12.5625, "learning_rate": 2.8708716838115864e-05, "loss": 0.1407, "loss_lm": 0.01683503738604486, "loss_seg": 0.12385252490639687, "mean_token_accuracy": 0.9952990114688873, "num_tokens": 348955813.0, "step": 821 }, { "entropy": 0.021115729119628668, "epoch": 0.3597767808294124, "grad_norm": 11.25, "learning_rate": 2.8706009745533298e-05, "loss": 0.1693, "loss_lm": 0.016791286412626505, "loss_seg": 0.1525459736585617, "mean_token_accuracy": 0.9951301366090775, "num_tokens": 349381480.0, "step": 822 }, { "entropy": 0.021913723554462194, "epoch": 0.36021446547762337, "grad_norm": 16.125, "learning_rate": 2.8703302652950732e-05, "loss": 0.1655, "loss_lm": 0.01799807883799076, "loss_seg": 0.14747232757508755, "mean_token_accuracy": 0.9949797093868256, "num_tokens": 349806545.0, "step": 823 }, { "entropy": 0.020975555293262005, "epoch": 0.36065215012583435, "grad_norm": 7.0, "learning_rate": 2.8700595560368166e-05, "loss": 0.152, "loss_lm": 0.017353619681671262, "loss_seg": 0.1346806362271309, "mean_token_accuracy": 0.9952981173992157, "num_tokens": 350231423.0, "step": 824 }, { "entropy": 0.02184506133198738, "epoch": 0.3610898347740453, "grad_norm": 9.125, "learning_rate": 2.86978884677856e-05, "loss": 0.2245, "loss_lm": 0.020156579092144966, "loss_seg": 0.20436947420239449, "mean_token_accuracy": 0.995118111371994, "num_tokens": 350656652.0, "step": 825 }, { "entropy": 0.021614179480820894, "epoch": 0.36152751942225625, "grad_norm": 6.65625, "learning_rate": 2.869518137520303e-05, "loss": 0.1833, "loss_lm": 0.016815990675240755, "loss_seg": 0.16652627289295197, "mean_token_accuracy": 0.9950516223907471, "num_tokens": 351082407.0, "step": 826 }, { "entropy": 0.021429315209388733, "epoch": 0.36196520407046723, "grad_norm": 10.5, "learning_rate": 2.8692474282620467e-05, "loss": 0.1366, "loss_lm": 0.016157929319888353, "loss_seg": 0.12044321000576019, "mean_token_accuracy": 0.9952311664819717, "num_tokens": 351507293.0, "step": 827 }, { "entropy": 0.020402248948812485, "epoch": 0.3624028887186782, "grad_norm": 7.40625, "learning_rate": 2.86897671900379e-05, "loss": 0.1439, "loss_lm": 0.016536080045625567, "loss_seg": 0.1273551993072033, "mean_token_accuracy": 0.9953578412532806, "num_tokens": 351932043.0, "step": 828 }, { "entropy": 0.02110501565039158, "epoch": 0.36284057336688913, "grad_norm": 11.4375, "learning_rate": 2.8687060097455334e-05, "loss": 0.15, "loss_lm": 0.016394035425037146, "loss_seg": 0.1335858255624771, "mean_token_accuracy": 0.9951111823320389, "num_tokens": 352356636.0, "step": 829 }, { "entropy": 0.020713002886623144, "epoch": 0.3632782580151001, "grad_norm": 11.5625, "learning_rate": 2.8684353004872768e-05, "loss": 0.1664, "loss_lm": 0.015875295735895634, "loss_seg": 0.15051560290157795, "mean_token_accuracy": 0.9952277392148972, "num_tokens": 352781337.0, "step": 830 }, { "entropy": 0.021754328161478043, "epoch": 0.3637159426633111, "grad_norm": 23.75, "learning_rate": 2.8681645912290198e-05, "loss": 0.1407, "loss_lm": 0.01661826577037573, "loss_seg": 0.12409277632832527, "mean_token_accuracy": 0.9949293285608292, "num_tokens": 353206918.0, "step": 831 }, { "entropy": 0.02117704600095749, "epoch": 0.36415362731152207, "grad_norm": 11.625, "learning_rate": 2.8678938819707635e-05, "loss": 0.2034, "loss_lm": 0.019381092861294746, "loss_seg": 0.1839696206152439, "mean_token_accuracy": 0.9950947016477585, "num_tokens": 353632980.0, "step": 832 }, { "entropy": 0.020851747132837772, "epoch": 0.364591311959733, "grad_norm": 14.25, "learning_rate": 2.867623172712507e-05, "loss": 0.193, "loss_lm": 0.016808477696031332, "loss_seg": 0.17615597322583199, "mean_token_accuracy": 0.9951279908418655, "num_tokens": 354058029.0, "step": 833 }, { "entropy": 0.020973865408450365, "epoch": 0.36502899660794397, "grad_norm": 8.0625, "learning_rate": 2.8673524634542503e-05, "loss": 0.1723, "loss_lm": 0.014644058421254158, "loss_seg": 0.15763208456337452, "mean_token_accuracy": 0.9952145963907242, "num_tokens": 354483458.0, "step": 834 }, { "entropy": 0.020427412819117308, "epoch": 0.36546668125615495, "grad_norm": 10.3125, "learning_rate": 2.8670817541959937e-05, "loss": 0.1865, "loss_lm": 0.0200028233230114, "loss_seg": 0.16646210849285126, "mean_token_accuracy": 0.9953170418739319, "num_tokens": 354908370.0, "step": 835 }, { "entropy": 0.021942611318081617, "epoch": 0.3659043659043659, "grad_norm": 6.125, "learning_rate": 2.8668110449377367e-05, "loss": 0.1586, "loss_lm": 0.02029439713805914, "loss_seg": 0.13830028101801872, "mean_token_accuracy": 0.9949207156896591, "num_tokens": 355334019.0, "step": 836 }, { "entropy": 0.021217573899775743, "epoch": 0.36634205055257685, "grad_norm": 7.5, "learning_rate": 2.8665403356794804e-05, "loss": 0.1964, "loss_lm": 0.01689047017134726, "loss_seg": 0.179557204246521, "mean_token_accuracy": 0.9951297640800476, "num_tokens": 355759209.0, "step": 837 }, { "entropy": 0.020547923166304827, "epoch": 0.3667797352007878, "grad_norm": 9.1875, "learning_rate": 2.8662696264212238e-05, "loss": 0.2117, "loss_lm": 0.019566636299714446, "loss_seg": 0.19212624803185463, "mean_token_accuracy": 0.9952630549669266, "num_tokens": 356184742.0, "step": 838 }, { "entropy": 0.02061967086046934, "epoch": 0.3672174198489988, "grad_norm": 7.46875, "learning_rate": 2.865998917162967e-05, "loss": 0.155, "loss_lm": 0.018809405621141195, "loss_seg": 0.13622164726257324, "mean_token_accuracy": 0.9952548891305923, "num_tokens": 356609785.0, "step": 839 }, { "entropy": 0.020869011990725994, "epoch": 0.3676551044972098, "grad_norm": 13.4375, "learning_rate": 2.8657282079047105e-05, "loss": 0.1802, "loss_lm": 0.016969997668638825, "loss_seg": 0.16327325627207756, "mean_token_accuracy": 0.9950985908508301, "num_tokens": 357034536.0, "step": 840 }, { "entropy": 0.020226359833031893, "epoch": 0.3680927891454207, "grad_norm": 15.875, "learning_rate": 2.8654574986464535e-05, "loss": 0.1481, "loss_lm": 0.017025336856022477, "loss_seg": 0.13108377903699875, "mean_token_accuracy": 0.9953521490097046, "num_tokens": 357460479.0, "step": 841 }, { "entropy": 0.021123789250850677, "epoch": 0.3685304737936317, "grad_norm": 11.8125, "learning_rate": 2.8651867893881973e-05, "loss": 0.1883, "loss_lm": 0.01806794200092554, "loss_seg": 0.1702130064368248, "mean_token_accuracy": 0.995082214474678, "num_tokens": 357885429.0, "step": 842 }, { "entropy": 0.020640887785702944, "epoch": 0.36896815844184266, "grad_norm": 12.875, "learning_rate": 2.8649160801299406e-05, "loss": 0.1646, "loss_lm": 0.017742610769346356, "loss_seg": 0.1468394510447979, "mean_token_accuracy": 0.9951954036951065, "num_tokens": 358309972.0, "step": 843 }, { "entropy": 0.02154658827930689, "epoch": 0.36940584309005364, "grad_norm": 10.1875, "learning_rate": 2.864645370871684e-05, "loss": 0.1465, "loss_lm": 0.01975848339498043, "loss_seg": 0.12673514895141125, "mean_token_accuracy": 0.9949910044670105, "num_tokens": 358734786.0, "step": 844 }, { "entropy": 0.021496116649359465, "epoch": 0.36984352773826457, "grad_norm": 18.0, "learning_rate": 2.864374661613427e-05, "loss": 0.2123, "loss_lm": 0.015915532829239964, "loss_seg": 0.19634922593832016, "mean_token_accuracy": 0.9949307590723038, "num_tokens": 359159691.0, "step": 845 }, { "entropy": 0.02105236193165183, "epoch": 0.37028121238647554, "grad_norm": 9.0625, "learning_rate": 2.8641039523551704e-05, "loss": 0.1902, "loss_lm": 0.014217698480933905, "loss_seg": 0.17593304812908173, "mean_token_accuracy": 0.9951435476541519, "num_tokens": 359584663.0, "step": 846 }, { "entropy": 0.021121577359735966, "epoch": 0.3707188970346865, "grad_norm": 10.1875, "learning_rate": 2.863833243096914e-05, "loss": 0.1528, "loss_lm": 0.01679648319259286, "loss_seg": 0.135999858379364, "mean_token_accuracy": 0.994930237531662, "num_tokens": 360009312.0, "step": 847 }, { "entropy": 0.02041967399418354, "epoch": 0.37115658168289745, "grad_norm": 11.0, "learning_rate": 2.8635625338386575e-05, "loss": 0.2012, "loss_lm": 0.013979874784126878, "loss_seg": 0.18719331920146942, "mean_token_accuracy": 0.9953102618455887, "num_tokens": 360434438.0, "step": 848 }, { "entropy": 0.02074686996638775, "epoch": 0.3715942663311084, "grad_norm": 8.375, "learning_rate": 2.863291824580401e-05, "loss": 0.1512, "loss_lm": 0.01924884901382029, "loss_seg": 0.13192762434482574, "mean_token_accuracy": 0.9951871931552887, "num_tokens": 360859941.0, "step": 849 }, { "entropy": 0.021523578092455864, "epoch": 0.3720319509793194, "grad_norm": 8.4375, "learning_rate": 2.863021115322144e-05, "loss": 0.1809, "loss_lm": 0.01862902706488967, "loss_seg": 0.16229848936200142, "mean_token_accuracy": 0.9950267225503922, "num_tokens": 361284375.0, "step": 850 }, { "entropy": 0.020362351555377245, "epoch": 0.3724696356275304, "grad_norm": 12.9375, "learning_rate": 2.8627504060638873e-05, "loss": 0.1446, "loss_lm": 0.016624746844172478, "loss_seg": 0.12797734141349792, "mean_token_accuracy": 0.9951574504375458, "num_tokens": 361709142.0, "step": 851 }, { "entropy": 0.020711438730359077, "epoch": 0.3729073202757413, "grad_norm": 12.3125, "learning_rate": 2.862479696805631e-05, "loss": 0.1701, "loss_lm": 0.016222904669120908, "loss_seg": 0.1539191696792841, "mean_token_accuracy": 0.9951877295970917, "num_tokens": 362134070.0, "step": 852 }, { "entropy": 0.021182721946388483, "epoch": 0.3733450049239523, "grad_norm": 24.375, "learning_rate": 2.8622089875473744e-05, "loss": 0.2005, "loss_lm": 0.0187334509100765, "loss_seg": 0.18172580748796463, "mean_token_accuracy": 0.9950071722269058, "num_tokens": 362559229.0, "step": 853 }, { "entropy": 0.02082271547988057, "epoch": 0.37378268957216326, "grad_norm": 11.75, "learning_rate": 2.8619382782891177e-05, "loss": 0.1759, "loss_lm": 0.016959827858954668, "loss_seg": 0.15892895683646202, "mean_token_accuracy": 0.9951220601797104, "num_tokens": 362983558.0, "step": 854 }, { "entropy": 0.019654829055070877, "epoch": 0.37422037422037424, "grad_norm": 9.1875, "learning_rate": 2.8616675690308608e-05, "loss": 0.1358, "loss_lm": 0.016731556737795472, "loss_seg": 0.11904302146285772, "mean_token_accuracy": 0.9953919798135757, "num_tokens": 363408421.0, "step": 855 }, { "entropy": 0.0213309726677835, "epoch": 0.37465805886858516, "grad_norm": 10.625, "learning_rate": 2.861396859772604e-05, "loss": 0.1614, "loss_lm": 0.01986464927904308, "loss_seg": 0.14153467118740082, "mean_token_accuracy": 0.9949818104505539, "num_tokens": 363832852.0, "step": 856 }, { "entropy": 0.0216701477766037, "epoch": 0.37509574351679614, "grad_norm": 10.75, "learning_rate": 2.8611261505143475e-05, "loss": 0.1801, "loss_lm": 0.01646050368435681, "loss_seg": 0.16368546150624752, "mean_token_accuracy": 0.9949587285518646, "num_tokens": 364258698.0, "step": 857 }, { "entropy": 0.02015209011733532, "epoch": 0.3755334281650071, "grad_norm": 8.875, "learning_rate": 2.8608554412560912e-05, "loss": 0.1854, "loss_lm": 0.01577894017100334, "loss_seg": 0.16966862604022026, "mean_token_accuracy": 0.9952917248010635, "num_tokens": 364683854.0, "step": 858 }, { "entropy": 0.021287073381245136, "epoch": 0.3759711128132181, "grad_norm": 10.0, "learning_rate": 2.8605847319978346e-05, "loss": 0.2056, "loss_lm": 0.01844388898462057, "loss_seg": 0.18714315444231033, "mean_token_accuracy": 0.9950579702854156, "num_tokens": 365108521.0, "step": 859 }, { "entropy": 0.02052746945992112, "epoch": 0.376408797461429, "grad_norm": 9.9375, "learning_rate": 2.8603140227395776e-05, "loss": 0.1558, "loss_lm": 0.019175035879015923, "loss_seg": 0.1366468481719494, "mean_token_accuracy": 0.9952903836965561, "num_tokens": 365533794.0, "step": 860 }, { "entropy": 0.020684242248535156, "epoch": 0.37684648210964, "grad_norm": 8.1875, "learning_rate": 2.860043313481321e-05, "loss": 0.149, "loss_lm": 0.01805714750662446, "loss_seg": 0.13091778941452503, "mean_token_accuracy": 0.9951713234186172, "num_tokens": 365958702.0, "step": 861 }, { "entropy": 0.02127715479582548, "epoch": 0.377284166757851, "grad_norm": 9.375, "learning_rate": 2.8597726042230644e-05, "loss": 0.2214, "loss_lm": 0.01799225341528654, "loss_seg": 0.20337276346981525, "mean_token_accuracy": 0.9950200319290161, "num_tokens": 366383362.0, "step": 862 }, { "entropy": 0.020693969912827015, "epoch": 0.37772185140606196, "grad_norm": 21.625, "learning_rate": 2.859501894964808e-05, "loss": 0.148, "loss_lm": 0.014087325427681208, "loss_seg": 0.1338701769709587, "mean_token_accuracy": 0.9951158910989761, "num_tokens": 366808791.0, "step": 863 }, { "entropy": 0.02013802109286189, "epoch": 0.3781595360542729, "grad_norm": 8.9375, "learning_rate": 2.8592311857065515e-05, "loss": 0.1471, "loss_lm": 0.016709442948922515, "loss_seg": 0.13040387444198132, "mean_token_accuracy": 0.9952547252178192, "num_tokens": 367233595.0, "step": 864 }, { "entropy": 0.020986265037208796, "epoch": 0.37859722070248386, "grad_norm": 8.75, "learning_rate": 2.8589604764482945e-05, "loss": 0.1781, "loss_lm": 0.016588534461334348, "loss_seg": 0.1615043208003044, "mean_token_accuracy": 0.9951695799827576, "num_tokens": 367658775.0, "step": 865 }, { "entropy": 0.02076698513701558, "epoch": 0.37903490535069484, "grad_norm": 10.5, "learning_rate": 2.858689767190038e-05, "loss": 0.1424, "loss_lm": 0.01757841045036912, "loss_seg": 0.12485816702246666, "mean_token_accuracy": 0.995227500796318, "num_tokens": 368083247.0, "step": 866 }, { "entropy": 0.02076162025332451, "epoch": 0.37947258999890576, "grad_norm": 12.25, "learning_rate": 2.8584190579317812e-05, "loss": 0.1435, "loss_lm": 0.01782253128476441, "loss_seg": 0.12568407133221626, "mean_token_accuracy": 0.9952037483453751, "num_tokens": 368508532.0, "step": 867 }, { "entropy": 0.020902500953525305, "epoch": 0.37991027464711674, "grad_norm": 107.5, "learning_rate": 2.858148348673525e-05, "loss": 0.1762, "loss_lm": 0.015561116393655539, "loss_seg": 0.16064178198575974, "mean_token_accuracy": 0.9952262490987778, "num_tokens": 368933962.0, "step": 868 }, { "entropy": 0.020775963086634874, "epoch": 0.3803479592953277, "grad_norm": 9.625, "learning_rate": 2.857877639415268e-05, "loss": 0.1669, "loss_lm": 0.015024399617686868, "loss_seg": 0.15191876702010632, "mean_token_accuracy": 0.9951852709054947, "num_tokens": 369359505.0, "step": 869 }, { "entropy": 0.020668822340667248, "epoch": 0.3807856439435387, "grad_norm": 10.5625, "learning_rate": 2.8576069301570114e-05, "loss": 0.2232, "loss_lm": 0.01687100506387651, "loss_seg": 0.2063010334968567, "mean_token_accuracy": 0.9951663613319397, "num_tokens": 369784644.0, "step": 870 }, { "entropy": 0.021279515232890844, "epoch": 0.3812233285917496, "grad_norm": 6.1875, "learning_rate": 2.8573362208987547e-05, "loss": 0.1803, "loss_lm": 0.016778843943029642, "loss_seg": 0.16350042261183262, "mean_token_accuracy": 0.9951684027910233, "num_tokens": 370209489.0, "step": 871 }, { "entropy": 0.021024912130087614, "epoch": 0.3816610132399606, "grad_norm": 10.75, "learning_rate": 2.857065511640498e-05, "loss": 0.1441, "loss_lm": 0.016872902866452932, "loss_seg": 0.1271989718079567, "mean_token_accuracy": 0.9952376186847687, "num_tokens": 370634847.0, "step": 872 }, { "entropy": 0.020807023625820875, "epoch": 0.3820986978881716, "grad_norm": 10.8125, "learning_rate": 2.8567948023822418e-05, "loss": 0.1072, "loss_lm": 0.016115451464429498, "loss_seg": 0.09112070687115192, "mean_token_accuracy": 0.995223343372345, "num_tokens": 371060193.0, "step": 873 }, { "entropy": 0.021274550817906857, "epoch": 0.38253638253638256, "grad_norm": 9.125, "learning_rate": 2.856524093123985e-05, "loss": 0.1824, "loss_lm": 0.014757399912923574, "loss_seg": 0.16766785085201263, "mean_token_accuracy": 0.9950313568115234, "num_tokens": 371485228.0, "step": 874 }, { "entropy": 0.020535110495984554, "epoch": 0.3829740671845935, "grad_norm": 8.8125, "learning_rate": 2.8562533838657282e-05, "loss": 0.1724, "loss_lm": 0.015199553919956088, "loss_seg": 0.15716992691159248, "mean_token_accuracy": 0.9953715205192566, "num_tokens": 371909713.0, "step": 875 }, { "entropy": 0.021571285091340542, "epoch": 0.38341175183280446, "grad_norm": 11.0, "learning_rate": 2.8559826746074716e-05, "loss": 0.1789, "loss_lm": 0.018472227733582258, "loss_seg": 0.160396046936512, "mean_token_accuracy": 0.9949877560138702, "num_tokens": 372334839.0, "step": 876 }, { "entropy": 0.02039207937195897, "epoch": 0.38384943648101544, "grad_norm": 6.96875, "learning_rate": 2.855711965349215e-05, "loss": 0.1454, "loss_lm": 0.016571360174566507, "loss_seg": 0.12887699343264103, "mean_token_accuracy": 0.9953451901674271, "num_tokens": 372759109.0, "step": 877 }, { "entropy": 0.02099448535591364, "epoch": 0.3842871211292264, "grad_norm": 11.1875, "learning_rate": 2.8554412560909587e-05, "loss": 0.1656, "loss_lm": 0.017803808208554983, "loss_seg": 0.14777523279190063, "mean_token_accuracy": 0.9950875341892242, "num_tokens": 373184939.0, "step": 878 }, { "entropy": 0.021462883334606886, "epoch": 0.38472480577743734, "grad_norm": 21.625, "learning_rate": 2.8551705468327017e-05, "loss": 0.1331, "loss_lm": 0.01724299555644393, "loss_seg": 0.1158303078263998, "mean_token_accuracy": 0.9948998391628265, "num_tokens": 373610018.0, "step": 879 }, { "entropy": 0.019971132278442383, "epoch": 0.3851624904256483, "grad_norm": 11.0, "learning_rate": 2.854899837574445e-05, "loss": 0.1809, "loss_lm": 0.01646270276978612, "loss_seg": 0.16439826413989067, "mean_token_accuracy": 0.9952734559774399, "num_tokens": 374035017.0, "step": 880 }, { "entropy": 0.020532181952148676, "epoch": 0.3856001750738593, "grad_norm": 12.4375, "learning_rate": 2.8546291283161885e-05, "loss": 0.1862, "loss_lm": 0.018558518029749393, "loss_seg": 0.16766556724905968, "mean_token_accuracy": 0.9951978027820587, "num_tokens": 374459792.0, "step": 881 }, { "entropy": 0.02081157360225916, "epoch": 0.3860378597220703, "grad_norm": 40.0, "learning_rate": 2.8543584190579318e-05, "loss": 0.2143, "loss_lm": 0.01932859467342496, "loss_seg": 0.19497066736221313, "mean_token_accuracy": 0.9950318485498428, "num_tokens": 374885076.0, "step": 882 }, { "entropy": 0.020596658810973167, "epoch": 0.3864755443702812, "grad_norm": 11.625, "learning_rate": 2.8540877097996755e-05, "loss": 0.2026, "loss_lm": 0.016500286059454083, "loss_seg": 0.18609276041388512, "mean_token_accuracy": 0.9950609356164932, "num_tokens": 375310630.0, "step": 883 }, { "entropy": 0.020068084355443716, "epoch": 0.3869132290184922, "grad_norm": 12.75, "learning_rate": 2.8538170005414186e-05, "loss": 0.2058, "loss_lm": 0.018521183636039495, "loss_seg": 0.18723680824041367, "mean_token_accuracy": 0.9952516257762909, "num_tokens": 375735332.0, "step": 884 }, { "entropy": 0.019850479904562235, "epoch": 0.38735091366670316, "grad_norm": 12.75, "learning_rate": 2.853546291283162e-05, "loss": 0.204, "loss_lm": 0.017162419855594635, "loss_seg": 0.18685490265488625, "mean_token_accuracy": 0.9952124655246735, "num_tokens": 376160545.0, "step": 885 }, { "entropy": 0.020287869032472372, "epoch": 0.3877885983149141, "grad_norm": 10.1875, "learning_rate": 2.8532755820249053e-05, "loss": 0.2262, "loss_lm": 0.018354957457631826, "loss_seg": 0.20787502080202103, "mean_token_accuracy": 0.9951356649398804, "num_tokens": 376585632.0, "step": 886 }, { "entropy": 0.020156554877758026, "epoch": 0.38822628296312506, "grad_norm": 8.625, "learning_rate": 2.8530048727666487e-05, "loss": 0.1676, "loss_lm": 0.017114494694396853, "loss_seg": 0.15049034729599953, "mean_token_accuracy": 0.9952473342418671, "num_tokens": 377010818.0, "step": 887 }, { "entropy": 0.02053353749215603, "epoch": 0.38866396761133604, "grad_norm": 13.4375, "learning_rate": 2.852734163508392e-05, "loss": 0.2294, "loss_lm": 0.017064984887838364, "loss_seg": 0.21236074715852737, "mean_token_accuracy": 0.9950968772172928, "num_tokens": 377435419.0, "step": 888 }, { "entropy": 0.01954725757241249, "epoch": 0.389101652259547, "grad_norm": 7.9375, "learning_rate": 2.8524634542501354e-05, "loss": 0.1499, "loss_lm": 0.015573407988995314, "loss_seg": 0.13429644331336021, "mean_token_accuracy": 0.9953543394804001, "num_tokens": 377860290.0, "step": 889 }, { "entropy": 0.019491024315357208, "epoch": 0.38953933690775794, "grad_norm": 8.0625, "learning_rate": 2.8521927449918788e-05, "loss": 0.1756, "loss_lm": 0.016535240225493908, "loss_seg": 0.1590999886393547, "mean_token_accuracy": 0.9953521192073822, "num_tokens": 378284526.0, "step": 890 }, { "entropy": 0.019837468396872282, "epoch": 0.3899770215559689, "grad_norm": 9.125, "learning_rate": 2.8519220357336222e-05, "loss": 0.1077, "loss_lm": 0.015420288778841496, "loss_seg": 0.0923101045191288, "mean_token_accuracy": 0.995329424738884, "num_tokens": 378710579.0, "step": 891 }, { "entropy": 0.020243441220372915, "epoch": 0.3904147062041799, "grad_norm": 6.65625, "learning_rate": 2.8516513264753656e-05, "loss": 0.1712, "loss_lm": 0.018520287238061428, "loss_seg": 0.1526737790554762, "mean_token_accuracy": 0.9951850026845932, "num_tokens": 379135690.0, "step": 892 }, { "entropy": 0.0196914947591722, "epoch": 0.3908523908523909, "grad_norm": 6.375, "learning_rate": 2.8513806172171086e-05, "loss": 0.109, "loss_lm": 0.01512259989976883, "loss_seg": 0.09383743442595005, "mean_token_accuracy": 0.9953354895114899, "num_tokens": 379560770.0, "step": 893 }, { "entropy": 0.020575968082994223, "epoch": 0.3912900755006018, "grad_norm": 10.75, "learning_rate": 2.8511099079588523e-05, "loss": 0.1999, "loss_lm": 0.021846367046236992, "loss_seg": 0.17807460762560368, "mean_token_accuracy": 0.995124027132988, "num_tokens": 379985874.0, "step": 894 }, { "entropy": 0.020959466695785522, "epoch": 0.3917277601488128, "grad_norm": 9.0625, "learning_rate": 2.8508391987005957e-05, "loss": 0.1953, "loss_lm": 0.017829666379839182, "loss_seg": 0.17748017236590385, "mean_token_accuracy": 0.9950382262468338, "num_tokens": 380411105.0, "step": 895 }, { "entropy": 0.02013287553563714, "epoch": 0.39216544479702375, "grad_norm": 17.625, "learning_rate": 2.850568489442339e-05, "loss": 0.1347, "loss_lm": 0.016718402970582247, "loss_seg": 0.1179671511054039, "mean_token_accuracy": 0.9952024519443512, "num_tokens": 380835456.0, "step": 896 }, { "entropy": 0.0208973390981555, "epoch": 0.39260312944523473, "grad_norm": 11.1875, "learning_rate": 2.8502977801840824e-05, "loss": 0.1569, "loss_lm": 0.016132632736116648, "loss_seg": 0.14079950377345085, "mean_token_accuracy": 0.995121642947197, "num_tokens": 381260531.0, "step": 897 }, { "entropy": 0.020197908394038677, "epoch": 0.39304081409344566, "grad_norm": 9.9375, "learning_rate": 2.8500270709258254e-05, "loss": 0.2243, "loss_lm": 0.01772279478609562, "loss_seg": 0.20662714540958405, "mean_token_accuracy": 0.9951600283384323, "num_tokens": 381685298.0, "step": 898 }, { "entropy": 0.021647877525538206, "epoch": 0.39347849874165663, "grad_norm": 7.46875, "learning_rate": 2.849756361667569e-05, "loss": 0.2072, "loss_lm": 0.019446011167019606, "loss_seg": 0.18771405145525932, "mean_token_accuracy": 0.9948732405900955, "num_tokens": 382110905.0, "step": 899 }, { "entropy": 0.019912623334676027, "epoch": 0.3939161833898676, "grad_norm": 7.0625, "learning_rate": 2.8494856524093125e-05, "loss": 0.1256, "loss_lm": 0.017732542706653476, "loss_seg": 0.10786745883524418, "mean_token_accuracy": 0.9952507317066193, "num_tokens": 382535965.0, "step": 900 }, { "entropy": 0.02018325449898839, "epoch": 0.3943538680380786, "grad_norm": 9.25, "learning_rate": 2.849214943151056e-05, "loss": 0.1256, "loss_lm": 0.014693517936393619, "loss_seg": 0.1109257135540247, "mean_token_accuracy": 0.9952105432748795, "num_tokens": 382960822.0, "step": 901 }, { "entropy": 0.020481571555137634, "epoch": 0.3947915526862895, "grad_norm": 12.0, "learning_rate": 2.8489442338927993e-05, "loss": 0.1323, "loss_lm": 0.015812478959560394, "loss_seg": 0.1165198516100645, "mean_token_accuracy": 0.995086133480072, "num_tokens": 383385920.0, "step": 902 }, { "entropy": 0.020418002270162106, "epoch": 0.3952292373345005, "grad_norm": 6.875, "learning_rate": 2.8486735246345423e-05, "loss": 0.2144, "loss_lm": 0.015223277965560555, "loss_seg": 0.19915659353137016, "mean_token_accuracy": 0.9951633363962173, "num_tokens": 383811399.0, "step": 903 }, { "entropy": 0.021250233985483646, "epoch": 0.3956669219827115, "grad_norm": 7.09375, "learning_rate": 2.848402815376286e-05, "loss": 0.2081, "loss_lm": 0.01844112854450941, "loss_seg": 0.18964719399809837, "mean_token_accuracy": 0.9949470013380051, "num_tokens": 384236483.0, "step": 904 }, { "entropy": 0.02097199158743024, "epoch": 0.3961046066309224, "grad_norm": 11.0625, "learning_rate": 2.8481321061180294e-05, "loss": 0.189, "loss_lm": 0.019267203751951456, "loss_seg": 0.16969550028443336, "mean_token_accuracy": 0.9949745684862137, "num_tokens": 384661927.0, "step": 905 }, { "entropy": 0.020399048924446106, "epoch": 0.3965422912791334, "grad_norm": 9.6875, "learning_rate": 2.8478613968597728e-05, "loss": 0.1463, "loss_lm": 0.01843491243198514, "loss_seg": 0.12782790325582027, "mean_token_accuracy": 0.9952029883861542, "num_tokens": 385086528.0, "step": 906 }, { "entropy": 0.020488820038735867, "epoch": 0.39697997592734435, "grad_norm": 10.125, "learning_rate": 2.847590687601516e-05, "loss": 0.1485, "loss_lm": 0.014907890930771828, "loss_seg": 0.1336047612130642, "mean_token_accuracy": 0.9952676743268967, "num_tokens": 385511034.0, "step": 907 }, { "entropy": 0.02118802396580577, "epoch": 0.39741766057555533, "grad_norm": 6.625, "learning_rate": 2.8473199783432592e-05, "loss": 0.2009, "loss_lm": 0.02063477598130703, "loss_seg": 0.18025580048561096, "mean_token_accuracy": 0.99495729804039, "num_tokens": 385936174.0, "step": 908 }, { "entropy": 0.02059537172317505, "epoch": 0.39785534522376625, "grad_norm": 9.0625, "learning_rate": 2.847049269085003e-05, "loss": 0.1907, "loss_lm": 0.01814474631100893, "loss_seg": 0.17259583622217178, "mean_token_accuracy": 0.9952135235071182, "num_tokens": 386361479.0, "step": 909 }, { "entropy": 0.02081542694941163, "epoch": 0.39829302987197723, "grad_norm": 10.75, "learning_rate": 2.8467785598267463e-05, "loss": 0.1362, "loss_lm": 0.01860963343642652, "loss_seg": 0.11761563271284103, "mean_token_accuracy": 0.9951502978801727, "num_tokens": 386786834.0, "step": 910 }, { "entropy": 0.02097789803519845, "epoch": 0.3987307145201882, "grad_norm": 8.125, "learning_rate": 2.8465078505684896e-05, "loss": 0.1714, "loss_lm": 0.017867916962131858, "loss_seg": 0.15349267795681953, "mean_token_accuracy": 0.9950609058141708, "num_tokens": 387211180.0, "step": 911 }, { "entropy": 0.020514887291938066, "epoch": 0.3991683991683992, "grad_norm": 9.1875, "learning_rate": 2.846237141310233e-05, "loss": 0.1649, "loss_lm": 0.01689703483134508, "loss_seg": 0.14798636175692081, "mean_token_accuracy": 0.9951474070549011, "num_tokens": 387636236.0, "step": 912 }, { "entropy": 0.02131469640880823, "epoch": 0.3996060838166101, "grad_norm": 13.3125, "learning_rate": 2.845966432051976e-05, "loss": 0.2492, "loss_lm": 0.01720150141045451, "loss_seg": 0.23202203214168549, "mean_token_accuracy": 0.9950220286846161, "num_tokens": 388061876.0, "step": 913 }, { "entropy": 0.020857859402894974, "epoch": 0.4000437684648211, "grad_norm": 8.125, "learning_rate": 2.8456957227937197e-05, "loss": 0.1421, "loss_lm": 0.01860899035818875, "loss_seg": 0.12353801168501377, "mean_token_accuracy": 0.9950804114341736, "num_tokens": 388487074.0, "step": 914 }, { "entropy": 0.021768008824437857, "epoch": 0.40048145311303207, "grad_norm": 6.71875, "learning_rate": 2.845425013535463e-05, "loss": 0.1811, "loss_lm": 0.018173933727666736, "loss_seg": 0.1628863923251629, "mean_token_accuracy": 0.9948484003543854, "num_tokens": 388912360.0, "step": 915 }, { "entropy": 0.020769025199115276, "epoch": 0.40091913776124305, "grad_norm": 7.09375, "learning_rate": 2.8451543042772065e-05, "loss": 0.1647, "loss_lm": 0.019862824119627476, "loss_seg": 0.14479761011898518, "mean_token_accuracy": 0.9951034039258957, "num_tokens": 389337895.0, "step": 916 }, { "entropy": 0.020029475912451744, "epoch": 0.401356822409454, "grad_norm": 27.5, "learning_rate": 2.8448835950189495e-05, "loss": 0.1612, "loss_lm": 0.015289365779608488, "loss_seg": 0.14591938816010952, "mean_token_accuracy": 0.9951997995376587, "num_tokens": 389763246.0, "step": 917 }, { "entropy": 0.020112392026931047, "epoch": 0.40179450705766495, "grad_norm": 7.59375, "learning_rate": 2.844612885760693e-05, "loss": 0.1601, "loss_lm": 0.01593224098905921, "loss_seg": 0.14419222250580788, "mean_token_accuracy": 0.9952369779348373, "num_tokens": 390187950.0, "step": 918 }, { "entropy": 0.020912664011120796, "epoch": 0.40223219170587593, "grad_norm": 8.375, "learning_rate": 2.8443421765024366e-05, "loss": 0.1796, "loss_lm": 0.01707614865154028, "loss_seg": 0.1625602599233389, "mean_token_accuracy": 0.9950738996267319, "num_tokens": 390613441.0, "step": 919 }, { "entropy": 0.02040351089090109, "epoch": 0.40266987635408685, "grad_norm": 11.0, "learning_rate": 2.84407146724418e-05, "loss": 0.1634, "loss_lm": 0.017399880103766918, "loss_seg": 0.14604058302938938, "mean_token_accuracy": 0.995151162147522, "num_tokens": 391038060.0, "step": 920 }, { "entropy": 0.02020450495183468, "epoch": 0.40310756100229783, "grad_norm": 10.8125, "learning_rate": 2.8438007579859234e-05, "loss": 0.2232, "loss_lm": 0.015647994121536613, "loss_seg": 0.2076006829738617, "mean_token_accuracy": 0.9952052235603333, "num_tokens": 391462861.0, "step": 921 }, { "entropy": 0.020629984326660633, "epoch": 0.4035452456505088, "grad_norm": 10.625, "learning_rate": 2.8435300487276664e-05, "loss": 0.2143, "loss_lm": 0.017380816396325827, "loss_seg": 0.19694600254297256, "mean_token_accuracy": 0.9951214045286179, "num_tokens": 391887918.0, "step": 922 }, { "entropy": 0.02079554833471775, "epoch": 0.4039829302987198, "grad_norm": 13.375, "learning_rate": 2.8432593394694098e-05, "loss": 0.1896, "loss_lm": 0.01624107640236616, "loss_seg": 0.17334126122295856, "mean_token_accuracy": 0.9950578808784485, "num_tokens": 392312990.0, "step": 923 }, { "entropy": 0.02094415109604597, "epoch": 0.4044206149469307, "grad_norm": 6.78125, "learning_rate": 2.842988630211153e-05, "loss": 0.2525, "loss_lm": 0.017386396415531635, "loss_seg": 0.23507652804255486, "mean_token_accuracy": 0.9950200170278549, "num_tokens": 392738088.0, "step": 924 }, { "entropy": 0.019919383339583874, "epoch": 0.4048582995951417, "grad_norm": 12.0, "learning_rate": 2.842717920952897e-05, "loss": 0.1579, "loss_lm": 0.019257449079304934, "loss_seg": 0.13863240368664265, "mean_token_accuracy": 0.9951159209012985, "num_tokens": 393163314.0, "step": 925 }, { "entropy": 0.020367702469229698, "epoch": 0.40529598424335267, "grad_norm": 6.875, "learning_rate": 2.8424472116946402e-05, "loss": 0.1658, "loss_lm": 0.017954408191144466, "loss_seg": 0.14782327599823475, "mean_token_accuracy": 0.9952104985713959, "num_tokens": 393588152.0, "step": 926 }, { "entropy": 0.019874006509780884, "epoch": 0.40573366889156365, "grad_norm": 7.34375, "learning_rate": 2.8421765024363833e-05, "loss": 0.1565, "loss_lm": 0.01805710792541504, "loss_seg": 0.13843330275267363, "mean_token_accuracy": 0.9952715933322906, "num_tokens": 394012973.0, "step": 927 }, { "entropy": 0.019978174474090338, "epoch": 0.40617135353977457, "grad_norm": 15.125, "learning_rate": 2.8419057931781266e-05, "loss": 0.1696, "loss_lm": 0.016186803113669157, "loss_seg": 0.15338971838355064, "mean_token_accuracy": 0.9952666461467743, "num_tokens": 394437783.0, "step": 928 }, { "entropy": 0.02025445317849517, "epoch": 0.40660903818798555, "grad_norm": 7.875, "learning_rate": 2.84163508391987e-05, "loss": 0.1396, "loss_lm": 0.018127588788047433, "loss_seg": 0.12150396034121513, "mean_token_accuracy": 0.9952601194381714, "num_tokens": 394862436.0, "step": 929 }, { "entropy": 0.020149104297161102, "epoch": 0.40704672283619653, "grad_norm": 9.5, "learning_rate": 2.8413643746616137e-05, "loss": 0.1985, "loss_lm": 0.017900489270687103, "loss_seg": 0.1806008517742157, "mean_token_accuracy": 0.9952666610479355, "num_tokens": 395287246.0, "step": 930 }, { "entropy": 0.021270474884659052, "epoch": 0.4074844074844075, "grad_norm": 10.4375, "learning_rate": 2.841093665403357e-05, "loss": 0.1694, "loss_lm": 0.018667503725737333, "loss_seg": 0.1507392693310976, "mean_token_accuracy": 0.9949002712965012, "num_tokens": 395712938.0, "step": 931 }, { "entropy": 0.020784291438758373, "epoch": 0.40792209213261843, "grad_norm": 10.125, "learning_rate": 2.8408229561451e-05, "loss": 0.1922, "loss_lm": 0.018367442535236478, "loss_seg": 0.17382916063070297, "mean_token_accuracy": 0.9948981702327728, "num_tokens": 396138435.0, "step": 932 }, { "entropy": 0.020189187489449978, "epoch": 0.4083597767808294, "grad_norm": 17.875, "learning_rate": 2.8405522468868435e-05, "loss": 0.1694, "loss_lm": 0.017508843448013067, "loss_seg": 0.15190106444060802, "mean_token_accuracy": 0.9952063113451004, "num_tokens": 396563337.0, "step": 933 }, { "entropy": 0.02064130362123251, "epoch": 0.4087974614290404, "grad_norm": 6.59375, "learning_rate": 2.840281537628587e-05, "loss": 0.1797, "loss_lm": 0.016631000209599733, "loss_seg": 0.16309449821710587, "mean_token_accuracy": 0.9949969053268433, "num_tokens": 396988669.0, "step": 934 }, { "entropy": 0.020353509578853846, "epoch": 0.40923514607725137, "grad_norm": 8.9375, "learning_rate": 2.8400108283703306e-05, "loss": 0.1634, "loss_lm": 0.016581806121394038, "loss_seg": 0.1468304991722107, "mean_token_accuracy": 0.9951013326644897, "num_tokens": 397414055.0, "step": 935 }, { "entropy": 0.020583574660122395, "epoch": 0.4096728307254623, "grad_norm": 7.40625, "learning_rate": 2.8397401191120736e-05, "loss": 0.1397, "loss_lm": 0.015544897876679897, "loss_seg": 0.1241240706294775, "mean_token_accuracy": 0.9951505362987518, "num_tokens": 397839007.0, "step": 936 }, { "entropy": 0.020372563041746616, "epoch": 0.41011051537367327, "grad_norm": 11.8125, "learning_rate": 2.839469409853817e-05, "loss": 0.1492, "loss_lm": 0.018571722088381648, "loss_seg": 0.1306637916713953, "mean_token_accuracy": 0.9951089322566986, "num_tokens": 398263613.0, "step": 937 }, { "entropy": 0.020976862870156765, "epoch": 0.41054820002188425, "grad_norm": 12.375, "learning_rate": 2.8391987005955604e-05, "loss": 0.2061, "loss_lm": 0.01650389889255166, "loss_seg": 0.18964112922549248, "mean_token_accuracy": 0.9950792640447617, "num_tokens": 398689117.0, "step": 938 }, { "entropy": 0.021006583236157894, "epoch": 0.41098588467009517, "grad_norm": 10.625, "learning_rate": 2.8389279913373037e-05, "loss": 0.1981, "loss_lm": 0.016906436532735825, "loss_seg": 0.18114973977208138, "mean_token_accuracy": 0.9950832724571228, "num_tokens": 399114595.0, "step": 939 }, { "entropy": 0.02055912697687745, "epoch": 0.41142356931830615, "grad_norm": 22.5, "learning_rate": 2.8386572820790474e-05, "loss": 0.13, "loss_lm": 0.015300479251891375, "loss_seg": 0.11474193446338177, "mean_token_accuracy": 0.995058536529541, "num_tokens": 399539947.0, "step": 940 }, { "entropy": 0.02084263926371932, "epoch": 0.4118612539665171, "grad_norm": 9.9375, "learning_rate": 2.8383865728207905e-05, "loss": 0.1711, "loss_lm": 0.0166943680960685, "loss_seg": 0.15435907989740372, "mean_token_accuracy": 0.9952099174261093, "num_tokens": 399964556.0, "step": 941 }, { "entropy": 0.019885553512722254, "epoch": 0.4122989386147281, "grad_norm": 15.875, "learning_rate": 2.838115863562534e-05, "loss": 0.1632, "loss_lm": 0.014426580164581537, "loss_seg": 0.148807967081666, "mean_token_accuracy": 0.995274692773819, "num_tokens": 400389462.0, "step": 942 }, { "entropy": 0.019973821472376585, "epoch": 0.41273662326293903, "grad_norm": 7.125, "learning_rate": 2.8378451543042772e-05, "loss": 0.128, "loss_lm": 0.013853142037987709, "loss_seg": 0.11415342800319195, "mean_token_accuracy": 0.9951510727405548, "num_tokens": 400814476.0, "step": 943 }, { "entropy": 0.020371999125927687, "epoch": 0.41317430791115, "grad_norm": 14.5625, "learning_rate": 2.8375744450460206e-05, "loss": 0.1314, "loss_lm": 0.017184875439852476, "loss_seg": 0.11422505043447018, "mean_token_accuracy": 0.9952651858329773, "num_tokens": 401239161.0, "step": 944 }, { "entropy": 0.02010001754388213, "epoch": 0.413611992559361, "grad_norm": 10.3125, "learning_rate": 2.8373037357877643e-05, "loss": 0.1419, "loss_lm": 0.015476525761187077, "loss_seg": 0.12638080306351185, "mean_token_accuracy": 0.9953034371137619, "num_tokens": 401663689.0, "step": 945 }, { "entropy": 0.02047025179490447, "epoch": 0.41404967720757196, "grad_norm": 8.0, "learning_rate": 2.8370330265295073e-05, "loss": 0.1556, "loss_lm": 0.01499765645712614, "loss_seg": 0.14064917340874672, "mean_token_accuracy": 0.9951149821281433, "num_tokens": 402088021.0, "step": 946 }, { "entropy": 0.020921708084642887, "epoch": 0.4144873618557829, "grad_norm": 13.4375, "learning_rate": 2.8367623172712507e-05, "loss": 0.1691, "loss_lm": 0.018127141753211617, "loss_seg": 0.15097353048622608, "mean_token_accuracy": 0.9950357973575592, "num_tokens": 402513611.0, "step": 947 }, { "entropy": 0.020706916693598032, "epoch": 0.41492504650399387, "grad_norm": 8.0625, "learning_rate": 2.836491608012994e-05, "loss": 0.1688, "loss_lm": 0.017035472439602017, "loss_seg": 0.1517876535654068, "mean_token_accuracy": 0.9951425045728683, "num_tokens": 402938070.0, "step": 948 }, { "entropy": 0.01966772973537445, "epoch": 0.41536273115220484, "grad_norm": 11.625, "learning_rate": 2.8362208987547374e-05, "loss": 0.166, "loss_lm": 0.015679315896704793, "loss_seg": 0.15031075850129128, "mean_token_accuracy": 0.9953126460313797, "num_tokens": 403362365.0, "step": 949 }, { "entropy": 0.02023933921009302, "epoch": 0.4158004158004158, "grad_norm": 10.4375, "learning_rate": 2.835950189496481e-05, "loss": 0.1283, "loss_lm": 0.01560867065563798, "loss_seg": 0.11271444521844387, "mean_token_accuracy": 0.995158925652504, "num_tokens": 403787846.0, "step": 950 }, { "entropy": 0.01977926352992654, "epoch": 0.41623810044862675, "grad_norm": 8.4375, "learning_rate": 2.8356794802382242e-05, "loss": 0.1362, "loss_lm": 0.015513006830587983, "loss_seg": 0.12068814784288406, "mean_token_accuracy": 0.9954222738742828, "num_tokens": 404212910.0, "step": 951 }, { "entropy": 0.02033070381730795, "epoch": 0.4166757850968377, "grad_norm": 6.6875, "learning_rate": 2.8354087709799676e-05, "loss": 0.1495, "loss_lm": 0.018027566373348236, "loss_seg": 0.13147328794002533, "mean_token_accuracy": 0.9952546209096909, "num_tokens": 404637285.0, "step": 952 }, { "entropy": 0.020567764528095722, "epoch": 0.4171134697450487, "grad_norm": 8.6875, "learning_rate": 2.835138061721711e-05, "loss": 0.2237, "loss_lm": 0.019187322352081537, "loss_seg": 0.20453394949436188, "mean_token_accuracy": 0.9951696842908859, "num_tokens": 405062068.0, "step": 953 }, { "entropy": 0.02106668660417199, "epoch": 0.4175511543932597, "grad_norm": 8.125, "learning_rate": 2.8348673524634543e-05, "loss": 0.2132, "loss_lm": 0.018385809380561113, "loss_seg": 0.19482460990548134, "mean_token_accuracy": 0.9950552582740784, "num_tokens": 405487544.0, "step": 954 }, { "entropy": 0.020414690021425486, "epoch": 0.4179888390414706, "grad_norm": 24.875, "learning_rate": 2.8345966432051977e-05, "loss": 0.1586, "loss_lm": 0.015659491531550884, "loss_seg": 0.14290832541882992, "mean_token_accuracy": 0.9952883422374725, "num_tokens": 405912379.0, "step": 955 }, { "entropy": 0.020671008620411158, "epoch": 0.4184265236896816, "grad_norm": 17.75, "learning_rate": 2.834325933946941e-05, "loss": 0.2372, "loss_lm": 0.0182772281114012, "loss_seg": 0.2189236767590046, "mean_token_accuracy": 0.9948977530002594, "num_tokens": 406337292.0, "step": 956 }, { "entropy": 0.019828888587653637, "epoch": 0.41886420833789256, "grad_norm": 7.1875, "learning_rate": 2.8340552246886844e-05, "loss": 0.2013, "loss_lm": 0.018417006358504295, "loss_seg": 0.1828365996479988, "mean_token_accuracy": 0.9953819811344147, "num_tokens": 406762732.0, "step": 957 }, { "entropy": 0.02095453068614006, "epoch": 0.4193018929861035, "grad_norm": 17.125, "learning_rate": 2.8337845154304278e-05, "loss": 0.2191, "loss_lm": 0.01861100923269987, "loss_seg": 0.20049047283828259, "mean_token_accuracy": 0.9950577616691589, "num_tokens": 407187622.0, "step": 958 }, { "entropy": 0.020193979144096375, "epoch": 0.41973957763431446, "grad_norm": 7.09375, "learning_rate": 2.8335138061721712e-05, "loss": 0.1998, "loss_lm": 0.018299984745681286, "loss_seg": 0.1815381795167923, "mean_token_accuracy": 0.9951495230197906, "num_tokens": 407613316.0, "step": 959 }, { "entropy": 0.020404345355927944, "epoch": 0.42017726228252544, "grad_norm": 11.25, "learning_rate": 2.8332430969139142e-05, "loss": 0.1857, "loss_lm": 0.018733825534582138, "loss_seg": 0.16693015210330486, "mean_token_accuracy": 0.995112955570221, "num_tokens": 408039535.0, "step": 960 }, { "entropy": 0.02029379829764366, "epoch": 0.4206149469307364, "grad_norm": 10.125, "learning_rate": 2.832972387655658e-05, "loss": 0.1668, "loss_lm": 0.01670343359000981, "loss_seg": 0.15008504129946232, "mean_token_accuracy": 0.9950339198112488, "num_tokens": 408464394.0, "step": 961 }, { "entropy": 0.020473212469369173, "epoch": 0.42105263157894735, "grad_norm": 20.25, "learning_rate": 2.8327016783974013e-05, "loss": 0.1863, "loss_lm": 0.01807640865445137, "loss_seg": 0.1682690680027008, "mean_token_accuracy": 0.9951661080121994, "num_tokens": 408889489.0, "step": 962 }, { "entropy": 0.020575836766511202, "epoch": 0.4214903162271583, "grad_norm": 22.375, "learning_rate": 2.8324309691391447e-05, "loss": 0.2095, "loss_lm": 0.018129930133000016, "loss_seg": 0.19135535135865211, "mean_token_accuracy": 0.9951185584068298, "num_tokens": 409314337.0, "step": 963 }, { "entropy": 0.019985761493444443, "epoch": 0.4219280008753693, "grad_norm": 10.1875, "learning_rate": 2.832160259880888e-05, "loss": 0.2005, "loss_lm": 0.015887286979705095, "loss_seg": 0.1846604384481907, "mean_token_accuracy": 0.9951908141374588, "num_tokens": 409739104.0, "step": 964 }, { "entropy": 0.021093265619128942, "epoch": 0.4223656855235803, "grad_norm": 21.25, "learning_rate": 2.831889550622631e-05, "loss": 0.1604, "loss_lm": 0.017294498393312097, "loss_seg": 0.14309772849082947, "mean_token_accuracy": 0.9948587417602539, "num_tokens": 410164057.0, "step": 965 }, { "entropy": 0.02018806431442499, "epoch": 0.4228033701717912, "grad_norm": 12.5625, "learning_rate": 2.8316188413643748e-05, "loss": 0.1939, "loss_lm": 0.015793865779414773, "loss_seg": 0.17810725793242455, "mean_token_accuracy": 0.9952649623155594, "num_tokens": 410588507.0, "step": 966 }, { "entropy": 0.020557963754981756, "epoch": 0.4232410548200022, "grad_norm": 10.625, "learning_rate": 2.831348132106118e-05, "loss": 0.2019, "loss_lm": 0.018625555792823434, "loss_seg": 0.18325288221240044, "mean_token_accuracy": 0.9950972348451614, "num_tokens": 411013753.0, "step": 967 }, { "entropy": 0.01971343159675598, "epoch": 0.42367873946821316, "grad_norm": 22.5, "learning_rate": 2.8310774228478615e-05, "loss": 0.1772, "loss_lm": 0.0163789598736912, "loss_seg": 0.1607954539358616, "mean_token_accuracy": 0.9953778684139252, "num_tokens": 411438195.0, "step": 968 }, { "entropy": 0.020487369503825903, "epoch": 0.42411642411642414, "grad_norm": 11.1875, "learning_rate": 2.830806713589605e-05, "loss": 0.0917, "loss_lm": 0.015032261842861772, "loss_seg": 0.07666331715881824, "mean_token_accuracy": 0.9951282441616058, "num_tokens": 411863069.0, "step": 969 }, { "entropy": 0.021525570191442966, "epoch": 0.42455410876463506, "grad_norm": 7.6875, "learning_rate": 2.830536004331348e-05, "loss": 0.1981, "loss_lm": 0.01832312112674117, "loss_seg": 0.17978673800826073, "mean_token_accuracy": 0.9948801100254059, "num_tokens": 412288642.0, "step": 970 }, { "entropy": 0.020548141561448574, "epoch": 0.42499179341284604, "grad_norm": 10.6875, "learning_rate": 2.8302652950730916e-05, "loss": 0.1707, "loss_lm": 0.01730383513495326, "loss_seg": 0.15343401581048965, "mean_token_accuracy": 0.995193138718605, "num_tokens": 412714873.0, "step": 971 }, { "entropy": 0.020326785743236542, "epoch": 0.425429478061057, "grad_norm": 10.75, "learning_rate": 2.829994585814835e-05, "loss": 0.1233, "loss_lm": 0.013791349716484547, "loss_seg": 0.10947663523256779, "mean_token_accuracy": 0.9950691759586334, "num_tokens": 413139095.0, "step": 972 }, { "entropy": 0.02056161966174841, "epoch": 0.425867162709268, "grad_norm": 19.25, "learning_rate": 2.8297238765565784e-05, "loss": 0.1573, "loss_lm": 0.016136688878759742, "loss_seg": 0.14112644642591476, "mean_token_accuracy": 0.9951837807893753, "num_tokens": 413564054.0, "step": 973 }, { "entropy": 0.020430988632142544, "epoch": 0.4263048473574789, "grad_norm": 8.25, "learning_rate": 2.8294531672983218e-05, "loss": 0.1536, "loss_lm": 0.017144969198852777, "loss_seg": 0.1364357341080904, "mean_token_accuracy": 0.9952259957790375, "num_tokens": 413989467.0, "step": 974 }, { "entropy": 0.020947682671248913, "epoch": 0.4267425320056899, "grad_norm": 15.625, "learning_rate": 2.8291824580400648e-05, "loss": 0.1932, "loss_lm": 0.01812335103750229, "loss_seg": 0.17508117482066154, "mean_token_accuracy": 0.9950534850358963, "num_tokens": 414414784.0, "step": 975 }, { "entropy": 0.020775779150426388, "epoch": 0.4271802166539009, "grad_norm": 11.0, "learning_rate": 2.8289117487818085e-05, "loss": 0.113, "loss_lm": 0.0170115374494344, "loss_seg": 0.09598380327224731, "mean_token_accuracy": 0.9951118379831314, "num_tokens": 414839439.0, "step": 976 }, { "entropy": 0.020811864640563726, "epoch": 0.4276179013021118, "grad_norm": 12.0, "learning_rate": 2.828641039523552e-05, "loss": 0.2082, "loss_lm": 0.016731784911826253, "loss_seg": 0.19145649299025536, "mean_token_accuracy": 0.9950685352087021, "num_tokens": 415265249.0, "step": 977 }, { "entropy": 0.020705824252218008, "epoch": 0.4280555859503228, "grad_norm": 14.5, "learning_rate": 2.8283703302652953e-05, "loss": 0.1515, "loss_lm": 0.01575105171650648, "loss_seg": 0.13572139479219913, "mean_token_accuracy": 0.9951641857624054, "num_tokens": 415690142.0, "step": 978 }, { "entropy": 0.0204891050234437, "epoch": 0.42849327059853376, "grad_norm": 9.375, "learning_rate": 2.8280996210070386e-05, "loss": 0.1986, "loss_lm": 0.01674752845428884, "loss_seg": 0.18183749355375767, "mean_token_accuracy": 0.9951179921627045, "num_tokens": 416115324.0, "step": 979 }, { "entropy": 0.020824245642870665, "epoch": 0.42893095524674474, "grad_norm": 9.5, "learning_rate": 2.8278289117487817e-05, "loss": 0.2078, "loss_lm": 0.017489104764536023, "loss_seg": 0.19027995504438877, "mean_token_accuracy": 0.9951583743095398, "num_tokens": 416539740.0, "step": 980 }, { "entropy": 0.020795991644263268, "epoch": 0.42936863989495566, "grad_norm": 11.3125, "learning_rate": 2.8275582024905254e-05, "loss": 0.152, "loss_lm": 0.018553431145846844, "loss_seg": 0.13342529814690351, "mean_token_accuracy": 0.9949855506420135, "num_tokens": 416965308.0, "step": 981 }, { "entropy": 0.020847323816269636, "epoch": 0.42980632454316664, "grad_norm": 14.8125, "learning_rate": 2.8272874932322687e-05, "loss": 0.2128, "loss_lm": 0.020751494448632002, "loss_seg": 0.19205872900784016, "mean_token_accuracy": 0.9950369447469711, "num_tokens": 417390837.0, "step": 982 }, { "entropy": 0.020112871192395687, "epoch": 0.4302440091913776, "grad_norm": 9.25, "learning_rate": 2.827016783974012e-05, "loss": 0.1556, "loss_lm": 0.01814958918839693, "loss_seg": 0.13749440386891365, "mean_token_accuracy": 0.9952119439840317, "num_tokens": 417815821.0, "step": 983 }, { "entropy": 0.020154993049800396, "epoch": 0.4306816938395886, "grad_norm": 7.25, "learning_rate": 2.826746074715755e-05, "loss": 0.1946, "loss_lm": 0.017576225101947784, "loss_seg": 0.17705070227384567, "mean_token_accuracy": 0.9952675104141235, "num_tokens": 418241117.0, "step": 984 }, { "entropy": 0.020343809854239225, "epoch": 0.4311193784877995, "grad_norm": 10.375, "learning_rate": 2.8264753654574985e-05, "loss": 0.1524, "loss_lm": 0.017135536763817072, "loss_seg": 0.1352513637393713, "mean_token_accuracy": 0.9950688034296036, "num_tokens": 418665934.0, "step": 985 }, { "entropy": 0.019960310775786638, "epoch": 0.4315570631360105, "grad_norm": 9.0, "learning_rate": 2.826204656199242e-05, "loss": 0.1458, "loss_lm": 0.01454112003557384, "loss_seg": 0.13129881769418716, "mean_token_accuracy": 0.9952907264232635, "num_tokens": 419091442.0, "step": 986 }, { "entropy": 0.02005792409181595, "epoch": 0.4319947477842215, "grad_norm": 7.375, "learning_rate": 2.8259339469409856e-05, "loss": 0.1382, "loss_lm": 0.015179998241364956, "loss_seg": 0.12305208668112755, "mean_token_accuracy": 0.995272621512413, "num_tokens": 419515961.0, "step": 987 }, { "entropy": 0.020560808945447206, "epoch": 0.43243243243243246, "grad_norm": 7.8125, "learning_rate": 2.825663237682729e-05, "loss": 0.2168, "loss_lm": 0.017590974690392613, "loss_seg": 0.1991782709956169, "mean_token_accuracy": 0.9949260354042053, "num_tokens": 419941071.0, "step": 988 }, { "entropy": 0.020181349013000727, "epoch": 0.4328701170806434, "grad_norm": 5.15625, "learning_rate": 2.825392528424472e-05, "loss": 0.1735, "loss_lm": 0.013994093285873532, "loss_seg": 0.15954497829079628, "mean_token_accuracy": 0.9951245337724686, "num_tokens": 420367216.0, "step": 989 }, { "entropy": 0.01957633439451456, "epoch": 0.43330780172885436, "grad_norm": 9.6875, "learning_rate": 2.8251218191662154e-05, "loss": 0.1747, "loss_lm": 0.016625925665721297, "loss_seg": 0.15810541436076164, "mean_token_accuracy": 0.9953273683786392, "num_tokens": 420791770.0, "step": 990 }, { "entropy": 0.01977805746719241, "epoch": 0.43374548637706534, "grad_norm": 9.5625, "learning_rate": 2.8248511099079588e-05, "loss": 0.1754, "loss_lm": 0.01598223391920328, "loss_seg": 0.15937218442559242, "mean_token_accuracy": 0.9951999932527542, "num_tokens": 421216462.0, "step": 991 }, { "entropy": 0.019566737115383148, "epoch": 0.4341831710252763, "grad_norm": 7.71875, "learning_rate": 2.8245804006497025e-05, "loss": 0.2103, "loss_lm": 0.016603730153292418, "loss_seg": 0.1936989314854145, "mean_token_accuracy": 0.9952467232942581, "num_tokens": 421641587.0, "step": 992 }, { "entropy": 0.02014221204444766, "epoch": 0.43462085567348724, "grad_norm": 11.875, "learning_rate": 2.824309691391446e-05, "loss": 0.1201, "loss_lm": 0.015616655116900802, "loss_seg": 0.10449001751840115, "mean_token_accuracy": 0.9951901286840439, "num_tokens": 422067153.0, "step": 993 }, { "entropy": 0.020264738705009222, "epoch": 0.4350585403216982, "grad_norm": 6.71875, "learning_rate": 2.824038982133189e-05, "loss": 0.1322, "loss_lm": 0.01890450483188033, "loss_seg": 0.11334007978439331, "mean_token_accuracy": 0.9952747672796249, "num_tokens": 422492488.0, "step": 994 }, { "entropy": 0.02069350890815258, "epoch": 0.4354962249699092, "grad_norm": 16.375, "learning_rate": 2.8237682728749323e-05, "loss": 0.1526, "loss_lm": 0.017341675702482462, "loss_seg": 0.13528312928974628, "mean_token_accuracy": 0.9949916452169418, "num_tokens": 422917554.0, "step": 995 }, { "entropy": 0.020482997875660658, "epoch": 0.4359339096181201, "grad_norm": 16.125, "learning_rate": 2.8234975636166756e-05, "loss": 0.1149, "loss_lm": 0.0177899906411767, "loss_seg": 0.09710963256657124, "mean_token_accuracy": 0.9950849562883377, "num_tokens": 423342365.0, "step": 996 }, { "entropy": 0.02006883267313242, "epoch": 0.4363715942663311, "grad_norm": 17.375, "learning_rate": 2.8232268543584193e-05, "loss": 0.1715, "loss_lm": 0.015501691959798336, "loss_seg": 0.1560407504439354, "mean_token_accuracy": 0.9951651096343994, "num_tokens": 423767583.0, "step": 997 }, { "entropy": 0.02083710953593254, "epoch": 0.4368092789145421, "grad_norm": 12.8125, "learning_rate": 2.8229561451001627e-05, "loss": 0.2034, "loss_lm": 0.017771105282008648, "loss_seg": 0.1856357678771019, "mean_token_accuracy": 0.995027482509613, "num_tokens": 424192894.0, "step": 998 }, { "entropy": 0.020429393276572227, "epoch": 0.43724696356275305, "grad_norm": 7.625, "learning_rate": 2.8226854358419057e-05, "loss": 0.2025, "loss_lm": 0.018525841180235147, "loss_seg": 0.1839551217854023, "mean_token_accuracy": 0.9950356483459473, "num_tokens": 424617922.0, "step": 999 }, { "entropy": 0.01981859002262354, "epoch": 0.437684648210964, "grad_norm": 5.9375, "learning_rate": 2.822414726583649e-05, "loss": 0.1254, "loss_lm": 0.014934708597138524, "loss_seg": 0.1105052512139082, "mean_token_accuracy": 0.9952787607908249, "num_tokens": 425042339.0, "step": 1000 }, { "entropy": 0.019912963267415762, "epoch": 0.43812233285917496, "grad_norm": 10.8125, "learning_rate": 2.8221440173253925e-05, "loss": 0.2417, "loss_lm": 0.016484242398291826, "loss_seg": 0.2252018079161644, "mean_token_accuracy": 0.9952494651079178, "num_tokens": 425467519.0, "step": 1001 }, { "entropy": 0.02085016667842865, "epoch": 0.43856001750738594, "grad_norm": 9.5, "learning_rate": 2.8218733080671362e-05, "loss": 0.1453, "loss_lm": 0.015102450735867023, "loss_seg": 0.13023433089256287, "mean_token_accuracy": 0.9950131177902222, "num_tokens": 425891636.0, "step": 1002 }, { "entropy": 0.020272505935281515, "epoch": 0.4389977021555969, "grad_norm": 8.625, "learning_rate": 2.8216025988088796e-05, "loss": 0.1809, "loss_lm": 0.01726068346761167, "loss_seg": 0.16361726820468903, "mean_token_accuracy": 0.9950549453496933, "num_tokens": 426317088.0, "step": 1003 }, { "entropy": 0.019764704629778862, "epoch": 0.43943538680380784, "grad_norm": 6.71875, "learning_rate": 2.8213318895506226e-05, "loss": 0.2025, "loss_lm": 0.01797176874242723, "loss_seg": 0.1845271773636341, "mean_token_accuracy": 0.9952574819326401, "num_tokens": 426741707.0, "step": 1004 }, { "entropy": 0.020624042954295874, "epoch": 0.4398730714520188, "grad_norm": 5.46875, "learning_rate": 2.821061180292366e-05, "loss": 0.1638, "loss_lm": 0.017667925683781505, "loss_seg": 0.1461542248725891, "mean_token_accuracy": 0.9952636212110519, "num_tokens": 427166037.0, "step": 1005 }, { "entropy": 0.021038701757788658, "epoch": 0.4403107561002298, "grad_norm": 15.5, "learning_rate": 2.8207904710341093e-05, "loss": 0.2047, "loss_lm": 0.01691030291840434, "loss_seg": 0.18776176124811172, "mean_token_accuracy": 0.9950232207775116, "num_tokens": 427591406.0, "step": 1006 }, { "entropy": 0.020666218362748623, "epoch": 0.4407484407484408, "grad_norm": 8.5, "learning_rate": 2.820519761775853e-05, "loss": 0.1717, "loss_lm": 0.017148385290056467, "loss_seg": 0.15450172126293182, "mean_token_accuracy": 0.9950458854436874, "num_tokens": 428016274.0, "step": 1007 }, { "entropy": 0.02054011169821024, "epoch": 0.4411861253966517, "grad_norm": 11.6875, "learning_rate": 2.820249052517596e-05, "loss": 0.1427, "loss_lm": 0.014756666729226708, "loss_seg": 0.1279711276292801, "mean_token_accuracy": 0.9950979202985764, "num_tokens": 428441158.0, "step": 1008 }, { "entropy": 0.019970895256847143, "epoch": 0.4416238100448627, "grad_norm": 7.84375, "learning_rate": 2.8199783432593395e-05, "loss": 0.0903, "loss_lm": 0.016866640653461218, "loss_seg": 0.07347598485648632, "mean_token_accuracy": 0.9951719641685486, "num_tokens": 428865917.0, "step": 1009 }, { "entropy": 0.020344486460089684, "epoch": 0.44206149469307365, "grad_norm": 10.3125, "learning_rate": 2.819707634001083e-05, "loss": 0.187, "loss_lm": 0.018652712926268578, "loss_seg": 0.16830254346132278, "mean_token_accuracy": 0.9950947314500809, "num_tokens": 429291328.0, "step": 1010 }, { "entropy": 0.019752553664147854, "epoch": 0.44249917934128463, "grad_norm": 15.25, "learning_rate": 2.8194369247428262e-05, "loss": 0.1681, "loss_lm": 0.01638447167351842, "loss_seg": 0.15170247107744217, "mean_token_accuracy": 0.9952648878097534, "num_tokens": 429716628.0, "step": 1011 }, { "entropy": 0.02018354134634137, "epoch": 0.44293686398949556, "grad_norm": 8.6875, "learning_rate": 2.81916621548457e-05, "loss": 0.1898, "loss_lm": 0.017347325570881367, "loss_seg": 0.1724044866859913, "mean_token_accuracy": 0.9951683282852173, "num_tokens": 430142128.0, "step": 1012 }, { "entropy": 0.021078567020595074, "epoch": 0.44337454863770653, "grad_norm": 11.5, "learning_rate": 2.818895506226313e-05, "loss": 0.2073, "loss_lm": 0.018535123206675053, "loss_seg": 0.18879072926938534, "mean_token_accuracy": 0.9950967133045197, "num_tokens": 430566701.0, "step": 1013 }, { "entropy": 0.020698508713394403, "epoch": 0.4438122332859175, "grad_norm": 8.5625, "learning_rate": 2.8186247969680563e-05, "loss": 0.203, "loss_lm": 0.020245956256985664, "loss_seg": 0.18279603868722916, "mean_token_accuracy": 0.9951914846897125, "num_tokens": 430991749.0, "step": 1014 }, { "entropy": 0.020327589008957148, "epoch": 0.44424991793412844, "grad_norm": 6.0, "learning_rate": 2.8183540877097997e-05, "loss": 0.1877, "loss_lm": 0.016217845026403666, "loss_seg": 0.1714358776807785, "mean_token_accuracy": 0.9951455891132355, "num_tokens": 431416684.0, "step": 1015 }, { "entropy": 0.02065076446160674, "epoch": 0.4446876025823394, "grad_norm": 11.3125, "learning_rate": 2.818083378451543e-05, "loss": 0.123, "loss_lm": 0.01636693044565618, "loss_seg": 0.10660112835466862, "mean_token_accuracy": 0.9951047450304031, "num_tokens": 431841527.0, "step": 1016 }, { "entropy": 0.01973456423729658, "epoch": 0.4451252872305504, "grad_norm": 23.0, "learning_rate": 2.8178126691932864e-05, "loss": 0.1654, "loss_lm": 0.014805666403844953, "loss_seg": 0.15062863938510418, "mean_token_accuracy": 0.9954109340906143, "num_tokens": 432265307.0, "step": 1017 }, { "entropy": 0.020122495014220476, "epoch": 0.44556297187876137, "grad_norm": 15.0, "learning_rate": 2.8175419599350298e-05, "loss": 0.2276, "loss_lm": 0.01868913322687149, "loss_seg": 0.20886543951928616, "mean_token_accuracy": 0.995169460773468, "num_tokens": 432689657.0, "step": 1018 }, { "entropy": 0.020264189690351486, "epoch": 0.4460006565269723, "grad_norm": 9.9375, "learning_rate": 2.8172712506767732e-05, "loss": 0.1652, "loss_lm": 0.016945095732808113, "loss_seg": 0.14830395206809044, "mean_token_accuracy": 0.9952627867460251, "num_tokens": 433114545.0, "step": 1019 }, { "entropy": 0.020327575504779816, "epoch": 0.4464383411751833, "grad_norm": 9.1875, "learning_rate": 2.8170005414185166e-05, "loss": 0.2417, "loss_lm": 0.015702110016718507, "loss_seg": 0.22597934305667877, "mean_token_accuracy": 0.9951139539480209, "num_tokens": 433539359.0, "step": 1020 }, { "entropy": 0.020725767593830824, "epoch": 0.44687602582339425, "grad_norm": 12.5, "learning_rate": 2.81672983216026e-05, "loss": 0.1521, "loss_lm": 0.018434582743793726, "loss_seg": 0.13363396935164928, "mean_token_accuracy": 0.9950097948312759, "num_tokens": 433964568.0, "step": 1021 }, { "entropy": 0.019653356168419123, "epoch": 0.44731371047160523, "grad_norm": 13.5625, "learning_rate": 2.8164591229020033e-05, "loss": 0.1671, "loss_lm": 0.015053408686071634, "loss_seg": 0.15208977833390236, "mean_token_accuracy": 0.9952284544706345, "num_tokens": 434390179.0, "step": 1022 }, { "entropy": 0.020102298818528652, "epoch": 0.44775139511981615, "grad_norm": 8.75, "learning_rate": 2.8161884136437467e-05, "loss": 0.144, "loss_lm": 0.01677854312583804, "loss_seg": 0.12719026394188404, "mean_token_accuracy": 0.9952247738838196, "num_tokens": 434815446.0, "step": 1023 }, { "entropy": 0.0206385119818151, "epoch": 0.44818907976802713, "grad_norm": 5.28125, "learning_rate": 2.81591770438549e-05, "loss": 0.1598, "loss_lm": 0.017755616921931505, "loss_seg": 0.1420930102467537, "mean_token_accuracy": 0.9950883537530899, "num_tokens": 435241148.0, "step": 1024 }, { "entropy": 0.02096035098657012, "epoch": 0.4486267644162381, "grad_norm": 10.0625, "learning_rate": 2.8156469951272334e-05, "loss": 0.1352, "loss_lm": 0.016016845358535647, "loss_seg": 0.11921595968306065, "mean_token_accuracy": 0.9949850887060165, "num_tokens": 435666672.0, "step": 1025 }, { "entropy": 0.020492382813245058, "epoch": 0.4490644490644491, "grad_norm": 14.5, "learning_rate": 2.8153762858689768e-05, "loss": 0.1636, "loss_lm": 0.016277360264211893, "loss_seg": 0.14733324199914932, "mean_token_accuracy": 0.9951525628566742, "num_tokens": 436091406.0, "step": 1026 }, { "entropy": 0.020899318624287844, "epoch": 0.44950213371266, "grad_norm": 24.0, "learning_rate": 2.8151055766107202e-05, "loss": 0.1719, "loss_lm": 0.01783300843089819, "loss_seg": 0.15410376712679863, "mean_token_accuracy": 0.9950696229934692, "num_tokens": 436516700.0, "step": 1027 }, { "entropy": 0.02007969282567501, "epoch": 0.449939818360871, "grad_norm": 13.1875, "learning_rate": 2.8148348673524635e-05, "loss": 0.2011, "loss_lm": 0.015229594195261598, "loss_seg": 0.185915008187294, "mean_token_accuracy": 0.9952535927295685, "num_tokens": 436941189.0, "step": 1028 }, { "entropy": 0.020167813170701265, "epoch": 0.45037750300908197, "grad_norm": 8.75, "learning_rate": 2.814564158094207e-05, "loss": 0.1506, "loss_lm": 0.016300230752676725, "loss_seg": 0.1343055795878172, "mean_token_accuracy": 0.9952492713928223, "num_tokens": 437366149.0, "step": 1029 }, { "entropy": 0.02020527981221676, "epoch": 0.4508151876572929, "grad_norm": 9.9375, "learning_rate": 2.8142934488359503e-05, "loss": 0.1756, "loss_lm": 0.018206565640866756, "loss_seg": 0.15741471946239471, "mean_token_accuracy": 0.9950987845659256, "num_tokens": 437790701.0, "step": 1030 }, { "entropy": 0.020327376667410135, "epoch": 0.45125287230550387, "grad_norm": 10.8125, "learning_rate": 2.8140227395776937e-05, "loss": 0.204, "loss_lm": 0.019318503327667713, "loss_seg": 0.18464823439717293, "mean_token_accuracy": 0.9951109141111374, "num_tokens": 438215700.0, "step": 1031 }, { "entropy": 0.01987148029729724, "epoch": 0.45169055695371485, "grad_norm": 7.28125, "learning_rate": 2.8137520303194367e-05, "loss": 0.186, "loss_lm": 0.01945845689624548, "loss_seg": 0.1665420737117529, "mean_token_accuracy": 0.9951478391885757, "num_tokens": 438641045.0, "step": 1032 }, { "entropy": 0.020267392974346876, "epoch": 0.45212824160192583, "grad_norm": 10.4375, "learning_rate": 2.8134813210611804e-05, "loss": 0.1277, "loss_lm": 0.015560805797576904, "loss_seg": 0.11209273524582386, "mean_token_accuracy": 0.9951877444982529, "num_tokens": 439066582.0, "step": 1033 }, { "entropy": 0.020092315040528774, "epoch": 0.45256592625013675, "grad_norm": 12.3125, "learning_rate": 2.8132106118029238e-05, "loss": 0.1526, "loss_lm": 0.018250735476613045, "loss_seg": 0.13436184264719486, "mean_token_accuracy": 0.9951795339584351, "num_tokens": 439492023.0, "step": 1034 }, { "entropy": 0.019843638874590397, "epoch": 0.45300361089834773, "grad_norm": 10.5, "learning_rate": 2.812939902544667e-05, "loss": 0.1566, "loss_lm": 0.01636447967030108, "loss_seg": 0.14024372398853302, "mean_token_accuracy": 0.9951796531677246, "num_tokens": 439917679.0, "step": 1035 }, { "entropy": 0.020429134368896484, "epoch": 0.4534412955465587, "grad_norm": 9.8125, "learning_rate": 2.8126691932864105e-05, "loss": 0.164, "loss_lm": 0.014436648227274418, "loss_seg": 0.14955447986721992, "mean_token_accuracy": 0.9951298236846924, "num_tokens": 440342906.0, "step": 1036 }, { "entropy": 0.019982270430773497, "epoch": 0.4538789801947697, "grad_norm": 12.3125, "learning_rate": 2.8123984840281536e-05, "loss": 0.1545, "loss_lm": 0.01579813938587904, "loss_seg": 0.13869906403124332, "mean_token_accuracy": 0.9951734393835068, "num_tokens": 440767614.0, "step": 1037 }, { "entropy": 0.02049656445160508, "epoch": 0.4543166648429806, "grad_norm": 7.59375, "learning_rate": 2.8121277747698973e-05, "loss": 0.1544, "loss_lm": 0.015798196895048022, "loss_seg": 0.13856331817805767, "mean_token_accuracy": 0.9952250868082047, "num_tokens": 441192514.0, "step": 1038 }, { "entropy": 0.019777275156229734, "epoch": 0.4547543494911916, "grad_norm": 8.25, "learning_rate": 2.8118570655116406e-05, "loss": 0.1585, "loss_lm": 0.017144569661468267, "loss_seg": 0.14135181531310081, "mean_token_accuracy": 0.9952159970998764, "num_tokens": 441617043.0, "step": 1039 }, { "entropy": 0.020153213758021593, "epoch": 0.45519203413940257, "grad_norm": 9.0625, "learning_rate": 2.811586356253384e-05, "loss": 0.137, "loss_lm": 0.015871398150920868, "loss_seg": 0.12110598850995302, "mean_token_accuracy": 0.9950856119394302, "num_tokens": 442042707.0, "step": 1040 }, { "entropy": 0.019820151384919882, "epoch": 0.45562971878761355, "grad_norm": 9.8125, "learning_rate": 2.8113156469951274e-05, "loss": 0.1571, "loss_lm": 0.01824672846123576, "loss_seg": 0.13881448283791542, "mean_token_accuracy": 0.9951030164957047, "num_tokens": 442468051.0, "step": 1041 }, { "entropy": 0.020952559541910887, "epoch": 0.45606740343582447, "grad_norm": 19.875, "learning_rate": 2.8110449377368704e-05, "loss": 0.14, "loss_lm": 0.01711463904939592, "loss_seg": 0.12293428927659988, "mean_token_accuracy": 0.9950878322124481, "num_tokens": 442893497.0, "step": 1042 }, { "entropy": 0.019648967310786247, "epoch": 0.45650508808403545, "grad_norm": 7.75, "learning_rate": 2.810774228478614e-05, "loss": 0.1507, "loss_lm": 0.01638746540993452, "loss_seg": 0.13435614854097366, "mean_token_accuracy": 0.995330423116684, "num_tokens": 443318776.0, "step": 1043 }, { "entropy": 0.02041373774409294, "epoch": 0.4569427727322464, "grad_norm": 7.3125, "learning_rate": 2.8105035192203575e-05, "loss": 0.1937, "loss_lm": 0.016494091134518385, "loss_seg": 0.17724759690463543, "mean_token_accuracy": 0.995020255446434, "num_tokens": 443743457.0, "step": 1044 }, { "entropy": 0.020835496950894594, "epoch": 0.4573804573804574, "grad_norm": 7.375, "learning_rate": 2.810232809962101e-05, "loss": 0.1327, "loss_lm": 0.01725945435464382, "loss_seg": 0.11541704833507538, "mean_token_accuracy": 0.9949132949113846, "num_tokens": 444167502.0, "step": 1045 }, { "entropy": 0.02001164248213172, "epoch": 0.45781814202866833, "grad_norm": 8.0625, "learning_rate": 2.8099621007038443e-05, "loss": 0.1437, "loss_lm": 0.016274921130388975, "loss_seg": 0.1274733506143093, "mean_token_accuracy": 0.9952379465103149, "num_tokens": 444592088.0, "step": 1046 }, { "entropy": 0.01954050175845623, "epoch": 0.4582558266768793, "grad_norm": 7.21875, "learning_rate": 2.8096913914455873e-05, "loss": 0.1594, "loss_lm": 0.016089629381895065, "loss_seg": 0.14333329908549786, "mean_token_accuracy": 0.9952248781919479, "num_tokens": 445016954.0, "step": 1047 }, { "entropy": 0.019570523407310247, "epoch": 0.4586935113250903, "grad_norm": 7.3125, "learning_rate": 2.809420682187331e-05, "loss": 0.1585, "loss_lm": 0.017305462853983045, "loss_seg": 0.141205295920372, "mean_token_accuracy": 0.9952330738306046, "num_tokens": 445441516.0, "step": 1048 }, { "entropy": 0.01994783477857709, "epoch": 0.4591311959733012, "grad_norm": 7.875, "learning_rate": 2.8091499729290744e-05, "loss": 0.146, "loss_lm": 0.017613587202504277, "loss_seg": 0.12833669409155846, "mean_token_accuracy": 0.995109960436821, "num_tokens": 445865601.0, "step": 1049 }, { "entropy": 0.019506609067320824, "epoch": 0.4595688806215122, "grad_norm": 13.5625, "learning_rate": 2.8088792636708177e-05, "loss": 0.1483, "loss_lm": 0.015425918390974402, "loss_seg": 0.13283166848123074, "mean_token_accuracy": 0.9952526390552521, "num_tokens": 446290852.0, "step": 1050 }, { "entropy": 0.020539031364023685, "epoch": 0.46000656526972317, "grad_norm": 5.25, "learning_rate": 2.808608554412561e-05, "loss": 0.1444, "loss_lm": 0.01602113712579012, "loss_seg": 0.1284177340567112, "mean_token_accuracy": 0.9950001537799835, "num_tokens": 446716021.0, "step": 1051 }, { "entropy": 0.02024256531149149, "epoch": 0.46044424991793415, "grad_norm": 9.375, "learning_rate": 2.808337845154304e-05, "loss": 0.1818, "loss_lm": 0.01592435734346509, "loss_seg": 0.16587022691965103, "mean_token_accuracy": 0.9950591176748276, "num_tokens": 447142226.0, "step": 1052 }, { "entropy": 0.01973975356668234, "epoch": 0.46088193456614507, "grad_norm": 10.3125, "learning_rate": 2.8080671358960475e-05, "loss": 0.2019, "loss_lm": 0.017175991786643863, "loss_seg": 0.1847158819437027, "mean_token_accuracy": 0.9952271282672882, "num_tokens": 447566903.0, "step": 1053 }, { "entropy": 0.01977052027359605, "epoch": 0.46131961921435605, "grad_norm": 13.625, "learning_rate": 2.8077964266377912e-05, "loss": 0.1749, "loss_lm": 0.0170962056145072, "loss_seg": 0.1578147765249014, "mean_token_accuracy": 0.9950922429561615, "num_tokens": 447991696.0, "step": 1054 }, { "entropy": 0.01988638611510396, "epoch": 0.461757303862567, "grad_norm": 9.25, "learning_rate": 2.8075257173795346e-05, "loss": 0.2154, "loss_lm": 0.018874758621677756, "loss_seg": 0.19651880487799644, "mean_token_accuracy": 0.9951729327440262, "num_tokens": 448416786.0, "step": 1055 }, { "entropy": 0.020024522207677364, "epoch": 0.462194988510778, "grad_norm": 11.4375, "learning_rate": 2.8072550081212776e-05, "loss": 0.2454, "loss_lm": 0.016818682895973325, "loss_seg": 0.228534746915102, "mean_token_accuracy": 0.9951892644166946, "num_tokens": 448842027.0, "step": 1056 }, { "entropy": 0.019299743697047234, "epoch": 0.4626326731589889, "grad_norm": 14.6875, "learning_rate": 2.806984298863021e-05, "loss": 0.1659, "loss_lm": 0.017427956918254495, "loss_seg": 0.14847850799560547, "mean_token_accuracy": 0.9953072518110275, "num_tokens": 449266490.0, "step": 1057 }, { "entropy": 0.020237802993506193, "epoch": 0.4630703578071999, "grad_norm": 8.9375, "learning_rate": 2.8067135896047644e-05, "loss": 0.1297, "loss_lm": 0.017387221101671457, "loss_seg": 0.1123055275529623, "mean_token_accuracy": 0.9951212704181671, "num_tokens": 449691991.0, "step": 1058 }, { "entropy": 0.01944993156939745, "epoch": 0.4635080424554109, "grad_norm": 5.53125, "learning_rate": 2.806442880346508e-05, "loss": 0.1643, "loss_lm": 0.017769517842680216, "loss_seg": 0.14654969982802868, "mean_token_accuracy": 0.9952557384967804, "num_tokens": 450116256.0, "step": 1059 }, { "entropy": 0.020434448029845953, "epoch": 0.46394572710362186, "grad_norm": 16.125, "learning_rate": 2.8061721710882515e-05, "loss": 0.1887, "loss_lm": 0.018061860697343946, "loss_seg": 0.17065956443548203, "mean_token_accuracy": 0.9950331598520279, "num_tokens": 450541430.0, "step": 1060 }, { "entropy": 0.01963556744158268, "epoch": 0.4643834117518328, "grad_norm": 9.5625, "learning_rate": 2.8059014618299945e-05, "loss": 0.1752, "loss_lm": 0.01903271838091314, "loss_seg": 0.15619863010942936, "mean_token_accuracy": 0.995175689458847, "num_tokens": 450966534.0, "step": 1061 }, { "entropy": 0.01973260287195444, "epoch": 0.46482109640004377, "grad_norm": 6.5625, "learning_rate": 2.805630752571738e-05, "loss": 0.1449, "loss_lm": 0.016893880674615502, "loss_seg": 0.12799490056931973, "mean_token_accuracy": 0.9951260983943939, "num_tokens": 451391808.0, "step": 1062 }, { "entropy": 0.01972159231081605, "epoch": 0.46525878104825474, "grad_norm": 7.75, "learning_rate": 2.8053600433134812e-05, "loss": 0.1395, "loss_lm": 0.015732106752693653, "loss_seg": 0.12381080910563469, "mean_token_accuracy": 0.9952213317155838, "num_tokens": 451816780.0, "step": 1063 }, { "entropy": 0.020252760499715805, "epoch": 0.4656964656964657, "grad_norm": 17.875, "learning_rate": 2.805089334055225e-05, "loss": 0.1961, "loss_lm": 0.01815148093737662, "loss_seg": 0.17790964432060719, "mean_token_accuracy": 0.9950802624225616, "num_tokens": 452241538.0, "step": 1064 }, { "entropy": 0.019519518595188856, "epoch": 0.46613415034467665, "grad_norm": 8.6875, "learning_rate": 2.8048186247969683e-05, "loss": 0.141, "loss_lm": 0.015018172562122345, "loss_seg": 0.12597630359232426, "mean_token_accuracy": 0.9952452629804611, "num_tokens": 452667397.0, "step": 1065 }, { "entropy": 0.019502229057252407, "epoch": 0.4665718349928876, "grad_norm": 7.90625, "learning_rate": 2.8045479155387114e-05, "loss": 0.1625, "loss_lm": 0.016193987801671028, "loss_seg": 0.14634876511991024, "mean_token_accuracy": 0.9952684640884399, "num_tokens": 453092163.0, "step": 1066 }, { "entropy": 0.019868611358106136, "epoch": 0.4670095196410986, "grad_norm": 10.75, "learning_rate": 2.8042772062804547e-05, "loss": 0.1077, "loss_lm": 0.01615598751232028, "loss_seg": 0.09150717407464981, "mean_token_accuracy": 0.9951758831739426, "num_tokens": 453518148.0, "step": 1067 }, { "entropy": 0.020348764024674892, "epoch": 0.4674472042893095, "grad_norm": 17.5, "learning_rate": 2.804006497022198e-05, "loss": 0.1635, "loss_lm": 0.01644234638661146, "loss_seg": 0.14705561101436615, "mean_token_accuracy": 0.9950493574142456, "num_tokens": 453943323.0, "step": 1068 }, { "entropy": 0.019717378541827202, "epoch": 0.4678848889375205, "grad_norm": 6.15625, "learning_rate": 2.8037357877639418e-05, "loss": 0.1698, "loss_lm": 0.017107069958001375, "loss_seg": 0.1526766624301672, "mean_token_accuracy": 0.9952255487442017, "num_tokens": 454367629.0, "step": 1069 }, { "entropy": 0.02012317068874836, "epoch": 0.4683225735857315, "grad_norm": 11.125, "learning_rate": 2.8034650785056852e-05, "loss": 0.1567, "loss_lm": 0.014515371294692159, "loss_seg": 0.1421385444700718, "mean_token_accuracy": 0.9951654076576233, "num_tokens": 454792616.0, "step": 1070 }, { "entropy": 0.019520265981554985, "epoch": 0.46876025823394246, "grad_norm": 10.5, "learning_rate": 2.8031943692474282e-05, "loss": 0.1621, "loss_lm": 0.016298167873173952, "loss_seg": 0.14580263011157513, "mean_token_accuracy": 0.9953795373439789, "num_tokens": 455217625.0, "step": 1071 }, { "entropy": 0.020502539817243814, "epoch": 0.4691979428821534, "grad_norm": 10.0, "learning_rate": 2.8029236599891716e-05, "loss": 0.1306, "loss_lm": 0.0172037270385772, "loss_seg": 0.11339623667299747, "mean_token_accuracy": 0.9950527399778366, "num_tokens": 455642680.0, "step": 1072 }, { "entropy": 0.020033367443829775, "epoch": 0.46963562753036436, "grad_norm": 9.0, "learning_rate": 2.802652950730915e-05, "loss": 0.1331, "loss_lm": 0.01609621848911047, "loss_seg": 0.11696268059313297, "mean_token_accuracy": 0.9950338155031204, "num_tokens": 456067529.0, "step": 1073 }, { "entropy": 0.019627545028924942, "epoch": 0.47007331217857534, "grad_norm": 31.375, "learning_rate": 2.8023822414726587e-05, "loss": 0.1387, "loss_lm": 0.01577637600712478, "loss_seg": 0.12292653322219849, "mean_token_accuracy": 0.9952114224433899, "num_tokens": 456492875.0, "step": 1074 }, { "entropy": 0.02059610979631543, "epoch": 0.4705109968267863, "grad_norm": 7.125, "learning_rate": 2.8021115322144017e-05, "loss": 0.2026, "loss_lm": 0.016367920441552997, "loss_seg": 0.1862299796193838, "mean_token_accuracy": 0.9949418306350708, "num_tokens": 456917922.0, "step": 1075 }, { "entropy": 0.020117704290896654, "epoch": 0.47094868147499724, "grad_norm": 8.125, "learning_rate": 2.801840822956145e-05, "loss": 0.1204, "loss_lm": 0.016018607886508107, "loss_seg": 0.1043872069567442, "mean_token_accuracy": 0.9951675534248352, "num_tokens": 457342920.0, "step": 1076 }, { "entropy": 0.020477585960179567, "epoch": 0.4713863661232082, "grad_norm": 5.625, "learning_rate": 2.8015701136978885e-05, "loss": 0.1843, "loss_lm": 0.015686082420870662, "loss_seg": 0.16862457990646362, "mean_token_accuracy": 0.9950172603130341, "num_tokens": 457767961.0, "step": 1077 }, { "entropy": 0.019502463284879923, "epoch": 0.4718240507714192, "grad_norm": 10.5, "learning_rate": 2.801299404439632e-05, "loss": 0.1497, "loss_lm": 0.017609507776796818, "loss_seg": 0.13211272284388542, "mean_token_accuracy": 0.9952667653560638, "num_tokens": 458193672.0, "step": 1078 }, { "entropy": 0.020010399632155895, "epoch": 0.4722617354196302, "grad_norm": 5.625, "learning_rate": 2.8010286951813755e-05, "loss": 0.1721, "loss_lm": 0.015281409956514835, "loss_seg": 0.1568342261016369, "mean_token_accuracy": 0.9950219690799713, "num_tokens": 458618307.0, "step": 1079 }, { "entropy": 0.01957724243402481, "epoch": 0.4726994200678411, "grad_norm": 10.75, "learning_rate": 2.8007579859231186e-05, "loss": 0.1392, "loss_lm": 0.015522913774475455, "loss_seg": 0.12370312213897705, "mean_token_accuracy": 0.995279535651207, "num_tokens": 459044283.0, "step": 1080 }, { "entropy": 0.020769872702658176, "epoch": 0.4731371047160521, "grad_norm": 8.0625, "learning_rate": 2.800487276664862e-05, "loss": 0.2291, "loss_lm": 0.019015196710824966, "loss_seg": 0.21010162495076656, "mean_token_accuracy": 0.995015874505043, "num_tokens": 459469636.0, "step": 1081 }, { "entropy": 0.019751563668251038, "epoch": 0.47357478936426306, "grad_norm": 11.0625, "learning_rate": 2.8002165674066053e-05, "loss": 0.1801, "loss_lm": 0.015232543228194118, "loss_seg": 0.164831031113863, "mean_token_accuracy": 0.9953251034021378, "num_tokens": 459894429.0, "step": 1082 }, { "entropy": 0.019800505600869656, "epoch": 0.47401247401247404, "grad_norm": 15.4375, "learning_rate": 2.7999458581483487e-05, "loss": 0.1855, "loss_lm": 0.01748626260086894, "loss_seg": 0.16797158680856228, "mean_token_accuracy": 0.995241641998291, "num_tokens": 460319545.0, "step": 1083 }, { "entropy": 0.020089473109692335, "epoch": 0.47445015866068496, "grad_norm": 10.1875, "learning_rate": 2.799675148890092e-05, "loss": 0.1794, "loss_lm": 0.018319970928132534, "loss_seg": 0.16106782853603363, "mean_token_accuracy": 0.995225340127945, "num_tokens": 460743828.0, "step": 1084 }, { "entropy": 0.019448634702712297, "epoch": 0.47488784330889594, "grad_norm": 8.5625, "learning_rate": 2.7994044396318354e-05, "loss": 0.1846, "loss_lm": 0.016320843482390046, "loss_seg": 0.16828666627407074, "mean_token_accuracy": 0.9952933490276337, "num_tokens": 461168070.0, "step": 1085 }, { "entropy": 0.019398999866098166, "epoch": 0.4753255279571069, "grad_norm": 9.3125, "learning_rate": 2.7991337303735788e-05, "loss": 0.1676, "loss_lm": 0.016520884353667498, "loss_seg": 0.15102976188063622, "mean_token_accuracy": 0.9953209608793259, "num_tokens": 461592880.0, "step": 1086 }, { "entropy": 0.020037363283336163, "epoch": 0.47576321260531784, "grad_norm": 14.125, "learning_rate": 2.7988630211153222e-05, "loss": 0.1927, "loss_lm": 0.01605818676762283, "loss_seg": 0.17661066725850105, "mean_token_accuracy": 0.995099350810051, "num_tokens": 462018073.0, "step": 1087 }, { "entropy": 0.020185827277600765, "epoch": 0.4762008972535288, "grad_norm": 8.3125, "learning_rate": 2.7985923118570656e-05, "loss": 0.1486, "loss_lm": 0.015761381713673472, "loss_seg": 0.13281520642340183, "mean_token_accuracy": 0.9950814694166183, "num_tokens": 462443559.0, "step": 1088 }, { "entropy": 0.020557241048663855, "epoch": 0.4766385819017398, "grad_norm": 15.25, "learning_rate": 2.798321602598809e-05, "loss": 0.1612, "loss_lm": 0.018103412352502346, "loss_seg": 0.14308759476989508, "mean_token_accuracy": 0.9950115233659744, "num_tokens": 462868325.0, "step": 1089 }, { "entropy": 0.020034470595419407, "epoch": 0.4770762665499508, "grad_norm": 11.25, "learning_rate": 2.7980508933405523e-05, "loss": 0.1865, "loss_lm": 0.01545076072216034, "loss_seg": 0.17103474959731102, "mean_token_accuracy": 0.9950823485851288, "num_tokens": 463293505.0, "step": 1090 }, { "entropy": 0.020073438063263893, "epoch": 0.4775139511981617, "grad_norm": 8.5, "learning_rate": 2.7977801840822957e-05, "loss": 0.1718, "loss_lm": 0.015452876454219222, "loss_seg": 0.15634028241038322, "mean_token_accuracy": 0.9951553791761398, "num_tokens": 463718078.0, "step": 1091 }, { "entropy": 0.020063691306859255, "epoch": 0.4779516358463727, "grad_norm": 11.25, "learning_rate": 2.797509474824039e-05, "loss": 0.157, "loss_lm": 0.017993425484746695, "loss_seg": 0.13905349746346474, "mean_token_accuracy": 0.9952315539121628, "num_tokens": 464142710.0, "step": 1092 }, { "entropy": 0.019909593742340803, "epoch": 0.47838932049458366, "grad_norm": 14.375, "learning_rate": 2.7972387655657824e-05, "loss": 0.1616, "loss_lm": 0.014919150155037642, "loss_seg": 0.14672615006566048, "mean_token_accuracy": 0.9953067749738693, "num_tokens": 464567347.0, "step": 1093 }, { "entropy": 0.0211784103885293, "epoch": 0.47882700514279464, "grad_norm": 14.6875, "learning_rate": 2.7969680563075258e-05, "loss": 0.1888, "loss_lm": 0.01657710107974708, "loss_seg": 0.17223674431443214, "mean_token_accuracy": 0.9948578625917435, "num_tokens": 464993429.0, "step": 1094 }, { "entropy": 0.019824745133519173, "epoch": 0.47926468979100556, "grad_norm": 9.375, "learning_rate": 2.7966973470492692e-05, "loss": 0.1586, "loss_lm": 0.017888385336846113, "loss_seg": 0.14069318771362305, "mean_token_accuracy": 0.995183989405632, "num_tokens": 465418866.0, "step": 1095 }, { "entropy": 0.0205123545601964, "epoch": 0.47970237443921654, "grad_norm": 10.0, "learning_rate": 2.7964266377910125e-05, "loss": 0.138, "loss_lm": 0.01866045710630715, "loss_seg": 0.11934014223515987, "mean_token_accuracy": 0.9949604123830795, "num_tokens": 465844673.0, "step": 1096 }, { "entropy": 0.02014464046806097, "epoch": 0.4801400590874275, "grad_norm": 11.75, "learning_rate": 2.796155928532756e-05, "loss": 0.1702, "loss_lm": 0.017818002961575985, "loss_seg": 0.15242897160351276, "mean_token_accuracy": 0.9951937049627304, "num_tokens": 466270343.0, "step": 1097 }, { "entropy": 0.019799938425421715, "epoch": 0.4805777437356385, "grad_norm": 7.03125, "learning_rate": 2.7958852192744993e-05, "loss": 0.1667, "loss_lm": 0.015186608768999577, "loss_seg": 0.15154528059065342, "mean_token_accuracy": 0.9953393340110779, "num_tokens": 466695123.0, "step": 1098 }, { "entropy": 0.020151672419160604, "epoch": 0.4810154283838494, "grad_norm": 8.9375, "learning_rate": 2.7956145100162423e-05, "loss": 0.1451, "loss_lm": 0.013982788193970919, "loss_seg": 0.13115357235074043, "mean_token_accuracy": 0.9951880574226379, "num_tokens": 467119655.0, "step": 1099 }, { "entropy": 0.020379184745252132, "epoch": 0.4814531130320604, "grad_norm": 12.9375, "learning_rate": 2.795343800757986e-05, "loss": 0.1516, "loss_lm": 0.016271396074444056, "loss_seg": 0.13532419316470623, "mean_token_accuracy": 0.9950847923755646, "num_tokens": 467545215.0, "step": 1100 }, { "entropy": 0.020105741918087006, "epoch": 0.4818907976802714, "grad_norm": 8.4375, "learning_rate": 2.7950730914997294e-05, "loss": 0.1451, "loss_lm": 0.013858291553333402, "loss_seg": 0.1312569770962, "mean_token_accuracy": 0.9952713549137115, "num_tokens": 467969971.0, "step": 1101 }, { "entropy": 0.020234883297234774, "epoch": 0.48232848232848236, "grad_norm": 6.9375, "learning_rate": 2.7948023822414728e-05, "loss": 0.1616, "loss_lm": 0.01637217658571899, "loss_seg": 0.1452745385468006, "mean_token_accuracy": 0.9953076392412186, "num_tokens": 468394854.0, "step": 1102 }, { "entropy": 0.0203901045024395, "epoch": 0.4827661669766933, "grad_norm": 9.3125, "learning_rate": 2.794531672983216e-05, "loss": 0.1842, "loss_lm": 0.018074539955705404, "loss_seg": 0.1661224476993084, "mean_token_accuracy": 0.9951541125774384, "num_tokens": 468819911.0, "step": 1103 }, { "entropy": 0.020799081306904554, "epoch": 0.48320385162490426, "grad_norm": 17.125, "learning_rate": 2.7942609637249592e-05, "loss": 0.1705, "loss_lm": 0.01830192655324936, "loss_seg": 0.15224340558052063, "mean_token_accuracy": 0.9949770271778107, "num_tokens": 469245707.0, "step": 1104 }, { "entropy": 0.020589864812791348, "epoch": 0.48364153627311524, "grad_norm": 12.625, "learning_rate": 2.793990254466703e-05, "loss": 0.1718, "loss_lm": 0.015645135659724474, "loss_seg": 0.15618086978793144, "mean_token_accuracy": 0.9951393753290176, "num_tokens": 469670926.0, "step": 1105 }, { "entropy": 0.020077296998351812, "epoch": 0.48407922092132616, "grad_norm": 6.21875, "learning_rate": 2.7937195452084463e-05, "loss": 0.1445, "loss_lm": 0.016794369788840413, "loss_seg": 0.1276857890188694, "mean_token_accuracy": 0.9951274991035461, "num_tokens": 470096355.0, "step": 1106 }, { "entropy": 0.02020067535340786, "epoch": 0.48451690556953714, "grad_norm": 12.4375, "learning_rate": 2.7934488359501896e-05, "loss": 0.1989, "loss_lm": 0.017363757360726595, "loss_seg": 0.18152068182826042, "mean_token_accuracy": 0.9951811581850052, "num_tokens": 470521527.0, "step": 1107 }, { "entropy": 0.020120875909924507, "epoch": 0.4849545902177481, "grad_norm": 27.875, "learning_rate": 2.793178126691933e-05, "loss": 0.1507, "loss_lm": 0.0187158421613276, "loss_seg": 0.132002592086792, "mean_token_accuracy": 0.9951335191726685, "num_tokens": 470946658.0, "step": 1108 }, { "entropy": 0.019789457321166992, "epoch": 0.4853922748659591, "grad_norm": 7.4375, "learning_rate": 2.792907417433676e-05, "loss": 0.1702, "loss_lm": 0.01605917909182608, "loss_seg": 0.1540963537991047, "mean_token_accuracy": 0.9952192902565002, "num_tokens": 471371453.0, "step": 1109 }, { "entropy": 0.02006210759282112, "epoch": 0.48582995951417, "grad_norm": 12.4375, "learning_rate": 2.7926367081754198e-05, "loss": 0.1755, "loss_lm": 0.016024702927097678, "loss_seg": 0.15951024368405342, "mean_token_accuracy": 0.995177373290062, "num_tokens": 471796079.0, "step": 1110 }, { "entropy": 0.020828850101679564, "epoch": 0.486267644162381, "grad_norm": 6.9375, "learning_rate": 2.792365998917163e-05, "loss": 0.2176, "loss_lm": 0.019109081011265516, "loss_seg": 0.19847945496439934, "mean_token_accuracy": 0.9950205981731415, "num_tokens": 472222825.0, "step": 1111 }, { "entropy": 0.019907483831048012, "epoch": 0.486705328810592, "grad_norm": 15.3125, "learning_rate": 2.7920952896589065e-05, "loss": 0.1484, "loss_lm": 0.014416008489206433, "loss_seg": 0.13400030322372913, "mean_token_accuracy": 0.9952324777841568, "num_tokens": 472647144.0, "step": 1112 }, { "entropy": 0.02042722189798951, "epoch": 0.48714301345880295, "grad_norm": 7.46875, "learning_rate": 2.79182458040065e-05, "loss": 0.1704, "loss_lm": 0.018198419362306595, "loss_seg": 0.15219812467694283, "mean_token_accuracy": 0.9951159209012985, "num_tokens": 473072381.0, "step": 1113 }, { "entropy": 0.02014303905889392, "epoch": 0.4875806981070139, "grad_norm": 4.40625, "learning_rate": 2.791553871142393e-05, "loss": 0.1499, "loss_lm": 0.01852411264553666, "loss_seg": 0.13134832493960857, "mean_token_accuracy": 0.9951721876859665, "num_tokens": 473497195.0, "step": 1114 }, { "entropy": 0.02020400110632181, "epoch": 0.48801838275522486, "grad_norm": 9.8125, "learning_rate": 2.7912831618841366e-05, "loss": 0.1226, "loss_lm": 0.018760915379971266, "loss_seg": 0.10385102964937687, "mean_token_accuracy": 0.9951084554195404, "num_tokens": 473922378.0, "step": 1115 }, { "entropy": 0.020359400659799576, "epoch": 0.48845606740343583, "grad_norm": 9.9375, "learning_rate": 2.79101245262588e-05, "loss": 0.154, "loss_lm": 0.017017280450090766, "loss_seg": 0.13695436529815197, "mean_token_accuracy": 0.9951376169919968, "num_tokens": 474347857.0, "step": 1116 }, { "entropy": 0.020195574965327978, "epoch": 0.4888937520516468, "grad_norm": 31.75, "learning_rate": 2.7907417433676234e-05, "loss": 0.1848, "loss_lm": 0.01722407853230834, "loss_seg": 0.16757380031049252, "mean_token_accuracy": 0.9952024817466736, "num_tokens": 474773058.0, "step": 1117 }, { "entropy": 0.020733681973069906, "epoch": 0.48933143669985774, "grad_norm": 8.875, "learning_rate": 2.7904710341093667e-05, "loss": 0.1316, "loss_lm": 0.01712454273365438, "loss_seg": 0.11447219550609589, "mean_token_accuracy": 0.9950460940599442, "num_tokens": 475198529.0, "step": 1118 }, { "entropy": 0.020084443036466837, "epoch": 0.4897691213480687, "grad_norm": 8.8125, "learning_rate": 2.7902003248511098e-05, "loss": 0.1151, "loss_lm": 0.013151870807632804, "loss_seg": 0.10190006718039513, "mean_token_accuracy": 0.9952030032873154, "num_tokens": 475623930.0, "step": 1119 }, { "entropy": 0.020465183071792126, "epoch": 0.4902068059962797, "grad_norm": 9.9375, "learning_rate": 2.789929615592853e-05, "loss": 0.1491, "loss_lm": 0.0160646247677505, "loss_seg": 0.1330068502575159, "mean_token_accuracy": 0.9949934333562851, "num_tokens": 476048933.0, "step": 1120 }, { "entropy": 0.020650448743253946, "epoch": 0.49064449064449067, "grad_norm": 9.75, "learning_rate": 2.789658906334597e-05, "loss": 0.1965, "loss_lm": 0.020291860215365887, "loss_seg": 0.17619748041033745, "mean_token_accuracy": 0.9950079619884491, "num_tokens": 476474153.0, "step": 1121 }, { "entropy": 0.0202229842543602, "epoch": 0.4910821752927016, "grad_norm": 11.8125, "learning_rate": 2.7893881970763402e-05, "loss": 0.1512, "loss_lm": 0.015858091646805406, "loss_seg": 0.13531152717769146, "mean_token_accuracy": 0.9951319843530655, "num_tokens": 476898321.0, "step": 1122 }, { "entropy": 0.01977708889171481, "epoch": 0.4915198599409126, "grad_norm": 19.125, "learning_rate": 2.7891174878180833e-05, "loss": 0.1686, "loss_lm": 0.014845427358523011, "loss_seg": 0.15375344827771187, "mean_token_accuracy": 0.9951463937759399, "num_tokens": 477323944.0, "step": 1123 }, { "entropy": 0.020171197596937418, "epoch": 0.49195754458912355, "grad_norm": 10.25, "learning_rate": 2.7888467785598266e-05, "loss": 0.1596, "loss_lm": 0.01961248298175633, "loss_seg": 0.13996219635009766, "mean_token_accuracy": 0.9951231628656387, "num_tokens": 477749392.0, "step": 1124 }, { "entropy": 0.020728038158267736, "epoch": 0.4923952292373345, "grad_norm": 9.4375, "learning_rate": 2.78857606930157e-05, "loss": 0.1777, "loss_lm": 0.01603048713877797, "loss_seg": 0.16162813641130924, "mean_token_accuracy": 0.9949483126401901, "num_tokens": 478174387.0, "step": 1125 }, { "entropy": 0.01977170305326581, "epoch": 0.49283291388554545, "grad_norm": 8.4375, "learning_rate": 2.7883053600433137e-05, "loss": 0.2001, "loss_lm": 0.01642019860446453, "loss_seg": 0.18370952270925045, "mean_token_accuracy": 0.99519182741642, "num_tokens": 478599489.0, "step": 1126 }, { "entropy": 0.019501239992678165, "epoch": 0.49327059853375643, "grad_norm": 8.25, "learning_rate": 2.788034650785057e-05, "loss": 0.1317, "loss_lm": 0.015507131582126021, "loss_seg": 0.11621286161243916, "mean_token_accuracy": 0.995228722691536, "num_tokens": 479024920.0, "step": 1127 }, { "entropy": 0.020102931186556816, "epoch": 0.4937082831819674, "grad_norm": 13.1875, "learning_rate": 2.7877639415268e-05, "loss": 0.1409, "loss_lm": 0.019829262513667345, "loss_seg": 0.1210810299962759, "mean_token_accuracy": 0.9950928539037704, "num_tokens": 479449569.0, "step": 1128 }, { "entropy": 0.020013216882944107, "epoch": 0.49414596783017833, "grad_norm": 12.375, "learning_rate": 2.7874932322685435e-05, "loss": 0.171, "loss_lm": 0.019048758316785097, "loss_seg": 0.15196671709418297, "mean_token_accuracy": 0.9952138960361481, "num_tokens": 479873862.0, "step": 1129 }, { "entropy": 0.019941252656280994, "epoch": 0.4945836524783893, "grad_norm": 7.09375, "learning_rate": 2.787222523010287e-05, "loss": 0.2012, "loss_lm": 0.01648367429152131, "loss_seg": 0.18468765541911125, "mean_token_accuracy": 0.9952042698860168, "num_tokens": 480298577.0, "step": 1130 }, { "entropy": 0.019657188560813665, "epoch": 0.4950213371266003, "grad_norm": 10.6875, "learning_rate": 2.7869518137520306e-05, "loss": 0.1668, "loss_lm": 0.01755379606038332, "loss_seg": 0.1492295991629362, "mean_token_accuracy": 0.9952863305807114, "num_tokens": 480723461.0, "step": 1131 }, { "entropy": 0.019542166031897068, "epoch": 0.49545902177481127, "grad_norm": 12.375, "learning_rate": 2.786681104493774e-05, "loss": 0.1756, "loss_lm": 0.018057395238429308, "loss_seg": 0.15752716176211834, "mean_token_accuracy": 0.9952615201473236, "num_tokens": 481148882.0, "step": 1132 }, { "entropy": 0.02006074832752347, "epoch": 0.4958967064230222, "grad_norm": 8.875, "learning_rate": 2.786410395235517e-05, "loss": 0.166, "loss_lm": 0.015869349241256714, "loss_seg": 0.15012706629931927, "mean_token_accuracy": 0.9951558262109756, "num_tokens": 481573662.0, "step": 1133 }, { "entropy": 0.020228181965649128, "epoch": 0.49633439107123317, "grad_norm": 16.0, "learning_rate": 2.7861396859772604e-05, "loss": 0.1321, "loss_lm": 0.017574451863765717, "loss_seg": 0.1145158838480711, "mean_token_accuracy": 0.9950899034738541, "num_tokens": 481998486.0, "step": 1134 }, { "entropy": 0.020165807101875544, "epoch": 0.49677207571944415, "grad_norm": 10.8125, "learning_rate": 2.7858689767190037e-05, "loss": 0.18, "loss_lm": 0.016063845017924905, "loss_seg": 0.16389813087880611, "mean_token_accuracy": 0.9951011836528778, "num_tokens": 482423665.0, "step": 1135 }, { "entropy": 0.019716350361704826, "epoch": 0.49720976036765513, "grad_norm": 5.25, "learning_rate": 2.7855982674607474e-05, "loss": 0.1436, "loss_lm": 0.01717864559032023, "loss_seg": 0.12641820311546326, "mean_token_accuracy": 0.995142936706543, "num_tokens": 482848574.0, "step": 1136 }, { "entropy": 0.020494448021054268, "epoch": 0.49764744501586605, "grad_norm": 13.0, "learning_rate": 2.7853275582024908e-05, "loss": 0.1554, "loss_lm": 0.020488853566348553, "loss_seg": 0.1348909940570593, "mean_token_accuracy": 0.9950167536735535, "num_tokens": 483273365.0, "step": 1137 }, { "entropy": 0.020037305541336536, "epoch": 0.49808512966407703, "grad_norm": 9.0, "learning_rate": 2.785056848944234e-05, "loss": 0.1615, "loss_lm": 0.017315885750576854, "loss_seg": 0.144160445779562, "mean_token_accuracy": 0.9951564818620682, "num_tokens": 483698634.0, "step": 1138 }, { "entropy": 0.019740702584385872, "epoch": 0.498522814312288, "grad_norm": 17.125, "learning_rate": 2.7847861396859772e-05, "loss": 0.1456, "loss_lm": 0.016720150830224156, "loss_seg": 0.12887064553797245, "mean_token_accuracy": 0.9952638447284698, "num_tokens": 484123830.0, "step": 1139 }, { "entropy": 0.01936609437689185, "epoch": 0.498960498960499, "grad_norm": 7.3125, "learning_rate": 2.7845154304277206e-05, "loss": 0.1692, "loss_lm": 0.013352161273360252, "loss_seg": 0.15588182397186756, "mean_token_accuracy": 0.9952403753995895, "num_tokens": 484549444.0, "step": 1140 }, { "entropy": 0.01983680482953787, "epoch": 0.4993981836087099, "grad_norm": 8.8125, "learning_rate": 2.7842447211694643e-05, "loss": 0.1804, "loss_lm": 0.01746720541268587, "loss_seg": 0.1628999523818493, "mean_token_accuracy": 0.9951337724924088, "num_tokens": 484974359.0, "step": 1141 }, { "entropy": 0.02021007426083088, "epoch": 0.4998358682569209, "grad_norm": 6.15625, "learning_rate": 2.7839740119112077e-05, "loss": 0.1058, "loss_lm": 0.014250181848183274, "loss_seg": 0.09159207623451948, "mean_token_accuracy": 0.9950062781572342, "num_tokens": 485399481.0, "step": 1142 }, { "entropy": 0.019476480316370726, "epoch": 0.5002735529051319, "grad_norm": 14.25, "learning_rate": 2.7837033026529507e-05, "loss": 0.1536, "loss_lm": 0.015553118428215384, "loss_seg": 0.13803115114569664, "mean_token_accuracy": 0.9951896369457245, "num_tokens": 485824540.0, "step": 1143 }, { "entropy": 0.020293437410145998, "epoch": 0.5007112375533428, "grad_norm": 7.25, "learning_rate": 2.783432593394694e-05, "loss": 0.156, "loss_lm": 0.01574294571764767, "loss_seg": 0.14021971821784973, "mean_token_accuracy": 0.995030015707016, "num_tokens": 486249091.0, "step": 1144 }, { "entropy": 0.019475488923490047, "epoch": 0.5011489222015538, "grad_norm": 10.0, "learning_rate": 2.7831618841364375e-05, "loss": 0.2369, "loss_lm": 0.01580022810958326, "loss_seg": 0.2210858054459095, "mean_token_accuracy": 0.9952679127454758, "num_tokens": 486673172.0, "step": 1145 }, { "entropy": 0.019427382852882147, "epoch": 0.5015866068497647, "grad_norm": 17.25, "learning_rate": 2.7828911748781812e-05, "loss": 0.208, "loss_lm": 0.01670202985405922, "loss_seg": 0.19125601276755333, "mean_token_accuracy": 0.9952255040407181, "num_tokens": 487098081.0, "step": 1146 }, { "entropy": 0.019389419816434383, "epoch": 0.5020242914979757, "grad_norm": 10.0, "learning_rate": 2.7826204656199242e-05, "loss": 0.1499, "loss_lm": 0.017372711095958948, "loss_seg": 0.1325739026069641, "mean_token_accuracy": 0.9953373521566391, "num_tokens": 487523531.0, "step": 1147 }, { "entropy": 0.019587614107877016, "epoch": 0.5024619761461867, "grad_norm": 6.96875, "learning_rate": 2.7823497563616676e-05, "loss": 0.1075, "loss_lm": 0.014489737572148442, "loss_seg": 0.09299234673380852, "mean_token_accuracy": 0.9951870292425156, "num_tokens": 487948584.0, "step": 1148 }, { "entropy": 0.019087249413132668, "epoch": 0.5028996607943976, "grad_norm": 11.375, "learning_rate": 2.782079047103411e-05, "loss": 0.1501, "loss_lm": 0.015490321908146143, "loss_seg": 0.13463258370757103, "mean_token_accuracy": 0.9953596591949463, "num_tokens": 488373945.0, "step": 1149 }, { "entropy": 0.01953554479405284, "epoch": 0.5033373454426086, "grad_norm": 13.5, "learning_rate": 2.7818083378451543e-05, "loss": 0.2165, "loss_lm": 0.016519228229299188, "loss_seg": 0.19994153827428818, "mean_token_accuracy": 0.995295375585556, "num_tokens": 488799240.0, "step": 1150 }, { "entropy": 0.02044252073392272, "epoch": 0.5037750300908196, "grad_norm": 13.125, "learning_rate": 2.7815376285868977e-05, "loss": 0.1526, "loss_lm": 0.018204452004283667, "loss_seg": 0.13435794971883297, "mean_token_accuracy": 0.9951442033052444, "num_tokens": 489224270.0, "step": 1151 }, { "entropy": 0.019354455173015594, "epoch": 0.5042127147390305, "grad_norm": 16.0, "learning_rate": 2.781266919328641e-05, "loss": 0.1438, "loss_lm": 0.015514494618400931, "loss_seg": 0.12824741378426552, "mean_token_accuracy": 0.995354101061821, "num_tokens": 489649360.0, "step": 1152 }, { "entropy": 0.019806901924312115, "epoch": 0.5046503993872415, "grad_norm": 7.28125, "learning_rate": 2.7809962100703844e-05, "loss": 0.1205, "loss_lm": 0.014865056378766894, "loss_seg": 0.10566135495901108, "mean_token_accuracy": 0.9952570348978043, "num_tokens": 490074582.0, "step": 1153 }, { "entropy": 0.020079978741705418, "epoch": 0.5050880840354525, "grad_norm": 9.875, "learning_rate": 2.7807255008121278e-05, "loss": 0.1293, "loss_lm": 0.015848195645958185, "loss_seg": 0.11342810094356537, "mean_token_accuracy": 0.9951430857181549, "num_tokens": 490499518.0, "step": 1154 }, { "entropy": 0.019719477277249098, "epoch": 0.5055257686836634, "grad_norm": 26.125, "learning_rate": 2.7804547915538712e-05, "loss": 0.1882, "loss_lm": 0.017060321290045977, "loss_seg": 0.17110422626137733, "mean_token_accuracy": 0.9953031539916992, "num_tokens": 490924661.0, "step": 1155 }, { "entropy": 0.019590890500694513, "epoch": 0.5059634533318744, "grad_norm": 7.75, "learning_rate": 2.7801840822956146e-05, "loss": 0.162, "loss_lm": 0.017416818533092737, "loss_seg": 0.1445488352328539, "mean_token_accuracy": 0.9952874481678009, "num_tokens": 491350504.0, "step": 1156 }, { "entropy": 0.019985038321465254, "epoch": 0.5064011379800853, "grad_norm": 10.25, "learning_rate": 2.779913373037358e-05, "loss": 0.1533, "loss_lm": 0.017191979568451643, "loss_seg": 0.13611316680908203, "mean_token_accuracy": 0.9951635152101517, "num_tokens": 491775988.0, "step": 1157 }, { "entropy": 0.01983813475817442, "epoch": 0.5068388226282963, "grad_norm": 10.375, "learning_rate": 2.7796426637791013e-05, "loss": 0.1572, "loss_lm": 0.018128208816051483, "loss_seg": 0.1390778385102749, "mean_token_accuracy": 0.9951527416706085, "num_tokens": 492201551.0, "step": 1158 }, { "entropy": 0.020045298617333174, "epoch": 0.5072765072765073, "grad_norm": 8.0625, "learning_rate": 2.7793719545208447e-05, "loss": 0.1865, "loss_lm": 0.018075039144605398, "loss_seg": 0.16844230517745018, "mean_token_accuracy": 0.9952651709318161, "num_tokens": 492626642.0, "step": 1159 }, { "entropy": 0.02005120273679495, "epoch": 0.5077141919247182, "grad_norm": 12.6875, "learning_rate": 2.779101245262588e-05, "loss": 0.2137, "loss_lm": 0.017625866224989295, "loss_seg": 0.19607402011752129, "mean_token_accuracy": 0.9951895624399185, "num_tokens": 493051320.0, "step": 1160 }, { "entropy": 0.019151565618813038, "epoch": 0.5081518765729292, "grad_norm": 11.1875, "learning_rate": 2.7788305360043314e-05, "loss": 0.153, "loss_lm": 0.01597250928170979, "loss_seg": 0.13700860738754272, "mean_token_accuracy": 0.9953449219465256, "num_tokens": 493475962.0, "step": 1161 }, { "entropy": 0.019605621695518494, "epoch": 0.5085895612211402, "grad_norm": 15.125, "learning_rate": 2.7785598267460748e-05, "loss": 0.1518, "loss_lm": 0.016558618051931262, "loss_seg": 0.13525067828595638, "mean_token_accuracy": 0.9953371584415436, "num_tokens": 493900513.0, "step": 1162 }, { "entropy": 0.01994952652603388, "epoch": 0.5090272458693511, "grad_norm": 33.75, "learning_rate": 2.7782891174878182e-05, "loss": 0.1717, "loss_lm": 0.014120992505922914, "loss_seg": 0.15760214813053608, "mean_token_accuracy": 0.9951269030570984, "num_tokens": 494326072.0, "step": 1163 }, { "entropy": 0.01970321126282215, "epoch": 0.5094649305175621, "grad_norm": 9.0625, "learning_rate": 2.7780184082295615e-05, "loss": 0.1275, "loss_lm": 0.01537386397831142, "loss_seg": 0.112089853733778, "mean_token_accuracy": 0.9951967746019363, "num_tokens": 494751166.0, "step": 1164 }, { "entropy": 0.020056805107742548, "epoch": 0.5099026151657731, "grad_norm": 9.8125, "learning_rate": 2.777747698971305e-05, "loss": 0.1979, "loss_lm": 0.01837560976855457, "loss_seg": 0.17954705841839314, "mean_token_accuracy": 0.995124027132988, "num_tokens": 495176269.0, "step": 1165 }, { "entropy": 0.019086855463683605, "epoch": 0.510340299813984, "grad_norm": 10.1875, "learning_rate": 2.7774769897130483e-05, "loss": 0.1665, "loss_lm": 0.015327217057347298, "loss_seg": 0.15122219920158386, "mean_token_accuracy": 0.9953815788030624, "num_tokens": 495601037.0, "step": 1166 }, { "entropy": 0.01923914346843958, "epoch": 0.510777984462195, "grad_norm": 5.96875, "learning_rate": 2.7772062804547917e-05, "loss": 0.1276, "loss_lm": 0.01515191444195807, "loss_seg": 0.1124210674315691, "mean_token_accuracy": 0.9953089654445648, "num_tokens": 496024995.0, "step": 1167 }, { "entropy": 0.019812839571386576, "epoch": 0.511215669110406, "grad_norm": 13.3125, "learning_rate": 2.776935571196535e-05, "loss": 0.1863, "loss_lm": 0.016472316812723875, "loss_seg": 0.16982456482946873, "mean_token_accuracy": 0.9950984865427017, "num_tokens": 496449923.0, "step": 1168 }, { "entropy": 0.019507274962961674, "epoch": 0.5116533537586169, "grad_norm": 17.625, "learning_rate": 2.7766648619382784e-05, "loss": 0.1621, "loss_lm": 0.01617756811901927, "loss_seg": 0.14587708748877048, "mean_token_accuracy": 0.9952230602502823, "num_tokens": 496875070.0, "step": 1169 }, { "entropy": 0.01980644417926669, "epoch": 0.5120910384068279, "grad_norm": 17.875, "learning_rate": 2.7763941526800218e-05, "loss": 0.1624, "loss_lm": 0.018193006981164217, "loss_seg": 0.14423936139792204, "mean_token_accuracy": 0.9951439797878265, "num_tokens": 497300083.0, "step": 1170 }, { "entropy": 0.020202876068651676, "epoch": 0.5125287230550388, "grad_norm": 15.625, "learning_rate": 2.7761234434217648e-05, "loss": 0.1489, "loss_lm": 0.018577732145786285, "loss_seg": 0.13030368462204933, "mean_token_accuracy": 0.9950992166996002, "num_tokens": 497725685.0, "step": 1171 }, { "entropy": 0.019719968549907207, "epoch": 0.5129664077032499, "grad_norm": 13.375, "learning_rate": 2.7758527341635085e-05, "loss": 0.1643, "loss_lm": 0.019013224402442575, "loss_seg": 0.14525194838643074, "mean_token_accuracy": 0.9951732009649277, "num_tokens": 498151360.0, "step": 1172 }, { "entropy": 0.020032349973917007, "epoch": 0.5134040923514608, "grad_norm": 14.375, "learning_rate": 2.775582024905252e-05, "loss": 0.2067, "loss_lm": 0.015494988532736897, "loss_seg": 0.19119651056826115, "mean_token_accuracy": 0.9950721710920334, "num_tokens": 498576681.0, "step": 1173 }, { "entropy": 0.02049559447914362, "epoch": 0.5138417769996717, "grad_norm": 6.15625, "learning_rate": 2.7753113156469953e-05, "loss": 0.1607, "loss_lm": 0.014704384608194232, "loss_seg": 0.1459543090313673, "mean_token_accuracy": 0.9950361251831055, "num_tokens": 499002333.0, "step": 1174 }, { "entropy": 0.01968360785394907, "epoch": 0.5142794616478827, "grad_norm": 11.5625, "learning_rate": 2.7750406063887386e-05, "loss": 0.189, "loss_lm": 0.015016490360721946, "loss_seg": 0.174012940376997, "mean_token_accuracy": 0.995329275727272, "num_tokens": 499426646.0, "step": 1175 }, { "entropy": 0.01973893027752638, "epoch": 0.5147171462960937, "grad_norm": 27.0, "learning_rate": 2.7747698971304817e-05, "loss": 0.1624, "loss_lm": 0.015272259246557951, "loss_seg": 0.14715715125203133, "mean_token_accuracy": 0.9951791912317276, "num_tokens": 499851173.0, "step": 1176 }, { "entropy": 0.019900986459106207, "epoch": 0.5151548309443046, "grad_norm": 11.6875, "learning_rate": 2.7744991878722254e-05, "loss": 0.2279, "loss_lm": 0.0141465796623379, "loss_seg": 0.213719941675663, "mean_token_accuracy": 0.9950958788394928, "num_tokens": 500275904.0, "step": 1177 }, { "entropy": 0.019578951876610518, "epoch": 0.5155925155925156, "grad_norm": 6.125, "learning_rate": 2.7742284786139688e-05, "loss": 0.1923, "loss_lm": 0.020215292926877737, "loss_seg": 0.17207111418247223, "mean_token_accuracy": 0.9952017217874527, "num_tokens": 500701214.0, "step": 1178 }, { "entropy": 0.019255063496530056, "epoch": 0.5160302002407265, "grad_norm": 14.125, "learning_rate": 2.773957769355712e-05, "loss": 0.1684, "loss_lm": 0.016188233625143766, "loss_seg": 0.152204018086195, "mean_token_accuracy": 0.9953351318836212, "num_tokens": 501126687.0, "step": 1179 }, { "entropy": 0.019168677739799023, "epoch": 0.5164678848889375, "grad_norm": 10.9375, "learning_rate": 2.7736870600974555e-05, "loss": 0.178, "loss_lm": 0.0190018389839679, "loss_seg": 0.15898779407143593, "mean_token_accuracy": 0.9952624887228012, "num_tokens": 501551560.0, "step": 1180 }, { "entropy": 0.019878546707332134, "epoch": 0.5169055695371485, "grad_norm": 5.9375, "learning_rate": 2.7734163508391985e-05, "loss": 0.1276, "loss_lm": 0.01620730198919773, "loss_seg": 0.11141164507716894, "mean_token_accuracy": 0.9950607717037201, "num_tokens": 501976694.0, "step": 1181 }, { "entropy": 0.02015607012435794, "epoch": 0.5173432541853594, "grad_norm": 16.0, "learning_rate": 2.7731456415809422e-05, "loss": 0.2086, "loss_lm": 0.018952686805278063, "loss_seg": 0.18963675387203693, "mean_token_accuracy": 0.9949365854263306, "num_tokens": 502402461.0, "step": 1182 }, { "entropy": 0.020125900860875845, "epoch": 0.5177809388335705, "grad_norm": 18.625, "learning_rate": 2.7728749323226856e-05, "loss": 0.1412, "loss_lm": 0.016571835381910205, "loss_seg": 0.12462657317519188, "mean_token_accuracy": 0.9949704855680466, "num_tokens": 502828535.0, "step": 1183 }, { "entropy": 0.02024976722896099, "epoch": 0.5182186234817814, "grad_norm": 5.96875, "learning_rate": 2.772604223064429e-05, "loss": 0.175, "loss_lm": 0.01787210674956441, "loss_seg": 0.15712741017341614, "mean_token_accuracy": 0.995154857635498, "num_tokens": 503254514.0, "step": 1184 }, { "entropy": 0.01947504747658968, "epoch": 0.5186563081299923, "grad_norm": 10.625, "learning_rate": 2.7723335138061724e-05, "loss": 0.1528, "loss_lm": 0.01643874915316701, "loss_seg": 0.13631465286016464, "mean_token_accuracy": 0.9952162504196167, "num_tokens": 503679468.0, "step": 1185 }, { "entropy": 0.019873316399753094, "epoch": 0.5190939927782033, "grad_norm": 7.84375, "learning_rate": 2.7720628045479154e-05, "loss": 0.1843, "loss_lm": 0.015885911881923676, "loss_seg": 0.16842125728726387, "mean_token_accuracy": 0.9951051324605942, "num_tokens": 504104560.0, "step": 1186 }, { "entropy": 0.020276382565498352, "epoch": 0.5195316774264143, "grad_norm": 10.375, "learning_rate": 2.7717920952896588e-05, "loss": 0.1974, "loss_lm": 0.01979457400739193, "loss_seg": 0.17757388576865196, "mean_token_accuracy": 0.9949048608541489, "num_tokens": 504529654.0, "step": 1187 }, { "entropy": 0.019944779109209776, "epoch": 0.5199693620746252, "grad_norm": 20.25, "learning_rate": 2.7715213860314025e-05, "loss": 0.2182, "loss_lm": 0.017959002871066332, "loss_seg": 0.2002139613032341, "mean_token_accuracy": 0.9951281994581223, "num_tokens": 504955349.0, "step": 1188 }, { "entropy": 0.02059239288792014, "epoch": 0.5204070467228362, "grad_norm": 7.03125, "learning_rate": 2.771250676773146e-05, "loss": 0.1529, "loss_lm": 0.0190707731526345, "loss_seg": 0.13381202891469002, "mean_token_accuracy": 0.9948649853467941, "num_tokens": 505381001.0, "step": 1189 }, { "entropy": 0.019220374058932066, "epoch": 0.5208447313710471, "grad_norm": 14.0625, "learning_rate": 2.7709799675148892e-05, "loss": 0.168, "loss_lm": 0.013746372889727354, "loss_seg": 0.1542855016887188, "mean_token_accuracy": 0.9952879250049591, "num_tokens": 505806249.0, "step": 1190 }, { "entropy": 0.019809298682957888, "epoch": 0.5212824160192582, "grad_norm": 11.625, "learning_rate": 2.7707092582566323e-05, "loss": 0.1679, "loss_lm": 0.017536107217893004, "loss_seg": 0.15037059597671032, "mean_token_accuracy": 0.9951067119836807, "num_tokens": 506231090.0, "step": 1191 }, { "entropy": 0.019222605507820845, "epoch": 0.5217201006674691, "grad_norm": 12.3125, "learning_rate": 2.7704385489983756e-05, "loss": 0.1702, "loss_lm": 0.013024983229115605, "loss_seg": 0.15714815258979797, "mean_token_accuracy": 0.9952705204486847, "num_tokens": 506656457.0, "step": 1192 }, { "entropy": 0.02014091657474637, "epoch": 0.52215778531568, "grad_norm": 7.90625, "learning_rate": 2.7701678397401193e-05, "loss": 0.1859, "loss_lm": 0.01783396559767425, "loss_seg": 0.16804080829024315, "mean_token_accuracy": 0.9951812773942947, "num_tokens": 507081639.0, "step": 1193 }, { "entropy": 0.01918171439319849, "epoch": 0.5225954699638911, "grad_norm": 9.4375, "learning_rate": 2.7698971304818627e-05, "loss": 0.1417, "loss_lm": 0.015881155151873827, "loss_seg": 0.12583738192915916, "mean_token_accuracy": 0.9953018426895142, "num_tokens": 507507047.0, "step": 1194 }, { "entropy": 0.01959506841376424, "epoch": 0.523033154612102, "grad_norm": 11.8125, "learning_rate": 2.7696264212236058e-05, "loss": 0.1271, "loss_lm": 0.016545044956728816, "loss_seg": 0.1105990894138813, "mean_token_accuracy": 0.9951162487268448, "num_tokens": 507931891.0, "step": 1195 }, { "entropy": 0.019604580476880074, "epoch": 0.5234708392603129, "grad_norm": 16.5, "learning_rate": 2.769355711965349e-05, "loss": 0.1737, "loss_lm": 0.017652529757469893, "loss_seg": 0.15608322247862816, "mean_token_accuracy": 0.9951885938644409, "num_tokens": 508356894.0, "step": 1196 }, { "entropy": 0.019849748350679874, "epoch": 0.5239085239085239, "grad_norm": 5.65625, "learning_rate": 2.7690850027070925e-05, "loss": 0.1639, "loss_lm": 0.01588250230997801, "loss_seg": 0.1479929331690073, "mean_token_accuracy": 0.9950668513774872, "num_tokens": 508781750.0, "step": 1197 }, { "entropy": 0.020209555979818106, "epoch": 0.5243462085567349, "grad_norm": 6.71875, "learning_rate": 2.7688142934488362e-05, "loss": 0.1271, "loss_lm": 0.013473535189405084, "loss_seg": 0.11360423266887665, "mean_token_accuracy": 0.9950402826070786, "num_tokens": 509207548.0, "step": 1198 }, { "entropy": 0.020094662439078093, "epoch": 0.5247838932049458, "grad_norm": 22.125, "learning_rate": 2.7685435841905796e-05, "loss": 0.1846, "loss_lm": 0.018241049721837044, "loss_seg": 0.1664029322564602, "mean_token_accuracy": 0.9950867891311646, "num_tokens": 509632480.0, "step": 1199 }, { "entropy": 0.020106582902371883, "epoch": 0.5252215778531568, "grad_norm": 7.65625, "learning_rate": 2.7682728749323226e-05, "loss": 0.1742, "loss_lm": 0.01718897116370499, "loss_seg": 0.1570416372269392, "mean_token_accuracy": 0.9951454401016235, "num_tokens": 510057184.0, "step": 1200 }, { "entropy": 0.019882333930581808, "epoch": 0.5256592625013677, "grad_norm": 22.5, "learning_rate": 2.768002165674066e-05, "loss": 0.1346, "loss_lm": 0.01446613622829318, "loss_seg": 0.12010161206126213, "mean_token_accuracy": 0.9951154738664627, "num_tokens": 510482760.0, "step": 1201 }, { "entropy": 0.019499074667692184, "epoch": 0.5260969471495788, "grad_norm": 10.3125, "learning_rate": 2.7677314564158094e-05, "loss": 0.1866, "loss_lm": 0.014959484804421663, "loss_seg": 0.1716187410056591, "mean_token_accuracy": 0.9952071011066437, "num_tokens": 510908572.0, "step": 1202 }, { "entropy": 0.020514580886811018, "epoch": 0.5265346317977897, "grad_norm": 15.125, "learning_rate": 2.767460747157553e-05, "loss": 0.1363, "loss_lm": 0.017574438359588385, "loss_seg": 0.11872589960694313, "mean_token_accuracy": 0.9950065463781357, "num_tokens": 511333517.0, "step": 1203 }, { "entropy": 0.01958134723827243, "epoch": 0.5269723164460006, "grad_norm": 7.5, "learning_rate": 2.7671900378992964e-05, "loss": 0.1465, "loss_lm": 0.01723161479458213, "loss_seg": 0.1292430218309164, "mean_token_accuracy": 0.9953087866306305, "num_tokens": 511757459.0, "step": 1204 }, { "entropy": 0.01945507386699319, "epoch": 0.5274100010942117, "grad_norm": 5.46875, "learning_rate": 2.7669193286410395e-05, "loss": 0.1398, "loss_lm": 0.015408231876790524, "loss_seg": 0.1244384367018938, "mean_token_accuracy": 0.9952659159898758, "num_tokens": 512183232.0, "step": 1205 }, { "entropy": 0.020432848948985338, "epoch": 0.5278476857424226, "grad_norm": 16.875, "learning_rate": 2.766648619382783e-05, "loss": 0.1848, "loss_lm": 0.020268275402486324, "loss_seg": 0.16451816447079182, "mean_token_accuracy": 0.9949804097414017, "num_tokens": 512608749.0, "step": 1206 }, { "entropy": 0.01985697355121374, "epoch": 0.5282853703906335, "grad_norm": 8.3125, "learning_rate": 2.7663779101245262e-05, "loss": 0.1434, "loss_lm": 0.015602994477376342, "loss_seg": 0.1278176438063383, "mean_token_accuracy": 0.9951807856559753, "num_tokens": 513033662.0, "step": 1207 }, { "entropy": 0.020023060031235218, "epoch": 0.5287230550388445, "grad_norm": 9.25, "learning_rate": 2.76610720086627e-05, "loss": 0.1892, "loss_lm": 0.020143686793744564, "loss_seg": 0.16905318014323711, "mean_token_accuracy": 0.9950899332761765, "num_tokens": 513458476.0, "step": 1208 }, { "entropy": 0.020103315357118845, "epoch": 0.5291607396870555, "grad_norm": 5.71875, "learning_rate": 2.7658364916080133e-05, "loss": 0.1455, "loss_lm": 0.017466214252635837, "loss_seg": 0.12802313454449177, "mean_token_accuracy": 0.995119571685791, "num_tokens": 513883420.0, "step": 1209 }, { "entropy": 0.020316143985837698, "epoch": 0.5295984243352665, "grad_norm": 9.0, "learning_rate": 2.7655657823497563e-05, "loss": 0.1404, "loss_lm": 0.01728145149536431, "loss_seg": 0.12313734740018845, "mean_token_accuracy": 0.9950887858867645, "num_tokens": 514308351.0, "step": 1210 }, { "entropy": 0.020041123032569885, "epoch": 0.5300361089834774, "grad_norm": 9.375, "learning_rate": 2.7652950730914997e-05, "loss": 0.179, "loss_lm": 0.017165054101496935, "loss_seg": 0.1618192158639431, "mean_token_accuracy": 0.9950858354568481, "num_tokens": 514733424.0, "step": 1211 }, { "entropy": 0.020054246298968792, "epoch": 0.5304737936316883, "grad_norm": 8.375, "learning_rate": 2.765024363833243e-05, "loss": 0.2097, "loss_lm": 0.01626009400933981, "loss_seg": 0.1934615969657898, "mean_token_accuracy": 0.9951907247304916, "num_tokens": 515158192.0, "step": 1212 }, { "entropy": 0.019377054180949926, "epoch": 0.5309114782798994, "grad_norm": 17.0, "learning_rate": 2.7647536545749868e-05, "loss": 0.1624, "loss_lm": 0.01636372017674148, "loss_seg": 0.1460502501577139, "mean_token_accuracy": 0.995273619890213, "num_tokens": 515583412.0, "step": 1213 }, { "entropy": 0.019393810536712408, "epoch": 0.5313491629281103, "grad_norm": 12.1875, "learning_rate": 2.76448294531673e-05, "loss": 0.1397, "loss_lm": 0.015446886653080583, "loss_seg": 0.12425165064632893, "mean_token_accuracy": 0.9952008128166199, "num_tokens": 516008221.0, "step": 1214 }, { "entropy": 0.019607307389378548, "epoch": 0.5317868475763212, "grad_norm": 8.125, "learning_rate": 2.7642122360584732e-05, "loss": 0.1556, "loss_lm": 0.015952856047078967, "loss_seg": 0.13962737284600735, "mean_token_accuracy": 0.9951906651258469, "num_tokens": 516434016.0, "step": 1215 }, { "entropy": 0.019622244406491518, "epoch": 0.5322245322245323, "grad_norm": 8.625, "learning_rate": 2.7639415268002166e-05, "loss": 0.1513, "loss_lm": 0.016388898249715567, "loss_seg": 0.13495883718132973, "mean_token_accuracy": 0.9951501935720444, "num_tokens": 516858739.0, "step": 1216 }, { "entropy": 0.01959148235619068, "epoch": 0.5326622168727432, "grad_norm": 6.4375, "learning_rate": 2.76367081754196e-05, "loss": 0.1539, "loss_lm": 0.016926545184105635, "loss_seg": 0.13694222830235958, "mean_token_accuracy": 0.9951681792736053, "num_tokens": 517283606.0, "step": 1217 }, { "entropy": 0.020069528836756945, "epoch": 0.5330999015209541, "grad_norm": 12.8125, "learning_rate": 2.7634001082837033e-05, "loss": 0.1216, "loss_lm": 0.016497442265972495, "loss_seg": 0.10511428024619818, "mean_token_accuracy": 0.9950151592493057, "num_tokens": 517709254.0, "step": 1218 }, { "entropy": 0.019608844071626663, "epoch": 0.5335375861691651, "grad_norm": 8.875, "learning_rate": 2.7631293990254467e-05, "loss": 0.1413, "loss_lm": 0.017921081045642495, "loss_seg": 0.12341385893523693, "mean_token_accuracy": 0.9952417463064194, "num_tokens": 518133713.0, "step": 1219 }, { "entropy": 0.020591165404766798, "epoch": 0.5339752708173761, "grad_norm": 5.4375, "learning_rate": 2.76285868976719e-05, "loss": 0.1232, "loss_lm": 0.016205167863518, "loss_seg": 0.10699613392353058, "mean_token_accuracy": 0.9948582500219345, "num_tokens": 518559065.0, "step": 1220 }, { "entropy": 0.019611672963947058, "epoch": 0.5344129554655871, "grad_norm": 12.5, "learning_rate": 2.7625879805089334e-05, "loss": 0.1618, "loss_lm": 0.014828614424914122, "loss_seg": 0.1469886153936386, "mean_token_accuracy": 0.9952750355005264, "num_tokens": 518984027.0, "step": 1221 }, { "entropy": 0.02010443015024066, "epoch": 0.534850640113798, "grad_norm": 6.5, "learning_rate": 2.7623172712506768e-05, "loss": 0.1216, "loss_lm": 0.015051043126732111, "loss_seg": 0.10658899880945683, "mean_token_accuracy": 0.9949917197227478, "num_tokens": 519409089.0, "step": 1222 }, { "entropy": 0.019589485600590706, "epoch": 0.5352883247620089, "grad_norm": 13.0625, "learning_rate": 2.7620465619924202e-05, "loss": 0.1367, "loss_lm": 0.01836500153876841, "loss_seg": 0.11834939196705818, "mean_token_accuracy": 0.9952576905488968, "num_tokens": 519833932.0, "step": 1223 }, { "entropy": 0.019598118029534817, "epoch": 0.53572600941022, "grad_norm": 20.25, "learning_rate": 2.7617758527341636e-05, "loss": 0.1778, "loss_lm": 0.017680210061371326, "loss_seg": 0.16013113036751747, "mean_token_accuracy": 0.9950254261493683, "num_tokens": 520258478.0, "step": 1224 }, { "entropy": 0.01901913108304143, "epoch": 0.5361636940584309, "grad_norm": 17.375, "learning_rate": 2.761505143475907e-05, "loss": 0.1729, "loss_lm": 0.015079000033438206, "loss_seg": 0.157837288454175, "mean_token_accuracy": 0.9953346401453018, "num_tokens": 520683678.0, "step": 1225 }, { "entropy": 0.020560132805258036, "epoch": 0.5366013787066418, "grad_norm": 8.625, "learning_rate": 2.7612344342176503e-05, "loss": 0.141, "loss_lm": 0.01747812400572002, "loss_seg": 0.12352119386196136, "mean_token_accuracy": 0.9949498176574707, "num_tokens": 521108975.0, "step": 1226 }, { "entropy": 0.019426824990659952, "epoch": 0.5370390633548529, "grad_norm": 21.875, "learning_rate": 2.7609637249593937e-05, "loss": 0.2024, "loss_lm": 0.018266693456098437, "loss_seg": 0.1841430552303791, "mean_token_accuracy": 0.9952699095010757, "num_tokens": 521533264.0, "step": 1227 }, { "entropy": 0.019768155179917812, "epoch": 0.5374767480030638, "grad_norm": 10.8125, "learning_rate": 2.760693015701137e-05, "loss": 0.1776, "loss_lm": 0.0188250585924834, "loss_seg": 0.15875086933374405, "mean_token_accuracy": 0.9951939582824707, "num_tokens": 521958547.0, "step": 1228 }, { "entropy": 0.020344548393040895, "epoch": 0.5379144326512748, "grad_norm": 10.375, "learning_rate": 2.7604223064428804e-05, "loss": 0.1495, "loss_lm": 0.01729101757518947, "loss_seg": 0.13217037543654442, "mean_token_accuracy": 0.995160236954689, "num_tokens": 522383530.0, "step": 1229 }, { "entropy": 0.020831602159887552, "epoch": 0.5383521172994857, "grad_norm": 7.78125, "learning_rate": 2.7601515971846238e-05, "loss": 0.1406, "loss_lm": 0.016173225827515125, "loss_seg": 0.12437960132956505, "mean_token_accuracy": 0.9950425624847412, "num_tokens": 522809317.0, "step": 1230 }, { "entropy": 0.019378503318876028, "epoch": 0.5387898019476967, "grad_norm": 11.0, "learning_rate": 2.759880887926367e-05, "loss": 0.1943, "loss_lm": 0.014076900901272893, "loss_seg": 0.180205088108778, "mean_token_accuracy": 0.9953462183475494, "num_tokens": 523234505.0, "step": 1231 }, { "entropy": 0.02042239671573043, "epoch": 0.5392274865959077, "grad_norm": 7.34375, "learning_rate": 2.7596101786681105e-05, "loss": 0.1448, "loss_lm": 0.015676557319238782, "loss_seg": 0.12913071736693382, "mean_token_accuracy": 0.9950703680515289, "num_tokens": 523660064.0, "step": 1232 }, { "entropy": 0.019896830432116985, "epoch": 0.5396651712441186, "grad_norm": 7.96875, "learning_rate": 2.759339469409854e-05, "loss": 0.172, "loss_lm": 0.015532844001427293, "loss_seg": 0.15649811923503876, "mean_token_accuracy": 0.9951176345348358, "num_tokens": 524084841.0, "step": 1233 }, { "entropy": 0.02006658772006631, "epoch": 0.5401028558923295, "grad_norm": 11.125, "learning_rate": 2.7590687601515973e-05, "loss": 0.1895, "loss_lm": 0.018151904921978712, "loss_seg": 0.17134559899568558, "mean_token_accuracy": 0.9951793253421783, "num_tokens": 524510049.0, "step": 1234 }, { "entropy": 0.019579609856009483, "epoch": 0.5405405405405406, "grad_norm": 8.5, "learning_rate": 2.7587980508933407e-05, "loss": 0.1634, "loss_lm": 0.01610056357458234, "loss_seg": 0.1472565233707428, "mean_token_accuracy": 0.9952762871980667, "num_tokens": 524934455.0, "step": 1235 }, { "entropy": 0.01975549664348364, "epoch": 0.5409782251887515, "grad_norm": 9.0, "learning_rate": 2.758527341635084e-05, "loss": 0.1386, "loss_lm": 0.016020165756344795, "loss_seg": 0.1225498802959919, "mean_token_accuracy": 0.9952400028705597, "num_tokens": 525359407.0, "step": 1236 }, { "entropy": 0.019764827098697424, "epoch": 0.5414159098369624, "grad_norm": 9.375, "learning_rate": 2.7582566323768274e-05, "loss": 0.1644, "loss_lm": 0.018643526127561927, "loss_seg": 0.14580318704247475, "mean_token_accuracy": 0.9950765669345856, "num_tokens": 525784291.0, "step": 1237 }, { "entropy": 0.020208154805004597, "epoch": 0.5418535944851735, "grad_norm": 8.3125, "learning_rate": 2.7579859231185704e-05, "loss": 0.1545, "loss_lm": 0.016100116772577167, "loss_seg": 0.1384346503764391, "mean_token_accuracy": 0.995118573307991, "num_tokens": 526209353.0, "step": 1238 }, { "entropy": 0.019477510824799538, "epoch": 0.5422912791333844, "grad_norm": 13.625, "learning_rate": 2.757715213860314e-05, "loss": 0.1125, "loss_lm": 0.013782246503978968, "loss_seg": 0.09868844971060753, "mean_token_accuracy": 0.9951157569885254, "num_tokens": 526634145.0, "step": 1239 }, { "entropy": 0.019774936605244875, "epoch": 0.5427289637815954, "grad_norm": 12.0, "learning_rate": 2.7574445046020575e-05, "loss": 0.2008, "loss_lm": 0.015507461968809366, "loss_seg": 0.18525673635303974, "mean_token_accuracy": 0.9951848089694977, "num_tokens": 527058808.0, "step": 1240 }, { "entropy": 0.018807496409863234, "epoch": 0.5431666484298063, "grad_norm": 33.5, "learning_rate": 2.757173795343801e-05, "loss": 0.1578, "loss_lm": 0.015733539359644055, "loss_seg": 0.14208854734897614, "mean_token_accuracy": 0.9953572303056717, "num_tokens": 527483500.0, "step": 1241 }, { "entropy": 0.020187110640108585, "epoch": 0.5436043330780173, "grad_norm": 7.4375, "learning_rate": 2.7569030860855443e-05, "loss": 0.1278, "loss_lm": 0.014971652766689658, "loss_seg": 0.11280013807117939, "mean_token_accuracy": 0.9950553774833679, "num_tokens": 527908171.0, "step": 1242 }, { "entropy": 0.019315889105200768, "epoch": 0.5440420177262283, "grad_norm": 17.625, "learning_rate": 2.7566323768272873e-05, "loss": 0.1336, "loss_lm": 0.016920641530305147, "loss_seg": 0.11672017350792885, "mean_token_accuracy": 0.9952310472726822, "num_tokens": 528332536.0, "step": 1243 }, { "entropy": 0.019687958993017673, "epoch": 0.5444797023744392, "grad_norm": 14.4375, "learning_rate": 2.756361667569031e-05, "loss": 0.1355, "loss_lm": 0.018851428758352995, "loss_seg": 0.11663058213889599, "mean_token_accuracy": 0.9951582849025726, "num_tokens": 528757568.0, "step": 1244 }, { "entropy": 0.019440945237874985, "epoch": 0.5449173870226501, "grad_norm": 4.625, "learning_rate": 2.7560909583107744e-05, "loss": 0.1389, "loss_lm": 0.014864519005641341, "loss_seg": 0.1239941455423832, "mean_token_accuracy": 0.9951140582561493, "num_tokens": 529183044.0, "step": 1245 }, { "entropy": 0.019500144757330418, "epoch": 0.5453550716708612, "grad_norm": 6.34375, "learning_rate": 2.7558202490525178e-05, "loss": 0.1547, "loss_lm": 0.015115297632291913, "loss_seg": 0.13958548288792372, "mean_token_accuracy": 0.9952066838741302, "num_tokens": 529608406.0, "step": 1246 }, { "entropy": 0.019334157928824425, "epoch": 0.5457927563190721, "grad_norm": 9.75, "learning_rate": 2.755549539794261e-05, "loss": 0.1698, "loss_lm": 0.018581110518425703, "loss_seg": 0.15120286121964455, "mean_token_accuracy": 0.9952920079231262, "num_tokens": 530033600.0, "step": 1247 }, { "entropy": 0.019505677744746208, "epoch": 0.5462304409672831, "grad_norm": 4.1875, "learning_rate": 2.755278830536004e-05, "loss": 0.1387, "loss_lm": 0.01773042045533657, "loss_seg": 0.12094481848180294, "mean_token_accuracy": 0.9952163398265839, "num_tokens": 530458364.0, "step": 1248 }, { "entropy": 0.01942210365086794, "epoch": 0.546668125615494, "grad_norm": 9.8125, "learning_rate": 2.7550081212777475e-05, "loss": 0.1312, "loss_lm": 0.01563527574762702, "loss_seg": 0.11552652344107628, "mean_token_accuracy": 0.9951009899377823, "num_tokens": 530882511.0, "step": 1249 }, { "entropy": 0.020047354511916637, "epoch": 0.547105810263705, "grad_norm": 11.625, "learning_rate": 2.7547374120194912e-05, "loss": 0.1967, "loss_lm": 0.018536545801907778, "loss_seg": 0.1781866867095232, "mean_token_accuracy": 0.9950678497552872, "num_tokens": 531307849.0, "step": 1250 }, { "entropy": 0.01902425056323409, "epoch": 0.547543494911916, "grad_norm": 7.84375, "learning_rate": 2.7544667027612346e-05, "loss": 0.1508, "loss_lm": 0.015446130186319351, "loss_seg": 0.13534383289515972, "mean_token_accuracy": 0.9954974204301834, "num_tokens": 531732407.0, "step": 1251 }, { "entropy": 0.01905424939468503, "epoch": 0.5479811795601269, "grad_norm": 9.625, "learning_rate": 2.754195993502978e-05, "loss": 0.1865, "loss_lm": 0.01586047187447548, "loss_seg": 0.17064843699336052, "mean_token_accuracy": 0.995330736041069, "num_tokens": 532157467.0, "step": 1252 }, { "entropy": 0.019829906057566404, "epoch": 0.5484188642083379, "grad_norm": 5.65625, "learning_rate": 2.753925284244721e-05, "loss": 0.1288, "loss_lm": 0.017008653143420815, "loss_seg": 0.11180488206446171, "mean_token_accuracy": 0.9951855540275574, "num_tokens": 532582023.0, "step": 1253 }, { "entropy": 0.019397897645831108, "epoch": 0.5488565488565489, "grad_norm": 8.0625, "learning_rate": 2.7536545749864644e-05, "loss": 0.1797, "loss_lm": 0.015351320384070277, "loss_seg": 0.1643704380840063, "mean_token_accuracy": 0.9953085631132126, "num_tokens": 533007023.0, "step": 1254 }, { "entropy": 0.020664133597165346, "epoch": 0.5492942335047598, "grad_norm": 9.6875, "learning_rate": 2.753383865728208e-05, "loss": 0.2046, "loss_lm": 0.020339983981102705, "loss_seg": 0.18424093164503574, "mean_token_accuracy": 0.9949175417423248, "num_tokens": 533432790.0, "step": 1255 }, { "entropy": 0.019476547371596098, "epoch": 0.5497319181529707, "grad_norm": 8.0, "learning_rate": 2.7531131564699515e-05, "loss": 0.1464, "loss_lm": 0.016897386638447642, "loss_seg": 0.129540603607893, "mean_token_accuracy": 0.9953059107065201, "num_tokens": 533857520.0, "step": 1256 }, { "entropy": 0.019933839328587055, "epoch": 0.5501696028011818, "grad_norm": 9.6875, "learning_rate": 2.752842447211695e-05, "loss": 0.1725, "loss_lm": 0.01658521732315421, "loss_seg": 0.1559355240315199, "mean_token_accuracy": 0.9951670169830322, "num_tokens": 534282056.0, "step": 1257 }, { "entropy": 0.019926483277231455, "epoch": 0.5506072874493927, "grad_norm": 8.1875, "learning_rate": 2.752571737953438e-05, "loss": 0.2478, "loss_lm": 0.0162526061758399, "loss_seg": 0.23150253295898438, "mean_token_accuracy": 0.9951305687427521, "num_tokens": 534706905.0, "step": 1258 }, { "entropy": 0.020088286139070988, "epoch": 0.5510449720976037, "grad_norm": 8.25, "learning_rate": 2.7523010286951813e-05, "loss": 0.1734, "loss_lm": 0.0183142083697021, "loss_seg": 0.15506544336676598, "mean_token_accuracy": 0.9950230717658997, "num_tokens": 535131405.0, "step": 1259 }, { "entropy": 0.01933354791253805, "epoch": 0.5514826567458146, "grad_norm": 8.5625, "learning_rate": 2.752030319436925e-05, "loss": 0.2211, "loss_lm": 0.015539528569206595, "loss_seg": 0.20559818483889103, "mean_token_accuracy": 0.9952897131443024, "num_tokens": 535555752.0, "step": 1260 }, { "entropy": 0.01975806290283799, "epoch": 0.5519203413940256, "grad_norm": 12.75, "learning_rate": 2.7517596101786683e-05, "loss": 0.1349, "loss_lm": 0.01607548608444631, "loss_seg": 0.11883853003382683, "mean_token_accuracy": 0.9951928704977036, "num_tokens": 535981123.0, "step": 1261 }, { "entropy": 0.01928752800449729, "epoch": 0.5523580260422366, "grad_norm": 9.9375, "learning_rate": 2.7514889009204114e-05, "loss": 0.1513, "loss_lm": 0.016191827598959208, "loss_seg": 0.13513808138668537, "mean_token_accuracy": 0.9952566176652908, "num_tokens": 536405900.0, "step": 1262 }, { "entropy": 0.019566496834158897, "epoch": 0.5527957106904475, "grad_norm": 14.5625, "learning_rate": 2.7512181916621548e-05, "loss": 0.1854, "loss_lm": 0.01740369340404868, "loss_seg": 0.16804519668221474, "mean_token_accuracy": 0.9951602816581726, "num_tokens": 536831523.0, "step": 1263 }, { "entropy": 0.020503013860434294, "epoch": 0.5532333953386585, "grad_norm": 10.75, "learning_rate": 2.750947482403898e-05, "loss": 0.2183, "loss_lm": 0.018993920413777232, "loss_seg": 0.1993052028119564, "mean_token_accuracy": 0.9949808567762375, "num_tokens": 537256670.0, "step": 1264 }, { "entropy": 0.019116775132715702, "epoch": 0.5536710799868695, "grad_norm": 16.625, "learning_rate": 2.750676773145642e-05, "loss": 0.2324, "loss_lm": 0.018449575873091817, "loss_seg": 0.21391313150525093, "mean_token_accuracy": 0.9952122569084167, "num_tokens": 537682069.0, "step": 1265 }, { "entropy": 0.019308086950331926, "epoch": 0.5541087646350804, "grad_norm": 8.8125, "learning_rate": 2.7504060638873852e-05, "loss": 0.1474, "loss_lm": 0.015061265556141734, "loss_seg": 0.13238339871168137, "mean_token_accuracy": 0.9951701164245605, "num_tokens": 538107051.0, "step": 1266 }, { "entropy": 0.01997983083128929, "epoch": 0.5545464492832914, "grad_norm": 7.65625, "learning_rate": 2.7501353546291282e-05, "loss": 0.141, "loss_lm": 0.015460044611245394, "loss_seg": 0.12551704421639442, "mean_token_accuracy": 0.9949344098567963, "num_tokens": 538533211.0, "step": 1267 }, { "entropy": 0.019773506093770266, "epoch": 0.5549841339315024, "grad_norm": 16.375, "learning_rate": 2.7498646453708716e-05, "loss": 0.1592, "loss_lm": 0.015993337845429778, "loss_seg": 0.1431601569056511, "mean_token_accuracy": 0.9950658529996872, "num_tokens": 538957778.0, "step": 1268 }, { "entropy": 0.019594584591686726, "epoch": 0.5554218185797133, "grad_norm": 8.5625, "learning_rate": 2.749593936112615e-05, "loss": 0.1326, "loss_lm": 0.014615542022511363, "loss_seg": 0.11802275478839874, "mean_token_accuracy": 0.9951505362987518, "num_tokens": 539382903.0, "step": 1269 }, { "entropy": 0.019660606049001217, "epoch": 0.5558595032279243, "grad_norm": 7.625, "learning_rate": 2.7493232268543587e-05, "loss": 0.1462, "loss_lm": 0.015578966354951262, "loss_seg": 0.13059045001864433, "mean_token_accuracy": 0.9951681196689606, "num_tokens": 539807984.0, "step": 1270 }, { "entropy": 0.020197031553834677, "epoch": 0.5562971878761352, "grad_norm": 24.875, "learning_rate": 2.749052517596102e-05, "loss": 0.2184, "loss_lm": 0.016155600547790527, "loss_seg": 0.20221243798732758, "mean_token_accuracy": 0.9951140433549881, "num_tokens": 540233651.0, "step": 1271 }, { "entropy": 0.0193492341786623, "epoch": 0.5567348725243462, "grad_norm": 12.625, "learning_rate": 2.748781808337845e-05, "loss": 0.1652, "loss_lm": 0.016626141499727964, "loss_seg": 0.14853692427277565, "mean_token_accuracy": 0.9952423572540283, "num_tokens": 540658397.0, "step": 1272 }, { "entropy": 0.020006682258099318, "epoch": 0.5571725571725572, "grad_norm": 12.6875, "learning_rate": 2.7485110990795885e-05, "loss": 0.1682, "loss_lm": 0.01596398907713592, "loss_seg": 0.15224476344883442, "mean_token_accuracy": 0.9950883537530899, "num_tokens": 541084100.0, "step": 1273 }, { "entropy": 0.019703428260982037, "epoch": 0.5576102418207681, "grad_norm": 6.90625, "learning_rate": 2.748240389821332e-05, "loss": 0.1524, "loss_lm": 0.014698751270771027, "loss_seg": 0.13773691654205322, "mean_token_accuracy": 0.995256245136261, "num_tokens": 541507772.0, "step": 1274 }, { "entropy": 0.019581610802561045, "epoch": 0.558047926468979, "grad_norm": 7.5625, "learning_rate": 2.7479696805630756e-05, "loss": 0.1875, "loss_lm": 0.018685634713619947, "loss_seg": 0.16884696297347546, "mean_token_accuracy": 0.9951775223016739, "num_tokens": 541932806.0, "step": 1275 }, { "entropy": 0.01947641372680664, "epoch": 0.5584856111171901, "grad_norm": 10.0, "learning_rate": 2.747698971304819e-05, "loss": 0.1662, "loss_lm": 0.016401903238147497, "loss_seg": 0.14982102252542973, "mean_token_accuracy": 0.9952807575464249, "num_tokens": 542356769.0, "step": 1276 }, { "entropy": 0.019734747242182493, "epoch": 0.558923295765401, "grad_norm": 10.0, "learning_rate": 2.747428262046562e-05, "loss": 0.1457, "loss_lm": 0.017932965885847807, "loss_seg": 0.12774843722581863, "mean_token_accuracy": 0.9952095150947571, "num_tokens": 542780892.0, "step": 1277 }, { "entropy": 0.01939998334273696, "epoch": 0.559360980413612, "grad_norm": 7.5, "learning_rate": 2.7471575527883053e-05, "loss": 0.1611, "loss_lm": 0.01815719436854124, "loss_seg": 0.1429307721555233, "mean_token_accuracy": 0.9953000396490097, "num_tokens": 543205530.0, "step": 1278 }, { "entropy": 0.019840083550661802, "epoch": 0.559798665061823, "grad_norm": 6.3125, "learning_rate": 2.7468868435300487e-05, "loss": 0.1484, "loss_lm": 0.01679651835002005, "loss_seg": 0.13157391175627708, "mean_token_accuracy": 0.9951697736978531, "num_tokens": 543630291.0, "step": 1279 }, { "entropy": 0.01954283658415079, "epoch": 0.5602363497100339, "grad_norm": 17.0, "learning_rate": 2.7466161342717924e-05, "loss": 0.15, "loss_lm": 0.01687881420366466, "loss_seg": 0.13310177996754646, "mean_token_accuracy": 0.9951741993427277, "num_tokens": 544055426.0, "step": 1280 }, { "entropy": 0.019328816793859005, "epoch": 0.5606740343582449, "grad_norm": 8.25, "learning_rate": 2.7463454250135358e-05, "loss": 0.1551, "loss_lm": 0.018327903002500534, "loss_seg": 0.13674654066562653, "mean_token_accuracy": 0.9952136874198914, "num_tokens": 544480351.0, "step": 1281 }, { "entropy": 0.01922547910362482, "epoch": 0.5611117190064558, "grad_norm": 9.3125, "learning_rate": 2.746074715755279e-05, "loss": 0.1643, "loss_lm": 0.017223416827619076, "loss_seg": 0.14710096083581448, "mean_token_accuracy": 0.9952370077371597, "num_tokens": 544904847.0, "step": 1282 }, { "entropy": 0.01990812085568905, "epoch": 0.5615494036546668, "grad_norm": 10.0, "learning_rate": 2.7458040064970222e-05, "loss": 0.158, "loss_lm": 0.016239854507148266, "loss_seg": 0.14171435870230198, "mean_token_accuracy": 0.9951186776161194, "num_tokens": 545329266.0, "step": 1283 }, { "entropy": 0.01972172548994422, "epoch": 0.5619870883028778, "grad_norm": 14.75, "learning_rate": 2.7455332972387656e-05, "loss": 0.1491, "loss_lm": 0.015246049035340548, "loss_seg": 0.13390383124351501, "mean_token_accuracy": 0.9950441718101501, "num_tokens": 545754568.0, "step": 1284 }, { "entropy": 0.019894713535904884, "epoch": 0.5624247729510887, "grad_norm": 14.625, "learning_rate": 2.745262587980509e-05, "loss": 0.141, "loss_lm": 0.01778868562541902, "loss_seg": 0.12322303652763367, "mean_token_accuracy": 0.9951052814722061, "num_tokens": 546179291.0, "step": 1285 }, { "entropy": 0.019437982235103846, "epoch": 0.5628624575992998, "grad_norm": 15.875, "learning_rate": 2.7449918787222523e-05, "loss": 0.1637, "loss_lm": 0.016184972366318107, "loss_seg": 0.14746622182428837, "mean_token_accuracy": 0.9952928125858307, "num_tokens": 546603928.0, "step": 1286 }, { "entropy": 0.01977141248062253, "epoch": 0.5633001422475107, "grad_norm": 16.75, "learning_rate": 2.7447211694639957e-05, "loss": 0.1735, "loss_lm": 0.017348715336993337, "loss_seg": 0.15615328587591648, "mean_token_accuracy": 0.9952436983585358, "num_tokens": 547027986.0, "step": 1287 }, { "entropy": 0.019641988910734653, "epoch": 0.5637378268957216, "grad_norm": 8.375, "learning_rate": 2.744450460205739e-05, "loss": 0.1142, "loss_lm": 0.015006001805886626, "loss_seg": 0.09916149079799652, "mean_token_accuracy": 0.9951583296060562, "num_tokens": 547452992.0, "step": 1288 }, { "entropy": 0.01977735012769699, "epoch": 0.5641755115439326, "grad_norm": 8.4375, "learning_rate": 2.7441797509474824e-05, "loss": 0.0904, "loss_lm": 0.012963758781552315, "loss_seg": 0.07745868805795908, "mean_token_accuracy": 0.995133638381958, "num_tokens": 547877521.0, "step": 1289 }, { "entropy": 0.020074407570064068, "epoch": 0.5646131961921436, "grad_norm": 8.75, "learning_rate": 2.7439090416892258e-05, "loss": 0.2165, "loss_lm": 0.01768211112357676, "loss_seg": 0.19883361645042896, "mean_token_accuracy": 0.9950333684682846, "num_tokens": 548301957.0, "step": 1290 }, { "entropy": 0.01948418328538537, "epoch": 0.5650508808403545, "grad_norm": 10.375, "learning_rate": 2.7436383324309692e-05, "loss": 0.1821, "loss_lm": 0.018599551636725664, "loss_seg": 0.16350850090384483, "mean_token_accuracy": 0.9952752143144608, "num_tokens": 548726877.0, "step": 1291 }, { "entropy": 0.01945304637774825, "epoch": 0.5654885654885655, "grad_norm": 12.125, "learning_rate": 2.7433676231727126e-05, "loss": 0.1446, "loss_lm": 0.018708845134824514, "loss_seg": 0.12593314424157143, "mean_token_accuracy": 0.9952107518911362, "num_tokens": 549151940.0, "step": 1292 }, { "entropy": 0.019406550098210573, "epoch": 0.5659262501367764, "grad_norm": 11.4375, "learning_rate": 2.743096913914456e-05, "loss": 0.2121, "loss_lm": 0.015450082020834088, "loss_seg": 0.1966705657541752, "mean_token_accuracy": 0.9952083379030228, "num_tokens": 549576625.0, "step": 1293 }, { "entropy": 0.019413368310779333, "epoch": 0.5663639347849874, "grad_norm": 32.25, "learning_rate": 2.7428262046561993e-05, "loss": 0.1438, "loss_lm": 0.01673103659413755, "loss_seg": 0.12706043384969234, "mean_token_accuracy": 0.995260015130043, "num_tokens": 550001888.0, "step": 1294 }, { "entropy": 0.02060018526390195, "epoch": 0.5668016194331984, "grad_norm": 15.0625, "learning_rate": 2.7425554953979427e-05, "loss": 0.118, "loss_lm": 0.018498874735087156, "loss_seg": 0.09946440532803535, "mean_token_accuracy": 0.9948902279138565, "num_tokens": 550426773.0, "step": 1295 }, { "entropy": 0.01972961612045765, "epoch": 0.5672393040814093, "grad_norm": 6.75, "learning_rate": 2.742284786139686e-05, "loss": 0.1761, "loss_lm": 0.016645561438053846, "loss_seg": 0.1594093255698681, "mean_token_accuracy": 0.9951419532299042, "num_tokens": 550851989.0, "step": 1296 }, { "entropy": 0.019869945477694273, "epoch": 0.5676769887296204, "grad_norm": 34.0, "learning_rate": 2.7420140768814294e-05, "loss": 0.1087, "loss_lm": 0.017159386072307825, "loss_seg": 0.0915352925658226, "mean_token_accuracy": 0.9951166063547134, "num_tokens": 551277080.0, "step": 1297 }, { "entropy": 0.019213735591620207, "epoch": 0.5681146733778313, "grad_norm": 10.8125, "learning_rate": 2.7417433676231728e-05, "loss": 0.1783, "loss_lm": 0.017604968743398786, "loss_seg": 0.16067445650696754, "mean_token_accuracy": 0.9951997250318527, "num_tokens": 551702015.0, "step": 1298 }, { "entropy": 0.019597957376390696, "epoch": 0.5685523580260422, "grad_norm": 8.3125, "learning_rate": 2.741472658364916e-05, "loss": 0.1906, "loss_lm": 0.016132658580318093, "loss_seg": 0.17450713366270065, "mean_token_accuracy": 0.9950965493917465, "num_tokens": 552127598.0, "step": 1299 }, { "entropy": 0.020227705594152212, "epoch": 0.5689900426742532, "grad_norm": 15.6875, "learning_rate": 2.7412019491066595e-05, "loss": 0.1411, "loss_lm": 0.01689654914662242, "loss_seg": 0.12419924791902304, "mean_token_accuracy": 0.995007798075676, "num_tokens": 552552436.0, "step": 1300 }, { "entropy": 0.019689070992171764, "epoch": 0.5694277273224642, "grad_norm": 10.3125, "learning_rate": 2.740931239848403e-05, "loss": 0.1248, "loss_lm": 0.01731763151474297, "loss_seg": 0.10752789303660393, "mean_token_accuracy": 0.9951540529727936, "num_tokens": 552977283.0, "step": 1301 }, { "entropy": 0.019479486159980297, "epoch": 0.5698654119706751, "grad_norm": 21.0, "learning_rate": 2.7406605305901463e-05, "loss": 0.1279, "loss_lm": 0.015838836086913943, "loss_seg": 0.11207931861281395, "mean_token_accuracy": 0.9952781051397324, "num_tokens": 553401439.0, "step": 1302 }, { "entropy": 0.01961790258064866, "epoch": 0.5703030966188861, "grad_norm": 7.125, "learning_rate": 2.7403898213318897e-05, "loss": 0.1483, "loss_lm": 0.018678506836295128, "loss_seg": 0.1295961420983076, "mean_token_accuracy": 0.9950797259807587, "num_tokens": 553826410.0, "step": 1303 }, { "entropy": 0.018833247013390064, "epoch": 0.570740781267097, "grad_norm": 9.625, "learning_rate": 2.740119112073633e-05, "loss": 0.1584, "loss_lm": 0.01391274225898087, "loss_seg": 0.14444586262106895, "mean_token_accuracy": 0.9953431040048599, "num_tokens": 554251744.0, "step": 1304 }, { "entropy": 0.020095261745154858, "epoch": 0.5711784659153081, "grad_norm": 12.0625, "learning_rate": 2.7398484028153764e-05, "loss": 0.1368, "loss_lm": 0.01835287269204855, "loss_seg": 0.11844531632959843, "mean_token_accuracy": 0.9949215799570084, "num_tokens": 554676472.0, "step": 1305 }, { "entropy": 0.019671666901558638, "epoch": 0.571616150563519, "grad_norm": 17.875, "learning_rate": 2.7395776935571198e-05, "loss": 0.2121, "loss_lm": 0.01560776773840189, "loss_seg": 0.19649219326674938, "mean_token_accuracy": 0.9951983094215393, "num_tokens": 555101673.0, "step": 1306 }, { "entropy": 0.019832690246403217, "epoch": 0.5720538352117299, "grad_norm": 9.25, "learning_rate": 2.739306984298863e-05, "loss": 0.2412, "loss_lm": 0.016356934793293476, "loss_seg": 0.22483883425593376, "mean_token_accuracy": 0.9951754063367844, "num_tokens": 555527137.0, "step": 1307 }, { "entropy": 0.019559832755476236, "epoch": 0.572491519859941, "grad_norm": 6.65625, "learning_rate": 2.7390362750406065e-05, "loss": 0.2118, "loss_lm": 0.01769440737552941, "loss_seg": 0.19406254589557648, "mean_token_accuracy": 0.9952134042978287, "num_tokens": 555952667.0, "step": 1308 }, { "entropy": 0.019398794509470463, "epoch": 0.5729292045081519, "grad_norm": 6.78125, "learning_rate": 2.73876556578235e-05, "loss": 0.1469, "loss_lm": 0.016739234095439315, "loss_seg": 0.13018128089606762, "mean_token_accuracy": 0.995286613702774, "num_tokens": 556378170.0, "step": 1309 }, { "entropy": 0.019531906116753817, "epoch": 0.5733668891563628, "grad_norm": 10.6875, "learning_rate": 2.738494856524093e-05, "loss": 0.1479, "loss_lm": 0.016012556850910187, "loss_seg": 0.13187146931886673, "mean_token_accuracy": 0.9952569752931595, "num_tokens": 556802299.0, "step": 1310 }, { "entropy": 0.019703954458236694, "epoch": 0.5738045738045738, "grad_norm": 16.125, "learning_rate": 2.7382241472658366e-05, "loss": 0.1386, "loss_lm": 0.01460893452167511, "loss_seg": 0.12396993674337864, "mean_token_accuracy": 0.9951658844947815, "num_tokens": 557227589.0, "step": 1311 }, { "entropy": 0.01985739730298519, "epoch": 0.5742422584527848, "grad_norm": 6.09375, "learning_rate": 2.73795343800758e-05, "loss": 0.1583, "loss_lm": 0.01715163793414831, "loss_seg": 0.14117789641022682, "mean_token_accuracy": 0.9951333552598953, "num_tokens": 557652078.0, "step": 1312 }, { "entropy": 0.01968466117978096, "epoch": 0.5746799431009957, "grad_norm": 9.6875, "learning_rate": 2.7376827287493234e-05, "loss": 0.2164, "loss_lm": 0.01488920277915895, "loss_seg": 0.20151623338460922, "mean_token_accuracy": 0.9952045977115631, "num_tokens": 558076389.0, "step": 1313 }, { "entropy": 0.019883330911397934, "epoch": 0.5751176277492067, "grad_norm": 11.3125, "learning_rate": 2.7374120194910668e-05, "loss": 0.1377, "loss_lm": 0.01709689130075276, "loss_seg": 0.12057875655591488, "mean_token_accuracy": 0.9951735585927963, "num_tokens": 558501511.0, "step": 1314 }, { "entropy": 0.019488380756229162, "epoch": 0.5755553123974176, "grad_norm": 11.75, "learning_rate": 2.7371413102328098e-05, "loss": 0.1505, "loss_lm": 0.01690810383297503, "loss_seg": 0.1336099673062563, "mean_token_accuracy": 0.995126336812973, "num_tokens": 558926844.0, "step": 1315 }, { "entropy": 0.019647585228085518, "epoch": 0.5759929970456287, "grad_norm": 15.4375, "learning_rate": 2.736870600974553e-05, "loss": 0.147, "loss_lm": 0.017760675866156816, "loss_seg": 0.12923234142363071, "mean_token_accuracy": 0.9952303618192673, "num_tokens": 559352587.0, "step": 1316 }, { "entropy": 0.019189953804016113, "epoch": 0.5764306816938396, "grad_norm": 12.1875, "learning_rate": 2.736599891716297e-05, "loss": 0.1159, "loss_lm": 0.015176447806879878, "loss_seg": 0.1007572878152132, "mean_token_accuracy": 0.995090126991272, "num_tokens": 559777826.0, "step": 1317 }, { "entropy": 0.019369530957192183, "epoch": 0.5768683663420505, "grad_norm": 8.25, "learning_rate": 2.7363291824580402e-05, "loss": 0.1709, "loss_lm": 0.017462295712903142, "loss_seg": 0.15347526408731937, "mean_token_accuracy": 0.9951369315385818, "num_tokens": 560202810.0, "step": 1318 }, { "entropy": 0.019390554167330265, "epoch": 0.5773060509902616, "grad_norm": 10.25, "learning_rate": 2.7360584731997836e-05, "loss": 0.1946, "loss_lm": 0.015294252196326852, "loss_seg": 0.17927167192101479, "mean_token_accuracy": 0.9950716644525528, "num_tokens": 560627685.0, "step": 1319 }, { "entropy": 0.019761848729103804, "epoch": 0.5777437356384725, "grad_norm": 6.4375, "learning_rate": 2.7357877639415267e-05, "loss": 0.1591, "loss_lm": 0.015767237404361367, "loss_seg": 0.14331646263599396, "mean_token_accuracy": 0.9951439201831818, "num_tokens": 561052898.0, "step": 1320 }, { "entropy": 0.01949772611260414, "epoch": 0.5781814202866834, "grad_norm": 9.125, "learning_rate": 2.73551705468327e-05, "loss": 0.156, "loss_lm": 0.016373315826058388, "loss_seg": 0.13964361231774092, "mean_token_accuracy": 0.9951610565185547, "num_tokens": 561477542.0, "step": 1321 }, { "entropy": 0.019696049857884645, "epoch": 0.5786191049348944, "grad_norm": 22.5, "learning_rate": 2.7352463454250137e-05, "loss": 0.1599, "loss_lm": 0.016094386344775558, "loss_seg": 0.1437576748430729, "mean_token_accuracy": 0.9951797127723694, "num_tokens": 561902787.0, "step": 1322 }, { "entropy": 0.019535000436007977, "epoch": 0.5790567895831054, "grad_norm": 16.875, "learning_rate": 2.734975636166757e-05, "loss": 0.1351, "loss_lm": 0.014162054285407066, "loss_seg": 0.12089038081467152, "mean_token_accuracy": 0.99508897960186, "num_tokens": 562327491.0, "step": 1323 }, { "entropy": 0.019856902305036783, "epoch": 0.5794944742313163, "grad_norm": 17.125, "learning_rate": 2.7347049269085005e-05, "loss": 0.1845, "loss_lm": 0.014921854482963681, "loss_seg": 0.16956507973372936, "mean_token_accuracy": 0.9950222969055176, "num_tokens": 562752373.0, "step": 1324 }, { "entropy": 0.01912854751572013, "epoch": 0.5799321588795273, "grad_norm": 8.5, "learning_rate": 2.7344342176502435e-05, "loss": 0.1212, "loss_lm": 0.01618360448628664, "loss_seg": 0.10500228777527809, "mean_token_accuracy": 0.9952135682106018, "num_tokens": 563177272.0, "step": 1325 }, { "entropy": 0.019013168290257454, "epoch": 0.5803698435277382, "grad_norm": 12.5, "learning_rate": 2.734163508391987e-05, "loss": 0.1265, "loss_lm": 0.01570695941336453, "loss_seg": 0.11074455454945564, "mean_token_accuracy": 0.9953609704971313, "num_tokens": 563601877.0, "step": 1326 }, { "entropy": 0.01952212443575263, "epoch": 0.5808075281759493, "grad_norm": 12.5625, "learning_rate": 2.7338927991337306e-05, "loss": 0.1412, "loss_lm": 0.0163204085547477, "loss_seg": 0.12491567805409431, "mean_token_accuracy": 0.9951775521039963, "num_tokens": 564026324.0, "step": 1327 }, { "entropy": 0.01947280950844288, "epoch": 0.5812452128241602, "grad_norm": 15.875, "learning_rate": 2.733622089875474e-05, "loss": 0.1247, "loss_lm": 0.01725853094831109, "loss_seg": 0.10739497281610966, "mean_token_accuracy": 0.9953004121780396, "num_tokens": 564451428.0, "step": 1328 }, { "entropy": 0.019360090605914593, "epoch": 0.5816828974723711, "grad_norm": 17.875, "learning_rate": 2.7333513806172173e-05, "loss": 0.1691, "loss_lm": 0.017213650746271014, "loss_seg": 0.1519013959914446, "mean_token_accuracy": 0.9952246248722076, "num_tokens": 564875826.0, "step": 1329 }, { "entropy": 0.019290118012577295, "epoch": 0.5821205821205822, "grad_norm": 6.375, "learning_rate": 2.7330806713589604e-05, "loss": 0.1249, "loss_lm": 0.016685407143086195, "loss_seg": 0.10819016396999359, "mean_token_accuracy": 0.995113417506218, "num_tokens": 565300408.0, "step": 1330 }, { "entropy": 0.020197743084281683, "epoch": 0.5825582667687931, "grad_norm": 9.4375, "learning_rate": 2.7328099621007037e-05, "loss": 0.1214, "loss_lm": 0.014408083632588387, "loss_seg": 0.10702812857925892, "mean_token_accuracy": 0.9950064867734909, "num_tokens": 565725943.0, "step": 1331 }, { "entropy": 0.01934031117707491, "epoch": 0.582995951417004, "grad_norm": 21.25, "learning_rate": 2.7325392528424475e-05, "loss": 0.1374, "loss_lm": 0.015171685488894582, "loss_seg": 0.12218049727380276, "mean_token_accuracy": 0.9953483939170837, "num_tokens": 566150484.0, "step": 1332 }, { "entropy": 0.02002386935055256, "epoch": 0.583433636065215, "grad_norm": 12.5, "learning_rate": 2.732268543584191e-05, "loss": 0.1727, "loss_lm": 0.016994561068713665, "loss_seg": 0.15568317472934723, "mean_token_accuracy": 0.9950732290744781, "num_tokens": 566575473.0, "step": 1333 }, { "entropy": 0.019994147587567568, "epoch": 0.583871320713426, "grad_norm": 8.9375, "learning_rate": 2.731997834325934e-05, "loss": 0.1564, "loss_lm": 0.015622104052454233, "loss_seg": 0.14080112613737583, "mean_token_accuracy": 0.9952227771282196, "num_tokens": 567000368.0, "step": 1334 }, { "entropy": 0.019683317746967077, "epoch": 0.584309005361637, "grad_norm": 8.3125, "learning_rate": 2.7317271250676772e-05, "loss": 0.1131, "loss_lm": 0.016402607783675194, "loss_seg": 0.09671476483345032, "mean_token_accuracy": 0.9952505677938461, "num_tokens": 567425655.0, "step": 1335 }, { "entropy": 0.01993213826790452, "epoch": 0.5847466900098479, "grad_norm": 6.125, "learning_rate": 2.7314564158094206e-05, "loss": 0.1483, "loss_lm": 0.017107510240748525, "loss_seg": 0.13121172040700912, "mean_token_accuracy": 0.9951862990856171, "num_tokens": 567851070.0, "step": 1336 }, { "entropy": 0.01995204482227564, "epoch": 0.5851843746580588, "grad_norm": 10.25, "learning_rate": 2.7311857065511643e-05, "loss": 0.1426, "loss_lm": 0.016557261813431978, "loss_seg": 0.12599626369774342, "mean_token_accuracy": 0.9950776249170303, "num_tokens": 568276639.0, "step": 1337 }, { "entropy": 0.019260377623140812, "epoch": 0.5856220593062699, "grad_norm": 12.625, "learning_rate": 2.7309149972929077e-05, "loss": 0.1618, "loss_lm": 0.015340059762820601, "loss_seg": 0.14644450321793556, "mean_token_accuracy": 0.9952053129673004, "num_tokens": 568702272.0, "step": 1338 }, { "entropy": 0.01997317047789693, "epoch": 0.5860597439544808, "grad_norm": 6.1875, "learning_rate": 2.7306442880346507e-05, "loss": 0.1335, "loss_lm": 0.016757457982748747, "loss_seg": 0.11678426526486874, "mean_token_accuracy": 0.9950595498085022, "num_tokens": 569127069.0, "step": 1339 }, { "entropy": 0.02087012678384781, "epoch": 0.5864974286026917, "grad_norm": 11.625, "learning_rate": 2.730373578776394e-05, "loss": 0.1889, "loss_lm": 0.01683966815471649, "loss_seg": 0.1720859818160534, "mean_token_accuracy": 0.9947622418403625, "num_tokens": 569552071.0, "step": 1340 }, { "entropy": 0.019703869242221117, "epoch": 0.5869351132509028, "grad_norm": 15.3125, "learning_rate": 2.7301028695181375e-05, "loss": 0.1773, "loss_lm": 0.019131219014525414, "loss_seg": 0.15819603204727173, "mean_token_accuracy": 0.995135948061943, "num_tokens": 569977419.0, "step": 1341 }, { "entropy": 0.02003240678459406, "epoch": 0.5873727978991137, "grad_norm": 14.625, "learning_rate": 2.7298321602598812e-05, "loss": 0.1355, "loss_lm": 0.017315810779109597, "loss_seg": 0.1182309128344059, "mean_token_accuracy": 0.9950533509254456, "num_tokens": 570403554.0, "step": 1342 }, { "entropy": 0.019746275153011084, "epoch": 0.5878104825473246, "grad_norm": 14.125, "learning_rate": 2.7295614510016246e-05, "loss": 0.162, "loss_lm": 0.017315672943368554, "loss_seg": 0.1446903571486473, "mean_token_accuracy": 0.9950830787420273, "num_tokens": 570828402.0, "step": 1343 }, { "entropy": 0.019529507495462894, "epoch": 0.5882481671955356, "grad_norm": 19.0, "learning_rate": 2.7292907417433676e-05, "loss": 0.1657, "loss_lm": 0.017028774367645383, "loss_seg": 0.1486715581268072, "mean_token_accuracy": 0.9951207339763641, "num_tokens": 571253462.0, "step": 1344 }, { "entropy": 0.02005556458607316, "epoch": 0.5886858518437466, "grad_norm": 7.59375, "learning_rate": 2.729020032485111e-05, "loss": 0.151, "loss_lm": 0.019626859109848738, "loss_seg": 0.13137470744550228, "mean_token_accuracy": 0.9950584769248962, "num_tokens": 571678807.0, "step": 1345 }, { "entropy": 0.018819817807525396, "epoch": 0.5891235364919576, "grad_norm": 8.25, "learning_rate": 2.7287493232268543e-05, "loss": 0.1937, "loss_lm": 0.016821379540488124, "loss_seg": 0.1769267227500677, "mean_token_accuracy": 0.9953401833772659, "num_tokens": 572103016.0, "step": 1346 }, { "entropy": 0.019056047778576612, "epoch": 0.5895612211401685, "grad_norm": 10.75, "learning_rate": 2.7284786139685977e-05, "loss": 0.1736, "loss_lm": 0.018067590659484267, "loss_seg": 0.15551533922553062, "mean_token_accuracy": 0.9952468127012253, "num_tokens": 572528342.0, "step": 1347 }, { "entropy": 0.019606738351285458, "epoch": 0.5899989057883794, "grad_norm": 7.84375, "learning_rate": 2.7282079047103414e-05, "loss": 0.1915, "loss_lm": 0.018187844892963767, "loss_seg": 0.17329971864819527, "mean_token_accuracy": 0.9951340854167938, "num_tokens": 572953517.0, "step": 1348 }, { "entropy": 0.019492469262331724, "epoch": 0.5904365904365905, "grad_norm": 10.25, "learning_rate": 2.7279371954520845e-05, "loss": 0.1205, "loss_lm": 0.0162015815731138, "loss_seg": 0.10428519919514656, "mean_token_accuracy": 0.995140552520752, "num_tokens": 573378436.0, "step": 1349 }, { "entropy": 0.01984760072082281, "epoch": 0.5908742750848014, "grad_norm": 11.0, "learning_rate": 2.7276664861938278e-05, "loss": 0.1534, "loss_lm": 0.018143787747249007, "loss_seg": 0.13521479442715645, "mean_token_accuracy": 0.9950711578130722, "num_tokens": 573803872.0, "step": 1350 }, { "entropy": 0.019495935179293156, "epoch": 0.5913119597330123, "grad_norm": 35.25, "learning_rate": 2.7273957769355712e-05, "loss": 0.1616, "loss_lm": 0.017620900878682733, "loss_seg": 0.14396720752120018, "mean_token_accuracy": 0.9951542019844055, "num_tokens": 574228526.0, "step": 1351 }, { "entropy": 0.018932607024908066, "epoch": 0.5917496443812233, "grad_norm": 9.625, "learning_rate": 2.7271250676773146e-05, "loss": 0.1392, "loss_lm": 0.015064484672620893, "loss_seg": 0.1240896824747324, "mean_token_accuracy": 0.9952940940856934, "num_tokens": 574652410.0, "step": 1352 }, { "entropy": 0.019684759434312582, "epoch": 0.5921873290294343, "grad_norm": 10.75, "learning_rate": 2.726854358419058e-05, "loss": 0.1843, "loss_lm": 0.015584754291921854, "loss_seg": 0.16870226338505745, "mean_token_accuracy": 0.9951731711626053, "num_tokens": 575077068.0, "step": 1353 }, { "entropy": 0.019051342271268368, "epoch": 0.5926250136776453, "grad_norm": 8.125, "learning_rate": 2.7265836491608013e-05, "loss": 0.1571, "loss_lm": 0.014804823556914926, "loss_seg": 0.14231286570429802, "mean_token_accuracy": 0.9952967464923859, "num_tokens": 575501808.0, "step": 1354 }, { "entropy": 0.019623802974820137, "epoch": 0.5930626983258562, "grad_norm": 7.65625, "learning_rate": 2.7263129399025447e-05, "loss": 0.1523, "loss_lm": 0.015601371182128787, "loss_seg": 0.13666625693440437, "mean_token_accuracy": 0.9951887875795364, "num_tokens": 575926391.0, "step": 1355 }, { "entropy": 0.018812259659171104, "epoch": 0.5935003829740672, "grad_norm": 11.5625, "learning_rate": 2.726042230644288e-05, "loss": 0.1331, "loss_lm": 0.015874891076236963, "loss_seg": 0.11718763038516045, "mean_token_accuracy": 0.9953088462352753, "num_tokens": 576351181.0, "step": 1356 }, { "entropy": 0.019734154921025038, "epoch": 0.5939380676222782, "grad_norm": 13.25, "learning_rate": 2.7257715213860314e-05, "loss": 0.2243, "loss_lm": 0.017640580888837576, "loss_seg": 0.20667221583426, "mean_token_accuracy": 0.9950634986162186, "num_tokens": 576776560.0, "step": 1357 }, { "entropy": 0.019461338873952627, "epoch": 0.5943757522704891, "grad_norm": 19.875, "learning_rate": 2.7255008121277748e-05, "loss": 0.1397, "loss_lm": 0.01718348474241793, "loss_seg": 0.12250666227191687, "mean_token_accuracy": 0.9952190220355988, "num_tokens": 577201119.0, "step": 1358 }, { "entropy": 0.01927184034138918, "epoch": 0.5948134369187, "grad_norm": 5.75, "learning_rate": 2.7252301028695182e-05, "loss": 0.1541, "loss_lm": 0.01596410619094968, "loss_seg": 0.13814422115683556, "mean_token_accuracy": 0.9953326731920242, "num_tokens": 577625748.0, "step": 1359 }, { "entropy": 0.019615477416664362, "epoch": 0.5952511215669111, "grad_norm": 15.3125, "learning_rate": 2.7249593936112616e-05, "loss": 0.1561, "loss_lm": 0.01595900091342628, "loss_seg": 0.14012997038662434, "mean_token_accuracy": 0.9950535595417023, "num_tokens": 578051482.0, "step": 1360 }, { "entropy": 0.019502990879118443, "epoch": 0.595688806215122, "grad_norm": 9.4375, "learning_rate": 2.724688684353005e-05, "loss": 0.1596, "loss_lm": 0.018958416301757097, "loss_seg": 0.14063032530248165, "mean_token_accuracy": 0.9951415657997131, "num_tokens": 578476053.0, "step": 1361 }, { "entropy": 0.01961156213656068, "epoch": 0.5961264908633329, "grad_norm": 7.46875, "learning_rate": 2.7244179750947483e-05, "loss": 0.1769, "loss_lm": 0.01861150306649506, "loss_seg": 0.15827403776347637, "mean_token_accuracy": 0.9952917098999023, "num_tokens": 578901644.0, "step": 1362 }, { "entropy": 0.019655213691294193, "epoch": 0.596564175511544, "grad_norm": 11.0, "learning_rate": 2.7241472658364917e-05, "loss": 0.2561, "loss_lm": 0.01759020588360727, "loss_seg": 0.23853324353694916, "mean_token_accuracy": 0.9952498078346252, "num_tokens": 579327292.0, "step": 1363 }, { "entropy": 0.018723507411777973, "epoch": 0.5970018601597549, "grad_norm": 11.4375, "learning_rate": 2.723876556578235e-05, "loss": 0.1654, "loss_lm": 0.015986642567440867, "loss_seg": 0.14942886121571064, "mean_token_accuracy": 0.9953060746192932, "num_tokens": 579752702.0, "step": 1364 }, { "entropy": 0.019514467101544142, "epoch": 0.5974395448079659, "grad_norm": 7.5, "learning_rate": 2.7236058473199784e-05, "loss": 0.1644, "loss_lm": 0.015428133076056838, "loss_seg": 0.148961978033185, "mean_token_accuracy": 0.9953292310237885, "num_tokens": 580177861.0, "step": 1365 }, { "entropy": 0.01936971675604582, "epoch": 0.5978772294561768, "grad_norm": 5.6875, "learning_rate": 2.7233351380617218e-05, "loss": 0.1864, "loss_lm": 0.015778732486069202, "loss_seg": 0.1706116981804371, "mean_token_accuracy": 0.995110958814621, "num_tokens": 580602864.0, "step": 1366 }, { "entropy": 0.01954676629975438, "epoch": 0.5983149141043878, "grad_norm": 14.25, "learning_rate": 2.723064428803465e-05, "loss": 0.16, "loss_lm": 0.015660992125049233, "loss_seg": 0.14433391578495502, "mean_token_accuracy": 0.995142936706543, "num_tokens": 581027371.0, "step": 1367 }, { "entropy": 0.019343260675668716, "epoch": 0.5987525987525988, "grad_norm": 24.125, "learning_rate": 2.7227937195452085e-05, "loss": 0.145, "loss_lm": 0.016214420553296804, "loss_seg": 0.12879678606987, "mean_token_accuracy": 0.9952399730682373, "num_tokens": 581452157.0, "step": 1368 }, { "entropy": 0.01943290326744318, "epoch": 0.5991902834008097, "grad_norm": 7.71875, "learning_rate": 2.722523010286952e-05, "loss": 0.1968, "loss_lm": 0.015044009545817971, "loss_seg": 0.181720782071352, "mean_token_accuracy": 0.9951010495424271, "num_tokens": 581877739.0, "step": 1369 }, { "entropy": 0.02025687787681818, "epoch": 0.5996279680490206, "grad_norm": 9.625, "learning_rate": 2.7222523010286953e-05, "loss": 0.1444, "loss_lm": 0.018149742856621742, "loss_seg": 0.12628796324133873, "mean_token_accuracy": 0.9949820041656494, "num_tokens": 582302993.0, "step": 1370 }, { "entropy": 0.019482959527522326, "epoch": 0.6000656526972317, "grad_norm": 9.75, "learning_rate": 2.7219815917704387e-05, "loss": 0.1324, "loss_lm": 0.01703076367266476, "loss_seg": 0.11540467478334904, "mean_token_accuracy": 0.9951852262020111, "num_tokens": 582727896.0, "step": 1371 }, { "entropy": 0.020068337209522724, "epoch": 0.6005033373454426, "grad_norm": 10.625, "learning_rate": 2.721710882512182e-05, "loss": 0.1361, "loss_lm": 0.015395448310300708, "loss_seg": 0.12068247050046921, "mean_token_accuracy": 0.9950722754001617, "num_tokens": 583152571.0, "step": 1372 }, { "entropy": 0.02043439168483019, "epoch": 0.6009410219936536, "grad_norm": 9.25, "learning_rate": 2.7214401732539254e-05, "loss": 0.1566, "loss_lm": 0.017169510945677757, "loss_seg": 0.13946321979165077, "mean_token_accuracy": 0.9950323551893234, "num_tokens": 583578487.0, "step": 1373 }, { "entropy": 0.02051458414644003, "epoch": 0.6013787066418645, "grad_norm": 16.375, "learning_rate": 2.7211694639956688e-05, "loss": 0.1739, "loss_lm": 0.016966681461781263, "loss_seg": 0.1569811999797821, "mean_token_accuracy": 0.99498151242733, "num_tokens": 584003686.0, "step": 1374 }, { "entropy": 0.019609068986028433, "epoch": 0.6018163912900755, "grad_norm": 9.1875, "learning_rate": 2.720898754737412e-05, "loss": 0.1495, "loss_lm": 0.01657932810485363, "loss_seg": 0.13292556814849377, "mean_token_accuracy": 0.9952994883060455, "num_tokens": 584428473.0, "step": 1375 }, { "entropy": 0.019863348454236984, "epoch": 0.6022540759382865, "grad_norm": 6.6875, "learning_rate": 2.7206280454791555e-05, "loss": 0.1467, "loss_lm": 0.01641199574805796, "loss_seg": 0.13030085526406765, "mean_token_accuracy": 0.9952494502067566, "num_tokens": 584853206.0, "step": 1376 }, { "entropy": 0.019475662149488926, "epoch": 0.6026917605864974, "grad_norm": 12.0625, "learning_rate": 2.7203573362208986e-05, "loss": 0.1397, "loss_lm": 0.01661978126503527, "loss_seg": 0.1231171190738678, "mean_token_accuracy": 0.9951550513505936, "num_tokens": 585278161.0, "step": 1377 }, { "entropy": 0.01937192026525736, "epoch": 0.6031294452347084, "grad_norm": 14.9375, "learning_rate": 2.7200866269626423e-05, "loss": 0.1658, "loss_lm": 0.01597199379466474, "loss_seg": 0.14986537396907806, "mean_token_accuracy": 0.9952970147132874, "num_tokens": 585703354.0, "step": 1378 }, { "entropy": 0.019483039621263742, "epoch": 0.6035671298829194, "grad_norm": 12.3125, "learning_rate": 2.7198159177043856e-05, "loss": 0.1306, "loss_lm": 0.016477426746860147, "loss_seg": 0.11408587917685509, "mean_token_accuracy": 0.9952885508537292, "num_tokens": 586128456.0, "step": 1379 }, { "entropy": 0.019357146695256233, "epoch": 0.6040048145311303, "grad_norm": 8.1875, "learning_rate": 2.719545208446129e-05, "loss": 0.2025, "loss_lm": 0.016438312828540802, "loss_seg": 0.18605555221438408, "mean_token_accuracy": 0.9953526705503464, "num_tokens": 586553591.0, "step": 1380 }, { "entropy": 0.020070358645170927, "epoch": 0.6044424991793412, "grad_norm": 7.96875, "learning_rate": 2.7192744991878724e-05, "loss": 0.1373, "loss_lm": 0.01671492331661284, "loss_seg": 0.12055826932191849, "mean_token_accuracy": 0.995136022567749, "num_tokens": 586978712.0, "step": 1381 }, { "entropy": 0.020141441375017166, "epoch": 0.6048801838275523, "grad_norm": 7.46875, "learning_rate": 2.7190037899296154e-05, "loss": 0.1274, "loss_lm": 0.01724234689027071, "loss_seg": 0.11011207476258278, "mean_token_accuracy": 0.995060920715332, "num_tokens": 587403675.0, "step": 1382 }, { "entropy": 0.019641911145299673, "epoch": 0.6053178684757632, "grad_norm": 22.625, "learning_rate": 2.7187330806713588e-05, "loss": 0.12, "loss_lm": 0.01500439876690507, "loss_seg": 0.10501305013895035, "mean_token_accuracy": 0.9953315258026123, "num_tokens": 587829025.0, "step": 1383 }, { "entropy": 0.019905188586562872, "epoch": 0.6057555531239742, "grad_norm": 5.4375, "learning_rate": 2.7184623714131025e-05, "loss": 0.1699, "loss_lm": 0.016616801032796502, "loss_seg": 0.15323673002421856, "mean_token_accuracy": 0.9951620995998383, "num_tokens": 588253962.0, "step": 1384 }, { "entropy": 0.01950502209365368, "epoch": 0.6061932377721851, "grad_norm": 21.875, "learning_rate": 2.718191662154846e-05, "loss": 0.1448, "loss_lm": 0.01681472361087799, "loss_seg": 0.12801148928701878, "mean_token_accuracy": 0.9953131377696991, "num_tokens": 588678926.0, "step": 1385 }, { "entropy": 0.020323413889855146, "epoch": 0.6066309224203961, "grad_norm": 6.625, "learning_rate": 2.7179209528965892e-05, "loss": 0.1225, "loss_lm": 0.01702142250724137, "loss_seg": 0.10546904429793358, "mean_token_accuracy": 0.9948772192001343, "num_tokens": 589104669.0, "step": 1386 }, { "entropy": 0.019350983668118715, "epoch": 0.6070686070686071, "grad_norm": 6.40625, "learning_rate": 2.7176502436383323e-05, "loss": 0.1691, "loss_lm": 0.01416559424251318, "loss_seg": 0.1549560148268938, "mean_token_accuracy": 0.9952772408723831, "num_tokens": 589529380.0, "step": 1387 }, { "entropy": 0.01951995911076665, "epoch": 0.607506291716818, "grad_norm": 9.4375, "learning_rate": 2.7173795343800756e-05, "loss": 0.2098, "loss_lm": 0.01862860145047307, "loss_seg": 0.19118225201964378, "mean_token_accuracy": 0.9952420294284821, "num_tokens": 589954330.0, "step": 1388 }, { "entropy": 0.019944991450756788, "epoch": 0.607943976365029, "grad_norm": 8.0625, "learning_rate": 2.7171088251218194e-05, "loss": 0.219, "loss_lm": 0.016473687952384353, "loss_seg": 0.20248295739293098, "mean_token_accuracy": 0.9950627088546753, "num_tokens": 590379625.0, "step": 1389 }, { "entropy": 0.019977339077740908, "epoch": 0.60838166101324, "grad_norm": 11.625, "learning_rate": 2.7168381158635627e-05, "loss": 0.1327, "loss_lm": 0.018405034206807613, "loss_seg": 0.11429428309202194, "mean_token_accuracy": 0.9951722621917725, "num_tokens": 590804822.0, "step": 1390 }, { "entropy": 0.020040574483573437, "epoch": 0.6088193456614509, "grad_norm": 21.125, "learning_rate": 2.716567406605306e-05, "loss": 0.2102, "loss_lm": 0.017028342001140118, "loss_seg": 0.1931801438331604, "mean_token_accuracy": 0.995129182934761, "num_tokens": 591229979.0, "step": 1391 }, { "entropy": 0.019969026558101177, "epoch": 0.6092570303096619, "grad_norm": 9.75, "learning_rate": 2.716296697347049e-05, "loss": 0.1959, "loss_lm": 0.019969736225903034, "loss_seg": 0.17597416788339615, "mean_token_accuracy": 0.9949686825275421, "num_tokens": 591654886.0, "step": 1392 }, { "entropy": 0.019891364965587854, "epoch": 0.6096947149578729, "grad_norm": 13.4375, "learning_rate": 2.7160259880887925e-05, "loss": 0.1925, "loss_lm": 0.01823025313206017, "loss_seg": 0.17425265721976757, "mean_token_accuracy": 0.9951877892017365, "num_tokens": 592079407.0, "step": 1393 }, { "entropy": 0.019735191483050585, "epoch": 0.6101323996060838, "grad_norm": 10.0625, "learning_rate": 2.7157552788305362e-05, "loss": 0.1733, "loss_lm": 0.017761550610885024, "loss_seg": 0.15551986545324326, "mean_token_accuracy": 0.995144858956337, "num_tokens": 592504080.0, "step": 1394 }, { "entropy": 0.019844939932227135, "epoch": 0.6105700842542948, "grad_norm": 16.0, "learning_rate": 2.7154845695722796e-05, "loss": 0.1839, "loss_lm": 0.016882835887372494, "loss_seg": 0.16699335724115372, "mean_token_accuracy": 0.9953626096248627, "num_tokens": 592929477.0, "step": 1395 }, { "entropy": 0.019810551777482033, "epoch": 0.6110077689025057, "grad_norm": 23.125, "learning_rate": 2.715213860314023e-05, "loss": 0.1392, "loss_lm": 0.016624177107587457, "loss_seg": 0.12260815501213074, "mean_token_accuracy": 0.9951961189508438, "num_tokens": 593355765.0, "step": 1396 }, { "entropy": 0.020220541395246983, "epoch": 0.6114454535507167, "grad_norm": 10.1875, "learning_rate": 2.714943151055766e-05, "loss": 0.1891, "loss_lm": 0.01688472693786025, "loss_seg": 0.17219825088977814, "mean_token_accuracy": 0.9950398206710815, "num_tokens": 593780322.0, "step": 1397 }, { "entropy": 0.020543282851576805, "epoch": 0.6118831381989277, "grad_norm": 10.25, "learning_rate": 2.7146724417975094e-05, "loss": 0.1184, "loss_lm": 0.015506352297961712, "loss_seg": 0.10286652576178312, "mean_token_accuracy": 0.9949808716773987, "num_tokens": 594205053.0, "step": 1398 }, { "entropy": 0.019548961892724037, "epoch": 0.6123208228471386, "grad_norm": 8.6875, "learning_rate": 2.714401732539253e-05, "loss": 0.151, "loss_lm": 0.01631236099638045, "loss_seg": 0.13465136103332043, "mean_token_accuracy": 0.9951215982437134, "num_tokens": 594630169.0, "step": 1399 }, { "entropy": 0.018908373545855284, "epoch": 0.6127585074953495, "grad_norm": 7.21875, "learning_rate": 2.7141310232809965e-05, "loss": 0.1242, "loss_lm": 0.016168175265192986, "loss_seg": 0.10798749327659607, "mean_token_accuracy": 0.9954513311386108, "num_tokens": 595055291.0, "step": 1400 }, { "entropy": 0.019363957922905684, "epoch": 0.6131961921435606, "grad_norm": 9.375, "learning_rate": 2.7138603140227395e-05, "loss": 0.1061, "loss_lm": 0.015510238008573651, "loss_seg": 0.09057959914207458, "mean_token_accuracy": 0.9952655136585236, "num_tokens": 595480647.0, "step": 1401 }, { "entropy": 0.01953937252983451, "epoch": 0.6136338767917715, "grad_norm": 17.75, "learning_rate": 2.713589604764483e-05, "loss": 0.1746, "loss_lm": 0.01643529860302806, "loss_seg": 0.1582134086638689, "mean_token_accuracy": 0.9951732754707336, "num_tokens": 595905753.0, "step": 1402 }, { "entropy": 0.019666119012981653, "epoch": 0.6140715614399825, "grad_norm": 30.875, "learning_rate": 2.7133188955062262e-05, "loss": 0.1744, "loss_lm": 0.0163270712364465, "loss_seg": 0.1580631509423256, "mean_token_accuracy": 0.9951665997505188, "num_tokens": 596330710.0, "step": 1403 }, { "entropy": 0.019984992686659098, "epoch": 0.6145092460881935, "grad_norm": 11.0625, "learning_rate": 2.71304818624797e-05, "loss": 0.1661, "loss_lm": 0.017425256315618753, "loss_seg": 0.14867867901921272, "mean_token_accuracy": 0.9951986074447632, "num_tokens": 596755729.0, "step": 1404 }, { "entropy": 0.019606669433414936, "epoch": 0.6149469307364044, "grad_norm": 7.46875, "learning_rate": 2.7127774769897133e-05, "loss": 0.1518, "loss_lm": 0.01664712466299534, "loss_seg": 0.13518920727074146, "mean_token_accuracy": 0.9951758682727814, "num_tokens": 597181241.0, "step": 1405 }, { "entropy": 0.019624950364232063, "epoch": 0.6153846153846154, "grad_norm": 6.34375, "learning_rate": 2.7125067677314564e-05, "loss": 0.1522, "loss_lm": 0.01718910434283316, "loss_seg": 0.135060615837574, "mean_token_accuracy": 0.9951810836791992, "num_tokens": 597605393.0, "step": 1406 }, { "entropy": 0.01917695812880993, "epoch": 0.6158223000328263, "grad_norm": 7.71875, "learning_rate": 2.7122360584731997e-05, "loss": 0.1703, "loss_lm": 0.015224294969812036, "loss_seg": 0.15506495349109173, "mean_token_accuracy": 0.9951479732990265, "num_tokens": 598030159.0, "step": 1407 }, { "entropy": 0.019871163181960583, "epoch": 0.6162599846810373, "grad_norm": 16.875, "learning_rate": 2.711965349214943e-05, "loss": 0.1911, "loss_lm": 0.019206269411370158, "loss_seg": 0.17190114222466946, "mean_token_accuracy": 0.9951798617839813, "num_tokens": 598454806.0, "step": 1408 }, { "entropy": 0.019334581214934587, "epoch": 0.6166976693292483, "grad_norm": 6.3125, "learning_rate": 2.7116946399566868e-05, "loss": 0.1688, "loss_lm": 0.015420784475281835, "loss_seg": 0.15337075479328632, "mean_token_accuracy": 0.9952064156532288, "num_tokens": 598879911.0, "step": 1409 }, { "entropy": 0.019642765633761883, "epoch": 0.6171353539774592, "grad_norm": 20.625, "learning_rate": 2.7114239306984302e-05, "loss": 0.1325, "loss_lm": 0.0177694174926728, "loss_seg": 0.11472720094025135, "mean_token_accuracy": 0.9950624853372574, "num_tokens": 599305391.0, "step": 1410 }, { "entropy": 0.02008006628602743, "epoch": 0.6175730386256703, "grad_norm": 8.1875, "learning_rate": 2.7111532214401732e-05, "loss": 0.1108, "loss_lm": 0.014738531550392509, "loss_seg": 0.09604829922318459, "mean_token_accuracy": 0.9951317757368088, "num_tokens": 599729936.0, "step": 1411 }, { "entropy": 0.020012944471091032, "epoch": 0.6180107232738812, "grad_norm": 25.625, "learning_rate": 2.7108825121819166e-05, "loss": 0.1157, "loss_lm": 0.01744033768773079, "loss_seg": 0.09829745441675186, "mean_token_accuracy": 0.9950132220983505, "num_tokens": 600155045.0, "step": 1412 }, { "entropy": 0.020325802732259035, "epoch": 0.6184484079220921, "grad_norm": 7.09375, "learning_rate": 2.71061180292366e-05, "loss": 0.1935, "loss_lm": 0.02018918702378869, "loss_seg": 0.17327209375798702, "mean_token_accuracy": 0.9949408769607544, "num_tokens": 600580391.0, "step": 1413 }, { "entropy": 0.019583539105951786, "epoch": 0.6188860925703031, "grad_norm": 6.59375, "learning_rate": 2.7103410936654033e-05, "loss": 0.1558, "loss_lm": 0.018024866469204426, "loss_seg": 0.13777054101228714, "mean_token_accuracy": 0.9951837658882141, "num_tokens": 601005583.0, "step": 1414 }, { "entropy": 0.019542517140507698, "epoch": 0.6193237772185141, "grad_norm": 7.71875, "learning_rate": 2.710070384407147e-05, "loss": 0.1569, "loss_lm": 0.01667365524917841, "loss_seg": 0.14018826559185982, "mean_token_accuracy": 0.9951742142438889, "num_tokens": 601430559.0, "step": 1415 }, { "entropy": 0.019358986523002386, "epoch": 0.619761461866725, "grad_norm": 10.5, "learning_rate": 2.70979967514889e-05, "loss": 0.1559, "loss_lm": 0.015917113749310374, "loss_seg": 0.13993581011891365, "mean_token_accuracy": 0.9951967000961304, "num_tokens": 601856309.0, "step": 1416 }, { "entropy": 0.019434419460594654, "epoch": 0.620199146514936, "grad_norm": 15.5625, "learning_rate": 2.7095289658906335e-05, "loss": 0.1967, "loss_lm": 0.01671572122722864, "loss_seg": 0.1799982264637947, "mean_token_accuracy": 0.9951344728469849, "num_tokens": 602281721.0, "step": 1417 }, { "entropy": 0.020204068161547184, "epoch": 0.6206368311631469, "grad_norm": 9.0, "learning_rate": 2.7092582566323768e-05, "loss": 0.2235, "loss_lm": 0.017952619353309274, "loss_seg": 0.2055547498166561, "mean_token_accuracy": 0.995005875825882, "num_tokens": 602706426.0, "step": 1418 }, { "entropy": 0.01921110227704048, "epoch": 0.6210745158113579, "grad_norm": 11.3125, "learning_rate": 2.7089875473741202e-05, "loss": 0.1567, "loss_lm": 0.015346852596849203, "loss_seg": 0.14140126667916775, "mean_token_accuracy": 0.9954012781381607, "num_tokens": 603131252.0, "step": 1419 }, { "entropy": 0.019870975986123085, "epoch": 0.6215122004595689, "grad_norm": 11.125, "learning_rate": 2.708716838115864e-05, "loss": 0.1273, "loss_lm": 0.015258762519806623, "loss_seg": 0.1120519982650876, "mean_token_accuracy": 0.9950782507658005, "num_tokens": 603556699.0, "step": 1420 }, { "entropy": 0.02007709164172411, "epoch": 0.6219498851077798, "grad_norm": 11.6875, "learning_rate": 2.708446128857607e-05, "loss": 0.1425, "loss_lm": 0.01519528403878212, "loss_seg": 0.12728452682495117, "mean_token_accuracy": 0.9951508045196533, "num_tokens": 603981883.0, "step": 1421 }, { "entropy": 0.020006680861115456, "epoch": 0.6223875697559909, "grad_norm": 10.4375, "learning_rate": 2.7081754195993503e-05, "loss": 0.1206, "loss_lm": 0.017753833439201117, "loss_seg": 0.10280642844736576, "mean_token_accuracy": 0.9951657205820084, "num_tokens": 604407550.0, "step": 1422 }, { "entropy": 0.019754222128540277, "epoch": 0.6228252544042018, "grad_norm": 20.125, "learning_rate": 2.7079047103410937e-05, "loss": 0.1606, "loss_lm": 0.016387091483920813, "loss_seg": 0.14424735493957996, "mean_token_accuracy": 0.9951499104499817, "num_tokens": 604832873.0, "step": 1423 }, { "entropy": 0.019076218362897635, "epoch": 0.6232629390524127, "grad_norm": 12.3125, "learning_rate": 2.707634001082837e-05, "loss": 0.1069, "loss_lm": 0.018582853488624096, "loss_seg": 0.08831299003213644, "mean_token_accuracy": 0.9951700270175934, "num_tokens": 605257697.0, "step": 1424 }, { "entropy": 0.01985138887539506, "epoch": 0.6237006237006237, "grad_norm": 5.625, "learning_rate": 2.7073632918245804e-05, "loss": 0.1055, "loss_lm": 0.014413055265322328, "loss_seg": 0.09107011556625366, "mean_token_accuracy": 0.995250478386879, "num_tokens": 605682764.0, "step": 1425 }, { "entropy": 0.019910542760044336, "epoch": 0.6241383083488347, "grad_norm": 6.53125, "learning_rate": 2.7070925825663238e-05, "loss": 0.217, "loss_lm": 0.016965609043836594, "loss_seg": 0.2000257596373558, "mean_token_accuracy": 0.9950501620769501, "num_tokens": 606108154.0, "step": 1426 }, { "entropy": 0.019764423836022615, "epoch": 0.6245759929970456, "grad_norm": 14.75, "learning_rate": 2.7068218733080672e-05, "loss": 0.1622, "loss_lm": 0.014990347204729915, "loss_seg": 0.14722510054707527, "mean_token_accuracy": 0.9950245320796967, "num_tokens": 606533423.0, "step": 1427 }, { "entropy": 0.019572392106056213, "epoch": 0.6250136776452566, "grad_norm": 12.25, "learning_rate": 2.7065511640498106e-05, "loss": 0.1273, "loss_lm": 0.014975739177316427, "loss_seg": 0.11230474710464478, "mean_token_accuracy": 0.9951574355363846, "num_tokens": 606958766.0, "step": 1428 }, { "entropy": 0.01975653739646077, "epoch": 0.6254513622934675, "grad_norm": 18.5, "learning_rate": 2.706280454791554e-05, "loss": 0.1294, "loss_lm": 0.017419883515685797, "loss_seg": 0.11200241185724735, "mean_token_accuracy": 0.9950371235609055, "num_tokens": 607383511.0, "step": 1429 }, { "entropy": 0.019559052772819996, "epoch": 0.6258890469416786, "grad_norm": 10.125, "learning_rate": 2.7060097455332973e-05, "loss": 0.1656, "loss_lm": 0.015412120847031474, "loss_seg": 0.1501620151102543, "mean_token_accuracy": 0.9951128959655762, "num_tokens": 607808878.0, "step": 1430 }, { "entropy": 0.019768858794122934, "epoch": 0.6263267315898895, "grad_norm": 5.71875, "learning_rate": 2.7057390362750407e-05, "loss": 0.1354, "loss_lm": 0.018849611282348633, "loss_seg": 0.1165679544210434, "mean_token_accuracy": 0.9951237142086029, "num_tokens": 608234386.0, "step": 1431 }, { "entropy": 0.01898388471454382, "epoch": 0.6267644162381004, "grad_norm": 12.0625, "learning_rate": 2.705468327016784e-05, "loss": 0.1518, "loss_lm": 0.01611983310431242, "loss_seg": 0.1357062514871359, "mean_token_accuracy": 0.995236873626709, "num_tokens": 608659493.0, "step": 1432 }, { "entropy": 0.01974737923592329, "epoch": 0.6272021008863115, "grad_norm": 10.125, "learning_rate": 2.7051976177585274e-05, "loss": 0.1702, "loss_lm": 0.01648699212819338, "loss_seg": 0.15373330563306808, "mean_token_accuracy": 0.9950813204050064, "num_tokens": 609083774.0, "step": 1433 }, { "entropy": 0.01940000569447875, "epoch": 0.6276397855345224, "grad_norm": 11.4375, "learning_rate": 2.7049269085002708e-05, "loss": 0.1447, "loss_lm": 0.014402707573026419, "loss_seg": 0.13032406382262707, "mean_token_accuracy": 0.995200514793396, "num_tokens": 609508793.0, "step": 1434 }, { "entropy": 0.0192378805950284, "epoch": 0.6280774701827333, "grad_norm": 14.3125, "learning_rate": 2.704656199242014e-05, "loss": 0.1294, "loss_lm": 0.016876571346074343, "loss_seg": 0.11253260262310505, "mean_token_accuracy": 0.9952253699302673, "num_tokens": 609933938.0, "step": 1435 }, { "entropy": 0.01879999879747629, "epoch": 0.6285151548309443, "grad_norm": 7.25, "learning_rate": 2.7043854899837575e-05, "loss": 0.145, "loss_lm": 0.015305812936276197, "loss_seg": 0.1296878382563591, "mean_token_accuracy": 0.9952881634235382, "num_tokens": 610358775.0, "step": 1436 }, { "entropy": 0.02015199651941657, "epoch": 0.6289528394791553, "grad_norm": 15.5, "learning_rate": 2.704114780725501e-05, "loss": 0.1521, "loss_lm": 0.017736171139404178, "loss_seg": 0.1343389768153429, "mean_token_accuracy": 0.9949414730072021, "num_tokens": 610783558.0, "step": 1437 }, { "entropy": 0.01989209046587348, "epoch": 0.6293905241273662, "grad_norm": 10.5625, "learning_rate": 2.7038440714672443e-05, "loss": 0.1557, "loss_lm": 0.018949895165860653, "loss_seg": 0.1367217805236578, "mean_token_accuracy": 0.9949910342693329, "num_tokens": 611209365.0, "step": 1438 }, { "entropy": 0.019744286313652992, "epoch": 0.6298282087755772, "grad_norm": 6.3125, "learning_rate": 2.7035733622089877e-05, "loss": 0.1303, "loss_lm": 0.015804585069417953, "loss_seg": 0.1144717549905181, "mean_token_accuracy": 0.9951045215129852, "num_tokens": 611633978.0, "step": 1439 }, { "entropy": 0.019456338603049517, "epoch": 0.6302658934237881, "grad_norm": 15.8125, "learning_rate": 2.703302652950731e-05, "loss": 0.1205, "loss_lm": 0.017690204549580812, "loss_seg": 0.10278509557247162, "mean_token_accuracy": 0.9952830970287323, "num_tokens": 612059207.0, "step": 1440 }, { "entropy": 0.019263347145169973, "epoch": 0.6307035780719992, "grad_norm": 31.625, "learning_rate": 2.7030319436924744e-05, "loss": 0.1437, "loss_lm": 0.01592856994830072, "loss_seg": 0.12776361219584942, "mean_token_accuracy": 0.9952558726072311, "num_tokens": 612484117.0, "step": 1441 }, { "entropy": 0.01951735047623515, "epoch": 0.6311412627202101, "grad_norm": 13.0, "learning_rate": 2.7027612344342178e-05, "loss": 0.1806, "loss_lm": 0.017098215641453862, "loss_seg": 0.16351579129695892, "mean_token_accuracy": 0.9952757209539413, "num_tokens": 612908906.0, "step": 1442 }, { "entropy": 0.01894550770521164, "epoch": 0.631578947368421, "grad_norm": 20.625, "learning_rate": 2.702490525175961e-05, "loss": 0.1826, "loss_lm": 0.014826602069661021, "loss_seg": 0.16781898215413094, "mean_token_accuracy": 0.9953829199075699, "num_tokens": 613333764.0, "step": 1443 }, { "entropy": 0.019646370317786932, "epoch": 0.632016632016632, "grad_norm": 7.03125, "learning_rate": 2.7022198159177045e-05, "loss": 0.2003, "loss_lm": 0.01617956766858697, "loss_seg": 0.18407280184328556, "mean_token_accuracy": 0.9951297491788864, "num_tokens": 613758360.0, "step": 1444 }, { "entropy": 0.019907377660274506, "epoch": 0.632454316664843, "grad_norm": 6.09375, "learning_rate": 2.701949106659448e-05, "loss": 0.2048, "loss_lm": 0.019310099771246314, "loss_seg": 0.18550795689225197, "mean_token_accuracy": 0.9951674491167068, "num_tokens": 614183154.0, "step": 1445 }, { "entropy": 0.019342843908816576, "epoch": 0.6328920013130539, "grad_norm": 8.5625, "learning_rate": 2.7016783974011913e-05, "loss": 0.1544, "loss_lm": 0.015381454024463892, "loss_seg": 0.13898959569633007, "mean_token_accuracy": 0.9952209144830704, "num_tokens": 614608310.0, "step": 1446 }, { "entropy": 0.019439980387687683, "epoch": 0.6333296859612649, "grad_norm": 12.4375, "learning_rate": 2.7014076881429346e-05, "loss": 0.1951, "loss_lm": 0.01795992790721357, "loss_seg": 0.17709221318364143, "mean_token_accuracy": 0.9951573610305786, "num_tokens": 615033261.0, "step": 1447 }, { "entropy": 0.019351529888808727, "epoch": 0.6337673706094759, "grad_norm": 14.8125, "learning_rate": 2.701136978884678e-05, "loss": 0.1389, "loss_lm": 0.017656413838267326, "loss_seg": 0.12124638631939888, "mean_token_accuracy": 0.9952023476362228, "num_tokens": 615458405.0, "step": 1448 }, { "entropy": 0.01968299550935626, "epoch": 0.6342050552576869, "grad_norm": 15.625, "learning_rate": 2.700866269626421e-05, "loss": 0.1503, "loss_lm": 0.015707302140071988, "loss_seg": 0.13457109965384007, "mean_token_accuracy": 0.9951005727052689, "num_tokens": 615884369.0, "step": 1449 }, { "entropy": 0.020065275952219963, "epoch": 0.6346427399058978, "grad_norm": 17.0, "learning_rate": 2.7005955603681644e-05, "loss": 0.1171, "loss_lm": 0.016442543594166636, "loss_seg": 0.10065283998847008, "mean_token_accuracy": 0.9951126277446747, "num_tokens": 616310109.0, "step": 1450 }, { "entropy": 0.019887205213308334, "epoch": 0.6350804245541087, "grad_norm": 6.28125, "learning_rate": 2.700324851109908e-05, "loss": 0.129, "loss_lm": 0.01675606006756425, "loss_seg": 0.11220961064100266, "mean_token_accuracy": 0.9951221793889999, "num_tokens": 616735051.0, "step": 1451 }, { "entropy": 0.019903529901057482, "epoch": 0.6355181092023198, "grad_norm": 20.75, "learning_rate": 2.7000541418516515e-05, "loss": 0.1323, "loss_lm": 0.017412346554920077, "loss_seg": 0.11487367376685143, "mean_token_accuracy": 0.9951423704624176, "num_tokens": 617159886.0, "step": 1452 }, { "entropy": 0.019668471533805132, "epoch": 0.6359557938505307, "grad_norm": 21.75, "learning_rate": 2.699783432593395e-05, "loss": 0.128, "loss_lm": 0.016742126550525427, "loss_seg": 0.11129383370280266, "mean_token_accuracy": 0.9951261132955551, "num_tokens": 617585391.0, "step": 1453 }, { "entropy": 0.019068286288529634, "epoch": 0.6363934784987416, "grad_norm": 15.6875, "learning_rate": 2.699512723335138e-05, "loss": 0.1925, "loss_lm": 0.01949323946610093, "loss_seg": 0.1729699857532978, "mean_token_accuracy": 0.9953784644603729, "num_tokens": 618009884.0, "step": 1454 }, { "entropy": 0.019320223480463028, "epoch": 0.6368311631469527, "grad_norm": 12.4375, "learning_rate": 2.6992420140768813e-05, "loss": 0.2178, "loss_lm": 0.018098381347954273, "loss_seg": 0.19966930523514748, "mean_token_accuracy": 0.9951674938201904, "num_tokens": 618435315.0, "step": 1455 }, { "entropy": 0.019056899938732386, "epoch": 0.6372688477951636, "grad_norm": 9.25, "learning_rate": 2.698971304818625e-05, "loss": 0.1624, "loss_lm": 0.014948752475902438, "loss_seg": 0.1474807672202587, "mean_token_accuracy": 0.9952911287546158, "num_tokens": 618859586.0, "step": 1456 }, { "entropy": 0.019794685300439596, "epoch": 0.6377065324433745, "grad_norm": 13.5625, "learning_rate": 2.6987005955603684e-05, "loss": 0.1553, "loss_lm": 0.01646861992776394, "loss_seg": 0.13886311277747154, "mean_token_accuracy": 0.9951421469449997, "num_tokens": 619284226.0, "step": 1457 }, { "entropy": 0.019095211755484343, "epoch": 0.6381442170915855, "grad_norm": 4.8125, "learning_rate": 2.6984298863021117e-05, "loss": 0.1767, "loss_lm": 0.019160160096362233, "loss_seg": 0.15751873329281807, "mean_token_accuracy": 0.9952867180109024, "num_tokens": 619709792.0, "step": 1458 }, { "entropy": 0.019457585643976927, "epoch": 0.6385819017397965, "grad_norm": 9.6875, "learning_rate": 2.6981591770438548e-05, "loss": 0.2114, "loss_lm": 0.018113365164026618, "loss_seg": 0.19332430139183998, "mean_token_accuracy": 0.9952239096164703, "num_tokens": 620134577.0, "step": 1459 }, { "entropy": 0.02013984229415655, "epoch": 0.6390195863880075, "grad_norm": 15.9375, "learning_rate": 2.697888467785598e-05, "loss": 0.1524, "loss_lm": 0.015653847716748714, "loss_seg": 0.13672325387597084, "mean_token_accuracy": 0.9950017780065536, "num_tokens": 620559324.0, "step": 1460 }, { "entropy": 0.019289780408143997, "epoch": 0.6394572710362184, "grad_norm": 9.8125, "learning_rate": 2.697617758527342e-05, "loss": 0.1518, "loss_lm": 0.01679828343912959, "loss_seg": 0.13497286662459373, "mean_token_accuracy": 0.9952671974897385, "num_tokens": 620984217.0, "step": 1461 }, { "entropy": 0.019731350243091583, "epoch": 0.6398949556844293, "grad_norm": 11.0, "learning_rate": 2.6973470492690852e-05, "loss": 0.1128, "loss_lm": 0.0190454525873065, "loss_seg": 0.09379393514245749, "mean_token_accuracy": 0.9950640648603439, "num_tokens": 621408436.0, "step": 1462 }, { "entropy": 0.020114409737288952, "epoch": 0.6403326403326404, "grad_norm": 9.25, "learning_rate": 2.6970763400108286e-05, "loss": 0.1363, "loss_lm": 0.01743123959749937, "loss_seg": 0.11888314969837666, "mean_token_accuracy": 0.9950530081987381, "num_tokens": 621833124.0, "step": 1463 }, { "entropy": 0.019487507175654173, "epoch": 0.6407703249808513, "grad_norm": 17.5, "learning_rate": 2.6968056307525716e-05, "loss": 0.1925, "loss_lm": 0.016911598620936275, "loss_seg": 0.1755688451230526, "mean_token_accuracy": 0.9953046888113022, "num_tokens": 622258392.0, "step": 1464 }, { "entropy": 0.0193873206153512, "epoch": 0.6412080096290622, "grad_norm": 6.28125, "learning_rate": 2.696534921494315e-05, "loss": 0.1617, "loss_lm": 0.016931336605921388, "loss_seg": 0.1447528451681137, "mean_token_accuracy": 0.9951830357313156, "num_tokens": 622683521.0, "step": 1465 }, { "entropy": 0.01987810479477048, "epoch": 0.6416456942772732, "grad_norm": 9.0, "learning_rate": 2.6962642122360587e-05, "loss": 0.1693, "loss_lm": 0.01750286389142275, "loss_seg": 0.15176829136908054, "mean_token_accuracy": 0.9950507432222366, "num_tokens": 623108601.0, "step": 1466 }, { "entropy": 0.019457606598734856, "epoch": 0.6420833789254842, "grad_norm": 9.4375, "learning_rate": 2.695993502977802e-05, "loss": 0.1028, "loss_lm": 0.016545478021726012, "loss_seg": 0.08630340918898582, "mean_token_accuracy": 0.9951287060976028, "num_tokens": 623533286.0, "step": 1467 }, { "entropy": 0.019607321359217167, "epoch": 0.6425210635736952, "grad_norm": 7.15625, "learning_rate": 2.6957227937195455e-05, "loss": 0.1322, "loss_lm": 0.015876162564381957, "loss_seg": 0.11628183536231518, "mean_token_accuracy": 0.9951684921979904, "num_tokens": 623958171.0, "step": 1468 }, { "entropy": 0.020580414682626724, "epoch": 0.6429587482219061, "grad_norm": 20.5, "learning_rate": 2.6954520844612885e-05, "loss": 0.1641, "loss_lm": 0.0188181702978909, "loss_seg": 0.1452367603778839, "mean_token_accuracy": 0.9949259757995605, "num_tokens": 624383839.0, "step": 1469 }, { "entropy": 0.020031718537211418, "epoch": 0.643396432870117, "grad_norm": 6.125, "learning_rate": 2.695181375203032e-05, "loss": 0.1517, "loss_lm": 0.016605463810265064, "loss_seg": 0.1350562134757638, "mean_token_accuracy": 0.9950140863656998, "num_tokens": 624809220.0, "step": 1470 }, { "entropy": 0.019095032941550016, "epoch": 0.6438341175183281, "grad_norm": 12.9375, "learning_rate": 2.6949106659447756e-05, "loss": 0.1248, "loss_lm": 0.01316363993100822, "loss_seg": 0.11166058294475079, "mean_token_accuracy": 0.9952331930398941, "num_tokens": 625234364.0, "step": 1471 }, { "entropy": 0.01998993381857872, "epoch": 0.644271802166539, "grad_norm": 10.3125, "learning_rate": 2.694639956686519e-05, "loss": 0.1947, "loss_lm": 0.019544463604688644, "loss_seg": 0.17510830983519554, "mean_token_accuracy": 0.9950703382492065, "num_tokens": 625659719.0, "step": 1472 }, { "entropy": 0.020139904227107763, "epoch": 0.6447094868147499, "grad_norm": 11.25, "learning_rate": 2.694369247428262e-05, "loss": 0.1792, "loss_lm": 0.017134476685896516, "loss_seg": 0.16209064237773418, "mean_token_accuracy": 0.9950445741415024, "num_tokens": 626084869.0, "step": 1473 }, { "entropy": 0.020359322428703308, "epoch": 0.645147171462961, "grad_norm": 14.5, "learning_rate": 2.6940985381700054e-05, "loss": 0.1751, "loss_lm": 0.01889014896005392, "loss_seg": 0.15620804391801357, "mean_token_accuracy": 0.9950270503759384, "num_tokens": 626510148.0, "step": 1474 }, { "entropy": 0.0195460831746459, "epoch": 0.6455848561111719, "grad_norm": 11.6875, "learning_rate": 2.6938278289117487e-05, "loss": 0.1475, "loss_lm": 0.020565989427268505, "loss_seg": 0.12694912776350975, "mean_token_accuracy": 0.9951947629451752, "num_tokens": 626935712.0, "step": 1475 }, { "entropy": 0.019014225341379642, "epoch": 0.6460225407593828, "grad_norm": 7.65625, "learning_rate": 2.6935571196534924e-05, "loss": 0.2028, "loss_lm": 0.015574393328279257, "loss_seg": 0.1872164011001587, "mean_token_accuracy": 0.9952922910451889, "num_tokens": 627361132.0, "step": 1476 }, { "entropy": 0.019881269428879023, "epoch": 0.6464602254075938, "grad_norm": 9.125, "learning_rate": 2.6932864103952358e-05, "loss": 0.1377, "loss_lm": 0.016641360707581043, "loss_seg": 0.12104828841984272, "mean_token_accuracy": 0.9949613660573959, "num_tokens": 627786451.0, "step": 1477 }, { "entropy": 0.0197158120572567, "epoch": 0.6468979100558048, "grad_norm": 7.40625, "learning_rate": 2.693015701136979e-05, "loss": 0.1935, "loss_lm": 0.017941409721970558, "loss_seg": 0.1755208820104599, "mean_token_accuracy": 0.9951886534690857, "num_tokens": 628211439.0, "step": 1478 }, { "entropy": 0.01932408893480897, "epoch": 0.6473355947040158, "grad_norm": 6.28125, "learning_rate": 2.6927449918787222e-05, "loss": 0.127, "loss_lm": 0.017306373920291662, "loss_seg": 0.10970071144402027, "mean_token_accuracy": 0.9952446222305298, "num_tokens": 628635765.0, "step": 1479 }, { "entropy": 0.019512193277478218, "epoch": 0.6477732793522267, "grad_norm": 5.84375, "learning_rate": 2.6924742826204656e-05, "loss": 0.173, "loss_lm": 0.016174836549907923, "loss_seg": 0.15680128894746304, "mean_token_accuracy": 0.9952121078968048, "num_tokens": 629060344.0, "step": 1480 }, { "entropy": 0.018977554515004158, "epoch": 0.6482109640004377, "grad_norm": 12.875, "learning_rate": 2.692203573362209e-05, "loss": 0.144, "loss_lm": 0.01629190449602902, "loss_seg": 0.12771347910165787, "mean_token_accuracy": 0.9952857345342636, "num_tokens": 629485617.0, "step": 1481 }, { "entropy": 0.019868650939315557, "epoch": 0.6486486486486487, "grad_norm": 18.625, "learning_rate": 2.6919328641039527e-05, "loss": 0.2171, "loss_lm": 0.01813845126889646, "loss_seg": 0.1989191211760044, "mean_token_accuracy": 0.9951227307319641, "num_tokens": 629910613.0, "step": 1482 }, { "entropy": 0.01895760791376233, "epoch": 0.6490863332968596, "grad_norm": 7.65625, "learning_rate": 2.6916621548456957e-05, "loss": 0.1588, "loss_lm": 0.014509618282318115, "loss_seg": 0.14433430321514606, "mean_token_accuracy": 0.9953112453222275, "num_tokens": 630335438.0, "step": 1483 }, { "entropy": 0.018789059948176146, "epoch": 0.6495240179450705, "grad_norm": 11.875, "learning_rate": 2.691391445587439e-05, "loss": 0.1413, "loss_lm": 0.017363497521728277, "loss_seg": 0.12396619096398354, "mean_token_accuracy": 0.9954484552145004, "num_tokens": 630760945.0, "step": 1484 }, { "entropy": 0.020369593519717455, "epoch": 0.6499617025932816, "grad_norm": 129.0, "learning_rate": 2.6911207363291825e-05, "loss": 0.1376, "loss_lm": 0.018306138226762414, "loss_seg": 0.11927529610693455, "mean_token_accuracy": 0.9950359016656876, "num_tokens": 631186383.0, "step": 1485 }, { "entropy": 0.019711988046765327, "epoch": 0.6503993872414925, "grad_norm": 6.3125, "learning_rate": 2.6908500270709258e-05, "loss": 0.1598, "loss_lm": 0.01601842464879155, "loss_seg": 0.14377675764262676, "mean_token_accuracy": 0.9951306134462357, "num_tokens": 631611419.0, "step": 1486 }, { "entropy": 0.019729814492166042, "epoch": 0.6508370718897035, "grad_norm": 8.0625, "learning_rate": 2.6905793178126695e-05, "loss": 0.1675, "loss_lm": 0.01654923055320978, "loss_seg": 0.15098604932427406, "mean_token_accuracy": 0.9952079504728317, "num_tokens": 632036252.0, "step": 1487 }, { "entropy": 0.019649074412882328, "epoch": 0.6512747565379144, "grad_norm": 5.53125, "learning_rate": 2.6903086085544126e-05, "loss": 0.1261, "loss_lm": 0.015588415320962667, "loss_seg": 0.11050528287887573, "mean_token_accuracy": 0.9951306730508804, "num_tokens": 632461139.0, "step": 1488 }, { "entropy": 0.019786329939961433, "epoch": 0.6517124411861254, "grad_norm": 6.65625, "learning_rate": 2.690037899296156e-05, "loss": 0.1664, "loss_lm": 0.016760489204898477, "loss_seg": 0.14967464841902256, "mean_token_accuracy": 0.9951835721731186, "num_tokens": 632886538.0, "step": 1489 }, { "entropy": 0.019350743852555752, "epoch": 0.6521501258343364, "grad_norm": 9.0625, "learning_rate": 2.6897671900378993e-05, "loss": 0.1545, "loss_lm": 0.01728702522814274, "loss_seg": 0.13720476627349854, "mean_token_accuracy": 0.9953232705593109, "num_tokens": 633311964.0, "step": 1490 }, { "entropy": 0.019258240703493357, "epoch": 0.6525878104825473, "grad_norm": 3.46875, "learning_rate": 2.6894964807796427e-05, "loss": 0.1376, "loss_lm": 0.016674811020493507, "loss_seg": 0.12087642215192318, "mean_token_accuracy": 0.9952661097049713, "num_tokens": 633736702.0, "step": 1491 }, { "entropy": 0.01911810925230384, "epoch": 0.6530254951307582, "grad_norm": 5.65625, "learning_rate": 2.689225771521386e-05, "loss": 0.1406, "loss_lm": 0.01762036862783134, "loss_seg": 0.12302577029913664, "mean_token_accuracy": 0.9953419417142868, "num_tokens": 634161531.0, "step": 1492 }, { "entropy": 0.020167808514088392, "epoch": 0.6534631797789693, "grad_norm": 6.5625, "learning_rate": 2.6889550622631294e-05, "loss": 0.1587, "loss_lm": 0.017740645678713918, "loss_seg": 0.1409534141421318, "mean_token_accuracy": 0.9949798434972763, "num_tokens": 634586611.0, "step": 1493 }, { "entropy": 0.019788989331573248, "epoch": 0.6539008644271802, "grad_norm": 14.875, "learning_rate": 2.6886843530048728e-05, "loss": 0.1613, "loss_lm": 0.01730146026238799, "loss_seg": 0.14397810958325863, "mean_token_accuracy": 0.9951855391263962, "num_tokens": 635012559.0, "step": 1494 }, { "entropy": 0.019744027871638536, "epoch": 0.6543385490753911, "grad_norm": 9.0, "learning_rate": 2.6884136437466162e-05, "loss": 0.2073, "loss_lm": 0.01825863728299737, "loss_seg": 0.18904134258627892, "mean_token_accuracy": 0.9951722770929337, "num_tokens": 635437575.0, "step": 1495 }, { "entropy": 0.019334903452545404, "epoch": 0.6547762337236022, "grad_norm": 9.9375, "learning_rate": 2.6881429344883596e-05, "loss": 0.1461, "loss_lm": 0.015900236554443836, "loss_seg": 0.1302366405725479, "mean_token_accuracy": 0.9952404648065567, "num_tokens": 635862805.0, "step": 1496 }, { "entropy": 0.019285311922430992, "epoch": 0.6552139183718131, "grad_norm": 15.5625, "learning_rate": 2.687872225230103e-05, "loss": 0.1588, "loss_lm": 0.013386818347498775, "loss_seg": 0.14537152089178562, "mean_token_accuracy": 0.9952277839183807, "num_tokens": 636287933.0, "step": 1497 }, { "entropy": 0.019340382888913155, "epoch": 0.6556516030200241, "grad_norm": 6.8125, "learning_rate": 2.6876015159718463e-05, "loss": 0.1566, "loss_lm": 0.01662928145378828, "loss_seg": 0.13993715681135654, "mean_token_accuracy": 0.9952688813209534, "num_tokens": 636712730.0, "step": 1498 }, { "entropy": 0.01914056856185198, "epoch": 0.656089287668235, "grad_norm": 18.25, "learning_rate": 2.6873308067135897e-05, "loss": 0.1303, "loss_lm": 0.016540451208129525, "loss_seg": 0.11371872946619987, "mean_token_accuracy": 0.99526247382164, "num_tokens": 637138022.0, "step": 1499 }, { "entropy": 0.019364485051482916, "epoch": 0.656526972316446, "grad_norm": 10.875, "learning_rate": 2.687060097455333e-05, "loss": 0.1314, "loss_lm": 0.015428519807755947, "loss_seg": 0.11593812238425016, "mean_token_accuracy": 0.9953175187110901, "num_tokens": 637563389.0, "step": 1500 }, { "entropy": 0.0203383294865489, "epoch": 0.656964656964657, "grad_norm": 6.90625, "learning_rate": 2.6867893881970764e-05, "loss": 0.1495, "loss_lm": 0.018195169046521187, "loss_seg": 0.13132174871861935, "mean_token_accuracy": 0.9949998706579208, "num_tokens": 637988762.0, "step": 1501 }, { "entropy": 0.019571488723158836, "epoch": 0.6574023416128679, "grad_norm": 6.84375, "learning_rate": 2.6865186789388198e-05, "loss": 0.1496, "loss_lm": 0.01757770124822855, "loss_seg": 0.1320458985865116, "mean_token_accuracy": 0.9950569719076157, "num_tokens": 638413791.0, "step": 1502 }, { "entropy": 0.02030997257679701, "epoch": 0.6578400262610788, "grad_norm": 11.125, "learning_rate": 2.686247969680563e-05, "loss": 0.143, "loss_lm": 0.018599578645080328, "loss_seg": 0.12441590055823326, "mean_token_accuracy": 0.9949896782636642, "num_tokens": 638839900.0, "step": 1503 }, { "entropy": 0.019661689177155495, "epoch": 0.6582777109092899, "grad_norm": 9.1875, "learning_rate": 2.6859772604223065e-05, "loss": 0.1831, "loss_lm": 0.01978182466700673, "loss_seg": 0.16336339339613914, "mean_token_accuracy": 0.995099663734436, "num_tokens": 639265372.0, "step": 1504 }, { "entropy": 0.019672653172165155, "epoch": 0.6587153955575008, "grad_norm": 7.25, "learning_rate": 2.68570655116405e-05, "loss": 0.2206, "loss_lm": 0.018484815722331405, "loss_seg": 0.2021600492298603, "mean_token_accuracy": 0.995193213224411, "num_tokens": 639690148.0, "step": 1505 }, { "entropy": 0.01950293965637684, "epoch": 0.6591530802057118, "grad_norm": 15.625, "learning_rate": 2.6854358419057933e-05, "loss": 0.1678, "loss_lm": 0.015645494917407632, "loss_seg": 0.15219709277153015, "mean_token_accuracy": 0.9951768815517426, "num_tokens": 640115747.0, "step": 1506 }, { "entropy": 0.01924151461571455, "epoch": 0.6595907648539228, "grad_norm": 11.875, "learning_rate": 2.6851651326475366e-05, "loss": 0.1252, "loss_lm": 0.01861576270312071, "loss_seg": 0.10660436935722828, "mean_token_accuracy": 0.9950987845659256, "num_tokens": 640540107.0, "step": 1507 }, { "entropy": 0.01987115852534771, "epoch": 0.6600284495021337, "grad_norm": 8.9375, "learning_rate": 2.68489442338928e-05, "loss": 0.1881, "loss_lm": 0.017426910111680627, "loss_seg": 0.1706298440694809, "mean_token_accuracy": 0.9950544983148575, "num_tokens": 640964732.0, "step": 1508 }, { "entropy": 0.019387726206332445, "epoch": 0.6604661341503447, "grad_norm": 6.65625, "learning_rate": 2.6846237141310234e-05, "loss": 0.1845, "loss_lm": 0.014498905278742313, "loss_seg": 0.1700038742274046, "mean_token_accuracy": 0.9951553195714951, "num_tokens": 641389920.0, "step": 1509 }, { "entropy": 0.019930333830416203, "epoch": 0.6609038187985556, "grad_norm": 5.0, "learning_rate": 2.6843530048727668e-05, "loss": 0.167, "loss_lm": 0.017401717836037278, "loss_seg": 0.14957539923489094, "mean_token_accuracy": 0.9950274080038071, "num_tokens": 641814617.0, "step": 1510 }, { "entropy": 0.019635639153420925, "epoch": 0.6613415034467666, "grad_norm": 7.0, "learning_rate": 2.68408229561451e-05, "loss": 0.1248, "loss_lm": 0.016010414343327284, "loss_seg": 0.10877146106213331, "mean_token_accuracy": 0.9952490329742432, "num_tokens": 642239568.0, "step": 1511 }, { "entropy": 0.019810990896075964, "epoch": 0.6617791880949776, "grad_norm": 10.125, "learning_rate": 2.6838115863562535e-05, "loss": 0.1411, "loss_lm": 0.01620200788602233, "loss_seg": 0.12487052753567696, "mean_token_accuracy": 0.995086595416069, "num_tokens": 642664317.0, "step": 1512 }, { "entropy": 0.018922675400972366, "epoch": 0.6622168727431885, "grad_norm": 10.1875, "learning_rate": 2.683540877097997e-05, "loss": 0.1286, "loss_lm": 0.014723347499966621, "loss_seg": 0.11385869048535824, "mean_token_accuracy": 0.995404452085495, "num_tokens": 643089020.0, "step": 1513 }, { "entropy": 0.019586335867643356, "epoch": 0.6626545573913994, "grad_norm": 9.4375, "learning_rate": 2.6832701678397403e-05, "loss": 0.1608, "loss_lm": 0.01659778691828251, "loss_seg": 0.14420847594738007, "mean_token_accuracy": 0.9951407015323639, "num_tokens": 643513951.0, "step": 1514 }, { "entropy": 0.019455299247056246, "epoch": 0.6630922420396105, "grad_norm": 19.25, "learning_rate": 2.6829994585814836e-05, "loss": 0.1378, "loss_lm": 0.015408072154968977, "loss_seg": 0.12243807688355446, "mean_token_accuracy": 0.9952266812324524, "num_tokens": 643938566.0, "step": 1515 }, { "entropy": 0.019335421733558178, "epoch": 0.6635299266878214, "grad_norm": 6.96875, "learning_rate": 2.6827287493232267e-05, "loss": 0.1093, "loss_lm": 0.015277097001671791, "loss_seg": 0.0939963236451149, "mean_token_accuracy": 0.9953272491693497, "num_tokens": 644363752.0, "step": 1516 }, { "entropy": 0.019641613587737083, "epoch": 0.6639676113360324, "grad_norm": 10.0625, "learning_rate": 2.68245804006497e-05, "loss": 0.2234, "loss_lm": 0.01540619763545692, "loss_seg": 0.2079438678920269, "mean_token_accuracy": 0.9952006787061691, "num_tokens": 644788368.0, "step": 1517 }, { "entropy": 0.020046424120664597, "epoch": 0.6644052959842434, "grad_norm": 7.75, "learning_rate": 2.6821873308067137e-05, "loss": 0.1807, "loss_lm": 0.017474789172410965, "loss_seg": 0.16322410851716995, "mean_token_accuracy": 0.9951220899820328, "num_tokens": 645213127.0, "step": 1518 }, { "entropy": 0.019498329609632492, "epoch": 0.6648429806324543, "grad_norm": 8.5625, "learning_rate": 2.681916621548457e-05, "loss": 0.119, "loss_lm": 0.015562764136120677, "loss_seg": 0.10340985842049122, "mean_token_accuracy": 0.9952072948217392, "num_tokens": 645637689.0, "step": 1519 }, { "entropy": 0.01989118941128254, "epoch": 0.6652806652806653, "grad_norm": 8.5625, "learning_rate": 2.6816459122902005e-05, "loss": 0.1906, "loss_lm": 0.018708509858697653, "loss_seg": 0.17184480279684067, "mean_token_accuracy": 0.9951550662517548, "num_tokens": 646063283.0, "step": 1520 }, { "entropy": 0.019959974568337202, "epoch": 0.6657183499288762, "grad_norm": 16.5, "learning_rate": 2.6813752030319435e-05, "loss": 0.1361, "loss_lm": 0.018627429381012917, "loss_seg": 0.1175183653831482, "mean_token_accuracy": 0.9952360838651657, "num_tokens": 646488733.0, "step": 1521 }, { "entropy": 0.019532502628862858, "epoch": 0.6661560345770872, "grad_norm": 7.5625, "learning_rate": 2.681104493773687e-05, "loss": 0.1845, "loss_lm": 0.017521586501970887, "loss_seg": 0.16693613678216934, "mean_token_accuracy": 0.9950547516345978, "num_tokens": 646913969.0, "step": 1522 }, { "entropy": 0.019762089010328054, "epoch": 0.6665937192252982, "grad_norm": 8.625, "learning_rate": 2.6808337845154306e-05, "loss": 0.1374, "loss_lm": 0.01828824752010405, "loss_seg": 0.11914723552763462, "mean_token_accuracy": 0.9950423687696457, "num_tokens": 647338755.0, "step": 1523 }, { "entropy": 0.019370415713638067, "epoch": 0.6670314038735091, "grad_norm": 5.96875, "learning_rate": 2.680563075257174e-05, "loss": 0.1292, "loss_lm": 0.014909378718584776, "loss_seg": 0.11426249518990517, "mean_token_accuracy": 0.9951660186052322, "num_tokens": 647764010.0, "step": 1524 }, { "entropy": 0.0193789373151958, "epoch": 0.6674690885217202, "grad_norm": 11.6875, "learning_rate": 2.6802923659989174e-05, "loss": 0.224, "loss_lm": 0.019735120236873627, "loss_seg": 0.20423739403486252, "mean_token_accuracy": 0.9952468723058701, "num_tokens": 648189170.0, "step": 1525 }, { "entropy": 0.018857600167393684, "epoch": 0.6679067731699311, "grad_norm": 20.75, "learning_rate": 2.6800216567406604e-05, "loss": 0.1608, "loss_lm": 0.015063546365126967, "loss_seg": 0.14569758996367455, "mean_token_accuracy": 0.9953029006719589, "num_tokens": 648614517.0, "step": 1526 }, { "entropy": 0.01880672201514244, "epoch": 0.668344457818142, "grad_norm": 12.0, "learning_rate": 2.6797509474824038e-05, "loss": 0.1609, "loss_lm": 0.01674968423321843, "loss_seg": 0.14417557418346405, "mean_token_accuracy": 0.995331883430481, "num_tokens": 649039895.0, "step": 1527 }, { "entropy": 0.019937934819608927, "epoch": 0.668782142466353, "grad_norm": 10.25, "learning_rate": 2.6794802382241475e-05, "loss": 0.1362, "loss_lm": 0.013963686302304268, "loss_seg": 0.12224351614713669, "mean_token_accuracy": 0.9950901418924332, "num_tokens": 649464933.0, "step": 1528 }, { "entropy": 0.020628904458135366, "epoch": 0.669219827114564, "grad_norm": 11.625, "learning_rate": 2.679209528965891e-05, "loss": 0.1265, "loss_lm": 0.01630260841920972, "loss_seg": 0.11024528369307518, "mean_token_accuracy": 0.9948891997337341, "num_tokens": 649890307.0, "step": 1529 }, { "entropy": 0.01921228365972638, "epoch": 0.6696575117627749, "grad_norm": 9.1875, "learning_rate": 2.6789388197076342e-05, "loss": 0.1086, "loss_lm": 0.018532147631049156, "loss_seg": 0.09010729286819696, "mean_token_accuracy": 0.9951053261756897, "num_tokens": 650315207.0, "step": 1530 }, { "entropy": 0.019608781207352877, "epoch": 0.6700951964109859, "grad_norm": 8.6875, "learning_rate": 2.6786681104493773e-05, "loss": 0.1704, "loss_lm": 0.01873190701007843, "loss_seg": 0.151634119451046, "mean_token_accuracy": 0.9951208829879761, "num_tokens": 650740658.0, "step": 1531 }, { "entropy": 0.01953685376793146, "epoch": 0.6705328810591968, "grad_norm": 14.75, "learning_rate": 2.6783974011911206e-05, "loss": 0.1379, "loss_lm": 0.015632548136636615, "loss_seg": 0.1222317535430193, "mean_token_accuracy": 0.9952525496482849, "num_tokens": 651165466.0, "step": 1532 }, { "entropy": 0.018682068213820457, "epoch": 0.6709705657074078, "grad_norm": 7.96875, "learning_rate": 2.6781266919328643e-05, "loss": 0.1031, "loss_lm": 0.015430527040734887, "loss_seg": 0.08765188790857792, "mean_token_accuracy": 0.9953278750181198, "num_tokens": 651590724.0, "step": 1533 }, { "entropy": 0.018647125456482172, "epoch": 0.6714082503556188, "grad_norm": 7.65625, "learning_rate": 2.6778559826746077e-05, "loss": 0.1239, "loss_lm": 0.01590377325192094, "loss_seg": 0.10801218077540398, "mean_token_accuracy": 0.9953394681215286, "num_tokens": 652015928.0, "step": 1534 }, { "entropy": 0.019393187016248703, "epoch": 0.6718459350038297, "grad_norm": 8.25, "learning_rate": 2.677585273416351e-05, "loss": 0.1634, "loss_lm": 0.016593961277976632, "loss_seg": 0.1467830389738083, "mean_token_accuracy": 0.995213970541954, "num_tokens": 652440682.0, "step": 1535 }, { "entropy": 0.019758716691285372, "epoch": 0.6722836196520408, "grad_norm": 7.9375, "learning_rate": 2.677314564158094e-05, "loss": 0.1421, "loss_lm": 0.016499184537678957, "loss_seg": 0.12560712173581123, "mean_token_accuracy": 0.9951338022947311, "num_tokens": 652866258.0, "step": 1536 }, { "entropy": 0.01946771051734686, "epoch": 0.6727213043002517, "grad_norm": 15.5, "learning_rate": 2.6770438548998375e-05, "loss": 0.176, "loss_lm": 0.018061710288748145, "loss_seg": 0.15797672048211098, "mean_token_accuracy": 0.9952332377433777, "num_tokens": 653292100.0, "step": 1537 }, { "entropy": 0.019276177044957876, "epoch": 0.6731589889484626, "grad_norm": 8.1875, "learning_rate": 2.6767731456415812e-05, "loss": 0.1351, "loss_lm": 0.016987699549645185, "loss_seg": 0.11811752244830132, "mean_token_accuracy": 0.9951366931200027, "num_tokens": 653717296.0, "step": 1538 }, { "entropy": 0.01927961828187108, "epoch": 0.6735966735966736, "grad_norm": 8.25, "learning_rate": 2.6765024363833246e-05, "loss": 0.1333, "loss_lm": 0.019026590511202812, "loss_seg": 0.1142609566450119, "mean_token_accuracy": 0.9951333105564117, "num_tokens": 654142377.0, "step": 1539 }, { "entropy": 0.019680932629853487, "epoch": 0.6740343582448846, "grad_norm": 14.0625, "learning_rate": 2.6762317271250676e-05, "loss": 0.1235, "loss_lm": 0.017182016279548407, "loss_seg": 0.10632656887173653, "mean_token_accuracy": 0.9950189739465714, "num_tokens": 654567762.0, "step": 1540 }, { "entropy": 0.019380704499781132, "epoch": 0.6744720428930955, "grad_norm": 5.625, "learning_rate": 2.675961017866811e-05, "loss": 0.133, "loss_lm": 0.013262504246085882, "loss_seg": 0.11971781402826309, "mean_token_accuracy": 0.9952164441347122, "num_tokens": 654992300.0, "step": 1541 }, { "entropy": 0.01933008525520563, "epoch": 0.6749097275413065, "grad_norm": 17.75, "learning_rate": 2.6756903086085544e-05, "loss": 0.1652, "loss_lm": 0.01744946395047009, "loss_seg": 0.1477148998528719, "mean_token_accuracy": 0.9952698349952698, "num_tokens": 655417409.0, "step": 1542 }, { "entropy": 0.01927466271445155, "epoch": 0.6753474121895174, "grad_norm": 12.75, "learning_rate": 2.675419599350298e-05, "loss": 0.1317, "loss_lm": 0.015239906031638384, "loss_seg": 0.11643087584525347, "mean_token_accuracy": 0.9951941668987274, "num_tokens": 655843109.0, "step": 1543 }, { "entropy": 0.01930806739255786, "epoch": 0.6757850968377285, "grad_norm": 11.0625, "learning_rate": 2.6751488900920414e-05, "loss": 0.1093, "loss_lm": 0.015446627978235483, "loss_seg": 0.09385795332491398, "mean_token_accuracy": 0.9952681362628937, "num_tokens": 656267834.0, "step": 1544 }, { "entropy": 0.01979579823091626, "epoch": 0.6762227814859394, "grad_norm": 10.125, "learning_rate": 2.6748781808337845e-05, "loss": 0.1659, "loss_lm": 0.01780050667002797, "loss_seg": 0.14808468893170357, "mean_token_accuracy": 0.9951431900262833, "num_tokens": 656692950.0, "step": 1545 }, { "entropy": 0.019601060077548027, "epoch": 0.6766604661341503, "grad_norm": 18.75, "learning_rate": 2.674607471575528e-05, "loss": 0.1892, "loss_lm": 0.017993090208619833, "loss_seg": 0.1711634062230587, "mean_token_accuracy": 0.9952817112207413, "num_tokens": 657118073.0, "step": 1546 }, { "entropy": 0.01873339992016554, "epoch": 0.6770981507823614, "grad_norm": 7.5625, "learning_rate": 2.6743367623172712e-05, "loss": 0.1232, "loss_lm": 0.015690439613536, "loss_seg": 0.10748069174587727, "mean_token_accuracy": 0.9954468011856079, "num_tokens": 657542734.0, "step": 1547 }, { "entropy": 0.019380898214876652, "epoch": 0.6775358354305723, "grad_norm": 10.5625, "learning_rate": 2.6740660530590146e-05, "loss": 0.1133, "loss_lm": 0.015538682462647557, "loss_seg": 0.09771231561899185, "mean_token_accuracy": 0.9950632601976395, "num_tokens": 657967679.0, "step": 1548 }, { "entropy": 0.020277267321944237, "epoch": 0.6779735200787832, "grad_norm": 16.25, "learning_rate": 2.6737953438007583e-05, "loss": 0.1489, "loss_lm": 0.018856350798159838, "loss_seg": 0.13004943542182446, "mean_token_accuracy": 0.9949947744607925, "num_tokens": 658392821.0, "step": 1549 }, { "entropy": 0.019366572611033916, "epoch": 0.6784112047269942, "grad_norm": 7.46875, "learning_rate": 2.6735246345425013e-05, "loss": 0.1238, "loss_lm": 0.015604143962264061, "loss_seg": 0.1082086879760027, "mean_token_accuracy": 0.9952109605073929, "num_tokens": 658817711.0, "step": 1550 }, { "entropy": 0.019536906853318214, "epoch": 0.6788488893752052, "grad_norm": 10.5, "learning_rate": 2.6732539252842447e-05, "loss": 0.1694, "loss_lm": 0.016588230151683092, "loss_seg": 0.15284780599176884, "mean_token_accuracy": 0.9952368289232254, "num_tokens": 659242403.0, "step": 1551 }, { "entropy": 0.019461532589048147, "epoch": 0.6792865740234161, "grad_norm": 6.34375, "learning_rate": 2.672983216025988e-05, "loss": 0.1452, "loss_lm": 0.015486326534301043, "loss_seg": 0.12976164929568768, "mean_token_accuracy": 0.995184987783432, "num_tokens": 659667282.0, "step": 1552 }, { "entropy": 0.019071664195507765, "epoch": 0.6797242586716271, "grad_norm": 11.1875, "learning_rate": 2.6727125067677315e-05, "loss": 0.0995, "loss_lm": 0.01589769055135548, "loss_seg": 0.08358757384121418, "mean_token_accuracy": 0.9952578097581863, "num_tokens": 660092992.0, "step": 1553 }, { "entropy": 0.01899227360263467, "epoch": 0.680161943319838, "grad_norm": 11.6875, "learning_rate": 2.672441797509475e-05, "loss": 0.1197, "loss_lm": 0.01622637058608234, "loss_seg": 0.10351378470659256, "mean_token_accuracy": 0.995344489812851, "num_tokens": 660517394.0, "step": 1554 }, { "entropy": 0.01941243838518858, "epoch": 0.6805996279680491, "grad_norm": 18.375, "learning_rate": 2.6721710882512182e-05, "loss": 0.137, "loss_lm": 0.014055951265618205, "loss_seg": 0.122908566147089, "mean_token_accuracy": 0.9952382296323776, "num_tokens": 660942624.0, "step": 1555 }, { "entropy": 0.019424671307206154, "epoch": 0.68103731261626, "grad_norm": 10.8125, "learning_rate": 2.6719003789929616e-05, "loss": 0.2242, "loss_lm": 0.0173742501065135, "loss_seg": 0.20686060935258865, "mean_token_accuracy": 0.9952431321144104, "num_tokens": 661368076.0, "step": 1556 }, { "entropy": 0.020091299433261156, "epoch": 0.6814749972644709, "grad_norm": 29.0, "learning_rate": 2.671629669734705e-05, "loss": 0.2249, "loss_lm": 0.01897683832794428, "loss_seg": 0.2059664987027645, "mean_token_accuracy": 0.9949904531240463, "num_tokens": 661793852.0, "step": 1557 }, { "entropy": 0.019694292917847633, "epoch": 0.681912681912682, "grad_norm": 9.0625, "learning_rate": 2.6713589604764483e-05, "loss": 0.1158, "loss_lm": 0.014836837304756045, "loss_seg": 0.10094053857028484, "mean_token_accuracy": 0.9952224344015121, "num_tokens": 662218905.0, "step": 1558 }, { "entropy": 0.019313694909214973, "epoch": 0.6823503665608929, "grad_norm": 4.65625, "learning_rate": 2.671088251218192e-05, "loss": 0.1829, "loss_lm": 0.01682935212738812, "loss_seg": 0.16606585308909416, "mean_token_accuracy": 0.9951692223548889, "num_tokens": 662644050.0, "step": 1559 }, { "entropy": 0.01884030317887664, "epoch": 0.6827880512091038, "grad_norm": 14.0, "learning_rate": 2.670817541959935e-05, "loss": 0.1534, "loss_lm": 0.017097347415983677, "loss_seg": 0.1363410484045744, "mean_token_accuracy": 0.9953790009021759, "num_tokens": 663068780.0, "step": 1560 }, { "entropy": 0.019422159995883703, "epoch": 0.6832257358573148, "grad_norm": 28.375, "learning_rate": 2.6705468327016784e-05, "loss": 0.1605, "loss_lm": 0.015708298422396183, "loss_seg": 0.14480206184089184, "mean_token_accuracy": 0.9950259923934937, "num_tokens": 663493564.0, "step": 1561 }, { "entropy": 0.019842009991407394, "epoch": 0.6836634205055258, "grad_norm": 11.1875, "learning_rate": 2.6702761234434218e-05, "loss": 0.1958, "loss_lm": 0.018155654426664114, "loss_seg": 0.1776711381971836, "mean_token_accuracy": 0.994983896613121, "num_tokens": 663918588.0, "step": 1562 }, { "entropy": 0.019162652548402548, "epoch": 0.6841011051537368, "grad_norm": 14.125, "learning_rate": 2.6700054141851652e-05, "loss": 0.1774, "loss_lm": 0.018577087204903364, "loss_seg": 0.15882222168147564, "mean_token_accuracy": 0.9952983856201172, "num_tokens": 664343305.0, "step": 1563 }, { "entropy": 0.019217629451304674, "epoch": 0.6845387898019477, "grad_norm": 87.0, "learning_rate": 2.6697347049269085e-05, "loss": 0.141, "loss_lm": 0.018009488936513662, "loss_seg": 0.12294423952698708, "mean_token_accuracy": 0.9951650351285934, "num_tokens": 664768502.0, "step": 1564 }, { "entropy": 0.019179685972630978, "epoch": 0.6849764744501586, "grad_norm": 24.75, "learning_rate": 2.669463995668652e-05, "loss": 0.186, "loss_lm": 0.014387594535946846, "loss_seg": 0.1715930551290512, "mean_token_accuracy": 0.9953520148992538, "num_tokens": 665193587.0, "step": 1565 }, { "entropy": 0.01983268279582262, "epoch": 0.6854141590983697, "grad_norm": 17.5, "learning_rate": 2.6691932864103953e-05, "loss": 0.1779, "loss_lm": 0.017316584940999746, "loss_seg": 0.16063239239156246, "mean_token_accuracy": 0.9949595034122467, "num_tokens": 665618730.0, "step": 1566 }, { "entropy": 0.018421347718685865, "epoch": 0.6858518437465806, "grad_norm": 9.0625, "learning_rate": 2.6689225771521387e-05, "loss": 0.1654, "loss_lm": 0.020583736477419734, "loss_seg": 0.14478495344519615, "mean_token_accuracy": 0.9953618943691254, "num_tokens": 666043863.0, "step": 1567 }, { "entropy": 0.01895360928028822, "epoch": 0.6862895283947915, "grad_norm": 7.78125, "learning_rate": 2.668651867893882e-05, "loss": 0.151, "loss_lm": 0.01761220721527934, "loss_seg": 0.13335899263620377, "mean_token_accuracy": 0.9952598661184311, "num_tokens": 666468798.0, "step": 1568 }, { "entropy": 0.020333420485258102, "epoch": 0.6867272130430025, "grad_norm": 9.9375, "learning_rate": 2.6683811586356254e-05, "loss": 0.1743, "loss_lm": 0.019682870246469975, "loss_seg": 0.15461246110498905, "mean_token_accuracy": 0.9950050562620163, "num_tokens": 666894623.0, "step": 1569 }, { "entropy": 0.018781565595418215, "epoch": 0.6871648976912135, "grad_norm": 18.625, "learning_rate": 2.6681104493773688e-05, "loss": 0.2163, "loss_lm": 0.016643522307276726, "loss_seg": 0.19966374710202217, "mean_token_accuracy": 0.995190754532814, "num_tokens": 667319217.0, "step": 1570 }, { "entropy": 0.01982579054310918, "epoch": 0.6876025823394244, "grad_norm": 11.3125, "learning_rate": 2.667839740119112e-05, "loss": 0.1555, "loss_lm": 0.01670387852936983, "loss_seg": 0.13877755962312222, "mean_token_accuracy": 0.9950845390558243, "num_tokens": 667744401.0, "step": 1571 }, { "entropy": 0.019180740229785442, "epoch": 0.6880402669876354, "grad_norm": 12.6875, "learning_rate": 2.6675690308608555e-05, "loss": 0.2099, "loss_lm": 0.018704446963965893, "loss_seg": 0.19116755947470665, "mean_token_accuracy": 0.995339646935463, "num_tokens": 668169665.0, "step": 1572 }, { "entropy": 0.019548370502889156, "epoch": 0.6884779516358464, "grad_norm": 5.8125, "learning_rate": 2.667298321602599e-05, "loss": 0.1608, "loss_lm": 0.014599198941141367, "loss_seg": 0.14620672166347504, "mean_token_accuracy": 0.9950848668813705, "num_tokens": 668594251.0, "step": 1573 }, { "entropy": 0.019534999039024115, "epoch": 0.6889156362840574, "grad_norm": 16.625, "learning_rate": 2.6670276123443423e-05, "loss": 0.1776, "loss_lm": 0.017576006706804037, "loss_seg": 0.1600683219730854, "mean_token_accuracy": 0.9950263798236847, "num_tokens": 669019669.0, "step": 1574 }, { "entropy": 0.01906211907044053, "epoch": 0.6893533209322683, "grad_norm": 5.75, "learning_rate": 2.6667569030860856e-05, "loss": 0.1322, "loss_lm": 0.017788636032491922, "loss_seg": 0.11445141583681107, "mean_token_accuracy": 0.9951910376548767, "num_tokens": 669444676.0, "step": 1575 }, { "entropy": 0.01931497175246477, "epoch": 0.6897910055804792, "grad_norm": 32.0, "learning_rate": 2.666486193827829e-05, "loss": 0.1404, "loss_lm": 0.01757002528756857, "loss_seg": 0.12286803498864174, "mean_token_accuracy": 0.9950856864452362, "num_tokens": 669869552.0, "step": 1576 }, { "entropy": 0.01946650119498372, "epoch": 0.6902286902286903, "grad_norm": 16.625, "learning_rate": 2.6662154845695724e-05, "loss": 0.2011, "loss_lm": 0.01617799955420196, "loss_seg": 0.1849609687924385, "mean_token_accuracy": 0.9951253086328506, "num_tokens": 670294174.0, "step": 1577 }, { "entropy": 0.019947643391788006, "epoch": 0.6906663748769012, "grad_norm": 15.0625, "learning_rate": 2.6659447753113158e-05, "loss": 0.1862, "loss_lm": 0.017567497910931706, "loss_seg": 0.16863504238426685, "mean_token_accuracy": 0.9951398372650146, "num_tokens": 670719880.0, "step": 1578 }, { "entropy": 0.018654197454452515, "epoch": 0.6911040595251121, "grad_norm": 7.875, "learning_rate": 2.6656740660530588e-05, "loss": 0.1066, "loss_lm": 0.013368492247536778, "loss_seg": 0.09327985905110836, "mean_token_accuracy": 0.9954167604446411, "num_tokens": 671144636.0, "step": 1579 }, { "entropy": 0.018958736211061478, "epoch": 0.6915417441733231, "grad_norm": 8.375, "learning_rate": 2.6654033567948025e-05, "loss": 0.2248, "loss_lm": 0.018676841398701072, "loss_seg": 0.20616325736045837, "mean_token_accuracy": 0.9952453076839447, "num_tokens": 671569650.0, "step": 1580 }, { "entropy": 0.0201312811113894, "epoch": 0.6919794288215341, "grad_norm": 7.125, "learning_rate": 2.665132647536546e-05, "loss": 0.1001, "loss_lm": 0.01782849687151611, "loss_seg": 0.08231199532747269, "mean_token_accuracy": 0.9950353056192398, "num_tokens": 671994170.0, "step": 1581 }, { "entropy": 0.01920574437826872, "epoch": 0.692417113469745, "grad_norm": 5.9375, "learning_rate": 2.6648619382782893e-05, "loss": 0.159, "loss_lm": 0.01395086431875825, "loss_seg": 0.14504547603428364, "mean_token_accuracy": 0.9952246993780136, "num_tokens": 672419438.0, "step": 1582 }, { "entropy": 0.019518941640853882, "epoch": 0.692854798117956, "grad_norm": 7.15625, "learning_rate": 2.6645912290200326e-05, "loss": 0.1696, "loss_lm": 0.017179631628096104, "loss_seg": 0.15237300843000412, "mean_token_accuracy": 0.9951711148023605, "num_tokens": 672844562.0, "step": 1583 }, { "entropy": 0.01914135506376624, "epoch": 0.693292482766167, "grad_norm": 10.0, "learning_rate": 2.6643205197617757e-05, "loss": 0.0903, "loss_lm": 0.013024172512814403, "loss_seg": 0.07725144550204277, "mean_token_accuracy": 0.9951783567667007, "num_tokens": 673269696.0, "step": 1584 }, { "entropy": 0.019590582698583603, "epoch": 0.693730167414378, "grad_norm": 7.875, "learning_rate": 2.6640498105035194e-05, "loss": 0.132, "loss_lm": 0.0165264792740345, "loss_seg": 0.1154631245881319, "mean_token_accuracy": 0.9951931089162827, "num_tokens": 673694887.0, "step": 1585 }, { "entropy": 0.018689528107643127, "epoch": 0.6941678520625889, "grad_norm": 7.3125, "learning_rate": 2.6637791012452627e-05, "loss": 0.1261, "loss_lm": 0.01710850792005658, "loss_seg": 0.10899180173873901, "mean_token_accuracy": 0.9953738301992416, "num_tokens": 674119360.0, "step": 1586 }, { "entropy": 0.018947468139231205, "epoch": 0.6946055367107998, "grad_norm": 14.8125, "learning_rate": 2.663508391987006e-05, "loss": 0.1561, "loss_lm": 0.01734138187021017, "loss_seg": 0.1387492809444666, "mean_token_accuracy": 0.9952525496482849, "num_tokens": 674545024.0, "step": 1587 }, { "entropy": 0.019119013100862503, "epoch": 0.6950432213590109, "grad_norm": 9.5625, "learning_rate": 2.663237682728749e-05, "loss": 0.1863, "loss_lm": 0.015460624592378736, "loss_seg": 0.17084337212145329, "mean_token_accuracy": 0.9952130317687988, "num_tokens": 674969709.0, "step": 1588 }, { "entropy": 0.019359676633030176, "epoch": 0.6954809060072218, "grad_norm": 6.65625, "learning_rate": 2.6629669734704925e-05, "loss": 0.1394, "loss_lm": 0.016468648333102465, "loss_seg": 0.12289188243448734, "mean_token_accuracy": 0.9952310919761658, "num_tokens": 675395370.0, "step": 1589 }, { "entropy": 0.019379806704819202, "epoch": 0.6959185906554327, "grad_norm": 8.3125, "learning_rate": 2.6626962642122362e-05, "loss": 0.1453, "loss_lm": 0.01458480954170227, "loss_seg": 0.13075832091271877, "mean_token_accuracy": 0.9951540678739548, "num_tokens": 675821032.0, "step": 1590 }, { "entropy": 0.018942652735859156, "epoch": 0.6963562753036437, "grad_norm": 18.25, "learning_rate": 2.6624255549539796e-05, "loss": 0.1483, "loss_lm": 0.017167577985674143, "loss_seg": 0.13108346797525883, "mean_token_accuracy": 0.9952794909477234, "num_tokens": 676246125.0, "step": 1591 }, { "entropy": 0.019975594710558653, "epoch": 0.6967939599518547, "grad_norm": 6.21875, "learning_rate": 2.662154845695723e-05, "loss": 0.1583, "loss_lm": 0.01786975283175707, "loss_seg": 0.14038366451859474, "mean_token_accuracy": 0.994926393032074, "num_tokens": 676671198.0, "step": 1592 }, { "entropy": 0.01947503024712205, "epoch": 0.6972316446000657, "grad_norm": 14.4375, "learning_rate": 2.661884136437466e-05, "loss": 0.1171, "loss_lm": 0.015653816051781178, "loss_seg": 0.10147225484251976, "mean_token_accuracy": 0.9951733499765396, "num_tokens": 677096516.0, "step": 1593 }, { "entropy": 0.02001682063564658, "epoch": 0.6976693292482766, "grad_norm": 6.625, "learning_rate": 2.6616134271792094e-05, "loss": 0.1575, "loss_lm": 0.018329843413084745, "loss_seg": 0.13917095586657524, "mean_token_accuracy": 0.9951242357492447, "num_tokens": 677520633.0, "step": 1594 }, { "entropy": 0.0187978302128613, "epoch": 0.6981070138964875, "grad_norm": 9.0, "learning_rate": 2.661342717920953e-05, "loss": 0.1361, "loss_lm": 0.015552474651485682, "loss_seg": 0.12056246772408485, "mean_token_accuracy": 0.9953734874725342, "num_tokens": 677946186.0, "step": 1595 }, { "entropy": 0.019923067651689053, "epoch": 0.6985446985446986, "grad_norm": 9.625, "learning_rate": 2.6610720086626965e-05, "loss": 0.1806, "loss_lm": 0.016087135765701532, "loss_seg": 0.16446426697075367, "mean_token_accuracy": 0.995074987411499, "num_tokens": 678371139.0, "step": 1596 }, { "entropy": 0.01923903403803706, "epoch": 0.6989823831929095, "grad_norm": 14.0, "learning_rate": 2.66080129940444e-05, "loss": 0.0898, "loss_lm": 0.016945129027590156, "loss_seg": 0.07286311686038971, "mean_token_accuracy": 0.9952101558446884, "num_tokens": 678795771.0, "step": 1597 }, { "entropy": 0.02007808117195964, "epoch": 0.6994200678411204, "grad_norm": 20.0, "learning_rate": 2.660530590146183e-05, "loss": 0.1343, "loss_lm": 0.01775951124727726, "loss_seg": 0.11649768054485321, "mean_token_accuracy": 0.994987741112709, "num_tokens": 679220912.0, "step": 1598 }, { "entropy": 0.019336056895554066, "epoch": 0.6998577524893315, "grad_norm": 5.5625, "learning_rate": 2.6602598808879263e-05, "loss": 0.1568, "loss_lm": 0.015888605499640107, "loss_seg": 0.14092404767870903, "mean_token_accuracy": 0.9951246827840805, "num_tokens": 679645681.0, "step": 1599 }, { "entropy": 0.019146764650940895, "epoch": 0.7002954371375424, "grad_norm": 7.21875, "learning_rate": 2.65998917162967e-05, "loss": 0.1654, "loss_lm": 0.018262221943587065, "loss_seg": 0.14716297201812267, "mean_token_accuracy": 0.9951466023921967, "num_tokens": 680070892.0, "step": 1600 }, { "entropy": 0.01934619201347232, "epoch": 0.7007331217857533, "grad_norm": 8.5, "learning_rate": 2.6597184623714133e-05, "loss": 0.127, "loss_lm": 0.016417221166193485, "loss_seg": 0.11060795374214649, "mean_token_accuracy": 0.9952251017093658, "num_tokens": 680496837.0, "step": 1601 }, { "entropy": 0.019329605158418417, "epoch": 0.7011708064339643, "grad_norm": 26.625, "learning_rate": 2.6594477531131567e-05, "loss": 0.1482, "loss_lm": 0.01613912614993751, "loss_seg": 0.13201827183365822, "mean_token_accuracy": 0.9951691925525665, "num_tokens": 680921582.0, "step": 1602 }, { "entropy": 0.019473688676953316, "epoch": 0.7016084910821753, "grad_norm": 15.0, "learning_rate": 2.6591770438548997e-05, "loss": 0.1895, "loss_lm": 0.016174154821783304, "loss_seg": 0.17333816178143024, "mean_token_accuracy": 0.9952305108308792, "num_tokens": 681346324.0, "step": 1603 }, { "entropy": 0.019263474736362696, "epoch": 0.7020461757303863, "grad_norm": 4.8125, "learning_rate": 2.658906334596643e-05, "loss": 0.19, "loss_lm": 0.01672515505924821, "loss_seg": 0.17323586717247963, "mean_token_accuracy": 0.9951879680156708, "num_tokens": 681771446.0, "step": 1604 }, { "entropy": 0.019834008999168873, "epoch": 0.7024838603785972, "grad_norm": 12.9375, "learning_rate": 2.6586356253383868e-05, "loss": 0.1096, "loss_lm": 0.018475875724107027, "loss_seg": 0.0911050932481885, "mean_token_accuracy": 0.9949844926595688, "num_tokens": 682197079.0, "step": 1605 }, { "entropy": 0.018865299876779318, "epoch": 0.7029215450268081, "grad_norm": 5.5625, "learning_rate": 2.6583649160801302e-05, "loss": 0.1172, "loss_lm": 0.01628317730501294, "loss_seg": 0.10094572603702545, "mean_token_accuracy": 0.9952982664108276, "num_tokens": 682621763.0, "step": 1606 }, { "entropy": 0.019336250610649586, "epoch": 0.7033592296750192, "grad_norm": 9.5, "learning_rate": 2.6580942068218736e-05, "loss": 0.1727, "loss_lm": 0.01608555973507464, "loss_seg": 0.1566072814166546, "mean_token_accuracy": 0.9953204393386841, "num_tokens": 683046553.0, "step": 1607 }, { "entropy": 0.01919616013765335, "epoch": 0.7037969143232301, "grad_norm": 6.375, "learning_rate": 2.6578234975636166e-05, "loss": 0.1511, "loss_lm": 0.0171138234436512, "loss_seg": 0.13398671336472034, "mean_token_accuracy": 0.9952077269554138, "num_tokens": 683471323.0, "step": 1608 }, { "entropy": 0.018974715378135443, "epoch": 0.704234598971441, "grad_norm": 8.75, "learning_rate": 2.65755278830536e-05, "loss": 0.1645, "loss_lm": 0.017290364485234022, "loss_seg": 0.14718848280608654, "mean_token_accuracy": 0.9953829944133759, "num_tokens": 683896218.0, "step": 1609 }, { "entropy": 0.019826645031571388, "epoch": 0.7046722836196521, "grad_norm": 16.125, "learning_rate": 2.6572820790471033e-05, "loss": 0.1594, "loss_lm": 0.014455924276262522, "loss_seg": 0.1449195947498083, "mean_token_accuracy": 0.9950947165489197, "num_tokens": 684321220.0, "step": 1610 }, { "entropy": 0.01982395490631461, "epoch": 0.705109968267863, "grad_norm": 23.5, "learning_rate": 2.657011369788847e-05, "loss": 0.1432, "loss_lm": 0.017633182927966118, "loss_seg": 0.12556704133749008, "mean_token_accuracy": 0.9950356632471085, "num_tokens": 684746628.0, "step": 1611 }, { "entropy": 0.019088756293058395, "epoch": 0.705547652916074, "grad_norm": 8.625, "learning_rate": 2.65674066053059e-05, "loss": 0.1433, "loss_lm": 0.01607236941345036, "loss_seg": 0.12722787261009216, "mean_token_accuracy": 0.995160236954689, "num_tokens": 685170996.0, "step": 1612 }, { "entropy": 0.019573167897760868, "epoch": 0.7059853375642849, "grad_norm": 9.9375, "learning_rate": 2.6564699512723335e-05, "loss": 0.1618, "loss_lm": 0.01713848952203989, "loss_seg": 0.14465319737792015, "mean_token_accuracy": 0.9952119588851929, "num_tokens": 685596183.0, "step": 1613 }, { "entropy": 0.01925960974767804, "epoch": 0.7064230222124959, "grad_norm": 13.5, "learning_rate": 2.656199242014077e-05, "loss": 0.1661, "loss_lm": 0.01655249623581767, "loss_seg": 0.14950957521796227, "mean_token_accuracy": 0.995231494307518, "num_tokens": 686021863.0, "step": 1614 }, { "entropy": 0.019637124612927437, "epoch": 0.7068607068607069, "grad_norm": 20.875, "learning_rate": 2.6559285327558202e-05, "loss": 0.1271, "loss_lm": 0.016440109349787235, "loss_seg": 0.11069102585315704, "mean_token_accuracy": 0.9950671494007111, "num_tokens": 686447356.0, "step": 1615 }, { "entropy": 0.019678054377436638, "epoch": 0.7072983915089178, "grad_norm": 13.875, "learning_rate": 2.655657823497564e-05, "loss": 0.1573, "loss_lm": 0.017313205171376467, "loss_seg": 0.13998731039464474, "mean_token_accuracy": 0.9950840026140213, "num_tokens": 686871653.0, "step": 1616 }, { "entropy": 0.019402398727834225, "epoch": 0.7077360761571287, "grad_norm": 5.46875, "learning_rate": 2.655387114239307e-05, "loss": 0.1385, "loss_lm": 0.017911388305947185, "loss_seg": 0.12056771665811539, "mean_token_accuracy": 0.9951358139514923, "num_tokens": 687296371.0, "step": 1617 }, { "entropy": 0.01933180494233966, "epoch": 0.7081737608053398, "grad_norm": 11.375, "learning_rate": 2.6551164049810503e-05, "loss": 0.162, "loss_lm": 0.018114174250513315, "loss_seg": 0.14386252872645855, "mean_token_accuracy": 0.9951092153787613, "num_tokens": 687721660.0, "step": 1618 }, { "entropy": 0.01892206072807312, "epoch": 0.7086114454535507, "grad_norm": 6.90625, "learning_rate": 2.6548456957227937e-05, "loss": 0.1251, "loss_lm": 0.013971641892567277, "loss_seg": 0.11116567440330982, "mean_token_accuracy": 0.9953561127185822, "num_tokens": 688147328.0, "step": 1619 }, { "entropy": 0.01939326897263527, "epoch": 0.7090491301017616, "grad_norm": 8.625, "learning_rate": 2.654574986464537e-05, "loss": 0.1347, "loss_lm": 0.015111688058823347, "loss_seg": 0.1196113359183073, "mean_token_accuracy": 0.9951587468385696, "num_tokens": 688572002.0, "step": 1620 }, { "entropy": 0.01960279606282711, "epoch": 0.7094868147499727, "grad_norm": 10.4375, "learning_rate": 2.6543042772062808e-05, "loss": 0.177, "loss_lm": 0.01681931596249342, "loss_seg": 0.16014138981699944, "mean_token_accuracy": 0.9951765388250351, "num_tokens": 688997196.0, "step": 1621 }, { "entropy": 0.01982219610363245, "epoch": 0.7099244993981836, "grad_norm": 16.25, "learning_rate": 2.6540335679480238e-05, "loss": 0.2124, "loss_lm": 0.016440396895632148, "loss_seg": 0.19597972929477692, "mean_token_accuracy": 0.99507175385952, "num_tokens": 689422682.0, "step": 1622 }, { "entropy": 0.019677066709846258, "epoch": 0.7103621840463946, "grad_norm": 13.125, "learning_rate": 2.6537628586897672e-05, "loss": 0.1659, "loss_lm": 0.016242661979049444, "loss_seg": 0.1496221125125885, "mean_token_accuracy": 0.9952884763479233, "num_tokens": 689847347.0, "step": 1623 }, { "entropy": 0.019377021118998528, "epoch": 0.7107998686946055, "grad_norm": 8.25, "learning_rate": 2.6534921494315106e-05, "loss": 0.1243, "loss_lm": 0.014022063231095672, "loss_seg": 0.11029287986457348, "mean_token_accuracy": 0.9951632022857666, "num_tokens": 690272610.0, "step": 1624 }, { "entropy": 0.019138521514832973, "epoch": 0.7112375533428165, "grad_norm": 36.25, "learning_rate": 2.653221440173254e-05, "loss": 0.1409, "loss_lm": 0.01555840321816504, "loss_seg": 0.12529290467500687, "mean_token_accuracy": 0.9952383935451508, "num_tokens": 690698063.0, "step": 1625 }, { "entropy": 0.019240139983594418, "epoch": 0.7116752379910275, "grad_norm": 7.84375, "learning_rate": 2.6529507309149977e-05, "loss": 0.152, "loss_lm": 0.017162347678095102, "loss_seg": 0.1348179690539837, "mean_token_accuracy": 0.9951914250850677, "num_tokens": 691122906.0, "step": 1626 }, { "entropy": 0.01949116215109825, "epoch": 0.7121129226392384, "grad_norm": 12.5625, "learning_rate": 2.6526800216567407e-05, "loss": 0.1156, "loss_lm": 0.013285157037898898, "loss_seg": 0.1023072600364685, "mean_token_accuracy": 0.9952255636453629, "num_tokens": 691547414.0, "step": 1627 }, { "entropy": 0.019332971423864365, "epoch": 0.7125506072874493, "grad_norm": 10.375, "learning_rate": 2.652409312398484e-05, "loss": 0.1358, "loss_lm": 0.015890046721324325, "loss_seg": 0.11994272191077471, "mean_token_accuracy": 0.9951888024806976, "num_tokens": 691973058.0, "step": 1628 }, { "entropy": 0.019249859265983105, "epoch": 0.7129882919356604, "grad_norm": 13.75, "learning_rate": 2.6521386031402274e-05, "loss": 0.1371, "loss_lm": 0.017716194037348032, "loss_seg": 0.11936170607805252, "mean_token_accuracy": 0.9952753335237503, "num_tokens": 692398025.0, "step": 1629 }, { "entropy": 0.019106212072074413, "epoch": 0.7134259765838713, "grad_norm": 8.125, "learning_rate": 2.6518678938819708e-05, "loss": 0.1815, "loss_lm": 0.01866798847913742, "loss_seg": 0.16285977326333523, "mean_token_accuracy": 0.9952380806207657, "num_tokens": 692823015.0, "step": 1630 }, { "entropy": 0.01977305067703128, "epoch": 0.7138636612320823, "grad_norm": 9.0625, "learning_rate": 2.6515971846237142e-05, "loss": 0.1503, "loss_lm": 0.016354915453121066, "loss_seg": 0.13396614789962769, "mean_token_accuracy": 0.9949897974729538, "num_tokens": 693247712.0, "step": 1631 }, { "entropy": 0.019830975215882063, "epoch": 0.7143013458802933, "grad_norm": 7.78125, "learning_rate": 2.6513264753654575e-05, "loss": 0.1714, "loss_lm": 0.015848418697714806, "loss_seg": 0.15554967522621155, "mean_token_accuracy": 0.9949984848499298, "num_tokens": 693672548.0, "step": 1632 }, { "entropy": 0.019521831534802914, "epoch": 0.7147390305285042, "grad_norm": 25.875, "learning_rate": 2.651055766107201e-05, "loss": 0.1409, "loss_lm": 0.016803397331386805, "loss_seg": 0.12409213930368423, "mean_token_accuracy": 0.9951767027378082, "num_tokens": 694097691.0, "step": 1633 }, { "entropy": 0.019063191022723913, "epoch": 0.7151767151767152, "grad_norm": 9.125, "learning_rate": 2.6507850568489443e-05, "loss": 0.1244, "loss_lm": 0.014575277455151081, "loss_seg": 0.1097957119345665, "mean_token_accuracy": 0.9952680766582489, "num_tokens": 694522854.0, "step": 1634 }, { "entropy": 0.019408396910876036, "epoch": 0.7156143998249261, "grad_norm": 10.625, "learning_rate": 2.6505143475906877e-05, "loss": 0.1203, "loss_lm": 0.01582708302885294, "loss_seg": 0.104474738240242, "mean_token_accuracy": 0.9951726645231247, "num_tokens": 694947892.0, "step": 1635 }, { "entropy": 0.019203288946300745, "epoch": 0.7160520844731371, "grad_norm": 5.15625, "learning_rate": 2.650243638332431e-05, "loss": 0.1388, "loss_lm": 0.015970787964761257, "loss_seg": 0.12279631197452545, "mean_token_accuracy": 0.9951555579900742, "num_tokens": 695372662.0, "step": 1636 }, { "entropy": 0.019775772467255592, "epoch": 0.7164897691213481, "grad_norm": 12.75, "learning_rate": 2.6499729290741744e-05, "loss": 0.1238, "loss_lm": 0.016501864418387413, "loss_seg": 0.10733683686703444, "mean_token_accuracy": 0.9950182139873505, "num_tokens": 695798177.0, "step": 1637 }, { "entropy": 0.018867042381316423, "epoch": 0.716927453769559, "grad_norm": 22.75, "learning_rate": 2.6497022198159178e-05, "loss": 0.1278, "loss_lm": 0.015318789752200246, "loss_seg": 0.11249781865626574, "mean_token_accuracy": 0.9952893704175949, "num_tokens": 696223127.0, "step": 1638 }, { "entropy": 0.01918526692315936, "epoch": 0.7173651384177699, "grad_norm": 7.9375, "learning_rate": 2.649431510557661e-05, "loss": 0.1673, "loss_lm": 0.017886283807456493, "loss_seg": 0.14938082918524742, "mean_token_accuracy": 0.9952462762594223, "num_tokens": 696648408.0, "step": 1639 }, { "entropy": 0.01918934704735875, "epoch": 0.717802823065981, "grad_norm": 24.5, "learning_rate": 2.6491608012994045e-05, "loss": 0.1368, "loss_lm": 0.015562335029244423, "loss_seg": 0.12125108763575554, "mean_token_accuracy": 0.9952924698591232, "num_tokens": 697073636.0, "step": 1640 }, { "entropy": 0.019278372637927532, "epoch": 0.7182405077141919, "grad_norm": 25.125, "learning_rate": 2.648890092041148e-05, "loss": 0.1717, "loss_lm": 0.01678784703835845, "loss_seg": 0.15488136373460293, "mean_token_accuracy": 0.9951208084821701, "num_tokens": 697498269.0, "step": 1641 }, { "entropy": 0.019657297059893608, "epoch": 0.7186781923624029, "grad_norm": 6.8125, "learning_rate": 2.6486193827828913e-05, "loss": 0.1813, "loss_lm": 0.01815651450306177, "loss_seg": 0.16311100125312805, "mean_token_accuracy": 0.9951545298099518, "num_tokens": 697923167.0, "step": 1642 }, { "entropy": 0.019152574241161346, "epoch": 0.7191158770106139, "grad_norm": 14.375, "learning_rate": 2.6483486735246346e-05, "loss": 0.1156, "loss_lm": 0.020961353555321693, "loss_seg": 0.09463542327284813, "mean_token_accuracy": 0.9952611923217773, "num_tokens": 698348538.0, "step": 1643 }, { "entropy": 0.019714235793799162, "epoch": 0.7195535616588248, "grad_norm": 6.0625, "learning_rate": 2.648077964266378e-05, "loss": 0.1399, "loss_lm": 0.01626074011437595, "loss_seg": 0.12364237010478973, "mean_token_accuracy": 0.9950137287378311, "num_tokens": 698773888.0, "step": 1644 }, { "entropy": 0.01958022639155388, "epoch": 0.7199912463070358, "grad_norm": 17.25, "learning_rate": 2.6478072550081214e-05, "loss": 0.151, "loss_lm": 0.01670979429036379, "loss_seg": 0.13425582833588123, "mean_token_accuracy": 0.9951082319021225, "num_tokens": 699199893.0, "step": 1645 }, { "entropy": 0.019697504118084908, "epoch": 0.7204289309552467, "grad_norm": 17.25, "learning_rate": 2.6475365457498644e-05, "loss": 0.204, "loss_lm": 0.014723523752763867, "loss_seg": 0.18926409259438515, "mean_token_accuracy": 0.9951064586639404, "num_tokens": 699624703.0, "step": 1646 }, { "entropy": 0.019224740099161863, "epoch": 0.7208666156034577, "grad_norm": 9.8125, "learning_rate": 2.647265836491608e-05, "loss": 0.143, "loss_lm": 0.019465557299554348, "loss_seg": 0.12351469323039055, "mean_token_accuracy": 0.9953376948833466, "num_tokens": 700049353.0, "step": 1647 }, { "entropy": 0.019526919350028038, "epoch": 0.7213043002516687, "grad_norm": 4.34375, "learning_rate": 2.6469951272333515e-05, "loss": 0.1614, "loss_lm": 0.01923380745574832, "loss_seg": 0.1421180833131075, "mean_token_accuracy": 0.995038777589798, "num_tokens": 700474443.0, "step": 1648 }, { "entropy": 0.018850265070796013, "epoch": 0.7217419848998796, "grad_norm": 9.8125, "learning_rate": 2.646724417975095e-05, "loss": 0.1318, "loss_lm": 0.016265355283394456, "loss_seg": 0.11556046642363071, "mean_token_accuracy": 0.9954210817813873, "num_tokens": 700898947.0, "step": 1649 }, { "entropy": 0.01974409492686391, "epoch": 0.7221796695480907, "grad_norm": 13.3125, "learning_rate": 2.6464537087168383e-05, "loss": 0.1713, "loss_lm": 0.018038097536191344, "loss_seg": 0.153280321508646, "mean_token_accuracy": 0.99510657787323, "num_tokens": 701323754.0, "step": 1650 }, { "entropy": 0.01919826865196228, "epoch": 0.7226173541963016, "grad_norm": 12.0625, "learning_rate": 2.6461829994585813e-05, "loss": 0.1701, "loss_lm": 0.017633717507123947, "loss_seg": 0.15245785750448704, "mean_token_accuracy": 0.9951060116291046, "num_tokens": 701748942.0, "step": 1651 }, { "entropy": 0.019274387042969465, "epoch": 0.7230550388445125, "grad_norm": 12.625, "learning_rate": 2.645912290200325e-05, "loss": 0.1568, "loss_lm": 0.016849861945956945, "loss_seg": 0.13995554111897945, "mean_token_accuracy": 0.9952675253152847, "num_tokens": 702174259.0, "step": 1652 }, { "entropy": 0.019534315448254347, "epoch": 0.7234927234927235, "grad_norm": 11.375, "learning_rate": 2.6456415809420684e-05, "loss": 0.1383, "loss_lm": 0.015465744771063328, "loss_seg": 0.12279260717332363, "mean_token_accuracy": 0.9950401037931442, "num_tokens": 702598832.0, "step": 1653 }, { "entropy": 0.020004802383482456, "epoch": 0.7239304081409345, "grad_norm": 12.875, "learning_rate": 2.6453708716838117e-05, "loss": 0.176, "loss_lm": 0.017798444954678416, "loss_seg": 0.15823069028556347, "mean_token_accuracy": 0.9949527680873871, "num_tokens": 703024396.0, "step": 1654 }, { "entropy": 0.019322856795042753, "epoch": 0.7243680927891454, "grad_norm": 8.375, "learning_rate": 2.6451001624255548e-05, "loss": 0.1476, "loss_lm": 0.01719238399527967, "loss_seg": 0.13043152913451195, "mean_token_accuracy": 0.9951926916837692, "num_tokens": 703449758.0, "step": 1655 }, { "entropy": 0.019586248323321342, "epoch": 0.7248057774373564, "grad_norm": 8.3125, "learning_rate": 2.644829453167298e-05, "loss": 0.1328, "loss_lm": 0.016212151618674397, "loss_seg": 0.11659739725291729, "mean_token_accuracy": 0.9951627254486084, "num_tokens": 703874956.0, "step": 1656 }, { "entropy": 0.019434149842709303, "epoch": 0.7252434620855673, "grad_norm": 6.96875, "learning_rate": 2.644558743909042e-05, "loss": 0.202, "loss_lm": 0.01782481838017702, "loss_seg": 0.18421917781233788, "mean_token_accuracy": 0.995130866765976, "num_tokens": 704300041.0, "step": 1657 }, { "entropy": 0.019025245681405067, "epoch": 0.7256811467337783, "grad_norm": 7.375, "learning_rate": 2.6442880346507852e-05, "loss": 0.1451, "loss_lm": 0.015359709970653057, "loss_seg": 0.12971710786223412, "mean_token_accuracy": 0.9952935874462128, "num_tokens": 704724505.0, "step": 1658 }, { "entropy": 0.019121247343719006, "epoch": 0.7261188313819893, "grad_norm": 9.875, "learning_rate": 2.6440173253925286e-05, "loss": 0.1882, "loss_lm": 0.01617843098938465, "loss_seg": 0.17197363637387753, "mean_token_accuracy": 0.9952404648065567, "num_tokens": 705149073.0, "step": 1659 }, { "entropy": 0.01903328811749816, "epoch": 0.7265565160302002, "grad_norm": 11.0625, "learning_rate": 2.6437466161342716e-05, "loss": 0.1633, "loss_lm": 0.016609795158728957, "loss_seg": 0.14673952385783195, "mean_token_accuracy": 0.9953115433454514, "num_tokens": 705573890.0, "step": 1660 }, { "entropy": 0.019413353875279427, "epoch": 0.7269942006784113, "grad_norm": 12.3125, "learning_rate": 2.643475906876015e-05, "loss": 0.1082, "loss_lm": 0.015567237045615911, "loss_seg": 0.09261426329612732, "mean_token_accuracy": 0.9951291382312775, "num_tokens": 705998802.0, "step": 1661 }, { "entropy": 0.019299477338790894, "epoch": 0.7274318853266222, "grad_norm": 20.125, "learning_rate": 2.6432051976177587e-05, "loss": 0.2518, "loss_lm": 0.016976272221654654, "loss_seg": 0.23478450626134872, "mean_token_accuracy": 0.9951795637607574, "num_tokens": 706423602.0, "step": 1662 }, { "entropy": 0.019079437479376793, "epoch": 0.7278695699748331, "grad_norm": 16.0, "learning_rate": 2.642934488359502e-05, "loss": 0.1654, "loss_lm": 0.014083832502365112, "loss_seg": 0.15130471251904964, "mean_token_accuracy": 0.9950942993164062, "num_tokens": 706847989.0, "step": 1663 }, { "entropy": 0.01944953529164195, "epoch": 0.7283072546230441, "grad_norm": 11.5625, "learning_rate": 2.6426637791012455e-05, "loss": 0.1869, "loss_lm": 0.01664851442910731, "loss_seg": 0.17021279782056808, "mean_token_accuracy": 0.9952380508184433, "num_tokens": 707272553.0, "step": 1664 }, { "entropy": 0.019107897765934467, "epoch": 0.728744939271255, "grad_norm": 6.71875, "learning_rate": 2.6423930698429885e-05, "loss": 0.1435, "loss_lm": 0.01866121031343937, "loss_seg": 0.12483246996998787, "mean_token_accuracy": 0.9952711910009384, "num_tokens": 707697360.0, "step": 1665 }, { "entropy": 0.018458629492670298, "epoch": 0.729182623919466, "grad_norm": 23.5, "learning_rate": 2.642122360584732e-05, "loss": 0.1519, "loss_lm": 0.01888528885319829, "loss_seg": 0.1330520249903202, "mean_token_accuracy": 0.9953311085700989, "num_tokens": 708122654.0, "step": 1666 }, { "entropy": 0.01905133854597807, "epoch": 0.729620308567677, "grad_norm": 5.59375, "learning_rate": 2.6418516513264756e-05, "loss": 0.1421, "loss_lm": 0.01524960296228528, "loss_seg": 0.12683063931763172, "mean_token_accuracy": 0.9951681792736053, "num_tokens": 708547740.0, "step": 1667 }, { "entropy": 0.018591311294585466, "epoch": 0.7300579932158879, "grad_norm": 6.875, "learning_rate": 2.641580942068219e-05, "loss": 0.2341, "loss_lm": 0.01839334866963327, "loss_seg": 0.2156727984547615, "mean_token_accuracy": 0.995201051235199, "num_tokens": 708972585.0, "step": 1668 }, { "entropy": 0.01903429627418518, "epoch": 0.730495677864099, "grad_norm": 8.0, "learning_rate": 2.6413102328099623e-05, "loss": 0.1355, "loss_lm": 0.015564379515126348, "loss_seg": 0.11996703967452049, "mean_token_accuracy": 0.9951919764280319, "num_tokens": 709397677.0, "step": 1669 }, { "entropy": 0.01900353841483593, "epoch": 0.7309333625123099, "grad_norm": 9.0625, "learning_rate": 2.6410395235517054e-05, "loss": 0.122, "loss_lm": 0.016320773866027594, "loss_seg": 0.1056737583130598, "mean_token_accuracy": 0.9952110201120377, "num_tokens": 709822368.0, "step": 1670 }, { "entropy": 0.019092541188001633, "epoch": 0.7313710471605208, "grad_norm": 10.5625, "learning_rate": 2.6407688142934487e-05, "loss": 0.1465, "loss_lm": 0.016878033755347133, "loss_seg": 0.12957684695720673, "mean_token_accuracy": 0.9951588362455368, "num_tokens": 710247462.0, "step": 1671 }, { "entropy": 0.019223008304834366, "epoch": 0.7318087318087318, "grad_norm": 7.65625, "learning_rate": 2.6404981050351925e-05, "loss": 0.1937, "loss_lm": 0.018412681994959712, "loss_seg": 0.17529525607824326, "mean_token_accuracy": 0.9952346533536911, "num_tokens": 710673177.0, "step": 1672 }, { "entropy": 0.01909979386255145, "epoch": 0.7322464164569428, "grad_norm": 9.4375, "learning_rate": 2.6402273957769358e-05, "loss": 0.1718, "loss_lm": 0.01796873565763235, "loss_seg": 0.1538194753229618, "mean_token_accuracy": 0.9952205419540405, "num_tokens": 711098289.0, "step": 1673 }, { "entropy": 0.01941515225917101, "epoch": 0.7326841011051537, "grad_norm": 7.53125, "learning_rate": 2.6399566865186792e-05, "loss": 0.1385, "loss_lm": 0.01740917330607772, "loss_seg": 0.12113921158015728, "mean_token_accuracy": 0.9952640980482101, "num_tokens": 711523483.0, "step": 1674 }, { "entropy": 0.018710359930992126, "epoch": 0.7331217857533647, "grad_norm": 6.9375, "learning_rate": 2.6396859772604222e-05, "loss": 0.1879, "loss_lm": 0.015630883863195777, "loss_seg": 0.1722942590713501, "mean_token_accuracy": 0.9952530562877655, "num_tokens": 711947925.0, "step": 1675 }, { "entropy": 0.018647347576916218, "epoch": 0.7335594704015757, "grad_norm": 7.8125, "learning_rate": 2.6394152680021656e-05, "loss": 0.1517, "loss_lm": 0.01576504879631102, "loss_seg": 0.13593677431344986, "mean_token_accuracy": 0.9953014105558395, "num_tokens": 712373311.0, "step": 1676 }, { "entropy": 0.019524395000189543, "epoch": 0.7339971550497866, "grad_norm": 9.8125, "learning_rate": 2.639144558743909e-05, "loss": 0.11, "loss_lm": 0.018761531449854374, "loss_seg": 0.09120488539338112, "mean_token_accuracy": 0.9950410425662994, "num_tokens": 712798986.0, "step": 1677 }, { "entropy": 0.018907741643488407, "epoch": 0.7344348396979976, "grad_norm": 14.6875, "learning_rate": 2.6388738494856527e-05, "loss": 0.1801, "loss_lm": 0.015851166797801852, "loss_seg": 0.16420278698205948, "mean_token_accuracy": 0.9952014237642288, "num_tokens": 713224066.0, "step": 1678 }, { "entropy": 0.019261373672634363, "epoch": 0.7348725243462085, "grad_norm": 9.875, "learning_rate": 2.6386031402273957e-05, "loss": 0.1523, "loss_lm": 0.016839217161759734, "loss_seg": 0.13545888289809227, "mean_token_accuracy": 0.9952269196510315, "num_tokens": 713648078.0, "step": 1679 }, { "entropy": 0.018909951206296682, "epoch": 0.7353102089944196, "grad_norm": 8.1875, "learning_rate": 2.638332430969139e-05, "loss": 0.1777, "loss_lm": 0.018534115981310606, "loss_seg": 0.1591622643172741, "mean_token_accuracy": 0.9952365756034851, "num_tokens": 714073577.0, "step": 1680 }, { "entropy": 0.019281630404293537, "epoch": 0.7357478936426305, "grad_norm": 14.75, "learning_rate": 2.6380617217108825e-05, "loss": 0.1416, "loss_lm": 0.017265136819332838, "loss_seg": 0.12429076246917248, "mean_token_accuracy": 0.9951726049184799, "num_tokens": 714498788.0, "step": 1681 }, { "entropy": 0.019442453980445862, "epoch": 0.7361855782908414, "grad_norm": 11.8125, "learning_rate": 2.637791012452626e-05, "loss": 0.1491, "loss_lm": 0.01629106281325221, "loss_seg": 0.13285018131136894, "mean_token_accuracy": 0.9951771646738052, "num_tokens": 714924412.0, "step": 1682 }, { "entropy": 0.018765263725072145, "epoch": 0.7366232629390524, "grad_norm": 10.125, "learning_rate": 2.6375203031943695e-05, "loss": 0.1797, "loss_lm": 0.017972813919186592, "loss_seg": 0.1617750134319067, "mean_token_accuracy": 0.9953247457742691, "num_tokens": 715349563.0, "step": 1683 }, { "entropy": 0.018710958771407604, "epoch": 0.7370609475872634, "grad_norm": 10.9375, "learning_rate": 2.6372495939361126e-05, "loss": 0.1245, "loss_lm": 0.0158617349807173, "loss_seg": 0.10867105983197689, "mean_token_accuracy": 0.9953538328409195, "num_tokens": 715774152.0, "step": 1684 }, { "entropy": 0.018962702248245478, "epoch": 0.7374986322354743, "grad_norm": 39.25, "learning_rate": 2.636978884677856e-05, "loss": 0.148, "loss_lm": 0.017158661736175418, "loss_seg": 0.13087263703346252, "mean_token_accuracy": 0.9952043145895004, "num_tokens": 716198461.0, "step": 1685 }, { "entropy": 0.01939413556829095, "epoch": 0.7379363168836853, "grad_norm": 21.125, "learning_rate": 2.6367081754195993e-05, "loss": 0.1738, "loss_lm": 0.018387942342087626, "loss_seg": 0.1554603297263384, "mean_token_accuracy": 0.9950994253158569, "num_tokens": 716623902.0, "step": 1686 }, { "entropy": 0.01939264591783285, "epoch": 0.7383740015318963, "grad_norm": 6.15625, "learning_rate": 2.6364374661613427e-05, "loss": 0.1114, "loss_lm": 0.016631616046652198, "loss_seg": 0.0947834812104702, "mean_token_accuracy": 0.9952246993780136, "num_tokens": 717048970.0, "step": 1687 }, { "entropy": 0.01951401447877288, "epoch": 0.7388116861801073, "grad_norm": 11.5, "learning_rate": 2.6361667569030864e-05, "loss": 0.183, "loss_lm": 0.016187736997380853, "loss_seg": 0.1668001115322113, "mean_token_accuracy": 0.9950662851333618, "num_tokens": 717473983.0, "step": 1688 }, { "entropy": 0.019528675824403763, "epoch": 0.7392493708283182, "grad_norm": 10.125, "learning_rate": 2.6358960476448294e-05, "loss": 0.1328, "loss_lm": 0.014653192134574056, "loss_seg": 0.11810922808945179, "mean_token_accuracy": 0.9951538145542145, "num_tokens": 717898768.0, "step": 1689 }, { "entropy": 0.019538532476872206, "epoch": 0.7396870554765291, "grad_norm": 10.8125, "learning_rate": 2.6356253383865728e-05, "loss": 0.1848, "loss_lm": 0.015722638927400112, "loss_seg": 0.16907301358878613, "mean_token_accuracy": 0.9951853156089783, "num_tokens": 718324498.0, "step": 1690 }, { "entropy": 0.01946618827059865, "epoch": 0.7401247401247402, "grad_norm": 23.125, "learning_rate": 2.6353546291283162e-05, "loss": 0.1289, "loss_lm": 0.017235775711014867, "loss_seg": 0.1116199754178524, "mean_token_accuracy": 0.9951149970293045, "num_tokens": 718749668.0, "step": 1691 }, { "entropy": 0.01886589080095291, "epoch": 0.7405624247729511, "grad_norm": 8.875, "learning_rate": 2.6350839198700596e-05, "loss": 0.1153, "loss_lm": 0.015450736740604043, "loss_seg": 0.09987312369048595, "mean_token_accuracy": 0.9952538311481476, "num_tokens": 719174384.0, "step": 1692 }, { "entropy": 0.01932638743892312, "epoch": 0.741000109421162, "grad_norm": 5.84375, "learning_rate": 2.6348132106118033e-05, "loss": 0.1513, "loss_lm": 0.017933545168489218, "loss_seg": 0.13341273181140423, "mean_token_accuracy": 0.9952336996793747, "num_tokens": 719598999.0, "step": 1693 }, { "entropy": 0.01969212992116809, "epoch": 0.741437794069373, "grad_norm": 5.6875, "learning_rate": 2.6345425013535463e-05, "loss": 0.1452, "loss_lm": 0.01535733719356358, "loss_seg": 0.12986251339316368, "mean_token_accuracy": 0.9950503706932068, "num_tokens": 720023664.0, "step": 1694 }, { "entropy": 0.018674525897949934, "epoch": 0.741875478717584, "grad_norm": 17.0, "learning_rate": 2.6342717920952897e-05, "loss": 0.1453, "loss_lm": 0.016205267515033484, "loss_seg": 0.12906166724860668, "mean_token_accuracy": 0.9953716993331909, "num_tokens": 720449471.0, "step": 1695 }, { "entropy": 0.01902249688282609, "epoch": 0.7423131633657949, "grad_norm": 8.25, "learning_rate": 2.634001082837033e-05, "loss": 0.1087, "loss_lm": 0.016639008419588208, "loss_seg": 0.09202315844595432, "mean_token_accuracy": 0.995285302400589, "num_tokens": 720874468.0, "step": 1696 }, { "entropy": 0.018936604261398315, "epoch": 0.7427508480140059, "grad_norm": 10.9375, "learning_rate": 2.6337303735787764e-05, "loss": 0.0976, "loss_lm": 0.01618148572742939, "loss_seg": 0.08139861188828945, "mean_token_accuracy": 0.9952973872423172, "num_tokens": 721299490.0, "step": 1697 }, { "entropy": 0.01987195387482643, "epoch": 0.7431885326622168, "grad_norm": 7.3125, "learning_rate": 2.63345966432052e-05, "loss": 0.147, "loss_lm": 0.01853044074960053, "loss_seg": 0.12850037589669228, "mean_token_accuracy": 0.9950956106185913, "num_tokens": 721724192.0, "step": 1698 }, { "entropy": 0.019052443094551563, "epoch": 0.7436262173104279, "grad_norm": 7.09375, "learning_rate": 2.6331889550622632e-05, "loss": 0.1205, "loss_lm": 0.014969041803851724, "loss_seg": 0.10556880570948124, "mean_token_accuracy": 0.9952804446220398, "num_tokens": 722149144.0, "step": 1699 }, { "entropy": 0.018961823545396328, "epoch": 0.7440639019586388, "grad_norm": 10.5, "learning_rate": 2.6329182458040065e-05, "loss": 0.108, "loss_lm": 0.015300649451091886, "loss_seg": 0.09265412017703056, "mean_token_accuracy": 0.9954109787940979, "num_tokens": 722574040.0, "step": 1700 }, { "entropy": 0.019795699045062065, "epoch": 0.7445015866068497, "grad_norm": 14.3125, "learning_rate": 2.63264753654575e-05, "loss": 0.142, "loss_lm": 0.018916598521173, "loss_seg": 0.12308386154472828, "mean_token_accuracy": 0.9951200634241104, "num_tokens": 722999019.0, "step": 1701 }, { "entropy": 0.019389587454497814, "epoch": 0.7449392712550608, "grad_norm": 9.5625, "learning_rate": 2.6323768272874933e-05, "loss": 0.1586, "loss_lm": 0.01634851540438831, "loss_seg": 0.1422928236424923, "mean_token_accuracy": 0.9951248317956924, "num_tokens": 723423596.0, "step": 1702 }, { "entropy": 0.018825704231858253, "epoch": 0.7453769559032717, "grad_norm": 6.8125, "learning_rate": 2.6321061180292367e-05, "loss": 0.1174, "loss_lm": 0.01653060596436262, "loss_seg": 0.10087233036756516, "mean_token_accuracy": 0.9952742755413055, "num_tokens": 723847601.0, "step": 1703 }, { "entropy": 0.019773182459175587, "epoch": 0.7458146405514826, "grad_norm": 18.625, "learning_rate": 2.63183540877098e-05, "loss": 0.1811, "loss_lm": 0.017695113783702254, "loss_seg": 0.16340987663716078, "mean_token_accuracy": 0.9950878024101257, "num_tokens": 724272848.0, "step": 1704 }, { "entropy": 0.01957549247890711, "epoch": 0.7462523251996936, "grad_norm": 12.625, "learning_rate": 2.6315646995127234e-05, "loss": 0.1452, "loss_lm": 0.014379917411133647, "loss_seg": 0.13077831640839577, "mean_token_accuracy": 0.9951021075248718, "num_tokens": 724698301.0, "step": 1705 }, { "entropy": 0.018853507936000824, "epoch": 0.7466900098479046, "grad_norm": 12.125, "learning_rate": 2.6312939902544668e-05, "loss": 0.168, "loss_lm": 0.015825011068955064, "loss_seg": 0.15216308273375034, "mean_token_accuracy": 0.9952920526266098, "num_tokens": 725123074.0, "step": 1706 }, { "entropy": 0.01884782314300537, "epoch": 0.7471276944961156, "grad_norm": 7.28125, "learning_rate": 2.63102328099621e-05, "loss": 0.1369, "loss_lm": 0.015577148646116257, "loss_seg": 0.12128149531781673, "mean_token_accuracy": 0.9952832907438278, "num_tokens": 725548762.0, "step": 1707 }, { "entropy": 0.019386010244488716, "epoch": 0.7475653791443265, "grad_norm": 12.875, "learning_rate": 2.6307525717379535e-05, "loss": 0.1421, "loss_lm": 0.014090839307755232, "loss_seg": 0.128008883446455, "mean_token_accuracy": 0.9952525496482849, "num_tokens": 725974014.0, "step": 1708 }, { "entropy": 0.017973568756133318, "epoch": 0.7480030637925374, "grad_norm": 8.1875, "learning_rate": 2.630481862479697e-05, "loss": 0.1894, "loss_lm": 0.01420088903978467, "loss_seg": 0.17523043043911457, "mean_token_accuracy": 0.995599240064621, "num_tokens": 726399101.0, "step": 1709 }, { "entropy": 0.01945910882204771, "epoch": 0.7484407484407485, "grad_norm": 11.3125, "learning_rate": 2.6302111532214403e-05, "loss": 0.1369, "loss_lm": 0.01725597563199699, "loss_seg": 0.1195988617837429, "mean_token_accuracy": 0.9950926303863525, "num_tokens": 726824964.0, "step": 1710 }, { "entropy": 0.019475516863167286, "epoch": 0.7488784330889594, "grad_norm": 9.125, "learning_rate": 2.6299404439631836e-05, "loss": 0.159, "loss_lm": 0.01985191274434328, "loss_seg": 0.13912459835410118, "mean_token_accuracy": 0.995137482881546, "num_tokens": 727249202.0, "step": 1711 }, { "entropy": 0.018936427775770426, "epoch": 0.7493161177371703, "grad_norm": 12.625, "learning_rate": 2.629669734704927e-05, "loss": 0.1339, "loss_lm": 0.017624807776883245, "loss_seg": 0.11631463840603828, "mean_token_accuracy": 0.9952666312456131, "num_tokens": 727673551.0, "step": 1712 }, { "entropy": 0.01931011490523815, "epoch": 0.7497538023853814, "grad_norm": 5.0625, "learning_rate": 2.62939902544667e-05, "loss": 0.1493, "loss_lm": 0.015227924101054668, "loss_seg": 0.13410355523228645, "mean_token_accuracy": 0.9952318519353867, "num_tokens": 728098820.0, "step": 1713 }, { "entropy": 0.019555234350264072, "epoch": 0.7501914870335923, "grad_norm": 10.8125, "learning_rate": 2.6291283161884138e-05, "loss": 0.1498, "loss_lm": 0.016717046266421676, "loss_seg": 0.13311788253486156, "mean_token_accuracy": 0.9950561225414276, "num_tokens": 728523980.0, "step": 1714 }, { "entropy": 0.019173407927155495, "epoch": 0.7506291716818032, "grad_norm": 6.3125, "learning_rate": 2.628857606930157e-05, "loss": 0.1824, "loss_lm": 0.015811488963663578, "loss_seg": 0.16654511541128159, "mean_token_accuracy": 0.9951307326555252, "num_tokens": 728948242.0, "step": 1715 }, { "entropy": 0.019339806400239468, "epoch": 0.7510668563300142, "grad_norm": 7.03125, "learning_rate": 2.6285868976719005e-05, "loss": 0.1489, "loss_lm": 0.016271653352305293, "loss_seg": 0.13259941898286343, "mean_token_accuracy": 0.995215967297554, "num_tokens": 729373162.0, "step": 1716 }, { "entropy": 0.01999638881534338, "epoch": 0.7515045409782252, "grad_norm": 12.75, "learning_rate": 2.628316188413644e-05, "loss": 0.2007, "loss_lm": 0.017860125983133912, "loss_seg": 0.18284175731241703, "mean_token_accuracy": 0.9949518740177155, "num_tokens": 729798251.0, "step": 1717 }, { "entropy": 0.01945339795202017, "epoch": 0.7519422256264362, "grad_norm": 5.9375, "learning_rate": 2.628045479155387e-05, "loss": 0.1835, "loss_lm": 0.01808265084400773, "loss_seg": 0.16540087386965752, "mean_token_accuracy": 0.9950669854879379, "num_tokens": 730223323.0, "step": 1718 }, { "entropy": 0.02017759485170245, "epoch": 0.7523799102746471, "grad_norm": 17.125, "learning_rate": 2.6277747698971306e-05, "loss": 0.1651, "loss_lm": 0.01890823058784008, "loss_seg": 0.1461748518049717, "mean_token_accuracy": 0.9950281232595444, "num_tokens": 730648487.0, "step": 1719 }, { "entropy": 0.01908969460055232, "epoch": 0.752817594922858, "grad_norm": 7.875, "learning_rate": 2.627504060638874e-05, "loss": 0.2084, "loss_lm": 0.01775859296321869, "loss_seg": 0.19062772765755653, "mean_token_accuracy": 0.9950976967811584, "num_tokens": 731073156.0, "step": 1720 }, { "entropy": 0.019349998328834772, "epoch": 0.7532552795710691, "grad_norm": 7.0, "learning_rate": 2.6272333513806174e-05, "loss": 0.154, "loss_lm": 0.017907427856698632, "loss_seg": 0.13609549403190613, "mean_token_accuracy": 0.9951318055391312, "num_tokens": 731498326.0, "step": 1721 }, { "entropy": 0.020084715448319912, "epoch": 0.75369296421928, "grad_norm": 14.625, "learning_rate": 2.6269626421223607e-05, "loss": 0.1445, "loss_lm": 0.01832118653692305, "loss_seg": 0.1261486280709505, "mean_token_accuracy": 0.9950347989797592, "num_tokens": 731923643.0, "step": 1722 }, { "entropy": 0.019727840088307858, "epoch": 0.7541306488674909, "grad_norm": 8.5, "learning_rate": 2.6266919328641038e-05, "loss": 0.1946, "loss_lm": 0.01622176100499928, "loss_seg": 0.1783953607082367, "mean_token_accuracy": 0.9951265156269073, "num_tokens": 732349397.0, "step": 1723 }, { "entropy": 0.01955346530303359, "epoch": 0.754568333515702, "grad_norm": 8.6875, "learning_rate": 2.6264212236058475e-05, "loss": 0.1507, "loss_lm": 0.016257635783404112, "loss_seg": 0.13440097868442535, "mean_token_accuracy": 0.9951743930578232, "num_tokens": 732774583.0, "step": 1724 }, { "entropy": 0.01913127163425088, "epoch": 0.7550060181639129, "grad_norm": 6.875, "learning_rate": 2.626150514347591e-05, "loss": 0.1263, "loss_lm": 0.016022175317630172, "loss_seg": 0.11028940230607986, "mean_token_accuracy": 0.9953182637691498, "num_tokens": 733199813.0, "step": 1725 }, { "entropy": 0.019235233310610056, "epoch": 0.7554437028121239, "grad_norm": 6.03125, "learning_rate": 2.6258798050893342e-05, "loss": 0.1387, "loss_lm": 0.016221564961597323, "loss_seg": 0.12252775393426418, "mean_token_accuracy": 0.9952253550291061, "num_tokens": 733625118.0, "step": 1726 }, { "entropy": 0.01890453975647688, "epoch": 0.7558813874603348, "grad_norm": 5.875, "learning_rate": 2.6256090958310773e-05, "loss": 0.1122, "loss_lm": 0.019048570888116956, "loss_seg": 0.09314985387027264, "mean_token_accuracy": 0.9952883124351501, "num_tokens": 734050104.0, "step": 1727 }, { "entropy": 0.019854835234582424, "epoch": 0.7563190721085458, "grad_norm": 8.875, "learning_rate": 2.6253383865728206e-05, "loss": 0.0982, "loss_lm": 0.017686235951259732, "loss_seg": 0.0805592592805624, "mean_token_accuracy": 0.9950420260429382, "num_tokens": 734475254.0, "step": 1728 }, { "entropy": 0.01867088722065091, "epoch": 0.7567567567567568, "grad_norm": 10.625, "learning_rate": 2.6250676773145644e-05, "loss": 0.1554, "loss_lm": 0.016463097417727113, "loss_seg": 0.138894310221076, "mean_token_accuracy": 0.9953152686357498, "num_tokens": 734900436.0, "step": 1729 }, { "entropy": 0.01917132828384638, "epoch": 0.7571944414049677, "grad_norm": 14.9375, "learning_rate": 2.6247969680563077e-05, "loss": 0.1013, "loss_lm": 0.016262495191767812, "loss_seg": 0.08507175836712122, "mean_token_accuracy": 0.9951356202363968, "num_tokens": 735326159.0, "step": 1730 }, { "entropy": 0.019472311716526747, "epoch": 0.7576321260531786, "grad_norm": 7.09375, "learning_rate": 2.624526258798051e-05, "loss": 0.1782, "loss_lm": 0.01723674382083118, "loss_seg": 0.1609807275235653, "mean_token_accuracy": 0.995066687464714, "num_tokens": 735751438.0, "step": 1731 }, { "entropy": 0.019005538430064917, "epoch": 0.7580698107013897, "grad_norm": 9.9375, "learning_rate": 2.624255549539794e-05, "loss": 0.1476, "loss_lm": 0.017236303770914674, "loss_seg": 0.1303407996892929, "mean_token_accuracy": 0.9952498078346252, "num_tokens": 736177059.0, "step": 1732 }, { "entropy": 0.019807029981166124, "epoch": 0.7585074953496006, "grad_norm": 9.75, "learning_rate": 2.6239848402815375e-05, "loss": 0.2097, "loss_lm": 0.016133270226418972, "loss_seg": 0.19359244219958782, "mean_token_accuracy": 0.9950885772705078, "num_tokens": 736603146.0, "step": 1733 }, { "entropy": 0.01912073977291584, "epoch": 0.7589451799978115, "grad_norm": 11.0, "learning_rate": 2.6237141310232812e-05, "loss": 0.1186, "loss_lm": 0.01678632735274732, "loss_seg": 0.10178567469120026, "mean_token_accuracy": 0.9951311200857162, "num_tokens": 737027455.0, "step": 1734 }, { "entropy": 0.020041823852807283, "epoch": 0.7593828646460226, "grad_norm": 6.375, "learning_rate": 2.6234434217650246e-05, "loss": 0.1414, "loss_lm": 0.017903804080560803, "loss_seg": 0.1234846655279398, "mean_token_accuracy": 0.9949952960014343, "num_tokens": 737453037.0, "step": 1735 }, { "entropy": 0.019430491141974926, "epoch": 0.7598205492942335, "grad_norm": 9.0625, "learning_rate": 2.623172712506768e-05, "loss": 0.127, "loss_lm": 0.01523645338602364, "loss_seg": 0.11178679950535297, "mean_token_accuracy": 0.9951263219118118, "num_tokens": 737878560.0, "step": 1736 }, { "entropy": 0.0192705187946558, "epoch": 0.7602582339424445, "grad_norm": 34.25, "learning_rate": 2.622902003248511e-05, "loss": 0.1426, "loss_lm": 0.01752557558938861, "loss_seg": 0.12503083050251007, "mean_token_accuracy": 0.9952025711536407, "num_tokens": 738302711.0, "step": 1737 }, { "entropy": 0.01939929649233818, "epoch": 0.7606959185906554, "grad_norm": 8.625, "learning_rate": 2.6226312939902544e-05, "loss": 0.1524, "loss_lm": 0.018780753947794437, "loss_seg": 0.13358144834637642, "mean_token_accuracy": 0.9951444119215012, "num_tokens": 738727347.0, "step": 1738 }, { "entropy": 0.01854746649041772, "epoch": 0.7611336032388664, "grad_norm": 23.75, "learning_rate": 2.622360584731998e-05, "loss": 0.1797, "loss_lm": 0.015356595162302256, "loss_seg": 0.16435369849205017, "mean_token_accuracy": 0.9953342527151108, "num_tokens": 739152084.0, "step": 1739 }, { "entropy": 0.01884452300146222, "epoch": 0.7615712878870774, "grad_norm": 22.875, "learning_rate": 2.6220898754737414e-05, "loss": 0.1257, "loss_lm": 0.014166115317493677, "loss_seg": 0.1115403100848198, "mean_token_accuracy": 0.9951716512441635, "num_tokens": 739576811.0, "step": 1740 }, { "entropy": 0.01853262772783637, "epoch": 0.7620089725352883, "grad_norm": 11.25, "learning_rate": 2.6218191662154848e-05, "loss": 0.1215, "loss_lm": 0.017270189244300127, "loss_seg": 0.10420216619968414, "mean_token_accuracy": 0.9952627718448639, "num_tokens": 740002334.0, "step": 1741 }, { "entropy": 0.01921224594116211, "epoch": 0.7624466571834992, "grad_norm": 13.25, "learning_rate": 2.621548456957228e-05, "loss": 0.1405, "loss_lm": 0.015555223682895303, "loss_seg": 0.12491861172020435, "mean_token_accuracy": 0.9951238036155701, "num_tokens": 740427252.0, "step": 1742 }, { "entropy": 0.01894852565601468, "epoch": 0.7628843418317103, "grad_norm": 15.125, "learning_rate": 2.6212777476989712e-05, "loss": 0.1653, "loss_lm": 0.018718084320425987, "loss_seg": 0.146629823371768, "mean_token_accuracy": 0.9951933324337006, "num_tokens": 740852663.0, "step": 1743 }, { "entropy": 0.019643038045614958, "epoch": 0.7633220264799212, "grad_norm": 22.125, "learning_rate": 2.6210070384407146e-05, "loss": 0.1495, "loss_lm": 0.017284509958699346, "loss_seg": 0.13220788165926933, "mean_token_accuracy": 0.9950826466083527, "num_tokens": 741278107.0, "step": 1744 }, { "entropy": 0.018951451871544123, "epoch": 0.7637597111281322, "grad_norm": 5.46875, "learning_rate": 2.6207363291824583e-05, "loss": 0.1449, "loss_lm": 0.017382930032908916, "loss_seg": 0.1274894531816244, "mean_token_accuracy": 0.9952110797166824, "num_tokens": 741702772.0, "step": 1745 }, { "entropy": 0.019043582025915384, "epoch": 0.7641973957763432, "grad_norm": 10.375, "learning_rate": 2.6204656199242017e-05, "loss": 0.1823, "loss_lm": 0.014783689752221107, "loss_seg": 0.16749268025159836, "mean_token_accuracy": 0.9951549619436264, "num_tokens": 742127116.0, "step": 1746 }, { "entropy": 0.01999959023669362, "epoch": 0.7646350804245541, "grad_norm": 6.53125, "learning_rate": 2.6201949106659447e-05, "loss": 0.1747, "loss_lm": 0.014917899388819933, "loss_seg": 0.15974079631268978, "mean_token_accuracy": 0.9949569702148438, "num_tokens": 742552863.0, "step": 1747 }, { "entropy": 0.019522272050380707, "epoch": 0.7650727650727651, "grad_norm": 17.75, "learning_rate": 2.619924201407688e-05, "loss": 0.1675, "loss_lm": 0.018744295462965965, "loss_seg": 0.14877424761652946, "mean_token_accuracy": 0.995199665427208, "num_tokens": 742978636.0, "step": 1748 }, { "entropy": 0.018518791534006596, "epoch": 0.765510449720976, "grad_norm": 11.25, "learning_rate": 2.6196534921494315e-05, "loss": 0.1631, "loss_lm": 0.015446779550984502, "loss_seg": 0.1476398017257452, "mean_token_accuracy": 0.995305523276329, "num_tokens": 743404200.0, "step": 1749 }, { "entropy": 0.019109900575131178, "epoch": 0.765948134369187, "grad_norm": 9.6875, "learning_rate": 2.6193827828911752e-05, "loss": 0.1676, "loss_lm": 0.01775194238871336, "loss_seg": 0.14986471459269524, "mean_token_accuracy": 0.9952332526445389, "num_tokens": 743829186.0, "step": 1750 }, { "entropy": 0.019400852266699076, "epoch": 0.766385819017398, "grad_norm": 8.5625, "learning_rate": 2.6191120736329182e-05, "loss": 0.19, "loss_lm": 0.018298832699656487, "loss_seg": 0.1717237252742052, "mean_token_accuracy": 0.9952309727668762, "num_tokens": 744253734.0, "step": 1751 }, { "entropy": 0.019253486767411232, "epoch": 0.7668235036656089, "grad_norm": 8.875, "learning_rate": 2.6188413643746616e-05, "loss": 0.1536, "loss_lm": 0.018016204936429858, "loss_seg": 0.13561192527413368, "mean_token_accuracy": 0.9951531887054443, "num_tokens": 744679268.0, "step": 1752 }, { "entropy": 0.019415898714214563, "epoch": 0.7672611883138198, "grad_norm": 7.375, "learning_rate": 2.618570655116405e-05, "loss": 0.1414, "loss_lm": 0.017538440879434347, "loss_seg": 0.1238242294639349, "mean_token_accuracy": 0.995109960436821, "num_tokens": 745104783.0, "step": 1753 }, { "entropy": 0.019614926539361477, "epoch": 0.7676988729620309, "grad_norm": 16.125, "learning_rate": 2.6182999458581483e-05, "loss": 0.1549, "loss_lm": 0.016527655767276883, "loss_seg": 0.1383248269557953, "mean_token_accuracy": 0.995161846280098, "num_tokens": 745530144.0, "step": 1754 }, { "entropy": 0.019512843806296587, "epoch": 0.7681365576102418, "grad_norm": 7.375, "learning_rate": 2.618029236599892e-05, "loss": 0.1281, "loss_lm": 0.015099078649654984, "loss_seg": 0.11297834292054176, "mean_token_accuracy": 0.99518021941185, "num_tokens": 745955028.0, "step": 1755 }, { "entropy": 0.019151323940604925, "epoch": 0.7685742422584528, "grad_norm": 6.1875, "learning_rate": 2.617758527341635e-05, "loss": 0.2007, "loss_lm": 0.017463771859183908, "loss_seg": 0.18325053807348013, "mean_token_accuracy": 0.9952923953533173, "num_tokens": 746379571.0, "step": 1756 }, { "entropy": 0.018988167867064476, "epoch": 0.7690119269066638, "grad_norm": 5.0625, "learning_rate": 2.6174878180833784e-05, "loss": 0.1195, "loss_lm": 0.01638472662307322, "loss_seg": 0.10309486743062735, "mean_token_accuracy": 0.99526447057724, "num_tokens": 746804635.0, "step": 1757 }, { "entropy": 0.01971560576930642, "epoch": 0.7694496115548747, "grad_norm": 15.375, "learning_rate": 2.6172171088251218e-05, "loss": 0.143, "loss_lm": 0.018536750692874193, "loss_seg": 0.12442439142614603, "mean_token_accuracy": 0.9951288849115372, "num_tokens": 747229757.0, "step": 1758 }, { "entropy": 0.019179109949618578, "epoch": 0.7698872962030857, "grad_norm": 9.875, "learning_rate": 2.6169463995668652e-05, "loss": 0.1623, "loss_lm": 0.019172392785549164, "loss_seg": 0.14311356469988823, "mean_token_accuracy": 0.9952079057693481, "num_tokens": 747655440.0, "step": 1759 }, { "entropy": 0.018796726129949093, "epoch": 0.7703249808512966, "grad_norm": 16.5, "learning_rate": 2.616675690308609e-05, "loss": 0.1609, "loss_lm": 0.015627668472006917, "loss_seg": 0.14529436081647873, "mean_token_accuracy": 0.9953284114599228, "num_tokens": 748080323.0, "step": 1760 }, { "entropy": 0.01917925663292408, "epoch": 0.7707626654995076, "grad_norm": 10.3125, "learning_rate": 2.616404981050352e-05, "loss": 0.1539, "loss_lm": 0.01783291297033429, "loss_seg": 0.13609004393219948, "mean_token_accuracy": 0.9952394366264343, "num_tokens": 748505434.0, "step": 1761 }, { "entropy": 0.01906150532886386, "epoch": 0.7712003501477186, "grad_norm": 16.0, "learning_rate": 2.6161342717920953e-05, "loss": 0.0987, "loss_lm": 0.01590395625680685, "loss_seg": 0.08276947401463985, "mean_token_accuracy": 0.9953567683696747, "num_tokens": 748930514.0, "step": 1762 }, { "entropy": 0.019461941439658403, "epoch": 0.7716380347959295, "grad_norm": 32.5, "learning_rate": 2.6158635625338387e-05, "loss": 0.1503, "loss_lm": 0.017709827283397317, "loss_seg": 0.1325870268046856, "mean_token_accuracy": 0.9952500611543655, "num_tokens": 749355904.0, "step": 1763 }, { "entropy": 0.01889191335067153, "epoch": 0.7720757194441406, "grad_norm": 10.375, "learning_rate": 2.615592853275582e-05, "loss": 0.1254, "loss_lm": 0.01637409720569849, "loss_seg": 0.1090115662664175, "mean_token_accuracy": 0.995352640748024, "num_tokens": 749780623.0, "step": 1764 }, { "entropy": 0.019190100952982903, "epoch": 0.7725134040923515, "grad_norm": 8.75, "learning_rate": 2.6153221440173258e-05, "loss": 0.1566, "loss_lm": 0.017654925817623734, "loss_seg": 0.13893542625010014, "mean_token_accuracy": 0.9951543062925339, "num_tokens": 750206120.0, "step": 1765 }, { "entropy": 0.019171159714460373, "epoch": 0.7729510887405624, "grad_norm": 10.4375, "learning_rate": 2.6150514347590688e-05, "loss": 0.1661, "loss_lm": 0.017047896282747388, "loss_seg": 0.149073189124465, "mean_token_accuracy": 0.9951562136411667, "num_tokens": 750631372.0, "step": 1766 }, { "entropy": 0.019774099811911583, "epoch": 0.7733887733887734, "grad_norm": 12.75, "learning_rate": 2.6147807255008122e-05, "loss": 0.1621, "loss_lm": 0.013890999602153897, "loss_seg": 0.14821546152234077, "mean_token_accuracy": 0.9952040761709213, "num_tokens": 751056058.0, "step": 1767 }, { "entropy": 0.018842444755136967, "epoch": 0.7738264580369844, "grad_norm": 11.3125, "learning_rate": 2.6145100162425555e-05, "loss": 0.1156, "loss_lm": 0.01563167874701321, "loss_seg": 0.09999046288430691, "mean_token_accuracy": 0.9952811449766159, "num_tokens": 751481515.0, "step": 1768 }, { "entropy": 0.019752629101276398, "epoch": 0.7742641426851953, "grad_norm": 13.4375, "learning_rate": 2.614239306984299e-05, "loss": 0.1488, "loss_lm": 0.017694301437586546, "loss_seg": 0.13112915866076946, "mean_token_accuracy": 0.9952538907527924, "num_tokens": 751907074.0, "step": 1769 }, { "entropy": 0.019668371882289648, "epoch": 0.7747018273334063, "grad_norm": 8.3125, "learning_rate": 2.6139685977260423e-05, "loss": 0.1629, "loss_lm": 0.01887131528928876, "loss_seg": 0.14407131634652615, "mean_token_accuracy": 0.9950249195098877, "num_tokens": 752332348.0, "step": 1770 }, { "entropy": 0.01936663081869483, "epoch": 0.7751395119816172, "grad_norm": 11.8125, "learning_rate": 2.6136978884677857e-05, "loss": 0.1485, "loss_lm": 0.019560597836971283, "loss_seg": 0.128978056833148, "mean_token_accuracy": 0.9951183348894119, "num_tokens": 752757794.0, "step": 1771 }, { "entropy": 0.019012628123164177, "epoch": 0.7755771966298282, "grad_norm": 13.875, "learning_rate": 2.613427179209529e-05, "loss": 0.1294, "loss_lm": 0.015319214668124914, "loss_seg": 0.11405091173946857, "mean_token_accuracy": 0.9952844530344009, "num_tokens": 753182059.0, "step": 1772 }, { "entropy": 0.019270923919975758, "epoch": 0.7760148812780392, "grad_norm": 10.9375, "learning_rate": 2.6131564699512724e-05, "loss": 0.1429, "loss_lm": 0.01753033115528524, "loss_seg": 0.12540813721716404, "mean_token_accuracy": 0.9952303767204285, "num_tokens": 753608060.0, "step": 1773 }, { "entropy": 0.018871793057769537, "epoch": 0.7764525659262501, "grad_norm": 13.5, "learning_rate": 2.6128857606930158e-05, "loss": 0.2028, "loss_lm": 0.017540042754262686, "loss_seg": 0.1852625198662281, "mean_token_accuracy": 0.9953768402338028, "num_tokens": 754033660.0, "step": 1774 }, { "entropy": 0.019277256913483143, "epoch": 0.7768902505744611, "grad_norm": 12.0625, "learning_rate": 2.612615051434759e-05, "loss": 0.1414, "loss_lm": 0.015609013848006725, "loss_seg": 0.12583372928202152, "mean_token_accuracy": 0.9952349215745926, "num_tokens": 754459433.0, "step": 1775 }, { "entropy": 0.01933103147894144, "epoch": 0.7773279352226721, "grad_norm": 5.9375, "learning_rate": 2.6123443421765025e-05, "loss": 0.1234, "loss_lm": 0.016300315037369728, "loss_seg": 0.10712573304772377, "mean_token_accuracy": 0.99527907371521, "num_tokens": 754884293.0, "step": 1776 }, { "entropy": 0.01878427853807807, "epoch": 0.777765619870883, "grad_norm": 8.25, "learning_rate": 2.612073632918246e-05, "loss": 0.1424, "loss_lm": 0.017738602589815855, "loss_seg": 0.124635836109519, "mean_token_accuracy": 0.9952638000249863, "num_tokens": 755309036.0, "step": 1777 }, { "entropy": 0.01897771703079343, "epoch": 0.778203304519094, "grad_norm": 13.0, "learning_rate": 2.6118029236599893e-05, "loss": 0.1598, "loss_lm": 0.016870013438165188, "loss_seg": 0.14290245436131954, "mean_token_accuracy": 0.9951777458190918, "num_tokens": 755733907.0, "step": 1778 }, { "entropy": 0.018796308431774378, "epoch": 0.778640989167305, "grad_norm": 9.25, "learning_rate": 2.6115322144017326e-05, "loss": 0.1937, "loss_lm": 0.01739499415270984, "loss_seg": 0.17633956484496593, "mean_token_accuracy": 0.9952931851148605, "num_tokens": 756158360.0, "step": 1779 }, { "entropy": 0.019244427792727947, "epoch": 0.7790786738155159, "grad_norm": 10.875, "learning_rate": 2.6112615051434757e-05, "loss": 0.1611, "loss_lm": 0.017454559914767742, "loss_seg": 0.14363060146570206, "mean_token_accuracy": 0.9951438009738922, "num_tokens": 756583745.0, "step": 1780 }, { "entropy": 0.01946415565907955, "epoch": 0.7795163584637269, "grad_norm": 7.90625, "learning_rate": 2.6109907958852194e-05, "loss": 0.1775, "loss_lm": 0.015492877690121531, "loss_seg": 0.1619829535484314, "mean_token_accuracy": 0.9951265454292297, "num_tokens": 757008885.0, "step": 1781 }, { "entropy": 0.01867098268121481, "epoch": 0.7799540431119378, "grad_norm": 9.25, "learning_rate": 2.6107200866269628e-05, "loss": 0.1325, "loss_lm": 0.016933808103203773, "loss_seg": 0.1155927013605833, "mean_token_accuracy": 0.9953214973211288, "num_tokens": 757433985.0, "step": 1782 }, { "entropy": 0.019175443332642317, "epoch": 0.7803917277601489, "grad_norm": 15.5625, "learning_rate": 2.610449377368706e-05, "loss": 0.1856, "loss_lm": 0.01817059190943837, "loss_seg": 0.1674514226615429, "mean_token_accuracy": 0.9951490312814713, "num_tokens": 757859022.0, "step": 1783 }, { "entropy": 0.01916021015495062, "epoch": 0.7808294124083598, "grad_norm": 37.5, "learning_rate": 2.6101786681104495e-05, "loss": 0.1409, "loss_lm": 0.016861064359545708, "loss_seg": 0.12400862015783787, "mean_token_accuracy": 0.9952006787061691, "num_tokens": 758284029.0, "step": 1784 }, { "entropy": 0.019460052717477083, "epoch": 0.7812670970565707, "grad_norm": 26.5, "learning_rate": 2.6099079588521925e-05, "loss": 0.1302, "loss_lm": 0.015414649853482842, "loss_seg": 0.11479560099542141, "mean_token_accuracy": 0.9950894266366959, "num_tokens": 758708811.0, "step": 1785 }, { "entropy": 0.01873117871582508, "epoch": 0.7817047817047817, "grad_norm": 9.5, "learning_rate": 2.6096372495939362e-05, "loss": 0.1394, "loss_lm": 0.016872295644134283, "loss_seg": 0.12253772094845772, "mean_token_accuracy": 0.9953129589557648, "num_tokens": 759133557.0, "step": 1786 }, { "entropy": 0.018600191920995712, "epoch": 0.7821424663529927, "grad_norm": 11.0, "learning_rate": 2.6093665403356796e-05, "loss": 0.1811, "loss_lm": 0.016809134976938367, "loss_seg": 0.16428342834115028, "mean_token_accuracy": 0.9952855110168457, "num_tokens": 759558975.0, "step": 1787 }, { "entropy": 0.019299421459436417, "epoch": 0.7825801510012036, "grad_norm": 5.5, "learning_rate": 2.609095831077423e-05, "loss": 0.1759, "loss_lm": 0.014731619274243712, "loss_seg": 0.16116208769381046, "mean_token_accuracy": 0.9952007830142975, "num_tokens": 759984634.0, "step": 1788 }, { "entropy": 0.019345590844750404, "epoch": 0.7830178356494146, "grad_norm": 11.1875, "learning_rate": 2.6088251218191664e-05, "loss": 0.1499, "loss_lm": 0.016110915457829833, "loss_seg": 0.133796826004982, "mean_token_accuracy": 0.9951392561197281, "num_tokens": 760410094.0, "step": 1789 }, { "entropy": 0.019075222313404083, "epoch": 0.7834555202976256, "grad_norm": 12.5625, "learning_rate": 2.6085544125609094e-05, "loss": 0.134, "loss_lm": 0.01605557231232524, "loss_seg": 0.11799236759543419, "mean_token_accuracy": 0.9951628297567368, "num_tokens": 760834468.0, "step": 1790 }, { "entropy": 0.018574581015855074, "epoch": 0.7838932049458365, "grad_norm": 17.75, "learning_rate": 2.608283703302653e-05, "loss": 0.1572, "loss_lm": 0.0174315650947392, "loss_seg": 0.13974625058472157, "mean_token_accuracy": 0.9952697306871414, "num_tokens": 761259344.0, "step": 1791 }, { "entropy": 0.018855747301131487, "epoch": 0.7843308895940475, "grad_norm": 5.6875, "learning_rate": 2.6080129940443965e-05, "loss": 0.1605, "loss_lm": 0.014549097744747996, "loss_seg": 0.14595715887844563, "mean_token_accuracy": 0.9953408688306808, "num_tokens": 761684247.0, "step": 1792 }, { "entropy": 0.01875042961910367, "epoch": 0.7847685742422584, "grad_norm": 7.5625, "learning_rate": 2.60774228478614e-05, "loss": 0.1347, "loss_lm": 0.014969516545534134, "loss_seg": 0.11972338706254959, "mean_token_accuracy": 0.9952094107866287, "num_tokens": 762108387.0, "step": 1793 }, { "entropy": 0.018958329688757658, "epoch": 0.7852062588904695, "grad_norm": 9.0625, "learning_rate": 2.607471575527883e-05, "loss": 0.1706, "loss_lm": 0.020253451075404882, "loss_seg": 0.15035483613610268, "mean_token_accuracy": 0.9952137172222137, "num_tokens": 762533526.0, "step": 1794 }, { "entropy": 0.018823156133294106, "epoch": 0.7856439435386804, "grad_norm": 12.6875, "learning_rate": 2.6072008662696263e-05, "loss": 0.1286, "loss_lm": 0.017556756734848022, "loss_seg": 0.11101194005459547, "mean_token_accuracy": 0.9952994734048843, "num_tokens": 762958146.0, "step": 1795 }, { "entropy": 0.018654344137758017, "epoch": 0.7860816281868913, "grad_norm": 22.375, "learning_rate": 2.60693015701137e-05, "loss": 0.1543, "loss_lm": 0.017272063065320253, "loss_seg": 0.1370512992143631, "mean_token_accuracy": 0.9953970611095428, "num_tokens": 763382588.0, "step": 1796 }, { "entropy": 0.019097966607660055, "epoch": 0.7865193128351023, "grad_norm": 7.09375, "learning_rate": 2.6066594477531133e-05, "loss": 0.2387, "loss_lm": 0.01814967207610607, "loss_seg": 0.22052635252475739, "mean_token_accuracy": 0.9952234774827957, "num_tokens": 763807523.0, "step": 1797 }, { "entropy": 0.018592755310237408, "epoch": 0.7869569974833133, "grad_norm": 7.21875, "learning_rate": 2.6063887384948567e-05, "loss": 0.1458, "loss_lm": 0.015401490731164813, "loss_seg": 0.13043463975191116, "mean_token_accuracy": 0.9953719079494476, "num_tokens": 764233350.0, "step": 1798 }, { "entropy": 0.018674479331821203, "epoch": 0.7873946821315242, "grad_norm": 11.9375, "learning_rate": 2.6061180292365998e-05, "loss": 0.1456, "loss_lm": 0.015530681470409036, "loss_seg": 0.1300448700785637, "mean_token_accuracy": 0.9952555894851685, "num_tokens": 764658227.0, "step": 1799 }, { "entropy": 0.019222037866711617, "epoch": 0.7878323667797352, "grad_norm": 6.28125, "learning_rate": 2.605847319978343e-05, "loss": 0.1402, "loss_lm": 0.017497244523838162, "loss_seg": 0.12274700030684471, "mean_token_accuracy": 0.9950926005840302, "num_tokens": 765082258.0, "step": 1800 }, { "entropy": 0.018446438014507294, "epoch": 0.7882700514279462, "grad_norm": 16.25, "learning_rate": 2.605576610720087e-05, "loss": 0.1308, "loss_lm": 0.014786388026550412, "loss_seg": 0.1159772016108036, "mean_token_accuracy": 0.9952771663665771, "num_tokens": 765507406.0, "step": 1801 }, { "entropy": 0.018685953225940466, "epoch": 0.7887077360761572, "grad_norm": 8.25, "learning_rate": 2.6053059014618302e-05, "loss": 0.1511, "loss_lm": 0.016654783859848976, "loss_seg": 0.13447639718651772, "mean_token_accuracy": 0.9952319413423538, "num_tokens": 765932486.0, "step": 1802 }, { "entropy": 0.018771691247820854, "epoch": 0.7891454207243681, "grad_norm": 9.0, "learning_rate": 2.6050351922035736e-05, "loss": 0.1207, "loss_lm": 0.015937183052301407, "loss_seg": 0.1048058271408081, "mean_token_accuracy": 0.9952404648065567, "num_tokens": 766357915.0, "step": 1803 }, { "entropy": 0.019167295191437006, "epoch": 0.789583105372579, "grad_norm": 9.0625, "learning_rate": 2.6047644829453166e-05, "loss": 0.1332, "loss_lm": 0.015272342134267092, "loss_seg": 0.11792948842048645, "mean_token_accuracy": 0.9952631443738937, "num_tokens": 766783256.0, "step": 1804 }, { "entropy": 0.019226504489779472, "epoch": 0.7900207900207901, "grad_norm": 8.4375, "learning_rate": 2.60449377368706e-05, "loss": 0.1799, "loss_lm": 0.01831785636022687, "loss_seg": 0.16162626072764397, "mean_token_accuracy": 0.9951580911874771, "num_tokens": 767208675.0, "step": 1805 }, { "entropy": 0.01926246052607894, "epoch": 0.790458474669001, "grad_norm": 9.0625, "learning_rate": 2.6042230644288037e-05, "loss": 0.1781, "loss_lm": 0.019581064581871033, "loss_seg": 0.15855004079639912, "mean_token_accuracy": 0.9950627088546753, "num_tokens": 767634621.0, "step": 1806 }, { "entropy": 0.01937775406986475, "epoch": 0.7908961593172119, "grad_norm": 32.5, "learning_rate": 2.603952355170547e-05, "loss": 0.198, "loss_lm": 0.01643887721002102, "loss_seg": 0.18154991045594215, "mean_token_accuracy": 0.9949769526720047, "num_tokens": 768059844.0, "step": 1807 }, { "entropy": 0.01862643100321293, "epoch": 0.791333843965423, "grad_norm": 7.0, "learning_rate": 2.6036816459122904e-05, "loss": 0.1559, "loss_lm": 0.015518069500103593, "loss_seg": 0.14039061032235622, "mean_token_accuracy": 0.9953265935182571, "num_tokens": 768484796.0, "step": 1808 }, { "entropy": 0.01950467098504305, "epoch": 0.7917715286136339, "grad_norm": 10.25, "learning_rate": 2.6034109366540335e-05, "loss": 0.2012, "loss_lm": 0.017939023673534393, "loss_seg": 0.1832248792052269, "mean_token_accuracy": 0.9950785934925079, "num_tokens": 768910049.0, "step": 1809 }, { "entropy": 0.019688953179866076, "epoch": 0.7922092132618448, "grad_norm": 11.1875, "learning_rate": 2.603140227395777e-05, "loss": 0.1247, "loss_lm": 0.01476303394883871, "loss_seg": 0.10998490639030933, "mean_token_accuracy": 0.9951071590185165, "num_tokens": 769335331.0, "step": 1810 }, { "entropy": 0.019662756472826004, "epoch": 0.7926468979100558, "grad_norm": 7.34375, "learning_rate": 2.6028695181375202e-05, "loss": 0.1749, "loss_lm": 0.016469802940264344, "loss_seg": 0.15842275694012642, "mean_token_accuracy": 0.9949263781309128, "num_tokens": 769760433.0, "step": 1811 }, { "entropy": 0.019277056213468313, "epoch": 0.7930845825582667, "grad_norm": 8.3125, "learning_rate": 2.602598808879264e-05, "loss": 0.1526, "loss_lm": 0.016158889746293426, "loss_seg": 0.13641945272684097, "mean_token_accuracy": 0.995109960436821, "num_tokens": 770185158.0, "step": 1812 }, { "entropy": 0.019924697000533342, "epoch": 0.7935222672064778, "grad_norm": 11.375, "learning_rate": 2.6023280996210073e-05, "loss": 0.2062, "loss_lm": 0.020984027534723282, "loss_seg": 0.18517464771866798, "mean_token_accuracy": 0.9949282854795456, "num_tokens": 770610231.0, "step": 1813 }, { "entropy": 0.01913660578429699, "epoch": 0.7939599518546887, "grad_norm": 9.0625, "learning_rate": 2.6020573903627503e-05, "loss": 0.1939, "loss_lm": 0.01642435253597796, "loss_seg": 0.17747456952929497, "mean_token_accuracy": 0.995210587978363, "num_tokens": 771035503.0, "step": 1814 }, { "entropy": 0.01936195231974125, "epoch": 0.7943976365028996, "grad_norm": 9.75, "learning_rate": 2.6017866811044937e-05, "loss": 0.1371, "loss_lm": 0.014477911172434688, "loss_seg": 0.12265797331929207, "mean_token_accuracy": 0.9951953142881393, "num_tokens": 771461505.0, "step": 1815 }, { "entropy": 0.01897867349907756, "epoch": 0.7948353211511107, "grad_norm": 25.625, "learning_rate": 2.601515971846237e-05, "loss": 0.1027, "loss_lm": 0.015543145360425115, "loss_seg": 0.08712451346218586, "mean_token_accuracy": 0.9952580630779266, "num_tokens": 771887243.0, "step": 1816 }, { "entropy": 0.01989617431536317, "epoch": 0.7952730057993216, "grad_norm": 10.6875, "learning_rate": 2.6012452625879808e-05, "loss": 0.1369, "loss_lm": 0.01578102703206241, "loss_seg": 0.1211212258785963, "mean_token_accuracy": 0.9950365424156189, "num_tokens": 772312515.0, "step": 1817 }, { "entropy": 0.019592601340264082, "epoch": 0.7957106904475325, "grad_norm": 10.1875, "learning_rate": 2.600974553329724e-05, "loss": 0.116, "loss_lm": 0.018060300964862108, "loss_seg": 0.09797411225736141, "mean_token_accuracy": 0.9949694126844406, "num_tokens": 772737490.0, "step": 1818 }, { "entropy": 0.01947819348424673, "epoch": 0.7961483750957435, "grad_norm": 12.875, "learning_rate": 2.6007038440714672e-05, "loss": 0.1546, "loss_lm": 0.017737448448315263, "loss_seg": 0.13685751520097256, "mean_token_accuracy": 0.9951690286397934, "num_tokens": 773162636.0, "step": 1819 }, { "entropy": 0.01919860439375043, "epoch": 0.7965860597439545, "grad_norm": 6.75, "learning_rate": 2.6004331348132106e-05, "loss": 0.1699, "loss_lm": 0.016726989299058914, "loss_seg": 0.15315145440399647, "mean_token_accuracy": 0.9952964186668396, "num_tokens": 773587990.0, "step": 1820 }, { "entropy": 0.01893904060125351, "epoch": 0.7970237443921654, "grad_norm": 12.375, "learning_rate": 2.600162425554954e-05, "loss": 0.1503, "loss_lm": 0.016265595331788063, "loss_seg": 0.1340783890336752, "mean_token_accuracy": 0.9953448176383972, "num_tokens": 774013652.0, "step": 1821 }, { "entropy": 0.019030215218663216, "epoch": 0.7974614290403764, "grad_norm": 4.8125, "learning_rate": 2.5998917162966977e-05, "loss": 0.1727, "loss_lm": 0.01701186178252101, "loss_seg": 0.1556842066347599, "mean_token_accuracy": 0.9951311200857162, "num_tokens": 774438380.0, "step": 1822 }, { "entropy": 0.01933687226846814, "epoch": 0.7978991136885873, "grad_norm": 8.0625, "learning_rate": 2.5996210070384407e-05, "loss": 0.187, "loss_lm": 0.016952334670349956, "loss_seg": 0.17008758895099163, "mean_token_accuracy": 0.9950984567403793, "num_tokens": 774862920.0, "step": 1823 }, { "entropy": 0.019238529726862907, "epoch": 0.7983367983367984, "grad_norm": 8.9375, "learning_rate": 2.599350297780184e-05, "loss": 0.1032, "loss_lm": 0.016392223304137588, "loss_seg": 0.08685167133808136, "mean_token_accuracy": 0.9952148944139481, "num_tokens": 775287902.0, "step": 1824 }, { "entropy": 0.01946527836844325, "epoch": 0.7987744829850093, "grad_norm": 16.375, "learning_rate": 2.5990795885219274e-05, "loss": 0.1495, "loss_lm": 0.015735001303255558, "loss_seg": 0.1337154284119606, "mean_token_accuracy": 0.9951680302619934, "num_tokens": 775713377.0, "step": 1825 }, { "entropy": 0.019428046885877848, "epoch": 0.7992121676332202, "grad_norm": 30.75, "learning_rate": 2.5988088792636708e-05, "loss": 0.1152, "loss_lm": 0.015348100801929832, "loss_seg": 0.09981212764978409, "mean_token_accuracy": 0.9952244609594345, "num_tokens": 776138643.0, "step": 1826 }, { "entropy": 0.019806182477623224, "epoch": 0.7996498522814313, "grad_norm": 11.5625, "learning_rate": 2.5985381700054145e-05, "loss": 0.1142, "loss_lm": 0.017402190249413252, "loss_seg": 0.09678527526557446, "mean_token_accuracy": 0.9950454980134964, "num_tokens": 776563669.0, "step": 1827 }, { "entropy": 0.018757040612399578, "epoch": 0.8000875369296422, "grad_norm": 7.25, "learning_rate": 2.5982674607471576e-05, "loss": 0.1645, "loss_lm": 0.01593127055093646, "loss_seg": 0.14861192181706429, "mean_token_accuracy": 0.9952519685029984, "num_tokens": 776988867.0, "step": 1828 }, { "entropy": 0.01921221287921071, "epoch": 0.8005252215778531, "grad_norm": 15.0, "learning_rate": 2.597996751488901e-05, "loss": 0.1553, "loss_lm": 0.016553082736209035, "loss_seg": 0.13878557831048965, "mean_token_accuracy": 0.9951890856027603, "num_tokens": 777414325.0, "step": 1829 }, { "entropy": 0.019353396259248257, "epoch": 0.8009629062260641, "grad_norm": 10.125, "learning_rate": 2.5977260422306443e-05, "loss": 0.159, "loss_lm": 0.01536399289034307, "loss_seg": 0.14363721013069153, "mean_token_accuracy": 0.9951584935188293, "num_tokens": 777839176.0, "step": 1830 }, { "entropy": 0.018756679259240627, "epoch": 0.8014005908742751, "grad_norm": 8.0, "learning_rate": 2.5974553329723877e-05, "loss": 0.1667, "loss_lm": 0.017723673256114125, "loss_seg": 0.14900018274784088, "mean_token_accuracy": 0.9953308403491974, "num_tokens": 778264690.0, "step": 1831 }, { "entropy": 0.01851205714046955, "epoch": 0.8018382755224861, "grad_norm": 10.25, "learning_rate": 2.5971846237141314e-05, "loss": 0.1854, "loss_lm": 0.019152211025357246, "loss_seg": 0.16627534106373787, "mean_token_accuracy": 0.9954289495944977, "num_tokens": 778689714.0, "step": 1832 }, { "entropy": 0.01954803103581071, "epoch": 0.802275960170697, "grad_norm": 12.8125, "learning_rate": 2.5969139144558744e-05, "loss": 0.1324, "loss_lm": 0.015359927900135517, "loss_seg": 0.11700385808944702, "mean_token_accuracy": 0.9951796531677246, "num_tokens": 779115367.0, "step": 1833 }, { "entropy": 0.01924759102985263, "epoch": 0.802713644818908, "grad_norm": 9.8125, "learning_rate": 2.5966432051976178e-05, "loss": 0.1282, "loss_lm": 0.01678728754632175, "loss_seg": 0.11137024126946926, "mean_token_accuracy": 0.9950627088546753, "num_tokens": 779540465.0, "step": 1834 }, { "entropy": 0.018898573238402605, "epoch": 0.803151329467119, "grad_norm": 21.5, "learning_rate": 2.596372495939361e-05, "loss": 0.1321, "loss_lm": 0.014792392030358315, "loss_seg": 0.11725973710417747, "mean_token_accuracy": 0.9951980859041214, "num_tokens": 779965469.0, "step": 1835 }, { "entropy": 0.01890780869871378, "epoch": 0.8035890141153299, "grad_norm": 6.90625, "learning_rate": 2.5961017866811045e-05, "loss": 0.1728, "loss_lm": 0.016340036178007722, "loss_seg": 0.15642202645540237, "mean_token_accuracy": 0.9952083081007004, "num_tokens": 780390527.0, "step": 1836 }, { "entropy": 0.019136385526508093, "epoch": 0.8040266987635408, "grad_norm": 7.0, "learning_rate": 2.5958310774228483e-05, "loss": 0.1641, "loss_lm": 0.017540299333631992, "loss_seg": 0.14657678082585335, "mean_token_accuracy": 0.9951888769865036, "num_tokens": 780815138.0, "step": 1837 }, { "entropy": 0.01895340019837022, "epoch": 0.8044643834117519, "grad_norm": 24.0, "learning_rate": 2.5955603681645913e-05, "loss": 0.134, "loss_lm": 0.015939123462885618, "loss_seg": 0.11807608231902122, "mean_token_accuracy": 0.99516262114048, "num_tokens": 781240132.0, "step": 1838 }, { "entropy": 0.018970553297549486, "epoch": 0.8049020680599628, "grad_norm": 6.65625, "learning_rate": 2.5952896589063347e-05, "loss": 0.1406, "loss_lm": 0.0200568032450974, "loss_seg": 0.12052716873586178, "mean_token_accuracy": 0.995097205042839, "num_tokens": 781665583.0, "step": 1839 }, { "entropy": 0.019556475803256035, "epoch": 0.8053397527081737, "grad_norm": 9.625, "learning_rate": 2.595018949648078e-05, "loss": 0.2137, "loss_lm": 0.01790563017129898, "loss_seg": 0.1958030927926302, "mean_token_accuracy": 0.9951564371585846, "num_tokens": 782091513.0, "step": 1840 }, { "entropy": 0.018889253493398428, "epoch": 0.8057774373563847, "grad_norm": 6.21875, "learning_rate": 2.5947482403898214e-05, "loss": 0.1299, "loss_lm": 0.016215100651606917, "loss_seg": 0.11372353881597519, "mean_token_accuracy": 0.9951730519533157, "num_tokens": 782516367.0, "step": 1841 }, { "entropy": 0.01917858375236392, "epoch": 0.8062151220045957, "grad_norm": 16.5, "learning_rate": 2.5944775311315644e-05, "loss": 0.183, "loss_lm": 0.020682767033576965, "loss_seg": 0.16233650036156178, "mean_token_accuracy": 0.9951453506946564, "num_tokens": 782940688.0, "step": 1842 }, { "entropy": 0.019564708694815636, "epoch": 0.8066528066528067, "grad_norm": 9.3125, "learning_rate": 2.594206821873308e-05, "loss": 0.1598, "loss_lm": 0.014996616635471582, "loss_seg": 0.1448079999536276, "mean_token_accuracy": 0.9951488375663757, "num_tokens": 783365293.0, "step": 1843 }, { "entropy": 0.02005771454423666, "epoch": 0.8070904913010176, "grad_norm": 9.3125, "learning_rate": 2.5939361126150515e-05, "loss": 0.1686, "loss_lm": 0.018793793162330985, "loss_seg": 0.14980080723762512, "mean_token_accuracy": 0.9950139820575714, "num_tokens": 783791075.0, "step": 1844 }, { "entropy": 0.01865261746570468, "epoch": 0.8075281759492285, "grad_norm": 11.5, "learning_rate": 2.593665403356795e-05, "loss": 0.1058, "loss_lm": 0.01684008212760091, "loss_seg": 0.08894579485058784, "mean_token_accuracy": 0.9953816682100296, "num_tokens": 784215635.0, "step": 1845 }, { "entropy": 0.018689594697207212, "epoch": 0.8079658605974396, "grad_norm": 10.5625, "learning_rate": 2.5933946940985383e-05, "loss": 0.1957, "loss_lm": 0.016065263422206044, "loss_seg": 0.17966460064053535, "mean_token_accuracy": 0.9953780621290207, "num_tokens": 784639865.0, "step": 1846 }, { "entropy": 0.019406962674111128, "epoch": 0.8084035452456505, "grad_norm": 18.5, "learning_rate": 2.5931239848402813e-05, "loss": 0.1295, "loss_lm": 0.014612880302593112, "loss_seg": 0.11489930376410484, "mean_token_accuracy": 0.9950554817914963, "num_tokens": 785064555.0, "step": 1847 }, { "entropy": 0.019293634220957756, "epoch": 0.8088412298938614, "grad_norm": 12.375, "learning_rate": 2.592853275582025e-05, "loss": 0.1222, "loss_lm": 0.015589357586577535, "loss_seg": 0.106564462184906, "mean_token_accuracy": 0.9951851069927216, "num_tokens": 785489842.0, "step": 1848 }, { "entropy": 0.018858341965824366, "epoch": 0.8092789145420725, "grad_norm": 10.125, "learning_rate": 2.5925825663237684e-05, "loss": 0.163, "loss_lm": 0.016203513834625483, "loss_seg": 0.14679844118654728, "mean_token_accuracy": 0.995283380150795, "num_tokens": 785914468.0, "step": 1849 }, { "entropy": 0.018920081667602062, "epoch": 0.8097165991902834, "grad_norm": 4.5, "learning_rate": 2.5923118570655118e-05, "loss": 0.1543, "loss_lm": 0.014684384223073721, "loss_seg": 0.1396097131073475, "mean_token_accuracy": 0.9952154755592346, "num_tokens": 786339986.0, "step": 1850 }, { "entropy": 0.019292641896754503, "epoch": 0.8101542838384944, "grad_norm": 15.375, "learning_rate": 2.592041147807255e-05, "loss": 0.115, "loss_lm": 0.01537797343917191, "loss_seg": 0.09960976988077164, "mean_token_accuracy": 0.9951473623514175, "num_tokens": 786764700.0, "step": 1851 }, { "entropy": 0.019518309272825718, "epoch": 0.8105919684867053, "grad_norm": 8.25, "learning_rate": 2.591770438548998e-05, "loss": 0.139, "loss_lm": 0.017628483474254608, "loss_seg": 0.12138871289789677, "mean_token_accuracy": 0.9949930906295776, "num_tokens": 787189059.0, "step": 1852 }, { "entropy": 0.01972431456670165, "epoch": 0.8110296531349163, "grad_norm": 8.375, "learning_rate": 2.591499729290742e-05, "loss": 0.1365, "loss_lm": 0.01860291906632483, "loss_seg": 0.11789558827877045, "mean_token_accuracy": 0.995000496506691, "num_tokens": 787614272.0, "step": 1853 }, { "entropy": 0.01878949161618948, "epoch": 0.8114673377831273, "grad_norm": 7.6875, "learning_rate": 2.5912290200324852e-05, "loss": 0.1414, "loss_lm": 0.01732748793438077, "loss_seg": 0.12408186495304108, "mean_token_accuracy": 0.9953248053789139, "num_tokens": 788039032.0, "step": 1854 }, { "entropy": 0.018806067761033773, "epoch": 0.8119050224313382, "grad_norm": 12.6875, "learning_rate": 2.5909583107742286e-05, "loss": 0.1461, "loss_lm": 0.01650863979011774, "loss_seg": 0.12957601435482502, "mean_token_accuracy": 0.9952515214681625, "num_tokens": 788464830.0, "step": 1855 }, { "entropy": 0.019234132021665573, "epoch": 0.8123427070795491, "grad_norm": 10.4375, "learning_rate": 2.590687601515972e-05, "loss": 0.1286, "loss_lm": 0.017128286650404334, "loss_seg": 0.11144638620316982, "mean_token_accuracy": 0.9950909167528152, "num_tokens": 788889342.0, "step": 1856 }, { "entropy": 0.019282478373497725, "epoch": 0.8127803917277602, "grad_norm": 9.5, "learning_rate": 2.590416892257715e-05, "loss": 0.1326, "loss_lm": 0.015650151995941997, "loss_seg": 0.1169829685240984, "mean_token_accuracy": 0.9951002299785614, "num_tokens": 789314210.0, "step": 1857 }, { "entropy": 0.018619113601744175, "epoch": 0.8132180763759711, "grad_norm": 7.46875, "learning_rate": 2.5901461829994587e-05, "loss": 0.1319, "loss_lm": 0.016932345693930984, "loss_seg": 0.11493361555039883, "mean_token_accuracy": 0.9953542351722717, "num_tokens": 789739277.0, "step": 1858 }, { "entropy": 0.019546797033399343, "epoch": 0.813655761024182, "grad_norm": 6.59375, "learning_rate": 2.589875473741202e-05, "loss": 0.1758, "loss_lm": 0.018449201248586178, "loss_seg": 0.15732620656490326, "mean_token_accuracy": 0.9951034933328629, "num_tokens": 790164669.0, "step": 1859 }, { "entropy": 0.019179250113666058, "epoch": 0.8140934456723931, "grad_norm": 11.875, "learning_rate": 2.5896047644829455e-05, "loss": 0.1534, "loss_lm": 0.017085799481719732, "loss_seg": 0.13630736991763115, "mean_token_accuracy": 0.9951113760471344, "num_tokens": 790589915.0, "step": 1860 }, { "entropy": 0.01884925365447998, "epoch": 0.814531130320604, "grad_norm": 12.625, "learning_rate": 2.589334055224689e-05, "loss": 0.1282, "loss_lm": 0.015586461406201124, "loss_seg": 0.11265330202877522, "mean_token_accuracy": 0.9952019602060318, "num_tokens": 791015660.0, "step": 1861 }, { "entropy": 0.01896765176206827, "epoch": 0.814968814968815, "grad_norm": 9.75, "learning_rate": 2.589063345966432e-05, "loss": 0.1366, "loss_lm": 0.014767515705898404, "loss_seg": 0.12186661548912525, "mean_token_accuracy": 0.9951344579458237, "num_tokens": 791440823.0, "step": 1862 }, { "entropy": 0.019116787239909172, "epoch": 0.8154064996170259, "grad_norm": 20.875, "learning_rate": 2.5887926367081756e-05, "loss": 0.1078, "loss_lm": 0.014840566087514162, "loss_seg": 0.09299733489751816, "mean_token_accuracy": 0.9951840937137604, "num_tokens": 791865850.0, "step": 1863 }, { "entropy": 0.018553385976701975, "epoch": 0.8158441842652369, "grad_norm": 6.40625, "learning_rate": 2.588521927449919e-05, "loss": 0.129, "loss_lm": 0.0162917859852314, "loss_seg": 0.11271937750279903, "mean_token_accuracy": 0.995308056473732, "num_tokens": 792291016.0, "step": 1864 }, { "entropy": 0.019086224492639303, "epoch": 0.8162818689134479, "grad_norm": 11.625, "learning_rate": 2.5882512181916623e-05, "loss": 0.166, "loss_lm": 0.014798277989029884, "loss_seg": 0.15121660381555557, "mean_token_accuracy": 0.9952884912490845, "num_tokens": 792715231.0, "step": 1865 }, { "entropy": 0.019598865415900946, "epoch": 0.8167195535616588, "grad_norm": 11.0625, "learning_rate": 2.5879805089334054e-05, "loss": 0.1357, "loss_lm": 0.01637342688627541, "loss_seg": 0.11927886679768562, "mean_token_accuracy": 0.9950295686721802, "num_tokens": 793139726.0, "step": 1866 }, { "entropy": 0.020016161259263754, "epoch": 0.8171572382098697, "grad_norm": 11.1875, "learning_rate": 2.5877097996751488e-05, "loss": 0.1147, "loss_lm": 0.01706355018541217, "loss_seg": 0.09763774648308754, "mean_token_accuracy": 0.9948827177286148, "num_tokens": 793565142.0, "step": 1867 }, { "entropy": 0.018934348598122597, "epoch": 0.8175949228580808, "grad_norm": 18.875, "learning_rate": 2.5874390904168925e-05, "loss": 0.1843, "loss_lm": 0.01547298370860517, "loss_seg": 0.1687801517546177, "mean_token_accuracy": 0.9952411502599716, "num_tokens": 793990844.0, "step": 1868 }, { "entropy": 0.018723455257713795, "epoch": 0.8180326075062917, "grad_norm": 12.5625, "learning_rate": 2.587168381158636e-05, "loss": 0.1711, "loss_lm": 0.016058190260082483, "loss_seg": 0.15502681583166122, "mean_token_accuracy": 0.9953581690788269, "num_tokens": 794416430.0, "step": 1869 }, { "entropy": 0.019297962076961994, "epoch": 0.8184702921545027, "grad_norm": 11.1875, "learning_rate": 2.5868976719003792e-05, "loss": 0.1588, "loss_lm": 0.01776331989094615, "loss_seg": 0.14102045074105263, "mean_token_accuracy": 0.9951668232679367, "num_tokens": 794841790.0, "step": 1870 }, { "entropy": 0.019080262165516615, "epoch": 0.8189079768027137, "grad_norm": 9.75, "learning_rate": 2.5866269626421222e-05, "loss": 0.1805, "loss_lm": 0.016936237225309014, "loss_seg": 0.16358710080385208, "mean_token_accuracy": 0.9951981157064438, "num_tokens": 795267193.0, "step": 1871 }, { "entropy": 0.018748587928712368, "epoch": 0.8193456614509246, "grad_norm": 12.875, "learning_rate": 2.5863562533838656e-05, "loss": 0.1396, "loss_lm": 0.016774797346442938, "loss_seg": 0.12282963842153549, "mean_token_accuracy": 0.9952608942985535, "num_tokens": 795691715.0, "step": 1872 }, { "entropy": 0.019018675666302443, "epoch": 0.8197833460991356, "grad_norm": 18.25, "learning_rate": 2.5860855441256093e-05, "loss": 0.089, "loss_lm": 0.016726209316402674, "loss_seg": 0.07230453565716743, "mean_token_accuracy": 0.9951158761978149, "num_tokens": 796116471.0, "step": 1873 }, { "entropy": 0.0187861705198884, "epoch": 0.8202210307473465, "grad_norm": 9.1875, "learning_rate": 2.5858148348673527e-05, "loss": 0.1257, "loss_lm": 0.013952352106571198, "loss_seg": 0.11172907240688801, "mean_token_accuracy": 0.995185062289238, "num_tokens": 796541540.0, "step": 1874 }, { "entropy": 0.019486572593450546, "epoch": 0.8206587153955575, "grad_norm": 8.75, "learning_rate": 2.585544125609096e-05, "loss": 0.1813, "loss_lm": 0.019656226970255375, "loss_seg": 0.16162783652544022, "mean_token_accuracy": 0.9949460923671722, "num_tokens": 796966587.0, "step": 1875 }, { "entropy": 0.019150893669575453, "epoch": 0.8210964000437685, "grad_norm": 22.375, "learning_rate": 2.585273416350839e-05, "loss": 0.1667, "loss_lm": 0.016806978499516845, "loss_seg": 0.1499217301607132, "mean_token_accuracy": 0.995227187871933, "num_tokens": 797391820.0, "step": 1876 }, { "entropy": 0.019599666818976402, "epoch": 0.8215340846919794, "grad_norm": 9.3125, "learning_rate": 2.5850027070925825e-05, "loss": 0.1474, "loss_lm": 0.01571726822294295, "loss_seg": 0.1316453143954277, "mean_token_accuracy": 0.9951091706752777, "num_tokens": 797816864.0, "step": 1877 }, { "entropy": 0.01900150114670396, "epoch": 0.8219717693401903, "grad_norm": 7.65625, "learning_rate": 2.584731997834326e-05, "loss": 0.1373, "loss_lm": 0.014593883650377393, "loss_seg": 0.12274621613323689, "mean_token_accuracy": 0.9952063709497452, "num_tokens": 798241359.0, "step": 1878 }, { "entropy": 0.019093035720288754, "epoch": 0.8224094539884014, "grad_norm": 9.125, "learning_rate": 2.5844612885760696e-05, "loss": 0.1158, "loss_lm": 0.015184410382062197, "loss_seg": 0.10057393088936806, "mean_token_accuracy": 0.9952426850795746, "num_tokens": 798665943.0, "step": 1879 }, { "entropy": 0.019155631307512522, "epoch": 0.8228471386366123, "grad_norm": 5.65625, "learning_rate": 2.584190579317813e-05, "loss": 0.154, "loss_lm": 0.017208164324983954, "loss_seg": 0.13676435314118862, "mean_token_accuracy": 0.995074063539505, "num_tokens": 799091026.0, "step": 1880 }, { "entropy": 0.018904326017946005, "epoch": 0.8232848232848233, "grad_norm": 10.0625, "learning_rate": 2.583919870059556e-05, "loss": 0.1556, "loss_lm": 0.016946921357885003, "loss_seg": 0.13866417855024338, "mean_token_accuracy": 0.9952771663665771, "num_tokens": 799515311.0, "step": 1881 }, { "entropy": 0.01922968728467822, "epoch": 0.8237225079330343, "grad_norm": 17.75, "learning_rate": 2.5836491608012993e-05, "loss": 0.1463, "loss_lm": 0.0176658951677382, "loss_seg": 0.12862450629472733, "mean_token_accuracy": 0.9952639043331146, "num_tokens": 799939867.0, "step": 1882 }, { "entropy": 0.019033003132790327, "epoch": 0.8241601925812452, "grad_norm": 12.3125, "learning_rate": 2.5833784515430427e-05, "loss": 0.1355, "loss_lm": 0.014583223965018988, "loss_seg": 0.1208710316568613, "mean_token_accuracy": 0.9952219575643539, "num_tokens": 800365292.0, "step": 1883 }, { "entropy": 0.01895188447088003, "epoch": 0.8245978772294562, "grad_norm": 27.375, "learning_rate": 2.5831077422847864e-05, "loss": 0.171, "loss_lm": 0.01857843855395913, "loss_seg": 0.15237562358379364, "mean_token_accuracy": 0.9953236281871796, "num_tokens": 800790332.0, "step": 1884 }, { "entropy": 0.019916626624763012, "epoch": 0.8250355618776671, "grad_norm": 9.1875, "learning_rate": 2.5828370330265298e-05, "loss": 0.1233, "loss_lm": 0.01569947530515492, "loss_seg": 0.10764932632446289, "mean_token_accuracy": 0.9950021952390671, "num_tokens": 801215299.0, "step": 1885 }, { "entropy": 0.019270699936896563, "epoch": 0.8254732465258781, "grad_norm": 19.625, "learning_rate": 2.582566323768273e-05, "loss": 0.1254, "loss_lm": 0.016249347245320678, "loss_seg": 0.10911862552165985, "mean_token_accuracy": 0.9951462745666504, "num_tokens": 801640728.0, "step": 1886 }, { "entropy": 0.019175410736352205, "epoch": 0.8259109311740891, "grad_norm": 11.25, "learning_rate": 2.5822956145100162e-05, "loss": 0.1728, "loss_lm": 0.0184716684743762, "loss_seg": 0.15428913570940495, "mean_token_accuracy": 0.9952511787414551, "num_tokens": 802066276.0, "step": 1887 }, { "entropy": 0.01892786705866456, "epoch": 0.8263486158223, "grad_norm": 12.5, "learning_rate": 2.5820249052517596e-05, "loss": 0.1742, "loss_lm": 0.016416372265666723, "loss_seg": 0.15776389092206955, "mean_token_accuracy": 0.9952397346496582, "num_tokens": 802491394.0, "step": 1888 }, { "entropy": 0.019238942302763462, "epoch": 0.826786300470511, "grad_norm": 10.75, "learning_rate": 2.5817541959935033e-05, "loss": 0.1366, "loss_lm": 0.0163821573369205, "loss_seg": 0.12023917399346828, "mean_token_accuracy": 0.9951348155736923, "num_tokens": 802917013.0, "step": 1889 }, { "entropy": 0.01921773422509432, "epoch": 0.827223985118722, "grad_norm": 5.21875, "learning_rate": 2.5814834867352463e-05, "loss": 0.1478, "loss_lm": 0.0182935344055295, "loss_seg": 0.12948790937662125, "mean_token_accuracy": 0.9951111972332001, "num_tokens": 803342243.0, "step": 1890 }, { "entropy": 0.0187407867051661, "epoch": 0.8276616697669329, "grad_norm": 28.75, "learning_rate": 2.5812127774769897e-05, "loss": 0.1587, "loss_lm": 0.016938136657699943, "loss_seg": 0.14179634302854538, "mean_token_accuracy": 0.9953212291002274, "num_tokens": 803767518.0, "step": 1891 }, { "entropy": 0.019085250329226255, "epoch": 0.8280993544151439, "grad_norm": 81.5, "learning_rate": 2.580942068218733e-05, "loss": 0.1329, "loss_lm": 0.014861556002870202, "loss_seg": 0.11805539298802614, "mean_token_accuracy": 0.9952107816934586, "num_tokens": 804192186.0, "step": 1892 }, { "entropy": 0.01872615935280919, "epoch": 0.8285370390633549, "grad_norm": 10.3125, "learning_rate": 2.5806713589604764e-05, "loss": 0.1268, "loss_lm": 0.01580854062922299, "loss_seg": 0.11094613932073116, "mean_token_accuracy": 0.9952391386032104, "num_tokens": 804617254.0, "step": 1893 }, { "entropy": 0.019266735296696424, "epoch": 0.8289747237115658, "grad_norm": 8.875, "learning_rate": 2.58040064970222e-05, "loss": 0.1653, "loss_lm": 0.01468740007840097, "loss_seg": 0.15061852522194386, "mean_token_accuracy": 0.9951981157064438, "num_tokens": 805042848.0, "step": 1894 }, { "entropy": 0.019196640700101852, "epoch": 0.8294124083597768, "grad_norm": 10.0, "learning_rate": 2.5801299404439632e-05, "loss": 0.1223, "loss_lm": 0.01660587638616562, "loss_seg": 0.10573671385645866, "mean_token_accuracy": 0.9951928704977036, "num_tokens": 805466997.0, "step": 1895 }, { "entropy": 0.01988794794306159, "epoch": 0.8298500930079877, "grad_norm": 11.5625, "learning_rate": 2.5798592311857066e-05, "loss": 0.1548, "loss_lm": 0.014652594923973083, "loss_seg": 0.14009783416986465, "mean_token_accuracy": 0.9951063990592957, "num_tokens": 805892840.0, "step": 1896 }, { "entropy": 0.019557053223252296, "epoch": 0.8302877776561987, "grad_norm": 8.1875, "learning_rate": 2.57958852192745e-05, "loss": 0.1669, "loss_lm": 0.015634298091754317, "loss_seg": 0.15128694102168083, "mean_token_accuracy": 0.9951348900794983, "num_tokens": 806318287.0, "step": 1897 }, { "entropy": 0.018951515201479197, "epoch": 0.8307254623044097, "grad_norm": 15.625, "learning_rate": 2.5793178126691933e-05, "loss": 0.1473, "loss_lm": 0.014363228809088469, "loss_seg": 0.13290681689977646, "mean_token_accuracy": 0.9952378422021866, "num_tokens": 806743475.0, "step": 1898 }, { "entropy": 0.018762752413749695, "epoch": 0.8311631469526206, "grad_norm": 15.375, "learning_rate": 2.579047103410937e-05, "loss": 0.1248, "loss_lm": 0.014784569852054119, "loss_seg": 0.10997649282217026, "mean_token_accuracy": 0.9953199625015259, "num_tokens": 807168667.0, "step": 1899 }, { "entropy": 0.01971504930406809, "epoch": 0.8316008316008316, "grad_norm": 11.3125, "learning_rate": 2.57877639415268e-05, "loss": 0.1436, "loss_lm": 0.016715554054826498, "loss_seg": 0.12690279819071293, "mean_token_accuracy": 0.9951744675636292, "num_tokens": 807593440.0, "step": 1900 }, { "entropy": 0.01962537318468094, "epoch": 0.8320385162490426, "grad_norm": 9.4375, "learning_rate": 2.5785056848944234e-05, "loss": 0.1212, "loss_lm": 0.018481845501810312, "loss_seg": 0.1027086190879345, "mean_token_accuracy": 0.995123490691185, "num_tokens": 808019103.0, "step": 1901 }, { "entropy": 0.01890969416126609, "epoch": 0.8324762008972535, "grad_norm": 10.6875, "learning_rate": 2.5782349756361668e-05, "loss": 0.1245, "loss_lm": 0.016427198890596628, "loss_seg": 0.10809308104217052, "mean_token_accuracy": 0.995345950126648, "num_tokens": 808443649.0, "step": 1902 }, { "entropy": 0.019180889707058668, "epoch": 0.8329138855454645, "grad_norm": 10.0, "learning_rate": 2.57796426637791e-05, "loss": 0.2133, "loss_lm": 0.01500376546755433, "loss_seg": 0.19825803115963936, "mean_token_accuracy": 0.9951799809932709, "num_tokens": 808868516.0, "step": 1903 }, { "entropy": 0.01893705315887928, "epoch": 0.8333515701936755, "grad_norm": 10.25, "learning_rate": 2.577693557119654e-05, "loss": 0.1001, "loss_lm": 0.015263769077137113, "loss_seg": 0.08486807160079479, "mean_token_accuracy": 0.99522465467453, "num_tokens": 809293795.0, "step": 1904 }, { "entropy": 0.019377371296286583, "epoch": 0.8337892548418864, "grad_norm": 10.4375, "learning_rate": 2.577422847861397e-05, "loss": 0.1547, "loss_lm": 0.016146064270287752, "loss_seg": 0.13856822159141302, "mean_token_accuracy": 0.995108351111412, "num_tokens": 809719388.0, "step": 1905 }, { "entropy": 0.01894402364268899, "epoch": 0.8342269394900974, "grad_norm": 10.9375, "learning_rate": 2.5771521386031403e-05, "loss": 0.1396, "loss_lm": 0.015989233972504735, "loss_seg": 0.12361301854252815, "mean_token_accuracy": 0.9952324032783508, "num_tokens": 810145144.0, "step": 1906 }, { "entropy": 0.01953070191666484, "epoch": 0.8346646241383083, "grad_norm": 8.125, "learning_rate": 2.5768814293448837e-05, "loss": 0.1736, "loss_lm": 0.018670415971428156, "loss_seg": 0.1549365036189556, "mean_token_accuracy": 0.9950689524412155, "num_tokens": 810570395.0, "step": 1907 }, { "entropy": 0.018554523587226868, "epoch": 0.8351023087865194, "grad_norm": 8.1875, "learning_rate": 2.576610720086627e-05, "loss": 0.152, "loss_lm": 0.01455302070826292, "loss_seg": 0.1374823935329914, "mean_token_accuracy": 0.995411604642868, "num_tokens": 810994890.0, "step": 1908 }, { "entropy": 0.01908372761681676, "epoch": 0.8355399934347303, "grad_norm": 5.375, "learning_rate": 2.57634001082837e-05, "loss": 0.1299, "loss_lm": 0.01675175060518086, "loss_seg": 0.11311956495046616, "mean_token_accuracy": 0.9952030777931213, "num_tokens": 811420104.0, "step": 1909 }, { "entropy": 0.01951363915577531, "epoch": 0.8359776780829412, "grad_norm": 14.6875, "learning_rate": 2.5760693015701138e-05, "loss": 0.1302, "loss_lm": 0.018938868306577206, "loss_seg": 0.1112319864332676, "mean_token_accuracy": 0.99505415558815, "num_tokens": 811845504.0, "step": 1910 }, { "entropy": 0.019370978698134422, "epoch": 0.8364153627311522, "grad_norm": 16.0, "learning_rate": 2.575798592311857e-05, "loss": 0.1488, "loss_lm": 0.01546702510677278, "loss_seg": 0.1333239283412695, "mean_token_accuracy": 0.9951105266809464, "num_tokens": 812270282.0, "step": 1911 }, { "entropy": 0.019211302511394024, "epoch": 0.8368530473793632, "grad_norm": 17.25, "learning_rate": 2.5755278830536005e-05, "loss": 0.1649, "loss_lm": 0.016860229894518852, "loss_seg": 0.14806048199534416, "mean_token_accuracy": 0.99516960978508, "num_tokens": 812695697.0, "step": 1912 }, { "entropy": 0.019558792002499104, "epoch": 0.8372907320275741, "grad_norm": 14.375, "learning_rate": 2.575257173795344e-05, "loss": 0.1654, "loss_lm": 0.017007023794576526, "loss_seg": 0.14836416393518448, "mean_token_accuracy": 0.9950509518384933, "num_tokens": 813119805.0, "step": 1913 }, { "entropy": 0.019017440732568502, "epoch": 0.8377284166757851, "grad_norm": 12.125, "learning_rate": 2.574986464537087e-05, "loss": 0.1379, "loss_lm": 0.01635135058313608, "loss_seg": 0.12154443189501762, "mean_token_accuracy": 0.9952978491783142, "num_tokens": 813545530.0, "step": 1914 }, { "entropy": 0.019086308777332306, "epoch": 0.838166101323996, "grad_norm": 9.375, "learning_rate": 2.5747157552788306e-05, "loss": 0.1881, "loss_lm": 0.017520014895126224, "loss_seg": 0.17055078595876694, "mean_token_accuracy": 0.9951363354921341, "num_tokens": 813970698.0, "step": 1915 }, { "entropy": 0.019501245580613613, "epoch": 0.838603785972207, "grad_norm": 6.875, "learning_rate": 2.574445046020574e-05, "loss": 0.1159, "loss_lm": 0.018552303314208984, "loss_seg": 0.09730887413024902, "mean_token_accuracy": 0.9950965046882629, "num_tokens": 814396435.0, "step": 1916 }, { "entropy": 0.01982824644073844, "epoch": 0.839041470620418, "grad_norm": 9.125, "learning_rate": 2.5741743367623174e-05, "loss": 0.1878, "loss_lm": 0.016925853211432695, "loss_seg": 0.1708802543580532, "mean_token_accuracy": 0.9950315654277802, "num_tokens": 814822281.0, "step": 1917 }, { "entropy": 0.019121864810585976, "epoch": 0.8394791552686289, "grad_norm": 5.53125, "learning_rate": 2.5739036275040608e-05, "loss": 0.145, "loss_lm": 0.016208574874326587, "loss_seg": 0.12879245169460773, "mean_token_accuracy": 0.9951593726873398, "num_tokens": 815247394.0, "step": 1918 }, { "entropy": 0.01942156907171011, "epoch": 0.83991683991684, "grad_norm": 12.6875, "learning_rate": 2.5736329182458038e-05, "loss": 0.1547, "loss_lm": 0.016422336222603917, "loss_seg": 0.1382303647696972, "mean_token_accuracy": 0.9950017482042313, "num_tokens": 815672514.0, "step": 1919 }, { "entropy": 0.01942607667297125, "epoch": 0.8403545245650509, "grad_norm": 10.5, "learning_rate": 2.5733622089875475e-05, "loss": 0.1203, "loss_lm": 0.01737336185760796, "loss_seg": 0.10290057398378849, "mean_token_accuracy": 0.9950617849826813, "num_tokens": 816097933.0, "step": 1920 }, { "entropy": 0.018630616832524538, "epoch": 0.8407922092132618, "grad_norm": 15.5625, "learning_rate": 2.573091499729291e-05, "loss": 0.1459, "loss_lm": 0.01445306371897459, "loss_seg": 0.13149412348866463, "mean_token_accuracy": 0.9952086955308914, "num_tokens": 816523057.0, "step": 1921 }, { "entropy": 0.01897530583664775, "epoch": 0.8412298938614728, "grad_norm": 10.125, "learning_rate": 2.5728207904710342e-05, "loss": 0.1557, "loss_lm": 0.01718023745343089, "loss_seg": 0.13852661475539207, "mean_token_accuracy": 0.9951557815074921, "num_tokens": 816947674.0, "step": 1922 }, { "entropy": 0.01902478840202093, "epoch": 0.8416675785096838, "grad_norm": 5.5625, "learning_rate": 2.5725500812127776e-05, "loss": 0.1639, "loss_lm": 0.016109816264361143, "loss_seg": 0.14775611087679863, "mean_token_accuracy": 0.9952969402074814, "num_tokens": 817372044.0, "step": 1923 }, { "entropy": 0.019383105915039778, "epoch": 0.8421052631578947, "grad_norm": 11.6875, "learning_rate": 2.5722793719545207e-05, "loss": 0.1278, "loss_lm": 0.01553038414567709, "loss_seg": 0.11224626004695892, "mean_token_accuracy": 0.9951651841402054, "num_tokens": 817797248.0, "step": 1924 }, { "entropy": 0.019240327645093203, "epoch": 0.8425429478061057, "grad_norm": 16.25, "learning_rate": 2.5720086626962644e-05, "loss": 0.1086, "loss_lm": 0.015692028449848294, "loss_seg": 0.09291866514831781, "mean_token_accuracy": 0.9952063262462616, "num_tokens": 818222129.0, "step": 1925 }, { "entropy": 0.018808597698807716, "epoch": 0.8429806324543166, "grad_norm": 15.3125, "learning_rate": 2.5717379534380077e-05, "loss": 0.1493, "loss_lm": 0.015213438542559743, "loss_seg": 0.1341025624424219, "mean_token_accuracy": 0.9952371567487717, "num_tokens": 818646846.0, "step": 1926 }, { "entropy": 0.019117756746709347, "epoch": 0.8434183171025277, "grad_norm": 8.75, "learning_rate": 2.571467244179751e-05, "loss": 0.0953, "loss_lm": 0.014809819404035807, "loss_seg": 0.08048944920301437, "mean_token_accuracy": 0.995189905166626, "num_tokens": 819071967.0, "step": 1927 }, { "entropy": 0.01876638038083911, "epoch": 0.8438560017507386, "grad_norm": 6.1875, "learning_rate": 2.5711965349214945e-05, "loss": 0.1829, "loss_lm": 0.016812630696222186, "loss_seg": 0.16610638052225113, "mean_token_accuracy": 0.9952554106712341, "num_tokens": 819496174.0, "step": 1928 }, { "entropy": 0.0188726088963449, "epoch": 0.8442936863989495, "grad_norm": 8.0625, "learning_rate": 2.5709258256632375e-05, "loss": 0.124, "loss_lm": 0.017032264731824398, "loss_seg": 0.10700828954577446, "mean_token_accuracy": 0.9952666461467743, "num_tokens": 819920767.0, "step": 1929 }, { "entropy": 0.01863311743363738, "epoch": 0.8447313710471606, "grad_norm": 22.875, "learning_rate": 2.5706551164049812e-05, "loss": 0.1696, "loss_lm": 0.016150051960721612, "loss_seg": 0.1534950602799654, "mean_token_accuracy": 0.9952995479106903, "num_tokens": 820345575.0, "step": 1930 }, { "entropy": 0.018998026847839355, "epoch": 0.8451690556953715, "grad_norm": 21.0, "learning_rate": 2.5703844071467246e-05, "loss": 0.1691, "loss_lm": 0.01567698922008276, "loss_seg": 0.15343131497502327, "mean_token_accuracy": 0.9952444136142731, "num_tokens": 820770906.0, "step": 1931 }, { "entropy": 0.01888369210064411, "epoch": 0.8456067403435824, "grad_norm": 9.625, "learning_rate": 2.570113697888468e-05, "loss": 0.2065, "loss_lm": 0.017919821897521615, "loss_seg": 0.1885800715535879, "mean_token_accuracy": 0.9953112155199051, "num_tokens": 821195927.0, "step": 1932 }, { "entropy": 0.01999238319694996, "epoch": 0.8460444249917934, "grad_norm": 6.96875, "learning_rate": 2.569842988630211e-05, "loss": 0.1143, "loss_lm": 0.015201248228549957, "loss_seg": 0.09908457659184933, "mean_token_accuracy": 0.9949917495250702, "num_tokens": 821621206.0, "step": 1933 }, { "entropy": 0.01891676289960742, "epoch": 0.8464821096400044, "grad_norm": 10.6875, "learning_rate": 2.5695722793719544e-05, "loss": 0.154, "loss_lm": 0.015090132365003228, "loss_seg": 0.13892753794789314, "mean_token_accuracy": 0.9953853040933609, "num_tokens": 822045888.0, "step": 1934 }, { "entropy": 0.019393695518374443, "epoch": 0.8469197942882153, "grad_norm": 8.625, "learning_rate": 2.569301570113698e-05, "loss": 0.1534, "loss_lm": 0.014696795493364334, "loss_seg": 0.13871514052152634, "mean_token_accuracy": 0.9951183199882507, "num_tokens": 822471545.0, "step": 1935 }, { "entropy": 0.019240204710513353, "epoch": 0.8473574789364263, "grad_norm": 6.1875, "learning_rate": 2.5690308608554415e-05, "loss": 0.1814, "loss_lm": 0.017041785875335336, "loss_seg": 0.1644025593996048, "mean_token_accuracy": 0.9950748383998871, "num_tokens": 822897094.0, "step": 1936 }, { "entropy": 0.018699086271226406, "epoch": 0.8477951635846372, "grad_norm": 10.6875, "learning_rate": 2.568760151597185e-05, "loss": 0.0971, "loss_lm": 0.014924595598131418, "loss_seg": 0.08214343711733818, "mean_token_accuracy": 0.9952318221330643, "num_tokens": 823321749.0, "step": 1937 }, { "entropy": 0.01905956258997321, "epoch": 0.8482328482328483, "grad_norm": 13.4375, "learning_rate": 2.568489442338928e-05, "loss": 0.1125, "loss_lm": 0.017179520335048437, "loss_seg": 0.09536840487271547, "mean_token_accuracy": 0.9952181130647659, "num_tokens": 823746837.0, "step": 1938 }, { "entropy": 0.0192937427200377, "epoch": 0.8486705328810592, "grad_norm": 8.3125, "learning_rate": 2.5682187330806712e-05, "loss": 0.1491, "loss_lm": 0.016342435032129288, "loss_seg": 0.13279392756521702, "mean_token_accuracy": 0.9952594935894012, "num_tokens": 824171238.0, "step": 1939 }, { "entropy": 0.019973149988800287, "epoch": 0.8491082175292701, "grad_norm": 13.5, "learning_rate": 2.5679480238224146e-05, "loss": 0.1486, "loss_lm": 0.01834485912695527, "loss_seg": 0.13026674091815948, "mean_token_accuracy": 0.9949288666248322, "num_tokens": 824596189.0, "step": 1940 }, { "entropy": 0.019416483119130135, "epoch": 0.8495459021774812, "grad_norm": 5.65625, "learning_rate": 2.5676773145641583e-05, "loss": 0.1578, "loss_lm": 0.01792638679035008, "loss_seg": 0.1399019043892622, "mean_token_accuracy": 0.9951205253601074, "num_tokens": 825021608.0, "step": 1941 }, { "entropy": 0.018915013410151005, "epoch": 0.8499835868256921, "grad_norm": 6.78125, "learning_rate": 2.5674066053059017e-05, "loss": 0.1698, "loss_lm": 0.015301387058570981, "loss_seg": 0.15452897176146507, "mean_token_accuracy": 0.9952030777931213, "num_tokens": 825446636.0, "step": 1942 }, { "entropy": 0.019152655266225338, "epoch": 0.850421271473903, "grad_norm": 15.8125, "learning_rate": 2.5671358960476447e-05, "loss": 0.1513, "loss_lm": 0.015850319992750883, "loss_seg": 0.13546374440193176, "mean_token_accuracy": 0.9951393455266953, "num_tokens": 825872277.0, "step": 1943 }, { "entropy": 0.019117174670100212, "epoch": 0.850858956122114, "grad_norm": 11.75, "learning_rate": 2.566865186789388e-05, "loss": 0.115, "loss_lm": 0.01725736609660089, "loss_seg": 0.09771647956222296, "mean_token_accuracy": 0.9951494783163071, "num_tokens": 826296939.0, "step": 1944 }, { "entropy": 0.01942270901054144, "epoch": 0.851296640770325, "grad_norm": 11.3125, "learning_rate": 2.5665944775311315e-05, "loss": 0.1656, "loss_lm": 0.015208994038403034, "loss_seg": 0.1503551471978426, "mean_token_accuracy": 0.9950648099184036, "num_tokens": 826721229.0, "step": 1945 }, { "entropy": 0.019380586687475443, "epoch": 0.851734325418536, "grad_norm": 7.5625, "learning_rate": 2.5663237682728752e-05, "loss": 0.1936, "loss_lm": 0.018517363583669066, "loss_seg": 0.17504924908280373, "mean_token_accuracy": 0.9951651394367218, "num_tokens": 827146029.0, "step": 1946 }, { "entropy": 0.019902191124856472, "epoch": 0.8521720100667469, "grad_norm": 9.875, "learning_rate": 2.5660530590146186e-05, "loss": 0.1639, "loss_lm": 0.018799783196300268, "loss_seg": 0.14513373747467995, "mean_token_accuracy": 0.9949743151664734, "num_tokens": 827571412.0, "step": 1947 }, { "entropy": 0.01906796172261238, "epoch": 0.8526096947149578, "grad_norm": 9.125, "learning_rate": 2.5657823497563616e-05, "loss": 0.1584, "loss_lm": 0.015279147308319807, "loss_seg": 0.14315886422991753, "mean_token_accuracy": 0.9952830225229263, "num_tokens": 827996207.0, "step": 1948 }, { "entropy": 0.018942557275295258, "epoch": 0.8530473793631689, "grad_norm": 11.0, "learning_rate": 2.565511640498105e-05, "loss": 0.1439, "loss_lm": 0.016091420548036695, "loss_seg": 0.12782644294202328, "mean_token_accuracy": 0.9952550232410431, "num_tokens": 828420416.0, "step": 1949 }, { "entropy": 0.01944596553221345, "epoch": 0.8534850640113798, "grad_norm": 15.1875, "learning_rate": 2.5652409312398483e-05, "loss": 0.1446, "loss_lm": 0.01684889430180192, "loss_seg": 0.1277236044406891, "mean_token_accuracy": 0.9950304925441742, "num_tokens": 828845768.0, "step": 1950 }, { "entropy": 0.019332728814333677, "epoch": 0.8539227486595907, "grad_norm": 11.25, "learning_rate": 2.564970221981592e-05, "loss": 0.2198, "loss_lm": 0.01685482752509415, "loss_seg": 0.20299139246344566, "mean_token_accuracy": 0.9950762093067169, "num_tokens": 829270598.0, "step": 1951 }, { "entropy": 0.019447740633040667, "epoch": 0.8543604333078018, "grad_norm": 8.5, "learning_rate": 2.5646995127233354e-05, "loss": 0.1331, "loss_lm": 0.016484029358252883, "loss_seg": 0.11659306287765503, "mean_token_accuracy": 0.9950680583715439, "num_tokens": 829695552.0, "step": 1952 }, { "entropy": 0.020074656698852777, "epoch": 0.8547981179560127, "grad_norm": 11.9375, "learning_rate": 2.5644288034650785e-05, "loss": 0.1424, "loss_lm": 0.017899127677083015, "loss_seg": 0.12448978051543236, "mean_token_accuracy": 0.9949344992637634, "num_tokens": 830119585.0, "step": 1953 }, { "entropy": 0.01898353174328804, "epoch": 0.8552358026042236, "grad_norm": 9.625, "learning_rate": 2.5641580942068218e-05, "loss": 0.1834, "loss_lm": 0.017996276263147593, "loss_seg": 0.16544963791966438, "mean_token_accuracy": 0.9952114224433899, "num_tokens": 830544746.0, "step": 1954 }, { "entropy": 0.018902351148426533, "epoch": 0.8556734872524346, "grad_norm": 8.875, "learning_rate": 2.5638873849485652e-05, "loss": 0.1304, "loss_lm": 0.017228650394827127, "loss_seg": 0.11320512555539608, "mean_token_accuracy": 0.9953828155994415, "num_tokens": 830969638.0, "step": 1955 }, { "entropy": 0.019874305929988623, "epoch": 0.8561111719006456, "grad_norm": 16.25, "learning_rate": 2.563616675690309e-05, "loss": 0.1401, "loss_lm": 0.018651881255209446, "loss_seg": 0.12149207666516304, "mean_token_accuracy": 0.9949208498001099, "num_tokens": 831395053.0, "step": 1956 }, { "entropy": 0.01895999349653721, "epoch": 0.8565488565488566, "grad_norm": 5.4375, "learning_rate": 2.563345966432052e-05, "loss": 0.1509, "loss_lm": 0.01619516289792955, "loss_seg": 0.13470644131302834, "mean_token_accuracy": 0.9951247572898865, "num_tokens": 831820029.0, "step": 1957 }, { "entropy": 0.019315284211188555, "epoch": 0.8569865411970675, "grad_norm": 15.1875, "learning_rate": 2.5630752571737953e-05, "loss": 0.2089, "loss_lm": 0.015754255698993802, "loss_seg": 0.19311310723423958, "mean_token_accuracy": 0.9952837824821472, "num_tokens": 832244940.0, "step": 1958 }, { "entropy": 0.019066205248236656, "epoch": 0.8574242258452784, "grad_norm": 12.125, "learning_rate": 2.5628045479155387e-05, "loss": 0.1638, "loss_lm": 0.016536150593310595, "loss_seg": 0.14730268344283104, "mean_token_accuracy": 0.9952912926673889, "num_tokens": 832670476.0, "step": 1959 }, { "entropy": 0.018506911117583513, "epoch": 0.8578619104934895, "grad_norm": 6.53125, "learning_rate": 2.562533838657282e-05, "loss": 0.1217, "loss_lm": 0.016037786845117807, "loss_seg": 0.10563589073717594, "mean_token_accuracy": 0.9952795505523682, "num_tokens": 833095561.0, "step": 1960 }, { "entropy": 0.018614201340824366, "epoch": 0.8582995951417004, "grad_norm": 9.125, "learning_rate": 2.5622631293990258e-05, "loss": 0.12, "loss_lm": 0.015673261135816574, "loss_seg": 0.10435241088271141, "mean_token_accuracy": 0.9954676330089569, "num_tokens": 833519747.0, "step": 1961 }, { "entropy": 0.01911925245076418, "epoch": 0.8587372797899113, "grad_norm": 16.375, "learning_rate": 2.5619924201407688e-05, "loss": 0.1527, "loss_lm": 0.016719701699912548, "loss_seg": 0.13593873009085655, "mean_token_accuracy": 0.9952065348625183, "num_tokens": 833944881.0, "step": 1962 }, { "entropy": 0.01977899018675089, "epoch": 0.8591749644381224, "grad_norm": 14.4375, "learning_rate": 2.5617217108825122e-05, "loss": 0.167, "loss_lm": 0.018005372490733862, "loss_seg": 0.14901152998209, "mean_token_accuracy": 0.9949453920125961, "num_tokens": 834369813.0, "step": 1963 }, { "entropy": 0.019050209783017635, "epoch": 0.8596126490863333, "grad_norm": 3.734375, "learning_rate": 2.5614510016242556e-05, "loss": 0.1009, "loss_lm": 0.015403225552290678, "loss_seg": 0.085450055077672, "mean_token_accuracy": 0.9951841682195663, "num_tokens": 834795056.0, "step": 1964 }, { "entropy": 0.01933824783191085, "epoch": 0.8600503337345443, "grad_norm": 14.125, "learning_rate": 2.561180292365999e-05, "loss": 0.2022, "loss_lm": 0.018290948821231723, "loss_seg": 0.18390578776597977, "mean_token_accuracy": 0.9952203184366226, "num_tokens": 835219101.0, "step": 1965 }, { "entropy": 0.019451300147920847, "epoch": 0.8604880183827552, "grad_norm": 10.3125, "learning_rate": 2.5609095831077426e-05, "loss": 0.1704, "loss_lm": 0.016284970100969076, "loss_seg": 0.15414046868681908, "mean_token_accuracy": 0.9950296580791473, "num_tokens": 835644411.0, "step": 1966 }, { "entropy": 0.01900110673159361, "epoch": 0.8609257030309662, "grad_norm": 6.21875, "learning_rate": 2.5606388738494857e-05, "loss": 0.1918, "loss_lm": 0.015711540123447776, "loss_seg": 0.1760791651904583, "mean_token_accuracy": 0.9951701611280441, "num_tokens": 836069859.0, "step": 1967 }, { "entropy": 0.01946002896875143, "epoch": 0.8613633876791772, "grad_norm": 11.5, "learning_rate": 2.560368164591229e-05, "loss": 0.2051, "loss_lm": 0.01789787271991372, "loss_seg": 0.1871565319597721, "mean_token_accuracy": 0.9950851500034332, "num_tokens": 836494650.0, "step": 1968 }, { "entropy": 0.019390237517654896, "epoch": 0.8618010723273881, "grad_norm": 8.375, "learning_rate": 2.5600974553329724e-05, "loss": 0.1124, "loss_lm": 0.017737524583935738, "loss_seg": 0.09465241618454456, "mean_token_accuracy": 0.9950432777404785, "num_tokens": 836919906.0, "step": 1969 }, { "entropy": 0.018696716986596584, "epoch": 0.862238756975599, "grad_norm": 7.28125, "learning_rate": 2.5598267460747158e-05, "loss": 0.1808, "loss_lm": 0.017723313299939036, "loss_seg": 0.16312425211071968, "mean_token_accuracy": 0.9952926784753799, "num_tokens": 837344504.0, "step": 1970 }, { "entropy": 0.018502882681787014, "epoch": 0.8626764416238101, "grad_norm": 10.25, "learning_rate": 2.559556036816459e-05, "loss": 0.1474, "loss_lm": 0.016883237287402153, "loss_seg": 0.1305176205933094, "mean_token_accuracy": 0.9952643364667892, "num_tokens": 837768695.0, "step": 1971 }, { "entropy": 0.019026643596589565, "epoch": 0.863114126272021, "grad_norm": 4.0, "learning_rate": 2.5592853275582025e-05, "loss": 0.1383, "loss_lm": 0.015651859575882554, "loss_seg": 0.12263797037303448, "mean_token_accuracy": 0.9952677488327026, "num_tokens": 838193834.0, "step": 1972 }, { "entropy": 0.018935630097985268, "epoch": 0.8635518109202319, "grad_norm": 16.875, "learning_rate": 2.559014618299946e-05, "loss": 0.1581, "loss_lm": 0.015640968689695, "loss_seg": 0.1424938179552555, "mean_token_accuracy": 0.9953577816486359, "num_tokens": 838618088.0, "step": 1973 }, { "entropy": 0.018911283928900957, "epoch": 0.863989495568443, "grad_norm": 6.0, "learning_rate": 2.5587439090416893e-05, "loss": 0.1389, "loss_lm": 0.01484297914430499, "loss_seg": 0.1240159161388874, "mean_token_accuracy": 0.9951332360506058, "num_tokens": 839043013.0, "step": 1974 }, { "entropy": 0.01965421251952648, "epoch": 0.8644271802166539, "grad_norm": 13.1875, "learning_rate": 2.5584731997834327e-05, "loss": 0.1064, "loss_lm": 0.01622704416513443, "loss_seg": 0.09020890295505524, "mean_token_accuracy": 0.9950009733438492, "num_tokens": 839468309.0, "step": 1975 }, { "entropy": 0.019337660633027554, "epoch": 0.8648648648648649, "grad_norm": 13.0, "learning_rate": 2.558202490525176e-05, "loss": 0.1312, "loss_lm": 0.016939823981374502, "loss_seg": 0.11425619572401047, "mean_token_accuracy": 0.9950617104768753, "num_tokens": 839893550.0, "step": 1976 }, { "entropy": 0.018491536378860474, "epoch": 0.8653025495130758, "grad_norm": 8.1875, "learning_rate": 2.5579317812669194e-05, "loss": 0.1889, "loss_lm": 0.016666690353304148, "loss_seg": 0.1722646988928318, "mean_token_accuracy": 0.9953653812408447, "num_tokens": 840319214.0, "step": 1977 }, { "entropy": 0.019040600396692753, "epoch": 0.8657402341612868, "grad_norm": 6.625, "learning_rate": 2.5576610720086628e-05, "loss": 0.1369, "loss_lm": 0.015719987452030182, "loss_seg": 0.1211327314376831, "mean_token_accuracy": 0.9951546639204025, "num_tokens": 840744711.0, "step": 1978 }, { "entropy": 0.01903607975691557, "epoch": 0.8661779188094978, "grad_norm": 8.875, "learning_rate": 2.557390362750406e-05, "loss": 0.1634, "loss_lm": 0.017085161060094833, "loss_seg": 0.14635509625077248, "mean_token_accuracy": 0.995175912976265, "num_tokens": 841170235.0, "step": 1979 }, { "entropy": 0.01923613902181387, "epoch": 0.8666156034577087, "grad_norm": 15.375, "learning_rate": 2.5571196534921495e-05, "loss": 0.1148, "loss_lm": 0.014858361100777984, "loss_seg": 0.09996409062296152, "mean_token_accuracy": 0.9951476156711578, "num_tokens": 841595779.0, "step": 1980 }, { "entropy": 0.018215878400951624, "epoch": 0.8670532881059196, "grad_norm": 6.9375, "learning_rate": 2.5568489442338926e-05, "loss": 0.1327, "loss_lm": 0.015764299780130386, "loss_seg": 0.11690276302397251, "mean_token_accuracy": 0.995533213019371, "num_tokens": 842020609.0, "step": 1981 }, { "entropy": 0.01898958347737789, "epoch": 0.8674909727541307, "grad_norm": 13.4375, "learning_rate": 2.5565782349756363e-05, "loss": 0.1618, "loss_lm": 0.015318938763812184, "loss_seg": 0.14643406495451927, "mean_token_accuracy": 0.9950793236494064, "num_tokens": 842445914.0, "step": 1982 }, { "entropy": 0.01881705829873681, "epoch": 0.8679286574023416, "grad_norm": 15.6875, "learning_rate": 2.5563075257173796e-05, "loss": 0.1927, "loss_lm": 0.020255373790860176, "loss_seg": 0.1724731232970953, "mean_token_accuracy": 0.9953412562608719, "num_tokens": 842870643.0, "step": 1983 }, { "entropy": 0.018897001165896654, "epoch": 0.8683663420505526, "grad_norm": 9.75, "learning_rate": 2.556036816459123e-05, "loss": 0.1552, "loss_lm": 0.015343099599704146, "loss_seg": 0.1398516744375229, "mean_token_accuracy": 0.9952104836702347, "num_tokens": 843296119.0, "step": 1984 }, { "entropy": 0.01877013500779867, "epoch": 0.8688040266987636, "grad_norm": 6.59375, "learning_rate": 2.5557661072008664e-05, "loss": 0.1754, "loss_lm": 0.01601971173658967, "loss_seg": 0.15942482464015484, "mean_token_accuracy": 0.9952141642570496, "num_tokens": 843721300.0, "step": 1985 }, { "entropy": 0.019339939579367638, "epoch": 0.8692417113469745, "grad_norm": 14.125, "learning_rate": 2.5554953979426094e-05, "loss": 0.2207, "loss_lm": 0.014202290680259466, "loss_seg": 0.20653065294027328, "mean_token_accuracy": 0.9951280206441879, "num_tokens": 844146347.0, "step": 1986 }, { "entropy": 0.019416870549321175, "epoch": 0.8696793959951855, "grad_norm": 13.6875, "learning_rate": 2.555224688684353e-05, "loss": 0.1404, "loss_lm": 0.018118509324267507, "loss_seg": 0.12231374531984329, "mean_token_accuracy": 0.9949436485767365, "num_tokens": 844571499.0, "step": 1987 }, { "entropy": 0.019393964670598507, "epoch": 0.8701170806433964, "grad_norm": 18.5, "learning_rate": 2.5549539794260965e-05, "loss": 0.1684, "loss_lm": 0.017155577428638935, "loss_seg": 0.15124568715691566, "mean_token_accuracy": 0.9951159954071045, "num_tokens": 844996320.0, "step": 1988 }, { "entropy": 0.018817066680639982, "epoch": 0.8705547652916074, "grad_norm": 10.375, "learning_rate": 2.55468327016784e-05, "loss": 0.1361, "loss_lm": 0.0157470372505486, "loss_seg": 0.12037302181124687, "mean_token_accuracy": 0.9953391253948212, "num_tokens": 845421273.0, "step": 1989 }, { "entropy": 0.018748012837022543, "epoch": 0.8709924499398184, "grad_norm": 16.125, "learning_rate": 2.5544125609095832e-05, "loss": 0.1372, "loss_lm": 0.016679071821272373, "loss_seg": 0.12054184824228287, "mean_token_accuracy": 0.9952984303236008, "num_tokens": 845845986.0, "step": 1990 }, { "entropy": 0.018569923005998135, "epoch": 0.8714301345880293, "grad_norm": 11.3125, "learning_rate": 2.5541418516513263e-05, "loss": 0.1465, "loss_lm": 0.015549397096037865, "loss_seg": 0.1309260856360197, "mean_token_accuracy": 0.9952758997678757, "num_tokens": 846271432.0, "step": 1991 }, { "entropy": 0.01855056919157505, "epoch": 0.8718678192362402, "grad_norm": 12.875, "learning_rate": 2.55387114239307e-05, "loss": 0.1279, "loss_lm": 0.01635511126369238, "loss_seg": 0.11153250373899937, "mean_token_accuracy": 0.9953390657901764, "num_tokens": 846696185.0, "step": 1992 }, { "entropy": 0.019822027534246445, "epoch": 0.8723055038844513, "grad_norm": 9.25, "learning_rate": 2.5536004331348134e-05, "loss": 0.1638, "loss_lm": 0.01573363901115954, "loss_seg": 0.1480926163494587, "mean_token_accuracy": 0.995009109377861, "num_tokens": 847122154.0, "step": 1993 }, { "entropy": 0.018761123064905405, "epoch": 0.8727431885326622, "grad_norm": 8.0625, "learning_rate": 2.5533297238765567e-05, "loss": 0.1464, "loss_lm": 0.013583785854279995, "loss_seg": 0.13277802243828773, "mean_token_accuracy": 0.9953856915235519, "num_tokens": 847547099.0, "step": 1994 }, { "entropy": 0.018869518768042326, "epoch": 0.8731808731808732, "grad_norm": 6.65625, "learning_rate": 2.5530590146183e-05, "loss": 0.1604, "loss_lm": 0.016947913682088256, "loss_seg": 0.14347144216299057, "mean_token_accuracy": 0.9952015280723572, "num_tokens": 847971981.0, "step": 1995 }, { "entropy": 0.018983771558851004, "epoch": 0.8736185578290842, "grad_norm": 9.0625, "learning_rate": 2.552788305360043e-05, "loss": 0.1073, "loss_lm": 0.017611907562240958, "loss_seg": 0.08965115807950497, "mean_token_accuracy": 0.9953697174787521, "num_tokens": 848396958.0, "step": 1996 }, { "entropy": 0.019410150591284037, "epoch": 0.8740562424772951, "grad_norm": 4.46875, "learning_rate": 2.552517596101787e-05, "loss": 0.1345, "loss_lm": 0.016848282190039754, "loss_seg": 0.11765014007687569, "mean_token_accuracy": 0.9952677935361862, "num_tokens": 848822510.0, "step": 1997 }, { "entropy": 0.01913824211806059, "epoch": 0.8744939271255061, "grad_norm": 7.65625, "learning_rate": 2.5522468868435302e-05, "loss": 0.1301, "loss_lm": 0.015862953383475542, "loss_seg": 0.11426457576453686, "mean_token_accuracy": 0.9951467365026474, "num_tokens": 849247561.0, "step": 1998 }, { "entropy": 0.019074804615229368, "epoch": 0.874931611773717, "grad_norm": 7.46875, "learning_rate": 2.5519761775852736e-05, "loss": 0.1417, "loss_lm": 0.017472679493948817, "loss_seg": 0.12427427619695663, "mean_token_accuracy": 0.9951996207237244, "num_tokens": 849673537.0, "step": 1999 }, { "entropy": 0.019635041244328022, "epoch": 0.875369296421928, "grad_norm": 30.0, "learning_rate": 2.551705468327017e-05, "loss": 0.1517, "loss_lm": 0.017535478342324495, "loss_seg": 0.13418839871883392, "mean_token_accuracy": 0.9949761480093002, "num_tokens": 850098305.0, "step": 2000 }, { "entropy": 0.01968749286606908, "epoch": 0.875806981070139, "grad_norm": 6.71875, "learning_rate": 2.55143475906876e-05, "loss": 0.1921, "loss_lm": 0.01665174844674766, "loss_seg": 0.17541527189314365, "mean_token_accuracy": 0.9950086325407028, "num_tokens": 850522793.0, "step": 2001 }, { "entropy": 0.019734452478587627, "epoch": 0.8762446657183499, "grad_norm": 11.0625, "learning_rate": 2.5511640498105037e-05, "loss": 0.1509, "loss_lm": 0.016355693573132157, "loss_seg": 0.13453010097146034, "mean_token_accuracy": 0.9949821829795837, "num_tokens": 850947861.0, "step": 2002 }, { "entropy": 0.01985724689438939, "epoch": 0.876682350366561, "grad_norm": 18.0, "learning_rate": 2.550893340552247e-05, "loss": 0.1452, "loss_lm": 0.01705974154174328, "loss_seg": 0.1281596552580595, "mean_token_accuracy": 0.9950014501810074, "num_tokens": 851372377.0, "step": 2003 }, { "entropy": 0.01946673123165965, "epoch": 0.8771200350147719, "grad_norm": 8.25, "learning_rate": 2.5506226312939905e-05, "loss": 0.1381, "loss_lm": 0.017278132028877735, "loss_seg": 0.12083391286432743, "mean_token_accuracy": 0.9951900988817215, "num_tokens": 851797079.0, "step": 2004 }, { "entropy": 0.018716246355324984, "epoch": 0.8775577196629828, "grad_norm": 5.46875, "learning_rate": 2.5503519220357335e-05, "loss": 0.1815, "loss_lm": 0.018278927076607943, "loss_seg": 0.16325891762971878, "mean_token_accuracy": 0.9952493011951447, "num_tokens": 852222659.0, "step": 2005 }, { "entropy": 0.018996804486960173, "epoch": 0.8779954043111938, "grad_norm": 7.6875, "learning_rate": 2.550081212777477e-05, "loss": 0.1582, "loss_lm": 0.022236629389226437, "loss_seg": 0.135969378054142, "mean_token_accuracy": 0.9951315522193909, "num_tokens": 852646794.0, "step": 2006 }, { "entropy": 0.019545685034245253, "epoch": 0.8784330889594048, "grad_norm": 6.71875, "learning_rate": 2.5498105035192202e-05, "loss": 0.1304, "loss_lm": 0.015145182143896818, "loss_seg": 0.11527397390455008, "mean_token_accuracy": 0.9951070100069046, "num_tokens": 853071662.0, "step": 2007 }, { "entropy": 0.01943842973560095, "epoch": 0.8788707736076157, "grad_norm": 9.4375, "learning_rate": 2.549539794260964e-05, "loss": 0.1788, "loss_lm": 0.017354115843772888, "loss_seg": 0.16143738850951195, "mean_token_accuracy": 0.9950673878192902, "num_tokens": 853496570.0, "step": 2008 }, { "entropy": 0.01933176489546895, "epoch": 0.8793084582558267, "grad_norm": 11.625, "learning_rate": 2.5492690850027073e-05, "loss": 0.1091, "loss_lm": 0.01583406957797706, "loss_seg": 0.09329936094582081, "mean_token_accuracy": 0.9951247870922089, "num_tokens": 853921749.0, "step": 2009 }, { "entropy": 0.019788541365414858, "epoch": 0.8797461429040376, "grad_norm": 7.125, "learning_rate": 2.5489983757444504e-05, "loss": 0.1425, "loss_lm": 0.016900417394936085, "loss_seg": 0.12555449828505516, "mean_token_accuracy": 0.9950392842292786, "num_tokens": 854347059.0, "step": 2010 }, { "entropy": 0.019014610908925533, "epoch": 0.8801838275522486, "grad_norm": 11.5625, "learning_rate": 2.5487276664861937e-05, "loss": 0.1623, "loss_lm": 0.016793686663731933, "loss_seg": 0.14548991061747074, "mean_token_accuracy": 0.9952253997325897, "num_tokens": 854771753.0, "step": 2011 }, { "entropy": 0.01916090724989772, "epoch": 0.8806215122004596, "grad_norm": 7.4375, "learning_rate": 2.548456957227937e-05, "loss": 0.1352, "loss_lm": 0.016252566128969193, "loss_seg": 0.1189253069460392, "mean_token_accuracy": 0.9952260106801987, "num_tokens": 855196732.0, "step": 2012 }, { "entropy": 0.01854957640171051, "epoch": 0.8810591968486705, "grad_norm": 7.40625, "learning_rate": 2.5481862479696808e-05, "loss": 0.1122, "loss_lm": 0.01617164141498506, "loss_seg": 0.09599614143371582, "mean_token_accuracy": 0.9952749162912369, "num_tokens": 855622496.0, "step": 2013 }, { "entropy": 0.018996205180883408, "epoch": 0.8814968814968815, "grad_norm": 8.0, "learning_rate": 2.5479155387114242e-05, "loss": 0.1411, "loss_lm": 0.0169518671464175, "loss_seg": 0.12410809844732285, "mean_token_accuracy": 0.9952356964349747, "num_tokens": 856048289.0, "step": 2014 }, { "entropy": 0.019840576220303774, "epoch": 0.8819345661450925, "grad_norm": 17.0, "learning_rate": 2.5476448294531672e-05, "loss": 0.1368, "loss_lm": 0.017385271145030856, "loss_seg": 0.11938321217894554, "mean_token_accuracy": 0.9950364083051682, "num_tokens": 856473346.0, "step": 2015 }, { "entropy": 0.01888884510844946, "epoch": 0.8823722507933034, "grad_norm": 4.1875, "learning_rate": 2.5473741201949106e-05, "loss": 0.2289, "loss_lm": 0.017161318100988865, "loss_seg": 0.2117272913455963, "mean_token_accuracy": 0.995348259806633, "num_tokens": 856898108.0, "step": 2016 }, { "entropy": 0.019077745731920004, "epoch": 0.8828099354415144, "grad_norm": 7.1875, "learning_rate": 2.547103410936654e-05, "loss": 0.1804, "loss_lm": 0.01469336193986237, "loss_seg": 0.16566982492804527, "mean_token_accuracy": 0.9952361285686493, "num_tokens": 857323096.0, "step": 2017 }, { "entropy": 0.018909489270299673, "epoch": 0.8832476200897253, "grad_norm": 8.0, "learning_rate": 2.5468327016783977e-05, "loss": 0.1555, "loss_lm": 0.016994807636365294, "loss_seg": 0.138505507260561, "mean_token_accuracy": 0.9952908456325531, "num_tokens": 857747327.0, "step": 2018 }, { "entropy": 0.01926868734881282, "epoch": 0.8836853047379363, "grad_norm": 10.0, "learning_rate": 2.546561992420141e-05, "loss": 0.1089, "loss_lm": 0.01648764335550368, "loss_seg": 0.09243946149945259, "mean_token_accuracy": 0.9951818287372589, "num_tokens": 858171729.0, "step": 2019 }, { "entropy": 0.01977242063730955, "epoch": 0.8841229893861473, "grad_norm": 13.0, "learning_rate": 2.546291283161884e-05, "loss": 0.1712, "loss_lm": 0.017459961818531156, "loss_seg": 0.15370970219373703, "mean_token_accuracy": 0.9950838536024094, "num_tokens": 858597042.0, "step": 2020 }, { "entropy": 0.019730408675968647, "epoch": 0.8845606740343582, "grad_norm": 19.5, "learning_rate": 2.5460205739036275e-05, "loss": 0.1782, "loss_lm": 0.019594452809542418, "loss_seg": 0.15860229916870594, "mean_token_accuracy": 0.9950926601886749, "num_tokens": 859022070.0, "step": 2021 }, { "entropy": 0.019333388190716505, "epoch": 0.8849983586825693, "grad_norm": 8.5625, "learning_rate": 2.5457498646453708e-05, "loss": 0.1586, "loss_lm": 0.01569389528594911, "loss_seg": 0.14295585453510284, "mean_token_accuracy": 0.9951742142438889, "num_tokens": 859446629.0, "step": 2022 }, { "entropy": 0.019314360339194536, "epoch": 0.8854360433307802, "grad_norm": 13.3125, "learning_rate": 2.5454791553871145e-05, "loss": 0.1477, "loss_lm": 0.016149771632626653, "loss_seg": 0.1315394826233387, "mean_token_accuracy": 0.9950979202985764, "num_tokens": 859871534.0, "step": 2023 }, { "entropy": 0.01923564588651061, "epoch": 0.8858737279789911, "grad_norm": 10.4375, "learning_rate": 2.545208446128858e-05, "loss": 0.2011, "loss_lm": 0.017647432629019022, "loss_seg": 0.18343306705355644, "mean_token_accuracy": 0.9952286630868912, "num_tokens": 860296963.0, "step": 2024 }, { "entropy": 0.019687093794345856, "epoch": 0.8863114126272021, "grad_norm": 20.125, "learning_rate": 2.544937736870601e-05, "loss": 0.1483, "loss_lm": 0.016578272450715303, "loss_seg": 0.13176360726356506, "mean_token_accuracy": 0.995169460773468, "num_tokens": 860722737.0, "step": 2025 }, { "entropy": 0.019099221099168062, "epoch": 0.8867490972754131, "grad_norm": 6.1875, "learning_rate": 2.5446670276123443e-05, "loss": 0.1171, "loss_lm": 0.01656203158199787, "loss_seg": 0.10057170502841473, "mean_token_accuracy": 0.9951137155294418, "num_tokens": 861147356.0, "step": 2026 }, { "entropy": 0.019215912092477083, "epoch": 0.887186781923624, "grad_norm": 14.625, "learning_rate": 2.5443963183540877e-05, "loss": 0.1371, "loss_lm": 0.017604342196136713, "loss_seg": 0.11952082067728043, "mean_token_accuracy": 0.9952516853809357, "num_tokens": 861572978.0, "step": 2027 }, { "entropy": 0.01912984438240528, "epoch": 0.887624466571835, "grad_norm": 6.75, "learning_rate": 2.5441256090958314e-05, "loss": 0.1122, "loss_lm": 0.01730672107078135, "loss_seg": 0.0949250590056181, "mean_token_accuracy": 0.9952182024717331, "num_tokens": 861997671.0, "step": 2028 }, { "entropy": 0.01973636681213975, "epoch": 0.888062151220046, "grad_norm": 5.40625, "learning_rate": 2.5438548998375744e-05, "loss": 0.1555, "loss_lm": 0.01684031682088971, "loss_seg": 0.13866940140724182, "mean_token_accuracy": 0.9951042830944061, "num_tokens": 862423696.0, "step": 2029 }, { "entropy": 0.018501098733395338, "epoch": 0.8884998358682569, "grad_norm": 11.625, "learning_rate": 2.5435841905793178e-05, "loss": 0.1248, "loss_lm": 0.016887723933905363, "loss_seg": 0.10793998371809721, "mean_token_accuracy": 0.995247408747673, "num_tokens": 862848719.0, "step": 2030 }, { "entropy": 0.01886499486863613, "epoch": 0.8889375205164679, "grad_norm": 8.625, "learning_rate": 2.5433134813210612e-05, "loss": 0.1278, "loss_lm": 0.014424030901864171, "loss_seg": 0.11342447064816952, "mean_token_accuracy": 0.9952709972858429, "num_tokens": 863273713.0, "step": 2031 }, { "entropy": 0.01883672596886754, "epoch": 0.8893752051646788, "grad_norm": 20.5, "learning_rate": 2.5430427720628046e-05, "loss": 0.1813, "loss_lm": 0.016347103752195835, "loss_seg": 0.16498861461877823, "mean_token_accuracy": 0.9953230321407318, "num_tokens": 863699156.0, "step": 2032 }, { "entropy": 0.019677511882036924, "epoch": 0.8898128898128899, "grad_norm": 8.875, "learning_rate": 2.5427720628045483e-05, "loss": 0.1335, "loss_lm": 0.018105625873431563, "loss_seg": 0.1153562692925334, "mean_token_accuracy": 0.9950281977653503, "num_tokens": 864124955.0, "step": 2033 }, { "entropy": 0.019154706969857216, "epoch": 0.8902505744611008, "grad_norm": 6.0, "learning_rate": 2.5425013535462913e-05, "loss": 0.1389, "loss_lm": 0.019452878274023533, "loss_seg": 0.1194843240082264, "mean_token_accuracy": 0.9951355904340744, "num_tokens": 864549622.0, "step": 2034 }, { "entropy": 0.019219379872083664, "epoch": 0.8906882591093117, "grad_norm": 9.8125, "learning_rate": 2.5422306442880347e-05, "loss": 0.161, "loss_lm": 0.017690801294520497, "loss_seg": 0.14331343211233616, "mean_token_accuracy": 0.9951156824827194, "num_tokens": 864974635.0, "step": 2035 }, { "entropy": 0.01931913010776043, "epoch": 0.8911259437575227, "grad_norm": 8.375, "learning_rate": 2.541959935029778e-05, "loss": 0.1514, "loss_lm": 0.017052937299013138, "loss_seg": 0.13435798790305853, "mean_token_accuracy": 0.9951683431863785, "num_tokens": 865399715.0, "step": 2036 }, { "entropy": 0.018469716422259808, "epoch": 0.8915636284057337, "grad_norm": 7.15625, "learning_rate": 2.5416892257715214e-05, "loss": 0.2312, "loss_lm": 0.015673512360081077, "loss_seg": 0.21555064152926207, "mean_token_accuracy": 0.9953390806913376, "num_tokens": 865825118.0, "step": 2037 }, { "entropy": 0.01908004516735673, "epoch": 0.8920013130539446, "grad_norm": 12.1875, "learning_rate": 2.5414185165132648e-05, "loss": 0.1175, "loss_lm": 0.016867341240867972, "loss_seg": 0.10060489177703857, "mean_token_accuracy": 0.9951359331607819, "num_tokens": 866251064.0, "step": 2038 }, { "entropy": 0.01864431705325842, "epoch": 0.8924389977021556, "grad_norm": 8.0625, "learning_rate": 2.541147807255008e-05, "loss": 0.1168, "loss_lm": 0.016294396482408047, "loss_seg": 0.10052667558193207, "mean_token_accuracy": 0.9953012466430664, "num_tokens": 866676216.0, "step": 2039 }, { "entropy": 0.019081804435700178, "epoch": 0.8928766823503665, "grad_norm": 8.1875, "learning_rate": 2.5408770979967515e-05, "loss": 0.0958, "loss_lm": 0.01849209424108267, "loss_seg": 0.0773028265684843, "mean_token_accuracy": 0.995176687836647, "num_tokens": 867101820.0, "step": 2040 }, { "entropy": 0.01911224564537406, "epoch": 0.8933143669985776, "grad_norm": 16.5, "learning_rate": 2.540606388738495e-05, "loss": 0.144, "loss_lm": 0.016892929561436176, "loss_seg": 0.12712755240499973, "mean_token_accuracy": 0.9952633231878281, "num_tokens": 867526993.0, "step": 2041 }, { "entropy": 0.01965654082596302, "epoch": 0.8937520516467885, "grad_norm": 7.375, "learning_rate": 2.5403356794802383e-05, "loss": 0.1617, "loss_lm": 0.016916362568736076, "loss_seg": 0.1448330655694008, "mean_token_accuracy": 0.9951275140047073, "num_tokens": 867951994.0, "step": 2042 }, { "entropy": 0.019234593492001295, "epoch": 0.8941897362949994, "grad_norm": 4.8125, "learning_rate": 2.5400649702219817e-05, "loss": 0.1613, "loss_lm": 0.01839029393158853, "loss_seg": 0.14293673262000084, "mean_token_accuracy": 0.9952002614736557, "num_tokens": 868377434.0, "step": 2043 }, { "entropy": 0.018938594963401556, "epoch": 0.8946274209432105, "grad_norm": 7.46875, "learning_rate": 2.539794260963725e-05, "loss": 0.1925, "loss_lm": 0.017832305980846286, "loss_seg": 0.17465168982744217, "mean_token_accuracy": 0.9952490478754044, "num_tokens": 868802356.0, "step": 2044 }, { "entropy": 0.01912978384643793, "epoch": 0.8950651055914214, "grad_norm": 11.6875, "learning_rate": 2.5395235517054684e-05, "loss": 0.1447, "loss_lm": 0.015564739471301436, "loss_seg": 0.129142127931118, "mean_token_accuracy": 0.9950629770755768, "num_tokens": 869227668.0, "step": 2045 }, { "entropy": 0.019162136130034924, "epoch": 0.8955027902396323, "grad_norm": 7.59375, "learning_rate": 2.5392528424472118e-05, "loss": 0.1178, "loss_lm": 0.019205209333449602, "loss_seg": 0.09856190532445908, "mean_token_accuracy": 0.9951402395963669, "num_tokens": 869652542.0, "step": 2046 }, { "entropy": 0.019599747844040394, "epoch": 0.8959404748878433, "grad_norm": 10.75, "learning_rate": 2.538982133188955e-05, "loss": 0.1533, "loss_lm": 0.017139004077762365, "loss_seg": 0.1361719723790884, "mean_token_accuracy": 0.995071679353714, "num_tokens": 870077201.0, "step": 2047 }, { "entropy": 0.018680285196751356, "epoch": 0.8963781595360543, "grad_norm": 16.0, "learning_rate": 2.5387114239306985e-05, "loss": 0.1251, "loss_lm": 0.015242966823279858, "loss_seg": 0.10981444176286459, "mean_token_accuracy": 0.9952940493822098, "num_tokens": 870501938.0, "step": 2048 }, { "entropy": 0.018976171035319567, "epoch": 0.8968158441842652, "grad_norm": 6.78125, "learning_rate": 2.538440714672442e-05, "loss": 0.1582, "loss_lm": 0.017927055712789297, "loss_seg": 0.1403066348284483, "mean_token_accuracy": 0.9950800538063049, "num_tokens": 870927334.0, "step": 2049 }, { "entropy": 0.019766618963330984, "epoch": 0.8972535288324762, "grad_norm": 15.9375, "learning_rate": 2.5381700054141853e-05, "loss": 0.1698, "loss_lm": 0.01662050816230476, "loss_seg": 0.15316886454820633, "mean_token_accuracy": 0.9950233548879623, "num_tokens": 871353254.0, "step": 2050 }, { "entropy": 0.019225214142352343, "epoch": 0.8976912134806871, "grad_norm": 5.6875, "learning_rate": 2.5378992961559286e-05, "loss": 0.1608, "loss_lm": 0.01860324526205659, "loss_seg": 0.14224309287965298, "mean_token_accuracy": 0.9951135367155075, "num_tokens": 871778073.0, "step": 2051 }, { "entropy": 0.0191361247561872, "epoch": 0.8981288981288982, "grad_norm": 20.125, "learning_rate": 2.537628586897672e-05, "loss": 0.1607, "loss_lm": 0.017577579244971275, "loss_seg": 0.1431131772696972, "mean_token_accuracy": 0.9952186793088913, "num_tokens": 872203633.0, "step": 2052 }, { "entropy": 0.0192695795558393, "epoch": 0.8985665827771091, "grad_norm": 14.25, "learning_rate": 2.537357877639415e-05, "loss": 0.1203, "loss_lm": 0.016745212953537703, "loss_seg": 0.10356210079044104, "mean_token_accuracy": 0.9950587302446365, "num_tokens": 872628789.0, "step": 2053 }, { "entropy": 0.019049493595957756, "epoch": 0.89900426742532, "grad_norm": 17.375, "learning_rate": 2.5370871683811588e-05, "loss": 0.1706, "loss_lm": 0.015998522518202662, "loss_seg": 0.1546026784926653, "mean_token_accuracy": 0.9951690584421158, "num_tokens": 873053926.0, "step": 2054 }, { "entropy": 0.018624759279191494, "epoch": 0.8994419520735311, "grad_norm": 7.0, "learning_rate": 2.536816459122902e-05, "loss": 0.114, "loss_lm": 0.014899976551532745, "loss_seg": 0.09908653609454632, "mean_token_accuracy": 0.9952767044305801, "num_tokens": 873479651.0, "step": 2055 }, { "entropy": 0.019095093943178654, "epoch": 0.899879636721742, "grad_norm": 10.875, "learning_rate": 2.5365457498646455e-05, "loss": 0.1614, "loss_lm": 0.019435955211520195, "loss_seg": 0.14198249951004982, "mean_token_accuracy": 0.9951948076486588, "num_tokens": 873905413.0, "step": 2056 }, { "entropy": 0.01929077785462141, "epoch": 0.9003173213699529, "grad_norm": 16.25, "learning_rate": 2.536275040606389e-05, "loss": 0.1556, "loss_lm": 0.01717298640869558, "loss_seg": 0.13840146269649267, "mean_token_accuracy": 0.9951831698417664, "num_tokens": 874331385.0, "step": 2057 }, { "entropy": 0.01909230835735798, "epoch": 0.9007550060181639, "grad_norm": 8.6875, "learning_rate": 2.536004331348132e-05, "loss": 0.1351, "loss_lm": 0.017689561937004328, "loss_seg": 0.11736560985445976, "mean_token_accuracy": 0.9950173646211624, "num_tokens": 874755783.0, "step": 2058 }, { "entropy": 0.01878124987706542, "epoch": 0.9011926906663749, "grad_norm": 9.25, "learning_rate": 2.5357336220898756e-05, "loss": 0.1421, "loss_lm": 0.01906341756694019, "loss_seg": 0.12299415469169617, "mean_token_accuracy": 0.995300367474556, "num_tokens": 875180669.0, "step": 2059 }, { "entropy": 0.01892375247552991, "epoch": 0.9016303753145858, "grad_norm": 7.5, "learning_rate": 2.535462912831619e-05, "loss": 0.1096, "loss_lm": 0.014858898939564824, "loss_seg": 0.09471348952502012, "mean_token_accuracy": 0.9952304810285568, "num_tokens": 875606265.0, "step": 2060 }, { "entropy": 0.01887489715591073, "epoch": 0.9020680599627968, "grad_norm": 8.125, "learning_rate": 2.5351922035733624e-05, "loss": 0.1112, "loss_lm": 0.01605135342106223, "loss_seg": 0.09513943269848824, "mean_token_accuracy": 0.9951876401901245, "num_tokens": 876031377.0, "step": 2061 }, { "entropy": 0.01816961681470275, "epoch": 0.9025057446110077, "grad_norm": 7.25, "learning_rate": 2.5349214943151057e-05, "loss": 0.1422, "loss_lm": 0.016746470471844077, "loss_seg": 0.12542214058339596, "mean_token_accuracy": 0.9953733682632446, "num_tokens": 876457113.0, "step": 2062 }, { "entropy": 0.019459614995867014, "epoch": 0.9029434292592188, "grad_norm": 9.0625, "learning_rate": 2.5346507850568488e-05, "loss": 0.1687, "loss_lm": 0.016491745365783572, "loss_seg": 0.15221188217401505, "mean_token_accuracy": 0.9949824512004852, "num_tokens": 876883197.0, "step": 2063 }, { "entropy": 0.018608121667057276, "epoch": 0.9033811139074297, "grad_norm": 12.0, "learning_rate": 2.5343800757985925e-05, "loss": 0.1606, "loss_lm": 0.01729927002452314, "loss_seg": 0.14327052608132362, "mean_token_accuracy": 0.9953105449676514, "num_tokens": 877307510.0, "step": 2064 }, { "entropy": 0.019261822570115328, "epoch": 0.9038187985556406, "grad_norm": 13.0625, "learning_rate": 2.534109366540336e-05, "loss": 0.1533, "loss_lm": 0.01876168418675661, "loss_seg": 0.13451248779892921, "mean_token_accuracy": 0.9952382445335388, "num_tokens": 877732531.0, "step": 2065 }, { "entropy": 0.01888935873284936, "epoch": 0.9042564832038517, "grad_norm": 8.1875, "learning_rate": 2.5338386572820792e-05, "loss": 0.1839, "loss_lm": 0.01621862268075347, "loss_seg": 0.1676371768116951, "mean_token_accuracy": 0.9952041208744049, "num_tokens": 878158076.0, "step": 2066 }, { "entropy": 0.018557571806013584, "epoch": 0.9046941678520626, "grad_norm": 6.0, "learning_rate": 2.5335679480238226e-05, "loss": 0.1613, "loss_lm": 0.014625171665102243, "loss_seg": 0.14664223790168762, "mean_token_accuracy": 0.9952711164951324, "num_tokens": 878582645.0, "step": 2067 }, { "entropy": 0.020033541601151228, "epoch": 0.9051318525002735, "grad_norm": 17.875, "learning_rate": 2.5332972387655656e-05, "loss": 0.1681, "loss_lm": 0.01935134083032608, "loss_seg": 0.14874101430177689, "mean_token_accuracy": 0.9950449615716934, "num_tokens": 879008268.0, "step": 2068 }, { "entropy": 0.019300045911222696, "epoch": 0.9055695371484845, "grad_norm": 11.5, "learning_rate": 2.5330265295073093e-05, "loss": 0.1874, "loss_lm": 0.017379101365804672, "loss_seg": 0.17003002390265465, "mean_token_accuracy": 0.9950842410326004, "num_tokens": 879433225.0, "step": 2069 }, { "entropy": 0.019074914511293173, "epoch": 0.9060072217966955, "grad_norm": 14.125, "learning_rate": 2.5327558202490527e-05, "loss": 0.1192, "loss_lm": 0.016844364581629634, "loss_seg": 0.10232254303991795, "mean_token_accuracy": 0.9951494485139847, "num_tokens": 879857469.0, "step": 2070 }, { "entropy": 0.019656649325042963, "epoch": 0.9064449064449065, "grad_norm": 9.5, "learning_rate": 2.532485110990796e-05, "loss": 0.1656, "loss_lm": 0.017095125280320644, "loss_seg": 0.14848836697638035, "mean_token_accuracy": 0.9951527416706085, "num_tokens": 880282658.0, "step": 2071 }, { "entropy": 0.019269245211035013, "epoch": 0.9068825910931174, "grad_norm": 11.1875, "learning_rate": 2.532214401732539e-05, "loss": 0.1325, "loss_lm": 0.01566531602293253, "loss_seg": 0.11682611517608166, "mean_token_accuracy": 0.9951577633619308, "num_tokens": 880707433.0, "step": 2072 }, { "entropy": 0.019539784640073776, "epoch": 0.9073202757413283, "grad_norm": 9.5625, "learning_rate": 2.5319436924742825e-05, "loss": 0.2125, "loss_lm": 0.01780272601172328, "loss_seg": 0.1946505717933178, "mean_token_accuracy": 0.99507637321949, "num_tokens": 881133925.0, "step": 2073 }, { "entropy": 0.01878305384889245, "epoch": 0.9077579603895394, "grad_norm": 13.8125, "learning_rate": 2.531672983216026e-05, "loss": 0.1337, "loss_lm": 0.01664671115577221, "loss_seg": 0.11701720580458641, "mean_token_accuracy": 0.9952032417058945, "num_tokens": 881558328.0, "step": 2074 }, { "entropy": 0.01909395633265376, "epoch": 0.9081956450377503, "grad_norm": 6.6875, "learning_rate": 2.5314022739577696e-05, "loss": 0.108, "loss_lm": 0.016767906490713358, "loss_seg": 0.09126285742968321, "mean_token_accuracy": 0.9952128976583481, "num_tokens": 881983571.0, "step": 2075 }, { "entropy": 0.01869589276611805, "epoch": 0.9086333296859612, "grad_norm": 16.75, "learning_rate": 2.531131564699513e-05, "loss": 0.1255, "loss_lm": 0.017106514889746904, "loss_seg": 0.10837003216147423, "mean_token_accuracy": 0.9952948540449142, "num_tokens": 882408809.0, "step": 2076 }, { "entropy": 0.018972618505358696, "epoch": 0.9090710143341723, "grad_norm": 11.6875, "learning_rate": 2.530860855441256e-05, "loss": 0.1011, "loss_lm": 0.017255216604098678, "loss_seg": 0.08387826755642891, "mean_token_accuracy": 0.9951732754707336, "num_tokens": 882834116.0, "step": 2077 }, { "entropy": 0.01885614264756441, "epoch": 0.9095086989823832, "grad_norm": 43.0, "learning_rate": 2.5305901461829994e-05, "loss": 0.1162, "loss_lm": 0.016348432982340455, "loss_seg": 0.09981479123234749, "mean_token_accuracy": 0.9951191693544388, "num_tokens": 883258797.0, "step": 2078 }, { "entropy": 0.0195515975356102, "epoch": 0.9099463836305941, "grad_norm": 7.25, "learning_rate": 2.5303194369247427e-05, "loss": 0.1082, "loss_lm": 0.016366811003535986, "loss_seg": 0.0918817762285471, "mean_token_accuracy": 0.9950392097234726, "num_tokens": 883683909.0, "step": 2079 }, { "entropy": 0.019235597923398018, "epoch": 0.9103840682788051, "grad_norm": 14.1875, "learning_rate": 2.5300487276664864e-05, "loss": 0.1455, "loss_lm": 0.017272014170885086, "loss_seg": 0.1282176449894905, "mean_token_accuracy": 0.9951519221067429, "num_tokens": 884108982.0, "step": 2080 }, { "entropy": 0.019104922655969858, "epoch": 0.9108217529270161, "grad_norm": 7.71875, "learning_rate": 2.5297780184082298e-05, "loss": 0.1527, "loss_lm": 0.015918225049972534, "loss_seg": 0.13677029497921467, "mean_token_accuracy": 0.9951254725456238, "num_tokens": 884533602.0, "step": 2081 }, { "entropy": 0.019061299040913582, "epoch": 0.9112594375752271, "grad_norm": 12.8125, "learning_rate": 2.529507309149973e-05, "loss": 0.183, "loss_lm": 0.015805721515789628, "loss_seg": 0.16721553727984428, "mean_token_accuracy": 0.9952688366174698, "num_tokens": 884959437.0, "step": 2082 }, { "entropy": 0.019163343124091625, "epoch": 0.911697122223438, "grad_norm": 9.1875, "learning_rate": 2.5292365998917162e-05, "loss": 0.0905, "loss_lm": 0.014706664951518178, "loss_seg": 0.07581276819109917, "mean_token_accuracy": 0.9951853603124619, "num_tokens": 885384564.0, "step": 2083 }, { "entropy": 0.01873582787811756, "epoch": 0.9121348068716489, "grad_norm": 10.625, "learning_rate": 2.5289658906334596e-05, "loss": 0.1343, "loss_lm": 0.017485577147454023, "loss_seg": 0.11679332330822945, "mean_token_accuracy": 0.99519844353199, "num_tokens": 885809827.0, "step": 2084 }, { "entropy": 0.018994014710187912, "epoch": 0.91257249151986, "grad_norm": 12.625, "learning_rate": 2.5286951813752033e-05, "loss": 0.1624, "loss_lm": 0.01758583029732108, "loss_seg": 0.14479900896549225, "mean_token_accuracy": 0.995255708694458, "num_tokens": 886234094.0, "step": 2085 }, { "entropy": 0.019008084200322628, "epoch": 0.9130101761680709, "grad_norm": 15.9375, "learning_rate": 2.5284244721169467e-05, "loss": 0.1608, "loss_lm": 0.01712082768790424, "loss_seg": 0.14366145804524422, "mean_token_accuracy": 0.9951848387718201, "num_tokens": 886659151.0, "step": 2086 }, { "entropy": 0.01855421345680952, "epoch": 0.9134478608162818, "grad_norm": 16.125, "learning_rate": 2.5281537628586897e-05, "loss": 0.1438, "loss_lm": 0.015551967546343803, "loss_seg": 0.12821939028799534, "mean_token_accuracy": 0.9953334331512451, "num_tokens": 887084272.0, "step": 2087 }, { "entropy": 0.019522954244166613, "epoch": 0.9138855454644929, "grad_norm": 7.90625, "learning_rate": 2.527883053600433e-05, "loss": 0.1167, "loss_lm": 0.01834270148538053, "loss_seg": 0.09832919202744961, "mean_token_accuracy": 0.9950874596834183, "num_tokens": 887509462.0, "step": 2088 }, { "entropy": 0.01940257241949439, "epoch": 0.9143232301127038, "grad_norm": 7.75, "learning_rate": 2.5276123443421765e-05, "loss": 0.1908, "loss_lm": 0.017373147886246443, "loss_seg": 0.17338052671402693, "mean_token_accuracy": 0.9950153529644012, "num_tokens": 887934749.0, "step": 2089 }, { "entropy": 0.019385788589715958, "epoch": 0.9147609147609148, "grad_norm": 10.125, "learning_rate": 2.52734163508392e-05, "loss": 0.167, "loss_lm": 0.017999646719545126, "loss_seg": 0.1489504873752594, "mean_token_accuracy": 0.9952089786529541, "num_tokens": 888359035.0, "step": 2090 }, { "entropy": 0.019069131463766098, "epoch": 0.9151985994091257, "grad_norm": 6.46875, "learning_rate": 2.5270709258256635e-05, "loss": 0.113, "loss_lm": 0.01611419697292149, "loss_seg": 0.09684613905847073, "mean_token_accuracy": 0.995197206735611, "num_tokens": 888784803.0, "step": 2091 }, { "entropy": 0.019483269192278385, "epoch": 0.9156362840573367, "grad_norm": 6.25, "learning_rate": 2.5268002165674066e-05, "loss": 0.1414, "loss_lm": 0.01806960697285831, "loss_seg": 0.12334651686251163, "mean_token_accuracy": 0.9950718283653259, "num_tokens": 889209476.0, "step": 2092 }, { "entropy": 0.01966270850971341, "epoch": 0.9160739687055477, "grad_norm": 5.34375, "learning_rate": 2.52652950730915e-05, "loss": 0.1239, "loss_lm": 0.01665657851845026, "loss_seg": 0.10720812529325485, "mean_token_accuracy": 0.9949295073747635, "num_tokens": 889634468.0, "step": 2093 }, { "entropy": 0.018993588164448738, "epoch": 0.9165116533537586, "grad_norm": 9.3125, "learning_rate": 2.5262587980508933e-05, "loss": 0.1366, "loss_lm": 0.015796624356880784, "loss_seg": 0.12077387608587742, "mean_token_accuracy": 0.9951447546482086, "num_tokens": 890060108.0, "step": 2094 }, { "entropy": 0.019214346539229155, "epoch": 0.9169493380019695, "grad_norm": 12.6875, "learning_rate": 2.525988088792637e-05, "loss": 0.1321, "loss_lm": 0.01616157521493733, "loss_seg": 0.11592556163668633, "mean_token_accuracy": 0.995198205113411, "num_tokens": 890485544.0, "step": 2095 }, { "entropy": 0.019455104134976864, "epoch": 0.9173870226501806, "grad_norm": 20.625, "learning_rate": 2.52571737953438e-05, "loss": 0.135, "loss_lm": 0.018518107710406184, "loss_seg": 0.116483299061656, "mean_token_accuracy": 0.9951443821191788, "num_tokens": 890910800.0, "step": 2096 }, { "entropy": 0.019100953359156847, "epoch": 0.9178247072983915, "grad_norm": 22.75, "learning_rate": 2.5254466702761234e-05, "loss": 0.1189, "loss_lm": 0.01539307669736445, "loss_seg": 0.103549649938941, "mean_token_accuracy": 0.9951647222042084, "num_tokens": 891335983.0, "step": 2097 }, { "entropy": 0.018970996141433716, "epoch": 0.9182623919466024, "grad_norm": 10.0625, "learning_rate": 2.5251759610178668e-05, "loss": 0.1404, "loss_lm": 0.014128051232546568, "loss_seg": 0.12631755881011486, "mean_token_accuracy": 0.9951845556497574, "num_tokens": 891761457.0, "step": 2098 }, { "entropy": 0.019718601368367672, "epoch": 0.9187000765948135, "grad_norm": 18.5, "learning_rate": 2.5249052517596102e-05, "loss": 0.1539, "loss_lm": 0.017457390669733286, "loss_seg": 0.13648528419435024, "mean_token_accuracy": 0.9951904565095901, "num_tokens": 892186414.0, "step": 2099 }, { "entropy": 0.019052152056246996, "epoch": 0.9191377612430244, "grad_norm": 31.5, "learning_rate": 2.524634542501354e-05, "loss": 0.1893, "loss_lm": 0.014986092923209071, "loss_seg": 0.17428584396839142, "mean_token_accuracy": 0.9952159523963928, "num_tokens": 892610685.0, "step": 2100 }, { "entropy": 0.018642968963831663, "epoch": 0.9195754458912354, "grad_norm": 11.625, "learning_rate": 2.524363833243097e-05, "loss": 0.1443, "loss_lm": 0.014395741745829582, "loss_seg": 0.12991460785269737, "mean_token_accuracy": 0.9952891319990158, "num_tokens": 893035607.0, "step": 2101 }, { "entropy": 0.0191534454934299, "epoch": 0.9200131305394463, "grad_norm": 11.75, "learning_rate": 2.5240931239848403e-05, "loss": 0.1195, "loss_lm": 0.016410328913480043, "loss_seg": 0.10306442901492119, "mean_token_accuracy": 0.9952498525381088, "num_tokens": 893460373.0, "step": 2102 }, { "entropy": 0.019038761500269175, "epoch": 0.9204508151876573, "grad_norm": 6.875, "learning_rate": 2.5238224147265837e-05, "loss": 0.2003, "loss_lm": 0.017581443302333355, "loss_seg": 0.1827644221484661, "mean_token_accuracy": 0.995137944817543, "num_tokens": 893885281.0, "step": 2103 }, { "entropy": 0.01903109671548009, "epoch": 0.9208884998358683, "grad_norm": 13.5625, "learning_rate": 2.523551705468327e-05, "loss": 0.133, "loss_lm": 0.01658963179215789, "loss_seg": 0.116416459903121, "mean_token_accuracy": 0.9951275736093521, "num_tokens": 894310501.0, "step": 2104 }, { "entropy": 0.01912214607000351, "epoch": 0.9213261844840792, "grad_norm": 7.625, "learning_rate": 2.5232809962100704e-05, "loss": 0.1384, "loss_lm": 0.015265090158209205, "loss_seg": 0.12311912514269352, "mean_token_accuracy": 0.9951285570859909, "num_tokens": 894734985.0, "step": 2105 }, { "entropy": 0.019010693300515413, "epoch": 0.9217638691322901, "grad_norm": 6.03125, "learning_rate": 2.5230102869518138e-05, "loss": 0.1611, "loss_lm": 0.017300961771979928, "loss_seg": 0.14382404088974, "mean_token_accuracy": 0.9951945692300797, "num_tokens": 895160510.0, "step": 2106 }, { "entropy": 0.019529504235833883, "epoch": 0.9222015537805012, "grad_norm": 13.8125, "learning_rate": 2.522739577693557e-05, "loss": 0.1115, "loss_lm": 0.01533106341958046, "loss_seg": 0.09619921632111073, "mean_token_accuracy": 0.9951202422380447, "num_tokens": 895585884.0, "step": 2107 }, { "entropy": 0.018329840153455734, "epoch": 0.9226392384287121, "grad_norm": 14.625, "learning_rate": 2.5224688684353005e-05, "loss": 0.1391, "loss_lm": 0.01784573239274323, "loss_seg": 0.12121227569878101, "mean_token_accuracy": 0.9953244775533676, "num_tokens": 896011024.0, "step": 2108 }, { "entropy": 0.01936097117140889, "epoch": 0.9230769230769231, "grad_norm": 16.875, "learning_rate": 2.522198159177044e-05, "loss": 0.1658, "loss_lm": 0.01783219212666154, "loss_seg": 0.1479649618268013, "mean_token_accuracy": 0.994989424943924, "num_tokens": 896436697.0, "step": 2109 }, { "entropy": 0.01916868006810546, "epoch": 0.923514607725134, "grad_norm": 8.875, "learning_rate": 2.5219274499187873e-05, "loss": 0.246, "loss_lm": 0.017905536573380232, "loss_seg": 0.2281311135739088, "mean_token_accuracy": 0.995036780834198, "num_tokens": 896860971.0, "step": 2110 }, { "entropy": 0.018825105857104063, "epoch": 0.923952292373345, "grad_norm": 8.6875, "learning_rate": 2.5216567406605307e-05, "loss": 0.1277, "loss_lm": 0.01611208450049162, "loss_seg": 0.11158808320760727, "mean_token_accuracy": 0.9952887147665024, "num_tokens": 897286495.0, "step": 2111 }, { "entropy": 0.018732786644250154, "epoch": 0.924389977021556, "grad_norm": 5.375, "learning_rate": 2.521386031402274e-05, "loss": 0.103, "loss_lm": 0.016012780601158738, "loss_seg": 0.08699150662869215, "mean_token_accuracy": 0.9952618032693863, "num_tokens": 897710877.0, "step": 2112 }, { "entropy": 0.019709313288331032, "epoch": 0.9248276616697669, "grad_norm": 5.9375, "learning_rate": 2.5211153221440174e-05, "loss": 0.1088, "loss_lm": 0.01806838042102754, "loss_seg": 0.09077797457575798, "mean_token_accuracy": 0.9949490427970886, "num_tokens": 898136524.0, "step": 2113 }, { "entropy": 0.019082532729953527, "epoch": 0.9252653463179779, "grad_norm": 14.4375, "learning_rate": 2.5208446128857608e-05, "loss": 0.1623, "loss_lm": 0.015222044195979834, "loss_seg": 0.1470421515405178, "mean_token_accuracy": 0.995187982916832, "num_tokens": 898561845.0, "step": 2114 }, { "entropy": 0.018530179280787706, "epoch": 0.9257030309661889, "grad_norm": 14.3125, "learning_rate": 2.520573903627504e-05, "loss": 0.1364, "loss_lm": 0.016173590905964375, "loss_seg": 0.12026229687035084, "mean_token_accuracy": 0.9953032732009888, "num_tokens": 898986327.0, "step": 2115 }, { "entropy": 0.018893470987677574, "epoch": 0.9261407156143998, "grad_norm": 7.46875, "learning_rate": 2.5203031943692475e-05, "loss": 0.1411, "loss_lm": 0.01630853279493749, "loss_seg": 0.12475651316344738, "mean_token_accuracy": 0.9952428936958313, "num_tokens": 899411135.0, "step": 2116 }, { "entropy": 0.018274362199008465, "epoch": 0.9265784002626107, "grad_norm": 13.8125, "learning_rate": 2.520032485110991e-05, "loss": 0.1449, "loss_lm": 0.016095811035484076, "loss_seg": 0.1287958137691021, "mean_token_accuracy": 0.9953466951847076, "num_tokens": 899835297.0, "step": 2117 }, { "entropy": 0.019022393506020308, "epoch": 0.9270160849108218, "grad_norm": 9.75, "learning_rate": 2.5197617758527343e-05, "loss": 0.149, "loss_lm": 0.015904766274616122, "loss_seg": 0.13311953470110893, "mean_token_accuracy": 0.9952168017625809, "num_tokens": 900260497.0, "step": 2118 }, { "entropy": 0.018797583878040314, "epoch": 0.9274537695590327, "grad_norm": 13.5, "learning_rate": 2.5194910665944776e-05, "loss": 0.1165, "loss_lm": 0.01494030887261033, "loss_seg": 0.10160566121339798, "mean_token_accuracy": 0.9951755255460739, "num_tokens": 900685795.0, "step": 2119 }, { "entropy": 0.018497740849852562, "epoch": 0.9278914542072437, "grad_norm": 11.6875, "learning_rate": 2.5192203573362207e-05, "loss": 0.1724, "loss_lm": 0.016035414533689618, "loss_seg": 0.15634142607450485, "mean_token_accuracy": 0.9952068030834198, "num_tokens": 901110546.0, "step": 2120 }, { "entropy": 0.019021347165107727, "epoch": 0.9283291388554546, "grad_norm": 8.5625, "learning_rate": 2.5189496480779644e-05, "loss": 0.1587, "loss_lm": 0.017190188635140657, "loss_seg": 0.1415558122098446, "mean_token_accuracy": 0.9950917959213257, "num_tokens": 901535294.0, "step": 2121 }, { "entropy": 0.018686913885176182, "epoch": 0.9287668235036656, "grad_norm": 6.25, "learning_rate": 2.5186789388197077e-05, "loss": 0.2187, "loss_lm": 0.017500610323622823, "loss_seg": 0.20123786479234695, "mean_token_accuracy": 0.9952437430620193, "num_tokens": 901960150.0, "step": 2122 }, { "entropy": 0.018970361910760403, "epoch": 0.9292045081518766, "grad_norm": 8.3125, "learning_rate": 2.518408229561451e-05, "loss": 0.1869, "loss_lm": 0.016236385330557823, "loss_seg": 0.17071150615811348, "mean_token_accuracy": 0.9952137619256973, "num_tokens": 902385278.0, "step": 2123 }, { "entropy": 0.01890083821490407, "epoch": 0.9296421928000875, "grad_norm": 8.3125, "learning_rate": 2.5181375203031945e-05, "loss": 0.1111, "loss_lm": 0.01511844783090055, "loss_seg": 0.09600021503865719, "mean_token_accuracy": 0.9952739179134369, "num_tokens": 902810329.0, "step": 2124 }, { "entropy": 0.018929067999124527, "epoch": 0.9300798774482985, "grad_norm": 5.53125, "learning_rate": 2.5178668110449375e-05, "loss": 0.1185, "loss_lm": 0.015660801669582725, "loss_seg": 0.1028848048299551, "mean_token_accuracy": 0.9951022416353226, "num_tokens": 903234979.0, "step": 2125 }, { "entropy": 0.01888349512591958, "epoch": 0.9305175620965095, "grad_norm": 7.71875, "learning_rate": 2.5175961017866812e-05, "loss": 0.1533, "loss_lm": 0.016775504453107715, "loss_seg": 0.13650535326451063, "mean_token_accuracy": 0.9951289743185043, "num_tokens": 903660120.0, "step": 2126 }, { "entropy": 0.019138480070978403, "epoch": 0.9309552467447204, "grad_norm": 9.5625, "learning_rate": 2.5173253925284246e-05, "loss": 0.133, "loss_lm": 0.017431811429560184, "loss_seg": 0.11559910513460636, "mean_token_accuracy": 0.9951144009828568, "num_tokens": 904085226.0, "step": 2127 }, { "entropy": 0.01840842515230179, "epoch": 0.9313929313929314, "grad_norm": 9.0625, "learning_rate": 2.517054683270168e-05, "loss": 0.1916, "loss_lm": 0.01656662649475038, "loss_seg": 0.1750781573355198, "mean_token_accuracy": 0.9953668862581253, "num_tokens": 904510160.0, "step": 2128 }, { "entropy": 0.01910681277513504, "epoch": 0.9318306160411424, "grad_norm": 8.8125, "learning_rate": 2.5167839740119114e-05, "loss": 0.1003, "loss_lm": 0.014656937913969159, "loss_seg": 0.0856515858322382, "mean_token_accuracy": 0.9950747042894363, "num_tokens": 904934865.0, "step": 2129 }, { "entropy": 0.018875752110034227, "epoch": 0.9322683006893533, "grad_norm": 7.46875, "learning_rate": 2.5165132647536544e-05, "loss": 0.1165, "loss_lm": 0.016550650354474783, "loss_seg": 0.09995656553655863, "mean_token_accuracy": 0.9950714558362961, "num_tokens": 905359714.0, "step": 2130 }, { "entropy": 0.018372986931353807, "epoch": 0.9327059853375643, "grad_norm": 4.65625, "learning_rate": 2.516242555495398e-05, "loss": 0.13, "loss_lm": 0.014675708021968603, "loss_seg": 0.11534040793776512, "mean_token_accuracy": 0.995340496301651, "num_tokens": 905784603.0, "step": 2131 }, { "entropy": 0.018954350147396326, "epoch": 0.9331436699857752, "grad_norm": 6.90625, "learning_rate": 2.5159718462371415e-05, "loss": 0.1233, "loss_lm": 0.0172422977630049, "loss_seg": 0.10601431503891945, "mean_token_accuracy": 0.995203360915184, "num_tokens": 906210079.0, "step": 2132 }, { "entropy": 0.018957439344376326, "epoch": 0.9335813546339862, "grad_norm": 5.5625, "learning_rate": 2.515701136978885e-05, "loss": 0.1057, "loss_lm": 0.0167868931312114, "loss_seg": 0.0888743456453085, "mean_token_accuracy": 0.9951719492673874, "num_tokens": 906634227.0, "step": 2133 }, { "entropy": 0.01820120681077242, "epoch": 0.9340190392821972, "grad_norm": 20.125, "learning_rate": 2.5154304277206282e-05, "loss": 0.0975, "loss_lm": 0.015845627756789327, "loss_seg": 0.08161458000540733, "mean_token_accuracy": 0.9954052716493607, "num_tokens": 907059214.0, "step": 2134 }, { "entropy": 0.019413792062550783, "epoch": 0.9344567239304081, "grad_norm": 7.4375, "learning_rate": 2.5151597184623713e-05, "loss": 0.1872, "loss_lm": 0.019965992541983724, "loss_seg": 0.16725128330290318, "mean_token_accuracy": 0.9950792789459229, "num_tokens": 907483879.0, "step": 2135 }, { "entropy": 0.019201600458472967, "epoch": 0.934894408578619, "grad_norm": 4.6875, "learning_rate": 2.514889009204115e-05, "loss": 0.1669, "loss_lm": 0.015844675712287426, "loss_seg": 0.1510103903710842, "mean_token_accuracy": 0.9951063692569733, "num_tokens": 907908893.0, "step": 2136 }, { "entropy": 0.018938240595161915, "epoch": 0.9353320932268301, "grad_norm": 7.34375, "learning_rate": 2.5146182999458583e-05, "loss": 0.164, "loss_lm": 0.017452533822506666, "loss_seg": 0.14658833108842373, "mean_token_accuracy": 0.9952431917190552, "num_tokens": 908333512.0, "step": 2137 }, { "entropy": 0.018905185628682375, "epoch": 0.935769777875041, "grad_norm": 8.0625, "learning_rate": 2.5143475906876017e-05, "loss": 0.1171, "loss_lm": 0.018740639090538025, "loss_seg": 0.09831863641738892, "mean_token_accuracy": 0.9951378703117371, "num_tokens": 908758608.0, "step": 2138 }, { "entropy": 0.019295768812298775, "epoch": 0.936207462523252, "grad_norm": 11.1875, "learning_rate": 2.514076881429345e-05, "loss": 0.1076, "loss_lm": 0.01763084763661027, "loss_seg": 0.0900033637881279, "mean_token_accuracy": 0.9949318617582321, "num_tokens": 909183014.0, "step": 2139 }, { "entropy": 0.018967486917972565, "epoch": 0.936645147171463, "grad_norm": 8.1875, "learning_rate": 2.513806172171088e-05, "loss": 0.1173, "loss_lm": 0.01641033566556871, "loss_seg": 0.10086371004581451, "mean_token_accuracy": 0.9952117800712585, "num_tokens": 909607498.0, "step": 2140 }, { "entropy": 0.019023454748094082, "epoch": 0.9370828318196739, "grad_norm": 8.9375, "learning_rate": 2.5135354629128315e-05, "loss": 0.1311, "loss_lm": 0.015839268686249852, "loss_seg": 0.11530698463320732, "mean_token_accuracy": 0.9951945543289185, "num_tokens": 910032593.0, "step": 2141 }, { "entropy": 0.019308428280055523, "epoch": 0.9375205164678849, "grad_norm": 9.4375, "learning_rate": 2.5132647536545752e-05, "loss": 0.1232, "loss_lm": 0.016716428101062775, "loss_seg": 0.10648312978446484, "mean_token_accuracy": 0.9951019436120987, "num_tokens": 910457825.0, "step": 2142 }, { "entropy": 0.01886883471161127, "epoch": 0.9379582011160958, "grad_norm": 7.75, "learning_rate": 2.5129940443963186e-05, "loss": 0.1004, "loss_lm": 0.016361453337594867, "loss_seg": 0.0840670894831419, "mean_token_accuracy": 0.9952224493026733, "num_tokens": 910882477.0, "step": 2143 }, { "entropy": 0.019134109374135733, "epoch": 0.9383958857643068, "grad_norm": 35.5, "learning_rate": 2.5127233351380616e-05, "loss": 0.1879, "loss_lm": 0.014954945538192987, "loss_seg": 0.17291891016066074, "mean_token_accuracy": 0.9951948523521423, "num_tokens": 911307616.0, "step": 2144 }, { "entropy": 0.01938332011923194, "epoch": 0.9388335704125178, "grad_norm": 11.625, "learning_rate": 2.512452625879805e-05, "loss": 0.1297, "loss_lm": 0.017595248762518167, "loss_seg": 0.11210675537586212, "mean_token_accuracy": 0.9950926452875137, "num_tokens": 911732628.0, "step": 2145 }, { "entropy": 0.01883777230978012, "epoch": 0.9392712550607287, "grad_norm": 9.75, "learning_rate": 2.5121819166215484e-05, "loss": 0.169, "loss_lm": 0.014997675316408277, "loss_seg": 0.1540488675236702, "mean_token_accuracy": 0.9952539205551147, "num_tokens": 912157141.0, "step": 2146 }, { "entropy": 0.01869105128571391, "epoch": 0.9397089397089398, "grad_norm": 30.125, "learning_rate": 2.511911207363292e-05, "loss": 0.1289, "loss_lm": 0.016101807123050094, "loss_seg": 0.11275606788694859, "mean_token_accuracy": 0.995267778635025, "num_tokens": 912581833.0, "step": 2147 }, { "entropy": 0.019347913563251495, "epoch": 0.9401466243571507, "grad_norm": 8.5, "learning_rate": 2.5116404981050354e-05, "loss": 0.1724, "loss_lm": 0.016888528363779187, "loss_seg": 0.1554964929819107, "mean_token_accuracy": 0.995190292596817, "num_tokens": 913006732.0, "step": 2148 }, { "entropy": 0.019680720288306475, "epoch": 0.9405843090053616, "grad_norm": 5.84375, "learning_rate": 2.5113697888467785e-05, "loss": 0.2033, "loss_lm": 0.019022421911358833, "loss_seg": 0.18428412824869156, "mean_token_accuracy": 0.9950060993432999, "num_tokens": 913432635.0, "step": 2149 }, { "entropy": 0.019454816821962595, "epoch": 0.9410219936535726, "grad_norm": 9.25, "learning_rate": 2.511099079588522e-05, "loss": 0.1182, "loss_lm": 0.0165478833951056, "loss_seg": 0.10169520787894726, "mean_token_accuracy": 0.9951156675815582, "num_tokens": 913858261.0, "step": 2150 }, { "entropy": 0.019021927844733, "epoch": 0.9414596783017836, "grad_norm": 4.5625, "learning_rate": 2.5108283703302652e-05, "loss": 0.1685, "loss_lm": 0.01498891832306981, "loss_seg": 0.1535121500492096, "mean_token_accuracy": 0.9951172322034836, "num_tokens": 914283585.0, "step": 2151 }, { "entropy": 0.020150780212134123, "epoch": 0.9418973629499945, "grad_norm": 16.25, "learning_rate": 2.510557661072009e-05, "loss": 0.1416, "loss_lm": 0.01640950352884829, "loss_seg": 0.12515843659639359, "mean_token_accuracy": 0.995001956820488, "num_tokens": 914708523.0, "step": 2152 }, { "entropy": 0.01909072184935212, "epoch": 0.9423350475982055, "grad_norm": 9.875, "learning_rate": 2.5102869518137523e-05, "loss": 0.1354, "loss_lm": 0.0144427171908319, "loss_seg": 0.12098454870283604, "mean_token_accuracy": 0.9953816384077072, "num_tokens": 915133722.0, "step": 2153 }, { "entropy": 0.01871882053092122, "epoch": 0.9427727322464164, "grad_norm": 13.625, "learning_rate": 2.5100162425554953e-05, "loss": 0.122, "loss_lm": 0.01492278533987701, "loss_seg": 0.1070958860218525, "mean_token_accuracy": 0.9951723664999008, "num_tokens": 915558747.0, "step": 2154 }, { "entropy": 0.01919842464849353, "epoch": 0.9432104168946274, "grad_norm": 5.625, "learning_rate": 2.5097455332972387e-05, "loss": 0.1497, "loss_lm": 0.016099749598652124, "loss_seg": 0.13355292193591595, "mean_token_accuracy": 0.9951122999191284, "num_tokens": 915983672.0, "step": 2155 }, { "entropy": 0.018641977105289698, "epoch": 0.9436481015428384, "grad_norm": 5.78125, "learning_rate": 2.509474824038982e-05, "loss": 0.1456, "loss_lm": 0.014693498611450195, "loss_seg": 0.1309549529105425, "mean_token_accuracy": 0.9952345788478851, "num_tokens": 916408571.0, "step": 2156 }, { "entropy": 0.01961502293124795, "epoch": 0.9440857861910493, "grad_norm": 12.875, "learning_rate": 2.5092041147807258e-05, "loss": 0.1227, "loss_lm": 0.01986095542088151, "loss_seg": 0.10284557659178972, "mean_token_accuracy": 0.995058074593544, "num_tokens": 916834082.0, "step": 2157 }, { "entropy": 0.019448827020823956, "epoch": 0.9445234708392604, "grad_norm": 7.75, "learning_rate": 2.508933405522469e-05, "loss": 0.1124, "loss_lm": 0.0160858235321939, "loss_seg": 0.09631360974162817, "mean_token_accuracy": 0.9951517283916473, "num_tokens": 917259159.0, "step": 2158 }, { "entropy": 0.018827684689313173, "epoch": 0.9449611554874713, "grad_norm": 13.9375, "learning_rate": 2.5086626962642122e-05, "loss": 0.1915, "loss_lm": 0.019412435591220856, "loss_seg": 0.1721055880188942, "mean_token_accuracy": 0.9951992481946945, "num_tokens": 917684053.0, "step": 2159 }, { "entropy": 0.019478428177535534, "epoch": 0.9453988401356822, "grad_norm": 11.1875, "learning_rate": 2.5083919870059556e-05, "loss": 0.1204, "loss_lm": 0.01797647401690483, "loss_seg": 0.1023993156850338, "mean_token_accuracy": 0.9951541423797607, "num_tokens": 918108516.0, "step": 2160 }, { "entropy": 0.019420454278588295, "epoch": 0.9458365247838932, "grad_norm": 5.875, "learning_rate": 2.508121277747699e-05, "loss": 0.1431, "loss_lm": 0.015546652721241117, "loss_seg": 0.1275327019393444, "mean_token_accuracy": 0.9951747208833694, "num_tokens": 918533549.0, "step": 2161 }, { "entropy": 0.019118051044642925, "epoch": 0.9462742094321042, "grad_norm": 12.0, "learning_rate": 2.5078505684894427e-05, "loss": 0.1896, "loss_lm": 0.01889294572174549, "loss_seg": 0.17073284462094307, "mean_token_accuracy": 0.9951176941394806, "num_tokens": 918958734.0, "step": 2162 }, { "entropy": 0.019011087715625763, "epoch": 0.9467118940803151, "grad_norm": 14.6875, "learning_rate": 2.507579859231186e-05, "loss": 0.1874, "loss_lm": 0.016189696500077844, "loss_seg": 0.17117562517523766, "mean_token_accuracy": 0.9951999485492706, "num_tokens": 919384314.0, "step": 2163 }, { "entropy": 0.01820233464241028, "epoch": 0.9471495787285261, "grad_norm": 15.875, "learning_rate": 2.507309149972929e-05, "loss": 0.0957, "loss_lm": 0.01518548233434558, "loss_seg": 0.0805261842906475, "mean_token_accuracy": 0.9954164326190948, "num_tokens": 919808834.0, "step": 2164 }, { "entropy": 0.019245480187237263, "epoch": 0.947587263376737, "grad_norm": 9.75, "learning_rate": 2.5070384407146724e-05, "loss": 0.1335, "loss_lm": 0.016473570140078664, "loss_seg": 0.11702057719230652, "mean_token_accuracy": 0.9952718764543533, "num_tokens": 920233432.0, "step": 2165 }, { "entropy": 0.019012010656297207, "epoch": 0.9480249480249481, "grad_norm": 13.5625, "learning_rate": 2.5067677314564158e-05, "loss": 0.1576, "loss_lm": 0.017362553626298904, "loss_seg": 0.14019236713647842, "mean_token_accuracy": 0.995159775018692, "num_tokens": 920658381.0, "step": 2166 }, { "entropy": 0.019046394154429436, "epoch": 0.948462632673159, "grad_norm": 9.5625, "learning_rate": 2.5064970221981595e-05, "loss": 0.1237, "loss_lm": 0.0164349393453449, "loss_seg": 0.10725354216992855, "mean_token_accuracy": 0.9951155036687851, "num_tokens": 921083368.0, "step": 2167 }, { "entropy": 0.018789473921060562, "epoch": 0.9489003173213699, "grad_norm": 12.875, "learning_rate": 2.5062263129399026e-05, "loss": 0.1139, "loss_lm": 0.01645593182183802, "loss_seg": 0.09747157990932465, "mean_token_accuracy": 0.9953258484601974, "num_tokens": 921508212.0, "step": 2168 }, { "entropy": 0.019362644758075476, "epoch": 0.949338001969581, "grad_norm": 11.25, "learning_rate": 2.505955603681646e-05, "loss": 0.1593, "loss_lm": 0.01653572963550687, "loss_seg": 0.1427355855703354, "mean_token_accuracy": 0.9951449781656265, "num_tokens": 921933103.0, "step": 2169 }, { "entropy": 0.019416358321905136, "epoch": 0.9497756866177919, "grad_norm": 5.75, "learning_rate": 2.5056848944233893e-05, "loss": 0.1832, "loss_lm": 0.019158605253323913, "loss_seg": 0.1640351340174675, "mean_token_accuracy": 0.9951667189598083, "num_tokens": 922358026.0, "step": 2170 }, { "entropy": 0.01893204264342785, "epoch": 0.9502133712660028, "grad_norm": 6.90625, "learning_rate": 2.5054141851651327e-05, "loss": 0.1239, "loss_lm": 0.015015717828646302, "loss_seg": 0.10884957015514374, "mean_token_accuracy": 0.995217964053154, "num_tokens": 922782700.0, "step": 2171 }, { "entropy": 0.018628887366503477, "epoch": 0.9506510559142138, "grad_norm": 8.375, "learning_rate": 2.505143475906876e-05, "loss": 0.157, "loss_lm": 0.014691702788695693, "loss_seg": 0.14230524189770222, "mean_token_accuracy": 0.9952724277973175, "num_tokens": 923207608.0, "step": 2172 }, { "entropy": 0.0190340681001544, "epoch": 0.9510887405624248, "grad_norm": 7.375, "learning_rate": 2.5048727666486194e-05, "loss": 0.1052, "loss_lm": 0.018159847240895033, "loss_seg": 0.08704824559390545, "mean_token_accuracy": 0.9952624887228012, "num_tokens": 923632074.0, "step": 2173 }, { "entropy": 0.019186486024409533, "epoch": 0.9515264252106357, "grad_norm": 7.3125, "learning_rate": 2.5046020573903628e-05, "loss": 0.1447, "loss_lm": 0.015905187698081136, "loss_seg": 0.1287898700684309, "mean_token_accuracy": 0.9951704442501068, "num_tokens": 924056925.0, "step": 2174 }, { "entropy": 0.01927220867946744, "epoch": 0.9519641098588467, "grad_norm": 13.625, "learning_rate": 2.504331348132106e-05, "loss": 0.1364, "loss_lm": 0.015884497202932835, "loss_seg": 0.12048969231545925, "mean_token_accuracy": 0.9950995892286301, "num_tokens": 924481961.0, "step": 2175 }, { "entropy": 0.018932245206087828, "epoch": 0.9524017945070576, "grad_norm": 16.75, "learning_rate": 2.5040606388738495e-05, "loss": 0.1352, "loss_lm": 0.017573054414242506, "loss_seg": 0.11765952594578266, "mean_token_accuracy": 0.9951442778110504, "num_tokens": 924906585.0, "step": 2176 }, { "entropy": 0.01895558973774314, "epoch": 0.9528394791552687, "grad_norm": 7.59375, "learning_rate": 2.503789929615593e-05, "loss": 0.1627, "loss_lm": 0.01727994135580957, "loss_seg": 0.14546462893486023, "mean_token_accuracy": 0.9951563030481339, "num_tokens": 925332267.0, "step": 2177 }, { "entropy": 0.018382920417934656, "epoch": 0.9532771638034796, "grad_norm": 25.625, "learning_rate": 2.5035192203573363e-05, "loss": 0.18, "loss_lm": 0.01762019726447761, "loss_seg": 0.1623983271420002, "mean_token_accuracy": 0.99532051384449, "num_tokens": 925756841.0, "step": 2178 }, { "entropy": 0.018741958308964968, "epoch": 0.9537148484516905, "grad_norm": 7.59375, "learning_rate": 2.5032485110990796e-05, "loss": 0.1648, "loss_lm": 0.016568641643971205, "loss_seg": 0.14819326624274254, "mean_token_accuracy": 0.9953119903802872, "num_tokens": 926182140.0, "step": 2179 }, { "entropy": 0.020055307541042566, "epoch": 0.9541525330999016, "grad_norm": 16.25, "learning_rate": 2.502977801840823e-05, "loss": 0.1512, "loss_lm": 0.014843575190752745, "loss_seg": 0.13634973019361496, "mean_token_accuracy": 0.994892805814743, "num_tokens": 926607401.0, "step": 2180 }, { "entropy": 0.018530170433223248, "epoch": 0.9545902177481125, "grad_norm": 15.875, "learning_rate": 2.5027070925825664e-05, "loss": 0.1705, "loss_lm": 0.016302460571751, "loss_seg": 0.15420601144433022, "mean_token_accuracy": 0.9953573346138, "num_tokens": 927032946.0, "step": 2181 }, { "entropy": 0.019049440044909716, "epoch": 0.9550279023963234, "grad_norm": 17.375, "learning_rate": 2.5024363833243098e-05, "loss": 0.1275, "loss_lm": 0.016191161004826427, "loss_seg": 0.1113461647182703, "mean_token_accuracy": 0.9951897710561752, "num_tokens": 927458297.0, "step": 2182 }, { "entropy": 0.01880012545734644, "epoch": 0.9554655870445344, "grad_norm": 13.125, "learning_rate": 2.502165674066053e-05, "loss": 0.1415, "loss_lm": 0.01591802667826414, "loss_seg": 0.12555259093642235, "mean_token_accuracy": 0.9952466636896133, "num_tokens": 927883214.0, "step": 2183 }, { "entropy": 0.019067036919295788, "epoch": 0.9559032716927454, "grad_norm": 10.6875, "learning_rate": 2.5018949648077965e-05, "loss": 0.1461, "loss_lm": 0.019865242298692465, "loss_seg": 0.1262415386736393, "mean_token_accuracy": 0.9951278120279312, "num_tokens": 928308044.0, "step": 2184 }, { "entropy": 0.019062571693211794, "epoch": 0.9563409563409564, "grad_norm": 13.0, "learning_rate": 2.50162425554954e-05, "loss": 0.1166, "loss_lm": 0.016774814808741212, "loss_seg": 0.0998555589467287, "mean_token_accuracy": 0.9951981902122498, "num_tokens": 928732619.0, "step": 2185 }, { "entropy": 0.019688034895807505, "epoch": 0.9567786409891673, "grad_norm": 19.375, "learning_rate": 2.5013535462912833e-05, "loss": 0.1663, "loss_lm": 0.018396745901554823, "loss_seg": 0.14794372580945492, "mean_token_accuracy": 0.9950730502605438, "num_tokens": 929157374.0, "step": 2186 }, { "entropy": 0.018556124065071344, "epoch": 0.9572163256373782, "grad_norm": 6.8125, "learning_rate": 2.5010828370330266e-05, "loss": 0.1183, "loss_lm": 0.01553428079932928, "loss_seg": 0.10272390954196453, "mean_token_accuracy": 0.9952621757984161, "num_tokens": 929582396.0, "step": 2187 }, { "entropy": 0.019428883213549852, "epoch": 0.9576540102855893, "grad_norm": 20.25, "learning_rate": 2.50081212777477e-05, "loss": 0.0852, "loss_lm": 0.014854087494313717, "loss_seg": 0.07034213095903397, "mean_token_accuracy": 0.9950947314500809, "num_tokens": 930007432.0, "step": 2188 }, { "entropy": 0.018478707410395145, "epoch": 0.9580916949338002, "grad_norm": 5.625, "learning_rate": 2.5005414185165134e-05, "loss": 0.1061, "loss_lm": 0.015913156559690833, "loss_seg": 0.09014578349888325, "mean_token_accuracy": 0.9953100830316544, "num_tokens": 930432516.0, "step": 2189 }, { "entropy": 0.018767801113426685, "epoch": 0.9585293795820111, "grad_norm": 4.9375, "learning_rate": 2.5002707092582567e-05, "loss": 0.1291, "loss_lm": 0.01624393300153315, "loss_seg": 0.11281827837228775, "mean_token_accuracy": 0.995347186923027, "num_tokens": 930857386.0, "step": 2190 }, { "entropy": 0.019036016892641783, "epoch": 0.9589670642302222, "grad_norm": 10.5625, "learning_rate": 2.5e-05, "loss": 0.1533, "loss_lm": 0.015958934789523482, "loss_seg": 0.13732984103262424, "mean_token_accuracy": 0.9952328354120255, "num_tokens": 931282155.0, "step": 2191 }, { "entropy": 0.01922474754974246, "epoch": 0.9594047488784331, "grad_norm": 9.125, "learning_rate": 2.499729290741743e-05, "loss": 0.1371, "loss_lm": 0.01563624176196754, "loss_seg": 0.1214183634147048, "mean_token_accuracy": 0.9951406568288803, "num_tokens": 931706016.0, "step": 2192 }, { "entropy": 0.018881130497902632, "epoch": 0.959842433526644, "grad_norm": 6.40625, "learning_rate": 2.499458581483487e-05, "loss": 0.139, "loss_lm": 0.014521098462864757, "loss_seg": 0.1244814470410347, "mean_token_accuracy": 0.9952048063278198, "num_tokens": 932131206.0, "step": 2193 }, { "entropy": 0.01955269742757082, "epoch": 0.960280118174855, "grad_norm": 21.375, "learning_rate": 2.4991878722252302e-05, "loss": 0.1482, "loss_lm": 0.016275272238999605, "loss_seg": 0.13196012750267982, "mean_token_accuracy": 0.9951050281524658, "num_tokens": 932556298.0, "step": 2194 }, { "entropy": 0.018839805852621794, "epoch": 0.960717802823066, "grad_norm": 9.375, "learning_rate": 2.4989171629669736e-05, "loss": 0.1322, "loss_lm": 0.017670671921223402, "loss_seg": 0.11448536813259125, "mean_token_accuracy": 0.9951909631490707, "num_tokens": 932981259.0, "step": 2195 }, { "entropy": 0.01997534977272153, "epoch": 0.961155487471277, "grad_norm": 6.09375, "learning_rate": 2.498646453708717e-05, "loss": 0.1413, "loss_lm": 0.015936169074848294, "loss_seg": 0.12538042291998863, "mean_token_accuracy": 0.9948789328336716, "num_tokens": 933406163.0, "step": 2196 }, { "entropy": 0.01929467637091875, "epoch": 0.9615931721194879, "grad_norm": 10.5, "learning_rate": 2.49837574445046e-05, "loss": 0.1919, "loss_lm": 0.018073676619678736, "loss_seg": 0.1738685742020607, "mean_token_accuracy": 0.9952186048030853, "num_tokens": 933830908.0, "step": 2197 }, { "entropy": 0.019161043222993612, "epoch": 0.9620308567676988, "grad_norm": 17.5, "learning_rate": 2.4981050351922037e-05, "loss": 0.2004, "loss_lm": 0.017645501298829913, "loss_seg": 0.18279646337032318, "mean_token_accuracy": 0.9950988739728928, "num_tokens": 934256304.0, "step": 2198 }, { "entropy": 0.019850158132612705, "epoch": 0.9624685414159099, "grad_norm": 5.15625, "learning_rate": 2.497834325933947e-05, "loss": 0.2078, "loss_lm": 0.015248279552906752, "loss_seg": 0.19250998087227345, "mean_token_accuracy": 0.9952699989080429, "num_tokens": 934681863.0, "step": 2199 }, { "entropy": 0.01982388412579894, "epoch": 0.9629062260641208, "grad_norm": 10.1875, "learning_rate": 2.4975636166756905e-05, "loss": 0.105, "loss_lm": 0.015459557995200157, "loss_seg": 0.0895701665431261, "mean_token_accuracy": 0.9949400424957275, "num_tokens": 935106965.0, "step": 2200 }, { "entropy": 0.019508524797856808, "epoch": 0.9633439107123317, "grad_norm": 5.9375, "learning_rate": 2.497292907417434e-05, "loss": 0.1322, "loss_lm": 0.017751713981851935, "loss_seg": 0.11441216059029102, "mean_token_accuracy": 0.9950446635484695, "num_tokens": 935531946.0, "step": 2201 }, { "entropy": 0.019952116534113884, "epoch": 0.9637815953605428, "grad_norm": 13.3125, "learning_rate": 2.497022198159177e-05, "loss": 0.1651, "loss_lm": 0.017785600386559963, "loss_seg": 0.14735319279134274, "mean_token_accuracy": 0.9950663298368454, "num_tokens": 935956983.0, "step": 2202 }, { "entropy": 0.01962375408038497, "epoch": 0.9642192800087537, "grad_norm": 6.34375, "learning_rate": 2.4967514889009203e-05, "loss": 0.1352, "loss_lm": 0.01588749047368765, "loss_seg": 0.1193427350372076, "mean_token_accuracy": 0.9952338635921478, "num_tokens": 936382244.0, "step": 2203 }, { "entropy": 0.019592546857893467, "epoch": 0.9646569646569647, "grad_norm": 8.5625, "learning_rate": 2.496480779642664e-05, "loss": 0.1504, "loss_lm": 0.020034973043948412, "loss_seg": 0.13031905516982079, "mean_token_accuracy": 0.9951333999633789, "num_tokens": 936806777.0, "step": 2204 }, { "entropy": 0.018479371909052134, "epoch": 0.9650946493051756, "grad_norm": 6.59375, "learning_rate": 2.4962100703844073e-05, "loss": 0.1471, "loss_lm": 0.016521696466952562, "loss_seg": 0.1306271068751812, "mean_token_accuracy": 0.995396226644516, "num_tokens": 937231808.0, "step": 2205 }, { "entropy": 0.019210333470255136, "epoch": 0.9655323339533866, "grad_norm": 8.0625, "learning_rate": 2.4959393611261507e-05, "loss": 0.1573, "loss_lm": 0.016623452305793762, "loss_seg": 0.14069551974534988, "mean_token_accuracy": 0.9953057765960693, "num_tokens": 937656977.0, "step": 2206 }, { "entropy": 0.019185205921530724, "epoch": 0.9659700186015976, "grad_norm": 13.9375, "learning_rate": 2.4956686518678937e-05, "loss": 0.1314, "loss_lm": 0.019127734005451202, "loss_seg": 0.11224930919706821, "mean_token_accuracy": 0.99520143866539, "num_tokens": 938081830.0, "step": 2207 }, { "entropy": 0.01896746503189206, "epoch": 0.9664077032498085, "grad_norm": 11.875, "learning_rate": 2.495397942609637e-05, "loss": 0.116, "loss_lm": 0.014340948313474655, "loss_seg": 0.1016536308452487, "mean_token_accuracy": 0.9952131658792496, "num_tokens": 938507761.0, "step": 2208 }, { "entropy": 0.018084405921399593, "epoch": 0.9668453878980194, "grad_norm": 5.59375, "learning_rate": 2.4951272333513808e-05, "loss": 0.1677, "loss_lm": 0.017572887940332294, "loss_seg": 0.15015984326601028, "mean_token_accuracy": 0.9953937232494354, "num_tokens": 938933009.0, "step": 2209 }, { "entropy": 0.019051089882850647, "epoch": 0.9672830725462305, "grad_norm": 10.5625, "learning_rate": 2.4948565240931242e-05, "loss": 0.1696, "loss_lm": 0.014782835263758898, "loss_seg": 0.15483674220740795, "mean_token_accuracy": 0.9951930940151215, "num_tokens": 939358811.0, "step": 2210 }, { "entropy": 0.019127607811242342, "epoch": 0.9677207571944414, "grad_norm": 12.5, "learning_rate": 2.4945858148348672e-05, "loss": 0.1141, "loss_lm": 0.01734193880110979, "loss_seg": 0.09674854949116707, "mean_token_accuracy": 0.9950627237558365, "num_tokens": 939784127.0, "step": 2211 }, { "entropy": 0.018614224158227444, "epoch": 0.9681584418426523, "grad_norm": 3.84375, "learning_rate": 2.4943151055766106e-05, "loss": 0.1305, "loss_lm": 0.016915298532694578, "loss_seg": 0.11356950923800468, "mean_token_accuracy": 0.9952698796987534, "num_tokens": 940209670.0, "step": 2212 }, { "entropy": 0.01827936666086316, "epoch": 0.9685961264908634, "grad_norm": 13.5625, "learning_rate": 2.494044396318354e-05, "loss": 0.1493, "loss_lm": 0.013349138665944338, "loss_seg": 0.13599691912531853, "mean_token_accuracy": 0.99532151222229, "num_tokens": 940634748.0, "step": 2213 }, { "entropy": 0.019048191141337156, "epoch": 0.9690338111390743, "grad_norm": 6.46875, "learning_rate": 2.4937736870600977e-05, "loss": 0.1499, "loss_lm": 0.017342955339699984, "loss_seg": 0.13251987285912037, "mean_token_accuracy": 0.9951227903366089, "num_tokens": 941059556.0, "step": 2214 }, { "entropy": 0.01878232741728425, "epoch": 0.9694714957872853, "grad_norm": 25.625, "learning_rate": 2.493502977801841e-05, "loss": 0.1488, "loss_lm": 0.016481005121022463, "loss_seg": 0.1322804894298315, "mean_token_accuracy": 0.9952794313430786, "num_tokens": 941484039.0, "step": 2215 }, { "entropy": 0.017799314111471176, "epoch": 0.9699091804354962, "grad_norm": 12.25, "learning_rate": 2.493232268543584e-05, "loss": 0.1162, "loss_lm": 0.01800488866865635, "loss_seg": 0.09817895665764809, "mean_token_accuracy": 0.9955085217952728, "num_tokens": 941909001.0, "step": 2216 }, { "entropy": 0.01881255442276597, "epoch": 0.9703468650837072, "grad_norm": 13.6875, "learning_rate": 2.4929615592853275e-05, "loss": 0.1939, "loss_lm": 0.01700394880026579, "loss_seg": 0.17688903957605362, "mean_token_accuracy": 0.9951802343130112, "num_tokens": 942333669.0, "step": 2217 }, { "entropy": 0.019178058486431837, "epoch": 0.9707845497319182, "grad_norm": 9.5625, "learning_rate": 2.492690850027071e-05, "loss": 0.2023, "loss_lm": 0.019503210671246052, "loss_seg": 0.18275512009859085, "mean_token_accuracy": 0.9951401352882385, "num_tokens": 942759183.0, "step": 2218 }, { "entropy": 0.01944473758339882, "epoch": 0.9712222343801291, "grad_norm": 8.5625, "learning_rate": 2.4924201407688146e-05, "loss": 0.1733, "loss_lm": 0.02007239032536745, "loss_seg": 0.15322081744670868, "mean_token_accuracy": 0.9950725734233856, "num_tokens": 943184757.0, "step": 2219 }, { "entropy": 0.01922557409852743, "epoch": 0.97165991902834, "grad_norm": 8.4375, "learning_rate": 2.492149431510558e-05, "loss": 0.1409, "loss_lm": 0.01778216427192092, "loss_seg": 0.12314509972929955, "mean_token_accuracy": 0.995056688785553, "num_tokens": 943609746.0, "step": 2220 }, { "entropy": 0.018481323029845953, "epoch": 0.9720976036765511, "grad_norm": 11.0625, "learning_rate": 2.491878722252301e-05, "loss": 0.1856, "loss_lm": 0.01748103415593505, "loss_seg": 0.16815792582929134, "mean_token_accuracy": 0.9952022284269333, "num_tokens": 944034737.0, "step": 2221 }, { "entropy": 0.01921056117862463, "epoch": 0.972535288324762, "grad_norm": 16.375, "learning_rate": 2.4916080129940443e-05, "loss": 0.1821, "loss_lm": 0.0169380703009665, "loss_seg": 0.1651877984404564, "mean_token_accuracy": 0.9950080662965775, "num_tokens": 944460594.0, "step": 2222 }, { "entropy": 0.018731852527707815, "epoch": 0.972972972972973, "grad_norm": 11.4375, "learning_rate": 2.4913373037357877e-05, "loss": 0.1177, "loss_lm": 0.015024604042991996, "loss_seg": 0.10263989865779877, "mean_token_accuracy": 0.9952574372291565, "num_tokens": 944886065.0, "step": 2223 }, { "entropy": 0.01848609559237957, "epoch": 0.973410657621184, "grad_norm": 28.625, "learning_rate": 2.4910665944775314e-05, "loss": 0.1393, "loss_lm": 0.015615233452990651, "loss_seg": 0.12373048067092896, "mean_token_accuracy": 0.9953189343214035, "num_tokens": 945311552.0, "step": 2224 }, { "entropy": 0.018813584465533495, "epoch": 0.9738483422693949, "grad_norm": 11.4375, "learning_rate": 2.4907958852192748e-05, "loss": 0.1979, "loss_lm": 0.016153251752257347, "loss_seg": 0.18176115676760674, "mean_token_accuracy": 0.9951721578836441, "num_tokens": 945736545.0, "step": 2225 }, { "entropy": 0.019230320118367672, "epoch": 0.9742860269176059, "grad_norm": 15.1875, "learning_rate": 2.4905251759610178e-05, "loss": 0.1413, "loss_lm": 0.016373182646930218, "loss_seg": 0.1249341107904911, "mean_token_accuracy": 0.9951506406068802, "num_tokens": 946161731.0, "step": 2226 }, { "entropy": 0.019121637102216482, "epoch": 0.9747237115658168, "grad_norm": 8.6875, "learning_rate": 2.4902544667027612e-05, "loss": 0.1711, "loss_lm": 0.018426955677568913, "loss_seg": 0.15262382104992867, "mean_token_accuracy": 0.9950681179761887, "num_tokens": 946587457.0, "step": 2227 }, { "entropy": 0.019268731586635113, "epoch": 0.9751613962140278, "grad_norm": 13.5, "learning_rate": 2.4899837574445046e-05, "loss": 0.1488, "loss_lm": 0.018437446793541312, "loss_seg": 0.1303517259657383, "mean_token_accuracy": 0.9952272325754166, "num_tokens": 947012297.0, "step": 2228 }, { "entropy": 0.019090023823082447, "epoch": 0.9755990808622388, "grad_norm": 8.1875, "learning_rate": 2.4897130481862483e-05, "loss": 0.1404, "loss_lm": 0.016836607130244374, "loss_seg": 0.12354269251227379, "mean_token_accuracy": 0.9950621575117111, "num_tokens": 947437759.0, "step": 2229 }, { "entropy": 0.01862831460312009, "epoch": 0.9760367655104497, "grad_norm": 7.28125, "learning_rate": 2.4894423389279917e-05, "loss": 0.1117, "loss_lm": 0.01830888446420431, "loss_seg": 0.09339484944939613, "mean_token_accuracy": 0.9952088892459869, "num_tokens": 947862687.0, "step": 2230 }, { "entropy": 0.01952651794999838, "epoch": 0.9764744501586606, "grad_norm": 17.875, "learning_rate": 2.4891716296697347e-05, "loss": 0.1197, "loss_lm": 0.017037205398082733, "loss_seg": 0.10265612043440342, "mean_token_accuracy": 0.9950094521045685, "num_tokens": 948287660.0, "step": 2231 }, { "entropy": 0.018713608849793673, "epoch": 0.9769121348068717, "grad_norm": 6.625, "learning_rate": 2.488900920411478e-05, "loss": 0.1307, "loss_lm": 0.014716977253556252, "loss_seg": 0.11602063104510307, "mean_token_accuracy": 0.995230570435524, "num_tokens": 948712835.0, "step": 2232 }, { "entropy": 0.01868569990620017, "epoch": 0.9773498194550826, "grad_norm": 9.625, "learning_rate": 2.4886302111532214e-05, "loss": 0.1363, "loss_lm": 0.016865208745002747, "loss_seg": 0.11939639039337635, "mean_token_accuracy": 0.9952862411737442, "num_tokens": 949137301.0, "step": 2233 }, { "entropy": 0.0193518977612257, "epoch": 0.9777875041032936, "grad_norm": 15.25, "learning_rate": 2.4883595018949648e-05, "loss": 0.1306, "loss_lm": 0.016778345685452223, "loss_seg": 0.11381619237363338, "mean_token_accuracy": 0.9951376020908356, "num_tokens": 949561545.0, "step": 2234 }, { "entropy": 0.01887892698869109, "epoch": 0.9782251887515045, "grad_norm": 7.5, "learning_rate": 2.4880887926367082e-05, "loss": 0.1556, "loss_lm": 0.01461896882392466, "loss_seg": 0.1409488320350647, "mean_token_accuracy": 0.9953197985887527, "num_tokens": 949987326.0, "step": 2235 }, { "entropy": 0.019014901481568813, "epoch": 0.9786628733997155, "grad_norm": 8.625, "learning_rate": 2.4878180833784515e-05, "loss": 0.1359, "loss_lm": 0.019968206994235516, "loss_seg": 0.11590714938938618, "mean_token_accuracy": 0.99508336186409, "num_tokens": 950413161.0, "step": 2236 }, { "entropy": 0.018863767385482788, "epoch": 0.9791005580479265, "grad_norm": 15.0, "learning_rate": 2.487547374120195e-05, "loss": 0.1542, "loss_lm": 0.015100495656952262, "loss_seg": 0.13911430537700653, "mean_token_accuracy": 0.9953080713748932, "num_tokens": 950838095.0, "step": 2237 }, { "entropy": 0.019052419811487198, "epoch": 0.9795382426961374, "grad_norm": 10.375, "learning_rate": 2.4872766648619383e-05, "loss": 0.1201, "loss_lm": 0.01509132981300354, "loss_seg": 0.10501400753855705, "mean_token_accuracy": 0.99508897960186, "num_tokens": 951263038.0, "step": 2238 }, { "entropy": 0.01943452563136816, "epoch": 0.9799759273443484, "grad_norm": 8.1875, "learning_rate": 2.4870059556036817e-05, "loss": 0.142, "loss_lm": 0.016272058710455894, "loss_seg": 0.12568214908242226, "mean_token_accuracy": 0.9951475262641907, "num_tokens": 951688155.0, "step": 2239 }, { "entropy": 0.018988818861544132, "epoch": 0.9804136119925594, "grad_norm": 9.8125, "learning_rate": 2.486735246345425e-05, "loss": 0.1054, "loss_lm": 0.016419535037130117, "loss_seg": 0.08896987978368998, "mean_token_accuracy": 0.9951393306255341, "num_tokens": 952112956.0, "step": 2240 }, { "entropy": 0.019514480140060186, "epoch": 0.9808512966407703, "grad_norm": 14.0, "learning_rate": 2.4864645370871684e-05, "loss": 0.1461, "loss_lm": 0.01773767964914441, "loss_seg": 0.12836519069969654, "mean_token_accuracy": 0.9952543079853058, "num_tokens": 952537739.0, "step": 2241 }, { "entropy": 0.019319542217999697, "epoch": 0.9812889812889813, "grad_norm": 9.5, "learning_rate": 2.4861938278289118e-05, "loss": 0.1369, "loss_lm": 0.018909431528300047, "loss_seg": 0.11798222362995148, "mean_token_accuracy": 0.9951972961425781, "num_tokens": 952963263.0, "step": 2242 }, { "entropy": 0.01900531118735671, "epoch": 0.9817266659371923, "grad_norm": 8.875, "learning_rate": 2.485923118570655e-05, "loss": 0.1179, "loss_lm": 0.01395821338519454, "loss_seg": 0.1039558369666338, "mean_token_accuracy": 0.9951258450746536, "num_tokens": 953387925.0, "step": 2243 }, { "entropy": 0.01941640581935644, "epoch": 0.9821643505854032, "grad_norm": 11.9375, "learning_rate": 2.4856524093123985e-05, "loss": 0.1068, "loss_lm": 0.014603367075324059, "loss_seg": 0.0922300685197115, "mean_token_accuracy": 0.9951042383909225, "num_tokens": 953812931.0, "step": 2244 }, { "entropy": 0.01935692923143506, "epoch": 0.9826020352336142, "grad_norm": 9.9375, "learning_rate": 2.485381700054142e-05, "loss": 0.1588, "loss_lm": 0.018785344203934073, "loss_seg": 0.14005623012781143, "mean_token_accuracy": 0.9949961602687836, "num_tokens": 954238380.0, "step": 2245 }, { "entropy": 0.018744871485978365, "epoch": 0.9830397198818251, "grad_norm": 8.0, "learning_rate": 2.4851109907958853e-05, "loss": 0.1254, "loss_lm": 0.01536183268763125, "loss_seg": 0.11007003299891949, "mean_token_accuracy": 0.9951559007167816, "num_tokens": 954663198.0, "step": 2246 }, { "entropy": 0.01968692522495985, "epoch": 0.9834774045300361, "grad_norm": 11.4375, "learning_rate": 2.4848402815376286e-05, "loss": 0.1348, "loss_lm": 0.01893954211845994, "loss_seg": 0.11590017564594746, "mean_token_accuracy": 0.9950707256793976, "num_tokens": 955088798.0, "step": 2247 }, { "entropy": 0.019019857980310917, "epoch": 0.9839150891782471, "grad_norm": 12.1875, "learning_rate": 2.484569572279372e-05, "loss": 0.1632, "loss_lm": 0.018723830115050077, "loss_seg": 0.14450165443122387, "mean_token_accuracy": 0.9951669871807098, "num_tokens": 955514202.0, "step": 2248 }, { "entropy": 0.019517315085977316, "epoch": 0.984352773826458, "grad_norm": 6.25, "learning_rate": 2.4842988630211154e-05, "loss": 0.1784, "loss_lm": 0.016842486569657922, "loss_seg": 0.16151647455990314, "mean_token_accuracy": 0.9950575977563858, "num_tokens": 955938278.0, "step": 2249 }, { "entropy": 0.01978367380797863, "epoch": 0.984790458474669, "grad_norm": 5.59375, "learning_rate": 2.4840281537628588e-05, "loss": 0.1297, "loss_lm": 0.014005170436576009, "loss_seg": 0.11568603850901127, "mean_token_accuracy": 0.9950796365737915, "num_tokens": 956363595.0, "step": 2250 }, { "entropy": 0.01896572532132268, "epoch": 0.98522814312288, "grad_norm": 4.4375, "learning_rate": 2.483757444504602e-05, "loss": 0.1726, "loss_lm": 0.016770885325968266, "loss_seg": 0.15584445744752884, "mean_token_accuracy": 0.9952554106712341, "num_tokens": 956788253.0, "step": 2251 }, { "entropy": 0.018899060785770416, "epoch": 0.9856658277710909, "grad_norm": 11.125, "learning_rate": 2.4834867352463455e-05, "loss": 0.0976, "loss_lm": 0.01714475336484611, "loss_seg": 0.08041863795369864, "mean_token_accuracy": 0.9952551275491714, "num_tokens": 957213098.0, "step": 2252 }, { "entropy": 0.01934874663129449, "epoch": 0.9861035124193019, "grad_norm": 7.125, "learning_rate": 2.483216025988089e-05, "loss": 0.1561, "loss_lm": 0.018254012567922473, "loss_seg": 0.137869443744421, "mean_token_accuracy": 0.9950760155916214, "num_tokens": 957637939.0, "step": 2253 }, { "entropy": 0.019227218348532915, "epoch": 0.9865411970675129, "grad_norm": 12.0, "learning_rate": 2.4829453167298323e-05, "loss": 0.1679, "loss_lm": 0.01800430193543434, "loss_seg": 0.14994118176400661, "mean_token_accuracy": 0.9951139241456985, "num_tokens": 958063615.0, "step": 2254 }, { "entropy": 0.019076607190072536, "epoch": 0.9869788817157238, "grad_norm": 7.5625, "learning_rate": 2.4826746074715756e-05, "loss": 0.1502, "loss_lm": 0.01796805765479803, "loss_seg": 0.13219177350401878, "mean_token_accuracy": 0.9952151328325272, "num_tokens": 958488862.0, "step": 2255 }, { "entropy": 0.019176967442035675, "epoch": 0.9874165663639348, "grad_norm": 11.5625, "learning_rate": 2.482403898213319e-05, "loss": 0.155, "loss_lm": 0.014388723997399211, "loss_seg": 0.1405835635960102, "mean_token_accuracy": 0.995173841714859, "num_tokens": 958913375.0, "step": 2256 }, { "entropy": 0.019079747144132853, "epoch": 0.9878542510121457, "grad_norm": 13.8125, "learning_rate": 2.4821331889550624e-05, "loss": 0.1578, "loss_lm": 0.016984361223876476, "loss_seg": 0.1407967172563076, "mean_token_accuracy": 0.995227724313736, "num_tokens": 959338302.0, "step": 2257 }, { "entropy": 0.018828039057552814, "epoch": 0.9882919356603567, "grad_norm": 10.0, "learning_rate": 2.4818624796968057e-05, "loss": 0.1154, "loss_lm": 0.01608019764535129, "loss_seg": 0.0992745254188776, "mean_token_accuracy": 0.9953175932168961, "num_tokens": 959764103.0, "step": 2258 }, { "entropy": 0.019199883099645376, "epoch": 0.9887296203085677, "grad_norm": 8.3125, "learning_rate": 2.4815917704385488e-05, "loss": 0.1418, "loss_lm": 0.0177819118835032, "loss_seg": 0.12401236221194267, "mean_token_accuracy": 0.9951068609952927, "num_tokens": 960189552.0, "step": 2259 }, { "entropy": 0.019848689436912537, "epoch": 0.9891673049567786, "grad_norm": 7.8125, "learning_rate": 2.4813210611802925e-05, "loss": 0.1259, "loss_lm": 0.01562547590583563, "loss_seg": 0.11026909481734037, "mean_token_accuracy": 0.9949754923582077, "num_tokens": 960615259.0, "step": 2260 }, { "entropy": 0.018555755261331797, "epoch": 0.9896049896049897, "grad_norm": 14.5625, "learning_rate": 2.481050351922036e-05, "loss": 0.1243, "loss_lm": 0.015121044358238578, "loss_seg": 0.10920988954603672, "mean_token_accuracy": 0.9953553229570389, "num_tokens": 961040449.0, "step": 2261 }, { "entropy": 0.01897175144404173, "epoch": 0.9900426742532006, "grad_norm": 9.6875, "learning_rate": 2.4807796426637792e-05, "loss": 0.1255, "loss_lm": 0.016123709734529257, "loss_seg": 0.10934090986847878, "mean_token_accuracy": 0.9951813071966171, "num_tokens": 961465620.0, "step": 2262 }, { "entropy": 0.01855249283835292, "epoch": 0.9904803589014115, "grad_norm": 8.3125, "learning_rate": 2.4805089334055226e-05, "loss": 0.1173, "loss_lm": 0.015347661916166544, "loss_seg": 0.10197976790368557, "mean_token_accuracy": 0.9952825307846069, "num_tokens": 961889966.0, "step": 2263 }, { "entropy": 0.018680859357118607, "epoch": 0.9909180435496225, "grad_norm": 48.75, "learning_rate": 2.4802382241472656e-05, "loss": 0.0926, "loss_lm": 0.01575778261758387, "loss_seg": 0.07682503759860992, "mean_token_accuracy": 0.9952681362628937, "num_tokens": 962314910.0, "step": 2264 }, { "entropy": 0.01906129578128457, "epoch": 0.9913557281978335, "grad_norm": 9.75, "learning_rate": 2.4799675148890094e-05, "loss": 0.1264, "loss_lm": 0.015734265325590968, "loss_seg": 0.11062282416969538, "mean_token_accuracy": 0.9951269328594208, "num_tokens": 962740076.0, "step": 2265 }, { "entropy": 0.01916948752477765, "epoch": 0.9917934128460444, "grad_norm": 8.5625, "learning_rate": 2.4796968056307527e-05, "loss": 0.1255, "loss_lm": 0.017850307282060385, "loss_seg": 0.10762825049459934, "mean_token_accuracy": 0.995161771774292, "num_tokens": 963166037.0, "step": 2266 }, { "entropy": 0.01929405564442277, "epoch": 0.9922310974942554, "grad_norm": 6.25, "learning_rate": 2.479426096372496e-05, "loss": 0.1445, "loss_lm": 0.016990405274555087, "loss_seg": 0.12749525532126427, "mean_token_accuracy": 0.9950184971094131, "num_tokens": 963590592.0, "step": 2267 }, { "entropy": 0.018010355532169342, "epoch": 0.9926687821424663, "grad_norm": 7.125, "learning_rate": 2.4791553871142395e-05, "loss": 0.1616, "loss_lm": 0.017030001152306795, "loss_seg": 0.14460715278983116, "mean_token_accuracy": 0.9953214526176453, "num_tokens": 964016102.0, "step": 2268 }, { "entropy": 0.01907344115898013, "epoch": 0.9931064667906773, "grad_norm": 5.71875, "learning_rate": 2.4788846778559825e-05, "loss": 0.1202, "loss_lm": 0.01578231854364276, "loss_seg": 0.10446473397314548, "mean_token_accuracy": 0.995169147849083, "num_tokens": 964440840.0, "step": 2269 }, { "entropy": 0.018807037733495235, "epoch": 0.9935441514388883, "grad_norm": 13.875, "learning_rate": 2.478613968597726e-05, "loss": 0.1452, "loss_lm": 0.016333425184711814, "loss_seg": 0.12888341024518013, "mean_token_accuracy": 0.9952803254127502, "num_tokens": 964865615.0, "step": 2270 }, { "entropy": 0.019603587221354246, "epoch": 0.9939818360870992, "grad_norm": 6.6875, "learning_rate": 2.4783432593394696e-05, "loss": 0.1763, "loss_lm": 0.01979050599038601, "loss_seg": 0.15650699660182, "mean_token_accuracy": 0.9950060546398163, "num_tokens": 965290502.0, "step": 2271 }, { "entropy": 0.018691714387387037, "epoch": 0.9944195207353103, "grad_norm": 16.0, "learning_rate": 2.478072550081213e-05, "loss": 0.1487, "loss_lm": 0.015227718278765678, "loss_seg": 0.13342926651239395, "mean_token_accuracy": 0.995197668671608, "num_tokens": 965715474.0, "step": 2272 }, { "entropy": 0.018641005735844374, "epoch": 0.9948572053835212, "grad_norm": 8.25, "learning_rate": 2.4778018408229563e-05, "loss": 0.1459, "loss_lm": 0.016425827983766794, "loss_seg": 0.129483787342906, "mean_token_accuracy": 0.9952123910188675, "num_tokens": 966140487.0, "step": 2273 }, { "entropy": 0.01893680775538087, "epoch": 0.9952948900317321, "grad_norm": 6.375, "learning_rate": 2.4775311315646994e-05, "loss": 0.1193, "loss_lm": 0.01637783320620656, "loss_seg": 0.10294933430850506, "mean_token_accuracy": 0.9952507019042969, "num_tokens": 966565350.0, "step": 2274 }, { "entropy": 0.0186713351868093, "epoch": 0.9957325746799431, "grad_norm": 9.1875, "learning_rate": 2.4772604223064427e-05, "loss": 0.1294, "loss_lm": 0.016056871507316828, "loss_seg": 0.11333860643208027, "mean_token_accuracy": 0.9950496554374695, "num_tokens": 966990566.0, "step": 2275 }, { "entropy": 0.019112159963697195, "epoch": 0.9961702593281541, "grad_norm": 9.5625, "learning_rate": 2.4769897130481865e-05, "loss": 0.1551, "loss_lm": 0.01747642713598907, "loss_seg": 0.1376134790480137, "mean_token_accuracy": 0.9951484799385071, "num_tokens": 967415348.0, "step": 2276 }, { "entropy": 0.018749231472611427, "epoch": 0.996607943976365, "grad_norm": 4.15625, "learning_rate": 2.4767190037899298e-05, "loss": 0.1217, "loss_lm": 0.017232130281627178, "loss_seg": 0.10444840043783188, "mean_token_accuracy": 0.9951771944761276, "num_tokens": 967840173.0, "step": 2277 }, { "entropy": 0.01884015742689371, "epoch": 0.997045628624576, "grad_norm": 13.125, "learning_rate": 2.4764482945316732e-05, "loss": 0.1278, "loss_lm": 0.017214771592989564, "loss_seg": 0.11059685610234737, "mean_token_accuracy": 0.9952594190835953, "num_tokens": 968265608.0, "step": 2278 }, { "entropy": 0.018703711684793234, "epoch": 0.9974833132727869, "grad_norm": 9.6875, "learning_rate": 2.4761775852734162e-05, "loss": 0.1378, "loss_lm": 0.016831420827656984, "loss_seg": 0.12098681926727295, "mean_token_accuracy": 0.9952650964260101, "num_tokens": 968690717.0, "step": 2279 }, { "entropy": 0.019203164149075747, "epoch": 0.997920997920998, "grad_norm": 7.59375, "learning_rate": 2.4759068760151596e-05, "loss": 0.1515, "loss_lm": 0.018276748480275273, "loss_seg": 0.1331877652555704, "mean_token_accuracy": 0.9951778799295425, "num_tokens": 969115613.0, "step": 2280 }, { "entropy": 0.018742313142865896, "epoch": 0.9983586825692089, "grad_norm": 5.5, "learning_rate": 2.4756361667569033e-05, "loss": 0.1336, "loss_lm": 0.014231948647648096, "loss_seg": 0.1193617433309555, "mean_token_accuracy": 0.9952622056007385, "num_tokens": 969540635.0, "step": 2281 }, { "entropy": 0.019106373190879822, "epoch": 0.9987963672174198, "grad_norm": 13.3125, "learning_rate": 2.4753654574986467e-05, "loss": 0.1387, "loss_lm": 0.017397907096892595, "loss_seg": 0.12133804708719254, "mean_token_accuracy": 0.9951764643192291, "num_tokens": 969966618.0, "step": 2282 }, { "entropy": 0.018696140497922897, "epoch": 0.9992340518656309, "grad_norm": 5.75, "learning_rate": 2.4750947482403897e-05, "loss": 0.1532, "loss_lm": 0.016762955114245415, "loss_seg": 0.1364450454711914, "mean_token_accuracy": 0.9952580779790878, "num_tokens": 970390874.0, "step": 2283 }, { "entropy": 0.019043526146560907, "epoch": 0.9996717365138418, "grad_norm": 9.75, "learning_rate": 2.474824038982133e-05, "loss": 0.1235, "loss_lm": 0.016149200266227126, "loss_seg": 0.10732939466834068, "mean_token_accuracy": 0.9951511174440384, "num_tokens": 970816285.0, "step": 2284 }, { "entropy": 0.01899665594100952, "epoch": 1.0, "grad_norm": 9.25, "learning_rate": 2.4745533297238765e-05, "loss": 0.0875, "loss_lm": 0.014395235106348991, "loss_seg": 0.07313756148020427, "mean_token_accuracy": 0.9950291117032369, "num_tokens": 971134833.0, "step": 2285 }, { "entropy": 0.019202616531401873, "epoch": 1.000437684648211, "grad_norm": 7.90625, "learning_rate": 2.4742826204656202e-05, "loss": 0.1708, "loss_lm": 0.018364102113991976, "loss_seg": 0.15242734923958778, "mean_token_accuracy": 0.9950446635484695, "num_tokens": 971559812.0, "step": 2286 }, { "entropy": 0.018618034664541483, "epoch": 1.0008753692964218, "grad_norm": 10.75, "learning_rate": 2.4740119112073636e-05, "loss": 0.1331, "loss_lm": 0.017452068626880646, "loss_seg": 0.1156833041459322, "mean_token_accuracy": 0.9952089488506317, "num_tokens": 971984950.0, "step": 2287 }, { "entropy": 0.018663329537957907, "epoch": 1.0013130539446329, "grad_norm": 5.09375, "learning_rate": 2.4737412019491066e-05, "loss": 0.0963, "loss_lm": 0.017686687875539064, "loss_seg": 0.0785684185102582, "mean_token_accuracy": 0.9951810836791992, "num_tokens": 972409900.0, "step": 2288 }, { "entropy": 0.018442745320498943, "epoch": 1.001750738592844, "grad_norm": 9.8125, "learning_rate": 2.47347049269085e-05, "loss": 0.1045, "loss_lm": 0.015937273856252432, "loss_seg": 0.08854030817747116, "mean_token_accuracy": 0.9954001158475876, "num_tokens": 972834865.0, "step": 2289 }, { "entropy": 0.018810144159942865, "epoch": 1.0021884232410547, "grad_norm": 6.375, "learning_rate": 2.4731997834325933e-05, "loss": 0.1461, "loss_lm": 0.0161902722902596, "loss_seg": 0.12994452752172947, "mean_token_accuracy": 0.9951576292514801, "num_tokens": 973259557.0, "step": 2290 }, { "entropy": 0.018052523490041494, "epoch": 1.0026261078892658, "grad_norm": 8.625, "learning_rate": 2.472929074174337e-05, "loss": 0.098, "loss_lm": 0.015312557574361563, "loss_seg": 0.08264053240418434, "mean_token_accuracy": 0.9952968955039978, "num_tokens": 973684293.0, "step": 2291 }, { "entropy": 0.019293704070150852, "epoch": 1.0030637925374768, "grad_norm": 5.59375, "learning_rate": 2.4726583649160804e-05, "loss": 0.1496, "loss_lm": 0.01657560048624873, "loss_seg": 0.133052796125412, "mean_token_accuracy": 0.9950029700994492, "num_tokens": 974109120.0, "step": 2292 }, { "entropy": 0.018826942890882492, "epoch": 1.0035014771856876, "grad_norm": 6.71875, "learning_rate": 2.4723876556578234e-05, "loss": 0.1428, "loss_lm": 0.014340338297188282, "loss_seg": 0.12845576740801334, "mean_token_accuracy": 0.9953043609857559, "num_tokens": 974534799.0, "step": 2293 }, { "entropy": 0.018293178640305996, "epoch": 1.0039391618338986, "grad_norm": 5.125, "learning_rate": 2.4721169463995668e-05, "loss": 0.1075, "loss_lm": 0.01680922508239746, "loss_seg": 0.09071608632802963, "mean_token_accuracy": 0.9952855557203293, "num_tokens": 974959611.0, "step": 2294 }, { "entropy": 0.01903407834470272, "epoch": 1.0043768464821097, "grad_norm": 9.0, "learning_rate": 2.4718462371413102e-05, "loss": 0.1989, "loss_lm": 0.017242528731003404, "loss_seg": 0.18166035786271095, "mean_token_accuracy": 0.9952680319547653, "num_tokens": 975384729.0, "step": 2295 }, { "entropy": 0.019236512016505003, "epoch": 1.0048145311303207, "grad_norm": 6.03125, "learning_rate": 2.471575527883054e-05, "loss": 0.1151, "loss_lm": 0.01588709745556116, "loss_seg": 0.09920301474630833, "mean_token_accuracy": 0.995230033993721, "num_tokens": 975810274.0, "step": 2296 }, { "entropy": 0.018959554843604565, "epoch": 1.0052522157785315, "grad_norm": 9.0, "learning_rate": 2.4713048186247973e-05, "loss": 0.1537, "loss_lm": 0.016588071128353477, "loss_seg": 0.13708587549626827, "mean_token_accuracy": 0.9952293038368225, "num_tokens": 976235355.0, "step": 2297 }, { "entropy": 0.01853350130841136, "epoch": 1.0056899004267426, "grad_norm": 5.28125, "learning_rate": 2.4710341093665403e-05, "loss": 0.1204, "loss_lm": 0.014963985420763493, "loss_seg": 0.10547150298953056, "mean_token_accuracy": 0.9951940029859543, "num_tokens": 976660211.0, "step": 2298 }, { "entropy": 0.01880297949537635, "epoch": 1.0061275850749536, "grad_norm": 6.78125, "learning_rate": 2.4707634001082837e-05, "loss": 0.1257, "loss_lm": 0.01607377827167511, "loss_seg": 0.10966955497860909, "mean_token_accuracy": 0.9950737357139587, "num_tokens": 977085645.0, "step": 2299 }, { "entropy": 0.01861158898100257, "epoch": 1.0065652697231644, "grad_norm": 5.375, "learning_rate": 2.470492690850027e-05, "loss": 0.1384, "loss_lm": 0.01446604891680181, "loss_seg": 0.123957060277462, "mean_token_accuracy": 0.9952858537435532, "num_tokens": 977510715.0, "step": 2300 }, { "entropy": 0.019037538208067417, "epoch": 1.0070029543713754, "grad_norm": 31.0, "learning_rate": 2.4702219815917704e-05, "loss": 0.1215, "loss_lm": 0.015926080755889416, "loss_seg": 0.10560724884271622, "mean_token_accuracy": 0.995154932141304, "num_tokens": 977936266.0, "step": 2301 }, { "entropy": 0.019081134349107742, "epoch": 1.0074406390195865, "grad_norm": 12.0625, "learning_rate": 2.469951272333514e-05, "loss": 0.1118, "loss_lm": 0.018058594316244125, "loss_seg": 0.09376423712819815, "mean_token_accuracy": 0.9951038211584091, "num_tokens": 978361661.0, "step": 2302 }, { "entropy": 0.01883206842467189, "epoch": 1.0078783236677973, "grad_norm": 7.0625, "learning_rate": 2.4696805630752572e-05, "loss": 0.1214, "loss_lm": 0.014905232237651944, "loss_seg": 0.10651853680610657, "mean_token_accuracy": 0.9951979070901871, "num_tokens": 978786048.0, "step": 2303 }, { "entropy": 0.01844209060072899, "epoch": 1.0083160083160083, "grad_norm": 7.78125, "learning_rate": 2.4694098538170005e-05, "loss": 0.0882, "loss_lm": 0.01503582950681448, "loss_seg": 0.07316843420267105, "mean_token_accuracy": 0.9954010844230652, "num_tokens": 979211748.0, "step": 2304 }, { "entropy": 0.01814896473661065, "epoch": 1.0087536929642194, "grad_norm": 34.25, "learning_rate": 2.469139144558744e-05, "loss": 0.1272, "loss_lm": 0.015517748892307281, "loss_seg": 0.11168590001761913, "mean_token_accuracy": 0.9954594969749451, "num_tokens": 979636972.0, "step": 2305 }, { "entropy": 0.0185879017226398, "epoch": 1.0091913776124302, "grad_norm": 8.5, "learning_rate": 2.4688684353004873e-05, "loss": 0.174, "loss_lm": 0.01765366573818028, "loss_seg": 0.15639254450798035, "mean_token_accuracy": 0.9952668100595474, "num_tokens": 980061994.0, "step": 2306 }, { "entropy": 0.01918174559250474, "epoch": 1.0096290622606412, "grad_norm": 14.0, "learning_rate": 2.4685977260422307e-05, "loss": 0.1369, "loss_lm": 0.016574549255892634, "loss_seg": 0.12029862590134144, "mean_token_accuracy": 0.9950524121522903, "num_tokens": 980486858.0, "step": 2307 }, { "entropy": 0.018427247647196054, "epoch": 1.0100667469088522, "grad_norm": 7.5625, "learning_rate": 2.468327016783974e-05, "loss": 0.155, "loss_lm": 0.016695953672751784, "loss_seg": 0.13831607438623905, "mean_token_accuracy": 0.9952186942100525, "num_tokens": 980912207.0, "step": 2308 }, { "entropy": 0.018809351604431868, "epoch": 1.010504431557063, "grad_norm": 6.46875, "learning_rate": 2.4680563075257174e-05, "loss": 0.1409, "loss_lm": 0.01642565382644534, "loss_seg": 0.12451525405049324, "mean_token_accuracy": 0.995158776640892, "num_tokens": 981337071.0, "step": 2309 }, { "entropy": 0.01895680231973529, "epoch": 1.010942116205274, "grad_norm": 5.21875, "learning_rate": 2.4677855982674608e-05, "loss": 0.1106, "loss_lm": 0.017384150763973594, "loss_seg": 0.09321432933211327, "mean_token_accuracy": 0.9952079802751541, "num_tokens": 981761525.0, "step": 2310 }, { "entropy": 0.01887498889118433, "epoch": 1.0113798008534851, "grad_norm": 6.6875, "learning_rate": 2.467514889009204e-05, "loss": 0.1191, "loss_lm": 0.017353576608002186, "loss_seg": 0.10173221118748188, "mean_token_accuracy": 0.9951800107955933, "num_tokens": 982186994.0, "step": 2311 }, { "entropy": 0.01881171390414238, "epoch": 1.011817485501696, "grad_norm": 7.03125, "learning_rate": 2.4672441797509475e-05, "loss": 0.0899, "loss_lm": 0.014676759019494057, "loss_seg": 0.0752023970708251, "mean_token_accuracy": 0.9952634274959564, "num_tokens": 982611945.0, "step": 2312 }, { "entropy": 0.018549398984760046, "epoch": 1.012255170149907, "grad_norm": 11.3125, "learning_rate": 2.466973470492691e-05, "loss": 0.1412, "loss_lm": 0.017174532171338797, "loss_seg": 0.12402184680104256, "mean_token_accuracy": 0.9951928406953812, "num_tokens": 983037097.0, "step": 2313 }, { "entropy": 0.018706712406128645, "epoch": 1.012692854798118, "grad_norm": 5.65625, "learning_rate": 2.4667027612344343e-05, "loss": 0.1157, "loss_lm": 0.018479060381650925, "loss_seg": 0.09725796058773994, "mean_token_accuracy": 0.9951501786708832, "num_tokens": 983461746.0, "step": 2314 }, { "entropy": 0.019388664979487658, "epoch": 1.013130539446329, "grad_norm": 7.6875, "learning_rate": 2.4664320519761776e-05, "loss": 0.1198, "loss_lm": 0.015015505952760577, "loss_seg": 0.104804128408432, "mean_token_accuracy": 0.9950343072414398, "num_tokens": 983886840.0, "step": 2315 }, { "entropy": 0.019217326771467924, "epoch": 1.0135682240945398, "grad_norm": 9.1875, "learning_rate": 2.466161342717921e-05, "loss": 0.1213, "loss_lm": 0.016550826374441385, "loss_seg": 0.10477023757994175, "mean_token_accuracy": 0.9950060099363327, "num_tokens": 984311922.0, "step": 2316 }, { "entropy": 0.018616016022861004, "epoch": 1.0140059087427509, "grad_norm": 23.875, "learning_rate": 2.4658906334596644e-05, "loss": 0.113, "loss_lm": 0.01516751479357481, "loss_seg": 0.09779174719005823, "mean_token_accuracy": 0.9952830970287323, "num_tokens": 984736325.0, "step": 2317 }, { "entropy": 0.019252758473157883, "epoch": 1.014443593390962, "grad_norm": 11.0, "learning_rate": 2.4656199242014078e-05, "loss": 0.124, "loss_lm": 0.016097272280603647, "loss_seg": 0.10789452493190765, "mean_token_accuracy": 0.9950280785560608, "num_tokens": 985161087.0, "step": 2318 }, { "entropy": 0.01912646135315299, "epoch": 1.0148812780391727, "grad_norm": 9.25, "learning_rate": 2.465349214943151e-05, "loss": 0.1825, "loss_lm": 0.015558873070403934, "loss_seg": 0.16693528927862644, "mean_token_accuracy": 0.995194748044014, "num_tokens": 985586416.0, "step": 2319 }, { "entropy": 0.018952771089971066, "epoch": 1.0153189626873838, "grad_norm": 6.25, "learning_rate": 2.4650785056848945e-05, "loss": 0.1272, "loss_lm": 0.015040360623970628, "loss_seg": 0.11219358816742897, "mean_token_accuracy": 0.9951695501804352, "num_tokens": 986011778.0, "step": 2320 }, { "entropy": 0.0193246528506279, "epoch": 1.0157566473355948, "grad_norm": 5.3125, "learning_rate": 2.464807796426638e-05, "loss": 0.1124, "loss_lm": 0.017331707291305065, "loss_seg": 0.09505795873701572, "mean_token_accuracy": 0.9951037466526031, "num_tokens": 986437187.0, "step": 2321 }, { "entropy": 0.0184938064776361, "epoch": 1.0161943319838056, "grad_norm": 4.375, "learning_rate": 2.4645370871683813e-05, "loss": 0.134, "loss_lm": 0.016588951461017132, "loss_seg": 0.11742796935141087, "mean_token_accuracy": 0.9952472895383835, "num_tokens": 986862179.0, "step": 2322 }, { "entropy": 0.018274466041475534, "epoch": 1.0166320166320166, "grad_norm": 4.125, "learning_rate": 2.4642663779101246e-05, "loss": 0.136, "loss_lm": 0.017946182284504175, "loss_seg": 0.11804624646902084, "mean_token_accuracy": 0.9952949285507202, "num_tokens": 987286781.0, "step": 2323 }, { "entropy": 0.01927660731598735, "epoch": 1.0170697012802277, "grad_norm": 9.3125, "learning_rate": 2.463995668651868e-05, "loss": 0.1348, "loss_lm": 0.01837538229301572, "loss_seg": 0.11645273305475712, "mean_token_accuracy": 0.9951568692922592, "num_tokens": 987711502.0, "step": 2324 }, { "entropy": 0.018566555343568325, "epoch": 1.0175073859284385, "grad_norm": 7.6875, "learning_rate": 2.4637249593936114e-05, "loss": 0.1293, "loss_lm": 0.015909296926110983, "loss_seg": 0.11341587081551552, "mean_token_accuracy": 0.9953138828277588, "num_tokens": 988135709.0, "step": 2325 }, { "entropy": 0.01894613215699792, "epoch": 1.0179450705766495, "grad_norm": 5.9375, "learning_rate": 2.4634542501353547e-05, "loss": 0.1652, "loss_lm": 0.015410739229992032, "loss_seg": 0.149754099547863, "mean_token_accuracy": 0.9951865673065186, "num_tokens": 988561162.0, "step": 2326 }, { "entropy": 0.01949497265741229, "epoch": 1.0183827552248605, "grad_norm": 9.0625, "learning_rate": 2.463183540877098e-05, "loss": 0.1367, "loss_lm": 0.01693638297729194, "loss_seg": 0.11974200047552586, "mean_token_accuracy": 0.9950249642133713, "num_tokens": 988986433.0, "step": 2327 }, { "entropy": 0.018252952490001917, "epoch": 1.0188204398730714, "grad_norm": 47.0, "learning_rate": 2.4629128316188415e-05, "loss": 0.0984, "loss_lm": 0.01476185373030603, "loss_seg": 0.08363519050180912, "mean_token_accuracy": 0.995455265045166, "num_tokens": 989411252.0, "step": 2328 }, { "entropy": 0.018436355516314507, "epoch": 1.0192581245212824, "grad_norm": 7.65625, "learning_rate": 2.462642122360585e-05, "loss": 0.1344, "loss_lm": 0.017909310292452574, "loss_seg": 0.11653163656592369, "mean_token_accuracy": 0.9952030926942825, "num_tokens": 989836502.0, "step": 2329 }, { "entropy": 0.019272889010608196, "epoch": 1.0196958091694934, "grad_norm": 13.375, "learning_rate": 2.4623714131023282e-05, "loss": 0.1242, "loss_lm": 0.01743269362486899, "loss_seg": 0.10678355395793915, "mean_token_accuracy": 0.9951896965503693, "num_tokens": 990261814.0, "step": 2330 }, { "entropy": 0.01902844524011016, "epoch": 1.0201334938177042, "grad_norm": 13.9375, "learning_rate": 2.4621007038440713e-05, "loss": 0.1808, "loss_lm": 0.019551376346498728, "loss_seg": 0.1612586583942175, "mean_token_accuracy": 0.9951125979423523, "num_tokens": 990686947.0, "step": 2331 }, { "entropy": 0.01904468098655343, "epoch": 1.0205711784659153, "grad_norm": 10.3125, "learning_rate": 2.461829994585815e-05, "loss": 0.1398, "loss_lm": 0.01694096578285098, "loss_seg": 0.12289808876812458, "mean_token_accuracy": 0.9951937645673752, "num_tokens": 991111985.0, "step": 2332 }, { "entropy": 0.018556497525423765, "epoch": 1.0210088631141263, "grad_norm": 14.4375, "learning_rate": 2.4615592853275584e-05, "loss": 0.1309, "loss_lm": 0.014136144192889333, "loss_seg": 0.11675565131008625, "mean_token_accuracy": 0.9952946454286575, "num_tokens": 991536768.0, "step": 2333 }, { "entropy": 0.018752654548734426, "epoch": 1.0214465477623373, "grad_norm": 7.09375, "learning_rate": 2.4612885760693017e-05, "loss": 0.1555, "loss_lm": 0.01730217505246401, "loss_seg": 0.13820384815335274, "mean_token_accuracy": 0.9952612519264221, "num_tokens": 991961295.0, "step": 2334 }, { "entropy": 0.019310304429382086, "epoch": 1.0218842324105482, "grad_norm": 25.375, "learning_rate": 2.461017866811045e-05, "loss": 0.1097, "loss_lm": 0.015993562759831548, "loss_seg": 0.09372700192034245, "mean_token_accuracy": 0.9950401037931442, "num_tokens": 992386688.0, "step": 2335 }, { "entropy": 0.01914657512679696, "epoch": 1.0223219170587592, "grad_norm": 10.5, "learning_rate": 2.460747157552788e-05, "loss": 0.157, "loss_lm": 0.018719874555245042, "loss_seg": 0.1382798906415701, "mean_token_accuracy": 0.9952235221862793, "num_tokens": 992812275.0, "step": 2336 }, { "entropy": 0.018999638501554728, "epoch": 1.0227596017069702, "grad_norm": 8.75, "learning_rate": 2.4604764482945315e-05, "loss": 0.1453, "loss_lm": 0.01502324896864593, "loss_seg": 0.1302652657032013, "mean_token_accuracy": 0.995201900601387, "num_tokens": 993237211.0, "step": 2337 }, { "entropy": 0.01905843848362565, "epoch": 1.023197286355181, "grad_norm": 32.75, "learning_rate": 2.4602057390362752e-05, "loss": 0.1723, "loss_lm": 0.014657821273431182, "loss_seg": 0.15765994973480701, "mean_token_accuracy": 0.9953031539916992, "num_tokens": 993662561.0, "step": 2338 }, { "entropy": 0.01982291927561164, "epoch": 1.023634971003392, "grad_norm": 8.6875, "learning_rate": 2.4599350297780186e-05, "loss": 0.1231, "loss_lm": 0.019988995511084795, "loss_seg": 0.10316017270088196, "mean_token_accuracy": 0.9949870109558105, "num_tokens": 994087025.0, "step": 2339 }, { "entropy": 0.018637523520737886, "epoch": 1.024072655651603, "grad_norm": 14.5625, "learning_rate": 2.459664320519762e-05, "loss": 0.1335, "loss_lm": 0.015380729921162128, "loss_seg": 0.11809058487415314, "mean_token_accuracy": 0.9952577352523804, "num_tokens": 994512081.0, "step": 2340 }, { "entropy": 0.019230426289141178, "epoch": 1.024510340299814, "grad_norm": 6.46875, "learning_rate": 2.459393611261505e-05, "loss": 0.1467, "loss_lm": 0.01657749665901065, "loss_seg": 0.13009782135486603, "mean_token_accuracy": 0.9951571077108383, "num_tokens": 994936993.0, "step": 2341 }, { "entropy": 0.01859817700460553, "epoch": 1.024948024948025, "grad_norm": 14.4375, "learning_rate": 2.4591229020032484e-05, "loss": 0.0902, "loss_lm": 0.014876813860610127, "loss_seg": 0.0753592737019062, "mean_token_accuracy": 0.9952685683965683, "num_tokens": 995361761.0, "step": 2342 }, { "entropy": 0.01869246643036604, "epoch": 1.025385709596236, "grad_norm": 11.9375, "learning_rate": 2.458852192744992e-05, "loss": 0.1125, "loss_lm": 0.01454506954178214, "loss_seg": 0.09799074567854404, "mean_token_accuracy": 0.995305985212326, "num_tokens": 995786300.0, "step": 2343 }, { "entropy": 0.01897433213889599, "epoch": 1.0258233942444468, "grad_norm": 4.4375, "learning_rate": 2.4585814834867354e-05, "loss": 0.1486, "loss_lm": 0.015364120248705149, "loss_seg": 0.1332541797310114, "mean_token_accuracy": 0.9952333569526672, "num_tokens": 996211297.0, "step": 2344 }, { "entropy": 0.01836459431797266, "epoch": 1.0262610788926578, "grad_norm": 7.3125, "learning_rate": 2.4583107742284788e-05, "loss": 0.1473, "loss_lm": 0.015155072323977947, "loss_seg": 0.1321072494611144, "mean_token_accuracy": 0.9952496737241745, "num_tokens": 996637113.0, "step": 2345 }, { "entropy": 0.019294694531708956, "epoch": 1.0266987635408689, "grad_norm": 13.5, "learning_rate": 2.458040064970222e-05, "loss": 0.126, "loss_lm": 0.015262326458469033, "loss_seg": 0.1107602920383215, "mean_token_accuracy": 0.9951841533184052, "num_tokens": 997062138.0, "step": 2346 }, { "entropy": 0.018955033738166094, "epoch": 1.0271364481890797, "grad_norm": 5.09375, "learning_rate": 2.4577693557119652e-05, "loss": 0.1583, "loss_lm": 0.015376382041722536, "loss_seg": 0.14289969019591808, "mean_token_accuracy": 0.9952566623687744, "num_tokens": 997486901.0, "step": 2347 }, { "entropy": 0.01866026595234871, "epoch": 1.0275741328372907, "grad_norm": 8.6875, "learning_rate": 2.457498646453709e-05, "loss": 0.1775, "loss_lm": 0.016786412335932255, "loss_seg": 0.16070939227938652, "mean_token_accuracy": 0.9952063113451004, "num_tokens": 997912031.0, "step": 2348 }, { "entropy": 0.01886496925726533, "epoch": 1.0280118174855017, "grad_norm": 7.84375, "learning_rate": 2.4572279371954523e-05, "loss": 0.1123, "loss_lm": 0.015046392101794481, "loss_seg": 0.0972562674432993, "mean_token_accuracy": 0.9952511340379715, "num_tokens": 998336942.0, "step": 2349 }, { "entropy": 0.018851683009415865, "epoch": 1.0284495021337126, "grad_norm": 9.75, "learning_rate": 2.4569572279371953e-05, "loss": 0.1098, "loss_lm": 0.01782830012962222, "loss_seg": 0.0919838547706604, "mean_token_accuracy": 0.9951366186141968, "num_tokens": 998761703.0, "step": 2350 }, { "entropy": 0.018761716317385435, "epoch": 1.0288871867819236, "grad_norm": 9.125, "learning_rate": 2.4566865186789387e-05, "loss": 0.1267, "loss_lm": 0.01555534009821713, "loss_seg": 0.11111260764300823, "mean_token_accuracy": 0.9953259229660034, "num_tokens": 999185910.0, "step": 2351 }, { "entropy": 0.018867050763219595, "epoch": 1.0293248714301346, "grad_norm": 8.3125, "learning_rate": 2.456415809420682e-05, "loss": 0.1924, "loss_lm": 0.01540533546358347, "loss_seg": 0.17701592110097408, "mean_token_accuracy": 0.9951641708612442, "num_tokens": 999611036.0, "step": 2352 }, { "entropy": 0.018565319012850523, "epoch": 1.0297625560783457, "grad_norm": 15.4375, "learning_rate": 2.4561451001624258e-05, "loss": 0.1544, "loss_lm": 0.017931884387508035, "loss_seg": 0.13648750260472298, "mean_token_accuracy": 0.9953663200139999, "num_tokens": 1000035468.0, "step": 2353 }, { "entropy": 0.019264528527855873, "epoch": 1.0302002407265565, "grad_norm": 8.0, "learning_rate": 2.4558743909041692e-05, "loss": 0.1289, "loss_lm": 0.016010241582989693, "loss_seg": 0.11291105952113867, "mean_token_accuracy": 0.9950678795576096, "num_tokens": 1000460208.0, "step": 2354 }, { "entropy": 0.019013163167983294, "epoch": 1.0306379253747675, "grad_norm": 20.75, "learning_rate": 2.4556036816459122e-05, "loss": 0.1824, "loss_lm": 0.017192179337143898, "loss_seg": 0.16516505554318428, "mean_token_accuracy": 0.9952554404735565, "num_tokens": 1000885306.0, "step": 2355 }, { "entropy": 0.018750854302197695, "epoch": 1.0310756100229785, "grad_norm": 11.25, "learning_rate": 2.4553329723876556e-05, "loss": 0.1069, "loss_lm": 0.013722989009693265, "loss_seg": 0.0931297317147255, "mean_token_accuracy": 0.9950713813304901, "num_tokens": 1001309853.0, "step": 2356 }, { "entropy": 0.01859387382864952, "epoch": 1.0315132946711894, "grad_norm": 6.40625, "learning_rate": 2.455062263129399e-05, "loss": 0.118, "loss_lm": 0.01646452583372593, "loss_seg": 0.1015332043170929, "mean_token_accuracy": 0.9952327162027359, "num_tokens": 1001734162.0, "step": 2357 }, { "entropy": 0.019120692741125822, "epoch": 1.0319509793194004, "grad_norm": 7.3125, "learning_rate": 2.4547915538711427e-05, "loss": 0.1492, "loss_lm": 0.017725558951497078, "loss_seg": 0.1314465645700693, "mean_token_accuracy": 0.995305061340332, "num_tokens": 1002159200.0, "step": 2358 }, { "entropy": 0.018832438625395298, "epoch": 1.0323886639676114, "grad_norm": 8.5, "learning_rate": 2.454520844612886e-05, "loss": 0.1691, "loss_lm": 0.01706455717794597, "loss_seg": 0.15200195647776127, "mean_token_accuracy": 0.9951728880405426, "num_tokens": 1002583855.0, "step": 2359 }, { "entropy": 0.01874538231641054, "epoch": 1.0328263486158222, "grad_norm": 5.28125, "learning_rate": 2.454250135354629e-05, "loss": 0.1129, "loss_lm": 0.015036142198368907, "loss_seg": 0.09783251862972975, "mean_token_accuracy": 0.995186984539032, "num_tokens": 1003008718.0, "step": 2360 }, { "entropy": 0.01884782314300537, "epoch": 1.0332640332640333, "grad_norm": 9.5, "learning_rate": 2.4539794260963724e-05, "loss": 0.0973, "loss_lm": 0.015075347619131207, "loss_seg": 0.08222762867808342, "mean_token_accuracy": 0.9951947331428528, "num_tokens": 1003434047.0, "step": 2361 }, { "entropy": 0.01839602319523692, "epoch": 1.0337017179122443, "grad_norm": 3.828125, "learning_rate": 2.4537087168381158e-05, "loss": 0.1227, "loss_lm": 0.01714239176362753, "loss_seg": 0.10555397719144821, "mean_token_accuracy": 0.9952924102544785, "num_tokens": 1003859077.0, "step": 2362 }, { "entropy": 0.019548787735402584, "epoch": 1.0341394025604551, "grad_norm": 11.125, "learning_rate": 2.4534380075798595e-05, "loss": 0.1358, "loss_lm": 0.015484092524275184, "loss_seg": 0.1203644685447216, "mean_token_accuracy": 0.995027706027031, "num_tokens": 1004285392.0, "step": 2363 }, { "entropy": 0.019406632520258427, "epoch": 1.0345770872086661, "grad_norm": 12.375, "learning_rate": 2.453167298321603e-05, "loss": 0.1401, "loss_lm": 0.01652067876420915, "loss_seg": 0.12359077483415604, "mean_token_accuracy": 0.9950207322835922, "num_tokens": 1004710519.0, "step": 2364 }, { "entropy": 0.018403093796223402, "epoch": 1.0350147718568772, "grad_norm": 140.0, "learning_rate": 2.452896589063346e-05, "loss": 0.1097, "loss_lm": 0.014944064663723111, "loss_seg": 0.0947288665920496, "mean_token_accuracy": 0.9952544122934341, "num_tokens": 1005135298.0, "step": 2365 }, { "entropy": 0.019095387775450945, "epoch": 1.035452456505088, "grad_norm": 10.9375, "learning_rate": 2.4526258798050893e-05, "loss": 0.0818, "loss_lm": 0.016732582356780767, "loss_seg": 0.06508181430399418, "mean_token_accuracy": 0.9950762689113617, "num_tokens": 1005560568.0, "step": 2366 }, { "entropy": 0.018715411890298128, "epoch": 1.035890141153299, "grad_norm": 7.125, "learning_rate": 2.4523551705468327e-05, "loss": 0.1267, "loss_lm": 0.017522528767585754, "loss_seg": 0.1091779712587595, "mean_token_accuracy": 0.9951508492231369, "num_tokens": 1005985358.0, "step": 2367 }, { "entropy": 0.01910148561000824, "epoch": 1.03632782580151, "grad_norm": 7.59375, "learning_rate": 2.452084461288576e-05, "loss": 0.1106, "loss_lm": 0.01615869184024632, "loss_seg": 0.09447379037737846, "mean_token_accuracy": 0.9951367378234863, "num_tokens": 1006409932.0, "step": 2368 }, { "entropy": 0.019709172192960978, "epoch": 1.0367655104497209, "grad_norm": 22.25, "learning_rate": 2.4518137520303198e-05, "loss": 0.1083, "loss_lm": 0.015794393373653293, "loss_seg": 0.09252862818539143, "mean_token_accuracy": 0.9950354397296906, "num_tokens": 1006835317.0, "step": 2369 }, { "entropy": 0.018583699595183134, "epoch": 1.037203195097932, "grad_norm": 10.375, "learning_rate": 2.4515430427720628e-05, "loss": 0.1381, "loss_lm": 0.016880720388144255, "loss_seg": 0.12124234065413475, "mean_token_accuracy": 0.9952555000782013, "num_tokens": 1007260405.0, "step": 2370 }, { "entropy": 0.019493716303259134, "epoch": 1.037640879746143, "grad_norm": 7.125, "learning_rate": 2.4512723335138062e-05, "loss": 0.1984, "loss_lm": 0.015661835437640548, "loss_seg": 0.1827608458697796, "mean_token_accuracy": 0.9950696676969528, "num_tokens": 1007685295.0, "step": 2371 }, { "entropy": 0.019488477148115635, "epoch": 1.038078564394354, "grad_norm": 4.5, "learning_rate": 2.4510016242555495e-05, "loss": 0.1367, "loss_lm": 0.015254465164616704, "loss_seg": 0.12148882634937763, "mean_token_accuracy": 0.9951653182506561, "num_tokens": 1008110087.0, "step": 2372 }, { "entropy": 0.01913502160459757, "epoch": 1.0385162490425648, "grad_norm": 19.5, "learning_rate": 2.450730914997293e-05, "loss": 0.1245, "loss_lm": 0.018155418569222093, "loss_seg": 0.10632205381989479, "mean_token_accuracy": 0.9952136427164078, "num_tokens": 1008534817.0, "step": 2373 }, { "entropy": 0.01926948083564639, "epoch": 1.0389539336907758, "grad_norm": 7.8125, "learning_rate": 2.4504602057390363e-05, "loss": 0.1223, "loss_lm": 0.01776439556851983, "loss_seg": 0.10452230926603079, "mean_token_accuracy": 0.9951942414045334, "num_tokens": 1008960729.0, "step": 2374 }, { "entropy": 0.01884556282311678, "epoch": 1.0393916183389869, "grad_norm": 6.3125, "learning_rate": 2.4501894964807797e-05, "loss": 0.1353, "loss_lm": 0.01697891391813755, "loss_seg": 0.11829387955367565, "mean_token_accuracy": 0.995337039232254, "num_tokens": 1009385309.0, "step": 2375 }, { "entropy": 0.019294302444905043, "epoch": 1.0398293029871977, "grad_norm": 13.6875, "learning_rate": 2.449918787222523e-05, "loss": 0.1222, "loss_lm": 0.018312474247068167, "loss_seg": 0.10387000255286694, "mean_token_accuracy": 0.9950474798679352, "num_tokens": 1009810721.0, "step": 2376 }, { "entropy": 0.01932665891945362, "epoch": 1.0402669876354087, "grad_norm": 7.1875, "learning_rate": 2.4496480779642664e-05, "loss": 0.1695, "loss_lm": 0.01613080850802362, "loss_seg": 0.15334454737603664, "mean_token_accuracy": 0.9952639192342758, "num_tokens": 1010235919.0, "step": 2377 }, { "entropy": 0.019074810203164816, "epoch": 1.0407046722836197, "grad_norm": 540.0, "learning_rate": 2.4493773687060098e-05, "loss": 0.1587, "loss_lm": 0.017705034464597702, "loss_seg": 0.14096244238317013, "mean_token_accuracy": 0.9952584952116013, "num_tokens": 1010660216.0, "step": 2378 }, { "entropy": 0.01932065049186349, "epoch": 1.0411423569318305, "grad_norm": 22.625, "learning_rate": 2.449106659447753e-05, "loss": 0.1299, "loss_lm": 0.018407913390547037, "loss_seg": 0.1115142460912466, "mean_token_accuracy": 0.9951841235160828, "num_tokens": 1011085237.0, "step": 2379 }, { "entropy": 0.01923413760960102, "epoch": 1.0415800415800416, "grad_norm": 7.25, "learning_rate": 2.4488359501894965e-05, "loss": 0.1174, "loss_lm": 0.016114871250465512, "loss_seg": 0.10124300606548786, "mean_token_accuracy": 0.9951943457126617, "num_tokens": 1011510544.0, "step": 2380 }, { "entropy": 0.019209815189242363, "epoch": 1.0420177262282526, "grad_norm": 6.40625, "learning_rate": 2.44856524093124e-05, "loss": 0.1222, "loss_lm": 0.0158599060960114, "loss_seg": 0.1063564233481884, "mean_token_accuracy": 0.9952258318662643, "num_tokens": 1011935713.0, "step": 2381 }, { "entropy": 0.018679560162127018, "epoch": 1.0424554108764634, "grad_norm": 4.34375, "learning_rate": 2.4482945316729833e-05, "loss": 0.1471, "loss_lm": 0.016470753122121096, "loss_seg": 0.1306414809077978, "mean_token_accuracy": 0.9952187091112137, "num_tokens": 1012360435.0, "step": 2382 }, { "entropy": 0.018626593053340912, "epoch": 1.0428930955246745, "grad_norm": 15.5625, "learning_rate": 2.4480238224147266e-05, "loss": 0.1, "loss_lm": 0.017022652318701148, "loss_seg": 0.08293668739497662, "mean_token_accuracy": 0.99529829621315, "num_tokens": 1012785342.0, "step": 2383 }, { "entropy": 0.01896681124344468, "epoch": 1.0433307801728855, "grad_norm": 13.9375, "learning_rate": 2.44775311315647e-05, "loss": 0.1284, "loss_lm": 0.016426448710262775, "loss_seg": 0.11198508366942406, "mean_token_accuracy": 0.9951592236757278, "num_tokens": 1013210876.0, "step": 2384 }, { "entropy": 0.018577646929770708, "epoch": 1.0437684648210963, "grad_norm": 19.25, "learning_rate": 2.4474824038982134e-05, "loss": 0.1043, "loss_lm": 0.015938178403303027, "loss_seg": 0.08833483885973692, "mean_token_accuracy": 0.9952458590269089, "num_tokens": 1013635516.0, "step": 2385 }, { "entropy": 0.01875183591619134, "epoch": 1.0442061494693073, "grad_norm": 6.375, "learning_rate": 2.4472116946399568e-05, "loss": 0.123, "loss_lm": 0.018033907050266862, "loss_seg": 0.10494136996567249, "mean_token_accuracy": 0.9952324330806732, "num_tokens": 1014060232.0, "step": 2386 }, { "entropy": 0.01885755779221654, "epoch": 1.0446438341175184, "grad_norm": 28.75, "learning_rate": 2.4469409853817e-05, "loss": 0.1413, "loss_lm": 0.015927835600450635, "loss_seg": 0.12538444437086582, "mean_token_accuracy": 0.9951714724302292, "num_tokens": 1014485626.0, "step": 2387 }, { "entropy": 0.019367662724107504, "epoch": 1.0450815187657292, "grad_norm": 17.125, "learning_rate": 2.4466702761234435e-05, "loss": 0.0929, "loss_lm": 0.016791976522654295, "loss_seg": 0.0761267188936472, "mean_token_accuracy": 0.9949859976768494, "num_tokens": 1014911014.0, "step": 2388 }, { "entropy": 0.01921122893691063, "epoch": 1.0455192034139402, "grad_norm": 8.0, "learning_rate": 2.446399566865187e-05, "loss": 0.1617, "loss_lm": 0.017195085994899273, "loss_seg": 0.14453038945794106, "mean_token_accuracy": 0.9950935691595078, "num_tokens": 1015336550.0, "step": 2389 }, { "entropy": 0.019020312000066042, "epoch": 1.0459568880621513, "grad_norm": 5.375, "learning_rate": 2.4461288576069303e-05, "loss": 0.1193, "loss_lm": 0.016186327440664172, "loss_seg": 0.10306560434401035, "mean_token_accuracy": 0.995257779955864, "num_tokens": 1015761418.0, "step": 2390 }, { "entropy": 0.018970870412886143, "epoch": 1.046394572710362, "grad_norm": 9.0, "learning_rate": 2.4458581483486736e-05, "loss": 0.1153, "loss_lm": 0.016991494223475456, "loss_seg": 0.09827364515513182, "mean_token_accuracy": 0.9951973706483841, "num_tokens": 1016186559.0, "step": 2391 }, { "entropy": 0.019498018082231283, "epoch": 1.046832257358573, "grad_norm": 5.53125, "learning_rate": 2.445587439090417e-05, "loss": 0.118, "loss_lm": 0.018812935333698988, "loss_seg": 0.09921756573021412, "mean_token_accuracy": 0.9949928820133209, "num_tokens": 1016611144.0, "step": 2392 }, { "entropy": 0.0188647392205894, "epoch": 1.0472699420067841, "grad_norm": 19.5, "learning_rate": 2.4453167298321604e-05, "loss": 0.1018, "loss_lm": 0.015261194435879588, "loss_seg": 0.08656436763703823, "mean_token_accuracy": 0.9952923655509949, "num_tokens": 1017036564.0, "step": 2393 }, { "entropy": 0.01845905790105462, "epoch": 1.0477076266549952, "grad_norm": 20.625, "learning_rate": 2.4450460205739037e-05, "loss": 0.1607, "loss_lm": 0.017653753980994225, "loss_seg": 0.1429971233010292, "mean_token_accuracy": 0.9953073412179947, "num_tokens": 1017461644.0, "step": 2394 }, { "entropy": 0.01919253868982196, "epoch": 1.048145311303206, "grad_norm": 8.1875, "learning_rate": 2.444775311315647e-05, "loss": 0.1302, "loss_lm": 0.015811752527952194, "loss_seg": 0.11440218053758144, "mean_token_accuracy": 0.9951218813657761, "num_tokens": 1017886159.0, "step": 2395 }, { "entropy": 0.019110523629933596, "epoch": 1.048582995951417, "grad_norm": 9.5625, "learning_rate": 2.4445046020573905e-05, "loss": 0.1222, "loss_lm": 0.01568024675361812, "loss_seg": 0.10647168569266796, "mean_token_accuracy": 0.9951692521572113, "num_tokens": 1018311535.0, "step": 2396 }, { "entropy": 0.018657080829143524, "epoch": 1.049020680599628, "grad_norm": 8.0625, "learning_rate": 2.444233892799134e-05, "loss": 0.1394, "loss_lm": 0.016198765952140093, "loss_seg": 0.12323832139372826, "mean_token_accuracy": 0.995232418179512, "num_tokens": 1018736870.0, "step": 2397 }, { "entropy": 0.019618709105998278, "epoch": 1.0494583652478389, "grad_norm": 6.78125, "learning_rate": 2.443963183540877e-05, "loss": 0.1285, "loss_lm": 0.014659687876701355, "loss_seg": 0.11388399079442024, "mean_token_accuracy": 0.9951912611722946, "num_tokens": 1019162298.0, "step": 2398 }, { "entropy": 0.018291885498911142, "epoch": 1.04989604989605, "grad_norm": 4.875, "learning_rate": 2.4436924742826206e-05, "loss": 0.1235, "loss_lm": 0.016818061703816056, "loss_seg": 0.10669941455125809, "mean_token_accuracy": 0.9952626973390579, "num_tokens": 1019587384.0, "step": 2399 }, { "entropy": 0.01900093164294958, "epoch": 1.050333734544261, "grad_norm": 6.6875, "learning_rate": 2.443421765024364e-05, "loss": 0.0988, "loss_lm": 0.01293867314234376, "loss_seg": 0.08585475198924541, "mean_token_accuracy": 0.9950899481773376, "num_tokens": 1020013232.0, "step": 2400 }, { "entropy": 0.01909035211429, "epoch": 1.0507714191924717, "grad_norm": 9.0, "learning_rate": 2.4431510557661073e-05, "loss": 0.0993, "loss_lm": 0.015665883664041758, "loss_seg": 0.08360437862575054, "mean_token_accuracy": 0.9952552318572998, "num_tokens": 1020438280.0, "step": 2401 }, { "entropy": 0.019082187209278345, "epoch": 1.0512091038406828, "grad_norm": 5.03125, "learning_rate": 2.4428803465078507e-05, "loss": 0.143, "loss_lm": 0.017485402058809996, "loss_seg": 0.1255113184452057, "mean_token_accuracy": 0.9951519072055817, "num_tokens": 1020863982.0, "step": 2402 }, { "entropy": 0.017858912236988544, "epoch": 1.0516467884888938, "grad_norm": 24.375, "learning_rate": 2.4426096372495938e-05, "loss": 0.1501, "loss_lm": 0.016392304794862866, "loss_seg": 0.13371814787387848, "mean_token_accuracy": 0.995426595211029, "num_tokens": 1021289190.0, "step": 2403 }, { "entropy": 0.01885406393557787, "epoch": 1.0520844731371046, "grad_norm": 7.15625, "learning_rate": 2.442338927991337e-05, "loss": 0.1485, "loss_lm": 0.016405269969254732, "loss_seg": 0.13206454180181026, "mean_token_accuracy": 0.9951345473527908, "num_tokens": 1021713995.0, "step": 2404 }, { "entropy": 0.019110729917883873, "epoch": 1.0525221577853157, "grad_norm": 5.53125, "learning_rate": 2.442068218733081e-05, "loss": 0.1495, "loss_lm": 0.017616332974284887, "loss_seg": 0.1318507008254528, "mean_token_accuracy": 0.9951214641332626, "num_tokens": 1022139306.0, "step": 2405 }, { "entropy": 0.018909272737801075, "epoch": 1.0529598424335267, "grad_norm": 13.0, "learning_rate": 2.4417975094748242e-05, "loss": 0.1397, "loss_lm": 0.016885580494999886, "loss_seg": 0.1227805744856596, "mean_token_accuracy": 0.9951723963022232, "num_tokens": 1022564528.0, "step": 2406 }, { "entropy": 0.019197622779756784, "epoch": 1.0533975270817375, "grad_norm": 5.1875, "learning_rate": 2.4415268002165676e-05, "loss": 0.1196, "loss_lm": 0.016586901620030403, "loss_seg": 0.10299343429505825, "mean_token_accuracy": 0.995079830288887, "num_tokens": 1022990091.0, "step": 2407 }, { "entropy": 0.01914914371445775, "epoch": 1.0538352117299485, "grad_norm": 39.0, "learning_rate": 2.4412560909583106e-05, "loss": 0.1491, "loss_lm": 0.016288330545648932, "loss_seg": 0.13281740620732307, "mean_token_accuracy": 0.9950712621212006, "num_tokens": 1023415331.0, "step": 2408 }, { "entropy": 0.01894383691251278, "epoch": 1.0542728963781596, "grad_norm": 7.1875, "learning_rate": 2.440985381700054e-05, "loss": 0.1135, "loss_lm": 0.016626962227746844, "loss_seg": 0.09688990190625191, "mean_token_accuracy": 0.99504753947258, "num_tokens": 1023839949.0, "step": 2409 }, { "entropy": 0.018273782450705767, "epoch": 1.0547105810263706, "grad_norm": 6.75, "learning_rate": 2.4407146724417977e-05, "loss": 0.1151, "loss_lm": 0.016316683730110526, "loss_seg": 0.09875372424721718, "mean_token_accuracy": 0.9952241629362106, "num_tokens": 1024264526.0, "step": 2410 }, { "entropy": 0.019400584511458874, "epoch": 1.0551482656745814, "grad_norm": 6.71875, "learning_rate": 2.440443963183541e-05, "loss": 0.1318, "loss_lm": 0.015751716447994113, "loss_seg": 0.11600052285939455, "mean_token_accuracy": 0.995011106133461, "num_tokens": 1024689842.0, "step": 2411 }, { "entropy": 0.019201514776796103, "epoch": 1.0555859503227925, "grad_norm": 9.6875, "learning_rate": 2.4401732539252844e-05, "loss": 0.1867, "loss_lm": 0.014869331382215023, "loss_seg": 0.17187140882015228, "mean_token_accuracy": 0.9950505197048187, "num_tokens": 1025114724.0, "step": 2412 }, { "entropy": 0.01973208598792553, "epoch": 1.0560236349710035, "grad_norm": 12.625, "learning_rate": 2.4399025446670275e-05, "loss": 0.1337, "loss_lm": 0.018854122143238783, "loss_seg": 0.11486594192683697, "mean_token_accuracy": 0.9949441254138947, "num_tokens": 1025540155.0, "step": 2413 }, { "entropy": 0.01876114960759878, "epoch": 1.0564613196192143, "grad_norm": 7.875, "learning_rate": 2.439631835408771e-05, "loss": 0.1133, "loss_lm": 0.016891641076654196, "loss_seg": 0.09645554982125759, "mean_token_accuracy": 0.9953528791666031, "num_tokens": 1025965533.0, "step": 2414 }, { "entropy": 0.018545267637819052, "epoch": 1.0568990042674253, "grad_norm": 11.75, "learning_rate": 2.4393611261505146e-05, "loss": 0.1596, "loss_lm": 0.01566686830483377, "loss_seg": 0.14388910308480263, "mean_token_accuracy": 0.9952921718358994, "num_tokens": 1026390953.0, "step": 2415 }, { "entropy": 0.019013296347111464, "epoch": 1.0573366889156364, "grad_norm": 12.3125, "learning_rate": 2.439090416892258e-05, "loss": 0.0931, "loss_lm": 0.014416966820135713, "loss_seg": 0.0786680644378066, "mean_token_accuracy": 0.9952722042798996, "num_tokens": 1026815840.0, "step": 2416 }, { "entropy": 0.018398545682430267, "epoch": 1.0577743735638472, "grad_norm": 15.5625, "learning_rate": 2.4388197076340013e-05, "loss": 0.1321, "loss_lm": 0.016214559553191066, "loss_seg": 0.11584941297769547, "mean_token_accuracy": 0.9953068792819977, "num_tokens": 1027240886.0, "step": 2417 }, { "entropy": 0.018715667072683573, "epoch": 1.0582120582120582, "grad_norm": 7.34375, "learning_rate": 2.4385489983757443e-05, "loss": 0.0905, "loss_lm": 0.014124652370810509, "loss_seg": 0.0763536598533392, "mean_token_accuracy": 0.9951879978179932, "num_tokens": 1027665839.0, "step": 2418 }, { "entropy": 0.019246583804488182, "epoch": 1.0586497428602692, "grad_norm": 8.5, "learning_rate": 2.4382782891174877e-05, "loss": 0.1583, "loss_lm": 0.016000838251784444, "loss_seg": 0.1422870084643364, "mean_token_accuracy": 0.9950896203517914, "num_tokens": 1028091213.0, "step": 2419 }, { "entropy": 0.01809207908809185, "epoch": 1.05908742750848, "grad_norm": 11.6875, "learning_rate": 2.4380075798592314e-05, "loss": 0.1617, "loss_lm": 0.015822257148101926, "loss_seg": 0.14586662873625755, "mean_token_accuracy": 0.9953448176383972, "num_tokens": 1028516297.0, "step": 2420 }, { "entropy": 0.01844001328572631, "epoch": 1.059525112156691, "grad_norm": 20.0, "learning_rate": 2.4377368706009748e-05, "loss": 0.1594, "loss_lm": 0.014015261083841324, "loss_seg": 0.1453952295705676, "mean_token_accuracy": 0.9954114705324173, "num_tokens": 1028941007.0, "step": 2421 }, { "entropy": 0.0187529600225389, "epoch": 1.0599627968049021, "grad_norm": 4.4375, "learning_rate": 2.437466161342718e-05, "loss": 0.1478, "loss_lm": 0.01693305606022477, "loss_seg": 0.13087179325520992, "mean_token_accuracy": 0.9952714890241623, "num_tokens": 1029366471.0, "step": 2422 }, { "entropy": 0.01827759202569723, "epoch": 1.060400481453113, "grad_norm": 10.625, "learning_rate": 2.4371954520844612e-05, "loss": 0.1394, "loss_lm": 0.013928359374403954, "loss_seg": 0.12549024261534214, "mean_token_accuracy": 0.9953776746988297, "num_tokens": 1029790874.0, "step": 2423 }, { "entropy": 0.01927263056859374, "epoch": 1.060838166101324, "grad_norm": 9.6875, "learning_rate": 2.4369247428262046e-05, "loss": 0.1331, "loss_lm": 0.017210647463798523, "loss_seg": 0.11588780023157597, "mean_token_accuracy": 0.9951882660388947, "num_tokens": 1030216254.0, "step": 2424 }, { "entropy": 0.01859439630061388, "epoch": 1.061275850749535, "grad_norm": 6.84375, "learning_rate": 2.4366540335679483e-05, "loss": 0.1845, "loss_lm": 0.018824305152520537, "loss_seg": 0.16565711237490177, "mean_token_accuracy": 0.9952852129936218, "num_tokens": 1030641893.0, "step": 2425 }, { "entropy": 0.01836224365979433, "epoch": 1.0617135353977458, "grad_norm": 17.25, "learning_rate": 2.4363833243096917e-05, "loss": 0.0887, "loss_lm": 0.016240403056144714, "loss_seg": 0.07246973272413015, "mean_token_accuracy": 0.9953155070543289, "num_tokens": 1031067080.0, "step": 2426 }, { "entropy": 0.01853434508666396, "epoch": 1.0621512200459569, "grad_norm": 13.6875, "learning_rate": 2.4361126150514347e-05, "loss": 0.1166, "loss_lm": 0.01457625231705606, "loss_seg": 0.10203816927969456, "mean_token_accuracy": 0.9952893108129501, "num_tokens": 1031492864.0, "step": 2427 }, { "entropy": 0.018905386328697205, "epoch": 1.062588904694168, "grad_norm": 7.90625, "learning_rate": 2.435841905793178e-05, "loss": 0.1062, "loss_lm": 0.018453153083100915, "loss_seg": 0.08774688094854355, "mean_token_accuracy": 0.9950932115316391, "num_tokens": 1031917367.0, "step": 2428 }, { "entropy": 0.019310864619910717, "epoch": 1.0630265893423787, "grad_norm": 7.5625, "learning_rate": 2.4355711965349214e-05, "loss": 0.1263, "loss_lm": 0.016710615251213312, "loss_seg": 0.10957729443907738, "mean_token_accuracy": 0.9950947165489197, "num_tokens": 1032343007.0, "step": 2429 }, { "entropy": 0.01937743229791522, "epoch": 1.0634642739905897, "grad_norm": 6.6875, "learning_rate": 2.435300487276665e-05, "loss": 0.1171, "loss_lm": 0.016677369130775332, "loss_seg": 0.10045564081519842, "mean_token_accuracy": 0.9950575679540634, "num_tokens": 1032768498.0, "step": 2430 }, { "entropy": 0.018877317663282156, "epoch": 1.0639019586388008, "grad_norm": 11.5625, "learning_rate": 2.4350297780184085e-05, "loss": 0.1204, "loss_lm": 0.015742886811494827, "loss_seg": 0.104674456641078, "mean_token_accuracy": 0.9952540695667267, "num_tokens": 1033193662.0, "step": 2431 }, { "entropy": 0.01922926865518093, "epoch": 1.0643396432870118, "grad_norm": 8.5, "learning_rate": 2.4347590687601516e-05, "loss": 0.135, "loss_lm": 0.019055480137467384, "loss_seg": 0.11590256914496422, "mean_token_accuracy": 0.9951243847608566, "num_tokens": 1033618609.0, "step": 2432 }, { "entropy": 0.018611049745231867, "epoch": 1.0647773279352226, "grad_norm": 15.8125, "learning_rate": 2.434488359501895e-05, "loss": 0.1423, "loss_lm": 0.015667304396629333, "loss_seg": 0.12658920511603355, "mean_token_accuracy": 0.9953245967626572, "num_tokens": 1034044194.0, "step": 2433 }, { "entropy": 0.018879331182688475, "epoch": 1.0652150125834337, "grad_norm": 11.75, "learning_rate": 2.4342176502436383e-05, "loss": 0.1334, "loss_lm": 0.016645091818645597, "loss_seg": 0.11676374636590481, "mean_token_accuracy": 0.9952212870121002, "num_tokens": 1034468478.0, "step": 2434 }, { "entropy": 0.01869609672576189, "epoch": 1.0656526972316447, "grad_norm": 5.65625, "learning_rate": 2.4339469409853817e-05, "loss": 0.129, "loss_lm": 0.01756636705249548, "loss_seg": 0.1114137601107359, "mean_token_accuracy": 0.9951997995376587, "num_tokens": 1034893442.0, "step": 2435 }, { "entropy": 0.01890484057366848, "epoch": 1.0660903818798555, "grad_norm": 3.578125, "learning_rate": 2.4336762317271254e-05, "loss": 0.1047, "loss_lm": 0.019186111399903893, "loss_seg": 0.08554561622440815, "mean_token_accuracy": 0.9951465129852295, "num_tokens": 1035317872.0, "step": 2436 }, { "entropy": 0.018013480119407177, "epoch": 1.0665280665280665, "grad_norm": 9.875, "learning_rate": 2.4334055224688684e-05, "loss": 0.1261, "loss_lm": 0.01813549571670592, "loss_seg": 0.10794452391564846, "mean_token_accuracy": 0.9953771829605103, "num_tokens": 1035742669.0, "step": 2437 }, { "entropy": 0.018912336323410273, "epoch": 1.0669657511762776, "grad_norm": 8.75, "learning_rate": 2.4331348132106118e-05, "loss": 0.1303, "loss_lm": 0.016545224469155073, "loss_seg": 0.11378588154911995, "mean_token_accuracy": 0.9952143728733063, "num_tokens": 1036168051.0, "step": 2438 }, { "entropy": 0.01826710719615221, "epoch": 1.0674034358244884, "grad_norm": 8.0, "learning_rate": 2.432864103952355e-05, "loss": 0.1542, "loss_lm": 0.014976218808442354, "loss_seg": 0.13918237388134003, "mean_token_accuracy": 0.9953212440013885, "num_tokens": 1036593129.0, "step": 2439 }, { "entropy": 0.01866288622841239, "epoch": 1.0678411204726994, "grad_norm": 10.3125, "learning_rate": 2.4325933946940985e-05, "loss": 0.103, "loss_lm": 0.016979453153908253, "loss_seg": 0.08598482422530651, "mean_token_accuracy": 0.9952539056539536, "num_tokens": 1037017660.0, "step": 2440 }, { "entropy": 0.018766355235129595, "epoch": 1.0682788051209104, "grad_norm": 8.6875, "learning_rate": 2.4323226854358423e-05, "loss": 0.1418, "loss_lm": 0.018593054031953216, "loss_seg": 0.12322514317929745, "mean_token_accuracy": 0.9952453970909119, "num_tokens": 1037443077.0, "step": 2441 }, { "entropy": 0.01836878852918744, "epoch": 1.0687164897691213, "grad_norm": 6.3125, "learning_rate": 2.4320519761775853e-05, "loss": 0.1502, "loss_lm": 0.01689096074551344, "loss_seg": 0.13335790112614632, "mean_token_accuracy": 0.9952903836965561, "num_tokens": 1037868128.0, "step": 2442 }, { "entropy": 0.018486580345779657, "epoch": 1.0691541744173323, "grad_norm": 5.25, "learning_rate": 2.4317812669193287e-05, "loss": 0.1114, "loss_lm": 0.014821006217971444, "loss_seg": 0.09653587080538273, "mean_token_accuracy": 0.9952280819416046, "num_tokens": 1038293489.0, "step": 2443 }, { "entropy": 0.019227695651352406, "epoch": 1.0695918590655433, "grad_norm": 21.375, "learning_rate": 2.431510557661072e-05, "loss": 0.1322, "loss_lm": 0.018353143706917763, "loss_seg": 0.1138305002823472, "mean_token_accuracy": 0.9950827360153198, "num_tokens": 1038719124.0, "step": 2444 }, { "entropy": 0.018709500320255756, "epoch": 1.0700295437137541, "grad_norm": 7.65625, "learning_rate": 2.4312398484028154e-05, "loss": 0.1551, "loss_lm": 0.018867611419409513, "loss_seg": 0.13618577271699905, "mean_token_accuracy": 0.9952661544084549, "num_tokens": 1039143903.0, "step": 2445 }, { "entropy": 0.019111644476652145, "epoch": 1.0704672283619652, "grad_norm": 30.0, "learning_rate": 2.4309691391445588e-05, "loss": 0.1538, "loss_lm": 0.01715273456647992, "loss_seg": 0.13660632260143757, "mean_token_accuracy": 0.9950902163982391, "num_tokens": 1039568972.0, "step": 2446 }, { "entropy": 0.018396624363958836, "epoch": 1.0709049130101762, "grad_norm": 11.25, "learning_rate": 2.430698429886302e-05, "loss": 0.1283, "loss_lm": 0.016717947088181973, "loss_seg": 0.11154459044337273, "mean_token_accuracy": 0.995232418179512, "num_tokens": 1039993858.0, "step": 2447 }, { "entropy": 0.01941335154697299, "epoch": 1.0713425976583872, "grad_norm": 9.0625, "learning_rate": 2.4304277206280455e-05, "loss": 0.1534, "loss_lm": 0.020130069460719824, "loss_seg": 0.13331225886940956, "mean_token_accuracy": 0.995015412569046, "num_tokens": 1040419156.0, "step": 2448 }, { "entropy": 0.018281124532222748, "epoch": 1.071780282306598, "grad_norm": 5.78125, "learning_rate": 2.430157011369789e-05, "loss": 0.1239, "loss_lm": 0.018931864760816097, "loss_seg": 0.10492652468383312, "mean_token_accuracy": 0.9952160269021988, "num_tokens": 1040844036.0, "step": 2449 }, { "entropy": 0.018819124903529882, "epoch": 1.072217966954809, "grad_norm": 6.1875, "learning_rate": 2.4298863021115323e-05, "loss": 0.1012, "loss_lm": 0.01551997964270413, "loss_seg": 0.0856763357296586, "mean_token_accuracy": 0.9952312558889389, "num_tokens": 1041269271.0, "step": 2450 }, { "entropy": 0.01951845223084092, "epoch": 1.0726556516030201, "grad_norm": 6.6875, "learning_rate": 2.4296155928532756e-05, "loss": 0.0956, "loss_lm": 0.01464991713874042, "loss_seg": 0.08090462442487478, "mean_token_accuracy": 0.9951104819774628, "num_tokens": 1041694037.0, "step": 2451 }, { "entropy": 0.018467184156179428, "epoch": 1.073093336251231, "grad_norm": 9.9375, "learning_rate": 2.429344883595019e-05, "loss": 0.1538, "loss_lm": 0.014936415012925863, "loss_seg": 0.13886435329914093, "mean_token_accuracy": 0.9953383505344391, "num_tokens": 1042119803.0, "step": 2452 }, { "entropy": 0.01889012660831213, "epoch": 1.073531020899442, "grad_norm": 7.3125, "learning_rate": 2.4290741743367624e-05, "loss": 0.1239, "loss_lm": 0.01519629918038845, "loss_seg": 0.108692966401577, "mean_token_accuracy": 0.9953396022319794, "num_tokens": 1042544840.0, "step": 2453 }, { "entropy": 0.019563852343708277, "epoch": 1.073968705547653, "grad_norm": 5.0625, "learning_rate": 2.4288034650785058e-05, "loss": 0.1046, "loss_lm": 0.01848843414336443, "loss_seg": 0.08607548475265503, "mean_token_accuracy": 0.9950795769691467, "num_tokens": 1042970384.0, "step": 2454 }, { "entropy": 0.019087246619164944, "epoch": 1.0744063901958638, "grad_norm": 9.9375, "learning_rate": 2.428532755820249e-05, "loss": 0.1433, "loss_lm": 0.016964834881946445, "loss_seg": 0.12630905397236347, "mean_token_accuracy": 0.9952047914266586, "num_tokens": 1043395149.0, "step": 2455 }, { "entropy": 0.019636422861367464, "epoch": 1.0748440748440748, "grad_norm": 11.6875, "learning_rate": 2.4282620465619925e-05, "loss": 0.1313, "loss_lm": 0.01765119144693017, "loss_seg": 0.11369835026562214, "mean_token_accuracy": 0.9951162338256836, "num_tokens": 1043820202.0, "step": 2456 }, { "entropy": 0.019603589083999395, "epoch": 1.0752817594922859, "grad_norm": 9.3125, "learning_rate": 2.427991337303736e-05, "loss": 0.1182, "loss_lm": 0.01617264817468822, "loss_seg": 0.1020753663033247, "mean_token_accuracy": 0.9950162172317505, "num_tokens": 1044245774.0, "step": 2457 }, { "entropy": 0.019810060504823923, "epoch": 1.0757194441404967, "grad_norm": 12.0625, "learning_rate": 2.4277206280454792e-05, "loss": 0.2321, "loss_lm": 0.017398591386154294, "loss_seg": 0.214742012321949, "mean_token_accuracy": 0.9950222969055176, "num_tokens": 1044671441.0, "step": 2458 }, { "entropy": 0.01956113986670971, "epoch": 1.0761571287887077, "grad_norm": 5.40625, "learning_rate": 2.4274499187872226e-05, "loss": 0.1446, "loss_lm": 0.015003282343968749, "loss_seg": 0.12955215387046337, "mean_token_accuracy": 0.9949575513601303, "num_tokens": 1045096393.0, "step": 2459 }, { "entropy": 0.018919637892395258, "epoch": 1.0765948134369188, "grad_norm": 8.6875, "learning_rate": 2.427179209528966e-05, "loss": 0.0985, "loss_lm": 0.014537395210936666, "loss_seg": 0.08399843145161867, "mean_token_accuracy": 0.9951442182064056, "num_tokens": 1045521220.0, "step": 2460 }, { "entropy": 0.018485783133655787, "epoch": 1.0770324980851296, "grad_norm": 9.0625, "learning_rate": 2.4269085002707094e-05, "loss": 0.1264, "loss_lm": 0.016072590136900544, "loss_seg": 0.11034332402050495, "mean_token_accuracy": 0.9953731894493103, "num_tokens": 1045945857.0, "step": 2461 }, { "entropy": 0.01896617515012622, "epoch": 1.0774701827333406, "grad_norm": 7.03125, "learning_rate": 2.4266377910124527e-05, "loss": 0.0964, "loss_lm": 0.01687928126193583, "loss_seg": 0.07949281483888626, "mean_token_accuracy": 0.9952410310506821, "num_tokens": 1046370902.0, "step": 2462 }, { "entropy": 0.018547603860497475, "epoch": 1.0779078673815516, "grad_norm": 9.5625, "learning_rate": 2.426367081754196e-05, "loss": 0.1811, "loss_lm": 0.01675031427294016, "loss_seg": 0.1643937435001135, "mean_token_accuracy": 0.9953944087028503, "num_tokens": 1046795997.0, "step": 2463 }, { "entropy": 0.019336730241775513, "epoch": 1.0783455520297625, "grad_norm": 9.25, "learning_rate": 2.4260963724959395e-05, "loss": 0.1739, "loss_lm": 0.016223897226154804, "loss_seg": 0.1576891802251339, "mean_token_accuracy": 0.9951306283473969, "num_tokens": 1047221272.0, "step": 2464 }, { "entropy": 0.018814354669302702, "epoch": 1.0787832366779735, "grad_norm": 6.375, "learning_rate": 2.425825663237683e-05, "loss": 0.1428, "loss_lm": 0.015325962100178003, "loss_seg": 0.12751036509871483, "mean_token_accuracy": 0.9951999932527542, "num_tokens": 1047646042.0, "step": 2465 }, { "entropy": 0.018810574896633625, "epoch": 1.0792209213261845, "grad_norm": 14.0625, "learning_rate": 2.425554953979426e-05, "loss": 0.1368, "loss_lm": 0.016841925214976072, "loss_seg": 0.11996088176965714, "mean_token_accuracy": 0.9953000843524933, "num_tokens": 1048070902.0, "step": 2466 }, { "entropy": 0.01905608782544732, "epoch": 1.0796586059743953, "grad_norm": 16.5, "learning_rate": 2.4252842447211696e-05, "loss": 0.1171, "loss_lm": 0.01599513180553913, "loss_seg": 0.1011128993704915, "mean_token_accuracy": 0.9952002465724945, "num_tokens": 1048495661.0, "step": 2467 }, { "entropy": 0.018847227096557617, "epoch": 1.0800962906226064, "grad_norm": 16.5, "learning_rate": 2.425013535462913e-05, "loss": 0.1132, "loss_lm": 0.017452312167733908, "loss_seg": 0.09576773457229137, "mean_token_accuracy": 0.9953977912664413, "num_tokens": 1048920629.0, "step": 2468 }, { "entropy": 0.0192286791279912, "epoch": 1.0805339752708174, "grad_norm": 7.4375, "learning_rate": 2.4247428262046563e-05, "loss": 0.1194, "loss_lm": 0.015946148661896586, "loss_seg": 0.10349776968359947, "mean_token_accuracy": 0.9952445775270462, "num_tokens": 1049345798.0, "step": 2469 }, { "entropy": 0.018845309037715197, "epoch": 1.0809716599190284, "grad_norm": 9.5, "learning_rate": 2.4244721169463994e-05, "loss": 0.1393, "loss_lm": 0.01530885137617588, "loss_seg": 0.12399106193333864, "mean_token_accuracy": 0.9952191263437271, "num_tokens": 1049771636.0, "step": 2470 }, { "entropy": 0.01950691919773817, "epoch": 1.0814093445672393, "grad_norm": 9.5625, "learning_rate": 2.4242014076881428e-05, "loss": 0.1286, "loss_lm": 0.0127677449490875, "loss_seg": 0.11587913241237402, "mean_token_accuracy": 0.9950901567935944, "num_tokens": 1050196675.0, "step": 2471 }, { "entropy": 0.019143555779010057, "epoch": 1.0818470292154503, "grad_norm": 8.3125, "learning_rate": 2.4239306984298865e-05, "loss": 0.1543, "loss_lm": 0.01898218016140163, "loss_seg": 0.1353074610233307, "mean_token_accuracy": 0.9951915144920349, "num_tokens": 1050621926.0, "step": 2472 }, { "entropy": 0.01923261396586895, "epoch": 1.0822847138636613, "grad_norm": 5.84375, "learning_rate": 2.42365998917163e-05, "loss": 0.1514, "loss_lm": 0.01898231147788465, "loss_seg": 0.13242243975400925, "mean_token_accuracy": 0.9951305091381073, "num_tokens": 1051047397.0, "step": 2473 }, { "entropy": 0.019326599314808846, "epoch": 1.0827223985118721, "grad_norm": 7.28125, "learning_rate": 2.4233892799133732e-05, "loss": 0.1824, "loss_lm": 0.016650006640702486, "loss_seg": 0.16577011719346046, "mean_token_accuracy": 0.9950373321771622, "num_tokens": 1051472360.0, "step": 2474 }, { "entropy": 0.019216276705265045, "epoch": 1.0831600831600832, "grad_norm": 7.25, "learning_rate": 2.4231185706551162e-05, "loss": 0.1037, "loss_lm": 0.01599225332029164, "loss_seg": 0.08773198537528515, "mean_token_accuracy": 0.9951682090759277, "num_tokens": 1051897834.0, "step": 2475 }, { "entropy": 0.018949963618069887, "epoch": 1.0835977678082942, "grad_norm": 14.0, "learning_rate": 2.4228478613968596e-05, "loss": 0.1807, "loss_lm": 0.014961286215111613, "loss_seg": 0.16576329246163368, "mean_token_accuracy": 0.9951662570238113, "num_tokens": 1052322734.0, "step": 2476 }, { "entropy": 0.01867740787565708, "epoch": 1.084035452456505, "grad_norm": 8.625, "learning_rate": 2.4225771521386033e-05, "loss": 0.1123, "loss_lm": 0.016587106743827462, "loss_seg": 0.09573512896895409, "mean_token_accuracy": 0.9953426271677017, "num_tokens": 1052747569.0, "step": 2477 }, { "entropy": 0.018998883664608, "epoch": 1.084473137104716, "grad_norm": 8.4375, "learning_rate": 2.4223064428803467e-05, "loss": 0.1167, "loss_lm": 0.0174783980473876, "loss_seg": 0.09918436594307423, "mean_token_accuracy": 0.9952006638050079, "num_tokens": 1053172607.0, "step": 2478 }, { "entropy": 0.019317958503961563, "epoch": 1.084910821752927, "grad_norm": 9.375, "learning_rate": 2.42203573362209e-05, "loss": 0.1905, "loss_lm": 0.01675054500810802, "loss_seg": 0.17379105463624, "mean_token_accuracy": 0.9950156807899475, "num_tokens": 1053598122.0, "step": 2479 }, { "entropy": 0.019533506594598293, "epoch": 1.085348506401138, "grad_norm": 6.8125, "learning_rate": 2.421765024363833e-05, "loss": 0.1405, "loss_lm": 0.01764407311566174, "loss_seg": 0.12282810732722282, "mean_token_accuracy": 0.9950933456420898, "num_tokens": 1054023643.0, "step": 2480 }, { "entropy": 0.018844107165932655, "epoch": 1.085786191049349, "grad_norm": 9.875, "learning_rate": 2.4214943151055765e-05, "loss": 0.1303, "loss_lm": 0.015116638038307428, "loss_seg": 0.11518904101103544, "mean_token_accuracy": 0.9951783120632172, "num_tokens": 1054448526.0, "step": 2481 }, { "entropy": 0.0185399753972888, "epoch": 1.08622387569756, "grad_norm": 7.40625, "learning_rate": 2.4212236058473202e-05, "loss": 0.1393, "loss_lm": 0.019130602478981018, "loss_seg": 0.12021525576710701, "mean_token_accuracy": 0.9952707439661026, "num_tokens": 1054873709.0, "step": 2482 }, { "entropy": 0.018841763958334923, "epoch": 1.0866615603457708, "grad_norm": 47.25, "learning_rate": 2.4209528965890636e-05, "loss": 0.2041, "loss_lm": 0.014249295461922884, "loss_seg": 0.18982897326350212, "mean_token_accuracy": 0.9952598661184311, "num_tokens": 1055298516.0, "step": 2483 }, { "entropy": 0.019080779515206814, "epoch": 1.0870992449939818, "grad_norm": 10.9375, "learning_rate": 2.420682187330807e-05, "loss": 0.119, "loss_lm": 0.014852075604721904, "loss_seg": 0.1041424423456192, "mean_token_accuracy": 0.9951639175415039, "num_tokens": 1055724229.0, "step": 2484 }, { "entropy": 0.018603547010570765, "epoch": 1.0875369296421928, "grad_norm": 7.21875, "learning_rate": 2.42041147807255e-05, "loss": 0.0962, "loss_lm": 0.016091041965410113, "loss_seg": 0.08012750372290611, "mean_token_accuracy": 0.9953372478485107, "num_tokens": 1056148841.0, "step": 2485 }, { "entropy": 0.0185845959931612, "epoch": 1.0879746142904039, "grad_norm": 12.3125, "learning_rate": 2.4201407688142933e-05, "loss": 0.135, "loss_lm": 0.01668235007673502, "loss_seg": 0.11829794570803642, "mean_token_accuracy": 0.995285838842392, "num_tokens": 1056573899.0, "step": 2486 }, { "entropy": 0.01885743159800768, "epoch": 1.0884122989386147, "grad_norm": 13.0, "learning_rate": 2.419870059556037e-05, "loss": 0.1633, "loss_lm": 0.01617493969388306, "loss_seg": 0.14715716056525707, "mean_token_accuracy": 0.9951824843883514, "num_tokens": 1056998376.0, "step": 2487 }, { "entropy": 0.0194847509264946, "epoch": 1.0888499835868257, "grad_norm": 9.125, "learning_rate": 2.4195993502977804e-05, "loss": 0.1628, "loss_lm": 0.017476009437814355, "loss_seg": 0.14535259827971458, "mean_token_accuracy": 0.9949910789728165, "num_tokens": 1057423005.0, "step": 2488 }, { "entropy": 0.018178242724388838, "epoch": 1.0892876682350368, "grad_norm": 30.125, "learning_rate": 2.4193286410395235e-05, "loss": 0.111, "loss_lm": 0.014949103351682425, "loss_seg": 0.0960787059739232, "mean_token_accuracy": 0.995386004447937, "num_tokens": 1057847518.0, "step": 2489 }, { "entropy": 0.018524435348808765, "epoch": 1.0897253528832476, "grad_norm": 7.96875, "learning_rate": 2.419057931781267e-05, "loss": 0.0981, "loss_lm": 0.016137764090672135, "loss_seg": 0.08199773542582989, "mean_token_accuracy": 0.9952545315027237, "num_tokens": 1058272697.0, "step": 2490 }, { "entropy": 0.018902663607150316, "epoch": 1.0901630375314586, "grad_norm": 7.40625, "learning_rate": 2.4187872225230102e-05, "loss": 0.1233, "loss_lm": 0.015948735643178225, "loss_seg": 0.10732695460319519, "mean_token_accuracy": 0.9951366484165192, "num_tokens": 1058698303.0, "step": 2491 }, { "entropy": 0.019104789942502975, "epoch": 1.0906007221796696, "grad_norm": 6.3125, "learning_rate": 2.418516513264754e-05, "loss": 0.1072, "loss_lm": 0.016896935179829597, "loss_seg": 0.09028295241296291, "mean_token_accuracy": 0.9951622188091278, "num_tokens": 1059123269.0, "step": 2492 }, { "entropy": 0.018264256417751312, "epoch": 1.0910384068278804, "grad_norm": 12.4375, "learning_rate": 2.4182458040064973e-05, "loss": 0.1545, "loss_lm": 0.017160880845040083, "loss_seg": 0.1373802050948143, "mean_token_accuracy": 0.9953165650367737, "num_tokens": 1059548127.0, "step": 2493 }, { "entropy": 0.019318751990795135, "epoch": 1.0914760914760915, "grad_norm": 6.8125, "learning_rate": 2.4179750947482403e-05, "loss": 0.1499, "loss_lm": 0.01593037135899067, "loss_seg": 0.133933924138546, "mean_token_accuracy": 0.9951023459434509, "num_tokens": 1059974016.0, "step": 2494 }, { "entropy": 0.018655063584446907, "epoch": 1.0919137761243025, "grad_norm": 15.125, "learning_rate": 2.4177043854899837e-05, "loss": 0.087, "loss_lm": 0.016713808057829738, "loss_seg": 0.07031348720192909, "mean_token_accuracy": 0.9952705800533295, "num_tokens": 1060398765.0, "step": 2495 }, { "entropy": 0.01897237077355385, "epoch": 1.0923514607725133, "grad_norm": 11.5625, "learning_rate": 2.417433676231727e-05, "loss": 0.1396, "loss_lm": 0.016977182356640697, "loss_seg": 0.12265145126730204, "mean_token_accuracy": 0.9951080679893494, "num_tokens": 1060824134.0, "step": 2496 }, { "entropy": 0.01887418609112501, "epoch": 1.0927891454207244, "grad_norm": 9.75, "learning_rate": 2.4171629669734708e-05, "loss": 0.1171, "loss_lm": 0.016306451987475157, "loss_seg": 0.10080177150666714, "mean_token_accuracy": 0.9952215254306793, "num_tokens": 1061249762.0, "step": 2497 }, { "entropy": 0.018799212761223316, "epoch": 1.0932268300689354, "grad_norm": 9.125, "learning_rate": 2.416892257715214e-05, "loss": 0.1905, "loss_lm": 0.01893604244105518, "loss_seg": 0.17151712626218796, "mean_token_accuracy": 0.9952752739191055, "num_tokens": 1061674718.0, "step": 2498 }, { "entropy": 0.01918961526826024, "epoch": 1.0936645147171462, "grad_norm": 7.5625, "learning_rate": 2.4166215484569572e-05, "loss": 0.0883, "loss_lm": 0.017418119590729475, "loss_seg": 0.07091750111430883, "mean_token_accuracy": 0.9950410425662994, "num_tokens": 1062099574.0, "step": 2499 }, { "entropy": 0.019310460425913334, "epoch": 1.0941021993653572, "grad_norm": 6.09375, "learning_rate": 2.4163508391987006e-05, "loss": 0.1416, "loss_lm": 0.018007936887443066, "loss_seg": 0.12358074449002743, "mean_token_accuracy": 0.995082214474678, "num_tokens": 1062524953.0, "step": 2500 }, { "entropy": 0.018349943216890097, "epoch": 1.0945398840135683, "grad_norm": 9.6875, "learning_rate": 2.416080129940444e-05, "loss": 0.1098, "loss_lm": 0.014762532897293568, "loss_seg": 0.09505069628357887, "mean_token_accuracy": 0.9953418970108032, "num_tokens": 1062949745.0, "step": 2501 }, { "entropy": 0.01914913160726428, "epoch": 1.094977568661779, "grad_norm": 15.5, "learning_rate": 2.4158094206821873e-05, "loss": 0.1601, "loss_lm": 0.016733028227463365, "loss_seg": 0.143369410187006, "mean_token_accuracy": 0.9951623678207397, "num_tokens": 1063374688.0, "step": 2502 }, { "entropy": 0.019139069132506847, "epoch": 1.0954152533099901, "grad_norm": 16.75, "learning_rate": 2.415538711423931e-05, "loss": 0.087, "loss_lm": 0.014656544663012028, "loss_seg": 0.07235012855380774, "mean_token_accuracy": 0.9952305108308792, "num_tokens": 1063798988.0, "step": 2503 }, { "entropy": 0.018714841920882463, "epoch": 1.0958529379582012, "grad_norm": 13.6875, "learning_rate": 2.415268002165674e-05, "loss": 0.1039, "loss_lm": 0.01474899472668767, "loss_seg": 0.08910834416747093, "mean_token_accuracy": 0.9951813817024231, "num_tokens": 1064223566.0, "step": 2504 }, { "entropy": 0.01943361572921276, "epoch": 1.096290622606412, "grad_norm": 7.0, "learning_rate": 2.4149972929074174e-05, "loss": 0.1757, "loss_lm": 0.017034580698236823, "loss_seg": 0.15863217785954475, "mean_token_accuracy": 0.9950691312551498, "num_tokens": 1064650063.0, "step": 2505 }, { "entropy": 0.01928722020238638, "epoch": 1.096728307254623, "grad_norm": 16.75, "learning_rate": 2.4147265836491608e-05, "loss": 0.1174, "loss_lm": 0.016691228607669473, "loss_seg": 0.10075099393725395, "mean_token_accuracy": 0.9950316846370697, "num_tokens": 1065075548.0, "step": 2506 }, { "entropy": 0.01886826753616333, "epoch": 1.097165991902834, "grad_norm": 14.6875, "learning_rate": 2.414455874390904e-05, "loss": 0.1155, "loss_lm": 0.01636734721250832, "loss_seg": 0.09909883514046669, "mean_token_accuracy": 0.9951025396585464, "num_tokens": 1065500228.0, "step": 2507 }, { "entropy": 0.01895914925262332, "epoch": 1.097603676551045, "grad_norm": 5.125, "learning_rate": 2.414185165132648e-05, "loss": 0.105, "loss_lm": 0.019235045183449984, "loss_seg": 0.08575000986456871, "mean_token_accuracy": 0.9953013360500336, "num_tokens": 1065926250.0, "step": 2508 }, { "entropy": 0.01892227865755558, "epoch": 1.0980413611992559, "grad_norm": 5.375, "learning_rate": 2.413914455874391e-05, "loss": 0.1012, "loss_lm": 0.016922117210924625, "loss_seg": 0.0842729089781642, "mean_token_accuracy": 0.9952162653207779, "num_tokens": 1066350563.0, "step": 2509 }, { "entropy": 0.018448358867317438, "epoch": 1.098479045847467, "grad_norm": 8.1875, "learning_rate": 2.4136437466161343e-05, "loss": 0.1325, "loss_lm": 0.014986739959567785, "loss_seg": 0.11749676614999771, "mean_token_accuracy": 0.9953038394451141, "num_tokens": 1066775756.0, "step": 2510 }, { "entropy": 0.01850575627759099, "epoch": 1.098916730495678, "grad_norm": 16.125, "learning_rate": 2.4133730373578777e-05, "loss": 0.1168, "loss_lm": 0.015162966214120388, "loss_seg": 0.1016188245266676, "mean_token_accuracy": 0.9952622950077057, "num_tokens": 1067201025.0, "step": 2511 }, { "entropy": 0.01832391368225217, "epoch": 1.0993544151438888, "grad_norm": 16.75, "learning_rate": 2.413102328099621e-05, "loss": 0.1147, "loss_lm": 0.016740321181714535, "loss_seg": 0.09799547307193279, "mean_token_accuracy": 0.9952461868524551, "num_tokens": 1067626337.0, "step": 2512 }, { "entropy": 0.01930468762293458, "epoch": 1.0997920997920998, "grad_norm": 6.15625, "learning_rate": 2.4128316188413644e-05, "loss": 0.109, "loss_lm": 0.017971906810998917, "loss_seg": 0.09106234926730394, "mean_token_accuracy": 0.9950840622186661, "num_tokens": 1068051410.0, "step": 2513 }, { "entropy": 0.01831521838903427, "epoch": 1.1002297844403108, "grad_norm": 6.28125, "learning_rate": 2.4125609095831078e-05, "loss": 0.1522, "loss_lm": 0.017946169478818774, "loss_seg": 0.13428531028330326, "mean_token_accuracy": 0.9952993839979172, "num_tokens": 1068476423.0, "step": 2514 }, { "entropy": 0.01819819863885641, "epoch": 1.1006674690885216, "grad_norm": 3.953125, "learning_rate": 2.412290200324851e-05, "loss": 0.1021, "loss_lm": 0.015222584595903754, "loss_seg": 0.08690264634788036, "mean_token_accuracy": 0.9953448623418808, "num_tokens": 1068901476.0, "step": 2515 }, { "entropy": 0.018914147280156612, "epoch": 1.1011051537367327, "grad_norm": 8.5625, "learning_rate": 2.4120194910665945e-05, "loss": 0.1105, "loss_lm": 0.017313765361905098, "loss_seg": 0.0931747816503048, "mean_token_accuracy": 0.9951976835727692, "num_tokens": 1069327489.0, "step": 2516 }, { "entropy": 0.0185935921035707, "epoch": 1.1015428383849437, "grad_norm": 4.53125, "learning_rate": 2.411748781808338e-05, "loss": 0.1325, "loss_lm": 0.015503925969824195, "loss_seg": 0.1170442458242178, "mean_token_accuracy": 0.9952164590358734, "num_tokens": 1069752038.0, "step": 2517 }, { "entropy": 0.018856933806091547, "epoch": 1.1019805230331545, "grad_norm": 6.84375, "learning_rate": 2.4114780725500813e-05, "loss": 0.1468, "loss_lm": 0.017328161047771573, "loss_seg": 0.12944298796355724, "mean_token_accuracy": 0.9951533675193787, "num_tokens": 1070176437.0, "step": 2518 }, { "entropy": 0.018721472937613726, "epoch": 1.1024182076813656, "grad_norm": 6.4375, "learning_rate": 2.4112073632918246e-05, "loss": 0.1043, "loss_lm": 0.017434724839404225, "loss_seg": 0.08691061567515135, "mean_token_accuracy": 0.9952454566955566, "num_tokens": 1070601251.0, "step": 2519 }, { "entropy": 0.0180455781519413, "epoch": 1.1028558923295766, "grad_norm": 9.8125, "learning_rate": 2.410936654033568e-05, "loss": 0.1385, "loss_lm": 0.016745975706726313, "loss_seg": 0.12176656536757946, "mean_token_accuracy": 0.9953863471746445, "num_tokens": 1071025585.0, "step": 2520 }, { "entropy": 0.018386175855994225, "epoch": 1.1032935769777874, "grad_norm": 11.4375, "learning_rate": 2.4106659447753114e-05, "loss": 0.1145, "loss_lm": 0.018187259789556265, "loss_seg": 0.0962881138548255, "mean_token_accuracy": 0.9952306747436523, "num_tokens": 1071450344.0, "step": 2521 }, { "entropy": 0.019085606560111046, "epoch": 1.1037312616259984, "grad_norm": 5.5, "learning_rate": 2.4103952355170548e-05, "loss": 0.1459, "loss_lm": 0.016487905522808433, "loss_seg": 0.12942836619913578, "mean_token_accuracy": 0.9951197654008865, "num_tokens": 1071875896.0, "step": 2522 }, { "entropy": 0.018661280628293753, "epoch": 1.1041689462742095, "grad_norm": 9.375, "learning_rate": 2.410124526258798e-05, "loss": 0.217, "loss_lm": 0.016101542860269547, "loss_seg": 0.20087412185966969, "mean_token_accuracy": 0.9952719062566757, "num_tokens": 1072300754.0, "step": 2523 }, { "entropy": 0.018605954945087433, "epoch": 1.1046066309224205, "grad_norm": 10.6875, "learning_rate": 2.4098538170005415e-05, "loss": 0.1144, "loss_lm": 0.015467789256945252, "loss_seg": 0.09892113506793976, "mean_token_accuracy": 0.9952901601791382, "num_tokens": 1072725779.0, "step": 2524 }, { "entropy": 0.018129588570445776, "epoch": 1.1050443155706313, "grad_norm": 13.4375, "learning_rate": 2.409583107742285e-05, "loss": 0.1058, "loss_lm": 0.01564902625977993, "loss_seg": 0.09014440327882767, "mean_token_accuracy": 0.9953401982784271, "num_tokens": 1073150245.0, "step": 2525 }, { "entropy": 0.01894657639786601, "epoch": 1.1054820002188424, "grad_norm": 9.375, "learning_rate": 2.4093123984840282e-05, "loss": 0.1583, "loss_lm": 0.01954412879422307, "loss_seg": 0.1387378592044115, "mean_token_accuracy": 0.9952276945114136, "num_tokens": 1073575141.0, "step": 2526 }, { "entropy": 0.018664275761693716, "epoch": 1.1059196848670534, "grad_norm": 8.0, "learning_rate": 2.4090416892257716e-05, "loss": 0.1361, "loss_lm": 0.01404715608805418, "loss_seg": 0.12203899584710598, "mean_token_accuracy": 0.9952465295791626, "num_tokens": 1073999855.0, "step": 2527 }, { "entropy": 0.01820065826177597, "epoch": 1.1063573695152642, "grad_norm": 10.375, "learning_rate": 2.408770979967515e-05, "loss": 0.1108, "loss_lm": 0.015178713714703918, "loss_seg": 0.09561598021537066, "mean_token_accuracy": 0.9953368455171585, "num_tokens": 1074425060.0, "step": 2528 }, { "entropy": 0.019432386849075556, "epoch": 1.1067950541634752, "grad_norm": 14.875, "learning_rate": 2.4085002707092584e-05, "loss": 0.1084, "loss_lm": 0.016777559649199247, "loss_seg": 0.09160107187926769, "mean_token_accuracy": 0.994980052113533, "num_tokens": 1074850536.0, "step": 2529 }, { "entropy": 0.018517106771469116, "epoch": 1.1072327388116863, "grad_norm": 8.875, "learning_rate": 2.4082295614510017e-05, "loss": 0.1606, "loss_lm": 0.017298854188993573, "loss_seg": 0.14326205849647522, "mean_token_accuracy": 0.9951392114162445, "num_tokens": 1075275338.0, "step": 2530 }, { "entropy": 0.019070847425609827, "epoch": 1.107670423459897, "grad_norm": 7.9375, "learning_rate": 2.407958852192745e-05, "loss": 0.1049, "loss_lm": 0.015349978115409613, "loss_seg": 0.08950954303145409, "mean_token_accuracy": 0.9951909631490707, "num_tokens": 1075700534.0, "step": 2531 }, { "entropy": 0.01837003231048584, "epoch": 1.1081081081081081, "grad_norm": 5.75, "learning_rate": 2.4076881429344885e-05, "loss": 0.1155, "loss_lm": 0.016315432963892817, "loss_seg": 0.09920008480548859, "mean_token_accuracy": 0.9952724725008011, "num_tokens": 1076125235.0, "step": 2532 }, { "entropy": 0.01871601166203618, "epoch": 1.1085457927563191, "grad_norm": 3.75, "learning_rate": 2.4074174336762315e-05, "loss": 0.1285, "loss_lm": 0.01575729064643383, "loss_seg": 0.11277462635189295, "mean_token_accuracy": 0.9951910823583603, "num_tokens": 1076550015.0, "step": 2533 }, { "entropy": 0.01925544533878565, "epoch": 1.10898347740453, "grad_norm": 7.78125, "learning_rate": 2.4071467244179752e-05, "loss": 0.152, "loss_lm": 0.01548838079907, "loss_seg": 0.13653470762073994, "mean_token_accuracy": 0.9950376152992249, "num_tokens": 1076974756.0, "step": 2534 }, { "entropy": 0.018533941823989153, "epoch": 1.109421162052741, "grad_norm": 6.375, "learning_rate": 2.4068760151597186e-05, "loss": 0.1504, "loss_lm": 0.0142076148185879, "loss_seg": 0.13619736768305302, "mean_token_accuracy": 0.9952883571386337, "num_tokens": 1077399626.0, "step": 2535 }, { "entropy": 0.01865835441276431, "epoch": 1.109858846700952, "grad_norm": 4.15625, "learning_rate": 2.406605305901462e-05, "loss": 0.0798, "loss_lm": 0.01665909425355494, "loss_seg": 0.06317294854670763, "mean_token_accuracy": 0.9951754063367844, "num_tokens": 1077824083.0, "step": 2536 }, { "entropy": 0.018844543024897575, "epoch": 1.1102965313491628, "grad_norm": 8.5625, "learning_rate": 2.406334596643205e-05, "loss": 0.1051, "loss_lm": 0.01435557659715414, "loss_seg": 0.09073829930275679, "mean_token_accuracy": 0.9952140301465988, "num_tokens": 1078248617.0, "step": 2537 }, { "entropy": 0.01865725079551339, "epoch": 1.1107342159973739, "grad_norm": 5.5625, "learning_rate": 2.4060638873849484e-05, "loss": 0.1164, "loss_lm": 0.01755483401939273, "loss_seg": 0.0988712478429079, "mean_token_accuracy": 0.9952114224433899, "num_tokens": 1078673320.0, "step": 2538 }, { "entropy": 0.018280629068613052, "epoch": 1.111171900645585, "grad_norm": 12.375, "learning_rate": 2.405793178126692e-05, "loss": 0.0881, "loss_lm": 0.01588413189165294, "loss_seg": 0.0722412383183837, "mean_token_accuracy": 0.9953000247478485, "num_tokens": 1079098152.0, "step": 2539 }, { "entropy": 0.018690412398427725, "epoch": 1.1116095852937957, "grad_norm": 6.09375, "learning_rate": 2.4055224688684355e-05, "loss": 0.1801, "loss_lm": 0.015229466371238232, "loss_seg": 0.16486535966396332, "mean_token_accuracy": 0.9953182935714722, "num_tokens": 1079522953.0, "step": 2540 }, { "entropy": 0.019254035782068968, "epoch": 1.1120472699420068, "grad_norm": 5.0625, "learning_rate": 2.405251759610179e-05, "loss": 0.09, "loss_lm": 0.015452002175152302, "loss_seg": 0.07453487068414688, "mean_token_accuracy": 0.9951895773410797, "num_tokens": 1079947146.0, "step": 2541 }, { "entropy": 0.018428495153784752, "epoch": 1.1124849545902178, "grad_norm": 5.15625, "learning_rate": 2.404981050351922e-05, "loss": 0.0842, "loss_lm": 0.014306164113804698, "loss_seg": 0.06989612337201834, "mean_token_accuracy": 0.9953829944133759, "num_tokens": 1080371402.0, "step": 2542 }, { "entropy": 0.019374053925275803, "epoch": 1.1129226392384286, "grad_norm": 13.375, "learning_rate": 2.4047103410936652e-05, "loss": 0.102, "loss_lm": 0.014467298286035657, "loss_seg": 0.0875656995922327, "mean_token_accuracy": 0.9950565546751022, "num_tokens": 1080796593.0, "step": 2543 }, { "entropy": 0.018588128499686718, "epoch": 1.1133603238866396, "grad_norm": 9.8125, "learning_rate": 2.404439631835409e-05, "loss": 0.1577, "loss_lm": 0.01800233800895512, "loss_seg": 0.13974564336240292, "mean_token_accuracy": 0.9952717274427414, "num_tokens": 1081221445.0, "step": 2544 }, { "entropy": 0.019075030926615, "epoch": 1.1137980085348507, "grad_norm": 3.09375, "learning_rate": 2.4041689225771523e-05, "loss": 0.1247, "loss_lm": 0.015845278976485133, "loss_seg": 0.10887491144239902, "mean_token_accuracy": 0.9952172338962555, "num_tokens": 1081646702.0, "step": 2545 }, { "entropy": 0.018361578229814768, "epoch": 1.1142356931830617, "grad_norm": 7.40625, "learning_rate": 2.4038982133188957e-05, "loss": 0.1227, "loss_lm": 0.014176768017932773, "loss_seg": 0.10851813293993473, "mean_token_accuracy": 0.995366096496582, "num_tokens": 1082071987.0, "step": 2546 }, { "entropy": 0.01866243313997984, "epoch": 1.1146733778312725, "grad_norm": 7.75, "learning_rate": 2.4036275040606387e-05, "loss": 0.1411, "loss_lm": 0.015052194939926267, "loss_seg": 0.12601301446557045, "mean_token_accuracy": 0.9951635897159576, "num_tokens": 1082497682.0, "step": 2547 }, { "entropy": 0.01816478045657277, "epoch": 1.1151110624794836, "grad_norm": 9.6875, "learning_rate": 2.403356794802382e-05, "loss": 0.1153, "loss_lm": 0.015564108965918422, "loss_seg": 0.09971385449171066, "mean_token_accuracy": 0.9953723400831223, "num_tokens": 1082922466.0, "step": 2548 }, { "entropy": 0.019418892916291952, "epoch": 1.1155487471276946, "grad_norm": 10.4375, "learning_rate": 2.4030860855441258e-05, "loss": 0.1274, "loss_lm": 0.015477823559194803, "loss_seg": 0.11188353784382343, "mean_token_accuracy": 0.9950224161148071, "num_tokens": 1083347540.0, "step": 2549 }, { "entropy": 0.018975290469825268, "epoch": 1.1159864317759054, "grad_norm": 7.34375, "learning_rate": 2.4028153762858692e-05, "loss": 0.1232, "loss_lm": 0.015226735733449459, "loss_seg": 0.107925770804286, "mean_token_accuracy": 0.9950440675020218, "num_tokens": 1083773073.0, "step": 2550 }, { "entropy": 0.018557443283498287, "epoch": 1.1164241164241164, "grad_norm": 12.375, "learning_rate": 2.4025446670276126e-05, "loss": 0.112, "loss_lm": 0.016676478320732713, "loss_seg": 0.09527671802788973, "mean_token_accuracy": 0.9951902478933334, "num_tokens": 1084198209.0, "step": 2551 }, { "entropy": 0.018610018771141768, "epoch": 1.1168618010723275, "grad_norm": 11.5, "learning_rate": 2.4022739577693556e-05, "loss": 0.1346, "loss_lm": 0.016580659430474043, "loss_seg": 0.11805563233792782, "mean_token_accuracy": 0.9952166229486465, "num_tokens": 1084622974.0, "step": 2552 }, { "entropy": 0.019140093587338924, "epoch": 1.1172994857205383, "grad_norm": 10.1875, "learning_rate": 2.402003248511099e-05, "loss": 0.1583, "loss_lm": 0.014441408915445209, "loss_seg": 0.1438980270177126, "mean_token_accuracy": 0.9951996803283691, "num_tokens": 1085048573.0, "step": 2553 }, { "entropy": 0.019104157108813524, "epoch": 1.1177371703687493, "grad_norm": 8.125, "learning_rate": 2.4017325392528427e-05, "loss": 0.1407, "loss_lm": 0.01590244355611503, "loss_seg": 0.12482942268252373, "mean_token_accuracy": 0.9951006323099136, "num_tokens": 1085473914.0, "step": 2554 }, { "entropy": 0.018746528308838606, "epoch": 1.1181748550169603, "grad_norm": 10.875, "learning_rate": 2.401461829994586e-05, "loss": 0.1413, "loss_lm": 0.014940000837668777, "loss_seg": 0.12637011520564556, "mean_token_accuracy": 0.9952652603387833, "num_tokens": 1085899020.0, "step": 2555 }, { "entropy": 0.019359364174306393, "epoch": 1.1186125396651712, "grad_norm": 8.375, "learning_rate": 2.4011911207363294e-05, "loss": 0.1002, "loss_lm": 0.016042874893173575, "loss_seg": 0.0841740183532238, "mean_token_accuracy": 0.9950919896364212, "num_tokens": 1086324217.0, "step": 2556 }, { "entropy": 0.0186347640119493, "epoch": 1.1190502243133822, "grad_norm": 11.4375, "learning_rate": 2.4009204114780725e-05, "loss": 0.1078, "loss_lm": 0.015813727863132954, "loss_seg": 0.09199260175228119, "mean_token_accuracy": 0.995241567492485, "num_tokens": 1086749331.0, "step": 2557 }, { "entropy": 0.019275628961622715, "epoch": 1.1194879089615932, "grad_norm": 7.6875, "learning_rate": 2.400649702219816e-05, "loss": 0.1092, "loss_lm": 0.01850590156391263, "loss_seg": 0.09065891988575459, "mean_token_accuracy": 0.9950347691774368, "num_tokens": 1087174474.0, "step": 2558 }, { "entropy": 0.018918603658676147, "epoch": 1.119925593609804, "grad_norm": 38.0, "learning_rate": 2.4003789929615595e-05, "loss": 0.1206, "loss_lm": 0.016260515665635467, "loss_seg": 0.10431852284818888, "mean_token_accuracy": 0.9951822608709335, "num_tokens": 1087599956.0, "step": 2559 }, { "entropy": 0.01849719090387225, "epoch": 1.120363278258015, "grad_norm": 17.75, "learning_rate": 2.400108283703303e-05, "loss": 0.09, "loss_lm": 0.014708100352436304, "loss_seg": 0.07532166503369808, "mean_token_accuracy": 0.995360717177391, "num_tokens": 1088024552.0, "step": 2560 }, { "entropy": 0.01918489718809724, "epoch": 1.120800962906226, "grad_norm": 9.4375, "learning_rate": 2.399837574445046e-05, "loss": 0.083, "loss_lm": 0.015980680705979466, "loss_seg": 0.06704706139862537, "mean_token_accuracy": 0.9951874613761902, "num_tokens": 1088449658.0, "step": 2561 }, { "entropy": 0.018671773374080658, "epoch": 1.1212386475544371, "grad_norm": 8.875, "learning_rate": 2.3995668651867893e-05, "loss": 0.1711, "loss_lm": 0.014961882960051298, "loss_seg": 0.1560989934951067, "mean_token_accuracy": 0.9952708929777145, "num_tokens": 1088874654.0, "step": 2562 }, { "entropy": 0.019418836571276188, "epoch": 1.121676332202648, "grad_norm": 20.0, "learning_rate": 2.3992961559285327e-05, "loss": 0.1181, "loss_lm": 0.017132757930085063, "loss_seg": 0.10101225972175598, "mean_token_accuracy": 0.9951347559690475, "num_tokens": 1089299480.0, "step": 2563 }, { "entropy": 0.018987242598086596, "epoch": 1.122114016850859, "grad_norm": 25.375, "learning_rate": 2.399025446670276e-05, "loss": 0.1201, "loss_lm": 0.01524677500128746, "loss_seg": 0.10483182966709137, "mean_token_accuracy": 0.9951232373714447, "num_tokens": 1089723989.0, "step": 2564 }, { "entropy": 0.01893289340659976, "epoch": 1.12255170149907, "grad_norm": 4.15625, "learning_rate": 2.3987547374120198e-05, "loss": 0.1253, "loss_lm": 0.015345266554504633, "loss_seg": 0.1099639618769288, "mean_token_accuracy": 0.9952447265386581, "num_tokens": 1090148754.0, "step": 2565 }, { "entropy": 0.019329327158629894, "epoch": 1.1229893861472808, "grad_norm": 4.96875, "learning_rate": 2.3984840281537628e-05, "loss": 0.1111, "loss_lm": 0.016581969102844596, "loss_seg": 0.09446977451443672, "mean_token_accuracy": 0.9950853884220123, "num_tokens": 1090574605.0, "step": 2566 }, { "entropy": 0.018948360346257687, "epoch": 1.1234270707954919, "grad_norm": 13.5625, "learning_rate": 2.3982133188955062e-05, "loss": 0.1438, "loss_lm": 0.01633412577211857, "loss_seg": 0.1274388926103711, "mean_token_accuracy": 0.9951950013637543, "num_tokens": 1090999963.0, "step": 2567 }, { "entropy": 0.01855497295036912, "epoch": 1.123864755443703, "grad_norm": 7.03125, "learning_rate": 2.3979426096372496e-05, "loss": 0.1137, "loss_lm": 0.01583109237253666, "loss_seg": 0.0978942271322012, "mean_token_accuracy": 0.9952805936336517, "num_tokens": 1091424518.0, "step": 2568 }, { "entropy": 0.01974852755665779, "epoch": 1.1243024400919137, "grad_norm": 9.1875, "learning_rate": 2.397671900378993e-05, "loss": 0.1323, "loss_lm": 0.014940718188881874, "loss_seg": 0.1173797007650137, "mean_token_accuracy": 0.9949205368757248, "num_tokens": 1091849153.0, "step": 2569 }, { "entropy": 0.01855573384091258, "epoch": 1.1247401247401247, "grad_norm": 7.1875, "learning_rate": 2.3974011911207366e-05, "loss": 0.1247, "loss_lm": 0.015250686788931489, "loss_seg": 0.10940060578286648, "mean_token_accuracy": 0.9953182488679886, "num_tokens": 1092273751.0, "step": 2570 }, { "entropy": 0.019158901646733284, "epoch": 1.1251778093883358, "grad_norm": 4.875, "learning_rate": 2.3971304818624797e-05, "loss": 0.1316, "loss_lm": 0.018750838935375214, "loss_seg": 0.11282534897327423, "mean_token_accuracy": 0.995234951376915, "num_tokens": 1092699129.0, "step": 2571 }, { "entropy": 0.0182928335852921, "epoch": 1.1256154940365466, "grad_norm": 4.4375, "learning_rate": 2.396859772604223e-05, "loss": 0.1741, "loss_lm": 0.01558479038067162, "loss_seg": 0.15852071717381477, "mean_token_accuracy": 0.995304524898529, "num_tokens": 1093124150.0, "step": 2572 }, { "entropy": 0.018867135979235172, "epoch": 1.1260531786847576, "grad_norm": 11.0, "learning_rate": 2.3965890633459664e-05, "loss": 0.1168, "loss_lm": 0.015248337527737021, "loss_seg": 0.10155810415744781, "mean_token_accuracy": 0.9951942712068558, "num_tokens": 1093549025.0, "step": 2573 }, { "entropy": 0.01879269629716873, "epoch": 1.1264908633329687, "grad_norm": 5.25, "learning_rate": 2.3963183540877098e-05, "loss": 0.1323, "loss_lm": 0.01564207742922008, "loss_seg": 0.1167058665305376, "mean_token_accuracy": 0.995364636182785, "num_tokens": 1093974409.0, "step": 2574 }, { "entropy": 0.018297191709280014, "epoch": 1.1269285479811795, "grad_norm": 12.6875, "learning_rate": 2.3960476448294535e-05, "loss": 0.1001, "loss_lm": 0.014573229709640145, "loss_seg": 0.08551334869116545, "mean_token_accuracy": 0.9953567236661911, "num_tokens": 1094398850.0, "step": 2575 }, { "entropy": 0.018706511706113815, "epoch": 1.1273662326293905, "grad_norm": 4.8125, "learning_rate": 2.3957769355711965e-05, "loss": 0.1643, "loss_lm": 0.015531446086242795, "loss_seg": 0.14875882863998413, "mean_token_accuracy": 0.995162695646286, "num_tokens": 1094824055.0, "step": 2576 }, { "entropy": 0.018923374824225903, "epoch": 1.1278039172776015, "grad_norm": 13.875, "learning_rate": 2.39550622631294e-05, "loss": 0.1194, "loss_lm": 0.016930250683799386, "loss_seg": 0.10243554227054119, "mean_token_accuracy": 0.995209813117981, "num_tokens": 1095248623.0, "step": 2577 }, { "entropy": 0.018708901945501566, "epoch": 1.1282416019258124, "grad_norm": 4.1875, "learning_rate": 2.3952355170546833e-05, "loss": 0.1063, "loss_lm": 0.016469012014567852, "loss_seg": 0.0898326225578785, "mean_token_accuracy": 0.9951089173555374, "num_tokens": 1095674274.0, "step": 2578 }, { "entropy": 0.018442112486809492, "epoch": 1.1286792865740234, "grad_norm": 7.875, "learning_rate": 2.3949648077964267e-05, "loss": 0.1072, "loss_lm": 0.01474374276585877, "loss_seg": 0.09242259338498116, "mean_token_accuracy": 0.9953085333108902, "num_tokens": 1096099044.0, "step": 2579 }, { "entropy": 0.019990040455013514, "epoch": 1.1291169712222344, "grad_norm": 9.375, "learning_rate": 2.3946940985381704e-05, "loss": 0.1654, "loss_lm": 0.018118204083293676, "loss_seg": 0.14730321802198887, "mean_token_accuracy": 0.994926705956459, "num_tokens": 1096524197.0, "step": 2580 }, { "entropy": 0.018463618122041225, "epoch": 1.1295546558704452, "grad_norm": 4.5625, "learning_rate": 2.3944233892799134e-05, "loss": 0.1343, "loss_lm": 0.015514767030254006, "loss_seg": 0.11874483898282051, "mean_token_accuracy": 0.9952265918254852, "num_tokens": 1096948795.0, "step": 2581 }, { "entropy": 0.018534811679273844, "epoch": 1.1299923405186563, "grad_norm": 10.25, "learning_rate": 2.3941526800216568e-05, "loss": 0.1175, "loss_lm": 0.014529740437865257, "loss_seg": 0.10295391827821732, "mean_token_accuracy": 0.9953123778104782, "num_tokens": 1097373727.0, "step": 2582 }, { "entropy": 0.018754189368337393, "epoch": 1.1304300251668673, "grad_norm": 6.34375, "learning_rate": 2.3938819707634e-05, "loss": 0.1421, "loss_lm": 0.01781235937960446, "loss_seg": 0.1242564907297492, "mean_token_accuracy": 0.9952790439128876, "num_tokens": 1097798415.0, "step": 2583 }, { "entropy": 0.018815728835761547, "epoch": 1.1308677098150781, "grad_norm": 9.125, "learning_rate": 2.3936112615051435e-05, "loss": 0.1213, "loss_lm": 0.015462741488590837, "loss_seg": 0.10584255307912827, "mean_token_accuracy": 0.9952341765165329, "num_tokens": 1098223077.0, "step": 2584 }, { "entropy": 0.018266955390572548, "epoch": 1.1313053944632891, "grad_norm": 6.46875, "learning_rate": 2.393340552246887e-05, "loss": 0.1347, "loss_lm": 0.018341315910220146, "loss_seg": 0.11639498360455036, "mean_token_accuracy": 0.9952974021434784, "num_tokens": 1098648544.0, "step": 2585 }, { "entropy": 0.01937755336984992, "epoch": 1.1317430791115002, "grad_norm": 16.5, "learning_rate": 2.3930698429886303e-05, "loss": 0.1247, "loss_lm": 0.015268606599420309, "loss_seg": 0.10943195782601833, "mean_token_accuracy": 0.9949624538421631, "num_tokens": 1099074135.0, "step": 2586 }, { "entropy": 0.019421592354774475, "epoch": 1.1321807637597112, "grad_norm": 10.1875, "learning_rate": 2.3927991337303736e-05, "loss": 0.126, "loss_lm": 0.018770487513393164, "loss_seg": 0.10721181891858578, "mean_token_accuracy": 0.9950139224529266, "num_tokens": 1099499265.0, "step": 2587 }, { "entropy": 0.01917851995676756, "epoch": 1.132618448407922, "grad_norm": 15.75, "learning_rate": 2.392528424472117e-05, "loss": 0.1152, "loss_lm": 0.016265419544652104, "loss_seg": 0.09895280189812183, "mean_token_accuracy": 0.9950027614831924, "num_tokens": 1099923879.0, "step": 2588 }, { "entropy": 0.018857049755752087, "epoch": 1.133056133056133, "grad_norm": 6.5625, "learning_rate": 2.3922577152138604e-05, "loss": 0.1296, "loss_lm": 0.016143435146659613, "loss_seg": 0.1134694293141365, "mean_token_accuracy": 0.995268315076828, "num_tokens": 1100348825.0, "step": 2589 }, { "entropy": 0.018373000901192427, "epoch": 1.133493817704344, "grad_norm": 6.09375, "learning_rate": 2.3919870059556038e-05, "loss": 0.1544, "loss_lm": 0.015871485695242882, "loss_seg": 0.13854447193443775, "mean_token_accuracy": 0.9954386651515961, "num_tokens": 1100773452.0, "step": 2590 }, { "entropy": 0.018779887817800045, "epoch": 1.133931502352555, "grad_norm": 15.8125, "learning_rate": 2.391716296697347e-05, "loss": 0.1472, "loss_lm": 0.017712294589728117, "loss_seg": 0.12946193665266037, "mean_token_accuracy": 0.9951828867197037, "num_tokens": 1101198579.0, "step": 2591 }, { "entropy": 0.018391814082860947, "epoch": 1.134369187000766, "grad_norm": 7.34375, "learning_rate": 2.3914455874390905e-05, "loss": 0.1173, "loss_lm": 0.01777127175591886, "loss_seg": 0.09949102811515331, "mean_token_accuracy": 0.9951720535755157, "num_tokens": 1101623168.0, "step": 2592 }, { "entropy": 0.01908684754744172, "epoch": 1.134806871648977, "grad_norm": 11.3125, "learning_rate": 2.391174878180834e-05, "loss": 0.1373, "loss_lm": 0.015457759611308575, "loss_seg": 0.12182057090103626, "mean_token_accuracy": 0.9951308965682983, "num_tokens": 1102047880.0, "step": 2593 }, { "entropy": 0.018895426765084267, "epoch": 1.1352445562971878, "grad_norm": 14.25, "learning_rate": 2.3909041689225772e-05, "loss": 0.1195, "loss_lm": 0.017132835695520043, "loss_seg": 0.10232187062501907, "mean_token_accuracy": 0.9951944798231125, "num_tokens": 1102473592.0, "step": 2594 }, { "entropy": 0.019081723876297474, "epoch": 1.1356822409453988, "grad_norm": 16.25, "learning_rate": 2.3906334596643206e-05, "loss": 0.1253, "loss_lm": 0.015167452162131667, "loss_seg": 0.11014431994408369, "mean_token_accuracy": 0.9951038658618927, "num_tokens": 1102898597.0, "step": 2595 }, { "entropy": 0.01874222420156002, "epoch": 1.1361199255936099, "grad_norm": 10.5, "learning_rate": 2.390362750406064e-05, "loss": 0.1261, "loss_lm": 0.01712531945668161, "loss_seg": 0.10899047367274761, "mean_token_accuracy": 0.9952675253152847, "num_tokens": 1103323900.0, "step": 2596 }, { "entropy": 0.01841387478634715, "epoch": 1.136557610241821, "grad_norm": 17.0, "learning_rate": 2.3900920411478074e-05, "loss": 0.0939, "loss_lm": 0.015677099581807852, "loss_seg": 0.07822662591934204, "mean_token_accuracy": 0.9952124059200287, "num_tokens": 1103748293.0, "step": 2597 }, { "entropy": 0.01953096641227603, "epoch": 1.1369952948900317, "grad_norm": 32.0, "learning_rate": 2.3898213318895507e-05, "loss": 0.1802, "loss_lm": 0.019252500031143427, "loss_seg": 0.16097775846719742, "mean_token_accuracy": 0.9950134307146072, "num_tokens": 1104172627.0, "step": 2598 }, { "entropy": 0.018939473666250706, "epoch": 1.1374329795382427, "grad_norm": 6.3125, "learning_rate": 2.389550622631294e-05, "loss": 0.097, "loss_lm": 0.01861127745360136, "loss_seg": 0.07836185395717621, "mean_token_accuracy": 0.9951388984918594, "num_tokens": 1104597195.0, "step": 2599 }, { "entropy": 0.018227844033390284, "epoch": 1.1378706641864538, "grad_norm": 8.8125, "learning_rate": 2.389279913373037e-05, "loss": 0.1426, "loss_lm": 0.015895027667284012, "loss_seg": 0.12673457711935043, "mean_token_accuracy": 0.9953347593545914, "num_tokens": 1105022205.0, "step": 2600 }, { "entropy": 0.01915524434298277, "epoch": 1.1383083488346646, "grad_norm": 18.875, "learning_rate": 2.389009204114781e-05, "loss": 0.1399, "loss_lm": 0.016392851248383522, "loss_seg": 0.12346415966749191, "mean_token_accuracy": 0.9951280206441879, "num_tokens": 1105446609.0, "step": 2601 }, { "entropy": 0.019032415468245745, "epoch": 1.1387460334828756, "grad_norm": 99.0, "learning_rate": 2.3887384948565242e-05, "loss": 0.1442, "loss_lm": 0.017173826694488525, "loss_seg": 0.1269833892583847, "mean_token_accuracy": 0.9951389878988266, "num_tokens": 1105871199.0, "step": 2602 }, { "entropy": 0.018577864859253168, "epoch": 1.1391837181310867, "grad_norm": 7.53125, "learning_rate": 2.3884677855982676e-05, "loss": 0.1171, "loss_lm": 0.017170226899906993, "loss_seg": 0.0999070480465889, "mean_token_accuracy": 0.9953327476978302, "num_tokens": 1106295400.0, "step": 2603 }, { "entropy": 0.018577953800559044, "epoch": 1.1396214027792975, "grad_norm": 17.625, "learning_rate": 2.388197076340011e-05, "loss": 0.109, "loss_lm": 0.01816611597314477, "loss_seg": 0.09079239889979362, "mean_token_accuracy": 0.99522465467453, "num_tokens": 1106720660.0, "step": 2604 }, { "entropy": 0.018357267137616873, "epoch": 1.1400590874275085, "grad_norm": 21.0, "learning_rate": 2.387926367081754e-05, "loss": 0.0791, "loss_lm": 0.014695881865918636, "loss_seg": 0.06444988865405321, "mean_token_accuracy": 0.9953606575727463, "num_tokens": 1107145859.0, "step": 2605 }, { "entropy": 0.01858772849664092, "epoch": 1.1404967720757195, "grad_norm": 10.1875, "learning_rate": 2.3876556578234977e-05, "loss": 0.0957, "loss_lm": 0.0165899689309299, "loss_seg": 0.07912306115031242, "mean_token_accuracy": 0.9952557981014252, "num_tokens": 1107571191.0, "step": 2606 }, { "entropy": 0.019232023041695356, "epoch": 1.1409344567239303, "grad_norm": 12.0, "learning_rate": 2.387384948565241e-05, "loss": 0.1293, "loss_lm": 0.018384654074907303, "loss_seg": 0.11095833405852318, "mean_token_accuracy": 0.9951020777225494, "num_tokens": 1107996448.0, "step": 2607 }, { "entropy": 0.019239930901676416, "epoch": 1.1413721413721414, "grad_norm": 10.0625, "learning_rate": 2.3871142393069845e-05, "loss": 0.1541, "loss_lm": 0.01864477409981191, "loss_seg": 0.13548711873590946, "mean_token_accuracy": 0.9951817989349365, "num_tokens": 1108421882.0, "step": 2608 }, { "entropy": 0.019039628095924854, "epoch": 1.1418098260203524, "grad_norm": 12.8125, "learning_rate": 2.3868435300487275e-05, "loss": 0.1768, "loss_lm": 0.018557934323325753, "loss_seg": 0.1582312509417534, "mean_token_accuracy": 0.9951393455266953, "num_tokens": 1108847109.0, "step": 2609 }, { "entropy": 0.018928732257336378, "epoch": 1.1422475106685632, "grad_norm": 6.1875, "learning_rate": 2.386572820790471e-05, "loss": 0.1334, "loss_lm": 0.018491773633286357, "loss_seg": 0.11488228105008602, "mean_token_accuracy": 0.9951715916395187, "num_tokens": 1109272827.0, "step": 2610 }, { "entropy": 0.01892076199874282, "epoch": 1.1426851953167743, "grad_norm": 632.0, "learning_rate": 2.3863021115322146e-05, "loss": 0.1341, "loss_lm": 0.01887936773709953, "loss_seg": 0.1151907704770565, "mean_token_accuracy": 0.9951412528753281, "num_tokens": 1109698203.0, "step": 2611 }, { "entropy": 0.018476195633411407, "epoch": 1.1431228799649853, "grad_norm": 14.3125, "learning_rate": 2.386031402273958e-05, "loss": 0.1247, "loss_lm": 0.015459606889635324, "loss_seg": 0.10919441841542721, "mean_token_accuracy": 0.9953095465898514, "num_tokens": 1110123274.0, "step": 2612 }, { "entropy": 0.01915306644514203, "epoch": 1.143560564613196, "grad_norm": 15.8125, "learning_rate": 2.3857606930157013e-05, "loss": 0.1299, "loss_lm": 0.0164385917596519, "loss_seg": 0.11348595842719078, "mean_token_accuracy": 0.9950509816408157, "num_tokens": 1110547942.0, "step": 2613 }, { "entropy": 0.01803170144557953, "epoch": 1.1439982492614071, "grad_norm": 13.5, "learning_rate": 2.3854899837574444e-05, "loss": 0.1112, "loss_lm": 0.016300493385642767, "loss_seg": 0.09489015303552151, "mean_token_accuracy": 0.9955108612775803, "num_tokens": 1110973127.0, "step": 2614 }, { "entropy": 0.018437813967466354, "epoch": 1.1444359339096182, "grad_norm": 7.28125, "learning_rate": 2.3852192744991877e-05, "loss": 0.1339, "loss_lm": 0.01722617377527058, "loss_seg": 0.1167137362062931, "mean_token_accuracy": 0.9952524751424789, "num_tokens": 1111397935.0, "step": 2615 }, { "entropy": 0.01833551749587059, "epoch": 1.144873618557829, "grad_norm": 15.5625, "learning_rate": 2.3849485652409314e-05, "loss": 0.1147, "loss_lm": 0.01653234101831913, "loss_seg": 0.09818617440760136, "mean_token_accuracy": 0.9953358918428421, "num_tokens": 1111822410.0, "step": 2616 }, { "entropy": 0.01843459066003561, "epoch": 1.14531130320604, "grad_norm": 16.75, "learning_rate": 2.3846778559826748e-05, "loss": 0.1303, "loss_lm": 0.015195044688880444, "loss_seg": 0.11507774516940117, "mean_token_accuracy": 0.9952201098203659, "num_tokens": 1112247481.0, "step": 2617 }, { "entropy": 0.019405290950089693, "epoch": 1.145748987854251, "grad_norm": 6.34375, "learning_rate": 2.3844071467244182e-05, "loss": 0.1055, "loss_lm": 0.01787450327537954, "loss_seg": 0.08761456795036793, "mean_token_accuracy": 0.9949953109025955, "num_tokens": 1112672650.0, "step": 2618 }, { "entropy": 0.018616786692291498, "epoch": 1.1461866725024619, "grad_norm": 13.125, "learning_rate": 2.3841364374661612e-05, "loss": 0.1037, "loss_lm": 0.015840463107451797, "loss_seg": 0.08784114010632038, "mean_token_accuracy": 0.9950664341449738, "num_tokens": 1113098078.0, "step": 2619 }, { "entropy": 0.01789504662156105, "epoch": 1.146624357150673, "grad_norm": 8.875, "learning_rate": 2.3838657282079046e-05, "loss": 0.1405, "loss_lm": 0.016106710070744157, "loss_seg": 0.12436503358185291, "mean_token_accuracy": 0.9953046441078186, "num_tokens": 1113522912.0, "step": 2620 }, { "entropy": 0.018700181040912867, "epoch": 1.147062041798884, "grad_norm": 7.40625, "learning_rate": 2.3835950189496483e-05, "loss": 0.1399, "loss_lm": 0.015878387028351426, "loss_seg": 0.12400477007031441, "mean_token_accuracy": 0.9951635003089905, "num_tokens": 1113947968.0, "step": 2621 }, { "entropy": 0.018717084545642138, "epoch": 1.1474997264470947, "grad_norm": 6.21875, "learning_rate": 2.3833243096913917e-05, "loss": 0.1131, "loss_lm": 0.014909647405147552, "loss_seg": 0.09821474179625511, "mean_token_accuracy": 0.9951436966657639, "num_tokens": 1114372753.0, "step": 2622 }, { "entropy": 0.018479691818356514, "epoch": 1.1479374110953058, "grad_norm": 5.53125, "learning_rate": 2.383053600433135e-05, "loss": 0.1726, "loss_lm": 0.016347559168934822, "loss_seg": 0.15628658048808575, "mean_token_accuracy": 0.995313361287117, "num_tokens": 1114797785.0, "step": 2623 }, { "entropy": 0.01861000619828701, "epoch": 1.1483750957435168, "grad_norm": 8.375, "learning_rate": 2.382782891174878e-05, "loss": 0.1126, "loss_lm": 0.018348006065934896, "loss_seg": 0.09423244372010231, "mean_token_accuracy": 0.995184451341629, "num_tokens": 1115222623.0, "step": 2624 }, { "entropy": 0.018612863030284643, "epoch": 1.1488127803917279, "grad_norm": 16.5, "learning_rate": 2.3825121819166215e-05, "loss": 0.1338, "loss_lm": 0.01604352775029838, "loss_seg": 0.11777966469526291, "mean_token_accuracy": 0.9952186048030853, "num_tokens": 1115647569.0, "step": 2625 }, { "entropy": 0.01841731322929263, "epoch": 1.1492504650399387, "grad_norm": 13.0625, "learning_rate": 2.382241472658365e-05, "loss": 0.1651, "loss_lm": 0.014201319077983499, "loss_seg": 0.15091879852116108, "mean_token_accuracy": 0.9952665716409683, "num_tokens": 1116072376.0, "step": 2626 }, { "entropy": 0.01873746095225215, "epoch": 1.1496881496881497, "grad_norm": 6.8125, "learning_rate": 2.3819707634001085e-05, "loss": 0.1725, "loss_lm": 0.01661560544744134, "loss_seg": 0.15584160014986992, "mean_token_accuracy": 0.9951696842908859, "num_tokens": 1116497349.0, "step": 2627 }, { "entropy": 0.018428985960781574, "epoch": 1.1501258343363607, "grad_norm": 15.1875, "learning_rate": 2.3817000541418516e-05, "loss": 0.1498, "loss_lm": 0.017674841452389956, "loss_seg": 0.13208656199276447, "mean_token_accuracy": 0.9952590316534042, "num_tokens": 1116922117.0, "step": 2628 }, { "entropy": 0.018037673085927963, "epoch": 1.1505635189845715, "grad_norm": 5.46875, "learning_rate": 2.381429344883595e-05, "loss": 0.154, "loss_lm": 0.018025120021775365, "loss_seg": 0.1359807290136814, "mean_token_accuracy": 0.995280385017395, "num_tokens": 1117347549.0, "step": 2629 }, { "entropy": 0.01831261906772852, "epoch": 1.1510012036327826, "grad_norm": 30.0, "learning_rate": 2.3811586356253383e-05, "loss": 0.1104, "loss_lm": 0.015441849594935775, "loss_seg": 0.09495816193521023, "mean_token_accuracy": 0.9952964633703232, "num_tokens": 1117772296.0, "step": 2630 }, { "entropy": 0.01883792271837592, "epoch": 1.1514388882809936, "grad_norm": 17.875, "learning_rate": 2.3808879263670817e-05, "loss": 0.1573, "loss_lm": 0.014943792251870036, "loss_seg": 0.1423185970634222, "mean_token_accuracy": 0.9953036457300186, "num_tokens": 1118197259.0, "step": 2631 }, { "entropy": 0.018909761682152748, "epoch": 1.1518765729292044, "grad_norm": 7.4375, "learning_rate": 2.3806172171088254e-05, "loss": 0.1422, "loss_lm": 0.01743133319541812, "loss_seg": 0.1248019952327013, "mean_token_accuracy": 0.9951876252889633, "num_tokens": 1118622381.0, "step": 2632 }, { "entropy": 0.018357797525823116, "epoch": 1.1523142575774155, "grad_norm": 5.21875, "learning_rate": 2.3803465078505684e-05, "loss": 0.1813, "loss_lm": 0.01673445664346218, "loss_seg": 0.16459247469902039, "mean_token_accuracy": 0.9952283948659897, "num_tokens": 1119047145.0, "step": 2633 }, { "entropy": 0.01893534231930971, "epoch": 1.1527519422256265, "grad_norm": 8.6875, "learning_rate": 2.3800757985923118e-05, "loss": 0.0962, "loss_lm": 0.014672182966023684, "loss_seg": 0.08152676653116941, "mean_token_accuracy": 0.9951244592666626, "num_tokens": 1119472101.0, "step": 2634 }, { "entropy": 0.018828736152499914, "epoch": 1.1531896268738373, "grad_norm": 8.875, "learning_rate": 2.3798050893340552e-05, "loss": 0.1305, "loss_lm": 0.01830701227299869, "loss_seg": 0.112201863899827, "mean_token_accuracy": 0.9951853454113007, "num_tokens": 1119897007.0, "step": 2635 }, { "entropy": 0.018961871974170208, "epoch": 1.1536273115220483, "grad_norm": 11.3125, "learning_rate": 2.3795343800757986e-05, "loss": 0.1809, "loss_lm": 0.018458452774211764, "loss_seg": 0.16242646984755993, "mean_token_accuracy": 0.99502332508564, "num_tokens": 1120322148.0, "step": 2636 }, { "entropy": 0.017953335773199797, "epoch": 1.1540649961702594, "grad_norm": 12.5, "learning_rate": 2.3792636708175423e-05, "loss": 0.1579, "loss_lm": 0.01424598298035562, "loss_seg": 0.1436333004385233, "mean_token_accuracy": 0.9953382760286331, "num_tokens": 1120746812.0, "step": 2637 }, { "entropy": 0.01807384565472603, "epoch": 1.1545026808184704, "grad_norm": 10.625, "learning_rate": 2.3789929615592853e-05, "loss": 0.1154, "loss_lm": 0.015600942308083177, "loss_seg": 0.09984833188354969, "mean_token_accuracy": 0.99536994099617, "num_tokens": 1121171588.0, "step": 2638 }, { "entropy": 0.0186034943908453, "epoch": 1.1549403654666812, "grad_norm": 13.875, "learning_rate": 2.3787222523010287e-05, "loss": 0.1006, "loss_lm": 0.01619927491992712, "loss_seg": 0.08440072275698185, "mean_token_accuracy": 0.9952176958322525, "num_tokens": 1121596229.0, "step": 2639 }, { "entropy": 0.019250606652349234, "epoch": 1.1553780501148923, "grad_norm": 11.4375, "learning_rate": 2.378451543042772e-05, "loss": 0.2053, "loss_lm": 0.01850062282755971, "loss_seg": 0.18683776631951332, "mean_token_accuracy": 0.9950004667043686, "num_tokens": 1122021851.0, "step": 2640 }, { "entropy": 0.01818184508010745, "epoch": 1.1558157347631033, "grad_norm": 7.46875, "learning_rate": 2.3781808337845154e-05, "loss": 0.1356, "loss_lm": 0.014694700250402093, "loss_seg": 0.12087994068861008, "mean_token_accuracy": 0.9952539652585983, "num_tokens": 1122447017.0, "step": 2641 }, { "entropy": 0.01828912738710642, "epoch": 1.156253419411314, "grad_norm": 6.0, "learning_rate": 2.377910124526259e-05, "loss": 0.0978, "loss_lm": 0.015437128720805049, "loss_seg": 0.08232893235981464, "mean_token_accuracy": 0.9952709227800369, "num_tokens": 1122872220.0, "step": 2642 }, { "entropy": 0.018842804711312056, "epoch": 1.1566911040595251, "grad_norm": 8.3125, "learning_rate": 2.377639415268002e-05, "loss": 0.1068, "loss_lm": 0.01704796915873885, "loss_seg": 0.08977949991822243, "mean_token_accuracy": 0.9950889050960541, "num_tokens": 1123297730.0, "step": 2643 }, { "entropy": 0.018654934130609035, "epoch": 1.1571287887077362, "grad_norm": 7.65625, "learning_rate": 2.3773687060097455e-05, "loss": 0.1236, "loss_lm": 0.015807037707418203, "loss_seg": 0.107788042165339, "mean_token_accuracy": 0.9952429234981537, "num_tokens": 1123722085.0, "step": 2644 }, { "entropy": 0.01933956891298294, "epoch": 1.157566473355947, "grad_norm": 7.21875, "learning_rate": 2.377097996751489e-05, "loss": 0.1313, "loss_lm": 0.016868436010554433, "loss_seg": 0.11447924375534058, "mean_token_accuracy": 0.995061382651329, "num_tokens": 1124147288.0, "step": 2645 }, { "entropy": 0.01794045465067029, "epoch": 1.158004158004158, "grad_norm": 21.25, "learning_rate": 2.3768272874932323e-05, "loss": 0.1705, "loss_lm": 0.015466162702068686, "loss_seg": 0.15506302937865257, "mean_token_accuracy": 0.9954457879066467, "num_tokens": 1124572344.0, "step": 2646 }, { "entropy": 0.01839365903288126, "epoch": 1.158441842652369, "grad_norm": 12.4375, "learning_rate": 2.376556578234976e-05, "loss": 0.139, "loss_lm": 0.015740663977339864, "loss_seg": 0.1232486143708229, "mean_token_accuracy": 0.9952105134725571, "num_tokens": 1124997612.0, "step": 2647 }, { "entropy": 0.018432369455695152, "epoch": 1.1588795273005799, "grad_norm": 6.84375, "learning_rate": 2.376285868976719e-05, "loss": 0.1399, "loss_lm": 0.015393749810755253, "loss_seg": 0.12447214312851429, "mean_token_accuracy": 0.995310515165329, "num_tokens": 1125422749.0, "step": 2648 }, { "entropy": 0.018830382730811834, "epoch": 1.159317211948791, "grad_norm": 7.4375, "learning_rate": 2.3760151597184624e-05, "loss": 0.1671, "loss_lm": 0.015185322379693389, "loss_seg": 0.15195471420884132, "mean_token_accuracy": 0.9951538592576981, "num_tokens": 1125847798.0, "step": 2649 }, { "entropy": 0.018816565163433552, "epoch": 1.159754896597002, "grad_norm": 8.9375, "learning_rate": 2.3757444504602058e-05, "loss": 0.0855, "loss_lm": 0.014581880299374461, "loss_seg": 0.07096217945218086, "mean_token_accuracy": 0.995289072394371, "num_tokens": 1126272102.0, "step": 2650 }, { "entropy": 0.018923945724964142, "epoch": 1.1601925812452127, "grad_norm": 5.21875, "learning_rate": 2.375473741201949e-05, "loss": 0.1471, "loss_lm": 0.019417281029745936, "loss_seg": 0.12771937437355518, "mean_token_accuracy": 0.9951619058847427, "num_tokens": 1126698031.0, "step": 2651 }, { "entropy": 0.01869999337941408, "epoch": 1.1606302658934238, "grad_norm": 6.5, "learning_rate": 2.3752030319436925e-05, "loss": 0.124, "loss_lm": 0.016442612279206514, "loss_seg": 0.10751440934836864, "mean_token_accuracy": 0.9951973706483841, "num_tokens": 1127122736.0, "step": 2652 }, { "entropy": 0.018648792058229446, "epoch": 1.1610679505416348, "grad_norm": 11.8125, "learning_rate": 2.374932322685436e-05, "loss": 0.1202, "loss_lm": 0.015249573858454823, "loss_seg": 0.1049613356590271, "mean_token_accuracy": 0.995219498872757, "num_tokens": 1127547759.0, "step": 2653 }, { "entropy": 0.019091767724603415, "epoch": 1.1615056351898456, "grad_norm": 7.8125, "learning_rate": 2.3746616134271793e-05, "loss": 0.1178, "loss_lm": 0.017135726753622293, "loss_seg": 0.1006879098713398, "mean_token_accuracy": 0.9950708150863647, "num_tokens": 1127973150.0, "step": 2654 }, { "entropy": 0.018934094812721014, "epoch": 1.1619433198380567, "grad_norm": 4.03125, "learning_rate": 2.3743909041689226e-05, "loss": 0.1293, "loss_lm": 0.016468915389850736, "loss_seg": 0.11282522976398468, "mean_token_accuracy": 0.9951527267694473, "num_tokens": 1128398709.0, "step": 2655 }, { "entropy": 0.018818178214132786, "epoch": 1.1623810044862677, "grad_norm": 8.0625, "learning_rate": 2.374120194910666e-05, "loss": 0.1041, "loss_lm": 0.015608421294018626, "loss_seg": 0.08852394856512547, "mean_token_accuracy": 0.9951561242341995, "num_tokens": 1128823122.0, "step": 2656 }, { "entropy": 0.018326640594750643, "epoch": 1.1628186891344785, "grad_norm": 6.21875, "learning_rate": 2.3738494856524094e-05, "loss": 0.1093, "loss_lm": 0.015891374787315726, "loss_seg": 0.09336875472217798, "mean_token_accuracy": 0.99528968334198, "num_tokens": 1129247655.0, "step": 2657 }, { "entropy": 0.018193372525274754, "epoch": 1.1632563737826895, "grad_norm": 33.25, "learning_rate": 2.3735787763941528e-05, "loss": 0.1404, "loss_lm": 0.01576847513206303, "loss_seg": 0.12467046268284321, "mean_token_accuracy": 0.9953630417585373, "num_tokens": 1129672633.0, "step": 2658 }, { "entropy": 0.019322392530739307, "epoch": 1.1636940584309006, "grad_norm": 10.0625, "learning_rate": 2.373308067135896e-05, "loss": 0.1209, "loss_lm": 0.01720579294487834, "loss_seg": 0.10371639858931303, "mean_token_accuracy": 0.9949718713760376, "num_tokens": 1130098182.0, "step": 2659 }, { "entropy": 0.018597743939608335, "epoch": 1.1641317430791114, "grad_norm": 14.625, "learning_rate": 2.3730373578776395e-05, "loss": 0.1026, "loss_lm": 0.014739313395693898, "loss_seg": 0.087826794013381, "mean_token_accuracy": 0.995177149772644, "num_tokens": 1130522579.0, "step": 2660 }, { "entropy": 0.018455852288752794, "epoch": 1.1645694277273224, "grad_norm": 20.375, "learning_rate": 2.372766648619383e-05, "loss": 0.0974, "loss_lm": 0.015580232255160809, "loss_seg": 0.08185037411749363, "mean_token_accuracy": 0.9953353255987167, "num_tokens": 1130947428.0, "step": 2661 }, { "entropy": 0.01929796626791358, "epoch": 1.1650071123755334, "grad_norm": 3.640625, "learning_rate": 2.3724959393611262e-05, "loss": 0.1474, "loss_lm": 0.018502830993384123, "loss_seg": 0.12886859849095345, "mean_token_accuracy": 0.9951046705245972, "num_tokens": 1131372698.0, "step": 2662 }, { "entropy": 0.018613734748214483, "epoch": 1.1654447970237445, "grad_norm": 12.25, "learning_rate": 2.3722252301028696e-05, "loss": 0.1024, "loss_lm": 0.016502360347658396, "loss_seg": 0.08588579297065735, "mean_token_accuracy": 0.9952306151390076, "num_tokens": 1131797666.0, "step": 2663 }, { "entropy": 0.01871736766770482, "epoch": 1.1658824816719553, "grad_norm": 6.09375, "learning_rate": 2.371954520844613e-05, "loss": 0.1217, "loss_lm": 0.015359103679656982, "loss_seg": 0.10629336535930634, "mean_token_accuracy": 0.9953082650899887, "num_tokens": 1132223263.0, "step": 2664 }, { "entropy": 0.018362655770033598, "epoch": 1.1663201663201663, "grad_norm": 11.375, "learning_rate": 2.3716838115863564e-05, "loss": 0.1103, "loss_lm": 0.015485939336940646, "loss_seg": 0.09477311000227928, "mean_token_accuracy": 0.9952238649129868, "num_tokens": 1132648262.0, "step": 2665 }, { "entropy": 0.01920110499486327, "epoch": 1.1667578509683774, "grad_norm": 14.4375, "learning_rate": 2.3714131023280997e-05, "loss": 0.1602, "loss_lm": 0.016447732457891107, "loss_seg": 0.14373611472547054, "mean_token_accuracy": 0.9950533956289291, "num_tokens": 1133073576.0, "step": 2666 }, { "entropy": 0.018927425611764193, "epoch": 1.1671955356165882, "grad_norm": 6.6875, "learning_rate": 2.3711423930698428e-05, "loss": 0.1376, "loss_lm": 0.018817148404195905, "loss_seg": 0.11876052618026733, "mean_token_accuracy": 0.9951005578041077, "num_tokens": 1133498114.0, "step": 2667 }, { "entropy": 0.019240488298237324, "epoch": 1.1676332202647992, "grad_norm": 7.09375, "learning_rate": 2.3708716838115865e-05, "loss": 0.1493, "loss_lm": 0.019669481087476015, "loss_seg": 0.12965990975499153, "mean_token_accuracy": 0.9950884133577347, "num_tokens": 1133922384.0, "step": 2668 }, { "entropy": 0.01841482473537326, "epoch": 1.1680709049130102, "grad_norm": 6.5, "learning_rate": 2.37060097455333e-05, "loss": 0.15, "loss_lm": 0.015528923599049449, "loss_seg": 0.13448050990700722, "mean_token_accuracy": 0.9952601045370102, "num_tokens": 1134346412.0, "step": 2669 }, { "entropy": 0.019215059000998735, "epoch": 1.168508589561221, "grad_norm": 11.75, "learning_rate": 2.3703302652950732e-05, "loss": 0.0996, "loss_lm": 0.014169534668326378, "loss_seg": 0.08544146828353405, "mean_token_accuracy": 0.9950259327888489, "num_tokens": 1134771522.0, "step": 2670 }, { "entropy": 0.018638798501342535, "epoch": 1.168946274209432, "grad_norm": 13.1875, "learning_rate": 2.3700595560368166e-05, "loss": 0.1277, "loss_lm": 0.01714333170093596, "loss_seg": 0.11058630980551243, "mean_token_accuracy": 0.9952890127897263, "num_tokens": 1135196656.0, "step": 2671 }, { "entropy": 0.018723895773291588, "epoch": 1.1693839588576431, "grad_norm": 8.25, "learning_rate": 2.3697888467785596e-05, "loss": 0.1315, "loss_lm": 0.015822994289919734, "loss_seg": 0.11570902355015278, "mean_token_accuracy": 0.9951987266540527, "num_tokens": 1135621267.0, "step": 2672 }, { "entropy": 0.018738457467406988, "epoch": 1.169821643505854, "grad_norm": 7.84375, "learning_rate": 2.3695181375203033e-05, "loss": 0.1316, "loss_lm": 0.014131150441244245, "loss_seg": 0.11744060553610325, "mean_token_accuracy": 0.9951157718896866, "num_tokens": 1136046060.0, "step": 2673 }, { "entropy": 0.019334834069013596, "epoch": 1.170259328154065, "grad_norm": 10.1875, "learning_rate": 2.3692474282620467e-05, "loss": 0.1223, "loss_lm": 0.015442815842106938, "loss_seg": 0.10682644508779049, "mean_token_accuracy": 0.995154395699501, "num_tokens": 1136472204.0, "step": 2674 }, { "entropy": 0.01862087333574891, "epoch": 1.170697012802276, "grad_norm": 7.65625, "learning_rate": 2.36897671900379e-05, "loss": 0.1327, "loss_lm": 0.016703140921890736, "loss_seg": 0.11600202228873968, "mean_token_accuracy": 0.9952791631221771, "num_tokens": 1136897688.0, "step": 2675 }, { "entropy": 0.01856586243957281, "epoch": 1.171134697450487, "grad_norm": 3.984375, "learning_rate": 2.368706009745533e-05, "loss": 0.1083, "loss_lm": 0.015942622907459736, "loss_seg": 0.09233772940933704, "mean_token_accuracy": 0.9952089637517929, "num_tokens": 1137323199.0, "step": 2676 }, { "entropy": 0.019401528406888247, "epoch": 1.1715723820986979, "grad_norm": 9.1875, "learning_rate": 2.3684353004872765e-05, "loss": 0.1823, "loss_lm": 0.016412786208093166, "loss_seg": 0.1658782083541155, "mean_token_accuracy": 0.9950127601623535, "num_tokens": 1137748241.0, "step": 2677 }, { "entropy": 0.019289986696094275, "epoch": 1.1720100667469089, "grad_norm": 9.8125, "learning_rate": 2.3681645912290202e-05, "loss": 0.1455, "loss_lm": 0.015245230635628104, "loss_seg": 0.13020535930991173, "mean_token_accuracy": 0.9949850589036942, "num_tokens": 1138173330.0, "step": 2678 }, { "entropy": 0.018976077437400818, "epoch": 1.17244775139512, "grad_norm": 30.875, "learning_rate": 2.3678938819707636e-05, "loss": 0.1625, "loss_lm": 0.017164327204227448, "loss_seg": 0.14528802689164877, "mean_token_accuracy": 0.9951624572277069, "num_tokens": 1138598316.0, "step": 2679 }, { "entropy": 0.01904250495135784, "epoch": 1.1728854360433307, "grad_norm": 3.953125, "learning_rate": 2.367623172712507e-05, "loss": 0.0946, "loss_lm": 0.013668065425008535, "loss_seg": 0.08096382953226566, "mean_token_accuracy": 0.9951447695493698, "num_tokens": 1139024428.0, "step": 2680 }, { "entropy": 0.019351196940988302, "epoch": 1.1733231206915418, "grad_norm": 14.75, "learning_rate": 2.36735246345425e-05, "loss": 0.1605, "loss_lm": 0.01459632976911962, "loss_seg": 0.14589298143982887, "mean_token_accuracy": 0.9950179904699326, "num_tokens": 1139449939.0, "step": 2681 }, { "entropy": 0.02003197744488716, "epoch": 1.1737608053397528, "grad_norm": 9.1875, "learning_rate": 2.3670817541959934e-05, "loss": 0.1405, "loss_lm": 0.020087155047804117, "loss_seg": 0.12039285711944103, "mean_token_accuracy": 0.9949968904256821, "num_tokens": 1139874864.0, "step": 2682 }, { "entropy": 0.019211275037378073, "epoch": 1.1741984899879636, "grad_norm": 6.53125, "learning_rate": 2.366811044937737e-05, "loss": 0.1454, "loss_lm": 0.017610693583264947, "loss_seg": 0.1278052181005478, "mean_token_accuracy": 0.9951241612434387, "num_tokens": 1140299168.0, "step": 2683 }, { "entropy": 0.01837721886113286, "epoch": 1.1746361746361746, "grad_norm": 7.4375, "learning_rate": 2.3665403356794804e-05, "loss": 0.1308, "loss_lm": 0.015704199206084013, "loss_seg": 0.11511104367673397, "mean_token_accuracy": 0.9953085482120514, "num_tokens": 1140724158.0, "step": 2684 }, { "entropy": 0.019504178315401077, "epoch": 1.1750738592843857, "grad_norm": 22.0, "learning_rate": 2.3662696264212238e-05, "loss": 0.1106, "loss_lm": 0.01465146103873849, "loss_seg": 0.09590538591146469, "mean_token_accuracy": 0.9949990808963776, "num_tokens": 1141148812.0, "step": 2685 }, { "entropy": 0.01917329803109169, "epoch": 1.1755115439325965, "grad_norm": 6.875, "learning_rate": 2.365998917162967e-05, "loss": 0.1152, "loss_lm": 0.017411542823538184, "loss_seg": 0.0978113878518343, "mean_token_accuracy": 0.9951311349868774, "num_tokens": 1141574514.0, "step": 2686 }, { "entropy": 0.01900511048734188, "epoch": 1.1759492285808075, "grad_norm": 11.75, "learning_rate": 2.3657282079047102e-05, "loss": 0.1178, "loss_lm": 0.01596489897929132, "loss_seg": 0.10178665071725845, "mean_token_accuracy": 0.9950158894062042, "num_tokens": 1141999608.0, "step": 2687 }, { "entropy": 0.01906414097175002, "epoch": 1.1763869132290186, "grad_norm": 8.625, "learning_rate": 2.365457498646454e-05, "loss": 0.133, "loss_lm": 0.01467806869186461, "loss_seg": 0.11835767515003681, "mean_token_accuracy": 0.9952538758516312, "num_tokens": 1142424532.0, "step": 2688 }, { "entropy": 0.01836982974782586, "epoch": 1.1768245978772294, "grad_norm": 6.40625, "learning_rate": 2.3651867893881973e-05, "loss": 0.0883, "loss_lm": 0.014382666675373912, "loss_seg": 0.07386773079633713, "mean_token_accuracy": 0.995485931634903, "num_tokens": 1142849579.0, "step": 2689 }, { "entropy": 0.018959187902510166, "epoch": 1.1772622825254404, "grad_norm": 9.0, "learning_rate": 2.3649160801299407e-05, "loss": 0.1386, "loss_lm": 0.017225958639755845, "loss_seg": 0.12135807797312737, "mean_token_accuracy": 0.9952588677406311, "num_tokens": 1143274353.0, "step": 2690 }, { "entropy": 0.018498800694942474, "epoch": 1.1776999671736514, "grad_norm": 16.5, "learning_rate": 2.3646453708716837e-05, "loss": 0.1018, "loss_lm": 0.015478558838367462, "loss_seg": 0.08628941513597965, "mean_token_accuracy": 0.9953970462083817, "num_tokens": 1143699025.0, "step": 2691 }, { "entropy": 0.018541328608989716, "epoch": 1.1781376518218623, "grad_norm": 5.84375, "learning_rate": 2.364374661613427e-05, "loss": 0.1217, "loss_lm": 0.018624207470566034, "loss_seg": 0.10307897068560123, "mean_token_accuracy": 0.9953151494264603, "num_tokens": 1144123974.0, "step": 2692 }, { "entropy": 0.018848754465579987, "epoch": 1.1785753364700733, "grad_norm": 5.40625, "learning_rate": 2.3641039523551708e-05, "loss": 0.0996, "loss_lm": 0.01786023541353643, "loss_seg": 0.08174203336238861, "mean_token_accuracy": 0.9952375143766403, "num_tokens": 1144548912.0, "step": 2693 }, { "entropy": 0.019476172514259815, "epoch": 1.1790130211182843, "grad_norm": 7.125, "learning_rate": 2.363833243096914e-05, "loss": 0.1296, "loss_lm": 0.015799382235854864, "loss_seg": 0.11378216184675694, "mean_token_accuracy": 0.9949760735034943, "num_tokens": 1144974030.0, "step": 2694 }, { "entropy": 0.01880395319312811, "epoch": 1.1794507057664951, "grad_norm": 7.6875, "learning_rate": 2.3635625338386575e-05, "loss": 0.1355, "loss_lm": 0.016310919309034944, "loss_seg": 0.11916654370725155, "mean_token_accuracy": 0.995327427983284, "num_tokens": 1145398383.0, "step": 2695 }, { "entropy": 0.018918269779533148, "epoch": 1.1798883904147062, "grad_norm": 10.4375, "learning_rate": 2.3632918245804006e-05, "loss": 0.1324, "loss_lm": 0.016699342522770166, "loss_seg": 0.1156834065914154, "mean_token_accuracy": 0.9952470064163208, "num_tokens": 1145823233.0, "step": 2696 }, { "entropy": 0.01845702901482582, "epoch": 1.1803260750629172, "grad_norm": 6.84375, "learning_rate": 2.363021115322144e-05, "loss": 0.1236, "loss_lm": 0.015634265495464206, "loss_seg": 0.10798284038901329, "mean_token_accuracy": 0.9952610582113266, "num_tokens": 1146248171.0, "step": 2697 }, { "entropy": 0.01875591417774558, "epoch": 1.180763759711128, "grad_norm": 9.625, "learning_rate": 2.3627504060638873e-05, "loss": 0.1102, "loss_lm": 0.015070967143401504, "loss_seg": 0.09512421861290932, "mean_token_accuracy": 0.99527807533741, "num_tokens": 1146672539.0, "step": 2698 }, { "entropy": 0.019474032800644636, "epoch": 1.181201444359339, "grad_norm": 20.75, "learning_rate": 2.362479696805631e-05, "loss": 0.1404, "loss_lm": 0.015111206565052271, "loss_seg": 0.12523887865245342, "mean_token_accuracy": 0.995038241147995, "num_tokens": 1147097380.0, "step": 2699 }, { "entropy": 0.017637669574469328, "epoch": 1.18163912900755, "grad_norm": 3.140625, "learning_rate": 2.362208987547374e-05, "loss": 0.0843, "loss_lm": 0.014910705387592316, "loss_seg": 0.06938761658966541, "mean_token_accuracy": 0.9955138117074966, "num_tokens": 1147521501.0, "step": 2700 }, { "entropy": 0.018740898463875055, "epoch": 1.1820768136557611, "grad_norm": 8.6875, "learning_rate": 2.3619382782891174e-05, "loss": 0.1082, "loss_lm": 0.015123232500627637, "loss_seg": 0.09308532997965813, "mean_token_accuracy": 0.9952387362718582, "num_tokens": 1147946534.0, "step": 2701 }, { "entropy": 0.01853296859189868, "epoch": 1.182514498303972, "grad_norm": 15.0625, "learning_rate": 2.3616675690308608e-05, "loss": 0.1107, "loss_lm": 0.015125021571293473, "loss_seg": 0.09559160843491554, "mean_token_accuracy": 0.9953148514032364, "num_tokens": 1148371022.0, "step": 2702 }, { "entropy": 0.01833850797265768, "epoch": 1.182952182952183, "grad_norm": 24.75, "learning_rate": 2.3613968597726042e-05, "loss": 0.0939, "loss_lm": 0.01701850607059896, "loss_seg": 0.07691372185945511, "mean_token_accuracy": 0.9952064901590347, "num_tokens": 1148796143.0, "step": 2703 }, { "entropy": 0.01883335132151842, "epoch": 1.183389867600394, "grad_norm": 5.28125, "learning_rate": 2.361126150514348e-05, "loss": 0.1684, "loss_lm": 0.01538342540152371, "loss_seg": 0.15305232629179955, "mean_token_accuracy": 0.9951186180114746, "num_tokens": 1149221198.0, "step": 2704 }, { "entropy": 0.01891347160562873, "epoch": 1.1838275522486048, "grad_norm": 6.6875, "learning_rate": 2.360855441256091e-05, "loss": 0.1041, "loss_lm": 0.013751959893852472, "loss_seg": 0.09034210443496704, "mean_token_accuracy": 0.9952058345079422, "num_tokens": 1149645792.0, "step": 2705 }, { "entropy": 0.01917427172884345, "epoch": 1.1842652368968158, "grad_norm": 7.96875, "learning_rate": 2.3605847319978343e-05, "loss": 0.108, "loss_lm": 0.019818324595689774, "loss_seg": 0.08823072537779808, "mean_token_accuracy": 0.9951320886611938, "num_tokens": 1150070799.0, "step": 2706 }, { "entropy": 0.018528854474425316, "epoch": 1.1847029215450269, "grad_norm": 5.0625, "learning_rate": 2.3603140227395777e-05, "loss": 0.1026, "loss_lm": 0.017857427010312676, "loss_seg": 0.0847154539078474, "mean_token_accuracy": 0.9952400922775269, "num_tokens": 1150495983.0, "step": 2707 }, { "entropy": 0.018834416288882494, "epoch": 1.1851406061932377, "grad_norm": 7.8125, "learning_rate": 2.360043313481321e-05, "loss": 0.1587, "loss_lm": 0.017630951944738626, "loss_seg": 0.1410227594897151, "mean_token_accuracy": 0.9951893091201782, "num_tokens": 1150921010.0, "step": 2708 }, { "entropy": 0.01915418542921543, "epoch": 1.1855782908414487, "grad_norm": 46.75, "learning_rate": 2.3597726042230648e-05, "loss": 0.1353, "loss_lm": 0.01673645037226379, "loss_seg": 0.11856092140078545, "mean_token_accuracy": 0.995163694024086, "num_tokens": 1151346307.0, "step": 2709 }, { "entropy": 0.0184220471419394, "epoch": 1.1860159754896598, "grad_norm": 9.0625, "learning_rate": 2.3595018949648078e-05, "loss": 0.1232, "loss_lm": 0.015354712959378958, "loss_seg": 0.10779916495084763, "mean_token_accuracy": 0.995294913649559, "num_tokens": 1151770912.0, "step": 2710 }, { "entropy": 0.01882447162643075, "epoch": 1.1864536601378706, "grad_norm": 57.5, "learning_rate": 2.359231185706551e-05, "loss": 0.1733, "loss_lm": 0.01573898969218135, "loss_seg": 0.15752185508608818, "mean_token_accuracy": 0.9952492862939835, "num_tokens": 1152195860.0, "step": 2711 }, { "entropy": 0.01871344866231084, "epoch": 1.1868913447860816, "grad_norm": 9.9375, "learning_rate": 2.3589604764482945e-05, "loss": 0.1158, "loss_lm": 0.015081912744790316, "loss_seg": 0.10069322027266026, "mean_token_accuracy": 0.995124414563179, "num_tokens": 1152620813.0, "step": 2712 }, { "entropy": 0.018528060521930456, "epoch": 1.1873290294342926, "grad_norm": 19.75, "learning_rate": 2.358689767190038e-05, "loss": 0.1112, "loss_lm": 0.01556808827444911, "loss_seg": 0.09567905403673649, "mean_token_accuracy": 0.9951904118061066, "num_tokens": 1153045765.0, "step": 2713 }, { "entropy": 0.01865590177476406, "epoch": 1.1877667140825037, "grad_norm": 4.3125, "learning_rate": 2.3584190579317816e-05, "loss": 0.1571, "loss_lm": 0.01711092796176672, "loss_seg": 0.13997336477041245, "mean_token_accuracy": 0.9952373504638672, "num_tokens": 1153470477.0, "step": 2714 }, { "entropy": 0.018766222521662712, "epoch": 1.1882043987307145, "grad_norm": 7.59375, "learning_rate": 2.3581483486735247e-05, "loss": 0.1104, "loss_lm": 0.01779902004636824, "loss_seg": 0.09264865890145302, "mean_token_accuracy": 0.9951317310333252, "num_tokens": 1153895432.0, "step": 2715 }, { "entropy": 0.01872216258198023, "epoch": 1.1886420833789255, "grad_norm": 9.75, "learning_rate": 2.357877639415268e-05, "loss": 0.1314, "loss_lm": 0.017898481572046876, "loss_seg": 0.11350998282432556, "mean_token_accuracy": 0.9951800405979156, "num_tokens": 1154321344.0, "step": 2716 }, { "entropy": 0.018906783778220415, "epoch": 1.1890797680271366, "grad_norm": 13.75, "learning_rate": 2.3576069301570114e-05, "loss": 0.101, "loss_lm": 0.015666719526052475, "loss_seg": 0.08533048257231712, "mean_token_accuracy": 0.9950784295797348, "num_tokens": 1154746784.0, "step": 2717 }, { "entropy": 0.018455755431205034, "epoch": 1.1895174526753474, "grad_norm": 12.0, "learning_rate": 2.3573362208987548e-05, "loss": 0.18, "loss_lm": 0.016796150244772434, "loss_seg": 0.16318080946803093, "mean_token_accuracy": 0.9952228665351868, "num_tokens": 1155172530.0, "step": 2718 }, { "entropy": 0.01831752574071288, "epoch": 1.1899551373235584, "grad_norm": 9.0, "learning_rate": 2.3570655116404985e-05, "loss": 0.1062, "loss_lm": 0.015164739219471812, "loss_seg": 0.09104944206774235, "mean_token_accuracy": 0.9952289760112762, "num_tokens": 1155597335.0, "step": 2719 }, { "entropy": 0.01820741966366768, "epoch": 1.1903928219717694, "grad_norm": 8.5625, "learning_rate": 2.3567948023822415e-05, "loss": 0.1444, "loss_lm": 0.01530035212635994, "loss_seg": 0.12905902788043022, "mean_token_accuracy": 0.9953610897064209, "num_tokens": 1156021975.0, "step": 2720 }, { "entropy": 0.018363317474722862, "epoch": 1.1908305066199802, "grad_norm": 5.875, "learning_rate": 2.356524093123985e-05, "loss": 0.1565, "loss_lm": 0.01687556877732277, "loss_seg": 0.13967294991016388, "mean_token_accuracy": 0.9953729808330536, "num_tokens": 1156447029.0, "step": 2721 }, { "entropy": 0.018516522366553545, "epoch": 1.1912681912681913, "grad_norm": 7.375, "learning_rate": 2.3562533838657283e-05, "loss": 0.1048, "loss_lm": 0.01534557738341391, "loss_seg": 0.08948995731770992, "mean_token_accuracy": 0.9953124821186066, "num_tokens": 1156872146.0, "step": 2722 }, { "entropy": 0.018431865610182285, "epoch": 1.1917058759164023, "grad_norm": 5.21875, "learning_rate": 2.3559826746074716e-05, "loss": 0.1146, "loss_lm": 0.013535784324631095, "loss_seg": 0.10104651190340519, "mean_token_accuracy": 0.9952646195888519, "num_tokens": 1157296996.0, "step": 2723 }, { "entropy": 0.0187780587002635, "epoch": 1.1921435605646131, "grad_norm": 6.6875, "learning_rate": 2.355711965349215e-05, "loss": 0.1333, "loss_lm": 0.017000644700601697, "loss_seg": 0.11630142852663994, "mean_token_accuracy": 0.9951414614915848, "num_tokens": 1157722981.0, "step": 2724 }, { "entropy": 0.018975420854985714, "epoch": 1.1925812452128242, "grad_norm": 17.625, "learning_rate": 2.3554412560909584e-05, "loss": 0.1179, "loss_lm": 0.01708751660771668, "loss_seg": 0.1007885504513979, "mean_token_accuracy": 0.9950721561908722, "num_tokens": 1158147498.0, "step": 2725 }, { "entropy": 0.01862305123358965, "epoch": 1.1930189298610352, "grad_norm": 6.34375, "learning_rate": 2.3551705468327018e-05, "loss": 0.1396, "loss_lm": 0.0152441521640867, "loss_seg": 0.1243699211627245, "mean_token_accuracy": 0.9951020330190659, "num_tokens": 1158573150.0, "step": 2726 }, { "entropy": 0.01859312318265438, "epoch": 1.193456614509246, "grad_norm": 6.40625, "learning_rate": 2.354899837574445e-05, "loss": 0.0948, "loss_lm": 0.020309212617576122, "loss_seg": 0.07447114773094654, "mean_token_accuracy": 0.9951405078172684, "num_tokens": 1158998654.0, "step": 2727 }, { "entropy": 0.01797586213797331, "epoch": 1.193894299157457, "grad_norm": 4.78125, "learning_rate": 2.3546291283161885e-05, "loss": 0.1447, "loss_lm": 0.016022682189941406, "loss_seg": 0.12866166420280933, "mean_token_accuracy": 0.9953002780675888, "num_tokens": 1159424146.0, "step": 2728 }, { "entropy": 0.018667523749172688, "epoch": 1.194331983805668, "grad_norm": 7.4375, "learning_rate": 2.354358419057932e-05, "loss": 0.1542, "loss_lm": 0.014754042029380798, "loss_seg": 0.13947699591517448, "mean_token_accuracy": 0.9951990097761154, "num_tokens": 1159848825.0, "step": 2729 }, { "entropy": 0.018418774474412203, "epoch": 1.1947696684538789, "grad_norm": 9.125, "learning_rate": 2.3540877097996752e-05, "loss": 0.1757, "loss_lm": 0.015139448223635554, "loss_seg": 0.1605520024895668, "mean_token_accuracy": 0.9951703399419785, "num_tokens": 1160273897.0, "step": 2730 }, { "entropy": 0.017658609431236982, "epoch": 1.19520735310209, "grad_norm": 19.5, "learning_rate": 2.3538170005414186e-05, "loss": 0.101, "loss_lm": 0.016748389694839716, "loss_seg": 0.08420905377715826, "mean_token_accuracy": 0.9953523725271225, "num_tokens": 1160698796.0, "step": 2731 }, { "entropy": 0.018557399976998568, "epoch": 1.195645037750301, "grad_norm": 16.25, "learning_rate": 2.353546291283162e-05, "loss": 0.1241, "loss_lm": 0.01750723854638636, "loss_seg": 0.1065768450498581, "mean_token_accuracy": 0.9952733814716339, "num_tokens": 1161124216.0, "step": 2732 }, { "entropy": 0.01805790327489376, "epoch": 1.1960827223985118, "grad_norm": 12.5, "learning_rate": 2.3532755820249054e-05, "loss": 0.1236, "loss_lm": 0.015761145623400807, "loss_seg": 0.10782920196652412, "mean_token_accuracy": 0.9952747374773026, "num_tokens": 1161548702.0, "step": 2733 }, { "entropy": 0.018658338114619255, "epoch": 1.1965204070467228, "grad_norm": 10.0625, "learning_rate": 2.3530048727666484e-05, "loss": 0.1355, "loss_lm": 0.015095551265403628, "loss_seg": 0.12043143808841705, "mean_token_accuracy": 0.9952667355537415, "num_tokens": 1161973926.0, "step": 2734 }, { "entropy": 0.017664091661572456, "epoch": 1.1969580916949338, "grad_norm": 10.1875, "learning_rate": 2.352734163508392e-05, "loss": 0.1015, "loss_lm": 0.01366893807426095, "loss_seg": 0.08783392421901226, "mean_token_accuracy": 0.995367556810379, "num_tokens": 1162398936.0, "step": 2735 }, { "entropy": 0.018635192420333624, "epoch": 1.1973957763431446, "grad_norm": 8.625, "learning_rate": 2.3524634542501355e-05, "loss": 0.1186, "loss_lm": 0.016093302750959992, "loss_seg": 0.10248077474534512, "mean_token_accuracy": 0.9951343983411789, "num_tokens": 1162824143.0, "step": 2736 }, { "entropy": 0.01932739419862628, "epoch": 1.1978334609913557, "grad_norm": 8.0, "learning_rate": 2.352192744991879e-05, "loss": 0.0986, "loss_lm": 0.017106004059314728, "loss_seg": 0.08148513548076153, "mean_token_accuracy": 0.9949430078268051, "num_tokens": 1163250063.0, "step": 2737 }, { "entropy": 0.01897861296311021, "epoch": 1.1982711456395667, "grad_norm": 6.6875, "learning_rate": 2.3519220357336222e-05, "loss": 0.1052, "loss_lm": 0.015198900131508708, "loss_seg": 0.09004824049770832, "mean_token_accuracy": 0.9950578063726425, "num_tokens": 1163675207.0, "step": 2738 }, { "entropy": 0.01842037634924054, "epoch": 1.1987088302877777, "grad_norm": 5.90625, "learning_rate": 2.3516513264753653e-05, "loss": 0.1525, "loss_lm": 0.016757366247475147, "loss_seg": 0.1357032060623169, "mean_token_accuracy": 0.9951808303594589, "num_tokens": 1164100350.0, "step": 2739 }, { "entropy": 0.01854737661778927, "epoch": 1.1991465149359886, "grad_norm": 20.875, "learning_rate": 2.351380617217109e-05, "loss": 0.1379, "loss_lm": 0.019324959721416235, "loss_seg": 0.11855894140899181, "mean_token_accuracy": 0.9951507896184921, "num_tokens": 1164525116.0, "step": 2740 }, { "entropy": 0.018392464146018028, "epoch": 1.1995841995841996, "grad_norm": 10.25, "learning_rate": 2.3511099079588523e-05, "loss": 0.1431, "loss_lm": 0.015889815520495176, "loss_seg": 0.1271761041134596, "mean_token_accuracy": 0.9951972663402557, "num_tokens": 1164949648.0, "step": 2741 }, { "entropy": 0.018872014246881008, "epoch": 1.2000218842324106, "grad_norm": 17.25, "learning_rate": 2.3508391987005957e-05, "loss": 0.1303, "loss_lm": 0.017066269414499402, "loss_seg": 0.11322389915585518, "mean_token_accuracy": 0.995112806558609, "num_tokens": 1165375015.0, "step": 2742 }, { "entropy": 0.018516133539378643, "epoch": 1.2004595688806214, "grad_norm": 13.25, "learning_rate": 2.350568489442339e-05, "loss": 0.1218, "loss_lm": 0.01505465479567647, "loss_seg": 0.10670253820717335, "mean_token_accuracy": 0.9953254014253616, "num_tokens": 1165799596.0, "step": 2743 }, { "entropy": 0.01808583689853549, "epoch": 1.2008972535288325, "grad_norm": 21.375, "learning_rate": 2.350297780184082e-05, "loss": 0.1036, "loss_lm": 0.014935153303667903, "loss_seg": 0.08871013298630714, "mean_token_accuracy": 0.9953772276639938, "num_tokens": 1166224187.0, "step": 2744 }, { "entropy": 0.017969142645597458, "epoch": 1.2013349381770435, "grad_norm": 14.125, "learning_rate": 2.3500270709258258e-05, "loss": 0.1096, "loss_lm": 0.01688770530745387, "loss_seg": 0.09273350704461336, "mean_token_accuracy": 0.9953262954950333, "num_tokens": 1166648218.0, "step": 2745 }, { "entropy": 0.018922072369605303, "epoch": 1.2017726228252543, "grad_norm": 10.1875, "learning_rate": 2.3497563616675692e-05, "loss": 0.0982, "loss_lm": 0.016615919768810272, "loss_seg": 0.08160062320530415, "mean_token_accuracy": 0.9950762093067169, "num_tokens": 1167073597.0, "step": 2746 }, { "entropy": 0.018321429379284382, "epoch": 1.2022103074734654, "grad_norm": 5.6875, "learning_rate": 2.3494856524093126e-05, "loss": 0.1069, "loss_lm": 0.014833238907158375, "loss_seg": 0.092115992680192, "mean_token_accuracy": 0.9952697604894638, "num_tokens": 1167499319.0, "step": 2747 }, { "entropy": 0.018057921435683966, "epoch": 1.2026479921216764, "grad_norm": 8.1875, "learning_rate": 2.3492149431510556e-05, "loss": 0.1296, "loss_lm": 0.017206470016390085, "loss_seg": 0.11234814953058958, "mean_token_accuracy": 0.9953303039073944, "num_tokens": 1167923504.0, "step": 2748 }, { "entropy": 0.018678872380405664, "epoch": 1.2030856767698872, "grad_norm": 12.375, "learning_rate": 2.348944233892799e-05, "loss": 0.1401, "loss_lm": 0.016734997276216745, "loss_seg": 0.1233642939478159, "mean_token_accuracy": 0.9952268898487091, "num_tokens": 1168347901.0, "step": 2749 }, { "entropy": 0.019122577272355556, "epoch": 1.2035233614180982, "grad_norm": 12.375, "learning_rate": 2.3486735246345427e-05, "loss": 0.1319, "loss_lm": 0.0167694715783, "loss_seg": 0.11513558030128479, "mean_token_accuracy": 0.9951508194208145, "num_tokens": 1168773278.0, "step": 2750 }, { "entropy": 0.018936671782284975, "epoch": 1.2039610460663093, "grad_norm": 7.375, "learning_rate": 2.348402815376286e-05, "loss": 0.095, "loss_lm": 0.013606725260615349, "loss_seg": 0.08138116635382175, "mean_token_accuracy": 0.9951448738574982, "num_tokens": 1169198550.0, "step": 2751 }, { "entropy": 0.018754094373434782, "epoch": 1.2043987307145203, "grad_norm": 13.8125, "learning_rate": 2.3481321061180294e-05, "loss": 0.1367, "loss_lm": 0.016972549725323915, "loss_seg": 0.11977324448525906, "mean_token_accuracy": 0.9951673448085785, "num_tokens": 1169623956.0, "step": 2752 }, { "entropy": 0.019022959284484386, "epoch": 1.2048364153627311, "grad_norm": 40.75, "learning_rate": 2.3478613968597725e-05, "loss": 0.1283, "loss_lm": 0.01809192099608481, "loss_seg": 0.1102320421487093, "mean_token_accuracy": 0.9951580911874771, "num_tokens": 1170049179.0, "step": 2753 }, { "entropy": 0.018843716476112604, "epoch": 1.2052741000109422, "grad_norm": 17.125, "learning_rate": 2.347590687601516e-05, "loss": 0.199, "loss_lm": 0.015935772797092795, "loss_seg": 0.1831133458763361, "mean_token_accuracy": 0.9951278865337372, "num_tokens": 1170474006.0, "step": 2754 }, { "entropy": 0.018382313661277294, "epoch": 1.2057117846591532, "grad_norm": 8.0, "learning_rate": 2.3473199783432596e-05, "loss": 0.1601, "loss_lm": 0.015937493182718754, "loss_seg": 0.14412131160497665, "mean_token_accuracy": 0.9952258765697479, "num_tokens": 1170899164.0, "step": 2755 }, { "entropy": 0.018641795963048935, "epoch": 1.206149469307364, "grad_norm": 11.5, "learning_rate": 2.347049269085003e-05, "loss": 0.1437, "loss_lm": 0.016110149677842855, "loss_seg": 0.12761066108942032, "mean_token_accuracy": 0.9951723664999008, "num_tokens": 1171323150.0, "step": 2756 }, { "entropy": 0.018580283503979445, "epoch": 1.206587153955575, "grad_norm": 8.6875, "learning_rate": 2.3467785598267463e-05, "loss": 0.1523, "loss_lm": 0.015105298487469554, "loss_seg": 0.13723741844296455, "mean_token_accuracy": 0.9952261745929718, "num_tokens": 1171747921.0, "step": 2757 }, { "entropy": 0.01875891024246812, "epoch": 1.207024838603786, "grad_norm": 5.03125, "learning_rate": 2.3465078505684893e-05, "loss": 0.1808, "loss_lm": 0.014624436618760228, "loss_seg": 0.16621357947587967, "mean_token_accuracy": 0.9952000230550766, "num_tokens": 1172172905.0, "step": 2758 }, { "entropy": 0.01869666250422597, "epoch": 1.2074625232519969, "grad_norm": 10.5625, "learning_rate": 2.3462371413102327e-05, "loss": 0.117, "loss_lm": 0.01778641273267567, "loss_seg": 0.09922870807349682, "mean_token_accuracy": 0.9951213151216507, "num_tokens": 1172598188.0, "step": 2759 }, { "entropy": 0.018527576234191656, "epoch": 1.207900207900208, "grad_norm": 5.125, "learning_rate": 2.3459664320519764e-05, "loss": 0.1391, "loss_lm": 0.015977102564647794, "loss_seg": 0.12316945195198059, "mean_token_accuracy": 0.9951967895030975, "num_tokens": 1173023690.0, "step": 2760 }, { "entropy": 0.019432800356298685, "epoch": 1.208337892548419, "grad_norm": 7.3125, "learning_rate": 2.3456957227937198e-05, "loss": 0.1381, "loss_lm": 0.017748035956174135, "loss_seg": 0.12039765529334545, "mean_token_accuracy": 0.9950012564659119, "num_tokens": 1173449572.0, "step": 2761 }, { "entropy": 0.01801883988082409, "epoch": 1.2087755771966298, "grad_norm": 8.1875, "learning_rate": 2.345425013535463e-05, "loss": 0.1049, "loss_lm": 0.014592936495319009, "loss_seg": 0.0903457310050726, "mean_token_accuracy": 0.9954117089509964, "num_tokens": 1173875143.0, "step": 2762 }, { "entropy": 0.018488552421331406, "epoch": 1.2092132618448408, "grad_norm": 9.0, "learning_rate": 2.3451543042772062e-05, "loss": 0.1429, "loss_lm": 0.0161588778719306, "loss_seg": 0.12677029706537724, "mean_token_accuracy": 0.9952864199876785, "num_tokens": 1174300286.0, "step": 2763 }, { "entropy": 0.018525246065109968, "epoch": 1.2096509464930518, "grad_norm": 14.75, "learning_rate": 2.3448835950189496e-05, "loss": 0.1777, "loss_lm": 0.016323587391525507, "loss_seg": 0.16135291196405888, "mean_token_accuracy": 0.9952248930931091, "num_tokens": 1174725358.0, "step": 2764 }, { "entropy": 0.018584273755550385, "epoch": 1.2100886311412626, "grad_norm": 14.5, "learning_rate": 2.344612885760693e-05, "loss": 0.1196, "loss_lm": 0.015793118625879288, "loss_seg": 0.10376361012458801, "mean_token_accuracy": 0.9952626973390579, "num_tokens": 1175150019.0, "step": 2765 }, { "entropy": 0.018508973531425, "epoch": 1.2105263157894737, "grad_norm": 27.25, "learning_rate": 2.3443421765024367e-05, "loss": 0.1054, "loss_lm": 0.016781739192083478, "loss_seg": 0.0885839406400919, "mean_token_accuracy": 0.9953300803899765, "num_tokens": 1175575465.0, "step": 2766 }, { "entropy": 0.01879012444987893, "epoch": 1.2109640004376847, "grad_norm": 7.46875, "learning_rate": 2.3440714672441797e-05, "loss": 0.1098, "loss_lm": 0.01673781289719045, "loss_seg": 0.0930408239364624, "mean_token_accuracy": 0.9952511340379715, "num_tokens": 1176000185.0, "step": 2767 }, { "entropy": 0.018667419906705618, "epoch": 1.2114016850858955, "grad_norm": 11.9375, "learning_rate": 2.343800757985923e-05, "loss": 0.1355, "loss_lm": 0.018025608034804463, "loss_seg": 0.11746158078312874, "mean_token_accuracy": 0.9952159374952316, "num_tokens": 1176425719.0, "step": 2768 }, { "entropy": 0.01819727662950754, "epoch": 1.2118393697341066, "grad_norm": 10.0625, "learning_rate": 2.3435300487276664e-05, "loss": 0.0938, "loss_lm": 0.0158472191542387, "loss_seg": 0.07792629301548004, "mean_token_accuracy": 0.9953521937131882, "num_tokens": 1176850617.0, "step": 2769 }, { "entropy": 0.018767175264656544, "epoch": 1.2122770543823176, "grad_norm": 7.15625, "learning_rate": 2.3432593394694098e-05, "loss": 0.1494, "loss_lm": 0.017432698514312506, "loss_seg": 0.13195120077580214, "mean_token_accuracy": 0.9950946569442749, "num_tokens": 1177276038.0, "step": 2770 }, { "entropy": 0.01917963195592165, "epoch": 1.2127147390305284, "grad_norm": 7.28125, "learning_rate": 2.3429886302111535e-05, "loss": 0.1231, "loss_lm": 0.018596863839775324, "loss_seg": 0.10450253449380398, "mean_token_accuracy": 0.9950687289237976, "num_tokens": 1177701853.0, "step": 2771 }, { "entropy": 0.01845194725319743, "epoch": 1.2131524236787394, "grad_norm": 10.6875, "learning_rate": 2.3427179209528966e-05, "loss": 0.1293, "loss_lm": 0.017652935115620494, "loss_seg": 0.11168397404253483, "mean_token_accuracy": 0.9952748268842697, "num_tokens": 1178126785.0, "step": 2772 }, { "entropy": 0.019141110125929117, "epoch": 1.2135901083269505, "grad_norm": 13.125, "learning_rate": 2.34244721169464e-05, "loss": 0.1503, "loss_lm": 0.016716981772333384, "loss_seg": 0.13361027371138334, "mean_token_accuracy": 0.9950439929962158, "num_tokens": 1178552101.0, "step": 2773 }, { "entropy": 0.019061145838350058, "epoch": 1.2140277929751613, "grad_norm": 12.75, "learning_rate": 2.3421765024363833e-05, "loss": 0.1393, "loss_lm": 0.01858453475870192, "loss_seg": 0.12067867815494537, "mean_token_accuracy": 0.995136097073555, "num_tokens": 1178977039.0, "step": 2774 }, { "entropy": 0.01921116653829813, "epoch": 1.2144654776233723, "grad_norm": 8.0625, "learning_rate": 2.3419057931781267e-05, "loss": 0.1412, "loss_lm": 0.016958105145022273, "loss_seg": 0.12423734366893768, "mean_token_accuracy": 0.9950879812240601, "num_tokens": 1179402526.0, "step": 2775 }, { "entropy": 0.018607220612466335, "epoch": 1.2149031622715833, "grad_norm": 9.6875, "learning_rate": 2.3416350839198704e-05, "loss": 0.1169, "loss_lm": 0.01467104465700686, "loss_seg": 0.1022315826267004, "mean_token_accuracy": 0.9952310174703598, "num_tokens": 1179827775.0, "step": 2776 }, { "entropy": 0.019006323534995317, "epoch": 1.2153408469197944, "grad_norm": 6.3125, "learning_rate": 2.3413643746616134e-05, "loss": 0.1835, "loss_lm": 0.017930160276591778, "loss_seg": 0.16557618603110313, "mean_token_accuracy": 0.9950881153345108, "num_tokens": 1180253050.0, "step": 2777 }, { "entropy": 0.018896025139838457, "epoch": 1.2157785315680052, "grad_norm": 8.5, "learning_rate": 2.3410936654033568e-05, "loss": 0.1091, "loss_lm": 0.016049663769081235, "loss_seg": 0.09302882850170135, "mean_token_accuracy": 0.9951104819774628, "num_tokens": 1180677814.0, "step": 2778 }, { "entropy": 0.018918467685580254, "epoch": 1.2162162162162162, "grad_norm": 6.375, "learning_rate": 2.3408229561451e-05, "loss": 0.0904, "loss_lm": 0.015439496608451009, "loss_seg": 0.07495170645415783, "mean_token_accuracy": 0.9952389746904373, "num_tokens": 1181103093.0, "step": 2779 }, { "entropy": 0.01858033100143075, "epoch": 1.2166539008644273, "grad_norm": 7.40625, "learning_rate": 2.3405522468868435e-05, "loss": 0.1198, "loss_lm": 0.016250152373686433, "loss_seg": 0.10353538580238819, "mean_token_accuracy": 0.9953164607286453, "num_tokens": 1181528129.0, "step": 2780 }, { "entropy": 0.01906605251133442, "epoch": 1.217091585512638, "grad_norm": 6.0, "learning_rate": 2.3402815376285872e-05, "loss": 0.1012, "loss_lm": 0.01610006671398878, "loss_seg": 0.08507996331900358, "mean_token_accuracy": 0.9951100945472717, "num_tokens": 1181953278.0, "step": 2781 }, { "entropy": 0.018541582860052586, "epoch": 1.217529270160849, "grad_norm": 12.6875, "learning_rate": 2.3400108283703303e-05, "loss": 0.1379, "loss_lm": 0.017796884290874004, "loss_seg": 0.12014922127127647, "mean_token_accuracy": 0.9953467547893524, "num_tokens": 1182377678.0, "step": 2782 }, { "entropy": 0.019315263722091913, "epoch": 1.2179669548090601, "grad_norm": 18.625, "learning_rate": 2.3397401191120736e-05, "loss": 0.1461, "loss_lm": 0.01848397357389331, "loss_seg": 0.1276334673166275, "mean_token_accuracy": 0.9951517730951309, "num_tokens": 1182803162.0, "step": 2783 }, { "entropy": 0.019147701561450958, "epoch": 1.218404639457271, "grad_norm": 5.875, "learning_rate": 2.339469409853817e-05, "loss": 0.1151, "loss_lm": 0.017726552207022905, "loss_seg": 0.09734885022044182, "mean_token_accuracy": 0.9951400309801102, "num_tokens": 1183228438.0, "step": 2784 }, { "entropy": 0.018379773944616318, "epoch": 1.218842324105482, "grad_norm": 5.375, "learning_rate": 2.3391987005955604e-05, "loss": 0.1334, "loss_lm": 0.017533538630232215, "loss_seg": 0.11588257178664207, "mean_token_accuracy": 0.9952077269554138, "num_tokens": 1183653440.0, "step": 2785 }, { "entropy": 0.018734491895884275, "epoch": 1.219280008753693, "grad_norm": 5.96875, "learning_rate": 2.338927991337304e-05, "loss": 0.1248, "loss_lm": 0.017054566415026784, "loss_seg": 0.1077154278755188, "mean_token_accuracy": 0.9952855408191681, "num_tokens": 1184078267.0, "step": 2786 }, { "entropy": 0.018847783096134663, "epoch": 1.2197176934019038, "grad_norm": 14.5, "learning_rate": 2.338657282079047e-05, "loss": 0.1473, "loss_lm": 0.014954220270738006, "loss_seg": 0.13233042880892754, "mean_token_accuracy": 0.9952407032251358, "num_tokens": 1184503907.0, "step": 2787 }, { "entropy": 0.018397873267531395, "epoch": 1.2201553780501149, "grad_norm": 8.3125, "learning_rate": 2.3383865728207905e-05, "loss": 0.1258, "loss_lm": 0.016226451843976974, "loss_seg": 0.10953898541629314, "mean_token_accuracy": 0.9952817410230637, "num_tokens": 1184929006.0, "step": 2788 }, { "entropy": 0.01919118268415332, "epoch": 1.220593062698326, "grad_norm": 10.25, "learning_rate": 2.338115863562534e-05, "loss": 0.1608, "loss_lm": 0.017738403752446175, "loss_seg": 0.1430978048592806, "mean_token_accuracy": 0.9950965344905853, "num_tokens": 1185354181.0, "step": 2789 }, { "entropy": 0.019454045221209526, "epoch": 1.221030747346537, "grad_norm": 7.09375, "learning_rate": 2.3378451543042773e-05, "loss": 0.124, "loss_lm": 0.015859932638704777, "loss_seg": 0.1080989446491003, "mean_token_accuracy": 0.9951703399419785, "num_tokens": 1185780026.0, "step": 2790 }, { "entropy": 0.019092064816504717, "epoch": 1.2214684319947477, "grad_norm": 5.46875, "learning_rate": 2.3375744450460206e-05, "loss": 0.1381, "loss_lm": 0.01612661755643785, "loss_seg": 0.12198048457503319, "mean_token_accuracy": 0.995149776339531, "num_tokens": 1186204923.0, "step": 2791 }, { "entropy": 0.01889429008588195, "epoch": 1.2219061166429588, "grad_norm": 10.9375, "learning_rate": 2.337303735787764e-05, "loss": 0.1298, "loss_lm": 0.015128846513107419, "loss_seg": 0.11468838341534138, "mean_token_accuracy": 0.9951554089784622, "num_tokens": 1186630104.0, "step": 2792 }, { "entropy": 0.019153304398059845, "epoch": 1.2223438012911698, "grad_norm": 40.0, "learning_rate": 2.3370330265295074e-05, "loss": 0.1095, "loss_lm": 0.01517300121486187, "loss_seg": 0.09437157027423382, "mean_token_accuracy": 0.9951155781745911, "num_tokens": 1187055100.0, "step": 2793 }, { "entropy": 0.019132220186293125, "epoch": 1.2227814859393806, "grad_norm": 12.125, "learning_rate": 2.3367623172712507e-05, "loss": 0.1231, "loss_lm": 0.014805008191615343, "loss_seg": 0.10826627910137177, "mean_token_accuracy": 0.9951430708169937, "num_tokens": 1187480447.0, "step": 2794 }, { "entropy": 0.01876391191035509, "epoch": 1.2232191705875917, "grad_norm": 9.25, "learning_rate": 2.336491608012994e-05, "loss": 0.1735, "loss_lm": 0.017425051424652338, "loss_seg": 0.15602628700435162, "mean_token_accuracy": 0.9952598065137863, "num_tokens": 1187905904.0, "step": 2795 }, { "entropy": 0.018698380328714848, "epoch": 1.2236568552358027, "grad_norm": 9.6875, "learning_rate": 2.336220898754737e-05, "loss": 0.1445, "loss_lm": 0.015742814168334007, "loss_seg": 0.12872170098125935, "mean_token_accuracy": 0.9951599985361099, "num_tokens": 1188331890.0, "step": 2796 }, { "entropy": 0.019304873887449503, "epoch": 1.2240945398840135, "grad_norm": 35.5, "learning_rate": 2.335950189496481e-05, "loss": 0.1351, "loss_lm": 0.01685668737627566, "loss_seg": 0.11825650744140148, "mean_token_accuracy": 0.9950329959392548, "num_tokens": 1188757884.0, "step": 2797 }, { "entropy": 0.018298442009836435, "epoch": 1.2245322245322245, "grad_norm": 10.0, "learning_rate": 2.3356794802382242e-05, "loss": 0.1172, "loss_lm": 0.017843836918473244, "loss_seg": 0.09932422265410423, "mean_token_accuracy": 0.9952388256788254, "num_tokens": 1189182745.0, "step": 2798 }, { "entropy": 0.018459439743310213, "epoch": 1.2249699091804356, "grad_norm": 5.375, "learning_rate": 2.3354087709799676e-05, "loss": 0.1106, "loss_lm": 0.017847678624093533, "loss_seg": 0.09278274513781071, "mean_token_accuracy": 0.9952524453401566, "num_tokens": 1189606941.0, "step": 2799 }, { "entropy": 0.018498134333640337, "epoch": 1.2254075938286464, "grad_norm": 6.8125, "learning_rate": 2.335138061721711e-05, "loss": 0.1586, "loss_lm": 0.016965799033641815, "loss_seg": 0.1416579969227314, "mean_token_accuracy": 0.9954155385494232, "num_tokens": 1190032465.0, "step": 2800 }, { "entropy": 0.01858683954924345, "epoch": 1.2258452784768574, "grad_norm": 27.625, "learning_rate": 2.334867352463454e-05, "loss": 0.1225, "loss_lm": 0.01734776352532208, "loss_seg": 0.10511913523077965, "mean_token_accuracy": 0.9952578395605087, "num_tokens": 1190457337.0, "step": 2801 }, { "entropy": 0.018395145423710346, "epoch": 1.2262829631250685, "grad_norm": 13.4375, "learning_rate": 2.3345966432051977e-05, "loss": 0.145, "loss_lm": 0.013049653265625238, "loss_seg": 0.1319201197475195, "mean_token_accuracy": 0.9953843653202057, "num_tokens": 1190882353.0, "step": 2802 }, { "entropy": 0.01920338161289692, "epoch": 1.2267206477732793, "grad_norm": 5.125, "learning_rate": 2.334325933946941e-05, "loss": 0.1161, "loss_lm": 0.01526976516470313, "loss_seg": 0.10087434388697147, "mean_token_accuracy": 0.9951328635215759, "num_tokens": 1191306999.0, "step": 2803 }, { "entropy": 0.018724483903497458, "epoch": 1.2271583324214903, "grad_norm": 8.0625, "learning_rate": 2.3340552246886845e-05, "loss": 0.1021, "loss_lm": 0.01635927427560091, "loss_seg": 0.08576348517090082, "mean_token_accuracy": 0.9952400326728821, "num_tokens": 1191732166.0, "step": 2804 }, { "entropy": 0.0192198040895164, "epoch": 1.2275960170697013, "grad_norm": 7.1875, "learning_rate": 2.333784515430428e-05, "loss": 0.1323, "loss_lm": 0.019097912590950727, "loss_seg": 0.11318985559046268, "mean_token_accuracy": 0.9951635748147964, "num_tokens": 1192157046.0, "step": 2805 }, { "entropy": 0.01877668732777238, "epoch": 1.2280337017179122, "grad_norm": 10.5625, "learning_rate": 2.333513806172171e-05, "loss": 0.1498, "loss_lm": 0.01875188620761037, "loss_seg": 0.13104469142854214, "mean_token_accuracy": 0.9951822906732559, "num_tokens": 1192581692.0, "step": 2806 }, { "entropy": 0.018816367723047733, "epoch": 1.2284713863661232, "grad_norm": 8.875, "learning_rate": 2.3332430969139146e-05, "loss": 0.1383, "loss_lm": 0.014525264035910368, "loss_seg": 0.12380352802574635, "mean_token_accuracy": 0.995135098695755, "num_tokens": 1193006935.0, "step": 2807 }, { "entropy": 0.01910355594009161, "epoch": 1.2289090710143342, "grad_norm": 7.375, "learning_rate": 2.332972387655658e-05, "loss": 0.1081, "loss_lm": 0.015195229090750217, "loss_seg": 0.09288697317242622, "mean_token_accuracy": 0.995075449347496, "num_tokens": 1193431725.0, "step": 2808 }, { "entropy": 0.01839363854378462, "epoch": 1.229346755662545, "grad_norm": 4.75, "learning_rate": 2.3327016783974013e-05, "loss": 0.1415, "loss_lm": 0.017549145501106977, "loss_seg": 0.12395520880818367, "mean_token_accuracy": 0.9953323751688004, "num_tokens": 1193857371.0, "step": 2809 }, { "entropy": 0.018328269012272358, "epoch": 1.229784440310756, "grad_norm": 6.9375, "learning_rate": 2.3324309691391447e-05, "loss": 0.1051, "loss_lm": 0.014867448015138507, "loss_seg": 0.09027298167347908, "mean_token_accuracy": 0.9953340590000153, "num_tokens": 1194282765.0, "step": 2810 }, { "entropy": 0.018360687419772148, "epoch": 1.230222124958967, "grad_norm": 14.5625, "learning_rate": 2.3321602598808877e-05, "loss": 0.0978, "loss_lm": 0.014726761495694518, "loss_seg": 0.08311888203024864, "mean_token_accuracy": 0.9953987896442413, "num_tokens": 1194707181.0, "step": 2811 }, { "entropy": 0.01879947492852807, "epoch": 1.230659809607178, "grad_norm": 8.75, "learning_rate": 2.3318895506226315e-05, "loss": 0.1037, "loss_lm": 0.015244440408423543, "loss_seg": 0.08849242329597473, "mean_token_accuracy": 0.9951564520597458, "num_tokens": 1195132432.0, "step": 2812 }, { "entropy": 0.01898801000788808, "epoch": 1.231097494255389, "grad_norm": 11.75, "learning_rate": 2.3316188413643748e-05, "loss": 0.1211, "loss_lm": 0.01569314463995397, "loss_seg": 0.10535794496536255, "mean_token_accuracy": 0.9950358420610428, "num_tokens": 1195557854.0, "step": 2813 }, { "entropy": 0.018505769781768322, "epoch": 1.2315351789036, "grad_norm": 16.625, "learning_rate": 2.3313481321061182e-05, "loss": 0.0924, "loss_lm": 0.01533963461406529, "loss_seg": 0.07703709974884987, "mean_token_accuracy": 0.9952527284622192, "num_tokens": 1195983122.0, "step": 2814 }, { "entropy": 0.01932804426178336, "epoch": 1.231972863551811, "grad_norm": 18.625, "learning_rate": 2.3310774228478612e-05, "loss": 0.1346, "loss_lm": 0.01558558689430356, "loss_seg": 0.11896674800664186, "mean_token_accuracy": 0.9951249063014984, "num_tokens": 1196409118.0, "step": 2815 }, { "entropy": 0.01823671953752637, "epoch": 1.2324105482000218, "grad_norm": 14.4375, "learning_rate": 2.3308067135896046e-05, "loss": 0.1071, "loss_lm": 0.01614010496996343, "loss_seg": 0.09092794358730316, "mean_token_accuracy": 0.9952782243490219, "num_tokens": 1196833499.0, "step": 2816 }, { "entropy": 0.019514461513608694, "epoch": 1.2328482328482329, "grad_norm": 10.375, "learning_rate": 2.3305360043313483e-05, "loss": 0.1023, "loss_lm": 0.017850457690656185, "loss_seg": 0.08449745457619429, "mean_token_accuracy": 0.9949962645769119, "num_tokens": 1197258960.0, "step": 2817 }, { "entropy": 0.018475846853107214, "epoch": 1.233285917496444, "grad_norm": 10.8125, "learning_rate": 2.3302652950730917e-05, "loss": 0.1541, "loss_lm": 0.016830135602504015, "loss_seg": 0.13723774068057537, "mean_token_accuracy": 0.9952456951141357, "num_tokens": 1197683803.0, "step": 2818 }, { "entropy": 0.018825225066393614, "epoch": 1.2337236021446547, "grad_norm": 8.5, "learning_rate": 2.329994585814835e-05, "loss": 0.1234, "loss_lm": 0.017938268138095737, "loss_seg": 0.10550842806696892, "mean_token_accuracy": 0.9952770918607712, "num_tokens": 1198109775.0, "step": 2819 }, { "entropy": 0.019420322962105274, "epoch": 1.2341612867928657, "grad_norm": 8.3125, "learning_rate": 2.329723876556578e-05, "loss": 0.1256, "loss_lm": 0.017761164577677846, "loss_seg": 0.10786540806293488, "mean_token_accuracy": 0.9949723184108734, "num_tokens": 1198534619.0, "step": 2820 }, { "entropy": 0.018174326047301292, "epoch": 1.2345989714410768, "grad_norm": 19.875, "learning_rate": 2.3294531672983215e-05, "loss": 0.1243, "loss_lm": 0.017487173434346914, "loss_seg": 0.10680234059691429, "mean_token_accuracy": 0.9953763484954834, "num_tokens": 1198959767.0, "step": 2821 }, { "entropy": 0.018117977771908045, "epoch": 1.2350366560892876, "grad_norm": 27.5, "learning_rate": 2.3291824580400652e-05, "loss": 0.1302, "loss_lm": 0.015193530591204762, "loss_seg": 0.11496580578386784, "mean_token_accuracy": 0.9952877759933472, "num_tokens": 1199385177.0, "step": 2822 }, { "entropy": 0.01859590830281377, "epoch": 1.2354743407374986, "grad_norm": 6.6875, "learning_rate": 2.3289117487818086e-05, "loss": 0.113, "loss_lm": 0.01521313888952136, "loss_seg": 0.09781633876264095, "mean_token_accuracy": 0.995321199297905, "num_tokens": 1199809599.0, "step": 2823 }, { "entropy": 0.019000902771949768, "epoch": 1.2359120253857097, "grad_norm": 4.09375, "learning_rate": 2.328641039523552e-05, "loss": 0.0968, "loss_lm": 0.014828369719907641, "loss_seg": 0.0819406807422638, "mean_token_accuracy": 0.9951276481151581, "num_tokens": 1200235022.0, "step": 2824 }, { "entropy": 0.018633412197232246, "epoch": 1.2363497100339205, "grad_norm": 26.5, "learning_rate": 2.328370330265295e-05, "loss": 0.1267, "loss_lm": 0.01601134962402284, "loss_seg": 0.11071561276912689, "mean_token_accuracy": 0.9952378123998642, "num_tokens": 1200660633.0, "step": 2825 }, { "entropy": 0.01888323202729225, "epoch": 1.2367873946821315, "grad_norm": 6.03125, "learning_rate": 2.3280996210070383e-05, "loss": 0.1333, "loss_lm": 0.017329610884189606, "loss_seg": 0.1160089522600174, "mean_token_accuracy": 0.9951352775096893, "num_tokens": 1201086111.0, "step": 2826 }, { "entropy": 0.019006310496479273, "epoch": 1.2372250793303425, "grad_norm": 4.0625, "learning_rate": 2.3278289117487817e-05, "loss": 0.1195, "loss_lm": 0.0167323793284595, "loss_seg": 0.10278201289474964, "mean_token_accuracy": 0.995234802365303, "num_tokens": 1201511233.0, "step": 2827 }, { "entropy": 0.01866354839876294, "epoch": 1.2376627639785536, "grad_norm": 7.375, "learning_rate": 2.3275582024905254e-05, "loss": 0.1186, "loss_lm": 0.017216268926858902, "loss_seg": 0.1014175359159708, "mean_token_accuracy": 0.9951832890510559, "num_tokens": 1201936155.0, "step": 2828 }, { "entropy": 0.01856818050146103, "epoch": 1.2381004486267644, "grad_norm": 16.625, "learning_rate": 2.3272874932322688e-05, "loss": 0.1222, "loss_lm": 0.016440215054899454, "loss_seg": 0.10579881072044373, "mean_token_accuracy": 0.9952609241008759, "num_tokens": 1202360861.0, "step": 2829 }, { "entropy": 0.018819726072251797, "epoch": 1.2385381332749754, "grad_norm": 10.875, "learning_rate": 2.3270167839740118e-05, "loss": 0.1645, "loss_lm": 0.01766425813548267, "loss_seg": 0.14683134108781815, "mean_token_accuracy": 0.9952842891216278, "num_tokens": 1202785776.0, "step": 2830 }, { "entropy": 0.01856615813449025, "epoch": 1.2389758179231865, "grad_norm": 6.25, "learning_rate": 2.3267460747157552e-05, "loss": 0.1475, "loss_lm": 0.016279439674690366, "loss_seg": 0.1312691681087017, "mean_token_accuracy": 0.9951832890510559, "num_tokens": 1203211136.0, "step": 2831 }, { "entropy": 0.018979128450155258, "epoch": 1.2394135025713973, "grad_norm": 8.8125, "learning_rate": 2.3264753654574986e-05, "loss": 0.1506, "loss_lm": 0.017363319639116526, "loss_seg": 0.1332603134214878, "mean_token_accuracy": 0.9951685070991516, "num_tokens": 1203636417.0, "step": 2832 }, { "entropy": 0.01949749607592821, "epoch": 1.2398511872196083, "grad_norm": 10.625, "learning_rate": 2.3262046561992423e-05, "loss": 0.1086, "loss_lm": 0.01813040766865015, "loss_seg": 0.09042129572480917, "mean_token_accuracy": 0.99503293633461, "num_tokens": 1204061605.0, "step": 2833 }, { "entropy": 0.018425208050757647, "epoch": 1.2402888718678193, "grad_norm": 25.375, "learning_rate": 2.3259339469409857e-05, "loss": 0.1414, "loss_lm": 0.017439185408875346, "loss_seg": 0.12398914806544781, "mean_token_accuracy": 0.9952579289674759, "num_tokens": 1204486700.0, "step": 2834 }, { "entropy": 0.018267213366925716, "epoch": 1.2407265565160301, "grad_norm": 10.75, "learning_rate": 2.3256632376827287e-05, "loss": 0.1286, "loss_lm": 0.017618673853576183, "loss_seg": 0.11096759326756, "mean_token_accuracy": 0.9952501207590103, "num_tokens": 1204912355.0, "step": 2835 }, { "entropy": 0.019349316600710154, "epoch": 1.2411642411642412, "grad_norm": 8.1875, "learning_rate": 2.325392528424472e-05, "loss": 0.1435, "loss_lm": 0.016192082781344652, "loss_seg": 0.12734898179769516, "mean_token_accuracy": 0.9950567036867142, "num_tokens": 1205336941.0, "step": 2836 }, { "entropy": 0.018780247308313847, "epoch": 1.2416019258124522, "grad_norm": 10.75, "learning_rate": 2.3251218191662154e-05, "loss": 0.1349, "loss_lm": 0.015989244915544987, "loss_seg": 0.11890882160514593, "mean_token_accuracy": 0.9951719641685486, "num_tokens": 1205761496.0, "step": 2837 }, { "entropy": 0.01868445286527276, "epoch": 1.242039610460663, "grad_norm": 4.9375, "learning_rate": 2.324851109907959e-05, "loss": 0.0841, "loss_lm": 0.015941994730383158, "loss_seg": 0.06814949866384268, "mean_token_accuracy": 0.9952408224344254, "num_tokens": 1206186890.0, "step": 2838 }, { "entropy": 0.018816272262483835, "epoch": 1.242477295108874, "grad_norm": 7.625, "learning_rate": 2.3245804006497022e-05, "loss": 0.1692, "loss_lm": 0.017108128406107426, "loss_seg": 0.15213610231876373, "mean_token_accuracy": 0.995196670293808, "num_tokens": 1206611568.0, "step": 2839 }, { "entropy": 0.019109265878796577, "epoch": 1.242914979757085, "grad_norm": 5.625, "learning_rate": 2.3243096913914455e-05, "loss": 0.096, "loss_lm": 0.015997199807316065, "loss_seg": 0.08000302873551846, "mean_token_accuracy": 0.99510458111763, "num_tokens": 1207037023.0, "step": 2840 }, { "entropy": 0.01906287670135498, "epoch": 1.243352664405296, "grad_norm": 8.5625, "learning_rate": 2.324038982133189e-05, "loss": 0.1363, "loss_lm": 0.016004594741389155, "loss_seg": 0.12034406885504723, "mean_token_accuracy": 0.995207816362381, "num_tokens": 1207462049.0, "step": 2841 }, { "entropy": 0.01854358520358801, "epoch": 1.243790349053507, "grad_norm": 6.0, "learning_rate": 2.3237682728749323e-05, "loss": 0.1674, "loss_lm": 0.014910625759512186, "loss_seg": 0.15247830748558044, "mean_token_accuracy": 0.995208814740181, "num_tokens": 1207887590.0, "step": 2842 }, { "entropy": 0.01825420744717121, "epoch": 1.244228033701718, "grad_norm": 16.0, "learning_rate": 2.323497563616676e-05, "loss": 0.1585, "loss_lm": 0.015836065402254462, "loss_seg": 0.1426958255469799, "mean_token_accuracy": 0.9953756630420685, "num_tokens": 1208312250.0, "step": 2843 }, { "entropy": 0.018527035601437092, "epoch": 1.2446657183499288, "grad_norm": 9.3125, "learning_rate": 2.323226854358419e-05, "loss": 0.1162, "loss_lm": 0.016796465031802654, "loss_seg": 0.0993918739259243, "mean_token_accuracy": 0.995233029127121, "num_tokens": 1208737888.0, "step": 2844 }, { "entropy": 0.01925106765702367, "epoch": 1.2451034029981398, "grad_norm": 7.9375, "learning_rate": 2.3229561451001624e-05, "loss": 0.1537, "loss_lm": 0.016516893869265914, "loss_seg": 0.13720053806900978, "mean_token_accuracy": 0.9950904250144958, "num_tokens": 1209162727.0, "step": 2845 }, { "entropy": 0.018972988706082106, "epoch": 1.2455410876463509, "grad_norm": 4.0625, "learning_rate": 2.3226854358419058e-05, "loss": 0.0999, "loss_lm": 0.014364155475050211, "loss_seg": 0.08548587467521429, "mean_token_accuracy": 0.9952205717563629, "num_tokens": 1209588246.0, "step": 2846 }, { "entropy": 0.018732777796685696, "epoch": 1.2459787722945617, "grad_norm": 9.0625, "learning_rate": 2.322414726583649e-05, "loss": 0.1112, "loss_lm": 0.015709037659689784, "loss_seg": 0.09550637006759644, "mean_token_accuracy": 0.9951555728912354, "num_tokens": 1210013865.0, "step": 2847 }, { "entropy": 0.01803758693858981, "epoch": 1.2464164569427727, "grad_norm": 15.75, "learning_rate": 2.322144017325393e-05, "loss": 0.133, "loss_lm": 0.016520689241588116, "loss_seg": 0.1164916455745697, "mean_token_accuracy": 0.9953530430793762, "num_tokens": 1210437753.0, "step": 2848 }, { "entropy": 0.018766032066196203, "epoch": 1.2468541415909837, "grad_norm": 7.28125, "learning_rate": 2.321873308067136e-05, "loss": 0.113, "loss_lm": 0.01786151179112494, "loss_seg": 0.09516044054180384, "mean_token_accuracy": 0.995218351483345, "num_tokens": 1210862856.0, "step": 2849 }, { "entropy": 0.018844743724912405, "epoch": 1.2472918262391945, "grad_norm": 7.71875, "learning_rate": 2.3216025988088793e-05, "loss": 0.0824, "loss_lm": 0.017419880721718073, "loss_seg": 0.06499461457133293, "mean_token_accuracy": 0.9952218383550644, "num_tokens": 1211287658.0, "step": 2850 }, { "entropy": 0.018183180131018162, "epoch": 1.2477295108874056, "grad_norm": 4.34375, "learning_rate": 2.3213318895506226e-05, "loss": 0.1286, "loss_lm": 0.014058890054002404, "loss_seg": 0.11456923931837082, "mean_token_accuracy": 0.9954482913017273, "num_tokens": 1211712683.0, "step": 2851 }, { "entropy": 0.018487644847482443, "epoch": 1.2481671955356166, "grad_norm": 52.75, "learning_rate": 2.321061180292366e-05, "loss": 0.1268, "loss_lm": 0.015254433266818523, "loss_seg": 0.11155851930379868, "mean_token_accuracy": 0.9953051656484604, "num_tokens": 1212137369.0, "step": 2852 }, { "entropy": 0.018436191137880087, "epoch": 1.2486048801838276, "grad_norm": 8.5, "learning_rate": 2.3207904710341097e-05, "loss": 0.1195, "loss_lm": 0.0180751895532012, "loss_seg": 0.10137945786118507, "mean_token_accuracy": 0.9952864348888397, "num_tokens": 1212562481.0, "step": 2853 }, { "entropy": 0.018601246178150177, "epoch": 1.2490425648320385, "grad_norm": 5.25, "learning_rate": 2.3205197617758528e-05, "loss": 0.1587, "loss_lm": 0.01650626794435084, "loss_seg": 0.14219312369823456, "mean_token_accuracy": 0.9952766597270966, "num_tokens": 1212987510.0, "step": 2854 }, { "entropy": 0.018517029471695423, "epoch": 1.2494802494802495, "grad_norm": 6.84375, "learning_rate": 2.320249052517596e-05, "loss": 0.115, "loss_lm": 0.014595334650948644, "loss_seg": 0.10044451057910919, "mean_token_accuracy": 0.9952511638402939, "num_tokens": 1213412402.0, "step": 2855 }, { "entropy": 0.018675474915653467, "epoch": 1.2499179341284605, "grad_norm": 5.71875, "learning_rate": 2.3199783432593395e-05, "loss": 0.1243, "loss_lm": 0.01572181168012321, "loss_seg": 0.10858395509421825, "mean_token_accuracy": 0.9952100962400436, "num_tokens": 1213837831.0, "step": 2856 }, { "entropy": 0.018806705251336098, "epoch": 1.2503556187766713, "grad_norm": 8.4375, "learning_rate": 2.319707634001083e-05, "loss": 0.1473, "loss_lm": 0.016248061321675777, "loss_seg": 0.13106749765574932, "mean_token_accuracy": 0.995093360543251, "num_tokens": 1214262942.0, "step": 2857 }, { "entropy": 0.018324734177440405, "epoch": 1.2507933034248824, "grad_norm": 9.4375, "learning_rate": 2.3194369247428266e-05, "loss": 0.1152, "loss_lm": 0.01468494231812656, "loss_seg": 0.10051416046917439, "mean_token_accuracy": 0.9952365756034851, "num_tokens": 1214687809.0, "step": 2858 }, { "entropy": 0.019088398665189743, "epoch": 1.2512309880730934, "grad_norm": 11.9375, "learning_rate": 2.3191662154845696e-05, "loss": 0.1652, "loss_lm": 0.015929699409753084, "loss_seg": 0.14922703430056572, "mean_token_accuracy": 0.9950011521577835, "num_tokens": 1215113082.0, "step": 2859 }, { "entropy": 0.018567852675914764, "epoch": 1.2516686727213042, "grad_norm": 5.6875, "learning_rate": 2.318895506226313e-05, "loss": 0.0989, "loss_lm": 0.015307299327105284, "loss_seg": 0.08355656452476978, "mean_token_accuracy": 0.9952152669429779, "num_tokens": 1215537951.0, "step": 2860 }, { "entropy": 0.018496960401535034, "epoch": 1.2521063573695153, "grad_norm": 25.5, "learning_rate": 2.3186247969680564e-05, "loss": 0.1425, "loss_lm": 0.01595846703276038, "loss_seg": 0.1265362873673439, "mean_token_accuracy": 0.9952316135168076, "num_tokens": 1215962609.0, "step": 2861 }, { "entropy": 0.018402243964374065, "epoch": 1.2525440420177263, "grad_norm": 23.375, "learning_rate": 2.3183540877097997e-05, "loss": 0.1218, "loss_lm": 0.016355249099433422, "loss_seg": 0.10549441166222095, "mean_token_accuracy": 0.9952091872692108, "num_tokens": 1216388396.0, "step": 2862 }, { "entropy": 0.018251188565045595, "epoch": 1.2529817266659373, "grad_norm": 7.3125, "learning_rate": 2.3180833784515428e-05, "loss": 0.1519, "loss_lm": 0.01541992207057774, "loss_seg": 0.136519193649292, "mean_token_accuracy": 0.9952697157859802, "num_tokens": 1216813697.0, "step": 2863 }, { "entropy": 0.018784675747156143, "epoch": 1.2534194113141481, "grad_norm": 26.75, "learning_rate": 2.3178126691932865e-05, "loss": 0.1199, "loss_lm": 0.01704379660077393, "loss_seg": 0.10283139534294605, "mean_token_accuracy": 0.9952391237020493, "num_tokens": 1217238581.0, "step": 2864 }, { "entropy": 0.01921038795262575, "epoch": 1.2538570959623592, "grad_norm": 6.3125, "learning_rate": 2.31754195993503e-05, "loss": 0.1169, "loss_lm": 0.015250473748892546, "loss_seg": 0.10169845633208752, "mean_token_accuracy": 0.9950304478406906, "num_tokens": 1217663951.0, "step": 2865 }, { "entropy": 0.0190112697891891, "epoch": 1.2542947806105702, "grad_norm": 35.0, "learning_rate": 2.3172712506767732e-05, "loss": 0.1688, "loss_lm": 0.01475689560174942, "loss_seg": 0.15405436232686043, "mean_token_accuracy": 0.9949985146522522, "num_tokens": 1218089186.0, "step": 2866 }, { "entropy": 0.01799109624698758, "epoch": 1.254732465258781, "grad_norm": 8.75, "learning_rate": 2.3170005414185166e-05, "loss": 0.1168, "loss_lm": 0.013975802343338728, "loss_seg": 0.10285023227334023, "mean_token_accuracy": 0.9953833967447281, "num_tokens": 1218513461.0, "step": 2867 }, { "entropy": 0.018233776092529297, "epoch": 1.255170149906992, "grad_norm": 8.125, "learning_rate": 2.3167298321602596e-05, "loss": 0.1473, "loss_lm": 0.01714173285290599, "loss_seg": 0.13011869974434376, "mean_token_accuracy": 0.9953577518463135, "num_tokens": 1218938649.0, "step": 2868 }, { "entropy": 0.01829884061589837, "epoch": 1.255607834555203, "grad_norm": 7.90625, "learning_rate": 2.3164591229020034e-05, "loss": 0.1289, "loss_lm": 0.014712858945131302, "loss_seg": 0.11422025598585606, "mean_token_accuracy": 0.9952806532382965, "num_tokens": 1219363648.0, "step": 2869 }, { "entropy": 0.018971120938658714, "epoch": 1.256045519203414, "grad_norm": 9.0, "learning_rate": 2.3161884136437467e-05, "loss": 0.1558, "loss_lm": 0.019156641326844692, "loss_seg": 0.13667266257107258, "mean_token_accuracy": 0.9951386898756027, "num_tokens": 1219788806.0, "step": 2870 }, { "entropy": 0.01749281259253621, "epoch": 1.256483203851625, "grad_norm": 7.3125, "learning_rate": 2.31591770438549e-05, "loss": 0.1042, "loss_lm": 0.014988898998126388, "loss_seg": 0.08917116187512875, "mean_token_accuracy": 0.9955044537782669, "num_tokens": 1220213827.0, "step": 2871 }, { "entropy": 0.017996262293308973, "epoch": 1.256920888499836, "grad_norm": 18.875, "learning_rate": 2.3156469951272335e-05, "loss": 0.1252, "loss_lm": 0.014037364860996604, "loss_seg": 0.1111790556460619, "mean_token_accuracy": 0.9953224658966064, "num_tokens": 1220638376.0, "step": 2872 }, { "entropy": 0.018890108447521925, "epoch": 1.2573585731480468, "grad_norm": 6.6875, "learning_rate": 2.3153762858689765e-05, "loss": 0.1465, "loss_lm": 0.01778501830995083, "loss_seg": 0.12870402820408344, "mean_token_accuracy": 0.9951166063547134, "num_tokens": 1221063679.0, "step": 2873 }, { "entropy": 0.018655958585441113, "epoch": 1.2577962577962578, "grad_norm": 8.625, "learning_rate": 2.3151055766107202e-05, "loss": 0.1291, "loss_lm": 0.016159639228135347, "loss_seg": 0.11290371976792812, "mean_token_accuracy": 0.9950635731220245, "num_tokens": 1221489258.0, "step": 2874 }, { "entropy": 0.01830011000856757, "epoch": 1.2582339424444688, "grad_norm": 10.5625, "learning_rate": 2.3148348673524636e-05, "loss": 0.1292, "loss_lm": 0.016472506802529097, "loss_seg": 0.11270422115921974, "mean_token_accuracy": 0.9953203201293945, "num_tokens": 1221914244.0, "step": 2875 }, { "entropy": 0.01898934179916978, "epoch": 1.2586716270926797, "grad_norm": 8.3125, "learning_rate": 2.314564158094207e-05, "loss": 0.1301, "loss_lm": 0.01646101800724864, "loss_seg": 0.11362360790371895, "mean_token_accuracy": 0.9950172305107117, "num_tokens": 1222338894.0, "step": 2876 }, { "entropy": 0.018723691813647747, "epoch": 1.2591093117408907, "grad_norm": 4.375, "learning_rate": 2.3142934488359503e-05, "loss": 0.1106, "loss_lm": 0.018084221985191107, "loss_seg": 0.0925280824303627, "mean_token_accuracy": 0.9952433258295059, "num_tokens": 1222763945.0, "step": 2877 }, { "entropy": 0.018239822704344988, "epoch": 1.2595469963891017, "grad_norm": 6.71875, "learning_rate": 2.3140227395776934e-05, "loss": 0.168, "loss_lm": 0.016106252558529377, "loss_seg": 0.15186107717454433, "mean_token_accuracy": 0.9953276515007019, "num_tokens": 1223188300.0, "step": 2878 }, { "entropy": 0.018432358745485544, "epoch": 1.2599846810373125, "grad_norm": 6.0, "learning_rate": 2.313752030319437e-05, "loss": 0.1139, "loss_lm": 0.014968620147556067, "loss_seg": 0.09895501472055912, "mean_token_accuracy": 0.9952323883771896, "num_tokens": 1223612795.0, "step": 2879 }, { "entropy": 0.01897980459034443, "epoch": 1.2604223656855236, "grad_norm": 7.6875, "learning_rate": 2.3134813210611805e-05, "loss": 0.1441, "loss_lm": 0.016912643797695637, "loss_seg": 0.1271867584437132, "mean_token_accuracy": 0.9950941801071167, "num_tokens": 1224039214.0, "step": 2880 }, { "entropy": 0.018167045433074236, "epoch": 1.2608600503337346, "grad_norm": 10.8125, "learning_rate": 2.3132106118029238e-05, "loss": 0.149, "loss_lm": 0.014331113314256072, "loss_seg": 0.13470725156366825, "mean_token_accuracy": 0.995337039232254, "num_tokens": 1224464856.0, "step": 2881 }, { "entropy": 0.018369408790022135, "epoch": 1.2612977349819454, "grad_norm": 27.0, "learning_rate": 2.3129399025446672e-05, "loss": 0.1504, "loss_lm": 0.017491242615506053, "loss_seg": 0.1328878104686737, "mean_token_accuracy": 0.9952462762594223, "num_tokens": 1224890153.0, "step": 2882 }, { "entropy": 0.018229042179882526, "epoch": 1.2617354196301565, "grad_norm": 28.5, "learning_rate": 2.3126691932864102e-05, "loss": 0.1084, "loss_lm": 0.016304126009345055, "loss_seg": 0.09206777159124613, "mean_token_accuracy": 0.9953115582466125, "num_tokens": 1225314751.0, "step": 2883 }, { "entropy": 0.01863346714526415, "epoch": 1.2621731042783675, "grad_norm": 4.875, "learning_rate": 2.312398484028154e-05, "loss": 0.1508, "loss_lm": 0.016962707275524735, "loss_seg": 0.13383719511330128, "mean_token_accuracy": 0.995172992348671, "num_tokens": 1225740456.0, "step": 2884 }, { "entropy": 0.018179562874138355, "epoch": 1.2626107889265783, "grad_norm": 12.3125, "learning_rate": 2.3121277747698973e-05, "loss": 0.1615, "loss_lm": 0.01709839957766235, "loss_seg": 0.14442452415823936, "mean_token_accuracy": 0.9953910261392593, "num_tokens": 1226165658.0, "step": 2885 }, { "entropy": 0.01894738106057048, "epoch": 1.2630484735747893, "grad_norm": 5.0, "learning_rate": 2.3118570655116407e-05, "loss": 0.146, "loss_lm": 0.017102975398302078, "loss_seg": 0.12890570797026157, "mean_token_accuracy": 0.9950626939535141, "num_tokens": 1226590152.0, "step": 2886 }, { "entropy": 0.01852515898644924, "epoch": 1.2634861582230004, "grad_norm": 24.0, "learning_rate": 2.3115863562533837e-05, "loss": 0.1026, "loss_lm": 0.015623058658093214, "loss_seg": 0.0869614128023386, "mean_token_accuracy": 0.9952308982610703, "num_tokens": 1227015575.0, "step": 2887 }, { "entropy": 0.018877229187637568, "epoch": 1.2639238428712112, "grad_norm": 7.40625, "learning_rate": 2.311315646995127e-05, "loss": 0.1372, "loss_lm": 0.017133760498836637, "loss_seg": 0.12007005140185356, "mean_token_accuracy": 0.9952541589736938, "num_tokens": 1227440564.0, "step": 2888 }, { "entropy": 0.018179940525442362, "epoch": 1.2643615275194222, "grad_norm": 5.375, "learning_rate": 2.3110449377368708e-05, "loss": 0.1505, "loss_lm": 0.016227621119469404, "loss_seg": 0.13426274619996548, "mean_token_accuracy": 0.9953655451536179, "num_tokens": 1227864943.0, "step": 2889 }, { "entropy": 0.018715525045990944, "epoch": 1.2647992121676332, "grad_norm": 8.125, "learning_rate": 2.3107742284786142e-05, "loss": 0.1192, "loss_lm": 0.016138562466949224, "loss_seg": 0.10307839140295982, "mean_token_accuracy": 0.9952460527420044, "num_tokens": 1228290646.0, "step": 2890 }, { "entropy": 0.018122913781553507, "epoch": 1.265236896815844, "grad_norm": 7.375, "learning_rate": 2.3105035192203576e-05, "loss": 0.1, "loss_lm": 0.014818616909906268, "loss_seg": 0.08518189657479525, "mean_token_accuracy": 0.9952694028615952, "num_tokens": 1228714867.0, "step": 2891 }, { "entropy": 0.018146853428333998, "epoch": 1.265674581464055, "grad_norm": 5.25, "learning_rate": 2.3102328099621006e-05, "loss": 0.1192, "loss_lm": 0.016705409856513143, "loss_seg": 0.10247863829135895, "mean_token_accuracy": 0.9952553361654282, "num_tokens": 1229140750.0, "step": 2892 }, { "entropy": 0.018193175084888935, "epoch": 1.2661122661122661, "grad_norm": 21.25, "learning_rate": 2.309962100703844e-05, "loss": 0.1173, "loss_lm": 0.013250426389276981, "loss_seg": 0.10402658767998219, "mean_token_accuracy": 0.9952842593193054, "num_tokens": 1229565033.0, "step": 2893 }, { "entropy": 0.018358091358095407, "epoch": 1.266549950760477, "grad_norm": 5.71875, "learning_rate": 2.3096913914455873e-05, "loss": 0.1373, "loss_lm": 0.015726278303191066, "loss_seg": 0.12158746644854546, "mean_token_accuracy": 0.9952517598867416, "num_tokens": 1229989973.0, "step": 2894 }, { "entropy": 0.01878488529473543, "epoch": 1.266987635408688, "grad_norm": 20.75, "learning_rate": 2.309420682187331e-05, "loss": 0.1361, "loss_lm": 0.014959779800847173, "loss_seg": 0.12112843245267868, "mean_token_accuracy": 0.9951632618904114, "num_tokens": 1230414958.0, "step": 2895 }, { "entropy": 0.0185895306058228, "epoch": 1.267425320056899, "grad_norm": 7.125, "learning_rate": 2.3091499729290744e-05, "loss": 0.1024, "loss_lm": 0.019682949874550104, "loss_seg": 0.08269590325653553, "mean_token_accuracy": 0.9952351152896881, "num_tokens": 1230840064.0, "step": 2896 }, { "entropy": 0.018212270457297564, "epoch": 1.26786300470511, "grad_norm": 6.0, "learning_rate": 2.3088792636708174e-05, "loss": 0.1168, "loss_lm": 0.016163601772859693, "loss_seg": 0.10059162974357605, "mean_token_accuracy": 0.9953573048114777, "num_tokens": 1231265431.0, "step": 2897 }, { "entropy": 0.018439948558807373, "epoch": 1.2683006893533209, "grad_norm": 8.125, "learning_rate": 2.3086085544125608e-05, "loss": 0.1295, "loss_lm": 0.016296873334795237, "loss_seg": 0.1131662018597126, "mean_token_accuracy": 0.9952835589647293, "num_tokens": 1231690304.0, "step": 2898 }, { "entropy": 0.01858149701729417, "epoch": 1.2687383740015319, "grad_norm": 28.625, "learning_rate": 2.3083378451543042e-05, "loss": 0.0992, "loss_lm": 0.015951265348121524, "loss_seg": 0.08322742581367493, "mean_token_accuracy": 0.9952517002820969, "num_tokens": 1232114632.0, "step": 2899 }, { "entropy": 0.018715862650424242, "epoch": 1.269176058649743, "grad_norm": 28.5, "learning_rate": 2.308067135896048e-05, "loss": 0.1036, "loss_lm": 0.014815916074439883, "loss_seg": 0.08882204629480839, "mean_token_accuracy": 0.9951424598693848, "num_tokens": 1232539309.0, "step": 2900 }, { "entropy": 0.018426996655762196, "epoch": 1.269613743297954, "grad_norm": 8.25, "learning_rate": 2.3077964266377913e-05, "loss": 0.138, "loss_lm": 0.015261909225955606, "loss_seg": 0.12269982695579529, "mean_token_accuracy": 0.9952602982521057, "num_tokens": 1232964198.0, "step": 2901 }, { "entropy": 0.01870466535910964, "epoch": 1.2700514279461648, "grad_norm": 9.875, "learning_rate": 2.3075257173795343e-05, "loss": 0.1306, "loss_lm": 0.016289229039102793, "loss_seg": 0.11429310869425535, "mean_token_accuracy": 0.9952114969491959, "num_tokens": 1233389360.0, "step": 2902 }, { "entropy": 0.019058877136558294, "epoch": 1.2704891125943758, "grad_norm": 8.375, "learning_rate": 2.3072550081212777e-05, "loss": 0.1218, "loss_lm": 0.017207097727805376, "loss_seg": 0.10459197498857975, "mean_token_accuracy": 0.9950899034738541, "num_tokens": 1233814579.0, "step": 2903 }, { "entropy": 0.01775816408917308, "epoch": 1.2709267972425868, "grad_norm": 8.3125, "learning_rate": 2.306984298863021e-05, "loss": 0.1087, "loss_lm": 0.015169763471931219, "loss_seg": 0.09355533495545387, "mean_token_accuracy": 0.9954462647438049, "num_tokens": 1234239881.0, "step": 2904 }, { "entropy": 0.019107474945485592, "epoch": 1.2713644818907976, "grad_norm": 26.375, "learning_rate": 2.3067135896047648e-05, "loss": 0.1322, "loss_lm": 0.020522410748526454, "loss_seg": 0.11165092885494232, "mean_token_accuracy": 0.9951006919145584, "num_tokens": 1234664391.0, "step": 2905 }, { "entropy": 0.018139797262847424, "epoch": 1.2718021665390087, "grad_norm": 10.8125, "learning_rate": 2.3064428803465078e-05, "loss": 0.0949, "loss_lm": 0.014834780944511294, "loss_seg": 0.08002119325101376, "mean_token_accuracy": 0.995400533080101, "num_tokens": 1235089585.0, "step": 2906 }, { "entropy": 0.01931199198588729, "epoch": 1.2722398511872197, "grad_norm": 8.9375, "learning_rate": 2.3061721710882512e-05, "loss": 0.1193, "loss_lm": 0.017360211815685034, "loss_seg": 0.10188979469239712, "mean_token_accuracy": 0.9949692636728287, "num_tokens": 1235514934.0, "step": 2907 }, { "entropy": 0.018402349203824997, "epoch": 1.2726775358354305, "grad_norm": 8.0625, "learning_rate": 2.3059014618299945e-05, "loss": 0.1468, "loss_lm": 0.01778020430356264, "loss_seg": 0.12899897247552872, "mean_token_accuracy": 0.9952289760112762, "num_tokens": 1235939948.0, "step": 2908 }, { "entropy": 0.018607396632432938, "epoch": 1.2731152204836416, "grad_norm": 11.5, "learning_rate": 2.305630752571738e-05, "loss": 0.1216, "loss_lm": 0.01656882814131677, "loss_seg": 0.1050045546144247, "mean_token_accuracy": 0.9951802492141724, "num_tokens": 1236364848.0, "step": 2909 }, { "entropy": 0.018841457087546587, "epoch": 1.2735529051318526, "grad_norm": 34.0, "learning_rate": 2.3053600433134816e-05, "loss": 0.1566, "loss_lm": 0.015668583568185568, "loss_seg": 0.14093770645558834, "mean_token_accuracy": 0.9951408803462982, "num_tokens": 1236790523.0, "step": 2910 }, { "entropy": 0.018781742546707392, "epoch": 1.2739905897800634, "grad_norm": 8.4375, "learning_rate": 2.3050893340552247e-05, "loss": 0.1097, "loss_lm": 0.01719652791507542, "loss_seg": 0.09251419082283974, "mean_token_accuracy": 0.9952110797166824, "num_tokens": 1237215423.0, "step": 2911 }, { "entropy": 0.01904341485351324, "epoch": 1.2744282744282744, "grad_norm": 9.375, "learning_rate": 2.304818624796968e-05, "loss": 0.1089, "loss_lm": 0.015068606473505497, "loss_seg": 0.09386228583753109, "mean_token_accuracy": 0.9950720220804214, "num_tokens": 1237639915.0, "step": 2912 }, { "entropy": 0.018587426282465458, "epoch": 1.2748659590764855, "grad_norm": 5.375, "learning_rate": 2.3045479155387114e-05, "loss": 0.1051, "loss_lm": 0.015556162688881159, "loss_seg": 0.08953247684985399, "mean_token_accuracy": 0.9952044039964676, "num_tokens": 1238065028.0, "step": 2913 }, { "entropy": 0.018717842176556587, "epoch": 1.2753036437246963, "grad_norm": 16.75, "learning_rate": 2.3042772062804548e-05, "loss": 0.1078, "loss_lm": 0.016538290306925774, "loss_seg": 0.09129046089947224, "mean_token_accuracy": 0.9951327294111252, "num_tokens": 1238491113.0, "step": 2914 }, { "entropy": 0.019101827405393124, "epoch": 1.2757413283729073, "grad_norm": 10.3125, "learning_rate": 2.3040064970221985e-05, "loss": 0.1555, "loss_lm": 0.01751546235755086, "loss_seg": 0.13796865940093994, "mean_token_accuracy": 0.9951034337282181, "num_tokens": 1238915649.0, "step": 2915 }, { "entropy": 0.018684436567127705, "epoch": 1.2761790130211184, "grad_norm": 10.4375, "learning_rate": 2.3037357877639415e-05, "loss": 0.1148, "loss_lm": 0.014661313965916634, "loss_seg": 0.10017762146890163, "mean_token_accuracy": 0.9953237771987915, "num_tokens": 1239340345.0, "step": 2916 }, { "entropy": 0.018500732723623514, "epoch": 1.2766166976693292, "grad_norm": 12.0, "learning_rate": 2.303465078505685e-05, "loss": 0.1092, "loss_lm": 0.014463724102824926, "loss_seg": 0.09473420586436987, "mean_token_accuracy": 0.9951486885547638, "num_tokens": 1239765161.0, "step": 2917 }, { "entropy": 0.01840868592262268, "epoch": 1.2770543823175402, "grad_norm": 35.5, "learning_rate": 2.3031943692474283e-05, "loss": 0.1099, "loss_lm": 0.015486316755414009, "loss_seg": 0.09444906748831272, "mean_token_accuracy": 0.995244026184082, "num_tokens": 1240189646.0, "step": 2918 }, { "entropy": 0.018096966668963432, "epoch": 1.2774920669657512, "grad_norm": 54.0, "learning_rate": 2.3029236599891716e-05, "loss": 0.1022, "loss_lm": 0.016831517685204744, "loss_seg": 0.08532339334487915, "mean_token_accuracy": 0.9954205304384232, "num_tokens": 1240614310.0, "step": 2919 }, { "entropy": 0.019231809303164482, "epoch": 1.277929751613962, "grad_norm": 11.125, "learning_rate": 2.3026529507309154e-05, "loss": 0.1636, "loss_lm": 0.02071509067900479, "loss_seg": 0.14291192404925823, "mean_token_accuracy": 0.9951281696557999, "num_tokens": 1241039372.0, "step": 2920 }, { "entropy": 0.018757441081106663, "epoch": 1.278367436262173, "grad_norm": 18.375, "learning_rate": 2.3023822414726584e-05, "loss": 0.1209, "loss_lm": 0.020839375909417868, "loss_seg": 0.1001041829586029, "mean_token_accuracy": 0.9951882809400558, "num_tokens": 1241464742.0, "step": 2921 }, { "entropy": 0.019752726890146732, "epoch": 1.2788051209103841, "grad_norm": 5.4375, "learning_rate": 2.3021115322144018e-05, "loss": 0.1308, "loss_lm": 0.01994346105493605, "loss_seg": 0.11083858646452427, "mean_token_accuracy": 0.9949802309274673, "num_tokens": 1241889644.0, "step": 2922 }, { "entropy": 0.018975455313920975, "epoch": 1.279242805558595, "grad_norm": 7.6875, "learning_rate": 2.301840822956145e-05, "loss": 0.1274, "loss_lm": 0.015727717895060778, "loss_seg": 0.11162659712135792, "mean_token_accuracy": 0.9950858801603317, "num_tokens": 1242314719.0, "step": 2923 }, { "entropy": 0.01943346345797181, "epoch": 1.279680490206806, "grad_norm": 7.84375, "learning_rate": 2.3015701136978885e-05, "loss": 0.1647, "loss_lm": 0.018051402177661657, "loss_seg": 0.14662939310073853, "mean_token_accuracy": 0.9950547963380814, "num_tokens": 1242739547.0, "step": 2924 }, { "entropy": 0.018311346415430307, "epoch": 1.280118174855017, "grad_norm": 5.625, "learning_rate": 2.301299404439632e-05, "loss": 0.1412, "loss_lm": 0.014355150982737541, "loss_seg": 0.12681385595351458, "mean_token_accuracy": 0.9952167719602585, "num_tokens": 1243164551.0, "step": 2925 }, { "entropy": 0.01924827042967081, "epoch": 1.2805558595032278, "grad_norm": 3.328125, "learning_rate": 2.3010286951813753e-05, "loss": 0.1663, "loss_lm": 0.01771492906846106, "loss_seg": 0.14853827469050884, "mean_token_accuracy": 0.9949765503406525, "num_tokens": 1243589530.0, "step": 2926 }, { "entropy": 0.018213474191725254, "epoch": 1.2809935441514388, "grad_norm": 6.3125, "learning_rate": 2.3007579859231186e-05, "loss": 0.1123, "loss_lm": 0.013941431883722544, "loss_seg": 0.09832682088017464, "mean_token_accuracy": 0.9953214228153229, "num_tokens": 1244014625.0, "step": 2927 }, { "entropy": 0.018267234787344933, "epoch": 1.2814312287996499, "grad_norm": 9.75, "learning_rate": 2.300487276664862e-05, "loss": 0.1288, "loss_lm": 0.01590784825384617, "loss_seg": 0.1128858458250761, "mean_token_accuracy": 0.9953242391347885, "num_tokens": 1244438885.0, "step": 2928 }, { "entropy": 0.018758670426905155, "epoch": 1.2818689134478607, "grad_norm": 7.59375, "learning_rate": 2.3002165674066054e-05, "loss": 0.1023, "loss_lm": 0.015935851028189063, "loss_seg": 0.08635776303708553, "mean_token_accuracy": 0.9951479732990265, "num_tokens": 1244864232.0, "step": 2929 }, { "entropy": 0.018668631091713905, "epoch": 1.2823065980960717, "grad_norm": 10.0, "learning_rate": 2.2999458581483484e-05, "loss": 0.1334, "loss_lm": 0.015258693834766746, "loss_seg": 0.11814338155090809, "mean_token_accuracy": 0.995331346988678, "num_tokens": 1245288952.0, "step": 2930 }, { "entropy": 0.018814377021044493, "epoch": 1.2827442827442828, "grad_norm": 11.25, "learning_rate": 2.299675148890092e-05, "loss": 0.1041, "loss_lm": 0.01590777887031436, "loss_seg": 0.08815627545118332, "mean_token_accuracy": 0.9952645748853683, "num_tokens": 1245713165.0, "step": 2931 }, { "entropy": 0.018920394591987133, "epoch": 1.2831819673924936, "grad_norm": 7.28125, "learning_rate": 2.2994044396318355e-05, "loss": 0.1522, "loss_lm": 0.015714116394519806, "loss_seg": 0.13648520782589912, "mean_token_accuracy": 0.9951404929161072, "num_tokens": 1246137887.0, "step": 2932 }, { "entropy": 0.018711484968662262, "epoch": 1.2836196520407046, "grad_norm": 5.1875, "learning_rate": 2.299133730373579e-05, "loss": 0.0965, "loss_lm": 0.014139947947114706, "loss_seg": 0.08235142193734646, "mean_token_accuracy": 0.9951294958591461, "num_tokens": 1246563281.0, "step": 2933 }, { "entropy": 0.019023279193788767, "epoch": 1.2840573366889156, "grad_norm": 9.9375, "learning_rate": 2.2988630211153222e-05, "loss": 0.1042, "loss_lm": 0.017574963625520468, "loss_seg": 0.08667451236397028, "mean_token_accuracy": 0.9951140582561493, "num_tokens": 1246988124.0, "step": 2934 }, { "entropy": 0.01814826251938939, "epoch": 1.2844950213371267, "grad_norm": 6.0625, "learning_rate": 2.2985923118570653e-05, "loss": 0.1618, "loss_lm": 0.01637882017530501, "loss_seg": 0.14539020787924528, "mean_token_accuracy": 0.9952792376279831, "num_tokens": 1247413469.0, "step": 2935 }, { "entropy": 0.019340371247380972, "epoch": 1.2849327059853375, "grad_norm": 6.0625, "learning_rate": 2.298321602598809e-05, "loss": 0.139, "loss_lm": 0.017670833971351385, "loss_seg": 0.12137603759765625, "mean_token_accuracy": 0.9949954450130463, "num_tokens": 1247838664.0, "step": 2936 }, { "entropy": 0.018823168706148863, "epoch": 1.2853703906335485, "grad_norm": 13.25, "learning_rate": 2.2980508933405524e-05, "loss": 0.1432, "loss_lm": 0.018835451919585466, "loss_seg": 0.12435261532664299, "mean_token_accuracy": 0.995207667350769, "num_tokens": 1248262449.0, "step": 2937 }, { "entropy": 0.018174788914620876, "epoch": 1.2858080752817596, "grad_norm": 8.4375, "learning_rate": 2.2977801840822957e-05, "loss": 0.1247, "loss_lm": 0.01704921922646463, "loss_seg": 0.1076898779720068, "mean_token_accuracy": 0.995335265994072, "num_tokens": 1248687289.0, "step": 2938 }, { "entropy": 0.019366866443306208, "epoch": 1.2862457599299706, "grad_norm": 6.34375, "learning_rate": 2.297509474824039e-05, "loss": 0.1247, "loss_lm": 0.019522670190781355, "loss_seg": 0.10513504594564438, "mean_token_accuracy": 0.9950864613056183, "num_tokens": 1249113038.0, "step": 2939 }, { "entropy": 0.01815885119140148, "epoch": 1.2866834445781814, "grad_norm": 7.09375, "learning_rate": 2.297238765565782e-05, "loss": 0.1256, "loss_lm": 0.017178048845380545, "loss_seg": 0.10838620737195015, "mean_token_accuracy": 0.9953839927911758, "num_tokens": 1249538439.0, "step": 2940 }, { "entropy": 0.017675502225756645, "epoch": 1.2871211292263924, "grad_norm": 7.40625, "learning_rate": 2.296968056307526e-05, "loss": 0.1045, "loss_lm": 0.013190350960940123, "loss_seg": 0.09132098220288754, "mean_token_accuracy": 0.995478942990303, "num_tokens": 1249963291.0, "step": 2941 }, { "entropy": 0.01869225036352873, "epoch": 1.2875588138746035, "grad_norm": 7.65625, "learning_rate": 2.2966973470492692e-05, "loss": 0.1059, "loss_lm": 0.01903310976922512, "loss_seg": 0.08685931004583836, "mean_token_accuracy": 0.9951971024274826, "num_tokens": 1250388627.0, "step": 2942 }, { "entropy": 0.018944515846669674, "epoch": 1.2879964985228143, "grad_norm": 17.125, "learning_rate": 2.2964266377910126e-05, "loss": 0.1404, "loss_lm": 0.01631496148183942, "loss_seg": 0.12406313419342041, "mean_token_accuracy": 0.9951112270355225, "num_tokens": 1250813239.0, "step": 2943 }, { "entropy": 0.018854999914765358, "epoch": 1.2884341831710253, "grad_norm": 9.4375, "learning_rate": 2.296155928532756e-05, "loss": 0.1607, "loss_lm": 0.015610286500304937, "loss_seg": 0.14507760666310787, "mean_token_accuracy": 0.9950980693101883, "num_tokens": 1251237956.0, "step": 2944 }, { "entropy": 0.01807579444721341, "epoch": 1.2888718678192363, "grad_norm": 10.6875, "learning_rate": 2.295885219274499e-05, "loss": 0.1166, "loss_lm": 0.016038726083934307, "loss_seg": 0.10053260996937752, "mean_token_accuracy": 0.9953405559062958, "num_tokens": 1251663073.0, "step": 2945 }, { "entropy": 0.018068308010697365, "epoch": 1.2893095524674472, "grad_norm": 5.125, "learning_rate": 2.2956145100162427e-05, "loss": 0.142, "loss_lm": 0.016114833997562528, "loss_seg": 0.12590299546718597, "mean_token_accuracy": 0.9953491389751434, "num_tokens": 1252088330.0, "step": 2946 }, { "entropy": 0.01868631551042199, "epoch": 1.2897472371156582, "grad_norm": 12.875, "learning_rate": 2.295343800757986e-05, "loss": 0.134, "loss_lm": 0.01547387195751071, "loss_seg": 0.11851990781724453, "mean_token_accuracy": 0.9951898604631424, "num_tokens": 1252513024.0, "step": 2947 }, { "entropy": 0.019363887142390013, "epoch": 1.2901849217638692, "grad_norm": 15.625, "learning_rate": 2.2950730914997295e-05, "loss": 0.1463, "loss_lm": 0.01788270310498774, "loss_seg": 0.12839127890765667, "mean_token_accuracy": 0.9950766712427139, "num_tokens": 1252938073.0, "step": 2948 }, { "entropy": 0.01881395559757948, "epoch": 1.29062260641208, "grad_norm": 4.65625, "learning_rate": 2.2948023822414728e-05, "loss": 0.1495, "loss_lm": 0.017759619979187846, "loss_seg": 0.13172144815325737, "mean_token_accuracy": 0.9951058626174927, "num_tokens": 1253363212.0, "step": 2949 }, { "entropy": 0.0178302270360291, "epoch": 1.291060291060291, "grad_norm": 6.84375, "learning_rate": 2.294531672983216e-05, "loss": 0.1378, "loss_lm": 0.015231189085170627, "loss_seg": 0.12252482399344444, "mean_token_accuracy": 0.9953726977109909, "num_tokens": 1253787851.0, "step": 2950 }, { "entropy": 0.018155541736632586, "epoch": 1.291497975708502, "grad_norm": 4.5, "learning_rate": 2.2942609637249596e-05, "loss": 0.1292, "loss_lm": 0.013847470050677657, "loss_seg": 0.1153416559100151, "mean_token_accuracy": 0.9953522831201553, "num_tokens": 1254213181.0, "step": 2951 }, { "entropy": 0.01874921191483736, "epoch": 1.291935660356713, "grad_norm": 6.09375, "learning_rate": 2.293990254466703e-05, "loss": 0.1344, "loss_lm": 0.015078657772392035, "loss_seg": 0.1192797627300024, "mean_token_accuracy": 0.995105966925621, "num_tokens": 1254638176.0, "step": 2952 }, { "entropy": 0.01888485625386238, "epoch": 1.292373345004924, "grad_norm": 8.3125, "learning_rate": 2.2937195452084463e-05, "loss": 0.1292, "loss_lm": 0.01797597692348063, "loss_seg": 0.11123795248568058, "mean_token_accuracy": 0.9951682090759277, "num_tokens": 1255062625.0, "step": 2953 }, { "entropy": 0.01874395925551653, "epoch": 1.292811029653135, "grad_norm": 8.8125, "learning_rate": 2.2934488359501893e-05, "loss": 0.1258, "loss_lm": 0.018604349810630083, "loss_seg": 0.10718020237982273, "mean_token_accuracy": 0.9951750189065933, "num_tokens": 1255486845.0, "step": 2954 }, { "entropy": 0.018372798804193735, "epoch": 1.2932487143013458, "grad_norm": 5.375, "learning_rate": 2.2931781266919327e-05, "loss": 0.148, "loss_lm": 0.01663051382638514, "loss_seg": 0.1314161941409111, "mean_token_accuracy": 0.9952586889266968, "num_tokens": 1255911978.0, "step": 2955 }, { "entropy": 0.01872770069167018, "epoch": 1.2936863989495568, "grad_norm": 7.84375, "learning_rate": 2.2929074174336764e-05, "loss": 0.1318, "loss_lm": 0.016115642385557294, "loss_seg": 0.11565005034208298, "mean_token_accuracy": 0.9951536506414413, "num_tokens": 1256337012.0, "step": 2956 }, { "entropy": 0.018664001487195492, "epoch": 1.2941240835977679, "grad_norm": 7.875, "learning_rate": 2.2926367081754198e-05, "loss": 0.1311, "loss_lm": 0.013788794633001089, "loss_seg": 0.11733445804566145, "mean_token_accuracy": 0.9953122287988663, "num_tokens": 1256762316.0, "step": 2957 }, { "entropy": 0.018986167386174202, "epoch": 1.2945617682459787, "grad_norm": 10.75, "learning_rate": 2.2923659989171632e-05, "loss": 0.1599, "loss_lm": 0.017591470386832952, "loss_seg": 0.1422783676534891, "mean_token_accuracy": 0.9951458126306534, "num_tokens": 1257187488.0, "step": 2958 }, { "entropy": 0.0183316171169281, "epoch": 1.2949994528941897, "grad_norm": 7.90625, "learning_rate": 2.2920952896589062e-05, "loss": 0.0988, "loss_lm": 0.015347130363807082, "loss_seg": 0.08345530554652214, "mean_token_accuracy": 0.9952401518821716, "num_tokens": 1257612050.0, "step": 2959 }, { "entropy": 0.01846247585490346, "epoch": 1.2954371375424008, "grad_norm": 6.6875, "learning_rate": 2.2918245804006496e-05, "loss": 0.1115, "loss_lm": 0.016388519667088985, "loss_seg": 0.09509257599711418, "mean_token_accuracy": 0.9952648729085922, "num_tokens": 1258036907.0, "step": 2960 }, { "entropy": 0.01832430809736252, "epoch": 1.2958748221906116, "grad_norm": 4.625, "learning_rate": 2.291553871142393e-05, "loss": 0.1031, "loss_lm": 0.013414530549198389, "loss_seg": 0.08973349444568157, "mean_token_accuracy": 0.9951993674039841, "num_tokens": 1258462006.0, "step": 2961 }, { "entropy": 0.018261029850691557, "epoch": 1.2963125068388226, "grad_norm": 13.25, "learning_rate": 2.2912831618841367e-05, "loss": 0.1284, "loss_lm": 0.01609325036406517, "loss_seg": 0.11234669014811516, "mean_token_accuracy": 0.9952777475118637, "num_tokens": 1258887184.0, "step": 2962 }, { "entropy": 0.018690348602831364, "epoch": 1.2967501914870336, "grad_norm": 14.875, "learning_rate": 2.29101245262588e-05, "loss": 0.118, "loss_lm": 0.017025486566126347, "loss_seg": 0.10100630298256874, "mean_token_accuracy": 0.9951516091823578, "num_tokens": 1259311842.0, "step": 2963 }, { "entropy": 0.019243882037699223, "epoch": 1.2971878761352444, "grad_norm": 5.96875, "learning_rate": 2.290741743367623e-05, "loss": 0.0835, "loss_lm": 0.016839044401422143, "loss_seg": 0.0666142888367176, "mean_token_accuracy": 0.9950758069753647, "num_tokens": 1259736258.0, "step": 2964 }, { "entropy": 0.017964386846870184, "epoch": 1.2976255607834555, "grad_norm": 12.5625, "learning_rate": 2.2904710341093664e-05, "loss": 0.106, "loss_lm": 0.014843474375084043, "loss_seg": 0.0911079840734601, "mean_token_accuracy": 0.9954029470682144, "num_tokens": 1260161464.0, "step": 2965 }, { "entropy": 0.018434989266097546, "epoch": 1.2980632454316665, "grad_norm": 7.21875, "learning_rate": 2.2902003248511098e-05, "loss": 0.1313, "loss_lm": 0.015425199875608087, "loss_seg": 0.11585073545575142, "mean_token_accuracy": 0.9952472895383835, "num_tokens": 1260586453.0, "step": 2966 }, { "entropy": 0.018445338122546673, "epoch": 1.2985009300798773, "grad_norm": 14.3125, "learning_rate": 2.2899296155928535e-05, "loss": 0.0888, "loss_lm": 0.014778069220483303, "loss_seg": 0.07405164279043674, "mean_token_accuracy": 0.9952222406864166, "num_tokens": 1261011660.0, "step": 2967 }, { "entropy": 0.018648586235940456, "epoch": 1.2989386147280884, "grad_norm": 9.875, "learning_rate": 2.289658906334597e-05, "loss": 0.0978, "loss_lm": 0.017502060625702143, "loss_seg": 0.08026325237005949, "mean_token_accuracy": 0.9951582998037338, "num_tokens": 1261435851.0, "step": 2968 }, { "entropy": 0.01769248303025961, "epoch": 1.2993762993762994, "grad_norm": 3.84375, "learning_rate": 2.28938819707634e-05, "loss": 0.1058, "loss_lm": 0.015181780094280839, "loss_seg": 0.09066471830010414, "mean_token_accuracy": 0.9954380840063095, "num_tokens": 1261861018.0, "step": 2969 }, { "entropy": 0.018491862807422876, "epoch": 1.2998139840245102, "grad_norm": 10.1875, "learning_rate": 2.2891174878180833e-05, "loss": 0.1306, "loss_lm": 0.015268721152096987, "loss_seg": 0.11530937068164349, "mean_token_accuracy": 0.995254322886467, "num_tokens": 1262285766.0, "step": 2970 }, { "entropy": 0.01855820743367076, "epoch": 1.3002516686727212, "grad_norm": 22.75, "learning_rate": 2.2888467785598267e-05, "loss": 0.1231, "loss_lm": 0.015776414657011628, "loss_seg": 0.10734537616372108, "mean_token_accuracy": 0.9953366965055466, "num_tokens": 1262711141.0, "step": 2971 }, { "entropy": 0.018544222693890333, "epoch": 1.3006893533209323, "grad_norm": 8.25, "learning_rate": 2.2885760693015704e-05, "loss": 0.1566, "loss_lm": 0.018085128162056208, "loss_seg": 0.13854906894266605, "mean_token_accuracy": 0.9953182637691498, "num_tokens": 1263135714.0, "step": 2972 }, { "entropy": 0.018679232336580753, "epoch": 1.3011270379691433, "grad_norm": 6.75, "learning_rate": 2.2883053600433138e-05, "loss": 0.1783, "loss_lm": 0.016091713681817055, "loss_seg": 0.16222567111253738, "mean_token_accuracy": 0.9950733929872513, "num_tokens": 1263560704.0, "step": 2973 }, { "entropy": 0.018708469346165657, "epoch": 1.3015647226173541, "grad_norm": 8.8125, "learning_rate": 2.2880346507850568e-05, "loss": 0.1465, "loss_lm": 0.014042069436982274, "loss_seg": 0.132442157715559, "mean_token_accuracy": 0.9951591044664383, "num_tokens": 1263985384.0, "step": 2974 }, { "entropy": 0.018938731867820024, "epoch": 1.3020024072655652, "grad_norm": 5.34375, "learning_rate": 2.2877639415268002e-05, "loss": 0.1964, "loss_lm": 0.019668887834995985, "loss_seg": 0.17669215239584446, "mean_token_accuracy": 0.995052695274353, "num_tokens": 1264410017.0, "step": 2975 }, { "entropy": 0.01885571051388979, "epoch": 1.3024400919137762, "grad_norm": 4.84375, "learning_rate": 2.2874932322685435e-05, "loss": 0.1047, "loss_lm": 0.016746568260714412, "loss_seg": 0.08795503713190556, "mean_token_accuracy": 0.995135486125946, "num_tokens": 1264835328.0, "step": 2976 }, { "entropy": 0.018294849898666143, "epoch": 1.3028777765619872, "grad_norm": 7.5625, "learning_rate": 2.2872225230102873e-05, "loss": 0.1588, "loss_lm": 0.017881128238514066, "loss_seg": 0.1409150194376707, "mean_token_accuracy": 0.9952412843704224, "num_tokens": 1265260417.0, "step": 2977 }, { "entropy": 0.018635272979736328, "epoch": 1.303315461210198, "grad_norm": 9.875, "learning_rate": 2.2869518137520303e-05, "loss": 0.1375, "loss_lm": 0.01937505090609193, "loss_seg": 0.11813287995755672, "mean_token_accuracy": 0.9951350688934326, "num_tokens": 1265684859.0, "step": 2978 }, { "entropy": 0.01813955558463931, "epoch": 1.303753145858409, "grad_norm": 10.0625, "learning_rate": 2.2866811044937737e-05, "loss": 0.1411, "loss_lm": 0.014900295529514551, "loss_seg": 0.12615684233605862, "mean_token_accuracy": 0.9953524321317673, "num_tokens": 1266109719.0, "step": 2979 }, { "entropy": 0.01869409903883934, "epoch": 1.30419083050662, "grad_norm": 6.78125, "learning_rate": 2.286410395235517e-05, "loss": 0.1468, "loss_lm": 0.016959830885753036, "loss_seg": 0.12986718118190765, "mean_token_accuracy": 0.9953064769506454, "num_tokens": 1266534520.0, "step": 2980 }, { "entropy": 0.018526810687035322, "epoch": 1.304628515154831, "grad_norm": 18.125, "learning_rate": 2.2861396859772604e-05, "loss": 0.148, "loss_lm": 0.01681650849059224, "loss_seg": 0.13115971721708775, "mean_token_accuracy": 0.9951886981725693, "num_tokens": 1266959947.0, "step": 2981 }, { "entropy": 0.018278141040354967, "epoch": 1.305066199803042, "grad_norm": 6.78125, "learning_rate": 2.285868976719004e-05, "loss": 0.1158, "loss_lm": 0.016140796476975083, "loss_seg": 0.09965355973690748, "mean_token_accuracy": 0.995344415307045, "num_tokens": 1267384124.0, "step": 2982 }, { "entropy": 0.01823547389358282, "epoch": 1.305503884451253, "grad_norm": 5.4375, "learning_rate": 2.285598267460747e-05, "loss": 0.1176, "loss_lm": 0.014676062623038888, "loss_seg": 0.10295043885707855, "mean_token_accuracy": 0.9953899681568146, "num_tokens": 1267809451.0, "step": 2983 }, { "entropy": 0.018304476514458656, "epoch": 1.3059415690994638, "grad_norm": 4.71875, "learning_rate": 2.2853275582024905e-05, "loss": 0.1147, "loss_lm": 0.020496776327490807, "loss_seg": 0.0941785890609026, "mean_token_accuracy": 0.9953905194997787, "num_tokens": 1268234618.0, "step": 2984 }, { "entropy": 0.01870338013395667, "epoch": 1.3063792537476748, "grad_norm": 13.3125, "learning_rate": 2.285056848944234e-05, "loss": 0.1684, "loss_lm": 0.015462140087038279, "loss_seg": 0.15290278755128384, "mean_token_accuracy": 0.9952828288078308, "num_tokens": 1268660021.0, "step": 2985 }, { "entropy": 0.01822464819997549, "epoch": 1.3068169383958859, "grad_norm": 4.46875, "learning_rate": 2.2847861396859773e-05, "loss": 0.1192, "loss_lm": 0.017104613361880183, "loss_seg": 0.10212351754307747, "mean_token_accuracy": 0.9952620565891266, "num_tokens": 1269085492.0, "step": 2986 }, { "entropy": 0.018088477198034525, "epoch": 1.3072546230440967, "grad_norm": 7.4375, "learning_rate": 2.284515430427721e-05, "loss": 0.1456, "loss_lm": 0.014520279364660382, "loss_seg": 0.1311289668083191, "mean_token_accuracy": 0.9954536110162735, "num_tokens": 1269509943.0, "step": 2987 }, { "entropy": 0.01924221310764551, "epoch": 1.3076923076923077, "grad_norm": 8.9375, "learning_rate": 2.284244721169464e-05, "loss": 0.1651, "loss_lm": 0.015995553694665432, "loss_seg": 0.14905878901481628, "mean_token_accuracy": 0.9951248914003372, "num_tokens": 1269935363.0, "step": 2988 }, { "entropy": 0.01819454925134778, "epoch": 1.3081299923405187, "grad_norm": 10.0625, "learning_rate": 2.2839740119112074e-05, "loss": 0.1328, "loss_lm": 0.01659560576081276, "loss_seg": 0.11615865305066109, "mean_token_accuracy": 0.9952171444892883, "num_tokens": 1270360600.0, "step": 2989 }, { "entropy": 0.018350556958466768, "epoch": 1.3085676769887296, "grad_norm": 8.3125, "learning_rate": 2.2837033026529508e-05, "loss": 0.121, "loss_lm": 0.014404465677216649, "loss_seg": 0.10658660344779491, "mean_token_accuracy": 0.9953313171863556, "num_tokens": 1270785730.0, "step": 2990 }, { "entropy": 0.01858981279656291, "epoch": 1.3090053616369406, "grad_norm": 34.0, "learning_rate": 2.283432593394694e-05, "loss": 0.1039, "loss_lm": 0.01612793910317123, "loss_seg": 0.08774356637150049, "mean_token_accuracy": 0.9953209161758423, "num_tokens": 1271210776.0, "step": 2991 }, { "entropy": 0.01875845482572913, "epoch": 1.3094430462851516, "grad_norm": 4.625, "learning_rate": 2.2831618841364375e-05, "loss": 0.0779, "loss_lm": 0.015527637675404549, "loss_seg": 0.062412683852016926, "mean_token_accuracy": 0.995075136423111, "num_tokens": 1271636312.0, "step": 2992 }, { "entropy": 0.01826296467334032, "epoch": 1.3098807309333624, "grad_norm": 7.03125, "learning_rate": 2.282891174878181e-05, "loss": 0.1521, "loss_lm": 0.016720861895009875, "loss_seg": 0.135398980230093, "mean_token_accuracy": 0.9952906221151352, "num_tokens": 1272060331.0, "step": 2993 }, { "entropy": 0.019298347178846598, "epoch": 1.3103184155815735, "grad_norm": 18.0, "learning_rate": 2.2826204656199243e-05, "loss": 0.145, "loss_lm": 0.018712144810706377, "loss_seg": 0.12631524726748466, "mean_token_accuracy": 0.9949957430362701, "num_tokens": 1272485323.0, "step": 2994 }, { "entropy": 0.018701044842600822, "epoch": 1.3107561002297845, "grad_norm": 16.0, "learning_rate": 2.2823497563616676e-05, "loss": 0.1486, "loss_lm": 0.014421896310523152, "loss_seg": 0.13419488817453384, "mean_token_accuracy": 0.9952113926410675, "num_tokens": 1272910470.0, "step": 2995 }, { "entropy": 0.01935264328494668, "epoch": 1.3111937848779953, "grad_norm": 10.1875, "learning_rate": 2.282079047103411e-05, "loss": 0.1133, "loss_lm": 0.01597682759165764, "loss_seg": 0.09734515845775604, "mean_token_accuracy": 0.9950771182775497, "num_tokens": 1273335612.0, "step": 2996 }, { "entropy": 0.018831040244549513, "epoch": 1.3116314695262064, "grad_norm": 4.71875, "learning_rate": 2.2818083378451544e-05, "loss": 0.1183, "loss_lm": 0.014699011109769344, "loss_seg": 0.10359789989888668, "mean_token_accuracy": 0.9951471537351608, "num_tokens": 1273760902.0, "step": 2997 }, { "entropy": 0.01921902084723115, "epoch": 1.3120691541744174, "grad_norm": 14.0, "learning_rate": 2.2815376285868977e-05, "loss": 0.1356, "loss_lm": 0.019005391746759415, "loss_seg": 0.11662724241614342, "mean_token_accuracy": 0.9949975460767746, "num_tokens": 1274185873.0, "step": 2998 }, { "entropy": 0.01900405529886484, "epoch": 1.3125068388226282, "grad_norm": 11.5, "learning_rate": 2.281266919328641e-05, "loss": 0.1162, "loss_lm": 0.01845593610778451, "loss_seg": 0.0977474506944418, "mean_token_accuracy": 0.9951882213354111, "num_tokens": 1274610639.0, "step": 2999 }, { "entropy": 0.01788858650252223, "epoch": 1.3129445234708392, "grad_norm": 23.875, "learning_rate": 2.2809962100703845e-05, "loss": 0.1484, "loss_lm": 0.016417424893006682, "loss_seg": 0.131945027038455, "mean_token_accuracy": 0.9953522086143494, "num_tokens": 1275035095.0, "step": 3000 }, { "entropy": 0.018691206816583872, "epoch": 1.3133822081190503, "grad_norm": 15.5, "learning_rate": 2.280725500812128e-05, "loss": 0.1241, "loss_lm": 0.016521933022886515, "loss_seg": 0.10758691839873791, "mean_token_accuracy": 0.9952853918075562, "num_tokens": 1275460569.0, "step": 3001 }, { "entropy": 0.018683449365198612, "epoch": 1.313819892767261, "grad_norm": 11.625, "learning_rate": 2.280454791553871e-05, "loss": 0.2189, "loss_lm": 0.01595788635313511, "loss_seg": 0.20293638668954372, "mean_token_accuracy": 0.9952263385057449, "num_tokens": 1275885362.0, "step": 3002 }, { "entropy": 0.018437660299241543, "epoch": 1.3142575774154721, "grad_norm": 12.25, "learning_rate": 2.2801840822956146e-05, "loss": 0.1239, "loss_lm": 0.014765878906473517, "loss_seg": 0.10909445956349373, "mean_token_accuracy": 0.9952591359615326, "num_tokens": 1276310788.0, "step": 3003 }, { "entropy": 0.019308960996568203, "epoch": 1.3146952620636831, "grad_norm": 6.375, "learning_rate": 2.279913373037358e-05, "loss": 0.0936, "loss_lm": 0.015858561033383012, "loss_seg": 0.07777062617242336, "mean_token_accuracy": 0.9951412528753281, "num_tokens": 1276736397.0, "step": 3004 }, { "entropy": 0.018510064110159874, "epoch": 1.315132946711894, "grad_norm": 6.5, "learning_rate": 2.2796426637791014e-05, "loss": 0.1321, "loss_lm": 0.017419121926650405, "loss_seg": 0.11465064063668251, "mean_token_accuracy": 0.9952831864356995, "num_tokens": 1277160986.0, "step": 3005 }, { "entropy": 0.018533762078732252, "epoch": 1.315570631360105, "grad_norm": 9.1875, "learning_rate": 2.2793719545208447e-05, "loss": 0.126, "loss_lm": 0.01752824243158102, "loss_seg": 0.10842607542872429, "mean_token_accuracy": 0.9951630681753159, "num_tokens": 1277585805.0, "step": 3006 }, { "entropy": 0.018707452341914177, "epoch": 1.316008316008316, "grad_norm": 25.875, "learning_rate": 2.2791012452625878e-05, "loss": 0.1078, "loss_lm": 0.018576892791315913, "loss_seg": 0.08924268744885921, "mean_token_accuracy": 0.9951607882976532, "num_tokens": 1278010621.0, "step": 3007 }, { "entropy": 0.0178384305909276, "epoch": 1.3164460006565268, "grad_norm": 9.6875, "learning_rate": 2.2788305360043315e-05, "loss": 0.1488, "loss_lm": 0.017501166556030512, "loss_seg": 0.13132690079510212, "mean_token_accuracy": 0.9953617751598358, "num_tokens": 1278435732.0, "step": 3008 }, { "entropy": 0.018986468203365803, "epoch": 1.3168836853047379, "grad_norm": 11.375, "learning_rate": 2.278559826746075e-05, "loss": 0.1512, "loss_lm": 0.018007609294727445, "loss_seg": 0.1332067046314478, "mean_token_accuracy": 0.995261013507843, "num_tokens": 1278861084.0, "step": 3009 }, { "entropy": 0.01889408938586712, "epoch": 1.317321369952949, "grad_norm": 5.75, "learning_rate": 2.2782891174878182e-05, "loss": 0.1251, "loss_lm": 0.018840213306248188, "loss_seg": 0.10621354170143604, "mean_token_accuracy": 0.9952155500650406, "num_tokens": 1279286405.0, "step": 3010 }, { "entropy": 0.018733464647084475, "epoch": 1.31775905460116, "grad_norm": 9.6875, "learning_rate": 2.2780184082295616e-05, "loss": 0.1251, "loss_lm": 0.015427351230755448, "loss_seg": 0.109626404941082, "mean_token_accuracy": 0.9951739013195038, "num_tokens": 1279710934.0, "step": 3011 }, { "entropy": 0.018205297645181417, "epoch": 1.3181967392493708, "grad_norm": 15.375, "learning_rate": 2.2777476989713046e-05, "loss": 0.1315, "loss_lm": 0.014631615253165364, "loss_seg": 0.11686640419065952, "mean_token_accuracy": 0.995295524597168, "num_tokens": 1280135770.0, "step": 3012 }, { "entropy": 0.019026787485927343, "epoch": 1.3186344238975818, "grad_norm": 5.0625, "learning_rate": 2.2774769897130483e-05, "loss": 0.1306, "loss_lm": 0.016245569568127394, "loss_seg": 0.11434250697493553, "mean_token_accuracy": 0.9951045364141464, "num_tokens": 1280561035.0, "step": 3013 }, { "entropy": 0.018356990534812212, "epoch": 1.3190721085457928, "grad_norm": 9.5, "learning_rate": 2.2772062804547917e-05, "loss": 0.1079, "loss_lm": 0.015182639472186565, "loss_seg": 0.09271643217653036, "mean_token_accuracy": 0.9954963475465775, "num_tokens": 1280985973.0, "step": 3014 }, { "entropy": 0.01859263377264142, "epoch": 1.3195097931940039, "grad_norm": 11.4375, "learning_rate": 2.276935571196535e-05, "loss": 0.1137, "loss_lm": 0.015948825515806675, "loss_seg": 0.09776544198393822, "mean_token_accuracy": 0.9951532930135727, "num_tokens": 1281411799.0, "step": 3015 }, { "entropy": 0.01791683677583933, "epoch": 1.3199474778422147, "grad_norm": 7.875, "learning_rate": 2.2766648619382784e-05, "loss": 0.0799, "loss_lm": 0.013695795089006424, "loss_seg": 0.0662516038864851, "mean_token_accuracy": 0.9954755157232285, "num_tokens": 1281836317.0, "step": 3016 }, { "entropy": 0.018948047887533903, "epoch": 1.3203851624904257, "grad_norm": 8.0625, "learning_rate": 2.2763941526800215e-05, "loss": 0.1189, "loss_lm": 0.01863930420950055, "loss_seg": 0.10025948844850063, "mean_token_accuracy": 0.9951436221599579, "num_tokens": 1282261694.0, "step": 3017 }, { "entropy": 0.018608850426971912, "epoch": 1.3208228471386367, "grad_norm": 7.875, "learning_rate": 2.2761234434217652e-05, "loss": 0.1428, "loss_lm": 0.01608771993778646, "loss_seg": 0.1267054807394743, "mean_token_accuracy": 0.9952280819416046, "num_tokens": 1282687044.0, "step": 3018 }, { "entropy": 0.018364700488746166, "epoch": 1.3212605317868475, "grad_norm": 3.65625, "learning_rate": 2.2758527341635086e-05, "loss": 0.0766, "loss_lm": 0.015202813316136599, "loss_seg": 0.06135615985840559, "mean_token_accuracy": 0.9953565001487732, "num_tokens": 1283112323.0, "step": 3019 }, { "entropy": 0.01852284139022231, "epoch": 1.3216982164350586, "grad_norm": 12.1875, "learning_rate": 2.275582024905252e-05, "loss": 0.1186, "loss_lm": 0.014841028023511171, "loss_seg": 0.10377182252705097, "mean_token_accuracy": 0.995069831609726, "num_tokens": 1283537866.0, "step": 3020 }, { "entropy": 0.01835358189418912, "epoch": 1.3221359010832696, "grad_norm": 9.5625, "learning_rate": 2.2753113156469953e-05, "loss": 0.0932, "loss_lm": 0.01598158129490912, "loss_seg": 0.07717682141810656, "mean_token_accuracy": 0.9952715486288071, "num_tokens": 1283963772.0, "step": 3021 }, { "entropy": 0.018619246780872345, "epoch": 1.3225735857314804, "grad_norm": 15.8125, "learning_rate": 2.2750406063887383e-05, "loss": 0.1198, "loss_lm": 0.015629177214577794, "loss_seg": 0.10416209883987904, "mean_token_accuracy": 0.9951266795396805, "num_tokens": 1284388681.0, "step": 3022 }, { "entropy": 0.018036707770079374, "epoch": 1.3230112703796915, "grad_norm": 10.1875, "learning_rate": 2.274769897130482e-05, "loss": 0.0987, "loss_lm": 0.014876362634822726, "loss_seg": 0.08385336678475142, "mean_token_accuracy": 0.9952934235334396, "num_tokens": 1284813130.0, "step": 3023 }, { "entropy": 0.018320323433727026, "epoch": 1.3234489550279025, "grad_norm": 10.0625, "learning_rate": 2.2744991878722254e-05, "loss": 0.1437, "loss_lm": 0.01634471141733229, "loss_seg": 0.12733537703752518, "mean_token_accuracy": 0.9952629953622818, "num_tokens": 1285238037.0, "step": 3024 }, { "entropy": 0.018114917911589146, "epoch": 1.3238866396761133, "grad_norm": 49.5, "learning_rate": 2.2742284786139688e-05, "loss": 0.108, "loss_lm": 0.014871487393975258, "loss_seg": 0.09313572198152542, "mean_token_accuracy": 0.9953751862049103, "num_tokens": 1285662879.0, "step": 3025 }, { "entropy": 0.017797799315303564, "epoch": 1.3243243243243243, "grad_norm": 6.1875, "learning_rate": 2.273957769355712e-05, "loss": 0.1255, "loss_lm": 0.015141221461817622, "loss_seg": 0.11038448102772236, "mean_token_accuracy": 0.995413139462471, "num_tokens": 1286087326.0, "step": 3026 }, { "entropy": 0.018289971631020308, "epoch": 1.3247620089725354, "grad_norm": 29.125, "learning_rate": 2.2736870600974552e-05, "loss": 0.1204, "loss_lm": 0.016302868025377393, "loss_seg": 0.10406391508877277, "mean_token_accuracy": 0.99532850086689, "num_tokens": 1286511958.0, "step": 3027 }, { "entropy": 0.018692015670239925, "epoch": 1.3251996936207462, "grad_norm": 9.6875, "learning_rate": 2.2734163508391986e-05, "loss": 0.1274, "loss_lm": 0.01776164839975536, "loss_seg": 0.10967088863253593, "mean_token_accuracy": 0.995335653424263, "num_tokens": 1286937919.0, "step": 3028 }, { "entropy": 0.018252751789987087, "epoch": 1.3256373782689572, "grad_norm": 10.625, "learning_rate": 2.2731456415809423e-05, "loss": 0.122, "loss_lm": 0.015668541891500354, "loss_seg": 0.10634956136345863, "mean_token_accuracy": 0.9952633678913116, "num_tokens": 1287362636.0, "step": 3029 }, { "entropy": 0.018956356216222048, "epoch": 1.3260750629171683, "grad_norm": 12.1875, "learning_rate": 2.2728749323226857e-05, "loss": 0.1907, "loss_lm": 0.014639300759881735, "loss_seg": 0.17609447240829468, "mean_token_accuracy": 0.9951308369636536, "num_tokens": 1287787135.0, "step": 3030 }, { "entropy": 0.01871131919324398, "epoch": 1.326512747565379, "grad_norm": 6.59375, "learning_rate": 2.2726042230644287e-05, "loss": 0.1265, "loss_lm": 0.017648724373430014, "loss_seg": 0.10883583314716816, "mean_token_accuracy": 0.9951596558094025, "num_tokens": 1288212489.0, "step": 3031 }, { "entropy": 0.019049743190407753, "epoch": 1.32695043221359, "grad_norm": 7.375, "learning_rate": 2.272333513806172e-05, "loss": 0.1164, "loss_lm": 0.017487841891124845, "loss_seg": 0.098918616771698, "mean_token_accuracy": 0.9951365143060684, "num_tokens": 1288637459.0, "step": 3032 }, { "entropy": 0.01935098133981228, "epoch": 1.3273881168618011, "grad_norm": 12.125, "learning_rate": 2.2720628045479154e-05, "loss": 0.0896, "loss_lm": 0.017074571456760168, "loss_seg": 0.07249176688492298, "mean_token_accuracy": 0.9949840605258942, "num_tokens": 1289062466.0, "step": 3033 }, { "entropy": 0.01884381426498294, "epoch": 1.327825801510012, "grad_norm": 7.59375, "learning_rate": 2.271792095289659e-05, "loss": 0.152, "loss_lm": 0.01641179551370442, "loss_seg": 0.1355985887348652, "mean_token_accuracy": 0.9951268434524536, "num_tokens": 1289487219.0, "step": 3034 }, { "entropy": 0.01836385577917099, "epoch": 1.328263486158223, "grad_norm": 4.71875, "learning_rate": 2.2715213860314025e-05, "loss": 0.1105, "loss_lm": 0.01619334053248167, "loss_seg": 0.09427703637629747, "mean_token_accuracy": 0.9953677505254745, "num_tokens": 1289912225.0, "step": 3035 }, { "entropy": 0.018756951205432415, "epoch": 1.328701170806434, "grad_norm": 7.3125, "learning_rate": 2.2712506767731456e-05, "loss": 0.1801, "loss_lm": 0.01852802187204361, "loss_seg": 0.16158055886626244, "mean_token_accuracy": 0.9949541389942169, "num_tokens": 1290336932.0, "step": 3036 }, { "entropy": 0.019314866978675127, "epoch": 1.3291388554546448, "grad_norm": 10.8125, "learning_rate": 2.270979967514889e-05, "loss": 0.1046, "loss_lm": 0.016754929907619953, "loss_seg": 0.08786746393889189, "mean_token_accuracy": 0.9949135333299637, "num_tokens": 1290761576.0, "step": 3037 }, { "entropy": 0.017934147734194994, "epoch": 1.3295765401028559, "grad_norm": 18.0, "learning_rate": 2.2707092582566323e-05, "loss": 0.1386, "loss_lm": 0.014528261497616768, "loss_seg": 0.12404138222336769, "mean_token_accuracy": 0.9954361468553543, "num_tokens": 1291186612.0, "step": 3038 }, { "entropy": 0.018339579924941063, "epoch": 1.330014224751067, "grad_norm": 39.25, "learning_rate": 2.270438548998376e-05, "loss": 0.0834, "loss_lm": 0.015586549649015069, "loss_seg": 0.06780748628079891, "mean_token_accuracy": 0.9951917976140976, "num_tokens": 1291611264.0, "step": 3039 }, { "entropy": 0.01828525774180889, "epoch": 1.3304519093992777, "grad_norm": 6.9375, "learning_rate": 2.2701678397401194e-05, "loss": 0.1406, "loss_lm": 0.016540517332032323, "loss_seg": 0.12406091764569283, "mean_token_accuracy": 0.9952528178691864, "num_tokens": 1292036953.0, "step": 3040 }, { "entropy": 0.018569112289696932, "epoch": 1.3308895940474887, "grad_norm": 13.6875, "learning_rate": 2.2698971304818624e-05, "loss": 0.1003, "loss_lm": 0.015907389344647527, "loss_seg": 0.0843914058059454, "mean_token_accuracy": 0.9951665103435516, "num_tokens": 1292461669.0, "step": 3041 }, { "entropy": 0.018928042612969875, "epoch": 1.3313272786956998, "grad_norm": 5.4375, "learning_rate": 2.2696264212236058e-05, "loss": 0.1461, "loss_lm": 0.019081227481365204, "loss_seg": 0.126969326287508, "mean_token_accuracy": 0.9951568394899368, "num_tokens": 1292886340.0, "step": 3042 }, { "entropy": 0.01856541307643056, "epoch": 1.3317649633439106, "grad_norm": 5.8125, "learning_rate": 2.2693557119653492e-05, "loss": 0.1205, "loss_lm": 0.016644337913021445, "loss_seg": 0.10388387739658356, "mean_token_accuracy": 0.9950868338346481, "num_tokens": 1293312113.0, "step": 3043 }, { "entropy": 0.018847581930458546, "epoch": 1.3322026479921216, "grad_norm": 19.625, "learning_rate": 2.269085002707093e-05, "loss": 0.1263, "loss_lm": 0.019255811581388116, "loss_seg": 0.1070591639727354, "mean_token_accuracy": 0.9950763583183289, "num_tokens": 1293738399.0, "step": 3044 }, { "entropy": 0.019045510329306126, "epoch": 1.3326403326403327, "grad_norm": 10.75, "learning_rate": 2.268814293448836e-05, "loss": 0.1456, "loss_lm": 0.017892798176035285, "loss_seg": 0.12771770730614662, "mean_token_accuracy": 0.9950195550918579, "num_tokens": 1294163628.0, "step": 3045 }, { "entropy": 0.018487930297851562, "epoch": 1.3330780172885435, "grad_norm": 12.25, "learning_rate": 2.2685435841905793e-05, "loss": 0.1227, "loss_lm": 0.015592762269079685, "loss_seg": 0.10710174962878227, "mean_token_accuracy": 0.9952544122934341, "num_tokens": 1294588613.0, "step": 3046 }, { "entropy": 0.019530069082975388, "epoch": 1.3335157019367545, "grad_norm": 9.5, "learning_rate": 2.2682728749323227e-05, "loss": 0.1144, "loss_lm": 0.017659179400652647, "loss_seg": 0.09674259275197983, "mean_token_accuracy": 0.995017483830452, "num_tokens": 1295014053.0, "step": 3047 }, { "entropy": 0.01847859052941203, "epoch": 1.3339533865849655, "grad_norm": 16.625, "learning_rate": 2.268002165674066e-05, "loss": 0.1618, "loss_lm": 0.018187580863013864, "loss_seg": 0.14364495128393173, "mean_token_accuracy": 0.995188906788826, "num_tokens": 1295439089.0, "step": 3048 }, { "entropy": 0.018841523211449385, "epoch": 1.3343910712331766, "grad_norm": 15.0, "learning_rate": 2.2677314564158097e-05, "loss": 0.1017, "loss_lm": 0.018596900161355734, "loss_seg": 0.08312592096626759, "mean_token_accuracy": 0.9950280338525772, "num_tokens": 1295864449.0, "step": 3049 }, { "entropy": 0.019366853404790163, "epoch": 1.3348287558813874, "grad_norm": 20.0, "learning_rate": 2.2674607471575528e-05, "loss": 0.1259, "loss_lm": 0.018079040572047234, "loss_seg": 0.10782643780112267, "mean_token_accuracy": 0.9949592649936676, "num_tokens": 1296290170.0, "step": 3050 }, { "entropy": 0.018152936827391386, "epoch": 1.3352664405295984, "grad_norm": 3.921875, "learning_rate": 2.267190037899296e-05, "loss": 0.1129, "loss_lm": 0.014989376533776522, "loss_seg": 0.0979502946138382, "mean_token_accuracy": 0.9952453672885895, "num_tokens": 1296715806.0, "step": 3051 }, { "entropy": 0.01867762440815568, "epoch": 1.3357041251778095, "grad_norm": 9.0625, "learning_rate": 2.2669193286410395e-05, "loss": 0.0867, "loss_lm": 0.017362827202305198, "loss_seg": 0.06930041406303644, "mean_token_accuracy": 0.9951118528842926, "num_tokens": 1297140885.0, "step": 3052 }, { "entropy": 0.018323604483157396, "epoch": 1.3361418098260205, "grad_norm": 10.0625, "learning_rate": 2.266648619382783e-05, "loss": 0.1335, "loss_lm": 0.014103367691859603, "loss_seg": 0.11938498169183731, "mean_token_accuracy": 0.9952692985534668, "num_tokens": 1297566796.0, "step": 3053 }, { "entropy": 0.018504492938518524, "epoch": 1.3365794944742313, "grad_norm": 9.625, "learning_rate": 2.2663779101245266e-05, "loss": 0.1176, "loss_lm": 0.015707370592281222, "loss_seg": 0.10193771682679653, "mean_token_accuracy": 0.9952312111854553, "num_tokens": 1297992233.0, "step": 3054 }, { "entropy": 0.018897186499089003, "epoch": 1.3370171791224423, "grad_norm": 3.546875, "learning_rate": 2.2661072008662696e-05, "loss": 0.1164, "loss_lm": 0.01714946050196886, "loss_seg": 0.09928472246974707, "mean_token_accuracy": 0.9951110184192657, "num_tokens": 1298418064.0, "step": 3055 }, { "entropy": 0.018399065360426903, "epoch": 1.3374548637706534, "grad_norm": 3.9375, "learning_rate": 2.265836491608013e-05, "loss": 0.0913, "loss_lm": 0.015131495194509625, "loss_seg": 0.0761977806687355, "mean_token_accuracy": 0.9952878654003143, "num_tokens": 1298843752.0, "step": 3056 }, { "entropy": 0.018215162213891745, "epoch": 1.3378925484188642, "grad_norm": 7.375, "learning_rate": 2.2655657823497564e-05, "loss": 0.1125, "loss_lm": 0.015502624213695526, "loss_seg": 0.0969532709568739, "mean_token_accuracy": 0.9952372014522552, "num_tokens": 1299268252.0, "step": 3057 }, { "entropy": 0.01849539205431938, "epoch": 1.3383302330670752, "grad_norm": 27.5, "learning_rate": 2.2652950730914998e-05, "loss": 0.17, "loss_lm": 0.01587927876971662, "loss_seg": 0.15416484232991934, "mean_token_accuracy": 0.9952525198459625, "num_tokens": 1299693066.0, "step": 3058 }, { "entropy": 0.01920219324529171, "epoch": 1.3387679177152862, "grad_norm": 7.15625, "learning_rate": 2.265024363833243e-05, "loss": 0.131, "loss_lm": 0.01891333842650056, "loss_seg": 0.11213257350027561, "mean_token_accuracy": 0.9950446039438248, "num_tokens": 1300117854.0, "step": 3059 }, { "entropy": 0.01840121950954199, "epoch": 1.339205602363497, "grad_norm": 20.25, "learning_rate": 2.2647536545749865e-05, "loss": 0.098, "loss_lm": 0.016036109998822212, "loss_seg": 0.08194714412093163, "mean_token_accuracy": 0.9951505810022354, "num_tokens": 1300542840.0, "step": 3060 }, { "entropy": 0.018129289615899324, "epoch": 1.339643287011708, "grad_norm": 52.75, "learning_rate": 2.26448294531673e-05, "loss": 0.0897, "loss_lm": 0.01338258571922779, "loss_seg": 0.07634652778506279, "mean_token_accuracy": 0.9952776879072189, "num_tokens": 1300967582.0, "step": 3061 }, { "entropy": 0.018944684881716967, "epoch": 1.3400809716599191, "grad_norm": 11.5, "learning_rate": 2.2642122360584732e-05, "loss": 0.0941, "loss_lm": 0.01730178133584559, "loss_seg": 0.07683482579886913, "mean_token_accuracy": 0.995144709944725, "num_tokens": 1301392429.0, "step": 3062 }, { "entropy": 0.018444309011101723, "epoch": 1.34051865630813, "grad_norm": 7.65625, "learning_rate": 2.2639415268002166e-05, "loss": 0.1214, "loss_lm": 0.015605228953063488, "loss_seg": 0.10577761195600033, "mean_token_accuracy": 0.9952552914619446, "num_tokens": 1301817083.0, "step": 3063 }, { "entropy": 0.01844845712184906, "epoch": 1.340956340956341, "grad_norm": 4.25, "learning_rate": 2.26367081754196e-05, "loss": 0.1105, "loss_lm": 0.01598594104871154, "loss_seg": 0.09454825147986412, "mean_token_accuracy": 0.9952858686447144, "num_tokens": 1302242139.0, "step": 3064 }, { "entropy": 0.018379660788923502, "epoch": 1.341394025604552, "grad_norm": 8.75, "learning_rate": 2.2634001082837034e-05, "loss": 0.1406, "loss_lm": 0.015089729567989707, "loss_seg": 0.12546558771282434, "mean_token_accuracy": 0.9952269941568375, "num_tokens": 1302666998.0, "step": 3065 }, { "entropy": 0.019154338631778955, "epoch": 1.3418317102527628, "grad_norm": 6.21875, "learning_rate": 2.2631293990254467e-05, "loss": 0.1109, "loss_lm": 0.015032215043902397, "loss_seg": 0.09591427631676197, "mean_token_accuracy": 0.9950619339942932, "num_tokens": 1303091840.0, "step": 3066 }, { "entropy": 0.018743171356618404, "epoch": 1.3422693949009739, "grad_norm": 37.5, "learning_rate": 2.26285868976719e-05, "loss": 0.1351, "loss_lm": 0.01673280750401318, "loss_seg": 0.11839157342910767, "mean_token_accuracy": 0.9951333552598953, "num_tokens": 1303516950.0, "step": 3067 }, { "entropy": 0.018717288970947266, "epoch": 1.342707079549185, "grad_norm": 5.21875, "learning_rate": 2.2625879805089335e-05, "loss": 0.1276, "loss_lm": 0.0144962586928159, "loss_seg": 0.11314519122242928, "mean_token_accuracy": 0.995138555765152, "num_tokens": 1303941697.0, "step": 3068 }, { "entropy": 0.018329376354813576, "epoch": 1.3431447641973957, "grad_norm": 5.59375, "learning_rate": 2.2623172712506765e-05, "loss": 0.1182, "loss_lm": 0.017668199725449085, "loss_seg": 0.1005375050008297, "mean_token_accuracy": 0.9953557848930359, "num_tokens": 1304366047.0, "step": 3069 }, { "entropy": 0.01994467433542013, "epoch": 1.3435824488456067, "grad_norm": 5.65625, "learning_rate": 2.2620465619924202e-05, "loss": 0.1823, "loss_lm": 0.018201956758275628, "loss_seg": 0.1641472652554512, "mean_token_accuracy": 0.9948907345533371, "num_tokens": 1304791912.0, "step": 3070 }, { "entropy": 0.019232708495110273, "epoch": 1.3440201334938178, "grad_norm": 5.90625, "learning_rate": 2.2617758527341636e-05, "loss": 0.1712, "loss_lm": 0.018460209481418133, "loss_seg": 0.1526997946202755, "mean_token_accuracy": 0.9950167089700699, "num_tokens": 1305217911.0, "step": 3071 }, { "entropy": 0.018904021941125393, "epoch": 1.3444578181420286, "grad_norm": 16.625, "learning_rate": 2.261505143475907e-05, "loss": 0.1454, "loss_lm": 0.014886913355439901, "loss_seg": 0.1305470522493124, "mean_token_accuracy": 0.9951668828725815, "num_tokens": 1305643035.0, "step": 3072 }, { "entropy": 0.01906955661252141, "epoch": 1.3448955027902396, "grad_norm": 8.0625, "learning_rate": 2.2612344342176503e-05, "loss": 0.1477, "loss_lm": 0.015696656424552202, "loss_seg": 0.13201743736863136, "mean_token_accuracy": 0.9951348304748535, "num_tokens": 1306067620.0, "step": 3073 }, { "entropy": 0.018504283390939236, "epoch": 1.3453331874384507, "grad_norm": 32.5, "learning_rate": 2.2609637249593934e-05, "loss": 0.1162, "loss_lm": 0.016313957050442696, "loss_seg": 0.09993393905460835, "mean_token_accuracy": 0.9952309280633926, "num_tokens": 1306492616.0, "step": 3074 }, { "entropy": 0.01871699606999755, "epoch": 1.3457708720866615, "grad_norm": 5.5625, "learning_rate": 2.260693015701137e-05, "loss": 0.1293, "loss_lm": 0.017053625313565135, "loss_seg": 0.11228493973612785, "mean_token_accuracy": 0.9952571392059326, "num_tokens": 1306918034.0, "step": 3075 }, { "entropy": 0.018614892847836018, "epoch": 1.3462085567348725, "grad_norm": 9.0, "learning_rate": 2.2604223064428805e-05, "loss": 0.1423, "loss_lm": 0.018127850955352187, "loss_seg": 0.12417718209326267, "mean_token_accuracy": 0.9953528791666031, "num_tokens": 1307343613.0, "step": 3076 }, { "entropy": 0.018625029362738132, "epoch": 1.3466462413830835, "grad_norm": 10.875, "learning_rate": 2.260151597184624e-05, "loss": 0.0772, "loss_lm": 0.01799151790328324, "loss_seg": 0.05916139204055071, "mean_token_accuracy": 0.9952028840780258, "num_tokens": 1307768829.0, "step": 3077 }, { "entropy": 0.019602902699261904, "epoch": 1.3470839260312943, "grad_norm": 4.96875, "learning_rate": 2.2598808879263672e-05, "loss": 0.1038, "loss_lm": 0.017906976863741875, "loss_seg": 0.08592879958450794, "mean_token_accuracy": 0.9951388984918594, "num_tokens": 1308194623.0, "step": 3078 }, { "entropy": 0.018884573131799698, "epoch": 1.3475216106795054, "grad_norm": 11.8125, "learning_rate": 2.2596101786681102e-05, "loss": 0.1167, "loss_lm": 0.014937620144337416, "loss_seg": 0.10175985656678677, "mean_token_accuracy": 0.9951703697443008, "num_tokens": 1308620517.0, "step": 3079 }, { "entropy": 0.019050049129873514, "epoch": 1.3479592953277164, "grad_norm": 10.25, "learning_rate": 2.259339469409854e-05, "loss": 0.1401, "loss_lm": 0.015599670819938183, "loss_seg": 0.12452199123799801, "mean_token_accuracy": 0.9951330572366714, "num_tokens": 1309045790.0, "step": 3080 }, { "entropy": 0.01877568941563368, "epoch": 1.3483969799759272, "grad_norm": 16.875, "learning_rate": 2.2590687601515973e-05, "loss": 0.1331, "loss_lm": 0.014067858224734664, "loss_seg": 0.1189878098666668, "mean_token_accuracy": 0.9953281134366989, "num_tokens": 1309471261.0, "step": 3081 }, { "entropy": 0.01864380156621337, "epoch": 1.3488346646241383, "grad_norm": 12.0, "learning_rate": 2.2587980508933407e-05, "loss": 0.1712, "loss_lm": 0.016506386222317815, "loss_seg": 0.15473148599267006, "mean_token_accuracy": 0.995127260684967, "num_tokens": 1309896211.0, "step": 3082 }, { "entropy": 0.018980410415679216, "epoch": 1.3492723492723493, "grad_norm": 10.3125, "learning_rate": 2.258527341635084e-05, "loss": 0.1014, "loss_lm": 0.01621058932505548, "loss_seg": 0.08520864229649305, "mean_token_accuracy": 0.9951553344726562, "num_tokens": 1310322238.0, "step": 3083 }, { "entropy": 0.018845572602003813, "epoch": 1.34971003392056, "grad_norm": 6.0625, "learning_rate": 2.258256632376827e-05, "loss": 0.1443, "loss_lm": 0.020051985513418913, "loss_seg": 0.12420323677361012, "mean_token_accuracy": 0.9952375292778015, "num_tokens": 1310747192.0, "step": 3084 }, { "entropy": 0.018861080054193735, "epoch": 1.3501477185687711, "grad_norm": 5.9375, "learning_rate": 2.2579859231185708e-05, "loss": 0.1345, "loss_lm": 0.015872462885454297, "loss_seg": 0.11857971921563148, "mean_token_accuracy": 0.9951414316892624, "num_tokens": 1311172203.0, "step": 3085 }, { "entropy": 0.01862869458273053, "epoch": 1.3505854032169822, "grad_norm": 18.125, "learning_rate": 2.2577152138603142e-05, "loss": 0.1483, "loss_lm": 0.015561867039650679, "loss_seg": 0.13269546441733837, "mean_token_accuracy": 0.9952548891305923, "num_tokens": 1311597235.0, "step": 3086 }, { "entropy": 0.01878864085301757, "epoch": 1.3510230878651932, "grad_norm": 5.90625, "learning_rate": 2.2574445046020576e-05, "loss": 0.1377, "loss_lm": 0.017806229181587696, "loss_seg": 0.11986306123435497, "mean_token_accuracy": 0.99513079226017, "num_tokens": 1312022546.0, "step": 3087 }, { "entropy": 0.01886009331792593, "epoch": 1.351460772513404, "grad_norm": 7.125, "learning_rate": 2.257173795343801e-05, "loss": 0.1224, "loss_lm": 0.0173242490272969, "loss_seg": 0.10505790449678898, "mean_token_accuracy": 0.9952259808778763, "num_tokens": 1312448142.0, "step": 3088 }, { "entropy": 0.01919434266164899, "epoch": 1.351898457161615, "grad_norm": 23.0, "learning_rate": 2.256903086085544e-05, "loss": 0.1593, "loss_lm": 0.01850672299042344, "loss_seg": 0.140744986012578, "mean_token_accuracy": 0.9951672703027725, "num_tokens": 1312873745.0, "step": 3089 }, { "entropy": 0.018655949737876654, "epoch": 1.352336141809826, "grad_norm": 9.4375, "learning_rate": 2.2566323768272877e-05, "loss": 0.1801, "loss_lm": 0.01866653678007424, "loss_seg": 0.16140665486454964, "mean_token_accuracy": 0.9951965063810349, "num_tokens": 1313298818.0, "step": 3090 }, { "entropy": 0.018915951251983643, "epoch": 1.3527738264580371, "grad_norm": 8.6875, "learning_rate": 2.256361667569031e-05, "loss": 0.1134, "loss_lm": 0.013506143586710095, "loss_seg": 0.09986221790313721, "mean_token_accuracy": 0.9952588975429535, "num_tokens": 1313723738.0, "step": 3091 }, { "entropy": 0.01894399430602789, "epoch": 1.353211511106248, "grad_norm": 11.1875, "learning_rate": 2.2560909583107744e-05, "loss": 0.1435, "loss_lm": 0.01715452247299254, "loss_seg": 0.12632031552493572, "mean_token_accuracy": 0.995198205113411, "num_tokens": 1314148541.0, "step": 3092 }, { "entropy": 0.018641332630068064, "epoch": 1.353649195754459, "grad_norm": 9.25, "learning_rate": 2.2558202490525175e-05, "loss": 0.1325, "loss_lm": 0.015222336864098907, "loss_seg": 0.11724238470196724, "mean_token_accuracy": 0.995245948433876, "num_tokens": 1314573622.0, "step": 3093 }, { "entropy": 0.018323917407542467, "epoch": 1.35408688040267, "grad_norm": 6.625, "learning_rate": 2.255549539794261e-05, "loss": 0.1081, "loss_lm": 0.016688737785443664, "loss_seg": 0.09143458679318428, "mean_token_accuracy": 0.9952873885631561, "num_tokens": 1314998413.0, "step": 3094 }, { "entropy": 0.0187566252425313, "epoch": 1.3545245650508808, "grad_norm": 14.625, "learning_rate": 2.2552788305360042e-05, "loss": 0.1169, "loss_lm": 0.014653816819190979, "loss_seg": 0.10224264860153198, "mean_token_accuracy": 0.9953038096427917, "num_tokens": 1315424039.0, "step": 3095 }, { "entropy": 0.019144668243825436, "epoch": 1.3549622496990918, "grad_norm": 9.25, "learning_rate": 2.255008121277748e-05, "loss": 0.1024, "loss_lm": 0.016949621494859457, "loss_seg": 0.08541863970458508, "mean_token_accuracy": 0.9951258897781372, "num_tokens": 1315849738.0, "step": 3096 }, { "entropy": 0.01792996469885111, "epoch": 1.3553999343473029, "grad_norm": 9.125, "learning_rate": 2.2547374120194913e-05, "loss": 0.1146, "loss_lm": 0.013594108168035746, "loss_seg": 0.10097195394337177, "mean_token_accuracy": 0.9955526441335678, "num_tokens": 1316274659.0, "step": 3097 }, { "entropy": 0.017916460055857897, "epoch": 1.3558376189955137, "grad_norm": 9.6875, "learning_rate": 2.2544667027612343e-05, "loss": 0.1319, "loss_lm": 0.014339750166982412, "loss_seg": 0.1175637561827898, "mean_token_accuracy": 0.995457336306572, "num_tokens": 1316699009.0, "step": 3098 }, { "entropy": 0.019044444430619478, "epoch": 1.3562753036437247, "grad_norm": 14.8125, "learning_rate": 2.2541959935029777e-05, "loss": 0.1502, "loss_lm": 0.018468895694240928, "loss_seg": 0.13174807466566563, "mean_token_accuracy": 0.9952077567577362, "num_tokens": 1317124047.0, "step": 3099 }, { "entropy": 0.018407852854579687, "epoch": 1.3567129882919358, "grad_norm": 4.0625, "learning_rate": 2.253925284244721e-05, "loss": 0.1563, "loss_lm": 0.018205459229648113, "loss_seg": 0.13811629079282284, "mean_token_accuracy": 0.9953030794858932, "num_tokens": 1317548963.0, "step": 3100 }, { "entropy": 0.01894760224968195, "epoch": 1.3571506729401466, "grad_norm": 8.375, "learning_rate": 2.2536545749864648e-05, "loss": 0.1571, "loss_lm": 0.015578518388792872, "loss_seg": 0.14150039479136467, "mean_token_accuracy": 0.9952652901411057, "num_tokens": 1317974534.0, "step": 3101 }, { "entropy": 0.018765281420201063, "epoch": 1.3575883575883576, "grad_norm": 7.90625, "learning_rate": 2.253383865728208e-05, "loss": 0.1005, "loss_lm": 0.018129928968846798, "loss_seg": 0.08238600101321936, "mean_token_accuracy": 0.9952484965324402, "num_tokens": 1318399823.0, "step": 3102 }, { "entropy": 0.01853972440585494, "epoch": 1.3580260422365686, "grad_norm": 6.78125, "learning_rate": 2.2531131564699512e-05, "loss": 0.134, "loss_lm": 0.015186010161414742, "loss_seg": 0.11885834857821465, "mean_token_accuracy": 0.9952472597360611, "num_tokens": 1318824596.0, "step": 3103 }, { "entropy": 0.018303903751075268, "epoch": 1.3584637268847795, "grad_norm": 15.875, "learning_rate": 2.2528424472116946e-05, "loss": 0.1378, "loss_lm": 0.014357374981045723, "loss_seg": 0.1234768982976675, "mean_token_accuracy": 0.9952313154935837, "num_tokens": 1319248985.0, "step": 3104 }, { "entropy": 0.018326614052057266, "epoch": 1.3589014115329905, "grad_norm": 12.0625, "learning_rate": 2.252571737953438e-05, "loss": 0.1681, "loss_lm": 0.014953055884689093, "loss_seg": 0.15313052758574486, "mean_token_accuracy": 0.9953490495681763, "num_tokens": 1319673159.0, "step": 3105 }, { "entropy": 0.018729280680418015, "epoch": 1.3593390961812015, "grad_norm": 4.84375, "learning_rate": 2.2523010286951816e-05, "loss": 0.1054, "loss_lm": 0.016692608362063766, "loss_seg": 0.08871147036552429, "mean_token_accuracy": 0.9951247870922089, "num_tokens": 1320098763.0, "step": 3106 }, { "entropy": 0.01857569580897689, "epoch": 1.3597767808294123, "grad_norm": 17.125, "learning_rate": 2.252030319436925e-05, "loss": 0.1433, "loss_lm": 0.01464667241089046, "loss_seg": 0.12870267778635025, "mean_token_accuracy": 0.9952777177095413, "num_tokens": 1320523936.0, "step": 3107 }, { "entropy": 0.01870847214013338, "epoch": 1.3602144654776234, "grad_norm": 5.28125, "learning_rate": 2.251759610178668e-05, "loss": 0.1438, "loss_lm": 0.01650694152340293, "loss_seg": 0.1272949017584324, "mean_token_accuracy": 0.9951266944408417, "num_tokens": 1320948865.0, "step": 3108 }, { "entropy": 0.018444636836647987, "epoch": 1.3606521501258344, "grad_norm": 19.125, "learning_rate": 2.2514889009204114e-05, "loss": 0.0878, "loss_lm": 0.015732271363958716, "loss_seg": 0.07207041792571545, "mean_token_accuracy": 0.9952620416879654, "num_tokens": 1321374330.0, "step": 3109 }, { "entropy": 0.01861054403707385, "epoch": 1.3610898347740452, "grad_norm": 8.125, "learning_rate": 2.2512181916621548e-05, "loss": 0.1114, "loss_lm": 0.015393939800560474, "loss_seg": 0.09601050987839699, "mean_token_accuracy": 0.9951537549495697, "num_tokens": 1321799781.0, "step": 3110 }, { "entropy": 0.01861736038699746, "epoch": 1.3615275194222562, "grad_norm": 11.25, "learning_rate": 2.2509474824038985e-05, "loss": 0.1037, "loss_lm": 0.013753695879131556, "loss_seg": 0.08997714705765247, "mean_token_accuracy": 0.9952887445688248, "num_tokens": 1322224902.0, "step": 3111 }, { "entropy": 0.018627090845257044, "epoch": 1.3619652040704673, "grad_norm": 7.625, "learning_rate": 2.250676773145642e-05, "loss": 0.1602, "loss_lm": 0.015777790686115623, "loss_seg": 0.1444333866238594, "mean_token_accuracy": 0.9952597767114639, "num_tokens": 1322649943.0, "step": 3112 }, { "entropy": 0.018192223738878965, "epoch": 1.362402888718678, "grad_norm": 4.5625, "learning_rate": 2.250406063887385e-05, "loss": 0.1635, "loss_lm": 0.014706395333632827, "loss_seg": 0.14880744740366936, "mean_token_accuracy": 0.9953396022319794, "num_tokens": 1323074541.0, "step": 3113 }, { "entropy": 0.018376664258539677, "epoch": 1.3628405733668891, "grad_norm": 5.59375, "learning_rate": 2.2501353546291283e-05, "loss": 0.1609, "loss_lm": 0.015702565666288137, "loss_seg": 0.14516750909388065, "mean_token_accuracy": 0.9953905791044235, "num_tokens": 1323499283.0, "step": 3114 }, { "entropy": 0.018525683786720037, "epoch": 1.3632782580151002, "grad_norm": 5.9375, "learning_rate": 2.2498646453708717e-05, "loss": 0.127, "loss_lm": 0.01483092806302011, "loss_seg": 0.1121958140283823, "mean_token_accuracy": 0.9951852709054947, "num_tokens": 1323925010.0, "step": 3115 }, { "entropy": 0.018134313635528088, "epoch": 1.363715942663311, "grad_norm": 8.8125, "learning_rate": 2.2495939361126154e-05, "loss": 0.1142, "loss_lm": 0.014990540686994791, "loss_seg": 0.09921045042574406, "mean_token_accuracy": 0.9953756332397461, "num_tokens": 1324349682.0, "step": 3116 }, { "entropy": 0.01870200363919139, "epoch": 1.364153627311522, "grad_norm": 8.5, "learning_rate": 2.2493232268543584e-05, "loss": 0.1474, "loss_lm": 0.013754400191828609, "loss_seg": 0.13359764963388443, "mean_token_accuracy": 0.995383158326149, "num_tokens": 1324775465.0, "step": 3117 }, { "entropy": 0.018401119858026505, "epoch": 1.364591311959733, "grad_norm": 7.34375, "learning_rate": 2.2490525175961018e-05, "loss": 0.1087, "loss_lm": 0.01620411849580705, "loss_seg": 0.0925261341035366, "mean_token_accuracy": 0.995297446846962, "num_tokens": 1325201149.0, "step": 3118 }, { "entropy": 0.01925593102350831, "epoch": 1.3650289966079439, "grad_norm": 8.0, "learning_rate": 2.248781808337845e-05, "loss": 0.121, "loss_lm": 0.015942436177283525, "loss_seg": 0.10503282397985458, "mean_token_accuracy": 0.9950167685747147, "num_tokens": 1325626351.0, "step": 3119 }, { "entropy": 0.019413691479712725, "epoch": 1.365466681256155, "grad_norm": 7.46875, "learning_rate": 2.2485110990795885e-05, "loss": 0.128, "loss_lm": 0.01734296465292573, "loss_seg": 0.11065260507166386, "mean_token_accuracy": 0.9950131177902222, "num_tokens": 1326051851.0, "step": 3120 }, { "entropy": 0.018594217486679554, "epoch": 1.365904365904366, "grad_norm": 13.875, "learning_rate": 2.2482403898213322e-05, "loss": 0.1297, "loss_lm": 0.015198171604424715, "loss_seg": 0.11451288312673569, "mean_token_accuracy": 0.9952788949012756, "num_tokens": 1326476504.0, "step": 3121 }, { "entropy": 0.01896284567192197, "epoch": 1.3663420505525767, "grad_norm": 5.3125, "learning_rate": 2.2479696805630753e-05, "loss": 0.1899, "loss_lm": 0.016258818563073874, "loss_seg": 0.17361177504062653, "mean_token_accuracy": 0.9951416403055191, "num_tokens": 1326901723.0, "step": 3122 }, { "entropy": 0.017816497944295406, "epoch": 1.3667797352007878, "grad_norm": 4.3125, "learning_rate": 2.2476989713048186e-05, "loss": 0.137, "loss_lm": 0.015558640006929636, "loss_seg": 0.1214290875941515, "mean_token_accuracy": 0.9955385327339172, "num_tokens": 1327326850.0, "step": 3123 }, { "entropy": 0.018515597563236952, "epoch": 1.3672174198489988, "grad_norm": 81.0, "learning_rate": 2.247428262046562e-05, "loss": 0.1018, "loss_lm": 0.0155503717251122, "loss_seg": 0.08623676281422377, "mean_token_accuracy": 0.9952770620584488, "num_tokens": 1327751982.0, "step": 3124 }, { "entropy": 0.01887683290988207, "epoch": 1.3676551044972098, "grad_norm": 19.875, "learning_rate": 2.2471575527883054e-05, "loss": 0.1087, "loss_lm": 0.018239145167171955, "loss_seg": 0.09044438228011131, "mean_token_accuracy": 0.995159462094307, "num_tokens": 1328176881.0, "step": 3125 }, { "entropy": 0.018428259529173374, "epoch": 1.3680927891454207, "grad_norm": 14.5, "learning_rate": 2.2468868435300488e-05, "loss": 0.1062, "loss_lm": 0.017446061596274376, "loss_seg": 0.08876304142177105, "mean_token_accuracy": 0.9951782077550888, "num_tokens": 1328601531.0, "step": 3126 }, { "entropy": 0.018557384610176086, "epoch": 1.3685304737936317, "grad_norm": 8.0625, "learning_rate": 2.246616134271792e-05, "loss": 0.1664, "loss_lm": 0.018827215302735567, "loss_seg": 0.14755450002849102, "mean_token_accuracy": 0.9951677769422531, "num_tokens": 1329026734.0, "step": 3127 }, { "entropy": 0.01910196663811803, "epoch": 1.3689681584418427, "grad_norm": 10.875, "learning_rate": 2.2463454250135355e-05, "loss": 0.1171, "loss_lm": 0.016960857436060905, "loss_seg": 0.10013921745121479, "mean_token_accuracy": 0.9951353520154953, "num_tokens": 1329451813.0, "step": 3128 }, { "entropy": 0.01825902611017227, "epoch": 1.3694058430900538, "grad_norm": 8.6875, "learning_rate": 2.246074715755279e-05, "loss": 0.1359, "loss_lm": 0.015431889332830906, "loss_seg": 0.12049323320388794, "mean_token_accuracy": 0.9953513592481613, "num_tokens": 1329876183.0, "step": 3129 }, { "entropy": 0.019151288084685802, "epoch": 1.3698435277382646, "grad_norm": 5.71875, "learning_rate": 2.2458040064970222e-05, "loss": 0.1161, "loss_lm": 0.014702056301757693, "loss_seg": 0.10136069171130657, "mean_token_accuracy": 0.995027557015419, "num_tokens": 1330300905.0, "step": 3130 }, { "entropy": 0.018535365350544453, "epoch": 1.3702812123864756, "grad_norm": 8.3125, "learning_rate": 2.2455332972387656e-05, "loss": 0.1101, "loss_lm": 0.015334644122049212, "loss_seg": 0.09480150137096643, "mean_token_accuracy": 0.9951818138360977, "num_tokens": 1330725943.0, "step": 3131 }, { "entropy": 0.019305634777992964, "epoch": 1.3707188970346866, "grad_norm": 6.875, "learning_rate": 2.245262587980509e-05, "loss": 0.1192, "loss_lm": 0.016623994102701545, "loss_seg": 0.10258045233786106, "mean_token_accuracy": 0.9949375241994858, "num_tokens": 1331151788.0, "step": 3132 }, { "entropy": 0.018870738800615072, "epoch": 1.3711565816828974, "grad_norm": 13.9375, "learning_rate": 2.2449918787222524e-05, "loss": 0.1394, "loss_lm": 0.01551432185806334, "loss_seg": 0.12388113513588905, "mean_token_accuracy": 0.9951294213533401, "num_tokens": 1331576760.0, "step": 3133 }, { "entropy": 0.019240160007029772, "epoch": 1.3715942663311085, "grad_norm": 16.75, "learning_rate": 2.2447211694639957e-05, "loss": 0.1621, "loss_lm": 0.01894451375119388, "loss_seg": 0.14316023141145706, "mean_token_accuracy": 0.994911715388298, "num_tokens": 1332001845.0, "step": 3134 }, { "entropy": 0.018031461630016565, "epoch": 1.3720319509793195, "grad_norm": 11.8125, "learning_rate": 2.244450460205739e-05, "loss": 0.0846, "loss_lm": 0.01608716743066907, "loss_seg": 0.06846556253731251, "mean_token_accuracy": 0.9955292046070099, "num_tokens": 1332427403.0, "step": 3135 }, { "entropy": 0.018631794024258852, "epoch": 1.3724696356275303, "grad_norm": 4.53125, "learning_rate": 2.2441797509474825e-05, "loss": 0.1218, "loss_lm": 0.014639189932495356, "loss_seg": 0.10719820111989975, "mean_token_accuracy": 0.9952617585659027, "num_tokens": 1332852616.0, "step": 3136 }, { "entropy": 0.018031437881290913, "epoch": 1.3729073202757414, "grad_norm": 17.75, "learning_rate": 2.243909041689226e-05, "loss": 0.097, "loss_lm": 0.01598232821561396, "loss_seg": 0.08098274748772383, "mean_token_accuracy": 0.9952343553304672, "num_tokens": 1333277280.0, "step": 3137 }, { "entropy": 0.018732101190835238, "epoch": 1.3733450049239524, "grad_norm": 11.4375, "learning_rate": 2.2436383324309692e-05, "loss": 0.1296, "loss_lm": 0.014961817534640431, "loss_seg": 0.11463246122002602, "mean_token_accuracy": 0.9952260851860046, "num_tokens": 1333702226.0, "step": 3138 }, { "entropy": 0.018458392936736345, "epoch": 1.3737826895721632, "grad_norm": 4.25, "learning_rate": 2.2433676231727126e-05, "loss": 0.0972, "loss_lm": 0.018562859389930964, "loss_seg": 0.07867486029863358, "mean_token_accuracy": 0.9953118711709976, "num_tokens": 1334126882.0, "step": 3139 }, { "entropy": 0.0194347919896245, "epoch": 1.3742203742203742, "grad_norm": 31.5, "learning_rate": 2.243096913914456e-05, "loss": 0.1224, "loss_lm": 0.01598880789242685, "loss_seg": 0.1063818670809269, "mean_token_accuracy": 0.9950496852397919, "num_tokens": 1334551459.0, "step": 3140 }, { "entropy": 0.019503615330904722, "epoch": 1.3746580588685853, "grad_norm": 9.5625, "learning_rate": 2.242826204656199e-05, "loss": 0.161, "loss_lm": 0.015516716986894608, "loss_seg": 0.14545209519565105, "mean_token_accuracy": 0.9950758069753647, "num_tokens": 1334976273.0, "step": 3141 }, { "entropy": 0.018391067162156105, "epoch": 1.375095743516796, "grad_norm": 17.0, "learning_rate": 2.2425554953979427e-05, "loss": 0.1507, "loss_lm": 0.016353981103748083, "loss_seg": 0.13431191630661488, "mean_token_accuracy": 0.995314210653305, "num_tokens": 1335400702.0, "step": 3142 }, { "entropy": 0.019443966913968325, "epoch": 1.3755334281650071, "grad_norm": 7.90625, "learning_rate": 2.242284786139686e-05, "loss": 0.1243, "loss_lm": 0.01721318159252405, "loss_seg": 0.1071123881265521, "mean_token_accuracy": 0.994923934340477, "num_tokens": 1335826200.0, "step": 3143 }, { "entropy": 0.018052946776151657, "epoch": 1.3759711128132182, "grad_norm": 14.6875, "learning_rate": 2.2420140768814295e-05, "loss": 0.1505, "loss_lm": 0.01505285338498652, "loss_seg": 0.13544106110930443, "mean_token_accuracy": 0.995388925075531, "num_tokens": 1336251416.0, "step": 3144 }, { "entropy": 0.01897561689838767, "epoch": 1.376408797461429, "grad_norm": 7.71875, "learning_rate": 2.241743367623173e-05, "loss": 0.1424, "loss_lm": 0.01655601733364165, "loss_seg": 0.12581121176481247, "mean_token_accuracy": 0.995034247636795, "num_tokens": 1336676486.0, "step": 3145 }, { "entropy": 0.018276780378073454, "epoch": 1.37684648210964, "grad_norm": 4.9375, "learning_rate": 2.241472658364916e-05, "loss": 0.1471, "loss_lm": 0.016342354007065296, "loss_seg": 0.130748450756073, "mean_token_accuracy": 0.995375782251358, "num_tokens": 1337101578.0, "step": 3146 }, { "entropy": 0.01924346061423421, "epoch": 1.377284166757851, "grad_norm": 12.9375, "learning_rate": 2.2412019491066596e-05, "loss": 0.1607, "loss_lm": 0.0168913712259382, "loss_seg": 0.1438111625611782, "mean_token_accuracy": 0.9950109422206879, "num_tokens": 1337526699.0, "step": 3147 }, { "entropy": 0.018599608913064003, "epoch": 1.3777218514060618, "grad_norm": 11.0625, "learning_rate": 2.240931239848403e-05, "loss": 0.152, "loss_lm": 0.017128185834735632, "loss_seg": 0.13486093655228615, "mean_token_accuracy": 0.9953255653381348, "num_tokens": 1337951712.0, "step": 3148 }, { "entropy": 0.01936903828755021, "epoch": 1.3781595360542729, "grad_norm": 4.09375, "learning_rate": 2.2406605305901463e-05, "loss": 0.1194, "loss_lm": 0.017229429678991437, "loss_seg": 0.10214079357683659, "mean_token_accuracy": 0.9949512928724289, "num_tokens": 1338376710.0, "step": 3149 }, { "entropy": 0.018582383636385202, "epoch": 1.378597220702484, "grad_norm": 11.125, "learning_rate": 2.2403898213318897e-05, "loss": 0.1703, "loss_lm": 0.016940028872340918, "loss_seg": 0.15336036123335361, "mean_token_accuracy": 0.995232343673706, "num_tokens": 1338801822.0, "step": 3150 }, { "entropy": 0.018401296343654394, "epoch": 1.3790349053506947, "grad_norm": 10.1875, "learning_rate": 2.2401191120736327e-05, "loss": 0.1151, "loss_lm": 0.015008488902822137, "loss_seg": 0.10008323937654495, "mean_token_accuracy": 0.9953611195087433, "num_tokens": 1339226211.0, "step": 3151 }, { "entropy": 0.018732066731899977, "epoch": 1.3794725899989058, "grad_norm": 12.9375, "learning_rate": 2.2398484028153764e-05, "loss": 0.1479, "loss_lm": 0.019878331338986754, "loss_seg": 0.12799805495887995, "mean_token_accuracy": 0.9950826317071915, "num_tokens": 1339651403.0, "step": 3152 }, { "entropy": 0.018810314126312733, "epoch": 1.3799102746471168, "grad_norm": 4.5625, "learning_rate": 2.2395776935571198e-05, "loss": 0.0842, "loss_lm": 0.014530907850712538, "loss_seg": 0.06968116946518421, "mean_token_accuracy": 0.9950675666332245, "num_tokens": 1340076313.0, "step": 3153 }, { "entropy": 0.01814224850386381, "epoch": 1.3803479592953276, "grad_norm": 12.625, "learning_rate": 2.2393069842988632e-05, "loss": 0.1197, "loss_lm": 0.015237838728353381, "loss_seg": 0.10443392023444176, "mean_token_accuracy": 0.9952358901500702, "num_tokens": 1340500913.0, "step": 3154 }, { "entropy": 0.01814321707934141, "epoch": 1.3807856439435386, "grad_norm": 24.625, "learning_rate": 2.2390362750406066e-05, "loss": 0.1401, "loss_lm": 0.016416116151958704, "loss_seg": 0.12369803618639708, "mean_token_accuracy": 0.995286300778389, "num_tokens": 1340925807.0, "step": 3155 }, { "entropy": 0.01767310779541731, "epoch": 1.3812233285917497, "grad_norm": 3.984375, "learning_rate": 2.2387655657823496e-05, "loss": 0.0894, "loss_lm": 0.016011580592021346, "loss_seg": 0.07341511361300945, "mean_token_accuracy": 0.9954337626695633, "num_tokens": 1341350614.0, "step": 3156 }, { "entropy": 0.018832051660865545, "epoch": 1.3816610132399605, "grad_norm": 4.75, "learning_rate": 2.238494856524093e-05, "loss": 0.1051, "loss_lm": 0.01822836440987885, "loss_seg": 0.08690716419368982, "mean_token_accuracy": 0.9951311647891998, "num_tokens": 1341775706.0, "step": 3157 }, { "entropy": 0.01858330611139536, "epoch": 1.3820986978881715, "grad_norm": 13.25, "learning_rate": 2.2382241472658367e-05, "loss": 0.118, "loss_lm": 0.014313679886981845, "loss_seg": 0.10367902368307114, "mean_token_accuracy": 0.9952285438776016, "num_tokens": 1342200896.0, "step": 3158 }, { "entropy": 0.01852255268022418, "epoch": 1.3825363825363826, "grad_norm": 5.90625, "learning_rate": 2.23795343800758e-05, "loss": 0.1311, "loss_lm": 0.0161184910684824, "loss_seg": 0.11496888101100922, "mean_token_accuracy": 0.995210736989975, "num_tokens": 1342624723.0, "step": 3159 }, { "entropy": 0.0180829088203609, "epoch": 1.3829740671845934, "grad_norm": 5.34375, "learning_rate": 2.2376827287493234e-05, "loss": 0.0847, "loss_lm": 0.014331787591800094, "loss_seg": 0.07031984534114599, "mean_token_accuracy": 0.9953746348619461, "num_tokens": 1343049946.0, "step": 3160 }, { "entropy": 0.018626255448907614, "epoch": 1.3834117518328044, "grad_norm": 11.9375, "learning_rate": 2.2374120194910665e-05, "loss": 0.1541, "loss_lm": 0.017503102775663137, "loss_seg": 0.13664234429597855, "mean_token_accuracy": 0.9951289743185043, "num_tokens": 1343475905.0, "step": 3161 }, { "entropy": 0.018244005739688873, "epoch": 1.3838494364810154, "grad_norm": 5.96875, "learning_rate": 2.23714131023281e-05, "loss": 0.0932, "loss_lm": 0.013716503977775574, "loss_seg": 0.07952309772372246, "mean_token_accuracy": 0.995302364230156, "num_tokens": 1343901412.0, "step": 3162 }, { "entropy": 0.018422730267047882, "epoch": 1.3842871211292265, "grad_norm": 6.15625, "learning_rate": 2.2368706009745535e-05, "loss": 0.1639, "loss_lm": 0.015912927919998765, "loss_seg": 0.14800681173801422, "mean_token_accuracy": 0.9951359629631042, "num_tokens": 1344326755.0, "step": 3163 }, { "entropy": 0.01855590147897601, "epoch": 1.3847248057774373, "grad_norm": 8.0625, "learning_rate": 2.236599891716297e-05, "loss": 0.092, "loss_lm": 0.015445308992639184, "loss_seg": 0.07656048703938723, "mean_token_accuracy": 0.9951657503843307, "num_tokens": 1344752007.0, "step": 3164 }, { "entropy": 0.018990926444530487, "epoch": 1.3851624904256483, "grad_norm": 16.625, "learning_rate": 2.23632918245804e-05, "loss": 0.1467, "loss_lm": 0.01709437556564808, "loss_seg": 0.1295951660722494, "mean_token_accuracy": 0.9951643198728561, "num_tokens": 1345177532.0, "step": 3165 }, { "entropy": 0.018330775666981936, "epoch": 1.3856001750738594, "grad_norm": 7.03125, "learning_rate": 2.2360584731997833e-05, "loss": 0.1146, "loss_lm": 0.017926607746630907, "loss_seg": 0.09662980400025845, "mean_token_accuracy": 0.9952005594968796, "num_tokens": 1345602550.0, "step": 3166 }, { "entropy": 0.018663641531020403, "epoch": 1.3860378597220704, "grad_norm": 21.5, "learning_rate": 2.2357877639415267e-05, "loss": 0.1362, "loss_lm": 0.01679621427319944, "loss_seg": 0.11935565434396267, "mean_token_accuracy": 0.9951432198286057, "num_tokens": 1346027541.0, "step": 3167 }, { "entropy": 0.018641275819391012, "epoch": 1.3864755443702812, "grad_norm": 21.0, "learning_rate": 2.2355170546832704e-05, "loss": 0.0859, "loss_lm": 0.015464551048353314, "loss_seg": 0.0704132728278637, "mean_token_accuracy": 0.9951359629631042, "num_tokens": 1346453703.0, "step": 3168 }, { "entropy": 0.018987051211297512, "epoch": 1.3869132290184922, "grad_norm": 5.03125, "learning_rate": 2.2352463454250138e-05, "loss": 0.1424, "loss_lm": 0.016054781386628747, "loss_seg": 0.12632913514971733, "mean_token_accuracy": 0.9951116740703583, "num_tokens": 1346879393.0, "step": 3169 }, { "entropy": 0.018609619699418545, "epoch": 1.3873509136667033, "grad_norm": 9.3125, "learning_rate": 2.2349756361667568e-05, "loss": 0.1744, "loss_lm": 0.016324823489412665, "loss_seg": 0.1580883078277111, "mean_token_accuracy": 0.9951739013195038, "num_tokens": 1347304333.0, "step": 3170 }, { "entropy": 0.018362967297434807, "epoch": 1.387788598314914, "grad_norm": 8.4375, "learning_rate": 2.2347049269085002e-05, "loss": 0.1146, "loss_lm": 0.016318489564582705, "loss_seg": 0.09823337942361832, "mean_token_accuracy": 0.995322659611702, "num_tokens": 1347729549.0, "step": 3171 }, { "entropy": 0.01804541889578104, "epoch": 1.3882262829631251, "grad_norm": 14.375, "learning_rate": 2.2344342176502436e-05, "loss": 0.1203, "loss_lm": 0.017467850353568792, "loss_seg": 0.10285289399325848, "mean_token_accuracy": 0.9954690188169479, "num_tokens": 1348154528.0, "step": 3172 }, { "entropy": 0.017921060789376497, "epoch": 1.3886639676113361, "grad_norm": 16.625, "learning_rate": 2.2341635083919873e-05, "loss": 0.1226, "loss_lm": 0.016955559607595205, "loss_seg": 0.10568931512534618, "mean_token_accuracy": 0.9954775124788284, "num_tokens": 1348580086.0, "step": 3173 }, { "entropy": 0.018274400383234024, "epoch": 1.389101652259547, "grad_norm": 8.8125, "learning_rate": 2.2338927991337306e-05, "loss": 0.1737, "loss_lm": 0.017553269863128662, "loss_seg": 0.1561712995171547, "mean_token_accuracy": 0.9953558295965195, "num_tokens": 1349005061.0, "step": 3174 }, { "entropy": 0.01879359083250165, "epoch": 1.389539336907758, "grad_norm": 6.4375, "learning_rate": 2.2336220898754737e-05, "loss": 0.1182, "loss_lm": 0.014944028807803988, "loss_seg": 0.10325174778699875, "mean_token_accuracy": 0.9952211529016495, "num_tokens": 1349430032.0, "step": 3175 }, { "entropy": 0.018252689391374588, "epoch": 1.389977021555969, "grad_norm": 5.40625, "learning_rate": 2.233351380617217e-05, "loss": 0.1232, "loss_lm": 0.01514726784080267, "loss_seg": 0.10804378427565098, "mean_token_accuracy": 0.9951813817024231, "num_tokens": 1349854390.0, "step": 3176 }, { "entropy": 0.018759335856884718, "epoch": 1.3904147062041798, "grad_norm": 9.375, "learning_rate": 2.2330806713589604e-05, "loss": 0.1089, "loss_lm": 0.0179062788374722, "loss_seg": 0.09095481224358082, "mean_token_accuracy": 0.9950933009386063, "num_tokens": 1350279903.0, "step": 3177 }, { "entropy": 0.018048526719212532, "epoch": 1.3908523908523909, "grad_norm": 9.8125, "learning_rate": 2.232809962100704e-05, "loss": 0.1192, "loss_lm": 0.01488768425770104, "loss_seg": 0.10428647510707378, "mean_token_accuracy": 0.9953306615352631, "num_tokens": 1350703878.0, "step": 3178 }, { "entropy": 0.018843223340809345, "epoch": 1.391290075500602, "grad_norm": 6.71875, "learning_rate": 2.2325392528424475e-05, "loss": 0.1279, "loss_lm": 0.018230042420327663, "loss_seg": 0.10965788550674915, "mean_token_accuracy": 0.9952172935009003, "num_tokens": 1351129330.0, "step": 3179 }, { "entropy": 0.018569198437035084, "epoch": 1.3917277601488127, "grad_norm": 4.21875, "learning_rate": 2.2322685435841905e-05, "loss": 0.164, "loss_lm": 0.016441584564745426, "loss_seg": 0.14752528071403503, "mean_token_accuracy": 0.9952113926410675, "num_tokens": 1351554066.0, "step": 3180 }, { "entropy": 0.01862212922424078, "epoch": 1.3921654447970238, "grad_norm": 22.0, "learning_rate": 2.231997834325934e-05, "loss": 0.1299, "loss_lm": 0.01535046030767262, "loss_seg": 0.11451463960111141, "mean_token_accuracy": 0.9952603876590729, "num_tokens": 1351979589.0, "step": 3181 }, { "entropy": 0.018710382282733917, "epoch": 1.3926031294452348, "grad_norm": 16.75, "learning_rate": 2.2317271250676773e-05, "loss": 0.1082, "loss_lm": 0.015635255724191666, "loss_seg": 0.09259086102247238, "mean_token_accuracy": 0.9951951801776886, "num_tokens": 1352404322.0, "step": 3182 }, { "entropy": 0.019060823135077953, "epoch": 1.3930408140934456, "grad_norm": 10.1875, "learning_rate": 2.231456415809421e-05, "loss": 0.1427, "loss_lm": 0.01598716713488102, "loss_seg": 0.12667194567620754, "mean_token_accuracy": 0.9950460195541382, "num_tokens": 1352829622.0, "step": 3183 }, { "entropy": 0.01831109868362546, "epoch": 1.3934784987416566, "grad_norm": 11.8125, "learning_rate": 2.2311857065511644e-05, "loss": 0.1547, "loss_lm": 0.016877345042303205, "loss_seg": 0.1377987377345562, "mean_token_accuracy": 0.9951913505792618, "num_tokens": 1353254848.0, "step": 3184 }, { "entropy": 0.01842379430308938, "epoch": 1.3939161833898677, "grad_norm": 9.5625, "learning_rate": 2.2309149972929074e-05, "loss": 0.1232, "loss_lm": 0.015670485561713576, "loss_seg": 0.10752022825181484, "mean_token_accuracy": 0.9952401667833328, "num_tokens": 1353679838.0, "step": 3185 }, { "entropy": 0.018052478786557913, "epoch": 1.3943538680380785, "grad_norm": 9.875, "learning_rate": 2.2306442880346508e-05, "loss": 0.1119, "loss_lm": 0.013898950768634677, "loss_seg": 0.09796785935759544, "mean_token_accuracy": 0.9954055696725845, "num_tokens": 1354105097.0, "step": 3186 }, { "entropy": 0.018800996709614992, "epoch": 1.3947915526862895, "grad_norm": 7.46875, "learning_rate": 2.230373578776394e-05, "loss": 0.0923, "loss_lm": 0.017344366991892457, "loss_seg": 0.07497598137706518, "mean_token_accuracy": 0.9951599091291428, "num_tokens": 1354530086.0, "step": 3187 }, { "entropy": 0.018131486140191555, "epoch": 1.3952292373345005, "grad_norm": 7.03125, "learning_rate": 2.2301028695181375e-05, "loss": 0.1301, "loss_lm": 0.015276376390829682, "loss_seg": 0.11482299491763115, "mean_token_accuracy": 0.9952666163444519, "num_tokens": 1354955520.0, "step": 3188 }, { "entropy": 0.01877349242568016, "epoch": 1.3956669219827114, "grad_norm": 40.5, "learning_rate": 2.229832160259881e-05, "loss": 0.1039, "loss_lm": 0.01652359589934349, "loss_seg": 0.08736584149301052, "mean_token_accuracy": 0.9952159225940704, "num_tokens": 1355380428.0, "step": 3189 }, { "entropy": 0.018413668498396873, "epoch": 1.3961046066309224, "grad_norm": 6.4375, "learning_rate": 2.2295614510016243e-05, "loss": 0.1393, "loss_lm": 0.01548522268421948, "loss_seg": 0.12376562133431435, "mean_token_accuracy": 0.9951438754796982, "num_tokens": 1355805833.0, "step": 3190 }, { "entropy": 0.018465024419128895, "epoch": 1.3965422912791334, "grad_norm": 25.125, "learning_rate": 2.2292907417433676e-05, "loss": 0.1275, "loss_lm": 0.015452963067218661, "loss_seg": 0.11203296482563019, "mean_token_accuracy": 0.9952137768268585, "num_tokens": 1356230990.0, "step": 3191 }, { "entropy": 0.01858193101361394, "epoch": 1.3969799759273442, "grad_norm": 9.8125, "learning_rate": 2.229020032485111e-05, "loss": 0.155, "loss_lm": 0.01749497279524803, "loss_seg": 0.13753281719982624, "mean_token_accuracy": 0.9952725619077682, "num_tokens": 1356656349.0, "step": 3192 }, { "entropy": 0.01859180349856615, "epoch": 1.3974176605755553, "grad_norm": 5.96875, "learning_rate": 2.2287493232268544e-05, "loss": 0.115, "loss_lm": 0.015207191463559866, "loss_seg": 0.09975155629217625, "mean_token_accuracy": 0.9953149110078812, "num_tokens": 1357081659.0, "step": 3193 }, { "entropy": 0.018488826230168343, "epoch": 1.3978553452237663, "grad_norm": 14.6875, "learning_rate": 2.2284786139685978e-05, "loss": 0.1047, "loss_lm": 0.01729480060748756, "loss_seg": 0.0874237772077322, "mean_token_accuracy": 0.9951082468032837, "num_tokens": 1357506228.0, "step": 3194 }, { "entropy": 0.017870476935058832, "epoch": 1.3982930298719771, "grad_norm": 6.65625, "learning_rate": 2.228207904710341e-05, "loss": 0.1427, "loss_lm": 0.015707701444625854, "loss_seg": 0.12699705362319946, "mean_token_accuracy": 0.9953757524490356, "num_tokens": 1357931018.0, "step": 3195 }, { "entropy": 0.018790619913488626, "epoch": 1.3987307145201882, "grad_norm": 49.25, "learning_rate": 2.2279371954520845e-05, "loss": 0.1253, "loss_lm": 0.019212424755096436, "loss_seg": 0.10610223934054375, "mean_token_accuracy": 0.995031550526619, "num_tokens": 1358355878.0, "step": 3196 }, { "entropy": 0.01843269821256399, "epoch": 1.3991683991683992, "grad_norm": 13.5, "learning_rate": 2.227666486193828e-05, "loss": 0.1292, "loss_lm": 0.014413008699193597, "loss_seg": 0.11480691656470299, "mean_token_accuracy": 0.9952026456594467, "num_tokens": 1358780877.0, "step": 3197 }, { "entropy": 0.018833518959581852, "epoch": 1.39960608381661, "grad_norm": 12.0, "learning_rate": 2.2273957769355712e-05, "loss": 0.094, "loss_lm": 0.017511201789602637, "loss_seg": 0.07653633505105972, "mean_token_accuracy": 0.9952858686447144, "num_tokens": 1359206998.0, "step": 3198 }, { "entropy": 0.01873666374012828, "epoch": 1.400043768464821, "grad_norm": 8.4375, "learning_rate": 2.2271250676773146e-05, "loss": 0.0853, "loss_lm": 0.014629061566665769, "loss_seg": 0.07069120183587074, "mean_token_accuracy": 0.9952050894498825, "num_tokens": 1359632194.0, "step": 3199 }, { "entropy": 0.01867822278290987, "epoch": 1.400481453113032, "grad_norm": 14.5, "learning_rate": 2.226854358419058e-05, "loss": 0.1794, "loss_lm": 0.017806817078962922, "loss_seg": 0.16160177066922188, "mean_token_accuracy": 0.9951731562614441, "num_tokens": 1360056667.0, "step": 3200 }, { "entropy": 0.01834753481671214, "epoch": 1.400919137761243, "grad_norm": 9.375, "learning_rate": 2.2265836491608014e-05, "loss": 0.1027, "loss_lm": 0.014970183139666915, "loss_seg": 0.08775820396840572, "mean_token_accuracy": 0.99530328810215, "num_tokens": 1360481376.0, "step": 3201 }, { "entropy": 0.01814044965431094, "epoch": 1.401356822409454, "grad_norm": 7.34375, "learning_rate": 2.2263129399025447e-05, "loss": 0.1181, "loss_lm": 0.01557410997338593, "loss_seg": 0.10247932188212872, "mean_token_accuracy": 0.9953068345785141, "num_tokens": 1360906219.0, "step": 3202 }, { "entropy": 0.01897834660485387, "epoch": 1.401794507057665, "grad_norm": 5.46875, "learning_rate": 2.226042230644288e-05, "loss": 0.144, "loss_lm": 0.01834432710893452, "loss_seg": 0.12561028823256493, "mean_token_accuracy": 0.9950619786977768, "num_tokens": 1361331261.0, "step": 3203 }, { "entropy": 0.018271672073751688, "epoch": 1.402232191705876, "grad_norm": 8.75, "learning_rate": 2.2257715213860315e-05, "loss": 0.133, "loss_lm": 0.01632504351437092, "loss_seg": 0.11662975139915943, "mean_token_accuracy": 0.9954148679971695, "num_tokens": 1361755821.0, "step": 3204 }, { "entropy": 0.018838339019566774, "epoch": 1.4026698763540868, "grad_norm": 5.75, "learning_rate": 2.225500812127775e-05, "loss": 0.1186, "loss_lm": 0.01565542258322239, "loss_seg": 0.10296338051557541, "mean_token_accuracy": 0.9951501935720444, "num_tokens": 1362180336.0, "step": 3205 }, { "entropy": 0.018810973968356848, "epoch": 1.4031075610022978, "grad_norm": 13.0, "learning_rate": 2.2252301028695182e-05, "loss": 0.1156, "loss_lm": 0.0151275007519871, "loss_seg": 0.10042607598006725, "mean_token_accuracy": 0.9951484352350235, "num_tokens": 1362604917.0, "step": 3206 }, { "entropy": 0.01868565659970045, "epoch": 1.4035452456505089, "grad_norm": 12.875, "learning_rate": 2.2249593936112616e-05, "loss": 0.1129, "loss_lm": 0.01459188642911613, "loss_seg": 0.09835368301719427, "mean_token_accuracy": 0.9953438639640808, "num_tokens": 1363029469.0, "step": 3207 }, { "entropy": 0.019099624827504158, "epoch": 1.40398293029872, "grad_norm": 10.125, "learning_rate": 2.2246886843530046e-05, "loss": 0.1084, "loss_lm": 0.01698084408417344, "loss_seg": 0.09142518974840641, "mean_token_accuracy": 0.9951581954956055, "num_tokens": 1363455301.0, "step": 3208 }, { "entropy": 0.01847967691719532, "epoch": 1.4044206149469307, "grad_norm": 6.9375, "learning_rate": 2.2244179750947483e-05, "loss": 0.1017, "loss_lm": 0.01866106688976288, "loss_seg": 0.08307652547955513, "mean_token_accuracy": 0.9952420443296432, "num_tokens": 1363880229.0, "step": 3209 }, { "entropy": 0.018806074280291796, "epoch": 1.4048582995951417, "grad_norm": 5.46875, "learning_rate": 2.2241472658364917e-05, "loss": 0.1401, "loss_lm": 0.016324032098054886, "loss_seg": 0.12373282760381699, "mean_token_accuracy": 0.9951422363519669, "num_tokens": 1364305721.0, "step": 3210 }, { "entropy": 0.01864958042278886, "epoch": 1.4052959842433528, "grad_norm": 12.8125, "learning_rate": 2.223876556578235e-05, "loss": 0.1374, "loss_lm": 0.014760294230654836, "loss_seg": 0.12265128456056118, "mean_token_accuracy": 0.9951930642127991, "num_tokens": 1364730943.0, "step": 3211 }, { "entropy": 0.018832477740943432, "epoch": 1.4057336688915636, "grad_norm": 6.625, "learning_rate": 2.2236058473199785e-05, "loss": 0.1249, "loss_lm": 0.016316361259669065, "loss_seg": 0.10853559896349907, "mean_token_accuracy": 0.9951193928718567, "num_tokens": 1365156041.0, "step": 3212 }, { "entropy": 0.019404083024710417, "epoch": 1.4061713535397746, "grad_norm": 8.0, "learning_rate": 2.2233351380617215e-05, "loss": 0.1623, "loss_lm": 0.018766033463180065, "loss_seg": 0.14357453770935535, "mean_token_accuracy": 0.9949974417686462, "num_tokens": 1365581392.0, "step": 3213 }, { "entropy": 0.019191516563296318, "epoch": 1.4066090381879857, "grad_norm": 5.125, "learning_rate": 2.2230644288034652e-05, "loss": 0.1222, "loss_lm": 0.01807127334177494, "loss_seg": 0.10409554280340672, "mean_token_accuracy": 0.9951063841581345, "num_tokens": 1366006791.0, "step": 3214 }, { "entropy": 0.018740070052444935, "epoch": 1.4070467228361965, "grad_norm": 7.0625, "learning_rate": 2.2227937195452086e-05, "loss": 0.0986, "loss_lm": 0.01693433104082942, "loss_seg": 0.0816371738910675, "mean_token_accuracy": 0.9951249063014984, "num_tokens": 1366432613.0, "step": 3215 }, { "entropy": 0.01899102097377181, "epoch": 1.4074844074844075, "grad_norm": 6.84375, "learning_rate": 2.222523010286952e-05, "loss": 0.1295, "loss_lm": 0.016209510387852788, "loss_seg": 0.1133007351309061, "mean_token_accuracy": 0.9950401335954666, "num_tokens": 1366857388.0, "step": 3216 }, { "entropy": 0.01889835251495242, "epoch": 1.4079220921326185, "grad_norm": 7.03125, "learning_rate": 2.2222523010286953e-05, "loss": 0.1419, "loss_lm": 0.015630575362592936, "loss_seg": 0.12629388831555843, "mean_token_accuracy": 0.9951025396585464, "num_tokens": 1367282257.0, "step": 3217 }, { "entropy": 0.01863180147483945, "epoch": 1.4083597767808294, "grad_norm": 8.1875, "learning_rate": 2.2219815917704384e-05, "loss": 0.1317, "loss_lm": 0.014696873258799314, "loss_seg": 0.11701318249106407, "mean_token_accuracy": 0.9952095746994019, "num_tokens": 1367707446.0, "step": 3218 }, { "entropy": 0.018133575096726418, "epoch": 1.4087974614290404, "grad_norm": 26.125, "learning_rate": 2.221710882512182e-05, "loss": 0.145, "loss_lm": 0.01831447472795844, "loss_seg": 0.1267199795693159, "mean_token_accuracy": 0.9952151924371719, "num_tokens": 1368133114.0, "step": 3219 }, { "entropy": 0.019283331464976072, "epoch": 1.4092351460772514, "grad_norm": 3.734375, "learning_rate": 2.2214401732539254e-05, "loss": 0.0872, "loss_lm": 0.01648942823521793, "loss_seg": 0.07067565340548754, "mean_token_accuracy": 0.9949928820133209, "num_tokens": 1368558705.0, "step": 3220 }, { "entropy": 0.018510960042476654, "epoch": 1.4096728307254622, "grad_norm": 8.8125, "learning_rate": 2.2211694639956688e-05, "loss": 0.1475, "loss_lm": 0.01855030143633485, "loss_seg": 0.12895381078124046, "mean_token_accuracy": 0.9952546060085297, "num_tokens": 1368983904.0, "step": 3221 }, { "entropy": 0.018435634672641754, "epoch": 1.4101105153736733, "grad_norm": 7.0, "learning_rate": 2.2208987547374122e-05, "loss": 0.0807, "loss_lm": 0.017099043121561408, "loss_seg": 0.06362568773329258, "mean_token_accuracy": 0.9952638447284698, "num_tokens": 1369408250.0, "step": 3222 }, { "entropy": 0.018515396397560835, "epoch": 1.4105482000218843, "grad_norm": 10.625, "learning_rate": 2.2206280454791552e-05, "loss": 0.1273, "loss_lm": 0.015895449789240956, "loss_seg": 0.11137843132019043, "mean_token_accuracy": 0.9951827526092529, "num_tokens": 1369832956.0, "step": 3223 }, { "entropy": 0.017995129339396954, "epoch": 1.4109858846700951, "grad_norm": 17.5, "learning_rate": 2.2203573362208986e-05, "loss": 0.0997, "loss_lm": 0.013621762627735734, "loss_seg": 0.0860597062855959, "mean_token_accuracy": 0.9953409135341644, "num_tokens": 1370257672.0, "step": 3224 }, { "entropy": 0.018422933295369148, "epoch": 1.4114235693183061, "grad_norm": 8.8125, "learning_rate": 2.2200866269626423e-05, "loss": 0.1081, "loss_lm": 0.01799735752865672, "loss_seg": 0.0900605358183384, "mean_token_accuracy": 0.9952421933412552, "num_tokens": 1370682404.0, "step": 3225 }, { "entropy": 0.019004884641617537, "epoch": 1.4118612539665172, "grad_norm": 18.0, "learning_rate": 2.2198159177043857e-05, "loss": 0.0985, "loss_lm": 0.01651674509048462, "loss_seg": 0.08194885216653347, "mean_token_accuracy": 0.9952507168054581, "num_tokens": 1371107280.0, "step": 3226 }, { "entropy": 0.01841794839128852, "epoch": 1.412298938614728, "grad_norm": 4.03125, "learning_rate": 2.219545208446129e-05, "loss": 0.0887, "loss_lm": 0.01586755132302642, "loss_seg": 0.07284386362880468, "mean_token_accuracy": 0.99519282579422, "num_tokens": 1371532193.0, "step": 3227 }, { "entropy": 0.01880839839577675, "epoch": 1.412736623262939, "grad_norm": 9.5, "learning_rate": 2.219274499187872e-05, "loss": 0.177, "loss_lm": 0.016537261428311467, "loss_seg": 0.16048787906765938, "mean_token_accuracy": 0.9951914101839066, "num_tokens": 1371957652.0, "step": 3228 }, { "entropy": 0.018686634488403797, "epoch": 1.41317430791115, "grad_norm": 29.625, "learning_rate": 2.2190037899296155e-05, "loss": 0.1262, "loss_lm": 0.016706894617527723, "loss_seg": 0.10949760489165783, "mean_token_accuracy": 0.9950359612703323, "num_tokens": 1372383718.0, "step": 3229 }, { "entropy": 0.01821760879829526, "epoch": 1.4136119925593609, "grad_norm": 11.0625, "learning_rate": 2.218733080671359e-05, "loss": 0.1072, "loss_lm": 0.014862104319036007, "loss_seg": 0.09230864979326725, "mean_token_accuracy": 0.9952929317951202, "num_tokens": 1372808789.0, "step": 3230 }, { "entropy": 0.018754197284579277, "epoch": 1.414049677207572, "grad_norm": 7.5, "learning_rate": 2.2184623714131025e-05, "loss": 0.0954, "loss_lm": 0.014683909015730023, "loss_seg": 0.08071949146687984, "mean_token_accuracy": 0.995142325758934, "num_tokens": 1373233869.0, "step": 3231 }, { "entropy": 0.019321277737617493, "epoch": 1.414487361855783, "grad_norm": 7.0, "learning_rate": 2.2181916621548456e-05, "loss": 0.134, "loss_lm": 0.01932158600538969, "loss_seg": 0.11469155736267567, "mean_token_accuracy": 0.9950472712516785, "num_tokens": 1373658469.0, "step": 3232 }, { "entropy": 0.01856856234371662, "epoch": 1.4149250465039938, "grad_norm": 7.28125, "learning_rate": 2.217920952896589e-05, "loss": 0.13, "loss_lm": 0.016751936404034495, "loss_seg": 0.11324212700128555, "mean_token_accuracy": 0.9951004832983017, "num_tokens": 1374084201.0, "step": 3233 }, { "entropy": 0.018389212898910046, "epoch": 1.4153627311522048, "grad_norm": 25.375, "learning_rate": 2.2176502436383323e-05, "loss": 0.1405, "loss_lm": 0.016610752558335662, "loss_seg": 0.12384915072470903, "mean_token_accuracy": 0.9952924847602844, "num_tokens": 1374509038.0, "step": 3234 }, { "entropy": 0.018373229540884495, "epoch": 1.4158004158004158, "grad_norm": 6.625, "learning_rate": 2.217379534380076e-05, "loss": 0.1354, "loss_lm": 0.01625415636226535, "loss_seg": 0.11915096268057823, "mean_token_accuracy": 0.995247557759285, "num_tokens": 1374933204.0, "step": 3235 }, { "entropy": 0.018844129517674446, "epoch": 1.4162381004486266, "grad_norm": 10.6875, "learning_rate": 2.2171088251218194e-05, "loss": 0.1573, "loss_lm": 0.020767803769558668, "loss_seg": 0.13650579005479813, "mean_token_accuracy": 0.9951221793889999, "num_tokens": 1375358556.0, "step": 3236 }, { "entropy": 0.01909449603408575, "epoch": 1.4166757850968377, "grad_norm": 9.1875, "learning_rate": 2.2168381158635624e-05, "loss": 0.1039, "loss_lm": 0.01609077723696828, "loss_seg": 0.08782240189611912, "mean_token_accuracy": 0.9951430857181549, "num_tokens": 1375783088.0, "step": 3237 }, { "entropy": 0.018785542342811823, "epoch": 1.4171134697450487, "grad_norm": 16.0, "learning_rate": 2.2165674066053058e-05, "loss": 0.0887, "loss_lm": 0.01625905348919332, "loss_seg": 0.07240465003997087, "mean_token_accuracy": 0.9951359927654266, "num_tokens": 1376208218.0, "step": 3238 }, { "entropy": 0.018703189212828875, "epoch": 1.4175511543932597, "grad_norm": 3.734375, "learning_rate": 2.2162966973470492e-05, "loss": 0.1081, "loss_lm": 0.01638323557563126, "loss_seg": 0.0916745737195015, "mean_token_accuracy": 0.995116725564003, "num_tokens": 1376633470.0, "step": 3239 }, { "entropy": 0.018319273833185434, "epoch": 1.4179888390414706, "grad_norm": 4.59375, "learning_rate": 2.216025988088793e-05, "loss": 0.1172, "loss_lm": 0.016604631207883358, "loss_seg": 0.10057072527706623, "mean_token_accuracy": 0.9952351748943329, "num_tokens": 1377058209.0, "step": 3240 }, { "entropy": 0.01833187695592642, "epoch": 1.4184265236896816, "grad_norm": 7.09375, "learning_rate": 2.2157552788305363e-05, "loss": 0.1337, "loss_lm": 0.016760361846536398, "loss_seg": 0.11691457405686378, "mean_token_accuracy": 0.9952999353408813, "num_tokens": 1377483062.0, "step": 3241 }, { "entropy": 0.01878814399242401, "epoch": 1.4188642083378926, "grad_norm": 7.90625, "learning_rate": 2.2154845695722793e-05, "loss": 0.1186, "loss_lm": 0.01727903657592833, "loss_seg": 0.10128665529191494, "mean_token_accuracy": 0.9952096492052078, "num_tokens": 1377908228.0, "step": 3242 }, { "entropy": 0.018560316413640976, "epoch": 1.4193018929861034, "grad_norm": 8.5, "learning_rate": 2.2152138603140227e-05, "loss": 0.1242, "loss_lm": 0.016593433218076825, "loss_seg": 0.10758759453892708, "mean_token_accuracy": 0.9951618611812592, "num_tokens": 1378332924.0, "step": 3243 }, { "entropy": 0.01938707660883665, "epoch": 1.4197395776343145, "grad_norm": 7.53125, "learning_rate": 2.214943151055766e-05, "loss": 0.0964, "loss_lm": 0.02024205308407545, "loss_seg": 0.07618012931197882, "mean_token_accuracy": 0.995025098323822, "num_tokens": 1378757849.0, "step": 3244 }, { "entropy": 0.019096996635198593, "epoch": 1.4201772622825255, "grad_norm": 5.59375, "learning_rate": 2.2146724417975098e-05, "loss": 0.0927, "loss_lm": 0.01624725665897131, "loss_seg": 0.07640979532152414, "mean_token_accuracy": 0.9950567334890366, "num_tokens": 1379183225.0, "step": 3245 }, { "entropy": 0.019419750664383173, "epoch": 1.4206149469307365, "grad_norm": 14.0, "learning_rate": 2.214401732539253e-05, "loss": 0.1068, "loss_lm": 0.018408370669931173, "loss_seg": 0.08834399469196796, "mean_token_accuracy": 0.9950546473264694, "num_tokens": 1379609462.0, "step": 3246 }, { "entropy": 0.01880883937701583, "epoch": 1.4210526315789473, "grad_norm": 33.0, "learning_rate": 2.214131023280996e-05, "loss": 0.1514, "loss_lm": 0.017732860520482063, "loss_seg": 0.13366862386465073, "mean_token_accuracy": 0.9951679706573486, "num_tokens": 1380034527.0, "step": 3247 }, { "entropy": 0.018707637209445238, "epoch": 1.4214903162271584, "grad_norm": 8.75, "learning_rate": 2.2138603140227395e-05, "loss": 0.0878, "loss_lm": 0.016209630528464913, "loss_seg": 0.07157541252672672, "mean_token_accuracy": 0.9952923804521561, "num_tokens": 1380459713.0, "step": 3248 }, { "entropy": 0.018686714582145214, "epoch": 1.4219280008753694, "grad_norm": 4.53125, "learning_rate": 2.213589604764483e-05, "loss": 0.1314, "loss_lm": 0.017182289622724056, "loss_seg": 0.11418657377362251, "mean_token_accuracy": 0.9952116310596466, "num_tokens": 1380885707.0, "step": 3249 }, { "entropy": 0.018276452086865902, "epoch": 1.4223656855235802, "grad_norm": 11.5, "learning_rate": 2.2133188955062266e-05, "loss": 0.0984, "loss_lm": 0.015893215546384454, "loss_seg": 0.08247793465852737, "mean_token_accuracy": 0.9953227043151855, "num_tokens": 1381310681.0, "step": 3250 }, { "entropy": 0.01862480165436864, "epoch": 1.4228033701717913, "grad_norm": 14.75, "learning_rate": 2.21304818624797e-05, "loss": 0.0941, "loss_lm": 0.014738735975697637, "loss_seg": 0.07935414463281631, "mean_token_accuracy": 0.9953515529632568, "num_tokens": 1381736118.0, "step": 3251 }, { "entropy": 0.01947695715352893, "epoch": 1.4232410548200023, "grad_norm": 10.25, "learning_rate": 2.212777476989713e-05, "loss": 0.166, "loss_lm": 0.01848143758252263, "loss_seg": 0.14747760444879532, "mean_token_accuracy": 0.99496890604496, "num_tokens": 1382161440.0, "step": 3252 }, { "entropy": 0.01848479826003313, "epoch": 1.423678739468213, "grad_norm": 13.5625, "learning_rate": 2.2125067677314564e-05, "loss": 0.1275, "loss_lm": 0.01704496773891151, "loss_seg": 0.11048053577542305, "mean_token_accuracy": 0.9952127039432526, "num_tokens": 1382586876.0, "step": 3253 }, { "entropy": 0.018698133062571287, "epoch": 1.4241164241164241, "grad_norm": 9.4375, "learning_rate": 2.2122360584731998e-05, "loss": 0.1261, "loss_lm": 0.016481163445860147, "loss_seg": 0.10963277891278267, "mean_token_accuracy": 0.9951067864894867, "num_tokens": 1383012332.0, "step": 3254 }, { "entropy": 0.01931195380166173, "epoch": 1.4245541087646352, "grad_norm": 7.4375, "learning_rate": 2.211965349214943e-05, "loss": 0.1185, "loss_lm": 0.01787584857083857, "loss_seg": 0.10061833169311285, "mean_token_accuracy": 0.9951808154582977, "num_tokens": 1383437274.0, "step": 3255 }, { "entropy": 0.018765296787023544, "epoch": 1.424991793412846, "grad_norm": 7.96875, "learning_rate": 2.2116946399566865e-05, "loss": 0.1159, "loss_lm": 0.015387529972940683, "loss_seg": 0.10053827054798603, "mean_token_accuracy": 0.995248332619667, "num_tokens": 1383862353.0, "step": 3256 }, { "entropy": 0.018129197414964437, "epoch": 1.425429478061057, "grad_norm": 6.0, "learning_rate": 2.21142393069843e-05, "loss": 0.0872, "loss_lm": 0.015361960977315903, "loss_seg": 0.07185216620564461, "mean_token_accuracy": 0.995398759841919, "num_tokens": 1384286965.0, "step": 3257 }, { "entropy": 0.017936588265001774, "epoch": 1.425867162709268, "grad_norm": 7.40625, "learning_rate": 2.2111532214401733e-05, "loss": 0.1108, "loss_lm": 0.017301318468526006, "loss_seg": 0.09348340891301632, "mean_token_accuracy": 0.9954627305269241, "num_tokens": 1384712043.0, "step": 3258 }, { "entropy": 0.018245712853968143, "epoch": 1.4263048473574789, "grad_norm": 11.0, "learning_rate": 2.2108825121819166e-05, "loss": 0.139, "loss_lm": 0.016653158236294985, "loss_seg": 0.1223396360874176, "mean_token_accuracy": 0.9952534586191177, "num_tokens": 1385136947.0, "step": 3259 }, { "entropy": 0.018387087155133486, "epoch": 1.42674253200569, "grad_norm": 6.34375, "learning_rate": 2.21061180292366e-05, "loss": 0.1321, "loss_lm": 0.01604864071123302, "loss_seg": 0.11608543992042542, "mean_token_accuracy": 0.9952395111322403, "num_tokens": 1385561461.0, "step": 3260 }, { "entropy": 0.018766250927001238, "epoch": 1.427180216653901, "grad_norm": 24.625, "learning_rate": 2.2103410936654034e-05, "loss": 0.1358, "loss_lm": 0.016622215043753386, "loss_seg": 0.11918992176651955, "mean_token_accuracy": 0.9951430559158325, "num_tokens": 1385986595.0, "step": 3261 }, { "entropy": 0.018863700795918703, "epoch": 1.4276179013021117, "grad_norm": 7.65625, "learning_rate": 2.2100703844071468e-05, "loss": 0.1264, "loss_lm": 0.0168160165194422, "loss_seg": 0.10956376045942307, "mean_token_accuracy": 0.9951784610748291, "num_tokens": 1386411745.0, "step": 3262 }, { "entropy": 0.01819724217057228, "epoch": 1.4280555859503228, "grad_norm": 15.625, "learning_rate": 2.20979967514889e-05, "loss": 0.173, "loss_lm": 0.01740923640318215, "loss_seg": 0.15559172630310059, "mean_token_accuracy": 0.9953183680772781, "num_tokens": 1386836994.0, "step": 3263 }, { "entropy": 0.01908411644399166, "epoch": 1.4284932705985338, "grad_norm": 11.25, "learning_rate": 2.2095289658906335e-05, "loss": 0.1511, "loss_lm": 0.016304922057315707, "loss_seg": 0.13483943231403828, "mean_token_accuracy": 0.9951348751783371, "num_tokens": 1387262413.0, "step": 3264 }, { "entropy": 0.01921625342220068, "epoch": 1.4289309552467446, "grad_norm": 6.71875, "learning_rate": 2.209258256632377e-05, "loss": 0.1431, "loss_lm": 0.01536064245738089, "loss_seg": 0.1277828123420477, "mean_token_accuracy": 0.9952265471220016, "num_tokens": 1387687634.0, "step": 3265 }, { "entropy": 0.018412633799016476, "epoch": 1.4293686398949557, "grad_norm": 6.03125, "learning_rate": 2.2089875473741202e-05, "loss": 0.1194, "loss_lm": 0.014712592819705606, "loss_seg": 0.10468651540577412, "mean_token_accuracy": 0.9951957911252975, "num_tokens": 1388111805.0, "step": 3266 }, { "entropy": 0.019304173067212105, "epoch": 1.4298063245431667, "grad_norm": 12.125, "learning_rate": 2.2087168381158636e-05, "loss": 0.1296, "loss_lm": 0.016936480766162276, "loss_seg": 0.11263859085738659, "mean_token_accuracy": 0.9951313585042953, "num_tokens": 1388536535.0, "step": 3267 }, { "entropy": 0.018836475908756256, "epoch": 1.4302440091913775, "grad_norm": 8.5, "learning_rate": 2.208446128857607e-05, "loss": 0.1263, "loss_lm": 0.017353378934785724, "loss_seg": 0.10898677073419094, "mean_token_accuracy": 0.9952294826507568, "num_tokens": 1388960949.0, "step": 3268 }, { "entropy": 0.018380258698016405, "epoch": 1.4306816938395885, "grad_norm": 13.3125, "learning_rate": 2.2081754195993504e-05, "loss": 0.1322, "loss_lm": 0.015184670221060514, "loss_seg": 0.1170447263866663, "mean_token_accuracy": 0.9952471703290939, "num_tokens": 1389385910.0, "step": 3269 }, { "entropy": 0.018588079139590263, "epoch": 1.4311193784877996, "grad_norm": 6.375, "learning_rate": 2.2079047103410937e-05, "loss": 0.1578, "loss_lm": 0.017520420253276825, "loss_seg": 0.14032031036913395, "mean_token_accuracy": 0.995270386338234, "num_tokens": 1389810419.0, "step": 3270 }, { "entropy": 0.01844443054869771, "epoch": 1.4315570631360104, "grad_norm": 6.09375, "learning_rate": 2.207634001082837e-05, "loss": 0.1094, "loss_lm": 0.01720439433120191, "loss_seg": 0.0922295805066824, "mean_token_accuracy": 0.9952530562877655, "num_tokens": 1390234876.0, "step": 3271 }, { "entropy": 0.019909767899662256, "epoch": 1.4319947477842214, "grad_norm": 25.375, "learning_rate": 2.2073632918245805e-05, "loss": 0.1268, "loss_lm": 0.019093662034720182, "loss_seg": 0.10767803341150284, "mean_token_accuracy": 0.9949759542942047, "num_tokens": 1390661189.0, "step": 3272 }, { "entropy": 0.019209329038858414, "epoch": 1.4324324324324325, "grad_norm": 7.59375, "learning_rate": 2.207092582566324e-05, "loss": 0.1548, "loss_lm": 0.016434935620054603, "loss_seg": 0.1383852045983076, "mean_token_accuracy": 0.9951484501361847, "num_tokens": 1391085569.0, "step": 3273 }, { "entropy": 0.018400342669337988, "epoch": 1.4328701170806433, "grad_norm": 17.875, "learning_rate": 2.2068218733080672e-05, "loss": 0.093, "loss_lm": 0.014049193589016795, "loss_seg": 0.07894541881978512, "mean_token_accuracy": 0.9952452331781387, "num_tokens": 1391510768.0, "step": 3274 }, { "entropy": 0.018876406364142895, "epoch": 1.4333078017288543, "grad_norm": 18.875, "learning_rate": 2.2065511640498106e-05, "loss": 0.1322, "loss_lm": 0.014953901758417487, "loss_seg": 0.11726466938853264, "mean_token_accuracy": 0.9951810538768768, "num_tokens": 1391936155.0, "step": 3275 }, { "entropy": 0.01891611609607935, "epoch": 1.4337454863770653, "grad_norm": 11.8125, "learning_rate": 2.206280454791554e-05, "loss": 0.1099, "loss_lm": 0.016029335092753172, "loss_seg": 0.09386909380555153, "mean_token_accuracy": 0.9951190650463104, "num_tokens": 1392361264.0, "step": 3276 }, { "entropy": 0.018422852735966444, "epoch": 1.4341831710252764, "grad_norm": 9.6875, "learning_rate": 2.2060097455332973e-05, "loss": 0.1284, "loss_lm": 0.015192807652056217, "loss_seg": 0.11324559897184372, "mean_token_accuracy": 0.9952273070812225, "num_tokens": 1392785919.0, "step": 3277 }, { "entropy": 0.018468819092959166, "epoch": 1.4346208556734872, "grad_norm": 11.3125, "learning_rate": 2.2057390362750407e-05, "loss": 0.0975, "loss_lm": 0.01710777124390006, "loss_seg": 0.08039254881441593, "mean_token_accuracy": 0.9952086210250854, "num_tokens": 1393211438.0, "step": 3278 }, { "entropy": 0.018661754205822945, "epoch": 1.4350585403216982, "grad_norm": 17.875, "learning_rate": 2.205468327016784e-05, "loss": 0.1156, "loss_lm": 0.018514936789870262, "loss_seg": 0.09703951515257359, "mean_token_accuracy": 0.9952678978443146, "num_tokens": 1393636587.0, "step": 3279 }, { "entropy": 0.019447929225862026, "epoch": 1.4354962249699093, "grad_norm": 10.5, "learning_rate": 2.205197617758527e-05, "loss": 0.1238, "loss_lm": 0.015213696984574199, "loss_seg": 0.10859149508178234, "mean_token_accuracy": 0.9948657304048538, "num_tokens": 1394062729.0, "step": 3280 }, { "entropy": 0.0185407642275095, "epoch": 1.43593390961812, "grad_norm": 13.25, "learning_rate": 2.204926908500271e-05, "loss": 0.199, "loss_lm": 0.013871717499569058, "loss_seg": 0.18513961136341095, "mean_token_accuracy": 0.9952561259269714, "num_tokens": 1394488109.0, "step": 3281 }, { "entropy": 0.018725662026554346, "epoch": 1.436371594266331, "grad_norm": 6.6875, "learning_rate": 2.2046561992420142e-05, "loss": 0.1163, "loss_lm": 0.0170474702026695, "loss_seg": 0.09925846941769123, "mean_token_accuracy": 0.9951991140842438, "num_tokens": 1394913388.0, "step": 3282 }, { "entropy": 0.017939453944563866, "epoch": 1.4368092789145421, "grad_norm": 15.625, "learning_rate": 2.2043854899837576e-05, "loss": 0.1471, "loss_lm": 0.015472961822524667, "loss_seg": 0.13163226284086704, "mean_token_accuracy": 0.995420902967453, "num_tokens": 1395337620.0, "step": 3283 }, { "entropy": 0.01879996107891202, "epoch": 1.4372469635627532, "grad_norm": 5.09375, "learning_rate": 2.204114780725501e-05, "loss": 0.0891, "loss_lm": 0.016470626927912235, "loss_seg": 0.07267736829817295, "mean_token_accuracy": 0.9950496554374695, "num_tokens": 1395762814.0, "step": 3284 }, { "entropy": 0.018325716722756624, "epoch": 1.437684648210964, "grad_norm": 12.8125, "learning_rate": 2.203844071467244e-05, "loss": 0.2027, "loss_lm": 0.01652303128503263, "loss_seg": 0.18622193299233913, "mean_token_accuracy": 0.9952553361654282, "num_tokens": 1396187858.0, "step": 3285 }, { "entropy": 0.018960495945066214, "epoch": 1.438122332859175, "grad_norm": 6.9375, "learning_rate": 2.2035733622089877e-05, "loss": 0.1873, "loss_lm": 0.017812704434618354, "loss_seg": 0.1695068385452032, "mean_token_accuracy": 0.9950623214244843, "num_tokens": 1396613122.0, "step": 3286 }, { "entropy": 0.018094558268785477, "epoch": 1.438560017507386, "grad_norm": 5.46875, "learning_rate": 2.203302652950731e-05, "loss": 0.1191, "loss_lm": 0.01625644857995212, "loss_seg": 0.1028775330632925, "mean_token_accuracy": 0.9952629953622818, "num_tokens": 1397037870.0, "step": 3287 }, { "entropy": 0.018321544397622347, "epoch": 1.4389977021555969, "grad_norm": 10.8125, "learning_rate": 2.2030319436924744e-05, "loss": 0.1189, "loss_lm": 0.017747991951182485, "loss_seg": 0.10110641084611416, "mean_token_accuracy": 0.99519482254982, "num_tokens": 1397462601.0, "step": 3288 }, { "entropy": 0.01896147197112441, "epoch": 1.439435386803808, "grad_norm": 5.34375, "learning_rate": 2.2027612344342178e-05, "loss": 0.1223, "loss_lm": 0.017971392953768373, "loss_seg": 0.1042942963540554, "mean_token_accuracy": 0.9950940907001495, "num_tokens": 1397887992.0, "step": 3289 }, { "entropy": 0.017876219004392624, "epoch": 1.439873071452019, "grad_norm": 5.125, "learning_rate": 2.202490525175961e-05, "loss": 0.1105, "loss_lm": 0.014222868252545595, "loss_seg": 0.09626634232699871, "mean_token_accuracy": 0.9953685253858566, "num_tokens": 1398313081.0, "step": 3290 }, { "entropy": 0.018855289556086063, "epoch": 1.4403107561002297, "grad_norm": 39.25, "learning_rate": 2.2022198159177042e-05, "loss": 0.0962, "loss_lm": 0.014923156471922994, "loss_seg": 0.08123763743788004, "mean_token_accuracy": 0.9950024634599686, "num_tokens": 1398738285.0, "step": 3291 }, { "entropy": 0.01879999926313758, "epoch": 1.4407484407484408, "grad_norm": 6.96875, "learning_rate": 2.201949106659448e-05, "loss": 0.1759, "loss_lm": 0.01734020235016942, "loss_seg": 0.15858867764472961, "mean_token_accuracy": 0.9951877444982529, "num_tokens": 1399162590.0, "step": 3292 }, { "entropy": 0.018258333206176758, "epoch": 1.4411861253966518, "grad_norm": 6.15625, "learning_rate": 2.2016783974011913e-05, "loss": 0.1158, "loss_lm": 0.01549671939574182, "loss_seg": 0.1002880148589611, "mean_token_accuracy": 0.9953271746635437, "num_tokens": 1399587136.0, "step": 3293 }, { "entropy": 0.019049600698053837, "epoch": 1.4416238100448626, "grad_norm": 4.40625, "learning_rate": 2.2014076881429347e-05, "loss": 0.1233, "loss_lm": 0.017907695611938834, "loss_seg": 0.10543635673820972, "mean_token_accuracy": 0.9952510446310043, "num_tokens": 1400012655.0, "step": 3294 }, { "entropy": 0.018263082019984722, "epoch": 1.4420614946930737, "grad_norm": 9.375, "learning_rate": 2.2011369788846777e-05, "loss": 0.1016, "loss_lm": 0.015176638728007674, "loss_seg": 0.0864058118313551, "mean_token_accuracy": 0.9952794313430786, "num_tokens": 1400438009.0, "step": 3295 }, { "entropy": 0.01831934368237853, "epoch": 1.4424991793412847, "grad_norm": 17.625, "learning_rate": 2.200866269626421e-05, "loss": 0.1381, "loss_lm": 0.019929000176489353, "loss_seg": 0.11817960813641548, "mean_token_accuracy": 0.9952197074890137, "num_tokens": 1400862422.0, "step": 3296 }, { "entropy": 0.018570706248283386, "epoch": 1.4429368639894955, "grad_norm": 15.0, "learning_rate": 2.2005955603681648e-05, "loss": 0.1331, "loss_lm": 0.014777411241084337, "loss_seg": 0.11835317499935627, "mean_token_accuracy": 0.9953037649393082, "num_tokens": 1401287823.0, "step": 3297 }, { "entropy": 0.019218022469431162, "epoch": 1.4433745486377065, "grad_norm": 7.4375, "learning_rate": 2.200324851109908e-05, "loss": 0.1142, "loss_lm": 0.018487317487597466, "loss_seg": 0.09567676857113838, "mean_token_accuracy": 0.9951184689998627, "num_tokens": 1401713275.0, "step": 3298 }, { "entropy": 0.018238509073853493, "epoch": 1.4438122332859176, "grad_norm": 32.5, "learning_rate": 2.2000541418516515e-05, "loss": 0.0962, "loss_lm": 0.015496333362534642, "loss_seg": 0.08074485510587692, "mean_token_accuracy": 0.9952627420425415, "num_tokens": 1402138353.0, "step": 3299 }, { "entropy": 0.01827802788466215, "epoch": 1.4442499179341284, "grad_norm": 10.6875, "learning_rate": 2.1997834325933946e-05, "loss": 0.0894, "loss_lm": 0.013419656315818429, "loss_seg": 0.07597491424530745, "mean_token_accuracy": 0.9952868819236755, "num_tokens": 1402563714.0, "step": 3300 }, { "entropy": 0.018639513291418552, "epoch": 1.4446876025823394, "grad_norm": 13.125, "learning_rate": 2.199512723335138e-05, "loss": 0.0779, "loss_lm": 0.015634848503395915, "loss_seg": 0.06228988245129585, "mean_token_accuracy": 0.9952130019664764, "num_tokens": 1402988574.0, "step": 3301 }, { "entropy": 0.01828896114602685, "epoch": 1.4451252872305504, "grad_norm": 13.4375, "learning_rate": 2.1992420140768817e-05, "loss": 0.1286, "loss_lm": 0.014252487337216735, "loss_seg": 0.11435593105852604, "mean_token_accuracy": 0.995265543460846, "num_tokens": 1403413052.0, "step": 3302 }, { "entropy": 0.018210163805633783, "epoch": 1.4455629718787613, "grad_norm": 5.4375, "learning_rate": 2.198971304818625e-05, "loss": 0.1177, "loss_lm": 0.01404140004888177, "loss_seg": 0.10366186872124672, "mean_token_accuracy": 0.995321735739708, "num_tokens": 1403837961.0, "step": 3303 }, { "entropy": 0.01904019806534052, "epoch": 1.4460006565269723, "grad_norm": 14.1875, "learning_rate": 2.198700595560368e-05, "loss": 0.1594, "loss_lm": 0.016533078625798225, "loss_seg": 0.14287662319839, "mean_token_accuracy": 0.9951108545064926, "num_tokens": 1404262734.0, "step": 3304 }, { "entropy": 0.018387078773230314, "epoch": 1.4464383411751833, "grad_norm": 12.375, "learning_rate": 2.1984298863021114e-05, "loss": 0.1462, "loss_lm": 0.016595502849668264, "loss_seg": 0.12959822453558445, "mean_token_accuracy": 0.9952134490013123, "num_tokens": 1404688484.0, "step": 3305 }, { "entropy": 0.018275729846209288, "epoch": 1.4468760258233941, "grad_norm": 7.03125, "learning_rate": 2.1981591770438548e-05, "loss": 0.1118, "loss_lm": 0.01538537465967238, "loss_seg": 0.09644005354493856, "mean_token_accuracy": 0.9952165633440018, "num_tokens": 1405113862.0, "step": 3306 }, { "entropy": 0.018328561447560787, "epoch": 1.4473137104716052, "grad_norm": 11.0, "learning_rate": 2.1978884677855985e-05, "loss": 0.1766, "loss_lm": 0.016194414580240846, "loss_seg": 0.1604461818933487, "mean_token_accuracy": 0.9952785074710846, "num_tokens": 1405539105.0, "step": 3307 }, { "entropy": 0.019058907870203257, "epoch": 1.4477513951198162, "grad_norm": 21.0, "learning_rate": 2.197617758527342e-05, "loss": 0.1788, "loss_lm": 0.013843971537426114, "loss_seg": 0.16494639962911606, "mean_token_accuracy": 0.9950879216194153, "num_tokens": 1405964353.0, "step": 3308 }, { "entropy": 0.019071975722908974, "epoch": 1.448189079768027, "grad_norm": 17.5, "learning_rate": 2.197347049269085e-05, "loss": 0.0897, "loss_lm": 0.017112259520217776, "loss_seg": 0.07261205743998289, "mean_token_accuracy": 0.9950300604104996, "num_tokens": 1406389701.0, "step": 3309 }, { "entropy": 0.018439787440001965, "epoch": 1.448626764416238, "grad_norm": 15.875, "learning_rate": 2.1970763400108283e-05, "loss": 0.1518, "loss_lm": 0.015615907730534673, "loss_seg": 0.13620425574481487, "mean_token_accuracy": 0.995324119925499, "num_tokens": 1406815847.0, "step": 3310 }, { "entropy": 0.01857677847146988, "epoch": 1.449064449064449, "grad_norm": 7.4375, "learning_rate": 2.1968056307525717e-05, "loss": 0.1059, "loss_lm": 0.014861002331599593, "loss_seg": 0.0910080298781395, "mean_token_accuracy": 0.9951608031988144, "num_tokens": 1407241104.0, "step": 3311 }, { "entropy": 0.018603964243084192, "epoch": 1.44950213371266, "grad_norm": 7.3125, "learning_rate": 2.1965349214943154e-05, "loss": 0.1464, "loss_lm": 0.014940897468477488, "loss_seg": 0.13149508088827133, "mean_token_accuracy": 0.9952004104852676, "num_tokens": 1407665909.0, "step": 3312 }, { "entropy": 0.01814968418329954, "epoch": 1.449939818360871, "grad_norm": 5.4375, "learning_rate": 2.1962642122360588e-05, "loss": 0.1059, "loss_lm": 0.016600042581558228, "loss_seg": 0.08932872302830219, "mean_token_accuracy": 0.99530428647995, "num_tokens": 1408091361.0, "step": 3313 }, { "entropy": 0.018838760908693075, "epoch": 1.450377503009082, "grad_norm": 6.59375, "learning_rate": 2.1959935029778018e-05, "loss": 0.1046, "loss_lm": 0.014908811310306191, "loss_seg": 0.08969012834131718, "mean_token_accuracy": 0.9951482564210892, "num_tokens": 1408515914.0, "step": 3314 }, { "entropy": 0.01856426103040576, "epoch": 1.4508151876572928, "grad_norm": 6.15625, "learning_rate": 2.195722793719545e-05, "loss": 0.1809, "loss_lm": 0.016889306716620922, "loss_seg": 0.16405412182211876, "mean_token_accuracy": 0.9952212572097778, "num_tokens": 1408941710.0, "step": 3315 }, { "entropy": 0.01885298267006874, "epoch": 1.4512528723055038, "grad_norm": 143.0, "learning_rate": 2.1954520844612885e-05, "loss": 0.1769, "loss_lm": 0.019895855337381363, "loss_seg": 0.15699234791100025, "mean_token_accuracy": 0.9952321350574493, "num_tokens": 1409365986.0, "step": 3316 }, { "entropy": 0.018191315699368715, "epoch": 1.4516905569537149, "grad_norm": 15.375, "learning_rate": 2.1951813752030322e-05, "loss": 0.0872, "loss_lm": 0.015967413783073425, "loss_seg": 0.07119763363152742, "mean_token_accuracy": 0.9954060316085815, "num_tokens": 1409791042.0, "step": 3317 }, { "entropy": 0.018792624585330486, "epoch": 1.4521282416019259, "grad_norm": 5.34375, "learning_rate": 2.1949106659447756e-05, "loss": 0.1019, "loss_lm": 0.017819880042225122, "loss_seg": 0.08410442806780338, "mean_token_accuracy": 0.9951200038194656, "num_tokens": 1410215801.0, "step": 3318 }, { "entropy": 0.019073127768933773, "epoch": 1.4525659262501367, "grad_norm": 7.625, "learning_rate": 2.1946399566865187e-05, "loss": 0.1402, "loss_lm": 0.014904672978445888, "loss_seg": 0.12526543997228146, "mean_token_accuracy": 0.9950750321149826, "num_tokens": 1410641149.0, "step": 3319 }, { "entropy": 0.018255016766488552, "epoch": 1.4530036108983477, "grad_norm": 6.53125, "learning_rate": 2.194369247428262e-05, "loss": 0.1709, "loss_lm": 0.01449463702738285, "loss_seg": 0.1563585139811039, "mean_token_accuracy": 0.9954135566949844, "num_tokens": 1411066261.0, "step": 3320 }, { "entropy": 0.018635842017829418, "epoch": 1.4534412955465588, "grad_norm": 8.625, "learning_rate": 2.1940985381700054e-05, "loss": 0.1234, "loss_lm": 0.01730279647745192, "loss_seg": 0.10614147409796715, "mean_token_accuracy": 0.995159924030304, "num_tokens": 1411492411.0, "step": 3321 }, { "entropy": 0.01902051130309701, "epoch": 1.4538789801947698, "grad_norm": 6.875, "learning_rate": 2.1938278289117488e-05, "loss": 0.1532, "loss_lm": 0.01649602479301393, "loss_seg": 0.13670197501778603, "mean_token_accuracy": 0.9949780404567719, "num_tokens": 1411916533.0, "step": 3322 }, { "entropy": 0.01843194756656885, "epoch": 1.4543166648429806, "grad_norm": 11.5625, "learning_rate": 2.1935571196534925e-05, "loss": 0.1074, "loss_lm": 0.016106852795928717, "loss_seg": 0.09130152594298124, "mean_token_accuracy": 0.9952091872692108, "num_tokens": 1412341234.0, "step": 3323 }, { "entropy": 0.018222119193524122, "epoch": 1.4547543494911916, "grad_norm": 10.6875, "learning_rate": 2.1932864103952355e-05, "loss": 0.1376, "loss_lm": 0.016219862271100283, "loss_seg": 0.12133116461336613, "mean_token_accuracy": 0.9952997118234634, "num_tokens": 1412766260.0, "step": 3324 }, { "entropy": 0.019008944276720285, "epoch": 1.4551920341394027, "grad_norm": 12.4375, "learning_rate": 2.193015701136979e-05, "loss": 0.1109, "loss_lm": 0.013776171719655395, "loss_seg": 0.09711072035133839, "mean_token_accuracy": 0.9950959086418152, "num_tokens": 1413191797.0, "step": 3325 }, { "entropy": 0.018611209001392126, "epoch": 1.4556297187876135, "grad_norm": 7.5, "learning_rate": 2.1927449918787223e-05, "loss": 0.1518, "loss_lm": 0.01760191028006375, "loss_seg": 0.13417008891701698, "mean_token_accuracy": 0.9952387660741806, "num_tokens": 1413617058.0, "step": 3326 }, { "entropy": 0.018737418577075005, "epoch": 1.4560674034358245, "grad_norm": 9.0625, "learning_rate": 2.1924742826204656e-05, "loss": 0.1169, "loss_lm": 0.01606496935710311, "loss_seg": 0.10081402957439423, "mean_token_accuracy": 0.995103508234024, "num_tokens": 1414041806.0, "step": 3327 }, { "entropy": 0.018143836874514818, "epoch": 1.4565050880840356, "grad_norm": 9.3125, "learning_rate": 2.192203573362209e-05, "loss": 0.1443, "loss_lm": 0.015794494887813926, "loss_seg": 0.12846271507441998, "mean_token_accuracy": 0.9954380840063095, "num_tokens": 1414466140.0, "step": 3328 }, { "entropy": 0.018698703031986952, "epoch": 1.4569427727322464, "grad_norm": 6.8125, "learning_rate": 2.1919328641039524e-05, "loss": 0.1168, "loss_lm": 0.01724555087275803, "loss_seg": 0.09958983957767487, "mean_token_accuracy": 0.9953119605779648, "num_tokens": 1414891434.0, "step": 3329 }, { "entropy": 0.018544441554695368, "epoch": 1.4573804573804574, "grad_norm": 9.6875, "learning_rate": 2.1916621548456958e-05, "loss": 0.1221, "loss_lm": 0.01645754463970661, "loss_seg": 0.1056010527536273, "mean_token_accuracy": 0.9952291250228882, "num_tokens": 1415316047.0, "step": 3330 }, { "entropy": 0.018885775934904814, "epoch": 1.4578181420286684, "grad_norm": 6.0, "learning_rate": 2.191391445587439e-05, "loss": 0.1192, "loss_lm": 0.014228999149054289, "loss_seg": 0.10499816946685314, "mean_token_accuracy": 0.9952985793352127, "num_tokens": 1415741399.0, "step": 3331 }, { "entropy": 0.019090546760708094, "epoch": 1.4582558266768793, "grad_norm": 16.25, "learning_rate": 2.1911207363291825e-05, "loss": 0.1146, "loss_lm": 0.0192553517408669, "loss_seg": 0.09531519748270512, "mean_token_accuracy": 0.9950425922870636, "num_tokens": 1416166594.0, "step": 3332 }, { "entropy": 0.018209700006991625, "epoch": 1.4586935113250903, "grad_norm": 27.375, "learning_rate": 2.190850027070926e-05, "loss": 0.1045, "loss_lm": 0.015086872270330787, "loss_seg": 0.08945261035114527, "mean_token_accuracy": 0.9953664392232895, "num_tokens": 1416590624.0, "step": 3333 }, { "entropy": 0.019058602396398783, "epoch": 1.4591311959733013, "grad_norm": 10.0, "learning_rate": 2.1905793178126692e-05, "loss": 0.1157, "loss_lm": 0.017146072117611766, "loss_seg": 0.09853420779109001, "mean_token_accuracy": 0.9949834942817688, "num_tokens": 1417016002.0, "step": 3334 }, { "entropy": 0.018854471389204264, "epoch": 1.4595688806215121, "grad_norm": 10.875, "learning_rate": 2.1903086085544126e-05, "loss": 0.1186, "loss_lm": 0.01520347548648715, "loss_seg": 0.10337912105023861, "mean_token_accuracy": 0.9952801167964935, "num_tokens": 1417441128.0, "step": 3335 }, { "entropy": 0.01886706054210663, "epoch": 1.4600065652697232, "grad_norm": 10.1875, "learning_rate": 2.190037899296156e-05, "loss": 0.1427, "loss_lm": 0.014602407114580274, "loss_seg": 0.1281459480524063, "mean_token_accuracy": 0.9952149391174316, "num_tokens": 1417866778.0, "step": 3336 }, { "entropy": 0.018327072728425264, "epoch": 1.4604442499179342, "grad_norm": 16.0, "learning_rate": 2.1897671900378994e-05, "loss": 0.1005, "loss_lm": 0.014637935440987349, "loss_seg": 0.08582267165184021, "mean_token_accuracy": 0.9952647089958191, "num_tokens": 1418291617.0, "step": 3337 }, { "entropy": 0.018397176638245583, "epoch": 1.460881934566145, "grad_norm": 12.9375, "learning_rate": 2.1894964807796427e-05, "loss": 0.1025, "loss_lm": 0.017372861271724105, "loss_seg": 0.08512701839208603, "mean_token_accuracy": 0.9953470081090927, "num_tokens": 1418715808.0, "step": 3338 }, { "entropy": 0.018241430167108774, "epoch": 1.461319619214356, "grad_norm": 12.5, "learning_rate": 2.189225771521386e-05, "loss": 0.1311, "loss_lm": 0.014946104260161519, "loss_seg": 0.11619425844401121, "mean_token_accuracy": 0.9951851069927216, "num_tokens": 1419140914.0, "step": 3339 }, { "entropy": 0.017892957665026188, "epoch": 1.461757303862567, "grad_norm": 6.78125, "learning_rate": 2.1889550622631295e-05, "loss": 0.1169, "loss_lm": 0.015958979725837708, "loss_seg": 0.10094571858644485, "mean_token_accuracy": 0.9952840209007263, "num_tokens": 1419565601.0, "step": 3340 }, { "entropy": 0.018394422717392445, "epoch": 1.462194988510778, "grad_norm": 7.25, "learning_rate": 2.188684353004873e-05, "loss": 0.1431, "loss_lm": 0.01663563889451325, "loss_seg": 0.12648210488259792, "mean_token_accuracy": 0.9952956140041351, "num_tokens": 1419990468.0, "step": 3341 }, { "entropy": 0.018939095083624125, "epoch": 1.462632673158989, "grad_norm": 9.0, "learning_rate": 2.1884136437466162e-05, "loss": 0.0797, "loss_lm": 0.016112256329506636, "loss_seg": 0.06354140024632215, "mean_token_accuracy": 0.9951344877481461, "num_tokens": 1420415678.0, "step": 3342 }, { "entropy": 0.018751127179712057, "epoch": 1.4630703578072, "grad_norm": 12.125, "learning_rate": 2.1881429344883596e-05, "loss": 0.1086, "loss_lm": 0.016924054129049182, "loss_seg": 0.09167724289000034, "mean_token_accuracy": 0.9951927363872528, "num_tokens": 1420840617.0, "step": 3343 }, { "entropy": 0.018300433177500963, "epoch": 1.4635080424554108, "grad_norm": 7.9375, "learning_rate": 2.187872225230103e-05, "loss": 0.1402, "loss_lm": 0.015343699837103486, "loss_seg": 0.12480970844626427, "mean_token_accuracy": 0.9954003095626831, "num_tokens": 1421265379.0, "step": 3344 }, { "entropy": 0.0190693992190063, "epoch": 1.4639457271036218, "grad_norm": 3.0, "learning_rate": 2.1876015159718463e-05, "loss": 0.1453, "loss_lm": 0.01630470994859934, "loss_seg": 0.12894583866000175, "mean_token_accuracy": 0.9950864613056183, "num_tokens": 1421690325.0, "step": 3345 }, { "entropy": 0.019173030741512775, "epoch": 1.4643834117518328, "grad_norm": 5.25, "learning_rate": 2.1873308067135897e-05, "loss": 0.123, "loss_lm": 0.01891552284359932, "loss_seg": 0.1040482334792614, "mean_token_accuracy": 0.9950218200683594, "num_tokens": 1422115154.0, "step": 3346 }, { "entropy": 0.01862857211381197, "epoch": 1.4648210964000437, "grad_norm": 7.5625, "learning_rate": 2.1870600974553327e-05, "loss": 0.1115, "loss_lm": 0.01763521321117878, "loss_seg": 0.09383091237396002, "mean_token_accuracy": 0.9951985627412796, "num_tokens": 1422540620.0, "step": 3347 }, { "entropy": 0.018979248590767384, "epoch": 1.4652587810482547, "grad_norm": 8.6875, "learning_rate": 2.1867893881970765e-05, "loss": 0.1666, "loss_lm": 0.01704051485285163, "loss_seg": 0.14959842152893543, "mean_token_accuracy": 0.9950813800096512, "num_tokens": 1422965690.0, "step": 3348 }, { "entropy": 0.01887133438140154, "epoch": 1.4656964656964657, "grad_norm": 4.6875, "learning_rate": 2.1865186789388198e-05, "loss": 0.1259, "loss_lm": 0.01573451329022646, "loss_seg": 0.11016748007386923, "mean_token_accuracy": 0.9952061325311661, "num_tokens": 1423390787.0, "step": 3349 }, { "entropy": 0.018660790752619505, "epoch": 1.4661341503446765, "grad_norm": 18.0, "learning_rate": 2.1862479696805632e-05, "loss": 0.1226, "loss_lm": 0.01554303546436131, "loss_seg": 0.10707094892859459, "mean_token_accuracy": 0.9951038807630539, "num_tokens": 1423815590.0, "step": 3350 }, { "entropy": 0.018855170346796513, "epoch": 1.4665718349928876, "grad_norm": 15.6875, "learning_rate": 2.1859772604223066e-05, "loss": 0.117, "loss_lm": 0.015905754873529077, "loss_seg": 0.10108179412782192, "mean_token_accuracy": 0.9950821250677109, "num_tokens": 1424240934.0, "step": 3351 }, { "entropy": 0.018971056677401066, "epoch": 1.4670095196410986, "grad_norm": 16.25, "learning_rate": 2.1857065511640496e-05, "loss": 0.1474, "loss_lm": 0.01583090308122337, "loss_seg": 0.13152840733528137, "mean_token_accuracy": 0.9949984699487686, "num_tokens": 1424665568.0, "step": 3352 }, { "entropy": 0.018653301056474447, "epoch": 1.4674472042893094, "grad_norm": 6.6875, "learning_rate": 2.1854358419057933e-05, "loss": 0.1157, "loss_lm": 0.014199662255123258, "loss_seg": 0.10152239166200161, "mean_token_accuracy": 0.9951005280017853, "num_tokens": 1425090909.0, "step": 3353 }, { "entropy": 0.018247915897518396, "epoch": 1.4678848889375204, "grad_norm": 11.5, "learning_rate": 2.1851651326475367e-05, "loss": 0.14, "loss_lm": 0.01451614242978394, "loss_seg": 0.12549232691526413, "mean_token_accuracy": 0.9951484799385071, "num_tokens": 1425515895.0, "step": 3354 }, { "entropy": 0.018317127600312233, "epoch": 1.4683225735857315, "grad_norm": 4.84375, "learning_rate": 2.18489442338928e-05, "loss": 0.1339, "loss_lm": 0.017458354821428657, "loss_seg": 0.11643282696604729, "mean_token_accuracy": 0.9953424632549286, "num_tokens": 1425940100.0, "step": 3355 }, { "entropy": 0.019124918151646852, "epoch": 1.4687602582339425, "grad_norm": 18.75, "learning_rate": 2.1846237141310234e-05, "loss": 0.1127, "loss_lm": 0.018717501778155565, "loss_seg": 0.09394422546029091, "mean_token_accuracy": 0.9950360059738159, "num_tokens": 1426366541.0, "step": 3356 }, { "entropy": 0.01795028569176793, "epoch": 1.4691979428821533, "grad_norm": 16.625, "learning_rate": 2.1843530048727665e-05, "loss": 0.0838, "loss_lm": 0.01465862151235342, "loss_seg": 0.06909143924713135, "mean_token_accuracy": 0.9952170997858047, "num_tokens": 1426792188.0, "step": 3357 }, { "entropy": 0.018280917312949896, "epoch": 1.4696356275303644, "grad_norm": 5.0, "learning_rate": 2.18408229561451e-05, "loss": 0.1493, "loss_lm": 0.01856110943481326, "loss_seg": 0.1306888908147812, "mean_token_accuracy": 0.995224192738533, "num_tokens": 1427217417.0, "step": 3358 }, { "entropy": 0.018152083735913038, "epoch": 1.4700733121785754, "grad_norm": 7.3125, "learning_rate": 2.1838115863562536e-05, "loss": 0.0968, "loss_lm": 0.015383363701403141, "loss_seg": 0.0814614575356245, "mean_token_accuracy": 0.9953386783599854, "num_tokens": 1427642570.0, "step": 3359 }, { "entropy": 0.018870890606194735, "epoch": 1.4705109968267864, "grad_norm": 8.5, "learning_rate": 2.183540877097997e-05, "loss": 0.1638, "loss_lm": 0.018617278896272182, "loss_seg": 0.14516187831759453, "mean_token_accuracy": 0.9951136559247971, "num_tokens": 1428068218.0, "step": 3360 }, { "entropy": 0.018936113454401493, "epoch": 1.4709486814749972, "grad_norm": 5.8125, "learning_rate": 2.1832701678397403e-05, "loss": 0.1129, "loss_lm": 0.017474679509177804, "loss_seg": 0.09538511838763952, "mean_token_accuracy": 0.9951492697000504, "num_tokens": 1428493058.0, "step": 3361 }, { "entropy": 0.018155698664486408, "epoch": 1.4713863661232083, "grad_norm": 8.875, "learning_rate": 2.1829994585814833e-05, "loss": 0.1031, "loss_lm": 0.01806217269040644, "loss_seg": 0.0850836280733347, "mean_token_accuracy": 0.9952698200941086, "num_tokens": 1428918392.0, "step": 3362 }, { "entropy": 0.018387350719422102, "epoch": 1.4718240507714193, "grad_norm": 3.984375, "learning_rate": 2.1827287493232267e-05, "loss": 0.1108, "loss_lm": 0.015499879838898778, "loss_seg": 0.09526707418262959, "mean_token_accuracy": 0.9951499849557877, "num_tokens": 1429343296.0, "step": 3363 }, { "entropy": 0.01809570798650384, "epoch": 1.4722617354196301, "grad_norm": 4.65625, "learning_rate": 2.1824580400649704e-05, "loss": 0.1077, "loss_lm": 0.017467860132455826, "loss_seg": 0.09025590494275093, "mean_token_accuracy": 0.9953717291355133, "num_tokens": 1429767586.0, "step": 3364 }, { "entropy": 0.019653061404824257, "epoch": 1.4726994200678412, "grad_norm": 9.4375, "learning_rate": 2.1821873308067138e-05, "loss": 0.1076, "loss_lm": 0.015586544061079621, "loss_seg": 0.09204586036503315, "mean_token_accuracy": 0.9948921650648117, "num_tokens": 1430192232.0, "step": 3365 }, { "entropy": 0.01771110389381647, "epoch": 1.4731371047160522, "grad_norm": 9.5625, "learning_rate": 2.181916621548457e-05, "loss": 0.0868, "loss_lm": 0.015423242701217532, "loss_seg": 0.07139421254396439, "mean_token_accuracy": 0.9953486025333405, "num_tokens": 1430616576.0, "step": 3366 }, { "entropy": 0.018257999792695045, "epoch": 1.473574789364263, "grad_norm": 7.75, "learning_rate": 2.1816459122902002e-05, "loss": 0.1345, "loss_lm": 0.01691359910182655, "loss_seg": 0.11757840868085623, "mean_token_accuracy": 0.9951459318399429, "num_tokens": 1431042360.0, "step": 3367 }, { "entropy": 0.018638087436556816, "epoch": 1.474012474012474, "grad_norm": 11.0, "learning_rate": 2.1813752030319436e-05, "loss": 0.1164, "loss_lm": 0.015085522783920169, "loss_seg": 0.10128769651055336, "mean_token_accuracy": 0.9951049238443375, "num_tokens": 1431467866.0, "step": 3368 }, { "entropy": 0.019136637449264526, "epoch": 1.474450158660685, "grad_norm": 10.5625, "learning_rate": 2.1811044937736873e-05, "loss": 0.1377, "loss_lm": 0.01989753684028983, "loss_seg": 0.11776384338736534, "mean_token_accuracy": 0.9949009865522385, "num_tokens": 1431893212.0, "step": 3369 }, { "entropy": 0.017831500619649887, "epoch": 1.4748878433088959, "grad_norm": 12.5, "learning_rate": 2.1808337845154307e-05, "loss": 0.1184, "loss_lm": 0.01883115479722619, "loss_seg": 0.09953247755765915, "mean_token_accuracy": 0.995366245508194, "num_tokens": 1432318942.0, "step": 3370 }, { "entropy": 0.018685481511056423, "epoch": 1.475325527957107, "grad_norm": 14.4375, "learning_rate": 2.1805630752571737e-05, "loss": 0.1371, "loss_lm": 0.01390391169115901, "loss_seg": 0.12316776067018509, "mean_token_accuracy": 0.9951238334178925, "num_tokens": 1432744220.0, "step": 3371 }, { "entropy": 0.018264719285070896, "epoch": 1.475763212605318, "grad_norm": 4.0, "learning_rate": 2.180292365998917e-05, "loss": 0.1369, "loss_lm": 0.015263237291947007, "loss_seg": 0.1216532364487648, "mean_token_accuracy": 0.9952147454023361, "num_tokens": 1433169656.0, "step": 3372 }, { "entropy": 0.01860372396185994, "epoch": 1.4762008972535288, "grad_norm": 6.28125, "learning_rate": 2.1800216567406604e-05, "loss": 0.1253, "loss_lm": 0.014804972801357508, "loss_seg": 0.11047953180968761, "mean_token_accuracy": 0.9951217472553253, "num_tokens": 1433594975.0, "step": 3373 }, { "entropy": 0.01889774715527892, "epoch": 1.4766385819017398, "grad_norm": 8.8125, "learning_rate": 2.179750947482404e-05, "loss": 0.0968, "loss_lm": 0.017752576852217317, "loss_seg": 0.07906580995768309, "mean_token_accuracy": 0.9950015395879745, "num_tokens": 1434020477.0, "step": 3374 }, { "entropy": 0.018524003215134144, "epoch": 1.4770762665499508, "grad_norm": 9.375, "learning_rate": 2.1794802382241475e-05, "loss": 0.145, "loss_lm": 0.014755287673324347, "loss_seg": 0.13024152629077435, "mean_token_accuracy": 0.9952579587697983, "num_tokens": 1434445575.0, "step": 3375 }, { "entropy": 0.01923598162829876, "epoch": 1.4775139511981616, "grad_norm": 15.375, "learning_rate": 2.1792095289658906e-05, "loss": 0.136, "loss_lm": 0.01690419763326645, "loss_seg": 0.11913923174142838, "mean_token_accuracy": 0.9950151890516281, "num_tokens": 1434870201.0, "step": 3376 }, { "entropy": 0.018849694170057774, "epoch": 1.4779516358463727, "grad_norm": 7.0, "learning_rate": 2.178938819707634e-05, "loss": 0.1495, "loss_lm": 0.016207764158025384, "loss_seg": 0.13328539580106735, "mean_token_accuracy": 0.9950884729623795, "num_tokens": 1435295080.0, "step": 3377 }, { "entropy": 0.018502947874367237, "epoch": 1.4783893204945837, "grad_norm": 7.65625, "learning_rate": 2.1786681104493773e-05, "loss": 0.0969, "loss_lm": 0.015813726698979735, "loss_seg": 0.08103664591908455, "mean_token_accuracy": 0.9951993823051453, "num_tokens": 1435719974.0, "step": 3378 }, { "entropy": 0.018859424628317356, "epoch": 1.4788270051427945, "grad_norm": 13.25, "learning_rate": 2.178397401191121e-05, "loss": 0.1078, "loss_lm": 0.01690965029411018, "loss_seg": 0.09092999435961246, "mean_token_accuracy": 0.9951298087835312, "num_tokens": 1436144763.0, "step": 3379 }, { "entropy": 0.018694125581532717, "epoch": 1.4792646897910056, "grad_norm": 6.9375, "learning_rate": 2.1781266919328644e-05, "loss": 0.1495, "loss_lm": 0.016217024065554142, "loss_seg": 0.1332997102290392, "mean_token_accuracy": 0.9952013790607452, "num_tokens": 1436569648.0, "step": 3380 }, { "entropy": 0.01817019423469901, "epoch": 1.4797023744392166, "grad_norm": 4.59375, "learning_rate": 2.1778559826746074e-05, "loss": 0.0995, "loss_lm": 0.01466128509491682, "loss_seg": 0.08484579436480999, "mean_token_accuracy": 0.9953225553035736, "num_tokens": 1436993731.0, "step": 3381 }, { "entropy": 0.018681522458791733, "epoch": 1.4801400590874274, "grad_norm": 5.65625, "learning_rate": 2.1775852734163508e-05, "loss": 0.1118, "loss_lm": 0.016197098419070244, "loss_seg": 0.09555909037590027, "mean_token_accuracy": 0.995236411690712, "num_tokens": 1437419219.0, "step": 3382 }, { "entropy": 0.018481530714780092, "epoch": 1.4805777437356384, "grad_norm": 7.5, "learning_rate": 2.177314564158094e-05, "loss": 0.1436, "loss_lm": 0.017389661632478237, "loss_seg": 0.1262530330568552, "mean_token_accuracy": 0.9951529502868652, "num_tokens": 1437844159.0, "step": 3383 }, { "entropy": 0.018098172266036272, "epoch": 1.4810154283838495, "grad_norm": 8.5, "learning_rate": 2.177043854899838e-05, "loss": 0.1143, "loss_lm": 0.014311169506981969, "loss_seg": 0.10000565834343433, "mean_token_accuracy": 0.9953188598155975, "num_tokens": 1438269014.0, "step": 3384 }, { "entropy": 0.018707032781094313, "epoch": 1.4814531130320603, "grad_norm": 5.8125, "learning_rate": 2.1767731456415812e-05, "loss": 0.1301, "loss_lm": 0.017883961787447333, "loss_seg": 0.11224056966602802, "mean_token_accuracy": 0.9952152967453003, "num_tokens": 1438694080.0, "step": 3385 }, { "entropy": 0.01900896057486534, "epoch": 1.4818907976802713, "grad_norm": 10.125, "learning_rate": 2.1765024363833243e-05, "loss": 0.1138, "loss_lm": 0.017153527354821563, "loss_seg": 0.09659822564572096, "mean_token_accuracy": 0.9951500445604324, "num_tokens": 1439119620.0, "step": 3386 }, { "entropy": 0.018790470901876688, "epoch": 1.4823284823284824, "grad_norm": 29.625, "learning_rate": 2.1762317271250677e-05, "loss": 0.1384, "loss_lm": 0.016350994585081935, "loss_seg": 0.12204620614647865, "mean_token_accuracy": 0.9951138943433762, "num_tokens": 1439544673.0, "step": 3387 }, { "entropy": 0.018697576131671667, "epoch": 1.4827661669766932, "grad_norm": 5.3125, "learning_rate": 2.175961017866811e-05, "loss": 0.1191, "loss_lm": 0.01731138350442052, "loss_seg": 0.10179189685732126, "mean_token_accuracy": 0.9952400475740433, "num_tokens": 1439969447.0, "step": 3388 }, { "entropy": 0.017771616578102112, "epoch": 1.4832038516249042, "grad_norm": 6.53125, "learning_rate": 2.1756903086085544e-05, "loss": 0.0979, "loss_lm": 0.014035837259143591, "loss_seg": 0.08388079144060612, "mean_token_accuracy": 0.9954546689987183, "num_tokens": 1440394447.0, "step": 3389 }, { "entropy": 0.018458842299878597, "epoch": 1.4836415362731152, "grad_norm": 16.5, "learning_rate": 2.175419599350298e-05, "loss": 0.1547, "loss_lm": 0.015477299457415938, "loss_seg": 0.13926992006599903, "mean_token_accuracy": 0.9953462928533554, "num_tokens": 1440820065.0, "step": 3390 }, { "entropy": 0.018564981874078512, "epoch": 1.484079220921326, "grad_norm": 13.0, "learning_rate": 2.175148890092041e-05, "loss": 0.0992, "loss_lm": 0.013636426301673055, "loss_seg": 0.08552291430532932, "mean_token_accuracy": 0.9952406138181686, "num_tokens": 1441245460.0, "step": 3391 }, { "entropy": 0.018472825177013874, "epoch": 1.484516905569537, "grad_norm": 12.875, "learning_rate": 2.1748781808337845e-05, "loss": 0.1116, "loss_lm": 0.014803102239966393, "loss_seg": 0.09682402946054935, "mean_token_accuracy": 0.9952022284269333, "num_tokens": 1441669596.0, "step": 3392 }, { "entropy": 0.018222596030682325, "epoch": 1.4849545902177481, "grad_norm": 11.3125, "learning_rate": 2.174607471575528e-05, "loss": 0.0987, "loss_lm": 0.016891272040084004, "loss_seg": 0.08183069806545973, "mean_token_accuracy": 0.9952975660562515, "num_tokens": 1442094020.0, "step": 3393 }, { "entropy": 0.017977542709559202, "epoch": 1.4853922748659592, "grad_norm": 6.71875, "learning_rate": 2.1743367623172713e-05, "loss": 0.1364, "loss_lm": 0.01622926234267652, "loss_seg": 0.1201443886384368, "mean_token_accuracy": 0.995314821600914, "num_tokens": 1442519566.0, "step": 3394 }, { "entropy": 0.018599567469209433, "epoch": 1.48582995951417, "grad_norm": 5.625, "learning_rate": 2.1740660530590146e-05, "loss": 0.1249, "loss_lm": 0.01669353200122714, "loss_seg": 0.1081802286207676, "mean_token_accuracy": 0.9952119886875153, "num_tokens": 1442945407.0, "step": 3395 }, { "entropy": 0.017816448118537664, "epoch": 1.486267644162381, "grad_norm": 10.6875, "learning_rate": 2.173795343800758e-05, "loss": 0.1544, "loss_lm": 0.015545310219749808, "loss_seg": 0.13883099146187305, "mean_token_accuracy": 0.9953274726867676, "num_tokens": 1443370196.0, "step": 3396 }, { "entropy": 0.017588475719094276, "epoch": 1.486705328810592, "grad_norm": 7.0, "learning_rate": 2.1735246345425014e-05, "loss": 0.1731, "loss_lm": 0.014266568468883634, "loss_seg": 0.15882512368261814, "mean_token_accuracy": 0.9953706115484238, "num_tokens": 1443795038.0, "step": 3397 }, { "entropy": 0.01806698366999626, "epoch": 1.487143013458803, "grad_norm": 8.25, "learning_rate": 2.1732539252842447e-05, "loss": 0.1231, "loss_lm": 0.01568649895489216, "loss_seg": 0.1073720995336771, "mean_token_accuracy": 0.9954198598861694, "num_tokens": 1444219637.0, "step": 3398 }, { "entropy": 0.018890843261033297, "epoch": 1.4875806981070139, "grad_norm": 6.3125, "learning_rate": 2.172983216025988e-05, "loss": 0.1111, "loss_lm": 0.013687298400327563, "loss_seg": 0.0974618699401617, "mean_token_accuracy": 0.9951571673154831, "num_tokens": 1444645184.0, "step": 3399 }, { "entropy": 0.018088265787810087, "epoch": 1.488018382755225, "grad_norm": 18.375, "learning_rate": 2.1727125067677315e-05, "loss": 0.1228, "loss_lm": 0.016284196637570858, "loss_seg": 0.10652986820787191, "mean_token_accuracy": 0.9954028874635696, "num_tokens": 1445070390.0, "step": 3400 }, { "entropy": 0.018675114028155804, "epoch": 1.488456067403436, "grad_norm": 8.25, "learning_rate": 2.172441797509475e-05, "loss": 0.1708, "loss_lm": 0.015515135135501623, "loss_seg": 0.1552771795541048, "mean_token_accuracy": 0.995274156332016, "num_tokens": 1445496313.0, "step": 3401 }, { "entropy": 0.01887081190943718, "epoch": 1.4888937520516468, "grad_norm": 4.46875, "learning_rate": 2.1721710882512182e-05, "loss": 0.1658, "loss_lm": 0.014585240045562387, "loss_seg": 0.15124106407165527, "mean_token_accuracy": 0.9953126907348633, "num_tokens": 1445921480.0, "step": 3402 }, { "entropy": 0.01855866936966777, "epoch": 1.4893314366998578, "grad_norm": 22.25, "learning_rate": 2.1719003789929616e-05, "loss": 0.1583, "loss_lm": 0.018372246297076344, "loss_seg": 0.13993990421295166, "mean_token_accuracy": 0.9951546043157578, "num_tokens": 1446346407.0, "step": 3403 }, { "entropy": 0.018720707390457392, "epoch": 1.4897691213480688, "grad_norm": 4.4375, "learning_rate": 2.171629669734705e-05, "loss": 0.1212, "loss_lm": 0.013552231714129448, "loss_seg": 0.10760262794792652, "mean_token_accuracy": 0.9950722008943558, "num_tokens": 1446771539.0, "step": 3404 }, { "entropy": 0.018953962717205286, "epoch": 1.4902068059962796, "grad_norm": 9.625, "learning_rate": 2.1713589604764484e-05, "loss": 0.1502, "loss_lm": 0.01895686681382358, "loss_seg": 0.13120965473353863, "mean_token_accuracy": 0.9951317310333252, "num_tokens": 1447196885.0, "step": 3405 }, { "entropy": 0.0191032444126904, "epoch": 1.4906444906444907, "grad_norm": 6.9375, "learning_rate": 2.1710882512181917e-05, "loss": 0.1213, "loss_lm": 0.014961856184527278, "loss_seg": 0.10633279196918011, "mean_token_accuracy": 0.9951397180557251, "num_tokens": 1447622141.0, "step": 3406 }, { "entropy": 0.01843049516901374, "epoch": 1.4910821752927017, "grad_norm": 22.625, "learning_rate": 2.170817541959935e-05, "loss": 0.1249, "loss_lm": 0.018412486650049686, "loss_seg": 0.1064989548176527, "mean_token_accuracy": 0.9952477365732193, "num_tokens": 1448047358.0, "step": 3407 }, { "entropy": 0.018192298710346222, "epoch": 1.4915198599409125, "grad_norm": 13.0625, "learning_rate": 2.1705468327016785e-05, "loss": 0.1221, "loss_lm": 0.015739195281639695, "loss_seg": 0.10639884136617184, "mean_token_accuracy": 0.9952919334173203, "num_tokens": 1448472328.0, "step": 3408 }, { "entropy": 0.018728526774793863, "epoch": 1.4919575445891236, "grad_norm": 14.125, "learning_rate": 2.170276123443422e-05, "loss": 0.1245, "loss_lm": 0.0161147762555629, "loss_seg": 0.10840432904660702, "mean_token_accuracy": 0.9952615350484848, "num_tokens": 1448896686.0, "step": 3409 }, { "entropy": 0.018179151229560375, "epoch": 1.4923952292373346, "grad_norm": 5.75, "learning_rate": 2.1700054141851652e-05, "loss": 0.1083, "loss_lm": 0.016895256703719497, "loss_seg": 0.09143531136214733, "mean_token_accuracy": 0.9954048693180084, "num_tokens": 1449321859.0, "step": 3410 }, { "entropy": 0.018373547587543726, "epoch": 1.4928329138855454, "grad_norm": 7.25, "learning_rate": 2.1697347049269086e-05, "loss": 0.0962, "loss_lm": 0.014634528430178761, "loss_seg": 0.08160638716071844, "mean_token_accuracy": 0.99531389772892, "num_tokens": 1449747093.0, "step": 3411 }, { "entropy": 0.01825046446174383, "epoch": 1.4932705985337564, "grad_norm": 47.0, "learning_rate": 2.169463995668652e-05, "loss": 0.0852, "loss_lm": 0.01580025115981698, "loss_seg": 0.06937793269753456, "mean_token_accuracy": 0.9952328652143478, "num_tokens": 1450171620.0, "step": 3412 }, { "entropy": 0.018854651134461164, "epoch": 1.4937082831819675, "grad_norm": 5.96875, "learning_rate": 2.1691932864103953e-05, "loss": 0.1183, "loss_lm": 0.015577463200315833, "loss_seg": 0.1027093157172203, "mean_token_accuracy": 0.995207667350769, "num_tokens": 1450596834.0, "step": 3413 }, { "entropy": 0.01820147642865777, "epoch": 1.4941459678301783, "grad_norm": 8.75, "learning_rate": 2.1689225771521387e-05, "loss": 0.1287, "loss_lm": 0.015105052152648568, "loss_seg": 0.1136064063757658, "mean_token_accuracy": 0.995320275425911, "num_tokens": 1451021774.0, "step": 3414 }, { "entropy": 0.018715631682425737, "epoch": 1.4945836524783893, "grad_norm": 10.875, "learning_rate": 2.168651867893882e-05, "loss": 0.1224, "loss_lm": 0.016146440291777253, "loss_seg": 0.10627695731818676, "mean_token_accuracy": 0.9951188862323761, "num_tokens": 1451447039.0, "step": 3415 }, { "entropy": 0.018080200999975204, "epoch": 1.4950213371266003, "grad_norm": 9.0, "learning_rate": 2.1683811586356255e-05, "loss": 0.1134, "loss_lm": 0.014908995013684034, "loss_seg": 0.09848686493933201, "mean_token_accuracy": 0.9953431934118271, "num_tokens": 1451871102.0, "step": 3416 }, { "entropy": 0.018310085870325565, "epoch": 1.4954590217748112, "grad_norm": 24.625, "learning_rate": 2.1681104493773688e-05, "loss": 0.1, "loss_lm": 0.01704810676164925, "loss_seg": 0.08298835717141628, "mean_token_accuracy": 0.9952626377344131, "num_tokens": 1452295994.0, "step": 3417 }, { "entropy": 0.017622766084969044, "epoch": 1.4958967064230222, "grad_norm": 5.5, "learning_rate": 2.1678397401191122e-05, "loss": 0.1086, "loss_lm": 0.015303958905860782, "loss_seg": 0.09331728518009186, "mean_token_accuracy": 0.995502158999443, "num_tokens": 1452721684.0, "step": 3418 }, { "entropy": 0.01846957392990589, "epoch": 1.4963343910712332, "grad_norm": 7.9375, "learning_rate": 2.1675690308608552e-05, "loss": 0.1403, "loss_lm": 0.014646897092461586, "loss_seg": 0.1256242459639907, "mean_token_accuracy": 0.9953043013811111, "num_tokens": 1453146508.0, "step": 3419 }, { "entropy": 0.01811104966327548, "epoch": 1.496772075719444, "grad_norm": 8.375, "learning_rate": 2.1672983216025986e-05, "loss": 0.1219, "loss_lm": 0.017020249739289284, "loss_seg": 0.10489755962044, "mean_token_accuracy": 0.9954186528921127, "num_tokens": 1453572729.0, "step": 3420 }, { "entropy": 0.01899902895092964, "epoch": 1.497209760367655, "grad_norm": 8.375, "learning_rate": 2.1670276123443423e-05, "loss": 0.1248, "loss_lm": 0.015319504076614976, "loss_seg": 0.10949291475117207, "mean_token_accuracy": 0.9952488839626312, "num_tokens": 1453998486.0, "step": 3421 }, { "entropy": 0.01891028881072998, "epoch": 1.497647445015866, "grad_norm": 9.0, "learning_rate": 2.1667569030860857e-05, "loss": 0.1518, "loss_lm": 0.020444239489734173, "loss_seg": 0.13135329633951187, "mean_token_accuracy": 0.9951544404029846, "num_tokens": 1454423166.0, "step": 3422 }, { "entropy": 0.019333472475409508, "epoch": 1.498085129664077, "grad_norm": 14.3125, "learning_rate": 2.166486193827829e-05, "loss": 0.1081, "loss_lm": 0.015118898125365376, "loss_seg": 0.09295330941677094, "mean_token_accuracy": 0.9950321614742279, "num_tokens": 1454849276.0, "step": 3423 }, { "entropy": 0.01857026480138302, "epoch": 1.498522814312288, "grad_norm": 8.1875, "learning_rate": 2.166215484569572e-05, "loss": 0.1176, "loss_lm": 0.01565809058956802, "loss_seg": 0.10191113315522671, "mean_token_accuracy": 0.9952321499586105, "num_tokens": 1455274171.0, "step": 3424 }, { "entropy": 0.01900674309581518, "epoch": 1.498960498960499, "grad_norm": 30.25, "learning_rate": 2.1659447753113155e-05, "loss": 0.139, "loss_lm": 0.015544071793556213, "loss_seg": 0.12349303625524044, "mean_token_accuracy": 0.9951309859752655, "num_tokens": 1455699482.0, "step": 3425 }, { "entropy": 0.018943706061691046, "epoch": 1.4993981836087098, "grad_norm": 7.21875, "learning_rate": 2.1656740660530592e-05, "loss": 0.1026, "loss_lm": 0.015519385924562812, "loss_seg": 0.08707525208592415, "mean_token_accuracy": 0.9951889216899872, "num_tokens": 1456124710.0, "step": 3426 }, { "entropy": 0.01837964216247201, "epoch": 1.4998358682569208, "grad_norm": 5.21875, "learning_rate": 2.1654033567948026e-05, "loss": 0.1466, "loss_lm": 0.015757819870486856, "loss_seg": 0.13085496984422207, "mean_token_accuracy": 0.9951779693365097, "num_tokens": 1456550182.0, "step": 3427 }, { "entropy": 0.018638359382748604, "epoch": 1.5002735529051319, "grad_norm": 4.65625, "learning_rate": 2.165132647536546e-05, "loss": 0.1181, "loss_lm": 0.015988284489139915, "loss_seg": 0.10212249867618084, "mean_token_accuracy": 0.9951349347829819, "num_tokens": 1456974790.0, "step": 3428 }, { "entropy": 0.019151389598846436, "epoch": 1.5007112375533427, "grad_norm": 5.75, "learning_rate": 2.164861938278289e-05, "loss": 0.1111, "loss_lm": 0.019220507005229592, "loss_seg": 0.09185048006474972, "mean_token_accuracy": 0.9950896799564362, "num_tokens": 1457400385.0, "step": 3429 }, { "entropy": 0.018734722398221493, "epoch": 1.501148922201554, "grad_norm": 30.625, "learning_rate": 2.1645912290200323e-05, "loss": 0.1115, "loss_lm": 0.016433299519121647, "loss_seg": 0.09509469009935856, "mean_token_accuracy": 0.9951712489128113, "num_tokens": 1457825924.0, "step": 3430 }, { "entropy": 0.018206140026450157, "epoch": 1.5015866068497647, "grad_norm": 9.5625, "learning_rate": 2.164320519761776e-05, "loss": 0.1557, "loss_lm": 0.01890431996434927, "loss_seg": 0.13679993897676468, "mean_token_accuracy": 0.995265781879425, "num_tokens": 1458250221.0, "step": 3431 }, { "entropy": 0.019098443910479546, "epoch": 1.5020242914979756, "grad_norm": 6.125, "learning_rate": 2.1640498105035194e-05, "loss": 0.1086, "loss_lm": 0.018058995017781854, "loss_seg": 0.09055898897349834, "mean_token_accuracy": 0.9951590597629547, "num_tokens": 1458675926.0, "step": 3432 }, { "entropy": 0.018481159582734108, "epoch": 1.5024619761461868, "grad_norm": 9.8125, "learning_rate": 2.1637791012452628e-05, "loss": 0.1191, "loss_lm": 0.01642263145186007, "loss_seg": 0.10271612741053104, "mean_token_accuracy": 0.9952187240123749, "num_tokens": 1459100461.0, "step": 3433 }, { "entropy": 0.018951816949993372, "epoch": 1.5028996607943976, "grad_norm": 13.3125, "learning_rate": 2.1635083919870058e-05, "loss": 0.1511, "loss_lm": 0.017230321187525988, "loss_seg": 0.13387928064912558, "mean_token_accuracy": 0.9951877295970917, "num_tokens": 1459525405.0, "step": 3434 }, { "entropy": 0.017955064307898283, "epoch": 1.5033373454426084, "grad_norm": 5.125, "learning_rate": 2.1632376827287492e-05, "loss": 0.0937, "loss_lm": 0.014902981230989099, "loss_seg": 0.07884698081761599, "mean_token_accuracy": 0.9953065663576126, "num_tokens": 1459950408.0, "step": 3435 }, { "entropy": 0.01839050045236945, "epoch": 1.5037750300908197, "grad_norm": 23.25, "learning_rate": 2.162966973470493e-05, "loss": 0.0887, "loss_lm": 0.01531901815906167, "loss_seg": 0.07340471353381872, "mean_token_accuracy": 0.9952667951583862, "num_tokens": 1460375022.0, "step": 3436 }, { "entropy": 0.01858749706298113, "epoch": 1.5042127147390305, "grad_norm": 6.46875, "learning_rate": 2.1626962642122363e-05, "loss": 0.1438, "loss_lm": 0.015158969908952713, "loss_seg": 0.12863756716251373, "mean_token_accuracy": 0.9952331781387329, "num_tokens": 1460799578.0, "step": 3437 }, { "entropy": 0.018741412553936243, "epoch": 1.5046503993872415, "grad_norm": 8.8125, "learning_rate": 2.1624255549539797e-05, "loss": 0.1343, "loss_lm": 0.015004369430243969, "loss_seg": 0.11926853656768799, "mean_token_accuracy": 0.9950325340032578, "num_tokens": 1461224513.0, "step": 3438 }, { "entropy": 0.017822700552642345, "epoch": 1.5050880840354526, "grad_norm": 5.875, "learning_rate": 2.1621548456957227e-05, "loss": 0.0871, "loss_lm": 0.015050101559609175, "loss_seg": 0.0720850732177496, "mean_token_accuracy": 0.9954447597265244, "num_tokens": 1461650340.0, "step": 3439 }, { "entropy": 0.018041993025690317, "epoch": 1.5055257686836634, "grad_norm": 7.75, "learning_rate": 2.161884136437466e-05, "loss": 0.1294, "loss_lm": 0.014287085505202413, "loss_seg": 0.1150836106389761, "mean_token_accuracy": 0.9952274560928345, "num_tokens": 1462075005.0, "step": 3440 }, { "entropy": 0.018474940676242113, "epoch": 1.5059634533318744, "grad_norm": 4.75, "learning_rate": 2.1616134271792098e-05, "loss": 0.113, "loss_lm": 0.015661303885281086, "loss_seg": 0.09732447471469641, "mean_token_accuracy": 0.9952374845743179, "num_tokens": 1462499745.0, "step": 3441 }, { "entropy": 0.018466119188815355, "epoch": 1.5064011379800855, "grad_norm": 6.03125, "learning_rate": 2.161342717920953e-05, "loss": 0.13, "loss_lm": 0.01484994892962277, "loss_seg": 0.115124037489295, "mean_token_accuracy": 0.9952078014612198, "num_tokens": 1462924327.0, "step": 3442 }, { "entropy": 0.018425097689032555, "epoch": 1.5068388226282963, "grad_norm": 8.75, "learning_rate": 2.1610720086626962e-05, "loss": 0.1164, "loss_lm": 0.014875004533678293, "loss_seg": 0.10148928686976433, "mean_token_accuracy": 0.9953089356422424, "num_tokens": 1463348683.0, "step": 3443 }, { "entropy": 0.018209596164524555, "epoch": 1.5072765072765073, "grad_norm": 7.125, "learning_rate": 2.1608012994044395e-05, "loss": 0.0918, "loss_lm": 0.0180474657099694, "loss_seg": 0.07371875829994678, "mean_token_accuracy": 0.995320275425911, "num_tokens": 1463773876.0, "step": 3444 }, { "entropy": 0.019003817345947027, "epoch": 1.5077141919247183, "grad_norm": 13.5625, "learning_rate": 2.160530590146183e-05, "loss": 0.1177, "loss_lm": 0.016502991085872054, "loss_seg": 0.10122473537921906, "mean_token_accuracy": 0.9951799511909485, "num_tokens": 1464198298.0, "step": 3445 }, { "entropy": 0.018983627669513226, "epoch": 1.5081518765729292, "grad_norm": 22.0, "learning_rate": 2.1602598808879266e-05, "loss": 0.0998, "loss_lm": 0.014541459735482931, "loss_seg": 0.08530479576438665, "mean_token_accuracy": 0.9951821863651276, "num_tokens": 1464623746.0, "step": 3446 }, { "entropy": 0.0189551692456007, "epoch": 1.5085895612211402, "grad_norm": 7.25, "learning_rate": 2.15998917162967e-05, "loss": 0.1088, "loss_lm": 0.017047673696652055, "loss_seg": 0.09179944731295109, "mean_token_accuracy": 0.9951820373535156, "num_tokens": 1465049537.0, "step": 3447 }, { "entropy": 0.017907018307596445, "epoch": 1.5090272458693512, "grad_norm": 11.5625, "learning_rate": 2.159718462371413e-05, "loss": 0.1171, "loss_lm": 0.018820276018232107, "loss_seg": 0.09831232577562332, "mean_token_accuracy": 0.9954230636358261, "num_tokens": 1465473777.0, "step": 3448 }, { "entropy": 0.018889544066041708, "epoch": 1.509464930517562, "grad_norm": 7.46875, "learning_rate": 2.1594477531131564e-05, "loss": 0.1662, "loss_lm": 0.015279099810868502, "loss_seg": 0.1508832024410367, "mean_token_accuracy": 0.9950831234455109, "num_tokens": 1465899020.0, "step": 3449 }, { "entropy": 0.018492347095161676, "epoch": 1.509902615165773, "grad_norm": 5.21875, "learning_rate": 2.1591770438548998e-05, "loss": 0.1571, "loss_lm": 0.01969301328063011, "loss_seg": 0.13739425130188465, "mean_token_accuracy": 0.9952796548604965, "num_tokens": 1466323945.0, "step": 3450 }, { "entropy": 0.018886541482061148, "epoch": 1.510340299813984, "grad_norm": 12.1875, "learning_rate": 2.158906334596643e-05, "loss": 0.1329, "loss_lm": 0.015971305314451456, "loss_seg": 0.11696361191570759, "mean_token_accuracy": 0.9951021373271942, "num_tokens": 1466749403.0, "step": 3451 }, { "entropy": 0.01899748295545578, "epoch": 1.510777984462195, "grad_norm": 79.5, "learning_rate": 2.158635625338387e-05, "loss": 0.1134, "loss_lm": 0.01674424926750362, "loss_seg": 0.0966582428663969, "mean_token_accuracy": 0.9951426088809967, "num_tokens": 1467174280.0, "step": 3452 }, { "entropy": 0.01858010282739997, "epoch": 1.511215669110406, "grad_norm": 7.8125, "learning_rate": 2.15836491608013e-05, "loss": 0.1252, "loss_lm": 0.017106754705309868, "loss_seg": 0.10806425847113132, "mean_token_accuracy": 0.9951672405004501, "num_tokens": 1467598438.0, "step": 3453 }, { "entropy": 0.01791088841855526, "epoch": 1.511653353758617, "grad_norm": 2.59375, "learning_rate": 2.1580942068218733e-05, "loss": 0.0963, "loss_lm": 0.015045556472614408, "loss_seg": 0.08120567537844181, "mean_token_accuracy": 0.9952715635299683, "num_tokens": 1468023647.0, "step": 3454 }, { "entropy": 0.01887335767969489, "epoch": 1.5120910384068278, "grad_norm": 10.0, "learning_rate": 2.1578234975636166e-05, "loss": 0.1047, "loss_lm": 0.01953486818820238, "loss_seg": 0.0851303469389677, "mean_token_accuracy": 0.9951755851507187, "num_tokens": 1468449128.0, "step": 3455 }, { "entropy": 0.018560827244073153, "epoch": 1.5125287230550388, "grad_norm": 6.53125, "learning_rate": 2.15755278830536e-05, "loss": 0.0976, "loss_lm": 0.016772590344771743, "loss_seg": 0.08078763540834188, "mean_token_accuracy": 0.9952889680862427, "num_tokens": 1468875094.0, "step": 3456 }, { "entropy": 0.018252398818731308, "epoch": 1.5129664077032499, "grad_norm": 4.96875, "learning_rate": 2.1572820790471037e-05, "loss": 0.0898, "loss_lm": 0.014403209555894136, "loss_seg": 0.0754300132393837, "mean_token_accuracy": 0.9953202456235886, "num_tokens": 1469299209.0, "step": 3457 }, { "entropy": 0.019374319817870855, "epoch": 1.5134040923514607, "grad_norm": 7.3125, "learning_rate": 2.1570113697888468e-05, "loss": 0.112, "loss_lm": 0.015970731619745493, "loss_seg": 0.09606990590691566, "mean_token_accuracy": 0.9949937909841537, "num_tokens": 1469724233.0, "step": 3458 }, { "entropy": 0.017795283813029528, "epoch": 1.5138417769996717, "grad_norm": 20.25, "learning_rate": 2.15674066053059e-05, "loss": 0.1759, "loss_lm": 0.0156549122184515, "loss_seg": 0.16024985536932945, "mean_token_accuracy": 0.9953385889530182, "num_tokens": 1470149177.0, "step": 3459 }, { "entropy": 0.018285918049514294, "epoch": 1.5142794616478827, "grad_norm": 20.125, "learning_rate": 2.1564699512723335e-05, "loss": 0.1034, "loss_lm": 0.014507747488096356, "loss_seg": 0.08888092823326588, "mean_token_accuracy": 0.9954258948564529, "num_tokens": 1470574344.0, "step": 3460 }, { "entropy": 0.01802448322996497, "epoch": 1.5147171462960936, "grad_norm": 4.5625, "learning_rate": 2.156199242014077e-05, "loss": 0.1016, "loss_lm": 0.01616324670612812, "loss_seg": 0.08545900788158178, "mean_token_accuracy": 0.9952566176652908, "num_tokens": 1470999721.0, "step": 3461 }, { "entropy": 0.018080811481922865, "epoch": 1.5151548309443046, "grad_norm": 31.625, "learning_rate": 2.1559285327558206e-05, "loss": 0.1208, "loss_lm": 0.01569578517228365, "loss_seg": 0.1051099132746458, "mean_token_accuracy": 0.9951931238174438, "num_tokens": 1471424713.0, "step": 3462 }, { "entropy": 0.018514057621359825, "epoch": 1.5155925155925156, "grad_norm": 8.75, "learning_rate": 2.1556578234975636e-05, "loss": 0.1499, "loss_lm": 0.016501310048624873, "loss_seg": 0.13339297845959663, "mean_token_accuracy": 0.9952227473258972, "num_tokens": 1471849368.0, "step": 3463 }, { "entropy": 0.017958436626940966, "epoch": 1.5160302002407264, "grad_norm": 8.5625, "learning_rate": 2.155387114239307e-05, "loss": 0.1447, "loss_lm": 0.017191379563882947, "loss_seg": 0.12753727100789547, "mean_token_accuracy": 0.9952278584241867, "num_tokens": 1472274283.0, "step": 3464 }, { "entropy": 0.01785907195881009, "epoch": 1.5164678848889375, "grad_norm": 10.375, "learning_rate": 2.1551164049810504e-05, "loss": 0.1307, "loss_lm": 0.014778245473280549, "loss_seg": 0.11596857383847237, "mean_token_accuracy": 0.9953914284706116, "num_tokens": 1472699479.0, "step": 3465 }, { "entropy": 0.018208160530775785, "epoch": 1.5169055695371485, "grad_norm": 13.8125, "learning_rate": 2.1548456957227937e-05, "loss": 0.1344, "loss_lm": 0.016387401381507516, "loss_seg": 0.11805619299411774, "mean_token_accuracy": 0.995304524898529, "num_tokens": 1473124325.0, "step": 3466 }, { "entropy": 0.01843173848465085, "epoch": 1.5173432541853593, "grad_norm": 6.9375, "learning_rate": 2.154574986464537e-05, "loss": 0.1272, "loss_lm": 0.0166676111984998, "loss_seg": 0.11052658408880234, "mean_token_accuracy": 0.9950437247753143, "num_tokens": 1473548816.0, "step": 3467 }, { "entropy": 0.01955985836684704, "epoch": 1.5177809388335706, "grad_norm": 7.5, "learning_rate": 2.1543042772062805e-05, "loss": 0.1284, "loss_lm": 0.018781655933707952, "loss_seg": 0.10966642946004868, "mean_token_accuracy": 0.9948633313179016, "num_tokens": 1473974772.0, "step": 3468 }, { "entropy": 0.01805133745074272, "epoch": 1.5182186234817814, "grad_norm": 12.3125, "learning_rate": 2.154033567948024e-05, "loss": 0.102, "loss_lm": 0.01531226304359734, "loss_seg": 0.08671188168227673, "mean_token_accuracy": 0.9953078031539917, "num_tokens": 1474399234.0, "step": 3469 }, { "entropy": 0.01845029601827264, "epoch": 1.5186563081299922, "grad_norm": 4.6875, "learning_rate": 2.1537628586897672e-05, "loss": 0.104, "loss_lm": 0.014344749506562948, "loss_seg": 0.0896325958892703, "mean_token_accuracy": 0.9952053874731064, "num_tokens": 1474824261.0, "step": 3470 }, { "entropy": 0.018405080307275057, "epoch": 1.5190939927782035, "grad_norm": 7.3125, "learning_rate": 2.1534921494315106e-05, "loss": 0.0945, "loss_lm": 0.01540056080557406, "loss_seg": 0.07914500683546066, "mean_token_accuracy": 0.9952212870121002, "num_tokens": 1475249025.0, "step": 3471 }, { "entropy": 0.01858367631211877, "epoch": 1.5195316774264143, "grad_norm": 14.4375, "learning_rate": 2.153221440173254e-05, "loss": 0.1137, "loss_lm": 0.01773592666722834, "loss_seg": 0.0960109680891037, "mean_token_accuracy": 0.9951479136943817, "num_tokens": 1475674593.0, "step": 3472 }, { "entropy": 0.01812929380685091, "epoch": 1.519969362074625, "grad_norm": 8.625, "learning_rate": 2.1529507309149974e-05, "loss": 0.0869, "loss_lm": 0.017137132352218032, "loss_seg": 0.06975529715418816, "mean_token_accuracy": 0.995343491435051, "num_tokens": 1476099321.0, "step": 3473 }, { "entropy": 0.01866683503612876, "epoch": 1.5204070467228363, "grad_norm": 10.0, "learning_rate": 2.1526800216567407e-05, "loss": 0.1249, "loss_lm": 0.016599047230556607, "loss_seg": 0.10833664052188396, "mean_token_accuracy": 0.9950988292694092, "num_tokens": 1476524498.0, "step": 3474 }, { "entropy": 0.017769981175661087, "epoch": 1.5208447313710471, "grad_norm": 10.75, "learning_rate": 2.152409312398484e-05, "loss": 0.1227, "loss_lm": 0.016641503665596247, "loss_seg": 0.10601836629211903, "mean_token_accuracy": 0.995393306016922, "num_tokens": 1476948605.0, "step": 3475 }, { "entropy": 0.017674379982054234, "epoch": 1.5212824160192582, "grad_norm": 5.5, "learning_rate": 2.1521386031402275e-05, "loss": 0.1043, "loss_lm": 0.013742247363552451, "loss_seg": 0.09056235291063786, "mean_token_accuracy": 0.9953366816043854, "num_tokens": 1477373363.0, "step": 3476 }, { "entropy": 0.018102307803928852, "epoch": 1.5217201006674692, "grad_norm": 4.53125, "learning_rate": 2.151867893881971e-05, "loss": 0.1198, "loss_lm": 0.014387231320142746, "loss_seg": 0.1053960807621479, "mean_token_accuracy": 0.995229572057724, "num_tokens": 1477797188.0, "step": 3477 }, { "entropy": 0.017946198116987944, "epoch": 1.52215778531568, "grad_norm": 5.0625, "learning_rate": 2.1515971846237142e-05, "loss": 0.1503, "loss_lm": 0.014804846839979291, "loss_seg": 0.13548614084720612, "mean_token_accuracy": 0.9952492564916611, "num_tokens": 1478222134.0, "step": 3478 }, { "entropy": 0.018044146709144115, "epoch": 1.522595469963891, "grad_norm": 9.3125, "learning_rate": 2.1513264753654576e-05, "loss": 0.1063, "loss_lm": 0.014664796879515052, "loss_seg": 0.09166929498314857, "mean_token_accuracy": 0.9953723251819611, "num_tokens": 1478646698.0, "step": 3479 }, { "entropy": 0.018453523982316256, "epoch": 1.523033154612102, "grad_norm": 50.25, "learning_rate": 2.151055766107201e-05, "loss": 0.1455, "loss_lm": 0.0152168411295861, "loss_seg": 0.13031454756855965, "mean_token_accuracy": 0.9953182488679886, "num_tokens": 1479071703.0, "step": 3480 }, { "entropy": 0.01837372174486518, "epoch": 1.523470839260313, "grad_norm": 10.375, "learning_rate": 2.1507850568489443e-05, "loss": 0.1277, "loss_lm": 0.015174772823229432, "loss_seg": 0.11254533566534519, "mean_token_accuracy": 0.9952740371227264, "num_tokens": 1479496951.0, "step": 3481 }, { "entropy": 0.018557394854724407, "epoch": 1.523908523908524, "grad_norm": 9.6875, "learning_rate": 2.1505143475906877e-05, "loss": 0.1344, "loss_lm": 0.01849371613934636, "loss_seg": 0.11588465329259634, "mean_token_accuracy": 0.9952179938554764, "num_tokens": 1479922478.0, "step": 3482 }, { "entropy": 0.01856965385377407, "epoch": 1.524346208556735, "grad_norm": 7.625, "learning_rate": 2.150243638332431e-05, "loss": 0.1408, "loss_lm": 0.01718536182306707, "loss_seg": 0.1236087903380394, "mean_token_accuracy": 0.9952558279037476, "num_tokens": 1480347810.0, "step": 3483 }, { "entropy": 0.01858177874237299, "epoch": 1.5247838932049458, "grad_norm": 22.75, "learning_rate": 2.1499729290741745e-05, "loss": 0.1291, "loss_lm": 0.016676851781085134, "loss_seg": 0.11239723768085241, "mean_token_accuracy": 0.9952432662248611, "num_tokens": 1480773248.0, "step": 3484 }, { "entropy": 0.01807291992008686, "epoch": 1.5252215778531568, "grad_norm": 7.8125, "learning_rate": 2.1497022198159178e-05, "loss": 0.1083, "loss_lm": 0.017577510327100754, "loss_seg": 0.09069116972386837, "mean_token_accuracy": 0.9953422844409943, "num_tokens": 1481198096.0, "step": 3485 }, { "entropy": 0.018413844518363476, "epoch": 1.5256592625013679, "grad_norm": 32.25, "learning_rate": 2.149431510557661e-05, "loss": 0.1228, "loss_lm": 0.014748773304745555, "loss_seg": 0.10808606632053852, "mean_token_accuracy": 0.9953346401453018, "num_tokens": 1481623747.0, "step": 3486 }, { "entropy": 0.018971845041960478, "epoch": 1.5260969471495787, "grad_norm": 6.75, "learning_rate": 2.1491608012994042e-05, "loss": 0.1378, "loss_lm": 0.019187291152775288, "loss_seg": 0.1185721755027771, "mean_token_accuracy": 0.9949987679719925, "num_tokens": 1482049048.0, "step": 3487 }, { "entropy": 0.01893228804692626, "epoch": 1.5265346317977897, "grad_norm": 13.25, "learning_rate": 2.148890092041148e-05, "loss": 0.1306, "loss_lm": 0.014093935256823897, "loss_seg": 0.11647613532841206, "mean_token_accuracy": 0.99522764980793, "num_tokens": 1482474383.0, "step": 3488 }, { "entropy": 0.01847773650661111, "epoch": 1.5269723164460007, "grad_norm": 9.5625, "learning_rate": 2.1486193827828913e-05, "loss": 0.1172, "loss_lm": 0.016176973469555378, "loss_seg": 0.1009910311549902, "mean_token_accuracy": 0.9952160865068436, "num_tokens": 1482899746.0, "step": 3489 }, { "entropy": 0.01888372004032135, "epoch": 1.5274100010942115, "grad_norm": 7.5625, "learning_rate": 2.1483486735246347e-05, "loss": 0.0864, "loss_lm": 0.016099911648780107, "loss_seg": 0.07028093189001083, "mean_token_accuracy": 0.9950558692216873, "num_tokens": 1483324472.0, "step": 3490 }, { "entropy": 0.018824321683496237, "epoch": 1.5278476857424226, "grad_norm": 17.125, "learning_rate": 2.1480779642663777e-05, "loss": 0.1731, "loss_lm": 0.0201672469265759, "loss_seg": 0.15292667597532272, "mean_token_accuracy": 0.9951144903898239, "num_tokens": 1483749369.0, "step": 3491 }, { "entropy": 0.018495661206543446, "epoch": 1.5282853703906336, "grad_norm": 9.9375, "learning_rate": 2.147807255008121e-05, "loss": 0.1313, "loss_lm": 0.015171138336881995, "loss_seg": 0.1161684226244688, "mean_token_accuracy": 0.9952172487974167, "num_tokens": 1484174619.0, "step": 3492 }, { "entropy": 0.01849360205233097, "epoch": 1.5287230550388444, "grad_norm": 21.875, "learning_rate": 2.1475365457498648e-05, "loss": 0.1481, "loss_lm": 0.016006281599402428, "loss_seg": 0.13212808594107628, "mean_token_accuracy": 0.9950672686100006, "num_tokens": 1484599325.0, "step": 3493 }, { "entropy": 0.017993380781263113, "epoch": 1.5291607396870555, "grad_norm": 10.75, "learning_rate": 2.1472658364916082e-05, "loss": 0.1089, "loss_lm": 0.014726345892995596, "loss_seg": 0.09416604787111282, "mean_token_accuracy": 0.995380088686943, "num_tokens": 1485024585.0, "step": 3494 }, { "entropy": 0.018258361145853996, "epoch": 1.5295984243352665, "grad_norm": 4.46875, "learning_rate": 2.1469951272333516e-05, "loss": 0.1088, "loss_lm": 0.018825895385816693, "loss_seg": 0.08996527083218098, "mean_token_accuracy": 0.9952812194824219, "num_tokens": 1485449022.0, "step": 3495 }, { "entropy": 0.018258119001984596, "epoch": 1.5300361089834773, "grad_norm": 12.0625, "learning_rate": 2.1467244179750946e-05, "loss": 0.1081, "loss_lm": 0.01507962797768414, "loss_seg": 0.09297394007444382, "mean_token_accuracy": 0.9952444285154343, "num_tokens": 1485874389.0, "step": 3496 }, { "entropy": 0.018344033043831587, "epoch": 1.5304737936316883, "grad_norm": 44.5, "learning_rate": 2.146453708716838e-05, "loss": 0.1135, "loss_lm": 0.016722065629437566, "loss_seg": 0.09679388627409935, "mean_token_accuracy": 0.9952304512262344, "num_tokens": 1486300186.0, "step": 3497 }, { "entropy": 0.018219921737909317, "epoch": 1.5309114782798994, "grad_norm": 6.46875, "learning_rate": 2.1461829994585817e-05, "loss": 0.0814, "loss_lm": 0.014345914823934436, "loss_seg": 0.06702699139714241, "mean_token_accuracy": 0.9951401203870773, "num_tokens": 1486724431.0, "step": 3498 }, { "entropy": 0.018786005210131407, "epoch": 1.5313491629281102, "grad_norm": 7.09375, "learning_rate": 2.145912290200325e-05, "loss": 0.1549, "loss_lm": 0.01652552979066968, "loss_seg": 0.13836810179054737, "mean_token_accuracy": 0.9952257424592972, "num_tokens": 1487148527.0, "step": 3499 }, { "entropy": 0.01887795701622963, "epoch": 1.5317868475763212, "grad_norm": 3.453125, "learning_rate": 2.1456415809420684e-05, "loss": 0.1416, "loss_lm": 0.017109469510614872, "loss_seg": 0.12449125852435827, "mean_token_accuracy": 0.9951428472995758, "num_tokens": 1487574430.0, "step": 3500 }, { "entropy": 0.0182713745161891, "epoch": 1.5322245322245323, "grad_norm": 5.96875, "learning_rate": 2.1453708716838114e-05, "loss": 0.1327, "loss_lm": 0.015150097897276282, "loss_seg": 0.11754799261689186, "mean_token_accuracy": 0.9951091408729553, "num_tokens": 1487999243.0, "step": 3501 }, { "entropy": 0.01825542701408267, "epoch": 1.532662216872743, "grad_norm": 14.3125, "learning_rate": 2.1451001624255548e-05, "loss": 0.0925, "loss_lm": 0.013559672748669982, "loss_seg": 0.07892410829663277, "mean_token_accuracy": 0.9953509569168091, "num_tokens": 1488424220.0, "step": 3502 }, { "entropy": 0.01836828188970685, "epoch": 1.533099901520954, "grad_norm": 4.96875, "learning_rate": 2.1448294531672985e-05, "loss": 0.1385, "loss_lm": 0.016599693102762103, "loss_seg": 0.12191067636013031, "mean_token_accuracy": 0.9952165186405182, "num_tokens": 1488848609.0, "step": 3503 }, { "entropy": 0.018740227911621332, "epoch": 1.5335375861691651, "grad_norm": 9.625, "learning_rate": 2.144558743909042e-05, "loss": 0.1679, "loss_lm": 0.016923357732594013, "loss_seg": 0.1509420908987522, "mean_token_accuracy": 0.9951760470867157, "num_tokens": 1489274341.0, "step": 3504 }, { "entropy": 0.01852476503700018, "epoch": 1.533975270817376, "grad_norm": 10.0625, "learning_rate": 2.1442880346507853e-05, "loss": 0.1011, "loss_lm": 0.017472299048677087, "loss_seg": 0.08366184309124947, "mean_token_accuracy": 0.9952197670936584, "num_tokens": 1489699151.0, "step": 3505 }, { "entropy": 0.018062733579427004, "epoch": 1.5344129554655872, "grad_norm": 14.625, "learning_rate": 2.1440173253925283e-05, "loss": 0.1074, "loss_lm": 0.0134719074703753, "loss_seg": 0.09396171662956476, "mean_token_accuracy": 0.9954187721014023, "num_tokens": 1490124507.0, "step": 3506 }, { "entropy": 0.01845358358696103, "epoch": 1.534850640113798, "grad_norm": 20.625, "learning_rate": 2.1437466161342717e-05, "loss": 0.1206, "loss_lm": 0.016468725400045514, "loss_seg": 0.10413110814988613, "mean_token_accuracy": 0.9952149242162704, "num_tokens": 1490550177.0, "step": 3507 }, { "entropy": 0.018924755044281483, "epoch": 1.5352883247620088, "grad_norm": 17.5, "learning_rate": 2.1434759068760154e-05, "loss": 0.1043, "loss_lm": 0.018419299740344286, "loss_seg": 0.08583617582917213, "mean_token_accuracy": 0.9951026439666748, "num_tokens": 1490976051.0, "step": 3508 }, { "entropy": 0.018513815477490425, "epoch": 1.53572600941022, "grad_norm": 9.75, "learning_rate": 2.1432051976177588e-05, "loss": 0.0897, "loss_lm": 0.014938605483621359, "loss_seg": 0.07478582579642534, "mean_token_accuracy": 0.9952991157770157, "num_tokens": 1491400407.0, "step": 3509 }, { "entropy": 0.01806902466341853, "epoch": 1.536163694058431, "grad_norm": 17.125, "learning_rate": 2.1429344883595018e-05, "loss": 0.113, "loss_lm": 0.01476486842148006, "loss_seg": 0.09820951521396637, "mean_token_accuracy": 0.9954453259706497, "num_tokens": 1491824982.0, "step": 3510 }, { "entropy": 0.018662021961063147, "epoch": 1.5366013787066417, "grad_norm": 7.375, "learning_rate": 2.1426637791012452e-05, "loss": 0.1351, "loss_lm": 0.014372867066413164, "loss_seg": 0.12074223626405, "mean_token_accuracy": 0.995218425989151, "num_tokens": 1492249261.0, "step": 3511 }, { "entropy": 0.018348898272961378, "epoch": 1.537039063354853, "grad_norm": 4.15625, "learning_rate": 2.1423930698429885e-05, "loss": 0.1192, "loss_lm": 0.01641965820454061, "loss_seg": 0.10275372862815857, "mean_token_accuracy": 0.9952751249074936, "num_tokens": 1492673987.0, "step": 3512 }, { "entropy": 0.01871634181588888, "epoch": 1.5374767480030638, "grad_norm": 13.0625, "learning_rate": 2.1421223605847323e-05, "loss": 0.148, "loss_lm": 0.016930482583120465, "loss_seg": 0.13103346899151802, "mean_token_accuracy": 0.9951724857091904, "num_tokens": 1493099030.0, "step": 3513 }, { "entropy": 0.01885366952046752, "epoch": 1.5379144326512748, "grad_norm": 8.6875, "learning_rate": 2.1418516513264756e-05, "loss": 0.1145, "loss_lm": 0.016504642320796847, "loss_seg": 0.09802087768912315, "mean_token_accuracy": 0.9951395243406296, "num_tokens": 1493524274.0, "step": 3514 }, { "entropy": 0.0182275902479887, "epoch": 1.5383521172994858, "grad_norm": 11.375, "learning_rate": 2.1415809420682187e-05, "loss": 0.1196, "loss_lm": 0.015383583027869463, "loss_seg": 0.10426438599824905, "mean_token_accuracy": 0.9953283369541168, "num_tokens": 1493949510.0, "step": 3515 }, { "entropy": 0.01857754262164235, "epoch": 1.5387898019476967, "grad_norm": 4.25, "learning_rate": 2.141310232809962e-05, "loss": 0.0925, "loss_lm": 0.014588958816602826, "loss_seg": 0.07787259295582771, "mean_token_accuracy": 0.995286762714386, "num_tokens": 1494373801.0, "step": 3516 }, { "entropy": 0.018844344187527895, "epoch": 1.5392274865959077, "grad_norm": 5.5, "learning_rate": 2.1410395235517054e-05, "loss": 0.1042, "loss_lm": 0.015570397954434156, "loss_seg": 0.08860672265291214, "mean_token_accuracy": 0.995079293847084, "num_tokens": 1494798885.0, "step": 3517 }, { "entropy": 0.018329727463424206, "epoch": 1.5396651712441187, "grad_norm": 6.40625, "learning_rate": 2.1407688142934488e-05, "loss": 0.153, "loss_lm": 0.017296859063208103, "loss_seg": 0.13566835410892963, "mean_token_accuracy": 0.9952820539474487, "num_tokens": 1495224446.0, "step": 3518 }, { "entropy": 0.01821361854672432, "epoch": 1.5401028558923295, "grad_norm": 7.375, "learning_rate": 2.1404981050351925e-05, "loss": 0.1239, "loss_lm": 0.015304028522223234, "loss_seg": 0.1085647214204073, "mean_token_accuracy": 0.9952318221330643, "num_tokens": 1495649720.0, "step": 3519 }, { "entropy": 0.018901174422353506, "epoch": 1.5405405405405406, "grad_norm": 6.9375, "learning_rate": 2.1402273957769355e-05, "loss": 0.1026, "loss_lm": 0.014758704463019967, "loss_seg": 0.08784870244562626, "mean_token_accuracy": 0.9950806200504303, "num_tokens": 1496074356.0, "step": 3520 }, { "entropy": 0.018721474800258875, "epoch": 1.5409782251887516, "grad_norm": 20.625, "learning_rate": 2.139956686518679e-05, "loss": 0.0869, "loss_lm": 0.0153054294642061, "loss_seg": 0.07164278719574213, "mean_token_accuracy": 0.9951963871717453, "num_tokens": 1496499218.0, "step": 3521 }, { "entropy": 0.0189133882522583, "epoch": 1.5414159098369624, "grad_norm": 4.78125, "learning_rate": 2.1396859772604223e-05, "loss": 0.1083, "loss_lm": 0.01663164235651493, "loss_seg": 0.0916262399405241, "mean_token_accuracy": 0.995108962059021, "num_tokens": 1496924059.0, "step": 3522 }, { "entropy": 0.018169456161558628, "epoch": 1.5418535944851735, "grad_norm": 5.21875, "learning_rate": 2.1394152680021656e-05, "loss": 0.0988, "loss_lm": 0.015848077600821853, "loss_seg": 0.08291430585086346, "mean_token_accuracy": 0.9953625500202179, "num_tokens": 1497350073.0, "step": 3523 }, { "entropy": 0.01891425671055913, "epoch": 1.5422912791333845, "grad_norm": 14.875, "learning_rate": 2.1391445587439094e-05, "loss": 0.1519, "loss_lm": 0.014968423172831535, "loss_seg": 0.13688525557518005, "mean_token_accuracy": 0.9951943159103394, "num_tokens": 1497775170.0, "step": 3524 }, { "entropy": 0.018342279829084873, "epoch": 1.5427289637815953, "grad_norm": 7.625, "learning_rate": 2.1388738494856524e-05, "loss": 0.0882, "loss_lm": 0.014253736240789294, "loss_seg": 0.07393932808190584, "mean_token_accuracy": 0.9952864199876785, "num_tokens": 1498200297.0, "step": 3525 }, { "entropy": 0.018967546988278627, "epoch": 1.5431666484298063, "grad_norm": 4.0, "learning_rate": 2.1386031402273958e-05, "loss": 0.1313, "loss_lm": 0.01752149104140699, "loss_seg": 0.11373584344983101, "mean_token_accuracy": 0.995090901851654, "num_tokens": 1498625410.0, "step": 3526 }, { "entropy": 0.019340807106345892, "epoch": 1.5436043330780174, "grad_norm": 90.5, "learning_rate": 2.138332430969139e-05, "loss": 0.1231, "loss_lm": 0.016786832129582763, "loss_seg": 0.10628178901970387, "mean_token_accuracy": 0.9950148463249207, "num_tokens": 1499051264.0, "step": 3527 }, { "entropy": 0.01876116218045354, "epoch": 1.5440420177262282, "grad_norm": 18.75, "learning_rate": 2.1380617217108825e-05, "loss": 0.1281, "loss_lm": 0.016311715822666883, "loss_seg": 0.11176988109946251, "mean_token_accuracy": 0.9951357394456863, "num_tokens": 1499476343.0, "step": 3528 }, { "entropy": 0.018762556836009026, "epoch": 1.5444797023744392, "grad_norm": 6.5625, "learning_rate": 2.1377910124526262e-05, "loss": 0.1328, "loss_lm": 0.01764875859953463, "loss_seg": 0.11518773529678583, "mean_token_accuracy": 0.9952322542667389, "num_tokens": 1499901201.0, "step": 3529 }, { "entropy": 0.018514301162213087, "epoch": 1.5449173870226502, "grad_norm": 6.625, "learning_rate": 2.1375203031943693e-05, "loss": 0.1246, "loss_lm": 0.017120811622589827, "loss_seg": 0.10751376487314701, "mean_token_accuracy": 0.9952961951494217, "num_tokens": 1500326351.0, "step": 3530 }, { "entropy": 0.01834185654297471, "epoch": 1.545355071670861, "grad_norm": 7.96875, "learning_rate": 2.1372495939361126e-05, "loss": 0.1446, "loss_lm": 0.015584154054522514, "loss_seg": 0.1290341541171074, "mean_token_accuracy": 0.9952322691679001, "num_tokens": 1500751465.0, "step": 3531 }, { "entropy": 0.017663151491433382, "epoch": 1.545792756319072, "grad_norm": 7.40625, "learning_rate": 2.136978884677856e-05, "loss": 0.0995, "loss_lm": 0.016234961338341236, "loss_seg": 0.08323225192725658, "mean_token_accuracy": 0.9955402314662933, "num_tokens": 1501176538.0, "step": 3532 }, { "entropy": 0.019229772966355085, "epoch": 1.5462304409672831, "grad_norm": 9.6875, "learning_rate": 2.1367081754195994e-05, "loss": 0.1407, "loss_lm": 0.01824938994832337, "loss_seg": 0.12248722091317177, "mean_token_accuracy": 0.9949814230203629, "num_tokens": 1501602158.0, "step": 3533 }, { "entropy": 0.01845326367765665, "epoch": 1.546668125615494, "grad_norm": 14.8125, "learning_rate": 2.1364374661613427e-05, "loss": 0.141, "loss_lm": 0.016598939197137952, "loss_seg": 0.12436299212276936, "mean_token_accuracy": 0.9951345473527908, "num_tokens": 1502027167.0, "step": 3534 }, { "entropy": 0.018641588743776083, "epoch": 1.547105810263705, "grad_norm": 8.25, "learning_rate": 2.136166756903086e-05, "loss": 0.1123, "loss_lm": 0.015574827324599028, "loss_seg": 0.09669666551053524, "mean_token_accuracy": 0.9951395392417908, "num_tokens": 1502451942.0, "step": 3535 }, { "entropy": 0.019009916577488184, "epoch": 1.547543494911916, "grad_norm": 14.0625, "learning_rate": 2.1358960476448295e-05, "loss": 0.1595, "loss_lm": 0.020175934303551912, "loss_seg": 0.1393624059855938, "mean_token_accuracy": 0.995074987411499, "num_tokens": 1502876898.0, "step": 3536 }, { "entropy": 0.01846099691465497, "epoch": 1.5479811795601268, "grad_norm": 5.03125, "learning_rate": 2.135625338386573e-05, "loss": 0.1013, "loss_lm": 0.013044357765465975, "loss_seg": 0.08828427456319332, "mean_token_accuracy": 0.9952519983053207, "num_tokens": 1503302091.0, "step": 3537 }, { "entropy": 0.01867950428277254, "epoch": 1.5484188642083379, "grad_norm": 9.25, "learning_rate": 2.1353546291283162e-05, "loss": 0.1322, "loss_lm": 0.01800363091751933, "loss_seg": 0.11417772620916367, "mean_token_accuracy": 0.9951417297124863, "num_tokens": 1503727286.0, "step": 3538 }, { "entropy": 0.018145734909921885, "epoch": 1.5488565488565489, "grad_norm": 5.03125, "learning_rate": 2.1350839198700596e-05, "loss": 0.1254, "loss_lm": 0.01601406536065042, "loss_seg": 0.10934971086680889, "mean_token_accuracy": 0.9952881187200546, "num_tokens": 1504151923.0, "step": 3539 }, { "entropy": 0.018870865926146507, "epoch": 1.5492942335047597, "grad_norm": 3.90625, "learning_rate": 2.134813210611803e-05, "loss": 0.1455, "loss_lm": 0.017130772350355983, "loss_seg": 0.12837176397442818, "mean_token_accuracy": 0.9951517879962921, "num_tokens": 1504576976.0, "step": 3540 }, { "entropy": 0.018520244862884283, "epoch": 1.5497319181529707, "grad_norm": 20.375, "learning_rate": 2.1345425013535464e-05, "loss": 0.1572, "loss_lm": 0.016455504577606916, "loss_seg": 0.14069525338709354, "mean_token_accuracy": 0.9951517283916473, "num_tokens": 1505002214.0, "step": 3541 }, { "entropy": 0.01772124832496047, "epoch": 1.5501696028011818, "grad_norm": 6.46875, "learning_rate": 2.1342717920952897e-05, "loss": 0.1263, "loss_lm": 0.01633195043541491, "loss_seg": 0.10997434705495834, "mean_token_accuracy": 0.9953831285238266, "num_tokens": 1505426701.0, "step": 3542 }, { "entropy": 0.01831643795594573, "epoch": 1.5506072874493926, "grad_norm": 11.5625, "learning_rate": 2.134001082837033e-05, "loss": 0.1489, "loss_lm": 0.015430421335622668, "loss_seg": 0.13343684747815132, "mean_token_accuracy": 0.9952599555253983, "num_tokens": 1505851544.0, "step": 3543 }, { "entropy": 0.018625017255544662, "epoch": 1.5510449720976038, "grad_norm": 10.0625, "learning_rate": 2.1337303735787765e-05, "loss": 0.1055, "loss_lm": 0.016087621450424194, "loss_seg": 0.08936821762472391, "mean_token_accuracy": 0.9952135384082794, "num_tokens": 1506276654.0, "step": 3544 }, { "entropy": 0.01878546830266714, "epoch": 1.5514826567458146, "grad_norm": 4.5625, "learning_rate": 2.13345966432052e-05, "loss": 0.1307, "loss_lm": 0.018805802799761295, "loss_seg": 0.11191561445593834, "mean_token_accuracy": 0.9950558841228485, "num_tokens": 1506701986.0, "step": 3545 }, { "entropy": 0.018436620011925697, "epoch": 1.5519203413940255, "grad_norm": 9.9375, "learning_rate": 2.1331889550622632e-05, "loss": 0.1467, "loss_lm": 0.015495528932660818, "loss_seg": 0.1312150601297617, "mean_token_accuracy": 0.9952208995819092, "num_tokens": 1507126691.0, "step": 3546 }, { "entropy": 0.018165449611842632, "epoch": 1.5523580260422367, "grad_norm": 6.96875, "learning_rate": 2.1329182458040066e-05, "loss": 0.1573, "loss_lm": 0.015400295611470938, "loss_seg": 0.14185736142098904, "mean_token_accuracy": 0.9951808899641037, "num_tokens": 1507551209.0, "step": 3547 }, { "entropy": 0.018693378195166588, "epoch": 1.5527957106904475, "grad_norm": 6.78125, "learning_rate": 2.13264753654575e-05, "loss": 0.1227, "loss_lm": 0.016571708023548126, "loss_seg": 0.10610295459628105, "mean_token_accuracy": 0.9952137321233749, "num_tokens": 1507975885.0, "step": 3548 }, { "entropy": 0.017665620893239975, "epoch": 1.5532333953386583, "grad_norm": 5.78125, "learning_rate": 2.1323768272874933e-05, "loss": 0.1254, "loss_lm": 0.015786404255777597, "loss_seg": 0.10963813029229641, "mean_token_accuracy": 0.9953252077102661, "num_tokens": 1508400664.0, "step": 3549 }, { "entropy": 0.01836379524320364, "epoch": 1.5536710799868696, "grad_norm": 7.96875, "learning_rate": 2.1321061180292367e-05, "loss": 0.1312, "loss_lm": 0.015369016444310546, "loss_seg": 0.11582246236503124, "mean_token_accuracy": 0.9951821118593216, "num_tokens": 1508826124.0, "step": 3550 }, { "entropy": 0.01817905530333519, "epoch": 1.5541087646350804, "grad_norm": 13.625, "learning_rate": 2.13183540877098e-05, "loss": 0.1563, "loss_lm": 0.01615945785306394, "loss_seg": 0.14016817137598991, "mean_token_accuracy": 0.9952621608972549, "num_tokens": 1509250739.0, "step": 3551 }, { "entropy": 0.018742728512734175, "epoch": 1.5545464492832914, "grad_norm": 18.5, "learning_rate": 2.1315646995127235e-05, "loss": 0.106, "loss_lm": 0.016325861448422074, "loss_seg": 0.08964932151138783, "mean_token_accuracy": 0.9950551092624664, "num_tokens": 1509676421.0, "step": 3552 }, { "entropy": 0.018436124082654715, "epoch": 1.5549841339315025, "grad_norm": 6.09375, "learning_rate": 2.1312939902544668e-05, "loss": 0.1321, "loss_lm": 0.017540913075208664, "loss_seg": 0.1145727951079607, "mean_token_accuracy": 0.995260551571846, "num_tokens": 1510100909.0, "step": 3553 }, { "entropy": 0.018547851126641035, "epoch": 1.5554218185797133, "grad_norm": 9.375, "learning_rate": 2.13102328099621e-05, "loss": 0.0825, "loss_lm": 0.014820185722783208, "loss_seg": 0.06769230961799622, "mean_token_accuracy": 0.9952063411474228, "num_tokens": 1510526035.0, "step": 3554 }, { "entropy": 0.018489975947886705, "epoch": 1.5558595032279243, "grad_norm": 4.46875, "learning_rate": 2.1307525717379536e-05, "loss": 0.1352, "loss_lm": 0.015157900983467698, "loss_seg": 0.12008736841380596, "mean_token_accuracy": 0.9952055215835571, "num_tokens": 1510951279.0, "step": 3555 }, { "entropy": 0.018184575252234936, "epoch": 1.5562971878761354, "grad_norm": 9.8125, "learning_rate": 2.130481862479697e-05, "loss": 0.1138, "loss_lm": 0.016352230682969093, "loss_seg": 0.09741988591849804, "mean_token_accuracy": 0.9952254891395569, "num_tokens": 1511375807.0, "step": 3556 }, { "entropy": 0.01788516342639923, "epoch": 1.5567348725243462, "grad_norm": 7.6875, "learning_rate": 2.1302111532214403e-05, "loss": 0.1248, "loss_lm": 0.015643131220713258, "loss_seg": 0.10919059254229069, "mean_token_accuracy": 0.9954213201999664, "num_tokens": 1511799908.0, "step": 3557 }, { "entropy": 0.017867442686110735, "epoch": 1.5571725571725572, "grad_norm": 5.3125, "learning_rate": 2.1299404439631833e-05, "loss": 0.1216, "loss_lm": 0.013535566162317991, "loss_seg": 0.10808268375694752, "mean_token_accuracy": 0.9954130351543427, "num_tokens": 1512224726.0, "step": 3558 }, { "entropy": 0.01771964132785797, "epoch": 1.5576102418207682, "grad_norm": 13.75, "learning_rate": 2.1296697347049267e-05, "loss": 0.1152, "loss_lm": 0.016371742356568575, "loss_seg": 0.09883475676178932, "mean_token_accuracy": 0.9954302608966827, "num_tokens": 1512649642.0, "step": 3559 }, { "entropy": 0.017929745838046074, "epoch": 1.558047926468979, "grad_norm": 4.5, "learning_rate": 2.1293990254466704e-05, "loss": 0.0924, "loss_lm": 0.015036266762763262, "loss_seg": 0.0773517694324255, "mean_token_accuracy": 0.9952965974807739, "num_tokens": 1513074831.0, "step": 3560 }, { "entropy": 0.018325881101191044, "epoch": 1.55848561111719, "grad_norm": 12.9375, "learning_rate": 2.1291283161884138e-05, "loss": 0.1105, "loss_lm": 0.016716323560103774, "loss_seg": 0.09379123710095882, "mean_token_accuracy": 0.9952521175146103, "num_tokens": 1513499199.0, "step": 3561 }, { "entropy": 0.018691904842853546, "epoch": 1.5589232957654011, "grad_norm": 6.6875, "learning_rate": 2.1288576069301572e-05, "loss": 0.14, "loss_lm": 0.014698412036523223, "loss_seg": 0.1252610757946968, "mean_token_accuracy": 0.9951730370521545, "num_tokens": 1513924260.0, "step": 3562 }, { "entropy": 0.017975145019590855, "epoch": 1.559360980413612, "grad_norm": 14.6875, "learning_rate": 2.1285868976719002e-05, "loss": 0.19, "loss_lm": 0.01662130351178348, "loss_seg": 0.17340868711471558, "mean_token_accuracy": 0.9954181164503098, "num_tokens": 1514348899.0, "step": 3563 }, { "entropy": 0.019030170049518347, "epoch": 1.559798665061823, "grad_norm": 8.6875, "learning_rate": 2.1283161884136436e-05, "loss": 0.1323, "loss_lm": 0.015017451485618949, "loss_seg": 0.11727227456867695, "mean_token_accuracy": 0.9951419234275818, "num_tokens": 1514773299.0, "step": 3564 }, { "entropy": 0.018747441470623016, "epoch": 1.560236349710034, "grad_norm": 4.65625, "learning_rate": 2.1280454791553873e-05, "loss": 0.106, "loss_lm": 0.015472320839762688, "loss_seg": 0.09048423357307911, "mean_token_accuracy": 0.9951983839273453, "num_tokens": 1515198105.0, "step": 3565 }, { "entropy": 0.018260031938552856, "epoch": 1.5606740343582448, "grad_norm": 13.0, "learning_rate": 2.1277747698971307e-05, "loss": 0.1538, "loss_lm": 0.016611542087048292, "loss_seg": 0.13720918633043766, "mean_token_accuracy": 0.9952304661273956, "num_tokens": 1515623225.0, "step": 3566 }, { "entropy": 0.018254595808684826, "epoch": 1.5611117190064558, "grad_norm": 22.375, "learning_rate": 2.127504060638874e-05, "loss": 0.1035, "loss_lm": 0.01520902942866087, "loss_seg": 0.08828454185277224, "mean_token_accuracy": 0.9953843057155609, "num_tokens": 1516048435.0, "step": 3567 }, { "entropy": 0.0183335873298347, "epoch": 1.5615494036546669, "grad_norm": 3.65625, "learning_rate": 2.127233351380617e-05, "loss": 0.0866, "loss_lm": 0.01432824763469398, "loss_seg": 0.07231474574655294, "mean_token_accuracy": 0.9953018873929977, "num_tokens": 1516473428.0, "step": 3568 }, { "entropy": 0.017985877115279436, "epoch": 1.5619870883028777, "grad_norm": 16.625, "learning_rate": 2.1269626421223604e-05, "loss": 0.1114, "loss_lm": 0.015042464947327971, "loss_seg": 0.09637895878404379, "mean_token_accuracy": 0.9953561276197433, "num_tokens": 1516897380.0, "step": 3569 }, { "entropy": 0.01829635025933385, "epoch": 1.5624247729510887, "grad_norm": 12.625, "learning_rate": 2.126691932864104e-05, "loss": 0.1031, "loss_lm": 0.01785198925063014, "loss_seg": 0.08523131534457207, "mean_token_accuracy": 0.9952194690704346, "num_tokens": 1517322608.0, "step": 3570 }, { "entropy": 0.018102468457072973, "epoch": 1.5628624575992998, "grad_norm": 46.0, "learning_rate": 2.1264212236058475e-05, "loss": 0.136, "loss_lm": 0.015135903609916568, "loss_seg": 0.12085100077092648, "mean_token_accuracy": 0.9952574521303177, "num_tokens": 1517748076.0, "step": 3571 }, { "entropy": 0.018464401364326477, "epoch": 1.5633001422475106, "grad_norm": 4.90625, "learning_rate": 2.126150514347591e-05, "loss": 0.1375, "loss_lm": 0.015375362243503332, "loss_seg": 0.12215708196163177, "mean_token_accuracy": 0.9951303899288177, "num_tokens": 1518173324.0, "step": 3572 }, { "entropy": 0.018607051111757755, "epoch": 1.5637378268957216, "grad_norm": 10.1875, "learning_rate": 2.125879805089334e-05, "loss": 0.0954, "loss_lm": 0.01283851033076644, "loss_seg": 0.08257889561355114, "mean_token_accuracy": 0.9951924085617065, "num_tokens": 1518599497.0, "step": 3573 }, { "entropy": 0.018025469034910202, "epoch": 1.5641755115439326, "grad_norm": 5.21875, "learning_rate": 2.1256090958310773e-05, "loss": 0.09, "loss_lm": 0.015716560650616884, "loss_seg": 0.07423684559762478, "mean_token_accuracy": 0.995305523276329, "num_tokens": 1519025031.0, "step": 3574 }, { "entropy": 0.018513485323637724, "epoch": 1.5646131961921435, "grad_norm": 10.6875, "learning_rate": 2.125338386572821e-05, "loss": 0.1461, "loss_lm": 0.014221043325960636, "loss_seg": 0.13185226172208786, "mean_token_accuracy": 0.9952539205551147, "num_tokens": 1519449990.0, "step": 3575 }, { "entropy": 0.018309205770492554, "epoch": 1.5650508808403545, "grad_norm": 9.625, "learning_rate": 2.1250676773145644e-05, "loss": 0.1434, "loss_lm": 0.015483994036912918, "loss_seg": 0.1279564183205366, "mean_token_accuracy": 0.9952304810285568, "num_tokens": 1519874946.0, "step": 3576 }, { "entropy": 0.018354606349021196, "epoch": 1.5654885654885655, "grad_norm": 4.6875, "learning_rate": 2.1247969680563078e-05, "loss": 0.0992, "loss_lm": 0.015390049200505018, "loss_seg": 0.08385001309216022, "mean_token_accuracy": 0.9951247274875641, "num_tokens": 1520300099.0, "step": 3577 }, { "entropy": 0.018665568437427282, "epoch": 1.5659262501367763, "grad_norm": 5.96875, "learning_rate": 2.1245262587980508e-05, "loss": 0.1585, "loss_lm": 0.017316104844212532, "loss_seg": 0.1411785315722227, "mean_token_accuracy": 0.9951657354831696, "num_tokens": 1520725372.0, "step": 3578 }, { "entropy": 0.017930268310010433, "epoch": 1.5663639347849874, "grad_norm": 13.625, "learning_rate": 2.1242555495397942e-05, "loss": 0.1141, "loss_lm": 0.015154131455346942, "loss_seg": 0.09891208447515965, "mean_token_accuracy": 0.9952701479196548, "num_tokens": 1521150920.0, "step": 3579 }, { "entropy": 0.018808157183229923, "epoch": 1.5668016194331984, "grad_norm": 5.375, "learning_rate": 2.123984840281538e-05, "loss": 0.1623, "loss_lm": 0.017612448427826166, "loss_seg": 0.14472955279052258, "mean_token_accuracy": 0.9951292127370834, "num_tokens": 1521576491.0, "step": 3580 }, { "entropy": 0.018204953521490097, "epoch": 1.5672393040814092, "grad_norm": 9.375, "learning_rate": 2.1237141310232813e-05, "loss": 0.1293, "loss_lm": 0.016206378582865, "loss_seg": 0.11312113329768181, "mean_token_accuracy": 0.9953999221324921, "num_tokens": 1522001868.0, "step": 3581 }, { "entropy": 0.018049612175673246, "epoch": 1.5676769887296205, "grad_norm": 11.0, "learning_rate": 2.1234434217650243e-05, "loss": 0.1345, "loss_lm": 0.014406982809305191, "loss_seg": 0.12006478756666183, "mean_token_accuracy": 0.9953048825263977, "num_tokens": 1522426502.0, "step": 3582 }, { "entropy": 0.01812473637983203, "epoch": 1.5681146733778313, "grad_norm": 5.71875, "learning_rate": 2.1231727125067677e-05, "loss": 0.1144, "loss_lm": 0.01535606849938631, "loss_seg": 0.09901012107729912, "mean_token_accuracy": 0.9952031970024109, "num_tokens": 1522851543.0, "step": 3583 }, { "entropy": 0.018803533166646957, "epoch": 1.568552358026042, "grad_norm": 11.8125, "learning_rate": 2.122902003248511e-05, "loss": 0.1068, "loss_lm": 0.016336375614628196, "loss_seg": 0.09043768793344498, "mean_token_accuracy": 0.9951290041208267, "num_tokens": 1523276900.0, "step": 3584 }, { "entropy": 0.01810984266921878, "epoch": 1.5689900426742533, "grad_norm": 12.625, "learning_rate": 2.1226312939902544e-05, "loss": 0.1462, "loss_lm": 0.014324644580483437, "loss_seg": 0.13186773657798767, "mean_token_accuracy": 0.9952895641326904, "num_tokens": 1523701436.0, "step": 3585 }, { "entropy": 0.01820716680958867, "epoch": 1.5694277273224642, "grad_norm": 9.4375, "learning_rate": 2.122360584731998e-05, "loss": 0.1716, "loss_lm": 0.01529522449709475, "loss_seg": 0.15629640594124794, "mean_token_accuracy": 0.9951478540897369, "num_tokens": 1524126583.0, "step": 3586 }, { "entropy": 0.018449426162987947, "epoch": 1.569865411970675, "grad_norm": 7.25, "learning_rate": 2.122089875473741e-05, "loss": 0.1636, "loss_lm": 0.01638419763185084, "loss_seg": 0.14722048863768578, "mean_token_accuracy": 0.9952078610658646, "num_tokens": 1524551412.0, "step": 3587 }, { "entropy": 0.018648209981620312, "epoch": 1.5703030966188862, "grad_norm": 11.875, "learning_rate": 2.1218191662154845e-05, "loss": 0.1208, "loss_lm": 0.017861445201560855, "loss_seg": 0.10295846126973629, "mean_token_accuracy": 0.9952439814805984, "num_tokens": 1524976513.0, "step": 3588 }, { "entropy": 0.01858975412324071, "epoch": 1.570740781267097, "grad_norm": 8.875, "learning_rate": 2.121548456957228e-05, "loss": 0.1174, "loss_lm": 0.017629046458750963, "loss_seg": 0.09976255148649216, "mean_token_accuracy": 0.9951977878808975, "num_tokens": 1525402535.0, "step": 3589 }, { "entropy": 0.019375587813556194, "epoch": 1.571178465915308, "grad_norm": 8.5, "learning_rate": 2.1212777476989713e-05, "loss": 0.1462, "loss_lm": 0.016958063933998346, "loss_seg": 0.12928750179708004, "mean_token_accuracy": 0.9949204325675964, "num_tokens": 1525828941.0, "step": 3590 }, { "entropy": 0.0182607164606452, "epoch": 1.571616150563519, "grad_norm": 4.46875, "learning_rate": 2.121007038440715e-05, "loss": 0.1471, "loss_lm": 0.016524787060916424, "loss_seg": 0.13061528094112873, "mean_token_accuracy": 0.9952082335948944, "num_tokens": 1526253388.0, "step": 3591 }, { "entropy": 0.01829001121222973, "epoch": 1.57205383521173, "grad_norm": 4.8125, "learning_rate": 2.120736329182458e-05, "loss": 0.0845, "loss_lm": 0.014954471495002508, "loss_seg": 0.06957209296524525, "mean_token_accuracy": 0.9952467232942581, "num_tokens": 1526678315.0, "step": 3592 }, { "entropy": 0.018441011663526297, "epoch": 1.572491519859941, "grad_norm": 22.25, "learning_rate": 2.1204656199242014e-05, "loss": 0.1479, "loss_lm": 0.017374334624037147, "loss_seg": 0.1305045299232006, "mean_token_accuracy": 0.9953173249959946, "num_tokens": 1527102807.0, "step": 3593 }, { "entropy": 0.018540652934461832, "epoch": 1.572929204508152, "grad_norm": 5.71875, "learning_rate": 2.1201949106659448e-05, "loss": 0.1541, "loss_lm": 0.01585019682534039, "loss_seg": 0.13827685452997684, "mean_token_accuracy": 0.9952900409698486, "num_tokens": 1527527790.0, "step": 3594 }, { "entropy": 0.018242533318698406, "epoch": 1.5733668891563628, "grad_norm": 13.0625, "learning_rate": 2.119924201407688e-05, "loss": 0.0961, "loss_lm": 0.01647555618546903, "loss_seg": 0.07967142574489117, "mean_token_accuracy": 0.9951960444450378, "num_tokens": 1527952841.0, "step": 3595 }, { "entropy": 0.018907906021922827, "epoch": 1.5738045738045738, "grad_norm": 12.75, "learning_rate": 2.119653492149432e-05, "loss": 0.1978, "loss_lm": 0.018365417141467333, "loss_seg": 0.17947198823094368, "mean_token_accuracy": 0.9950379580259323, "num_tokens": 1528378242.0, "step": 3596 }, { "entropy": 0.018736394587904215, "epoch": 1.5742422584527849, "grad_norm": 12.375, "learning_rate": 2.119382782891175e-05, "loss": 0.1679, "loss_lm": 0.018292006570845842, "loss_seg": 0.1495964080095291, "mean_token_accuracy": 0.9951765686273575, "num_tokens": 1528803229.0, "step": 3597 }, { "entropy": 0.018783495761454105, "epoch": 1.5746799431009957, "grad_norm": 4.875, "learning_rate": 2.1191120736329183e-05, "loss": 0.1118, "loss_lm": 0.01765665994025767, "loss_seg": 0.09418299421668053, "mean_token_accuracy": 0.9951131343841553, "num_tokens": 1529228228.0, "step": 3598 }, { "entropy": 0.01885641971603036, "epoch": 1.5751176277492067, "grad_norm": 7.15625, "learning_rate": 2.1188413643746616e-05, "loss": 0.1339, "loss_lm": 0.01606091158464551, "loss_seg": 0.11779220122843981, "mean_token_accuracy": 0.9952026754617691, "num_tokens": 1529653627.0, "step": 3599 }, { "entropy": 0.018356394488364458, "epoch": 1.5755553123974178, "grad_norm": 11.5625, "learning_rate": 2.118570655116405e-05, "loss": 0.1665, "loss_lm": 0.01611056295223534, "loss_seg": 0.15039728954434395, "mean_token_accuracy": 0.995330274105072, "num_tokens": 1530078858.0, "step": 3600 }, { "entropy": 0.018586890306323767, "epoch": 1.5759929970456286, "grad_norm": 6.25, "learning_rate": 2.1182999458581487e-05, "loss": 0.1412, "loss_lm": 0.015731715597212315, "loss_seg": 0.12551806308329105, "mean_token_accuracy": 0.995187297463417, "num_tokens": 1530504164.0, "step": 3601 }, { "entropy": 0.018418016843497753, "epoch": 1.5764306816938396, "grad_norm": 11.125, "learning_rate": 2.1180292365998917e-05, "loss": 0.1005, "loss_lm": 0.015737141016870737, "loss_seg": 0.08473199233412743, "mean_token_accuracy": 0.9951734244823456, "num_tokens": 1530928250.0, "step": 3602 }, { "entropy": 0.019016293808817863, "epoch": 1.5768683663420506, "grad_norm": 10.1875, "learning_rate": 2.117758527341635e-05, "loss": 0.1204, "loss_lm": 0.015752728329971433, "loss_seg": 0.10469505190849304, "mean_token_accuracy": 0.9950851947069168, "num_tokens": 1531353245.0, "step": 3603 }, { "entropy": 0.017859079409390688, "epoch": 1.5773060509902614, "grad_norm": 8.3125, "learning_rate": 2.1174878180833785e-05, "loss": 0.1183, "loss_lm": 0.016195545438677073, "loss_seg": 0.10211298428475857, "mean_token_accuracy": 0.9953352510929108, "num_tokens": 1531778061.0, "step": 3604 }, { "entropy": 0.018118717707693577, "epoch": 1.5777437356384725, "grad_norm": 6.40625, "learning_rate": 2.117217108825122e-05, "loss": 0.0936, "loss_lm": 0.013481729431077838, "loss_seg": 0.08013905305415392, "mean_token_accuracy": 0.9952076971530914, "num_tokens": 1532202380.0, "step": 3605 }, { "entropy": 0.018247625324875116, "epoch": 1.5781814202866835, "grad_norm": 8.9375, "learning_rate": 2.1169463995668652e-05, "loss": 0.1145, "loss_lm": 0.017048438545316458, "loss_seg": 0.0974358581006527, "mean_token_accuracy": 0.9952589571475983, "num_tokens": 1532627140.0, "step": 3606 }, { "entropy": 0.018667913507670164, "epoch": 1.5786191049348943, "grad_norm": 17.25, "learning_rate": 2.1166756903086086e-05, "loss": 0.1169, "loss_lm": 0.015196655644103885, "loss_seg": 0.10169589333236217, "mean_token_accuracy": 0.9952395707368851, "num_tokens": 1533052066.0, "step": 3607 }, { "entropy": 0.018512620590627193, "epoch": 1.5790567895831054, "grad_norm": 33.75, "learning_rate": 2.116404981050352e-05, "loss": 0.122, "loss_lm": 0.014953793492168188, "loss_seg": 0.10705634579062462, "mean_token_accuracy": 0.9952502399682999, "num_tokens": 1533477305.0, "step": 3608 }, { "entropy": 0.0184778799302876, "epoch": 1.5794944742313164, "grad_norm": 5.875, "learning_rate": 2.1161342717920954e-05, "loss": 0.0942, "loss_lm": 0.016681367997080088, "loss_seg": 0.07747747376561165, "mean_token_accuracy": 0.9952505528926849, "num_tokens": 1533902376.0, "step": 3609 }, { "entropy": 0.018161806277930737, "epoch": 1.5799321588795272, "grad_norm": 9.625, "learning_rate": 2.1158635625338387e-05, "loss": 0.0977, "loss_lm": 0.016232939902693033, "loss_seg": 0.08142467029392719, "mean_token_accuracy": 0.9952718019485474, "num_tokens": 1534327441.0, "step": 3610 }, { "entropy": 0.01859798887744546, "epoch": 1.5803698435277382, "grad_norm": 6.65625, "learning_rate": 2.115592853275582e-05, "loss": 0.1322, "loss_lm": 0.016278114868327975, "loss_seg": 0.11592912301421165, "mean_token_accuracy": 0.9951923489570618, "num_tokens": 1534752775.0, "step": 3611 }, { "entropy": 0.017636595759540796, "epoch": 1.5808075281759493, "grad_norm": 7.1875, "learning_rate": 2.1153221440173255e-05, "loss": 0.117, "loss_lm": 0.01562859397381544, "loss_seg": 0.10136322118341923, "mean_token_accuracy": 0.9953764528036118, "num_tokens": 1535177705.0, "step": 3612 }, { "entropy": 0.01845886604860425, "epoch": 1.58124521282416, "grad_norm": 10.875, "learning_rate": 2.115051434759069e-05, "loss": 0.1184, "loss_lm": 0.01792014087550342, "loss_seg": 0.10052931122481823, "mean_token_accuracy": 0.9952272921800613, "num_tokens": 1535603175.0, "step": 3613 }, { "entropy": 0.018725403118878603, "epoch": 1.5816828974723711, "grad_norm": 12.625, "learning_rate": 2.1147807255008122e-05, "loss": 0.106, "loss_lm": 0.01659697643481195, "loss_seg": 0.08940278366208076, "mean_token_accuracy": 0.9950471073389053, "num_tokens": 1536027563.0, "step": 3614 }, { "entropy": 0.01847575092688203, "epoch": 1.5821205821205822, "grad_norm": 10.5625, "learning_rate": 2.1145100162425556e-05, "loss": 0.1863, "loss_lm": 0.015943055273965, "loss_seg": 0.17032507620751858, "mean_token_accuracy": 0.9952535182237625, "num_tokens": 1536452481.0, "step": 3615 }, { "entropy": 0.018656574189662933, "epoch": 1.582558266768793, "grad_norm": 24.375, "learning_rate": 2.114239306984299e-05, "loss": 0.2069, "loss_lm": 0.01746746525168419, "loss_seg": 0.18946471437811852, "mean_token_accuracy": 0.9951414614915848, "num_tokens": 1536877237.0, "step": 3616 }, { "entropy": 0.018299163319170475, "epoch": 1.582995951417004, "grad_norm": 11.5, "learning_rate": 2.1139685977260423e-05, "loss": 0.139, "loss_lm": 0.01582593726925552, "loss_seg": 0.12312618643045425, "mean_token_accuracy": 0.9952973276376724, "num_tokens": 1537302693.0, "step": 3617 }, { "entropy": 0.018754462711513042, "epoch": 1.583433636065215, "grad_norm": 8.875, "learning_rate": 2.1136978884677857e-05, "loss": 0.1189, "loss_lm": 0.01536899572238326, "loss_seg": 0.10357736237347126, "mean_token_accuracy": 0.9951116591691971, "num_tokens": 1537727533.0, "step": 3618 }, { "entropy": 0.01846494898200035, "epoch": 1.5838713207134258, "grad_norm": 6.84375, "learning_rate": 2.113427179209529e-05, "loss": 0.1409, "loss_lm": 0.016397937666624784, "loss_seg": 0.12452236376702785, "mean_token_accuracy": 0.9952083081007004, "num_tokens": 1538152613.0, "step": 3619 }, { "entropy": 0.017970718443393707, "epoch": 1.584309005361637, "grad_norm": 6.375, "learning_rate": 2.1131564699512724e-05, "loss": 0.1433, "loss_lm": 0.016072699800133705, "loss_seg": 0.1272578164935112, "mean_token_accuracy": 0.9954635798931122, "num_tokens": 1538577121.0, "step": 3620 }, { "entropy": 0.018699109088629484, "epoch": 1.584746690009848, "grad_norm": 11.6875, "learning_rate": 2.1128857606930155e-05, "loss": 0.1679, "loss_lm": 0.017827609553933144, "loss_seg": 0.15004604496061802, "mean_token_accuracy": 0.995289221405983, "num_tokens": 1539002488.0, "step": 3621 }, { "entropy": 0.018086241092532873, "epoch": 1.5851843746580587, "grad_norm": 3.859375, "learning_rate": 2.1126150514347592e-05, "loss": 0.1068, "loss_lm": 0.017867998452857137, "loss_seg": 0.08897170703858137, "mean_token_accuracy": 0.9953441768884659, "num_tokens": 1539427058.0, "step": 3622 }, { "entropy": 0.01808001333847642, "epoch": 1.58562205930627, "grad_norm": 10.3125, "learning_rate": 2.1123443421765026e-05, "loss": 0.1441, "loss_lm": 0.01591744855977595, "loss_seg": 0.12819111160933971, "mean_token_accuracy": 0.9953906238079071, "num_tokens": 1539852003.0, "step": 3623 }, { "entropy": 0.01797198550775647, "epoch": 1.5860597439544808, "grad_norm": 8.0, "learning_rate": 2.112073632918246e-05, "loss": 0.1019, "loss_lm": 0.013878729892894626, "loss_seg": 0.08806619606912136, "mean_token_accuracy": 0.9953171014785767, "num_tokens": 1540276697.0, "step": 3624 }, { "entropy": 0.018277646508067846, "epoch": 1.5864974286026916, "grad_norm": 4.8125, "learning_rate": 2.111802923659989e-05, "loss": 0.1204, "loss_lm": 0.017915613017976284, "loss_seg": 0.10248115845024586, "mean_token_accuracy": 0.9951423406600952, "num_tokens": 1540702571.0, "step": 3625 }, { "entropy": 0.018290434032678604, "epoch": 1.5869351132509029, "grad_norm": 18.25, "learning_rate": 2.1115322144017323e-05, "loss": 0.1435, "loss_lm": 0.017507691401988268, "loss_seg": 0.12603642977774143, "mean_token_accuracy": 0.9952770620584488, "num_tokens": 1541127684.0, "step": 3626 }, { "entropy": 0.01831751549616456, "epoch": 1.5873727978991137, "grad_norm": 5.90625, "learning_rate": 2.111261505143476e-05, "loss": 0.1243, "loss_lm": 0.016532813664525747, "loss_seg": 0.10780374333262444, "mean_token_accuracy": 0.9952913969755173, "num_tokens": 1541552397.0, "step": 3627 }, { "entropy": 0.01859271712601185, "epoch": 1.5878104825473245, "grad_norm": 8.8125, "learning_rate": 2.1109907958852194e-05, "loss": 0.1338, "loss_lm": 0.015962428646162152, "loss_seg": 0.11783459223806858, "mean_token_accuracy": 0.9953248649835587, "num_tokens": 1541977612.0, "step": 3628 }, { "entropy": 0.01880415342748165, "epoch": 1.5882481671955357, "grad_norm": 11.0, "learning_rate": 2.1107200866269628e-05, "loss": 0.1388, "loss_lm": 0.015391880180686712, "loss_seg": 0.12343711033463478, "mean_token_accuracy": 0.9950666427612305, "num_tokens": 1542403038.0, "step": 3629 }, { "entropy": 0.018566993065178394, "epoch": 1.5886858518437466, "grad_norm": 6.40625, "learning_rate": 2.110449377368706e-05, "loss": 0.1287, "loss_lm": 0.018053187057375908, "loss_seg": 0.11069018021225929, "mean_token_accuracy": 0.9951683133840561, "num_tokens": 1542827910.0, "step": 3630 }, { "entropy": 0.01866305898874998, "epoch": 1.5891235364919576, "grad_norm": 6.75, "learning_rate": 2.1101786681104492e-05, "loss": 0.1283, "loss_lm": 0.01662814663723111, "loss_seg": 0.11164635047316551, "mean_token_accuracy": 0.9950780421495438, "num_tokens": 1543253124.0, "step": 3631 }, { "entropy": 0.018514557741582394, "epoch": 1.5895612211401686, "grad_norm": 7.0, "learning_rate": 2.109907958852193e-05, "loss": 0.1075, "loss_lm": 0.014948409283533692, "loss_seg": 0.09255496598780155, "mean_token_accuracy": 0.9952882677316666, "num_tokens": 1543678382.0, "step": 3632 }, { "entropy": 0.019153804518282413, "epoch": 1.5899989057883794, "grad_norm": 7.375, "learning_rate": 2.1096372495939363e-05, "loss": 0.1339, "loss_lm": 0.0164596748072654, "loss_seg": 0.1174100711941719, "mean_token_accuracy": 0.9950616657733917, "num_tokens": 1544103796.0, "step": 3633 }, { "entropy": 0.018600392155349255, "epoch": 1.5904365904365905, "grad_norm": 11.8125, "learning_rate": 2.1093665403356797e-05, "loss": 0.1118, "loss_lm": 0.015087307896465063, "loss_seg": 0.09667885862290859, "mean_token_accuracy": 0.9951930344104767, "num_tokens": 1544528526.0, "step": 3634 }, { "entropy": 0.0186191713437438, "epoch": 1.5908742750848015, "grad_norm": 8.6875, "learning_rate": 2.1090958310774227e-05, "loss": 0.1421, "loss_lm": 0.01572692021727562, "loss_seg": 0.12637675181031227, "mean_token_accuracy": 0.9951246082782745, "num_tokens": 1544953481.0, "step": 3635 }, { "entropy": 0.018012269865721464, "epoch": 1.5913119597330123, "grad_norm": 3.890625, "learning_rate": 2.108825121819166e-05, "loss": 0.113, "loss_lm": 0.016869655810296535, "loss_seg": 0.09614304080605507, "mean_token_accuracy": 0.9953073561191559, "num_tokens": 1545377732.0, "step": 3636 }, { "entropy": 0.018205509055405855, "epoch": 1.5917496443812233, "grad_norm": 7.75, "learning_rate": 2.1085544125609098e-05, "loss": 0.0971, "loss_lm": 0.013977973023429513, "loss_seg": 0.08308188058435917, "mean_token_accuracy": 0.9953727126121521, "num_tokens": 1545802551.0, "step": 3637 }, { "entropy": 0.01774213695898652, "epoch": 1.5921873290294344, "grad_norm": 17.125, "learning_rate": 2.108283703302653e-05, "loss": 0.1203, "loss_lm": 0.015425481600686908, "loss_seg": 0.10488243214786053, "mean_token_accuracy": 0.995354950428009, "num_tokens": 1546227689.0, "step": 3638 }, { "entropy": 0.017899768892675638, "epoch": 1.5926250136776452, "grad_norm": 12.0625, "learning_rate": 2.1080129940443965e-05, "loss": 0.1218, "loss_lm": 0.015243443194776773, "loss_seg": 0.10659889318048954, "mean_token_accuracy": 0.9953879565000534, "num_tokens": 1546652363.0, "step": 3639 }, { "entropy": 0.01913178665563464, "epoch": 1.5930626983258562, "grad_norm": 6.21875, "learning_rate": 2.1077422847861396e-05, "loss": 0.1197, "loss_lm": 0.015697119059041142, "loss_seg": 0.10402823239564896, "mean_token_accuracy": 0.9949897974729538, "num_tokens": 1547077274.0, "step": 3640 }, { "entropy": 0.018286097329109907, "epoch": 1.5935003829740673, "grad_norm": 6.34375, "learning_rate": 2.107471575527883e-05, "loss": 0.1316, "loss_lm": 0.017477322136983275, "loss_seg": 0.11412027664482594, "mean_token_accuracy": 0.995185449719429, "num_tokens": 1547502186.0, "step": 3641 }, { "entropy": 0.01845561759546399, "epoch": 1.593938067622278, "grad_norm": 8.3125, "learning_rate": 2.1072008662696266e-05, "loss": 0.1399, "loss_lm": 0.015027598245069385, "loss_seg": 0.12487649917602539, "mean_token_accuracy": 0.9951131641864777, "num_tokens": 1547927165.0, "step": 3642 }, { "entropy": 0.018184118904173374, "epoch": 1.594375752270489, "grad_norm": 3.890625, "learning_rate": 2.10693015701137e-05, "loss": 0.1048, "loss_lm": 0.017397265415638685, "loss_seg": 0.0873932521790266, "mean_token_accuracy": 0.9952612668275833, "num_tokens": 1548351702.0, "step": 3643 }, { "entropy": 0.018624145071953535, "epoch": 1.5948134369187001, "grad_norm": 7.53125, "learning_rate": 2.1066594477531134e-05, "loss": 0.1193, "loss_lm": 0.017232530284672976, "loss_seg": 0.10203531943261623, "mean_token_accuracy": 0.995213508605957, "num_tokens": 1548777012.0, "step": 3644 }, { "entropy": 0.018186498433351517, "epoch": 1.595251121566911, "grad_norm": 20.0, "learning_rate": 2.1063887384948564e-05, "loss": 0.1577, "loss_lm": 0.01754901371896267, "loss_seg": 0.14014995470643044, "mean_token_accuracy": 0.9952650368213654, "num_tokens": 1549201688.0, "step": 3645 }, { "entropy": 0.01936738518998027, "epoch": 1.595688806215122, "grad_norm": 13.5625, "learning_rate": 2.1061180292365998e-05, "loss": 0.1085, "loss_lm": 0.019615410827100277, "loss_seg": 0.08890729304403067, "mean_token_accuracy": 0.9949624091386795, "num_tokens": 1549626714.0, "step": 3646 }, { "entropy": 0.018905588425695896, "epoch": 1.596126490863333, "grad_norm": 12.0625, "learning_rate": 2.1058473199783435e-05, "loss": 0.1124, "loss_lm": 0.01746814977377653, "loss_seg": 0.0949239730834961, "mean_token_accuracy": 0.9951018542051315, "num_tokens": 1550051759.0, "step": 3647 }, { "entropy": 0.017949651461094618, "epoch": 1.5965641755115438, "grad_norm": 6.78125, "learning_rate": 2.105576610720087e-05, "loss": 0.1138, "loss_lm": 0.013004574226215482, "loss_seg": 0.10078010521829128, "mean_token_accuracy": 0.9952664822340012, "num_tokens": 1550476383.0, "step": 3648 }, { "entropy": 0.01837167050689459, "epoch": 1.5970018601597549, "grad_norm": 10.9375, "learning_rate": 2.10530590146183e-05, "loss": 0.112, "loss_lm": 0.015253470977768302, "loss_seg": 0.09670644905418158, "mean_token_accuracy": 0.9952486008405685, "num_tokens": 1550901485.0, "step": 3649 }, { "entropy": 0.018718314822763205, "epoch": 1.597439544807966, "grad_norm": 24.5, "learning_rate": 2.1050351922035733e-05, "loss": 0.121, "loss_lm": 0.01756715727970004, "loss_seg": 0.10346486233174801, "mean_token_accuracy": 0.9951498657464981, "num_tokens": 1551327214.0, "step": 3650 }, { "entropy": 0.018410615157335997, "epoch": 1.5978772294561767, "grad_norm": 5.75, "learning_rate": 2.1047644829453167e-05, "loss": 0.1086, "loss_lm": 0.014843246899545193, "loss_seg": 0.09372608177363873, "mean_token_accuracy": 0.9952231049537659, "num_tokens": 1551751924.0, "step": 3651 }, { "entropy": 0.0187502633780241, "epoch": 1.5983149141043878, "grad_norm": 9.8125, "learning_rate": 2.10449377368706e-05, "loss": 0.1217, "loss_lm": 0.018885956145823002, "loss_seg": 0.1028636060655117, "mean_token_accuracy": 0.9951011091470718, "num_tokens": 1552177499.0, "step": 3652 }, { "entropy": 0.01916158851236105, "epoch": 1.5987525987525988, "grad_norm": 25.5, "learning_rate": 2.1042230644288037e-05, "loss": 0.1349, "loss_lm": 0.015379809541627765, "loss_seg": 0.11950667761266232, "mean_token_accuracy": 0.9950836151838303, "num_tokens": 1552603150.0, "step": 3653 }, { "entropy": 0.018274123314768076, "epoch": 1.5991902834008096, "grad_norm": 14.375, "learning_rate": 2.1039523551705468e-05, "loss": 0.1117, "loss_lm": 0.01667923992499709, "loss_seg": 0.09500672854483128, "mean_token_accuracy": 0.9952167868614197, "num_tokens": 1553028341.0, "step": 3654 }, { "entropy": 0.018209186382591724, "epoch": 1.5996279680490206, "grad_norm": 14.125, "learning_rate": 2.10368164591229e-05, "loss": 0.1013, "loss_lm": 0.01731629017740488, "loss_seg": 0.08402778021991253, "mean_token_accuracy": 0.995193600654602, "num_tokens": 1553453150.0, "step": 3655 }, { "entropy": 0.01843128725886345, "epoch": 1.6000656526972317, "grad_norm": 6.5, "learning_rate": 2.1034109366540335e-05, "loss": 0.13, "loss_lm": 0.016655640210956335, "loss_seg": 0.11329817492514849, "mean_token_accuracy": 0.9952764362096786, "num_tokens": 1553878617.0, "step": 3656 }, { "entropy": 0.018405066803097725, "epoch": 1.6005033373454425, "grad_norm": 15.9375, "learning_rate": 2.103140227395777e-05, "loss": 0.1438, "loss_lm": 0.0158645692281425, "loss_seg": 0.12790067307651043, "mean_token_accuracy": 0.9952037334442139, "num_tokens": 1554304135.0, "step": 3657 }, { "entropy": 0.018879163544625044, "epoch": 1.6009410219936537, "grad_norm": 12.625, "learning_rate": 2.1028695181375206e-05, "loss": 0.1145, "loss_lm": 0.018273334484547377, "loss_seg": 0.09620494954288006, "mean_token_accuracy": 0.9950076192617416, "num_tokens": 1554728340.0, "step": 3658 }, { "entropy": 0.018769212998449802, "epoch": 1.6013787066418645, "grad_norm": 14.125, "learning_rate": 2.1025988088792636e-05, "loss": 0.1844, "loss_lm": 0.01557499822229147, "loss_seg": 0.1687900349497795, "mean_token_accuracy": 0.9951315820217133, "num_tokens": 1555152879.0, "step": 3659 }, { "entropy": 0.018986237701028585, "epoch": 1.6018163912900754, "grad_norm": 7.96875, "learning_rate": 2.102328099621007e-05, "loss": 0.1486, "loss_lm": 0.01655092230066657, "loss_seg": 0.1320139579474926, "mean_token_accuracy": 0.9949955493211746, "num_tokens": 1555577858.0, "step": 3660 }, { "entropy": 0.018666395917534828, "epoch": 1.6022540759382866, "grad_norm": 7.6875, "learning_rate": 2.1020573903627504e-05, "loss": 0.1506, "loss_lm": 0.01593068544752896, "loss_seg": 0.13465582951903343, "mean_token_accuracy": 0.9951933324337006, "num_tokens": 1556003273.0, "step": 3661 }, { "entropy": 0.01902499422430992, "epoch": 1.6026917605864974, "grad_norm": 5.59375, "learning_rate": 2.1017866811044938e-05, "loss": 0.1087, "loss_lm": 0.016176722012460232, "loss_seg": 0.09256084263324738, "mean_token_accuracy": 0.9951279312372208, "num_tokens": 1556428118.0, "step": 3662 }, { "entropy": 0.018748541362583637, "epoch": 1.6031294452347082, "grad_norm": 10.375, "learning_rate": 2.1015159718462375e-05, "loss": 0.1006, "loss_lm": 0.016109029296785593, "loss_seg": 0.08446304686367512, "mean_token_accuracy": 0.9951647371053696, "num_tokens": 1556853298.0, "step": 3663 }, { "entropy": 0.019424670841544867, "epoch": 1.6035671298829195, "grad_norm": 7.21875, "learning_rate": 2.1012452625879805e-05, "loss": 0.1719, "loss_lm": 0.018995231483131647, "loss_seg": 0.15292740613222122, "mean_token_accuracy": 0.994972288608551, "num_tokens": 1557279112.0, "step": 3664 }, { "entropy": 0.01903930213302374, "epoch": 1.6040048145311303, "grad_norm": 9.125, "learning_rate": 2.100974553329724e-05, "loss": 0.1158, "loss_lm": 0.017424479126930237, "loss_seg": 0.09836903400719166, "mean_token_accuracy": 0.9950757622718811, "num_tokens": 1557704519.0, "step": 3665 }, { "entropy": 0.018344346899539232, "epoch": 1.6044424991793411, "grad_norm": 5.09375, "learning_rate": 2.1007038440714673e-05, "loss": 0.1078, "loss_lm": 0.01668786909431219, "loss_seg": 0.09109506011009216, "mean_token_accuracy": 0.9952555596828461, "num_tokens": 1558129395.0, "step": 3666 }, { "entropy": 0.018996707163751125, "epoch": 1.6048801838275524, "grad_norm": 22.25, "learning_rate": 2.1004331348132106e-05, "loss": 0.1119, "loss_lm": 0.015999101800844073, "loss_seg": 0.09593802690505981, "mean_token_accuracy": 0.99503093957901, "num_tokens": 1558554992.0, "step": 3667 }, { "entropy": 0.019034747034311295, "epoch": 1.6053178684757632, "grad_norm": 8.875, "learning_rate": 2.1001624255549543e-05, "loss": 0.1363, "loss_lm": 0.01665681996382773, "loss_seg": 0.11963918525725603, "mean_token_accuracy": 0.9950429946184158, "num_tokens": 1558980414.0, "step": 3668 }, { "entropy": 0.018034447450190783, "epoch": 1.6057555531239742, "grad_norm": 10.6875, "learning_rate": 2.0998917162966974e-05, "loss": 0.1143, "loss_lm": 0.016047741286456585, "loss_seg": 0.09822430368512869, "mean_token_accuracy": 0.9953684061765671, "num_tokens": 1559405271.0, "step": 3669 }, { "entropy": 0.018861657939851284, "epoch": 1.6061932377721853, "grad_norm": 4.09375, "learning_rate": 2.0996210070384407e-05, "loss": 0.1023, "loss_lm": 0.014992159558460116, "loss_seg": 0.08735501766204834, "mean_token_accuracy": 0.9950460344552994, "num_tokens": 1559830992.0, "step": 3670 }, { "entropy": 0.018317617010325193, "epoch": 1.606630922420396, "grad_norm": 37.5, "learning_rate": 2.099350297780184e-05, "loss": 0.1431, "loss_lm": 0.01627910160459578, "loss_seg": 0.12684916704893112, "mean_token_accuracy": 0.9952306151390076, "num_tokens": 1560255504.0, "step": 3671 }, { "entropy": 0.018495507072657347, "epoch": 1.607068607068607, "grad_norm": 6.40625, "learning_rate": 2.0990795885219275e-05, "loss": 0.1246, "loss_lm": 0.01567544648423791, "loss_seg": 0.10893704555928707, "mean_token_accuracy": 0.9951087236404419, "num_tokens": 1560679911.0, "step": 3672 }, { "entropy": 0.018966253381222486, "epoch": 1.6075062917168181, "grad_norm": 9.75, "learning_rate": 2.098808879263671e-05, "loss": 0.1026, "loss_lm": 0.0149933488573879, "loss_seg": 0.08763306587934494, "mean_token_accuracy": 0.9951125234365463, "num_tokens": 1561105036.0, "step": 3673 }, { "entropy": 0.018404838163405657, "epoch": 1.607943976365029, "grad_norm": 5.75, "learning_rate": 2.0985381700054142e-05, "loss": 0.1328, "loss_lm": 0.01852217991836369, "loss_seg": 0.11424889042973518, "mean_token_accuracy": 0.9952410757541656, "num_tokens": 1561530510.0, "step": 3674 }, { "entropy": 0.018584534991532564, "epoch": 1.60838166101324, "grad_norm": 5.03125, "learning_rate": 2.0982674607471576e-05, "loss": 0.1359, "loss_lm": 0.015687645878642797, "loss_seg": 0.1201647836714983, "mean_token_accuracy": 0.9952872544527054, "num_tokens": 1561955694.0, "step": 3675 }, { "entropy": 0.019133142661303282, "epoch": 1.608819345661451, "grad_norm": 15.75, "learning_rate": 2.097996751488901e-05, "loss": 0.1225, "loss_lm": 0.014700609724968672, "loss_seg": 0.10782713070511818, "mean_token_accuracy": 0.9951110780239105, "num_tokens": 1562381090.0, "step": 3676 }, { "entropy": 0.018808778375387192, "epoch": 1.6092570303096618, "grad_norm": 6.4375, "learning_rate": 2.0977260422306443e-05, "loss": 0.1359, "loss_lm": 0.014952931087464094, "loss_seg": 0.12094040587544441, "mean_token_accuracy": 0.9952270835638046, "num_tokens": 1562806766.0, "step": 3677 }, { "entropy": 0.017966709565371275, "epoch": 1.6096947149578729, "grad_norm": 8.1875, "learning_rate": 2.0974553329723877e-05, "loss": 0.0976, "loss_lm": 0.01596576697193086, "loss_seg": 0.08166064880788326, "mean_token_accuracy": 0.9953044205904007, "num_tokens": 1563232031.0, "step": 3678 }, { "entropy": 0.01778426021337509, "epoch": 1.610132399606084, "grad_norm": 6.21875, "learning_rate": 2.097184623714131e-05, "loss": 0.075, "loss_lm": 0.014512794557958841, "loss_seg": 0.060490415431559086, "mean_token_accuracy": 0.9954153448343277, "num_tokens": 1563657069.0, "step": 3679 }, { "entropy": 0.0185508425347507, "epoch": 1.6105700842542947, "grad_norm": 12.1875, "learning_rate": 2.0969139144558745e-05, "loss": 0.1146, "loss_lm": 0.014718271093443036, "loss_seg": 0.09987247083336115, "mean_token_accuracy": 0.9953037053346634, "num_tokens": 1564082438.0, "step": 3680 }, { "entropy": 0.018809742759913206, "epoch": 1.6110077689025057, "grad_norm": 11.3125, "learning_rate": 2.096643205197618e-05, "loss": 0.0829, "loss_lm": 0.014386513736099005, "loss_seg": 0.06846934650093317, "mean_token_accuracy": 0.9952784478664398, "num_tokens": 1564508083.0, "step": 3681 }, { "entropy": 0.018843910191208124, "epoch": 1.6114454535507168, "grad_norm": 23.25, "learning_rate": 2.0963724959393612e-05, "loss": 0.142, "loss_lm": 0.016488159773871303, "loss_seg": 0.1254769302904606, "mean_token_accuracy": 0.995130181312561, "num_tokens": 1564932915.0, "step": 3682 }, { "entropy": 0.018534383736550808, "epoch": 1.6118831381989276, "grad_norm": 5.28125, "learning_rate": 2.0961017866811042e-05, "loss": 0.1588, "loss_lm": 0.014462138758972287, "loss_seg": 0.14437525253742933, "mean_token_accuracy": 0.995068222284317, "num_tokens": 1565358052.0, "step": 3683 }, { "entropy": 0.01819909503683448, "epoch": 1.6123208228471386, "grad_norm": 18.875, "learning_rate": 2.095831077422848e-05, "loss": 0.1457, "loss_lm": 0.017241325462237, "loss_seg": 0.12846017256379128, "mean_token_accuracy": 0.9951556324958801, "num_tokens": 1565782842.0, "step": 3684 }, { "entropy": 0.01813508477061987, "epoch": 1.6127585074953497, "grad_norm": 5.8125, "learning_rate": 2.0955603681645913e-05, "loss": 0.0974, "loss_lm": 0.01619325950741768, "loss_seg": 0.08123997040092945, "mean_token_accuracy": 0.995243027806282, "num_tokens": 1566207448.0, "step": 3685 }, { "entropy": 0.018374116625636816, "epoch": 1.6131961921435605, "grad_norm": 8.375, "learning_rate": 2.0952896589063347e-05, "loss": 0.114, "loss_lm": 0.015708019258454442, "loss_seg": 0.09829298593103886, "mean_token_accuracy": 0.9952490478754044, "num_tokens": 1566631866.0, "step": 3686 }, { "entropy": 0.017943259328603745, "epoch": 1.6136338767917715, "grad_norm": 45.5, "learning_rate": 2.095018949648078e-05, "loss": 0.1189, "loss_lm": 0.016026922734454274, "loss_seg": 0.1028837114572525, "mean_token_accuracy": 0.9954647719860077, "num_tokens": 1567056919.0, "step": 3687 }, { "entropy": 0.017830781172960997, "epoch": 1.6140715614399825, "grad_norm": 5.0625, "learning_rate": 2.094748240389821e-05, "loss": 0.1127, "loss_lm": 0.014290493680164218, "loss_seg": 0.09838832728564739, "mean_token_accuracy": 0.9953382015228271, "num_tokens": 1567481600.0, "step": 3688 }, { "entropy": 0.018491314724087715, "epoch": 1.6145092460881934, "grad_norm": 12.6875, "learning_rate": 2.0944775311315648e-05, "loss": 0.1653, "loss_lm": 0.01651541981846094, "loss_seg": 0.14880551025271416, "mean_token_accuracy": 0.9952441155910492, "num_tokens": 1567906934.0, "step": 3689 }, { "entropy": 0.017998699564486742, "epoch": 1.6149469307364044, "grad_norm": 6.96875, "learning_rate": 2.0942068218733082e-05, "loss": 0.1199, "loss_lm": 0.01732066785916686, "loss_seg": 0.10254390072077513, "mean_token_accuracy": 0.995291531085968, "num_tokens": 1568330804.0, "step": 3690 }, { "entropy": 0.01832845201715827, "epoch": 1.6153846153846154, "grad_norm": 9.375, "learning_rate": 2.0939361126150516e-05, "loss": 0.0889, "loss_lm": 0.014608721248805523, "loss_seg": 0.07428287342190742, "mean_token_accuracy": 0.9952876567840576, "num_tokens": 1568756239.0, "step": 3691 }, { "entropy": 0.01783558214083314, "epoch": 1.6158223000328262, "grad_norm": 9.6875, "learning_rate": 2.093665403356795e-05, "loss": 0.1299, "loss_lm": 0.015330182621255517, "loss_seg": 0.1145285964012146, "mean_token_accuracy": 0.9953308701515198, "num_tokens": 1569180883.0, "step": 3692 }, { "entropy": 0.018076566513627768, "epoch": 1.6162599846810373, "grad_norm": 7.125, "learning_rate": 2.093394694098538e-05, "loss": 0.1577, "loss_lm": 0.015701240161433816, "loss_seg": 0.14195486530661583, "mean_token_accuracy": 0.9952996224164963, "num_tokens": 1569606116.0, "step": 3693 }, { "entropy": 0.01861011842265725, "epoch": 1.6166976693292483, "grad_norm": 10.625, "learning_rate": 2.0931239848402817e-05, "loss": 0.1071, "loss_lm": 0.015388835920020938, "loss_seg": 0.09169754944741726, "mean_token_accuracy": 0.9951593577861786, "num_tokens": 1570031862.0, "step": 3694 }, { "entropy": 0.017930383794009686, "epoch": 1.6171353539774591, "grad_norm": 9.875, "learning_rate": 2.092853275582025e-05, "loss": 0.133, "loss_lm": 0.016574511537328362, "loss_seg": 0.11643102578818798, "mean_token_accuracy": 0.9952979236841202, "num_tokens": 1570456522.0, "step": 3695 }, { "entropy": 0.01867132168263197, "epoch": 1.6175730386256704, "grad_norm": 14.0, "learning_rate": 2.0925825663237684e-05, "loss": 0.1709, "loss_lm": 0.015152904205024242, "loss_seg": 0.15573375299572945, "mean_token_accuracy": 0.9951269328594208, "num_tokens": 1570882098.0, "step": 3696 }, { "entropy": 0.018434492871165276, "epoch": 1.6180107232738812, "grad_norm": 5.1875, "learning_rate": 2.0923118570655115e-05, "loss": 0.166, "loss_lm": 0.017041034530848265, "loss_seg": 0.14896340295672417, "mean_token_accuracy": 0.9952249079942703, "num_tokens": 1571307186.0, "step": 3697 }, { "entropy": 0.018742144107818604, "epoch": 1.618448407922092, "grad_norm": 7.46875, "learning_rate": 2.092041147807255e-05, "loss": 0.1362, "loss_lm": 0.018009882420301437, "loss_seg": 0.11821407452225685, "mean_token_accuracy": 0.9951261878013611, "num_tokens": 1571731878.0, "step": 3698 }, { "entropy": 0.01802693121135235, "epoch": 1.6188860925703032, "grad_norm": 19.0, "learning_rate": 2.0917704385489985e-05, "loss": 0.1333, "loss_lm": 0.0172538033220917, "loss_seg": 0.1160513274371624, "mean_token_accuracy": 0.9953008890151978, "num_tokens": 1572156818.0, "step": 3699 }, { "entropy": 0.018161274027079344, "epoch": 1.619323777218514, "grad_norm": 19.875, "learning_rate": 2.091499729290742e-05, "loss": 0.1092, "loss_lm": 0.013732734136283398, "loss_seg": 0.09544842969626188, "mean_token_accuracy": 0.9954127222299576, "num_tokens": 1572582067.0, "step": 3700 }, { "entropy": 0.017865247558802366, "epoch": 1.6197614618667249, "grad_norm": 8.0, "learning_rate": 2.0912290200324853e-05, "loss": 0.1424, "loss_lm": 0.013815912418067455, "loss_seg": 0.12860175222158432, "mean_token_accuracy": 0.99533811211586, "num_tokens": 1573006311.0, "step": 3701 }, { "entropy": 0.018783065490424633, "epoch": 1.6201991465149361, "grad_norm": 23.25, "learning_rate": 2.0909583107742283e-05, "loss": 0.1333, "loss_lm": 0.015003672102466226, "loss_seg": 0.11834440939128399, "mean_token_accuracy": 0.995216429233551, "num_tokens": 1573431488.0, "step": 3702 }, { "entropy": 0.018232909496873617, "epoch": 1.620636831163147, "grad_norm": 12.8125, "learning_rate": 2.0906876015159717e-05, "loss": 0.1106, "loss_lm": 0.01641329168342054, "loss_seg": 0.09416796080768108, "mean_token_accuracy": 0.9952393472194672, "num_tokens": 1573856155.0, "step": 3703 }, { "entropy": 0.017606645356863737, "epoch": 1.6210745158113578, "grad_norm": 18.875, "learning_rate": 2.0904168922577154e-05, "loss": 0.1022, "loss_lm": 0.015457639703527093, "loss_seg": 0.08678381517529488, "mean_token_accuracy": 0.9954337477684021, "num_tokens": 1574281174.0, "step": 3704 }, { "entropy": 0.018600051756948233, "epoch": 1.621512200459569, "grad_norm": 4.25, "learning_rate": 2.0901461829994588e-05, "loss": 0.0964, "loss_lm": 0.01570461317896843, "loss_seg": 0.0806982396170497, "mean_token_accuracy": 0.9950840026140213, "num_tokens": 1574705886.0, "step": 3705 }, { "entropy": 0.01873933430761099, "epoch": 1.6219498851077798, "grad_norm": 6.40625, "learning_rate": 2.089875473741202e-05, "loss": 0.1238, "loss_lm": 0.015731957042589784, "loss_seg": 0.10801909677684307, "mean_token_accuracy": 0.995188370347023, "num_tokens": 1575131068.0, "step": 3706 }, { "entropy": 0.01822556648403406, "epoch": 1.6223875697559909, "grad_norm": 6.34375, "learning_rate": 2.0896047644829452e-05, "loss": 0.1426, "loss_lm": 0.015109667088836432, "loss_seg": 0.12751406617462635, "mean_token_accuracy": 0.9952599406242371, "num_tokens": 1575555873.0, "step": 3707 }, { "entropy": 0.018660692498087883, "epoch": 1.622825254404202, "grad_norm": 4.625, "learning_rate": 2.0893340552246886e-05, "loss": 0.0877, "loss_lm": 0.01721674599684775, "loss_seg": 0.07043390348553658, "mean_token_accuracy": 0.9952279776334763, "num_tokens": 1575980831.0, "step": 3708 }, { "entropy": 0.018085035495460033, "epoch": 1.6232629390524127, "grad_norm": 6.0, "learning_rate": 2.0890633459664323e-05, "loss": 0.1136, "loss_lm": 0.014270346611738205, "loss_seg": 0.09933238290250301, "mean_token_accuracy": 0.995450034737587, "num_tokens": 1576405412.0, "step": 3709 }, { "entropy": 0.01801941031590104, "epoch": 1.6237006237006237, "grad_norm": 5.40625, "learning_rate": 2.0887926367081756e-05, "loss": 0.0911, "loss_lm": 0.01579095283523202, "loss_seg": 0.07535002194344997, "mean_token_accuracy": 0.9953191727399826, "num_tokens": 1576830488.0, "step": 3710 }, { "entropy": 0.018496980890631676, "epoch": 1.6241383083488348, "grad_norm": 6.875, "learning_rate": 2.088521927449919e-05, "loss": 0.0943, "loss_lm": 0.017013177974149585, "loss_seg": 0.07726612873375416, "mean_token_accuracy": 0.9951449781656265, "num_tokens": 1577255797.0, "step": 3711 }, { "entropy": 0.018499766010791063, "epoch": 1.6245759929970456, "grad_norm": 8.4375, "learning_rate": 2.088251218191662e-05, "loss": 0.1206, "loss_lm": 0.016881931107491255, "loss_seg": 0.10370788350701332, "mean_token_accuracy": 0.9953562170267105, "num_tokens": 1577681028.0, "step": 3712 }, { "entropy": 0.018248308915644884, "epoch": 1.6250136776452566, "grad_norm": 7.25, "learning_rate": 2.0879805089334054e-05, "loss": 0.174, "loss_lm": 0.01610770425759256, "loss_seg": 0.15793460607528687, "mean_token_accuracy": 0.9952931851148605, "num_tokens": 1578105904.0, "step": 3713 }, { "entropy": 0.0187198999337852, "epoch": 1.6254513622934676, "grad_norm": 10.75, "learning_rate": 2.087709799675149e-05, "loss": 0.1148, "loss_lm": 0.014936683000996709, "loss_seg": 0.09984042681753635, "mean_token_accuracy": 0.9952549040317535, "num_tokens": 1578530720.0, "step": 3714 }, { "entropy": 0.018891670741140842, "epoch": 1.6258890469416785, "grad_norm": 7.0, "learning_rate": 2.0874390904168925e-05, "loss": 0.093, "loss_lm": 0.0170332626439631, "loss_seg": 0.07593310065567493, "mean_token_accuracy": 0.9950693994760513, "num_tokens": 1578956379.0, "step": 3715 }, { "entropy": 0.01855229353532195, "epoch": 1.6263267315898895, "grad_norm": 16.875, "learning_rate": 2.087168381158636e-05, "loss": 0.1353, "loss_lm": 0.01715845474973321, "loss_seg": 0.11817704886198044, "mean_token_accuracy": 0.9951707422733307, "num_tokens": 1579381061.0, "step": 3716 }, { "entropy": 0.018525535240769386, "epoch": 1.6267644162381005, "grad_norm": 9.0, "learning_rate": 2.086897671900379e-05, "loss": 0.1233, "loss_lm": 0.01566871628165245, "loss_seg": 0.10764808394014835, "mean_token_accuracy": 0.9953013956546783, "num_tokens": 1579806889.0, "step": 3717 }, { "entropy": 0.018439178355038166, "epoch": 1.6272021008863113, "grad_norm": 11.0, "learning_rate": 2.0866269626421223e-05, "loss": 0.1159, "loss_lm": 0.01592487865127623, "loss_seg": 0.09999128803610802, "mean_token_accuracy": 0.9952190071344376, "num_tokens": 1580232293.0, "step": 3718 }, { "entropy": 0.018540891353040934, "epoch": 1.6276397855345224, "grad_norm": 12.5625, "learning_rate": 2.0863562533838657e-05, "loss": 0.1105, "loss_lm": 0.016087500378489494, "loss_seg": 0.09437310323119164, "mean_token_accuracy": 0.9950932860374451, "num_tokens": 1580657823.0, "step": 3719 }, { "entropy": 0.018387761898338795, "epoch": 1.6280774701827334, "grad_norm": 6.34375, "learning_rate": 2.0860855441256094e-05, "loss": 0.089, "loss_lm": 0.014883269090205431, "loss_seg": 0.07407531887292862, "mean_token_accuracy": 0.9952387660741806, "num_tokens": 1581082885.0, "step": 3720 }, { "entropy": 0.01833647768944502, "epoch": 1.6285151548309442, "grad_norm": 27.75, "learning_rate": 2.0858148348673524e-05, "loss": 0.1093, "loss_lm": 0.016237179515883327, "loss_seg": 0.09303651750087738, "mean_token_accuracy": 0.9951639324426651, "num_tokens": 1581508617.0, "step": 3721 }, { "entropy": 0.018071547616273165, "epoch": 1.6289528394791553, "grad_norm": 23.0, "learning_rate": 2.0855441256090958e-05, "loss": 0.0869, "loss_lm": 0.015020538587123156, "loss_seg": 0.0718584144487977, "mean_token_accuracy": 0.9953924417495728, "num_tokens": 1581933102.0, "step": 3722 }, { "entropy": 0.018951899372041225, "epoch": 1.6293905241273663, "grad_norm": 5.5, "learning_rate": 2.085273416350839e-05, "loss": 0.1194, "loss_lm": 0.01689120987430215, "loss_seg": 0.1024597529321909, "mean_token_accuracy": 0.9949017763137817, "num_tokens": 1582358333.0, "step": 3723 }, { "entropy": 0.01860931934788823, "epoch": 1.629828208775577, "grad_norm": 6.0625, "learning_rate": 2.0850027070925825e-05, "loss": 0.1188, "loss_lm": 0.015492480713874102, "loss_seg": 0.10330300033092499, "mean_token_accuracy": 0.9952092915773392, "num_tokens": 1582783917.0, "step": 3724 }, { "entropy": 0.01745317131280899, "epoch": 1.6302658934237881, "grad_norm": 7.78125, "learning_rate": 2.0847319978343262e-05, "loss": 0.09, "loss_lm": 0.015289907110854983, "loss_seg": 0.07466983236372471, "mean_token_accuracy": 0.9954164177179337, "num_tokens": 1583207791.0, "step": 3725 }, { "entropy": 0.017589182127267122, "epoch": 1.6307035780719992, "grad_norm": 13.375, "learning_rate": 2.0844612885760693e-05, "loss": 0.0989, "loss_lm": 0.013843301450833678, "loss_seg": 0.08500795438885689, "mean_token_accuracy": 0.9955940693616867, "num_tokens": 1583632795.0, "step": 3726 }, { "entropy": 0.018524084705859423, "epoch": 1.63114126272021, "grad_norm": 6.96875, "learning_rate": 2.0841905793178126e-05, "loss": 0.1233, "loss_lm": 0.015356115531176329, "loss_seg": 0.10791338328272104, "mean_token_accuracy": 0.9951658844947815, "num_tokens": 1584057660.0, "step": 3727 }, { "entropy": 0.018563065212219954, "epoch": 1.631578947368421, "grad_norm": 5.625, "learning_rate": 2.083919870059556e-05, "loss": 0.1392, "loss_lm": 0.01574436970986426, "loss_seg": 0.12346257083117962, "mean_token_accuracy": 0.995120033621788, "num_tokens": 1584482230.0, "step": 3728 }, { "entropy": 0.018327781930565834, "epoch": 1.632016632016632, "grad_norm": 6.75, "learning_rate": 2.0836491608012994e-05, "loss": 0.1117, "loss_lm": 0.015020899474620819, "loss_seg": 0.0966337975114584, "mean_token_accuracy": 0.9953640550374985, "num_tokens": 1584906933.0, "step": 3729 }, { "entropy": 0.018424106761813164, "epoch": 1.6324543166648429, "grad_norm": 4.8125, "learning_rate": 2.083378451543043e-05, "loss": 0.0785, "loss_lm": 0.015959070064127445, "loss_seg": 0.06251129321753979, "mean_token_accuracy": 0.99516162276268, "num_tokens": 1585331640.0, "step": 3730 }, { "entropy": 0.01802846882492304, "epoch": 1.632892001313054, "grad_norm": 8.4375, "learning_rate": 2.083107742284786e-05, "loss": 0.1474, "loss_lm": 0.016406210605055094, "loss_seg": 0.13099662400782108, "mean_token_accuracy": 0.9953074157238007, "num_tokens": 1585756932.0, "step": 3731 }, { "entropy": 0.01885649748146534, "epoch": 1.633329685961265, "grad_norm": 17.375, "learning_rate": 2.0828370330265295e-05, "loss": 0.14, "loss_lm": 0.017599113984033465, "loss_seg": 0.12240933254361153, "mean_token_accuracy": 0.9950060546398163, "num_tokens": 1586183252.0, "step": 3732 }, { "entropy": 0.018764516338706017, "epoch": 1.6337673706094757, "grad_norm": 12.8125, "learning_rate": 2.082566323768273e-05, "loss": 0.1502, "loss_lm": 0.016258162911981344, "loss_seg": 0.13390401378273964, "mean_token_accuracy": 0.9952105283737183, "num_tokens": 1586608362.0, "step": 3733 }, { "entropy": 0.018718906678259373, "epoch": 1.634205055257687, "grad_norm": 10.75, "learning_rate": 2.0822956145100162e-05, "loss": 0.1158, "loss_lm": 0.016595921711996198, "loss_seg": 0.09921144880354404, "mean_token_accuracy": 0.9950256198644638, "num_tokens": 1587033735.0, "step": 3734 }, { "entropy": 0.018635276705026627, "epoch": 1.6346427399058978, "grad_norm": 5.8125, "learning_rate": 2.08202490525176e-05, "loss": 0.1013, "loss_lm": 0.0171669814735651, "loss_seg": 0.08413777686655521, "mean_token_accuracy": 0.9952031522989273, "num_tokens": 1587458565.0, "step": 3735 }, { "entropy": 0.018386356998234987, "epoch": 1.6350804245541086, "grad_norm": 16.375, "learning_rate": 2.081754195993503e-05, "loss": 0.1085, "loss_lm": 0.01664535840973258, "loss_seg": 0.09188727848231792, "mean_token_accuracy": 0.9951668828725815, "num_tokens": 1587883714.0, "step": 3736 }, { "entropy": 0.019134470261633396, "epoch": 1.6355181092023199, "grad_norm": 5.84375, "learning_rate": 2.0814834867352464e-05, "loss": 0.1392, "loss_lm": 0.017174788052216172, "loss_seg": 0.12203949131071568, "mean_token_accuracy": 0.9950934201478958, "num_tokens": 1588309047.0, "step": 3737 }, { "entropy": 0.01876771217212081, "epoch": 1.6359557938505307, "grad_norm": 7.3125, "learning_rate": 2.0812127774769897e-05, "loss": 0.0868, "loss_lm": 0.015733071602880955, "loss_seg": 0.07106572482734919, "mean_token_accuracy": 0.995153471827507, "num_tokens": 1588734491.0, "step": 3738 }, { "entropy": 0.01763988146558404, "epoch": 1.6363934784987415, "grad_norm": 6.0625, "learning_rate": 2.080942068218733e-05, "loss": 0.0873, "loss_lm": 0.014885092619806528, "loss_seg": 0.07242812309414148, "mean_token_accuracy": 0.9954058676958084, "num_tokens": 1589159541.0, "step": 3739 }, { "entropy": 0.018482154235243797, "epoch": 1.6368311631469528, "grad_norm": 3.921875, "learning_rate": 2.0806713589604768e-05, "loss": 0.1591, "loss_lm": 0.014724794309586287, "loss_seg": 0.14442431181669235, "mean_token_accuracy": 0.9952248781919479, "num_tokens": 1589585029.0, "step": 3740 }, { "entropy": 0.018764009699225426, "epoch": 1.6372688477951636, "grad_norm": 9.5625, "learning_rate": 2.08040064970222e-05, "loss": 0.0945, "loss_lm": 0.01802345016039908, "loss_seg": 0.07649816572666168, "mean_token_accuracy": 0.9950370490550995, "num_tokens": 1590010138.0, "step": 3741 }, { "entropy": 0.017888858914375305, "epoch": 1.6377065324433744, "grad_norm": 5.21875, "learning_rate": 2.0801299404439632e-05, "loss": 0.1237, "loss_lm": 0.013059420743957162, "loss_seg": 0.11064033955335617, "mean_token_accuracy": 0.9952448159456253, "num_tokens": 1590435537.0, "step": 3742 }, { "entropy": 0.01858657132834196, "epoch": 1.6381442170915856, "grad_norm": 4.78125, "learning_rate": 2.0798592311857066e-05, "loss": 0.0934, "loss_lm": 0.014494827017188072, "loss_seg": 0.07890351675450802, "mean_token_accuracy": 0.9952789694070816, "num_tokens": 1590860180.0, "step": 3743 }, { "entropy": 0.018188799265772104, "epoch": 1.6385819017397965, "grad_norm": 13.25, "learning_rate": 2.07958852192745e-05, "loss": 0.1009, "loss_lm": 0.01561654033139348, "loss_seg": 0.08524657972157001, "mean_token_accuracy": 0.9952966570854187, "num_tokens": 1591285377.0, "step": 3744 }, { "entropy": 0.018322737887501717, "epoch": 1.6390195863880075, "grad_norm": 5.625, "learning_rate": 2.0793178126691933e-05, "loss": 0.1643, "loss_lm": 0.01892744656652212, "loss_seg": 0.14540601149201393, "mean_token_accuracy": 0.9951951652765274, "num_tokens": 1591710105.0, "step": 3745 }, { "entropy": 0.018287829123437405, "epoch": 1.6394572710362185, "grad_norm": 9.25, "learning_rate": 2.0790471034109367e-05, "loss": 0.089, "loss_lm": 0.012629790464416146, "loss_seg": 0.0764111839234829, "mean_token_accuracy": 0.9954745918512344, "num_tokens": 1592135195.0, "step": 3746 }, { "entropy": 0.01785727310925722, "epoch": 1.6398949556844293, "grad_norm": 9.625, "learning_rate": 2.07877639415268e-05, "loss": 0.1108, "loss_lm": 0.014745587948709726, "loss_seg": 0.09606373962014914, "mean_token_accuracy": 0.9952679127454758, "num_tokens": 1592560520.0, "step": 3747 }, { "entropy": 0.018724640365689993, "epoch": 1.6403326403326404, "grad_norm": 5.375, "learning_rate": 2.0785056848944235e-05, "loss": 0.1512, "loss_lm": 0.018251428147777915, "loss_seg": 0.13296719826757908, "mean_token_accuracy": 0.9951303005218506, "num_tokens": 1592985760.0, "step": 3748 }, { "entropy": 0.018481124192476273, "epoch": 1.6407703249808514, "grad_norm": 29.125, "learning_rate": 2.078234975636167e-05, "loss": 0.0939, "loss_lm": 0.016281533055007458, "loss_seg": 0.07766270078718662, "mean_token_accuracy": 0.9952740371227264, "num_tokens": 1593411021.0, "step": 3749 }, { "entropy": 0.018973506521433592, "epoch": 1.6412080096290622, "grad_norm": 6.5, "learning_rate": 2.07796426637791e-05, "loss": 0.1242, "loss_lm": 0.015401798766106367, "loss_seg": 0.10880310274660587, "mean_token_accuracy": 0.9950983822345734, "num_tokens": 1593836368.0, "step": 3750 }, { "entropy": 0.01843545353040099, "epoch": 1.6416456942772732, "grad_norm": 5.15625, "learning_rate": 2.0776935571196536e-05, "loss": 0.1002, "loss_lm": 0.016466228291392326, "loss_seg": 0.08369033224880695, "mean_token_accuracy": 0.9952936619520187, "num_tokens": 1594261042.0, "step": 3751 }, { "entropy": 0.018825996201485395, "epoch": 1.6420833789254843, "grad_norm": 7.75, "learning_rate": 2.077422847861397e-05, "loss": 0.1159, "loss_lm": 0.015781982569023967, "loss_seg": 0.10013877786695957, "mean_token_accuracy": 0.9951404929161072, "num_tokens": 1594685964.0, "step": 3752 }, { "entropy": 0.01861171657219529, "epoch": 1.642521063573695, "grad_norm": 25.75, "learning_rate": 2.0771521386031403e-05, "loss": 0.1227, "loss_lm": 0.016406069975346327, "loss_seg": 0.10626976378262043, "mean_token_accuracy": 0.9952843934297562, "num_tokens": 1595112599.0, "step": 3753 }, { "entropy": 0.018446385394781828, "epoch": 1.6429587482219061, "grad_norm": 9.625, "learning_rate": 2.0768814293448837e-05, "loss": 0.1026, "loss_lm": 0.014569268096238375, "loss_seg": 0.08801496028900146, "mean_token_accuracy": 0.9952516108751297, "num_tokens": 1595537922.0, "step": 3754 }, { "entropy": 0.01888776896521449, "epoch": 1.6433964328701172, "grad_norm": 6.8125, "learning_rate": 2.0766107200866267e-05, "loss": 0.143, "loss_lm": 0.017928540939465165, "loss_seg": 0.12510878592729568, "mean_token_accuracy": 0.9951299130916595, "num_tokens": 1595962936.0, "step": 3755 }, { "entropy": 0.018653591629117727, "epoch": 1.643834117518328, "grad_norm": 9.1875, "learning_rate": 2.0763400108283704e-05, "loss": 0.0952, "loss_lm": 0.016334204468876123, "loss_seg": 0.07890453189611435, "mean_token_accuracy": 0.9953083395957947, "num_tokens": 1596388068.0, "step": 3756 }, { "entropy": 0.018443184904754162, "epoch": 1.644271802166539, "grad_norm": 9.1875, "learning_rate": 2.0760693015701138e-05, "loss": 0.1379, "loss_lm": 0.015489907935261726, "loss_seg": 0.12237639352679253, "mean_token_accuracy": 0.9952184408903122, "num_tokens": 1596813417.0, "step": 3757 }, { "entropy": 0.01938131358474493, "epoch": 1.64470948681475, "grad_norm": 6.34375, "learning_rate": 2.0757985923118572e-05, "loss": 0.1762, "loss_lm": 0.01590813510119915, "loss_seg": 0.16028911620378494, "mean_token_accuracy": 0.9948962926864624, "num_tokens": 1597238971.0, "step": 3758 }, { "entropy": 0.018776318058371544, "epoch": 1.6451471714629609, "grad_norm": 17.375, "learning_rate": 2.0755278830536006e-05, "loss": 0.0885, "loss_lm": 0.015082287834957242, "loss_seg": 0.07343483716249466, "mean_token_accuracy": 0.9952156841754913, "num_tokens": 1597664074.0, "step": 3759 }, { "entropy": 0.018448279704898596, "epoch": 1.645584856111172, "grad_norm": 8.375, "learning_rate": 2.0752571737953436e-05, "loss": 0.1449, "loss_lm": 0.015524807153269649, "loss_seg": 0.1294239815324545, "mean_token_accuracy": 0.9952012896537781, "num_tokens": 1598089117.0, "step": 3760 }, { "entropy": 0.019076131749898195, "epoch": 1.646022540759383, "grad_norm": 11.8125, "learning_rate": 2.0749864645370873e-05, "loss": 0.0955, "loss_lm": 0.01459154486656189, "loss_seg": 0.08095238823443651, "mean_token_accuracy": 0.9950931966304779, "num_tokens": 1598514206.0, "step": 3761 }, { "entropy": 0.01841073017567396, "epoch": 1.6464602254075937, "grad_norm": 6.9375, "learning_rate": 2.0747157552788307e-05, "loss": 0.108, "loss_lm": 0.015382234705612063, "loss_seg": 0.0926537923514843, "mean_token_accuracy": 0.9952107667922974, "num_tokens": 1598939284.0, "step": 3762 }, { "entropy": 0.01894328510388732, "epoch": 1.6468979100558048, "grad_norm": 8.9375, "learning_rate": 2.074445046020574e-05, "loss": 0.1339, "loss_lm": 0.017535913502797484, "loss_seg": 0.11637301370501518, "mean_token_accuracy": 0.9950913488864899, "num_tokens": 1599364846.0, "step": 3763 }, { "entropy": 0.018205059226602316, "epoch": 1.6473355947040158, "grad_norm": 10.6875, "learning_rate": 2.074174336762317e-05, "loss": 0.1126, "loss_lm": 0.014258737908676267, "loss_seg": 0.09836898371577263, "mean_token_accuracy": 0.9952468723058701, "num_tokens": 1599789581.0, "step": 3764 }, { "entropy": 0.018706263042986393, "epoch": 1.6477732793522266, "grad_norm": 3.71875, "learning_rate": 2.0739036275040605e-05, "loss": 0.0947, "loss_lm": 0.017345263622701168, "loss_seg": 0.07736759632825851, "mean_token_accuracy": 0.9951774924993515, "num_tokens": 1600214038.0, "step": 3765 }, { "entropy": 0.019271953031420708, "epoch": 1.6482109640004377, "grad_norm": 4.21875, "learning_rate": 2.0736329182458042e-05, "loss": 0.1295, "loss_lm": 0.019565162248909473, "loss_seg": 0.10990606993436813, "mean_token_accuracy": 0.9950207024812698, "num_tokens": 1600639981.0, "step": 3766 }, { "entropy": 0.018086388241499662, "epoch": 1.6486486486486487, "grad_norm": 6.09375, "learning_rate": 2.0733622089875475e-05, "loss": 0.0883, "loss_lm": 0.015614647418260574, "loss_seg": 0.07263819687068462, "mean_token_accuracy": 0.9954125285148621, "num_tokens": 1601064362.0, "step": 3767 }, { "entropy": 0.018708019983023405, "epoch": 1.6490863332968595, "grad_norm": 7.15625, "learning_rate": 2.073091499729291e-05, "loss": 0.1389, "loss_lm": 0.015779757406562567, "loss_seg": 0.12315458245575428, "mean_token_accuracy": 0.9952048659324646, "num_tokens": 1601488912.0, "step": 3768 }, { "entropy": 0.018318924121558666, "epoch": 1.6495240179450705, "grad_norm": 5.09375, "learning_rate": 2.072820790471034e-05, "loss": 0.1473, "loss_lm": 0.017791051417589188, "loss_seg": 0.12955565378069878, "mean_token_accuracy": 0.995215117931366, "num_tokens": 1601913517.0, "step": 3769 }, { "entropy": 0.019079397432506084, "epoch": 1.6499617025932816, "grad_norm": 13.3125, "learning_rate": 2.0725500812127773e-05, "loss": 0.1696, "loss_lm": 0.016576906200498343, "loss_seg": 0.15301202982664108, "mean_token_accuracy": 0.995046004652977, "num_tokens": 1602338813.0, "step": 3770 }, { "entropy": 0.0184969543479383, "epoch": 1.6503993872414924, "grad_norm": 8.3125, "learning_rate": 2.072279371954521e-05, "loss": 0.1377, "loss_lm": 0.015235968166962266, "loss_seg": 0.12245724350214005, "mean_token_accuracy": 0.9952229112386703, "num_tokens": 1602764758.0, "step": 3771 }, { "entropy": 0.019004895351827145, "epoch": 1.6508370718897036, "grad_norm": 6.21875, "learning_rate": 2.0720086626962644e-05, "loss": 0.1316, "loss_lm": 0.015435319859534502, "loss_seg": 0.11617130972445011, "mean_token_accuracy": 0.9952199906110764, "num_tokens": 1603190620.0, "step": 3772 }, { "entropy": 0.018668001517653465, "epoch": 1.6512747565379144, "grad_norm": 10.1875, "learning_rate": 2.0717379534380078e-05, "loss": 0.1317, "loss_lm": 0.016477507771924138, "loss_seg": 0.11526478454470634, "mean_token_accuracy": 0.9951466768980026, "num_tokens": 1603615240.0, "step": 3773 }, { "entropy": 0.018479629419744015, "epoch": 1.6517124411861253, "grad_norm": 13.0625, "learning_rate": 2.0714672441797508e-05, "loss": 0.1268, "loss_lm": 0.017085825791582465, "loss_seg": 0.10967493150383234, "mean_token_accuracy": 0.9952295869588852, "num_tokens": 1604040289.0, "step": 3774 }, { "entropy": 0.018244325648993254, "epoch": 1.6521501258343365, "grad_norm": 4.03125, "learning_rate": 2.0711965349214942e-05, "loss": 0.0766, "loss_lm": 0.013148749014362693, "loss_seg": 0.0634258259087801, "mean_token_accuracy": 0.995347797870636, "num_tokens": 1604464560.0, "step": 3775 }, { "entropy": 0.018760418985038996, "epoch": 1.6525878104825473, "grad_norm": 20.75, "learning_rate": 2.070925825663238e-05, "loss": 0.0935, "loss_lm": 0.0171705293469131, "loss_seg": 0.07633425947278738, "mean_token_accuracy": 0.9951456040143967, "num_tokens": 1604890541.0, "step": 3776 }, { "entropy": 0.018664776347577572, "epoch": 1.6530254951307581, "grad_norm": 5.8125, "learning_rate": 2.0706551164049813e-05, "loss": 0.1343, "loss_lm": 0.0160548381973058, "loss_seg": 0.11822660081088543, "mean_token_accuracy": 0.9952268302440643, "num_tokens": 1605315591.0, "step": 3777 }, { "entropy": 0.019266970455646515, "epoch": 1.6534631797789694, "grad_norm": 7.8125, "learning_rate": 2.0703844071467246e-05, "loss": 0.1117, "loss_lm": 0.017142122611403465, "loss_seg": 0.09451824799180031, "mean_token_accuracy": 0.9950343668460846, "num_tokens": 1605740316.0, "step": 3778 }, { "entropy": 0.01835039211437106, "epoch": 1.6539008644271802, "grad_norm": 5.625, "learning_rate": 2.0701136978884677e-05, "loss": 0.1676, "loss_lm": 0.015296767698600888, "loss_seg": 0.15233058854937553, "mean_token_accuracy": 0.9952803254127502, "num_tokens": 1606164622.0, "step": 3779 }, { "entropy": 0.01874375157058239, "epoch": 1.654338549075391, "grad_norm": 4.53125, "learning_rate": 2.069842988630211e-05, "loss": 0.1256, "loss_lm": 0.01899485057219863, "loss_seg": 0.10659382492303848, "mean_token_accuracy": 0.9951335787773132, "num_tokens": 1606589982.0, "step": 3780 }, { "entropy": 0.018021341413259506, "epoch": 1.6547762337236023, "grad_norm": 6.3125, "learning_rate": 2.0695722793719544e-05, "loss": 0.0897, "loss_lm": 0.01609425852075219, "loss_seg": 0.07362168468534946, "mean_token_accuracy": 0.9953223168849945, "num_tokens": 1607014254.0, "step": 3781 }, { "entropy": 0.018152805045247078, "epoch": 1.655213918371813, "grad_norm": 5.125, "learning_rate": 2.069301570113698e-05, "loss": 0.0833, "loss_lm": 0.016143698012456298, "loss_seg": 0.06716649886220694, "mean_token_accuracy": 0.9953396171331406, "num_tokens": 1607439276.0, "step": 3782 }, { "entropy": 0.018887754064053297, "epoch": 1.6556516030200241, "grad_norm": 10.9375, "learning_rate": 2.0690308608554415e-05, "loss": 0.1132, "loss_lm": 0.01744392281398177, "loss_seg": 0.09575124084949493, "mean_token_accuracy": 0.9950309097766876, "num_tokens": 1607863666.0, "step": 3783 }, { "entropy": 0.01786930114030838, "epoch": 1.6560892876682352, "grad_norm": 29.875, "learning_rate": 2.0687601515971845e-05, "loss": 0.1174, "loss_lm": 0.016464203130453825, "loss_seg": 0.10097972303628922, "mean_token_accuracy": 0.9954153448343277, "num_tokens": 1608288721.0, "step": 3784 }, { "entropy": 0.01822818908840418, "epoch": 1.656526972316446, "grad_norm": 7.21875, "learning_rate": 2.068489442338928e-05, "loss": 0.0988, "loss_lm": 0.014759742189198732, "loss_seg": 0.08402498066425323, "mean_token_accuracy": 0.9952737390995026, "num_tokens": 1608713525.0, "step": 3785 }, { "entropy": 0.01806391356512904, "epoch": 1.656964656964657, "grad_norm": 21.75, "learning_rate": 2.0682187330806713e-05, "loss": 0.1187, "loss_lm": 0.015731413383036852, "loss_seg": 0.1029647458344698, "mean_token_accuracy": 0.9951834678649902, "num_tokens": 1609139310.0, "step": 3786 }, { "entropy": 0.018752551171928644, "epoch": 1.657402341612868, "grad_norm": 10.3125, "learning_rate": 2.067948023822415e-05, "loss": 0.1526, "loss_lm": 0.016577801201492548, "loss_seg": 0.13602160289883614, "mean_token_accuracy": 0.9950728714466095, "num_tokens": 1609564475.0, "step": 3787 }, { "entropy": 0.01865739095956087, "epoch": 1.6578400262610788, "grad_norm": 4.40625, "learning_rate": 2.067677314564158e-05, "loss": 0.1327, "loss_lm": 0.01817242708057165, "loss_seg": 0.11454515159130096, "mean_token_accuracy": 0.9951309859752655, "num_tokens": 1609989188.0, "step": 3788 }, { "entropy": 0.01805756287649274, "epoch": 1.6582777109092899, "grad_norm": 13.8125, "learning_rate": 2.0674066053059014e-05, "loss": 0.1125, "loss_lm": 0.014398508239537477, "loss_seg": 0.0980930756777525, "mean_token_accuracy": 0.9952612668275833, "num_tokens": 1610413498.0, "step": 3789 }, { "entropy": 0.018830785993486643, "epoch": 1.658715395557501, "grad_norm": 10.125, "learning_rate": 2.0671358960476448e-05, "loss": 0.1162, "loss_lm": 0.014391895616427064, "loss_seg": 0.10183087643235922, "mean_token_accuracy": 0.9952615052461624, "num_tokens": 1610838301.0, "step": 3790 }, { "entropy": 0.01832383032888174, "epoch": 1.6591530802057117, "grad_norm": 4.34375, "learning_rate": 2.066865186789388e-05, "loss": 0.1325, "loss_lm": 0.016908586025238037, "loss_seg": 0.11556940153241158, "mean_token_accuracy": 0.9952360242605209, "num_tokens": 1611263736.0, "step": 3791 }, { "entropy": 0.018624777905642986, "epoch": 1.6595907648539228, "grad_norm": 6.25, "learning_rate": 2.066594477531132e-05, "loss": 0.14, "loss_lm": 0.017276585567742586, "loss_seg": 0.12274044565856457, "mean_token_accuracy": 0.99519844353199, "num_tokens": 1611688954.0, "step": 3792 }, { "entropy": 0.018112202174961567, "epoch": 1.6600284495021338, "grad_norm": 5.34375, "learning_rate": 2.066323768272875e-05, "loss": 0.1078, "loss_lm": 0.01435023220255971, "loss_seg": 0.09349147230386734, "mean_token_accuracy": 0.9953560680150986, "num_tokens": 1612113975.0, "step": 3793 }, { "entropy": 0.01860564900562167, "epoch": 1.6604661341503446, "grad_norm": 7.21875, "learning_rate": 2.0660530590146183e-05, "loss": 0.1101, "loss_lm": 0.015474850311875343, "loss_seg": 0.09465004783123732, "mean_token_accuracy": 0.9951851665973663, "num_tokens": 1612539503.0, "step": 3794 }, { "entropy": 0.018680653534829617, "epoch": 1.6609038187985556, "grad_norm": 9.8125, "learning_rate": 2.0657823497563616e-05, "loss": 0.1788, "loss_lm": 0.016037229215726256, "loss_seg": 0.16278859041631222, "mean_token_accuracy": 0.9950935989618301, "num_tokens": 1612964603.0, "step": 3795 }, { "entropy": 0.017696591094136238, "epoch": 1.6613415034467667, "grad_norm": 11.4375, "learning_rate": 2.065511640498105e-05, "loss": 0.1318, "loss_lm": 0.015333714429289103, "loss_seg": 0.11646217666566372, "mean_token_accuracy": 0.9954245537519455, "num_tokens": 1613388994.0, "step": 3796 }, { "entropy": 0.01866278052330017, "epoch": 1.6617791880949775, "grad_norm": 6.78125, "learning_rate": 2.0652409312398487e-05, "loss": 0.1137, "loss_lm": 0.014202992664650083, "loss_seg": 0.09952025301754475, "mean_token_accuracy": 0.9951295405626297, "num_tokens": 1613814613.0, "step": 3797 }, { "entropy": 0.018176923971623182, "epoch": 1.6622168727431885, "grad_norm": 3.765625, "learning_rate": 2.0649702219815918e-05, "loss": 0.1101, "loss_lm": 0.014906456926837564, "loss_seg": 0.0951782837510109, "mean_token_accuracy": 0.9953079968690872, "num_tokens": 1614239759.0, "step": 3798 }, { "entropy": 0.018868664745241404, "epoch": 1.6626545573913996, "grad_norm": 7.53125, "learning_rate": 2.064699512723335e-05, "loss": 0.1044, "loss_lm": 0.015337741235271096, "loss_seg": 0.08907707408070564, "mean_token_accuracy": 0.9950563013553619, "num_tokens": 1614664307.0, "step": 3799 }, { "entropy": 0.01809209119528532, "epoch": 1.6630922420396104, "grad_norm": 8.8125, "learning_rate": 2.0644288034650785e-05, "loss": 0.1219, "loss_lm": 0.016886016819626093, "loss_seg": 0.10506360046565533, "mean_token_accuracy": 0.9953106045722961, "num_tokens": 1615088826.0, "step": 3800 }, { "entropy": 0.019041860476136208, "epoch": 1.6635299266878214, "grad_norm": 18.375, "learning_rate": 2.064158094206822e-05, "loss": 0.1005, "loss_lm": 0.018625338096171618, "loss_seg": 0.08191759791225195, "mean_token_accuracy": 0.9951486438512802, "num_tokens": 1615514458.0, "step": 3801 }, { "entropy": 0.017967306077480316, "epoch": 1.6639676113360324, "grad_norm": 9.8125, "learning_rate": 2.0638873849485656e-05, "loss": 0.1403, "loss_lm": 0.014604465337470174, "loss_seg": 0.12565126549452543, "mean_token_accuracy": 0.995251476764679, "num_tokens": 1615940042.0, "step": 3802 }, { "entropy": 0.018226915039122105, "epoch": 1.6644052959842432, "grad_norm": 5.09375, "learning_rate": 2.0636166756903086e-05, "loss": 0.1506, "loss_lm": 0.01581695512868464, "loss_seg": 0.1348203308880329, "mean_token_accuracy": 0.9951910376548767, "num_tokens": 1616365019.0, "step": 3803 }, { "entropy": 0.018835792783647776, "epoch": 1.6648429806324543, "grad_norm": 14.3125, "learning_rate": 2.063345966432052e-05, "loss": 0.1536, "loss_lm": 0.02186826290562749, "loss_seg": 0.1317187286913395, "mean_token_accuracy": 0.9951401650905609, "num_tokens": 1616790525.0, "step": 3804 }, { "entropy": 0.018482733983546495, "epoch": 1.6652806652806653, "grad_norm": 7.21875, "learning_rate": 2.0630752571737954e-05, "loss": 0.1326, "loss_lm": 0.016467765672132373, "loss_seg": 0.11613149382174015, "mean_token_accuracy": 0.9951996952295303, "num_tokens": 1617215863.0, "step": 3805 }, { "entropy": 0.018255246337503195, "epoch": 1.6657183499288761, "grad_norm": 8.1875, "learning_rate": 2.0628045479155387e-05, "loss": 0.1782, "loss_lm": 0.01642592065036297, "loss_seg": 0.16179140470921993, "mean_token_accuracy": 0.9952587485313416, "num_tokens": 1617641189.0, "step": 3806 }, { "entropy": 0.01929673133417964, "epoch": 1.6661560345770872, "grad_norm": 5.15625, "learning_rate": 2.0625338386572824e-05, "loss": 0.1626, "loss_lm": 0.015448353253304958, "loss_seg": 0.14718627743422985, "mean_token_accuracy": 0.9952287524938583, "num_tokens": 1618066154.0, "step": 3807 }, { "entropy": 0.018759659491479397, "epoch": 1.6665937192252982, "grad_norm": 14.9375, "learning_rate": 2.0622631293990255e-05, "loss": 0.0758, "loss_lm": 0.01598260342143476, "loss_seg": 0.05983245838433504, "mean_token_accuracy": 0.9952052533626556, "num_tokens": 1618491566.0, "step": 3808 }, { "entropy": 0.018054619431495667, "epoch": 1.667031403873509, "grad_norm": 6.34375, "learning_rate": 2.061992420140769e-05, "loss": 0.1384, "loss_lm": 0.014920072862878442, "loss_seg": 0.12351914308965206, "mean_token_accuracy": 0.9953538626432419, "num_tokens": 1618916818.0, "step": 3809 }, { "entropy": 0.01790728187188506, "epoch": 1.6674690885217203, "grad_norm": 7.5625, "learning_rate": 2.0617217108825122e-05, "loss": 0.107, "loss_lm": 0.015524812042713165, "loss_seg": 0.09143662545830011, "mean_token_accuracy": 0.995296061038971, "num_tokens": 1619341742.0, "step": 3810 }, { "entropy": 0.018536543007940054, "epoch": 1.667906773169931, "grad_norm": 8.4375, "learning_rate": 2.0614510016242556e-05, "loss": 0.1843, "loss_lm": 0.01658765389584005, "loss_seg": 0.16774562932550907, "mean_token_accuracy": 0.9951891899108887, "num_tokens": 1619766593.0, "step": 3811 }, { "entropy": 0.019274260383099318, "epoch": 1.668344457818142, "grad_norm": 15.25, "learning_rate": 2.061180292365999e-05, "loss": 0.0992, "loss_lm": 0.015991837717592716, "loss_seg": 0.08319858089089394, "mean_token_accuracy": 0.9950061142444611, "num_tokens": 1620191500.0, "step": 3812 }, { "entropy": 0.018837085459381342, "epoch": 1.6687821424663531, "grad_norm": 9.3125, "learning_rate": 2.0609095831077423e-05, "loss": 0.0905, "loss_lm": 0.016906821867451072, "loss_seg": 0.07356605120003223, "mean_token_accuracy": 0.9951525032520294, "num_tokens": 1620616230.0, "step": 3813 }, { "entropy": 0.01870825607329607, "epoch": 1.669219827114564, "grad_norm": 6.4375, "learning_rate": 2.0606388738494857e-05, "loss": 0.113, "loss_lm": 0.015249559190124273, "loss_seg": 0.09773979522287846, "mean_token_accuracy": 0.9951461851596832, "num_tokens": 1621041432.0, "step": 3814 }, { "entropy": 0.018287692684680223, "epoch": 1.6696575117627748, "grad_norm": 9.375, "learning_rate": 2.060368164591229e-05, "loss": 0.1322, "loss_lm": 0.015078597469255328, "loss_seg": 0.11715592630207539, "mean_token_accuracy": 0.9953175038099289, "num_tokens": 1621466545.0, "step": 3815 }, { "entropy": 0.018431488424539566, "epoch": 1.670095196410986, "grad_norm": 12.1875, "learning_rate": 2.0600974553329725e-05, "loss": 0.1262, "loss_lm": 0.01629219576716423, "loss_seg": 0.10991064086556435, "mean_token_accuracy": 0.9952719658613205, "num_tokens": 1621892062.0, "step": 3816 }, { "entropy": 0.017775636166334152, "epoch": 1.6705328810591968, "grad_norm": 5.59375, "learning_rate": 2.0598267460747155e-05, "loss": 0.1177, "loss_lm": 0.015782966976985335, "loss_seg": 0.10192880593240261, "mean_token_accuracy": 0.99542535841465, "num_tokens": 1622316077.0, "step": 3817 }, { "entropy": 0.018086436670273542, "epoch": 1.6709705657074077, "grad_norm": 12.25, "learning_rate": 2.0595560368164592e-05, "loss": 0.0936, "loss_lm": 0.01594790956005454, "loss_seg": 0.07768141478300095, "mean_token_accuracy": 0.9953789561986923, "num_tokens": 1622740375.0, "step": 3818 }, { "entropy": 0.018493440002202988, "epoch": 1.671408250355619, "grad_norm": 4.90625, "learning_rate": 2.0592853275582026e-05, "loss": 0.1079, "loss_lm": 0.017028273548930883, "loss_seg": 0.0908559113740921, "mean_token_accuracy": 0.995284229516983, "num_tokens": 1623165493.0, "step": 3819 }, { "entropy": 0.018780001439154148, "epoch": 1.6718459350038297, "grad_norm": 7.1875, "learning_rate": 2.059014618299946e-05, "loss": 0.1626, "loss_lm": 0.017013833159580827, "loss_seg": 0.1455840952694416, "mean_token_accuracy": 0.995233803987503, "num_tokens": 1623590129.0, "step": 3820 }, { "entropy": 0.0185047029517591, "epoch": 1.6722836196520408, "grad_norm": 26.25, "learning_rate": 2.0587439090416893e-05, "loss": 0.1148, "loss_lm": 0.017296313773840666, "loss_seg": 0.0974763659760356, "mean_token_accuracy": 0.9952784031629562, "num_tokens": 1624016673.0, "step": 3821 }, { "entropy": 0.018635396379977465, "epoch": 1.6727213043002518, "grad_norm": 5.5625, "learning_rate": 2.0584731997834324e-05, "loss": 0.1727, "loss_lm": 0.016632129438221455, "loss_seg": 0.15604860894382, "mean_token_accuracy": 0.9952036887407303, "num_tokens": 1624442165.0, "step": 3822 }, { "entropy": 0.018052746541798115, "epoch": 1.6731589889484626, "grad_norm": 20.125, "learning_rate": 2.058202490525176e-05, "loss": 0.0789, "loss_lm": 0.014333063503727317, "loss_seg": 0.06460127420723438, "mean_token_accuracy": 0.9953272044658661, "num_tokens": 1624866711.0, "step": 3823 }, { "entropy": 0.018874506000429392, "epoch": 1.6735966735966736, "grad_norm": 4.4375, "learning_rate": 2.0579317812669194e-05, "loss": 0.1585, "loss_lm": 0.018126489594578743, "loss_seg": 0.14034353941679, "mean_token_accuracy": 0.9949987232685089, "num_tokens": 1625291183.0, "step": 3824 }, { "entropy": 0.018152814358472824, "epoch": 1.6740343582448847, "grad_norm": 9.8125, "learning_rate": 2.0576610720086628e-05, "loss": 0.1578, "loss_lm": 0.016784688690677285, "loss_seg": 0.14105384796857834, "mean_token_accuracy": 0.9954017549753189, "num_tokens": 1625715629.0, "step": 3825 }, { "entropy": 0.018496345728635788, "epoch": 1.6744720428930955, "grad_norm": 5.28125, "learning_rate": 2.0573903627504062e-05, "loss": 0.1029, "loss_lm": 0.015612123999744654, "loss_seg": 0.08730258513242006, "mean_token_accuracy": 0.9952229559421539, "num_tokens": 1626139720.0, "step": 3826 }, { "entropy": 0.018429302610456944, "epoch": 1.6749097275413065, "grad_norm": 7.96875, "learning_rate": 2.0571196534921492e-05, "loss": 0.1347, "loss_lm": 0.015283695189282298, "loss_seg": 0.11942759528756142, "mean_token_accuracy": 0.9951683282852173, "num_tokens": 1626564818.0, "step": 3827 }, { "entropy": 0.017979854717850685, "epoch": 1.6753474121895175, "grad_norm": 8.5, "learning_rate": 2.056848944233893e-05, "loss": 0.1076, "loss_lm": 0.014457521261647344, "loss_seg": 0.09318667650222778, "mean_token_accuracy": 0.9953850656747818, "num_tokens": 1626989463.0, "step": 3828 }, { "entropy": 0.01867178687825799, "epoch": 1.6757850968377284, "grad_norm": 7.375, "learning_rate": 2.0565782349756363e-05, "loss": 0.0994, "loss_lm": 0.0156736490316689, "loss_seg": 0.08368632569909096, "mean_token_accuracy": 0.9951628297567368, "num_tokens": 1627415296.0, "step": 3829 }, { "entropy": 0.018063655588775873, "epoch": 1.6762227814859394, "grad_norm": 7.875, "learning_rate": 2.0563075257173797e-05, "loss": 0.1113, "loss_lm": 0.017373204929754138, "loss_seg": 0.09388424456119537, "mean_token_accuracy": 0.9951848387718201, "num_tokens": 1627840566.0, "step": 3830 }, { "entropy": 0.01884079212322831, "epoch": 1.6766604661341504, "grad_norm": 6.28125, "learning_rate": 2.056036816459123e-05, "loss": 0.1245, "loss_lm": 0.01686475845053792, "loss_seg": 0.1076640672981739, "mean_token_accuracy": 0.9952663779258728, "num_tokens": 1628265791.0, "step": 3831 }, { "entropy": 0.018619306851178408, "epoch": 1.6770981507823612, "grad_norm": 3.203125, "learning_rate": 2.055766107200866e-05, "loss": 0.0955, "loss_lm": 0.01639118348248303, "loss_seg": 0.07907593436539173, "mean_token_accuracy": 0.9951670318841934, "num_tokens": 1628691353.0, "step": 3832 }, { "entropy": 0.018852591514587402, "epoch": 1.6775358354305723, "grad_norm": 17.375, "learning_rate": 2.0554953979426098e-05, "loss": 0.1998, "loss_lm": 0.01672055758535862, "loss_seg": 0.1830632295459509, "mean_token_accuracy": 0.9950847029685974, "num_tokens": 1629116136.0, "step": 3833 }, { "entropy": 0.018400454428046942, "epoch": 1.6779735200787833, "grad_norm": 13.25, "learning_rate": 2.0552246886843532e-05, "loss": 0.1437, "loss_lm": 0.015607088804244995, "loss_seg": 0.12814252637326717, "mean_token_accuracy": 0.9951852262020111, "num_tokens": 1629540829.0, "step": 3834 }, { "entropy": 0.018310965038836002, "epoch": 1.6784112047269941, "grad_norm": 4.625, "learning_rate": 2.0549539794260965e-05, "loss": 0.1315, "loss_lm": 0.014733969932422042, "loss_seg": 0.11674335971474648, "mean_token_accuracy": 0.9951421320438385, "num_tokens": 1629965493.0, "step": 3835 }, { "entropy": 0.018357552122324705, "epoch": 1.6788488893752052, "grad_norm": 4.5, "learning_rate": 2.0546832701678396e-05, "loss": 0.1138, "loss_lm": 0.013693502405658364, "loss_seg": 0.10013747587800026, "mean_token_accuracy": 0.9952659755945206, "num_tokens": 1630390644.0, "step": 3836 }, { "entropy": 0.018726180773228407, "epoch": 1.6792865740234162, "grad_norm": 7.46875, "learning_rate": 2.054412560909583e-05, "loss": 0.1134, "loss_lm": 0.01717814477160573, "loss_seg": 0.0962477345019579, "mean_token_accuracy": 0.9951656460762024, "num_tokens": 1630815468.0, "step": 3837 }, { "entropy": 0.018200446851551533, "epoch": 1.679724258671627, "grad_norm": 14.375, "learning_rate": 2.0541418516513267e-05, "loss": 0.1332, "loss_lm": 0.016631938749924302, "loss_seg": 0.11656972020864487, "mean_token_accuracy": 0.995291605591774, "num_tokens": 1631239978.0, "step": 3838 }, { "entropy": 0.017929826397448778, "epoch": 1.680161943319838, "grad_norm": 4.5625, "learning_rate": 2.05387114239307e-05, "loss": 0.0878, "loss_lm": 0.015291011426597834, "loss_seg": 0.07248975150287151, "mean_token_accuracy": 0.9952996671199799, "num_tokens": 1631664779.0, "step": 3839 }, { "entropy": 0.018427737522870302, "epoch": 1.680599627968049, "grad_norm": 9.5625, "learning_rate": 2.0536004331348134e-05, "loss": 0.1411, "loss_lm": 0.016516985837370157, "loss_seg": 0.1245849709957838, "mean_token_accuracy": 0.9952332377433777, "num_tokens": 1632089351.0, "step": 3840 }, { "entropy": 0.018141611479222775, "epoch": 1.6810373126162599, "grad_norm": 11.4375, "learning_rate": 2.0533297238765564e-05, "loss": 0.0763, "loss_lm": 0.014901449671015143, "loss_seg": 0.061390919610857964, "mean_token_accuracy": 0.9952408820390701, "num_tokens": 1632513767.0, "step": 3841 }, { "entropy": 0.0178671614266932, "epoch": 1.681474997264471, "grad_norm": 6.71875, "learning_rate": 2.0530590146182998e-05, "loss": 0.12, "loss_lm": 0.015205024974420667, "loss_seg": 0.10476231761276722, "mean_token_accuracy": 0.9953033924102783, "num_tokens": 1632938276.0, "step": 3842 }, { "entropy": 0.018134398385882378, "epoch": 1.681912681912682, "grad_norm": 15.0625, "learning_rate": 2.0527883053600435e-05, "loss": 0.1176, "loss_lm": 0.014991083182394505, "loss_seg": 0.10258037131279707, "mean_token_accuracy": 0.9952463805675507, "num_tokens": 1633363187.0, "step": 3843 }, { "entropy": 0.018660199362784624, "epoch": 1.6823503665608928, "grad_norm": 9.625, "learning_rate": 2.052517596101787e-05, "loss": 0.1095, "loss_lm": 0.016906019300222397, "loss_seg": 0.09255817905068398, "mean_token_accuracy": 0.9952214956283569, "num_tokens": 1633788799.0, "step": 3844 }, { "entropy": 0.018256905488669872, "epoch": 1.6827880512091038, "grad_norm": 7.25, "learning_rate": 2.0522468868435303e-05, "loss": 0.1384, "loss_lm": 0.015986508456990123, "loss_seg": 0.12242166697978973, "mean_token_accuracy": 0.9952986985445023, "num_tokens": 1634213980.0, "step": 3845 }, { "entropy": 0.018180835992097855, "epoch": 1.6832257358573148, "grad_norm": 124.5, "learning_rate": 2.0519761775852733e-05, "loss": 0.0995, "loss_lm": 0.015234474092721939, "loss_seg": 0.08430176507681608, "mean_token_accuracy": 0.9953283220529556, "num_tokens": 1634638860.0, "step": 3846 }, { "entropy": 0.01875128038227558, "epoch": 1.6836634205055256, "grad_norm": 61.75, "learning_rate": 2.0517054683270167e-05, "loss": 0.1297, "loss_lm": 0.016041565453633666, "loss_seg": 0.11365378089249134, "mean_token_accuracy": 0.9950544536113739, "num_tokens": 1635064064.0, "step": 3847 }, { "entropy": 0.01781789492815733, "epoch": 1.684101105153737, "grad_norm": 6.75, "learning_rate": 2.05143475906876e-05, "loss": 0.1449, "loss_lm": 0.013904892140999436, "loss_seg": 0.13094956055283546, "mean_token_accuracy": 0.9954189658164978, "num_tokens": 1635489455.0, "step": 3848 }, { "entropy": 0.01792485872283578, "epoch": 1.6845387898019477, "grad_norm": 8.6875, "learning_rate": 2.0511640498105038e-05, "loss": 0.1047, "loss_lm": 0.018130866810679436, "loss_seg": 0.08655756898224354, "mean_token_accuracy": 0.9953323155641556, "num_tokens": 1635914682.0, "step": 3849 }, { "entropy": 0.017947352956980467, "epoch": 1.6849764744501585, "grad_norm": 4.15625, "learning_rate": 2.050893340552247e-05, "loss": 0.1002, "loss_lm": 0.015591800445690751, "loss_seg": 0.08463696949183941, "mean_token_accuracy": 0.9953639954328537, "num_tokens": 1636339333.0, "step": 3850 }, { "entropy": 0.018879693932831287, "epoch": 1.6854141590983698, "grad_norm": 6.34375, "learning_rate": 2.05062263129399e-05, "loss": 0.0994, "loss_lm": 0.015485712327063084, "loss_seg": 0.08386807795614004, "mean_token_accuracy": 0.9951554834842682, "num_tokens": 1636764310.0, "step": 3851 }, { "entropy": 0.01848543155938387, "epoch": 1.6858518437465806, "grad_norm": 4.78125, "learning_rate": 2.0503519220357335e-05, "loss": 0.1016, "loss_lm": 0.016794573981314898, "loss_seg": 0.08482616022229195, "mean_token_accuracy": 0.9952295124530792, "num_tokens": 1637189403.0, "step": 3852 }, { "entropy": 0.018724811729043722, "epoch": 1.6862895283947914, "grad_norm": 7.125, "learning_rate": 2.050081212777477e-05, "loss": 0.1493, "loss_lm": 0.017864362336695194, "loss_seg": 0.13146637380123138, "mean_token_accuracy": 0.9951930940151215, "num_tokens": 1637614813.0, "step": 3853 }, { "entropy": 0.017673530150204897, "epoch": 1.6867272130430027, "grad_norm": 7.5625, "learning_rate": 2.0498105035192206e-05, "loss": 0.1379, "loss_lm": 0.016517714830115438, "loss_seg": 0.12137158121913671, "mean_token_accuracy": 0.9953329563140869, "num_tokens": 1638039428.0, "step": 3854 }, { "entropy": 0.018390648998320103, "epoch": 1.6871648976912135, "grad_norm": 26.0, "learning_rate": 2.049539794260964e-05, "loss": 0.2205, "loss_lm": 0.01695151487365365, "loss_seg": 0.20353832095861435, "mean_token_accuracy": 0.9952238649129868, "num_tokens": 1638465039.0, "step": 3855 }, { "entropy": 0.018803785555064678, "epoch": 1.6876025823394243, "grad_norm": 10.875, "learning_rate": 2.049269085002707e-05, "loss": 0.1, "loss_lm": 0.015636737225577235, "loss_seg": 0.08440624829381704, "mean_token_accuracy": 0.9951465129852295, "num_tokens": 1638890694.0, "step": 3856 }, { "entropy": 0.01875367946922779, "epoch": 1.6880402669876355, "grad_norm": 20.75, "learning_rate": 2.0489983757444504e-05, "loss": 0.0992, "loss_lm": 0.016006257850676775, "loss_seg": 0.08320303354412317, "mean_token_accuracy": 0.9952462762594223, "num_tokens": 1639315179.0, "step": 3857 }, { "entropy": 0.018656110856682062, "epoch": 1.6884779516358464, "grad_norm": 15.5, "learning_rate": 2.0487276664861938e-05, "loss": 0.0843, "loss_lm": 0.016319354763254523, "loss_seg": 0.06793460249900818, "mean_token_accuracy": 0.995034247636795, "num_tokens": 1639740464.0, "step": 3858 }, { "entropy": 0.01816943660378456, "epoch": 1.6889156362840574, "grad_norm": 4.59375, "learning_rate": 2.0484569572279375e-05, "loss": 0.0965, "loss_lm": 0.014567746547982097, "loss_seg": 0.08194741047918797, "mean_token_accuracy": 0.9952587634325027, "num_tokens": 1640165626.0, "step": 3859 }, { "entropy": 0.018449362833052874, "epoch": 1.6893533209322684, "grad_norm": 33.75, "learning_rate": 2.0481862479696805e-05, "loss": 0.1102, "loss_lm": 0.015977299073711038, "loss_seg": 0.0941829951480031, "mean_token_accuracy": 0.9951989203691483, "num_tokens": 1640591102.0, "step": 3860 }, { "entropy": 0.01850053109228611, "epoch": 1.6897910055804792, "grad_norm": 8.0625, "learning_rate": 2.047915538711424e-05, "loss": 0.1037, "loss_lm": 0.01688936003483832, "loss_seg": 0.0868064183741808, "mean_token_accuracy": 0.9952190816402435, "num_tokens": 1641016298.0, "step": 3861 }, { "entropy": 0.018040125258266926, "epoch": 1.6902286902286903, "grad_norm": 14.6875, "learning_rate": 2.0476448294531673e-05, "loss": 0.1413, "loss_lm": 0.015809464966878295, "loss_seg": 0.12549890018999577, "mean_token_accuracy": 0.9953068345785141, "num_tokens": 1641441356.0, "step": 3862 }, { "entropy": 0.018094394356012344, "epoch": 1.6906663748769013, "grad_norm": 6.15625, "learning_rate": 2.0473741201949106e-05, "loss": 0.0941, "loss_lm": 0.014054248807951808, "loss_seg": 0.08003388065844774, "mean_token_accuracy": 0.9952698200941086, "num_tokens": 1641866672.0, "step": 3863 }, { "entropy": 0.018305618781596422, "epoch": 1.6911040595251121, "grad_norm": 5.34375, "learning_rate": 2.0471034109366543e-05, "loss": 0.1015, "loss_lm": 0.016249351436272264, "loss_seg": 0.08524078875780106, "mean_token_accuracy": 0.9950649589300156, "num_tokens": 1642291557.0, "step": 3864 }, { "entropy": 0.01798910927027464, "epoch": 1.6915417441733231, "grad_norm": 3.96875, "learning_rate": 2.0468327016783974e-05, "loss": 0.1101, "loss_lm": 0.013597696553915739, "loss_seg": 0.09646955877542496, "mean_token_accuracy": 0.9953605979681015, "num_tokens": 1642716323.0, "step": 3865 }, { "entropy": 0.01822106959298253, "epoch": 1.6919794288215342, "grad_norm": 9.4375, "learning_rate": 2.0465619924201408e-05, "loss": 0.1218, "loss_lm": 0.014786520740017295, "loss_seg": 0.107060257345438, "mean_token_accuracy": 0.9952019304037094, "num_tokens": 1643141472.0, "step": 3866 }, { "entropy": 0.018444866873323917, "epoch": 1.692417113469745, "grad_norm": 8.3125, "learning_rate": 2.046291283161884e-05, "loss": 0.0944, "loss_lm": 0.015900761587545276, "loss_seg": 0.07852419838309288, "mean_token_accuracy": 0.9951768070459366, "num_tokens": 1643566270.0, "step": 3867 }, { "entropy": 0.01828515948727727, "epoch": 1.692854798117956, "grad_norm": 5.90625, "learning_rate": 2.0460205739036275e-05, "loss": 0.1011, "loss_lm": 0.0165815819054842, "loss_seg": 0.08447828702628613, "mean_token_accuracy": 0.9952552318572998, "num_tokens": 1643991123.0, "step": 3868 }, { "entropy": 0.018552500754594803, "epoch": 1.693292482766167, "grad_norm": 20.125, "learning_rate": 2.0457498646453712e-05, "loss": 0.114, "loss_lm": 0.016754661221057177, "loss_seg": 0.09729475993663073, "mean_token_accuracy": 0.9952752590179443, "num_tokens": 1644415878.0, "step": 3869 }, { "entropy": 0.018333593849092722, "epoch": 1.6937301674143779, "grad_norm": 8.1875, "learning_rate": 2.0454791553871142e-05, "loss": 0.1073, "loss_lm": 0.015606360509991646, "loss_seg": 0.09170425496995449, "mean_token_accuracy": 0.9952805787324905, "num_tokens": 1644841076.0, "step": 3870 }, { "entropy": 0.018628914840519428, "epoch": 1.694167852062589, "grad_norm": 12.75, "learning_rate": 2.0452084461288576e-05, "loss": 0.116, "loss_lm": 0.017510825535282493, "loss_seg": 0.09850704483687878, "mean_token_accuracy": 0.9951540976762772, "num_tokens": 1645266528.0, "step": 3871 }, { "entropy": 0.018560446798801422, "epoch": 1.6946055367108, "grad_norm": 6.21875, "learning_rate": 2.044937736870601e-05, "loss": 0.1259, "loss_lm": 0.013887068955227733, "loss_seg": 0.11204229295253754, "mean_token_accuracy": 0.9951730966567993, "num_tokens": 1645691803.0, "step": 3872 }, { "entropy": 0.018502723891288042, "epoch": 1.6950432213590108, "grad_norm": 5.375, "learning_rate": 2.0446670276123444e-05, "loss": 0.1146, "loss_lm": 0.016424536937847733, "loss_seg": 0.09816304966807365, "mean_token_accuracy": 0.9951283931732178, "num_tokens": 1646116436.0, "step": 3873 }, { "entropy": 0.018468037247657776, "epoch": 1.6954809060072218, "grad_norm": 13.25, "learning_rate": 2.044396318354088e-05, "loss": 0.154, "loss_lm": 0.017105293227359653, "loss_seg": 0.13690759893506765, "mean_token_accuracy": 0.9951702654361725, "num_tokens": 1646542520.0, "step": 3874 }, { "entropy": 0.018291575368493795, "epoch": 1.6959185906554328, "grad_norm": 22.875, "learning_rate": 2.044125609095831e-05, "loss": 0.1578, "loss_lm": 0.017129714600741863, "loss_seg": 0.14067843183875084, "mean_token_accuracy": 0.9952500015497208, "num_tokens": 1646967325.0, "step": 3875 }, { "entropy": 0.018775095231831074, "epoch": 1.6963562753036436, "grad_norm": 13.4375, "learning_rate": 2.0438548998375745e-05, "loss": 0.1024, "loss_lm": 0.01459327689372003, "loss_seg": 0.08784602582454681, "mean_token_accuracy": 0.9950056672096252, "num_tokens": 1647392588.0, "step": 3876 }, { "entropy": 0.018303074408322573, "epoch": 1.6967939599518547, "grad_norm": 8.25, "learning_rate": 2.043584190579318e-05, "loss": 0.1249, "loss_lm": 0.015297360252588987, "loss_seg": 0.10961347445845604, "mean_token_accuracy": 0.9952820837497711, "num_tokens": 1647817913.0, "step": 3877 }, { "entropy": 0.018006897997111082, "epoch": 1.6972316446000657, "grad_norm": 13.3125, "learning_rate": 2.0433134813210612e-05, "loss": 0.1402, "loss_lm": 0.015568849630653858, "loss_seg": 0.12458759732544422, "mean_token_accuracy": 0.9953708201646805, "num_tokens": 1648243853.0, "step": 3878 }, { "entropy": 0.018447380047291517, "epoch": 1.6976693292482765, "grad_norm": 10.0, "learning_rate": 2.0430427720628046e-05, "loss": 0.1193, "loss_lm": 0.017709969310089946, "loss_seg": 0.10160389356315136, "mean_token_accuracy": 0.9952311962842941, "num_tokens": 1648668608.0, "step": 3879 }, { "entropy": 0.018440529704093933, "epoch": 1.6981070138964875, "grad_norm": 11.3125, "learning_rate": 2.042772062804548e-05, "loss": 0.1266, "loss_lm": 0.015261163003742695, "loss_seg": 0.11137884575873613, "mean_token_accuracy": 0.9952146112918854, "num_tokens": 1649094024.0, "step": 3880 }, { "entropy": 0.018319048453122377, "epoch": 1.6985446985446986, "grad_norm": 3.734375, "learning_rate": 2.0425013535462913e-05, "loss": 0.1384, "loss_lm": 0.01665106904692948, "loss_seg": 0.12179174087941647, "mean_token_accuracy": 0.9953108131885529, "num_tokens": 1649519847.0, "step": 3881 }, { "entropy": 0.018221880309283733, "epoch": 1.6989823831929094, "grad_norm": 5.9375, "learning_rate": 2.0422306442880347e-05, "loss": 0.1277, "loss_lm": 0.017303844448179007, "loss_seg": 0.11037589609622955, "mean_token_accuracy": 0.9952956289052963, "num_tokens": 1649945346.0, "step": 3882 }, { "entropy": 0.018469979986548424, "epoch": 1.6994200678411204, "grad_norm": 6.84375, "learning_rate": 2.041959935029778e-05, "loss": 0.1104, "loss_lm": 0.01911661960184574, "loss_seg": 0.0912745613604784, "mean_token_accuracy": 0.9951755404472351, "num_tokens": 1650370231.0, "step": 3883 }, { "entropy": 0.018068262841552496, "epoch": 1.6998577524893315, "grad_norm": 9.25, "learning_rate": 2.041689225771521e-05, "loss": 0.1226, "loss_lm": 0.017874349607154727, "loss_seg": 0.10476166196167469, "mean_token_accuracy": 0.9952564090490341, "num_tokens": 1650794956.0, "step": 3884 }, { "entropy": 0.01824059011414647, "epoch": 1.7002954371375423, "grad_norm": 3.40625, "learning_rate": 2.041418516513265e-05, "loss": 0.0825, "loss_lm": 0.014017919078469276, "loss_seg": 0.06852809991687536, "mean_token_accuracy": 0.9953625798225403, "num_tokens": 1651220780.0, "step": 3885 }, { "entropy": 0.01807315554469824, "epoch": 1.7007331217857533, "grad_norm": 8.5, "learning_rate": 2.0411478072550082e-05, "loss": 0.0847, "loss_lm": 0.015431315638124943, "loss_seg": 0.06924798619002104, "mean_token_accuracy": 0.9952254295349121, "num_tokens": 1651645918.0, "step": 3886 }, { "entropy": 0.017922991421073675, "epoch": 1.7011708064339643, "grad_norm": 6.25, "learning_rate": 2.0408770979967516e-05, "loss": 0.1576, "loss_lm": 0.01584965572692454, "loss_seg": 0.14170059002935886, "mean_token_accuracy": 0.9953764826059341, "num_tokens": 1652070233.0, "step": 3887 }, { "entropy": 0.018496659584343433, "epoch": 1.7016084910821752, "grad_norm": 4.65625, "learning_rate": 2.040606388738495e-05, "loss": 0.1918, "loss_lm": 0.015404760371893644, "loss_seg": 0.17642373964190483, "mean_token_accuracy": 0.9951777905225754, "num_tokens": 1652495310.0, "step": 3888 }, { "entropy": 0.018568484112620354, "epoch": 1.7020461757303864, "grad_norm": 7.40625, "learning_rate": 2.040335679480238e-05, "loss": 0.1164, "loss_lm": 0.016427448485046625, "loss_seg": 0.09993815049529076, "mean_token_accuracy": 0.99509397149086, "num_tokens": 1652919859.0, "step": 3889 }, { "entropy": 0.019120244774967432, "epoch": 1.7024838603785972, "grad_norm": 5.40625, "learning_rate": 2.0400649702219817e-05, "loss": 0.092, "loss_lm": 0.01774103636853397, "loss_seg": 0.07426081970334053, "mean_token_accuracy": 0.9951120764017105, "num_tokens": 1653345537.0, "step": 3890 }, { "entropy": 0.01800451846793294, "epoch": 1.702921545026808, "grad_norm": 4.6875, "learning_rate": 2.039794260963725e-05, "loss": 0.138, "loss_lm": 0.015074585331603885, "loss_seg": 0.1229473426938057, "mean_token_accuracy": 0.99542635679245, "num_tokens": 1653770530.0, "step": 3891 }, { "entropy": 0.019268683157861233, "epoch": 1.7033592296750193, "grad_norm": 11.5, "learning_rate": 2.0395235517054684e-05, "loss": 0.1194, "loss_lm": 0.016877269139513373, "loss_seg": 0.10247356072068214, "mean_token_accuracy": 0.9950399100780487, "num_tokens": 1654196104.0, "step": 3892 }, { "entropy": 0.018510145135223866, "epoch": 1.70379691432323, "grad_norm": 4.78125, "learning_rate": 2.0392528424472118e-05, "loss": 0.0922, "loss_lm": 0.014463430969044566, "loss_seg": 0.07768868841230869, "mean_token_accuracy": 0.9953636229038239, "num_tokens": 1654620922.0, "step": 3893 }, { "entropy": 0.018648301251232624, "epoch": 1.704234598971441, "grad_norm": 5.15625, "learning_rate": 2.038982133188955e-05, "loss": 0.0943, "loss_lm": 0.01364467921666801, "loss_seg": 0.0806202832609415, "mean_token_accuracy": 0.9952104836702347, "num_tokens": 1655045979.0, "step": 3894 }, { "entropy": 0.018353635910898447, "epoch": 1.7046722836196522, "grad_norm": 7.75, "learning_rate": 2.0387114239306986e-05, "loss": 0.1355, "loss_lm": 0.017054350581020117, "loss_seg": 0.11845598742365837, "mean_token_accuracy": 0.9951788187026978, "num_tokens": 1655471366.0, "step": 3895 }, { "entropy": 0.018577166367322206, "epoch": 1.705109968267863, "grad_norm": 3.859375, "learning_rate": 2.038440714672442e-05, "loss": 0.1453, "loss_lm": 0.015343728242442012, "loss_seg": 0.12998046167194843, "mean_token_accuracy": 0.9952341914176941, "num_tokens": 1655896010.0, "step": 3896 }, { "entropy": 0.018809136003255844, "epoch": 1.705547652916074, "grad_norm": 5.59375, "learning_rate": 2.0381700054141853e-05, "loss": 0.112, "loss_lm": 0.01813156809657812, "loss_seg": 0.09382943995296955, "mean_token_accuracy": 0.9952147603034973, "num_tokens": 1656321226.0, "step": 3897 }, { "entropy": 0.01878037117421627, "epoch": 1.705985337564285, "grad_norm": 22.5, "learning_rate": 2.0378992961559287e-05, "loss": 0.1278, "loss_lm": 0.01763131353072822, "loss_seg": 0.11016703769564629, "mean_token_accuracy": 0.9950980544090271, "num_tokens": 1656746783.0, "step": 3898 }, { "entropy": 0.019095053896307945, "epoch": 1.7064230222124959, "grad_norm": 11.4375, "learning_rate": 2.0376285868976717e-05, "loss": 0.1278, "loss_lm": 0.01757711241953075, "loss_seg": 0.11025141179561615, "mean_token_accuracy": 0.9950723052024841, "num_tokens": 1657172353.0, "step": 3899 }, { "entropy": 0.019095947034657, "epoch": 1.706860706860707, "grad_norm": 7.5625, "learning_rate": 2.0373578776394154e-05, "loss": 0.1157, "loss_lm": 0.015373715665191412, "loss_seg": 0.10033890046179295, "mean_token_accuracy": 0.9950592666864395, "num_tokens": 1657597365.0, "step": 3900 }, { "entropy": 0.018670711666345596, "epoch": 1.707298391508918, "grad_norm": 7.1875, "learning_rate": 2.0370871683811588e-05, "loss": 0.1223, "loss_lm": 0.019497538218274713, "loss_seg": 0.10278846137225628, "mean_token_accuracy": 0.9950583577156067, "num_tokens": 1658022908.0, "step": 3901 }, { "entropy": 0.017677203752100468, "epoch": 1.7077360761571287, "grad_norm": 26.25, "learning_rate": 2.036816459122902e-05, "loss": 0.1387, "loss_lm": 0.015452724881470203, "loss_seg": 0.12321999110281467, "mean_token_accuracy": 0.9954087287187576, "num_tokens": 1658448229.0, "step": 3902 }, { "entropy": 0.01844447059556842, "epoch": 1.7081737608053398, "grad_norm": 6.28125, "learning_rate": 2.0365457498646452e-05, "loss": 0.1361, "loss_lm": 0.014997630845755339, "loss_seg": 0.12105371989309788, "mean_token_accuracy": 0.9951617419719696, "num_tokens": 1658873351.0, "step": 3903 }, { "entropy": 0.01921568438410759, "epoch": 1.7086114454535508, "grad_norm": 7.78125, "learning_rate": 2.0362750406063886e-05, "loss": 0.1271, "loss_lm": 0.01718545937910676, "loss_seg": 0.10995202884078026, "mean_token_accuracy": 0.995083749294281, "num_tokens": 1659298450.0, "step": 3904 }, { "entropy": 0.018174002412706614, "epoch": 1.7090491301017616, "grad_norm": 9.3125, "learning_rate": 2.0360043313481323e-05, "loss": 0.0989, "loss_lm": 0.015184224350377917, "loss_seg": 0.08367795217782259, "mean_token_accuracy": 0.995412677526474, "num_tokens": 1659723046.0, "step": 3905 }, { "entropy": 0.018169219605624676, "epoch": 1.7094868147499727, "grad_norm": 11.3125, "learning_rate": 2.0357336220898757e-05, "loss": 0.1211, "loss_lm": 0.016243295976892114, "loss_seg": 0.10481473244726658, "mean_token_accuracy": 0.9952736496925354, "num_tokens": 1660147630.0, "step": 3906 }, { "entropy": 0.01837609289214015, "epoch": 1.7099244993981837, "grad_norm": 9.625, "learning_rate": 2.035462912831619e-05, "loss": 0.1504, "loss_lm": 0.0178970939014107, "loss_seg": 0.13245606422424316, "mean_token_accuracy": 0.9952413737773895, "num_tokens": 1660572935.0, "step": 3907 }, { "entropy": 0.018921601120382547, "epoch": 1.7103621840463945, "grad_norm": 22.75, "learning_rate": 2.035192203573362e-05, "loss": 0.1511, "loss_lm": 0.017671063542366028, "loss_seg": 0.13344858773052692, "mean_token_accuracy": 0.9950385987758636, "num_tokens": 1660998389.0, "step": 3908 }, { "entropy": 0.018125014379620552, "epoch": 1.7107998686946055, "grad_norm": 9.375, "learning_rate": 2.0349214943151054e-05, "loss": 0.1275, "loss_lm": 0.015935269184410572, "loss_seg": 0.11152340471744537, "mean_token_accuracy": 0.9952811151742935, "num_tokens": 1661423650.0, "step": 3909 }, { "entropy": 0.018321425653994083, "epoch": 1.7112375533428166, "grad_norm": 6.1875, "learning_rate": 2.034650785056849e-05, "loss": 0.0981, "loss_lm": 0.014524905942380428, "loss_seg": 0.08357759844511747, "mean_token_accuracy": 0.9952353835105896, "num_tokens": 1661849262.0, "step": 3910 }, { "entropy": 0.017803646158427, "epoch": 1.7116752379910274, "grad_norm": 5.28125, "learning_rate": 2.0343800757985925e-05, "loss": 0.1351, "loss_lm": 0.015192090068012476, "loss_seg": 0.11991327814757824, "mean_token_accuracy": 0.9953432679176331, "num_tokens": 1662274421.0, "step": 3911 }, { "entropy": 0.01822078274562955, "epoch": 1.7121129226392384, "grad_norm": 9.375, "learning_rate": 2.034109366540336e-05, "loss": 0.1259, "loss_lm": 0.017956714145839214, "loss_seg": 0.10790468566119671, "mean_token_accuracy": 0.9953517913818359, "num_tokens": 1662699909.0, "step": 3912 }, { "entropy": 0.017967933788895607, "epoch": 1.7125506072874495, "grad_norm": 5.25, "learning_rate": 2.033838657282079e-05, "loss": 0.0804, "loss_lm": 0.014342528767883778, "loss_seg": 0.06604471802711487, "mean_token_accuracy": 0.9954437166452408, "num_tokens": 1663124747.0, "step": 3913 }, { "entropy": 0.018602233845740557, "epoch": 1.7129882919356603, "grad_norm": 13.5625, "learning_rate": 2.0335679480238223e-05, "loss": 0.1333, "loss_lm": 0.01761963265016675, "loss_seg": 0.11566687561571598, "mean_token_accuracy": 0.9951647371053696, "num_tokens": 1663550126.0, "step": 3914 }, { "entropy": 0.01846429333090782, "epoch": 1.7134259765838713, "grad_norm": 5.90625, "learning_rate": 2.0332972387655657e-05, "loss": 0.124, "loss_lm": 0.017028582049533725, "loss_seg": 0.1069376952946186, "mean_token_accuracy": 0.9951884150505066, "num_tokens": 1663975740.0, "step": 3915 }, { "entropy": 0.018426805268973112, "epoch": 1.7138636612320823, "grad_norm": 25.125, "learning_rate": 2.0330265295073094e-05, "loss": 0.1088, "loss_lm": 0.01885854545980692, "loss_seg": 0.08996868319809437, "mean_token_accuracy": 0.9953431189060211, "num_tokens": 1664401107.0, "step": 3916 }, { "entropy": 0.018831232096999884, "epoch": 1.7143013458802931, "grad_norm": 9.3125, "learning_rate": 2.0327558202490528e-05, "loss": 0.0985, "loss_lm": 0.017108259489759803, "loss_seg": 0.08137038443237543, "mean_token_accuracy": 0.9951355457305908, "num_tokens": 1664826609.0, "step": 3917 }, { "entropy": 0.018706720788031816, "epoch": 1.7147390305285042, "grad_norm": 5.96875, "learning_rate": 2.0324851109907958e-05, "loss": 0.1368, "loss_lm": 0.0170590381603688, "loss_seg": 0.11969967000186443, "mean_token_accuracy": 0.9949808716773987, "num_tokens": 1665251173.0, "step": 3918 }, { "entropy": 0.018605442252010107, "epoch": 1.7151767151767152, "grad_norm": 5.875, "learning_rate": 2.032214401732539e-05, "loss": 0.1143, "loss_lm": 0.01582408114336431, "loss_seg": 0.09848431125283241, "mean_token_accuracy": 0.9951558858156204, "num_tokens": 1665677010.0, "step": 3919 }, { "entropy": 0.018835148308426142, "epoch": 1.715614399824926, "grad_norm": 10.6875, "learning_rate": 2.0319436924742825e-05, "loss": 0.1049, "loss_lm": 0.014500517398118973, "loss_seg": 0.0904092788696289, "mean_token_accuracy": 0.9951553791761398, "num_tokens": 1666101991.0, "step": 3920 }, { "entropy": 0.018625243101269007, "epoch": 1.716052084473137, "grad_norm": 3.359375, "learning_rate": 2.0316729832160262e-05, "loss": 0.1588, "loss_lm": 0.0154860969632864, "loss_seg": 0.14331138134002686, "mean_token_accuracy": 0.9951692819595337, "num_tokens": 1666526728.0, "step": 3921 }, { "entropy": 0.019060809165239334, "epoch": 1.716489769121348, "grad_norm": 7.125, "learning_rate": 2.0314022739577696e-05, "loss": 0.1489, "loss_lm": 0.016448101261630654, "loss_seg": 0.13249242678284645, "mean_token_accuracy": 0.9951598048210144, "num_tokens": 1666952713.0, "step": 3922 }, { "entropy": 0.018118632491678, "epoch": 1.716927453769559, "grad_norm": 12.9375, "learning_rate": 2.0311315646995127e-05, "loss": 0.1171, "loss_lm": 0.014102576300501823, "loss_seg": 0.1030375212430954, "mean_token_accuracy": 0.9952979236841202, "num_tokens": 1667376954.0, "step": 3923 }, { "entropy": 0.019032524432986975, "epoch": 1.71736513841777, "grad_norm": 20.375, "learning_rate": 2.030860855441256e-05, "loss": 0.1304, "loss_lm": 0.017228979617357254, "loss_seg": 0.11317657120525837, "mean_token_accuracy": 0.9951160997152328, "num_tokens": 1667802819.0, "step": 3924 }, { "entropy": 0.01813053898513317, "epoch": 1.717802823065981, "grad_norm": 8.9375, "learning_rate": 2.0305901461829994e-05, "loss": 0.1138, "loss_lm": 0.014754150062799454, "loss_seg": 0.0990363173186779, "mean_token_accuracy": 0.9951972365379333, "num_tokens": 1668227524.0, "step": 3925 }, { "entropy": 0.018782009836286306, "epoch": 1.7182405077141918, "grad_norm": 5.8125, "learning_rate": 2.030319436924743e-05, "loss": 0.126, "loss_lm": 0.015658752294257283, "loss_seg": 0.11029675975441933, "mean_token_accuracy": 0.9950990527868271, "num_tokens": 1668652097.0, "step": 3926 }, { "entropy": 0.018648230005055666, "epoch": 1.718678192362403, "grad_norm": 5.5625, "learning_rate": 2.030048727666486e-05, "loss": 0.1602, "loss_lm": 0.018257187213748693, "loss_seg": 0.14196771197021008, "mean_token_accuracy": 0.9951151013374329, "num_tokens": 1669076645.0, "step": 3927 }, { "entropy": 0.01890674140304327, "epoch": 1.7191158770106139, "grad_norm": 6.65625, "learning_rate": 2.0297780184082295e-05, "loss": 0.1268, "loss_lm": 0.01663556764833629, "loss_seg": 0.11018054746091366, "mean_token_accuracy": 0.9950513243675232, "num_tokens": 1669502183.0, "step": 3928 }, { "entropy": 0.018262087367475033, "epoch": 1.7195535616588247, "grad_norm": 18.5, "learning_rate": 2.029507309149973e-05, "loss": 0.1148, "loss_lm": 0.016609900165349245, "loss_seg": 0.09818547125905752, "mean_token_accuracy": 0.9952176958322525, "num_tokens": 1669927454.0, "step": 3929 }, { "entropy": 0.01812394242733717, "epoch": 1.719991246307036, "grad_norm": 5.75, "learning_rate": 2.0292365998917163e-05, "loss": 0.1456, "loss_lm": 0.01663710968568921, "loss_seg": 0.12891601584851742, "mean_token_accuracy": 0.9952200502157211, "num_tokens": 1670352948.0, "step": 3930 }, { "entropy": 0.017635938711464405, "epoch": 1.7204289309552467, "grad_norm": 13.6875, "learning_rate": 2.02896589063346e-05, "loss": 0.1302, "loss_lm": 0.015679819509387016, "loss_seg": 0.11449017934501171, "mean_token_accuracy": 0.9954772740602493, "num_tokens": 1670776965.0, "step": 3931 }, { "entropy": 0.01783245848491788, "epoch": 1.7208666156034576, "grad_norm": 13.5, "learning_rate": 2.028695181375203e-05, "loss": 0.0942, "loss_lm": 0.015012480085715652, "loss_seg": 0.07916364260017872, "mean_token_accuracy": 0.9953551292419434, "num_tokens": 1671201453.0, "step": 3932 }, { "entropy": 0.018563931807875633, "epoch": 1.7213043002516688, "grad_norm": 7.1875, "learning_rate": 2.0284244721169464e-05, "loss": 0.113, "loss_lm": 0.014455272816121578, "loss_seg": 0.09856333956122398, "mean_token_accuracy": 0.99512979388237, "num_tokens": 1671626674.0, "step": 3933 }, { "entropy": 0.018158745486289263, "epoch": 1.7217419848998796, "grad_norm": 21.375, "learning_rate": 2.0281537628586898e-05, "loss": 0.1236, "loss_lm": 0.017381606623530388, "loss_seg": 0.10625908896327019, "mean_token_accuracy": 0.9952476471662521, "num_tokens": 1672051899.0, "step": 3934 }, { "entropy": 0.019076933152973652, "epoch": 1.7221796695480907, "grad_norm": 4.5625, "learning_rate": 2.027883053600433e-05, "loss": 0.115, "loss_lm": 0.016692359698936343, "loss_seg": 0.09830984473228455, "mean_token_accuracy": 0.9949003905057907, "num_tokens": 1672476970.0, "step": 3935 }, { "entropy": 0.01857811864465475, "epoch": 1.7226173541963017, "grad_norm": 10.75, "learning_rate": 2.027612344342177e-05, "loss": 0.1376, "loss_lm": 0.016470927046611905, "loss_seg": 0.12110826000571251, "mean_token_accuracy": 0.9950956702232361, "num_tokens": 1672901642.0, "step": 3936 }, { "entropy": 0.01859520236030221, "epoch": 1.7230550388445125, "grad_norm": 6.90625, "learning_rate": 2.02734163508392e-05, "loss": 0.1503, "loss_lm": 0.01621624338440597, "loss_seg": 0.13408146612346172, "mean_token_accuracy": 0.9951895326375961, "num_tokens": 1673326722.0, "step": 3937 }, { "entropy": 0.018210926558822393, "epoch": 1.7234927234927235, "grad_norm": 8.3125, "learning_rate": 2.0270709258256632e-05, "loss": 0.1519, "loss_lm": 0.014675815124064684, "loss_seg": 0.13721201196312904, "mean_token_accuracy": 0.995262011885643, "num_tokens": 1673751756.0, "step": 3938 }, { "entropy": 0.017717372626066208, "epoch": 1.7239304081409346, "grad_norm": 41.75, "learning_rate": 2.0268002165674066e-05, "loss": 0.1496, "loss_lm": 0.015395398484542966, "loss_seg": 0.13422293961048126, "mean_token_accuracy": 0.9954103529453278, "num_tokens": 1674176362.0, "step": 3939 }, { "entropy": 0.01743755117058754, "epoch": 1.7243680927891454, "grad_norm": 17.125, "learning_rate": 2.02652950730915e-05, "loss": 0.1107, "loss_lm": 0.016481374856084585, "loss_seg": 0.09420607704669237, "mean_token_accuracy": 0.9955314546823502, "num_tokens": 1674601023.0, "step": 3940 }, { "entropy": 0.018345979508012533, "epoch": 1.7248057774373564, "grad_norm": 5.40625, "learning_rate": 2.0262587980508937e-05, "loss": 0.1168, "loss_lm": 0.014509385917335749, "loss_seg": 0.10225492436438799, "mean_token_accuracy": 0.9953153282403946, "num_tokens": 1675026192.0, "step": 3941 }, { "entropy": 0.018327564001083374, "epoch": 1.7252434620855674, "grad_norm": 2.984375, "learning_rate": 2.0259880887926367e-05, "loss": 0.0874, "loss_lm": 0.016856288304552436, "loss_seg": 0.07051914371550083, "mean_token_accuracy": 0.9953030943870544, "num_tokens": 1675450687.0, "step": 3942 }, { "entropy": 0.01809590170159936, "epoch": 1.7256811467337783, "grad_norm": 5.15625, "learning_rate": 2.02571737953438e-05, "loss": 0.0917, "loss_lm": 0.015876352321356535, "loss_seg": 0.07585685048252344, "mean_token_accuracy": 0.9952176511287689, "num_tokens": 1675875962.0, "step": 3943 }, { "entropy": 0.01875729952007532, "epoch": 1.7261188313819893, "grad_norm": 6.5, "learning_rate": 2.0254466702761235e-05, "loss": 0.0964, "loss_lm": 0.014649766031652689, "loss_seg": 0.08175635617226362, "mean_token_accuracy": 0.9951719045639038, "num_tokens": 1676301355.0, "step": 3944 }, { "entropy": 0.018017020542174578, "epoch": 1.7265565160302003, "grad_norm": 9.125, "learning_rate": 2.025175961017867e-05, "loss": 0.1077, "loss_lm": 0.016863328171893954, "loss_seg": 0.09081612434238195, "mean_token_accuracy": 0.995227038860321, "num_tokens": 1676726215.0, "step": 3945 }, { "entropy": 0.018336812499910593, "epoch": 1.7269942006784111, "grad_norm": 6.09375, "learning_rate": 2.0249052517596102e-05, "loss": 0.116, "loss_lm": 0.01623036479577422, "loss_seg": 0.09974076971411705, "mean_token_accuracy": 0.9951712042093277, "num_tokens": 1677150919.0, "step": 3946 }, { "entropy": 0.01885004248470068, "epoch": 1.7274318853266222, "grad_norm": 5.5625, "learning_rate": 2.0246345425013536e-05, "loss": 0.1215, "loss_lm": 0.01723350235261023, "loss_seg": 0.10426942072808743, "mean_token_accuracy": 0.9949991405010223, "num_tokens": 1677576422.0, "step": 3947 }, { "entropy": 0.018685592338442802, "epoch": 1.7278695699748332, "grad_norm": 6.03125, "learning_rate": 2.024363833243097e-05, "loss": 0.1566, "loss_lm": 0.016445164568722248, "loss_seg": 0.14011036604642868, "mean_token_accuracy": 0.995231881737709, "num_tokens": 1678001285.0, "step": 3948 }, { "entropy": 0.018271162640303373, "epoch": 1.728307254623044, "grad_norm": 16.375, "learning_rate": 2.0240931239848403e-05, "loss": 0.2287, "loss_lm": 0.014956198865547776, "loss_seg": 0.2137449011206627, "mean_token_accuracy": 0.9953352808952332, "num_tokens": 1678426337.0, "step": 3949 }, { "entropy": 0.018857118673622608, "epoch": 1.728744939271255, "grad_norm": 12.9375, "learning_rate": 2.0238224147265837e-05, "loss": 0.1065, "loss_lm": 0.015003587817773223, "loss_seg": 0.09149349667131901, "mean_token_accuracy": 0.9951308518648148, "num_tokens": 1678851437.0, "step": 3950 }, { "entropy": 0.019219960551708937, "epoch": 1.729182623919466, "grad_norm": 4.03125, "learning_rate": 2.0235517054683267e-05, "loss": 0.12, "loss_lm": 0.016076472122222185, "loss_seg": 0.10390810295939445, "mean_token_accuracy": 0.9950053095817566, "num_tokens": 1679276657.0, "step": 3951 }, { "entropy": 0.018144570291042328, "epoch": 1.729620308567677, "grad_norm": 4.96875, "learning_rate": 2.0232809962100705e-05, "loss": 0.1025, "loss_lm": 0.017119999509304762, "loss_seg": 0.08541830815374851, "mean_token_accuracy": 0.9953493028879166, "num_tokens": 1679701916.0, "step": 3952 }, { "entropy": 0.01897464832291007, "epoch": 1.730057993215888, "grad_norm": 12.3125, "learning_rate": 2.023010286951814e-05, "loss": 0.1211, "loss_lm": 0.018013294553384185, "loss_seg": 0.10308476537466049, "mean_token_accuracy": 0.9951433539390564, "num_tokens": 1680126650.0, "step": 3953 }, { "entropy": 0.018041128758341074, "epoch": 1.730495677864099, "grad_norm": 6.5, "learning_rate": 2.0227395776935572e-05, "loss": 0.1709, "loss_lm": 0.01413682708516717, "loss_seg": 0.15678740106523037, "mean_token_accuracy": 0.9954011291265488, "num_tokens": 1680551480.0, "step": 3954 }, { "entropy": 0.018479157239198685, "epoch": 1.7309333625123098, "grad_norm": 7.25, "learning_rate": 2.0224688684353006e-05, "loss": 0.1865, "loss_lm": 0.017966531217098236, "loss_seg": 0.1684915143996477, "mean_token_accuracy": 0.9951883256435394, "num_tokens": 1680976209.0, "step": 3955 }, { "entropy": 0.017979726660996675, "epoch": 1.7313710471605208, "grad_norm": 4.53125, "learning_rate": 2.0221981591770436e-05, "loss": 0.1063, "loss_lm": 0.018551242537796497, "loss_seg": 0.08773547410964966, "mean_token_accuracy": 0.9953614622354507, "num_tokens": 1681401539.0, "step": 3956 }, { "entropy": 0.01796927349641919, "epoch": 1.7318087318087318, "grad_norm": 12.4375, "learning_rate": 2.0219274499187873e-05, "loss": 0.1638, "loss_lm": 0.01495638326741755, "loss_seg": 0.1488503348082304, "mean_token_accuracy": 0.9953339397907257, "num_tokens": 1681826229.0, "step": 3957 }, { "entropy": 0.018477064091712236, "epoch": 1.7322464164569427, "grad_norm": 8.6875, "learning_rate": 2.0216567406605307e-05, "loss": 0.1193, "loss_lm": 0.017343349056318402, "loss_seg": 0.10198684595525265, "mean_token_accuracy": 0.9952174872159958, "num_tokens": 1682251493.0, "step": 3958 }, { "entropy": 0.018230911809951067, "epoch": 1.7326841011051537, "grad_norm": 14.1875, "learning_rate": 2.021386031402274e-05, "loss": 0.1229, "loss_lm": 0.017671330366283655, "loss_seg": 0.10521371010690928, "mean_token_accuracy": 0.9951484948396683, "num_tokens": 1682676456.0, "step": 3959 }, { "entropy": 0.017971635330468416, "epoch": 1.7331217857533647, "grad_norm": 3.40625, "learning_rate": 2.0211153221440174e-05, "loss": 0.1304, "loss_lm": 0.01730569126084447, "loss_seg": 0.11311394162476063, "mean_token_accuracy": 0.9953002631664276, "num_tokens": 1683100471.0, "step": 3960 }, { "entropy": 0.01886838534846902, "epoch": 1.7335594704015755, "grad_norm": 5.78125, "learning_rate": 2.0208446128857605e-05, "loss": 0.0897, "loss_lm": 0.015320601873099804, "loss_seg": 0.07435109000653028, "mean_token_accuracy": 0.9951602667570114, "num_tokens": 1683525455.0, "step": 3961 }, { "entropy": 0.018205071799457073, "epoch": 1.7339971550497866, "grad_norm": 10.25, "learning_rate": 2.0205739036275042e-05, "loss": 0.102, "loss_lm": 0.014617525739595294, "loss_seg": 0.08742764964699745, "mean_token_accuracy": 0.9952806234359741, "num_tokens": 1683950057.0, "step": 3962 }, { "entropy": 0.018829256761819124, "epoch": 1.7344348396979976, "grad_norm": 13.1875, "learning_rate": 2.0203031943692476e-05, "loss": 0.1172, "loss_lm": 0.0173890283331275, "loss_seg": 0.09978977963328362, "mean_token_accuracy": 0.9951596409082413, "num_tokens": 1684375598.0, "step": 3963 }, { "entropy": 0.019089334178715944, "epoch": 1.7348725243462084, "grad_norm": 7.4375, "learning_rate": 2.020032485110991e-05, "loss": 0.1063, "loss_lm": 0.014036717358976603, "loss_seg": 0.09225097484886646, "mean_token_accuracy": 0.9951076209545135, "num_tokens": 1684800691.0, "step": 3964 }, { "entropy": 0.01846929034218192, "epoch": 1.7353102089944197, "grad_norm": 6.625, "learning_rate": 2.0197617758527343e-05, "loss": 0.1162, "loss_lm": 0.01629792433232069, "loss_seg": 0.09991933219134808, "mean_token_accuracy": 0.9951822310686111, "num_tokens": 1685225361.0, "step": 3965 }, { "entropy": 0.018373447004705667, "epoch": 1.7357478936426305, "grad_norm": 9.75, "learning_rate": 2.0194910665944773e-05, "loss": 0.1546, "loss_lm": 0.016383346868678927, "loss_seg": 0.13823823258280754, "mean_token_accuracy": 0.9952308833599091, "num_tokens": 1685649705.0, "step": 3966 }, { "entropy": 0.018354591447860003, "epoch": 1.7361855782908413, "grad_norm": 15.75, "learning_rate": 2.019220357336221e-05, "loss": 0.1122, "loss_lm": 0.01403025258332491, "loss_seg": 0.0981441829353571, "mean_token_accuracy": 0.9951840043067932, "num_tokens": 1686074515.0, "step": 3967 }, { "entropy": 0.018200118094682693, "epoch": 1.7366232629390526, "grad_norm": 5.84375, "learning_rate": 2.0189496480779644e-05, "loss": 0.1185, "loss_lm": 0.01587234460748732, "loss_seg": 0.10262422636151314, "mean_token_accuracy": 0.9951959401369095, "num_tokens": 1686499538.0, "step": 3968 }, { "entropy": 0.018638943322002888, "epoch": 1.7370609475872634, "grad_norm": 5.125, "learning_rate": 2.0186789388197078e-05, "loss": 0.1267, "loss_lm": 0.01592533732764423, "loss_seg": 0.11081554181873798, "mean_token_accuracy": 0.9951520711183548, "num_tokens": 1686924837.0, "step": 3969 }, { "entropy": 0.018503145314753056, "epoch": 1.7374986322354742, "grad_norm": 10.625, "learning_rate": 2.018408229561451e-05, "loss": 0.1659, "loss_lm": 0.016051531536504626, "loss_seg": 0.14986387267708778, "mean_token_accuracy": 0.9952986836433411, "num_tokens": 1687350187.0, "step": 3970 }, { "entropy": 0.01822510687634349, "epoch": 1.7379363168836854, "grad_norm": 4.96875, "learning_rate": 2.0181375203031942e-05, "loss": 0.1509, "loss_lm": 0.015916681615635753, "loss_seg": 0.13494897447526455, "mean_token_accuracy": 0.9952369183301926, "num_tokens": 1687774840.0, "step": 3971 }, { "entropy": 0.01785894576460123, "epoch": 1.7383740015318963, "grad_norm": 12.5, "learning_rate": 2.017866811044938e-05, "loss": 0.1415, "loss_lm": 0.016173289623111486, "loss_seg": 0.12530525214970112, "mean_token_accuracy": 0.9952764213085175, "num_tokens": 1688199488.0, "step": 3972 }, { "entropy": 0.01844476629048586, "epoch": 1.7388116861801073, "grad_norm": 5.09375, "learning_rate": 2.0175961017866813e-05, "loss": 0.1337, "loss_lm": 0.02100947964936495, "loss_seg": 0.11271555349230766, "mean_token_accuracy": 0.9952051639556885, "num_tokens": 1688624067.0, "step": 3973 }, { "entropy": 0.018282946199178696, "epoch": 1.7392493708283183, "grad_norm": 6.375, "learning_rate": 2.0173253925284247e-05, "loss": 0.1041, "loss_lm": 0.016480462159961462, "loss_seg": 0.08762574568390846, "mean_token_accuracy": 0.9953012466430664, "num_tokens": 1689049188.0, "step": 3974 }, { "entropy": 0.017921580467373133, "epoch": 1.7396870554765291, "grad_norm": 13.0, "learning_rate": 2.0170546832701677e-05, "loss": 0.1221, "loss_lm": 0.01517418771982193, "loss_seg": 0.10691498592495918, "mean_token_accuracy": 0.9953411221504211, "num_tokens": 1689474765.0, "step": 3975 }, { "entropy": 0.01802895776927471, "epoch": 1.7401247401247402, "grad_norm": 17.875, "learning_rate": 2.016783974011911e-05, "loss": 0.0988, "loss_lm": 0.015376638155430555, "loss_seg": 0.08338026516139507, "mean_token_accuracy": 0.9953692704439163, "num_tokens": 1689899705.0, "step": 3976 }, { "entropy": 0.0179449706338346, "epoch": 1.7405624247729512, "grad_norm": 9.375, "learning_rate": 2.0165132647536548e-05, "loss": 0.1031, "loss_lm": 0.013973175315186381, "loss_seg": 0.08912435919046402, "mean_token_accuracy": 0.9954430013895035, "num_tokens": 1690324684.0, "step": 3977 }, { "entropy": 0.018559867050498724, "epoch": 1.741000109421162, "grad_norm": 7.9375, "learning_rate": 2.016242555495398e-05, "loss": 0.1075, "loss_lm": 0.015126743353903294, "loss_seg": 0.09238782152533531, "mean_token_accuracy": 0.9949656575918198, "num_tokens": 1690749388.0, "step": 3978 }, { "entropy": 0.018196037970483303, "epoch": 1.741437794069373, "grad_norm": 22.75, "learning_rate": 2.0159718462371415e-05, "loss": 0.1231, "loss_lm": 0.015316575299948454, "loss_seg": 0.1077625211328268, "mean_token_accuracy": 0.9952724277973175, "num_tokens": 1691174296.0, "step": 3979 }, { "entropy": 0.01876724883913994, "epoch": 1.741875478717584, "grad_norm": 6.34375, "learning_rate": 2.0157011369788846e-05, "loss": 0.1119, "loss_lm": 0.016252161003649235, "loss_seg": 0.09565857984125614, "mean_token_accuracy": 0.9951186031103134, "num_tokens": 1691599546.0, "step": 3980 }, { "entropy": 0.017835949081927538, "epoch": 1.742313163365795, "grad_norm": 5.4375, "learning_rate": 2.015430427720628e-05, "loss": 0.0874, "loss_lm": 0.018215647200122476, "loss_seg": 0.06920699216425419, "mean_token_accuracy": 0.9954550564289093, "num_tokens": 1692024353.0, "step": 3981 }, { "entropy": 0.017636260483413935, "epoch": 1.742750848014006, "grad_norm": 20.625, "learning_rate": 2.0151597184623713e-05, "loss": 0.1286, "loss_lm": 0.016656754771247506, "loss_seg": 0.11193873547017574, "mean_token_accuracy": 0.9954335242509842, "num_tokens": 1692449770.0, "step": 3982 }, { "entropy": 0.01804626453667879, "epoch": 1.743188532662217, "grad_norm": 9.75, "learning_rate": 2.014889009204115e-05, "loss": 0.1241, "loss_lm": 0.015011777868494391, "loss_seg": 0.10904909111559391, "mean_token_accuracy": 0.9952319860458374, "num_tokens": 1692874220.0, "step": 3983 }, { "entropy": 0.01802079565823078, "epoch": 1.7436262173104278, "grad_norm": 5.09375, "learning_rate": 2.0146182999458584e-05, "loss": 0.1179, "loss_lm": 0.01549713290296495, "loss_seg": 0.10243829526007175, "mean_token_accuracy": 0.9952794164419174, "num_tokens": 1693298896.0, "step": 3984 }, { "entropy": 0.018420358654111624, "epoch": 1.7440639019586388, "grad_norm": 6.65625, "learning_rate": 2.0143475906876014e-05, "loss": 0.1281, "loss_lm": 0.019558228319510818, "loss_seg": 0.10856341756880283, "mean_token_accuracy": 0.995201364159584, "num_tokens": 1693724199.0, "step": 3985 }, { "entropy": 0.018603659700602293, "epoch": 1.7445015866068498, "grad_norm": 5.4375, "learning_rate": 2.0140768814293448e-05, "loss": 0.1011, "loss_lm": 0.017573013436049223, "loss_seg": 0.0835729856044054, "mean_token_accuracy": 0.9952616840600967, "num_tokens": 1694148990.0, "step": 3986 }, { "entropy": 0.018119950778782368, "epoch": 1.7449392712550607, "grad_norm": 9.75, "learning_rate": 2.013806172171088e-05, "loss": 0.1157, "loss_lm": 0.01769242831505835, "loss_seg": 0.09797648712992668, "mean_token_accuracy": 0.9951885640621185, "num_tokens": 1694574803.0, "step": 3987 }, { "entropy": 0.018339311704039574, "epoch": 1.7453769559032717, "grad_norm": 9.0625, "learning_rate": 2.013535462912832e-05, "loss": 0.128, "loss_lm": 0.016456078039482236, "loss_seg": 0.11153455078601837, "mean_token_accuracy": 0.9952497184276581, "num_tokens": 1695000867.0, "step": 3988 }, { "entropy": 0.018247209955006838, "epoch": 1.7458146405514827, "grad_norm": 6.90625, "learning_rate": 2.0132647536545752e-05, "loss": 0.1359, "loss_lm": 0.016158486250787973, "loss_seg": 0.11977332085371017, "mean_token_accuracy": 0.995231106877327, "num_tokens": 1695426311.0, "step": 3989 }, { "entropy": 0.018126882147043943, "epoch": 1.7462523251996935, "grad_norm": 10.875, "learning_rate": 2.0129940443963183e-05, "loss": 0.1094, "loss_lm": 0.01650743861682713, "loss_seg": 0.09292298927903175, "mean_token_accuracy": 0.9953749477863312, "num_tokens": 1695851360.0, "step": 3990 }, { "entropy": 0.018904872704297304, "epoch": 1.7466900098479046, "grad_norm": 11.3125, "learning_rate": 2.0127233351380617e-05, "loss": 0.1572, "loss_lm": 0.016630734549835324, "loss_seg": 0.14055540598928928, "mean_token_accuracy": 0.9951368570327759, "num_tokens": 1696276572.0, "step": 3991 }, { "entropy": 0.01821200828999281, "epoch": 1.7471276944961156, "grad_norm": 9.625, "learning_rate": 2.012452625879805e-05, "loss": 0.1377, "loss_lm": 0.015036839758977294, "loss_seg": 0.1227018628269434, "mean_token_accuracy": 0.9953040480613708, "num_tokens": 1696701340.0, "step": 3992 }, { "entropy": 0.018817447591573, "epoch": 1.7475653791443264, "grad_norm": 4.125, "learning_rate": 2.0121819166215487e-05, "loss": 0.0971, "loss_lm": 0.014483697013929486, "loss_seg": 0.08259827457368374, "mean_token_accuracy": 0.9950783997774124, "num_tokens": 1697127209.0, "step": 3993 }, { "entropy": 0.01907111518085003, "epoch": 1.7480030637925374, "grad_norm": 9.3125, "learning_rate": 2.011911207363292e-05, "loss": 0.1028, "loss_lm": 0.018093067687004805, "loss_seg": 0.08471262641251087, "mean_token_accuracy": 0.995099201798439, "num_tokens": 1697552211.0, "step": 3994 }, { "entropy": 0.01830734685063362, "epoch": 1.7484407484407485, "grad_norm": 3.421875, "learning_rate": 2.011640498105035e-05, "loss": 0.1643, "loss_lm": 0.01613497524522245, "loss_seg": 0.1481255143880844, "mean_token_accuracy": 0.9952897429466248, "num_tokens": 1697977413.0, "step": 3995 }, { "entropy": 0.01872917450964451, "epoch": 1.7488784330889593, "grad_norm": 10.9375, "learning_rate": 2.0113697888467785e-05, "loss": 0.1761, "loss_lm": 0.016594427404925227, "loss_seg": 0.15953988581895828, "mean_token_accuracy": 0.995219424366951, "num_tokens": 1698402851.0, "step": 3996 }, { "entropy": 0.01811877265572548, "epoch": 1.7493161177371703, "grad_norm": 17.75, "learning_rate": 2.011099079588522e-05, "loss": 0.1188, "loss_lm": 0.014513843227177858, "loss_seg": 0.10432196594774723, "mean_token_accuracy": 0.9953290373086929, "num_tokens": 1698828198.0, "step": 3997 }, { "entropy": 0.0183026478625834, "epoch": 1.7497538023853814, "grad_norm": 23.625, "learning_rate": 2.0108283703302656e-05, "loss": 0.1161, "loss_lm": 0.015318367630243301, "loss_seg": 0.10078359767794609, "mean_token_accuracy": 0.9953084290027618, "num_tokens": 1699252949.0, "step": 3998 }, { "entropy": 0.018133542500436306, "epoch": 1.7501914870335922, "grad_norm": 4.09375, "learning_rate": 2.0105576610720086e-05, "loss": 0.1164, "loss_lm": 0.014334358740597963, "loss_seg": 0.10206025466322899, "mean_token_accuracy": 0.9952509552240372, "num_tokens": 1699678423.0, "step": 3999 }, { "entropy": 0.01863962272182107, "epoch": 1.7506291716818032, "grad_norm": 17.75, "learning_rate": 2.010286951813752e-05, "loss": 0.1339, "loss_lm": 0.012540960684418678, "loss_seg": 0.12140107993036509, "mean_token_accuracy": 0.9951681643724442, "num_tokens": 1700103279.0, "step": 4000 }, { "entropy": 0.018740303814411163, "epoch": 1.7510668563300142, "grad_norm": 9.125, "learning_rate": 2.0100162425554954e-05, "loss": 0.1393, "loss_lm": 0.016390741569921374, "loss_seg": 0.12290154583752155, "mean_token_accuracy": 0.9952307641506195, "num_tokens": 1700528260.0, "step": 4001 }, { "entropy": 0.018705638125538826, "epoch": 1.751504540978225, "grad_norm": 17.75, "learning_rate": 2.0097455332972388e-05, "loss": 0.167, "loss_lm": 0.01558289653621614, "loss_seg": 0.15136845037341118, "mean_token_accuracy": 0.9951207488775253, "num_tokens": 1700953677.0, "step": 4002 }, { "entropy": 0.018083455506712198, "epoch": 1.7519422256264363, "grad_norm": 6.34375, "learning_rate": 2.0094748240389825e-05, "loss": 0.1111, "loss_lm": 0.015192125458270311, "loss_seg": 0.09594725258648396, "mean_token_accuracy": 0.9953849613666534, "num_tokens": 1701379187.0, "step": 4003 }, { "entropy": 0.02000322612002492, "epoch": 1.7523799102746471, "grad_norm": 6.71875, "learning_rate": 2.0092041147807255e-05, "loss": 0.1277, "loss_lm": 0.01824226975440979, "loss_seg": 0.10940796509385109, "mean_token_accuracy": 0.9948495477437973, "num_tokens": 1701804378.0, "step": 4004 }, { "entropy": 0.01864487398415804, "epoch": 1.752817594922858, "grad_norm": 4.75, "learning_rate": 2.008933405522469e-05, "loss": 0.1386, "loss_lm": 0.01692337286658585, "loss_seg": 0.12169876135885715, "mean_token_accuracy": 0.9950658529996872, "num_tokens": 1702229743.0, "step": 4005 }, { "entropy": 0.01802640873938799, "epoch": 1.7532552795710692, "grad_norm": 18.125, "learning_rate": 2.0086626962642122e-05, "loss": 0.1094, "loss_lm": 0.014095428166911006, "loss_seg": 0.09529693610966206, "mean_token_accuracy": 0.9952964931726456, "num_tokens": 1702654887.0, "step": 4006 }, { "entropy": 0.01847923221066594, "epoch": 1.75369296421928, "grad_norm": 5.59375, "learning_rate": 2.0083919870059556e-05, "loss": 0.1452, "loss_lm": 0.015569444745779037, "loss_seg": 0.12966495752334595, "mean_token_accuracy": 0.9952206760644913, "num_tokens": 1703079597.0, "step": 4007 }, { "entropy": 0.0183871746994555, "epoch": 1.7541306488674908, "grad_norm": 11.75, "learning_rate": 2.0081212777476993e-05, "loss": 0.1357, "loss_lm": 0.01710940059274435, "loss_seg": 0.11855239607393742, "mean_token_accuracy": 0.9951987266540527, "num_tokens": 1703505261.0, "step": 4008 }, { "entropy": 0.018248154781758785, "epoch": 1.754568333515702, "grad_norm": 4.875, "learning_rate": 2.0078505684894424e-05, "loss": 0.1063, "loss_lm": 0.015965600730851293, "loss_seg": 0.09034748189151287, "mean_token_accuracy": 0.9951949864625931, "num_tokens": 1703930010.0, "step": 4009 }, { "entropy": 0.01889756228774786, "epoch": 1.7550060181639129, "grad_norm": 5.3125, "learning_rate": 2.0075798592311857e-05, "loss": 0.111, "loss_lm": 0.014068430289626122, "loss_seg": 0.09691771678626537, "mean_token_accuracy": 0.9951274991035461, "num_tokens": 1704355216.0, "step": 4010 }, { "entropy": 0.018775401171296835, "epoch": 1.755443702812124, "grad_norm": 2.484375, "learning_rate": 2.007309149972929e-05, "loss": 0.105, "loss_lm": 0.01436619833111763, "loss_seg": 0.09067013673484325, "mean_token_accuracy": 0.995086669921875, "num_tokens": 1704780127.0, "step": 4011 }, { "entropy": 0.01843616785481572, "epoch": 1.755881387460335, "grad_norm": 4.5625, "learning_rate": 2.0070384407146725e-05, "loss": 0.0925, "loss_lm": 0.016831446904689074, "loss_seg": 0.07565319817513227, "mean_token_accuracy": 0.9952136874198914, "num_tokens": 1705205863.0, "step": 4012 }, { "entropy": 0.018663572147488594, "epoch": 1.7563190721085458, "grad_norm": 8.4375, "learning_rate": 2.006767731456416e-05, "loss": 0.1343, "loss_lm": 0.015021225670352578, "loss_seg": 0.11930376850068569, "mean_token_accuracy": 0.9951903969049454, "num_tokens": 1705630998.0, "step": 4013 }, { "entropy": 0.018045232631266117, "epoch": 1.7567567567567568, "grad_norm": 12.6875, "learning_rate": 2.0064970221981592e-05, "loss": 0.1169, "loss_lm": 0.014302959199994802, "loss_seg": 0.10257541388273239, "mean_token_accuracy": 0.9952962547540665, "num_tokens": 1706056374.0, "step": 4014 }, { "entropy": 0.018276794347912073, "epoch": 1.7571944414049678, "grad_norm": 4.78125, "learning_rate": 2.0062263129399026e-05, "loss": 0.0874, "loss_lm": 0.018167183734476566, "loss_seg": 0.06925297155976295, "mean_token_accuracy": 0.9952442944049835, "num_tokens": 1706481304.0, "step": 4015 }, { "entropy": 0.01810743985697627, "epoch": 1.7576321260531786, "grad_norm": 7.625, "learning_rate": 2.005955603681646e-05, "loss": 0.1227, "loss_lm": 0.017523306654766202, "loss_seg": 0.10513786319643259, "mean_token_accuracy": 0.9952516406774521, "num_tokens": 1706906900.0, "step": 4016 }, { "entropy": 0.018530703615397215, "epoch": 1.7580698107013897, "grad_norm": 34.0, "learning_rate": 2.0056848944233893e-05, "loss": 0.122, "loss_lm": 0.018847809405997396, "loss_seg": 0.10312046110630035, "mean_token_accuracy": 0.9951924830675125, "num_tokens": 1707332011.0, "step": 4017 }, { "entropy": 0.01861184509471059, "epoch": 1.7585074953496007, "grad_norm": 6.1875, "learning_rate": 2.0054141851651327e-05, "loss": 0.095, "loss_lm": 0.01534595899283886, "loss_seg": 0.07967435009777546, "mean_token_accuracy": 0.9952691793441772, "num_tokens": 1707756637.0, "step": 4018 }, { "entropy": 0.017796463333070278, "epoch": 1.7589451799978115, "grad_norm": 8.9375, "learning_rate": 2.005143475906876e-05, "loss": 0.0946, "loss_lm": 0.01351164118386805, "loss_seg": 0.08112876024097204, "mean_token_accuracy": 0.9954054802656174, "num_tokens": 1708180779.0, "step": 4019 }, { "entropy": 0.018674364779144526, "epoch": 1.7593828646460226, "grad_norm": 46.5, "learning_rate": 2.0048727666486195e-05, "loss": 0.1099, "loss_lm": 0.01705467328429222, "loss_seg": 0.09284146688878536, "mean_token_accuracy": 0.9951543062925339, "num_tokens": 1708605666.0, "step": 4020 }, { "entropy": 0.017572786659002304, "epoch": 1.7598205492942336, "grad_norm": 4.9375, "learning_rate": 2.0046020573903628e-05, "loss": 0.1161, "loss_lm": 0.015239752363413572, "loss_seg": 0.10085401590913534, "mean_token_accuracy": 0.995337963104248, "num_tokens": 1709031161.0, "step": 4021 }, { "entropy": 0.01890507759526372, "epoch": 1.7602582339424444, "grad_norm": 3.71875, "learning_rate": 2.0043313481321062e-05, "loss": 0.121, "loss_lm": 0.0174878416582942, "loss_seg": 0.10354395769536495, "mean_token_accuracy": 0.9949418157339096, "num_tokens": 1709456010.0, "step": 4022 }, { "entropy": 0.018180185463279486, "epoch": 1.7606959185906554, "grad_norm": 20.375, "learning_rate": 2.0040606388738492e-05, "loss": 0.094, "loss_lm": 0.015216821804642677, "loss_seg": 0.07878526207059622, "mean_token_accuracy": 0.9952945411205292, "num_tokens": 1709881293.0, "step": 4023 }, { "entropy": 0.01868948759511113, "epoch": 1.7611336032388665, "grad_norm": 7.09375, "learning_rate": 2.003789929615593e-05, "loss": 0.0989, "loss_lm": 0.015039200894534588, "loss_seg": 0.08383370097726583, "mean_token_accuracy": 0.9951695948839188, "num_tokens": 1710306078.0, "step": 4024 }, { "entropy": 0.0180909251794219, "epoch": 1.7615712878870773, "grad_norm": 8.5, "learning_rate": 2.0035192203573363e-05, "loss": 0.0862, "loss_lm": 0.015311865136027336, "loss_seg": 0.07086469791829586, "mean_token_accuracy": 0.9953008741140366, "num_tokens": 1710731000.0, "step": 4025 }, { "entropy": 0.01810651319101453, "epoch": 1.7620089725352883, "grad_norm": 13.375, "learning_rate": 2.0032485110990797e-05, "loss": 0.1154, "loss_lm": 0.015274493722245097, "loss_seg": 0.10009606555104256, "mean_token_accuracy": 0.9952958822250366, "num_tokens": 1711156342.0, "step": 4026 }, { "entropy": 0.01847663288936019, "epoch": 1.7624466571834994, "grad_norm": 9.125, "learning_rate": 2.002977801840823e-05, "loss": 0.1427, "loss_lm": 0.016573407454416156, "loss_seg": 0.1261524921283126, "mean_token_accuracy": 0.9951248914003372, "num_tokens": 1711581706.0, "step": 4027 }, { "entropy": 0.018543316051363945, "epoch": 1.7628843418317102, "grad_norm": 4.375, "learning_rate": 2.002707092582566e-05, "loss": 0.101, "loss_lm": 0.014117249054834247, "loss_seg": 0.08689271938055754, "mean_token_accuracy": 0.9953087717294693, "num_tokens": 1712006264.0, "step": 4028 }, { "entropy": 0.01875182893127203, "epoch": 1.7633220264799212, "grad_norm": 9.75, "learning_rate": 2.0024363833243098e-05, "loss": 0.0991, "loss_lm": 0.016513484297320247, "loss_seg": 0.08262194599956274, "mean_token_accuracy": 0.9950986206531525, "num_tokens": 1712432416.0, "step": 4029 }, { "entropy": 0.01897600432857871, "epoch": 1.7637597111281322, "grad_norm": 4.875, "learning_rate": 2.0021656740660532e-05, "loss": 0.1191, "loss_lm": 0.01701820408925414, "loss_seg": 0.10205818060785532, "mean_token_accuracy": 0.995015874505043, "num_tokens": 1712857948.0, "step": 4030 }, { "entropy": 0.018422910012304783, "epoch": 1.764197395776343, "grad_norm": 7.125, "learning_rate": 2.0018949648077966e-05, "loss": 0.1468, "loss_lm": 0.015921755926683545, "loss_seg": 0.13084346242249012, "mean_token_accuracy": 0.9951038807630539, "num_tokens": 1713282750.0, "step": 4031 }, { "entropy": 0.018010760191828012, "epoch": 1.764635080424554, "grad_norm": 7.9375, "learning_rate": 2.00162425554954e-05, "loss": 0.1036, "loss_lm": 0.014771158806979656, "loss_seg": 0.0887912418693304, "mean_token_accuracy": 0.9953319281339645, "num_tokens": 1713707076.0, "step": 4032 }, { "entropy": 0.018656710628420115, "epoch": 1.7650727650727651, "grad_norm": 5.3125, "learning_rate": 2.001353546291283e-05, "loss": 0.109, "loss_lm": 0.014060720568522811, "loss_seg": 0.0949027705937624, "mean_token_accuracy": 0.995225191116333, "num_tokens": 1714131552.0, "step": 4033 }, { "entropy": 0.018512533511966467, "epoch": 1.765510449720976, "grad_norm": 14.125, "learning_rate": 2.0010828370330267e-05, "loss": 0.1116, "loss_lm": 0.0159353818744421, "loss_seg": 0.0956951268017292, "mean_token_accuracy": 0.9952365458011627, "num_tokens": 1714556599.0, "step": 4034 }, { "entropy": 0.018289225175976753, "epoch": 1.765948134369187, "grad_norm": 107.5, "learning_rate": 2.00081212777477e-05, "loss": 0.1447, "loss_lm": 0.017472818726673722, "loss_seg": 0.12726939283311367, "mean_token_accuracy": 0.995196521282196, "num_tokens": 1714981448.0, "step": 4035 }, { "entropy": 0.01860522059723735, "epoch": 1.766385819017398, "grad_norm": 8.25, "learning_rate": 2.0005414185165134e-05, "loss": 0.1601, "loss_lm": 0.017266836715862155, "loss_seg": 0.1428692564368248, "mean_token_accuracy": 0.9951500296592712, "num_tokens": 1715407195.0, "step": 4036 }, { "entropy": 0.018401003908365965, "epoch": 1.7668235036656088, "grad_norm": 6.15625, "learning_rate": 2.0002707092582568e-05, "loss": 0.1239, "loss_lm": 0.016083332942798734, "loss_seg": 0.10783255472779274, "mean_token_accuracy": 0.9953166246414185, "num_tokens": 1715832065.0, "step": 4037 }, { "entropy": 0.017966917250305414, "epoch": 1.7672611883138198, "grad_norm": 6.53125, "learning_rate": 1.9999999999999998e-05, "loss": 0.1474, "loss_lm": 0.016834722366183996, "loss_seg": 0.1305325236171484, "mean_token_accuracy": 0.9952290505170822, "num_tokens": 1716257527.0, "step": 4038 }, { "entropy": 0.018644771073013544, "epoch": 1.7676988729620309, "grad_norm": 8.6875, "learning_rate": 1.9997292907417435e-05, "loss": 0.0931, "loss_lm": 0.01901868637651205, "loss_seg": 0.07408565003424883, "mean_token_accuracy": 0.995023712515831, "num_tokens": 1716683148.0, "step": 4039 }, { "entropy": 0.01846261927857995, "epoch": 1.7681365576102417, "grad_norm": 6.53125, "learning_rate": 1.999458581483487e-05, "loss": 0.1073, "loss_lm": 0.017201647395268083, "loss_seg": 0.09011255018413067, "mean_token_accuracy": 0.9951316565275192, "num_tokens": 1717108717.0, "step": 4040 }, { "entropy": 0.01837863866239786, "epoch": 1.768574242258453, "grad_norm": 6.375, "learning_rate": 1.9991878722252303e-05, "loss": 0.1094, "loss_lm": 0.015784725546836853, "loss_seg": 0.09357088897377253, "mean_token_accuracy": 0.9952504336833954, "num_tokens": 1717532707.0, "step": 4041 }, { "entropy": 0.01883532851934433, "epoch": 1.7690119269066638, "grad_norm": 3.28125, "learning_rate": 1.9989171629669733e-05, "loss": 0.1009, "loss_lm": 0.013363318983465433, "loss_seg": 0.0875290622934699, "mean_token_accuracy": 0.9951332956552505, "num_tokens": 1717958229.0, "step": 4042 }, { "entropy": 0.01865660957992077, "epoch": 1.7694496115548746, "grad_norm": 7.90625, "learning_rate": 1.9986464537087167e-05, "loss": 0.1176, "loss_lm": 0.017671302892267704, "loss_seg": 0.09996865317225456, "mean_token_accuracy": 0.995228722691536, "num_tokens": 1718383038.0, "step": 4043 }, { "entropy": 0.018537350464612246, "epoch": 1.7698872962030858, "grad_norm": 12.9375, "learning_rate": 1.99837574445046e-05, "loss": 0.1276, "loss_lm": 0.015257674735039473, "loss_seg": 0.11236580088734627, "mean_token_accuracy": 0.9951718151569366, "num_tokens": 1718808007.0, "step": 4044 }, { "entropy": 0.01827304996550083, "epoch": 1.7703249808512966, "grad_norm": 8.0625, "learning_rate": 1.9981050351922038e-05, "loss": 0.1439, "loss_lm": 0.015404053265228868, "loss_seg": 0.1284990105777979, "mean_token_accuracy": 0.9952069669961929, "num_tokens": 1719232965.0, "step": 4045 }, { "entropy": 0.01856944290921092, "epoch": 1.7707626654995074, "grad_norm": 4.09375, "learning_rate": 1.997834325933947e-05, "loss": 0.1261, "loss_lm": 0.017765562748536468, "loss_seg": 0.10833727568387985, "mean_token_accuracy": 0.9952338486909866, "num_tokens": 1719657941.0, "step": 4046 }, { "entropy": 0.018270908389240503, "epoch": 1.7712003501477187, "grad_norm": 36.5, "learning_rate": 1.9975636166756902e-05, "loss": 0.1072, "loss_lm": 0.016147405141964555, "loss_seg": 0.09106841217726469, "mean_token_accuracy": 0.9953683763742447, "num_tokens": 1720082779.0, "step": 4047 }, { "entropy": 0.018808988388627768, "epoch": 1.7716380347959295, "grad_norm": 7.4375, "learning_rate": 1.9972929074174336e-05, "loss": 0.1317, "loss_lm": 0.018071624217554927, "loss_seg": 0.11361612752079964, "mean_token_accuracy": 0.9950517416000366, "num_tokens": 1720508334.0, "step": 4048 }, { "entropy": 0.01837693341076374, "epoch": 1.7720757194441406, "grad_norm": 92.5, "learning_rate": 1.997022198159177e-05, "loss": 0.1114, "loss_lm": 0.017325713066384196, "loss_seg": 0.09403987601399422, "mean_token_accuracy": 0.9952686131000519, "num_tokens": 1720933305.0, "step": 4049 }, { "entropy": 0.018381703179329634, "epoch": 1.7725134040923516, "grad_norm": 15.8125, "learning_rate": 1.9967514889009206e-05, "loss": 0.1218, "loss_lm": 0.01316583319567144, "loss_seg": 0.10864345449954271, "mean_token_accuracy": 0.9952120333909988, "num_tokens": 1721358873.0, "step": 4050 }, { "entropy": 0.01796644553542137, "epoch": 1.7729510887405624, "grad_norm": 6.8125, "learning_rate": 1.996480779642664e-05, "loss": 0.1055, "loss_lm": 0.017001357628032565, "loss_seg": 0.08854053355753422, "mean_token_accuracy": 0.9952504336833954, "num_tokens": 1721783298.0, "step": 4051 }, { "entropy": 0.01788068236783147, "epoch": 1.7733887733887734, "grad_norm": 5.84375, "learning_rate": 1.996210070384407e-05, "loss": 0.1601, "loss_lm": 0.016088762553408742, "loss_seg": 0.14398379065096378, "mean_token_accuracy": 0.9953956454992294, "num_tokens": 1722208931.0, "step": 4052 }, { "entropy": 0.018060529604554176, "epoch": 1.7738264580369845, "grad_norm": 7.15625, "learning_rate": 1.9959393611261504e-05, "loss": 0.1234, "loss_lm": 0.0166106466203928, "loss_seg": 0.10679740086197853, "mean_token_accuracy": 0.995251938700676, "num_tokens": 1722633046.0, "step": 4053 }, { "entropy": 0.017630259040743113, "epoch": 1.7742641426851953, "grad_norm": 3.328125, "learning_rate": 1.9956686518678938e-05, "loss": 0.1085, "loss_lm": 0.01651773974299431, "loss_seg": 0.09197022207081318, "mean_token_accuracy": 0.9954562485218048, "num_tokens": 1723057958.0, "step": 4054 }, { "entropy": 0.018112081103026867, "epoch": 1.7747018273334063, "grad_norm": 6.1875, "learning_rate": 1.9953979426096375e-05, "loss": 0.0856, "loss_lm": 0.015489862067624927, "loss_seg": 0.07008163910359144, "mean_token_accuracy": 0.9953436255455017, "num_tokens": 1723482917.0, "step": 4055 }, { "entropy": 0.018193687312304974, "epoch": 1.7751395119816173, "grad_norm": 4.78125, "learning_rate": 1.995127233351381e-05, "loss": 0.1297, "loss_lm": 0.013747150311246514, "loss_seg": 0.11597627960145473, "mean_token_accuracy": 0.9953007996082306, "num_tokens": 1723908468.0, "step": 4056 }, { "entropy": 0.01873741764575243, "epoch": 1.7755771966298282, "grad_norm": 9.1875, "learning_rate": 1.994856524093124e-05, "loss": 0.0904, "loss_lm": 0.015092549845576286, "loss_seg": 0.07530136406421661, "mean_token_accuracy": 0.9952217191457748, "num_tokens": 1724333485.0, "step": 4057 }, { "entropy": 0.01757760625332594, "epoch": 1.7760148812780392, "grad_norm": 3.609375, "learning_rate": 1.9945858148348673e-05, "loss": 0.115, "loss_lm": 0.01449875719845295, "loss_seg": 0.10047288239002228, "mean_token_accuracy": 0.9954424500465393, "num_tokens": 1724758443.0, "step": 4058 }, { "entropy": 0.017804142087697983, "epoch": 1.7764525659262502, "grad_norm": 23.875, "learning_rate": 1.9943151055766106e-05, "loss": 0.0959, "loss_lm": 0.01420964882709086, "loss_seg": 0.08169698715209961, "mean_token_accuracy": 0.9953216463327408, "num_tokens": 1725183127.0, "step": 4059 }, { "entropy": 0.01821923814713955, "epoch": 1.776890250574461, "grad_norm": 9.75, "learning_rate": 1.9940443963183544e-05, "loss": 0.1501, "loss_lm": 0.018311445601284504, "loss_seg": 0.13177751190960407, "mean_token_accuracy": 0.9951750487089157, "num_tokens": 1725607952.0, "step": 4060 }, { "entropy": 0.01785756228491664, "epoch": 1.777327935222672, "grad_norm": 6.28125, "learning_rate": 1.9937736870600977e-05, "loss": 0.1206, "loss_lm": 0.013519429601728916, "loss_seg": 0.10711775626987219, "mean_token_accuracy": 0.9952698647975922, "num_tokens": 1726032612.0, "step": 4061 }, { "entropy": 0.018476315774023533, "epoch": 1.777765619870883, "grad_norm": 5.59375, "learning_rate": 1.9935029778018408e-05, "loss": 0.1126, "loss_lm": 0.016115982783958316, "loss_seg": 0.09649926237761974, "mean_token_accuracy": 0.9951139986515045, "num_tokens": 1726457480.0, "step": 4062 }, { "entropy": 0.01859617978334427, "epoch": 1.778203304519094, "grad_norm": 7.59375, "learning_rate": 1.993232268543584e-05, "loss": 0.1221, "loss_lm": 0.017928557703271508, "loss_seg": 0.10417968314141035, "mean_token_accuracy": 0.9951948821544647, "num_tokens": 1726883052.0, "step": 4063 }, { "entropy": 0.018701500725001097, "epoch": 1.778640989167305, "grad_norm": 9.4375, "learning_rate": 1.9929615592853275e-05, "loss": 0.1395, "loss_lm": 0.01598638971336186, "loss_seg": 0.12346829753369093, "mean_token_accuracy": 0.9952506870031357, "num_tokens": 1727308123.0, "step": 4064 }, { "entropy": 0.018152338452637196, "epoch": 1.779078673815516, "grad_norm": 6.40625, "learning_rate": 1.9926908500270712e-05, "loss": 0.138, "loss_lm": 0.016036507207900286, "loss_seg": 0.12197034806013107, "mean_token_accuracy": 0.9952891319990158, "num_tokens": 1727732400.0, "step": 4065 }, { "entropy": 0.018243390135467052, "epoch": 1.7795163584637268, "grad_norm": 15.4375, "learning_rate": 1.9924201407688143e-05, "loss": 0.1143, "loss_lm": 0.01768141472712159, "loss_seg": 0.09666289202868938, "mean_token_accuracy": 0.995324894785881, "num_tokens": 1728157573.0, "step": 4066 }, { "entropy": 0.018329476937651634, "epoch": 1.7799540431119378, "grad_norm": 8.0625, "learning_rate": 1.9921494315105576e-05, "loss": 0.1036, "loss_lm": 0.015536909690126777, "loss_seg": 0.08804153650999069, "mean_token_accuracy": 0.9952436536550522, "num_tokens": 1728582426.0, "step": 4067 }, { "entropy": 0.017943181097507477, "epoch": 1.7803917277601489, "grad_norm": 7.0625, "learning_rate": 1.991878722252301e-05, "loss": 0.1132, "loss_lm": 0.016215292504057288, "loss_seg": 0.09699464589357376, "mean_token_accuracy": 0.9953752011060715, "num_tokens": 1729007481.0, "step": 4068 }, { "entropy": 0.018724769353866577, "epoch": 1.7808294124083597, "grad_norm": 4.71875, "learning_rate": 1.9916080129940444e-05, "loss": 0.124, "loss_lm": 0.016394893173128366, "loss_seg": 0.10755595751106739, "mean_token_accuracy": 0.9951717704534531, "num_tokens": 1729432853.0, "step": 4069 }, { "entropy": 0.01796443946659565, "epoch": 1.7812670970565707, "grad_norm": 6.1875, "learning_rate": 1.991337303735788e-05, "loss": 0.0889, "loss_lm": 0.0154535046312958, "loss_seg": 0.07345741242170334, "mean_token_accuracy": 0.9952554553747177, "num_tokens": 1729857919.0, "step": 4070 }, { "entropy": 0.018421916756778955, "epoch": 1.7817047817047817, "grad_norm": 10.125, "learning_rate": 1.991066594477531e-05, "loss": 0.0928, "loss_lm": 0.0154851742554456, "loss_seg": 0.077298438642174, "mean_token_accuracy": 0.99526546895504, "num_tokens": 1730283267.0, "step": 4071 }, { "entropy": 0.017909695394337177, "epoch": 1.7821424663529926, "grad_norm": 4.03125, "learning_rate": 1.9907958852192745e-05, "loss": 0.0937, "loss_lm": 0.016023287316784263, "loss_seg": 0.07765313237905502, "mean_token_accuracy": 0.9952708035707474, "num_tokens": 1730708856.0, "step": 4072 }, { "entropy": 0.019089105539023876, "epoch": 1.7825801510012036, "grad_norm": 10.3125, "learning_rate": 1.990525175961018e-05, "loss": 0.0862, "loss_lm": 0.01577202184125781, "loss_seg": 0.07039047311991453, "mean_token_accuracy": 0.9949377924203873, "num_tokens": 1731133960.0, "step": 4073 }, { "entropy": 0.01834923028945923, "epoch": 1.7830178356494146, "grad_norm": 5.53125, "learning_rate": 1.9902544667027612e-05, "loss": 0.1295, "loss_lm": 0.015318280551582575, "loss_seg": 0.11416957341134548, "mean_token_accuracy": 0.9952390491962433, "num_tokens": 1731558821.0, "step": 4074 }, { "entropy": 0.017912788316607475, "epoch": 1.7834555202976254, "grad_norm": 20.375, "learning_rate": 1.989983757444505e-05, "loss": 0.1227, "loss_lm": 0.015442339237779379, "loss_seg": 0.10724805667996407, "mean_token_accuracy": 0.9953626692295074, "num_tokens": 1731983348.0, "step": 4075 }, { "entropy": 0.018661524634808302, "epoch": 1.7838932049458365, "grad_norm": 9.875, "learning_rate": 1.989713048186248e-05, "loss": 0.095, "loss_lm": 0.017543518915772438, "loss_seg": 0.07749410439282656, "mean_token_accuracy": 0.995071068406105, "num_tokens": 1732408163.0, "step": 4076 }, { "entropy": 0.017606152687221766, "epoch": 1.7843308895940475, "grad_norm": 6.65625, "learning_rate": 1.9894423389279914e-05, "loss": 0.1196, "loss_lm": 0.014246348524466157, "loss_seg": 0.10532247088849545, "mean_token_accuracy": 0.9953813403844833, "num_tokens": 1732833117.0, "step": 4077 }, { "entropy": 0.0187541120685637, "epoch": 1.7847685742422583, "grad_norm": 12.6875, "learning_rate": 1.9891716296697347e-05, "loss": 0.1178, "loss_lm": 0.015967513900250196, "loss_seg": 0.1018706876784563, "mean_token_accuracy": 0.9951298534870148, "num_tokens": 1733258915.0, "step": 4078 }, { "entropy": 0.018886426463723183, "epoch": 1.7852062588904696, "grad_norm": 11.6875, "learning_rate": 1.988900920411478e-05, "loss": 0.1545, "loss_lm": 0.01398762408643961, "loss_seg": 0.14049923047423363, "mean_token_accuracy": 0.9951010942459106, "num_tokens": 1733683675.0, "step": 4079 }, { "entropy": 0.01858290797099471, "epoch": 1.7856439435386804, "grad_norm": 10.1875, "learning_rate": 1.9886302111532215e-05, "loss": 0.1091, "loss_lm": 0.017392746871337295, "loss_seg": 0.09167872741818428, "mean_token_accuracy": 0.9952331632375717, "num_tokens": 1734109026.0, "step": 4080 }, { "entropy": 0.01776585029438138, "epoch": 1.7860816281868912, "grad_norm": 11.5, "learning_rate": 1.988359501894965e-05, "loss": 0.1123, "loss_lm": 0.016315879300236702, "loss_seg": 0.09599711559712887, "mean_token_accuracy": 0.9952648133039474, "num_tokens": 1734533697.0, "step": 4081 }, { "entropy": 0.0183954113163054, "epoch": 1.7865193128351025, "grad_norm": 9.1875, "learning_rate": 1.9880887926367082e-05, "loss": 0.1465, "loss_lm": 0.01612405339255929, "loss_seg": 0.13035143539309502, "mean_token_accuracy": 0.9952714443206787, "num_tokens": 1734958900.0, "step": 4082 }, { "entropy": 0.018586148042231798, "epoch": 1.7869569974833133, "grad_norm": 8.875, "learning_rate": 1.9878180833784516e-05, "loss": 0.129, "loss_lm": 0.017719196155667305, "loss_seg": 0.11123435106128454, "mean_token_accuracy": 0.9951696991920471, "num_tokens": 1735384265.0, "step": 4083 }, { "entropy": 0.01851700944826007, "epoch": 1.787394682131524, "grad_norm": 11.875, "learning_rate": 1.987547374120195e-05, "loss": 0.1177, "loss_lm": 0.015464114490896463, "loss_seg": 0.10225380584597588, "mean_token_accuracy": 0.9952447414398193, "num_tokens": 1735808559.0, "step": 4084 }, { "entropy": 0.01868998957797885, "epoch": 1.7878323667797353, "grad_norm": 6.34375, "learning_rate": 1.9872766648619383e-05, "loss": 0.1322, "loss_lm": 0.0156852004583925, "loss_seg": 0.11650153808295727, "mean_token_accuracy": 0.9951187819242477, "num_tokens": 1736233823.0, "step": 4085 }, { "entropy": 0.01832760591059923, "epoch": 1.7882700514279462, "grad_norm": 11.375, "learning_rate": 1.9870059556036817e-05, "loss": 0.1144, "loss_lm": 0.017031081020832062, "loss_seg": 0.09735937882214785, "mean_token_accuracy": 0.9953326135873795, "num_tokens": 1736659521.0, "step": 4086 }, { "entropy": 0.018237802665680647, "epoch": 1.7887077360761572, "grad_norm": 3.75, "learning_rate": 1.986735246345425e-05, "loss": 0.1002, "loss_lm": 0.015444786055013537, "loss_seg": 0.0847824327647686, "mean_token_accuracy": 0.9953601062297821, "num_tokens": 1737083840.0, "step": 4087 }, { "entropy": 0.018164155539125204, "epoch": 1.7891454207243682, "grad_norm": 8.75, "learning_rate": 1.9864645370871685e-05, "loss": 0.1384, "loss_lm": 0.019292880315333605, "loss_seg": 0.1191401919350028, "mean_token_accuracy": 0.9953067004680634, "num_tokens": 1737508653.0, "step": 4088 }, { "entropy": 0.018193576019257307, "epoch": 1.789583105372579, "grad_norm": 7.65625, "learning_rate": 1.9861938278289118e-05, "loss": 0.0997, "loss_lm": 0.017579492880031466, "loss_seg": 0.08210466708987951, "mean_token_accuracy": 0.9953445047140121, "num_tokens": 1737933254.0, "step": 4089 }, { "entropy": 0.01818237267434597, "epoch": 1.79002079002079, "grad_norm": 4.1875, "learning_rate": 1.985923118570655e-05, "loss": 0.1277, "loss_lm": 0.01606941013596952, "loss_seg": 0.11166766844689846, "mean_token_accuracy": 0.9952829033136368, "num_tokens": 1738357798.0, "step": 4090 }, { "entropy": 0.017956964671611786, "epoch": 1.790458474669001, "grad_norm": 5.34375, "learning_rate": 1.9856524093123986e-05, "loss": 0.1493, "loss_lm": 0.01444981642998755, "loss_seg": 0.13481514528393745, "mean_token_accuracy": 0.9953627288341522, "num_tokens": 1738783002.0, "step": 4091 }, { "entropy": 0.018269394990056753, "epoch": 1.790896159317212, "grad_norm": 4.6875, "learning_rate": 1.985381700054142e-05, "loss": 0.1077, "loss_lm": 0.01681742095388472, "loss_seg": 0.09092125575989485, "mean_token_accuracy": 0.9952950775623322, "num_tokens": 1739208223.0, "step": 4092 }, { "entropy": 0.01819076156243682, "epoch": 1.791333843965423, "grad_norm": 7.6875, "learning_rate": 1.9851109907958853e-05, "loss": 0.1457, "loss_lm": 0.014371861470863223, "loss_seg": 0.13136696070432663, "mean_token_accuracy": 0.9952820241451263, "num_tokens": 1739633150.0, "step": 4093 }, { "entropy": 0.018399683758616447, "epoch": 1.791771528613634, "grad_norm": 23.25, "learning_rate": 1.9848402815376287e-05, "loss": 0.1499, "loss_lm": 0.015029420610517263, "loss_seg": 0.13483835756778717, "mean_token_accuracy": 0.995221883058548, "num_tokens": 1740058185.0, "step": 4094 }, { "entropy": 0.01811322057619691, "epoch": 1.7922092132618448, "grad_norm": 4.59375, "learning_rate": 1.9845695722793717e-05, "loss": 0.1708, "loss_lm": 0.01638697599992156, "loss_seg": 0.15446236915886402, "mean_token_accuracy": 0.9953591376543045, "num_tokens": 1740483258.0, "step": 4095 }, { "entropy": 0.018223861698061228, "epoch": 1.7926468979100558, "grad_norm": 6.15625, "learning_rate": 1.9842988630211154e-05, "loss": 0.1288, "loss_lm": 0.014051954029127955, "loss_seg": 0.11478336341679096, "mean_token_accuracy": 0.9952672570943832, "num_tokens": 1740908307.0, "step": 4096 }, { "entropy": 0.01838059490546584, "epoch": 1.7930845825582669, "grad_norm": 14.5625, "learning_rate": 1.9840281537628588e-05, "loss": 0.1233, "loss_lm": 0.014800540870055556, "loss_seg": 0.1084807775914669, "mean_token_accuracy": 0.9952718615531921, "num_tokens": 1741333131.0, "step": 4097 }, { "entropy": 0.01900300197303295, "epoch": 1.7935222672064777, "grad_norm": 14.375, "learning_rate": 1.9837574445046022e-05, "loss": 0.1194, "loss_lm": 0.01709060836583376, "loss_seg": 0.10235273651778698, "mean_token_accuracy": 0.9950876832008362, "num_tokens": 1741758765.0, "step": 4098 }, { "entropy": 0.01829646760597825, "epoch": 1.7939599518546887, "grad_norm": 20.0, "learning_rate": 1.9834867352463456e-05, "loss": 0.1509, "loss_lm": 0.017971133114770055, "loss_seg": 0.13288897089660168, "mean_token_accuracy": 0.995244950056076, "num_tokens": 1742183960.0, "step": 4099 }, { "entropy": 0.018080197740346193, "epoch": 1.7943976365028997, "grad_norm": 12.625, "learning_rate": 1.9832160259880886e-05, "loss": 0.1131, "loss_lm": 0.016652939841151237, "loss_seg": 0.09648992493748665, "mean_token_accuracy": 0.9953358769416809, "num_tokens": 1742609289.0, "step": 4100 }, { "entropy": 0.01871837116777897, "epoch": 1.7948353211511106, "grad_norm": 6.15625, "learning_rate": 1.9829453167298323e-05, "loss": 0.1165, "loss_lm": 0.015744016971439123, "loss_seg": 0.10079837311059237, "mean_token_accuracy": 0.9952092319726944, "num_tokens": 1743034242.0, "step": 4101 }, { "entropy": 0.01784114120528102, "epoch": 1.7952730057993216, "grad_norm": 4.875, "learning_rate": 1.9826746074715757e-05, "loss": 0.1034, "loss_lm": 0.013670175801962614, "loss_seg": 0.08975799567997456, "mean_token_accuracy": 0.9954035878181458, "num_tokens": 1743457778.0, "step": 4102 }, { "entropy": 0.018860590178519487, "epoch": 1.7957106904475326, "grad_norm": 8.0625, "learning_rate": 1.982403898213319e-05, "loss": 0.0818, "loss_lm": 0.016290405532345176, "loss_seg": 0.06549493689090014, "mean_token_accuracy": 0.9951545298099518, "num_tokens": 1743882684.0, "step": 4103 }, { "entropy": 0.01881779031828046, "epoch": 1.7961483750957434, "grad_norm": 5.0625, "learning_rate": 1.9821331889550624e-05, "loss": 0.1094, "loss_lm": 0.014586359029635787, "loss_seg": 0.0948163140565157, "mean_token_accuracy": 0.9950772076845169, "num_tokens": 1744307817.0, "step": 4104 }, { "entropy": 0.01838801894336939, "epoch": 1.7965860597439545, "grad_norm": 8.3125, "learning_rate": 1.9818624796968055e-05, "loss": 0.0964, "loss_lm": 0.01562803261913359, "loss_seg": 0.08079692907631397, "mean_token_accuracy": 0.9952463358640671, "num_tokens": 1744732461.0, "step": 4105 }, { "entropy": 0.017635947093367577, "epoch": 1.7970237443921655, "grad_norm": 7.1875, "learning_rate": 1.981591770438549e-05, "loss": 0.0984, "loss_lm": 0.014528668951243162, "loss_seg": 0.0838946495205164, "mean_token_accuracy": 0.9953775256872177, "num_tokens": 1745157945.0, "step": 4106 }, { "entropy": 0.018647621385753155, "epoch": 1.7974614290403763, "grad_norm": 8.25, "learning_rate": 1.9813210611802925e-05, "loss": 0.1071, "loss_lm": 0.019158794078975916, "loss_seg": 0.087972866371274, "mean_token_accuracy": 0.9950841516256332, "num_tokens": 1745583657.0, "step": 4107 }, { "entropy": 0.01879696873947978, "epoch": 1.7978991136885873, "grad_norm": 12.25, "learning_rate": 1.981050351922036e-05, "loss": 0.1294, "loss_lm": 0.020506714237853885, "loss_seg": 0.10885467752814293, "mean_token_accuracy": 0.9950763583183289, "num_tokens": 1746008321.0, "step": 4108 }, { "entropy": 0.01866435492411256, "epoch": 1.7983367983367984, "grad_norm": 5.0, "learning_rate": 1.9807796426637793e-05, "loss": 0.1291, "loss_lm": 0.016329942038282752, "loss_seg": 0.11272823438048363, "mean_token_accuracy": 0.9950210899114609, "num_tokens": 1746433094.0, "step": 4109 }, { "entropy": 0.018431545235216618, "epoch": 1.7987744829850092, "grad_norm": 5.46875, "learning_rate": 1.9805089334055223e-05, "loss": 0.1217, "loss_lm": 0.016179765574634075, "loss_seg": 0.10556495934724808, "mean_token_accuracy": 0.995121955871582, "num_tokens": 1746857589.0, "step": 4110 }, { "entropy": 0.018667187076061964, "epoch": 1.7992121676332202, "grad_norm": 5.375, "learning_rate": 1.9802382241472657e-05, "loss": 0.1337, "loss_lm": 0.017237557098269463, "loss_seg": 0.1165062990039587, "mean_token_accuracy": 0.9950025379657745, "num_tokens": 1747282757.0, "step": 4111 }, { "entropy": 0.018386818002909422, "epoch": 1.7996498522814313, "grad_norm": 8.0, "learning_rate": 1.9799675148890094e-05, "loss": 0.1485, "loss_lm": 0.016101409681141376, "loss_seg": 0.13237921707332134, "mean_token_accuracy": 0.9953230023384094, "num_tokens": 1747707765.0, "step": 4112 }, { "entropy": 0.01810313668102026, "epoch": 1.800087536929642, "grad_norm": 7.5, "learning_rate": 1.9796968056307528e-05, "loss": 0.1296, "loss_lm": 0.014966077404096723, "loss_seg": 0.11462280340492725, "mean_token_accuracy": 0.9951781630516052, "num_tokens": 1748133288.0, "step": 4113 }, { "entropy": 0.01842528348788619, "epoch": 1.800525221577853, "grad_norm": 5.84375, "learning_rate": 1.9794260963724958e-05, "loss": 0.149, "loss_lm": 0.018159805331379175, "loss_seg": 0.13084320351481438, "mean_token_accuracy": 0.9952072948217392, "num_tokens": 1748558698.0, "step": 4114 }, { "entropy": 0.01841125078499317, "epoch": 1.8009629062260641, "grad_norm": 9.5, "learning_rate": 1.9791553871142392e-05, "loss": 0.1224, "loss_lm": 0.016998210456222296, "loss_seg": 0.10541746579110622, "mean_token_accuracy": 0.9952922761440277, "num_tokens": 1748983894.0, "step": 4115 }, { "entropy": 0.01771041564643383, "epoch": 1.801400590874275, "grad_norm": 4.75, "learning_rate": 1.9788846778559825e-05, "loss": 0.0848, "loss_lm": 0.016747842309996486, "loss_seg": 0.06810214929282665, "mean_token_accuracy": 0.9953920990228653, "num_tokens": 1749409596.0, "step": 4116 }, { "entropy": 0.018360239919275045, "epoch": 1.8018382755224862, "grad_norm": 12.25, "learning_rate": 1.9786139685977263e-05, "loss": 0.1376, "loss_lm": 0.015685724560171366, "loss_seg": 0.12191790714859962, "mean_token_accuracy": 0.9951964616775513, "num_tokens": 1749834459.0, "step": 4117 }, { "entropy": 0.017928619869053364, "epoch": 1.802275960170697, "grad_norm": 29.75, "learning_rate": 1.9783432593394696e-05, "loss": 0.1027, "loss_lm": 0.016548037761822343, "loss_seg": 0.08611823245882988, "mean_token_accuracy": 0.9952776730060577, "num_tokens": 1750259195.0, "step": 4118 }, { "entropy": 0.018455791287124157, "epoch": 1.8027136448189078, "grad_norm": 5.90625, "learning_rate": 1.9780725500812127e-05, "loss": 0.1281, "loss_lm": 0.015789108350872993, "loss_seg": 0.11231521889567375, "mean_token_accuracy": 0.9951856285333633, "num_tokens": 1750684152.0, "step": 4119 }, { "entropy": 0.019010342191904783, "epoch": 1.803151329467119, "grad_norm": 4.5625, "learning_rate": 1.977801840822956e-05, "loss": 0.0906, "loss_lm": 0.015411355998367071, "loss_seg": 0.07519366592168808, "mean_token_accuracy": 0.9950870275497437, "num_tokens": 1751109517.0, "step": 4120 }, { "entropy": 0.018344696611166, "epoch": 1.80358901411533, "grad_norm": 8.0625, "learning_rate": 1.9775311315646994e-05, "loss": 0.1264, "loss_lm": 0.017222613794729114, "loss_seg": 0.1091925147920847, "mean_token_accuracy": 0.9950684756040573, "num_tokens": 1751535084.0, "step": 4121 }, { "entropy": 0.017600132152438164, "epoch": 1.8040266987635407, "grad_norm": 6.125, "learning_rate": 1.977260422306443e-05, "loss": 0.1067, "loss_lm": 0.013936415314674377, "loss_seg": 0.09280332643538713, "mean_token_accuracy": 0.9953908324241638, "num_tokens": 1751960704.0, "step": 4122 }, { "entropy": 0.01825567940250039, "epoch": 1.804464383411752, "grad_norm": 9.5625, "learning_rate": 1.9769897130481865e-05, "loss": 0.1018, "loss_lm": 0.015764452517032623, "loss_seg": 0.08602248691022396, "mean_token_accuracy": 0.995257779955864, "num_tokens": 1752385566.0, "step": 4123 }, { "entropy": 0.01831941958516836, "epoch": 1.8049020680599628, "grad_norm": 7.6875, "learning_rate": 1.9767190037899295e-05, "loss": 0.0855, "loss_lm": 0.014454311225563288, "loss_seg": 0.07106325402855873, "mean_token_accuracy": 0.9952215552330017, "num_tokens": 1752810351.0, "step": 4124 }, { "entropy": 0.018092789221554995, "epoch": 1.8053397527081736, "grad_norm": 6.0625, "learning_rate": 1.976448294531673e-05, "loss": 0.1419, "loss_lm": 0.015619802055880427, "loss_seg": 0.12625786662101746, "mean_token_accuracy": 0.9952414929866791, "num_tokens": 1753236034.0, "step": 4125 }, { "entropy": 0.018311072140932083, "epoch": 1.8057774373563849, "grad_norm": 6.03125, "learning_rate": 1.9761775852734163e-05, "loss": 0.1327, "loss_lm": 0.018430403899401426, "loss_seg": 0.1142557505518198, "mean_token_accuracy": 0.9951901137828827, "num_tokens": 1753661561.0, "step": 4126 }, { "entropy": 0.01840792177245021, "epoch": 1.8062151220045957, "grad_norm": 8.5625, "learning_rate": 1.97590687601516e-05, "loss": 0.1092, "loss_lm": 0.016706590307876468, "loss_seg": 0.09252267517149448, "mean_token_accuracy": 0.995282918214798, "num_tokens": 1754086537.0, "step": 4127 }, { "entropy": 0.01768503338098526, "epoch": 1.8066528066528067, "grad_norm": 27.75, "learning_rate": 1.9756361667569034e-05, "loss": 0.1251, "loss_lm": 0.016536894720047712, "loss_seg": 0.10856802575290203, "mean_token_accuracy": 0.9954222589731216, "num_tokens": 1754511133.0, "step": 4128 }, { "entropy": 0.01867519598454237, "epoch": 1.8070904913010177, "grad_norm": 15.25, "learning_rate": 1.9753654574986464e-05, "loss": 0.1615, "loss_lm": 0.015707203652709723, "loss_seg": 0.14579376205801964, "mean_token_accuracy": 0.995199903845787, "num_tokens": 1754936935.0, "step": 4129 }, { "entropy": 0.01858103694394231, "epoch": 1.8075281759492285, "grad_norm": 10.4375, "learning_rate": 1.9750947482403898e-05, "loss": 0.1472, "loss_lm": 0.0169897829182446, "loss_seg": 0.13020126521587372, "mean_token_accuracy": 0.99526447057724, "num_tokens": 1755362400.0, "step": 4130 }, { "entropy": 0.018607228994369507, "epoch": 1.8079658605974396, "grad_norm": 8.0625, "learning_rate": 1.974824038982133e-05, "loss": 0.1129, "loss_lm": 0.016522655030712485, "loss_seg": 0.09640336595475674, "mean_token_accuracy": 0.995157316327095, "num_tokens": 1755787135.0, "step": 4131 }, { "entropy": 0.01877740817144513, "epoch": 1.8084035452456506, "grad_norm": 6.59375, "learning_rate": 1.974553329723877e-05, "loss": 0.1304, "loss_lm": 0.014165196334943175, "loss_seg": 0.11619020625948906, "mean_token_accuracy": 0.9951810389757156, "num_tokens": 1756211893.0, "step": 4132 }, { "entropy": 0.018555399496108294, "epoch": 1.8088412298938614, "grad_norm": 6.1875, "learning_rate": 1.9742826204656202e-05, "loss": 0.1471, "loss_lm": 0.016649210127070546, "loss_seg": 0.13045376352965832, "mean_token_accuracy": 0.9953437745571136, "num_tokens": 1756636656.0, "step": 4133 }, { "entropy": 0.018530431669205427, "epoch": 1.8092789145420725, "grad_norm": 11.5625, "learning_rate": 1.9740119112073633e-05, "loss": 0.1176, "loss_lm": 0.015739757334813476, "loss_seg": 0.10186093300580978, "mean_token_accuracy": 0.9951542764902115, "num_tokens": 1757061338.0, "step": 4134 }, { "entropy": 0.018848138861358166, "epoch": 1.8097165991902835, "grad_norm": 9.5, "learning_rate": 1.9737412019491066e-05, "loss": 0.1157, "loss_lm": 0.017519575310871005, "loss_seg": 0.09817793779075146, "mean_token_accuracy": 0.9952024966478348, "num_tokens": 1757485693.0, "step": 4135 }, { "entropy": 0.018395330291241407, "epoch": 1.8101542838384943, "grad_norm": 19.0, "learning_rate": 1.97347049269085e-05, "loss": 0.1261, "loss_lm": 0.016598149901255965, "loss_seg": 0.10955019854009151, "mean_token_accuracy": 0.9952213764190674, "num_tokens": 1757911704.0, "step": 4136 }, { "entropy": 0.019072196912020445, "epoch": 1.8105919684867053, "grad_norm": 8.625, "learning_rate": 1.9731997834325937e-05, "loss": 0.1731, "loss_lm": 0.02147720940411091, "loss_seg": 0.15158462151885033, "mean_token_accuracy": 0.9950756132602692, "num_tokens": 1758336885.0, "step": 4137 }, { "entropy": 0.01905395183712244, "epoch": 1.8110296531349164, "grad_norm": 8.6875, "learning_rate": 1.9729290741743367e-05, "loss": 0.1171, "loss_lm": 0.015680475626140833, "loss_seg": 0.10139675624668598, "mean_token_accuracy": 0.9950591921806335, "num_tokens": 1758762279.0, "step": 4138 }, { "entropy": 0.01841719262301922, "epoch": 1.8114673377831272, "grad_norm": 3.8125, "learning_rate": 1.97265836491608e-05, "loss": 0.1025, "loss_lm": 0.014730665599927306, "loss_seg": 0.0877860076725483, "mean_token_accuracy": 0.9953381419181824, "num_tokens": 1759187383.0, "step": 4139 }, { "entropy": 0.018689953722059727, "epoch": 1.8119050224313382, "grad_norm": 10.125, "learning_rate": 1.9723876556578235e-05, "loss": 0.1424, "loss_lm": 0.01623059017583728, "loss_seg": 0.12617544643580914, "mean_token_accuracy": 0.9951720833778381, "num_tokens": 1759612364.0, "step": 4140 }, { "entropy": 0.018489631824195385, "epoch": 1.8123427070795493, "grad_norm": 14.25, "learning_rate": 1.972116946399567e-05, "loss": 0.1242, "loss_lm": 0.016569076105952263, "loss_seg": 0.10764294676482677, "mean_token_accuracy": 0.9952489584684372, "num_tokens": 1760037896.0, "step": 4141 }, { "entropy": 0.01902781706303358, "epoch": 1.81278039172776, "grad_norm": 4.125, "learning_rate": 1.9718462371413102e-05, "loss": 0.0914, "loss_lm": 0.01648830808699131, "loss_seg": 0.07486524153500795, "mean_token_accuracy": 0.9951675683259964, "num_tokens": 1760462684.0, "step": 4142 }, { "entropy": 0.0179433966986835, "epoch": 1.813218076375971, "grad_norm": 20.0, "learning_rate": 1.9715755278830536e-05, "loss": 0.1246, "loss_lm": 0.016044645803049207, "loss_seg": 0.1085338443517685, "mean_token_accuracy": 0.9953631311655045, "num_tokens": 1760888364.0, "step": 4143 }, { "entropy": 0.0189459677785635, "epoch": 1.8136557610241821, "grad_norm": 12.375, "learning_rate": 1.971304818624797e-05, "loss": 0.1687, "loss_lm": 0.017810382647439837, "loss_seg": 0.15085288882255554, "mean_token_accuracy": 0.995085746049881, "num_tokens": 1761313808.0, "step": 4144 }, { "entropy": 0.01848103804513812, "epoch": 1.814093445672393, "grad_norm": 10.6875, "learning_rate": 1.9710341093665404e-05, "loss": 0.1255, "loss_lm": 0.019326522015035152, "loss_seg": 0.10617669485509396, "mean_token_accuracy": 0.9952289164066315, "num_tokens": 1761739286.0, "step": 4145 }, { "entropy": 0.01927317027002573, "epoch": 1.814531130320604, "grad_norm": 24.75, "learning_rate": 1.9707634001082837e-05, "loss": 0.096, "loss_lm": 0.01645348221063614, "loss_seg": 0.07951238378882408, "mean_token_accuracy": 0.9949886351823807, "num_tokens": 1762164739.0, "step": 4146 }, { "entropy": 0.0181331648491323, "epoch": 1.814968814968815, "grad_norm": 5.3125, "learning_rate": 1.970492690850027e-05, "loss": 0.1335, "loss_lm": 0.014874645508825779, "loss_seg": 0.11862386390566826, "mean_token_accuracy": 0.9954061806201935, "num_tokens": 1762589170.0, "step": 4147 }, { "entropy": 0.01817715633660555, "epoch": 1.8154064996170258, "grad_norm": 5.46875, "learning_rate": 1.9702219815917705e-05, "loss": 0.1314, "loss_lm": 0.015756677370518446, "loss_seg": 0.11566498689353466, "mean_token_accuracy": 0.9952263087034225, "num_tokens": 1763014114.0, "step": 4148 }, { "entropy": 0.01823386736214161, "epoch": 1.8158441842652369, "grad_norm": 8.6875, "learning_rate": 1.969951272333514e-05, "loss": 0.1268, "loss_lm": 0.014405603287741542, "loss_seg": 0.11244162730872631, "mean_token_accuracy": 0.9953348934650421, "num_tokens": 1763439138.0, "step": 4149 }, { "entropy": 0.01822029845789075, "epoch": 1.816281868913448, "grad_norm": 4.4375, "learning_rate": 1.9696805630752572e-05, "loss": 0.1265, "loss_lm": 0.01721369498409331, "loss_seg": 0.1092838253825903, "mean_token_accuracy": 0.9952482581138611, "num_tokens": 1763864187.0, "step": 4150 }, { "entropy": 0.01841435907408595, "epoch": 1.8167195535616587, "grad_norm": 4.0, "learning_rate": 1.9694098538170006e-05, "loss": 0.1034, "loss_lm": 0.0173516774084419, "loss_seg": 0.08600173704326153, "mean_token_accuracy": 0.9952826052904129, "num_tokens": 1764288535.0, "step": 4151 }, { "entropy": 0.018287003505975008, "epoch": 1.8171572382098697, "grad_norm": 8.5625, "learning_rate": 1.969139144558744e-05, "loss": 0.1508, "loss_lm": 0.015874295961111784, "loss_seg": 0.1349155306816101, "mean_token_accuracy": 0.9952333569526672, "num_tokens": 1764713920.0, "step": 4152 }, { "entropy": 0.01810831017792225, "epoch": 1.8175949228580808, "grad_norm": 9.0625, "learning_rate": 1.9688684353004873e-05, "loss": 0.0854, "loss_lm": 0.014261344447731972, "loss_seg": 0.07109659072011709, "mean_token_accuracy": 0.9951990097761154, "num_tokens": 1765138576.0, "step": 4153 }, { "entropy": 0.0186601378954947, "epoch": 1.8180326075062916, "grad_norm": 7.875, "learning_rate": 1.9685977260422307e-05, "loss": 0.1327, "loss_lm": 0.018126215552911162, "loss_seg": 0.11458291485905647, "mean_token_accuracy": 0.9952014982700348, "num_tokens": 1765563269.0, "step": 4154 }, { "entropy": 0.017915513832122087, "epoch": 1.8184702921545028, "grad_norm": 20.625, "learning_rate": 1.968327016783974e-05, "loss": 0.116, "loss_lm": 0.015006921254098415, "loss_seg": 0.10098191630095243, "mean_token_accuracy": 0.9953272342681885, "num_tokens": 1765988034.0, "step": 4155 }, { "entropy": 0.01904444070532918, "epoch": 1.8189079768027137, "grad_norm": 7.53125, "learning_rate": 1.9680563075257175e-05, "loss": 0.1193, "loss_lm": 0.017901626648381352, "loss_seg": 0.10137757752090693, "mean_token_accuracy": 0.995124563574791, "num_tokens": 1766412376.0, "step": 4156 }, { "entropy": 0.01876564370468259, "epoch": 1.8193456614509245, "grad_norm": 4.03125, "learning_rate": 1.9677855982674608e-05, "loss": 0.0905, "loss_lm": 0.017236444633454084, "loss_seg": 0.07327393628656864, "mean_token_accuracy": 0.9951166808605194, "num_tokens": 1766836821.0, "step": 4157 }, { "entropy": 0.01781102502718568, "epoch": 1.8197833460991357, "grad_norm": 12.0625, "learning_rate": 1.9675148890092042e-05, "loss": 0.1079, "loss_lm": 0.016229630215093493, "loss_seg": 0.09171191696077585, "mean_token_accuracy": 0.9953576922416687, "num_tokens": 1767261753.0, "step": 4158 }, { "entropy": 0.017734609078615904, "epoch": 1.8202210307473465, "grad_norm": 9.75, "learning_rate": 1.9672441797509476e-05, "loss": 0.1001, "loss_lm": 0.015155411092564464, "loss_seg": 0.08495226595550776, "mean_token_accuracy": 0.9953569620847702, "num_tokens": 1767685957.0, "step": 4159 }, { "entropy": 0.018375303130596876, "epoch": 1.8206587153955573, "grad_norm": 11.3125, "learning_rate": 1.966973470492691e-05, "loss": 0.1262, "loss_lm": 0.015968214254826307, "loss_seg": 0.1101837269961834, "mean_token_accuracy": 0.9952331781387329, "num_tokens": 1768111379.0, "step": 4160 }, { "entropy": 0.018976110965013504, "epoch": 1.8210964000437686, "grad_norm": 17.25, "learning_rate": 1.9667027612344343e-05, "loss": 0.1285, "loss_lm": 0.01526606548577547, "loss_seg": 0.11327813286334276, "mean_token_accuracy": 0.9951686859130859, "num_tokens": 1768536076.0, "step": 4161 }, { "entropy": 0.018238536082208157, "epoch": 1.8215340846919794, "grad_norm": 4.84375, "learning_rate": 1.9664320519761773e-05, "loss": 0.1354, "loss_lm": 0.014533451292663813, "loss_seg": 0.12082480639219284, "mean_token_accuracy": 0.9952150881290436, "num_tokens": 1768961329.0, "step": 4162 }, { "entropy": 0.01797971036285162, "epoch": 1.8219717693401902, "grad_norm": 4.1875, "learning_rate": 1.966161342717921e-05, "loss": 0.1079, "loss_lm": 0.015716290567070246, "loss_seg": 0.09216175973415375, "mean_token_accuracy": 0.995408445596695, "num_tokens": 1769385968.0, "step": 4163 }, { "entropy": 0.01771512720733881, "epoch": 1.8224094539884015, "grad_norm": 6.25, "learning_rate": 1.9658906334596644e-05, "loss": 0.1156, "loss_lm": 0.0171250207349658, "loss_seg": 0.09852431155741215, "mean_token_accuracy": 0.9953052699565887, "num_tokens": 1769810680.0, "step": 4164 }, { "entropy": 0.017732487991452217, "epoch": 1.8228471386366123, "grad_norm": 6.3125, "learning_rate": 1.9656199242014078e-05, "loss": 0.1399, "loss_lm": 0.015043982537463307, "loss_seg": 0.12488870788365602, "mean_token_accuracy": 0.9953641295433044, "num_tokens": 1770235136.0, "step": 4165 }, { "entropy": 0.017712936270982027, "epoch": 1.8232848232848233, "grad_norm": 8.9375, "learning_rate": 1.9653492149431512e-05, "loss": 0.0951, "loss_lm": 0.014524406753480434, "loss_seg": 0.08057991042733192, "mean_token_accuracy": 0.9952939301729202, "num_tokens": 1770660292.0, "step": 4166 }, { "entropy": 0.018176478799432516, "epoch": 1.8237225079330344, "grad_norm": 7.4375, "learning_rate": 1.9650785056848942e-05, "loss": 0.1408, "loss_lm": 0.014986703870818019, "loss_seg": 0.12584854662418365, "mean_token_accuracy": 0.995345413684845, "num_tokens": 1771084987.0, "step": 4167 }, { "entropy": 0.01913451310247183, "epoch": 1.8241601925812452, "grad_norm": 6.1875, "learning_rate": 1.964807796426638e-05, "loss": 0.1361, "loss_lm": 0.017431821208447218, "loss_seg": 0.11869056336581707, "mean_token_accuracy": 0.9951324015855789, "num_tokens": 1771509796.0, "step": 4168 }, { "entropy": 0.01846961723640561, "epoch": 1.8245978772294562, "grad_norm": 5.875, "learning_rate": 1.9645370871683813e-05, "loss": 0.1276, "loss_lm": 0.01594680454581976, "loss_seg": 0.11163672804832458, "mean_token_accuracy": 0.9952297806739807, "num_tokens": 1771935094.0, "step": 4169 }, { "entropy": 0.01840911153703928, "epoch": 1.8250355618776672, "grad_norm": 8.4375, "learning_rate": 1.9642663779101247e-05, "loss": 0.1068, "loss_lm": 0.016575945308431983, "loss_seg": 0.09027380496263504, "mean_token_accuracy": 0.9951992779970169, "num_tokens": 1772359754.0, "step": 4170 }, { "entropy": 0.018083004746586084, "epoch": 1.825473246525878, "grad_norm": 32.0, "learning_rate": 1.963995668651868e-05, "loss": 0.124, "loss_lm": 0.016954798717051744, "loss_seg": 0.10701074730604887, "mean_token_accuracy": 0.9952509552240372, "num_tokens": 1772784435.0, "step": 4171 }, { "entropy": 0.018593233544379473, "epoch": 1.825910931174089, "grad_norm": 5.9375, "learning_rate": 1.963724959393611e-05, "loss": 0.1008, "loss_lm": 0.014953114092350006, "loss_seg": 0.08584755100309849, "mean_token_accuracy": 0.9951434880495071, "num_tokens": 1773209594.0, "step": 4172 }, { "entropy": 0.01827717898413539, "epoch": 1.8263486158223001, "grad_norm": 10.0625, "learning_rate": 1.9634542501353548e-05, "loss": 0.1265, "loss_lm": 0.016044946620240808, "loss_seg": 0.11048859264701605, "mean_token_accuracy": 0.9951901137828827, "num_tokens": 1773634514.0, "step": 4173 }, { "entropy": 0.018538024742156267, "epoch": 1.826786300470511, "grad_norm": 13.875, "learning_rate": 1.963183540877098e-05, "loss": 0.1437, "loss_lm": 0.01516114640980959, "loss_seg": 0.12856432422995567, "mean_token_accuracy": 0.9951619803905487, "num_tokens": 1774059864.0, "step": 4174 }, { "entropy": 0.018105490133166313, "epoch": 1.827223985118722, "grad_norm": 6.0, "learning_rate": 1.9629128316188415e-05, "loss": 0.1483, "loss_lm": 0.01573304133489728, "loss_seg": 0.13257229328155518, "mean_token_accuracy": 0.9952731281518936, "num_tokens": 1774485228.0, "step": 4175 }, { "entropy": 0.017762312665581703, "epoch": 1.827661669766933, "grad_norm": 4.71875, "learning_rate": 1.962642122360585e-05, "loss": 0.1194, "loss_lm": 0.015282917767763138, "loss_seg": 0.10408405400812626, "mean_token_accuracy": 0.995455801486969, "num_tokens": 1774910715.0, "step": 4176 }, { "entropy": 0.018503264524042606, "epoch": 1.8280993544151438, "grad_norm": 5.03125, "learning_rate": 1.962371413102328e-05, "loss": 0.1088, "loss_lm": 0.019193457905203104, "loss_seg": 0.08964184019714594, "mean_token_accuracy": 0.9952318966388702, "num_tokens": 1775336208.0, "step": 4177 }, { "entropy": 0.01820104429498315, "epoch": 1.8285370390633549, "grad_norm": 13.75, "learning_rate": 1.9621007038440713e-05, "loss": 0.1091, "loss_lm": 0.015111512038856745, "loss_seg": 0.09401951543986797, "mean_token_accuracy": 0.995312973856926, "num_tokens": 1775760959.0, "step": 4178 }, { "entropy": 0.018512450624257326, "epoch": 1.8289747237115659, "grad_norm": 4.65625, "learning_rate": 1.961829994585815e-05, "loss": 0.0976, "loss_lm": 0.015543489949777722, "loss_seg": 0.08202005177736282, "mean_token_accuracy": 0.995193213224411, "num_tokens": 1776186137.0, "step": 4179 }, { "entropy": 0.017897238489240408, "epoch": 1.8294124083597767, "grad_norm": 7.65625, "learning_rate": 1.9615592853275584e-05, "loss": 0.1544, "loss_lm": 0.015835850033909082, "loss_seg": 0.1386006809771061, "mean_token_accuracy": 0.9953125417232513, "num_tokens": 1776612114.0, "step": 4180 }, { "entropy": 0.018348197918385267, "epoch": 1.8298500930079877, "grad_norm": 7.3125, "learning_rate": 1.9612885760693014e-05, "loss": 0.1297, "loss_lm": 0.016757961828261614, "loss_seg": 0.11298656836152077, "mean_token_accuracy": 0.9952606856822968, "num_tokens": 1777036993.0, "step": 4181 }, { "entropy": 0.017900229431688786, "epoch": 1.8302877776561988, "grad_norm": 28.5, "learning_rate": 1.9610178668110448e-05, "loss": 0.1618, "loss_lm": 0.015888289781287313, "loss_seg": 0.14592559449374676, "mean_token_accuracy": 0.9953114837408066, "num_tokens": 1777461609.0, "step": 4182 }, { "entropy": 0.019198513589799404, "epoch": 1.8307254623044096, "grad_norm": 13.5, "learning_rate": 1.9607471575527882e-05, "loss": 0.1279, "loss_lm": 0.0168662304058671, "loss_seg": 0.11106781288981438, "mean_token_accuracy": 0.9949820041656494, "num_tokens": 1777888045.0, "step": 4183 }, { "entropy": 0.018668535631150007, "epoch": 1.8311631469526206, "grad_norm": 4.25, "learning_rate": 1.960476448294532e-05, "loss": 0.117, "loss_lm": 0.016865186858922243, "loss_seg": 0.10013325791805983, "mean_token_accuracy": 0.9951626062393188, "num_tokens": 1778313238.0, "step": 4184 }, { "entropy": 0.01820705085992813, "epoch": 1.8316008316008316, "grad_norm": 6.9375, "learning_rate": 1.9602057390362753e-05, "loss": 0.1384, "loss_lm": 0.016169531270861626, "loss_seg": 0.1222783550620079, "mean_token_accuracy": 0.9951491206884384, "num_tokens": 1778738074.0, "step": 4185 }, { "entropy": 0.01820347597822547, "epoch": 1.8320385162490425, "grad_norm": 9.875, "learning_rate": 1.9599350297780183e-05, "loss": 0.1231, "loss_lm": 0.01707759709097445, "loss_seg": 0.10598950274288654, "mean_token_accuracy": 0.9952539801597595, "num_tokens": 1779162609.0, "step": 4186 }, { "entropy": 0.01791467983275652, "epoch": 1.8324762008972535, "grad_norm": 6.5625, "learning_rate": 1.9596643205197617e-05, "loss": 0.1644, "loss_lm": 0.018404046073555946, "loss_seg": 0.14596573635935783, "mean_token_accuracy": 0.9953510165214539, "num_tokens": 1779587800.0, "step": 4187 }, { "entropy": 0.018462571781128645, "epoch": 1.8329138855454645, "grad_norm": 5.46875, "learning_rate": 1.959393611261505e-05, "loss": 0.1623, "loss_lm": 0.01695016655139625, "loss_seg": 0.14534751884639263, "mean_token_accuracy": 0.9953123480081558, "num_tokens": 1780013347.0, "step": 4188 }, { "entropy": 0.018093347549438477, "epoch": 1.8333515701936753, "grad_norm": 3.640625, "learning_rate": 1.9591229020032487e-05, "loss": 0.1014, "loss_lm": 0.014042338589206338, "loss_seg": 0.08732042461633682, "mean_token_accuracy": 0.9953474849462509, "num_tokens": 1780437785.0, "step": 4189 }, { "entropy": 0.018302539363503456, "epoch": 1.8337892548418864, "grad_norm": 7.90625, "learning_rate": 1.958852192744992e-05, "loss": 0.0893, "loss_lm": 0.017941306810826063, "loss_seg": 0.07132545299828053, "mean_token_accuracy": 0.9952484369277954, "num_tokens": 1780862437.0, "step": 4190 }, { "entropy": 0.018064431846141815, "epoch": 1.8342269394900974, "grad_norm": 8.375, "learning_rate": 1.958581483486735e-05, "loss": 0.1149, "loss_lm": 0.018376723863184452, "loss_seg": 0.0965182464569807, "mean_token_accuracy": 0.9951829165220261, "num_tokens": 1781287558.0, "step": 4191 }, { "entropy": 0.018825176171958447, "epoch": 1.8346646241383082, "grad_norm": 5.5, "learning_rate": 1.9583107742284785e-05, "loss": 0.1466, "loss_lm": 0.016899204812943935, "loss_seg": 0.12967165000736713, "mean_token_accuracy": 0.9951110929250717, "num_tokens": 1781712571.0, "step": 4192 }, { "entropy": 0.018290878739207983, "epoch": 1.8351023087865195, "grad_norm": 17.0, "learning_rate": 1.958040064970222e-05, "loss": 0.1722, "loss_lm": 0.017724349163472652, "loss_seg": 0.15447509661316872, "mean_token_accuracy": 0.9952059388160706, "num_tokens": 1782137643.0, "step": 4193 }, { "entropy": 0.018802344799041748, "epoch": 1.8355399934347303, "grad_norm": 6.0625, "learning_rate": 1.9577693557119656e-05, "loss": 0.1133, "loss_lm": 0.016875638626515865, "loss_seg": 0.09642638545483351, "mean_token_accuracy": 0.9951723515987396, "num_tokens": 1782563931.0, "step": 4194 }, { "entropy": 0.018400991801172495, "epoch": 1.835977678082941, "grad_norm": 5.375, "learning_rate": 1.957498646453709e-05, "loss": 0.1073, "loss_lm": 0.0157884422224015, "loss_seg": 0.09146858565509319, "mean_token_accuracy": 0.9952773600816727, "num_tokens": 1782988648.0, "step": 4195 }, { "entropy": 0.018542390316724777, "epoch": 1.8364153627311524, "grad_norm": 6.6875, "learning_rate": 1.957227937195452e-05, "loss": 0.1101, "loss_lm": 0.014165065251290798, "loss_seg": 0.0959057155996561, "mean_token_accuracy": 0.9951790124177933, "num_tokens": 1783414454.0, "step": 4196 }, { "entropy": 0.018496717792004347, "epoch": 1.8368530473793632, "grad_norm": 7.0, "learning_rate": 1.9569572279371954e-05, "loss": 0.1697, "loss_lm": 0.01621508551761508, "loss_seg": 0.15345064364373684, "mean_token_accuracy": 0.995198667049408, "num_tokens": 1783839498.0, "step": 4197 }, { "entropy": 0.018593521788716316, "epoch": 1.837290732027574, "grad_norm": 4.59375, "learning_rate": 1.9566865186789388e-05, "loss": 0.1224, "loss_lm": 0.01678348402492702, "loss_seg": 0.10566046368330717, "mean_token_accuracy": 0.9952787160873413, "num_tokens": 1784265433.0, "step": 4198 }, { "entropy": 0.017993047833442688, "epoch": 1.8377284166757852, "grad_norm": 35.25, "learning_rate": 1.9564158094206825e-05, "loss": 0.1105, "loss_lm": 0.0175565080717206, "loss_seg": 0.092933539301157, "mean_token_accuracy": 0.9953795075416565, "num_tokens": 1784690662.0, "step": 4199 }, { "entropy": 0.0179162067361176, "epoch": 1.838166101323996, "grad_norm": 11.9375, "learning_rate": 1.956145100162426e-05, "loss": 0.1087, "loss_lm": 0.015795200830325484, "loss_seg": 0.09292591735720634, "mean_token_accuracy": 0.9953353852033615, "num_tokens": 1785115520.0, "step": 4200 }, { "entropy": 0.01802327996119857, "epoch": 1.8386037859722069, "grad_norm": 7.9375, "learning_rate": 1.955874390904169e-05, "loss": 0.1304, "loss_lm": 0.015990385552868247, "loss_seg": 0.11442462727427483, "mean_token_accuracy": 0.9952372163534164, "num_tokens": 1785540874.0, "step": 4201 }, { "entropy": 0.01790688605979085, "epoch": 1.8390414706204181, "grad_norm": 10.5625, "learning_rate": 1.9556036816459123e-05, "loss": 0.1083, "loss_lm": 0.015489624347537756, "loss_seg": 0.09285658597946167, "mean_token_accuracy": 0.9952473789453506, "num_tokens": 1785965440.0, "step": 4202 }, { "entropy": 0.017920411191880703, "epoch": 1.839479155268629, "grad_norm": 19.625, "learning_rate": 1.9553329723876556e-05, "loss": 0.1028, "loss_lm": 0.014765811152756214, "loss_seg": 0.08807786274701357, "mean_token_accuracy": 0.9952942281961441, "num_tokens": 1786389342.0, "step": 4203 }, { "entropy": 0.017778855748474598, "epoch": 1.83991683991684, "grad_norm": 10.125, "learning_rate": 1.9550622631293993e-05, "loss": 0.1198, "loss_lm": 0.01575720077380538, "loss_seg": 0.1039965283125639, "mean_token_accuracy": 0.9953776895999908, "num_tokens": 1786814178.0, "step": 4204 }, { "entropy": 0.018006589729338884, "epoch": 1.840354524565051, "grad_norm": 5.28125, "learning_rate": 1.9547915538711424e-05, "loss": 0.1221, "loss_lm": 0.016050308709964156, "loss_seg": 0.10606243088841438, "mean_token_accuracy": 0.9953370243310928, "num_tokens": 1787239184.0, "step": 4205 }, { "entropy": 0.01874251151457429, "epoch": 1.8407922092132618, "grad_norm": 10.0, "learning_rate": 1.9545208446128857e-05, "loss": 0.101, "loss_lm": 0.017650851514190435, "loss_seg": 0.08336080424487591, "mean_token_accuracy": 0.9950915575027466, "num_tokens": 1787664541.0, "step": 4206 }, { "entropy": 0.018487812019884586, "epoch": 1.8412298938614728, "grad_norm": 12.75, "learning_rate": 1.954250135354629e-05, "loss": 0.1333, "loss_lm": 0.015640757512301207, "loss_seg": 0.11768180876970291, "mean_token_accuracy": 0.995160847902298, "num_tokens": 1788089764.0, "step": 4207 }, { "entropy": 0.017954005859792233, "epoch": 1.8416675785096839, "grad_norm": 18.75, "learning_rate": 1.9539794260963725e-05, "loss": 0.0916, "loss_lm": 0.015475135995075107, "loss_seg": 0.07608029060065746, "mean_token_accuracy": 0.9953232258558273, "num_tokens": 1788514988.0, "step": 4208 }, { "entropy": 0.01852842327207327, "epoch": 1.8421052631578947, "grad_norm": 6.53125, "learning_rate": 1.953708716838116e-05, "loss": 0.0734, "loss_lm": 0.016871965257450938, "loss_seg": 0.05651448108255863, "mean_token_accuracy": 0.9951258152723312, "num_tokens": 1788940655.0, "step": 4209 }, { "entropy": 0.017767106648534536, "epoch": 1.8425429478061057, "grad_norm": 9.375, "learning_rate": 1.9534380075798592e-05, "loss": 0.1259, "loss_lm": 0.015898310346528888, "loss_seg": 0.10999685991555452, "mean_token_accuracy": 0.9952999502420425, "num_tokens": 1789364851.0, "step": 4210 }, { "entropy": 0.01846376247704029, "epoch": 1.8429806324543168, "grad_norm": 7.40625, "learning_rate": 1.9531672983216026e-05, "loss": 0.1154, "loss_lm": 0.015635577496141195, "loss_seg": 0.09981433674693108, "mean_token_accuracy": 0.99513378739357, "num_tokens": 1789788982.0, "step": 4211 }, { "entropy": 0.018716714810580015, "epoch": 1.8434183171025276, "grad_norm": 9.4375, "learning_rate": 1.952896589063346e-05, "loss": 0.1208, "loss_lm": 0.017172908643260598, "loss_seg": 0.10365154035389423, "mean_token_accuracy": 0.9951377809047699, "num_tokens": 1790214490.0, "step": 4212 }, { "entropy": 0.01814216747879982, "epoch": 1.8438560017507386, "grad_norm": 9.5, "learning_rate": 1.9526258798050894e-05, "loss": 0.119, "loss_lm": 0.015386227751150727, "loss_seg": 0.10363850183784962, "mean_token_accuracy": 0.9953187704086304, "num_tokens": 1790639559.0, "step": 4213 }, { "entropy": 0.018238741904497147, "epoch": 1.8442936863989496, "grad_norm": 65.0, "learning_rate": 1.9523551705468327e-05, "loss": 0.1355, "loss_lm": 0.015727269928902388, "loss_seg": 0.11980815604329109, "mean_token_accuracy": 0.995267391204834, "num_tokens": 1791064631.0, "step": 4214 }, { "entropy": 0.0182320442982018, "epoch": 1.8447313710471605, "grad_norm": 12.125, "learning_rate": 1.952084461288576e-05, "loss": 0.1288, "loss_lm": 0.0169827735517174, "loss_seg": 0.11179425567388535, "mean_token_accuracy": 0.995220810174942, "num_tokens": 1791490408.0, "step": 4215 }, { "entropy": 0.0182998888194561, "epoch": 1.8451690556953715, "grad_norm": 8.0, "learning_rate": 1.9518137520303195e-05, "loss": 0.1357, "loss_lm": 0.01599890599027276, "loss_seg": 0.11965472064912319, "mean_token_accuracy": 0.9952002018690109, "num_tokens": 1791915614.0, "step": 4216 }, { "entropy": 0.018680714070796967, "epoch": 1.8456067403435825, "grad_norm": 10.6875, "learning_rate": 1.951543042772063e-05, "loss": 0.1748, "loss_lm": 0.01779164676554501, "loss_seg": 0.15698936954140663, "mean_token_accuracy": 0.9951306581497192, "num_tokens": 1792340094.0, "step": 4217 }, { "entropy": 0.0184788815677166, "epoch": 1.8460444249917933, "grad_norm": 5.4375, "learning_rate": 1.9512723335138062e-05, "loss": 0.1323, "loss_lm": 0.015604016603901982, "loss_seg": 0.11665166169404984, "mean_token_accuracy": 0.9951042234897614, "num_tokens": 1792766301.0, "step": 4218 }, { "entropy": 0.01884575467556715, "epoch": 1.8464821096400044, "grad_norm": 10.1875, "learning_rate": 1.9510016242555496e-05, "loss": 0.123, "loss_lm": 0.014446909306570888, "loss_seg": 0.10858801007270813, "mean_token_accuracy": 0.9951730519533157, "num_tokens": 1793191378.0, "step": 4219 }, { "entropy": 0.01815914176404476, "epoch": 1.8469197942882154, "grad_norm": 11.25, "learning_rate": 1.950730914997293e-05, "loss": 0.1101, "loss_lm": 0.017723912373185158, "loss_seg": 0.09236254915595055, "mean_token_accuracy": 0.9952871650457382, "num_tokens": 1793616555.0, "step": 4220 }, { "entropy": 0.017878192476928234, "epoch": 1.8473574789364262, "grad_norm": 7.65625, "learning_rate": 1.9504602057390363e-05, "loss": 0.117, "loss_lm": 0.01527207950130105, "loss_seg": 0.10172596760094166, "mean_token_accuracy": 0.9952604323625565, "num_tokens": 1794041229.0, "step": 4221 }, { "entropy": 0.018498274963349104, "epoch": 1.8477951635846372, "grad_norm": 11.6875, "learning_rate": 1.9501894964807797e-05, "loss": 0.1128, "loss_lm": 0.015127735212445259, "loss_seg": 0.097699542529881, "mean_token_accuracy": 0.9952255338430405, "num_tokens": 1794465537.0, "step": 4222 }, { "entropy": 0.01777846086770296, "epoch": 1.8482328482328483, "grad_norm": 7.96875, "learning_rate": 1.949918787222523e-05, "loss": 0.1096, "loss_lm": 0.015676815761253238, "loss_seg": 0.09388073068112135, "mean_token_accuracy": 0.995349109172821, "num_tokens": 1794890570.0, "step": 4223 }, { "entropy": 0.017848983872681856, "epoch": 1.848670532881059, "grad_norm": 14.9375, "learning_rate": 1.9496480779642665e-05, "loss": 0.1471, "loss_lm": 0.01665761205367744, "loss_seg": 0.1304769217967987, "mean_token_accuracy": 0.9952727407217026, "num_tokens": 1795315297.0, "step": 4224 }, { "entropy": 0.01866585249081254, "epoch": 1.8491082175292701, "grad_norm": 10.8125, "learning_rate": 1.9493773687060098e-05, "loss": 0.1154, "loss_lm": 0.01703778305090964, "loss_seg": 0.09834452159702778, "mean_token_accuracy": 0.9951564818620682, "num_tokens": 1795740817.0, "step": 4225 }, { "entropy": 0.018557564355432987, "epoch": 1.8495459021774812, "grad_norm": 21.625, "learning_rate": 1.9491066594477532e-05, "loss": 0.1697, "loss_lm": 0.015442793490365148, "loss_seg": 0.15422586351633072, "mean_token_accuracy": 0.9951584190130234, "num_tokens": 1796165657.0, "step": 4226 }, { "entropy": 0.017582630272954702, "epoch": 1.849983586825692, "grad_norm": 4.03125, "learning_rate": 1.9488359501894966e-05, "loss": 0.123, "loss_lm": 0.014351918827742338, "loss_seg": 0.10869729891419411, "mean_token_accuracy": 0.9954548925161362, "num_tokens": 1796589579.0, "step": 4227 }, { "entropy": 0.018480594269931316, "epoch": 1.850421271473903, "grad_norm": 4.625, "learning_rate": 1.94856524093124e-05, "loss": 0.1121, "loss_lm": 0.01426644274033606, "loss_seg": 0.09781368635594845, "mean_token_accuracy": 0.9950960427522659, "num_tokens": 1797014487.0, "step": 4228 }, { "entropy": 0.01825785217806697, "epoch": 1.850858956122114, "grad_norm": 7.25, "learning_rate": 1.948294531672983e-05, "loss": 0.1202, "loss_lm": 0.017716655042022467, "loss_seg": 0.10245948005467653, "mean_token_accuracy": 0.995169535279274, "num_tokens": 1797439877.0, "step": 4229 }, { "entropy": 0.018675181549042463, "epoch": 1.8512966407703249, "grad_norm": 6.03125, "learning_rate": 1.9480238224147267e-05, "loss": 0.1218, "loss_lm": 0.015951052540913224, "loss_seg": 0.10587173700332642, "mean_token_accuracy": 0.9950101375579834, "num_tokens": 1797864504.0, "step": 4230 }, { "entropy": 0.018736183643341064, "epoch": 1.851734325418536, "grad_norm": 39.75, "learning_rate": 1.94775311315647e-05, "loss": 0.1151, "loss_lm": 0.01836371049284935, "loss_seg": 0.09670843370258808, "mean_token_accuracy": 0.9950554072856903, "num_tokens": 1798290007.0, "step": 4231 }, { "entropy": 0.018266194965690374, "epoch": 1.852172010066747, "grad_norm": 2.703125, "learning_rate": 1.9474824038982134e-05, "loss": 0.1136, "loss_lm": 0.015615341952070594, "loss_seg": 0.09793544188141823, "mean_token_accuracy": 0.9950887709856033, "num_tokens": 1798715942.0, "step": 4232 }, { "entropy": 0.01784240733832121, "epoch": 1.8526096947149577, "grad_norm": 11.1875, "learning_rate": 1.9472116946399568e-05, "loss": 0.1114, "loss_lm": 0.015592597424983978, "loss_seg": 0.09582540206611156, "mean_token_accuracy": 0.995190441608429, "num_tokens": 1799140898.0, "step": 4233 }, { "entropy": 0.01805116655305028, "epoch": 1.853047379363169, "grad_norm": 13.4375, "learning_rate": 1.9469409853817e-05, "loss": 0.1246, "loss_lm": 0.018078184686601162, "loss_seg": 0.10651103965938091, "mean_token_accuracy": 0.9953181147575378, "num_tokens": 1799565474.0, "step": 4234 }, { "entropy": 0.018068230245262384, "epoch": 1.8534850640113798, "grad_norm": 9.0, "learning_rate": 1.9466702761234435e-05, "loss": 0.1386, "loss_lm": 0.014484931016340852, "loss_seg": 0.1241064965724945, "mean_token_accuracy": 0.9951883852481842, "num_tokens": 1799990236.0, "step": 4235 }, { "entropy": 0.018550050910562277, "epoch": 1.8539227486595906, "grad_norm": 6.71875, "learning_rate": 1.946399566865187e-05, "loss": 0.1234, "loss_lm": 0.01728857448324561, "loss_seg": 0.10609900392591953, "mean_token_accuracy": 0.9951414465904236, "num_tokens": 1800415454.0, "step": 4236 }, { "entropy": 0.018033126834779978, "epoch": 1.8543604333078019, "grad_norm": 8.875, "learning_rate": 1.9461288576069303e-05, "loss": 0.1755, "loss_lm": 0.01667028502561152, "loss_seg": 0.15884798020124435, "mean_token_accuracy": 0.995193138718605, "num_tokens": 1800840440.0, "step": 4237 }, { "entropy": 0.01828308356925845, "epoch": 1.8547981179560127, "grad_norm": 9.125, "learning_rate": 1.9458581483486737e-05, "loss": 0.1385, "loss_lm": 0.01799053093418479, "loss_seg": 0.12049003690481186, "mean_token_accuracy": 0.9952401369810104, "num_tokens": 1801264572.0, "step": 4238 }, { "entropy": 0.01783949974924326, "epoch": 1.8552358026042235, "grad_norm": 9.75, "learning_rate": 1.9455874390904167e-05, "loss": 0.0987, "loss_lm": 0.015940298326313496, "loss_seg": 0.08272473979741335, "mean_token_accuracy": 0.9952843487262726, "num_tokens": 1801689660.0, "step": 4239 }, { "entropy": 0.01826626295223832, "epoch": 1.8556734872524348, "grad_norm": 14.6875, "learning_rate": 1.9453167298321604e-05, "loss": 0.118, "loss_lm": 0.016925020841881633, "loss_seg": 0.10109653323888779, "mean_token_accuracy": 0.9952091127634048, "num_tokens": 1802114812.0, "step": 4240 }, { "entropy": 0.018289221450686455, "epoch": 1.8561111719006456, "grad_norm": 15.0625, "learning_rate": 1.9450460205739038e-05, "loss": 0.1084, "loss_lm": 0.016115412581712008, "loss_seg": 0.09232969488948584, "mean_token_accuracy": 0.9951874613761902, "num_tokens": 1802539728.0, "step": 4241 }, { "entropy": 0.01862499536946416, "epoch": 1.8565488565488566, "grad_norm": 4.0625, "learning_rate": 1.944775311315647e-05, "loss": 0.0925, "loss_lm": 0.015984526136890054, "loss_seg": 0.07648512907326221, "mean_token_accuracy": 0.9951690584421158, "num_tokens": 1802964446.0, "step": 4242 }, { "entropy": 0.018012315966188908, "epoch": 1.8569865411970676, "grad_norm": 5.46875, "learning_rate": 1.9445046020573905e-05, "loss": 0.0818, "loss_lm": 0.015503169735893607, "loss_seg": 0.06632634997367859, "mean_token_accuracy": 0.9952570796012878, "num_tokens": 1803389872.0, "step": 4243 }, { "entropy": 0.01867785584181547, "epoch": 1.8574242258452784, "grad_norm": 12.3125, "learning_rate": 1.9442338927991336e-05, "loss": 0.0944, "loss_lm": 0.018976419232785702, "loss_seg": 0.07547297514975071, "mean_token_accuracy": 0.9951278865337372, "num_tokens": 1803814302.0, "step": 4244 }, { "entropy": 0.018603945150971413, "epoch": 1.8578619104934895, "grad_norm": 7.46875, "learning_rate": 1.943963183540877e-05, "loss": 0.1687, "loss_lm": 0.016653082566335797, "loss_seg": 0.1520108673721552, "mean_token_accuracy": 0.9951193928718567, "num_tokens": 1804239223.0, "step": 4245 }, { "entropy": 0.018612676300108433, "epoch": 1.8582995951417005, "grad_norm": 7.59375, "learning_rate": 1.9436924742826206e-05, "loss": 0.1197, "loss_lm": 0.017677066847682, "loss_seg": 0.10200786776840687, "mean_token_accuracy": 0.9950775057077408, "num_tokens": 1804664582.0, "step": 4246 }, { "entropy": 0.018219035118818283, "epoch": 1.8587372797899113, "grad_norm": 5.09375, "learning_rate": 1.943421765024364e-05, "loss": 0.108, "loss_lm": 0.017171864863485098, "loss_seg": 0.0908718965947628, "mean_token_accuracy": 0.9952447861433029, "num_tokens": 1805089342.0, "step": 4247 }, { "entropy": 0.017811585683375597, "epoch": 1.8591749644381224, "grad_norm": 6.40625, "learning_rate": 1.9431510557661074e-05, "loss": 0.1119, "loss_lm": 0.01574144489131868, "loss_seg": 0.09618002362549305, "mean_token_accuracy": 0.9952765256166458, "num_tokens": 1805513987.0, "step": 4248 }, { "entropy": 0.019099868834018707, "epoch": 1.8596126490863334, "grad_norm": 9.375, "learning_rate": 1.9428803465078504e-05, "loss": 0.0984, "loss_lm": 0.015050019603222609, "loss_seg": 0.08330841921269894, "mean_token_accuracy": 0.9950079172849655, "num_tokens": 1805938875.0, "step": 4249 }, { "entropy": 0.017801365815103054, "epoch": 1.8600503337345442, "grad_norm": 15.75, "learning_rate": 1.9426096372495938e-05, "loss": 0.1323, "loss_lm": 0.017366312444210052, "loss_seg": 0.11492521688342094, "mean_token_accuracy": 0.9954063296318054, "num_tokens": 1806364210.0, "step": 4250 }, { "entropy": 0.018461786210536957, "epoch": 1.8604880183827552, "grad_norm": 8.4375, "learning_rate": 1.9423389279913375e-05, "loss": 0.1448, "loss_lm": 0.015503205824643373, "loss_seg": 0.12925212271511555, "mean_token_accuracy": 0.9951992928981781, "num_tokens": 1806789540.0, "step": 4251 }, { "entropy": 0.018240938894450665, "epoch": 1.8609257030309663, "grad_norm": 6.1875, "learning_rate": 1.942068218733081e-05, "loss": 0.095, "loss_lm": 0.017131667351350188, "loss_seg": 0.07784905936568975, "mean_token_accuracy": 0.9953272491693497, "num_tokens": 1807214075.0, "step": 4252 }, { "entropy": 0.0184929552488029, "epoch": 1.861363387679177, "grad_norm": 11.375, "learning_rate": 1.941797509474824e-05, "loss": 0.1423, "loss_lm": 0.017463787458837032, "loss_seg": 0.12484784610569477, "mean_token_accuracy": 0.9952920377254486, "num_tokens": 1807638844.0, "step": 4253 }, { "entropy": 0.018523359671235085, "epoch": 1.8618010723273881, "grad_norm": 7.40625, "learning_rate": 1.9415268002165673e-05, "loss": 0.1121, "loss_lm": 0.014139814302325249, "loss_seg": 0.09793555922806263, "mean_token_accuracy": 0.9951898604631424, "num_tokens": 1808064158.0, "step": 4254 }, { "entropy": 0.01802840828895569, "epoch": 1.8622387569755992, "grad_norm": 7.25, "learning_rate": 1.9412560909583107e-05, "loss": 0.1089, "loss_lm": 0.01443666685372591, "loss_seg": 0.0944947600364685, "mean_token_accuracy": 0.9953282475471497, "num_tokens": 1808488978.0, "step": 4255 }, { "entropy": 0.01891328301280737, "epoch": 1.86267644162381, "grad_norm": 30.75, "learning_rate": 1.9409853817000544e-05, "loss": 0.1073, "loss_lm": 0.015706433216109872, "loss_seg": 0.09157729335129261, "mean_token_accuracy": 0.9949954003095627, "num_tokens": 1808914176.0, "step": 4256 }, { "entropy": 0.018536834977567196, "epoch": 1.863114126272021, "grad_norm": 7.53125, "learning_rate": 1.9407146724417977e-05, "loss": 0.1516, "loss_lm": 0.01768957683816552, "loss_seg": 0.1339077688753605, "mean_token_accuracy": 0.9951675236225128, "num_tokens": 1809338772.0, "step": 4257 }, { "entropy": 0.018539115320891142, "epoch": 1.863551810920232, "grad_norm": 17.75, "learning_rate": 1.9404439631835408e-05, "loss": 0.1503, "loss_lm": 0.016623544739559293, "loss_seg": 0.13367887772619724, "mean_token_accuracy": 0.9951861202716827, "num_tokens": 1809764189.0, "step": 4258 }, { "entropy": 0.018491530790925026, "epoch": 1.8639894955684428, "grad_norm": 17.125, "learning_rate": 1.940173253925284e-05, "loss": 0.1039, "loss_lm": 0.016723489854484797, "loss_seg": 0.087220324203372, "mean_token_accuracy": 0.9950821846723557, "num_tokens": 1810189120.0, "step": 4259 }, { "entropy": 0.018335928209125996, "epoch": 1.8644271802166539, "grad_norm": 8.875, "learning_rate": 1.9399025446670275e-05, "loss": 0.0983, "loss_lm": 0.01697140675969422, "loss_seg": 0.08135135658085346, "mean_token_accuracy": 0.9952265918254852, "num_tokens": 1810614361.0, "step": 4260 }, { "entropy": 0.018660842906683683, "epoch": 1.864864864864865, "grad_norm": 3.796875, "learning_rate": 1.9396318354087712e-05, "loss": 0.0797, "loss_lm": 0.013872829731553793, "loss_seg": 0.06580498069524765, "mean_token_accuracy": 0.995165541768074, "num_tokens": 1811039407.0, "step": 4261 }, { "entropy": 0.019174708053469658, "epoch": 1.8653025495130757, "grad_norm": 6.34375, "learning_rate": 1.9393611261505146e-05, "loss": 0.1532, "loss_lm": 0.018461587838828564, "loss_seg": 0.13473892025649548, "mean_token_accuracy": 0.9950974732637405, "num_tokens": 1811463854.0, "step": 4262 }, { "entropy": 0.018583669792860746, "epoch": 1.8657402341612868, "grad_norm": 41.75, "learning_rate": 1.9390904168922576e-05, "loss": 0.1448, "loss_lm": 0.01573683531023562, "loss_seg": 0.12909365072846413, "mean_token_accuracy": 0.9951760619878769, "num_tokens": 1811889150.0, "step": 4263 }, { "entropy": 0.01846892386674881, "epoch": 1.8661779188094978, "grad_norm": 8.5625, "learning_rate": 1.938819707634001e-05, "loss": 0.1041, "loss_lm": 0.01650528539903462, "loss_seg": 0.0875580869615078, "mean_token_accuracy": 0.9952475875616074, "num_tokens": 1812314790.0, "step": 4264 }, { "entropy": 0.018715158570557833, "epoch": 1.8666156034577086, "grad_norm": 12.375, "learning_rate": 1.9385489983757444e-05, "loss": 0.1004, "loss_lm": 0.013781983405351639, "loss_seg": 0.08656938560307026, "mean_token_accuracy": 0.9951585829257965, "num_tokens": 1812739628.0, "step": 4265 }, { "entropy": 0.01806145440787077, "epoch": 1.8670532881059196, "grad_norm": 8.4375, "learning_rate": 1.938278289117488e-05, "loss": 0.1272, "loss_lm": 0.014330642065033317, "loss_seg": 0.11289159394800663, "mean_token_accuracy": 0.9953829199075699, "num_tokens": 1813163878.0, "step": 4266 }, { "entropy": 0.01868698187172413, "epoch": 1.8674909727541307, "grad_norm": 19.5, "learning_rate": 1.9380075798592315e-05, "loss": 0.1215, "loss_lm": 0.01737065240740776, "loss_seg": 0.10417153500020504, "mean_token_accuracy": 0.9951295703649521, "num_tokens": 1813589465.0, "step": 4267 }, { "entropy": 0.018936290871351957, "epoch": 1.8679286574023415, "grad_norm": 6.375, "learning_rate": 1.9377368706009745e-05, "loss": 0.106, "loss_lm": 0.015729221515357494, "loss_seg": 0.09028562530875206, "mean_token_accuracy": 0.9950817972421646, "num_tokens": 1814014776.0, "step": 4268 }, { "entropy": 0.018493113573640585, "epoch": 1.8683663420505527, "grad_norm": 5.3125, "learning_rate": 1.937466161342718e-05, "loss": 0.1188, "loss_lm": 0.015589139424264431, "loss_seg": 0.10319125279784203, "mean_token_accuracy": 0.9952325969934464, "num_tokens": 1814440126.0, "step": 4269 }, { "entropy": 0.018794821109622717, "epoch": 1.8688040266987636, "grad_norm": 7.59375, "learning_rate": 1.9371954520844613e-05, "loss": 0.1432, "loss_lm": 0.019388641696423292, "loss_seg": 0.12381350621581078, "mean_token_accuracy": 0.9951737821102142, "num_tokens": 1814865275.0, "step": 4270 }, { "entropy": 0.01950396178290248, "epoch": 1.8692417113469744, "grad_norm": 5.3125, "learning_rate": 1.936924742826205e-05, "loss": 0.1132, "loss_lm": 0.016940567176789045, "loss_seg": 0.09623067080974579, "mean_token_accuracy": 0.9949396103620529, "num_tokens": 1815290313.0, "step": 4271 }, { "entropy": 0.018578120972961187, "epoch": 1.8696793959951856, "grad_norm": 8.3125, "learning_rate": 1.9366540335679483e-05, "loss": 0.1489, "loss_lm": 0.016356813721358776, "loss_seg": 0.13252075389027596, "mean_token_accuracy": 0.9952255189418793, "num_tokens": 1815715240.0, "step": 4272 }, { "entropy": 0.018141134176403284, "epoch": 1.8701170806433964, "grad_norm": 8.1875, "learning_rate": 1.9363833243096914e-05, "loss": 0.1333, "loss_lm": 0.01800351683050394, "loss_seg": 0.11529395915567875, "mean_token_accuracy": 0.9952655881643295, "num_tokens": 1816139520.0, "step": 4273 }, { "entropy": 0.018181513529270887, "epoch": 1.8705547652916072, "grad_norm": 9.4375, "learning_rate": 1.9361126150514347e-05, "loss": 0.1517, "loss_lm": 0.016090344404801726, "loss_seg": 0.13562542386353016, "mean_token_accuracy": 0.9952850937843323, "num_tokens": 1816564304.0, "step": 4274 }, { "entropy": 0.018106564413756132, "epoch": 1.8709924499398185, "grad_norm": 15.0625, "learning_rate": 1.935841905793178e-05, "loss": 0.1555, "loss_lm": 0.017677633557468653, "loss_seg": 0.13784512504935265, "mean_token_accuracy": 0.9953787177801132, "num_tokens": 1816989231.0, "step": 4275 }, { "entropy": 0.01795760588720441, "epoch": 1.8714301345880293, "grad_norm": 6.6875, "learning_rate": 1.9355711965349215e-05, "loss": 0.1286, "loss_lm": 0.014699447434395552, "loss_seg": 0.11393194552510977, "mean_token_accuracy": 0.9953464567661285, "num_tokens": 1817414672.0, "step": 4276 }, { "entropy": 0.018350801896303892, "epoch": 1.8718678192362401, "grad_norm": 9.8125, "learning_rate": 1.935300487276665e-05, "loss": 0.1235, "loss_lm": 0.015987493796274066, "loss_seg": 0.10751391015946865, "mean_token_accuracy": 0.9953209012746811, "num_tokens": 1817839712.0, "step": 4277 }, { "entropy": 0.018393945414572954, "epoch": 1.8723055038844514, "grad_norm": 8.5, "learning_rate": 1.9350297780184082e-05, "loss": 0.1108, "loss_lm": 0.01726414286531508, "loss_seg": 0.09353541769087315, "mean_token_accuracy": 0.9952390342950821, "num_tokens": 1818264793.0, "step": 4278 }, { "entropy": 0.01853267289698124, "epoch": 1.8727431885326622, "grad_norm": 9.5, "learning_rate": 1.9347590687601516e-05, "loss": 0.1557, "loss_lm": 0.015963821904733777, "loss_seg": 0.13971614092588425, "mean_token_accuracy": 0.9951435625553131, "num_tokens": 1818689757.0, "step": 4279 }, { "entropy": 0.01807339582592249, "epoch": 1.8731808731808732, "grad_norm": 6.53125, "learning_rate": 1.934488359501895e-05, "loss": 0.1288, "loss_lm": 0.016273445915430784, "loss_seg": 0.11250497959554195, "mean_token_accuracy": 0.9954611659049988, "num_tokens": 1819114484.0, "step": 4280 }, { "entropy": 0.01834869710728526, "epoch": 1.8736185578290843, "grad_norm": 10.625, "learning_rate": 1.9342176502436384e-05, "loss": 0.0942, "loss_lm": 0.01710365549661219, "loss_seg": 0.07709439657628536, "mean_token_accuracy": 0.9951796233654022, "num_tokens": 1819539310.0, "step": 4281 }, { "entropy": 0.019273345358669758, "epoch": 1.874056242477295, "grad_norm": 9.6875, "learning_rate": 1.9339469409853817e-05, "loss": 0.1656, "loss_lm": 0.018448696471750736, "loss_seg": 0.14713398553431034, "mean_token_accuracy": 0.9949954152107239, "num_tokens": 1819963702.0, "step": 4282 }, { "entropy": 0.018031645566225052, "epoch": 1.874493927125506, "grad_norm": 7.1875, "learning_rate": 1.933676231727125e-05, "loss": 0.1163, "loss_lm": 0.015794940758496523, "loss_seg": 0.100543481297791, "mean_token_accuracy": 0.9953020662069321, "num_tokens": 1820388535.0, "step": 4283 }, { "entropy": 0.018207740504294634, "epoch": 1.8749316117737171, "grad_norm": 4.25, "learning_rate": 1.9334055224688685e-05, "loss": 0.1463, "loss_lm": 0.01598554872907698, "loss_seg": 0.13030057400465012, "mean_token_accuracy": 0.9952913522720337, "num_tokens": 1820813019.0, "step": 4284 }, { "entropy": 0.018124151043593884, "epoch": 1.875369296421928, "grad_norm": 5.21875, "learning_rate": 1.933134813210612e-05, "loss": 0.1008, "loss_lm": 0.015034507028758526, "loss_seg": 0.08574522845447063, "mean_token_accuracy": 0.9952919185161591, "num_tokens": 1821238227.0, "step": 4285 }, { "entropy": 0.017600938212126493, "epoch": 1.875806981070139, "grad_norm": 10.125, "learning_rate": 1.9328641039523552e-05, "loss": 0.1271, "loss_lm": 0.015152638079598546, "loss_seg": 0.11192463897168636, "mean_token_accuracy": 0.995338499546051, "num_tokens": 1821663151.0, "step": 4286 }, { "entropy": 0.017848804593086243, "epoch": 1.87624466571835, "grad_norm": 5.125, "learning_rate": 1.9325933946940986e-05, "loss": 0.1274, "loss_lm": 0.015374817652627826, "loss_seg": 0.11207190528512001, "mean_token_accuracy": 0.9953217059373856, "num_tokens": 1822088493.0, "step": 4287 }, { "entropy": 0.01814150856807828, "epoch": 1.8766823503665608, "grad_norm": 5.15625, "learning_rate": 1.932322685435842e-05, "loss": 0.1165, "loss_lm": 0.015278328908607364, "loss_seg": 0.10126623511314392, "mean_token_accuracy": 0.9952903538942337, "num_tokens": 1822513713.0, "step": 4288 }, { "entropy": 0.018225665669888258, "epoch": 1.8771200350147719, "grad_norm": 34.25, "learning_rate": 1.9320519761775853e-05, "loss": 0.0942, "loss_lm": 0.016918384237214923, "loss_seg": 0.07724562752991915, "mean_token_accuracy": 0.9952059537172318, "num_tokens": 1822938985.0, "step": 4289 }, { "entropy": 0.017763630021363497, "epoch": 1.877557719662983, "grad_norm": 7.5625, "learning_rate": 1.9317812669193287e-05, "loss": 0.1095, "loss_lm": 0.01433784095570445, "loss_seg": 0.09516023844480515, "mean_token_accuracy": 0.9953597337007523, "num_tokens": 1823364348.0, "step": 4290 }, { "entropy": 0.018350749742239714, "epoch": 1.8779954043111937, "grad_norm": 6.84375, "learning_rate": 1.931510557661072e-05, "loss": 0.1188, "loss_lm": 0.016577045898884535, "loss_seg": 0.10221663489937782, "mean_token_accuracy": 0.995362862944603, "num_tokens": 1823789319.0, "step": 4291 }, { "entropy": 0.017845116090029478, "epoch": 1.8784330889594048, "grad_norm": 6.46875, "learning_rate": 1.9312398484028154e-05, "loss": 0.0969, "loss_lm": 0.016701231244951487, "loss_seg": 0.08019813522696495, "mean_token_accuracy": 0.9953550398349762, "num_tokens": 1824213391.0, "step": 4292 }, { "entropy": 0.018502230755984783, "epoch": 1.8788707736076158, "grad_norm": 8.25, "learning_rate": 1.9309691391445588e-05, "loss": 0.1491, "loss_lm": 0.017388494685292244, "loss_seg": 0.1317111123353243, "mean_token_accuracy": 0.995140016078949, "num_tokens": 1824638659.0, "step": 4293 }, { "entropy": 0.018234597519040108, "epoch": 1.8793084582558266, "grad_norm": 9.5, "learning_rate": 1.9306984298863022e-05, "loss": 0.1068, "loss_lm": 0.016465552151203156, "loss_seg": 0.09031704999506474, "mean_token_accuracy": 0.9952098578214645, "num_tokens": 1825063661.0, "step": 4294 }, { "entropy": 0.018715691287070513, "epoch": 1.8797461429040376, "grad_norm": 5.78125, "learning_rate": 1.9304277206280456e-05, "loss": 0.1057, "loss_lm": 0.01846131170168519, "loss_seg": 0.0872882604598999, "mean_token_accuracy": 0.9951058626174927, "num_tokens": 1825488796.0, "step": 4295 }, { "entropy": 0.018209364730864763, "epoch": 1.8801838275522487, "grad_norm": 4.1875, "learning_rate": 1.930157011369789e-05, "loss": 0.112, "loss_lm": 0.01798552507534623, "loss_seg": 0.09401325508952141, "mean_token_accuracy": 0.9952573627233505, "num_tokens": 1825914249.0, "step": 4296 }, { "entropy": 0.018311866093426943, "epoch": 1.8806215122004595, "grad_norm": 5.3125, "learning_rate": 1.9298863021115323e-05, "loss": 0.0983, "loss_lm": 0.01565168146044016, "loss_seg": 0.08269570022821426, "mean_token_accuracy": 0.9952634125947952, "num_tokens": 1826338583.0, "step": 4297 }, { "entropy": 0.0180710949935019, "epoch": 1.8810591968486705, "grad_norm": 5.6875, "learning_rate": 1.9296155928532757e-05, "loss": 0.1537, "loss_lm": 0.016600061673671007, "loss_seg": 0.13709510304033756, "mean_token_accuracy": 0.9952328950166702, "num_tokens": 1826763118.0, "step": 4298 }, { "entropy": 0.01832259865477681, "epoch": 1.8814968814968815, "grad_norm": 10.0, "learning_rate": 1.929344883595019e-05, "loss": 0.1181, "loss_lm": 0.016470814123749733, "loss_seg": 0.1016281433403492, "mean_token_accuracy": 0.9951927959918976, "num_tokens": 1827188503.0, "step": 4299 }, { "entropy": 0.018220791593194008, "epoch": 1.8819345661450924, "grad_norm": 7.59375, "learning_rate": 1.9290741743367624e-05, "loss": 0.144, "loss_lm": 0.016798365861177444, "loss_seg": 0.12716746143996716, "mean_token_accuracy": 0.9952206760644913, "num_tokens": 1827612792.0, "step": 4300 }, { "entropy": 0.018032727763056755, "epoch": 1.8823722507933034, "grad_norm": 8.125, "learning_rate": 1.9288034650785055e-05, "loss": 0.1227, "loss_lm": 0.015094515634700656, "loss_seg": 0.10757715068757534, "mean_token_accuracy": 0.9953277260065079, "num_tokens": 1828038029.0, "step": 4301 }, { "entropy": 0.01857390347868204, "epoch": 1.8828099354415144, "grad_norm": 10.0625, "learning_rate": 1.9285327558202492e-05, "loss": 0.0873, "loss_lm": 0.014177043456584215, "loss_seg": 0.07311368919909, "mean_token_accuracy": 0.9952280223369598, "num_tokens": 1828463182.0, "step": 4302 }, { "entropy": 0.018662687856703997, "epoch": 1.8832476200897252, "grad_norm": 10.625, "learning_rate": 1.9282620465619925e-05, "loss": 0.1272, "loss_lm": 0.014322077156975865, "loss_seg": 0.11282812990248203, "mean_token_accuracy": 0.9952289909124374, "num_tokens": 1828888638.0, "step": 4303 }, { "entropy": 0.019259059336036444, "epoch": 1.8836853047379363, "grad_norm": 8.8125, "learning_rate": 1.927991337303736e-05, "loss": 0.1358, "loss_lm": 0.01608600909821689, "loss_seg": 0.11972780432552099, "mean_token_accuracy": 0.9950816035270691, "num_tokens": 1829313755.0, "step": 4304 }, { "entropy": 0.01804814161732793, "epoch": 1.8841229893861473, "grad_norm": 5.5625, "learning_rate": 1.9277206280454793e-05, "loss": 0.0802, "loss_lm": 0.018175254575908184, "loss_seg": 0.0620640879496932, "mean_token_accuracy": 0.9953435063362122, "num_tokens": 1829738926.0, "step": 4305 }, { "entropy": 0.01831262419000268, "epoch": 1.8845606740343581, "grad_norm": 7.5, "learning_rate": 1.9274499187872223e-05, "loss": 0.0928, "loss_lm": 0.014977680053561926, "loss_seg": 0.07778693083673716, "mean_token_accuracy": 0.9952097982168198, "num_tokens": 1830163518.0, "step": 4306 }, { "entropy": 0.018766860477626324, "epoch": 1.8849983586825694, "grad_norm": 7.3125, "learning_rate": 1.927179209528966e-05, "loss": 0.1204, "loss_lm": 0.01866114418953657, "loss_seg": 0.10176780819892883, "mean_token_accuracy": 0.9950221627950668, "num_tokens": 1830588785.0, "step": 4307 }, { "entropy": 0.01792967924848199, "epoch": 1.8854360433307802, "grad_norm": 3.71875, "learning_rate": 1.9269085002707094e-05, "loss": 0.111, "loss_lm": 0.015015845885500312, "loss_seg": 0.09597825445234776, "mean_token_accuracy": 0.9953524023294449, "num_tokens": 1831013699.0, "step": 4308 }, { "entropy": 0.0181480860337615, "epoch": 1.885873727978991, "grad_norm": 5.34375, "learning_rate": 1.9266377910124528e-05, "loss": 0.1266, "loss_lm": 0.016208787681534886, "loss_seg": 0.11043508164584637, "mean_token_accuracy": 0.995213970541954, "num_tokens": 1831439475.0, "step": 4309 }, { "entropy": 0.018221923150122166, "epoch": 1.8863114126272023, "grad_norm": 57.5, "learning_rate": 1.926367081754196e-05, "loss": 0.1165, "loss_lm": 0.01598936738446355, "loss_seg": 0.10048643127083778, "mean_token_accuracy": 0.9953158050775528, "num_tokens": 1831864272.0, "step": 4310 }, { "entropy": 0.019022177439182997, "epoch": 1.886749097275413, "grad_norm": 3.90625, "learning_rate": 1.9260963724959392e-05, "loss": 0.0937, "loss_lm": 0.01821771077811718, "loss_seg": 0.07549410965293646, "mean_token_accuracy": 0.9950232356786728, "num_tokens": 1832289406.0, "step": 4311 }, { "entropy": 0.018625727854669094, "epoch": 1.8871867819236239, "grad_norm": 8.875, "learning_rate": 1.9258256632376826e-05, "loss": 0.168, "loss_lm": 0.016461026156321168, "loss_seg": 0.15152805298566818, "mean_token_accuracy": 0.9951460361480713, "num_tokens": 1832714168.0, "step": 4312 }, { "entropy": 0.017793788574635983, "epoch": 1.8876244665718351, "grad_norm": 16.125, "learning_rate": 1.9255549539794263e-05, "loss": 0.1429, "loss_lm": 0.018029649974778295, "loss_seg": 0.1249164529144764, "mean_token_accuracy": 0.9953528642654419, "num_tokens": 1833139097.0, "step": 4313 }, { "entropy": 0.018344005569815636, "epoch": 1.888062151220046, "grad_norm": 9.875, "learning_rate": 1.9252842447211696e-05, "loss": 0.1318, "loss_lm": 0.016591759398579597, "loss_seg": 0.11521746590733528, "mean_token_accuracy": 0.9952726513147354, "num_tokens": 1833564496.0, "step": 4314 }, { "entropy": 0.01848779944702983, "epoch": 1.8884998358682568, "grad_norm": 5.15625, "learning_rate": 1.925013535462913e-05, "loss": 0.1736, "loss_lm": 0.015741862123832107, "loss_seg": 0.1578942909836769, "mean_token_accuracy": 0.9951795637607574, "num_tokens": 1833989291.0, "step": 4315 }, { "entropy": 0.018413751386106014, "epoch": 1.888937520516468, "grad_norm": 6.53125, "learning_rate": 1.924742826204656e-05, "loss": 0.126, "loss_lm": 0.0164002007804811, "loss_seg": 0.10955577529966831, "mean_token_accuracy": 0.9951857775449753, "num_tokens": 1834414460.0, "step": 4316 }, { "entropy": 0.018478662706911564, "epoch": 1.8893752051646788, "grad_norm": 3.1875, "learning_rate": 1.9244721169463994e-05, "loss": 0.1211, "loss_lm": 0.015330311842262745, "loss_seg": 0.10578942857682705, "mean_token_accuracy": 0.9952014684677124, "num_tokens": 1834839139.0, "step": 4317 }, { "entropy": 0.01789716025814414, "epoch": 1.8898128898128899, "grad_norm": 8.375, "learning_rate": 1.924201407688143e-05, "loss": 0.1409, "loss_lm": 0.015993017004802823, "loss_seg": 0.12490130588412285, "mean_token_accuracy": 0.9953425228595734, "num_tokens": 1835264003.0, "step": 4318 }, { "entropy": 0.018558700568974018, "epoch": 1.890250574461101, "grad_norm": 12.9375, "learning_rate": 1.9239306984298865e-05, "loss": 0.0881, "loss_lm": 0.01583416434004903, "loss_seg": 0.07224304880946875, "mean_token_accuracy": 0.9950840771198273, "num_tokens": 1835688518.0, "step": 4319 }, { "entropy": 0.01751565933227539, "epoch": 1.8906882591093117, "grad_norm": 8.25, "learning_rate": 1.92365998917163e-05, "loss": 0.1, "loss_lm": 0.01748299365863204, "loss_seg": 0.08249942027032375, "mean_token_accuracy": 0.9953333884477615, "num_tokens": 1836113172.0, "step": 4320 }, { "entropy": 0.019094412215054035, "epoch": 1.8911259437575227, "grad_norm": 6.46875, "learning_rate": 1.923389279913373e-05, "loss": 0.0756, "loss_lm": 0.019260897766798735, "loss_seg": 0.05636645574122667, "mean_token_accuracy": 0.995025023818016, "num_tokens": 1836538658.0, "step": 4321 }, { "entropy": 0.01769409142434597, "epoch": 1.8915636284057338, "grad_norm": 7.15625, "learning_rate": 1.9231185706551163e-05, "loss": 0.1368, "loss_lm": 0.016962010879069567, "loss_seg": 0.11981217563152313, "mean_token_accuracy": 0.9953208714723587, "num_tokens": 1836963470.0, "step": 4322 }, { "entropy": 0.018403853289783, "epoch": 1.8920013130539446, "grad_norm": 8.75, "learning_rate": 1.92284786139686e-05, "loss": 0.1034, "loss_lm": 0.016456873854622245, "loss_seg": 0.08696042560040951, "mean_token_accuracy": 0.9951010048389435, "num_tokens": 1837389241.0, "step": 4323 }, { "entropy": 0.018497431185096502, "epoch": 1.8924389977021556, "grad_norm": 5.65625, "learning_rate": 1.9225771521386034e-05, "loss": 0.0829, "loss_lm": 0.015044057741761208, "loss_seg": 0.06785089522600174, "mean_token_accuracy": 0.99521704018116, "num_tokens": 1837814256.0, "step": 4324 }, { "entropy": 0.019187476951628923, "epoch": 1.8928766823503667, "grad_norm": 4.90625, "learning_rate": 1.9223064428803464e-05, "loss": 0.1828, "loss_lm": 0.01747183850966394, "loss_seg": 0.16532099433243275, "mean_token_accuracy": 0.9949457943439484, "num_tokens": 1838239159.0, "step": 4325 }, { "entropy": 0.018291770946234465, "epoch": 1.8933143669985775, "grad_norm": 8.875, "learning_rate": 1.9220357336220898e-05, "loss": 0.1133, "loss_lm": 0.016496295342221856, "loss_seg": 0.09682728536427021, "mean_token_accuracy": 0.9951539188623428, "num_tokens": 1838664006.0, "step": 4326 }, { "entropy": 0.018417458049952984, "epoch": 1.8937520516467885, "grad_norm": 7.1875, "learning_rate": 1.921765024363833e-05, "loss": 0.0896, "loss_lm": 0.015564833069220185, "loss_seg": 0.07403443939983845, "mean_token_accuracy": 0.9952484369277954, "num_tokens": 1839088037.0, "step": 4327 }, { "entropy": 0.018219053279608488, "epoch": 1.8941897362949995, "grad_norm": 9.75, "learning_rate": 1.921494315105577e-05, "loss": 0.1603, "loss_lm": 0.01722424989566207, "loss_seg": 0.14311722293496132, "mean_token_accuracy": 0.9953254014253616, "num_tokens": 1839513059.0, "step": 4328 }, { "entropy": 0.018352205399423838, "epoch": 1.8946274209432103, "grad_norm": 7.84375, "learning_rate": 1.9212236058473202e-05, "loss": 0.1322, "loss_lm": 0.0151775146368891, "loss_seg": 0.11699444428086281, "mean_token_accuracy": 0.9952563047409058, "num_tokens": 1839937597.0, "step": 4329 }, { "entropy": 0.018607550766319036, "epoch": 1.8950651055914214, "grad_norm": 3.78125, "learning_rate": 1.9209528965890633e-05, "loss": 0.1417, "loss_lm": 0.017175838816910982, "loss_seg": 0.12455079238861799, "mean_token_accuracy": 0.9951046258211136, "num_tokens": 1840363084.0, "step": 4330 }, { "entropy": 0.018463711719959974, "epoch": 1.8955027902396324, "grad_norm": 6.84375, "learning_rate": 1.9206821873308066e-05, "loss": 0.1294, "loss_lm": 0.01814530440606177, "loss_seg": 0.11126379854977131, "mean_token_accuracy": 0.9951223134994507, "num_tokens": 1840788017.0, "step": 4331 }, { "entropy": 0.018186718225479126, "epoch": 1.8959404748878432, "grad_norm": 6.8125, "learning_rate": 1.92041147807255e-05, "loss": 0.1423, "loss_lm": 0.016415334539487958, "loss_seg": 0.1258634328842163, "mean_token_accuracy": 0.9952320456504822, "num_tokens": 1841212910.0, "step": 4332 }, { "entropy": 0.01881280541419983, "epoch": 1.8963781595360543, "grad_norm": 17.875, "learning_rate": 1.9201407688142937e-05, "loss": 0.0964, "loss_lm": 0.015658449148759246, "loss_seg": 0.08075587078928947, "mean_token_accuracy": 0.9950205236673355, "num_tokens": 1841638459.0, "step": 4333 }, { "entropy": 0.018571956548839808, "epoch": 1.8968158441842653, "grad_norm": 7.78125, "learning_rate": 1.919870059556037e-05, "loss": 0.0863, "loss_lm": 0.015350326662883162, "loss_seg": 0.07093418948352337, "mean_token_accuracy": 0.9951204657554626, "num_tokens": 1842063450.0, "step": 4334 }, { "entropy": 0.018699702341109514, "epoch": 1.897253528832476, "grad_norm": 6.6875, "learning_rate": 1.91959935029778e-05, "loss": 0.1208, "loss_lm": 0.0164038953371346, "loss_seg": 0.10435229726135731, "mean_token_accuracy": 0.9951543211936951, "num_tokens": 1842488320.0, "step": 4335 }, { "entropy": 0.01897115306928754, "epoch": 1.8976912134806871, "grad_norm": 9.3125, "learning_rate": 1.9193286410395235e-05, "loss": 0.1269, "loss_lm": 0.01948540541343391, "loss_seg": 0.10740197077393532, "mean_token_accuracy": 0.9950571358203888, "num_tokens": 1842913381.0, "step": 4336 }, { "entropy": 0.01799726951867342, "epoch": 1.8981288981288982, "grad_norm": 7.71875, "learning_rate": 1.919057931781267e-05, "loss": 0.1114, "loss_lm": 0.015776921762153506, "loss_seg": 0.09562187269330025, "mean_token_accuracy": 0.9953887313604355, "num_tokens": 1843338592.0, "step": 4337 }, { "entropy": 0.018177767749875784, "epoch": 1.898566582777109, "grad_norm": 5.4375, "learning_rate": 1.9187872225230106e-05, "loss": 0.1288, "loss_lm": 0.01773428195156157, "loss_seg": 0.11105074733495712, "mean_token_accuracy": 0.9952491372823715, "num_tokens": 1843763734.0, "step": 4338 }, { "entropy": 0.0183562315069139, "epoch": 1.89900426742532, "grad_norm": 12.0625, "learning_rate": 1.918516513264754e-05, "loss": 0.1039, "loss_lm": 0.014786199433729053, "loss_seg": 0.08912059478461742, "mean_token_accuracy": 0.9951840788125992, "num_tokens": 1844188973.0, "step": 4339 }, { "entropy": 0.019095473922789097, "epoch": 1.899441952073531, "grad_norm": 14.1875, "learning_rate": 1.918245804006497e-05, "loss": 0.1234, "loss_lm": 0.0173449891153723, "loss_seg": 0.10600690357387066, "mean_token_accuracy": 0.9951166063547134, "num_tokens": 1844613835.0, "step": 4340 }, { "entropy": 0.01819953741505742, "epoch": 1.8998796367217419, "grad_norm": 11.0, "learning_rate": 1.9179750947482404e-05, "loss": 0.1437, "loss_lm": 0.01519474177621305, "loss_seg": 0.12850516475737095, "mean_token_accuracy": 0.9951582849025726, "num_tokens": 1845038686.0, "step": 4341 }, { "entropy": 0.01812117826193571, "epoch": 1.900317321369953, "grad_norm": 4.40625, "learning_rate": 1.9177043854899837e-05, "loss": 0.0876, "loss_lm": 0.01557838381268084, "loss_seg": 0.07202709466218948, "mean_token_accuracy": 0.9952545464038849, "num_tokens": 1845463908.0, "step": 4342 }, { "entropy": 0.01915402989834547, "epoch": 1.900755006018164, "grad_norm": 7.46875, "learning_rate": 1.917433676231727e-05, "loss": 0.1173, "loss_lm": 0.01561883045360446, "loss_seg": 0.10168998129665852, "mean_token_accuracy": 0.995041236281395, "num_tokens": 1845889003.0, "step": 4343 }, { "entropy": 0.01808639196678996, "epoch": 1.9011926906663748, "grad_norm": 8.1875, "learning_rate": 1.9171629669734705e-05, "loss": 0.1642, "loss_lm": 0.015972975874319673, "loss_seg": 0.1481783352792263, "mean_token_accuracy": 0.9954020828008652, "num_tokens": 1846313240.0, "step": 4344 }, { "entropy": 0.018496788572520018, "epoch": 1.9016303753145858, "grad_norm": 13.125, "learning_rate": 1.916892257715214e-05, "loss": 0.0995, "loss_lm": 0.01507752644829452, "loss_seg": 0.08439513109624386, "mean_token_accuracy": 0.9952284544706345, "num_tokens": 1846738838.0, "step": 4345 }, { "entropy": 0.018508524168282747, "epoch": 1.9020680599627968, "grad_norm": 52.25, "learning_rate": 1.9166215484569572e-05, "loss": 0.0903, "loss_lm": 0.013953931629657745, "loss_seg": 0.07639259472489357, "mean_token_accuracy": 0.9952110648155212, "num_tokens": 1847163733.0, "step": 4346 }, { "entropy": 0.01833803066983819, "epoch": 1.9025057446110076, "grad_norm": 11.5625, "learning_rate": 1.9163508391987006e-05, "loss": 0.1349, "loss_lm": 0.016216251999139786, "loss_seg": 0.1187265906482935, "mean_token_accuracy": 0.9951228648424149, "num_tokens": 1847588953.0, "step": 4347 }, { "entropy": 0.018038739915937185, "epoch": 1.9029434292592189, "grad_norm": 8.125, "learning_rate": 1.916080129940444e-05, "loss": 0.1205, "loss_lm": 0.01654265681281686, "loss_seg": 0.10394867323338985, "mean_token_accuracy": 0.9951574802398682, "num_tokens": 1848014522.0, "step": 4348 }, { "entropy": 0.01865470875054598, "epoch": 1.9033811139074297, "grad_norm": 3.5625, "learning_rate": 1.9158094206821873e-05, "loss": 0.0983, "loss_lm": 0.016273105051368475, "loss_seg": 0.08207666501402855, "mean_token_accuracy": 0.995201900601387, "num_tokens": 1848439661.0, "step": 4349 }, { "entropy": 0.018141556531190872, "epoch": 1.9038187985556405, "grad_norm": 6.8125, "learning_rate": 1.9155387114239307e-05, "loss": 0.0984, "loss_lm": 0.015494454419240355, "loss_seg": 0.08289050124585629, "mean_token_accuracy": 0.995252400636673, "num_tokens": 1848865115.0, "step": 4350 }, { "entropy": 0.018867536913603544, "epoch": 1.9042564832038518, "grad_norm": 9.9375, "learning_rate": 1.915268002165674e-05, "loss": 0.0938, "loss_lm": 0.015997075475752354, "loss_seg": 0.07776380237191916, "mean_token_accuracy": 0.9951331466436386, "num_tokens": 1849290991.0, "step": 4351 }, { "entropy": 0.017728733364492655, "epoch": 1.9046941678520626, "grad_norm": 10.0625, "learning_rate": 1.9149972929074175e-05, "loss": 0.1048, "loss_lm": 0.014915684005245566, "loss_seg": 0.08990781474858522, "mean_token_accuracy": 0.9953081756830215, "num_tokens": 1849715703.0, "step": 4352 }, { "entropy": 0.018275363836437464, "epoch": 1.9051318525002734, "grad_norm": 6.4375, "learning_rate": 1.914726583649161e-05, "loss": 0.1289, "loss_lm": 0.01661508670076728, "loss_seg": 0.11229874938726425, "mean_token_accuracy": 0.9952581971883774, "num_tokens": 1850141222.0, "step": 4353 }, { "entropy": 0.018202869687229395, "epoch": 1.9055695371484846, "grad_norm": 7.875, "learning_rate": 1.9144558743909042e-05, "loss": 0.1213, "loss_lm": 0.015935593517497182, "loss_seg": 0.10533981863409281, "mean_token_accuracy": 0.9951874017715454, "num_tokens": 1850566329.0, "step": 4354 }, { "entropy": 0.018260567914694548, "epoch": 1.9060072217966955, "grad_norm": 14.9375, "learning_rate": 1.9141851651326476e-05, "loss": 0.1144, "loss_lm": 0.016881954157724977, "loss_seg": 0.09754030033946037, "mean_token_accuracy": 0.9951100647449493, "num_tokens": 1850991240.0, "step": 4355 }, { "entropy": 0.018239774741232395, "epoch": 1.9064449064449065, "grad_norm": 3.140625, "learning_rate": 1.913914455874391e-05, "loss": 0.1287, "loss_lm": 0.016718138242140412, "loss_seg": 0.11196721345186234, "mean_token_accuracy": 0.995353177189827, "num_tokens": 1851415764.0, "step": 4356 }, { "entropy": 0.017975879833102226, "epoch": 1.9068825910931175, "grad_norm": 3.953125, "learning_rate": 1.9136437466161343e-05, "loss": 0.1042, "loss_lm": 0.017089196480810642, "loss_seg": 0.08706101216375828, "mean_token_accuracy": 0.9952793568372726, "num_tokens": 1851840656.0, "step": 4357 }, { "entropy": 0.018340331502258778, "epoch": 1.9073202757413283, "grad_norm": 5.3125, "learning_rate": 1.9133730373578777e-05, "loss": 0.1165, "loss_lm": 0.015656216768547893, "loss_seg": 0.1008437592536211, "mean_token_accuracy": 0.9951287358999252, "num_tokens": 1852265983.0, "step": 4358 }, { "entropy": 0.017900463193655014, "epoch": 1.9077579603895394, "grad_norm": 67.5, "learning_rate": 1.913102328099621e-05, "loss": 0.0976, "loss_lm": 0.015120724216103554, "loss_seg": 0.08242978528141975, "mean_token_accuracy": 0.9954402595758438, "num_tokens": 1852691160.0, "step": 4359 }, { "entropy": 0.018725076224654913, "epoch": 1.9081956450377504, "grad_norm": 6.71875, "learning_rate": 1.9128316188413644e-05, "loss": 0.1261, "loss_lm": 0.017172252293676138, "loss_seg": 0.10890574008226395, "mean_token_accuracy": 0.9951106905937195, "num_tokens": 1853116499.0, "step": 4360 }, { "entropy": 0.018700684420764446, "epoch": 1.9086333296859612, "grad_norm": 6.15625, "learning_rate": 1.9125609095831078e-05, "loss": 0.1303, "loss_lm": 0.017666865373030305, "loss_seg": 0.11266392935067415, "mean_token_accuracy": 0.9951025247573853, "num_tokens": 1853541569.0, "step": 4361 }, { "entropy": 0.018123477697372437, "epoch": 1.9090710143341723, "grad_norm": 4.03125, "learning_rate": 1.9122902003248512e-05, "loss": 0.1464, "loss_lm": 0.01654747035354376, "loss_seg": 0.12986749410629272, "mean_token_accuracy": 0.9952123463153839, "num_tokens": 1853966576.0, "step": 4362 }, { "entropy": 0.018949178513139486, "epoch": 1.9095086989823833, "grad_norm": 7.125, "learning_rate": 1.9120194910665946e-05, "loss": 0.1333, "loss_lm": 0.016930308658629656, "loss_seg": 0.11634808406233788, "mean_token_accuracy": 0.9949831664562225, "num_tokens": 1854391719.0, "step": 4363 }, { "entropy": 0.018436456564813852, "epoch": 1.909946383630594, "grad_norm": 5.96875, "learning_rate": 1.911748781808338e-05, "loss": 0.1373, "loss_lm": 0.01704469113610685, "loss_seg": 0.1202720757573843, "mean_token_accuracy": 0.9952293038368225, "num_tokens": 1854816567.0, "step": 4364 }, { "entropy": 0.01798006147146225, "epoch": 1.9103840682788051, "grad_norm": 5.71875, "learning_rate": 1.9114780725500813e-05, "loss": 0.1026, "loss_lm": 0.013286238070577383, "loss_seg": 0.08928979281336069, "mean_token_accuracy": 0.9953626990318298, "num_tokens": 1855241994.0, "step": 4365 }, { "entropy": 0.018130566459149122, "epoch": 1.9108217529270162, "grad_norm": 7.34375, "learning_rate": 1.9112073632918247e-05, "loss": 0.1258, "loss_lm": 0.015412227250635624, "loss_seg": 0.11040991730988026, "mean_token_accuracy": 0.9952970892190933, "num_tokens": 1855666990.0, "step": 4366 }, { "entropy": 0.018479826860129833, "epoch": 1.911259437575227, "grad_norm": 8.625, "learning_rate": 1.910936654033568e-05, "loss": 0.1466, "loss_lm": 0.019517710898071527, "loss_seg": 0.12703721970319748, "mean_token_accuracy": 0.9951533079147339, "num_tokens": 1856091582.0, "step": 4367 }, { "entropy": 0.017903327476233244, "epoch": 1.911697122223438, "grad_norm": 16.375, "learning_rate": 1.910665944775311e-05, "loss": 0.1008, "loss_lm": 0.015430595492944121, "loss_seg": 0.08538623806089163, "mean_token_accuracy": 0.9953299909830093, "num_tokens": 1856516803.0, "step": 4368 }, { "entropy": 0.018453822005540133, "epoch": 1.912134806871649, "grad_norm": 15.125, "learning_rate": 1.9103952355170548e-05, "loss": 0.1303, "loss_lm": 0.014740657992661, "loss_seg": 0.11555499956011772, "mean_token_accuracy": 0.9952724874019623, "num_tokens": 1856942342.0, "step": 4369 }, { "entropy": 0.018235722556710243, "epoch": 1.9125724915198599, "grad_norm": 4.5, "learning_rate": 1.9101245262587982e-05, "loss": 0.1238, "loss_lm": 0.01782772410660982, "loss_seg": 0.10592767968773842, "mean_token_accuracy": 0.9951243847608566, "num_tokens": 1857367466.0, "step": 4370 }, { "entropy": 0.018527519889175892, "epoch": 1.913010176168071, "grad_norm": 10.375, "learning_rate": 1.9098538170005415e-05, "loss": 0.0988, "loss_lm": 0.017227475065737963, "loss_seg": 0.08161824196577072, "mean_token_accuracy": 0.9952415972948074, "num_tokens": 1857791913.0, "step": 4371 }, { "entropy": 0.01850191317498684, "epoch": 1.913447860816282, "grad_norm": 9.375, "learning_rate": 1.909583107742285e-05, "loss": 0.1261, "loss_lm": 0.018126660492271185, "loss_seg": 0.10797540843486786, "mean_token_accuracy": 0.9953098893165588, "num_tokens": 1858215954.0, "step": 4372 }, { "entropy": 0.018406113144010305, "epoch": 1.9138855454644927, "grad_norm": 3.65625, "learning_rate": 1.909312398484028e-05, "loss": 0.0915, "loss_lm": 0.01577681116759777, "loss_seg": 0.07572406902909279, "mean_token_accuracy": 0.9952373951673508, "num_tokens": 1858640897.0, "step": 4373 }, { "entropy": 0.017918535508215427, "epoch": 1.9143232301127038, "grad_norm": 14.75, "learning_rate": 1.9090416892257713e-05, "loss": 0.1251, "loss_lm": 0.01413750508800149, "loss_seg": 0.11096661537885666, "mean_token_accuracy": 0.9953157752752304, "num_tokens": 1859066726.0, "step": 4374 }, { "entropy": 0.01850751508027315, "epoch": 1.9147609147609148, "grad_norm": 7.21875, "learning_rate": 1.908770979967515e-05, "loss": 0.1394, "loss_lm": 0.015621038852259517, "loss_seg": 0.12379061803221703, "mean_token_accuracy": 0.9952073991298676, "num_tokens": 1859491718.0, "step": 4375 }, { "entropy": 0.018429023679345846, "epoch": 1.9151985994091256, "grad_norm": 15.25, "learning_rate": 1.9085002707092584e-05, "loss": 0.1422, "loss_lm": 0.01626087981276214, "loss_seg": 0.12595226243138313, "mean_token_accuracy": 0.9952329993247986, "num_tokens": 1859916687.0, "step": 4376 }, { "entropy": 0.01893554674461484, "epoch": 1.9156362840573367, "grad_norm": 7.5, "learning_rate": 1.9082295614510018e-05, "loss": 0.1038, "loss_lm": 0.01618662429973483, "loss_seg": 0.08765201270580292, "mean_token_accuracy": 0.995033323764801, "num_tokens": 1860342093.0, "step": 4377 }, { "entropy": 0.017527256160974503, "epoch": 1.9160739687055477, "grad_norm": 8.1875, "learning_rate": 1.9079588521927448e-05, "loss": 0.1314, "loss_lm": 0.016355908010154963, "loss_seg": 0.11508016102015972, "mean_token_accuracy": 0.9954591393470764, "num_tokens": 1860766853.0, "step": 4378 }, { "entropy": 0.01810781890526414, "epoch": 1.9165116533537585, "grad_norm": 5.25, "learning_rate": 1.9076881429344882e-05, "loss": 0.1269, "loss_lm": 0.01770539814606309, "loss_seg": 0.10920041427016258, "mean_token_accuracy": 0.9953269064426422, "num_tokens": 1861192205.0, "step": 4379 }, { "entropy": 0.018216437194496393, "epoch": 1.9169493380019695, "grad_norm": 5.8125, "learning_rate": 1.907417433676232e-05, "loss": 0.0976, "loss_lm": 0.015226361341774464, "loss_seg": 0.08239170163869858, "mean_token_accuracy": 0.995221421122551, "num_tokens": 1861617821.0, "step": 4380 }, { "entropy": 0.018195036333054304, "epoch": 1.9173870226501806, "grad_norm": 8.3125, "learning_rate": 1.9071467244179753e-05, "loss": 0.1405, "loss_lm": 0.01654026424512267, "loss_seg": 0.12399281840771437, "mean_token_accuracy": 0.9952957928180695, "num_tokens": 1862042510.0, "step": 4381 }, { "entropy": 0.017544752918183804, "epoch": 1.9178247072983914, "grad_norm": 5.46875, "learning_rate": 1.9068760151597186e-05, "loss": 0.1281, "loss_lm": 0.01642787829041481, "loss_seg": 0.11162552144378424, "mean_token_accuracy": 0.9954463243484497, "num_tokens": 1862467377.0, "step": 4382 }, { "entropy": 0.017636193428188562, "epoch": 1.9182623919466024, "grad_norm": 4.75, "learning_rate": 1.9066053059014617e-05, "loss": 0.1368, "loss_lm": 0.016411692136898637, "loss_seg": 0.12035891972482204, "mean_token_accuracy": 0.9954289942979813, "num_tokens": 1862891084.0, "step": 4383 }, { "entropy": 0.017699971329420805, "epoch": 1.9187000765948135, "grad_norm": 11.6875, "learning_rate": 1.906334596643205e-05, "loss": 0.1081, "loss_lm": 0.014455747092142701, "loss_seg": 0.0936483796685934, "mean_token_accuracy": 0.9952930957078934, "num_tokens": 1863316147.0, "step": 4384 }, { "entropy": 0.01770472340285778, "epoch": 1.9191377612430243, "grad_norm": 53.5, "learning_rate": 1.9060638873849488e-05, "loss": 0.2062, "loss_lm": 0.01802566135302186, "loss_seg": 0.1881540846079588, "mean_token_accuracy": 0.9954233765602112, "num_tokens": 1863741524.0, "step": 4385 }, { "entropy": 0.01819078763946891, "epoch": 1.9195754458912355, "grad_norm": 4.75, "learning_rate": 1.905793178126692e-05, "loss": 0.1089, "loss_lm": 0.015902897343039513, "loss_seg": 0.09302848763763905, "mean_token_accuracy": 0.9951494932174683, "num_tokens": 1864165766.0, "step": 4386 }, { "entropy": 0.01857900945469737, "epoch": 1.9200131305394463, "grad_norm": 7.0, "learning_rate": 1.9055224688684355e-05, "loss": 0.103, "loss_lm": 0.017507743556052446, "loss_seg": 0.08545682393014431, "mean_token_accuracy": 0.9952275604009628, "num_tokens": 1864591696.0, "step": 4387 }, { "entropy": 0.018354475498199463, "epoch": 1.9204508151876571, "grad_norm": 4.28125, "learning_rate": 1.9052517596101785e-05, "loss": 0.1574, "loss_lm": 0.013872042996808887, "loss_seg": 0.14353946968913078, "mean_token_accuracy": 0.9952199459075928, "num_tokens": 1865016757.0, "step": 4388 }, { "entropy": 0.01816996792331338, "epoch": 1.9208884998358684, "grad_norm": 4.59375, "learning_rate": 1.904981050351922e-05, "loss": 0.0797, "loss_lm": 0.015029059490188956, "loss_seg": 0.06464903801679611, "mean_token_accuracy": 0.9951981008052826, "num_tokens": 1865441535.0, "step": 4389 }, { "entropy": 0.01827365579083562, "epoch": 1.9213261844840792, "grad_norm": 5.5625, "learning_rate": 1.9047103410936656e-05, "loss": 0.1056, "loss_lm": 0.018041668459773064, "loss_seg": 0.08758704736828804, "mean_token_accuracy": 0.9951696395874023, "num_tokens": 1865866288.0, "step": 4390 }, { "entropy": 0.01847755117341876, "epoch": 1.92176386913229, "grad_norm": 6.09375, "learning_rate": 1.904439631835409e-05, "loss": 0.1885, "loss_lm": 0.018438187427818775, "loss_seg": 0.17008665204048157, "mean_token_accuracy": 0.9953050911426544, "num_tokens": 1866292036.0, "step": 4391 }, { "entropy": 0.018087957985699177, "epoch": 1.9222015537805013, "grad_norm": 15.375, "learning_rate": 1.904168922577152e-05, "loss": 0.0947, "loss_lm": 0.01738986885175109, "loss_seg": 0.07733886316418648, "mean_token_accuracy": 0.995271697640419, "num_tokens": 1866717509.0, "step": 4392 }, { "entropy": 0.018578737508505583, "epoch": 1.922639238428712, "grad_norm": 9.1875, "learning_rate": 1.9038982133188954e-05, "loss": 0.1186, "loss_lm": 0.017183240968734026, "loss_seg": 0.10146115813404322, "mean_token_accuracy": 0.9951741546392441, "num_tokens": 1867142468.0, "step": 4393 }, { "entropy": 0.018092074897140265, "epoch": 1.9230769230769231, "grad_norm": 8.4375, "learning_rate": 1.9036275040606388e-05, "loss": 0.1027, "loss_lm": 0.015732299769297242, "loss_seg": 0.08693332131952047, "mean_token_accuracy": 0.9952667206525803, "num_tokens": 1867566849.0, "step": 4394 }, { "entropy": 0.017869108356535435, "epoch": 1.9235146077251342, "grad_norm": 6.625, "learning_rate": 1.9033567948023825e-05, "loss": 0.1282, "loss_lm": 0.01550878887064755, "loss_seg": 0.11270796693861485, "mean_token_accuracy": 0.9952956885099411, "num_tokens": 1867991942.0, "step": 4395 }, { "entropy": 0.017991636879742146, "epoch": 1.923952292373345, "grad_norm": 12.0625, "learning_rate": 1.903086085544126e-05, "loss": 0.1201, "loss_lm": 0.017315439879894257, "loss_seg": 0.10273583047091961, "mean_token_accuracy": 0.9952471554279327, "num_tokens": 1868417093.0, "step": 4396 }, { "entropy": 0.01875194488093257, "epoch": 1.924389977021556, "grad_norm": 5.3125, "learning_rate": 1.902815376285869e-05, "loss": 0.0913, "loss_lm": 0.016794403782114387, "loss_seg": 0.07452424243092537, "mean_token_accuracy": 0.9951035529375076, "num_tokens": 1868843075.0, "step": 4397 }, { "entropy": 0.01847131084650755, "epoch": 1.924827661669767, "grad_norm": 13.9375, "learning_rate": 1.9025446670276123e-05, "loss": 0.1305, "loss_lm": 0.017792819533497095, "loss_seg": 0.11268275789916515, "mean_token_accuracy": 0.9951757192611694, "num_tokens": 1869268596.0, "step": 4398 }, { "entropy": 0.018127136398106813, "epoch": 1.9252653463179779, "grad_norm": 3.5625, "learning_rate": 1.9022739577693556e-05, "loss": 0.089, "loss_lm": 0.014845594298094511, "loss_seg": 0.07416301872581244, "mean_token_accuracy": 0.995257630944252, "num_tokens": 1869693024.0, "step": 4399 }, { "entropy": 0.01841310178861022, "epoch": 1.9257030309661889, "grad_norm": 6.9375, "learning_rate": 1.9020032485110994e-05, "loss": 0.1053, "loss_lm": 0.014544246019795537, "loss_seg": 0.09071914851665497, "mean_token_accuracy": 0.9951859563589096, "num_tokens": 1870118022.0, "step": 4400 }, { "entropy": 0.018746501300483942, "epoch": 1.9261407156144, "grad_norm": 10.75, "learning_rate": 1.9017325392528427e-05, "loss": 0.1268, "loss_lm": 0.01534218224696815, "loss_seg": 0.11145623959600925, "mean_token_accuracy": 0.9952785670757294, "num_tokens": 1870543507.0, "step": 4401 }, { "entropy": 0.018521306570619345, "epoch": 1.9265784002626107, "grad_norm": 12.5625, "learning_rate": 1.9014618299945858e-05, "loss": 0.1181, "loss_lm": 0.01616915874183178, "loss_seg": 0.1019078167155385, "mean_token_accuracy": 0.9952189028263092, "num_tokens": 1870968255.0, "step": 4402 }, { "entropy": 0.018700631335377693, "epoch": 1.9270160849108218, "grad_norm": 4.625, "learning_rate": 1.901191120736329e-05, "loss": 0.1211, "loss_lm": 0.01460612309165299, "loss_seg": 0.10649938881397247, "mean_token_accuracy": 0.9951040744781494, "num_tokens": 1871393891.0, "step": 4403 }, { "entropy": 0.01851349091157317, "epoch": 1.9274537695590328, "grad_norm": 24.125, "learning_rate": 1.9009204114780725e-05, "loss": 0.1149, "loss_lm": 0.014749569818377495, "loss_seg": 0.10019441600888968, "mean_token_accuracy": 0.9952617883682251, "num_tokens": 1871819133.0, "step": 4404 }, { "entropy": 0.018194183241575956, "epoch": 1.9278914542072436, "grad_norm": 14.75, "learning_rate": 1.900649702219816e-05, "loss": 0.0908, "loss_lm": 0.013909440254792571, "loss_seg": 0.0768855344504118, "mean_token_accuracy": 0.9952644407749176, "num_tokens": 1872244592.0, "step": 4405 }, { "entropy": 0.018531682901084423, "epoch": 1.9283291388554546, "grad_norm": 3.78125, "learning_rate": 1.9003789929615596e-05, "loss": 0.1127, "loss_lm": 0.017527985386550426, "loss_seg": 0.09519255347549915, "mean_token_accuracy": 0.9952041804790497, "num_tokens": 1872669731.0, "step": 4406 }, { "entropy": 0.018983587622642517, "epoch": 1.9287668235036657, "grad_norm": 11.1875, "learning_rate": 1.9001082837033026e-05, "loss": 0.1173, "loss_lm": 0.01695934240706265, "loss_seg": 0.10030939150601625, "mean_token_accuracy": 0.9950811415910721, "num_tokens": 1873095207.0, "step": 4407 }, { "entropy": 0.018378829583525658, "epoch": 1.9292045081518765, "grad_norm": 6.375, "learning_rate": 1.899837574445046e-05, "loss": 0.1606, "loss_lm": 0.017762689385563135, "loss_seg": 0.14280876331031322, "mean_token_accuracy": 0.9952497780323029, "num_tokens": 1873519969.0, "step": 4408 }, { "entropy": 0.017917740158736706, "epoch": 1.9296421928000875, "grad_norm": 11.75, "learning_rate": 1.8995668651867894e-05, "loss": 0.1345, "loss_lm": 0.01592077035456896, "loss_seg": 0.11856415122747421, "mean_token_accuracy": 0.9953334182500839, "num_tokens": 1873945064.0, "step": 4409 }, { "entropy": 0.018779640551656485, "epoch": 1.9300798774482986, "grad_norm": 5.4375, "learning_rate": 1.8992961559285327e-05, "loss": 0.1667, "loss_lm": 0.02026886399835348, "loss_seg": 0.14640179462730885, "mean_token_accuracy": 0.9950821101665497, "num_tokens": 1874369387.0, "step": 4410 }, { "entropy": 0.017951116897165775, "epoch": 1.9305175620965094, "grad_norm": 4.625, "learning_rate": 1.8990254466702764e-05, "loss": 0.1263, "loss_lm": 0.015213197795674205, "loss_seg": 0.11110603529959917, "mean_token_accuracy": 0.9953749775886536, "num_tokens": 1874794797.0, "step": 4411 }, { "entropy": 0.018223837483674288, "epoch": 1.9309552467447204, "grad_norm": 29.0, "learning_rate": 1.8987547374120195e-05, "loss": 0.098, "loss_lm": 0.01468195952475071, "loss_seg": 0.083320876583457, "mean_token_accuracy": 0.9952238500118256, "num_tokens": 1875220010.0, "step": 4412 }, { "entropy": 0.017984601203352213, "epoch": 1.9313929313929314, "grad_norm": 3.671875, "learning_rate": 1.898484028153763e-05, "loss": 0.117, "loss_lm": 0.016748437425121665, "loss_seg": 0.1002114750444889, "mean_token_accuracy": 0.9952993094921112, "num_tokens": 1875645436.0, "step": 4413 }, { "entropy": 0.01799285737797618, "epoch": 1.9318306160411423, "grad_norm": 10.1875, "learning_rate": 1.8982133188955062e-05, "loss": 0.0745, "loss_lm": 0.015417358838021755, "loss_seg": 0.059121315367519855, "mean_token_accuracy": 0.9953835755586624, "num_tokens": 1876070390.0, "step": 4414 }, { "entropy": 0.018771579954773188, "epoch": 1.9322683006893533, "grad_norm": 11.0, "learning_rate": 1.8979426096372496e-05, "loss": 0.117, "loss_lm": 0.018023463897407055, "loss_seg": 0.09893490932881832, "mean_token_accuracy": 0.995032325387001, "num_tokens": 1876495123.0, "step": 4415 }, { "entropy": 0.019025965128093958, "epoch": 1.9327059853375643, "grad_norm": 26.75, "learning_rate": 1.897671900378993e-05, "loss": 0.1312, "loss_lm": 0.016809097956866026, "loss_seg": 0.11439965106546879, "mean_token_accuracy": 0.9950497001409531, "num_tokens": 1876920119.0, "step": 4416 }, { "entropy": 0.018846516963094473, "epoch": 1.9331436699857751, "grad_norm": 11.1875, "learning_rate": 1.8974011911207363e-05, "loss": 0.131, "loss_lm": 0.015708222519606352, "loss_seg": 0.11533224955201149, "mean_token_accuracy": 0.9950553625822067, "num_tokens": 1877345996.0, "step": 4417 }, { "entropy": 0.018022899981588125, "epoch": 1.9335813546339862, "grad_norm": 13.0625, "learning_rate": 1.8971304818624797e-05, "loss": 0.1303, "loss_lm": 0.0150183099322021, "loss_seg": 0.11528028547763824, "mean_token_accuracy": 0.9954093396663666, "num_tokens": 1877771583.0, "step": 4418 }, { "entropy": 0.018574471585452557, "epoch": 1.9340190392821972, "grad_norm": 3.609375, "learning_rate": 1.896859772604223e-05, "loss": 0.0913, "loss_lm": 0.01710878126323223, "loss_seg": 0.0741892121732235, "mean_token_accuracy": 0.995193287730217, "num_tokens": 1878196773.0, "step": 4419 }, { "entropy": 0.017991715110838413, "epoch": 1.934456723930408, "grad_norm": 4.875, "learning_rate": 1.8965890633459665e-05, "loss": 0.1011, "loss_lm": 0.015553205041214824, "loss_seg": 0.08555927872657776, "mean_token_accuracy": 0.9953755587339401, "num_tokens": 1878620973.0, "step": 4420 }, { "entropy": 0.018009580671787262, "epoch": 1.934894408578619, "grad_norm": 8.3125, "learning_rate": 1.89631835408771e-05, "loss": 0.1269, "loss_lm": 0.01500020269304514, "loss_seg": 0.11188200116157532, "mean_token_accuracy": 0.9952878952026367, "num_tokens": 1879046658.0, "step": 4421 }, { "entropy": 0.018339476082473993, "epoch": 1.93533209322683, "grad_norm": 14.125, "learning_rate": 1.8960476448294532e-05, "loss": 0.1154, "loss_lm": 0.01741082640364766, "loss_seg": 0.09800638817250729, "mean_token_accuracy": 0.9953172355890274, "num_tokens": 1879472449.0, "step": 4422 }, { "entropy": 0.018247399479150772, "epoch": 1.935769777875041, "grad_norm": 12.625, "learning_rate": 1.8957769355711966e-05, "loss": 0.1228, "loss_lm": 0.016993976663798094, "loss_seg": 0.10582237876951694, "mean_token_accuracy": 0.9951752573251724, "num_tokens": 1879896690.0, "step": 4423 }, { "entropy": 0.018086517695337534, "epoch": 1.9362074625232522, "grad_norm": 6.8125, "learning_rate": 1.89550622631294e-05, "loss": 0.0812, "loss_lm": 0.01566825690679252, "loss_seg": 0.06558120250701904, "mean_token_accuracy": 0.9951362013816833, "num_tokens": 1880322058.0, "step": 4424 }, { "entropy": 0.018663309514522552, "epoch": 1.936645147171463, "grad_norm": 19.875, "learning_rate": 1.8952355170546833e-05, "loss": 0.1683, "loss_lm": 0.017526975134387612, "loss_seg": 0.15078731067478657, "mean_token_accuracy": 0.9951620101928711, "num_tokens": 1880746164.0, "step": 4425 }, { "entropy": 0.01808471418917179, "epoch": 1.9370828318196738, "grad_norm": 13.875, "learning_rate": 1.8949648077964267e-05, "loss": 0.096, "loss_lm": 0.014639710308983922, "loss_seg": 0.08132408279925585, "mean_token_accuracy": 0.9952937513589859, "num_tokens": 1881170869.0, "step": 4426 }, { "entropy": 0.018479805905371904, "epoch": 1.937520516467885, "grad_norm": 18.875, "learning_rate": 1.89469409853817e-05, "loss": 0.1318, "loss_lm": 0.01658053509891033, "loss_seg": 0.11521376855671406, "mean_token_accuracy": 0.9952970445156097, "num_tokens": 1881595664.0, "step": 4427 }, { "entropy": 0.019150894600898027, "epoch": 1.9379582011160958, "grad_norm": 4.875, "learning_rate": 1.8944233892799134e-05, "loss": 0.1023, "loss_lm": 0.020676772808656096, "loss_seg": 0.08161343168467283, "mean_token_accuracy": 0.9950205534696579, "num_tokens": 1882021560.0, "step": 4428 }, { "entropy": 0.01798754697665572, "epoch": 1.9383958857643067, "grad_norm": 5.625, "learning_rate": 1.8941526800216568e-05, "loss": 0.1045, "loss_lm": 0.015129434643313289, "loss_seg": 0.08939920552074909, "mean_token_accuracy": 0.9952595978975296, "num_tokens": 1882446524.0, "step": 4429 }, { "entropy": 0.01873834105208516, "epoch": 1.938833570412518, "grad_norm": 9.0, "learning_rate": 1.8938819707634002e-05, "loss": 0.0916, "loss_lm": 0.015324530424550176, "loss_seg": 0.07625203393399715, "mean_token_accuracy": 0.9951516687870026, "num_tokens": 1882871564.0, "step": 4430 }, { "entropy": 0.018751492723822594, "epoch": 1.9392712550607287, "grad_norm": 11.9375, "learning_rate": 1.8936112615051436e-05, "loss": 0.0916, "loss_lm": 0.01603944366797805, "loss_seg": 0.07553100027143955, "mean_token_accuracy": 0.9950403869152069, "num_tokens": 1883296740.0, "step": 4431 }, { "entropy": 0.01801620703190565, "epoch": 1.9397089397089398, "grad_norm": 3.21875, "learning_rate": 1.893340552246887e-05, "loss": 0.098, "loss_lm": 0.016168395522981882, "loss_seg": 0.08182348310947418, "mean_token_accuracy": 0.9953980594873428, "num_tokens": 1883721704.0, "step": 4432 }, { "entropy": 0.017950888723134995, "epoch": 1.9401466243571508, "grad_norm": 5.09375, "learning_rate": 1.8930698429886303e-05, "loss": 0.1255, "loss_lm": 0.015756482258439064, "loss_seg": 0.10972794704139233, "mean_token_accuracy": 0.99527807533741, "num_tokens": 1884147134.0, "step": 4433 }, { "entropy": 0.01765971491113305, "epoch": 1.9405843090053616, "grad_norm": 4.21875, "learning_rate": 1.8927991337303737e-05, "loss": 0.0955, "loss_lm": 0.01684255665168166, "loss_seg": 0.07861875556409359, "mean_token_accuracy": 0.995351105928421, "num_tokens": 1884572141.0, "step": 4434 }, { "entropy": 0.01853368291631341, "epoch": 1.9410219936535726, "grad_norm": 140.0, "learning_rate": 1.892528424472117e-05, "loss": 0.1394, "loss_lm": 0.01562185725197196, "loss_seg": 0.12374612130224705, "mean_token_accuracy": 0.995111957192421, "num_tokens": 1884997651.0, "step": 4435 }, { "entropy": 0.017826522700488567, "epoch": 1.9414596783017837, "grad_norm": 4.8125, "learning_rate": 1.8922577152138604e-05, "loss": 0.1115, "loss_lm": 0.0160792029928416, "loss_seg": 0.09544184990227222, "mean_token_accuracy": 0.9954616725444794, "num_tokens": 1885423084.0, "step": 4436 }, { "entropy": 0.01849903678521514, "epoch": 1.9418973629499945, "grad_norm": 13.375, "learning_rate": 1.8919870059556038e-05, "loss": 0.1014, "loss_lm": 0.01650103065185249, "loss_seg": 0.08491638861596584, "mean_token_accuracy": 0.9951460659503937, "num_tokens": 1885847649.0, "step": 4437 }, { "entropy": 0.017926338128745556, "epoch": 1.9423350475982055, "grad_norm": 9.6875, "learning_rate": 1.8917162966973472e-05, "loss": 0.1222, "loss_lm": 0.015175677603110671, "loss_seg": 0.10699732974171638, "mean_token_accuracy": 0.995332270860672, "num_tokens": 1886272438.0, "step": 4438 }, { "entropy": 0.018436424899846315, "epoch": 1.9427727322464166, "grad_norm": 5.28125, "learning_rate": 1.8914455874390905e-05, "loss": 0.0953, "loss_lm": 0.02058862429112196, "loss_seg": 0.07467086799442768, "mean_token_accuracy": 0.9952170997858047, "num_tokens": 1886697873.0, "step": 4439 }, { "entropy": 0.018416392151266336, "epoch": 1.9432104168946274, "grad_norm": 23.25, "learning_rate": 1.8911748781808336e-05, "loss": 0.153, "loss_lm": 0.01527478126809001, "loss_seg": 0.13776236027479172, "mean_token_accuracy": 0.995228111743927, "num_tokens": 1887123460.0, "step": 4440 }, { "entropy": 0.01894441246986389, "epoch": 1.9436481015428384, "grad_norm": 8.4375, "learning_rate": 1.890904168922577e-05, "loss": 0.0818, "loss_lm": 0.01695617870427668, "loss_seg": 0.06484020873904228, "mean_token_accuracy": 0.9949998259544373, "num_tokens": 1887548423.0, "step": 4441 }, { "entropy": 0.01817277865484357, "epoch": 1.9440857861910494, "grad_norm": 12.125, "learning_rate": 1.8906334596643207e-05, "loss": 0.1988, "loss_lm": 0.014608501689508557, "loss_seg": 0.18419203907251358, "mean_token_accuracy": 0.9952377825975418, "num_tokens": 1887972986.0, "step": 4442 }, { "entropy": 0.018015361856669188, "epoch": 1.9445234708392602, "grad_norm": 3.875, "learning_rate": 1.890362750406064e-05, "loss": 0.0745, "loss_lm": 0.015552021097391844, "loss_seg": 0.05896332021802664, "mean_token_accuracy": 0.9952430874109268, "num_tokens": 1888397809.0, "step": 4443 }, { "entropy": 0.018526647705584764, "epoch": 1.9449611554874713, "grad_norm": 14.4375, "learning_rate": 1.8900920411478074e-05, "loss": 0.1284, "loss_lm": 0.017378526041284204, "loss_seg": 0.1109723374247551, "mean_token_accuracy": 0.9950394183397293, "num_tokens": 1888822729.0, "step": 4444 }, { "entropy": 0.018044729251414537, "epoch": 1.9453988401356823, "grad_norm": 7.0, "learning_rate": 1.8898213318895504e-05, "loss": 0.097, "loss_lm": 0.018298225942999125, "loss_seg": 0.0787122193723917, "mean_token_accuracy": 0.9953693598508835, "num_tokens": 1889247219.0, "step": 4445 }, { "entropy": 0.01834192592650652, "epoch": 1.9458365247838931, "grad_norm": 13.9375, "learning_rate": 1.8895506226312938e-05, "loss": 0.1264, "loss_lm": 0.01633892534300685, "loss_seg": 0.11001583561301231, "mean_token_accuracy": 0.9951933771371841, "num_tokens": 1889672969.0, "step": 4446 }, { "entropy": 0.01882742764428258, "epoch": 1.9462742094321042, "grad_norm": 12.125, "learning_rate": 1.8892799133730375e-05, "loss": 0.1232, "loss_lm": 0.01569726737216115, "loss_seg": 0.10745732113718987, "mean_token_accuracy": 0.9950592070817947, "num_tokens": 1890097979.0, "step": 4447 }, { "entropy": 0.0185352498665452, "epoch": 1.9467118940803152, "grad_norm": 9.5, "learning_rate": 1.889009204114781e-05, "loss": 0.0911, "loss_lm": 0.0168147305957973, "loss_seg": 0.07431086152791977, "mean_token_accuracy": 0.9952038675546646, "num_tokens": 1890523092.0, "step": 4448 }, { "entropy": 0.018665575422346592, "epoch": 1.947149578728526, "grad_norm": 12.4375, "learning_rate": 1.8887384948565243e-05, "loss": 0.1399, "loss_lm": 0.01592884468846023, "loss_seg": 0.12399897351861, "mean_token_accuracy": 0.9950577765703201, "num_tokens": 1890947763.0, "step": 4449 }, { "entropy": 0.01800220599398017, "epoch": 1.947587263376737, "grad_norm": 3.828125, "learning_rate": 1.8884677855982673e-05, "loss": 0.0915, "loss_lm": 0.017350737005472183, "loss_seg": 0.07419511023908854, "mean_token_accuracy": 0.9953468292951584, "num_tokens": 1891372352.0, "step": 4450 }, { "entropy": 0.018082293216139078, "epoch": 1.948024948024948, "grad_norm": 11.5, "learning_rate": 1.8881970763400107e-05, "loss": 0.1303, "loss_lm": 0.014168210793286562, "loss_seg": 0.11611638404428959, "mean_token_accuracy": 0.9953572154045105, "num_tokens": 1891797280.0, "step": 4451 }, { "entropy": 0.017932516057044268, "epoch": 1.948462632673159, "grad_norm": 11.8125, "learning_rate": 1.8879263670817544e-05, "loss": 0.102, "loss_lm": 0.015918395249173045, "loss_seg": 0.08604438975453377, "mean_token_accuracy": 0.9952540099620819, "num_tokens": 1892223170.0, "step": 4452 }, { "entropy": 0.018224173691123724, "epoch": 1.94890031732137, "grad_norm": 22.875, "learning_rate": 1.8876556578234978e-05, "loss": 0.0897, "loss_lm": 0.016213333467021585, "loss_seg": 0.07345709577202797, "mean_token_accuracy": 0.995228961110115, "num_tokens": 1892648609.0, "step": 4453 }, { "entropy": 0.018562248442322016, "epoch": 1.949338001969581, "grad_norm": 8.3125, "learning_rate": 1.887384948565241e-05, "loss": 0.12, "loss_lm": 0.017553633311763406, "loss_seg": 0.10248884931206703, "mean_token_accuracy": 0.9951972216367722, "num_tokens": 1893073329.0, "step": 4454 }, { "entropy": 0.018590142019093037, "epoch": 1.9497756866177918, "grad_norm": 5.65625, "learning_rate": 1.887114239306984e-05, "loss": 0.1124, "loss_lm": 0.015843026572838426, "loss_seg": 0.09652054868638515, "mean_token_accuracy": 0.9951820820569992, "num_tokens": 1893497967.0, "step": 4455 }, { "entropy": 0.018349892925471067, "epoch": 1.9502133712660028, "grad_norm": 15.875, "learning_rate": 1.8868435300487275e-05, "loss": 0.1282, "loss_lm": 0.016657056752592325, "loss_seg": 0.1115447711199522, "mean_token_accuracy": 0.9951803237199783, "num_tokens": 1893923027.0, "step": 4456 }, { "entropy": 0.018867372069507837, "epoch": 1.9506510559142138, "grad_norm": 8.8125, "learning_rate": 1.8865728207904713e-05, "loss": 0.1198, "loss_lm": 0.015415869886055589, "loss_seg": 0.10440022870898247, "mean_token_accuracy": 0.9951000064611435, "num_tokens": 1894349116.0, "step": 4457 }, { "entropy": 0.017759130336344242, "epoch": 1.9510887405624247, "grad_norm": 8.5, "learning_rate": 1.8863021115322146e-05, "loss": 0.1259, "loss_lm": 0.014254818437620997, "loss_seg": 0.11161441076546907, "mean_token_accuracy": 0.9953406006097794, "num_tokens": 1894774241.0, "step": 4458 }, { "entropy": 0.018131295684725046, "epoch": 1.9515264252106357, "grad_norm": 17.75, "learning_rate": 1.886031402273958e-05, "loss": 0.1262, "loss_lm": 0.015563663328066468, "loss_seg": 0.11068127304315567, "mean_token_accuracy": 0.9950890392065048, "num_tokens": 1895199189.0, "step": 4459 }, { "entropy": 0.017775353975594044, "epoch": 1.9519641098588467, "grad_norm": 6.375, "learning_rate": 1.885760693015701e-05, "loss": 0.0906, "loss_lm": 0.016121518332511187, "loss_seg": 0.07443799637258053, "mean_token_accuracy": 0.9952571094036102, "num_tokens": 1895623783.0, "step": 4460 }, { "entropy": 0.01814361661672592, "epoch": 1.9524017945070575, "grad_norm": 9.25, "learning_rate": 1.8854899837574444e-05, "loss": 0.1387, "loss_lm": 0.01525170705281198, "loss_seg": 0.12347697280347347, "mean_token_accuracy": 0.9953080117702484, "num_tokens": 1896048285.0, "step": 4461 }, { "entropy": 0.018912530969828367, "epoch": 1.9528394791552688, "grad_norm": 5.46875, "learning_rate": 1.885219274499188e-05, "loss": 0.103, "loss_lm": 0.01662382483482361, "loss_seg": 0.08636349812150002, "mean_token_accuracy": 0.9950373619794846, "num_tokens": 1896473245.0, "step": 4462 }, { "entropy": 0.018829925451427698, "epoch": 1.9532771638034796, "grad_norm": 7.15625, "learning_rate": 1.8849485652409315e-05, "loss": 0.126, "loss_lm": 0.018939111847430468, "loss_seg": 0.10701654758304358, "mean_token_accuracy": 0.9950770735740662, "num_tokens": 1896897971.0, "step": 4463 }, { "entropy": 0.01759929582476616, "epoch": 1.9537148484516904, "grad_norm": 4.125, "learning_rate": 1.8846778559826745e-05, "loss": 0.1224, "loss_lm": 0.017949998378753662, "loss_seg": 0.10443280264735222, "mean_token_accuracy": 0.9953920543193817, "num_tokens": 1897323027.0, "step": 4464 }, { "entropy": 0.018076526932418346, "epoch": 1.9541525330999017, "grad_norm": 9.5, "learning_rate": 1.884407146724418e-05, "loss": 0.1023, "loss_lm": 0.014879463007673621, "loss_seg": 0.08740029856562614, "mean_token_accuracy": 0.9952888637781143, "num_tokens": 1897748570.0, "step": 4465 }, { "entropy": 0.01903061429038644, "epoch": 1.9545902177481125, "grad_norm": 7.5, "learning_rate": 1.8841364374661613e-05, "loss": 0.1189, "loss_lm": 0.01728170900605619, "loss_seg": 0.10162584111094475, "mean_token_accuracy": 0.9950210303068161, "num_tokens": 1898173721.0, "step": 4466 }, { "entropy": 0.018388927448540926, "epoch": 1.9550279023963233, "grad_norm": 6.5, "learning_rate": 1.883865728207905e-05, "loss": 0.1761, "loss_lm": 0.018431843258440495, "loss_seg": 0.15771372988820076, "mean_token_accuracy": 0.9952458143234253, "num_tokens": 1898599191.0, "step": 4467 }, { "entropy": 0.018157290294766426, "epoch": 1.9554655870445345, "grad_norm": 24.875, "learning_rate": 1.8835950189496483e-05, "loss": 0.1059, "loss_lm": 0.01718137040734291, "loss_seg": 0.08873510546982288, "mean_token_accuracy": 0.9952290803194046, "num_tokens": 1899024233.0, "step": 4468 }, { "entropy": 0.01832470763474703, "epoch": 1.9559032716927454, "grad_norm": 7.28125, "learning_rate": 1.8833243096913914e-05, "loss": 0.1, "loss_lm": 0.014896817039698362, "loss_seg": 0.0850727204233408, "mean_token_accuracy": 0.9952213317155838, "num_tokens": 1899449195.0, "step": 4469 }, { "entropy": 0.018455227836966515, "epoch": 1.9563409563409564, "grad_norm": 33.5, "learning_rate": 1.8830536004331348e-05, "loss": 0.1421, "loss_lm": 0.018171434057876468, "loss_seg": 0.12387924827635288, "mean_token_accuracy": 0.9950781166553497, "num_tokens": 1899872972.0, "step": 4470 }, { "entropy": 0.018605706747621298, "epoch": 1.9567786409891674, "grad_norm": 3.8125, "learning_rate": 1.882782891174878e-05, "loss": 0.0948, "loss_lm": 0.015083617996424437, "loss_seg": 0.07967308256775141, "mean_token_accuracy": 0.9950322955846786, "num_tokens": 1900298683.0, "step": 4471 }, { "entropy": 0.018299355637282133, "epoch": 1.9572163256373782, "grad_norm": 9.0625, "learning_rate": 1.8825121819166215e-05, "loss": 0.075, "loss_lm": 0.014168742345646024, "loss_seg": 0.0608448414131999, "mean_token_accuracy": 0.9952696859836578, "num_tokens": 1900723988.0, "step": 4472 }, { "entropy": 0.017968231346458197, "epoch": 1.9576540102855893, "grad_norm": 19.5, "learning_rate": 1.8822414726583652e-05, "loss": 0.1001, "loss_lm": 0.01550278696231544, "loss_seg": 0.08456775918602943, "mean_token_accuracy": 0.9953609108924866, "num_tokens": 1901148584.0, "step": 4473 }, { "entropy": 0.017812849953770638, "epoch": 1.9580916949338003, "grad_norm": 4.1875, "learning_rate": 1.8819707634001082e-05, "loss": 0.0761, "loss_lm": 0.014089775271713734, "loss_seg": 0.06205913983285427, "mean_token_accuracy": 0.9953145682811737, "num_tokens": 1901573244.0, "step": 4474 }, { "entropy": 0.018807989545166492, "epoch": 1.9585293795820111, "grad_norm": 5.5, "learning_rate": 1.8817000541418516e-05, "loss": 0.1209, "loss_lm": 0.015485289739444852, "loss_seg": 0.10541412979364395, "mean_token_accuracy": 0.9950535148382187, "num_tokens": 1901998371.0, "step": 4475 }, { "entropy": 0.018460713326931, "epoch": 1.9589670642302222, "grad_norm": 4.15625, "learning_rate": 1.881429344883595e-05, "loss": 0.1301, "loss_lm": 0.01624610205180943, "loss_seg": 0.11388930212706327, "mean_token_accuracy": 0.9951347261667252, "num_tokens": 1902423602.0, "step": 4476 }, { "entropy": 0.01837831363081932, "epoch": 1.9594047488784332, "grad_norm": 11.625, "learning_rate": 1.8811586356253384e-05, "loss": 0.1493, "loss_lm": 0.016088966745883226, "loss_seg": 0.13325833901762962, "mean_token_accuracy": 0.9952303320169449, "num_tokens": 1902848539.0, "step": 4477 }, { "entropy": 0.01814119378104806, "epoch": 1.959842433526644, "grad_norm": 14.5625, "learning_rate": 1.880887926367082e-05, "loss": 0.0979, "loss_lm": 0.016097227344289422, "loss_seg": 0.08181467931717634, "mean_token_accuracy": 0.9952571839094162, "num_tokens": 1903273351.0, "step": 4478 }, { "entropy": 0.01812012493610382, "epoch": 1.960280118174855, "grad_norm": 7.4375, "learning_rate": 1.880617217108825e-05, "loss": 0.0988, "loss_lm": 0.015628885477781296, "loss_seg": 0.08313416130840778, "mean_token_accuracy": 0.9952797591686249, "num_tokens": 1903697868.0, "step": 4479 }, { "entropy": 0.018370511010289192, "epoch": 1.960717802823066, "grad_norm": 5.28125, "learning_rate": 1.8803465078505685e-05, "loss": 0.105, "loss_lm": 0.01649344968609512, "loss_seg": 0.08847552258521318, "mean_token_accuracy": 0.9951718598604202, "num_tokens": 1904122653.0, "step": 4480 }, { "entropy": 0.01832927903160453, "epoch": 1.9611554874712769, "grad_norm": 6.1875, "learning_rate": 1.880075798592312e-05, "loss": 0.0993, "loss_lm": 0.01623340114019811, "loss_seg": 0.08306959271430969, "mean_token_accuracy": 0.9952357560396194, "num_tokens": 1904548264.0, "step": 4481 }, { "entropy": 0.017995250411331654, "epoch": 1.961593172119488, "grad_norm": 7.40625, "learning_rate": 1.8798050893340552e-05, "loss": 0.1283, "loss_lm": 0.016047682845965028, "loss_seg": 0.11222631298005581, "mean_token_accuracy": 0.9953483939170837, "num_tokens": 1904973877.0, "step": 4482 }, { "entropy": 0.018587203230708838, "epoch": 1.962030856767699, "grad_norm": 14.5625, "learning_rate": 1.8795343800757986e-05, "loss": 0.1055, "loss_lm": 0.014095915481448174, "loss_seg": 0.09143042378127575, "mean_token_accuracy": 0.9951716363430023, "num_tokens": 1905398606.0, "step": 4483 }, { "entropy": 0.018152296543121338, "epoch": 1.9624685414159098, "grad_norm": 7.8125, "learning_rate": 1.879263670817542e-05, "loss": 0.1183, "loss_lm": 0.01685399143025279, "loss_seg": 0.10148864798247814, "mean_token_accuracy": 0.9951389133930206, "num_tokens": 1905823556.0, "step": 4484 }, { "entropy": 0.018070180900394917, "epoch": 1.9629062260641208, "grad_norm": 12.4375, "learning_rate": 1.8789929615592853e-05, "loss": 0.1325, "loss_lm": 0.017890108982101083, "loss_seg": 0.11465150862932205, "mean_token_accuracy": 0.9951729029417038, "num_tokens": 1906249253.0, "step": 4485 }, { "entropy": 0.018708250485360622, "epoch": 1.9633439107123318, "grad_norm": 16.0, "learning_rate": 1.8787222523010287e-05, "loss": 0.1264, "loss_lm": 0.01602421049028635, "loss_seg": 0.11042007151991129, "mean_token_accuracy": 0.9949971586465836, "num_tokens": 1906674579.0, "step": 4486 }, { "entropy": 0.018506180960685015, "epoch": 1.9637815953605426, "grad_norm": 4.09375, "learning_rate": 1.878451543042772e-05, "loss": 0.1365, "loss_lm": 0.015119804302230477, "loss_seg": 0.12141105625778437, "mean_token_accuracy": 0.9951646775007248, "num_tokens": 1907100155.0, "step": 4487 }, { "entropy": 0.018168199341744184, "epoch": 1.9642192800087537, "grad_norm": 9.875, "learning_rate": 1.8781808337845155e-05, "loss": 0.1175, "loss_lm": 0.014288678299635649, "loss_seg": 0.10324074141681194, "mean_token_accuracy": 0.9954175800085068, "num_tokens": 1907524971.0, "step": 4488 }, { "entropy": 0.018695408944040537, "epoch": 1.9646569646569647, "grad_norm": 6.4375, "learning_rate": 1.877910124526259e-05, "loss": 0.1128, "loss_lm": 0.01693502301350236, "loss_seg": 0.095847237855196, "mean_token_accuracy": 0.9951640218496323, "num_tokens": 1907950252.0, "step": 4489 }, { "entropy": 0.01808860804885626, "epoch": 1.9650946493051755, "grad_norm": 5.75, "learning_rate": 1.8776394152680022e-05, "loss": 0.1363, "loss_lm": 0.015918380115181208, "loss_seg": 0.12039120122790337, "mean_token_accuracy": 0.9953348189592361, "num_tokens": 1908374610.0, "step": 4490 }, { "entropy": 0.018557963892817497, "epoch": 1.9655323339533866, "grad_norm": 6.78125, "learning_rate": 1.8773687060097456e-05, "loss": 0.1264, "loss_lm": 0.016800273209810257, "loss_seg": 0.10959708318114281, "mean_token_accuracy": 0.9951901882886887, "num_tokens": 1908799320.0, "step": 4491 }, { "entropy": 0.018553709145635366, "epoch": 1.9659700186015976, "grad_norm": 6.4375, "learning_rate": 1.877097996751489e-05, "loss": 0.0941, "loss_lm": 0.018660275265574455, "loss_seg": 0.07543467357754707, "mean_token_accuracy": 0.9952279478311539, "num_tokens": 1909224475.0, "step": 4492 }, { "entropy": 0.01853581750765443, "epoch": 1.9664077032498084, "grad_norm": 5.84375, "learning_rate": 1.8768272874932323e-05, "loss": 0.1286, "loss_lm": 0.016391176963225007, "loss_seg": 0.11219721660017967, "mean_token_accuracy": 0.9951968938112259, "num_tokens": 1909649787.0, "step": 4493 }, { "entropy": 0.01853139977902174, "epoch": 1.9668453878980194, "grad_norm": 6.40625, "learning_rate": 1.8765565782349757e-05, "loss": 0.1251, "loss_lm": 0.015381095930933952, "loss_seg": 0.10972452163696289, "mean_token_accuracy": 0.9951911121606827, "num_tokens": 1910075428.0, "step": 4494 }, { "entropy": 0.01827080547809601, "epoch": 1.9672830725462305, "grad_norm": 9.0, "learning_rate": 1.876285868976719e-05, "loss": 0.0777, "loss_lm": 0.015406643971800804, "loss_seg": 0.06232554093003273, "mean_token_accuracy": 0.9952204078435898, "num_tokens": 1910500765.0, "step": 4495 }, { "entropy": 0.017727546393871307, "epoch": 1.9677207571944413, "grad_norm": 14.875, "learning_rate": 1.8760151597184624e-05, "loss": 0.1056, "loss_lm": 0.014931856421753764, "loss_seg": 0.09064670465886593, "mean_token_accuracy": 0.9952540546655655, "num_tokens": 1910925470.0, "step": 4496 }, { "entropy": 0.01801555324345827, "epoch": 1.9681584418426523, "grad_norm": 23.125, "learning_rate": 1.8757444504602058e-05, "loss": 0.1355, "loss_lm": 0.013659329386427999, "loss_seg": 0.12180833797901869, "mean_token_accuracy": 0.995230570435524, "num_tokens": 1911349990.0, "step": 4497 }, { "entropy": 0.018555799964815378, "epoch": 1.9685961264908634, "grad_norm": 3.15625, "learning_rate": 1.8754737412019492e-05, "loss": 0.0993, "loss_lm": 0.018432520562782884, "loss_seg": 0.08091458119452, "mean_token_accuracy": 0.9953050762414932, "num_tokens": 1911774877.0, "step": 4498 }, { "entropy": 0.018357202876359224, "epoch": 1.9690338111390742, "grad_norm": 4.4375, "learning_rate": 1.8752030319436926e-05, "loss": 0.1064, "loss_lm": 0.015387782827019691, "loss_seg": 0.09096559509634972, "mean_token_accuracy": 0.9952731132507324, "num_tokens": 1912199871.0, "step": 4499 }, { "entropy": 0.019003343768417835, "epoch": 1.9694714957872854, "grad_norm": 6.6875, "learning_rate": 1.874932322685436e-05, "loss": 0.1173, "loss_lm": 0.018530954141169786, "loss_seg": 0.0988072119653225, "mean_token_accuracy": 0.9951033592224121, "num_tokens": 1912624849.0, "step": 4500 }, { "entropy": 0.017888969741761684, "epoch": 1.9699091804354962, "grad_norm": 7.28125, "learning_rate": 1.8746616134271793e-05, "loss": 0.1349, "loss_lm": 0.0160237904638052, "loss_seg": 0.11892617866396904, "mean_token_accuracy": 0.9952813535928726, "num_tokens": 1913050137.0, "step": 4501 }, { "entropy": 0.019435339607298374, "epoch": 1.970346865083707, "grad_norm": 6.84375, "learning_rate": 1.8743909041689227e-05, "loss": 0.0949, "loss_lm": 0.01959978509694338, "loss_seg": 0.07529046852141619, "mean_token_accuracy": 0.9949710071086884, "num_tokens": 1913475052.0, "step": 4502 }, { "entropy": 0.018281137570738792, "epoch": 1.9707845497319183, "grad_norm": 11.625, "learning_rate": 1.874120194910666e-05, "loss": 0.1321, "loss_lm": 0.014878504909574986, "loss_seg": 0.11721676960587502, "mean_token_accuracy": 0.9953251779079437, "num_tokens": 1913899386.0, "step": 4503 }, { "entropy": 0.017975935712456703, "epoch": 1.9712222343801291, "grad_norm": 14.1875, "learning_rate": 1.8738494856524094e-05, "loss": 0.0941, "loss_lm": 0.014572306303307414, "loss_seg": 0.07950982637703419, "mean_token_accuracy": 0.9953615963459015, "num_tokens": 1914324289.0, "step": 4504 }, { "entropy": 0.018116414546966553, "epoch": 1.97165991902834, "grad_norm": 21.625, "learning_rate": 1.8735787763941528e-05, "loss": 0.1202, "loss_lm": 0.015709337312728167, "loss_seg": 0.10451401211321354, "mean_token_accuracy": 0.9952979534864426, "num_tokens": 1914749380.0, "step": 4505 }, { "entropy": 0.01823440520092845, "epoch": 1.9720976036765512, "grad_norm": 4.625, "learning_rate": 1.873308067135896e-05, "loss": 0.1074, "loss_lm": 0.015986690996214747, "loss_seg": 0.0913640484213829, "mean_token_accuracy": 0.9952859580516815, "num_tokens": 1915174643.0, "step": 4506 }, { "entropy": 0.017626944929361343, "epoch": 1.972535288324762, "grad_norm": 8.5625, "learning_rate": 1.8730373578776392e-05, "loss": 0.1328, "loss_lm": 0.015428277663886547, "loss_seg": 0.11736248061060905, "mean_token_accuracy": 0.9953356236219406, "num_tokens": 1915600156.0, "step": 4507 }, { "entropy": 0.018408411648124456, "epoch": 1.972972972972973, "grad_norm": 18.75, "learning_rate": 1.8727666486193826e-05, "loss": 0.1216, "loss_lm": 0.016059364890679717, "loss_seg": 0.10558884777128696, "mean_token_accuracy": 0.9951702505350113, "num_tokens": 1916025822.0, "step": 4508 }, { "entropy": 0.018294990994036198, "epoch": 1.973410657621184, "grad_norm": 15.25, "learning_rate": 1.8724959393611263e-05, "loss": 0.1481, "loss_lm": 0.015783994691446424, "loss_seg": 0.1323233414441347, "mean_token_accuracy": 0.9952859729528427, "num_tokens": 1916451103.0, "step": 4509 }, { "entropy": 0.01846554921939969, "epoch": 1.9738483422693949, "grad_norm": 11.625, "learning_rate": 1.8722252301028697e-05, "loss": 0.1332, "loss_lm": 0.014728455105796456, "loss_seg": 0.11849398631602526, "mean_token_accuracy": 0.9951726049184799, "num_tokens": 1916875687.0, "step": 4510 }, { "entropy": 0.017934572882950306, "epoch": 1.974286026917606, "grad_norm": 10.5, "learning_rate": 1.871954520844613e-05, "loss": 0.0928, "loss_lm": 0.016116656828671694, "loss_seg": 0.07670332863926888, "mean_token_accuracy": 0.9953307509422302, "num_tokens": 1917300736.0, "step": 4511 }, { "entropy": 0.01837265957146883, "epoch": 1.974723711565817, "grad_norm": 3.78125, "learning_rate": 1.871683811586356e-05, "loss": 0.1344, "loss_lm": 0.015031960094347596, "loss_seg": 0.11932247504591942, "mean_token_accuracy": 0.9953046143054962, "num_tokens": 1917725991.0, "step": 4512 }, { "entropy": 0.01801103726029396, "epoch": 1.9751613962140278, "grad_norm": 18.375, "learning_rate": 1.8714131023280994e-05, "loss": 0.1, "loss_lm": 0.015729462495073676, "loss_seg": 0.08424352947622538, "mean_token_accuracy": 0.9952968508005142, "num_tokens": 1918150775.0, "step": 4513 }, { "entropy": 0.018378942739218473, "epoch": 1.9755990808622388, "grad_norm": 11.0, "learning_rate": 1.871142393069843e-05, "loss": 0.0862, "loss_lm": 0.016506088199093938, "loss_seg": 0.06972450576722622, "mean_token_accuracy": 0.9952272176742554, "num_tokens": 1918575412.0, "step": 4514 }, { "entropy": 0.01864045998081565, "epoch": 1.9760367655104498, "grad_norm": 15.0625, "learning_rate": 1.8708716838115865e-05, "loss": 0.1283, "loss_lm": 0.016906990204006433, "loss_seg": 0.1113765062764287, "mean_token_accuracy": 0.9951198846101761, "num_tokens": 1919000548.0, "step": 4515 }, { "entropy": 0.018000314943492413, "epoch": 1.9764744501586606, "grad_norm": 9.6875, "learning_rate": 1.87060097455333e-05, "loss": 0.115, "loss_lm": 0.015165872406214476, "loss_seg": 0.09987625852227211, "mean_token_accuracy": 0.995455339550972, "num_tokens": 1919425824.0, "step": 4516 }, { "entropy": 0.018739907536655664, "epoch": 1.9769121348068717, "grad_norm": 5.71875, "learning_rate": 1.870330265295073e-05, "loss": 0.1146, "loss_lm": 0.014800918055698276, "loss_seg": 0.09976614266633987, "mean_token_accuracy": 0.995121955871582, "num_tokens": 1919851340.0, "step": 4517 }, { "entropy": 0.01819651434198022, "epoch": 1.9773498194550827, "grad_norm": 13.1875, "learning_rate": 1.8700595560368163e-05, "loss": 0.0993, "loss_lm": 0.013951204484328628, "loss_seg": 0.085392314940691, "mean_token_accuracy": 0.995294064283371, "num_tokens": 1920275228.0, "step": 4518 }, { "entropy": 0.018290342297405005, "epoch": 1.9777875041032935, "grad_norm": 4.28125, "learning_rate": 1.86978884677856e-05, "loss": 0.11, "loss_lm": 0.014807492960244417, "loss_seg": 0.09518097713589668, "mean_token_accuracy": 0.9951379299163818, "num_tokens": 1920700548.0, "step": 4519 }, { "entropy": 0.019102973863482475, "epoch": 1.9782251887515045, "grad_norm": 8.125, "learning_rate": 1.8695181375203034e-05, "loss": 0.1039, "loss_lm": 0.01647783233784139, "loss_seg": 0.08741333149373531, "mean_token_accuracy": 0.9950229376554489, "num_tokens": 1921126273.0, "step": 4520 }, { "entropy": 0.01863777916878462, "epoch": 1.9786628733997156, "grad_norm": 11.9375, "learning_rate": 1.8692474282620468e-05, "loss": 0.0984, "loss_lm": 0.017016951460391283, "loss_seg": 0.08135294541716576, "mean_token_accuracy": 0.9951679706573486, "num_tokens": 1921551327.0, "step": 4521 }, { "entropy": 0.018622877076268196, "epoch": 1.9791005580479264, "grad_norm": 5.09375, "learning_rate": 1.8689767190037898e-05, "loss": 0.0686, "loss_lm": 0.01659031817689538, "loss_seg": 0.05205420684069395, "mean_token_accuracy": 0.9951570183038712, "num_tokens": 1921976653.0, "step": 4522 }, { "entropy": 0.017855240032076836, "epoch": 1.9795382426961374, "grad_norm": 9.375, "learning_rate": 1.868706009745533e-05, "loss": 0.1133, "loss_lm": 0.014009080594405532, "loss_seg": 0.09924527257680893, "mean_token_accuracy": 0.9953256398439407, "num_tokens": 1922401901.0, "step": 4523 }, { "entropy": 0.01847260119393468, "epoch": 1.9799759273443485, "grad_norm": 4.84375, "learning_rate": 1.868435300487277e-05, "loss": 0.1134, "loss_lm": 0.01699949335306883, "loss_seg": 0.09641010500490665, "mean_token_accuracy": 0.9952712655067444, "num_tokens": 1922827336.0, "step": 4524 }, { "entropy": 0.018184051383286715, "epoch": 1.9804136119925593, "grad_norm": 12.6875, "learning_rate": 1.8681645912290202e-05, "loss": 0.1283, "loss_lm": 0.01702893991023302, "loss_seg": 0.11127360351383686, "mean_token_accuracy": 0.9951554089784622, "num_tokens": 1923252510.0, "step": 4525 }, { "entropy": 0.019269295502454042, "epoch": 1.9808512966407703, "grad_norm": 11.375, "learning_rate": 1.8678938819707636e-05, "loss": 0.1321, "loss_lm": 0.01738942856900394, "loss_seg": 0.11472242046147585, "mean_token_accuracy": 0.9950573593378067, "num_tokens": 1923677560.0, "step": 4526 }, { "entropy": 0.018564452417194843, "epoch": 1.9812889812889813, "grad_norm": 11.0, "learning_rate": 1.8676231727125067e-05, "loss": 0.1031, "loss_lm": 0.013988946098834276, "loss_seg": 0.08912155032157898, "mean_token_accuracy": 0.9952009916305542, "num_tokens": 1924102613.0, "step": 4527 }, { "entropy": 0.019434466026723385, "epoch": 1.9817266659371922, "grad_norm": 7.65625, "learning_rate": 1.86735246345425e-05, "loss": 0.1348, "loss_lm": 0.01819130009971559, "loss_seg": 0.11657966580241919, "mean_token_accuracy": 0.9949291497468948, "num_tokens": 1924527582.0, "step": 4528 }, { "entropy": 0.01897293422371149, "epoch": 1.9821643505854032, "grad_norm": 4.65625, "learning_rate": 1.8670817541959937e-05, "loss": 0.0995, "loss_lm": 0.018877348164096475, "loss_seg": 0.08058178424835205, "mean_token_accuracy": 0.9948944747447968, "num_tokens": 1924952607.0, "step": 4529 }, { "entropy": 0.01803932385519147, "epoch": 1.9826020352336142, "grad_norm": 5.65625, "learning_rate": 1.866811044937737e-05, "loss": 0.102, "loss_lm": 0.013692521722987294, "loss_seg": 0.08829308114945889, "mean_token_accuracy": 0.9953788667917252, "num_tokens": 1925377764.0, "step": 4530 }, { "entropy": 0.018281881231814623, "epoch": 1.983039719881825, "grad_norm": 6.625, "learning_rate": 1.86654033567948e-05, "loss": 0.1047, "loss_lm": 0.016029075253754854, "loss_seg": 0.08871622383594513, "mean_token_accuracy": 0.9951762408018112, "num_tokens": 1925802474.0, "step": 4531 }, { "entropy": 0.017822171095758677, "epoch": 1.983477404530036, "grad_norm": 6.25, "learning_rate": 1.8662696264212235e-05, "loss": 0.1276, "loss_lm": 0.014668430667370558, "loss_seg": 0.11292912624776363, "mean_token_accuracy": 0.9954329878091812, "num_tokens": 1926227422.0, "step": 4532 }, { "entropy": 0.01756523083895445, "epoch": 1.983915089178247, "grad_norm": 4.46875, "learning_rate": 1.865998917162967e-05, "loss": 0.0862, "loss_lm": 0.013287423178553581, "loss_seg": 0.07294942252337933, "mean_token_accuracy": 0.9954040050506592, "num_tokens": 1926652534.0, "step": 4533 }, { "entropy": 0.01817277492955327, "epoch": 1.984352773826458, "grad_norm": 6.15625, "learning_rate": 1.8657282079047106e-05, "loss": 0.116, "loss_lm": 0.014834005618467927, "loss_seg": 0.1011396124958992, "mean_token_accuracy": 0.9952269047498703, "num_tokens": 1927077144.0, "step": 4534 }, { "entropy": 0.018611727748066187, "epoch": 1.984790458474669, "grad_norm": 8.5625, "learning_rate": 1.865457498646454e-05, "loss": 0.1237, "loss_lm": 0.016827705316245556, "loss_seg": 0.10688203386962414, "mean_token_accuracy": 0.995212510228157, "num_tokens": 1927501954.0, "step": 4535 }, { "entropy": 0.0185988862067461, "epoch": 1.98522814312288, "grad_norm": 4.875, "learning_rate": 1.865186789388197e-05, "loss": 0.0923, "loss_lm": 0.014259157003834844, "loss_seg": 0.07803387846797705, "mean_token_accuracy": 0.9952341020107269, "num_tokens": 1927927028.0, "step": 4536 }, { "entropy": 0.01845573214814067, "epoch": 1.9856658277710908, "grad_norm": 3.953125, "learning_rate": 1.8649160801299404e-05, "loss": 0.1036, "loss_lm": 0.019017399288713932, "loss_seg": 0.08457288518548012, "mean_token_accuracy": 0.9952070564031601, "num_tokens": 1928352631.0, "step": 4537 }, { "entropy": 0.018011716194450855, "epoch": 1.986103512419302, "grad_norm": 49.75, "learning_rate": 1.8646453708716838e-05, "loss": 0.1366, "loss_lm": 0.01914265425875783, "loss_seg": 0.11748034320771694, "mean_token_accuracy": 0.9953015595674515, "num_tokens": 1928778245.0, "step": 4538 }, { "entropy": 0.018883895594626665, "epoch": 1.9865411970675129, "grad_norm": 8.375, "learning_rate": 1.864374661613427e-05, "loss": 0.1588, "loss_lm": 0.01849051914177835, "loss_seg": 0.14035938680171967, "mean_token_accuracy": 0.9950871467590332, "num_tokens": 1929203144.0, "step": 4539 }, { "entropy": 0.018471880350261927, "epoch": 1.9869788817157237, "grad_norm": 7.46875, "learning_rate": 1.864103952355171e-05, "loss": 0.1181, "loss_lm": 0.01409918605349958, "loss_seg": 0.10398599971085787, "mean_token_accuracy": 0.9951692670583725, "num_tokens": 1929628308.0, "step": 4540 }, { "entropy": 0.017751680221408606, "epoch": 1.987416566363935, "grad_norm": 4.125, "learning_rate": 1.863833243096914e-05, "loss": 0.1055, "loss_lm": 0.014667825307697058, "loss_seg": 0.09086635150015354, "mean_token_accuracy": 0.9954905956983566, "num_tokens": 1930053349.0, "step": 4541 }, { "entropy": 0.018036632798612118, "epoch": 1.9878542510121457, "grad_norm": 3.109375, "learning_rate": 1.8635625338386572e-05, "loss": 0.1181, "loss_lm": 0.016641421243548393, "loss_seg": 0.10146557167172432, "mean_token_accuracy": 0.9953738451004028, "num_tokens": 1930477195.0, "step": 4542 }, { "entropy": 0.01858297362923622, "epoch": 1.9882919356603566, "grad_norm": 6.03125, "learning_rate": 1.8632918245804006e-05, "loss": 0.0802, "loss_lm": 0.01733161276206374, "loss_seg": 0.06286235433071852, "mean_token_accuracy": 0.9951481223106384, "num_tokens": 1930902166.0, "step": 4543 }, { "entropy": 0.018582123797386885, "epoch": 1.9887296203085678, "grad_norm": 9.75, "learning_rate": 1.863021115322144e-05, "loss": 0.1205, "loss_lm": 0.01891564973630011, "loss_seg": 0.10161113739013672, "mean_token_accuracy": 0.9949155747890472, "num_tokens": 1931326765.0, "step": 4544 }, { "entropy": 0.018573629669845104, "epoch": 1.9891673049567786, "grad_norm": 6.40625, "learning_rate": 1.8627504060638877e-05, "loss": 0.1436, "loss_lm": 0.015209601959213614, "loss_seg": 0.12843608111143112, "mean_token_accuracy": 0.9952054917812347, "num_tokens": 1931752414.0, "step": 4545 }, { "entropy": 0.018086975440382957, "epoch": 1.9896049896049897, "grad_norm": 10.5625, "learning_rate": 1.8624796968056307e-05, "loss": 0.1105, "loss_lm": 0.016376947984099388, "loss_seg": 0.09410057868808508, "mean_token_accuracy": 0.995345950126648, "num_tokens": 1932177117.0, "step": 4546 }, { "entropy": 0.018061752896755934, "epoch": 1.9900426742532007, "grad_norm": 3.671875, "learning_rate": 1.862208987547374e-05, "loss": 0.1273, "loss_lm": 0.014805297367274761, "loss_seg": 0.11250926926732063, "mean_token_accuracy": 0.9952781498432159, "num_tokens": 1932602129.0, "step": 4547 }, { "entropy": 0.01833190117031336, "epoch": 1.9904803589014115, "grad_norm": 12.1875, "learning_rate": 1.8619382782891175e-05, "loss": 0.1028, "loss_lm": 0.017912856535986066, "loss_seg": 0.08485998772084713, "mean_token_accuracy": 0.9952107071876526, "num_tokens": 1933027596.0, "step": 4548 }, { "entropy": 0.01811986556276679, "epoch": 1.9909180435496225, "grad_norm": 40.75, "learning_rate": 1.861667569030861e-05, "loss": 0.1142, "loss_lm": 0.01701879291795194, "loss_seg": 0.09723081905394793, "mean_token_accuracy": 0.9952085316181183, "num_tokens": 1933452072.0, "step": 4549 }, { "entropy": 0.01877999398857355, "epoch": 1.9913557281978336, "grad_norm": 3.640625, "learning_rate": 1.8613968597726046e-05, "loss": 0.1523, "loss_lm": 0.0181752466596663, "loss_seg": 0.13408122397959232, "mean_token_accuracy": 0.9950578510761261, "num_tokens": 1933876768.0, "step": 4550 }, { "entropy": 0.018540442921221256, "epoch": 1.9917934128460444, "grad_norm": 7.1875, "learning_rate": 1.8611261505143476e-05, "loss": 0.1459, "loss_lm": 0.01881134044378996, "loss_seg": 0.12705600261688232, "mean_token_accuracy": 0.9950554519891739, "num_tokens": 1934302068.0, "step": 4551 }, { "entropy": 0.018832465168088675, "epoch": 1.9922310974942554, "grad_norm": 8.1875, "learning_rate": 1.860855441256091e-05, "loss": 0.126, "loss_lm": 0.018835014197975397, "loss_seg": 0.10720246657729149, "mean_token_accuracy": 0.9950587451457977, "num_tokens": 1934727406.0, "step": 4552 }, { "entropy": 0.018117312341928482, "epoch": 1.9926687821424665, "grad_norm": 10.8125, "learning_rate": 1.8605847319978343e-05, "loss": 0.1037, "loss_lm": 0.015404108678922057, "loss_seg": 0.08834293950349092, "mean_token_accuracy": 0.9952877759933472, "num_tokens": 1935152636.0, "step": 4553 }, { "entropy": 0.018592453096061945, "epoch": 1.9931064667906773, "grad_norm": 6.8125, "learning_rate": 1.8603140227395777e-05, "loss": 0.104, "loss_lm": 0.01807395159266889, "loss_seg": 0.08592728525400162, "mean_token_accuracy": 0.9951787889003754, "num_tokens": 1935576902.0, "step": 4554 }, { "entropy": 0.01765643386170268, "epoch": 1.9935441514388883, "grad_norm": 7.125, "learning_rate": 1.860043313481321e-05, "loss": 0.1362, "loss_lm": 0.016266452614217997, "loss_seg": 0.11988760717213154, "mean_token_accuracy": 0.9954010993242264, "num_tokens": 1936001290.0, "step": 4555 }, { "entropy": 0.01910167559981346, "epoch": 1.9939818360870993, "grad_norm": 11.8125, "learning_rate": 1.8597726042230645e-05, "loss": 0.1288, "loss_lm": 0.01710948790423572, "loss_seg": 0.1116966251283884, "mean_token_accuracy": 0.9950521290302277, "num_tokens": 1936427121.0, "step": 4556 }, { "entropy": 0.018221613485366106, "epoch": 1.9944195207353101, "grad_norm": 5.8125, "learning_rate": 1.859501894964808e-05, "loss": 0.1133, "loss_lm": 0.014227590057998896, "loss_seg": 0.09902294911444187, "mean_token_accuracy": 0.9951881468296051, "num_tokens": 1936852086.0, "step": 4557 }, { "entropy": 0.018137686420232058, "epoch": 1.9948572053835212, "grad_norm": 7.4375, "learning_rate": 1.8592311857065512e-05, "loss": 0.1744, "loss_lm": 0.016210803762078285, "loss_seg": 0.15815329179167747, "mean_token_accuracy": 0.9953262805938721, "num_tokens": 1937276335.0, "step": 4558 }, { "entropy": 0.018588600680232048, "epoch": 1.9952948900317322, "grad_norm": 9.3125, "learning_rate": 1.8589604764482946e-05, "loss": 0.1281, "loss_lm": 0.015406643273308873, "loss_seg": 0.1126568466424942, "mean_token_accuracy": 0.9951922744512558, "num_tokens": 1937700992.0, "step": 4559 }, { "entropy": 0.018320761155337095, "epoch": 1.995732574679943, "grad_norm": 6.59375, "learning_rate": 1.858689767190038e-05, "loss": 0.1315, "loss_lm": 0.01402475987561047, "loss_seg": 0.11742733232676983, "mean_token_accuracy": 0.9952420890331268, "num_tokens": 1938125508.0, "step": 4560 }, { "entropy": 0.01828215504065156, "epoch": 1.996170259328154, "grad_norm": 8.625, "learning_rate": 1.8584190579317813e-05, "loss": 0.0904, "loss_lm": 0.01501145912334323, "loss_seg": 0.0753987766802311, "mean_token_accuracy": 0.9952540993690491, "num_tokens": 1938550455.0, "step": 4561 }, { "entropy": 0.01817974215373397, "epoch": 1.996607943976365, "grad_norm": 4.03125, "learning_rate": 1.8581483486735247e-05, "loss": 0.1399, "loss_lm": 0.01693240785971284, "loss_seg": 0.12295107543468475, "mean_token_accuracy": 0.9952541142702103, "num_tokens": 1938975847.0, "step": 4562 }, { "entropy": 0.01825166679918766, "epoch": 1.997045628624576, "grad_norm": 4.40625, "learning_rate": 1.857877639415268e-05, "loss": 0.1152, "loss_lm": 0.017542315181344748, "loss_seg": 0.0976656898856163, "mean_token_accuracy": 0.9952477365732193, "num_tokens": 1939400448.0, "step": 4563 }, { "entropy": 0.018244620878249407, "epoch": 1.997483313272787, "grad_norm": 8.5, "learning_rate": 1.8576069301570114e-05, "loss": 0.1251, "loss_lm": 0.01628025504760444, "loss_seg": 0.10884623788297176, "mean_token_accuracy": 0.9952555745840073, "num_tokens": 1939825545.0, "step": 4564 }, { "entropy": 0.018013036344200373, "epoch": 1.997920997920998, "grad_norm": 7.84375, "learning_rate": 1.8573362208987548e-05, "loss": 0.1024, "loss_lm": 0.015584914945065975, "loss_seg": 0.08676578663289547, "mean_token_accuracy": 0.9953353404998779, "num_tokens": 1940250398.0, "step": 4565 }, { "entropy": 0.017881785985082388, "epoch": 1.9983586825692088, "grad_norm": 7.65625, "learning_rate": 1.8570655116404982e-05, "loss": 0.1616, "loss_lm": 0.017219489440321922, "loss_seg": 0.14434103295207024, "mean_token_accuracy": 0.9952753484249115, "num_tokens": 1940675138.0, "step": 4566 }, { "entropy": 0.019405371975153685, "epoch": 1.9987963672174198, "grad_norm": 9.75, "learning_rate": 1.8567948023822416e-05, "loss": 0.1099, "loss_lm": 0.015255618141964078, "loss_seg": 0.09467945992946625, "mean_token_accuracy": 0.9949367344379425, "num_tokens": 1941100729.0, "step": 4567 }, { "entropy": 0.01818977016955614, "epoch": 1.9992340518656309, "grad_norm": 17.375, "learning_rate": 1.856524093123985e-05, "loss": 0.1389, "loss_lm": 0.013863106491044164, "loss_seg": 0.12508055567741394, "mean_token_accuracy": 0.9953041970729828, "num_tokens": 1941525952.0, "step": 4568 }, { "entropy": 0.0176751590333879, "epoch": 1.9996717365138417, "grad_norm": 7.8125, "learning_rate": 1.8562533838657283e-05, "loss": 0.13, "loss_lm": 0.014521615812554955, "loss_seg": 0.11549775674939156, "mean_token_accuracy": 0.9952880591154099, "num_tokens": 1941950355.0, "step": 4569 }, { "entropy": 0.019034477571646374, "epoch": 2.0, "grad_norm": 7.9375, "learning_rate": 1.8559826746074717e-05, "loss": 0.108, "loss_lm": 0.016799965873360634, "loss_seg": 0.09118206302324931, "mean_token_accuracy": 0.9950141708056132, "num_tokens": 1942269346.0, "step": 4570 }, { "entropy": 0.01824103156104684, "epoch": 2.000437684648211, "grad_norm": 24.5, "learning_rate": 1.855711965349215e-05, "loss": 0.153, "loss_lm": 0.01768612931482494, "loss_seg": 0.13533846195787191, "mean_token_accuracy": 0.9952096790075302, "num_tokens": 1942695569.0, "step": 4571 }, { "entropy": 0.018552865833044052, "epoch": 2.000875369296422, "grad_norm": 11.0, "learning_rate": 1.8554412560909584e-05, "loss": 0.1321, "loss_lm": 0.014728046720847487, "loss_seg": 0.11740596778690815, "mean_token_accuracy": 0.9950998425483704, "num_tokens": 1943120428.0, "step": 4572 }, { "entropy": 0.017754810862243176, "epoch": 2.001313053944633, "grad_norm": 5.4375, "learning_rate": 1.8551705468327018e-05, "loss": 0.1001, "loss_lm": 0.01499918894842267, "loss_seg": 0.08511744067072868, "mean_token_accuracy": 0.9954289942979813, "num_tokens": 1943545224.0, "step": 4573 }, { "entropy": 0.019060714170336723, "epoch": 2.0017507385928437, "grad_norm": 4.625, "learning_rate": 1.854899837574445e-05, "loss": 0.1085, "loss_lm": 0.01773067656904459, "loss_seg": 0.0907856859266758, "mean_token_accuracy": 0.9951037615537643, "num_tokens": 1943971007.0, "step": 4574 }, { "entropy": 0.019170986022800207, "epoch": 2.002188423241055, "grad_norm": 7.46875, "learning_rate": 1.8546291283161882e-05, "loss": 0.1269, "loss_lm": 0.015385542530566454, "loss_seg": 0.11148248892277479, "mean_token_accuracy": 0.9950518906116486, "num_tokens": 1944396219.0, "step": 4575 }, { "entropy": 0.017538235522806644, "epoch": 2.0026261078892658, "grad_norm": 3.484375, "learning_rate": 1.854358419057932e-05, "loss": 0.1171, "loss_lm": 0.014984310837462544, "loss_seg": 0.10215618647634983, "mean_token_accuracy": 0.9953556060791016, "num_tokens": 1944821222.0, "step": 4576 }, { "entropy": 0.017855226062238216, "epoch": 2.0030637925374766, "grad_norm": 4.34375, "learning_rate": 1.8540877097996753e-05, "loss": 0.102, "loss_lm": 0.016128108371049166, "loss_seg": 0.0858814436942339, "mean_token_accuracy": 0.995377779006958, "num_tokens": 1945246295.0, "step": 4577 }, { "entropy": 0.019354387652128935, "epoch": 2.003501477185688, "grad_norm": 9.8125, "learning_rate": 1.8538170005414187e-05, "loss": 0.1111, "loss_lm": 0.01587423635646701, "loss_seg": 0.09519252181053162, "mean_token_accuracy": 0.9950076043605804, "num_tokens": 1945671734.0, "step": 4578 }, { "entropy": 0.018053743056952953, "epoch": 2.0039391618338986, "grad_norm": 15.4375, "learning_rate": 1.8535462912831617e-05, "loss": 0.1215, "loss_lm": 0.014953664503991604, "loss_seg": 0.10657019540667534, "mean_token_accuracy": 0.9952977001667023, "num_tokens": 1946096789.0, "step": 4579 }, { "entropy": 0.018290599808096886, "epoch": 2.0043768464821095, "grad_norm": 10.25, "learning_rate": 1.853275582024905e-05, "loss": 0.0892, "loss_lm": 0.013673274079337716, "loss_seg": 0.07556416280567646, "mean_token_accuracy": 0.9952020645141602, "num_tokens": 1946521703.0, "step": 4580 }, { "entropy": 0.01859542354941368, "epoch": 2.0048145311303207, "grad_norm": 3.09375, "learning_rate": 1.8530048727666488e-05, "loss": 0.0776, "loss_lm": 0.01673771138302982, "loss_seg": 0.060840802267193794, "mean_token_accuracy": 0.9952052086591721, "num_tokens": 1946946898.0, "step": 4581 }, { "entropy": 0.018597074318677187, "epoch": 2.0052522157785315, "grad_norm": 8.5, "learning_rate": 1.852734163508392e-05, "loss": 0.0829, "loss_lm": 0.01570497127249837, "loss_seg": 0.06717453431338072, "mean_token_accuracy": 0.9950889199972153, "num_tokens": 1947371833.0, "step": 4582 }, { "entropy": 0.018439081963151693, "epoch": 2.0056899004267423, "grad_norm": 3.875, "learning_rate": 1.8524634542501355e-05, "loss": 0.11, "loss_lm": 0.014392856508493423, "loss_seg": 0.09561972692608833, "mean_token_accuracy": 0.9952328503131866, "num_tokens": 1947796170.0, "step": 4583 }, { "entropy": 0.017949230037629604, "epoch": 2.0061275850749536, "grad_norm": 6.5625, "learning_rate": 1.8521927449918786e-05, "loss": 0.0874, "loss_lm": 0.015196983236819506, "loss_seg": 0.07215483114123344, "mean_token_accuracy": 0.9953444451093674, "num_tokens": 1948220570.0, "step": 4584 }, { "entropy": 0.01857849070802331, "epoch": 2.0065652697231644, "grad_norm": 11.0, "learning_rate": 1.851922035733622e-05, "loss": 0.1063, "loss_lm": 0.016601267736405134, "loss_seg": 0.08965833857655525, "mean_token_accuracy": 0.99509696662426, "num_tokens": 1948644766.0, "step": 4585 }, { "entropy": 0.017669137567281723, "epoch": 2.007002954371375, "grad_norm": 4.96875, "learning_rate": 1.8516513264753656e-05, "loss": 0.1585, "loss_lm": 0.015051786322146654, "loss_seg": 0.14345011673867702, "mean_token_accuracy": 0.9953128844499588, "num_tokens": 1949069728.0, "step": 4586 }, { "entropy": 0.01755339466035366, "epoch": 2.0074406390195865, "grad_norm": 7.875, "learning_rate": 1.851380617217109e-05, "loss": 0.0964, "loss_lm": 0.01632894203066826, "loss_seg": 0.08007950708270073, "mean_token_accuracy": 0.9954309463500977, "num_tokens": 1949493612.0, "step": 4587 }, { "entropy": 0.0187807260081172, "epoch": 2.0078783236677973, "grad_norm": 19.625, "learning_rate": 1.8511099079588524e-05, "loss": 0.1325, "loss_lm": 0.015500415582209826, "loss_seg": 0.11697578243911266, "mean_token_accuracy": 0.9952135682106018, "num_tokens": 1949918740.0, "step": 4588 }, { "entropy": 0.018809079192578793, "epoch": 2.008316008316008, "grad_norm": 8.25, "learning_rate": 1.8508391987005954e-05, "loss": 0.0979, "loss_lm": 0.01792944734916091, "loss_seg": 0.07997569255530834, "mean_token_accuracy": 0.9951342046260834, "num_tokens": 1950344953.0, "step": 4589 }, { "entropy": 0.01866569509729743, "epoch": 2.0087536929642194, "grad_norm": 8.4375, "learning_rate": 1.8505684894423388e-05, "loss": 0.1588, "loss_lm": 0.01596895745024085, "loss_seg": 0.14284427091479301, "mean_token_accuracy": 0.9950728863477707, "num_tokens": 1950771134.0, "step": 4590 }, { "entropy": 0.018194176256656647, "epoch": 2.00919137761243, "grad_norm": 9.9375, "learning_rate": 1.8502977801840825e-05, "loss": 0.1334, "loss_lm": 0.021127112209796906, "loss_seg": 0.11231299303472042, "mean_token_accuracy": 0.9951597452163696, "num_tokens": 1951196292.0, "step": 4591 }, { "entropy": 0.017731234896928072, "epoch": 2.0096290622606414, "grad_norm": 28.75, "learning_rate": 1.850027070925826e-05, "loss": 0.0865, "loss_lm": 0.01670471951365471, "loss_seg": 0.06978978775441647, "mean_token_accuracy": 0.9953618794679642, "num_tokens": 1951621200.0, "step": 4592 }, { "entropy": 0.01802830444648862, "epoch": 2.0100667469088522, "grad_norm": 7.46875, "learning_rate": 1.8497563616675692e-05, "loss": 0.0964, "loss_lm": 0.014920774381607771, "loss_seg": 0.08145039901137352, "mean_token_accuracy": 0.9951921999454498, "num_tokens": 1952046302.0, "step": 4593 }, { "entropy": 0.01844639889895916, "epoch": 2.010504431557063, "grad_norm": 6.0625, "learning_rate": 1.8494856524093123e-05, "loss": 0.1032, "loss_lm": 0.016015905886888504, "loss_seg": 0.08716648258268833, "mean_token_accuracy": 0.9952288419008255, "num_tokens": 1952471912.0, "step": 4594 }, { "entropy": 0.018064417876303196, "epoch": 2.0109421162052743, "grad_norm": 9.0625, "learning_rate": 1.8492149431510557e-05, "loss": 0.1234, "loss_lm": 0.015809431206434965, "loss_seg": 0.10759087279438972, "mean_token_accuracy": 0.9951295405626297, "num_tokens": 1952896894.0, "step": 4595 }, { "entropy": 0.017634061630815268, "epoch": 2.011379800853485, "grad_norm": 9.6875, "learning_rate": 1.8489442338927994e-05, "loss": 0.0999, "loss_lm": 0.013646798674017191, "loss_seg": 0.08622503094375134, "mean_token_accuracy": 0.9953639358282089, "num_tokens": 1953321114.0, "step": 4596 }, { "entropy": 0.017966400366276503, "epoch": 2.011817485501696, "grad_norm": 3.171875, "learning_rate": 1.8486735246345427e-05, "loss": 0.1199, "loss_lm": 0.014688968192785978, "loss_seg": 0.10522887483239174, "mean_token_accuracy": 0.9954300075769424, "num_tokens": 1953745571.0, "step": 4597 }, { "entropy": 0.017399197444319725, "epoch": 2.012255170149907, "grad_norm": 3.71875, "learning_rate": 1.848402815376286e-05, "loss": 0.1165, "loss_lm": 0.015404230915009975, "loss_seg": 0.10110463201999664, "mean_token_accuracy": 0.9954805374145508, "num_tokens": 1954169880.0, "step": 4598 }, { "entropy": 0.01804171921685338, "epoch": 2.012692854798118, "grad_norm": 14.3125, "learning_rate": 1.848132106118029e-05, "loss": 0.1457, "loss_lm": 0.017809559823945165, "loss_seg": 0.12786454521119595, "mean_token_accuracy": 0.9953452497720718, "num_tokens": 1954595209.0, "step": 4599 }, { "entropy": 0.01766124926507473, "epoch": 2.013130539446329, "grad_norm": 3.359375, "learning_rate": 1.8478613968597725e-05, "loss": 0.1088, "loss_lm": 0.014993880176916718, "loss_seg": 0.09380980767309666, "mean_token_accuracy": 0.9954749494791031, "num_tokens": 1955020124.0, "step": 4600 }, { "entropy": 0.017842423636466265, "epoch": 2.01356822409454, "grad_norm": 3.40625, "learning_rate": 1.8475906876015162e-05, "loss": 0.1046, "loss_lm": 0.013760473113507032, "loss_seg": 0.0907991211861372, "mean_token_accuracy": 0.9953193217515945, "num_tokens": 1955444381.0, "step": 4601 }, { "entropy": 0.018910119775682688, "epoch": 2.014005908742751, "grad_norm": 5.15625, "learning_rate": 1.8473199783432596e-05, "loss": 0.1026, "loss_lm": 0.016037453431636095, "loss_seg": 0.08660726249217987, "mean_token_accuracy": 0.99509297311306, "num_tokens": 1955869849.0, "step": 4602 }, { "entropy": 0.018160824663937092, "epoch": 2.0144435933909617, "grad_norm": 4.25, "learning_rate": 1.8470492690850026e-05, "loss": 0.097, "loss_lm": 0.016274931142106652, "loss_seg": 0.08075242303311825, "mean_token_accuracy": 0.9953235387802124, "num_tokens": 1956295555.0, "step": 4603 }, { "entropy": 0.017971842549741268, "epoch": 2.014881278039173, "grad_norm": 15.5, "learning_rate": 1.846778559826746e-05, "loss": 0.0958, "loss_lm": 0.01636633463203907, "loss_seg": 0.07940300181508064, "mean_token_accuracy": 0.995275467634201, "num_tokens": 1956720939.0, "step": 4604 }, { "entropy": 0.018084720708429813, "epoch": 2.0153189626873838, "grad_norm": 10.125, "learning_rate": 1.8465078505684894e-05, "loss": 0.1379, "loss_lm": 0.018405604176223278, "loss_seg": 0.11945170164108276, "mean_token_accuracy": 0.9952069967985153, "num_tokens": 1957145507.0, "step": 4605 }, { "entropy": 0.01864770520478487, "epoch": 2.0157566473355946, "grad_norm": 3.203125, "learning_rate": 1.8462371413102328e-05, "loss": 0.133, "loss_lm": 0.01890540774911642, "loss_seg": 0.11413496732711792, "mean_token_accuracy": 0.9951371103525162, "num_tokens": 1957570320.0, "step": 4606 }, { "entropy": 0.018493690062314272, "epoch": 2.016194331983806, "grad_norm": 5.03125, "learning_rate": 1.8459664320519765e-05, "loss": 0.0838, "loss_lm": 0.01580926775932312, "loss_seg": 0.06800917256623507, "mean_token_accuracy": 0.995133712887764, "num_tokens": 1957995887.0, "step": 4607 }, { "entropy": 0.018284082878381014, "epoch": 2.0166320166320166, "grad_norm": 10.5625, "learning_rate": 1.8456957227937195e-05, "loss": 0.0835, "loss_lm": 0.017994002206251025, "loss_seg": 0.0654895231127739, "mean_token_accuracy": 0.9951352328062057, "num_tokens": 1958420545.0, "step": 4608 }, { "entropy": 0.01828260999172926, "epoch": 2.0170697012802274, "grad_norm": 5.53125, "learning_rate": 1.845425013535463e-05, "loss": 0.112, "loss_lm": 0.01638089376501739, "loss_seg": 0.09559562057256699, "mean_token_accuracy": 0.9952299296855927, "num_tokens": 1958845853.0, "step": 4609 }, { "entropy": 0.018463750835508108, "epoch": 2.0175073859284387, "grad_norm": 7.03125, "learning_rate": 1.8451543042772062e-05, "loss": 0.0932, "loss_lm": 0.01560356467962265, "loss_seg": 0.07761653419584036, "mean_token_accuracy": 0.9951674193143845, "num_tokens": 1959270655.0, "step": 4610 }, { "entropy": 0.01773066120222211, "epoch": 2.0179450705766495, "grad_norm": 7.3125, "learning_rate": 1.8448835950189496e-05, "loss": 0.0904, "loss_lm": 0.014345772098749876, "loss_seg": 0.07610266283154488, "mean_token_accuracy": 0.9953287094831467, "num_tokens": 1959695333.0, "step": 4611 }, { "entropy": 0.018450279720127583, "epoch": 2.0183827552248603, "grad_norm": 11.5625, "learning_rate": 1.8446128857606933e-05, "loss": 0.1042, "loss_lm": 0.013674578163772821, "loss_seg": 0.09053758904337883, "mean_token_accuracy": 0.9951759725809097, "num_tokens": 1960120677.0, "step": 4612 }, { "entropy": 0.018056413158774376, "epoch": 2.0188204398730716, "grad_norm": 4.3125, "learning_rate": 1.8443421765024364e-05, "loss": 0.1174, "loss_lm": 0.01599938585422933, "loss_seg": 0.10135139618068933, "mean_token_accuracy": 0.9953777194023132, "num_tokens": 1960545722.0, "step": 4613 }, { "entropy": 0.01809129910543561, "epoch": 2.0192581245212824, "grad_norm": 5.53125, "learning_rate": 1.8440714672441797e-05, "loss": 0.14, "loss_lm": 0.017881962470710278, "loss_seg": 0.122154351323843, "mean_token_accuracy": 0.9952300786972046, "num_tokens": 1960971268.0, "step": 4614 }, { "entropy": 0.01867151539772749, "epoch": 2.019695809169493, "grad_norm": 8.375, "learning_rate": 1.843800757985923e-05, "loss": 0.1362, "loss_lm": 0.015790966106578708, "loss_seg": 0.12036498729139566, "mean_token_accuracy": 0.9950428903102875, "num_tokens": 1961396486.0, "step": 4615 }, { "entropy": 0.01801439793780446, "epoch": 2.0201334938177045, "grad_norm": 4.75, "learning_rate": 1.8435300487276665e-05, "loss": 0.1172, "loss_lm": 0.016069088829681277, "loss_seg": 0.10115459375083447, "mean_token_accuracy": 0.9953538328409195, "num_tokens": 1961821074.0, "step": 4616 }, { "entropy": 0.01786490948870778, "epoch": 2.0205711784659153, "grad_norm": 5.1875, "learning_rate": 1.8432593394694102e-05, "loss": 0.1204, "loss_lm": 0.015935779782012105, "loss_seg": 0.1044551944360137, "mean_token_accuracy": 0.9953732639551163, "num_tokens": 1962245304.0, "step": 4617 }, { "entropy": 0.018679856322705746, "epoch": 2.021008863114126, "grad_norm": 4.125, "learning_rate": 1.8429886302111532e-05, "loss": 0.0853, "loss_lm": 0.015112571651116014, "loss_seg": 0.07020033244043589, "mean_token_accuracy": 0.9951175600290298, "num_tokens": 1962670878.0, "step": 4618 }, { "entropy": 0.018661270383745432, "epoch": 2.0214465477623373, "grad_norm": 4.46875, "learning_rate": 1.8427179209528966e-05, "loss": 0.126, "loss_lm": 0.015999980038031936, "loss_seg": 0.11000399477779865, "mean_token_accuracy": 0.99510857462883, "num_tokens": 1963096083.0, "step": 4619 }, { "entropy": 0.01874926034361124, "epoch": 2.021884232410548, "grad_norm": 5.3125, "learning_rate": 1.84244721169464e-05, "loss": 0.1209, "loss_lm": 0.02014406002126634, "loss_seg": 0.10079770162701607, "mean_token_accuracy": 0.9951106607913971, "num_tokens": 1963521290.0, "step": 4620 }, { "entropy": 0.018352119717746973, "epoch": 2.022321917058759, "grad_norm": 7.0625, "learning_rate": 1.8421765024363833e-05, "loss": 0.1315, "loss_lm": 0.017120782984420657, "loss_seg": 0.11434639059007168, "mean_token_accuracy": 0.9951345920562744, "num_tokens": 1963946693.0, "step": 4621 }, { "entropy": 0.018930401653051376, "epoch": 2.0227596017069702, "grad_norm": 8.9375, "learning_rate": 1.8419057931781267e-05, "loss": 0.1047, "loss_lm": 0.01661778660491109, "loss_seg": 0.08811025321483612, "mean_token_accuracy": 0.9950874000787735, "num_tokens": 1964371882.0, "step": 4622 }, { "entropy": 0.018631645012646914, "epoch": 2.023197286355181, "grad_norm": 8.125, "learning_rate": 1.84163508391987e-05, "loss": 0.1231, "loss_lm": 0.016012378269806504, "loss_seg": 0.1070463489741087, "mean_token_accuracy": 0.9951622039079666, "num_tokens": 1964797254.0, "step": 4623 }, { "entropy": 0.01836079964414239, "epoch": 2.023634971003392, "grad_norm": 4.96875, "learning_rate": 1.8413643746616135e-05, "loss": 0.0973, "loss_lm": 0.015605106484144926, "loss_seg": 0.0816953107714653, "mean_token_accuracy": 0.9951906949281693, "num_tokens": 1965222228.0, "step": 4624 }, { "entropy": 0.01818861812353134, "epoch": 2.024072655651603, "grad_norm": 2.6875, "learning_rate": 1.8410936654033568e-05, "loss": 0.1277, "loss_lm": 0.01749222748912871, "loss_seg": 0.11024670023471117, "mean_token_accuracy": 0.9952039271593094, "num_tokens": 1965647330.0, "step": 4625 }, { "entropy": 0.018560408148914576, "epoch": 2.024510340299814, "grad_norm": 3.90625, "learning_rate": 1.8408229561451002e-05, "loss": 0.1113, "loss_lm": 0.015695555601269007, "loss_seg": 0.09557640366256237, "mean_token_accuracy": 0.9953481703996658, "num_tokens": 1966072926.0, "step": 4626 }, { "entropy": 0.018335044384002686, "epoch": 2.024948024948025, "grad_norm": 6.0, "learning_rate": 1.8405522468868436e-05, "loss": 0.1292, "loss_lm": 0.016624316107481718, "loss_seg": 0.11257250979542732, "mean_token_accuracy": 0.9950839877128601, "num_tokens": 1966497827.0, "step": 4627 }, { "entropy": 0.018972650170326233, "epoch": 2.025385709596236, "grad_norm": 21.25, "learning_rate": 1.840281537628587e-05, "loss": 0.1234, "loss_lm": 0.017436936497688293, "loss_seg": 0.10596250928938389, "mean_token_accuracy": 0.9951432794332504, "num_tokens": 1966922764.0, "step": 4628 }, { "entropy": 0.018953539431095123, "epoch": 2.025823394244447, "grad_norm": 6.3125, "learning_rate": 1.8400108283703303e-05, "loss": 0.0832, "loss_lm": 0.015933045651763678, "loss_seg": 0.06730037275701761, "mean_token_accuracy": 0.9951222687959671, "num_tokens": 1967348351.0, "step": 4629 }, { "entropy": 0.0177430952899158, "epoch": 2.026261078892658, "grad_norm": 3.8125, "learning_rate": 1.8397401191120737e-05, "loss": 0.1127, "loss_lm": 0.01659848215058446, "loss_seg": 0.09612455405294895, "mean_token_accuracy": 0.9953793436288834, "num_tokens": 1967773568.0, "step": 4630 }, { "entropy": 0.017349444329738617, "epoch": 2.026698763540869, "grad_norm": 7.4375, "learning_rate": 1.839469409853817e-05, "loss": 0.1598, "loss_lm": 0.015141620533540845, "loss_seg": 0.1446632817387581, "mean_token_accuracy": 0.9953673928976059, "num_tokens": 1968197921.0, "step": 4631 }, { "entropy": 0.018274350091814995, "epoch": 2.0271364481890797, "grad_norm": 7.03125, "learning_rate": 1.8391987005955604e-05, "loss": 0.1335, "loss_lm": 0.016639247303828597, "loss_seg": 0.11688241176307201, "mean_token_accuracy": 0.9951366782188416, "num_tokens": 1968622684.0, "step": 4632 }, { "entropy": 0.018707833252847195, "epoch": 2.027574132837291, "grad_norm": 3.46875, "learning_rate": 1.8389279913373038e-05, "loss": 0.1069, "loss_lm": 0.019952430855482817, "loss_seg": 0.08697657473385334, "mean_token_accuracy": 0.995025247335434, "num_tokens": 1969048219.0, "step": 4633 }, { "entropy": 0.018403666093945503, "epoch": 2.0280118174855017, "grad_norm": 7.53125, "learning_rate": 1.8386572820790472e-05, "loss": 0.1153, "loss_lm": 0.016233011847361922, "loss_seg": 0.09906693734228611, "mean_token_accuracy": 0.9952583014965057, "num_tokens": 1969473098.0, "step": 4634 }, { "entropy": 0.018738458398729563, "epoch": 2.0284495021337126, "grad_norm": 8.5625, "learning_rate": 1.8383865728207906e-05, "loss": 0.1107, "loss_lm": 0.018248681677505374, "loss_seg": 0.09247363824397326, "mean_token_accuracy": 0.9952712655067444, "num_tokens": 1969898725.0, "step": 4635 }, { "entropy": 0.018482462503015995, "epoch": 2.028887186781924, "grad_norm": 3.984375, "learning_rate": 1.838115863562534e-05, "loss": 0.1178, "loss_lm": 0.014561953023076057, "loss_seg": 0.10328197665512562, "mean_token_accuracy": 0.9953217655420303, "num_tokens": 1970324056.0, "step": 4636 }, { "entropy": 0.018224340863525867, "epoch": 2.0293248714301346, "grad_norm": 12.6875, "learning_rate": 1.837845154304277e-05, "loss": 0.1059, "loss_lm": 0.016000590985640883, "loss_seg": 0.08989579416811466, "mean_token_accuracy": 0.9953858852386475, "num_tokens": 1970749581.0, "step": 4637 }, { "entropy": 0.017994412686675787, "epoch": 2.0297625560783454, "grad_norm": 9.0625, "learning_rate": 1.8375744450460207e-05, "loss": 0.1209, "loss_lm": 0.016272890148684382, "loss_seg": 0.10464406479150057, "mean_token_accuracy": 0.995452955365181, "num_tokens": 1971174195.0, "step": 4638 }, { "entropy": 0.018662966787815094, "epoch": 2.0302002407265567, "grad_norm": 9.4375, "learning_rate": 1.837303735787764e-05, "loss": 0.0963, "loss_lm": 0.015428715152665973, "loss_seg": 0.0808809082955122, "mean_token_accuracy": 0.9951627999544144, "num_tokens": 1971598789.0, "step": 4639 }, { "entropy": 0.018961755093187094, "epoch": 2.0306379253747675, "grad_norm": 6.78125, "learning_rate": 1.8370330265295074e-05, "loss": 0.126, "loss_lm": 0.017046986147761345, "loss_seg": 0.10895802360028028, "mean_token_accuracy": 0.9950749576091766, "num_tokens": 1972023919.0, "step": 4640 }, { "entropy": 0.018070037476718426, "epoch": 2.0310756100229783, "grad_norm": 24.125, "learning_rate": 1.8367623172712508e-05, "loss": 0.1216, "loss_lm": 0.015855313278734684, "loss_seg": 0.10575441643595695, "mean_token_accuracy": 0.9952016174793243, "num_tokens": 1972448614.0, "step": 4641 }, { "entropy": 0.01843160204589367, "epoch": 2.0315132946711896, "grad_norm": 8.25, "learning_rate": 1.8364916080129938e-05, "loss": 0.1148, "loss_lm": 0.015598098048940301, "loss_seg": 0.09921607468277216, "mean_token_accuracy": 0.9951865524053574, "num_tokens": 1972873645.0, "step": 4642 }, { "entropy": 0.01846398040652275, "epoch": 2.0319509793194004, "grad_norm": 5.0625, "learning_rate": 1.8362208987547375e-05, "loss": 0.1302, "loss_lm": 0.016641272697597742, "loss_seg": 0.11351834610104561, "mean_token_accuracy": 0.9952640384435654, "num_tokens": 1973298438.0, "step": 4643 }, { "entropy": 0.01847362145781517, "epoch": 2.032388663967611, "grad_norm": 3.859375, "learning_rate": 1.835950189496481e-05, "loss": 0.1151, "loss_lm": 0.015820072265341878, "loss_seg": 0.09924967959523201, "mean_token_accuracy": 0.9951046407222748, "num_tokens": 1973723315.0, "step": 4644 }, { "entropy": 0.018645514268428087, "epoch": 2.0328263486158225, "grad_norm": 3.234375, "learning_rate": 1.8356794802382243e-05, "loss": 0.1006, "loss_lm": 0.016754287527874112, "loss_seg": 0.08385720662772655, "mean_token_accuracy": 0.9952391684055328, "num_tokens": 1974148128.0, "step": 4645 }, { "entropy": 0.01793649699538946, "epoch": 2.0332640332640333, "grad_norm": 10.8125, "learning_rate": 1.8354087709799673e-05, "loss": 0.1148, "loss_lm": 0.014995601726695895, "loss_seg": 0.09979110956192017, "mean_token_accuracy": 0.9953984022140503, "num_tokens": 1974573092.0, "step": 4646 }, { "entropy": 0.019006929360330105, "epoch": 2.033701717912244, "grad_norm": 3.6875, "learning_rate": 1.8351380617217107e-05, "loss": 0.1009, "loss_lm": 0.016951743280515075, "loss_seg": 0.08393320720642805, "mean_token_accuracy": 0.995080903172493, "num_tokens": 1974997944.0, "step": 4647 }, { "entropy": 0.017854667734354734, "epoch": 2.0341394025604553, "grad_norm": 3.890625, "learning_rate": 1.8348673524634544e-05, "loss": 0.1285, "loss_lm": 0.01387255359441042, "loss_seg": 0.11466312408447266, "mean_token_accuracy": 0.9954663068056107, "num_tokens": 1975423114.0, "step": 4648 }, { "entropy": 0.01809165207669139, "epoch": 2.034577087208666, "grad_norm": 4.40625, "learning_rate": 1.8345966432051978e-05, "loss": 0.0817, "loss_lm": 0.014183678198605776, "loss_seg": 0.06746672373265028, "mean_token_accuracy": 0.9953215420246124, "num_tokens": 1975848431.0, "step": 4649 }, { "entropy": 0.01819435227662325, "epoch": 2.035014771856877, "grad_norm": 8.75, "learning_rate": 1.834325933946941e-05, "loss": 0.1147, "loss_lm": 0.014727371046319604, "loss_seg": 0.09999374486505985, "mean_token_accuracy": 0.9951731860637665, "num_tokens": 1976273524.0, "step": 4650 }, { "entropy": 0.01814964460209012, "epoch": 2.035452456505088, "grad_norm": 23.0, "learning_rate": 1.8340552246886842e-05, "loss": 0.1141, "loss_lm": 0.01714368164539337, "loss_seg": 0.09694972820580006, "mean_token_accuracy": 0.9952953308820724, "num_tokens": 1976698581.0, "step": 4651 }, { "entropy": 0.018068383913487196, "epoch": 2.035890141153299, "grad_norm": 4.0625, "learning_rate": 1.8337845154304276e-05, "loss": 0.0998, "loss_lm": 0.015052158618345857, "loss_seg": 0.08475905284285545, "mean_token_accuracy": 0.9953070878982544, "num_tokens": 1977123438.0, "step": 4652 }, { "entropy": 0.018593650311231613, "epoch": 2.03632782580151, "grad_norm": 4.09375, "learning_rate": 1.8335138061721713e-05, "loss": 0.0954, "loss_lm": 0.015572499250993133, "loss_seg": 0.07984672952443361, "mean_token_accuracy": 0.9952999204397202, "num_tokens": 1977548476.0, "step": 4653 }, { "entropy": 0.01842557080090046, "epoch": 2.036765510449721, "grad_norm": 8.5, "learning_rate": 1.8332430969139146e-05, "loss": 0.1153, "loss_lm": 0.015398658812046051, "loss_seg": 0.09988761506974697, "mean_token_accuracy": 0.9951883256435394, "num_tokens": 1977973475.0, "step": 4654 }, { "entropy": 0.01790665229782462, "epoch": 2.037203195097932, "grad_norm": 6.4375, "learning_rate": 1.832972387655658e-05, "loss": 0.1338, "loss_lm": 0.017279466148465872, "loss_seg": 0.11656343191862106, "mean_token_accuracy": 0.9952960461378098, "num_tokens": 1978398824.0, "step": 4655 }, { "entropy": 0.018362045753747225, "epoch": 2.0376408797461427, "grad_norm": 7.0625, "learning_rate": 1.832701678397401e-05, "loss": 0.1388, "loss_lm": 0.016826985171064734, "loss_seg": 0.12196228839457035, "mean_token_accuracy": 0.9952237010002136, "num_tokens": 1978824628.0, "step": 4656 }, { "entropy": 0.01833869144320488, "epoch": 2.038078564394354, "grad_norm": 7.96875, "learning_rate": 1.8324309691391444e-05, "loss": 0.0917, "loss_lm": 0.0157511152792722, "loss_seg": 0.07594858296215534, "mean_token_accuracy": 0.995289072394371, "num_tokens": 1979249337.0, "step": 4657 }, { "entropy": 0.018601769115775824, "epoch": 2.038516249042565, "grad_norm": 4.65625, "learning_rate": 1.832160259880888e-05, "loss": 0.1312, "loss_lm": 0.01608235528692603, "loss_seg": 0.11511625349521637, "mean_token_accuracy": 0.9952849745750427, "num_tokens": 1979674315.0, "step": 4658 }, { "entropy": 0.017834320664405823, "epoch": 2.0389539336907756, "grad_norm": 17.625, "learning_rate": 1.8318895506226315e-05, "loss": 0.1201, "loss_lm": 0.017665260704234242, "loss_seg": 0.10246572270989418, "mean_token_accuracy": 0.9953113049268723, "num_tokens": 1980100115.0, "step": 4659 }, { "entropy": 0.01843363232910633, "epoch": 2.039391618338987, "grad_norm": 11.4375, "learning_rate": 1.831618841364375e-05, "loss": 0.109, "loss_lm": 0.01770733715966344, "loss_seg": 0.09131767600774765, "mean_token_accuracy": 0.9952148944139481, "num_tokens": 1980524938.0, "step": 4660 }, { "entropy": 0.017905336804687977, "epoch": 2.0398293029871977, "grad_norm": 4.90625, "learning_rate": 1.831348132106118e-05, "loss": 0.1127, "loss_lm": 0.015881522791460156, "loss_seg": 0.09679537825286388, "mean_token_accuracy": 0.9953277260065079, "num_tokens": 1980949720.0, "step": 4661 }, { "entropy": 0.01844281144440174, "epoch": 2.0402669876354085, "grad_norm": 3.796875, "learning_rate": 1.8310774228478613e-05, "loss": 0.1147, "loss_lm": 0.015398075804114342, "loss_seg": 0.09928062092512846, "mean_token_accuracy": 0.9951780736446381, "num_tokens": 1981374207.0, "step": 4662 }, { "entropy": 0.01753762224689126, "epoch": 2.0407046722836197, "grad_norm": 8.0, "learning_rate": 1.830806713589605e-05, "loss": 0.1375, "loss_lm": 0.017386843683198094, "loss_seg": 0.12014167010784149, "mean_token_accuracy": 0.9954444617033005, "num_tokens": 1981799560.0, "step": 4663 }, { "entropy": 0.01941503118723631, "epoch": 2.0411423569318305, "grad_norm": 5.96875, "learning_rate": 1.8305360043313484e-05, "loss": 0.081, "loss_lm": 0.016791701782494783, "loss_seg": 0.06425229460000992, "mean_token_accuracy": 0.9949796944856644, "num_tokens": 1982225019.0, "step": 4664 }, { "entropy": 0.017950847279280424, "epoch": 2.0415800415800414, "grad_norm": 9.5625, "learning_rate": 1.8302652950730917e-05, "loss": 0.1421, "loss_lm": 0.016364445444196463, "loss_seg": 0.12569972593337297, "mean_token_accuracy": 0.995280310511589, "num_tokens": 1982649577.0, "step": 4665 }, { "entropy": 0.018282489851117134, "epoch": 2.0420177262282526, "grad_norm": 17.375, "learning_rate": 1.8299945858148348e-05, "loss": 0.1084, "loss_lm": 0.017665685154497623, "loss_seg": 0.09077880997210741, "mean_token_accuracy": 0.9952960014343262, "num_tokens": 1983074227.0, "step": 4666 }, { "entropy": 0.01842584228143096, "epoch": 2.0424554108764634, "grad_norm": 4.65625, "learning_rate": 1.829723876556578e-05, "loss": 0.1018, "loss_lm": 0.016489822417497635, "loss_seg": 0.0852684248238802, "mean_token_accuracy": 0.995248481631279, "num_tokens": 1983498885.0, "step": 4667 }, { "entropy": 0.01845249580219388, "epoch": 2.0428930955246747, "grad_norm": 4.125, "learning_rate": 1.8294531672983215e-05, "loss": 0.1541, "loss_lm": 0.016235313843935728, "loss_seg": 0.1378953456878662, "mean_token_accuracy": 0.9952463656663895, "num_tokens": 1983923569.0, "step": 4668 }, { "entropy": 0.018696608021855354, "epoch": 2.0433307801728855, "grad_norm": 4.5625, "learning_rate": 1.8291824580400652e-05, "loss": 0.1143, "loss_lm": 0.01535135437734425, "loss_seg": 0.09893925115466118, "mean_token_accuracy": 0.9952539205551147, "num_tokens": 1984348307.0, "step": 4669 }, { "entropy": 0.018622500356286764, "epoch": 2.0437684648210963, "grad_norm": 9.375, "learning_rate": 1.8289117487818083e-05, "loss": 0.1371, "loss_lm": 0.018114925595000386, "loss_seg": 0.11902480572462082, "mean_token_accuracy": 0.9952128976583481, "num_tokens": 1984773381.0, "step": 4670 }, { "entropy": 0.018386011011898518, "epoch": 2.0442061494693076, "grad_norm": 4.03125, "learning_rate": 1.8286410395235516e-05, "loss": 0.1117, "loss_lm": 0.01673330133780837, "loss_seg": 0.09497719444334507, "mean_token_accuracy": 0.9952259659767151, "num_tokens": 1985198528.0, "step": 4671 }, { "entropy": 0.018133231904357672, "epoch": 2.0446438341175184, "grad_norm": 10.4375, "learning_rate": 1.828370330265295e-05, "loss": 0.1076, "loss_lm": 0.014813044341281056, "loss_seg": 0.09280285984277725, "mean_token_accuracy": 0.9951943755149841, "num_tokens": 1985622968.0, "step": 4672 }, { "entropy": 0.01811046339571476, "epoch": 2.045081518765729, "grad_norm": 7.21875, "learning_rate": 1.8280996210070384e-05, "loss": 0.0685, "loss_lm": 0.01297018420882523, "loss_seg": 0.05550184287130833, "mean_token_accuracy": 0.9954026192426682, "num_tokens": 1986047942.0, "step": 4673 }, { "entropy": 0.01803143136203289, "epoch": 2.0455192034139404, "grad_norm": 9.5625, "learning_rate": 1.827828911748782e-05, "loss": 0.1195, "loss_lm": 0.014487816719338298, "loss_seg": 0.10502595826983452, "mean_token_accuracy": 0.9952885806560516, "num_tokens": 1986473237.0, "step": 4674 }, { "entropy": 0.017943446058779955, "epoch": 2.0459568880621513, "grad_norm": 12.4375, "learning_rate": 1.827558202490525e-05, "loss": 0.0961, "loss_lm": 0.014646551106125116, "loss_seg": 0.08141489885747433, "mean_token_accuracy": 0.9953559935092926, "num_tokens": 1986898015.0, "step": 4675 }, { "entropy": 0.018120315857231617, "epoch": 2.046394572710362, "grad_norm": 4.1875, "learning_rate": 1.8272874932322685e-05, "loss": 0.1073, "loss_lm": 0.01710353954695165, "loss_seg": 0.09019418992102146, "mean_token_accuracy": 0.9953013509511948, "num_tokens": 1987323644.0, "step": 4676 }, { "entropy": 0.0190488463267684, "epoch": 2.0468322573585733, "grad_norm": 6.90625, "learning_rate": 1.827016783974012e-05, "loss": 0.1133, "loss_lm": 0.016038397327065468, "loss_seg": 0.09730791859328747, "mean_token_accuracy": 0.99508236348629, "num_tokens": 1987749422.0, "step": 4677 }, { "entropy": 0.01825674157589674, "epoch": 2.047269942006784, "grad_norm": 4.59375, "learning_rate": 1.8267460747157552e-05, "loss": 0.0972, "loss_lm": 0.013920532073825598, "loss_seg": 0.08328593522310257, "mean_token_accuracy": 0.9952553510665894, "num_tokens": 1988174468.0, "step": 4678 }, { "entropy": 0.018475713208317757, "epoch": 2.047707626654995, "grad_norm": 5.46875, "learning_rate": 1.826475365457499e-05, "loss": 0.1308, "loss_lm": 0.01888511050492525, "loss_seg": 0.11187122762203217, "mean_token_accuracy": 0.9953818172216415, "num_tokens": 1988599457.0, "step": 4679 }, { "entropy": 0.01838419632986188, "epoch": 2.048145311303206, "grad_norm": 19.375, "learning_rate": 1.826204656199242e-05, "loss": 0.1564, "loss_lm": 0.016476238146424294, "loss_seg": 0.13995778560638428, "mean_token_accuracy": 0.9951598793268204, "num_tokens": 1989024209.0, "step": 4680 }, { "entropy": 0.01813562959432602, "epoch": 2.048582995951417, "grad_norm": 10.0625, "learning_rate": 1.8259339469409854e-05, "loss": 0.1013, "loss_lm": 0.01576184551231563, "loss_seg": 0.08553320169448853, "mean_token_accuracy": 0.9952204376459122, "num_tokens": 1989449752.0, "step": 4681 }, { "entropy": 0.018420368432998657, "epoch": 2.049020680599628, "grad_norm": 6.34375, "learning_rate": 1.8256632376827287e-05, "loss": 0.127, "loss_lm": 0.015715980669483542, "loss_seg": 0.11128188483417034, "mean_token_accuracy": 0.9952048659324646, "num_tokens": 1989874934.0, "step": 4682 }, { "entropy": 0.017926813568919897, "epoch": 2.049458365247839, "grad_norm": 6.375, "learning_rate": 1.825392528424472e-05, "loss": 0.1055, "loss_lm": 0.015652603469789028, "loss_seg": 0.08986819162964821, "mean_token_accuracy": 0.9953450709581375, "num_tokens": 1990300235.0, "step": 4683 }, { "entropy": 0.018482336308807135, "epoch": 2.04989604989605, "grad_norm": 4.125, "learning_rate": 1.8251218191662158e-05, "loss": 0.1157, "loss_lm": 0.015908848261460662, "loss_seg": 0.0998306181281805, "mean_token_accuracy": 0.9951954036951065, "num_tokens": 1990724810.0, "step": 4684 }, { "entropy": 0.01897075865417719, "epoch": 2.0503337345442607, "grad_norm": 3.96875, "learning_rate": 1.824851109907959e-05, "loss": 0.1254, "loss_lm": 0.01814114092849195, "loss_seg": 0.10723565146327019, "mean_token_accuracy": 0.9950720369815826, "num_tokens": 1991149906.0, "step": 4685 }, { "entropy": 0.01861028466373682, "epoch": 2.050771419192472, "grad_norm": 7.90625, "learning_rate": 1.8245804006497022e-05, "loss": 0.1262, "loss_lm": 0.016897609690204263, "loss_seg": 0.10926751233637333, "mean_token_accuracy": 0.99525086581707, "num_tokens": 1991576046.0, "step": 4686 }, { "entropy": 0.018312462139874697, "epoch": 2.051209103840683, "grad_norm": 6.4375, "learning_rate": 1.8243096913914456e-05, "loss": 0.1117, "loss_lm": 0.016741702565923333, "loss_seg": 0.0949579318985343, "mean_token_accuracy": 0.9952111542224884, "num_tokens": 1992000969.0, "step": 4687 }, { "entropy": 0.01800539344549179, "epoch": 2.0516467884888936, "grad_norm": 10.4375, "learning_rate": 1.824038982133189e-05, "loss": 0.1009, "loss_lm": 0.014942708658054471, "loss_seg": 0.08600160852074623, "mean_token_accuracy": 0.9953039139509201, "num_tokens": 1992425519.0, "step": 4688 }, { "entropy": 0.017956853844225407, "epoch": 2.052084473137105, "grad_norm": 5.375, "learning_rate": 1.8237682728749327e-05, "loss": 0.1155, "loss_lm": 0.015922384103760123, "loss_seg": 0.09960995055735111, "mean_token_accuracy": 0.995209202170372, "num_tokens": 1992850258.0, "step": 4689 }, { "entropy": 0.018196705263108015, "epoch": 2.0525221577853157, "grad_norm": 4.6875, "learning_rate": 1.8234975636166757e-05, "loss": 0.1313, "loss_lm": 0.017228198470547795, "loss_seg": 0.1141100600361824, "mean_token_accuracy": 0.9953652620315552, "num_tokens": 1993274420.0, "step": 4690 }, { "entropy": 0.018216188997030258, "epoch": 2.0529598424335265, "grad_norm": 9.75, "learning_rate": 1.823226854358419e-05, "loss": 0.1116, "loss_lm": 0.018094833474606276, "loss_seg": 0.09354982431977987, "mean_token_accuracy": 0.995089054107666, "num_tokens": 1993700406.0, "step": 4691 }, { "entropy": 0.018443867564201355, "epoch": 2.0533975270817377, "grad_norm": 21.625, "learning_rate": 1.8229561451001625e-05, "loss": 0.1207, "loss_lm": 0.015807594638317823, "loss_seg": 0.1048816628754139, "mean_token_accuracy": 0.9951989054679871, "num_tokens": 1994125079.0, "step": 4692 }, { "entropy": 0.018500241916626692, "epoch": 2.0538352117299485, "grad_norm": 24.125, "learning_rate": 1.8226854358419058e-05, "loss": 0.0965, "loss_lm": 0.015112560242414474, "loss_seg": 0.08142761141061783, "mean_token_accuracy": 0.9951515346765518, "num_tokens": 1994550331.0, "step": 4693 }, { "entropy": 0.01856413623318076, "epoch": 2.0542728963781594, "grad_norm": 3.640625, "learning_rate": 1.8224147265836492e-05, "loss": 0.106, "loss_lm": 0.01596854766830802, "loss_seg": 0.09002730529755354, "mean_token_accuracy": 0.9950579553842545, "num_tokens": 1994975641.0, "step": 4694 }, { "entropy": 0.017628342378884554, "epoch": 2.0547105810263706, "grad_norm": 31.875, "learning_rate": 1.8221440173253926e-05, "loss": 0.1238, "loss_lm": 0.01577804167754948, "loss_seg": 0.10805537179112434, "mean_token_accuracy": 0.9953127950429916, "num_tokens": 1995400579.0, "step": 4695 }, { "entropy": 0.017749148420989513, "epoch": 2.0551482656745814, "grad_norm": 5.90625, "learning_rate": 1.821873308067136e-05, "loss": 0.0858, "loss_lm": 0.014210725668817759, "loss_seg": 0.0715905949473381, "mean_token_accuracy": 0.9953925609588623, "num_tokens": 1995825500.0, "step": 4696 }, { "entropy": 0.018256441224366426, "epoch": 2.0555859503227922, "grad_norm": 12.3125, "learning_rate": 1.8216025988088793e-05, "loss": 0.1158, "loss_lm": 0.016407204559072852, "loss_seg": 0.09939482435584068, "mean_token_accuracy": 0.9951439946889877, "num_tokens": 1996250706.0, "step": 4697 }, { "entropy": 0.018534226808696985, "epoch": 2.0560236349710035, "grad_norm": 6.4375, "learning_rate": 1.8213318895506227e-05, "loss": 0.1001, "loss_lm": 0.016752601834014058, "loss_seg": 0.0833891648799181, "mean_token_accuracy": 0.9952222406864166, "num_tokens": 1996676393.0, "step": 4698 }, { "entropy": 0.018555324990302324, "epoch": 2.0564613196192143, "grad_norm": 5.09375, "learning_rate": 1.821061180292366e-05, "loss": 0.1497, "loss_lm": 0.016525010345503688, "loss_seg": 0.13312516920268536, "mean_token_accuracy": 0.9952254891395569, "num_tokens": 1997101540.0, "step": 4699 }, { "entropy": 0.018682447262108326, "epoch": 2.056899004267425, "grad_norm": 5.78125, "learning_rate": 1.8207904710341094e-05, "loss": 0.0924, "loss_lm": 0.015934639610350132, "loss_seg": 0.07649212703108788, "mean_token_accuracy": 0.9951595216989517, "num_tokens": 1997526471.0, "step": 4700 }, { "entropy": 0.018389816861599684, "epoch": 2.0573366889156364, "grad_norm": 7.5625, "learning_rate": 1.8205197617758528e-05, "loss": 0.1058, "loss_lm": 0.014931393088772893, "loss_seg": 0.09084814041852951, "mean_token_accuracy": 0.99513378739357, "num_tokens": 1997951405.0, "step": 4701 }, { "entropy": 0.01787047740072012, "epoch": 2.057774373563847, "grad_norm": 13.0, "learning_rate": 1.8202490525175962e-05, "loss": 0.1233, "loss_lm": 0.016092450357973576, "loss_seg": 0.10721446946263313, "mean_token_accuracy": 0.9952602684497833, "num_tokens": 1998376506.0, "step": 4702 }, { "entropy": 0.018836334347724915, "epoch": 2.0582120582120584, "grad_norm": 5.53125, "learning_rate": 1.8199783432593396e-05, "loss": 0.1156, "loss_lm": 0.016798452474176884, "loss_seg": 0.09882148075848818, "mean_token_accuracy": 0.9951850026845932, "num_tokens": 1998802210.0, "step": 4703 }, { "entropy": 0.018192145507782698, "epoch": 2.0586497428602692, "grad_norm": 8.75, "learning_rate": 1.8197076340010826e-05, "loss": 0.1288, "loss_lm": 0.017983461264520884, "loss_seg": 0.11084199883043766, "mean_token_accuracy": 0.9951937645673752, "num_tokens": 1999226997.0, "step": 4704 }, { "entropy": 0.017698975279927254, "epoch": 2.05908742750848, "grad_norm": 7.90625, "learning_rate": 1.8194369247428263e-05, "loss": 0.1253, "loss_lm": 0.014190236339345574, "loss_seg": 0.11108827218413353, "mean_token_accuracy": 0.9954740554094315, "num_tokens": 1999651578.0, "step": 4705 }, { "entropy": 0.01864288281649351, "epoch": 2.0595251121566913, "grad_norm": 21.125, "learning_rate": 1.8191662154845697e-05, "loss": 0.132, "loss_lm": 0.016162059968337417, "loss_seg": 0.11588721722364426, "mean_token_accuracy": 0.9952197074890137, "num_tokens": 2000076615.0, "step": 4706 }, { "entropy": 0.01770889898762107, "epoch": 2.059962796804902, "grad_norm": 5.8125, "learning_rate": 1.818895506226313e-05, "loss": 0.0915, "loss_lm": 0.014030372491106391, "loss_seg": 0.07742181606590748, "mean_token_accuracy": 0.9952491819858551, "num_tokens": 2000502005.0, "step": 4707 }, { "entropy": 0.019041622057557106, "epoch": 2.060400481453113, "grad_norm": 8.0625, "learning_rate": 1.8186247969680564e-05, "loss": 0.0948, "loss_lm": 0.016931703547015786, "loss_seg": 0.07790016010403633, "mean_token_accuracy": 0.9950511902570724, "num_tokens": 2000927323.0, "step": 4708 }, { "entropy": 0.018301508855074644, "epoch": 2.060838166101324, "grad_norm": 7.59375, "learning_rate": 1.8183540877097995e-05, "loss": 0.1102, "loss_lm": 0.016011921921744943, "loss_seg": 0.0941432323306799, "mean_token_accuracy": 0.9952201247215271, "num_tokens": 2001353236.0, "step": 4709 }, { "entropy": 0.018412234261631966, "epoch": 2.061275850749535, "grad_norm": 6.8125, "learning_rate": 1.818083378451543e-05, "loss": 0.1171, "loss_lm": 0.01771746575832367, "loss_seg": 0.0993381179869175, "mean_token_accuracy": 0.9951674342155457, "num_tokens": 2001778041.0, "step": 4710 }, { "entropy": 0.018428722862154245, "epoch": 2.061713535397746, "grad_norm": 11.0625, "learning_rate": 1.8178126691932865e-05, "loss": 0.1032, "loss_lm": 0.01578331203199923, "loss_seg": 0.08738739974796772, "mean_token_accuracy": 0.9951863288879395, "num_tokens": 2002202847.0, "step": 4711 }, { "entropy": 0.01873893989250064, "epoch": 2.062151220045957, "grad_norm": 5.0625, "learning_rate": 1.81754195993503e-05, "loss": 0.0984, "loss_lm": 0.018158637220039964, "loss_seg": 0.08021501824259758, "mean_token_accuracy": 0.9950411021709442, "num_tokens": 2002628142.0, "step": 4712 }, { "entropy": 0.01871205447241664, "epoch": 2.062588904694168, "grad_norm": 7.0625, "learning_rate": 1.8172712506767733e-05, "loss": 0.1225, "loss_lm": 0.016327326418831944, "loss_seg": 0.10618742741644382, "mean_token_accuracy": 0.9951852858066559, "num_tokens": 2003053685.0, "step": 4713 }, { "entropy": 0.018410335760563612, "epoch": 2.0630265893423787, "grad_norm": 6.125, "learning_rate": 1.8170005414185163e-05, "loss": 0.1205, "loss_lm": 0.016306031495332718, "loss_seg": 0.10419333726167679, "mean_token_accuracy": 0.9951290637254715, "num_tokens": 2003478623.0, "step": 4714 }, { "entropy": 0.018002115190029144, "epoch": 2.06346427399059, "grad_norm": 8.625, "learning_rate": 1.81672983216026e-05, "loss": 0.1186, "loss_lm": 0.01603061449714005, "loss_seg": 0.102598050609231, "mean_token_accuracy": 0.9953268319368362, "num_tokens": 2003904217.0, "step": 4715 }, { "entropy": 0.01764148147776723, "epoch": 2.0639019586388008, "grad_norm": 6.03125, "learning_rate": 1.8164591229020034e-05, "loss": 0.096, "loss_lm": 0.016202463768422604, "loss_seg": 0.07984444685280323, "mean_token_accuracy": 0.9955211281776428, "num_tokens": 2004330161.0, "step": 4716 }, { "entropy": 0.018795667216181755, "epoch": 2.0643396432870116, "grad_norm": 30.25, "learning_rate": 1.8161884136437468e-05, "loss": 0.1084, "loss_lm": 0.017025237902998924, "loss_seg": 0.09133178554475307, "mean_token_accuracy": 0.9949986636638641, "num_tokens": 2004755615.0, "step": 4717 }, { "entropy": 0.018146013841032982, "epoch": 2.064777327935223, "grad_norm": 9.0625, "learning_rate": 1.8159177043854898e-05, "loss": 0.0838, "loss_lm": 0.015593335963785648, "loss_seg": 0.0682185972109437, "mean_token_accuracy": 0.9951962828636169, "num_tokens": 2005180682.0, "step": 4718 }, { "entropy": 0.018491055350750685, "epoch": 2.0652150125834337, "grad_norm": 7.3125, "learning_rate": 1.8156469951272332e-05, "loss": 0.1272, "loss_lm": 0.0174594447016716, "loss_seg": 0.10974567197263241, "mean_token_accuracy": 0.9951391965150833, "num_tokens": 2005605278.0, "step": 4719 }, { "entropy": 0.018247149884700775, "epoch": 2.0656526972316445, "grad_norm": 35.5, "learning_rate": 1.815376285868977e-05, "loss": 0.1, "loss_lm": 0.013461619848385453, "loss_seg": 0.08652906399220228, "mean_token_accuracy": 0.9952795803546906, "num_tokens": 2006029963.0, "step": 4720 }, { "entropy": 0.017671705223619938, "epoch": 2.0660903818798557, "grad_norm": 5.03125, "learning_rate": 1.8151055766107203e-05, "loss": 0.1241, "loss_lm": 0.018186345230787992, "loss_seg": 0.1058898251503706, "mean_token_accuracy": 0.9953464716672897, "num_tokens": 2006454983.0, "step": 4721 }, { "entropy": 0.018203664105385542, "epoch": 2.0665280665280665, "grad_norm": 5.28125, "learning_rate": 1.8148348673524636e-05, "loss": 0.0902, "loss_lm": 0.017851657001301646, "loss_seg": 0.07234193291515112, "mean_token_accuracy": 0.9952331483364105, "num_tokens": 2006879741.0, "step": 4722 }, { "entropy": 0.018027467653155327, "epoch": 2.0669657511762773, "grad_norm": 4.65625, "learning_rate": 1.8145641580942067e-05, "loss": 0.1097, "loss_lm": 0.016275564907118678, "loss_seg": 0.09338301047682762, "mean_token_accuracy": 0.9953862279653549, "num_tokens": 2007304701.0, "step": 4723 }, { "entropy": 0.018523951526731253, "epoch": 2.0674034358244886, "grad_norm": 9.625, "learning_rate": 1.81429344883595e-05, "loss": 0.1225, "loss_lm": 0.016087952768430114, "loss_seg": 0.10637900233268738, "mean_token_accuracy": 0.9951433837413788, "num_tokens": 2007729641.0, "step": 4724 }, { "entropy": 0.01918576192110777, "epoch": 2.0678411204726994, "grad_norm": 13.0625, "learning_rate": 1.8140227395776938e-05, "loss": 0.1414, "loss_lm": 0.01836826791986823, "loss_seg": 0.12306689284741879, "mean_token_accuracy": 0.9952262043952942, "num_tokens": 2008154170.0, "step": 4725 }, { "entropy": 0.01909754192456603, "epoch": 2.0682788051209102, "grad_norm": 5.84375, "learning_rate": 1.813752030319437e-05, "loss": 0.1139, "loss_lm": 0.01717291632667184, "loss_seg": 0.09676858223974705, "mean_token_accuracy": 0.9950835257768631, "num_tokens": 2008579059.0, "step": 4726 }, { "entropy": 0.01809642557054758, "epoch": 2.0687164897691215, "grad_norm": 20.125, "learning_rate": 1.8134813210611805e-05, "loss": 0.1018, "loss_lm": 0.014719654805958271, "loss_seg": 0.08711830340325832, "mean_token_accuracy": 0.9952242374420166, "num_tokens": 2009003677.0, "step": 4727 }, { "entropy": 0.018092551734298468, "epoch": 2.0691541744173323, "grad_norm": 31.375, "learning_rate": 1.8132106118029235e-05, "loss": 0.0996, "loss_lm": 0.01818913221359253, "loss_seg": 0.08138387277722359, "mean_token_accuracy": 0.9951699078083038, "num_tokens": 2009429115.0, "step": 4728 }, { "entropy": 0.017688465770334005, "epoch": 2.069591859065543, "grad_norm": 47.25, "learning_rate": 1.812939902544667e-05, "loss": 0.089, "loss_lm": 0.013847432099282742, "loss_seg": 0.07510659378021955, "mean_token_accuracy": 0.99538853764534, "num_tokens": 2009853205.0, "step": 4729 }, { "entropy": 0.018307768274098635, "epoch": 2.0700295437137544, "grad_norm": 4.90625, "learning_rate": 1.8126691932864106e-05, "loss": 0.081, "loss_lm": 0.016644265269860625, "loss_seg": 0.06430924031883478, "mean_token_accuracy": 0.9952335953712463, "num_tokens": 2010278015.0, "step": 4730 }, { "entropy": 0.01780667621642351, "epoch": 2.070467228361965, "grad_norm": 6.75, "learning_rate": 1.812398484028154e-05, "loss": 0.1153, "loss_lm": 0.01665338291786611, "loss_seg": 0.09862660896033049, "mean_token_accuracy": 0.9953517317771912, "num_tokens": 2010703286.0, "step": 4731 }, { "entropy": 0.018310914747416973, "epoch": 2.070904913010176, "grad_norm": 21.125, "learning_rate": 1.8121277747698974e-05, "loss": 0.0989, "loss_lm": 0.0168808379676193, "loss_seg": 0.08199607953429222, "mean_token_accuracy": 0.9953507035970688, "num_tokens": 2011128671.0, "step": 4732 }, { "entropy": 0.018171198200434446, "epoch": 2.0713425976583872, "grad_norm": 4.21875, "learning_rate": 1.8118570655116404e-05, "loss": 0.1018, "loss_lm": 0.014859033981338143, "loss_seg": 0.08693481236696243, "mean_token_accuracy": 0.9952416867017746, "num_tokens": 2011553779.0, "step": 4733 }, { "entropy": 0.018549012020230293, "epoch": 2.071780282306598, "grad_norm": 7.90625, "learning_rate": 1.8115863562533838e-05, "loss": 0.1029, "loss_lm": 0.015138890594244003, "loss_seg": 0.08777189627289772, "mean_token_accuracy": 0.9952318370342255, "num_tokens": 2011978426.0, "step": 4734 }, { "entropy": 0.018229197710752487, "epoch": 2.072217966954809, "grad_norm": 4.28125, "learning_rate": 1.811315646995127e-05, "loss": 0.0777, "loss_lm": 0.01663390500470996, "loss_seg": 0.06105263065546751, "mean_token_accuracy": 0.9951420873403549, "num_tokens": 2012404090.0, "step": 4735 }, { "entropy": 0.01795052643865347, "epoch": 2.07265565160302, "grad_norm": 4.125, "learning_rate": 1.811044937736871e-05, "loss": 0.1062, "loss_lm": 0.01661851373501122, "loss_seg": 0.08958241250365973, "mean_token_accuracy": 0.9953800737857819, "num_tokens": 2012829569.0, "step": 4736 }, { "entropy": 0.018623730167746544, "epoch": 2.073093336251231, "grad_norm": 9.8125, "learning_rate": 1.8107742284786142e-05, "loss": 0.1505, "loss_lm": 0.017083191080018878, "loss_seg": 0.1334397830069065, "mean_token_accuracy": 0.9950516074895859, "num_tokens": 2013255343.0, "step": 4737 }, { "entropy": 0.01793388230726123, "epoch": 2.0735310208994417, "grad_norm": 3.609375, "learning_rate": 1.8105035192203573e-05, "loss": 0.0843, "loss_lm": 0.016871011815965176, "loss_seg": 0.06738678831607103, "mean_token_accuracy": 0.9952933341264725, "num_tokens": 2013680003.0, "step": 4738 }, { "entropy": 0.0179778472520411, "epoch": 2.073968705547653, "grad_norm": 6.3125, "learning_rate": 1.8102328099621006e-05, "loss": 0.1525, "loss_lm": 0.017737703630700707, "loss_seg": 0.1347905769944191, "mean_token_accuracy": 0.9952830076217651, "num_tokens": 2014105025.0, "step": 4739 }, { "entropy": 0.017885454930365086, "epoch": 2.074406390195864, "grad_norm": 6.46875, "learning_rate": 1.809962100703844e-05, "loss": 0.1127, "loss_lm": 0.01642175274901092, "loss_seg": 0.09624849073588848, "mean_token_accuracy": 0.995345950126648, "num_tokens": 2014529762.0, "step": 4740 }, { "entropy": 0.018565726466476917, "epoch": 2.0748440748440746, "grad_norm": 5.59375, "learning_rate": 1.8096913914455877e-05, "loss": 0.0861, "loss_lm": 0.014887693105265498, "loss_seg": 0.07124510034918785, "mean_token_accuracy": 0.9951332360506058, "num_tokens": 2014955070.0, "step": 4741 }, { "entropy": 0.018728484865278006, "epoch": 2.075281759492286, "grad_norm": 7.96875, "learning_rate": 1.8094206821873307e-05, "loss": 0.1343, "loss_lm": 0.019378946628421545, "loss_seg": 0.11491391062736511, "mean_token_accuracy": 0.9951188117265701, "num_tokens": 2015379929.0, "step": 4742 }, { "entropy": 0.018148458562791348, "epoch": 2.0757194441404967, "grad_norm": 11.5, "learning_rate": 1.809149972929074e-05, "loss": 0.086, "loss_lm": 0.017307003028690815, "loss_seg": 0.06864777207374573, "mean_token_accuracy": 0.9952486455440521, "num_tokens": 2015804608.0, "step": 4743 }, { "entropy": 0.01812314847484231, "epoch": 2.076157128788708, "grad_norm": 7.28125, "learning_rate": 1.8088792636708175e-05, "loss": 0.1404, "loss_lm": 0.015293397242203355, "loss_seg": 0.1251226020976901, "mean_token_accuracy": 0.9952951222658157, "num_tokens": 2016228981.0, "step": 4744 }, { "entropy": 0.01797697925940156, "epoch": 2.0765948134369188, "grad_norm": 6.46875, "learning_rate": 1.808608554412561e-05, "loss": 0.0973, "loss_lm": 0.013885688968002796, "loss_seg": 0.08340167719870806, "mean_token_accuracy": 0.9952507019042969, "num_tokens": 2016653838.0, "step": 4745 }, { "entropy": 0.01882812660187483, "epoch": 2.0770324980851296, "grad_norm": 5.78125, "learning_rate": 1.8083378451543046e-05, "loss": 0.1604, "loss_lm": 0.01769806770607829, "loss_seg": 0.1427331082522869, "mean_token_accuracy": 0.9950871467590332, "num_tokens": 2017078411.0, "step": 4746 }, { "entropy": 0.017384581733494997, "epoch": 2.077470182733341, "grad_norm": 8.375, "learning_rate": 1.8080671358960476e-05, "loss": 0.0956, "loss_lm": 0.013835629681125283, "loss_seg": 0.08174552209675312, "mean_token_accuracy": 0.9953902959823608, "num_tokens": 2017502889.0, "step": 4747 }, { "entropy": 0.018763789907097816, "epoch": 2.0779078673815516, "grad_norm": 5.4375, "learning_rate": 1.807796426637791e-05, "loss": 0.1309, "loss_lm": 0.016063320683315396, "loss_seg": 0.11481911316514015, "mean_token_accuracy": 0.9951973557472229, "num_tokens": 2017928453.0, "step": 4748 }, { "entropy": 0.01870910543948412, "epoch": 2.0783455520297625, "grad_norm": 4.65625, "learning_rate": 1.8075257173795344e-05, "loss": 0.1252, "loss_lm": 0.01752541307359934, "loss_seg": 0.10765562951564789, "mean_token_accuracy": 0.995128408074379, "num_tokens": 2018354133.0, "step": 4749 }, { "entropy": 0.017702650744467974, "epoch": 2.0787832366779737, "grad_norm": 3.59375, "learning_rate": 1.8072550081212777e-05, "loss": 0.1049, "loss_lm": 0.015384245896711946, "loss_seg": 0.08952837251126766, "mean_token_accuracy": 0.9953154772520065, "num_tokens": 2018778905.0, "step": 4750 }, { "entropy": 0.017974878661334515, "epoch": 2.0792209213261845, "grad_norm": 5.65625, "learning_rate": 1.8069842988630214e-05, "loss": 0.1213, "loss_lm": 0.016231116373091936, "loss_seg": 0.10509124305099249, "mean_token_accuracy": 0.9952871054410934, "num_tokens": 2019203656.0, "step": 4751 }, { "entropy": 0.017969232983887196, "epoch": 2.0796586059743953, "grad_norm": 5.46875, "learning_rate": 1.8067135896047645e-05, "loss": 0.1151, "loss_lm": 0.015846787951886654, "loss_seg": 0.09926604852080345, "mean_token_accuracy": 0.9952519834041595, "num_tokens": 2019628215.0, "step": 4752 }, { "entropy": 0.01788155548274517, "epoch": 2.0800962906226066, "grad_norm": 3.234375, "learning_rate": 1.806442880346508e-05, "loss": 0.0952, "loss_lm": 0.016361390938982368, "loss_seg": 0.07885007187724113, "mean_token_accuracy": 0.9952608793973923, "num_tokens": 2020053588.0, "step": 4753 }, { "entropy": 0.018609060440212488, "epoch": 2.0805339752708174, "grad_norm": 9.375, "learning_rate": 1.8061721710882512e-05, "loss": 0.1362, "loss_lm": 0.01696135546080768, "loss_seg": 0.11927621811628342, "mean_token_accuracy": 0.9952151775360107, "num_tokens": 2020479261.0, "step": 4754 }, { "entropy": 0.018080756068229675, "epoch": 2.080971659919028, "grad_norm": 10.4375, "learning_rate": 1.8059014618299946e-05, "loss": 0.1249, "loss_lm": 0.016073330771178007, "loss_seg": 0.10886992141604424, "mean_token_accuracy": 0.9951907098293304, "num_tokens": 2020904235.0, "step": 4755 }, { "entropy": 0.017884504050016403, "epoch": 2.0814093445672395, "grad_norm": 5.5625, "learning_rate": 1.8056307525717383e-05, "loss": 0.1017, "loss_lm": 0.017580472631379962, "loss_seg": 0.0840871911495924, "mean_token_accuracy": 0.9952796250581741, "num_tokens": 2021329176.0, "step": 4756 }, { "entropy": 0.018429982010275126, "epoch": 2.0818470292154503, "grad_norm": 5.90625, "learning_rate": 1.8053600433134813e-05, "loss": 0.1374, "loss_lm": 0.015177290886640549, "loss_seg": 0.12224955298006535, "mean_token_accuracy": 0.9952230155467987, "num_tokens": 2021754926.0, "step": 4757 }, { "entropy": 0.018159507773816586, "epoch": 2.082284713863661, "grad_norm": 5.78125, "learning_rate": 1.8050893340552247e-05, "loss": 0.1299, "loss_lm": 0.015013366937637329, "loss_seg": 0.11492660455405712, "mean_token_accuracy": 0.9952991604804993, "num_tokens": 2022179660.0, "step": 4758 }, { "entropy": 0.0179689833894372, "epoch": 2.0827223985118724, "grad_norm": 9.1875, "learning_rate": 1.804818624796968e-05, "loss": 0.1265, "loss_lm": 0.017525798873975873, "loss_seg": 0.10896233469247818, "mean_token_accuracy": 0.9953902214765549, "num_tokens": 2022604151.0, "step": 4759 }, { "entropy": 0.01803944492712617, "epoch": 2.083160083160083, "grad_norm": 11.5, "learning_rate": 1.8045479155387115e-05, "loss": 0.1124, "loss_lm": 0.015489891404286027, "loss_seg": 0.096941152587533, "mean_token_accuracy": 0.9952786862850189, "num_tokens": 2023029188.0, "step": 4760 }, { "entropy": 0.018254400230944157, "epoch": 2.083597767808294, "grad_norm": 4.625, "learning_rate": 1.8042772062804548e-05, "loss": 0.1275, "loss_lm": 0.018638198729604483, "loss_seg": 0.1088132131844759, "mean_token_accuracy": 0.9951601624488831, "num_tokens": 2023453567.0, "step": 4761 }, { "entropy": 0.018269166350364685, "epoch": 2.0840354524565052, "grad_norm": 4.3125, "learning_rate": 1.8040064970221982e-05, "loss": 0.1017, "loss_lm": 0.016566471196711063, "loss_seg": 0.08514234237372875, "mean_token_accuracy": 0.9952043443918228, "num_tokens": 2023878468.0, "step": 4762 }, { "entropy": 0.018717661499977112, "epoch": 2.084473137104716, "grad_norm": 7.28125, "learning_rate": 1.8037357877639416e-05, "loss": 0.152, "loss_lm": 0.0179473333992064, "loss_seg": 0.13404672034084797, "mean_token_accuracy": 0.9951284825801849, "num_tokens": 2024303760.0, "step": 4763 }, { "entropy": 0.01799823995679617, "epoch": 2.084910821752927, "grad_norm": 5.28125, "learning_rate": 1.803465078505685e-05, "loss": 0.1019, "loss_lm": 0.014905206859111786, "loss_seg": 0.08702947944402695, "mean_token_accuracy": 0.9953835159540176, "num_tokens": 2024728721.0, "step": 4764 }, { "entropy": 0.018239008728414774, "epoch": 2.085348506401138, "grad_norm": 6.5, "learning_rate": 1.8031943692474283e-05, "loss": 0.156, "loss_lm": 0.016609665006399155, "loss_seg": 0.1394181028008461, "mean_token_accuracy": 0.9952887892723083, "num_tokens": 2025153379.0, "step": 4765 }, { "entropy": 0.018426225055009127, "epoch": 2.085786191049349, "grad_norm": 76.5, "learning_rate": 1.8029236599891717e-05, "loss": 0.1043, "loss_lm": 0.017107175895944238, "loss_seg": 0.08715192880481482, "mean_token_accuracy": 0.995225727558136, "num_tokens": 2025578740.0, "step": 4766 }, { "entropy": 0.018355874344706535, "epoch": 2.0862238756975597, "grad_norm": 4.34375, "learning_rate": 1.802652950730915e-05, "loss": 0.0888, "loss_lm": 0.01729930634610355, "loss_seg": 0.07150997500866652, "mean_token_accuracy": 0.9952002763748169, "num_tokens": 2026003517.0, "step": 4767 }, { "entropy": 0.018292181193828583, "epoch": 2.086661560345771, "grad_norm": 9.3125, "learning_rate": 1.8023822414726584e-05, "loss": 0.1291, "loss_lm": 0.013697599526494741, "loss_seg": 0.11541281826794147, "mean_token_accuracy": 0.9952627867460251, "num_tokens": 2026428182.0, "step": 4768 }, { "entropy": 0.017791093792766333, "epoch": 2.087099244993982, "grad_norm": 4.4375, "learning_rate": 1.8021115322144018e-05, "loss": 0.0998, "loss_lm": 0.015026614535599947, "loss_seg": 0.08475073613226414, "mean_token_accuracy": 0.9954369515180588, "num_tokens": 2026852601.0, "step": 4769 }, { "entropy": 0.01872339006513357, "epoch": 2.0875369296421926, "grad_norm": 3.59375, "learning_rate": 1.8018408229561452e-05, "loss": 0.1208, "loss_lm": 0.016375248786062002, "loss_seg": 0.10440836753696203, "mean_token_accuracy": 0.9950259476900101, "num_tokens": 2027277594.0, "step": 4770 }, { "entropy": 0.017660548444837332, "epoch": 2.087974614290404, "grad_norm": 8.5, "learning_rate": 1.8015701136978882e-05, "loss": 0.1217, "loss_lm": 0.0145272605586797, "loss_seg": 0.10715172998607159, "mean_token_accuracy": 0.9953968077898026, "num_tokens": 2027702898.0, "step": 4771 }, { "entropy": 0.017876234371215105, "epoch": 2.0884122989386147, "grad_norm": 8.5, "learning_rate": 1.801299404439632e-05, "loss": 0.1612, "loss_lm": 0.017746718134731054, "loss_seg": 0.1434332113713026, "mean_token_accuracy": 0.9953868389129639, "num_tokens": 2028127726.0, "step": 4772 }, { "entropy": 0.017839506268501282, "epoch": 2.0888499835868255, "grad_norm": 3.9375, "learning_rate": 1.8010286951813753e-05, "loss": 0.0767, "loss_lm": 0.01318531995639205, "loss_seg": 0.06348557397723198, "mean_token_accuracy": 0.9954770505428314, "num_tokens": 2028552598.0, "step": 4773 }, { "entropy": 0.017586582340300083, "epoch": 2.0892876682350368, "grad_norm": 12.1875, "learning_rate": 1.8007579859231187e-05, "loss": 0.1233, "loss_lm": 0.017978235613554716, "loss_seg": 0.10527198389172554, "mean_token_accuracy": 0.9953700453042984, "num_tokens": 2028977601.0, "step": 4774 }, { "entropy": 0.01796566229313612, "epoch": 2.0897253528832476, "grad_norm": 9.25, "learning_rate": 1.800487276664862e-05, "loss": 0.115, "loss_lm": 0.01821087347343564, "loss_seg": 0.09682849794626236, "mean_token_accuracy": 0.9952944666147232, "num_tokens": 2029403003.0, "step": 4775 }, { "entropy": 0.017968972213566303, "epoch": 2.0901630375314584, "grad_norm": 7.4375, "learning_rate": 1.800216567406605e-05, "loss": 0.1497, "loss_lm": 0.01718308008275926, "loss_seg": 0.13247071579098701, "mean_token_accuracy": 0.995183527469635, "num_tokens": 2029827529.0, "step": 4776 }, { "entropy": 0.018164439592510462, "epoch": 2.0906007221796696, "grad_norm": 7.375, "learning_rate": 1.7999458581483488e-05, "loss": 0.0925, "loss_lm": 0.016889834543690085, "loss_seg": 0.07563654892146587, "mean_token_accuracy": 0.9952052235603333, "num_tokens": 2030253151.0, "step": 4777 }, { "entropy": 0.01814926788210869, "epoch": 2.0910384068278804, "grad_norm": 7.15625, "learning_rate": 1.799675148890092e-05, "loss": 0.1452, "loss_lm": 0.014080344699323177, "loss_seg": 0.13113098964095116, "mean_token_accuracy": 0.9952804148197174, "num_tokens": 2030678547.0, "step": 4778 }, { "entropy": 0.018030776642262936, "epoch": 2.0914760914760917, "grad_norm": 9.0625, "learning_rate": 1.7994044396318355e-05, "loss": 0.1367, "loss_lm": 0.014556081499904394, "loss_seg": 0.12218203581869602, "mean_token_accuracy": 0.9952021539211273, "num_tokens": 2031103705.0, "step": 4779 }, { "entropy": 0.018323401920497417, "epoch": 2.0919137761243025, "grad_norm": 5.5, "learning_rate": 1.799133730373579e-05, "loss": 0.0932, "loss_lm": 0.0175334548112005, "loss_seg": 0.07564296945929527, "mean_token_accuracy": 0.9951997995376587, "num_tokens": 2031528652.0, "step": 4780 }, { "entropy": 0.01762717217206955, "epoch": 2.0923514607725133, "grad_norm": 3.609375, "learning_rate": 1.798863021115322e-05, "loss": 0.1037, "loss_lm": 0.016039448091760278, "loss_seg": 0.08769218157976866, "mean_token_accuracy": 0.9954517930746078, "num_tokens": 2031953380.0, "step": 4781 }, { "entropy": 0.017917996738106012, "epoch": 2.092789145420724, "grad_norm": 5.125, "learning_rate": 1.7985923118570657e-05, "loss": 0.1237, "loss_lm": 0.013679312309250236, "loss_seg": 0.11006650887429714, "mean_token_accuracy": 0.995301753282547, "num_tokens": 2032378384.0, "step": 4782 }, { "entropy": 0.018251310102641582, "epoch": 2.0932268300689354, "grad_norm": 5.40625, "learning_rate": 1.798321602598809e-05, "loss": 0.1016, "loss_lm": 0.015372514259070158, "loss_seg": 0.08618470840156078, "mean_token_accuracy": 0.9952553659677505, "num_tokens": 2032803226.0, "step": 4783 }, { "entropy": 0.017683991696685553, "epoch": 2.093664514717146, "grad_norm": 5.375, "learning_rate": 1.7980508933405524e-05, "loss": 0.1097, "loss_lm": 0.01699656923301518, "loss_seg": 0.09273785818368196, "mean_token_accuracy": 0.9953801482915878, "num_tokens": 2033228754.0, "step": 4784 }, { "entropy": 0.01827581925317645, "epoch": 2.0941021993653575, "grad_norm": 5.25, "learning_rate": 1.7977801840822954e-05, "loss": 0.0888, "loss_lm": 0.015003980603069067, "loss_seg": 0.07383423298597336, "mean_token_accuracy": 0.9952336251735687, "num_tokens": 2033653996.0, "step": 4785 }, { "entropy": 0.018636335153132677, "epoch": 2.0945398840135683, "grad_norm": 14.875, "learning_rate": 1.7975094748240388e-05, "loss": 0.0875, "loss_lm": 0.0176914744079113, "loss_seg": 0.06980240251868963, "mean_token_accuracy": 0.9951549768447876, "num_tokens": 2034078512.0, "step": 4786 }, { "entropy": 0.017695366870611906, "epoch": 2.094977568661779, "grad_norm": 4.5625, "learning_rate": 1.7972387655657825e-05, "loss": 0.105, "loss_lm": 0.016209465451538563, "loss_seg": 0.08878208138048649, "mean_token_accuracy": 0.9953740388154984, "num_tokens": 2034503880.0, "step": 4787 }, { "entropy": 0.017747798934578896, "epoch": 2.0954152533099903, "grad_norm": 20.0, "learning_rate": 1.796968056307526e-05, "loss": 0.1088, "loss_lm": 0.01772056194022298, "loss_seg": 0.09103630390018225, "mean_token_accuracy": 0.9953506141901016, "num_tokens": 2034929456.0, "step": 4788 }, { "entropy": 0.01806472521275282, "epoch": 2.095852937958201, "grad_norm": 3.703125, "learning_rate": 1.7966973470492693e-05, "loss": 0.0838, "loss_lm": 0.015449907863512635, "loss_seg": 0.06830688286572695, "mean_token_accuracy": 0.9953727424144745, "num_tokens": 2035354278.0, "step": 4789 }, { "entropy": 0.017420063260942698, "epoch": 2.096290622606412, "grad_norm": 5.78125, "learning_rate": 1.7964266377910123e-05, "loss": 0.0883, "loss_lm": 0.014556047972291708, "loss_seg": 0.07372658234089613, "mean_token_accuracy": 0.9954018294811249, "num_tokens": 2035778736.0, "step": 4790 }, { "entropy": 0.01856881706044078, "epoch": 2.0967283072546232, "grad_norm": 5.0, "learning_rate": 1.7961559285327557e-05, "loss": 0.1009, "loss_lm": 0.016228368505835533, "loss_seg": 0.08463974576443434, "mean_token_accuracy": 0.995196744799614, "num_tokens": 2036203805.0, "step": 4791 }, { "entropy": 0.01818180875852704, "epoch": 2.097165991902834, "grad_norm": 7.0625, "learning_rate": 1.7958852192744994e-05, "loss": 0.085, "loss_lm": 0.015434342436492443, "loss_seg": 0.06951955147087574, "mean_token_accuracy": 0.9952896535396576, "num_tokens": 2036628997.0, "step": 4792 }, { "entropy": 0.018594390247017145, "epoch": 2.097603676551045, "grad_norm": 15.5625, "learning_rate": 1.7956145100162427e-05, "loss": 0.1005, "loss_lm": 0.01663580327294767, "loss_seg": 0.08386691007763147, "mean_token_accuracy": 0.9950098097324371, "num_tokens": 2037055214.0, "step": 4793 }, { "entropy": 0.018206211272627115, "epoch": 2.098041361199256, "grad_norm": 18.0, "learning_rate": 1.795343800757986e-05, "loss": 0.0991, "loss_lm": 0.015022278530523181, "loss_seg": 0.08410632703453302, "mean_token_accuracy": 0.9951426386833191, "num_tokens": 2037480319.0, "step": 4794 }, { "entropy": 0.018208477646112442, "epoch": 2.098479045847467, "grad_norm": 9.9375, "learning_rate": 1.795073091499729e-05, "loss": 0.15, "loss_lm": 0.01692374236881733, "loss_seg": 0.13307711482048035, "mean_token_accuracy": 0.9953131973743439, "num_tokens": 2037904877.0, "step": 4795 }, { "entropy": 0.018004015553742647, "epoch": 2.0989167304956777, "grad_norm": 3.609375, "learning_rate": 1.7948023822414725e-05, "loss": 0.088, "loss_lm": 0.016664556227624416, "loss_seg": 0.0713595487177372, "mean_token_accuracy": 0.9952658265829086, "num_tokens": 2038329839.0, "step": 4796 }, { "entropy": 0.01785461325198412, "epoch": 2.099354415143889, "grad_norm": 12.625, "learning_rate": 1.7945316729832162e-05, "loss": 0.0995, "loss_lm": 0.013902074424549937, "loss_seg": 0.08559668809175491, "mean_token_accuracy": 0.9953229427337646, "num_tokens": 2038754852.0, "step": 4797 }, { "entropy": 0.017547217197716236, "epoch": 2.0997920997921, "grad_norm": 5.90625, "learning_rate": 1.7942609637249596e-05, "loss": 0.1433, "loss_lm": 0.014881261624395847, "loss_seg": 0.1284651942551136, "mean_token_accuracy": 0.9953586161136627, "num_tokens": 2039179235.0, "step": 4798 }, { "entropy": 0.018010389059782028, "epoch": 2.1002297844403106, "grad_norm": 6.0, "learning_rate": 1.793990254466703e-05, "loss": 0.0953, "loss_lm": 0.016920410795137286, "loss_seg": 0.07837495766580105, "mean_token_accuracy": 0.9952689707279205, "num_tokens": 2039604061.0, "step": 4799 }, { "entropy": 0.018379183020442724, "epoch": 2.100667469088522, "grad_norm": 3.921875, "learning_rate": 1.793719545208446e-05, "loss": 0.1515, "loss_lm": 0.017307835863903165, "loss_seg": 0.13414905592799187, "mean_token_accuracy": 0.9951940923929214, "num_tokens": 2040029115.0, "step": 4800 }, { "entropy": 0.01817556982859969, "epoch": 2.1011051537367327, "grad_norm": 5.84375, "learning_rate": 1.7934488359501894e-05, "loss": 0.1425, "loss_lm": 0.015195152722299099, "loss_seg": 0.12725703697651625, "mean_token_accuracy": 0.9951719641685486, "num_tokens": 2040454099.0, "step": 4801 }, { "entropy": 0.018191690556704998, "epoch": 2.1015428383849435, "grad_norm": 7.75, "learning_rate": 1.7931781266919328e-05, "loss": 0.0972, "loss_lm": 0.016281829914078116, "loss_seg": 0.08092902880162, "mean_token_accuracy": 0.995194211602211, "num_tokens": 2040879171.0, "step": 4802 }, { "entropy": 0.017881239764392376, "epoch": 2.1019805230331547, "grad_norm": 2.8125, "learning_rate": 1.7929074174336765e-05, "loss": 0.1255, "loss_lm": 0.016573732951655984, "loss_seg": 0.10893435589969158, "mean_token_accuracy": 0.9953887611627579, "num_tokens": 2041304172.0, "step": 4803 }, { "entropy": 0.018201889004558325, "epoch": 2.1024182076813656, "grad_norm": 3.859375, "learning_rate": 1.79263670817542e-05, "loss": 0.0954, "loss_lm": 0.015543086687102914, "loss_seg": 0.07987360656261444, "mean_token_accuracy": 0.9952041357755661, "num_tokens": 2041729281.0, "step": 4804 }, { "entropy": 0.01842715870589018, "epoch": 2.1028558923295764, "grad_norm": 12.125, "learning_rate": 1.792365998917163e-05, "loss": 0.0951, "loss_lm": 0.016922808717936277, "loss_seg": 0.07812900096178055, "mean_token_accuracy": 0.9951562434434891, "num_tokens": 2042154333.0, "step": 4805 }, { "entropy": 0.018440738786011934, "epoch": 2.1032935769777876, "grad_norm": 9.25, "learning_rate": 1.7920952896589063e-05, "loss": 0.0888, "loss_lm": 0.016009194776415825, "loss_seg": 0.07275440730154514, "mean_token_accuracy": 0.9952698051929474, "num_tokens": 2042579410.0, "step": 4806 }, { "entropy": 0.017686938866972923, "epoch": 2.1037312616259984, "grad_norm": 3.828125, "learning_rate": 1.7918245804006496e-05, "loss": 0.1301, "loss_lm": 0.015343548962846398, "loss_seg": 0.11472387239336967, "mean_token_accuracy": 0.9954011291265488, "num_tokens": 2043004217.0, "step": 4807 }, { "entropy": 0.018182492814958096, "epoch": 2.1041689462742093, "grad_norm": 4.03125, "learning_rate": 1.7915538711423933e-05, "loss": 0.0928, "loss_lm": 0.014537787297740579, "loss_seg": 0.07821428216993809, "mean_token_accuracy": 0.9951920658349991, "num_tokens": 2043430155.0, "step": 4808 }, { "entropy": 0.017400234937667847, "epoch": 2.1046066309224205, "grad_norm": 10.0625, "learning_rate": 1.7912831618841364e-05, "loss": 0.0995, "loss_lm": 0.014599931426346302, "loss_seg": 0.08489076420664787, "mean_token_accuracy": 0.9955673366785049, "num_tokens": 2043854652.0, "step": 4809 }, { "entropy": 0.017936308402568102, "epoch": 2.1050443155706313, "grad_norm": 3.984375, "learning_rate": 1.7910124526258797e-05, "loss": 0.1162, "loss_lm": 0.015773517079651356, "loss_seg": 0.10043209977447987, "mean_token_accuracy": 0.9952715188264847, "num_tokens": 2044279696.0, "step": 4810 }, { "entropy": 0.017591584473848343, "epoch": 2.105482000218842, "grad_norm": 3.484375, "learning_rate": 1.790741743367623e-05, "loss": 0.1015, "loss_lm": 0.016321404604241252, "loss_seg": 0.08514329139143229, "mean_token_accuracy": 0.9952923208475113, "num_tokens": 2044704495.0, "step": 4811 }, { "entropy": 0.019099419005215168, "epoch": 2.1059196848670534, "grad_norm": 17.375, "learning_rate": 1.7904710341093665e-05, "loss": 0.1157, "loss_lm": 0.01714322599582374, "loss_seg": 0.0985164362937212, "mean_token_accuracy": 0.994914710521698, "num_tokens": 2045129604.0, "step": 4812 }, { "entropy": 0.01742276595905423, "epoch": 2.106357369515264, "grad_norm": 2.5625, "learning_rate": 1.7902003248511102e-05, "loss": 0.1528, "loss_lm": 0.0158527793828398, "loss_seg": 0.13696213252842426, "mean_token_accuracy": 0.9954685419797897, "num_tokens": 2045554577.0, "step": 4813 }, { "entropy": 0.01834772527217865, "epoch": 2.106795054163475, "grad_norm": 6.625, "learning_rate": 1.7899296155928532e-05, "loss": 0.157, "loss_lm": 0.019837806466966867, "loss_seg": 0.13715883903205395, "mean_token_accuracy": 0.9952361583709717, "num_tokens": 2045979383.0, "step": 4814 }, { "entropy": 0.017735152505338192, "epoch": 2.1072327388116863, "grad_norm": 3.65625, "learning_rate": 1.7896589063345966e-05, "loss": 0.1044, "loss_lm": 0.014049212448298931, "loss_seg": 0.09038803540170193, "mean_token_accuracy": 0.9952713549137115, "num_tokens": 2046403777.0, "step": 4815 }, { "entropy": 0.01843542465940118, "epoch": 2.107670423459897, "grad_norm": 6.375, "learning_rate": 1.78938819707634e-05, "loss": 0.1104, "loss_lm": 0.01458732970058918, "loss_seg": 0.09581873938441277, "mean_token_accuracy": 0.9951402097940445, "num_tokens": 2046828874.0, "step": 4816 }, { "entropy": 0.01862789550796151, "epoch": 2.108108108108108, "grad_norm": 3.109375, "learning_rate": 1.7891174878180834e-05, "loss": 0.0997, "loss_lm": 0.016802417812868953, "loss_seg": 0.0828710151836276, "mean_token_accuracy": 0.9951702207326889, "num_tokens": 2047254948.0, "step": 4817 }, { "entropy": 0.01820624340325594, "epoch": 2.108545792756319, "grad_norm": 5.46875, "learning_rate": 1.788846778559827e-05, "loss": 0.1654, "loss_lm": 0.015459436923265457, "loss_seg": 0.14991565234959126, "mean_token_accuracy": 0.9952830374240875, "num_tokens": 2047679527.0, "step": 4818 }, { "entropy": 0.0179854235611856, "epoch": 2.10898347740453, "grad_norm": 4.65625, "learning_rate": 1.78857606930157e-05, "loss": 0.1042, "loss_lm": 0.015824998961761594, "loss_seg": 0.08834988437592983, "mean_token_accuracy": 0.9952725917100906, "num_tokens": 2048104363.0, "step": 4819 }, { "entropy": 0.01813360396772623, "epoch": 2.109421162052741, "grad_norm": 5.21875, "learning_rate": 1.7883053600433135e-05, "loss": 0.1314, "loss_lm": 0.014240967808291316, "loss_seg": 0.11714887246489525, "mean_token_accuracy": 0.9952899068593979, "num_tokens": 2048528922.0, "step": 4820 }, { "entropy": 0.018207938876003027, "epoch": 2.109858846700952, "grad_norm": 10.625, "learning_rate": 1.788034650785057e-05, "loss": 0.1081, "loss_lm": 0.014799801167100668, "loss_seg": 0.09327406622469425, "mean_token_accuracy": 0.9952942132949829, "num_tokens": 2048953499.0, "step": 4821 }, { "entropy": 0.018683696165680885, "epoch": 2.110296531349163, "grad_norm": 7.5625, "learning_rate": 1.7877639415268002e-05, "loss": 0.1151, "loss_lm": 0.01857766299508512, "loss_seg": 0.0965349031612277, "mean_token_accuracy": 0.9951023608446121, "num_tokens": 2049377957.0, "step": 4822 }, { "entropy": 0.01773046888411045, "epoch": 2.110734215997374, "grad_norm": 4.8125, "learning_rate": 1.787493232268544e-05, "loss": 0.0755, "loss_lm": 0.014673916855826974, "loss_seg": 0.06080308835953474, "mean_token_accuracy": 0.995350793004036, "num_tokens": 2049802285.0, "step": 4823 }, { "entropy": 0.018663343973457813, "epoch": 2.111171900645585, "grad_norm": 6.0, "learning_rate": 1.787222523010287e-05, "loss": 0.0971, "loss_lm": 0.013704424258321524, "loss_seg": 0.08341267053037882, "mean_token_accuracy": 0.9951260983943939, "num_tokens": 2050227381.0, "step": 4824 }, { "entropy": 0.018239452503621578, "epoch": 2.1116095852937957, "grad_norm": 4.78125, "learning_rate": 1.7869518137520303e-05, "loss": 0.1093, "loss_lm": 0.01476432359777391, "loss_seg": 0.09449507109820843, "mean_token_accuracy": 0.9952499568462372, "num_tokens": 2050652395.0, "step": 4825 }, { "entropy": 0.019206439144909382, "epoch": 2.112047269942007, "grad_norm": 4.125, "learning_rate": 1.7866811044937737e-05, "loss": 0.151, "loss_lm": 0.018687010742723942, "loss_seg": 0.1323575247079134, "mean_token_accuracy": 0.9950213879346848, "num_tokens": 2051078609.0, "step": 4826 }, { "entropy": 0.018426598981022835, "epoch": 2.112484954590218, "grad_norm": 4.875, "learning_rate": 1.786410395235517e-05, "loss": 0.1025, "loss_lm": 0.017506264382973313, "loss_seg": 0.08499849680811167, "mean_token_accuracy": 0.9951180517673492, "num_tokens": 2051503419.0, "step": 4827 }, { "entropy": 0.017726816702634096, "epoch": 2.1129226392384286, "grad_norm": 4.1875, "learning_rate": 1.7861396859772608e-05, "loss": 0.0857, "loss_lm": 0.01594499684870243, "loss_seg": 0.06980270333588123, "mean_token_accuracy": 0.9952551573514938, "num_tokens": 2051928462.0, "step": 4828 }, { "entropy": 0.01810632925480604, "epoch": 2.11336032388664, "grad_norm": 7.5625, "learning_rate": 1.7858689767190038e-05, "loss": 0.1354, "loss_lm": 0.0159504737239331, "loss_seg": 0.11941218562424183, "mean_token_accuracy": 0.9952633529901505, "num_tokens": 2052353591.0, "step": 4829 }, { "entropy": 0.018033177591860294, "epoch": 2.1137980085348507, "grad_norm": 6.96875, "learning_rate": 1.7855982674607472e-05, "loss": 0.0933, "loss_lm": 0.015439488226547837, "loss_seg": 0.07789476402103901, "mean_token_accuracy": 0.9953105747699738, "num_tokens": 2052779170.0, "step": 4830 }, { "entropy": 0.019041683990508318, "epoch": 2.1142356931830615, "grad_norm": 3.546875, "learning_rate": 1.7853275582024906e-05, "loss": 0.1041, "loss_lm": 0.015554652782157063, "loss_seg": 0.0885089235380292, "mean_token_accuracy": 0.9950725585222244, "num_tokens": 2053204526.0, "step": 4831 }, { "entropy": 0.0181288025341928, "epoch": 2.1146733778312727, "grad_norm": 25.875, "learning_rate": 1.785056848944234e-05, "loss": 0.08, "loss_lm": 0.01430196431465447, "loss_seg": 0.06568129267543554, "mean_token_accuracy": 0.9953001439571381, "num_tokens": 2053629830.0, "step": 4832 }, { "entropy": 0.019002556800842285, "epoch": 2.1151110624794836, "grad_norm": 4.90625, "learning_rate": 1.7847861396859773e-05, "loss": 0.1102, "loss_lm": 0.016185146057978272, "loss_seg": 0.09406367223709822, "mean_token_accuracy": 0.995089128613472, "num_tokens": 2054055381.0, "step": 4833 }, { "entropy": 0.018059669993817806, "epoch": 2.1155487471276944, "grad_norm": 4.21875, "learning_rate": 1.7845154304277207e-05, "loss": 0.0905, "loss_lm": 0.015384086174890399, "loss_seg": 0.07515113521367311, "mean_token_accuracy": 0.9952920526266098, "num_tokens": 2054480590.0, "step": 4834 }, { "entropy": 0.0179160013794899, "epoch": 2.1159864317759056, "grad_norm": 4.59375, "learning_rate": 1.784244721169464e-05, "loss": 0.1403, "loss_lm": 0.01652589999139309, "loss_seg": 0.123759969137609, "mean_token_accuracy": 0.9954024404287338, "num_tokens": 2054905302.0, "step": 4835 }, { "entropy": 0.017939699348062277, "epoch": 2.1164241164241164, "grad_norm": 3.4375, "learning_rate": 1.7839740119112074e-05, "loss": 0.0901, "loss_lm": 0.014379814267158508, "loss_seg": 0.07576419040560722, "mean_token_accuracy": 0.9952353835105896, "num_tokens": 2055330268.0, "step": 4836 }, { "entropy": 0.018166227731853724, "epoch": 2.1168618010723272, "grad_norm": 19.875, "learning_rate": 1.7837033026529508e-05, "loss": 0.1584, "loss_lm": 0.018112447578459978, "loss_seg": 0.14033653493970633, "mean_token_accuracy": 0.9952654093503952, "num_tokens": 2055755605.0, "step": 4837 }, { "entropy": 0.01831314852461219, "epoch": 2.1172994857205385, "grad_norm": 5.40625, "learning_rate": 1.783432593394694e-05, "loss": 0.1259, "loss_lm": 0.01801612274721265, "loss_seg": 0.10784552805125713, "mean_token_accuracy": 0.9950847923755646, "num_tokens": 2056180787.0, "step": 4838 }, { "entropy": 0.018273107707500458, "epoch": 2.1177371703687493, "grad_norm": 29.625, "learning_rate": 1.7831618841364376e-05, "loss": 0.1183, "loss_lm": 0.016507964581251144, "loss_seg": 0.10183398984372616, "mean_token_accuracy": 0.9952276051044464, "num_tokens": 2056605449.0, "step": 4839 }, { "entropy": 0.018946031108498573, "epoch": 2.11817485501696, "grad_norm": 4.34375, "learning_rate": 1.782891174878181e-05, "loss": 0.1173, "loss_lm": 0.019704466685652733, "loss_seg": 0.09757920261472464, "mean_token_accuracy": 0.995037704706192, "num_tokens": 2057030825.0, "step": 4840 }, { "entropy": 0.018848561216145754, "epoch": 2.1186125396651714, "grad_norm": 6.0, "learning_rate": 1.7826204656199243e-05, "loss": 0.0801, "loss_lm": 0.015482748625800014, "loss_seg": 0.06459262035787106, "mean_token_accuracy": 0.995191290974617, "num_tokens": 2057456241.0, "step": 4841 }, { "entropy": 0.018293510656803846, "epoch": 2.119050224313382, "grad_norm": 4.09375, "learning_rate": 1.7823497563616677e-05, "loss": 0.1037, "loss_lm": 0.014848728431388736, "loss_seg": 0.08886373229324818, "mean_token_accuracy": 0.9952257573604584, "num_tokens": 2057881598.0, "step": 4842 }, { "entropy": 0.01854504132643342, "epoch": 2.119487908961593, "grad_norm": 16.375, "learning_rate": 1.7820790471034107e-05, "loss": 0.1208, "loss_lm": 0.01612772257067263, "loss_seg": 0.10471463575959206, "mean_token_accuracy": 0.9951731860637665, "num_tokens": 2058306487.0, "step": 4843 }, { "entropy": 0.018976053223013878, "epoch": 2.1199255936098043, "grad_norm": 10.1875, "learning_rate": 1.7818083378451544e-05, "loss": 0.1033, "loss_lm": 0.018045123666524887, "loss_seg": 0.08526905439794064, "mean_token_accuracy": 0.9951513260602951, "num_tokens": 2058732133.0, "step": 4844 }, { "entropy": 0.018477093894034624, "epoch": 2.120363278258015, "grad_norm": 4.0625, "learning_rate": 1.7815376285868978e-05, "loss": 0.1126, "loss_lm": 0.01764200208708644, "loss_seg": 0.09497576020658016, "mean_token_accuracy": 0.995146706700325, "num_tokens": 2059156948.0, "step": 4845 }, { "entropy": 0.01799072651192546, "epoch": 2.120800962906226, "grad_norm": 7.9375, "learning_rate": 1.781266919328641e-05, "loss": 0.1174, "loss_lm": 0.014502768870443106, "loss_seg": 0.10289716720581055, "mean_token_accuracy": 0.9952758252620697, "num_tokens": 2059581765.0, "step": 4846 }, { "entropy": 0.018872244749218225, "epoch": 2.121238647554437, "grad_norm": 6.0625, "learning_rate": 1.7809962100703845e-05, "loss": 0.1498, "loss_lm": 0.01854315772652626, "loss_seg": 0.13126226887106895, "mean_token_accuracy": 0.9951573610305786, "num_tokens": 2060006708.0, "step": 4847 }, { "entropy": 0.018262402154505253, "epoch": 2.121676332202648, "grad_norm": 3.640625, "learning_rate": 1.7807255008121276e-05, "loss": 0.1386, "loss_lm": 0.015207494143396616, "loss_seg": 0.12339294049888849, "mean_token_accuracy": 0.9953224807977676, "num_tokens": 2060431261.0, "step": 4848 }, { "entropy": 0.01806374080479145, "epoch": 2.1221140168508588, "grad_norm": 15.6875, "learning_rate": 1.7804547915538713e-05, "loss": 0.1044, "loss_lm": 0.014756300952285528, "loss_seg": 0.08961151354014874, "mean_token_accuracy": 0.9952141791582108, "num_tokens": 2060856832.0, "step": 4849 }, { "entropy": 0.018745551351457834, "epoch": 2.12255170149907, "grad_norm": 12.9375, "learning_rate": 1.7801840822956146e-05, "loss": 0.1139, "loss_lm": 0.01703276881016791, "loss_seg": 0.09686706960201263, "mean_token_accuracy": 0.9952121078968048, "num_tokens": 2061281621.0, "step": 4850 }, { "entropy": 0.01872278330847621, "epoch": 2.122989386147281, "grad_norm": 11.9375, "learning_rate": 1.779913373037358e-05, "loss": 0.098, "loss_lm": 0.017169844824820757, "loss_seg": 0.08079063892364502, "mean_token_accuracy": 0.9951996207237244, "num_tokens": 2061706752.0, "step": 4851 }, { "entropy": 0.01874025259166956, "epoch": 2.1234270707954916, "grad_norm": 3.984375, "learning_rate": 1.7796426637791014e-05, "loss": 0.1389, "loss_lm": 0.018578918185085058, "loss_seg": 0.12034748308360577, "mean_token_accuracy": 0.9951297491788864, "num_tokens": 2062130935.0, "step": 4852 }, { "entropy": 0.01800820603966713, "epoch": 2.123864755443703, "grad_norm": 3.71875, "learning_rate": 1.7793719545208444e-05, "loss": 0.1469, "loss_lm": 0.015986681915819645, "loss_seg": 0.13094672188162804, "mean_token_accuracy": 0.995376780629158, "num_tokens": 2062555916.0, "step": 4853 }, { "entropy": 0.018314591143280268, "epoch": 2.1243024400919137, "grad_norm": 4.53125, "learning_rate": 1.779101245262588e-05, "loss": 0.1179, "loss_lm": 0.016756510827690363, "loss_seg": 0.10117752104997635, "mean_token_accuracy": 0.9951602220535278, "num_tokens": 2062980073.0, "step": 4854 }, { "entropy": 0.01824531238526106, "epoch": 2.124740124740125, "grad_norm": 4.9375, "learning_rate": 1.7788305360043315e-05, "loss": 0.1351, "loss_lm": 0.016638380009680986, "loss_seg": 0.11843045800924301, "mean_token_accuracy": 0.995166078209877, "num_tokens": 2063405357.0, "step": 4855 }, { "entropy": 0.01832239842042327, "epoch": 2.125177809388336, "grad_norm": 4.625, "learning_rate": 1.778559826746075e-05, "loss": 0.123, "loss_lm": 0.014281098498031497, "loss_seg": 0.10872339829802513, "mean_token_accuracy": 0.9951721131801605, "num_tokens": 2063830146.0, "step": 4856 }, { "entropy": 0.018337320536375046, "epoch": 2.1256154940365466, "grad_norm": 13.0625, "learning_rate": 1.778289117487818e-05, "loss": 0.0756, "loss_lm": 0.014398618368431926, "loss_seg": 0.06119507923722267, "mean_token_accuracy": 0.9951402097940445, "num_tokens": 2064254625.0, "step": 4857 }, { "entropy": 0.018158151768147945, "epoch": 2.1260531786847574, "grad_norm": 3.171875, "learning_rate": 1.7780184082295613e-05, "loss": 0.0922, "loss_lm": 0.017080049961805344, "loss_seg": 0.07512321509420872, "mean_token_accuracy": 0.9952636659145355, "num_tokens": 2064679995.0, "step": 4858 }, { "entropy": 0.018261178396642208, "epoch": 2.1264908633329687, "grad_norm": 3.328125, "learning_rate": 1.777747698971305e-05, "loss": 0.0839, "loss_lm": 0.01652449951507151, "loss_seg": 0.0674159936606884, "mean_token_accuracy": 0.9951539039611816, "num_tokens": 2065105260.0, "step": 4859 }, { "entropy": 0.018130016047507524, "epoch": 2.1269285479811795, "grad_norm": 9.0, "learning_rate": 1.7774769897130484e-05, "loss": 0.1525, "loss_lm": 0.018853181041777134, "loss_seg": 0.13369002752006054, "mean_token_accuracy": 0.995251938700676, "num_tokens": 2065529808.0, "step": 4860 }, { "entropy": 0.018303737975656986, "epoch": 2.1273662326293907, "grad_norm": 8.5625, "learning_rate": 1.7772062804547917e-05, "loss": 0.0818, "loss_lm": 0.017660944489762187, "loss_seg": 0.06417008116841316, "mean_token_accuracy": 0.9951148927211761, "num_tokens": 2065955354.0, "step": 4861 }, { "entropy": 0.01875381078571081, "epoch": 2.1278039172776015, "grad_norm": 4.15625, "learning_rate": 1.7769355711965348e-05, "loss": 0.1179, "loss_lm": 0.017726627411320806, "loss_seg": 0.10021089389920235, "mean_token_accuracy": 0.9951761066913605, "num_tokens": 2066380691.0, "step": 4862 }, { "entropy": 0.01869378425180912, "epoch": 2.1282416019258124, "grad_norm": 16.875, "learning_rate": 1.776664861938278e-05, "loss": 0.0969, "loss_lm": 0.018730537965893745, "loss_seg": 0.07819247525185347, "mean_token_accuracy": 0.9952161908149719, "num_tokens": 2066805843.0, "step": 4863 }, { "entropy": 0.017823258880525827, "epoch": 2.1286792865740236, "grad_norm": 47.5, "learning_rate": 1.776394152680022e-05, "loss": 0.091, "loss_lm": 0.015606673201546073, "loss_seg": 0.075438785366714, "mean_token_accuracy": 0.9952614307403564, "num_tokens": 2067230614.0, "step": 4864 }, { "entropy": 0.018128997646272182, "epoch": 2.1291169712222344, "grad_norm": 8.0, "learning_rate": 1.7761234434217652e-05, "loss": 0.1182, "loss_lm": 0.01677868259139359, "loss_seg": 0.10142549313604832, "mean_token_accuracy": 0.9952068328857422, "num_tokens": 2067655361.0, "step": 4865 }, { "entropy": 0.018391079735010862, "epoch": 2.1295546558704452, "grad_norm": 3.734375, "learning_rate": 1.7758527341635086e-05, "loss": 0.1304, "loss_lm": 0.01465608854778111, "loss_seg": 0.11569632403552532, "mean_token_accuracy": 0.9951677918434143, "num_tokens": 2068081209.0, "step": 4866 }, { "entropy": 0.017895695753395557, "epoch": 2.1299923405186565, "grad_norm": 4.78125, "learning_rate": 1.7755820249052516e-05, "loss": 0.1115, "loss_lm": 0.0164752050768584, "loss_seg": 0.09504063427448273, "mean_token_accuracy": 0.9953539967536926, "num_tokens": 2068506260.0, "step": 4867 }, { "entropy": 0.018566257320344448, "epoch": 2.1304300251668673, "grad_norm": 3.046875, "learning_rate": 1.775311315646995e-05, "loss": 0.1216, "loss_lm": 0.01670517190359533, "loss_seg": 0.1049370002001524, "mean_token_accuracy": 0.9951839745044708, "num_tokens": 2068930435.0, "step": 4868 }, { "entropy": 0.018151653464883566, "epoch": 2.130867709815078, "grad_norm": 11.75, "learning_rate": 1.7750406063887384e-05, "loss": 0.093, "loss_lm": 0.015015087323263288, "loss_seg": 0.07800282165408134, "mean_token_accuracy": 0.9952305108308792, "num_tokens": 2069355827.0, "step": 4869 }, { "entropy": 0.018023128155618906, "epoch": 2.1313053944632894, "grad_norm": 3.921875, "learning_rate": 1.774769897130482e-05, "loss": 0.1477, "loss_lm": 0.0160450569819659, "loss_seg": 0.13164500892162323, "mean_token_accuracy": 0.9952381402254105, "num_tokens": 2069781046.0, "step": 4870 }, { "entropy": 0.018534959759563208, "epoch": 2.1317430791115, "grad_norm": 3.171875, "learning_rate": 1.7744991878722255e-05, "loss": 0.0931, "loss_lm": 0.01739833690226078, "loss_seg": 0.07568494882434607, "mean_token_accuracy": 0.9951726496219635, "num_tokens": 2070206709.0, "step": 4871 }, { "entropy": 0.018713558092713356, "epoch": 2.132180763759711, "grad_norm": 3.734375, "learning_rate": 1.7742284786139685e-05, "loss": 0.0871, "loss_lm": 0.016242973040789366, "loss_seg": 0.07085986342281103, "mean_token_accuracy": 0.9951115548610687, "num_tokens": 2070632384.0, "step": 4872 }, { "entropy": 0.01912074815481901, "epoch": 2.1326184484079223, "grad_norm": 74.5, "learning_rate": 1.773957769355712e-05, "loss": 0.0922, "loss_lm": 0.01602559001185, "loss_seg": 0.07620443217456341, "mean_token_accuracy": 0.9949466735124588, "num_tokens": 2071057242.0, "step": 4873 }, { "entropy": 0.018376614898443222, "epoch": 2.133056133056133, "grad_norm": 3.671875, "learning_rate": 1.7736870600974553e-05, "loss": 0.0973, "loss_lm": 0.015430424362421036, "loss_seg": 0.08190947771072388, "mean_token_accuracy": 0.9952899366617203, "num_tokens": 2071481815.0, "step": 4874 }, { "entropy": 0.018260623328387737, "epoch": 2.133493817704344, "grad_norm": 3.40625, "learning_rate": 1.773416350839199e-05, "loss": 0.1091, "loss_lm": 0.016191473696380854, "loss_seg": 0.09289046935737133, "mean_token_accuracy": 0.9950538277626038, "num_tokens": 2071907165.0, "step": 4875 }, { "entropy": 0.017944486811757088, "epoch": 2.133931502352555, "grad_norm": 5.0, "learning_rate": 1.7731456415809423e-05, "loss": 0.1386, "loss_lm": 0.014484231127426028, "loss_seg": 0.1241003219038248, "mean_token_accuracy": 0.9952906370162964, "num_tokens": 2072332245.0, "step": 4876 }, { "entropy": 0.01797306165099144, "epoch": 2.134369187000766, "grad_norm": 4.96875, "learning_rate": 1.7728749323226854e-05, "loss": 0.0835, "loss_lm": 0.014633079059422016, "loss_seg": 0.06889783591032028, "mean_token_accuracy": 0.9953149110078812, "num_tokens": 2072757605.0, "step": 4877 }, { "entropy": 0.018191115465015173, "epoch": 2.1348068716489768, "grad_norm": 3.84375, "learning_rate": 1.7726042230644287e-05, "loss": 0.1111, "loss_lm": 0.017983988393098116, "loss_seg": 0.09307445120066404, "mean_token_accuracy": 0.9952694624662399, "num_tokens": 2073182496.0, "step": 4878 }, { "entropy": 0.01807984197512269, "epoch": 2.135244556297188, "grad_norm": 19.75, "learning_rate": 1.772333513806172e-05, "loss": 0.101, "loss_lm": 0.01752744591794908, "loss_seg": 0.08348056301474571, "mean_token_accuracy": 0.9953578561544418, "num_tokens": 2073606828.0, "step": 4879 }, { "entropy": 0.017975996248424053, "epoch": 2.135682240945399, "grad_norm": 3.71875, "learning_rate": 1.7720628045479158e-05, "loss": 0.0675, "loss_lm": 0.015383088262751698, "loss_seg": 0.05213993135839701, "mean_token_accuracy": 0.9953140765428543, "num_tokens": 2074031871.0, "step": 4880 }, { "entropy": 0.018253488931804895, "epoch": 2.1361199255936096, "grad_norm": 12.4375, "learning_rate": 1.771792095289659e-05, "loss": 0.1479, "loss_lm": 0.015796908410266042, "loss_seg": 0.1320977360010147, "mean_token_accuracy": 0.9951388239860535, "num_tokens": 2074457472.0, "step": 4881 }, { "entropy": 0.01871770340949297, "epoch": 2.136557610241821, "grad_norm": 10.8125, "learning_rate": 1.7715213860314022e-05, "loss": 0.1335, "loss_lm": 0.01506135705858469, "loss_seg": 0.1184007078409195, "mean_token_accuracy": 0.995184451341629, "num_tokens": 2074882324.0, "step": 4882 }, { "entropy": 0.018524035811424255, "epoch": 2.1369952948900317, "grad_norm": 8.75, "learning_rate": 1.7712506767731456e-05, "loss": 0.119, "loss_lm": 0.015869480092078447, "loss_seg": 0.10316140949726105, "mean_token_accuracy": 0.9951176941394806, "num_tokens": 2075307502.0, "step": 4883 }, { "entropy": 0.018270972184836864, "epoch": 2.1374329795382425, "grad_norm": 9.5, "learning_rate": 1.770979967514889e-05, "loss": 0.0943, "loss_lm": 0.014975390397012234, "loss_seg": 0.07927939668297768, "mean_token_accuracy": 0.9951646029949188, "num_tokens": 2075732656.0, "step": 4884 }, { "entropy": 0.01880483329296112, "epoch": 2.1378706641864538, "grad_norm": 5.03125, "learning_rate": 1.7707092582566327e-05, "loss": 0.1395, "loss_lm": 0.016748928697779775, "loss_seg": 0.12272064201533794, "mean_token_accuracy": 0.9951303750276566, "num_tokens": 2076158110.0, "step": 4885 }, { "entropy": 0.018689359538257122, "epoch": 2.1383083488346646, "grad_norm": 8.5, "learning_rate": 1.7704385489983757e-05, "loss": 0.1039, "loss_lm": 0.013523781904950738, "loss_seg": 0.09040975570678711, "mean_token_accuracy": 0.9951145350933075, "num_tokens": 2076584039.0, "step": 4886 }, { "entropy": 0.01836364809423685, "epoch": 2.1387460334828754, "grad_norm": 9.0, "learning_rate": 1.770167839740119e-05, "loss": 0.1042, "loss_lm": 0.01676622941158712, "loss_seg": 0.08747868333011866, "mean_token_accuracy": 0.9951438307762146, "num_tokens": 2077010073.0, "step": 4887 }, { "entropy": 0.018371315207332373, "epoch": 2.1391837181310867, "grad_norm": 2.4375, "learning_rate": 1.7698971304818625e-05, "loss": 0.0966, "loss_lm": 0.017567079048603773, "loss_seg": 0.07907223422080278, "mean_token_accuracy": 0.9951546788215637, "num_tokens": 2077434164.0, "step": 4888 }, { "entropy": 0.018311878200620413, "epoch": 2.1396214027792975, "grad_norm": 10.75, "learning_rate": 1.769626421223606e-05, "loss": 0.1172, "loss_lm": 0.014133450109511614, "loss_seg": 0.10310613736510277, "mean_token_accuracy": 0.9953241944313049, "num_tokens": 2077858625.0, "step": 4889 }, { "entropy": 0.018735677935183048, "epoch": 2.1400590874275083, "grad_norm": 9.375, "learning_rate": 1.7693557119653496e-05, "loss": 0.0925, "loss_lm": 0.01828855136409402, "loss_seg": 0.07418732158839703, "mean_token_accuracy": 0.9950518757104874, "num_tokens": 2078283991.0, "step": 4890 }, { "entropy": 0.018401678651571274, "epoch": 2.1404967720757195, "grad_norm": 9.0, "learning_rate": 1.7690850027070926e-05, "loss": 0.0956, "loss_lm": 0.014692556113004684, "loss_seg": 0.0808941824361682, "mean_token_accuracy": 0.9953626990318298, "num_tokens": 2078709414.0, "step": 4891 }, { "entropy": 0.018545587081462145, "epoch": 2.1409344567239303, "grad_norm": 8.6875, "learning_rate": 1.768814293448836e-05, "loss": 0.1072, "loss_lm": 0.015613847644999623, "loss_seg": 0.0915618073195219, "mean_token_accuracy": 0.995210736989975, "num_tokens": 2079134689.0, "step": 4892 }, { "entropy": 0.018416904844343662, "epoch": 2.141372141372141, "grad_norm": 4.78125, "learning_rate": 1.7685435841905793e-05, "loss": 0.1229, "loss_lm": 0.014513397123664618, "loss_seg": 0.10837185941636562, "mean_token_accuracy": 0.9952011108398438, "num_tokens": 2079559562.0, "step": 4893 }, { "entropy": 0.01845263596624136, "epoch": 2.1418098260203524, "grad_norm": 24.75, "learning_rate": 1.7682728749323227e-05, "loss": 0.1025, "loss_lm": 0.01530474191531539, "loss_seg": 0.08723569475114346, "mean_token_accuracy": 0.995327815413475, "num_tokens": 2079985006.0, "step": 4894 }, { "entropy": 0.01832607015967369, "epoch": 2.1422475106685632, "grad_norm": 11.875, "learning_rate": 1.7680021656740664e-05, "loss": 0.1181, "loss_lm": 0.016586894169449806, "loss_seg": 0.10147172026336193, "mean_token_accuracy": 0.9952478855848312, "num_tokens": 2080410249.0, "step": 4895 }, { "entropy": 0.018520453479140997, "epoch": 2.1426851953167745, "grad_norm": 7.96875, "learning_rate": 1.7677314564158094e-05, "loss": 0.0995, "loss_lm": 0.016786837950348854, "loss_seg": 0.08276168815791607, "mean_token_accuracy": 0.9951619356870651, "num_tokens": 2080835377.0, "step": 4896 }, { "entropy": 0.01826839428395033, "epoch": 2.1431228799649853, "grad_norm": 5.9375, "learning_rate": 1.7674607471575528e-05, "loss": 0.0969, "loss_lm": 0.014881909359246492, "loss_seg": 0.08200232218950987, "mean_token_accuracy": 0.9951151609420776, "num_tokens": 2081260760.0, "step": 4897 }, { "entropy": 0.018006473314017057, "epoch": 2.143560564613196, "grad_norm": 7.65625, "learning_rate": 1.7671900378992962e-05, "loss": 0.1045, "loss_lm": 0.014418633887544274, "loss_seg": 0.09011624287813902, "mean_token_accuracy": 0.9953256249427795, "num_tokens": 2081686027.0, "step": 4898 }, { "entropy": 0.01824677037075162, "epoch": 2.1439982492614074, "grad_norm": 2.953125, "learning_rate": 1.7669193286410396e-05, "loss": 0.1108, "loss_lm": 0.01730551104992628, "loss_seg": 0.09350099973380566, "mean_token_accuracy": 0.9951581954956055, "num_tokens": 2082110817.0, "step": 4899 }, { "entropy": 0.018572242930531502, "epoch": 2.144435933909618, "grad_norm": 15.75, "learning_rate": 1.7666486193827826e-05, "loss": 0.0985, "loss_lm": 0.01666622143238783, "loss_seg": 0.08187235891819, "mean_token_accuracy": 0.9951933771371841, "num_tokens": 2082535810.0, "step": 4900 }, { "entropy": 0.01825027959421277, "epoch": 2.144873618557829, "grad_norm": 3.515625, "learning_rate": 1.7663779101245263e-05, "loss": 0.1121, "loss_lm": 0.015854968689382076, "loss_seg": 0.09624797478318214, "mean_token_accuracy": 0.9952472895383835, "num_tokens": 2082961007.0, "step": 4901 }, { "entropy": 0.01829370390623808, "epoch": 2.1453113032060402, "grad_norm": 4.03125, "learning_rate": 1.7661072008662697e-05, "loss": 0.0708, "loss_lm": 0.015323877800256014, "loss_seg": 0.05542886536568403, "mean_token_accuracy": 0.9952394664287567, "num_tokens": 2083384873.0, "step": 4902 }, { "entropy": 0.019061005674302578, "epoch": 2.145748987854251, "grad_norm": 3.71875, "learning_rate": 1.765836491608013e-05, "loss": 0.1269, "loss_lm": 0.01983777293935418, "loss_seg": 0.10709547065198421, "mean_token_accuracy": 0.9950253367424011, "num_tokens": 2083810166.0, "step": 4903 }, { "entropy": 0.019318528473377228, "epoch": 2.146186672502462, "grad_norm": 7.3125, "learning_rate": 1.7655657823497564e-05, "loss": 0.121, "loss_lm": 0.018551044166088104, "loss_seg": 0.10242231003940105, "mean_token_accuracy": 0.9951003938913345, "num_tokens": 2084236102.0, "step": 4904 }, { "entropy": 0.018203633837401867, "epoch": 2.146624357150673, "grad_norm": 11.8125, "learning_rate": 1.7652950730914995e-05, "loss": 0.1124, "loss_lm": 0.014213061891496181, "loss_seg": 0.09820294566452503, "mean_token_accuracy": 0.9953149706125259, "num_tokens": 2084661854.0, "step": 4905 }, { "entropy": 0.018613227643072605, "epoch": 2.147062041798884, "grad_norm": 12.1875, "learning_rate": 1.7650243638332432e-05, "loss": 0.1559, "loss_lm": 0.0203833170235157, "loss_seg": 0.1354698296636343, "mean_token_accuracy": 0.9952151477336884, "num_tokens": 2085086903.0, "step": 4906 }, { "entropy": 0.01789708575233817, "epoch": 2.1474997264470947, "grad_norm": 3.015625, "learning_rate": 1.7647536545749865e-05, "loss": 0.0938, "loss_lm": 0.01515421294607222, "loss_seg": 0.07865675259381533, "mean_token_accuracy": 0.9953285604715347, "num_tokens": 2085511570.0, "step": 4907 }, { "entropy": 0.018590537831187248, "epoch": 2.147937411095306, "grad_norm": 7.28125, "learning_rate": 1.76448294531673e-05, "loss": 0.1673, "loss_lm": 0.017015827586874366, "loss_seg": 0.15025473944842815, "mean_token_accuracy": 0.9951338022947311, "num_tokens": 2085936113.0, "step": 4908 }, { "entropy": 0.017917138058692217, "epoch": 2.148375095743517, "grad_norm": 6.78125, "learning_rate": 1.7642122360584733e-05, "loss": 0.0692, "loss_lm": 0.016319792717695236, "loss_seg": 0.05289238318800926, "mean_token_accuracy": 0.9953370094299316, "num_tokens": 2086361325.0, "step": 4909 }, { "entropy": 0.018090741243213415, "epoch": 2.1488127803917276, "grad_norm": 3.6875, "learning_rate": 1.7639415268002163e-05, "loss": 0.1194, "loss_lm": 0.014091372024267912, "loss_seg": 0.10526205413043499, "mean_token_accuracy": 0.9954812377691269, "num_tokens": 2086785506.0, "step": 4910 }, { "entropy": 0.0182765768840909, "epoch": 2.149250465039939, "grad_norm": 5.46875, "learning_rate": 1.76367081754196e-05, "loss": 0.0884, "loss_lm": 0.018003359902650118, "loss_seg": 0.07043368741869926, "mean_token_accuracy": 0.9951182156801224, "num_tokens": 2087210320.0, "step": 4911 }, { "entropy": 0.018309726379811764, "epoch": 2.1496881496881497, "grad_norm": 20.25, "learning_rate": 1.7634001082837034e-05, "loss": 0.0937, "loss_lm": 0.01616120315156877, "loss_seg": 0.07749343104660511, "mean_token_accuracy": 0.995276689529419, "num_tokens": 2087634971.0, "step": 4912 }, { "entropy": 0.017676398623734713, "epoch": 2.1501258343363605, "grad_norm": 3.90625, "learning_rate": 1.7631293990254468e-05, "loss": 0.1028, "loss_lm": 0.015194701496511698, "loss_seg": 0.08758246153593063, "mean_token_accuracy": 0.99542336165905, "num_tokens": 2088059661.0, "step": 4913 }, { "entropy": 0.01803425094112754, "epoch": 2.1505635189845718, "grad_norm": 5.875, "learning_rate": 1.76285868976719e-05, "loss": 0.1257, "loss_lm": 0.01612768671475351, "loss_seg": 0.10960195027291775, "mean_token_accuracy": 0.9952534586191177, "num_tokens": 2088484756.0, "step": 4914 }, { "entropy": 0.018880551680922508, "epoch": 2.1510012036327826, "grad_norm": 5.03125, "learning_rate": 1.7625879805089332e-05, "loss": 0.0937, "loss_lm": 0.015458372654393315, "loss_seg": 0.07825751602649689, "mean_token_accuracy": 0.9950568526983261, "num_tokens": 2088909776.0, "step": 4915 }, { "entropy": 0.018774620722979307, "epoch": 2.1514388882809934, "grad_norm": 5.34375, "learning_rate": 1.762317271250677e-05, "loss": 0.1344, "loss_lm": 0.016460202634334564, "loss_seg": 0.11789949145168066, "mean_token_accuracy": 0.9951828122138977, "num_tokens": 2089336076.0, "step": 4916 }, { "entropy": 0.018037724308669567, "epoch": 2.1518765729292046, "grad_norm": 16.875, "learning_rate": 1.7620465619924203e-05, "loss": 0.1099, "loss_lm": 0.015080847777426243, "loss_seg": 0.09484769962728024, "mean_token_accuracy": 0.9953103065490723, "num_tokens": 2089761201.0, "step": 4917 }, { "entropy": 0.018617928959429264, "epoch": 2.1523142575774155, "grad_norm": 4.25, "learning_rate": 1.7617758527341636e-05, "loss": 0.1332, "loss_lm": 0.017547684023156762, "loss_seg": 0.11562703549861908, "mean_token_accuracy": 0.995091438293457, "num_tokens": 2090186762.0, "step": 4918 }, { "entropy": 0.018424738198518753, "epoch": 2.1527519422256263, "grad_norm": 2.828125, "learning_rate": 1.761505143475907e-05, "loss": 0.1127, "loss_lm": 0.01624845527112484, "loss_seg": 0.09646520297974348, "mean_token_accuracy": 0.9952161461114883, "num_tokens": 2090610862.0, "step": 4919 }, { "entropy": 0.018497134558856487, "epoch": 2.1531896268738375, "grad_norm": 32.0, "learning_rate": 1.76123443421765e-05, "loss": 0.0919, "loss_lm": 0.015138156479224563, "loss_seg": 0.07681178860366344, "mean_token_accuracy": 0.9952154457569122, "num_tokens": 2091035520.0, "step": 4920 }, { "entropy": 0.018623180221766233, "epoch": 2.1536273115220483, "grad_norm": 6.40625, "learning_rate": 1.7609637249593938e-05, "loss": 0.1261, "loss_lm": 0.015291490592062473, "loss_seg": 0.11080075707286596, "mean_token_accuracy": 0.9951312392950058, "num_tokens": 2091460259.0, "step": 4921 }, { "entropy": 0.01788354478776455, "epoch": 2.154064996170259, "grad_norm": 6.09375, "learning_rate": 1.760693015701137e-05, "loss": 0.089, "loss_lm": 0.015298736281692982, "loss_seg": 0.07366467639803886, "mean_token_accuracy": 0.9953555017709732, "num_tokens": 2091885459.0, "step": 4922 }, { "entropy": 0.018471125978976488, "epoch": 2.1545026808184704, "grad_norm": 6.6875, "learning_rate": 1.7604223064428805e-05, "loss": 0.135, "loss_lm": 0.015653796726837754, "loss_seg": 0.11938416957855225, "mean_token_accuracy": 0.995170071721077, "num_tokens": 2092310263.0, "step": 4923 }, { "entropy": 0.018823330756276846, "epoch": 2.154940365466681, "grad_norm": 5.0, "learning_rate": 1.7601515971846235e-05, "loss": 0.1052, "loss_lm": 0.01741394493728876, "loss_seg": 0.08774403110146523, "mean_token_accuracy": 0.9950366318225861, "num_tokens": 2092735560.0, "step": 4924 }, { "entropy": 0.01785005582496524, "epoch": 2.155378050114892, "grad_norm": 6.6875, "learning_rate": 1.759880887926367e-05, "loss": 0.1161, "loss_lm": 0.016120502492412925, "loss_seg": 0.09994365088641644, "mean_token_accuracy": 0.995462104678154, "num_tokens": 2093160365.0, "step": 4925 }, { "entropy": 0.01828758930787444, "epoch": 2.1558157347631033, "grad_norm": 3.40625, "learning_rate": 1.7596101786681106e-05, "loss": 0.1072, "loss_lm": 0.015612120274454355, "loss_seg": 0.09161381050944328, "mean_token_accuracy": 0.9953252673149109, "num_tokens": 2093586231.0, "step": 4926 }, { "entropy": 0.0176635580137372, "epoch": 2.156253419411314, "grad_norm": 4.1875, "learning_rate": 1.759339469409854e-05, "loss": 0.1168, "loss_lm": 0.015664172591641545, "loss_seg": 0.1010918915271759, "mean_token_accuracy": 0.9953606873750687, "num_tokens": 2094010163.0, "step": 4927 }, { "entropy": 0.017910311464220285, "epoch": 2.156691104059525, "grad_norm": 11.4375, "learning_rate": 1.7590687601515974e-05, "loss": 0.1313, "loss_lm": 0.015606053173542023, "loss_seg": 0.11569645442068577, "mean_token_accuracy": 0.9953130632638931, "num_tokens": 2094435128.0, "step": 4928 }, { "entropy": 0.018829812295734882, "epoch": 2.157128788707736, "grad_norm": 8.6875, "learning_rate": 1.7587980508933404e-05, "loss": 0.1206, "loss_lm": 0.016355575993657112, "loss_seg": 0.10427018161863089, "mean_token_accuracy": 0.9950248748064041, "num_tokens": 2094860013.0, "step": 4929 }, { "entropy": 0.018693136051297188, "epoch": 2.157566473355947, "grad_norm": 19.75, "learning_rate": 1.7585273416350838e-05, "loss": 0.0967, "loss_lm": 0.01745728263631463, "loss_seg": 0.079273771494627, "mean_token_accuracy": 0.9952376037836075, "num_tokens": 2095284569.0, "step": 4930 }, { "entropy": 0.01843428984284401, "epoch": 2.1580041580041582, "grad_norm": 4.34375, "learning_rate": 1.7582566323768275e-05, "loss": 0.1678, "loss_lm": 0.015902390936389565, "loss_seg": 0.1518630888313055, "mean_token_accuracy": 0.9951402097940445, "num_tokens": 2095709657.0, "step": 4931 }, { "entropy": 0.01847442239522934, "epoch": 2.158441842652369, "grad_norm": 5.34375, "learning_rate": 1.757985923118571e-05, "loss": 0.1098, "loss_lm": 0.016892176121473312, "loss_seg": 0.09295424446463585, "mean_token_accuracy": 0.9951569139957428, "num_tokens": 2096134555.0, "step": 4932 }, { "entropy": 0.01823405409231782, "epoch": 2.15887952730058, "grad_norm": 18.0, "learning_rate": 1.7577152138603142e-05, "loss": 0.0656, "loss_lm": 0.013462756760418415, "loss_seg": 0.052147882990539074, "mean_token_accuracy": 0.9952569603919983, "num_tokens": 2096560858.0, "step": 4933 }, { "entropy": 0.019133965950459242, "epoch": 2.1593172119487907, "grad_norm": 10.5625, "learning_rate": 1.7574445046020573e-05, "loss": 0.1353, "loss_lm": 0.016912454506382346, "loss_seg": 0.11840962246060371, "mean_token_accuracy": 0.9950383454561234, "num_tokens": 2096986320.0, "step": 4934 }, { "entropy": 0.01741867931559682, "epoch": 2.159754896597002, "grad_norm": 4.625, "learning_rate": 1.7571737953438006e-05, "loss": 0.133, "loss_lm": 0.0184400319121778, "loss_seg": 0.1145530715584755, "mean_token_accuracy": 0.9954223185777664, "num_tokens": 2097411172.0, "step": 4935 }, { "entropy": 0.01835145801305771, "epoch": 2.1601925812452127, "grad_norm": 16.75, "learning_rate": 1.756903086085544e-05, "loss": 0.0926, "loss_lm": 0.01720357476733625, "loss_seg": 0.07536922860890627, "mean_token_accuracy": 0.9952415525913239, "num_tokens": 2097836262.0, "step": 4936 }, { "entropy": 0.018376200925558805, "epoch": 2.160630265893424, "grad_norm": 7.3125, "learning_rate": 1.7566323768272877e-05, "loss": 0.1069, "loss_lm": 0.016003441298380494, "loss_seg": 0.09084781911224127, "mean_token_accuracy": 0.9952514171600342, "num_tokens": 2098260978.0, "step": 4937 }, { "entropy": 0.01847763452678919, "epoch": 2.161067950541635, "grad_norm": 13.5625, "learning_rate": 1.756361667569031e-05, "loss": 0.1222, "loss_lm": 0.01678773481398821, "loss_seg": 0.1054309131577611, "mean_token_accuracy": 0.995242565870285, "num_tokens": 2098685353.0, "step": 4938 }, { "entropy": 0.01819297019392252, "epoch": 2.1615056351898456, "grad_norm": 6.78125, "learning_rate": 1.756090958310774e-05, "loss": 0.0742, "loss_lm": 0.015088943764567375, "loss_seg": 0.059149159118533134, "mean_token_accuracy": 0.9953033328056335, "num_tokens": 2099109448.0, "step": 4939 }, { "entropy": 0.018188197165727615, "epoch": 2.161943319838057, "grad_norm": 17.375, "learning_rate": 1.7558202490525175e-05, "loss": 0.1079, "loss_lm": 0.01669461466372013, "loss_seg": 0.09118759632110596, "mean_token_accuracy": 0.995231568813324, "num_tokens": 2099534304.0, "step": 4940 }, { "entropy": 0.01874330686405301, "epoch": 2.1623810044862677, "grad_norm": 3.65625, "learning_rate": 1.755549539794261e-05, "loss": 0.109, "loss_lm": 0.015338426688686013, "loss_seg": 0.09368532057851553, "mean_token_accuracy": 0.995083674788475, "num_tokens": 2099959587.0, "step": 4941 }, { "entropy": 0.018480228260159492, "epoch": 2.1628186891344785, "grad_norm": 3.296875, "learning_rate": 1.7552788305360046e-05, "loss": 0.1012, "loss_lm": 0.0167783061042428, "loss_seg": 0.08446221239864826, "mean_token_accuracy": 0.9951215833425522, "num_tokens": 2100384905.0, "step": 4942 }, { "entropy": 0.01755593530833721, "epoch": 2.1632563737826898, "grad_norm": 2.796875, "learning_rate": 1.755008121277748e-05, "loss": 0.0836, "loss_lm": 0.016283018747344613, "loss_seg": 0.06727612856775522, "mean_token_accuracy": 0.9953559786081314, "num_tokens": 2100809486.0, "step": 4943 }, { "entropy": 0.017732268199324608, "epoch": 2.1636940584309006, "grad_norm": 4.65625, "learning_rate": 1.754737412019491e-05, "loss": 0.0806, "loss_lm": 0.016813748981803656, "loss_seg": 0.06377529632300138, "mean_token_accuracy": 0.9953067749738693, "num_tokens": 2101234940.0, "step": 4944 }, { "entropy": 0.018299872986972332, "epoch": 2.1641317430791114, "grad_norm": 5.65625, "learning_rate": 1.7544667027612344e-05, "loss": 0.0903, "loss_lm": 0.015635346062481403, "loss_seg": 0.07467328198254108, "mean_token_accuracy": 0.9952014535665512, "num_tokens": 2101660652.0, "step": 4945 }, { "entropy": 0.017927115317434072, "epoch": 2.1645694277273226, "grad_norm": 6.75, "learning_rate": 1.7541959935029777e-05, "loss": 0.0925, "loss_lm": 0.015013860072940588, "loss_seg": 0.07750453241169453, "mean_token_accuracy": 0.9953508526086807, "num_tokens": 2102085616.0, "step": 4946 }, { "entropy": 0.018281217198818922, "epoch": 2.1650071123755334, "grad_norm": 4.46875, "learning_rate": 1.7539252842447215e-05, "loss": 0.0728, "loss_lm": 0.016399848042055964, "loss_seg": 0.056357341818511486, "mean_token_accuracy": 0.9951547533273697, "num_tokens": 2102510102.0, "step": 4947 }, { "entropy": 0.01780876936390996, "epoch": 2.1654447970237443, "grad_norm": 4.4375, "learning_rate": 1.7536545749864645e-05, "loss": 0.0863, "loss_lm": 0.014379187021404505, "loss_seg": 0.07187426183372736, "mean_token_accuracy": 0.9952742159366608, "num_tokens": 2102935387.0, "step": 4948 }, { "entropy": 0.01840497925877571, "epoch": 2.1658824816719555, "grad_norm": 3.921875, "learning_rate": 1.753383865728208e-05, "loss": 0.0875, "loss_lm": 0.015590623952448368, "loss_seg": 0.07191594038158655, "mean_token_accuracy": 0.9952787756919861, "num_tokens": 2103360021.0, "step": 4949 }, { "entropy": 0.018382042180746794, "epoch": 2.1663201663201663, "grad_norm": 8.9375, "learning_rate": 1.7531131564699512e-05, "loss": 0.074, "loss_lm": 0.01575085474178195, "loss_seg": 0.058297001756727695, "mean_token_accuracy": 0.9952214360237122, "num_tokens": 2103784588.0, "step": 4950 }, { "entropy": 0.01839928049594164, "epoch": 2.166757850968377, "grad_norm": 5.5625, "learning_rate": 1.7528424472116946e-05, "loss": 0.0889, "loss_lm": 0.017280951607972383, "loss_seg": 0.07166753988713026, "mean_token_accuracy": 0.9952187836170197, "num_tokens": 2104209551.0, "step": 4951 }, { "entropy": 0.018598941154778004, "epoch": 2.1671955356165884, "grad_norm": 4.4375, "learning_rate": 1.7525717379534383e-05, "loss": 0.0983, "loss_lm": 0.01677307952195406, "loss_seg": 0.0815394427627325, "mean_token_accuracy": 0.9952390342950821, "num_tokens": 2104633995.0, "step": 4952 }, { "entropy": 0.017581325490027666, "epoch": 2.167633220264799, "grad_norm": 10.6875, "learning_rate": 1.7523010286951813e-05, "loss": 0.0854, "loss_lm": 0.016510997666046023, "loss_seg": 0.06888562999665737, "mean_token_accuracy": 0.995419904589653, "num_tokens": 2105058995.0, "step": 4953 }, { "entropy": 0.018684109672904015, "epoch": 2.16807090491301, "grad_norm": 20.25, "learning_rate": 1.7520303194369247e-05, "loss": 0.1358, "loss_lm": 0.01732005039229989, "loss_seg": 0.1184601690620184, "mean_token_accuracy": 0.9951492249965668, "num_tokens": 2105484678.0, "step": 4954 }, { "entropy": 0.018302007112652063, "epoch": 2.1685085895612213, "grad_norm": 4.8125, "learning_rate": 1.751759610178668e-05, "loss": 0.1103, "loss_lm": 0.017350374488160014, "loss_seg": 0.09295879118144512, "mean_token_accuracy": 0.9951640367507935, "num_tokens": 2105909371.0, "step": 4955 }, { "entropy": 0.018353412859141827, "epoch": 2.168946274209432, "grad_norm": 6.03125, "learning_rate": 1.7514889009204115e-05, "loss": 0.1222, "loss_lm": 0.016150682466104627, "loss_seg": 0.1060404684394598, "mean_token_accuracy": 0.9952229857444763, "num_tokens": 2106335109.0, "step": 4956 }, { "entropy": 0.018194732256233692, "epoch": 2.169383958857643, "grad_norm": 5.3125, "learning_rate": 1.7512181916621552e-05, "loss": 0.1194, "loss_lm": 0.013565207598730922, "loss_seg": 0.10587308183312416, "mean_token_accuracy": 0.9952928721904755, "num_tokens": 2106760570.0, "step": 4957 }, { "entropy": 0.017912486102432013, "epoch": 2.169821643505854, "grad_norm": 9.875, "learning_rate": 1.7509474824038982e-05, "loss": 0.125, "loss_lm": 0.014372504549100995, "loss_seg": 0.11064610816538334, "mean_token_accuracy": 0.9952870160341263, "num_tokens": 2107185084.0, "step": 4958 }, { "entropy": 0.018653789069503546, "epoch": 2.170259328154065, "grad_norm": 6.84375, "learning_rate": 1.7506767731456416e-05, "loss": 0.132, "loss_lm": 0.01918864483013749, "loss_seg": 0.11284290440380573, "mean_token_accuracy": 0.995062842965126, "num_tokens": 2107610180.0, "step": 4959 }, { "entropy": 0.018065585289150476, "epoch": 2.170697012802276, "grad_norm": 7.625, "learning_rate": 1.750406063887385e-05, "loss": 0.0934, "loss_lm": 0.016394118778407574, "loss_seg": 0.07702377811074257, "mean_token_accuracy": 0.9953387677669525, "num_tokens": 2108035321.0, "step": 4960 }, { "entropy": 0.0184163898229599, "epoch": 2.171134697450487, "grad_norm": 6.71875, "learning_rate": 1.7501353546291283e-05, "loss": 0.0992, "loss_lm": 0.016355395782738924, "loss_seg": 0.08287023566663265, "mean_token_accuracy": 0.9952228367328644, "num_tokens": 2108461480.0, "step": 4961 }, { "entropy": 0.01783077698200941, "epoch": 2.171572382098698, "grad_norm": 4.5, "learning_rate": 1.749864645370872e-05, "loss": 0.1013, "loss_lm": 0.017016992904245853, "loss_seg": 0.08433247543871403, "mean_token_accuracy": 0.9951943457126617, "num_tokens": 2108886549.0, "step": 4962 }, { "entropy": 0.017797111999243498, "epoch": 2.1720100667469087, "grad_norm": 11.875, "learning_rate": 1.749593936112615e-05, "loss": 0.0996, "loss_lm": 0.014274227898567915, "loss_seg": 0.08533044531941414, "mean_token_accuracy": 0.995354950428009, "num_tokens": 2109311498.0, "step": 4963 }, { "entropy": 0.018327617086470127, "epoch": 2.17244775139512, "grad_norm": 4.84375, "learning_rate": 1.7493232268543584e-05, "loss": 0.0941, "loss_lm": 0.017493994440883398, "loss_seg": 0.07658151909708977, "mean_token_accuracy": 0.9951927810907364, "num_tokens": 2109736435.0, "step": 4964 }, { "entropy": 0.01820961618795991, "epoch": 2.1728854360433307, "grad_norm": 13.875, "learning_rate": 1.7490525175961018e-05, "loss": 0.1346, "loss_lm": 0.016973930643871427, "loss_seg": 0.1176042053848505, "mean_token_accuracy": 0.995148092508316, "num_tokens": 2110161581.0, "step": 4965 }, { "entropy": 0.01846255036070943, "epoch": 2.1733231206915415, "grad_norm": 6.09375, "learning_rate": 1.7487818083378452e-05, "loss": 0.0943, "loss_lm": 0.016204393235966563, "loss_seg": 0.07809862308204174, "mean_token_accuracy": 0.9950691163539886, "num_tokens": 2110587047.0, "step": 4966 }, { "entropy": 0.018565374426543713, "epoch": 2.173760805339753, "grad_norm": 6.3125, "learning_rate": 1.7485110990795886e-05, "loss": 0.1224, "loss_lm": 0.017675024922937155, "loss_seg": 0.1047276183962822, "mean_token_accuracy": 0.9952670335769653, "num_tokens": 2111012750.0, "step": 4967 }, { "entropy": 0.017964805010706186, "epoch": 2.1741984899879636, "grad_norm": 12.6875, "learning_rate": 1.748240389821332e-05, "loss": 0.091, "loss_lm": 0.01794646750204265, "loss_seg": 0.07302060909569263, "mean_token_accuracy": 0.9953167140483856, "num_tokens": 2111437818.0, "step": 4968 }, { "entropy": 0.019015981815755367, "epoch": 2.1746361746361744, "grad_norm": 14.9375, "learning_rate": 1.7479696805630753e-05, "loss": 0.1133, "loss_lm": 0.015603878069669008, "loss_seg": 0.09772234410047531, "mean_token_accuracy": 0.9950742125511169, "num_tokens": 2111863117.0, "step": 4969 }, { "entropy": 0.017714972142130136, "epoch": 2.1750738592843857, "grad_norm": 43.0, "learning_rate": 1.7476989713048187e-05, "loss": 0.0822, "loss_lm": 0.015594178112223744, "loss_seg": 0.0666486443951726, "mean_token_accuracy": 0.9954655766487122, "num_tokens": 2112289109.0, "step": 4970 }, { "entropy": 0.01821317756548524, "epoch": 2.1755115439325965, "grad_norm": 7.15625, "learning_rate": 1.747428262046562e-05, "loss": 0.0995, "loss_lm": 0.019289350137114525, "loss_seg": 0.08024469949305058, "mean_token_accuracy": 0.995224729180336, "num_tokens": 2112714389.0, "step": 4971 }, { "entropy": 0.01883308170363307, "epoch": 2.1759492285808077, "grad_norm": 4.75, "learning_rate": 1.747157552788305e-05, "loss": 0.1221, "loss_lm": 0.016596740810200572, "loss_seg": 0.10553860943764448, "mean_token_accuracy": 0.9950951784849167, "num_tokens": 2113139470.0, "step": 4972 }, { "entropy": 0.01813308661803603, "epoch": 2.1763869132290186, "grad_norm": 4.90625, "learning_rate": 1.7468868435300488e-05, "loss": 0.0924, "loss_lm": 0.016186873894184828, "loss_seg": 0.07618392817676067, "mean_token_accuracy": 0.9952008426189423, "num_tokens": 2113563678.0, "step": 4973 }, { "entropy": 0.018585793673992157, "epoch": 2.1768245978772294, "grad_norm": 4.71875, "learning_rate": 1.7466161342717922e-05, "loss": 0.0982, "loss_lm": 0.018226824700832367, "loss_seg": 0.07993192225694656, "mean_token_accuracy": 0.9951702505350113, "num_tokens": 2113989340.0, "step": 4974 }, { "entropy": 0.018116462044417858, "epoch": 2.17726228252544, "grad_norm": 5.5, "learning_rate": 1.7463454250135355e-05, "loss": 0.1556, "loss_lm": 0.01690902467817068, "loss_seg": 0.13865749165415764, "mean_token_accuracy": 0.9952399283647537, "num_tokens": 2114413881.0, "step": 4975 }, { "entropy": 0.018024058546870947, "epoch": 2.1776999671736514, "grad_norm": 3.859375, "learning_rate": 1.746074715755279e-05, "loss": 0.1296, "loss_lm": 0.0163846577052027, "loss_seg": 0.11318977922201157, "mean_token_accuracy": 0.9953026622533798, "num_tokens": 2114838336.0, "step": 4976 }, { "entropy": 0.01772743882611394, "epoch": 2.1781376518218623, "grad_norm": 13.5, "learning_rate": 1.745804006497022e-05, "loss": 0.1167, "loss_lm": 0.015108249615877867, "loss_seg": 0.10154630057513714, "mean_token_accuracy": 0.9954055696725845, "num_tokens": 2115263137.0, "step": 4977 }, { "entropy": 0.018619880080223083, "epoch": 2.1785753364700735, "grad_norm": 4.28125, "learning_rate": 1.7455332972387657e-05, "loss": 0.1098, "loss_lm": 0.01774743082933128, "loss_seg": 0.09200584702193737, "mean_token_accuracy": 0.9952287971973419, "num_tokens": 2115688554.0, "step": 4978 }, { "entropy": 0.01841592276468873, "epoch": 2.1790130211182843, "grad_norm": 8.0625, "learning_rate": 1.745262587980509e-05, "loss": 0.1569, "loss_lm": 0.016123312525451183, "loss_seg": 0.1407337225973606, "mean_token_accuracy": 0.9952249526977539, "num_tokens": 2116113209.0, "step": 4979 }, { "entropy": 0.018524352926760912, "epoch": 2.179450705766495, "grad_norm": 6.75, "learning_rate": 1.7449918787222524e-05, "loss": 0.1013, "loss_lm": 0.01527450350113213, "loss_seg": 0.08604396414011717, "mean_token_accuracy": 0.9950791150331497, "num_tokens": 2116538116.0, "step": 4980 }, { "entropy": 0.01890270970761776, "epoch": 2.1798883904147064, "grad_norm": 14.8125, "learning_rate": 1.7447211694639958e-05, "loss": 0.1064, "loss_lm": 0.017728167353197932, "loss_seg": 0.08869842067360878, "mean_token_accuracy": 0.9950557351112366, "num_tokens": 2116962828.0, "step": 4981 }, { "entropy": 0.018077599816024303, "epoch": 2.180326075062917, "grad_norm": 17.375, "learning_rate": 1.7444504602057388e-05, "loss": 0.084, "loss_lm": 0.015165615128353238, "loss_seg": 0.0688342098146677, "mean_token_accuracy": 0.9953335523605347, "num_tokens": 2117387084.0, "step": 4982 }, { "entropy": 0.018084536772221327, "epoch": 2.180763759711128, "grad_norm": 5.59375, "learning_rate": 1.7441797509474825e-05, "loss": 0.1216, "loss_lm": 0.015093185007572174, "loss_seg": 0.10653505939990282, "mean_token_accuracy": 0.9953146278858185, "num_tokens": 2117812188.0, "step": 4983 }, { "entropy": 0.01812858460471034, "epoch": 2.1812014443593393, "grad_norm": 9.25, "learning_rate": 1.743909041689226e-05, "loss": 0.0988, "loss_lm": 0.015862583182752132, "loss_seg": 0.0829429179430008, "mean_token_accuracy": 0.995391771197319, "num_tokens": 2118236818.0, "step": 4984 }, { "entropy": 0.018600584007799625, "epoch": 2.18163912900755, "grad_norm": 5.34375, "learning_rate": 1.7436383324309693e-05, "loss": 0.0933, "loss_lm": 0.016089802142232656, "loss_seg": 0.07719525136053562, "mean_token_accuracy": 0.9952034205198288, "num_tokens": 2118661855.0, "step": 4985 }, { "entropy": 0.01887343218550086, "epoch": 2.182076813655761, "grad_norm": 2.90625, "learning_rate": 1.7433676231727126e-05, "loss": 0.1244, "loss_lm": 0.01719645014964044, "loss_seg": 0.10720696579664946, "mean_token_accuracy": 0.9949899315834045, "num_tokens": 2119086574.0, "step": 4986 }, { "entropy": 0.017827112693339586, "epoch": 2.182514498303972, "grad_norm": 8.25, "learning_rate": 1.7430969139144557e-05, "loss": 0.1081, "loss_lm": 0.013623821549117565, "loss_seg": 0.09444732218980789, "mean_token_accuracy": 0.995280459523201, "num_tokens": 2119511758.0, "step": 4987 }, { "entropy": 0.018307315185666084, "epoch": 2.182952182952183, "grad_norm": 7.375, "learning_rate": 1.7428262046561994e-05, "loss": 0.0888, "loss_lm": 0.012883952120319009, "loss_seg": 0.07596379145979881, "mean_token_accuracy": 0.9952390938997269, "num_tokens": 2119936421.0, "step": 4988 }, { "entropy": 0.018692054320126772, "epoch": 2.1833898676003938, "grad_norm": 5.28125, "learning_rate": 1.7425554953979428e-05, "loss": 0.1177, "loss_lm": 0.015561219304800034, "loss_seg": 0.10211220011115074, "mean_token_accuracy": 0.9951437562704086, "num_tokens": 2120362649.0, "step": 4989 }, { "entropy": 0.018221059814095497, "epoch": 2.183827552248605, "grad_norm": 7.0, "learning_rate": 1.742284786139686e-05, "loss": 0.1242, "loss_lm": 0.01636953791603446, "loss_seg": 0.10779872164130211, "mean_token_accuracy": 0.9952013343572617, "num_tokens": 2120787906.0, "step": 4990 }, { "entropy": 0.017625227104872465, "epoch": 2.184265236896816, "grad_norm": 4.1875, "learning_rate": 1.7420140768814295e-05, "loss": 0.1155, "loss_lm": 0.01598509238101542, "loss_seg": 0.09946716204285622, "mean_token_accuracy": 0.9954712837934494, "num_tokens": 2121212687.0, "step": 4991 }, { "entropy": 0.018085326068103313, "epoch": 2.1847029215450267, "grad_norm": 8.125, "learning_rate": 1.7417433676231725e-05, "loss": 0.0982, "loss_lm": 0.015429180348291993, "loss_seg": 0.08280797209590673, "mean_token_accuracy": 0.9953236132860184, "num_tokens": 2121637126.0, "step": 4992 }, { "entropy": 0.01857024012133479, "epoch": 2.185140606193238, "grad_norm": 13.8125, "learning_rate": 1.7414726583649163e-05, "loss": 0.1349, "loss_lm": 0.015115463407710195, "loss_seg": 0.11981413699686527, "mean_token_accuracy": 0.9951963722705841, "num_tokens": 2122062376.0, "step": 4993 }, { "entropy": 0.018175643868744373, "epoch": 2.1855782908414487, "grad_norm": 4.78125, "learning_rate": 1.7412019491066596e-05, "loss": 0.0949, "loss_lm": 0.015603763051331043, "loss_seg": 0.07926472835242748, "mean_token_accuracy": 0.9952993839979172, "num_tokens": 2122486978.0, "step": 4994 }, { "entropy": 0.01817991305142641, "epoch": 2.1860159754896595, "grad_norm": 3.734375, "learning_rate": 1.740931239848403e-05, "loss": 0.0872, "loss_lm": 0.01691414788365364, "loss_seg": 0.0703254472464323, "mean_token_accuracy": 0.9953189343214035, "num_tokens": 2122911849.0, "step": 4995 }, { "entropy": 0.018006551079452038, "epoch": 2.186453660137871, "grad_norm": 5.5, "learning_rate": 1.740660530590146e-05, "loss": 0.1085, "loss_lm": 0.013295992277562618, "loss_seg": 0.0951830092817545, "mean_token_accuracy": 0.995371401309967, "num_tokens": 2123336761.0, "step": 4996 }, { "entropy": 0.018693627789616585, "epoch": 2.1868913447860816, "grad_norm": 4.53125, "learning_rate": 1.7403898213318894e-05, "loss": 0.1315, "loss_lm": 0.016197703313082457, "loss_seg": 0.11528266966342926, "mean_token_accuracy": 0.9951706081628799, "num_tokens": 2123761628.0, "step": 4997 }, { "entropy": 0.017944798804819584, "epoch": 2.1873290294342924, "grad_norm": 4.84375, "learning_rate": 1.7401191120736328e-05, "loss": 0.0757, "loss_lm": 0.016740720253437757, "loss_seg": 0.05899031274020672, "mean_token_accuracy": 0.9953272491693497, "num_tokens": 2124185982.0, "step": 4998 }, { "entropy": 0.018609097227454185, "epoch": 2.1877667140825037, "grad_norm": 4.75, "learning_rate": 1.7398484028153765e-05, "loss": 0.0997, "loss_lm": 0.01735484995879233, "loss_seg": 0.08233172819018364, "mean_token_accuracy": 0.9952367544174194, "num_tokens": 2124611714.0, "step": 4999 }, { "entropy": 0.018992920871824026, "epoch": 2.1882043987307145, "grad_norm": 4.5625, "learning_rate": 1.73957769355712e-05, "loss": 0.108, "loss_lm": 0.016885344171896577, "loss_seg": 0.09114030748605728, "mean_token_accuracy": 0.9949952214956284, "num_tokens": 2125037064.0, "step": 5000 }, { "entropy": 0.017814103979617357, "epoch": 2.1886420833789253, "grad_norm": 8.0625, "learning_rate": 1.739306984298863e-05, "loss": 0.1032, "loss_lm": 0.016600007424131036, "loss_seg": 0.08659818209707737, "mean_token_accuracy": 0.9954260289669037, "num_tokens": 2125462226.0, "step": 5001 }, { "entropy": 0.01815274776890874, "epoch": 2.1890797680271366, "grad_norm": 2.609375, "learning_rate": 1.7390362750406063e-05, "loss": 0.0967, "loss_lm": 0.015485636424273252, "loss_seg": 0.08122785203158855, "mean_token_accuracy": 0.9952546954154968, "num_tokens": 2125887448.0, "step": 5002 }, { "entropy": 0.018130358774214983, "epoch": 2.1895174526753474, "grad_norm": 23.25, "learning_rate": 1.7387655657823496e-05, "loss": 0.1232, "loss_lm": 0.015471048653125763, "loss_seg": 0.10769006051123142, "mean_token_accuracy": 0.995318815112114, "num_tokens": 2126313162.0, "step": 5003 }, { "entropy": 0.017909384332597256, "epoch": 2.189955137323558, "grad_norm": 43.0, "learning_rate": 1.7384948565240934e-05, "loss": 0.1199, "loss_lm": 0.015304988482967019, "loss_seg": 0.10459142737090588, "mean_token_accuracy": 0.9952920973300934, "num_tokens": 2126737935.0, "step": 5004 }, { "entropy": 0.018569032195955515, "epoch": 2.1903928219717694, "grad_norm": 3.53125, "learning_rate": 1.7382241472658367e-05, "loss": 0.0878, "loss_lm": 0.017666073516011238, "loss_seg": 0.07009870000183582, "mean_token_accuracy": 0.9952257871627808, "num_tokens": 2127162915.0, "step": 5005 }, { "entropy": 0.018134379293769598, "epoch": 2.1908305066199802, "grad_norm": 8.75, "learning_rate": 1.7379534380075798e-05, "loss": 0.1108, "loss_lm": 0.01747790421359241, "loss_seg": 0.09327906183898449, "mean_token_accuracy": 0.9952449947595596, "num_tokens": 2127587491.0, "step": 5006 }, { "entropy": 0.01795496279373765, "epoch": 2.1912681912681915, "grad_norm": 6.84375, "learning_rate": 1.737682728749323e-05, "loss": 0.0932, "loss_lm": 0.01744739804416895, "loss_seg": 0.07573524862527847, "mean_token_accuracy": 0.9953796565532684, "num_tokens": 2128011822.0, "step": 5007 }, { "entropy": 0.018086102791130543, "epoch": 2.1917058759164023, "grad_norm": 3.46875, "learning_rate": 1.7374120194910665e-05, "loss": 0.0943, "loss_lm": 0.017669642344117165, "loss_seg": 0.07658078707754612, "mean_token_accuracy": 0.9953034967184067, "num_tokens": 2128436804.0, "step": 5008 }, { "entropy": 0.01837318204343319, "epoch": 2.192143560564613, "grad_norm": 3.296875, "learning_rate": 1.7371413102328102e-05, "loss": 0.1149, "loss_lm": 0.0160954212769866, "loss_seg": 0.09879185818135738, "mean_token_accuracy": 0.9952674061059952, "num_tokens": 2128861437.0, "step": 5009 }, { "entropy": 0.01787460595369339, "epoch": 2.192581245212824, "grad_norm": 12.1875, "learning_rate": 1.7368706009745536e-05, "loss": 0.0841, "loss_lm": 0.016887781443074346, "loss_seg": 0.06723521556705236, "mean_token_accuracy": 0.9953790903091431, "num_tokens": 2129286830.0, "step": 5010 }, { "entropy": 0.01841255323961377, "epoch": 2.193018929861035, "grad_norm": 4.53125, "learning_rate": 1.7365998917162966e-05, "loss": 0.1013, "loss_lm": 0.016989631112664938, "loss_seg": 0.0842655273154378, "mean_token_accuracy": 0.9952625185251236, "num_tokens": 2129711892.0, "step": 5011 }, { "entropy": 0.017929209396243095, "epoch": 2.193456614509246, "grad_norm": 9.0625, "learning_rate": 1.73632918245804e-05, "loss": 0.136, "loss_lm": 0.01883421209640801, "loss_seg": 0.11716276779770851, "mean_token_accuracy": 0.9952639937400818, "num_tokens": 2130136895.0, "step": 5012 }, { "entropy": 0.017834306228905916, "epoch": 2.1938942991574573, "grad_norm": 21.375, "learning_rate": 1.7360584731997834e-05, "loss": 0.0873, "loss_lm": 0.01846823887899518, "loss_seg": 0.06886461190879345, "mean_token_accuracy": 0.9951495081186295, "num_tokens": 2130561361.0, "step": 5013 }, { "entropy": 0.017629985697567463, "epoch": 2.194331983805668, "grad_norm": 3.890625, "learning_rate": 1.735787763941527e-05, "loss": 0.1464, "loss_lm": 0.014622828923165798, "loss_seg": 0.13180864043533802, "mean_token_accuracy": 0.995331272482872, "num_tokens": 2130986687.0, "step": 5014 }, { "entropy": 0.017635255586355925, "epoch": 2.194769668453879, "grad_norm": 19.125, "learning_rate": 1.7355170546832705e-05, "loss": 0.1495, "loss_lm": 0.016735790064558387, "loss_seg": 0.1327654141932726, "mean_token_accuracy": 0.9953628927469254, "num_tokens": 2131411654.0, "step": 5015 }, { "entropy": 0.018169187009334564, "epoch": 2.19520735310209, "grad_norm": 3.765625, "learning_rate": 1.7352463454250135e-05, "loss": 0.1155, "loss_lm": 0.015057059237733483, "loss_seg": 0.10039622336626053, "mean_token_accuracy": 0.9952895641326904, "num_tokens": 2131836213.0, "step": 5016 }, { "entropy": 0.01833698758855462, "epoch": 2.195645037750301, "grad_norm": 9.9375, "learning_rate": 1.734975636166757e-05, "loss": 0.1484, "loss_lm": 0.0164808661211282, "loss_seg": 0.13189372047781944, "mean_token_accuracy": 0.9951307326555252, "num_tokens": 2132261090.0, "step": 5017 }, { "entropy": 0.0181358871050179, "epoch": 2.1960827223985118, "grad_norm": 3.921875, "learning_rate": 1.7347049269085002e-05, "loss": 0.1125, "loss_lm": 0.015345147112384439, "loss_seg": 0.09712865017354488, "mean_token_accuracy": 0.9952873438596725, "num_tokens": 2132686076.0, "step": 5018 }, { "entropy": 0.018775527831166983, "epoch": 2.196520407046723, "grad_norm": 6.90625, "learning_rate": 1.734434217650244e-05, "loss": 0.1054, "loss_lm": 0.015963942278176546, "loss_seg": 0.08946926798671484, "mean_token_accuracy": 0.9950881153345108, "num_tokens": 2133110951.0, "step": 5019 }, { "entropy": 0.01970671070739627, "epoch": 2.196958091694934, "grad_norm": 8.8125, "learning_rate": 1.734163508391987e-05, "loss": 0.0775, "loss_lm": 0.015926378313452005, "loss_seg": 0.06154496502131224, "mean_token_accuracy": 0.9948251694440842, "num_tokens": 2133536473.0, "step": 5020 }, { "entropy": 0.018058760557323694, "epoch": 2.1973957763431446, "grad_norm": 10.1875, "learning_rate": 1.7338927991337303e-05, "loss": 0.1728, "loss_lm": 0.016319807386025786, "loss_seg": 0.15649554785341024, "mean_token_accuracy": 0.9952792674303055, "num_tokens": 2133961596.0, "step": 5021 }, { "entropy": 0.0183420293033123, "epoch": 2.197833460991356, "grad_norm": 7.4375, "learning_rate": 1.7336220898754737e-05, "loss": 0.1234, "loss_lm": 0.015636930242180824, "loss_seg": 0.1078114490956068, "mean_token_accuracy": 0.9952418506145477, "num_tokens": 2134386284.0, "step": 5022 }, { "entropy": 0.017847916577011347, "epoch": 2.1982711456395667, "grad_norm": 3.71875, "learning_rate": 1.733351380617217e-05, "loss": 0.1168, "loss_lm": 0.015460141003131866, "loss_seg": 0.1013458464294672, "mean_token_accuracy": 0.9953152984380722, "num_tokens": 2134811457.0, "step": 5023 }, { "entropy": 0.017753054853528738, "epoch": 2.1987088302877775, "grad_norm": 4.875, "learning_rate": 1.7330806713589608e-05, "loss": 0.0976, "loss_lm": 0.017341677797958255, "loss_seg": 0.08027151226997375, "mean_token_accuracy": 0.9953053742647171, "num_tokens": 2135236584.0, "step": 5024 }, { "entropy": 0.018818905111402273, "epoch": 2.199146514935989, "grad_norm": 4.34375, "learning_rate": 1.732809962100704e-05, "loss": 0.1581, "loss_lm": 0.017512677237391472, "loss_seg": 0.1405828893184662, "mean_token_accuracy": 0.9951455444097519, "num_tokens": 2135661320.0, "step": 5025 }, { "entropy": 0.018698879051953554, "epoch": 2.1995841995841996, "grad_norm": 11.75, "learning_rate": 1.7325392528424472e-05, "loss": 0.0895, "loss_lm": 0.016233891481533647, "loss_seg": 0.07324241660535336, "mean_token_accuracy": 0.9950098544359207, "num_tokens": 2136087360.0, "step": 5026 }, { "entropy": 0.01803963305428624, "epoch": 2.2000218842324104, "grad_norm": 8.25, "learning_rate": 1.7322685435841906e-05, "loss": 0.1168, "loss_lm": 0.015929135959595442, "loss_seg": 0.1008693091571331, "mean_token_accuracy": 0.9954304099082947, "num_tokens": 2136512509.0, "step": 5027 }, { "entropy": 0.017933042254298925, "epoch": 2.2004595688806217, "grad_norm": 6.6875, "learning_rate": 1.731997834325934e-05, "loss": 0.1261, "loss_lm": 0.015392012428492308, "loss_seg": 0.11073356308043003, "mean_token_accuracy": 0.9952635914087296, "num_tokens": 2136938113.0, "step": 5028 }, { "entropy": 0.017761139664798975, "epoch": 2.2008972535288325, "grad_norm": 3.75, "learning_rate": 1.7317271250676773e-05, "loss": 0.1462, "loss_lm": 0.014991313684731722, "loss_seg": 0.13125420361757278, "mean_token_accuracy": 0.9953639805316925, "num_tokens": 2137362994.0, "step": 5029 }, { "entropy": 0.018288119230419397, "epoch": 2.2013349381770433, "grad_norm": 4.53125, "learning_rate": 1.7314564158094207e-05, "loss": 0.1168, "loss_lm": 0.01589250983670354, "loss_seg": 0.10086338967084885, "mean_token_accuracy": 0.9951481968164444, "num_tokens": 2137788377.0, "step": 5030 }, { "entropy": 0.018414506688714027, "epoch": 2.2017726228252545, "grad_norm": 9.375, "learning_rate": 1.731185706551164e-05, "loss": 0.145, "loss_lm": 0.013396880123764277, "loss_seg": 0.13160377368330956, "mean_token_accuracy": 0.9951653927564621, "num_tokens": 2138213205.0, "step": 5031 }, { "entropy": 0.018132259603589773, "epoch": 2.2022103074734654, "grad_norm": 3.21875, "learning_rate": 1.7309149972929074e-05, "loss": 0.1434, "loss_lm": 0.015350605128332973, "loss_seg": 0.12807520851492882, "mean_token_accuracy": 0.9951849579811096, "num_tokens": 2138638282.0, "step": 5032 }, { "entropy": 0.01829342171549797, "epoch": 2.202647992121676, "grad_norm": 3.9375, "learning_rate": 1.7306442880346508e-05, "loss": 0.1164, "loss_lm": 0.01725383778102696, "loss_seg": 0.09912806190550327, "mean_token_accuracy": 0.9952331185340881, "num_tokens": 2139063471.0, "step": 5033 }, { "entropy": 0.01831975718960166, "epoch": 2.2030856767698874, "grad_norm": 10.8125, "learning_rate": 1.7303735787763942e-05, "loss": 0.1156, "loss_lm": 0.014581486582756042, "loss_seg": 0.10106125473976135, "mean_token_accuracy": 0.9953294396400452, "num_tokens": 2139488881.0, "step": 5034 }, { "entropy": 0.01897261617705226, "epoch": 2.2035233614180982, "grad_norm": 9.3125, "learning_rate": 1.7301028695181376e-05, "loss": 0.0899, "loss_lm": 0.015090605476871133, "loss_seg": 0.07482391782104969, "mean_token_accuracy": 0.9951059520244598, "num_tokens": 2139914074.0, "step": 5035 }, { "entropy": 0.018231189344078302, "epoch": 2.203961046066309, "grad_norm": 2.953125, "learning_rate": 1.729832160259881e-05, "loss": 0.0737, "loss_lm": 0.01547031756490469, "loss_seg": 0.0582168772816658, "mean_token_accuracy": 0.9952430725097656, "num_tokens": 2140339717.0, "step": 5036 }, { "entropy": 0.01915248529985547, "epoch": 2.2043987307145203, "grad_norm": 4.65625, "learning_rate": 1.7295614510016243e-05, "loss": 0.14, "loss_lm": 0.018875237787142396, "loss_seg": 0.12110353447496891, "mean_token_accuracy": 0.9950588643550873, "num_tokens": 2140765504.0, "step": 5037 }, { "entropy": 0.01843757787719369, "epoch": 2.204836415362731, "grad_norm": 3.40625, "learning_rate": 1.7292907417433677e-05, "loss": 0.1096, "loss_lm": 0.017732044449076056, "loss_seg": 0.0918456818908453, "mean_token_accuracy": 0.9951718002557755, "num_tokens": 2141191089.0, "step": 5038 }, { "entropy": 0.01900359569117427, "epoch": 2.205274100010942, "grad_norm": 7.15625, "learning_rate": 1.7290200324851107e-05, "loss": 0.1233, "loss_lm": 0.01787254842929542, "loss_seg": 0.10538972914218903, "mean_token_accuracy": 0.995021864771843, "num_tokens": 2141616511.0, "step": 5039 }, { "entropy": 0.018080597277730703, "epoch": 2.205711784659153, "grad_norm": 11.8125, "learning_rate": 1.7287493232268544e-05, "loss": 0.1288, "loss_lm": 0.016946222400292754, "loss_seg": 0.11184360086917877, "mean_token_accuracy": 0.9953595697879791, "num_tokens": 2142041452.0, "step": 5040 }, { "entropy": 0.01793520851060748, "epoch": 2.206149469307364, "grad_norm": 14.0, "learning_rate": 1.7284786139685978e-05, "loss": 0.1081, "loss_lm": 0.01724604540504515, "loss_seg": 0.09081442840397358, "mean_token_accuracy": 0.9953330457210541, "num_tokens": 2142466510.0, "step": 5041 }, { "entropy": 0.01817224407568574, "epoch": 2.206587153955575, "grad_norm": 8.3125, "learning_rate": 1.7282079047103412e-05, "loss": 0.0898, "loss_lm": 0.01672629825770855, "loss_seg": 0.07303616218268871, "mean_token_accuracy": 0.9953772574663162, "num_tokens": 2142892168.0, "step": 5042 }, { "entropy": 0.01852716226130724, "epoch": 2.207024838603786, "grad_norm": 4.28125, "learning_rate": 1.7279371954520845e-05, "loss": 0.1475, "loss_lm": 0.015701939817517996, "loss_seg": 0.13175472524017096, "mean_token_accuracy": 0.995135486125946, "num_tokens": 2143317444.0, "step": 5043 }, { "entropy": 0.018601796589791775, "epoch": 2.207462523251997, "grad_norm": 7.65625, "learning_rate": 1.7276664861938276e-05, "loss": 0.1166, "loss_lm": 0.01611430337652564, "loss_seg": 0.10049504972994328, "mean_token_accuracy": 0.9950894266366959, "num_tokens": 2143741810.0, "step": 5044 }, { "entropy": 0.018099564593285322, "epoch": 2.2079002079002077, "grad_norm": 6.8125, "learning_rate": 1.7273957769355713e-05, "loss": 0.1048, "loss_lm": 0.015610622009262443, "loss_seg": 0.08923549205064774, "mean_token_accuracy": 0.9953631609678268, "num_tokens": 2144166617.0, "step": 5045 }, { "entropy": 0.01838775258511305, "epoch": 2.208337892548419, "grad_norm": 6.40625, "learning_rate": 1.7271250676773147e-05, "loss": 0.0925, "loss_lm": 0.016092326259240508, "loss_seg": 0.07636146247386932, "mean_token_accuracy": 0.9951444119215012, "num_tokens": 2144592079.0, "step": 5046 }, { "entropy": 0.018923607654869556, "epoch": 2.2087755771966298, "grad_norm": 6.96875, "learning_rate": 1.726854358419058e-05, "loss": 0.099, "loss_lm": 0.016072726575657725, "loss_seg": 0.0829374473541975, "mean_token_accuracy": 0.9949682205915451, "num_tokens": 2145016526.0, "step": 5047 }, { "entropy": 0.01768468040972948, "epoch": 2.209213261844841, "grad_norm": 6.28125, "learning_rate": 1.7265836491608014e-05, "loss": 0.1509, "loss_lm": 0.015106987440958619, "loss_seg": 0.1358218975365162, "mean_token_accuracy": 0.995472639799118, "num_tokens": 2145440982.0, "step": 5048 }, { "entropy": 0.017829231917858124, "epoch": 2.209650946493052, "grad_norm": 21.75, "learning_rate": 1.7263129399025444e-05, "loss": 0.0967, "loss_lm": 0.014406024245545268, "loss_seg": 0.08232554979622364, "mean_token_accuracy": 0.9953405857086182, "num_tokens": 2145865904.0, "step": 5049 }, { "entropy": 0.01829527784138918, "epoch": 2.2100886311412626, "grad_norm": 8.625, "learning_rate": 1.726042230644288e-05, "loss": 0.0769, "loss_lm": 0.01607426581904292, "loss_seg": 0.060872407630085945, "mean_token_accuracy": 0.9952809512615204, "num_tokens": 2146291409.0, "step": 5050 }, { "entropy": 0.017796250991523266, "epoch": 2.2105263157894735, "grad_norm": 5.6875, "learning_rate": 1.7257715213860315e-05, "loss": 0.0954, "loss_lm": 0.0171759813092649, "loss_seg": 0.07818601839244366, "mean_token_accuracy": 0.9952888488769531, "num_tokens": 2146716525.0, "step": 5051 }, { "entropy": 0.018305661156773567, "epoch": 2.2109640004376847, "grad_norm": 7.40625, "learning_rate": 1.725500812127775e-05, "loss": 0.0976, "loss_lm": 0.016889700898900628, "loss_seg": 0.08074697107076645, "mean_token_accuracy": 0.9953006356954575, "num_tokens": 2147141449.0, "step": 5052 }, { "entropy": 0.01799294399097562, "epoch": 2.2114016850858955, "grad_norm": 11.0, "learning_rate": 1.7252301028695183e-05, "loss": 0.1012, "loss_lm": 0.014310975326225162, "loss_seg": 0.0868815016001463, "mean_token_accuracy": 0.9952679127454758, "num_tokens": 2147567211.0, "step": 5053 }, { "entropy": 0.01808134000748396, "epoch": 2.2118393697341068, "grad_norm": 6.5, "learning_rate": 1.7249593936112613e-05, "loss": 0.0979, "loss_lm": 0.017239594599232078, "loss_seg": 0.08070770837366581, "mean_token_accuracy": 0.9951997250318527, "num_tokens": 2147992167.0, "step": 5054 }, { "entropy": 0.018646246753633022, "epoch": 2.2122770543823176, "grad_norm": 6.90625, "learning_rate": 1.724688684353005e-05, "loss": 0.121, "loss_lm": 0.01775011932477355, "loss_seg": 0.10329611226916313, "mean_token_accuracy": 0.9950939267873764, "num_tokens": 2148417529.0, "step": 5055 }, { "entropy": 0.01809725910425186, "epoch": 2.2127147390305284, "grad_norm": 8.75, "learning_rate": 1.7244179750947484e-05, "loss": 0.1004, "loss_lm": 0.01699773035943508, "loss_seg": 0.08339914493262768, "mean_token_accuracy": 0.9952007830142975, "num_tokens": 2148842313.0, "step": 5056 }, { "entropy": 0.01790204318240285, "epoch": 2.2131524236787397, "grad_norm": 3.765625, "learning_rate": 1.7241472658364918e-05, "loss": 0.1137, "loss_lm": 0.01574073568917811, "loss_seg": 0.09795736987143755, "mean_token_accuracy": 0.9953117072582245, "num_tokens": 2149266717.0, "step": 5057 }, { "entropy": 0.01779781188815832, "epoch": 2.2135901083269505, "grad_norm": 6.46875, "learning_rate": 1.723876556578235e-05, "loss": 0.1636, "loss_lm": 0.014626223361119628, "loss_seg": 0.14898485131561756, "mean_token_accuracy": 0.9952828288078308, "num_tokens": 2149691931.0, "step": 5058 }, { "entropy": 0.018635562155395746, "epoch": 2.2140277929751613, "grad_norm": 7.34375, "learning_rate": 1.723605847319978e-05, "loss": 0.1363, "loss_lm": 0.015338290017098188, "loss_seg": 0.12097418121993542, "mean_token_accuracy": 0.9950972348451614, "num_tokens": 2150116357.0, "step": 5059 }, { "entropy": 0.018264252692461014, "epoch": 2.2144654776233725, "grad_norm": 6.875, "learning_rate": 1.723335138061722e-05, "loss": 0.1416, "loss_lm": 0.013785888440907001, "loss_seg": 0.12783137522637844, "mean_token_accuracy": 0.9952841103076935, "num_tokens": 2150541699.0, "step": 5060 }, { "entropy": 0.018693065270781517, "epoch": 2.2149031622715833, "grad_norm": 9.1875, "learning_rate": 1.7230644288034653e-05, "loss": 0.0911, "loss_lm": 0.016376921674236655, "loss_seg": 0.07472192868590355, "mean_token_accuracy": 0.9949395954608917, "num_tokens": 2150966143.0, "step": 5061 }, { "entropy": 0.018128142692148685, "epoch": 2.215340846919794, "grad_norm": 5.1875, "learning_rate": 1.7227937195452086e-05, "loss": 0.0955, "loss_lm": 0.015047375811263919, "loss_seg": 0.0804717168211937, "mean_token_accuracy": 0.9951514303684235, "num_tokens": 2151391816.0, "step": 5062 }, { "entropy": 0.01818094588816166, "epoch": 2.2157785315680054, "grad_norm": 21.5, "learning_rate": 1.7225230102869517e-05, "loss": 0.1566, "loss_lm": 0.015402864431962371, "loss_seg": 0.14123382791876793, "mean_token_accuracy": 0.9952160120010376, "num_tokens": 2151817171.0, "step": 5063 }, { "entropy": 0.017741809599101543, "epoch": 2.2162162162162162, "grad_norm": 4.40625, "learning_rate": 1.722252301028695e-05, "loss": 0.1275, "loss_lm": 0.014183777151629329, "loss_seg": 0.11335576139390469, "mean_token_accuracy": 0.9953752011060715, "num_tokens": 2152241793.0, "step": 5064 }, { "entropy": 0.018726208247244358, "epoch": 2.216653900864427, "grad_norm": 6.4375, "learning_rate": 1.7219815917704384e-05, "loss": 0.0946, "loss_lm": 0.01692004408687353, "loss_seg": 0.07766958884894848, "mean_token_accuracy": 0.9951199293136597, "num_tokens": 2152666985.0, "step": 5065 }, { "entropy": 0.017802190966904163, "epoch": 2.2170915855126383, "grad_norm": 28.5, "learning_rate": 1.721710882512182e-05, "loss": 0.0888, "loss_lm": 0.014265616424381733, "loss_seg": 0.07451420091092587, "mean_token_accuracy": 0.9953639805316925, "num_tokens": 2153091216.0, "step": 5066 }, { "entropy": 0.018608792684972286, "epoch": 2.217529270160849, "grad_norm": 6.90625, "learning_rate": 1.7214401732539255e-05, "loss": 0.1003, "loss_lm": 0.016281917924061418, "loss_seg": 0.08404727838933468, "mean_token_accuracy": 0.9952194094657898, "num_tokens": 2153516643.0, "step": 5067 }, { "entropy": 0.018017278518527746, "epoch": 2.21796695480906, "grad_norm": 6.03125, "learning_rate": 1.7211694639956685e-05, "loss": 0.0851, "loss_lm": 0.014877916313707829, "loss_seg": 0.07023777812719345, "mean_token_accuracy": 0.9952268004417419, "num_tokens": 2153942279.0, "step": 5068 }, { "entropy": 0.0184193872846663, "epoch": 2.218404639457271, "grad_norm": 36.25, "learning_rate": 1.720898754737412e-05, "loss": 0.1204, "loss_lm": 0.017397162271663547, "loss_seg": 0.10297391936182976, "mean_token_accuracy": 0.9952454715967178, "num_tokens": 2154367298.0, "step": 5069 }, { "entropy": 0.017984938342124224, "epoch": 2.218842324105482, "grad_norm": 4.1875, "learning_rate": 1.7206280454791553e-05, "loss": 0.0897, "loss_lm": 0.015071584610268474, "loss_seg": 0.07463048957288265, "mean_token_accuracy": 0.9953460246324539, "num_tokens": 2154792062.0, "step": 5070 }, { "entropy": 0.018384902272373438, "epoch": 2.219280008753693, "grad_norm": 3.96875, "learning_rate": 1.720357336220899e-05, "loss": 0.1501, "loss_lm": 0.016784227220341563, "loss_seg": 0.1333400160074234, "mean_token_accuracy": 0.995225116610527, "num_tokens": 2155217399.0, "step": 5071 }, { "entropy": 0.018938430584967136, "epoch": 2.219717693401904, "grad_norm": 6.25, "learning_rate": 1.7200866269626423e-05, "loss": 0.0957, "loss_lm": 0.01678262441419065, "loss_seg": 0.07894441206008196, "mean_token_accuracy": 0.9950870275497437, "num_tokens": 2155642184.0, "step": 5072 }, { "entropy": 0.01850430853664875, "epoch": 2.220155378050115, "grad_norm": 5.15625, "learning_rate": 1.7198159177043854e-05, "loss": 0.0903, "loss_lm": 0.017198570538312197, "loss_seg": 0.07310718204826117, "mean_token_accuracy": 0.9950931668281555, "num_tokens": 2156067294.0, "step": 5073 }, { "entropy": 0.018048781901597977, "epoch": 2.2205930626983257, "grad_norm": 8.0, "learning_rate": 1.7195452084461288e-05, "loss": 0.1301, "loss_lm": 0.017595497891306877, "loss_seg": 0.11250563897192478, "mean_token_accuracy": 0.995205819606781, "num_tokens": 2156492790.0, "step": 5074 }, { "entropy": 0.017936667893081903, "epoch": 2.221030747346537, "grad_norm": 3.828125, "learning_rate": 1.719274499187872e-05, "loss": 0.1076, "loss_lm": 0.01746082818135619, "loss_seg": 0.09010089002549648, "mean_token_accuracy": 0.9951426386833191, "num_tokens": 2156918102.0, "step": 5075 }, { "entropy": 0.01831047050654888, "epoch": 2.2214684319947477, "grad_norm": 3.046875, "learning_rate": 1.719003789929616e-05, "loss": 0.1324, "loss_lm": 0.016096317442134023, "loss_seg": 0.11635063495486975, "mean_token_accuracy": 0.9952351152896881, "num_tokens": 2157343264.0, "step": 5076 }, { "entropy": 0.018405316397547722, "epoch": 2.2219061166429586, "grad_norm": 17.375, "learning_rate": 1.7187330806713592e-05, "loss": 0.1075, "loss_lm": 0.01880295993760228, "loss_seg": 0.08874256163835526, "mean_token_accuracy": 0.9950855076313019, "num_tokens": 2157768712.0, "step": 5077 }, { "entropy": 0.018440520856529474, "epoch": 2.22234380129117, "grad_norm": 9.8125, "learning_rate": 1.7184623714131022e-05, "loss": 0.1686, "loss_lm": 0.016950201243162155, "loss_seg": 0.15162179060280323, "mean_token_accuracy": 0.9951940029859543, "num_tokens": 2158193566.0, "step": 5078 }, { "entropy": 0.017733438406139612, "epoch": 2.2227814859393806, "grad_norm": 13.5, "learning_rate": 1.7181916621548456e-05, "loss": 0.1152, "loss_lm": 0.013954181456938386, "loss_seg": 0.1012157890945673, "mean_token_accuracy": 0.9954586178064346, "num_tokens": 2158618707.0, "step": 5079 }, { "entropy": 0.018330687191337347, "epoch": 2.2232191705875914, "grad_norm": 3.71875, "learning_rate": 1.717920952896589e-05, "loss": 0.1168, "loss_lm": 0.018060097005218267, "loss_seg": 0.09871679171919823, "mean_token_accuracy": 0.9951310604810715, "num_tokens": 2159042986.0, "step": 5080 }, { "entropy": 0.018596210051327944, "epoch": 2.2236568552358027, "grad_norm": 5.4375, "learning_rate": 1.7176502436383327e-05, "loss": 0.0861, "loss_lm": 0.016670630546286702, "loss_seg": 0.0694526769220829, "mean_token_accuracy": 0.9951498806476593, "num_tokens": 2159468318.0, "step": 5081 }, { "entropy": 0.018178689293563366, "epoch": 2.2240945398840135, "grad_norm": 10.1875, "learning_rate": 1.717379534380076e-05, "loss": 0.1282, "loss_lm": 0.013892635004594922, "loss_seg": 0.11430993303656578, "mean_token_accuracy": 0.995155081152916, "num_tokens": 2159893042.0, "step": 5082 }, { "entropy": 0.01760731916874647, "epoch": 2.2245322245322248, "grad_norm": 7.0625, "learning_rate": 1.717108825121819e-05, "loss": 0.0861, "loss_lm": 0.01482898835092783, "loss_seg": 0.07125675771385431, "mean_token_accuracy": 0.9953166544437408, "num_tokens": 2160317892.0, "step": 5083 }, { "entropy": 0.018473090138286352, "epoch": 2.2249699091804356, "grad_norm": 11.125, "learning_rate": 1.7168381158635625e-05, "loss": 0.1469, "loss_lm": 0.01694617047905922, "loss_seg": 0.12996548973023891, "mean_token_accuracy": 0.9951135665178299, "num_tokens": 2160742691.0, "step": 5084 }, { "entropy": 0.018273886293172836, "epoch": 2.2254075938286464, "grad_norm": 13.125, "learning_rate": 1.716567406605306e-05, "loss": 0.1136, "loss_lm": 0.016131333773955703, "loss_seg": 0.09746743366122246, "mean_token_accuracy": 0.9952990114688873, "num_tokens": 2161167210.0, "step": 5085 }, { "entropy": 0.018817178905010223, "epoch": 2.225845278476857, "grad_norm": 4.59375, "learning_rate": 1.7162966973470496e-05, "loss": 0.1255, "loss_lm": 0.01490859198383987, "loss_seg": 0.11062544398009777, "mean_token_accuracy": 0.9950615912675858, "num_tokens": 2161592625.0, "step": 5086 }, { "entropy": 0.018263002391904593, "epoch": 2.2262829631250685, "grad_norm": 8.0625, "learning_rate": 1.7160259880887926e-05, "loss": 0.1038, "loss_lm": 0.01840387494303286, "loss_seg": 0.08535517379641533, "mean_token_accuracy": 0.9951831549406052, "num_tokens": 2162017762.0, "step": 5087 }, { "entropy": 0.01868265401571989, "epoch": 2.2267206477732793, "grad_norm": 7.75, "learning_rate": 1.715755278830536e-05, "loss": 0.0946, "loss_lm": 0.017154484521597624, "loss_seg": 0.07740788348019123, "mean_token_accuracy": 0.9951144307851791, "num_tokens": 2162442868.0, "step": 5088 }, { "entropy": 0.018653659150004387, "epoch": 2.2271583324214905, "grad_norm": 23.25, "learning_rate": 1.7154845695722793e-05, "loss": 0.1023, "loss_lm": 0.017103300197049975, "loss_seg": 0.08521231263875961, "mean_token_accuracy": 0.995123416185379, "num_tokens": 2162867904.0, "step": 5089 }, { "entropy": 0.01795114576816559, "epoch": 2.2275960170697013, "grad_norm": 15.3125, "learning_rate": 1.7152138603140227e-05, "loss": 0.1129, "loss_lm": 0.018132932484149933, "loss_seg": 0.09480373375117779, "mean_token_accuracy": 0.9953543692827225, "num_tokens": 2163292495.0, "step": 5090 }, { "entropy": 0.018345049116760492, "epoch": 2.228033701717912, "grad_norm": 5.0, "learning_rate": 1.7149431510557664e-05, "loss": 0.1433, "loss_lm": 0.01621961360797286, "loss_seg": 0.12711547687649727, "mean_token_accuracy": 0.9951969683170319, "num_tokens": 2163716970.0, "step": 5091 }, { "entropy": 0.01840865332633257, "epoch": 2.2284713863661234, "grad_norm": 10.5625, "learning_rate": 1.7146724417975095e-05, "loss": 0.1124, "loss_lm": 0.015039346180856228, "loss_seg": 0.09740053862333298, "mean_token_accuracy": 0.9951511919498444, "num_tokens": 2164141171.0, "step": 5092 }, { "entropy": 0.017956607043743134, "epoch": 2.228909071014334, "grad_norm": 3.484375, "learning_rate": 1.714401732539253e-05, "loss": 0.1005, "loss_lm": 0.015220958972349763, "loss_seg": 0.0853000171482563, "mean_token_accuracy": 0.9953147619962692, "num_tokens": 2164566074.0, "step": 5093 }, { "entropy": 0.01872627390548587, "epoch": 2.229346755662545, "grad_norm": 5.75, "learning_rate": 1.7141310232809962e-05, "loss": 0.1016, "loss_lm": 0.014441413106396794, "loss_seg": 0.08712534233927727, "mean_token_accuracy": 0.9951552748680115, "num_tokens": 2164992072.0, "step": 5094 }, { "entropy": 0.019139611162245274, "epoch": 2.2297844403107563, "grad_norm": 11.3125, "learning_rate": 1.7138603140227396e-05, "loss": 0.1573, "loss_lm": 0.02002613036893308, "loss_seg": 0.1372506245970726, "mean_token_accuracy": 0.9950327277183533, "num_tokens": 2165418250.0, "step": 5095 }, { "entropy": 0.01796104060485959, "epoch": 2.230222124958967, "grad_norm": 7.53125, "learning_rate": 1.713589604764483e-05, "loss": 0.1001, "loss_lm": 0.013655720511451364, "loss_seg": 0.08644438162446022, "mean_token_accuracy": 0.9952476918697357, "num_tokens": 2165842848.0, "step": 5096 }, { "entropy": 0.0179242342710495, "epoch": 2.230659809607178, "grad_norm": 5.0, "learning_rate": 1.7133188955062263e-05, "loss": 0.102, "loss_lm": 0.0170398922637105, "loss_seg": 0.08491337671875954, "mean_token_accuracy": 0.9953204393386841, "num_tokens": 2166268026.0, "step": 5097 }, { "entropy": 0.018366714008152485, "epoch": 2.231097494255389, "grad_norm": 9.375, "learning_rate": 1.7130481862479697e-05, "loss": 0.1109, "loss_lm": 0.016233896603807807, "loss_seg": 0.09462830424308777, "mean_token_accuracy": 0.9952799081802368, "num_tokens": 2166692553.0, "step": 5098 }, { "entropy": 0.018309365957975388, "epoch": 2.2315351789036, "grad_norm": 11.6875, "learning_rate": 1.712777476989713e-05, "loss": 0.1407, "loss_lm": 0.014018216636031866, "loss_seg": 0.12664561346173286, "mean_token_accuracy": 0.9952955842018127, "num_tokens": 2167117625.0, "step": 5099 }, { "entropy": 0.01795924175530672, "epoch": 2.231972863551811, "grad_norm": 5.40625, "learning_rate": 1.7125067677314564e-05, "loss": 0.1436, "loss_lm": 0.01508740009739995, "loss_seg": 0.12853571958839893, "mean_token_accuracy": 0.9953866004943848, "num_tokens": 2167542639.0, "step": 5100 }, { "entropy": 0.018497925251722336, "epoch": 2.232410548200022, "grad_norm": 6.5, "learning_rate": 1.7122360584731998e-05, "loss": 0.1205, "loss_lm": 0.017428754130378366, "loss_seg": 0.10311499238014221, "mean_token_accuracy": 0.9952777773141861, "num_tokens": 2167967614.0, "step": 5101 }, { "entropy": 0.018335770815610886, "epoch": 2.232848232848233, "grad_norm": 3.390625, "learning_rate": 1.7119653492149432e-05, "loss": 0.0853, "loss_lm": 0.018175386358052492, "loss_seg": 0.06710063479840755, "mean_token_accuracy": 0.9951662570238113, "num_tokens": 2168392522.0, "step": 5102 }, { "entropy": 0.01875475700944662, "epoch": 2.2332859174964437, "grad_norm": 5.8125, "learning_rate": 1.7116946399566866e-05, "loss": 0.0836, "loss_lm": 0.016719938488677144, "loss_seg": 0.06685162242501974, "mean_token_accuracy": 0.9951686859130859, "num_tokens": 2168817200.0, "step": 5103 }, { "entropy": 0.018258828204125166, "epoch": 2.233723602144655, "grad_norm": 20.875, "learning_rate": 1.71142393069843e-05, "loss": 0.0925, "loss_lm": 0.015670178923755884, "loss_seg": 0.07685867138206959, "mean_token_accuracy": 0.995177298784256, "num_tokens": 2169242444.0, "step": 5104 }, { "entropy": 0.018150019459426403, "epoch": 2.2341612867928657, "grad_norm": 7.71875, "learning_rate": 1.7111532214401733e-05, "loss": 0.0855, "loss_lm": 0.015875130658969283, "loss_seg": 0.06959540210664272, "mean_token_accuracy": 0.9952790886163712, "num_tokens": 2169667752.0, "step": 5105 }, { "entropy": 0.018723734188824892, "epoch": 2.2345989714410766, "grad_norm": 50.0, "learning_rate": 1.7108825121819167e-05, "loss": 0.1042, "loss_lm": 0.016273806104436517, "loss_seg": 0.08791039325296879, "mean_token_accuracy": 0.9951924979686737, "num_tokens": 2170093517.0, "step": 5106 }, { "entropy": 0.018612557090818882, "epoch": 2.235036656089288, "grad_norm": 18.5, "learning_rate": 1.71061180292366e-05, "loss": 0.1052, "loss_lm": 0.01642783358693123, "loss_seg": 0.08874081447720528, "mean_token_accuracy": 0.9950429201126099, "num_tokens": 2170517727.0, "step": 5107 }, { "entropy": 0.018284788355231285, "epoch": 2.2354743407374986, "grad_norm": 19.875, "learning_rate": 1.7103410936654034e-05, "loss": 0.1066, "loss_lm": 0.013972848886623979, "loss_seg": 0.09261242114007473, "mean_token_accuracy": 0.9953516870737076, "num_tokens": 2170942776.0, "step": 5108 }, { "entropy": 0.0183804789558053, "epoch": 2.2359120253857094, "grad_norm": 3.890625, "learning_rate": 1.7100703844071468e-05, "loss": 0.0867, "loss_lm": 0.018496359698474407, "loss_seg": 0.06822365522384644, "mean_token_accuracy": 0.9952553510665894, "num_tokens": 2171367837.0, "step": 5109 }, { "entropy": 0.017962587997317314, "epoch": 2.2363497100339207, "grad_norm": 12.625, "learning_rate": 1.7097996751488902e-05, "loss": 0.0771, "loss_lm": 0.014410756761208177, "loss_seg": 0.06270233076065779, "mean_token_accuracy": 0.9953463524580002, "num_tokens": 2171792403.0, "step": 5110 }, { "entropy": 0.018321849405765533, "epoch": 2.2367873946821315, "grad_norm": 4.8125, "learning_rate": 1.7095289658906332e-05, "loss": 0.1284, "loss_lm": 0.018774323165416718, "loss_seg": 0.10966312512755394, "mean_token_accuracy": 0.9951299279928207, "num_tokens": 2172217004.0, "step": 5111 }, { "entropy": 0.018313772976398468, "epoch": 2.2372250793303423, "grad_norm": 14.5, "learning_rate": 1.709258256632377e-05, "loss": 0.0944, "loss_lm": 0.013678036397323012, "loss_seg": 0.08068101294338703, "mean_token_accuracy": 0.9951624125242233, "num_tokens": 2172642335.0, "step": 5112 }, { "entropy": 0.018251975998282433, "epoch": 2.2376627639785536, "grad_norm": 9.6875, "learning_rate": 1.7089875473741203e-05, "loss": 0.0894, "loss_lm": 0.018368661403656006, "loss_seg": 0.07102172262966633, "mean_token_accuracy": 0.9953301697969437, "num_tokens": 2173066983.0, "step": 5113 }, { "entropy": 0.0185905615799129, "epoch": 2.2381004486267644, "grad_norm": 3.296875, "learning_rate": 1.7087168381158637e-05, "loss": 0.1311, "loss_lm": 0.015183212701231241, "loss_seg": 0.11592886038124561, "mean_token_accuracy": 0.9951061457395554, "num_tokens": 2173492154.0, "step": 5114 }, { "entropy": 0.01848631678149104, "epoch": 2.238538133274975, "grad_norm": 17.375, "learning_rate": 1.708446128857607e-05, "loss": 0.1054, "loss_lm": 0.015756276436150074, "loss_seg": 0.08962586428970098, "mean_token_accuracy": 0.9951778650283813, "num_tokens": 2173916813.0, "step": 5115 }, { "entropy": 0.0181523310020566, "epoch": 2.2389758179231865, "grad_norm": 15.6875, "learning_rate": 1.70817541959935e-05, "loss": 0.0906, "loss_lm": 0.015478621702641249, "loss_seg": 0.07510312087833881, "mean_token_accuracy": 0.9952530264854431, "num_tokens": 2174342297.0, "step": 5116 }, { "entropy": 0.01851311046630144, "epoch": 2.2394135025713973, "grad_norm": 17.5, "learning_rate": 1.7079047103410938e-05, "loss": 0.119, "loss_lm": 0.014243570622056723, "loss_seg": 0.10474155098199844, "mean_token_accuracy": 0.9952012002468109, "num_tokens": 2174766932.0, "step": 5117 }, { "entropy": 0.01781220454722643, "epoch": 2.239851187219608, "grad_norm": 9.4375, "learning_rate": 1.707634001082837e-05, "loss": 0.1401, "loss_lm": 0.014549518236890435, "loss_seg": 0.1255320105701685, "mean_token_accuracy": 0.9953612238168716, "num_tokens": 2175192624.0, "step": 5118 }, { "entropy": 0.018052998464554548, "epoch": 2.2402888718678193, "grad_norm": 3.046875, "learning_rate": 1.7073632918245805e-05, "loss": 0.0896, "loss_lm": 0.01585929491557181, "loss_seg": 0.07373147271573544, "mean_token_accuracy": 0.9953679144382477, "num_tokens": 2175618292.0, "step": 5119 }, { "entropy": 0.01812324160709977, "epoch": 2.24072655651603, "grad_norm": 10.4375, "learning_rate": 1.707092582566324e-05, "loss": 0.104, "loss_lm": 0.015151549130678177, "loss_seg": 0.0888790488243103, "mean_token_accuracy": 0.995183914899826, "num_tokens": 2176042263.0, "step": 5120 }, { "entropy": 0.017897235229611397, "epoch": 2.241164241164241, "grad_norm": 11.6875, "learning_rate": 1.706821873308067e-05, "loss": 0.0835, "loss_lm": 0.016540780663490295, "loss_seg": 0.06693695113062859, "mean_token_accuracy": 0.995297446846962, "num_tokens": 2176466676.0, "step": 5121 }, { "entropy": 0.018384899944067, "epoch": 2.241601925812452, "grad_norm": 4.84375, "learning_rate": 1.7065511640498106e-05, "loss": 0.1155, "loss_lm": 0.016916566994041204, "loss_seg": 0.09855936467647552, "mean_token_accuracy": 0.9952096343040466, "num_tokens": 2176891639.0, "step": 5122 }, { "entropy": 0.018323024734854698, "epoch": 2.242039610460663, "grad_norm": 84.5, "learning_rate": 1.706280454791554e-05, "loss": 0.1073, "loss_lm": 0.019329409580677748, "loss_seg": 0.08792147599160671, "mean_token_accuracy": 0.9953078776597977, "num_tokens": 2177316749.0, "step": 5123 }, { "entropy": 0.018447597045451403, "epoch": 2.2424772951088743, "grad_norm": 5.28125, "learning_rate": 1.7060097455332974e-05, "loss": 0.1109, "loss_lm": 0.015806443290784955, "loss_seg": 0.09510661289095879, "mean_token_accuracy": 0.9952722787857056, "num_tokens": 2177741424.0, "step": 5124 }, { "entropy": 0.018752250354737043, "epoch": 2.242914979757085, "grad_norm": 5.09375, "learning_rate": 1.7057390362750408e-05, "loss": 0.0979, "loss_lm": 0.015585957793518901, "loss_seg": 0.08229776099324226, "mean_token_accuracy": 0.995102733373642, "num_tokens": 2178166338.0, "step": 5125 }, { "entropy": 0.018746349029242992, "epoch": 2.243352664405296, "grad_norm": 4.4375, "learning_rate": 1.7054683270167838e-05, "loss": 0.1172, "loss_lm": 0.016308776568621397, "loss_seg": 0.10086446441709995, "mean_token_accuracy": 0.9950754940509796, "num_tokens": 2178591958.0, "step": 5126 }, { "entropy": 0.018345358315855265, "epoch": 2.2437903490535067, "grad_norm": 28.75, "learning_rate": 1.7051976177585275e-05, "loss": 0.0925, "loss_lm": 0.014893739484250546, "loss_seg": 0.07764067500829697, "mean_token_accuracy": 0.9952738881111145, "num_tokens": 2179017222.0, "step": 5127 }, { "entropy": 0.018724698573350906, "epoch": 2.244228033701718, "grad_norm": 16.375, "learning_rate": 1.704926908500271e-05, "loss": 0.1029, "loss_lm": 0.014677966944873333, "loss_seg": 0.08819613605737686, "mean_token_accuracy": 0.9951949566602707, "num_tokens": 2179442565.0, "step": 5128 }, { "entropy": 0.01874837512150407, "epoch": 2.244665718349929, "grad_norm": 7.4375, "learning_rate": 1.7046561992420142e-05, "loss": 0.1119, "loss_lm": 0.017096045892685652, "loss_seg": 0.09481757692992687, "mean_token_accuracy": 0.9951000511646271, "num_tokens": 2179867664.0, "step": 5129 }, { "entropy": 0.01815510308369994, "epoch": 2.24510340299814, "grad_norm": 11.875, "learning_rate": 1.7043854899837576e-05, "loss": 0.1361, "loss_lm": 0.016310048988088965, "loss_seg": 0.11975593585520983, "mean_token_accuracy": 0.9952657520771027, "num_tokens": 2180292186.0, "step": 5130 }, { "entropy": 0.018319840542972088, "epoch": 2.245541087646351, "grad_norm": 5.15625, "learning_rate": 1.7041147807255007e-05, "loss": 0.0837, "loss_lm": 0.015134206740185618, "loss_seg": 0.06859950814396143, "mean_token_accuracy": 0.995226189494133, "num_tokens": 2180717166.0, "step": 5131 }, { "entropy": 0.017904425971210003, "epoch": 2.2459787722945617, "grad_norm": 28.75, "learning_rate": 1.703844071467244e-05, "loss": 0.1167, "loss_lm": 0.018592861481010914, "loss_seg": 0.09811624139547348, "mean_token_accuracy": 0.995335727930069, "num_tokens": 2181142040.0, "step": 5132 }, { "entropy": 0.01807331293821335, "epoch": 2.246416456942773, "grad_norm": 3.171875, "learning_rate": 1.7035733622089877e-05, "loss": 0.1126, "loss_lm": 0.017618105048313737, "loss_seg": 0.09498089086264372, "mean_token_accuracy": 0.9952283203601837, "num_tokens": 2181566770.0, "step": 5133 }, { "entropy": 0.018354159779846668, "epoch": 2.2468541415909837, "grad_norm": 7.71875, "learning_rate": 1.703302652950731e-05, "loss": 0.1033, "loss_lm": 0.016395224956795573, "loss_seg": 0.0869110506027937, "mean_token_accuracy": 0.9953214377164841, "num_tokens": 2181992273.0, "step": 5134 }, { "entropy": 0.01835985854268074, "epoch": 2.2472918262391945, "grad_norm": 5.0625, "learning_rate": 1.703031943692474e-05, "loss": 0.1008, "loss_lm": 0.017061149701476097, "loss_seg": 0.08376595564186573, "mean_token_accuracy": 0.9952566474676132, "num_tokens": 2182417259.0, "step": 5135 }, { "entropy": 0.018343459349125624, "epoch": 2.247729510887406, "grad_norm": 12.125, "learning_rate": 1.7027612344342175e-05, "loss": 0.1006, "loss_lm": 0.01585349184460938, "loss_seg": 0.08469810709357262, "mean_token_accuracy": 0.9952050298452377, "num_tokens": 2182842045.0, "step": 5136 }, { "entropy": 0.018057652283459902, "epoch": 2.2481671955356166, "grad_norm": 4.5, "learning_rate": 1.702490525175961e-05, "loss": 0.0866, "loss_lm": 0.015208794502541423, "loss_seg": 0.07138629537075758, "mean_token_accuracy": 0.9952393472194672, "num_tokens": 2183266532.0, "step": 5137 }, { "entropy": 0.01780289877206087, "epoch": 2.2486048801838274, "grad_norm": 5.8125, "learning_rate": 1.7022198159177046e-05, "loss": 0.0958, "loss_lm": 0.01545352186076343, "loss_seg": 0.08036338724195957, "mean_token_accuracy": 0.9953771531581879, "num_tokens": 2183691120.0, "step": 5138 }, { "entropy": 0.018155863974243402, "epoch": 2.2490425648320387, "grad_norm": 3.5625, "learning_rate": 1.701949106659448e-05, "loss": 0.1106, "loss_lm": 0.016548832412809134, "loss_seg": 0.09404370281845331, "mean_token_accuracy": 0.9953142404556274, "num_tokens": 2184115542.0, "step": 5139 }, { "entropy": 0.01856537675485015, "epoch": 2.2494802494802495, "grad_norm": 4.40625, "learning_rate": 1.701678397401191e-05, "loss": 0.1031, "loss_lm": 0.016453764867037535, "loss_seg": 0.08661285042762756, "mean_token_accuracy": 0.9950942993164062, "num_tokens": 2184541541.0, "step": 5140 }, { "entropy": 0.017998652532696724, "epoch": 2.2499179341284603, "grad_norm": 5.03125, "learning_rate": 1.7014076881429344e-05, "loss": 0.121, "loss_lm": 0.014923120150342584, "loss_seg": 0.10612151958048344, "mean_token_accuracy": 0.995405301451683, "num_tokens": 2184966311.0, "step": 5141 }, { "entropy": 0.018907874822616577, "epoch": 2.2503556187766716, "grad_norm": 7.71875, "learning_rate": 1.7011369788846778e-05, "loss": 0.1213, "loss_lm": 0.015652590431272984, "loss_seg": 0.10561119765043259, "mean_token_accuracy": 0.9949876070022583, "num_tokens": 2185391650.0, "step": 5142 }, { "entropy": 0.01784464530646801, "epoch": 2.2507933034248824, "grad_norm": 6.5625, "learning_rate": 1.7008662696264215e-05, "loss": 0.1072, "loss_lm": 0.016180502716451883, "loss_seg": 0.09105814807116985, "mean_token_accuracy": 0.9953314661979675, "num_tokens": 2185816134.0, "step": 5143 }, { "entropy": 0.018205081578344107, "epoch": 2.251230988073093, "grad_norm": 4.53125, "learning_rate": 1.700595560368165e-05, "loss": 0.1333, "loss_lm": 0.01617341162636876, "loss_seg": 0.11712302640080452, "mean_token_accuracy": 0.9952115714550018, "num_tokens": 2186240869.0, "step": 5144 }, { "entropy": 0.01827251212671399, "epoch": 2.2516686727213044, "grad_norm": 4.28125, "learning_rate": 1.700324851109908e-05, "loss": 0.1202, "loss_lm": 0.01636547246016562, "loss_seg": 0.10386019852012396, "mean_token_accuracy": 0.9952061921358109, "num_tokens": 2186665769.0, "step": 5145 }, { "entropy": 0.018171760719269514, "epoch": 2.2521063573695153, "grad_norm": 4.8125, "learning_rate": 1.7000541418516512e-05, "loss": 0.1041, "loss_lm": 0.015297063393518329, "loss_seg": 0.0887560248374939, "mean_token_accuracy": 0.9952941387891769, "num_tokens": 2187090098.0, "step": 5146 }, { "entropy": 0.018155603669583797, "epoch": 2.252544042017726, "grad_norm": 2.796875, "learning_rate": 1.6997834325933946e-05, "loss": 0.098, "loss_lm": 0.015937478048726916, "loss_seg": 0.0820626188069582, "mean_token_accuracy": 0.995240792632103, "num_tokens": 2187515528.0, "step": 5147 }, { "entropy": 0.017506698612123728, "epoch": 2.2529817266659373, "grad_norm": 6.75, "learning_rate": 1.6995127233351383e-05, "loss": 0.1253, "loss_lm": 0.016601990908384323, "loss_seg": 0.10869876202195883, "mean_token_accuracy": 0.9955068677663803, "num_tokens": 2187940802.0, "step": 5148 }, { "entropy": 0.018876346293836832, "epoch": 2.253419411314148, "grad_norm": 19.375, "learning_rate": 1.6992420140768817e-05, "loss": 0.1643, "loss_lm": 0.01716134464368224, "loss_seg": 0.14717540331184864, "mean_token_accuracy": 0.9950664043426514, "num_tokens": 2188366030.0, "step": 5149 }, { "entropy": 0.018678892869502306, "epoch": 2.253857095962359, "grad_norm": 10.0625, "learning_rate": 1.6989713048186247e-05, "loss": 0.0943, "loss_lm": 0.014942183857783675, "loss_seg": 0.07937974017113447, "mean_token_accuracy": 0.99512879550457, "num_tokens": 2188790735.0, "step": 5150 }, { "entropy": 0.018122652545571327, "epoch": 2.25429478061057, "grad_norm": 4.6875, "learning_rate": 1.698700595560368e-05, "loss": 0.0902, "loss_lm": 0.017432728549465537, "loss_seg": 0.0727669708430767, "mean_token_accuracy": 0.9952910989522934, "num_tokens": 2189216485.0, "step": 5151 }, { "entropy": 0.018749518785625696, "epoch": 2.254732465258781, "grad_norm": 6.46875, "learning_rate": 1.6984298863021115e-05, "loss": 0.1093, "loss_lm": 0.01804743683896959, "loss_seg": 0.09123949520289898, "mean_token_accuracy": 0.9950868189334869, "num_tokens": 2189641234.0, "step": 5152 }, { "entropy": 0.017876833211630583, "epoch": 2.255170149906992, "grad_norm": 8.6875, "learning_rate": 1.6981591770438552e-05, "loss": 0.1562, "loss_lm": 0.016287988051772118, "loss_seg": 0.1398879997432232, "mean_token_accuracy": 0.9952099472284317, "num_tokens": 2190066244.0, "step": 5153 }, { "entropy": 0.017840317450463772, "epoch": 2.255607834555203, "grad_norm": 7.09375, "learning_rate": 1.6978884677855986e-05, "loss": 0.1155, "loss_lm": 0.014602147042751312, "loss_seg": 0.10086619202047586, "mean_token_accuracy": 0.9954416006803513, "num_tokens": 2190491530.0, "step": 5154 }, { "entropy": 0.018131342716515064, "epoch": 2.256045519203414, "grad_norm": 6.09375, "learning_rate": 1.6976177585273416e-05, "loss": 0.1147, "loss_lm": 0.016021048417314887, "loss_seg": 0.09863361530005932, "mean_token_accuracy": 0.9952089339494705, "num_tokens": 2190916237.0, "step": 5155 }, { "entropy": 0.018648770172148943, "epoch": 2.2564832038516247, "grad_norm": 9.6875, "learning_rate": 1.697347049269085e-05, "loss": 0.096, "loss_lm": 0.01475826557725668, "loss_seg": 0.08121961541473866, "mean_token_accuracy": 0.9951983392238617, "num_tokens": 2191342513.0, "step": 5156 }, { "entropy": 0.01825745776295662, "epoch": 2.256920888499836, "grad_norm": 12.3125, "learning_rate": 1.6970763400108283e-05, "loss": 0.0987, "loss_lm": 0.015633361181244254, "loss_seg": 0.08308612927794456, "mean_token_accuracy": 0.9952186048030853, "num_tokens": 2191767692.0, "step": 5157 }, { "entropy": 0.018568988423794508, "epoch": 2.2573585731480468, "grad_norm": 24.25, "learning_rate": 1.696805630752572e-05, "loss": 0.1352, "loss_lm": 0.016868045087903738, "loss_seg": 0.11830577440559864, "mean_token_accuracy": 0.9951584339141846, "num_tokens": 2192192295.0, "step": 5158 }, { "entropy": 0.01857822434976697, "epoch": 2.257796257796258, "grad_norm": 3.078125, "learning_rate": 1.696534921494315e-05, "loss": 0.0899, "loss_lm": 0.016492706025019288, "loss_seg": 0.07337118126451969, "mean_token_accuracy": 0.9951328784227371, "num_tokens": 2192617155.0, "step": 5159 }, { "entropy": 0.017953407485038042, "epoch": 2.258233942444469, "grad_norm": 3.453125, "learning_rate": 1.6962642122360585e-05, "loss": 0.1071, "loss_lm": 0.014258167939260602, "loss_seg": 0.09283203072845936, "mean_token_accuracy": 0.9953691959381104, "num_tokens": 2193042522.0, "step": 5160 }, { "entropy": 0.018564295023679733, "epoch": 2.2586716270926797, "grad_norm": 4.65625, "learning_rate": 1.695993502977802e-05, "loss": 0.1072, "loss_lm": 0.018154677469283342, "loss_seg": 0.08903397619724274, "mean_token_accuracy": 0.9952668696641922, "num_tokens": 2193466923.0, "step": 5161 }, { "entropy": 0.017817416694015265, "epoch": 2.2591093117408905, "grad_norm": 46.0, "learning_rate": 1.6957227937195452e-05, "loss": 0.1447, "loss_lm": 0.016513750655576587, "loss_seg": 0.12822587974369526, "mean_token_accuracy": 0.9953018426895142, "num_tokens": 2193891936.0, "step": 5162 }, { "entropy": 0.01740188943222165, "epoch": 2.2595469963891017, "grad_norm": 10.8125, "learning_rate": 1.6954520844612886e-05, "loss": 0.1145, "loss_lm": 0.016175088938325644, "loss_seg": 0.09831946529448032, "mean_token_accuracy": 0.9952685236930847, "num_tokens": 2194316711.0, "step": 5163 }, { "entropy": 0.017913946881890297, "epoch": 2.2599846810373125, "grad_norm": 3.984375, "learning_rate": 1.695181375203032e-05, "loss": 0.1107, "loss_lm": 0.017430639127269387, "loss_seg": 0.0932854413986206, "mean_token_accuracy": 0.9953408688306808, "num_tokens": 2194741374.0, "step": 5164 }, { "entropy": 0.018068764358758926, "epoch": 2.260422365685524, "grad_norm": 9.8125, "learning_rate": 1.6949106659447753e-05, "loss": 0.1181, "loss_lm": 0.015097463969141245, "loss_seg": 0.10302974097430706, "mean_token_accuracy": 0.9953267723321915, "num_tokens": 2195167398.0, "step": 5165 }, { "entropy": 0.018273009918630123, "epoch": 2.2608600503337346, "grad_norm": 10.6875, "learning_rate": 1.6946399566865187e-05, "loss": 0.155, "loss_lm": 0.015801195055246353, "loss_seg": 0.1392136923968792, "mean_token_accuracy": 0.9952270537614822, "num_tokens": 2195592250.0, "step": 5166 }, { "entropy": 0.01855657948181033, "epoch": 2.2612977349819454, "grad_norm": 19.625, "learning_rate": 1.694369247428262e-05, "loss": 0.1174, "loss_lm": 0.016381715890020132, "loss_seg": 0.10104425624012947, "mean_token_accuracy": 0.995137631893158, "num_tokens": 2196017514.0, "step": 5167 }, { "entropy": 0.018742709420621395, "epoch": 2.2617354196301562, "grad_norm": 9.4375, "learning_rate": 1.6940985381700054e-05, "loss": 0.1569, "loss_lm": 0.018594956025481224, "loss_seg": 0.13828347995877266, "mean_token_accuracy": 0.9951493889093399, "num_tokens": 2196442340.0, "step": 5168 }, { "entropy": 0.017994366120547056, "epoch": 2.2621731042783675, "grad_norm": 5.9375, "learning_rate": 1.6938278289117488e-05, "loss": 0.0863, "loss_lm": 0.01517358678393066, "loss_seg": 0.07108152192085981, "mean_token_accuracy": 0.995311439037323, "num_tokens": 2196866952.0, "step": 5169 }, { "entropy": 0.01747457915917039, "epoch": 2.2626107889265783, "grad_norm": 3.578125, "learning_rate": 1.6935571196534922e-05, "loss": 0.1143, "loss_lm": 0.015406249789521098, "loss_seg": 0.09884784743189812, "mean_token_accuracy": 0.995416134595871, "num_tokens": 2197291439.0, "step": 5170 }, { "entropy": 0.018554394599050283, "epoch": 2.2630484735747896, "grad_norm": 7.75, "learning_rate": 1.6932864103952356e-05, "loss": 0.1155, "loss_lm": 0.018176575656980276, "loss_seg": 0.09727756213396788, "mean_token_accuracy": 0.9951747357845306, "num_tokens": 2197716258.0, "step": 5171 }, { "entropy": 0.018015125766396523, "epoch": 2.2634861582230004, "grad_norm": 4.40625, "learning_rate": 1.693015701136979e-05, "loss": 0.0931, "loss_lm": 0.014515169197693467, "loss_seg": 0.07863482646644115, "mean_token_accuracy": 0.9952684789896011, "num_tokens": 2198141006.0, "step": 5172 }, { "entropy": 0.01739671779796481, "epoch": 2.263923842871211, "grad_norm": 4.8125, "learning_rate": 1.6927449918787223e-05, "loss": 0.0766, "loss_lm": 0.015540473628789186, "loss_seg": 0.06106599420309067, "mean_token_accuracy": 0.9954382628202438, "num_tokens": 2198566006.0, "step": 5173 }, { "entropy": 0.018746154848486185, "epoch": 2.2643615275194224, "grad_norm": 8.375, "learning_rate": 1.6924742826204657e-05, "loss": 0.0903, "loss_lm": 0.015969140455126762, "loss_seg": 0.07432520762085915, "mean_token_accuracy": 0.9950401782989502, "num_tokens": 2198991807.0, "step": 5174 }, { "entropy": 0.018501606304198503, "epoch": 2.2647992121676332, "grad_norm": 8.5625, "learning_rate": 1.692203573362209e-05, "loss": 0.1197, "loss_lm": 0.016177847050130367, "loss_seg": 0.10354072600603104, "mean_token_accuracy": 0.9951900541782379, "num_tokens": 2199417125.0, "step": 5175 }, { "entropy": 0.01826525665819645, "epoch": 2.265236896815844, "grad_norm": 11.0, "learning_rate": 1.6919328641039524e-05, "loss": 0.1443, "loss_lm": 0.017313368851318955, "loss_seg": 0.1269818302243948, "mean_token_accuracy": 0.9952918589115143, "num_tokens": 2199842071.0, "step": 5176 }, { "entropy": 0.01819467404857278, "epoch": 2.2656745814640553, "grad_norm": 21.625, "learning_rate": 1.6916621548456958e-05, "loss": 0.1279, "loss_lm": 0.015504069160670042, "loss_seg": 0.11237501725554466, "mean_token_accuracy": 0.9953375458717346, "num_tokens": 2200267550.0, "step": 5177 }, { "entropy": 0.018697971012443304, "epoch": 2.266112266112266, "grad_norm": 3.4375, "learning_rate": 1.6913914455874388e-05, "loss": 0.1063, "loss_lm": 0.016876160399988294, "loss_seg": 0.08942429069429636, "mean_token_accuracy": 0.9951552450656891, "num_tokens": 2200692092.0, "step": 5178 }, { "entropy": 0.01850267592817545, "epoch": 2.266549950760477, "grad_norm": 3.265625, "learning_rate": 1.6911207363291825e-05, "loss": 0.0745, "loss_lm": 0.014107180992141366, "loss_seg": 0.06043049227446318, "mean_token_accuracy": 0.9951643943786621, "num_tokens": 2201117043.0, "step": 5179 }, { "entropy": 0.01850055903196335, "epoch": 2.266987635408688, "grad_norm": 15.625, "learning_rate": 1.690850027070926e-05, "loss": 0.1254, "loss_lm": 0.015333397779613733, "loss_seg": 0.11007686145603657, "mean_token_accuracy": 0.9950278997421265, "num_tokens": 2201541792.0, "step": 5180 }, { "entropy": 0.018016811925917864, "epoch": 2.267425320056899, "grad_norm": 12.75, "learning_rate": 1.6905793178126693e-05, "loss": 0.1055, "loss_lm": 0.014822070952504873, "loss_seg": 0.0906485142186284, "mean_token_accuracy": 0.9952942132949829, "num_tokens": 2201967588.0, "step": 5181 }, { "entropy": 0.01888027833774686, "epoch": 2.26786300470511, "grad_norm": 3.09375, "learning_rate": 1.6903086085544127e-05, "loss": 0.1016, "loss_lm": 0.016852675704285502, "loss_seg": 0.08479591645300388, "mean_token_accuracy": 0.9951055347919464, "num_tokens": 2202392102.0, "step": 5182 }, { "entropy": 0.018772731069475412, "epoch": 2.268300689353321, "grad_norm": 12.5, "learning_rate": 1.6900378992961557e-05, "loss": 0.1308, "loss_lm": 0.016019765054807067, "loss_seg": 0.1147898193448782, "mean_token_accuracy": 0.9951495081186295, "num_tokens": 2202817594.0, "step": 5183 }, { "entropy": 0.01864817552268505, "epoch": 2.268738374001532, "grad_norm": 5.5, "learning_rate": 1.6897671900378994e-05, "loss": 0.1314, "loss_lm": 0.01821710215881467, "loss_seg": 0.11320340074598789, "mean_token_accuracy": 0.9951328635215759, "num_tokens": 2203243473.0, "step": 5184 }, { "entropy": 0.018412893638014793, "epoch": 2.2691760586497427, "grad_norm": 6.6875, "learning_rate": 1.6894964807796428e-05, "loss": 0.105, "loss_lm": 0.015568510629236698, "loss_seg": 0.08943132404237986, "mean_token_accuracy": 0.9951977431774139, "num_tokens": 2203668456.0, "step": 5185 }, { "entropy": 0.01808115281164646, "epoch": 2.269613743297954, "grad_norm": 6.625, "learning_rate": 1.689225771521386e-05, "loss": 0.1308, "loss_lm": 0.015534359030425549, "loss_seg": 0.11522074602544308, "mean_token_accuracy": 0.9952426701784134, "num_tokens": 2204092589.0, "step": 5186 }, { "entropy": 0.01785018015652895, "epoch": 2.2700514279461648, "grad_norm": 14.6875, "learning_rate": 1.6889550622631295e-05, "loss": 0.1111, "loss_lm": 0.015697513008490205, "loss_seg": 0.09540380723774433, "mean_token_accuracy": 0.9954133927822113, "num_tokens": 2204517474.0, "step": 5187 }, { "entropy": 0.017969096079468727, "epoch": 2.2704891125943756, "grad_norm": 10.9375, "learning_rate": 1.6886843530048726e-05, "loss": 0.1061, "loss_lm": 0.01614020043052733, "loss_seg": 0.09000799991190434, "mean_token_accuracy": 0.9953177571296692, "num_tokens": 2204942224.0, "step": 5188 }, { "entropy": 0.01811057422310114, "epoch": 2.270926797242587, "grad_norm": 12.75, "learning_rate": 1.6884136437466163e-05, "loss": 0.0768, "loss_lm": 0.014006109908223152, "loss_seg": 0.06279212702065706, "mean_token_accuracy": 0.9953808635473251, "num_tokens": 2205366679.0, "step": 5189 }, { "entropy": 0.01863673748448491, "epoch": 2.2713644818907976, "grad_norm": 14.5, "learning_rate": 1.6881429344883596e-05, "loss": 0.0841, "loss_lm": 0.018141060369089246, "loss_seg": 0.06597366370260715, "mean_token_accuracy": 0.9951149523258209, "num_tokens": 2205791810.0, "step": 5190 }, { "entropy": 0.01787330536171794, "epoch": 2.2718021665390085, "grad_norm": 6.40625, "learning_rate": 1.687872225230103e-05, "loss": 0.0898, "loss_lm": 0.013560375664383173, "loss_seg": 0.07620619516819715, "mean_token_accuracy": 0.9953147023916245, "num_tokens": 2206216928.0, "step": 5191 }, { "entropy": 0.01852918090298772, "epoch": 2.2722398511872197, "grad_norm": 11.5, "learning_rate": 1.6876015159718464e-05, "loss": 0.092, "loss_lm": 0.017253066413104534, "loss_seg": 0.07475491426885128, "mean_token_accuracy": 0.9951438903808594, "num_tokens": 2206642542.0, "step": 5192 }, { "entropy": 0.017720119562000036, "epoch": 2.2726775358354305, "grad_norm": 4.3125, "learning_rate": 1.6873308067135894e-05, "loss": 0.1078, "loss_lm": 0.014447658555582166, "loss_seg": 0.09334476850926876, "mean_token_accuracy": 0.9954139590263367, "num_tokens": 2207067711.0, "step": 5193 }, { "entropy": 0.019046259112656116, "epoch": 2.273115220483642, "grad_norm": 17.125, "learning_rate": 1.687060097455333e-05, "loss": 0.1227, "loss_lm": 0.018619765993207693, "loss_seg": 0.10406730696558952, "mean_token_accuracy": 0.9950518310070038, "num_tokens": 2207492897.0, "step": 5194 }, { "entropy": 0.019235851243138313, "epoch": 2.2735529051318526, "grad_norm": 2.734375, "learning_rate": 1.6867893881970765e-05, "loss": 0.092, "loss_lm": 0.016938591143116355, "loss_seg": 0.07502660993486643, "mean_token_accuracy": 0.9950875490903854, "num_tokens": 2207918517.0, "step": 5195 }, { "entropy": 0.019022624474018812, "epoch": 2.2739905897800634, "grad_norm": 5.0625, "learning_rate": 1.68651867893882e-05, "loss": 0.1122, "loss_lm": 0.01843478949740529, "loss_seg": 0.09372057020664215, "mean_token_accuracy": 0.9951314032077789, "num_tokens": 2208343581.0, "step": 5196 }, { "entropy": 0.0185386729426682, "epoch": 2.274428274428274, "grad_norm": 18.625, "learning_rate": 1.6862479696805632e-05, "loss": 0.0844, "loss_lm": 0.016646027797833085, "loss_seg": 0.06771586369723082, "mean_token_accuracy": 0.9952780902385712, "num_tokens": 2208768997.0, "step": 5197 }, { "entropy": 0.018549267668277025, "epoch": 2.2748659590764855, "grad_norm": 7.78125, "learning_rate": 1.6859772604223063e-05, "loss": 0.0931, "loss_lm": 0.01569886109791696, "loss_seg": 0.07736785802990198, "mean_token_accuracy": 0.99513079226017, "num_tokens": 2209193691.0, "step": 5198 }, { "entropy": 0.018739246297627687, "epoch": 2.2753036437246963, "grad_norm": 3.8125, "learning_rate": 1.6857065511640497e-05, "loss": 0.1223, "loss_lm": 0.017585752997547388, "loss_seg": 0.10471387021243572, "mean_token_accuracy": 0.9952024221420288, "num_tokens": 2209619677.0, "step": 5199 }, { "entropy": 0.01879097055643797, "epoch": 2.2757413283729075, "grad_norm": 2.9375, "learning_rate": 1.6854358419057934e-05, "loss": 0.0944, "loss_lm": 0.019057864788919687, "loss_seg": 0.07533688470721245, "mean_token_accuracy": 0.995066687464714, "num_tokens": 2210045298.0, "step": 5200 }, { "entropy": 0.018748370464891195, "epoch": 2.2761790130211184, "grad_norm": 7.0, "learning_rate": 1.6851651326475367e-05, "loss": 0.1022, "loss_lm": 0.01733950269408524, "loss_seg": 0.08490614872425795, "mean_token_accuracy": 0.9952126443386078, "num_tokens": 2210469715.0, "step": 5201 }, { "entropy": 0.018422322813421488, "epoch": 2.276616697669329, "grad_norm": 9.625, "learning_rate": 1.6848944233892798e-05, "loss": 0.097, "loss_lm": 0.015887182904407382, "loss_seg": 0.08112326636910439, "mean_token_accuracy": 0.9951654076576233, "num_tokens": 2210894332.0, "step": 5202 }, { "entropy": 0.01811617659404874, "epoch": 2.27705438231754, "grad_norm": 8.25, "learning_rate": 1.684623714131023e-05, "loss": 0.1064, "loss_lm": 0.01630261796526611, "loss_seg": 0.09005406312644482, "mean_token_accuracy": 0.9952102154493332, "num_tokens": 2211318937.0, "step": 5203 }, { "entropy": 0.018791284877806902, "epoch": 2.2774920669657512, "grad_norm": 4.3125, "learning_rate": 1.6843530048727665e-05, "loss": 0.1306, "loss_lm": 0.01763262879103422, "loss_seg": 0.11296843737363815, "mean_token_accuracy": 0.9951624572277069, "num_tokens": 2211743704.0, "step": 5204 }, { "entropy": 0.01812832336872816, "epoch": 2.277929751613962, "grad_norm": 3.53125, "learning_rate": 1.6840822956145102e-05, "loss": 0.1185, "loss_lm": 0.014494798611849546, "loss_seg": 0.10399857349693775, "mean_token_accuracy": 0.9952758997678757, "num_tokens": 2212169121.0, "step": 5205 }, { "entropy": 0.018175256438553333, "epoch": 2.2783674362621733, "grad_norm": 6.25, "learning_rate": 1.6838115863562536e-05, "loss": 0.09, "loss_lm": 0.014282698975875974, "loss_seg": 0.07573976740241051, "mean_token_accuracy": 0.9953019767999649, "num_tokens": 2212593922.0, "step": 5206 }, { "entropy": 0.018387232907116413, "epoch": 2.278805120910384, "grad_norm": 2.796875, "learning_rate": 1.6835408770979966e-05, "loss": 0.0792, "loss_lm": 0.01416316395625472, "loss_seg": 0.0649960320442915, "mean_token_accuracy": 0.9950872361660004, "num_tokens": 2213019118.0, "step": 5207 }, { "entropy": 0.018350654281675816, "epoch": 2.279242805558595, "grad_norm": 6.3125, "learning_rate": 1.68327016783974e-05, "loss": 0.1122, "loss_lm": 0.014908848330378532, "loss_seg": 0.09725404251366854, "mean_token_accuracy": 0.9951727390289307, "num_tokens": 2213443923.0, "step": 5208 }, { "entropy": 0.01856497209519148, "epoch": 2.279680490206806, "grad_norm": 10.4375, "learning_rate": 1.6829994585814834e-05, "loss": 0.1082, "loss_lm": 0.016516184201464057, "loss_seg": 0.0917214434593916, "mean_token_accuracy": 0.9950968325138092, "num_tokens": 2213869113.0, "step": 5209 }, { "entropy": 0.01842330303043127, "epoch": 2.280118174855017, "grad_norm": 15.125, "learning_rate": 1.682728749323227e-05, "loss": 0.158, "loss_lm": 0.016196776181459427, "loss_seg": 0.14182596281170845, "mean_token_accuracy": 0.9952221810817719, "num_tokens": 2214293947.0, "step": 5210 }, { "entropy": 0.01823475118726492, "epoch": 2.280555859503228, "grad_norm": 11.0625, "learning_rate": 1.6824580400649705e-05, "loss": 0.1471, "loss_lm": 0.014971492579206824, "loss_seg": 0.13216781988739967, "mean_token_accuracy": 0.9952365756034851, "num_tokens": 2214719029.0, "step": 5211 }, { "entropy": 0.01781749213114381, "epoch": 2.280993544151439, "grad_norm": 4.53125, "learning_rate": 1.6821873308067135e-05, "loss": 0.099, "loss_lm": 0.01445945561863482, "loss_seg": 0.08453470841050148, "mean_token_accuracy": 0.995385468006134, "num_tokens": 2215144592.0, "step": 5212 }, { "entropy": 0.018312288913875818, "epoch": 2.28143122879965, "grad_norm": 5.0, "learning_rate": 1.681916621548457e-05, "loss": 0.0761, "loss_lm": 0.015717514092102647, "loss_seg": 0.060385570861399174, "mean_token_accuracy": 0.9952769130468369, "num_tokens": 2215570111.0, "step": 5213 }, { "entropy": 0.017860411200672388, "epoch": 2.2818689134478607, "grad_norm": 10.5625, "learning_rate": 1.6816459122902002e-05, "loss": 0.1018, "loss_lm": 0.014606654876843095, "loss_seg": 0.08714411221444607, "mean_token_accuracy": 0.9952708929777145, "num_tokens": 2215995502.0, "step": 5214 }, { "entropy": 0.018256839830428362, "epoch": 2.282306598096072, "grad_norm": 6.0, "learning_rate": 1.681375203031944e-05, "loss": 0.1495, "loss_lm": 0.016775319119915366, "loss_seg": 0.13276401534676552, "mean_token_accuracy": 0.9951860308647156, "num_tokens": 2216420260.0, "step": 5215 }, { "entropy": 0.01887495070695877, "epoch": 2.2827442827442828, "grad_norm": 10.5, "learning_rate": 1.6811044937736873e-05, "loss": 0.1057, "loss_lm": 0.01959701580926776, "loss_seg": 0.08609985467046499, "mean_token_accuracy": 0.9951089322566986, "num_tokens": 2216845703.0, "step": 5216 }, { "entropy": 0.01818511262536049, "epoch": 2.2831819673924936, "grad_norm": 6.625, "learning_rate": 1.6808337845154304e-05, "loss": 0.1198, "loss_lm": 0.016862026881426573, "loss_seg": 0.10289674811065197, "mean_token_accuracy": 0.9952330142259598, "num_tokens": 2217270678.0, "step": 5217 }, { "entropy": 0.017679226119071245, "epoch": 2.283619652040705, "grad_norm": 7.25, "learning_rate": 1.6805630752571737e-05, "loss": 0.137, "loss_lm": 0.015409945277497172, "loss_seg": 0.12162747792899609, "mean_token_accuracy": 0.9954560101032257, "num_tokens": 2217694899.0, "step": 5218 }, { "entropy": 0.01772649260237813, "epoch": 2.2840573366889156, "grad_norm": 4.375, "learning_rate": 1.680292365998917e-05, "loss": 0.1005, "loss_lm": 0.015289839589968324, "loss_seg": 0.08520512096583843, "mean_token_accuracy": 0.9953583478927612, "num_tokens": 2218120341.0, "step": 5219 }, { "entropy": 0.017486390192061663, "epoch": 2.2844950213371265, "grad_norm": 15.375, "learning_rate": 1.6800216567406608e-05, "loss": 0.1472, "loss_lm": 0.016887076664716005, "loss_seg": 0.13031888380646706, "mean_token_accuracy": 0.9953660815954208, "num_tokens": 2218545428.0, "step": 5220 }, { "entropy": 0.01881954213604331, "epoch": 2.2849327059853377, "grad_norm": 5.6875, "learning_rate": 1.6797509474824042e-05, "loss": 0.1398, "loss_lm": 0.016087309457361698, "loss_seg": 0.12368017062544823, "mean_token_accuracy": 0.9950587451457977, "num_tokens": 2218971000.0, "step": 5221 }, { "entropy": 0.01782034244388342, "epoch": 2.2853703906335485, "grad_norm": 10.0625, "learning_rate": 1.6794802382241472e-05, "loss": 0.0889, "loss_lm": 0.016275709494948387, "loss_seg": 0.072591464035213, "mean_token_accuracy": 0.9953623116016388, "num_tokens": 2219395731.0, "step": 5222 }, { "entropy": 0.01815118221566081, "epoch": 2.2858080752817593, "grad_norm": 3.890625, "learning_rate": 1.6792095289658906e-05, "loss": 0.0953, "loss_lm": 0.01721567800268531, "loss_seg": 0.078046889975667, "mean_token_accuracy": 0.9952799677848816, "num_tokens": 2219820469.0, "step": 5223 }, { "entropy": 0.017944027204066515, "epoch": 2.2862457599299706, "grad_norm": 4.03125, "learning_rate": 1.678938819707634e-05, "loss": 0.1027, "loss_lm": 0.016947139520198107, "loss_seg": 0.08576864562928677, "mean_token_accuracy": 0.9954030960798264, "num_tokens": 2220245045.0, "step": 5224 }, { "entropy": 0.018614318687468767, "epoch": 2.2866834445781814, "grad_norm": 9.5, "learning_rate": 1.6786681104493777e-05, "loss": 0.0939, "loss_lm": 0.01634492608718574, "loss_seg": 0.07750543486326933, "mean_token_accuracy": 0.9951555132865906, "num_tokens": 2220670208.0, "step": 5225 }, { "entropy": 0.017751814797520638, "epoch": 2.287121129226392, "grad_norm": 5.625, "learning_rate": 1.6783974011911207e-05, "loss": 0.1092, "loss_lm": 0.01561530469916761, "loss_seg": 0.09362520463764668, "mean_token_accuracy": 0.9953782856464386, "num_tokens": 2221095323.0, "step": 5226 }, { "entropy": 0.01791603397578001, "epoch": 2.2875588138746035, "grad_norm": 6.1875, "learning_rate": 1.678126691932864e-05, "loss": 0.1046, "loss_lm": 0.01739755622111261, "loss_seg": 0.08719146251678467, "mean_token_accuracy": 0.9952402263879776, "num_tokens": 2221521134.0, "step": 5227 }, { "entropy": 0.017864924389868975, "epoch": 2.2879964985228143, "grad_norm": 10.9375, "learning_rate": 1.6778559826746075e-05, "loss": 0.118, "loss_lm": 0.014994829427450895, "loss_seg": 0.1030195988714695, "mean_token_accuracy": 0.9951831847429276, "num_tokens": 2221946209.0, "step": 5228 }, { "entropy": 0.01824824744835496, "epoch": 2.288434183171025, "grad_norm": 21.5, "learning_rate": 1.677585273416351e-05, "loss": 0.133, "loss_lm": 0.016822451259940863, "loss_seg": 0.11618076078593731, "mean_token_accuracy": 0.9951649159193039, "num_tokens": 2222370786.0, "step": 5229 }, { "entropy": 0.01736532850190997, "epoch": 2.2888718678192363, "grad_norm": 10.125, "learning_rate": 1.6773145641580942e-05, "loss": 0.1094, "loss_lm": 0.01355761825107038, "loss_seg": 0.09588680788874626, "mean_token_accuracy": 0.9954383373260498, "num_tokens": 2222795573.0, "step": 5230 }, { "entropy": 0.0178814553655684, "epoch": 2.289309552467447, "grad_norm": 5.46875, "learning_rate": 1.6770438548998376e-05, "loss": 0.0933, "loss_lm": 0.014836329966783524, "loss_seg": 0.07846149150282145, "mean_token_accuracy": 0.9953685253858566, "num_tokens": 2223220873.0, "step": 5231 }, { "entropy": 0.018371100071817636, "epoch": 2.289747237115658, "grad_norm": 26.125, "learning_rate": 1.676773145641581e-05, "loss": 0.1032, "loss_lm": 0.015548511175438762, "loss_seg": 0.08767316862940788, "mean_token_accuracy": 0.9951870441436768, "num_tokens": 2223645946.0, "step": 5232 }, { "entropy": 0.01762959035113454, "epoch": 2.2901849217638692, "grad_norm": 8.75, "learning_rate": 1.6765024363833243e-05, "loss": 0.0784, "loss_lm": 0.017323640175163746, "loss_seg": 0.06105594988912344, "mean_token_accuracy": 0.9952690303325653, "num_tokens": 2224071175.0, "step": 5233 }, { "entropy": 0.018712603952735662, "epoch": 2.29062260641208, "grad_norm": 13.75, "learning_rate": 1.6762317271250677e-05, "loss": 0.1349, "loss_lm": 0.01973751443438232, "loss_seg": 0.11520404182374477, "mean_token_accuracy": 0.9949944466352463, "num_tokens": 2224497065.0, "step": 5234 }, { "entropy": 0.019079186487942934, "epoch": 2.2910602910602913, "grad_norm": 7.25, "learning_rate": 1.675961017866811e-05, "loss": 0.0902, "loss_lm": 0.015987978782504797, "loss_seg": 0.07422757148742676, "mean_token_accuracy": 0.9951639473438263, "num_tokens": 2224922592.0, "step": 5235 }, { "entropy": 0.018219137098640203, "epoch": 2.291497975708502, "grad_norm": 23.375, "learning_rate": 1.6756903086085544e-05, "loss": 0.0946, "loss_lm": 0.015917995711788535, "loss_seg": 0.07869763486087322, "mean_token_accuracy": 0.9951910823583603, "num_tokens": 2225347619.0, "step": 5236 }, { "entropy": 0.018351790495216846, "epoch": 2.291935660356713, "grad_norm": 3.984375, "learning_rate": 1.6754195993502978e-05, "loss": 0.0955, "loss_lm": 0.01594605459831655, "loss_seg": 0.07954217307269573, "mean_token_accuracy": 0.9951934218406677, "num_tokens": 2225772833.0, "step": 5237 }, { "entropy": 0.01796067226678133, "epoch": 2.2923733450049237, "grad_norm": 4.875, "learning_rate": 1.6751488900920412e-05, "loss": 0.072, "loss_lm": 0.014455535681918263, "loss_seg": 0.05750834569334984, "mean_token_accuracy": 0.9953692555427551, "num_tokens": 2226197986.0, "step": 5238 }, { "entropy": 0.018374749924987555, "epoch": 2.292811029653135, "grad_norm": 3.515625, "learning_rate": 1.6748781808337846e-05, "loss": 0.103, "loss_lm": 0.015436968067660928, "loss_seg": 0.08759840484708548, "mean_token_accuracy": 0.9951912462711334, "num_tokens": 2226623831.0, "step": 5239 }, { "entropy": 0.017746212892234325, "epoch": 2.293248714301346, "grad_norm": 6.03125, "learning_rate": 1.674607471575528e-05, "loss": 0.1369, "loss_lm": 0.014641002053394914, "loss_seg": 0.12223384343087673, "mean_token_accuracy": 0.9953800141811371, "num_tokens": 2227048663.0, "step": 5240 }, { "entropy": 0.018253246322274208, "epoch": 2.293686398949557, "grad_norm": 12.5625, "learning_rate": 1.6743367623172713e-05, "loss": 0.1065, "loss_lm": 0.015586887951940298, "loss_seg": 0.09087503235787153, "mean_token_accuracy": 0.9952078759670258, "num_tokens": 2227474109.0, "step": 5241 }, { "entropy": 0.017768549732863903, "epoch": 2.294124083597768, "grad_norm": 4.46875, "learning_rate": 1.6740660530590147e-05, "loss": 0.0876, "loss_lm": 0.015008922666311264, "loss_seg": 0.07260801270604134, "mean_token_accuracy": 0.9953593015670776, "num_tokens": 2227899227.0, "step": 5242 }, { "entropy": 0.018335101660341024, "epoch": 2.2945617682459787, "grad_norm": 7.6875, "learning_rate": 1.673795343800758e-05, "loss": 0.1088, "loss_lm": 0.01893801847472787, "loss_seg": 0.08990271668881178, "mean_token_accuracy": 0.9952536374330521, "num_tokens": 2228324127.0, "step": 5243 }, { "entropy": 0.018708813935518265, "epoch": 2.2949994528941895, "grad_norm": 4.0625, "learning_rate": 1.6735246345425014e-05, "loss": 0.0819, "loss_lm": 0.01452068262733519, "loss_seg": 0.06734576541930437, "mean_token_accuracy": 0.9950577169656754, "num_tokens": 2228749415.0, "step": 5244 }, { "entropy": 0.018426797818392515, "epoch": 2.2954371375424008, "grad_norm": 4.84375, "learning_rate": 1.6732539252842448e-05, "loss": 0.1046, "loss_lm": 0.01764632621780038, "loss_seg": 0.08699956350028515, "mean_token_accuracy": 0.9952422976493835, "num_tokens": 2229173752.0, "step": 5245 }, { "entropy": 0.018225878942757845, "epoch": 2.2958748221906116, "grad_norm": 13.0625, "learning_rate": 1.672983216025988e-05, "loss": 0.1378, "loss_lm": 0.017882844898849726, "loss_seg": 0.11989745311439037, "mean_token_accuracy": 0.9953217059373856, "num_tokens": 2229599320.0, "step": 5246 }, { "entropy": 0.017817940562963486, "epoch": 2.296312506838823, "grad_norm": 10.25, "learning_rate": 1.6727125067677315e-05, "loss": 0.1348, "loss_lm": 0.0159849904011935, "loss_seg": 0.11884518340229988, "mean_token_accuracy": 0.9954035729169846, "num_tokens": 2230023932.0, "step": 5247 }, { "entropy": 0.017967705614864826, "epoch": 2.2967501914870336, "grad_norm": 5.15625, "learning_rate": 1.672441797509475e-05, "loss": 0.1195, "loss_lm": 0.01641911454498768, "loss_seg": 0.10306148044764996, "mean_token_accuracy": 0.9954114556312561, "num_tokens": 2230448416.0, "step": 5248 }, { "entropy": 0.018800667021423578, "epoch": 2.2971878761352444, "grad_norm": 13.25, "learning_rate": 1.6721710882512183e-05, "loss": 0.0894, "loss_lm": 0.016058419831097126, "loss_seg": 0.07331824861466885, "mean_token_accuracy": 0.995149239897728, "num_tokens": 2230873460.0, "step": 5249 }, { "entropy": 0.01792284194380045, "epoch": 2.2976255607834557, "grad_norm": 57.5, "learning_rate": 1.6719003789929613e-05, "loss": 0.0932, "loss_lm": 0.015582493972033262, "loss_seg": 0.0776256937533617, "mean_token_accuracy": 0.9953672289848328, "num_tokens": 2231298658.0, "step": 5250 }, { "entropy": 0.018506065011024475, "epoch": 2.2980632454316665, "grad_norm": 21.0, "learning_rate": 1.671629669734705e-05, "loss": 0.1165, "loss_lm": 0.016620400827378035, "loss_seg": 0.09983208123594522, "mean_token_accuracy": 0.9951955825090408, "num_tokens": 2231723641.0, "step": 5251 }, { "entropy": 0.01785902539268136, "epoch": 2.2985009300798773, "grad_norm": 9.5, "learning_rate": 1.6713589604764484e-05, "loss": 0.1302, "loss_lm": 0.018902509473264217, "loss_seg": 0.11131049320101738, "mean_token_accuracy": 0.9952784180641174, "num_tokens": 2232148875.0, "step": 5252 }, { "entropy": 0.017757014371454716, "epoch": 2.2989386147280886, "grad_norm": 3.796875, "learning_rate": 1.6710882512181918e-05, "loss": 0.0868, "loss_lm": 0.013542308704927564, "loss_seg": 0.07322648447006941, "mean_token_accuracy": 0.9953795373439789, "num_tokens": 2232573882.0, "step": 5253 }, { "entropy": 0.0180953536182642, "epoch": 2.2993762993762994, "grad_norm": 2.953125, "learning_rate": 1.670817541959935e-05, "loss": 0.107, "loss_lm": 0.014747799839824438, "loss_seg": 0.09228371270000935, "mean_token_accuracy": 0.9952961355447769, "num_tokens": 2232998795.0, "step": 5254 }, { "entropy": 0.018760235980153084, "epoch": 2.29981398402451, "grad_norm": 10.125, "learning_rate": 1.6705468327016782e-05, "loss": 0.0957, "loss_lm": 0.01631625951267779, "loss_seg": 0.07933501712977886, "mean_token_accuracy": 0.9951898008584976, "num_tokens": 2233423480.0, "step": 5255 }, { "entropy": 0.018119052052497864, "epoch": 2.3002516686727215, "grad_norm": 7.40625, "learning_rate": 1.670276123443422e-05, "loss": 0.1143, "loss_lm": 0.017284690868109465, "loss_seg": 0.09698559250682592, "mean_token_accuracy": 0.9951430112123489, "num_tokens": 2233849025.0, "step": 5256 }, { "entropy": 0.01860475353896618, "epoch": 2.3006893533209323, "grad_norm": 3.765625, "learning_rate": 1.6700054141851653e-05, "loss": 0.097, "loss_lm": 0.01817000610753894, "loss_seg": 0.07887720968574286, "mean_token_accuracy": 0.9951779991388321, "num_tokens": 2234274076.0, "step": 5257 }, { "entropy": 0.01784433051943779, "epoch": 2.301127037969143, "grad_norm": 5.96875, "learning_rate": 1.6697347049269086e-05, "loss": 0.0852, "loss_lm": 0.015134365996345878, "loss_seg": 0.07010949403047562, "mean_token_accuracy": 0.9952114075422287, "num_tokens": 2234699194.0, "step": 5258 }, { "entropy": 0.017825881019234657, "epoch": 2.3015647226173543, "grad_norm": 3.78125, "learning_rate": 1.669463995668652e-05, "loss": 0.0833, "loss_lm": 0.016059669200330973, "loss_seg": 0.06727050710469484, "mean_token_accuracy": 0.9953949451446533, "num_tokens": 2235124533.0, "step": 5259 }, { "entropy": 0.017913647927343845, "epoch": 2.302002407265565, "grad_norm": 6.96875, "learning_rate": 1.669193286410395e-05, "loss": 0.1115, "loss_lm": 0.016346951946616173, "loss_seg": 0.09517803601920605, "mean_token_accuracy": 0.9953582733869553, "num_tokens": 2235549313.0, "step": 5260 }, { "entropy": 0.018521208316087723, "epoch": 2.302440091913776, "grad_norm": 3.625, "learning_rate": 1.6689225771521384e-05, "loss": 0.1012, "loss_lm": 0.017251409590244293, "loss_seg": 0.08393772132694721, "mean_token_accuracy": 0.9952042996883392, "num_tokens": 2235975273.0, "step": 5261 }, { "entropy": 0.01866899011656642, "epoch": 2.302877776561987, "grad_norm": 3.984375, "learning_rate": 1.668651867893882e-05, "loss": 0.0845, "loss_lm": 0.015371965942904353, "loss_seg": 0.06914863456040621, "mean_token_accuracy": 0.9951519668102264, "num_tokens": 2236400760.0, "step": 5262 }, { "entropy": 0.017912165727466345, "epoch": 2.303315461210198, "grad_norm": 6.0625, "learning_rate": 1.6683811586356255e-05, "loss": 0.1387, "loss_lm": 0.013391752261668444, "loss_seg": 0.12527924962341785, "mean_token_accuracy": 0.9953529983758926, "num_tokens": 2236825729.0, "step": 5263 }, { "entropy": 0.01872101193293929, "epoch": 2.303753145858409, "grad_norm": 5.09375, "learning_rate": 1.668110449377369e-05, "loss": 0.1072, "loss_lm": 0.017464719247072935, "loss_seg": 0.0897153839468956, "mean_token_accuracy": 0.9951613396406174, "num_tokens": 2237250348.0, "step": 5264 }, { "entropy": 0.0183389438316226, "epoch": 2.30419083050662, "grad_norm": 4.59375, "learning_rate": 1.667839740119112e-05, "loss": 0.1175, "loss_lm": 0.016330330865457654, "loss_seg": 0.10114151239395142, "mean_token_accuracy": 0.9951742887496948, "num_tokens": 2237675560.0, "step": 5265 }, { "entropy": 0.01865848433226347, "epoch": 2.304628515154831, "grad_norm": 7.875, "learning_rate": 1.6675690308608553e-05, "loss": 0.1402, "loss_lm": 0.01803534268401563, "loss_seg": 0.12221144884824753, "mean_token_accuracy": 0.9951066970825195, "num_tokens": 2238101018.0, "step": 5266 }, { "entropy": 0.017875209916383028, "epoch": 2.3050661998030417, "grad_norm": 6.0625, "learning_rate": 1.667298321602599e-05, "loss": 0.1328, "loss_lm": 0.014958037761971354, "loss_seg": 0.11788980290293694, "mean_token_accuracy": 0.9953394681215286, "num_tokens": 2238526628.0, "step": 5267 }, { "entropy": 0.018138146493583918, "epoch": 2.305503884451253, "grad_norm": 9.375, "learning_rate": 1.6670276123443424e-05, "loss": 0.0915, "loss_lm": 0.016309481346979737, "loss_seg": 0.07521296851336956, "mean_token_accuracy": 0.9952322095632553, "num_tokens": 2238951733.0, "step": 5268 }, { "entropy": 0.018523545004427433, "epoch": 2.305941569099464, "grad_norm": 5.21875, "learning_rate": 1.6667569030860857e-05, "loss": 0.1275, "loss_lm": 0.016250005457550287, "loss_seg": 0.11127770505845547, "mean_token_accuracy": 0.9951501935720444, "num_tokens": 2239376875.0, "step": 5269 }, { "entropy": 0.01876933965831995, "epoch": 2.3063792537476746, "grad_norm": 10.6875, "learning_rate": 1.6664861938278288e-05, "loss": 0.1105, "loss_lm": 0.01818698225542903, "loss_seg": 0.09229277074337006, "mean_token_accuracy": 0.9950684756040573, "num_tokens": 2239801469.0, "step": 5270 }, { "entropy": 0.017700894735753536, "epoch": 2.306816938395886, "grad_norm": 4.21875, "learning_rate": 1.666215484569572e-05, "loss": 0.1417, "loss_lm": 0.016202788800001144, "loss_seg": 0.12550981529057026, "mean_token_accuracy": 0.995368242263794, "num_tokens": 2240226334.0, "step": 5271 }, { "entropy": 0.01798002840951085, "epoch": 2.3072546230440967, "grad_norm": 9.5, "learning_rate": 1.665944775311316e-05, "loss": 0.0765, "loss_lm": 0.016733018681406975, "loss_seg": 0.05978529714047909, "mean_token_accuracy": 0.9952931106090546, "num_tokens": 2240651850.0, "step": 5272 }, { "entropy": 0.01837592152878642, "epoch": 2.3076923076923075, "grad_norm": 5.625, "learning_rate": 1.6656740660530592e-05, "loss": 0.0997, "loss_lm": 0.016380369430407882, "loss_seg": 0.0833599679172039, "mean_token_accuracy": 0.9950356483459473, "num_tokens": 2241076845.0, "step": 5273 }, { "entropy": 0.01833784719929099, "epoch": 2.3081299923405187, "grad_norm": 12.875, "learning_rate": 1.6654033567948023e-05, "loss": 0.1311, "loss_lm": 0.015484778210520744, "loss_seg": 0.11560882814228535, "mean_token_accuracy": 0.9952075332403183, "num_tokens": 2241502068.0, "step": 5274 }, { "entropy": 0.018254772294312716, "epoch": 2.3085676769887296, "grad_norm": 14.1875, "learning_rate": 1.6651326475365456e-05, "loss": 0.124, "loss_lm": 0.01697630388662219, "loss_seg": 0.10698889940977097, "mean_token_accuracy": 0.995195284485817, "num_tokens": 2241926826.0, "step": 5275 }, { "entropy": 0.01854381011798978, "epoch": 2.309005361636941, "grad_norm": 29.625, "learning_rate": 1.664861938278289e-05, "loss": 0.12, "loss_lm": 0.014700726605951786, "loss_seg": 0.1053411178290844, "mean_token_accuracy": 0.9952492564916611, "num_tokens": 2242352000.0, "step": 5276 }, { "entropy": 0.018276765942573547, "epoch": 2.3094430462851516, "grad_norm": 20.75, "learning_rate": 1.6645912290200327e-05, "loss": 0.0941, "loss_lm": 0.014933978905901313, "loss_seg": 0.07916617579758167, "mean_token_accuracy": 0.9952734857797623, "num_tokens": 2242776568.0, "step": 5277 }, { "entropy": 0.018107762560248375, "epoch": 2.3098807309333624, "grad_norm": 12.625, "learning_rate": 1.664320519761776e-05, "loss": 0.1232, "loss_lm": 0.01430941023863852, "loss_seg": 0.10886561498045921, "mean_token_accuracy": 0.9952575266361237, "num_tokens": 2243201797.0, "step": 5278 }, { "entropy": 0.018125552218407393, "epoch": 2.3103184155815732, "grad_norm": 12.6875, "learning_rate": 1.664049810503519e-05, "loss": 0.0821, "loss_lm": 0.014556683832779527, "loss_seg": 0.06756186485290527, "mean_token_accuracy": 0.9952643364667892, "num_tokens": 2243626603.0, "step": 5279 }, { "entropy": 0.017286814749240875, "epoch": 2.3107561002297845, "grad_norm": 82.0, "learning_rate": 1.6637791012452625e-05, "loss": 0.0868, "loss_lm": 0.014792748494073749, "loss_seg": 0.0720185711979866, "mean_token_accuracy": 0.9954214841127396, "num_tokens": 2244050919.0, "step": 5280 }, { "entropy": 0.018187538255006075, "epoch": 2.3111937848779953, "grad_norm": 11.1875, "learning_rate": 1.663508391987006e-05, "loss": 0.0908, "loss_lm": 0.014906068332493305, "loss_seg": 0.07587642595171928, "mean_token_accuracy": 0.9952075481414795, "num_tokens": 2244476565.0, "step": 5281 }, { "entropy": 0.018368034157902002, "epoch": 2.3116314695262066, "grad_norm": 2.953125, "learning_rate": 1.6632376827287496e-05, "loss": 0.1693, "loss_lm": 0.017618943937122822, "loss_seg": 0.15169325843453407, "mean_token_accuracy": 0.9951383322477341, "num_tokens": 2244901256.0, "step": 5282 }, { "entropy": 0.01825999328866601, "epoch": 2.3120691541744174, "grad_norm": 11.375, "learning_rate": 1.662966973470493e-05, "loss": 0.0955, "loss_lm": 0.014597538160160184, "loss_seg": 0.08090844098478556, "mean_token_accuracy": 0.9953389465808868, "num_tokens": 2245326228.0, "step": 5283 }, { "entropy": 0.018607973586767912, "epoch": 2.312506838822628, "grad_norm": 10.5625, "learning_rate": 1.662696264212236e-05, "loss": 0.1025, "loss_lm": 0.018502803053706884, "loss_seg": 0.08399851806461811, "mean_token_accuracy": 0.9951835423707962, "num_tokens": 2245751816.0, "step": 5284 }, { "entropy": 0.0177691918797791, "epoch": 2.3129445234708395, "grad_norm": 12.8125, "learning_rate": 1.6624255549539794e-05, "loss": 0.0689, "loss_lm": 0.01709447312168777, "loss_seg": 0.05181773193180561, "mean_token_accuracy": 0.9952887743711472, "num_tokens": 2246177555.0, "step": 5285 }, { "entropy": 0.018171779345721006, "epoch": 2.3133822081190503, "grad_norm": 3.734375, "learning_rate": 1.6621548456957227e-05, "loss": 0.1249, "loss_lm": 0.01658742595463991, "loss_seg": 0.10831313766539097, "mean_token_accuracy": 0.9952245801687241, "num_tokens": 2246603433.0, "step": 5286 }, { "entropy": 0.01777651719748974, "epoch": 2.313819892767261, "grad_norm": 6.625, "learning_rate": 1.6618841364374664e-05, "loss": 0.0827, "loss_lm": 0.014047452481463552, "loss_seg": 0.06863754522055387, "mean_token_accuracy": 0.9954101890325546, "num_tokens": 2247028428.0, "step": 5287 }, { "entropy": 0.01755176205188036, "epoch": 2.3142575774154723, "grad_norm": 4.34375, "learning_rate": 1.6616134271792098e-05, "loss": 0.1128, "loss_lm": 0.015708085848018527, "loss_seg": 0.09704599529504776, "mean_token_accuracy": 0.9952013492584229, "num_tokens": 2247453492.0, "step": 5288 }, { "entropy": 0.01816068310290575, "epoch": 2.314695262063683, "grad_norm": 4.0625, "learning_rate": 1.661342717920953e-05, "loss": 0.1634, "loss_lm": 0.016954293008893728, "loss_seg": 0.1464917529374361, "mean_token_accuracy": 0.995195284485817, "num_tokens": 2247878648.0, "step": 5289 }, { "entropy": 0.01837029028683901, "epoch": 2.315132946711894, "grad_norm": 3.234375, "learning_rate": 1.6610720086626962e-05, "loss": 0.1249, "loss_lm": 0.016656765714287758, "loss_seg": 0.10821268893778324, "mean_token_accuracy": 0.9951199144124985, "num_tokens": 2248304230.0, "step": 5290 }, { "entropy": 0.01783205894753337, "epoch": 2.315570631360105, "grad_norm": 8.875, "learning_rate": 1.6608012994044396e-05, "loss": 0.1011, "loss_lm": 0.014902070863172412, "loss_seg": 0.08622756041586399, "mean_token_accuracy": 0.9952126145362854, "num_tokens": 2248728846.0, "step": 5291 }, { "entropy": 0.01828620955348015, "epoch": 2.316008316008316, "grad_norm": 3.375, "learning_rate": 1.6605305901461833e-05, "loss": 0.0967, "loss_lm": 0.01578937633894384, "loss_seg": 0.08090309798717499, "mean_token_accuracy": 0.9953434765338898, "num_tokens": 2249154018.0, "step": 5292 }, { "entropy": 0.018211704678833485, "epoch": 2.316446000656527, "grad_norm": 4.3125, "learning_rate": 1.6602598808879267e-05, "loss": 0.0969, "loss_lm": 0.016105716349557042, "loss_seg": 0.08081677835434675, "mean_token_accuracy": 0.9951542615890503, "num_tokens": 2249579119.0, "step": 5293 }, { "entropy": 0.018330442253500223, "epoch": 2.316883685304738, "grad_norm": 7.28125, "learning_rate": 1.6599891716296697e-05, "loss": 0.1281, "loss_lm": 0.01737272529862821, "loss_seg": 0.11074007488787174, "mean_token_accuracy": 0.9952066987752914, "num_tokens": 2250004051.0, "step": 5294 }, { "entropy": 0.018214342184364796, "epoch": 2.317321369952949, "grad_norm": 5.3125, "learning_rate": 1.659718462371413e-05, "loss": 0.1492, "loss_lm": 0.01733423490077257, "loss_seg": 0.13181915134191513, "mean_token_accuracy": 0.9952089935541153, "num_tokens": 2250428559.0, "step": 5295 }, { "entropy": 0.017644349951297045, "epoch": 2.3177590546011597, "grad_norm": 6.53125, "learning_rate": 1.6594477531131565e-05, "loss": 0.1226, "loss_lm": 0.01492792065255344, "loss_seg": 0.1077088862657547, "mean_token_accuracy": 0.9953201413154602, "num_tokens": 2250853955.0, "step": 5296 }, { "entropy": 0.017742800060659647, "epoch": 2.318196739249371, "grad_norm": 9.0625, "learning_rate": 1.6591770438548998e-05, "loss": 0.1139, "loss_lm": 0.01710132439620793, "loss_seg": 0.09677055291831493, "mean_token_accuracy": 0.9953366070985794, "num_tokens": 2251279304.0, "step": 5297 }, { "entropy": 0.018126164563000202, "epoch": 2.318634423897582, "grad_norm": 18.625, "learning_rate": 1.6589063345966432e-05, "loss": 0.0865, "loss_lm": 0.015755230793729424, "loss_seg": 0.0707600750029087, "mean_token_accuracy": 0.9951387792825699, "num_tokens": 2251704860.0, "step": 5298 }, { "entropy": 0.01857083709910512, "epoch": 2.3190721085457926, "grad_norm": 7.21875, "learning_rate": 1.6586356253383866e-05, "loss": 0.1486, "loss_lm": 0.017440004739910364, "loss_seg": 0.13120093569159508, "mean_token_accuracy": 0.9950817823410034, "num_tokens": 2252130353.0, "step": 5299 }, { "entropy": 0.018378022126853466, "epoch": 2.319509793194004, "grad_norm": 2.765625, "learning_rate": 1.65836491608013e-05, "loss": 0.1143, "loss_lm": 0.016843281686306, "loss_seg": 0.09750583581626415, "mean_token_accuracy": 0.9951263070106506, "num_tokens": 2252555232.0, "step": 5300 }, { "entropy": 0.01805719267576933, "epoch": 2.3199474778422147, "grad_norm": 14.0625, "learning_rate": 1.6580942068218733e-05, "loss": 0.1034, "loss_lm": 0.0152888847514987, "loss_seg": 0.0881368387490511, "mean_token_accuracy": 0.9952683746814728, "num_tokens": 2252980195.0, "step": 5301 }, { "entropy": 0.017795901279896498, "epoch": 2.3203851624904255, "grad_norm": 3.640625, "learning_rate": 1.6578234975636167e-05, "loss": 0.1131, "loss_lm": 0.01703842729330063, "loss_seg": 0.09607768058776855, "mean_token_accuracy": 0.9953590333461761, "num_tokens": 2253405253.0, "step": 5302 }, { "entropy": 0.017913021612912416, "epoch": 2.3208228471386367, "grad_norm": 12.875, "learning_rate": 1.65755278830536e-05, "loss": 0.1085, "loss_lm": 0.015063820173963904, "loss_seg": 0.09345287922769785, "mean_token_accuracy": 0.9953268468379974, "num_tokens": 2253829961.0, "step": 5303 }, { "entropy": 0.018160834442824125, "epoch": 2.3212605317868475, "grad_norm": 20.75, "learning_rate": 1.6572820790471034e-05, "loss": 0.1191, "loss_lm": 0.015901544596999884, "loss_seg": 0.10315310396254063, "mean_token_accuracy": 0.9951509833335876, "num_tokens": 2254255374.0, "step": 5304 }, { "entropy": 0.017939706798642874, "epoch": 2.3216982164350584, "grad_norm": 7.4375, "learning_rate": 1.6570113697888468e-05, "loss": 0.0982, "loss_lm": 0.016388912685215473, "loss_seg": 0.08176441676914692, "mean_token_accuracy": 0.9952778220176697, "num_tokens": 2254680981.0, "step": 5305 }, { "entropy": 0.017885216511785984, "epoch": 2.3221359010832696, "grad_norm": 4.8125, "learning_rate": 1.6567406605305902e-05, "loss": 0.1038, "loss_lm": 0.01654296345077455, "loss_seg": 0.08729233406484127, "mean_token_accuracy": 0.9952162504196167, "num_tokens": 2255105086.0, "step": 5306 }, { "entropy": 0.01846439065411687, "epoch": 2.3225735857314804, "grad_norm": 16.125, "learning_rate": 1.6564699512723336e-05, "loss": 0.1068, "loss_lm": 0.01681569218635559, "loss_seg": 0.08998100087046623, "mean_token_accuracy": 0.9950929433107376, "num_tokens": 2255530376.0, "step": 5307 }, { "entropy": 0.018178249709308147, "epoch": 2.3230112703796912, "grad_norm": 9.5625, "learning_rate": 1.656199242014077e-05, "loss": 0.1327, "loss_lm": 0.016584659228101373, "loss_seg": 0.11615394055843353, "mean_token_accuracy": 0.9951439648866653, "num_tokens": 2255955591.0, "step": 5308 }, { "entropy": 0.0182294687256217, "epoch": 2.3234489550279025, "grad_norm": 18.5, "learning_rate": 1.6559285327558203e-05, "loss": 0.1195, "loss_lm": 0.01542535237967968, "loss_seg": 0.10411985218524933, "mean_token_accuracy": 0.9951243102550507, "num_tokens": 2256381337.0, "step": 5309 }, { "entropy": 0.017730278428643942, "epoch": 2.3238866396761133, "grad_norm": 5.25, "learning_rate": 1.6556578234975637e-05, "loss": 0.0876, "loss_lm": 0.015262188389897346, "loss_seg": 0.07230251096189022, "mean_token_accuracy": 0.9955834001302719, "num_tokens": 2256806268.0, "step": 5310 }, { "entropy": 0.017581448890268803, "epoch": 2.3243243243243246, "grad_norm": 4.75, "learning_rate": 1.655387114239307e-05, "loss": 0.1124, "loss_lm": 0.01536109996959567, "loss_seg": 0.09704532288014889, "mean_token_accuracy": 0.9953388124704361, "num_tokens": 2257231016.0, "step": 5311 }, { "entropy": 0.017487753182649612, "epoch": 2.3247620089725354, "grad_norm": 6.4375, "learning_rate": 1.6551164049810504e-05, "loss": 0.0943, "loss_lm": 0.016853698063641787, "loss_seg": 0.07741998042911291, "mean_token_accuracy": 0.9953266829252243, "num_tokens": 2257655071.0, "step": 5312 }, { "entropy": 0.017912782728672028, "epoch": 2.325199693620746, "grad_norm": 9.75, "learning_rate": 1.6548456957227938e-05, "loss": 0.0942, "loss_lm": 0.016701079672202468, "loss_seg": 0.07752769812941551, "mean_token_accuracy": 0.9953695386648178, "num_tokens": 2258079814.0, "step": 5313 }, { "entropy": 0.01753937965258956, "epoch": 2.325637378268957, "grad_norm": 4.5, "learning_rate": 1.654574986464537e-05, "loss": 0.1009, "loss_lm": 0.014593550702556968, "loss_seg": 0.08632592111825943, "mean_token_accuracy": 0.9954568296670914, "num_tokens": 2258504576.0, "step": 5314 }, { "entropy": 0.018454895820468664, "epoch": 2.3260750629171683, "grad_norm": 5.0625, "learning_rate": 1.6543042772062805e-05, "loss": 0.1034, "loss_lm": 0.016268116421997547, "loss_seg": 0.08714229799807072, "mean_token_accuracy": 0.9952458441257477, "num_tokens": 2258930254.0, "step": 5315 }, { "entropy": 0.019120031502097845, "epoch": 2.326512747565379, "grad_norm": 7.375, "learning_rate": 1.654033567948024e-05, "loss": 0.1005, "loss_lm": 0.017277397215366364, "loss_seg": 0.08322400785982609, "mean_token_accuracy": 0.9950893670320511, "num_tokens": 2259355433.0, "step": 5316 }, { "entropy": 0.01817309856414795, "epoch": 2.3269504322135903, "grad_norm": 29.0, "learning_rate": 1.653762858689767e-05, "loss": 0.1365, "loss_lm": 0.014940979890525341, "loss_seg": 0.12157096154987812, "mean_token_accuracy": 0.9952376335859299, "num_tokens": 2259780600.0, "step": 5317 }, { "entropy": 0.018602481111884117, "epoch": 2.327388116861801, "grad_norm": 5.34375, "learning_rate": 1.6534921494315107e-05, "loss": 0.0939, "loss_lm": 0.015615479787811637, "loss_seg": 0.07827488891780376, "mean_token_accuracy": 0.9950897246599197, "num_tokens": 2260206219.0, "step": 5318 }, { "entropy": 0.01821262575685978, "epoch": 2.327825801510012, "grad_norm": 6.34375, "learning_rate": 1.653221440173254e-05, "loss": 0.0986, "loss_lm": 0.016461243852972984, "loss_seg": 0.0821138247847557, "mean_token_accuracy": 0.9953096956014633, "num_tokens": 2260631746.0, "step": 5319 }, { "entropy": 0.018734545446932316, "epoch": 2.3282634861582228, "grad_norm": 2.25, "learning_rate": 1.6529507309149974e-05, "loss": 0.0728, "loss_lm": 0.019153286702930927, "loss_seg": 0.05363971646875143, "mean_token_accuracy": 0.995070219039917, "num_tokens": 2261057283.0, "step": 5320 }, { "entropy": 0.018217138946056366, "epoch": 2.328701170806434, "grad_norm": 2.84375, "learning_rate": 1.6526800216567408e-05, "loss": 0.1098, "loss_lm": 0.015606217551976442, "loss_seg": 0.09422641526907682, "mean_token_accuracy": 0.9952380955219269, "num_tokens": 2261481235.0, "step": 5321 }, { "entropy": 0.01855687191709876, "epoch": 2.329138855454645, "grad_norm": 6.90625, "learning_rate": 1.6524093123984838e-05, "loss": 0.1119, "loss_lm": 0.01663989294320345, "loss_seg": 0.09523116983473301, "mean_token_accuracy": 0.9951948821544647, "num_tokens": 2261906577.0, "step": 5322 }, { "entropy": 0.018590251449495554, "epoch": 2.329576540102856, "grad_norm": 5.4375, "learning_rate": 1.6521386031402275e-05, "loss": 0.1134, "loss_lm": 0.016785751562565565, "loss_seg": 0.0965653508901596, "mean_token_accuracy": 0.9951534420251846, "num_tokens": 2262332000.0, "step": 5323 }, { "entropy": 0.017904330044984818, "epoch": 2.330014224751067, "grad_norm": 5.09375, "learning_rate": 1.651867893881971e-05, "loss": 0.1614, "loss_lm": 0.015656856587156653, "loss_seg": 0.14574667811393738, "mean_token_accuracy": 0.9953592717647552, "num_tokens": 2262757535.0, "step": 5324 }, { "entropy": 0.018099563661962748, "epoch": 2.3304519093992777, "grad_norm": 3.359375, "learning_rate": 1.6515971846237143e-05, "loss": 0.1125, "loss_lm": 0.014871747698634863, "loss_seg": 0.09760977514088154, "mean_token_accuracy": 0.995259165763855, "num_tokens": 2263182297.0, "step": 5325 }, { "entropy": 0.01736669009551406, "epoch": 2.330889594047489, "grad_norm": 21.875, "learning_rate": 1.6513264753654576e-05, "loss": 0.0869, "loss_lm": 0.012623196467757225, "loss_seg": 0.07426618970930576, "mean_token_accuracy": 0.9954857379198074, "num_tokens": 2263606640.0, "step": 5326 }, { "entropy": 0.017691079527139664, "epoch": 2.3313272786957, "grad_norm": 10.8125, "learning_rate": 1.6510557661072007e-05, "loss": 0.0911, "loss_lm": 0.012800781521946192, "loss_seg": 0.07834511995315552, "mean_token_accuracy": 0.9954039752483368, "num_tokens": 2264031286.0, "step": 5327 }, { "entropy": 0.018139036372303963, "epoch": 2.3317649633439106, "grad_norm": 4.65625, "learning_rate": 1.650785056848944e-05, "loss": 0.0968, "loss_lm": 0.01580157084390521, "loss_seg": 0.0809848215430975, "mean_token_accuracy": 0.9952572882175446, "num_tokens": 2264456334.0, "step": 5328 }, { "entropy": 0.018703998997807503, "epoch": 2.332202647992122, "grad_norm": 11.5, "learning_rate": 1.6505143475906878e-05, "loss": 0.106, "loss_lm": 0.015662520192563534, "loss_seg": 0.09037592820823193, "mean_token_accuracy": 0.9952311366796494, "num_tokens": 2264881557.0, "step": 5329 }, { "entropy": 0.01813357323408127, "epoch": 2.3326403326403327, "grad_norm": 3.265625, "learning_rate": 1.650243638332431e-05, "loss": 0.0883, "loss_lm": 0.01628390420228243, "loss_seg": 0.07197707891464233, "mean_token_accuracy": 0.9952389299869537, "num_tokens": 2265306207.0, "step": 5330 }, { "entropy": 0.01766067137941718, "epoch": 2.3330780172885435, "grad_norm": 11.25, "learning_rate": 1.6499729290741745e-05, "loss": 0.1185, "loss_lm": 0.01621797331608832, "loss_seg": 0.10224090330302715, "mean_token_accuracy": 0.9953906536102295, "num_tokens": 2265730519.0, "step": 5331 }, { "entropy": 0.017857607919722795, "epoch": 2.3335157019367547, "grad_norm": 26.875, "learning_rate": 1.6497022198159175e-05, "loss": 0.106, "loss_lm": 0.01626613363623619, "loss_seg": 0.08970518037676811, "mean_token_accuracy": 0.9954005181789398, "num_tokens": 2266155503.0, "step": 5332 }, { "entropy": 0.018301683943718672, "epoch": 2.3339533865849655, "grad_norm": 11.875, "learning_rate": 1.649431510557661e-05, "loss": 0.1083, "loss_lm": 0.01564238080754876, "loss_seg": 0.09266986511647701, "mean_token_accuracy": 0.995110958814621, "num_tokens": 2266580885.0, "step": 5333 }, { "entropy": 0.01818200945854187, "epoch": 2.3343910712331764, "grad_norm": 11.6875, "learning_rate": 1.6491608012994046e-05, "loss": 0.0722, "loss_lm": 0.01429433049634099, "loss_seg": 0.05787531193345785, "mean_token_accuracy": 0.9952294379472733, "num_tokens": 2267005945.0, "step": 5334 }, { "entropy": 0.018674227874726057, "epoch": 2.3348287558813876, "grad_norm": 2.796875, "learning_rate": 1.648890092041148e-05, "loss": 0.109, "loss_lm": 0.017505294177681208, "loss_seg": 0.09145023114979267, "mean_token_accuracy": 0.9951358288526535, "num_tokens": 2267431643.0, "step": 5335 }, { "entropy": 0.018418211955577135, "epoch": 2.3352664405295984, "grad_norm": 4.03125, "learning_rate": 1.6486193827828914e-05, "loss": 0.0985, "loss_lm": 0.016200479585677385, "loss_seg": 0.08226533234119415, "mean_token_accuracy": 0.9951618313789368, "num_tokens": 2267857396.0, "step": 5336 }, { "entropy": 0.018635174725204706, "epoch": 2.3357041251778092, "grad_norm": 9.8125, "learning_rate": 1.6483486735246344e-05, "loss": 0.1452, "loss_lm": 0.018040326656773686, "loss_seg": 0.12718887627124786, "mean_token_accuracy": 0.9951370656490326, "num_tokens": 2268283253.0, "step": 5337 }, { "entropy": 0.01836017146706581, "epoch": 2.3361418098260205, "grad_norm": 3.625, "learning_rate": 1.6480779642663778e-05, "loss": 0.1122, "loss_lm": 0.014322864590212703, "loss_seg": 0.09785703849047422, "mean_token_accuracy": 0.9952631592750549, "num_tokens": 2268707955.0, "step": 5338 }, { "entropy": 0.017857377883046865, "epoch": 2.3365794944742313, "grad_norm": 10.1875, "learning_rate": 1.6478072550081215e-05, "loss": 0.0774, "loss_lm": 0.015150669496506453, "loss_seg": 0.062299242708832026, "mean_token_accuracy": 0.995257705450058, "num_tokens": 2269132814.0, "step": 5339 }, { "entropy": 0.018186775967478752, "epoch": 2.337017179122442, "grad_norm": 6.71875, "learning_rate": 1.647536545749865e-05, "loss": 0.1192, "loss_lm": 0.014993024058640003, "loss_seg": 0.10417541954666376, "mean_token_accuracy": 0.9952629953622818, "num_tokens": 2269558571.0, "step": 5340 }, { "entropy": 0.018619247246533632, "epoch": 2.3374548637706534, "grad_norm": 7.28125, "learning_rate": 1.647265836491608e-05, "loss": 0.0888, "loss_lm": 0.016646303702145815, "loss_seg": 0.07217109017074108, "mean_token_accuracy": 0.9951357245445251, "num_tokens": 2269983474.0, "step": 5341 }, { "entropy": 0.018553247209638357, "epoch": 2.337892548418864, "grad_norm": 8.75, "learning_rate": 1.6469951272333513e-05, "loss": 0.1489, "loss_lm": 0.0191490207798779, "loss_seg": 0.12975154258310795, "mean_token_accuracy": 0.9951100945472717, "num_tokens": 2270408587.0, "step": 5342 }, { "entropy": 0.01823163963854313, "epoch": 2.338330233067075, "grad_norm": 12.4375, "learning_rate": 1.6467244179750946e-05, "loss": 0.0838, "loss_lm": 0.015080917626619339, "loss_seg": 0.06867300346493721, "mean_token_accuracy": 0.9952691793441772, "num_tokens": 2270833608.0, "step": 5343 }, { "entropy": 0.018258374650031328, "epoch": 2.3387679177152862, "grad_norm": 26.375, "learning_rate": 1.6464537087168383e-05, "loss": 0.1171, "loss_lm": 0.017183871939778328, "loss_seg": 0.09995922446250916, "mean_token_accuracy": 0.9952500909566879, "num_tokens": 2271258630.0, "step": 5344 }, { "entropy": 0.017947420477867126, "epoch": 2.339205602363497, "grad_norm": 13.0, "learning_rate": 1.6461829994585817e-05, "loss": 0.1323, "loss_lm": 0.015789871104061604, "loss_seg": 0.11655743047595024, "mean_token_accuracy": 0.9953216165304184, "num_tokens": 2271683736.0, "step": 5345 }, { "entropy": 0.017990675754845142, "epoch": 2.339643287011708, "grad_norm": 7.90625, "learning_rate": 1.6459122902003247e-05, "loss": 0.1003, "loss_lm": 0.015064841601997614, "loss_seg": 0.08519637119024992, "mean_token_accuracy": 0.9952420443296432, "num_tokens": 2272108265.0, "step": 5346 }, { "entropy": 0.01839950494468212, "epoch": 2.340080971659919, "grad_norm": 6.59375, "learning_rate": 1.645641580942068e-05, "loss": 0.1347, "loss_lm": 0.016514785820618272, "loss_seg": 0.11816423200070858, "mean_token_accuracy": 0.9953839033842087, "num_tokens": 2272532816.0, "step": 5347 }, { "entropy": 0.018517695367336273, "epoch": 2.34051865630813, "grad_norm": 5.0, "learning_rate": 1.6453708716838115e-05, "loss": 0.0926, "loss_lm": 0.015672088833525777, "loss_seg": 0.07691819593310356, "mean_token_accuracy": 0.9951917827129364, "num_tokens": 2272958498.0, "step": 5348 }, { "entropy": 0.018243050202727318, "epoch": 2.3409563409563408, "grad_norm": 11.875, "learning_rate": 1.6451001624255552e-05, "loss": 0.0996, "loss_lm": 0.01659718807786703, "loss_seg": 0.08296544291079044, "mean_token_accuracy": 0.9951370060443878, "num_tokens": 2273384351.0, "step": 5349 }, { "entropy": 0.01800034660845995, "epoch": 2.341394025604552, "grad_norm": 9.6875, "learning_rate": 1.6448294531672986e-05, "loss": 0.104, "loss_lm": 0.014120341744273901, "loss_seg": 0.08988349884748459, "mean_token_accuracy": 0.9952937960624695, "num_tokens": 2273809934.0, "step": 5350 }, { "entropy": 0.018712691962718964, "epoch": 2.341831710252763, "grad_norm": 6.9375, "learning_rate": 1.6445587439090416e-05, "loss": 0.1315, "loss_lm": 0.019953339360654354, "loss_seg": 0.11150245927274227, "mean_token_accuracy": 0.9951620846986771, "num_tokens": 2274234680.0, "step": 5351 }, { "entropy": 0.017867141403257847, "epoch": 2.342269394900974, "grad_norm": 7.9375, "learning_rate": 1.644288034650785e-05, "loss": 0.0846, "loss_lm": 0.017436038004234433, "loss_seg": 0.06714615505188704, "mean_token_accuracy": 0.9953007400035858, "num_tokens": 2274659839.0, "step": 5352 }, { "entropy": 0.018067081924527884, "epoch": 2.342707079549185, "grad_norm": 4.5, "learning_rate": 1.6440173253925284e-05, "loss": 0.0972, "loss_lm": 0.01650717668235302, "loss_seg": 0.08066646382212639, "mean_token_accuracy": 0.9954116493463516, "num_tokens": 2275084787.0, "step": 5353 }, { "entropy": 0.018783853389322758, "epoch": 2.3431447641973957, "grad_norm": 6.15625, "learning_rate": 1.643746616134272e-05, "loss": 0.1135, "loss_lm": 0.017491704784333706, "loss_seg": 0.09598718583583832, "mean_token_accuracy": 0.9951285570859909, "num_tokens": 2275510273.0, "step": 5354 }, { "entropy": 0.018443094566464424, "epoch": 2.3435824488456065, "grad_norm": 5.65625, "learning_rate": 1.6434759068760154e-05, "loss": 0.0718, "loss_lm": 0.014624872012063861, "loss_seg": 0.05721125937998295, "mean_token_accuracy": 0.9951324015855789, "num_tokens": 2275935935.0, "step": 5355 }, { "entropy": 0.01812413241714239, "epoch": 2.3440201334938178, "grad_norm": 6.40625, "learning_rate": 1.6432051976177585e-05, "loss": 0.1075, "loss_lm": 0.017094530165195465, "loss_seg": 0.09036617632955313, "mean_token_accuracy": 0.9951328635215759, "num_tokens": 2276361407.0, "step": 5356 }, { "entropy": 0.018199796322733164, "epoch": 2.3444578181420286, "grad_norm": 5.3125, "learning_rate": 1.642934488359502e-05, "loss": 0.1193, "loss_lm": 0.01707933214493096, "loss_seg": 0.1021788902580738, "mean_token_accuracy": 0.9954564869403839, "num_tokens": 2276786363.0, "step": 5357 }, { "entropy": 0.01837199181318283, "epoch": 2.34489550279024, "grad_norm": 3.828125, "learning_rate": 1.6426637791012452e-05, "loss": 0.1216, "loss_lm": 0.017051249975338578, "loss_seg": 0.10453231446444988, "mean_token_accuracy": 0.9951329231262207, "num_tokens": 2277211020.0, "step": 5358 }, { "entropy": 0.01808230299502611, "epoch": 2.3453331874384507, "grad_norm": 4.09375, "learning_rate": 1.6423930698429886e-05, "loss": 0.1433, "loss_lm": 0.016561827855184674, "loss_seg": 0.12671093456447124, "mean_token_accuracy": 0.9952477514743805, "num_tokens": 2277636439.0, "step": 5359 }, { "entropy": 0.017804754432290792, "epoch": 2.3457708720866615, "grad_norm": 18.75, "learning_rate": 1.6421223605847323e-05, "loss": 0.0934, "loss_lm": 0.015312443720176816, "loss_seg": 0.0780512597411871, "mean_token_accuracy": 0.995391234755516, "num_tokens": 2278061629.0, "step": 5360 }, { "entropy": 0.018163471948355436, "epoch": 2.3462085567348723, "grad_norm": 10.4375, "learning_rate": 1.6418516513264753e-05, "loss": 0.1046, "loss_lm": 0.014641410205513239, "loss_seg": 0.08995999209582806, "mean_token_accuracy": 0.9951381832361221, "num_tokens": 2278486337.0, "step": 5361 }, { "entropy": 0.0185730317607522, "epoch": 2.3466462413830835, "grad_norm": 11.125, "learning_rate": 1.6415809420682187e-05, "loss": 0.1039, "loss_lm": 0.01592074055224657, "loss_seg": 0.08795482479035854, "mean_token_accuracy": 0.9951788038015366, "num_tokens": 2278911718.0, "step": 5362 }, { "entropy": 0.01793932029977441, "epoch": 2.3470839260312943, "grad_norm": 8.625, "learning_rate": 1.641310232809962e-05, "loss": 0.12, "loss_lm": 0.01589366397820413, "loss_seg": 0.10414103046059608, "mean_token_accuracy": 0.9953831136226654, "num_tokens": 2279336403.0, "step": 5363 }, { "entropy": 0.01812354475259781, "epoch": 2.3475216106795056, "grad_norm": 7.625, "learning_rate": 1.6410395235517055e-05, "loss": 0.0679, "loss_lm": 0.012471528723835945, "loss_seg": 0.05540974531322718, "mean_token_accuracy": 0.9951546937227249, "num_tokens": 2279761311.0, "step": 5364 }, { "entropy": 0.01759549230337143, "epoch": 2.3479592953277164, "grad_norm": 4.90625, "learning_rate": 1.6407688142934488e-05, "loss": 0.0972, "loss_lm": 0.015134041896089911, "loss_seg": 0.08210802357643843, "mean_token_accuracy": 0.9954575747251511, "num_tokens": 2280186132.0, "step": 5365 }, { "entropy": 0.01855565281584859, "epoch": 2.3483969799759272, "grad_norm": 7.84375, "learning_rate": 1.6404981050351922e-05, "loss": 0.0928, "loss_lm": 0.016928261378780007, "loss_seg": 0.07588337361812592, "mean_token_accuracy": 0.9951739311218262, "num_tokens": 2280611511.0, "step": 5366 }, { "entropy": 0.01831264840438962, "epoch": 2.3488346646241385, "grad_norm": 23.5, "learning_rate": 1.6402273957769356e-05, "loss": 0.1264, "loss_lm": 0.017703776946291327, "loss_seg": 0.10870949551463127, "mean_token_accuracy": 0.9951469600200653, "num_tokens": 2281036579.0, "step": 5367 }, { "entropy": 0.018574360758066177, "epoch": 2.3492723492723493, "grad_norm": 6.03125, "learning_rate": 1.639956686518679e-05, "loss": 0.1028, "loss_lm": 0.016368656186386943, "loss_seg": 0.08647113479673862, "mean_token_accuracy": 0.9952694326639175, "num_tokens": 2281462684.0, "step": 5368 }, { "entropy": 0.01785636367276311, "epoch": 2.34971003392056, "grad_norm": 4.4375, "learning_rate": 1.6396859772604223e-05, "loss": 0.1194, "loss_lm": 0.016522471327334642, "loss_seg": 0.10290220100432634, "mean_token_accuracy": 0.9953132271766663, "num_tokens": 2281887244.0, "step": 5369 }, { "entropy": 0.01836589677259326, "epoch": 2.3501477185687714, "grad_norm": 6.25, "learning_rate": 1.6394152680021657e-05, "loss": 0.1657, "loss_lm": 0.01675343490205705, "loss_seg": 0.14897367544472218, "mean_token_accuracy": 0.9951637834310532, "num_tokens": 2282312765.0, "step": 5370 }, { "entropy": 0.01795667689293623, "epoch": 2.350585403216982, "grad_norm": 7.625, "learning_rate": 1.639144558743909e-05, "loss": 0.1112, "loss_lm": 0.01689781923778355, "loss_seg": 0.09434927441179752, "mean_token_accuracy": 0.9953092336654663, "num_tokens": 2282738255.0, "step": 5371 }, { "entropy": 0.017841773107647896, "epoch": 2.351023087865193, "grad_norm": 15.875, "learning_rate": 1.6388738494856524e-05, "loss": 0.1752, "loss_lm": 0.016124112298712134, "loss_seg": 0.1590639166533947, "mean_token_accuracy": 0.9953170120716095, "num_tokens": 2283163584.0, "step": 5372 }, { "entropy": 0.018311948515474796, "epoch": 2.3514607725134042, "grad_norm": 3.703125, "learning_rate": 1.6386031402273958e-05, "loss": 0.1273, "loss_lm": 0.01605634461157024, "loss_seg": 0.11125190183520317, "mean_token_accuracy": 0.995218962430954, "num_tokens": 2283588357.0, "step": 5373 }, { "entropy": 0.018336397595703602, "epoch": 2.351898457161615, "grad_norm": 4.15625, "learning_rate": 1.6383324309691392e-05, "loss": 0.0785, "loss_lm": 0.017365309176966548, "loss_seg": 0.06113045755773783, "mean_token_accuracy": 0.9951674938201904, "num_tokens": 2284013350.0, "step": 5374 }, { "entropy": 0.018507414497435093, "epoch": 2.352336141809826, "grad_norm": 10.25, "learning_rate": 1.6380617217108826e-05, "loss": 0.0832, "loss_lm": 0.016265756683424115, "loss_seg": 0.06688885763287544, "mean_token_accuracy": 0.9952728450298309, "num_tokens": 2284438488.0, "step": 5375 }, { "entropy": 0.018261864315718412, "epoch": 2.352773826458037, "grad_norm": 4.3125, "learning_rate": 1.637791012452626e-05, "loss": 0.1291, "loss_lm": 0.01707380567677319, "loss_seg": 0.11198541335761547, "mean_token_accuracy": 0.9952059835195541, "num_tokens": 2284863344.0, "step": 5376 }, { "entropy": 0.01870469329878688, "epoch": 2.353211511106248, "grad_norm": 20.5, "learning_rate": 1.6375203031943693e-05, "loss": 0.1075, "loss_lm": 0.016363475704565644, "loss_seg": 0.09113355167210102, "mean_token_accuracy": 0.995069682598114, "num_tokens": 2285288437.0, "step": 5377 }, { "entropy": 0.018349400721490383, "epoch": 2.3536491957544587, "grad_norm": 3.5625, "learning_rate": 1.6372495939361127e-05, "loss": 0.1237, "loss_lm": 0.016736388439312577, "loss_seg": 0.10695548914372921, "mean_token_accuracy": 0.9951522201299667, "num_tokens": 2285714163.0, "step": 5378 }, { "entropy": 0.01802094839513302, "epoch": 2.35408688040267, "grad_norm": 16.0, "learning_rate": 1.636978884677856e-05, "loss": 0.0965, "loss_lm": 0.016315326560288668, "loss_seg": 0.08017803356051445, "mean_token_accuracy": 0.9952501058578491, "num_tokens": 2286140047.0, "step": 5379 }, { "entropy": 0.01853599026799202, "epoch": 2.354524565050881, "grad_norm": 2.65625, "learning_rate": 1.6367081754195994e-05, "loss": 0.1122, "loss_lm": 0.01628951937891543, "loss_seg": 0.09596033580601215, "mean_token_accuracy": 0.9952698647975922, "num_tokens": 2286564522.0, "step": 5380 }, { "entropy": 0.017526375129818916, "epoch": 2.3549622496990916, "grad_norm": 10.25, "learning_rate": 1.6364374661613428e-05, "loss": 0.0994, "loss_lm": 0.015689410269260406, "loss_seg": 0.08374739065766335, "mean_token_accuracy": 0.9953728318214417, "num_tokens": 2286989980.0, "step": 5381 }, { "entropy": 0.017132122069597244, "epoch": 2.355399934347303, "grad_norm": 4.25, "learning_rate": 1.636166756903086e-05, "loss": 0.0966, "loss_lm": 0.01597015280276537, "loss_seg": 0.0806031497195363, "mean_token_accuracy": 0.9955980777740479, "num_tokens": 2287414043.0, "step": 5382 }, { "entropy": 0.017996505368500948, "epoch": 2.3558376189955137, "grad_norm": 7.03125, "learning_rate": 1.6358960476448295e-05, "loss": 0.1311, "loss_lm": 0.016910268226638436, "loss_seg": 0.11419067904353142, "mean_token_accuracy": 0.9952939599752426, "num_tokens": 2287838780.0, "step": 5383 }, { "entropy": 0.0187099976465106, "epoch": 2.3562753036437245, "grad_norm": 3.34375, "learning_rate": 1.635625338386573e-05, "loss": 0.0886, "loss_lm": 0.016306895297020674, "loss_seg": 0.0723335687071085, "mean_token_accuracy": 0.9950757771730423, "num_tokens": 2288263794.0, "step": 5384 }, { "entropy": 0.01804411504417658, "epoch": 2.3567129882919358, "grad_norm": 18.0, "learning_rate": 1.6353546291283163e-05, "loss": 0.1021, "loss_lm": 0.016265074023976922, "loss_seg": 0.08580605313181877, "mean_token_accuracy": 0.9952915757894516, "num_tokens": 2288689150.0, "step": 5385 }, { "entropy": 0.018470159731805325, "epoch": 2.3571506729401466, "grad_norm": 8.0625, "learning_rate": 1.6350839198700597e-05, "loss": 0.1025, "loss_lm": 0.016495637828484178, "loss_seg": 0.08603025320917368, "mean_token_accuracy": 0.9951449483633041, "num_tokens": 2289114021.0, "step": 5386 }, { "entropy": 0.018276169430464506, "epoch": 2.357588357588358, "grad_norm": 7.71875, "learning_rate": 1.634813210611803e-05, "loss": 0.1048, "loss_lm": 0.017237861175090075, "loss_seg": 0.08760532457381487, "mean_token_accuracy": 0.9952504634857178, "num_tokens": 2289539075.0, "step": 5387 }, { "entropy": 0.018235168419778347, "epoch": 2.3580260422365686, "grad_norm": 6.4375, "learning_rate": 1.6345425013535464e-05, "loss": 0.0711, "loss_lm": 0.016297476133331656, "loss_seg": 0.05484145972877741, "mean_token_accuracy": 0.9950829595327377, "num_tokens": 2289964297.0, "step": 5388 }, { "entropy": 0.01744614541530609, "epoch": 2.3584637268847795, "grad_norm": 7.65625, "learning_rate": 1.6342717920952894e-05, "loss": 0.1138, "loss_lm": 0.015542705077677965, "loss_seg": 0.09826196171343327, "mean_token_accuracy": 0.9953764230012894, "num_tokens": 2290389661.0, "step": 5389 }, { "entropy": 0.01789388619363308, "epoch": 2.3589014115329903, "grad_norm": 4.25, "learning_rate": 1.634001082837033e-05, "loss": 0.0928, "loss_lm": 0.016785490792244673, "loss_seg": 0.07600942067801952, "mean_token_accuracy": 0.9952172636985779, "num_tokens": 2290814076.0, "step": 5390 }, { "entropy": 0.017747837118804455, "epoch": 2.3593390961812015, "grad_norm": 9.75, "learning_rate": 1.6337303735787765e-05, "loss": 0.1487, "loss_lm": 0.01488314475864172, "loss_seg": 0.13378416001796722, "mean_token_accuracy": 0.995331421494484, "num_tokens": 2291239407.0, "step": 5391 }, { "entropy": 0.017745954915881157, "epoch": 2.3597767808294123, "grad_norm": 6.8125, "learning_rate": 1.63345966432052e-05, "loss": 0.0809, "loss_lm": 0.013178459368646145, "loss_seg": 0.06773552112281322, "mean_token_accuracy": 0.9952812194824219, "num_tokens": 2291663379.0, "step": 5392 }, { "entropy": 0.017678542528301477, "epoch": 2.3602144654776236, "grad_norm": 12.25, "learning_rate": 1.6331889550622633e-05, "loss": 0.0862, "loss_lm": 0.01411097845993936, "loss_seg": 0.07212942279875278, "mean_token_accuracy": 0.9952371269464493, "num_tokens": 2292089106.0, "step": 5393 }, { "entropy": 0.018565081991255283, "epoch": 2.3606521501258344, "grad_norm": 14.5, "learning_rate": 1.6329182458040063e-05, "loss": 0.0929, "loss_lm": 0.015051841270178556, "loss_seg": 0.0778920454904437, "mean_token_accuracy": 0.995172768831253, "num_tokens": 2292514356.0, "step": 5394 }, { "entropy": 0.017660682555288076, "epoch": 2.361089834774045, "grad_norm": 6.625, "learning_rate": 1.6326475365457497e-05, "loss": 0.0959, "loss_lm": 0.019126569153741002, "loss_seg": 0.07678048126399517, "mean_token_accuracy": 0.9952843636274338, "num_tokens": 2292938859.0, "step": 5395 }, { "entropy": 0.017885882407426834, "epoch": 2.361527519422256, "grad_norm": 6.0625, "learning_rate": 1.6323768272874934e-05, "loss": 0.1244, "loss_lm": 0.01620081369765103, "loss_seg": 0.10814947262406349, "mean_token_accuracy": 0.9952356964349747, "num_tokens": 2293363871.0, "step": 5396 }, { "entropy": 0.018719924613833427, "epoch": 2.3619652040704673, "grad_norm": 8.375, "learning_rate": 1.6321061180292368e-05, "loss": 0.1229, "loss_lm": 0.018782551866024733, "loss_seg": 0.10409257374703884, "mean_token_accuracy": 0.9950603395700455, "num_tokens": 2293789172.0, "step": 5397 }, { "entropy": 0.01846058527007699, "epoch": 2.362402888718678, "grad_norm": 3.5625, "learning_rate": 1.63183540877098e-05, "loss": 0.1052, "loss_lm": 0.017895660595968366, "loss_seg": 0.08734595775604248, "mean_token_accuracy": 0.9950945526361465, "num_tokens": 2294214387.0, "step": 5398 }, { "entropy": 0.018386640585958958, "epoch": 2.3628405733668894, "grad_norm": 8.5, "learning_rate": 1.631564699512723e-05, "loss": 0.1041, "loss_lm": 0.014154279138892889, "loss_seg": 0.0899879727512598, "mean_token_accuracy": 0.9952060729265213, "num_tokens": 2294639065.0, "step": 5399 }, { "entropy": 0.018092963378876448, "epoch": 2.3632782580151, "grad_norm": 5.8125, "learning_rate": 1.6312939902544665e-05, "loss": 0.0795, "loss_lm": 0.015239037573337555, "loss_seg": 0.06426942721009254, "mean_token_accuracy": 0.9952877014875412, "num_tokens": 2295064296.0, "step": 5400 }, { "entropy": 0.019038293976336718, "epoch": 2.363715942663311, "grad_norm": 23.75, "learning_rate": 1.6310232809962102e-05, "loss": 0.0826, "loss_lm": 0.018245512386783957, "loss_seg": 0.06433993764221668, "mean_token_accuracy": 0.9949821382761002, "num_tokens": 2295489515.0, "step": 5401 }, { "entropy": 0.018004252575337887, "epoch": 2.3641536273115222, "grad_norm": 19.5, "learning_rate": 1.6307525717379536e-05, "loss": 0.1042, "loss_lm": 0.016917879227548838, "loss_seg": 0.08731481805443764, "mean_token_accuracy": 0.9952315241098404, "num_tokens": 2295914758.0, "step": 5402 }, { "entropy": 0.017437591683119535, "epoch": 2.364591311959733, "grad_norm": 9.0, "learning_rate": 1.630481862479697e-05, "loss": 0.1235, "loss_lm": 0.016440383391454816, "loss_seg": 0.1070186011493206, "mean_token_accuracy": 0.9953227043151855, "num_tokens": 2296340174.0, "step": 5403 }, { "entropy": 0.0180387319996953, "epoch": 2.365028996607944, "grad_norm": 5.9375, "learning_rate": 1.63021115322144e-05, "loss": 0.1051, "loss_lm": 0.014790729619562626, "loss_seg": 0.09030408039689064, "mean_token_accuracy": 0.9952143877744675, "num_tokens": 2296765566.0, "step": 5404 }, { "entropy": 0.017519995104521513, "epoch": 2.365466681256155, "grad_norm": 3.8125, "learning_rate": 1.6299404439631834e-05, "loss": 0.1049, "loss_lm": 0.014280007220804691, "loss_seg": 0.0906206239014864, "mean_token_accuracy": 0.9954423904418945, "num_tokens": 2297190291.0, "step": 5405 }, { "entropy": 0.01804753951728344, "epoch": 2.365904365904366, "grad_norm": 8.9375, "learning_rate": 1.629669734704927e-05, "loss": 0.0833, "loss_lm": 0.015264767222106457, "loss_seg": 0.06802080664783716, "mean_token_accuracy": 0.9951706677675247, "num_tokens": 2297615362.0, "step": 5406 }, { "entropy": 0.017955746967345476, "epoch": 2.3663420505525767, "grad_norm": 8.625, "learning_rate": 1.6293990254466705e-05, "loss": 0.131, "loss_lm": 0.017478021793067455, "loss_seg": 0.11352709122002125, "mean_token_accuracy": 0.9952159821987152, "num_tokens": 2298040291.0, "step": 5407 }, { "entropy": 0.018752824515104294, "epoch": 2.366779735200788, "grad_norm": 4.96875, "learning_rate": 1.629128316188414e-05, "loss": 0.1084, "loss_lm": 0.016228258842602372, "loss_seg": 0.09213382005691528, "mean_token_accuracy": 0.9952163994312286, "num_tokens": 2298465876.0, "step": 5408 }, { "entropy": 0.018220868427306414, "epoch": 2.367217419848999, "grad_norm": 9.5, "learning_rate": 1.628857606930157e-05, "loss": 0.1234, "loss_lm": 0.015805550618097186, "loss_seg": 0.1075739674270153, "mean_token_accuracy": 0.9952450096607208, "num_tokens": 2298891302.0, "step": 5409 }, { "entropy": 0.01800013007596135, "epoch": 2.3676551044972096, "grad_norm": 11.4375, "learning_rate": 1.6285868976719003e-05, "loss": 0.0908, "loss_lm": 0.01495043165050447, "loss_seg": 0.07580104004591703, "mean_token_accuracy": 0.995295524597168, "num_tokens": 2299315721.0, "step": 5410 }, { "entropy": 0.018661142326891422, "epoch": 2.368092789145421, "grad_norm": 4.40625, "learning_rate": 1.628316188413644e-05, "loss": 0.0786, "loss_lm": 0.014574304223060608, "loss_seg": 0.06407161243259907, "mean_token_accuracy": 0.9951512664556503, "num_tokens": 2299740720.0, "step": 5411 }, { "entropy": 0.018641831818968058, "epoch": 2.3685304737936317, "grad_norm": 6.46875, "learning_rate": 1.6280454791553873e-05, "loss": 0.1512, "loss_lm": 0.01703323796391487, "loss_seg": 0.13417497090995312, "mean_token_accuracy": 0.9951239824295044, "num_tokens": 2300165402.0, "step": 5412 }, { "entropy": 0.01782255107536912, "epoch": 2.3689681584418425, "grad_norm": 4.96875, "learning_rate": 1.6277747698971304e-05, "loss": 0.1139, "loss_lm": 0.015504293609410524, "loss_seg": 0.09841272048652172, "mean_token_accuracy": 0.9952785819768906, "num_tokens": 2300590646.0, "step": 5413 }, { "entropy": 0.01850981032475829, "epoch": 2.3694058430900538, "grad_norm": 3.296875, "learning_rate": 1.6275040606388737e-05, "loss": 0.1215, "loss_lm": 0.01744730118662119, "loss_seg": 0.10401056334376335, "mean_token_accuracy": 0.9951782077550888, "num_tokens": 2301015568.0, "step": 5414 }, { "entropy": 0.017822470050305128, "epoch": 2.3698435277382646, "grad_norm": 7.15625, "learning_rate": 1.627233351380617e-05, "loss": 0.105, "loss_lm": 0.014214172726497054, "loss_seg": 0.09077445510774851, "mean_token_accuracy": 0.9952662289142609, "num_tokens": 2301441378.0, "step": 5415 }, { "entropy": 0.018005032557994127, "epoch": 2.3702812123864754, "grad_norm": 26.5, "learning_rate": 1.6269626421223608e-05, "loss": 0.0857, "loss_lm": 0.014306107768788934, "loss_seg": 0.07140940241515636, "mean_token_accuracy": 0.9952554851770401, "num_tokens": 2301866470.0, "step": 5416 }, { "entropy": 0.018155106343328953, "epoch": 2.3707188970346866, "grad_norm": 6.4375, "learning_rate": 1.6266919328641042e-05, "loss": 0.1184, "loss_lm": 0.016182982828468084, "loss_seg": 0.10219144634902477, "mean_token_accuracy": 0.9952855259180069, "num_tokens": 2302291288.0, "step": 5417 }, { "entropy": 0.01752258976921439, "epoch": 2.3711565816828974, "grad_norm": 3.734375, "learning_rate": 1.6264212236058472e-05, "loss": 0.0945, "loss_lm": 0.01448904094286263, "loss_seg": 0.08001821208745241, "mean_token_accuracy": 0.9954095780849457, "num_tokens": 2302715811.0, "step": 5418 }, { "entropy": 0.01795513415709138, "epoch": 2.3715942663311083, "grad_norm": 6.1875, "learning_rate": 1.6261505143475906e-05, "loss": 0.1291, "loss_lm": 0.016394463600590825, "loss_seg": 0.11269675474613905, "mean_token_accuracy": 0.9952571988105774, "num_tokens": 2303140845.0, "step": 5419 }, { "entropy": 0.018318748101592064, "epoch": 2.3720319509793195, "grad_norm": 4.09375, "learning_rate": 1.625879805089334e-05, "loss": 0.0834, "loss_lm": 0.013437246670946479, "loss_seg": 0.0699704559519887, "mean_token_accuracy": 0.9952469915151596, "num_tokens": 2303565579.0, "step": 5420 }, { "entropy": 0.01820978056639433, "epoch": 2.3724696356275303, "grad_norm": 5.375, "learning_rate": 1.6256090958310777e-05, "loss": 0.1216, "loss_lm": 0.016344554256647825, "loss_seg": 0.10529525950551033, "mean_token_accuracy": 0.9951482117176056, "num_tokens": 2303991374.0, "step": 5421 }, { "entropy": 0.01825888967141509, "epoch": 2.372907320275741, "grad_norm": 4.09375, "learning_rate": 1.625338386572821e-05, "loss": 0.0991, "loss_lm": 0.014930509496480227, "loss_seg": 0.0841706832870841, "mean_token_accuracy": 0.9952492564916611, "num_tokens": 2304415475.0, "step": 5422 }, { "entropy": 0.018148087430745363, "epoch": 2.3733450049239524, "grad_norm": 34.0, "learning_rate": 1.625067677314564e-05, "loss": 0.1079, "loss_lm": 0.014847117941826582, "loss_seg": 0.0930829718708992, "mean_token_accuracy": 0.9952484667301178, "num_tokens": 2304840563.0, "step": 5423 }, { "entropy": 0.017897417303174734, "epoch": 2.373782689572163, "grad_norm": 5.0, "learning_rate": 1.6247969680563075e-05, "loss": 0.1113, "loss_lm": 0.015118991024792194, "loss_seg": 0.0961560383439064, "mean_token_accuracy": 0.9953433722257614, "num_tokens": 2305266122.0, "step": 5424 }, { "entropy": 0.018877796828746796, "epoch": 2.374220374220374, "grad_norm": 12.5, "learning_rate": 1.624526258798051e-05, "loss": 0.1065, "loss_lm": 0.017695806920528412, "loss_seg": 0.08882264047861099, "mean_token_accuracy": 0.9951090216636658, "num_tokens": 2305690890.0, "step": 5425 }, { "entropy": 0.01774475071579218, "epoch": 2.3746580588685853, "grad_norm": 4.0625, "learning_rate": 1.6242555495397942e-05, "loss": 0.0853, "loss_lm": 0.01631343853659928, "loss_seg": 0.06894300691783428, "mean_token_accuracy": 0.9954034984111786, "num_tokens": 2306115310.0, "step": 5426 }, { "entropy": 0.018159848172217607, "epoch": 2.375095743516796, "grad_norm": 4.03125, "learning_rate": 1.623984840281538e-05, "loss": 0.1094, "loss_lm": 0.0169726291205734, "loss_seg": 0.0923888348042965, "mean_token_accuracy": 0.9951947033405304, "num_tokens": 2306540819.0, "step": 5427 }, { "entropy": 0.017996144015341997, "epoch": 2.3755334281650073, "grad_norm": 3.921875, "learning_rate": 1.623714131023281e-05, "loss": 0.0907, "loss_lm": 0.016579966293647885, "loss_seg": 0.07413932960480452, "mean_token_accuracy": 0.9952843636274338, "num_tokens": 2306966598.0, "step": 5428 }, { "entropy": 0.018668577540665865, "epoch": 2.375971112813218, "grad_norm": 7.9375, "learning_rate": 1.6234434217650243e-05, "loss": 0.0905, "loss_lm": 0.01627246104180813, "loss_seg": 0.07427437230944633, "mean_token_accuracy": 0.9951661229133606, "num_tokens": 2307391271.0, "step": 5429 }, { "entropy": 0.018119007349014282, "epoch": 2.376408797461429, "grad_norm": 3.046875, "learning_rate": 1.6231727125067677e-05, "loss": 0.1124, "loss_lm": 0.015450594248250127, "loss_seg": 0.09691831190139055, "mean_token_accuracy": 0.9952862560749054, "num_tokens": 2307816367.0, "step": 5430 }, { "entropy": 0.017709219828248024, "epoch": 2.37684648210964, "grad_norm": 16.0, "learning_rate": 1.622902003248511e-05, "loss": 0.0907, "loss_lm": 0.015963802579790354, "loss_seg": 0.07477305456995964, "mean_token_accuracy": 0.995371550321579, "num_tokens": 2308241534.0, "step": 5431 }, { "entropy": 0.018749937415122986, "epoch": 2.377284166757851, "grad_norm": 3.265625, "learning_rate": 1.6226312939902548e-05, "loss": 0.0905, "loss_lm": 0.014185441192239523, "loss_seg": 0.07631148304790258, "mean_token_accuracy": 0.9951530545949936, "num_tokens": 2308666914.0, "step": 5432 }, { "entropy": 0.01776885474100709, "epoch": 2.377721851406062, "grad_norm": 7.03125, "learning_rate": 1.6223605847319978e-05, "loss": 0.0936, "loss_lm": 0.014684977475553751, "loss_seg": 0.07890060916543007, "mean_token_accuracy": 0.9953209161758423, "num_tokens": 2309092154.0, "step": 5433 }, { "entropy": 0.018234493676573038, "epoch": 2.378159536054273, "grad_norm": 4.21875, "learning_rate": 1.6220898754737412e-05, "loss": 0.1099, "loss_lm": 0.016645558876916766, "loss_seg": 0.0932353651151061, "mean_token_accuracy": 0.9951991736888885, "num_tokens": 2309516830.0, "step": 5434 }, { "entropy": 0.018131414894014597, "epoch": 2.378597220702484, "grad_norm": 7.40625, "learning_rate": 1.6218191662154846e-05, "loss": 0.1025, "loss_lm": 0.016821031691506505, "loss_seg": 0.0856391154229641, "mean_token_accuracy": 0.9952566921710968, "num_tokens": 2309941785.0, "step": 5435 }, { "entropy": 0.01851751748472452, "epoch": 2.3790349053506947, "grad_norm": 11.0, "learning_rate": 1.621548456957228e-05, "loss": 0.1003, "loss_lm": 0.016566279577091336, "loss_seg": 0.0837600938975811, "mean_token_accuracy": 0.9952135384082794, "num_tokens": 2310367136.0, "step": 5436 }, { "entropy": 0.018089063465595245, "epoch": 2.3794725899989055, "grad_norm": 16.75, "learning_rate": 1.6212777476989713e-05, "loss": 0.1055, "loss_lm": 0.018606079276651144, "loss_seg": 0.08686173893511295, "mean_token_accuracy": 0.9952928125858307, "num_tokens": 2310792154.0, "step": 5437 }, { "entropy": 0.017722853925079107, "epoch": 2.379910274647117, "grad_norm": 10.8125, "learning_rate": 1.6210070384407147e-05, "loss": 0.1287, "loss_lm": 0.01636698329821229, "loss_seg": 0.11229819059371948, "mean_token_accuracy": 0.995302751660347, "num_tokens": 2311216618.0, "step": 5438 }, { "entropy": 0.018474956043064594, "epoch": 2.3803479592953276, "grad_norm": 3.421875, "learning_rate": 1.620736329182458e-05, "loss": 0.1035, "loss_lm": 0.014873526524752378, "loss_seg": 0.08858798258006573, "mean_token_accuracy": 0.9952110350131989, "num_tokens": 2311641937.0, "step": 5439 }, { "entropy": 0.01860427623614669, "epoch": 2.380785643943539, "grad_norm": 5.8125, "learning_rate": 1.6204656199242014e-05, "loss": 0.1198, "loss_lm": 0.016616160748526454, "loss_seg": 0.10315441433340311, "mean_token_accuracy": 0.9949508905410767, "num_tokens": 2312067132.0, "step": 5440 }, { "entropy": 0.01870009070262313, "epoch": 2.3812233285917497, "grad_norm": 3.234375, "learning_rate": 1.6201949106659448e-05, "loss": 0.0873, "loss_lm": 0.016130455769598484, "loss_seg": 0.07113102823495865, "mean_token_accuracy": 0.9952156692743301, "num_tokens": 2312492227.0, "step": 5441 }, { "entropy": 0.017606398090720177, "epoch": 2.3816610132399605, "grad_norm": 7.8125, "learning_rate": 1.6199242014076882e-05, "loss": 0.0999, "loss_lm": 0.01824723300524056, "loss_seg": 0.08170265518128872, "mean_token_accuracy": 0.9953297972679138, "num_tokens": 2312916994.0, "step": 5442 }, { "entropy": 0.01877726474776864, "epoch": 2.3820986978881717, "grad_norm": 7.71875, "learning_rate": 1.6196534921494316e-05, "loss": 0.1038, "loss_lm": 0.01990999490953982, "loss_seg": 0.08391711488366127, "mean_token_accuracy": 0.9950820356607437, "num_tokens": 2313342121.0, "step": 5443 }, { "entropy": 0.017914652824401855, "epoch": 2.3825363825363826, "grad_norm": 27.0, "learning_rate": 1.619382782891175e-05, "loss": 0.0954, "loss_lm": 0.014864819590002298, "loss_seg": 0.08053324371576309, "mean_token_accuracy": 0.9953655749559402, "num_tokens": 2313766289.0, "step": 5444 }, { "entropy": 0.01858387654647231, "epoch": 2.3829740671845934, "grad_norm": 4.09375, "learning_rate": 1.6191120736329183e-05, "loss": 0.1035, "loss_lm": 0.01795958448201418, "loss_seg": 0.0855804868042469, "mean_token_accuracy": 0.9951384216547012, "num_tokens": 2314191627.0, "step": 5445 }, { "entropy": 0.018382688984274864, "epoch": 2.3834117518328046, "grad_norm": 13.625, "learning_rate": 1.6188413643746617e-05, "loss": 0.1155, "loss_lm": 0.015013426775112748, "loss_seg": 0.1005202904343605, "mean_token_accuracy": 0.9951578080654144, "num_tokens": 2314616598.0, "step": 5446 }, { "entropy": 0.01864486699923873, "epoch": 2.3838494364810154, "grad_norm": 5.5, "learning_rate": 1.618570655116405e-05, "loss": 0.1012, "loss_lm": 0.0174108250066638, "loss_seg": 0.0837703999131918, "mean_token_accuracy": 0.9951235204935074, "num_tokens": 2315042079.0, "step": 5447 }, { "entropy": 0.018067949451506138, "epoch": 2.3842871211292263, "grad_norm": 4.6875, "learning_rate": 1.6182999458581484e-05, "loss": 0.1025, "loss_lm": 0.014459585072472692, "loss_seg": 0.08808640204370022, "mean_token_accuracy": 0.9953048676252365, "num_tokens": 2315467156.0, "step": 5448 }, { "entropy": 0.018695321399718523, "epoch": 2.3847248057774375, "grad_norm": 18.875, "learning_rate": 1.6180292365998918e-05, "loss": 0.1329, "loss_lm": 0.017676599323749542, "loss_seg": 0.11520410142838955, "mean_token_accuracy": 0.9950238615274429, "num_tokens": 2315892759.0, "step": 5449 }, { "entropy": 0.01834062533453107, "epoch": 2.3851624904256483, "grad_norm": 3.28125, "learning_rate": 1.617758527341635e-05, "loss": 0.0807, "loss_lm": 0.016056043561547995, "loss_seg": 0.06462692189961672, "mean_token_accuracy": 0.9952650815248489, "num_tokens": 2316316799.0, "step": 5450 }, { "entropy": 0.018128630705177784, "epoch": 2.385600175073859, "grad_norm": 4.5, "learning_rate": 1.6174878180833785e-05, "loss": 0.1314, "loss_lm": 0.015779559733346105, "loss_seg": 0.11563632264733315, "mean_token_accuracy": 0.9952950030565262, "num_tokens": 2316741427.0, "step": 5451 }, { "entropy": 0.017842586617916822, "epoch": 2.3860378597220704, "grad_norm": 14.375, "learning_rate": 1.617217108825122e-05, "loss": 0.0968, "loss_lm": 0.017049548914656043, "loss_seg": 0.07972892746329308, "mean_token_accuracy": 0.9953607022762299, "num_tokens": 2317165804.0, "step": 5452 }, { "entropy": 0.01820985134691, "epoch": 2.386475544370281, "grad_norm": 13.8125, "learning_rate": 1.6169463995668653e-05, "loss": 0.1147, "loss_lm": 0.015241221059113741, "loss_seg": 0.09944408759474754, "mean_token_accuracy": 0.9953589141368866, "num_tokens": 2317590648.0, "step": 5453 }, { "entropy": 0.018279732670634985, "epoch": 2.386913229018492, "grad_norm": 5.75, "learning_rate": 1.6166756903086086e-05, "loss": 0.0887, "loss_lm": 0.014529703883454204, "loss_seg": 0.07417947798967361, "mean_token_accuracy": 0.9952105134725571, "num_tokens": 2318015486.0, "step": 5454 }, { "entropy": 0.017587929032742977, "epoch": 2.3873509136667033, "grad_norm": 44.25, "learning_rate": 1.616404981050352e-05, "loss": 0.0776, "loss_lm": 0.01516162813641131, "loss_seg": 0.06243575271219015, "mean_token_accuracy": 0.9953668862581253, "num_tokens": 2318440642.0, "step": 5455 }, { "entropy": 0.017813075333833694, "epoch": 2.387788598314914, "grad_norm": 5.65625, "learning_rate": 1.6161342717920954e-05, "loss": 0.0963, "loss_lm": 0.013753366889432073, "loss_seg": 0.0825289785861969, "mean_token_accuracy": 0.9954605400562286, "num_tokens": 2318865317.0, "step": 5456 }, { "entropy": 0.018388701137155294, "epoch": 2.388226282963125, "grad_norm": 8.375, "learning_rate": 1.6158635625338388e-05, "loss": 0.143, "loss_lm": 0.01777927577495575, "loss_seg": 0.1252342825755477, "mean_token_accuracy": 0.9951565712690353, "num_tokens": 2319290177.0, "step": 5457 }, { "entropy": 0.01867687376216054, "epoch": 2.388663967611336, "grad_norm": 4.5625, "learning_rate": 1.615592853275582e-05, "loss": 0.125, "loss_lm": 0.01711613521911204, "loss_seg": 0.10788923688232899, "mean_token_accuracy": 0.9951319247484207, "num_tokens": 2319715766.0, "step": 5458 }, { "entropy": 0.018579690717160702, "epoch": 2.389101652259547, "grad_norm": 7.9375, "learning_rate": 1.6153221440173255e-05, "loss": 0.0872, "loss_lm": 0.016158245038241148, "loss_seg": 0.07107576262205839, "mean_token_accuracy": 0.9949826300144196, "num_tokens": 2320141066.0, "step": 5459 }, { "entropy": 0.018397405743598938, "epoch": 2.3895393369077578, "grad_norm": 8.8125, "learning_rate": 1.615051434759069e-05, "loss": 0.1092, "loss_lm": 0.017033257987350225, "loss_seg": 0.09220174700021744, "mean_token_accuracy": 0.9951992183923721, "num_tokens": 2320566376.0, "step": 5460 }, { "entropy": 0.018416660372167826, "epoch": 2.389977021555969, "grad_norm": 5.53125, "learning_rate": 1.614780725500812e-05, "loss": 0.0908, "loss_lm": 0.016628111014142632, "loss_seg": 0.07413608115166426, "mean_token_accuracy": 0.9951472282409668, "num_tokens": 2320991631.0, "step": 5461 }, { "entropy": 0.019005324691534042, "epoch": 2.39041470620418, "grad_norm": 11.625, "learning_rate": 1.6145100162425553e-05, "loss": 0.1307, "loss_lm": 0.018363359849900007, "loss_seg": 0.11228935979306698, "mean_token_accuracy": 0.9950949996709824, "num_tokens": 2321416272.0, "step": 5462 }, { "entropy": 0.0184325841255486, "epoch": 2.390852390852391, "grad_norm": 9.1875, "learning_rate": 1.614239306984299e-05, "loss": 0.0958, "loss_lm": 0.017263726331293583, "loss_seg": 0.07853227853775024, "mean_token_accuracy": 0.9952667057514191, "num_tokens": 2321841511.0, "step": 5463 }, { "entropy": 0.018200083170086145, "epoch": 2.391290075500602, "grad_norm": 16.25, "learning_rate": 1.6139685977260424e-05, "loss": 0.111, "loss_lm": 0.015610468108206987, "loss_seg": 0.09536992479115725, "mean_token_accuracy": 0.9952490925788879, "num_tokens": 2322266653.0, "step": 5464 }, { "entropy": 0.018637624569237232, "epoch": 2.3917277601488127, "grad_norm": 18.5, "learning_rate": 1.6136978884677857e-05, "loss": 0.0958, "loss_lm": 0.01569813652895391, "loss_seg": 0.08013528771698475, "mean_token_accuracy": 0.9951076805591583, "num_tokens": 2322691760.0, "step": 5465 }, { "entropy": 0.01843215525150299, "epoch": 2.3921654447970235, "grad_norm": 3.78125, "learning_rate": 1.6134271792095288e-05, "loss": 0.1203, "loss_lm": 0.015416679671034217, "loss_seg": 0.10487752966582775, "mean_token_accuracy": 0.9951490163803101, "num_tokens": 2323116561.0, "step": 5466 }, { "entropy": 0.01769967842847109, "epoch": 2.392603129445235, "grad_norm": 3.328125, "learning_rate": 1.613156469951272e-05, "loss": 0.1266, "loss_lm": 0.016559912590309978, "loss_seg": 0.11001264862716198, "mean_token_accuracy": 0.9951916337013245, "num_tokens": 2323540979.0, "step": 5467 }, { "entropy": 0.01872924156486988, "epoch": 2.3930408140934456, "grad_norm": 3.59375, "learning_rate": 1.612885760693016e-05, "loss": 0.0944, "loss_lm": 0.015865452121943235, "loss_seg": 0.0785293485969305, "mean_token_accuracy": 0.9950131922960281, "num_tokens": 2323966076.0, "step": 5468 }, { "entropy": 0.018233826849609613, "epoch": 2.393478498741657, "grad_norm": 3.71875, "learning_rate": 1.6126150514347592e-05, "loss": 0.0885, "loss_lm": 0.01556030847132206, "loss_seg": 0.07291648630052805, "mean_token_accuracy": 0.9952498227357864, "num_tokens": 2324391289.0, "step": 5469 }, { "entropy": 0.018424312118440866, "epoch": 2.3939161833898677, "grad_norm": 10.875, "learning_rate": 1.6123443421765026e-05, "loss": 0.0922, "loss_lm": 0.018698603380471468, "loss_seg": 0.0735123036429286, "mean_token_accuracy": 0.9952271431684494, "num_tokens": 2324816380.0, "step": 5470 }, { "entropy": 0.018036102410405874, "epoch": 2.3943538680380785, "grad_norm": 7.90625, "learning_rate": 1.6120736329182456e-05, "loss": 0.0764, "loss_lm": 0.014186692424118519, "loss_seg": 0.06225454621016979, "mean_token_accuracy": 0.9953524023294449, "num_tokens": 2325241713.0, "step": 5471 }, { "entropy": 0.0180372535251081, "epoch": 2.3947915526862893, "grad_norm": 7.59375, "learning_rate": 1.611802923659989e-05, "loss": 0.1082, "loss_lm": 0.014323921874165535, "loss_seg": 0.09391122683882713, "mean_token_accuracy": 0.9953135848045349, "num_tokens": 2325667326.0, "step": 5472 }, { "entropy": 0.018258171156048775, "epoch": 2.3952292373345005, "grad_norm": 4.09375, "learning_rate": 1.6115322144017327e-05, "loss": 0.0991, "loss_lm": 0.01800612872466445, "loss_seg": 0.08112828712910414, "mean_token_accuracy": 0.9951601326465607, "num_tokens": 2326092521.0, "step": 5473 }, { "entropy": 0.01810413831844926, "epoch": 2.3956669219827114, "grad_norm": 7.375, "learning_rate": 1.611261505143476e-05, "loss": 0.1061, "loss_lm": 0.01600721664726734, "loss_seg": 0.09006589092314243, "mean_token_accuracy": 0.9951680153608322, "num_tokens": 2326517157.0, "step": 5474 }, { "entropy": 0.018103499431163073, "epoch": 2.3961046066309226, "grad_norm": 13.8125, "learning_rate": 1.6109907958852195e-05, "loss": 0.1229, "loss_lm": 0.016969902208074927, "loss_seg": 0.10588527843356133, "mean_token_accuracy": 0.9953485727310181, "num_tokens": 2326942360.0, "step": 5475 }, { "entropy": 0.018776646815240383, "epoch": 2.3965422912791334, "grad_norm": 10.0625, "learning_rate": 1.6107200866269625e-05, "loss": 0.1224, "loss_lm": 0.0159804355353117, "loss_seg": 0.10639774799346924, "mean_token_accuracy": 0.9950518757104874, "num_tokens": 2327366912.0, "step": 5476 }, { "entropy": 0.01811381336301565, "epoch": 2.3969799759273442, "grad_norm": 3.09375, "learning_rate": 1.610449377368706e-05, "loss": 0.1173, "loss_lm": 0.014268555212765932, "loss_seg": 0.10301043465733528, "mean_token_accuracy": 0.9952554255723953, "num_tokens": 2327791343.0, "step": 5477 }, { "entropy": 0.01829537283629179, "epoch": 2.3974176605755555, "grad_norm": 9.75, "learning_rate": 1.6101786681104496e-05, "loss": 0.1111, "loss_lm": 0.015528081450611353, "loss_seg": 0.09554578363895416, "mean_token_accuracy": 0.9952305406332016, "num_tokens": 2328216507.0, "step": 5478 }, { "entropy": 0.01812801556661725, "epoch": 2.3978553452237663, "grad_norm": 7.5625, "learning_rate": 1.609907958852193e-05, "loss": 0.1092, "loss_lm": 0.015710202511399984, "loss_seg": 0.0935196615755558, "mean_token_accuracy": 0.9953228384256363, "num_tokens": 2328641308.0, "step": 5479 }, { "entropy": 0.01822924055159092, "epoch": 2.398293029871977, "grad_norm": 5.6875, "learning_rate": 1.609637249593936e-05, "loss": 0.0774, "loss_lm": 0.01594200893305242, "loss_seg": 0.061503175646066666, "mean_token_accuracy": 0.9952718615531921, "num_tokens": 2329065953.0, "step": 5480 }, { "entropy": 0.018793914932757616, "epoch": 2.3987307145201884, "grad_norm": 10.9375, "learning_rate": 1.6093665403356794e-05, "loss": 0.1509, "loss_lm": 0.017730812774971128, "loss_seg": 0.13316866010427475, "mean_token_accuracy": 0.9950502812862396, "num_tokens": 2329491146.0, "step": 5481 }, { "entropy": 0.017708846367895603, "epoch": 2.399168399168399, "grad_norm": 14.75, "learning_rate": 1.6090958310774227e-05, "loss": 0.1112, "loss_lm": 0.012504783226177096, "loss_seg": 0.09868239425122738, "mean_token_accuracy": 0.995365172624588, "num_tokens": 2329915491.0, "step": 5482 }, { "entropy": 0.018356946762651205, "epoch": 2.39960608381661, "grad_norm": 3.828125, "learning_rate": 1.6088251218191665e-05, "loss": 0.1281, "loss_lm": 0.017359464894980192, "loss_seg": 0.11076911725103855, "mean_token_accuracy": 0.9950476884841919, "num_tokens": 2330340090.0, "step": 5483 }, { "entropy": 0.01869166875258088, "epoch": 2.4000437684648213, "grad_norm": 6.75, "learning_rate": 1.6085544125609098e-05, "loss": 0.1055, "loss_lm": 0.01644330401904881, "loss_seg": 0.08903711661696434, "mean_token_accuracy": 0.9950685650110245, "num_tokens": 2330764483.0, "step": 5484 }, { "entropy": 0.018687895499169827, "epoch": 2.400481453113032, "grad_norm": 5.53125, "learning_rate": 1.608283703302653e-05, "loss": 0.0964, "loss_lm": 0.01641528308391571, "loss_seg": 0.07995526120066643, "mean_token_accuracy": 0.9949941635131836, "num_tokens": 2331189773.0, "step": 5485 }, { "entropy": 0.018166106659919024, "epoch": 2.400919137761243, "grad_norm": 21.125, "learning_rate": 1.6080129940443962e-05, "loss": 0.1272, "loss_lm": 0.017694663256406784, "loss_seg": 0.1094581326469779, "mean_token_accuracy": 0.9952452927827835, "num_tokens": 2331614552.0, "step": 5486 }, { "entropy": 0.017829722724854946, "epoch": 2.401356822409454, "grad_norm": 5.65625, "learning_rate": 1.6077422847861396e-05, "loss": 0.0956, "loss_lm": 0.015252452110871673, "loss_seg": 0.08029979467391968, "mean_token_accuracy": 0.9954579919576645, "num_tokens": 2332039619.0, "step": 5487 }, { "entropy": 0.018448999151587486, "epoch": 2.401794507057665, "grad_norm": 5.625, "learning_rate": 1.6074715755278833e-05, "loss": 0.1167, "loss_lm": 0.018222481943666935, "loss_seg": 0.09844156261533499, "mean_token_accuracy": 0.9951373338699341, "num_tokens": 2332464673.0, "step": 5488 }, { "entropy": 0.018575615249574184, "epoch": 2.4022321917058758, "grad_norm": 4.78125, "learning_rate": 1.6072008662696267e-05, "loss": 0.1502, "loss_lm": 0.015239803120493889, "loss_seg": 0.13499979674816132, "mean_token_accuracy": 0.995215579867363, "num_tokens": 2332889972.0, "step": 5489 }, { "entropy": 0.018196251709014177, "epoch": 2.402669876354087, "grad_norm": 7.1875, "learning_rate": 1.6069301570113697e-05, "loss": 0.0875, "loss_lm": 0.014560960466042161, "loss_seg": 0.07296579144895077, "mean_token_accuracy": 0.9951993525028229, "num_tokens": 2333314669.0, "step": 5490 }, { "entropy": 0.017760529182851315, "epoch": 2.403107561002298, "grad_norm": 10.0625, "learning_rate": 1.606659447753113e-05, "loss": 0.0993, "loss_lm": 0.016062092036008835, "loss_seg": 0.08328165393322706, "mean_token_accuracy": 0.9952778071165085, "num_tokens": 2333739654.0, "step": 5491 }, { "entropy": 0.017953671514987946, "epoch": 2.4035452456505086, "grad_norm": 10.3125, "learning_rate": 1.6063887384948565e-05, "loss": 0.1159, "loss_lm": 0.015895403688773513, "loss_seg": 0.1000334583222866, "mean_token_accuracy": 0.9951618164777756, "num_tokens": 2334164582.0, "step": 5492 }, { "entropy": 0.01786002703011036, "epoch": 2.40398293029872, "grad_norm": 3.53125, "learning_rate": 1.6061180292366e-05, "loss": 0.1248, "loss_lm": 0.01737886155024171, "loss_seg": 0.10740752145648003, "mean_token_accuracy": 0.9953284710645676, "num_tokens": 2334589656.0, "step": 5493 }, { "entropy": 0.018232007510960102, "epoch": 2.4044206149469307, "grad_norm": 5.875, "learning_rate": 1.6058473199783436e-05, "loss": 0.1068, "loss_lm": 0.017552434001117945, "loss_seg": 0.08924980089068413, "mean_token_accuracy": 0.9951882511377335, "num_tokens": 2335013999.0, "step": 5494 }, { "entropy": 0.017933575436472893, "epoch": 2.4048582995951415, "grad_norm": 5.0625, "learning_rate": 1.6055766107200866e-05, "loss": 0.0766, "loss_lm": 0.016518230084329844, "loss_seg": 0.060077049769461155, "mean_token_accuracy": 0.9954204559326172, "num_tokens": 2335439092.0, "step": 5495 }, { "entropy": 0.01762616215273738, "epoch": 2.405295984243353, "grad_norm": 2.921875, "learning_rate": 1.60530590146183e-05, "loss": 0.09, "loss_lm": 0.014813644578680396, "loss_seg": 0.07523083128035069, "mean_token_accuracy": 0.9953559637069702, "num_tokens": 2335863894.0, "step": 5496 }, { "entropy": 0.017755390144884586, "epoch": 2.4057336688915636, "grad_norm": 9.6875, "learning_rate": 1.6050351922035733e-05, "loss": 0.1198, "loss_lm": 0.01626143418252468, "loss_seg": 0.10356059484183788, "mean_token_accuracy": 0.9955013990402222, "num_tokens": 2336288633.0, "step": 5497 }, { "entropy": 0.017928924411535263, "epoch": 2.4061713535397744, "grad_norm": 3.8125, "learning_rate": 1.6047644829453167e-05, "loss": 0.1107, "loss_lm": 0.01443233690224588, "loss_seg": 0.0962259117513895, "mean_token_accuracy": 0.9953549355268478, "num_tokens": 2336713535.0, "step": 5498 }, { "entropy": 0.018145637586712837, "epoch": 2.4066090381879857, "grad_norm": 9.125, "learning_rate": 1.6044937736870604e-05, "loss": 0.1293, "loss_lm": 0.01750055141746998, "loss_seg": 0.11179422773420811, "mean_token_accuracy": 0.9951296895742416, "num_tokens": 2337138528.0, "step": 5499 }, { "entropy": 0.018230590503662825, "epoch": 2.4070467228361965, "grad_norm": 4.21875, "learning_rate": 1.6042230644288035e-05, "loss": 0.1014, "loss_lm": 0.016811572713777423, "loss_seg": 0.08459166809916496, "mean_token_accuracy": 0.9952105432748795, "num_tokens": 2337563575.0, "step": 5500 }, { "entropy": 0.01831607660278678, "epoch": 2.4074844074844073, "grad_norm": 5.03125, "learning_rate": 1.6039523551705468e-05, "loss": 0.1408, "loss_lm": 0.015046218177303672, "loss_seg": 0.12571527622640133, "mean_token_accuracy": 0.9951161742210388, "num_tokens": 2337988814.0, "step": 5501 }, { "entropy": 0.018249062355607748, "epoch": 2.4079220921326185, "grad_norm": 9.6875, "learning_rate": 1.6036816459122902e-05, "loss": 0.1376, "loss_lm": 0.018752128817141056, "loss_seg": 0.11886439472436905, "mean_token_accuracy": 0.9952483922243118, "num_tokens": 2338414314.0, "step": 5502 }, { "entropy": 0.01907777553424239, "epoch": 2.4083597767808294, "grad_norm": 12.5625, "learning_rate": 1.6034109366540336e-05, "loss": 0.0987, "loss_lm": 0.01641805679537356, "loss_seg": 0.08225716277956963, "mean_token_accuracy": 0.9950260072946548, "num_tokens": 2338839507.0, "step": 5503 }, { "entropy": 0.018139688298106194, "epoch": 2.4087974614290406, "grad_norm": 2.640625, "learning_rate": 1.603140227395777e-05, "loss": 0.1191, "loss_lm": 0.01626691804267466, "loss_seg": 0.1028144583106041, "mean_token_accuracy": 0.9952570796012878, "num_tokens": 2339264508.0, "step": 5504 }, { "entropy": 0.018503549974411726, "epoch": 2.4092351460772514, "grad_norm": 5.03125, "learning_rate": 1.6028695181375203e-05, "loss": 0.121, "loss_lm": 0.017788216704502702, "loss_seg": 0.10321675147861242, "mean_token_accuracy": 0.9950817078351974, "num_tokens": 2339688831.0, "step": 5505 }, { "entropy": 0.017822604160755873, "epoch": 2.4096728307254622, "grad_norm": 5.53125, "learning_rate": 1.6025988088792637e-05, "loss": 0.1351, "loss_lm": 0.014452581759542227, "loss_seg": 0.12067290209233761, "mean_token_accuracy": 0.9953514188528061, "num_tokens": 2340113857.0, "step": 5506 }, { "entropy": 0.018403143156319857, "epoch": 2.410110515373673, "grad_norm": 4.65625, "learning_rate": 1.602328099621007e-05, "loss": 0.092, "loss_lm": 0.01789771136827767, "loss_seg": 0.07405898906290531, "mean_token_accuracy": 0.9952277690172195, "num_tokens": 2340539406.0, "step": 5507 }, { "entropy": 0.01800515130162239, "epoch": 2.4105482000218843, "grad_norm": 10.3125, "learning_rate": 1.6020573903627504e-05, "loss": 0.0962, "loss_lm": 0.01440122863277793, "loss_seg": 0.08177772350609303, "mean_token_accuracy": 0.9951748549938202, "num_tokens": 2340964217.0, "step": 5508 }, { "entropy": 0.01820017909631133, "epoch": 2.410985884670095, "grad_norm": 20.125, "learning_rate": 1.6017866811044938e-05, "loss": 0.0984, "loss_lm": 0.014176285360008478, "loss_seg": 0.08421865478157997, "mean_token_accuracy": 0.9951852858066559, "num_tokens": 2341389324.0, "step": 5509 }, { "entropy": 0.017835000064224005, "epoch": 2.4114235693183064, "grad_norm": 4.46875, "learning_rate": 1.6015159718462372e-05, "loss": 0.0949, "loss_lm": 0.01639433135278523, "loss_seg": 0.07847480289638042, "mean_token_accuracy": 0.9954012632369995, "num_tokens": 2341814385.0, "step": 5510 }, { "entropy": 0.01774869067594409, "epoch": 2.411861253966517, "grad_norm": 5.5, "learning_rate": 1.6012452625879805e-05, "loss": 0.103, "loss_lm": 0.0154021130874753, "loss_seg": 0.08764087129384279, "mean_token_accuracy": 0.9952297061681747, "num_tokens": 2342239695.0, "step": 5511 }, { "entropy": 0.017361342441290617, "epoch": 2.412298938614728, "grad_norm": 5.40625, "learning_rate": 1.600974553329724e-05, "loss": 0.1017, "loss_lm": 0.015306073939427733, "loss_seg": 0.08643428515642881, "mean_token_accuracy": 0.9954908341169357, "num_tokens": 2342665184.0, "step": 5512 }, { "entropy": 0.01907981652766466, "epoch": 2.412736623262939, "grad_norm": 61.75, "learning_rate": 1.6007038440714673e-05, "loss": 0.0907, "loss_lm": 0.01894344389438629, "loss_seg": 0.0717069385573268, "mean_token_accuracy": 0.9949876070022583, "num_tokens": 2343091109.0, "step": 5513 }, { "entropy": 0.018173267599195242, "epoch": 2.41317430791115, "grad_norm": 4.875, "learning_rate": 1.6004331348132107e-05, "loss": 0.1073, "loss_lm": 0.01626112638041377, "loss_seg": 0.09107928909361362, "mean_token_accuracy": 0.9953774958848953, "num_tokens": 2343515928.0, "step": 5514 }, { "entropy": 0.01809565769508481, "epoch": 2.413611992559361, "grad_norm": 7.96875, "learning_rate": 1.600162425554954e-05, "loss": 0.1382, "loss_lm": 0.014510361012071371, "loss_seg": 0.12373626232147217, "mean_token_accuracy": 0.9952524602413177, "num_tokens": 2343940308.0, "step": 5515 }, { "entropy": 0.018854957539588213, "epoch": 2.414049677207572, "grad_norm": 6.0625, "learning_rate": 1.5998917162966974e-05, "loss": 0.0932, "loss_lm": 0.017624666448682547, "loss_seg": 0.07559095975011587, "mean_token_accuracy": 0.9950435310602188, "num_tokens": 2344365193.0, "step": 5516 }, { "entropy": 0.017994452267885208, "epoch": 2.414487361855783, "grad_norm": 9.8125, "learning_rate": 1.5996210070384408e-05, "loss": 0.1622, "loss_lm": 0.01639425684697926, "loss_seg": 0.14580979943275452, "mean_token_accuracy": 0.9953122287988663, "num_tokens": 2344790492.0, "step": 5517 }, { "entropy": 0.018317211885005236, "epoch": 2.4149250465039938, "grad_norm": 7.3125, "learning_rate": 1.599350297780184e-05, "loss": 0.1174, "loss_lm": 0.01530133024789393, "loss_seg": 0.10207658261060715, "mean_token_accuracy": 0.9952919483184814, "num_tokens": 2345215890.0, "step": 5518 }, { "entropy": 0.01816538441926241, "epoch": 2.415362731152205, "grad_norm": 5.375, "learning_rate": 1.5990795885219275e-05, "loss": 0.1158, "loss_lm": 0.01745686656795442, "loss_seg": 0.09833172801882029, "mean_token_accuracy": 0.9951989650726318, "num_tokens": 2345640549.0, "step": 5519 }, { "entropy": 0.018289034720510244, "epoch": 2.415800415800416, "grad_norm": 3.9375, "learning_rate": 1.598808879263671e-05, "loss": 0.1225, "loss_lm": 0.01468715537339449, "loss_seg": 0.10779180750250816, "mean_token_accuracy": 0.9952061772346497, "num_tokens": 2346065656.0, "step": 5520 }, { "entropy": 0.018195706885308027, "epoch": 2.4162381004486266, "grad_norm": 12.3125, "learning_rate": 1.5985381700054143e-05, "loss": 0.1043, "loss_lm": 0.01769480132497847, "loss_seg": 0.08659140858799219, "mean_token_accuracy": 0.9951284527778625, "num_tokens": 2346490752.0, "step": 5521 }, { "entropy": 0.018216007389128208, "epoch": 2.416675785096838, "grad_norm": 3.96875, "learning_rate": 1.5982674607471576e-05, "loss": 0.1663, "loss_lm": 0.015249732416123152, "loss_seg": 0.1510427389293909, "mean_token_accuracy": 0.9952686280012131, "num_tokens": 2346915915.0, "step": 5522 }, { "entropy": 0.017650362104177475, "epoch": 2.4171134697450487, "grad_norm": 6.78125, "learning_rate": 1.597996751488901e-05, "loss": 0.1253, "loss_lm": 0.01271835807710886, "loss_seg": 0.11254519689828157, "mean_token_accuracy": 0.9953629970550537, "num_tokens": 2347340917.0, "step": 5523 }, { "entropy": 0.018623182084411383, "epoch": 2.4175511543932595, "grad_norm": 5.25, "learning_rate": 1.5977260422306444e-05, "loss": 0.1229, "loss_lm": 0.01668862858787179, "loss_seg": 0.10622644051909447, "mean_token_accuracy": 0.9950686395168304, "num_tokens": 2347765925.0, "step": 5524 }, { "entropy": 0.017904089763760567, "epoch": 2.4179888390414708, "grad_norm": 19.5, "learning_rate": 1.5974553329723878e-05, "loss": 0.0757, "loss_lm": 0.015165813034400344, "loss_seg": 0.06050430703908205, "mean_token_accuracy": 0.9952622652053833, "num_tokens": 2348191207.0, "step": 5525 }, { "entropy": 0.018286897335201502, "epoch": 2.4184265236896816, "grad_norm": 4.6875, "learning_rate": 1.597184623714131e-05, "loss": 0.0895, "loss_lm": 0.014062026515603065, "loss_seg": 0.07547425013035536, "mean_token_accuracy": 0.9952726662158966, "num_tokens": 2348616329.0, "step": 5526 }, { "entropy": 0.01839364180341363, "epoch": 2.4188642083378924, "grad_norm": 5.21875, "learning_rate": 1.5969139144558745e-05, "loss": 0.1281, "loss_lm": 0.016089352080598474, "loss_seg": 0.11196698434650898, "mean_token_accuracy": 0.9951247721910477, "num_tokens": 2349041496.0, "step": 5527 }, { "entropy": 0.01778762973845005, "epoch": 2.4193018929861037, "grad_norm": 4.21875, "learning_rate": 1.5966432051976175e-05, "loss": 0.1045, "loss_lm": 0.017400632612407207, "loss_seg": 0.08707697503268719, "mean_token_accuracy": 0.9954070895910263, "num_tokens": 2349466459.0, "step": 5528 }, { "entropy": 0.018048708327114582, "epoch": 2.4197395776343145, "grad_norm": 15.5, "learning_rate": 1.596372495939361e-05, "loss": 0.1016, "loss_lm": 0.016008538426831365, "loss_seg": 0.08559169992804527, "mean_token_accuracy": 0.995288074016571, "num_tokens": 2349891502.0, "step": 5529 }, { "entropy": 0.01853402704000473, "epoch": 2.4201772622825253, "grad_norm": 13.1875, "learning_rate": 1.5961017866811046e-05, "loss": 0.1129, "loss_lm": 0.016589727951213717, "loss_seg": 0.09631168469786644, "mean_token_accuracy": 0.9951204359531403, "num_tokens": 2350316302.0, "step": 5530 }, { "entropy": 0.017765233293175697, "epoch": 2.4206149469307365, "grad_norm": 4.625, "learning_rate": 1.595831077422848e-05, "loss": 0.0976, "loss_lm": 0.017534819897264242, "loss_seg": 0.08009575679898262, "mean_token_accuracy": 0.9953740388154984, "num_tokens": 2350741246.0, "step": 5531 }, { "entropy": 0.017829842399805784, "epoch": 2.4210526315789473, "grad_norm": 13.25, "learning_rate": 1.5955603681645914e-05, "loss": 0.1192, "loss_lm": 0.016708520706743002, "loss_seg": 0.10252562910318375, "mean_token_accuracy": 0.9952394813299179, "num_tokens": 2351166561.0, "step": 5532 }, { "entropy": 0.018193250987678766, "epoch": 2.421490316227158, "grad_norm": 5.6875, "learning_rate": 1.5952896589063344e-05, "loss": 0.0856, "loss_lm": 0.015289404429495335, "loss_seg": 0.07034012209624052, "mean_token_accuracy": 0.995244175195694, "num_tokens": 2351591272.0, "step": 5533 }, { "entropy": 0.018164937384426594, "epoch": 2.4219280008753694, "grad_norm": 4.90625, "learning_rate": 1.5950189496480778e-05, "loss": 0.0865, "loss_lm": 0.017476954264566302, "loss_seg": 0.06905872002243996, "mean_token_accuracy": 0.9952418953180313, "num_tokens": 2352016173.0, "step": 5534 }, { "entropy": 0.017719615250825882, "epoch": 2.4223656855235802, "grad_norm": 5.96875, "learning_rate": 1.5947482403898215e-05, "loss": 0.1437, "loss_lm": 0.01663359603844583, "loss_seg": 0.12708538211882114, "mean_token_accuracy": 0.9953790307044983, "num_tokens": 2352440705.0, "step": 5535 }, { "entropy": 0.017993449233472347, "epoch": 2.422803370171791, "grad_norm": 4.03125, "learning_rate": 1.594477531131565e-05, "loss": 0.0997, "loss_lm": 0.016753718489781022, "loss_seg": 0.0829883273690939, "mean_token_accuracy": 0.9953555017709732, "num_tokens": 2352865899.0, "step": 5536 }, { "entropy": 0.018712771125137806, "epoch": 2.4232410548200023, "grad_norm": 19.25, "learning_rate": 1.5942068218733082e-05, "loss": 0.0978, "loss_lm": 0.016333064762875438, "loss_seg": 0.0814954973757267, "mean_token_accuracy": 0.9951075315475464, "num_tokens": 2353291820.0, "step": 5537 }, { "entropy": 0.017936107702553272, "epoch": 2.423678739468213, "grad_norm": 41.75, "learning_rate": 1.5939361126150513e-05, "loss": 0.1043, "loss_lm": 0.018742039799690247, "loss_seg": 0.08555191569030285, "mean_token_accuracy": 0.9953297078609467, "num_tokens": 2353717018.0, "step": 5538 }, { "entropy": 0.01784153375774622, "epoch": 2.4241164241164244, "grad_norm": 8.8125, "learning_rate": 1.5936654033567946e-05, "loss": 0.0848, "loss_lm": 0.015616246033459902, "loss_seg": 0.06920273508876562, "mean_token_accuracy": 0.9952444434165955, "num_tokens": 2354142365.0, "step": 5539 }, { "entropy": 0.018295818008482456, "epoch": 2.424554108764635, "grad_norm": 4.375, "learning_rate": 1.5933946940985384e-05, "loss": 0.1063, "loss_lm": 0.016421100357547402, "loss_seg": 0.0898699788376689, "mean_token_accuracy": 0.995225802063942, "num_tokens": 2354567524.0, "step": 5540 }, { "entropy": 0.018572143744677305, "epoch": 2.424991793412846, "grad_norm": 3.84375, "learning_rate": 1.5931239848402817e-05, "loss": 0.1047, "loss_lm": 0.014484613901004195, "loss_seg": 0.09016839414834976, "mean_token_accuracy": 0.9952189326286316, "num_tokens": 2354993346.0, "step": 5541 }, { "entropy": 0.01795135485008359, "epoch": 2.425429478061057, "grad_norm": 4.875, "learning_rate": 1.592853275582025e-05, "loss": 0.1284, "loss_lm": 0.016133737983182073, "loss_seg": 0.1122279055416584, "mean_token_accuracy": 0.9953454434871674, "num_tokens": 2355419119.0, "step": 5542 }, { "entropy": 0.01830103201791644, "epoch": 2.425867162709268, "grad_norm": 3.96875, "learning_rate": 1.592582566323768e-05, "loss": 0.104, "loss_lm": 0.016020084032788873, "loss_seg": 0.08800431527197361, "mean_token_accuracy": 0.9951155036687851, "num_tokens": 2355844298.0, "step": 5543 }, { "entropy": 0.01756341615691781, "epoch": 2.426304847357479, "grad_norm": 3.125, "learning_rate": 1.5923118570655115e-05, "loss": 0.0759, "loss_lm": 0.015029825270175934, "loss_seg": 0.06088872440159321, "mean_token_accuracy": 0.9954327344894409, "num_tokens": 2356269012.0, "step": 5544 }, { "entropy": 0.017826673574745655, "epoch": 2.42674253200569, "grad_norm": 7.90625, "learning_rate": 1.5920411478072552e-05, "loss": 0.0994, "loss_lm": 0.015046299900859594, "loss_seg": 0.08433342911303043, "mean_token_accuracy": 0.9953973740339279, "num_tokens": 2356694143.0, "step": 5545 }, { "entropy": 0.018108399119228125, "epoch": 2.427180216653901, "grad_norm": 8.1875, "learning_rate": 1.5917704385489986e-05, "loss": 0.1143, "loss_lm": 0.017728803912177682, "loss_seg": 0.09654220007359982, "mean_token_accuracy": 0.9951710551977158, "num_tokens": 2357119023.0, "step": 5546 }, { "entropy": 0.01903891284018755, "epoch": 2.4276179013021117, "grad_norm": 6.25, "learning_rate": 1.591499729290742e-05, "loss": 0.1286, "loss_lm": 0.01824320573359728, "loss_seg": 0.11038600467145443, "mean_token_accuracy": 0.9949796199798584, "num_tokens": 2357544071.0, "step": 5547 }, { "entropy": 0.01828195946291089, "epoch": 2.4280555859503226, "grad_norm": 8.0625, "learning_rate": 1.591229020032485e-05, "loss": 0.1328, "loss_lm": 0.018243329832330346, "loss_seg": 0.1145890299230814, "mean_token_accuracy": 0.9952121376991272, "num_tokens": 2357969899.0, "step": 5548 }, { "entropy": 0.018047635443508625, "epoch": 2.428493270598534, "grad_norm": 9.1875, "learning_rate": 1.5909583107742284e-05, "loss": 0.0976, "loss_lm": 0.016512705013155937, "loss_seg": 0.08111277781426907, "mean_token_accuracy": 0.9952652603387833, "num_tokens": 2358394798.0, "step": 5549 }, { "entropy": 0.018010733649134636, "epoch": 2.4289309552467446, "grad_norm": 4.9375, "learning_rate": 1.590687601515972e-05, "loss": 0.1151, "loss_lm": 0.014327417826279998, "loss_seg": 0.10077164694666862, "mean_token_accuracy": 0.995162308216095, "num_tokens": 2358819943.0, "step": 5550 }, { "entropy": 0.01854670699685812, "epoch": 2.429368639894956, "grad_norm": 5.75, "learning_rate": 1.5904168922577155e-05, "loss": 0.0956, "loss_lm": 0.01787816360592842, "loss_seg": 0.07776589691638947, "mean_token_accuracy": 0.9952143281698227, "num_tokens": 2359244737.0, "step": 5551 }, { "entropy": 0.017946819309145212, "epoch": 2.4298063245431667, "grad_norm": 37.0, "learning_rate": 1.5901461829994585e-05, "loss": 0.0976, "loss_lm": 0.01636196975596249, "loss_seg": 0.08127579838037491, "mean_token_accuracy": 0.9953044801950455, "num_tokens": 2359669959.0, "step": 5552 }, { "entropy": 0.018487158231437206, "epoch": 2.4302440091913775, "grad_norm": 7.09375, "learning_rate": 1.589875473741202e-05, "loss": 0.0672, "loss_lm": 0.01774019584991038, "loss_seg": 0.04950749594718218, "mean_token_accuracy": 0.995115339756012, "num_tokens": 2360095575.0, "step": 5553 }, { "entropy": 0.018524486105889082, "epoch": 2.4306816938395888, "grad_norm": 18.75, "learning_rate": 1.5896047644829452e-05, "loss": 0.1149, "loss_lm": 0.01457707048393786, "loss_seg": 0.10034659318625927, "mean_token_accuracy": 0.9952297061681747, "num_tokens": 2360520671.0, "step": 5554 }, { "entropy": 0.018256175331771374, "epoch": 2.4311193784877996, "grad_norm": 4.59375, "learning_rate": 1.589334055224689e-05, "loss": 0.1039, "loss_lm": 0.01600556424818933, "loss_seg": 0.08788845874369144, "mean_token_accuracy": 0.9952718168497086, "num_tokens": 2360945983.0, "step": 5555 }, { "entropy": 0.01858364138752222, "epoch": 2.4315570631360104, "grad_norm": 9.5625, "learning_rate": 1.5890633459664323e-05, "loss": 0.1117, "loss_lm": 0.01592095522210002, "loss_seg": 0.09573726076632738, "mean_token_accuracy": 0.9951813966035843, "num_tokens": 2361371597.0, "step": 5556 }, { "entropy": 0.018726248759776354, "epoch": 2.4319947477842216, "grad_norm": 7.5625, "learning_rate": 1.5887926367081753e-05, "loss": 0.0937, "loss_lm": 0.013674169778823853, "loss_seg": 0.08003132976591587, "mean_token_accuracy": 0.9950557053089142, "num_tokens": 2361796706.0, "step": 5557 }, { "entropy": 0.01868873042985797, "epoch": 2.4324324324324325, "grad_norm": 7.15625, "learning_rate": 1.5885219274499187e-05, "loss": 0.0759, "loss_lm": 0.01787094702012837, "loss_seg": 0.05807768739759922, "mean_token_accuracy": 0.9950728714466095, "num_tokens": 2362221433.0, "step": 5558 }, { "entropy": 0.018720808904618025, "epoch": 2.4328701170806433, "grad_norm": 19.375, "learning_rate": 1.588251218191662e-05, "loss": 0.0909, "loss_lm": 0.01706266449764371, "loss_seg": 0.07380570657551289, "mean_token_accuracy": 0.9950604885816574, "num_tokens": 2362646343.0, "step": 5559 }, { "entropy": 0.018299158196896315, "epoch": 2.4333078017288545, "grad_norm": 4.03125, "learning_rate": 1.5879805089334055e-05, "loss": 0.0879, "loss_lm": 0.016726088477298617, "loss_seg": 0.07112596184015274, "mean_token_accuracy": 0.9952873140573502, "num_tokens": 2363072175.0, "step": 5560 }, { "entropy": 0.017922169528901577, "epoch": 2.4337454863770653, "grad_norm": 7.25, "learning_rate": 1.5877097996751492e-05, "loss": 0.173, "loss_lm": 0.017982000019401312, "loss_seg": 0.15505074709653854, "mean_token_accuracy": 0.9952864944934845, "num_tokens": 2363497489.0, "step": 5561 }, { "entropy": 0.0179670718498528, "epoch": 2.434183171025276, "grad_norm": 10.5625, "learning_rate": 1.5874390904168922e-05, "loss": 0.1315, "loss_lm": 0.015188144519925117, "loss_seg": 0.11634272616356611, "mean_token_accuracy": 0.9953580498695374, "num_tokens": 2363922890.0, "step": 5562 }, { "entropy": 0.01890828087925911, "epoch": 2.4346208556734874, "grad_norm": 21.625, "learning_rate": 1.5871683811586356e-05, "loss": 0.0906, "loss_lm": 0.015561989275738597, "loss_seg": 0.07501349598169327, "mean_token_accuracy": 0.9950643628835678, "num_tokens": 2364348552.0, "step": 5563 }, { "entropy": 0.018178424332290888, "epoch": 2.435058540321698, "grad_norm": 4.4375, "learning_rate": 1.586897671900379e-05, "loss": 0.1245, "loss_lm": 0.016102660913020372, "loss_seg": 0.10836623050272465, "mean_token_accuracy": 0.9953549653291702, "num_tokens": 2364773482.0, "step": 5564 }, { "entropy": 0.017748308833688498, "epoch": 2.435496224969909, "grad_norm": 4.71875, "learning_rate": 1.5866269626421223e-05, "loss": 0.1253, "loss_lm": 0.016845679376274347, "loss_seg": 0.10841158870607615, "mean_token_accuracy": 0.995225265622139, "num_tokens": 2365199015.0, "step": 5565 }, { "entropy": 0.01823227060958743, "epoch": 2.4359339096181203, "grad_norm": 5.46875, "learning_rate": 1.586356253383866e-05, "loss": 0.0982, "loss_lm": 0.017296630889177322, "loss_seg": 0.08093686029314995, "mean_token_accuracy": 0.9952812045812607, "num_tokens": 2365624716.0, "step": 5566 }, { "entropy": 0.018458168022334576, "epoch": 2.436371594266331, "grad_norm": 8.0625, "learning_rate": 1.586085544125609e-05, "loss": 0.1713, "loss_lm": 0.013987677404657006, "loss_seg": 0.15728192403912544, "mean_token_accuracy": 0.9952442795038223, "num_tokens": 2366050281.0, "step": 5567 }, { "entropy": 0.018290795385837555, "epoch": 2.436809278914542, "grad_norm": 6.40625, "learning_rate": 1.5858148348673524e-05, "loss": 0.0919, "loss_lm": 0.014281395357102156, "loss_seg": 0.07762348651885986, "mean_token_accuracy": 0.9952432066202164, "num_tokens": 2366474671.0, "step": 5568 }, { "entropy": 0.018075502943247557, "epoch": 2.437246963562753, "grad_norm": 6.34375, "learning_rate": 1.5855441256090958e-05, "loss": 0.1635, "loss_lm": 0.01522771269083023, "loss_seg": 0.14826484955847263, "mean_token_accuracy": 0.9953094869852066, "num_tokens": 2366899733.0, "step": 5569 }, { "entropy": 0.01856441516429186, "epoch": 2.437684648210964, "grad_norm": 5.25, "learning_rate": 1.5852734163508392e-05, "loss": 0.0897, "loss_lm": 0.01502171135507524, "loss_seg": 0.07472054287791252, "mean_token_accuracy": 0.9951624870300293, "num_tokens": 2367324895.0, "step": 5570 }, { "entropy": 0.018163637723773718, "epoch": 2.438122332859175, "grad_norm": 4.375, "learning_rate": 1.585002707092583e-05, "loss": 0.1013, "loss_lm": 0.018167702248319983, "loss_seg": 0.08313445467501879, "mean_token_accuracy": 0.9952268153429031, "num_tokens": 2367749903.0, "step": 5571 }, { "entropy": 0.01796681946143508, "epoch": 2.438560017507386, "grad_norm": 4.46875, "learning_rate": 1.584731997834326e-05, "loss": 0.1137, "loss_lm": 0.017317090649157763, "loss_seg": 0.09640972502529621, "mean_token_accuracy": 0.9952474981546402, "num_tokens": 2368174481.0, "step": 5572 }, { "entropy": 0.01832608412951231, "epoch": 2.438997702155597, "grad_norm": 2.9375, "learning_rate": 1.5844612885760693e-05, "loss": 0.0952, "loss_lm": 0.016942268470302224, "loss_seg": 0.07826142199337482, "mean_token_accuracy": 0.9952262789011002, "num_tokens": 2368599695.0, "step": 5573 }, { "entropy": 0.01866402616724372, "epoch": 2.4394353868038077, "grad_norm": 5.9375, "learning_rate": 1.5841905793178127e-05, "loss": 0.0954, "loss_lm": 0.013746097916737199, "loss_seg": 0.08169231377542019, "mean_token_accuracy": 0.9951923340559006, "num_tokens": 2369025050.0, "step": 5574 }, { "entropy": 0.018014200497418642, "epoch": 2.439873071452019, "grad_norm": 22.625, "learning_rate": 1.583919870059556e-05, "loss": 0.0919, "loss_lm": 0.014912390848621726, "loss_seg": 0.07697504758834839, "mean_token_accuracy": 0.9954010248184204, "num_tokens": 2369450306.0, "step": 5575 }, { "entropy": 0.0182267758063972, "epoch": 2.4403107561002297, "grad_norm": 5.84375, "learning_rate": 1.5836491608012994e-05, "loss": 0.0765, "loss_lm": 0.014774410519748926, "loss_seg": 0.06172512285411358, "mean_token_accuracy": 0.9952170699834824, "num_tokens": 2369875532.0, "step": 5576 }, { "entropy": 0.018831207416951656, "epoch": 2.4407484407484406, "grad_norm": 9.5625, "learning_rate": 1.5833784515430428e-05, "loss": 0.0973, "loss_lm": 0.01568237296305597, "loss_seg": 0.08156847767531872, "mean_token_accuracy": 0.9950464814901352, "num_tokens": 2370300845.0, "step": 5577 }, { "entropy": 0.01842612586915493, "epoch": 2.441186125396652, "grad_norm": 5.875, "learning_rate": 1.5831077422847862e-05, "loss": 0.0992, "loss_lm": 0.015593594871461391, "loss_seg": 0.0836507324129343, "mean_token_accuracy": 0.9951527565717697, "num_tokens": 2370725995.0, "step": 5578 }, { "entropy": 0.01816844940185547, "epoch": 2.4416238100448626, "grad_norm": 10.1875, "learning_rate": 1.5828370330265295e-05, "loss": 0.1046, "loss_lm": 0.01553729409351945, "loss_seg": 0.08903999719768763, "mean_token_accuracy": 0.9952116161584854, "num_tokens": 2371150745.0, "step": 5579 }, { "entropy": 0.01853549387305975, "epoch": 2.442061494693074, "grad_norm": 10.4375, "learning_rate": 1.582566323768273e-05, "loss": 0.1266, "loss_lm": 0.01666945731267333, "loss_seg": 0.10994583740830421, "mean_token_accuracy": 0.9952744096517563, "num_tokens": 2371575841.0, "step": 5580 }, { "entropy": 0.01796297123655677, "epoch": 2.4424991793412847, "grad_norm": 8.75, "learning_rate": 1.5822956145100163e-05, "loss": 0.1238, "loss_lm": 0.015395047375932336, "loss_seg": 0.10836328752338886, "mean_token_accuracy": 0.9953215718269348, "num_tokens": 2372000944.0, "step": 5581 }, { "entropy": 0.018092414364218712, "epoch": 2.4429368639894955, "grad_norm": 13.0, "learning_rate": 1.5820249052517597e-05, "loss": 0.1356, "loss_lm": 0.014332316350191832, "loss_seg": 0.12121848203241825, "mean_token_accuracy": 0.9953703731298447, "num_tokens": 2372426421.0, "step": 5582 }, { "entropy": 0.018223789520561695, "epoch": 2.4433745486377063, "grad_norm": 3.71875, "learning_rate": 1.581754195993503e-05, "loss": 0.0764, "loss_lm": 0.018458496779203415, "loss_seg": 0.05792560335248709, "mean_token_accuracy": 0.9952644407749176, "num_tokens": 2372850828.0, "step": 5583 }, { "entropy": 0.017774125561118126, "epoch": 2.4438122332859176, "grad_norm": 4.625, "learning_rate": 1.5814834867352464e-05, "loss": 0.0944, "loss_lm": 0.015771429520100355, "loss_seg": 0.0785790104418993, "mean_token_accuracy": 0.9953691959381104, "num_tokens": 2373275736.0, "step": 5584 }, { "entropy": 0.018442881759256124, "epoch": 2.4442499179341284, "grad_norm": 9.4375, "learning_rate": 1.5812127774769898e-05, "loss": 0.1132, "loss_lm": 0.015214070677757263, "loss_seg": 0.0979374349117279, "mean_token_accuracy": 0.9951937198638916, "num_tokens": 2373699961.0, "step": 5585 }, { "entropy": 0.018667049705982208, "epoch": 2.4446876025823396, "grad_norm": 4.9375, "learning_rate": 1.580942068218733e-05, "loss": 0.0852, "loss_lm": 0.016262742690742016, "loss_seg": 0.06894776597619057, "mean_token_accuracy": 0.9951322078704834, "num_tokens": 2374125164.0, "step": 5586 }, { "entropy": 0.01831875741481781, "epoch": 2.4451252872305504, "grad_norm": 4.25, "learning_rate": 1.5806713589604765e-05, "loss": 0.1147, "loss_lm": 0.015132880304008722, "loss_seg": 0.09951824322342873, "mean_token_accuracy": 0.9952605813741684, "num_tokens": 2374550513.0, "step": 5587 }, { "entropy": 0.018215648364275694, "epoch": 2.4455629718787613, "grad_norm": 9.5625, "learning_rate": 1.58040064970222e-05, "loss": 0.0858, "loss_lm": 0.016918943263590336, "loss_seg": 0.06887363363057375, "mean_token_accuracy": 0.9952675998210907, "num_tokens": 2374976019.0, "step": 5588 }, { "entropy": 0.017800335306674242, "epoch": 2.446000656526972, "grad_norm": 19.5, "learning_rate": 1.5801299404439633e-05, "loss": 0.0882, "loss_lm": 0.015393683454021811, "loss_seg": 0.0727864783257246, "mean_token_accuracy": 0.9953135251998901, "num_tokens": 2375401228.0, "step": 5589 }, { "entropy": 0.0175166972912848, "epoch": 2.4464383411751833, "grad_norm": 4.125, "learning_rate": 1.5798592311857066e-05, "loss": 0.1298, "loss_lm": 0.015033188508823514, "loss_seg": 0.11481267027556896, "mean_token_accuracy": 0.9954249858856201, "num_tokens": 2375826084.0, "step": 5590 }, { "entropy": 0.018247250467538834, "epoch": 2.446876025823394, "grad_norm": 7.15625, "learning_rate": 1.5795885219274497e-05, "loss": 0.111, "loss_lm": 0.013736048946157098, "loss_seg": 0.09731335751712322, "mean_token_accuracy": 0.9951764792203903, "num_tokens": 2376250814.0, "step": 5591 }, { "entropy": 0.01829934725537896, "epoch": 2.4473137104716054, "grad_norm": 4.84375, "learning_rate": 1.5793178126691934e-05, "loss": 0.0936, "loss_lm": 0.019705293234437704, "loss_seg": 0.07388421427458525, "mean_token_accuracy": 0.9952947497367859, "num_tokens": 2376676219.0, "step": 5592 }, { "entropy": 0.01821959065273404, "epoch": 2.447751395119816, "grad_norm": 6.0, "learning_rate": 1.5790471034109368e-05, "loss": 0.0931, "loss_lm": 0.0175005414057523, "loss_seg": 0.07559438515454531, "mean_token_accuracy": 0.995245635509491, "num_tokens": 2377101888.0, "step": 5593 }, { "entropy": 0.018256619106978178, "epoch": 2.448189079768027, "grad_norm": 4.25, "learning_rate": 1.57877639415268e-05, "loss": 0.0947, "loss_lm": 0.014966388698667288, "loss_seg": 0.07971861213445663, "mean_token_accuracy": 0.995197519659996, "num_tokens": 2377526636.0, "step": 5594 }, { "entropy": 0.018477729987353086, "epoch": 2.4486267644162383, "grad_norm": 4.0625, "learning_rate": 1.5785056848944235e-05, "loss": 0.0783, "loss_lm": 0.016082088695839047, "loss_seg": 0.06220395304262638, "mean_token_accuracy": 0.9950861483812332, "num_tokens": 2377951556.0, "step": 5595 }, { "entropy": 0.0180606571957469, "epoch": 2.449064449064449, "grad_norm": 4.5, "learning_rate": 1.5782349756361665e-05, "loss": 0.121, "loss_lm": 0.01418262324295938, "loss_seg": 0.1067769955843687, "mean_token_accuracy": 0.9952733367681503, "num_tokens": 2378375925.0, "step": 5596 }, { "entropy": 0.01785280928015709, "epoch": 2.44950213371266, "grad_norm": 25.0, "learning_rate": 1.5779642663779103e-05, "loss": 0.1082, "loss_lm": 0.014467394445091486, "loss_seg": 0.09368620812892914, "mean_token_accuracy": 0.9952406585216522, "num_tokens": 2378800719.0, "step": 5597 }, { "entropy": 0.018055367283523083, "epoch": 2.449939818360871, "grad_norm": 2.921875, "learning_rate": 1.5776935571196536e-05, "loss": 0.0776, "loss_lm": 0.0142702073790133, "loss_seg": 0.063354367390275, "mean_token_accuracy": 0.9952577352523804, "num_tokens": 2379224941.0, "step": 5598 }, { "entropy": 0.01814904250204563, "epoch": 2.450377503009082, "grad_norm": 10.5, "learning_rate": 1.577422847861397e-05, "loss": 0.0867, "loss_lm": 0.01601846143603325, "loss_seg": 0.07063750922679901, "mean_token_accuracy": 0.9952361136674881, "num_tokens": 2379650115.0, "step": 5599 }, { "entropy": 0.017579217441380024, "epoch": 2.450815187657293, "grad_norm": 16.0, "learning_rate": 1.57715213860314e-05, "loss": 0.0962, "loss_lm": 0.015206064796075225, "loss_seg": 0.08094677329063416, "mean_token_accuracy": 0.9954324215650558, "num_tokens": 2380074522.0, "step": 5600 }, { "entropy": 0.017478343565016985, "epoch": 2.451252872305504, "grad_norm": 6.78125, "learning_rate": 1.5768814293448834e-05, "loss": 0.1021, "loss_lm": 0.016013562446460128, "loss_seg": 0.086125448346138, "mean_token_accuracy": 0.9953056126832962, "num_tokens": 2380499458.0, "step": 5601 }, { "entropy": 0.017857260070741177, "epoch": 2.451690556953715, "grad_norm": 4.75, "learning_rate": 1.576610720086627e-05, "loss": 0.1002, "loss_lm": 0.017487526638433337, "loss_seg": 0.0826630350202322, "mean_token_accuracy": 0.9952468872070312, "num_tokens": 2380924431.0, "step": 5602 }, { "entropy": 0.01833792170509696, "epoch": 2.4521282416019257, "grad_norm": 4.03125, "learning_rate": 1.5763400108283705e-05, "loss": 0.0961, "loss_lm": 0.019189022248610854, "loss_seg": 0.07686585001647472, "mean_token_accuracy": 0.9951879680156708, "num_tokens": 2381349586.0, "step": 5603 }, { "entropy": 0.01794166164472699, "epoch": 2.452565926250137, "grad_norm": 6.53125, "learning_rate": 1.576069301570114e-05, "loss": 0.1042, "loss_lm": 0.016861572163179517, "loss_seg": 0.08733827620744705, "mean_token_accuracy": 0.9952858537435532, "num_tokens": 2381774211.0, "step": 5604 }, { "entropy": 0.018419744446873665, "epoch": 2.4530036108983477, "grad_norm": 4.25, "learning_rate": 1.575798592311857e-05, "loss": 0.1316, "loss_lm": 0.016126324888318777, "loss_seg": 0.11544760130345821, "mean_token_accuracy": 0.9950935393571854, "num_tokens": 2382199669.0, "step": 5605 }, { "entropy": 0.01827190723270178, "epoch": 2.4534412955465585, "grad_norm": 5.28125, "learning_rate": 1.5755278830536003e-05, "loss": 0.1145, "loss_lm": 0.017293120734393597, "loss_seg": 0.09716294333338737, "mean_token_accuracy": 0.9951480329036713, "num_tokens": 2382625230.0, "step": 5606 }, { "entropy": 0.01773506263270974, "epoch": 2.45387898019477, "grad_norm": 3.5, "learning_rate": 1.575257173795344e-05, "loss": 0.0925, "loss_lm": 0.0152681905310601, "loss_seg": 0.07725795451551676, "mean_token_accuracy": 0.9953157305717468, "num_tokens": 2383050035.0, "step": 5607 }, { "entropy": 0.018452791031450033, "epoch": 2.4543166648429806, "grad_norm": 20.875, "learning_rate": 1.5749864645370874e-05, "loss": 0.0709, "loss_lm": 0.015374452574178576, "loss_seg": 0.05552514363080263, "mean_token_accuracy": 0.9951306134462357, "num_tokens": 2383475743.0, "step": 5608 }, { "entropy": 0.017891235183924437, "epoch": 2.4547543494911914, "grad_norm": 4.0625, "learning_rate": 1.5747157552788307e-05, "loss": 0.0891, "loss_lm": 0.015289673814550042, "loss_seg": 0.07377439271658659, "mean_token_accuracy": 0.995365560054779, "num_tokens": 2383900995.0, "step": 5609 }, { "entropy": 0.018016157671809196, "epoch": 2.4551920341394027, "grad_norm": 11.9375, "learning_rate": 1.5744450460205738e-05, "loss": 0.081, "loss_lm": 0.012806231621652842, "loss_seg": 0.06818338111042976, "mean_token_accuracy": 0.995315209031105, "num_tokens": 2384325725.0, "step": 5610 }, { "entropy": 0.01839198637753725, "epoch": 2.4556297187876135, "grad_norm": 4.125, "learning_rate": 1.574174336762317e-05, "loss": 0.0932, "loss_lm": 0.015656074974685907, "loss_seg": 0.07757984381169081, "mean_token_accuracy": 0.9951515346765518, "num_tokens": 2384750573.0, "step": 5611 }, { "entropy": 0.018386027310043573, "epoch": 2.4560674034358243, "grad_norm": 7.03125, "learning_rate": 1.573903627504061e-05, "loss": 0.1202, "loss_lm": 0.016211897134780884, "loss_seg": 0.10398516990244389, "mean_token_accuracy": 0.9951470196247101, "num_tokens": 2385175640.0, "step": 5612 }, { "entropy": 0.018154153134673834, "epoch": 2.4565050880840356, "grad_norm": 6.5625, "learning_rate": 1.5736329182458042e-05, "loss": 0.1046, "loss_lm": 0.014108697650954127, "loss_seg": 0.09054040350019932, "mean_token_accuracy": 0.9951918870210648, "num_tokens": 2385601510.0, "step": 5613 }, { "entropy": 0.017915203236043453, "epoch": 2.4569427727322464, "grad_norm": 5.40625, "learning_rate": 1.5733622089875476e-05, "loss": 0.1139, "loss_lm": 0.01589218736626208, "loss_seg": 0.09805580228567123, "mean_token_accuracy": 0.9951874762773514, "num_tokens": 2386026387.0, "step": 5614 }, { "entropy": 0.01871062768623233, "epoch": 2.4573804573804576, "grad_norm": 4.125, "learning_rate": 1.5730914997292906e-05, "loss": 0.0955, "loss_lm": 0.017821799032390118, "loss_seg": 0.07772783376276493, "mean_token_accuracy": 0.995106965303421, "num_tokens": 2386451050.0, "step": 5615 }, { "entropy": 0.018497916869819164, "epoch": 2.4578181420286684, "grad_norm": 8.625, "learning_rate": 1.572820790471034e-05, "loss": 0.0951, "loss_lm": 0.015578573336824775, "loss_seg": 0.07953423727303743, "mean_token_accuracy": 0.9952606856822968, "num_tokens": 2386876351.0, "step": 5616 }, { "entropy": 0.01787138171494007, "epoch": 2.4582558266768793, "grad_norm": 6.25, "learning_rate": 1.5725500812127777e-05, "loss": 0.1095, "loss_lm": 0.017440032679587603, "loss_seg": 0.09209520742297173, "mean_token_accuracy": 0.9953229576349258, "num_tokens": 2387301360.0, "step": 5617 }, { "entropy": 0.01776548381894827, "epoch": 2.45869351132509, "grad_norm": 6.71875, "learning_rate": 1.572279371954521e-05, "loss": 0.0842, "loss_lm": 0.01658448250964284, "loss_seg": 0.0676489258185029, "mean_token_accuracy": 0.9953284561634064, "num_tokens": 2387726219.0, "step": 5618 }, { "entropy": 0.018368832767009735, "epoch": 2.4591311959733013, "grad_norm": 4.5625, "learning_rate": 1.572008662696264e-05, "loss": 0.1344, "loss_lm": 0.016643875744193792, "loss_seg": 0.11773020029067993, "mean_token_accuracy": 0.9952022433280945, "num_tokens": 2388151579.0, "step": 5619 }, { "entropy": 0.018329203128814697, "epoch": 2.459568880621512, "grad_norm": 3.734375, "learning_rate": 1.5717379534380075e-05, "loss": 0.0812, "loss_lm": 0.014621452428400517, "loss_seg": 0.06660986132919788, "mean_token_accuracy": 0.9951758086681366, "num_tokens": 2388576289.0, "step": 5620 }, { "entropy": 0.018242950085550547, "epoch": 2.4600065652697234, "grad_norm": 4.40625, "learning_rate": 1.571467244179751e-05, "loss": 0.0923, "loss_lm": 0.01672342326492071, "loss_seg": 0.07556955981999636, "mean_token_accuracy": 0.9951834231615067, "num_tokens": 2389001232.0, "step": 5621 }, { "entropy": 0.01798974070698023, "epoch": 2.460444249917934, "grad_norm": 33.25, "learning_rate": 1.5711965349214942e-05, "loss": 0.1346, "loss_lm": 0.01525305979885161, "loss_seg": 0.11938743107020855, "mean_token_accuracy": 0.9953529089689255, "num_tokens": 2389426193.0, "step": 5622 }, { "entropy": 0.018743427004665136, "epoch": 2.460881934566145, "grad_norm": 8.6875, "learning_rate": 1.570925825663238e-05, "loss": 0.1068, "loss_lm": 0.017339745303615928, "loss_seg": 0.0894534531980753, "mean_token_accuracy": 0.9950438588857651, "num_tokens": 2389851068.0, "step": 5623 }, { "entropy": 0.01779376110062003, "epoch": 2.461319619214356, "grad_norm": 14.75, "learning_rate": 1.570655116404981e-05, "loss": 0.0875, "loss_lm": 0.01530168647877872, "loss_seg": 0.07215462997555733, "mean_token_accuracy": 0.9952412843704224, "num_tokens": 2390276576.0, "step": 5624 }, { "entropy": 0.0190219865180552, "epoch": 2.461757303862567, "grad_norm": 30.125, "learning_rate": 1.5703844071467243e-05, "loss": 0.1072, "loss_lm": 0.019515224266797304, "loss_seg": 0.08766148332506418, "mean_token_accuracy": 0.994951993227005, "num_tokens": 2390701276.0, "step": 5625 }, { "entropy": 0.018411225639283657, "epoch": 2.462194988510778, "grad_norm": 3.125, "learning_rate": 1.5701136978884677e-05, "loss": 0.1002, "loss_lm": 0.01578619168139994, "loss_seg": 0.084425900131464, "mean_token_accuracy": 0.9952842891216278, "num_tokens": 2391127041.0, "step": 5626 }, { "entropy": 0.017836515326052904, "epoch": 2.462632673158989, "grad_norm": 6.96875, "learning_rate": 1.569842988630211e-05, "loss": 0.1136, "loss_lm": 0.01849942863918841, "loss_seg": 0.09510551206767559, "mean_token_accuracy": 0.9952945113182068, "num_tokens": 2391552229.0, "step": 5627 }, { "entropy": 0.0181667348369956, "epoch": 2.4630703578072, "grad_norm": 5.40625, "learning_rate": 1.5695722793719548e-05, "loss": 0.1138, "loss_lm": 0.016511325258761644, "loss_seg": 0.09730620309710503, "mean_token_accuracy": 0.9952534586191177, "num_tokens": 2391977970.0, "step": 5628 }, { "entropy": 0.018353035673499107, "epoch": 2.4635080424554108, "grad_norm": 2.671875, "learning_rate": 1.569301570113698e-05, "loss": 0.1093, "loss_lm": 0.015445806318894029, "loss_seg": 0.09385151788592339, "mean_token_accuracy": 0.9951834082603455, "num_tokens": 2392403516.0, "step": 5629 }, { "entropy": 0.0182526339776814, "epoch": 2.463945727103622, "grad_norm": 3.84375, "learning_rate": 1.5690308608554412e-05, "loss": 0.1091, "loss_lm": 0.016753410454839468, "loss_seg": 0.0923897884786129, "mean_token_accuracy": 0.9953002482652664, "num_tokens": 2392827950.0, "step": 5630 }, { "entropy": 0.018279651179909706, "epoch": 2.464383411751833, "grad_norm": 6.125, "learning_rate": 1.5687601515971846e-05, "loss": 0.1279, "loss_lm": 0.014998447382822633, "loss_seg": 0.11289987899363041, "mean_token_accuracy": 0.9952363967895508, "num_tokens": 2393253010.0, "step": 5631 }, { "entropy": 0.018058504443615675, "epoch": 2.4648210964000437, "grad_norm": 6.21875, "learning_rate": 1.568489442338928e-05, "loss": 0.1152, "loss_lm": 0.017927416134625673, "loss_seg": 0.09727667085826397, "mean_token_accuracy": 0.9952109903097153, "num_tokens": 2393678123.0, "step": 5632 }, { "entropy": 0.018703029491007328, "epoch": 2.465258781048255, "grad_norm": 15.5625, "learning_rate": 1.5682187330806717e-05, "loss": 0.107, "loss_lm": 0.016845798352733254, "loss_seg": 0.09013867750763893, "mean_token_accuracy": 0.995146319270134, "num_tokens": 2394102909.0, "step": 5633 }, { "entropy": 0.01842250069603324, "epoch": 2.4656964656964657, "grad_norm": 4.375, "learning_rate": 1.5679480238224147e-05, "loss": 0.1038, "loss_lm": 0.017857869155704975, "loss_seg": 0.08592638839036226, "mean_token_accuracy": 0.9951876699924469, "num_tokens": 2394528623.0, "step": 5634 }, { "entropy": 0.01889007305726409, "epoch": 2.4661341503446765, "grad_norm": 18.0, "learning_rate": 1.567677314564158e-05, "loss": 0.1066, "loss_lm": 0.016893070191144943, "loss_seg": 0.08974706940352917, "mean_token_accuracy": 0.995039314031601, "num_tokens": 2394953544.0, "step": 5635 }, { "entropy": 0.018200531136244535, "epoch": 2.466571834992888, "grad_norm": 28.25, "learning_rate": 1.5674066053059014e-05, "loss": 0.0822, "loss_lm": 0.01708029699511826, "loss_seg": 0.06514989770948887, "mean_token_accuracy": 0.9952346682548523, "num_tokens": 2395378864.0, "step": 5636 }, { "entropy": 0.018403242342174053, "epoch": 2.4670095196410986, "grad_norm": 5.71875, "learning_rate": 1.5671358960476448e-05, "loss": 0.1193, "loss_lm": 0.018372457241639495, "loss_seg": 0.10092609003186226, "mean_token_accuracy": 0.995238333940506, "num_tokens": 2395804324.0, "step": 5637 }, { "entropy": 0.017839674372226, "epoch": 2.4674472042893094, "grad_norm": 5.8125, "learning_rate": 1.5668651867893885e-05, "loss": 0.0767, "loss_lm": 0.01633194461464882, "loss_seg": 0.060332744382321835, "mean_token_accuracy": 0.9952573031187057, "num_tokens": 2396228886.0, "step": 5638 }, { "entropy": 0.017655343282967806, "epoch": 2.4678848889375207, "grad_norm": 4.5, "learning_rate": 1.5665944775311316e-05, "loss": 0.1343, "loss_lm": 0.013564115623012185, "loss_seg": 0.12072565034031868, "mean_token_accuracy": 0.9952805638313293, "num_tokens": 2396653525.0, "step": 5639 }, { "entropy": 0.018092582002282143, "epoch": 2.4683225735857315, "grad_norm": 13.25, "learning_rate": 1.566323768272875e-05, "loss": 0.0968, "loss_lm": 0.015607237815856934, "loss_seg": 0.08121956698596478, "mean_token_accuracy": 0.9952007085084915, "num_tokens": 2397078770.0, "step": 5640 }, { "entropy": 0.018286716658622026, "epoch": 2.4687602582339423, "grad_norm": 10.125, "learning_rate": 1.5660530590146183e-05, "loss": 0.0918, "loss_lm": 0.014675312442705035, "loss_seg": 0.07711564004421234, "mean_token_accuracy": 0.9951851814985275, "num_tokens": 2397503464.0, "step": 5641 }, { "entropy": 0.018258532974869013, "epoch": 2.4691979428821536, "grad_norm": 4.53125, "learning_rate": 1.5657823497563617e-05, "loss": 0.0918, "loss_lm": 0.014240857446566224, "loss_seg": 0.07760733272880316, "mean_token_accuracy": 0.9951872676610947, "num_tokens": 2397927691.0, "step": 5642 }, { "entropy": 0.017907521221786737, "epoch": 2.4696356275303644, "grad_norm": 6.125, "learning_rate": 1.565511640498105e-05, "loss": 0.1012, "loss_lm": 0.01490857545286417, "loss_seg": 0.08628500625491142, "mean_token_accuracy": 0.9953015744686127, "num_tokens": 2398352686.0, "step": 5643 }, { "entropy": 0.018381050322204828, "epoch": 2.470073312178575, "grad_norm": 3.703125, "learning_rate": 1.5652409312398484e-05, "loss": 0.1012, "loss_lm": 0.01596619631163776, "loss_seg": 0.08523476123809814, "mean_token_accuracy": 0.9952234774827957, "num_tokens": 2398777437.0, "step": 5644 }, { "entropy": 0.018528491258621216, "epoch": 2.4705109968267864, "grad_norm": 27.375, "learning_rate": 1.5649702219815918e-05, "loss": 0.1036, "loss_lm": 0.016826794017106295, "loss_seg": 0.08679857105016708, "mean_token_accuracy": 0.9951712936162949, "num_tokens": 2399202569.0, "step": 5645 }, { "entropy": 0.01837400160729885, "epoch": 2.4709486814749972, "grad_norm": 7.125, "learning_rate": 1.5646995127233352e-05, "loss": 0.0875, "loss_lm": 0.0171437521930784, "loss_seg": 0.07032061275094748, "mean_token_accuracy": 0.9952659010887146, "num_tokens": 2399627943.0, "step": 5646 }, { "entropy": 0.017653637565672398, "epoch": 2.471386366123208, "grad_norm": 4.4375, "learning_rate": 1.5644288034650785e-05, "loss": 0.1022, "loss_lm": 0.014695422258228064, "loss_seg": 0.08749369531869888, "mean_token_accuracy": 0.9954769909381866, "num_tokens": 2400052769.0, "step": 5647 }, { "entropy": 0.017974598333239555, "epoch": 2.4718240507714193, "grad_norm": 7.3125, "learning_rate": 1.564158094206822e-05, "loss": 0.1124, "loss_lm": 0.01803410705178976, "loss_seg": 0.09432460181415081, "mean_token_accuracy": 0.9952291548252106, "num_tokens": 2400478024.0, "step": 5648 }, { "entropy": 0.0176385510712862, "epoch": 2.47226173541963, "grad_norm": 30.75, "learning_rate": 1.5638873849485653e-05, "loss": 0.1083, "loss_lm": 0.014875547727569938, "loss_seg": 0.09337904676795006, "mean_token_accuracy": 0.995402380824089, "num_tokens": 2400903640.0, "step": 5649 }, { "entropy": 0.01809215359389782, "epoch": 2.472699420067841, "grad_norm": 7.34375, "learning_rate": 1.5636166756903087e-05, "loss": 0.139, "loss_lm": 0.017065038671717048, "loss_seg": 0.12196101993322372, "mean_token_accuracy": 0.9951996356248856, "num_tokens": 2401328582.0, "step": 5650 }, { "entropy": 0.018548505380749702, "epoch": 2.473137104716052, "grad_norm": 9.9375, "learning_rate": 1.563345966432052e-05, "loss": 0.0899, "loss_lm": 0.01569065428338945, "loss_seg": 0.07420011702924967, "mean_token_accuracy": 0.9951884299516678, "num_tokens": 2401753961.0, "step": 5651 }, { "entropy": 0.01827836176380515, "epoch": 2.473574789364263, "grad_norm": 2.265625, "learning_rate": 1.5630752571737954e-05, "loss": 0.0685, "loss_lm": 0.015637958887964487, "loss_seg": 0.05285900458693504, "mean_token_accuracy": 0.9952253252267838, "num_tokens": 2402178695.0, "step": 5652 }, { "entropy": 0.018275615759193897, "epoch": 2.474012474012474, "grad_norm": 3.640625, "learning_rate": 1.5628045479155388e-05, "loss": 0.1209, "loss_lm": 0.01571368216536939, "loss_seg": 0.10516800731420517, "mean_token_accuracy": 0.9951616525650024, "num_tokens": 2402603389.0, "step": 5653 }, { "entropy": 0.017943591345101595, "epoch": 2.474450158660685, "grad_norm": 4.90625, "learning_rate": 1.562533838657282e-05, "loss": 0.0914, "loss_lm": 0.015163276577368379, "loss_seg": 0.07622303627431393, "mean_token_accuracy": 0.9952869564294815, "num_tokens": 2403028539.0, "step": 5654 }, { "entropy": 0.018336943350732327, "epoch": 2.474887843308896, "grad_norm": 5.65625, "learning_rate": 1.5622631293990255e-05, "loss": 0.1127, "loss_lm": 0.015148615930229425, "loss_seg": 0.09754126891493797, "mean_token_accuracy": 0.9952671974897385, "num_tokens": 2403453830.0, "step": 5655 }, { "entropy": 0.01830566907301545, "epoch": 2.475325527957107, "grad_norm": 7.96875, "learning_rate": 1.561992420140769e-05, "loss": 0.1018, "loss_lm": 0.015575150959193707, "loss_seg": 0.08625908941030502, "mean_token_accuracy": 0.9952139258384705, "num_tokens": 2403878325.0, "step": 5656 }, { "entropy": 0.01824756385758519, "epoch": 2.475763212605318, "grad_norm": 3.359375, "learning_rate": 1.5617217108825123e-05, "loss": 0.0941, "loss_lm": 0.014216762967407703, "loss_seg": 0.07984195649623871, "mean_token_accuracy": 0.9952036291360855, "num_tokens": 2404303600.0, "step": 5657 }, { "entropy": 0.017742816358804703, "epoch": 2.4762008972535288, "grad_norm": 3.25, "learning_rate": 1.5614510016242553e-05, "loss": 0.0856, "loss_lm": 0.0140785020776093, "loss_seg": 0.07151117827743292, "mean_token_accuracy": 0.9951991885900497, "num_tokens": 2404728082.0, "step": 5658 }, { "entropy": 0.018002149648964405, "epoch": 2.4766385819017396, "grad_norm": 43.0, "learning_rate": 1.561180292365999e-05, "loss": 0.121, "loss_lm": 0.01623008493334055, "loss_seg": 0.1047661304473877, "mean_token_accuracy": 0.995219886302948, "num_tokens": 2405152734.0, "step": 5659 }, { "entropy": 0.017809695564210415, "epoch": 2.477076266549951, "grad_norm": 3.796875, "learning_rate": 1.5609095831077424e-05, "loss": 0.0761, "loss_lm": 0.017075834795832634, "loss_seg": 0.05900317244231701, "mean_token_accuracy": 0.99542136490345, "num_tokens": 2405577922.0, "step": 5660 }, { "entropy": 0.0178902936168015, "epoch": 2.4775139511981616, "grad_norm": 6.4375, "learning_rate": 1.5606388738494858e-05, "loss": 0.1242, "loss_lm": 0.016119320411235094, "loss_seg": 0.1080613462254405, "mean_token_accuracy": 0.9953795671463013, "num_tokens": 2406002933.0, "step": 5661 }, { "entropy": 0.017596316523849964, "epoch": 2.477951635846373, "grad_norm": 8.1875, "learning_rate": 1.560368164591229e-05, "loss": 0.1057, "loss_lm": 0.016022179974243045, "loss_seg": 0.08969057910144329, "mean_token_accuracy": 0.9954370260238647, "num_tokens": 2406427629.0, "step": 5662 }, { "entropy": 0.018067029770463705, "epoch": 2.4783893204945837, "grad_norm": 11.8125, "learning_rate": 1.560097455332972e-05, "loss": 0.1421, "loss_lm": 0.014292655512690544, "loss_seg": 0.12780072540044785, "mean_token_accuracy": 0.9952716678380966, "num_tokens": 2406853546.0, "step": 5663 }, { "entropy": 0.018025856465101242, "epoch": 2.4788270051427945, "grad_norm": 4.15625, "learning_rate": 1.559826746074716e-05, "loss": 0.0848, "loss_lm": 0.015444984892383218, "loss_seg": 0.06933949049562216, "mean_token_accuracy": 0.9952256083488464, "num_tokens": 2407278063.0, "step": 5664 }, { "entropy": 0.018038876354694366, "epoch": 2.4792646897910053, "grad_norm": 9.375, "learning_rate": 1.5595560368164593e-05, "loss": 0.1209, "loss_lm": 0.016866389429196715, "loss_seg": 0.10400271415710449, "mean_token_accuracy": 0.9952275604009628, "num_tokens": 2407703572.0, "step": 5665 }, { "entropy": 0.01782863773405552, "epoch": 2.4797023744392166, "grad_norm": 22.875, "learning_rate": 1.5592853275582026e-05, "loss": 0.0821, "loss_lm": 0.017416260903701186, "loss_seg": 0.06468174792826176, "mean_token_accuracy": 0.9954133480787277, "num_tokens": 2408128895.0, "step": 5666 }, { "entropy": 0.018307148478925228, "epoch": 2.4801400590874274, "grad_norm": 3.296875, "learning_rate": 1.5590146182999457e-05, "loss": 0.0866, "loss_lm": 0.014264608034864068, "loss_seg": 0.072361059486866, "mean_token_accuracy": 0.9952431619167328, "num_tokens": 2408554158.0, "step": 5667 }, { "entropy": 0.017871718853712082, "epoch": 2.4805777437356387, "grad_norm": 5.5, "learning_rate": 1.558743909041689e-05, "loss": 0.1115, "loss_lm": 0.01653144578449428, "loss_seg": 0.09496923349797726, "mean_token_accuracy": 0.9952054023742676, "num_tokens": 2408978512.0, "step": 5668 }, { "entropy": 0.017667448613792658, "epoch": 2.4810154283838495, "grad_norm": 10.9375, "learning_rate": 1.5584731997834327e-05, "loss": 0.1151, "loss_lm": 0.014660857617855072, "loss_seg": 0.10046474449336529, "mean_token_accuracy": 0.9953339248895645, "num_tokens": 2409403029.0, "step": 5669 }, { "entropy": 0.01862849248573184, "epoch": 2.4814531130320603, "grad_norm": 5.9375, "learning_rate": 1.558202490525176e-05, "loss": 0.1331, "loss_lm": 0.01986662414856255, "loss_seg": 0.11326897796243429, "mean_token_accuracy": 0.9950332194566727, "num_tokens": 2409828626.0, "step": 5670 }, { "entropy": 0.017445706762373447, "epoch": 2.4818907976802715, "grad_norm": 8.6875, "learning_rate": 1.5579317812669195e-05, "loss": 0.1005, "loss_lm": 0.016210254514589906, "loss_seg": 0.08432150073349476, "mean_token_accuracy": 0.9955320060253143, "num_tokens": 2410253372.0, "step": 5671 }, { "entropy": 0.01819088915362954, "epoch": 2.4823284823284824, "grad_norm": 7.75, "learning_rate": 1.5576610720086625e-05, "loss": 0.1256, "loss_lm": 0.017267611576244235, "loss_seg": 0.10833937767893076, "mean_token_accuracy": 0.9952377676963806, "num_tokens": 2410678559.0, "step": 5672 }, { "entropy": 0.018058007583022118, "epoch": 2.482766166976693, "grad_norm": 2.90625, "learning_rate": 1.557390362750406e-05, "loss": 0.0913, "loss_lm": 0.014914413448423147, "loss_seg": 0.07635098323225975, "mean_token_accuracy": 0.9952128678560257, "num_tokens": 2411103820.0, "step": 5673 }, { "entropy": 0.017913597635924816, "epoch": 2.4832038516249044, "grad_norm": 6.15625, "learning_rate": 1.5571196534921496e-05, "loss": 0.0894, "loss_lm": 0.016234717797487974, "loss_seg": 0.07314636372029781, "mean_token_accuracy": 0.9952704757452011, "num_tokens": 2411528802.0, "step": 5674 }, { "entropy": 0.01908569736406207, "epoch": 2.4836415362731152, "grad_norm": 18.875, "learning_rate": 1.556848944233893e-05, "loss": 0.1009, "loss_lm": 0.016051715007051826, "loss_seg": 0.08485978841781616, "mean_token_accuracy": 0.9949693083763123, "num_tokens": 2411954174.0, "step": 5675 }, { "entropy": 0.018340067006647587, "epoch": 2.484079220921326, "grad_norm": 6.1875, "learning_rate": 1.5565782349756364e-05, "loss": 0.1259, "loss_lm": 0.01646155444905162, "loss_seg": 0.10940785333514214, "mean_token_accuracy": 0.9952019155025482, "num_tokens": 2412379519.0, "step": 5676 }, { "entropy": 0.018234352115541697, "epoch": 2.4845169055695373, "grad_norm": 4.6875, "learning_rate": 1.5563075257173794e-05, "loss": 0.1162, "loss_lm": 0.016582884825766087, "loss_seg": 0.0995908323675394, "mean_token_accuracy": 0.9952222257852554, "num_tokens": 2412804578.0, "step": 5677 }, { "entropy": 0.0185968573205173, "epoch": 2.484954590217748, "grad_norm": 4.9375, "learning_rate": 1.5560368164591228e-05, "loss": 0.1123, "loss_lm": 0.01572448736988008, "loss_seg": 0.09660603292286396, "mean_token_accuracy": 0.9951428323984146, "num_tokens": 2413230090.0, "step": 5678 }, { "entropy": 0.018412213306874037, "epoch": 2.485392274865959, "grad_norm": 8.9375, "learning_rate": 1.5557661072008665e-05, "loss": 0.1495, "loss_lm": 0.01580429682508111, "loss_seg": 0.13373718783259392, "mean_token_accuracy": 0.9951645135879517, "num_tokens": 2413654826.0, "step": 5679 }, { "entropy": 0.018426166847348213, "epoch": 2.48582995951417, "grad_norm": 20.125, "learning_rate": 1.55549539794261e-05, "loss": 0.1205, "loss_lm": 0.017695327987894416, "loss_seg": 0.10282330960035324, "mean_token_accuracy": 0.9951913803815842, "num_tokens": 2414079645.0, "step": 5680 }, { "entropy": 0.018203494604676962, "epoch": 2.486267644162381, "grad_norm": 5.25, "learning_rate": 1.5552246886843532e-05, "loss": 0.1099, "loss_lm": 0.015797686064615846, "loss_seg": 0.09407283179461956, "mean_token_accuracy": 0.9953153431415558, "num_tokens": 2414505038.0, "step": 5681 }, { "entropy": 0.018528359476476908, "epoch": 2.486705328810592, "grad_norm": 10.875, "learning_rate": 1.5549539794260962e-05, "loss": 0.1509, "loss_lm": 0.01620710431598127, "loss_seg": 0.13468124717473984, "mean_token_accuracy": 0.9953391551971436, "num_tokens": 2414930224.0, "step": 5682 }, { "entropy": 0.019044721499085426, "epoch": 2.487143013458803, "grad_norm": 15.25, "learning_rate": 1.5546832701678396e-05, "loss": 0.1753, "loss_lm": 0.019608091097325087, "loss_seg": 0.15572731010615826, "mean_token_accuracy": 0.9952336996793747, "num_tokens": 2415355675.0, "step": 5683 }, { "entropy": 0.01804231619462371, "epoch": 2.487580698107014, "grad_norm": 5.21875, "learning_rate": 1.5544125609095833e-05, "loss": 0.1298, "loss_lm": 0.018233639420941472, "loss_seg": 0.11157174780964851, "mean_token_accuracy": 0.9953663945198059, "num_tokens": 2415781430.0, "step": 5684 }, { "entropy": 0.01837749034166336, "epoch": 2.4880183827552247, "grad_norm": 2.890625, "learning_rate": 1.5541418516513267e-05, "loss": 0.1217, "loss_lm": 0.01760938437655568, "loss_seg": 0.1041165143251419, "mean_token_accuracy": 0.9951885640621185, "num_tokens": 2416207246.0, "step": 5685 }, { "entropy": 0.018081361427903175, "epoch": 2.488456067403436, "grad_norm": 3.84375, "learning_rate": 1.55387114239307e-05, "loss": 0.0882, "loss_lm": 0.016612218460068107, "loss_seg": 0.07159338984638453, "mean_token_accuracy": 0.9953148066997528, "num_tokens": 2416632153.0, "step": 5686 }, { "entropy": 0.017789770383387804, "epoch": 2.4888937520516468, "grad_norm": 10.5, "learning_rate": 1.553600433134813e-05, "loss": 0.1214, "loss_lm": 0.014560202369466424, "loss_seg": 0.10688950307667255, "mean_token_accuracy": 0.9953565448522568, "num_tokens": 2417056553.0, "step": 5687 }, { "entropy": 0.01803019130602479, "epoch": 2.4893314366998576, "grad_norm": 15.3125, "learning_rate": 1.5533297238765565e-05, "loss": 0.1046, "loss_lm": 0.015451253857463598, "loss_seg": 0.08910511992871761, "mean_token_accuracy": 0.9952123165130615, "num_tokens": 2417481988.0, "step": 5688 }, { "entropy": 0.018367567099630833, "epoch": 2.489769121348069, "grad_norm": 31.875, "learning_rate": 1.5530590146183e-05, "loss": 0.1301, "loss_lm": 0.015819995431229472, "loss_seg": 0.11425191536545753, "mean_token_accuracy": 0.9951485246419907, "num_tokens": 2417906776.0, "step": 5689 }, { "entropy": 0.017340447288006544, "epoch": 2.4902068059962796, "grad_norm": 18.125, "learning_rate": 1.5527883053600436e-05, "loss": 0.0932, "loss_lm": 0.014429546426981688, "loss_seg": 0.07875922229140997, "mean_token_accuracy": 0.9954682290554047, "num_tokens": 2418331699.0, "step": 5690 }, { "entropy": 0.018482657615095377, "epoch": 2.490644490644491, "grad_norm": 6.1875, "learning_rate": 1.5525175961017866e-05, "loss": 0.1342, "loss_lm": 0.015318822581321001, "loss_seg": 0.11883937940001488, "mean_token_accuracy": 0.9951846301555634, "num_tokens": 2418757174.0, "step": 5691 }, { "entropy": 0.018365698400884867, "epoch": 2.4910821752927017, "grad_norm": 3.78125, "learning_rate": 1.55224688684353e-05, "loss": 0.1029, "loss_lm": 0.01531587541103363, "loss_seg": 0.08756931498646736, "mean_token_accuracy": 0.9953029155731201, "num_tokens": 2419182689.0, "step": 5692 }, { "entropy": 0.01866115489974618, "epoch": 2.4915198599409125, "grad_norm": 8.3125, "learning_rate": 1.5519761775852733e-05, "loss": 0.1036, "loss_lm": 0.01655185571871698, "loss_seg": 0.08701275568455458, "mean_token_accuracy": 0.9951642006635666, "num_tokens": 2419607208.0, "step": 5693 }, { "entropy": 0.018355296459048986, "epoch": 2.4919575445891233, "grad_norm": 7.0, "learning_rate": 1.5517054683270167e-05, "loss": 0.1046, "loss_lm": 0.017337794415652752, "loss_seg": 0.08727500401437283, "mean_token_accuracy": 0.9951901137828827, "num_tokens": 2420032559.0, "step": 5694 }, { "entropy": 0.01828869804739952, "epoch": 2.4923952292373346, "grad_norm": 4.9375, "learning_rate": 1.5514347590687604e-05, "loss": 0.1049, "loss_lm": 0.015522608766332269, "loss_seg": 0.08932943549007177, "mean_token_accuracy": 0.9952125698328018, "num_tokens": 2420456963.0, "step": 5695 }, { "entropy": 0.018396885599941015, "epoch": 2.4928329138855454, "grad_norm": 4.78125, "learning_rate": 1.5511640498105035e-05, "loss": 0.0965, "loss_lm": 0.013996465597301722, "loss_seg": 0.08246415760368109, "mean_token_accuracy": 0.9952047169208527, "num_tokens": 2420882130.0, "step": 5696 }, { "entropy": 0.018707923591136932, "epoch": 2.4932705985337567, "grad_norm": 18.5, "learning_rate": 1.550893340552247e-05, "loss": 0.0985, "loss_lm": 0.014744368614628911, "loss_seg": 0.0837479867041111, "mean_token_accuracy": 0.9950757175683975, "num_tokens": 2421306526.0, "step": 5697 }, { "entropy": 0.018137704115360975, "epoch": 2.4937082831819675, "grad_norm": 5.03125, "learning_rate": 1.5506226312939902e-05, "loss": 0.1183, "loss_lm": 0.01796001009643078, "loss_seg": 0.1003355048596859, "mean_token_accuracy": 0.9952932447195053, "num_tokens": 2421731633.0, "step": 5698 }, { "entropy": 0.017726081889122725, "epoch": 2.4941459678301783, "grad_norm": 4.40625, "learning_rate": 1.5503519220357336e-05, "loss": 0.0754, "loss_lm": 0.013848147355020046, "loss_seg": 0.061597405932843685, "mean_token_accuracy": 0.995304673910141, "num_tokens": 2422156899.0, "step": 5699 }, { "entropy": 0.018029159866273403, "epoch": 2.494583652478389, "grad_norm": 4.28125, "learning_rate": 1.5500812127774773e-05, "loss": 0.0775, "loss_lm": 0.014516381081193686, "loss_seg": 0.06296585779637098, "mean_token_accuracy": 0.9953566044569016, "num_tokens": 2422581757.0, "step": 5700 }, { "entropy": 0.018652798142284155, "epoch": 2.4950213371266003, "grad_norm": 8.0625, "learning_rate": 1.5498105035192203e-05, "loss": 0.0902, "loss_lm": 0.017612125957384706, "loss_seg": 0.07259842660278082, "mean_token_accuracy": 0.9950548410415649, "num_tokens": 2423006556.0, "step": 5701 }, { "entropy": 0.01829193951562047, "epoch": 2.495459021774811, "grad_norm": 16.75, "learning_rate": 1.5495397942609637e-05, "loss": 0.1081, "loss_lm": 0.01623126631602645, "loss_seg": 0.09184826724231243, "mean_token_accuracy": 0.9951156228780746, "num_tokens": 2423431760.0, "step": 5702 }, { "entropy": 0.018352558370679617, "epoch": 2.4958967064230224, "grad_norm": 8.375, "learning_rate": 1.549269085002707e-05, "loss": 0.1185, "loss_lm": 0.016456318320706487, "loss_seg": 0.10201091226190329, "mean_token_accuracy": 0.9952512830495834, "num_tokens": 2423857294.0, "step": 5703 }, { "entropy": 0.018683031667023897, "epoch": 2.4963343910712332, "grad_norm": 12.625, "learning_rate": 1.5489983757444504e-05, "loss": 0.1102, "loss_lm": 0.01635341253131628, "loss_seg": 0.093813207000494, "mean_token_accuracy": 0.9951029568910599, "num_tokens": 2424283233.0, "step": 5704 }, { "entropy": 0.01851035561412573, "epoch": 2.496772075719444, "grad_norm": 10.4375, "learning_rate": 1.548727666486194e-05, "loss": 0.0919, "loss_lm": 0.01679628877900541, "loss_seg": 0.07508604880422354, "mean_token_accuracy": 0.9951201230287552, "num_tokens": 2424708631.0, "step": 5705 }, { "entropy": 0.018344266340136528, "epoch": 2.4972097603676553, "grad_norm": 6.125, "learning_rate": 1.5484569572279372e-05, "loss": 0.0902, "loss_lm": 0.015715391375124454, "loss_seg": 0.07446734420955181, "mean_token_accuracy": 0.9951990395784378, "num_tokens": 2425134344.0, "step": 5706 }, { "entropy": 0.01842627488076687, "epoch": 2.497647445015866, "grad_norm": 4.6875, "learning_rate": 1.5481862479696806e-05, "loss": 0.1114, "loss_lm": 0.016025462187826633, "loss_seg": 0.0953897600993514, "mean_token_accuracy": 0.9950565248727798, "num_tokens": 2425559528.0, "step": 5707 }, { "entropy": 0.01824982836842537, "epoch": 2.498085129664077, "grad_norm": 3.90625, "learning_rate": 1.547915538711424e-05, "loss": 0.1119, "loss_lm": 0.017253113677725196, "loss_seg": 0.09469034150242805, "mean_token_accuracy": 0.9953040033578873, "num_tokens": 2425984966.0, "step": 5708 }, { "entropy": 0.018631770741194487, "epoch": 2.498522814312288, "grad_norm": 4.0625, "learning_rate": 1.5476448294531673e-05, "loss": 0.1011, "loss_lm": 0.01816625241190195, "loss_seg": 0.08297648746520281, "mean_token_accuracy": 0.9950977116823196, "num_tokens": 2426410035.0, "step": 5709 }, { "entropy": 0.018599758855998516, "epoch": 2.498960498960499, "grad_norm": 4.90625, "learning_rate": 1.547374120194911e-05, "loss": 0.1104, "loss_lm": 0.017260388238355517, "loss_seg": 0.09313046373426914, "mean_token_accuracy": 0.9951757937669754, "num_tokens": 2426835154.0, "step": 5710 }, { "entropy": 0.018597937654703856, "epoch": 2.49939818360871, "grad_norm": 10.625, "learning_rate": 1.547103410936654e-05, "loss": 0.1155, "loss_lm": 0.014895550673827529, "loss_seg": 0.10058298707008362, "mean_token_accuracy": 0.9951498210430145, "num_tokens": 2427260258.0, "step": 5711 }, { "entropy": 0.018703014589846134, "epoch": 2.499835868256921, "grad_norm": 15.8125, "learning_rate": 1.5468327016783974e-05, "loss": 0.1174, "loss_lm": 0.015732796164229512, "loss_seg": 0.10164421983063221, "mean_token_accuracy": 0.995141476392746, "num_tokens": 2427685618.0, "step": 5712 }, { "entropy": 0.018004970625042915, "epoch": 2.500273552905132, "grad_norm": 5.34375, "learning_rate": 1.5465619924201408e-05, "loss": 0.1038, "loss_lm": 0.013087135506793857, "loss_seg": 0.09066347498446703, "mean_token_accuracy": 0.9952428191900253, "num_tokens": 2428110410.0, "step": 5713 }, { "entropy": 0.018412225414067507, "epoch": 2.5007112375533427, "grad_norm": 9.0, "learning_rate": 1.5462912831618842e-05, "loss": 0.077, "loss_lm": 0.015341351507231593, "loss_seg": 0.061680326238274574, "mean_token_accuracy": 0.9952364414930344, "num_tokens": 2428535682.0, "step": 5714 }, { "entropy": 0.018527277279645205, "epoch": 2.501148922201554, "grad_norm": 3.359375, "learning_rate": 1.5460205739036275e-05, "loss": 0.144, "loss_lm": 0.018023224780336022, "loss_seg": 0.12599289137870073, "mean_token_accuracy": 0.9950876533985138, "num_tokens": 2428961115.0, "step": 5715 }, { "entropy": 0.01799956988543272, "epoch": 2.5015866068497647, "grad_norm": 7.4375, "learning_rate": 1.545749864645371e-05, "loss": 0.1036, "loss_lm": 0.013839867664501071, "loss_seg": 0.08978620544075966, "mean_token_accuracy": 0.9952382892370224, "num_tokens": 2429386340.0, "step": 5716 }, { "entropy": 0.019367337226867676, "epoch": 2.5020242914979756, "grad_norm": 8.625, "learning_rate": 1.5454791553871143e-05, "loss": 0.112, "loss_lm": 0.017669773194938898, "loss_seg": 0.09435392357409, "mean_token_accuracy": 0.9949160218238831, "num_tokens": 2429811582.0, "step": 5717 }, { "entropy": 0.01773510640487075, "epoch": 2.502461976146187, "grad_norm": 3.875, "learning_rate": 1.5452084461288577e-05, "loss": 0.1124, "loss_lm": 0.01645858772099018, "loss_seg": 0.09590332582592964, "mean_token_accuracy": 0.9953162968158722, "num_tokens": 2430236183.0, "step": 5718 }, { "entropy": 0.018155931029468775, "epoch": 2.5028996607943976, "grad_norm": 3.625, "learning_rate": 1.544937736870601e-05, "loss": 0.097, "loss_lm": 0.01688490668311715, "loss_seg": 0.08007012121379375, "mean_token_accuracy": 0.9952387809753418, "num_tokens": 2430661645.0, "step": 5719 }, { "entropy": 0.01875182194635272, "epoch": 2.5033373454426084, "grad_norm": 7.4375, "learning_rate": 1.5446670276123444e-05, "loss": 0.0792, "loss_lm": 0.016875343397259712, "loss_seg": 0.06228668428957462, "mean_token_accuracy": 0.9950864464044571, "num_tokens": 2431087785.0, "step": 5720 }, { "entropy": 0.018262286670506, "epoch": 2.5037750300908197, "grad_norm": 7.53125, "learning_rate": 1.5443963183540878e-05, "loss": 0.0774, "loss_lm": 0.01585586927831173, "loss_seg": 0.06159243267029524, "mean_token_accuracy": 0.99515600502491, "num_tokens": 2431512603.0, "step": 5721 }, { "entropy": 0.018042526207864285, "epoch": 2.5042127147390305, "grad_norm": 3.859375, "learning_rate": 1.544125609095831e-05, "loss": 0.0851, "loss_lm": 0.01658006850630045, "loss_seg": 0.06847984436899424, "mean_token_accuracy": 0.9952912777662277, "num_tokens": 2431937309.0, "step": 5722 }, { "entropy": 0.018103402573615313, "epoch": 2.5046503993872413, "grad_norm": 5.125, "learning_rate": 1.5438548998375745e-05, "loss": 0.1144, "loss_lm": 0.015111127402633429, "loss_seg": 0.0993258636444807, "mean_token_accuracy": 0.9952518790960312, "num_tokens": 2432362252.0, "step": 5723 }, { "entropy": 0.018917038571089506, "epoch": 2.5050880840354526, "grad_norm": 2.75, "learning_rate": 1.543584190579318e-05, "loss": 0.1009, "loss_lm": 0.015839830273762345, "loss_seg": 0.08504952676594257, "mean_token_accuracy": 0.9951656609773636, "num_tokens": 2432788141.0, "step": 5724 }, { "entropy": 0.01769366767257452, "epoch": 2.5055257686836634, "grad_norm": 5.53125, "learning_rate": 1.543313481321061e-05, "loss": 0.1011, "loss_lm": 0.016807196894660592, "loss_seg": 0.08428377751260996, "mean_token_accuracy": 0.9953520894050598, "num_tokens": 2433212596.0, "step": 5725 }, { "entropy": 0.018191159702837467, "epoch": 2.5059634533318746, "grad_norm": 5.34375, "learning_rate": 1.5430427720628046e-05, "loss": 0.1193, "loss_lm": 0.015583574306219816, "loss_seg": 0.10374402068555355, "mean_token_accuracy": 0.9952054619789124, "num_tokens": 2433637426.0, "step": 5726 }, { "entropy": 0.018758693244308233, "epoch": 2.5064011379800855, "grad_norm": 3.25, "learning_rate": 1.542772062804548e-05, "loss": 0.1273, "loss_lm": 0.019120770506560802, "loss_seg": 0.10821923241019249, "mean_token_accuracy": 0.9950668662786484, "num_tokens": 2434063307.0, "step": 5727 }, { "entropy": 0.01754010282456875, "epoch": 2.5068388226282963, "grad_norm": 8.625, "learning_rate": 1.5425013535462914e-05, "loss": 0.1249, "loss_lm": 0.014989810064435005, "loss_seg": 0.10993048548698425, "mean_token_accuracy": 0.9953835308551788, "num_tokens": 2434487843.0, "step": 5728 }, { "entropy": 0.018326361663639545, "epoch": 2.507276507276507, "grad_norm": 8.25, "learning_rate": 1.5422306442880348e-05, "loss": 0.0987, "loss_lm": 0.01650572568178177, "loss_seg": 0.08221690263599157, "mean_token_accuracy": 0.9951775819063187, "num_tokens": 2434912280.0, "step": 5729 }, { "entropy": 0.018680131528526545, "epoch": 2.5077141919247183, "grad_norm": 5.625, "learning_rate": 1.5419599350297778e-05, "loss": 0.1194, "loss_lm": 0.016457261983305216, "loss_seg": 0.10296495445072651, "mean_token_accuracy": 0.9950795024633408, "num_tokens": 2435338032.0, "step": 5730 }, { "entropy": 0.017725669778883457, "epoch": 2.508151876572929, "grad_norm": 8.6875, "learning_rate": 1.5416892257715215e-05, "loss": 0.1116, "loss_lm": 0.016496074618771672, "loss_seg": 0.09514001291245222, "mean_token_accuracy": 0.9953261762857437, "num_tokens": 2435762479.0, "step": 5731 }, { "entropy": 0.017939506098628044, "epoch": 2.5085895612211404, "grad_norm": 3.890625, "learning_rate": 1.541418516513265e-05, "loss": 0.0982, "loss_lm": 0.014683086657896638, "loss_seg": 0.0835138950496912, "mean_token_accuracy": 0.9953137338161469, "num_tokens": 2436187735.0, "step": 5732 }, { "entropy": 0.01792429154738784, "epoch": 2.509027245869351, "grad_norm": 5.3125, "learning_rate": 1.5411478072550082e-05, "loss": 0.0863, "loss_lm": 0.017010797280818224, "loss_seg": 0.06927221175283194, "mean_token_accuracy": 0.9952369034290314, "num_tokens": 2436613059.0, "step": 5733 }, { "entropy": 0.01818767935037613, "epoch": 2.509464930517562, "grad_norm": 4.21875, "learning_rate": 1.5408770979967516e-05, "loss": 0.1254, "loss_lm": 0.014740032376721501, "loss_seg": 0.11062409542500973, "mean_token_accuracy": 0.9952795952558517, "num_tokens": 2437038193.0, "step": 5734 }, { "entropy": 0.019151555374264717, "epoch": 2.509902615165773, "grad_norm": 3.453125, "learning_rate": 1.5406063887384947e-05, "loss": 0.0929, "loss_lm": 0.017720121424645185, "loss_seg": 0.07515623327344656, "mean_token_accuracy": 0.9949373602867126, "num_tokens": 2437463248.0, "step": 5735 }, { "entropy": 0.0180653459392488, "epoch": 2.510340299813984, "grad_norm": 4.59375, "learning_rate": 1.5403356794802384e-05, "loss": 0.1058, "loss_lm": 0.01573418197222054, "loss_seg": 0.09007737692445517, "mean_token_accuracy": 0.995334267616272, "num_tokens": 2437888007.0, "step": 5736 }, { "entropy": 0.018274066038429737, "epoch": 2.510777984462195, "grad_norm": 6.0, "learning_rate": 1.5400649702219817e-05, "loss": 0.1124, "loss_lm": 0.014641441171988845, "loss_seg": 0.09776186384260654, "mean_token_accuracy": 0.9953055530786514, "num_tokens": 2438312524.0, "step": 5737 }, { "entropy": 0.01796605158597231, "epoch": 2.511215669110406, "grad_norm": 6.40625, "learning_rate": 1.539794260963725e-05, "loss": 0.1097, "loss_lm": 0.01783793931826949, "loss_seg": 0.09183355420827866, "mean_token_accuracy": 0.995329737663269, "num_tokens": 2438737289.0, "step": 5738 }, { "entropy": 0.017833165358752012, "epoch": 2.511653353758617, "grad_norm": 8.4375, "learning_rate": 1.539523551705468e-05, "loss": 0.135, "loss_lm": 0.018106058472767472, "loss_seg": 0.1168530061841011, "mean_token_accuracy": 0.9953674823045731, "num_tokens": 2439161828.0, "step": 5739 }, { "entropy": 0.018112196121364832, "epoch": 2.512091038406828, "grad_norm": 13.75, "learning_rate": 1.5392528424472115e-05, "loss": 0.0826, "loss_lm": 0.015301972161978483, "loss_seg": 0.06730998400598764, "mean_token_accuracy": 0.9951681643724442, "num_tokens": 2439586492.0, "step": 5740 }, { "entropy": 0.01783193415030837, "epoch": 2.5125287230550386, "grad_norm": 8.0625, "learning_rate": 1.5389821331889552e-05, "loss": 0.0962, "loss_lm": 0.015742552699521184, "loss_seg": 0.08048305660486221, "mean_token_accuracy": 0.9953546077013016, "num_tokens": 2440011603.0, "step": 5741 }, { "entropy": 0.018315169028937817, "epoch": 2.51296640770325, "grad_norm": 3.4375, "learning_rate": 1.5387114239306986e-05, "loss": 0.0957, "loss_lm": 0.0168999710585922, "loss_seg": 0.07882560882717371, "mean_token_accuracy": 0.9951290488243103, "num_tokens": 2440437123.0, "step": 5742 }, { "entropy": 0.01796801434829831, "epoch": 2.5134040923514607, "grad_norm": 3.640625, "learning_rate": 1.538440714672442e-05, "loss": 0.0935, "loss_lm": 0.013669129926711321, "loss_seg": 0.07987506687641144, "mean_token_accuracy": 0.9953131228685379, "num_tokens": 2440862112.0, "step": 5743 }, { "entropy": 0.01775336917489767, "epoch": 2.513841776999672, "grad_norm": 12.5, "learning_rate": 1.538170005414185e-05, "loss": 0.1317, "loss_lm": 0.01575176022015512, "loss_seg": 0.11597631871700287, "mean_token_accuracy": 0.9953736960887909, "num_tokens": 2441287645.0, "step": 5744 }, { "entropy": 0.019078049808740616, "epoch": 2.5142794616478827, "grad_norm": 4.75, "learning_rate": 1.5378992961559284e-05, "loss": 0.1064, "loss_lm": 0.01656646723859012, "loss_seg": 0.08979148510843515, "mean_token_accuracy": 0.9949793666601181, "num_tokens": 2441712693.0, "step": 5745 }, { "entropy": 0.017812871374189854, "epoch": 2.5147171462960936, "grad_norm": 8.125, "learning_rate": 1.537628586897672e-05, "loss": 0.1153, "loss_lm": 0.015166677068918943, "loss_seg": 0.1001280527561903, "mean_token_accuracy": 0.9954202026128769, "num_tokens": 2442137758.0, "step": 5746 }, { "entropy": 0.018447714392095804, "epoch": 2.5151548309443044, "grad_norm": 6.3125, "learning_rate": 1.5373578776394155e-05, "loss": 0.1305, "loss_lm": 0.013968042563647032, "loss_seg": 0.11655247211456299, "mean_token_accuracy": 0.9951964169740677, "num_tokens": 2442562811.0, "step": 5747 }, { "entropy": 0.018298882991075516, "epoch": 2.5155925155925156, "grad_norm": 3.671875, "learning_rate": 1.537087168381159e-05, "loss": 0.1344, "loss_lm": 0.015316897304728627, "loss_seg": 0.11909548006951809, "mean_token_accuracy": 0.9952039569616318, "num_tokens": 2442988128.0, "step": 5748 }, { "entropy": 0.018008114770054817, "epoch": 2.5160302002407264, "grad_norm": 3.421875, "learning_rate": 1.536816459122902e-05, "loss": 0.1703, "loss_lm": 0.014945611823350191, "loss_seg": 0.15536597650498152, "mean_token_accuracy": 0.9953126907348633, "num_tokens": 2443412850.0, "step": 5749 }, { "entropy": 0.01771729625761509, "epoch": 2.5164678848889377, "grad_norm": 5.6875, "learning_rate": 1.5365457498646452e-05, "loss": 0.1435, "loss_lm": 0.015670333290472627, "loss_seg": 0.12784722447395325, "mean_token_accuracy": 0.9952836185693741, "num_tokens": 2443837718.0, "step": 5750 }, { "entropy": 0.018429493997246027, "epoch": 2.5169055695371485, "grad_norm": 9.375, "learning_rate": 1.536275040606389e-05, "loss": 0.1174, "loss_lm": 0.01644174102693796, "loss_seg": 0.10093542002141476, "mean_token_accuracy": 0.995232418179512, "num_tokens": 2444263025.0, "step": 5751 }, { "entropy": 0.01798462774604559, "epoch": 2.5173432541853593, "grad_norm": 6.84375, "learning_rate": 1.5360043313481323e-05, "loss": 0.1516, "loss_lm": 0.016039938665926456, "loss_seg": 0.1355897355824709, "mean_token_accuracy": 0.9953715354204178, "num_tokens": 2444687089.0, "step": 5752 }, { "entropy": 0.018417521379888058, "epoch": 2.5177809388335706, "grad_norm": 6.25, "learning_rate": 1.5357336220898757e-05, "loss": 0.1158, "loss_lm": 0.01735099032521248, "loss_seg": 0.09840240515768528, "mean_token_accuracy": 0.9950519353151321, "num_tokens": 2445112260.0, "step": 5753 }, { "entropy": 0.018363535404205322, "epoch": 2.5182186234817814, "grad_norm": 7.0625, "learning_rate": 1.5354629128316187e-05, "loss": 0.105, "loss_lm": 0.01785985752940178, "loss_seg": 0.08710305485874414, "mean_token_accuracy": 0.9952079951763153, "num_tokens": 2445537507.0, "step": 5754 }, { "entropy": 0.018112813122570515, "epoch": 2.518656308129992, "grad_norm": 4.875, "learning_rate": 1.535192203573362e-05, "loss": 0.1145, "loss_lm": 0.014181603910401464, "loss_seg": 0.10030537936836481, "mean_token_accuracy": 0.995331659913063, "num_tokens": 2445962669.0, "step": 5755 }, { "entropy": 0.017582627944648266, "epoch": 2.5190939927782035, "grad_norm": 12.25, "learning_rate": 1.5349214943151055e-05, "loss": 0.117, "loss_lm": 0.015606939559802413, "loss_seg": 0.10136566124856472, "mean_token_accuracy": 0.9954202175140381, "num_tokens": 2446387740.0, "step": 5756 }, { "entropy": 0.018491148948669434, "epoch": 2.5195316774264143, "grad_norm": 6.75, "learning_rate": 1.5346507850568492e-05, "loss": 0.0917, "loss_lm": 0.016329436097294092, "loss_seg": 0.07541991397738457, "mean_token_accuracy": 0.9951831549406052, "num_tokens": 2446812874.0, "step": 5757 }, { "entropy": 0.017509400378912687, "epoch": 2.519969362074625, "grad_norm": 14.625, "learning_rate": 1.5343800757985922e-05, "loss": 0.1584, "loss_lm": 0.015647080494090915, "loss_seg": 0.14277304988354445, "mean_token_accuracy": 0.9953434616327286, "num_tokens": 2447237383.0, "step": 5758 }, { "entropy": 0.017943948972970247, "epoch": 2.5204070467228363, "grad_norm": 4.34375, "learning_rate": 1.5341093665403356e-05, "loss": 0.1284, "loss_lm": 0.01485317456535995, "loss_seg": 0.11350851878523827, "mean_token_accuracy": 0.9952933937311172, "num_tokens": 2447662455.0, "step": 5759 }, { "entropy": 0.01825077086687088, "epoch": 2.520844731371047, "grad_norm": 4.4375, "learning_rate": 1.533838657282079e-05, "loss": 0.0701, "loss_lm": 0.016089537413790822, "loss_seg": 0.05404332373291254, "mean_token_accuracy": 0.9951823800802231, "num_tokens": 2448087753.0, "step": 5760 }, { "entropy": 0.018453865312039852, "epoch": 2.5212824160192584, "grad_norm": 21.75, "learning_rate": 1.5335679480238223e-05, "loss": 0.0928, "loss_lm": 0.015814382815733552, "loss_seg": 0.0769691914319992, "mean_token_accuracy": 0.9951369017362595, "num_tokens": 2448512350.0, "step": 5761 }, { "entropy": 0.018475924618542194, "epoch": 2.521720100667469, "grad_norm": 4.4375, "learning_rate": 1.533297238765566e-05, "loss": 0.0774, "loss_lm": 0.015117872739210725, "loss_seg": 0.06230147834867239, "mean_token_accuracy": 0.9951619058847427, "num_tokens": 2448937083.0, "step": 5762 }, { "entropy": 0.017983016557991505, "epoch": 2.52215778531568, "grad_norm": 5.09375, "learning_rate": 1.533026529507309e-05, "loss": 0.1332, "loss_lm": 0.017427586717531085, "loss_seg": 0.11574594676494598, "mean_token_accuracy": 0.9953252673149109, "num_tokens": 2449362068.0, "step": 5763 }, { "entropy": 0.018483188934624195, "epoch": 2.522595469963891, "grad_norm": 15.75, "learning_rate": 1.5327558202490525e-05, "loss": 0.0801, "loss_lm": 0.015535468235611916, "loss_seg": 0.06455530971288681, "mean_token_accuracy": 0.9952460825443268, "num_tokens": 2449787371.0, "step": 5764 }, { "entropy": 0.017766932491213083, "epoch": 2.523033154612102, "grad_norm": 5.5, "learning_rate": 1.532485110990796e-05, "loss": 0.105, "loss_lm": 0.01661008410155773, "loss_seg": 0.08839643187820911, "mean_token_accuracy": 0.9953475743532181, "num_tokens": 2450213535.0, "step": 5765 }, { "entropy": 0.01804794417694211, "epoch": 2.523470839260313, "grad_norm": 2.953125, "learning_rate": 1.5322144017325392e-05, "loss": 0.0971, "loss_lm": 0.018414189340546727, "loss_seg": 0.07872998155653477, "mean_token_accuracy": 0.9952812939882278, "num_tokens": 2450637965.0, "step": 5766 }, { "entropy": 0.01902940822765231, "epoch": 2.523908523908524, "grad_norm": 6.25, "learning_rate": 1.531943692474283e-05, "loss": 0.1536, "loss_lm": 0.017722827848047018, "loss_seg": 0.1358458362519741, "mean_token_accuracy": 0.9951743185520172, "num_tokens": 2451063574.0, "step": 5767 }, { "entropy": 0.01747499778866768, "epoch": 2.524346208556735, "grad_norm": 3.296875, "learning_rate": 1.531672983216026e-05, "loss": 0.1126, "loss_lm": 0.015081394463777542, "loss_seg": 0.09747802652418613, "mean_token_accuracy": 0.995417445898056, "num_tokens": 2451489042.0, "step": 5768 }, { "entropy": 0.01765105128288269, "epoch": 2.524783893204946, "grad_norm": 4.6875, "learning_rate": 1.5314022739577693e-05, "loss": 0.1118, "loss_lm": 0.01585899922065437, "loss_seg": 0.09593112580478191, "mean_token_accuracy": 0.9953454285860062, "num_tokens": 2451913955.0, "step": 5769 }, { "entropy": 0.01820611907169223, "epoch": 2.5252215778531566, "grad_norm": 15.5, "learning_rate": 1.5311315646995127e-05, "loss": 0.087, "loss_lm": 0.016002982622012496, "loss_seg": 0.07098301686346531, "mean_token_accuracy": 0.9952728450298309, "num_tokens": 2452338701.0, "step": 5770 }, { "entropy": 0.017602022737264633, "epoch": 2.525659262501368, "grad_norm": 7.0, "learning_rate": 1.530860855441256e-05, "loss": 0.1472, "loss_lm": 0.01645422843284905, "loss_seg": 0.1307317055761814, "mean_token_accuracy": 0.9953673034906387, "num_tokens": 2452763462.0, "step": 5771 }, { "entropy": 0.01823605177924037, "epoch": 2.5260969471495787, "grad_norm": 4.625, "learning_rate": 1.5305901461829998e-05, "loss": 0.0938, "loss_lm": 0.014717940473929048, "loss_seg": 0.07906722649931908, "mean_token_accuracy": 0.995161309838295, "num_tokens": 2453188342.0, "step": 5772 }, { "entropy": 0.018373169470578432, "epoch": 2.52653463179779, "grad_norm": 7.59375, "learning_rate": 1.5303194369247428e-05, "loss": 0.1079, "loss_lm": 0.016540318727493286, "loss_seg": 0.0913295540958643, "mean_token_accuracy": 0.9951087236404419, "num_tokens": 2453613158.0, "step": 5773 }, { "entropy": 0.017922126222401857, "epoch": 2.5269723164460007, "grad_norm": 7.65625, "learning_rate": 1.5300487276664862e-05, "loss": 0.0823, "loss_lm": 0.016593014122918248, "loss_seg": 0.06575421430170536, "mean_token_accuracy": 0.9952911883592606, "num_tokens": 2454038716.0, "step": 5774 }, { "entropy": 0.018368417862802744, "epoch": 2.5274100010942115, "grad_norm": 5.28125, "learning_rate": 1.5297780184082296e-05, "loss": 0.0979, "loss_lm": 0.017864921828731894, "loss_seg": 0.08000425528734922, "mean_token_accuracy": 0.9952753037214279, "num_tokens": 2454463897.0, "step": 5775 }, { "entropy": 0.018134175799787045, "epoch": 2.5278476857424224, "grad_norm": 6.9375, "learning_rate": 1.529507309149973e-05, "loss": 0.1015, "loss_lm": 0.015890082577243447, "loss_seg": 0.08560194820165634, "mean_token_accuracy": 0.9954674988985062, "num_tokens": 2454888316.0, "step": 5776 }, { "entropy": 0.018377556931227446, "epoch": 2.5282853703906336, "grad_norm": 4.25, "learning_rate": 1.5292365998917166e-05, "loss": 0.1384, "loss_lm": 0.013902241829782724, "loss_seg": 0.12453201413154602, "mean_token_accuracy": 0.9952524900436401, "num_tokens": 2455313132.0, "step": 5777 }, { "entropy": 0.018270577769726515, "epoch": 2.5287230550388444, "grad_norm": 3.53125, "learning_rate": 1.5289658906334597e-05, "loss": 0.1305, "loss_lm": 0.016126715345308185, "loss_seg": 0.11434506345540285, "mean_token_accuracy": 0.9951886385679245, "num_tokens": 2455738527.0, "step": 5778 }, { "entropy": 0.018251141533255577, "epoch": 2.5291607396870557, "grad_norm": 32.5, "learning_rate": 1.528695181375203e-05, "loss": 0.1301, "loss_lm": 0.015768673736602068, "loss_seg": 0.11433442868292332, "mean_token_accuracy": 0.9951993077993393, "num_tokens": 2456164033.0, "step": 5779 }, { "entropy": 0.017489036545157433, "epoch": 2.5295984243352665, "grad_norm": 9.0625, "learning_rate": 1.5284244721169464e-05, "loss": 0.11, "loss_lm": 0.013490311102941632, "loss_seg": 0.09648738242685795, "mean_token_accuracy": 0.9954067915678024, "num_tokens": 2456588961.0, "step": 5780 }, { "entropy": 0.017892395611852407, "epoch": 2.5300361089834773, "grad_norm": 3.28125, "learning_rate": 1.5281537628586898e-05, "loss": 0.0959, "loss_lm": 0.01664709998294711, "loss_seg": 0.07921894453465939, "mean_token_accuracy": 0.9953280836343765, "num_tokens": 2457014009.0, "step": 5781 }, { "entropy": 0.018303378485143185, "epoch": 2.530473793631688, "grad_norm": 7.34375, "learning_rate": 1.527883053600433e-05, "loss": 0.0799, "loss_lm": 0.014857366913929582, "loss_seg": 0.06505116261541843, "mean_token_accuracy": 0.995093435049057, "num_tokens": 2457438926.0, "step": 5782 }, { "entropy": 0.01884212763980031, "epoch": 2.5309114782798994, "grad_norm": 14.625, "learning_rate": 1.5276123443421765e-05, "loss": 0.1109, "loss_lm": 0.019225797150284052, "loss_seg": 0.0916365198791027, "mean_token_accuracy": 0.9950610995292664, "num_tokens": 2457863891.0, "step": 5783 }, { "entropy": 0.017899557016789913, "epoch": 2.53134916292811, "grad_norm": 6.78125, "learning_rate": 1.52734163508392e-05, "loss": 0.0797, "loss_lm": 0.016992865595966578, "loss_seg": 0.06266495119780302, "mean_token_accuracy": 0.9952756017446518, "num_tokens": 2458288848.0, "step": 5784 }, { "entropy": 0.017961126286536455, "epoch": 2.5317868475763214, "grad_norm": 2.40625, "learning_rate": 1.5270709258256633e-05, "loss": 0.0965, "loss_lm": 0.016852558590471745, "loss_seg": 0.07964828051626682, "mean_token_accuracy": 0.9952173680067062, "num_tokens": 2458713906.0, "step": 5785 }, { "entropy": 0.018492889124900103, "epoch": 2.5322245322245323, "grad_norm": 5.75, "learning_rate": 1.5268002165674067e-05, "loss": 0.0987, "loss_lm": 0.018618654692545533, "loss_seg": 0.0801204051822424, "mean_token_accuracy": 0.9950637817382812, "num_tokens": 2459138880.0, "step": 5786 }, { "entropy": 0.017687608487904072, "epoch": 2.532662216872743, "grad_norm": 14.4375, "learning_rate": 1.52652950730915e-05, "loss": 0.1014, "loss_lm": 0.01550113270059228, "loss_seg": 0.08590210974216461, "mean_token_accuracy": 0.9953161329030991, "num_tokens": 2459563691.0, "step": 5787 }, { "entropy": 0.018370787613093853, "epoch": 2.533099901520954, "grad_norm": 8.4375, "learning_rate": 1.5262587980508934e-05, "loss": 0.094, "loss_lm": 0.015341482823714614, "loss_seg": 0.07866456732153893, "mean_token_accuracy": 0.9951613396406174, "num_tokens": 2459988790.0, "step": 5788 }, { "entropy": 0.017793916165828705, "epoch": 2.533537586169165, "grad_norm": 4.90625, "learning_rate": 1.5259880887926368e-05, "loss": 0.0989, "loss_lm": 0.016074362443760037, "loss_seg": 0.08282450120896101, "mean_token_accuracy": 0.9953460693359375, "num_tokens": 2460413325.0, "step": 5789 }, { "entropy": 0.017898966558277607, "epoch": 2.533975270817376, "grad_norm": 21.625, "learning_rate": 1.5257173795343801e-05, "loss": 0.105, "loss_lm": 0.0172585966065526, "loss_seg": 0.08777090162038803, "mean_token_accuracy": 0.9953185021877289, "num_tokens": 2460837926.0, "step": 5790 }, { "entropy": 0.017871826887130737, "epoch": 2.534412955465587, "grad_norm": 4.84375, "learning_rate": 1.5254466702761237e-05, "loss": 0.1153, "loss_lm": 0.017280726693570614, "loss_seg": 0.09805210866034031, "mean_token_accuracy": 0.9952485412359238, "num_tokens": 2461263250.0, "step": 5791 }, { "entropy": 0.017951382789760828, "epoch": 2.534850640113798, "grad_norm": 3.96875, "learning_rate": 1.5251759610178667e-05, "loss": 0.1306, "loss_lm": 0.01463763602077961, "loss_seg": 0.11597924400120974, "mean_token_accuracy": 0.9952372610569, "num_tokens": 2461687941.0, "step": 5792 }, { "entropy": 0.018345885444432497, "epoch": 2.535288324762009, "grad_norm": 10.5, "learning_rate": 1.5249052517596101e-05, "loss": 0.1035, "loss_lm": 0.014184269588440657, "loss_seg": 0.08928210474550724, "mean_token_accuracy": 0.9950645565986633, "num_tokens": 2462112773.0, "step": 5793 }, { "entropy": 0.018626419361680746, "epoch": 2.53572600941022, "grad_norm": 3.46875, "learning_rate": 1.5246345425013536e-05, "loss": 0.1126, "loss_lm": 0.01716700429096818, "loss_seg": 0.0954754026606679, "mean_token_accuracy": 0.9951269626617432, "num_tokens": 2462537746.0, "step": 5794 }, { "entropy": 0.018128753639757633, "epoch": 2.536163694058431, "grad_norm": 12.0625, "learning_rate": 1.524363833243097e-05, "loss": 0.1417, "loss_lm": 0.014874417334794998, "loss_seg": 0.12678586319088936, "mean_token_accuracy": 0.9952078312635422, "num_tokens": 2462962996.0, "step": 5795 }, { "entropy": 0.01820701314136386, "epoch": 2.5366013787066417, "grad_norm": 5.34375, "learning_rate": 1.5240931239848404e-05, "loss": 0.0821, "loss_lm": 0.015970790991559625, "loss_seg": 0.06608535908162594, "mean_token_accuracy": 0.9952333569526672, "num_tokens": 2463387797.0, "step": 5796 }, { "entropy": 0.0178636540658772, "epoch": 2.537039063354853, "grad_norm": 3.1875, "learning_rate": 1.5238224147265836e-05, "loss": 0.1392, "loss_lm": 0.015343820443376899, "loss_seg": 0.12382422108203173, "mean_token_accuracy": 0.9953498095273972, "num_tokens": 2463813521.0, "step": 5797 }, { "entropy": 0.01846262812614441, "epoch": 2.5374767480030638, "grad_norm": 3.53125, "learning_rate": 1.523551705468327e-05, "loss": 0.1081, "loss_lm": 0.01632177783176303, "loss_seg": 0.0918024443089962, "mean_token_accuracy": 0.995246171951294, "num_tokens": 2464238175.0, "step": 5798 }, { "entropy": 0.018776538781821728, "epoch": 2.5379144326512746, "grad_norm": 4.40625, "learning_rate": 1.5232809962100705e-05, "loss": 0.1083, "loss_lm": 0.01978234713897109, "loss_seg": 0.0885491669178009, "mean_token_accuracy": 0.9952445179224014, "num_tokens": 2464663969.0, "step": 5799 }, { "entropy": 0.018438984639942646, "epoch": 2.538352117299486, "grad_norm": 5.5, "learning_rate": 1.5230102869518139e-05, "loss": 0.1297, "loss_lm": 0.014126320835202932, "loss_seg": 0.11555212736129761, "mean_token_accuracy": 0.9951054602861404, "num_tokens": 2465089504.0, "step": 5800 }, { "entropy": 0.018733613193035126, "epoch": 2.5387898019476967, "grad_norm": 16.625, "learning_rate": 1.5227395776935572e-05, "loss": 0.113, "loss_lm": 0.01720378897152841, "loss_seg": 0.09575539082288742, "mean_token_accuracy": 0.9951044172048569, "num_tokens": 2465514337.0, "step": 5801 }, { "entropy": 0.01809915155172348, "epoch": 2.539227486595908, "grad_norm": 2.9375, "learning_rate": 1.5224688684353005e-05, "loss": 0.0849, "loss_lm": 0.016834971960633993, "loss_seg": 0.06811405159533024, "mean_token_accuracy": 0.9952702969312668, "num_tokens": 2465939688.0, "step": 5802 }, { "entropy": 0.018643588293343782, "epoch": 2.5396651712441187, "grad_norm": 22.5, "learning_rate": 1.5221981591770438e-05, "loss": 0.0793, "loss_lm": 0.016490432433784008, "loss_seg": 0.06285854522138834, "mean_token_accuracy": 0.9950627982616425, "num_tokens": 2466365388.0, "step": 5803 }, { "entropy": 0.01810463098809123, "epoch": 2.5401028558923295, "grad_norm": 4.90625, "learning_rate": 1.5219274499187874e-05, "loss": 0.0869, "loss_lm": 0.018391762860119343, "loss_seg": 0.06847029831260443, "mean_token_accuracy": 0.9952798634767532, "num_tokens": 2466789263.0, "step": 5804 }, { "entropy": 0.017737626563757658, "epoch": 2.5405405405405403, "grad_norm": 12.0625, "learning_rate": 1.5216567406605307e-05, "loss": 0.0821, "loss_lm": 0.015310990856960416, "loss_seg": 0.0667724134400487, "mean_token_accuracy": 0.9953120201826096, "num_tokens": 2467214555.0, "step": 5805 }, { "entropy": 0.01808819267898798, "epoch": 2.5409782251887516, "grad_norm": 4.9375, "learning_rate": 1.5213860314022738e-05, "loss": 0.1262, "loss_lm": 0.015474241925403476, "loss_seg": 0.11068741604685783, "mean_token_accuracy": 0.9951284378767014, "num_tokens": 2467640082.0, "step": 5806 }, { "entropy": 0.018601649440824986, "epoch": 2.5414159098369624, "grad_norm": 5.0625, "learning_rate": 1.5211153221440173e-05, "loss": 0.1298, "loss_lm": 0.017245270079001784, "loss_seg": 0.11259678471833467, "mean_token_accuracy": 0.9951611012220383, "num_tokens": 2468065555.0, "step": 5807 }, { "entropy": 0.01785631710663438, "epoch": 2.5418535944851737, "grad_norm": 2.625, "learning_rate": 1.5208446128857607e-05, "loss": 0.0998, "loss_lm": 0.01408717641606927, "loss_seg": 0.08575022034347057, "mean_token_accuracy": 0.9953010231256485, "num_tokens": 2468490499.0, "step": 5808 }, { "entropy": 0.018172118812799454, "epoch": 2.5422912791333845, "grad_norm": 4.9375, "learning_rate": 1.5205739036275042e-05, "loss": 0.0997, "loss_lm": 0.016818008851259947, "loss_seg": 0.08283614926040173, "mean_token_accuracy": 0.9951369315385818, "num_tokens": 2468915672.0, "step": 5809 }, { "entropy": 0.01846047444269061, "epoch": 2.5427289637815953, "grad_norm": 7.8125, "learning_rate": 1.5203031943692476e-05, "loss": 0.0906, "loss_lm": 0.017046516994014382, "loss_seg": 0.07350893132388592, "mean_token_accuracy": 0.9952454268932343, "num_tokens": 2469340877.0, "step": 5810 }, { "entropy": 0.018428733106702566, "epoch": 2.543166648429806, "grad_norm": 5.84375, "learning_rate": 1.5200324851109906e-05, "loss": 0.1138, "loss_lm": 0.01743940357118845, "loss_seg": 0.09635377116501331, "mean_token_accuracy": 0.9951199889183044, "num_tokens": 2469766051.0, "step": 5811 }, { "entropy": 0.018183819018304348, "epoch": 2.5436043330780174, "grad_norm": 11.1875, "learning_rate": 1.5197617758527342e-05, "loss": 0.1119, "loss_lm": 0.01750470045953989, "loss_seg": 0.09439624845981598, "mean_token_accuracy": 0.9951634407043457, "num_tokens": 2470191101.0, "step": 5812 }, { "entropy": 0.01829170063138008, "epoch": 2.544042017726228, "grad_norm": 7.21875, "learning_rate": 1.5194910665944775e-05, "loss": 0.1171, "loss_lm": 0.015738707035779953, "loss_seg": 0.1013803742825985, "mean_token_accuracy": 0.9951374381780624, "num_tokens": 2470616141.0, "step": 5813 }, { "entropy": 0.017735574394464493, "epoch": 2.5444797023744394, "grad_norm": 7.3125, "learning_rate": 1.519220357336221e-05, "loss": 0.1065, "loss_lm": 0.015698182862251997, "loss_seg": 0.09082434140145779, "mean_token_accuracy": 0.9953229874372482, "num_tokens": 2471041158.0, "step": 5814 }, { "entropy": 0.018194667529314756, "epoch": 2.5449173870226502, "grad_norm": 14.5625, "learning_rate": 1.5189496480779645e-05, "loss": 0.1306, "loss_lm": 0.0161123126745224, "loss_seg": 0.11446289718151093, "mean_token_accuracy": 0.9950821846723557, "num_tokens": 2471466292.0, "step": 5815 }, { "entropy": 0.018208948895335197, "epoch": 2.545355071670861, "grad_norm": 4.5, "learning_rate": 1.5186789388197075e-05, "loss": 0.0807, "loss_lm": 0.016072030179202557, "loss_seg": 0.06460099387913942, "mean_token_accuracy": 0.9952024966478348, "num_tokens": 2471891280.0, "step": 5816 }, { "entropy": 0.017924826592206955, "epoch": 2.545792756319072, "grad_norm": 2.46875, "learning_rate": 1.518408229561451e-05, "loss": 0.124, "loss_lm": 0.016952847596257925, "loss_seg": 0.10707437619566917, "mean_token_accuracy": 0.9953131526708603, "num_tokens": 2472315598.0, "step": 5817 }, { "entropy": 0.01807792531326413, "epoch": 2.546230440967283, "grad_norm": 4.84375, "learning_rate": 1.5181375203031944e-05, "loss": 0.0748, "loss_lm": 0.017236046260222793, "loss_seg": 0.05752348992973566, "mean_token_accuracy": 0.9953076243400574, "num_tokens": 2472739827.0, "step": 5818 }, { "entropy": 0.018161028623580933, "epoch": 2.546668125615494, "grad_norm": 6.3125, "learning_rate": 1.5178668110449378e-05, "loss": 0.0978, "loss_lm": 0.01599139184691012, "loss_seg": 0.08180586714297533, "mean_token_accuracy": 0.9951629936695099, "num_tokens": 2473164834.0, "step": 5819 }, { "entropy": 0.01808630069717765, "epoch": 2.547105810263705, "grad_norm": 4.625, "learning_rate": 1.5175961017866813e-05, "loss": 0.1312, "loss_lm": 0.014910794328898191, "loss_seg": 0.11626588553190231, "mean_token_accuracy": 0.9952377527952194, "num_tokens": 2473590223.0, "step": 5820 }, { "entropy": 0.018001362215727568, "epoch": 2.547543494911916, "grad_norm": 6.75, "learning_rate": 1.5173253925284244e-05, "loss": 0.0748, "loss_lm": 0.015076957875862718, "loss_seg": 0.05971745681017637, "mean_token_accuracy": 0.9952964633703232, "num_tokens": 2474015609.0, "step": 5821 }, { "entropy": 0.01780232647433877, "epoch": 2.547981179560127, "grad_norm": 4.59375, "learning_rate": 1.5170546832701679e-05, "loss": 0.1201, "loss_lm": 0.016330994199961424, "loss_seg": 0.10377619788050652, "mean_token_accuracy": 0.9953488409519196, "num_tokens": 2474440361.0, "step": 5822 }, { "entropy": 0.018418148159980774, "epoch": 2.5484188642083376, "grad_norm": 13.1875, "learning_rate": 1.5167839740119113e-05, "loss": 0.126, "loss_lm": 0.017038933699950576, "loss_seg": 0.10899117961525917, "mean_token_accuracy": 0.9952934086322784, "num_tokens": 2474866328.0, "step": 5823 }, { "entropy": 0.018468889873474836, "epoch": 2.548856548856549, "grad_norm": 28.0, "learning_rate": 1.5165132647536546e-05, "loss": 0.107, "loss_lm": 0.016221814090386033, "loss_seg": 0.0907357856631279, "mean_token_accuracy": 0.9951586276292801, "num_tokens": 2475291794.0, "step": 5824 }, { "entropy": 0.018250671681016684, "epoch": 2.5492942335047597, "grad_norm": 3.953125, "learning_rate": 1.5162425554953982e-05, "loss": 0.0844, "loss_lm": 0.01563214184716344, "loss_seg": 0.06873519439250231, "mean_token_accuracy": 0.9952411949634552, "num_tokens": 2475716699.0, "step": 5825 }, { "entropy": 0.018010692670941353, "epoch": 2.549731918152971, "grad_norm": 9.875, "learning_rate": 1.5159718462371412e-05, "loss": 0.0756, "loss_lm": 0.013861186802387238, "loss_seg": 0.06173452828079462, "mean_token_accuracy": 0.9951500445604324, "num_tokens": 2476142050.0, "step": 5826 }, { "entropy": 0.018671637400984764, "epoch": 2.5501696028011818, "grad_norm": 2.921875, "learning_rate": 1.5157011369788848e-05, "loss": 0.0975, "loss_lm": 0.017783982446417212, "loss_seg": 0.07967608794569969, "mean_token_accuracy": 0.9951565861701965, "num_tokens": 2476567501.0, "step": 5827 }, { "entropy": 0.01735946163535118, "epoch": 2.5506072874493926, "grad_norm": 4.8125, "learning_rate": 1.5154304277206281e-05, "loss": 0.1371, "loss_lm": 0.01510170428082347, "loss_seg": 0.12198641523718834, "mean_token_accuracy": 0.9954253882169724, "num_tokens": 2476992389.0, "step": 5828 }, { "entropy": 0.018146116752177477, "epoch": 2.551044972097604, "grad_norm": 17.75, "learning_rate": 1.5151597184623715e-05, "loss": 0.1013, "loss_lm": 0.016029866645112634, "loss_seg": 0.08531995117664337, "mean_token_accuracy": 0.9953062981367111, "num_tokens": 2477416530.0, "step": 5829 }, { "entropy": 0.017748176120221615, "epoch": 2.5514826567458146, "grad_norm": 18.625, "learning_rate": 1.5148890092041147e-05, "loss": 0.0952, "loss_lm": 0.0174263552762568, "loss_seg": 0.07778243999928236, "mean_token_accuracy": 0.9952884912490845, "num_tokens": 2477841609.0, "step": 5830 }, { "entropy": 0.0178879052400589, "epoch": 2.5519203413940255, "grad_norm": 10.125, "learning_rate": 1.5146182999458581e-05, "loss": 0.0796, "loss_lm": 0.01676719426177442, "loss_seg": 0.06285486649721861, "mean_token_accuracy": 0.9954197555780411, "num_tokens": 2478266195.0, "step": 5831 }, { "entropy": 0.018101857975125313, "epoch": 2.5523580260422367, "grad_norm": 7.65625, "learning_rate": 1.5143475906876015e-05, "loss": 0.1177, "loss_lm": 0.015472989995032549, "loss_seg": 0.10223176330327988, "mean_token_accuracy": 0.9952127635478973, "num_tokens": 2478691255.0, "step": 5832 }, { "entropy": 0.01837527146562934, "epoch": 2.5527957106904475, "grad_norm": 3.296875, "learning_rate": 1.514076881429345e-05, "loss": 0.107, "loss_lm": 0.015388483880087733, "loss_seg": 0.09161406196653843, "mean_token_accuracy": 0.9951552599668503, "num_tokens": 2479116423.0, "step": 5833 }, { "entropy": 0.01793810911476612, "epoch": 2.5532333953386583, "grad_norm": 5.0625, "learning_rate": 1.5138061721710884e-05, "loss": 0.1205, "loss_lm": 0.017565145390108228, "loss_seg": 0.10297694988548756, "mean_token_accuracy": 0.9952644258737564, "num_tokens": 2479541039.0, "step": 5834 }, { "entropy": 0.017990133725106716, "epoch": 2.5536710799868696, "grad_norm": 6.625, "learning_rate": 1.5135354629128316e-05, "loss": 0.1039, "loss_lm": 0.016508621396496892, "loss_seg": 0.08736730460077524, "mean_token_accuracy": 0.995446041226387, "num_tokens": 2479966318.0, "step": 5835 }, { "entropy": 0.018874000292271376, "epoch": 2.5541087646350804, "grad_norm": 7.78125, "learning_rate": 1.513264753654575e-05, "loss": 0.0907, "loss_lm": 0.01618780312128365, "loss_seg": 0.07453891076147556, "mean_token_accuracy": 0.9951473921537399, "num_tokens": 2480391804.0, "step": 5836 }, { "entropy": 0.018209195230156183, "epoch": 2.5545464492832917, "grad_norm": 3.65625, "learning_rate": 1.5129940443963183e-05, "loss": 0.0915, "loss_lm": 0.016819587210193276, "loss_seg": 0.07468188367784023, "mean_token_accuracy": 0.9951771944761276, "num_tokens": 2480817018.0, "step": 5837 }, { "entropy": 0.018037994392216206, "epoch": 2.5549841339315025, "grad_norm": 5.03125, "learning_rate": 1.5127233351380619e-05, "loss": 0.0742, "loss_lm": 0.014995003584772348, "loss_seg": 0.05921482853591442, "mean_token_accuracy": 0.9953390210866928, "num_tokens": 2481242426.0, "step": 5838 }, { "entropy": 0.018548762891441584, "epoch": 2.5554218185797133, "grad_norm": 10.0, "learning_rate": 1.5124526258798052e-05, "loss": 0.0963, "loss_lm": 0.016252044355496764, "loss_seg": 0.08004933223128319, "mean_token_accuracy": 0.9952024668455124, "num_tokens": 2481667997.0, "step": 5839 }, { "entropy": 0.01791094383224845, "epoch": 2.555859503227924, "grad_norm": 6.8125, "learning_rate": 1.5121819166215484e-05, "loss": 0.1012, "loss_lm": 0.013892989605665207, "loss_seg": 0.08726156409829855, "mean_token_accuracy": 0.9953278303146362, "num_tokens": 2482094087.0, "step": 5840 }, { "entropy": 0.018293550238013268, "epoch": 2.5562971878761354, "grad_norm": 3.046875, "learning_rate": 1.5119112073632918e-05, "loss": 0.12, "loss_lm": 0.016115992097184062, "loss_seg": 0.1039305031299591, "mean_token_accuracy": 0.9951533824205399, "num_tokens": 2482519299.0, "step": 5841 }, { "entropy": 0.01836782693862915, "epoch": 2.556734872524346, "grad_norm": 7.46875, "learning_rate": 1.5116404981050352e-05, "loss": 0.1112, "loss_lm": 0.016600185073912144, "loss_seg": 0.09464924596250057, "mean_token_accuracy": 0.9952193200588226, "num_tokens": 2482944520.0, "step": 5842 }, { "entropy": 0.018311232328414917, "epoch": 2.5571725571725574, "grad_norm": 5.84375, "learning_rate": 1.5113697888467787e-05, "loss": 0.0867, "loss_lm": 0.016167320078238845, "loss_seg": 0.07049180939793587, "mean_token_accuracy": 0.9952781051397324, "num_tokens": 2483368898.0, "step": 5843 }, { "entropy": 0.01865019928663969, "epoch": 2.5576102418207682, "grad_norm": 9.8125, "learning_rate": 1.5110990795885221e-05, "loss": 0.1418, "loss_lm": 0.018159265397116542, "loss_seg": 0.12365700677037239, "mean_token_accuracy": 0.9951709359884262, "num_tokens": 2483794204.0, "step": 5844 }, { "entropy": 0.018838761374354362, "epoch": 2.558047926468979, "grad_norm": 10.125, "learning_rate": 1.5108283703302651e-05, "loss": 0.0661, "loss_lm": 0.014725745189934969, "loss_seg": 0.051362479105591774, "mean_token_accuracy": 0.9951028972864151, "num_tokens": 2484219957.0, "step": 5845 }, { "entropy": 0.018413934856653214, "epoch": 2.55848561111719, "grad_norm": 12.75, "learning_rate": 1.5105576610720087e-05, "loss": 0.0821, "loss_lm": 0.014753521885722876, "loss_seg": 0.06733767595142126, "mean_token_accuracy": 0.9951541572809219, "num_tokens": 2484645237.0, "step": 5846 }, { "entropy": 0.018321550451219082, "epoch": 2.558923295765401, "grad_norm": 4.875, "learning_rate": 1.510286951813752e-05, "loss": 0.1228, "loss_lm": 0.0171261471696198, "loss_seg": 0.10569055285304785, "mean_token_accuracy": 0.9952859729528427, "num_tokens": 2485069462.0, "step": 5847 }, { "entropy": 0.01817865902557969, "epoch": 2.559360980413612, "grad_norm": 5.90625, "learning_rate": 1.5100162425554956e-05, "loss": 0.1167, "loss_lm": 0.016705370973795652, "loss_seg": 0.10003852471709251, "mean_token_accuracy": 0.9952614009380341, "num_tokens": 2485494231.0, "step": 5848 }, { "entropy": 0.018181652761995792, "epoch": 2.559798665061823, "grad_norm": 10.5625, "learning_rate": 1.509745533297239e-05, "loss": 0.1217, "loss_lm": 0.016022087540477514, "loss_seg": 0.10567126795649529, "mean_token_accuracy": 0.9952919334173203, "num_tokens": 2485918353.0, "step": 5849 }, { "entropy": 0.01857876544818282, "epoch": 2.560236349710034, "grad_norm": 9.5625, "learning_rate": 1.509474824038982e-05, "loss": 0.088, "loss_lm": 0.017078110948204994, "loss_seg": 0.07097187638282776, "mean_token_accuracy": 0.995150625705719, "num_tokens": 2486342919.0, "step": 5850 }, { "entropy": 0.0179123068228364, "epoch": 2.560674034358245, "grad_norm": 6.6875, "learning_rate": 1.5092041147807255e-05, "loss": 0.1088, "loss_lm": 0.013656296534463763, "loss_seg": 0.09514086600393057, "mean_token_accuracy": 0.9953662157058716, "num_tokens": 2486767363.0, "step": 5851 }, { "entropy": 0.01747127017006278, "epoch": 2.5611117190064556, "grad_norm": 7.34375, "learning_rate": 1.5089334055224689e-05, "loss": 0.0943, "loss_lm": 0.015602310420945287, "loss_seg": 0.07868487946689129, "mean_token_accuracy": 0.995363712310791, "num_tokens": 2487192877.0, "step": 5852 }, { "entropy": 0.018363679759204388, "epoch": 2.561549403654667, "grad_norm": 13.125, "learning_rate": 1.5086626962642125e-05, "loss": 0.1077, "loss_lm": 0.014539545634761453, "loss_seg": 0.09320741333067417, "mean_token_accuracy": 0.9952741116285324, "num_tokens": 2487617938.0, "step": 5853 }, { "entropy": 0.01850598258897662, "epoch": 2.5619870883028777, "grad_norm": 4.65625, "learning_rate": 1.5083919870059555e-05, "loss": 0.0849, "loss_lm": 0.015786627307534218, "loss_seg": 0.06915370933711529, "mean_token_accuracy": 0.9953140467405319, "num_tokens": 2488043219.0, "step": 5854 }, { "entropy": 0.01855363231152296, "epoch": 2.562424772951089, "grad_norm": 13.5, "learning_rate": 1.5081212777476989e-05, "loss": 0.0981, "loss_lm": 0.01657165214419365, "loss_seg": 0.08157783094793558, "mean_token_accuracy": 0.9951948076486588, "num_tokens": 2488468147.0, "step": 5855 }, { "entropy": 0.018051014747470617, "epoch": 2.5628624575992998, "grad_norm": 5.9375, "learning_rate": 1.5078505684894424e-05, "loss": 0.09, "loss_lm": 0.014920437475666404, "loss_seg": 0.07507517747581005, "mean_token_accuracy": 0.9952206015586853, "num_tokens": 2488893071.0, "step": 5856 }, { "entropy": 0.017772621009498835, "epoch": 2.5633001422475106, "grad_norm": 3.234375, "learning_rate": 1.5075798592311858e-05, "loss": 0.0834, "loss_lm": 0.014551880536600947, "loss_seg": 0.06889522634446621, "mean_token_accuracy": 0.9953955262899399, "num_tokens": 2489318477.0, "step": 5857 }, { "entropy": 0.01766230585053563, "epoch": 2.5637378268957214, "grad_norm": 3.921875, "learning_rate": 1.5073091499729293e-05, "loss": 0.0974, "loss_lm": 0.017538571264594793, "loss_seg": 0.07984146289527416, "mean_token_accuracy": 0.9954293966293335, "num_tokens": 2489743929.0, "step": 5858 }, { "entropy": 0.01784408651292324, "epoch": 2.5641755115439326, "grad_norm": 5.875, "learning_rate": 1.5070384407146724e-05, "loss": 0.0891, "loss_lm": 0.01577141508460045, "loss_seg": 0.07335500791668892, "mean_token_accuracy": 0.9954004734754562, "num_tokens": 2490168930.0, "step": 5859 }, { "entropy": 0.01879760203883052, "epoch": 2.5646131961921435, "grad_norm": 4.96875, "learning_rate": 1.5067677314564157e-05, "loss": 0.0959, "loss_lm": 0.015292497351765633, "loss_seg": 0.08065002784132957, "mean_token_accuracy": 0.9950903058052063, "num_tokens": 2490594189.0, "step": 5860 }, { "entropy": 0.018534226343035698, "epoch": 2.5650508808403547, "grad_norm": 3.390625, "learning_rate": 1.5064970221981593e-05, "loss": 0.104, "loss_lm": 0.01714518526569009, "loss_seg": 0.08683004509657621, "mean_token_accuracy": 0.9950397163629532, "num_tokens": 2491019553.0, "step": 5861 }, { "entropy": 0.018298173788934946, "epoch": 2.5654885654885655, "grad_norm": 5.84375, "learning_rate": 1.5062263129399026e-05, "loss": 0.1006, "loss_lm": 0.015298275044187903, "loss_seg": 0.08533520717173815, "mean_token_accuracy": 0.9952351152896881, "num_tokens": 2491444923.0, "step": 5862 }, { "entropy": 0.018226065672934055, "epoch": 2.5659262501367763, "grad_norm": 24.5, "learning_rate": 1.505955603681646e-05, "loss": 0.1066, "loss_lm": 0.015382193960249424, "loss_seg": 0.0912171769887209, "mean_token_accuracy": 0.9951639026403427, "num_tokens": 2491870214.0, "step": 5863 }, { "entropy": 0.018213222734630108, "epoch": 2.566363934784987, "grad_norm": 14.4375, "learning_rate": 1.5056848944233892e-05, "loss": 0.1025, "loss_lm": 0.01696275849826634, "loss_seg": 0.08557493425905704, "mean_token_accuracy": 0.9952239394187927, "num_tokens": 2492294517.0, "step": 5864 }, { "entropy": 0.017560492269694805, "epoch": 2.5668016194331984, "grad_norm": 3.84375, "learning_rate": 1.5054141851651326e-05, "loss": 0.086, "loss_lm": 0.013937513576820493, "loss_seg": 0.07205798476934433, "mean_token_accuracy": 0.9955111593008041, "num_tokens": 2492719060.0, "step": 5865 }, { "entropy": 0.018017739988863468, "epoch": 2.567239304081409, "grad_norm": 15.375, "learning_rate": 1.5051434759068761e-05, "loss": 0.1032, "loss_lm": 0.015311908209696412, "loss_seg": 0.08785245195031166, "mean_token_accuracy": 0.995234489440918, "num_tokens": 2493144162.0, "step": 5866 }, { "entropy": 0.01849482301622629, "epoch": 2.5676769887296205, "grad_norm": 6.6875, "learning_rate": 1.5048727666486195e-05, "loss": 0.1251, "loss_lm": 0.01635933993384242, "loss_seg": 0.10870850831270218, "mean_token_accuracy": 0.9950735419988632, "num_tokens": 2493569398.0, "step": 5867 }, { "entropy": 0.018451878800988197, "epoch": 2.5681146733778313, "grad_norm": 9.9375, "learning_rate": 1.5046020573903629e-05, "loss": 0.1238, "loss_lm": 0.015652942704036832, "loss_seg": 0.10811266675591469, "mean_token_accuracy": 0.9951574802398682, "num_tokens": 2493994978.0, "step": 5868 }, { "entropy": 0.01800748473033309, "epoch": 2.568552358026042, "grad_norm": 4.75, "learning_rate": 1.504331348132106e-05, "loss": 0.1383, "loss_lm": 0.017723070457577705, "loss_seg": 0.12056582700461149, "mean_token_accuracy": 0.9953222572803497, "num_tokens": 2494419479.0, "step": 5869 }, { "entropy": 0.017803761642426252, "epoch": 2.5689900426742533, "grad_norm": 3.296875, "learning_rate": 1.5040606388738494e-05, "loss": 0.0986, "loss_lm": 0.018514485796913505, "loss_seg": 0.08005864731967449, "mean_token_accuracy": 0.9953968077898026, "num_tokens": 2494843477.0, "step": 5870 }, { "entropy": 0.01867405930534005, "epoch": 2.569427727322464, "grad_norm": 7.25, "learning_rate": 1.503789929615593e-05, "loss": 0.0709, "loss_lm": 0.014427226968109608, "loss_seg": 0.05651731975376606, "mean_token_accuracy": 0.9951016902923584, "num_tokens": 2495268499.0, "step": 5871 }, { "entropy": 0.01816019881516695, "epoch": 2.569865411970675, "grad_norm": 12.8125, "learning_rate": 1.5035192203573364e-05, "loss": 0.1135, "loss_lm": 0.017036270815879107, "loss_seg": 0.09642856754362583, "mean_token_accuracy": 0.9952166527509689, "num_tokens": 2495693064.0, "step": 5872 }, { "entropy": 0.01833674218505621, "epoch": 2.5703030966188862, "grad_norm": 9.1875, "learning_rate": 1.5032485110990797e-05, "loss": 0.1634, "loss_lm": 0.015050797956064343, "loss_seg": 0.14833498746156693, "mean_token_accuracy": 0.9951852858066559, "num_tokens": 2496117968.0, "step": 5873 }, { "entropy": 0.018814447801560163, "epoch": 2.570740781267097, "grad_norm": 4.75, "learning_rate": 1.502977801840823e-05, "loss": 0.1007, "loss_lm": 0.014475545147433877, "loss_seg": 0.0862662922590971, "mean_token_accuracy": 0.9951580166816711, "num_tokens": 2496543814.0, "step": 5874 }, { "entropy": 0.0179170616902411, "epoch": 2.571178465915308, "grad_norm": 3.53125, "learning_rate": 1.5027070925825663e-05, "loss": 0.0945, "loss_lm": 0.016870191087946296, "loss_seg": 0.07761709112673998, "mean_token_accuracy": 0.99533811211586, "num_tokens": 2496968671.0, "step": 5875 }, { "entropy": 0.018076285254210234, "epoch": 2.571616150563519, "grad_norm": 7.75, "learning_rate": 1.5024363833243099e-05, "loss": 0.1251, "loss_lm": 0.015359887387603521, "loss_seg": 0.10971756093204021, "mean_token_accuracy": 0.995264932513237, "num_tokens": 2497394173.0, "step": 5876 }, { "entropy": 0.01816180581226945, "epoch": 2.57205383521173, "grad_norm": 9.5, "learning_rate": 1.5021656740660532e-05, "loss": 0.1111, "loss_lm": 0.01702068164013326, "loss_seg": 0.09403065592050552, "mean_token_accuracy": 0.9951704293489456, "num_tokens": 2497819855.0, "step": 5877 }, { "entropy": 0.018039239570498466, "epoch": 2.572491519859941, "grad_norm": 8.9375, "learning_rate": 1.5018949648077963e-05, "loss": 0.1117, "loss_lm": 0.016195994336158037, "loss_seg": 0.09546062536537647, "mean_token_accuracy": 0.9951651841402054, "num_tokens": 2498244871.0, "step": 5878 }, { "entropy": 0.018086062278598547, "epoch": 2.572929204508152, "grad_norm": 3.875, "learning_rate": 1.5016242555495398e-05, "loss": 0.0784, "loss_lm": 0.01682382565923035, "loss_seg": 0.06156854145228863, "mean_token_accuracy": 0.9952385127544403, "num_tokens": 2498669887.0, "step": 5879 }, { "entropy": 0.018376069609075785, "epoch": 2.573366889156363, "grad_norm": 3.796875, "learning_rate": 1.5013535462912832e-05, "loss": 0.1034, "loss_lm": 0.014990190975368023, "loss_seg": 0.08842494431883097, "mean_token_accuracy": 0.9951474070549011, "num_tokens": 2499095397.0, "step": 5880 }, { "entropy": 0.018544350750744343, "epoch": 2.5738045738045736, "grad_norm": 5.0, "learning_rate": 1.5010828370330265e-05, "loss": 0.1192, "loss_lm": 0.01624099351465702, "loss_seg": 0.10292309522628784, "mean_token_accuracy": 0.9951634109020233, "num_tokens": 2499520444.0, "step": 5881 }, { "entropy": 0.01788450451567769, "epoch": 2.574242258452785, "grad_norm": 4.28125, "learning_rate": 1.5008121277747701e-05, "loss": 0.0921, "loss_lm": 0.01620822469703853, "loss_seg": 0.07587531954050064, "mean_token_accuracy": 0.9952919483184814, "num_tokens": 2499945179.0, "step": 5882 }, { "entropy": 0.017997694201767445, "epoch": 2.5746799431009957, "grad_norm": 5.15625, "learning_rate": 1.5005414185165131e-05, "loss": 0.1357, "loss_lm": 0.01431907038204372, "loss_seg": 0.12142746150493622, "mean_token_accuracy": 0.9953223019838333, "num_tokens": 2500369487.0, "step": 5883 }, { "entropy": 0.018340603448450565, "epoch": 2.575117627749207, "grad_norm": 13.3125, "learning_rate": 1.5002707092582567e-05, "loss": 0.1497, "loss_lm": 0.015966979786753654, "loss_seg": 0.13369574025273323, "mean_token_accuracy": 0.99527707695961, "num_tokens": 2500794998.0, "step": 5884 }, { "entropy": 0.018151005264371634, "epoch": 2.5755553123974178, "grad_norm": 9.1875, "learning_rate": 1.5e-05, "loss": 0.1195, "loss_lm": 0.01545877498574555, "loss_seg": 0.10408056247979403, "mean_token_accuracy": 0.9951909184455872, "num_tokens": 2501219373.0, "step": 5885 }, { "entropy": 0.01783158164471388, "epoch": 2.5759929970456286, "grad_norm": 3.171875, "learning_rate": 1.4997292907417434e-05, "loss": 0.1395, "loss_lm": 0.015990828862413764, "loss_seg": 0.12354802992194891, "mean_token_accuracy": 0.9952615052461624, "num_tokens": 2501644350.0, "step": 5886 }, { "entropy": 0.018136365804821253, "epoch": 2.5764306816938394, "grad_norm": 6.71875, "learning_rate": 1.4994585814834868e-05, "loss": 0.1148, "loss_lm": 0.013273486169055104, "loss_seg": 0.10153559036552906, "mean_token_accuracy": 0.9953227043151855, "num_tokens": 2502068875.0, "step": 5887 }, { "entropy": 0.01818171003833413, "epoch": 2.5768683663420506, "grad_norm": 8.75, "learning_rate": 1.4991878722252302e-05, "loss": 0.104, "loss_lm": 0.016910986742004752, "loss_seg": 0.08713126555085182, "mean_token_accuracy": 0.9952498227357864, "num_tokens": 2502493243.0, "step": 5888 }, { "entropy": 0.018738162703812122, "epoch": 2.5773060509902614, "grad_norm": 6.46875, "learning_rate": 1.4989171629669735e-05, "loss": 0.1088, "loss_lm": 0.016538696130737662, "loss_seg": 0.09221315197646618, "mean_token_accuracy": 0.9950447678565979, "num_tokens": 2502918197.0, "step": 5889 }, { "entropy": 0.017634264193475246, "epoch": 2.5777437356384727, "grad_norm": 6.25, "learning_rate": 1.4986464537087169e-05, "loss": 0.1153, "loss_lm": 0.016429109033197165, "loss_seg": 0.09885524027049541, "mean_token_accuracy": 0.9952492862939835, "num_tokens": 2503342723.0, "step": 5890 }, { "entropy": 0.018244009464979172, "epoch": 2.5781814202866835, "grad_norm": 28.0, "learning_rate": 1.4983757444504601e-05, "loss": 0.131, "loss_lm": 0.014397491235285997, "loss_seg": 0.1166507862508297, "mean_token_accuracy": 0.9952370524406433, "num_tokens": 2503767628.0, "step": 5891 }, { "entropy": 0.018003477714955807, "epoch": 2.5786191049348943, "grad_norm": 3.265625, "learning_rate": 1.4981050351922036e-05, "loss": 0.1232, "loss_lm": 0.0155434540938586, "loss_seg": 0.10767516866326332, "mean_token_accuracy": 0.9953583031892776, "num_tokens": 2504193065.0, "step": 5892 }, { "entropy": 0.018023962154984474, "epoch": 2.579056789583105, "grad_norm": 6.71875, "learning_rate": 1.497834325933947e-05, "loss": 0.0975, "loss_lm": 0.014579376904293895, "loss_seg": 0.08293849229812622, "mean_token_accuracy": 0.9952195286750793, "num_tokens": 2504618132.0, "step": 5893 }, { "entropy": 0.018066600896418095, "epoch": 2.5794944742313164, "grad_norm": 7.09375, "learning_rate": 1.4975636166756902e-05, "loss": 0.1256, "loss_lm": 0.018727473448961973, "loss_seg": 0.10684000141918659, "mean_token_accuracy": 0.9952784776687622, "num_tokens": 2505042983.0, "step": 5894 }, { "entropy": 0.018600855953991413, "epoch": 2.579932158879527, "grad_norm": 18.5, "learning_rate": 1.4972929074174338e-05, "loss": 0.1023, "loss_lm": 0.014750283677130938, "loss_seg": 0.08754954021424055, "mean_token_accuracy": 0.9950791299343109, "num_tokens": 2505468281.0, "step": 5895 }, { "entropy": 0.017919995822012424, "epoch": 2.5803698435277385, "grad_norm": 4.65625, "learning_rate": 1.497022198159177e-05, "loss": 0.1415, "loss_lm": 0.015367035754024982, "loss_seg": 0.1260955510661006, "mean_token_accuracy": 0.995324045419693, "num_tokens": 2505893800.0, "step": 5896 }, { "entropy": 0.018328357487916946, "epoch": 2.5808075281759493, "grad_norm": 4.15625, "learning_rate": 1.4967514889009205e-05, "loss": 0.1111, "loss_lm": 0.016938211396336555, "loss_seg": 0.09418560937047005, "mean_token_accuracy": 0.9951456487178802, "num_tokens": 2506318763.0, "step": 5897 }, { "entropy": 0.018872967921197414, "epoch": 2.58124521282416, "grad_norm": 14.25, "learning_rate": 1.4964807796426639e-05, "loss": 0.1197, "loss_lm": 0.01862900983542204, "loss_seg": 0.10109092853963375, "mean_token_accuracy": 0.9948894679546356, "num_tokens": 2506744155.0, "step": 5898 }, { "entropy": 0.01824840810149908, "epoch": 2.581682897472371, "grad_norm": 8.375, "learning_rate": 1.4962100703844071e-05, "loss": 0.0795, "loss_lm": 0.014538646675646305, "loss_seg": 0.06500426679849625, "mean_token_accuracy": 0.9951611906290054, "num_tokens": 2507168623.0, "step": 5899 }, { "entropy": 0.018129384610801935, "epoch": 2.582120582120582, "grad_norm": 28.5, "learning_rate": 1.4959393611261506e-05, "loss": 0.1201, "loss_lm": 0.016296258429065347, "loss_seg": 0.10380076989531517, "mean_token_accuracy": 0.995167151093483, "num_tokens": 2507592784.0, "step": 5900 }, { "entropy": 0.017983268946409225, "epoch": 2.582558266768793, "grad_norm": 8.8125, "learning_rate": 1.4956686518678938e-05, "loss": 0.0763, "loss_lm": 0.015434183878824115, "loss_seg": 0.060901396907866, "mean_token_accuracy": 0.9951932281255722, "num_tokens": 2508018419.0, "step": 5901 }, { "entropy": 0.01787312887609005, "epoch": 2.582995951417004, "grad_norm": 4.8125, "learning_rate": 1.4953979426096374e-05, "loss": 0.1132, "loss_lm": 0.017152808140963316, "loss_seg": 0.09605186898261309, "mean_token_accuracy": 0.995181530714035, "num_tokens": 2508442543.0, "step": 5902 }, { "entropy": 0.017873726785182953, "epoch": 2.583433636065215, "grad_norm": 6.3125, "learning_rate": 1.4951272333513806e-05, "loss": 0.1162, "loss_lm": 0.01500891917385161, "loss_seg": 0.10122670326381922, "mean_token_accuracy": 0.9953596293926239, "num_tokens": 2508867483.0, "step": 5903 }, { "entropy": 0.018687449395656586, "epoch": 2.583871320713426, "grad_norm": 4.375, "learning_rate": 1.494856524093124e-05, "loss": 0.0984, "loss_lm": 0.016862909542396665, "loss_seg": 0.0815181564539671, "mean_token_accuracy": 0.9951793998479843, "num_tokens": 2509292288.0, "step": 5904 }, { "entropy": 0.01873208023607731, "epoch": 2.584309005361637, "grad_norm": 9.8125, "learning_rate": 1.4945858148348675e-05, "loss": 0.1076, "loss_lm": 0.016693782526999712, "loss_seg": 0.09092948585748672, "mean_token_accuracy": 0.9951970130205154, "num_tokens": 2509717171.0, "step": 5905 }, { "entropy": 0.018515508621931076, "epoch": 2.584746690009848, "grad_norm": 6.65625, "learning_rate": 1.4943151055766107e-05, "loss": 0.0923, "loss_lm": 0.016033983323723078, "loss_seg": 0.07627623248845339, "mean_token_accuracy": 0.9951075911521912, "num_tokens": 2510142638.0, "step": 5906 }, { "entropy": 0.018594580702483654, "epoch": 2.5851843746580587, "grad_norm": 6.0, "learning_rate": 1.4940443963183542e-05, "loss": 0.1088, "loss_lm": 0.016162960324436426, "loss_seg": 0.09266927745193243, "mean_token_accuracy": 0.99514539539814, "num_tokens": 2510567396.0, "step": 5907 }, { "entropy": 0.018527961801737547, "epoch": 2.58562205930627, "grad_norm": 2.640625, "learning_rate": 1.4937736870600974e-05, "loss": 0.1266, "loss_lm": 0.018021825700998306, "loss_seg": 0.10853230580687523, "mean_token_accuracy": 0.9952435940504074, "num_tokens": 2510992054.0, "step": 5908 }, { "entropy": 0.01858410146087408, "epoch": 2.586059743954481, "grad_norm": 12.5, "learning_rate": 1.4935029778018408e-05, "loss": 0.1093, "loss_lm": 0.01593600562773645, "loss_seg": 0.09340386092662811, "mean_token_accuracy": 0.9951501190662384, "num_tokens": 2511417379.0, "step": 5909 }, { "entropy": 0.01801946386694908, "epoch": 2.5864974286026916, "grad_norm": 3.140625, "learning_rate": 1.4932322685435844e-05, "loss": 0.1076, "loss_lm": 0.015749526908621192, "loss_seg": 0.09184329770505428, "mean_token_accuracy": 0.995224267244339, "num_tokens": 2511842171.0, "step": 5910 }, { "entropy": 0.01831116806715727, "epoch": 2.586935113250903, "grad_norm": 43.75, "learning_rate": 1.4929615592853276e-05, "loss": 0.1134, "loss_lm": 0.014792356174439192, "loss_seg": 0.09860705025494099, "mean_token_accuracy": 0.9952626675367355, "num_tokens": 2512267262.0, "step": 5911 }, { "entropy": 0.01801826525479555, "epoch": 2.5873727978991137, "grad_norm": 2.796875, "learning_rate": 1.492690850027071e-05, "loss": 0.0888, "loss_lm": 0.015041170874610543, "loss_seg": 0.0737557876855135, "mean_token_accuracy": 0.9953216165304184, "num_tokens": 2512691726.0, "step": 5912 }, { "entropy": 0.018396184779703617, "epoch": 2.5878104825473245, "grad_norm": 4.71875, "learning_rate": 1.4924201407688143e-05, "loss": 0.1078, "loss_lm": 0.01660760957747698, "loss_seg": 0.09116831514984369, "mean_token_accuracy": 0.9951625317335129, "num_tokens": 2513117123.0, "step": 5913 }, { "entropy": 0.018518646247684956, "epoch": 2.5882481671955357, "grad_norm": 5.96875, "learning_rate": 1.4921494315105577e-05, "loss": 0.1055, "loss_lm": 0.015545964473858476, "loss_seg": 0.08995915204286575, "mean_token_accuracy": 0.9951944500207901, "num_tokens": 2513542045.0, "step": 5914 }, { "entropy": 0.018090801779180765, "epoch": 2.5886858518437466, "grad_norm": 2.921875, "learning_rate": 1.491878722252301e-05, "loss": 0.0963, "loss_lm": 0.01603289833292365, "loss_seg": 0.08024266175925732, "mean_token_accuracy": 0.9951318651437759, "num_tokens": 2513966399.0, "step": 5915 }, { "entropy": 0.01737878331914544, "epoch": 2.5891235364919574, "grad_norm": 28.625, "learning_rate": 1.4916080129940444e-05, "loss": 0.1126, "loss_lm": 0.015377883799374104, "loss_seg": 0.09720615297555923, "mean_token_accuracy": 0.9954220205545425, "num_tokens": 2514391851.0, "step": 5916 }, { "entropy": 0.01820944296196103, "epoch": 2.5895612211401686, "grad_norm": 5.28125, "learning_rate": 1.4913373037357878e-05, "loss": 0.0884, "loss_lm": 0.016629263991490006, "loss_seg": 0.07179134152829647, "mean_token_accuracy": 0.9952742606401443, "num_tokens": 2514817483.0, "step": 5917 }, { "entropy": 0.018517368007451296, "epoch": 2.5899989057883794, "grad_norm": 7.78125, "learning_rate": 1.4910665944775312e-05, "loss": 0.1162, "loss_lm": 0.017667133128270507, "loss_seg": 0.09850589744746685, "mean_token_accuracy": 0.9951649457216263, "num_tokens": 2515243095.0, "step": 5918 }, { "entropy": 0.018292036838829517, "epoch": 2.5904365904365907, "grad_norm": 11.25, "learning_rate": 1.4907958852192745e-05, "loss": 0.1288, "loss_lm": 0.015734339132905006, "loss_seg": 0.11309869773685932, "mean_token_accuracy": 0.9952899068593979, "num_tokens": 2515669174.0, "step": 5919 }, { "entropy": 0.017964513041079044, "epoch": 2.5908742750848015, "grad_norm": 9.125, "learning_rate": 1.4905251759610179e-05, "loss": 0.0907, "loss_lm": 0.016290232306346297, "loss_seg": 0.07438401132822037, "mean_token_accuracy": 0.9952325820922852, "num_tokens": 2516094085.0, "step": 5920 }, { "entropy": 0.017597199883311987, "epoch": 2.5913119597330123, "grad_norm": 2.953125, "learning_rate": 1.4902544667027613e-05, "loss": 0.1171, "loss_lm": 0.01531962282024324, "loss_seg": 0.10173086822032928, "mean_token_accuracy": 0.9953053295612335, "num_tokens": 2516518368.0, "step": 5921 }, { "entropy": 0.018446244299411774, "epoch": 2.591749644381223, "grad_norm": 26.125, "learning_rate": 1.4899837574445047e-05, "loss": 0.1192, "loss_lm": 0.016354118240997195, "loss_seg": 0.10283050499856472, "mean_token_accuracy": 0.9950573295354843, "num_tokens": 2516943425.0, "step": 5922 }, { "entropy": 0.01870970753952861, "epoch": 2.5921873290294344, "grad_norm": 16.75, "learning_rate": 1.489713048186248e-05, "loss": 0.1195, "loss_lm": 0.0176838138140738, "loss_seg": 0.10182351898401976, "mean_token_accuracy": 0.99498550593853, "num_tokens": 2517367774.0, "step": 5923 }, { "entropy": 0.018203436862677336, "epoch": 2.592625013677645, "grad_norm": 5.125, "learning_rate": 1.4894423389279914e-05, "loss": 0.1049, "loss_lm": 0.018421486020088196, "loss_seg": 0.08652023039758205, "mean_token_accuracy": 0.995221734046936, "num_tokens": 2517793171.0, "step": 5924 }, { "entropy": 0.017860389780253172, "epoch": 2.5930626983258565, "grad_norm": 11.625, "learning_rate": 1.4891716296697348e-05, "loss": 0.1057, "loss_lm": 0.014639715198427439, "loss_seg": 0.0910627506673336, "mean_token_accuracy": 0.9952936768531799, "num_tokens": 2518218302.0, "step": 5925 }, { "entropy": 0.0182962566614151, "epoch": 2.5935003829740673, "grad_norm": 15.5, "learning_rate": 1.4889009204114781e-05, "loss": 0.1536, "loss_lm": 0.013714928645640612, "loss_seg": 0.13983940705657005, "mean_token_accuracy": 0.9951339066028595, "num_tokens": 2518643246.0, "step": 5926 }, { "entropy": 0.017688220366835594, "epoch": 2.593938067622278, "grad_norm": 6.46875, "learning_rate": 1.4886302111532213e-05, "loss": 0.117, "loss_lm": 0.01435281615704298, "loss_seg": 0.10266808606684208, "mean_token_accuracy": 0.9952617883682251, "num_tokens": 2519068443.0, "step": 5927 }, { "entropy": 0.018586676567792892, "epoch": 2.594375752270489, "grad_norm": 9.375, "learning_rate": 1.4883595018949649e-05, "loss": 0.1038, "loss_lm": 0.01662583975121379, "loss_seg": 0.08717571943998337, "mean_token_accuracy": 0.9951611161231995, "num_tokens": 2519493719.0, "step": 5928 }, { "entropy": 0.018543014768511057, "epoch": 2.5948134369187, "grad_norm": 12.625, "learning_rate": 1.4880887926367083e-05, "loss": 0.1657, "loss_lm": 0.019012571778148413, "loss_seg": 0.14666848629713058, "mean_token_accuracy": 0.9952632188796997, "num_tokens": 2519918662.0, "step": 5929 }, { "entropy": 0.018281350377947092, "epoch": 2.595251121566911, "grad_norm": 3.921875, "learning_rate": 1.4878180833784515e-05, "loss": 0.0967, "loss_lm": 0.01597011717967689, "loss_seg": 0.08071232214570045, "mean_token_accuracy": 0.9951379597187042, "num_tokens": 2520344173.0, "step": 5930 }, { "entropy": 0.01793917315080762, "epoch": 2.595688806215122, "grad_norm": 7.9375, "learning_rate": 1.487547374120195e-05, "loss": 0.1904, "loss_lm": 0.01574915018863976, "loss_seg": 0.17461902555078268, "mean_token_accuracy": 0.9951951652765274, "num_tokens": 2520769317.0, "step": 5931 }, { "entropy": 0.01861380971968174, "epoch": 2.596126490863333, "grad_norm": 7.46875, "learning_rate": 1.4872766648619382e-05, "loss": 0.1122, "loss_lm": 0.01763651124201715, "loss_seg": 0.09456478804349899, "mean_token_accuracy": 0.9950346201658249, "num_tokens": 2521194825.0, "step": 5932 }, { "entropy": 0.018868899904191494, "epoch": 2.596564175511544, "grad_norm": 8.9375, "learning_rate": 1.4870059556036818e-05, "loss": 0.1047, "loss_lm": 0.015929643996059895, "loss_seg": 0.08880164846777916, "mean_token_accuracy": 0.9949963688850403, "num_tokens": 2521620085.0, "step": 5933 }, { "entropy": 0.018242666963487864, "epoch": 2.5970018601597546, "grad_norm": 6.8125, "learning_rate": 1.4867352463454251e-05, "loss": 0.1298, "loss_lm": 0.01661634142510593, "loss_seg": 0.11314649879932404, "mean_token_accuracy": 0.9950876832008362, "num_tokens": 2522044900.0, "step": 5934 }, { "entropy": 0.018033599480986595, "epoch": 2.597439544807966, "grad_norm": 3.8125, "learning_rate": 1.4864645370871683e-05, "loss": 0.1072, "loss_lm": 0.017105580074712634, "loss_seg": 0.09005874395370483, "mean_token_accuracy": 0.9953224211931229, "num_tokens": 2522469658.0, "step": 5935 }, { "entropy": 0.018135899677872658, "epoch": 2.5978772294561767, "grad_norm": 5.25, "learning_rate": 1.4861938278289119e-05, "loss": 0.1119, "loss_lm": 0.01759276888333261, "loss_seg": 0.09435307513922453, "mean_token_accuracy": 0.9952827244997025, "num_tokens": 2522894802.0, "step": 5936 }, { "entropy": 0.018518419470638037, "epoch": 2.598314914104388, "grad_norm": 4.96875, "learning_rate": 1.485923118570655e-05, "loss": 0.1118, "loss_lm": 0.01687234523706138, "loss_seg": 0.09494947455823421, "mean_token_accuracy": 0.9951245933771133, "num_tokens": 2523320188.0, "step": 5937 }, { "entropy": 0.018662983551621437, "epoch": 2.598752598752599, "grad_norm": 4.90625, "learning_rate": 1.4856524093123986e-05, "loss": 0.0885, "loss_lm": 0.015690998872742057, "loss_seg": 0.0727792652323842, "mean_token_accuracy": 0.9951731860637665, "num_tokens": 2523745277.0, "step": 5938 }, { "entropy": 0.018387666437774897, "epoch": 2.5991902834008096, "grad_norm": 3.6875, "learning_rate": 1.4853817000541418e-05, "loss": 0.0839, "loss_lm": 0.015879529528319836, "loss_seg": 0.06804834771901369, "mean_token_accuracy": 0.9951847195625305, "num_tokens": 2524170555.0, "step": 5939 }, { "entropy": 0.018874713219702244, "epoch": 2.5996279680490204, "grad_norm": 9.9375, "learning_rate": 1.4851109907958852e-05, "loss": 0.1405, "loss_lm": 0.017214807914569974, "loss_seg": 0.12325587868690491, "mean_token_accuracy": 0.9951535761356354, "num_tokens": 2524595187.0, "step": 5940 }, { "entropy": 0.018007606733590364, "epoch": 2.6000656526972317, "grad_norm": 20.125, "learning_rate": 1.4848402815376287e-05, "loss": 0.1353, "loss_lm": 0.016575578832998872, "loss_seg": 0.11874430999159813, "mean_token_accuracy": 0.9952303618192673, "num_tokens": 2525020547.0, "step": 5941 }, { "entropy": 0.01753252185881138, "epoch": 2.6005033373454425, "grad_norm": 5.875, "learning_rate": 1.484569572279372e-05, "loss": 0.114, "loss_lm": 0.016278109047561884, "loss_seg": 0.09776824899017811, "mean_token_accuracy": 0.9954299777746201, "num_tokens": 2525445655.0, "step": 5942 }, { "entropy": 0.01827868353575468, "epoch": 2.6009410219936537, "grad_norm": 7.09375, "learning_rate": 1.4842988630211153e-05, "loss": 0.1207, "loss_lm": 0.015506001422181726, "loss_seg": 0.10522662289440632, "mean_token_accuracy": 0.9951782524585724, "num_tokens": 2525870579.0, "step": 5943 }, { "entropy": 0.01861478155478835, "epoch": 2.6013787066418645, "grad_norm": 6.0, "learning_rate": 1.4840281537628587e-05, "loss": 0.0912, "loss_lm": 0.013993884902447462, "loss_seg": 0.07721035927534103, "mean_token_accuracy": 0.9951894879341125, "num_tokens": 2526296264.0, "step": 5944 }, { "entropy": 0.018090612720698118, "epoch": 2.6018163912900754, "grad_norm": 4.3125, "learning_rate": 1.483757444504602e-05, "loss": 0.0927, "loss_lm": 0.014735493110492826, "loss_seg": 0.07795990910381079, "mean_token_accuracy": 0.9951762408018112, "num_tokens": 2526721987.0, "step": 5945 }, { "entropy": 0.018271924927830696, "epoch": 2.6022540759382866, "grad_norm": 37.75, "learning_rate": 1.4834867352463456e-05, "loss": 0.1205, "loss_lm": 0.017807587748393416, "loss_seg": 0.10273675713688135, "mean_token_accuracy": 0.9951009899377823, "num_tokens": 2527147145.0, "step": 5946 }, { "entropy": 0.017698441166430712, "epoch": 2.6026917605864974, "grad_norm": 4.90625, "learning_rate": 1.4832160259880888e-05, "loss": 0.1018, "loss_lm": 0.014627195661887527, "loss_seg": 0.08717882633209229, "mean_token_accuracy": 0.9953405559062958, "num_tokens": 2527572014.0, "step": 5947 }, { "entropy": 0.018303956370800734, "epoch": 2.6031294452347082, "grad_norm": 3.296875, "learning_rate": 1.4829453167298322e-05, "loss": 0.1027, "loss_lm": 0.013408202910795808, "loss_seg": 0.08925499301403761, "mean_token_accuracy": 0.9952093362808228, "num_tokens": 2527996754.0, "step": 5948 }, { "entropy": 0.017815252300351858, "epoch": 2.6035671298829195, "grad_norm": 3.796875, "learning_rate": 1.4826746074715755e-05, "loss": 0.1165, "loss_lm": 0.016676301136612892, "loss_seg": 0.09980414249002934, "mean_token_accuracy": 0.9953654557466507, "num_tokens": 2528420905.0, "step": 5949 }, { "entropy": 0.01834366424009204, "epoch": 2.6040048145311303, "grad_norm": 11.875, "learning_rate": 1.482403898213319e-05, "loss": 0.1263, "loss_lm": 0.016207640059292316, "loss_seg": 0.11013424769043922, "mean_token_accuracy": 0.9952187836170197, "num_tokens": 2528845829.0, "step": 5950 }, { "entropy": 0.01807768177241087, "epoch": 2.604442499179341, "grad_norm": 19.25, "learning_rate": 1.4821331889550623e-05, "loss": 0.0976, "loss_lm": 0.01640785811468959, "loss_seg": 0.08119759894907475, "mean_token_accuracy": 0.9952788203954697, "num_tokens": 2529270663.0, "step": 5951 }, { "entropy": 0.01824282854795456, "epoch": 2.6048801838275524, "grad_norm": 18.125, "learning_rate": 1.4818624796968057e-05, "loss": 0.1086, "loss_lm": 0.01633279724046588, "loss_seg": 0.0922657260671258, "mean_token_accuracy": 0.9952618330717087, "num_tokens": 2529694837.0, "step": 5952 }, { "entropy": 0.0176960495300591, "epoch": 2.605317868475763, "grad_norm": 16.0, "learning_rate": 1.481591770438549e-05, "loss": 0.0878, "loss_lm": 0.014429179718717933, "loss_seg": 0.07334958016872406, "mean_token_accuracy": 0.9954008162021637, "num_tokens": 2530119416.0, "step": 5953 }, { "entropy": 0.01869413862004876, "epoch": 2.6057555531239744, "grad_norm": 3.671875, "learning_rate": 1.4813210611802924e-05, "loss": 0.075, "loss_lm": 0.01796369021758437, "loss_seg": 0.0570055702701211, "mean_token_accuracy": 0.9950816929340363, "num_tokens": 2530544346.0, "step": 5954 }, { "entropy": 0.018032765947282314, "epoch": 2.6061932377721853, "grad_norm": 10.9375, "learning_rate": 1.4810503519220358e-05, "loss": 0.1267, "loss_lm": 0.01590002211742103, "loss_seg": 0.11079539358615875, "mean_token_accuracy": 0.9953619837760925, "num_tokens": 2530969265.0, "step": 5955 }, { "entropy": 0.018427243921905756, "epoch": 2.606630922420396, "grad_norm": 3.578125, "learning_rate": 1.4807796426637792e-05, "loss": 0.1152, "loss_lm": 0.01728256233036518, "loss_seg": 0.09793742373585701, "mean_token_accuracy": 0.9951876848936081, "num_tokens": 2531394379.0, "step": 5956 }, { "entropy": 0.01821135636419058, "epoch": 2.607068607068607, "grad_norm": 6.5, "learning_rate": 1.4805089334055225e-05, "loss": 0.1025, "loss_lm": 0.015708077698946, "loss_seg": 0.0867422055453062, "mean_token_accuracy": 0.9951157569885254, "num_tokens": 2531818143.0, "step": 5957 }, { "entropy": 0.018124505411833525, "epoch": 2.607506291716818, "grad_norm": 20.5, "learning_rate": 1.4802382241472659e-05, "loss": 0.0948, "loss_lm": 0.018944486044347286, "loss_seg": 0.07580914534628391, "mean_token_accuracy": 0.9953444004058838, "num_tokens": 2532243379.0, "step": 5958 }, { "entropy": 0.018553550820797682, "epoch": 2.607943976365029, "grad_norm": 5.375, "learning_rate": 1.4799675148890093e-05, "loss": 0.1059, "loss_lm": 0.0172306252643466, "loss_seg": 0.08866881020367146, "mean_token_accuracy": 0.9951334446668625, "num_tokens": 2532669123.0, "step": 5959 }, { "entropy": 0.018040555994957685, "epoch": 2.60838166101324, "grad_norm": 3.1875, "learning_rate": 1.4796968056307526e-05, "loss": 0.1283, "loss_lm": 0.015974100213497877, "loss_seg": 0.11231970600783825, "mean_token_accuracy": 0.9952585101127625, "num_tokens": 2533094485.0, "step": 5960 }, { "entropy": 0.018180495128035545, "epoch": 2.608819345661451, "grad_norm": 4.0625, "learning_rate": 1.4794260963724958e-05, "loss": 0.0962, "loss_lm": 0.012288783211261034, "loss_seg": 0.08390477485954762, "mean_token_accuracy": 0.9951663166284561, "num_tokens": 2533520225.0, "step": 5961 }, { "entropy": 0.01834846194833517, "epoch": 2.609257030309662, "grad_norm": 4.53125, "learning_rate": 1.4791553871142394e-05, "loss": 0.0878, "loss_lm": 0.0168974450789392, "loss_seg": 0.07088312320411205, "mean_token_accuracy": 0.9951407611370087, "num_tokens": 2533945161.0, "step": 5962 }, { "entropy": 0.018031961750239134, "epoch": 2.6096947149578726, "grad_norm": 3.078125, "learning_rate": 1.4788846778559826e-05, "loss": 0.0886, "loss_lm": 0.016233520349487662, "loss_seg": 0.07234570663422346, "mean_token_accuracy": 0.9952877014875412, "num_tokens": 2534369535.0, "step": 5963 }, { "entropy": 0.017520918045192957, "epoch": 2.610132399606084, "grad_norm": 7.09375, "learning_rate": 1.4786139685977261e-05, "loss": 0.0812, "loss_lm": 0.016110803233459592, "loss_seg": 0.06505907699465752, "mean_token_accuracy": 0.9953600764274597, "num_tokens": 2534794238.0, "step": 5964 }, { "entropy": 0.018453634809702635, "epoch": 2.6105700842542947, "grad_norm": 15.125, "learning_rate": 1.4783432593394695e-05, "loss": 0.1189, "loss_lm": 0.016040120273828506, "loss_seg": 0.10286909714341164, "mean_token_accuracy": 0.9950213879346848, "num_tokens": 2535219652.0, "step": 5965 }, { "entropy": 0.017644649371504784, "epoch": 2.611007768902506, "grad_norm": 3.0, "learning_rate": 1.4780725500812127e-05, "loss": 0.0878, "loss_lm": 0.016026296420022845, "loss_seg": 0.07181739993393421, "mean_token_accuracy": 0.9954642206430435, "num_tokens": 2535644218.0, "step": 5966 }, { "entropy": 0.018080283887684345, "epoch": 2.6114454535507168, "grad_norm": 9.875, "learning_rate": 1.4778018408229563e-05, "loss": 0.1149, "loss_lm": 0.017529160948470235, "loss_seg": 0.09732541255652905, "mean_token_accuracy": 0.9952278882265091, "num_tokens": 2536069576.0, "step": 5967 }, { "entropy": 0.017857844941318035, "epoch": 2.6118831381989276, "grad_norm": 3.171875, "learning_rate": 1.4775311315646995e-05, "loss": 0.0874, "loss_lm": 0.015834074933081865, "loss_seg": 0.07158126216381788, "mean_token_accuracy": 0.9953583925962448, "num_tokens": 2536495226.0, "step": 5968 }, { "entropy": 0.01814425829797983, "epoch": 2.6123208228471384, "grad_norm": 4.90625, "learning_rate": 1.477260422306443e-05, "loss": 0.1134, "loss_lm": 0.015919561497867107, "loss_seg": 0.09751426801085472, "mean_token_accuracy": 0.9951762408018112, "num_tokens": 2536919969.0, "step": 5969 }, { "entropy": 0.017753413412719965, "epoch": 2.6127585074953497, "grad_norm": 4.0625, "learning_rate": 1.4769897130481864e-05, "loss": 0.1237, "loss_lm": 0.015072733163833618, "loss_seg": 0.10860628262162209, "mean_token_accuracy": 0.9953924119472504, "num_tokens": 2537345518.0, "step": 5970 }, { "entropy": 0.017757474910467863, "epoch": 2.6131961921435605, "grad_norm": 4.34375, "learning_rate": 1.4767190037899296e-05, "loss": 0.0949, "loss_lm": 0.01666207262314856, "loss_seg": 0.07825735583901405, "mean_token_accuracy": 0.9953628033399582, "num_tokens": 2537770727.0, "step": 5971 }, { "entropy": 0.017930603120476007, "epoch": 2.6136338767917717, "grad_norm": 6.125, "learning_rate": 1.4764482945316731e-05, "loss": 0.1063, "loss_lm": 0.014556554611772299, "loss_seg": 0.09170035552233458, "mean_token_accuracy": 0.9953004568815231, "num_tokens": 2538195828.0, "step": 5972 }, { "entropy": 0.01797208096832037, "epoch": 2.6140715614399825, "grad_norm": 32.75, "learning_rate": 1.4761775852734163e-05, "loss": 0.088, "loss_lm": 0.017708462197333574, "loss_seg": 0.07026132941246033, "mean_token_accuracy": 0.9952400326728821, "num_tokens": 2538620792.0, "step": 5973 }, { "entropy": 0.01812924351543188, "epoch": 2.6145092460881934, "grad_norm": 4.0625, "learning_rate": 1.4759068760151599e-05, "loss": 0.1115, "loss_lm": 0.018302070908248425, "loss_seg": 0.09316530451178551, "mean_token_accuracy": 0.9951556026935577, "num_tokens": 2539045589.0, "step": 5974 }, { "entropy": 0.017935811541974545, "epoch": 2.614946930736404, "grad_norm": 12.0625, "learning_rate": 1.475636166756903e-05, "loss": 0.0884, "loss_lm": 0.014354124898090959, "loss_seg": 0.07405326794832945, "mean_token_accuracy": 0.9953486919403076, "num_tokens": 2539470138.0, "step": 5975 }, { "entropy": 0.017877044156193733, "epoch": 2.6153846153846154, "grad_norm": 6.125, "learning_rate": 1.4753654574986464e-05, "loss": 0.1233, "loss_lm": 0.01719812978990376, "loss_seg": 0.10606096312403679, "mean_token_accuracy": 0.9952220022678375, "num_tokens": 2539895195.0, "step": 5976 }, { "entropy": 0.01777320960536599, "epoch": 2.6158223000328262, "grad_norm": 8.375, "learning_rate": 1.47509474824039e-05, "loss": 0.1095, "loss_lm": 0.01679693814367056, "loss_seg": 0.09269187040627003, "mean_token_accuracy": 0.9953725039958954, "num_tokens": 2540321281.0, "step": 5977 }, { "entropy": 0.01840713294222951, "epoch": 2.6162599846810375, "grad_norm": 17.75, "learning_rate": 1.4748240389821332e-05, "loss": 0.1231, "loss_lm": 0.017607472836971283, "loss_seg": 0.10547738336026669, "mean_token_accuracy": 0.9951242804527283, "num_tokens": 2540745578.0, "step": 5978 }, { "entropy": 0.018344426527619362, "epoch": 2.6166976693292483, "grad_norm": 3.703125, "learning_rate": 1.4745533297238766e-05, "loss": 0.1123, "loss_lm": 0.017603212734684348, "loss_seg": 0.09467194229364395, "mean_token_accuracy": 0.9951831698417664, "num_tokens": 2541171344.0, "step": 5979 }, { "entropy": 0.01788888545706868, "epoch": 2.617135353977459, "grad_norm": 5.875, "learning_rate": 1.47428262046562e-05, "loss": 0.1344, "loss_lm": 0.015077077550813556, "loss_seg": 0.11937079206109047, "mean_token_accuracy": 0.9953515529632568, "num_tokens": 2541596383.0, "step": 5980 }, { "entropy": 0.01748021738603711, "epoch": 2.6175730386256704, "grad_norm": 9.125, "learning_rate": 1.4740119112073633e-05, "loss": 0.0959, "loss_lm": 0.014354179846122861, "loss_seg": 0.08155299909412861, "mean_token_accuracy": 0.9953409135341644, "num_tokens": 2542020002.0, "step": 5981 }, { "entropy": 0.018248497042804956, "epoch": 2.618010723273881, "grad_norm": 4.84375, "learning_rate": 1.4737412019491068e-05, "loss": 0.0911, "loss_lm": 0.014843825483694673, "loss_seg": 0.0762875434011221, "mean_token_accuracy": 0.9953123331069946, "num_tokens": 2542444472.0, "step": 5982 }, { "entropy": 0.018346956931054592, "epoch": 2.618448407922092, "grad_norm": 3.265625, "learning_rate": 1.47347049269085e-05, "loss": 0.1119, "loss_lm": 0.016691506607457995, "loss_seg": 0.09524677321314812, "mean_token_accuracy": 0.9952117651700974, "num_tokens": 2542870444.0, "step": 5983 }, { "entropy": 0.018078783061355352, "epoch": 2.6188860925703032, "grad_norm": 3.96875, "learning_rate": 1.4731997834325934e-05, "loss": 0.0903, "loss_lm": 0.017371423775330186, "loss_seg": 0.0729198819026351, "mean_token_accuracy": 0.9953327178955078, "num_tokens": 2543295481.0, "step": 5984 }, { "entropy": 0.017903489992022514, "epoch": 2.619323777218514, "grad_norm": 15.375, "learning_rate": 1.4729290741743368e-05, "loss": 0.0714, "loss_lm": 0.01483127661049366, "loss_seg": 0.05652797222137451, "mean_token_accuracy": 0.9952362179756165, "num_tokens": 2543720294.0, "step": 5985 }, { "entropy": 0.017464285250753164, "epoch": 2.619761461866725, "grad_norm": 3.84375, "learning_rate": 1.4726583649160802e-05, "loss": 0.0938, "loss_lm": 0.015987642342224717, "loss_seg": 0.07781797088682652, "mean_token_accuracy": 0.9953708201646805, "num_tokens": 2544144714.0, "step": 5986 }, { "entropy": 0.01796703413128853, "epoch": 2.620199146514936, "grad_norm": 7.375, "learning_rate": 1.4723876556578235e-05, "loss": 0.0982, "loss_lm": 0.01582073629833758, "loss_seg": 0.08239240385591984, "mean_token_accuracy": 0.9952637255191803, "num_tokens": 2544569232.0, "step": 5987 }, { "entropy": 0.01786326477304101, "epoch": 2.620636831163147, "grad_norm": 6.65625, "learning_rate": 1.4721169463995669e-05, "loss": 0.0833, "loss_lm": 0.016530822031199932, "loss_seg": 0.06672289036214352, "mean_token_accuracy": 0.995285153388977, "num_tokens": 2544993775.0, "step": 5988 }, { "entropy": 0.01751336921006441, "epoch": 2.6210745158113578, "grad_norm": 5.40625, "learning_rate": 1.4718462371413103e-05, "loss": 0.1015, "loss_lm": 0.018112207064405084, "loss_seg": 0.08339293207973242, "mean_token_accuracy": 0.9954492300748825, "num_tokens": 2545418042.0, "step": 5989 }, { "entropy": 0.01797910174354911, "epoch": 2.621512200459569, "grad_norm": 3.75, "learning_rate": 1.4715755278830537e-05, "loss": 0.0931, "loss_lm": 0.016901413211598992, "loss_seg": 0.07619086280465126, "mean_token_accuracy": 0.995302140712738, "num_tokens": 2545843304.0, "step": 5990 }, { "entropy": 0.01812294963747263, "epoch": 2.62194988510778, "grad_norm": 7.78125, "learning_rate": 1.471304818624797e-05, "loss": 0.0852, "loss_lm": 0.01571879768744111, "loss_seg": 0.06948600802570581, "mean_token_accuracy": 0.9952149391174316, "num_tokens": 2546268768.0, "step": 5991 }, { "entropy": 0.017710717860609293, "epoch": 2.6223875697559906, "grad_norm": 3.546875, "learning_rate": 1.4710341093665404e-05, "loss": 0.1249, "loss_lm": 0.015658716671168804, "loss_seg": 0.10920777544379234, "mean_token_accuracy": 0.9953419417142868, "num_tokens": 2546693799.0, "step": 5992 }, { "entropy": 0.018455869983881712, "epoch": 2.622825254404202, "grad_norm": 5.625, "learning_rate": 1.4707634001082838e-05, "loss": 0.1028, "loss_lm": 0.019574585370719433, "loss_seg": 0.08326716162264347, "mean_token_accuracy": 0.9951726943254471, "num_tokens": 2547119631.0, "step": 5993 }, { "entropy": 0.01867819670587778, "epoch": 2.6232629390524127, "grad_norm": 6.46875, "learning_rate": 1.4704926908500271e-05, "loss": 0.1017, "loss_lm": 0.017377505777403712, "loss_seg": 0.08436417207121849, "mean_token_accuracy": 0.9950587004423141, "num_tokens": 2547545193.0, "step": 5994 }, { "entropy": 0.018491287250071764, "epoch": 2.623700623700624, "grad_norm": 4.5625, "learning_rate": 1.4702219815917705e-05, "loss": 0.1218, "loss_lm": 0.016479905461892486, "loss_seg": 0.10529033653438091, "mean_token_accuracy": 0.9951636642217636, "num_tokens": 2547970061.0, "step": 5995 }, { "entropy": 0.01804043212905526, "epoch": 2.6241383083488348, "grad_norm": 4.53125, "learning_rate": 1.4699512723335139e-05, "loss": 0.0976, "loss_lm": 0.01554996706545353, "loss_seg": 0.08204193785786629, "mean_token_accuracy": 0.995260089635849, "num_tokens": 2548394717.0, "step": 5996 }, { "entropy": 0.018272914923727512, "epoch": 2.6245759929970456, "grad_norm": 8.9375, "learning_rate": 1.4696805630752571e-05, "loss": 0.111, "loss_lm": 0.01729723089374602, "loss_seg": 0.09374907799065113, "mean_token_accuracy": 0.9951731115579605, "num_tokens": 2548820175.0, "step": 5997 }, { "entropy": 0.01799763459712267, "epoch": 2.6250136776452564, "grad_norm": 3.6875, "learning_rate": 1.4694098538170006e-05, "loss": 0.0933, "loss_lm": 0.017277197213843465, "loss_seg": 0.07601874694228172, "mean_token_accuracy": 0.9953251779079437, "num_tokens": 2549245399.0, "step": 5998 }, { "entropy": 0.01812626328319311, "epoch": 2.6254513622934676, "grad_norm": 3.40625, "learning_rate": 1.4691391445587438e-05, "loss": 0.0997, "loss_lm": 0.01636194856837392, "loss_seg": 0.08329739421606064, "mean_token_accuracy": 0.995228037238121, "num_tokens": 2549670316.0, "step": 5999 }, { "entropy": 0.01854108739644289, "epoch": 2.6258890469416785, "grad_norm": 5.125, "learning_rate": 1.4688684353004874e-05, "loss": 0.1307, "loss_lm": 0.01598690077662468, "loss_seg": 0.11473814398050308, "mean_token_accuracy": 0.995198979973793, "num_tokens": 2550095411.0, "step": 6000 } ], "logging_steps": 1, "max_steps": 11425, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.121000341288937e+19, "train_batch_size": 12, "trial_name": null, "trial_params": null }