eac123's picture
Upload final checkpoint (checkpoint-946)
9fcfc7e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.5430711610486894,
"eval_steps": 500,
"global_step": 946,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.124472439289093,
"epoch": 0.003745318352059925,
"grad_norm": 0.4064895212650299,
"learning_rate": 0.0002,
"loss": 2.4620742797851562,
"mean_token_accuracy": 0.5437362492084503,
"num_tokens": 16219.0,
"step": 1
},
{
"entropy": 1.2432018220424652,
"epoch": 0.00749063670411985,
"grad_norm": 0.37879112362861633,
"learning_rate": 0.0002,
"loss": 2.1651668548583984,
"mean_token_accuracy": 0.5638100206851959,
"num_tokens": 32451.0,
"step": 2
},
{
"entropy": 1.4062562882900238,
"epoch": 0.011235955056179775,
"grad_norm": 0.28845661878585815,
"learning_rate": 0.0002,
"loss": 1.7072796821594238,
"mean_token_accuracy": 0.5924695134162903,
"num_tokens": 48696.0,
"step": 3
},
{
"entropy": 1.3798817992210388,
"epoch": 0.0149812734082397,
"grad_norm": 0.2335132509469986,
"learning_rate": 0.0002,
"loss": 1.4192372560501099,
"mean_token_accuracy": 0.6366562396287918,
"num_tokens": 65149.0,
"step": 4
},
{
"entropy": 1.3547163307666779,
"epoch": 0.018726591760299626,
"grad_norm": 0.27531901001930237,
"learning_rate": 0.0002,
"loss": 1.2890108823776245,
"mean_token_accuracy": 0.639111116528511,
"num_tokens": 81615.0,
"step": 5
},
{
"entropy": 1.2633765935897827,
"epoch": 0.02247191011235955,
"grad_norm": 0.15004344284534454,
"learning_rate": 0.0002,
"loss": 1.1727904081344604,
"mean_token_accuracy": 0.6589455008506775,
"num_tokens": 98238.0,
"step": 6
},
{
"entropy": 1.1859196424484253,
"epoch": 0.026217228464419477,
"grad_norm": 0.10320489853620529,
"learning_rate": 0.0002,
"loss": 1.0918691158294678,
"mean_token_accuracy": 0.6676707565784454,
"num_tokens": 114444.0,
"step": 7
},
{
"entropy": 1.1146739721298218,
"epoch": 0.0299625468164794,
"grad_norm": 0.1199173703789711,
"learning_rate": 0.0002,
"loss": 1.0362448692321777,
"mean_token_accuracy": 0.6752683073282242,
"num_tokens": 130761.0,
"step": 8
},
{
"entropy": 1.0335184335708618,
"epoch": 0.033707865168539325,
"grad_norm": 0.12563136219978333,
"learning_rate": 0.0002,
"loss": 0.9962326288223267,
"mean_token_accuracy": 0.6880597323179245,
"num_tokens": 147021.0,
"step": 9
},
{
"entropy": 0.9865177571773529,
"epoch": 0.03745318352059925,
"grad_norm": 0.1199953481554985,
"learning_rate": 0.0002,
"loss": 0.9303470849990845,
"mean_token_accuracy": 0.6944610327482224,
"num_tokens": 163123.0,
"step": 10
},
{
"entropy": 0.9654616415500641,
"epoch": 0.04119850187265917,
"grad_norm": 0.11374429613351822,
"learning_rate": 0.0002,
"loss": 0.8831573724746704,
"mean_token_accuracy": 0.7051983922719955,
"num_tokens": 179185.0,
"step": 11
},
{
"entropy": 0.9084527641534805,
"epoch": 0.0449438202247191,
"grad_norm": 0.11104491353034973,
"learning_rate": 0.0002,
"loss": 0.8112745881080627,
"mean_token_accuracy": 0.717003270983696,
"num_tokens": 195302.0,
"step": 12
},
{
"entropy": 0.8792405873537064,
"epoch": 0.04868913857677903,
"grad_norm": 0.29082274436950684,
"learning_rate": 0.0002,
"loss": 0.798420250415802,
"mean_token_accuracy": 0.7170884907245636,
"num_tokens": 211890.0,
"step": 13
},
{
"entropy": 0.8252373337745667,
"epoch": 0.052434456928838954,
"grad_norm": 0.10816927999258041,
"learning_rate": 0.0002,
"loss": 0.7828125357627869,
"mean_token_accuracy": 0.7214709371328354,
"num_tokens": 228238.0,
"step": 14
},
{
"entropy": 0.7244665324687958,
"epoch": 0.056179775280898875,
"grad_norm": 0.11618702858686447,
"learning_rate": 0.0002,
"loss": 0.7206279635429382,
"mean_token_accuracy": 0.7338205277919769,
"num_tokens": 244371.0,
"step": 15
},
{
"entropy": 0.6871565729379654,
"epoch": 0.0599250936329588,
"grad_norm": 0.1065768375992775,
"learning_rate": 0.0002,
"loss": 0.7100083827972412,
"mean_token_accuracy": 0.7358262836933136,
"num_tokens": 260726.0,
"step": 16
},
{
"entropy": 0.6935696750879288,
"epoch": 0.06367041198501873,
"grad_norm": 0.08450760692358017,
"learning_rate": 0.0002,
"loss": 0.6838802695274353,
"mean_token_accuracy": 0.7416488826274872,
"num_tokens": 277122.0,
"step": 17
},
{
"entropy": 0.6860368996858597,
"epoch": 0.06741573033707865,
"grad_norm": 0.08516346663236618,
"learning_rate": 0.0002,
"loss": 0.6765270829200745,
"mean_token_accuracy": 0.7396037727594376,
"num_tokens": 293596.0,
"step": 18
},
{
"entropy": 0.6689740270376205,
"epoch": 0.07116104868913857,
"grad_norm": 0.08950749784708023,
"learning_rate": 0.0002,
"loss": 0.6559870839118958,
"mean_token_accuracy": 0.7492983937263489,
"num_tokens": 309758.0,
"step": 19
},
{
"entropy": 0.6853971034288406,
"epoch": 0.0749063670411985,
"grad_norm": 0.08301156759262085,
"learning_rate": 0.0002,
"loss": 0.6591368913650513,
"mean_token_accuracy": 0.7445396035909653,
"num_tokens": 326199.0,
"step": 20
},
{
"entropy": 0.6475548148155212,
"epoch": 0.07865168539325842,
"grad_norm": 0.07257863134145737,
"learning_rate": 0.0002,
"loss": 0.6322771906852722,
"mean_token_accuracy": 0.7570293545722961,
"num_tokens": 342706.0,
"step": 21
},
{
"entropy": 0.62291419506073,
"epoch": 0.08239700374531835,
"grad_norm": 0.07468358427286148,
"learning_rate": 0.0002,
"loss": 0.6161096096038818,
"mean_token_accuracy": 0.7579571604728699,
"num_tokens": 358967.0,
"step": 22
},
{
"entropy": 0.6039848625659943,
"epoch": 0.08614232209737828,
"grad_norm": 0.06657886505126953,
"learning_rate": 0.0002,
"loss": 0.5981277823448181,
"mean_token_accuracy": 0.7673389315605164,
"num_tokens": 375372.0,
"step": 23
},
{
"entropy": 0.6231608390808105,
"epoch": 0.0898876404494382,
"grad_norm": 0.06528797745704651,
"learning_rate": 0.0002,
"loss": 0.6185131072998047,
"mean_token_accuracy": 0.7547510862350464,
"num_tokens": 391535.0,
"step": 24
},
{
"entropy": 0.6286156177520752,
"epoch": 0.09363295880149813,
"grad_norm": 0.06431519240140915,
"learning_rate": 0.0002,
"loss": 0.6217876672744751,
"mean_token_accuracy": 0.7541641592979431,
"num_tokens": 407808.0,
"step": 25
},
{
"entropy": 0.6126427948474884,
"epoch": 0.09737827715355805,
"grad_norm": 0.06216903775930405,
"learning_rate": 0.0002,
"loss": 0.6070841550827026,
"mean_token_accuracy": 0.759774461388588,
"num_tokens": 424098.0,
"step": 26
},
{
"entropy": 0.6149384081363678,
"epoch": 0.10112359550561797,
"grad_norm": 0.06437912583351135,
"learning_rate": 0.0002,
"loss": 0.6078751087188721,
"mean_token_accuracy": 0.7595006227493286,
"num_tokens": 440539.0,
"step": 27
},
{
"entropy": 0.6091344654560089,
"epoch": 0.10486891385767791,
"grad_norm": 0.06495340913534164,
"learning_rate": 0.0002,
"loss": 0.6011782884597778,
"mean_token_accuracy": 0.7595006972551346,
"num_tokens": 456799.0,
"step": 28
},
{
"entropy": 0.608646497130394,
"epoch": 0.10861423220973783,
"grad_norm": 0.059445418417453766,
"learning_rate": 0.0002,
"loss": 0.6044275164604187,
"mean_token_accuracy": 0.7600021511316299,
"num_tokens": 473089.0,
"step": 29
},
{
"entropy": 0.6043040752410889,
"epoch": 0.11235955056179775,
"grad_norm": 0.06593701243400574,
"learning_rate": 0.0002,
"loss": 0.6045087575912476,
"mean_token_accuracy": 0.7567310333251953,
"num_tokens": 489490.0,
"step": 30
},
{
"entropy": 0.5747391283512115,
"epoch": 0.11610486891385768,
"grad_norm": 0.06415696442127228,
"learning_rate": 0.0002,
"loss": 0.5873428583145142,
"mean_token_accuracy": 0.7674129754304886,
"num_tokens": 505809.0,
"step": 31
},
{
"entropy": 0.5926542580127716,
"epoch": 0.1198501872659176,
"grad_norm": 0.051249004900455475,
"learning_rate": 0.0002,
"loss": 0.598324179649353,
"mean_token_accuracy": 0.759703740477562,
"num_tokens": 522016.0,
"step": 32
},
{
"entropy": 0.5886886864900589,
"epoch": 0.12359550561797752,
"grad_norm": 0.05292005091905594,
"learning_rate": 0.0002,
"loss": 0.5881145596504211,
"mean_token_accuracy": 0.7697232961654663,
"num_tokens": 538100.0,
"step": 33
},
{
"entropy": 0.5867745727300644,
"epoch": 0.12734082397003746,
"grad_norm": 0.04721912741661072,
"learning_rate": 0.0002,
"loss": 0.5836299061775208,
"mean_token_accuracy": 0.768671840429306,
"num_tokens": 554234.0,
"step": 34
},
{
"entropy": 0.5881127417087555,
"epoch": 0.13108614232209737,
"grad_norm": 0.05805843323469162,
"learning_rate": 0.0002,
"loss": 0.5897107124328613,
"mean_token_accuracy": 0.7657543420791626,
"num_tokens": 570565.0,
"step": 35
},
{
"entropy": 0.5939383208751678,
"epoch": 0.1348314606741573,
"grad_norm": 0.0569508820772171,
"learning_rate": 0.0002,
"loss": 0.5897835493087769,
"mean_token_accuracy": 0.7598359882831573,
"num_tokens": 586816.0,
"step": 36
},
{
"entropy": 0.5979506522417068,
"epoch": 0.13857677902621723,
"grad_norm": 0.05739126354455948,
"learning_rate": 0.0002,
"loss": 0.5949404835700989,
"mean_token_accuracy": 0.7612607926130295,
"num_tokens": 603019.0,
"step": 37
},
{
"entropy": 0.5742268264293671,
"epoch": 0.14232209737827714,
"grad_norm": 0.047265954315662384,
"learning_rate": 0.0002,
"loss": 0.5759380459785461,
"mean_token_accuracy": 0.7693933397531509,
"num_tokens": 619295.0,
"step": 38
},
{
"entropy": 0.5710775703191757,
"epoch": 0.14606741573033707,
"grad_norm": 0.05281650274991989,
"learning_rate": 0.0002,
"loss": 0.5691424608230591,
"mean_token_accuracy": 0.7704602777957916,
"num_tokens": 635365.0,
"step": 39
},
{
"entropy": 0.582334503531456,
"epoch": 0.149812734082397,
"grad_norm": 0.055993299931287766,
"learning_rate": 0.0002,
"loss": 0.5809962749481201,
"mean_token_accuracy": 0.7662668973207474,
"num_tokens": 651665.0,
"step": 40
},
{
"entropy": 0.5551325976848602,
"epoch": 0.15355805243445692,
"grad_norm": 0.04340814799070358,
"learning_rate": 0.0002,
"loss": 0.557377815246582,
"mean_token_accuracy": 0.7778407037258148,
"num_tokens": 667809.0,
"step": 41
},
{
"entropy": 0.5822649896144867,
"epoch": 0.15730337078651685,
"grad_norm": 0.04575135186314583,
"learning_rate": 0.0002,
"loss": 0.5827720165252686,
"mean_token_accuracy": 0.7657051831483841,
"num_tokens": 683923.0,
"step": 42
},
{
"entropy": 0.55968376994133,
"epoch": 0.16104868913857678,
"grad_norm": 0.04552368074655533,
"learning_rate": 0.0002,
"loss": 0.5598254799842834,
"mean_token_accuracy": 0.7764519304037094,
"num_tokens": 700197.0,
"step": 43
},
{
"entropy": 0.5671757161617279,
"epoch": 0.1647940074906367,
"grad_norm": 0.04587964341044426,
"learning_rate": 0.0002,
"loss": 0.5750178694725037,
"mean_token_accuracy": 0.7700542360544205,
"num_tokens": 716432.0,
"step": 44
},
{
"entropy": 0.5685836523771286,
"epoch": 0.16853932584269662,
"grad_norm": 0.03833606839179993,
"learning_rate": 0.0002,
"loss": 0.5728627443313599,
"mean_token_accuracy": 0.7676915228366852,
"num_tokens": 732768.0,
"step": 45
},
{
"entropy": 0.5726271122694016,
"epoch": 0.17228464419475656,
"grad_norm": 0.04773888736963272,
"learning_rate": 0.0002,
"loss": 0.5737521052360535,
"mean_token_accuracy": 0.7691973745822906,
"num_tokens": 748991.0,
"step": 46
},
{
"entropy": 0.5940001755952835,
"epoch": 0.1760299625468165,
"grad_norm": 0.035074397921562195,
"learning_rate": 0.0002,
"loss": 0.58332759141922,
"mean_token_accuracy": 0.7648619115352631,
"num_tokens": 765572.0,
"step": 47
},
{
"entropy": 0.5897164344787598,
"epoch": 0.1797752808988764,
"grad_norm": 0.037994541227817535,
"learning_rate": 0.0002,
"loss": 0.5864952802658081,
"mean_token_accuracy": 0.7641548812389374,
"num_tokens": 782005.0,
"step": 48
},
{
"entropy": 0.5744329988956451,
"epoch": 0.18352059925093633,
"grad_norm": 0.040346939116716385,
"learning_rate": 0.0002,
"loss": 0.5669541954994202,
"mean_token_accuracy": 0.770287498831749,
"num_tokens": 798604.0,
"step": 49
},
{
"entropy": 0.5779913067817688,
"epoch": 0.18726591760299627,
"grad_norm": 0.036969687789678574,
"learning_rate": 0.0002,
"loss": 0.5797433257102966,
"mean_token_accuracy": 0.7645184099674225,
"num_tokens": 814871.0,
"step": 50
},
{
"entropy": 0.5663889348506927,
"epoch": 0.19101123595505617,
"grad_norm": 0.03604266792535782,
"learning_rate": 0.0002,
"loss": 0.5714061260223389,
"mean_token_accuracy": 0.7704311609268188,
"num_tokens": 831246.0,
"step": 51
},
{
"entropy": 0.561771884560585,
"epoch": 0.1947565543071161,
"grad_norm": 0.04034798592329025,
"learning_rate": 0.0002,
"loss": 0.5732511878013611,
"mean_token_accuracy": 0.7705236822366714,
"num_tokens": 847825.0,
"step": 52
},
{
"entropy": 0.5677134096622467,
"epoch": 0.19850187265917604,
"grad_norm": 0.03827312961220741,
"learning_rate": 0.0002,
"loss": 0.5743907690048218,
"mean_token_accuracy": 0.7655002921819687,
"num_tokens": 864255.0,
"step": 53
},
{
"entropy": 0.563701331615448,
"epoch": 0.20224719101123595,
"grad_norm": 0.04143316298723221,
"learning_rate": 0.0002,
"loss": 0.5607832074165344,
"mean_token_accuracy": 0.772660031914711,
"num_tokens": 880665.0,
"step": 54
},
{
"entropy": 0.5692192316055298,
"epoch": 0.20599250936329588,
"grad_norm": 0.03400753438472748,
"learning_rate": 0.0002,
"loss": 0.5670974254608154,
"mean_token_accuracy": 0.769247904419899,
"num_tokens": 896987.0,
"step": 55
},
{
"entropy": 0.5776625126600266,
"epoch": 0.20973782771535582,
"grad_norm": 0.035431839525699615,
"learning_rate": 0.0002,
"loss": 0.5733675360679626,
"mean_token_accuracy": 0.7692834436893463,
"num_tokens": 913582.0,
"step": 56
},
{
"entropy": 0.5626319646835327,
"epoch": 0.21348314606741572,
"grad_norm": 0.03843431547284126,
"learning_rate": 0.0002,
"loss": 0.5641550421714783,
"mean_token_accuracy": 0.7710368186235428,
"num_tokens": 929972.0,
"step": 57
},
{
"entropy": 0.5526942014694214,
"epoch": 0.21722846441947566,
"grad_norm": 0.03771563246846199,
"learning_rate": 0.0002,
"loss": 0.5567817687988281,
"mean_token_accuracy": 0.7731232047080994,
"num_tokens": 945888.0,
"step": 58
},
{
"entropy": 0.5716714560985565,
"epoch": 0.2209737827715356,
"grad_norm": 0.036766648292541504,
"learning_rate": 0.0002,
"loss": 0.5660452246665955,
"mean_token_accuracy": 0.7728052884340286,
"num_tokens": 962278.0,
"step": 59
},
{
"entropy": 0.568805992603302,
"epoch": 0.2247191011235955,
"grad_norm": 0.035415392369031906,
"learning_rate": 0.0002,
"loss": 0.5717817544937134,
"mean_token_accuracy": 0.7711138129234314,
"num_tokens": 978682.0,
"step": 60
},
{
"entropy": 0.5708261281251907,
"epoch": 0.22846441947565543,
"grad_norm": 0.03432939946651459,
"learning_rate": 0.0002,
"loss": 0.5735772252082825,
"mean_token_accuracy": 0.7677555531263351,
"num_tokens": 994945.0,
"step": 61
},
{
"entropy": 0.5660677701234818,
"epoch": 0.23220973782771537,
"grad_norm": 0.041112665086984634,
"learning_rate": 0.0002,
"loss": 0.5750763416290283,
"mean_token_accuracy": 0.7678538411855698,
"num_tokens": 1011319.0,
"step": 62
},
{
"entropy": 0.5581584423780441,
"epoch": 0.23595505617977527,
"grad_norm": 0.03535327687859535,
"learning_rate": 0.0002,
"loss": 0.5653359889984131,
"mean_token_accuracy": 0.7709096819162369,
"num_tokens": 1027780.0,
"step": 63
},
{
"entropy": 0.5639653205871582,
"epoch": 0.2397003745318352,
"grad_norm": 0.03404325619339943,
"learning_rate": 0.0002,
"loss": 0.5576256513595581,
"mean_token_accuracy": 0.7768308818340302,
"num_tokens": 1044141.0,
"step": 64
},
{
"entropy": 0.5733215659856796,
"epoch": 0.24344569288389514,
"grad_norm": 0.041786711663007736,
"learning_rate": 0.0002,
"loss": 0.5677163600921631,
"mean_token_accuracy": 0.768655464053154,
"num_tokens": 1060152.0,
"step": 65
},
{
"entropy": 0.5721775144338608,
"epoch": 0.24719101123595505,
"grad_norm": 0.037091247737407684,
"learning_rate": 0.0002,
"loss": 0.5689237713813782,
"mean_token_accuracy": 0.769687607884407,
"num_tokens": 1076350.0,
"step": 66
},
{
"entropy": 0.5711842328310013,
"epoch": 0.250936329588015,
"grad_norm": 0.03522708639502525,
"learning_rate": 0.0002,
"loss": 0.567720890045166,
"mean_token_accuracy": 0.7711529284715652,
"num_tokens": 1092839.0,
"step": 67
},
{
"entropy": 0.5565171837806702,
"epoch": 0.2546816479400749,
"grad_norm": 0.038917530328035355,
"learning_rate": 0.0002,
"loss": 0.5597351789474487,
"mean_token_accuracy": 0.7759623378515244,
"num_tokens": 1109005.0,
"step": 68
},
{
"entropy": 0.5430796295404434,
"epoch": 0.25842696629213485,
"grad_norm": 0.034353867173194885,
"learning_rate": 0.0002,
"loss": 0.5536048412322998,
"mean_token_accuracy": 0.7768301516771317,
"num_tokens": 1125051.0,
"step": 69
},
{
"entropy": 0.5550204813480377,
"epoch": 0.26217228464419473,
"grad_norm": 0.03845667093992233,
"learning_rate": 0.0002,
"loss": 0.5609036087989807,
"mean_token_accuracy": 0.7741425037384033,
"num_tokens": 1141333.0,
"step": 70
},
{
"entropy": 0.5524102747440338,
"epoch": 0.26591760299625467,
"grad_norm": 0.0383320152759552,
"learning_rate": 0.0002,
"loss": 0.5493491291999817,
"mean_token_accuracy": 0.7784009873867035,
"num_tokens": 1157440.0,
"step": 71
},
{
"entropy": 0.5607451796531677,
"epoch": 0.2696629213483146,
"grad_norm": 0.0344189889729023,
"learning_rate": 0.0002,
"loss": 0.5574801564216614,
"mean_token_accuracy": 0.7733150720596313,
"num_tokens": 1173721.0,
"step": 72
},
{
"entropy": 0.5708478391170502,
"epoch": 0.27340823970037453,
"grad_norm": 0.03608883544802666,
"learning_rate": 0.0002,
"loss": 0.5691329836845398,
"mean_token_accuracy": 0.7706348299980164,
"num_tokens": 1189995.0,
"step": 73
},
{
"entropy": 0.5674006342887878,
"epoch": 0.27715355805243447,
"grad_norm": 0.03380035236477852,
"learning_rate": 0.0002,
"loss": 0.5687033534049988,
"mean_token_accuracy": 0.7686747610569,
"num_tokens": 1206546.0,
"step": 74
},
{
"entropy": 0.5619117617607117,
"epoch": 0.2808988764044944,
"grad_norm": 0.033374786376953125,
"learning_rate": 0.0002,
"loss": 0.5617104768753052,
"mean_token_accuracy": 0.774394765496254,
"num_tokens": 1222857.0,
"step": 75
},
{
"entropy": 0.553475558757782,
"epoch": 0.2846441947565543,
"grad_norm": 0.03828837722539902,
"learning_rate": 0.0002,
"loss": 0.5524560809135437,
"mean_token_accuracy": 0.7749378681182861,
"num_tokens": 1239289.0,
"step": 76
},
{
"entropy": 0.5745554566383362,
"epoch": 0.2883895131086142,
"grad_norm": 0.03621216490864754,
"learning_rate": 0.0002,
"loss": 0.5808500051498413,
"mean_token_accuracy": 0.7678203135728836,
"num_tokens": 1255521.0,
"step": 77
},
{
"entropy": 0.5676577985286713,
"epoch": 0.29213483146067415,
"grad_norm": 0.03588660806417465,
"learning_rate": 0.0002,
"loss": 0.5705655813217163,
"mean_token_accuracy": 0.7692013084888458,
"num_tokens": 1271794.0,
"step": 78
},
{
"entropy": 0.578361302614212,
"epoch": 0.2958801498127341,
"grad_norm": 0.03781484439969063,
"learning_rate": 0.0002,
"loss": 0.5760793089866638,
"mean_token_accuracy": 0.7664260119199753,
"num_tokens": 1288356.0,
"step": 79
},
{
"entropy": 0.5593062490224838,
"epoch": 0.299625468164794,
"grad_norm": 0.03217354416847229,
"learning_rate": 0.0002,
"loss": 0.5657471418380737,
"mean_token_accuracy": 0.7739468365907669,
"num_tokens": 1304492.0,
"step": 80
},
{
"entropy": 0.5666437745094299,
"epoch": 0.30337078651685395,
"grad_norm": 0.03268091008067131,
"learning_rate": 0.0002,
"loss": 0.5716702938079834,
"mean_token_accuracy": 0.7679993361234665,
"num_tokens": 1320914.0,
"step": 81
},
{
"entropy": 0.5685661137104034,
"epoch": 0.30711610486891383,
"grad_norm": 0.03592272475361824,
"learning_rate": 0.0002,
"loss": 0.5758165717124939,
"mean_token_accuracy": 0.7661760449409485,
"num_tokens": 1337161.0,
"step": 82
},
{
"entropy": 0.5707727521657944,
"epoch": 0.31086142322097376,
"grad_norm": 0.032845061272382736,
"learning_rate": 0.0002,
"loss": 0.5710837841033936,
"mean_token_accuracy": 0.7702731043100357,
"num_tokens": 1353376.0,
"step": 83
},
{
"entropy": 0.5628758817911148,
"epoch": 0.3146067415730337,
"grad_norm": 0.029750632122159004,
"learning_rate": 0.0002,
"loss": 0.5637022852897644,
"mean_token_accuracy": 0.7708846777677536,
"num_tokens": 1369870.0,
"step": 84
},
{
"entropy": 0.5795712918043137,
"epoch": 0.31835205992509363,
"grad_norm": 0.03464500606060028,
"learning_rate": 0.0002,
"loss": 0.5780152082443237,
"mean_token_accuracy": 0.7670614421367645,
"num_tokens": 1386403.0,
"step": 85
},
{
"entropy": 0.5554608702659607,
"epoch": 0.32209737827715357,
"grad_norm": 0.03547544404864311,
"learning_rate": 0.0002,
"loss": 0.5557012557983398,
"mean_token_accuracy": 0.7721797376871109,
"num_tokens": 1402494.0,
"step": 86
},
{
"entropy": 0.5579323172569275,
"epoch": 0.3258426966292135,
"grad_norm": 0.03288840129971504,
"learning_rate": 0.0002,
"loss": 0.560955286026001,
"mean_token_accuracy": 0.7751947343349457,
"num_tokens": 1418821.0,
"step": 87
},
{
"entropy": 0.5543566048145294,
"epoch": 0.3295880149812734,
"grad_norm": 0.04169093072414398,
"learning_rate": 0.0002,
"loss": 0.5500882267951965,
"mean_token_accuracy": 0.7791634202003479,
"num_tokens": 1434993.0,
"step": 88
},
{
"entropy": 0.5734467208385468,
"epoch": 0.3333333333333333,
"grad_norm": 0.04577335715293884,
"learning_rate": 0.0002,
"loss": 0.5629557371139526,
"mean_token_accuracy": 0.7727752029895782,
"num_tokens": 1451307.0,
"step": 89
},
{
"entropy": 0.5726543813943863,
"epoch": 0.33707865168539325,
"grad_norm": 0.0342593714594841,
"learning_rate": 0.0002,
"loss": 0.5802106261253357,
"mean_token_accuracy": 0.7650935351848602,
"num_tokens": 1467745.0,
"step": 90
},
{
"entropy": 0.551667258143425,
"epoch": 0.3408239700374532,
"grad_norm": 0.03779289126396179,
"learning_rate": 0.0002,
"loss": 0.562962532043457,
"mean_token_accuracy": 0.7722999006509781,
"num_tokens": 1483931.0,
"step": 91
},
{
"entropy": 0.5500118583440781,
"epoch": 0.3445692883895131,
"grad_norm": 0.04092314839363098,
"learning_rate": 0.0002,
"loss": 0.5627440810203552,
"mean_token_accuracy": 0.7718297243118286,
"num_tokens": 1500272.0,
"step": 92
},
{
"entropy": 0.5528086423873901,
"epoch": 0.34831460674157305,
"grad_norm": 0.03680623322725296,
"learning_rate": 0.0002,
"loss": 0.5555366277694702,
"mean_token_accuracy": 0.7774850875139236,
"num_tokens": 1516853.0,
"step": 93
},
{
"entropy": 0.5520536154508591,
"epoch": 0.352059925093633,
"grad_norm": 0.037777166813611984,
"learning_rate": 0.0002,
"loss": 0.5425198078155518,
"mean_token_accuracy": 0.7793015986680984,
"num_tokens": 1533333.0,
"step": 94
},
{
"entropy": 0.5685165077447891,
"epoch": 0.35580524344569286,
"grad_norm": 0.04140891879796982,
"learning_rate": 0.0002,
"loss": 0.5641899108886719,
"mean_token_accuracy": 0.7713409811258316,
"num_tokens": 1549757.0,
"step": 95
},
{
"entropy": 0.5465481728315353,
"epoch": 0.3595505617977528,
"grad_norm": 0.035262562334537506,
"learning_rate": 0.0002,
"loss": 0.5490474104881287,
"mean_token_accuracy": 0.7827550321817398,
"num_tokens": 1565996.0,
"step": 96
},
{
"entropy": 0.5831216871738434,
"epoch": 0.36329588014981273,
"grad_norm": 0.036104101687669754,
"learning_rate": 0.0002,
"loss": 0.589984118938446,
"mean_token_accuracy": 0.7600380033254623,
"num_tokens": 1582215.0,
"step": 97
},
{
"entropy": 0.5677650719881058,
"epoch": 0.36704119850187267,
"grad_norm": 0.03766894340515137,
"learning_rate": 0.0002,
"loss": 0.5645126104354858,
"mean_token_accuracy": 0.7706596851348877,
"num_tokens": 1598452.0,
"step": 98
},
{
"entropy": 0.5670180022716522,
"epoch": 0.3707865168539326,
"grad_norm": 0.031464677304029465,
"learning_rate": 0.0002,
"loss": 0.5694231986999512,
"mean_token_accuracy": 0.7699034363031387,
"num_tokens": 1614973.0,
"step": 99
},
{
"entropy": 0.556086465716362,
"epoch": 0.37453183520599254,
"grad_norm": 0.03442725911736488,
"learning_rate": 0.0002,
"loss": 0.5548810958862305,
"mean_token_accuracy": 0.7733764350414276,
"num_tokens": 1631172.0,
"step": 100
},
{
"entropy": 0.5800606608390808,
"epoch": 0.3782771535580524,
"grad_norm": 0.03572804853320122,
"learning_rate": 0.0002,
"loss": 0.5861737728118896,
"mean_token_accuracy": 0.7624654024839401,
"num_tokens": 1647621.0,
"step": 101
},
{
"entropy": 0.5482688248157501,
"epoch": 0.38202247191011235,
"grad_norm": 0.03775500878691673,
"learning_rate": 0.0002,
"loss": 0.5594941973686218,
"mean_token_accuracy": 0.7744353115558624,
"num_tokens": 1663895.0,
"step": 102
},
{
"entropy": 0.563491478562355,
"epoch": 0.3857677902621723,
"grad_norm": 0.031457267701625824,
"learning_rate": 0.0002,
"loss": 0.564830482006073,
"mean_token_accuracy": 0.7690578252077103,
"num_tokens": 1680534.0,
"step": 103
},
{
"entropy": 0.564789205789566,
"epoch": 0.3895131086142322,
"grad_norm": 0.035452548414468765,
"learning_rate": 0.0002,
"loss": 0.560291588306427,
"mean_token_accuracy": 0.7735853344202042,
"num_tokens": 1696770.0,
"step": 104
},
{
"entropy": 0.5566727668046951,
"epoch": 0.39325842696629215,
"grad_norm": 0.03198615834116936,
"learning_rate": 0.0002,
"loss": 0.5535395741462708,
"mean_token_accuracy": 0.7722934931516647,
"num_tokens": 1713024.0,
"step": 105
},
{
"entropy": 0.5578596889972687,
"epoch": 0.3970037453183521,
"grad_norm": 0.03393879160284996,
"learning_rate": 0.0002,
"loss": 0.5627562999725342,
"mean_token_accuracy": 0.7742809951305389,
"num_tokens": 1729333.0,
"step": 106
},
{
"entropy": 0.5788154900074005,
"epoch": 0.40074906367041196,
"grad_norm": 0.033935144543647766,
"learning_rate": 0.0002,
"loss": 0.580773115158081,
"mean_token_accuracy": 0.7651670575141907,
"num_tokens": 1745611.0,
"step": 107
},
{
"entropy": 0.5737199634313583,
"epoch": 0.4044943820224719,
"grad_norm": 0.03252919018268585,
"learning_rate": 0.0002,
"loss": 0.5751349925994873,
"mean_token_accuracy": 0.7671079486608505,
"num_tokens": 1762357.0,
"step": 108
},
{
"entropy": 0.5651296824216843,
"epoch": 0.40823970037453183,
"grad_norm": 0.028949161991477013,
"learning_rate": 0.0002,
"loss": 0.5604527592658997,
"mean_token_accuracy": 0.7729825675487518,
"num_tokens": 1778752.0,
"step": 109
},
{
"entropy": 0.5504195243120193,
"epoch": 0.41198501872659177,
"grad_norm": 0.028210768476128578,
"learning_rate": 0.0002,
"loss": 0.549246072769165,
"mean_token_accuracy": 0.7782431095838547,
"num_tokens": 1794998.0,
"step": 110
},
{
"entropy": 0.5765475034713745,
"epoch": 0.4157303370786517,
"grad_norm": 0.02785623073577881,
"learning_rate": 0.0002,
"loss": 0.5748263597488403,
"mean_token_accuracy": 0.7663502544164658,
"num_tokens": 1811522.0,
"step": 111
},
{
"entropy": 0.5662956237792969,
"epoch": 0.41947565543071164,
"grad_norm": 0.027803661301732063,
"learning_rate": 0.0002,
"loss": 0.5678505897521973,
"mean_token_accuracy": 0.769574448466301,
"num_tokens": 1827911.0,
"step": 112
},
{
"entropy": 0.554324135184288,
"epoch": 0.4232209737827715,
"grad_norm": 0.03252230957150459,
"learning_rate": 0.0002,
"loss": 0.5648460984230042,
"mean_token_accuracy": 0.7699959129095078,
"num_tokens": 1844234.0,
"step": 113
},
{
"entropy": 0.5458608418703079,
"epoch": 0.42696629213483145,
"grad_norm": 0.027507655322551727,
"learning_rate": 0.0002,
"loss": 0.5496413111686707,
"mean_token_accuracy": 0.7775106579065323,
"num_tokens": 1860498.0,
"step": 114
},
{
"entropy": 0.5563929826021194,
"epoch": 0.4307116104868914,
"grad_norm": 0.03014312870800495,
"learning_rate": 0.0002,
"loss": 0.5582830905914307,
"mean_token_accuracy": 0.7708972990512848,
"num_tokens": 1876571.0,
"step": 115
},
{
"entropy": 0.5650668740272522,
"epoch": 0.4344569288389513,
"grad_norm": 0.032711341977119446,
"learning_rate": 0.0002,
"loss": 0.5640538930892944,
"mean_token_accuracy": 0.7726383656263351,
"num_tokens": 1893031.0,
"step": 116
},
{
"entropy": 0.5807255804538727,
"epoch": 0.43820224719101125,
"grad_norm": 0.04059470072388649,
"learning_rate": 0.0002,
"loss": 0.5742425918579102,
"mean_token_accuracy": 0.7666837275028229,
"num_tokens": 1909366.0,
"step": 117
},
{
"entropy": 0.5798581689596176,
"epoch": 0.4419475655430712,
"grad_norm": 0.03380719944834709,
"learning_rate": 0.0002,
"loss": 0.5788700580596924,
"mean_token_accuracy": 0.7679527401924133,
"num_tokens": 1925898.0,
"step": 118
},
{
"entropy": 0.5766737908124924,
"epoch": 0.44569288389513106,
"grad_norm": 0.030183367431163788,
"learning_rate": 0.0002,
"loss": 0.5766640901565552,
"mean_token_accuracy": 0.7679651975631714,
"num_tokens": 1942401.0,
"step": 119
},
{
"entropy": 0.5603433847427368,
"epoch": 0.449438202247191,
"grad_norm": 0.0362340547144413,
"learning_rate": 0.0002,
"loss": 0.5619690418243408,
"mean_token_accuracy": 0.7730819880962372,
"num_tokens": 1958720.0,
"step": 120
},
{
"entropy": 0.5559201538562775,
"epoch": 0.45318352059925093,
"grad_norm": 0.034683868288993835,
"learning_rate": 0.0002,
"loss": 0.5595064163208008,
"mean_token_accuracy": 0.7748750001192093,
"num_tokens": 1975119.0,
"step": 121
},
{
"entropy": 0.5641336888074875,
"epoch": 0.45692883895131087,
"grad_norm": 0.034222401678562164,
"learning_rate": 0.0002,
"loss": 0.5678452849388123,
"mean_token_accuracy": 0.7732732445001602,
"num_tokens": 1991506.0,
"step": 122
},
{
"entropy": 0.5829679220914841,
"epoch": 0.4606741573033708,
"grad_norm": 0.034026652574539185,
"learning_rate": 0.0002,
"loss": 0.5875802040100098,
"mean_token_accuracy": 0.7611493021249771,
"num_tokens": 2007947.0,
"step": 123
},
{
"entropy": 0.5581521540880203,
"epoch": 0.46441947565543074,
"grad_norm": 0.025140831246972084,
"learning_rate": 0.0002,
"loss": 0.5602667927742004,
"mean_token_accuracy": 0.7735796868801117,
"num_tokens": 2024401.0,
"step": 124
},
{
"entropy": 0.5715497881174088,
"epoch": 0.4681647940074906,
"grad_norm": 0.029785403981804848,
"learning_rate": 0.0002,
"loss": 0.5672232508659363,
"mean_token_accuracy": 0.7685857713222504,
"num_tokens": 2040631.0,
"step": 125
},
{
"entropy": 0.5607001930475235,
"epoch": 0.47191011235955055,
"grad_norm": 0.04235680401325226,
"learning_rate": 0.0002,
"loss": 0.5650739073753357,
"mean_token_accuracy": 0.7696276903152466,
"num_tokens": 2056536.0,
"step": 126
},
{
"entropy": 0.5663832724094391,
"epoch": 0.4756554307116105,
"grad_norm": 0.03530610725283623,
"learning_rate": 0.0002,
"loss": 0.5653817653656006,
"mean_token_accuracy": 0.771982342004776,
"num_tokens": 2072694.0,
"step": 127
},
{
"entropy": 0.5544104427099228,
"epoch": 0.4794007490636704,
"grad_norm": 0.02733522094786167,
"learning_rate": 0.0002,
"loss": 0.5605688095092773,
"mean_token_accuracy": 0.7723411917686462,
"num_tokens": 2089137.0,
"step": 128
},
{
"entropy": 0.5275053828954697,
"epoch": 0.48314606741573035,
"grad_norm": 0.04322921857237816,
"learning_rate": 0.0002,
"loss": 0.5484553575515747,
"mean_token_accuracy": 0.7770342081785202,
"num_tokens": 2105149.0,
"step": 129
},
{
"entropy": 0.5561497956514359,
"epoch": 0.4868913857677903,
"grad_norm": 0.038827862590551376,
"learning_rate": 0.0002,
"loss": 0.55650395154953,
"mean_token_accuracy": 0.7764105200767517,
"num_tokens": 2121463.0,
"step": 130
},
{
"entropy": 0.5783034265041351,
"epoch": 0.49063670411985016,
"grad_norm": 0.029603557661175728,
"learning_rate": 0.0002,
"loss": 0.5703758001327515,
"mean_token_accuracy": 0.7689076513051987,
"num_tokens": 2137873.0,
"step": 131
},
{
"entropy": 0.5802958011627197,
"epoch": 0.4943820224719101,
"grad_norm": 0.03336755558848381,
"learning_rate": 0.0002,
"loss": 0.5750676989555359,
"mean_token_accuracy": 0.7685631215572357,
"num_tokens": 2154043.0,
"step": 132
},
{
"entropy": 0.5565105229616165,
"epoch": 0.49812734082397003,
"grad_norm": 0.03589406609535217,
"learning_rate": 0.0002,
"loss": 0.5438498258590698,
"mean_token_accuracy": 0.7815204560756683,
"num_tokens": 2170057.0,
"step": 133
},
{
"entropy": 0.5716612040996552,
"epoch": 0.50187265917603,
"grad_norm": 0.03452189266681671,
"learning_rate": 0.0002,
"loss": 0.5778107047080994,
"mean_token_accuracy": 0.7688381224870682,
"num_tokens": 2186386.0,
"step": 134
},
{
"entropy": 0.561384916305542,
"epoch": 0.5056179775280899,
"grad_norm": 0.03864321857690811,
"learning_rate": 0.0002,
"loss": 0.5704262256622314,
"mean_token_accuracy": 0.7647197097539902,
"num_tokens": 2202441.0,
"step": 135
},
{
"entropy": 0.5625592470169067,
"epoch": 0.5093632958801498,
"grad_norm": 0.029244674369692802,
"learning_rate": 0.0002,
"loss": 0.5618846416473389,
"mean_token_accuracy": 0.7706502974033356,
"num_tokens": 2218642.0,
"step": 136
},
{
"entropy": 0.557224690914154,
"epoch": 0.5131086142322098,
"grad_norm": 0.03010115958750248,
"learning_rate": 0.0002,
"loss": 0.5529860854148865,
"mean_token_accuracy": 0.7745790481567383,
"num_tokens": 2234941.0,
"step": 137
},
{
"entropy": 0.5669968128204346,
"epoch": 0.5168539325842697,
"grad_norm": 0.030734272673726082,
"learning_rate": 0.0002,
"loss": 0.563121497631073,
"mean_token_accuracy": 0.7691874206066132,
"num_tokens": 2251132.0,
"step": 138
},
{
"entropy": 0.5601507127285004,
"epoch": 0.5205992509363296,
"grad_norm": 0.03075527958571911,
"learning_rate": 0.0002,
"loss": 0.5602597594261169,
"mean_token_accuracy": 0.7736657857894897,
"num_tokens": 2267424.0,
"step": 139
},
{
"entropy": 0.5564019232988358,
"epoch": 0.5243445692883895,
"grad_norm": 0.03025938756763935,
"learning_rate": 0.0002,
"loss": 0.5628267526626587,
"mean_token_accuracy": 0.771067887544632,
"num_tokens": 2283849.0,
"step": 140
},
{
"entropy": 0.5395451635122299,
"epoch": 0.5280898876404494,
"grad_norm": 0.03199173882603645,
"learning_rate": 0.0002,
"loss": 0.5487725734710693,
"mean_token_accuracy": 0.7775663435459137,
"num_tokens": 2299872.0,
"step": 141
},
{
"entropy": 0.5526085048913956,
"epoch": 0.5318352059925093,
"grad_norm": 0.030539415776729584,
"learning_rate": 0.0002,
"loss": 0.5591868162155151,
"mean_token_accuracy": 0.7733905166387558,
"num_tokens": 2316381.0,
"step": 142
},
{
"entropy": 0.5586904883384705,
"epoch": 0.5355805243445693,
"grad_norm": 0.03167688101530075,
"learning_rate": 0.0002,
"loss": 0.5590608716011047,
"mean_token_accuracy": 0.7722269594669342,
"num_tokens": 2332636.0,
"step": 143
},
{
"entropy": 0.5568670481443405,
"epoch": 0.5393258426966292,
"grad_norm": 0.02876191958785057,
"learning_rate": 0.0002,
"loss": 0.5519507527351379,
"mean_token_accuracy": 0.776704877614975,
"num_tokens": 2348823.0,
"step": 144
},
{
"entropy": 0.5536152571439743,
"epoch": 0.5430711610486891,
"grad_norm": 0.026966845616698265,
"learning_rate": 0.0002,
"loss": 0.5451969504356384,
"mean_token_accuracy": 0.7772984057664871,
"num_tokens": 2365018.0,
"step": 145
},
{
"entropy": 0.55972820520401,
"epoch": 0.5468164794007491,
"grad_norm": 0.028171516954898834,
"learning_rate": 0.0002,
"loss": 0.5568036437034607,
"mean_token_accuracy": 0.7727039009332657,
"num_tokens": 2381199.0,
"step": 146
},
{
"entropy": 0.5505439043045044,
"epoch": 0.550561797752809,
"grad_norm": 0.02772362343966961,
"learning_rate": 0.0002,
"loss": 0.5527427792549133,
"mean_token_accuracy": 0.7765008956193924,
"num_tokens": 2397235.0,
"step": 147
},
{
"entropy": 0.5575017333030701,
"epoch": 0.5543071161048689,
"grad_norm": 0.030587337911128998,
"learning_rate": 0.0002,
"loss": 0.5631366968154907,
"mean_token_accuracy": 0.7698703855276108,
"num_tokens": 2413454.0,
"step": 148
},
{
"entropy": 0.5469523966312408,
"epoch": 0.5580524344569289,
"grad_norm": 0.0317547544836998,
"learning_rate": 0.0002,
"loss": 0.554557740688324,
"mean_token_accuracy": 0.776221752166748,
"num_tokens": 2429888.0,
"step": 149
},
{
"entropy": 0.5393165349960327,
"epoch": 0.5617977528089888,
"grad_norm": 0.028293034061789513,
"learning_rate": 0.0002,
"loss": 0.538506269454956,
"mean_token_accuracy": 0.7823521643877029,
"num_tokens": 2446146.0,
"step": 150
},
{
"entropy": 0.5640445649623871,
"epoch": 0.5655430711610487,
"grad_norm": 0.027342529967427254,
"learning_rate": 0.0002,
"loss": 0.5663660764694214,
"mean_token_accuracy": 0.7686634063720703,
"num_tokens": 2462436.0,
"step": 151
},
{
"entropy": 0.5660315603017807,
"epoch": 0.5692883895131086,
"grad_norm": 0.029160011559724808,
"learning_rate": 0.0002,
"loss": 0.5658541917800903,
"mean_token_accuracy": 0.7699626982212067,
"num_tokens": 2478983.0,
"step": 152
},
{
"entropy": 0.5457171052694321,
"epoch": 0.5730337078651685,
"grad_norm": 0.029130199924111366,
"learning_rate": 0.0002,
"loss": 0.5439150929450989,
"mean_token_accuracy": 0.7802361398935318,
"num_tokens": 2495263.0,
"step": 153
},
{
"entropy": 0.5504166930913925,
"epoch": 0.5767790262172284,
"grad_norm": 0.03016018122434616,
"learning_rate": 0.0002,
"loss": 0.5510883331298828,
"mean_token_accuracy": 0.775614932179451,
"num_tokens": 2511475.0,
"step": 154
},
{
"entropy": 0.5550555139780045,
"epoch": 0.5805243445692884,
"grad_norm": 0.03134196624159813,
"learning_rate": 0.0002,
"loss": 0.5607972145080566,
"mean_token_accuracy": 0.7707046419382095,
"num_tokens": 2527673.0,
"step": 155
},
{
"entropy": 0.5454694628715515,
"epoch": 0.5842696629213483,
"grad_norm": 0.0311669260263443,
"learning_rate": 0.0002,
"loss": 0.5492562651634216,
"mean_token_accuracy": 0.779202476143837,
"num_tokens": 2543853.0,
"step": 156
},
{
"entropy": 0.5742276608943939,
"epoch": 0.5880149812734082,
"grad_norm": 0.027328435331583023,
"learning_rate": 0.0002,
"loss": 0.5779210329055786,
"mean_token_accuracy": 0.765041321516037,
"num_tokens": 2560115.0,
"step": 157
},
{
"entropy": 0.5670003890991211,
"epoch": 0.5917602996254682,
"grad_norm": 0.02951730042695999,
"learning_rate": 0.0002,
"loss": 0.5664114952087402,
"mean_token_accuracy": 0.7700729966163635,
"num_tokens": 2576322.0,
"step": 158
},
{
"entropy": 0.5762516111135483,
"epoch": 0.5955056179775281,
"grad_norm": 0.029969869181513786,
"learning_rate": 0.0002,
"loss": 0.5735501050949097,
"mean_token_accuracy": 0.7683756053447723,
"num_tokens": 2592455.0,
"step": 159
},
{
"entropy": 0.5583818256855011,
"epoch": 0.599250936329588,
"grad_norm": 0.02687755413353443,
"learning_rate": 0.0002,
"loss": 0.5561562776565552,
"mean_token_accuracy": 0.7738349288702011,
"num_tokens": 2608647.0,
"step": 160
},
{
"entropy": 0.5745189636945724,
"epoch": 0.602996254681648,
"grad_norm": 0.03188227489590645,
"learning_rate": 0.0002,
"loss": 0.573383092880249,
"mean_token_accuracy": 0.7658237218856812,
"num_tokens": 2624851.0,
"step": 161
},
{
"entropy": 0.5701076835393906,
"epoch": 0.6067415730337079,
"grad_norm": 0.03216436505317688,
"learning_rate": 0.0002,
"loss": 0.5696204900741577,
"mean_token_accuracy": 0.7674751281738281,
"num_tokens": 2641365.0,
"step": 162
},
{
"entropy": 0.548926368355751,
"epoch": 0.6104868913857678,
"grad_norm": 0.02745572291314602,
"learning_rate": 0.0002,
"loss": 0.5530045032501221,
"mean_token_accuracy": 0.7764343470335007,
"num_tokens": 2657724.0,
"step": 163
},
{
"entropy": 0.5748997032642365,
"epoch": 0.6142322097378277,
"grad_norm": 0.03055480308830738,
"learning_rate": 0.0002,
"loss": 0.5857313275337219,
"mean_token_accuracy": 0.7639760226011276,
"num_tokens": 2674255.0,
"step": 164
},
{
"entropy": 0.5685756206512451,
"epoch": 0.6179775280898876,
"grad_norm": 0.030725592747330666,
"learning_rate": 0.0002,
"loss": 0.5727284550666809,
"mean_token_accuracy": 0.7686582803726196,
"num_tokens": 2690670.0,
"step": 165
},
{
"entropy": 0.547265499830246,
"epoch": 0.6217228464419475,
"grad_norm": 0.028982795774936676,
"learning_rate": 0.0002,
"loss": 0.5458434820175171,
"mean_token_accuracy": 0.7764610648155212,
"num_tokens": 2706990.0,
"step": 166
},
{
"entropy": 0.5669321566820145,
"epoch": 0.6254681647940075,
"grad_norm": 0.02999156154692173,
"learning_rate": 0.0002,
"loss": 0.5610904097557068,
"mean_token_accuracy": 0.7703774124383926,
"num_tokens": 2723382.0,
"step": 167
},
{
"entropy": 0.5631402879953384,
"epoch": 0.6292134831460674,
"grad_norm": 0.02727295272052288,
"learning_rate": 0.0002,
"loss": 0.5610119700431824,
"mean_token_accuracy": 0.7734928578138351,
"num_tokens": 2739673.0,
"step": 168
},
{
"entropy": 0.5462162643671036,
"epoch": 0.6329588014981273,
"grad_norm": 0.03161296248435974,
"learning_rate": 0.0002,
"loss": 0.5594881772994995,
"mean_token_accuracy": 0.7721333503723145,
"num_tokens": 2756004.0,
"step": 169
},
{
"entropy": 0.5525806844234467,
"epoch": 0.6367041198501873,
"grad_norm": 0.028923675417900085,
"learning_rate": 0.0002,
"loss": 0.5581262707710266,
"mean_token_accuracy": 0.7746219336986542,
"num_tokens": 2772131.0,
"step": 170
},
{
"entropy": 0.5815936326980591,
"epoch": 0.6404494382022472,
"grad_norm": 0.029989033937454224,
"learning_rate": 0.0002,
"loss": 0.5781337022781372,
"mean_token_accuracy": 0.7642954289913177,
"num_tokens": 2788556.0,
"step": 171
},
{
"entropy": 0.5742616653442383,
"epoch": 0.6441947565543071,
"grad_norm": 0.03870734944939613,
"learning_rate": 0.0002,
"loss": 0.5799432992935181,
"mean_token_accuracy": 0.7655478119850159,
"num_tokens": 2804635.0,
"step": 172
},
{
"entropy": 0.576400488615036,
"epoch": 0.6479400749063671,
"grad_norm": 0.02596936747431755,
"learning_rate": 0.0002,
"loss": 0.5705851912498474,
"mean_token_accuracy": 0.7653899490833282,
"num_tokens": 2821201.0,
"step": 173
},
{
"entropy": 0.5751689076423645,
"epoch": 0.651685393258427,
"grad_norm": 0.02525261603295803,
"learning_rate": 0.0002,
"loss": 0.5706028938293457,
"mean_token_accuracy": 0.7693078964948654,
"num_tokens": 2837952.0,
"step": 174
},
{
"entropy": 0.557927280664444,
"epoch": 0.6554307116104869,
"grad_norm": 0.025947891175746918,
"learning_rate": 0.0002,
"loss": 0.55954509973526,
"mean_token_accuracy": 0.7710674405097961,
"num_tokens": 2854247.0,
"step": 175
},
{
"entropy": 0.5340227037668228,
"epoch": 0.6591760299625468,
"grad_norm": 0.03157508745789528,
"learning_rate": 0.0002,
"loss": 0.5432956218719482,
"mean_token_accuracy": 0.7804963290691376,
"num_tokens": 2870169.0,
"step": 176
},
{
"entropy": 0.5522671341896057,
"epoch": 0.6629213483146067,
"grad_norm": 0.027346299961209297,
"learning_rate": 0.0002,
"loss": 0.5591444969177246,
"mean_token_accuracy": 0.7712446004152298,
"num_tokens": 2886516.0,
"step": 177
},
{
"entropy": 0.5393896102905273,
"epoch": 0.6666666666666666,
"grad_norm": 0.027576690539717674,
"learning_rate": 0.0002,
"loss": 0.5416374206542969,
"mean_token_accuracy": 0.7780617028474808,
"num_tokens": 2902729.0,
"step": 178
},
{
"entropy": 0.5685822814702988,
"epoch": 0.6704119850187266,
"grad_norm": 0.03415964916348457,
"learning_rate": 0.0002,
"loss": 0.5774993300437927,
"mean_token_accuracy": 0.7654603570699692,
"num_tokens": 2919059.0,
"step": 179
},
{
"entropy": 0.5473489463329315,
"epoch": 0.6741573033707865,
"grad_norm": 0.03175094723701477,
"learning_rate": 0.0002,
"loss": 0.5478240847587585,
"mean_token_accuracy": 0.7771035730838776,
"num_tokens": 2935209.0,
"step": 180
},
{
"entropy": 0.5505825132131577,
"epoch": 0.6779026217228464,
"grad_norm": 0.027963241562247276,
"learning_rate": 0.0002,
"loss": 0.5473360419273376,
"mean_token_accuracy": 0.7776090204715729,
"num_tokens": 2951643.0,
"step": 181
},
{
"entropy": 0.5541345179080963,
"epoch": 0.6816479400749064,
"grad_norm": 0.03300129622220993,
"learning_rate": 0.0002,
"loss": 0.5419403910636902,
"mean_token_accuracy": 0.7789575755596161,
"num_tokens": 2967938.0,
"step": 182
},
{
"entropy": 0.5639268904924393,
"epoch": 0.6853932584269663,
"grad_norm": 0.032656021416187286,
"learning_rate": 0.0002,
"loss": 0.5597264170646667,
"mean_token_accuracy": 0.7759947925806046,
"num_tokens": 2984230.0,
"step": 183
},
{
"entropy": 0.5538647770881653,
"epoch": 0.6891385767790262,
"grad_norm": 0.03382604569196701,
"learning_rate": 0.0002,
"loss": 0.5666002631187439,
"mean_token_accuracy": 0.7692589312791824,
"num_tokens": 3000607.0,
"step": 184
},
{
"entropy": 0.5578113794326782,
"epoch": 0.6928838951310862,
"grad_norm": 0.03644486889243126,
"learning_rate": 0.0002,
"loss": 0.5739911198616028,
"mean_token_accuracy": 0.7684497386217117,
"num_tokens": 3017077.0,
"step": 185
},
{
"entropy": 0.5290449112653732,
"epoch": 0.6966292134831461,
"grad_norm": 0.027713051065802574,
"learning_rate": 0.0002,
"loss": 0.5355228781700134,
"mean_token_accuracy": 0.7826152592897415,
"num_tokens": 3032996.0,
"step": 186
},
{
"entropy": 0.5759813338518143,
"epoch": 0.700374531835206,
"grad_norm": 0.03057127632200718,
"learning_rate": 0.0002,
"loss": 0.569280743598938,
"mean_token_accuracy": 0.7680912464857101,
"num_tokens": 3049460.0,
"step": 187
},
{
"entropy": 0.5720777213573456,
"epoch": 0.704119850187266,
"grad_norm": 0.02572391740977764,
"learning_rate": 0.0002,
"loss": 0.5658439993858337,
"mean_token_accuracy": 0.7709487825632095,
"num_tokens": 3065672.0,
"step": 188
},
{
"entropy": 0.5517766922712326,
"epoch": 0.7078651685393258,
"grad_norm": 0.029554082080721855,
"learning_rate": 0.0002,
"loss": 0.5389034748077393,
"mean_token_accuracy": 0.7830005586147308,
"num_tokens": 3082173.0,
"step": 189
},
{
"entropy": 0.5635267347097397,
"epoch": 0.7116104868913857,
"grad_norm": 0.025442970916628838,
"learning_rate": 0.0002,
"loss": 0.5614153742790222,
"mean_token_accuracy": 0.7708731889724731,
"num_tokens": 3098727.0,
"step": 190
},
{
"entropy": 0.5624646097421646,
"epoch": 0.7153558052434457,
"grad_norm": 0.03501886874437332,
"learning_rate": 0.0002,
"loss": 0.5751168727874756,
"mean_token_accuracy": 0.7674457877874374,
"num_tokens": 3115031.0,
"step": 191
},
{
"entropy": 0.5412020832300186,
"epoch": 0.7191011235955056,
"grad_norm": 0.029673364013433456,
"learning_rate": 0.0002,
"loss": 0.5503013730049133,
"mean_token_accuracy": 0.780591607093811,
"num_tokens": 3131271.0,
"step": 192
},
{
"entropy": 0.557359516620636,
"epoch": 0.7228464419475655,
"grad_norm": 0.025931306183338165,
"learning_rate": 0.0002,
"loss": 0.559468150138855,
"mean_token_accuracy": 0.7729436904191971,
"num_tokens": 3147732.0,
"step": 193
},
{
"entropy": 0.5394045114517212,
"epoch": 0.7265917602996255,
"grad_norm": 0.0292246975004673,
"learning_rate": 0.0002,
"loss": 0.5409769415855408,
"mean_token_accuracy": 0.7795000076293945,
"num_tokens": 3163963.0,
"step": 194
},
{
"entropy": 0.5587436705827713,
"epoch": 0.7303370786516854,
"grad_norm": 0.03306795284152031,
"learning_rate": 0.0002,
"loss": 0.5556156039237976,
"mean_token_accuracy": 0.7742602825164795,
"num_tokens": 3179928.0,
"step": 195
},
{
"entropy": 0.558687686920166,
"epoch": 0.7340823970037453,
"grad_norm": 0.025363627821207047,
"learning_rate": 0.0002,
"loss": 0.5573633909225464,
"mean_token_accuracy": 0.7759020626544952,
"num_tokens": 3196142.0,
"step": 196
},
{
"entropy": 0.545383557677269,
"epoch": 0.7378277153558053,
"grad_norm": 0.027863260358572006,
"learning_rate": 0.0002,
"loss": 0.5485226511955261,
"mean_token_accuracy": 0.7776659727096558,
"num_tokens": 3212565.0,
"step": 197
},
{
"entropy": 0.5556656569242477,
"epoch": 0.7415730337078652,
"grad_norm": 0.035580288618803024,
"learning_rate": 0.0002,
"loss": 0.5673390626907349,
"mean_token_accuracy": 0.7700339257717133,
"num_tokens": 3228915.0,
"step": 198
},
{
"entropy": 0.5520624220371246,
"epoch": 0.7453183520599251,
"grad_norm": 0.02862994559109211,
"learning_rate": 0.0002,
"loss": 0.5494414567947388,
"mean_token_accuracy": 0.7801119983196259,
"num_tokens": 3245273.0,
"step": 199
},
{
"entropy": 0.5758003443479538,
"epoch": 0.7490636704119851,
"grad_norm": 0.0339261032640934,
"learning_rate": 0.0002,
"loss": 0.5687139630317688,
"mean_token_accuracy": 0.7678625285625458,
"num_tokens": 3261785.0,
"step": 200
},
{
"entropy": 0.568912148475647,
"epoch": 0.7528089887640449,
"grad_norm": 0.029947372153401375,
"learning_rate": 0.0002,
"loss": 0.5638163089752197,
"mean_token_accuracy": 0.77249875664711,
"num_tokens": 3278313.0,
"step": 201
},
{
"entropy": 0.5490483492612839,
"epoch": 0.7565543071161048,
"grad_norm": 0.02934352308511734,
"learning_rate": 0.0002,
"loss": 0.5535009503364563,
"mean_token_accuracy": 0.7746146768331528,
"num_tokens": 3294575.0,
"step": 202
},
{
"entropy": 0.560209795832634,
"epoch": 0.7602996254681648,
"grad_norm": 0.031990889459848404,
"learning_rate": 0.0002,
"loss": 0.5637909770011902,
"mean_token_accuracy": 0.7735392153263092,
"num_tokens": 3310679.0,
"step": 203
},
{
"entropy": 0.5573873072862625,
"epoch": 0.7640449438202247,
"grad_norm": 0.02812575176358223,
"learning_rate": 0.0002,
"loss": 0.5629784464836121,
"mean_token_accuracy": 0.7686379998922348,
"num_tokens": 3327065.0,
"step": 204
},
{
"entropy": 0.534591019153595,
"epoch": 0.7677902621722846,
"grad_norm": 0.03412024676799774,
"learning_rate": 0.0002,
"loss": 0.546525239944458,
"mean_token_accuracy": 0.7761467695236206,
"num_tokens": 3343404.0,
"step": 205
},
{
"entropy": 0.5677939504384995,
"epoch": 0.7715355805243446,
"grad_norm": 0.02933080866932869,
"learning_rate": 0.0002,
"loss": 0.5688956379890442,
"mean_token_accuracy": 0.7702508270740509,
"num_tokens": 3359958.0,
"step": 206
},
{
"entropy": 0.582836389541626,
"epoch": 0.7752808988764045,
"grad_norm": 0.027001049369573593,
"learning_rate": 0.0002,
"loss": 0.5772212147712708,
"mean_token_accuracy": 0.7654514610767365,
"num_tokens": 3376426.0,
"step": 207
},
{
"entropy": 0.5876192450523376,
"epoch": 0.7790262172284644,
"grad_norm": 0.031185103580355644,
"learning_rate": 0.0002,
"loss": 0.5810344219207764,
"mean_token_accuracy": 0.7651431113481522,
"num_tokens": 3392821.0,
"step": 208
},
{
"entropy": 0.5676351487636566,
"epoch": 0.7827715355805244,
"grad_norm": 0.02849467284977436,
"learning_rate": 0.0002,
"loss": 0.5602158904075623,
"mean_token_accuracy": 0.771087646484375,
"num_tokens": 3409137.0,
"step": 209
},
{
"entropy": 0.5598850250244141,
"epoch": 0.7865168539325843,
"grad_norm": 0.028652694076299667,
"learning_rate": 0.0002,
"loss": 0.5560476779937744,
"mean_token_accuracy": 0.7744726985692978,
"num_tokens": 3425346.0,
"step": 210
},
{
"entropy": 0.5631076842546463,
"epoch": 0.7902621722846442,
"grad_norm": 0.03177965059876442,
"learning_rate": 0.0002,
"loss": 0.5703850984573364,
"mean_token_accuracy": 0.7688238769769669,
"num_tokens": 3441766.0,
"step": 211
},
{
"entropy": 0.5571614354848862,
"epoch": 0.7940074906367042,
"grad_norm": 0.035387102514505386,
"learning_rate": 0.0002,
"loss": 0.5680047869682312,
"mean_token_accuracy": 0.7702172994613647,
"num_tokens": 3458303.0,
"step": 212
},
{
"entropy": 0.5512831062078476,
"epoch": 0.797752808988764,
"grad_norm": 0.02970981039106846,
"learning_rate": 0.0002,
"loss": 0.5541270971298218,
"mean_token_accuracy": 0.7740521878004074,
"num_tokens": 3474455.0,
"step": 213
},
{
"entropy": 0.5604052096605301,
"epoch": 0.8014981273408239,
"grad_norm": 0.028583871200680733,
"learning_rate": 0.0002,
"loss": 0.5585545301437378,
"mean_token_accuracy": 0.7712778151035309,
"num_tokens": 3490567.0,
"step": 214
},
{
"entropy": 0.5531798452138901,
"epoch": 0.8052434456928839,
"grad_norm": 0.027284301817417145,
"learning_rate": 0.0002,
"loss": 0.5523191690444946,
"mean_token_accuracy": 0.7744116485118866,
"num_tokens": 3506697.0,
"step": 215
},
{
"entropy": 0.5611687004566193,
"epoch": 0.8089887640449438,
"grad_norm": 0.030331265181303024,
"learning_rate": 0.0002,
"loss": 0.5599703192710876,
"mean_token_accuracy": 0.7741329371929169,
"num_tokens": 3523064.0,
"step": 216
},
{
"entropy": 0.5679153800010681,
"epoch": 0.8127340823970037,
"grad_norm": 0.028981544077396393,
"learning_rate": 0.0002,
"loss": 0.5729029178619385,
"mean_token_accuracy": 0.7667650431394577,
"num_tokens": 3539143.0,
"step": 217
},
{
"entropy": 0.5438763052225113,
"epoch": 0.8164794007490637,
"grad_norm": 0.02691890485584736,
"learning_rate": 0.0002,
"loss": 0.5485566854476929,
"mean_token_accuracy": 0.7739608585834503,
"num_tokens": 3555565.0,
"step": 218
},
{
"entropy": 0.5619954615831375,
"epoch": 0.8202247191011236,
"grad_norm": 0.026171443983912468,
"learning_rate": 0.0002,
"loss": 0.5637154579162598,
"mean_token_accuracy": 0.7711703032255173,
"num_tokens": 3571906.0,
"step": 219
},
{
"entropy": 0.5464108288288116,
"epoch": 0.8239700374531835,
"grad_norm": 0.02858656644821167,
"learning_rate": 0.0002,
"loss": 0.5461940169334412,
"mean_token_accuracy": 0.7789376378059387,
"num_tokens": 3588158.0,
"step": 220
},
{
"entropy": 0.5636538565158844,
"epoch": 0.8277153558052435,
"grad_norm": 0.02787981554865837,
"learning_rate": 0.0002,
"loss": 0.5658812522888184,
"mean_token_accuracy": 0.7694707363843918,
"num_tokens": 3604701.0,
"step": 221
},
{
"entropy": 0.5738235861063004,
"epoch": 0.8314606741573034,
"grad_norm": 0.03107610158622265,
"learning_rate": 0.0002,
"loss": 0.5720517635345459,
"mean_token_accuracy": 0.767520397901535,
"num_tokens": 3621041.0,
"step": 222
},
{
"entropy": 0.5418261587619781,
"epoch": 0.8352059925093633,
"grad_norm": 0.030757945030927658,
"learning_rate": 0.0002,
"loss": 0.5468308925628662,
"mean_token_accuracy": 0.7743646949529648,
"num_tokens": 3637338.0,
"step": 223
},
{
"entropy": 0.5567242801189423,
"epoch": 0.8389513108614233,
"grad_norm": 0.031262289732694626,
"learning_rate": 0.0002,
"loss": 0.5633231997489929,
"mean_token_accuracy": 0.7722140103578568,
"num_tokens": 3653872.0,
"step": 224
},
{
"entropy": 0.5542743653059006,
"epoch": 0.8426966292134831,
"grad_norm": 0.03351176902651787,
"learning_rate": 0.0002,
"loss": 0.5574679374694824,
"mean_token_accuracy": 0.7744366973638535,
"num_tokens": 3670013.0,
"step": 225
},
{
"entropy": 0.5486074835062027,
"epoch": 0.846441947565543,
"grad_norm": 0.0312609001994133,
"learning_rate": 0.0002,
"loss": 0.545890748500824,
"mean_token_accuracy": 0.7778652608394623,
"num_tokens": 3686275.0,
"step": 226
},
{
"entropy": 0.5650633871555328,
"epoch": 0.850187265917603,
"grad_norm": 0.028242582455277443,
"learning_rate": 0.0002,
"loss": 0.5587697625160217,
"mean_token_accuracy": 0.7728594094514847,
"num_tokens": 3702890.0,
"step": 227
},
{
"entropy": 0.5442924797534943,
"epoch": 0.8539325842696629,
"grad_norm": 0.03206290304660797,
"learning_rate": 0.0002,
"loss": 0.5438553690910339,
"mean_token_accuracy": 0.7799272388219833,
"num_tokens": 3719196.0,
"step": 228
},
{
"entropy": 0.5688119828701019,
"epoch": 0.8576779026217228,
"grad_norm": 0.031068341806530952,
"learning_rate": 0.0002,
"loss": 0.5722005367279053,
"mean_token_accuracy": 0.7658038288354874,
"num_tokens": 3735614.0,
"step": 229
},
{
"entropy": 0.5671662837266922,
"epoch": 0.8614232209737828,
"grad_norm": 0.03664137050509453,
"learning_rate": 0.0002,
"loss": 0.5779143571853638,
"mean_token_accuracy": 0.7624872028827667,
"num_tokens": 3751617.0,
"step": 230
},
{
"entropy": 0.5505847632884979,
"epoch": 0.8651685393258427,
"grad_norm": 0.031469304114580154,
"learning_rate": 0.0002,
"loss": 0.5520802140235901,
"mean_token_accuracy": 0.7765519469976425,
"num_tokens": 3768020.0,
"step": 231
},
{
"entropy": 0.5407437533140182,
"epoch": 0.8689138576779026,
"grad_norm": 0.03157830610871315,
"learning_rate": 0.0002,
"loss": 0.53821861743927,
"mean_token_accuracy": 0.7832015603780746,
"num_tokens": 3784206.0,
"step": 232
},
{
"entropy": 0.5574967563152313,
"epoch": 0.8726591760299626,
"grad_norm": 0.03071594052016735,
"learning_rate": 0.0002,
"loss": 0.5562031865119934,
"mean_token_accuracy": 0.7721244394779205,
"num_tokens": 3800616.0,
"step": 233
},
{
"entropy": 0.5378725826740265,
"epoch": 0.8764044943820225,
"grad_norm": 0.030823221430182457,
"learning_rate": 0.0002,
"loss": 0.5407513380050659,
"mean_token_accuracy": 0.7836541086435318,
"num_tokens": 3816842.0,
"step": 234
},
{
"entropy": 0.5592721700668335,
"epoch": 0.8801498127340824,
"grad_norm": 0.03175733983516693,
"learning_rate": 0.0002,
"loss": 0.5660021305084229,
"mean_token_accuracy": 0.7676839083433151,
"num_tokens": 3833206.0,
"step": 235
},
{
"entropy": 0.5588899403810501,
"epoch": 0.8838951310861424,
"grad_norm": 0.03060559183359146,
"learning_rate": 0.0002,
"loss": 0.5651678442955017,
"mean_token_accuracy": 0.7706761956214905,
"num_tokens": 3849556.0,
"step": 236
},
{
"entropy": 0.5560838133096695,
"epoch": 0.8876404494382022,
"grad_norm": 0.03011494129896164,
"learning_rate": 0.0002,
"loss": 0.5619899034500122,
"mean_token_accuracy": 0.7695688903331757,
"num_tokens": 3865973.0,
"step": 237
},
{
"entropy": 0.572941854596138,
"epoch": 0.8913857677902621,
"grad_norm": 0.02626178041100502,
"learning_rate": 0.0002,
"loss": 0.5712540149688721,
"mean_token_accuracy": 0.7688916623592377,
"num_tokens": 3882349.0,
"step": 238
},
{
"entropy": 0.5688192397356033,
"epoch": 0.8951310861423221,
"grad_norm": 0.0268928874284029,
"learning_rate": 0.0002,
"loss": 0.562833309173584,
"mean_token_accuracy": 0.7708128988742828,
"num_tokens": 3898536.0,
"step": 239
},
{
"entropy": 0.5633461475372314,
"epoch": 0.898876404494382,
"grad_norm": 0.029186321422457695,
"learning_rate": 0.0002,
"loss": 0.5525766611099243,
"mean_token_accuracy": 0.7749095112085342,
"num_tokens": 3914950.0,
"step": 240
},
{
"entropy": 0.5715253502130508,
"epoch": 0.9026217228464419,
"grad_norm": 0.029228920117020607,
"learning_rate": 0.0002,
"loss": 0.5710093975067139,
"mean_token_accuracy": 0.7693532109260559,
"num_tokens": 3931161.0,
"step": 241
},
{
"entropy": 0.5170925259590149,
"epoch": 0.9063670411985019,
"grad_norm": 0.03571123257279396,
"learning_rate": 0.0002,
"loss": 0.52873295545578,
"mean_token_accuracy": 0.7879834473133087,
"num_tokens": 3947256.0,
"step": 242
},
{
"entropy": 0.5353554487228394,
"epoch": 0.9101123595505618,
"grad_norm": 0.031091809272766113,
"learning_rate": 0.0002,
"loss": 0.5437985062599182,
"mean_token_accuracy": 0.7802935838699341,
"num_tokens": 3963703.0,
"step": 243
},
{
"entropy": 0.5593858063220978,
"epoch": 0.9138576779026217,
"grad_norm": 0.028724675998091698,
"learning_rate": 0.0002,
"loss": 0.5654380321502686,
"mean_token_accuracy": 0.766664981842041,
"num_tokens": 3980237.0,
"step": 244
},
{
"entropy": 0.5452692359685898,
"epoch": 0.9176029962546817,
"grad_norm": 0.032008957117795944,
"learning_rate": 0.0002,
"loss": 0.5489979982376099,
"mean_token_accuracy": 0.7783998996019363,
"num_tokens": 3996411.0,
"step": 245
},
{
"entropy": 0.5732362270355225,
"epoch": 0.9213483146067416,
"grad_norm": 0.026769591495394707,
"learning_rate": 0.0002,
"loss": 0.5739398002624512,
"mean_token_accuracy": 0.7671795785427094,
"num_tokens": 4012857.0,
"step": 246
},
{
"entropy": 0.5656879991292953,
"epoch": 0.9250936329588015,
"grad_norm": 0.03197095915675163,
"learning_rate": 0.0002,
"loss": 0.563187301158905,
"mean_token_accuracy": 0.7670102566480637,
"num_tokens": 4029053.0,
"step": 247
},
{
"entropy": 0.5575947314500809,
"epoch": 0.9288389513108615,
"grad_norm": 0.02987116388976574,
"learning_rate": 0.0002,
"loss": 0.5625151991844177,
"mean_token_accuracy": 0.7722823321819305,
"num_tokens": 4045520.0,
"step": 248
},
{
"entropy": 0.5391925573348999,
"epoch": 0.9325842696629213,
"grad_norm": 0.03071737289428711,
"learning_rate": 0.0002,
"loss": 0.5494749546051025,
"mean_token_accuracy": 0.7774742394685745,
"num_tokens": 4061722.0,
"step": 249
},
{
"entropy": 0.5374163240194321,
"epoch": 0.9363295880149812,
"grad_norm": 0.03443381190299988,
"learning_rate": 0.0002,
"loss": 0.5430468916893005,
"mean_token_accuracy": 0.7767436355352402,
"num_tokens": 4077909.0,
"step": 250
},
{
"entropy": 0.563934788107872,
"epoch": 0.9400749063670412,
"grad_norm": 0.03456362709403038,
"learning_rate": 0.0002,
"loss": 0.5705171227455139,
"mean_token_accuracy": 0.7667582482099533,
"num_tokens": 4094266.0,
"step": 251
},
{
"entropy": 0.5498995631933212,
"epoch": 0.9438202247191011,
"grad_norm": 0.03230346366763115,
"learning_rate": 0.0002,
"loss": 0.5477432012557983,
"mean_token_accuracy": 0.7797223627567291,
"num_tokens": 4110154.0,
"step": 252
},
{
"entropy": 0.5815821886062622,
"epoch": 0.947565543071161,
"grad_norm": 0.030871113762259483,
"learning_rate": 0.0002,
"loss": 0.5757232904434204,
"mean_token_accuracy": 0.7643865346908569,
"num_tokens": 4126298.0,
"step": 253
},
{
"entropy": 0.568855032324791,
"epoch": 0.951310861423221,
"grad_norm": 0.03128105401992798,
"learning_rate": 0.0002,
"loss": 0.5623528361320496,
"mean_token_accuracy": 0.7733433544635773,
"num_tokens": 4142423.0,
"step": 254
},
{
"entropy": 0.5580300092697144,
"epoch": 0.9550561797752809,
"grad_norm": 0.028919901698827744,
"learning_rate": 0.0002,
"loss": 0.5540750026702881,
"mean_token_accuracy": 0.7751399129629135,
"num_tokens": 4158616.0,
"step": 255
},
{
"entropy": 0.5586510896682739,
"epoch": 0.9588014981273408,
"grad_norm": 0.028054876253008842,
"learning_rate": 0.0002,
"loss": 0.5566189289093018,
"mean_token_accuracy": 0.771488219499588,
"num_tokens": 4174981.0,
"step": 256
},
{
"entropy": 0.5506493747234344,
"epoch": 0.9625468164794008,
"grad_norm": 0.028799347579479218,
"learning_rate": 0.0002,
"loss": 0.5535633563995361,
"mean_token_accuracy": 0.7742148786783218,
"num_tokens": 4191446.0,
"step": 257
},
{
"entropy": 0.5423731654882431,
"epoch": 0.9662921348314607,
"grad_norm": 0.033325713127851486,
"learning_rate": 0.0002,
"loss": 0.5534674525260925,
"mean_token_accuracy": 0.773481622338295,
"num_tokens": 4207545.0,
"step": 258
},
{
"entropy": 0.5463626831769943,
"epoch": 0.9700374531835206,
"grad_norm": 0.029474180191755295,
"learning_rate": 0.0002,
"loss": 0.5469580888748169,
"mean_token_accuracy": 0.778034508228302,
"num_tokens": 4223705.0,
"step": 259
},
{
"entropy": 0.5447346717119217,
"epoch": 0.9737827715355806,
"grad_norm": 0.02612573839724064,
"learning_rate": 0.0002,
"loss": 0.5400044322013855,
"mean_token_accuracy": 0.7802340239286423,
"num_tokens": 4240129.0,
"step": 260
},
{
"entropy": 0.5821470022201538,
"epoch": 0.9775280898876404,
"grad_norm": 0.030348099768161774,
"learning_rate": 0.0002,
"loss": 0.5687776803970337,
"mean_token_accuracy": 0.7688710540533066,
"num_tokens": 4256543.0,
"step": 261
},
{
"entropy": 0.5551526695489883,
"epoch": 0.9812734082397003,
"grad_norm": 0.027197403833270073,
"learning_rate": 0.0002,
"loss": 0.5550498962402344,
"mean_token_accuracy": 0.7730266898870468,
"num_tokens": 4272850.0,
"step": 262
},
{
"entropy": 0.558951735496521,
"epoch": 0.9850187265917603,
"grad_norm": 0.02930772304534912,
"learning_rate": 0.0002,
"loss": 0.568732738494873,
"mean_token_accuracy": 0.7649472206830978,
"num_tokens": 4288981.0,
"step": 263
},
{
"entropy": 0.5453519076108932,
"epoch": 0.9887640449438202,
"grad_norm": 0.03282203525304794,
"learning_rate": 0.0002,
"loss": 0.5584692358970642,
"mean_token_accuracy": 0.7731108516454697,
"num_tokens": 4305020.0,
"step": 264
},
{
"entropy": 0.5550204813480377,
"epoch": 0.9925093632958801,
"grad_norm": 0.030776405707001686,
"learning_rate": 0.0002,
"loss": 0.5647276639938354,
"mean_token_accuracy": 0.7714035212993622,
"num_tokens": 4321505.0,
"step": 265
},
{
"entropy": 0.5713452994823456,
"epoch": 0.9962546816479401,
"grad_norm": 0.027741121128201485,
"learning_rate": 0.0002,
"loss": 0.5671746134757996,
"mean_token_accuracy": 0.77179254591465,
"num_tokens": 4337819.0,
"step": 266
},
{
"entropy": 0.5695875138044357,
"epoch": 1.0,
"grad_norm": 0.03063138760626316,
"learning_rate": 0.0002,
"loss": 0.5631532669067383,
"mean_token_accuracy": 0.7723733484745026,
"num_tokens": 4354077.0,
"step": 267
},
{
"entropy": 0.5564615577459335,
"epoch": 1.00374531835206,
"grad_norm": 0.02938828431069851,
"learning_rate": 0.0002,
"loss": 0.5473178625106812,
"mean_token_accuracy": 0.7778049558401108,
"num_tokens": 4370546.0,
"step": 268
},
{
"entropy": 0.5574217587709427,
"epoch": 1.0074906367041199,
"grad_norm": 0.029280902817845345,
"learning_rate": 0.0002,
"loss": 0.5522539019584656,
"mean_token_accuracy": 0.774829238653183,
"num_tokens": 4386769.0,
"step": 269
},
{
"entropy": 0.5274022594094276,
"epoch": 1.0112359550561798,
"grad_norm": 0.03879232704639435,
"learning_rate": 0.0002,
"loss": 0.5378210544586182,
"mean_token_accuracy": 0.7831418812274933,
"num_tokens": 4402982.0,
"step": 270
},
{
"entropy": 0.5290966331958771,
"epoch": 1.0149812734082397,
"grad_norm": 0.03839439898729324,
"learning_rate": 0.0002,
"loss": 0.5428091883659363,
"mean_token_accuracy": 0.7794705182313919,
"num_tokens": 4418967.0,
"step": 271
},
{
"entropy": 0.5340720564126968,
"epoch": 1.0187265917602997,
"grad_norm": 0.027254262939095497,
"learning_rate": 0.0002,
"loss": 0.5355733633041382,
"mean_token_accuracy": 0.7818265557289124,
"num_tokens": 4435204.0,
"step": 272
},
{
"entropy": 0.5440738946199417,
"epoch": 1.0224719101123596,
"grad_norm": 0.03392236679792404,
"learning_rate": 0.0002,
"loss": 0.5456275939941406,
"mean_token_accuracy": 0.780282586812973,
"num_tokens": 4451432.0,
"step": 273
},
{
"entropy": 0.5574818104505539,
"epoch": 1.0262172284644195,
"grad_norm": 0.026871202513575554,
"learning_rate": 0.0002,
"loss": 0.5559114217758179,
"mean_token_accuracy": 0.777089074254036,
"num_tokens": 4467766.0,
"step": 274
},
{
"entropy": 0.5488097965717316,
"epoch": 1.0299625468164795,
"grad_norm": 0.029019974172115326,
"learning_rate": 0.0002,
"loss": 0.5336285829544067,
"mean_token_accuracy": 0.7849163711071014,
"num_tokens": 4483969.0,
"step": 275
},
{
"entropy": 0.5530442148447037,
"epoch": 1.0337078651685394,
"grad_norm": 0.02914772555232048,
"learning_rate": 0.0002,
"loss": 0.5511333346366882,
"mean_token_accuracy": 0.7753048241138458,
"num_tokens": 4500202.0,
"step": 276
},
{
"entropy": 0.5580654293298721,
"epoch": 1.0374531835205993,
"grad_norm": 0.02970791608095169,
"learning_rate": 0.0002,
"loss": 0.5622603297233582,
"mean_token_accuracy": 0.7713205814361572,
"num_tokens": 4516619.0,
"step": 277
},
{
"entropy": 0.5405817478895187,
"epoch": 1.0411985018726593,
"grad_norm": 0.0317082442343235,
"learning_rate": 0.0002,
"loss": 0.5510064363479614,
"mean_token_accuracy": 0.7750898003578186,
"num_tokens": 4532787.0,
"step": 278
},
{
"entropy": 0.529707208275795,
"epoch": 1.0449438202247192,
"grad_norm": 0.032039616256952286,
"learning_rate": 0.0002,
"loss": 0.5385198593139648,
"mean_token_accuracy": 0.7802569419145584,
"num_tokens": 4549095.0,
"step": 279
},
{
"entropy": 0.536220982670784,
"epoch": 1.048689138576779,
"grad_norm": 0.03247847780585289,
"learning_rate": 0.0002,
"loss": 0.5422552824020386,
"mean_token_accuracy": 0.7777614146471024,
"num_tokens": 4565068.0,
"step": 280
},
{
"entropy": 0.5643364787101746,
"epoch": 1.0524344569288389,
"grad_norm": 0.03038158267736435,
"learning_rate": 0.0002,
"loss": 0.5526927709579468,
"mean_token_accuracy": 0.7772861868143082,
"num_tokens": 4581362.0,
"step": 281
},
{
"entropy": 0.5710341036319733,
"epoch": 1.0561797752808988,
"grad_norm": 0.029375184327363968,
"learning_rate": 0.0002,
"loss": 0.5627338290214539,
"mean_token_accuracy": 0.7716031968593597,
"num_tokens": 4598044.0,
"step": 282
},
{
"entropy": 0.5661873072385788,
"epoch": 1.0599250936329587,
"grad_norm": 0.029537923634052277,
"learning_rate": 0.0002,
"loss": 0.5619353652000427,
"mean_token_accuracy": 0.7722314894199371,
"num_tokens": 4614605.0,
"step": 283
},
{
"entropy": 0.545825719833374,
"epoch": 1.0636704119850187,
"grad_norm": 0.028511304408311844,
"learning_rate": 0.0002,
"loss": 0.5431419610977173,
"mean_token_accuracy": 0.7778640240430832,
"num_tokens": 4630914.0,
"step": 284
},
{
"entropy": 0.5331753790378571,
"epoch": 1.0674157303370786,
"grad_norm": 0.032436709851026535,
"learning_rate": 0.0002,
"loss": 0.5459548830986023,
"mean_token_accuracy": 0.7751310169696808,
"num_tokens": 4647234.0,
"step": 285
},
{
"entropy": 0.5640293508768082,
"epoch": 1.0711610486891385,
"grad_norm": 0.0322943851351738,
"learning_rate": 0.0002,
"loss": 0.5726660490036011,
"mean_token_accuracy": 0.76516292989254,
"num_tokens": 4663828.0,
"step": 286
},
{
"entropy": 0.5655198693275452,
"epoch": 1.0749063670411985,
"grad_norm": 0.028429750353097916,
"learning_rate": 0.0002,
"loss": 0.5707299709320068,
"mean_token_accuracy": 0.7665908485651016,
"num_tokens": 4680191.0,
"step": 287
},
{
"entropy": 0.5641037821769714,
"epoch": 1.0786516853932584,
"grad_norm": 0.02850640006363392,
"learning_rate": 0.0002,
"loss": 0.5591652393341064,
"mean_token_accuracy": 0.7727868556976318,
"num_tokens": 4696297.0,
"step": 288
},
{
"entropy": 0.5585228204727173,
"epoch": 1.0823970037453183,
"grad_norm": 0.03052029199898243,
"learning_rate": 0.0002,
"loss": 0.5535526275634766,
"mean_token_accuracy": 0.7758607268333435,
"num_tokens": 4712608.0,
"step": 289
},
{
"entropy": 0.5454631745815277,
"epoch": 1.0861423220973783,
"grad_norm": 0.02904430776834488,
"learning_rate": 0.0002,
"loss": 0.5463353395462036,
"mean_token_accuracy": 0.7812290787696838,
"num_tokens": 4728702.0,
"step": 290
},
{
"entropy": 0.547488197684288,
"epoch": 1.0898876404494382,
"grad_norm": 0.02964003197848797,
"learning_rate": 0.0002,
"loss": 0.5422903299331665,
"mean_token_accuracy": 0.7805432081222534,
"num_tokens": 4745177.0,
"step": 291
},
{
"entropy": 0.5354203134775162,
"epoch": 1.0936329588014981,
"grad_norm": 0.036443792283535004,
"learning_rate": 0.0002,
"loss": 0.5374300479888916,
"mean_token_accuracy": 0.7797484993934631,
"num_tokens": 4761143.0,
"step": 292
},
{
"entropy": 0.5536107122898102,
"epoch": 1.097378277153558,
"grad_norm": 0.028762439265847206,
"learning_rate": 0.0002,
"loss": 0.5621394515037537,
"mean_token_accuracy": 0.7706074863672256,
"num_tokens": 4777282.0,
"step": 293
},
{
"entropy": 0.5409039855003357,
"epoch": 1.101123595505618,
"grad_norm": 0.03404904156923294,
"learning_rate": 0.0002,
"loss": 0.5510942339897156,
"mean_token_accuracy": 0.7781406044960022,
"num_tokens": 4793365.0,
"step": 294
},
{
"entropy": 0.5496554970741272,
"epoch": 1.104868913857678,
"grad_norm": 0.03300090506672859,
"learning_rate": 0.0002,
"loss": 0.5508947372436523,
"mean_token_accuracy": 0.7776678502559662,
"num_tokens": 4809752.0,
"step": 295
},
{
"entropy": 0.5615599453449249,
"epoch": 1.1086142322097379,
"grad_norm": 0.02708325907588005,
"learning_rate": 0.0002,
"loss": 0.5569652915000916,
"mean_token_accuracy": 0.7737039029598236,
"num_tokens": 4826077.0,
"step": 296
},
{
"entropy": 0.5593246519565582,
"epoch": 1.1123595505617978,
"grad_norm": 0.03139323368668556,
"learning_rate": 0.0002,
"loss": 0.5524771809577942,
"mean_token_accuracy": 0.7745187878608704,
"num_tokens": 4842333.0,
"step": 297
},
{
"entropy": 0.5454850494861603,
"epoch": 1.1161048689138577,
"grad_norm": 0.02898702770471573,
"learning_rate": 0.0002,
"loss": 0.5425970554351807,
"mean_token_accuracy": 0.7789193391799927,
"num_tokens": 4858558.0,
"step": 298
},
{
"entropy": 0.538344144821167,
"epoch": 1.1198501872659177,
"grad_norm": 0.029788950458168983,
"learning_rate": 0.0002,
"loss": 0.5424114465713501,
"mean_token_accuracy": 0.7777515351772308,
"num_tokens": 4874826.0,
"step": 299
},
{
"entropy": 0.5260975658893585,
"epoch": 1.1235955056179776,
"grad_norm": 0.03646169230341911,
"learning_rate": 0.0002,
"loss": 0.5355998277664185,
"mean_token_accuracy": 0.7840575128793716,
"num_tokens": 4890978.0,
"step": 300
},
{
"entropy": 0.5369604676961899,
"epoch": 1.1273408239700375,
"grad_norm": 0.03131569176912308,
"learning_rate": 0.0002,
"loss": 0.540716290473938,
"mean_token_accuracy": 0.780446395277977,
"num_tokens": 4907064.0,
"step": 301
},
{
"entropy": 0.5605516880750656,
"epoch": 1.1310861423220975,
"grad_norm": 0.034511223435401917,
"learning_rate": 0.0002,
"loss": 0.5577893257141113,
"mean_token_accuracy": 0.7730138152837753,
"num_tokens": 4923266.0,
"step": 302
},
{
"entropy": 0.5472770929336548,
"epoch": 1.1348314606741572,
"grad_norm": 0.0347181111574173,
"learning_rate": 0.0002,
"loss": 0.5447498559951782,
"mean_token_accuracy": 0.7790001332759857,
"num_tokens": 4939554.0,
"step": 303
},
{
"entropy": 0.5580919533967972,
"epoch": 1.1385767790262173,
"grad_norm": 0.029458722099661827,
"learning_rate": 0.0002,
"loss": 0.5602295994758606,
"mean_token_accuracy": 0.7698655724525452,
"num_tokens": 4955864.0,
"step": 304
},
{
"entropy": 0.5566238462924957,
"epoch": 1.142322097378277,
"grad_norm": 0.03371216729283333,
"learning_rate": 0.0002,
"loss": 0.5516577363014221,
"mean_token_accuracy": 0.7762005478143692,
"num_tokens": 4972145.0,
"step": 305
},
{
"entropy": 0.5444543808698654,
"epoch": 1.146067415730337,
"grad_norm": 0.03240659460425377,
"learning_rate": 0.0002,
"loss": 0.5465469360351562,
"mean_token_accuracy": 0.7778800278902054,
"num_tokens": 4988600.0,
"step": 306
},
{
"entropy": 0.5197838395833969,
"epoch": 1.149812734082397,
"grad_norm": 0.03453533351421356,
"learning_rate": 0.0002,
"loss": 0.52244633436203,
"mean_token_accuracy": 0.7865428030490875,
"num_tokens": 5004593.0,
"step": 307
},
{
"entropy": 0.5355952382087708,
"epoch": 1.1535580524344569,
"grad_norm": 0.02796328440308571,
"learning_rate": 0.0002,
"loss": 0.5417516231536865,
"mean_token_accuracy": 0.778742790222168,
"num_tokens": 5020798.0,
"step": 308
},
{
"entropy": 0.5339494347572327,
"epoch": 1.1573033707865168,
"grad_norm": 0.031283531337976456,
"learning_rate": 0.0002,
"loss": 0.5422439575195312,
"mean_token_accuracy": 0.7790778428316116,
"num_tokens": 5037095.0,
"step": 309
},
{
"entropy": 0.5599728673696518,
"epoch": 1.1610486891385767,
"grad_norm": 0.029156681150197983,
"learning_rate": 0.0002,
"loss": 0.5628546476364136,
"mean_token_accuracy": 0.7709409445524216,
"num_tokens": 5053556.0,
"step": 310
},
{
"entropy": 0.5527057945728302,
"epoch": 1.1647940074906367,
"grad_norm": 0.028000809252262115,
"learning_rate": 0.0002,
"loss": 0.5457457900047302,
"mean_token_accuracy": 0.7764673084020615,
"num_tokens": 5069817.0,
"step": 311
},
{
"entropy": 0.5439251810312271,
"epoch": 1.1685393258426966,
"grad_norm": 0.027509242296218872,
"learning_rate": 0.0002,
"loss": 0.5400040149688721,
"mean_token_accuracy": 0.7789120823144913,
"num_tokens": 5086044.0,
"step": 312
},
{
"entropy": 0.561322957277298,
"epoch": 1.1722846441947565,
"grad_norm": 0.030032532289624214,
"learning_rate": 0.0002,
"loss": 0.5588545799255371,
"mean_token_accuracy": 0.7742930203676224,
"num_tokens": 5102685.0,
"step": 313
},
{
"entropy": 0.5458335727453232,
"epoch": 1.1760299625468165,
"grad_norm": 0.029963059350848198,
"learning_rate": 0.0002,
"loss": 0.5477938055992126,
"mean_token_accuracy": 0.777193009853363,
"num_tokens": 5119294.0,
"step": 314
},
{
"entropy": 0.5545150190591812,
"epoch": 1.1797752808988764,
"grad_norm": 0.03310168907046318,
"learning_rate": 0.0002,
"loss": 0.5611361265182495,
"mean_token_accuracy": 0.7725827246904373,
"num_tokens": 5135795.0,
"step": 315
},
{
"entropy": 0.5393262058496475,
"epoch": 1.1835205992509363,
"grad_norm": 0.02876197174191475,
"learning_rate": 0.0002,
"loss": 0.5395398139953613,
"mean_token_accuracy": 0.781178891658783,
"num_tokens": 5151936.0,
"step": 316
},
{
"entropy": 0.5356467962265015,
"epoch": 1.1872659176029963,
"grad_norm": 0.029216231778264046,
"learning_rate": 0.0002,
"loss": 0.5275884866714478,
"mean_token_accuracy": 0.7844340801239014,
"num_tokens": 5168072.0,
"step": 317
},
{
"entropy": 0.5539442598819733,
"epoch": 1.1910112359550562,
"grad_norm": 0.029222887009382248,
"learning_rate": 0.0002,
"loss": 0.5549959540367126,
"mean_token_accuracy": 0.7750978469848633,
"num_tokens": 5184280.0,
"step": 318
},
{
"entropy": 0.5316408574581146,
"epoch": 1.1947565543071161,
"grad_norm": 0.03008115477859974,
"learning_rate": 0.0002,
"loss": 0.536407470703125,
"mean_token_accuracy": 0.7843799740076065,
"num_tokens": 5200364.0,
"step": 319
},
{
"entropy": 0.5335765928030014,
"epoch": 1.198501872659176,
"grad_norm": 0.030437173321843147,
"learning_rate": 0.0002,
"loss": 0.5371608734130859,
"mean_token_accuracy": 0.7834146469831467,
"num_tokens": 5216503.0,
"step": 320
},
{
"entropy": 0.5507327914237976,
"epoch": 1.202247191011236,
"grad_norm": 0.030706282705068588,
"learning_rate": 0.0002,
"loss": 0.5528247356414795,
"mean_token_accuracy": 0.7763889282941818,
"num_tokens": 5232896.0,
"step": 321
},
{
"entropy": 0.5600829422473907,
"epoch": 1.205992509363296,
"grad_norm": 0.03131498023867607,
"learning_rate": 0.0002,
"loss": 0.559609055519104,
"mean_token_accuracy": 0.7688225358724594,
"num_tokens": 5249400.0,
"step": 322
},
{
"entropy": 0.5482848882675171,
"epoch": 1.2097378277153559,
"grad_norm": 0.030239688232541084,
"learning_rate": 0.0002,
"loss": 0.5498725771903992,
"mean_token_accuracy": 0.7751806825399399,
"num_tokens": 5265595.0,
"step": 323
},
{
"entropy": 0.5517048090696335,
"epoch": 1.2134831460674158,
"grad_norm": 0.03668053448200226,
"learning_rate": 0.0002,
"loss": 0.5480911135673523,
"mean_token_accuracy": 0.7757556736469269,
"num_tokens": 5281774.0,
"step": 324
},
{
"entropy": 0.5576729625463486,
"epoch": 1.2172284644194757,
"grad_norm": 0.028534850105643272,
"learning_rate": 0.0002,
"loss": 0.5513843894004822,
"mean_token_accuracy": 0.7748550176620483,
"num_tokens": 5297913.0,
"step": 325
},
{
"entropy": 0.5390013605356216,
"epoch": 1.2209737827715357,
"grad_norm": 0.03146135434508324,
"learning_rate": 0.0002,
"loss": 0.539669930934906,
"mean_token_accuracy": 0.7778647989034653,
"num_tokens": 5314070.0,
"step": 326
},
{
"entropy": 0.5463844388723373,
"epoch": 1.2247191011235956,
"grad_norm": 0.03442573919892311,
"learning_rate": 0.0002,
"loss": 0.5508401393890381,
"mean_token_accuracy": 0.774851381778717,
"num_tokens": 5330361.0,
"step": 327
},
{
"entropy": 0.5308734029531479,
"epoch": 1.2284644194756553,
"grad_norm": 0.03126746043562889,
"learning_rate": 0.0002,
"loss": 0.5370399951934814,
"mean_token_accuracy": 0.7805522531270981,
"num_tokens": 5346367.0,
"step": 328
},
{
"entropy": 0.5443529635667801,
"epoch": 1.2322097378277155,
"grad_norm": 0.028079699724912643,
"learning_rate": 0.0002,
"loss": 0.5469828248023987,
"mean_token_accuracy": 0.7801272124052048,
"num_tokens": 5362795.0,
"step": 329
},
{
"entropy": 0.5508403033018112,
"epoch": 1.2359550561797752,
"grad_norm": 0.03308681398630142,
"learning_rate": 0.0002,
"loss": 0.5537492632865906,
"mean_token_accuracy": 0.776117667555809,
"num_tokens": 5378892.0,
"step": 330
},
{
"entropy": 0.547036200761795,
"epoch": 1.2397003745318351,
"grad_norm": 0.030657080933451653,
"learning_rate": 0.0002,
"loss": 0.5473320484161377,
"mean_token_accuracy": 0.7783585488796234,
"num_tokens": 5395182.0,
"step": 331
},
{
"entropy": 0.5384639650583267,
"epoch": 1.243445692883895,
"grad_norm": 0.03128959983587265,
"learning_rate": 0.0002,
"loss": 0.5418936610221863,
"mean_token_accuracy": 0.7789008319377899,
"num_tokens": 5411728.0,
"step": 332
},
{
"entropy": 0.5433261394500732,
"epoch": 1.247191011235955,
"grad_norm": 0.02972225658595562,
"learning_rate": 0.0002,
"loss": 0.5430710315704346,
"mean_token_accuracy": 0.7793088257312775,
"num_tokens": 5427990.0,
"step": 333
},
{
"entropy": 0.5405146926641464,
"epoch": 1.250936329588015,
"grad_norm": 0.028844943270087242,
"learning_rate": 0.0002,
"loss": 0.538284957408905,
"mean_token_accuracy": 0.7814860939979553,
"num_tokens": 5443961.0,
"step": 334
},
{
"entropy": 0.5582905858755112,
"epoch": 1.2546816479400749,
"grad_norm": 0.0356195829808712,
"learning_rate": 0.0002,
"loss": 0.558274507522583,
"mean_token_accuracy": 0.772399827837944,
"num_tokens": 5460135.0,
"step": 335
},
{
"entropy": 0.5524656623601913,
"epoch": 1.2584269662921348,
"grad_norm": 0.02986624464392662,
"learning_rate": 0.0002,
"loss": 0.5503432750701904,
"mean_token_accuracy": 0.7768993377685547,
"num_tokens": 5476448.0,
"step": 336
},
{
"entropy": 0.553261786699295,
"epoch": 1.2621722846441947,
"grad_norm": 0.03385454788804054,
"learning_rate": 0.0002,
"loss": 0.5513902902603149,
"mean_token_accuracy": 0.7756227403879166,
"num_tokens": 5492657.0,
"step": 337
},
{
"entropy": 0.5534822195768356,
"epoch": 1.2659176029962547,
"grad_norm": 0.03496600687503815,
"learning_rate": 0.0002,
"loss": 0.5570470690727234,
"mean_token_accuracy": 0.7745380252599716,
"num_tokens": 5508936.0,
"step": 338
},
{
"entropy": 0.5206775590777397,
"epoch": 1.2696629213483146,
"grad_norm": 0.038312628865242004,
"learning_rate": 0.0002,
"loss": 0.531387209892273,
"mean_token_accuracy": 0.7818328887224197,
"num_tokens": 5525150.0,
"step": 339
},
{
"entropy": 0.5372405052185059,
"epoch": 1.2734082397003745,
"grad_norm": 0.03226601704955101,
"learning_rate": 0.0002,
"loss": 0.5414312481880188,
"mean_token_accuracy": 0.7806438505649567,
"num_tokens": 5541125.0,
"step": 340
},
{
"entropy": 0.5670074820518494,
"epoch": 1.2771535580524345,
"grad_norm": 0.032290343195199966,
"learning_rate": 0.0002,
"loss": 0.5651661157608032,
"mean_token_accuracy": 0.768811360001564,
"num_tokens": 5557589.0,
"step": 341
},
{
"entropy": 0.5581976920366287,
"epoch": 1.2808988764044944,
"grad_norm": 0.035112183541059494,
"learning_rate": 0.0002,
"loss": 0.5540149211883545,
"mean_token_accuracy": 0.7756919115781784,
"num_tokens": 5574011.0,
"step": 342
},
{
"entropy": 0.5480058342218399,
"epoch": 1.2846441947565543,
"grad_norm": 0.029269572347402573,
"learning_rate": 0.0002,
"loss": 0.5497134923934937,
"mean_token_accuracy": 0.7775010466575623,
"num_tokens": 5590227.0,
"step": 343
},
{
"entropy": 0.5551355630159378,
"epoch": 1.2883895131086143,
"grad_norm": 0.03512820973992348,
"learning_rate": 0.0002,
"loss": 0.5613937377929688,
"mean_token_accuracy": 0.77100470662117,
"num_tokens": 5606436.0,
"step": 344
},
{
"entropy": 0.5681823641061783,
"epoch": 1.2921348314606742,
"grad_norm": 0.028890319168567657,
"learning_rate": 0.0002,
"loss": 0.5653828382492065,
"mean_token_accuracy": 0.7733339965343475,
"num_tokens": 5622955.0,
"step": 345
},
{
"entropy": 0.5512849390506744,
"epoch": 1.2958801498127341,
"grad_norm": 0.03168505057692528,
"learning_rate": 0.0002,
"loss": 0.5475208759307861,
"mean_token_accuracy": 0.778771311044693,
"num_tokens": 5639583.0,
"step": 346
},
{
"entropy": 0.5361000895500183,
"epoch": 1.299625468164794,
"grad_norm": 0.03995742276310921,
"learning_rate": 0.0002,
"loss": 0.5435983538627625,
"mean_token_accuracy": 0.7801041901111603,
"num_tokens": 5655726.0,
"step": 347
},
{
"entropy": 0.5335006862878799,
"epoch": 1.303370786516854,
"grad_norm": 0.03385796397924423,
"learning_rate": 0.0002,
"loss": 0.5360836982727051,
"mean_token_accuracy": 0.7803510278463364,
"num_tokens": 5671935.0,
"step": 348
},
{
"entropy": 0.5649213343858719,
"epoch": 1.3071161048689137,
"grad_norm": 0.03367312625050545,
"learning_rate": 0.0002,
"loss": 0.5654204487800598,
"mean_token_accuracy": 0.7698808759450912,
"num_tokens": 5688484.0,
"step": 349
},
{
"entropy": 0.5636743903160095,
"epoch": 1.3108614232209739,
"grad_norm": 0.028330491855740547,
"learning_rate": 0.0002,
"loss": 0.564975380897522,
"mean_token_accuracy": 0.769644483923912,
"num_tokens": 5704874.0,
"step": 350
},
{
"entropy": 0.5439984649419785,
"epoch": 1.3146067415730336,
"grad_norm": 0.030180098488926888,
"learning_rate": 0.0002,
"loss": 0.540916383266449,
"mean_token_accuracy": 0.7806600630283356,
"num_tokens": 5721250.0,
"step": 351
},
{
"entropy": 0.5403287261724472,
"epoch": 1.3183520599250937,
"grad_norm": 0.03425198793411255,
"learning_rate": 0.0002,
"loss": 0.5408051609992981,
"mean_token_accuracy": 0.7801858931779861,
"num_tokens": 5737303.0,
"step": 352
},
{
"entropy": 0.5534793436527252,
"epoch": 1.3220973782771535,
"grad_norm": 0.029101019725203514,
"learning_rate": 0.0002,
"loss": 0.5576366782188416,
"mean_token_accuracy": 0.773370087146759,
"num_tokens": 5753786.0,
"step": 353
},
{
"entropy": 0.5410192608833313,
"epoch": 1.3258426966292136,
"grad_norm": 0.0356539785861969,
"learning_rate": 0.0002,
"loss": 0.5408055186271667,
"mean_token_accuracy": 0.7814153283834457,
"num_tokens": 5769926.0,
"step": 354
},
{
"entropy": 0.5472375005483627,
"epoch": 1.3295880149812733,
"grad_norm": 0.03288782387971878,
"learning_rate": 0.0002,
"loss": 0.5537273287773132,
"mean_token_accuracy": 0.7744840979576111,
"num_tokens": 5785998.0,
"step": 355
},
{
"entropy": 0.5556980893015862,
"epoch": 1.3333333333333333,
"grad_norm": 0.038231220096349716,
"learning_rate": 0.0002,
"loss": 0.558592677116394,
"mean_token_accuracy": 0.7744520753622055,
"num_tokens": 5802256.0,
"step": 356
},
{
"entropy": 0.5668211281299591,
"epoch": 1.3370786516853932,
"grad_norm": 0.02924768440425396,
"learning_rate": 0.0002,
"loss": 0.5691797733306885,
"mean_token_accuracy": 0.7683669775724411,
"num_tokens": 5818757.0,
"step": 357
},
{
"entropy": 0.549320325255394,
"epoch": 1.3408239700374531,
"grad_norm": 0.03099512681365013,
"learning_rate": 0.0002,
"loss": 0.551908016204834,
"mean_token_accuracy": 0.7755500972270966,
"num_tokens": 5835041.0,
"step": 358
},
{
"entropy": 0.5573329776525497,
"epoch": 1.344569288389513,
"grad_norm": 0.028519438579678535,
"learning_rate": 0.0002,
"loss": 0.5581731796264648,
"mean_token_accuracy": 0.7729284316301346,
"num_tokens": 5851618.0,
"step": 359
},
{
"entropy": 0.5377827435731888,
"epoch": 1.348314606741573,
"grad_norm": 0.03338128328323364,
"learning_rate": 0.0002,
"loss": 0.5362961888313293,
"mean_token_accuracy": 0.7824237793684006,
"num_tokens": 5867600.0,
"step": 360
},
{
"entropy": 0.549625426530838,
"epoch": 1.352059925093633,
"grad_norm": 0.032118137925863266,
"learning_rate": 0.0002,
"loss": 0.5464169979095459,
"mean_token_accuracy": 0.779940128326416,
"num_tokens": 5883550.0,
"step": 361
},
{
"entropy": 0.5563124269247055,
"epoch": 1.3558052434456929,
"grad_norm": 0.028186708688735962,
"learning_rate": 0.0002,
"loss": 0.5525781512260437,
"mean_token_accuracy": 0.7742565721273422,
"num_tokens": 5900020.0,
"step": 362
},
{
"entropy": 0.5396654903888702,
"epoch": 1.3595505617977528,
"grad_norm": 0.03306869789958,
"learning_rate": 0.0002,
"loss": 0.5485842227935791,
"mean_token_accuracy": 0.7763185799121857,
"num_tokens": 5916563.0,
"step": 363
},
{
"entropy": 0.5324016958475113,
"epoch": 1.3632958801498127,
"grad_norm": 0.030485033988952637,
"learning_rate": 0.0002,
"loss": 0.5407555103302002,
"mean_token_accuracy": 0.7805987000465393,
"num_tokens": 5932915.0,
"step": 364
},
{
"entropy": 0.5415676534175873,
"epoch": 1.3670411985018727,
"grad_norm": 0.032210033386945724,
"learning_rate": 0.0002,
"loss": 0.5420053601264954,
"mean_token_accuracy": 0.7789227366447449,
"num_tokens": 5949294.0,
"step": 365
},
{
"entropy": 0.5479710251092911,
"epoch": 1.3707865168539326,
"grad_norm": 0.030770668759942055,
"learning_rate": 0.0002,
"loss": 0.5442653894424438,
"mean_token_accuracy": 0.7809406220912933,
"num_tokens": 5965688.0,
"step": 366
},
{
"entropy": 0.5611272603273392,
"epoch": 1.3745318352059925,
"grad_norm": 0.030032480135560036,
"learning_rate": 0.0002,
"loss": 0.5458992719650269,
"mean_token_accuracy": 0.7793887704610825,
"num_tokens": 5982353.0,
"step": 367
},
{
"entropy": 0.5711783468723297,
"epoch": 1.3782771535580525,
"grad_norm": 0.030471278354525566,
"learning_rate": 0.0002,
"loss": 0.5689231157302856,
"mean_token_accuracy": 0.7691554129123688,
"num_tokens": 5998928.0,
"step": 368
},
{
"entropy": 0.5704734623432159,
"epoch": 1.3820224719101124,
"grad_norm": 0.0308744665235281,
"learning_rate": 0.0002,
"loss": 0.5704200267791748,
"mean_token_accuracy": 0.7696904093027115,
"num_tokens": 6015488.0,
"step": 369
},
{
"entropy": 0.540970042347908,
"epoch": 1.3857677902621723,
"grad_norm": 0.029789667576551437,
"learning_rate": 0.0002,
"loss": 0.5435522794723511,
"mean_token_accuracy": 0.7803212404251099,
"num_tokens": 6032273.0,
"step": 370
},
{
"entropy": 0.5323564112186432,
"epoch": 1.3895131086142323,
"grad_norm": 0.03373701870441437,
"learning_rate": 0.0002,
"loss": 0.5415207147598267,
"mean_token_accuracy": 0.7777475565671921,
"num_tokens": 6048761.0,
"step": 371
},
{
"entropy": 0.5275064408779144,
"epoch": 1.3932584269662922,
"grad_norm": 0.03547370806336403,
"learning_rate": 0.0002,
"loss": 0.540917694568634,
"mean_token_accuracy": 0.7795429080724716,
"num_tokens": 6064848.0,
"step": 372
},
{
"entropy": 0.5497806072235107,
"epoch": 1.3970037453183521,
"grad_norm": 0.03201119974255562,
"learning_rate": 0.0002,
"loss": 0.552889347076416,
"mean_token_accuracy": 0.7745427489280701,
"num_tokens": 6081258.0,
"step": 373
},
{
"entropy": 0.5175323188304901,
"epoch": 1.4007490636704119,
"grad_norm": 0.03368834778666496,
"learning_rate": 0.0002,
"loss": 0.5198505520820618,
"mean_token_accuracy": 0.7878732234239578,
"num_tokens": 6097172.0,
"step": 374
},
{
"entropy": 0.5441398918628693,
"epoch": 1.404494382022472,
"grad_norm": 0.03139437735080719,
"learning_rate": 0.0002,
"loss": 0.5445310473442078,
"mean_token_accuracy": 0.780688688158989,
"num_tokens": 6113446.0,
"step": 375
},
{
"entropy": 0.5468717068433762,
"epoch": 1.4082397003745317,
"grad_norm": 0.03169120475649834,
"learning_rate": 0.0002,
"loss": 0.5426516532897949,
"mean_token_accuracy": 0.776495024561882,
"num_tokens": 6129738.0,
"step": 376
},
{
"entropy": 0.5554005056619644,
"epoch": 1.4119850187265919,
"grad_norm": 0.03649836778640747,
"learning_rate": 0.0002,
"loss": 0.5584489703178406,
"mean_token_accuracy": 0.7743981927633286,
"num_tokens": 6146138.0,
"step": 377
},
{
"entropy": 0.545359656214714,
"epoch": 1.4157303370786516,
"grad_norm": 0.0333530455827713,
"learning_rate": 0.0002,
"loss": 0.547561526298523,
"mean_token_accuracy": 0.7772817760705948,
"num_tokens": 6162466.0,
"step": 378
},
{
"entropy": 0.5366268008947372,
"epoch": 1.4194756554307117,
"grad_norm": 0.0315176397562027,
"learning_rate": 0.0002,
"loss": 0.5370338559150696,
"mean_token_accuracy": 0.7830789685249329,
"num_tokens": 6178827.0,
"step": 379
},
{
"entropy": 0.5343760550022125,
"epoch": 1.4232209737827715,
"grad_norm": 0.03283468633890152,
"learning_rate": 0.0002,
"loss": 0.5403618812561035,
"mean_token_accuracy": 0.7811573594808578,
"num_tokens": 6195014.0,
"step": 380
},
{
"entropy": 0.5374447852373123,
"epoch": 1.4269662921348314,
"grad_norm": 0.03712209314107895,
"learning_rate": 0.0002,
"loss": 0.5359081625938416,
"mean_token_accuracy": 0.7824594676494598,
"num_tokens": 6211204.0,
"step": 381
},
{
"entropy": 0.5647163391113281,
"epoch": 1.4307116104868913,
"grad_norm": 0.030612658709287643,
"learning_rate": 0.0002,
"loss": 0.5665347576141357,
"mean_token_accuracy": 0.7709782868623734,
"num_tokens": 6227439.0,
"step": 382
},
{
"entropy": 0.5584586560726166,
"epoch": 1.4344569288389513,
"grad_norm": 0.03545604646205902,
"learning_rate": 0.0002,
"loss": 0.5592620372772217,
"mean_token_accuracy": 0.7708311080932617,
"num_tokens": 6243909.0,
"step": 383
},
{
"entropy": 0.5563389509916306,
"epoch": 1.4382022471910112,
"grad_norm": 0.031707633286714554,
"learning_rate": 0.0002,
"loss": 0.5574153065681458,
"mean_token_accuracy": 0.7749636173248291,
"num_tokens": 6260228.0,
"step": 384
},
{
"entropy": 0.5361679270863533,
"epoch": 1.4419475655430711,
"grad_norm": 0.030576881021261215,
"learning_rate": 0.0002,
"loss": 0.5358593463897705,
"mean_token_accuracy": 0.7815472632646561,
"num_tokens": 6276438.0,
"step": 385
},
{
"entropy": 0.5404613763093948,
"epoch": 1.445692883895131,
"grad_norm": 0.0397074818611145,
"learning_rate": 0.0002,
"loss": 0.5409061908721924,
"mean_token_accuracy": 0.7812814116477966,
"num_tokens": 6292854.0,
"step": 386
},
{
"entropy": 0.5539507865905762,
"epoch": 1.449438202247191,
"grad_norm": 0.027634674683213234,
"learning_rate": 0.0002,
"loss": 0.551899254322052,
"mean_token_accuracy": 0.7763891369104385,
"num_tokens": 6309146.0,
"step": 387
},
{
"entropy": 0.5406185388565063,
"epoch": 1.453183520599251,
"grad_norm": 0.03658418357372284,
"learning_rate": 0.0002,
"loss": 0.5376873016357422,
"mean_token_accuracy": 0.7802905589342117,
"num_tokens": 6325371.0,
"step": 388
},
{
"entropy": 0.5515788942575455,
"epoch": 1.4569288389513109,
"grad_norm": 0.029648393392562866,
"learning_rate": 0.0002,
"loss": 0.5481655597686768,
"mean_token_accuracy": 0.7753021568059921,
"num_tokens": 6341504.0,
"step": 389
},
{
"entropy": 0.5403069257736206,
"epoch": 1.4606741573033708,
"grad_norm": 0.0300885122269392,
"learning_rate": 0.0002,
"loss": 0.5417286157608032,
"mean_token_accuracy": 0.7805690169334412,
"num_tokens": 6357574.0,
"step": 390
},
{
"entropy": 0.5320965051651001,
"epoch": 1.4644194756554307,
"grad_norm": 0.04233168438076973,
"learning_rate": 0.0002,
"loss": 0.542140543460846,
"mean_token_accuracy": 0.7790813148021698,
"num_tokens": 6373603.0,
"step": 391
},
{
"entropy": 0.5370313972234726,
"epoch": 1.4681647940074907,
"grad_norm": 0.03608033061027527,
"learning_rate": 0.0002,
"loss": 0.5452749133110046,
"mean_token_accuracy": 0.7784496247768402,
"num_tokens": 6389874.0,
"step": 392
},
{
"entropy": 0.5391117632389069,
"epoch": 1.4719101123595506,
"grad_norm": 0.044416990131139755,
"learning_rate": 0.0002,
"loss": 0.5447070598602295,
"mean_token_accuracy": 0.7758590877056122,
"num_tokens": 6406014.0,
"step": 393
},
{
"entropy": 0.5536396950483322,
"epoch": 1.4756554307116105,
"grad_norm": 0.028598185628652573,
"learning_rate": 0.0002,
"loss": 0.5509454011917114,
"mean_token_accuracy": 0.7754955738782883,
"num_tokens": 6422526.0,
"step": 394
},
{
"entropy": 0.5600528717041016,
"epoch": 1.4794007490636705,
"grad_norm": 0.03587036579847336,
"learning_rate": 0.0002,
"loss": 0.5511722564697266,
"mean_token_accuracy": 0.7756818234920502,
"num_tokens": 6438826.0,
"step": 395
},
{
"entropy": 0.5635561943054199,
"epoch": 1.4831460674157304,
"grad_norm": 0.04037458822131157,
"learning_rate": 0.0002,
"loss": 0.5569745898246765,
"mean_token_accuracy": 0.7768395692110062,
"num_tokens": 6455392.0,
"step": 396
},
{
"entropy": 0.5546122640371323,
"epoch": 1.4868913857677903,
"grad_norm": 0.03193597123026848,
"learning_rate": 0.0002,
"loss": 0.5528469085693359,
"mean_token_accuracy": 0.7737569063901901,
"num_tokens": 6471908.0,
"step": 397
},
{
"entropy": 0.540926069021225,
"epoch": 1.4906367041198503,
"grad_norm": 0.03908224403858185,
"learning_rate": 0.0002,
"loss": 0.5521141290664673,
"mean_token_accuracy": 0.7775756865739822,
"num_tokens": 6487958.0,
"step": 398
},
{
"entropy": 0.5474519431591034,
"epoch": 1.49438202247191,
"grad_norm": 0.04104601964354515,
"learning_rate": 0.0002,
"loss": 0.5533535480499268,
"mean_token_accuracy": 0.7748162597417831,
"num_tokens": 6504634.0,
"step": 399
},
{
"entropy": 0.5560764372348785,
"epoch": 1.4981273408239701,
"grad_norm": 0.0360972136259079,
"learning_rate": 0.0002,
"loss": 0.5614410042762756,
"mean_token_accuracy": 0.770107239484787,
"num_tokens": 6521072.0,
"step": 400
},
{
"entropy": 0.5673471540212631,
"epoch": 1.5018726591760299,
"grad_norm": 0.04004177823662758,
"learning_rate": 0.0002,
"loss": 0.5589927434921265,
"mean_token_accuracy": 0.7734557241201401,
"num_tokens": 6537361.0,
"step": 401
},
{
"entropy": 0.5486087501049042,
"epoch": 1.50561797752809,
"grad_norm": 0.030557790771126747,
"learning_rate": 0.0002,
"loss": 0.5393815040588379,
"mean_token_accuracy": 0.7784638553857803,
"num_tokens": 6553620.0,
"step": 402
},
{
"entropy": 0.5486248284578323,
"epoch": 1.5093632958801497,
"grad_norm": 0.03941396623849869,
"learning_rate": 0.0002,
"loss": 0.5509032011032104,
"mean_token_accuracy": 0.7800426781177521,
"num_tokens": 6569936.0,
"step": 403
},
{
"entropy": 0.558304026722908,
"epoch": 1.5131086142322099,
"grad_norm": 0.03858976438641548,
"learning_rate": 0.0002,
"loss": 0.566615104675293,
"mean_token_accuracy": 0.7677357494831085,
"num_tokens": 6586223.0,
"step": 404
},
{
"entropy": 0.5375211238861084,
"epoch": 1.5168539325842696,
"grad_norm": 0.0333857461810112,
"learning_rate": 0.0002,
"loss": 0.546052873134613,
"mean_token_accuracy": 0.779136061668396,
"num_tokens": 6602626.0,
"step": 405
},
{
"entropy": 0.545025646686554,
"epoch": 1.5205992509363297,
"grad_norm": 0.03882851079106331,
"learning_rate": 0.0002,
"loss": 0.5526992678642273,
"mean_token_accuracy": 0.7757603526115417,
"num_tokens": 6618970.0,
"step": 406
},
{
"entropy": 0.5616021603345871,
"epoch": 1.5243445692883895,
"grad_norm": 0.029704444110393524,
"learning_rate": 0.0002,
"loss": 0.5617290139198303,
"mean_token_accuracy": 0.771888017654419,
"num_tokens": 6635712.0,
"step": 407
},
{
"entropy": 0.5517143756151199,
"epoch": 1.5280898876404494,
"grad_norm": 0.029841486364603043,
"learning_rate": 0.0002,
"loss": 0.5455192923545837,
"mean_token_accuracy": 0.7790273427963257,
"num_tokens": 6652005.0,
"step": 408
},
{
"entropy": 0.5481491684913635,
"epoch": 1.5318352059925093,
"grad_norm": 0.03239016607403755,
"learning_rate": 0.0002,
"loss": 0.5448024272918701,
"mean_token_accuracy": 0.7801620662212372,
"num_tokens": 6668365.0,
"step": 409
},
{
"entropy": 0.5385047048330307,
"epoch": 1.5355805243445693,
"grad_norm": 0.029611637815833092,
"learning_rate": 0.0002,
"loss": 0.5335633754730225,
"mean_token_accuracy": 0.785701259970665,
"num_tokens": 6684708.0,
"step": 410
},
{
"entropy": 0.558298259973526,
"epoch": 1.5393258426966292,
"grad_norm": 0.030493013560771942,
"learning_rate": 0.0002,
"loss": 0.5560066103935242,
"mean_token_accuracy": 0.7725876718759537,
"num_tokens": 6701142.0,
"step": 411
},
{
"entropy": 0.5395427197217941,
"epoch": 1.5430711610486891,
"grad_norm": 0.032578032463788986,
"learning_rate": 0.0002,
"loss": 0.5449746251106262,
"mean_token_accuracy": 0.7762585133314133,
"num_tokens": 6717233.0,
"step": 412
},
{
"entropy": 0.5387013256549835,
"epoch": 1.546816479400749,
"grad_norm": 0.0333687961101532,
"learning_rate": 0.0002,
"loss": 0.5403171181678772,
"mean_token_accuracy": 0.7810612767934799,
"num_tokens": 6733228.0,
"step": 413
},
{
"entropy": 0.5673456788063049,
"epoch": 1.550561797752809,
"grad_norm": 0.036015916615724564,
"learning_rate": 0.0002,
"loss": 0.5735532641410828,
"mean_token_accuracy": 0.7664827108383179,
"num_tokens": 6749423.0,
"step": 414
},
{
"entropy": 0.5494605153799057,
"epoch": 1.554307116104869,
"grad_norm": 0.02719104290008545,
"learning_rate": 0.0002,
"loss": 0.5493685007095337,
"mean_token_accuracy": 0.776999905705452,
"num_tokens": 6765893.0,
"step": 415
},
{
"entropy": 0.5593840181827545,
"epoch": 1.5580524344569289,
"grad_norm": 0.03425523266196251,
"learning_rate": 0.0002,
"loss": 0.5553128719329834,
"mean_token_accuracy": 0.7735365033149719,
"num_tokens": 6782271.0,
"step": 416
},
{
"entropy": 0.5617495179176331,
"epoch": 1.5617977528089888,
"grad_norm": 0.032372213900089264,
"learning_rate": 0.0002,
"loss": 0.5606021881103516,
"mean_token_accuracy": 0.7721095532178879,
"num_tokens": 6798813.0,
"step": 417
},
{
"entropy": 0.5550025552511215,
"epoch": 1.5655430711610487,
"grad_norm": 0.029182737693190575,
"learning_rate": 0.0002,
"loss": 0.5564966201782227,
"mean_token_accuracy": 0.7731625586748123,
"num_tokens": 6815405.0,
"step": 418
},
{
"entropy": 0.5605382174253464,
"epoch": 1.5692883895131087,
"grad_norm": 0.030886612832546234,
"learning_rate": 0.0002,
"loss": 0.5631057024002075,
"mean_token_accuracy": 0.7716924250125885,
"num_tokens": 6831974.0,
"step": 419
},
{
"entropy": 0.5414248704910278,
"epoch": 1.5730337078651684,
"grad_norm": 0.03267752379179001,
"learning_rate": 0.0002,
"loss": 0.5522453188896179,
"mean_token_accuracy": 0.7731709033250809,
"num_tokens": 6848314.0,
"step": 420
},
{
"entropy": 0.5514931678771973,
"epoch": 1.5767790262172285,
"grad_norm": 0.03168710321187973,
"learning_rate": 0.0002,
"loss": 0.5525091886520386,
"mean_token_accuracy": 0.7754202336072922,
"num_tokens": 6864671.0,
"step": 421
},
{
"entropy": 0.5639499425888062,
"epoch": 1.5805243445692883,
"grad_norm": 0.032651759684085846,
"learning_rate": 0.0002,
"loss": 0.5697652697563171,
"mean_token_accuracy": 0.7682019621133804,
"num_tokens": 6881061.0,
"step": 422
},
{
"entropy": 0.5544054210186005,
"epoch": 1.5842696629213484,
"grad_norm": 0.03449453413486481,
"learning_rate": 0.0002,
"loss": 0.5507102012634277,
"mean_token_accuracy": 0.775859922170639,
"num_tokens": 6897314.0,
"step": 423
},
{
"entropy": 0.5711345225572586,
"epoch": 1.5880149812734081,
"grad_norm": 0.03847847133874893,
"learning_rate": 0.0002,
"loss": 0.5732009410858154,
"mean_token_accuracy": 0.7667471021413803,
"num_tokens": 6913609.0,
"step": 424
},
{
"entropy": 0.5389959663152695,
"epoch": 1.5917602996254683,
"grad_norm": 0.03514353558421135,
"learning_rate": 0.0002,
"loss": 0.5444454550743103,
"mean_token_accuracy": 0.7799976915121078,
"num_tokens": 6929936.0,
"step": 425
},
{
"entropy": 0.5668403804302216,
"epoch": 1.595505617977528,
"grad_norm": 0.035787779837846756,
"learning_rate": 0.0002,
"loss": 0.5658587217330933,
"mean_token_accuracy": 0.7714453637599945,
"num_tokens": 6946824.0,
"step": 426
},
{
"entropy": 0.5508380085229874,
"epoch": 1.5992509363295881,
"grad_norm": 0.03445902094244957,
"learning_rate": 0.0002,
"loss": 0.5547541975975037,
"mean_token_accuracy": 0.7770363837480545,
"num_tokens": 6962968.0,
"step": 427
},
{
"entropy": 0.5622916221618652,
"epoch": 1.6029962546816479,
"grad_norm": 0.033641569316387177,
"learning_rate": 0.0002,
"loss": 0.5611415505409241,
"mean_token_accuracy": 0.7717165648937225,
"num_tokens": 6979281.0,
"step": 428
},
{
"entropy": 0.5456431210041046,
"epoch": 1.606741573033708,
"grad_norm": 0.030943863093852997,
"learning_rate": 0.0002,
"loss": 0.5433369278907776,
"mean_token_accuracy": 0.77703957259655,
"num_tokens": 6995448.0,
"step": 429
},
{
"entropy": 0.5349363088607788,
"epoch": 1.6104868913857677,
"grad_norm": 0.029584866017103195,
"learning_rate": 0.0002,
"loss": 0.528792142868042,
"mean_token_accuracy": 0.7852742522954941,
"num_tokens": 7011578.0,
"step": 430
},
{
"entropy": 0.52534219622612,
"epoch": 1.6142322097378277,
"grad_norm": 0.031122464686632156,
"learning_rate": 0.0002,
"loss": 0.5248501300811768,
"mean_token_accuracy": 0.7855943292379379,
"num_tokens": 7027819.0,
"step": 431
},
{
"entropy": 0.5471996814012527,
"epoch": 1.6179775280898876,
"grad_norm": 0.03317458927631378,
"learning_rate": 0.0002,
"loss": 0.5547217726707458,
"mean_token_accuracy": 0.776124969124794,
"num_tokens": 7044215.0,
"step": 432
},
{
"entropy": 0.5501783192157745,
"epoch": 1.6217228464419475,
"grad_norm": 0.028514394536614418,
"learning_rate": 0.0002,
"loss": 0.5524763464927673,
"mean_token_accuracy": 0.773967519402504,
"num_tokens": 7060557.0,
"step": 433
},
{
"entropy": 0.5516121089458466,
"epoch": 1.6254681647940075,
"grad_norm": 0.037680886685848236,
"learning_rate": 0.0002,
"loss": 0.5547643899917603,
"mean_token_accuracy": 0.7772052437067032,
"num_tokens": 7076827.0,
"step": 434
},
{
"entropy": 0.5446216315031052,
"epoch": 1.6292134831460674,
"grad_norm": 0.025961318984627724,
"learning_rate": 0.0002,
"loss": 0.540472149848938,
"mean_token_accuracy": 0.7827950567007065,
"num_tokens": 7093240.0,
"step": 435
},
{
"entropy": 0.5542737692594528,
"epoch": 1.6329588014981273,
"grad_norm": 0.03385328873991966,
"learning_rate": 0.0002,
"loss": 0.5622321963310242,
"mean_token_accuracy": 0.7715137451887131,
"num_tokens": 7109763.0,
"step": 436
},
{
"entropy": 0.5479970276355743,
"epoch": 1.6367041198501873,
"grad_norm": 0.027666250243782997,
"learning_rate": 0.0002,
"loss": 0.5450934767723083,
"mean_token_accuracy": 0.7789344042539597,
"num_tokens": 7125965.0,
"step": 437
},
{
"entropy": 0.5606249123811722,
"epoch": 1.6404494382022472,
"grad_norm": 0.028965814039111137,
"learning_rate": 0.0002,
"loss": 0.5618120431900024,
"mean_token_accuracy": 0.7737310230731964,
"num_tokens": 7142275.0,
"step": 438
},
{
"entropy": 0.5434140264987946,
"epoch": 1.6441947565543071,
"grad_norm": 0.03233455866575241,
"learning_rate": 0.0002,
"loss": 0.5448483824729919,
"mean_token_accuracy": 0.7776681929826736,
"num_tokens": 7158681.0,
"step": 439
},
{
"entropy": 0.5462686270475388,
"epoch": 1.647940074906367,
"grad_norm": 0.030159825459122658,
"learning_rate": 0.0002,
"loss": 0.5512958765029907,
"mean_token_accuracy": 0.7788191735744476,
"num_tokens": 7174999.0,
"step": 440
},
{
"entropy": 0.5655659884214401,
"epoch": 1.651685393258427,
"grad_norm": 0.0356375053524971,
"learning_rate": 0.0002,
"loss": 0.5668036937713623,
"mean_token_accuracy": 0.7672240734100342,
"num_tokens": 7191451.0,
"step": 441
},
{
"entropy": 0.5439184606075287,
"epoch": 1.655430711610487,
"grad_norm": 0.03394126892089844,
"learning_rate": 0.0002,
"loss": 0.5443013906478882,
"mean_token_accuracy": 0.7794349491596222,
"num_tokens": 7207657.0,
"step": 442
},
{
"entropy": 0.5462498217821121,
"epoch": 1.6591760299625467,
"grad_norm": 0.03115757368505001,
"learning_rate": 0.0002,
"loss": 0.5484351515769958,
"mean_token_accuracy": 0.7759426087141037,
"num_tokens": 7223926.0,
"step": 443
},
{
"entropy": 0.5479519367218018,
"epoch": 1.6629213483146068,
"grad_norm": 0.03686544671654701,
"learning_rate": 0.0002,
"loss": 0.5487886071205139,
"mean_token_accuracy": 0.7793583422899246,
"num_tokens": 7239926.0,
"step": 444
},
{
"entropy": 0.5571880787611008,
"epoch": 1.6666666666666665,
"grad_norm": 0.029902130365371704,
"learning_rate": 0.0002,
"loss": 0.5566808581352234,
"mean_token_accuracy": 0.7738562673330307,
"num_tokens": 7256365.0,
"step": 445
},
{
"entropy": 0.5606496781110764,
"epoch": 1.6704119850187267,
"grad_norm": 0.03581070154905319,
"learning_rate": 0.0002,
"loss": 0.5646023750305176,
"mean_token_accuracy": 0.7700021713972092,
"num_tokens": 7272415.0,
"step": 446
},
{
"entropy": 0.5493645370006561,
"epoch": 1.6741573033707864,
"grad_norm": 0.034732386469841,
"learning_rate": 0.0002,
"loss": 0.5556433796882629,
"mean_token_accuracy": 0.7724722027778625,
"num_tokens": 7288442.0,
"step": 447
},
{
"entropy": 0.5454504191875458,
"epoch": 1.6779026217228465,
"grad_norm": 0.031994741410017014,
"learning_rate": 0.0002,
"loss": 0.5455131530761719,
"mean_token_accuracy": 0.7786727547645569,
"num_tokens": 7304778.0,
"step": 448
},
{
"entropy": 0.5480805784463882,
"epoch": 1.6816479400749063,
"grad_norm": 0.029919426888227463,
"learning_rate": 0.0002,
"loss": 0.5464503765106201,
"mean_token_accuracy": 0.7800304591655731,
"num_tokens": 7320989.0,
"step": 449
},
{
"entropy": 0.5258940905332565,
"epoch": 1.6853932584269664,
"grad_norm": 0.032200053334236145,
"learning_rate": 0.0002,
"loss": 0.5228010416030884,
"mean_token_accuracy": 0.7870291918516159,
"num_tokens": 7337145.0,
"step": 450
},
{
"entropy": 0.545346587896347,
"epoch": 1.6891385767790261,
"grad_norm": 0.037810057401657104,
"learning_rate": 0.0002,
"loss": 0.5497158765792847,
"mean_token_accuracy": 0.7733957171440125,
"num_tokens": 7353380.0,
"step": 451
},
{
"entropy": 0.5455152243375778,
"epoch": 1.6928838951310863,
"grad_norm": 0.036783650517463684,
"learning_rate": 0.0002,
"loss": 0.547383725643158,
"mean_token_accuracy": 0.7792070508003235,
"num_tokens": 7369718.0,
"step": 452
},
{
"entropy": 0.5610679686069489,
"epoch": 1.696629213483146,
"grad_norm": 0.032883401960134506,
"learning_rate": 0.0002,
"loss": 0.5691272616386414,
"mean_token_accuracy": 0.7677329927682877,
"num_tokens": 7385896.0,
"step": 453
},
{
"entropy": 0.5505604892969131,
"epoch": 1.7003745318352061,
"grad_norm": 0.03284638375043869,
"learning_rate": 0.0002,
"loss": 0.5511571168899536,
"mean_token_accuracy": 0.7760978639125824,
"num_tokens": 7402228.0,
"step": 454
},
{
"entropy": 0.5650221109390259,
"epoch": 1.7041198501872659,
"grad_norm": 0.02887006103992462,
"learning_rate": 0.0002,
"loss": 0.5633357763290405,
"mean_token_accuracy": 0.7709190398454666,
"num_tokens": 7418506.0,
"step": 455
},
{
"entropy": 0.5511359125375748,
"epoch": 1.7078651685393258,
"grad_norm": 0.02897547371685505,
"learning_rate": 0.0002,
"loss": 0.5476655960083008,
"mean_token_accuracy": 0.7766725867986679,
"num_tokens": 7434993.0,
"step": 456
},
{
"entropy": 0.5589297413825989,
"epoch": 1.7116104868913857,
"grad_norm": 0.03913537412881851,
"learning_rate": 0.0002,
"loss": 0.562713623046875,
"mean_token_accuracy": 0.7716452181339264,
"num_tokens": 7451420.0,
"step": 457
},
{
"entropy": 0.5587479770183563,
"epoch": 1.7153558052434457,
"grad_norm": 0.0281817764043808,
"learning_rate": 0.0002,
"loss": 0.5552535057067871,
"mean_token_accuracy": 0.7717525810003281,
"num_tokens": 7467745.0,
"step": 458
},
{
"entropy": 0.5426507443189621,
"epoch": 1.7191011235955056,
"grad_norm": 0.03837720304727554,
"learning_rate": 0.0002,
"loss": 0.5466030836105347,
"mean_token_accuracy": 0.7787178158760071,
"num_tokens": 7484044.0,
"step": 459
},
{
"entropy": 0.548772931098938,
"epoch": 1.7228464419475655,
"grad_norm": 0.034067291766405106,
"learning_rate": 0.0002,
"loss": 0.5531357526779175,
"mean_token_accuracy": 0.7748309075832367,
"num_tokens": 7500332.0,
"step": 460
},
{
"entropy": 0.5564078390598297,
"epoch": 1.7265917602996255,
"grad_norm": 0.03204013407230377,
"learning_rate": 0.0002,
"loss": 0.5560243725776672,
"mean_token_accuracy": 0.7740551978349686,
"num_tokens": 7516660.0,
"step": 461
},
{
"entropy": 0.5405488759279251,
"epoch": 1.7303370786516854,
"grad_norm": 0.030630316585302353,
"learning_rate": 0.0002,
"loss": 0.5395958423614502,
"mean_token_accuracy": 0.7782745659351349,
"num_tokens": 7532934.0,
"step": 462
},
{
"entropy": 0.5496814846992493,
"epoch": 1.7340823970037453,
"grad_norm": 0.03725660592317581,
"learning_rate": 0.0002,
"loss": 0.5496969223022461,
"mean_token_accuracy": 0.7755606472492218,
"num_tokens": 7549291.0,
"step": 463
},
{
"entropy": 0.5522442013025284,
"epoch": 1.7378277153558053,
"grad_norm": 0.039360832422971725,
"learning_rate": 0.0002,
"loss": 0.5475296378135681,
"mean_token_accuracy": 0.7740370631217957,
"num_tokens": 7565370.0,
"step": 464
},
{
"entropy": 0.5205198004841805,
"epoch": 1.7415730337078652,
"grad_norm": 0.029320131987333298,
"learning_rate": 0.0002,
"loss": 0.5181597471237183,
"mean_token_accuracy": 0.789748415350914,
"num_tokens": 7581731.0,
"step": 465
},
{
"entropy": 0.5322981476783752,
"epoch": 1.7453183520599251,
"grad_norm": 0.03633226826786995,
"learning_rate": 0.0002,
"loss": 0.5413781404495239,
"mean_token_accuracy": 0.7808037847280502,
"num_tokens": 7597822.0,
"step": 466
},
{
"entropy": 0.524602085351944,
"epoch": 1.749063670411985,
"grad_norm": 0.04402731731534004,
"learning_rate": 0.0002,
"loss": 0.532406210899353,
"mean_token_accuracy": 0.7855067849159241,
"num_tokens": 7613933.0,
"step": 467
},
{
"entropy": 0.5708600282669067,
"epoch": 1.7528089887640448,
"grad_norm": 0.0357418954372406,
"learning_rate": 0.0002,
"loss": 0.5712512731552124,
"mean_token_accuracy": 0.7683784365653992,
"num_tokens": 7630331.0,
"step": 468
},
{
"entropy": 0.5579233318567276,
"epoch": 1.756554307116105,
"grad_norm": 0.15994992852210999,
"learning_rate": 0.0002,
"loss": 0.5615707635879517,
"mean_token_accuracy": 0.7749305069446564,
"num_tokens": 7646666.0,
"step": 469
},
{
"entropy": 0.5672501176595688,
"epoch": 1.7602996254681647,
"grad_norm": 0.18223144114017487,
"learning_rate": 0.0002,
"loss": 0.5922040939331055,
"mean_token_accuracy": 0.767003208398819,
"num_tokens": 7663024.0,
"step": 470
},
{
"entropy": 0.5853898674249649,
"epoch": 1.7640449438202248,
"grad_norm": 0.19322983920574188,
"learning_rate": 0.0002,
"loss": 0.5716003179550171,
"mean_token_accuracy": 0.7706755697727203,
"num_tokens": 7679445.0,
"step": 471
},
{
"entropy": 0.5652599781751633,
"epoch": 1.7677902621722845,
"grad_norm": 0.040028076618909836,
"learning_rate": 0.0002,
"loss": 0.5545145869255066,
"mean_token_accuracy": 0.7762533873319626,
"num_tokens": 7695863.0,
"step": 472
},
{
"entropy": 0.5655337423086166,
"epoch": 1.7715355805243447,
"grad_norm": 0.03808818385004997,
"learning_rate": 0.0002,
"loss": 0.5697377324104309,
"mean_token_accuracy": 0.7698807120323181,
"num_tokens": 7712117.0,
"step": 473
},
{
"entropy": 0.531586229801178,
"epoch": 1.7752808988764044,
"grad_norm": 0.03700399026274681,
"learning_rate": 0.0002,
"loss": 0.5407450199127197,
"mean_token_accuracy": 0.7823738306760788,
"num_tokens": 7728324.0,
"step": 474
},
{
"entropy": 0.5400687605142593,
"epoch": 1.7790262172284645,
"grad_norm": 0.04493065923452377,
"learning_rate": 0.0002,
"loss": 0.5463284254074097,
"mean_token_accuracy": 0.778341680765152,
"num_tokens": 7744642.0,
"step": 475
},
{
"entropy": 0.5348718762397766,
"epoch": 1.7827715355805243,
"grad_norm": 0.032796818763017654,
"learning_rate": 0.0002,
"loss": 0.53885817527771,
"mean_token_accuracy": 0.7798904478549957,
"num_tokens": 7761144.0,
"step": 476
},
{
"entropy": 0.5612788051366806,
"epoch": 1.7865168539325844,
"grad_norm": 0.03454861417412758,
"learning_rate": 0.0002,
"loss": 0.5585771799087524,
"mean_token_accuracy": 0.7730214893817902,
"num_tokens": 7777603.0,
"step": 477
},
{
"entropy": 0.5655092746019363,
"epoch": 1.7902621722846441,
"grad_norm": 0.04326882213354111,
"learning_rate": 0.0002,
"loss": 0.5594231486320496,
"mean_token_accuracy": 0.7714511156082153,
"num_tokens": 7794017.0,
"step": 478
},
{
"entropy": 0.5740013867616653,
"epoch": 1.7940074906367043,
"grad_norm": 0.03586514666676521,
"learning_rate": 0.0002,
"loss": 0.5665684342384338,
"mean_token_accuracy": 0.7693835347890854,
"num_tokens": 7810410.0,
"step": 479
},
{
"entropy": 0.5689022541046143,
"epoch": 1.797752808988764,
"grad_norm": 0.03453454375267029,
"learning_rate": 0.0002,
"loss": 0.5640177130699158,
"mean_token_accuracy": 0.7688567489385605,
"num_tokens": 7826878.0,
"step": 480
},
{
"entropy": 0.5344455689191818,
"epoch": 1.801498127340824,
"grad_norm": 0.04154738038778305,
"learning_rate": 0.0002,
"loss": 0.5412873029708862,
"mean_token_accuracy": 0.7843961417675018,
"num_tokens": 7842957.0,
"step": 481
},
{
"entropy": 0.5326808393001556,
"epoch": 1.8052434456928839,
"grad_norm": 0.03772249072790146,
"learning_rate": 0.0002,
"loss": 0.5458777546882629,
"mean_token_accuracy": 0.7775137424468994,
"num_tokens": 7859243.0,
"step": 482
},
{
"entropy": 0.552602618932724,
"epoch": 1.8089887640449438,
"grad_norm": 0.03419940546154976,
"learning_rate": 0.0002,
"loss": 0.5563470721244812,
"mean_token_accuracy": 0.7756804972887039,
"num_tokens": 7875641.0,
"step": 483
},
{
"entropy": 0.5412130802869797,
"epoch": 1.8127340823970037,
"grad_norm": 0.033059973269701004,
"learning_rate": 0.0002,
"loss": 0.540538489818573,
"mean_token_accuracy": 0.782319188117981,
"num_tokens": 7891954.0,
"step": 484
},
{
"entropy": 0.5559896975755692,
"epoch": 1.8164794007490637,
"grad_norm": 0.03472665324807167,
"learning_rate": 0.0002,
"loss": 0.5544817447662354,
"mean_token_accuracy": 0.7753840684890747,
"num_tokens": 7908283.0,
"step": 485
},
{
"entropy": 0.5695093274116516,
"epoch": 1.8202247191011236,
"grad_norm": 0.0319642499089241,
"learning_rate": 0.0002,
"loss": 0.5608171224594116,
"mean_token_accuracy": 0.7743540853261948,
"num_tokens": 7924627.0,
"step": 486
},
{
"entropy": 0.5412854105234146,
"epoch": 1.8239700374531835,
"grad_norm": 0.032578784972429276,
"learning_rate": 0.0002,
"loss": 0.5386444330215454,
"mean_token_accuracy": 0.7795344591140747,
"num_tokens": 7940814.0,
"step": 487
},
{
"entropy": 0.5442286729812622,
"epoch": 1.8277153558052435,
"grad_norm": 0.03279658779501915,
"learning_rate": 0.0002,
"loss": 0.553512454032898,
"mean_token_accuracy": 0.7744518220424652,
"num_tokens": 7957133.0,
"step": 488
},
{
"entropy": 0.544167771935463,
"epoch": 1.8314606741573034,
"grad_norm": 0.034980904310941696,
"learning_rate": 0.0002,
"loss": 0.5495878458023071,
"mean_token_accuracy": 0.7794477045536041,
"num_tokens": 7973367.0,
"step": 489
},
{
"entropy": 0.5514913648366928,
"epoch": 1.8352059925093633,
"grad_norm": 0.0437743179500103,
"learning_rate": 0.0002,
"loss": 0.5581385493278503,
"mean_token_accuracy": 0.7734484821557999,
"num_tokens": 7989443.0,
"step": 490
},
{
"entropy": 0.5721138119697571,
"epoch": 1.8389513108614233,
"grad_norm": 0.032419200986623764,
"learning_rate": 0.0002,
"loss": 0.5644645094871521,
"mean_token_accuracy": 0.7717173397541046,
"num_tokens": 8005817.0,
"step": 491
},
{
"entropy": 0.5577604025602341,
"epoch": 1.8426966292134832,
"grad_norm": 0.04115711897611618,
"learning_rate": 0.0002,
"loss": 0.5619987845420837,
"mean_token_accuracy": 0.77156862616539,
"num_tokens": 8022160.0,
"step": 492
},
{
"entropy": 0.5528861582279205,
"epoch": 1.846441947565543,
"grad_norm": 0.029432786628603935,
"learning_rate": 0.0002,
"loss": 0.5476526618003845,
"mean_token_accuracy": 0.7781069427728653,
"num_tokens": 8038591.0,
"step": 493
},
{
"entropy": 0.5558982342481613,
"epoch": 1.850187265917603,
"grad_norm": 0.036472100764513016,
"learning_rate": 0.0002,
"loss": 0.5545116662979126,
"mean_token_accuracy": 0.776875764131546,
"num_tokens": 8054879.0,
"step": 494
},
{
"entropy": 0.5589891523122787,
"epoch": 1.8539325842696628,
"grad_norm": 0.02796117588877678,
"learning_rate": 0.0002,
"loss": 0.5532379746437073,
"mean_token_accuracy": 0.7751499116420746,
"num_tokens": 8071227.0,
"step": 495
},
{
"entropy": 0.5462375283241272,
"epoch": 1.857677902621723,
"grad_norm": 0.0307608712464571,
"learning_rate": 0.0002,
"loss": 0.5444692373275757,
"mean_token_accuracy": 0.7788323760032654,
"num_tokens": 8087424.0,
"step": 496
},
{
"entropy": 0.562559187412262,
"epoch": 1.8614232209737827,
"grad_norm": 0.03130098804831505,
"learning_rate": 0.0002,
"loss": 0.5660312175750732,
"mean_token_accuracy": 0.7673315852880478,
"num_tokens": 8104163.0,
"step": 497
},
{
"entropy": 0.5469489693641663,
"epoch": 1.8651685393258428,
"grad_norm": 0.031797025352716446,
"learning_rate": 0.0002,
"loss": 0.5592264533042908,
"mean_token_accuracy": 0.7764750421047211,
"num_tokens": 8120483.0,
"step": 498
},
{
"entropy": 0.5529169142246246,
"epoch": 1.8689138576779025,
"grad_norm": 0.0395452156662941,
"learning_rate": 0.0002,
"loss": 0.5562450885772705,
"mean_token_accuracy": 0.7762233167886734,
"num_tokens": 8136774.0,
"step": 499
},
{
"entropy": 0.5619923919439316,
"epoch": 1.8726591760299627,
"grad_norm": 0.03070960007607937,
"learning_rate": 0.0002,
"loss": 0.5671469569206238,
"mean_token_accuracy": 0.7695633620023727,
"num_tokens": 8152950.0,
"step": 500
},
{
"entropy": 0.571450412273407,
"epoch": 1.8764044943820224,
"grad_norm": 0.03263135999441147,
"learning_rate": 0.0002,
"loss": 0.5684110522270203,
"mean_token_accuracy": 0.7683538943529129,
"num_tokens": 8169231.0,
"step": 501
},
{
"entropy": 0.5732105523347855,
"epoch": 1.8801498127340825,
"grad_norm": 0.04209841415286064,
"learning_rate": 0.0002,
"loss": 0.571649968624115,
"mean_token_accuracy": 0.7642921954393387,
"num_tokens": 8185562.0,
"step": 502
},
{
"entropy": 0.5685284435749054,
"epoch": 1.8838951310861423,
"grad_norm": 0.03377389535307884,
"learning_rate": 0.0002,
"loss": 0.56586092710495,
"mean_token_accuracy": 0.7697953432798386,
"num_tokens": 8201808.0,
"step": 503
},
{
"entropy": 0.5590908825397491,
"epoch": 1.8876404494382022,
"grad_norm": 0.0385461188852787,
"learning_rate": 0.0002,
"loss": 0.5578455924987793,
"mean_token_accuracy": 0.7730644196271896,
"num_tokens": 8217945.0,
"step": 504
},
{
"entropy": 0.5606498569250107,
"epoch": 1.8913857677902621,
"grad_norm": 0.03381400555372238,
"learning_rate": 0.0002,
"loss": 0.5585749745368958,
"mean_token_accuracy": 0.7752718329429626,
"num_tokens": 8234181.0,
"step": 505
},
{
"entropy": 0.5511593520641327,
"epoch": 1.895131086142322,
"grad_norm": 0.04427889734506607,
"learning_rate": 0.0002,
"loss": 0.5605770349502563,
"mean_token_accuracy": 0.7708971202373505,
"num_tokens": 8250412.0,
"step": 506
},
{
"entropy": 0.5558828562498093,
"epoch": 1.898876404494382,
"grad_norm": 0.032851386815309525,
"learning_rate": 0.0002,
"loss": 0.5588455200195312,
"mean_token_accuracy": 0.7729152590036392,
"num_tokens": 8266940.0,
"step": 507
},
{
"entropy": 0.5533877611160278,
"epoch": 1.902621722846442,
"grad_norm": 0.034889817237854004,
"learning_rate": 0.0002,
"loss": 0.5531287789344788,
"mean_token_accuracy": 0.7766410559415817,
"num_tokens": 8283192.0,
"step": 508
},
{
"entropy": 0.55963134765625,
"epoch": 1.9063670411985019,
"grad_norm": 0.03460029140114784,
"learning_rate": 0.0002,
"loss": 0.5557897686958313,
"mean_token_accuracy": 0.7736343890428543,
"num_tokens": 8299357.0,
"step": 509
},
{
"entropy": 0.5412601754069328,
"epoch": 1.9101123595505618,
"grad_norm": 0.032328344881534576,
"learning_rate": 0.0002,
"loss": 0.5438541173934937,
"mean_token_accuracy": 0.7753017991781235,
"num_tokens": 8315841.0,
"step": 510
},
{
"entropy": 0.5540103167295456,
"epoch": 1.9138576779026217,
"grad_norm": 0.03002399578690529,
"learning_rate": 0.0002,
"loss": 0.5542548894882202,
"mean_token_accuracy": 0.7737881243228912,
"num_tokens": 8332181.0,
"step": 511
},
{
"entropy": 0.5422029197216034,
"epoch": 1.9176029962546817,
"grad_norm": 0.034409623593091965,
"learning_rate": 0.0002,
"loss": 0.5453910231590271,
"mean_token_accuracy": 0.7794903218746185,
"num_tokens": 8348319.0,
"step": 512
},
{
"entropy": 0.5566486120223999,
"epoch": 1.9213483146067416,
"grad_norm": 0.030252845957875252,
"learning_rate": 0.0002,
"loss": 0.5601068735122681,
"mean_token_accuracy": 0.7728803753852844,
"num_tokens": 8364457.0,
"step": 513
},
{
"entropy": 0.5523079186677933,
"epoch": 1.9250936329588015,
"grad_norm": 0.02711205929517746,
"learning_rate": 0.0002,
"loss": 0.5482505559921265,
"mean_token_accuracy": 0.7751948684453964,
"num_tokens": 8380923.0,
"step": 514
},
{
"entropy": 0.5604666918516159,
"epoch": 1.9288389513108615,
"grad_norm": 0.032180819660425186,
"learning_rate": 0.0002,
"loss": 0.5568802356719971,
"mean_token_accuracy": 0.7695084065198898,
"num_tokens": 8397239.0,
"step": 515
},
{
"entropy": 0.5643311589956284,
"epoch": 1.9325842696629212,
"grad_norm": 0.03032456897199154,
"learning_rate": 0.0002,
"loss": 0.5628493428230286,
"mean_token_accuracy": 0.7717900723218918,
"num_tokens": 8413791.0,
"step": 516
},
{
"entropy": 0.5468644499778748,
"epoch": 1.9363295880149813,
"grad_norm": 0.03036642260849476,
"learning_rate": 0.0002,
"loss": 0.5469942688941956,
"mean_token_accuracy": 0.7763982564210892,
"num_tokens": 8429973.0,
"step": 517
},
{
"entropy": 0.5639230608940125,
"epoch": 1.940074906367041,
"grad_norm": 0.03586732968688011,
"learning_rate": 0.0002,
"loss": 0.5693802237510681,
"mean_token_accuracy": 0.7674274742603302,
"num_tokens": 8446632.0,
"step": 518
},
{
"entropy": 0.552105188369751,
"epoch": 1.9438202247191012,
"grad_norm": 0.028923669829964638,
"learning_rate": 0.0002,
"loss": 0.5536226630210876,
"mean_token_accuracy": 0.7770767658948898,
"num_tokens": 8462861.0,
"step": 519
},
{
"entropy": 0.546203225851059,
"epoch": 1.947565543071161,
"grad_norm": 0.03517064452171326,
"learning_rate": 0.0002,
"loss": 0.5486375689506531,
"mean_token_accuracy": 0.7788794338703156,
"num_tokens": 8479188.0,
"step": 520
},
{
"entropy": 0.5571713298559189,
"epoch": 1.951310861423221,
"grad_norm": 0.03267424926161766,
"learning_rate": 0.0002,
"loss": 0.5605846047401428,
"mean_token_accuracy": 0.7741213738918304,
"num_tokens": 8495441.0,
"step": 521
},
{
"entropy": 0.5428985059261322,
"epoch": 1.9550561797752808,
"grad_norm": 0.03182944655418396,
"learning_rate": 0.0002,
"loss": 0.5459189414978027,
"mean_token_accuracy": 0.7793070673942566,
"num_tokens": 8511788.0,
"step": 522
},
{
"entropy": 0.5454448312520981,
"epoch": 1.958801498127341,
"grad_norm": 0.033397775143384933,
"learning_rate": 0.0002,
"loss": 0.5454107522964478,
"mean_token_accuracy": 0.7772410660982132,
"num_tokens": 8528152.0,
"step": 523
},
{
"entropy": 0.5469843745231628,
"epoch": 1.9625468164794007,
"grad_norm": 0.030805334448814392,
"learning_rate": 0.0002,
"loss": 0.5417147874832153,
"mean_token_accuracy": 0.7786692380905151,
"num_tokens": 8544780.0,
"step": 524
},
{
"entropy": 0.5402656495571136,
"epoch": 1.9662921348314608,
"grad_norm": 0.030130336061120033,
"learning_rate": 0.0002,
"loss": 0.5425636768341064,
"mean_token_accuracy": 0.7805010080337524,
"num_tokens": 8561035.0,
"step": 525
},
{
"entropy": 0.5509428530931473,
"epoch": 1.9700374531835205,
"grad_norm": 0.0316033698618412,
"learning_rate": 0.0002,
"loss": 0.5516440272331238,
"mean_token_accuracy": 0.775515004992485,
"num_tokens": 8577541.0,
"step": 526
},
{
"entropy": 0.5449865013360977,
"epoch": 1.9737827715355807,
"grad_norm": 0.03625763952732086,
"learning_rate": 0.0002,
"loss": 0.5528845191001892,
"mean_token_accuracy": 0.7754436731338501,
"num_tokens": 8593925.0,
"step": 527
},
{
"entropy": 0.563062384724617,
"epoch": 1.9775280898876404,
"grad_norm": 0.029838701710104942,
"learning_rate": 0.0002,
"loss": 0.5591800808906555,
"mean_token_accuracy": 0.7732478529214859,
"num_tokens": 8610524.0,
"step": 528
},
{
"entropy": 0.5514681190252304,
"epoch": 1.9812734082397003,
"grad_norm": 0.03368176147341728,
"learning_rate": 0.0002,
"loss": 0.548831582069397,
"mean_token_accuracy": 0.7749605923891068,
"num_tokens": 8626872.0,
"step": 529
},
{
"entropy": 0.5520317405462265,
"epoch": 1.9850187265917603,
"grad_norm": 0.03429826721549034,
"learning_rate": 0.0002,
"loss": 0.5514442324638367,
"mean_token_accuracy": 0.7730523347854614,
"num_tokens": 8642960.0,
"step": 530
},
{
"entropy": 0.5669658333063126,
"epoch": 1.9887640449438202,
"grad_norm": 0.0307292602956295,
"learning_rate": 0.0002,
"loss": 0.5723692178726196,
"mean_token_accuracy": 0.7651190161705017,
"num_tokens": 8659084.0,
"step": 531
},
{
"entropy": 0.5609945952892303,
"epoch": 1.9925093632958801,
"grad_norm": 0.036607109010219574,
"learning_rate": 0.0002,
"loss": 0.5636897683143616,
"mean_token_accuracy": 0.7701397836208344,
"num_tokens": 8675587.0,
"step": 532
},
{
"entropy": 0.5549340695142746,
"epoch": 1.99625468164794,
"grad_norm": 0.03215758502483368,
"learning_rate": 0.0002,
"loss": 0.5516895651817322,
"mean_token_accuracy": 0.7737619578838348,
"num_tokens": 8691850.0,
"step": 533
},
{
"entropy": 0.5620461255311966,
"epoch": 2.0,
"grad_norm": 0.028028611093759537,
"learning_rate": 0.0002,
"loss": 0.5578765869140625,
"mean_token_accuracy": 0.7716735005378723,
"num_tokens": 8708236.0,
"step": 534
},
{
"entropy": 0.557419016957283,
"epoch": 2.0037453183520597,
"grad_norm": 0.03629058599472046,
"learning_rate": 0.0002,
"loss": 0.5479042530059814,
"mean_token_accuracy": 0.7768302410840988,
"num_tokens": 8724656.0,
"step": 535
},
{
"entropy": 0.5507587045431137,
"epoch": 2.00749063670412,
"grad_norm": 0.032850366085767746,
"learning_rate": 0.0002,
"loss": 0.5528382062911987,
"mean_token_accuracy": 0.7756710648536682,
"num_tokens": 8741046.0,
"step": 536
},
{
"entropy": 0.5404622703790665,
"epoch": 2.0112359550561796,
"grad_norm": 0.031562913209199905,
"learning_rate": 0.0002,
"loss": 0.5380600094795227,
"mean_token_accuracy": 0.7781912684440613,
"num_tokens": 8757535.0,
"step": 537
},
{
"entropy": 0.5316804945468903,
"epoch": 2.0149812734082397,
"grad_norm": 0.03351443260908127,
"learning_rate": 0.0002,
"loss": 0.5359355807304382,
"mean_token_accuracy": 0.7827723175287247,
"num_tokens": 8773824.0,
"step": 538
},
{
"entropy": 0.5419723987579346,
"epoch": 2.0187265917602994,
"grad_norm": 0.03948935121297836,
"learning_rate": 0.0002,
"loss": 0.5471257567405701,
"mean_token_accuracy": 0.7790137678384781,
"num_tokens": 8790095.0,
"step": 539
},
{
"entropy": 0.5343683362007141,
"epoch": 2.0224719101123596,
"grad_norm": 0.031161192804574966,
"learning_rate": 0.0002,
"loss": 0.5309802889823914,
"mean_token_accuracy": 0.7821521759033203,
"num_tokens": 8806510.0,
"step": 540
},
{
"entropy": 0.5364920198917389,
"epoch": 2.0262172284644193,
"grad_norm": 0.03507857769727707,
"learning_rate": 0.0002,
"loss": 0.5324068069458008,
"mean_token_accuracy": 0.7870013862848282,
"num_tokens": 8822654.0,
"step": 541
},
{
"entropy": 0.5483170747756958,
"epoch": 2.0299625468164795,
"grad_norm": 0.03222345933318138,
"learning_rate": 0.0002,
"loss": 0.549699068069458,
"mean_token_accuracy": 0.7751237750053406,
"num_tokens": 8839285.0,
"step": 542
},
{
"entropy": 0.5425759255886078,
"epoch": 2.033707865168539,
"grad_norm": 0.03227977082133293,
"learning_rate": 0.0002,
"loss": 0.5380892753601074,
"mean_token_accuracy": 0.7839174568653107,
"num_tokens": 8855507.0,
"step": 543
},
{
"entropy": 0.5272768065333366,
"epoch": 2.0374531835205993,
"grad_norm": 0.03487760201096535,
"learning_rate": 0.0002,
"loss": 0.5265735387802124,
"mean_token_accuracy": 0.7857347279787064,
"num_tokens": 8871873.0,
"step": 544
},
{
"entropy": 0.5219558328390121,
"epoch": 2.041198501872659,
"grad_norm": 0.035983484238386154,
"learning_rate": 0.0002,
"loss": 0.5337969660758972,
"mean_token_accuracy": 0.7834839969873428,
"num_tokens": 8887984.0,
"step": 545
},
{
"entropy": 0.5376651287078857,
"epoch": 2.044943820224719,
"grad_norm": 0.038352932780981064,
"learning_rate": 0.0002,
"loss": 0.5438427329063416,
"mean_token_accuracy": 0.7784269452095032,
"num_tokens": 8904216.0,
"step": 546
},
{
"entropy": 0.5456122606992722,
"epoch": 2.048689138576779,
"grad_norm": 0.036168649792671204,
"learning_rate": 0.0002,
"loss": 0.5431267023086548,
"mean_token_accuracy": 0.7829999178647995,
"num_tokens": 8920617.0,
"step": 547
},
{
"entropy": 0.5304486304521561,
"epoch": 2.052434456928839,
"grad_norm": 0.03324899077415466,
"learning_rate": 0.0002,
"loss": 0.5289336442947388,
"mean_token_accuracy": 0.7849617451429367,
"num_tokens": 8936835.0,
"step": 548
},
{
"entropy": 0.5275251343846321,
"epoch": 2.056179775280899,
"grad_norm": 0.03898227587342262,
"learning_rate": 0.0002,
"loss": 0.530302882194519,
"mean_token_accuracy": 0.7835600972175598,
"num_tokens": 8953009.0,
"step": 549
},
{
"entropy": 0.5530034005641937,
"epoch": 2.059925093632959,
"grad_norm": 0.038006141781806946,
"learning_rate": 0.0002,
"loss": 0.5494067072868347,
"mean_token_accuracy": 0.7755949050188065,
"num_tokens": 8969428.0,
"step": 550
},
{
"entropy": 0.5418991297483444,
"epoch": 2.0636704119850187,
"grad_norm": 0.03261435031890869,
"learning_rate": 0.0002,
"loss": 0.5322299003601074,
"mean_token_accuracy": 0.7837673723697662,
"num_tokens": 8985844.0,
"step": 551
},
{
"entropy": 0.5309967398643494,
"epoch": 2.067415730337079,
"grad_norm": 0.03797997906804085,
"learning_rate": 0.0002,
"loss": 0.5291654467582703,
"mean_token_accuracy": 0.7849747538566589,
"num_tokens": 9002169.0,
"step": 552
},
{
"entropy": 0.5188492685556412,
"epoch": 2.0711610486891385,
"grad_norm": 0.038583919405937195,
"learning_rate": 0.0002,
"loss": 0.5282660722732544,
"mean_token_accuracy": 0.7870546579360962,
"num_tokens": 9018570.0,
"step": 553
},
{
"entropy": 0.534794494509697,
"epoch": 2.0749063670411987,
"grad_norm": 0.03449336439371109,
"learning_rate": 0.0002,
"loss": 0.5352678298950195,
"mean_token_accuracy": 0.7845733165740967,
"num_tokens": 9034788.0,
"step": 554
},
{
"entropy": 0.5308385342359543,
"epoch": 2.0786516853932584,
"grad_norm": 0.03845726326107979,
"learning_rate": 0.0002,
"loss": 0.5325117111206055,
"mean_token_accuracy": 0.7835551649332047,
"num_tokens": 9051109.0,
"step": 555
},
{
"entropy": 0.5309025943279266,
"epoch": 2.0823970037453186,
"grad_norm": 0.03809129074215889,
"learning_rate": 0.0002,
"loss": 0.5253363847732544,
"mean_token_accuracy": 0.7868698537349701,
"num_tokens": 9067268.0,
"step": 556
},
{
"entropy": 0.5575416088104248,
"epoch": 2.0861423220973783,
"grad_norm": 0.034367915242910385,
"learning_rate": 0.0002,
"loss": 0.5523205995559692,
"mean_token_accuracy": 0.7749448716640472,
"num_tokens": 9083891.0,
"step": 557
},
{
"entropy": 0.53434719145298,
"epoch": 2.0898876404494384,
"grad_norm": 0.03826329484581947,
"learning_rate": 0.0002,
"loss": 0.5409042835235596,
"mean_token_accuracy": 0.7785179018974304,
"num_tokens": 9100370.0,
"step": 558
},
{
"entropy": 0.5194257721304893,
"epoch": 2.093632958801498,
"grad_norm": 0.03882572054862976,
"learning_rate": 0.0002,
"loss": 0.5238875150680542,
"mean_token_accuracy": 0.7858750522136688,
"num_tokens": 9116506.0,
"step": 559
},
{
"entropy": 0.5331729799509048,
"epoch": 2.097378277153558,
"grad_norm": 0.045005545020103455,
"learning_rate": 0.0002,
"loss": 0.5285134315490723,
"mean_token_accuracy": 0.7852654755115509,
"num_tokens": 9132871.0,
"step": 560
},
{
"entropy": 0.5405212640762329,
"epoch": 2.101123595505618,
"grad_norm": 0.04780491814017296,
"learning_rate": 0.0002,
"loss": 0.5461173057556152,
"mean_token_accuracy": 0.7770982980728149,
"num_tokens": 9149174.0,
"step": 561
},
{
"entropy": 0.5288062691688538,
"epoch": 2.1048689138576777,
"grad_norm": 0.04940470680594444,
"learning_rate": 0.0002,
"loss": 0.5337265729904175,
"mean_token_accuracy": 0.7846069186925888,
"num_tokens": 9165316.0,
"step": 562
},
{
"entropy": 0.531680166721344,
"epoch": 2.108614232209738,
"grad_norm": 0.05061717331409454,
"learning_rate": 0.0002,
"loss": 0.5271866321563721,
"mean_token_accuracy": 0.7854976505041122,
"num_tokens": 9181482.0,
"step": 563
},
{
"entropy": 0.5314291417598724,
"epoch": 2.1123595505617976,
"grad_norm": 0.0397643968462944,
"learning_rate": 0.0002,
"loss": 0.5271567702293396,
"mean_token_accuracy": 0.7851341366767883,
"num_tokens": 9197662.0,
"step": 564
},
{
"entropy": 0.5252021998167038,
"epoch": 2.1161048689138577,
"grad_norm": 0.041956719011068344,
"learning_rate": 0.0002,
"loss": 0.5281031727790833,
"mean_token_accuracy": 0.7877316772937775,
"num_tokens": 9214001.0,
"step": 565
},
{
"entropy": 0.5378998965024948,
"epoch": 2.1198501872659175,
"grad_norm": 0.03963020071387291,
"learning_rate": 0.0002,
"loss": 0.5432679653167725,
"mean_token_accuracy": 0.7765485197305679,
"num_tokens": 9230298.0,
"step": 566
},
{
"entropy": 0.5449769049882889,
"epoch": 2.1235955056179776,
"grad_norm": 0.04862145707011223,
"learning_rate": 0.0002,
"loss": 0.5481102466583252,
"mean_token_accuracy": 0.7771643400192261,
"num_tokens": 9246648.0,
"step": 567
},
{
"entropy": 0.5432566553354263,
"epoch": 2.1273408239700373,
"grad_norm": 0.03826707601547241,
"learning_rate": 0.0002,
"loss": 0.5354676246643066,
"mean_token_accuracy": 0.7808031290769577,
"num_tokens": 9263059.0,
"step": 568
},
{
"entropy": 0.5395092964172363,
"epoch": 2.1310861423220975,
"grad_norm": 0.04806908592581749,
"learning_rate": 0.0002,
"loss": 0.5348396897315979,
"mean_token_accuracy": 0.7838325351476669,
"num_tokens": 9279690.0,
"step": 569
},
{
"entropy": 0.512074276804924,
"epoch": 2.134831460674157,
"grad_norm": 0.034932930022478104,
"learning_rate": 0.0002,
"loss": 0.5059640407562256,
"mean_token_accuracy": 0.7954477220773697,
"num_tokens": 9296053.0,
"step": 570
},
{
"entropy": 0.5317389219999313,
"epoch": 2.1385767790262173,
"grad_norm": 0.054850984364748,
"learning_rate": 0.0002,
"loss": 0.5419769287109375,
"mean_token_accuracy": 0.7804836332798004,
"num_tokens": 9312250.0,
"step": 571
},
{
"entropy": 0.523776650428772,
"epoch": 2.142322097378277,
"grad_norm": 0.03885575383901596,
"learning_rate": 0.0002,
"loss": 0.5337730050086975,
"mean_token_accuracy": 0.7821401208639145,
"num_tokens": 9328588.0,
"step": 572
},
{
"entropy": 0.5306317359209061,
"epoch": 2.146067415730337,
"grad_norm": 0.04031698405742645,
"learning_rate": 0.0002,
"loss": 0.5285602807998657,
"mean_token_accuracy": 0.7860189080238342,
"num_tokens": 9344771.0,
"step": 573
},
{
"entropy": 0.5253511220216751,
"epoch": 2.149812734082397,
"grad_norm": 0.03704000264406204,
"learning_rate": 0.0002,
"loss": 0.519854724407196,
"mean_token_accuracy": 0.7907343953847885,
"num_tokens": 9360913.0,
"step": 574
},
{
"entropy": 0.5498696267604828,
"epoch": 2.153558052434457,
"grad_norm": 0.03690071031451225,
"learning_rate": 0.0002,
"loss": 0.5417680144309998,
"mean_token_accuracy": 0.7790531069040298,
"num_tokens": 9377532.0,
"step": 575
},
{
"entropy": 0.5402537435293198,
"epoch": 2.157303370786517,
"grad_norm": 0.0378306582570076,
"learning_rate": 0.0002,
"loss": 0.541071891784668,
"mean_token_accuracy": 0.7788532823324203,
"num_tokens": 9393830.0,
"step": 576
},
{
"entropy": 0.5282108932733536,
"epoch": 2.161048689138577,
"grad_norm": 0.04091333597898483,
"learning_rate": 0.0002,
"loss": 0.5348851084709167,
"mean_token_accuracy": 0.7821558713912964,
"num_tokens": 9410274.0,
"step": 577
},
{
"entropy": 0.5303814560174942,
"epoch": 2.1647940074906367,
"grad_norm": 0.03591841831803322,
"learning_rate": 0.0002,
"loss": 0.5331617593765259,
"mean_token_accuracy": 0.7818120270967484,
"num_tokens": 9426511.0,
"step": 578
},
{
"entropy": 0.5272700041532516,
"epoch": 2.168539325842697,
"grad_norm": 0.03997735306620598,
"learning_rate": 0.0002,
"loss": 0.5334488153457642,
"mean_token_accuracy": 0.7814541161060333,
"num_tokens": 9442897.0,
"step": 579
},
{
"entropy": 0.5336402952671051,
"epoch": 2.1722846441947565,
"grad_norm": 0.0450415313243866,
"learning_rate": 0.0002,
"loss": 0.5275048017501831,
"mean_token_accuracy": 0.7864081561565399,
"num_tokens": 9459023.0,
"step": 580
},
{
"entropy": 0.538782149553299,
"epoch": 2.1760299625468167,
"grad_norm": 0.03600127249956131,
"learning_rate": 0.0002,
"loss": 0.5313720107078552,
"mean_token_accuracy": 0.7847412079572678,
"num_tokens": 9475337.0,
"step": 581
},
{
"entropy": 0.5273982435464859,
"epoch": 2.1797752808988764,
"grad_norm": 0.04744241386651993,
"learning_rate": 0.0002,
"loss": 0.5319021344184875,
"mean_token_accuracy": 0.7850695848464966,
"num_tokens": 9491529.0,
"step": 582
},
{
"entropy": 0.5370319783687592,
"epoch": 2.1835205992509366,
"grad_norm": 0.035024821758270264,
"learning_rate": 0.0002,
"loss": 0.5342311859130859,
"mean_token_accuracy": 0.7830409854650497,
"num_tokens": 9508099.0,
"step": 583
},
{
"entropy": 0.5350894033908844,
"epoch": 2.1872659176029963,
"grad_norm": 0.04598443582653999,
"learning_rate": 0.0002,
"loss": 0.5383565425872803,
"mean_token_accuracy": 0.7810914367437363,
"num_tokens": 9524506.0,
"step": 584
},
{
"entropy": 0.5270723178982735,
"epoch": 2.191011235955056,
"grad_norm": 0.03489379957318306,
"learning_rate": 0.0002,
"loss": 0.5261937379837036,
"mean_token_accuracy": 0.7874008566141129,
"num_tokens": 9540868.0,
"step": 585
},
{
"entropy": 0.5187418013811111,
"epoch": 2.194756554307116,
"grad_norm": 0.04006824642419815,
"learning_rate": 0.0002,
"loss": 0.516140341758728,
"mean_token_accuracy": 0.7876712679862976,
"num_tokens": 9557109.0,
"step": 586
},
{
"entropy": 0.5397524982690811,
"epoch": 2.198501872659176,
"grad_norm": 0.037596385926008224,
"learning_rate": 0.0002,
"loss": 0.5337037444114685,
"mean_token_accuracy": 0.7848425805568695,
"num_tokens": 9573451.0,
"step": 587
},
{
"entropy": 0.542935311794281,
"epoch": 2.202247191011236,
"grad_norm": 0.05163532868027687,
"learning_rate": 0.0002,
"loss": 0.548254668712616,
"mean_token_accuracy": 0.7771319299936295,
"num_tokens": 9589800.0,
"step": 588
},
{
"entropy": 0.524966299533844,
"epoch": 2.2059925093632957,
"grad_norm": 0.04678061604499817,
"learning_rate": 0.0002,
"loss": 0.537632405757904,
"mean_token_accuracy": 0.7821167409420013,
"num_tokens": 9606180.0,
"step": 589
},
{
"entropy": 0.5223182588815689,
"epoch": 2.209737827715356,
"grad_norm": 0.04918593540787697,
"learning_rate": 0.0002,
"loss": 0.5256946086883545,
"mean_token_accuracy": 0.7862184792757034,
"num_tokens": 9622319.0,
"step": 590
},
{
"entropy": 0.545245572924614,
"epoch": 2.2134831460674156,
"grad_norm": 0.044536106288433075,
"learning_rate": 0.0002,
"loss": 0.5387803316116333,
"mean_token_accuracy": 0.7820178419351578,
"num_tokens": 9638605.0,
"step": 591
},
{
"entropy": 0.5572000294923782,
"epoch": 2.2172284644194757,
"grad_norm": 0.04941220581531525,
"learning_rate": 0.0002,
"loss": 0.5500818490982056,
"mean_token_accuracy": 0.7780845314264297,
"num_tokens": 9655041.0,
"step": 592
},
{
"entropy": 0.524405911564827,
"epoch": 2.2209737827715355,
"grad_norm": 0.04783201217651367,
"learning_rate": 0.0002,
"loss": 0.5203397870063782,
"mean_token_accuracy": 0.7880013734102249,
"num_tokens": 9671239.0,
"step": 593
},
{
"entropy": 0.5252467542886734,
"epoch": 2.2247191011235956,
"grad_norm": 0.04301263764500618,
"learning_rate": 0.0002,
"loss": 0.5267080664634705,
"mean_token_accuracy": 0.7888626754283905,
"num_tokens": 9687363.0,
"step": 594
},
{
"entropy": 0.53339484333992,
"epoch": 2.2284644194756553,
"grad_norm": 0.05318563058972359,
"learning_rate": 0.0002,
"loss": 0.5481151342391968,
"mean_token_accuracy": 0.7762688100337982,
"num_tokens": 9703829.0,
"step": 595
},
{
"entropy": 0.5450247228145599,
"epoch": 2.2322097378277155,
"grad_norm": 0.03796645253896713,
"learning_rate": 0.0002,
"loss": 0.5463745594024658,
"mean_token_accuracy": 0.7799876779317856,
"num_tokens": 9720055.0,
"step": 596
},
{
"entropy": 0.5355545580387115,
"epoch": 2.235955056179775,
"grad_norm": 0.04619521647691727,
"learning_rate": 0.0002,
"loss": 0.5383350253105164,
"mean_token_accuracy": 0.7803421318531036,
"num_tokens": 9736065.0,
"step": 597
},
{
"entropy": 0.5393659174442291,
"epoch": 2.2397003745318353,
"grad_norm": 0.04189852997660637,
"learning_rate": 0.0002,
"loss": 0.5408390760421753,
"mean_token_accuracy": 0.7799636572599411,
"num_tokens": 9752285.0,
"step": 598
},
{
"entropy": 0.5505337119102478,
"epoch": 2.243445692883895,
"grad_norm": 0.04415363445878029,
"learning_rate": 0.0002,
"loss": 0.5492491722106934,
"mean_token_accuracy": 0.7789665758609772,
"num_tokens": 9768797.0,
"step": 599
},
{
"entropy": 0.5322769433259964,
"epoch": 2.247191011235955,
"grad_norm": 0.0446348674595356,
"learning_rate": 0.0002,
"loss": 0.5362676978111267,
"mean_token_accuracy": 0.7827903628349304,
"num_tokens": 9785259.0,
"step": 600
},
{
"entropy": 0.5283399671316147,
"epoch": 2.250936329588015,
"grad_norm": 0.04350518435239792,
"learning_rate": 0.0002,
"loss": 0.5263485312461853,
"mean_token_accuracy": 0.7854094952344894,
"num_tokens": 9801683.0,
"step": 601
},
{
"entropy": 0.5155128389596939,
"epoch": 2.254681647940075,
"grad_norm": 0.049416691064834595,
"learning_rate": 0.0002,
"loss": 0.5274794101715088,
"mean_token_accuracy": 0.7866163551807404,
"num_tokens": 9817897.0,
"step": 602
},
{
"entropy": 0.555690124630928,
"epoch": 2.258426966292135,
"grad_norm": 0.042244087904691696,
"learning_rate": 0.0002,
"loss": 0.5587432384490967,
"mean_token_accuracy": 0.7742861956357956,
"num_tokens": 9834109.0,
"step": 603
},
{
"entropy": 0.5449231714010239,
"epoch": 2.262172284644195,
"grad_norm": 0.04214772582054138,
"learning_rate": 0.0002,
"loss": 0.5424601435661316,
"mean_token_accuracy": 0.7795074135065079,
"num_tokens": 9850508.0,
"step": 604
},
{
"entropy": 0.551129087805748,
"epoch": 2.2659176029962547,
"grad_norm": 0.04242361709475517,
"learning_rate": 0.0002,
"loss": 0.5350391268730164,
"mean_token_accuracy": 0.7817512005567551,
"num_tokens": 9866973.0,
"step": 605
},
{
"entropy": 0.5557906329631805,
"epoch": 2.2696629213483144,
"grad_norm": 0.04337119311094284,
"learning_rate": 0.0002,
"loss": 0.5464892387390137,
"mean_token_accuracy": 0.7796575874090195,
"num_tokens": 9883567.0,
"step": 606
},
{
"entropy": 0.5241350680589676,
"epoch": 2.2734082397003745,
"grad_norm": 0.04597577825188637,
"learning_rate": 0.0002,
"loss": 0.5339911580085754,
"mean_token_accuracy": 0.784000501036644,
"num_tokens": 9899884.0,
"step": 607
},
{
"entropy": 0.5317652076482773,
"epoch": 2.2771535580524347,
"grad_norm": 0.06419555842876434,
"learning_rate": 0.0002,
"loss": 0.5507545471191406,
"mean_token_accuracy": 0.7757140696048737,
"num_tokens": 9916225.0,
"step": 608
},
{
"entropy": 0.520916298031807,
"epoch": 2.2808988764044944,
"grad_norm": 0.0413593053817749,
"learning_rate": 0.0002,
"loss": 0.5282008051872253,
"mean_token_accuracy": 0.7836293429136276,
"num_tokens": 9932137.0,
"step": 609
},
{
"entropy": 0.550976499915123,
"epoch": 2.284644194756554,
"grad_norm": 0.04407277703285217,
"learning_rate": 0.0002,
"loss": 0.5476412177085876,
"mean_token_accuracy": 0.7784940898418427,
"num_tokens": 9948364.0,
"step": 610
},
{
"entropy": 0.5534344464540482,
"epoch": 2.2883895131086143,
"grad_norm": 0.036215297877788544,
"learning_rate": 0.0002,
"loss": 0.5448459386825562,
"mean_token_accuracy": 0.7809607535600662,
"num_tokens": 9964781.0,
"step": 611
},
{
"entropy": 0.540510505437851,
"epoch": 2.292134831460674,
"grad_norm": 0.037168748676776886,
"learning_rate": 0.0002,
"loss": 0.5290323495864868,
"mean_token_accuracy": 0.7844896763563156,
"num_tokens": 9980949.0,
"step": 612
},
{
"entropy": 0.537270799279213,
"epoch": 2.295880149812734,
"grad_norm": 0.0456305667757988,
"learning_rate": 0.0002,
"loss": 0.5368558764457703,
"mean_token_accuracy": 0.781862810254097,
"num_tokens": 9997181.0,
"step": 613
},
{
"entropy": 0.529745414853096,
"epoch": 2.299625468164794,
"grad_norm": 0.04219827800989151,
"learning_rate": 0.0002,
"loss": 0.5287020206451416,
"mean_token_accuracy": 0.7848487794399261,
"num_tokens": 10013303.0,
"step": 614
},
{
"entropy": 0.5297169536352158,
"epoch": 2.303370786516854,
"grad_norm": 0.05070658028125763,
"learning_rate": 0.0002,
"loss": 0.5422332286834717,
"mean_token_accuracy": 0.7800150513648987,
"num_tokens": 10029569.0,
"step": 615
},
{
"entropy": 0.5271121859550476,
"epoch": 2.3071161048689137,
"grad_norm": 0.04743409901857376,
"learning_rate": 0.0002,
"loss": 0.5323826670646667,
"mean_token_accuracy": 0.7835269123315811,
"num_tokens": 10045920.0,
"step": 616
},
{
"entropy": 0.5429159998893738,
"epoch": 2.310861423220974,
"grad_norm": 0.04348791018128395,
"learning_rate": 0.0002,
"loss": 0.5469599962234497,
"mean_token_accuracy": 0.777765229344368,
"num_tokens": 10062068.0,
"step": 617
},
{
"entropy": 0.5268895328044891,
"epoch": 2.3146067415730336,
"grad_norm": 0.046540766954422,
"learning_rate": 0.0002,
"loss": 0.5318824052810669,
"mean_token_accuracy": 0.784139409661293,
"num_tokens": 10078035.0,
"step": 618
},
{
"entropy": 0.5406851470470428,
"epoch": 2.3183520599250937,
"grad_norm": 0.03879360482096672,
"learning_rate": 0.0002,
"loss": 0.5327763557434082,
"mean_token_accuracy": 0.7838515788316727,
"num_tokens": 10094069.0,
"step": 619
},
{
"entropy": 0.5550850629806519,
"epoch": 2.3220973782771535,
"grad_norm": 0.04021632671356201,
"learning_rate": 0.0002,
"loss": 0.544082760810852,
"mean_token_accuracy": 0.7794292271137238,
"num_tokens": 10110562.0,
"step": 620
},
{
"entropy": 0.5633902698755264,
"epoch": 2.3258426966292136,
"grad_norm": 0.03872428461909294,
"learning_rate": 0.0002,
"loss": 0.5591956973075867,
"mean_token_accuracy": 0.7731619328260422,
"num_tokens": 10127313.0,
"step": 621
},
{
"entropy": 0.526028499007225,
"epoch": 2.3295880149812733,
"grad_norm": 0.04169732704758644,
"learning_rate": 0.0002,
"loss": 0.5296715497970581,
"mean_token_accuracy": 0.7846156656742096,
"num_tokens": 10143539.0,
"step": 622
},
{
"entropy": 0.5621512830257416,
"epoch": 2.3333333333333335,
"grad_norm": 0.03567031770944595,
"learning_rate": 0.0002,
"loss": 0.5641921758651733,
"mean_token_accuracy": 0.7724113464355469,
"num_tokens": 10159890.0,
"step": 623
},
{
"entropy": 0.5621916353702545,
"epoch": 2.337078651685393,
"grad_norm": 0.044719185680150986,
"learning_rate": 0.0002,
"loss": 0.5658475756645203,
"mean_token_accuracy": 0.768171489238739,
"num_tokens": 10176303.0,
"step": 624
},
{
"entropy": 0.5397062003612518,
"epoch": 2.3408239700374533,
"grad_norm": 0.03938845917582512,
"learning_rate": 0.0002,
"loss": 0.5410289168357849,
"mean_token_accuracy": 0.7816459834575653,
"num_tokens": 10192725.0,
"step": 625
},
{
"entropy": 0.5308454632759094,
"epoch": 2.344569288389513,
"grad_norm": 0.0393369197845459,
"learning_rate": 0.0002,
"loss": 0.5327979326248169,
"mean_token_accuracy": 0.7836434692144394,
"num_tokens": 10208900.0,
"step": 626
},
{
"entropy": 0.5351555794477463,
"epoch": 2.348314606741573,
"grad_norm": 0.044483788311481476,
"learning_rate": 0.0002,
"loss": 0.537283182144165,
"mean_token_accuracy": 0.784860372543335,
"num_tokens": 10224853.0,
"step": 627
},
{
"entropy": 0.5380195677280426,
"epoch": 2.352059925093633,
"grad_norm": 0.04018259420990944,
"learning_rate": 0.0002,
"loss": 0.5401010513305664,
"mean_token_accuracy": 0.7777950018644333,
"num_tokens": 10241181.0,
"step": 628
},
{
"entropy": 0.5319711565971375,
"epoch": 2.355805243445693,
"grad_norm": 0.052694015204906464,
"learning_rate": 0.0002,
"loss": 0.5327081680297852,
"mean_token_accuracy": 0.7857355177402496,
"num_tokens": 10257569.0,
"step": 629
},
{
"entropy": 0.5219532996416092,
"epoch": 2.359550561797753,
"grad_norm": 0.0513097383081913,
"learning_rate": 0.0002,
"loss": 0.5344624519348145,
"mean_token_accuracy": 0.781092032790184,
"num_tokens": 10273502.0,
"step": 630
},
{
"entropy": 0.5303360670804977,
"epoch": 2.3632958801498125,
"grad_norm": 0.05031297355890274,
"learning_rate": 0.0002,
"loss": 0.5381285548210144,
"mean_token_accuracy": 0.7818425595760345,
"num_tokens": 10289765.0,
"step": 631
},
{
"entropy": 0.5247592329978943,
"epoch": 2.3670411985018727,
"grad_norm": 0.040263328701257706,
"learning_rate": 0.0002,
"loss": 0.5220550298690796,
"mean_token_accuracy": 0.786396861076355,
"num_tokens": 10306027.0,
"step": 632
},
{
"entropy": 0.5546284765005112,
"epoch": 2.370786516853933,
"grad_norm": 0.04438352584838867,
"learning_rate": 0.0002,
"loss": 0.5477085113525391,
"mean_token_accuracy": 0.7770822197198868,
"num_tokens": 10322169.0,
"step": 633
},
{
"entropy": 0.5496452152729034,
"epoch": 2.3745318352059925,
"grad_norm": 0.048432301729917526,
"learning_rate": 0.0002,
"loss": 0.5438807606697083,
"mean_token_accuracy": 0.780827596783638,
"num_tokens": 10338568.0,
"step": 634
},
{
"entropy": 0.5297926962375641,
"epoch": 2.3782771535580522,
"grad_norm": 0.03634348511695862,
"learning_rate": 0.0002,
"loss": 0.5239929556846619,
"mean_token_accuracy": 0.7896489948034286,
"num_tokens": 10354708.0,
"step": 635
},
{
"entropy": 0.5366943925619125,
"epoch": 2.3820224719101124,
"grad_norm": 0.051037952303886414,
"learning_rate": 0.0002,
"loss": 0.5460379123687744,
"mean_token_accuracy": 0.7777325063943863,
"num_tokens": 10371358.0,
"step": 636
},
{
"entropy": 0.5219292491674423,
"epoch": 2.385767790262172,
"grad_norm": 0.03863009437918663,
"learning_rate": 0.0002,
"loss": 0.5266265273094177,
"mean_token_accuracy": 0.7879810929298401,
"num_tokens": 10387500.0,
"step": 637
},
{
"entropy": 0.5288277566432953,
"epoch": 2.3895131086142323,
"grad_norm": 0.05099929869174957,
"learning_rate": 0.0002,
"loss": 0.5307456851005554,
"mean_token_accuracy": 0.7841700166463852,
"num_tokens": 10404042.0,
"step": 638
},
{
"entropy": 0.5441994965076447,
"epoch": 2.393258426966292,
"grad_norm": 0.03832423314452171,
"learning_rate": 0.0002,
"loss": 0.5406984090805054,
"mean_token_accuracy": 0.7822638154029846,
"num_tokens": 10420308.0,
"step": 639
},
{
"entropy": 0.5474298596382141,
"epoch": 2.397003745318352,
"grad_norm": 0.03593610227108002,
"learning_rate": 0.0002,
"loss": 0.5448755025863647,
"mean_token_accuracy": 0.7769681811332703,
"num_tokens": 10436473.0,
"step": 640
},
{
"entropy": 0.5544268637895584,
"epoch": 2.400749063670412,
"grad_norm": 0.05683998391032219,
"learning_rate": 0.0002,
"loss": 0.5575302839279175,
"mean_token_accuracy": 0.7728745937347412,
"num_tokens": 10453006.0,
"step": 641
},
{
"entropy": 0.5459371656179428,
"epoch": 2.404494382022472,
"grad_norm": 0.041604217141866684,
"learning_rate": 0.0002,
"loss": 0.5482038855552673,
"mean_token_accuracy": 0.7801420837640762,
"num_tokens": 10469281.0,
"step": 642
},
{
"entropy": 0.5380865782499313,
"epoch": 2.4082397003745317,
"grad_norm": 0.05113884434103966,
"learning_rate": 0.0002,
"loss": 0.5394017696380615,
"mean_token_accuracy": 0.7834807485342026,
"num_tokens": 10485666.0,
"step": 643
},
{
"entropy": 0.549991711974144,
"epoch": 2.411985018726592,
"grad_norm": 0.03647167235612869,
"learning_rate": 0.0002,
"loss": 0.553663969039917,
"mean_token_accuracy": 0.774835467338562,
"num_tokens": 10501890.0,
"step": 644
},
{
"entropy": 0.5480955541133881,
"epoch": 2.4157303370786516,
"grad_norm": 0.04493939131498337,
"learning_rate": 0.0002,
"loss": 0.5466475486755371,
"mean_token_accuracy": 0.7790014296770096,
"num_tokens": 10518311.0,
"step": 645
},
{
"entropy": 0.5469405502080917,
"epoch": 2.4194756554307117,
"grad_norm": 0.040811046957969666,
"learning_rate": 0.0002,
"loss": 0.5483651161193848,
"mean_token_accuracy": 0.7788845151662827,
"num_tokens": 10534519.0,
"step": 646
},
{
"entropy": 0.542740598320961,
"epoch": 2.4232209737827715,
"grad_norm": 0.045434851199388504,
"learning_rate": 0.0002,
"loss": 0.5396543741226196,
"mean_token_accuracy": 0.7790694683790207,
"num_tokens": 10550595.0,
"step": 647
},
{
"entropy": 0.535121500492096,
"epoch": 2.4269662921348316,
"grad_norm": 0.04115886241197586,
"learning_rate": 0.0002,
"loss": 0.5374845266342163,
"mean_token_accuracy": 0.7803627252578735,
"num_tokens": 10566917.0,
"step": 648
},
{
"entropy": 0.5375159233808517,
"epoch": 2.4307116104868913,
"grad_norm": 0.04332772269845009,
"learning_rate": 0.0002,
"loss": 0.5381888151168823,
"mean_token_accuracy": 0.7793711423873901,
"num_tokens": 10583313.0,
"step": 649
},
{
"entropy": 0.5432725697755814,
"epoch": 2.4344569288389515,
"grad_norm": 0.041510697454214096,
"learning_rate": 0.0002,
"loss": 0.5448310375213623,
"mean_token_accuracy": 0.7758618593215942,
"num_tokens": 10599510.0,
"step": 650
},
{
"entropy": 0.5411451011896133,
"epoch": 2.438202247191011,
"grad_norm": 0.04265889525413513,
"learning_rate": 0.0002,
"loss": 0.5466779470443726,
"mean_token_accuracy": 0.7779202163219452,
"num_tokens": 10615799.0,
"step": 651
},
{
"entropy": 0.535615861415863,
"epoch": 2.4419475655430714,
"grad_norm": 0.04081408306956291,
"learning_rate": 0.0002,
"loss": 0.539250373840332,
"mean_token_accuracy": 0.7790500521659851,
"num_tokens": 10632054.0,
"step": 652
},
{
"entropy": 0.5231917202472687,
"epoch": 2.445692883895131,
"grad_norm": 0.037281572818756104,
"learning_rate": 0.0002,
"loss": 0.5242350101470947,
"mean_token_accuracy": 0.7875235080718994,
"num_tokens": 10648293.0,
"step": 653
},
{
"entropy": 0.5311395078897476,
"epoch": 2.449438202247191,
"grad_norm": 0.04048464447259903,
"learning_rate": 0.0002,
"loss": 0.5264798402786255,
"mean_token_accuracy": 0.7850567251443863,
"num_tokens": 10664249.0,
"step": 654
},
{
"entropy": 0.5295854657888412,
"epoch": 2.453183520599251,
"grad_norm": 0.042382705956697464,
"learning_rate": 0.0002,
"loss": 0.5322737097740173,
"mean_token_accuracy": 0.7859133034944534,
"num_tokens": 10680711.0,
"step": 655
},
{
"entropy": 0.5250136256217957,
"epoch": 2.4569288389513106,
"grad_norm": 0.047354746609926224,
"learning_rate": 0.0002,
"loss": 0.524110734462738,
"mean_token_accuracy": 0.7874706089496613,
"num_tokens": 10696903.0,
"step": 656
},
{
"entropy": 0.5428455919027328,
"epoch": 2.460674157303371,
"grad_norm": 0.04214261844754219,
"learning_rate": 0.0002,
"loss": 0.5400563478469849,
"mean_token_accuracy": 0.7825742065906525,
"num_tokens": 10713018.0,
"step": 657
},
{
"entropy": 0.5570447146892548,
"epoch": 2.464419475655431,
"grad_norm": 0.04198653623461723,
"learning_rate": 0.0002,
"loss": 0.5468944907188416,
"mean_token_accuracy": 0.7801797240972519,
"num_tokens": 10729583.0,
"step": 658
},
{
"entropy": 0.5350753366947174,
"epoch": 2.4681647940074907,
"grad_norm": 0.03751063346862793,
"learning_rate": 0.0002,
"loss": 0.5351656675338745,
"mean_token_accuracy": 0.7814910113811493,
"num_tokens": 10746077.0,
"step": 659
},
{
"entropy": 0.5235352218151093,
"epoch": 2.4719101123595504,
"grad_norm": 0.040084533393383026,
"learning_rate": 0.0002,
"loss": 0.531356692314148,
"mean_token_accuracy": 0.7839406430721283,
"num_tokens": 10762311.0,
"step": 660
},
{
"entropy": 0.5389134883880615,
"epoch": 2.4756554307116105,
"grad_norm": 0.05371229350566864,
"learning_rate": 0.0002,
"loss": 0.5532786250114441,
"mean_token_accuracy": 0.7754277139902115,
"num_tokens": 10778652.0,
"step": 661
},
{
"entropy": 0.5187595188617706,
"epoch": 2.4794007490636703,
"grad_norm": 0.03975149244070053,
"learning_rate": 0.0002,
"loss": 0.5151571035385132,
"mean_token_accuracy": 0.7930901050567627,
"num_tokens": 10794746.0,
"step": 662
},
{
"entropy": 0.5426436811685562,
"epoch": 2.4831460674157304,
"grad_norm": 0.03997328504920006,
"learning_rate": 0.0002,
"loss": 0.5403225421905518,
"mean_token_accuracy": 0.7798904031515121,
"num_tokens": 10811033.0,
"step": 663
},
{
"entropy": 0.5267360359430313,
"epoch": 2.48689138576779,
"grad_norm": 0.043838318437337875,
"learning_rate": 0.0002,
"loss": 0.526395320892334,
"mean_token_accuracy": 0.7879899889230728,
"num_tokens": 10827129.0,
"step": 664
},
{
"entropy": 0.5509849190711975,
"epoch": 2.4906367041198503,
"grad_norm": 0.037469275295734406,
"learning_rate": 0.0002,
"loss": 0.5411713719367981,
"mean_token_accuracy": 0.7808174937963486,
"num_tokens": 10843435.0,
"step": 665
},
{
"entropy": 0.5449976474046707,
"epoch": 2.49438202247191,
"grad_norm": 0.05326893553137779,
"learning_rate": 0.0002,
"loss": 0.5467808842658997,
"mean_token_accuracy": 0.7777620851993561,
"num_tokens": 10859523.0,
"step": 666
},
{
"entropy": 0.5301449000835419,
"epoch": 2.49812734082397,
"grad_norm": 0.04426975175738335,
"learning_rate": 0.0002,
"loss": 0.5359491109848022,
"mean_token_accuracy": 0.7841154336929321,
"num_tokens": 10875805.0,
"step": 667
},
{
"entropy": 0.5325603634119034,
"epoch": 2.50187265917603,
"grad_norm": 0.04210103675723076,
"learning_rate": 0.0002,
"loss": 0.5365734100341797,
"mean_token_accuracy": 0.782084509730339,
"num_tokens": 10892315.0,
"step": 668
},
{
"entropy": 0.5456321388483047,
"epoch": 2.50561797752809,
"grad_norm": 0.03740176558494568,
"learning_rate": 0.0002,
"loss": 0.5444263219833374,
"mean_token_accuracy": 0.7780910581350327,
"num_tokens": 10908850.0,
"step": 669
},
{
"entropy": 0.5338556170463562,
"epoch": 2.5093632958801497,
"grad_norm": 0.04143742844462395,
"learning_rate": 0.0002,
"loss": 0.5300049185752869,
"mean_token_accuracy": 0.787174180150032,
"num_tokens": 10925106.0,
"step": 670
},
{
"entropy": 0.5515117049217224,
"epoch": 2.51310861423221,
"grad_norm": 0.03918025270104408,
"learning_rate": 0.0002,
"loss": 0.542182445526123,
"mean_token_accuracy": 0.7806340008974075,
"num_tokens": 10941543.0,
"step": 671
},
{
"entropy": 0.5549922436475754,
"epoch": 2.5168539325842696,
"grad_norm": 0.04009648784995079,
"learning_rate": 0.0002,
"loss": 0.5559307932853699,
"mean_token_accuracy": 0.7725488841533661,
"num_tokens": 10957817.0,
"step": 672
},
{
"entropy": 0.539954200387001,
"epoch": 2.5205992509363297,
"grad_norm": 0.04543929174542427,
"learning_rate": 0.0002,
"loss": 0.5482618808746338,
"mean_token_accuracy": 0.7789554446935654,
"num_tokens": 10974119.0,
"step": 673
},
{
"entropy": 0.5211862847208977,
"epoch": 2.5243445692883895,
"grad_norm": 0.0385296531021595,
"learning_rate": 0.0002,
"loss": 0.5304719805717468,
"mean_token_accuracy": 0.7863713204860687,
"num_tokens": 10990490.0,
"step": 674
},
{
"entropy": 0.5547338724136353,
"epoch": 2.5280898876404496,
"grad_norm": 0.047472305595874786,
"learning_rate": 0.0002,
"loss": 0.5596637725830078,
"mean_token_accuracy": 0.771984726190567,
"num_tokens": 11007150.0,
"step": 675
},
{
"entropy": 0.5423361957073212,
"epoch": 2.5318352059925093,
"grad_norm": 0.03454773128032684,
"learning_rate": 0.0002,
"loss": 0.5381237268447876,
"mean_token_accuracy": 0.7808732390403748,
"num_tokens": 11023385.0,
"step": 676
},
{
"entropy": 0.5561535805463791,
"epoch": 2.535580524344569,
"grad_norm": 0.03847538307309151,
"learning_rate": 0.0002,
"loss": 0.5428014993667603,
"mean_token_accuracy": 0.7786359935998917,
"num_tokens": 11039943.0,
"step": 677
},
{
"entropy": 0.544300451874733,
"epoch": 2.539325842696629,
"grad_norm": 0.04131785407662392,
"learning_rate": 0.0002,
"loss": 0.5334832668304443,
"mean_token_accuracy": 0.7851458042860031,
"num_tokens": 11056430.0,
"step": 678
},
{
"entropy": 0.5311527848243713,
"epoch": 2.5430711610486894,
"grad_norm": 0.03951219096779823,
"learning_rate": 0.0002,
"loss": 0.5389747023582458,
"mean_token_accuracy": 0.7813056856393814,
"num_tokens": 11072776.0,
"step": 679
},
{
"entropy": 0.5290235728025436,
"epoch": 2.546816479400749,
"grad_norm": 0.0438111387193203,
"learning_rate": 0.0002,
"loss": 0.5451354384422302,
"mean_token_accuracy": 0.7777683436870575,
"num_tokens": 11088991.0,
"step": 680
},
{
"entropy": 0.5291692391037941,
"epoch": 2.550561797752809,
"grad_norm": 0.039012420922517776,
"learning_rate": 0.0002,
"loss": 0.5386437773704529,
"mean_token_accuracy": 0.7806796282529831,
"num_tokens": 11105235.0,
"step": 681
},
{
"entropy": 0.5217102319002151,
"epoch": 2.554307116104869,
"grad_norm": 0.04288937896490097,
"learning_rate": 0.0002,
"loss": 0.5323805809020996,
"mean_token_accuracy": 0.7835096120834351,
"num_tokens": 11121333.0,
"step": 682
},
{
"entropy": 0.5252867043018341,
"epoch": 2.558052434456929,
"grad_norm": 0.0371013842523098,
"learning_rate": 0.0002,
"loss": 0.5191121101379395,
"mean_token_accuracy": 0.7874591499567032,
"num_tokens": 11137249.0,
"step": 683
},
{
"entropy": 0.5371126532554626,
"epoch": 2.561797752808989,
"grad_norm": 0.03830140084028244,
"learning_rate": 0.0002,
"loss": 0.5264033675193787,
"mean_token_accuracy": 0.7881854623556137,
"num_tokens": 11153699.0,
"step": 684
},
{
"entropy": 0.5386142879724503,
"epoch": 2.5655430711610485,
"grad_norm": 0.035421278327703476,
"learning_rate": 0.0002,
"loss": 0.5367159247398376,
"mean_token_accuracy": 0.7793221473693848,
"num_tokens": 11170196.0,
"step": 685
},
{
"entropy": 0.5483710169792175,
"epoch": 2.5692883895131087,
"grad_norm": 0.04288771376013756,
"learning_rate": 0.0002,
"loss": 0.5506448149681091,
"mean_token_accuracy": 0.7785434424877167,
"num_tokens": 11186770.0,
"step": 686
},
{
"entropy": 0.5472489446401596,
"epoch": 2.5730337078651684,
"grad_norm": 0.04111029580235481,
"learning_rate": 0.0002,
"loss": 0.5503485798835754,
"mean_token_accuracy": 0.7765214443206787,
"num_tokens": 11203191.0,
"step": 687
},
{
"entropy": 0.523987427353859,
"epoch": 2.5767790262172285,
"grad_norm": 0.04419523477554321,
"learning_rate": 0.0002,
"loss": 0.5254223942756653,
"mean_token_accuracy": 0.7858942598104477,
"num_tokens": 11219530.0,
"step": 688
},
{
"entropy": 0.5482724606990814,
"epoch": 2.5805243445692883,
"grad_norm": 0.0384112112224102,
"learning_rate": 0.0002,
"loss": 0.5467587113380432,
"mean_token_accuracy": 0.7784788310527802,
"num_tokens": 11236013.0,
"step": 689
},
{
"entropy": 0.5410710424184799,
"epoch": 2.5842696629213484,
"grad_norm": 0.04548390954732895,
"learning_rate": 0.0002,
"loss": 0.5361588001251221,
"mean_token_accuracy": 0.7842984944581985,
"num_tokens": 11252349.0,
"step": 690
},
{
"entropy": 0.5413189381361008,
"epoch": 2.588014981273408,
"grad_norm": 0.03719467297196388,
"learning_rate": 0.0002,
"loss": 0.5372804403305054,
"mean_token_accuracy": 0.7805864661931992,
"num_tokens": 11268637.0,
"step": 691
},
{
"entropy": 0.5587044954299927,
"epoch": 2.5917602996254683,
"grad_norm": 0.03943658620119095,
"learning_rate": 0.0002,
"loss": 0.556570291519165,
"mean_token_accuracy": 0.7712628394365311,
"num_tokens": 11284973.0,
"step": 692
},
{
"entropy": 0.5220051556825638,
"epoch": 2.595505617977528,
"grad_norm": 0.04577549174427986,
"learning_rate": 0.0002,
"loss": 0.5235053896903992,
"mean_token_accuracy": 0.7874717712402344,
"num_tokens": 11301234.0,
"step": 693
},
{
"entropy": 0.5253131091594696,
"epoch": 2.599250936329588,
"grad_norm": 0.055322322994470596,
"learning_rate": 0.0002,
"loss": 0.539014458656311,
"mean_token_accuracy": 0.7832715809345245,
"num_tokens": 11317622.0,
"step": 694
},
{
"entropy": 0.529956579208374,
"epoch": 2.602996254681648,
"grad_norm": 0.04555559530854225,
"learning_rate": 0.0002,
"loss": 0.5358556509017944,
"mean_token_accuracy": 0.7829083502292633,
"num_tokens": 11334260.0,
"step": 695
},
{
"entropy": 0.5464101433753967,
"epoch": 2.606741573033708,
"grad_norm": 0.04112941771745682,
"learning_rate": 0.0002,
"loss": 0.5475582480430603,
"mean_token_accuracy": 0.780443549156189,
"num_tokens": 11350510.0,
"step": 696
},
{
"entropy": 0.5290370956063271,
"epoch": 2.6104868913857677,
"grad_norm": 0.03645879402756691,
"learning_rate": 0.0002,
"loss": 0.5310324430465698,
"mean_token_accuracy": 0.7870594263076782,
"num_tokens": 11366960.0,
"step": 697
},
{
"entropy": 0.5584116280078888,
"epoch": 2.6142322097378274,
"grad_norm": 0.03702421113848686,
"learning_rate": 0.0002,
"loss": 0.5555626153945923,
"mean_token_accuracy": 0.7766379117965698,
"num_tokens": 11383705.0,
"step": 698
},
{
"entropy": 0.5311998277902603,
"epoch": 2.6179775280898876,
"grad_norm": 0.039902858436107635,
"learning_rate": 0.0002,
"loss": 0.5329570770263672,
"mean_token_accuracy": 0.7843590825796127,
"num_tokens": 11399770.0,
"step": 699
},
{
"entropy": 0.5450660437345505,
"epoch": 2.6217228464419478,
"grad_norm": 0.040915053337812424,
"learning_rate": 0.0002,
"loss": 0.5421010851860046,
"mean_token_accuracy": 0.7778819799423218,
"num_tokens": 11416143.0,
"step": 700
},
{
"entropy": 0.5301565080881119,
"epoch": 2.6254681647940075,
"grad_norm": 0.04668205976486206,
"learning_rate": 0.0002,
"loss": 0.542178750038147,
"mean_token_accuracy": 0.7808790653944016,
"num_tokens": 11432391.0,
"step": 701
},
{
"entropy": 0.5262583941221237,
"epoch": 2.629213483146067,
"grad_norm": 0.044074323028326035,
"learning_rate": 0.0002,
"loss": 0.528965413570404,
"mean_token_accuracy": 0.7844109088182449,
"num_tokens": 11448787.0,
"step": 702
},
{
"entropy": 0.5375534892082214,
"epoch": 2.6329588014981273,
"grad_norm": 0.046261075884103775,
"learning_rate": 0.0002,
"loss": 0.5426000952720642,
"mean_token_accuracy": 0.7772792726755142,
"num_tokens": 11464834.0,
"step": 703
},
{
"entropy": 0.5281456708908081,
"epoch": 2.6367041198501875,
"grad_norm": 0.04074921831488609,
"learning_rate": 0.0002,
"loss": 0.5224668979644775,
"mean_token_accuracy": 0.7867994755506516,
"num_tokens": 11481010.0,
"step": 704
},
{
"entropy": 0.5607274174690247,
"epoch": 2.640449438202247,
"grad_norm": 0.04910429194569588,
"learning_rate": 0.0002,
"loss": 0.5609941482543945,
"mean_token_accuracy": 0.7746099084615707,
"num_tokens": 11497290.0,
"step": 705
},
{
"entropy": 0.5405243337154388,
"epoch": 2.644194756554307,
"grad_norm": 0.042494796216487885,
"learning_rate": 0.0002,
"loss": 0.5373457670211792,
"mean_token_accuracy": 0.7792738676071167,
"num_tokens": 11513583.0,
"step": 706
},
{
"entropy": 0.5465130656957626,
"epoch": 2.647940074906367,
"grad_norm": 0.051266275346279144,
"learning_rate": 0.0002,
"loss": 0.5519081950187683,
"mean_token_accuracy": 0.7757825553417206,
"num_tokens": 11530012.0,
"step": 707
},
{
"entropy": 0.5431560575962067,
"epoch": 2.6516853932584272,
"grad_norm": 0.03533034771680832,
"learning_rate": 0.0002,
"loss": 0.5461572408676147,
"mean_token_accuracy": 0.7784530967473984,
"num_tokens": 11546456.0,
"step": 708
},
{
"entropy": 0.5154132097959518,
"epoch": 2.655430711610487,
"grad_norm": 0.04611873999238014,
"learning_rate": 0.0002,
"loss": 0.5180613398551941,
"mean_token_accuracy": 0.7888959646224976,
"num_tokens": 11562883.0,
"step": 709
},
{
"entropy": 0.5712718665599823,
"epoch": 2.6591760299625467,
"grad_norm": 0.03861664608120918,
"learning_rate": 0.0002,
"loss": 0.5646159052848816,
"mean_token_accuracy": 0.7710563838481903,
"num_tokens": 11579392.0,
"step": 710
},
{
"entropy": 0.5572114437818527,
"epoch": 2.662921348314607,
"grad_norm": 0.04512866213917732,
"learning_rate": 0.0002,
"loss": 0.551059901714325,
"mean_token_accuracy": 0.7758464813232422,
"num_tokens": 11595937.0,
"step": 711
},
{
"entropy": 0.5336201041936874,
"epoch": 2.6666666666666665,
"grad_norm": 0.042362719774246216,
"learning_rate": 0.0002,
"loss": 0.5347069501876831,
"mean_token_accuracy": 0.7828791737556458,
"num_tokens": 11612066.0,
"step": 712
},
{
"entropy": 0.5221793055534363,
"epoch": 2.6704119850187267,
"grad_norm": 0.04037570580840111,
"learning_rate": 0.0002,
"loss": 0.523446261882782,
"mean_token_accuracy": 0.7888407558202744,
"num_tokens": 11628437.0,
"step": 713
},
{
"entropy": 0.5422008782625198,
"epoch": 2.6741573033707864,
"grad_norm": 0.04662792757153511,
"learning_rate": 0.0002,
"loss": 0.555385947227478,
"mean_token_accuracy": 0.7747650295495987,
"num_tokens": 11644722.0,
"step": 714
},
{
"entropy": 0.5356374382972717,
"epoch": 2.6779026217228465,
"grad_norm": 0.03770140931010246,
"learning_rate": 0.0002,
"loss": 0.5397407412528992,
"mean_token_accuracy": 0.77961665391922,
"num_tokens": 11661403.0,
"step": 715
},
{
"entropy": 0.5477268397808075,
"epoch": 2.6816479400749063,
"grad_norm": 0.04137538745999336,
"learning_rate": 0.0002,
"loss": 0.5421797633171082,
"mean_token_accuracy": 0.7774805575609207,
"num_tokens": 11677740.0,
"step": 716
},
{
"entropy": 0.5390584021806717,
"epoch": 2.6853932584269664,
"grad_norm": 0.04397116228938103,
"learning_rate": 0.0002,
"loss": 0.5323628187179565,
"mean_token_accuracy": 0.7813891172409058,
"num_tokens": 11693755.0,
"step": 717
},
{
"entropy": 0.5430156886577606,
"epoch": 2.689138576779026,
"grad_norm": 0.03867118060588837,
"learning_rate": 0.0002,
"loss": 0.5338262319564819,
"mean_token_accuracy": 0.7821642309427261,
"num_tokens": 11710311.0,
"step": 718
},
{
"entropy": 0.5369475930929184,
"epoch": 2.6928838951310863,
"grad_norm": 0.03773213177919388,
"learning_rate": 0.0002,
"loss": 0.5436868071556091,
"mean_token_accuracy": 0.7776243984699249,
"num_tokens": 11726751.0,
"step": 719
},
{
"entropy": 0.5204776674509048,
"epoch": 2.696629213483146,
"grad_norm": 0.045796290040016174,
"learning_rate": 0.0002,
"loss": 0.5366164445877075,
"mean_token_accuracy": 0.7829219549894333,
"num_tokens": 11743104.0,
"step": 720
},
{
"entropy": 0.5444348156452179,
"epoch": 2.700374531835206,
"grad_norm": 0.041639544069767,
"learning_rate": 0.0002,
"loss": 0.5522270202636719,
"mean_token_accuracy": 0.7758014649152756,
"num_tokens": 11759143.0,
"step": 721
},
{
"entropy": 0.5301756113767624,
"epoch": 2.704119850187266,
"grad_norm": 0.04008952155709267,
"learning_rate": 0.0002,
"loss": 0.5239149928092957,
"mean_token_accuracy": 0.7852831333875656,
"num_tokens": 11775647.0,
"step": 722
},
{
"entropy": 0.5141435042023659,
"epoch": 2.7078651685393256,
"grad_norm": 0.03991787135601044,
"learning_rate": 0.0002,
"loss": 0.5066305994987488,
"mean_token_accuracy": 0.7961233854293823,
"num_tokens": 11791695.0,
"step": 723
},
{
"entropy": 0.5294996351003647,
"epoch": 2.7116104868913857,
"grad_norm": 0.03514706343412399,
"learning_rate": 0.0002,
"loss": 0.5277984738349915,
"mean_token_accuracy": 0.7842394113540649,
"num_tokens": 11807908.0,
"step": 724
},
{
"entropy": 0.553158238530159,
"epoch": 2.715355805243446,
"grad_norm": 0.0371016301214695,
"learning_rate": 0.0002,
"loss": 0.5542132258415222,
"mean_token_accuracy": 0.7742846459150314,
"num_tokens": 11824455.0,
"step": 725
},
{
"entropy": 0.5377026200294495,
"epoch": 2.7191011235955056,
"grad_norm": 0.04648866876959801,
"learning_rate": 0.0002,
"loss": 0.5486031770706177,
"mean_token_accuracy": 0.7776967585086823,
"num_tokens": 11840615.0,
"step": 726
},
{
"entropy": 0.5500117689371109,
"epoch": 2.7228464419475653,
"grad_norm": 0.03958411142230034,
"learning_rate": 0.0002,
"loss": 0.5574382543563843,
"mean_token_accuracy": 0.7707358449697495,
"num_tokens": 11856804.0,
"step": 727
},
{
"entropy": 0.5287734270095825,
"epoch": 2.7265917602996255,
"grad_norm": 0.039377059787511826,
"learning_rate": 0.0002,
"loss": 0.5284842848777771,
"mean_token_accuracy": 0.7842006385326385,
"num_tokens": 11872824.0,
"step": 728
},
{
"entropy": 0.5455043613910675,
"epoch": 2.7303370786516856,
"grad_norm": 0.038099173456430435,
"learning_rate": 0.0002,
"loss": 0.5363825559616089,
"mean_token_accuracy": 0.7839681655168533,
"num_tokens": 11889236.0,
"step": 729
},
{
"entropy": 0.5231508985161781,
"epoch": 2.7340823970037453,
"grad_norm": 0.04386546462774277,
"learning_rate": 0.0002,
"loss": 0.5231119394302368,
"mean_token_accuracy": 0.7876169681549072,
"num_tokens": 11905504.0,
"step": 730
},
{
"entropy": 0.5425267070531845,
"epoch": 2.737827715355805,
"grad_norm": 0.03880799189209938,
"learning_rate": 0.0002,
"loss": 0.5381489992141724,
"mean_token_accuracy": 0.7835936099290848,
"num_tokens": 11922030.0,
"step": 731
},
{
"entropy": 0.5379330962896347,
"epoch": 2.741573033707865,
"grad_norm": 0.04163983464241028,
"learning_rate": 0.0002,
"loss": 0.5459231734275818,
"mean_token_accuracy": 0.7755035907030106,
"num_tokens": 11938351.0,
"step": 732
},
{
"entropy": 0.5344593375921249,
"epoch": 2.7453183520599254,
"grad_norm": 0.03764946386218071,
"learning_rate": 0.0002,
"loss": 0.5335820913314819,
"mean_token_accuracy": 0.7851902097463608,
"num_tokens": 11954720.0,
"step": 733
},
{
"entropy": 0.5275440439581871,
"epoch": 2.749063670411985,
"grad_norm": 0.041039030998945236,
"learning_rate": 0.0002,
"loss": 0.5316729545593262,
"mean_token_accuracy": 0.784284695982933,
"num_tokens": 11970943.0,
"step": 734
},
{
"entropy": 0.5440046042203903,
"epoch": 2.752808988764045,
"grad_norm": 0.03777683153748512,
"learning_rate": 0.0002,
"loss": 0.5479453802108765,
"mean_token_accuracy": 0.7796096056699753,
"num_tokens": 11987274.0,
"step": 735
},
{
"entropy": 0.5314242094755173,
"epoch": 2.756554307116105,
"grad_norm": 0.04298453778028488,
"learning_rate": 0.0002,
"loss": 0.5360277891159058,
"mean_token_accuracy": 0.7836730033159256,
"num_tokens": 12003645.0,
"step": 736
},
{
"entropy": 0.5434319823980331,
"epoch": 2.7602996254681647,
"grad_norm": 0.038422685116529465,
"learning_rate": 0.0002,
"loss": 0.5429157614707947,
"mean_token_accuracy": 0.7770098298788071,
"num_tokens": 12020104.0,
"step": 737
},
{
"entropy": 0.5382603704929352,
"epoch": 2.764044943820225,
"grad_norm": 0.04176581650972366,
"learning_rate": 0.0002,
"loss": 0.5365764498710632,
"mean_token_accuracy": 0.7839252799749374,
"num_tokens": 12036423.0,
"step": 738
},
{
"entropy": 0.5331043303012848,
"epoch": 2.7677902621722845,
"grad_norm": 0.04350239410996437,
"learning_rate": 0.0002,
"loss": 0.5356451272964478,
"mean_token_accuracy": 0.7829470187425613,
"num_tokens": 12052564.0,
"step": 739
},
{
"entropy": 0.5245354026556015,
"epoch": 2.7715355805243447,
"grad_norm": 0.04295556619763374,
"learning_rate": 0.0002,
"loss": 0.5335471034049988,
"mean_token_accuracy": 0.7844749689102173,
"num_tokens": 12068677.0,
"step": 740
},
{
"entropy": 0.5476740896701813,
"epoch": 2.7752808988764044,
"grad_norm": 0.04540206119418144,
"learning_rate": 0.0002,
"loss": 0.552383542060852,
"mean_token_accuracy": 0.7785235494375229,
"num_tokens": 12085174.0,
"step": 741
},
{
"entropy": 0.5276885330677032,
"epoch": 2.7790262172284645,
"grad_norm": 0.03786449506878853,
"learning_rate": 0.0002,
"loss": 0.5295007228851318,
"mean_token_accuracy": 0.7848162055015564,
"num_tokens": 12101546.0,
"step": 742
},
{
"entropy": 0.5504680871963501,
"epoch": 2.7827715355805243,
"grad_norm": 0.04417780414223671,
"learning_rate": 0.0002,
"loss": 0.5459782481193542,
"mean_token_accuracy": 0.7778183221817017,
"num_tokens": 12117833.0,
"step": 743
},
{
"entropy": 0.5514437556266785,
"epoch": 2.7865168539325844,
"grad_norm": 0.03677407279610634,
"learning_rate": 0.0002,
"loss": 0.5444294810295105,
"mean_token_accuracy": 0.7822880744934082,
"num_tokens": 12134076.0,
"step": 744
},
{
"entropy": 0.544072225689888,
"epoch": 2.790262172284644,
"grad_norm": 0.04843369498848915,
"learning_rate": 0.0002,
"loss": 0.5418300628662109,
"mean_token_accuracy": 0.7809806764125824,
"num_tokens": 12149991.0,
"step": 745
},
{
"entropy": 0.5447394847869873,
"epoch": 2.7940074906367043,
"grad_norm": 0.04489225894212723,
"learning_rate": 0.0002,
"loss": 0.5485548377037048,
"mean_token_accuracy": 0.7752929180860519,
"num_tokens": 12166319.0,
"step": 746
},
{
"entropy": 0.5193701684474945,
"epoch": 2.797752808988764,
"grad_norm": 0.04051094502210617,
"learning_rate": 0.0002,
"loss": 0.5254422426223755,
"mean_token_accuracy": 0.7868325263261795,
"num_tokens": 12182585.0,
"step": 747
},
{
"entropy": 0.533800944685936,
"epoch": 2.8014981273408237,
"grad_norm": 0.03557295724749565,
"learning_rate": 0.0002,
"loss": 0.5316165089607239,
"mean_token_accuracy": 0.7825881540775299,
"num_tokens": 12198769.0,
"step": 748
},
{
"entropy": 0.534054160118103,
"epoch": 2.805243445692884,
"grad_norm": 0.04074644669890404,
"learning_rate": 0.0002,
"loss": 0.5342618823051453,
"mean_token_accuracy": 0.7828291058540344,
"num_tokens": 12215003.0,
"step": 749
},
{
"entropy": 0.5486414730548859,
"epoch": 2.808988764044944,
"grad_norm": 0.04066525399684906,
"learning_rate": 0.0002,
"loss": 0.5566014647483826,
"mean_token_accuracy": 0.7741669267416,
"num_tokens": 12231307.0,
"step": 750
},
{
"entropy": 0.5236565172672272,
"epoch": 2.8127340823970037,
"grad_norm": 0.03859638050198555,
"learning_rate": 0.0002,
"loss": 0.5243086218833923,
"mean_token_accuracy": 0.7863422483205795,
"num_tokens": 12247563.0,
"step": 751
},
{
"entropy": 0.5354926288127899,
"epoch": 2.8164794007490634,
"grad_norm": 0.040070392191410065,
"learning_rate": 0.0002,
"loss": 0.5424857139587402,
"mean_token_accuracy": 0.7793509066104889,
"num_tokens": 12263768.0,
"step": 752
},
{
"entropy": 0.5465504974126816,
"epoch": 2.8202247191011236,
"grad_norm": 0.04251793026924133,
"learning_rate": 0.0002,
"loss": 0.5422512292861938,
"mean_token_accuracy": 0.7784619033336639,
"num_tokens": 12280224.0,
"step": 753
},
{
"entropy": 0.5511007905006409,
"epoch": 2.8239700374531838,
"grad_norm": 0.03704281151294708,
"learning_rate": 0.0002,
"loss": 0.5432584285736084,
"mean_token_accuracy": 0.7793723195791245,
"num_tokens": 12296720.0,
"step": 754
},
{
"entropy": 0.5557062178850174,
"epoch": 2.8277153558052435,
"grad_norm": 0.04253645986318588,
"learning_rate": 0.0002,
"loss": 0.5526583194732666,
"mean_token_accuracy": 0.7777480781078339,
"num_tokens": 12313013.0,
"step": 755
},
{
"entropy": 0.5158669054508209,
"epoch": 2.831460674157303,
"grad_norm": 0.036200929433107376,
"learning_rate": 0.0002,
"loss": 0.5140800476074219,
"mean_token_accuracy": 0.7922120690345764,
"num_tokens": 12328987.0,
"step": 756
},
{
"entropy": 0.5495094060897827,
"epoch": 2.8352059925093633,
"grad_norm": 0.04025623947381973,
"learning_rate": 0.0002,
"loss": 0.5524377226829529,
"mean_token_accuracy": 0.7765700370073318,
"num_tokens": 12345487.0,
"step": 757
},
{
"entropy": 0.5472595542669296,
"epoch": 2.8389513108614235,
"grad_norm": 0.037925150245428085,
"learning_rate": 0.0002,
"loss": 0.5513643622398376,
"mean_token_accuracy": 0.7754906117916107,
"num_tokens": 12362003.0,
"step": 758
},
{
"entropy": 0.5349185019731522,
"epoch": 2.842696629213483,
"grad_norm": 0.04107813537120819,
"learning_rate": 0.0002,
"loss": 0.5352935791015625,
"mean_token_accuracy": 0.785232812166214,
"num_tokens": 12378308.0,
"step": 759
},
{
"entropy": 0.5332917869091034,
"epoch": 2.846441947565543,
"grad_norm": 0.0485457181930542,
"learning_rate": 0.0002,
"loss": 0.5407130122184753,
"mean_token_accuracy": 0.7778820097446442,
"num_tokens": 12394745.0,
"step": 760
},
{
"entropy": 0.5373108834028244,
"epoch": 2.850187265917603,
"grad_norm": 0.045551612973213196,
"learning_rate": 0.0002,
"loss": 0.5431134104728699,
"mean_token_accuracy": 0.7788770198822021,
"num_tokens": 12410653.0,
"step": 761
},
{
"entropy": 0.5553153157234192,
"epoch": 2.853932584269663,
"grad_norm": 0.042994849383831024,
"learning_rate": 0.0002,
"loss": 0.5521018505096436,
"mean_token_accuracy": 0.7741047441959381,
"num_tokens": 12426820.0,
"step": 762
},
{
"entropy": 0.5405306816101074,
"epoch": 2.857677902621723,
"grad_norm": 0.03894044831395149,
"learning_rate": 0.0002,
"loss": 0.5416905283927917,
"mean_token_accuracy": 0.7816338688135147,
"num_tokens": 12443026.0,
"step": 763
},
{
"entropy": 0.5384278744459152,
"epoch": 2.8614232209737827,
"grad_norm": 0.04121169447898865,
"learning_rate": 0.0002,
"loss": 0.5407273769378662,
"mean_token_accuracy": 0.7787628769874573,
"num_tokens": 12459216.0,
"step": 764
},
{
"entropy": 0.5316817611455917,
"epoch": 2.865168539325843,
"grad_norm": 0.05211913585662842,
"learning_rate": 0.0002,
"loss": 0.5382348895072937,
"mean_token_accuracy": 0.7807497531175613,
"num_tokens": 12475540.0,
"step": 765
},
{
"entropy": 0.5411743521690369,
"epoch": 2.8689138576779025,
"grad_norm": 0.05021794140338898,
"learning_rate": 0.0002,
"loss": 0.5549106001853943,
"mean_token_accuracy": 0.7732493728399277,
"num_tokens": 12491791.0,
"step": 766
},
{
"entropy": 0.5427963435649872,
"epoch": 2.8726591760299627,
"grad_norm": 0.048997581005096436,
"learning_rate": 0.0002,
"loss": 0.5405234694480896,
"mean_token_accuracy": 0.7799372375011444,
"num_tokens": 12508102.0,
"step": 767
},
{
"entropy": 0.5702031701803207,
"epoch": 2.8764044943820224,
"grad_norm": 0.035217706114053726,
"learning_rate": 0.0002,
"loss": 0.5628358721733093,
"mean_token_accuracy": 0.7744450867176056,
"num_tokens": 12524674.0,
"step": 768
},
{
"entropy": 0.5263065099716187,
"epoch": 2.8801498127340825,
"grad_norm": 0.04417087137699127,
"learning_rate": 0.0002,
"loss": 0.5192127227783203,
"mean_token_accuracy": 0.7900556176900864,
"num_tokens": 12540700.0,
"step": 769
},
{
"entropy": 0.5679396241903305,
"epoch": 2.8838951310861423,
"grad_norm": 0.038472775369882584,
"learning_rate": 0.0002,
"loss": 0.5629768967628479,
"mean_token_accuracy": 0.7697183936834335,
"num_tokens": 12557124.0,
"step": 770
},
{
"entropy": 0.541569247841835,
"epoch": 2.8876404494382024,
"grad_norm": 0.04340888932347298,
"learning_rate": 0.0002,
"loss": 0.5380176901817322,
"mean_token_accuracy": 0.7819050699472427,
"num_tokens": 12573582.0,
"step": 771
},
{
"entropy": 0.5244268327951431,
"epoch": 2.891385767790262,
"grad_norm": 0.043049633502960205,
"learning_rate": 0.0002,
"loss": 0.5338467955589294,
"mean_token_accuracy": 0.7832711786031723,
"num_tokens": 12589568.0,
"step": 772
},
{
"entropy": 0.5213008224964142,
"epoch": 2.895131086142322,
"grad_norm": 0.05456610396504402,
"learning_rate": 0.0002,
"loss": 0.5332724452018738,
"mean_token_accuracy": 0.7851873487234116,
"num_tokens": 12605650.0,
"step": 773
},
{
"entropy": 0.5455889403820038,
"epoch": 2.898876404494382,
"grad_norm": 0.04193198308348656,
"learning_rate": 0.0002,
"loss": 0.5584859251976013,
"mean_token_accuracy": 0.7724700570106506,
"num_tokens": 12621922.0,
"step": 774
},
{
"entropy": 0.5487163811922073,
"epoch": 2.902621722846442,
"grad_norm": 0.03447289392352104,
"learning_rate": 0.0002,
"loss": 0.5422307252883911,
"mean_token_accuracy": 0.779036745429039,
"num_tokens": 12638171.0,
"step": 775
},
{
"entropy": 0.5613754689693451,
"epoch": 2.906367041198502,
"grad_norm": 0.03812362253665924,
"learning_rate": 0.0002,
"loss": 0.5491812229156494,
"mean_token_accuracy": 0.7774574309587479,
"num_tokens": 12654497.0,
"step": 776
},
{
"entropy": 0.5419997125864029,
"epoch": 2.9101123595505616,
"grad_norm": 0.03889596462249756,
"learning_rate": 0.0002,
"loss": 0.5366528630256653,
"mean_token_accuracy": 0.7796314209699631,
"num_tokens": 12671014.0,
"step": 777
},
{
"entropy": 0.5404350906610489,
"epoch": 2.9138576779026217,
"grad_norm": 0.03634997084736824,
"learning_rate": 0.0002,
"loss": 0.5370875000953674,
"mean_token_accuracy": 0.7817376554012299,
"num_tokens": 12687252.0,
"step": 778
},
{
"entropy": 0.5554278641939163,
"epoch": 2.917602996254682,
"grad_norm": 0.04131067916750908,
"learning_rate": 0.0002,
"loss": 0.5544486045837402,
"mean_token_accuracy": 0.774728998541832,
"num_tokens": 12703762.0,
"step": 779
},
{
"entropy": 0.5132855176925659,
"epoch": 2.9213483146067416,
"grad_norm": 0.041993558406829834,
"learning_rate": 0.0002,
"loss": 0.5225546360015869,
"mean_token_accuracy": 0.7885993123054504,
"num_tokens": 12720070.0,
"step": 780
},
{
"entropy": 0.5195116326212883,
"epoch": 2.9250936329588013,
"grad_norm": 0.045502807945013046,
"learning_rate": 0.0002,
"loss": 0.5276657938957214,
"mean_token_accuracy": 0.7835886776447296,
"num_tokens": 12736079.0,
"step": 781
},
{
"entropy": 0.5291299819946289,
"epoch": 2.9288389513108615,
"grad_norm": 0.04560597985982895,
"learning_rate": 0.0002,
"loss": 0.5367044806480408,
"mean_token_accuracy": 0.7813848108053207,
"num_tokens": 12752163.0,
"step": 782
},
{
"entropy": 0.5446918457746506,
"epoch": 2.932584269662921,
"grad_norm": 0.04057231545448303,
"learning_rate": 0.0002,
"loss": 0.5368906259536743,
"mean_token_accuracy": 0.7825321704149246,
"num_tokens": 12768377.0,
"step": 783
},
{
"entropy": 0.5624755024909973,
"epoch": 2.9363295880149813,
"grad_norm": 0.04997701197862625,
"learning_rate": 0.0002,
"loss": 0.5559151768684387,
"mean_token_accuracy": 0.7733145207166672,
"num_tokens": 12784692.0,
"step": 784
},
{
"entropy": 0.5384950041770935,
"epoch": 2.940074906367041,
"grad_norm": 0.04062885046005249,
"learning_rate": 0.0002,
"loss": 0.536974310874939,
"mean_token_accuracy": 0.7846025824546814,
"num_tokens": 12800887.0,
"step": 785
},
{
"entropy": 0.5255657434463501,
"epoch": 2.943820224719101,
"grad_norm": 0.044986989349126816,
"learning_rate": 0.0002,
"loss": 0.5352227091789246,
"mean_token_accuracy": 0.7826129198074341,
"num_tokens": 12817261.0,
"step": 786
},
{
"entropy": 0.532112181186676,
"epoch": 2.947565543071161,
"grad_norm": 0.04506840929389,
"learning_rate": 0.0002,
"loss": 0.5401644110679626,
"mean_token_accuracy": 0.7819447070360184,
"num_tokens": 12833628.0,
"step": 787
},
{
"entropy": 0.5532176345586777,
"epoch": 2.951310861423221,
"grad_norm": 0.047445181757211685,
"learning_rate": 0.0002,
"loss": 0.5567490458488464,
"mean_token_accuracy": 0.7756209075450897,
"num_tokens": 12850048.0,
"step": 788
},
{
"entropy": 0.5571421086788177,
"epoch": 2.955056179775281,
"grad_norm": 0.03836369141936302,
"learning_rate": 0.0002,
"loss": 0.5471166968345642,
"mean_token_accuracy": 0.7780868262052536,
"num_tokens": 12866382.0,
"step": 789
},
{
"entropy": 0.5684118866920471,
"epoch": 2.958801498127341,
"grad_norm": 0.03691793233156204,
"learning_rate": 0.0002,
"loss": 0.5584673285484314,
"mean_token_accuracy": 0.7734033614397049,
"num_tokens": 12882861.0,
"step": 790
},
{
"entropy": 0.5417571067810059,
"epoch": 2.9625468164794007,
"grad_norm": 0.03854163736104965,
"learning_rate": 0.0002,
"loss": 0.5380803346633911,
"mean_token_accuracy": 0.7819686830043793,
"num_tokens": 12898999.0,
"step": 791
},
{
"entropy": 0.5183953493833542,
"epoch": 2.966292134831461,
"grad_norm": 0.04670790210366249,
"learning_rate": 0.0002,
"loss": 0.527891993522644,
"mean_token_accuracy": 0.7858579158782959,
"num_tokens": 12915160.0,
"step": 792
},
{
"entropy": 0.5315932035446167,
"epoch": 2.9700374531835205,
"grad_norm": 0.05011628568172455,
"learning_rate": 0.0002,
"loss": 0.5408577919006348,
"mean_token_accuracy": 0.7781645357608795,
"num_tokens": 12931387.0,
"step": 793
},
{
"entropy": 0.533274233341217,
"epoch": 2.9737827715355807,
"grad_norm": 0.038501009345054626,
"learning_rate": 0.0002,
"loss": 0.5422831773757935,
"mean_token_accuracy": 0.7777345776557922,
"num_tokens": 12947630.0,
"step": 794
},
{
"entropy": 0.5588134974241257,
"epoch": 2.9775280898876404,
"grad_norm": 0.04206021502614021,
"learning_rate": 0.0002,
"loss": 0.5564273595809937,
"mean_token_accuracy": 0.7733636498451233,
"num_tokens": 12964026.0,
"step": 795
},
{
"entropy": 0.5579260289669037,
"epoch": 2.9812734082397006,
"grad_norm": 0.04490978643298149,
"learning_rate": 0.0002,
"loss": 0.5504725575447083,
"mean_token_accuracy": 0.7786446362733841,
"num_tokens": 12980554.0,
"step": 796
},
{
"entropy": 0.541483461856842,
"epoch": 2.9850187265917603,
"grad_norm": 0.03570273146033287,
"learning_rate": 0.0002,
"loss": 0.5293324589729309,
"mean_token_accuracy": 0.783537819981575,
"num_tokens": 12996979.0,
"step": 797
},
{
"entropy": 0.5362358242273331,
"epoch": 2.98876404494382,
"grad_norm": 0.04825478047132492,
"learning_rate": 0.0002,
"loss": 0.5365868210792542,
"mean_token_accuracy": 0.7838873118162155,
"num_tokens": 13013323.0,
"step": 798
},
{
"entropy": 0.5404023975133896,
"epoch": 2.99250936329588,
"grad_norm": 0.04962825030088425,
"learning_rate": 0.0002,
"loss": 0.5480868816375732,
"mean_token_accuracy": 0.7763252705335617,
"num_tokens": 13029636.0,
"step": 799
},
{
"entropy": 0.5300639569759369,
"epoch": 2.9962546816479403,
"grad_norm": 0.042783528566360474,
"learning_rate": 0.0002,
"loss": 0.5343177318572998,
"mean_token_accuracy": 0.7828411161899567,
"num_tokens": 13046055.0,
"step": 800
},
{
"entropy": 0.5252282693982124,
"epoch": 3.0,
"grad_norm": 0.049276161938905716,
"learning_rate": 0.0002,
"loss": 0.5320798754692078,
"mean_token_accuracy": 0.7844677865505219,
"num_tokens": 13062401.0,
"step": 801
},
{
"entropy": 0.545697808265686,
"epoch": 3.0037453183520597,
"grad_norm": 0.04111013561487198,
"learning_rate": 0.0002,
"loss": 0.5242352485656738,
"mean_token_accuracy": 0.7881960570812225,
"num_tokens": 13078838.0,
"step": 802
},
{
"entropy": 0.5105714052915573,
"epoch": 3.00749063670412,
"grad_norm": 0.050722841173410416,
"learning_rate": 0.0002,
"loss": 0.49721649289131165,
"mean_token_accuracy": 0.7984847724437714,
"num_tokens": 13095019.0,
"step": 803
},
{
"entropy": 0.518198661506176,
"epoch": 3.0112359550561796,
"grad_norm": 0.05298876017332077,
"learning_rate": 0.0002,
"loss": 0.5273076891899109,
"mean_token_accuracy": 0.7871041893959045,
"num_tokens": 13111294.0,
"step": 804
},
{
"entropy": 0.48655156791210175,
"epoch": 3.0149812734082397,
"grad_norm": 0.05474111810326576,
"learning_rate": 0.0002,
"loss": 0.5008523464202881,
"mean_token_accuracy": 0.79793781042099,
"num_tokens": 13127173.0,
"step": 805
},
{
"entropy": 0.4898255914449692,
"epoch": 3.0187265917602994,
"grad_norm": 0.05198859050869942,
"learning_rate": 0.0002,
"loss": 0.502049446105957,
"mean_token_accuracy": 0.7997064739465714,
"num_tokens": 13143319.0,
"step": 806
},
{
"entropy": 0.5108759626746178,
"epoch": 3.0224719101123596,
"grad_norm": 0.050299011170864105,
"learning_rate": 0.0002,
"loss": 0.5128780603408813,
"mean_token_accuracy": 0.7923674434423447,
"num_tokens": 13159544.0,
"step": 807
},
{
"entropy": 0.5222347229719162,
"epoch": 3.0262172284644193,
"grad_norm": 0.047297973185777664,
"learning_rate": 0.0002,
"loss": 0.5127148628234863,
"mean_token_accuracy": 0.7936184853315353,
"num_tokens": 13175745.0,
"step": 808
},
{
"entropy": 0.5319055169820786,
"epoch": 3.0299625468164795,
"grad_norm": 0.043087251484394073,
"learning_rate": 0.0002,
"loss": 0.5200571417808533,
"mean_token_accuracy": 0.789368748664856,
"num_tokens": 13192098.0,
"step": 809
},
{
"entropy": 0.5223256945610046,
"epoch": 3.033707865168539,
"grad_norm": 0.045950714498758316,
"learning_rate": 0.0002,
"loss": 0.5118798613548279,
"mean_token_accuracy": 0.7952196598052979,
"num_tokens": 13208503.0,
"step": 810
},
{
"entropy": 0.5253837034106255,
"epoch": 3.0374531835205993,
"grad_norm": 0.051792871206998825,
"learning_rate": 0.0002,
"loss": 0.5294127464294434,
"mean_token_accuracy": 0.7874963134527206,
"num_tokens": 13224945.0,
"step": 811
},
{
"entropy": 0.5031881630420685,
"epoch": 3.041198501872659,
"grad_norm": 0.05261905863881111,
"learning_rate": 0.0002,
"loss": 0.5030893087387085,
"mean_token_accuracy": 0.796674519777298,
"num_tokens": 13241369.0,
"step": 812
},
{
"entropy": 0.5100391805171967,
"epoch": 3.044943820224719,
"grad_norm": 0.05024467036128044,
"learning_rate": 0.0002,
"loss": 0.5141370296478271,
"mean_token_accuracy": 0.7916264235973358,
"num_tokens": 13257754.0,
"step": 813
},
{
"entropy": 0.5079550594091415,
"epoch": 3.048689138576779,
"grad_norm": 0.05758948624134064,
"learning_rate": 0.0002,
"loss": 0.512941300868988,
"mean_token_accuracy": 0.7929425090551376,
"num_tokens": 13273994.0,
"step": 814
},
{
"entropy": 0.513673685491085,
"epoch": 3.052434456928839,
"grad_norm": 0.04496518149971962,
"learning_rate": 0.0002,
"loss": 0.5110280513763428,
"mean_token_accuracy": 0.7918824106454849,
"num_tokens": 13290072.0,
"step": 815
},
{
"entropy": 0.5141152441501617,
"epoch": 3.056179775280899,
"grad_norm": 0.0500110387802124,
"learning_rate": 0.0002,
"loss": 0.5101944804191589,
"mean_token_accuracy": 0.7915782928466797,
"num_tokens": 13306210.0,
"step": 816
},
{
"entropy": 0.5212079957127571,
"epoch": 3.059925093632959,
"grad_norm": 0.048487596213817596,
"learning_rate": 0.0002,
"loss": 0.5181204080581665,
"mean_token_accuracy": 0.791895255446434,
"num_tokens": 13322810.0,
"step": 817
},
{
"entropy": 0.5105150416493416,
"epoch": 3.0636704119850187,
"grad_norm": 0.04949360713362694,
"learning_rate": 0.0002,
"loss": 0.5145678520202637,
"mean_token_accuracy": 0.7915669232606888,
"num_tokens": 13339105.0,
"step": 818
},
{
"entropy": 0.5000638663768768,
"epoch": 3.067415730337079,
"grad_norm": 0.05010031536221504,
"learning_rate": 0.0002,
"loss": 0.5040720701217651,
"mean_token_accuracy": 0.7957489788532257,
"num_tokens": 13355562.0,
"step": 819
},
{
"entropy": 0.4990030825138092,
"epoch": 3.0711610486891385,
"grad_norm": 0.04833959415555,
"learning_rate": 0.0002,
"loss": 0.5016943216323853,
"mean_token_accuracy": 0.795589417219162,
"num_tokens": 13371584.0,
"step": 820
},
{
"entropy": 0.49931125342845917,
"epoch": 3.0749063670411987,
"grad_norm": 0.0536712147295475,
"learning_rate": 0.0002,
"loss": 0.5040884017944336,
"mean_token_accuracy": 0.7980391532182693,
"num_tokens": 13387562.0,
"step": 821
},
{
"entropy": 0.522365540266037,
"epoch": 3.0786516853932584,
"grad_norm": 0.05137619003653526,
"learning_rate": 0.0002,
"loss": 0.5167077779769897,
"mean_token_accuracy": 0.7917557954788208,
"num_tokens": 13403730.0,
"step": 822
},
{
"entropy": 0.5068316459655762,
"epoch": 3.0823970037453186,
"grad_norm": 0.05163760110735893,
"learning_rate": 0.0002,
"loss": 0.5044561624526978,
"mean_token_accuracy": 0.7993681281805038,
"num_tokens": 13419918.0,
"step": 823
},
{
"entropy": 0.49808672070503235,
"epoch": 3.0861423220973783,
"grad_norm": 0.06049012020230293,
"learning_rate": 0.0002,
"loss": 0.5022746920585632,
"mean_token_accuracy": 0.7967248558998108,
"num_tokens": 13435959.0,
"step": 824
},
{
"entropy": 0.514209657907486,
"epoch": 3.0898876404494384,
"grad_norm": 0.04543498158454895,
"learning_rate": 0.0002,
"loss": 0.5144035220146179,
"mean_token_accuracy": 0.789142832159996,
"num_tokens": 13452229.0,
"step": 825
},
{
"entropy": 0.5195358544588089,
"epoch": 3.093632958801498,
"grad_norm": 0.057822633534669876,
"learning_rate": 0.0002,
"loss": 0.5155280828475952,
"mean_token_accuracy": 0.7921741157770157,
"num_tokens": 13468667.0,
"step": 826
},
{
"entropy": 0.507283978164196,
"epoch": 3.097378277153558,
"grad_norm": 0.05148691684007645,
"learning_rate": 0.0002,
"loss": 0.504961371421814,
"mean_token_accuracy": 0.7980248332023621,
"num_tokens": 13484964.0,
"step": 827
},
{
"entropy": 0.5191457867622375,
"epoch": 3.101123595505618,
"grad_norm": 0.045027829706668854,
"learning_rate": 0.0002,
"loss": 0.5200563669204712,
"mean_token_accuracy": 0.7913502901792526,
"num_tokens": 13501449.0,
"step": 828
},
{
"entropy": 0.5351596623659134,
"epoch": 3.1048689138576777,
"grad_norm": 0.05001077800989151,
"learning_rate": 0.0002,
"loss": 0.5278201699256897,
"mean_token_accuracy": 0.7879630476236343,
"num_tokens": 13517966.0,
"step": 829
},
{
"entropy": 0.5123812630772591,
"epoch": 3.108614232209738,
"grad_norm": 0.0483224131166935,
"learning_rate": 0.0002,
"loss": 0.5094588398933411,
"mean_token_accuracy": 0.794407531619072,
"num_tokens": 13534307.0,
"step": 830
},
{
"entropy": 0.5005150064826012,
"epoch": 3.1123595505617976,
"grad_norm": 0.06896387785673141,
"learning_rate": 0.0002,
"loss": 0.5081024169921875,
"mean_token_accuracy": 0.7954099476337433,
"num_tokens": 13550484.0,
"step": 831
},
{
"entropy": 0.5042895451188087,
"epoch": 3.1161048689138577,
"grad_norm": 0.058579690754413605,
"learning_rate": 0.0002,
"loss": 0.508193850517273,
"mean_token_accuracy": 0.793841764330864,
"num_tokens": 13566708.0,
"step": 832
},
{
"entropy": 0.49759114533662796,
"epoch": 3.1198501872659175,
"grad_norm": 0.07416244596242905,
"learning_rate": 0.0002,
"loss": 0.5042813420295715,
"mean_token_accuracy": 0.7976614087820053,
"num_tokens": 13582827.0,
"step": 833
},
{
"entropy": 0.5223132967948914,
"epoch": 3.1235955056179776,
"grad_norm": 0.06452949345111847,
"learning_rate": 0.0002,
"loss": 0.5273835062980652,
"mean_token_accuracy": 0.7855038046836853,
"num_tokens": 13599052.0,
"step": 834
},
{
"entropy": 0.5274243950843811,
"epoch": 3.1273408239700373,
"grad_norm": 0.05534323304891586,
"learning_rate": 0.0002,
"loss": 0.527578592300415,
"mean_token_accuracy": 0.7877459824085236,
"num_tokens": 13615363.0,
"step": 835
},
{
"entropy": 0.5254645645618439,
"epoch": 3.1310861423220975,
"grad_norm": 0.05036141723394394,
"learning_rate": 0.0002,
"loss": 0.5162075161933899,
"mean_token_accuracy": 0.7924645841121674,
"num_tokens": 13631656.0,
"step": 836
},
{
"entropy": 0.519648090004921,
"epoch": 3.134831460674157,
"grad_norm": 0.05153921991586685,
"learning_rate": 0.0002,
"loss": 0.5139608383178711,
"mean_token_accuracy": 0.7937669306993484,
"num_tokens": 13648061.0,
"step": 837
},
{
"entropy": 0.5104959607124329,
"epoch": 3.1385767790262173,
"grad_norm": 0.0628538653254509,
"learning_rate": 0.0002,
"loss": 0.5201999545097351,
"mean_token_accuracy": 0.7901795506477356,
"num_tokens": 13664398.0,
"step": 838
},
{
"entropy": 0.5013151913881302,
"epoch": 3.142322097378277,
"grad_norm": 0.05778926610946655,
"learning_rate": 0.0002,
"loss": 0.5063536763191223,
"mean_token_accuracy": 0.7938642650842667,
"num_tokens": 13680563.0,
"step": 839
},
{
"entropy": 0.5136759728193283,
"epoch": 3.146067415730337,
"grad_norm": 0.0481521412730217,
"learning_rate": 0.0002,
"loss": 0.5169215202331543,
"mean_token_accuracy": 0.7936979234218597,
"num_tokens": 13696943.0,
"step": 840
},
{
"entropy": 0.5035114511847496,
"epoch": 3.149812734082397,
"grad_norm": 0.052551548928022385,
"learning_rate": 0.0002,
"loss": 0.5094401240348816,
"mean_token_accuracy": 0.7950234562158585,
"num_tokens": 13713121.0,
"step": 841
},
{
"entropy": 0.5143017992377281,
"epoch": 3.153558052434457,
"grad_norm": 0.051041699945926666,
"learning_rate": 0.0002,
"loss": 0.5074518322944641,
"mean_token_accuracy": 0.7948710173368454,
"num_tokens": 13729464.0,
"step": 842
},
{
"entropy": 0.5306706875562668,
"epoch": 3.157303370786517,
"grad_norm": 0.0463450625538826,
"learning_rate": 0.0002,
"loss": 0.5219502449035645,
"mean_token_accuracy": 0.7893195748329163,
"num_tokens": 13746493.0,
"step": 843
},
{
"entropy": 0.5117569044232368,
"epoch": 3.161048689138577,
"grad_norm": 0.06164409592747688,
"learning_rate": 0.0002,
"loss": 0.5158479809761047,
"mean_token_accuracy": 0.7911277264356613,
"num_tokens": 13762823.0,
"step": 844
},
{
"entropy": 0.5204734578728676,
"epoch": 3.1647940074906367,
"grad_norm": 0.054356031119823456,
"learning_rate": 0.0002,
"loss": 0.5212512016296387,
"mean_token_accuracy": 0.7890127152204514,
"num_tokens": 13779000.0,
"step": 845
},
{
"entropy": 0.5199745744466782,
"epoch": 3.168539325842697,
"grad_norm": 0.0607718862593174,
"learning_rate": 0.0002,
"loss": 0.5160431265830994,
"mean_token_accuracy": 0.7902602553367615,
"num_tokens": 13794975.0,
"step": 846
},
{
"entropy": 0.4987589195370674,
"epoch": 3.1722846441947565,
"grad_norm": 0.04878820478916168,
"learning_rate": 0.0002,
"loss": 0.5000798106193542,
"mean_token_accuracy": 0.7972550392150879,
"num_tokens": 13811158.0,
"step": 847
},
{
"entropy": 0.5230295807123184,
"epoch": 3.1760299625468167,
"grad_norm": 0.06623463332653046,
"learning_rate": 0.0002,
"loss": 0.5327509641647339,
"mean_token_accuracy": 0.7841638922691345,
"num_tokens": 13827505.0,
"step": 848
},
{
"entropy": 0.5071290284395218,
"epoch": 3.1797752808988764,
"grad_norm": 0.05458921194076538,
"learning_rate": 0.0002,
"loss": 0.506171464920044,
"mean_token_accuracy": 0.796265110373497,
"num_tokens": 13843820.0,
"step": 849
},
{
"entropy": 0.5068354383111,
"epoch": 3.1835205992509366,
"grad_norm": 0.07471395283937454,
"learning_rate": 0.0002,
"loss": 0.5159043669700623,
"mean_token_accuracy": 0.7950875610113144,
"num_tokens": 13860049.0,
"step": 850
},
{
"entropy": 0.5165606439113617,
"epoch": 3.1872659176029963,
"grad_norm": 0.04287557676434517,
"learning_rate": 0.0002,
"loss": 0.5090954303741455,
"mean_token_accuracy": 0.7943407446146011,
"num_tokens": 13876269.0,
"step": 851
},
{
"entropy": 0.5112441331148148,
"epoch": 3.191011235955056,
"grad_norm": 0.055288348346948624,
"learning_rate": 0.0002,
"loss": 0.5097154974937439,
"mean_token_accuracy": 0.7928614467382431,
"num_tokens": 13892237.0,
"step": 852
},
{
"entropy": 0.5263922363519669,
"epoch": 3.194756554307116,
"grad_norm": 0.05795539170503616,
"learning_rate": 0.0002,
"loss": 0.5299734473228455,
"mean_token_accuracy": 0.7866927832365036,
"num_tokens": 13908834.0,
"step": 853
},
{
"entropy": 0.5262639820575714,
"epoch": 3.198501872659176,
"grad_norm": 0.04974358528852463,
"learning_rate": 0.0002,
"loss": 0.5219104290008545,
"mean_token_accuracy": 0.789173498749733,
"num_tokens": 13925285.0,
"step": 854
},
{
"entropy": 0.5375918298959732,
"epoch": 3.202247191011236,
"grad_norm": 0.05287981405854225,
"learning_rate": 0.0002,
"loss": 0.538820207118988,
"mean_token_accuracy": 0.7783188968896866,
"num_tokens": 13941531.0,
"step": 855
},
{
"entropy": 0.5262509882450104,
"epoch": 3.2059925093632957,
"grad_norm": 0.050868358463048935,
"learning_rate": 0.0002,
"loss": 0.5281128883361816,
"mean_token_accuracy": 0.78641077876091,
"num_tokens": 13957808.0,
"step": 856
},
{
"entropy": 0.5126873999834061,
"epoch": 3.209737827715356,
"grad_norm": 0.053514108061790466,
"learning_rate": 0.0002,
"loss": 0.5147566795349121,
"mean_token_accuracy": 0.7941258400678635,
"num_tokens": 13974052.0,
"step": 857
},
{
"entropy": 0.5275673717260361,
"epoch": 3.2134831460674156,
"grad_norm": 0.05271236225962639,
"learning_rate": 0.0002,
"loss": 0.5292813777923584,
"mean_token_accuracy": 0.7857562899589539,
"num_tokens": 13990343.0,
"step": 858
},
{
"entropy": 0.5242348462343216,
"epoch": 3.2172284644194757,
"grad_norm": 0.07179221510887146,
"learning_rate": 0.0002,
"loss": 0.5286028981208801,
"mean_token_accuracy": 0.7894574105739594,
"num_tokens": 14006625.0,
"step": 859
},
{
"entropy": 0.5096549838781357,
"epoch": 3.2209737827715355,
"grad_norm": 0.049610402435064316,
"learning_rate": 0.0002,
"loss": 0.5049244165420532,
"mean_token_accuracy": 0.7980163246393204,
"num_tokens": 14022899.0,
"step": 860
},
{
"entropy": 0.5015261322259903,
"epoch": 3.2247191011235956,
"grad_norm": 0.05947711691260338,
"learning_rate": 0.0002,
"loss": 0.4989194869995117,
"mean_token_accuracy": 0.7979766577482224,
"num_tokens": 14039443.0,
"step": 861
},
{
"entropy": 0.507699728012085,
"epoch": 3.2284644194756553,
"grad_norm": 0.04882875084877014,
"learning_rate": 0.0002,
"loss": 0.507795512676239,
"mean_token_accuracy": 0.7962815016508102,
"num_tokens": 14055656.0,
"step": 862
},
{
"entropy": 0.5021291598677635,
"epoch": 3.2322097378277155,
"grad_norm": 0.061408963054418564,
"learning_rate": 0.0002,
"loss": 0.5129059553146362,
"mean_token_accuracy": 0.7919183075428009,
"num_tokens": 14071999.0,
"step": 863
},
{
"entropy": 0.520720586180687,
"epoch": 3.235955056179775,
"grad_norm": 0.06845266371965408,
"learning_rate": 0.0002,
"loss": 0.5275195837020874,
"mean_token_accuracy": 0.786097377538681,
"num_tokens": 14088181.0,
"step": 864
},
{
"entropy": 0.5245565697550774,
"epoch": 3.2397003745318353,
"grad_norm": 0.05512849986553192,
"learning_rate": 0.0002,
"loss": 0.5164670944213867,
"mean_token_accuracy": 0.7922011315822601,
"num_tokens": 14104382.0,
"step": 865
},
{
"entropy": 0.523853063583374,
"epoch": 3.243445692883895,
"grad_norm": 0.05168979614973068,
"learning_rate": 0.0002,
"loss": 0.5198615789413452,
"mean_token_accuracy": 0.7894517928361893,
"num_tokens": 14120589.0,
"step": 866
},
{
"entropy": 0.5336069017648697,
"epoch": 3.247191011235955,
"grad_norm": 0.04658959433436394,
"learning_rate": 0.0002,
"loss": 0.5296441912651062,
"mean_token_accuracy": 0.7839891761541367,
"num_tokens": 14137115.0,
"step": 867
},
{
"entropy": 0.5032267719507217,
"epoch": 3.250936329588015,
"grad_norm": 0.06418543308973312,
"learning_rate": 0.0002,
"loss": 0.5041000843048096,
"mean_token_accuracy": 0.7958316802978516,
"num_tokens": 14153324.0,
"step": 868
},
{
"entropy": 0.5415874123573303,
"epoch": 3.254681647940075,
"grad_norm": 0.05481120944023132,
"learning_rate": 0.0002,
"loss": 0.5544674396514893,
"mean_token_accuracy": 0.7752077877521515,
"num_tokens": 14169770.0,
"step": 869
},
{
"entropy": 0.5231891572475433,
"epoch": 3.258426966292135,
"grad_norm": 0.055172860622406006,
"learning_rate": 0.0002,
"loss": 0.527195930480957,
"mean_token_accuracy": 0.7866710424423218,
"num_tokens": 14186252.0,
"step": 870
},
{
"entropy": 0.522189661860466,
"epoch": 3.262172284644195,
"grad_norm": 0.058594439178705215,
"learning_rate": 0.0002,
"loss": 0.5187022686004639,
"mean_token_accuracy": 0.7929898500442505,
"num_tokens": 14202621.0,
"step": 871
},
{
"entropy": 0.5282062888145447,
"epoch": 3.2659176029962547,
"grad_norm": 0.05134856328368187,
"learning_rate": 0.0002,
"loss": 0.5219106674194336,
"mean_token_accuracy": 0.7889548540115356,
"num_tokens": 14218830.0,
"step": 872
},
{
"entropy": 0.5150680243968964,
"epoch": 3.2696629213483144,
"grad_norm": 0.05508032441139221,
"learning_rate": 0.0002,
"loss": 0.5112281441688538,
"mean_token_accuracy": 0.7931530773639679,
"num_tokens": 14234888.0,
"step": 873
},
{
"entropy": 0.5219835788011551,
"epoch": 3.2734082397003745,
"grad_norm": 0.05464804917573929,
"learning_rate": 0.0002,
"loss": 0.524517297744751,
"mean_token_accuracy": 0.7871863842010498,
"num_tokens": 14251240.0,
"step": 874
},
{
"entropy": 0.5211943238973618,
"epoch": 3.2771535580524347,
"grad_norm": 0.06844772398471832,
"learning_rate": 0.0002,
"loss": 0.5394464731216431,
"mean_token_accuracy": 0.7814126461744308,
"num_tokens": 14267612.0,
"step": 875
},
{
"entropy": 0.5181123912334442,
"epoch": 3.2808988764044944,
"grad_norm": 0.04897969216108322,
"learning_rate": 0.0002,
"loss": 0.5221361517906189,
"mean_token_accuracy": 0.7895658910274506,
"num_tokens": 14284024.0,
"step": 876
},
{
"entropy": 0.522240474820137,
"epoch": 3.284644194756554,
"grad_norm": 0.046099789440631866,
"learning_rate": 0.0002,
"loss": 0.515265941619873,
"mean_token_accuracy": 0.7908574789762497,
"num_tokens": 14300400.0,
"step": 877
},
{
"entropy": 0.539507195353508,
"epoch": 3.2883895131086143,
"grad_norm": 0.048160191625356674,
"learning_rate": 0.0002,
"loss": 0.5282410979270935,
"mean_token_accuracy": 0.7885929346084595,
"num_tokens": 14316696.0,
"step": 878
},
{
"entropy": 0.5196528732776642,
"epoch": 3.292134831460674,
"grad_norm": 0.05286882072687149,
"learning_rate": 0.0002,
"loss": 0.5168602466583252,
"mean_token_accuracy": 0.7895731180906296,
"num_tokens": 14333018.0,
"step": 879
},
{
"entropy": 0.5102087259292603,
"epoch": 3.295880149812734,
"grad_norm": 0.059099920094013214,
"learning_rate": 0.0002,
"loss": 0.5207654237747192,
"mean_token_accuracy": 0.7876903861761093,
"num_tokens": 14349309.0,
"step": 880
},
{
"entropy": 0.5270523875951767,
"epoch": 3.299625468164794,
"grad_norm": 0.05176056921482086,
"learning_rate": 0.0002,
"loss": 0.5302364230155945,
"mean_token_accuracy": 0.7864267975091934,
"num_tokens": 14365771.0,
"step": 881
},
{
"entropy": 0.5273350328207016,
"epoch": 3.303370786516854,
"grad_norm": 0.053021032363176346,
"learning_rate": 0.0002,
"loss": 0.51994389295578,
"mean_token_accuracy": 0.7906388491392136,
"num_tokens": 14382276.0,
"step": 882
},
{
"entropy": 0.5050782039761543,
"epoch": 3.3071161048689137,
"grad_norm": 0.05596887692809105,
"learning_rate": 0.0002,
"loss": 0.5052669644355774,
"mean_token_accuracy": 0.7954567670822144,
"num_tokens": 14398533.0,
"step": 883
},
{
"entropy": 0.5178304612636566,
"epoch": 3.310861423220974,
"grad_norm": 0.051180679351091385,
"learning_rate": 0.0002,
"loss": 0.5151298642158508,
"mean_token_accuracy": 0.7920469641685486,
"num_tokens": 14414953.0,
"step": 884
},
{
"entropy": 0.5152227282524109,
"epoch": 3.3146067415730336,
"grad_norm": 0.060053881257772446,
"learning_rate": 0.0002,
"loss": 0.5225366950035095,
"mean_token_accuracy": 0.7887113392353058,
"num_tokens": 14431177.0,
"step": 885
},
{
"entropy": 0.5342336893081665,
"epoch": 3.3183520599250937,
"grad_norm": 0.04932161048054695,
"learning_rate": 0.0002,
"loss": 0.5272732973098755,
"mean_token_accuracy": 0.7877390533685684,
"num_tokens": 14447551.0,
"step": 886
},
{
"entropy": 0.5131062269210815,
"epoch": 3.3220973782771535,
"grad_norm": 0.056324418634176254,
"learning_rate": 0.0002,
"loss": 0.511243999004364,
"mean_token_accuracy": 0.7933667898178101,
"num_tokens": 14463837.0,
"step": 887
},
{
"entropy": 0.5144293755292892,
"epoch": 3.3258426966292136,
"grad_norm": 0.049344755709171295,
"learning_rate": 0.0002,
"loss": 0.5185728073120117,
"mean_token_accuracy": 0.7894094735383987,
"num_tokens": 14480010.0,
"step": 888
},
{
"entropy": 0.5006949752569199,
"epoch": 3.3295880149812733,
"grad_norm": 0.06578890234231949,
"learning_rate": 0.0002,
"loss": 0.5114624500274658,
"mean_token_accuracy": 0.7939462065696716,
"num_tokens": 14496280.0,
"step": 889
},
{
"entropy": 0.5155239552259445,
"epoch": 3.3333333333333335,
"grad_norm": 0.052595749497413635,
"learning_rate": 0.0002,
"loss": 0.5211793780326843,
"mean_token_accuracy": 0.7900384217500687,
"num_tokens": 14512580.0,
"step": 890
},
{
"entropy": 0.4996938407421112,
"epoch": 3.337078651685393,
"grad_norm": 0.05196739733219147,
"learning_rate": 0.0002,
"loss": 0.4989975094795227,
"mean_token_accuracy": 0.7975862473249435,
"num_tokens": 14528932.0,
"step": 891
},
{
"entropy": 0.5200860351324081,
"epoch": 3.3408239700374533,
"grad_norm": 0.05091974139213562,
"learning_rate": 0.0002,
"loss": 0.5156251192092896,
"mean_token_accuracy": 0.7910965532064438,
"num_tokens": 14545418.0,
"step": 892
},
{
"entropy": 0.5055394843220711,
"epoch": 3.344569288389513,
"grad_norm": 0.0533117949962616,
"learning_rate": 0.0002,
"loss": 0.5111801028251648,
"mean_token_accuracy": 0.791337177157402,
"num_tokens": 14561554.0,
"step": 893
},
{
"entropy": 0.5070675015449524,
"epoch": 3.348314606741573,
"grad_norm": 0.04844473674893379,
"learning_rate": 0.0002,
"loss": 0.5077552795410156,
"mean_token_accuracy": 0.7912814170122147,
"num_tokens": 14578052.0,
"step": 894
},
{
"entropy": 0.5202019810676575,
"epoch": 3.352059925093633,
"grad_norm": 0.04764174669981003,
"learning_rate": 0.0002,
"loss": 0.5175067186355591,
"mean_token_accuracy": 0.7899416983127594,
"num_tokens": 14594359.0,
"step": 895
},
{
"entropy": 0.5255243629217148,
"epoch": 3.355805243445693,
"grad_norm": 0.05360300838947296,
"learning_rate": 0.0002,
"loss": 0.5318154692649841,
"mean_token_accuracy": 0.7854946553707123,
"num_tokens": 14610661.0,
"step": 896
},
{
"entropy": 0.5251385867595673,
"epoch": 3.359550561797753,
"grad_norm": 0.05500936135649681,
"learning_rate": 0.0002,
"loss": 0.5363146066665649,
"mean_token_accuracy": 0.7834254056215286,
"num_tokens": 14626712.0,
"step": 897
},
{
"entropy": 0.5119743421673775,
"epoch": 3.3632958801498125,
"grad_norm": 0.04378456994891167,
"learning_rate": 0.0002,
"loss": 0.5079984068870544,
"mean_token_accuracy": 0.7939057648181915,
"num_tokens": 14642932.0,
"step": 898
},
{
"entropy": 0.5284467786550522,
"epoch": 3.3670411985018727,
"grad_norm": 0.046168722212314606,
"learning_rate": 0.0002,
"loss": 0.5247387290000916,
"mean_token_accuracy": 0.787312924861908,
"num_tokens": 14659213.0,
"step": 899
},
{
"entropy": 0.5423993915319443,
"epoch": 3.370786516853933,
"grad_norm": 0.04573873057961464,
"learning_rate": 0.0002,
"loss": 0.5364725589752197,
"mean_token_accuracy": 0.7854876816272736,
"num_tokens": 14675678.0,
"step": 900
},
{
"entropy": 0.5328433066606522,
"epoch": 3.3745318352059925,
"grad_norm": 0.044917598366737366,
"learning_rate": 0.0002,
"loss": 0.5308316946029663,
"mean_token_accuracy": 0.785490483045578,
"num_tokens": 14692287.0,
"step": 901
},
{
"entropy": 0.5370714962482452,
"epoch": 3.3782771535580522,
"grad_norm": 0.05281532183289528,
"learning_rate": 0.0002,
"loss": 0.5403937101364136,
"mean_token_accuracy": 0.7802177965641022,
"num_tokens": 14708736.0,
"step": 902
},
{
"entropy": 0.5240233987569809,
"epoch": 3.3820224719101124,
"grad_norm": 0.04636811465024948,
"learning_rate": 0.0002,
"loss": 0.5222055315971375,
"mean_token_accuracy": 0.7886700630187988,
"num_tokens": 14725122.0,
"step": 903
},
{
"entropy": 0.5218504667282104,
"epoch": 3.385767790262172,
"grad_norm": 0.05728694424033165,
"learning_rate": 0.0002,
"loss": 0.5256317853927612,
"mean_token_accuracy": 0.7890423983335495,
"num_tokens": 14741271.0,
"step": 904
},
{
"entropy": 0.5346123427152634,
"epoch": 3.3895131086142323,
"grad_norm": 0.046447765082120895,
"learning_rate": 0.0002,
"loss": 0.5343607664108276,
"mean_token_accuracy": 0.7844806611537933,
"num_tokens": 14757614.0,
"step": 905
},
{
"entropy": 0.5300848186016083,
"epoch": 3.393258426966292,
"grad_norm": 0.06571624428033829,
"learning_rate": 0.0002,
"loss": 0.5315452814102173,
"mean_token_accuracy": 0.7868516147136688,
"num_tokens": 14774083.0,
"step": 906
},
{
"entropy": 0.5144885182380676,
"epoch": 3.397003745318352,
"grad_norm": 0.05184376239776611,
"learning_rate": 0.0002,
"loss": 0.5137390494346619,
"mean_token_accuracy": 0.7918999344110489,
"num_tokens": 14790219.0,
"step": 907
},
{
"entropy": 0.5159177482128143,
"epoch": 3.400749063670412,
"grad_norm": 0.0637274757027626,
"learning_rate": 0.0002,
"loss": 0.5109057426452637,
"mean_token_accuracy": 0.792988732457161,
"num_tokens": 14806579.0,
"step": 908
},
{
"entropy": 0.5414174944162369,
"epoch": 3.404494382022472,
"grad_norm": 0.049117956310510635,
"learning_rate": 0.0002,
"loss": 0.5352107286453247,
"mean_token_accuracy": 0.7849340736865997,
"num_tokens": 14823142.0,
"step": 909
},
{
"entropy": 0.5176117867231369,
"epoch": 3.4082397003745317,
"grad_norm": 0.06466244161128998,
"learning_rate": 0.0002,
"loss": 0.522276759147644,
"mean_token_accuracy": 0.789726972579956,
"num_tokens": 14839440.0,
"step": 910
},
{
"entropy": 0.5329615920782089,
"epoch": 3.411985018726592,
"grad_norm": 0.05105730891227722,
"learning_rate": 0.0002,
"loss": 0.5381749868392944,
"mean_token_accuracy": 0.7826534360647202,
"num_tokens": 14855956.0,
"step": 911
},
{
"entropy": 0.5107108354568481,
"epoch": 3.4157303370786516,
"grad_norm": 0.05413498729467392,
"learning_rate": 0.0002,
"loss": 0.5151250958442688,
"mean_token_accuracy": 0.7922552824020386,
"num_tokens": 14872232.0,
"step": 912
},
{
"entropy": 0.5194525718688965,
"epoch": 3.4194756554307117,
"grad_norm": 0.049860697239637375,
"learning_rate": 0.0002,
"loss": 0.5245251655578613,
"mean_token_accuracy": 0.7890132665634155,
"num_tokens": 14888739.0,
"step": 913
},
{
"entropy": 0.5260248631238937,
"epoch": 3.4232209737827715,
"grad_norm": 0.0514976903796196,
"learning_rate": 0.0002,
"loss": 0.5202233195304871,
"mean_token_accuracy": 0.7909575551748276,
"num_tokens": 14905100.0,
"step": 914
},
{
"entropy": 0.5172304511070251,
"epoch": 3.4269662921348316,
"grad_norm": 0.046695906668901443,
"learning_rate": 0.0002,
"loss": 0.5149263143539429,
"mean_token_accuracy": 0.7901606112718582,
"num_tokens": 14921448.0,
"step": 915
},
{
"entropy": 0.5069386884570122,
"epoch": 3.4307116104868913,
"grad_norm": 0.05618730187416077,
"learning_rate": 0.0002,
"loss": 0.5093807578086853,
"mean_token_accuracy": 0.7943364530801773,
"num_tokens": 14937735.0,
"step": 916
},
{
"entropy": 0.5155317038297653,
"epoch": 3.4344569288389515,
"grad_norm": 0.04981003701686859,
"learning_rate": 0.0002,
"loss": 0.5243242383003235,
"mean_token_accuracy": 0.7892241328954697,
"num_tokens": 14954139.0,
"step": 917
},
{
"entropy": 0.5165708512067795,
"epoch": 3.438202247191011,
"grad_norm": 0.050371985882520676,
"learning_rate": 0.0002,
"loss": 0.5150896906852722,
"mean_token_accuracy": 0.7927063405513763,
"num_tokens": 14970507.0,
"step": 918
},
{
"entropy": 0.5134851261973381,
"epoch": 3.4419475655430714,
"grad_norm": 0.04879898577928543,
"learning_rate": 0.0002,
"loss": 0.5160987377166748,
"mean_token_accuracy": 0.7906570881605148,
"num_tokens": 14986812.0,
"step": 919
},
{
"entropy": 0.5135181546211243,
"epoch": 3.445692883895131,
"grad_norm": 0.05624324828386307,
"learning_rate": 0.0002,
"loss": 0.5219361186027527,
"mean_token_accuracy": 0.7903093546628952,
"num_tokens": 15003179.0,
"step": 920
},
{
"entropy": 0.5162501037120819,
"epoch": 3.449438202247191,
"grad_norm": 0.04822200909256935,
"learning_rate": 0.0002,
"loss": 0.5126674175262451,
"mean_token_accuracy": 0.7924687564373016,
"num_tokens": 15019428.0,
"step": 921
},
{
"entropy": 0.5315191224217415,
"epoch": 3.453183520599251,
"grad_norm": 0.04490262269973755,
"learning_rate": 0.0002,
"loss": 0.5248660445213318,
"mean_token_accuracy": 0.7871098518371582,
"num_tokens": 15035868.0,
"step": 922
},
{
"entropy": 0.5238284766674042,
"epoch": 3.4569288389513106,
"grad_norm": 0.051175910979509354,
"learning_rate": 0.0002,
"loss": 0.521578311920166,
"mean_token_accuracy": 0.7883873879909515,
"num_tokens": 15052303.0,
"step": 923
},
{
"entropy": 0.5168250873684883,
"epoch": 3.460674157303371,
"grad_norm": 0.046608321368694305,
"learning_rate": 0.0002,
"loss": 0.5207570791244507,
"mean_token_accuracy": 0.7900703996419907,
"num_tokens": 15068618.0,
"step": 924
},
{
"entropy": 0.5313585698604584,
"epoch": 3.464419475655431,
"grad_norm": 0.049307819455862045,
"learning_rate": 0.0002,
"loss": 0.5298991203308105,
"mean_token_accuracy": 0.7864013016223907,
"num_tokens": 15084957.0,
"step": 925
},
{
"entropy": 0.5185838490724564,
"epoch": 3.4681647940074907,
"grad_norm": 0.05639752745628357,
"learning_rate": 0.0002,
"loss": 0.5251802802085876,
"mean_token_accuracy": 0.787624716758728,
"num_tokens": 15101189.0,
"step": 926
},
{
"entropy": 0.515865795314312,
"epoch": 3.4719101123595504,
"grad_norm": 0.05554183945059776,
"learning_rate": 0.0002,
"loss": 0.518955647945404,
"mean_token_accuracy": 0.7888496518135071,
"num_tokens": 15117511.0,
"step": 927
},
{
"entropy": 0.5173558592796326,
"epoch": 3.4756554307116105,
"grad_norm": 0.051211338490247726,
"learning_rate": 0.0002,
"loss": 0.5185026526451111,
"mean_token_accuracy": 0.7890340387821198,
"num_tokens": 15133719.0,
"step": 928
},
{
"entropy": 0.520257018506527,
"epoch": 3.4794007490636703,
"grad_norm": 0.055278245359659195,
"learning_rate": 0.0002,
"loss": 0.5183354616165161,
"mean_token_accuracy": 0.7902627289295197,
"num_tokens": 15149922.0,
"step": 929
},
{
"entropy": 0.515156589448452,
"epoch": 3.4831460674157304,
"grad_norm": 0.05468440055847168,
"learning_rate": 0.0002,
"loss": 0.5097793340682983,
"mean_token_accuracy": 0.7964832186698914,
"num_tokens": 15166020.0,
"step": 930
},
{
"entropy": 0.521842934191227,
"epoch": 3.48689138576779,
"grad_norm": 0.04573323577642441,
"learning_rate": 0.0002,
"loss": 0.5174736380577087,
"mean_token_accuracy": 0.7907158583402634,
"num_tokens": 15182296.0,
"step": 931
},
{
"entropy": 0.5367195308208466,
"epoch": 3.4906367041198503,
"grad_norm": 0.05060438811779022,
"learning_rate": 0.0002,
"loss": 0.5360324382781982,
"mean_token_accuracy": 0.7832886576652527,
"num_tokens": 15198618.0,
"step": 932
},
{
"entropy": 0.5351738333702087,
"epoch": 3.49438202247191,
"grad_norm": 0.04796265438199043,
"learning_rate": 0.0002,
"loss": 0.5342084765434265,
"mean_token_accuracy": 0.7837437838315964,
"num_tokens": 15215125.0,
"step": 933
},
{
"entropy": 0.5210021957755089,
"epoch": 3.49812734082397,
"grad_norm": 0.05278978869318962,
"learning_rate": 0.0002,
"loss": 0.5260420441627502,
"mean_token_accuracy": 0.7890212833881378,
"num_tokens": 15231335.0,
"step": 934
},
{
"entropy": 0.5361146479845047,
"epoch": 3.50187265917603,
"grad_norm": 0.05599920451641083,
"learning_rate": 0.0002,
"loss": 0.5407608151435852,
"mean_token_accuracy": 0.7809196263551712,
"num_tokens": 15247587.0,
"step": 935
},
{
"entropy": 0.5127650052309036,
"epoch": 3.50561797752809,
"grad_norm": 0.053348250687122345,
"learning_rate": 0.0002,
"loss": 0.5172818303108215,
"mean_token_accuracy": 0.7908589243888855,
"num_tokens": 15263983.0,
"step": 936
},
{
"entropy": 0.5113075897097588,
"epoch": 3.5093632958801497,
"grad_norm": 0.047283098101615906,
"learning_rate": 0.0002,
"loss": 0.5094785690307617,
"mean_token_accuracy": 0.7913675010204315,
"num_tokens": 15280172.0,
"step": 937
},
{
"entropy": 0.5144875794649124,
"epoch": 3.51310861423221,
"grad_norm": 0.05150860175490379,
"learning_rate": 0.0002,
"loss": 0.5117542743682861,
"mean_token_accuracy": 0.7926830351352692,
"num_tokens": 15296278.0,
"step": 938
},
{
"entropy": 0.5282381922006607,
"epoch": 3.5168539325842696,
"grad_norm": 0.05235690623521805,
"learning_rate": 0.0002,
"loss": 0.5275253653526306,
"mean_token_accuracy": 0.787050798535347,
"num_tokens": 15312737.0,
"step": 939
},
{
"entropy": 0.5191426128149033,
"epoch": 3.5205992509363297,
"grad_norm": 0.05214005708694458,
"learning_rate": 0.0002,
"loss": 0.5218259692192078,
"mean_token_accuracy": 0.7854390293359756,
"num_tokens": 15329171.0,
"step": 940
},
{
"entropy": 0.488400898873806,
"epoch": 3.5243445692883895,
"grad_norm": 0.05028095468878746,
"learning_rate": 0.0002,
"loss": 0.49238866567611694,
"mean_token_accuracy": 0.8010139167308807,
"num_tokens": 15345040.0,
"step": 941
},
{
"entropy": 0.530989944934845,
"epoch": 3.5280898876404496,
"grad_norm": 0.05137421563267708,
"learning_rate": 0.0002,
"loss": 0.5283138155937195,
"mean_token_accuracy": 0.7872757613658905,
"num_tokens": 15361506.0,
"step": 942
},
{
"entropy": 0.5166791379451752,
"epoch": 3.5318352059925093,
"grad_norm": 0.05064837634563446,
"learning_rate": 0.0002,
"loss": 0.5200411677360535,
"mean_token_accuracy": 0.7893417179584503,
"num_tokens": 15377725.0,
"step": 943
},
{
"entropy": 0.5225488543510437,
"epoch": 3.535580524344569,
"grad_norm": 0.05224663019180298,
"learning_rate": 0.0002,
"loss": 0.5252619981765747,
"mean_token_accuracy": 0.7887216210365295,
"num_tokens": 15394073.0,
"step": 944
},
{
"entropy": 0.5133933499455452,
"epoch": 3.539325842696629,
"grad_norm": 0.054900407791137695,
"learning_rate": 0.0002,
"loss": 0.5187044143676758,
"mean_token_accuracy": 0.7941587567329407,
"num_tokens": 15410326.0,
"step": 945
},
{
"entropy": 0.5217478722333908,
"epoch": 3.5430711610486894,
"grad_norm": 0.05068376660346985,
"learning_rate": 0.0002,
"loss": 0.5203924179077148,
"mean_token_accuracy": 0.7903146594762802,
"num_tokens": 15426695.0,
"step": 946
}
],
"logging_steps": 1,
"max_steps": 1335,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.437096036035199e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}