banking77-intent-classifier / trainer_state.json
stefanwebb's picture
everything except large files
b874299
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 250,
"global_step": 530,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.825,
"epoch": 0.01890359168241966,
"grad_norm": 157.0,
"learning_rate": 5.925925925925926e-06,
"loss": 2.9876,
"mean_token_accuracy": 0.6893173575401306,
"num_input_tokens_seen": 115216,
"num_tokens": 114489.0,
"step": 5,
"train_runtime": 4.4379,
"train_tokens_per_second": 25961.602
},
{
"entropy": 0.8890625,
"epoch": 0.03780718336483932,
"grad_norm": 24.5,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.6662,
"mean_token_accuracy": 0.8326915562152862,
"num_input_tokens_seen": 230592,
"num_tokens": 229109.0,
"step": 10,
"train_runtime": 7.7641,
"train_tokens_per_second": 29699.859
},
{
"entropy": 1.02890625,
"epoch": 0.05671077504725898,
"grad_norm": 18.625,
"learning_rate": 2.074074074074074e-05,
"loss": 0.6027,
"mean_token_accuracy": 0.8529165983200073,
"num_input_tokens_seen": 345600,
"num_tokens": 343545.0,
"step": 15,
"train_runtime": 13.2817,
"train_tokens_per_second": 26020.836
},
{
"entropy": 1.1421875,
"epoch": 0.07561436672967864,
"grad_norm": 19.125,
"learning_rate": 2.814814814814815e-05,
"loss": 0.4447,
"mean_token_accuracy": 0.881816154718399,
"num_input_tokens_seen": 461282,
"num_tokens": 458335.0,
"step": 20,
"train_runtime": 16.8766,
"train_tokens_per_second": 27332.56
},
{
"entropy": 1.21640625,
"epoch": 0.0945179584120983,
"grad_norm": 21.75,
"learning_rate": 3.555555555555555e-05,
"loss": 0.3989,
"mean_token_accuracy": 0.8929793298244476,
"num_input_tokens_seen": 576346,
"num_tokens": 572824.0,
"step": 25,
"train_runtime": 22.5256,
"train_tokens_per_second": 25586.277
},
{
"entropy": 1.26484375,
"epoch": 0.11342155009451796,
"grad_norm": 13.625,
"learning_rate": 3.999843966403289e-05,
"loss": 0.4872,
"mean_token_accuracy": 0.8781549751758575,
"num_input_tokens_seen": 691188,
"num_tokens": 687152.0,
"step": 30,
"train_runtime": 26.0077,
"train_tokens_per_second": 26576.234
},
{
"entropy": 1.28515625,
"epoch": 0.1323251417769376,
"grad_norm": 17.375,
"learning_rate": 3.99808886803243e-05,
"loss": 0.28,
"mean_token_accuracy": 0.9074305832386017,
"num_input_tokens_seen": 806740,
"num_tokens": 801973.0,
"step": 35,
"train_runtime": 29.7225,
"train_tokens_per_second": 27142.403
},
{
"entropy": 1.2984375,
"epoch": 0.15122873345935728,
"grad_norm": 12.625,
"learning_rate": 3.994385346473689e-05,
"loss": 0.356,
"mean_token_accuracy": 0.9146295249462127,
"num_input_tokens_seen": 921796,
"num_tokens": 916426.0,
"step": 40,
"train_runtime": 34.6767,
"train_tokens_per_second": 26582.553
},
{
"entropy": 1.56953125,
"epoch": 0.17013232514177692,
"grad_norm": 11.4375,
"learning_rate": 3.9887370131917e-05,
"loss": 0.3933,
"mean_token_accuracy": 0.9064954161643982,
"num_input_tokens_seen": 1036824,
"num_tokens": 1030824.0,
"step": 45,
"train_runtime": 38.0075,
"train_tokens_per_second": 27279.472
},
{
"entropy": 1.7515625,
"epoch": 0.1890359168241966,
"grad_norm": 15.1875,
"learning_rate": 3.981149376121427e-05,
"loss": 0.2873,
"mean_token_accuracy": 0.9260397672653198,
"num_input_tokens_seen": 1152356,
"num_tokens": 1145500.0,
"step": 50,
"train_runtime": 43.1792,
"train_tokens_per_second": 26687.759
},
{
"entropy": 1.75859375,
"epoch": 0.20793950850661624,
"grad_norm": 8.5625,
"learning_rate": 3.97162983429714e-05,
"loss": 0.3322,
"mean_token_accuracy": 0.9256749033927918,
"num_input_tokens_seen": 1267634,
"num_tokens": 1260057.0,
"step": 55,
"train_runtime": 46.6166,
"train_tokens_per_second": 27192.755
},
{
"entropy": 1.734375,
"epoch": 0.22684310018903592,
"grad_norm": 11.6875,
"learning_rate": 3.960187670637294e-05,
"loss": 0.2865,
"mean_token_accuracy": 0.9282522916793823,
"num_input_tokens_seen": 1383494,
"num_tokens": 1374973.0,
"step": 60,
"train_runtime": 50.5007,
"train_tokens_per_second": 27395.522
},
{
"entropy": 1.70859375,
"epoch": 0.24574669187145556,
"grad_norm": 9.75,
"learning_rate": 3.946834042892355e-05,
"loss": 0.2277,
"mean_token_accuracy": 0.9320353448390961,
"num_input_tokens_seen": 1499052,
"num_tokens": 1489683.0,
"step": 65,
"train_runtime": 55.2907,
"train_tokens_per_second": 27112.206
},
{
"entropy": 1.6578125,
"epoch": 0.2646502835538752,
"grad_norm": 10.125,
"learning_rate": 3.931581972764386e-05,
"loss": 0.2733,
"mean_token_accuracy": 0.9363594233989716,
"num_input_tokens_seen": 1614146,
"num_tokens": 1604106.0,
"step": 70,
"train_runtime": 58.5263,
"train_tokens_per_second": 27579.832
},
{
"entropy": 1.57109375,
"epoch": 0.2835538752362949,
"grad_norm": 6.34375,
"learning_rate": 3.91444633320903e-05,
"loss": 0.2164,
"mean_token_accuracy": 0.9349239528179168,
"num_input_tokens_seen": 1729362,
"num_tokens": 1718632.0,
"step": 75,
"train_runtime": 63.5177,
"train_tokens_per_second": 27226.454
},
{
"entropy": 1.57734375,
"epoch": 0.30245746691871456,
"grad_norm": 10.875,
"learning_rate": 3.8954438339322366e-05,
"loss": 0.2173,
"mean_token_accuracy": 0.9350460767745972,
"num_input_tokens_seen": 1844444,
"num_tokens": 1833068.0,
"step": 80,
"train_runtime": 66.8194,
"train_tokens_per_second": 27603.402
},
{
"entropy": 1.62421875,
"epoch": 0.32136105860113423,
"grad_norm": 11.0,
"learning_rate": 3.874593005095909e-05,
"loss": 0.2337,
"mean_token_accuracy": 0.929820317029953,
"num_input_tokens_seen": 1959682,
"num_tokens": 1947640.0,
"step": 85,
"train_runtime": 70.4744,
"train_tokens_per_second": 27806.99
},
{
"entropy": 1.71171875,
"epoch": 0.34026465028355385,
"grad_norm": 8.625,
"learning_rate": 3.851914179248333e-05,
"loss": 0.2156,
"mean_token_accuracy": 0.9308744966983795,
"num_input_tokens_seen": 2075138,
"num_tokens": 2062310.0,
"step": 90,
"train_runtime": 75.9345,
"train_tokens_per_second": 27327.991
},
{
"entropy": 1.86875,
"epoch": 0.3591682419659735,
"grad_norm": 13.5625,
"learning_rate": 3.82742947149703e-05,
"loss": 0.2718,
"mean_token_accuracy": 0.9264281988143921,
"num_input_tokens_seen": 2190160,
"num_tokens": 2176716.0,
"step": 95,
"train_runtime": 79.4416,
"train_tokens_per_second": 27569.42
},
{
"entropy": 1.94765625,
"epoch": 0.3780718336483932,
"grad_norm": 6.125,
"learning_rate": 3.801162757943359e-05,
"loss": 0.3385,
"mean_token_accuracy": 0.9164456725120544,
"num_input_tokens_seen": 2305250,
"num_tokens": 2291230.0,
"step": 100,
"train_runtime": 84.7105,
"train_tokens_per_second": 27213.265
},
{
"entropy": 1.846875,
"epoch": 0.39697542533081287,
"grad_norm": 15.1875,
"learning_rate": 3.773139652399884e-05,
"loss": 0.1811,
"mean_token_accuracy": 0.944804173707962,
"num_input_tokens_seen": 2420666,
"num_tokens": 2405904.0,
"step": 105,
"train_runtime": 88.7231,
"train_tokens_per_second": 27283.383
},
{
"entropy": 1.84765625,
"epoch": 0.4158790170132325,
"grad_norm": 6.3125,
"learning_rate": 3.743387481413243e-05,
"loss": 0.1974,
"mean_token_accuracy": 0.9379207909107208,
"num_input_tokens_seen": 2535606,
"num_tokens": 2520235.0,
"step": 110,
"train_runtime": 93.0343,
"train_tokens_per_second": 27254.523
},
{
"entropy": 1.83984375,
"epoch": 0.43478260869565216,
"grad_norm": 2.875,
"learning_rate": 3.711935257616842e-05,
"loss": 0.1266,
"mean_token_accuracy": 0.9594786465167999,
"num_input_tokens_seen": 2650514,
"num_tokens": 2634592.0,
"step": 115,
"train_runtime": 98.3865,
"train_tokens_per_second": 26939.815
},
{
"entropy": 1.78984375,
"epoch": 0.45368620037807184,
"grad_norm": 3.6875,
"learning_rate": 3.678813651439376e-05,
"loss": 0.1993,
"mean_token_accuracy": 0.9459972441196441,
"num_input_tokens_seen": 2766004,
"num_tokens": 2749299.0,
"step": 120,
"train_runtime": 102.0946,
"train_tokens_per_second": 27092.567
},
{
"entropy": 1.78828125,
"epoch": 0.4725897920604915,
"grad_norm": 9.1875,
"learning_rate": 3.6440549611967656e-05,
"loss": 0.2075,
"mean_token_accuracy": 0.940614128112793,
"num_input_tokens_seen": 2880990,
"num_tokens": 2863713.0,
"step": 125,
"train_runtime": 107.8735,
"train_tokens_per_second": 26707.121
},
{
"entropy": 1.85234375,
"epoch": 0.4914933837429111,
"grad_norm": 7.15625,
"learning_rate": 3.6076930815966654e-05,
"loss": 0.236,
"mean_token_accuracy": 0.9343804061412812,
"num_input_tokens_seen": 2995844,
"num_tokens": 2978032.0,
"step": 130,
"train_runtime": 111.3362,
"train_tokens_per_second": 26908.095
},
{
"entropy": 1.95859375,
"epoch": 0.5103969754253308,
"grad_norm": 7.375,
"learning_rate": 3.569763470686262e-05,
"loss": 0.162,
"mean_token_accuracy": 0.9484993875026703,
"num_input_tokens_seen": 3111092,
"num_tokens": 3092605.0,
"step": 135,
"train_runtime": 115.8418,
"train_tokens_per_second": 26856.393
},
{
"entropy": 1.99921875,
"epoch": 0.5293005671077504,
"grad_norm": 7.125,
"learning_rate": 3.530303115275597e-05,
"loss": 0.1892,
"mean_token_accuracy": 0.9394895970821381,
"num_input_tokens_seen": 3226396,
"num_tokens": 3207190.0,
"step": 140,
"train_runtime": 120.7172,
"train_tokens_per_second": 26726.892
},
{
"entropy": 1.96484375,
"epoch": 0.5482041587901701,
"grad_norm": 4.5625,
"learning_rate": 3.4893504948701185e-05,
"loss": 0.1614,
"mean_token_accuracy": 0.9600624740123749,
"num_input_tokens_seen": 3341802,
"num_tokens": 3321840.0,
"step": 145,
"train_runtime": 124.4268,
"train_tokens_per_second": 26857.576
},
{
"entropy": 1.90859375,
"epoch": 0.5671077504725898,
"grad_norm": 6.96875,
"learning_rate": 3.4469455441476475e-05,
"loss": 0.1334,
"mean_token_accuracy": 0.9625543296337128,
"num_input_tokens_seen": 3456964,
"num_tokens": 3436339.0,
"step": 150,
"train_runtime": 130.3081,
"train_tokens_per_second": 26529.148
},
{
"entropy": 1.92421875,
"epoch": 0.5860113421550095,
"grad_norm": 12.4375,
"learning_rate": 3.403129614016339e-05,
"loss": 0.1427,
"mean_token_accuracy": 0.9588114261627197,
"num_input_tokens_seen": 3572084,
"num_tokens": 3550813.0,
"step": 155,
"train_runtime": 133.8989,
"train_tokens_per_second": 26677.47
},
{
"entropy": 1.98671875,
"epoch": 0.6049149338374291,
"grad_norm": 7.3125,
"learning_rate": 3.357945431291618e-05,
"loss": 0.2129,
"mean_token_accuracy": 0.9367718935012818,
"num_input_tokens_seen": 3687248,
"num_tokens": 3665300.0,
"step": 160,
"train_runtime": 138.2948,
"train_tokens_per_second": 26662.235
},
{
"entropy": 2.1359375,
"epoch": 0.6238185255198487,
"grad_norm": 3.09375,
"learning_rate": 3.311437057031406e-05,
"loss": 0.2219,
"mean_token_accuracy": 0.9387097895145416,
"num_input_tokens_seen": 3802458,
"num_tokens": 3779809.0,
"step": 165,
"train_runtime": 142.569,
"train_tokens_per_second": 26671.004
},
{
"entropy": 2.0859375,
"epoch": 0.6427221172022685,
"grad_norm": 4.53125,
"learning_rate": 3.263649843570271e-05,
"loss": 0.1355,
"mean_token_accuracy": 0.9585716307163239,
"num_input_tokens_seen": 3917580,
"num_tokens": 3894322.0,
"step": 170,
"train_runtime": 145.9767,
"train_tokens_per_second": 26837.021
},
{
"entropy": 1.946875,
"epoch": 0.6616257088846881,
"grad_norm": 6.53125,
"learning_rate": 3.214630390294396e-05,
"loss": 0.2962,
"mean_token_accuracy": 0.9372412860393524,
"num_input_tokens_seen": 4032748,
"num_tokens": 4008844.0,
"step": 175,
"train_runtime": 151.6027,
"train_tokens_per_second": 26600.765
},
{
"entropy": 1.98671875,
"epoch": 0.6805293005671077,
"grad_norm": 6.96875,
"learning_rate": 3.1644264982005e-05,
"loss": 0.1841,
"mean_token_accuracy": 0.9490657150745392,
"num_input_tokens_seen": 4148142,
"num_tokens": 4123487.0,
"step": 180,
"train_runtime": 154.9764,
"train_tokens_per_second": 26766.274
},
{
"entropy": 2.021875,
"epoch": 0.6994328922495274,
"grad_norm": 2.953125,
"learning_rate": 3.113087123283002e-05,
"loss": 0.124,
"mean_token_accuracy": 0.964401924610138,
"num_input_tokens_seen": 4263312,
"num_tokens": 4238014.0,
"step": 185,
"train_runtime": 159.4694,
"train_tokens_per_second": 26734.354
},
{
"entropy": 1.96171875,
"epoch": 0.718336483931947,
"grad_norm": 3.4375,
"learning_rate": 3.060662328794916e-05,
"loss": 0.1498,
"mean_token_accuracy": 0.9481843888759613,
"num_input_tokens_seen": 4378630,
"num_tokens": 4352627.0,
"step": 190,
"train_runtime": 163.6223,
"train_tokens_per_second": 26760.595
},
{
"entropy": 1.9640625,
"epoch": 0.7372400756143668,
"grad_norm": 4.1875,
"learning_rate": 3.0072032364289914e-05,
"loss": 0.1076,
"mean_token_accuracy": 0.9691859900951385,
"num_input_tokens_seen": 4493600,
"num_tokens": 4467053.0,
"step": 195,
"train_runtime": 166.9247,
"train_tokens_per_second": 26919.915
},
{
"entropy": 2.02734375,
"epoch": 0.7561436672967864,
"grad_norm": 3.875,
"learning_rate": 2.9527619764667376e-05,
"loss": 0.2501,
"mean_token_accuracy": 0.9455641567707062,
"num_input_tokens_seen": 4609216,
"num_tokens": 4581812.0,
"step": 200,
"train_runtime": 172.0695,
"train_tokens_per_second": 26786.938
},
{
"entropy": 2.14375,
"epoch": 0.775047258979206,
"grad_norm": 5.4375,
"learning_rate": 2.8973916369439194e-05,
"loss": 0.2157,
"mean_token_accuracy": 0.9492439985275268,
"num_input_tokens_seen": 4724086,
"num_tokens": 4696178.0,
"step": 205,
"train_runtime": 175.6473,
"train_tokens_per_second": 26895.294
},
{
"entropy": 2.2625,
"epoch": 0.7939508506616257,
"grad_norm": 3.75,
"learning_rate": 2.84114621188211e-05,
"loss": 0.1762,
"mean_token_accuracy": 0.9574925601482391,
"num_input_tokens_seen": 4839702,
"num_tokens": 4810939.0,
"step": 210,
"train_runtime": 180.4712,
"train_tokens_per_second": 26817.036
},
{
"entropy": 2.2953125,
"epoch": 0.8128544423440454,
"grad_norm": 3.9375,
"learning_rate": 2.7840805486367792e-05,
"loss": 0.1703,
"mean_token_accuracy": 0.9540181159973145,
"num_input_tokens_seen": 4955098,
"num_tokens": 4925591.0,
"step": 215,
"train_runtime": 184.4177,
"train_tokens_per_second": 26868.891
},
{
"entropy": 2.2828125,
"epoch": 0.831758034026465,
"grad_norm": 4.625,
"learning_rate": 2.7262502944132526e-05,
"loss": 0.0938,
"mean_token_accuracy": 0.9725252389907837,
"num_input_tokens_seen": 5070258,
"num_tokens": 5040089.0,
"step": 220,
"train_runtime": 188.065,
"train_tokens_per_second": 26960.132
},
{
"entropy": 2.1265625,
"epoch": 0.8506616257088847,
"grad_norm": 5.71875,
"learning_rate": 2.667711842002707e-05,
"loss": 0.1704,
"mean_token_accuracy": 0.9579161703586578,
"num_input_tokens_seen": 5185478,
"num_tokens": 5154604.0,
"step": 225,
"train_runtime": 192.8301,
"train_tokens_per_second": 26891.43
},
{
"entropy": 2.0484375,
"epoch": 0.8695652173913043,
"grad_norm": 4.3125,
"learning_rate": 2.6085222747911155e-05,
"loss": 0.4284,
"mean_token_accuracy": 0.9190201222896576,
"num_input_tokens_seen": 5301020,
"num_tokens": 5269357.0,
"step": 230,
"train_runtime": 196.1744,
"train_tokens_per_second": 27021.971
},
{
"entropy": 1.98671875,
"epoch": 0.888468809073724,
"grad_norm": 13.3125,
"learning_rate": 2.5487393110947557e-05,
"loss": 0.1346,
"mean_token_accuracy": 0.9579481542110443,
"num_input_tokens_seen": 5416464,
"num_tokens": 5384069.0,
"step": 235,
"train_runtime": 201.21,
"train_tokens_per_second": 26919.463
},
{
"entropy": 1.9875,
"epoch": 0.9073724007561437,
"grad_norm": 3.84375,
"learning_rate": 2.4884212478765747e-05,
"loss": 0.097,
"mean_token_accuracy": 0.9672803819179535,
"num_input_tokens_seen": 5531644,
"num_tokens": 5498568.0,
"step": 240,
"train_runtime": 205.075,
"train_tokens_per_second": 26973.766
},
{
"entropy": 2.00234375,
"epoch": 0.9262759924385633,
"grad_norm": 4.9375,
"learning_rate": 2.427626903898292e-05,
"loss": 0.2298,
"mean_token_accuracy": 0.9443018674850464,
"num_input_tokens_seen": 5646952,
"num_tokens": 5613157.0,
"step": 245,
"train_runtime": 208.4891,
"train_tokens_per_second": 27085.115
},
{
"entropy": 2.0140625,
"epoch": 0.945179584120983,
"grad_norm": 6.03125,
"learning_rate": 2.3664155623636715e-05,
"loss": 0.1732,
"mean_token_accuracy": 0.9442705571651459,
"num_input_tokens_seen": 5762366,
"num_tokens": 5727795.0,
"step": 250,
"train_runtime": 214.059,
"train_tokens_per_second": 26919.525
},
{
"entropy": 2.0125,
"epoch": 0.9640831758034026,
"grad_norm": 3.546875,
"learning_rate": 2.304846913108891e-05,
"loss": 0.1083,
"mean_token_accuracy": 0.9664817750453949,
"num_input_tokens_seen": 5877646,
"num_tokens": 5842437.0,
"step": 255,
"train_runtime": 275.7098,
"train_tokens_per_second": 21318.232
},
{
"entropy": 2.0,
"epoch": 0.9829867674858223,
"grad_norm": 2.671875,
"learning_rate": 2.242980994396401e-05,
"loss": 0.0875,
"mean_token_accuracy": 0.9795427262783051,
"num_input_tokens_seen": 5992710,
"num_tokens": 5956870.0,
"step": 260,
"train_runtime": 280.9684,
"train_tokens_per_second": 21328.766
},
{
"entropy": 1.9513888888888888,
"epoch": 1.0,
"grad_norm": 7.4375,
"learning_rate": 2.1808781343690027e-05,
"loss": 0.1654,
"mean_token_accuracy": 0.9603289763132731,
"num_input_tokens_seen": 6096342,
"num_tokens": 6059927.0,
"step": 265,
"train_runtime": 284.3725,
"train_tokens_per_second": 21437.877
},
{
"entropy": 1.903125,
"epoch": 1.0189035916824196,
"grad_norm": 3.453125,
"learning_rate": 2.118598892221257e-05,
"loss": 0.0783,
"mean_token_accuracy": 0.9817151129245758,
"num_input_tokens_seen": 6211574,
"num_tokens": 6174483.0,
"step": 270,
"train_runtime": 288.2049,
"train_tokens_per_second": 21552.63
},
{
"entropy": 1.84375,
"epoch": 1.0378071833648392,
"grad_norm": 1.734375,
"learning_rate": 2.0562039991455877e-05,
"loss": 0.1214,
"mean_token_accuracy": 0.9741188943386078,
"num_input_tokens_seen": 6327000,
"num_tokens": 6289163.0,
"step": 275,
"train_runtime": 293.7126,
"train_tokens_per_second": 21541.469
},
{
"entropy": 1.8421875,
"epoch": 1.056710775047259,
"grad_norm": 3.78125,
"learning_rate": 1.99375429911066e-05,
"loss": 0.1393,
"mean_token_accuracy": 0.9579156279563904,
"num_input_tokens_seen": 6442290,
"num_tokens": 6403766.0,
"step": 280,
"train_runtime": 297.1668,
"train_tokens_per_second": 21679.038
},
{
"entropy": 1.85078125,
"epoch": 1.0756143667296787,
"grad_norm": 3.953125,
"learning_rate": 1.931310689529781e-05,
"loss": 0.0872,
"mean_token_accuracy": 0.9788394093513488,
"num_input_tokens_seen": 6557852,
"num_tokens": 6518469.0,
"step": 285,
"train_runtime": 301.7702,
"train_tokens_per_second": 21731.276
},
{
"entropy": 1.8234375,
"epoch": 1.0945179584120983,
"grad_norm": 7.1875,
"learning_rate": 1.8689340618771937e-05,
"loss": 0.0637,
"mean_token_accuracy": 0.972537738084793,
"num_input_tokens_seen": 6673032,
"num_tokens": 6632963.0,
"step": 290,
"train_runtime": 306.4769,
"train_tokens_per_second": 21773.362
},
{
"entropy": 1.78359375,
"epoch": 1.113421550094518,
"grad_norm": 5.78125,
"learning_rate": 1.806685242310156e-05,
"loss": 0.0565,
"mean_token_accuracy": 0.9854797184467315,
"num_input_tokens_seen": 6788174,
"num_tokens": 6747403.0,
"step": 295,
"train_runtime": 310.3851,
"train_tokens_per_second": 21870.17
},
{
"entropy": 1.76015625,
"epoch": 1.1323251417769375,
"grad_norm": 8.8125,
"learning_rate": 1.7446249323547117e-05,
"loss": 0.0973,
"mean_token_accuracy": 0.9734237968921662,
"num_input_tokens_seen": 6903146,
"num_tokens": 6861788.0,
"step": 300,
"train_runtime": 315.4655,
"train_tokens_per_second": 21882.41
},
{
"entropy": 1.75078125,
"epoch": 1.1512287334593574,
"grad_norm": 1.4453125,
"learning_rate": 1.6828136497130014e-05,
"loss": 0.0681,
"mean_token_accuracy": 0.9820096373558045,
"num_input_tokens_seen": 7018350,
"num_tokens": 6976277.0,
"step": 305,
"train_runtime": 319.0527,
"train_tokens_per_second": 21997.465
},
{
"entropy": 1.740625,
"epoch": 1.170132325141777,
"grad_norm": 4.90625,
"learning_rate": 1.6213116692498206e-05,
"loss": 0.0625,
"mean_token_accuracy": 0.9826828062534332,
"num_input_tokens_seen": 7133636,
"num_tokens": 7090874.0,
"step": 310,
"train_runtime": 323.8986,
"train_tokens_per_second": 22024.29
},
{
"entropy": 1.7328125,
"epoch": 1.1890359168241966,
"grad_norm": 0.66015625,
"learning_rate": 1.560178964215987e-05,
"loss": 0.077,
"mean_token_accuracy": 0.978941410779953,
"num_input_tokens_seen": 7248866,
"num_tokens": 7205391.0,
"step": 315,
"train_runtime": 327.5895,
"train_tokens_per_second": 22127.897
},
{
"entropy": 1.73203125,
"epoch": 1.2079395085066162,
"grad_norm": 4.0625,
"learning_rate": 1.4994751477658139e-05,
"loss": 0.067,
"mean_token_accuracy": 0.9818780541419982,
"num_input_tokens_seen": 7363900,
"num_tokens": 7319827.0,
"step": 320,
"train_runtime": 331.4598,
"train_tokens_per_second": 22216.571
},
{
"entropy": 1.73515625,
"epoch": 1.2268431001890359,
"grad_norm": 2.734375,
"learning_rate": 1.4392594148257426e-05,
"loss": 0.1153,
"mean_token_accuracy": 0.9638942897319793,
"num_input_tokens_seen": 7479394,
"num_tokens": 7434543.0,
"step": 325,
"train_runtime": 336.2629,
"train_tokens_per_second": 22242.696
},
{
"entropy": 1.74609375,
"epoch": 1.2457466918714555,
"grad_norm": 2.046875,
"learning_rate": 1.3795904843707959e-05,
"loss": 0.0359,
"mean_token_accuracy": 0.9886789560317993,
"num_input_tokens_seen": 7594632,
"num_tokens": 7549134.0,
"step": 330,
"train_runtime": 339.6052,
"train_tokens_per_second": 22363.12
},
{
"entropy": 1.740625,
"epoch": 1.264650283553875,
"grad_norm": 2.25,
"learning_rate": 1.3205265421651588e-05,
"loss": 0.0808,
"mean_token_accuracy": 0.9852688193321228,
"num_input_tokens_seen": 7709704,
"num_tokens": 7663583.0,
"step": 335,
"train_runtime": 344.9458,
"train_tokens_per_second": 22350.48
},
{
"entropy": 1.75078125,
"epoch": 1.283553875236295,
"grad_norm": 1.8125,
"learning_rate": 1.2621251840227112e-05,
"loss": 0.0663,
"mean_token_accuracy": 0.9817369997501373,
"num_input_tokens_seen": 7824834,
"num_tokens": 7778064.0,
"step": 340,
"train_runtime": 348.223,
"train_tokens_per_second": 22470.756
},
{
"entropy": 1.75234375,
"epoch": 1.3024574669187146,
"grad_norm": 4.28125,
"learning_rate": 1.2044433596428537e-05,
"loss": 0.0678,
"mean_token_accuracy": 0.9812626421451569,
"num_input_tokens_seen": 7939832,
"num_tokens": 7892415.0,
"step": 345,
"train_runtime": 352.0847,
"train_tokens_per_second": 22550.916
},
{
"entropy": 1.746875,
"epoch": 1.3213610586011342,
"grad_norm": 2.703125,
"learning_rate": 1.1475373170763819e-05,
"loss": 0.0465,
"mean_token_accuracy": 0.9823280215263367,
"num_input_tokens_seen": 8054988,
"num_tokens": 8006926.0,
"step": 350,
"train_runtime": 357.1271,
"train_tokens_per_second": 22554.962
},
{
"entropy": 1.74765625,
"epoch": 1.3402646502835538,
"grad_norm": 1.4921875,
"learning_rate": 1.0914625478755672e-05,
"loss": 0.1174,
"mean_token_accuracy": 0.9695515096187591,
"num_input_tokens_seen": 8170098,
"num_tokens": 8121373.0,
"step": 355,
"train_runtime": 360.7524,
"train_tokens_per_second": 22647.381
},
{
"entropy": 1.74453125,
"epoch": 1.3591682419659734,
"grad_norm": 1.1015625,
"learning_rate": 1.0362737329819413e-05,
"loss": 0.045,
"mean_token_accuracy": 0.9885900497436524,
"num_input_tokens_seen": 8285346,
"num_tokens": 8235981.0,
"step": 360,
"train_runtime": 366.0216,
"train_tokens_per_second": 22636.221
},
{
"entropy": 1.74296875,
"epoch": 1.3780718336483933,
"grad_norm": 5.15625,
"learning_rate": 9.820246894045316e-06,
"loss": 0.0428,
"mean_token_accuracy": 0.9822307825088501,
"num_input_tokens_seen": 8400240,
"num_tokens": 8350356.0,
"step": 365,
"train_runtime": 369.6364,
"train_tokens_per_second": 22725.685
},
{
"entropy": 1.73515625,
"epoch": 1.3969754253308129,
"grad_norm": 4.09375,
"learning_rate": 9.28768317740564e-06,
"loss": 0.099,
"mean_token_accuracy": 0.9710565328598022,
"num_input_tokens_seen": 8515740,
"num_tokens": 8465025.0,
"step": 370,
"train_runtime": 373.5701,
"train_tokens_per_second": 22795.56
},
{
"entropy": 1.7328125,
"epoch": 1.4158790170132325,
"grad_norm": 4.96875,
"learning_rate": 8.765565505897902e-06,
"loss": 0.0736,
"mean_token_accuracy": 0.9741575241088867,
"num_input_tokens_seen": 8631054,
"num_tokens": 8579648.0,
"step": 375,
"train_runtime": 378.7394,
"train_tokens_per_second": 22788.901
},
{
"entropy": 1.73359375,
"epoch": 1.434782608695652,
"grad_norm": 3.265625,
"learning_rate": 8.254403019127566e-06,
"loss": 0.0806,
"mean_token_accuracy": 0.9791056990623475,
"num_input_tokens_seen": 8746364,
"num_tokens": 8694249.0,
"step": 380,
"train_runtime": 382.0615,
"train_tokens_per_second": 22892.552
},
{
"entropy": 1.73515625,
"epoch": 1.4536862003780717,
"grad_norm": 3.75,
"learning_rate": 7.754694173823947e-06,
"loss": 0.0404,
"mean_token_accuracy": 0.9839386224746705,
"num_input_tokens_seen": 8861574,
"num_tokens": 8808789.0,
"step": 385,
"train_runtime": 387.2205,
"train_tokens_per_second": 22885.084
},
{
"entropy": 1.73359375,
"epoch": 1.4725897920604916,
"grad_norm": 5.09375,
"learning_rate": 7.266926257773346e-06,
"loss": 0.0926,
"mean_token_accuracy": 0.9714232623577118,
"num_input_tokens_seen": 8976944,
"num_tokens": 8923407.0,
"step": 390,
"train_runtime": 390.891,
"train_tokens_per_second": 22965.336
},
{
"entropy": 1.72265625,
"epoch": 1.4914933837429112,
"grad_norm": 5.0,
"learning_rate": 6.7915749146436415e-06,
"loss": 0.0519,
"mean_token_accuracy": 0.9837916433811188,
"num_input_tokens_seen": 9092050,
"num_tokens": 9037924.0,
"step": 395,
"train_runtime": 395.3397,
"train_tokens_per_second": 22998.071
},
{
"entropy": 1.71796875,
"epoch": 1.5103969754253308,
"grad_norm": 3.875,
"learning_rate": 6.329103680163495e-06,
"loss": 0.2115,
"mean_token_accuracy": 0.9516554296016693,
"num_input_tokens_seen": 9207594,
"num_tokens": 9152659.0,
"step": 400,
"train_runtime": 399.5499,
"train_tokens_per_second": 23044.916
},
{
"entropy": 1.71640625,
"epoch": 1.5293005671077504,
"grad_norm": 0.51953125,
"learning_rate": 5.879963530108506e-06,
"loss": 0.0348,
"mean_token_accuracy": 0.9919346511363983,
"num_input_tokens_seen": 9322572,
"num_tokens": 9267059.0,
"step": 405,
"train_runtime": 403.4031,
"train_tokens_per_second": 23109.815
},
{
"entropy": 1.7125,
"epoch": 1.54820415879017,
"grad_norm": 2.234375,
"learning_rate": 5.444592440535177e-06,
"loss": 0.0374,
"mean_token_accuracy": 0.9837370038032531,
"num_input_tokens_seen": 9438004,
"num_tokens": 9381725.0,
"step": 410,
"train_runtime": 407.9692,
"train_tokens_per_second": 23134.111
},
{
"entropy": 1.7078125,
"epoch": 1.5671077504725899,
"grad_norm": 3.09375,
"learning_rate": 5.023414960691469e-06,
"loss": 0.0325,
"mean_token_accuracy": 0.9918534696102143,
"num_input_tokens_seen": 9553156,
"num_tokens": 9496255.0,
"step": 415,
"train_runtime": 412.1408,
"train_tokens_per_second": 23179.35
},
{
"entropy": 1.703125,
"epoch": 1.5860113421550095,
"grad_norm": 4.78125,
"learning_rate": 4.616841799020364e-06,
"loss": 0.0618,
"mean_token_accuracy": 0.9808044970035553,
"num_input_tokens_seen": 9668364,
"num_tokens": 9610808.0,
"step": 420,
"train_runtime": 416.3235,
"train_tokens_per_second": 23223.203
},
{
"entropy": 1.703125,
"epoch": 1.6049149338374291,
"grad_norm": 6.1875,
"learning_rate": 4.225269422660258e-06,
"loss": 0.0493,
"mean_token_accuracy": 0.9843941271305084,
"num_input_tokens_seen": 9783552,
"num_tokens": 9725283.0,
"step": 425,
"train_runtime": 421.1316,
"train_tokens_per_second": 23231.576
},
{
"entropy": 1.70390625,
"epoch": 1.6238185255198487,
"grad_norm": 1.5078125,
"learning_rate": 3.8490796708326404e-06,
"loss": 0.0595,
"mean_token_accuracy": 0.9822299420833588,
"num_input_tokens_seen": 9898934,
"num_tokens": 9839878.0,
"step": 430,
"train_runtime": 424.7606,
"train_tokens_per_second": 23304.735
},
{
"entropy": 1.7015625,
"epoch": 1.6427221172022684,
"grad_norm": 1.328125,
"learning_rate": 3.4886393824940924e-06,
"loss": 0.059,
"mean_token_accuracy": 0.9807979345321656,
"num_input_tokens_seen": 10014142,
"num_tokens": 9954403.0,
"step": 435,
"train_runtime": 429.8927,
"train_tokens_per_second": 23294.514
},
{
"entropy": 1.70390625,
"epoch": 1.6616257088846882,
"grad_norm": 2.09375,
"learning_rate": 3.144300038615691e-06,
"loss": 0.0574,
"mean_token_accuracy": 0.9839386105537414,
"num_input_tokens_seen": 10129264,
"num_tokens": 10068933.0,
"step": 440,
"train_runtime": 433.4828,
"train_tokens_per_second": 23367.164
},
{
"entropy": 1.6984375,
"epoch": 1.6805293005671076,
"grad_norm": 4.25,
"learning_rate": 2.8163974194386766e-06,
"loss": 0.0669,
"mean_token_accuracy": 0.9792383193969727,
"num_input_tokens_seen": 10244732,
"num_tokens": 10183591.0,
"step": 445,
"train_runtime": 437.9792,
"train_tokens_per_second": 23390.909
},
{
"entropy": 1.7015625,
"epoch": 1.6994328922495274,
"grad_norm": 3.46875,
"learning_rate": 2.5052512770405434e-06,
"loss": 0.0801,
"mean_token_accuracy": 0.9761136710643769,
"num_input_tokens_seen": 10360212,
"num_tokens": 10298251.0,
"step": 450,
"train_runtime": 442.481,
"train_tokens_per_second": 23413.915
},
{
"entropy": 1.70234375,
"epoch": 1.718336483931947,
"grad_norm": 0.59765625,
"learning_rate": 2.2111650235309147e-06,
"loss": 0.0297,
"mean_token_accuracy": 0.9904489517211914,
"num_input_tokens_seen": 10475400,
"num_tokens": 10412810.0,
"step": 455,
"train_runtime": 446.3738,
"train_tokens_per_second": 23467.773
},
{
"entropy": 1.69921875,
"epoch": 1.7372400756143667,
"grad_norm": 4.0625,
"learning_rate": 1.9344254351812287e-06,
"loss": 0.0989,
"mean_token_accuracy": 0.9743396818637848,
"num_input_tokens_seen": 10590710,
"num_tokens": 10527389.0,
"step": 460,
"train_runtime": 451.1755,
"train_tokens_per_second": 23473.591
},
{
"entropy": 1.703125,
"epoch": 1.7561436672967865,
"grad_norm": 0.890625,
"learning_rate": 1.6753023727767436e-06,
"loss": 0.0476,
"mean_token_accuracy": 0.9838890075683594,
"num_input_tokens_seen": 10705900,
"num_tokens": 10641918.0,
"step": 465,
"train_runtime": 454.754,
"train_tokens_per_second": 23542.179
},
{
"entropy": 1.7,
"epoch": 1.775047258979206,
"grad_norm": 1.359375,
"learning_rate": 1.4340485184635712e-06,
"loss": 0.0556,
"mean_token_accuracy": 0.9777659058570862,
"num_input_tokens_seen": 10821144,
"num_tokens": 10756496.0,
"step": 470,
"train_runtime": 459.2027,
"train_tokens_per_second": 23565.072
},
{
"entropy": 1.69921875,
"epoch": 1.7939508506616257,
"grad_norm": 1.171875,
"learning_rate": 1.2108991293473627e-06,
"loss": 0.0595,
"mean_token_accuracy": 0.9741835057735443,
"num_input_tokens_seen": 10936460,
"num_tokens": 10871124.0,
"step": 475,
"train_runtime": 463.6099,
"train_tokens_per_second": 23589.79
},
{
"entropy": 1.69765625,
"epoch": 1.8128544423440454,
"grad_norm": 3.265625,
"learning_rate": 1.0060718080838683e-06,
"loss": 0.0541,
"mean_token_accuracy": 0.9831156551837921,
"num_input_tokens_seen": 11051508,
"num_tokens": 10985594.0,
"step": 480,
"train_runtime": 467.1593,
"train_tokens_per_second": 23656.828
},
{
"entropy": 1.70078125,
"epoch": 1.831758034026465,
"grad_norm": 2.4375,
"learning_rate": 8.197662906851534e-07,
"loss": 0.0835,
"mean_token_accuracy": 0.9726030707359314,
"num_input_tokens_seen": 11166904,
"num_tokens": 11100230.0,
"step": 485,
"train_runtime": 472.195,
"train_tokens_per_second": 23648.922
},
{
"entropy": 1.69921875,
"epoch": 1.8506616257088848,
"grad_norm": 2.765625,
"learning_rate": 6.521642517483573e-07,
"loss": 0.0532,
"mean_token_accuracy": 0.9853454470634461,
"num_input_tokens_seen": 11281802,
"num_tokens": 11214624.0,
"step": 490,
"train_runtime": 475.7718,
"train_tokens_per_second": 23712.635
},
{
"entropy": 1.70078125,
"epoch": 1.8695652173913042,
"grad_norm": 2.171875,
"learning_rate": 5.034291272968772e-07,
"loss": 0.027,
"mean_token_accuracy": 0.9934648215770722,
"num_input_tokens_seen": 11396946,
"num_tokens": 11329098.0,
"step": 495,
"train_runtime": 480.2436,
"train_tokens_per_second": 23731.596
},
{
"entropy": 1.6984375,
"epoch": 1.888468809073724,
"grad_norm": 4.0625,
"learning_rate": 3.737059554068334e-07,
"loss": 0.0742,
"mean_token_accuracy": 0.9744843065738678,
"num_input_tokens_seen": 11512282,
"num_tokens": 11443715.0,
"step": 500,
"train_runtime": 484.6792,
"train_tokens_per_second": 23752.376
},
{
"entropy": 1.69921875,
"epoch": 1.9073724007561437,
"grad_norm": 6.84375,
"learning_rate": 2.631212347741352e-07,
"loss": 0.1322,
"mean_token_accuracy": 0.9680740118026734,
"num_input_tokens_seen": 11627828,
"num_tokens": 11558513.0,
"step": 505,
"train_runtime": 544.5283,
"train_tokens_per_second": 21353.945
},
{
"entropy": 1.69921875,
"epoch": 1.9262759924385633,
"grad_norm": 1.0078125,
"learning_rate": 1.7178280136011417e-07,
"loss": 0.0864,
"mean_token_accuracy": 0.9749818980693817,
"num_input_tokens_seen": 11743010,
"num_tokens": 11673010.0,
"step": 510,
"train_runtime": 549.7569,
"train_tokens_per_second": 21360.369
},
{
"entropy": 1.69921875,
"epoch": 1.9451795841209831,
"grad_norm": 2.5625,
"learning_rate": 9.977972323599095e-08,
"loss": 0.1175,
"mean_token_accuracy": 0.9680160820484162,
"num_input_tokens_seen": 11858430,
"num_tokens": 11787637.0,
"step": 515,
"train_runtime": 553.6509,
"train_tokens_per_second": 21418.605
},
{
"entropy": 1.69765625,
"epoch": 1.9640831758034025,
"grad_norm": 2.921875,
"learning_rate": 4.718221372874254e-08,
"loss": 0.0695,
"mean_token_accuracy": 0.9804269134998321,
"num_input_tokens_seen": 11973576,
"num_tokens": 11902111.0,
"step": 520,
"train_runtime": 557.8609,
"train_tokens_per_second": 21463.371
},
{
"entropy": 1.69609375,
"epoch": 1.9829867674858224,
"grad_norm": 5.8125,
"learning_rate": 1.4041562953031051e-08,
"loss": 0.1152,
"mean_token_accuracy": 0.9696780204772949,
"num_input_tokens_seen": 12088990,
"num_tokens": 12016759.0,
"step": 525,
"train_runtime": 561.9991,
"train_tokens_per_second": 21510.694
},
{
"entropy": 1.6961805555555556,
"epoch": 2.0,
"grad_norm": 3.75,
"learning_rate": 3.900877959917004e-10,
"loss": 0.0989,
"mean_token_accuracy": 0.9715293182267083,
"num_input_tokens_seen": 12192662,
"num_tokens": 12119827.0,
"step": 530,
"train_runtime": 565.5622,
"train_tokens_per_second": 21558.482
},
{
"epoch": 2.0,
"num_input_tokens_seen": 12192662,
"step": 530,
"total_flos": 3.3226637176733696e+16,
"train_loss": 0.1822078584218925,
"train_runtime": 612.9949,
"train_samples_per_second": 27.592,
"train_steps_per_second": 0.865,
"train_tokens_per_second": 2486.879
}
],
"logging_steps": 5,
"max_steps": 530,
"num_input_tokens_seen": 12192662,
"num_train_epochs": 2,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.3226637176733696e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}