pocsmith / checkpoint-700 /trainer_state.json
regaan's picture
Upload folder using huggingface_hub
775a70e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.3738317757009346,
"eval_steps": 100,
"global_step": 700,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.1900819569826127,
"epoch": 0.033984706881903144,
"grad_norm": 0.107421875,
"learning_rate": 6.666666666666667e-05,
"loss": 1.2022,
"mean_token_accuracy": 0.7261606067419052,
"num_tokens": 32329.0,
"step": 10
},
{
"entropy": 1.1123934835195541,
"epoch": 0.06796941376380629,
"grad_norm": 0.138671875,
"learning_rate": 0.00014074074074074076,
"loss": 1.1277,
"mean_token_accuracy": 0.7318252056837082,
"num_tokens": 67111.0,
"step": 20
},
{
"entropy": 1.0879137217998505,
"epoch": 0.10195412064570943,
"grad_norm": 0.3203125,
"learning_rate": 0.00019999731865174213,
"loss": 1.0625,
"mean_token_accuracy": 0.7478782564401627,
"num_tokens": 101352.0,
"step": 30
},
{
"entropy": 0.9986169628798962,
"epoch": 0.13593882752761258,
"grad_norm": 0.2890625,
"learning_rate": 0.00019990348656007261,
"loss": 1.0079,
"mean_token_accuracy": 0.7581020414829254,
"num_tokens": 133883.0,
"step": 40
},
{
"entropy": 1.05185257345438,
"epoch": 0.16992353440951571,
"grad_norm": 0.294921875,
"learning_rate": 0.00019967573081342103,
"loss": 1.0335,
"mean_token_accuracy": 0.7543211042881012,
"num_tokens": 168562.0,
"step": 50
},
{
"entropy": 0.9918051429092885,
"epoch": 0.20390824129141885,
"grad_norm": 0.263671875,
"learning_rate": 0.00019931435672527624,
"loss": 0.9977,
"mean_token_accuracy": 0.755571472644806,
"num_tokens": 202299.0,
"step": 60
},
{
"entropy": 0.8909615643322468,
"epoch": 0.23789294817332202,
"grad_norm": 0.298828125,
"learning_rate": 0.00019881984872856817,
"loss": 0.9168,
"mean_token_accuracy": 0.7724857151508331,
"num_tokens": 237132.0,
"step": 70
},
{
"entropy": 0.9534411959350109,
"epoch": 0.27187765505522515,
"grad_norm": 0.3203125,
"learning_rate": 0.00019819286972627066,
"loss": 0.9673,
"mean_token_accuracy": 0.7637621074914932,
"num_tokens": 271644.0,
"step": 80
},
{
"entropy": 0.9248364262282849,
"epoch": 0.3058623619371283,
"grad_norm": 0.26171875,
"learning_rate": 0.00019743426020275994,
"loss": 0.9077,
"mean_token_accuracy": 0.7676862999796867,
"num_tokens": 306939.0,
"step": 90
},
{
"entropy": 1.0025891564786433,
"epoch": 0.33984706881903143,
"grad_norm": 0.28125,
"learning_rate": 0.00019654503709711982,
"loss": 1.0295,
"mean_token_accuracy": 0.7536625176668167,
"num_tokens": 341079.0,
"step": 100
},
{
"epoch": 0.33984706881903143,
"eval_entropy": 0.9100993389175052,
"eval_loss": 0.9260265231132507,
"eval_mean_token_accuracy": 0.7760482584538103,
"eval_num_tokens": 341079.0,
"eval_runtime": 140.9825,
"eval_samples_per_second": 1.043,
"eval_steps_per_second": 1.043,
"step": 100
},
{
"entropy": 0.9492803812026978,
"epoch": 0.37383177570093457,
"grad_norm": 0.2578125,
"learning_rate": 0.00019552639243990402,
"loss": 0.9391,
"mean_token_accuracy": 0.7675268396735191,
"num_tokens": 374431.0,
"step": 110
},
{
"entropy": 0.86210857629776,
"epoch": 0.4078164825828377,
"grad_norm": 0.30078125,
"learning_rate": 0.00019437969175518295,
"loss": 0.8868,
"mean_token_accuracy": 0.7805617764592171,
"num_tokens": 410401.0,
"step": 120
},
{
"entropy": 0.9226490631699562,
"epoch": 0.44180118946474084,
"grad_norm": 0.3046875,
"learning_rate": 0.0001931064722300175,
"loss": 0.9118,
"mean_token_accuracy": 0.7770121544599533,
"num_tokens": 443280.0,
"step": 130
},
{
"entropy": 0.8999310165643692,
"epoch": 0.47578589634664403,
"grad_norm": 0.2734375,
"learning_rate": 0.00019170844065381285,
"loss": 0.9117,
"mean_token_accuracy": 0.7742907002568244,
"num_tokens": 478900.0,
"step": 140
},
{
"entropy": 0.8831319987773896,
"epoch": 0.5097706032285472,
"grad_norm": 0.255859375,
"learning_rate": 0.00019018747113031564,
"loss": 0.8893,
"mean_token_accuracy": 0.7864658862352372,
"num_tokens": 513687.0,
"step": 150
},
{
"entropy": 0.8636823572218418,
"epoch": 0.5437553101104503,
"grad_norm": 0.490234375,
"learning_rate": 0.000188545602565321,
"loss": 0.8824,
"mean_token_accuracy": 0.7811477512121201,
"num_tokens": 549223.0,
"step": 160
},
{
"entropy": 0.9621140897274018,
"epoch": 0.5777400169923534,
"grad_norm": 0.30859375,
"learning_rate": 0.00018678503593345754,
"loss": 0.959,
"mean_token_accuracy": 0.765991534292698,
"num_tokens": 582348.0,
"step": 170
},
{
"entropy": 0.9770274326205254,
"epoch": 0.6117247238742566,
"grad_norm": 0.345703125,
"learning_rate": 0.00018490813132771393,
"loss": 0.9698,
"mean_token_accuracy": 0.767307311296463,
"num_tokens": 614105.0,
"step": 180
},
{
"entropy": 0.8605956405401229,
"epoch": 0.6457094307561597,
"grad_norm": 0.408203125,
"learning_rate": 0.00018291740479566283,
"loss": 0.8559,
"mean_token_accuracy": 0.7881909653544426,
"num_tokens": 649045.0,
"step": 190
},
{
"entropy": 0.8583430543541908,
"epoch": 0.6796941376380629,
"grad_norm": 0.26953125,
"learning_rate": 0.00018081552496662258,
"loss": 0.8807,
"mean_token_accuracy": 0.7877280384302139,
"num_tokens": 681327.0,
"step": 200
},
{
"epoch": 0.6796941376380629,
"eval_entropy": 0.896664229785504,
"eval_loss": 0.89118492603302,
"eval_mean_token_accuracy": 0.7833221583139329,
"eval_num_tokens": 681327.0,
"eval_runtime": 140.9445,
"eval_samples_per_second": 1.043,
"eval_steps_per_second": 1.043,
"step": 200
},
{
"entropy": 0.9148187682032585,
"epoch": 0.713678844519966,
"grad_norm": 0.287109375,
"learning_rate": 0.00017860530947427875,
"loss": 0.9274,
"mean_token_accuracy": 0.7749700739979744,
"num_tokens": 714260.0,
"step": 210
},
{
"entropy": 0.923950819671154,
"epoch": 0.7476635514018691,
"grad_norm": 0.32421875,
"learning_rate": 0.0001762897211795607,
"loss": 0.8731,
"mean_token_accuracy": 0.7750329807400703,
"num_tokens": 746221.0,
"step": 220
},
{
"entropy": 1.0102886088192462,
"epoch": 0.7816482582837723,
"grad_norm": 0.330078125,
"learning_rate": 0.0001738718641988365,
"loss": 1.0153,
"mean_token_accuracy": 0.7520556971430779,
"num_tokens": 781925.0,
"step": 230
},
{
"entropy": 0.8034560449421406,
"epoch": 0.8156329651656754,
"grad_norm": 0.32421875,
"learning_rate": 0.00017135497974275088,
"loss": 0.8024,
"mean_token_accuracy": 0.8065776988863945,
"num_tokens": 819257.0,
"step": 240
},
{
"entropy": 0.8587650842964649,
"epoch": 0.8496176720475785,
"grad_norm": 0.244140625,
"learning_rate": 0.00016874244177128396,
"loss": 0.8729,
"mean_token_accuracy": 0.7839998126029968,
"num_tokens": 854177.0,
"step": 250
},
{
"entropy": 0.8471586465835571,
"epoch": 0.8836023789294817,
"grad_norm": 0.333984375,
"learning_rate": 0.00016603775247085546,
"loss": 0.8693,
"mean_token_accuracy": 0.7884811326861382,
"num_tokens": 887426.0,
"step": 260
},
{
"entropy": 0.8740401305258274,
"epoch": 0.9175870858113849,
"grad_norm": 0.3125,
"learning_rate": 0.00016324453755953773,
"loss": 0.881,
"mean_token_accuracy": 0.787057913839817,
"num_tokens": 918169.0,
"step": 270
},
{
"entropy": 0.8283644251525402,
"epoch": 0.9515717926932881,
"grad_norm": 0.32421875,
"learning_rate": 0.00016036654142667043,
"loss": 0.8293,
"mean_token_accuracy": 0.7984860435128212,
"num_tokens": 953966.0,
"step": 280
},
{
"entropy": 0.8554800219833851,
"epoch": 0.9855564995751912,
"grad_norm": 0.40234375,
"learning_rate": 0.00015740762211339314,
"loss": 0.8646,
"mean_token_accuracy": 0.7841584324836731,
"num_tokens": 986605.0,
"step": 290
},
{
"entropy": 0.8761118473233404,
"epoch": 1.0169923534409515,
"grad_norm": 0.29296875,
"learning_rate": 0.00015437174614082416,
"loss": 0.9419,
"mean_token_accuracy": 0.7891149762514476,
"num_tokens": 1018372.0,
"step": 300
},
{
"epoch": 1.0169923534409515,
"eval_entropy": 0.8560160027474774,
"eval_loss": 0.8672717809677124,
"eval_mean_token_accuracy": 0.7890248249988167,
"eval_num_tokens": 1018372.0,
"eval_runtime": 140.8836,
"eval_samples_per_second": 1.043,
"eval_steps_per_second": 1.043,
"step": 300
},
{
"entropy": 0.835958081483841,
"epoch": 1.0509770603228548,
"grad_norm": 0.3515625,
"learning_rate": 0.00015126298319281857,
"loss": 0.8349,
"mean_token_accuracy": 0.7921741649508476,
"num_tokens": 1053253.0,
"step": 310
},
{
"entropy": 0.8159842237830162,
"epoch": 1.0849617672047578,
"grad_norm": 0.46875,
"learning_rate": 0.00014808550066043352,
"loss": 0.8036,
"mean_token_accuracy": 0.7972497373819352,
"num_tokens": 1087084.0,
"step": 320
},
{
"entropy": 0.8521868549287319,
"epoch": 1.118946474086661,
"grad_norm": 0.330078125,
"learning_rate": 0.00014484355805541413,
"loss": 0.9106,
"mean_token_accuracy": 0.7848088085651398,
"num_tokens": 1121833.0,
"step": 330
},
{
"entropy": 0.8146591357886791,
"epoch": 1.152931180968564,
"grad_norm": 0.412109375,
"learning_rate": 0.00014154150130018866,
"loss": 0.7855,
"mean_token_accuracy": 0.7983845800161362,
"num_tokens": 1155315.0,
"step": 340
},
{
"entropy": 0.8469755969941616,
"epoch": 1.1869158878504673,
"grad_norm": 0.455078125,
"learning_rate": 0.00013818375690202774,
"loss": 0.8677,
"mean_token_accuracy": 0.7855840787291527,
"num_tokens": 1188884.0,
"step": 350
},
{
"entropy": 0.8589501097798348,
"epoch": 1.2209005947323703,
"grad_norm": 0.38671875,
"learning_rate": 0.000134774826019177,
"loss": 0.8452,
"mean_token_accuracy": 0.786843791604042,
"num_tokens": 1223117.0,
"step": 360
},
{
"entropy": 0.8148165218532085,
"epoch": 1.2548853016142736,
"grad_norm": 0.365234375,
"learning_rate": 0.0001313192784269179,
"loss": 0.7977,
"mean_token_accuracy": 0.7989379152655601,
"num_tokens": 1257970.0,
"step": 370
},
{
"entropy": 0.8345869470387697,
"epoch": 1.2888700084961768,
"grad_norm": 0.37109375,
"learning_rate": 0.0001278217463916453,
"loss": 0.8308,
"mean_token_accuracy": 0.7944282591342926,
"num_tokens": 1294426.0,
"step": 380
},
{
"entropy": 0.8046329416334629,
"epoch": 1.3228547153780799,
"grad_norm": 0.49609375,
"learning_rate": 0.00012428691846117372,
"loss": 0.7869,
"mean_token_accuracy": 0.8005647033452987,
"num_tokens": 1327540.0,
"step": 390
},
{
"entropy": 0.8379782371222972,
"epoch": 1.3568394222599829,
"grad_norm": 0.4375,
"learning_rate": 0.00012071953317959692,
"loss": 0.8281,
"mean_token_accuracy": 0.7926659971475601,
"num_tokens": 1361518.0,
"step": 400
},
{
"epoch": 1.3568394222599829,
"eval_entropy": 0.8221898901016533,
"eval_loss": 0.8522771000862122,
"eval_mean_token_accuracy": 0.7922491832655303,
"eval_num_tokens": 1361518.0,
"eval_runtime": 140.9731,
"eval_samples_per_second": 1.043,
"eval_steps_per_second": 1.043,
"step": 400
},
{
"entropy": 0.8128557071089745,
"epoch": 1.3908241291418861,
"grad_norm": 0.376953125,
"learning_rate": 0.00011712437273512561,
"loss": 0.7984,
"mean_token_accuracy": 0.7952910155057907,
"num_tokens": 1395450.0,
"step": 410
},
{
"entropy": 0.7772056467831134,
"epoch": 1.4248088360237894,
"grad_norm": 0.388671875,
"learning_rate": 0.00011350625654941918,
"loss": 0.7676,
"mean_token_accuracy": 0.8111284270882606,
"num_tokens": 1429377.0,
"step": 420
},
{
"entropy": 0.77816668972373,
"epoch": 1.4587935429056924,
"grad_norm": 0.357421875,
"learning_rate": 0.00010987003481700455,
"loss": 0.7709,
"mean_token_accuracy": 0.8049451917409897,
"num_tokens": 1462442.0,
"step": 430
},
{
"entropy": 0.8393321461975575,
"epoch": 1.4927782497875957,
"grad_norm": 0.392578125,
"learning_rate": 0.00010622058200344344,
"loss": 0.8423,
"mean_token_accuracy": 0.7850104674696923,
"num_tokens": 1497205.0,
"step": 440
},
{
"entropy": 0.8927516497671604,
"epoch": 1.5267629566694987,
"grad_norm": 0.34375,
"learning_rate": 0.00010256279031096328,
"loss": 0.8664,
"mean_token_accuracy": 0.7828694671392441,
"num_tokens": 1531243.0,
"step": 450
},
{
"entropy": 0.7833915807306766,
"epoch": 1.560747663551402,
"grad_norm": 0.466796875,
"learning_rate": 9.890156312031163e-05,
"loss": 0.7676,
"mean_token_accuracy": 0.8062808975577355,
"num_tokens": 1565238.0,
"step": 460
},
{
"entropy": 0.8227095231413841,
"epoch": 1.594732370433305,
"grad_norm": 0.357421875,
"learning_rate": 9.524180841762577e-05,
"loss": 0.8238,
"mean_token_accuracy": 0.796034836769104,
"num_tokens": 1599147.0,
"step": 470
},
{
"entropy": 0.7979710936546326,
"epoch": 1.6287170773152082,
"grad_norm": 0.365234375,
"learning_rate": 9.1588432215128e-05,
"loss": 0.782,
"mean_token_accuracy": 0.8032634913921356,
"num_tokens": 1632572.0,
"step": 480
},
{
"entropy": 0.7890071399509907,
"epoch": 1.6627017841971115,
"grad_norm": 0.5,
"learning_rate": 8.79463319744677e-05,
"loss": 0.8058,
"mean_token_accuracy": 0.8048980295658111,
"num_tokens": 1664018.0,
"step": 490
},
{
"entropy": 0.8366815261542797,
"epoch": 1.6966864910790145,
"grad_norm": 0.34375,
"learning_rate": 8.432039004152519e-05,
"loss": 0.8222,
"mean_token_accuracy": 0.7975986883044243,
"num_tokens": 1698322.0,
"step": 500
},
{
"epoch": 1.6966864910790145,
"eval_entropy": 0.8350493262211481,
"eval_loss": 0.8397489786148071,
"eval_mean_token_accuracy": 0.7947951194380416,
"eval_num_tokens": 1698322.0,
"eval_runtime": 140.612,
"eval_samples_per_second": 1.045,
"eval_steps_per_second": 1.045,
"step": 500
},
{
"entropy": 0.7970431953668594,
"epoch": 1.7306711979609175,
"grad_norm": 0.5546875,
"learning_rate": 8.071546710147911e-05,
"loss": 0.7672,
"mean_token_accuracy": 0.8059538155794144,
"num_tokens": 1732641.0,
"step": 510
},
{
"entropy": 0.8044024340808391,
"epoch": 1.7646559048428208,
"grad_norm": 0.470703125,
"learning_rate": 7.713639566291027e-05,
"loss": 0.8155,
"mean_token_accuracy": 0.7996707066893578,
"num_tokens": 1766698.0,
"step": 520
},
{
"entropy": 0.7848532184958458,
"epoch": 1.798640611724724,
"grad_norm": 0.390625,
"learning_rate": 7.358797357967749e-05,
"loss": 0.7718,
"mean_token_accuracy": 0.8004196003079415,
"num_tokens": 1798260.0,
"step": 530
},
{
"entropy": 0.8484033491462469,
"epoch": 1.832625318606627,
"grad_norm": 0.44140625,
"learning_rate": 7.007495761924862e-05,
"loss": 0.8354,
"mean_token_accuracy": 0.7922118753194809,
"num_tokens": 1834411.0,
"step": 540
},
{
"entropy": 0.8166721411049366,
"epoch": 1.86661002548853,
"grad_norm": 0.390625,
"learning_rate": 6.660205708610987e-05,
"loss": 0.8533,
"mean_token_accuracy": 0.7995400235056878,
"num_tokens": 1868051.0,
"step": 550
},
{
"entropy": 0.7962750904262066,
"epoch": 1.9005947323704333,
"grad_norm": 0.326171875,
"learning_rate": 6.317392750879978e-05,
"loss": 0.7929,
"mean_token_accuracy": 0.8041162744164467,
"num_tokens": 1902715.0,
"step": 560
},
{
"entropy": 0.8834932953119278,
"epoch": 1.9345794392523366,
"grad_norm": 0.416015625,
"learning_rate": 5.979516439903221e-05,
"loss": 0.8764,
"mean_token_accuracy": 0.7862503513693809,
"num_tokens": 1934823.0,
"step": 570
},
{
"entropy": 0.7844171606004238,
"epoch": 1.9685641461342396,
"grad_norm": 0.33984375,
"learning_rate": 5.647029709127355e-05,
"loss": 0.805,
"mean_token_accuracy": 0.8080376788973809,
"num_tokens": 1970939.0,
"step": 580
},
{
"entropy": 0.7350320663001086,
"epoch": 2.0,
"grad_norm": 1.3671875,
"learning_rate": 5.3203782671032055e-05,
"loss": 0.7487,
"mean_token_accuracy": 0.8147893048621513,
"num_tokens": 2001880.0,
"step": 590
},
{
"entropy": 0.7770363502204418,
"epoch": 2.033984706881903,
"grad_norm": 0.5078125,
"learning_rate": 5.000000000000002e-05,
"loss": 0.7848,
"mean_token_accuracy": 0.8036531031131744,
"num_tokens": 2032980.0,
"step": 600
},
{
"epoch": 2.033984706881903,
"eval_entropy": 0.7900621154073144,
"eval_loss": 0.8340142369270325,
"eval_mean_token_accuracy": 0.7969201965396907,
"eval_num_tokens": 2032980.0,
"eval_runtime": 140.9352,
"eval_samples_per_second": 1.043,
"eval_steps_per_second": 1.043,
"step": 600
},
{
"entropy": 0.756380145996809,
"epoch": 2.0679694137638065,
"grad_norm": 0.458984375,
"learning_rate": 4.686324384605629e-05,
"loss": 0.7823,
"mean_token_accuracy": 0.8089984133839607,
"num_tokens": 2067951.0,
"step": 610
},
{
"entropy": 0.7689781740307808,
"epoch": 2.1019541206457095,
"grad_norm": 0.341796875,
"learning_rate": 4.3797719126e-05,
"loss": 0.7489,
"mean_token_accuracy": 0.8059608668088913,
"num_tokens": 2102690.0,
"step": 620
},
{
"entropy": 0.7978510297834873,
"epoch": 2.1359388275276125,
"grad_norm": 0.421875,
"learning_rate": 4.08075352687318e-05,
"loss": 0.7711,
"mean_token_accuracy": 0.8028671458363533,
"num_tokens": 2136553.0,
"step": 630
},
{
"entropy": 0.7843278538435697,
"epoch": 2.1699235344095156,
"grad_norm": 0.51953125,
"learning_rate": 3.789670070643982e-05,
"loss": 0.7817,
"mean_token_accuracy": 0.8063826531171798,
"num_tokens": 2170922.0,
"step": 640
},
{
"entropy": 0.8048521246761083,
"epoch": 2.203908241291419,
"grad_norm": 0.5078125,
"learning_rate": 3.506911750117469e-05,
"loss": 0.8171,
"mean_token_accuracy": 0.7995030015707016,
"num_tokens": 2205971.0,
"step": 650
},
{
"entropy": 0.7707572512328624,
"epoch": 2.237892948173322,
"grad_norm": 0.453125,
"learning_rate": 3.232857611401693e-05,
"loss": 0.7546,
"mean_token_accuracy": 0.8126760870218277,
"num_tokens": 2241219.0,
"step": 660
},
{
"entropy": 0.868005882203579,
"epoch": 2.271877655055225,
"grad_norm": 0.466796875,
"learning_rate": 2.9678750323848893e-05,
"loss": 0.8657,
"mean_token_accuracy": 0.7834504991769791,
"num_tokens": 2275600.0,
"step": 670
},
{
"entropy": 0.7262178905308246,
"epoch": 2.305862361937128,
"grad_norm": 0.341796875,
"learning_rate": 2.71231923025427e-05,
"loss": 0.7005,
"mean_token_accuracy": 0.821708083152771,
"num_tokens": 2310278.0,
"step": 680
},
{
"entropy": 0.7482855342328548,
"epoch": 2.3398470688190316,
"grad_norm": 0.4453125,
"learning_rate": 2.4665327853166075e-05,
"loss": 0.745,
"mean_token_accuracy": 0.8137555688619613,
"num_tokens": 2344515.0,
"step": 690
},
{
"entropy": 0.7359024606645107,
"epoch": 2.3738317757009346,
"grad_norm": 0.384765625,
"learning_rate": 2.2308451817589283e-05,
"loss": 0.7407,
"mean_token_accuracy": 0.8139674678444863,
"num_tokens": 2378108.0,
"step": 700
},
{
"epoch": 2.3738317757009346,
"eval_entropy": 0.7895172556646827,
"eval_loss": 0.8320378661155701,
"eval_mean_token_accuracy": 0.7975128520102728,
"eval_num_tokens": 2378108.0,
"eval_runtime": 140.9707,
"eval_samples_per_second": 1.043,
"eval_steps_per_second": 1.043,
"step": 700
}
],
"logging_steps": 10,
"max_steps": 885,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.439848947662848e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}