Raiff1982's picture
Upload folder using huggingface_hub
9b66615 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 939,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 2.771598082780838,
"epoch": 0.032,
"grad_norm": 0.2451171875,
"learning_rate": 6.206896551724138e-05,
"loss": 2.916146087646484,
"mean_token_accuracy": 0.45416649207472803,
"num_tokens": 55650.0,
"step": 10
},
{
"entropy": 2.3006236433982847,
"epoch": 0.064,
"grad_norm": 0.23828125,
"learning_rate": 0.00013103448275862068,
"loss": 2.4864336013793946,
"mean_token_accuracy": 0.5021730229258538,
"num_tokens": 110943.0,
"step": 20
},
{
"entropy": 1.8387477725744248,
"epoch": 0.096,
"grad_norm": 0.287109375,
"learning_rate": 0.0002,
"loss": 1.766120147705078,
"mean_token_accuracy": 0.6066308304667473,
"num_tokens": 166476.0,
"step": 30
},
{
"entropy": 1.2004003927111626,
"epoch": 0.128,
"grad_norm": 0.3203125,
"learning_rate": 0.0001978021978021978,
"loss": 1.1438531875610352,
"mean_token_accuracy": 0.7330661401152611,
"num_tokens": 221809.0,
"step": 40
},
{
"entropy": 0.8197565108537674,
"epoch": 0.16,
"grad_norm": 0.34765625,
"learning_rate": 0.00019560439560439562,
"loss": 0.7494671821594239,
"mean_token_accuracy": 0.8239153817296028,
"num_tokens": 277841.0,
"step": 50
},
{
"entropy": 0.5685673624277114,
"epoch": 0.192,
"grad_norm": 0.369140625,
"learning_rate": 0.00019340659340659342,
"loss": 0.49509191513061523,
"mean_token_accuracy": 0.8886354997754097,
"num_tokens": 333258.0,
"step": 60
},
{
"entropy": 0.36241610124707224,
"epoch": 0.224,
"grad_norm": 0.306640625,
"learning_rate": 0.00019120879120879122,
"loss": 0.29932661056518556,
"mean_token_accuracy": 0.9331418961286545,
"num_tokens": 388437.0,
"step": 70
},
{
"entropy": 0.2608797810971737,
"epoch": 0.256,
"grad_norm": 0.322265625,
"learning_rate": 0.00018901098901098903,
"loss": 0.20479197502136232,
"mean_token_accuracy": 0.9527293920516968,
"num_tokens": 444483.0,
"step": 80
},
{
"entropy": 0.20162589177489282,
"epoch": 0.288,
"grad_norm": 0.1904296875,
"learning_rate": 0.00018681318681318683,
"loss": 0.16634706258773804,
"mean_token_accuracy": 0.9603863671422005,
"num_tokens": 500408.0,
"step": 90
},
{
"entropy": 0.1615206029266119,
"epoch": 0.32,
"grad_norm": 0.1904296875,
"learning_rate": 0.00018461538461538463,
"loss": 0.14311870336532592,
"mean_token_accuracy": 0.9628557220101357,
"num_tokens": 556058.0,
"step": 100
},
{
"entropy": 0.1455086786299944,
"epoch": 0.352,
"grad_norm": 0.1796875,
"learning_rate": 0.0001824175824175824,
"loss": 0.1271807074546814,
"mean_token_accuracy": 0.9651171401143074,
"num_tokens": 611407.0,
"step": 110
},
{
"entropy": 0.13460372295230627,
"epoch": 0.384,
"grad_norm": 0.1572265625,
"learning_rate": 0.00018021978021978024,
"loss": 0.11452269554138184,
"mean_token_accuracy": 0.9668730065226555,
"num_tokens": 667323.0,
"step": 120
},
{
"entropy": 0.126934945769608,
"epoch": 0.416,
"grad_norm": 0.10986328125,
"learning_rate": 0.00017802197802197802,
"loss": 0.10631006956100464,
"mean_token_accuracy": 0.9682586997747421,
"num_tokens": 723233.0,
"step": 130
},
{
"entropy": 0.12308279145509005,
"epoch": 0.448,
"grad_norm": 0.1806640625,
"learning_rate": 0.00017582417582417582,
"loss": 0.09787563681602478,
"mean_token_accuracy": 0.9693531423807145,
"num_tokens": 779623.0,
"step": 140
},
{
"entropy": 0.11310338769108057,
"epoch": 0.48,
"grad_norm": 0.1552734375,
"learning_rate": 0.00017362637362637365,
"loss": 0.09501280188560486,
"mean_token_accuracy": 0.9697160139679909,
"num_tokens": 835343.0,
"step": 150
},
{
"entropy": 0.10735327322036028,
"epoch": 0.512,
"grad_norm": 0.12109375,
"learning_rate": 0.00017142857142857143,
"loss": 0.0945986807346344,
"mean_token_accuracy": 0.9699150815606117,
"num_tokens": 890526.0,
"step": 160
},
{
"entropy": 0.10488443765789271,
"epoch": 0.544,
"grad_norm": 0.08935546875,
"learning_rate": 0.00016923076923076923,
"loss": 0.09071503281593322,
"mean_token_accuracy": 0.9702877476811409,
"num_tokens": 946551.0,
"step": 170
},
{
"entropy": 0.09990130253136158,
"epoch": 0.576,
"grad_norm": 0.10986328125,
"learning_rate": 0.00016703296703296706,
"loss": 0.08788512349128723,
"mean_token_accuracy": 0.9700309321284294,
"num_tokens": 1002250.0,
"step": 180
},
{
"entropy": 0.1006152719259262,
"epoch": 0.608,
"grad_norm": 0.138671875,
"learning_rate": 0.00016483516483516484,
"loss": 0.08786565065383911,
"mean_token_accuracy": 0.9703836083412171,
"num_tokens": 1057942.0,
"step": 190
},
{
"entropy": 0.09826808385550975,
"epoch": 0.64,
"grad_norm": 0.10693359375,
"learning_rate": 0.00016263736263736264,
"loss": 0.08827171325683594,
"mean_token_accuracy": 0.9702799677848816,
"num_tokens": 1113313.0,
"step": 200
},
{
"entropy": 0.09519640635699034,
"epoch": 0.672,
"grad_norm": 0.109375,
"learning_rate": 0.00016043956043956044,
"loss": 0.08511611819267273,
"mean_token_accuracy": 0.9707273244857788,
"num_tokens": 1168866.0,
"step": 210
},
{
"entropy": 0.09626698959618807,
"epoch": 0.704,
"grad_norm": 0.07958984375,
"learning_rate": 0.00015824175824175824,
"loss": 0.08481809496879578,
"mean_token_accuracy": 0.971166367828846,
"num_tokens": 1224112.0,
"step": 220
},
{
"entropy": 0.08999720010906458,
"epoch": 0.736,
"grad_norm": 0.11962890625,
"learning_rate": 0.00015604395604395605,
"loss": 0.08373072743415833,
"mean_token_accuracy": 0.9710352584719658,
"num_tokens": 1279497.0,
"step": 230
},
{
"entropy": 0.0909602127969265,
"epoch": 0.768,
"grad_norm": 0.08251953125,
"learning_rate": 0.00015384615384615385,
"loss": 0.0811634123325348,
"mean_token_accuracy": 0.9713989913463592,
"num_tokens": 1335147.0,
"step": 240
},
{
"entropy": 0.08677519466727972,
"epoch": 0.8,
"grad_norm": 0.0791015625,
"learning_rate": 0.00015164835164835165,
"loss": 0.08006779551506042,
"mean_token_accuracy": 0.9713309288024903,
"num_tokens": 1390441.0,
"step": 250
},
{
"entropy": 0.08524699918925763,
"epoch": 0.832,
"grad_norm": 0.1142578125,
"learning_rate": 0.00014945054945054946,
"loss": 0.07839072942733764,
"mean_token_accuracy": 0.9715656459331512,
"num_tokens": 1446334.0,
"step": 260
},
{
"entropy": 0.08736930266022683,
"epoch": 0.864,
"grad_norm": 0.10791015625,
"learning_rate": 0.00014725274725274726,
"loss": 0.07965806126594543,
"mean_token_accuracy": 0.971031291782856,
"num_tokens": 1502214.0,
"step": 270
},
{
"entropy": 0.08625259138643741,
"epoch": 0.896,
"grad_norm": 0.08447265625,
"learning_rate": 0.00014505494505494506,
"loss": 0.0801069438457489,
"mean_token_accuracy": 0.9713141709566117,
"num_tokens": 1557731.0,
"step": 280
},
{
"entropy": 0.08634743597358466,
"epoch": 0.928,
"grad_norm": 0.06298828125,
"learning_rate": 0.00014285714285714287,
"loss": 0.07907066345214844,
"mean_token_accuracy": 0.9716951295733451,
"num_tokens": 1613126.0,
"step": 290
},
{
"entropy": 0.08533936887979507,
"epoch": 0.96,
"grad_norm": 0.06103515625,
"learning_rate": 0.00014065934065934067,
"loss": 0.07907315492630004,
"mean_token_accuracy": 0.9721988439559937,
"num_tokens": 1668452.0,
"step": 300
},
{
"entropy": 0.08410668671131134,
"epoch": 0.992,
"grad_norm": 0.0791015625,
"learning_rate": 0.00013846153846153847,
"loss": 0.07861064672470093,
"mean_token_accuracy": 0.971791522204876,
"num_tokens": 1724177.0,
"step": 310
},
{
"entropy": 0.08109197275418985,
"epoch": 1.0224,
"grad_norm": 0.07177734375,
"learning_rate": 0.00013626373626373628,
"loss": 0.07589302062988282,
"mean_token_accuracy": 0.9723429350476516,
"num_tokens": 1777420.0,
"step": 320
},
{
"entropy": 0.0814886923879385,
"epoch": 1.0544,
"grad_norm": 0.10693359375,
"learning_rate": 0.00013406593406593405,
"loss": 0.07642998099327088,
"mean_token_accuracy": 0.97195183634758,
"num_tokens": 1833060.0,
"step": 330
},
{
"entropy": 0.08034903313964606,
"epoch": 1.0864,
"grad_norm": 0.07373046875,
"learning_rate": 0.00013186813186813188,
"loss": 0.07447389960289001,
"mean_token_accuracy": 0.9732914686203002,
"num_tokens": 1889075.0,
"step": 340
},
{
"entropy": 0.07864065244793891,
"epoch": 1.1184,
"grad_norm": 0.08056640625,
"learning_rate": 0.0001296703296703297,
"loss": 0.07513262033462524,
"mean_token_accuracy": 0.972836098074913,
"num_tokens": 1944905.0,
"step": 350
},
{
"entropy": 0.08301715180277824,
"epoch": 1.1504,
"grad_norm": 0.09716796875,
"learning_rate": 0.00012747252747252746,
"loss": 0.07624064683914185,
"mean_token_accuracy": 0.9722976922988892,
"num_tokens": 2000057.0,
"step": 360
},
{
"entropy": 0.08098908923566342,
"epoch": 1.1824,
"grad_norm": 0.059814453125,
"learning_rate": 0.00012527472527472527,
"loss": 0.07458102107048034,
"mean_token_accuracy": 0.9721322387456894,
"num_tokens": 2055866.0,
"step": 370
},
{
"entropy": 0.07686591371893883,
"epoch": 1.2144,
"grad_norm": 0.06396484375,
"learning_rate": 0.0001230769230769231,
"loss": 0.07280548810958862,
"mean_token_accuracy": 0.972561864554882,
"num_tokens": 2111077.0,
"step": 380
},
{
"entropy": 0.07761757280677557,
"epoch": 1.2464,
"grad_norm": 0.07470703125,
"learning_rate": 0.00012087912087912087,
"loss": 0.07433983087539672,
"mean_token_accuracy": 0.9725529655814171,
"num_tokens": 2166081.0,
"step": 390
},
{
"entropy": 0.08011266030371189,
"epoch": 1.2784,
"grad_norm": 0.052001953125,
"learning_rate": 0.00011868131868131869,
"loss": 0.0738287627696991,
"mean_token_accuracy": 0.9728635787963867,
"num_tokens": 2221310.0,
"step": 400
},
{
"entropy": 0.0769817665219307,
"epoch": 1.3104,
"grad_norm": 0.054931640625,
"learning_rate": 0.0001164835164835165,
"loss": 0.07387230396270753,
"mean_token_accuracy": 0.9729015439748764,
"num_tokens": 2277107.0,
"step": 410
},
{
"entropy": 0.07817615140229464,
"epoch": 1.3424,
"grad_norm": 0.06787109375,
"learning_rate": 0.00011428571428571428,
"loss": 0.07262731790542602,
"mean_token_accuracy": 0.9729507148265839,
"num_tokens": 2332758.0,
"step": 420
},
{
"entropy": 0.07688614577054978,
"epoch": 1.3744,
"grad_norm": 0.051025390625,
"learning_rate": 0.0001120879120879121,
"loss": 0.07327454686164855,
"mean_token_accuracy": 0.9719040498137475,
"num_tokens": 2388461.0,
"step": 430
},
{
"entropy": 0.07903551124036312,
"epoch": 1.4064,
"grad_norm": 0.05126953125,
"learning_rate": 0.0001098901098901099,
"loss": 0.07202324867248536,
"mean_token_accuracy": 0.9729711979627609,
"num_tokens": 2443802.0,
"step": 440
},
{
"entropy": 0.07504601553082466,
"epoch": 1.4384000000000001,
"grad_norm": 0.0966796875,
"learning_rate": 0.0001076923076923077,
"loss": 0.07251456379890442,
"mean_token_accuracy": 0.9728567853569985,
"num_tokens": 2498886.0,
"step": 450
},
{
"entropy": 0.07635734435170889,
"epoch": 1.4704,
"grad_norm": 0.07861328125,
"learning_rate": 0.0001054945054945055,
"loss": 0.07308706045150756,
"mean_token_accuracy": 0.9728769212961197,
"num_tokens": 2554546.0,
"step": 460
},
{
"entropy": 0.07706241644918918,
"epoch": 1.5024,
"grad_norm": 0.052734375,
"learning_rate": 0.00010329670329670331,
"loss": 0.0728609800338745,
"mean_token_accuracy": 0.9726115748286247,
"num_tokens": 2609939.0,
"step": 470
},
{
"entropy": 0.07556705921888351,
"epoch": 1.5344,
"grad_norm": 0.087890625,
"learning_rate": 0.0001010989010989011,
"loss": 0.07160326838493347,
"mean_token_accuracy": 0.973236757516861,
"num_tokens": 2665583.0,
"step": 480
},
{
"entropy": 0.07504178639501333,
"epoch": 1.5664,
"grad_norm": 0.08349609375,
"learning_rate": 9.89010989010989e-05,
"loss": 0.0718912661075592,
"mean_token_accuracy": 0.9726392358541489,
"num_tokens": 2721224.0,
"step": 490
},
{
"entropy": 0.07667357344180345,
"epoch": 1.5984,
"grad_norm": 0.044921875,
"learning_rate": 9.670329670329671e-05,
"loss": 0.07309556603431702,
"mean_token_accuracy": 0.9725197270512581,
"num_tokens": 2776793.0,
"step": 500
},
{
"entropy": 0.07603078782558441,
"epoch": 1.6303999999999998,
"grad_norm": 0.0673828125,
"learning_rate": 9.450549450549451e-05,
"loss": 0.07351203560829163,
"mean_token_accuracy": 0.9724631071090698,
"num_tokens": 2832471.0,
"step": 510
},
{
"entropy": 0.07741717118769884,
"epoch": 1.6623999999999999,
"grad_norm": 0.052001953125,
"learning_rate": 9.230769230769232e-05,
"loss": 0.07223351001739502,
"mean_token_accuracy": 0.9723551839590072,
"num_tokens": 2888234.0,
"step": 520
},
{
"entropy": 0.07598806507885456,
"epoch": 1.6944,
"grad_norm": 0.06689453125,
"learning_rate": 9.010989010989012e-05,
"loss": 0.07230474948883056,
"mean_token_accuracy": 0.9722696229815483,
"num_tokens": 2943732.0,
"step": 530
},
{
"entropy": 0.0750182744115591,
"epoch": 1.7264,
"grad_norm": 0.04931640625,
"learning_rate": 8.791208791208791e-05,
"loss": 0.07132035493850708,
"mean_token_accuracy": 0.9732247874140739,
"num_tokens": 2999654.0,
"step": 540
},
{
"entropy": 0.07485976945608855,
"epoch": 1.7584,
"grad_norm": 0.0478515625,
"learning_rate": 8.571428571428571e-05,
"loss": 0.07085888981819152,
"mean_token_accuracy": 0.9734048008918762,
"num_tokens": 3055591.0,
"step": 550
},
{
"entropy": 0.07469552531838416,
"epoch": 1.7904,
"grad_norm": 0.04638671875,
"learning_rate": 8.351648351648353e-05,
"loss": 0.07132892608642578,
"mean_token_accuracy": 0.9727317884564399,
"num_tokens": 3111372.0,
"step": 560
},
{
"entropy": 0.0737810717895627,
"epoch": 1.8224,
"grad_norm": 0.052001953125,
"learning_rate": 8.131868131868132e-05,
"loss": 0.07149158120155334,
"mean_token_accuracy": 0.9739102691411972,
"num_tokens": 3167376.0,
"step": 570
},
{
"entropy": 0.0747382478788495,
"epoch": 1.8544,
"grad_norm": 0.055908203125,
"learning_rate": 7.912087912087912e-05,
"loss": 0.07183558940887451,
"mean_token_accuracy": 0.972593954205513,
"num_tokens": 3222797.0,
"step": 580
},
{
"entropy": 0.07589616179466248,
"epoch": 1.8864,
"grad_norm": 0.04833984375,
"learning_rate": 7.692307692307693e-05,
"loss": 0.07035009264945984,
"mean_token_accuracy": 0.9727584093809127,
"num_tokens": 3278083.0,
"step": 590
},
{
"entropy": 0.07409894913434982,
"epoch": 1.9184,
"grad_norm": 0.041015625,
"learning_rate": 7.472527472527473e-05,
"loss": 0.06983839273452759,
"mean_token_accuracy": 0.9737206190824509,
"num_tokens": 3334115.0,
"step": 600
},
{
"entropy": 0.07298169508576394,
"epoch": 1.9504000000000001,
"grad_norm": 0.055419921875,
"learning_rate": 7.252747252747253e-05,
"loss": 0.07096859216690063,
"mean_token_accuracy": 0.9732470810413361,
"num_tokens": 3389990.0,
"step": 610
},
{
"entropy": 0.07362735010683537,
"epoch": 1.9824000000000002,
"grad_norm": 0.05078125,
"learning_rate": 7.032967032967034e-05,
"loss": 0.0709508240222931,
"mean_token_accuracy": 0.9730613023042679,
"num_tokens": 3445703.0,
"step": 620
},
{
"entropy": 0.07376520100392793,
"epoch": 2.0128,
"grad_norm": 0.050048828125,
"learning_rate": 6.813186813186814e-05,
"loss": 0.06944339275360108,
"mean_token_accuracy": 0.973434633330295,
"num_tokens": 3498933.0,
"step": 630
},
{
"entropy": 0.07382834255695343,
"epoch": 2.0448,
"grad_norm": 0.048583984375,
"learning_rate": 6.593406593406594e-05,
"loss": 0.07052375078201294,
"mean_token_accuracy": 0.9732005745172501,
"num_tokens": 3553934.0,
"step": 640
},
{
"entropy": 0.0728354575112462,
"epoch": 2.0768,
"grad_norm": 0.09619140625,
"learning_rate": 6.373626373626373e-05,
"loss": 0.07018245458602905,
"mean_token_accuracy": 0.9729705214500427,
"num_tokens": 3609458.0,
"step": 650
},
{
"entropy": 0.07463801130652428,
"epoch": 2.1088,
"grad_norm": 0.04736328125,
"learning_rate": 6.153846153846155e-05,
"loss": 0.07015591859817505,
"mean_token_accuracy": 0.9737546548247338,
"num_tokens": 3664572.0,
"step": 660
},
{
"entropy": 0.07311667818576098,
"epoch": 2.1408,
"grad_norm": 0.051513671875,
"learning_rate": 5.9340659340659345e-05,
"loss": 0.06875128149986268,
"mean_token_accuracy": 0.9733711332082748,
"num_tokens": 3720225.0,
"step": 670
},
{
"entropy": 0.0712715208530426,
"epoch": 2.1728,
"grad_norm": 0.0537109375,
"learning_rate": 5.714285714285714e-05,
"loss": 0.06806424856185914,
"mean_token_accuracy": 0.9737527936697006,
"num_tokens": 3776424.0,
"step": 680
},
{
"entropy": 0.07198722306638956,
"epoch": 2.2048,
"grad_norm": 0.046630859375,
"learning_rate": 5.494505494505495e-05,
"loss": 0.06764371991157532,
"mean_token_accuracy": 0.9738867044448852,
"num_tokens": 3832496.0,
"step": 690
},
{
"entropy": 0.07190853431820869,
"epoch": 2.2368,
"grad_norm": 0.0498046875,
"learning_rate": 5.274725274725275e-05,
"loss": 0.06873984336853027,
"mean_token_accuracy": 0.9736966788768768,
"num_tokens": 3888101.0,
"step": 700
},
{
"entropy": 0.07232791539281606,
"epoch": 2.2688,
"grad_norm": 0.048828125,
"learning_rate": 5.054945054945055e-05,
"loss": 0.0678622543811798,
"mean_token_accuracy": 0.9740407422184945,
"num_tokens": 3944308.0,
"step": 710
},
{
"entropy": 0.07142861131578684,
"epoch": 2.3008,
"grad_norm": 0.04931640625,
"learning_rate": 4.8351648351648355e-05,
"loss": 0.06784402132034302,
"mean_token_accuracy": 0.9742121011018753,
"num_tokens": 4000402.0,
"step": 720
},
{
"entropy": 0.07213470414280891,
"epoch": 2.3327999999999998,
"grad_norm": 0.053466796875,
"learning_rate": 4.615384615384616e-05,
"loss": 0.0697720229625702,
"mean_token_accuracy": 0.9730553776025772,
"num_tokens": 4055460.0,
"step": 730
},
{
"entropy": 0.07274228539317847,
"epoch": 2.3648,
"grad_norm": 0.05126953125,
"learning_rate": 4.3956043956043955e-05,
"loss": 0.06925151348114014,
"mean_token_accuracy": 0.9731604158878326,
"num_tokens": 4110709.0,
"step": 740
},
{
"entropy": 0.0725497305393219,
"epoch": 2.3968,
"grad_norm": 0.05029296875,
"learning_rate": 4.1758241758241765e-05,
"loss": 0.0686568260192871,
"mean_token_accuracy": 0.9741124615073204,
"num_tokens": 4166373.0,
"step": 750
},
{
"entropy": 0.07181228250265122,
"epoch": 2.4288,
"grad_norm": 0.05029296875,
"learning_rate": 3.956043956043956e-05,
"loss": 0.06826226711273194,
"mean_token_accuracy": 0.9736026957631111,
"num_tokens": 4222123.0,
"step": 760
},
{
"entropy": 0.0726727832108736,
"epoch": 2.4608,
"grad_norm": 0.046142578125,
"learning_rate": 3.7362637362637365e-05,
"loss": 0.06845790147781372,
"mean_token_accuracy": 0.9734082207083702,
"num_tokens": 4277783.0,
"step": 770
},
{
"entropy": 0.0728993572294712,
"epoch": 2.4928,
"grad_norm": 0.050537109375,
"learning_rate": 3.516483516483517e-05,
"loss": 0.06819941997528076,
"mean_token_accuracy": 0.9740725710988045,
"num_tokens": 4333255.0,
"step": 780
},
{
"entropy": 0.07271347604691983,
"epoch": 2.5248,
"grad_norm": 0.0546875,
"learning_rate": 3.296703296703297e-05,
"loss": 0.06898128986358643,
"mean_token_accuracy": 0.9741450414061547,
"num_tokens": 4388480.0,
"step": 790
},
{
"entropy": 0.07090196693316102,
"epoch": 2.5568,
"grad_norm": 0.04638671875,
"learning_rate": 3.0769230769230774e-05,
"loss": 0.06766563653945923,
"mean_token_accuracy": 0.9739843040704728,
"num_tokens": 4444676.0,
"step": 800
},
{
"entropy": 0.0716133133508265,
"epoch": 2.5888,
"grad_norm": 0.050537109375,
"learning_rate": 2.857142857142857e-05,
"loss": 0.06745712161064148,
"mean_token_accuracy": 0.9742712348699569,
"num_tokens": 4500509.0,
"step": 810
},
{
"entropy": 0.07198168560862542,
"epoch": 2.6208,
"grad_norm": 0.052734375,
"learning_rate": 2.6373626373626374e-05,
"loss": 0.06861351728439331,
"mean_token_accuracy": 0.9730511695146561,
"num_tokens": 4555738.0,
"step": 820
},
{
"entropy": 0.07194693582132458,
"epoch": 2.6528,
"grad_norm": 0.055419921875,
"learning_rate": 2.4175824175824177e-05,
"loss": 0.06737480759620666,
"mean_token_accuracy": 0.9742139622569084,
"num_tokens": 4611454.0,
"step": 830
},
{
"entropy": 0.07176698800176382,
"epoch": 2.6848,
"grad_norm": 0.0537109375,
"learning_rate": 2.1978021978021977e-05,
"loss": 0.06750304102897645,
"mean_token_accuracy": 0.9739004611968994,
"num_tokens": 4667407.0,
"step": 840
},
{
"entropy": 0.07237117197364569,
"epoch": 2.7168,
"grad_norm": 0.04931640625,
"learning_rate": 1.978021978021978e-05,
"loss": 0.06796355247497558,
"mean_token_accuracy": 0.9741956070065498,
"num_tokens": 4723097.0,
"step": 850
},
{
"entropy": 0.07233156580477954,
"epoch": 2.7488,
"grad_norm": 0.07666015625,
"learning_rate": 1.7582417582417584e-05,
"loss": 0.06823940873146057,
"mean_token_accuracy": 0.9739502936601638,
"num_tokens": 4778509.0,
"step": 860
},
{
"entropy": 0.07178980130702257,
"epoch": 2.7808,
"grad_norm": 0.056396484375,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.06750970482826232,
"mean_token_accuracy": 0.9740586042404175,
"num_tokens": 4834011.0,
"step": 870
},
{
"entropy": 0.07171082906425,
"epoch": 2.8128,
"grad_norm": 0.053955078125,
"learning_rate": 1.3186813186813187e-05,
"loss": 0.06713088154792786,
"mean_token_accuracy": 0.9745032519102097,
"num_tokens": 4889429.0,
"step": 880
},
{
"entropy": 0.07134337816387415,
"epoch": 2.8448,
"grad_norm": 0.056884765625,
"learning_rate": 1.0989010989010989e-05,
"loss": 0.06686720848083497,
"mean_token_accuracy": 0.9745988816022872,
"num_tokens": 4944966.0,
"step": 890
},
{
"entropy": 0.07198897190392017,
"epoch": 2.8768000000000002,
"grad_norm": 0.051513671875,
"learning_rate": 8.791208791208792e-06,
"loss": 0.06727443933486939,
"mean_token_accuracy": 0.9737225085496902,
"num_tokens": 5000546.0,
"step": 900
},
{
"entropy": 0.07114618215709925,
"epoch": 2.9088000000000003,
"grad_norm": 0.04833984375,
"learning_rate": 6.5934065934065935e-06,
"loss": 0.0675000011920929,
"mean_token_accuracy": 0.9743592411279678,
"num_tokens": 5056400.0,
"step": 910
},
{
"entropy": 0.07034891471266747,
"epoch": 2.9408,
"grad_norm": 0.05224609375,
"learning_rate": 4.395604395604396e-06,
"loss": 0.06710875034332275,
"mean_token_accuracy": 0.9740387976169587,
"num_tokens": 5111854.0,
"step": 920
},
{
"entropy": 0.07230036649852992,
"epoch": 2.9728,
"grad_norm": 0.05419921875,
"learning_rate": 2.197802197802198e-06,
"loss": 0.06789053678512573,
"mean_token_accuracy": 0.9737495318055153,
"num_tokens": 5167285.0,
"step": 930
}
],
"logging_steps": 10,
"max_steps": 939,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.4252630664691712e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}