{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.005333646240579447,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 5.3336462405794474e-05,
"grad_norm": 0.398686021566391,
"learning_rate": 5e-06,
"loss": 0.1308,
"step": 1
},
{
"epoch": 0.00010667292481158895,
"grad_norm": 0.46305230259895325,
"learning_rate": 1e-05,
"loss": 0.1388,
"step": 2
},
{
"epoch": 0.00016000938721738342,
"grad_norm": 0.3536907136440277,
"learning_rate": 1.5e-05,
"loss": 0.1216,
"step": 3
},
{
"epoch": 0.0002133458496231779,
"grad_norm": 0.32838913798332214,
"learning_rate": 2e-05,
"loss": 0.124,
"step": 4
},
{
"epoch": 0.00026668231202897234,
"grad_norm": 0.3058461546897888,
"learning_rate": 2.5e-05,
"loss": 0.1144,
"step": 5
},
{
"epoch": 0.00032001877443476684,
"grad_norm": 0.3052152097225189,
"learning_rate": 3e-05,
"loss": 0.109,
"step": 6
},
{
"epoch": 0.0003733552368405613,
"grad_norm": 0.3326779901981354,
"learning_rate": 3.5e-05,
"loss": 0.1233,
"step": 7
},
{
"epoch": 0.0004266916992463558,
"grad_norm": 0.3231862187385559,
"learning_rate": 4e-05,
"loss": 0.1099,
"step": 8
},
{
"epoch": 0.00048002816165215024,
"grad_norm": 0.32896554470062256,
"learning_rate": 4.5e-05,
"loss": 0.1199,
"step": 9
},
{
"epoch": 0.0005333646240579447,
"grad_norm": 0.3078831434249878,
"learning_rate": 5e-05,
"loss": 0.1042,
"step": 10
},
{
"epoch": 0.0005867010864637392,
"grad_norm": 0.2900382876396179,
"learning_rate": 5.500000000000001e-05,
"loss": 0.1108,
"step": 11
},
{
"epoch": 0.0006400375488695337,
"grad_norm": 0.28514525294303894,
"learning_rate": 6e-05,
"loss": 0.1092,
"step": 12
},
{
"epoch": 0.0006933740112753282,
"grad_norm": 0.30242955684661865,
"learning_rate": 6.500000000000001e-05,
"loss": 0.1104,
"step": 13
},
{
"epoch": 0.0007467104736811226,
"grad_norm": 0.3053489029407501,
"learning_rate": 7e-05,
"loss": 0.1209,
"step": 14
},
{
"epoch": 0.0008000469360869171,
"grad_norm": 0.32302212715148926,
"learning_rate": 7.500000000000001e-05,
"loss": 0.1273,
"step": 15
},
{
"epoch": 0.0008533833984927116,
"grad_norm": 0.29552462697029114,
"learning_rate": 8e-05,
"loss": 0.12,
"step": 16
},
{
"epoch": 0.0009067198608985061,
"grad_norm": 0.33688342571258545,
"learning_rate": 8.5e-05,
"loss": 0.135,
"step": 17
},
{
"epoch": 0.0009600563233043005,
"grad_norm": 0.29682204127311707,
"learning_rate": 9e-05,
"loss": 0.1224,
"step": 18
},
{
"epoch": 0.001013392785710095,
"grad_norm": 0.33479437232017517,
"learning_rate": 9.5e-05,
"loss": 0.143,
"step": 19
},
{
"epoch": 0.0010667292481158894,
"grad_norm": 0.3345559537410736,
"learning_rate": 0.0001,
"loss": 0.1197,
"step": 20
},
{
"epoch": 0.001120065710521684,
"grad_norm": 0.34470704197883606,
"learning_rate": 9.999999982431556e-05,
"loss": 0.1341,
"step": 21
},
{
"epoch": 0.0011734021729274784,
"grad_norm": 0.3448582887649536,
"learning_rate": 9.999999929726225e-05,
"loss": 0.1404,
"step": 22
},
{
"epoch": 0.001226738635333273,
"grad_norm": 0.33284544944763184,
"learning_rate": 9.999999841884006e-05,
"loss": 0.1399,
"step": 23
},
{
"epoch": 0.0012800750977390674,
"grad_norm": 0.3286615014076233,
"learning_rate": 9.999999718904902e-05,
"loss": 0.1353,
"step": 24
},
{
"epoch": 0.0013334115601448618,
"grad_norm": 0.3405831456184387,
"learning_rate": 9.999999560788912e-05,
"loss": 0.1527,
"step": 25
},
{
"epoch": 0.0013867480225506564,
"grad_norm": 0.3199956715106964,
"learning_rate": 9.999999367536037e-05,
"loss": 0.1469,
"step": 26
},
{
"epoch": 0.0014400844849564508,
"grad_norm": 0.3393600881099701,
"learning_rate": 9.999999139146278e-05,
"loss": 0.1536,
"step": 27
},
{
"epoch": 0.0014934209473622452,
"grad_norm": 0.3263189196586609,
"learning_rate": 9.99999887561964e-05,
"loss": 0.1498,
"step": 28
},
{
"epoch": 0.0015467574097680398,
"grad_norm": 0.31700044870376587,
"learning_rate": 9.999998576956121e-05,
"loss": 0.1391,
"step": 29
},
{
"epoch": 0.0016000938721738342,
"grad_norm": 0.33871933817863464,
"learning_rate": 9.999998243155724e-05,
"loss": 0.159,
"step": 30
},
{
"epoch": 0.0016534303345796288,
"grad_norm": 0.34099170565605164,
"learning_rate": 9.999997874218452e-05,
"loss": 0.1543,
"step": 31
},
{
"epoch": 0.0017067667969854232,
"grad_norm": 0.33778145909309387,
"learning_rate": 9.999997470144308e-05,
"loss": 0.1566,
"step": 32
},
{
"epoch": 0.0017601032593912175,
"grad_norm": 0.32202789187431335,
"learning_rate": 9.999997030933294e-05,
"loss": 0.1688,
"step": 33
},
{
"epoch": 0.0018134397217970122,
"grad_norm": 0.3160647749900818,
"learning_rate": 9.999996556585412e-05,
"loss": 0.1645,
"step": 34
},
{
"epoch": 0.0018667761842028065,
"grad_norm": 0.3506312072277069,
"learning_rate": 9.999996047100669e-05,
"loss": 0.1764,
"step": 35
},
{
"epoch": 0.001920112646608601,
"grad_norm": 0.3370567262172699,
"learning_rate": 9.999995502479064e-05,
"loss": 0.1817,
"step": 36
},
{
"epoch": 0.0019734491090143956,
"grad_norm": 0.3320629596710205,
"learning_rate": 9.999994922720604e-05,
"loss": 0.1558,
"step": 37
},
{
"epoch": 0.00202678557142019,
"grad_norm": 0.3061317503452301,
"learning_rate": 9.999994307825292e-05,
"loss": 0.1645,
"step": 38
},
{
"epoch": 0.0020801220338259843,
"grad_norm": 0.3522307276725769,
"learning_rate": 9.999993657793131e-05,
"loss": 0.1795,
"step": 39
},
{
"epoch": 0.0021334584962317787,
"grad_norm": 0.33675846457481384,
"learning_rate": 9.999992972624131e-05,
"loss": 0.1695,
"step": 40
},
{
"epoch": 0.0021867949586375736,
"grad_norm": 0.34461385011672974,
"learning_rate": 9.999992252318289e-05,
"loss": 0.1698,
"step": 41
},
{
"epoch": 0.002240131421043368,
"grad_norm": 0.3509563207626343,
"learning_rate": 9.999991496875616e-05,
"loss": 0.1857,
"step": 42
},
{
"epoch": 0.0022934678834491623,
"grad_norm": 0.34643200039863586,
"learning_rate": 9.999990706296113e-05,
"loss": 0.1855,
"step": 43
},
{
"epoch": 0.0023468043458549567,
"grad_norm": 0.33536601066589355,
"learning_rate": 9.99998988057979e-05,
"loss": 0.1767,
"step": 44
},
{
"epoch": 0.002400140808260751,
"grad_norm": 0.3312433362007141,
"learning_rate": 9.99998901972665e-05,
"loss": 0.1922,
"step": 45
},
{
"epoch": 0.002453477270666546,
"grad_norm": 0.34569883346557617,
"learning_rate": 9.999988123736699e-05,
"loss": 0.1777,
"step": 46
},
{
"epoch": 0.0025068137330723403,
"grad_norm": 0.3153592646121979,
"learning_rate": 9.999987192609944e-05,
"loss": 0.1669,
"step": 47
},
{
"epoch": 0.0025601501954781347,
"grad_norm": 0.3480944037437439,
"learning_rate": 9.999986226346392e-05,
"loss": 0.1779,
"step": 48
},
{
"epoch": 0.002613486657883929,
"grad_norm": 0.35337212681770325,
"learning_rate": 9.999985224946049e-05,
"loss": 0.1863,
"step": 49
},
{
"epoch": 0.0026668231202897235,
"grad_norm": 0.3431400656700134,
"learning_rate": 9.999984188408922e-05,
"loss": 0.1983,
"step": 50
},
{
"epoch": 0.0027201595826955183,
"grad_norm": 0.3314003348350525,
"learning_rate": 9.999983116735019e-05,
"loss": 0.1838,
"step": 51
},
{
"epoch": 0.0027734960451013127,
"grad_norm": 0.3390146493911743,
"learning_rate": 9.999982009924345e-05,
"loss": 0.1826,
"step": 52
},
{
"epoch": 0.002826832507507107,
"grad_norm": 0.35979676246643066,
"learning_rate": 9.999980867976912e-05,
"loss": 0.1967,
"step": 53
},
{
"epoch": 0.0028801689699129015,
"grad_norm": 0.39572709798812866,
"learning_rate": 9.999979690892725e-05,
"loss": 0.1893,
"step": 54
},
{
"epoch": 0.002933505432318696,
"grad_norm": 0.3344902992248535,
"learning_rate": 9.999978478671794e-05,
"loss": 0.1905,
"step": 55
},
{
"epoch": 0.0029868418947244903,
"grad_norm": 0.32258379459381104,
"learning_rate": 9.999977231314127e-05,
"loss": 0.1765,
"step": 56
},
{
"epoch": 0.003040178357130285,
"grad_norm": 0.32133936882019043,
"learning_rate": 9.999975948819731e-05,
"loss": 0.1855,
"step": 57
},
{
"epoch": 0.0030935148195360795,
"grad_norm": 0.33931997418403625,
"learning_rate": 9.999974631188618e-05,
"loss": 0.1881,
"step": 58
},
{
"epoch": 0.003146851281941874,
"grad_norm": 0.3421045243740082,
"learning_rate": 9.999973278420795e-05,
"loss": 0.1937,
"step": 59
},
{
"epoch": 0.0032001877443476683,
"grad_norm": 0.337239146232605,
"learning_rate": 9.999971890516272e-05,
"loss": 0.1947,
"step": 60
},
{
"epoch": 0.0032535242067534627,
"grad_norm": 0.3348175585269928,
"learning_rate": 9.999970467475059e-05,
"loss": 0.2015,
"step": 61
},
{
"epoch": 0.0033068606691592575,
"grad_norm": 0.33285388350486755,
"learning_rate": 9.999969009297165e-05,
"loss": 0.1954,
"step": 62
},
{
"epoch": 0.003360197131565052,
"grad_norm": 0.33577030897140503,
"learning_rate": 9.999967515982604e-05,
"loss": 0.2033,
"step": 63
},
{
"epoch": 0.0034135335939708463,
"grad_norm": 0.3373951315879822,
"learning_rate": 9.999965987531382e-05,
"loss": 0.2059,
"step": 64
},
{
"epoch": 0.0034668700563766407,
"grad_norm": 0.3341216742992401,
"learning_rate": 9.99996442394351e-05,
"loss": 0.191,
"step": 65
},
{
"epoch": 0.003520206518782435,
"grad_norm": 0.3183084726333618,
"learning_rate": 9.999962825219002e-05,
"loss": 0.1917,
"step": 66
},
{
"epoch": 0.00357354298118823,
"grad_norm": 0.33498549461364746,
"learning_rate": 9.999961191357869e-05,
"loss": 0.1895,
"step": 67
},
{
"epoch": 0.0036268794435940243,
"grad_norm": 0.3271574079990387,
"learning_rate": 9.999959522360118e-05,
"loss": 0.2054,
"step": 68
},
{
"epoch": 0.0036802159059998187,
"grad_norm": 0.3310222923755646,
"learning_rate": 9.999957818225768e-05,
"loss": 0.2013,
"step": 69
},
{
"epoch": 0.003733552368405613,
"grad_norm": 0.34026429057121277,
"learning_rate": 9.999956078954822e-05,
"loss": 0.2186,
"step": 70
},
{
"epoch": 0.0037868888308114075,
"grad_norm": 0.3307226300239563,
"learning_rate": 9.999954304547301e-05,
"loss": 0.2007,
"step": 71
},
{
"epoch": 0.003840225293217202,
"grad_norm": 0.3343660533428192,
"learning_rate": 9.999952495003212e-05,
"loss": 0.2075,
"step": 72
},
{
"epoch": 0.0038935617556229967,
"grad_norm": 0.3603450357913971,
"learning_rate": 9.999950650322569e-05,
"loss": 0.2156,
"step": 73
},
{
"epoch": 0.003946898218028791,
"grad_norm": 0.335757315158844,
"learning_rate": 9.999948770505386e-05,
"loss": 0.203,
"step": 74
},
{
"epoch": 0.0040002346804345855,
"grad_norm": 0.3302127718925476,
"learning_rate": 9.999946855551675e-05,
"loss": 0.1992,
"step": 75
},
{
"epoch": 0.00405357114284038,
"grad_norm": 0.3287745714187622,
"learning_rate": 9.99994490546145e-05,
"loss": 0.2044,
"step": 76
},
{
"epoch": 0.004106907605246174,
"grad_norm": 0.31489625573158264,
"learning_rate": 9.999942920234725e-05,
"loss": 0.2024,
"step": 77
},
{
"epoch": 0.004160244067651969,
"grad_norm": 0.3128495216369629,
"learning_rate": 9.999940899871513e-05,
"loss": 0.2082,
"step": 78
},
{
"epoch": 0.004213580530057763,
"grad_norm": 0.31686297059059143,
"learning_rate": 9.999938844371829e-05,
"loss": 0.2145,
"step": 79
},
{
"epoch": 0.0042669169924635575,
"grad_norm": 0.3330387473106384,
"learning_rate": 9.999936753735687e-05,
"loss": 0.2022,
"step": 80
},
{
"epoch": 0.004320253454869353,
"grad_norm": 0.34814751148223877,
"learning_rate": 9.999934627963103e-05,
"loss": 0.2192,
"step": 81
},
{
"epoch": 0.004373589917275147,
"grad_norm": 0.3250124454498291,
"learning_rate": 9.999932467054089e-05,
"loss": 0.2029,
"step": 82
},
{
"epoch": 0.0044269263796809415,
"grad_norm": 0.3646756410598755,
"learning_rate": 9.999930271008663e-05,
"loss": 0.2203,
"step": 83
},
{
"epoch": 0.004480262842086736,
"grad_norm": 0.3267667889595032,
"learning_rate": 9.99992803982684e-05,
"loss": 0.2078,
"step": 84
},
{
"epoch": 0.00453359930449253,
"grad_norm": 0.32010674476623535,
"learning_rate": 9.999925773508634e-05,
"loss": 0.2041,
"step": 85
},
{
"epoch": 0.004586935766898325,
"grad_norm": 0.3199160695075989,
"learning_rate": 9.999923472054063e-05,
"loss": 0.2119,
"step": 86
},
{
"epoch": 0.004640272229304119,
"grad_norm": 0.3363480269908905,
"learning_rate": 9.99992113546314e-05,
"loss": 0.2102,
"step": 87
},
{
"epoch": 0.0046936086917099135,
"grad_norm": 0.3485029935836792,
"learning_rate": 9.999918763735886e-05,
"loss": 0.2146,
"step": 88
},
{
"epoch": 0.004746945154115708,
"grad_norm": 0.3307000994682312,
"learning_rate": 9.999916356872314e-05,
"loss": 0.2006,
"step": 89
},
{
"epoch": 0.004800281616521502,
"grad_norm": 0.32731881737709045,
"learning_rate": 9.999913914872443e-05,
"loss": 0.1934,
"step": 90
},
{
"epoch": 0.004853618078927297,
"grad_norm": 0.3216370642185211,
"learning_rate": 9.99991143773629e-05,
"loss": 0.2149,
"step": 91
},
{
"epoch": 0.004906954541333092,
"grad_norm": 0.3118319809436798,
"learning_rate": 9.999908925463872e-05,
"loss": 0.2066,
"step": 92
},
{
"epoch": 0.004960291003738886,
"grad_norm": 0.3115937411785126,
"learning_rate": 9.999906378055205e-05,
"loss": 0.2016,
"step": 93
},
{
"epoch": 0.005013627466144681,
"grad_norm": 0.31390756368637085,
"learning_rate": 9.999903795510308e-05,
"loss": 0.227,
"step": 94
},
{
"epoch": 0.005066963928550475,
"grad_norm": 0.3215806484222412,
"learning_rate": 9.999901177829201e-05,
"loss": 0.2139,
"step": 95
},
{
"epoch": 0.0051203003909562695,
"grad_norm": 0.32381314039230347,
"learning_rate": 9.9998985250119e-05,
"loss": 0.2119,
"step": 96
},
{
"epoch": 0.005173636853362064,
"grad_norm": 0.32022613286972046,
"learning_rate": 9.999895837058425e-05,
"loss": 0.2036,
"step": 97
},
{
"epoch": 0.005226973315767858,
"grad_norm": 0.3156028985977173,
"learning_rate": 9.999893113968795e-05,
"loss": 0.2058,
"step": 98
},
{
"epoch": 0.005280309778173653,
"grad_norm": 0.3254660665988922,
"learning_rate": 9.999890355743027e-05,
"loss": 0.2064,
"step": 99
},
{
"epoch": 0.005333646240579447,
"grad_norm": 0.3044165074825287,
"learning_rate": 9.999887562381143e-05,
"loss": 0.2081,
"step": 100
}
],
"logging_steps": 1.0,
"max_steps": 37496,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.3964461642486907e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}