{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 49.82051282051282,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.41025641025641024,
"grad_norm": 8.151017189025879,
"learning_rate": 0.0,
"loss": 1.3291,
"step": 1
},
{
"epoch": 0.8205128205128205,
"grad_norm": 8.3428955078125,
"learning_rate": 1.1111111111111112e-05,
"loss": 1.3498,
"step": 2
},
{
"epoch": 1.4102564102564101,
"grad_norm": 3.494961738586426,
"learning_rate": 2.2222222222222223e-05,
"loss": 1.3324,
"step": 3
},
{
"epoch": 1.8205128205128205,
"grad_norm": 3.1425204277038574,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.127,
"step": 4
},
{
"epoch": 2.41025641025641,
"grad_norm": 2.282806396484375,
"learning_rate": 4.4444444444444447e-05,
"loss": 1.029,
"step": 5
},
{
"epoch": 2.8205128205128203,
"grad_norm": 4.562756061553955,
"learning_rate": 5.555555555555556e-05,
"loss": 0.8966,
"step": 6
},
{
"epoch": 3.41025641025641,
"grad_norm": 5.485057353973389,
"learning_rate": 6.666666666666667e-05,
"loss": 0.8497,
"step": 7
},
{
"epoch": 3.8205128205128203,
"grad_norm": 1.8801462650299072,
"learning_rate": 7.777777777777778e-05,
"loss": 0.71,
"step": 8
},
{
"epoch": 4.410256410256411,
"grad_norm": 1.7765756845474243,
"learning_rate": 8.888888888888889e-05,
"loss": 0.6853,
"step": 9
},
{
"epoch": 4.82051282051282,
"grad_norm": 2.199324131011963,
"learning_rate": 0.0001,
"loss": 0.535,
"step": 10
},
{
"epoch": 5.410256410256411,
"grad_norm": 2.8563392162323,
"learning_rate": 0.00011111111111111112,
"loss": 0.5501,
"step": 11
},
{
"epoch": 5.82051282051282,
"grad_norm": 2.2397141456604004,
"learning_rate": 0.00012222222222222224,
"loss": 0.3953,
"step": 12
},
{
"epoch": 6.410256410256411,
"grad_norm": 1.4788132905960083,
"learning_rate": 0.00013333333333333334,
"loss": 0.3417,
"step": 13
},
{
"epoch": 6.82051282051282,
"grad_norm": 1.7762037515640259,
"learning_rate": 0.00014444444444444444,
"loss": 0.2219,
"step": 14
},
{
"epoch": 7.410256410256411,
"grad_norm": 0.7586674690246582,
"learning_rate": 0.00015555555555555556,
"loss": 0.2104,
"step": 15
},
{
"epoch": 7.82051282051282,
"grad_norm": 0.9284120202064514,
"learning_rate": 0.0001666666666666667,
"loss": 0.1662,
"step": 16
},
{
"epoch": 8.41025641025641,
"grad_norm": 0.6511984467506409,
"learning_rate": 0.00017777777777777779,
"loss": 0.1547,
"step": 17
},
{
"epoch": 8.820512820512821,
"grad_norm": 1.1308850049972534,
"learning_rate": 0.00018888888888888888,
"loss": 0.1444,
"step": 18
},
{
"epoch": 9.41025641025641,
"grad_norm": 0.662821352481842,
"learning_rate": 0.0002,
"loss": 0.123,
"step": 19
},
{
"epoch": 9.820512820512821,
"grad_norm": 0.5322834253311157,
"learning_rate": 0.00019999854312354064,
"loss": 0.1036,
"step": 20
},
{
"epoch": 10.41025641025641,
"grad_norm": 0.5911012887954712,
"learning_rate": 0.00019999417253661235,
"loss": 0.0969,
"step": 21
},
{
"epoch": 10.820512820512821,
"grad_norm": 0.628558874130249,
"learning_rate": 0.00019998688836656323,
"loss": 0.0857,
"step": 22
},
{
"epoch": 11.41025641025641,
"grad_norm": 0.7193434834480286,
"learning_rate": 0.00019997669082563597,
"loss": 0.0748,
"step": 23
},
{
"epoch": 11.820512820512821,
"grad_norm": 0.3524335026741028,
"learning_rate": 0.00019996358021096176,
"loss": 0.066,
"step": 24
},
{
"epoch": 12.41025641025641,
"grad_norm": 0.47225716710090637,
"learning_rate": 0.00019994755690455152,
"loss": 0.0613,
"step": 25
},
{
"epoch": 12.820512820512821,
"grad_norm": 0.4532913267612457,
"learning_rate": 0.00019992862137328474,
"loss": 0.0408,
"step": 26
},
{
"epoch": 13.41025641025641,
"grad_norm": 0.2988104224205017,
"learning_rate": 0.00019990677416889608,
"loss": 0.0353,
"step": 27
},
{
"epoch": 13.820512820512821,
"grad_norm": 0.31864798069000244,
"learning_rate": 0.0001998820159279591,
"loss": 0.033,
"step": 28
},
{
"epoch": 14.41025641025641,
"grad_norm": 0.29291290044784546,
"learning_rate": 0.0001998543473718677,
"loss": 0.0235,
"step": 29
},
{
"epoch": 14.820512820512821,
"grad_norm": 0.24096333980560303,
"learning_rate": 0.00019982376930681531,
"loss": 0.0194,
"step": 30
},
{
"epoch": 15.41025641025641,
"grad_norm": 0.2400427609682083,
"learning_rate": 0.00019979028262377118,
"loss": 0.0177,
"step": 31
},
{
"epoch": 15.820512820512821,
"grad_norm": 0.23485173285007477,
"learning_rate": 0.00019975388829845448,
"loss": 0.0132,
"step": 32
},
{
"epoch": 16.41025641025641,
"grad_norm": 0.4795994758605957,
"learning_rate": 0.00019971458739130598,
"loss": 0.0123,
"step": 33
},
{
"epoch": 16.82051282051282,
"grad_norm": 0.3436650335788727,
"learning_rate": 0.00019967238104745696,
"loss": 0.0077,
"step": 34
},
{
"epoch": 17.41025641025641,
"grad_norm": 0.24164724349975586,
"learning_rate": 0.000199627270496696,
"loss": 0.0083,
"step": 35
},
{
"epoch": 17.82051282051282,
"grad_norm": 0.11744043976068497,
"learning_rate": 0.0001995792570534331,
"loss": 0.0053,
"step": 36
},
{
"epoch": 18.41025641025641,
"grad_norm": 0.2771929204463959,
"learning_rate": 0.0001995283421166614,
"loss": 0.0076,
"step": 37
},
{
"epoch": 18.82051282051282,
"grad_norm": 0.14852669835090637,
"learning_rate": 0.00019947452716991633,
"loss": 0.0042,
"step": 38
},
{
"epoch": 19.41025641025641,
"grad_norm": 1.1028482913970947,
"learning_rate": 0.00019941781378123244,
"loss": 0.0114,
"step": 39
},
{
"epoch": 19.82051282051282,
"grad_norm": 0.23756887018680573,
"learning_rate": 0.00019935820360309777,
"loss": 0.0043,
"step": 40
},
{
"epoch": 20.41025641025641,
"grad_norm": 0.8769266605377197,
"learning_rate": 0.00019929569837240564,
"loss": 0.0047,
"step": 41
},
{
"epoch": 20.82051282051282,
"grad_norm": 0.531132698059082,
"learning_rate": 0.00019923029991040402,
"loss": 0.0063,
"step": 42
},
{
"epoch": 21.41025641025641,
"grad_norm": 1.1996066570281982,
"learning_rate": 0.00019916201012264254,
"loss": 0.0143,
"step": 43
},
{
"epoch": 21.82051282051282,
"grad_norm": 0.6255332827568054,
"learning_rate": 0.0001990908309989168,
"loss": 0.0168,
"step": 44
},
{
"epoch": 22.41025641025641,
"grad_norm": 49.508148193359375,
"learning_rate": 0.00019901676461321068,
"loss": 0.0621,
"step": 45
},
{
"epoch": 22.82051282051282,
"grad_norm": 8.113191604614258,
"learning_rate": 0.00019893981312363562,
"loss": 0.1062,
"step": 46
},
{
"epoch": 23.41025641025641,
"grad_norm": 2.3446950912475586,
"learning_rate": 0.00019885997877236788,
"loss": 0.066,
"step": 47
},
{
"epoch": 23.82051282051282,
"grad_norm": 153.14146423339844,
"learning_rate": 0.00019877726388558325,
"loss": 0.0612,
"step": 48
},
{
"epoch": 24.41025641025641,
"grad_norm": 43.04759216308594,
"learning_rate": 0.00019869167087338907,
"loss": 0.135,
"step": 49
},
{
"epoch": 24.82051282051282,
"grad_norm": 51.32644271850586,
"learning_rate": 0.00019860320222975431,
"loss": 0.1375,
"step": 50
},
{
"epoch": 25.41025641025641,
"grad_norm": 1.9464935064315796,
"learning_rate": 0.00019851186053243666,
"loss": 0.3427,
"step": 51
},
{
"epoch": 25.82051282051282,
"grad_norm": 381.23974609375,
"learning_rate": 0.00019841764844290744,
"loss": 2.1722,
"step": 52
},
{
"epoch": 26.41025641025641,
"grad_norm": 119.89301300048828,
"learning_rate": 0.00019832056870627417,
"loss": 2.4659,
"step": 53
},
{
"epoch": 26.82051282051282,
"grad_norm": 48.73936080932617,
"learning_rate": 0.00019822062415120054,
"loss": 0.8509,
"step": 54
},
{
"epoch": 27.41025641025641,
"grad_norm": 19.564029693603516,
"learning_rate": 0.0001981178176898239,
"loss": 0.4258,
"step": 55
},
{
"epoch": 27.82051282051282,
"grad_norm": 33.161495208740234,
"learning_rate": 0.00019801215231767056,
"loss": 0.2414,
"step": 56
},
{
"epoch": 28.41025641025641,
"grad_norm": 5.548079013824463,
"learning_rate": 0.00019790363111356837,
"loss": 0.1882,
"step": 57
},
{
"epoch": 28.82051282051282,
"grad_norm": 4.21547794342041,
"learning_rate": 0.00019779225723955707,
"loss": 0.1264,
"step": 58
},
{
"epoch": 29.41025641025641,
"grad_norm": 2.7072598934173584,
"learning_rate": 0.00019767803394079615,
"loss": 0.2181,
"step": 59
},
{
"epoch": 29.82051282051282,
"grad_norm": 7.378205299377441,
"learning_rate": 0.0001975609645454704,
"loss": 0.1593,
"step": 60
},
{
"epoch": 30.41025641025641,
"grad_norm": 9.17626667022705,
"learning_rate": 0.00019744105246469263,
"loss": 0.3464,
"step": 61
},
{
"epoch": 30.82051282051282,
"grad_norm": 27.878585815429688,
"learning_rate": 0.00019731830119240463,
"loss": 0.4882,
"step": 62
},
{
"epoch": 31.41025641025641,
"grad_norm": 15.55352783203125,
"learning_rate": 0.0001971927143052752,
"loss": 0.7857,
"step": 63
},
{
"epoch": 31.82051282051282,
"grad_norm": 16.477920532226562,
"learning_rate": 0.00019706429546259593,
"loss": 0.663,
"step": 64
},
{
"epoch": 32.41025641025641,
"grad_norm": 13.829732894897461,
"learning_rate": 0.00019693304840617457,
"loss": 0.5652,
"step": 65
},
{
"epoch": 32.82051282051282,
"grad_norm": 1.885118842124939,
"learning_rate": 0.00019679897696022608,
"loss": 0.2583,
"step": 66
},
{
"epoch": 33.41025641025641,
"grad_norm": 1.9031124114990234,
"learning_rate": 0.00019666208503126112,
"loss": 0.2566,
"step": 67
},
{
"epoch": 33.82051282051282,
"grad_norm": 1.2540283203125,
"learning_rate": 0.0001965223766079723,
"loss": 0.1855,
"step": 68
},
{
"epoch": 34.41025641025641,
"grad_norm": 0.9428790807723999,
"learning_rate": 0.00019637985576111778,
"loss": 0.1633,
"step": 69
},
{
"epoch": 34.82051282051282,
"grad_norm": 0.8358070254325867,
"learning_rate": 0.00019623452664340306,
"loss": 0.1277,
"step": 70
},
{
"epoch": 35.41025641025641,
"grad_norm": 0.9116950631141663,
"learning_rate": 0.0001960863934893594,
"loss": 0.1124,
"step": 71
},
{
"epoch": 35.82051282051282,
"grad_norm": 1.235021948814392,
"learning_rate": 0.00019593546061522093,
"loss": 0.0928,
"step": 72
},
{
"epoch": 36.41025641025641,
"grad_norm": 0.7440080046653748,
"learning_rate": 0.00019578173241879872,
"loss": 0.0839,
"step": 73
},
{
"epoch": 36.82051282051282,
"grad_norm": 0.5238239765167236,
"learning_rate": 0.00019562521337935257,
"loss": 0.0589,
"step": 74
},
{
"epoch": 37.41025641025641,
"grad_norm": 0.637015700340271,
"learning_rate": 0.00019546590805746052,
"loss": 0.0538,
"step": 75
},
{
"epoch": 37.82051282051282,
"grad_norm": 0.3730023205280304,
"learning_rate": 0.0001953038210948861,
"loss": 0.0379,
"step": 76
},
{
"epoch": 38.41025641025641,
"grad_norm": 0.39598342776298523,
"learning_rate": 0.00019513895721444286,
"loss": 0.0314,
"step": 77
},
{
"epoch": 38.82051282051282,
"grad_norm": 0.26019713282585144,
"learning_rate": 0.00019497132121985695,
"loss": 0.0247,
"step": 78
},
{
"epoch": 39.41025641025641,
"grad_norm": 0.27270156145095825,
"learning_rate": 0.00019480091799562704,
"loss": 0.0219,
"step": 79
},
{
"epoch": 39.82051282051282,
"grad_norm": 0.31213200092315674,
"learning_rate": 0.0001946277525068821,
"loss": 0.0177,
"step": 80
},
{
"epoch": 40.41025641025641,
"grad_norm": 0.3065904676914215,
"learning_rate": 0.00019445182979923654,
"loss": 0.0167,
"step": 81
},
{
"epoch": 40.82051282051282,
"grad_norm": 0.25565171241760254,
"learning_rate": 0.00019427315499864344,
"loss": 0.0132,
"step": 82
},
{
"epoch": 41.41025641025641,
"grad_norm": 0.16997747123241425,
"learning_rate": 0.000194091733311245,
"loss": 0.0106,
"step": 83
},
{
"epoch": 41.82051282051282,
"grad_norm": 0.13165056705474854,
"learning_rate": 0.0001939075700232209,
"loss": 0.0108,
"step": 84
},
{
"epoch": 42.41025641025641,
"grad_norm": 0.10982735455036163,
"learning_rate": 0.00019372067050063438,
"loss": 0.0096,
"step": 85
},
{
"epoch": 42.82051282051282,
"grad_norm": 0.10672740638256073,
"learning_rate": 0.00019353104018927567,
"loss": 0.0083,
"step": 86
},
{
"epoch": 43.41025641025641,
"grad_norm": 0.1570005714893341,
"learning_rate": 0.0001933386846145036,
"loss": 0.0066,
"step": 87
},
{
"epoch": 43.82051282051282,
"grad_norm": 0.1381327509880066,
"learning_rate": 0.00019314360938108425,
"loss": 0.008,
"step": 88
},
{
"epoch": 44.41025641025641,
"grad_norm": 0.13799023628234863,
"learning_rate": 0.00019294582017302797,
"loss": 0.0075,
"step": 89
},
{
"epoch": 44.82051282051282,
"grad_norm": 0.07857757061719894,
"learning_rate": 0.00019274532275342354,
"loss": 0.0058,
"step": 90
},
{
"epoch": 45.41025641025641,
"grad_norm": 0.40940356254577637,
"learning_rate": 0.00019254212296427044,
"loss": 0.0078,
"step": 91
},
{
"epoch": 45.82051282051282,
"grad_norm": 0.13838538527488708,
"learning_rate": 0.0001923362267263084,
"loss": 0.0063,
"step": 92
},
{
"epoch": 46.41025641025641,
"grad_norm": 0.1280914694070816,
"learning_rate": 0.0001921276400388451,
"loss": 0.0051,
"step": 93
},
{
"epoch": 46.82051282051282,
"grad_norm": 0.1300235092639923,
"learning_rate": 0.00019191636897958122,
"loss": 0.0045,
"step": 94
},
{
"epoch": 47.41025641025641,
"grad_norm": 0.05682254955172539,
"learning_rate": 0.00019170241970443343,
"loss": 0.0045,
"step": 95
},
{
"epoch": 47.82051282051282,
"grad_norm": 0.06927549839019775,
"learning_rate": 0.00019148579844735497,
"loss": 0.0032,
"step": 96
},
{
"epoch": 48.41025641025641,
"grad_norm": 0.09624794870615005,
"learning_rate": 0.00019126651152015403,
"loss": 0.0041,
"step": 97
},
{
"epoch": 48.82051282051282,
"grad_norm": 0.0919504463672638,
"learning_rate": 0.00019104456531230984,
"loss": 0.0032,
"step": 98
},
{
"epoch": 49.41025641025641,
"grad_norm": 0.20492327213287354,
"learning_rate": 0.00019081996629078657,
"loss": 0.0039,
"step": 99
},
{
"epoch": 49.82051282051282,
"grad_norm": 0.042908795177936554,
"learning_rate": 0.0001905927209998447,
"loss": 0.002,
"step": 100
}
],
"logging_steps": 1.0,
"max_steps": 600,
"num_input_tokens_seen": 0,
"num_train_epochs": 300,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.187460722471731e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}