Training in progress, step 68000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88085ee37b0edacc225a0fb86ed3cfd9ddce1ecb2e83ddb9feeeb81a70bb80bd
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0b2f336afb5813ccf452282223e763afdce040692a315590bb908f2063975a3f
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb3d1fb9e8324a04c98053fb02a6fde8d1a865fd7ced6a674f76811c1bbb259f
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:215e906fb9e492afed15b6bbd2ab828199f0238620feca89e4e09f3e2ffc4109
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -11934,11 +11934,189 @@
|
|
| 11934 |
"eval_steps_per_second": 23.475,
|
| 11935 |
"num_input_tokens_seen": 17563648000,
|
| 11936 |
"step": 67000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11937 |
}
|
| 11938 |
],
|
| 11939 |
"logging_steps": 50,
|
| 11940 |
"max_steps": 70000,
|
| 11941 |
-
"num_input_tokens_seen":
|
| 11942 |
"num_train_epochs": 1,
|
| 11943 |
"save_steps": 1000,
|
| 11944 |
"stateful_callbacks": {
|
|
@@ -11953,7 +12131,7 @@
|
|
| 11953 |
"attributes": {}
|
| 11954 |
}
|
| 11955 |
},
|
| 11956 |
-
"total_flos": 4.
|
| 11957 |
"train_batch_size": 64,
|
| 11958 |
"trial_name": null,
|
| 11959 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.4574019234423531,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 68000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 11934 |
"eval_steps_per_second": 23.475,
|
| 11935 |
"num_input_tokens_seen": 17563648000,
|
| 11936 |
"step": 67000
|
| 11937 |
+
},
|
| 11938 |
+
{
|
| 11939 |
+
"epoch": 0.4510117495119084,
|
| 11940 |
+
"grad_norm": 0.14675357937812805,
|
| 11941 |
+
"learning_rate": 8.155812334579532e-05,
|
| 11942 |
+
"loss": 2.9682,
|
| 11943 |
+
"num_input_tokens_seen": 17576755200,
|
| 11944 |
+
"step": 67050
|
| 11945 |
+
},
|
| 11946 |
+
{
|
| 11947 |
+
"epoch": 0.45134807445561603,
|
| 11948 |
+
"grad_norm": 0.14341385662555695,
|
| 11949 |
+
"learning_rate": 7.889138314185678e-05,
|
| 11950 |
+
"loss": 2.9749,
|
| 11951 |
+
"num_input_tokens_seen": 17589862400,
|
| 11952 |
+
"step": 67100
|
| 11953 |
+
},
|
| 11954 |
+
{
|
| 11955 |
+
"epoch": 0.45168439939932364,
|
| 11956 |
+
"grad_norm": 0.1442009061574936,
|
| 11957 |
+
"learning_rate": 7.626523026288279e-05,
|
| 11958 |
+
"loss": 2.9637,
|
| 11959 |
+
"num_input_tokens_seen": 17602969600,
|
| 11960 |
+
"step": 67150
|
| 11961 |
+
},
|
| 11962 |
+
{
|
| 11963 |
+
"epoch": 0.45202072434303125,
|
| 11964 |
+
"grad_norm": 0.14580078423023224,
|
| 11965 |
+
"learning_rate": 7.367991782295391e-05,
|
| 11966 |
+
"loss": 2.9636,
|
| 11967 |
+
"num_input_tokens_seen": 17616076800,
|
| 11968 |
+
"step": 67200
|
| 11969 |
+
},
|
| 11970 |
+
{
|
| 11971 |
+
"epoch": 0.45235704928673887,
|
| 11972 |
+
"grad_norm": 0.13888555765151978,
|
| 11973 |
+
"learning_rate": 7.1135694999864e-05,
|
| 11974 |
+
"loss": 2.9737,
|
| 11975 |
+
"num_input_tokens_seen": 17629184000,
|
| 11976 |
+
"step": 67250
|
| 11977 |
+
},
|
| 11978 |
+
{
|
| 11979 |
+
"epoch": 0.4526933742304465,
|
| 11980 |
+
"grad_norm": 0.14820803701877594,
|
| 11981 |
+
"learning_rate": 6.863280701110408e-05,
|
| 11982 |
+
"loss": 2.9778,
|
| 11983 |
+
"num_input_tokens_seen": 17642291200,
|
| 11984 |
+
"step": 67300
|
| 11985 |
+
},
|
| 11986 |
+
{
|
| 11987 |
+
"epoch": 0.4530296991741541,
|
| 11988 |
+
"grad_norm": 0.14933691918849945,
|
| 11989 |
+
"learning_rate": 6.617149509022808e-05,
|
| 11990 |
+
"loss": 2.9667,
|
| 11991 |
+
"num_input_tokens_seen": 17655398400,
|
| 11992 |
+
"step": 67350
|
| 11993 |
+
},
|
| 11994 |
+
{
|
| 11995 |
+
"epoch": 0.4533660241178617,
|
| 11996 |
+
"grad_norm": 0.14829853177070618,
|
| 11997 |
+
"learning_rate": 6.375199646360142e-05,
|
| 11998 |
+
"loss": 2.9691,
|
| 11999 |
+
"num_input_tokens_seen": 17668505600,
|
| 12000 |
+
"step": 67400
|
| 12001 |
+
},
|
| 12002 |
+
{
|
| 12003 |
+
"epoch": 0.4537023490615693,
|
| 12004 |
+
"grad_norm": 0.14731477200984955,
|
| 12005 |
+
"learning_rate": 6.137454432753797e-05,
|
| 12006 |
+
"loss": 2.9731,
|
| 12007 |
+
"num_input_tokens_seen": 17681612800,
|
| 12008 |
+
"step": 67450
|
| 12009 |
+
},
|
| 12010 |
+
{
|
| 12011 |
+
"epoch": 0.4540386740052769,
|
| 12012 |
+
"grad_norm": 0.14357906579971313,
|
| 12013 |
+
"learning_rate": 5.903936782582253e-05,
|
| 12014 |
+
"loss": 2.9785,
|
| 12015 |
+
"num_input_tokens_seen": 17694720000,
|
| 12016 |
+
"step": 67500
|
| 12017 |
+
},
|
| 12018 |
+
{
|
| 12019 |
+
"epoch": 0.4540386740052769,
|
| 12020 |
+
"eval_loss": 2.867840528488159,
|
| 12021 |
+
"eval_runtime": 53.8197,
|
| 12022 |
+
"eval_samples_per_second": 92.903,
|
| 12023 |
+
"eval_steps_per_second": 23.226,
|
| 12024 |
+
"num_input_tokens_seen": 17694720000,
|
| 12025 |
+
"step": 67500
|
| 12026 |
+
},
|
| 12027 |
+
{
|
| 12028 |
+
"epoch": 0.45437499894898453,
|
| 12029 |
+
"grad_norm": 0.1438903659582138,
|
| 12030 |
+
"learning_rate": 5.6746692027626835e-05,
|
| 12031 |
+
"loss": 2.9733,
|
| 12032 |
+
"num_input_tokens_seen": 17707827200,
|
| 12033 |
+
"step": 67550
|
| 12034 |
+
},
|
| 12035 |
+
{
|
| 12036 |
+
"epoch": 0.45471132389269214,
|
| 12037 |
+
"grad_norm": 0.14171506464481354,
|
| 12038 |
+
"learning_rate": 5.449673790581611e-05,
|
| 12039 |
+
"loss": 2.9637,
|
| 12040 |
+
"num_input_tokens_seen": 17720934400,
|
| 12041 |
+
"step": 67600
|
| 12042 |
+
},
|
| 12043 |
+
{
|
| 12044 |
+
"epoch": 0.45504764883639975,
|
| 12045 |
+
"grad_norm": 0.1645549088716507,
|
| 12046 |
+
"learning_rate": 5.2289722315651546e-05,
|
| 12047 |
+
"loss": 2.9668,
|
| 12048 |
+
"num_input_tokens_seen": 17734041600,
|
| 12049 |
+
"step": 67650
|
| 12050 |
+
},
|
| 12051 |
+
{
|
| 12052 |
+
"epoch": 0.45538397378010737,
|
| 12053 |
+
"grad_norm": 0.1390199065208435,
|
| 12054 |
+
"learning_rate": 5.0125857973889355e-05,
|
| 12055 |
+
"loss": 2.9762,
|
| 12056 |
+
"num_input_tokens_seen": 17747148800,
|
| 12057 |
+
"step": 67700
|
| 12058 |
+
},
|
| 12059 |
+
{
|
| 12060 |
+
"epoch": 0.455720298723815,
|
| 12061 |
+
"grad_norm": 0.14667369425296783,
|
| 12062 |
+
"learning_rate": 4.800535343827833e-05,
|
| 12063 |
+
"loss": 2.9724,
|
| 12064 |
+
"num_input_tokens_seen": 17760256000,
|
| 12065 |
+
"step": 67750
|
| 12066 |
+
},
|
| 12067 |
+
{
|
| 12068 |
+
"epoch": 0.4560566236675226,
|
| 12069 |
+
"grad_norm": 0.14203302562236786,
|
| 12070 |
+
"learning_rate": 4.592841308745932e-05,
|
| 12071 |
+
"loss": 2.9679,
|
| 12072 |
+
"num_input_tokens_seen": 17773363200,
|
| 12073 |
+
"step": 67800
|
| 12074 |
+
},
|
| 12075 |
+
{
|
| 12076 |
+
"epoch": 0.45639294861123025,
|
| 12077 |
+
"grad_norm": 0.1517883837223053,
|
| 12078 |
+
"learning_rate": 4.389523710126619e-05,
|
| 12079 |
+
"loss": 2.9723,
|
| 12080 |
+
"num_input_tokens_seen": 17786470400,
|
| 12081 |
+
"step": 67850
|
| 12082 |
+
},
|
| 12083 |
+
{
|
| 12084 |
+
"epoch": 0.45672927355493786,
|
| 12085 |
+
"grad_norm": 0.1438019722700119,
|
| 12086 |
+
"learning_rate": 4.190602144143207e-05,
|
| 12087 |
+
"loss": 2.973,
|
| 12088 |
+
"num_input_tokens_seen": 17799577600,
|
| 12089 |
+
"step": 67900
|
| 12090 |
+
},
|
| 12091 |
+
{
|
| 12092 |
+
"epoch": 0.4570655984986455,
|
| 12093 |
+
"grad_norm": 0.14281606674194336,
|
| 12094 |
+
"learning_rate": 3.9960957832702595e-05,
|
| 12095 |
+
"loss": 2.9733,
|
| 12096 |
+
"num_input_tokens_seen": 17812684800,
|
| 12097 |
+
"step": 67950
|
| 12098 |
+
},
|
| 12099 |
+
{
|
| 12100 |
+
"epoch": 0.4574019234423531,
|
| 12101 |
+
"grad_norm": 0.14911025762557983,
|
| 12102 |
+
"learning_rate": 3.806023374435663e-05,
|
| 12103 |
+
"loss": 2.9724,
|
| 12104 |
+
"num_input_tokens_seen": 17825792000,
|
| 12105 |
+
"step": 68000
|
| 12106 |
+
},
|
| 12107 |
+
{
|
| 12108 |
+
"epoch": 0.4574019234423531,
|
| 12109 |
+
"eval_loss": 2.8663442134857178,
|
| 12110 |
+
"eval_runtime": 53.8853,
|
| 12111 |
+
"eval_samples_per_second": 92.79,
|
| 12112 |
+
"eval_steps_per_second": 23.197,
|
| 12113 |
+
"num_input_tokens_seen": 17825792000,
|
| 12114 |
+
"step": 68000
|
| 12115 |
}
|
| 12116 |
],
|
| 12117 |
"logging_steps": 50,
|
| 12118 |
"max_steps": 70000,
|
| 12119 |
+
"num_input_tokens_seen": 17825792000,
|
| 12120 |
"num_train_epochs": 1,
|
| 12121 |
"save_steps": 1000,
|
| 12122 |
"stateful_callbacks": {
|
|
|
|
| 12131 |
"attributes": {}
|
| 12132 |
}
|
| 12133 |
},
|
| 12134 |
+
"total_flos": 4.76857333972992e+18,
|
| 12135 |
"train_batch_size": 64,
|
| 12136 |
"trial_name": null,
|
| 12137 |
"trial_params": null
|