Training in progress, step 68000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3b26db7188c89cde52f93cc8f561f4529a8702aaa52ce9c883892b96769dd603
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ec885f087630fd98da5aea6a3b9af5bf67a1e0daf9ab5c57e09d7f1ac7385946
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb3d1fb9e8324a04c98053fb02a6fde8d1a865fd7ced6a674f76811c1bbb259f
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4fb0106671a29e67305a03ecdd422ffd62f40cc2f3e19327fe3581d2d1603d90
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -11934,11 +11934,189 @@
|
|
| 11934 |
"eval_steps_per_second": 23.351,
|
| 11935 |
"num_input_tokens_seen": 17563643456,
|
| 11936 |
"step": 67000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11937 |
}
|
| 11938 |
],
|
| 11939 |
"logging_steps": 50,
|
| 11940 |
"max_steps": 70000,
|
| 11941 |
-
"num_input_tokens_seen":
|
| 11942 |
"num_train_epochs": 1,
|
| 11943 |
"save_steps": 1000,
|
| 11944 |
"stateful_callbacks": {
|
|
@@ -11953,7 +12131,7 @@
|
|
| 11953 |
"attributes": {}
|
| 11954 |
}
|
| 11955 |
},
|
| 11956 |
-
"total_flos": 4.
|
| 11957 |
"train_batch_size": 64,
|
| 11958 |
"trial_name": null,
|
| 11959 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.3243617110080256,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 68000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 11934 |
"eval_steps_per_second": 23.351,
|
| 11935 |
"num_input_tokens_seen": 17563643456,
|
| 11936 |
"step": 67000
|
| 11937 |
+
},
|
| 11938 |
+
{
|
| 11939 |
+
"epoch": 0.319830187104237,
|
| 11940 |
+
"grad_norm": 0.1690913438796997,
|
| 11941 |
+
"learning_rate": 0.00010561116804955451,
|
| 11942 |
+
"loss": 2.5364,
|
| 11943 |
+
"num_input_tokens_seen": 17576750656,
|
| 11944 |
+
"step": 67050
|
| 11945 |
+
},
|
| 11946 |
+
{
|
| 11947 |
+
"epoch": 0.3200686883623311,
|
| 11948 |
+
"grad_norm": 0.16436229646205902,
|
| 11949 |
+
"learning_rate": 0.00010218772555910954,
|
| 11950 |
+
"loss": 2.5298,
|
| 11951 |
+
"num_input_tokens_seen": 17589857856,
|
| 11952 |
+
"step": 67100
|
| 11953 |
+
},
|
| 11954 |
+
{
|
| 11955 |
+
"epoch": 0.32030718962042526,
|
| 11956 |
+
"grad_norm": 0.15499907732009888,
|
| 11957 |
+
"learning_rate": 9.881436225981105e-05,
|
| 11958 |
+
"loss": 2.5484,
|
| 11959 |
+
"num_input_tokens_seen": 17602965056,
|
| 11960 |
+
"step": 67150
|
| 11961 |
+
},
|
| 11962 |
+
{
|
| 11963 |
+
"epoch": 0.32054569087851936,
|
| 11964 |
+
"grad_norm": 0.16237874329090118,
|
| 11965 |
+
"learning_rate": 9.549150281252633e-05,
|
| 11966 |
+
"loss": 2.5271,
|
| 11967 |
+
"num_input_tokens_seen": 17616072256,
|
| 11968 |
+
"step": 67200
|
| 11969 |
+
},
|
| 11970 |
+
{
|
| 11971 |
+
"epoch": 0.3207841921366135,
|
| 11972 |
+
"grad_norm": 0.16813968122005463,
|
| 11973 |
+
"learning_rate": 9.221956552036992e-05,
|
| 11974 |
+
"loss": 2.5295,
|
| 11975 |
+
"num_input_tokens_seen": 17629179456,
|
| 11976 |
+
"step": 67250
|
| 11977 |
+
},
|
| 11978 |
+
{
|
| 11979 |
+
"epoch": 0.3210226933947077,
|
| 11980 |
+
"grad_norm": 0.15672080218791962,
|
| 11981 |
+
"learning_rate": 8.899896227604509e-05,
|
| 11982 |
+
"loss": 2.528,
|
| 11983 |
+
"num_input_tokens_seen": 17642286656,
|
| 11984 |
+
"step": 67300
|
| 11985 |
+
},
|
| 11986 |
+
{
|
| 11987 |
+
"epoch": 0.3212611946528018,
|
| 11988 |
+
"grad_norm": 0.16523708403110504,
|
| 11989 |
+
"learning_rate": 8.58300985099918e-05,
|
| 11990 |
+
"loss": 2.5288,
|
| 11991 |
+
"num_input_tokens_seen": 17655393856,
|
| 11992 |
+
"step": 67350
|
| 11993 |
+
},
|
| 11994 |
+
{
|
| 11995 |
+
"epoch": 0.32149969591089594,
|
| 11996 |
+
"grad_norm": 0.16759687662124634,
|
| 11997 |
+
"learning_rate": 8.271337313934868e-05,
|
| 11998 |
+
"loss": 2.5431,
|
| 11999 |
+
"num_input_tokens_seen": 17668501056,
|
| 12000 |
+
"step": 67400
|
| 12001 |
+
},
|
| 12002 |
+
{
|
| 12003 |
+
"epoch": 0.32173819716899005,
|
| 12004 |
+
"grad_norm": 0.15507538616657257,
|
| 12005 |
+
"learning_rate": 7.964917851773496e-05,
|
| 12006 |
+
"loss": 2.5342,
|
| 12007 |
+
"num_input_tokens_seen": 17681608256,
|
| 12008 |
+
"step": 67450
|
| 12009 |
+
},
|
| 12010 |
+
{
|
| 12011 |
+
"epoch": 0.3219766984270842,
|
| 12012 |
+
"grad_norm": 0.1556961089372635,
|
| 12013 |
+
"learning_rate": 7.663790038585794e-05,
|
| 12014 |
+
"loss": 2.5189,
|
| 12015 |
+
"num_input_tokens_seen": 17694715456,
|
| 12016 |
+
"step": 67500
|
| 12017 |
+
},
|
| 12018 |
+
{
|
| 12019 |
+
"epoch": 0.3219766984270842,
|
| 12020 |
+
"eval_loss": 2.415555000305176,
|
| 12021 |
+
"eval_runtime": 53.2935,
|
| 12022 |
+
"eval_samples_per_second": 93.82,
|
| 12023 |
+
"eval_steps_per_second": 23.455,
|
| 12024 |
+
"num_input_tokens_seen": 17694715456,
|
| 12025 |
+
"step": 67500
|
| 12026 |
+
},
|
| 12027 |
+
{
|
| 12028 |
+
"epoch": 0.32221519968517837,
|
| 12029 |
+
"grad_norm": 0.16804397106170654,
|
| 12030 |
+
"learning_rate": 7.367991782295391e-05,
|
| 12031 |
+
"loss": 2.5218,
|
| 12032 |
+
"num_input_tokens_seen": 17707822656,
|
| 12033 |
+
"step": 67550
|
| 12034 |
+
},
|
| 12035 |
+
{
|
| 12036 |
+
"epoch": 0.32245370094327247,
|
| 12037 |
+
"grad_norm": 0.15728074312210083,
|
| 12038 |
+
"learning_rate": 7.077560319906695e-05,
|
| 12039 |
+
"loss": 2.5261,
|
| 12040 |
+
"num_input_tokens_seen": 17720929856,
|
| 12041 |
+
"step": 67600
|
| 12042 |
+
},
|
| 12043 |
+
{
|
| 12044 |
+
"epoch": 0.32269220220136663,
|
| 12045 |
+
"grad_norm": 0.1641319841146469,
|
| 12046 |
+
"learning_rate": 6.792532212817271e-05,
|
| 12047 |
+
"loss": 2.5398,
|
| 12048 |
+
"num_input_tokens_seen": 17734037056,
|
| 12049 |
+
"step": 67650
|
| 12050 |
+
},
|
| 12051 |
+
{
|
| 12052 |
+
"epoch": 0.32293070345946073,
|
| 12053 |
+
"grad_norm": 0.1575596034526825,
|
| 12054 |
+
"learning_rate": 6.512943342215233e-05,
|
| 12055 |
+
"loss": 2.5211,
|
| 12056 |
+
"num_input_tokens_seen": 17747144256,
|
| 12057 |
+
"step": 67700
|
| 12058 |
+
},
|
| 12059 |
+
{
|
| 12060 |
+
"epoch": 0.3231692047175549,
|
| 12061 |
+
"grad_norm": 0.16352206468582153,
|
| 12062 |
+
"learning_rate": 6.238828904562316e-05,
|
| 12063 |
+
"loss": 2.5143,
|
| 12064 |
+
"num_input_tokens_seen": 17760251456,
|
| 12065 |
+
"step": 67750
|
| 12066 |
+
},
|
| 12067 |
+
{
|
| 12068 |
+
"epoch": 0.323407705975649,
|
| 12069 |
+
"grad_norm": 0.16303551197052002,
|
| 12070 |
+
"learning_rate": 5.9702234071631e-05,
|
| 12071 |
+
"loss": 2.5262,
|
| 12072 |
+
"num_input_tokens_seen": 17773358656,
|
| 12073 |
+
"step": 67800
|
| 12074 |
+
},
|
| 12075 |
+
{
|
| 12076 |
+
"epoch": 0.32364620723374316,
|
| 12077 |
+
"grad_norm": 0.15572308003902435,
|
| 12078 |
+
"learning_rate": 5.7071606638210094e-05,
|
| 12079 |
+
"loss": 2.5278,
|
| 12080 |
+
"num_input_tokens_seen": 17786465856,
|
| 12081 |
+
"step": 67850
|
| 12082 |
+
},
|
| 12083 |
+
{
|
| 12084 |
+
"epoch": 0.3238847084918373,
|
| 12085 |
+
"grad_norm": 0.15960544347763062,
|
| 12086 |
+
"learning_rate": 5.449673790581611e-05,
|
| 12087 |
+
"loss": 2.522,
|
| 12088 |
+
"num_input_tokens_seen": 17799573056,
|
| 12089 |
+
"step": 67900
|
| 12090 |
+
},
|
| 12091 |
+
{
|
| 12092 |
+
"epoch": 0.3241232097499314,
|
| 12093 |
+
"grad_norm": 0.15617695450782776,
|
| 12094 |
+
"learning_rate": 5.197795201563743e-05,
|
| 12095 |
+
"loss": 2.5151,
|
| 12096 |
+
"num_input_tokens_seen": 17812680256,
|
| 12097 |
+
"step": 67950
|
| 12098 |
+
},
|
| 12099 |
+
{
|
| 12100 |
+
"epoch": 0.3243617110080256,
|
| 12101 |
+
"grad_norm": 0.1527390033006668,
|
| 12102 |
+
"learning_rate": 4.9515566048790485e-05,
|
| 12103 |
+
"loss": 2.5213,
|
| 12104 |
+
"num_input_tokens_seen": 17825787456,
|
| 12105 |
+
"step": 68000
|
| 12106 |
+
},
|
| 12107 |
+
{
|
| 12108 |
+
"epoch": 0.3243617110080256,
|
| 12109 |
+
"eval_loss": 2.4139962196350098,
|
| 12110 |
+
"eval_runtime": 53.933,
|
| 12111 |
+
"eval_samples_per_second": 92.708,
|
| 12112 |
+
"eval_steps_per_second": 23.177,
|
| 12113 |
+
"num_input_tokens_seen": 17825787456,
|
| 12114 |
+
"step": 68000
|
| 12115 |
}
|
| 12116 |
],
|
| 12117 |
"logging_steps": 50,
|
| 12118 |
"max_steps": 70000,
|
| 12119 |
+
"num_input_tokens_seen": 17825787456,
|
| 12120 |
"num_train_epochs": 1,
|
| 12121 |
"save_steps": 1000,
|
| 12122 |
"stateful_callbacks": {
|
|
|
|
| 12131 |
"attributes": {}
|
| 12132 |
}
|
| 12133 |
},
|
| 12134 |
+
"total_flos": 4.768572124165571e+18,
|
| 12135 |
"train_batch_size": 64,
|
| 12136 |
"trial_name": null,
|
| 12137 |
"trial_params": null
|