Training in progress, step 23000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 517931840
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b09b714fd1dab9349e276a2d1d64f33fa5004406d993d6a429a5d969766a4711
|
| 3 |
size 517931840
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1035661434
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0c41cb06b8dbe81630708c3aba06b43b83a86488a65bdb7921cf7d53205761b4
|
| 3 |
size 1035661434
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e51d5acf7af721ddb096287cdb18fed327f0e9b8f18a038e8c92ad3d7c982961
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d95e34f92e07ac3fca3b42c6f704d96b3191058ad871fb9d07f6b2779013efbe
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -3924,11 +3924,189 @@
|
|
| 3924 |
"eval_steps_per_second": 19.24,
|
| 3925 |
"num_input_tokens_seen": 23068672000,
|
| 3926 |
"step": 22000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3927 |
}
|
| 3928 |
],
|
| 3929 |
"logging_steps": 50,
|
| 3930 |
"max_steps": 200000,
|
| 3931 |
-
"num_input_tokens_seen":
|
| 3932 |
"num_train_epochs": 5,
|
| 3933 |
"save_steps": 1000,
|
| 3934 |
"stateful_callbacks": {
|
|
@@ -3943,7 +4121,7 @@
|
|
| 3943 |
"attributes": {}
|
| 3944 |
}
|
| 3945 |
},
|
| 3946 |
-
"total_flos": 1.
|
| 3947 |
"train_batch_size": 64,
|
| 3948 |
"trial_name": null,
|
| 3949 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.5052189945867432,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 23000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 3924 |
"eval_steps_per_second": 19.24,
|
| 3925 |
"num_input_tokens_seen": 23068672000,
|
| 3926 |
"step": 22000
|
| 3927 |
+
},
|
| 3928 |
+
{
|
| 3929 |
+
"epoch": 0.48435125350598646,
|
| 3930 |
+
"grad_norm": 0.15728288888931274,
|
| 3931 |
+
"learning_rate": 0.001,
|
| 3932 |
+
"loss": 2.703,
|
| 3933 |
+
"num_input_tokens_seen": 23121100800,
|
| 3934 |
+
"step": 22050
|
| 3935 |
+
},
|
| 3936 |
+
{
|
| 3937 |
+
"epoch": 0.4854495556681315,
|
| 3938 |
+
"grad_norm": 0.13599443435668945,
|
| 3939 |
+
"learning_rate": 0.001,
|
| 3940 |
+
"loss": 2.6984,
|
| 3941 |
+
"num_input_tokens_seen": 23173529600,
|
| 3942 |
+
"step": 22100
|
| 3943 |
+
},
|
| 3944 |
+
{
|
| 3945 |
+
"epoch": 0.4865478578302766,
|
| 3946 |
+
"grad_norm": 0.25702551007270813,
|
| 3947 |
+
"learning_rate": 0.001,
|
| 3948 |
+
"loss": 2.9388,
|
| 3949 |
+
"num_input_tokens_seen": 23225958400,
|
| 3950 |
+
"step": 22150
|
| 3951 |
+
},
|
| 3952 |
+
{
|
| 3953 |
+
"epoch": 0.4876461599924217,
|
| 3954 |
+
"grad_norm": 0.12942279875278473,
|
| 3955 |
+
"learning_rate": 0.001,
|
| 3956 |
+
"loss": 2.7568,
|
| 3957 |
+
"num_input_tokens_seen": 23278383360,
|
| 3958 |
+
"step": 22200
|
| 3959 |
+
},
|
| 3960 |
+
{
|
| 3961 |
+
"epoch": 0.4887444621545668,
|
| 3962 |
+
"grad_norm": 0.12908817827701569,
|
| 3963 |
+
"learning_rate": 0.001,
|
| 3964 |
+
"loss": 2.7195,
|
| 3965 |
+
"num_input_tokens_seen": 23330812160,
|
| 3966 |
+
"step": 22250
|
| 3967 |
+
},
|
| 3968 |
+
{
|
| 3969 |
+
"epoch": 0.4898427643167119,
|
| 3970 |
+
"grad_norm": 0.1351587176322937,
|
| 3971 |
+
"learning_rate": 0.001,
|
| 3972 |
+
"loss": 2.7155,
|
| 3973 |
+
"num_input_tokens_seen": 23383240960,
|
| 3974 |
+
"step": 22300
|
| 3975 |
+
},
|
| 3976 |
+
{
|
| 3977 |
+
"epoch": 0.490941066478857,
|
| 3978 |
+
"grad_norm": 0.1245250552892685,
|
| 3979 |
+
"learning_rate": 0.001,
|
| 3980 |
+
"loss": 2.7074,
|
| 3981 |
+
"num_input_tokens_seen": 23435669760,
|
| 3982 |
+
"step": 22350
|
| 3983 |
+
},
|
| 3984 |
+
{
|
| 3985 |
+
"epoch": 0.4920393686410021,
|
| 3986 |
+
"grad_norm": 0.13818837702274323,
|
| 3987 |
+
"learning_rate": 0.001,
|
| 3988 |
+
"loss": 2.7064,
|
| 3989 |
+
"num_input_tokens_seen": 23488098560,
|
| 3990 |
+
"step": 22400
|
| 3991 |
+
},
|
| 3992 |
+
{
|
| 3993 |
+
"epoch": 0.4931376708031472,
|
| 3994 |
+
"grad_norm": 0.15505041182041168,
|
| 3995 |
+
"learning_rate": 0.001,
|
| 3996 |
+
"loss": 2.7044,
|
| 3997 |
+
"num_input_tokens_seen": 23540527360,
|
| 3998 |
+
"step": 22450
|
| 3999 |
+
},
|
| 4000 |
+
{
|
| 4001 |
+
"epoch": 0.49423597296529226,
|
| 4002 |
+
"grad_norm": 0.14414137601852417,
|
| 4003 |
+
"learning_rate": 0.001,
|
| 4004 |
+
"loss": 2.7046,
|
| 4005 |
+
"num_input_tokens_seen": 23592956160,
|
| 4006 |
+
"step": 22500
|
| 4007 |
+
},
|
| 4008 |
+
{
|
| 4009 |
+
"epoch": 0.49423597296529226,
|
| 4010 |
+
"eval_loss": 2.60188627243042,
|
| 4011 |
+
"eval_runtime": 67.3268,
|
| 4012 |
+
"eval_samples_per_second": 74.265,
|
| 4013 |
+
"eval_steps_per_second": 18.566,
|
| 4014 |
+
"num_input_tokens_seen": 23592956160,
|
| 4015 |
+
"step": 22500
|
| 4016 |
+
},
|
| 4017 |
+
{
|
| 4018 |
+
"epoch": 0.4953342751274374,
|
| 4019 |
+
"grad_norm": 0.14763414859771729,
|
| 4020 |
+
"learning_rate": 0.001,
|
| 4021 |
+
"loss": 2.695,
|
| 4022 |
+
"num_input_tokens_seen": 23645384960,
|
| 4023 |
+
"step": 22550
|
| 4024 |
+
},
|
| 4025 |
+
{
|
| 4026 |
+
"epoch": 0.49643257728958246,
|
| 4027 |
+
"grad_norm": 0.14800110459327698,
|
| 4028 |
+
"learning_rate": 0.001,
|
| 4029 |
+
"loss": 2.6939,
|
| 4030 |
+
"num_input_tokens_seen": 23697813760,
|
| 4031 |
+
"step": 22600
|
| 4032 |
+
},
|
| 4033 |
+
{
|
| 4034 |
+
"epoch": 0.4975308794517276,
|
| 4035 |
+
"grad_norm": 0.13590902090072632,
|
| 4036 |
+
"learning_rate": 0.001,
|
| 4037 |
+
"loss": 2.6967,
|
| 4038 |
+
"num_input_tokens_seen": 23750242560,
|
| 4039 |
+
"step": 22650
|
| 4040 |
+
},
|
| 4041 |
+
{
|
| 4042 |
+
"epoch": 0.49862918161387265,
|
| 4043 |
+
"grad_norm": 0.1315733939409256,
|
| 4044 |
+
"learning_rate": 0.001,
|
| 4045 |
+
"loss": 2.6909,
|
| 4046 |
+
"num_input_tokens_seen": 23802671360,
|
| 4047 |
+
"step": 22700
|
| 4048 |
+
},
|
| 4049 |
+
{
|
| 4050 |
+
"epoch": 0.4997274837760177,
|
| 4051 |
+
"grad_norm": 0.13714700937271118,
|
| 4052 |
+
"learning_rate": 0.001,
|
| 4053 |
+
"loss": 2.6957,
|
| 4054 |
+
"num_input_tokens_seen": 23855100160,
|
| 4055 |
+
"step": 22750
|
| 4056 |
+
},
|
| 4057 |
+
{
|
| 4058 |
+
"epoch": 0.5008257859381628,
|
| 4059 |
+
"grad_norm": 0.1412438154220581,
|
| 4060 |
+
"learning_rate": 0.001,
|
| 4061 |
+
"loss": 2.6977,
|
| 4062 |
+
"num_input_tokens_seen": 23907528960,
|
| 4063 |
+
"step": 22800
|
| 4064 |
+
},
|
| 4065 |
+
{
|
| 4066 |
+
"epoch": 0.501924088100308,
|
| 4067 |
+
"grad_norm": 0.15368172526359558,
|
| 4068 |
+
"learning_rate": 0.001,
|
| 4069 |
+
"loss": 2.6977,
|
| 4070 |
+
"num_input_tokens_seen": 23959957760,
|
| 4071 |
+
"step": 22850
|
| 4072 |
+
},
|
| 4073 |
+
{
|
| 4074 |
+
"epoch": 0.503022390262453,
|
| 4075 |
+
"grad_norm": 0.14018824696540833,
|
| 4076 |
+
"learning_rate": 0.001,
|
| 4077 |
+
"loss": 2.6992,
|
| 4078 |
+
"num_input_tokens_seen": 24012386560,
|
| 4079 |
+
"step": 22900
|
| 4080 |
+
},
|
| 4081 |
+
{
|
| 4082 |
+
"epoch": 0.5041206924245981,
|
| 4083 |
+
"grad_norm": 0.1284814178943634,
|
| 4084 |
+
"learning_rate": 0.001,
|
| 4085 |
+
"loss": 2.6962,
|
| 4086 |
+
"num_input_tokens_seen": 24064815360,
|
| 4087 |
+
"step": 22950
|
| 4088 |
+
},
|
| 4089 |
+
{
|
| 4090 |
+
"epoch": 0.5052189945867432,
|
| 4091 |
+
"grad_norm": 0.15145835280418396,
|
| 4092 |
+
"learning_rate": 0.001,
|
| 4093 |
+
"loss": 2.692,
|
| 4094 |
+
"num_input_tokens_seen": 24117244160,
|
| 4095 |
+
"step": 23000
|
| 4096 |
+
},
|
| 4097 |
+
{
|
| 4098 |
+
"epoch": 0.5052189945867432,
|
| 4099 |
+
"eval_loss": 2.5970778465270996,
|
| 4100 |
+
"eval_runtime": 66.1666,
|
| 4101 |
+
"eval_samples_per_second": 75.567,
|
| 4102 |
+
"eval_steps_per_second": 18.892,
|
| 4103 |
+
"num_input_tokens_seen": 24117244160,
|
| 4104 |
+
"step": 23000
|
| 4105 |
}
|
| 4106 |
],
|
| 4107 |
"logging_steps": 50,
|
| 4108 |
"max_steps": 200000,
|
| 4109 |
+
"num_input_tokens_seen": 24117244160,
|
| 4110 |
"num_train_epochs": 5,
|
| 4111 |
"save_steps": 1000,
|
| 4112 |
"stateful_callbacks": {
|
|
|
|
| 4121 |
"attributes": {}
|
| 4122 |
}
|
| 4123 |
},
|
| 4124 |
+
"total_flos": 1.3734947280285204e+19,
|
| 4125 |
"train_batch_size": 64,
|
| 4126 |
"trial_name": null,
|
| 4127 |
"trial_params": null
|