Instructions to use rovdetection/code-1b-instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use rovdetection/code-1b-instruct with Transformers:
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("rovdetection/code-1b-instruct", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
Training in progress, step 4500, checkpoint
Browse files
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 9446744
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b732646b1016d0368b94920529e0e03c133894ca8756d67e145a97d90d254777
|
| 3 |
size 9446744
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4879947
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d1728e885cf58302b2e8ae68b6c9f146637db471aa0ed43e5c883bad6235443e
|
| 3 |
size 4879947
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14917
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:16de339ad05cf2ba88ca8586907951353749d574c9326b3098589fb0f62ac32e
|
| 3 |
size 14917
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14917
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2cefe33faabb000e8f719c6f02e0099d6289469d78aca45133006441981cd323
|
| 3 |
size 14917
|
last-checkpoint/scaler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1383
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b62db0ba9861d9ab63380744e79a287faa461a1bf55700140a411fe1e976f1cd
|
| 3 |
size 1383
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b41aa0c086667ab13fd1c3da2f8b431d894c7368cafdbcdd2e5351f4800eddf8
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -4008,6 +4008,506 @@
|
|
| 4008 |
"mean_token_accuracy": 0.6625144556164742,
|
| 4009 |
"num_tokens": 23764831.0,
|
| 4010 |
"step": 4000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4011 |
}
|
| 4012 |
],
|
| 4013 |
"logging_steps": 10,
|
|
@@ -4027,7 +4527,7 @@
|
|
| 4027 |
"attributes": {}
|
| 4028 |
}
|
| 4029 |
},
|
| 4030 |
-
"total_flos":
|
| 4031 |
"train_batch_size": 2,
|
| 4032 |
"trial_name": null,
|
| 4033 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 7.732430689877498,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 4500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 4008 |
"mean_token_accuracy": 0.6625144556164742,
|
| 4009 |
"num_tokens": 23764831.0,
|
| 4010 |
"step": 4000
|
| 4011 |
+
},
|
| 4012 |
+
{
|
| 4013 |
+
"entropy": 1.699565550684929,
|
| 4014 |
+
"epoch": 6.890608209757146,
|
| 4015 |
+
"grad_norm": 0.7662839889526367,
|
| 4016 |
+
"learning_rate": 3.964e-05,
|
| 4017 |
+
"loss": 1.7373327255249023,
|
| 4018 |
+
"mean_token_accuracy": 0.6809282444417477,
|
| 4019 |
+
"num_tokens": 23825367.0,
|
| 4020 |
+
"step": 4010
|
| 4021 |
+
},
|
| 4022 |
+
{
|
| 4023 |
+
"entropy": 1.6455101184546947,
|
| 4024 |
+
"epoch": 6.907801418439716,
|
| 4025 |
+
"grad_norm": 0.7619901299476624,
|
| 4026 |
+
"learning_rate": 3.9240000000000004e-05,
|
| 4027 |
+
"loss": 1.709805679321289,
|
| 4028 |
+
"mean_token_accuracy": 0.6812954246997833,
|
| 4029 |
+
"num_tokens": 23887369.0,
|
| 4030 |
+
"step": 4020
|
| 4031 |
+
},
|
| 4032 |
+
{
|
| 4033 |
+
"entropy": 1.7952800825238229,
|
| 4034 |
+
"epoch": 6.924994627122286,
|
| 4035 |
+
"grad_norm": 0.7858437299728394,
|
| 4036 |
+
"learning_rate": 3.884e-05,
|
| 4037 |
+
"loss": 1.8688398361206056,
|
| 4038 |
+
"mean_token_accuracy": 0.6621494639664889,
|
| 4039 |
+
"num_tokens": 23949358.0,
|
| 4040 |
+
"step": 4030
|
| 4041 |
+
},
|
| 4042 |
+
{
|
| 4043 |
+
"entropy": 1.772008201479912,
|
| 4044 |
+
"epoch": 6.942187835804857,
|
| 4045 |
+
"grad_norm": 0.7586779594421387,
|
| 4046 |
+
"learning_rate": 3.8440000000000005e-05,
|
| 4047 |
+
"loss": 1.798760986328125,
|
| 4048 |
+
"mean_token_accuracy": 0.667642817273736,
|
| 4049 |
+
"num_tokens": 24009691.0,
|
| 4050 |
+
"step": 4040
|
| 4051 |
+
},
|
| 4052 |
+
{
|
| 4053 |
+
"entropy": 1.7289930269122125,
|
| 4054 |
+
"epoch": 6.959381044487428,
|
| 4055 |
+
"grad_norm": 0.854505717754364,
|
| 4056 |
+
"learning_rate": 3.804e-05,
|
| 4057 |
+
"loss": 1.771562385559082,
|
| 4058 |
+
"mean_token_accuracy": 0.6692178774625063,
|
| 4059 |
+
"num_tokens": 24064506.0,
|
| 4060 |
+
"step": 4050
|
| 4061 |
+
},
|
| 4062 |
+
{
|
| 4063 |
+
"entropy": 1.715189914405346,
|
| 4064 |
+
"epoch": 6.976574253169998,
|
| 4065 |
+
"grad_norm": 0.758488655090332,
|
| 4066 |
+
"learning_rate": 3.7640000000000006e-05,
|
| 4067 |
+
"loss": 1.756412887573242,
|
| 4068 |
+
"mean_token_accuracy": 0.6710222817957401,
|
| 4069 |
+
"num_tokens": 24126841.0,
|
| 4070 |
+
"step": 4060
|
| 4071 |
+
},
|
| 4072 |
+
{
|
| 4073 |
+
"entropy": 1.7383173301815986,
|
| 4074 |
+
"epoch": 6.993767461852569,
|
| 4075 |
+
"grad_norm": 0.7450618147850037,
|
| 4076 |
+
"learning_rate": 3.724e-05,
|
| 4077 |
+
"loss": 1.7997669219970702,
|
| 4078 |
+
"mean_token_accuracy": 0.6649864386767149,
|
| 4079 |
+
"num_tokens": 24186159.0,
|
| 4080 |
+
"step": 4070
|
| 4081 |
+
},
|
| 4082 |
+
{
|
| 4083 |
+
"entropy": 1.7172312767474682,
|
| 4084 |
+
"epoch": 7.010315925209542,
|
| 4085 |
+
"grad_norm": 0.8475770950317383,
|
| 4086 |
+
"learning_rate": 3.684e-05,
|
| 4087 |
+
"loss": 1.7585922241210938,
|
| 4088 |
+
"mean_token_accuracy": 0.6746863397684965,
|
| 4089 |
+
"num_tokens": 24239759.0,
|
| 4090 |
+
"step": 4080
|
| 4091 |
+
},
|
| 4092 |
+
{
|
| 4093 |
+
"entropy": 1.7192407630383968,
|
| 4094 |
+
"epoch": 7.027509133892113,
|
| 4095 |
+
"grad_norm": 0.7818967700004578,
|
| 4096 |
+
"learning_rate": 3.6440000000000003e-05,
|
| 4097 |
+
"loss": 1.7634265899658204,
|
| 4098 |
+
"mean_token_accuracy": 0.6724576361477375,
|
| 4099 |
+
"num_tokens": 24298775.0,
|
| 4100 |
+
"step": 4090
|
| 4101 |
+
},
|
| 4102 |
+
{
|
| 4103 |
+
"entropy": 1.7496131911873818,
|
| 4104 |
+
"epoch": 7.044702342574683,
|
| 4105 |
+
"grad_norm": 0.8118335008621216,
|
| 4106 |
+
"learning_rate": 3.604e-05,
|
| 4107 |
+
"loss": 1.802253532409668,
|
| 4108 |
+
"mean_token_accuracy": 0.6702191606163979,
|
| 4109 |
+
"num_tokens": 24361142.0,
|
| 4110 |
+
"step": 4100
|
| 4111 |
+
},
|
| 4112 |
+
{
|
| 4113 |
+
"entropy": 1.7090509735047816,
|
| 4114 |
+
"epoch": 7.061895551257253,
|
| 4115 |
+
"grad_norm": 0.8414726257324219,
|
| 4116 |
+
"learning_rate": 3.5640000000000004e-05,
|
| 4117 |
+
"loss": 1.7347373962402344,
|
| 4118 |
+
"mean_token_accuracy": 0.679864277690649,
|
| 4119 |
+
"num_tokens": 24419838.0,
|
| 4120 |
+
"step": 4110
|
| 4121 |
+
},
|
| 4122 |
+
{
|
| 4123 |
+
"entropy": 1.6807728812098504,
|
| 4124 |
+
"epoch": 7.079088759939824,
|
| 4125 |
+
"grad_norm": 0.8567139506340027,
|
| 4126 |
+
"learning_rate": 3.524e-05,
|
| 4127 |
+
"loss": 1.7365150451660156,
|
| 4128 |
+
"mean_token_accuracy": 0.6765194039791822,
|
| 4129 |
+
"num_tokens": 24477518.0,
|
| 4130 |
+
"step": 4120
|
| 4131 |
+
},
|
| 4132 |
+
{
|
| 4133 |
+
"entropy": 1.709678091108799,
|
| 4134 |
+
"epoch": 7.096281968622394,
|
| 4135 |
+
"grad_norm": 0.8345620036125183,
|
| 4136 |
+
"learning_rate": 3.484e-05,
|
| 4137 |
+
"loss": 1.730575180053711,
|
| 4138 |
+
"mean_token_accuracy": 0.6709145799279213,
|
| 4139 |
+
"num_tokens": 24534560.0,
|
| 4140 |
+
"step": 4130
|
| 4141 |
+
},
|
| 4142 |
+
{
|
| 4143 |
+
"entropy": 1.6541544690728187,
|
| 4144 |
+
"epoch": 7.113475177304965,
|
| 4145 |
+
"grad_norm": 0.8509814143180847,
|
| 4146 |
+
"learning_rate": 3.444e-05,
|
| 4147 |
+
"loss": 1.6795757293701172,
|
| 4148 |
+
"mean_token_accuracy": 0.6856038823723793,
|
| 4149 |
+
"num_tokens": 24594829.0,
|
| 4150 |
+
"step": 4140
|
| 4151 |
+
},
|
| 4152 |
+
{
|
| 4153 |
+
"entropy": 1.7498343527317046,
|
| 4154 |
+
"epoch": 7.130668385987535,
|
| 4155 |
+
"grad_norm": 0.8674039244651794,
|
| 4156 |
+
"learning_rate": 3.404e-05,
|
| 4157 |
+
"loss": 1.8083892822265626,
|
| 4158 |
+
"mean_token_accuracy": 0.6709578204900026,
|
| 4159 |
+
"num_tokens": 24656798.0,
|
| 4160 |
+
"step": 4150
|
| 4161 |
+
},
|
| 4162 |
+
{
|
| 4163 |
+
"entropy": 1.677807478606701,
|
| 4164 |
+
"epoch": 7.147861594670105,
|
| 4165 |
+
"grad_norm": 0.8016234040260315,
|
| 4166 |
+
"learning_rate": 3.3639999999999996e-05,
|
| 4167 |
+
"loss": 1.7206790924072266,
|
| 4168 |
+
"mean_token_accuracy": 0.6754934191703796,
|
| 4169 |
+
"num_tokens": 24714009.0,
|
| 4170 |
+
"step": 4160
|
| 4171 |
+
},
|
| 4172 |
+
{
|
| 4173 |
+
"entropy": 1.672835360467434,
|
| 4174 |
+
"epoch": 7.1650548033526755,
|
| 4175 |
+
"grad_norm": 0.7139334082603455,
|
| 4176 |
+
"learning_rate": 3.324e-05,
|
| 4177 |
+
"loss": 1.7049163818359374,
|
| 4178 |
+
"mean_token_accuracy": 0.6851269513368606,
|
| 4179 |
+
"num_tokens": 24778022.0,
|
| 4180 |
+
"step": 4170
|
| 4181 |
+
},
|
| 4182 |
+
{
|
| 4183 |
+
"entropy": 1.6577355667948723,
|
| 4184 |
+
"epoch": 7.182248012035246,
|
| 4185 |
+
"grad_norm": 0.9129847288131714,
|
| 4186 |
+
"learning_rate": 3.2840000000000004e-05,
|
| 4187 |
+
"loss": 1.7073640823364258,
|
| 4188 |
+
"mean_token_accuracy": 0.6768647953867912,
|
| 4189 |
+
"num_tokens": 24837669.0,
|
| 4190 |
+
"step": 4180
|
| 4191 |
+
},
|
| 4192 |
+
{
|
| 4193 |
+
"entropy": 1.7049853071570396,
|
| 4194 |
+
"epoch": 7.199441220717817,
|
| 4195 |
+
"grad_norm": 0.7545643448829651,
|
| 4196 |
+
"learning_rate": 3.244e-05,
|
| 4197 |
+
"loss": 1.754374122619629,
|
| 4198 |
+
"mean_token_accuracy": 0.6808854278177023,
|
| 4199 |
+
"num_tokens": 24898991.0,
|
| 4200 |
+
"step": 4190
|
| 4201 |
+
},
|
| 4202 |
+
{
|
| 4203 |
+
"entropy": 1.6785477355122567,
|
| 4204 |
+
"epoch": 7.216634429400387,
|
| 4205 |
+
"grad_norm": 0.8802333474159241,
|
| 4206 |
+
"learning_rate": 3.2040000000000005e-05,
|
| 4207 |
+
"loss": 1.6974828720092774,
|
| 4208 |
+
"mean_token_accuracy": 0.6824289247393608,
|
| 4209 |
+
"num_tokens": 24957348.0,
|
| 4210 |
+
"step": 4200
|
| 4211 |
+
},
|
| 4212 |
+
{
|
| 4213 |
+
"entropy": 1.7312355414032936,
|
| 4214 |
+
"epoch": 7.233827638082957,
|
| 4215 |
+
"grad_norm": 0.8227038383483887,
|
| 4216 |
+
"learning_rate": 3.164e-05,
|
| 4217 |
+
"loss": 1.7645183563232423,
|
| 4218 |
+
"mean_token_accuracy": 0.6661410238593817,
|
| 4219 |
+
"num_tokens": 25016658.0,
|
| 4220 |
+
"step": 4210
|
| 4221 |
+
},
|
| 4222 |
+
{
|
| 4223 |
+
"entropy": 1.8124181643128394,
|
| 4224 |
+
"epoch": 7.2510208467655275,
|
| 4225 |
+
"grad_norm": 0.8563106060028076,
|
| 4226 |
+
"learning_rate": 3.1240000000000006e-05,
|
| 4227 |
+
"loss": 1.8163776397705078,
|
| 4228 |
+
"mean_token_accuracy": 0.6610642150044441,
|
| 4229 |
+
"num_tokens": 25074658.0,
|
| 4230 |
+
"step": 4220
|
| 4231 |
+
},
|
| 4232 |
+
{
|
| 4233 |
+
"entropy": 1.776869924366474,
|
| 4234 |
+
"epoch": 7.268214055448098,
|
| 4235 |
+
"grad_norm": 0.8615058064460754,
|
| 4236 |
+
"learning_rate": 3.084e-05,
|
| 4237 |
+
"loss": 1.861563491821289,
|
| 4238 |
+
"mean_token_accuracy": 0.6624562762677669,
|
| 4239 |
+
"num_tokens": 25132732.0,
|
| 4240 |
+
"step": 4230
|
| 4241 |
+
},
|
| 4242 |
+
{
|
| 4243 |
+
"entropy": 1.742109003663063,
|
| 4244 |
+
"epoch": 7.285407264130669,
|
| 4245 |
+
"grad_norm": 0.7851050496101379,
|
| 4246 |
+
"learning_rate": 3.0440000000000003e-05,
|
| 4247 |
+
"loss": 1.7527351379394531,
|
| 4248 |
+
"mean_token_accuracy": 0.6712357953190804,
|
| 4249 |
+
"num_tokens": 25194009.0,
|
| 4250 |
+
"step": 4240
|
| 4251 |
+
},
|
| 4252 |
+
{
|
| 4253 |
+
"entropy": 1.7356494843959809,
|
| 4254 |
+
"epoch": 7.302600472813239,
|
| 4255 |
+
"grad_norm": 0.8842288255691528,
|
| 4256 |
+
"learning_rate": 3.004e-05,
|
| 4257 |
+
"loss": 1.8091196060180663,
|
| 4258 |
+
"mean_token_accuracy": 0.6680308949202299,
|
| 4259 |
+
"num_tokens": 25250681.0,
|
| 4260 |
+
"step": 4250
|
| 4261 |
+
},
|
| 4262 |
+
{
|
| 4263 |
+
"entropy": 1.714112138748169,
|
| 4264 |
+
"epoch": 7.319793681495809,
|
| 4265 |
+
"grad_norm": 0.8050926923751831,
|
| 4266 |
+
"learning_rate": 2.964e-05,
|
| 4267 |
+
"loss": 1.741617774963379,
|
| 4268 |
+
"mean_token_accuracy": 0.6764710985124112,
|
| 4269 |
+
"num_tokens": 25307119.0,
|
| 4270 |
+
"step": 4260
|
| 4271 |
+
},
|
| 4272 |
+
{
|
| 4273 |
+
"entropy": 1.7806825146079064,
|
| 4274 |
+
"epoch": 7.3369868901783795,
|
| 4275 |
+
"grad_norm": 0.755797803401947,
|
| 4276 |
+
"learning_rate": 2.924e-05,
|
| 4277 |
+
"loss": 1.8448747634887694,
|
| 4278 |
+
"mean_token_accuracy": 0.6646751999855042,
|
| 4279 |
+
"num_tokens": 25365721.0,
|
| 4280 |
+
"step": 4270
|
| 4281 |
+
},
|
| 4282 |
+
{
|
| 4283 |
+
"entropy": 1.7478718511760234,
|
| 4284 |
+
"epoch": 7.35418009886095,
|
| 4285 |
+
"grad_norm": 0.8148614764213562,
|
| 4286 |
+
"learning_rate": 2.8840000000000002e-05,
|
| 4287 |
+
"loss": 1.8303293228149413,
|
| 4288 |
+
"mean_token_accuracy": 0.6662985436618328,
|
| 4289 |
+
"num_tokens": 25423309.0,
|
| 4290 |
+
"step": 4280
|
| 4291 |
+
},
|
| 4292 |
+
{
|
| 4293 |
+
"entropy": 1.6996045634150505,
|
| 4294 |
+
"epoch": 7.371373307543521,
|
| 4295 |
+
"grad_norm": 0.7613778114318848,
|
| 4296 |
+
"learning_rate": 2.844e-05,
|
| 4297 |
+
"loss": 1.7077817916870117,
|
| 4298 |
+
"mean_token_accuracy": 0.679437268525362,
|
| 4299 |
+
"num_tokens": 25480080.0,
|
| 4300 |
+
"step": 4290
|
| 4301 |
+
},
|
| 4302 |
+
{
|
| 4303 |
+
"entropy": 1.8055237784981728,
|
| 4304 |
+
"epoch": 7.38856651622609,
|
| 4305 |
+
"grad_norm": 0.899900496006012,
|
| 4306 |
+
"learning_rate": 2.804e-05,
|
| 4307 |
+
"loss": 1.882634735107422,
|
| 4308 |
+
"mean_token_accuracy": 0.659589122608304,
|
| 4309 |
+
"num_tokens": 25538885.0,
|
| 4310 |
+
"step": 4300
|
| 4311 |
+
},
|
| 4312 |
+
{
|
| 4313 |
+
"entropy": 1.6835025876760483,
|
| 4314 |
+
"epoch": 7.405759724908661,
|
| 4315 |
+
"grad_norm": 0.7718909382820129,
|
| 4316 |
+
"learning_rate": 2.764e-05,
|
| 4317 |
+
"loss": 1.7145641326904297,
|
| 4318 |
+
"mean_token_accuracy": 0.6805526971817016,
|
| 4319 |
+
"num_tokens": 25598830.0,
|
| 4320 |
+
"step": 4310
|
| 4321 |
+
},
|
| 4322 |
+
{
|
| 4323 |
+
"entropy": 1.7392980232834816,
|
| 4324 |
+
"epoch": 7.422952933591231,
|
| 4325 |
+
"grad_norm": 0.7144562005996704,
|
| 4326 |
+
"learning_rate": 2.724e-05,
|
| 4327 |
+
"loss": 1.7779796600341797,
|
| 4328 |
+
"mean_token_accuracy": 0.6709600411355495,
|
| 4329 |
+
"num_tokens": 25660275.0,
|
| 4330 |
+
"step": 4320
|
| 4331 |
+
},
|
| 4332 |
+
{
|
| 4333 |
+
"entropy": 1.7193088322877883,
|
| 4334 |
+
"epoch": 7.440146142273802,
|
| 4335 |
+
"grad_norm": 0.8038010001182556,
|
| 4336 |
+
"learning_rate": 2.6840000000000004e-05,
|
| 4337 |
+
"loss": 1.7928234100341798,
|
| 4338 |
+
"mean_token_accuracy": 0.6767275612801313,
|
| 4339 |
+
"num_tokens": 25719958.0,
|
| 4340 |
+
"step": 4330
|
| 4341 |
+
},
|
| 4342 |
+
{
|
| 4343 |
+
"entropy": 1.7314304433763028,
|
| 4344 |
+
"epoch": 7.457339350956373,
|
| 4345 |
+
"grad_norm": 0.7783089876174927,
|
| 4346 |
+
"learning_rate": 2.6440000000000004e-05,
|
| 4347 |
+
"loss": 1.7952003479003906,
|
| 4348 |
+
"mean_token_accuracy": 0.6740467935800553,
|
| 4349 |
+
"num_tokens": 25776689.0,
|
| 4350 |
+
"step": 4340
|
| 4351 |
+
},
|
| 4352 |
+
{
|
| 4353 |
+
"entropy": 1.74028614833951,
|
| 4354 |
+
"epoch": 7.474532559638942,
|
| 4355 |
+
"grad_norm": 0.8052565455436707,
|
| 4356 |
+
"learning_rate": 2.6040000000000005e-05,
|
| 4357 |
+
"loss": 1.7803146362304687,
|
| 4358 |
+
"mean_token_accuracy": 0.6733121275901794,
|
| 4359 |
+
"num_tokens": 25837916.0,
|
| 4360 |
+
"step": 4350
|
| 4361 |
+
},
|
| 4362 |
+
{
|
| 4363 |
+
"entropy": 1.6831192195415496,
|
| 4364 |
+
"epoch": 7.491725768321513,
|
| 4365 |
+
"grad_norm": 0.8941977024078369,
|
| 4366 |
+
"learning_rate": 2.5640000000000002e-05,
|
| 4367 |
+
"loss": 1.7077743530273437,
|
| 4368 |
+
"mean_token_accuracy": 0.6749852932989597,
|
| 4369 |
+
"num_tokens": 25896712.0,
|
| 4370 |
+
"step": 4360
|
| 4371 |
+
},
|
| 4372 |
+
{
|
| 4373 |
+
"entropy": 1.7840609520673751,
|
| 4374 |
+
"epoch": 7.508918977004083,
|
| 4375 |
+
"grad_norm": 0.818671703338623,
|
| 4376 |
+
"learning_rate": 2.5240000000000002e-05,
|
| 4377 |
+
"loss": 1.8329656600952149,
|
| 4378 |
+
"mean_token_accuracy": 0.6679215718060731,
|
| 4379 |
+
"num_tokens": 25958383.0,
|
| 4380 |
+
"step": 4370
|
| 4381 |
+
},
|
| 4382 |
+
{
|
| 4383 |
+
"entropy": 1.76528559923172,
|
| 4384 |
+
"epoch": 7.526112185686654,
|
| 4385 |
+
"grad_norm": 0.7579294443130493,
|
| 4386 |
+
"learning_rate": 2.4840000000000003e-05,
|
| 4387 |
+
"loss": 1.7914703369140625,
|
| 4388 |
+
"mean_token_accuracy": 0.6695499271154404,
|
| 4389 |
+
"num_tokens": 26017754.0,
|
| 4390 |
+
"step": 4380
|
| 4391 |
+
},
|
| 4392 |
+
{
|
| 4393 |
+
"entropy": 1.704708030819893,
|
| 4394 |
+
"epoch": 7.5433053943692245,
|
| 4395 |
+
"grad_norm": 0.8200159668922424,
|
| 4396 |
+
"learning_rate": 2.4440000000000003e-05,
|
| 4397 |
+
"loss": 1.774311637878418,
|
| 4398 |
+
"mean_token_accuracy": 0.6739427134394645,
|
| 4399 |
+
"num_tokens": 26075760.0,
|
| 4400 |
+
"step": 4390
|
| 4401 |
+
},
|
| 4402 |
+
{
|
| 4403 |
+
"entropy": 1.7540104657411575,
|
| 4404 |
+
"epoch": 7.560498603051794,
|
| 4405 |
+
"grad_norm": 0.8373399972915649,
|
| 4406 |
+
"learning_rate": 2.404e-05,
|
| 4407 |
+
"loss": 1.796240997314453,
|
| 4408 |
+
"mean_token_accuracy": 0.6640590511262416,
|
| 4409 |
+
"num_tokens": 26133858.0,
|
| 4410 |
+
"step": 4400
|
| 4411 |
+
},
|
| 4412 |
+
{
|
| 4413 |
+
"entropy": 1.754172220826149,
|
| 4414 |
+
"epoch": 7.577691811734365,
|
| 4415 |
+
"grad_norm": 0.7368677258491516,
|
| 4416 |
+
"learning_rate": 2.364e-05,
|
| 4417 |
+
"loss": 1.8175994873046875,
|
| 4418 |
+
"mean_token_accuracy": 0.6717667855322361,
|
| 4419 |
+
"num_tokens": 26197518.0,
|
| 4420 |
+
"step": 4410
|
| 4421 |
+
},
|
| 4422 |
+
{
|
| 4423 |
+
"entropy": 1.6564558774232865,
|
| 4424 |
+
"epoch": 7.594885020416935,
|
| 4425 |
+
"grad_norm": 0.8868939280509949,
|
| 4426 |
+
"learning_rate": 2.324e-05,
|
| 4427 |
+
"loss": 1.669070053100586,
|
| 4428 |
+
"mean_token_accuracy": 0.6839951984584332,
|
| 4429 |
+
"num_tokens": 26250823.0,
|
| 4430 |
+
"step": 4420
|
| 4431 |
+
},
|
| 4432 |
+
{
|
| 4433 |
+
"entropy": 1.7594470486044884,
|
| 4434 |
+
"epoch": 7.612078229099506,
|
| 4435 |
+
"grad_norm": 0.86412513256073,
|
| 4436 |
+
"learning_rate": 2.284e-05,
|
| 4437 |
+
"loss": 1.8095222473144532,
|
| 4438 |
+
"mean_token_accuracy": 0.666244950518012,
|
| 4439 |
+
"num_tokens": 26312548.0,
|
| 4440 |
+
"step": 4430
|
| 4441 |
+
},
|
| 4442 |
+
{
|
| 4443 |
+
"entropy": 1.7646627604961396,
|
| 4444 |
+
"epoch": 7.6292714377820765,
|
| 4445 |
+
"grad_norm": 0.7128214836120605,
|
| 4446 |
+
"learning_rate": 2.244e-05,
|
| 4447 |
+
"loss": 1.832158660888672,
|
| 4448 |
+
"mean_token_accuracy": 0.6679420609027147,
|
| 4449 |
+
"num_tokens": 26376747.0,
|
| 4450 |
+
"step": 4440
|
| 4451 |
+
},
|
| 4452 |
+
{
|
| 4453 |
+
"entropy": 1.7401177063584328,
|
| 4454 |
+
"epoch": 7.646464646464646,
|
| 4455 |
+
"grad_norm": 0.7479432225227356,
|
| 4456 |
+
"learning_rate": 2.2040000000000002e-05,
|
| 4457 |
+
"loss": 1.7779264450073242,
|
| 4458 |
+
"mean_token_accuracy": 0.6710429213941097,
|
| 4459 |
+
"num_tokens": 26438907.0,
|
| 4460 |
+
"step": 4450
|
| 4461 |
+
},
|
| 4462 |
+
{
|
| 4463 |
+
"entropy": 1.6960709124803544,
|
| 4464 |
+
"epoch": 7.663657855147217,
|
| 4465 |
+
"grad_norm": 0.8182732462882996,
|
| 4466 |
+
"learning_rate": 2.1640000000000003e-05,
|
| 4467 |
+
"loss": 1.7709745407104491,
|
| 4468 |
+
"mean_token_accuracy": 0.6782359674572944,
|
| 4469 |
+
"num_tokens": 26499840.0,
|
| 4470 |
+
"step": 4460
|
| 4471 |
+
},
|
| 4472 |
+
{
|
| 4473 |
+
"entropy": 1.8024938970804214,
|
| 4474 |
+
"epoch": 7.680851063829787,
|
| 4475 |
+
"grad_norm": 0.8208670020103455,
|
| 4476 |
+
"learning_rate": 2.124e-05,
|
| 4477 |
+
"loss": 1.8752277374267579,
|
| 4478 |
+
"mean_token_accuracy": 0.6610838636755944,
|
| 4479 |
+
"num_tokens": 26561739.0,
|
| 4480 |
+
"step": 4470
|
| 4481 |
+
},
|
| 4482 |
+
{
|
| 4483 |
+
"entropy": 1.6679524429142476,
|
| 4484 |
+
"epoch": 7.698044272512358,
|
| 4485 |
+
"grad_norm": 0.7669119834899902,
|
| 4486 |
+
"learning_rate": 2.084e-05,
|
| 4487 |
+
"loss": 1.6840700149536132,
|
| 4488 |
+
"mean_token_accuracy": 0.6839361816644669,
|
| 4489 |
+
"num_tokens": 26618997.0,
|
| 4490 |
+
"step": 4480
|
| 4491 |
+
},
|
| 4492 |
+
{
|
| 4493 |
+
"entropy": 1.669876104593277,
|
| 4494 |
+
"epoch": 7.715237481194928,
|
| 4495 |
+
"grad_norm": 0.8296427130699158,
|
| 4496 |
+
"learning_rate": 2.044e-05,
|
| 4497 |
+
"loss": 1.6926704406738282,
|
| 4498 |
+
"mean_token_accuracy": 0.6837400387972593,
|
| 4499 |
+
"num_tokens": 26677617.0,
|
| 4500 |
+
"step": 4490
|
| 4501 |
+
},
|
| 4502 |
+
{
|
| 4503 |
+
"entropy": 1.7478768080472946,
|
| 4504 |
+
"epoch": 7.732430689877498,
|
| 4505 |
+
"grad_norm": 0.9231081008911133,
|
| 4506 |
+
"learning_rate": 2.004e-05,
|
| 4507 |
+
"loss": 1.8043970108032226,
|
| 4508 |
+
"mean_token_accuracy": 0.6680058591067791,
|
| 4509 |
+
"num_tokens": 26735542.0,
|
| 4510 |
+
"step": 4500
|
| 4511 |
}
|
| 4512 |
],
|
| 4513 |
"logging_steps": 10,
|
|
|
|
| 4527 |
"attributes": {}
|
| 4528 |
}
|
| 4529 |
},
|
| 4530 |
+
"total_flos": 2.19451190411264e+17,
|
| 4531 |
"train_batch_size": 2,
|
| 4532 |
"trial_name": null,
|
| 4533 |
"trial_params": null
|