RayDu0010 committed on
Commit
cf623ec
·
verified ·
1 Parent(s): 8bd6693

Upload folder using huggingface_hub

Browse files
10_128_e3_3e-5/README.md ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: apache-2.0
4
+ base_model: ibm-granite/granite-3.3-8b-base
5
+ tags:
6
+ - alignment-handbook
7
+ - generated_from_trainer
8
+ datasets:
9
+ - data/knowledge_lora_training_data_2000
10
+ model-index:
11
+ - name: 10_128_e3_3e-5
12
+ results: []
13
+ ---
14
+
15
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
16
+ should probably proofread and complete it, then remove this comment. -->
17
+
18
+ # 10_128_e3_3e-5
19
+
20
+ This model is a LoRA adapter (trained with PEFT; rank 128, alpha 256) for [ibm-granite/granite-3.3-8b-base](https://huggingface.co/ibm-granite/granite-3.3-8b-base), fine-tuned on the data/knowledge_lora_training_data_2000 dataset.
21
+
22
+ ## Model description
23
+
24
+ More information needed
25
+
26
+ ## Intended uses & limitations
27
+
28
+ More information needed
29
+
30
+ ## Training and evaluation data
31
+
32
+ More information needed
33
+
34
+ ## Training procedure
35
+
36
+ ### Training hyperparameters
37
+
38
+ The following hyperparameters were used during training:
39
+ - learning_rate: 3e-05
40
+ - train_batch_size: 2
41
+ - eval_batch_size: 8
42
+ - seed: 42
43
+ - distributed_type: multi-GPU
44
+ - num_devices: 8
45
+ - gradient_accumulation_steps: 2
46
+ - total_train_batch_size: 32
47
+ - total_eval_batch_size: 64
48
+ - optimizer: adamw_torch with betas=(0.9,0.95) and epsilon=1e-08 (no additional optimizer arguments)
49
+ - lr_scheduler_type: cosine
50
+ - lr_scheduler_warmup_ratio: 0.05
51
+ - num_epochs: 3.0
52
+
53
+ ### Training results
54
+
55
+
56
+
57
+ ### Framework versions
58
+
59
+ - PEFT 0.15.2
60
+ - Transformers 4.52.4
61
+ - Pytorch 2.7.0+cu126
62
+ - Datasets 3.6.0
63
+ - Tokenizers 0.21.2
10_128_e3_3e-5/adapter_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "ibm-granite/granite-3.3-8b-base",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 256,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.05,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 128,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": [
27
+ "o_proj",
28
+ "v_proj",
29
+ "up_proj",
30
+ "q_proj",
31
+ "down_proj",
32
+ "gate_proj",
33
+ "k_proj"
34
+ ],
35
+ "task_type": "CAUSAL_LM",
36
+ "trainable_token_indices": null,
37
+ "use_dora": false,
38
+ "use_rslora": false
39
+ }
10_128_e3_3e-5/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:421d8a5caa59f4352cc822e9aa3b57d7f801baca10c61e362b48a5664010564d
3
+ size 791751704
10_128_e3_3e-5/all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "total_flos": 1.4978820658094408e+18,
4
+ "train_loss": 0.567423046625721,
5
+ "train_runtime": 689.9192,
6
+ "train_samples": 10902,
7
+ "train_samples_per_second": 47.406,
8
+ "train_steps_per_second": 1.483
9
+ }
10_128_e3_3e-5/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "GraniteForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0,
7
+ "attention_multiplier": 0.0078125,
8
+ "bos_token_id": 0,
9
+ "embedding_multiplier": 12.0,
10
+ "eos_token_id": 0,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 4096,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 12800,
15
+ "logits_scaling": 16.0,
16
+ "max_position_embeddings": 131072,
17
+ "mlp_bias": false,
18
+ "model_type": "granite",
19
+ "num_attention_heads": 32,
20
+ "num_hidden_layers": 40,
21
+ "num_key_value_heads": 8,
22
+ "pad_token_id": 0,
23
+ "residual_multiplier": 0.22,
24
+ "rms_norm_eps": 1e-05,
25
+ "rope_scaling": null,
26
+ "rope_theta": 10000000.0,
27
+ "tie_word_embeddings": true,
28
+ "torch_dtype": "bfloat16",
29
+ "transformers_version": "4.52.4",
30
+ "use_cache": true,
31
+ "vocab_size": 49152
32
+ }
10_128_e3_3e-5/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
10_128_e3_3e-5/special_tokens_map.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<fim_prefix>",
5
+ "<fim_middle>",
6
+ "<fim_suffix>",
7
+ "<fim_pad>",
8
+ "<filename>",
9
+ "<gh_stars>",
10
+ "<issue_start>",
11
+ "<issue_comment>",
12
+ "<issue_closed>",
13
+ "<jupyter_start>",
14
+ "<jupyter_text>",
15
+ "<jupyter_code>",
16
+ "<jupyter_output>",
17
+ "<empty_output>",
18
+ "<commit_before>",
19
+ "<commit_msg>",
20
+ "<commit_after>",
21
+ "<reponame>"
22
+ ],
23
+ "bos_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "eos_token": {
31
+ "content": "<|endoftext|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "pad_token": "<reponame>",
38
+ "unk_token": {
39
+ "content": "<|endoftext|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false
44
+ }
45
+ }
10_128_e3_3e-5/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
10_128_e3_3e-5/tokenizer_config.json ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<fim_prefix>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<fim_middle>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<fim_suffix>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<fim_pad>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": "<filename>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": "<gh_stars>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "<issue_start>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<issue_comment>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<issue_closed>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<jupyter_start>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<jupyter_text>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "12": {
101
+ "content": "<jupyter_code>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "13": {
109
+ "content": "<jupyter_output>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "14": {
117
+ "content": "<empty_output>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "15": {
125
+ "content": "<commit_before>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "16": {
133
+ "content": "<commit_msg>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "17": {
141
+ "content": "<commit_after>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "18": {
149
+ "content": "<reponame>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ }
156
+ },
157
+ "additional_special_tokens": [
158
+ "<|endoftext|>",
159
+ "<fim_prefix>",
160
+ "<fim_middle>",
161
+ "<fim_suffix>",
162
+ "<fim_pad>",
163
+ "<filename>",
164
+ "<gh_stars>",
165
+ "<issue_start>",
166
+ "<issue_comment>",
167
+ "<issue_closed>",
168
+ "<jupyter_start>",
169
+ "<jupyter_text>",
170
+ "<jupyter_code>",
171
+ "<jupyter_output>",
172
+ "<empty_output>",
173
+ "<commit_before>",
174
+ "<commit_msg>",
175
+ "<commit_after>",
176
+ "<reponame>"
177
+ ],
178
+ "bos_token": "<|endoftext|>",
179
+ "clean_up_tokenization_spaces": true,
180
+ "eos_token": "<|endoftext|>",
181
+ "extra_special_tokens": {},
182
+ "model_max_length": 8192,
183
+ "pad_token": "<reponame>",
184
+ "padding_side": "left",
185
+ "tokenizer_class": "GPT2Tokenizer",
186
+ "unk_token": "<|endoftext|>",
187
+ "vocab_size": 49152
188
+ }
10_128_e3_3e-5/train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "total_flos": 1.4978820658094408e+18,
4
+ "train_loss": 0.567423046625721,
5
+ "train_runtime": 689.9192,
6
+ "train_samples": 10902,
7
+ "train_samples_per_second": 47.406,
8
+ "train_steps_per_second": 1.483
9
+ }
10_128_e3_3e-5/trainer_state.json ADDED
@@ -0,0 +1,1471 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1023,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.01466275659824047,
14
+ "grad_norm": 1.404855489730835,
15
+ "learning_rate": 2.307692307692308e-06,
16
+ "loss": 1.2897,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.02932551319648094,
21
+ "grad_norm": 0.9086072444915771,
22
+ "learning_rate": 5.192307692307692e-06,
23
+ "loss": 1.3027,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.04398826979472141,
28
+ "grad_norm": 0.8060473203659058,
29
+ "learning_rate": 8.076923076923077e-06,
30
+ "loss": 1.3126,
31
+ "step": 15
32
+ },
33
+ {
34
+ "epoch": 0.05865102639296188,
35
+ "grad_norm": 0.5817658305168152,
36
+ "learning_rate": 1.0961538461538462e-05,
37
+ "loss": 1.3005,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.07331378299120235,
42
+ "grad_norm": 0.519001305103302,
43
+ "learning_rate": 1.3846153846153847e-05,
44
+ "loss": 1.2342,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.08797653958944282,
49
+ "grad_norm": 0.5387595295906067,
50
+ "learning_rate": 1.673076923076923e-05,
51
+ "loss": 1.2237,
52
+ "step": 30
53
+ },
54
+ {
55
+ "epoch": 0.10263929618768329,
56
+ "grad_norm": 0.5379096269607544,
57
+ "learning_rate": 1.9615384615384617e-05,
58
+ "loss": 1.2331,
59
+ "step": 35
60
+ },
61
+ {
62
+ "epoch": 0.11730205278592376,
63
+ "grad_norm": 0.49764829874038696,
64
+ "learning_rate": 2.25e-05,
65
+ "loss": 1.2284,
66
+ "step": 40
67
+ },
68
+ {
69
+ "epoch": 0.13196480938416422,
70
+ "grad_norm": 0.49528032541275024,
71
+ "learning_rate": 2.5384615384615386e-05,
72
+ "loss": 1.1852,
73
+ "step": 45
74
+ },
75
+ {
76
+ "epoch": 0.1466275659824047,
77
+ "grad_norm": 0.5460355877876282,
78
+ "learning_rate": 2.8269230769230768e-05,
79
+ "loss": 1.1357,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.16129032258064516,
84
+ "grad_norm": 0.428093820810318,
85
+ "learning_rate": 2.9999685962851756e-05,
86
+ "loss": 1.1952,
87
+ "step": 55
88
+ },
89
+ {
90
+ "epoch": 0.17595307917888564,
91
+ "grad_norm": 0.4283364415168762,
92
+ "learning_rate": 2.9996153195943092e-05,
93
+ "loss": 1.1639,
94
+ "step": 60
95
+ },
96
+ {
97
+ "epoch": 0.1906158357771261,
98
+ "grad_norm": 0.4574543833732605,
99
+ "learning_rate": 2.9988696043272093e-05,
100
+ "loss": 1.1343,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 0.20527859237536658,
105
+ "grad_norm": 0.4929468333721161,
106
+ "learning_rate": 2.9977316456322143e-05,
107
+ "loss": 1.1432,
108
+ "step": 70
109
+ },
110
+ {
111
+ "epoch": 0.21994134897360704,
112
+ "grad_norm": 0.6179081797599792,
113
+ "learning_rate": 2.996201741304954e-05,
114
+ "loss": 1.1054,
115
+ "step": 75
116
+ },
117
+ {
118
+ "epoch": 0.23460410557184752,
119
+ "grad_norm": 0.7540493607521057,
120
+ "learning_rate": 2.9942802917104218e-05,
121
+ "loss": 1.1155,
122
+ "step": 80
123
+ },
124
+ {
125
+ "epoch": 0.24926686217008798,
126
+ "grad_norm": 0.5757607221603394,
127
+ "learning_rate": 2.9919677996781987e-05,
128
+ "loss": 1.0311,
129
+ "step": 85
130
+ },
131
+ {
132
+ "epoch": 0.26392961876832843,
133
+ "grad_norm": 0.5738183259963989,
134
+ "learning_rate": 2.989264870370867e-05,
135
+ "loss": 1.0608,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.2785923753665689,
140
+ "grad_norm": 0.5233305096626282,
141
+ "learning_rate": 2.9861722111256466e-05,
142
+ "loss": 1.046,
143
+ "step": 95
144
+ },
145
+ {
146
+ "epoch": 0.2932551319648094,
147
+ "grad_norm": 0.532455563545227,
148
+ "learning_rate": 2.9826906312692855e-05,
149
+ "loss": 0.9954,
150
+ "step": 100
151
+ },
152
+ {
153
+ "epoch": 0.30791788856304986,
154
+ "grad_norm": 0.5668688416481018,
155
+ "learning_rate": 2.9788210419062677e-05,
156
+ "loss": 1.0457,
157
+ "step": 105
158
+ },
159
+ {
160
+ "epoch": 0.3225806451612903,
161
+ "grad_norm": 0.5246180891990662,
162
+ "learning_rate": 2.974564455680383e-05,
163
+ "loss": 1.0178,
164
+ "step": 110
165
+ },
166
+ {
167
+ "epoch": 0.33724340175953077,
168
+ "grad_norm": 0.5526345372200012,
169
+ "learning_rate": 2.969921986509725e-05,
170
+ "loss": 0.9902,
171
+ "step": 115
172
+ },
173
+ {
174
+ "epoch": 0.3519061583577713,
175
+ "grad_norm": 0.554076075553894,
176
+ "learning_rate": 2.964894849295187e-05,
177
+ "loss": 0.9988,
178
+ "step": 120
179
+ },
180
+ {
181
+ "epoch": 0.36656891495601174,
182
+ "grad_norm": 0.5224078893661499,
183
+ "learning_rate": 2.9594843596025315e-05,
184
+ "loss": 1.0225,
185
+ "step": 125
186
+ },
187
+ {
188
+ "epoch": 0.3812316715542522,
189
+ "grad_norm": 0.6201514601707458,
190
+ "learning_rate": 2.953691933318115e-05,
191
+ "loss": 0.9719,
192
+ "step": 130
193
+ },
194
+ {
195
+ "epoch": 0.39589442815249265,
196
+ "grad_norm": 0.5892394781112671,
197
+ "learning_rate": 2.9475190862783628e-05,
198
+ "loss": 0.9845,
199
+ "step": 135
200
+ },
201
+ {
202
+ "epoch": 0.41055718475073316,
203
+ "grad_norm": 0.6271149516105652,
204
+ "learning_rate": 2.940967433873082e-05,
205
+ "loss": 0.925,
206
+ "step": 140
207
+ },
208
+ {
209
+ "epoch": 0.4252199413489736,
210
+ "grad_norm": 0.6505178213119507,
211
+ "learning_rate": 2.9340386906227295e-05,
212
+ "loss": 0.966,
213
+ "step": 145
214
+ },
215
+ {
216
+ "epoch": 0.4398826979472141,
217
+ "grad_norm": 0.6356052160263062,
218
+ "learning_rate": 2.9267346697297322e-05,
219
+ "loss": 0.9323,
220
+ "step": 150
221
+ },
222
+ {
223
+ "epoch": 0.45454545454545453,
224
+ "grad_norm": 0.6684208512306213,
225
+ "learning_rate": 2.919057282603984e-05,
226
+ "loss": 0.9426,
227
+ "step": 155
228
+ },
229
+ {
230
+ "epoch": 0.46920821114369504,
231
+ "grad_norm": 0.7759992480278015,
232
+ "learning_rate": 2.9110085383626453e-05,
233
+ "loss": 0.9315,
234
+ "step": 160
235
+ },
236
+ {
237
+ "epoch": 0.4838709677419355,
238
+ "grad_norm": 0.7040038108825684,
239
+ "learning_rate": 2.902590543304372e-05,
240
+ "loss": 0.9105,
241
+ "step": 165
242
+ },
243
+ {
244
+ "epoch": 0.49853372434017595,
245
+ "grad_norm": 0.7048050165176392,
246
+ "learning_rate": 2.893805500358109e-05,
247
+ "loss": 0.9013,
248
+ "step": 170
249
+ },
250
+ {
251
+ "epoch": 0.5131964809384164,
252
+ "grad_norm": 0.6984567642211914,
253
+ "learning_rate": 2.8846557085066033e-05,
254
+ "loss": 0.888,
255
+ "step": 175
256
+ },
257
+ {
258
+ "epoch": 0.5278592375366569,
259
+ "grad_norm": 0.726510226726532,
260
+ "learning_rate": 2.8751435621847747e-05,
261
+ "loss": 0.8879,
262
+ "step": 180
263
+ },
264
+ {
265
+ "epoch": 0.5425219941348973,
266
+ "grad_norm": 0.6819034814834595,
267
+ "learning_rate": 2.865271550653108e-05,
268
+ "loss": 0.8786,
269
+ "step": 185
270
+ },
271
+ {
272
+ "epoch": 0.5571847507331378,
273
+ "grad_norm": 0.8962215185165405,
274
+ "learning_rate": 2.8550422573462363e-05,
275
+ "loss": 0.8462,
276
+ "step": 190
277
+ },
278
+ {
279
+ "epoch": 0.5718475073313783,
280
+ "grad_norm": 0.6712878346443176,
281
+ "learning_rate": 2.8444583591968676e-05,
282
+ "loss": 0.8501,
283
+ "step": 195
284
+ },
285
+ {
286
+ "epoch": 0.5865102639296188,
287
+ "grad_norm": 0.8235214948654175,
288
+ "learning_rate": 2.8335226259352578e-05,
289
+ "loss": 0.8633,
290
+ "step": 200
291
+ },
292
+ {
293
+ "epoch": 0.6011730205278593,
294
+ "grad_norm": 0.7097874879837036,
295
+ "learning_rate": 2.8222379193643863e-05,
296
+ "loss": 0.8318,
297
+ "step": 205
298
+ },
299
+ {
300
+ "epoch": 0.6158357771260997,
301
+ "grad_norm": 0.7363569140434265,
302
+ "learning_rate": 2.8106071926110472e-05,
303
+ "loss": 0.8345,
304
+ "step": 210
305
+ },
306
+ {
307
+ "epoch": 0.6304985337243402,
308
+ "grad_norm": 0.9131688475608826,
309
+ "learning_rate": 2.7986334893530343e-05,
310
+ "loss": 0.8133,
311
+ "step": 215
312
+ },
313
+ {
314
+ "epoch": 0.6451612903225806,
315
+ "grad_norm": 0.8629419803619385,
316
+ "learning_rate": 2.7863199430226328e-05,
317
+ "loss": 0.8877,
318
+ "step": 220
319
+ },
320
+ {
321
+ "epoch": 0.6598240469208211,
322
+ "grad_norm": 0.77187579870224,
323
+ "learning_rate": 2.7736697759866244e-05,
324
+ "loss": 0.837,
325
+ "step": 225
326
+ },
327
+ {
328
+ "epoch": 0.6744868035190615,
329
+ "grad_norm": 1.0400891304016113,
330
+ "learning_rate": 2.760686298703015e-05,
331
+ "loss": 0.8241,
332
+ "step": 230
333
+ },
334
+ {
335
+ "epoch": 0.6891495601173021,
336
+ "grad_norm": 0.9449446201324463,
337
+ "learning_rate": 2.7473729088547127e-05,
338
+ "loss": 0.782,
339
+ "step": 235
340
+ },
341
+ {
342
+ "epoch": 0.7038123167155426,
343
+ "grad_norm": 1.0150853395462036,
344
+ "learning_rate": 2.7337330904603776e-05,
345
+ "loss": 0.8285,
346
+ "step": 240
347
+ },
348
+ {
349
+ "epoch": 0.718475073313783,
350
+ "grad_norm": 0.8893378376960754,
351
+ "learning_rate": 2.71977041296268e-05,
352
+ "loss": 0.8303,
353
+ "step": 245
354
+ },
355
+ {
356
+ "epoch": 0.7331378299120235,
357
+ "grad_norm": 0.9086685180664062,
358
+ "learning_rate": 2.7054885302942028e-05,
359
+ "loss": 0.7748,
360
+ "step": 250
361
+ },
362
+ {
363
+ "epoch": 0.7478005865102639,
364
+ "grad_norm": 0.8139998912811279,
365
+ "learning_rate": 2.6908911799212322e-05,
366
+ "loss": 0.8112,
367
+ "step": 255
368
+ },
369
+ {
370
+ "epoch": 0.7624633431085044,
371
+ "grad_norm": 0.7712191939353943,
372
+ "learning_rate": 2.6759821818656918e-05,
373
+ "loss": 0.7374,
374
+ "step": 260
375
+ },
376
+ {
377
+ "epoch": 0.7771260997067448,
378
+ "grad_norm": 0.948762059211731,
379
+ "learning_rate": 2.660765437705469e-05,
380
+ "loss": 0.7672,
381
+ "step": 265
382
+ },
383
+ {
384
+ "epoch": 0.7917888563049853,
385
+ "grad_norm": 0.8369901180267334,
386
+ "learning_rate": 2.6452449295533995e-05,
387
+ "loss": 0.7307,
388
+ "step": 270
389
+ },
390
+ {
391
+ "epoch": 0.8064516129032258,
392
+ "grad_norm": 0.8895859122276306,
393
+ "learning_rate": 2.6294247190151776e-05,
394
+ "loss": 0.734,
395
+ "step": 275
396
+ },
397
+ {
398
+ "epoch": 0.8211143695014663,
399
+ "grad_norm": 0.8661187291145325,
400
+ "learning_rate": 2.6133089461264638e-05,
401
+ "loss": 0.7754,
402
+ "step": 280
403
+ },
404
+ {
405
+ "epoch": 0.8357771260997068,
406
+ "grad_norm": 1.0180197954177856,
407
+ "learning_rate": 2.5969018282694648e-05,
408
+ "loss": 0.7032,
409
+ "step": 285
410
+ },
411
+ {
412
+ "epoch": 0.8504398826979472,
413
+ "grad_norm": 0.7993912100791931,
414
+ "learning_rate": 2.5802076590692784e-05,
415
+ "loss": 0.7384,
416
+ "step": 290
417
+ },
418
+ {
419
+ "epoch": 0.8651026392961877,
420
+ "grad_norm": 0.835310161113739,
421
+ "learning_rate": 2.5632308072702797e-05,
422
+ "loss": 0.6594,
423
+ "step": 295
424
+ },
425
+ {
426
+ "epoch": 0.8797653958944281,
427
+ "grad_norm": 0.8714587688446045,
428
+ "learning_rate": 2.5459757155928522e-05,
429
+ "loss": 0.7202,
430
+ "step": 300
431
+ },
432
+ {
433
+ "epoch": 0.8944281524926686,
434
+ "grad_norm": 0.9846564531326294,
435
+ "learning_rate": 2.5284468995707623e-05,
436
+ "loss": 0.6962,
437
+ "step": 305
438
+ },
439
+ {
440
+ "epoch": 0.9090909090909091,
441
+ "grad_norm": 0.99905925989151,
442
+ "learning_rate": 2.5106489463694727e-05,
443
+ "loss": 0.6634,
444
+ "step": 310
445
+ },
446
+ {
447
+ "epoch": 0.9237536656891495,
448
+ "grad_norm": 0.8811106085777283,
449
+ "learning_rate": 2.492586513585718e-05,
450
+ "loss": 0.7435,
451
+ "step": 315
452
+ },
453
+ {
454
+ "epoch": 0.9384164222873901,
455
+ "grad_norm": 0.9337815046310425,
456
+ "learning_rate": 2.474264328028641e-05,
457
+ "loss": 0.6988,
458
+ "step": 320
459
+ },
460
+ {
461
+ "epoch": 0.9530791788856305,
462
+ "grad_norm": 1.0272332429885864,
463
+ "learning_rate": 2.4556871844828245e-05,
464
+ "loss": 0.6982,
465
+ "step": 325
466
+ },
467
+ {
468
+ "epoch": 0.967741935483871,
469
+ "grad_norm": 1.043241262435913,
470
+ "learning_rate": 2.4368599444535284e-05,
471
+ "loss": 0.6366,
472
+ "step": 330
473
+ },
474
+ {
475
+ "epoch": 0.9824046920821115,
476
+ "grad_norm": 1.0359058380126953,
477
+ "learning_rate": 2.4177875348944678e-05,
478
+ "loss": 0.686,
479
+ "step": 335
480
+ },
481
+ {
482
+ "epoch": 0.9970674486803519,
483
+ "grad_norm": 1.171642541885376,
484
+ "learning_rate": 2.3984749469184677e-05,
485
+ "loss": 0.7073,
486
+ "step": 340
487
+ },
488
+ {
489
+ "epoch": 1.0117302052785924,
490
+ "grad_norm": 0.9656211733818054,
491
+ "learning_rate": 2.3789272344913208e-05,
492
+ "loss": 0.6161,
493
+ "step": 345
494
+ },
495
+ {
496
+ "epoch": 1.0263929618768328,
497
+ "grad_norm": 0.9734588265419006,
498
+ "learning_rate": 2.359149513109204e-05,
499
+ "loss": 0.5756,
500
+ "step": 350
501
+ },
502
+ {
503
+ "epoch": 1.0410557184750733,
504
+ "grad_norm": 1.0881298780441284,
505
+ "learning_rate": 2.3391469584599877e-05,
506
+ "loss": 0.5743,
507
+ "step": 355
508
+ },
509
+ {
510
+ "epoch": 1.0557184750733137,
511
+ "grad_norm": 1.1606433391571045,
512
+ "learning_rate": 2.318924805068797e-05,
513
+ "loss": 0.5845,
514
+ "step": 360
515
+ },
516
+ {
517
+ "epoch": 1.0703812316715542,
518
+ "grad_norm": 1.0384148359298706,
519
+ "learning_rate": 2.2984883449281716e-05,
520
+ "loss": 0.5896,
521
+ "step": 365
522
+ },
523
+ {
524
+ "epoch": 1.0850439882697946,
525
+ "grad_norm": 1.040235161781311,
526
+ "learning_rate": 2.277842926113193e-05,
527
+ "loss": 0.5823,
528
+ "step": 370
529
+ },
530
+ {
531
+ "epoch": 1.099706744868035,
532
+ "grad_norm": 1.1339130401611328,
533
+ "learning_rate": 2.256993951381928e-05,
534
+ "loss": 0.5433,
535
+ "step": 375
536
+ },
537
+ {
538
+ "epoch": 1.1143695014662756,
539
+ "grad_norm": 1.0906201601028442,
540
+ "learning_rate": 2.235946876761567e-05,
541
+ "loss": 0.5674,
542
+ "step": 380
543
+ },
544
+ {
545
+ "epoch": 1.129032258064516,
546
+ "grad_norm": 1.2759273052215576,
547
+ "learning_rate": 2.2147072101206217e-05,
548
+ "loss": 0.6016,
549
+ "step": 385
550
+ },
551
+ {
552
+ "epoch": 1.1436950146627567,
553
+ "grad_norm": 0.9344790577888489,
554
+ "learning_rate": 2.193280509727554e-05,
555
+ "loss": 0.5668,
556
+ "step": 390
557
+ },
558
+ {
559
+ "epoch": 1.1583577712609971,
560
+ "grad_norm": 1.1530736684799194,
561
+ "learning_rate": 2.171672382796218e-05,
562
+ "loss": 0.5293,
563
+ "step": 395
564
+ },
565
+ {
566
+ "epoch": 1.1730205278592376,
567
+ "grad_norm": 1.1002442836761475,
568
+ "learning_rate": 2.149888484018492e-05,
569
+ "loss": 0.5772,
570
+ "step": 400
571
+ },
572
+ {
573
+ "epoch": 1.187683284457478,
574
+ "grad_norm": 1.137981653213501,
575
+ "learning_rate": 2.127934514084486e-05,
576
+ "loss": 0.531,
577
+ "step": 405
578
+ },
579
+ {
580
+ "epoch": 1.2023460410557185,
581
+ "grad_norm": 1.17744779586792,
582
+ "learning_rate": 2.1058162181907112e-05,
583
+ "loss": 0.5558,
584
+ "step": 410
585
+ },
586
+ {
587
+ "epoch": 1.217008797653959,
588
+ "grad_norm": 1.029596209526062,
589
+ "learning_rate": 2.0835393845366062e-05,
590
+ "loss": 0.548,
591
+ "step": 415
592
+ },
593
+ {
594
+ "epoch": 1.2316715542521994,
595
+ "grad_norm": 1.1402246952056885,
596
+ "learning_rate": 2.061109842809803e-05,
597
+ "loss": 0.5304,
598
+ "step": 420
599
+ },
600
+ {
601
+ "epoch": 1.2463343108504399,
602
+ "grad_norm": 1.0744937658309937,
603
+ "learning_rate": 2.0385334626605412e-05,
604
+ "loss": 0.5108,
605
+ "step": 425
606
+ },
607
+ {
608
+ "epoch": 1.2609970674486803,
609
+ "grad_norm": 1.1842135190963745,
610
+ "learning_rate": 2.0158161521656245e-05,
611
+ "loss": 0.5034,
612
+ "step": 430
613
+ },
614
+ {
615
+ "epoch": 1.2756598240469208,
616
+ "grad_norm": 1.0653235912322998,
617
+ "learning_rate": 1.9929638562823155e-05,
618
+ "loss": 0.5075,
619
+ "step": 435
620
+ },
621
+ {
622
+ "epoch": 1.2903225806451613,
623
+ "grad_norm": 0.9864193201065063,
624
+ "learning_rate": 1.9699825552925858e-05,
625
+ "loss": 0.5499,
626
+ "step": 440
627
+ },
628
+ {
629
+ "epoch": 1.3049853372434017,
630
+ "grad_norm": 1.0366642475128174,
631
+ "learning_rate": 1.9468782632381188e-05,
632
+ "loss": 0.506,
633
+ "step": 445
634
+ },
635
+ {
636
+ "epoch": 1.3196480938416422,
637
+ "grad_norm": 1.1418167352676392,
638
+ "learning_rate": 1.9236570263464763e-05,
639
+ "loss": 0.5623,
640
+ "step": 450
641
+ },
642
+ {
643
+ "epoch": 1.3343108504398826,
644
+ "grad_norm": 1.1316769123077393,
645
+ "learning_rate": 1.9003249214488473e-05,
646
+ "loss": 0.5067,
647
+ "step": 455
648
+ },
649
+ {
650
+ "epoch": 1.3489736070381233,
651
+ "grad_norm": 1.1603069305419922,
652
+ "learning_rate": 1.8768880543897814e-05,
653
+ "loss": 0.5099,
654
+ "step": 460
655
+ },
656
+ {
657
+ "epoch": 1.3636363636363638,
658
+ "grad_norm": 1.1564180850982666,
659
+ "learning_rate": 1.853352558429336e-05,
660
+ "loss": 0.4988,
661
+ "step": 465
662
+ },
663
+ {
664
+ "epoch": 1.3782991202346042,
665
+ "grad_norm": 1.100997805595398,
666
+ "learning_rate": 1.8297245926380427e-05,
667
+ "loss": 0.5056,
668
+ "step": 470
669
+ },
670
+ {
671
+ "epoch": 1.3929618768328447,
672
+ "grad_norm": 1.0203278064727783,
673
+ "learning_rate": 1.8060103402851274e-05,
674
+ "loss": 0.4754,
675
+ "step": 475
676
+ },
677
+ {
678
+ "epoch": 1.4076246334310851,
679
+ "grad_norm": 1.0419028997421265,
680
+ "learning_rate": 1.7822160072203884e-05,
681
+ "loss": 0.5307,
682
+ "step": 480
683
+ },
684
+ {
685
+ "epoch": 1.4222873900293256,
686
+ "grad_norm": 1.1616261005401611,
687
+ "learning_rate": 1.7583478202501737e-05,
688
+ "loss": 0.468,
689
+ "step": 485
690
+ },
691
+ {
692
+ "epoch": 1.436950146627566,
693
+ "grad_norm": 1.0473759174346924,
694
+ "learning_rate": 1.734412025507867e-05,
695
+ "loss": 0.478,
696
+ "step": 490
697
+ },
698
+ {
699
+ "epoch": 1.4516129032258065,
700
+ "grad_norm": 1.030143141746521,
701
+ "learning_rate": 1.7104148868193232e-05,
702
+ "loss": 0.5341,
703
+ "step": 495
704
+ },
705
+ {
706
+ "epoch": 1.466275659824047,
707
+ "grad_norm": 1.3637970685958862,
708
+ "learning_rate": 1.686362684063666e-05,
709
+ "loss": 0.4753,
710
+ "step": 500
711
+ },
712
+ {
713
+ "epoch": 1.4809384164222874,
714
+ "grad_norm": 1.1191473007202148,
715
+ "learning_rate": 1.6622617115298923e-05,
716
+ "loss": 0.4577,
717
+ "step": 505
718
+ },
719
+ {
720
+ "epoch": 1.4956011730205279,
721
+ "grad_norm": 1.2150533199310303,
722
+ "learning_rate": 1.6381182762696993e-05,
723
+ "loss": 0.4491,
724
+ "step": 510
725
+ },
726
+ {
727
+ "epoch": 1.5102639296187683,
728
+ "grad_norm": 1.005942463874817,
729
+ "learning_rate": 1.6139386964469754e-05,
730
+ "loss": 0.4735,
731
+ "step": 515
732
+ },
733
+ {
734
+ "epoch": 1.5249266862170088,
735
+ "grad_norm": 1.232484221458435,
736
+ "learning_rate": 1.589729299684382e-05,
737
+ "loss": 0.4707,
738
+ "step": 520
739
+ },
740
+ {
741
+ "epoch": 1.5395894428152492,
742
+ "grad_norm": 1.3594202995300293,
743
+ "learning_rate": 1.5654964214074604e-05,
744
+ "loss": 0.4814,
745
+ "step": 525
746
+ },
747
+ {
748
+ "epoch": 1.5542521994134897,
749
+ "grad_norm": 1.0931212902069092,
750
+ "learning_rate": 1.541246403186694e-05,
751
+ "loss": 0.4489,
752
+ "step": 530
753
+ },
754
+ {
755
+ "epoch": 1.5689149560117301,
756
+ "grad_norm": 1.0040931701660156,
757
+ "learning_rate": 1.5169855910779694e-05,
758
+ "loss": 0.4586,
759
+ "step": 535
760
+ },
761
+ {
762
+ "epoch": 1.5835777126099706,
763
+ "grad_norm": 1.2385358810424805,
764
+ "learning_rate": 1.4927203339618536e-05,
765
+ "loss": 0.4519,
766
+ "step": 540
767
+ },
768
+ {
769
+ "epoch": 1.598240469208211,
770
+ "grad_norm": 1.2667471170425415,
771
+ "learning_rate": 1.4684569818821412e-05,
772
+ "loss": 0.442,
773
+ "step": 545
774
+ },
775
+ {
776
+ "epoch": 1.6129032258064515,
777
+ "grad_norm": 1.0587009191513062,
778
+ "learning_rate": 1.4442018843840932e-05,
779
+ "loss": 0.446,
780
+ "step": 550
781
+ },
782
+ {
783
+ "epoch": 1.627565982404692,
784
+ "grad_norm": 1.2656630277633667,
785
+ "learning_rate": 1.4199613888528044e-05,
786
+ "loss": 0.3993,
787
+ "step": 555
788
+ },
789
+ {
790
+ "epoch": 1.6422287390029324,
791
+ "grad_norm": 1.2869272232055664,
792
+ "learning_rate": 1.3957418388521413e-05,
793
+ "loss": 0.4311,
794
+ "step": 560
795
+ },
796
+ {
797
+ "epoch": 1.6568914956011729,
798
+ "grad_norm": 1.0959758758544922,
799
+ "learning_rate": 1.3715495724646731e-05,
800
+ "loss": 0.4242,
801
+ "step": 565
802
+ },
803
+ {
804
+ "epoch": 1.6715542521994133,
805
+ "grad_norm": 1.3144580125808716,
806
+ "learning_rate": 1.3473909206330444e-05,
807
+ "loss": 0.4349,
808
+ "step": 570
809
+ },
810
+ {
811
+ "epoch": 1.6862170087976538,
812
+ "grad_norm": 1.212710976600647,
813
+ "learning_rate": 1.323272205503212e-05,
814
+ "loss": 0.4418,
815
+ "step": 575
816
+ },
817
+ {
818
+ "epoch": 1.7008797653958945,
819
+ "grad_norm": 1.2405732870101929,
820
+ "learning_rate": 1.299199738769983e-05,
821
+ "loss": 0.4164,
822
+ "step": 580
823
+ },
824
+ {
825
+ "epoch": 1.715542521994135,
826
+ "grad_norm": 1.1547958850860596,
827
+ "learning_rate": 1.2751798200252912e-05,
828
+ "loss": 0.4439,
829
+ "step": 585
830
+ },
831
+ {
832
+ "epoch": 1.7302052785923754,
833
+ "grad_norm": 1.2431228160858154,
834
+ "learning_rate": 1.251218735109639e-05,
835
+ "loss": 0.456,
836
+ "step": 590
837
+ },
838
+ {
839
+ "epoch": 1.7448680351906158,
840
+ "grad_norm": 1.0995466709136963,
841
+ "learning_rate": 1.2273227544671367e-05,
842
+ "loss": 0.4248,
843
+ "step": 595
844
+ },
845
+ {
846
+ "epoch": 1.7595307917888563,
847
+ "grad_norm": 1.2348030805587769,
848
+ "learning_rate": 1.2034981315045745e-05,
849
+ "loss": 0.4296,
850
+ "step": 600
851
+ },
852
+ {
853
+ "epoch": 1.7741935483870968,
854
+ "grad_norm": 1.1425237655639648,
855
+ "learning_rate": 1.1797511009549478e-05,
856
+ "loss": 0.432,
857
+ "step": 605
858
+ },
859
+ {
860
+ "epoch": 1.7888563049853372,
861
+ "grad_norm": 1.1894876956939697,
862
+ "learning_rate": 1.1560878772458757e-05,
863
+ "loss": 0.3871,
864
+ "step": 610
865
+ },
866
+ {
867
+ "epoch": 1.8035190615835777,
868
+ "grad_norm": 1.0838085412979126,
869
+ "learning_rate": 1.1325146528733262e-05,
870
+ "loss": 0.3822,
871
+ "step": 615
872
+ },
873
+ {
874
+ "epoch": 1.8181818181818183,
875
+ "grad_norm": 1.1685465574264526,
876
+ "learning_rate": 1.1090375967810879e-05,
877
+ "loss": 0.3995,
878
+ "step": 620
879
+ },
880
+ {
881
+ "epoch": 1.8328445747800588,
882
+ "grad_norm": 1.0686448812484741,
883
+ "learning_rate": 1.0856628527463986e-05,
884
+ "loss": 0.4027,
885
+ "step": 625
886
+ },
887
+ {
888
+ "epoch": 1.8475073313782993,
889
+ "grad_norm": 1.138701319694519,
890
+ "learning_rate": 1.0623965377721652e-05,
891
+ "loss": 0.4182,
892
+ "step": 630
893
+ },
894
+ {
895
+ "epoch": 1.8621700879765397,
896
+ "grad_norm": 1.1836135387420654,
897
+ "learning_rate": 1.0392447404861866e-05,
898
+ "loss": 0.3667,
899
+ "step": 635
900
+ },
901
+ {
902
+ "epoch": 1.8768328445747802,
903
+ "grad_norm": 1.1814188957214355,
904
+ "learning_rate": 1.016213519547805e-05,
905
+ "loss": 0.4021,
906
+ "step": 640
907
+ },
908
+ {
909
+ "epoch": 1.8914956011730206,
910
+ "grad_norm": 1.048718810081482,
911
+ "learning_rate": 9.933089020623942e-06,
912
+ "loss": 0.3905,
913
+ "step": 645
914
+ },
915
+ {
916
+ "epoch": 1.906158357771261,
917
+ "grad_norm": 1.1059246063232422,
918
+ "learning_rate": 9.705368820041149e-06,
919
+ "loss": 0.4351,
920
+ "step": 650
921
+ },
922
+ {
923
+ "epoch": 1.9208211143695015,
924
+ "grad_norm": 1.4553226232528687,
925
+ "learning_rate": 9.479034186473307e-06,
926
+ "loss": 0.3786,
927
+ "step": 655
928
+ },
929
+ {
930
+ "epoch": 1.935483870967742,
931
+ "grad_norm": 1.2545796632766724,
932
+ "learning_rate": 9.25414435007111e-06,
933
+ "loss": 0.3828,
934
+ "step": 660
935
+ },
936
+ {
937
+ "epoch": 1.9501466275659824,
938
+ "grad_norm": 1.3140236139297485,
939
+ "learning_rate": 9.03075816289217e-06,
940
+ "loss": 0.4024,
941
+ "step": 665
942
+ },
943
+ {
944
+ "epoch": 1.964809384164223,
945
+ "grad_norm": 1.1320492029190063,
946
+ "learning_rate": 8.808934083499897e-06,
947
+ "loss": 0.3613,
948
+ "step": 670
949
+ },
950
+ {
951
+ "epoch": 1.9794721407624634,
952
+ "grad_norm": 1.1655583381652832,
953
+ "learning_rate": 8.588730161665303e-06,
954
+ "loss": 0.3785,
955
+ "step": 675
956
+ },
957
+ {
958
+ "epoch": 1.9941348973607038,
959
+ "grad_norm": 1.2500450611114502,
960
+ "learning_rate": 8.37020402317576e-06,
961
+ "loss": 0.3804,
962
+ "step": 680
963
+ },
964
+ {
965
+ "epoch": 2.0087976539589443,
966
+ "grad_norm": 1.1778208017349243,
967
+ "learning_rate": 8.153412854754791e-06,
968
+ "loss": 0.3377,
969
+ "step": 685
970
+ },
971
+ {
972
+ "epoch": 2.0234604105571847,
973
+ "grad_norm": 1.3056201934814453,
974
+ "learning_rate": 7.938413389096684e-06,
975
+ "loss": 0.2937,
976
+ "step": 690
977
+ },
978
+ {
979
+ "epoch": 2.038123167155425,
980
+ "grad_norm": 1.2293964624404907,
981
+ "learning_rate": 7.72526189001995e-06,
982
+ "loss": 0.3157,
983
+ "step": 695
984
+ },
985
+ {
986
+ "epoch": 2.0527859237536656,
987
+ "grad_norm": 1.303946614265442,
988
+ "learning_rate": 7.5140141377435114e-06,
989
+ "loss": 0.3191,
990
+ "step": 700
991
+ },
992
+ {
993
+ "epoch": 2.067448680351906,
994
+ "grad_norm": 1.1810277700424194,
995
+ "learning_rate": 7.304725414289409e-06,
996
+ "loss": 0.3124,
997
+ "step": 705
998
+ },
999
+ {
1000
+ "epoch": 2.0821114369501466,
1001
+ "grad_norm": 1.1029647588729858,
1002
+ "learning_rate": 7.097450489015864e-06,
1003
+ "loss": 0.3125,
1004
+ "step": 710
1005
+ },
1006
+ {
1007
+ "epoch": 2.096774193548387,
1008
+ "grad_norm": 1.1499955654144287,
1009
+ "learning_rate": 6.8922436042845735e-06,
1010
+ "loss": 0.299,
1011
+ "step": 715
1012
+ },
1013
+ {
1014
+ "epoch": 2.1114369501466275,
1015
+ "grad_norm": 1.2358781099319458,
1016
+ "learning_rate": 6.689158461265855e-06,
1017
+ "loss": 0.3361,
1018
+ "step": 720
1019
+ },
1020
+ {
1021
+ "epoch": 2.126099706744868,
1022
+ "grad_norm": 1.3672081232070923,
1023
+ "learning_rate": 6.488248205885413e-06,
1024
+ "loss": 0.3377,
1025
+ "step": 725
1026
+ },
1027
+ {
1028
+ "epoch": 2.1407624633431084,
1029
+ "grad_norm": 1.0571002960205078,
1030
+ "learning_rate": 6.289565414916472e-06,
1031
+ "loss": 0.3094,
1032
+ "step": 730
1033
+ },
1034
+ {
1035
+ "epoch": 2.155425219941349,
1036
+ "grad_norm": 1.1626659631729126,
1037
+ "learning_rate": 6.093162082220785e-06,
1038
+ "loss": 0.3238,
1039
+ "step": 735
1040
+ },
1041
+ {
1042
+ "epoch": 2.1700879765395893,
1043
+ "grad_norm": 1.3589155673980713,
1044
+ "learning_rate": 5.899089605142225e-06,
1045
+ "loss": 0.2983,
1046
+ "step": 740
1047
+ },
1048
+ {
1049
+ "epoch": 2.1847507331378297,
1050
+ "grad_norm": 1.2361712455749512,
1051
+ "learning_rate": 5.7073987710564485e-06,
1052
+ "loss": 0.3079,
1053
+ "step": 745
1054
+ },
1055
+ {
1056
+ "epoch": 2.19941348973607,
1057
+ "grad_norm": 1.13412606716156,
1058
+ "learning_rate": 5.518139744080231e-06,
1059
+ "loss": 0.2895,
1060
+ "step": 750
1061
+ },
1062
+ {
1063
+ "epoch": 2.2140762463343107,
1064
+ "grad_norm": 1.224824070930481,
1065
+ "learning_rate": 5.331362051943864e-06,
1066
+ "loss": 0.3505,
1067
+ "step": 755
1068
+ },
1069
+ {
1070
+ "epoch": 2.228739002932551,
1071
+ "grad_norm": 1.3557347059249878,
1072
+ "learning_rate": 5.147114573030105e-06,
1073
+ "loss": 0.3044,
1074
+ "step": 760
1075
+ },
1076
+ {
1077
+ "epoch": 2.2434017595307916,
1078
+ "grad_norm": 1.1949517726898193,
1079
+ "learning_rate": 4.965445523583039e-06,
1080
+ "loss": 0.2715,
1081
+ "step": 765
1082
+ },
1083
+ {
1084
+ "epoch": 2.258064516129032,
1085
+ "grad_norm": 1.329795002937317,
1086
+ "learning_rate": 4.786402445090264e-06,
1087
+ "loss": 0.2929,
1088
+ "step": 770
1089
+ },
1090
+ {
1091
+ "epoch": 2.2727272727272725,
1092
+ "grad_norm": 1.1268538236618042,
1093
+ "learning_rate": 4.610032191841606e-06,
1094
+ "loss": 0.3035,
1095
+ "step": 775
1096
+ },
1097
+ {
1098
+ "epoch": 2.2873900293255134,
1099
+ "grad_norm": 1.2942050695419312,
1100
+ "learning_rate": 4.43638091866769e-06,
1101
+ "loss": 0.2901,
1102
+ "step": 780
1103
+ },
1104
+ {
1105
+ "epoch": 2.302052785923754,
1106
+ "grad_norm": 1.1928037405014038,
1107
+ "learning_rate": 4.265494068861539e-06,
1108
+ "loss": 0.2802,
1109
+ "step": 785
1110
+ },
1111
+ {
1112
+ "epoch": 2.3167155425219943,
1113
+ "grad_norm": 1.1997088193893433,
1114
+ "learning_rate": 4.097416362286422e-06,
1115
+ "loss": 0.2879,
1116
+ "step": 790
1117
+ },
1118
+ {
1119
+ "epoch": 2.3313782991202348,
1120
+ "grad_norm": 1.2495310306549072,
1121
+ "learning_rate": 3.932191783672954e-06,
1122
+ "loss": 0.3466,
1123
+ "step": 795
1124
+ },
1125
+ {
1126
+ "epoch": 2.346041055718475,
1127
+ "grad_norm": 1.1649198532104492,
1128
+ "learning_rate": 3.769863571108632e-06,
1129
+ "loss": 0.3077,
1130
+ "step": 800
1131
+ },
1132
+ {
1133
+ "epoch": 2.3607038123167157,
1134
+ "grad_norm": 1.3067137002944946,
1135
+ "learning_rate": 3.610474204722708e-06,
1136
+ "loss": 0.283,
1137
+ "step": 805
1138
+ },
1139
+ {
1140
+ "epoch": 2.375366568914956,
1141
+ "grad_norm": 1.4458105564117432,
1142
+ "learning_rate": 3.4540653955694806e-06,
1143
+ "loss": 0.2987,
1144
+ "step": 810
1145
+ },
1146
+ {
1147
+ "epoch": 2.3900293255131966,
1148
+ "grad_norm": 1.2861709594726562,
1149
+ "learning_rate": 3.300678074712782e-06,
1150
+ "loss": 0.2962,
1151
+ "step": 815
1152
+ },
1153
+ {
1154
+ "epoch": 2.404692082111437,
1155
+ "grad_norm": 1.1888914108276367,
1156
+ "learning_rate": 3.1503523825146308e-06,
1157
+ "loss": 0.3048,
1158
+ "step": 820
1159
+ },
1160
+ {
1161
+ "epoch": 2.4193548387096775,
1162
+ "grad_norm": 1.1350572109222412,
1163
+ "learning_rate": 3.003127658130765e-06,
1164
+ "loss": 0.3065,
1165
+ "step": 825
1166
+ },
1167
+ {
1168
+ "epoch": 2.434017595307918,
1169
+ "grad_norm": 1.2664979696273804,
1170
+ "learning_rate": 2.8590424292158957e-06,
1171
+ "loss": 0.3048,
1172
+ "step": 830
1173
+ },
1174
+ {
1175
+ "epoch": 2.4486803519061584,
1176
+ "grad_norm": 1.222652554512024,
1177
+ "learning_rate": 2.7181344018412736e-06,
1178
+ "loss": 0.3064,
1179
+ "step": 835
1180
+ },
1181
+ {
1182
+ "epoch": 2.463343108504399,
1183
+ "grad_norm": 1.2085829973220825,
1184
+ "learning_rate": 2.5804404506272926e-06,
1185
+ "loss": 0.3077,
1186
+ "step": 840
1187
+ },
1188
+ {
1189
+ "epoch": 2.4780058651026393,
1190
+ "grad_norm": 1.2079604864120483,
1191
+ "learning_rate": 2.445996609093653e-06,
1192
+ "loss": 0.2638,
1193
+ "step": 845
1194
+ },
1195
+ {
1196
+ "epoch": 2.4926686217008798,
1197
+ "grad_norm": 1.3720932006835938,
1198
+ "learning_rate": 2.3148380602296665e-06,
1199
+ "loss": 0.2865,
1200
+ "step": 850
1201
+ },
1202
+ {
1203
+ "epoch": 2.5073313782991202,
1204
+ "grad_norm": 1.2781530618667603,
1205
+ "learning_rate": 2.1869991272871055e-06,
1206
+ "loss": 0.2817,
1207
+ "step": 855
1208
+ },
1209
+ {
1210
+ "epoch": 2.5219941348973607,
1211
+ "grad_norm": 1.1719965934753418,
1212
+ "learning_rate": 2.062513264798061e-06,
1213
+ "loss": 0.2737,
1214
+ "step": 860
1215
+ },
1216
+ {
1217
+ "epoch": 2.536656891495601,
1218
+ "grad_norm": 1.117966651916504,
1219
+ "learning_rate": 1.941413049820123e-06,
1220
+ "loss": 0.289,
1221
+ "step": 865
1222
+ },
1223
+ {
1224
+ "epoch": 2.5513196480938416,
1225
+ "grad_norm": 1.2426401376724243,
1226
+ "learning_rate": 1.8237301734112132e-06,
1227
+ "loss": 0.2837,
1228
+ "step": 870
1229
+ },
1230
+ {
1231
+ "epoch": 2.565982404692082,
1232
+ "grad_norm": 1.2011477947235107,
1233
+ "learning_rate": 1.7094954323362495e-06,
1234
+ "loss": 0.3109,
1235
+ "step": 875
1236
+ },
1237
+ {
1238
+ "epoch": 2.5806451612903225,
1239
+ "grad_norm": 1.3482277393341064,
1240
+ "learning_rate": 1.5987387210078586e-06,
1241
+ "loss": 0.2861,
1242
+ "step": 880
1243
+ },
1244
+ {
1245
+ "epoch": 2.595307917888563,
1246
+ "grad_norm": 1.16232168674469,
1247
+ "learning_rate": 1.4914890236632161e-06,
1248
+ "loss": 0.2683,
1249
+ "step": 885
1250
+ },
1251
+ {
1252
+ "epoch": 2.6099706744868034,
1253
+ "grad_norm": 1.0949877500534058,
1254
+ "learning_rate": 1.3877744067790933e-06,
1255
+ "loss": 0.2858,
1256
+ "step": 890
1257
+ },
1258
+ {
1259
+ "epoch": 2.624633431085044,
1260
+ "grad_norm": 1.257165551185608,
1261
+ "learning_rate": 1.2876220117270466e-06,
1262
+ "loss": 0.2818,
1263
+ "step": 895
1264
+ },
1265
+ {
1266
+ "epoch": 2.6392961876832843,
1267
+ "grad_norm": 1.2227627038955688,
1268
+ "learning_rate": 1.1910580476707305e-06,
1269
+ "loss": 0.2841,
1270
+ "step": 900
1271
+ },
1272
+ {
1273
+ "epoch": 2.653958944281525,
1274
+ "grad_norm": 1.2360917329788208,
1275
+ "learning_rate": 1.0981077847071236e-06,
1276
+ "loss": 0.2796,
1277
+ "step": 905
1278
+ },
1279
+ {
1280
+ "epoch": 2.6686217008797652,
1281
+ "grad_norm": 1.1866899728775024,
1282
+ "learning_rate": 1.0087955472535526e-06,
1283
+ "loss": 0.2982,
1284
+ "step": 910
1285
+ },
1286
+ {
1287
+ "epoch": 2.6832844574780057,
1288
+ "grad_norm": 1.1590216159820557,
1289
+ "learning_rate": 9.231447076821503e-07,
1290
+ "loss": 0.2835,
1291
+ "step": 915
1292
+ },
1293
+ {
1294
+ "epoch": 2.6979472140762466,
1295
+ "grad_norm": 1.215848445892334,
1296
+ "learning_rate": 8.411776802034843e-07,
1297
+ "loss": 0.317,
1298
+ "step": 920
1299
+ },
1300
+ {
1301
+ "epoch": 2.712609970674487,
1302
+ "grad_norm": 1.1392394304275513,
1303
+ "learning_rate": 7.629159150008958e-07,
1304
+ "loss": 0.2859,
1305
+ "step": 925
1306
+ },
1307
+ {
1308
+ "epoch": 2.7272727272727275,
1309
+ "grad_norm": 1.3150230646133423,
1310
+ "learning_rate": 6.88379892617173e-07,
1311
+ "loss": 0.2515,
1312
+ "step": 930
1313
+ },
1314
+ {
1315
+ "epoch": 2.741935483870968,
1316
+ "grad_norm": 1.205656886100769,
1317
+ "learning_rate": 6.175891185949189e-07,
1318
+ "loss": 0.2767,
1319
+ "step": 935
1320
+ },
1321
+ {
1322
+ "epoch": 2.7565982404692084,
1323
+ "grad_norm": 1.210679054260254,
1324
+ "learning_rate": 5.505621183720904e-07,
1325
+ "loss": 0.2882,
1326
+ "step": 940
1327
+ },
1328
+ {
1329
+ "epoch": 2.771260997067449,
1330
+ "grad_norm": 1.0707758665084839,
1331
+ "learning_rate": 4.873164324340318e-07,
1332
+ "loss": 0.2885,
1333
+ "step": 945
1334
+ },
1335
+ {
1336
+ "epoch": 2.7859237536656893,
1337
+ "grad_norm": 1.2976630926132202,
1338
+ "learning_rate": 4.2786861172325774e-07,
1339
+ "loss": 0.3224,
1340
+ "step": 950
1341
+ },
1342
+ {
1343
+ "epoch": 2.80058651026393,
1344
+ "grad_norm": 1.1435562372207642,
1345
+ "learning_rate": 3.722342133081785e-07,
1346
+ "loss": 0.2875,
1347
+ "step": 955
1348
+ },
1349
+ {
1350
+ "epoch": 2.8152492668621703,
1351
+ "grad_norm": 1.1515800952911377,
1352
+ "learning_rate": 3.204277963119362e-07,
1353
+ "loss": 0.2955,
1354
+ "step": 960
1355
+ },
1356
+ {
1357
+ "epoch": 2.8299120234604107,
1358
+ "grad_norm": 1.1778301000595093,
1359
+ "learning_rate": 2.724629181023841e-07,
1360
+ "loss": 0.2753,
1361
+ "step": 965
1362
+ },
1363
+ {
1364
+ "epoch": 2.844574780058651,
1365
+ "grad_norm": 1.2622073888778687,
1366
+ "learning_rate": 2.283521307442199e-07,
1367
+ "loss": 0.3011,
1368
+ "step": 970
1369
+ },
1370
+ {
1371
+ "epoch": 2.8592375366568916,
1372
+ "grad_norm": 1.1627691984176636,
1373
+ "learning_rate": 1.881069777142047e-07,
1374
+ "loss": 0.3021,
1375
+ "step": 975
1376
+ },
1377
+ {
1378
+ "epoch": 2.873900293255132,
1379
+ "grad_norm": 1.2690354585647583,
1380
+ "learning_rate": 1.517379908803046e-07,
1381
+ "loss": 0.2888,
1382
+ "step": 980
1383
+ },
1384
+ {
1385
+ "epoch": 2.8885630498533725,
1386
+ "grad_norm": 1.1645830869674683,
1387
+ "learning_rate": 1.1925468774559855e-07,
1388
+ "loss": 0.2723,
1389
+ "step": 985
1390
+ },
1391
+ {
1392
+ "epoch": 2.903225806451613,
1393
+ "grad_norm": 1.177254319190979,
1394
+ "learning_rate": 9.066556895759249e-08,
1395
+ "loss": 0.2836,
1396
+ "step": 990
1397
+ },
1398
+ {
1399
+ "epoch": 2.9178885630498534,
1400
+ "grad_norm": 1.414916753768921,
1401
+ "learning_rate": 6.597811608368031e-08,
1402
+ "loss": 0.3021,
1403
+ "step": 995
1404
+ },
1405
+ {
1406
+ "epoch": 2.932551319648094,
1407
+ "grad_norm": 1.2326536178588867,
1408
+ "learning_rate": 4.519878965325852e-08,
1409
+ "loss": 0.2895,
1410
+ "step": 1000
1411
+ },
1412
+ {
1413
+ "epoch": 2.9472140762463344,
1414
+ "grad_norm": 1.2043007612228394,
1415
+ "learning_rate": 2.8333027467053463e-08,
1416
+ "loss": 0.2662,
1417
+ "step": 1005
1418
+ },
1419
+ {
1420
+ "epoch": 2.961876832844575,
1421
+ "grad_norm": 1.2307794094085693,
1422
+ "learning_rate": 1.5385243174099728e-08,
1423
+ "loss": 0.2739,
1424
+ "step": 1010
1425
+ },
1426
+ {
1427
+ "epoch": 2.9765395894428153,
1428
+ "grad_norm": 1.1917132139205933,
1429
+ "learning_rate": 6.3588251167007176e-09,
1430
+ "loss": 0.2843,
1431
+ "step": 1015
1432
+ },
1433
+ {
1434
+ "epoch": 2.9912023460410557,
1435
+ "grad_norm": 1.2122886180877686,
1436
+ "learning_rate": 1.2561354437412576e-09,
1437
+ "loss": 0.2653,
1438
+ "step": 1020
1439
+ },
1440
+ {
1441
+ "epoch": 3.0,
1442
+ "step": 1023,
1443
+ "total_flos": 1.4978820658094408e+18,
1444
+ "train_loss": 0.567423046625721,
1445
+ "train_runtime": 689.9192,
1446
+ "train_samples_per_second": 47.406,
1447
+ "train_steps_per_second": 1.483
1448
+ }
1449
+ ],
1450
+ "logging_steps": 5,
1451
+ "max_steps": 1023,
1452
+ "num_input_tokens_seen": 0,
1453
+ "num_train_epochs": 3,
1454
+ "save_steps": 2000,
1455
+ "stateful_callbacks": {
1456
+ "TrainerControl": {
1457
+ "args": {
1458
+ "should_epoch_stop": false,
1459
+ "should_evaluate": false,
1460
+ "should_log": false,
1461
+ "should_save": false,
1462
+ "should_training_stop": false
1463
+ },
1464
+ "attributes": {}
1465
+ }
1466
+ },
1467
+ "total_flos": 1.4978820658094408e+18,
1468
+ "train_batch_size": 2,
1469
+ "trial_name": null,
1470
+ "trial_params": null
1471
+ }
10_128_e3_3e-5/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fe1c439d1beaccd67a5b4e3239a08eccc460ada83f1bd98996143ff1ff7e978
3
+ size 8145
10_128_e3_3e-5/vocab.json ADDED
The diff for this file is too large to render. See raw diff