Ubuntu committed on
Commit
5196563
·
0 Parent(s):
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ adapter_model.bin filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: peft
4
+ tags:
5
+ - generated_from_trainer
6
+ base_model: mistralai/Mistral-7B-Instruct-v0.2
7
+ model-index:
8
+ - name: text-generation-webui/loras/mistral-instruct-better-formatting-v1
9
+ results: []
10
+ ---
11
+
12
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
+ should probably proofread and complete it, then remove this comment. -->
14
+
15
+ [<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
16
+ <details><summary>See axolotl config</summary>
17
+
18
+ axolotl version: `0.4.0`
19
+ ```yaml
20
+ base_model: mistralai/Mistral-7B-Instruct-v0.2
21
+ model_type: AutoModelForCausalLM
22
+ tokenizer_type: LlamaTokenizer
23
+ is_mistral_derived_model: true
24
+ load_in_8bit: false
25
+ load_in_4bit: true
26
+ strict: false
27
+
28
+ datasets:
29
+ - path: ./data/tool_used_training.jsonl
30
+ type: sharegpt
31
+ - path: ./data/tool_not_used_training.jsonl
32
+ type: sharegpt
33
+ - path: ./data/no_tools_training.jsonl
34
+ type: sharegpt
35
+
36
+ dataset_prepared_path: last_run_prepared
37
+ val_set_size: 0.01
38
+ output_dir: ../../text-generation-webui/loras/mistral-instruct-better-formatting-v1
39
+
40
+ adapter: qlora
41
+ lora_model_dir:
42
+
43
+ sequence_len: 4096
44
+ sample_packing: true
45
+ pad_to_sequence_len: true
46
+
47
+ lora_r: 32
48
+ lora_alpha: 16
49
+ lora_dropout: 0.05
50
+ lora_target_linear: true
51
+ lora_fan_in_fan_out:
52
+ # lora_target_modules:
53
+ # - gate_proj
54
+ # - down_proj
55
+ # - up_proj
56
+ # - q_proj
57
+ # - v_proj
58
+ # - k_proj
59
+ # - o_proj
60
+
61
+ wandb_project: function-call
62
+ wandb_name: mixtral-instruct-qlora-v1
63
+ wandb_log_model: end
64
+
65
+ gradient_accumulation_steps: 4
66
+ micro_batch_size: 2
67
+ num_epochs: 1
68
+ optimizer: paged_adamw_8bit
69
+ lr_scheduler: cosine
70
+ learning_rate: 0.001
71
+ adam_beta2: 0.95
72
+ adam_epsilon: 0.00001
73
+ max_grad_norm: 1.0
74
+
75
+ train_on_inputs: false
76
+ group_by_length: false
77
+ bf16: auto
78
+ fp16:
79
+ tf32: false
80
+
81
+ gradient_checkpointing: true
82
+ early_stopping_patience:
83
+ resume_from_checkpoint:
84
+ local_rank:
85
+ logging_steps: 1
86
+ xformers_attention:
87
+ flash_attention: true
88
+
89
+ # loss_watchdog_threshold: 5.0
90
+ # loss_watchdog_patience: 3
91
+
92
+ warmup_steps: 10
93
+ # evals_per_epoch: 20
94
+ eval_steps: 0.1
95
+ save_steps: 0.1
96
+ eval_table_size:
97
+ eval_max_new_tokens: 256
98
+ # saves_per_epoch: 1
99
+ debug:
100
+ deepspeed:
101
+ weight_decay: 0.0
102
+ fsdp:
103
+ fsdp_config:
104
+ ```
105
+
106
+ </details><br>
107
+
108
+ # text-generation-webui/loras/mistral-instruct-better-formatting-v1
109
+
110
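+ This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) on three local ShareGPT-format datasets (`tool_used_training.jsonl`, `tool_not_used_training.jsonl`, and `no_tools_training.jsonl`).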
111
+ It achieves the following results on the evaluation set:
112
+ - Loss: 0.3566
113
+
114
+ ## Model description
115
+
116
+ More information needed
117
+
118
+ ## Intended uses & limitations
119
+
120
+ More information needed
121
+
122
+ ## Training and evaluation data
123
+
124
+ More information needed
125
+
126
+ ## Training procedure
127
+
128
+ ### Training hyperparameters
129
+
130
+ The following hyperparameters were used during training:
131
+ - learning_rate: 0.001
132
+ - train_batch_size: 2
133
+ - eval_batch_size: 2
134
+ - seed: 42
135
+ - distributed_type: multi-GPU
136
+ - num_devices: 2
137
+ - gradient_accumulation_steps: 4
138
+ - total_train_batch_size: 16
139
+ - total_eval_batch_size: 4
140
+ - optimizer: Adam with betas=(0.9,0.95) and epsilon=1e-05
141
+ - lr_scheduler_type: cosine
142
+ - lr_scheduler_warmup_steps: 10
143
+ - num_epochs: 1
144
+
145
+ ### Training results
146
+
147
+ | Training Loss | Epoch | Step | Validation Loss |
148
+ |:-------------:|:-----:|:----:|:---------------:|
149
+ | 0.9252 | 0.0 | 1 | 0.9944 |
150
+ | 0.3953 | 0.1 | 27 | 0.3981 |
151
+ | 0.3662 | 0.21 | 54 | 0.3824 |
152
+ | 0.3383 | 0.31 | 81 | 0.3778 |
153
+ | 0.3484 | 0.41 | 108 | 0.3730 |
154
+ | 0.4098 | 0.52 | 135 | 0.3686 |
155
+ | 0.3728 | 0.62 | 162 | 0.3642 |
156
+ | 0.3274 | 0.72 | 189 | 0.3602 |
157
+ | 0.3579 | 0.83 | 216 | 0.3576 |
158
+ | 0.3293 | 0.93 | 243 | 0.3566 |
159
+
160
+
161
+ ### Framework versions
162
+
163
+ - PEFT 0.8.2
164
+ - Transformers 4.38.0.dev0
165
+ - Pytorch 2.2.0+cu121
166
+ - Datasets 2.17.1
167
+ - Tokenizers 0.15.0
adapter_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
5
+ "bias": "none",
6
+ "fan_in_fan_out": null,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 16,
13
+ "lora_dropout": 0.05,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 32,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "q_proj",
23
+ "up_proj",
24
+ "k_proj",
25
+ "gate_proj",
26
+ "v_proj",
27
+ "o_proj",
28
+ "down_proj"
29
+ ],
30
+ "task_type": "CAUSAL_LM",
31
+ "use_rslora": false
32
+ }
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93d193d57cceb6508c74517f1a5f03b8ecfe46d13230d34fb0f43d56ea66bbb4
3
+ size 335706186
config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
3
+ "architectures": [
4
+ "MistralForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 4096,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 14336,
13
+ "max_position_embeddings": 32768,
14
+ "model_type": "mistral",
15
+ "num_attention_heads": 32,
16
+ "num_hidden_layers": 32,
17
+ "num_key_value_heads": 8,
18
+ "quantization_config": {
19
+ "_load_in_4bit": true,
20
+ "_load_in_8bit": false,
21
+ "bnb_4bit_compute_dtype": "bfloat16",
22
+ "bnb_4bit_quant_type": "nf4",
23
+ "bnb_4bit_use_double_quant": true,
24
+ "llm_int8_enable_fp32_cpu_offload": false,
25
+ "llm_int8_has_fp16_weight": false,
26
+ "llm_int8_skip_modules": null,
27
+ "llm_int8_threshold": 6.0,
28
+ "load_in_4bit": true,
29
+ "load_in_8bit": false,
30
+ "quant_method": "bitsandbytes"
31
+ },
32
+ "rms_norm_eps": 1e-05,
33
+ "rope_theta": 1000000.0,
34
+ "sliding_window": null,
35
+ "tie_word_embeddings": false,
36
+ "torch_dtype": "bfloat16",
37
+ "transformers_version": "4.38.0.dev0",
38
+ "use_cache": false,
39
+ "vocab_size": 32000
40
+ }
scheduler.pt ADDED
Binary file (1.06 kB). View file
 
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer.model ADDED
Binary file (493 kB). View file
 
tokenizer_config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "additional_special_tokens": [],
31
+ "bos_token": "<s>",
32
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": true,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "sp_model_kwargs": {},
39
+ "spaces_between_special_tokens": false,
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "trust_remote_code": false,
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false,
44
+ "use_fast": true
45
+ }
trainer_state.json ADDED
@@ -0,0 +1,879 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.3685910701751709,
3
+ "best_model_checkpoint": "../../text-generation-webui/loras/mistral-instruct-better-formatting-v1/checkpoint-135",
4
+ "epoch": 0.5162523900573613,
5
+ "eval_steps": 27,
6
+ "global_step": 135,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 0.0001,
14
+ "loss": 0.9252,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 0.0,
19
+ "eval_loss": 0.9944313168525696,
20
+ "eval_runtime": 8.9255,
21
+ "eval_samples_per_second": 39.213,
22
+ "eval_steps_per_second": 9.859,
23
+ "step": 1
24
+ },
25
+ {
26
+ "epoch": 0.01,
27
+ "learning_rate": 0.0002,
28
+ "loss": 0.9232,
29
+ "step": 2
30
+ },
31
+ {
32
+ "epoch": 0.01,
33
+ "learning_rate": 0.0003,
34
+ "loss": 0.8082,
35
+ "step": 3
36
+ },
37
+ {
38
+ "epoch": 0.02,
39
+ "learning_rate": 0.0004,
40
+ "loss": 0.6783,
41
+ "step": 4
42
+ },
43
+ {
44
+ "epoch": 0.02,
45
+ "learning_rate": 0.0005,
46
+ "loss": 0.5423,
47
+ "step": 5
48
+ },
49
+ {
50
+ "epoch": 0.02,
51
+ "learning_rate": 0.0006,
52
+ "loss": 0.6041,
53
+ "step": 6
54
+ },
55
+ {
56
+ "epoch": 0.03,
57
+ "learning_rate": 0.0007,
58
+ "loss": 0.4542,
59
+ "step": 7
60
+ },
61
+ {
62
+ "epoch": 0.03,
63
+ "learning_rate": 0.0008,
64
+ "loss": 0.5088,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.03,
69
+ "learning_rate": 0.0009000000000000001,
70
+ "loss": 0.5163,
71
+ "step": 9
72
+ },
73
+ {
74
+ "epoch": 0.04,
75
+ "learning_rate": 0.001,
76
+ "loss": 0.4498,
77
+ "step": 10
78
+ },
79
+ {
80
+ "epoch": 0.04,
81
+ "learning_rate": 0.0009999608360361113,
82
+ "loss": 0.4613,
83
+ "step": 11
84
+ },
85
+ {
86
+ "epoch": 0.05,
87
+ "learning_rate": 0.0009998433502797096,
88
+ "loss": 0.4891,
89
+ "step": 12
90
+ },
91
+ {
92
+ "epoch": 0.05,
93
+ "learning_rate": 0.0009996475611356265,
94
+ "loss": 0.4316,
95
+ "step": 13
96
+ },
97
+ {
98
+ "epoch": 0.05,
99
+ "learning_rate": 0.0009993734992753778,
100
+ "loss": 0.4464,
101
+ "step": 14
102
+ },
103
+ {
104
+ "epoch": 0.06,
105
+ "learning_rate": 0.0009990212076323587,
106
+ "loss": 0.4346,
107
+ "step": 15
108
+ },
109
+ {
110
+ "epoch": 0.06,
111
+ "learning_rate": 0.000998590741395118,
112
+ "loss": 0.3953,
113
+ "step": 16
114
+ },
115
+ {
116
+ "epoch": 0.07,
117
+ "learning_rate": 0.0009980821679987123,
118
+ "loss": 0.414,
119
+ "step": 17
120
+ },
121
+ {
122
+ "epoch": 0.07,
123
+ "learning_rate": 0.0009974955671141423,
124
+ "loss": 0.415,
125
+ "step": 18
126
+ },
127
+ {
128
+ "epoch": 0.07,
129
+ "learning_rate": 0.0009968310306358713,
130
+ "loss": 0.3885,
131
+ "step": 19
132
+ },
133
+ {
134
+ "epoch": 0.08,
135
+ "learning_rate": 0.00099608866266743,
136
+ "loss": 0.4143,
137
+ "step": 20
138
+ },
139
+ {
140
+ "epoch": 0.08,
141
+ "learning_rate": 0.0009952685795051076,
142
+ "loss": 0.4069,
143
+ "step": 21
144
+ },
145
+ {
146
+ "epoch": 0.08,
147
+ "learning_rate": 0.0009943709096197333,
148
+ "loss": 0.4054,
149
+ "step": 22
150
+ },
151
+ {
152
+ "epoch": 0.09,
153
+ "learning_rate": 0.0009933957936365514,
154
+ "loss": 0.3943,
155
+ "step": 23
156
+ },
157
+ {
158
+ "epoch": 0.09,
159
+ "learning_rate": 0.0009923433843131901,
160
+ "loss": 0.4342,
161
+ "step": 24
162
+ },
163
+ {
164
+ "epoch": 0.1,
165
+ "learning_rate": 0.0009912138465157324,
166
+ "loss": 0.4137,
167
+ "step": 25
168
+ },
169
+ {
170
+ "epoch": 0.1,
171
+ "learning_rate": 0.0009900073571928885,
172
+ "loss": 0.374,
173
+ "step": 26
174
+ },
175
+ {
176
+ "epoch": 0.1,
177
+ "learning_rate": 0.0009887241053482755,
178
+ "loss": 0.3953,
179
+ "step": 27
180
+ },
181
+ {
182
+ "epoch": 0.1,
183
+ "eval_loss": 0.3981475234031677,
184
+ "eval_runtime": 9.1109,
185
+ "eval_samples_per_second": 38.415,
186
+ "eval_steps_per_second": 9.659,
187
+ "step": 27
188
+ },
189
+ {
190
+ "epoch": 0.11,
191
+ "learning_rate": 0.000987364292010809,
192
+ "loss": 0.4272,
193
+ "step": 28
194
+ },
195
+ {
196
+ "epoch": 0.11,
197
+ "learning_rate": 0.0009859281302032107,
198
+ "loss": 0.4212,
199
+ "step": 29
200
+ },
201
+ {
202
+ "epoch": 0.11,
203
+ "learning_rate": 0.0009844158449086372,
204
+ "loss": 0.3869,
205
+ "step": 30
206
+ },
207
+ {
208
+ "epoch": 0.12,
209
+ "learning_rate": 0.0009828276730354352,
210
+ "loss": 0.3549,
211
+ "step": 31
212
+ },
213
+ {
214
+ "epoch": 0.12,
215
+ "learning_rate": 0.0009811638633800287,
216
+ "loss": 0.39,
217
+ "step": 32
218
+ },
219
+ {
220
+ "epoch": 0.13,
221
+ "learning_rate": 0.000979424676587942,
222
+ "loss": 0.3672,
223
+ "step": 33
224
+ },
225
+ {
226
+ "epoch": 0.13,
227
+ "learning_rate": 0.0009776103851129704,
228
+ "loss": 0.3763,
229
+ "step": 34
230
+ },
231
+ {
232
+ "epoch": 0.13,
233
+ "learning_rate": 0.0009757212731744974,
234
+ "loss": 0.4669,
235
+ "step": 35
236
+ },
237
+ {
238
+ "epoch": 0.14,
239
+ "learning_rate": 0.0009737576367129695,
240
+ "loss": 0.4118,
241
+ "step": 36
242
+ },
243
+ {
244
+ "epoch": 0.14,
245
+ "learning_rate": 0.0009717197833435367,
246
+ "loss": 0.3796,
247
+ "step": 37
248
+ },
249
+ {
250
+ "epoch": 0.15,
251
+ "learning_rate": 0.0009696080323078621,
252
+ "loss": 0.403,
253
+ "step": 38
254
+ },
255
+ {
256
+ "epoch": 0.15,
257
+ "learning_rate": 0.000967422714424111,
258
+ "loss": 0.3771,
259
+ "step": 39
260
+ },
261
+ {
262
+ "epoch": 0.15,
263
+ "learning_rate": 0.0009651641720351261,
264
+ "loss": 0.4236,
265
+ "step": 40
266
+ },
267
+ {
268
+ "epoch": 0.16,
269
+ "learning_rate": 0.0009628327589547976,
270
+ "loss": 0.3739,
271
+ "step": 41
272
+ },
273
+ {
274
+ "epoch": 0.16,
275
+ "learning_rate": 0.0009604288404126362,
276
+ "loss": 0.3789,
277
+ "step": 42
278
+ },
279
+ {
280
+ "epoch": 0.16,
281
+ "learning_rate": 0.0009579527929965582,
282
+ "loss": 0.3923,
283
+ "step": 43
284
+ },
285
+ {
286
+ "epoch": 0.17,
287
+ "learning_rate": 0.0009554050045938894,
288
+ "loss": 0.4141,
289
+ "step": 44
290
+ },
291
+ {
292
+ "epoch": 0.17,
293
+ "learning_rate": 0.0009527858743306019,
294
+ "loss": 0.3981,
295
+ "step": 45
296
+ },
297
+ {
298
+ "epoch": 0.18,
299
+ "learning_rate": 0.0009500958125087882,
300
+ "loss": 0.3737,
301
+ "step": 46
302
+ },
303
+ {
304
+ "epoch": 0.18,
305
+ "learning_rate": 0.0009473352405423844,
306
+ "loss": 0.3878,
307
+ "step": 47
308
+ },
309
+ {
310
+ "epoch": 0.18,
311
+ "learning_rate": 0.0009445045908911536,
312
+ "loss": 0.3734,
313
+ "step": 48
314
+ },
315
+ {
316
+ "epoch": 0.19,
317
+ "learning_rate": 0.0009416043069929387,
318
+ "loss": 0.3647,
319
+ "step": 49
320
+ },
321
+ {
322
+ "epoch": 0.19,
323
+ "learning_rate": 0.0009386348431941953,
324
+ "loss": 0.3761,
325
+ "step": 50
326
+ },
327
+ {
328
+ "epoch": 0.2,
329
+ "learning_rate": 0.0009355966646788152,
330
+ "loss": 0.3846,
331
+ "step": 51
332
+ },
333
+ {
334
+ "epoch": 0.2,
335
+ "learning_rate": 0.0009324902473952528,
336
+ "loss": 0.3762,
337
+ "step": 52
338
+ },
339
+ {
340
+ "epoch": 0.2,
341
+ "learning_rate": 0.0009293160779819658,
342
+ "loss": 0.3907,
343
+ "step": 53
344
+ },
345
+ {
346
+ "epoch": 0.21,
347
+ "learning_rate": 0.000926074653691179,
348
+ "loss": 0.3662,
349
+ "step": 54
350
+ },
351
+ {
352
+ "epoch": 0.21,
353
+ "eval_loss": 0.3824438154697418,
354
+ "eval_runtime": 9.1153,
355
+ "eval_samples_per_second": 38.397,
356
+ "eval_steps_per_second": 9.654,
357
+ "step": 54
358
+ },
359
+ {
360
+ "epoch": 0.21,
361
+ "learning_rate": 0.0009227664823109882,
362
+ "loss": 0.3862,
363
+ "step": 55
364
+ },
365
+ {
366
+ "epoch": 0.21,
367
+ "learning_rate": 0.0009193920820858111,
368
+ "loss": 0.409,
369
+ "step": 56
370
+ },
371
+ {
372
+ "epoch": 0.22,
373
+ "learning_rate": 0.000915951981635202,
374
+ "loss": 0.374,
375
+ "step": 57
376
+ },
377
+ {
378
+ "epoch": 0.22,
379
+ "learning_rate": 0.00091244671987104,
380
+ "loss": 0.3829,
381
+ "step": 58
382
+ },
383
+ {
384
+ "epoch": 0.23,
385
+ "learning_rate": 0.000908876845913106,
386
+ "loss": 0.3919,
387
+ "step": 59
388
+ },
389
+ {
390
+ "epoch": 0.23,
391
+ "learning_rate": 0.0009052429190030589,
392
+ "loss": 0.3739,
393
+ "step": 60
394
+ },
395
+ {
396
+ "epoch": 0.23,
397
+ "learning_rate": 0.0009015455084168279,
398
+ "loss": 0.3918,
399
+ "step": 61
400
+ },
401
+ {
402
+ "epoch": 0.24,
403
+ "learning_rate": 0.0008977851933754317,
404
+ "loss": 0.3809,
405
+ "step": 62
406
+ },
407
+ {
408
+ "epoch": 0.24,
409
+ "learning_rate": 0.0008939625629542402,
410
+ "loss": 0.3554,
411
+ "step": 63
412
+ },
413
+ {
414
+ "epoch": 0.24,
415
+ "learning_rate": 0.0008900782159906927,
416
+ "loss": 0.4147,
417
+ "step": 64
418
+ },
419
+ {
420
+ "epoch": 0.25,
421
+ "learning_rate": 0.0008861327609904858,
422
+ "loss": 0.4175,
423
+ "step": 65
424
+ },
425
+ {
426
+ "epoch": 0.25,
427
+ "learning_rate": 0.0008821268160322482,
428
+ "loss": 0.36,
429
+ "step": 66
430
+ },
431
+ {
432
+ "epoch": 0.26,
433
+ "learning_rate": 0.0008780610086707148,
434
+ "loss": 0.3417,
435
+ "step": 67
436
+ },
437
+ {
438
+ "epoch": 0.26,
439
+ "learning_rate": 0.0008739359758384161,
440
+ "loss": 0.3486,
441
+ "step": 68
442
+ },
443
+ {
444
+ "epoch": 0.26,
445
+ "learning_rate": 0.0008697523637458997,
446
+ "loss": 0.4344,
447
+ "step": 69
448
+ },
449
+ {
450
+ "epoch": 0.27,
451
+ "learning_rate": 0.0008655108277804975,
452
+ "loss": 0.4002,
453
+ "step": 70
454
+ },
455
+ {
456
+ "epoch": 0.27,
457
+ "learning_rate": 0.0008612120324036547,
458
+ "loss": 0.4107,
459
+ "step": 71
460
+ },
461
+ {
462
+ "epoch": 0.28,
463
+ "learning_rate": 0.0008568566510468391,
464
+ "loss": 0.3752,
465
+ "step": 72
466
+ },
467
+ {
468
+ "epoch": 0.28,
469
+ "learning_rate": 0.0008524453660060433,
470
+ "loss": 0.3873,
471
+ "step": 73
472
+ },
473
+ {
474
+ "epoch": 0.28,
475
+ "learning_rate": 0.0008479788683348994,
476
+ "loss": 0.3487,
477
+ "step": 74
478
+ },
479
+ {
480
+ "epoch": 0.29,
481
+ "learning_rate": 0.0008434578577364219,
482
+ "loss": 0.3557,
483
+ "step": 75
484
+ },
485
+ {
486
+ "epoch": 0.29,
487
+ "learning_rate": 0.0008388830424533935,
488
+ "loss": 0.3841,
489
+ "step": 76
490
+ },
491
+ {
492
+ "epoch": 0.29,
493
+ "learning_rate": 0.0008342551391574165,
494
+ "loss": 0.3842,
495
+ "step": 77
496
+ },
497
+ {
498
+ "epoch": 0.3,
499
+ "learning_rate": 0.0008295748728366412,
500
+ "loss": 0.3597,
501
+ "step": 78
502
+ },
503
+ {
504
+ "epoch": 0.3,
505
+ "learning_rate": 0.0008248429766821926,
506
+ "loss": 0.3659,
507
+ "step": 79
508
+ },
509
+ {
510
+ "epoch": 0.31,
511
+ "learning_rate": 0.0008200601919733106,
512
+ "loss": 0.3921,
513
+ "step": 80
514
+ },
515
+ {
516
+ "epoch": 0.31,
517
+ "learning_rate": 0.0008152272679612261,
518
+ "loss": 0.3383,
519
+ "step": 81
520
+ },
521
+ {
522
+ "epoch": 0.31,
523
+ "eval_loss": 0.37780898809432983,
524
+ "eval_runtime": 9.1104,
525
+ "eval_samples_per_second": 38.418,
526
+ "eval_steps_per_second": 9.659,
527
+ "step": 81
528
+ },
529
+ {
530
+ "epoch": 0.31,
531
+ "learning_rate": 0.000810344961751785,
532
+ "loss": 0.3658,
533
+ "step": 82
534
+ },
535
+ {
536
+ "epoch": 0.32,
537
+ "learning_rate": 0.0008054140381868435,
538
+ "loss": 0.4424,
539
+ "step": 83
540
+ },
541
+ {
542
+ "epoch": 0.32,
543
+ "learning_rate": 0.0008004352697244516,
544
+ "loss": 0.4,
545
+ "step": 84
546
+ },
547
+ {
548
+ "epoch": 0.33,
549
+ "learning_rate": 0.0007954094363178422,
550
+ "loss": 0.4207,
551
+ "step": 85
552
+ },
553
+ {
554
+ "epoch": 0.33,
555
+ "learning_rate": 0.0007903373252932473,
556
+ "loss": 0.4457,
557
+ "step": 86
558
+ },
559
+ {
560
+ "epoch": 0.33,
561
+ "learning_rate": 0.0007852197312265592,
562
+ "loss": 0.3963,
563
+ "step": 87
564
+ },
565
+ {
566
+ "epoch": 0.34,
567
+ "learning_rate": 0.0007800574558188547,
568
+ "loss": 0.3642,
569
+ "step": 88
570
+ },
571
+ {
572
+ "epoch": 0.34,
573
+ "learning_rate": 0.0007748513077708044,
574
+ "loss": 0.4025,
575
+ "step": 89
576
+ },
577
+ {
578
+ "epoch": 0.34,
579
+ "learning_rate": 0.0007696021026559849,
580
+ "loss": 0.3789,
581
+ "step": 90
582
+ },
583
+ {
584
+ "epoch": 0.35,
585
+ "learning_rate": 0.0007643106627931147,
586
+ "loss": 0.4409,
587
+ "step": 91
588
+ },
589
+ {
590
+ "epoch": 0.35,
591
+ "learning_rate": 0.0007589778171172321,
592
+ "loss": 0.3818,
593
+ "step": 92
594
+ },
595
+ {
596
+ "epoch": 0.36,
597
+ "learning_rate": 0.0007536044010498396,
598
+ "loss": 0.3816,
599
+ "step": 93
600
+ },
601
+ {
602
+ "epoch": 0.36,
603
+ "learning_rate": 0.000748191256368028,
604
+ "loss": 0.4157,
605
+ "step": 94
606
+ },
607
+ {
608
+ "epoch": 0.36,
609
+ "learning_rate": 0.0007427392310726088,
610
+ "loss": 0.3725,
611
+ "step": 95
612
+ },
613
+ {
614
+ "epoch": 0.37,
615
+ "learning_rate": 0.0007372491792552693,
616
+ "loss": 0.4034,
617
+ "step": 96
618
+ },
619
+ {
620
+ "epoch": 0.37,
621
+ "learning_rate": 0.000731721960964774,
622
+ "loss": 0.3456,
623
+ "step": 97
624
+ },
625
+ {
626
+ "epoch": 0.37,
627
+ "learning_rate": 0.0007261584420722328,
628
+ "loss": 0.3798,
629
+ "step": 98
630
+ },
631
+ {
632
+ "epoch": 0.38,
633
+ "learning_rate": 0.000720559494135458,
634
+ "loss": 0.3309,
635
+ "step": 99
636
+ },
637
+ {
638
+ "epoch": 0.38,
639
+ "learning_rate": 0.0007149259942624286,
640
+ "loss": 0.4046,
641
+ "step": 100
642
+ },
643
+ {
644
+ "epoch": 0.39,
645
+ "learning_rate": 0.0007092588249738871,
646
+ "loss": 0.3833,
647
+ "step": 101
648
+ },
649
+ {
650
+ "epoch": 0.39,
651
+ "learning_rate": 0.0007035588740650869,
652
+ "loss": 0.3554,
653
+ "step": 102
654
+ },
655
+ {
656
+ "epoch": 0.39,
657
+ "learning_rate": 0.0006978270344667142,
658
+ "loss": 0.389,
659
+ "step": 103
660
+ },
661
+ {
662
+ "epoch": 0.4,
663
+ "learning_rate": 0.0006920642041050055,
664
+ "loss": 0.4031,
665
+ "step": 104
666
+ },
667
+ {
668
+ "epoch": 0.4,
669
+ "learning_rate": 0.0006862712857610813,
670
+ "loss": 0.3599,
671
+ "step": 105
672
+ },
673
+ {
674
+ "epoch": 0.41,
675
+ "learning_rate": 0.0006804491869295207,
676
+ "loss": 0.359,
677
+ "step": 106
678
+ },
679
+ {
680
+ "epoch": 0.41,
681
+ "learning_rate": 0.0006745988196761976,
682
+ "loss": 0.4034,
683
+ "step": 107
684
+ },
685
+ {
686
+ "epoch": 0.41,
687
+ "learning_rate": 0.0006687211004953992,
688
+ "loss": 0.3484,
689
+ "step": 108
690
+ },
691
+ {
692
+ "epoch": 0.41,
693
+ "eval_loss": 0.3730239272117615,
694
+ "eval_runtime": 9.1206,
695
+ "eval_samples_per_second": 38.375,
696
+ "eval_steps_per_second": 9.648,
697
+ "step": 108
698
+ },
699
+ {
700
+ "epoch": 0.42,
701
+ "learning_rate": 0.0006628169501662526,
702
+ "loss": 0.3735,
703
+ "step": 109
704
+ },
705
+ {
706
+ "epoch": 0.42,
707
+ "learning_rate": 0.0006568872936084789,
708
+ "loss": 0.3637,
709
+ "step": 110
710
+ },
711
+ {
712
+ "epoch": 0.42,
713
+ "learning_rate": 0.0006509330597374993,
714
+ "loss": 0.3888,
715
+ "step": 111
716
+ },
717
+ {
718
+ "epoch": 0.43,
719
+ "learning_rate": 0.000644955181318915,
720
+ "loss": 0.3852,
721
+ "step": 112
722
+ },
723
+ {
724
+ "epoch": 0.43,
725
+ "learning_rate": 0.000638954594822384,
726
+ "loss": 0.3277,
727
+ "step": 113
728
+ },
729
+ {
730
+ "epoch": 0.44,
731
+ "learning_rate": 0.000632932240274918,
732
+ "loss": 0.341,
733
+ "step": 114
734
+ },
735
+ {
736
+ "epoch": 0.44,
737
+ "learning_rate": 0.000626889061113621,
738
+ "loss": 0.3207,
739
+ "step": 115
740
+ },
741
+ {
742
+ "epoch": 0.44,
743
+ "learning_rate": 0.0006208260040378946,
744
+ "loss": 0.3958,
745
+ "step": 116
746
+ },
747
+ {
748
+ "epoch": 0.45,
749
+ "learning_rate": 0.0006147440188611324,
750
+ "loss": 0.4147,
751
+ "step": 117
752
+ },
753
+ {
754
+ "epoch": 0.45,
755
+ "learning_rate": 0.0006086440583619257,
756
+ "loss": 0.4008,
757
+ "step": 118
758
+ },
759
+ {
760
+ "epoch": 0.46,
761
+ "learning_rate": 0.0006025270781348054,
762
+ "loss": 0.3868,
763
+ "step": 119
764
+ },
765
+ {
766
+ "epoch": 0.46,
767
+ "learning_rate": 0.0005963940364405425,
768
+ "loss": 0.4198,
769
+ "step": 120
770
+ },
771
+ {
772
+ "epoch": 0.46,
773
+ "learning_rate": 0.0005902458940560304,
774
+ "loss": 0.3412,
775
+ "step": 121
776
+ },
777
+ {
778
+ "epoch": 0.47,
779
+ "learning_rate": 0.0005840836141237747,
780
+ "loss": 0.3843,
781
+ "step": 122
782
+ },
783
+ {
784
+ "epoch": 0.47,
785
+ "learning_rate": 0.0005779081620010104,
786
+ "loss": 0.4006,
787
+ "step": 123
788
+ },
789
+ {
790
+ "epoch": 0.47,
791
+ "learning_rate": 0.000571720505108473,
792
+ "loss": 0.3502,
793
+ "step": 124
794
+ },
795
+ {
796
+ "epoch": 0.48,
797
+ "learning_rate": 0.0005655216127788471,
798
+ "loss": 0.3803,
799
+ "step": 125
800
+ },
801
+ {
802
+ "epoch": 0.48,
803
+ "learning_rate": 0.0005593124561049141,
804
+ "loss": 0.3338,
805
+ "step": 126
806
+ },
807
+ {
808
+ "epoch": 0.49,
809
+ "learning_rate": 0.0005530940077874249,
810
+ "loss": 0.3904,
811
+ "step": 127
812
+ },
813
+ {
814
+ "epoch": 0.49,
815
+ "learning_rate": 0.0005468672419827208,
816
+ "loss": 0.4288,
817
+ "step": 128
818
+ },
819
+ {
820
+ "epoch": 0.49,
821
+ "learning_rate": 0.0005406331341501263,
822
+ "loss": 0.4279,
823
+ "step": 129
824
+ },
825
+ {
826
+ "epoch": 0.5,
827
+ "learning_rate": 0.000534392660899138,
828
+ "loss": 0.4216,
829
+ "step": 130
830
+ },
831
+ {
832
+ "epoch": 0.5,
833
+ "learning_rate": 0.0005281467998364314,
834
+ "loss": 0.3592,
835
+ "step": 131
836
+ },
837
+ {
838
+ "epoch": 0.5,
839
+ "learning_rate": 0.0005218965294127155,
840
+ "loss": 0.3659,
841
+ "step": 132
842
+ },
843
+ {
844
+ "epoch": 0.51,
845
+ "learning_rate": 0.0005156428287694508,
846
+ "loss": 0.3812,
847
+ "step": 133
848
+ },
849
+ {
850
+ "epoch": 0.51,
851
+ "learning_rate": 0.0005093866775854617,
852
+ "loss": 0.3631,
853
+ "step": 134
854
+ },
855
+ {
856
+ "epoch": 0.52,
857
+ "learning_rate": 0.0005031290559234649,
858
+ "loss": 0.4098,
859
+ "step": 135
860
+ },
861
+ {
862
+ "epoch": 0.52,
863
+ "eval_loss": 0.3685910701751709,
864
+ "eval_runtime": 9.1112,
865
+ "eval_samples_per_second": 38.414,
866
+ "eval_steps_per_second": 9.658,
867
+ "step": 135
868
+ }
869
+ ],
870
+ "logging_steps": 1,
871
+ "max_steps": 261,
872
+ "num_input_tokens_seen": 0,
873
+ "num_train_epochs": 1,
874
+ "save_steps": 27,
875
+ "total_flos": 3.8191644033417216e+17,
876
+ "train_batch_size": 2,
877
+ "trial_name": null,
878
+ "trial_params": null
879
+ }
training_args.bin ADDED
Binary file (5.5 kB). View file