RetrO21 committed on
Commit
82e5deb
·
verified ·
1 Parent(s): 12207f7

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +6 -0
  2. README.md +28 -0
  3. adapter_config.json +41 -0
  4. adapter_model.safetensors +3 -0
  5. added_tokens.json +16 -0
  6. chat_template.jinja +7 -0
  7. checkpoint-1737/README.md +209 -0
  8. checkpoint-1737/adapter_config.json +41 -0
  9. checkpoint-1737/adapter_model.safetensors +3 -0
  10. checkpoint-1737/added_tokens.json +16 -0
  11. checkpoint-1737/chat_template.jinja +7 -0
  12. checkpoint-1737/merges.txt +0 -0
  13. checkpoint-1737/optimizer.pt +3 -0
  14. checkpoint-1737/rng_state.pth +3 -0
  15. checkpoint-1737/scheduler.pt +3 -0
  16. checkpoint-1737/special_tokens_map.json +31 -0
  17. checkpoint-1737/tokenizer.json +3 -0
  18. checkpoint-1737/tokenizer_config.json +143 -0
  19. checkpoint-1737/trainer_state.json +386 -0
  20. checkpoint-1737/training_args.bin +3 -0
  21. checkpoint-1737/vocab.json +0 -0
  22. checkpoint-3474/README.md +209 -0
  23. checkpoint-3474/adapter_config.json +41 -0
  24. checkpoint-3474/adapter_model.safetensors +3 -0
  25. checkpoint-3474/added_tokens.json +16 -0
  26. checkpoint-3474/chat_template.jinja +7 -0
  27. checkpoint-3474/merges.txt +0 -0
  28. checkpoint-3474/optimizer.pt +3 -0
  29. checkpoint-3474/rng_state.pth +3 -0
  30. checkpoint-3474/scheduler.pt +3 -0
  31. checkpoint-3474/special_tokens_map.json +31 -0
  32. checkpoint-3474/tokenizer.json +3 -0
  33. checkpoint-3474/tokenizer_config.json +143 -0
  34. checkpoint-3474/trainer_state.json +748 -0
  35. checkpoint-3474/training_args.bin +3 -0
  36. checkpoint-3474/vocab.json +0 -0
  37. checkpoint-5211/README.md +209 -0
  38. checkpoint-5211/adapter_config.json +41 -0
  39. checkpoint-5211/adapter_model.safetensors +3 -0
  40. checkpoint-5211/added_tokens.json +16 -0
  41. checkpoint-5211/chat_template.jinja +7 -0
  42. checkpoint-5211/merges.txt +0 -0
  43. checkpoint-5211/optimizer.pt +3 -0
  44. checkpoint-5211/rng_state.pth +3 -0
  45. checkpoint-5211/scheduler.pt +3 -0
  46. checkpoint-5211/special_tokens_map.json +31 -0
  47. checkpoint-5211/tokenizer.json +3 -0
  48. checkpoint-5211/tokenizer_config.json +143 -0
  49. checkpoint-5211/trainer_state.json +1110 -0
  50. checkpoint-5211/training_args.bin +3 -0
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-1737/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-3474/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ checkpoint-5211/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ checkpoint-6948/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ checkpoint-8685/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2-VL-2B-Instruct
3
+ library_name: peft
4
+ model_name: output
5
+ tags:
6
+ - adapter
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ license: apache-2.0
12
+ pipeline_tag: text-generation
13
+ ---
14
+
15
+ # Model Card for output
16
+
17
+ This model is a LoRA fine-tuned version of
18
+ [Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct).
19
+
20
+ It has been trained using the TRL SFT pipeline.
21
+
22
+ ## Quick start
23
+
24
+ ```python
25
+ from transformers import pipeline
26
+
27
+ pipe = pipeline("text-generation", model="RetrO21/agrofinetune", device="cuda")
28
+ print(pipe("What is nitrogen deficiency?")[0]["generated_text"])
adapter_config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2-VL-2B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.1,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.0",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "v_proj"
34
+ ],
35
+ "target_parameters": null,
36
+ "task_type": "CAUSAL_LM",
37
+ "trainable_token_indices": null,
38
+ "use_dora": false,
39
+ "use_qalora": false,
40
+ "use_rslora": false
41
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76b5201211b5dac5150a2b3a87809a5671a1239a76fdfafed2618f15a157a612
3
+ size 4374520
added_tokens.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|box_end|>": 151649,
3
+ "<|box_start|>": 151648,
4
+ "<|endoftext|>": 151643,
5
+ "<|im_end|>": 151645,
6
+ "<|im_start|>": 151644,
7
+ "<|image_pad|>": 151655,
8
+ "<|object_ref_end|>": 151647,
9
+ "<|object_ref_start|>": 151646,
10
+ "<|quad_end|>": 151651,
11
+ "<|quad_start|>": 151650,
12
+ "<|video_pad|>": 151656,
13
+ "<|vision_end|>": 151653,
14
+ "<|vision_pad|>": 151654,
15
+ "<|vision_start|>": 151652
16
+ }
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
checkpoint-1737/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2-VL-2B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2-VL-2B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.0
checkpoint-1737/adapter_config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2-VL-2B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.1,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.0",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "v_proj"
34
+ ],
35
+ "target_parameters": null,
36
+ "task_type": "CAUSAL_LM",
37
+ "trainable_token_indices": null,
38
+ "use_dora": false,
39
+ "use_qalora": false,
40
+ "use_rslora": false
41
+ }
checkpoint-1737/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5f4b9708eccf0370f9aaa1466d17c487ab3a9e4e84732d5cd39bbd229aedd5c
3
+ size 4374520
checkpoint-1737/added_tokens.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|box_end|>": 151649,
3
+ "<|box_start|>": 151648,
4
+ "<|endoftext|>": 151643,
5
+ "<|im_end|>": 151645,
6
+ "<|im_start|>": 151644,
7
+ "<|image_pad|>": 151655,
8
+ "<|object_ref_end|>": 151647,
9
+ "<|object_ref_start|>": 151646,
10
+ "<|quad_end|>": 151651,
11
+ "<|quad_start|>": 151650,
12
+ "<|video_pad|>": 151656,
13
+ "<|vision_end|>": 151653,
14
+ "<|vision_pad|>": 151654,
15
+ "<|vision_start|>": 151652
16
+ }
checkpoint-1737/chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
checkpoint-1737/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1737/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84ee821de3d805218a80046b08a325803a2434e306b554e094f68548e53fbe41
3
+ size 8783179
checkpoint-1737/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e816ab59bde4778d4f30814a9146abbd7044e1640b72b0be4234c4aa55b98f1
3
+ size 14645
checkpoint-1737/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9121f4d6a6f445ab467d2762de7c0b86cf7fef9179d9273d56797386ca47712
3
+ size 1465
checkpoint-1737/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-1737/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f33787292af226c4a4842be48a0e614d9524e25dc248e48bb1af0593de5564f9
3
+ size 11420539
checkpoint-1737/tokenizer_config.json ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "<|object_ref_start|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "<|object_ref_end|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "<|box_start|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "<|box_end|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "<|quad_start|>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "151651": {
69
+ "content": "<|quad_end|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "151652": {
77
+ "content": "<|vision_start|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "151653": {
85
+ "content": "<|vision_end|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "151654": {
93
+ "content": "<|vision_pad|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "151655": {
101
+ "content": "<|image_pad|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "151656": {
109
+ "content": "<|video_pad|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ }
116
+ },
117
+ "additional_special_tokens": [
118
+ "<|im_start|>",
119
+ "<|im_end|>",
120
+ "<|object_ref_start|>",
121
+ "<|object_ref_end|>",
122
+ "<|box_start|>",
123
+ "<|box_end|>",
124
+ "<|quad_start|>",
125
+ "<|quad_end|>",
126
+ "<|vision_start|>",
127
+ "<|vision_end|>",
128
+ "<|vision_pad|>",
129
+ "<|image_pad|>",
130
+ "<|video_pad|>"
131
+ ],
132
+ "bos_token": null,
133
+ "clean_up_tokenization_spaces": false,
134
+ "eos_token": "<|im_end|>",
135
+ "errors": "replace",
136
+ "extra_special_tokens": {},
137
+ "model_max_length": 32768,
138
+ "pad_token": "<|endoftext|>",
139
+ "padding_side": "right",
140
+ "split_special_tokens": false,
141
+ "tokenizer_class": "Qwen2Tokenizer",
142
+ "unk_token": null
143
+ }
checkpoint-1737/trainer_state.json ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1737,
3
+ "best_metric": 6.15173864364624,
4
+ "best_model_checkpoint": "./output/checkpoint-1737",
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1737,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 3.864118957519531,
14
+ "epoch": 0.028785261945883708,
15
+ "grad_norm": 2.7545533180236816,
16
+ "learning_rate": 9.800000000000001e-06,
17
+ "loss": 15.2997,
18
+ "mean_token_accuracy": 0.10086015284061432,
19
+ "num_tokens": 47319.0,
20
+ "step": 50
21
+ },
22
+ {
23
+ "entropy": 4.047076859474182,
24
+ "epoch": 0.057570523891767415,
25
+ "grad_norm": 5.0328264236450195,
26
+ "learning_rate": 1.98e-05,
27
+ "loss": 15.3264,
28
+ "mean_token_accuracy": 0.09582207053899765,
29
+ "num_tokens": 96809.0,
30
+ "step": 100
31
+ },
32
+ {
33
+ "entropy": 4.7578076648712155,
34
+ "epoch": 0.08635578583765112,
35
+ "grad_norm": 38.50589370727539,
36
+ "learning_rate": 1.988584740827024e-05,
37
+ "loss": 13.0056,
38
+ "mean_token_accuracy": 0.126854517608881,
39
+ "num_tokens": 139962.0,
40
+ "step": 150
41
+ },
42
+ {
43
+ "entropy": 6.80673882484436,
44
+ "epoch": 0.11514104778353483,
45
+ "grad_norm": 12.030129432678223,
46
+ "learning_rate": 1.97693651718113e-05,
47
+ "loss": 9.2822,
48
+ "mean_token_accuracy": 0.11084575355052947,
49
+ "num_tokens": 188029.0,
50
+ "step": 200
51
+ },
52
+ {
53
+ "entropy": 7.177925786972046,
54
+ "epoch": 0.14392630972941853,
55
+ "grad_norm": 4.852536201477051,
56
+ "learning_rate": 1.965288293535236e-05,
57
+ "loss": 7.6333,
58
+ "mean_token_accuracy": 0.12398939326405525,
59
+ "num_tokens": 234425.0,
60
+ "step": 250
61
+ },
62
+ {
63
+ "entropy": 7.080496473312378,
64
+ "epoch": 0.17271157167530224,
65
+ "grad_norm": 4.10841178894043,
66
+ "learning_rate": 1.9536400698893422e-05,
67
+ "loss": 7.1632,
68
+ "mean_token_accuracy": 0.13563686355948448,
69
+ "num_tokens": 278885.0,
70
+ "step": 300
71
+ },
72
+ {
73
+ "entropy": 6.931579580307007,
74
+ "epoch": 0.20149683362118595,
75
+ "grad_norm": 14.636048316955566,
76
+ "learning_rate": 1.941991846243448e-05,
77
+ "loss": 6.8213,
78
+ "mean_token_accuracy": 0.16459846690297128,
79
+ "num_tokens": 325491.0,
80
+ "step": 350
81
+ },
82
+ {
83
+ "entropy": 6.853660764694214,
84
+ "epoch": 0.23028209556706966,
85
+ "grad_norm": 5.966708183288574,
86
+ "learning_rate": 1.930343622597554e-05,
87
+ "loss": 6.6625,
88
+ "mean_token_accuracy": 0.17670693069696428,
89
+ "num_tokens": 372913.0,
90
+ "step": 400
91
+ },
92
+ {
93
+ "entropy": 6.684267387390137,
94
+ "epoch": 0.25906735751295334,
95
+ "grad_norm": 4.031010627746582,
96
+ "learning_rate": 1.91869539895166e-05,
97
+ "loss": 6.4505,
98
+ "mean_token_accuracy": 0.1943434515595436,
99
+ "num_tokens": 419159.0,
100
+ "step": 450
101
+ },
102
+ {
103
+ "entropy": 6.679989137649536,
104
+ "epoch": 0.28785261945883706,
105
+ "grad_norm": 6.251070022583008,
106
+ "learning_rate": 1.907047175305766e-05,
107
+ "loss": 6.4314,
108
+ "mean_token_accuracy": 0.19514557600021362,
109
+ "num_tokens": 466994.0,
110
+ "step": 500
111
+ },
112
+ {
113
+ "entropy": 6.477229623794556,
114
+ "epoch": 0.31663788140472077,
115
+ "grad_norm": 3.8656675815582275,
116
+ "learning_rate": 1.895398951659872e-05,
117
+ "loss": 6.2139,
118
+ "mean_token_accuracy": 0.21764743447303772,
119
+ "num_tokens": 513308.0,
120
+ "step": 550
121
+ },
122
+ {
123
+ "entropy": 6.408129243850708,
124
+ "epoch": 0.3454231433506045,
125
+ "grad_norm": 8.688581466674805,
126
+ "learning_rate": 1.883750728013978e-05,
127
+ "loss": 6.1224,
128
+ "mean_token_accuracy": 0.23438037544488907,
129
+ "num_tokens": 559679.0,
130
+ "step": 600
131
+ },
132
+ {
133
+ "entropy": 6.128518767356873,
134
+ "epoch": 0.3742084052964882,
135
+ "grad_norm": 5.419503688812256,
136
+ "learning_rate": 1.872102504368084e-05,
137
+ "loss": 5.8692,
138
+ "mean_token_accuracy": 0.26634690463542937,
139
+ "num_tokens": 603140.0,
140
+ "step": 650
141
+ },
142
+ {
143
+ "entropy": 6.322700729370117,
144
+ "epoch": 0.4029936672423719,
145
+ "grad_norm": 2.2213082313537598,
146
+ "learning_rate": 1.86045428072219e-05,
147
+ "loss": 6.0717,
148
+ "mean_token_accuracy": 0.24038562417030335,
149
+ "num_tokens": 650179.0,
150
+ "step": 700
151
+ },
152
+ {
153
+ "entropy": 6.236415157318115,
154
+ "epoch": 0.4317789291882556,
155
+ "grad_norm": 4.804980278015137,
156
+ "learning_rate": 1.848806057076296e-05,
157
+ "loss": 5.9986,
158
+ "mean_token_accuracy": 0.24596781462430953,
159
+ "num_tokens": 696220.0,
160
+ "step": 750
161
+ },
162
+ {
163
+ "entropy": 6.269758443832398,
164
+ "epoch": 0.4605641911341393,
165
+ "grad_norm": 2.2888853549957275,
166
+ "learning_rate": 1.837157833430402e-05,
167
+ "loss": 6.0385,
168
+ "mean_token_accuracy": 0.24074893474578857,
169
+ "num_tokens": 743909.0,
170
+ "step": 800
171
+ },
172
+ {
173
+ "entropy": 6.270364007949829,
174
+ "epoch": 0.48934945308002303,
175
+ "grad_norm": 3.0903279781341553,
176
+ "learning_rate": 1.825509609784508e-05,
177
+ "loss": 6.0481,
178
+ "mean_token_accuracy": 0.23740622967481614,
179
+ "num_tokens": 792015.0,
180
+ "step": 850
181
+ },
182
+ {
183
+ "entropy": 6.3037636184692385,
184
+ "epoch": 0.5181347150259067,
185
+ "grad_norm": 3.969320058822632,
186
+ "learning_rate": 1.813861386138614e-05,
187
+ "loss": 6.0855,
188
+ "mean_token_accuracy": 0.2309597587585449,
189
+ "num_tokens": 841802.0,
190
+ "step": 900
191
+ },
192
+ {
193
+ "entropy": 6.038041458129883,
194
+ "epoch": 0.5469199769717904,
195
+ "grad_norm": 2.2712185382843018,
196
+ "learning_rate": 1.80221316249272e-05,
197
+ "loss": 5.8285,
198
+ "mean_token_accuracy": 0.26099125802516937,
199
+ "num_tokens": 886492.0,
200
+ "step": 950
201
+ },
202
+ {
203
+ "entropy": 6.142958383560181,
204
+ "epoch": 0.5757052389176741,
205
+ "grad_norm": 1.2311755418777466,
206
+ "learning_rate": 1.790564938846826e-05,
207
+ "loss": 5.9357,
208
+ "mean_token_accuracy": 0.24810438305139543,
209
+ "num_tokens": 932807.0,
210
+ "step": 1000
211
+ },
212
+ {
213
+ "entropy": 6.199834351539612,
214
+ "epoch": 0.6044905008635578,
215
+ "grad_norm": 2.2788379192352295,
216
+ "learning_rate": 1.7789167152009318e-05,
217
+ "loss": 5.9964,
218
+ "mean_token_accuracy": 0.23942562609910964,
219
+ "num_tokens": 980541.0,
220
+ "step": 1050
221
+ },
222
+ {
223
+ "entropy": 5.961639919281006,
224
+ "epoch": 0.6332757628094415,
225
+ "grad_norm": 1.9077532291412354,
226
+ "learning_rate": 1.767268491555038e-05,
227
+ "loss": 5.7664,
228
+ "mean_token_accuracy": 0.26718012750148773,
229
+ "num_tokens": 1023882.0,
230
+ "step": 1100
231
+ },
232
+ {
233
+ "entropy": 5.889280087947846,
234
+ "epoch": 0.6620610247553252,
235
+ "grad_norm": 2.4254891872406006,
236
+ "learning_rate": 1.7556202679091442e-05,
237
+ "loss": 5.6952,
238
+ "mean_token_accuracy": 0.27529804170131683,
239
+ "num_tokens": 1068300.0,
240
+ "step": 1150
241
+ },
242
+ {
243
+ "entropy": 6.085640063285828,
244
+ "epoch": 0.690846286701209,
245
+ "grad_norm": 2.35312557220459,
246
+ "learning_rate": 1.74397204426325e-05,
247
+ "loss": 5.8898,
248
+ "mean_token_accuracy": 0.25166562348604204,
249
+ "num_tokens": 1115425.0,
250
+ "step": 1200
251
+ },
252
+ {
253
+ "entropy": 6.146574058532715,
254
+ "epoch": 0.7196315486470927,
255
+ "grad_norm": 1.7730146646499634,
256
+ "learning_rate": 1.732323820617356e-05,
257
+ "loss": 5.9519,
258
+ "mean_token_accuracy": 0.24276195973157882,
259
+ "num_tokens": 1162319.0,
260
+ "step": 1250
261
+ },
262
+ {
263
+ "entropy": 6.079372715950012,
264
+ "epoch": 0.7484168105929764,
265
+ "grad_norm": 1.7070863246917725,
266
+ "learning_rate": 1.720675596971462e-05,
267
+ "loss": 5.8922,
268
+ "mean_token_accuracy": 0.24961524546146394,
269
+ "num_tokens": 1208230.0,
270
+ "step": 1300
271
+ },
272
+ {
273
+ "entropy": 5.9683656406402585,
274
+ "epoch": 0.7772020725388601,
275
+ "grad_norm": 1.8790594339370728,
276
+ "learning_rate": 1.709027373325568e-05,
277
+ "loss": 5.7827,
278
+ "mean_token_accuracy": 0.2632122594118118,
279
+ "num_tokens": 1253074.0,
280
+ "step": 1350
281
+ },
282
+ {
283
+ "entropy": 6.107076721191406,
284
+ "epoch": 0.8059873344847438,
285
+ "grad_norm": 1.1745644807815552,
286
+ "learning_rate": 1.6973791496796742e-05,
287
+ "loss": 5.9211,
288
+ "mean_token_accuracy": 0.24564073830842972,
289
+ "num_tokens": 1300179.0,
290
+ "step": 1400
291
+ },
292
+ {
293
+ "entropy": 6.141328382492065,
294
+ "epoch": 0.8347725964306275,
295
+ "grad_norm": 1.0346958637237549,
296
+ "learning_rate": 1.68573092603378e-05,
297
+ "loss": 5.9584,
298
+ "mean_token_accuracy": 0.23997059136629104,
299
+ "num_tokens": 1347539.0,
300
+ "step": 1450
301
+ },
302
+ {
303
+ "entropy": 6.070010099411011,
304
+ "epoch": 0.8635578583765112,
305
+ "grad_norm": 1.6541163921356201,
306
+ "learning_rate": 1.674082702387886e-05,
307
+ "loss": 5.889,
308
+ "mean_token_accuracy": 0.24875166177749633,
309
+ "num_tokens": 1394157.0,
310
+ "step": 1500
311
+ },
312
+ {
313
+ "entropy": 6.207450666427612,
314
+ "epoch": 0.8923431203223949,
315
+ "grad_norm": 0.9742990732192993,
316
+ "learning_rate": 1.662434478741992e-05,
317
+ "loss": 6.0217,
318
+ "mean_token_accuracy": 0.23067249596118927,
319
+ "num_tokens": 1443892.0,
320
+ "step": 1550
321
+ },
322
+ {
323
+ "entropy": 6.026197805404663,
324
+ "epoch": 0.9211283822682786,
325
+ "grad_norm": 1.4229531288146973,
326
+ "learning_rate": 1.650786255096098e-05,
327
+ "loss": 5.8455,
328
+ "mean_token_accuracy": 0.2537291014194489,
329
+ "num_tokens": 1491050.0,
330
+ "step": 1600
331
+ },
332
+ {
333
+ "entropy": 6.210526428222656,
334
+ "epoch": 0.9499136442141624,
335
+ "grad_norm": 1.3555018901824951,
336
+ "learning_rate": 1.6391380314502038e-05,
337
+ "loss": 6.0279,
338
+ "mean_token_accuracy": 0.2308420208096504,
339
+ "num_tokens": 1540809.0,
340
+ "step": 1650
341
+ },
342
+ {
343
+ "entropy": 5.9872834014892575,
344
+ "epoch": 0.9786989061600461,
345
+ "grad_norm": 0.9893498420715332,
346
+ "learning_rate": 1.62748980780431e-05,
347
+ "loss": 5.8137,
348
+ "mean_token_accuracy": 0.2566875320672989,
349
+ "num_tokens": 1585876.0,
350
+ "step": 1700
351
+ },
352
+ {
353
+ "epoch": 1.0,
354
+ "eval_entropy": 6.322207130045386,
355
+ "eval_loss": 6.15173864364624,
356
+ "eval_mean_token_accuracy": 0.21116007946877985,
357
+ "eval_model_preparation_time": 0.0036,
358
+ "eval_num_tokens": 1619719.0,
359
+ "eval_runtime": 76.1297,
360
+ "eval_samples_per_second": 5.701,
361
+ "eval_steps_per_second": 2.85,
362
+ "step": 1737
363
+ }
364
+ ],
365
+ "logging_steps": 50,
366
+ "max_steps": 8685,
367
+ "num_input_tokens_seen": 0,
368
+ "num_train_epochs": 5,
369
+ "save_steps": 500,
370
+ "stateful_callbacks": {
371
+ "TrainerControl": {
372
+ "args": {
373
+ "should_epoch_stop": false,
374
+ "should_evaluate": false,
375
+ "should_log": false,
376
+ "should_save": true,
377
+ "should_training_stop": false
378
+ },
379
+ "attributes": {}
380
+ }
381
+ },
382
+ "total_flos": 2.265889302609408e+16,
383
+ "train_batch_size": 2,
384
+ "trial_name": null,
385
+ "trial_params": null
386
+ }
checkpoint-1737/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:130d33149272782bd60306263c371036419926142b8999aad7806359168f8484
3
+ size 6225
checkpoint-1737/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-3474/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: ''
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2-VL-2B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.0
checkpoint-3474/adapter_config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2-VL-2B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.1,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.0",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "v_proj"
34
+ ],
35
+ "target_parameters": null,
36
+ "task_type": "CAUSAL_LM",
37
+ "trainable_token_indices": null,
38
+ "use_dora": false,
39
+ "use_qalora": false,
40
+ "use_rslora": false
41
+ }
checkpoint-3474/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7979fe4ab41b842e564542d82ca738faea1a24cfcb2e3003501296353e2a240
3
+ size 4374520
checkpoint-3474/added_tokens.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|box_end|>": 151649,
3
+ "<|box_start|>": 151648,
4
+ "<|endoftext|>": 151643,
5
+ "<|im_end|>": 151645,
6
+ "<|im_start|>": 151644,
7
+ "<|image_pad|>": 151655,
8
+ "<|object_ref_end|>": 151647,
9
+ "<|object_ref_start|>": 151646,
10
+ "<|quad_end|>": 151651,
11
+ "<|quad_start|>": 151650,
12
+ "<|video_pad|>": 151656,
13
+ "<|vision_end|>": 151653,
14
+ "<|vision_pad|>": 151654,
15
+ "<|vision_start|>": 151652
16
+ }
checkpoint-3474/chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
checkpoint-3474/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-3474/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:571f08123879a8157590252a0cd0abe24c345fd53c5c7a3b55bb8b256658f9c0
3
+ size 8783179
checkpoint-3474/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f6c201154e30349ea924dac640f38cc7626e879caf89ba0aa995630585e3ea5
3
+ size 14645
checkpoint-3474/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecacb7697ae73257f39077a0e981cf0773317c0d0186dca0c24e0700ca53ab36
3
+ size 1465
checkpoint-3474/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-3474/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f33787292af226c4a4842be48a0e614d9524e25dc248e48bb1af0593de5564f9
3
+ size 11420539
checkpoint-3474/tokenizer_config.json ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "<|object_ref_start|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "<|object_ref_end|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "<|box_start|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "<|box_end|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "<|quad_start|>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "151651": {
69
+ "content": "<|quad_end|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "151652": {
77
+ "content": "<|vision_start|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "151653": {
85
+ "content": "<|vision_end|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "151654": {
93
+ "content": "<|vision_pad|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "151655": {
101
+ "content": "<|image_pad|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "151656": {
109
+ "content": "<|video_pad|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ }
116
+ },
117
+ "additional_special_tokens": [
118
+ "<|im_start|>",
119
+ "<|im_end|>",
120
+ "<|object_ref_start|>",
121
+ "<|object_ref_end|>",
122
+ "<|box_start|>",
123
+ "<|box_end|>",
124
+ "<|quad_start|>",
125
+ "<|quad_end|>",
126
+ "<|vision_start|>",
127
+ "<|vision_end|>",
128
+ "<|vision_pad|>",
129
+ "<|image_pad|>",
130
+ "<|video_pad|>"
131
+ ],
132
+ "bos_token": null,
133
+ "clean_up_tokenization_spaces": false,
134
+ "eos_token": "<|im_end|>",
135
+ "errors": "replace",
136
+ "extra_special_tokens": {},
137
+ "model_max_length": 32768,
138
+ "pad_token": "<|endoftext|>",
139
+ "padding_side": "right",
140
+ "split_special_tokens": false,
141
+ "tokenizer_class": "Qwen2Tokenizer",
142
+ "unk_token": null
143
+ }
checkpoint-3474/trainer_state.json ADDED
@@ -0,0 +1,748 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 3474,
3
+ "best_metric": 6.12472677230835,
4
+ "best_model_checkpoint": "./output/checkpoint-3474",
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 3474,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 3.864118957519531,
14
+ "epoch": 0.028785261945883708,
15
+ "grad_norm": 2.7545533180236816,
16
+ "learning_rate": 9.800000000000001e-06,
17
+ "loss": 15.2997,
18
+ "mean_token_accuracy": 0.10086015284061432,
19
+ "num_tokens": 47319.0,
20
+ "step": 50
21
+ },
22
+ {
23
+ "entropy": 4.047076859474182,
24
+ "epoch": 0.057570523891767415,
25
+ "grad_norm": 5.0328264236450195,
26
+ "learning_rate": 1.98e-05,
27
+ "loss": 15.3264,
28
+ "mean_token_accuracy": 0.09582207053899765,
29
+ "num_tokens": 96809.0,
30
+ "step": 100
31
+ },
32
+ {
33
+ "entropy": 4.7578076648712155,
34
+ "epoch": 0.08635578583765112,
35
+ "grad_norm": 38.50589370727539,
36
+ "learning_rate": 1.988584740827024e-05,
37
+ "loss": 13.0056,
38
+ "mean_token_accuracy": 0.126854517608881,
39
+ "num_tokens": 139962.0,
40
+ "step": 150
41
+ },
42
+ {
43
+ "entropy": 6.80673882484436,
44
+ "epoch": 0.11514104778353483,
45
+ "grad_norm": 12.030129432678223,
46
+ "learning_rate": 1.97693651718113e-05,
47
+ "loss": 9.2822,
48
+ "mean_token_accuracy": 0.11084575355052947,
49
+ "num_tokens": 188029.0,
50
+ "step": 200
51
+ },
52
+ {
53
+ "entropy": 7.177925786972046,
54
+ "epoch": 0.14392630972941853,
55
+ "grad_norm": 4.852536201477051,
56
+ "learning_rate": 1.965288293535236e-05,
57
+ "loss": 7.6333,
58
+ "mean_token_accuracy": 0.12398939326405525,
59
+ "num_tokens": 234425.0,
60
+ "step": 250
61
+ },
62
+ {
63
+ "entropy": 7.080496473312378,
64
+ "epoch": 0.17271157167530224,
65
+ "grad_norm": 4.10841178894043,
66
+ "learning_rate": 1.9536400698893422e-05,
67
+ "loss": 7.1632,
68
+ "mean_token_accuracy": 0.13563686355948448,
69
+ "num_tokens": 278885.0,
70
+ "step": 300
71
+ },
72
+ {
73
+ "entropy": 6.931579580307007,
74
+ "epoch": 0.20149683362118595,
75
+ "grad_norm": 14.636048316955566,
76
+ "learning_rate": 1.941991846243448e-05,
77
+ "loss": 6.8213,
78
+ "mean_token_accuracy": 0.16459846690297128,
79
+ "num_tokens": 325491.0,
80
+ "step": 350
81
+ },
82
+ {
83
+ "entropy": 6.853660764694214,
84
+ "epoch": 0.23028209556706966,
85
+ "grad_norm": 5.966708183288574,
86
+ "learning_rate": 1.930343622597554e-05,
87
+ "loss": 6.6625,
88
+ "mean_token_accuracy": 0.17670693069696428,
89
+ "num_tokens": 372913.0,
90
+ "step": 400
91
+ },
92
+ {
93
+ "entropy": 6.684267387390137,
94
+ "epoch": 0.25906735751295334,
95
+ "grad_norm": 4.031010627746582,
96
+ "learning_rate": 1.91869539895166e-05,
97
+ "loss": 6.4505,
98
+ "mean_token_accuracy": 0.1943434515595436,
99
+ "num_tokens": 419159.0,
100
+ "step": 450
101
+ },
102
+ {
103
+ "entropy": 6.679989137649536,
104
+ "epoch": 0.28785261945883706,
105
+ "grad_norm": 6.251070022583008,
106
+ "learning_rate": 1.907047175305766e-05,
107
+ "loss": 6.4314,
108
+ "mean_token_accuracy": 0.19514557600021362,
109
+ "num_tokens": 466994.0,
110
+ "step": 500
111
+ },
112
+ {
113
+ "entropy": 6.477229623794556,
114
+ "epoch": 0.31663788140472077,
115
+ "grad_norm": 3.8656675815582275,
116
+ "learning_rate": 1.895398951659872e-05,
117
+ "loss": 6.2139,
118
+ "mean_token_accuracy": 0.21764743447303772,
119
+ "num_tokens": 513308.0,
120
+ "step": 550
121
+ },
122
+ {
123
+ "entropy": 6.408129243850708,
124
+ "epoch": 0.3454231433506045,
125
+ "grad_norm": 8.688581466674805,
126
+ "learning_rate": 1.883750728013978e-05,
127
+ "loss": 6.1224,
128
+ "mean_token_accuracy": 0.23438037544488907,
129
+ "num_tokens": 559679.0,
130
+ "step": 600
131
+ },
132
+ {
133
+ "entropy": 6.128518767356873,
134
+ "epoch": 0.3742084052964882,
135
+ "grad_norm": 5.419503688812256,
136
+ "learning_rate": 1.872102504368084e-05,
137
+ "loss": 5.8692,
138
+ "mean_token_accuracy": 0.26634690463542937,
139
+ "num_tokens": 603140.0,
140
+ "step": 650
141
+ },
142
+ {
143
+ "entropy": 6.322700729370117,
144
+ "epoch": 0.4029936672423719,
145
+ "grad_norm": 2.2213082313537598,
146
+ "learning_rate": 1.86045428072219e-05,
147
+ "loss": 6.0717,
148
+ "mean_token_accuracy": 0.24038562417030335,
149
+ "num_tokens": 650179.0,
150
+ "step": 700
151
+ },
152
+ {
153
+ "entropy": 6.236415157318115,
154
+ "epoch": 0.4317789291882556,
155
+ "grad_norm": 4.804980278015137,
156
+ "learning_rate": 1.848806057076296e-05,
157
+ "loss": 5.9986,
158
+ "mean_token_accuracy": 0.24596781462430953,
159
+ "num_tokens": 696220.0,
160
+ "step": 750
161
+ },
162
+ {
163
+ "entropy": 6.269758443832398,
164
+ "epoch": 0.4605641911341393,
165
+ "grad_norm": 2.2888853549957275,
166
+ "learning_rate": 1.837157833430402e-05,
167
+ "loss": 6.0385,
168
+ "mean_token_accuracy": 0.24074893474578857,
169
+ "num_tokens": 743909.0,
170
+ "step": 800
171
+ },
172
+ {
173
+ "entropy": 6.270364007949829,
174
+ "epoch": 0.48934945308002303,
175
+ "grad_norm": 3.0903279781341553,
176
+ "learning_rate": 1.825509609784508e-05,
177
+ "loss": 6.0481,
178
+ "mean_token_accuracy": 0.23740622967481614,
179
+ "num_tokens": 792015.0,
180
+ "step": 850
181
+ },
182
+ {
183
+ "entropy": 6.3037636184692385,
184
+ "epoch": 0.5181347150259067,
185
+ "grad_norm": 3.969320058822632,
186
+ "learning_rate": 1.813861386138614e-05,
187
+ "loss": 6.0855,
188
+ "mean_token_accuracy": 0.2309597587585449,
189
+ "num_tokens": 841802.0,
190
+ "step": 900
191
+ },
192
+ {
193
+ "entropy": 6.038041458129883,
194
+ "epoch": 0.5469199769717904,
195
+ "grad_norm": 2.2712185382843018,
196
+ "learning_rate": 1.80221316249272e-05,
197
+ "loss": 5.8285,
198
+ "mean_token_accuracy": 0.26099125802516937,
199
+ "num_tokens": 886492.0,
200
+ "step": 950
201
+ },
202
+ {
203
+ "entropy": 6.142958383560181,
204
+ "epoch": 0.5757052389176741,
205
+ "grad_norm": 1.2311755418777466,
206
+ "learning_rate": 1.790564938846826e-05,
207
+ "loss": 5.9357,
208
+ "mean_token_accuracy": 0.24810438305139543,
209
+ "num_tokens": 932807.0,
210
+ "step": 1000
211
+ },
212
+ {
213
+ "entropy": 6.199834351539612,
214
+ "epoch": 0.6044905008635578,
215
+ "grad_norm": 2.2788379192352295,
216
+ "learning_rate": 1.7789167152009318e-05,
217
+ "loss": 5.9964,
218
+ "mean_token_accuracy": 0.23942562609910964,
219
+ "num_tokens": 980541.0,
220
+ "step": 1050
221
+ },
222
+ {
223
+ "entropy": 5.961639919281006,
224
+ "epoch": 0.6332757628094415,
225
+ "grad_norm": 1.9077532291412354,
226
+ "learning_rate": 1.767268491555038e-05,
227
+ "loss": 5.7664,
228
+ "mean_token_accuracy": 0.26718012750148773,
229
+ "num_tokens": 1023882.0,
230
+ "step": 1100
231
+ },
232
+ {
233
+ "entropy": 5.889280087947846,
234
+ "epoch": 0.6620610247553252,
235
+ "grad_norm": 2.4254891872406006,
236
+ "learning_rate": 1.7556202679091442e-05,
237
+ "loss": 5.6952,
238
+ "mean_token_accuracy": 0.27529804170131683,
239
+ "num_tokens": 1068300.0,
240
+ "step": 1150
241
+ },
242
+ {
243
+ "entropy": 6.085640063285828,
244
+ "epoch": 0.690846286701209,
245
+ "grad_norm": 2.35312557220459,
246
+ "learning_rate": 1.74397204426325e-05,
247
+ "loss": 5.8898,
248
+ "mean_token_accuracy": 0.25166562348604204,
249
+ "num_tokens": 1115425.0,
250
+ "step": 1200
251
+ },
252
+ {
253
+ "entropy": 6.146574058532715,
254
+ "epoch": 0.7196315486470927,
255
+ "grad_norm": 1.7730146646499634,
256
+ "learning_rate": 1.732323820617356e-05,
257
+ "loss": 5.9519,
258
+ "mean_token_accuracy": 0.24276195973157882,
259
+ "num_tokens": 1162319.0,
260
+ "step": 1250
261
+ },
262
+ {
263
+ "entropy": 6.079372715950012,
264
+ "epoch": 0.7484168105929764,
265
+ "grad_norm": 1.7070863246917725,
266
+ "learning_rate": 1.720675596971462e-05,
267
+ "loss": 5.8922,
268
+ "mean_token_accuracy": 0.24961524546146394,
269
+ "num_tokens": 1208230.0,
270
+ "step": 1300
271
+ },
272
+ {
273
+ "entropy": 5.9683656406402585,
274
+ "epoch": 0.7772020725388601,
275
+ "grad_norm": 1.8790594339370728,
276
+ "learning_rate": 1.709027373325568e-05,
277
+ "loss": 5.7827,
278
+ "mean_token_accuracy": 0.2632122594118118,
279
+ "num_tokens": 1253074.0,
280
+ "step": 1350
281
+ },
282
+ {
283
+ "entropy": 6.107076721191406,
284
+ "epoch": 0.8059873344847438,
285
+ "grad_norm": 1.1745644807815552,
286
+ "learning_rate": 1.6973791496796742e-05,
287
+ "loss": 5.9211,
288
+ "mean_token_accuracy": 0.24564073830842972,
289
+ "num_tokens": 1300179.0,
290
+ "step": 1400
291
+ },
292
+ {
293
+ "entropy": 6.141328382492065,
294
+ "epoch": 0.8347725964306275,
295
+ "grad_norm": 1.0346958637237549,
296
+ "learning_rate": 1.68573092603378e-05,
297
+ "loss": 5.9584,
298
+ "mean_token_accuracy": 0.23997059136629104,
299
+ "num_tokens": 1347539.0,
300
+ "step": 1450
301
+ },
302
+ {
303
+ "entropy": 6.070010099411011,
304
+ "epoch": 0.8635578583765112,
305
+ "grad_norm": 1.6541163921356201,
306
+ "learning_rate": 1.674082702387886e-05,
307
+ "loss": 5.889,
308
+ "mean_token_accuracy": 0.24875166177749633,
309
+ "num_tokens": 1394157.0,
310
+ "step": 1500
311
+ },
312
+ {
313
+ "entropy": 6.207450666427612,
314
+ "epoch": 0.8923431203223949,
315
+ "grad_norm": 0.9742990732192993,
316
+ "learning_rate": 1.662434478741992e-05,
317
+ "loss": 6.0217,
318
+ "mean_token_accuracy": 0.23067249596118927,
319
+ "num_tokens": 1443892.0,
320
+ "step": 1550
321
+ },
322
+ {
323
+ "entropy": 6.026197805404663,
324
+ "epoch": 0.9211283822682786,
325
+ "grad_norm": 1.4229531288146973,
326
+ "learning_rate": 1.650786255096098e-05,
327
+ "loss": 5.8455,
328
+ "mean_token_accuracy": 0.2537291014194489,
329
+ "num_tokens": 1491050.0,
330
+ "step": 1600
331
+ },
332
+ {
333
+ "entropy": 6.210526428222656,
334
+ "epoch": 0.9499136442141624,
335
+ "grad_norm": 1.3555018901824951,
336
+ "learning_rate": 1.6391380314502038e-05,
337
+ "loss": 6.0279,
338
+ "mean_token_accuracy": 0.2308420208096504,
339
+ "num_tokens": 1540809.0,
340
+ "step": 1650
341
+ },
342
+ {
343
+ "entropy": 5.9872834014892575,
344
+ "epoch": 0.9786989061600461,
345
+ "grad_norm": 0.9893498420715332,
346
+ "learning_rate": 1.62748980780431e-05,
347
+ "loss": 5.8137,
348
+ "mean_token_accuracy": 0.2566875320672989,
349
+ "num_tokens": 1585876.0,
350
+ "step": 1700
351
+ },
352
+ {
353
+ "epoch": 1.0,
354
+ "eval_entropy": 6.322207130045386,
355
+ "eval_loss": 6.15173864364624,
356
+ "eval_mean_token_accuracy": 0.21116007946877985,
357
+ "eval_model_preparation_time": 0.0036,
358
+ "eval_num_tokens": 1619719.0,
359
+ "eval_runtime": 76.1297,
360
+ "eval_samples_per_second": 5.701,
361
+ "eval_steps_per_second": 2.85,
362
+ "step": 1737
363
+ },
364
+ {
365
+ "entropy": 6.038531675338745,
366
+ "epoch": 1.0074841681059297,
367
+ "grad_norm": 0.8715208172798157,
368
+ "learning_rate": 1.615841584158416e-05,
369
+ "loss": 5.8628,
370
+ "mean_token_accuracy": 0.2510762655735016,
371
+ "num_tokens": 1632015.0,
372
+ "step": 1750
373
+ },
374
+ {
375
+ "entropy": 6.164030771255494,
376
+ "epoch": 1.0362694300518134,
377
+ "grad_norm": 0.7344900965690613,
378
+ "learning_rate": 1.604193360512522e-05,
379
+ "loss": 5.9856,
380
+ "mean_token_accuracy": 0.2351543301343918,
381
+ "num_tokens": 1681154.0,
382
+ "step": 1800
383
+ },
384
+ {
385
+ "entropy": 6.0731862354278565,
386
+ "epoch": 1.065054691997697,
387
+ "grad_norm": 1.0801328420639038,
388
+ "learning_rate": 1.592545136866628e-05,
389
+ "loss": 5.8976,
390
+ "mean_token_accuracy": 0.24701615989208223,
391
+ "num_tokens": 1728110.0,
392
+ "step": 1850
393
+ },
394
+ {
395
+ "entropy": 6.079212121963501,
396
+ "epoch": 1.0938399539435808,
397
+ "grad_norm": 0.7876909375190735,
398
+ "learning_rate": 1.5808969132207338e-05,
399
+ "loss": 5.9056,
400
+ "mean_token_accuracy": 0.24457543224096298,
401
+ "num_tokens": 1775703.0,
402
+ "step": 1900
403
+ },
404
+ {
405
+ "entropy": 6.062467746734619,
406
+ "epoch": 1.1226252158894645,
407
+ "grad_norm": 0.5999078750610352,
408
+ "learning_rate": 1.56924868957484e-05,
409
+ "loss": 5.8899,
410
+ "mean_token_accuracy": 0.2469428673386574,
411
+ "num_tokens": 1821980.0,
412
+ "step": 1950
413
+ },
414
+ {
415
+ "entropy": 6.031774473190308,
416
+ "epoch": 1.1514104778353482,
417
+ "grad_norm": 1.6313235759735107,
418
+ "learning_rate": 1.557600465928946e-05,
419
+ "loss": 5.8593,
420
+ "mean_token_accuracy": 0.250918984413147,
421
+ "num_tokens": 1867547.0,
422
+ "step": 2000
423
+ },
424
+ {
425
+ "entropy": 6.122789564132691,
426
+ "epoch": 1.180195739781232,
427
+ "grad_norm": 2.562373161315918,
428
+ "learning_rate": 1.545952242283052e-05,
429
+ "loss": 5.9502,
430
+ "mean_token_accuracy": 0.23938885867595672,
431
+ "num_tokens": 1915411.0,
432
+ "step": 2050
433
+ },
434
+ {
435
+ "entropy": 6.067130417823791,
436
+ "epoch": 1.2089810017271156,
437
+ "grad_norm": 0.9762872457504272,
438
+ "learning_rate": 1.534304018637158e-05,
439
+ "loss": 5.8956,
440
+ "mean_token_accuracy": 0.2454381173849106,
441
+ "num_tokens": 1964009.0,
442
+ "step": 2100
443
+ },
444
+ {
445
+ "entropy": 5.9613511180877685,
446
+ "epoch": 1.2377662636729994,
447
+ "grad_norm": 0.8701547384262085,
448
+ "learning_rate": 1.5226557949912639e-05,
449
+ "loss": 5.7907,
450
+ "mean_token_accuracy": 0.25976367652416227,
451
+ "num_tokens": 2008595.0,
452
+ "step": 2150
453
+ },
454
+ {
455
+ "entropy": 6.13505428314209,
456
+ "epoch": 1.266551525618883,
457
+ "grad_norm": 0.8511647582054138,
458
+ "learning_rate": 1.51100757134537e-05,
459
+ "loss": 5.9619,
460
+ "mean_token_accuracy": 0.23760781466960906,
461
+ "num_tokens": 2057229.0,
462
+ "step": 2200
463
+ },
464
+ {
465
+ "entropy": 6.025254983901977,
466
+ "epoch": 1.2953367875647668,
467
+ "grad_norm": 0.7627406120300293,
468
+ "learning_rate": 1.4993593476994758e-05,
469
+ "loss": 5.8546,
470
+ "mean_token_accuracy": 0.2508662334084511,
471
+ "num_tokens": 2103631.0,
472
+ "step": 2250
473
+ },
474
+ {
475
+ "entropy": 5.981974196434021,
476
+ "epoch": 1.3241220495106505,
477
+ "grad_norm": 1.6922173500061035,
478
+ "learning_rate": 1.4877111240535819e-05,
479
+ "loss": 5.8119,
480
+ "mean_token_accuracy": 0.256170334815979,
481
+ "num_tokens": 2150369.0,
482
+ "step": 2300
483
+ },
484
+ {
485
+ "entropy": 6.19903904914856,
486
+ "epoch": 1.3529073114565342,
487
+ "grad_norm": 0.40436601638793945,
488
+ "learning_rate": 1.4760629004076878e-05,
489
+ "loss": 6.0244,
490
+ "mean_token_accuracy": 0.22900927513837815,
491
+ "num_tokens": 2199724.0,
492
+ "step": 2350
493
+ },
494
+ {
495
+ "entropy": 5.986697297096253,
496
+ "epoch": 1.381692573402418,
497
+ "grad_norm": 0.8481882214546204,
498
+ "learning_rate": 1.464414676761794e-05,
499
+ "loss": 5.8195,
500
+ "mean_token_accuracy": 0.2552035376429558,
501
+ "num_tokens": 2245341.0,
502
+ "step": 2400
503
+ },
504
+ {
505
+ "entropy": 6.1886044692993165,
506
+ "epoch": 1.4104778353483016,
507
+ "grad_norm": 0.7911505103111267,
508
+ "learning_rate": 1.4527664531159e-05,
509
+ "loss": 6.0148,
510
+ "mean_token_accuracy": 0.23026730984449387,
511
+ "num_tokens": 2294726.0,
512
+ "step": 2450
513
+ },
514
+ {
515
+ "entropy": 5.974867792129516,
516
+ "epoch": 1.4392630972941853,
517
+ "grad_norm": 1.640499234199524,
518
+ "learning_rate": 1.441118229470006e-05,
519
+ "loss": 5.8111,
520
+ "mean_token_accuracy": 0.2554209426045418,
521
+ "num_tokens": 2342251.0,
522
+ "step": 2500
523
+ },
524
+ {
525
+ "entropy": 5.967635660171509,
526
+ "epoch": 1.468048359240069,
527
+ "grad_norm": 0.8022929430007935,
528
+ "learning_rate": 1.429470005824112e-05,
529
+ "loss": 5.8015,
530
+ "mean_token_accuracy": 0.2569852137565613,
531
+ "num_tokens": 2387469.0,
532
+ "step": 2550
533
+ },
534
+ {
535
+ "entropy": 6.047262029647827,
536
+ "epoch": 1.4968336211859528,
537
+ "grad_norm": 0.9270678758621216,
538
+ "learning_rate": 1.417821782178218e-05,
539
+ "loss": 5.8782,
540
+ "mean_token_accuracy": 0.2467849862575531,
541
+ "num_tokens": 2434128.0,
542
+ "step": 2600
543
+ },
544
+ {
545
+ "entropy": 6.00601068019867,
546
+ "epoch": 1.5256188831318365,
547
+ "grad_norm": 1.5378597974777222,
548
+ "learning_rate": 1.406173558532324e-05,
549
+ "loss": 5.839,
550
+ "mean_token_accuracy": 0.25216978013515473,
551
+ "num_tokens": 2480366.0,
552
+ "step": 2650
553
+ },
554
+ {
555
+ "entropy": 5.988714299201965,
556
+ "epoch": 1.5544041450777202,
557
+ "grad_norm": 0.819143533706665,
558
+ "learning_rate": 1.3945253348864299e-05,
559
+ "loss": 5.82,
560
+ "mean_token_accuracy": 0.254311783015728,
561
+ "num_tokens": 2527357.0,
562
+ "step": 2700
563
+ },
564
+ {
565
+ "entropy": 5.960293846130371,
566
+ "epoch": 1.583189407023604,
567
+ "grad_norm": 0.8920449614524841,
568
+ "learning_rate": 1.382877111240536e-05,
569
+ "loss": 5.7946,
570
+ "mean_token_accuracy": 0.25750755161046984,
571
+ "num_tokens": 2574470.0,
572
+ "step": 2750
573
+ },
574
+ {
575
+ "entropy": 6.1214879322052,
576
+ "epoch": 1.6119746689694876,
577
+ "grad_norm": 0.5333890914916992,
578
+ "learning_rate": 1.371228887594642e-05,
579
+ "loss": 5.9513,
580
+ "mean_token_accuracy": 0.2377367687225342,
581
+ "num_tokens": 2622280.0,
582
+ "step": 2800
583
+ },
584
+ {
585
+ "entropy": 5.951769871711731,
586
+ "epoch": 1.6407599309153713,
587
+ "grad_norm": 0.5994665026664734,
588
+ "learning_rate": 1.3595806639487479e-05,
589
+ "loss": 5.7861,
590
+ "mean_token_accuracy": 0.25854207515716554,
591
+ "num_tokens": 2668624.0,
592
+ "step": 2850
593
+ },
594
+ {
595
+ "entropy": 5.927765312194825,
596
+ "epoch": 1.669545192861255,
597
+ "grad_norm": 0.4460087716579437,
598
+ "learning_rate": 1.347932440302854e-05,
599
+ "loss": 5.7661,
600
+ "mean_token_accuracy": 0.25973255425691605,
601
+ "num_tokens": 2714388.0,
602
+ "step": 2900
603
+ },
604
+ {
605
+ "entropy": 6.097678365707398,
606
+ "epoch": 1.6983304548071387,
607
+ "grad_norm": 0.7125752568244934,
608
+ "learning_rate": 1.3362842166569598e-05,
609
+ "loss": 5.9284,
610
+ "mean_token_accuracy": 0.23995368272066117,
611
+ "num_tokens": 2761465.0,
612
+ "step": 2950
613
+ },
614
+ {
615
+ "entropy": 5.986212658882141,
616
+ "epoch": 1.7271157167530224,
617
+ "grad_norm": 1.5405049324035645,
618
+ "learning_rate": 1.3246359930110659e-05,
619
+ "loss": 5.8194,
620
+ "mean_token_accuracy": 0.25333445996046067,
621
+ "num_tokens": 2808066.0,
622
+ "step": 3000
623
+ },
624
+ {
625
+ "entropy": 5.7968806195259095,
626
+ "epoch": 1.7559009786989062,
627
+ "grad_norm": 0.4532749652862549,
628
+ "learning_rate": 1.312987769365172e-05,
629
+ "loss": 5.6344,
630
+ "mean_token_accuracy": 0.2782411390542984,
631
+ "num_tokens": 2851822.0,
632
+ "step": 3050
633
+ },
634
+ {
635
+ "entropy": 5.973708114624023,
636
+ "epoch": 1.7846862406447899,
637
+ "grad_norm": 1.4795438051223755,
638
+ "learning_rate": 1.3013395457192778e-05,
639
+ "loss": 5.8104,
640
+ "mean_token_accuracy": 0.25441971331834795,
641
+ "num_tokens": 2897737.0,
642
+ "step": 3100
643
+ },
644
+ {
645
+ "entropy": 5.70733567237854,
646
+ "epoch": 1.8134715025906736,
647
+ "grad_norm": 0.6216577887535095,
648
+ "learning_rate": 1.2896913220733839e-05,
649
+ "loss": 5.5523,
650
+ "mean_token_accuracy": 0.28787180870771406,
651
+ "num_tokens": 2939511.0,
652
+ "step": 3150
653
+ },
654
+ {
655
+ "entropy": 5.96826630115509,
656
+ "epoch": 1.8422567645365573,
657
+ "grad_norm": 0.9246350526809692,
658
+ "learning_rate": 1.2780430984274898e-05,
659
+ "loss": 5.8057,
660
+ "mean_token_accuracy": 0.25464902341365814,
661
+ "num_tokens": 2986368.0,
662
+ "step": 3200
663
+ },
664
+ {
665
+ "entropy": 5.950662693977356,
666
+ "epoch": 1.871042026482441,
667
+ "grad_norm": 0.8141199946403503,
668
+ "learning_rate": 1.266394874781596e-05,
669
+ "loss": 5.7886,
670
+ "mean_token_accuracy": 0.25830793648958206,
671
+ "num_tokens": 3031770.0,
672
+ "step": 3250
673
+ },
674
+ {
675
+ "entropy": 6.00512773513794,
676
+ "epoch": 1.8998272884283247,
677
+ "grad_norm": 0.4913998246192932,
678
+ "learning_rate": 1.2547466511357018e-05,
679
+ "loss": 5.838,
680
+ "mean_token_accuracy": 0.2512077575922012,
681
+ "num_tokens": 3078322.0,
682
+ "step": 3300
683
+ },
684
+ {
685
+ "entropy": 6.090880632400513,
686
+ "epoch": 1.9286125503742084,
687
+ "grad_norm": 0.9893012046813965,
688
+ "learning_rate": 1.243098427489808e-05,
689
+ "loss": 5.9264,
690
+ "mean_token_accuracy": 0.2391783133149147,
691
+ "num_tokens": 3125572.0,
692
+ "step": 3350
693
+ },
694
+ {
695
+ "entropy": 5.949693293571472,
696
+ "epoch": 1.9573978123200921,
697
+ "grad_norm": 0.5794200301170349,
698
+ "learning_rate": 1.231450203843914e-05,
699
+ "loss": 5.7861,
700
+ "mean_token_accuracy": 0.2568664598464966,
701
+ "num_tokens": 3171974.0,
702
+ "step": 3400
703
+ },
704
+ {
705
+ "entropy": 6.03591317653656,
706
+ "epoch": 1.9861830742659758,
707
+ "grad_norm": 0.8525373339653015,
708
+ "learning_rate": 1.21980198019802e-05,
709
+ "loss": 5.8741,
710
+ "mean_token_accuracy": 0.24642003327608109,
711
+ "num_tokens": 3219624.0,
712
+ "step": 3450
713
+ },
714
+ {
715
+ "epoch": 2.0,
716
+ "eval_entropy": 6.272298685416648,
717
+ "eval_loss": 6.12472677230835,
718
+ "eval_mean_token_accuracy": 0.21168697409091458,
719
+ "eval_model_preparation_time": 0.0036,
720
+ "eval_num_tokens": 3239438.0,
721
+ "eval_runtime": 76.2536,
722
+ "eval_samples_per_second": 5.692,
723
+ "eval_steps_per_second": 2.846,
724
+ "step": 3474
725
+ }
726
+ ],
727
+ "logging_steps": 50,
728
+ "max_steps": 8685,
729
+ "num_input_tokens_seen": 0,
730
+ "num_train_epochs": 5,
731
+ "save_steps": 500,
732
+ "stateful_callbacks": {
733
+ "TrainerControl": {
734
+ "args": {
735
+ "should_epoch_stop": false,
736
+ "should_evaluate": false,
737
+ "should_log": false,
738
+ "should_save": true,
739
+ "should_training_stop": false
740
+ },
741
+ "attributes": {}
742
+ }
743
+ },
744
+ "total_flos": 4.529454004325376e+16,
745
+ "train_batch_size": 2,
746
+ "trial_name": null,
747
+ "trial_params": null
748
+ }
checkpoint-3474/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:130d33149272782bd60306263c371036419926142b8999aad7806359168f8484
3
+ size 6225
checkpoint-3474/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-5211/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: ''
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2-VL-2B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.0
checkpoint-5211/adapter_config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2-VL-2B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.1,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.0",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "v_proj"
34
+ ],
35
+ "target_parameters": null,
36
+ "task_type": "CAUSAL_LM",
37
+ "trainable_token_indices": null,
38
+ "use_dora": false,
39
+ "use_qalora": false,
40
+ "use_rslora": false
41
+ }
checkpoint-5211/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e6a7b22d63fd8741b839353cbaab150c0bd5f07d663ad8884bd3b4af58a9cce
3
+ size 4374520
checkpoint-5211/added_tokens.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|box_end|>": 151649,
3
+ "<|box_start|>": 151648,
4
+ "<|endoftext|>": 151643,
5
+ "<|im_end|>": 151645,
6
+ "<|im_start|>": 151644,
7
+ "<|image_pad|>": 151655,
8
+ "<|object_ref_end|>": 151647,
9
+ "<|object_ref_start|>": 151646,
10
+ "<|quad_end|>": 151651,
11
+ "<|quad_start|>": 151650,
12
+ "<|video_pad|>": 151656,
13
+ "<|vision_end|>": 151653,
14
+ "<|vision_pad|>": 151654,
15
+ "<|vision_start|>": 151652
16
+ }
checkpoint-5211/chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
checkpoint-5211/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-5211/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d7235486f7f068a0b9991bde7ca0b6a16106923b1cca53549a5bb621f15d218
3
+ size 8783179
checkpoint-5211/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43cbafcbad7a00736ad4867a9fc18293a08b0b3d13acacb84d30cd8449539e81
3
+ size 14645
checkpoint-5211/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c82e157712778db9a1270de44d6dd5d35b469dbf5b63767059cabfb507d50c8a
3
+ size 1465
checkpoint-5211/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-5211/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f33787292af226c4a4842be48a0e614d9524e25dc248e48bb1af0593de5564f9
3
+ size 11420539
checkpoint-5211/tokenizer_config.json ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "<|object_ref_start|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "<|object_ref_end|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "<|box_start|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "<|box_end|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "<|quad_start|>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "151651": {
69
+ "content": "<|quad_end|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "151652": {
77
+ "content": "<|vision_start|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "151653": {
85
+ "content": "<|vision_end|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "151654": {
93
+ "content": "<|vision_pad|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "151655": {
101
+ "content": "<|image_pad|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "151656": {
109
+ "content": "<|video_pad|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ }
116
+ },
117
+ "additional_special_tokens": [
118
+ "<|im_start|>",
119
+ "<|im_end|>",
120
+ "<|object_ref_start|>",
121
+ "<|object_ref_end|>",
122
+ "<|box_start|>",
123
+ "<|box_end|>",
124
+ "<|quad_start|>",
125
+ "<|quad_end|>",
126
+ "<|vision_start|>",
127
+ "<|vision_end|>",
128
+ "<|vision_pad|>",
129
+ "<|image_pad|>",
130
+ "<|video_pad|>"
131
+ ],
132
+ "bos_token": null,
133
+ "clean_up_tokenization_spaces": false,
134
+ "eos_token": "<|im_end|>",
135
+ "errors": "replace",
136
+ "extra_special_tokens": {},
137
+ "model_max_length": 32768,
138
+ "pad_token": "<|endoftext|>",
139
+ "padding_side": "right",
140
+ "split_special_tokens": false,
141
+ "tokenizer_class": "Qwen2Tokenizer",
142
+ "unk_token": null
143
+ }
checkpoint-5211/trainer_state.json ADDED
@@ -0,0 +1,1110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 5211,
3
+ "best_metric": 6.0980024337768555,
4
+ "best_model_checkpoint": "./output/checkpoint-5211",
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 5211,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 3.864118957519531,
14
+ "epoch": 0.028785261945883708,
15
+ "grad_norm": 2.7545533180236816,
16
+ "learning_rate": 9.800000000000001e-06,
17
+ "loss": 15.2997,
18
+ "mean_token_accuracy": 0.10086015284061432,
19
+ "num_tokens": 47319.0,
20
+ "step": 50
21
+ },
22
+ {
23
+ "entropy": 4.047076859474182,
24
+ "epoch": 0.057570523891767415,
25
+ "grad_norm": 5.0328264236450195,
26
+ "learning_rate": 1.98e-05,
27
+ "loss": 15.3264,
28
+ "mean_token_accuracy": 0.09582207053899765,
29
+ "num_tokens": 96809.0,
30
+ "step": 100
31
+ },
32
+ {
33
+ "entropy": 4.7578076648712155,
34
+ "epoch": 0.08635578583765112,
35
+ "grad_norm": 38.50589370727539,
36
+ "learning_rate": 1.988584740827024e-05,
37
+ "loss": 13.0056,
38
+ "mean_token_accuracy": 0.126854517608881,
39
+ "num_tokens": 139962.0,
40
+ "step": 150
41
+ },
42
+ {
43
+ "entropy": 6.80673882484436,
44
+ "epoch": 0.11514104778353483,
45
+ "grad_norm": 12.030129432678223,
46
+ "learning_rate": 1.97693651718113e-05,
47
+ "loss": 9.2822,
48
+ "mean_token_accuracy": 0.11084575355052947,
49
+ "num_tokens": 188029.0,
50
+ "step": 200
51
+ },
52
+ {
53
+ "entropy": 7.177925786972046,
54
+ "epoch": 0.14392630972941853,
55
+ "grad_norm": 4.852536201477051,
56
+ "learning_rate": 1.965288293535236e-05,
57
+ "loss": 7.6333,
58
+ "mean_token_accuracy": 0.12398939326405525,
59
+ "num_tokens": 234425.0,
60
+ "step": 250
61
+ },
62
+ {
63
+ "entropy": 7.080496473312378,
64
+ "epoch": 0.17271157167530224,
65
+ "grad_norm": 4.10841178894043,
66
+ "learning_rate": 1.9536400698893422e-05,
67
+ "loss": 7.1632,
68
+ "mean_token_accuracy": 0.13563686355948448,
69
+ "num_tokens": 278885.0,
70
+ "step": 300
71
+ },
72
+ {
73
+ "entropy": 6.931579580307007,
74
+ "epoch": 0.20149683362118595,
75
+ "grad_norm": 14.636048316955566,
76
+ "learning_rate": 1.941991846243448e-05,
77
+ "loss": 6.8213,
78
+ "mean_token_accuracy": 0.16459846690297128,
79
+ "num_tokens": 325491.0,
80
+ "step": 350
81
+ },
82
+ {
83
+ "entropy": 6.853660764694214,
84
+ "epoch": 0.23028209556706966,
85
+ "grad_norm": 5.966708183288574,
86
+ "learning_rate": 1.930343622597554e-05,
87
+ "loss": 6.6625,
88
+ "mean_token_accuracy": 0.17670693069696428,
89
+ "num_tokens": 372913.0,
90
+ "step": 400
91
+ },
92
+ {
93
+ "entropy": 6.684267387390137,
94
+ "epoch": 0.25906735751295334,
95
+ "grad_norm": 4.031010627746582,
96
+ "learning_rate": 1.91869539895166e-05,
97
+ "loss": 6.4505,
98
+ "mean_token_accuracy": 0.1943434515595436,
99
+ "num_tokens": 419159.0,
100
+ "step": 450
101
+ },
102
+ {
103
+ "entropy": 6.679989137649536,
104
+ "epoch": 0.28785261945883706,
105
+ "grad_norm": 6.251070022583008,
106
+ "learning_rate": 1.907047175305766e-05,
107
+ "loss": 6.4314,
108
+ "mean_token_accuracy": 0.19514557600021362,
109
+ "num_tokens": 466994.0,
110
+ "step": 500
111
+ },
112
+ {
113
+ "entropy": 6.477229623794556,
114
+ "epoch": 0.31663788140472077,
115
+ "grad_norm": 3.8656675815582275,
116
+ "learning_rate": 1.895398951659872e-05,
117
+ "loss": 6.2139,
118
+ "mean_token_accuracy": 0.21764743447303772,
119
+ "num_tokens": 513308.0,
120
+ "step": 550
121
+ },
122
+ {
123
+ "entropy": 6.408129243850708,
124
+ "epoch": 0.3454231433506045,
125
+ "grad_norm": 8.688581466674805,
126
+ "learning_rate": 1.883750728013978e-05,
127
+ "loss": 6.1224,
128
+ "mean_token_accuracy": 0.23438037544488907,
129
+ "num_tokens": 559679.0,
130
+ "step": 600
131
+ },
132
+ {
133
+ "entropy": 6.128518767356873,
134
+ "epoch": 0.3742084052964882,
135
+ "grad_norm": 5.419503688812256,
136
+ "learning_rate": 1.872102504368084e-05,
137
+ "loss": 5.8692,
138
+ "mean_token_accuracy": 0.26634690463542937,
139
+ "num_tokens": 603140.0,
140
+ "step": 650
141
+ },
142
+ {
143
+ "entropy": 6.322700729370117,
144
+ "epoch": 0.4029936672423719,
145
+ "grad_norm": 2.2213082313537598,
146
+ "learning_rate": 1.86045428072219e-05,
147
+ "loss": 6.0717,
148
+ "mean_token_accuracy": 0.24038562417030335,
149
+ "num_tokens": 650179.0,
150
+ "step": 700
151
+ },
152
+ {
153
+ "entropy": 6.236415157318115,
154
+ "epoch": 0.4317789291882556,
155
+ "grad_norm": 4.804980278015137,
156
+ "learning_rate": 1.848806057076296e-05,
157
+ "loss": 5.9986,
158
+ "mean_token_accuracy": 0.24596781462430953,
159
+ "num_tokens": 696220.0,
160
+ "step": 750
161
+ },
162
+ {
163
+ "entropy": 6.269758443832398,
164
+ "epoch": 0.4605641911341393,
165
+ "grad_norm": 2.2888853549957275,
166
+ "learning_rate": 1.837157833430402e-05,
167
+ "loss": 6.0385,
168
+ "mean_token_accuracy": 0.24074893474578857,
169
+ "num_tokens": 743909.0,
170
+ "step": 800
171
+ },
172
+ {
173
+ "entropy": 6.270364007949829,
174
+ "epoch": 0.48934945308002303,
175
+ "grad_norm": 3.0903279781341553,
176
+ "learning_rate": 1.825509609784508e-05,
177
+ "loss": 6.0481,
178
+ "mean_token_accuracy": 0.23740622967481614,
179
+ "num_tokens": 792015.0,
180
+ "step": 850
181
+ },
182
+ {
183
+ "entropy": 6.3037636184692385,
184
+ "epoch": 0.5181347150259067,
185
+ "grad_norm": 3.969320058822632,
186
+ "learning_rate": 1.813861386138614e-05,
187
+ "loss": 6.0855,
188
+ "mean_token_accuracy": 0.2309597587585449,
189
+ "num_tokens": 841802.0,
190
+ "step": 900
191
+ },
192
+ {
193
+ "entropy": 6.038041458129883,
194
+ "epoch": 0.5469199769717904,
195
+ "grad_norm": 2.2712185382843018,
196
+ "learning_rate": 1.80221316249272e-05,
197
+ "loss": 5.8285,
198
+ "mean_token_accuracy": 0.26099125802516937,
199
+ "num_tokens": 886492.0,
200
+ "step": 950
201
+ },
202
+ {
203
+ "entropy": 6.142958383560181,
204
+ "epoch": 0.5757052389176741,
205
+ "grad_norm": 1.2311755418777466,
206
+ "learning_rate": 1.790564938846826e-05,
207
+ "loss": 5.9357,
208
+ "mean_token_accuracy": 0.24810438305139543,
209
+ "num_tokens": 932807.0,
210
+ "step": 1000
211
+ },
212
+ {
213
+ "entropy": 6.199834351539612,
214
+ "epoch": 0.6044905008635578,
215
+ "grad_norm": 2.2788379192352295,
216
+ "learning_rate": 1.7789167152009318e-05,
217
+ "loss": 5.9964,
218
+ "mean_token_accuracy": 0.23942562609910964,
219
+ "num_tokens": 980541.0,
220
+ "step": 1050
221
+ },
222
+ {
223
+ "entropy": 5.961639919281006,
224
+ "epoch": 0.6332757628094415,
225
+ "grad_norm": 1.9077532291412354,
226
+ "learning_rate": 1.767268491555038e-05,
227
+ "loss": 5.7664,
228
+ "mean_token_accuracy": 0.26718012750148773,
229
+ "num_tokens": 1023882.0,
230
+ "step": 1100
231
+ },
232
+ {
233
+ "entropy": 5.889280087947846,
234
+ "epoch": 0.6620610247553252,
235
+ "grad_norm": 2.4254891872406006,
236
+ "learning_rate": 1.7556202679091442e-05,
237
+ "loss": 5.6952,
238
+ "mean_token_accuracy": 0.27529804170131683,
239
+ "num_tokens": 1068300.0,
240
+ "step": 1150
241
+ },
242
+ {
243
+ "entropy": 6.085640063285828,
244
+ "epoch": 0.690846286701209,
245
+ "grad_norm": 2.35312557220459,
246
+ "learning_rate": 1.74397204426325e-05,
247
+ "loss": 5.8898,
248
+ "mean_token_accuracy": 0.25166562348604204,
249
+ "num_tokens": 1115425.0,
250
+ "step": 1200
251
+ },
252
+ {
253
+ "entropy": 6.146574058532715,
254
+ "epoch": 0.7196315486470927,
255
+ "grad_norm": 1.7730146646499634,
256
+ "learning_rate": 1.732323820617356e-05,
257
+ "loss": 5.9519,
258
+ "mean_token_accuracy": 0.24276195973157882,
259
+ "num_tokens": 1162319.0,
260
+ "step": 1250
261
+ },
262
+ {
263
+ "entropy": 6.079372715950012,
264
+ "epoch": 0.7484168105929764,
265
+ "grad_norm": 1.7070863246917725,
266
+ "learning_rate": 1.720675596971462e-05,
267
+ "loss": 5.8922,
268
+ "mean_token_accuracy": 0.24961524546146394,
269
+ "num_tokens": 1208230.0,
270
+ "step": 1300
271
+ },
272
+ {
273
+ "entropy": 5.9683656406402585,
274
+ "epoch": 0.7772020725388601,
275
+ "grad_norm": 1.8790594339370728,
276
+ "learning_rate": 1.709027373325568e-05,
277
+ "loss": 5.7827,
278
+ "mean_token_accuracy": 0.2632122594118118,
279
+ "num_tokens": 1253074.0,
280
+ "step": 1350
281
+ },
282
+ {
283
+ "entropy": 6.107076721191406,
284
+ "epoch": 0.8059873344847438,
285
+ "grad_norm": 1.1745644807815552,
286
+ "learning_rate": 1.6973791496796742e-05,
287
+ "loss": 5.9211,
288
+ "mean_token_accuracy": 0.24564073830842972,
289
+ "num_tokens": 1300179.0,
290
+ "step": 1400
291
+ },
292
+ {
293
+ "entropy": 6.141328382492065,
294
+ "epoch": 0.8347725964306275,
295
+ "grad_norm": 1.0346958637237549,
296
+ "learning_rate": 1.68573092603378e-05,
297
+ "loss": 5.9584,
298
+ "mean_token_accuracy": 0.23997059136629104,
299
+ "num_tokens": 1347539.0,
300
+ "step": 1450
301
+ },
302
+ {
303
+ "entropy": 6.070010099411011,
304
+ "epoch": 0.8635578583765112,
305
+ "grad_norm": 1.6541163921356201,
306
+ "learning_rate": 1.674082702387886e-05,
307
+ "loss": 5.889,
308
+ "mean_token_accuracy": 0.24875166177749633,
309
+ "num_tokens": 1394157.0,
310
+ "step": 1500
311
+ },
312
+ {
313
+ "entropy": 6.207450666427612,
314
+ "epoch": 0.8923431203223949,
315
+ "grad_norm": 0.9742990732192993,
316
+ "learning_rate": 1.662434478741992e-05,
317
+ "loss": 6.0217,
318
+ "mean_token_accuracy": 0.23067249596118927,
319
+ "num_tokens": 1443892.0,
320
+ "step": 1550
321
+ },
322
+ {
323
+ "entropy": 6.026197805404663,
324
+ "epoch": 0.9211283822682786,
325
+ "grad_norm": 1.4229531288146973,
326
+ "learning_rate": 1.650786255096098e-05,
327
+ "loss": 5.8455,
328
+ "mean_token_accuracy": 0.2537291014194489,
329
+ "num_tokens": 1491050.0,
330
+ "step": 1600
331
+ },
332
+ {
333
+ "entropy": 6.210526428222656,
334
+ "epoch": 0.9499136442141624,
335
+ "grad_norm": 1.3555018901824951,
336
+ "learning_rate": 1.6391380314502038e-05,
337
+ "loss": 6.0279,
338
+ "mean_token_accuracy": 0.2308420208096504,
339
+ "num_tokens": 1540809.0,
340
+ "step": 1650
341
+ },
342
+ {
343
+ "entropy": 5.9872834014892575,
344
+ "epoch": 0.9786989061600461,
345
+ "grad_norm": 0.9893498420715332,
346
+ "learning_rate": 1.62748980780431e-05,
347
+ "loss": 5.8137,
348
+ "mean_token_accuracy": 0.2566875320672989,
349
+ "num_tokens": 1585876.0,
350
+ "step": 1700
351
+ },
352
+ {
353
+ "epoch": 1.0,
354
+ "eval_entropy": 6.322207130045386,
355
+ "eval_loss": 6.15173864364624,
356
+ "eval_mean_token_accuracy": 0.21116007946877985,
357
+ "eval_model_preparation_time": 0.0036,
358
+ "eval_num_tokens": 1619719.0,
359
+ "eval_runtime": 76.1297,
360
+ "eval_samples_per_second": 5.701,
361
+ "eval_steps_per_second": 2.85,
362
+ "step": 1737
363
+ },
364
+ {
365
+ "entropy": 6.038531675338745,
366
+ "epoch": 1.0074841681059297,
367
+ "grad_norm": 0.8715208172798157,
368
+ "learning_rate": 1.615841584158416e-05,
369
+ "loss": 5.8628,
370
+ "mean_token_accuracy": 0.2510762655735016,
371
+ "num_tokens": 1632015.0,
372
+ "step": 1750
373
+ },
374
+ {
375
+ "entropy": 6.164030771255494,
376
+ "epoch": 1.0362694300518134,
377
+ "grad_norm": 0.7344900965690613,
378
+ "learning_rate": 1.604193360512522e-05,
379
+ "loss": 5.9856,
380
+ "mean_token_accuracy": 0.2351543301343918,
381
+ "num_tokens": 1681154.0,
382
+ "step": 1800
383
+ },
384
+ {
385
+ "entropy": 6.0731862354278565,
386
+ "epoch": 1.065054691997697,
387
+ "grad_norm": 1.0801328420639038,
388
+ "learning_rate": 1.592545136866628e-05,
389
+ "loss": 5.8976,
390
+ "mean_token_accuracy": 0.24701615989208223,
391
+ "num_tokens": 1728110.0,
392
+ "step": 1850
393
+ },
394
+ {
395
+ "entropy": 6.079212121963501,
396
+ "epoch": 1.0938399539435808,
397
+ "grad_norm": 0.7876909375190735,
398
+ "learning_rate": 1.5808969132207338e-05,
399
+ "loss": 5.9056,
400
+ "mean_token_accuracy": 0.24457543224096298,
401
+ "num_tokens": 1775703.0,
402
+ "step": 1900
403
+ },
404
+ {
405
+ "entropy": 6.062467746734619,
406
+ "epoch": 1.1226252158894645,
407
+ "grad_norm": 0.5999078750610352,
408
+ "learning_rate": 1.56924868957484e-05,
409
+ "loss": 5.8899,
410
+ "mean_token_accuracy": 0.2469428673386574,
411
+ "num_tokens": 1821980.0,
412
+ "step": 1950
413
+ },
414
+ {
415
+ "entropy": 6.031774473190308,
416
+ "epoch": 1.1514104778353482,
417
+ "grad_norm": 1.6313235759735107,
418
+ "learning_rate": 1.557600465928946e-05,
419
+ "loss": 5.8593,
420
+ "mean_token_accuracy": 0.250918984413147,
421
+ "num_tokens": 1867547.0,
422
+ "step": 2000
423
+ },
424
+ {
425
+ "entropy": 6.122789564132691,
426
+ "epoch": 1.180195739781232,
427
+ "grad_norm": 2.562373161315918,
428
+ "learning_rate": 1.545952242283052e-05,
429
+ "loss": 5.9502,
430
+ "mean_token_accuracy": 0.23938885867595672,
431
+ "num_tokens": 1915411.0,
432
+ "step": 2050
433
+ },
434
+ {
435
+ "entropy": 6.067130417823791,
436
+ "epoch": 1.2089810017271156,
437
+ "grad_norm": 0.9762872457504272,
438
+ "learning_rate": 1.534304018637158e-05,
439
+ "loss": 5.8956,
440
+ "mean_token_accuracy": 0.2454381173849106,
441
+ "num_tokens": 1964009.0,
442
+ "step": 2100
443
+ },
444
+ {
445
+ "entropy": 5.9613511180877685,
446
+ "epoch": 1.2377662636729994,
447
+ "grad_norm": 0.8701547384262085,
448
+ "learning_rate": 1.5226557949912639e-05,
449
+ "loss": 5.7907,
450
+ "mean_token_accuracy": 0.25976367652416227,
451
+ "num_tokens": 2008595.0,
452
+ "step": 2150
453
+ },
454
+ {
455
+ "entropy": 6.13505428314209,
456
+ "epoch": 1.266551525618883,
457
+ "grad_norm": 0.8511647582054138,
458
+ "learning_rate": 1.51100757134537e-05,
459
+ "loss": 5.9619,
460
+ "mean_token_accuracy": 0.23760781466960906,
461
+ "num_tokens": 2057229.0,
462
+ "step": 2200
463
+ },
464
+ {
465
+ "entropy": 6.025254983901977,
466
+ "epoch": 1.2953367875647668,
467
+ "grad_norm": 0.7627406120300293,
468
+ "learning_rate": 1.4993593476994758e-05,
469
+ "loss": 5.8546,
470
+ "mean_token_accuracy": 0.2508662334084511,
471
+ "num_tokens": 2103631.0,
472
+ "step": 2250
473
+ },
474
+ {
475
+ "entropy": 5.981974196434021,
476
+ "epoch": 1.3241220495106505,
477
+ "grad_norm": 1.6922173500061035,
478
+ "learning_rate": 1.4877111240535819e-05,
479
+ "loss": 5.8119,
480
+ "mean_token_accuracy": 0.256170334815979,
481
+ "num_tokens": 2150369.0,
482
+ "step": 2300
483
+ },
484
+ {
485
+ "entropy": 6.19903904914856,
486
+ "epoch": 1.3529073114565342,
487
+ "grad_norm": 0.40436601638793945,
488
+ "learning_rate": 1.4760629004076878e-05,
489
+ "loss": 6.0244,
490
+ "mean_token_accuracy": 0.22900927513837815,
491
+ "num_tokens": 2199724.0,
492
+ "step": 2350
493
+ },
494
+ {
495
+ "entropy": 5.986697297096253,
496
+ "epoch": 1.381692573402418,
497
+ "grad_norm": 0.8481882214546204,
498
+ "learning_rate": 1.464414676761794e-05,
499
+ "loss": 5.8195,
500
+ "mean_token_accuracy": 0.2552035376429558,
501
+ "num_tokens": 2245341.0,
502
+ "step": 2400
503
+ },
504
+ {
505
+ "entropy": 6.1886044692993165,
506
+ "epoch": 1.4104778353483016,
507
+ "grad_norm": 0.7911505103111267,
508
+ "learning_rate": 1.4527664531159e-05,
509
+ "loss": 6.0148,
510
+ "mean_token_accuracy": 0.23026730984449387,
511
+ "num_tokens": 2294726.0,
512
+ "step": 2450
513
+ },
514
+ {
515
+ "entropy": 5.974867792129516,
516
+ "epoch": 1.4392630972941853,
517
+ "grad_norm": 1.640499234199524,
518
+ "learning_rate": 1.441118229470006e-05,
519
+ "loss": 5.8111,
520
+ "mean_token_accuracy": 0.2554209426045418,
521
+ "num_tokens": 2342251.0,
522
+ "step": 2500
523
+ },
524
+ {
525
+ "entropy": 5.967635660171509,
526
+ "epoch": 1.468048359240069,
527
+ "grad_norm": 0.8022929430007935,
528
+ "learning_rate": 1.429470005824112e-05,
529
+ "loss": 5.8015,
530
+ "mean_token_accuracy": 0.2569852137565613,
531
+ "num_tokens": 2387469.0,
532
+ "step": 2550
533
+ },
534
+ {
535
+ "entropy": 6.047262029647827,
536
+ "epoch": 1.4968336211859528,
537
+ "grad_norm": 0.9270678758621216,
538
+ "learning_rate": 1.417821782178218e-05,
539
+ "loss": 5.8782,
540
+ "mean_token_accuracy": 0.2467849862575531,
541
+ "num_tokens": 2434128.0,
542
+ "step": 2600
543
+ },
544
+ {
545
+ "entropy": 6.00601068019867,
546
+ "epoch": 1.5256188831318365,
547
+ "grad_norm": 1.5378597974777222,
548
+ "learning_rate": 1.406173558532324e-05,
549
+ "loss": 5.839,
550
+ "mean_token_accuracy": 0.25216978013515473,
551
+ "num_tokens": 2480366.0,
552
+ "step": 2650
553
+ },
554
+ {
555
+ "entropy": 5.988714299201965,
556
+ "epoch": 1.5544041450777202,
557
+ "grad_norm": 0.819143533706665,
558
+ "learning_rate": 1.3945253348864299e-05,
559
+ "loss": 5.82,
560
+ "mean_token_accuracy": 0.254311783015728,
561
+ "num_tokens": 2527357.0,
562
+ "step": 2700
563
+ },
564
+ {
565
+ "entropy": 5.960293846130371,
566
+ "epoch": 1.583189407023604,
567
+ "grad_norm": 0.8920449614524841,
568
+ "learning_rate": 1.382877111240536e-05,
569
+ "loss": 5.7946,
570
+ "mean_token_accuracy": 0.25750755161046984,
571
+ "num_tokens": 2574470.0,
572
+ "step": 2750
573
+ },
574
+ {
575
+ "entropy": 6.1214879322052,
576
+ "epoch": 1.6119746689694876,
577
+ "grad_norm": 0.5333890914916992,
578
+ "learning_rate": 1.371228887594642e-05,
579
+ "loss": 5.9513,
580
+ "mean_token_accuracy": 0.2377367687225342,
581
+ "num_tokens": 2622280.0,
582
+ "step": 2800
583
+ },
584
+ {
585
+ "entropy": 5.951769871711731,
586
+ "epoch": 1.6407599309153713,
587
+ "grad_norm": 0.5994665026664734,
588
+ "learning_rate": 1.3595806639487479e-05,
589
+ "loss": 5.7861,
590
+ "mean_token_accuracy": 0.25854207515716554,
591
+ "num_tokens": 2668624.0,
592
+ "step": 2850
593
+ },
594
+ {
595
+ "entropy": 5.927765312194825,
596
+ "epoch": 1.669545192861255,
597
+ "grad_norm": 0.4460087716579437,
598
+ "learning_rate": 1.347932440302854e-05,
599
+ "loss": 5.7661,
600
+ "mean_token_accuracy": 0.25973255425691605,
601
+ "num_tokens": 2714388.0,
602
+ "step": 2900
603
+ },
604
+ {
605
+ "entropy": 6.097678365707398,
606
+ "epoch": 1.6983304548071387,
607
+ "grad_norm": 0.7125752568244934,
608
+ "learning_rate": 1.3362842166569598e-05,
609
+ "loss": 5.9284,
610
+ "mean_token_accuracy": 0.23995368272066117,
611
+ "num_tokens": 2761465.0,
612
+ "step": 2950
613
+ },
614
+ {
615
+ "entropy": 5.986212658882141,
616
+ "epoch": 1.7271157167530224,
617
+ "grad_norm": 1.5405049324035645,
618
+ "learning_rate": 1.3246359930110659e-05,
619
+ "loss": 5.8194,
620
+ "mean_token_accuracy": 0.25333445996046067,
621
+ "num_tokens": 2808066.0,
622
+ "step": 3000
623
+ },
624
+ {
625
+ "entropy": 5.7968806195259095,
626
+ "epoch": 1.7559009786989062,
627
+ "grad_norm": 0.4532749652862549,
628
+ "learning_rate": 1.312987769365172e-05,
629
+ "loss": 5.6344,
630
+ "mean_token_accuracy": 0.2782411390542984,
631
+ "num_tokens": 2851822.0,
632
+ "step": 3050
633
+ },
634
+ {
635
+ "entropy": 5.973708114624023,
636
+ "epoch": 1.7846862406447899,
637
+ "grad_norm": 1.4795438051223755,
638
+ "learning_rate": 1.3013395457192778e-05,
639
+ "loss": 5.8104,
640
+ "mean_token_accuracy": 0.25441971331834795,
641
+ "num_tokens": 2897737.0,
642
+ "step": 3100
643
+ },
644
+ {
645
+ "entropy": 5.70733567237854,
646
+ "epoch": 1.8134715025906736,
647
+ "grad_norm": 0.6216577887535095,
648
+ "learning_rate": 1.2896913220733839e-05,
649
+ "loss": 5.5523,
650
+ "mean_token_accuracy": 0.28787180870771406,
651
+ "num_tokens": 2939511.0,
652
+ "step": 3150
653
+ },
654
+ {
655
+ "entropy": 5.96826630115509,
656
+ "epoch": 1.8422567645365573,
657
+ "grad_norm": 0.9246350526809692,
658
+ "learning_rate": 1.2780430984274898e-05,
659
+ "loss": 5.8057,
660
+ "mean_token_accuracy": 0.25464902341365814,
661
+ "num_tokens": 2986368.0,
662
+ "step": 3200
663
+ },
664
+ {
665
+ "entropy": 5.950662693977356,
666
+ "epoch": 1.871042026482441,
667
+ "grad_norm": 0.8141199946403503,
668
+ "learning_rate": 1.266394874781596e-05,
669
+ "loss": 5.7886,
670
+ "mean_token_accuracy": 0.25830793648958206,
671
+ "num_tokens": 3031770.0,
672
+ "step": 3250
673
+ },
674
+ {
675
+ "entropy": 6.00512773513794,
676
+ "epoch": 1.8998272884283247,
677
+ "grad_norm": 0.4913998246192932,
678
+ "learning_rate": 1.2547466511357018e-05,
679
+ "loss": 5.838,
680
+ "mean_token_accuracy": 0.2512077575922012,
681
+ "num_tokens": 3078322.0,
682
+ "step": 3300
683
+ },
684
+ {
685
+ "entropy": 6.090880632400513,
686
+ "epoch": 1.9286125503742084,
687
+ "grad_norm": 0.9893012046813965,
688
+ "learning_rate": 1.243098427489808e-05,
689
+ "loss": 5.9264,
690
+ "mean_token_accuracy": 0.2391783133149147,
691
+ "num_tokens": 3125572.0,
692
+ "step": 3350
693
+ },
694
+ {
695
+ "entropy": 5.949693293571472,
696
+ "epoch": 1.9573978123200921,
697
+ "grad_norm": 0.5794200301170349,
698
+ "learning_rate": 1.231450203843914e-05,
699
+ "loss": 5.7861,
700
+ "mean_token_accuracy": 0.2568664598464966,
701
+ "num_tokens": 3171974.0,
702
+ "step": 3400
703
+ },
704
+ {
705
+ "entropy": 6.03591317653656,
706
+ "epoch": 1.9861830742659758,
707
+ "grad_norm": 0.8525373339653015,
708
+ "learning_rate": 1.21980198019802e-05,
709
+ "loss": 5.8741,
710
+ "mean_token_accuracy": 0.24642003327608109,
711
+ "num_tokens": 3219624.0,
712
+ "step": 3450
713
+ },
714
+ {
715
+ "epoch": 2.0,
716
+ "eval_entropy": 6.272298685416648,
717
+ "eval_loss": 6.12472677230835,
718
+ "eval_mean_token_accuracy": 0.21168697409091458,
719
+ "eval_model_preparation_time": 0.0036,
720
+ "eval_num_tokens": 3239438.0,
721
+ "eval_runtime": 76.2536,
722
+ "eval_samples_per_second": 5.692,
723
+ "eval_steps_per_second": 2.846,
724
+ "step": 3474
725
+ },
726
+ {
727
+ "entropy": 5.914763498306274,
728
+ "epoch": 2.0149683362118593,
729
+ "grad_norm": 0.5479806661605835,
730
+ "learning_rate": 1.208153756552126e-05,
731
+ "loss": 5.7559,
732
+ "mean_token_accuracy": 0.2624077323079109,
733
+ "num_tokens": 3263994.0,
734
+ "step": 3500
735
+ },
736
+ {
737
+ "entropy": 6.033470869064331,
738
+ "epoch": 2.043753598157743,
739
+ "grad_norm": 1.7186369895935059,
740
+ "learning_rate": 1.1965055329062319e-05,
741
+ "loss": 5.8677,
742
+ "mean_token_accuracy": 0.24745646148920059,
743
+ "num_tokens": 3311182.0,
744
+ "step": 3550
745
+ },
746
+ {
747
+ "entropy": 5.962404427528381,
748
+ "epoch": 2.0725388601036268,
749
+ "grad_norm": 0.9068580269813538,
750
+ "learning_rate": 1.184857309260338e-05,
751
+ "loss": 5.8038,
752
+ "mean_token_accuracy": 0.25500513821840287,
753
+ "num_tokens": 3358036.0,
754
+ "step": 3600
755
+ },
756
+ {
757
+ "entropy": 5.995727968215943,
758
+ "epoch": 2.1013241220495105,
759
+ "grad_norm": 2.044490337371826,
760
+ "learning_rate": 1.1732090856144438e-05,
761
+ "loss": 5.8333,
762
+ "mean_token_accuracy": 0.2514388278126717,
763
+ "num_tokens": 3404058.0,
764
+ "step": 3650
765
+ },
766
+ {
767
+ "entropy": 5.981345901489258,
768
+ "epoch": 2.130109383995394,
769
+ "grad_norm": 0.5262818336486816,
770
+ "learning_rate": 1.1615608619685499e-05,
771
+ "loss": 5.8205,
772
+ "mean_token_accuracy": 0.2523340278863907,
773
+ "num_tokens": 3449834.0,
774
+ "step": 3700
775
+ },
776
+ {
777
+ "entropy": 5.848710675239563,
778
+ "epoch": 2.158894645941278,
779
+ "grad_norm": 0.726718544960022,
780
+ "learning_rate": 1.149912638322656e-05,
781
+ "loss": 5.6891,
782
+ "mean_token_accuracy": 0.2697497832775116,
783
+ "num_tokens": 3494740.0,
784
+ "step": 3750
785
+ },
786
+ {
787
+ "entropy": 5.964878315925598,
788
+ "epoch": 2.1876799078871616,
789
+ "grad_norm": 0.6147393584251404,
790
+ "learning_rate": 1.1382644146767618e-05,
791
+ "loss": 5.8029,
792
+ "mean_token_accuracy": 0.2553535890579224,
793
+ "num_tokens": 3541342.0,
794
+ "step": 3800
795
+ },
796
+ {
797
+ "entropy": 6.045858116149902,
798
+ "epoch": 2.2164651698330453,
799
+ "grad_norm": 0.8283621072769165,
800
+ "learning_rate": 1.1266161910308679e-05,
801
+ "loss": 5.8802,
802
+ "mean_token_accuracy": 0.24544916599988936,
803
+ "num_tokens": 3588995.0,
804
+ "step": 3850
805
+ },
806
+ {
807
+ "entropy": 5.909895505905151,
808
+ "epoch": 2.245250431778929,
809
+ "grad_norm": 0.9912867546081543,
810
+ "learning_rate": 1.1149679673849738e-05,
811
+ "loss": 5.7481,
812
+ "mean_token_accuracy": 0.2620398569107056,
813
+ "num_tokens": 3634252.0,
814
+ "step": 3900
815
+ },
816
+ {
817
+ "entropy": 5.9534005498886104,
818
+ "epoch": 2.2740356937248127,
819
+ "grad_norm": 1.2012401819229126,
820
+ "learning_rate": 1.1033197437390799e-05,
821
+ "loss": 5.788,
822
+ "mean_token_accuracy": 0.25642816990613937,
823
+ "num_tokens": 3681197.0,
824
+ "step": 3950
825
+ },
826
+ {
827
+ "entropy": 6.155718851089477,
828
+ "epoch": 2.3028209556706964,
829
+ "grad_norm": 1.4272509813308716,
830
+ "learning_rate": 1.0916715200931857e-05,
831
+ "loss": 5.9842,
832
+ "mean_token_accuracy": 0.23176315426826477,
833
+ "num_tokens": 3729955.0,
834
+ "step": 4000
835
+ },
836
+ {
837
+ "entropy": 6.004842009544372,
838
+ "epoch": 2.33160621761658,
839
+ "grad_norm": 1.1919596195220947,
840
+ "learning_rate": 1.0800232964472918e-05,
841
+ "loss": 5.8332,
842
+ "mean_token_accuracy": 0.25039500594139097,
843
+ "num_tokens": 3777043.0,
844
+ "step": 4050
845
+ },
846
+ {
847
+ "entropy": 6.045269584655761,
848
+ "epoch": 2.360391479562464,
849
+ "grad_norm": 0.6200748085975647,
850
+ "learning_rate": 1.068375072801398e-05,
851
+ "loss": 5.8641,
852
+ "mean_token_accuracy": 0.2466951721906662,
853
+ "num_tokens": 3824067.0,
854
+ "step": 4100
855
+ },
856
+ {
857
+ "entropy": 6.105137758255005,
858
+ "epoch": 2.3891767415083476,
859
+ "grad_norm": 1.0185531377792358,
860
+ "learning_rate": 1.0567268491555038e-05,
861
+ "loss": 5.9181,
862
+ "mean_token_accuracy": 0.24000227689743042,
863
+ "num_tokens": 3872769.0,
864
+ "step": 4150
865
+ },
866
+ {
867
+ "entropy": 6.013391451835632,
868
+ "epoch": 2.4179620034542313,
869
+ "grad_norm": 0.6188511848449707,
870
+ "learning_rate": 1.04507862550961e-05,
871
+ "loss": 5.8286,
872
+ "mean_token_accuracy": 0.25189226895570754,
873
+ "num_tokens": 3919379.0,
874
+ "step": 4200
875
+ },
876
+ {
877
+ "entropy": 5.972923498153687,
878
+ "epoch": 2.446747265400115,
879
+ "grad_norm": 0.7165982127189636,
880
+ "learning_rate": 1.0334304018637157e-05,
881
+ "loss": 5.7908,
882
+ "mean_token_accuracy": 0.2567197346687317,
883
+ "num_tokens": 3965593.0,
884
+ "step": 4250
885
+ },
886
+ {
887
+ "entropy": 6.0378124713897705,
888
+ "epoch": 2.4755325273459987,
889
+ "grad_norm": 0.5278330445289612,
890
+ "learning_rate": 1.021782178217822e-05,
891
+ "loss": 5.8559,
892
+ "mean_token_accuracy": 0.2484271454811096,
893
+ "num_tokens": 4012300.0,
894
+ "step": 4300
895
+ },
896
+ {
897
+ "entropy": 5.984496111869812,
898
+ "epoch": 2.5043177892918824,
899
+ "grad_norm": 0.8995006680488586,
900
+ "learning_rate": 1.0101339545719278e-05,
901
+ "loss": 5.8092,
902
+ "mean_token_accuracy": 0.253717774450779,
903
+ "num_tokens": 4059323.0,
904
+ "step": 4350
905
+ },
906
+ {
907
+ "entropy": 6.124767150878906,
908
+ "epoch": 2.533103051237766,
909
+ "grad_norm": 1.3810409307479858,
910
+ "learning_rate": 9.984857309260339e-06,
911
+ "loss": 5.9468,
912
+ "mean_token_accuracy": 0.23715158700942993,
913
+ "num_tokens": 4107616.0,
914
+ "step": 4400
915
+ },
916
+ {
917
+ "entropy": 5.8810745000839235,
918
+ "epoch": 2.56188831318365,
919
+ "grad_norm": 0.8794332146644592,
920
+ "learning_rate": 9.868375072801398e-06,
921
+ "loss": 5.7089,
922
+ "mean_token_accuracy": 0.2662400561571121,
923
+ "num_tokens": 4152400.0,
924
+ "step": 4450
925
+ },
926
+ {
927
+ "entropy": 6.108017959594727,
928
+ "epoch": 2.5906735751295336,
929
+ "grad_norm": 0.5132983922958374,
930
+ "learning_rate": 9.751892836342458e-06,
931
+ "loss": 5.9346,
932
+ "mean_token_accuracy": 0.23871887892484664,
933
+ "num_tokens": 4200994.0,
934
+ "step": 4500
935
+ },
936
+ {
937
+ "entropy": 5.985005149841308,
938
+ "epoch": 2.6194588370754173,
939
+ "grad_norm": 0.6561470031738281,
940
+ "learning_rate": 9.635410599883519e-06,
941
+ "loss": 5.8111,
942
+ "mean_token_accuracy": 0.25315980523824694,
943
+ "num_tokens": 4247548.0,
944
+ "step": 4550
945
+ },
946
+ {
947
+ "entropy": 6.050709452629089,
948
+ "epoch": 2.648244099021301,
949
+ "grad_norm": 0.8790570497512817,
950
+ "learning_rate": 9.51892836342458e-06,
951
+ "loss": 5.8789,
952
+ "mean_token_accuracy": 0.2440834751725197,
953
+ "num_tokens": 4295250.0,
954
+ "step": 4600
955
+ },
956
+ {
957
+ "entropy": 6.007251596450805,
958
+ "epoch": 2.6770293609671847,
959
+ "grad_norm": 0.6728562116622925,
960
+ "learning_rate": 9.402446126965639e-06,
961
+ "loss": 5.8338,
962
+ "mean_token_accuracy": 0.2509264424443245,
963
+ "num_tokens": 4341599.0,
964
+ "step": 4650
965
+ },
966
+ {
967
+ "entropy": 5.966628184318543,
968
+ "epoch": 2.7058146229130684,
969
+ "grad_norm": 0.5815795063972473,
970
+ "learning_rate": 9.285963890506699e-06,
971
+ "loss": 5.7961,
972
+ "mean_token_accuracy": 0.2559360232949257,
973
+ "num_tokens": 4388673.0,
974
+ "step": 4700
975
+ },
976
+ {
977
+ "entropy": 5.7972593069076535,
978
+ "epoch": 2.734599884858952,
979
+ "grad_norm": 1.0610334873199463,
980
+ "learning_rate": 9.169481654047758e-06,
981
+ "loss": 5.6318,
982
+ "mean_token_accuracy": 0.27574603259563446,
983
+ "num_tokens": 4432959.0,
984
+ "step": 4750
985
+ },
986
+ {
987
+ "entropy": 5.984181261062622,
988
+ "epoch": 2.763385146804836,
989
+ "grad_norm": 2.1847357749938965,
990
+ "learning_rate": 9.052999417588819e-06,
991
+ "loss": 5.8153,
992
+ "mean_token_accuracy": 0.2533784031867981,
993
+ "num_tokens": 4479190.0,
994
+ "step": 4800
995
+ },
996
+ {
997
+ "entropy": 5.959725599288941,
998
+ "epoch": 2.7921704087507195,
999
+ "grad_norm": 0.5671709179878235,
1000
+ "learning_rate": 8.936517181129878e-06,
1001
+ "loss": 5.7912,
1002
+ "mean_token_accuracy": 0.2556650054454803,
1003
+ "num_tokens": 4525674.0,
1004
+ "step": 4850
1005
+ },
1006
+ {
1007
+ "entropy": 5.814929313659668,
1008
+ "epoch": 2.8209556706966032,
1009
+ "grad_norm": 0.9447108507156372,
1010
+ "learning_rate": 8.820034944670938e-06,
1011
+ "loss": 5.6478,
1012
+ "mean_token_accuracy": 0.27417868226766584,
1013
+ "num_tokens": 4570379.0,
1014
+ "step": 4900
1015
+ },
1016
+ {
1017
+ "entropy": 5.96754421710968,
1018
+ "epoch": 2.849740932642487,
1019
+ "grad_norm": 2.009676218032837,
1020
+ "learning_rate": 8.703552708211999e-06,
1021
+ "loss": 5.795,
1022
+ "mean_token_accuracy": 0.2556305864453316,
1023
+ "num_tokens": 4617184.0,
1024
+ "step": 4950
1025
+ },
1026
+ {
1027
+ "entropy": 6.008112049102783,
1028
+ "epoch": 2.8785261945883707,
1029
+ "grad_norm": 1.1977978944778442,
1030
+ "learning_rate": 8.587070471753058e-06,
1031
+ "loss": 5.8416,
1032
+ "mean_token_accuracy": 0.2494604030251503,
1033
+ "num_tokens": 4664180.0,
1034
+ "step": 5000
1035
+ },
1036
+ {
1037
+ "entropy": 5.832320966720581,
1038
+ "epoch": 2.9073114565342544,
1039
+ "grad_norm": 0.4845636785030365,
1040
+ "learning_rate": 8.470588235294118e-06,
1041
+ "loss": 5.6672,
1042
+ "mean_token_accuracy": 0.27187123566865923,
1043
+ "num_tokens": 4708377.0,
1044
+ "step": 5050
1045
+ },
1046
+ {
1047
+ "entropy": 5.84138514995575,
1048
+ "epoch": 2.936096718480138,
1049
+ "grad_norm": 0.8487229943275452,
1050
+ "learning_rate": 8.354105998835179e-06,
1051
+ "loss": 5.6769,
1052
+ "mean_token_accuracy": 0.26995211571455,
1053
+ "num_tokens": 4753587.0,
1054
+ "step": 5100
1055
+ },
1056
+ {
1057
+ "entropy": 6.016681690216064,
1058
+ "epoch": 2.964881980426022,
1059
+ "grad_norm": 0.9554332494735718,
1060
+ "learning_rate": 8.237623762376238e-06,
1061
+ "loss": 5.8479,
1062
+ "mean_token_accuracy": 0.24785644590854644,
1063
+ "num_tokens": 4800508.0,
1064
+ "step": 5150
1065
+ },
1066
+ {
1067
+ "entropy": 6.103472499847412,
1068
+ "epoch": 2.9936672423719055,
1069
+ "grad_norm": 0.6602863669395447,
1070
+ "learning_rate": 8.121141525917298e-06,
1071
+ "loss": 5.9305,
1072
+ "mean_token_accuracy": 0.23794592499732972,
1073
+ "num_tokens": 4849415.0,
1074
+ "step": 5200
1075
+ },
1076
+ {
1077
+ "epoch": 3.0,
1078
+ "eval_entropy": 6.254081044878278,
1079
+ "eval_loss": 6.0980024337768555,
1080
+ "eval_mean_token_accuracy": 0.21401402258103894,
1081
+ "eval_model_preparation_time": 0.0036,
1082
+ "eval_num_tokens": 4859157.0,
1083
+ "eval_runtime": 75.9443,
1084
+ "eval_samples_per_second": 5.715,
1085
+ "eval_steps_per_second": 2.857,
1086
+ "step": 5211
1087
+ }
1088
+ ],
1089
+ "logging_steps": 50,
1090
+ "max_steps": 8685,
1091
+ "num_input_tokens_seen": 0,
1092
+ "num_train_epochs": 5,
1093
+ "save_steps": 500,
1094
+ "stateful_callbacks": {
1095
+ "TrainerControl": {
1096
+ "args": {
1097
+ "should_epoch_stop": false,
1098
+ "should_evaluate": false,
1099
+ "should_log": false,
1100
+ "should_save": true,
1101
+ "should_training_stop": false
1102
+ },
1103
+ "attributes": {}
1104
+ }
1105
+ },
1106
+ "total_flos": 6.795785692717056e+16,
1107
+ "train_batch_size": 2,
1108
+ "trial_name": null,
1109
+ "trial_params": null
1110
+ }
checkpoint-5211/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:130d33149272782bd60306263c371036419926142b8999aad7806359168f8484
3
+ size 6225