naveel10 committed on
Commit
bf7ec54
·
verified ·
1 Parent(s): 0b82d0d

Upload trained llava adapter

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. README.md +58 -0
  2. adapter_config.json +42 -0
  3. adapter_model.safetensors +3 -0
  4. added_tokens.json +4 -0
  5. chat_template.jinja +1 -0
  6. checkpoint-100/README.md +209 -0
  7. checkpoint-100/adapter_config.json +42 -0
  8. checkpoint-100/adapter_model.safetensors +3 -0
  9. checkpoint-100/added_tokens.json +4 -0
  10. checkpoint-100/chat_template.jinja +1 -0
  11. checkpoint-100/optimizer.pt +3 -0
  12. checkpoint-100/rng_state.pth +3 -0
  13. checkpoint-100/scheduler.pt +3 -0
  14. checkpoint-100/special_tokens_map.json +32 -0
  15. checkpoint-100/tokenizer.json +0 -0
  16. checkpoint-100/tokenizer.model +3 -0
  17. checkpoint-100/tokenizer_config.json +70 -0
  18. checkpoint-100/trainer_state.json +134 -0
  19. checkpoint-100/training_args.bin +3 -0
  20. checkpoint-1000/README.md +209 -0
  21. checkpoint-1000/adapter_config.json +42 -0
  22. checkpoint-1000/adapter_model.safetensors +3 -0
  23. checkpoint-1000/added_tokens.json +4 -0
  24. checkpoint-1000/chat_template.jinja +1 -0
  25. checkpoint-1000/optimizer.pt +3 -0
  26. checkpoint-1000/rng_state.pth +3 -0
  27. checkpoint-1000/scheduler.pt +3 -0
  28. checkpoint-1000/special_tokens_map.json +32 -0
  29. checkpoint-1000/tokenizer.json +0 -0
  30. checkpoint-1000/tokenizer.model +3 -0
  31. checkpoint-1000/tokenizer_config.json +70 -0
  32. checkpoint-1000/trainer_state.json +1034 -0
  33. checkpoint-1000/training_args.bin +3 -0
  34. checkpoint-1100/README.md +209 -0
  35. checkpoint-1100/adapter_config.json +42 -0
  36. checkpoint-1100/adapter_model.safetensors +3 -0
  37. checkpoint-1100/added_tokens.json +4 -0
  38. checkpoint-1100/chat_template.jinja +1 -0
  39. checkpoint-1100/optimizer.pt +3 -0
  40. checkpoint-1100/rng_state.pth +3 -0
  41. checkpoint-1100/scheduler.pt +3 -0
  42. checkpoint-1100/special_tokens_map.json +32 -0
  43. checkpoint-1100/tokenizer.json +0 -0
  44. checkpoint-1100/tokenizer.model +3 -0
  45. checkpoint-1100/tokenizer_config.json +70 -0
  46. checkpoint-1100/trainer_state.json +1134 -0
  47. checkpoint-1100/training_args.bin +3 -0
  48. checkpoint-1200/README.md +209 -0
  49. checkpoint-1200/adapter_config.json +42 -0
  50. checkpoint-1200/adapter_model.safetensors +3 -0
README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: llava-hf/LLaVA-NeXT-Video-7B-32K-hf
3
+ library_name: transformers
4
+ model_name: outputs
5
+ tags:
6
+ - generated_from_trainer
7
+ - sft
8
+ - trl
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for outputs
13
+
14
+ This model is a fine-tuned version of [llava-hf/LLaVA-NeXT-Video-7B-32K-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-32K-hf).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="naveel10/outputs", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+
31
+
32
+
33
+ This model was trained with SFT.
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.20.0
38
+ - Transformers: 4.55.0.dev0
39
+ - Pytorch: 2.7.1
40
+ - Datasets: 4.0.0
41
+ - Tokenizers: 0.21.0
42
+
43
+ ## Citations
44
+
45
+
46
+
47
+ Cite TRL as:
48
+
49
+ ```bibtex
50
+ @misc{vonwerra2022trl,
51
+ title = {{TRL: Transformer Reinforcement Learning}},
52
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
53
+ year = 2020,
54
+ journal = {GitHub repository},
55
+ publisher = {GitHub},
56
+ howpublished = {\url{https://github.com/huggingface/trl}}
57
+ }
58
+ ```
adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "llava-hf/LLaVA-NeXT-Video-7B-32K-hf",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 16,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.1,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 64,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "k_proj",
29
+ "gate_proj",
30
+ "q_proj",
31
+ "v_proj",
32
+ "o_proj",
33
+ "down_proj",
34
+ "up_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:effb42eb182ecaa32b718222ae9b3b4cad7480d24684146685400bac0d318466
3
+ size 708929184
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<image>": 32001,
3
+ "<video>": 32000
4
+ }
chat_template.jinja ADDED
@@ -0,0 +1 @@
 
 
1
+ {{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}
checkpoint-100/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: llava-hf/LLaVA-NeXT-Video-7B-32K-hf
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:llava-hf/LLaVA-NeXT-Video-7B-32K-hf
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.17.0
checkpoint-100/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "llava-hf/LLaVA-NeXT-Video-7B-32K-hf",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 16,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.1,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 64,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "k_proj",
29
+ "gate_proj",
30
+ "q_proj",
31
+ "v_proj",
32
+ "o_proj",
33
+ "down_proj",
34
+ "up_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
checkpoint-100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9082cc5e789ddfb2d39ad623ec7efc79e00262d50865487408e086777eefc9ce
3
+ size 708929184
checkpoint-100/added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<image>": 32001,
3
+ "<video>": 32000
4
+ }
checkpoint-100/chat_template.jinja ADDED
@@ -0,0 +1 @@
 
 
1
+ {{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}
checkpoint-100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d412ce6c2b031e318391b38e6d901ceb0fee0734bd1dfabb25a2192cf3591631
3
+ size 1342556643
checkpoint-100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b974bebeb6e110d6ab402eed18410f550702a81b3d3313fa6f4eba2a45499e1e
3
+ size 14645
checkpoint-100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:192d7c959b223c828929b7093b5dc97e8c616906f98786b9baf7694f6e8805db
3
+ size 1465
checkpoint-100/special_tokens_map.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "image_token": "<image>",
17
+ "pad_token": {
18
+ "content": "</s>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "unk_token": {
25
+ "content": "<unk>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ "video_token": "<video>"
32
+ }
checkpoint-100/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-100/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
checkpoint-100/tokenizer_config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "<video>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "32001": {
39
+ "content": "<image>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ }
46
+ },
47
+ "additional_special_tokens": [],
48
+ "bos_token": "<s>",
49
+ "clean_up_tokenization_spaces": false,
50
+ "eos_token": "</s>",
51
+ "extra_special_tokens": {
52
+ "image_token": "<image>",
53
+ "video_token": "<video>"
54
+ },
55
+ "image_token": "<image>",
56
+ "legacy": true,
57
+ "max_length": null,
58
+ "model_max_length": 1000000000000000019884624838656,
59
+ "pad_to_multiple_of": null,
60
+ "pad_token": "</s>",
61
+ "pad_token_type_id": 0,
62
+ "padding_side": "left",
63
+ "processor_class": "LlavaNextVideoProcessor",
64
+ "sp_model_kwargs": {},
65
+ "spaces_between_special_tokens": false,
66
+ "tokenizer_class": "LlamaTokenizer",
67
+ "unk_token": "<unk>",
68
+ "use_default_system_prompt": false,
69
+ "video_token": "<video>"
70
+ }
checkpoint-100/trainer_state.json ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 100,
3
+ "best_metric": 0.2852214574813843,
4
+ "best_model_checkpoint": "outputs/checkpoint-100",
5
+ "epoch": 0.37453183520599254,
6
+ "eval_steps": 100,
7
+ "global_step": 100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.03745318352059925,
14
+ "grad_norm": 2.6471238136291504,
15
+ "learning_rate": 1.9865168539325844e-05,
16
+ "loss": 3.9924,
17
+ "mean_token_accuracy": 0.3569513201713562,
18
+ "num_tokens": 1110.0,
19
+ "step": 10
20
+ },
21
+ {
22
+ "epoch": 0.0749063670411985,
23
+ "grad_norm": 2.9193994998931885,
24
+ "learning_rate": 1.9715355805243446e-05,
25
+ "loss": 2.5013,
26
+ "mean_token_accuracy": 0.5000596195459366,
27
+ "num_tokens": 2220.0,
28
+ "step": 20
29
+ },
30
+ {
31
+ "epoch": 0.11235955056179775,
32
+ "grad_norm": 1.090408444404602,
33
+ "learning_rate": 1.956554307116105e-05,
34
+ "loss": 1.2021,
35
+ "mean_token_accuracy": 0.7512393116950988,
36
+ "num_tokens": 3329.0,
37
+ "step": 30
38
+ },
39
+ {
40
+ "epoch": 0.149812734082397,
41
+ "grad_norm": 1.412244200706482,
42
+ "learning_rate": 1.9415730337078652e-05,
43
+ "loss": 0.6237,
44
+ "mean_token_accuracy": 0.8658290803432465,
45
+ "num_tokens": 4437.0,
46
+ "step": 40
47
+ },
48
+ {
49
+ "epoch": 0.18726591760299627,
50
+ "grad_norm": 0.9774134755134583,
51
+ "learning_rate": 1.9265917602996254e-05,
52
+ "loss": 0.4264,
53
+ "mean_token_accuracy": 0.9105254471302032,
54
+ "num_tokens": 5553.0,
55
+ "step": 50
56
+ },
57
+ {
58
+ "epoch": 0.2247191011235955,
59
+ "grad_norm": 0.6166325211524963,
60
+ "learning_rate": 1.9116104868913857e-05,
61
+ "loss": 0.3806,
62
+ "mean_token_accuracy": 0.8969066739082336,
63
+ "num_tokens": 6660.0,
64
+ "step": 60
65
+ },
66
+ {
67
+ "epoch": 0.26217228464419473,
68
+ "grad_norm": 0.5820680856704712,
69
+ "learning_rate": 1.8966292134831463e-05,
70
+ "loss": 0.3484,
71
+ "mean_token_accuracy": 0.8972096979618073,
72
+ "num_tokens": 7769.0,
73
+ "step": 70
74
+ },
75
+ {
76
+ "epoch": 0.299625468164794,
77
+ "grad_norm": 0.31422552466392517,
78
+ "learning_rate": 1.8816479400749066e-05,
79
+ "loss": 0.3196,
80
+ "mean_token_accuracy": 0.898263669013977,
81
+ "num_tokens": 8880.0,
82
+ "step": 80
83
+ },
84
+ {
85
+ "epoch": 0.33707865168539325,
86
+ "grad_norm": 0.5825852155685425,
87
+ "learning_rate": 1.866666666666667e-05,
88
+ "loss": 0.2965,
89
+ "mean_token_accuracy": 0.9046498596668243,
90
+ "num_tokens": 9992.0,
91
+ "step": 90
92
+ },
93
+ {
94
+ "epoch": 0.37453183520599254,
95
+ "grad_norm": 0.38430944085121155,
96
+ "learning_rate": 1.851685393258427e-05,
97
+ "loss": 0.2839,
98
+ "mean_token_accuracy": 0.9051393151283265,
99
+ "num_tokens": 11098.0,
100
+ "step": 100
101
+ },
102
+ {
103
+ "epoch": 0.37453183520599254,
104
+ "eval_loss": 0.2852214574813843,
105
+ "eval_mean_token_accuracy": 0.9032742083072662,
106
+ "eval_num_tokens": 11098.0,
107
+ "eval_runtime": 2.4929,
108
+ "eval_samples_per_second": 11.633,
109
+ "eval_steps_per_second": 1.605,
110
+ "step": 100
111
+ }
112
+ ],
113
+ "logging_steps": 10,
114
+ "max_steps": 1335,
115
+ "num_input_tokens_seen": 0,
116
+ "num_train_epochs": 5,
117
+ "save_steps": 100,
118
+ "stateful_callbacks": {
119
+ "TrainerControl": {
120
+ "args": {
121
+ "should_epoch_stop": false,
122
+ "should_evaluate": false,
123
+ "should_log": false,
124
+ "should_save": true,
125
+ "should_training_stop": false
126
+ },
127
+ "attributes": {}
128
+ }
129
+ },
130
+ "total_flos": 509244940800000.0,
131
+ "train_batch_size": 2,
132
+ "trial_name": null,
133
+ "trial_params": null
134
+ }
checkpoint-100/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61b67df3186500a8ab4543ab551c594a458a667a2ee9e16f00656ee5598e0026
3
+ size 6097
checkpoint-1000/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: llava-hf/LLaVA-NeXT-Video-7B-32K-hf
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:llava-hf/LLaVA-NeXT-Video-7B-32K-hf
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.17.0
checkpoint-1000/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "llava-hf/LLaVA-NeXT-Video-7B-32K-hf",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 16,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.1,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 64,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "k_proj",
29
+ "gate_proj",
30
+ "q_proj",
31
+ "v_proj",
32
+ "o_proj",
33
+ "down_proj",
34
+ "up_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
checkpoint-1000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7525b4a1adc1042458c7b7e65b635726be9e8cbd1bbc0dab90c9caf69d297be2
3
+ size 708929184
checkpoint-1000/added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<image>": 32001,
3
+ "<video>": 32000
4
+ }
checkpoint-1000/chat_template.jinja ADDED
@@ -0,0 +1 @@
 
 
1
+ {{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf581e47aa484fa7b92ca981b3ec8ae3eb910dc743b2ba5f286294f5af4e5ade
3
+ size 1342556643
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:457d29ff3dfd45544e591c4e74e2bf137a284f9dcccde864b16ec182894687b0
3
+ size 14645
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d0a34e230f35077c2e3341462a2e9d1003c701e131ac8ee9d11f55cefbd00d9
3
+ size 1465
checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "image_token": "<image>",
17
+ "pad_token": {
18
+ "content": "</s>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "unk_token": {
25
+ "content": "<unk>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ "video_token": "<video>"
32
+ }
checkpoint-1000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1000/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "<video>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "32001": {
39
+ "content": "<image>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ }
46
+ },
47
+ "additional_special_tokens": [],
48
+ "bos_token": "<s>",
49
+ "clean_up_tokenization_spaces": false,
50
+ "eos_token": "</s>",
51
+ "extra_special_tokens": {
52
+ "image_token": "<image>",
53
+ "video_token": "<video>"
54
+ },
55
+ "image_token": "<image>",
56
+ "legacy": true,
57
+ "max_length": null,
58
+ "model_max_length": 1000000000000000019884624838656,
59
+ "pad_to_multiple_of": null,
60
+ "pad_token": "</s>",
61
+ "pad_token_type_id": 0,
62
+ "padding_side": "left",
63
+ "processor_class": "LlavaNextVideoProcessor",
64
+ "sp_model_kwargs": {},
65
+ "spaces_between_special_tokens": false,
66
+ "tokenizer_class": "LlamaTokenizer",
67
+ "unk_token": "<unk>",
68
+ "use_default_system_prompt": false,
69
+ "video_token": "<video>"
70
+ }
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,1034 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1000,
3
+ "best_metric": 0.1942463368177414,
4
+ "best_model_checkpoint": "outputs/checkpoint-1000",
5
+ "epoch": 3.7453183520599254,
6
+ "eval_steps": 100,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.03745318352059925,
14
+ "grad_norm": 2.6471238136291504,
15
+ "learning_rate": 1.9865168539325844e-05,
16
+ "loss": 3.9924,
17
+ "mean_token_accuracy": 0.3569513201713562,
18
+ "num_tokens": 1110.0,
19
+ "step": 10
20
+ },
21
+ {
22
+ "epoch": 0.0749063670411985,
23
+ "grad_norm": 2.9193994998931885,
24
+ "learning_rate": 1.9715355805243446e-05,
25
+ "loss": 2.5013,
26
+ "mean_token_accuracy": 0.5000596195459366,
27
+ "num_tokens": 2220.0,
28
+ "step": 20
29
+ },
30
+ {
31
+ "epoch": 0.11235955056179775,
32
+ "grad_norm": 1.090408444404602,
33
+ "learning_rate": 1.956554307116105e-05,
34
+ "loss": 1.2021,
35
+ "mean_token_accuracy": 0.7512393116950988,
36
+ "num_tokens": 3329.0,
37
+ "step": 30
38
+ },
39
+ {
40
+ "epoch": 0.149812734082397,
41
+ "grad_norm": 1.412244200706482,
42
+ "learning_rate": 1.9415730337078652e-05,
43
+ "loss": 0.6237,
44
+ "mean_token_accuracy": 0.8658290803432465,
45
+ "num_tokens": 4437.0,
46
+ "step": 40
47
+ },
48
+ {
49
+ "epoch": 0.18726591760299627,
50
+ "grad_norm": 0.9774134755134583,
51
+ "learning_rate": 1.9265917602996254e-05,
52
+ "loss": 0.4264,
53
+ "mean_token_accuracy": 0.9105254471302032,
54
+ "num_tokens": 5553.0,
55
+ "step": 50
56
+ },
57
+ {
58
+ "epoch": 0.2247191011235955,
59
+ "grad_norm": 0.6166325211524963,
60
+ "learning_rate": 1.9116104868913857e-05,
61
+ "loss": 0.3806,
62
+ "mean_token_accuracy": 0.8969066739082336,
63
+ "num_tokens": 6660.0,
64
+ "step": 60
65
+ },
66
+ {
67
+ "epoch": 0.26217228464419473,
68
+ "grad_norm": 0.5820680856704712,
69
+ "learning_rate": 1.8966292134831463e-05,
70
+ "loss": 0.3484,
71
+ "mean_token_accuracy": 0.8972096979618073,
72
+ "num_tokens": 7769.0,
73
+ "step": 70
74
+ },
75
+ {
76
+ "epoch": 0.299625468164794,
77
+ "grad_norm": 0.31422552466392517,
78
+ "learning_rate": 1.8816479400749066e-05,
79
+ "loss": 0.3196,
80
+ "mean_token_accuracy": 0.898263669013977,
81
+ "num_tokens": 8880.0,
82
+ "step": 80
83
+ },
84
+ {
85
+ "epoch": 0.33707865168539325,
86
+ "grad_norm": 0.5825852155685425,
87
+ "learning_rate": 1.866666666666667e-05,
88
+ "loss": 0.2965,
89
+ "mean_token_accuracy": 0.9046498596668243,
90
+ "num_tokens": 9992.0,
91
+ "step": 90
92
+ },
93
+ {
94
+ "epoch": 0.37453183520599254,
95
+ "grad_norm": 0.38430944085121155,
96
+ "learning_rate": 1.851685393258427e-05,
97
+ "loss": 0.2839,
98
+ "mean_token_accuracy": 0.9051393151283265,
99
+ "num_tokens": 11098.0,
100
+ "step": 100
101
+ },
102
+ {
103
+ "epoch": 0.37453183520599254,
104
+ "eval_loss": 0.2852214574813843,
105
+ "eval_mean_token_accuracy": 0.9032742083072662,
106
+ "eval_num_tokens": 11098.0,
107
+ "eval_runtime": 2.4929,
108
+ "eval_samples_per_second": 11.633,
109
+ "eval_steps_per_second": 1.605,
110
+ "step": 100
111
+ },
112
+ {
113
+ "epoch": 0.41198501872659177,
114
+ "grad_norm": 0.312187522649765,
115
+ "learning_rate": 1.8367041198501874e-05,
116
+ "loss": 0.2752,
117
+ "mean_token_accuracy": 0.9036725044250489,
118
+ "num_tokens": 12207.0,
119
+ "step": 110
120
+ },
121
+ {
122
+ "epoch": 0.449438202247191,
123
+ "grad_norm": 0.3875369131565094,
124
+ "learning_rate": 1.8217228464419477e-05,
125
+ "loss": 0.2659,
126
+ "mean_token_accuracy": 0.9044483065605163,
127
+ "num_tokens": 13316.0,
128
+ "step": 120
129
+ },
130
+ {
131
+ "epoch": 0.4868913857677903,
132
+ "grad_norm": 0.6050882339477539,
133
+ "learning_rate": 1.8067415730337083e-05,
134
+ "loss": 0.258,
135
+ "mean_token_accuracy": 0.9100114285945893,
136
+ "num_tokens": 14426.0,
137
+ "step": 130
138
+ },
139
+ {
140
+ "epoch": 0.5243445692883895,
141
+ "grad_norm": 0.5287177562713623,
142
+ "learning_rate": 1.7917602996254685e-05,
143
+ "loss": 0.2455,
144
+ "mean_token_accuracy": 0.9222747385501862,
145
+ "num_tokens": 15539.0,
146
+ "step": 140
147
+ },
148
+ {
149
+ "epoch": 0.5617977528089888,
150
+ "grad_norm": 0.5224889516830444,
151
+ "learning_rate": 1.7767790262172285e-05,
152
+ "loss": 0.2368,
153
+ "mean_token_accuracy": 0.9263923704624176,
154
+ "num_tokens": 16647.0,
155
+ "step": 150
156
+ },
157
+ {
158
+ "epoch": 0.599250936329588,
159
+ "grad_norm": 0.4501174986362457,
160
+ "learning_rate": 1.7617977528089887e-05,
161
+ "loss": 0.2299,
162
+ "mean_token_accuracy": 0.9313735246658326,
163
+ "num_tokens": 17760.0,
164
+ "step": 160
165
+ },
166
+ {
167
+ "epoch": 0.6367041198501873,
168
+ "grad_norm": 0.43853962421417236,
169
+ "learning_rate": 1.746816479400749e-05,
170
+ "loss": 0.2222,
171
+ "mean_token_accuracy": 0.9402973234653473,
172
+ "num_tokens": 18869.0,
173
+ "step": 170
174
+ },
175
+ {
176
+ "epoch": 0.6741573033707865,
177
+ "grad_norm": 0.31908461451530457,
178
+ "learning_rate": 1.7318352059925093e-05,
179
+ "loss": 0.2117,
180
+ "mean_token_accuracy": 0.9458102405071258,
181
+ "num_tokens": 19977.0,
182
+ "step": 180
183
+ },
184
+ {
185
+ "epoch": 0.7116104868913857,
186
+ "grad_norm": 0.2825154662132263,
187
+ "learning_rate": 1.71685393258427e-05,
188
+ "loss": 0.2094,
189
+ "mean_token_accuracy": 0.938564246892929,
190
+ "num_tokens": 21088.0,
191
+ "step": 190
192
+ },
193
+ {
194
+ "epoch": 0.7490636704119851,
195
+ "grad_norm": 0.2939445674419403,
196
+ "learning_rate": 1.70187265917603e-05,
197
+ "loss": 0.2051,
198
+ "mean_token_accuracy": 0.9392363965511322,
199
+ "num_tokens": 22195.0,
200
+ "step": 200
201
+ },
202
+ {
203
+ "epoch": 0.7490636704119851,
204
+ "eval_loss": 0.2029074728488922,
205
+ "eval_mean_token_accuracy": 0.9482556581497192,
206
+ "eval_num_tokens": 22195.0,
207
+ "eval_runtime": 2.4927,
208
+ "eval_samples_per_second": 11.634,
209
+ "eval_steps_per_second": 1.605,
210
+ "step": 200
211
+ },
212
+ {
213
+ "epoch": 0.7865168539325843,
214
+ "grad_norm": 0.18860529363155365,
215
+ "learning_rate": 1.6868913857677904e-05,
216
+ "loss": 0.1991,
217
+ "mean_token_accuracy": 0.9431917011737824,
218
+ "num_tokens": 23306.0,
219
+ "step": 210
220
+ },
221
+ {
222
+ "epoch": 0.8239700374531835,
223
+ "grad_norm": 0.22066630423069,
224
+ "learning_rate": 1.6719101123595507e-05,
225
+ "loss": 0.2001,
226
+ "mean_token_accuracy": 0.9430991888046265,
227
+ "num_tokens": 24417.0,
228
+ "step": 220
229
+ },
230
+ {
231
+ "epoch": 0.8614232209737828,
232
+ "grad_norm": 0.17636580765247345,
233
+ "learning_rate": 1.656928838951311e-05,
234
+ "loss": 0.1968,
235
+ "mean_token_accuracy": 0.9465341567993164,
236
+ "num_tokens": 25522.0,
237
+ "step": 230
238
+ },
239
+ {
240
+ "epoch": 0.898876404494382,
241
+ "grad_norm": 0.14720433950424194,
242
+ "learning_rate": 1.6419475655430712e-05,
243
+ "loss": 0.1982,
244
+ "mean_token_accuracy": 0.9413078784942627,
245
+ "num_tokens": 26632.0,
246
+ "step": 240
247
+ },
248
+ {
249
+ "epoch": 0.9363295880149812,
250
+ "grad_norm": 0.11868773400783539,
251
+ "learning_rate": 1.626966292134832e-05,
252
+ "loss": 0.1955,
253
+ "mean_token_accuracy": 0.9468274474143982,
254
+ "num_tokens": 27742.0,
255
+ "step": 250
256
+ },
257
+ {
258
+ "epoch": 0.9737827715355806,
259
+ "grad_norm": 0.14357531070709229,
260
+ "learning_rate": 1.611985018726592e-05,
261
+ "loss": 0.1943,
262
+ "mean_token_accuracy": 0.9457764148712158,
263
+ "num_tokens": 28851.0,
264
+ "step": 260
265
+ },
266
+ {
267
+ "epoch": 1.0112359550561798,
268
+ "grad_norm": 0.21999526023864746,
269
+ "learning_rate": 1.5970037453183524e-05,
270
+ "loss": 0.1966,
271
+ "mean_token_accuracy": 0.9422410607337952,
272
+ "num_tokens": 29905.0,
273
+ "step": 270
274
+ },
275
+ {
276
+ "epoch": 1.048689138576779,
277
+ "grad_norm": 0.10375912487506866,
278
+ "learning_rate": 1.5820224719101127e-05,
279
+ "loss": 0.1935,
280
+ "mean_token_accuracy": 0.9441024959087372,
281
+ "num_tokens": 31016.0,
282
+ "step": 280
283
+ },
284
+ {
285
+ "epoch": 1.0861423220973783,
286
+ "grad_norm": 0.2760375738143921,
287
+ "learning_rate": 1.5670411985018726e-05,
288
+ "loss": 0.1947,
289
+ "mean_token_accuracy": 0.9411046266555786,
290
+ "num_tokens": 32124.0,
291
+ "step": 290
292
+ },
293
+ {
294
+ "epoch": 1.1235955056179776,
295
+ "grad_norm": 0.2127188742160797,
296
+ "learning_rate": 1.552059925093633e-05,
297
+ "loss": 0.1943,
298
+ "mean_token_accuracy": 0.9514408648014069,
299
+ "num_tokens": 33234.0,
300
+ "step": 300
301
+ },
302
+ {
303
+ "epoch": 1.1235955056179776,
304
+ "eval_loss": 0.19752565026283264,
305
+ "eval_mean_token_accuracy": 0.9434169828891754,
306
+ "eval_num_tokens": 33234.0,
307
+ "eval_runtime": 2.4936,
308
+ "eval_samples_per_second": 11.63,
309
+ "eval_steps_per_second": 1.604,
310
+ "step": 300
311
+ },
312
+ {
313
+ "epoch": 1.1610486891385767,
314
+ "grad_norm": 0.10267303138971329,
315
+ "learning_rate": 1.537078651685393e-05,
316
+ "loss": 0.1919,
317
+ "mean_token_accuracy": 0.9476523637771607,
318
+ "num_tokens": 34342.0,
319
+ "step": 310
320
+ },
321
+ {
322
+ "epoch": 1.198501872659176,
323
+ "grad_norm": 0.23754256963729858,
324
+ "learning_rate": 1.5220973782771537e-05,
325
+ "loss": 0.1927,
326
+ "mean_token_accuracy": 0.9512970626354218,
327
+ "num_tokens": 35450.0,
328
+ "step": 320
329
+ },
330
+ {
331
+ "epoch": 1.2359550561797752,
332
+ "grad_norm": 0.09665194898843765,
333
+ "learning_rate": 1.507116104868914e-05,
334
+ "loss": 0.1911,
335
+ "mean_token_accuracy": 0.9494555711746215,
336
+ "num_tokens": 36558.0,
337
+ "step": 330
338
+ },
339
+ {
340
+ "epoch": 1.2734082397003745,
341
+ "grad_norm": 0.11535191535949707,
342
+ "learning_rate": 1.4921348314606743e-05,
343
+ "loss": 0.1915,
344
+ "mean_token_accuracy": 0.9493873059749603,
345
+ "num_tokens": 37664.0,
346
+ "step": 340
347
+ },
348
+ {
349
+ "epoch": 1.3108614232209739,
350
+ "grad_norm": 0.11016673594713211,
351
+ "learning_rate": 1.4771535580524345e-05,
352
+ "loss": 0.1931,
353
+ "mean_token_accuracy": 0.9440759301185608,
354
+ "num_tokens": 38774.0,
355
+ "step": 350
356
+ },
357
+ {
358
+ "epoch": 1.348314606741573,
359
+ "grad_norm": 0.24848656356334686,
360
+ "learning_rate": 1.4621722846441948e-05,
361
+ "loss": 0.1925,
362
+ "mean_token_accuracy": 0.9458104014396668,
363
+ "num_tokens": 39883.0,
364
+ "step": 360
365
+ },
366
+ {
367
+ "epoch": 1.3857677902621723,
368
+ "grad_norm": 0.1400669664144516,
369
+ "learning_rate": 1.447191011235955e-05,
370
+ "loss": 0.1936,
371
+ "mean_token_accuracy": 0.9457758069038391,
372
+ "num_tokens": 40990.0,
373
+ "step": 370
374
+ },
375
+ {
376
+ "epoch": 1.4232209737827715,
377
+ "grad_norm": 0.1753997802734375,
378
+ "learning_rate": 1.4322097378277155e-05,
379
+ "loss": 0.1921,
380
+ "mean_token_accuracy": 0.9477294445037842,
381
+ "num_tokens": 42099.0,
382
+ "step": 380
383
+ },
384
+ {
385
+ "epoch": 1.4606741573033708,
386
+ "grad_norm": 0.11102133989334106,
387
+ "learning_rate": 1.4172284644194758e-05,
388
+ "loss": 0.1904,
389
+ "mean_token_accuracy": 0.9459109544754029,
390
+ "num_tokens": 43209.0,
391
+ "step": 390
392
+ },
393
+ {
394
+ "epoch": 1.4981273408239701,
395
+ "grad_norm": 0.12153730541467667,
396
+ "learning_rate": 1.402247191011236e-05,
397
+ "loss": 0.1908,
398
+ "mean_token_accuracy": 0.9495814442634583,
399
+ "num_tokens": 44320.0,
400
+ "step": 400
401
+ },
402
+ {
403
+ "epoch": 1.4981273408239701,
404
+ "eval_loss": 0.19722126424312592,
405
+ "eval_mean_token_accuracy": 0.9426012635231018,
406
+ "eval_num_tokens": 44320.0,
407
+ "eval_runtime": 2.4929,
408
+ "eval_samples_per_second": 11.633,
409
+ "eval_steps_per_second": 1.605,
410
+ "step": 400
411
+ },
412
+ {
413
+ "epoch": 1.5355805243445693,
414
+ "grad_norm": 0.13351161777973175,
415
+ "learning_rate": 1.3872659176029963e-05,
416
+ "loss": 0.1906,
417
+ "mean_token_accuracy": 0.9469557940959931,
418
+ "num_tokens": 45434.0,
419
+ "step": 410
420
+ },
421
+ {
422
+ "epoch": 1.5730337078651684,
423
+ "grad_norm": 0.1454717516899109,
424
+ "learning_rate": 1.3722846441947566e-05,
425
+ "loss": 0.1906,
426
+ "mean_token_accuracy": 0.9468878388404847,
427
+ "num_tokens": 46547.0,
428
+ "step": 420
429
+ },
430
+ {
431
+ "epoch": 1.6104868913857677,
432
+ "grad_norm": 0.21453846991062164,
433
+ "learning_rate": 1.3573033707865169e-05,
434
+ "loss": 0.1919,
435
+ "mean_token_accuracy": 0.9432088494300842,
436
+ "num_tokens": 47659.0,
437
+ "step": 430
438
+ },
439
+ {
440
+ "epoch": 1.647940074906367,
441
+ "grad_norm": 0.1796715408563614,
442
+ "learning_rate": 1.3423220973782773e-05,
443
+ "loss": 0.1924,
444
+ "mean_token_accuracy": 0.9468723952770233,
445
+ "num_tokens": 48771.0,
446
+ "step": 440
447
+ },
448
+ {
449
+ "epoch": 1.6853932584269664,
450
+ "grad_norm": 0.18729475140571594,
451
+ "learning_rate": 1.3273408239700376e-05,
452
+ "loss": 0.1918,
453
+ "mean_token_accuracy": 0.9448257863521576,
454
+ "num_tokens": 49878.0,
455
+ "step": 450
456
+ },
457
+ {
458
+ "epoch": 1.7228464419475655,
459
+ "grad_norm": 0.20833182334899902,
460
+ "learning_rate": 1.3123595505617978e-05,
461
+ "loss": 0.19,
462
+ "mean_token_accuracy": 0.9460108697414398,
463
+ "num_tokens": 50990.0,
464
+ "step": 460
465
+ },
466
+ {
467
+ "epoch": 1.7602996254681647,
468
+ "grad_norm": 0.09931682050228119,
469
+ "learning_rate": 1.2973782771535581e-05,
470
+ "loss": 0.1898,
471
+ "mean_token_accuracy": 0.9476029396057128,
472
+ "num_tokens": 52099.0,
473
+ "step": 470
474
+ },
475
+ {
476
+ "epoch": 1.797752808988764,
477
+ "grad_norm": 0.2103966772556305,
478
+ "learning_rate": 1.2823970037453184e-05,
479
+ "loss": 0.1932,
480
+ "mean_token_accuracy": 0.9421666264533997,
481
+ "num_tokens": 53208.0,
482
+ "step": 480
483
+ },
484
+ {
485
+ "epoch": 1.8352059925093633,
486
+ "grad_norm": 0.07852394878864288,
487
+ "learning_rate": 1.2674157303370786e-05,
488
+ "loss": 0.1915,
489
+ "mean_token_accuracy": 0.9441100597381592,
490
+ "num_tokens": 54319.0,
491
+ "step": 490
492
+ },
493
+ {
494
+ "epoch": 1.8726591760299627,
495
+ "grad_norm": 0.09249723702669144,
496
+ "learning_rate": 1.2524344569288391e-05,
497
+ "loss": 0.19,
498
+ "mean_token_accuracy": 0.9484964370727539,
499
+ "num_tokens": 55426.0,
500
+ "step": 500
501
+ },
502
+ {
503
+ "epoch": 1.8726591760299627,
504
+ "eval_loss": 0.19536998867988586,
505
+ "eval_mean_token_accuracy": 0.945041760802269,
506
+ "eval_num_tokens": 55426.0,
507
+ "eval_runtime": 2.499,
508
+ "eval_samples_per_second": 11.605,
509
+ "eval_steps_per_second": 1.601,
510
+ "step": 500
511
+ },
512
+ {
513
+ "epoch": 1.9101123595505618,
514
+ "grad_norm": 0.07890783250331879,
515
+ "learning_rate": 1.2374531835205994e-05,
516
+ "loss": 0.1909,
517
+ "mean_token_accuracy": 0.9412918269634247,
518
+ "num_tokens": 56536.0,
519
+ "step": 510
520
+ },
521
+ {
522
+ "epoch": 1.947565543071161,
523
+ "grad_norm": 0.2816140353679657,
524
+ "learning_rate": 1.2224719101123596e-05,
525
+ "loss": 0.1923,
526
+ "mean_token_accuracy": 0.9376968383789063,
527
+ "num_tokens": 57648.0,
528
+ "step": 520
529
+ },
530
+ {
531
+ "epoch": 1.9850187265917603,
532
+ "grad_norm": 0.08590656518936157,
533
+ "learning_rate": 1.2074906367041199e-05,
534
+ "loss": 0.1904,
535
+ "mean_token_accuracy": 0.9467627465724945,
536
+ "num_tokens": 58758.0,
537
+ "step": 530
538
+ },
539
+ {
540
+ "epoch": 2.0224719101123596,
541
+ "grad_norm": 0.1013297438621521,
542
+ "learning_rate": 1.1925093632958802e-05,
543
+ "loss": 0.1903,
544
+ "mean_token_accuracy": 0.9485378265380859,
545
+ "num_tokens": 59811.0,
546
+ "step": 540
547
+ },
548
+ {
549
+ "epoch": 2.059925093632959,
550
+ "grad_norm": 0.07267877459526062,
551
+ "learning_rate": 1.1775280898876404e-05,
552
+ "loss": 0.1897,
553
+ "mean_token_accuracy": 0.9469048321247101,
554
+ "num_tokens": 60923.0,
555
+ "step": 550
556
+ },
557
+ {
558
+ "epoch": 2.097378277153558,
559
+ "grad_norm": 0.08559578657150269,
560
+ "learning_rate": 1.1625468164794009e-05,
561
+ "loss": 0.1913,
562
+ "mean_token_accuracy": 0.943006819486618,
563
+ "num_tokens": 62031.0,
564
+ "step": 560
565
+ },
566
+ {
567
+ "epoch": 2.134831460674157,
568
+ "grad_norm": 0.2162655144929886,
569
+ "learning_rate": 1.1475655430711611e-05,
570
+ "loss": 0.188,
571
+ "mean_token_accuracy": 0.9467701494693757,
572
+ "num_tokens": 63140.0,
573
+ "step": 570
574
+ },
575
+ {
576
+ "epoch": 2.1722846441947565,
577
+ "grad_norm": 0.08606795221567154,
578
+ "learning_rate": 1.1325842696629214e-05,
579
+ "loss": 0.189,
580
+ "mean_token_accuracy": 0.9439931452274323,
581
+ "num_tokens": 64249.0,
582
+ "step": 580
583
+ },
584
+ {
585
+ "epoch": 2.209737827715356,
586
+ "grad_norm": 0.2562474310398102,
587
+ "learning_rate": 1.1176029962546817e-05,
588
+ "loss": 0.1926,
589
+ "mean_token_accuracy": 0.9457504689693451,
590
+ "num_tokens": 65356.0,
591
+ "step": 590
592
+ },
593
+ {
594
+ "epoch": 2.247191011235955,
595
+ "grad_norm": 0.0770883709192276,
596
+ "learning_rate": 1.102621722846442e-05,
597
+ "loss": 0.1895,
598
+ "mean_token_accuracy": 0.9449774503707886,
599
+ "num_tokens": 66466.0,
600
+ "step": 600
601
+ },
602
+ {
603
+ "epoch": 2.247191011235955,
604
+ "eval_loss": 0.19561618566513062,
605
+ "eval_mean_token_accuracy": 0.9387146234512329,
606
+ "eval_num_tokens": 66466.0,
607
+ "eval_runtime": 2.498,
608
+ "eval_samples_per_second": 11.609,
609
+ "eval_steps_per_second": 1.601,
610
+ "step": 600
611
+ },
612
+ {
613
+ "epoch": 2.284644194756554,
614
+ "grad_norm": 0.08070901036262512,
615
+ "learning_rate": 1.0876404494382022e-05,
616
+ "loss": 0.1905,
617
+ "mean_token_accuracy": 0.9429994106292725,
618
+ "num_tokens": 67574.0,
619
+ "step": 610
620
+ },
621
+ {
622
+ "epoch": 2.3220973782771535,
623
+ "grad_norm": 0.08464006334543228,
624
+ "learning_rate": 1.0726591760299627e-05,
625
+ "loss": 0.1892,
626
+ "mean_token_accuracy": 0.9467701494693757,
627
+ "num_tokens": 68683.0,
628
+ "step": 620
629
+ },
630
+ {
631
+ "epoch": 2.359550561797753,
632
+ "grad_norm": 0.21751029789447784,
633
+ "learning_rate": 1.057677902621723e-05,
634
+ "loss": 0.1898,
635
+ "mean_token_accuracy": 0.9449776113033295,
636
+ "num_tokens": 69793.0,
637
+ "step": 630
638
+ },
639
+ {
640
+ "epoch": 2.397003745318352,
641
+ "grad_norm": 0.06470742076635361,
642
+ "learning_rate": 1.0426966292134832e-05,
643
+ "loss": 0.1917,
644
+ "mean_token_accuracy": 0.9440505981445313,
645
+ "num_tokens": 70903.0,
646
+ "step": 640
647
+ },
648
+ {
649
+ "epoch": 2.4344569288389515,
650
+ "grad_norm": 0.07308146357536316,
651
+ "learning_rate": 1.0277153558052435e-05,
652
+ "loss": 0.1911,
653
+ "mean_token_accuracy": 0.9440014958381653,
654
+ "num_tokens": 72012.0,
655
+ "step": 650
656
+ },
657
+ {
658
+ "epoch": 2.4719101123595504,
659
+ "grad_norm": 0.17352479696273804,
660
+ "learning_rate": 1.0127340823970037e-05,
661
+ "loss": 0.1895,
662
+ "mean_token_accuracy": 0.9467605650424957,
663
+ "num_tokens": 73121.0,
664
+ "step": 660
665
+ },
666
+ {
667
+ "epoch": 2.5093632958801497,
668
+ "grad_norm": 0.06704717874526978,
669
+ "learning_rate": 9.977528089887642e-06,
670
+ "loss": 0.1882,
671
+ "mean_token_accuracy": 0.9442952454090119,
672
+ "num_tokens": 74236.0,
673
+ "step": 670
674
+ },
675
+ {
676
+ "epoch": 2.546816479400749,
677
+ "grad_norm": 0.15437676012516022,
678
+ "learning_rate": 9.827715355805244e-06,
679
+ "loss": 0.1936,
680
+ "mean_token_accuracy": 0.9383869290351867,
681
+ "num_tokens": 75344.0,
682
+ "step": 680
683
+ },
684
+ {
685
+ "epoch": 2.5842696629213484,
686
+ "grad_norm": 0.17893658578395844,
687
+ "learning_rate": 9.677902621722847e-06,
688
+ "loss": 0.1895,
689
+ "mean_token_accuracy": 0.9460550546646118,
690
+ "num_tokens": 76458.0,
691
+ "step": 690
692
+ },
693
+ {
694
+ "epoch": 2.6217228464419478,
695
+ "grad_norm": 0.2342701405286789,
696
+ "learning_rate": 9.52808988764045e-06,
697
+ "loss": 0.1901,
698
+ "mean_token_accuracy": 0.9467441976070404,
699
+ "num_tokens": 77568.0,
700
+ "step": 700
701
+ },
702
+ {
703
+ "epoch": 2.6217228464419478,
704
+ "eval_loss": 0.195254847407341,
705
+ "eval_mean_token_accuracy": 0.9420265555381775,
706
+ "eval_num_tokens": 77568.0,
707
+ "eval_runtime": 2.5044,
708
+ "eval_samples_per_second": 11.58,
709
+ "eval_steps_per_second": 1.597,
710
+ "step": 700
711
+ },
712
+ {
713
+ "epoch": 2.6591760299625467,
714
+ "grad_norm": 0.07597153633832932,
715
+ "learning_rate": 9.378277153558052e-06,
716
+ "loss": 0.1902,
717
+ "mean_token_accuracy": 0.9421320199966431,
718
+ "num_tokens": 78677.0,
719
+ "step": 710
720
+ },
721
+ {
722
+ "epoch": 2.696629213483146,
723
+ "grad_norm": 0.08350855857133865,
724
+ "learning_rate": 9.228464419475655e-06,
725
+ "loss": 0.19,
726
+ "mean_token_accuracy": 0.9431924819946289,
727
+ "num_tokens": 79788.0,
728
+ "step": 720
729
+ },
730
+ {
731
+ "epoch": 2.7340823970037453,
732
+ "grad_norm": 0.08954475820064545,
733
+ "learning_rate": 9.07865168539326e-06,
734
+ "loss": 0.1893,
735
+ "mean_token_accuracy": 0.945793092250824,
736
+ "num_tokens": 80896.0,
737
+ "step": 730
738
+ },
739
+ {
740
+ "epoch": 2.7715355805243447,
741
+ "grad_norm": 0.07194171845912933,
742
+ "learning_rate": 8.928838951310862e-06,
743
+ "loss": 0.1895,
744
+ "mean_token_accuracy": 0.9441764891147614,
745
+ "num_tokens": 82007.0,
746
+ "step": 740
747
+ },
748
+ {
749
+ "epoch": 2.808988764044944,
750
+ "grad_norm": 0.07494191080331802,
751
+ "learning_rate": 8.779026217228465e-06,
752
+ "loss": 0.1909,
753
+ "mean_token_accuracy": 0.9465851247310638,
754
+ "num_tokens": 83113.0,
755
+ "step": 750
756
+ },
757
+ {
758
+ "epoch": 2.846441947565543,
759
+ "grad_norm": 0.20635780692100525,
760
+ "learning_rate": 8.629213483146068e-06,
761
+ "loss": 0.1894,
762
+ "mean_token_accuracy": 0.9479296028614044,
763
+ "num_tokens": 84226.0,
764
+ "step": 760
765
+ },
766
+ {
767
+ "epoch": 2.8838951310861423,
768
+ "grad_norm": 0.18754708766937256,
769
+ "learning_rate": 8.47940074906367e-06,
770
+ "loss": 0.1898,
771
+ "mean_token_accuracy": 0.9503811955451965,
772
+ "num_tokens": 85333.0,
773
+ "step": 770
774
+ },
775
+ {
776
+ "epoch": 2.9213483146067416,
777
+ "grad_norm": 0.08187804371118546,
778
+ "learning_rate": 8.329588014981273e-06,
779
+ "loss": 0.1903,
780
+ "mean_token_accuracy": 0.943065983057022,
781
+ "num_tokens": 86441.0,
782
+ "step": 780
783
+ },
784
+ {
785
+ "epoch": 2.958801498127341,
786
+ "grad_norm": 0.18230247497558594,
787
+ "learning_rate": 8.179775280898877e-06,
788
+ "loss": 0.19,
789
+ "mean_token_accuracy": 0.9431342482566833,
790
+ "num_tokens": 87551.0,
791
+ "step": 790
792
+ },
793
+ {
794
+ "epoch": 2.9962546816479403,
795
+ "grad_norm": 0.19725900888442993,
796
+ "learning_rate": 8.02996254681648e-06,
797
+ "loss": 0.1903,
798
+ "mean_token_accuracy": 0.9396148085594177,
799
+ "num_tokens": 88663.0,
800
+ "step": 800
801
+ },
802
+ {
803
+ "epoch": 2.9962546816479403,
804
+ "eval_loss": 0.1949780136346817,
805
+ "eval_mean_token_accuracy": 0.9431759715080261,
806
+ "eval_num_tokens": 88663.0,
807
+ "eval_runtime": 2.487,
808
+ "eval_samples_per_second": 11.661,
809
+ "eval_steps_per_second": 1.608,
810
+ "step": 800
811
+ },
812
+ {
813
+ "epoch": 3.033707865168539,
814
+ "grad_norm": 0.07648079097270966,
815
+ "learning_rate": 7.880149812734083e-06,
816
+ "loss": 0.1903,
817
+ "mean_token_accuracy": 0.9457852184772492,
818
+ "num_tokens": 89716.0,
819
+ "step": 810
820
+ },
821
+ {
822
+ "epoch": 3.0711610486891385,
823
+ "grad_norm": 0.06528846174478531,
824
+ "learning_rate": 7.730337078651686e-06,
825
+ "loss": 0.1892,
826
+ "mean_token_accuracy": 0.9422413766384125,
827
+ "num_tokens": 90827.0,
828
+ "step": 820
829
+ },
830
+ {
831
+ "epoch": 3.108614232209738,
832
+ "grad_norm": 0.16895975172519684,
833
+ "learning_rate": 7.580524344569289e-06,
834
+ "loss": 0.1883,
835
+ "mean_token_accuracy": 0.9512388408184052,
836
+ "num_tokens": 91934.0,
837
+ "step": 830
838
+ },
839
+ {
840
+ "epoch": 3.146067415730337,
841
+ "grad_norm": 0.14639434218406677,
842
+ "learning_rate": 7.430711610486892e-06,
843
+ "loss": 0.1881,
844
+ "mean_token_accuracy": 0.9494467735290527,
845
+ "num_tokens": 93041.0,
846
+ "step": 840
847
+ },
848
+ {
849
+ "epoch": 3.1835205992509366,
850
+ "grad_norm": 0.08737971633672714,
851
+ "learning_rate": 7.280898876404495e-06,
852
+ "loss": 0.1901,
853
+ "mean_token_accuracy": 0.9406749486923218,
854
+ "num_tokens": 94156.0,
855
+ "step": 850
856
+ },
857
+ {
858
+ "epoch": 3.2209737827715355,
859
+ "grad_norm": 0.09445718675851822,
860
+ "learning_rate": 7.131086142322098e-06,
861
+ "loss": 0.1884,
862
+ "mean_token_accuracy": 0.9475689589977264,
863
+ "num_tokens": 95264.0,
864
+ "step": 860
865
+ },
866
+ {
867
+ "epoch": 3.258426966292135,
868
+ "grad_norm": 0.09516163170337677,
869
+ "learning_rate": 6.981273408239701e-06,
870
+ "loss": 0.1889,
871
+ "mean_token_accuracy": 0.9449783861637115,
872
+ "num_tokens": 96374.0,
873
+ "step": 870
874
+ },
875
+ {
876
+ "epoch": 3.295880149812734,
877
+ "grad_norm": 0.08031495660543442,
878
+ "learning_rate": 6.831460674157304e-06,
879
+ "loss": 0.1898,
880
+ "mean_token_accuracy": 0.9413152992725372,
881
+ "num_tokens": 97485.0,
882
+ "step": 880
883
+ },
884
+ {
885
+ "epoch": 3.3333333333333335,
886
+ "grad_norm": 0.20592394471168518,
887
+ "learning_rate": 6.681647940074907e-06,
888
+ "loss": 0.1902,
889
+ "mean_token_accuracy": 0.9421661615371704,
890
+ "num_tokens": 98595.0,
891
+ "step": 890
892
+ },
893
+ {
894
+ "epoch": 3.370786516853933,
895
+ "grad_norm": 0.19433368742465973,
896
+ "learning_rate": 6.53183520599251e-06,
897
+ "loss": 0.1882,
898
+ "mean_token_accuracy": 0.9559795200824738,
899
+ "num_tokens": 99705.0,
900
+ "step": 900
901
+ },
902
+ {
903
+ "epoch": 3.370786516853933,
904
+ "eval_loss": 0.19456735253334045,
905
+ "eval_mean_token_accuracy": 0.9444670528173447,
906
+ "eval_num_tokens": 99705.0,
907
+ "eval_runtime": 2.4936,
908
+ "eval_samples_per_second": 11.63,
909
+ "eval_steps_per_second": 1.604,
910
+ "step": 900
911
+ },
912
+ {
913
+ "epoch": 3.4082397003745317,
914
+ "grad_norm": 0.09646886587142944,
915
+ "learning_rate": 6.382022471910113e-06,
916
+ "loss": 0.1888,
917
+ "mean_token_accuracy": 0.9403546094894409,
918
+ "num_tokens": 100815.0,
919
+ "step": 910
920
+ },
921
+ {
922
+ "epoch": 3.445692883895131,
923
+ "grad_norm": 0.23463116586208344,
924
+ "learning_rate": 6.232209737827716e-06,
925
+ "loss": 0.1898,
926
+ "mean_token_accuracy": 0.9475183129310608,
927
+ "num_tokens": 101920.0,
928
+ "step": 920
929
+ },
930
+ {
931
+ "epoch": 3.4831460674157304,
932
+ "grad_norm": 0.1442176103591919,
933
+ "learning_rate": 6.0823970037453185e-06,
934
+ "loss": 0.1903,
935
+ "mean_token_accuracy": 0.945886081457138,
936
+ "num_tokens": 103029.0,
937
+ "step": 930
938
+ },
939
+ {
940
+ "epoch": 3.5205992509363297,
941
+ "grad_norm": 0.18915638327598572,
942
+ "learning_rate": 5.932584269662922e-06,
943
+ "loss": 0.1928,
944
+ "mean_token_accuracy": 0.9423760592937469,
945
+ "num_tokens": 104142.0,
946
+ "step": 940
947
+ },
948
+ {
949
+ "epoch": 3.558052434456929,
950
+ "grad_norm": 0.07233936339616776,
951
+ "learning_rate": 5.782771535580525e-06,
952
+ "loss": 0.1895,
953
+ "mean_token_accuracy": 0.9485957443714141,
954
+ "num_tokens": 105252.0,
955
+ "step": 950
956
+ },
957
+ {
958
+ "epoch": 3.595505617977528,
959
+ "grad_norm": 0.06686032563447952,
960
+ "learning_rate": 5.6329588014981274e-06,
961
+ "loss": 0.1897,
962
+ "mean_token_accuracy": 0.9438907384872437,
963
+ "num_tokens": 106360.0,
964
+ "step": 960
965
+ },
966
+ {
967
+ "epoch": 3.6329588014981273,
968
+ "grad_norm": 0.10568553954362869,
969
+ "learning_rate": 5.483146067415731e-06,
970
+ "loss": 0.1872,
971
+ "mean_token_accuracy": 0.9469630539417266,
972
+ "num_tokens": 107474.0,
973
+ "step": 970
974
+ },
975
+ {
976
+ "epoch": 3.6704119850187267,
977
+ "grad_norm": 0.17929266393184662,
978
+ "learning_rate": 5.333333333333334e-06,
979
+ "loss": 0.1897,
980
+ "mean_token_accuracy": 0.9439324319362641,
981
+ "num_tokens": 108583.0,
982
+ "step": 980
983
+ },
984
+ {
985
+ "epoch": 3.7078651685393256,
986
+ "grad_norm": 0.10999605804681778,
987
+ "learning_rate": 5.183520599250936e-06,
988
+ "loss": 0.191,
989
+ "mean_token_accuracy": 0.9439007878303528,
990
+ "num_tokens": 109690.0,
991
+ "step": 990
992
+ },
993
+ {
994
+ "epoch": 3.7453183520599254,
995
+ "grad_norm": 0.08190548419952393,
996
+ "learning_rate": 5.03370786516854e-06,
997
+ "loss": 0.1894,
998
+ "mean_token_accuracy": 0.9458610653877259,
999
+ "num_tokens": 110799.0,
1000
+ "step": 1000
1001
+ },
1002
+ {
1003
+ "epoch": 3.7453183520599254,
1004
+ "eval_loss": 0.1942463368177414,
1005
+ "eval_mean_token_accuracy": 0.9444670528173447,
1006
+ "eval_num_tokens": 110799.0,
1007
+ "eval_runtime": 2.4977,
1008
+ "eval_samples_per_second": 11.61,
1009
+ "eval_steps_per_second": 1.601,
1010
+ "step": 1000
1011
+ }
1012
+ ],
1013
+ "logging_steps": 10,
1014
+ "max_steps": 1335,
1015
+ "num_input_tokens_seen": 0,
1016
+ "num_train_epochs": 5,
1017
+ "save_steps": 100,
1018
+ "stateful_callbacks": {
1019
+ "TrainerControl": {
1020
+ "args": {
1021
+ "should_epoch_stop": false,
1022
+ "should_evaluate": false,
1023
+ "should_log": false,
1024
+ "should_save": true,
1025
+ "should_training_stop": false
1026
+ },
1027
+ "attributes": {}
1028
+ }
1029
+ },
1030
+ "total_flos": 5082447197952000.0,
1031
+ "train_batch_size": 2,
1032
+ "trial_name": null,
1033
+ "trial_params": null
1034
+ }
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61b67df3186500a8ab4543ab551c594a458a667a2ee9e16f00656ee5598e0026
3
+ size 6097
checkpoint-1100/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: llava-hf/LLaVA-NeXT-Video-7B-32K-hf
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:llava-hf/LLaVA-NeXT-Video-7B-32K-hf
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.17.0
checkpoint-1100/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "llava-hf/LLaVA-NeXT-Video-7B-32K-hf",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 16,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.1,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 64,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "k_proj",
29
+ "gate_proj",
30
+ "q_proj",
31
+ "v_proj",
32
+ "o_proj",
33
+ "down_proj",
34
+ "up_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
checkpoint-1100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:effb42eb182ecaa32b718222ae9b3b4cad7480d24684146685400bac0d318466
3
+ size 708929184
checkpoint-1100/added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<image>": 32001,
3
+ "<video>": 32000
4
+ }
checkpoint-1100/chat_template.jinja ADDED
@@ -0,0 +1 @@
 
 
1
+ {{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}
checkpoint-1100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88f9a3a51bb0d8452d1ba1b90bf04050a96f6060107ad17fc9c5a02e4ef63cd6
3
+ size 1342556643
checkpoint-1100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a662f4442b7166762a5182656a296792c03f8c794f748d01848058d7dccf28b
3
+ size 14645
checkpoint-1100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9adada56ac6d44c40edef4075a625699ca08b335f0a6f09d3cad419613cdc5f
3
+ size 1465
checkpoint-1100/special_tokens_map.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "image_token": "<image>",
17
+ "pad_token": {
18
+ "content": "</s>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "unk_token": {
25
+ "content": "<unk>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ "video_token": "<video>"
32
+ }
checkpoint-1100/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1100/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
checkpoint-1100/tokenizer_config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "<video>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "32001": {
39
+ "content": "<image>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ }
46
+ },
47
+ "additional_special_tokens": [],
48
+ "bos_token": "<s>",
49
+ "clean_up_tokenization_spaces": false,
50
+ "eos_token": "</s>",
51
+ "extra_special_tokens": {
52
+ "image_token": "<image>",
53
+ "video_token": "<video>"
54
+ },
55
+ "image_token": "<image>",
56
+ "legacy": true,
57
+ "max_length": null,
58
+ "model_max_length": 1000000000000000019884624838656,
59
+ "pad_to_multiple_of": null,
60
+ "pad_token": "</s>",
61
+ "pad_token_type_id": 0,
62
+ "padding_side": "left",
63
+ "processor_class": "LlavaNextVideoProcessor",
64
+ "sp_model_kwargs": {},
65
+ "spaces_between_special_tokens": false,
66
+ "tokenizer_class": "LlamaTokenizer",
67
+ "unk_token": "<unk>",
68
+ "use_default_system_prompt": false,
69
+ "video_token": "<video>"
70
+ }
checkpoint-1100/trainer_state.json ADDED
@@ -0,0 +1,1134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1100,
3
+ "best_metric": 0.19412046670913696,
4
+ "best_model_checkpoint": "outputs/checkpoint-1100",
5
+ "epoch": 4.119850187265918,
6
+ "eval_steps": 100,
7
+ "global_step": 1100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.03745318352059925,
14
+ "grad_norm": 2.6471238136291504,
15
+ "learning_rate": 1.9865168539325844e-05,
16
+ "loss": 3.9924,
17
+ "mean_token_accuracy": 0.3569513201713562,
18
+ "num_tokens": 1110.0,
19
+ "step": 10
20
+ },
21
+ {
22
+ "epoch": 0.0749063670411985,
23
+ "grad_norm": 2.9193994998931885,
24
+ "learning_rate": 1.9715355805243446e-05,
25
+ "loss": 2.5013,
26
+ "mean_token_accuracy": 0.5000596195459366,
27
+ "num_tokens": 2220.0,
28
+ "step": 20
29
+ },
30
+ {
31
+ "epoch": 0.11235955056179775,
32
+ "grad_norm": 1.090408444404602,
33
+ "learning_rate": 1.956554307116105e-05,
34
+ "loss": 1.2021,
35
+ "mean_token_accuracy": 0.7512393116950988,
36
+ "num_tokens": 3329.0,
37
+ "step": 30
38
+ },
39
+ {
40
+ "epoch": 0.149812734082397,
41
+ "grad_norm": 1.412244200706482,
42
+ "learning_rate": 1.9415730337078652e-05,
43
+ "loss": 0.6237,
44
+ "mean_token_accuracy": 0.8658290803432465,
45
+ "num_tokens": 4437.0,
46
+ "step": 40
47
+ },
48
+ {
49
+ "epoch": 0.18726591760299627,
50
+ "grad_norm": 0.9774134755134583,
51
+ "learning_rate": 1.9265917602996254e-05,
52
+ "loss": 0.4264,
53
+ "mean_token_accuracy": 0.9105254471302032,
54
+ "num_tokens": 5553.0,
55
+ "step": 50
56
+ },
57
+ {
58
+ "epoch": 0.2247191011235955,
59
+ "grad_norm": 0.6166325211524963,
60
+ "learning_rate": 1.9116104868913857e-05,
61
+ "loss": 0.3806,
62
+ "mean_token_accuracy": 0.8969066739082336,
63
+ "num_tokens": 6660.0,
64
+ "step": 60
65
+ },
66
+ {
67
+ "epoch": 0.26217228464419473,
68
+ "grad_norm": 0.5820680856704712,
69
+ "learning_rate": 1.8966292134831463e-05,
70
+ "loss": 0.3484,
71
+ "mean_token_accuracy": 0.8972096979618073,
72
+ "num_tokens": 7769.0,
73
+ "step": 70
74
+ },
75
+ {
76
+ "epoch": 0.299625468164794,
77
+ "grad_norm": 0.31422552466392517,
78
+ "learning_rate": 1.8816479400749066e-05,
79
+ "loss": 0.3196,
80
+ "mean_token_accuracy": 0.898263669013977,
81
+ "num_tokens": 8880.0,
82
+ "step": 80
83
+ },
84
+ {
85
+ "epoch": 0.33707865168539325,
86
+ "grad_norm": 0.5825852155685425,
87
+ "learning_rate": 1.866666666666667e-05,
88
+ "loss": 0.2965,
89
+ "mean_token_accuracy": 0.9046498596668243,
90
+ "num_tokens": 9992.0,
91
+ "step": 90
92
+ },
93
+ {
94
+ "epoch": 0.37453183520599254,
95
+ "grad_norm": 0.38430944085121155,
96
+ "learning_rate": 1.851685393258427e-05,
97
+ "loss": 0.2839,
98
+ "mean_token_accuracy": 0.9051393151283265,
99
+ "num_tokens": 11098.0,
100
+ "step": 100
101
+ },
102
+ {
103
+ "epoch": 0.37453183520599254,
104
+ "eval_loss": 0.2852214574813843,
105
+ "eval_mean_token_accuracy": 0.9032742083072662,
106
+ "eval_num_tokens": 11098.0,
107
+ "eval_runtime": 2.4929,
108
+ "eval_samples_per_second": 11.633,
109
+ "eval_steps_per_second": 1.605,
110
+ "step": 100
111
+ },
112
+ {
113
+ "epoch": 0.41198501872659177,
114
+ "grad_norm": 0.312187522649765,
115
+ "learning_rate": 1.8367041198501874e-05,
116
+ "loss": 0.2752,
117
+ "mean_token_accuracy": 0.9036725044250489,
118
+ "num_tokens": 12207.0,
119
+ "step": 110
120
+ },
121
+ {
122
+ "epoch": 0.449438202247191,
123
+ "grad_norm": 0.3875369131565094,
124
+ "learning_rate": 1.8217228464419477e-05,
125
+ "loss": 0.2659,
126
+ "mean_token_accuracy": 0.9044483065605163,
127
+ "num_tokens": 13316.0,
128
+ "step": 120
129
+ },
130
+ {
131
+ "epoch": 0.4868913857677903,
132
+ "grad_norm": 0.6050882339477539,
133
+ "learning_rate": 1.8067415730337083e-05,
134
+ "loss": 0.258,
135
+ "mean_token_accuracy": 0.9100114285945893,
136
+ "num_tokens": 14426.0,
137
+ "step": 130
138
+ },
139
+ {
140
+ "epoch": 0.5243445692883895,
141
+ "grad_norm": 0.5287177562713623,
142
+ "learning_rate": 1.7917602996254685e-05,
143
+ "loss": 0.2455,
144
+ "mean_token_accuracy": 0.9222747385501862,
145
+ "num_tokens": 15539.0,
146
+ "step": 140
147
+ },
148
+ {
149
+ "epoch": 0.5617977528089888,
150
+ "grad_norm": 0.5224889516830444,
151
+ "learning_rate": 1.7767790262172285e-05,
152
+ "loss": 0.2368,
153
+ "mean_token_accuracy": 0.9263923704624176,
154
+ "num_tokens": 16647.0,
155
+ "step": 150
156
+ },
157
+ {
158
+ "epoch": 0.599250936329588,
159
+ "grad_norm": 0.4501174986362457,
160
+ "learning_rate": 1.7617977528089887e-05,
161
+ "loss": 0.2299,
162
+ "mean_token_accuracy": 0.9313735246658326,
163
+ "num_tokens": 17760.0,
164
+ "step": 160
165
+ },
166
+ {
167
+ "epoch": 0.6367041198501873,
168
+ "grad_norm": 0.43853962421417236,
169
+ "learning_rate": 1.746816479400749e-05,
170
+ "loss": 0.2222,
171
+ "mean_token_accuracy": 0.9402973234653473,
172
+ "num_tokens": 18869.0,
173
+ "step": 170
174
+ },
175
+ {
176
+ "epoch": 0.6741573033707865,
177
+ "grad_norm": 0.31908461451530457,
178
+ "learning_rate": 1.7318352059925093e-05,
179
+ "loss": 0.2117,
180
+ "mean_token_accuracy": 0.9458102405071258,
181
+ "num_tokens": 19977.0,
182
+ "step": 180
183
+ },
184
+ {
185
+ "epoch": 0.7116104868913857,
186
+ "grad_norm": 0.2825154662132263,
187
+ "learning_rate": 1.71685393258427e-05,
188
+ "loss": 0.2094,
189
+ "mean_token_accuracy": 0.938564246892929,
190
+ "num_tokens": 21088.0,
191
+ "step": 190
192
+ },
193
+ {
194
+ "epoch": 0.7490636704119851,
195
+ "grad_norm": 0.2939445674419403,
196
+ "learning_rate": 1.70187265917603e-05,
197
+ "loss": 0.2051,
198
+ "mean_token_accuracy": 0.9392363965511322,
199
+ "num_tokens": 22195.0,
200
+ "step": 200
201
+ },
202
+ {
203
+ "epoch": 0.7490636704119851,
204
+ "eval_loss": 0.2029074728488922,
205
+ "eval_mean_token_accuracy": 0.9482556581497192,
206
+ "eval_num_tokens": 22195.0,
207
+ "eval_runtime": 2.4927,
208
+ "eval_samples_per_second": 11.634,
209
+ "eval_steps_per_second": 1.605,
210
+ "step": 200
211
+ },
212
+ {
213
+ "epoch": 0.7865168539325843,
214
+ "grad_norm": 0.18860529363155365,
215
+ "learning_rate": 1.6868913857677904e-05,
216
+ "loss": 0.1991,
217
+ "mean_token_accuracy": 0.9431917011737824,
218
+ "num_tokens": 23306.0,
219
+ "step": 210
220
+ },
221
+ {
222
+ "epoch": 0.8239700374531835,
223
+ "grad_norm": 0.22066630423069,
224
+ "learning_rate": 1.6719101123595507e-05,
225
+ "loss": 0.2001,
226
+ "mean_token_accuracy": 0.9430991888046265,
227
+ "num_tokens": 24417.0,
228
+ "step": 220
229
+ },
230
+ {
231
+ "epoch": 0.8614232209737828,
232
+ "grad_norm": 0.17636580765247345,
233
+ "learning_rate": 1.656928838951311e-05,
234
+ "loss": 0.1968,
235
+ "mean_token_accuracy": 0.9465341567993164,
236
+ "num_tokens": 25522.0,
237
+ "step": 230
238
+ },
239
+ {
240
+ "epoch": 0.898876404494382,
241
+ "grad_norm": 0.14720433950424194,
242
+ "learning_rate": 1.6419475655430712e-05,
243
+ "loss": 0.1982,
244
+ "mean_token_accuracy": 0.9413078784942627,
245
+ "num_tokens": 26632.0,
246
+ "step": 240
247
+ },
248
+ {
249
+ "epoch": 0.9363295880149812,
250
+ "grad_norm": 0.11868773400783539,
251
+ "learning_rate": 1.626966292134832e-05,
252
+ "loss": 0.1955,
253
+ "mean_token_accuracy": 0.9468274474143982,
254
+ "num_tokens": 27742.0,
255
+ "step": 250
256
+ },
257
+ {
258
+ "epoch": 0.9737827715355806,
259
+ "grad_norm": 0.14357531070709229,
260
+ "learning_rate": 1.611985018726592e-05,
261
+ "loss": 0.1943,
262
+ "mean_token_accuracy": 0.9457764148712158,
263
+ "num_tokens": 28851.0,
264
+ "step": 260
265
+ },
266
+ {
267
+ "epoch": 1.0112359550561798,
268
+ "grad_norm": 0.21999526023864746,
269
+ "learning_rate": 1.5970037453183524e-05,
270
+ "loss": 0.1966,
271
+ "mean_token_accuracy": 0.9422410607337952,
272
+ "num_tokens": 29905.0,
273
+ "step": 270
274
+ },
275
+ {
276
+ "epoch": 1.048689138576779,
277
+ "grad_norm": 0.10375912487506866,
278
+ "learning_rate": 1.5820224719101127e-05,
279
+ "loss": 0.1935,
280
+ "mean_token_accuracy": 0.9441024959087372,
281
+ "num_tokens": 31016.0,
282
+ "step": 280
283
+ },
284
+ {
285
+ "epoch": 1.0861423220973783,
286
+ "grad_norm": 0.2760375738143921,
287
+ "learning_rate": 1.5670411985018726e-05,
288
+ "loss": 0.1947,
289
+ "mean_token_accuracy": 0.9411046266555786,
290
+ "num_tokens": 32124.0,
291
+ "step": 290
292
+ },
293
+ {
294
+ "epoch": 1.1235955056179776,
295
+ "grad_norm": 0.2127188742160797,
296
+ "learning_rate": 1.552059925093633e-05,
297
+ "loss": 0.1943,
298
+ "mean_token_accuracy": 0.9514408648014069,
299
+ "num_tokens": 33234.0,
300
+ "step": 300
301
+ },
302
+ {
303
+ "epoch": 1.1235955056179776,
304
+ "eval_loss": 0.19752565026283264,
305
+ "eval_mean_token_accuracy": 0.9434169828891754,
306
+ "eval_num_tokens": 33234.0,
307
+ "eval_runtime": 2.4936,
308
+ "eval_samples_per_second": 11.63,
309
+ "eval_steps_per_second": 1.604,
310
+ "step": 300
311
+ },
312
+ {
313
+ "epoch": 1.1610486891385767,
314
+ "grad_norm": 0.10267303138971329,
315
+ "learning_rate": 1.537078651685393e-05,
316
+ "loss": 0.1919,
317
+ "mean_token_accuracy": 0.9476523637771607,
318
+ "num_tokens": 34342.0,
319
+ "step": 310
320
+ },
321
+ {
322
+ "epoch": 1.198501872659176,
323
+ "grad_norm": 0.23754256963729858,
324
+ "learning_rate": 1.5220973782771537e-05,
325
+ "loss": 0.1927,
326
+ "mean_token_accuracy": 0.9512970626354218,
327
+ "num_tokens": 35450.0,
328
+ "step": 320
329
+ },
330
+ {
331
+ "epoch": 1.2359550561797752,
332
+ "grad_norm": 0.09665194898843765,
333
+ "learning_rate": 1.507116104868914e-05,
334
+ "loss": 0.1911,
335
+ "mean_token_accuracy": 0.9494555711746215,
336
+ "num_tokens": 36558.0,
337
+ "step": 330
338
+ },
339
+ {
340
+ "epoch": 1.2734082397003745,
341
+ "grad_norm": 0.11535191535949707,
342
+ "learning_rate": 1.4921348314606743e-05,
343
+ "loss": 0.1915,
344
+ "mean_token_accuracy": 0.9493873059749603,
345
+ "num_tokens": 37664.0,
346
+ "step": 340
347
+ },
348
+ {
349
+ "epoch": 1.3108614232209739,
350
+ "grad_norm": 0.11016673594713211,
351
+ "learning_rate": 1.4771535580524345e-05,
352
+ "loss": 0.1931,
353
+ "mean_token_accuracy": 0.9440759301185608,
354
+ "num_tokens": 38774.0,
355
+ "step": 350
356
+ },
357
+ {
358
+ "epoch": 1.348314606741573,
359
+ "grad_norm": 0.24848656356334686,
360
+ "learning_rate": 1.4621722846441948e-05,
361
+ "loss": 0.1925,
362
+ "mean_token_accuracy": 0.9458104014396668,
363
+ "num_tokens": 39883.0,
364
+ "step": 360
365
+ },
366
+ {
367
+ "epoch": 1.3857677902621723,
368
+ "grad_norm": 0.1400669664144516,
369
+ "learning_rate": 1.447191011235955e-05,
370
+ "loss": 0.1936,
371
+ "mean_token_accuracy": 0.9457758069038391,
372
+ "num_tokens": 40990.0,
373
+ "step": 370
374
+ },
375
+ {
376
+ "epoch": 1.4232209737827715,
377
+ "grad_norm": 0.1753997802734375,
378
+ "learning_rate": 1.4322097378277155e-05,
379
+ "loss": 0.1921,
380
+ "mean_token_accuracy": 0.9477294445037842,
381
+ "num_tokens": 42099.0,
382
+ "step": 380
383
+ },
384
+ {
385
+ "epoch": 1.4606741573033708,
386
+ "grad_norm": 0.11102133989334106,
387
+ "learning_rate": 1.4172284644194758e-05,
388
+ "loss": 0.1904,
389
+ "mean_token_accuracy": 0.9459109544754029,
390
+ "num_tokens": 43209.0,
391
+ "step": 390
392
+ },
393
+ {
394
+ "epoch": 1.4981273408239701,
395
+ "grad_norm": 0.12153730541467667,
396
+ "learning_rate": 1.402247191011236e-05,
397
+ "loss": 0.1908,
398
+ "mean_token_accuracy": 0.9495814442634583,
399
+ "num_tokens": 44320.0,
400
+ "step": 400
401
+ },
402
+ {
403
+ "epoch": 1.4981273408239701,
404
+ "eval_loss": 0.19722126424312592,
405
+ "eval_mean_token_accuracy": 0.9426012635231018,
406
+ "eval_num_tokens": 44320.0,
407
+ "eval_runtime": 2.4929,
408
+ "eval_samples_per_second": 11.633,
409
+ "eval_steps_per_second": 1.605,
410
+ "step": 400
411
+ },
412
+ {
413
+ "epoch": 1.5355805243445693,
414
+ "grad_norm": 0.13351161777973175,
415
+ "learning_rate": 1.3872659176029963e-05,
416
+ "loss": 0.1906,
417
+ "mean_token_accuracy": 0.9469557940959931,
418
+ "num_tokens": 45434.0,
419
+ "step": 410
420
+ },
421
+ {
422
+ "epoch": 1.5730337078651684,
423
+ "grad_norm": 0.1454717516899109,
424
+ "learning_rate": 1.3722846441947566e-05,
425
+ "loss": 0.1906,
426
+ "mean_token_accuracy": 0.9468878388404847,
427
+ "num_tokens": 46547.0,
428
+ "step": 420
429
+ },
430
+ {
431
+ "epoch": 1.6104868913857677,
432
+ "grad_norm": 0.21453846991062164,
433
+ "learning_rate": 1.3573033707865169e-05,
434
+ "loss": 0.1919,
435
+ "mean_token_accuracy": 0.9432088494300842,
436
+ "num_tokens": 47659.0,
437
+ "step": 430
438
+ },
439
+ {
440
+ "epoch": 1.647940074906367,
441
+ "grad_norm": 0.1796715408563614,
442
+ "learning_rate": 1.3423220973782773e-05,
443
+ "loss": 0.1924,
444
+ "mean_token_accuracy": 0.9468723952770233,
445
+ "num_tokens": 48771.0,
446
+ "step": 440
447
+ },
448
+ {
449
+ "epoch": 1.6853932584269664,
450
+ "grad_norm": 0.18729475140571594,
451
+ "learning_rate": 1.3273408239700376e-05,
452
+ "loss": 0.1918,
453
+ "mean_token_accuracy": 0.9448257863521576,
454
+ "num_tokens": 49878.0,
455
+ "step": 450
456
+ },
457
+ {
458
+ "epoch": 1.7228464419475655,
459
+ "grad_norm": 0.20833182334899902,
460
+ "learning_rate": 1.3123595505617978e-05,
461
+ "loss": 0.19,
462
+ "mean_token_accuracy": 0.9460108697414398,
463
+ "num_tokens": 50990.0,
464
+ "step": 460
465
+ },
466
+ {
467
+ "epoch": 1.7602996254681647,
468
+ "grad_norm": 0.09931682050228119,
469
+ "learning_rate": 1.2973782771535581e-05,
470
+ "loss": 0.1898,
471
+ "mean_token_accuracy": 0.9476029396057128,
472
+ "num_tokens": 52099.0,
473
+ "step": 470
474
+ },
475
+ {
476
+ "epoch": 1.797752808988764,
477
+ "grad_norm": 0.2103966772556305,
478
+ "learning_rate": 1.2823970037453184e-05,
479
+ "loss": 0.1932,
480
+ "mean_token_accuracy": 0.9421666264533997,
481
+ "num_tokens": 53208.0,
482
+ "step": 480
483
+ },
484
+ {
485
+ "epoch": 1.8352059925093633,
486
+ "grad_norm": 0.07852394878864288,
487
+ "learning_rate": 1.2674157303370786e-05,
488
+ "loss": 0.1915,
489
+ "mean_token_accuracy": 0.9441100597381592,
490
+ "num_tokens": 54319.0,
491
+ "step": 490
492
+ },
493
+ {
494
+ "epoch": 1.8726591760299627,
495
+ "grad_norm": 0.09249723702669144,
496
+ "learning_rate": 1.2524344569288391e-05,
497
+ "loss": 0.19,
498
+ "mean_token_accuracy": 0.9484964370727539,
499
+ "num_tokens": 55426.0,
500
+ "step": 500
501
+ },
502
+ {
503
+ "epoch": 1.8726591760299627,
504
+ "eval_loss": 0.19536998867988586,
505
+ "eval_mean_token_accuracy": 0.945041760802269,
506
+ "eval_num_tokens": 55426.0,
507
+ "eval_runtime": 2.499,
508
+ "eval_samples_per_second": 11.605,
509
+ "eval_steps_per_second": 1.601,
510
+ "step": 500
511
+ },
512
+ {
513
+ "epoch": 1.9101123595505618,
514
+ "grad_norm": 0.07890783250331879,
515
+ "learning_rate": 1.2374531835205994e-05,
516
+ "loss": 0.1909,
517
+ "mean_token_accuracy": 0.9412918269634247,
518
+ "num_tokens": 56536.0,
519
+ "step": 510
520
+ },
521
+ {
522
+ "epoch": 1.947565543071161,
523
+ "grad_norm": 0.2816140353679657,
524
+ "learning_rate": 1.2224719101123596e-05,
525
+ "loss": 0.1923,
526
+ "mean_token_accuracy": 0.9376968383789063,
527
+ "num_tokens": 57648.0,
528
+ "step": 520
529
+ },
530
+ {
531
+ "epoch": 1.9850187265917603,
532
+ "grad_norm": 0.08590656518936157,
533
+ "learning_rate": 1.2074906367041199e-05,
534
+ "loss": 0.1904,
535
+ "mean_token_accuracy": 0.9467627465724945,
536
+ "num_tokens": 58758.0,
537
+ "step": 530
538
+ },
539
+ {
540
+ "epoch": 2.0224719101123596,
541
+ "grad_norm": 0.1013297438621521,
542
+ "learning_rate": 1.1925093632958802e-05,
543
+ "loss": 0.1903,
544
+ "mean_token_accuracy": 0.9485378265380859,
545
+ "num_tokens": 59811.0,
546
+ "step": 540
547
+ },
548
+ {
549
+ "epoch": 2.059925093632959,
550
+ "grad_norm": 0.07267877459526062,
551
+ "learning_rate": 1.1775280898876404e-05,
552
+ "loss": 0.1897,
553
+ "mean_token_accuracy": 0.9469048321247101,
554
+ "num_tokens": 60923.0,
555
+ "step": 550
556
+ },
557
+ {
558
+ "epoch": 2.097378277153558,
559
+ "grad_norm": 0.08559578657150269,
560
+ "learning_rate": 1.1625468164794009e-05,
561
+ "loss": 0.1913,
562
+ "mean_token_accuracy": 0.943006819486618,
563
+ "num_tokens": 62031.0,
564
+ "step": 560
565
+ },
566
+ {
567
+ "epoch": 2.134831460674157,
568
+ "grad_norm": 0.2162655144929886,
569
+ "learning_rate": 1.1475655430711611e-05,
570
+ "loss": 0.188,
571
+ "mean_token_accuracy": 0.9467701494693757,
572
+ "num_tokens": 63140.0,
573
+ "step": 570
574
+ },
575
+ {
576
+ "epoch": 2.1722846441947565,
577
+ "grad_norm": 0.08606795221567154,
578
+ "learning_rate": 1.1325842696629214e-05,
579
+ "loss": 0.189,
580
+ "mean_token_accuracy": 0.9439931452274323,
581
+ "num_tokens": 64249.0,
582
+ "step": 580
583
+ },
584
+ {
585
+ "epoch": 2.209737827715356,
586
+ "grad_norm": 0.2562474310398102,
587
+ "learning_rate": 1.1176029962546817e-05,
588
+ "loss": 0.1926,
589
+ "mean_token_accuracy": 0.9457504689693451,
590
+ "num_tokens": 65356.0,
591
+ "step": 590
592
+ },
593
+ {
594
+ "epoch": 2.247191011235955,
595
+ "grad_norm": 0.0770883709192276,
596
+ "learning_rate": 1.102621722846442e-05,
597
+ "loss": 0.1895,
598
+ "mean_token_accuracy": 0.9449774503707886,
599
+ "num_tokens": 66466.0,
600
+ "step": 600
601
+ },
602
+ {
603
+ "epoch": 2.247191011235955,
604
+ "eval_loss": 0.19561618566513062,
605
+ "eval_mean_token_accuracy": 0.9387146234512329,
606
+ "eval_num_tokens": 66466.0,
607
+ "eval_runtime": 2.498,
608
+ "eval_samples_per_second": 11.609,
609
+ "eval_steps_per_second": 1.601,
610
+ "step": 600
611
+ },
612
+ {
613
+ "epoch": 2.284644194756554,
614
+ "grad_norm": 0.08070901036262512,
615
+ "learning_rate": 1.0876404494382022e-05,
616
+ "loss": 0.1905,
617
+ "mean_token_accuracy": 0.9429994106292725,
618
+ "num_tokens": 67574.0,
619
+ "step": 610
620
+ },
621
+ {
622
+ "epoch": 2.3220973782771535,
623
+ "grad_norm": 0.08464006334543228,
624
+ "learning_rate": 1.0726591760299627e-05,
625
+ "loss": 0.1892,
626
+ "mean_token_accuracy": 0.9467701494693757,
627
+ "num_tokens": 68683.0,
628
+ "step": 620
629
+ },
630
+ {
631
+ "epoch": 2.359550561797753,
632
+ "grad_norm": 0.21751029789447784,
633
+ "learning_rate": 1.057677902621723e-05,
634
+ "loss": 0.1898,
635
+ "mean_token_accuracy": 0.9449776113033295,
636
+ "num_tokens": 69793.0,
637
+ "step": 630
638
+ },
639
+ {
640
+ "epoch": 2.397003745318352,
641
+ "grad_norm": 0.06470742076635361,
642
+ "learning_rate": 1.0426966292134832e-05,
643
+ "loss": 0.1917,
644
+ "mean_token_accuracy": 0.9440505981445313,
645
+ "num_tokens": 70903.0,
646
+ "step": 640
647
+ },
648
+ {
649
+ "epoch": 2.4344569288389515,
650
+ "grad_norm": 0.07308146357536316,
651
+ "learning_rate": 1.0277153558052435e-05,
652
+ "loss": 0.1911,
653
+ "mean_token_accuracy": 0.9440014958381653,
654
+ "num_tokens": 72012.0,
655
+ "step": 650
656
+ },
657
+ {
658
+ "epoch": 2.4719101123595504,
659
+ "grad_norm": 0.17352479696273804,
660
+ "learning_rate": 1.0127340823970037e-05,
661
+ "loss": 0.1895,
662
+ "mean_token_accuracy": 0.9467605650424957,
663
+ "num_tokens": 73121.0,
664
+ "step": 660
665
+ },
666
+ {
667
+ "epoch": 2.5093632958801497,
668
+ "grad_norm": 0.06704717874526978,
669
+ "learning_rate": 9.977528089887642e-06,
670
+ "loss": 0.1882,
671
+ "mean_token_accuracy": 0.9442952454090119,
672
+ "num_tokens": 74236.0,
673
+ "step": 670
674
+ },
675
+ {
676
+ "epoch": 2.546816479400749,
677
+ "grad_norm": 0.15437676012516022,
678
+ "learning_rate": 9.827715355805244e-06,
679
+ "loss": 0.1936,
680
+ "mean_token_accuracy": 0.9383869290351867,
681
+ "num_tokens": 75344.0,
682
+ "step": 680
683
+ },
684
+ {
685
+ "epoch": 2.5842696629213484,
686
+ "grad_norm": 0.17893658578395844,
687
+ "learning_rate": 9.677902621722847e-06,
688
+ "loss": 0.1895,
689
+ "mean_token_accuracy": 0.9460550546646118,
690
+ "num_tokens": 76458.0,
691
+ "step": 690
692
+ },
693
+ {
694
+ "epoch": 2.6217228464419478,
695
+ "grad_norm": 0.2342701405286789,
696
+ "learning_rate": 9.52808988764045e-06,
697
+ "loss": 0.1901,
698
+ "mean_token_accuracy": 0.9467441976070404,
699
+ "num_tokens": 77568.0,
700
+ "step": 700
701
+ },
702
+ {
703
+ "epoch": 2.6217228464419478,
704
+ "eval_loss": 0.195254847407341,
705
+ "eval_mean_token_accuracy": 0.9420265555381775,
706
+ "eval_num_tokens": 77568.0,
707
+ "eval_runtime": 2.5044,
708
+ "eval_samples_per_second": 11.58,
709
+ "eval_steps_per_second": 1.597,
710
+ "step": 700
711
+ },
712
+ {
713
+ "epoch": 2.6591760299625467,
714
+ "grad_norm": 0.07597153633832932,
715
+ "learning_rate": 9.378277153558052e-06,
716
+ "loss": 0.1902,
717
+ "mean_token_accuracy": 0.9421320199966431,
718
+ "num_tokens": 78677.0,
719
+ "step": 710
720
+ },
721
+ {
722
+ "epoch": 2.696629213483146,
723
+ "grad_norm": 0.08350855857133865,
724
+ "learning_rate": 9.228464419475655e-06,
725
+ "loss": 0.19,
726
+ "mean_token_accuracy": 0.9431924819946289,
727
+ "num_tokens": 79788.0,
728
+ "step": 720
729
+ },
730
+ {
731
+ "epoch": 2.7340823970037453,
732
+ "grad_norm": 0.08954475820064545,
733
+ "learning_rate": 9.07865168539326e-06,
734
+ "loss": 0.1893,
735
+ "mean_token_accuracy": 0.945793092250824,
736
+ "num_tokens": 80896.0,
737
+ "step": 730
738
+ },
739
+ {
740
+ "epoch": 2.7715355805243447,
741
+ "grad_norm": 0.07194171845912933,
742
+ "learning_rate": 8.928838951310862e-06,
743
+ "loss": 0.1895,
744
+ "mean_token_accuracy": 0.9441764891147614,
745
+ "num_tokens": 82007.0,
746
+ "step": 740
747
+ },
748
+ {
749
+ "epoch": 2.808988764044944,
750
+ "grad_norm": 0.07494191080331802,
751
+ "learning_rate": 8.779026217228465e-06,
752
+ "loss": 0.1909,
753
+ "mean_token_accuracy": 0.9465851247310638,
754
+ "num_tokens": 83113.0,
755
+ "step": 750
756
+ },
757
+ {
758
+ "epoch": 2.846441947565543,
759
+ "grad_norm": 0.20635780692100525,
760
+ "learning_rate": 8.629213483146068e-06,
761
+ "loss": 0.1894,
762
+ "mean_token_accuracy": 0.9479296028614044,
763
+ "num_tokens": 84226.0,
764
+ "step": 760
765
+ },
766
+ {
767
+ "epoch": 2.8838951310861423,
768
+ "grad_norm": 0.18754708766937256,
769
+ "learning_rate": 8.47940074906367e-06,
770
+ "loss": 0.1898,
771
+ "mean_token_accuracy": 0.9503811955451965,
772
+ "num_tokens": 85333.0,
773
+ "step": 770
774
+ },
775
+ {
776
+ "epoch": 2.9213483146067416,
777
+ "grad_norm": 0.08187804371118546,
778
+ "learning_rate": 8.329588014981273e-06,
779
+ "loss": 0.1903,
780
+ "mean_token_accuracy": 0.943065983057022,
781
+ "num_tokens": 86441.0,
782
+ "step": 780
783
+ },
784
+ {
785
+ "epoch": 2.958801498127341,
786
+ "grad_norm": 0.18230247497558594,
787
+ "learning_rate": 8.179775280898877e-06,
788
+ "loss": 0.19,
789
+ "mean_token_accuracy": 0.9431342482566833,
790
+ "num_tokens": 87551.0,
791
+ "step": 790
792
+ },
793
+ {
794
+ "epoch": 2.9962546816479403,
795
+ "grad_norm": 0.19725900888442993,
796
+ "learning_rate": 8.02996254681648e-06,
797
+ "loss": 0.1903,
798
+ "mean_token_accuracy": 0.9396148085594177,
799
+ "num_tokens": 88663.0,
800
+ "step": 800
801
+ },
802
+ {
803
+ "epoch": 2.9962546816479403,
804
+ "eval_loss": 0.1949780136346817,
805
+ "eval_mean_token_accuracy": 0.9431759715080261,
806
+ "eval_num_tokens": 88663.0,
807
+ "eval_runtime": 2.487,
808
+ "eval_samples_per_second": 11.661,
809
+ "eval_steps_per_second": 1.608,
810
+ "step": 800
811
+ },
812
+ {
813
+ "epoch": 3.033707865168539,
814
+ "grad_norm": 0.07648079097270966,
815
+ "learning_rate": 7.880149812734083e-06,
816
+ "loss": 0.1903,
817
+ "mean_token_accuracy": 0.9457852184772492,
818
+ "num_tokens": 89716.0,
819
+ "step": 810
820
+ },
821
+ {
822
+ "epoch": 3.0711610486891385,
823
+ "grad_norm": 0.06528846174478531,
824
+ "learning_rate": 7.730337078651686e-06,
825
+ "loss": 0.1892,
826
+ "mean_token_accuracy": 0.9422413766384125,
827
+ "num_tokens": 90827.0,
828
+ "step": 820
829
+ },
830
+ {
831
+ "epoch": 3.108614232209738,
832
+ "grad_norm": 0.16895975172519684,
833
+ "learning_rate": 7.580524344569289e-06,
834
+ "loss": 0.1883,
835
+ "mean_token_accuracy": 0.9512388408184052,
836
+ "num_tokens": 91934.0,
837
+ "step": 830
838
+ },
839
+ {
840
+ "epoch": 3.146067415730337,
841
+ "grad_norm": 0.14639434218406677,
842
+ "learning_rate": 7.430711610486892e-06,
843
+ "loss": 0.1881,
844
+ "mean_token_accuracy": 0.9494467735290527,
845
+ "num_tokens": 93041.0,
846
+ "step": 840
847
+ },
848
+ {
849
+ "epoch": 3.1835205992509366,
850
+ "grad_norm": 0.08737971633672714,
851
+ "learning_rate": 7.280898876404495e-06,
852
+ "loss": 0.1901,
853
+ "mean_token_accuracy": 0.9406749486923218,
854
+ "num_tokens": 94156.0,
855
+ "step": 850
856
+ },
857
+ {
858
+ "epoch": 3.2209737827715355,
859
+ "grad_norm": 0.09445718675851822,
860
+ "learning_rate": 7.131086142322098e-06,
861
+ "loss": 0.1884,
862
+ "mean_token_accuracy": 0.9475689589977264,
863
+ "num_tokens": 95264.0,
864
+ "step": 860
865
+ },
866
+ {
867
+ "epoch": 3.258426966292135,
868
+ "grad_norm": 0.09516163170337677,
869
+ "learning_rate": 6.981273408239701e-06,
870
+ "loss": 0.1889,
871
+ "mean_token_accuracy": 0.9449783861637115,
872
+ "num_tokens": 96374.0,
873
+ "step": 870
874
+ },
875
+ {
876
+ "epoch": 3.295880149812734,
877
+ "grad_norm": 0.08031495660543442,
878
+ "learning_rate": 6.831460674157304e-06,
879
+ "loss": 0.1898,
880
+ "mean_token_accuracy": 0.9413152992725372,
881
+ "num_tokens": 97485.0,
882
+ "step": 880
883
+ },
884
+ {
885
+ "epoch": 3.3333333333333335,
886
+ "grad_norm": 0.20592394471168518,
887
+ "learning_rate": 6.681647940074907e-06,
888
+ "loss": 0.1902,
889
+ "mean_token_accuracy": 0.9421661615371704,
890
+ "num_tokens": 98595.0,
891
+ "step": 890
892
+ },
893
+ {
894
+ "epoch": 3.370786516853933,
895
+ "grad_norm": 0.19433368742465973,
896
+ "learning_rate": 6.53183520599251e-06,
897
+ "loss": 0.1882,
898
+ "mean_token_accuracy": 0.9559795200824738,
899
+ "num_tokens": 99705.0,
900
+ "step": 900
901
+ },
902
+ {
903
+ "epoch": 3.370786516853933,
904
+ "eval_loss": 0.19456735253334045,
905
+ "eval_mean_token_accuracy": 0.9444670528173447,
906
+ "eval_num_tokens": 99705.0,
907
+ "eval_runtime": 2.4936,
908
+ "eval_samples_per_second": 11.63,
909
+ "eval_steps_per_second": 1.604,
910
+ "step": 900
911
+ },
912
+ {
913
+ "epoch": 3.4082397003745317,
914
+ "grad_norm": 0.09646886587142944,
915
+ "learning_rate": 6.382022471910113e-06,
916
+ "loss": 0.1888,
917
+ "mean_token_accuracy": 0.9403546094894409,
918
+ "num_tokens": 100815.0,
919
+ "step": 910
920
+ },
921
+ {
922
+ "epoch": 3.445692883895131,
923
+ "grad_norm": 0.23463116586208344,
924
+ "learning_rate": 6.232209737827716e-06,
925
+ "loss": 0.1898,
926
+ "mean_token_accuracy": 0.9475183129310608,
927
+ "num_tokens": 101920.0,
928
+ "step": 920
929
+ },
930
+ {
931
+ "epoch": 3.4831460674157304,
932
+ "grad_norm": 0.1442176103591919,
933
+ "learning_rate": 6.0823970037453185e-06,
934
+ "loss": 0.1903,
935
+ "mean_token_accuracy": 0.945886081457138,
936
+ "num_tokens": 103029.0,
937
+ "step": 930
938
+ },
939
+ {
940
+ "epoch": 3.5205992509363297,
941
+ "grad_norm": 0.18915638327598572,
942
+ "learning_rate": 5.932584269662922e-06,
943
+ "loss": 0.1928,
944
+ "mean_token_accuracy": 0.9423760592937469,
945
+ "num_tokens": 104142.0,
946
+ "step": 940
947
+ },
948
+ {
949
+ "epoch": 3.558052434456929,
950
+ "grad_norm": 0.07233936339616776,
951
+ "learning_rate": 5.782771535580525e-06,
952
+ "loss": 0.1895,
953
+ "mean_token_accuracy": 0.9485957443714141,
954
+ "num_tokens": 105252.0,
955
+ "step": 950
956
+ },
957
+ {
958
+ "epoch": 3.595505617977528,
959
+ "grad_norm": 0.06686032563447952,
960
+ "learning_rate": 5.6329588014981274e-06,
961
+ "loss": 0.1897,
962
+ "mean_token_accuracy": 0.9438907384872437,
963
+ "num_tokens": 106360.0,
964
+ "step": 960
965
+ },
966
+ {
967
+ "epoch": 3.6329588014981273,
968
+ "grad_norm": 0.10568553954362869,
969
+ "learning_rate": 5.483146067415731e-06,
970
+ "loss": 0.1872,
971
+ "mean_token_accuracy": 0.9469630539417266,
972
+ "num_tokens": 107474.0,
973
+ "step": 970
974
+ },
975
+ {
976
+ "epoch": 3.6704119850187267,
977
+ "grad_norm": 0.17929266393184662,
978
+ "learning_rate": 5.333333333333334e-06,
979
+ "loss": 0.1897,
980
+ "mean_token_accuracy": 0.9439324319362641,
981
+ "num_tokens": 108583.0,
982
+ "step": 980
983
+ },
984
+ {
985
+ "epoch": 3.7078651685393256,
986
+ "grad_norm": 0.10999605804681778,
987
+ "learning_rate": 5.183520599250936e-06,
988
+ "loss": 0.191,
989
+ "mean_token_accuracy": 0.9439007878303528,
990
+ "num_tokens": 109690.0,
991
+ "step": 990
992
+ },
993
+ {
994
+ "epoch": 3.7453183520599254,
995
+ "grad_norm": 0.08190548419952393,
996
+ "learning_rate": 5.03370786516854e-06,
997
+ "loss": 0.1894,
998
+ "mean_token_accuracy": 0.9458610653877259,
999
+ "num_tokens": 110799.0,
1000
+ "step": 1000
1001
+ },
1002
+ {
1003
+ "epoch": 3.7453183520599254,
1004
+ "eval_loss": 0.1942463368177414,
1005
+ "eval_mean_token_accuracy": 0.9444670528173447,
1006
+ "eval_num_tokens": 110799.0,
1007
+ "eval_runtime": 2.4977,
1008
+ "eval_samples_per_second": 11.61,
1009
+ "eval_steps_per_second": 1.601,
1010
+ "step": 1000
1011
+ },
1012
+ {
1013
+ "epoch": 3.7827715355805243,
1014
+ "grad_norm": 0.07969696819782257,
1015
+ "learning_rate": 4.883895131086143e-06,
1016
+ "loss": 0.1893,
1017
+ "mean_token_accuracy": 0.944927579164505,
1018
+ "num_tokens": 111908.0,
1019
+ "step": 1010
1020
+ },
1021
+ {
1022
+ "epoch": 3.8202247191011236,
1023
+ "grad_norm": 0.10001447796821594,
1024
+ "learning_rate": 4.734082397003746e-06,
1025
+ "loss": 0.19,
1026
+ "mean_token_accuracy": 0.945886081457138,
1027
+ "num_tokens": 113017.0,
1028
+ "step": 1020
1029
+ },
1030
+ {
1031
+ "epoch": 3.857677902621723,
1032
+ "grad_norm": 0.07509426027536392,
1033
+ "learning_rate": 4.584269662921349e-06,
1034
+ "loss": 0.1883,
1035
+ "mean_token_accuracy": 0.9468623578548432,
1036
+ "num_tokens": 114128.0,
1037
+ "step": 1030
1038
+ },
1039
+ {
1040
+ "epoch": 3.895131086142322,
1041
+ "grad_norm": 0.18818271160125732,
1042
+ "learning_rate": 4.4344569288389515e-06,
1043
+ "loss": 0.1878,
1044
+ "mean_token_accuracy": 0.9477705180644989,
1045
+ "num_tokens": 115239.0,
1046
+ "step": 1040
1047
+ },
1048
+ {
1049
+ "epoch": 3.932584269662921,
1050
+ "grad_norm": 0.18687786161899567,
1051
+ "learning_rate": 4.284644194756555e-06,
1052
+ "loss": 0.19,
1053
+ "mean_token_accuracy": 0.9439075767993927,
1054
+ "num_tokens": 116347.0,
1055
+ "step": 1050
1056
+ },
1057
+ {
1058
+ "epoch": 3.9700374531835205,
1059
+ "grad_norm": 0.0811658725142479,
1060
+ "learning_rate": 4.134831460674158e-06,
1061
+ "loss": 0.1887,
1062
+ "mean_token_accuracy": 0.9441100597381592,
1063
+ "num_tokens": 117458.0,
1064
+ "step": 1060
1065
+ },
1066
+ {
1067
+ "epoch": 4.007490636704119,
1068
+ "grad_norm": 0.08850710839033127,
1069
+ "learning_rate": 3.9850187265917604e-06,
1070
+ "loss": 0.1879,
1071
+ "mean_token_accuracy": 0.9432519495487213,
1072
+ "num_tokens": 118514.0,
1073
+ "step": 1070
1074
+ },
1075
+ {
1076
+ "epoch": 4.044943820224719,
1077
+ "grad_norm": 0.22726596891880035,
1078
+ "learning_rate": 3.835205992509364e-06,
1079
+ "loss": 0.1902,
1080
+ "mean_token_accuracy": 0.942855316400528,
1081
+ "num_tokens": 119620.0,
1082
+ "step": 1080
1083
+ },
1084
+ {
1085
+ "epoch": 4.082397003745318,
1086
+ "grad_norm": 0.0741027221083641,
1087
+ "learning_rate": 3.6853932584269662e-06,
1088
+ "loss": 0.1901,
1089
+ "mean_token_accuracy": 0.9414170861244202,
1090
+ "num_tokens": 120732.0,
1091
+ "step": 1090
1092
+ },
1093
+ {
1094
+ "epoch": 4.119850187265918,
1095
+ "grad_norm": 0.10303088277578354,
1096
+ "learning_rate": 3.5355805243445694e-06,
1097
+ "loss": 0.1887,
1098
+ "mean_token_accuracy": 0.9503899931907653,
1099
+ "num_tokens": 121840.0,
1100
+ "step": 1100
1101
+ },
1102
+ {
1103
+ "epoch": 4.119850187265918,
1104
+ "eval_loss": 0.19412046670913696,
1105
+ "eval_mean_token_accuracy": 0.9453794658184052,
1106
+ "eval_num_tokens": 121840.0,
1107
+ "eval_runtime": 2.4937,
1108
+ "eval_samples_per_second": 11.629,
1109
+ "eval_steps_per_second": 1.604,
1110
+ "step": 1100
1111
+ }
1112
+ ],
1113
+ "logging_steps": 10,
1114
+ "max_steps": 1335,
1115
+ "num_input_tokens_seen": 0,
1116
+ "num_train_epochs": 5,
1117
+ "save_steps": 100,
1118
+ "stateful_callbacks": {
1119
+ "TrainerControl": {
1120
+ "args": {
1121
+ "should_epoch_stop": false,
1122
+ "should_evaluate": false,
1123
+ "should_log": false,
1124
+ "should_save": true,
1125
+ "should_training_stop": false
1126
+ },
1127
+ "attributes": {}
1128
+ }
1129
+ },
1130
+ "total_flos": 5589043151616000.0,
1131
+ "train_batch_size": 2,
1132
+ "trial_name": null,
1133
+ "trial_params": null
1134
+ }
checkpoint-1100/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61b67df3186500a8ab4543ab551c594a458a667a2ee9e16f00656ee5598e0026
3
+ size 6097
checkpoint-1200/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: llava-hf/LLaVA-NeXT-Video-7B-32K-hf
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:llava-hf/LLaVA-NeXT-Video-7B-32K-hf
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.17.0
checkpoint-1200/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "llava-hf/LLaVA-NeXT-Video-7B-32K-hf",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 16,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.1,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 64,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "k_proj",
29
+ "gate_proj",
30
+ "q_proj",
31
+ "v_proj",
32
+ "o_proj",
33
+ "down_proj",
34
+ "up_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
checkpoint-1200/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e94f0aed04fe810237a72b47d743cd22f19dd247596200e24469d463ff0e81a5
3
+ size 708929184