palapapa committed on
Commit
82bddeb
·
verified ·
1 Parent(s): e0ed164

Uploaded our current best model

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +15 -0
  2. 2025-11-26-16-50-52/01_scene/checkpoint-100/README.md +207 -0
  3. 2025-11-26-16-50-52/01_scene/checkpoint-100/adapter_config.json +42 -0
  4. 2025-11-26-16-50-52/01_scene/checkpoint-100/adapter_model.safetensors +3 -0
  5. 2025-11-26-16-50-52/01_scene/checkpoint-100/added_tokens.json +5 -0
  6. 2025-11-26-16-50-52/01_scene/checkpoint-100/merges.txt +0 -0
  7. 2025-11-26-16-50-52/01_scene/checkpoint-100/optimizer.pt +3 -0
  8. 2025-11-26-16-50-52/01_scene/checkpoint-100/rng_state.pth +3 -0
  9. 2025-11-26-16-50-52/01_scene/checkpoint-100/scheduler.pt +3 -0
  10. 2025-11-26-16-50-52/01_scene/checkpoint-100/special_tokens_map.json +20 -0
  11. 2025-11-26-16-50-52/01_scene/checkpoint-100/tokenizer.json +3 -0
  12. 2025-11-26-16-50-52/01_scene/checkpoint-100/tokenizer_config.json +44 -0
  13. 2025-11-26-16-50-52/01_scene/checkpoint-100/trainer_state.json +55 -0
  14. 2025-11-26-16-50-52/01_scene/checkpoint-100/vocab.json +0 -0
  15. 2025-11-26-16-50-52/01_scene/checkpoint-400/README.md +207 -0
  16. 2025-11-26-16-50-52/01_scene/checkpoint-400/adapter_config.json +42 -0
  17. 2025-11-26-16-50-52/01_scene/checkpoint-400/adapter_model.safetensors +3 -0
  18. 2025-11-26-16-50-52/01_scene/checkpoint-400/added_tokens.json +5 -0
  19. 2025-11-26-16-50-52/01_scene/checkpoint-400/merges.txt +0 -0
  20. 2025-11-26-16-50-52/01_scene/checkpoint-400/optimizer.pt +3 -0
  21. 2025-11-26-16-50-52/01_scene/checkpoint-400/rng_state.pth +3 -0
  22. 2025-11-26-16-50-52/01_scene/checkpoint-400/scheduler.pt +3 -0
  23. 2025-11-26-16-50-52/01_scene/checkpoint-400/special_tokens_map.json +20 -0
  24. 2025-11-26-16-50-52/01_scene/checkpoint-400/tokenizer.json +3 -0
  25. 2025-11-26-16-50-52/01_scene/checkpoint-400/tokenizer_config.json +44 -0
  26. 2025-11-26-16-50-52/01_scene/checkpoint-400/trainer_state.json +121 -0
  27. 2025-11-26-16-50-52/01_scene/checkpoint-400/vocab.json +0 -0
  28. 2025-11-26-16-50-52/01_scene/checkpoint-800/README.md +207 -0
  29. 2025-11-26-16-50-52/01_scene/checkpoint-800/adapter_config.json +42 -0
  30. 2025-11-26-16-50-52/01_scene/checkpoint-800/adapter_model.safetensors +3 -0
  31. 2025-11-26-16-50-52/01_scene/checkpoint-800/added_tokens.json +5 -0
  32. 2025-11-26-16-50-52/01_scene/checkpoint-800/merges.txt +0 -0
  33. 2025-11-26-16-50-52/01_scene/checkpoint-800/optimizer.pt +3 -0
  34. 2025-11-26-16-50-52/01_scene/checkpoint-800/rng_state.pth +3 -0
  35. 2025-11-26-16-50-52/01_scene/checkpoint-800/scheduler.pt +3 -0
  36. 2025-11-26-16-50-52/01_scene/checkpoint-800/special_tokens_map.json +20 -0
  37. 2025-11-26-16-50-52/01_scene/checkpoint-800/tokenizer.json +3 -0
  38. 2025-11-26-16-50-52/01_scene/checkpoint-800/tokenizer_config.json +44 -0
  39. 2025-11-26-16-50-52/01_scene/checkpoint-800/trainer_state.json +209 -0
  40. 2025-11-26-16-50-52/01_scene/checkpoint-800/vocab.json +0 -0
  41. 2025-11-26-16-50-52/01_scene/trainer_state.json +614 -0
  42. 2025-11-26-16-50-52/01_scene/training_arguments.json +145 -0
  43. 2025-11-26-16-50-52/02_distortion/checkpoint-100/README.md +207 -0
  44. 2025-11-26-16-50-52/02_distortion/checkpoint-100/adapter_config.json +42 -0
  45. 2025-11-26-16-50-52/02_distortion/checkpoint-100/adapter_model.safetensors +3 -0
  46. 2025-11-26-16-50-52/02_distortion/checkpoint-100/added_tokens.json +5 -0
  47. 2025-11-26-16-50-52/02_distortion/checkpoint-100/merges.txt +0 -0
  48. 2025-11-26-16-50-52/02_distortion/checkpoint-100/optimizer.pt +3 -0
  49. 2025-11-26-16-50-52/02_distortion/checkpoint-100/rng_state.pth +3 -0
  50. 2025-11-26-16-50-52/02_distortion/checkpoint-100/scheduler.pt +3 -0
.gitattributes CHANGED
@@ -44,3 +44,18 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
44
  10310000_distortion_2/final_model/eval_scatter_plot.png filter=lfs diff=lfs merge=lfs -text
45
  10310000_distortion_2/final_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
46
  10310800_full_2/final_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  10310000_distortion_2/final_model/eval_scatter_plot.png filter=lfs diff=lfs merge=lfs -text
45
  10310000_distortion_2/final_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
46
  10310800_full_2/final_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
47
+ 2025-11-26-16-50-52/01_scene/checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
48
+ 2025-11-26-16-50-52/01_scene/checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
49
+ 2025-11-26-16-50-52/01_scene/checkpoint-800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
50
+ 2025-11-26-16-50-52/02_distortion/checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
51
+ 2025-11-26-16-50-52/02_distortion/checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
52
+ 2025-11-26-16-50-52/02_distortion/checkpoint-300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
53
+ 2025-11-26-16-50-52/03_quality/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
54
+ 2025-11-26-16-50-52/03_quality/checkpoint-1700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
55
+ 2025-11-26-16-50-52/03_quality/checkpoint-1900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
56
+ 2025-11-26-16-50-52/03_quality/checkpoint-2100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
57
+ 2025-11-26-16-50-52/03_quality/checkpoint-900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
58
+ 2025-11-26-16-50-52/final_model/eval_scatter_plot_csiq_full.png filter=lfs diff=lfs merge=lfs -text
59
+ 2025-11-26-16-50-52/final_model/eval_scatter_plot_kadid-10k_validation.png filter=lfs diff=lfs merge=lfs -text
60
+ 2025-11-26-16-50-52/final_model/eval_scatter_plot_koniq-10k_testing.png filter=lfs diff=lfs merge=lfs -text
61
+ 2025-11-26-16-50-52/final_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
2025-11-26-16-50-52/01_scene/checkpoint-100/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: src/owl3
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:src/owl3
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.17.1
2025-11-26-16-50-52/01_scene/checkpoint-100/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "src/owl3",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 32,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.05,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 16,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "up_proj",
29
+ "q_proj",
30
+ "o_proj",
31
+ "down_proj",
32
+ "k_proj",
33
+ "gate_proj",
34
+ "v_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
2025-11-26-16-50-52/01_scene/checkpoint-100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11d227ac43dea2277cca9141fb4864c5a102a060d5be1d166e85bd92dad3c0da
3
+ size 173507576
2025-11-26-16-50-52/01_scene/checkpoint-100/added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 151643,
3
+ "<|im_end|>": 151645,
4
+ "<|im_start|>": 151644
5
+ }
2025-11-26-16-50-52/01_scene/checkpoint-100/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
2025-11-26-16-50-52/01_scene/checkpoint-100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee93be382301d69d5845cf6690508b369b886542302074061e12a619a3f09f6a
3
+ size 346426383
2025-11-26-16-50-52/01_scene/checkpoint-100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c45eee438afc8a5dc3aa37e234d797ee8f451c3558934c36fd101e2e774bb4f3
3
+ size 14645
2025-11-26-16-50-52/01_scene/checkpoint-100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:441ef59f1d5d6d1eba1fd1f9f4c9a8660f66e566f1b8b3b88c35a5dad7fb26b6
3
+ size 1465
2025-11-26-16-50-52/01_scene/checkpoint-100/special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "eos_token": {
7
+ "content": "<|im_end|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "pad_token": {
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ }
20
+ }
2025-11-26-16-50-52/01_scene/checkpoint-100/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcfe42da0a4497e8b2b172c1f9f4ec423a46dc12907f4349c55025f670422ba9
3
+ size 11418266
2025-11-26-16-50-52/01_scene/checkpoint-100/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": [
30
+ "<|im_start|>",
31
+ "<|im_end|>"
32
+ ],
33
+ "bos_token": null,
34
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
35
+ "clean_up_tokenization_spaces": false,
36
+ "eos_token": "<|im_end|>",
37
+ "errors": "replace",
38
+ "extra_special_tokens": {},
39
+ "model_max_length": 131072,
40
+ "pad_token": "<|endoftext|>",
41
+ "split_special_tokens": false,
42
+ "tokenizer_class": "Qwen2Tokenizer",
43
+ "unk_token": null
44
+ }
2025-11-26-16-50-52/01_scene/checkpoint-100/trainer_state.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.03937990590929985,
3
+ "best_model_checkpoint": "outputs/2025-11-26-16-50-52/01_scene/checkpoint-100",
4
+ "epoch": 0.11334655709832814,
5
+ "eval_steps": 100,
6
+ "global_step": 100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.05667327854916407,
13
+ "grad_norm": 4.339138507843018,
14
+ "learning_rate": 0.000125,
15
+ "loss": 0.3253,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.11334655709832814,
20
+ "grad_norm": 3.550264835357666,
21
+ "learning_rate": 0.00019997002254654227,
22
+ "loss": 0.0543,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.11334655709832814,
27
+ "eval_loss": 0.03937990590929985,
28
+ "eval_runtime": 90.138,
29
+ "eval_samples_per_second": 11.094,
30
+ "eval_steps_per_second": 11.094,
31
+ "step": 100
32
+ }
33
+ ],
34
+ "logging_steps": 50,
35
+ "max_steps": 2646,
36
+ "num_input_tokens_seen": 0,
37
+ "num_train_epochs": 3,
38
+ "save_steps": 100,
39
+ "stateful_callbacks": {
40
+ "TrainerControl": {
41
+ "args": {
42
+ "should_epoch_stop": false,
43
+ "should_evaluate": false,
44
+ "should_log": false,
45
+ "should_save": true,
46
+ "should_training_stop": false
47
+ },
48
+ "attributes": {}
49
+ }
50
+ },
51
+ "total_flos": 0.0,
52
+ "train_batch_size": 1,
53
+ "trial_name": null,
54
+ "trial_params": null
55
+ }
2025-11-26-16-50-52/01_scene/checkpoint-100/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
2025-11-26-16-50-52/01_scene/checkpoint-400/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: src/owl3
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:src/owl3
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.17.1
2025-11-26-16-50-52/01_scene/checkpoint-400/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "src/owl3",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 32,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.05,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 16,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "up_proj",
29
+ "q_proj",
30
+ "o_proj",
31
+ "down_proj",
32
+ "k_proj",
33
+ "gate_proj",
34
+ "v_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
2025-11-26-16-50-52/01_scene/checkpoint-400/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:816110d185222589b0a18d789a798bd7a593a095ced1a583850813114edbaa48
3
+ size 173507576
2025-11-26-16-50-52/01_scene/checkpoint-400/added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 151643,
3
+ "<|im_end|>": 151645,
4
+ "<|im_start|>": 151644
5
+ }
2025-11-26-16-50-52/01_scene/checkpoint-400/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
2025-11-26-16-50-52/01_scene/checkpoint-400/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cfcbec1b377be644f660d0a2fdc514275ddcaeff1d75dc470546804bc5c4d7e
3
+ size 346426383
2025-11-26-16-50-52/01_scene/checkpoint-400/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49fd7ae9b0200cc68753ba87024f2a0bcda3a02a4a85275d02a14f5d82b84970
3
+ size 14645
2025-11-26-16-50-52/01_scene/checkpoint-400/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b033da602049fdfa97afecf91db3cb37bbb93391e705e455fcf44171f0a17ee1
3
+ size 1465
2025-11-26-16-50-52/01_scene/checkpoint-400/special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "eos_token": {
7
+ "content": "<|im_end|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "pad_token": {
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ }
20
+ }
2025-11-26-16-50-52/01_scene/checkpoint-400/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcfe42da0a4497e8b2b172c1f9f4ec423a46dc12907f4349c55025f670422ba9
3
+ size 11418266
2025-11-26-16-50-52/01_scene/checkpoint-400/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": [
30
+ "<|im_start|>",
31
+ "<|im_end|>"
32
+ ],
33
+ "bos_token": null,
34
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
35
+ "clean_up_tokenization_spaces": false,
36
+ "eos_token": "<|im_end|>",
37
+ "errors": "replace",
38
+ "extra_special_tokens": {},
39
+ "model_max_length": 131072,
40
+ "pad_token": "<|endoftext|>",
41
+ "split_special_tokens": false,
42
+ "tokenizer_class": "Qwen2Tokenizer",
43
+ "unk_token": null
44
+ }
2025-11-26-16-50-52/01_scene/checkpoint-400/trainer_state.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.034898512065410614,
3
+ "best_model_checkpoint": "outputs/2025-11-26-16-50-52/01_scene/checkpoint-400",
4
+ "epoch": 0.45338622839331255,
5
+ "eval_steps": 100,
6
+ "global_step": 400,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.05667327854916407,
13
+ "grad_norm": 4.339138507843018,
14
+ "learning_rate": 0.000125,
15
+ "loss": 0.3253,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.11334655709832814,
20
+ "grad_norm": 3.550264835357666,
21
+ "learning_rate": 0.00019997002254654227,
22
+ "loss": 0.0543,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.11334655709832814,
27
+ "eval_loss": 0.03937990590929985,
28
+ "eval_runtime": 90.138,
29
+ "eval_samples_per_second": 11.094,
30
+ "eval_steps_per_second": 11.094,
31
+ "step": 100
32
+ },
33
+ {
34
+ "epoch": 0.1700198356474922,
35
+ "grad_norm": 1.1671267747879028,
36
+ "learning_rate": 0.0001996329825692593,
37
+ "loss": 0.0431,
38
+ "step": 150
39
+ },
40
+ {
41
+ "epoch": 0.22669311419665628,
42
+ "grad_norm": 5.124600887298584,
43
+ "learning_rate": 0.00019892269762919834,
44
+ "loss": 0.0385,
45
+ "step": 200
46
+ },
47
+ {
48
+ "epoch": 0.22669311419665628,
49
+ "eval_loss": 0.039480071514844894,
50
+ "eval_runtime": 90.0332,
51
+ "eval_samples_per_second": 11.107,
52
+ "eval_steps_per_second": 11.107,
53
+ "step": 200
54
+ },
55
+ {
56
+ "epoch": 0.2833663927458204,
57
+ "grad_norm": 1.6820999383926392,
58
+ "learning_rate": 0.0001978418285949712,
59
+ "loss": 0.0438,
60
+ "step": 250
61
+ },
62
+ {
63
+ "epoch": 0.3400396712949844,
64
+ "grad_norm": 3.634084463119507,
65
+ "learning_rate": 0.0001963944246168898,
66
+ "loss": 0.0352,
67
+ "step": 300
68
+ },
69
+ {
70
+ "epoch": 0.3400396712949844,
71
+ "eval_loss": 0.05238571763038635,
72
+ "eval_runtime": 90.3736,
73
+ "eval_samples_per_second": 11.065,
74
+ "eval_steps_per_second": 11.065,
75
+ "step": 300
76
+ },
77
+ {
78
+ "epoch": 0.3967129498441485,
79
+ "grad_norm": 0.8966110348701477,
80
+ "learning_rate": 0.00019458590795804406,
81
+ "loss": 0.0319,
82
+ "step": 350
83
+ },
84
+ {
85
+ "epoch": 0.45338622839331255,
86
+ "grad_norm": 2.966634511947632,
87
+ "learning_rate": 0.00019242305368142622,
88
+ "loss": 0.0302,
89
+ "step": 400
90
+ },
91
+ {
92
+ "epoch": 0.45338622839331255,
93
+ "eval_loss": 0.034898512065410614,
94
+ "eval_runtime": 90.038,
95
+ "eval_samples_per_second": 11.106,
96
+ "eval_steps_per_second": 11.106,
97
+ "step": 400
98
+ }
99
+ ],
100
+ "logging_steps": 50,
101
+ "max_steps": 2646,
102
+ "num_input_tokens_seen": 0,
103
+ "num_train_epochs": 3,
104
+ "save_steps": 100,
105
+ "stateful_callbacks": {
106
+ "TrainerControl": {
107
+ "args": {
108
+ "should_epoch_stop": false,
109
+ "should_evaluate": false,
110
+ "should_log": false,
111
+ "should_save": true,
112
+ "should_training_stop": false
113
+ },
114
+ "attributes": {}
115
+ }
116
+ },
117
+ "total_flos": 0.0,
118
+ "train_batch_size": 1,
119
+ "trial_name": null,
120
+ "trial_params": null
121
+ }
2025-11-26-16-50-52/01_scene/checkpoint-400/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
2025-11-26-16-50-52/01_scene/checkpoint-800/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: src/owl3
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:src/owl3
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.17.1
2025-11-26-16-50-52/01_scene/checkpoint-800/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "src/owl3",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 32,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.05,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 16,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "up_proj",
29
+ "q_proj",
30
+ "o_proj",
31
+ "down_proj",
32
+ "k_proj",
33
+ "gate_proj",
34
+ "v_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
2025-11-26-16-50-52/01_scene/checkpoint-800/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df63a97c892d9c5cdf0356a106784c03d8816195151e18718aaf4c1bee06807d
3
+ size 173507576
2025-11-26-16-50-52/01_scene/checkpoint-800/added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 151643,
3
+ "<|im_end|>": 151645,
4
+ "<|im_start|>": 151644
5
+ }
2025-11-26-16-50-52/01_scene/checkpoint-800/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
2025-11-26-16-50-52/01_scene/checkpoint-800/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:006581a8b3c441c8db2d4d05664e856697615151776fe627c139ee488523a236
3
+ size 346426383
2025-11-26-16-50-52/01_scene/checkpoint-800/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6643285b53f3bcb404e1a18f452acfc0c02d10349609936ea13290cb5e7d9a1d
3
+ size 14645
2025-11-26-16-50-52/01_scene/checkpoint-800/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d1b9b94d36e31ca1b4a396f1a3c56e64d286214cf956e6651be979216eaf6fb
3
+ size 1465
2025-11-26-16-50-52/01_scene/checkpoint-800/special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "eos_token": {
7
+ "content": "<|im_end|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "pad_token": {
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ }
20
+ }
2025-11-26-16-50-52/01_scene/checkpoint-800/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcfe42da0a4497e8b2b172c1f9f4ec423a46dc12907f4349c55025f670422ba9
3
+ size 11418266
2025-11-26-16-50-52/01_scene/checkpoint-800/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": [
30
+ "<|im_start|>",
31
+ "<|im_end|>"
32
+ ],
33
+ "bos_token": null,
34
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
35
+ "clean_up_tokenization_spaces": false,
36
+ "eos_token": "<|im_end|>",
37
+ "errors": "replace",
38
+ "extra_special_tokens": {},
39
+ "model_max_length": 131072,
40
+ "pad_token": "<|endoftext|>",
41
+ "split_special_tokens": false,
42
+ "tokenizer_class": "Qwen2Tokenizer",
43
+ "unk_token": null
44
+ }
2025-11-26-16-50-52/01_scene/checkpoint-800/trainer_state.json ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.03386669233441353,
3
+ "best_model_checkpoint": "outputs/2025-11-26-16-50-52/01_scene/checkpoint-800",
4
+ "epoch": 0.9067724567866251,
5
+ "eval_steps": 100,
6
+ "global_step": 800,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.05667327854916407,
13
+ "grad_norm": 4.339138507843018,
14
+ "learning_rate": 0.000125,
15
+ "loss": 0.3253,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.11334655709832814,
20
+ "grad_norm": 3.550264835357666,
21
+ "learning_rate": 0.00019997002254654227,
22
+ "loss": 0.0543,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.11334655709832814,
27
+ "eval_loss": 0.03937990590929985,
28
+ "eval_runtime": 90.138,
29
+ "eval_samples_per_second": 11.094,
30
+ "eval_steps_per_second": 11.094,
31
+ "step": 100
32
+ },
33
+ {
34
+ "epoch": 0.1700198356474922,
35
+ "grad_norm": 1.1671267747879028,
36
+ "learning_rate": 0.0001996329825692593,
37
+ "loss": 0.0431,
38
+ "step": 150
39
+ },
40
+ {
41
+ "epoch": 0.22669311419665628,
42
+ "grad_norm": 5.124600887298584,
43
+ "learning_rate": 0.00019892269762919834,
44
+ "loss": 0.0385,
45
+ "step": 200
46
+ },
47
+ {
48
+ "epoch": 0.22669311419665628,
49
+ "eval_loss": 0.039480071514844894,
50
+ "eval_runtime": 90.0332,
51
+ "eval_samples_per_second": 11.107,
52
+ "eval_steps_per_second": 11.107,
53
+ "step": 200
54
+ },
55
+ {
56
+ "epoch": 0.2833663927458204,
57
+ "grad_norm": 1.6820999383926392,
58
+ "learning_rate": 0.0001978418285949712,
59
+ "loss": 0.0438,
60
+ "step": 250
61
+ },
62
+ {
63
+ "epoch": 0.3400396712949844,
64
+ "grad_norm": 3.634084463119507,
65
+ "learning_rate": 0.0001963944246168898,
66
+ "loss": 0.0352,
67
+ "step": 300
68
+ },
69
+ {
70
+ "epoch": 0.3400396712949844,
71
+ "eval_loss": 0.05238571763038635,
72
+ "eval_runtime": 90.3736,
73
+ "eval_samples_per_second": 11.065,
74
+ "eval_steps_per_second": 11.065,
75
+ "step": 300
76
+ },
77
+ {
78
+ "epoch": 0.3967129498441485,
79
+ "grad_norm": 0.8966110348701477,
80
+ "learning_rate": 0.00019458590795804406,
81
+ "loss": 0.0319,
82
+ "step": 350
83
+ },
84
+ {
85
+ "epoch": 0.45338622839331255,
86
+ "grad_norm": 2.966634511947632,
87
+ "learning_rate": 0.00019242305368142622,
88
+ "loss": 0.0302,
89
+ "step": 400
90
+ },
91
+ {
92
+ "epoch": 0.45338622839331255,
93
+ "eval_loss": 0.034898512065410614,
94
+ "eval_runtime": 90.038,
95
+ "eval_samples_per_second": 11.106,
96
+ "eval_steps_per_second": 11.106,
97
+ "step": 400
98
+ },
99
+ {
100
+ "epoch": 0.5100595069424766,
101
+ "grad_norm": 3.242164373397827,
102
+ "learning_rate": 0.00018991396426919788,
103
+ "loss": 0.2013,
104
+ "step": 450
105
+ },
106
+ {
107
+ "epoch": 0.5667327854916407,
108
+ "grad_norm": 1.3575365543365479,
109
+ "learning_rate": 0.00018706803926918063,
110
+ "loss": 0.0265,
111
+ "step": 500
112
+ },
113
+ {
114
+ "epoch": 0.5667327854916407,
115
+ "eval_loss": 0.039046503603458405,
116
+ "eval_runtime": 90.0456,
117
+ "eval_samples_per_second": 11.105,
118
+ "eval_steps_per_second": 11.105,
119
+ "step": 500
120
+ },
121
+ {
122
+ "epoch": 0.6234060640408048,
123
+ "grad_norm": 0.3216783106327057,
124
+ "learning_rate": 0.00018389594008228123,
125
+ "loss": 0.0388,
126
+ "step": 550
127
+ },
128
+ {
129
+ "epoch": 0.6800793425899688,
130
+ "grad_norm": 0.2726989686489105,
131
+ "learning_rate": 0.00018040955002276377,
132
+ "loss": 0.0307,
133
+ "step": 600
134
+ },
135
+ {
136
+ "epoch": 0.6800793425899688,
137
+ "eval_loss": 0.0383339487016201,
138
+ "eval_runtime": 90.0647,
139
+ "eval_samples_per_second": 11.103,
140
+ "eval_steps_per_second": 11.103,
141
+ "step": 600
142
+ },
143
+ {
144
+ "epoch": 0.7367526211391329,
145
+ "grad_norm": 3.5234360694885254,
146
+ "learning_rate": 0.0001766219298009918,
147
+ "loss": 0.0261,
148
+ "step": 650
149
+ },
150
+ {
151
+ "epoch": 0.793425899688297,
152
+ "grad_norm": 0.45793426036834717,
153
+ "learning_rate": 0.00017254726859541043,
154
+ "loss": 0.022,
155
+ "step": 700
156
+ },
157
+ {
158
+ "epoch": 0.793425899688297,
159
+ "eval_loss": 0.03573239967226982,
160
+ "eval_runtime": 90.0501,
161
+ "eval_samples_per_second": 11.105,
162
+ "eval_steps_per_second": 11.105,
163
+ "step": 700
164
+ },
165
+ {
166
+ "epoch": 0.8500991782374611,
167
+ "grad_norm": 2.6463303565979004,
168
+ "learning_rate": 0.00016820083089706263,
169
+ "loss": 0.0254,
170
+ "step": 750
171
+ },
172
+ {
173
+ "epoch": 0.9067724567866251,
174
+ "grad_norm": 1.4524548053741455,
175
+ "learning_rate": 0.0001635988993257706,
176
+ "loss": 0.0188,
177
+ "step": 800
178
+ },
179
+ {
180
+ "epoch": 0.9067724567866251,
181
+ "eval_loss": 0.03386669233441353,
182
+ "eval_runtime": 90.2752,
183
+ "eval_samples_per_second": 11.077,
184
+ "eval_steps_per_second": 11.077,
185
+ "step": 800
186
+ }
187
+ ],
188
+ "logging_steps": 50,
189
+ "max_steps": 2646,
190
+ "num_input_tokens_seen": 0,
191
+ "num_train_epochs": 3,
192
+ "save_steps": 100,
193
+ "stateful_callbacks": {
194
+ "TrainerControl": {
195
+ "args": {
196
+ "should_epoch_stop": false,
197
+ "should_evaluate": false,
198
+ "should_log": false,
199
+ "should_save": true,
200
+ "should_training_stop": false
201
+ },
202
+ "attributes": {}
203
+ }
204
+ },
205
+ "total_flos": 0.0,
206
+ "train_batch_size": 1,
207
+ "trial_name": null,
208
+ "trial_params": null
209
+ }
2025-11-26-16-50-52/01_scene/checkpoint-800/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
2025-11-26-16-50-52/01_scene/trainer_state.json ADDED
@@ -0,0 +1,614 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.03386669233441353,
3
+ "best_model_checkpoint": "outputs/2025-11-26-16-50-52/01_scene/checkpoint-800",
4
+ "epoch": 2.9974497024652877,
5
+ "eval_steps": 100,
6
+ "global_step": 2646,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.05667327854916407,
13
+ "grad_norm": 4.339138507843018,
14
+ "learning_rate": 0.000125,
15
+ "loss": 0.3253,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.11334655709832814,
20
+ "grad_norm": 3.550264835357666,
21
+ "learning_rate": 0.00019997002254654227,
22
+ "loss": 0.0543,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.11334655709832814,
27
+ "eval_loss": 0.03937990590929985,
28
+ "eval_runtime": 90.138,
29
+ "eval_samples_per_second": 11.094,
30
+ "eval_steps_per_second": 11.094,
31
+ "step": 100
32
+ },
33
+ {
34
+ "epoch": 0.1700198356474922,
35
+ "grad_norm": 1.1671267747879028,
36
+ "learning_rate": 0.0001996329825692593,
37
+ "loss": 0.0431,
38
+ "step": 150
39
+ },
40
+ {
41
+ "epoch": 0.22669311419665628,
42
+ "grad_norm": 5.124600887298584,
43
+ "learning_rate": 0.00019892269762919834,
44
+ "loss": 0.0385,
45
+ "step": 200
46
+ },
47
+ {
48
+ "epoch": 0.22669311419665628,
49
+ "eval_loss": 0.039480071514844894,
50
+ "eval_runtime": 90.0332,
51
+ "eval_samples_per_second": 11.107,
52
+ "eval_steps_per_second": 11.107,
53
+ "step": 200
54
+ },
55
+ {
56
+ "epoch": 0.2833663927458204,
57
+ "grad_norm": 1.6820999383926392,
58
+ "learning_rate": 0.0001978418285949712,
59
+ "loss": 0.0438,
60
+ "step": 250
61
+ },
62
+ {
63
+ "epoch": 0.3400396712949844,
64
+ "grad_norm": 3.634084463119507,
65
+ "learning_rate": 0.0001963944246168898,
66
+ "loss": 0.0352,
67
+ "step": 300
68
+ },
69
+ {
70
+ "epoch": 0.3400396712949844,
71
+ "eval_loss": 0.05238571763038635,
72
+ "eval_runtime": 90.3736,
73
+ "eval_samples_per_second": 11.065,
74
+ "eval_steps_per_second": 11.065,
75
+ "step": 300
76
+ },
77
+ {
78
+ "epoch": 0.3967129498441485,
79
+ "grad_norm": 0.8966110348701477,
80
+ "learning_rate": 0.00019458590795804406,
81
+ "loss": 0.0319,
82
+ "step": 350
83
+ },
84
+ {
85
+ "epoch": 0.45338622839331255,
86
+ "grad_norm": 2.966634511947632,
87
+ "learning_rate": 0.00019242305368142622,
88
+ "loss": 0.0302,
89
+ "step": 400
90
+ },
91
+ {
92
+ "epoch": 0.45338622839331255,
93
+ "eval_loss": 0.034898512065410614,
94
+ "eval_runtime": 90.038,
95
+ "eval_samples_per_second": 11.106,
96
+ "eval_steps_per_second": 11.106,
97
+ "step": 400
98
+ },
99
+ {
100
+ "epoch": 0.5100595069424766,
101
+ "grad_norm": 3.242164373397827,
102
+ "learning_rate": 0.00018991396426919788,
103
+ "loss": 0.2013,
104
+ "step": 450
105
+ },
106
+ {
107
+ "epoch": 0.5667327854916407,
108
+ "grad_norm": 1.3575365543365479,
109
+ "learning_rate": 0.00018706803926918063,
110
+ "loss": 0.0265,
111
+ "step": 500
112
+ },
113
+ {
114
+ "epoch": 0.5667327854916407,
115
+ "eval_loss": 0.039046503603458405,
116
+ "eval_runtime": 90.0456,
117
+ "eval_samples_per_second": 11.105,
118
+ "eval_steps_per_second": 11.105,
119
+ "step": 500
120
+ },
121
+ {
122
+ "epoch": 0.6234060640408048,
123
+ "grad_norm": 0.3216783106327057,
124
+ "learning_rate": 0.00018389594008228123,
125
+ "loss": 0.0388,
126
+ "step": 550
127
+ },
128
+ {
129
+ "epoch": 0.6800793425899688,
130
+ "grad_norm": 0.2726989686489105,
131
+ "learning_rate": 0.00018040955002276377,
132
+ "loss": 0.0307,
133
+ "step": 600
134
+ },
135
+ {
136
+ "epoch": 0.6800793425899688,
137
+ "eval_loss": 0.0383339487016201,
138
+ "eval_runtime": 90.0647,
139
+ "eval_samples_per_second": 11.103,
140
+ "eval_steps_per_second": 11.103,
141
+ "step": 600
142
+ },
143
+ {
144
+ "epoch": 0.7367526211391329,
145
+ "grad_norm": 3.5234360694885254,
146
+ "learning_rate": 0.0001766219298009918,
147
+ "loss": 0.0261,
148
+ "step": 650
149
+ },
150
+ {
151
+ "epoch": 0.793425899688297,
152
+ "grad_norm": 0.45793426036834717,
153
+ "learning_rate": 0.00017254726859541043,
154
+ "loss": 0.022,
155
+ "step": 700
156
+ },
157
+ {
158
+ "epoch": 0.793425899688297,
159
+ "eval_loss": 0.03573239967226982,
160
+ "eval_runtime": 90.0501,
161
+ "eval_samples_per_second": 11.105,
162
+ "eval_steps_per_second": 11.105,
163
+ "step": 700
164
+ },
165
+ {
166
+ "epoch": 0.8500991782374611,
167
+ "grad_norm": 2.6463303565979004,
168
+ "learning_rate": 0.00016820083089706263,
169
+ "loss": 0.0254,
170
+ "step": 750
171
+ },
172
+ {
173
+ "epoch": 0.9067724567866251,
174
+ "grad_norm": 1.4524548053741455,
175
+ "learning_rate": 0.0001635988993257706,
176
+ "loss": 0.0188,
177
+ "step": 800
178
+ },
179
+ {
180
+ "epoch": 0.9067724567866251,
181
+ "eval_loss": 0.03386669233441353,
182
+ "eval_runtime": 90.2752,
183
+ "eval_samples_per_second": 11.077,
184
+ "eval_steps_per_second": 11.077,
185
+ "step": 800
186
+ },
187
+ {
188
+ "epoch": 0.9634457353357891,
189
+ "grad_norm": 1.8023526668548584,
190
+ "learning_rate": 0.0001587587136322047,
191
+ "loss": 0.0165,
192
+ "step": 850
193
+ },
194
+ {
195
+ "epoch": 1.0192689147067158,
196
+ "grad_norm": 2.767707586288452,
197
+ "learning_rate": 0.00015369840611434952,
198
+ "loss": 0.0167,
199
+ "step": 900
200
+ },
201
+ {
202
+ "epoch": 1.0192689147067158,
203
+ "eval_loss": 0.03767446056008339,
204
+ "eval_runtime": 90.0949,
205
+ "eval_samples_per_second": 11.099,
206
+ "eval_steps_per_second": 11.099,
207
+ "step": 900
208
+ },
209
+ {
210
+ "epoch": 1.07594219325588,
211
+ "grad_norm": 1.664818286895752,
212
+ "learning_rate": 0.0001484369336903102,
213
+ "loss": 0.0128,
214
+ "step": 950
215
+ },
216
+ {
217
+ "epoch": 1.1326154718050438,
218
+ "grad_norm": 1.7656521797180176,
219
+ "learning_rate": 0.00014299400688192834,
220
+ "loss": 0.0143,
221
+ "step": 1000
222
+ },
223
+ {
224
+ "epoch": 1.1326154718050438,
225
+ "eval_loss": 0.03644219785928726,
226
+ "eval_runtime": 90.3385,
227
+ "eval_samples_per_second": 11.069,
228
+ "eval_steps_per_second": 11.069,
229
+ "step": 1000
230
+ },
231
+ {
232
+ "epoch": 1.189288750354208,
233
+ "grad_norm": 2.3746185302734375,
234
+ "learning_rate": 0.00013739001597524786,
235
+ "loss": 0.009,
236
+ "step": 1050
237
+ },
238
+ {
239
+ "epoch": 1.2459620289033722,
240
+ "grad_norm": 0.08090117573738098,
241
+ "learning_rate": 0.00013164595463444938,
242
+ "loss": 0.0118,
243
+ "step": 1100
244
+ },
245
+ {
246
+ "epoch": 1.2459620289033722,
247
+ "eval_loss": 0.04246201366186142,
248
+ "eval_runtime": 90.0776,
249
+ "eval_samples_per_second": 11.102,
250
+ "eval_steps_per_second": 11.102,
251
+ "step": 1100
252
+ },
253
+ {
254
+ "epoch": 1.302635307452536,
255
+ "grad_norm": 0.5375736355781555,
256
+ "learning_rate": 0.00012578334125540997,
257
+ "loss": 0.0072,
258
+ "step": 1150
259
+ },
260
+ {
261
+ "epoch": 1.3593085860017002,
262
+ "grad_norm": 0.021600479260087013,
263
+ "learning_rate": 0.00011982413835351374,
264
+ "loss": 0.0085,
265
+ "step": 1200
266
+ },
267
+ {
268
+ "epoch": 1.3593085860017002,
269
+ "eval_loss": 0.04923948645591736,
270
+ "eval_runtime": 90.089,
271
+ "eval_samples_per_second": 11.1,
272
+ "eval_steps_per_second": 11.1,
273
+ "step": 1200
274
+ },
275
+ {
276
+ "epoch": 1.4159818645508642,
277
+ "grad_norm": 0.4288318157196045,
278
+ "learning_rate": 0.00011379067028770236,
279
+ "loss": 0.0078,
280
+ "step": 1250
281
+ },
282
+ {
283
+ "epoch": 1.4726551431000283,
284
+ "grad_norm": 0.5142509937286377,
285
+ "learning_rate": 0.00010770553962898767,
286
+ "loss": 0.0076,
287
+ "step": 1300
288
+ },
289
+ {
290
+ "epoch": 1.4726551431000283,
291
+ "eval_loss": 0.04984404519200325,
292
+ "eval_runtime": 90.3286,
293
+ "eval_samples_per_second": 11.071,
294
+ "eval_steps_per_second": 11.071,
295
+ "step": 1300
296
+ },
297
+ {
298
+ "epoch": 1.5293284216491925,
299
+ "grad_norm": 0.4817846417427063,
300
+ "learning_rate": 0.00010159154248672667,
301
+ "loss": 0.0056,
302
+ "step": 1350
303
+ },
304
+ {
305
+ "epoch": 1.5860017001983566,
306
+ "grad_norm": 3.8564178943634033,
307
+ "learning_rate": 9.547158310986322e-05,
308
+ "loss": 0.0084,
309
+ "step": 1400
310
+ },
311
+ {
312
+ "epoch": 1.5860017001983566,
313
+ "eval_loss": 0.04620128124952316,
314
+ "eval_runtime": 90.0981,
315
+ "eval_samples_per_second": 11.099,
316
+ "eval_steps_per_second": 11.099,
317
+ "step": 1400
318
+ },
319
+ {
320
+ "epoch": 1.6426749787475206,
321
+ "grad_norm": 0.38633376359939575,
322
+ "learning_rate": 8.936858808305755e-05,
323
+ "loss": 0.0044,
324
+ "step": 1450
325
+ },
326
+ {
327
+ "epoch": 1.6993482572966845,
328
+ "grad_norm": 0.0634835809469223,
329
+ "learning_rate": 8.330542043914149e-05,
330
+ "loss": 0.0049,
331
+ "step": 1500
332
+ },
333
+ {
334
+ "epoch": 1.6993482572966845,
335
+ "eval_loss": 0.058556605130434036,
336
+ "eval_runtime": 90.1882,
337
+ "eval_samples_per_second": 11.088,
338
+ "eval_steps_per_second": 11.088,
339
+ "step": 1500
340
+ },
341
+ {
342
+ "epoch": 1.7560215358458486,
343
+ "grad_norm": 0.17512574791908264,
344
+ "learning_rate": 7.730479400965156e-05,
345
+ "loss": 0.007,
346
+ "step": 1550
347
+ },
348
+ {
349
+ "epoch": 1.8126948143950128,
350
+ "grad_norm": 0.7759736776351929,
351
+ "learning_rate": 7.138918833430014e-05,
352
+ "loss": 0.0057,
353
+ "step": 1600
354
+ },
355
+ {
356
+ "epoch": 1.8126948143950128,
357
+ "eval_loss": 0.05251866579055786,
358
+ "eval_runtime": 90.2837,
359
+ "eval_samples_per_second": 11.076,
360
+ "eval_steps_per_second": 11.076,
361
+ "step": 1600
362
+ },
363
+ {
364
+ "epoch": 1.869368092944177,
365
+ "grad_norm": 2.2165539264678955,
366
+ "learning_rate": 6.55807644481498e-05,
367
+ "loss": 0.0074,
368
+ "step": 1650
369
+ },
370
+ {
371
+ "epoch": 1.9260413714933409,
372
+ "grad_norm": 0.005767249036580324,
373
+ "learning_rate": 5.990128186196971e-05,
374
+ "loss": 0.0034,
375
+ "step": 1700
376
+ },
377
+ {
378
+ "epoch": 1.9260413714933409,
379
+ "eval_loss": 0.05016641691327095,
380
+ "eval_runtime": 90.0478,
381
+ "eval_samples_per_second": 11.105,
382
+ "eval_steps_per_second": 11.105,
383
+ "step": 1700
384
+ },
385
+ {
386
+ "epoch": 1.9827146500425048,
387
+ "grad_norm": 0.031133009120821953,
388
+ "learning_rate": 5.437201704678196e-05,
389
+ "loss": 0.0031,
390
+ "step": 1750
391
+ },
392
+ {
393
+ "epoch": 2.0385378294134315,
394
+ "grad_norm": 0.014959041960537434,
395
+ "learning_rate": 4.901368372797065e-05,
396
+ "loss": 0.0037,
397
+ "step": 1800
398
+ },
399
+ {
400
+ "epoch": 2.0385378294134315,
401
+ "eval_loss": 0.05385539308190346,
402
+ "eval_runtime": 90.2625,
403
+ "eval_samples_per_second": 11.079,
404
+ "eval_steps_per_second": 11.079,
405
+ "step": 1800
406
+ },
407
+ {
408
+ "epoch": 2.0952111079625957,
409
+ "grad_norm": 1.9791584014892578,
410
+ "learning_rate": 4.3846355287547944e-05,
411
+ "loss": 0.0024,
412
+ "step": 1850
413
+ },
414
+ {
415
+ "epoch": 2.15188438651176,
416
+ "grad_norm": 2.0488200187683105,
417
+ "learning_rate": 3.8889389565274035e-05,
418
+ "loss": 0.0025,
419
+ "step": 1900
420
+ },
421
+ {
422
+ "epoch": 2.15188438651176,
423
+ "eval_loss": 0.059098273515701294,
424
+ "eval_runtime": 90.2105,
425
+ "eval_samples_per_second": 11.085,
426
+ "eval_steps_per_second": 11.085,
427
+ "step": 1900
428
+ },
429
+ {
430
+ "epoch": 2.2085576650609235,
431
+ "grad_norm": 0.021802732720971107,
432
+ "learning_rate": 3.416135634034054e-05,
433
+ "loss": 0.0031,
434
+ "step": 1950
435
+ },
436
+ {
437
+ "epoch": 2.2652309436100877,
438
+ "grad_norm": 0.00933904480189085,
439
+ "learning_rate": 2.9679967765285032e-05,
440
+ "loss": 0.0017,
441
+ "step": 2000
442
+ },
443
+ {
444
+ "epoch": 2.2652309436100877,
445
+ "eval_loss": 0.06034567207098007,
446
+ "eval_runtime": 90.0945,
447
+ "eval_samples_per_second": 11.099,
448
+ "eval_steps_per_second": 11.099,
449
+ "step": 2000
450
+ },
451
+ {
452
+ "epoch": 2.321904222159252,
453
+ "grad_norm": 0.00511200213804841,
454
+ "learning_rate": 2.5462012012746407e-05,
455
+ "loss": 0.002,
456
+ "step": 2050
457
+ },
458
+ {
459
+ "epoch": 2.378577500708416,
460
+ "grad_norm": 0.002153645269572735,
461
+ "learning_rate": 2.1523290383631732e-05,
462
+ "loss": 0.0031,
463
+ "step": 2100
464
+ },
465
+ {
466
+ "epoch": 2.378577500708416,
467
+ "eval_loss": 0.06627824157476425,
468
+ "eval_runtime": 90.4408,
469
+ "eval_samples_per_second": 11.057,
470
+ "eval_steps_per_second": 11.057,
471
+ "step": 2100
472
+ },
473
+ {
474
+ "epoch": 2.43525077925758,
475
+ "grad_norm": 0.04121919348835945,
476
+ "learning_rate": 1.7878558112301648e-05,
477
+ "loss": 0.0009,
478
+ "step": 2150
479
+ },
480
+ {
481
+ "epoch": 2.4919240578067443,
482
+ "grad_norm": 0.04373983293771744,
483
+ "learning_rate": 1.4541469090528393e-05,
484
+ "loss": 0.001,
485
+ "step": 2200
486
+ },
487
+ {
488
+ "epoch": 2.4919240578067443,
489
+ "eval_loss": 0.06941419839859009,
490
+ "eval_runtime": 90.2245,
491
+ "eval_samples_per_second": 11.083,
492
+ "eval_steps_per_second": 11.083,
493
+ "step": 2200
494
+ },
495
+ {
496
+ "epoch": 2.5485973363559085,
497
+ "grad_norm": 7.789440631866455,
498
+ "learning_rate": 1.1524524717302532e-05,
499
+ "loss": 0.001,
500
+ "step": 2250
501
+ },
502
+ {
503
+ "epoch": 2.605270614905072,
504
+ "grad_norm": 0.01206847745925188,
505
+ "learning_rate": 8.839027066106675e-06,
506
+ "loss": 0.0004,
507
+ "step": 2300
508
+ },
509
+ {
510
+ "epoch": 2.605270614905072,
511
+ "eval_loss": 0.06566739082336426,
512
+ "eval_runtime": 90.2027,
513
+ "eval_samples_per_second": 11.086,
514
+ "eval_steps_per_second": 11.086,
515
+ "step": 2300
516
+ },
517
+ {
518
+ "epoch": 2.6619438934542363,
519
+ "grad_norm": 0.017873864620923996,
520
+ "learning_rate": 6.4950365451009295e-06,
521
+ "loss": 0.0024,
522
+ "step": 2350
523
+ },
524
+ {
525
+ "epoch": 2.7186171720034005,
526
+ "grad_norm": 8.548991203308105,
527
+ "learning_rate": 4.501334208833807e-06,
528
+ "loss": 0.0031,
529
+ "step": 2400
530
+ },
531
+ {
532
+ "epoch": 2.7186171720034005,
533
+ "eval_loss": 0.07288677245378494,
534
+ "eval_runtime": 90.4099,
535
+ "eval_samples_per_second": 11.061,
536
+ "eval_steps_per_second": 11.061,
537
+ "step": 2400
538
+ },
539
+ {
540
+ "epoch": 2.7752904505525646,
541
+ "grad_norm": 0.008294968865811825,
542
+ "learning_rate": 2.8653888626656855e-06,
543
+ "loss": 0.0034,
544
+ "step": 2450
545
+ },
546
+ {
547
+ "epoch": 2.8319637291017283,
548
+ "grad_norm": 0.8166434168815613,
549
+ "learning_rate": 1.5933290831391612e-06,
550
+ "loss": 0.0022,
551
+ "step": 2500
552
+ },
553
+ {
554
+ "epoch": 2.8319637291017283,
555
+ "eval_loss": 0.06702987104654312,
556
+ "eval_runtime": 90.0868,
557
+ "eval_samples_per_second": 11.1,
558
+ "eval_steps_per_second": 11.1,
559
+ "step": 2500
560
+ },
561
+ {
562
+ "epoch": 2.8886370076508925,
563
+ "grad_norm": 0.009677646681666374,
564
+ "learning_rate": 6.899202591125642e-07,
565
+ "loss": 0.0029,
566
+ "step": 2550
567
+ },
568
+ {
569
+ "epoch": 2.9453102862000566,
570
+ "grad_norm": 0.050994016230106354,
571
+ "learning_rate": 1.5854673966592127e-07,
572
+ "loss": 0.0016,
573
+ "step": 2600
574
+ },
575
+ {
576
+ "epoch": 2.9453102862000566,
577
+ "eval_loss": 0.06189265847206116,
578
+ "eval_runtime": 90.2429,
579
+ "eval_samples_per_second": 11.081,
580
+ "eval_steps_per_second": 11.081,
581
+ "step": 2600
582
+ },
583
+ {
584
+ "epoch": 2.9974497024652877,
585
+ "step": 2646,
586
+ "total_flos": 0.0,
587
+ "train_loss": 0.022518052387270213,
588
+ "train_runtime": 6849.0772,
589
+ "train_samples_per_second": 3.092,
590
+ "train_steps_per_second": 0.386
591
+ }
592
+ ],
593
+ "logging_steps": 50,
594
+ "max_steps": 2646,
595
+ "num_input_tokens_seen": 0,
596
+ "num_train_epochs": 3,
597
+ "save_steps": 100,
598
+ "stateful_callbacks": {
599
+ "TrainerControl": {
600
+ "args": {
601
+ "should_epoch_stop": false,
602
+ "should_evaluate": false,
603
+ "should_log": false,
604
+ "should_save": true,
605
+ "should_training_stop": false
606
+ },
607
+ "attributes": {}
608
+ }
609
+ },
610
+ "total_flos": 0.0,
611
+ "train_batch_size": 1,
612
+ "trial_name": null,
613
+ "trial_params": null
614
+ }
2025-11-26-16-50-52/01_scene/training_arguments.json ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_dir": "outputs/2025-11-26-16-50-52/01_scene",
3
+ "overwrite_output_dir": false,
4
+ "do_train": false,
5
+ "do_eval": true,
6
+ "do_predict": false,
7
+ "eval_strategy": "steps",
8
+ "prediction_loss_only": false,
9
+ "per_device_train_batch_size": 1,
10
+ "per_device_eval_batch_size": 1,
11
+ "per_gpu_train_batch_size": null,
12
+ "per_gpu_eval_batch_size": null,
13
+ "gradient_accumulation_steps": 8,
14
+ "eval_accumulation_steps": null,
15
+ "eval_delay": 0,
16
+ "torch_empty_cache_steps": null,
17
+ "learning_rate": 0.0002,
18
+ "weight_decay": 0.0,
19
+ "adam_beta1": 0.9,
20
+ "adam_beta2": 0.999,
21
+ "adam_epsilon": 1e-08,
22
+ "max_grad_norm": 1.0,
23
+ "num_train_epochs": 3,
24
+ "max_steps": -1,
25
+ "lr_scheduler_type": "cosine",
26
+ "lr_scheduler_kwargs": {},
27
+ "warmup_ratio": 0.03,
28
+ "warmup_steps": 0,
29
+ "log_level": "passive",
30
+ "log_level_replica": "warning",
31
+ "log_on_each_node": true,
32
+ "logging_dir": "outputs/2025-11-26-16-50-52/01_scene/runs/Nov26_16-50-56_fbi-mass-surveillance-network",
33
+ "logging_strategy": "steps",
34
+ "logging_first_step": false,
35
+ "logging_steps": 50,
36
+ "logging_nan_inf_filter": true,
37
+ "save_strategy": "best",
38
+ "save_steps": 100,
39
+ "save_total_limit": 5,
40
+ "save_safetensors": true,
41
+ "save_on_each_node": false,
42
+ "save_only_model": false,
43
+ "restore_callback_states_from_checkpoint": false,
44
+ "no_cuda": false,
45
+ "use_cpu": false,
46
+ "use_mps_device": false,
47
+ "seed": 42,
48
+ "data_seed": null,
49
+ "jit_mode_eval": false,
50
+ "use_ipex": false,
51
+ "bf16": true,
52
+ "fp16": false,
53
+ "fp16_opt_level": "O1",
54
+ "half_precision_backend": "auto",
55
+ "bf16_full_eval": false,
56
+ "fp16_full_eval": false,
57
+ "tf32": null,
58
+ "local_rank": 0,
59
+ "ddp_backend": null,
60
+ "tpu_num_cores": null,
61
+ "tpu_metrics_debug": false,
62
+ "debug": [],
63
+ "dataloader_drop_last": false,
64
+ "eval_steps": 100,
65
+ "dataloader_num_workers": 12,
66
+ "dataloader_prefetch_factor": null,
67
+ "past_index": -1,
68
+ "run_name": "outputs/2025-11-26-16-50-52/01_scene",
69
+ "disable_tqdm": false,
70
+ "remove_unused_columns": false,
71
+ "label_names": null,
72
+ "load_best_model_at_end": false,
73
+ "metric_for_best_model": "eval_loss",
74
+ "greater_is_better": false,
75
+ "ignore_data_skip": false,
76
+ "fsdp": [],
77
+ "fsdp_min_num_params": 0,
78
+ "fsdp_config": {
79
+ "min_num_params": 0,
80
+ "xla": false,
81
+ "xla_fsdp_v2": false,
82
+ "xla_fsdp_grad_ckpt": false
83
+ },
84
+ "fsdp_transformer_layer_cls_to_wrap": null,
85
+ "accelerator_config": {
86
+ "split_batches": false,
87
+ "dispatch_batches": null,
88
+ "even_batches": true,
89
+ "use_seedable_sampler": true,
90
+ "non_blocking": false,
91
+ "gradient_accumulation_kwargs": null
92
+ },
93
+ "deepspeed": null,
94
+ "label_smoothing_factor": 0.0,
95
+ "optim": "adamw_torch",
96
+ "optim_args": null,
97
+ "adafactor": false,
98
+ "group_by_length": false,
99
+ "length_column_name": "length",
100
+ "report_to": [],
101
+ "ddp_find_unused_parameters": null,
102
+ "ddp_bucket_cap_mb": null,
103
+ "ddp_broadcast_buffers": null,
104
+ "dataloader_pin_memory": true,
105
+ "dataloader_persistent_workers": false,
106
+ "skip_memory_metrics": true,
107
+ "use_legacy_prediction_loop": false,
108
+ "push_to_hub": false,
109
+ "resume_from_checkpoint": null,
110
+ "hub_model_id": null,
111
+ "hub_strategy": "every_save",
112
+ "hub_token": "<HUB_TOKEN>",
113
+ "hub_private_repo": null,
114
+ "hub_always_push": false,
115
+ "gradient_checkpointing": false,
116
+ "gradient_checkpointing_kwargs": null,
117
+ "include_inputs_for_metrics": false,
118
+ "include_for_metrics": [],
119
+ "eval_do_concat_batches": true,
120
+ "fp16_backend": "auto",
121
+ "evaluation_strategy": null,
122
+ "push_to_hub_model_id": null,
123
+ "push_to_hub_organization": null,
124
+ "push_to_hub_token": "<PUSH_TO_HUB_TOKEN>",
125
+ "mp_parameters": "",
126
+ "auto_find_batch_size": false,
127
+ "full_determinism": false,
128
+ "torchdynamo": null,
129
+ "ray_scope": "last",
130
+ "ddp_timeout": 1800,
131
+ "torch_compile": false,
132
+ "torch_compile_backend": null,
133
+ "torch_compile_mode": null,
134
+ "dispatch_batches": null,
135
+ "split_batches": null,
136
+ "include_tokens_per_second": false,
137
+ "include_num_input_tokens_seen": false,
138
+ "neftune_noise_alpha": null,
139
+ "optim_target_modules": null,
140
+ "batch_eval_metrics": false,
141
+ "eval_on_start": false,
142
+ "use_liger_kernel": false,
143
+ "eval_use_gather_object": false,
144
+ "average_tokens_across_devices": false
145
+ }
2025-11-26-16-50-52/02_distortion/checkpoint-100/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: src/owl3
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:src/owl3
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.17.1
2025-11-26-16-50-52/02_distortion/checkpoint-100/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "src/owl3",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 32,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.05,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 16,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "up_proj",
29
+ "q_proj",
30
+ "o_proj",
31
+ "down_proj",
32
+ "k_proj",
33
+ "gate_proj",
34
+ "v_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
2025-11-26-16-50-52/02_distortion/checkpoint-100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb1386821747de6d1a5b0ba73dbf4d24188c9f3e9b27b573f4002e94649a9dde
3
+ size 173507576
2025-11-26-16-50-52/02_distortion/checkpoint-100/added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 151643,
3
+ "<|im_end|>": 151645,
4
+ "<|im_start|>": 151644
5
+ }
2025-11-26-16-50-52/02_distortion/checkpoint-100/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
2025-11-26-16-50-52/02_distortion/checkpoint-100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8346e7a936237a4735b950a17fe81223785ebef1b9b837352f0ab78466d619cd
3
+ size 346426383
2025-11-26-16-50-52/02_distortion/checkpoint-100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c45eee438afc8a5dc3aa37e234d797ee8f451c3558934c36fd101e2e774bb4f3
3
+ size 14645
2025-11-26-16-50-52/02_distortion/checkpoint-100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:441ef59f1d5d6d1eba1fd1f9f4c9a8660f66e566f1b8b3b88c35a5dad7fb26b6
3
+ size 1465