Fizzarolli committed on
Commit
6cd374c
·
verified ·
1 Parent(s): 1007943

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags:
4
+ - generated_from_trainer
5
+ datasets:
6
+ - allura-forge/expr-rp-sft-mix
7
+ model-index:
8
+ - name: output
9
+ results: []
10
+ ---
11
+
12
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
+ should probably proofread and complete it, then remove this comment. -->
14
+
15
+ [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
16
+ <details><summary>See axolotl config</summary>
17
+
18
+ axolotl version: `0.13.0.dev0`
19
+ ```yaml
20
+ ## model
21
+ base_model: ./model
22
+
23
+ ## qlora COPE!!!
24
+ load_in_8bit: false
25
+ load_in_4bit: false #false
26
+ strict: false
27
+
28
+ # === Data Configuration ===
29
+ datasets:
30
+ - path: allura-forge/expr-rp-sft-mix
31
+ type: chat_template
32
+ split: train
33
+ field_messages: conversations
34
+ message_field_role: from
35
+ message_field_content: value
36
+
37
+ chat_template: jinja
38
+ chat_template_jinja: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = 'user' %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"
39
+
40
+ shuffle_merged_datasets: true
41
+ dataset_prepared_path: dataset_prepareds
42
+ val_set_size: 0.0
43
+ output_dir: ./output
44
+
45
+ max_grad_norm: 0.1
46
+
47
+ ## Liger + CCE
48
+ plugins:
49
+ - axolotl.integrations.liger.LigerPlugin
50
+ - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
51
+ liger_rope: true
52
+ liger_rms_norm: true
53
+ liger_layer_norm: true
54
+ liger_glu_activation: true
55
+ liger_fused_linear_cross_entropy: false
56
+ cut_cross_entropy: true
57
+
58
+ ## CTX settings
59
+ sequence_len: 16384
60
+ sample_packing: true
61
+ eval_sample_packing: false
62
+ pad_to_sequence_len: true
63
+
64
+ ## WandB
65
+ wandb_project: g12b-slopification
66
+ wandb_entity:
67
+ wandb_watch:
68
+ wandb_name:
69
+ wandb_log_model:
70
+
71
+ ## hoe params
72
+ gradient_accumulation_steps: 2 # ???
73
+ micro_batch_size: 2
74
+ num_epochs: 2
75
+ lr_scheduler: rex
76
+ learning_rate: 2e-6
77
+ optimizer: adamw_torch_8bit # Options: "paged_ademamix_8bit", "adamw_bnb_8bit", "paged_adamw_8bit"
78
+
79
+ train_on_inputs: false
80
+ group_by_length: false
81
+ bf16: auto
82
+ fp16:
83
+ tf32: false
84
+
85
+ gradient_checkpointing: offload
86
+ early_stopping_patience:
87
+ resume_from_checkpoint:
88
+ local_rank:
89
+ logging_steps: 1
90
+ xformers_attention:
91
+ flash_attention: true
92
+ s2_attention:
93
+ special_tokens:
94
+ eos_token: "<end_of_turn>"
95
+
96
+ warmup_steps: 25
97
+ saves_per_epoch: 4
98
+ debug:
99
+ weight_decay: 0.0
100
+ fsdp:
101
+ - full_shard
102
+ - auto_wrap
103
+ fsdp_config:
104
+ fsdp_activation_checkpointing: true
105
+ fsdp_limit_all_gathers: true
106
+ fsdp_use_orig_params: false
107
+ fsdp_cpu_ram_efficient_loading: true
108
+ fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
109
+ fsdp_transformer_layer_cls_to_wrap: Gemma3DecoderLayer
110
+ fsdp_state_dict_type: FULL_STATE_DICT
111
+ fsdp_reshard_after_forward: true
112
+ fsdp_version: 2
113
+
114
+
115
+ ```
116
+
117
+ </details><br>
118
+
119
+ # output
120
+
121
+ This model was fine-tuned from a local base checkpoint (`base_model: ./model` in the axolotl config above) on the allura-forge/expr-rp-sft-mix dataset.
122
+
123
+ ## Model description
124
+
125
+ More information needed
126
+
127
+ ## Intended uses & limitations
128
+
129
+ More information needed
130
+
131
+ ## Training and evaluation data
132
+
133
+ More information needed
134
+
135
+ ## Training procedure
136
+
137
+ ### Training hyperparameters
138
+
139
+ The following hyperparameters were used during training:
140
+ - learning_rate: 2e-06
141
+ - train_batch_size: 2
142
+ - eval_batch_size: 2
143
+ - seed: 42
144
+ - distributed_type: multi-GPU
145
+ - num_devices: 8
146
+ - gradient_accumulation_steps: 2
147
+ - total_train_batch_size: 32
148
+ - total_eval_batch_size: 16
149
+ - optimizer: Use OptimizerNames.ADAMW_TORCH_8BIT with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
150
+ - lr_scheduler_type: cosine
151
+ - lr_scheduler_warmup_steps: 25
152
+ - training_steps: 538
153
+
154
+ ### Training results
155
+
156
+
157
+
158
+ ### Framework versions
159
+
160
+ - Transformers 4.57.1
161
+ - Pytorch 2.8.0+cu129
162
+ - Datasets 4.4.1
163
+ - Tokenizers 0.22.1
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
chat_template.jinja ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {{ bos_token }}{% for message in messages %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = 'user' %}{% endif %}{{ '<start_of_turn>' + role + '
2
+ ' + message['content'] | trim + '<end_of_turn>
3
+ ' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model
4
+ '}}{% endif %}
config.json ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Gemma3ForConditionalGeneration"
4
+ ],
5
+ "boi_token_index": 255999,
6
+ "bos_token_id": 2,
7
+ "dtype": "bfloat16",
8
+ "eoi_token_index": 256000,
9
+ "eos_token_id": 106,
10
+ "image_token_index": 262144,
11
+ "initializer_range": 0.02,
12
+ "mm_tokens_per_image": 256,
13
+ "model_type": "gemma3",
14
+ "pad_token_id": 0,
15
+ "text_config": {
16
+ "_sliding_window_pattern": 6,
17
+ "attention_bias": false,
18
+ "attention_dropout": 0.0,
19
+ "attn_logit_softcapping": null,
20
+ "dtype": "bfloat16",
21
+ "final_logit_softcapping": null,
22
+ "head_dim": 256,
23
+ "hidden_activation": "gelu_pytorch_tanh",
24
+ "hidden_size": 3840,
25
+ "initializer_range": 0.02,
26
+ "intermediate_size": 15360,
27
+ "layer_types": [
28
+ "sliding_attention",
29
+ "sliding_attention",
30
+ "sliding_attention",
31
+ "sliding_attention",
32
+ "sliding_attention",
33
+ "full_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "sliding_attention",
37
+ "sliding_attention",
38
+ "sliding_attention",
39
+ "full_attention",
40
+ "sliding_attention",
41
+ "sliding_attention",
42
+ "sliding_attention",
43
+ "sliding_attention",
44
+ "sliding_attention",
45
+ "full_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "sliding_attention",
49
+ "sliding_attention",
50
+ "sliding_attention",
51
+ "full_attention",
52
+ "sliding_attention",
53
+ "sliding_attention",
54
+ "sliding_attention",
55
+ "sliding_attention",
56
+ "sliding_attention",
57
+ "full_attention",
58
+ "sliding_attention",
59
+ "sliding_attention",
60
+ "sliding_attention",
61
+ "sliding_attention",
62
+ "sliding_attention",
63
+ "full_attention",
64
+ "sliding_attention",
65
+ "sliding_attention",
66
+ "sliding_attention",
67
+ "sliding_attention",
68
+ "sliding_attention",
69
+ "full_attention",
70
+ "sliding_attention",
71
+ "sliding_attention",
72
+ "sliding_attention",
73
+ "sliding_attention",
74
+ "sliding_attention",
75
+ "full_attention"
76
+ ],
77
+ "max_position_embeddings": 131072,
78
+ "model_type": "gemma3_text",
79
+ "num_attention_heads": 16,
80
+ "num_hidden_layers": 48,
81
+ "num_key_value_heads": 8,
82
+ "query_pre_attn_scalar": 256,
83
+ "rms_norm_eps": 1e-06,
84
+ "rope_local_base_freq": 10000.0,
85
+ "rope_scaling": {
86
+ "factor": 8.0,
87
+ "rope_type": "linear"
88
+ },
89
+ "rope_theta": 1000000.0,
90
+ "sliding_window": 1024,
91
+ "use_bidirectional_attention": false,
92
+ "use_cache": false,
93
+ "vocab_size": 262208
94
+ },
95
+ "transformers_version": "4.57.1",
96
+ "vision_config": {
97
+ "attention_dropout": 0.0,
98
+ "dtype": "bfloat16",
99
+ "hidden_act": "gelu_pytorch_tanh",
100
+ "hidden_size": 1152,
101
+ "image_size": 896,
102
+ "intermediate_size": 4304,
103
+ "layer_norm_eps": 1e-06,
104
+ "model_type": "siglip_vision_model",
105
+ "num_attention_heads": 16,
106
+ "num_channels": 3,
107
+ "num_hidden_layers": 27,
108
+ "patch_size": 14,
109
+ "vision_use_head": false
110
+ }
111
+ }
debug.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+
2
+
3
+ 
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 2,
3
+ "cache_implementation": "hybrid",
4
+ "do_sample": true,
5
+ "eos_token_id": [
6
+ 1,
7
+ 106
8
+ ],
9
+ "pad_token_id": 0,
10
+ "top_k": 64,
11
+ "top_p": 0.95,
12
+ "transformers_version": "4.57.1"
13
+ }
model-00001-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6a76cf40cd1037dc441d030d34d312f5c7bf4fce2b12cf15eff94ae46747109
3
+ size 4979902192
model-00002-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aada6d93fd52ecbcfa6db6216f00fc97657c9a5fb5c5b9746ee58ab6371d7db7
3
+ size 4931296592
model-00003-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32a8a36420d06a2841080a152762d793141a9902362df04b6bdb5f58c3215104
3
+ size 4931296656
model-00004-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19d426e6bcc681eb650de8d428df25f7da57351709f9936ccbd30177f305c15d
3
+ size 4931296656
model-00005-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72388d3ef9cbbc1c5f885e4b89d3d54c80f92e6be593cd0bc0cd103e4046ccb2
3
+ size 4601000928
model-00006-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e8560c7412509c196e418b40a5b1c232e1316e84d1ee9a2e69c7092276b3ecb
3
+ size 2013757584
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_pan_and_scan": null,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_mean": [
8
+ 0.5,
9
+ 0.5,
10
+ 0.5
11
+ ],
12
+ "image_processor_type": "Gemma3ImageProcessor",
13
+ "image_seq_length": 256,
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "pan_and_scan_max_num_crops": null,
20
+ "pan_and_scan_min_crop_size": null,
21
+ "pan_and_scan_min_ratio_to_activate": null,
22
+ "processor_class": "Gemma3Processor",
23
+ "resample": 2,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "height": 896,
27
+ "width": 896
28
+ }
29
+ }
processor_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "image_seq_length": 256,
3
+ "processor_class": "Gemma3Processor"
4
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<end_of_turn>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
3
+ size 33384568
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94418d27fb78e24727a6ba69c32af8cbe7bd62c80f6ad31dd25787775bd9570a
3
+ size 8081