gpetruzella committed on
Commit
21fa42a
·
verified ·
1 Parent(s): 5bee899

Upload 18 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,58 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: other
4
+ base_model: unsloth/gemma-3-1b-it
5
+ tags:
6
+ - llama-factory
7
+ - lora
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: train_2025-07-14-13-26-41
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # train_2025-07-14-13-26-41
18
+
19
+ This model is a fine-tuned version of [unsloth/gemma-3-1b-it](https://huggingface.co/unsloth/gemma-3-1b-it) on the oit dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 5e-05
39
+ - train_batch_size: 3
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - gradient_accumulation_steps: 8
43
+ - total_train_batch_size: 24
44
+ - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
45
+ - lr_scheduler_type: cosine
46
+ - num_epochs: 3.0
47
+
48
+ ### Training results
49
+
50
+
51
+
52
+ ### Framework versions
53
+
54
+ - PEFT 0.15.2
55
+ - Transformers 4.52.4
56
+ - Pytorch 2.7.1+cu128
57
+ - Datasets 3.6.0
58
+ - Tokenizers 0.21.1
adapter_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "unsloth/gemma-3-1b-it",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 16,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 8,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": [
27
+ "q_proj",
28
+ "down_proj",
29
+ "up_proj",
30
+ "k_proj",
31
+ "gate_proj",
32
+ "v_proj",
33
+ "o_proj"
34
+ ],
35
+ "task_type": "CAUSAL_LM",
36
+ "trainable_token_indices": null,
37
+ "use_dora": false,
38
+ "use_rslora": false
39
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:515a17c1f2cbe015ca7b84fee3e8d21123b1f97de1f128c50530d5e0472e7104
3
+ size 26139264
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "num_input_tokens_seen": 85968,
4
+ "total_flos": 363344926666752.0,
5
+ "train_loss": 3.3720982670783997,
6
+ "train_runtime": 88.7993,
7
+ "train_samples_per_second": 9.155,
8
+ "train_steps_per_second": 0.405
9
+ }
chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
llamaboard_config.yaml ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ top.booster: auto
2
+ top.checkpoint_path: []
3
+ top.finetuning_type: lora
4
+ top.model_name: Gemma-3-1B-Instruct
5
+ top.quantization_bit: '4'
6
+ top.quantization_method: bnb
7
+ top.rope_scaling: none
8
+ top.template: gemma
9
+ train.additional_target: ''
10
+ train.apollo_rank: 16
11
+ train.apollo_scale: 32
12
+ train.apollo_target: all
13
+ train.apollo_update_interval: 200
14
+ train.badam_mode: layer
15
+ train.badam_switch_interval: 50
16
+ train.badam_switch_mode: ascending
17
+ train.badam_update_ratio: 0.05
18
+ train.batch_size: 3
19
+ train.compute_type: bf16
20
+ train.create_new_adapter: false
21
+ train.cutoff_len: 4096
22
+ train.dataset:
23
+ - oit
24
+ train.dataset_dir: data
25
+ train.ds_offload: false
26
+ train.ds_stage: none
27
+ train.enable_thinking: true
28
+ train.extra_args: '{"optim": "adamw_torch"}'
29
+ train.freeze_extra_modules: ''
30
+ train.freeze_language_model: false
31
+ train.freeze_multi_modal_projector: true
32
+ train.freeze_trainable_layers: 2
33
+ train.freeze_trainable_modules: all
34
+ train.freeze_vision_tower: true
35
+ train.galore_rank: 16
36
+ train.galore_scale: 2
37
+ train.galore_target: all
38
+ train.galore_update_interval: 200
39
+ train.gradient_accumulation_steps: 8
40
+ train.image_max_pixels: 768*768
41
+ train.image_min_pixels: 32*32
42
+ train.learning_rate: 5e-5
43
+ train.logging_steps: 5
44
+ train.lora_alpha: 16
45
+ train.lora_dropout: 0
46
+ train.lora_rank: 8
47
+ train.lora_target: ''
48
+ train.loraplus_lr_ratio: 0
49
+ train.lr_scheduler_type: cosine
50
+ train.mask_history: false
51
+ train.max_grad_norm: '1.0'
52
+ train.max_samples: '100000'
53
+ train.neat_packing: false
54
+ train.neftune_alpha: 0
55
+ train.num_train_epochs: '3.0'
56
+ train.packing: false
57
+ train.ppo_score_norm: false
58
+ train.ppo_whiten_rewards: false
59
+ train.pref_beta: 0.1
60
+ train.pref_ftx: 0
61
+ train.pref_loss: sigmoid
62
+ train.report_to: none
63
+ train.resize_vocab: false
64
+ train.reward_model: []
65
+ train.save_steps: 100
66
+ train.swanlab_api_key: ''
67
+ train.swanlab_link: ''
68
+ train.swanlab_mode: cloud
69
+ train.swanlab_project: llamafactory
70
+ train.swanlab_run_name: ''
71
+ train.swanlab_workspace: ''
72
+ train.train_on_prompt: false
73
+ train.training_stage: Supervised Fine-Tuning
74
+ train.use_apollo: false
75
+ train.use_badam: false
76
+ train.use_dora: false
77
+ train.use_galore: false
78
+ train.use_llama_pro: false
79
+ train.use_pissa: false
80
+ train.use_rslora: false
81
+ train.use_swanlab: false
82
+ train.val_size: 0
83
+ train.video_max_pixels: 256*256
84
+ train.video_min_pixels: 16*16
85
+ train.warmup_steps: 0
running_log.txt ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [INFO|2025-07-14 13:33:21] tokenization_utils_base.py:2023 >> loading file tokenizer.model from cache at /home/ats-ai/.cache/huggingface/hub/models--unsloth--gemma-3-1b-it/snapshots/5b11413a10db4e486ef16a20101fd028f8f2499c/tokenizer.model
2
+ [INFO|2025-07-14 13:33:21] tokenization_utils_base.py:2023 >> loading file tokenizer.json from cache at /home/ats-ai/.cache/huggingface/hub/models--unsloth--gemma-3-1b-it/snapshots/5b11413a10db4e486ef16a20101fd028f8f2499c/tokenizer.json
3
+ [INFO|2025-07-14 13:33:21] tokenization_utils_base.py:2023 >> loading file added_tokens.json from cache at /home/ats-ai/.cache/huggingface/hub/models--unsloth--gemma-3-1b-it/snapshots/5b11413a10db4e486ef16a20101fd028f8f2499c/added_tokens.json
4
+ [INFO|2025-07-14 13:33:21] tokenization_utils_base.py:2023 >> loading file special_tokens_map.json from cache at /home/ats-ai/.cache/huggingface/hub/models--unsloth--gemma-3-1b-it/snapshots/5b11413a10db4e486ef16a20101fd028f8f2499c/special_tokens_map.json
5
+ [INFO|2025-07-14 13:33:21] tokenization_utils_base.py:2023 >> loading file tokenizer_config.json from cache at /home/ats-ai/.cache/huggingface/hub/models--unsloth--gemma-3-1b-it/snapshots/5b11413a10db4e486ef16a20101fd028f8f2499c/tokenizer_config.json
6
+ [INFO|2025-07-14 13:33:21] tokenization_utils_base.py:2023 >> loading file chat_template.jinja from cache at /home/ats-ai/.cache/huggingface/hub/models--unsloth--gemma-3-1b-it/snapshots/5b11413a10db4e486ef16a20101fd028f8f2499c/chat_template.jinja
7
+ [INFO|2025-07-14 13:33:23] image_processing_base.py:380 >> loading configuration file config.json from cache at /home/ats-ai/.cache/huggingface/hub/models--unsloth--gemma-3-1b-it/snapshots/5b11413a10db4e486ef16a20101fd028f8f2499c/config.json
8
+ [INFO|2025-07-14 13:33:23] logging.py:143 >> Failed to load processor: unsloth/gemma-3-1b-it does not appear to have a file named preprocessor_config.json. Checkout 'https://huggingface.co/unsloth/gemma-3-1b-it/tree/main' for available files..
9
+ [INFO|2025-07-14 13:33:23] logging.py:143 >> Loading dataset oit.json...
10
+ [INFO|2025-07-14 13:33:30] configuration_utils.py:698 >> loading configuration file config.json from cache at /home/ats-ai/.cache/huggingface/hub/models--unsloth--gemma-3-1b-it/snapshots/5b11413a10db4e486ef16a20101fd028f8f2499c/config.json
11
+ [INFO|2025-07-14 13:33:30] configuration_utils.py:770 >> Model config Gemma3TextConfig {
12
+ "architectures": [
13
+ "Gemma3ForCausalLM"
14
+ ],
15
+ "attention_bias": false,
16
+ "attention_dropout": 0.0,
17
+ "attn_logit_softcapping": null,
18
+ "bos_token_id": 2,
19
+ "cache_implementation": "hybrid",
20
+ "eos_token_id": 106,
21
+ "final_logit_softcapping": null,
22
+ "head_dim": 256,
23
+ "hidden_activation": "gelu_pytorch_tanh",
24
+ "hidden_size": 1152,
25
+ "initializer_range": 0.02,
26
+ "intermediate_size": 6912,
27
+ "max_position_embeddings": 32768,
28
+ "model_type": "gemma3_text",
29
+ "num_attention_heads": 4,
30
+ "num_hidden_layers": 26,
31
+ "num_key_value_heads": 1,
32
+ "pad_token_id": 0,
33
+ "query_pre_attn_scalar": 256,
34
+ "rms_norm_eps": 1e-06,
35
+ "rope_local_base_freq": 10000,
36
+ "rope_scaling": null,
37
+ "rope_theta": 1000000,
38
+ "sliding_window": 512,
39
+ "sliding_window_pattern": 6,
40
+ "torch_dtype": "bfloat16",
41
+ "transformers_version": "4.52.4",
42
+ "unsloth_fixed": true,
43
+ "use_cache": true,
44
+ "vocab_size": 262144
45
+ }
46
+
47
+ [INFO|2025-07-14 13:33:30] logging.py:143 >> Quantizing model to 4 bit with bitsandbytes.
48
+ [INFO|2025-07-14 13:33:30] logging.py:143 >> KV cache is disabled during training.
49
+ [INFO|2025-07-14 13:33:47] modeling_utils.py:1151 >> loading weights file model.safetensors from cache at /home/ats-ai/.cache/huggingface/hub/models--unsloth--gemma-3-1b-it/snapshots/5b11413a10db4e486ef16a20101fd028f8f2499c/model.safetensors
50
+ [INFO|2025-07-14 13:33:47] modeling_utils.py:2241 >> Instantiating Gemma3ForCausalLM model under default dtype torch.bfloat16.
51
+ [WARNING|2025-07-14 13:33:47] configuration_utils.py:839 >> The following generation flags are not valid and may be ignored: ['cache_implementation'].
52
+ [INFO|2025-07-14 13:33:47] configuration_utils.py:840 >> - `cache_implementation`: You have set `use_cache` to `False`, but cache_implementation is set to hybrid. cache_implementation will have no effect.
53
+ If you're using a pretrained model, note that some of these attributes may be set through the model's `generation_config.json` file.
54
+ [WARNING|2025-07-14 13:33:47] configuration_utils.py:839 >> The following generation flags are not valid and may be ignored: ['cache_implementation'].
55
+ [INFO|2025-07-14 13:33:47] configuration_utils.py:840 >> - `cache_implementation`: You have set `use_cache` to `False`, but cache_implementation is set to hybrid. cache_implementation will have no effect.
56
+ If you're using a pretrained model, note that some of these attributes may be set through the model's `generation_config.json` file.
57
+ [INFO|2025-07-14 13:33:47] configuration_utils.py:1135 >> Generate config GenerationConfig {
58
+ "bos_token_id": 2,
59
+ "cache_implementation": "hybrid",
60
+ "eos_token_id": 106,
61
+ "pad_token_id": 0,
62
+ "use_cache": false
63
+ }
64
+
65
+ [INFO|2025-07-14 13:33:48] modeling_utils.py:5131 >> All model checkpoint weights were used when initializing Gemma3ForCausalLM.
66
+
67
+ [INFO|2025-07-14 13:33:48] modeling_utils.py:5139 >> All the weights of Gemma3ForCausalLM were initialized from the model checkpoint at unsloth/gemma-3-1b-it.
68
+ If your task is similar to the task the model of the checkpoint was trained on, you can already use Gemma3ForCausalLM for predictions without further training.
69
+ [INFO|2025-07-14 13:33:48] configuration_utils.py:1090 >> loading configuration file generation_config.json from cache at /home/ats-ai/.cache/huggingface/hub/models--unsloth--gemma-3-1b-it/snapshots/5b11413a10db4e486ef16a20101fd028f8f2499c/generation_config.json
70
+ [INFO|2025-07-14 13:33:48] configuration_utils.py:1135 >> Generate config GenerationConfig {
71
+ "bos_token_id": 2,
72
+ "cache_implementation": "hybrid",
73
+ "do_sample": true,
74
+ "eos_token_id": [
75
+ 1,
76
+ 106
77
+ ],
78
+ "max_length": 32768,
79
+ "pad_token_id": 0,
80
+ "top_k": 64,
81
+ "top_p": 0.95
82
+ }
83
+
84
+ [INFO|2025-07-14 13:33:48] logging.py:143 >> Gradient checkpointing enabled.
85
+ [INFO|2025-07-14 13:33:48] logging.py:143 >> Using torch SDPA for faster training and inference.
86
+ [INFO|2025-07-14 13:33:48] logging.py:143 >> Upcasting trainable params to float32.
87
+ [INFO|2025-07-14 13:33:48] logging.py:143 >> Fine-tuning method: LoRA
88
+ [INFO|2025-07-14 13:33:48] logging.py:143 >> Found linear modules: q_proj,down_proj,up_proj,k_proj,gate_proj,v_proj,o_proj
89
+ [INFO|2025-07-14 13:33:48] logging.py:143 >> trainable params: 6,522,880 || all params: 1,006,408,832 || trainable%: 0.6481
90
+ [INFO|2025-07-14 13:33:48] trainer.py:756 >> Using auto half precision backend
91
+ [INFO|2025-07-14 13:33:48] trainer.py:2409 >> ***** Running training *****
92
+ [INFO|2025-07-14 13:33:48] trainer.py:2410 >> Num examples = 271
93
+ [INFO|2025-07-14 13:33:48] trainer.py:2411 >> Num Epochs = 3
94
+ [INFO|2025-07-14 13:33:48] trainer.py:2412 >> Instantaneous batch size per device = 3
95
+ [INFO|2025-07-14 13:33:48] trainer.py:2415 >> Total train batch size (w. parallel, distributed & accumulation) = 24
96
+ [INFO|2025-07-14 13:33:48] trainer.py:2416 >> Gradient Accumulation steps = 8
97
+ [INFO|2025-07-14 13:33:48] trainer.py:2417 >> Total optimization steps = 36
98
+ [INFO|2025-07-14 13:33:48] trainer.py:2418 >> Number of trainable parameters = 6,522,880
99
+ [WARNING|2025-07-14 13:33:48] logging.py:328 >> It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
100
+ [INFO|2025-07-14 13:34:01] logging.py:143 >> {'loss': 5.4762, 'learning_rate': 4.8492e-05, 'epoch': 0.44, 'throughput': 1034.76}
101
+ [INFO|2025-07-14 13:34:13] logging.py:143 >> {'loss': 3.9653, 'learning_rate': 4.2678e-05, 'epoch': 0.88, 'throughput': 1031.28}
102
+ [INFO|2025-07-14 13:34:24] logging.py:143 >> {'loss': 3.2543, 'learning_rate': 3.3551e-05, 'epoch': 1.26, 'throughput': 1027.32}
103
+ [INFO|2025-07-14 13:34:38] logging.py:143 >> {'loss': 2.9243, 'learning_rate': 2.2821e-05, 'epoch': 1.70, 'throughput': 1000.40}
104
+ [INFO|2025-07-14 13:34:49] logging.py:143 >> {'loss': 2.6278, 'learning_rate': 1.2500e-05, 'epoch': 2.09, 'throughput': 1001.62}
105
+ [INFO|2025-07-14 13:35:03] logging.py:143 >> {'loss': 2.6988, 'learning_rate': 4.5212e-06, 'epoch': 2.53, 'throughput': 981.11}
106
+ [INFO|2025-07-14 13:35:16] logging.py:143 >> {'loss': 2.7677, 'learning_rate': 3.7981e-07, 'epoch': 2.97, 'throughput': 977.06}
107
+ [INFO|2025-07-14 13:35:17] trainer.py:3993 >> Saving model checkpoint to saves/Gemma-3-1B-Instruct/lora/train_2025-07-14-13-26-41/checkpoint-36
108
+ [INFO|2025-07-14 13:35:17] configuration_utils.py:698 >> loading configuration file config.json from cache at /home/ats-ai/.cache/huggingface/hub/models--unsloth--gemma-3-1b-it/snapshots/5b11413a10db4e486ef16a20101fd028f8f2499c/config.json
109
+ [INFO|2025-07-14 13:35:17] configuration_utils.py:770 >> Model config Gemma3TextConfig {
110
+ "architectures": [
111
+ "Gemma3ForCausalLM"
112
+ ],
113
+ "attention_bias": false,
114
+ "attention_dropout": 0.0,
115
+ "attn_logit_softcapping": null,
116
+ "bos_token_id": 2,
117
+ "cache_implementation": "hybrid",
118
+ "eos_token_id": 106,
119
+ "final_logit_softcapping": null,
120
+ "head_dim": 256,
121
+ "hidden_activation": "gelu_pytorch_tanh",
122
+ "hidden_size": 1152,
123
+ "initializer_range": 0.02,
124
+ "intermediate_size": 6912,
125
+ "max_position_embeddings": 32768,
126
+ "model_type": "gemma3_text",
127
+ "num_attention_heads": 4,
128
+ "num_hidden_layers": 26,
129
+ "num_key_value_heads": 1,
130
+ "pad_token_id": 0,
131
+ "query_pre_attn_scalar": 256,
132
+ "rms_norm_eps": 1e-06,
133
+ "rope_local_base_freq": 10000,
134
+ "rope_scaling": null,
135
+ "rope_theta": 1000000,
136
+ "sliding_window": 512,
137
+ "sliding_window_pattern": 6,
138
+ "torch_dtype": "bfloat16",
139
+ "transformers_version": "4.52.4",
140
+ "unsloth_fixed": true,
141
+ "use_cache": true,
142
+ "vocab_size": 262144
143
+ }
144
+
145
+ [INFO|2025-07-14 13:35:17] tokenization_utils_base.py:2356 >> chat template saved in saves/Gemma-3-1B-Instruct/lora/train_2025-07-14-13-26-41/checkpoint-36/chat_template.jinja
146
+ [INFO|2025-07-14 13:35:17] tokenization_utils_base.py:2525 >> tokenizer config file saved in saves/Gemma-3-1B-Instruct/lora/train_2025-07-14-13-26-41/checkpoint-36/tokenizer_config.json
147
+ [INFO|2025-07-14 13:35:17] tokenization_utils_base.py:2534 >> Special tokens file saved in saves/Gemma-3-1B-Instruct/lora/train_2025-07-14-13-26-41/checkpoint-36/special_tokens_map.json
148
+ [INFO|2025-07-14 13:35:17] trainer.py:2676 >>
149
+
150
+ Training completed. Do not forget to share your model on huggingface.co/models =)
151
+
152
+
153
+ [INFO|2025-07-14 13:35:17] trainer.py:3993 >> Saving model checkpoint to saves/Gemma-3-1B-Instruct/lora/train_2025-07-14-13-26-41
154
+ [INFO|2025-07-14 13:35:17] configuration_utils.py:698 >> loading configuration file config.json from cache at /home/ats-ai/.cache/huggingface/hub/models--unsloth--gemma-3-1b-it/snapshots/5b11413a10db4e486ef16a20101fd028f8f2499c/config.json
155
+ [INFO|2025-07-14 13:35:17] configuration_utils.py:770 >> Model config Gemma3TextConfig {
156
+ "architectures": [
157
+ "Gemma3ForCausalLM"
158
+ ],
159
+ "attention_bias": false,
160
+ "attention_dropout": 0.0,
161
+ "attn_logit_softcapping": null,
162
+ "bos_token_id": 2,
163
+ "cache_implementation": "hybrid",
164
+ "eos_token_id": 106,
165
+ "final_logit_softcapping": null,
166
+ "head_dim": 256,
167
+ "hidden_activation": "gelu_pytorch_tanh",
168
+ "hidden_size": 1152,
169
+ "initializer_range": 0.02,
170
+ "intermediate_size": 6912,
171
+ "max_position_embeddings": 32768,
172
+ "model_type": "gemma3_text",
173
+ "num_attention_heads": 4,
174
+ "num_hidden_layers": 26,
175
+ "num_key_value_heads": 1,
176
+ "pad_token_id": 0,
177
+ "query_pre_attn_scalar": 256,
178
+ "rms_norm_eps": 1e-06,
179
+ "rope_local_base_freq": 10000,
180
+ "rope_scaling": null,
181
+ "rope_theta": 1000000,
182
+ "sliding_window": 512,
183
+ "sliding_window_pattern": 6,
184
+ "torch_dtype": "bfloat16",
185
+ "transformers_version": "4.52.4",
186
+ "unsloth_fixed": true,
187
+ "use_cache": true,
188
+ "vocab_size": 262144
189
+ }
190
+
191
+ [INFO|2025-07-14 13:35:17] tokenization_utils_base.py:2356 >> chat template saved in saves/Gemma-3-1B-Instruct/lora/train_2025-07-14-13-26-41/chat_template.jinja
192
+ [INFO|2025-07-14 13:35:17] tokenization_utils_base.py:2525 >> tokenizer config file saved in saves/Gemma-3-1B-Instruct/lora/train_2025-07-14-13-26-41/tokenizer_config.json
193
+ [INFO|2025-07-14 13:35:17] tokenization_utils_base.py:2534 >> Special tokens file saved in saves/Gemma-3-1B-Instruct/lora/train_2025-07-14-13-26-41/special_tokens_map.json
194
+ [WARNING|2025-07-14 13:35:18] logging.py:148 >> No metric eval_loss to plot.
195
+ [WARNING|2025-07-14 13:35:18] logging.py:148 >> No metric eval_accuracy to plot.
196
+ [INFO|2025-07-14 13:35:18] modelcard.py:450 >> Dropping the following result as it does not have all the necessary fields:
197
+ {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<end_of_turn>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
3
+ size 33384568
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "num_input_tokens_seen": 85968,
4
+ "total_flos": 363344926666752.0,
5
+ "train_loss": 3.3720982670783997,
6
+ "train_runtime": 88.7993,
7
+ "train_samples_per_second": 9.155,
8
+ "train_steps_per_second": 0.405
9
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 5, "total_steps": 36, "loss": 5.4762, "lr": 4.849231551964771e-05, "epoch": 0.43956043956043955, "percentage": 13.89, "elapsed_time": "0:00:12", "remaining_time": "0:01:19", "throughput": 1034.76, "total_tokens": 13272}
2
+ {"current_steps": 10, "total_steps": 36, "loss": 3.9653, "lr": 4.267766952966369e-05, "epoch": 0.8791208791208791, "percentage": 27.78, "elapsed_time": "0:00:24", "remaining_time": "0:01:03", "throughput": 1031.28, "total_tokens": 25368}
3
+ {"current_steps": 15, "total_steps": 36, "loss": 3.2543, "lr": 3.355050358314172e-05, "epoch": 1.2637362637362637, "percentage": 41.67, "elapsed_time": "0:00:35", "remaining_time": "0:00:49", "throughput": 1027.32, "total_tokens": 36568}
4
+ {"current_steps": 20, "total_steps": 36, "loss": 2.9243, "lr": 2.2821106431308544e-05, "epoch": 1.7032967032967035, "percentage": 55.56, "elapsed_time": "0:00:49", "remaining_time": "0:00:39", "throughput": 1000.4, "total_tokens": 49360}
5
+ {"current_steps": 25, "total_steps": 36, "loss": 2.6278, "lr": 1.2500000000000006e-05, "epoch": 2.087912087912088, "percentage": 69.44, "elapsed_time": "0:01:00", "remaining_time": "0:00:26", "throughput": 1001.62, "total_tokens": 60992}
6
+ {"current_steps": 30, "total_steps": 36, "loss": 2.6988, "lr": 4.521198892775203e-06, "epoch": 2.5274725274725274, "percentage": 83.33, "elapsed_time": "0:01:14", "remaining_time": "0:00:14", "throughput": 981.11, "total_tokens": 72872}
7
+ {"current_steps": 35, "total_steps": 36, "loss": 2.7677, "lr": 3.7980617469479953e-07, "epoch": 2.967032967032967, "percentage": 97.22, "elapsed_time": "0:01:27", "remaining_time": "0:00:02", "throughput": 977.06, "total_tokens": 85304}
8
+ {"current_steps": 36, "total_steps": 36, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "0:01:28", "remaining_time": "0:00:00", "throughput": 968.13, "total_tokens": 85968}
trainer_state.json ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 36,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.43956043956043955,
14
+ "grad_norm": 9.493675231933594,
15
+ "learning_rate": 4.849231551964771e-05,
16
+ "loss": 5.4762,
17
+ "num_input_tokens_seen": 13272,
18
+ "step": 5,
19
+ "train_runtime": 12.8278,
20
+ "train_tokens_per_second": 1034.626
21
+ },
22
+ {
23
+ "epoch": 0.8791208791208791,
24
+ "grad_norm": 4.911149024963379,
25
+ "learning_rate": 4.267766952966369e-05,
26
+ "loss": 3.9653,
27
+ "num_input_tokens_seen": 25368,
28
+ "step": 10,
29
+ "train_runtime": 24.6003,
30
+ "train_tokens_per_second": 1031.207
31
+ },
32
+ {
33
+ "epoch": 1.2637362637362637,
34
+ "grad_norm": 3.035015821456909,
35
+ "learning_rate": 3.355050358314172e-05,
36
+ "loss": 3.2543,
37
+ "num_input_tokens_seen": 36568,
38
+ "step": 15,
39
+ "train_runtime": 35.5974,
40
+ "train_tokens_per_second": 1027.266
41
+ },
42
+ {
43
+ "epoch": 1.7032967032967035,
44
+ "grad_norm": 2.246847152709961,
45
+ "learning_rate": 2.2821106431308544e-05,
46
+ "loss": 2.9243,
47
+ "num_input_tokens_seen": 49360,
48
+ "step": 20,
49
+ "train_runtime": 49.3421,
50
+ "train_tokens_per_second": 1000.362
51
+ },
52
+ {
53
+ "epoch": 2.087912087912088,
54
+ "grad_norm": 1.6257575750350952,
55
+ "learning_rate": 1.2500000000000006e-05,
56
+ "loss": 2.6278,
57
+ "num_input_tokens_seen": 60992,
58
+ "step": 25,
59
+ "train_runtime": 60.8949,
60
+ "train_tokens_per_second": 1001.594
61
+ },
62
+ {
63
+ "epoch": 2.5274725274725274,
64
+ "grad_norm": 1.944761872291565,
65
+ "learning_rate": 4.521198892775203e-06,
66
+ "loss": 2.6988,
67
+ "num_input_tokens_seen": 72872,
68
+ "step": 30,
69
+ "train_runtime": 74.2765,
70
+ "train_tokens_per_second": 981.091
71
+ },
72
+ {
73
+ "epoch": 2.967032967032967,
74
+ "grad_norm": 1.752797245979309,
75
+ "learning_rate": 3.7980617469479953e-07,
76
+ "loss": 2.7677,
77
+ "num_input_tokens_seen": 85304,
78
+ "step": 35,
79
+ "train_runtime": 87.3084,
80
+ "train_tokens_per_second": 977.042
81
+ },
82
+ {
83
+ "epoch": 3.0,
84
+ "num_input_tokens_seen": 85968,
85
+ "step": 36,
86
+ "total_flos": 363344926666752.0,
87
+ "train_loss": 3.3720982670783997,
88
+ "train_runtime": 88.7993,
89
+ "train_samples_per_second": 9.155,
90
+ "train_steps_per_second": 0.405
91
+ }
92
+ ],
93
+ "logging_steps": 5,
94
+ "max_steps": 36,
95
+ "num_input_tokens_seen": 85968,
96
+ "num_train_epochs": 3,
97
+ "save_steps": 100,
98
+ "stateful_callbacks": {
99
+ "TrainerControl": {
100
+ "args": {
101
+ "should_epoch_stop": false,
102
+ "should_evaluate": false,
103
+ "should_log": false,
104
+ "should_save": true,
105
+ "should_training_stop": true
106
+ },
107
+ "attributes": {}
108
+ }
109
+ },
110
+ "total_flos": 363344926666752.0,
111
+ "train_batch_size": 3,
112
+ "trial_name": null,
113
+ "trial_params": null
114
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:017320e2e7dc050c148ab1218dc90c9ede1f72f067caf9b74b319c767b40edeb
3
+ size 6161
training_args.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bf16: true
2
+ cutoff_len: 4096
3
+ dataset: oit
4
+ dataset_dir: data
5
+ ddp_timeout: 180000000
6
+ do_train: true
7
+ double_quantization: true
8
+ enable_thinking: true
9
+ finetuning_type: lora
10
+ flash_attn: auto
11
+ gradient_accumulation_steps: 8
12
+ include_num_input_tokens_seen: true
13
+ learning_rate: 5.0e-05
14
+ logging_steps: 5
15
+ lora_alpha: 16
16
+ lora_dropout: 0
17
+ lora_rank: 8
18
+ lora_target: all
19
+ lr_scheduler_type: cosine
20
+ max_grad_norm: 1.0
21
+ max_samples: 100000
22
+ model_name_or_path: unsloth/gemma-3-1b-it
23
+ num_train_epochs: 3.0
24
+ optim: adamw_torch
25
+ output_dir: saves/Gemma-3-1B-Instruct/lora/train_2025-07-14-13-26-41
26
+ packing: false
27
+ per_device_train_batch_size: 3
28
+ plot_loss: true
29
+ preprocessing_num_workers: 16
30
+ quantization_bit: 4
31
+ quantization_method: bnb
32
+ report_to: none
33
+ save_steps: 100
34
+ stage: sft
35
+ template: gemma
36
+ trust_remote_code: true
37
+ warmup_steps: 0
training_loss.png ADDED