ImNotTam commited on
Commit
32e57c5
·
verified ·
1 Parent(s): 343ceab

Upload full training folder with all checkpoints

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +7 -0
  2. README.md +175 -0
  3. checkpoint-100/README.md +210 -0
  4. checkpoint-100/adapter_config.json +42 -0
  5. checkpoint-100/adapter_model.safetensors +3 -0
  6. checkpoint-100/added_tokens.json +3 -0
  7. checkpoint-100/chat_template.jinja +47 -0
  8. checkpoint-100/optimizer.pt +3 -0
  9. checkpoint-100/preprocessor_config.json +29 -0
  10. checkpoint-100/processor_config.json +4 -0
  11. checkpoint-100/rng_state.pth +3 -0
  12. checkpoint-100/scheduler.pt +3 -0
  13. checkpoint-100/special_tokens_map.json +33 -0
  14. checkpoint-100/tokenizer.json +3 -0
  15. checkpoint-100/tokenizer.model +3 -0
  16. checkpoint-100/tokenizer_config.json +0 -0
  17. checkpoint-100/trainer_state.json +191 -0
  18. checkpoint-100/training_args.bin +3 -0
  19. checkpoint-300/README.md +210 -0
  20. checkpoint-300/adapter_config.json +42 -0
  21. checkpoint-300/adapter_model.safetensors +3 -0
  22. checkpoint-300/added_tokens.json +3 -0
  23. checkpoint-300/chat_template.jinja +47 -0
  24. checkpoint-300/optimizer.pt +3 -0
  25. checkpoint-300/preprocessor_config.json +29 -0
  26. checkpoint-300/processor_config.json +4 -0
  27. checkpoint-300/rng_state.pth +3 -0
  28. checkpoint-300/scheduler.pt +3 -0
  29. checkpoint-300/special_tokens_map.json +33 -0
  30. checkpoint-300/tokenizer.json +3 -0
  31. checkpoint-300/tokenizer.model +3 -0
  32. checkpoint-300/tokenizer_config.json +0 -0
  33. checkpoint-300/trainer_state.json +487 -0
  34. checkpoint-300/training_args.bin +3 -0
  35. checkpoint-400/README.md +210 -0
  36. checkpoint-400/adapter_config.json +42 -0
  37. checkpoint-400/adapter_model.safetensors +3 -0
  38. checkpoint-400/added_tokens.json +3 -0
  39. checkpoint-400/chat_template.jinja +47 -0
  40. checkpoint-400/optimizer.pt +3 -0
  41. checkpoint-400/preprocessor_config.json +29 -0
  42. checkpoint-400/processor_config.json +4 -0
  43. checkpoint-400/rng_state.pth +3 -0
  44. checkpoint-400/scheduler.pt +3 -0
  45. checkpoint-400/special_tokens_map.json +33 -0
  46. checkpoint-400/tokenizer.json +3 -0
  47. checkpoint-400/tokenizer.model +3 -0
  48. checkpoint-400/tokenizer_config.json +0 -0
  49. checkpoint-400/trainer_state.json +635 -0
  50. checkpoint-400/training_args.bin +3 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ checkpoint-600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
+ final_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
42
+ lora_adapters/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - vi
4
+ - en
5
+ license: apache-2.0
6
+ tags:
7
+ - llm-judge
8
+ - training-checkpoint
9
+ - lora
10
+ - unsloth
11
+ ---
12
+
13
+ # finetuned_5_12
14
+
15
+ Full training folder backup - Toàn bộ checkpoints và models.
16
+
17
+ ## 📂 Cấu trúc Folder
18
+ ```
19
+ train_
20
+ ├── checkpoint-100/ ... checkpoint-600/ # Training checkpoints
+ ├── final_model/ # Model merged đầy đủ
+ ├── lora_adapters/ # LoRA adapters
21
+ ├── README.md
22
+ ├── zero_shot_metrics.json
23
+ └── zero_shot_results.csv
24
+ ```
25
+
26
+ ## 🚀 Sử Dụng
27
+
28
+ ### 1️⃣ Clone Repo
29
+ ```bash
30
+ git lfs install
31
+ git clone https://huggingface.co/ImNotTam/finetuned_5_12
32
+ cd finetuned_5_12
33
+ ```
34
+
35
+ ### 2️⃣ Load LoRA Adapters (Nhẹ nhất - khuyến nghị)
36
+ ```python
37
+ from unsloth import FastLanguageModel
38
+
39
+ model, tokenizer = FastLanguageModel.from_pretrained(
40
+ model_name="ImNotTam/finetuned_5_12",
41
+ subfolder="lora_adapters",
42
+ max_seq_length=2048,
43
+ dtype=None,
44
+ load_in_4bit=True,
45
+ )
46
+
47
+ # Enable inference mode
48
+ FastLanguageModel.for_inference(model)
49
+
50
+ # Test
51
+ prompt = "Đánh giá response này..."
52
+ inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
53
+ outputs = model.generate(**inputs, max_new_tokens=256)
54
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
55
+ ```
56
+
57
+ ### 3️⃣ Load Final Model
58
+ ```python
59
+ from transformers import AutoModelForCausalLM, AutoTokenizer
60
+
61
+ model = AutoModelForCausalLM.from_pretrained(
62
+ "ImNotTam/finetuned_5_12",
63
+ subfolder="final_model",
64
+ device_map="auto",
65
+ torch_dtype="auto"
66
+ )
67
+ tokenizer = AutoTokenizer.from_pretrained("ImNotTam/finetuned_5_12", subfolder="final_model")
68
+
69
+ # Inference
70
+ inputs = tokenizer("Your prompt", return_tensors="pt").to("cuda")
71
+ outputs = model.generate(**inputs)
72
+ print(tokenizer.decode(outputs[0]))
73
+ ```
74
+
75
+ ### 4️⃣ Resume Training từ Checkpoint
76
+ ```python
77
+ from transformers import Trainer, TrainingArguments
78
+
79
+ # Load checkpoint muốn resume
80
+ model = AutoModelForCausalLM.from_pretrained(
81
+ "ImNotTam/finetuned_5_12",
82
+ subfolder="checkpoint-210", # Chọn checkpoint
83
+ device_map="auto"
84
+ )
85
+
86
+ # Continue training
87
+ trainer = Trainer(
88
+ model=model,
89
+ args=TrainingArguments(
90
+ output_dir="./continue_training",
91
+ # ... your training args
92
+ ),
93
+ )
94
+ trainer.train(resume_from_checkpoint=True)
95
+ ```
96
+
97
+ ### 5️⃣ Fine-tune Tiếp từ LoRA Adapter
98
+ ```python
99
+ from unsloth import FastLanguageModel
100
+ from trl import SFTTrainer
101
+
102
+ # Load LoRA adapter
103
+ model, tokenizer = FastLanguageModel.from_pretrained(
104
+ model_name="ImNotTam/finetuned_5_12",
105
+ subfolder="lora_adapters",
106
+ max_seq_length=2048,
107
+ dtype=None,
108
+ load_in_4bit=True,
109
+ )
110
+
111
+ # Add LoRA config để train tiếp
112
+ model = FastLanguageModel.get_peft_model(
113
+ model,
114
+ r=16,
115
+ target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
116
+ "gate_proj", "up_proj", "down_proj"],
117
+ lora_alpha=16,
118
+ lora_dropout=0,
119
+ bias="none",
120
+ use_gradient_checkpointing="unsloth",
121
+ )
122
+
123
+ # Train với data mới
124
+ trainer = SFTTrainer(
125
+ model=model,
126
+ tokenizer=tokenizer,
127
+ train_dataset=your_new_dataset,
128
+ # ... training args
129
+ )
130
+ trainer.train()
131
+ ```
132
+
133
+ ### 6️⃣ Xem Metrics và Results
134
+ ```python
135
+ import json
136
+ import pandas as pd
137
+
138
+ # Load metrics
139
+ with open("zero_shot_metrics.json", "r") as f:
140
+ metrics = json.load(f)
141
+ print("📊 Metrics:", metrics)
142
+
143
+ # Load results
144
+ results = pd.read_csv("zero_shot_results.csv")
145
+ print("\n📈 Results:")
146
+ print(results.head())
147
+ ```
148
+
149
+ ## 📋 Nội Dung Repo
150
+
151
+ | Folder/File | Mô tả | Kích thước |
152
+ |-------------|-------|------------|
153
+ | `lora_adapters/` | LoRA adapters (nhẹ) | ~50-100 MB |
154
+ | `final_model/` | Model merged đầy đủ | ~4-8 GB |
155
+ | `checkpoint-100/` | Training checkpoint | ~4-8 GB |
156
+ | `checkpoint-300/` | Training checkpoint | ~4-8 GB |
157
+ | `checkpoint-400/` | Training checkpoint | ~4-8 GB |
158
+ | `zero_shot_metrics.json` | Evaluation metrics | <1 MB |
159
+ | `zero_shot_results.csv` | Detailed results | <1 MB |
160
+
161
+ ## 💡 Khuyến Nghị
162
+
163
+ - **Inference nhanh:** Dùng `lora_adapters/`
164
+ - **Production:** Dùng `final_model/`
165
+ - **Train tiếp:** Load `lora_adapters/` + add LoRA config
166
+ - **Resume training:** Load checkpoint cụ thể
167
+
168
+ ## 📦 Requirements
169
+ ```bash
170
+ pip install unsloth transformers torch trl
171
+ ```
172
+
173
+ ## 📄 License
174
+
175
+ Apache 2.0
checkpoint-100/README.md ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/medgemma-4b-it-unsloth-bnb-4bit
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:unsloth/medgemma-4b-it-unsloth-bnb-4bit
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ - unsloth
12
+ ---
13
+
14
+ # Model Card for Model ID
15
+
16
+ <!-- Provide a quick summary of what the model is/does. -->
17
+
18
+
19
+
20
+ ## Model Details
21
+
22
+ ### Model Description
23
+
24
+ <!-- Provide a longer summary of what this model is. -->
25
+
26
+
27
+
28
+ - **Developed by:** [More Information Needed]
29
+ - **Funded by [optional]:** [More Information Needed]
30
+ - **Shared by [optional]:** [More Information Needed]
31
+ - **Model type:** [More Information Needed]
32
+ - **Language(s) (NLP):** [More Information Needed]
33
+ - **License:** [More Information Needed]
34
+ - **Finetuned from model [optional]:** [More Information Needed]
35
+
36
+ ### Model Sources [optional]
37
+
38
+ <!-- Provide the basic links for the model. -->
39
+
40
+ - **Repository:** [More Information Needed]
41
+ - **Paper [optional]:** [More Information Needed]
42
+ - **Demo [optional]:** [More Information Needed]
43
+
44
+ ## Uses
45
+
46
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
47
+
48
+ ### Direct Use
49
+
50
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
51
+
52
+ [More Information Needed]
53
+
54
+ ### Downstream Use [optional]
55
+
56
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
57
+
58
+ [More Information Needed]
59
+
60
+ ### Out-of-Scope Use
61
+
62
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
63
+
64
+ [More Information Needed]
65
+
66
+ ## Bias, Risks, and Limitations
67
+
68
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
69
+
70
+ [More Information Needed]
71
+
72
+ ### Recommendations
73
+
74
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
75
+
76
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
77
+
78
+ ## How to Get Started with the Model
79
+
80
+ Use the code below to get started with the model.
81
+
82
+ [More Information Needed]
83
+
84
+ ## Training Details
85
+
86
+ ### Training Data
87
+
88
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
89
+
90
+ [More Information Needed]
91
+
92
+ ### Training Procedure
93
+
94
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
95
+
96
+ #### Preprocessing [optional]
97
+
98
+ [More Information Needed]
99
+
100
+
101
+ #### Training Hyperparameters
102
+
103
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
104
+
105
+ #### Speeds, Sizes, Times [optional]
106
+
107
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
108
+
109
+ [More Information Needed]
110
+
111
+ ## Evaluation
112
+
113
+ <!-- This section describes the evaluation protocols and provides the results. -->
114
+
115
+ ### Testing Data, Factors & Metrics
116
+
117
+ #### Testing Data
118
+
119
+ <!-- This should link to a Dataset Card if possible. -->
120
+
121
+ [More Information Needed]
122
+
123
+ #### Factors
124
+
125
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
126
+
127
+ [More Information Needed]
128
+
129
+ #### Metrics
130
+
131
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
132
+
133
+ [More Information Needed]
134
+
135
+ ### Results
136
+
137
+ [More Information Needed]
138
+
139
+ #### Summary
140
+
141
+
142
+
143
+ ## Model Examination [optional]
144
+
145
+ <!-- Relevant interpretability work for the model goes here -->
146
+
147
+ [More Information Needed]
148
+
149
+ ## Environmental Impact
150
+
151
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
152
+
153
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
154
+
155
+ - **Hardware Type:** [More Information Needed]
156
+ - **Hours used:** [More Information Needed]
157
+ - **Cloud Provider:** [More Information Needed]
158
+ - **Compute Region:** [More Information Needed]
159
+ - **Carbon Emitted:** [More Information Needed]
160
+
161
+ ## Technical Specifications [optional]
162
+
163
+ ### Model Architecture and Objective
164
+
165
+ [More Information Needed]
166
+
167
+ ### Compute Infrastructure
168
+
169
+ [More Information Needed]
170
+
171
+ #### Hardware
172
+
173
+ [More Information Needed]
174
+
175
+ #### Software
176
+
177
+ [More Information Needed]
178
+
179
+ ## Citation [optional]
180
+
181
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
182
+
183
+ **BibTeX:**
184
+
185
+ [More Information Needed]
186
+
187
+ **APA:**
188
+
189
+ [More Information Needed]
190
+
191
+ ## Glossary [optional]
192
+
193
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
194
+
195
+ [More Information Needed]
196
+
197
+ ## More Information [optional]
198
+
199
+ [More Information Needed]
200
+
201
+ ## Model Card Authors [optional]
202
+
203
+ [More Information Needed]
204
+
205
+ ## Model Card Contact
206
+
207
+ [More Information Needed]
208
+ ### Framework versions
209
+
210
+ - PEFT 0.18.0
checkpoint-100/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": {
6
+ "base_model_class": "Gemma3ForConditionalGeneration",
7
+ "parent_library": "transformers.models.gemma3.modeling_gemma3",
8
+ "unsloth_fixed": true
9
+ },
10
+ "base_model_name_or_path": "unsloth/medgemma-4b-it-unsloth-bnb-4bit",
11
+ "bias": "none",
12
+ "corda_config": null,
13
+ "ensure_weight_tying": false,
14
+ "eva_config": null,
15
+ "exclude_modules": null,
16
+ "fan_in_fan_out": false,
17
+ "inference_mode": true,
18
+ "init_lora_weights": true,
19
+ "layer_replication": null,
20
+ "layers_pattern": null,
21
+ "layers_to_transform": null,
22
+ "loftq_config": {},
23
+ "lora_alpha": 32,
24
+ "lora_bias": false,
25
+ "lora_dropout": 0.05,
26
+ "megatron_config": null,
27
+ "megatron_core": "megatron.core",
28
+ "modules_to_save": null,
29
+ "peft_type": "LORA",
30
+ "peft_version": "0.18.0",
31
+ "qalora_group_size": 16,
32
+ "r": 32,
33
+ "rank_pattern": {},
34
+ "revision": null,
35
+ "target_modules": "(?:.*?(?:vision|image|visual|patch|language|text).*?(?:self_attn|attention|attn|mlp|feed_forward|ffn|dense).*?(?:k_proj|v_proj|q_proj|out_proj|fc1|fc2|o_proj|gate_proj|up_proj|down_proj).*?)|(?:\\bmodel\\.layers\\.[\\d]{1,}\\.(?:self_attn|attention|attn|mlp|feed_forward|ffn|dense)\\.(?:(?:k_proj|v_proj|q_proj|out_proj|fc1|fc2|o_proj|gate_proj|up_proj|down_proj)))",
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": true
42
+ }
checkpoint-100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcc7bd0421bfecda36bc2c99741021234818a6b748ab603299bb10dc4e1c3b59
3
+ size 308107928
checkpoint-100/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
checkpoint-100/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
checkpoint-100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f69b25edf94625e7a02518340fe0937560ac04c53aeacd05ae95862ce9c31f1
3
+ size 157331237
checkpoint-100/preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_pan_and_scan": null,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_mean": [
8
+ 0.5,
9
+ 0.5,
10
+ 0.5
11
+ ],
12
+ "image_processor_type": "Gemma3ImageProcessor",
13
+ "image_seq_length": 256,
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "pan_and_scan_max_num_crops": null,
20
+ "pan_and_scan_min_crop_size": null,
21
+ "pan_and_scan_min_ratio_to_activate": null,
22
+ "processor_class": "Gemma3Processor",
23
+ "resample": 2,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "height": 896,
27
+ "width": 896
28
+ }
29
+ }
checkpoint-100/processor_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "image_seq_length": 256,
3
+ "processor_class": "Gemma3Processor"
4
+ }
checkpoint-100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9c69974dd186962af147b132c93b5d1654090809d2863f2058c297e9b8e2e7a
3
+ size 14645
checkpoint-100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42a239e685911b4054a1347865565f4a73e42ee9453644b8ff353920e9e7c499
3
+ size 1465
checkpoint-100/special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<end_of_turn>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
checkpoint-100/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7666402c0617d170e6b0a985b3130c3fb0795393aa0970600994a5d9aae12351
3
+ size 33384822
checkpoint-100/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
checkpoint-100/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-100/trainer_state.json ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 100,
3
+ "best_metric": 0.01389834564179182,
4
+ "best_model_checkpoint": "/teamspace/studios/this_studio/DATN/output/medgemma_finetuned/checkpoint-100",
5
+ "epoch": 0.38910505836575876,
6
+ "eval_steps": 100,
7
+ "global_step": 100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.019455252918287938,
14
+ "grad_norm": 3.0237326622009277,
15
+ "learning_rate": 1.777777777777778e-06,
16
+ "loss": 0.8809,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.038910505836575876,
21
+ "grad_norm": 2.4512810707092285,
22
+ "learning_rate": 4e-06,
23
+ "loss": 0.8569,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.058365758754863814,
28
+ "grad_norm": 1.5967055559158325,
29
+ "learning_rate": 6.222222222222222e-06,
30
+ "loss": 0.7725,
31
+ "step": 15
32
+ },
33
+ {
34
+ "epoch": 0.07782101167315175,
35
+ "grad_norm": 1.2497001886367798,
36
+ "learning_rate": 8.444444444444446e-06,
37
+ "loss": 0.6516,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.09727626459143969,
42
+ "grad_norm": 1.2455090284347534,
43
+ "learning_rate": 1.0666666666666667e-05,
44
+ "loss": 0.5238,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.11673151750972763,
49
+ "grad_norm": 1.361525535583496,
50
+ "learning_rate": 1.2888888888888889e-05,
51
+ "loss": 0.3777,
52
+ "step": 30
53
+ },
54
+ {
55
+ "epoch": 0.13618677042801555,
56
+ "grad_norm": 1.6556775569915771,
57
+ "learning_rate": 1.511111111111111e-05,
58
+ "loss": 0.2116,
59
+ "step": 35
60
+ },
61
+ {
62
+ "epoch": 0.1556420233463035,
63
+ "grad_norm": 0.6078555583953857,
64
+ "learning_rate": 1.7333333333333332e-05,
65
+ "loss": 0.0794,
66
+ "step": 40
67
+ },
68
+ {
69
+ "epoch": 0.17509727626459143,
70
+ "grad_norm": 0.31556975841522217,
71
+ "learning_rate": 1.9555555555555557e-05,
72
+ "loss": 0.0343,
73
+ "step": 45
74
+ },
75
+ {
76
+ "epoch": 0.19455252918287938,
77
+ "grad_norm": 0.23063282668590546,
78
+ "learning_rate": 2.177777777777778e-05,
79
+ "loss": 0.0227,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.2140077821011673,
84
+ "grad_norm": 0.10897089540958405,
85
+ "learning_rate": 2.4e-05,
86
+ "loss": 0.0159,
87
+ "step": 55
88
+ },
89
+ {
90
+ "epoch": 0.23346303501945526,
91
+ "grad_norm": 0.08667729049921036,
92
+ "learning_rate": 2.6222222222222226e-05,
93
+ "loss": 0.0155,
94
+ "step": 60
95
+ },
96
+ {
97
+ "epoch": 0.2529182879377432,
98
+ "grad_norm": 0.07056345790624619,
99
+ "learning_rate": 2.8444444444444447e-05,
100
+ "loss": 0.0133,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 0.2723735408560311,
105
+ "grad_norm": 0.119380883872509,
106
+ "learning_rate": 3.066666666666666e-05,
107
+ "loss": 0.0113,
108
+ "step": 70
109
+ },
110
+ {
111
+ "epoch": 0.2918287937743191,
112
+ "grad_norm": 0.10328345745801926,
113
+ "learning_rate": 3.288888888888889e-05,
114
+ "loss": 0.0074,
115
+ "step": 75
116
+ },
117
+ {
118
+ "epoch": 0.311284046692607,
119
+ "grad_norm": 0.08840714395046234,
120
+ "learning_rate": 3.511111111111111e-05,
121
+ "loss": 0.0069,
122
+ "step": 80
123
+ },
124
+ {
125
+ "epoch": 0.33073929961089493,
126
+ "grad_norm": 0.1998119205236435,
127
+ "learning_rate": 3.733333333333334e-05,
128
+ "loss": 0.0069,
129
+ "step": 85
130
+ },
131
+ {
132
+ "epoch": 0.35019455252918286,
133
+ "grad_norm": 0.08085718750953674,
134
+ "learning_rate": 3.9555555555555556e-05,
135
+ "loss": 0.0072,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.36964980544747084,
140
+ "grad_norm": 0.10597972571849823,
141
+ "learning_rate": 4.177777777777778e-05,
142
+ "loss": 0.0072,
143
+ "step": 95
144
+ },
145
+ {
146
+ "epoch": 0.38910505836575876,
147
+ "grad_norm": 0.04654397815465927,
148
+ "learning_rate": 4.4e-05,
149
+ "loss": 0.0065,
150
+ "step": 100
151
+ },
152
+ {
153
+ "epoch": 0.38910505836575876,
154
+ "eval_loss": 0.01389834564179182,
155
+ "eval_runtime": 174.9649,
156
+ "eval_samples_per_second": 2.915,
157
+ "eval_steps_per_second": 0.732,
158
+ "step": 100
159
+ }
160
+ ],
161
+ "logging_steps": 5,
162
+ "max_steps": 1799,
163
+ "num_input_tokens_seen": 0,
164
+ "num_train_epochs": 7,
165
+ "save_steps": 100,
166
+ "stateful_callbacks": {
167
+ "EarlyStoppingCallback": {
168
+ "args": {
169
+ "early_stopping_patience": 30,
170
+ "early_stopping_threshold": 0.001
171
+ },
172
+ "attributes": {
173
+ "early_stopping_patience_counter": 0
174
+ }
175
+ },
176
+ "TrainerControl": {
177
+ "args": {
178
+ "should_epoch_stop": false,
179
+ "should_evaluate": false,
180
+ "should_log": false,
181
+ "should_save": true,
182
+ "should_training_stop": false
183
+ },
184
+ "attributes": {}
185
+ }
186
+ },
187
+ "total_flos": 6.810966969647155e+16,
188
+ "train_batch_size": 4,
189
+ "trial_name": null,
190
+ "trial_params": null
191
+ }
checkpoint-100/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e74aab1d0c2724784614181306df8fc5fd4785e5c0771ac25a75ed3b96204917
3
+ size 6417
checkpoint-300/README.md ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/medgemma-4b-it-unsloth-bnb-4bit
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:unsloth/medgemma-4b-it-unsloth-bnb-4bit
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ - unsloth
12
+ ---
13
+
14
+ # Model Card for Model ID
15
+
16
+ <!-- Provide a quick summary of what the model is/does. -->
17
+
18
+
19
+
20
+ ## Model Details
21
+
22
+ ### Model Description
23
+
24
+ <!-- Provide a longer summary of what this model is. -->
25
+
26
+
27
+
28
+ - **Developed by:** [More Information Needed]
29
+ - **Funded by [optional]:** [More Information Needed]
30
+ - **Shared by [optional]:** [More Information Needed]
31
+ - **Model type:** [More Information Needed]
32
+ - **Language(s) (NLP):** [More Information Needed]
33
+ - **License:** [More Information Needed]
34
+ - **Finetuned from model [optional]:** [More Information Needed]
35
+
36
+ ### Model Sources [optional]
37
+
38
+ <!-- Provide the basic links for the model. -->
39
+
40
+ - **Repository:** [More Information Needed]
41
+ - **Paper [optional]:** [More Information Needed]
42
+ - **Demo [optional]:** [More Information Needed]
43
+
44
+ ## Uses
45
+
46
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
47
+
48
+ ### Direct Use
49
+
50
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
51
+
52
+ [More Information Needed]
53
+
54
+ ### Downstream Use [optional]
55
+
56
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
57
+
58
+ [More Information Needed]
59
+
60
+ ### Out-of-Scope Use
61
+
62
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
63
+
64
+ [More Information Needed]
65
+
66
+ ## Bias, Risks, and Limitations
67
+
68
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
69
+
70
+ [More Information Needed]
71
+
72
+ ### Recommendations
73
+
74
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
75
+
76
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
77
+
78
+ ## How to Get Started with the Model
79
+
80
+ Use the code below to get started with the model.
81
+
82
+ [More Information Needed]
83
+
84
+ ## Training Details
85
+
86
+ ### Training Data
87
+
88
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
89
+
90
+ [More Information Needed]
91
+
92
+ ### Training Procedure
93
+
94
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
95
+
96
+ #### Preprocessing [optional]
97
+
98
+ [More Information Needed]
99
+
100
+
101
+ #### Training Hyperparameters
102
+
103
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
104
+
105
+ #### Speeds, Sizes, Times [optional]
106
+
107
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
108
+
109
+ [More Information Needed]
110
+
111
+ ## Evaluation
112
+
113
+ <!-- This section describes the evaluation protocols and provides the results. -->
114
+
115
+ ### Testing Data, Factors & Metrics
116
+
117
+ #### Testing Data
118
+
119
+ <!-- This should link to a Dataset Card if possible. -->
120
+
121
+ [More Information Needed]
122
+
123
+ #### Factors
124
+
125
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
126
+
127
+ [More Information Needed]
128
+
129
+ #### Metrics
130
+
131
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
132
+
133
+ [More Information Needed]
134
+
135
+ ### Results
136
+
137
+ [More Information Needed]
138
+
139
+ #### Summary
140
+
141
+
142
+
143
+ ## Model Examination [optional]
144
+
145
+ <!-- Relevant interpretability work for the model goes here -->
146
+
147
+ [More Information Needed]
148
+
149
+ ## Environmental Impact
150
+
151
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
152
+
153
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
154
+
155
+ - **Hardware Type:** [More Information Needed]
156
+ - **Hours used:** [More Information Needed]
157
+ - **Cloud Provider:** [More Information Needed]
158
+ - **Compute Region:** [More Information Needed]
159
+ - **Carbon Emitted:** [More Information Needed]
160
+
161
+ ## Technical Specifications [optional]
162
+
163
+ ### Model Architecture and Objective
164
+
165
+ [More Information Needed]
166
+
167
+ ### Compute Infrastructure
168
+
169
+ [More Information Needed]
170
+
171
+ #### Hardware
172
+
173
+ [More Information Needed]
174
+
175
+ #### Software
176
+
177
+ [More Information Needed]
178
+
179
+ ## Citation [optional]
180
+
181
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
182
+
183
+ **BibTeX:**
184
+
185
+ [More Information Needed]
186
+
187
+ **APA:**
188
+
189
+ [More Information Needed]
190
+
191
+ ## Glossary [optional]
192
+
193
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
194
+
195
+ [More Information Needed]
196
+
197
+ ## More Information [optional]
198
+
199
+ [More Information Needed]
200
+
201
+ ## Model Card Authors [optional]
202
+
203
+ [More Information Needed]
204
+
205
+ ## Model Card Contact
206
+
207
+ [More Information Needed]
208
+ ### Framework versions
209
+
210
+ - PEFT 0.18.0
checkpoint-300/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": {
6
+ "base_model_class": "Gemma3ForConditionalGeneration",
7
+ "parent_library": "transformers.models.gemma3.modeling_gemma3",
8
+ "unsloth_fixed": true
9
+ },
10
+ "base_model_name_or_path": "unsloth/medgemma-4b-it-unsloth-bnb-4bit",
11
+ "bias": "none",
12
+ "corda_config": null,
13
+ "ensure_weight_tying": false,
14
+ "eva_config": null,
15
+ "exclude_modules": null,
16
+ "fan_in_fan_out": false,
17
+ "inference_mode": true,
18
+ "init_lora_weights": true,
19
+ "layer_replication": null,
20
+ "layers_pattern": null,
21
+ "layers_to_transform": null,
22
+ "loftq_config": {},
23
+ "lora_alpha": 32,
24
+ "lora_bias": false,
25
+ "lora_dropout": 0.05,
26
+ "megatron_config": null,
27
+ "megatron_core": "megatron.core",
28
+ "modules_to_save": null,
29
+ "peft_type": "LORA",
30
+ "peft_version": "0.18.0",
31
+ "qalora_group_size": 16,
32
+ "r": 32,
33
+ "rank_pattern": {},
34
+ "revision": null,
35
+ "target_modules": "(?:.*?(?:vision|image|visual|patch|language|text).*?(?:self_attn|attention|attn|mlp|feed_forward|ffn|dense).*?(?:k_proj|v_proj|q_proj|out_proj|fc1|fc2|o_proj|gate_proj|up_proj|down_proj).*?)|(?:\\bmodel\\.layers\\.[\\d]{1,}\\.(?:self_attn|attention|attn|mlp|feed_forward|ffn|dense)\\.(?:(?:k_proj|v_proj|q_proj|out_proj|fc1|fc2|o_proj|gate_proj|up_proj|down_proj)))",
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": true
42
+ }
checkpoint-300/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70013a04fa52455aa35109fdb7ef8ab51147e6fdeae84864b8f029d400f70faa
3
+ size 308107928
checkpoint-300/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
checkpoint-300/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
checkpoint-300/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68bd37854c5a77a4dbf5dbbdf5a1b46d5f3338b44de9b2b6650dded795191642
3
+ size 157332069
checkpoint-300/preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_pan_and_scan": null,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_mean": [
8
+ 0.5,
9
+ 0.5,
10
+ 0.5
11
+ ],
12
+ "image_processor_type": "Gemma3ImageProcessor",
13
+ "image_seq_length": 256,
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "pan_and_scan_max_num_crops": null,
20
+ "pan_and_scan_min_crop_size": null,
21
+ "pan_and_scan_min_ratio_to_activate": null,
22
+ "processor_class": "Gemma3Processor",
23
+ "resample": 2,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "height": 896,
27
+ "width": 896
28
+ }
29
+ }
checkpoint-300/processor_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "image_seq_length": 256,
3
+ "processor_class": "Gemma3Processor"
4
+ }
checkpoint-300/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8138f1105ce33d82b9d118d54f9fb1afd211b0bda93cd0aa8effe5b69f6cbce4
3
+ size 14645
checkpoint-300/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cbdc70435f287d81ad66c656f575416da2d1b679d04dfeb3a37105725eec93a
3
+ size 1465
checkpoint-300/special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<end_of_turn>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
checkpoint-300/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7666402c0617d170e6b0a985b3130c3fb0795393aa0970600994a5d9aae12351
3
+ size 33384822
checkpoint-300/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
checkpoint-300/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-300/trainer_state.json ADDED
@@ -0,0 +1,487 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 100,
3
+ "best_metric": 0.01389834564179182,
4
+ "best_model_checkpoint": "/teamspace/studios/this_studio/DATN/output/medgemma_finetuned/checkpoint-100",
5
+ "epoch": 1.1673151750972763,
6
+ "eval_steps": 100,
7
+ "global_step": 300,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.019455252918287938,
14
+ "grad_norm": 3.0237326622009277,
15
+ "learning_rate": 1.777777777777778e-06,
16
+ "loss": 0.8809,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.038910505836575876,
21
+ "grad_norm": 2.4512810707092285,
22
+ "learning_rate": 4e-06,
23
+ "loss": 0.8569,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.058365758754863814,
28
+ "grad_norm": 1.5967055559158325,
29
+ "learning_rate": 6.222222222222222e-06,
30
+ "loss": 0.7725,
31
+ "step": 15
32
+ },
33
+ {
34
+ "epoch": 0.07782101167315175,
35
+ "grad_norm": 1.2497001886367798,
36
+ "learning_rate": 8.444444444444446e-06,
37
+ "loss": 0.6516,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.09727626459143969,
42
+ "grad_norm": 1.2455090284347534,
43
+ "learning_rate": 1.0666666666666667e-05,
44
+ "loss": 0.5238,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.11673151750972763,
49
+ "grad_norm": 1.361525535583496,
50
+ "learning_rate": 1.2888888888888889e-05,
51
+ "loss": 0.3777,
52
+ "step": 30
53
+ },
54
+ {
55
+ "epoch": 0.13618677042801555,
56
+ "grad_norm": 1.6556775569915771,
57
+ "learning_rate": 1.511111111111111e-05,
58
+ "loss": 0.2116,
59
+ "step": 35
60
+ },
61
+ {
62
+ "epoch": 0.1556420233463035,
63
+ "grad_norm": 0.6078555583953857,
64
+ "learning_rate": 1.7333333333333332e-05,
65
+ "loss": 0.0794,
66
+ "step": 40
67
+ },
68
+ {
69
+ "epoch": 0.17509727626459143,
70
+ "grad_norm": 0.31556975841522217,
71
+ "learning_rate": 1.9555555555555557e-05,
72
+ "loss": 0.0343,
73
+ "step": 45
74
+ },
75
+ {
76
+ "epoch": 0.19455252918287938,
77
+ "grad_norm": 0.23063282668590546,
78
+ "learning_rate": 2.177777777777778e-05,
79
+ "loss": 0.0227,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.2140077821011673,
84
+ "grad_norm": 0.10897089540958405,
85
+ "learning_rate": 2.4e-05,
86
+ "loss": 0.0159,
87
+ "step": 55
88
+ },
89
+ {
90
+ "epoch": 0.23346303501945526,
91
+ "grad_norm": 0.08667729049921036,
92
+ "learning_rate": 2.6222222222222226e-05,
93
+ "loss": 0.0155,
94
+ "step": 60
95
+ },
96
+ {
97
+ "epoch": 0.2529182879377432,
98
+ "grad_norm": 0.07056345790624619,
99
+ "learning_rate": 2.8444444444444447e-05,
100
+ "loss": 0.0133,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 0.2723735408560311,
105
+ "grad_norm": 0.119380883872509,
106
+ "learning_rate": 3.066666666666666e-05,
107
+ "loss": 0.0113,
108
+ "step": 70
109
+ },
110
+ {
111
+ "epoch": 0.2918287937743191,
112
+ "grad_norm": 0.10328345745801926,
113
+ "learning_rate": 3.288888888888889e-05,
114
+ "loss": 0.0074,
115
+ "step": 75
116
+ },
117
+ {
118
+ "epoch": 0.311284046692607,
119
+ "grad_norm": 0.08840714395046234,
120
+ "learning_rate": 3.511111111111111e-05,
121
+ "loss": 0.0069,
122
+ "step": 80
123
+ },
124
+ {
125
+ "epoch": 0.33073929961089493,
126
+ "grad_norm": 0.1998119205236435,
127
+ "learning_rate": 3.733333333333334e-05,
128
+ "loss": 0.0069,
129
+ "step": 85
130
+ },
131
+ {
132
+ "epoch": 0.35019455252918286,
133
+ "grad_norm": 0.08085718750953674,
134
+ "learning_rate": 3.9555555555555556e-05,
135
+ "loss": 0.0072,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.36964980544747084,
140
+ "grad_norm": 0.10597972571849823,
141
+ "learning_rate": 4.177777777777778e-05,
142
+ "loss": 0.0072,
143
+ "step": 95
144
+ },
145
+ {
146
+ "epoch": 0.38910505836575876,
147
+ "grad_norm": 0.04654397815465927,
148
+ "learning_rate": 4.4e-05,
149
+ "loss": 0.0065,
150
+ "step": 100
151
+ },
152
+ {
153
+ "epoch": 0.38910505836575876,
154
+ "eval_loss": 0.01389834564179182,
155
+ "eval_runtime": 174.9649,
156
+ "eval_samples_per_second": 2.915,
157
+ "eval_steps_per_second": 0.732,
158
+ "step": 100
159
+ },
160
+ {
161
+ "epoch": 0.4085603112840467,
162
+ "grad_norm": 0.08657707273960114,
163
+ "learning_rate": 4.6222222222222224e-05,
164
+ "loss": 0.0071,
165
+ "step": 105
166
+ },
167
+ {
168
+ "epoch": 0.4280155642023346,
169
+ "grad_norm": 0.09846911579370499,
170
+ "learning_rate": 4.844444444444445e-05,
171
+ "loss": 0.0069,
172
+ "step": 110
173
+ },
174
+ {
175
+ "epoch": 0.4474708171206226,
176
+ "grad_norm": 0.07289458811283112,
177
+ "learning_rate": 5.066666666666667e-05,
178
+ "loss": 0.0072,
179
+ "step": 115
180
+ },
181
+ {
182
+ "epoch": 0.4669260700389105,
183
+ "grad_norm": 0.04821142926812172,
184
+ "learning_rate": 5.288888888888889e-05,
185
+ "loss": 0.0063,
186
+ "step": 120
187
+ },
188
+ {
189
+ "epoch": 0.48638132295719844,
190
+ "grad_norm": 0.06011726334691048,
191
+ "learning_rate": 5.511111111111111e-05,
192
+ "loss": 0.0068,
193
+ "step": 125
194
+ },
195
+ {
196
+ "epoch": 0.5058365758754864,
197
+ "grad_norm": 0.05461547151207924,
198
+ "learning_rate": 5.7333333333333336e-05,
199
+ "loss": 0.0065,
200
+ "step": 130
201
+ },
202
+ {
203
+ "epoch": 0.5252918287937743,
204
+ "grad_norm": 0.05819641426205635,
205
+ "learning_rate": 5.9555555555555554e-05,
206
+ "loss": 0.0061,
207
+ "step": 135
208
+ },
209
+ {
210
+ "epoch": 0.5447470817120622,
211
+ "grad_norm": 0.048746656626462936,
212
+ "learning_rate": 6.177777777777779e-05,
213
+ "loss": 0.0065,
214
+ "step": 140
215
+ },
216
+ {
217
+ "epoch": 0.5642023346303502,
218
+ "grad_norm": 0.04917529225349426,
219
+ "learning_rate": 6.4e-05,
220
+ "loss": 0.0066,
221
+ "step": 145
222
+ },
223
+ {
224
+ "epoch": 0.5836575875486382,
225
+ "grad_norm": 0.062472034245729446,
226
+ "learning_rate": 6.622222222222222e-05,
227
+ "loss": 0.0067,
228
+ "step": 150
229
+ },
230
+ {
231
+ "epoch": 0.603112840466926,
232
+ "grad_norm": 0.07727159559726715,
233
+ "learning_rate": 6.844444444444445e-05,
234
+ "loss": 0.0068,
235
+ "step": 155
236
+ },
237
+ {
238
+ "epoch": 0.622568093385214,
239
+ "grad_norm": 0.036719705909490585,
240
+ "learning_rate": 7.066666666666667e-05,
241
+ "loss": 0.0062,
242
+ "step": 160
243
+ },
244
+ {
245
+ "epoch": 0.642023346303502,
246
+ "grad_norm": 0.04453688859939575,
247
+ "learning_rate": 7.288888888888888e-05,
248
+ "loss": 0.0064,
249
+ "step": 165
250
+ },
251
+ {
252
+ "epoch": 0.6614785992217899,
253
+ "grad_norm": 0.06014733761548996,
254
+ "learning_rate": 7.511111111111111e-05,
255
+ "loss": 0.0068,
256
+ "step": 170
257
+ },
258
+ {
259
+ "epoch": 0.6809338521400778,
260
+ "grad_norm": 0.06567176431417465,
261
+ "learning_rate": 7.733333333333333e-05,
262
+ "loss": 0.0069,
263
+ "step": 175
264
+ },
265
+ {
266
+ "epoch": 0.7003891050583657,
267
+ "grad_norm": 0.058225322514772415,
268
+ "learning_rate": 7.955555555555556e-05,
269
+ "loss": 0.007,
270
+ "step": 180
271
+ },
272
+ {
273
+ "epoch": 0.7198443579766537,
274
+ "grad_norm": 0.04187316447496414,
275
+ "learning_rate": 8.177777777777778e-05,
276
+ "loss": 0.0065,
277
+ "step": 185
278
+ },
279
+ {
280
+ "epoch": 0.7392996108949417,
281
+ "grad_norm": 0.03578794747591019,
282
+ "learning_rate": 8.4e-05,
283
+ "loss": 0.0061,
284
+ "step": 190
285
+ },
286
+ {
287
+ "epoch": 0.7587548638132295,
288
+ "grad_norm": 0.0358467772603035,
289
+ "learning_rate": 8.622222222222223e-05,
290
+ "loss": 0.0065,
291
+ "step": 195
292
+ },
293
+ {
294
+ "epoch": 0.7782101167315175,
295
+ "grad_norm": 0.027109306305646896,
296
+ "learning_rate": 8.844444444444445e-05,
297
+ "loss": 0.0063,
298
+ "step": 200
299
+ },
300
+ {
301
+ "epoch": 0.7782101167315175,
302
+ "eval_loss": 0.01398194208741188,
303
+ "eval_runtime": 152.8483,
304
+ "eval_samples_per_second": 3.337,
305
+ "eval_steps_per_second": 0.837,
306
+ "step": 200
307
+ },
308
+ {
309
+ "epoch": 0.7976653696498055,
310
+ "grad_norm": 0.031359098851680756,
311
+ "learning_rate": 9.066666666666667e-05,
312
+ "loss": 0.0061,
313
+ "step": 205
314
+ },
315
+ {
316
+ "epoch": 0.8171206225680934,
317
+ "grad_norm": 0.041419435292482376,
318
+ "learning_rate": 9.288888888888888e-05,
319
+ "loss": 0.0062,
320
+ "step": 210
321
+ },
322
+ {
323
+ "epoch": 0.8365758754863813,
324
+ "grad_norm": 0.03226594254374504,
325
+ "learning_rate": 9.511111111111112e-05,
326
+ "loss": 0.0059,
327
+ "step": 215
328
+ },
329
+ {
330
+ "epoch": 0.8560311284046692,
331
+ "grad_norm": 0.07033108919858932,
332
+ "learning_rate": 9.733333333333333e-05,
333
+ "loss": 0.0061,
334
+ "step": 220
335
+ },
336
+ {
337
+ "epoch": 0.8754863813229572,
338
+ "grad_norm": 0.05067949742078781,
339
+ "learning_rate": 9.955555555555556e-05,
340
+ "loss": 0.0059,
341
+ "step": 225
342
+ },
343
+ {
344
+ "epoch": 0.8949416342412452,
345
+ "grad_norm": 0.025263365358114243,
346
+ "learning_rate": 0.00010177777777777777,
347
+ "loss": 0.0059,
348
+ "step": 230
349
+ },
350
+ {
351
+ "epoch": 0.914396887159533,
352
+ "grad_norm": 0.03773004561662674,
353
+ "learning_rate": 0.00010400000000000001,
354
+ "loss": 0.0062,
355
+ "step": 235
356
+ },
357
+ {
358
+ "epoch": 0.933852140077821,
359
+ "grad_norm": 0.03331644833087921,
360
+ "learning_rate": 0.00010622222222222222,
361
+ "loss": 0.006,
362
+ "step": 240
363
+ },
364
+ {
365
+ "epoch": 0.953307392996109,
366
+ "grad_norm": 0.03881971910595894,
367
+ "learning_rate": 0.00010844444444444444,
368
+ "loss": 0.0059,
369
+ "step": 245
370
+ },
371
+ {
372
+ "epoch": 0.9727626459143969,
373
+ "grad_norm": 0.0403914675116539,
374
+ "learning_rate": 0.00011066666666666668,
375
+ "loss": 0.0062,
376
+ "step": 250
377
+ },
378
+ {
379
+ "epoch": 0.9922178988326849,
380
+ "grad_norm": 0.03234079107642174,
381
+ "learning_rate": 0.0001128888888888889,
382
+ "loss": 0.0064,
383
+ "step": 255
384
+ },
385
+ {
386
+ "epoch": 1.0116731517509727,
387
+ "grad_norm": 0.0188930481672287,
388
+ "learning_rate": 0.00011511111111111112,
389
+ "loss": 0.0061,
390
+ "step": 260
391
+ },
392
+ {
393
+ "epoch": 1.0311284046692606,
394
+ "grad_norm": 0.02896421030163765,
395
+ "learning_rate": 0.00011733333333333333,
396
+ "loss": 0.0059,
397
+ "step": 265
398
+ },
399
+ {
400
+ "epoch": 1.0505836575875487,
401
+ "grad_norm": 0.038550637662410736,
402
+ "learning_rate": 0.00011955555555555557,
403
+ "loss": 0.0066,
404
+ "step": 270
405
+ },
406
+ {
407
+ "epoch": 1.0700389105058365,
408
+ "grad_norm": 0.024153664708137512,
409
+ "learning_rate": 0.00011999797360750958,
410
+ "loss": 0.0062,
411
+ "step": 275
412
+ },
413
+ {
414
+ "epoch": 1.0894941634241244,
415
+ "grad_norm": 0.027283893898129463,
416
+ "learning_rate": 0.00011998974162260325,
417
+ "loss": 0.006,
418
+ "step": 280
419
+ },
420
+ {
421
+ "epoch": 1.1089494163424125,
422
+ "grad_norm": 0.035498056560754776,
423
+ "learning_rate": 0.00011997517826389341,
424
+ "loss": 0.0064,
425
+ "step": 285
426
+ },
427
+ {
428
+ "epoch": 1.1284046692607004,
429
+ "grad_norm": 0.021591784432530403,
430
+ "learning_rate": 0.00011995428506841069,
431
+ "loss": 0.0061,
432
+ "step": 290
433
+ },
434
+ {
435
+ "epoch": 1.1478599221789882,
436
+ "grad_norm": 0.016961926594376564,
437
+ "learning_rate": 0.00011992706424124257,
438
+ "loss": 0.006,
439
+ "step": 295
440
+ },
441
+ {
442
+ "epoch": 1.1673151750972763,
443
+ "grad_norm": 0.03020872175693512,
444
+ "learning_rate": 0.00011989351865530078,
445
+ "loss": 0.0063,
446
+ "step": 300
447
+ },
448
+ {
449
+ "epoch": 1.1673151750972763,
450
+ "eval_loss": 0.014270401559770107,
451
+ "eval_runtime": 152.9753,
452
+ "eval_samples_per_second": 3.334,
453
+ "eval_steps_per_second": 0.837,
454
+ "step": 300
455
+ }
456
+ ],
457
+ "logging_steps": 5,
458
+ "max_steps": 1799,
459
+ "num_input_tokens_seen": 0,
460
+ "num_train_epochs": 7,
461
+ "save_steps": 100,
462
+ "stateful_callbacks": {
463
+ "EarlyStoppingCallback": {
464
+ "args": {
465
+ "early_stopping_patience": 30,
466
+ "early_stopping_threshold": 0.001
467
+ },
468
+ "attributes": {
469
+ "early_stopping_patience_counter": 2
470
+ }
471
+ },
472
+ "TrainerControl": {
473
+ "args": {
474
+ "should_epoch_stop": false,
475
+ "should_evaluate": false,
476
+ "should_log": false,
477
+ "should_save": true,
478
+ "should_training_stop": false
479
+ },
480
+ "attributes": {}
481
+ }
482
+ },
483
+ "total_flos": 2.0425665990873542e+17,
484
+ "train_batch_size": 4,
485
+ "trial_name": null,
486
+ "trial_params": null
487
+ }
checkpoint-300/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e74aab1d0c2724784614181306df8fc5fd4785e5c0771ac25a75ed3b96204917
3
+ size 6417
checkpoint-400/README.md ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/medgemma-4b-it-unsloth-bnb-4bit
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:unsloth/medgemma-4b-it-unsloth-bnb-4bit
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ - unsloth
12
+ ---
13
+
14
+ # Model Card for Model ID
15
+
16
+ <!-- Provide a quick summary of what the model is/does. -->
17
+
18
+
19
+
20
+ ## Model Details
21
+
22
+ ### Model Description
23
+
24
+ <!-- Provide a longer summary of what this model is. -->
25
+
26
+
27
+
28
+ - **Developed by:** [More Information Needed]
29
+ - **Funded by [optional]:** [More Information Needed]
30
+ - **Shared by [optional]:** [More Information Needed]
31
+ - **Model type:** [More Information Needed]
32
+ - **Language(s) (NLP):** [More Information Needed]
33
+ - **License:** [More Information Needed]
34
+ - **Finetuned from model [optional]:** [More Information Needed]
35
+
36
+ ### Model Sources [optional]
37
+
38
+ <!-- Provide the basic links for the model. -->
39
+
40
+ - **Repository:** [More Information Needed]
41
+ - **Paper [optional]:** [More Information Needed]
42
+ - **Demo [optional]:** [More Information Needed]
43
+
44
+ ## Uses
45
+
46
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
47
+
48
+ ### Direct Use
49
+
50
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
51
+
52
+ [More Information Needed]
53
+
54
+ ### Downstream Use [optional]
55
+
56
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
57
+
58
+ [More Information Needed]
59
+
60
+ ### Out-of-Scope Use
61
+
62
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
63
+
64
+ [More Information Needed]
65
+
66
+ ## Bias, Risks, and Limitations
67
+
68
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
69
+
70
+ [More Information Needed]
71
+
72
+ ### Recommendations
73
+
74
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
75
+
76
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
77
+
78
+ ## How to Get Started with the Model
79
+
80
+ Use the code below to get started with the model.
81
+
82
+ [More Information Needed]
83
+
84
+ ## Training Details
85
+
86
+ ### Training Data
87
+
88
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
89
+
90
+ [More Information Needed]
91
+
92
+ ### Training Procedure
93
+
94
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
95
+
96
+ #### Preprocessing [optional]
97
+
98
+ [More Information Needed]
99
+
100
+
101
+ #### Training Hyperparameters
102
+
103
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
104
+
105
+ #### Speeds, Sizes, Times [optional]
106
+
107
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
108
+
109
+ [More Information Needed]
110
+
111
+ ## Evaluation
112
+
113
+ <!-- This section describes the evaluation protocols and provides the results. -->
114
+
115
+ ### Testing Data, Factors & Metrics
116
+
117
+ #### Testing Data
118
+
119
+ <!-- This should link to a Dataset Card if possible. -->
120
+
121
+ [More Information Needed]
122
+
123
+ #### Factors
124
+
125
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
126
+
127
+ [More Information Needed]
128
+
129
+ #### Metrics
130
+
131
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
132
+
133
+ [More Information Needed]
134
+
135
+ ### Results
136
+
137
+ [More Information Needed]
138
+
139
+ #### Summary
140
+
141
+
142
+
143
+ ## Model Examination [optional]
144
+
145
+ <!-- Relevant interpretability work for the model goes here -->
146
+
147
+ [More Information Needed]
148
+
149
+ ## Environmental Impact
150
+
151
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
152
+
153
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
154
+
155
+ - **Hardware Type:** [More Information Needed]
156
+ - **Hours used:** [More Information Needed]
157
+ - **Cloud Provider:** [More Information Needed]
158
+ - **Compute Region:** [More Information Needed]
159
+ - **Carbon Emitted:** [More Information Needed]
160
+
161
+ ## Technical Specifications [optional]
162
+
163
+ ### Model Architecture and Objective
164
+
165
+ [More Information Needed]
166
+
167
+ ### Compute Infrastructure
168
+
169
+ [More Information Needed]
170
+
171
+ #### Hardware
172
+
173
+ [More Information Needed]
174
+
175
+ #### Software
176
+
177
+ [More Information Needed]
178
+
179
+ ## Citation [optional]
180
+
181
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
182
+
183
+ **BibTeX:**
184
+
185
+ [More Information Needed]
186
+
187
+ **APA:**
188
+
189
+ [More Information Needed]
190
+
191
+ ## Glossary [optional]
192
+
193
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
194
+
195
+ [More Information Needed]
196
+
197
+ ## More Information [optional]
198
+
199
+ [More Information Needed]
200
+
201
+ ## Model Card Authors [optional]
202
+
203
+ [More Information Needed]
204
+
205
+ ## Model Card Contact
206
+
207
+ [More Information Needed]
208
+ ### Framework versions
209
+
210
+ - PEFT 0.18.0
checkpoint-400/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": {
6
+ "base_model_class": "Gemma3ForConditionalGeneration",
7
+ "parent_library": "transformers.models.gemma3.modeling_gemma3",
8
+ "unsloth_fixed": true
9
+ },
10
+ "base_model_name_or_path": "unsloth/medgemma-4b-it-unsloth-bnb-4bit",
11
+ "bias": "none",
12
+ "corda_config": null,
13
+ "ensure_weight_tying": false,
14
+ "eva_config": null,
15
+ "exclude_modules": null,
16
+ "fan_in_fan_out": false,
17
+ "inference_mode": true,
18
+ "init_lora_weights": true,
19
+ "layer_replication": null,
20
+ "layers_pattern": null,
21
+ "layers_to_transform": null,
22
+ "loftq_config": {},
23
+ "lora_alpha": 32,
24
+ "lora_bias": false,
25
+ "lora_dropout": 0.05,
26
+ "megatron_config": null,
27
+ "megatron_core": "megatron.core",
28
+ "modules_to_save": null,
29
+ "peft_type": "LORA",
30
+ "peft_version": "0.18.0",
31
+ "qalora_group_size": 16,
32
+ "r": 32,
33
+ "rank_pattern": {},
34
+ "revision": null,
35
+ "target_modules": "(?:.*?(?:vision|image|visual|patch|language|text).*?(?:self_attn|attention|attn|mlp|feed_forward|ffn|dense).*?(?:k_proj|v_proj|q_proj|out_proj|fc1|fc2|o_proj|gate_proj|up_proj|down_proj).*?)|(?:\\bmodel\\.layers\\.[\\d]{1,}\\.(?:self_attn|attention|attn|mlp|feed_forward|ffn|dense)\\.(?:(?:k_proj|v_proj|q_proj|out_proj|fc1|fc2|o_proj|gate_proj|up_proj|down_proj)))",
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": true
42
+ }
checkpoint-400/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:958e649f9f391eb053a1d761e63f66533c7fad366ae8d5cadb2450e74b79d67e
3
+ size 308107928
checkpoint-400/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
checkpoint-400/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
checkpoint-400/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffbb20e460857e5578e76a59ab460f24251c712b29eca8b2eafe83a3fa634557
3
+ size 157332069
checkpoint-400/preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_pan_and_scan": null,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_mean": [
8
+ 0.5,
9
+ 0.5,
10
+ 0.5
11
+ ],
12
+ "image_processor_type": "Gemma3ImageProcessor",
13
+ "image_seq_length": 256,
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "pan_and_scan_max_num_crops": null,
20
+ "pan_and_scan_min_crop_size": null,
21
+ "pan_and_scan_min_ratio_to_activate": null,
22
+ "processor_class": "Gemma3Processor",
23
+ "resample": 2,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "height": 896,
27
+ "width": 896
28
+ }
29
+ }
checkpoint-400/processor_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "image_seq_length": 256,
3
+ "processor_class": "Gemma3Processor"
4
+ }
checkpoint-400/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f35a449824ab6da5f64df0583d6b86b5d8d310a39763d8b4e07c3df36afa3ade
3
+ size 14645
checkpoint-400/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e46939086c88a3be6cd6b25590a0af93e4076de259468484ea43c4e10d03cda
3
+ size 1465
checkpoint-400/special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<end_of_turn>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
checkpoint-400/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7666402c0617d170e6b0a985b3130c3fb0795393aa0970600994a5d9aae12351
3
+ size 33384822
checkpoint-400/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
checkpoint-400/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-400/trainer_state.json ADDED
@@ -0,0 +1,635 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 100,
3
+ "best_metric": 0.01389834564179182,
4
+ "best_model_checkpoint": "/teamspace/studios/this_studio/DATN/output/medgemma_finetuned/checkpoint-100",
5
+ "epoch": 1.556420233463035,
6
+ "eval_steps": 100,
7
+ "global_step": 400,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.019455252918287938,
14
+ "grad_norm": 3.0237326622009277,
15
+ "learning_rate": 1.777777777777778e-06,
16
+ "loss": 0.8809,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.038910505836575876,
21
+ "grad_norm": 2.4512810707092285,
22
+ "learning_rate": 4e-06,
23
+ "loss": 0.8569,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.058365758754863814,
28
+ "grad_norm": 1.5967055559158325,
29
+ "learning_rate": 6.222222222222222e-06,
30
+ "loss": 0.7725,
31
+ "step": 15
32
+ },
33
+ {
34
+ "epoch": 0.07782101167315175,
35
+ "grad_norm": 1.2497001886367798,
36
+ "learning_rate": 8.444444444444446e-06,
37
+ "loss": 0.6516,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.09727626459143969,
42
+ "grad_norm": 1.2455090284347534,
43
+ "learning_rate": 1.0666666666666667e-05,
44
+ "loss": 0.5238,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.11673151750972763,
49
+ "grad_norm": 1.361525535583496,
50
+ "learning_rate": 1.2888888888888889e-05,
51
+ "loss": 0.3777,
52
+ "step": 30
53
+ },
54
+ {
55
+ "epoch": 0.13618677042801555,
56
+ "grad_norm": 1.6556775569915771,
57
+ "learning_rate": 1.511111111111111e-05,
58
+ "loss": 0.2116,
59
+ "step": 35
60
+ },
61
+ {
62
+ "epoch": 0.1556420233463035,
63
+ "grad_norm": 0.6078555583953857,
64
+ "learning_rate": 1.7333333333333332e-05,
65
+ "loss": 0.0794,
66
+ "step": 40
67
+ },
68
+ {
69
+ "epoch": 0.17509727626459143,
70
+ "grad_norm": 0.31556975841522217,
71
+ "learning_rate": 1.9555555555555557e-05,
72
+ "loss": 0.0343,
73
+ "step": 45
74
+ },
75
+ {
76
+ "epoch": 0.19455252918287938,
77
+ "grad_norm": 0.23063282668590546,
78
+ "learning_rate": 2.177777777777778e-05,
79
+ "loss": 0.0227,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.2140077821011673,
84
+ "grad_norm": 0.10897089540958405,
85
+ "learning_rate": 2.4e-05,
86
+ "loss": 0.0159,
87
+ "step": 55
88
+ },
89
+ {
90
+ "epoch": 0.23346303501945526,
91
+ "grad_norm": 0.08667729049921036,
92
+ "learning_rate": 2.6222222222222226e-05,
93
+ "loss": 0.0155,
94
+ "step": 60
95
+ },
96
+ {
97
+ "epoch": 0.2529182879377432,
98
+ "grad_norm": 0.07056345790624619,
99
+ "learning_rate": 2.8444444444444447e-05,
100
+ "loss": 0.0133,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 0.2723735408560311,
105
+ "grad_norm": 0.119380883872509,
106
+ "learning_rate": 3.066666666666666e-05,
107
+ "loss": 0.0113,
108
+ "step": 70
109
+ },
110
+ {
111
+ "epoch": 0.2918287937743191,
112
+ "grad_norm": 0.10328345745801926,
113
+ "learning_rate": 3.288888888888889e-05,
114
+ "loss": 0.0074,
115
+ "step": 75
116
+ },
117
+ {
118
+ "epoch": 0.311284046692607,
119
+ "grad_norm": 0.08840714395046234,
120
+ "learning_rate": 3.511111111111111e-05,
121
+ "loss": 0.0069,
122
+ "step": 80
123
+ },
124
+ {
125
+ "epoch": 0.33073929961089493,
126
+ "grad_norm": 0.1998119205236435,
127
+ "learning_rate": 3.733333333333334e-05,
128
+ "loss": 0.0069,
129
+ "step": 85
130
+ },
131
+ {
132
+ "epoch": 0.35019455252918286,
133
+ "grad_norm": 0.08085718750953674,
134
+ "learning_rate": 3.9555555555555556e-05,
135
+ "loss": 0.0072,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.36964980544747084,
140
+ "grad_norm": 0.10597972571849823,
141
+ "learning_rate": 4.177777777777778e-05,
142
+ "loss": 0.0072,
143
+ "step": 95
144
+ },
145
+ {
146
+ "epoch": 0.38910505836575876,
147
+ "grad_norm": 0.04654397815465927,
148
+ "learning_rate": 4.4e-05,
149
+ "loss": 0.0065,
150
+ "step": 100
151
+ },
152
+ {
153
+ "epoch": 0.38910505836575876,
154
+ "eval_loss": 0.01389834564179182,
155
+ "eval_runtime": 174.9649,
156
+ "eval_samples_per_second": 2.915,
157
+ "eval_steps_per_second": 0.732,
158
+ "step": 100
159
+ },
160
+ {
161
+ "epoch": 0.4085603112840467,
162
+ "grad_norm": 0.08657707273960114,
163
+ "learning_rate": 4.6222222222222224e-05,
164
+ "loss": 0.0071,
165
+ "step": 105
166
+ },
167
+ {
168
+ "epoch": 0.4280155642023346,
169
+ "grad_norm": 0.09846911579370499,
170
+ "learning_rate": 4.844444444444445e-05,
171
+ "loss": 0.0069,
172
+ "step": 110
173
+ },
174
+ {
175
+ "epoch": 0.4474708171206226,
176
+ "grad_norm": 0.07289458811283112,
177
+ "learning_rate": 5.066666666666667e-05,
178
+ "loss": 0.0072,
179
+ "step": 115
180
+ },
181
+ {
182
+ "epoch": 0.4669260700389105,
183
+ "grad_norm": 0.04821142926812172,
184
+ "learning_rate": 5.288888888888889e-05,
185
+ "loss": 0.0063,
186
+ "step": 120
187
+ },
188
+ {
189
+ "epoch": 0.48638132295719844,
190
+ "grad_norm": 0.06011726334691048,
191
+ "learning_rate": 5.511111111111111e-05,
192
+ "loss": 0.0068,
193
+ "step": 125
194
+ },
195
+ {
196
+ "epoch": 0.5058365758754864,
197
+ "grad_norm": 0.05461547151207924,
198
+ "learning_rate": 5.7333333333333336e-05,
199
+ "loss": 0.0065,
200
+ "step": 130
201
+ },
202
+ {
203
+ "epoch": 0.5252918287937743,
204
+ "grad_norm": 0.05819641426205635,
205
+ "learning_rate": 5.9555555555555554e-05,
206
+ "loss": 0.0061,
207
+ "step": 135
208
+ },
209
+ {
210
+ "epoch": 0.5447470817120622,
211
+ "grad_norm": 0.048746656626462936,
212
+ "learning_rate": 6.177777777777779e-05,
213
+ "loss": 0.0065,
214
+ "step": 140
215
+ },
216
+ {
217
+ "epoch": 0.5642023346303502,
218
+ "grad_norm": 0.04917529225349426,
219
+ "learning_rate": 6.4e-05,
220
+ "loss": 0.0066,
221
+ "step": 145
222
+ },
223
+ {
224
+ "epoch": 0.5836575875486382,
225
+ "grad_norm": 0.062472034245729446,
226
+ "learning_rate": 6.622222222222222e-05,
227
+ "loss": 0.0067,
228
+ "step": 150
229
+ },
230
+ {
231
+ "epoch": 0.603112840466926,
232
+ "grad_norm": 0.07727159559726715,
233
+ "learning_rate": 6.844444444444445e-05,
234
+ "loss": 0.0068,
235
+ "step": 155
236
+ },
237
+ {
238
+ "epoch": 0.622568093385214,
239
+ "grad_norm": 0.036719705909490585,
240
+ "learning_rate": 7.066666666666667e-05,
241
+ "loss": 0.0062,
242
+ "step": 160
243
+ },
244
+ {
245
+ "epoch": 0.642023346303502,
246
+ "grad_norm": 0.04453688859939575,
247
+ "learning_rate": 7.288888888888888e-05,
248
+ "loss": 0.0064,
249
+ "step": 165
250
+ },
251
+ {
252
+ "epoch": 0.6614785992217899,
253
+ "grad_norm": 0.06014733761548996,
254
+ "learning_rate": 7.511111111111111e-05,
255
+ "loss": 0.0068,
256
+ "step": 170
257
+ },
258
+ {
259
+ "epoch": 0.6809338521400778,
260
+ "grad_norm": 0.06567176431417465,
261
+ "learning_rate": 7.733333333333333e-05,
262
+ "loss": 0.0069,
263
+ "step": 175
264
+ },
265
+ {
266
+ "epoch": 0.7003891050583657,
267
+ "grad_norm": 0.058225322514772415,
268
+ "learning_rate": 7.955555555555556e-05,
269
+ "loss": 0.007,
270
+ "step": 180
271
+ },
272
+ {
273
+ "epoch": 0.7198443579766537,
274
+ "grad_norm": 0.04187316447496414,
275
+ "learning_rate": 8.177777777777778e-05,
276
+ "loss": 0.0065,
277
+ "step": 185
278
+ },
279
+ {
280
+ "epoch": 0.7392996108949417,
281
+ "grad_norm": 0.03578794747591019,
282
+ "learning_rate": 8.4e-05,
283
+ "loss": 0.0061,
284
+ "step": 190
285
+ },
286
+ {
287
+ "epoch": 0.7587548638132295,
288
+ "grad_norm": 0.0358467772603035,
289
+ "learning_rate": 8.622222222222223e-05,
290
+ "loss": 0.0065,
291
+ "step": 195
292
+ },
293
+ {
294
+ "epoch": 0.7782101167315175,
295
+ "grad_norm": 0.027109306305646896,
296
+ "learning_rate": 8.844444444444445e-05,
297
+ "loss": 0.0063,
298
+ "step": 200
299
+ },
300
+ {
301
+ "epoch": 0.7782101167315175,
302
+ "eval_loss": 0.01398194208741188,
303
+ "eval_runtime": 152.8483,
304
+ "eval_samples_per_second": 3.337,
305
+ "eval_steps_per_second": 0.837,
306
+ "step": 200
307
+ },
308
+ {
309
+ "epoch": 0.7976653696498055,
310
+ "grad_norm": 0.031359098851680756,
311
+ "learning_rate": 9.066666666666667e-05,
312
+ "loss": 0.0061,
313
+ "step": 205
314
+ },
315
+ {
316
+ "epoch": 0.8171206225680934,
317
+ "grad_norm": 0.041419435292482376,
318
+ "learning_rate": 9.288888888888888e-05,
319
+ "loss": 0.0062,
320
+ "step": 210
321
+ },
322
+ {
323
+ "epoch": 0.8365758754863813,
324
+ "grad_norm": 0.03226594254374504,
325
+ "learning_rate": 9.511111111111112e-05,
326
+ "loss": 0.0059,
327
+ "step": 215
328
+ },
329
+ {
330
+ "epoch": 0.8560311284046692,
331
+ "grad_norm": 0.07033108919858932,
332
+ "learning_rate": 9.733333333333333e-05,
333
+ "loss": 0.0061,
334
+ "step": 220
335
+ },
336
+ {
337
+ "epoch": 0.8754863813229572,
338
+ "grad_norm": 0.05067949742078781,
339
+ "learning_rate": 9.955555555555556e-05,
340
+ "loss": 0.0059,
341
+ "step": 225
342
+ },
343
+ {
344
+ "epoch": 0.8949416342412452,
345
+ "grad_norm": 0.025263365358114243,
346
+ "learning_rate": 0.00010177777777777777,
347
+ "loss": 0.0059,
348
+ "step": 230
349
+ },
350
+ {
351
+ "epoch": 0.914396887159533,
352
+ "grad_norm": 0.03773004561662674,
353
+ "learning_rate": 0.00010400000000000001,
354
+ "loss": 0.0062,
355
+ "step": 235
356
+ },
357
+ {
358
+ "epoch": 0.933852140077821,
359
+ "grad_norm": 0.03331644833087921,
360
+ "learning_rate": 0.00010622222222222222,
361
+ "loss": 0.006,
362
+ "step": 240
363
+ },
364
+ {
365
+ "epoch": 0.953307392996109,
366
+ "grad_norm": 0.03881971910595894,
367
+ "learning_rate": 0.00010844444444444444,
368
+ "loss": 0.0059,
369
+ "step": 245
370
+ },
371
+ {
372
+ "epoch": 0.9727626459143969,
373
+ "grad_norm": 0.0403914675116539,
374
+ "learning_rate": 0.00011066666666666668,
375
+ "loss": 0.0062,
376
+ "step": 250
377
+ },
378
+ {
379
+ "epoch": 0.9922178988326849,
380
+ "grad_norm": 0.03234079107642174,
381
+ "learning_rate": 0.0001128888888888889,
382
+ "loss": 0.0064,
383
+ "step": 255
384
+ },
385
+ {
386
+ "epoch": 1.0116731517509727,
387
+ "grad_norm": 0.0188930481672287,
388
+ "learning_rate": 0.00011511111111111112,
389
+ "loss": 0.0061,
390
+ "step": 260
391
+ },
392
+ {
393
+ "epoch": 1.0311284046692606,
394
+ "grad_norm": 0.02896421030163765,
395
+ "learning_rate": 0.00011733333333333333,
396
+ "loss": 0.0059,
397
+ "step": 265
398
+ },
399
+ {
400
+ "epoch": 1.0505836575875487,
401
+ "grad_norm": 0.038550637662410736,
402
+ "learning_rate": 0.00011955555555555557,
403
+ "loss": 0.0066,
404
+ "step": 270
405
+ },
406
+ {
407
+ "epoch": 1.0700389105058365,
408
+ "grad_norm": 0.024153664708137512,
409
+ "learning_rate": 0.00011999797360750958,
410
+ "loss": 0.0062,
411
+ "step": 275
412
+ },
413
+ {
414
+ "epoch": 1.0894941634241244,
415
+ "grad_norm": 0.027283893898129463,
416
+ "learning_rate": 0.00011998974162260325,
417
+ "loss": 0.006,
418
+ "step": 280
419
+ },
420
+ {
421
+ "epoch": 1.1089494163424125,
422
+ "grad_norm": 0.035498056560754776,
423
+ "learning_rate": 0.00011997517826389341,
424
+ "loss": 0.0064,
425
+ "step": 285
426
+ },
427
+ {
428
+ "epoch": 1.1284046692607004,
429
+ "grad_norm": 0.021591784432530403,
430
+ "learning_rate": 0.00011995428506841069,
431
+ "loss": 0.0061,
432
+ "step": 290
433
+ },
434
+ {
435
+ "epoch": 1.1478599221789882,
436
+ "grad_norm": 0.016961926594376564,
437
+ "learning_rate": 0.00011992706424124257,
438
+ "loss": 0.006,
439
+ "step": 295
440
+ },
441
+ {
442
+ "epoch": 1.1673151750972763,
443
+ "grad_norm": 0.03020872175693512,
444
+ "learning_rate": 0.00011989351865530078,
445
+ "loss": 0.0063,
446
+ "step": 300
447
+ },
448
+ {
449
+ "epoch": 1.1673151750972763,
450
+ "eval_loss": 0.014270401559770107,
451
+ "eval_runtime": 152.9753,
452
+ "eval_samples_per_second": 3.334,
453
+ "eval_steps_per_second": 0.837,
454
+ "step": 300
455
+ },
456
+ {
457
+ "epoch": 1.1867704280155642,
458
+ "grad_norm": 0.025152679532766342,
459
+ "learning_rate": 0.00011985365185101807,
460
+ "loss": 0.0063,
461
+ "step": 305
462
+ },
463
+ {
464
+ "epoch": 1.206225680933852,
465
+ "grad_norm": 0.030001649633049965,
466
+ "learning_rate": 0.00011980746803597448,
467
+ "loss": 0.0063,
468
+ "step": 310
469
+ },
470
+ {
471
+ "epoch": 1.2256809338521402,
472
+ "grad_norm": 0.031324416399002075,
473
+ "learning_rate": 0.0001197549720844533,
474
+ "loss": 0.0065,
475
+ "step": 315
476
+ },
477
+ {
478
+ "epoch": 1.245136186770428,
479
+ "grad_norm": 0.030990222468972206,
480
+ "learning_rate": 0.00011969616953692672,
481
+ "loss": 0.0066,
482
+ "step": 320
483
+ },
484
+ {
485
+ "epoch": 1.264591439688716,
486
+ "grad_norm": 0.02283373288810253,
487
+ "learning_rate": 0.00011963106659947091,
488
+ "loss": 0.0062,
489
+ "step": 325
490
+ },
491
+ {
492
+ "epoch": 1.2840466926070038,
493
+ "grad_norm": 0.027848919853568077,
494
+ "learning_rate": 0.00011955967014311121,
495
+ "loss": 0.0061,
496
+ "step": 330
497
+ },
498
+ {
499
+ "epoch": 1.3035019455252919,
500
+ "grad_norm": 0.0324862040579319,
501
+ "learning_rate": 0.00011948198770309682,
502
+ "loss": 0.0064,
503
+ "step": 335
504
+ },
505
+ {
506
+ "epoch": 1.3229571984435797,
507
+ "grad_norm": 0.021669326350092888,
508
+ "learning_rate": 0.00011939802747810558,
509
+ "loss": 0.0061,
510
+ "step": 340
511
+ },
512
+ {
513
+ "epoch": 1.3424124513618678,
514
+ "grad_norm": 0.023976098746061325,
515
+ "learning_rate": 0.0001193077983293787,
516
+ "loss": 0.006,
517
+ "step": 345
518
+ },
519
+ {
520
+ "epoch": 1.3618677042801557,
521
+ "grad_norm": 0.022651424631476402,
522
+ "learning_rate": 0.00011921130977978545,
523
+ "loss": 0.0062,
524
+ "step": 350
525
+ },
526
+ {
527
+ "epoch": 1.3813229571984436,
528
+ "grad_norm": 0.021479440852999687,
529
+ "learning_rate": 0.0001191085720128182,
530
+ "loss": 0.0062,
531
+ "step": 355
532
+ },
533
+ {
534
+ "epoch": 1.4007782101167314,
535
+ "grad_norm": 0.1814209520816803,
536
+ "learning_rate": 0.00011899959587151756,
537
+ "loss": 0.0061,
538
+ "step": 360
539
+ },
540
+ {
541
+ "epoch": 1.4202334630350195,
542
+ "grad_norm": 1.4026191234588623,
543
+ "learning_rate": 0.00011888439285732813,
544
+ "loss": 0.0261,
545
+ "step": 365
546
+ },
547
+ {
548
+ "epoch": 1.4396887159533074,
549
+ "grad_norm": 1.3291703462600708,
550
+ "learning_rate": 0.00011876297512888443,
551
+ "loss": 0.0267,
552
+ "step": 370
553
+ },
554
+ {
555
+ "epoch": 1.4591439688715953,
556
+ "grad_norm": 1.7693599462509155,
557
+ "learning_rate": 0.00011863535550072783,
558
+ "loss": 0.0212,
559
+ "step": 375
560
+ },
561
+ {
562
+ "epoch": 1.4785992217898833,
563
+ "grad_norm": 0.9990978240966797,
564
+ "learning_rate": 0.00011850154744195403,
565
+ "loss": 0.0218,
566
+ "step": 380
567
+ },
568
+ {
569
+ "epoch": 1.4980544747081712,
570
+ "grad_norm": 0.29475995898246765,
571
+ "learning_rate": 0.0001183615650747915,
572
+ "loss": 0.0117,
573
+ "step": 385
574
+ },
575
+ {
576
+ "epoch": 1.517509727626459,
577
+ "grad_norm": 0.04740586131811142,
578
+ "learning_rate": 0.00011821542317311106,
579
+ "loss": 0.0084,
580
+ "step": 390
581
+ },
582
+ {
583
+ "epoch": 1.536964980544747,
584
+ "grad_norm": 0.04320238158106804,
585
+ "learning_rate": 0.00011806313716086658,
586
+ "loss": 0.0067,
587
+ "step": 395
588
+ },
589
+ {
590
+ "epoch": 1.556420233463035,
591
+ "grad_norm": 0.05465610325336456,
592
+ "learning_rate": 0.00011790472311046715,
593
+ "loss": 0.0065,
594
+ "step": 400
595
+ },
596
+ {
597
+ "epoch": 1.556420233463035,
598
+ "eval_loss": 0.07703334093093872,
599
+ "eval_runtime": 152.4522,
600
+ "eval_samples_per_second": 3.345,
601
+ "eval_steps_per_second": 0.84,
602
+ "step": 400
603
+ }
604
+ ],
605
+ "logging_steps": 5,
606
+ "max_steps": 1799,
607
+ "num_input_tokens_seen": 0,
608
+ "num_train_epochs": 7,
609
+ "save_steps": 100,
610
+ "stateful_callbacks": {
611
+ "EarlyStoppingCallback": {
612
+ "args": {
613
+ "early_stopping_patience": 30,
614
+ "early_stopping_threshold": 0.001
615
+ },
616
+ "attributes": {
617
+ "early_stopping_patience_counter": 3
618
+ }
619
+ },
620
+ "TrainerControl": {
621
+ "args": {
622
+ "should_epoch_stop": false,
623
+ "should_evaluate": false,
624
+ "should_log": false,
625
+ "should_save": true,
626
+ "should_training_stop": false
627
+ },
628
+ "attributes": {}
629
+ }
630
+ },
631
+ "total_flos": 2.7238400607783456e+17,
632
+ "train_batch_size": 4,
633
+ "trial_name": null,
634
+ "trial_params": null
635
+ }
checkpoint-400/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e74aab1d0c2724784614181306df8fc5fd4785e5c0771ac25a75ed3b96204917
3
+ size 6417