ImNotTam committed on
Commit
d4c59df
·
verified ·
1 Parent(s): 49373d1

Upload full training folder with all checkpoints

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +7 -0
  2. README.md +175 -0
  3. checkpoint-220/README.md +210 -0
  4. checkpoint-220/adapter_config.json +42 -0
  5. checkpoint-220/adapter_model.safetensors +3 -0
  6. checkpoint-220/added_tokens.json +3 -0
  7. checkpoint-220/chat_template.jinja +47 -0
  8. checkpoint-220/optimizer.pt +3 -0
  9. checkpoint-220/preprocessor_config.json +29 -0
  10. checkpoint-220/processor_config.json +4 -0
  11. checkpoint-220/rng_state.pth +3 -0
  12. checkpoint-220/scheduler.pt +3 -0
  13. checkpoint-220/special_tokens_map.json +33 -0
  14. checkpoint-220/tokenizer.json +3 -0
  15. checkpoint-220/tokenizer.model +3 -0
  16. checkpoint-220/tokenizer_config.json +0 -0
  17. checkpoint-220/trainer_state.json +527 -0
  18. checkpoint-220/training_args.bin +3 -0
  19. checkpoint-230/README.md +210 -0
  20. checkpoint-230/adapter_config.json +42 -0
  21. checkpoint-230/adapter_model.safetensors +3 -0
  22. checkpoint-230/added_tokens.json +3 -0
  23. checkpoint-230/chat_template.jinja +47 -0
  24. checkpoint-230/optimizer.pt +3 -0
  25. checkpoint-230/preprocessor_config.json +29 -0
  26. checkpoint-230/processor_config.json +4 -0
  27. checkpoint-230/rng_state.pth +3 -0
  28. checkpoint-230/scheduler.pt +3 -0
  29. checkpoint-230/special_tokens_map.json +33 -0
  30. checkpoint-230/tokenizer.json +3 -0
  31. checkpoint-230/tokenizer.model +3 -0
  32. checkpoint-230/tokenizer_config.json +0 -0
  33. checkpoint-230/trainer_state.json +549 -0
  34. checkpoint-230/training_args.bin +3 -0
  35. checkpoint-240/README.md +210 -0
  36. checkpoint-240/adapter_config.json +42 -0
  37. checkpoint-240/adapter_model.safetensors +3 -0
  38. checkpoint-240/added_tokens.json +3 -0
  39. checkpoint-240/chat_template.jinja +47 -0
  40. checkpoint-240/optimizer.pt +3 -0
  41. checkpoint-240/preprocessor_config.json +29 -0
  42. checkpoint-240/processor_config.json +4 -0
  43. checkpoint-240/rng_state.pth +3 -0
  44. checkpoint-240/scheduler.pt +3 -0
  45. checkpoint-240/special_tokens_map.json +33 -0
  46. checkpoint-240/tokenizer.json +3 -0
  47. checkpoint-240/tokenizer.model +3 -0
  48. checkpoint-240/tokenizer_config.json +0 -0
  49. checkpoint-240/trainer_state.json +571 -0
  50. checkpoint-240/training_args.bin +3 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-220/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-230/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ checkpoint-240/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ checkpoint-250/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ checkpoint-260/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
+ final_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
42
+ lora_adapters/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - vi
4
+ - en
5
+ license: apache-2.0
6
+ tags:
7
+ - llm-judge
8
+ - training-checkpoint
9
+ - lora
10
+ - unsloth
11
+ ---
12
+
13
+ # finetuned_4_12
14
+
15
+ Full training folder backup - Toàn bộ checkpoints và models.
16
+
17
+ ## 📂 Cấu trúc Folder
18
+ ```
19
+ train_
20
+ ├── lora_adapters/ # LoRA adapters
21
+ ├── README.md
22
+ ├── zero_shot_metrics.json
23
+ └── zero_shot_results.csv
24
+ ```
25
+
26
+ ## 🚀 Sử Dụng
27
+
28
+ ### 1️⃣ Clone Repo
29
+ ```bash
30
+ git lfs install
31
+ git clone https://huggingface.co/ImNotTam/finetuned_4_12
32
+ cd finetuned_4_12
33
+ ```
34
+
35
+ ### 2️⃣ Load LoRA Adapters (Nhẹ nhất - khuyến nghị)
36
+ ```python
37
+ from unsloth import FastLanguageModel
38
+
39
+ model, tokenizer = FastLanguageModel.from_pretrained(
40
+ model_name="ImNotTam/finetuned_4_12",
41
+ subfolder="lora_adapters",
42
+ max_seq_length=2048,
43
+ dtype=None,
44
+ load_in_4bit=True,
45
+ )
46
+
47
+ # Enable inference mode
48
+ FastLanguageModel.for_inference(model)
49
+
50
+ # Test
51
+ prompt = "Đánh giá response này..."
52
+ inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
53
+ outputs = model.generate(**inputs, max_new_tokens=256)
54
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
55
+ ```
56
+
57
+ ### 3️⃣ Load Final Model
58
+ ```python
59
+ from transformers import AutoModelForCausalLM, AutoTokenizer
60
+
61
+ model = AutoModelForCausalLM.from_pretrained(
62
+ "ImNotTam/finetuned_4_12",
63
+ subfolder="final_model",
64
+ device_map="auto",
65
+ torch_dtype="auto"
66
+ )
67
+ tokenizer = AutoTokenizer.from_pretrained("ImNotTam/finetuned_4_12", subfolder="final_model")
68
+
69
+ # Inference
70
+ inputs = tokenizer("Your prompt", return_tensors="pt").to("cuda")
71
+ outputs = model.generate(**inputs)
72
+ print(tokenizer.decode(outputs[0]))
73
+ ```
74
+
75
+ ### 4️⃣ Resume Training từ Checkpoint
76
+ ```python
77
+ from transformers import AutoModelForCausalLM, Trainer, TrainingArguments
78
+
79
+ # Load checkpoint muốn resume
80
+ model = AutoModelForCausalLM.from_pretrained(
81
+ "ImNotTam/finetuned_4_12",
82
+ subfolder="checkpoint-220", # Chọn checkpoint (repo có checkpoint-220 … checkpoint-260)
83
+ device_map="auto"
84
+ )
85
+
86
+ # Continue training
87
+ trainer = Trainer(
88
+ model=model,
89
+ args=TrainingArguments(
90
+ output_dir="./continue_training",
91
+ # ... your training args
92
+ ),
93
+ )
94
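+ trainer.train(resume_from_checkpoint="./checkpoint-220")  # đường dẫn local tới checkpoint trong repo đã clone; `True` chỉ tìm checkpoint trong output_dir (đang rỗng) và sẽ báo lỗi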
95
+ ```
96
+
97
+ ### 5️⃣ Fine-tune Tiếp từ LoRA Adapter
98
+ ```python
99
+ from unsloth import FastLanguageModel
100
+ from trl import SFTTrainer
101
+
102
+ # Load LoRA adapter
103
+ model, tokenizer = FastLanguageModel.from_pretrained(
104
+ model_name="ImNotTam/finetuned_4_12",
105
+ subfolder="lora_adapters",
106
+ max_seq_length=2048,
107
+ dtype=None,
108
+ load_in_4bit=True,
109
+ )
110
+
111
+ # Add LoRA config để train tiếp
112
+ model = FastLanguageModel.get_peft_model(
113
+ model,
114
+ r=16,
115
+ target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
116
+ "gate_proj", "up_proj", "down_proj"],
117
+ lora_alpha=16,
118
+ lora_dropout=0,
119
+ bias="none",
120
+ use_gradient_checkpointing="unsloth",
121
+ )
122
+
123
+ # Train với data mới
124
+ trainer = SFTTrainer(
125
+ model=model,
126
+ tokenizer=tokenizer,
127
+ train_dataset=your_new_dataset,
128
+ # ... training args
129
+ )
130
+ trainer.train()
131
+ ```
132
+
133
+ ### 6️⃣ Xem Metrics và Results
134
+ ```python
135
+ import json
136
+ import pandas as pd
137
+
138
+ # Load metrics
139
+ with open("zero_shot_metrics.json", "r") as f:
140
+ metrics = json.load(f)
141
+ print("📊 Metrics:", metrics)
142
+
143
+ # Load results
144
+ results = pd.read_csv("zero_shot_results.csv")
145
+ print("\n📈 Results:")
146
+ print(results.head())
147
+ ```
148
+
149
+ ## 📋 Nội Dung Repo
150
+
151
+ | Folder/File | Mô tả | Kích thước |
152
+ |-------------|-------|------------|
153
+ | `lora_adapters/` | LoRA adapters (nhẹ) | ~50-100 MB |
154
+ | `final_model/` | Model merged đầy đủ | ~4-8 GB |
155
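+ | `checkpoint-220/` | Training checkpoint (LoRA adapter + optimizer/scheduler state) | ~500 MB |
156
+ | `checkpoint-230/` | Training checkpoint (LoRA adapter + optimizer/scheduler state) | ~500 MB |
157
+ | `checkpoint-240/`, `checkpoint-250/`, `checkpoint-260/` | Training checkpoints (LoRA adapter + optimizer/scheduler state) | ~500 MB mỗi checkpoint |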
158
+ | `zero_shot_metrics.json` | Evaluation metrics | <1 MB |
159
+ | `zero_shot_results.csv` | Detailed results | <1 MB |
160
+
161
+ ## 💡 Khuyến Nghị
162
+
163
+ - **Inference nhanh:** Dùng `lora_adapters/`
164
+ - **Production:** Dùng `final_model/`
165
+ - **Train tiếp:** Load `lora_adapters/` + add LoRA config
166
+ - **Resume training:** Load checkpoint cụ thể
167
+
168
+ ## 📦 Requirements
169
+ ```bash
170
+ pip install unsloth transformers torch trl
171
+ ```
172
+
173
+ ## 📄 License
174
+
175
+ Apache 2.0
checkpoint-220/README.md ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/medgemma-4b-it-unsloth-bnb-4bit
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:unsloth/medgemma-4b-it-unsloth-bnb-4bit
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ - unsloth
12
+ ---
13
+
14
+ # Model Card for Model ID
15
+
16
+ <!-- Provide a quick summary of what the model is/does. -->
17
+
18
+
19
+
20
+ ## Model Details
21
+
22
+ ### Model Description
23
+
24
+ <!-- Provide a longer summary of what this model is. -->
25
+
26
+
27
+
28
+ - **Developed by:** [More Information Needed]
29
+ - **Funded by [optional]:** [More Information Needed]
30
+ - **Shared by [optional]:** [More Information Needed]
31
+ - **Model type:** [More Information Needed]
32
+ - **Language(s) (NLP):** [More Information Needed]
33
+ - **License:** [More Information Needed]
34
+ - **Finetuned from model [optional]:** [More Information Needed]
35
+
36
+ ### Model Sources [optional]
37
+
38
+ <!-- Provide the basic links for the model. -->
39
+
40
+ - **Repository:** [More Information Needed]
41
+ - **Paper [optional]:** [More Information Needed]
42
+ - **Demo [optional]:** [More Information Needed]
43
+
44
+ ## Uses
45
+
46
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
47
+
48
+ ### Direct Use
49
+
50
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
51
+
52
+ [More Information Needed]
53
+
54
+ ### Downstream Use [optional]
55
+
56
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
57
+
58
+ [More Information Needed]
59
+
60
+ ### Out-of-Scope Use
61
+
62
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
63
+
64
+ [More Information Needed]
65
+
66
+ ## Bias, Risks, and Limitations
67
+
68
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
69
+
70
+ [More Information Needed]
71
+
72
+ ### Recommendations
73
+
74
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
75
+
76
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
77
+
78
+ ## How to Get Started with the Model
79
+
80
+ Use the code below to get started with the model.
81
+
82
+ [More Information Needed]
83
+
84
+ ## Training Details
85
+
86
+ ### Training Data
87
+
88
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
89
+
90
+ [More Information Needed]
91
+
92
+ ### Training Procedure
93
+
94
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
95
+
96
+ #### Preprocessing [optional]
97
+
98
+ [More Information Needed]
99
+
100
+
101
+ #### Training Hyperparameters
102
+
103
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
104
+
105
+ #### Speeds, Sizes, Times [optional]
106
+
107
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
108
+
109
+ [More Information Needed]
110
+
111
+ ## Evaluation
112
+
113
+ <!-- This section describes the evaluation protocols and provides the results. -->
114
+
115
+ ### Testing Data, Factors & Metrics
116
+
117
+ #### Testing Data
118
+
119
+ <!-- This should link to a Dataset Card if possible. -->
120
+
121
+ [More Information Needed]
122
+
123
+ #### Factors
124
+
125
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
126
+
127
+ [More Information Needed]
128
+
129
+ #### Metrics
130
+
131
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
132
+
133
+ [More Information Needed]
134
+
135
+ ### Results
136
+
137
+ [More Information Needed]
138
+
139
+ #### Summary
140
+
141
+
142
+
143
+ ## Model Examination [optional]
144
+
145
+ <!-- Relevant interpretability work for the model goes here -->
146
+
147
+ [More Information Needed]
148
+
149
+ ## Environmental Impact
150
+
151
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
152
+
153
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
154
+
155
+ - **Hardware Type:** [More Information Needed]
156
+ - **Hours used:** [More Information Needed]
157
+ - **Cloud Provider:** [More Information Needed]
158
+ - **Compute Region:** [More Information Needed]
159
+ - **Carbon Emitted:** [More Information Needed]
160
+
161
+ ## Technical Specifications [optional]
162
+
163
+ ### Model Architecture and Objective
164
+
165
+ [More Information Needed]
166
+
167
+ ### Compute Infrastructure
168
+
169
+ [More Information Needed]
170
+
171
+ #### Hardware
172
+
173
+ [More Information Needed]
174
+
175
+ #### Software
176
+
177
+ [More Information Needed]
178
+
179
+ ## Citation [optional]
180
+
181
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
182
+
183
+ **BibTeX:**
184
+
185
+ [More Information Needed]
186
+
187
+ **APA:**
188
+
189
+ [More Information Needed]
190
+
191
+ ## Glossary [optional]
192
+
193
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
194
+
195
+ [More Information Needed]
196
+
197
+ ## More Information [optional]
198
+
199
+ [More Information Needed]
200
+
201
+ ## Model Card Authors [optional]
202
+
203
+ [More Information Needed]
204
+
205
+ ## Model Card Contact
206
+
207
+ [More Information Needed]
208
+ ### Framework versions
209
+
210
+ - PEFT 0.18.0
checkpoint-220/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": {
6
+ "base_model_class": "Gemma3ForConditionalGeneration",
7
+ "parent_library": "transformers.models.gemma3.modeling_gemma3",
8
+ "unsloth_fixed": true
9
+ },
10
+ "base_model_name_or_path": "unsloth/medgemma-4b-it-unsloth-bnb-4bit",
11
+ "bias": "none",
12
+ "corda_config": null,
13
+ "ensure_weight_tying": false,
14
+ "eva_config": null,
15
+ "exclude_modules": null,
16
+ "fan_in_fan_out": false,
17
+ "inference_mode": true,
18
+ "init_lora_weights": true,
19
+ "layer_replication": null,
20
+ "layers_pattern": null,
21
+ "layers_to_transform": null,
22
+ "loftq_config": {},
23
+ "lora_alpha": 32,
24
+ "lora_bias": false,
25
+ "lora_dropout": 0.05,
26
+ "megatron_config": null,
27
+ "megatron_core": "megatron.core",
28
+ "modules_to_save": null,
29
+ "peft_type": "LORA",
30
+ "peft_version": "0.18.0",
31
+ "qalora_group_size": 16,
32
+ "r": 32,
33
+ "rank_pattern": {},
34
+ "revision": null,
35
+ "target_modules": "(?:.*?(?:vision|image|visual|patch|language|text).*?(?:self_attn|attention|attn|mlp|feed_forward|ffn|dense).*?(?:k_proj|v_proj|q_proj|out_proj|fc1|fc2|o_proj|gate_proj|up_proj|down_proj).*?)|(?:\\bmodel\\.layers\\.[\\d]{1,}\\.(?:self_attn|attention|attn|mlp|feed_forward|ffn|dense)\\.(?:(?:k_proj|v_proj|q_proj|out_proj|fc1|fc2|o_proj|gate_proj|up_proj|down_proj)))",
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": true
42
+ }
checkpoint-220/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:383e6779985c7ead01c6bfa46928f6b10f7acf13e095c9416f48b1d37155a748
3
+ size 308107928
checkpoint-220/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
checkpoint-220/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
checkpoint-220/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dd9ee185873bdd05083b4b486621d58a40bfefe9c0d53e2ca1fbd34c4e06f01
3
+ size 157331237
checkpoint-220/preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_pan_and_scan": null,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_mean": [
8
+ 0.5,
9
+ 0.5,
10
+ 0.5
11
+ ],
12
+ "image_processor_type": "Gemma3ImageProcessor",
13
+ "image_seq_length": 256,
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "pan_and_scan_max_num_crops": null,
20
+ "pan_and_scan_min_crop_size": null,
21
+ "pan_and_scan_min_ratio_to_activate": null,
22
+ "processor_class": "Gemma3Processor",
23
+ "resample": 2,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "height": 896,
27
+ "width": 896
28
+ }
29
+ }
checkpoint-220/processor_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "image_seq_length": 256,
3
+ "processor_class": "Gemma3Processor"
4
+ }
checkpoint-220/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f58f67ef5f5176e222513e3a025ff13130dac72dee0647315ab995c26de4a4bd
3
+ size 14645
checkpoint-220/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cd0f7e390bbc1ed624f589a4721faeebacc227b89e31ec424dd12abd261a626
3
+ size 1465
checkpoint-220/special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<end_of_turn>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
checkpoint-220/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7666402c0617d170e6b0a985b3130c3fb0795393aa0970600994a5d9aae12351
3
+ size 33384822
checkpoint-220/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
checkpoint-220/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-220/trainer_state.json ADDED
@@ -0,0 +1,527 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 220,
3
+ "best_metric": 0.010845971293747425,
4
+ "best_model_checkpoint": "/teamspace/studios/this_studio/DATN/output/medgemma_finetuned/checkpoint-220",
5
+ "epoch": 0.8560311284046692,
6
+ "eval_steps": 10,
7
+ "global_step": 220,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.019455252918287938,
14
+ "grad_norm": 3.7302613258361816,
15
+ "learning_rate": 1.777777777777778e-06,
16
+ "loss": 0.8239,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.038910505836575876,
21
+ "grad_norm": 3.058382034301758,
22
+ "learning_rate": 4e-06,
23
+ "loss": 0.7964,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.038910505836575876,
28
+ "eval_loss": 0.7571244239807129,
29
+ "eval_runtime": 152.7626,
30
+ "eval_samples_per_second": 3.339,
31
+ "eval_steps_per_second": 0.838,
32
+ "step": 10
33
+ },
34
+ {
35
+ "epoch": 0.058365758754863814,
36
+ "grad_norm": 1.8304738998413086,
37
+ "learning_rate": 6.222222222222222e-06,
38
+ "loss": 0.6943,
39
+ "step": 15
40
+ },
41
+ {
42
+ "epoch": 0.07782101167315175,
43
+ "grad_norm": 1.3162634372711182,
44
+ "learning_rate": 8.444444444444446e-06,
45
+ "loss": 0.5569,
46
+ "step": 20
47
+ },
48
+ {
49
+ "epoch": 0.07782101167315175,
50
+ "eval_loss": 0.5357815623283386,
51
+ "eval_runtime": 136.4123,
52
+ "eval_samples_per_second": 3.739,
53
+ "eval_steps_per_second": 0.938,
54
+ "step": 20
55
+ },
56
+ {
57
+ "epoch": 0.09727626459143969,
58
+ "grad_norm": 1.1865513324737549,
59
+ "learning_rate": 1.0666666666666667e-05,
60
+ "loss": 0.4222,
61
+ "step": 25
62
+ },
63
+ {
64
+ "epoch": 0.11673151750972763,
65
+ "grad_norm": 1.2043355703353882,
66
+ "learning_rate": 1.2888888888888889e-05,
67
+ "loss": 0.2831,
68
+ "step": 30
69
+ },
70
+ {
71
+ "epoch": 0.11673151750972763,
72
+ "eval_loss": 0.28264933824539185,
73
+ "eval_runtime": 136.5183,
74
+ "eval_samples_per_second": 3.736,
75
+ "eval_steps_per_second": 0.938,
76
+ "step": 30
77
+ },
78
+ {
79
+ "epoch": 0.13618677042801555,
80
+ "grad_norm": 0.9503483772277832,
81
+ "learning_rate": 1.511111111111111e-05,
82
+ "loss": 0.1405,
83
+ "step": 35
84
+ },
85
+ {
86
+ "epoch": 0.1556420233463035,
87
+ "grad_norm": 0.34179583191871643,
88
+ "learning_rate": 1.7333333333333332e-05,
89
+ "loss": 0.0515,
90
+ "step": 40
91
+ },
92
+ {
93
+ "epoch": 0.1556420233463035,
94
+ "eval_loss": 0.05298379808664322,
95
+ "eval_runtime": 136.391,
96
+ "eval_samples_per_second": 3.739,
97
+ "eval_steps_per_second": 0.938,
98
+ "step": 40
99
+ },
100
+ {
101
+ "epoch": 0.17509727626459143,
102
+ "grad_norm": 0.24818392097949982,
103
+ "learning_rate": 1.9555555555555557e-05,
104
+ "loss": 0.0302,
105
+ "step": 45
106
+ },
107
+ {
108
+ "epoch": 0.19455252918287938,
109
+ "grad_norm": 0.19131316244602203,
110
+ "learning_rate": 2.177777777777778e-05,
111
+ "loss": 0.0221,
112
+ "step": 50
113
+ },
114
+ {
115
+ "epoch": 0.19455252918287938,
116
+ "eval_loss": 0.024967821314930916,
117
+ "eval_runtime": 136.5245,
118
+ "eval_samples_per_second": 3.736,
119
+ "eval_steps_per_second": 0.938,
120
+ "step": 50
121
+ },
122
+ {
123
+ "epoch": 0.2140077821011673,
124
+ "grad_norm": 2.454702377319336,
125
+ "learning_rate": 2.4e-05,
126
+ "loss": 0.0169,
127
+ "step": 55
128
+ },
129
+ {
130
+ "epoch": 0.23346303501945526,
131
+ "grad_norm": 0.12364993244409561,
132
+ "learning_rate": 2.6222222222222226e-05,
133
+ "loss": 0.0164,
134
+ "step": 60
135
+ },
136
+ {
137
+ "epoch": 0.23346303501945526,
138
+ "eval_loss": 0.021661706268787384,
139
+ "eval_runtime": 136.8436,
140
+ "eval_samples_per_second": 3.727,
141
+ "eval_steps_per_second": 0.935,
142
+ "step": 60
143
+ },
144
+ {
145
+ "epoch": 0.2529182879377432,
146
+ "grad_norm": 0.14085163176059723,
147
+ "learning_rate": 2.8444444444444447e-05,
148
+ "loss": 0.0131,
149
+ "step": 65
150
+ },
151
+ {
152
+ "epoch": 0.2723735408560311,
153
+ "grad_norm": 0.15322668850421906,
154
+ "learning_rate": 3.066666666666666e-05,
155
+ "loss": 0.0089,
156
+ "step": 70
157
+ },
158
+ {
159
+ "epoch": 0.2723735408560311,
160
+ "eval_loss": 0.016315119341015816,
161
+ "eval_runtime": 136.2077,
162
+ "eval_samples_per_second": 3.744,
163
+ "eval_steps_per_second": 0.94,
164
+ "step": 70
165
+ },
166
+ {
167
+ "epoch": 0.2918287937743191,
168
+ "grad_norm": 0.08343034237623215,
169
+ "learning_rate": 3.288888888888889e-05,
170
+ "loss": 0.0076,
171
+ "step": 75
172
+ },
173
+ {
174
+ "epoch": 0.311284046692607,
175
+ "grad_norm": 0.11078440397977829,
176
+ "learning_rate": 3.511111111111111e-05,
177
+ "loss": 0.008,
178
+ "step": 80
179
+ },
180
+ {
181
+ "epoch": 0.311284046692607,
182
+ "eval_loss": 0.015718888491392136,
183
+ "eval_runtime": 136.7712,
184
+ "eval_samples_per_second": 3.729,
185
+ "eval_steps_per_second": 0.936,
186
+ "step": 80
187
+ },
188
+ {
189
+ "epoch": 0.33073929961089493,
190
+ "grad_norm": 0.08361168950796127,
191
+ "learning_rate": 3.733333333333334e-05,
192
+ "loss": 0.008,
193
+ "step": 85
194
+ },
195
+ {
196
+ "epoch": 0.35019455252918286,
197
+ "grad_norm": 0.06539439409971237,
198
+ "learning_rate": 3.9555555555555556e-05,
199
+ "loss": 0.0083,
200
+ "step": 90
201
+ },
202
+ {
203
+ "epoch": 0.35019455252918286,
204
+ "eval_loss": 0.015910081565380096,
205
+ "eval_runtime": 136.5187,
206
+ "eval_samples_per_second": 3.736,
207
+ "eval_steps_per_second": 0.938,
208
+ "step": 90
209
+ },
210
+ {
211
+ "epoch": 0.36964980544747084,
212
+ "grad_norm": 0.14973388612270355,
213
+ "learning_rate": 4.177777777777778e-05,
214
+ "loss": 0.0085,
215
+ "step": 95
216
+ },
217
+ {
218
+ "epoch": 0.38910505836575876,
219
+ "grad_norm": 0.08519362658262253,
220
+ "learning_rate": 4.4e-05,
221
+ "loss": 0.0077,
222
+ "step": 100
223
+ },
224
+ {
225
+ "epoch": 0.38910505836575876,
226
+ "eval_loss": 0.01615685597062111,
227
+ "eval_runtime": 136.4452,
228
+ "eval_samples_per_second": 3.738,
229
+ "eval_steps_per_second": 0.938,
230
+ "step": 100
231
+ },
232
+ {
233
+ "epoch": 0.4085603112840467,
234
+ "grad_norm": 0.05565109848976135,
235
+ "learning_rate": 4.6222222222222224e-05,
236
+ "loss": 0.0085,
237
+ "step": 105
238
+ },
239
+ {
240
+ "epoch": 0.4280155642023346,
241
+ "grad_norm": 0.07286959886550903,
242
+ "learning_rate": 4.844444444444445e-05,
243
+ "loss": 0.0082,
244
+ "step": 110
245
+ },
246
+ {
247
+ "epoch": 0.4280155642023346,
248
+ "eval_loss": 0.015698084607720375,
249
+ "eval_runtime": 136.3832,
250
+ "eval_samples_per_second": 3.739,
251
+ "eval_steps_per_second": 0.939,
252
+ "step": 110
253
+ },
254
+ {
255
+ "epoch": 0.4474708171206226,
256
+ "grad_norm": 0.13329896330833435,
257
+ "learning_rate": 5.066666666666667e-05,
258
+ "loss": 0.0085,
259
+ "step": 115
260
+ },
261
+ {
262
+ "epoch": 0.4669260700389105,
263
+ "grad_norm": 0.04628467932343483,
264
+ "learning_rate": 5.288888888888889e-05,
265
+ "loss": 0.0075,
266
+ "step": 120
267
+ },
268
+ {
269
+ "epoch": 0.4669260700389105,
270
+ "eval_loss": 0.015623296611011028,
271
+ "eval_runtime": 136.0422,
272
+ "eval_samples_per_second": 3.749,
273
+ "eval_steps_per_second": 0.941,
274
+ "step": 120
275
+ },
276
+ {
277
+ "epoch": 0.48638132295719844,
278
+ "grad_norm": 0.058520544320344925,
279
+ "learning_rate": 5.511111111111111e-05,
280
+ "loss": 0.0079,
281
+ "step": 125
282
+ },
283
+ {
284
+ "epoch": 0.5058365758754864,
285
+ "grad_norm": 0.06411632895469666,
286
+ "learning_rate": 5.7333333333333336e-05,
287
+ "loss": 0.0076,
288
+ "step": 130
289
+ },
290
+ {
291
+ "epoch": 0.5058365758754864,
292
+ "eval_loss": 0.014402530156075954,
293
+ "eval_runtime": 135.5623,
294
+ "eval_samples_per_second": 3.762,
295
+ "eval_steps_per_second": 0.944,
296
+ "step": 130
297
+ },
298
+ {
299
+ "epoch": 0.5252918287937743,
300
+ "grad_norm": 0.044081032276153564,
301
+ "learning_rate": 5.9555555555555554e-05,
302
+ "loss": 0.0072,
303
+ "step": 135
304
+ },
305
+ {
306
+ "epoch": 0.5447470817120622,
307
+ "grad_norm": 0.04867592826485634,
308
+ "learning_rate": 6.177777777777779e-05,
309
+ "loss": 0.0077,
310
+ "step": 140
311
+ },
312
+ {
313
+ "epoch": 0.5447470817120622,
314
+ "eval_loss": 0.012970623560249805,
315
+ "eval_runtime": 137.0137,
316
+ "eval_samples_per_second": 3.722,
317
+ "eval_steps_per_second": 0.934,
318
+ "step": 140
319
+ },
320
+ {
321
+ "epoch": 0.5642023346303502,
322
+ "grad_norm": 0.044633813202381134,
323
+ "learning_rate": 6.4e-05,
324
+ "loss": 0.0077,
325
+ "step": 145
326
+ },
327
+ {
328
+ "epoch": 0.5836575875486382,
329
+ "grad_norm": 0.052950419485569,
330
+ "learning_rate": 6.622222222222222e-05,
331
+ "loss": 0.008,
332
+ "step": 150
333
+ },
334
+ {
335
+ "epoch": 0.5836575875486382,
336
+ "eval_loss": 0.012441293336451054,
337
+ "eval_runtime": 136.133,
338
+ "eval_samples_per_second": 3.746,
339
+ "eval_steps_per_second": 0.94,
340
+ "step": 150
341
+ },
342
+ {
343
+ "epoch": 0.603112840466926,
344
+ "grad_norm": 0.039904553443193436,
345
+ "learning_rate": 6.844444444444445e-05,
346
+ "loss": 0.0078,
347
+ "step": 155
348
+ },
349
+ {
350
+ "epoch": 0.622568093385214,
351
+ "grad_norm": 0.05680263414978981,
352
+ "learning_rate": 7.066666666666667e-05,
353
+ "loss": 0.0074,
354
+ "step": 160
355
+ },
356
+ {
357
+ "epoch": 0.622568093385214,
358
+ "eval_loss": 0.01192025002092123,
359
+ "eval_runtime": 136.5582,
360
+ "eval_samples_per_second": 3.735,
361
+ "eval_steps_per_second": 0.937,
362
+ "step": 160
363
+ },
364
+ {
365
+ "epoch": 0.642023346303502,
366
+ "grad_norm": 0.05537933111190796,
367
+ "learning_rate": 7.288888888888888e-05,
368
+ "loss": 0.0076,
369
+ "step": 165
370
+ },
371
+ {
372
+ "epoch": 0.6614785992217899,
373
+ "grad_norm": 0.04935755953192711,
374
+ "learning_rate": 7.511111111111111e-05,
375
+ "loss": 0.0077,
376
+ "step": 170
377
+ },
378
+ {
379
+ "epoch": 0.6614785992217899,
380
+ "eval_loss": 0.012302271090447903,
381
+ "eval_runtime": 136.6757,
382
+ "eval_samples_per_second": 3.731,
383
+ "eval_steps_per_second": 0.937,
384
+ "step": 170
385
+ },
386
+ {
387
+ "epoch": 0.6809338521400778,
388
+ "grad_norm": 0.05575108528137207,
389
+ "learning_rate": 7.733333333333333e-05,
390
+ "loss": 0.0081,
391
+ "step": 175
392
+ },
393
+ {
394
+ "epoch": 0.7003891050583657,
395
+ "grad_norm": 0.0551481656730175,
396
+ "learning_rate": 7.955555555555556e-05,
397
+ "loss": 0.0081,
398
+ "step": 180
399
+ },
400
+ {
401
+ "epoch": 0.7003891050583657,
402
+ "eval_loss": 0.011542496271431446,
403
+ "eval_runtime": 136.4051,
404
+ "eval_samples_per_second": 3.739,
405
+ "eval_steps_per_second": 0.938,
406
+ "step": 180
407
+ },
408
+ {
409
+ "epoch": 0.7198443579766537,
410
+ "grad_norm": 0.04738597571849823,
411
+ "learning_rate": 8.177777777777778e-05,
412
+ "loss": 0.0076,
413
+ "step": 185
414
+ },
415
+ {
416
+ "epoch": 0.7392996108949417,
417
+ "grad_norm": 0.029748599976301193,
418
+ "learning_rate": 8.4e-05,
419
+ "loss": 0.0073,
420
+ "step": 190
421
+ },
422
+ {
423
+ "epoch": 0.7392996108949417,
424
+ "eval_loss": 0.012059729546308517,
425
+ "eval_runtime": 135.8659,
426
+ "eval_samples_per_second": 3.754,
427
+ "eval_steps_per_second": 0.942,
428
+ "step": 190
429
+ },
430
+ {
431
+ "epoch": 0.7587548638132295,
432
+ "grad_norm": 0.03995237499475479,
433
+ "learning_rate": 8.622222222222223e-05,
434
+ "loss": 0.0077,
435
+ "step": 195
436
+ },
437
+ {
438
+ "epoch": 0.7782101167315175,
439
+ "grad_norm": 0.02774854749441147,
440
+ "learning_rate": 8.844444444444445e-05,
441
+ "loss": 0.0075,
442
+ "step": 200
443
+ },
444
+ {
445
+ "epoch": 0.7782101167315175,
446
+ "eval_loss": 0.011773883365094662,
447
+ "eval_runtime": 136.231,
448
+ "eval_samples_per_second": 3.744,
449
+ "eval_steps_per_second": 0.94,
450
+ "step": 200
451
+ },
452
+ {
453
+ "epoch": 0.7976653696498055,
454
+ "grad_norm": 0.026570243760943413,
455
+ "learning_rate": 9.066666666666667e-05,
456
+ "loss": 0.0072,
457
+ "step": 205
458
+ },
459
+ {
460
+ "epoch": 0.8171206225680934,
461
+ "grad_norm": 0.047289494425058365,
462
+ "learning_rate": 9.288888888888888e-05,
463
+ "loss": 0.0074,
464
+ "step": 210
465
+ },
466
+ {
467
+ "epoch": 0.8171206225680934,
468
+ "eval_loss": 0.011569861322641373,
469
+ "eval_runtime": 136.4761,
470
+ "eval_samples_per_second": 3.737,
471
+ "eval_steps_per_second": 0.938,
472
+ "step": 210
473
+ },
474
+ {
475
+ "epoch": 0.8365758754863813,
476
+ "grad_norm": 0.036366503685712814,
477
+ "learning_rate": 9.511111111111112e-05,
478
+ "loss": 0.007,
479
+ "step": 215
480
+ },
481
+ {
482
+ "epoch": 0.8560311284046692,
483
+ "grad_norm": 0.07178617268800735,
484
+ "learning_rate": 9.733333333333333e-05,
485
+ "loss": 0.0073,
486
+ "step": 220
487
+ },
488
+ {
489
+ "epoch": 0.8560311284046692,
490
+ "eval_loss": 0.010845971293747425,
491
+ "eval_runtime": 136.7794,
492
+ "eval_samples_per_second": 3.729,
493
+ "eval_steps_per_second": 0.936,
494
+ "step": 220
495
+ }
496
+ ],
497
+ "logging_steps": 5,
498
+ "max_steps": 1799,
499
+ "num_input_tokens_seen": 0,
500
+ "num_train_epochs": 7,
501
+ "save_steps": 10,
502
+ "stateful_callbacks": {
503
+ "EarlyStoppingCallback": {
504
+ "args": {
505
+ "early_stopping_patience": 30,
506
+ "early_stopping_threshold": 0.001
507
+ },
508
+ "attributes": {
509
+ "early_stopping_patience_counter": 8
510
+ }
511
+ },
512
+ "TrainerControl": {
513
+ "args": {
514
+ "should_epoch_stop": false,
515
+ "should_evaluate": false,
516
+ "should_log": false,
517
+ "should_save": true,
518
+ "should_training_stop": false
519
+ },
520
+ "attributes": {}
521
+ }
522
+ },
523
+ "total_flos": 1.2998780541949824e+17,
524
+ "train_batch_size": 4,
525
+ "trial_name": null,
526
+ "trial_params": null
527
+ }
checkpoint-220/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfc74119a8e3587a81c422ff23316abe5d974ec5c66265926247177773d932ad
3
+ size 6417
checkpoint-230/README.md ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/medgemma-4b-it-unsloth-bnb-4bit
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:unsloth/medgemma-4b-it-unsloth-bnb-4bit
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ - unsloth
12
+ ---
13
+
14
+ # Model Card for Model ID
15
+
16
+ <!-- Provide a quick summary of what the model is/does. -->
17
+
18
+
19
+
20
+ ## Model Details
21
+
22
+ ### Model Description
23
+
24
+ <!-- Provide a longer summary of what this model is. -->
25
+
26
+
27
+
28
+ - **Developed by:** [More Information Needed]
29
+ - **Funded by [optional]:** [More Information Needed]
30
+ - **Shared by [optional]:** [More Information Needed]
31
+ - **Model type:** [More Information Needed]
32
+ - **Language(s) (NLP):** [More Information Needed]
33
+ - **License:** [More Information Needed]
34
+ - **Finetuned from model [optional]:** [More Information Needed]
35
+
36
+ ### Model Sources [optional]
37
+
38
+ <!-- Provide the basic links for the model. -->
39
+
40
+ - **Repository:** [More Information Needed]
41
+ - **Paper [optional]:** [More Information Needed]
42
+ - **Demo [optional]:** [More Information Needed]
43
+
44
+ ## Uses
45
+
46
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
47
+
48
+ ### Direct Use
49
+
50
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
51
+
52
+ [More Information Needed]
53
+
54
+ ### Downstream Use [optional]
55
+
56
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
57
+
58
+ [More Information Needed]
59
+
60
+ ### Out-of-Scope Use
61
+
62
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
63
+
64
+ [More Information Needed]
65
+
66
+ ## Bias, Risks, and Limitations
67
+
68
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
69
+
70
+ [More Information Needed]
71
+
72
+ ### Recommendations
73
+
74
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
75
+
76
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
77
+
78
+ ## How to Get Started with the Model
79
+
80
+ Use the code below to get started with the model.
81
+
82
+ [More Information Needed]
83
+
84
+ ## Training Details
85
+
86
+ ### Training Data
87
+
88
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
89
+
90
+ [More Information Needed]
91
+
92
+ ### Training Procedure
93
+
94
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
95
+
96
+ #### Preprocessing [optional]
97
+
98
+ [More Information Needed]
99
+
100
+
101
+ #### Training Hyperparameters
102
+
103
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
104
+
105
+ #### Speeds, Sizes, Times [optional]
106
+
107
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
108
+
109
+ [More Information Needed]
110
+
111
+ ## Evaluation
112
+
113
+ <!-- This section describes the evaluation protocols and provides the results. -->
114
+
115
+ ### Testing Data, Factors & Metrics
116
+
117
+ #### Testing Data
118
+
119
+ <!-- This should link to a Dataset Card if possible. -->
120
+
121
+ [More Information Needed]
122
+
123
+ #### Factors
124
+
125
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
126
+
127
+ [More Information Needed]
128
+
129
+ #### Metrics
130
+
131
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
132
+
133
+ [More Information Needed]
134
+
135
+ ### Results
136
+
137
+ [More Information Needed]
138
+
139
+ #### Summary
140
+
141
+
142
+
143
+ ## Model Examination [optional]
144
+
145
+ <!-- Relevant interpretability work for the model goes here -->
146
+
147
+ [More Information Needed]
148
+
149
+ ## Environmental Impact
150
+
151
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
152
+
153
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
154
+
155
+ - **Hardware Type:** [More Information Needed]
156
+ - **Hours used:** [More Information Needed]
157
+ - **Cloud Provider:** [More Information Needed]
158
+ - **Compute Region:** [More Information Needed]
159
+ - **Carbon Emitted:** [More Information Needed]
160
+
161
+ ## Technical Specifications [optional]
162
+
163
+ ### Model Architecture and Objective
164
+
165
+ [More Information Needed]
166
+
167
+ ### Compute Infrastructure
168
+
169
+ [More Information Needed]
170
+
171
+ #### Hardware
172
+
173
+ [More Information Needed]
174
+
175
+ #### Software
176
+
177
+ [More Information Needed]
178
+
179
+ ## Citation [optional]
180
+
181
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
182
+
183
+ **BibTeX:**
184
+
185
+ [More Information Needed]
186
+
187
+ **APA:**
188
+
189
+ [More Information Needed]
190
+
191
+ ## Glossary [optional]
192
+
193
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
194
+
195
+ [More Information Needed]
196
+
197
+ ## More Information [optional]
198
+
199
+ [More Information Needed]
200
+
201
+ ## Model Card Authors [optional]
202
+
203
+ [More Information Needed]
204
+
205
+ ## Model Card Contact
206
+
207
+ [More Information Needed]
208
+ ### Framework versions
209
+
210
+ - PEFT 0.18.0
checkpoint-230/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": {
6
+ "base_model_class": "Gemma3ForConditionalGeneration",
7
+ "parent_library": "transformers.models.gemma3.modeling_gemma3",
8
+ "unsloth_fixed": true
9
+ },
10
+ "base_model_name_or_path": "unsloth/medgemma-4b-it-unsloth-bnb-4bit",
11
+ "bias": "none",
12
+ "corda_config": null,
13
+ "ensure_weight_tying": false,
14
+ "eva_config": null,
15
+ "exclude_modules": null,
16
+ "fan_in_fan_out": false,
17
+ "inference_mode": true,
18
+ "init_lora_weights": true,
19
+ "layer_replication": null,
20
+ "layers_pattern": null,
21
+ "layers_to_transform": null,
22
+ "loftq_config": {},
23
+ "lora_alpha": 32,
24
+ "lora_bias": false,
25
+ "lora_dropout": 0.05,
26
+ "megatron_config": null,
27
+ "megatron_core": "megatron.core",
28
+ "modules_to_save": null,
29
+ "peft_type": "LORA",
30
+ "peft_version": "0.18.0",
31
+ "qalora_group_size": 16,
32
+ "r": 32,
33
+ "rank_pattern": {},
34
+ "revision": null,
35
+ "target_modules": "(?:.*?(?:vision|image|visual|patch|language|text).*?(?:self_attn|attention|attn|mlp|feed_forward|ffn|dense).*?(?:k_proj|v_proj|q_proj|out_proj|fc1|fc2|o_proj|gate_proj|up_proj|down_proj).*?)|(?:\\bmodel\\.layers\\.[\\d]{1,}\\.(?:self_attn|attention|attn|mlp|feed_forward|ffn|dense)\\.(?:(?:k_proj|v_proj|q_proj|out_proj|fc1|fc2|o_proj|gate_proj|up_proj|down_proj)))",
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": true
42
+ }
checkpoint-230/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e828b2ee1f0e5e2cf604e123a55d06e0291c14c5ee70d0b0081aad9b22badae
3
+ size 308107928
checkpoint-230/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
checkpoint-230/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
checkpoint-230/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d1a1778c960f9b7620ff897419cc14a8bc8817077bb53c88a5b3fc801d6d803
3
+ size 157331237
checkpoint-230/preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_pan_and_scan": null,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_mean": [
8
+ 0.5,
9
+ 0.5,
10
+ 0.5
11
+ ],
12
+ "image_processor_type": "Gemma3ImageProcessor",
13
+ "image_seq_length": 256,
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "pan_and_scan_max_num_crops": null,
20
+ "pan_and_scan_min_crop_size": null,
21
+ "pan_and_scan_min_ratio_to_activate": null,
22
+ "processor_class": "Gemma3Processor",
23
+ "resample": 2,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "height": 896,
27
+ "width": 896
28
+ }
29
+ }
checkpoint-230/processor_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "image_seq_length": 256,
3
+ "processor_class": "Gemma3Processor"
4
+ }
checkpoint-230/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc08282850ca3ef384c91a39dc5e90350a02073687b7716c9c62dd8e3b07669f
3
+ size 14645
checkpoint-230/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a77a2f6f325b52e5dc17fbd21b1e476f64e9338f4b0973c0706b0eb0105d5c51
3
+ size 1465
checkpoint-230/special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<end_of_turn>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
checkpoint-230/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7666402c0617d170e6b0a985b3130c3fb0795393aa0970600994a5d9aae12351
3
+ size 33384822
checkpoint-230/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
checkpoint-230/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-230/trainer_state.json ADDED
@@ -0,0 +1,549 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 220,
3
+ "best_metric": 0.010845971293747425,
4
+ "best_model_checkpoint": "/teamspace/studios/this_studio/DATN/output/medgemma_finetuned/checkpoint-220",
5
+ "epoch": 0.8949416342412452,
6
+ "eval_steps": 10,
7
+ "global_step": 230,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.019455252918287938,
14
+ "grad_norm": 3.7302613258361816,
15
+ "learning_rate": 1.777777777777778e-06,
16
+ "loss": 0.8239,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.038910505836575876,
21
+ "grad_norm": 3.058382034301758,
22
+ "learning_rate": 4e-06,
23
+ "loss": 0.7964,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.038910505836575876,
28
+ "eval_loss": 0.7571244239807129,
29
+ "eval_runtime": 152.7626,
30
+ "eval_samples_per_second": 3.339,
31
+ "eval_steps_per_second": 0.838,
32
+ "step": 10
33
+ },
34
+ {
35
+ "epoch": 0.058365758754863814,
36
+ "grad_norm": 1.8304738998413086,
37
+ "learning_rate": 6.222222222222222e-06,
38
+ "loss": 0.6943,
39
+ "step": 15
40
+ },
41
+ {
42
+ "epoch": 0.07782101167315175,
43
+ "grad_norm": 1.3162634372711182,
44
+ "learning_rate": 8.444444444444446e-06,
45
+ "loss": 0.5569,
46
+ "step": 20
47
+ },
48
+ {
49
+ "epoch": 0.07782101167315175,
50
+ "eval_loss": 0.5357815623283386,
51
+ "eval_runtime": 136.4123,
52
+ "eval_samples_per_second": 3.739,
53
+ "eval_steps_per_second": 0.938,
54
+ "step": 20
55
+ },
56
+ {
57
+ "epoch": 0.09727626459143969,
58
+ "grad_norm": 1.1865513324737549,
59
+ "learning_rate": 1.0666666666666667e-05,
60
+ "loss": 0.4222,
61
+ "step": 25
62
+ },
63
+ {
64
+ "epoch": 0.11673151750972763,
65
+ "grad_norm": 1.2043355703353882,
66
+ "learning_rate": 1.2888888888888889e-05,
67
+ "loss": 0.2831,
68
+ "step": 30
69
+ },
70
+ {
71
+ "epoch": 0.11673151750972763,
72
+ "eval_loss": 0.28264933824539185,
73
+ "eval_runtime": 136.5183,
74
+ "eval_samples_per_second": 3.736,
75
+ "eval_steps_per_second": 0.938,
76
+ "step": 30
77
+ },
78
+ {
79
+ "epoch": 0.13618677042801555,
80
+ "grad_norm": 0.9503483772277832,
81
+ "learning_rate": 1.511111111111111e-05,
82
+ "loss": 0.1405,
83
+ "step": 35
84
+ },
85
+ {
86
+ "epoch": 0.1556420233463035,
87
+ "grad_norm": 0.34179583191871643,
88
+ "learning_rate": 1.7333333333333332e-05,
89
+ "loss": 0.0515,
90
+ "step": 40
91
+ },
92
+ {
93
+ "epoch": 0.1556420233463035,
94
+ "eval_loss": 0.05298379808664322,
95
+ "eval_runtime": 136.391,
96
+ "eval_samples_per_second": 3.739,
97
+ "eval_steps_per_second": 0.938,
98
+ "step": 40
99
+ },
100
+ {
101
+ "epoch": 0.17509727626459143,
102
+ "grad_norm": 0.24818392097949982,
103
+ "learning_rate": 1.9555555555555557e-05,
104
+ "loss": 0.0302,
105
+ "step": 45
106
+ },
107
+ {
108
+ "epoch": 0.19455252918287938,
109
+ "grad_norm": 0.19131316244602203,
110
+ "learning_rate": 2.177777777777778e-05,
111
+ "loss": 0.0221,
112
+ "step": 50
113
+ },
114
+ {
115
+ "epoch": 0.19455252918287938,
116
+ "eval_loss": 0.024967821314930916,
117
+ "eval_runtime": 136.5245,
118
+ "eval_samples_per_second": 3.736,
119
+ "eval_steps_per_second": 0.938,
120
+ "step": 50
121
+ },
122
+ {
123
+ "epoch": 0.2140077821011673,
124
+ "grad_norm": 2.454702377319336,
125
+ "learning_rate": 2.4e-05,
126
+ "loss": 0.0169,
127
+ "step": 55
128
+ },
129
+ {
130
+ "epoch": 0.23346303501945526,
131
+ "grad_norm": 0.12364993244409561,
132
+ "learning_rate": 2.6222222222222226e-05,
133
+ "loss": 0.0164,
134
+ "step": 60
135
+ },
136
+ {
137
+ "epoch": 0.23346303501945526,
138
+ "eval_loss": 0.021661706268787384,
139
+ "eval_runtime": 136.8436,
140
+ "eval_samples_per_second": 3.727,
141
+ "eval_steps_per_second": 0.935,
142
+ "step": 60
143
+ },
144
+ {
145
+ "epoch": 0.2529182879377432,
146
+ "grad_norm": 0.14085163176059723,
147
+ "learning_rate": 2.8444444444444447e-05,
148
+ "loss": 0.0131,
149
+ "step": 65
150
+ },
151
+ {
152
+ "epoch": 0.2723735408560311,
153
+ "grad_norm": 0.15322668850421906,
154
+ "learning_rate": 3.066666666666666e-05,
155
+ "loss": 0.0089,
156
+ "step": 70
157
+ },
158
+ {
159
+ "epoch": 0.2723735408560311,
160
+ "eval_loss": 0.016315119341015816,
161
+ "eval_runtime": 136.2077,
162
+ "eval_samples_per_second": 3.744,
163
+ "eval_steps_per_second": 0.94,
164
+ "step": 70
165
+ },
166
+ {
167
+ "epoch": 0.2918287937743191,
168
+ "grad_norm": 0.08343034237623215,
169
+ "learning_rate": 3.288888888888889e-05,
170
+ "loss": 0.0076,
171
+ "step": 75
172
+ },
173
+ {
174
+ "epoch": 0.311284046692607,
175
+ "grad_norm": 0.11078440397977829,
176
+ "learning_rate": 3.511111111111111e-05,
177
+ "loss": 0.008,
178
+ "step": 80
179
+ },
180
+ {
181
+ "epoch": 0.311284046692607,
182
+ "eval_loss": 0.015718888491392136,
183
+ "eval_runtime": 136.7712,
184
+ "eval_samples_per_second": 3.729,
185
+ "eval_steps_per_second": 0.936,
186
+ "step": 80
187
+ },
188
+ {
189
+ "epoch": 0.33073929961089493,
190
+ "grad_norm": 0.08361168950796127,
191
+ "learning_rate": 3.733333333333334e-05,
192
+ "loss": 0.008,
193
+ "step": 85
194
+ },
195
+ {
196
+ "epoch": 0.35019455252918286,
197
+ "grad_norm": 0.06539439409971237,
198
+ "learning_rate": 3.9555555555555556e-05,
199
+ "loss": 0.0083,
200
+ "step": 90
201
+ },
202
+ {
203
+ "epoch": 0.35019455252918286,
204
+ "eval_loss": 0.015910081565380096,
205
+ "eval_runtime": 136.5187,
206
+ "eval_samples_per_second": 3.736,
207
+ "eval_steps_per_second": 0.938,
208
+ "step": 90
209
+ },
210
+ {
211
+ "epoch": 0.36964980544747084,
212
+ "grad_norm": 0.14973388612270355,
213
+ "learning_rate": 4.177777777777778e-05,
214
+ "loss": 0.0085,
215
+ "step": 95
216
+ },
217
+ {
218
+ "epoch": 0.38910505836575876,
219
+ "grad_norm": 0.08519362658262253,
220
+ "learning_rate": 4.4e-05,
221
+ "loss": 0.0077,
222
+ "step": 100
223
+ },
224
+ {
225
+ "epoch": 0.38910505836575876,
226
+ "eval_loss": 0.01615685597062111,
227
+ "eval_runtime": 136.4452,
228
+ "eval_samples_per_second": 3.738,
229
+ "eval_steps_per_second": 0.938,
230
+ "step": 100
231
+ },
232
+ {
233
+ "epoch": 0.4085603112840467,
234
+ "grad_norm": 0.05565109848976135,
235
+ "learning_rate": 4.6222222222222224e-05,
236
+ "loss": 0.0085,
237
+ "step": 105
238
+ },
239
+ {
240
+ "epoch": 0.4280155642023346,
241
+ "grad_norm": 0.07286959886550903,
242
+ "learning_rate": 4.844444444444445e-05,
243
+ "loss": 0.0082,
244
+ "step": 110
245
+ },
246
+ {
247
+ "epoch": 0.4280155642023346,
248
+ "eval_loss": 0.015698084607720375,
249
+ "eval_runtime": 136.3832,
250
+ "eval_samples_per_second": 3.739,
251
+ "eval_steps_per_second": 0.939,
252
+ "step": 110
253
+ },
254
+ {
255
+ "epoch": 0.4474708171206226,
256
+ "grad_norm": 0.13329896330833435,
257
+ "learning_rate": 5.066666666666667e-05,
258
+ "loss": 0.0085,
259
+ "step": 115
260
+ },
261
+ {
262
+ "epoch": 0.4669260700389105,
263
+ "grad_norm": 0.04628467932343483,
264
+ "learning_rate": 5.288888888888889e-05,
265
+ "loss": 0.0075,
266
+ "step": 120
267
+ },
268
+ {
269
+ "epoch": 0.4669260700389105,
270
+ "eval_loss": 0.015623296611011028,
271
+ "eval_runtime": 136.0422,
272
+ "eval_samples_per_second": 3.749,
273
+ "eval_steps_per_second": 0.941,
274
+ "step": 120
275
+ },
276
+ {
277
+ "epoch": 0.48638132295719844,
278
+ "grad_norm": 0.058520544320344925,
279
+ "learning_rate": 5.511111111111111e-05,
280
+ "loss": 0.0079,
281
+ "step": 125
282
+ },
283
+ {
284
+ "epoch": 0.5058365758754864,
285
+ "grad_norm": 0.06411632895469666,
286
+ "learning_rate": 5.7333333333333336e-05,
287
+ "loss": 0.0076,
288
+ "step": 130
289
+ },
290
+ {
291
+ "epoch": 0.5058365758754864,
292
+ "eval_loss": 0.014402530156075954,
293
+ "eval_runtime": 135.5623,
294
+ "eval_samples_per_second": 3.762,
295
+ "eval_steps_per_second": 0.944,
296
+ "step": 130
297
+ },
298
+ {
299
+ "epoch": 0.5252918287937743,
300
+ "grad_norm": 0.044081032276153564,
301
+ "learning_rate": 5.9555555555555554e-05,
302
+ "loss": 0.0072,
303
+ "step": 135
304
+ },
305
+ {
306
+ "epoch": 0.5447470817120622,
307
+ "grad_norm": 0.04867592826485634,
308
+ "learning_rate": 6.177777777777779e-05,
309
+ "loss": 0.0077,
310
+ "step": 140
311
+ },
312
+ {
313
+ "epoch": 0.5447470817120622,
314
+ "eval_loss": 0.012970623560249805,
315
+ "eval_runtime": 137.0137,
316
+ "eval_samples_per_second": 3.722,
317
+ "eval_steps_per_second": 0.934,
318
+ "step": 140
319
+ },
320
+ {
321
+ "epoch": 0.5642023346303502,
322
+ "grad_norm": 0.044633813202381134,
323
+ "learning_rate": 6.4e-05,
324
+ "loss": 0.0077,
325
+ "step": 145
326
+ },
327
+ {
328
+ "epoch": 0.5836575875486382,
329
+ "grad_norm": 0.052950419485569,
330
+ "learning_rate": 6.622222222222222e-05,
331
+ "loss": 0.008,
332
+ "step": 150
333
+ },
334
+ {
335
+ "epoch": 0.5836575875486382,
336
+ "eval_loss": 0.012441293336451054,
337
+ "eval_runtime": 136.133,
338
+ "eval_samples_per_second": 3.746,
339
+ "eval_steps_per_second": 0.94,
340
+ "step": 150
341
+ },
342
+ {
343
+ "epoch": 0.603112840466926,
344
+ "grad_norm": 0.039904553443193436,
345
+ "learning_rate": 6.844444444444445e-05,
346
+ "loss": 0.0078,
347
+ "step": 155
348
+ },
349
+ {
350
+ "epoch": 0.622568093385214,
351
+ "grad_norm": 0.05680263414978981,
352
+ "learning_rate": 7.066666666666667e-05,
353
+ "loss": 0.0074,
354
+ "step": 160
355
+ },
356
+ {
357
+ "epoch": 0.622568093385214,
358
+ "eval_loss": 0.01192025002092123,
359
+ "eval_runtime": 136.5582,
360
+ "eval_samples_per_second": 3.735,
361
+ "eval_steps_per_second": 0.937,
362
+ "step": 160
363
+ },
364
+ {
365
+ "epoch": 0.642023346303502,
366
+ "grad_norm": 0.05537933111190796,
367
+ "learning_rate": 7.288888888888888e-05,
368
+ "loss": 0.0076,
369
+ "step": 165
370
+ },
371
+ {
372
+ "epoch": 0.6614785992217899,
373
+ "grad_norm": 0.04935755953192711,
374
+ "learning_rate": 7.511111111111111e-05,
375
+ "loss": 0.0077,
376
+ "step": 170
377
+ },
378
+ {
379
+ "epoch": 0.6614785992217899,
380
+ "eval_loss": 0.012302271090447903,
381
+ "eval_runtime": 136.6757,
382
+ "eval_samples_per_second": 3.731,
383
+ "eval_steps_per_second": 0.937,
384
+ "step": 170
385
+ },
386
+ {
387
+ "epoch": 0.6809338521400778,
388
+ "grad_norm": 0.05575108528137207,
389
+ "learning_rate": 7.733333333333333e-05,
390
+ "loss": 0.0081,
391
+ "step": 175
392
+ },
393
+ {
394
+ "epoch": 0.7003891050583657,
395
+ "grad_norm": 0.0551481656730175,
396
+ "learning_rate": 7.955555555555556e-05,
397
+ "loss": 0.0081,
398
+ "step": 180
399
+ },
400
+ {
401
+ "epoch": 0.7003891050583657,
402
+ "eval_loss": 0.011542496271431446,
403
+ "eval_runtime": 136.4051,
404
+ "eval_samples_per_second": 3.739,
405
+ "eval_steps_per_second": 0.938,
406
+ "step": 180
407
+ },
408
+ {
409
+ "epoch": 0.7198443579766537,
410
+ "grad_norm": 0.04738597571849823,
411
+ "learning_rate": 8.177777777777778e-05,
412
+ "loss": 0.0076,
413
+ "step": 185
414
+ },
415
+ {
416
+ "epoch": 0.7392996108949417,
417
+ "grad_norm": 0.029748599976301193,
418
+ "learning_rate": 8.4e-05,
419
+ "loss": 0.0073,
420
+ "step": 190
421
+ },
422
+ {
423
+ "epoch": 0.7392996108949417,
424
+ "eval_loss": 0.012059729546308517,
425
+ "eval_runtime": 135.8659,
426
+ "eval_samples_per_second": 3.754,
427
+ "eval_steps_per_second": 0.942,
428
+ "step": 190
429
+ },
430
+ {
431
+ "epoch": 0.7587548638132295,
432
+ "grad_norm": 0.03995237499475479,
433
+ "learning_rate": 8.622222222222223e-05,
434
+ "loss": 0.0077,
435
+ "step": 195
436
+ },
437
+ {
438
+ "epoch": 0.7782101167315175,
439
+ "grad_norm": 0.02774854749441147,
440
+ "learning_rate": 8.844444444444445e-05,
441
+ "loss": 0.0075,
442
+ "step": 200
443
+ },
444
+ {
445
+ "epoch": 0.7782101167315175,
446
+ "eval_loss": 0.011773883365094662,
447
+ "eval_runtime": 136.231,
448
+ "eval_samples_per_second": 3.744,
449
+ "eval_steps_per_second": 0.94,
450
+ "step": 200
451
+ },
452
+ {
453
+ "epoch": 0.7976653696498055,
454
+ "grad_norm": 0.026570243760943413,
455
+ "learning_rate": 9.066666666666667e-05,
456
+ "loss": 0.0072,
457
+ "step": 205
458
+ },
459
+ {
460
+ "epoch": 0.8171206225680934,
461
+ "grad_norm": 0.047289494425058365,
462
+ "learning_rate": 9.288888888888888e-05,
463
+ "loss": 0.0074,
464
+ "step": 210
465
+ },
466
+ {
467
+ "epoch": 0.8171206225680934,
468
+ "eval_loss": 0.011569861322641373,
469
+ "eval_runtime": 136.4761,
470
+ "eval_samples_per_second": 3.737,
471
+ "eval_steps_per_second": 0.938,
472
+ "step": 210
473
+ },
474
+ {
475
+ "epoch": 0.8365758754863813,
476
+ "grad_norm": 0.036366503685712814,
477
+ "learning_rate": 9.511111111111112e-05,
478
+ "loss": 0.007,
479
+ "step": 215
480
+ },
481
+ {
482
+ "epoch": 0.8560311284046692,
483
+ "grad_norm": 0.07178617268800735,
484
+ "learning_rate": 9.733333333333333e-05,
485
+ "loss": 0.0073,
486
+ "step": 220
487
+ },
488
+ {
489
+ "epoch": 0.8560311284046692,
490
+ "eval_loss": 0.010845971293747425,
491
+ "eval_runtime": 136.7794,
492
+ "eval_samples_per_second": 3.729,
493
+ "eval_steps_per_second": 0.936,
494
+ "step": 220
495
+ },
496
+ {
497
+ "epoch": 0.8754863813229572,
498
+ "grad_norm": 0.042044900357723236,
499
+ "learning_rate": 9.955555555555556e-05,
500
+ "loss": 0.0069,
501
+ "step": 225
502
+ },
503
+ {
504
+ "epoch": 0.8949416342412452,
505
+ "grad_norm": 0.18266713619232178,
506
+ "learning_rate": 0.00010177777777777777,
507
+ "loss": 0.007,
508
+ "step": 230
509
+ },
510
+ {
511
+ "epoch": 0.8949416342412452,
512
+ "eval_loss": 0.012411113828420639,
513
+ "eval_runtime": 136.7046,
514
+ "eval_samples_per_second": 3.731,
515
+ "eval_steps_per_second": 0.936,
516
+ "step": 230
517
+ }
518
+ ],
519
+ "logging_steps": 5,
520
+ "max_steps": 1799,
521
+ "num_input_tokens_seen": 0,
522
+ "num_train_epochs": 7,
523
+ "save_steps": 10,
524
+ "stateful_callbacks": {
525
+ "EarlyStoppingCallback": {
526
+ "args": {
527
+ "early_stopping_patience": 30,
528
+ "early_stopping_threshold": 0.001
529
+ },
530
+ "attributes": {
531
+ "early_stopping_patience_counter": 9
532
+ }
533
+ },
534
+ "TrainerControl": {
535
+ "args": {
536
+ "should_epoch_stop": false,
537
+ "should_evaluate": false,
538
+ "should_log": false,
539
+ "should_save": true,
540
+ "should_training_stop": false
541
+ },
542
+ "attributes": {}
543
+ }
544
+ },
545
+ "total_flos": 1.3589467855146662e+17,
546
+ "train_batch_size": 4,
547
+ "trial_name": null,
548
+ "trial_params": null
549
+ }
checkpoint-230/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfc74119a8e3587a81c422ff23316abe5d974ec5c66265926247177773d932ad
3
+ size 6417
checkpoint-240/README.md ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/medgemma-4b-it-unsloth-bnb-4bit
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:unsloth/medgemma-4b-it-unsloth-bnb-4bit
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ - unsloth
12
+ ---
13
+
14
+ # Model Card for Model ID
15
+
16
+ <!-- Provide a quick summary of what the model is/does. -->
17
+
18
+
19
+
20
+ ## Model Details
21
+
22
+ ### Model Description
23
+
24
+ <!-- Provide a longer summary of what this model is. -->
25
+
26
+
27
+
28
+ - **Developed by:** [More Information Needed]
29
+ - **Funded by [optional]:** [More Information Needed]
30
+ - **Shared by [optional]:** [More Information Needed]
31
+ - **Model type:** [More Information Needed]
32
+ - **Language(s) (NLP):** [More Information Needed]
33
+ - **License:** [More Information Needed]
34
+ - **Finetuned from model [optional]:** [More Information Needed]
35
+
36
+ ### Model Sources [optional]
37
+
38
+ <!-- Provide the basic links for the model. -->
39
+
40
+ - **Repository:** [More Information Needed]
41
+ - **Paper [optional]:** [More Information Needed]
42
+ - **Demo [optional]:** [More Information Needed]
43
+
44
+ ## Uses
45
+
46
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
47
+
48
+ ### Direct Use
49
+
50
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
51
+
52
+ [More Information Needed]
53
+
54
+ ### Downstream Use [optional]
55
+
56
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
57
+
58
+ [More Information Needed]
59
+
60
+ ### Out-of-Scope Use
61
+
62
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
63
+
64
+ [More Information Needed]
65
+
66
+ ## Bias, Risks, and Limitations
67
+
68
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
69
+
70
+ [More Information Needed]
71
+
72
+ ### Recommendations
73
+
74
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
75
+
76
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
77
+
78
+ ## How to Get Started with the Model
79
+
80
+ Use the code below to get started with the model.
81
+
82
+ [More Information Needed]
83
+
84
+ ## Training Details
85
+
86
+ ### Training Data
87
+
88
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
89
+
90
+ [More Information Needed]
91
+
92
+ ### Training Procedure
93
+
94
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
95
+
96
+ #### Preprocessing [optional]
97
+
98
+ [More Information Needed]
99
+
100
+
101
+ #### Training Hyperparameters
102
+
103
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
104
+
105
+ #### Speeds, Sizes, Times [optional]
106
+
107
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
108
+
109
+ [More Information Needed]
110
+
111
+ ## Evaluation
112
+
113
+ <!-- This section describes the evaluation protocols and provides the results. -->
114
+
115
+ ### Testing Data, Factors & Metrics
116
+
117
+ #### Testing Data
118
+
119
+ <!-- This should link to a Dataset Card if possible. -->
120
+
121
+ [More Information Needed]
122
+
123
+ #### Factors
124
+
125
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
126
+
127
+ [More Information Needed]
128
+
129
+ #### Metrics
130
+
131
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
132
+
133
+ [More Information Needed]
134
+
135
+ ### Results
136
+
137
+ [More Information Needed]
138
+
139
+ #### Summary
140
+
141
+
142
+
143
+ ## Model Examination [optional]
144
+
145
+ <!-- Relevant interpretability work for the model goes here -->
146
+
147
+ [More Information Needed]
148
+
149
+ ## Environmental Impact
150
+
151
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
152
+
153
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
154
+
155
+ - **Hardware Type:** [More Information Needed]
156
+ - **Hours used:** [More Information Needed]
157
+ - **Cloud Provider:** [More Information Needed]
158
+ - **Compute Region:** [More Information Needed]
159
+ - **Carbon Emitted:** [More Information Needed]
160
+
161
+ ## Technical Specifications [optional]
162
+
163
+ ### Model Architecture and Objective
164
+
165
+ [More Information Needed]
166
+
167
+ ### Compute Infrastructure
168
+
169
+ [More Information Needed]
170
+
171
+ #### Hardware
172
+
173
+ [More Information Needed]
174
+
175
+ #### Software
176
+
177
+ [More Information Needed]
178
+
179
+ ## Citation [optional]
180
+
181
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
182
+
183
+ **BibTeX:**
184
+
185
+ [More Information Needed]
186
+
187
+ **APA:**
188
+
189
+ [More Information Needed]
190
+
191
+ ## Glossary [optional]
192
+
193
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
194
+
195
+ [More Information Needed]
196
+
197
+ ## More Information [optional]
198
+
199
+ [More Information Needed]
200
+
201
+ ## Model Card Authors [optional]
202
+
203
+ [More Information Needed]
204
+
205
+ ## Model Card Contact
206
+
207
+ [More Information Needed]
208
+ ### Framework versions
209
+
210
+ - PEFT 0.18.0
checkpoint-240/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": {
6
+ "base_model_class": "Gemma3ForConditionalGeneration",
7
+ "parent_library": "transformers.models.gemma3.modeling_gemma3",
8
+ "unsloth_fixed": true
9
+ },
10
+ "base_model_name_or_path": "unsloth/medgemma-4b-it-unsloth-bnb-4bit",
11
+ "bias": "none",
12
+ "corda_config": null,
13
+ "ensure_weight_tying": false,
14
+ "eva_config": null,
15
+ "exclude_modules": null,
16
+ "fan_in_fan_out": false,
17
+ "inference_mode": true,
18
+ "init_lora_weights": true,
19
+ "layer_replication": null,
20
+ "layers_pattern": null,
21
+ "layers_to_transform": null,
22
+ "loftq_config": {},
23
+ "lora_alpha": 32,
24
+ "lora_bias": false,
25
+ "lora_dropout": 0.05,
26
+ "megatron_config": null,
27
+ "megatron_core": "megatron.core",
28
+ "modules_to_save": null,
29
+ "peft_type": "LORA",
30
+ "peft_version": "0.18.0",
31
+ "qalora_group_size": 16,
32
+ "r": 32,
33
+ "rank_pattern": {},
34
+ "revision": null,
35
+ "target_modules": "(?:.*?(?:vision|image|visual|patch|language|text).*?(?:self_attn|attention|attn|mlp|feed_forward|ffn|dense).*?(?:k_proj|v_proj|q_proj|out_proj|fc1|fc2|o_proj|gate_proj|up_proj|down_proj).*?)|(?:\\bmodel\\.layers\\.[\\d]{1,}\\.(?:self_attn|attention|attn|mlp|feed_forward|ffn|dense)\\.(?:(?:k_proj|v_proj|q_proj|out_proj|fc1|fc2|o_proj|gate_proj|up_proj|down_proj)))",
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": true
42
+ }
checkpoint-240/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d56b1cc292dcd419f5b1f27215b07e8d6e36af10bc8c7424a4061e759ccde21
3
+ size 308107928
checkpoint-240/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
checkpoint-240/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
checkpoint-240/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4adddf2c383ea3669208865c55123631f55746bea6fad3ec70259c416d47409e
3
+ size 157331237
checkpoint-240/preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_pan_and_scan": null,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_mean": [
8
+ 0.5,
9
+ 0.5,
10
+ 0.5
11
+ ],
12
+ "image_processor_type": "Gemma3ImageProcessor",
13
+ "image_seq_length": 256,
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "pan_and_scan_max_num_crops": null,
20
+ "pan_and_scan_min_crop_size": null,
21
+ "pan_and_scan_min_ratio_to_activate": null,
22
+ "processor_class": "Gemma3Processor",
23
+ "resample": 2,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "height": 896,
27
+ "width": 896
28
+ }
29
+ }
checkpoint-240/processor_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "image_seq_length": 256,
3
+ "processor_class": "Gemma3Processor"
4
+ }
checkpoint-240/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d669fec2ccf5ca3ce9213f9c9a45b5aeec6243c9519de15665d4a5578f882b14
3
+ size 14645
checkpoint-240/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e1e812919a6de0bf80382e9b084548427a8615758cc093ed3ff93b68383168b
3
+ size 1465
checkpoint-240/special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<end_of_turn>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
checkpoint-240/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7666402c0617d170e6b0a985b3130c3fb0795393aa0970600994a5d9aae12351
3
+ size 33384822
checkpoint-240/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
checkpoint-240/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-240/trainer_state.json ADDED
@@ -0,0 +1,571 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 220,
3
+ "best_metric": 0.010845971293747425,
4
+ "best_model_checkpoint": "/teamspace/studios/this_studio/DATN/output/medgemma_finetuned/checkpoint-220",
5
+ "epoch": 0.933852140077821,
6
+ "eval_steps": 10,
7
+ "global_step": 240,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.019455252918287938,
14
+ "grad_norm": 3.7302613258361816,
15
+ "learning_rate": 1.777777777777778e-06,
16
+ "loss": 0.8239,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.038910505836575876,
21
+ "grad_norm": 3.058382034301758,
22
+ "learning_rate": 4e-06,
23
+ "loss": 0.7964,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.038910505836575876,
28
+ "eval_loss": 0.7571244239807129,
29
+ "eval_runtime": 152.7626,
30
+ "eval_samples_per_second": 3.339,
31
+ "eval_steps_per_second": 0.838,
32
+ "step": 10
33
+ },
34
+ {
35
+ "epoch": 0.058365758754863814,
36
+ "grad_norm": 1.8304738998413086,
37
+ "learning_rate": 6.222222222222222e-06,
38
+ "loss": 0.6943,
39
+ "step": 15
40
+ },
41
+ {
42
+ "epoch": 0.07782101167315175,
43
+ "grad_norm": 1.3162634372711182,
44
+ "learning_rate": 8.444444444444446e-06,
45
+ "loss": 0.5569,
46
+ "step": 20
47
+ },
48
+ {
49
+ "epoch": 0.07782101167315175,
50
+ "eval_loss": 0.5357815623283386,
51
+ "eval_runtime": 136.4123,
52
+ "eval_samples_per_second": 3.739,
53
+ "eval_steps_per_second": 0.938,
54
+ "step": 20
55
+ },
56
+ {
57
+ "epoch": 0.09727626459143969,
58
+ "grad_norm": 1.1865513324737549,
59
+ "learning_rate": 1.0666666666666667e-05,
60
+ "loss": 0.4222,
61
+ "step": 25
62
+ },
63
+ {
64
+ "epoch": 0.11673151750972763,
65
+ "grad_norm": 1.2043355703353882,
66
+ "learning_rate": 1.2888888888888889e-05,
67
+ "loss": 0.2831,
68
+ "step": 30
69
+ },
70
+ {
71
+ "epoch": 0.11673151750972763,
72
+ "eval_loss": 0.28264933824539185,
73
+ "eval_runtime": 136.5183,
74
+ "eval_samples_per_second": 3.736,
75
+ "eval_steps_per_second": 0.938,
76
+ "step": 30
77
+ },
78
+ {
79
+ "epoch": 0.13618677042801555,
80
+ "grad_norm": 0.9503483772277832,
81
+ "learning_rate": 1.511111111111111e-05,
82
+ "loss": 0.1405,
83
+ "step": 35
84
+ },
85
+ {
86
+ "epoch": 0.1556420233463035,
87
+ "grad_norm": 0.34179583191871643,
88
+ "learning_rate": 1.7333333333333332e-05,
89
+ "loss": 0.0515,
90
+ "step": 40
91
+ },
92
+ {
93
+ "epoch": 0.1556420233463035,
94
+ "eval_loss": 0.05298379808664322,
95
+ "eval_runtime": 136.391,
96
+ "eval_samples_per_second": 3.739,
97
+ "eval_steps_per_second": 0.938,
98
+ "step": 40
99
+ },
100
+ {
101
+ "epoch": 0.17509727626459143,
102
+ "grad_norm": 0.24818392097949982,
103
+ "learning_rate": 1.9555555555555557e-05,
104
+ "loss": 0.0302,
105
+ "step": 45
106
+ },
107
+ {
108
+ "epoch": 0.19455252918287938,
109
+ "grad_norm": 0.19131316244602203,
110
+ "learning_rate": 2.177777777777778e-05,
111
+ "loss": 0.0221,
112
+ "step": 50
113
+ },
114
+ {
115
+ "epoch": 0.19455252918287938,
116
+ "eval_loss": 0.024967821314930916,
117
+ "eval_runtime": 136.5245,
118
+ "eval_samples_per_second": 3.736,
119
+ "eval_steps_per_second": 0.938,
120
+ "step": 50
121
+ },
122
+ {
123
+ "epoch": 0.2140077821011673,
124
+ "grad_norm": 2.454702377319336,
125
+ "learning_rate": 2.4e-05,
126
+ "loss": 0.0169,
127
+ "step": 55
128
+ },
129
+ {
130
+ "epoch": 0.23346303501945526,
131
+ "grad_norm": 0.12364993244409561,
132
+ "learning_rate": 2.6222222222222226e-05,
133
+ "loss": 0.0164,
134
+ "step": 60
135
+ },
136
+ {
137
+ "epoch": 0.23346303501945526,
138
+ "eval_loss": 0.021661706268787384,
139
+ "eval_runtime": 136.8436,
140
+ "eval_samples_per_second": 3.727,
141
+ "eval_steps_per_second": 0.935,
142
+ "step": 60
143
+ },
144
+ {
145
+ "epoch": 0.2529182879377432,
146
+ "grad_norm": 0.14085163176059723,
147
+ "learning_rate": 2.8444444444444447e-05,
148
+ "loss": 0.0131,
149
+ "step": 65
150
+ },
151
+ {
152
+ "epoch": 0.2723735408560311,
153
+ "grad_norm": 0.15322668850421906,
154
+ "learning_rate": 3.066666666666666e-05,
155
+ "loss": 0.0089,
156
+ "step": 70
157
+ },
158
+ {
159
+ "epoch": 0.2723735408560311,
160
+ "eval_loss": 0.016315119341015816,
161
+ "eval_runtime": 136.2077,
162
+ "eval_samples_per_second": 3.744,
163
+ "eval_steps_per_second": 0.94,
164
+ "step": 70
165
+ },
166
+ {
167
+ "epoch": 0.2918287937743191,
168
+ "grad_norm": 0.08343034237623215,
169
+ "learning_rate": 3.288888888888889e-05,
170
+ "loss": 0.0076,
171
+ "step": 75
172
+ },
173
+ {
174
+ "epoch": 0.311284046692607,
175
+ "grad_norm": 0.11078440397977829,
176
+ "learning_rate": 3.511111111111111e-05,
177
+ "loss": 0.008,
178
+ "step": 80
179
+ },
180
+ {
181
+ "epoch": 0.311284046692607,
182
+ "eval_loss": 0.015718888491392136,
183
+ "eval_runtime": 136.7712,
184
+ "eval_samples_per_second": 3.729,
185
+ "eval_steps_per_second": 0.936,
186
+ "step": 80
187
+ },
188
+ {
189
+ "epoch": 0.33073929961089493,
190
+ "grad_norm": 0.08361168950796127,
191
+ "learning_rate": 3.733333333333334e-05,
192
+ "loss": 0.008,
193
+ "step": 85
194
+ },
195
+ {
196
+ "epoch": 0.35019455252918286,
197
+ "grad_norm": 0.06539439409971237,
198
+ "learning_rate": 3.9555555555555556e-05,
199
+ "loss": 0.0083,
200
+ "step": 90
201
+ },
202
+ {
203
+ "epoch": 0.35019455252918286,
204
+ "eval_loss": 0.015910081565380096,
205
+ "eval_runtime": 136.5187,
206
+ "eval_samples_per_second": 3.736,
207
+ "eval_steps_per_second": 0.938,
208
+ "step": 90
209
+ },
210
+ {
211
+ "epoch": 0.36964980544747084,
212
+ "grad_norm": 0.14973388612270355,
213
+ "learning_rate": 4.177777777777778e-05,
214
+ "loss": 0.0085,
215
+ "step": 95
216
+ },
217
+ {
218
+ "epoch": 0.38910505836575876,
219
+ "grad_norm": 0.08519362658262253,
220
+ "learning_rate": 4.4e-05,
221
+ "loss": 0.0077,
222
+ "step": 100
223
+ },
224
+ {
225
+ "epoch": 0.38910505836575876,
226
+ "eval_loss": 0.01615685597062111,
227
+ "eval_runtime": 136.4452,
228
+ "eval_samples_per_second": 3.738,
229
+ "eval_steps_per_second": 0.938,
230
+ "step": 100
231
+ },
232
+ {
233
+ "epoch": 0.4085603112840467,
234
+ "grad_norm": 0.05565109848976135,
235
+ "learning_rate": 4.6222222222222224e-05,
236
+ "loss": 0.0085,
237
+ "step": 105
238
+ },
239
+ {
240
+ "epoch": 0.4280155642023346,
241
+ "grad_norm": 0.07286959886550903,
242
+ "learning_rate": 4.844444444444445e-05,
243
+ "loss": 0.0082,
244
+ "step": 110
245
+ },
246
+ {
247
+ "epoch": 0.4280155642023346,
248
+ "eval_loss": 0.015698084607720375,
249
+ "eval_runtime": 136.3832,
250
+ "eval_samples_per_second": 3.739,
251
+ "eval_steps_per_second": 0.939,
252
+ "step": 110
253
+ },
254
+ {
255
+ "epoch": 0.4474708171206226,
256
+ "grad_norm": 0.13329896330833435,
257
+ "learning_rate": 5.066666666666667e-05,
258
+ "loss": 0.0085,
259
+ "step": 115
260
+ },
261
+ {
262
+ "epoch": 0.4669260700389105,
263
+ "grad_norm": 0.04628467932343483,
264
+ "learning_rate": 5.288888888888889e-05,
265
+ "loss": 0.0075,
266
+ "step": 120
267
+ },
268
+ {
269
+ "epoch": 0.4669260700389105,
270
+ "eval_loss": 0.015623296611011028,
271
+ "eval_runtime": 136.0422,
272
+ "eval_samples_per_second": 3.749,
273
+ "eval_steps_per_second": 0.941,
274
+ "step": 120
275
+ },
276
+ {
277
+ "epoch": 0.48638132295719844,
278
+ "grad_norm": 0.058520544320344925,
279
+ "learning_rate": 5.511111111111111e-05,
280
+ "loss": 0.0079,
281
+ "step": 125
282
+ },
283
+ {
284
+ "epoch": 0.5058365758754864,
285
+ "grad_norm": 0.06411632895469666,
286
+ "learning_rate": 5.7333333333333336e-05,
287
+ "loss": 0.0076,
288
+ "step": 130
289
+ },
290
+ {
291
+ "epoch": 0.5058365758754864,
292
+ "eval_loss": 0.014402530156075954,
293
+ "eval_runtime": 135.5623,
294
+ "eval_samples_per_second": 3.762,
295
+ "eval_steps_per_second": 0.944,
296
+ "step": 130
297
+ },
298
+ {
299
+ "epoch": 0.5252918287937743,
300
+ "grad_norm": 0.044081032276153564,
301
+ "learning_rate": 5.9555555555555554e-05,
302
+ "loss": 0.0072,
303
+ "step": 135
304
+ },
305
+ {
306
+ "epoch": 0.5447470817120622,
307
+ "grad_norm": 0.04867592826485634,
308
+ "learning_rate": 6.177777777777779e-05,
309
+ "loss": 0.0077,
310
+ "step": 140
311
+ },
312
+ {
313
+ "epoch": 0.5447470817120622,
314
+ "eval_loss": 0.012970623560249805,
315
+ "eval_runtime": 137.0137,
316
+ "eval_samples_per_second": 3.722,
317
+ "eval_steps_per_second": 0.934,
318
+ "step": 140
319
+ },
320
+ {
321
+ "epoch": 0.5642023346303502,
322
+ "grad_norm": 0.044633813202381134,
323
+ "learning_rate": 6.4e-05,
324
+ "loss": 0.0077,
325
+ "step": 145
326
+ },
327
+ {
328
+ "epoch": 0.5836575875486382,
329
+ "grad_norm": 0.052950419485569,
330
+ "learning_rate": 6.622222222222222e-05,
331
+ "loss": 0.008,
332
+ "step": 150
333
+ },
334
+ {
335
+ "epoch": 0.5836575875486382,
336
+ "eval_loss": 0.012441293336451054,
337
+ "eval_runtime": 136.133,
338
+ "eval_samples_per_second": 3.746,
339
+ "eval_steps_per_second": 0.94,
340
+ "step": 150
341
+ },
342
+ {
343
+ "epoch": 0.603112840466926,
344
+ "grad_norm": 0.039904553443193436,
345
+ "learning_rate": 6.844444444444445e-05,
346
+ "loss": 0.0078,
347
+ "step": 155
348
+ },
349
+ {
350
+ "epoch": 0.622568093385214,
351
+ "grad_norm": 0.05680263414978981,
352
+ "learning_rate": 7.066666666666667e-05,
353
+ "loss": 0.0074,
354
+ "step": 160
355
+ },
356
+ {
357
+ "epoch": 0.622568093385214,
358
+ "eval_loss": 0.01192025002092123,
359
+ "eval_runtime": 136.5582,
360
+ "eval_samples_per_second": 3.735,
361
+ "eval_steps_per_second": 0.937,
362
+ "step": 160
363
+ },
364
+ {
365
+ "epoch": 0.642023346303502,
366
+ "grad_norm": 0.05537933111190796,
367
+ "learning_rate": 7.288888888888888e-05,
368
+ "loss": 0.0076,
369
+ "step": 165
370
+ },
371
+ {
372
+ "epoch": 0.6614785992217899,
373
+ "grad_norm": 0.04935755953192711,
374
+ "learning_rate": 7.511111111111111e-05,
375
+ "loss": 0.0077,
376
+ "step": 170
377
+ },
378
+ {
379
+ "epoch": 0.6614785992217899,
380
+ "eval_loss": 0.012302271090447903,
381
+ "eval_runtime": 136.6757,
382
+ "eval_samples_per_second": 3.731,
383
+ "eval_steps_per_second": 0.937,
384
+ "step": 170
385
+ },
386
+ {
387
+ "epoch": 0.6809338521400778,
388
+ "grad_norm": 0.05575108528137207,
389
+ "learning_rate": 7.733333333333333e-05,
390
+ "loss": 0.0081,
391
+ "step": 175
392
+ },
393
+ {
394
+ "epoch": 0.7003891050583657,
395
+ "grad_norm": 0.0551481656730175,
396
+ "learning_rate": 7.955555555555556e-05,
397
+ "loss": 0.0081,
398
+ "step": 180
399
+ },
400
+ {
401
+ "epoch": 0.7003891050583657,
402
+ "eval_loss": 0.011542496271431446,
403
+ "eval_runtime": 136.4051,
404
+ "eval_samples_per_second": 3.739,
405
+ "eval_steps_per_second": 0.938,
406
+ "step": 180
407
+ },
408
+ {
409
+ "epoch": 0.7198443579766537,
410
+ "grad_norm": 0.04738597571849823,
411
+ "learning_rate": 8.177777777777778e-05,
412
+ "loss": 0.0076,
413
+ "step": 185
414
+ },
415
+ {
416
+ "epoch": 0.7392996108949417,
417
+ "grad_norm": 0.029748599976301193,
418
+ "learning_rate": 8.4e-05,
419
+ "loss": 0.0073,
420
+ "step": 190
421
+ },
422
+ {
423
+ "epoch": 0.7392996108949417,
424
+ "eval_loss": 0.012059729546308517,
425
+ "eval_runtime": 135.8659,
426
+ "eval_samples_per_second": 3.754,
427
+ "eval_steps_per_second": 0.942,
428
+ "step": 190
429
+ },
430
+ {
431
+ "epoch": 0.7587548638132295,
432
+ "grad_norm": 0.03995237499475479,
433
+ "learning_rate": 8.622222222222223e-05,
434
+ "loss": 0.0077,
435
+ "step": 195
436
+ },
437
+ {
438
+ "epoch": 0.7782101167315175,
439
+ "grad_norm": 0.02774854749441147,
440
+ "learning_rate": 8.844444444444445e-05,
441
+ "loss": 0.0075,
442
+ "step": 200
443
+ },
444
+ {
445
+ "epoch": 0.7782101167315175,
446
+ "eval_loss": 0.011773883365094662,
447
+ "eval_runtime": 136.231,
448
+ "eval_samples_per_second": 3.744,
449
+ "eval_steps_per_second": 0.94,
450
+ "step": 200
451
+ },
452
+ {
453
+ "epoch": 0.7976653696498055,
454
+ "grad_norm": 0.026570243760943413,
455
+ "learning_rate": 9.066666666666667e-05,
456
+ "loss": 0.0072,
457
+ "step": 205
458
+ },
459
+ {
460
+ "epoch": 0.8171206225680934,
461
+ "grad_norm": 0.047289494425058365,
462
+ "learning_rate": 9.288888888888888e-05,
463
+ "loss": 0.0074,
464
+ "step": 210
465
+ },
466
+ {
467
+ "epoch": 0.8171206225680934,
468
+ "eval_loss": 0.011569861322641373,
469
+ "eval_runtime": 136.4761,
470
+ "eval_samples_per_second": 3.737,
471
+ "eval_steps_per_second": 0.938,
472
+ "step": 210
473
+ },
474
+ {
475
+ "epoch": 0.8365758754863813,
476
+ "grad_norm": 0.036366503685712814,
477
+ "learning_rate": 9.511111111111112e-05,
478
+ "loss": 0.007,
479
+ "step": 215
480
+ },
481
+ {
482
+ "epoch": 0.8560311284046692,
483
+ "grad_norm": 0.07178617268800735,
484
+ "learning_rate": 9.733333333333333e-05,
485
+ "loss": 0.0073,
486
+ "step": 220
487
+ },
488
+ {
489
+ "epoch": 0.8560311284046692,
490
+ "eval_loss": 0.010845971293747425,
491
+ "eval_runtime": 136.7794,
492
+ "eval_samples_per_second": 3.729,
493
+ "eval_steps_per_second": 0.936,
494
+ "step": 220
495
+ },
496
+ {
497
+ "epoch": 0.8754863813229572,
498
+ "grad_norm": 0.042044900357723236,
499
+ "learning_rate": 9.955555555555556e-05,
500
+ "loss": 0.0069,
501
+ "step": 225
502
+ },
503
+ {
504
+ "epoch": 0.8949416342412452,
505
+ "grad_norm": 0.18266713619232178,
506
+ "learning_rate": 0.00010177777777777777,
507
+ "loss": 0.007,
508
+ "step": 230
509
+ },
510
+ {
511
+ "epoch": 0.8949416342412452,
512
+ "eval_loss": 0.012411113828420639,
513
+ "eval_runtime": 136.7046,
514
+ "eval_samples_per_second": 3.731,
515
+ "eval_steps_per_second": 0.936,
516
+ "step": 230
517
+ },
518
+ {
519
+ "epoch": 0.914396887159533,
520
+ "grad_norm": 0.04458538442850113,
521
+ "learning_rate": 0.00010400000000000001,
522
+ "loss": 0.0077,
523
+ "step": 235
524
+ },
525
+ {
526
+ "epoch": 0.933852140077821,
527
+ "grad_norm": 0.04118124023079872,
528
+ "learning_rate": 0.00010622222222222222,
529
+ "loss": 0.0071,
530
+ "step": 240
531
+ },
532
+ {
533
+ "epoch": 0.933852140077821,
534
+ "eval_loss": 0.016743971034884453,
535
+ "eval_runtime": 136.6604,
536
+ "eval_samples_per_second": 3.732,
537
+ "eval_steps_per_second": 0.937,
538
+ "step": 240
539
+ }
540
+ ],
541
+ "logging_steps": 5,
542
+ "max_steps": 1799,
543
+ "num_input_tokens_seen": 0,
544
+ "num_train_epochs": 7,
545
+ "save_steps": 10,
546
+ "stateful_callbacks": {
547
+ "EarlyStoppingCallback": {
548
+ "args": {
549
+ "early_stopping_patience": 30,
550
+ "early_stopping_threshold": 0.001
551
+ },
552
+ "attributes": {
553
+ "early_stopping_patience_counter": 10
554
+ }
555
+ },
556
+ "TrainerControl": {
557
+ "args": {
558
+ "should_epoch_stop": false,
559
+ "should_evaluate": false,
560
+ "should_log": false,
561
+ "should_save": true,
562
+ "should_training_stop": false
563
+ },
564
+ "attributes": {}
565
+ }
566
+ },
567
+ "total_flos": 1.418065259671895e+17,
568
+ "train_batch_size": 4,
569
+ "trial_name": null,
570
+ "trial_params": null
571
+ }
checkpoint-240/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfc74119a8e3587a81c422ff23316abe5d974ec5c66265926247177773d932ad
3
+ size 6417