Upload folder using huggingface_hub
Browse files- .gitattributes +4 -0
- adapter_config.json +46 -0
- adapter_model.safetensors +3 -0
- chat_template.jinja +9 -0
- checkpoint-250/README.md +208 -0
- checkpoint-250/adapter_config.json +46 -0
- checkpoint-250/adapter_model.safetensors +3 -0
- checkpoint-250/chat_template.jinja +9 -0
- checkpoint-250/optimizer.pt +3 -0
- checkpoint-250/rng_state.pth +3 -0
- checkpoint-250/scaler.pt +3 -0
- checkpoint-250/scheduler.pt +3 -0
- checkpoint-250/tokenizer.json +3 -0
- checkpoint-250/tokenizer_config.json +15 -0
- checkpoint-250/trainer_state.json +209 -0
- checkpoint-250/training_args.bin +3 -0
- checkpoint-300/README.md +208 -0
- checkpoint-300/adapter_config.json +46 -0
- checkpoint-300/adapter_model.safetensors +3 -0
- checkpoint-300/chat_template.jinja +9 -0
- checkpoint-300/optimizer.pt +3 -0
- checkpoint-300/rng_state.pth +3 -0
- checkpoint-300/scaler.pt +3 -0
- checkpoint-300/scheduler.pt +3 -0
- checkpoint-300/tokenizer.json +3 -0
- checkpoint-300/tokenizer_config.json +15 -0
- checkpoint-300/trainer_state.json +244 -0
- checkpoint-300/training_args.bin +3 -0
- checkpoint-327/README.md +208 -0
- checkpoint-327/adapter_config.json +46 -0
- checkpoint-327/adapter_model.safetensors +3 -0
- checkpoint-327/chat_template.jinja +9 -0
- checkpoint-327/optimizer.pt +3 -0
- checkpoint-327/rng_state.pth +3 -0
- checkpoint-327/scaler.pt +3 -0
- checkpoint-327/scheduler.pt +3 -0
- checkpoint-327/tokenizer.json +3 -0
- checkpoint-327/tokenizer_config.json +15 -0
- checkpoint-327/trainer_state.json +258 -0
- checkpoint-327/training_args.bin +3 -0
- tokenizer.json +3 -0
- tokenizer_config.json +15 -0
- trainer_log.jsonl +33 -0
- training_args.bin +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
checkpoint-250/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
checkpoint-300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
checkpoint-327/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
adapter_config.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "unsloth/Qwen3-4B-Instruct-2507-unsloth-bnb-4bit",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 16,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.05,
|
| 22 |
+
"megatron_config": null,
|
| 23 |
+
"megatron_core": "megatron.core",
|
| 24 |
+
"modules_to_save": null,
|
| 25 |
+
"peft_type": "LORA",
|
| 26 |
+
"peft_version": "0.18.1",
|
| 27 |
+
"qalora_group_size": 16,
|
| 28 |
+
"r": 8,
|
| 29 |
+
"rank_pattern": {},
|
| 30 |
+
"revision": null,
|
| 31 |
+
"target_modules": [
|
| 32 |
+
"k_proj",
|
| 33 |
+
"o_proj",
|
| 34 |
+
"q_proj",
|
| 35 |
+
"up_proj",
|
| 36 |
+
"down_proj",
|
| 37 |
+
"v_proj",
|
| 38 |
+
"gate_proj"
|
| 39 |
+
],
|
| 40 |
+
"target_parameters": null,
|
| 41 |
+
"task_type": "CAUSAL_LM",
|
| 42 |
+
"trainable_token_indices": null,
|
| 43 |
+
"use_dora": false,
|
| 44 |
+
"use_qalora": false,
|
| 45 |
+
"use_rslora": false
|
| 46 |
+
}
|
adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4d276a0fbffe1a182923226bdb522064a937d4ce714d0757f20dbc8ae3116ba3
|
| 3 |
+
size 66126768
|
chat_template.jinja
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% set system_message = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.
|
| 2 |
+
|
| 3 |
+
' %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '### Instruction:
|
| 4 |
+
' + content + '
|
| 5 |
+
|
| 6 |
+
### Response:
|
| 7 |
+
' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '
|
| 8 |
+
|
| 9 |
+
' }}{% endif %}{% endfor %}
|
checkpoint-250/README.md
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: unsloth/Qwen3-4B-Instruct-2507-unsloth-bnb-4bit
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:unsloth/Qwen3-4B-Instruct-2507-unsloth-bnb-4bit
|
| 7 |
+
- llama-factory
|
| 8 |
+
- lora
|
| 9 |
+
- transformers
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Model Card for Model ID
|
| 13 |
+
|
| 14 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
## Model Details
|
| 19 |
+
|
| 20 |
+
### Model Description
|
| 21 |
+
|
| 22 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
- **Developed by:** [More Information Needed]
|
| 27 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 28 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 29 |
+
- **Model type:** [More Information Needed]
|
| 30 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 31 |
+
- **License:** [More Information Needed]
|
| 32 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 33 |
+
|
| 34 |
+
### Model Sources [optional]
|
| 35 |
+
|
| 36 |
+
<!-- Provide the basic links for the model. -->
|
| 37 |
+
|
| 38 |
+
- **Repository:** [More Information Needed]
|
| 39 |
+
- **Paper [optional]:** [More Information Needed]
|
| 40 |
+
- **Demo [optional]:** [More Information Needed]
|
| 41 |
+
|
| 42 |
+
## Uses
|
| 43 |
+
|
| 44 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 45 |
+
|
| 46 |
+
### Direct Use
|
| 47 |
+
|
| 48 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 49 |
+
|
| 50 |
+
[More Information Needed]
|
| 51 |
+
|
| 52 |
+
### Downstream Use [optional]
|
| 53 |
+
|
| 54 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 55 |
+
|
| 56 |
+
[More Information Needed]
|
| 57 |
+
|
| 58 |
+
### Out-of-Scope Use
|
| 59 |
+
|
| 60 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 61 |
+
|
| 62 |
+
[More Information Needed]
|
| 63 |
+
|
| 64 |
+
## Bias, Risks, and Limitations
|
| 65 |
+
|
| 66 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 67 |
+
|
| 68 |
+
[More Information Needed]
|
| 69 |
+
|
| 70 |
+
### Recommendations
|
| 71 |
+
|
| 72 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 73 |
+
|
| 74 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 75 |
+
|
| 76 |
+
## How to Get Started with the Model
|
| 77 |
+
|
| 78 |
+
Use the code below to get started with the model.
|
| 79 |
+
|
| 80 |
+
[More Information Needed]
|
| 81 |
+
|
| 82 |
+
## Training Details
|
| 83 |
+
|
| 84 |
+
### Training Data
|
| 85 |
+
|
| 86 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 87 |
+
|
| 88 |
+
[More Information Needed]
|
| 89 |
+
|
| 90 |
+
### Training Procedure
|
| 91 |
+
|
| 92 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 93 |
+
|
| 94 |
+
#### Preprocessing [optional]
|
| 95 |
+
|
| 96 |
+
[More Information Needed]
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
#### Training Hyperparameters
|
| 100 |
+
|
| 101 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 102 |
+
|
| 103 |
+
#### Speeds, Sizes, Times [optional]
|
| 104 |
+
|
| 105 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 106 |
+
|
| 107 |
+
[More Information Needed]
|
| 108 |
+
|
| 109 |
+
## Evaluation
|
| 110 |
+
|
| 111 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 112 |
+
|
| 113 |
+
### Testing Data, Factors & Metrics
|
| 114 |
+
|
| 115 |
+
#### Testing Data
|
| 116 |
+
|
| 117 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 118 |
+
|
| 119 |
+
[More Information Needed]
|
| 120 |
+
|
| 121 |
+
#### Factors
|
| 122 |
+
|
| 123 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 124 |
+
|
| 125 |
+
[More Information Needed]
|
| 126 |
+
|
| 127 |
+
#### Metrics
|
| 128 |
+
|
| 129 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 130 |
+
|
| 131 |
+
[More Information Needed]
|
| 132 |
+
|
| 133 |
+
### Results
|
| 134 |
+
|
| 135 |
+
[More Information Needed]
|
| 136 |
+
|
| 137 |
+
#### Summary
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
## Model Examination [optional]
|
| 142 |
+
|
| 143 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 144 |
+
|
| 145 |
+
[More Information Needed]
|
| 146 |
+
|
| 147 |
+
## Environmental Impact
|
| 148 |
+
|
| 149 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 150 |
+
|
| 151 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 152 |
+
|
| 153 |
+
- **Hardware Type:** [More Information Needed]
|
| 154 |
+
- **Hours used:** [More Information Needed]
|
| 155 |
+
- **Cloud Provider:** [More Information Needed]
|
| 156 |
+
- **Compute Region:** [More Information Needed]
|
| 157 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 158 |
+
|
| 159 |
+
## Technical Specifications [optional]
|
| 160 |
+
|
| 161 |
+
### Model Architecture and Objective
|
| 162 |
+
|
| 163 |
+
[More Information Needed]
|
| 164 |
+
|
| 165 |
+
### Compute Infrastructure
|
| 166 |
+
|
| 167 |
+
[More Information Needed]
|
| 168 |
+
|
| 169 |
+
#### Hardware
|
| 170 |
+
|
| 171 |
+
[More Information Needed]
|
| 172 |
+
|
| 173 |
+
#### Software
|
| 174 |
+
|
| 175 |
+
[More Information Needed]
|
| 176 |
+
|
| 177 |
+
## Citation [optional]
|
| 178 |
+
|
| 179 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 180 |
+
|
| 181 |
+
**BibTeX:**
|
| 182 |
+
|
| 183 |
+
[More Information Needed]
|
| 184 |
+
|
| 185 |
+
**APA:**
|
| 186 |
+
|
| 187 |
+
[More Information Needed]
|
| 188 |
+
|
| 189 |
+
## Glossary [optional]
|
| 190 |
+
|
| 191 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 192 |
+
|
| 193 |
+
[More Information Needed]
|
| 194 |
+
|
| 195 |
+
## More Information [optional]
|
| 196 |
+
|
| 197 |
+
[More Information Needed]
|
| 198 |
+
|
| 199 |
+
## Model Card Authors [optional]
|
| 200 |
+
|
| 201 |
+
[More Information Needed]
|
| 202 |
+
|
| 203 |
+
## Model Card Contact
|
| 204 |
+
|
| 205 |
+
[More Information Needed]
|
| 206 |
+
### Framework versions
|
| 207 |
+
|
| 208 |
+
- PEFT 0.18.1
|
checkpoint-250/adapter_config.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "unsloth/Qwen3-4B-Instruct-2507-unsloth-bnb-4bit",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 16,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.05,
|
| 22 |
+
"megatron_config": null,
|
| 23 |
+
"megatron_core": "megatron.core",
|
| 24 |
+
"modules_to_save": null,
|
| 25 |
+
"peft_type": "LORA",
|
| 26 |
+
"peft_version": "0.18.1",
|
| 27 |
+
"qalora_group_size": 16,
|
| 28 |
+
"r": 8,
|
| 29 |
+
"rank_pattern": {},
|
| 30 |
+
"revision": null,
|
| 31 |
+
"target_modules": [
|
| 32 |
+
"k_proj",
|
| 33 |
+
"o_proj",
|
| 34 |
+
"q_proj",
|
| 35 |
+
"up_proj",
|
| 36 |
+
"down_proj",
|
| 37 |
+
"v_proj",
|
| 38 |
+
"gate_proj"
|
| 39 |
+
],
|
| 40 |
+
"target_parameters": null,
|
| 41 |
+
"task_type": "CAUSAL_LM",
|
| 42 |
+
"trainable_token_indices": null,
|
| 43 |
+
"use_dora": false,
|
| 44 |
+
"use_qalora": false,
|
| 45 |
+
"use_rslora": false
|
| 46 |
+
}
|
checkpoint-250/adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5000888b2f817c4f6ec297f189f5e14caf4a699c47d159cb79bc424d2bb4bf3b
|
| 3 |
+
size 66126768
|
checkpoint-250/chat_template.jinja
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% set system_message = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.
|
| 2 |
+
|
| 3 |
+
' %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '### Instruction:
|
| 4 |
+
' + content + '
|
| 5 |
+
|
| 6 |
+
### Response:
|
| 7 |
+
' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '
|
| 8 |
+
|
| 9 |
+
' }}{% endif %}{% endfor %}
|
checkpoint-250/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff4665f210f499939bd69f309c641565f8cbffa038125eeb8ca94ae7ebcd00aa
|
| 3 |
+
size 132544890
|
checkpoint-250/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:63b50626f6a2d5823cbb6c20c0d4b035b79d012ee694939c2ea97c3cf603650a
|
| 3 |
+
size 14244
|
checkpoint-250/scaler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:16c68c6e81b2043731b2066422bc1aebcfdcc8f836f3eabd6ad514bb6e43a8dc
|
| 3 |
+
size 988
|
checkpoint-250/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fdf437f15cd3fcfec5af14b97a574fe34c305119ffac0aa83e99931515fc14b5
|
| 3 |
+
size 1064
|
checkpoint-250/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
|
| 3 |
+
size 11422650
|
checkpoint-250/tokenizer_config.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": null,
|
| 5 |
+
"clean_up_tokenization_spaces": false,
|
| 6 |
+
"eos_token": "<|im_end|>",
|
| 7 |
+
"errors": "replace",
|
| 8 |
+
"is_local": false,
|
| 9 |
+
"model_max_length": 262144,
|
| 10 |
+
"pad_token": "<|vision_pad|>",
|
| 11 |
+
"padding_side": "right",
|
| 12 |
+
"split_special_tokens": false,
|
| 13 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 14 |
+
"unk_token": null
|
| 15 |
+
}
|
checkpoint-250/trainer_state.json
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.2942528735632184,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 250,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.09195402298850575,
|
| 14 |
+
"grad_norm": 0.18391317129135132,
|
| 15 |
+
"learning_rate": 0.00019449541284403672,
|
| 16 |
+
"loss": 0.38197338581085205,
|
| 17 |
+
"step": 10
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.1839080459770115,
|
| 21 |
+
"grad_norm": 0.14778141677379608,
|
| 22 |
+
"learning_rate": 0.00018837920489296636,
|
| 23 |
+
"loss": 0.12924295663833618,
|
| 24 |
+
"step": 20
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.27586206896551724,
|
| 28 |
+
"grad_norm": 0.0928393080830574,
|
| 29 |
+
"learning_rate": 0.00018226299694189605,
|
| 30 |
+
"loss": 0.10623193979263305,
|
| 31 |
+
"step": 30
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.367816091954023,
|
| 35 |
+
"grad_norm": 0.12639197707176208,
|
| 36 |
+
"learning_rate": 0.0001761467889908257,
|
| 37 |
+
"loss": 0.10601885318756103,
|
| 38 |
+
"step": 40
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.45977011494252873,
|
| 42 |
+
"grad_norm": 0.11163435131311417,
|
| 43 |
+
"learning_rate": 0.00017003058103975534,
|
| 44 |
+
"loss": 0.09006149768829345,
|
| 45 |
+
"step": 50
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.5517241379310345,
|
| 49 |
+
"grad_norm": 0.06934115290641785,
|
| 50 |
+
"learning_rate": 0.00016391437308868503,
|
| 51 |
+
"loss": 0.0927188515663147,
|
| 52 |
+
"step": 60
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.6436781609195402,
|
| 56 |
+
"grad_norm": 0.09345990419387817,
|
| 57 |
+
"learning_rate": 0.0001577981651376147,
|
| 58 |
+
"loss": 0.10100839138031006,
|
| 59 |
+
"step": 70
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.735632183908046,
|
| 63 |
+
"grad_norm": 0.08753989636898041,
|
| 64 |
+
"learning_rate": 0.00015168195718654435,
|
| 65 |
+
"loss": 0.0789786159992218,
|
| 66 |
+
"step": 80
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.8275862068965517,
|
| 70 |
+
"grad_norm": 0.13514496386051178,
|
| 71 |
+
"learning_rate": 0.00014556574923547402,
|
| 72 |
+
"loss": 0.07820955514907837,
|
| 73 |
+
"step": 90
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.9195402298850575,
|
| 77 |
+
"grad_norm": 0.1325644999742508,
|
| 78 |
+
"learning_rate": 0.00013944954128440368,
|
| 79 |
+
"loss": 0.08956178426742553,
|
| 80 |
+
"step": 100
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.0091954022988505,
|
| 84 |
+
"grad_norm": 0.11423685401678085,
|
| 85 |
+
"learning_rate": 0.00013333333333333334,
|
| 86 |
+
"loss": 0.09496147632598877,
|
| 87 |
+
"step": 110
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 1.1011494252873564,
|
| 91 |
+
"grad_norm": 0.08740504831075668,
|
| 92 |
+
"learning_rate": 0.000127217125382263,
|
| 93 |
+
"loss": 0.08737512230873108,
|
| 94 |
+
"step": 120
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 1.193103448275862,
|
| 98 |
+
"grad_norm": 0.07782809436321259,
|
| 99 |
+
"learning_rate": 0.00012110091743119268,
|
| 100 |
+
"loss": 0.07404522895812989,
|
| 101 |
+
"step": 130
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 1.2850574712643679,
|
| 105 |
+
"grad_norm": 0.14405833184719086,
|
| 106 |
+
"learning_rate": 0.00011498470948012233,
|
| 107 |
+
"loss": 0.06943418979644775,
|
| 108 |
+
"step": 140
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 1.3770114942528735,
|
| 112 |
+
"grad_norm": 0.1064399853348732,
|
| 113 |
+
"learning_rate": 0.00010886850152905199,
|
| 114 |
+
"loss": 0.07198014259338378,
|
| 115 |
+
"step": 150
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 1.4689655172413794,
|
| 119 |
+
"grad_norm": 0.1051010936498642,
|
| 120 |
+
"learning_rate": 0.00010275229357798166,
|
| 121 |
+
"loss": 0.07075391411781311,
|
| 122 |
+
"step": 160
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 1.560919540229885,
|
| 126 |
+
"grad_norm": 0.10268348455429077,
|
| 127 |
+
"learning_rate": 9.663608562691132e-05,
|
| 128 |
+
"loss": 0.0763627827167511,
|
| 129 |
+
"step": 170
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 1.6528735632183909,
|
| 133 |
+
"grad_norm": 0.10808967053890228,
|
| 134 |
+
"learning_rate": 9.051987767584099e-05,
|
| 135 |
+
"loss": 0.06987376809120179,
|
| 136 |
+
"step": 180
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 1.7448275862068967,
|
| 140 |
+
"grad_norm": 0.0796312764286995,
|
| 141 |
+
"learning_rate": 8.440366972477065e-05,
|
| 142 |
+
"loss": 0.06609394550323486,
|
| 143 |
+
"step": 190
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 1.8367816091954023,
|
| 147 |
+
"grad_norm": 0.08713788539171219,
|
| 148 |
+
"learning_rate": 7.828746177370031e-05,
|
| 149 |
+
"loss": 0.07024483680725098,
|
| 150 |
+
"step": 200
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 1.928735632183908,
|
| 154 |
+
"grad_norm": 0.0875263661146164,
|
| 155 |
+
"learning_rate": 7.217125382262997e-05,
|
| 156 |
+
"loss": 0.0659551739692688,
|
| 157 |
+
"step": 210
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 2.018390804597701,
|
| 161 |
+
"grad_norm": 0.12321627140045166,
|
| 162 |
+
"learning_rate": 6.605504587155963e-05,
|
| 163 |
+
"loss": 0.06620625853538513,
|
| 164 |
+
"step": 220
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 2.110344827586207,
|
| 168 |
+
"grad_norm": 0.08318830281496048,
|
| 169 |
+
"learning_rate": 5.99388379204893e-05,
|
| 170 |
+
"loss": 0.05962467789649963,
|
| 171 |
+
"step": 230
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 2.2022988505747128,
|
| 175 |
+
"grad_norm": 0.07575304806232452,
|
| 176 |
+
"learning_rate": 5.382262996941896e-05,
|
| 177 |
+
"loss": 0.051931560039520264,
|
| 178 |
+
"step": 240
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 2.2942528735632184,
|
| 182 |
+
"grad_norm": 0.08224116265773773,
|
| 183 |
+
"learning_rate": 4.7706422018348626e-05,
|
| 184 |
+
"loss": 0.06514100432395935,
|
| 185 |
+
"step": 250
|
| 186 |
+
}
|
| 187 |
+
],
|
| 188 |
+
"logging_steps": 10,
|
| 189 |
+
"max_steps": 327,
|
| 190 |
+
"num_input_tokens_seen": 0,
|
| 191 |
+
"num_train_epochs": 3,
|
| 192 |
+
"save_steps": 50,
|
| 193 |
+
"stateful_callbacks": {
|
| 194 |
+
"TrainerControl": {
|
| 195 |
+
"args": {
|
| 196 |
+
"should_epoch_stop": false,
|
| 197 |
+
"should_evaluate": false,
|
| 198 |
+
"should_log": false,
|
| 199 |
+
"should_save": true,
|
| 200 |
+
"should_training_stop": false
|
| 201 |
+
},
|
| 202 |
+
"attributes": {}
|
| 203 |
+
}
|
| 204 |
+
},
|
| 205 |
+
"total_flos": 3.130058760548352e+16,
|
| 206 |
+
"train_batch_size": 2,
|
| 207 |
+
"trial_name": null,
|
| 208 |
+
"trial_params": null
|
| 209 |
+
}
|
checkpoint-250/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1315169111d51de66ae42d33f469b1d4e05758f58d5c44dc9a9d3b59fff13006
|
| 3 |
+
size 5176
|
checkpoint-300/README.md
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: unsloth/Qwen3-4B-Instruct-2507-unsloth-bnb-4bit
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:unsloth/Qwen3-4B-Instruct-2507-unsloth-bnb-4bit
|
| 7 |
+
- llama-factory
|
| 8 |
+
- lora
|
| 9 |
+
- transformers
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Model Card for Model ID
|
| 13 |
+
|
| 14 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
## Model Details
|
| 19 |
+
|
| 20 |
+
### Model Description
|
| 21 |
+
|
| 22 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
- **Developed by:** [More Information Needed]
|
| 27 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 28 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 29 |
+
- **Model type:** [More Information Needed]
|
| 30 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 31 |
+
- **License:** [More Information Needed]
|
| 32 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 33 |
+
|
| 34 |
+
### Model Sources [optional]
|
| 35 |
+
|
| 36 |
+
<!-- Provide the basic links for the model. -->
|
| 37 |
+
|
| 38 |
+
- **Repository:** [More Information Needed]
|
| 39 |
+
- **Paper [optional]:** [More Information Needed]
|
| 40 |
+
- **Demo [optional]:** [More Information Needed]
|
| 41 |
+
|
| 42 |
+
## Uses
|
| 43 |
+
|
| 44 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 45 |
+
|
| 46 |
+
### Direct Use
|
| 47 |
+
|
| 48 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 49 |
+
|
| 50 |
+
[More Information Needed]
|
| 51 |
+
|
| 52 |
+
### Downstream Use [optional]
|
| 53 |
+
|
| 54 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 55 |
+
|
| 56 |
+
[More Information Needed]
|
| 57 |
+
|
| 58 |
+
### Out-of-Scope Use
|
| 59 |
+
|
| 60 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 61 |
+
|
| 62 |
+
[More Information Needed]
|
| 63 |
+
|
| 64 |
+
## Bias, Risks, and Limitations
|
| 65 |
+
|
| 66 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 67 |
+
|
| 68 |
+
[More Information Needed]
|
| 69 |
+
|
| 70 |
+
### Recommendations
|
| 71 |
+
|
| 72 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 73 |
+
|
| 74 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 75 |
+
|
| 76 |
+
## How to Get Started with the Model
|
| 77 |
+
|
| 78 |
+
Use the code below to get started with the model.
|
| 79 |
+
|
| 80 |
+
[More Information Needed]
|
| 81 |
+
|
| 82 |
+
## Training Details
|
| 83 |
+
|
| 84 |
+
### Training Data
|
| 85 |
+
|
| 86 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 87 |
+
|
| 88 |
+
[More Information Needed]
|
| 89 |
+
|
| 90 |
+
### Training Procedure
|
| 91 |
+
|
| 92 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 93 |
+
|
| 94 |
+
#### Preprocessing [optional]
|
| 95 |
+
|
| 96 |
+
[More Information Needed]
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
#### Training Hyperparameters
|
| 100 |
+
|
| 101 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 102 |
+
|
| 103 |
+
#### Speeds, Sizes, Times [optional]
|
| 104 |
+
|
| 105 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 106 |
+
|
| 107 |
+
[More Information Needed]
|
| 108 |
+
|
| 109 |
+
## Evaluation
|
| 110 |
+
|
| 111 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 112 |
+
|
| 113 |
+
### Testing Data, Factors & Metrics
|
| 114 |
+
|
| 115 |
+
#### Testing Data
|
| 116 |
+
|
| 117 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 118 |
+
|
| 119 |
+
[More Information Needed]
|
| 120 |
+
|
| 121 |
+
#### Factors
|
| 122 |
+
|
| 123 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 124 |
+
|
| 125 |
+
[More Information Needed]
|
| 126 |
+
|
| 127 |
+
#### Metrics
|
| 128 |
+
|
| 129 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 130 |
+
|
| 131 |
+
[More Information Needed]
|
| 132 |
+
|
| 133 |
+
### Results
|
| 134 |
+
|
| 135 |
+
[More Information Needed]
|
| 136 |
+
|
| 137 |
+
#### Summary
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
## Model Examination [optional]
|
| 142 |
+
|
| 143 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 144 |
+
|
| 145 |
+
[More Information Needed]
|
| 146 |
+
|
| 147 |
+
## Environmental Impact
|
| 148 |
+
|
| 149 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 150 |
+
|
| 151 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 152 |
+
|
| 153 |
+
- **Hardware Type:** [More Information Needed]
|
| 154 |
+
- **Hours used:** [More Information Needed]
|
| 155 |
+
- **Cloud Provider:** [More Information Needed]
|
| 156 |
+
- **Compute Region:** [More Information Needed]
|
| 157 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 158 |
+
|
| 159 |
+
## Technical Specifications [optional]
|
| 160 |
+
|
| 161 |
+
### Model Architecture and Objective
|
| 162 |
+
|
| 163 |
+
[More Information Needed]
|
| 164 |
+
|
| 165 |
+
### Compute Infrastructure
|
| 166 |
+
|
| 167 |
+
[More Information Needed]
|
| 168 |
+
|
| 169 |
+
#### Hardware
|
| 170 |
+
|
| 171 |
+
[More Information Needed]
|
| 172 |
+
|
| 173 |
+
#### Software
|
| 174 |
+
|
| 175 |
+
[More Information Needed]
|
| 176 |
+
|
| 177 |
+
## Citation [optional]
|
| 178 |
+
|
| 179 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 180 |
+
|
| 181 |
+
**BibTeX:**
|
| 182 |
+
|
| 183 |
+
[More Information Needed]
|
| 184 |
+
|
| 185 |
+
**APA:**
|
| 186 |
+
|
| 187 |
+
[More Information Needed]
|
| 188 |
+
|
| 189 |
+
## Glossary [optional]
|
| 190 |
+
|
| 191 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 192 |
+
|
| 193 |
+
[More Information Needed]
|
| 194 |
+
|
| 195 |
+
## More Information [optional]
|
| 196 |
+
|
| 197 |
+
[More Information Needed]
|
| 198 |
+
|
| 199 |
+
## Model Card Authors [optional]
|
| 200 |
+
|
| 201 |
+
[More Information Needed]
|
| 202 |
+
|
| 203 |
+
## Model Card Contact
|
| 204 |
+
|
| 205 |
+
[More Information Needed]
|
| 206 |
+
### Framework versions
|
| 207 |
+
|
| 208 |
+
- PEFT 0.18.1
|
checkpoint-300/adapter_config.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "unsloth/Qwen3-4B-Instruct-2507-unsloth-bnb-4bit",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 16,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.05,
|
| 22 |
+
"megatron_config": null,
|
| 23 |
+
"megatron_core": "megatron.core",
|
| 24 |
+
"modules_to_save": null,
|
| 25 |
+
"peft_type": "LORA",
|
| 26 |
+
"peft_version": "0.18.1",
|
| 27 |
+
"qalora_group_size": 16,
|
| 28 |
+
"r": 8,
|
| 29 |
+
"rank_pattern": {},
|
| 30 |
+
"revision": null,
|
| 31 |
+
"target_modules": [
|
| 32 |
+
"k_proj",
|
| 33 |
+
"o_proj",
|
| 34 |
+
"q_proj",
|
| 35 |
+
"up_proj",
|
| 36 |
+
"down_proj",
|
| 37 |
+
"v_proj",
|
| 38 |
+
"gate_proj"
|
| 39 |
+
],
|
| 40 |
+
"target_parameters": null,
|
| 41 |
+
"task_type": "CAUSAL_LM",
|
| 42 |
+
"trainable_token_indices": null,
|
| 43 |
+
"use_dora": false,
|
| 44 |
+
"use_qalora": false,
|
| 45 |
+
"use_rslora": false
|
| 46 |
+
}
|
checkpoint-300/adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1f81844757e6f829405c99f3616950be5c85d94992c155151094b7e4833b11e6
|
| 3 |
+
size 66126768
|
checkpoint-300/chat_template.jinja
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% set system_message = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.
|
| 2 |
+
|
| 3 |
+
' %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '### Instruction:
|
| 4 |
+
' + content + '
|
| 5 |
+
|
| 6 |
+
### Response:
|
| 7 |
+
' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '
|
| 8 |
+
|
| 9 |
+
' }}{% endif %}{% endfor %}
|
checkpoint-300/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f2c59a95ef0e8b802daf48b72c2fb4291b67cc49ebf734bcb060c4e97c6050f4
|
| 3 |
+
size 132544890
|
checkpoint-300/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6669b66bcf1896d0b3276103e568e43beb42b15a6a472e198584faaa016298ac
|
| 3 |
+
size 14244
|
checkpoint-300/scaler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c852b1c266b81ad33622067d7dadb27814ab6638ea8e4f9cd67d3515606d043d
|
| 3 |
+
size 988
|
checkpoint-300/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d068f0f4bdf6b4cbc451fcd9841e5d16887374f82053fcafe5850b5c68ceb9fb
|
| 3 |
+
size 1064
|
checkpoint-300/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
|
| 3 |
+
size 11422650
|
checkpoint-300/tokenizer_config.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": null,
|
| 5 |
+
"clean_up_tokenization_spaces": false,
|
| 6 |
+
"eos_token": "<|im_end|>",
|
| 7 |
+
"errors": "replace",
|
| 8 |
+
"is_local": false,
|
| 9 |
+
"model_max_length": 262144,
|
| 10 |
+
"pad_token": "<|vision_pad|>",
|
| 11 |
+
"padding_side": "right",
|
| 12 |
+
"split_special_tokens": false,
|
| 13 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 14 |
+
"unk_token": null
|
| 15 |
+
}
|
checkpoint-300/trainer_state.json
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.754022988505747,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 300,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.09195402298850575,
|
| 14 |
+
"grad_norm": 0.18391317129135132,
|
| 15 |
+
"learning_rate": 0.00019449541284403672,
|
| 16 |
+
"loss": 0.38197338581085205,
|
| 17 |
+
"step": 10
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.1839080459770115,
|
| 21 |
+
"grad_norm": 0.14778141677379608,
|
| 22 |
+
"learning_rate": 0.00018837920489296636,
|
| 23 |
+
"loss": 0.12924295663833618,
|
| 24 |
+
"step": 20
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.27586206896551724,
|
| 28 |
+
"grad_norm": 0.0928393080830574,
|
| 29 |
+
"learning_rate": 0.00018226299694189605,
|
| 30 |
+
"loss": 0.10623193979263305,
|
| 31 |
+
"step": 30
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.367816091954023,
|
| 35 |
+
"grad_norm": 0.12639197707176208,
|
| 36 |
+
"learning_rate": 0.0001761467889908257,
|
| 37 |
+
"loss": 0.10601885318756103,
|
| 38 |
+
"step": 40
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.45977011494252873,
|
| 42 |
+
"grad_norm": 0.11163435131311417,
|
| 43 |
+
"learning_rate": 0.00017003058103975534,
|
| 44 |
+
"loss": 0.09006149768829345,
|
| 45 |
+
"step": 50
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.5517241379310345,
|
| 49 |
+
"grad_norm": 0.06934115290641785,
|
| 50 |
+
"learning_rate": 0.00016391437308868503,
|
| 51 |
+
"loss": 0.0927188515663147,
|
| 52 |
+
"step": 60
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.6436781609195402,
|
| 56 |
+
"grad_norm": 0.09345990419387817,
|
| 57 |
+
"learning_rate": 0.0001577981651376147,
|
| 58 |
+
"loss": 0.10100839138031006,
|
| 59 |
+
"step": 70
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.735632183908046,
|
| 63 |
+
"grad_norm": 0.08753989636898041,
|
| 64 |
+
"learning_rate": 0.00015168195718654435,
|
| 65 |
+
"loss": 0.0789786159992218,
|
| 66 |
+
"step": 80
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.8275862068965517,
|
| 70 |
+
"grad_norm": 0.13514496386051178,
|
| 71 |
+
"learning_rate": 0.00014556574923547402,
|
| 72 |
+
"loss": 0.07820955514907837,
|
| 73 |
+
"step": 90
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.9195402298850575,
|
| 77 |
+
"grad_norm": 0.1325644999742508,
|
| 78 |
+
"learning_rate": 0.00013944954128440368,
|
| 79 |
+
"loss": 0.08956178426742553,
|
| 80 |
+
"step": 100
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.0091954022988505,
|
| 84 |
+
"grad_norm": 0.11423685401678085,
|
| 85 |
+
"learning_rate": 0.00013333333333333334,
|
| 86 |
+
"loss": 0.09496147632598877,
|
| 87 |
+
"step": 110
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 1.1011494252873564,
|
| 91 |
+
"grad_norm": 0.08740504831075668,
|
| 92 |
+
"learning_rate": 0.000127217125382263,
|
| 93 |
+
"loss": 0.08737512230873108,
|
| 94 |
+
"step": 120
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 1.193103448275862,
|
| 98 |
+
"grad_norm": 0.07782809436321259,
|
| 99 |
+
"learning_rate": 0.00012110091743119268,
|
| 100 |
+
"loss": 0.07404522895812989,
|
| 101 |
+
"step": 130
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 1.2850574712643679,
|
| 105 |
+
"grad_norm": 0.14405833184719086,
|
| 106 |
+
"learning_rate": 0.00011498470948012233,
|
| 107 |
+
"loss": 0.06943418979644775,
|
| 108 |
+
"step": 140
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 1.3770114942528735,
|
| 112 |
+
"grad_norm": 0.1064399853348732,
|
| 113 |
+
"learning_rate": 0.00010886850152905199,
|
| 114 |
+
"loss": 0.07198014259338378,
|
| 115 |
+
"step": 150
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 1.4689655172413794,
|
| 119 |
+
"grad_norm": 0.1051010936498642,
|
| 120 |
+
"learning_rate": 0.00010275229357798166,
|
| 121 |
+
"loss": 0.07075391411781311,
|
| 122 |
+
"step": 160
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 1.560919540229885,
|
| 126 |
+
"grad_norm": 0.10268348455429077,
|
| 127 |
+
"learning_rate": 9.663608562691132e-05,
|
| 128 |
+
"loss": 0.0763627827167511,
|
| 129 |
+
"step": 170
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 1.6528735632183909,
|
| 133 |
+
"grad_norm": 0.10808967053890228,
|
| 134 |
+
"learning_rate": 9.051987767584099e-05,
|
| 135 |
+
"loss": 0.06987376809120179,
|
| 136 |
+
"step": 180
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 1.7448275862068967,
|
| 140 |
+
"grad_norm": 0.0796312764286995,
|
| 141 |
+
"learning_rate": 8.440366972477065e-05,
|
| 142 |
+
"loss": 0.06609394550323486,
|
| 143 |
+
"step": 190
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 1.8367816091954023,
|
| 147 |
+
"grad_norm": 0.08713788539171219,
|
| 148 |
+
"learning_rate": 7.828746177370031e-05,
|
| 149 |
+
"loss": 0.07024483680725098,
|
| 150 |
+
"step": 200
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 1.928735632183908,
|
| 154 |
+
"grad_norm": 0.0875263661146164,
|
| 155 |
+
"learning_rate": 7.217125382262997e-05,
|
| 156 |
+
"loss": 0.0659551739692688,
|
| 157 |
+
"step": 210
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 2.018390804597701,
|
| 161 |
+
"grad_norm": 0.12321627140045166,
|
| 162 |
+
"learning_rate": 6.605504587155963e-05,
|
| 163 |
+
"loss": 0.06620625853538513,
|
| 164 |
+
"step": 220
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 2.110344827586207,
|
| 168 |
+
"grad_norm": 0.08318830281496048,
|
| 169 |
+
"learning_rate": 5.99388379204893e-05,
|
| 170 |
+
"loss": 0.05962467789649963,
|
| 171 |
+
"step": 230
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 2.2022988505747128,
|
| 175 |
+
"grad_norm": 0.07575304806232452,
|
| 176 |
+
"learning_rate": 5.382262996941896e-05,
|
| 177 |
+
"loss": 0.051931560039520264,
|
| 178 |
+
"step": 240
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 2.2942528735632184,
|
| 182 |
+
"grad_norm": 0.08224116265773773,
|
| 183 |
+
"learning_rate": 4.7706422018348626e-05,
|
| 184 |
+
"loss": 0.06514100432395935,
|
| 185 |
+
"step": 250
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 2.386206896551724,
|
| 189 |
+
"grad_norm": 0.11719097942113876,
|
| 190 |
+
"learning_rate": 4.159021406727829e-05,
|
| 191 |
+
"loss": 0.06397714018821717,
|
| 192 |
+
"step": 260
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 2.4781609195402297,
|
| 196 |
+
"grad_norm": 0.06252411007881165,
|
| 197 |
+
"learning_rate": 3.5474006116207956e-05,
|
| 198 |
+
"loss": 0.06488621830940247,
|
| 199 |
+
"step": 270
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 2.5701149425287357,
|
| 203 |
+
"grad_norm": 0.09748537093400955,
|
| 204 |
+
"learning_rate": 2.9357798165137618e-05,
|
| 205 |
+
"loss": 0.060347968339920045,
|
| 206 |
+
"step": 280
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 2.6620689655172414,
|
| 210 |
+
"grad_norm": 0.08546270430088043,
|
| 211 |
+
"learning_rate": 2.324159021406728e-05,
|
| 212 |
+
"loss": 0.05238440036773682,
|
| 213 |
+
"step": 290
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 2.754022988505747,
|
| 217 |
+
"grad_norm": 0.10481763631105423,
|
| 218 |
+
"learning_rate": 1.712538226299694e-05,
|
| 219 |
+
"loss": 0.04942976236343384,
|
| 220 |
+
"step": 300
|
| 221 |
+
}
|
| 222 |
+
],
|
| 223 |
+
"logging_steps": 10,
|
| 224 |
+
"max_steps": 327,
|
| 225 |
+
"num_input_tokens_seen": 0,
|
| 226 |
+
"num_train_epochs": 3,
|
| 227 |
+
"save_steps": 50,
|
| 228 |
+
"stateful_callbacks": {
|
| 229 |
+
"TrainerControl": {
|
| 230 |
+
"args": {
|
| 231 |
+
"should_epoch_stop": false,
|
| 232 |
+
"should_evaluate": false,
|
| 233 |
+
"should_log": false,
|
| 234 |
+
"should_save": true,
|
| 235 |
+
"should_training_stop": false
|
| 236 |
+
},
|
| 237 |
+
"attributes": {}
|
| 238 |
+
}
|
| 239 |
+
},
|
| 240 |
+
"total_flos": 3.758120367825715e+16,
|
| 241 |
+
"train_batch_size": 2,
|
| 242 |
+
"trial_name": null,
|
| 243 |
+
"trial_params": null
|
| 244 |
+
}
|
checkpoint-300/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1315169111d51de66ae42d33f469b1d4e05758f58d5c44dc9a9d3b59fff13006
|
| 3 |
+
size 5176
|
checkpoint-327/README.md
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: unsloth/Qwen3-4B-Instruct-2507-unsloth-bnb-4bit
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:unsloth/Qwen3-4B-Instruct-2507-unsloth-bnb-4bit
|
| 7 |
+
- llama-factory
|
| 8 |
+
- lora
|
| 9 |
+
- transformers
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Model Card for Model ID
|
| 13 |
+
|
| 14 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
## Model Details
|
| 19 |
+
|
| 20 |
+
### Model Description
|
| 21 |
+
|
| 22 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
- **Developed by:** [More Information Needed]
|
| 27 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 28 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 29 |
+
- **Model type:** [More Information Needed]
|
| 30 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 31 |
+
- **License:** [More Information Needed]
|
| 32 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 33 |
+
|
| 34 |
+
### Model Sources [optional]
|
| 35 |
+
|
| 36 |
+
<!-- Provide the basic links for the model. -->
|
| 37 |
+
|
| 38 |
+
- **Repository:** [More Information Needed]
|
| 39 |
+
- **Paper [optional]:** [More Information Needed]
|
| 40 |
+
- **Demo [optional]:** [More Information Needed]
|
| 41 |
+
|
| 42 |
+
## Uses
|
| 43 |
+
|
| 44 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 45 |
+
|
| 46 |
+
### Direct Use
|
| 47 |
+
|
| 48 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 49 |
+
|
| 50 |
+
[More Information Needed]
|
| 51 |
+
|
| 52 |
+
### Downstream Use [optional]
|
| 53 |
+
|
| 54 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 55 |
+
|
| 56 |
+
[More Information Needed]
|
| 57 |
+
|
| 58 |
+
### Out-of-Scope Use
|
| 59 |
+
|
| 60 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 61 |
+
|
| 62 |
+
[More Information Needed]
|
| 63 |
+
|
| 64 |
+
## Bias, Risks, and Limitations
|
| 65 |
+
|
| 66 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 67 |
+
|
| 68 |
+
[More Information Needed]
|
| 69 |
+
|
| 70 |
+
### Recommendations
|
| 71 |
+
|
| 72 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 73 |
+
|
| 74 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 75 |
+
|
| 76 |
+
## How to Get Started with the Model
|
| 77 |
+
|
| 78 |
+
Use the code below to get started with the model.
|
| 79 |
+
|
| 80 |
+
[More Information Needed]
|
| 81 |
+
|
| 82 |
+
## Training Details
|
| 83 |
+
|
| 84 |
+
### Training Data
|
| 85 |
+
|
| 86 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 87 |
+
|
| 88 |
+
[More Information Needed]
|
| 89 |
+
|
| 90 |
+
### Training Procedure
|
| 91 |
+
|
| 92 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 93 |
+
|
| 94 |
+
#### Preprocessing [optional]
|
| 95 |
+
|
| 96 |
+
[More Information Needed]
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
#### Training Hyperparameters
|
| 100 |
+
|
| 101 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 102 |
+
|
| 103 |
+
#### Speeds, Sizes, Times [optional]
|
| 104 |
+
|
| 105 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 106 |
+
|
| 107 |
+
[More Information Needed]
|
| 108 |
+
|
| 109 |
+
## Evaluation
|
| 110 |
+
|
| 111 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 112 |
+
|
| 113 |
+
### Testing Data, Factors & Metrics
|
| 114 |
+
|
| 115 |
+
#### Testing Data
|
| 116 |
+
|
| 117 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 118 |
+
|
| 119 |
+
[More Information Needed]
|
| 120 |
+
|
| 121 |
+
#### Factors
|
| 122 |
+
|
| 123 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 124 |
+
|
| 125 |
+
[More Information Needed]
|
| 126 |
+
|
| 127 |
+
#### Metrics
|
| 128 |
+
|
| 129 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 130 |
+
|
| 131 |
+
[More Information Needed]
|
| 132 |
+
|
| 133 |
+
### Results
|
| 134 |
+
|
| 135 |
+
[More Information Needed]
|
| 136 |
+
|
| 137 |
+
#### Summary
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
## Model Examination [optional]
|
| 142 |
+
|
| 143 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 144 |
+
|
| 145 |
+
[More Information Needed]
|
| 146 |
+
|
| 147 |
+
## Environmental Impact
|
| 148 |
+
|
| 149 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 150 |
+
|
| 151 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 152 |
+
|
| 153 |
+
- **Hardware Type:** [More Information Needed]
|
| 154 |
+
- **Hours used:** [More Information Needed]
|
| 155 |
+
- **Cloud Provider:** [More Information Needed]
|
| 156 |
+
- **Compute Region:** [More Information Needed]
|
| 157 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 158 |
+
|
| 159 |
+
## Technical Specifications [optional]
|
| 160 |
+
|
| 161 |
+
### Model Architecture and Objective
|
| 162 |
+
|
| 163 |
+
[More Information Needed]
|
| 164 |
+
|
| 165 |
+
### Compute Infrastructure
|
| 166 |
+
|
| 167 |
+
[More Information Needed]
|
| 168 |
+
|
| 169 |
+
#### Hardware
|
| 170 |
+
|
| 171 |
+
[More Information Needed]
|
| 172 |
+
|
| 173 |
+
#### Software
|
| 174 |
+
|
| 175 |
+
[More Information Needed]
|
| 176 |
+
|
| 177 |
+
## Citation [optional]
|
| 178 |
+
|
| 179 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 180 |
+
|
| 181 |
+
**BibTeX:**
|
| 182 |
+
|
| 183 |
+
[More Information Needed]
|
| 184 |
+
|
| 185 |
+
**APA:**
|
| 186 |
+
|
| 187 |
+
[More Information Needed]
|
| 188 |
+
|
| 189 |
+
## Glossary [optional]
|
| 190 |
+
|
| 191 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 192 |
+
|
| 193 |
+
[More Information Needed]
|
| 194 |
+
|
| 195 |
+
## More Information [optional]
|
| 196 |
+
|
| 197 |
+
[More Information Needed]
|
| 198 |
+
|
| 199 |
+
## Model Card Authors [optional]
|
| 200 |
+
|
| 201 |
+
[More Information Needed]
|
| 202 |
+
|
| 203 |
+
## Model Card Contact
|
| 204 |
+
|
| 205 |
+
[More Information Needed]
|
| 206 |
+
### Framework versions
|
| 207 |
+
|
| 208 |
+
- PEFT 0.18.1
|
checkpoint-327/adapter_config.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "unsloth/Qwen3-4B-Instruct-2507-unsloth-bnb-4bit",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 16,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.05,
|
| 22 |
+
"megatron_config": null,
|
| 23 |
+
"megatron_core": "megatron.core",
|
| 24 |
+
"modules_to_save": null,
|
| 25 |
+
"peft_type": "LORA",
|
| 26 |
+
"peft_version": "0.18.1",
|
| 27 |
+
"qalora_group_size": 16,
|
| 28 |
+
"r": 8,
|
| 29 |
+
"rank_pattern": {},
|
| 30 |
+
"revision": null,
|
| 31 |
+
"target_modules": [
|
| 32 |
+
"k_proj",
|
| 33 |
+
"o_proj",
|
| 34 |
+
"q_proj",
|
| 35 |
+
"up_proj",
|
| 36 |
+
"down_proj",
|
| 37 |
+
"v_proj",
|
| 38 |
+
"gate_proj"
|
| 39 |
+
],
|
| 40 |
+
"target_parameters": null,
|
| 41 |
+
"task_type": "CAUSAL_LM",
|
| 42 |
+
"trainable_token_indices": null,
|
| 43 |
+
"use_dora": false,
|
| 44 |
+
"use_qalora": false,
|
| 45 |
+
"use_rslora": false
|
| 46 |
+
}
|
checkpoint-327/adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a0e796a12d1333a1108f38f457e9b9a594bc0eb5db3d699b950a724e66f911f1
|
| 3 |
+
size 66126768
|
checkpoint-327/chat_template.jinja
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% set system_message = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.
|
| 2 |
+
|
| 3 |
+
' %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '### Instruction:
|
| 4 |
+
' + content + '
|
| 5 |
+
|
| 6 |
+
### Response:
|
| 7 |
+
' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '
|
| 8 |
+
|
| 9 |
+
' }}{% endif %}{% endfor %}
|
checkpoint-327/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:54361732e4c6d9d42d68416fe4a5a482b31c7c251246ff01314f3215e0abec46
|
| 3 |
+
size 132544890
|
checkpoint-327/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:df9ad31b6021f870a1f4437a70c3557ab29d153e123c8e0337666b29e088a102
|
| 3 |
+
size 14244
|
checkpoint-327/scaler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:31be240fb8ca2218bce70aa99c841aba39ceb9239110beabd7a49ddb952ba629
|
| 3 |
+
size 988
|
checkpoint-327/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ce7c716c5cfcfbd4ffe8b9e080bbfcdce2bd2576fe8183942840531b42f3d7e2
|
| 3 |
+
size 1064
|
checkpoint-327/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
|
| 3 |
+
size 11422650
|
checkpoint-327/tokenizer_config.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": null,
|
| 5 |
+
"clean_up_tokenization_spaces": false,
|
| 6 |
+
"eos_token": "<|im_end|>",
|
| 7 |
+
"errors": "replace",
|
| 8 |
+
"is_local": false,
|
| 9 |
+
"model_max_length": 262144,
|
| 10 |
+
"pad_token": "<|vision_pad|>",
|
| 11 |
+
"padding_side": "right",
|
| 12 |
+
"split_special_tokens": false,
|
| 13 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 14 |
+
"unk_token": null
|
| 15 |
+
}
|
checkpoint-327/trainer_state.json
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 327,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.09195402298850575,
|
| 14 |
+
"grad_norm": 0.18391317129135132,
|
| 15 |
+
"learning_rate": 0.00019449541284403672,
|
| 16 |
+
"loss": 0.38197338581085205,
|
| 17 |
+
"step": 10
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.1839080459770115,
|
| 21 |
+
"grad_norm": 0.14778141677379608,
|
| 22 |
+
"learning_rate": 0.00018837920489296636,
|
| 23 |
+
"loss": 0.12924295663833618,
|
| 24 |
+
"step": 20
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.27586206896551724,
|
| 28 |
+
"grad_norm": 0.0928393080830574,
|
| 29 |
+
"learning_rate": 0.00018226299694189605,
|
| 30 |
+
"loss": 0.10623193979263305,
|
| 31 |
+
"step": 30
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.367816091954023,
|
| 35 |
+
"grad_norm": 0.12639197707176208,
|
| 36 |
+
"learning_rate": 0.0001761467889908257,
|
| 37 |
+
"loss": 0.10601885318756103,
|
| 38 |
+
"step": 40
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.45977011494252873,
|
| 42 |
+
"grad_norm": 0.11163435131311417,
|
| 43 |
+
"learning_rate": 0.00017003058103975534,
|
| 44 |
+
"loss": 0.09006149768829345,
|
| 45 |
+
"step": 50
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.5517241379310345,
|
| 49 |
+
"grad_norm": 0.06934115290641785,
|
| 50 |
+
"learning_rate": 0.00016391437308868503,
|
| 51 |
+
"loss": 0.0927188515663147,
|
| 52 |
+
"step": 60
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.6436781609195402,
|
| 56 |
+
"grad_norm": 0.09345990419387817,
|
| 57 |
+
"learning_rate": 0.0001577981651376147,
|
| 58 |
+
"loss": 0.10100839138031006,
|
| 59 |
+
"step": 70
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.735632183908046,
|
| 63 |
+
"grad_norm": 0.08753989636898041,
|
| 64 |
+
"learning_rate": 0.00015168195718654435,
|
| 65 |
+
"loss": 0.0789786159992218,
|
| 66 |
+
"step": 80
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.8275862068965517,
|
| 70 |
+
"grad_norm": 0.13514496386051178,
|
| 71 |
+
"learning_rate": 0.00014556574923547402,
|
| 72 |
+
"loss": 0.07820955514907837,
|
| 73 |
+
"step": 90
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.9195402298850575,
|
| 77 |
+
"grad_norm": 0.1325644999742508,
|
| 78 |
+
"learning_rate": 0.00013944954128440368,
|
| 79 |
+
"loss": 0.08956178426742553,
|
| 80 |
+
"step": 100
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.0091954022988505,
|
| 84 |
+
"grad_norm": 0.11423685401678085,
|
| 85 |
+
"learning_rate": 0.00013333333333333334,
|
| 86 |
+
"loss": 0.09496147632598877,
|
| 87 |
+
"step": 110
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 1.1011494252873564,
|
| 91 |
+
"grad_norm": 0.08740504831075668,
|
| 92 |
+
"learning_rate": 0.000127217125382263,
|
| 93 |
+
"loss": 0.08737512230873108,
|
| 94 |
+
"step": 120
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 1.193103448275862,
|
| 98 |
+
"grad_norm": 0.07782809436321259,
|
| 99 |
+
"learning_rate": 0.00012110091743119268,
|
| 100 |
+
"loss": 0.07404522895812989,
|
| 101 |
+
"step": 130
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 1.2850574712643679,
|
| 105 |
+
"grad_norm": 0.14405833184719086,
|
| 106 |
+
"learning_rate": 0.00011498470948012233,
|
| 107 |
+
"loss": 0.06943418979644775,
|
| 108 |
+
"step": 140
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 1.3770114942528735,
|
| 112 |
+
"grad_norm": 0.1064399853348732,
|
| 113 |
+
"learning_rate": 0.00010886850152905199,
|
| 114 |
+
"loss": 0.07198014259338378,
|
| 115 |
+
"step": 150
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 1.4689655172413794,
|
| 119 |
+
"grad_norm": 0.1051010936498642,
|
| 120 |
+
"learning_rate": 0.00010275229357798166,
|
| 121 |
+
"loss": 0.07075391411781311,
|
| 122 |
+
"step": 160
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 1.560919540229885,
|
| 126 |
+
"grad_norm": 0.10268348455429077,
|
| 127 |
+
"learning_rate": 9.663608562691132e-05,
|
| 128 |
+
"loss": 0.0763627827167511,
|
| 129 |
+
"step": 170
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 1.6528735632183909,
|
| 133 |
+
"grad_norm": 0.10808967053890228,
|
| 134 |
+
"learning_rate": 9.051987767584099e-05,
|
| 135 |
+
"loss": 0.06987376809120179,
|
| 136 |
+
"step": 180
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 1.7448275862068967,
|
| 140 |
+
"grad_norm": 0.0796312764286995,
|
| 141 |
+
"learning_rate": 8.440366972477065e-05,
|
| 142 |
+
"loss": 0.06609394550323486,
|
| 143 |
+
"step": 190
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 1.8367816091954023,
|
| 147 |
+
"grad_norm": 0.08713788539171219,
|
| 148 |
+
"learning_rate": 7.828746177370031e-05,
|
| 149 |
+
"loss": 0.07024483680725098,
|
| 150 |
+
"step": 200
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 1.928735632183908,
|
| 154 |
+
"grad_norm": 0.0875263661146164,
|
| 155 |
+
"learning_rate": 7.217125382262997e-05,
|
| 156 |
+
"loss": 0.0659551739692688,
|
| 157 |
+
"step": 210
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 2.018390804597701,
|
| 161 |
+
"grad_norm": 0.12321627140045166,
|
| 162 |
+
"learning_rate": 6.605504587155963e-05,
|
| 163 |
+
"loss": 0.06620625853538513,
|
| 164 |
+
"step": 220
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 2.110344827586207,
|
| 168 |
+
"grad_norm": 0.08318830281496048,
|
| 169 |
+
"learning_rate": 5.99388379204893e-05,
|
| 170 |
+
"loss": 0.05962467789649963,
|
| 171 |
+
"step": 230
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 2.2022988505747128,
|
| 175 |
+
"grad_norm": 0.07575304806232452,
|
| 176 |
+
"learning_rate": 5.382262996941896e-05,
|
| 177 |
+
"loss": 0.051931560039520264,
|
| 178 |
+
"step": 240
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 2.2942528735632184,
|
| 182 |
+
"grad_norm": 0.08224116265773773,
|
| 183 |
+
"learning_rate": 4.7706422018348626e-05,
|
| 184 |
+
"loss": 0.06514100432395935,
|
| 185 |
+
"step": 250
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 2.386206896551724,
|
| 189 |
+
"grad_norm": 0.11719097942113876,
|
| 190 |
+
"learning_rate": 4.159021406727829e-05,
|
| 191 |
+
"loss": 0.06397714018821717,
|
| 192 |
+
"step": 260
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 2.4781609195402297,
|
| 196 |
+
"grad_norm": 0.06252411007881165,
|
| 197 |
+
"learning_rate": 3.5474006116207956e-05,
|
| 198 |
+
"loss": 0.06488621830940247,
|
| 199 |
+
"step": 270
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 2.5701149425287357,
|
| 203 |
+
"grad_norm": 0.09748537093400955,
|
| 204 |
+
"learning_rate": 2.9357798165137618e-05,
|
| 205 |
+
"loss": 0.060347968339920045,
|
| 206 |
+
"step": 280
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 2.6620689655172414,
|
| 210 |
+
"grad_norm": 0.08546270430088043,
|
| 211 |
+
"learning_rate": 2.324159021406728e-05,
|
| 212 |
+
"loss": 0.05238440036773682,
|
| 213 |
+
"step": 290
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 2.754022988505747,
|
| 217 |
+
"grad_norm": 0.10481763631105423,
|
| 218 |
+
"learning_rate": 1.712538226299694e-05,
|
| 219 |
+
"loss": 0.04942976236343384,
|
| 220 |
+
"step": 300
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"epoch": 2.845977011494253,
|
| 224 |
+
"grad_norm": 0.08988472819328308,
|
| 225 |
+
"learning_rate": 1.1009174311926607e-05,
|
| 226 |
+
"loss": 0.0647299349308014,
|
| 227 |
+
"step": 310
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"epoch": 2.9379310344827587,
|
| 231 |
+
"grad_norm": 0.08786249905824661,
|
| 232 |
+
"learning_rate": 4.892966360856269e-06,
|
| 233 |
+
"loss": 0.06125383973121643,
|
| 234 |
+
"step": 320
|
| 235 |
+
}
|
| 236 |
+
],
|
| 237 |
+
"logging_steps": 10,
|
| 238 |
+
"max_steps": 327,
|
| 239 |
+
"num_input_tokens_seen": 0,
|
| 240 |
+
"num_train_epochs": 3,
|
| 241 |
+
"save_steps": 50,
|
| 242 |
+
"stateful_callbacks": {
|
| 243 |
+
"TrainerControl": {
|
| 244 |
+
"args": {
|
| 245 |
+
"should_epoch_stop": false,
|
| 246 |
+
"should_evaluate": false,
|
| 247 |
+
"should_log": false,
|
| 248 |
+
"should_save": true,
|
| 249 |
+
"should_training_stop": true
|
| 250 |
+
},
|
| 251 |
+
"attributes": {}
|
| 252 |
+
}
|
| 253 |
+
},
|
| 254 |
+
"total_flos": 4.092351880937472e+16,
|
| 255 |
+
"train_batch_size": 2,
|
| 256 |
+
"trial_name": null,
|
| 257 |
+
"trial_params": null
|
| 258 |
+
}
|
checkpoint-327/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1315169111d51de66ae42d33f469b1d4e05758f58d5c44dc9a9d3b59fff13006
|
| 3 |
+
size 5176
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
|
| 3 |
+
size 11422650
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": null,
|
| 5 |
+
"clean_up_tokenization_spaces": false,
|
| 6 |
+
"eos_token": "<|im_end|>",
|
| 7 |
+
"errors": "replace",
|
| 8 |
+
"is_local": false,
|
| 9 |
+
"model_max_length": 262144,
|
| 10 |
+
"pad_token": "<|vision_pad|>",
|
| 11 |
+
"padding_side": "right",
|
| 12 |
+
"split_special_tokens": false,
|
| 13 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 14 |
+
"unk_token": null
|
| 15 |
+
}
|
trainer_log.jsonl
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"current_steps": 10, "total_steps": 327, "loss": 0.38197338581085205, "lr": 0.00019449541284403672, "epoch": 0.09195402298850575, "percentage": 3.06, "elapsed_time": "0:03:54", "remaining_time": "2:04:07"}
|
| 2 |
+
{"current_steps": 20, "total_steps": 327, "loss": 0.12924295663833618, "lr": 0.00018837920489296636, "epoch": 0.1839080459770115, "percentage": 6.12, "elapsed_time": "0:08:25", "remaining_time": "2:09:12"}
|
| 3 |
+
{"current_steps": 30, "total_steps": 327, "loss": 0.10623193979263305, "lr": 0.00018226299694189605, "epoch": 0.27586206896551724, "percentage": 9.17, "elapsed_time": "0:12:54", "remaining_time": "2:07:52"}
|
| 4 |
+
{"current_steps": 40, "total_steps": 327, "loss": 0.10601885318756103, "lr": 0.0001761467889908257, "epoch": 0.367816091954023, "percentage": 12.23, "elapsed_time": "0:17:24", "remaining_time": "2:04:55"}
|
| 5 |
+
{"current_steps": 50, "total_steps": 327, "loss": 0.09006149768829345, "lr": 0.00017003058103975534, "epoch": 0.45977011494252873, "percentage": 15.29, "elapsed_time": "0:21:53", "remaining_time": "2:01:17"}
|
| 6 |
+
{"current_steps": 60, "total_steps": 327, "loss": 0.0927188515663147, "lr": 0.00016391437308868503, "epoch": 0.5517241379310345, "percentage": 18.35, "elapsed_time": "0:26:23", "remaining_time": "1:57:25"}
|
| 7 |
+
{"current_steps": 70, "total_steps": 327, "loss": 0.10100839138031006, "lr": 0.0001577981651376147, "epoch": 0.6436781609195402, "percentage": 21.41, "elapsed_time": "0:30:46", "remaining_time": "1:53:00"}
|
| 8 |
+
{"current_steps": 80, "total_steps": 327, "loss": 0.0789786159992218, "lr": 0.00015168195718654435, "epoch": 0.735632183908046, "percentage": 24.46, "elapsed_time": "0:35:11", "remaining_time": "1:48:40"}
|
| 9 |
+
{"current_steps": 90, "total_steps": 327, "loss": 0.07820955514907837, "lr": 0.00014556574923547402, "epoch": 0.8275862068965517, "percentage": 27.52, "elapsed_time": "0:39:38", "remaining_time": "1:44:22"}
|
| 10 |
+
{"current_steps": 100, "total_steps": 327, "loss": 0.08956178426742553, "lr": 0.00013944954128440368, "epoch": 0.9195402298850575, "percentage": 30.58, "elapsed_time": "0:44:04", "remaining_time": "1:40:03"}
|
| 11 |
+
{"current_steps": 110, "total_steps": 327, "loss": 0.09496147632598877, "lr": 0.00013333333333333334, "epoch": 1.0091954022988505, "percentage": 33.64, "elapsed_time": "0:48:23", "remaining_time": "1:35:27"}
|
| 12 |
+
{"current_steps": 120, "total_steps": 327, "loss": 0.08737512230873108, "lr": 0.000127217125382263, "epoch": 1.1011494252873564, "percentage": 36.7, "elapsed_time": "0:52:50", "remaining_time": "1:31:08"}
|
| 13 |
+
{"current_steps": 130, "total_steps": 327, "loss": 0.07404522895812989, "lr": 0.00012110091743119268, "epoch": 1.193103448275862, "percentage": 39.76, "elapsed_time": "0:57:17", "remaining_time": "1:26:48"}
|
| 14 |
+
{"current_steps": 140, "total_steps": 327, "loss": 0.06943418979644775, "lr": 0.00011498470948012233, "epoch": 1.2850574712643679, "percentage": 42.81, "elapsed_time": "1:01:44", "remaining_time": "1:22:28"}
|
| 15 |
+
{"current_steps": 150, "total_steps": 327, "loss": 0.07198014259338378, "lr": 0.00010886850152905199, "epoch": 1.3770114942528735, "percentage": 45.87, "elapsed_time": "1:06:10", "remaining_time": "1:18:05"}
|
| 16 |
+
{"current_steps": 160, "total_steps": 327, "loss": 0.07075391411781311, "lr": 0.00010275229357798166, "epoch": 1.4689655172413794, "percentage": 48.93, "elapsed_time": "1:10:39", "remaining_time": "1:13:44"}
|
| 17 |
+
{"current_steps": 170, "total_steps": 327, "loss": 0.0763627827167511, "lr": 9.663608562691132e-05, "epoch": 1.560919540229885, "percentage": 51.99, "elapsed_time": "1:15:05", "remaining_time": "1:09:21"}
|
| 18 |
+
{"current_steps": 180, "total_steps": 327, "loss": 0.06987376809120179, "lr": 9.051987767584099e-05, "epoch": 1.6528735632183909, "percentage": 55.05, "elapsed_time": "1:19:32", "remaining_time": "1:04:57"}
|
| 19 |
+
{"current_steps": 190, "total_steps": 327, "loss": 0.06609394550323486, "lr": 8.440366972477065e-05, "epoch": 1.7448275862068967, "percentage": 58.1, "elapsed_time": "1:23:59", "remaining_time": "1:00:33"}
|
| 20 |
+
{"current_steps": 200, "total_steps": 327, "loss": 0.07024483680725098, "lr": 7.828746177370031e-05, "epoch": 1.8367816091954023, "percentage": 61.16, "elapsed_time": "1:28:26", "remaining_time": "0:56:09"}
|
| 21 |
+
{"current_steps": 210, "total_steps": 327, "loss": 0.0659551739692688, "lr": 7.217125382262997e-05, "epoch": 1.928735632183908, "percentage": 64.22, "elapsed_time": "1:32:54", "remaining_time": "0:51:45"}
|
| 22 |
+
{"current_steps": 220, "total_steps": 327, "loss": 0.06620625853538513, "lr": 6.605504587155963e-05, "epoch": 2.018390804597701, "percentage": 67.28, "elapsed_time": "1:37:11", "remaining_time": "0:47:16"}
|
| 23 |
+
{"current_steps": 230, "total_steps": 327, "loss": 0.05962467789649963, "lr": 5.99388379204893e-05, "epoch": 2.110344827586207, "percentage": 70.34, "elapsed_time": "1:41:37", "remaining_time": "0:42:51"}
|
| 24 |
+
{"current_steps": 240, "total_steps": 327, "loss": 0.051931560039520264, "lr": 5.382262996941896e-05, "epoch": 2.2022988505747128, "percentage": 73.39, "elapsed_time": "1:46:04", "remaining_time": "0:38:27"}
|
| 25 |
+
{"current_steps": 250, "total_steps": 327, "loss": 0.06514100432395935, "lr": 4.7706422018348626e-05, "epoch": 2.2942528735632184, "percentage": 76.45, "elapsed_time": "1:50:31", "remaining_time": "0:34:02"}
|
| 26 |
+
{"current_steps": 260, "total_steps": 327, "loss": 0.06397714018821717, "lr": 4.159021406727829e-05, "epoch": 2.386206896551724, "percentage": 79.51, "elapsed_time": "1:54:59", "remaining_time": "0:29:37"}
|
| 27 |
+
{"current_steps": 270, "total_steps": 327, "loss": 0.06488621830940247, "lr": 3.5474006116207956e-05, "epoch": 2.4781609195402297, "percentage": 82.57, "elapsed_time": "1:59:26", "remaining_time": "0:25:12"}
|
| 28 |
+
{"current_steps": 280, "total_steps": 327, "loss": 0.060347968339920045, "lr": 2.9357798165137618e-05, "epoch": 2.5701149425287357, "percentage": 85.63, "elapsed_time": "2:03:53", "remaining_time": "0:20:47"}
|
| 29 |
+
{"current_steps": 290, "total_steps": 327, "loss": 0.05238440036773682, "lr": 2.324159021406728e-05, "epoch": 2.6620689655172414, "percentage": 88.69, "elapsed_time": "2:08:20", "remaining_time": "0:16:22"}
|
| 30 |
+
{"current_steps": 300, "total_steps": 327, "loss": 0.04942976236343384, "lr": 1.712538226299694e-05, "epoch": 2.754022988505747, "percentage": 91.74, "elapsed_time": "2:12:48", "remaining_time": "0:11:57"}
|
| 31 |
+
{"current_steps": 310, "total_steps": 327, "loss": 0.0647299349308014, "lr": 1.1009174311926607e-05, "epoch": 2.845977011494253, "percentage": 94.8, "elapsed_time": "2:17:16", "remaining_time": "0:07:31"}
|
| 32 |
+
{"current_steps": 320, "total_steps": 327, "loss": 0.06125383973121643, "lr": 4.892966360856269e-06, "epoch": 2.9379310344827587, "percentage": 97.86, "elapsed_time": "2:21:43", "remaining_time": "0:03:06"}
|
| 33 |
+
{"current_steps": 327, "total_steps": 327, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "2:24:42", "remaining_time": "0:00:00"}
|
training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1315169111d51de66ae42d33f469b1d4e05758f58d5c44dc9a9d3b59fff13006
|
| 3 |
+
size 5176
|