Upload 13 files
Browse files- .gitattributes +1 -0
- README.md +376 -0
- all_results.json +15 -0
- config.json +85 -0
- eval_results.json +9 -0
- model.safetensors +3 -0
- predict_results.txt +0 -0
- sentencepiece.bpe.model +3 -0
- special_tokens_map.json +51 -0
- tokenizer.json +3 -0
- tokenizer_config.json +56 -0
- train_results.json +9 -0
- trainer_state.json +910 -0
- training_args.bin +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,376 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language:
|
| 3 |
+
- afr
|
| 4 |
+
- als
|
| 5 |
+
- amh
|
| 6 |
+
- arb
|
| 7 |
+
- ars
|
| 8 |
+
- ary
|
| 9 |
+
- arz
|
| 10 |
+
- asm
|
| 11 |
+
- azj
|
| 12 |
+
- bel
|
| 13 |
+
- ben
|
| 14 |
+
- bew
|
| 15 |
+
- bos
|
| 16 |
+
- bul
|
| 17 |
+
- cat
|
| 18 |
+
- ces
|
| 19 |
+
- ckb
|
| 20 |
+
- cmn
|
| 21 |
+
- cym
|
| 22 |
+
- dan
|
| 23 |
+
- deu
|
| 24 |
+
- div
|
| 25 |
+
- ekk
|
| 26 |
+
- ell
|
| 27 |
+
- eng
|
| 28 |
+
- epo
|
| 29 |
+
- eus
|
| 30 |
+
- fao
|
| 31 |
+
- fas
|
| 32 |
+
- fil
|
| 33 |
+
- fin
|
| 34 |
+
- fra
|
| 35 |
+
- fry
|
| 36 |
+
- gle
|
| 37 |
+
- glg
|
| 38 |
+
- guj
|
| 39 |
+
- hau
|
| 40 |
+
- heb
|
| 41 |
+
- hin
|
| 42 |
+
- hrv
|
| 43 |
+
- hun
|
| 44 |
+
- hye
|
| 45 |
+
- ind
|
| 46 |
+
- isl
|
| 47 |
+
- ita
|
| 48 |
+
- jpn
|
| 49 |
+
- kan
|
| 50 |
+
- kat
|
| 51 |
+
- kaz
|
| 52 |
+
- khk
|
| 53 |
+
- khm
|
| 54 |
+
- kin
|
| 55 |
+
- kir
|
| 56 |
+
- kmr
|
| 57 |
+
- kor
|
| 58 |
+
- lao
|
| 59 |
+
- lat
|
| 60 |
+
- lit
|
| 61 |
+
- ltz
|
| 62 |
+
- lvs
|
| 63 |
+
- mal
|
| 64 |
+
- mar
|
| 65 |
+
- mkd
|
| 66 |
+
- mlt
|
| 67 |
+
- mya
|
| 68 |
+
- nld
|
| 69 |
+
- nno
|
| 70 |
+
- nob
|
| 71 |
+
- npi
|
| 72 |
+
- nrm
|
| 73 |
+
- ory
|
| 74 |
+
- pan
|
| 75 |
+
- pbt
|
| 76 |
+
- plt
|
| 77 |
+
- pol
|
| 78 |
+
- por
|
| 79 |
+
- ron
|
| 80 |
+
- rus
|
| 81 |
+
- sin
|
| 82 |
+
- slk
|
| 83 |
+
- slv
|
| 84 |
+
- snd
|
| 85 |
+
- som
|
| 86 |
+
- spa
|
| 87 |
+
- srp
|
| 88 |
+
- swe
|
| 89 |
+
- swh
|
| 90 |
+
- tam
|
| 91 |
+
- tel
|
| 92 |
+
- tgk
|
| 93 |
+
- tha
|
| 94 |
+
- tur
|
| 95 |
+
- ukr
|
| 96 |
+
- urd
|
| 97 |
+
- uzn
|
| 98 |
+
- vie
|
| 99 |
+
- xho
|
| 100 |
+
- yue
|
| 101 |
+
- zsm
|
| 102 |
+
license: mit
|
| 103 |
+
base_model:
|
| 104 |
+
- intfloat/multilingual-e5-small
|
| 105 |
+
datasets:
|
| 106 |
+
- agentlans/multilingual-document-classification
|
| 107 |
+
metrics:
|
| 108 |
+
- f1
|
| 109 |
+
- loss
|
| 110 |
+
model-index:
|
| 111 |
+
- name: multilingual-e5-small-domain-classifier
|
| 112 |
+
results:
|
| 113 |
+
- task:
|
| 114 |
+
type: text-classification
|
| 115 |
+
name: Text Classification
|
| 116 |
+
metrics:
|
| 117 |
+
- type: f1
|
| 118 |
+
value: 0.7709
|
| 119 |
+
name: Evaluation F1
|
| 120 |
+
- type: loss
|
| 121 |
+
value: 0.9974
|
| 122 |
+
name: Evaluation Loss
|
| 123 |
+
---
|
| 124 |
+
# multilingual-e5-small Domain Classifier
|
| 125 |
+
|
| 126 |
+
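A fine-tuned version of [intfloat/multilingual-e5-small](https://huggingface.co/intfloat/multilingual-e5-small) (**bert** architecture, `BertForSequenceClassification`) optimized for the `text-classification` task: it assigns multilingual text to one of 26 domain labels.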
|
| 127 |
+
|
| 128 |
+
- **Model type:** bert
|
| 129 |
+
- **Problem Type:** single_label_classification
|
| 130 |
+
- **Number of Labels:** 26
|
| 131 |
+
- **Vocabulary Size:** 250037
|
| 132 |
+
- **License:** MIT
|
| 133 |
+
|
| 134 |
+
## Use
|
| 135 |
+
|
| 136 |
+
To get started with this model in Python using the Hugging Face Transformers library, run the following code:
|
| 137 |
+
|
| 138 |
+
```python
|
| 139 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 140 |
+
import torch
|
| 141 |
+
|
| 142 |
+
model_id = "agentlans/multilingual-e5-small-domain-classifier"
|
| 143 |
+
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
| 144 |
+
model = AutoModelForSequenceClassification.from_pretrained(model_id)
|
| 145 |
+
|
| 146 |
+
text = "Replace this with your input text."
|
| 147 |
+
inputs = tokenizer(text, return_tensors="pt")
|
| 148 |
+
|
| 149 |
+
with torch.no_grad():
|
| 150 |
+
logits = model(**inputs).logits
|
| 151 |
+
|
| 152 |
+
predicted_class_id = logits.argmax().item()
|
| 153 |
+
predicted_class_name = model.config.id2label[predicted_class_id]
|
| 154 |
+
|
| 155 |
+
print(f"Predicted Class ID: {predicted_class_id}")
|
| 156 |
+
print(f"Predicted Class Name: {predicted_class_name}")
|
| 157 |
+
```
|
| 158 |
+
|
| 159 |
+
## Intended Uses & Limitations
|
| 160 |
+
|
| 161 |
+
### Intended Use
|
| 162 |
+
This model is designed for single-label sequence classification: each input text is assigned exactly one of the 26 domain labels. The class labels and their corresponding IDs are:
|
| 163 |
+
|
| 164 |
+
| Label ID | Label Name |
|
| 165 |
+
|---|---|
|
| 166 |
+
| 0 | Adult |
|
| 167 |
+
| 1 | Arts_and_Entertainment |
|
| 168 |
+
| 2 | Autos_and_Vehicles |
|
| 169 |
+
| 3 | Beauty_and_Fitness |
|
| 170 |
+
| 4 | Books_and_Literature |
|
| 171 |
+
| 5 | Business_and_Industrial |
|
| 172 |
+
| 6 | Computers_and_Electronics |
|
| 173 |
+
| 7 | Finance |
|
| 174 |
+
| 8 | Food_and_Drink |
|
| 175 |
+
| 9 | Games |
|
| 176 |
+
| 10 | Health |
|
| 177 |
+
| 11 | Hobbies_and_Leisure |
|
| 178 |
+
| 12 | Home_and_Garden |
|
| 179 |
+
| 13 | Internet_and_Telecom |
|
| 180 |
+
| 14 | Jobs_and_Education |
|
| 181 |
+
| 15 | Law_and_Government |
|
| 182 |
+
| 16 | News |
|
| 183 |
+
| 17 | Online_Communities |
|
| 184 |
+
| 18 | People_and_Society |
|
| 185 |
+
| 19 | Pets_and_Animals |
|
| 186 |
+
| 20 | Real_Estate |
|
| 187 |
+
| 21 | Science |
|
| 188 |
+
| 22 | Sensitive_Subjects |
|
| 189 |
+
| 23 | Shopping |
|
| 190 |
+
| 24 | Sports |
|
| 191 |
+
| 25 | Travel_and_Transportation |
|
| 192 |
+
|
| 193 |
+
## Training Details
|
| 194 |
+
|
| 195 |
+
### Hyperparameters
|
| 196 |
+
The following hyperparameters were used during fine-tuning:
|
| 197 |
+
- **Learning Rate:** 5e-05
|
| 198 |
+
- **Train Batch Size:** 8
|
| 199 |
+
- **Eval Batch Size:** 8
|
| 200 |
+
- **Optimizer:** AdamW, fused PyTorch implementation (`adamw_torch_fused`)
|
| 201 |
+
- **Number of Epochs:** 3.0
|
| 202 |
+
- **Mixed Precision:** BF16
|
| 203 |
+
|
| 204 |
+
<details>
|
| 205 |
+
<summary><b>Show Advanced Training Configuration</b></summary>
|
| 206 |
+
|
| 207 |
+
#### Optimization & Regularization
|
| 208 |
+
- **Gradient Accumulation Steps:** 1
|
| 209 |
+
- **Learning Rate Scheduler:** linear decay (`linear`)
|
| 210 |
+
- **Warmup Steps:** 0
|
| 211 |
+
- **Warmup Ratio:** None
|
| 212 |
+
- **Weight Decay:** 0.0
|
| 213 |
+
- **Max Gradient Norm:** 1.0
|
| 214 |
+
|
| 215 |
+
#### Hardware & Reproducibility
|
| 216 |
+
- **Number of GPUs:** 1
|
| 217 |
+
- **Seed:** 42
|
| 218 |
+
|
| 219 |
+
</details>
|
| 220 |
+
|
| 221 |
+
## Training Results & Evaluation
|
| 222 |
+
|
| 223 |
+
Fine-tuning produced the following final results. Validation metrics were computed on the evaluation set (20,002 samples); train loss and total FLOPs refer to the training run:
|
| 224 |
+
|
| 225 |
+
| Metric | Value |
|
| 226 |
+
|---|---|
|
| 227 |
+
| **Train Loss** | 0.6686 |
|
| 228 |
+
| **Validation Loss** | 0.9974 |
|
| 229 |
+
| **Validation F1 Score** | 0.7709 |
|
| 230 |
+
| **Total FLOPs** | 7.9086e+15 |
|
| 231 |
+
|
| 232 |
+
### Speed Performance
|
| 233 |
+
- **Training Runtime:** 1639.5865 seconds
|
| 234 |
+
- **Train Samples per Second:** 292.775
|
| 235 |
+
- **Evaluation Runtime:** 10.8576 seconds
|
| 236 |
+
- **Eval Samples per Second:** 1842.216
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
<details>
|
| 240 |
+
<summary><b>Show Detailed Training Logs</b></summary>
|
| 241 |
+
|
| 242 |
+
### Training Logs History
|
| 243 |
+
|
| 244 |
+
| Step | Epoch | Learning Rate | Training Loss | Validation Loss | Validation F1 |
|
| 245 |
+
|---|---|---|---|---|---|
|
| 246 |
+
| 500 | 0.025 | 4.9584e-05 | 2.602 | N/A | N/A |
|
| 247 |
+
| 1000 | 0.05 | 4.9168e-05 | 1.8965 | N/A | N/A |
|
| 248 |
+
| 1500 | 0.075 | 4.8751e-05 | 1.604 | N/A | N/A |
|
| 249 |
+
| 2000 | 0.1 | 4.8334e-05 | 1.3957 | N/A | N/A |
|
| 250 |
+
| 2500 | 0.125 | 4.7918e-05 | 1.322 | N/A | N/A |
|
| 251 |
+
| 3000 | 0.15 | 4.7501e-05 | 1.2218 | N/A | N/A |
|
| 252 |
+
| 3500 | 0.175 | 4.7084e-05 | 1.195 | N/A | N/A |
|
| 253 |
+
| 4000 | 0.2 | 4.6668e-05 | 1.1313 | N/A | N/A |
|
| 254 |
+
| 4500 | 0.225 | 4.6251e-05 | 1.0902 | N/A | N/A |
|
| 255 |
+
| 5000 | 0.25 | 4.5835e-05 | 1.0637 | N/A | N/A |
|
| 256 |
+
| 5500 | 0.275 | 4.5418e-05 | 1.0626 | N/A | N/A |
|
| 257 |
+
| 6000 | 0.3 | 4.5001e-05 | 1.0054 | N/A | N/A |
|
| 258 |
+
| 6500 | 0.325 | 4.4585e-05 | 1.0253 | N/A | N/A |
|
| 259 |
+
| 7000 | 0.35 | 4.4168e-05 | 1.0127 | N/A | N/A |
|
| 260 |
+
| 7500 | 0.375 | 4.3751e-05 | 0.9714 | N/A | N/A |
|
| 261 |
+
| 8000 | 0.4 | 4.3335e-05 | 0.9589 | N/A | N/A |
|
| 262 |
+
| 8500 | 0.425 | 4.2918e-05 | 0.9808 | N/A | N/A |
|
| 263 |
+
| 9000 | 0.45 | 4.2502e-05 | 0.9392 | N/A | N/A |
|
| 264 |
+
| 9500 | 0.475 | 4.2085e-05 | 0.9304 | N/A | N/A |
|
| 265 |
+
| 10000 | 0.5 | 4.1668e-05 | 0.9369 | N/A | N/A |
|
| 266 |
+
| 10500 | 0.525 | 4.1252e-05 | 0.9181 | N/A | N/A |
|
| 267 |
+
| 11000 | 0.55 | 4.0835e-05 | 0.8996 | N/A | N/A |
|
| 268 |
+
| 11500 | 0.575 | 4.0418e-05 | 0.9111 | N/A | N/A |
|
| 269 |
+
| 12000 | 0.6 | 4.0002e-05 | 0.9033 | N/A | N/A |
|
| 270 |
+
| 12500 | 0.625 | 3.9585e-05 | 0.917 | N/A | N/A |
|
| 271 |
+
| 13000 | 0.65 | 3.9169e-05 | 0.8872 | N/A | N/A |
|
| 272 |
+
| 13500 | 0.675 | 3.8752e-05 | 0.8604 | N/A | N/A |
|
| 273 |
+
| 14000 | 0.7 | 3.8335e-05 | 0.8628 | N/A | N/A |
|
| 274 |
+
| 14500 | 0.725 | 3.7919e-05 | 0.8929 | N/A | N/A |
|
| 275 |
+
| 15000 | 0.75 | 3.7502e-05 | 0.8585 | N/A | N/A |
|
| 276 |
+
| 15500 | 0.775 | 3.7085e-05 | 0.9014 | N/A | N/A |
|
| 277 |
+
| 16000 | 0.8 | 3.6669e-05 | 0.8581 | N/A | N/A |
|
| 278 |
+
| 16500 | 0.825 | 3.6252e-05 | 0.8622 | N/A | N/A |
|
| 279 |
+
| 17000 | 0.85 | 3.5836e-05 | 0.873 | N/A | N/A |
|
| 280 |
+
| 17500 | 0.875 | 3.5419e-05 | 0.8446 | N/A | N/A |
|
| 281 |
+
| 18000 | 0.9 | 3.5002e-05 | 0.819 | N/A | N/A |
|
| 282 |
+
| 18500 | 0.925 | 3.4586e-05 | 0.8458 | N/A | N/A |
|
| 283 |
+
| 19000 | 0.95 | 3.4169e-05 | 0.8458 | N/A | N/A |
|
| 284 |
+
| 19500 | 0.975 | 3.3752e-05 | 0.8497 | N/A | N/A |
|
| 285 |
+
| 20000 | 1.0 | 3.3336e-05 | 0.7989 | N/A | N/A |
|
| 286 |
+
| 20002 | 1.0 | N/A | N/A | 0.8514 | 0.7452 |
|
| 287 |
+
| 20500 | 1.025 | 3.2919e-05 | 0.6034 | N/A | N/A |
|
| 288 |
+
| 21000 | 1.05 | 3.2503e-05 | 0.6148 | N/A | N/A |
|
| 289 |
+
| 21500 | 1.075 | 3.2086e-05 | 0.614 | N/A | N/A |
|
| 290 |
+
| 22000 | 1.1 | 3.1669e-05 | 0.5895 | N/A | N/A |
|
| 291 |
+
| 22500 | 1.125 | 3.1253e-05 | 0.6483 | N/A | N/A |
|
| 292 |
+
| 23000 | 1.15 | 3.0836e-05 | 0.6331 | N/A | N/A |
|
| 293 |
+
| 23500 | 1.175 | 3.0419e-05 | 0.5885 | N/A | N/A |
|
| 294 |
+
| 24000 | 1.2 | 3.0003e-05 | 0.6082 | N/A | N/A |
|
| 295 |
+
| 24500 | 1.225 | 2.9586e-05 | 0.6312 | N/A | N/A |
|
| 296 |
+
| 25000 | 1.25 | 2.9170e-05 | 0.6033 | N/A | N/A |
|
| 297 |
+
| 25500 | 1.275 | 2.8753e-05 | 0.6006 | N/A | N/A |
|
| 298 |
+
| 26000 | 1.3 | 2.8336e-05 | 0.6283 | N/A | N/A |
|
| 299 |
+
| 26500 | 1.325 | 2.7920e-05 | 0.6319 | N/A | N/A |
|
| 300 |
+
| 27000 | 1.35 | 2.7503e-05 | 0.5913 | N/A | N/A |
|
| 301 |
+
| 27500 | 1.375 | 2.7086e-05 | 0.6037 | N/A | N/A |
|
| 302 |
+
| 28000 | 1.4 | 2.6670e-05 | 0.6025 | N/A | N/A |
|
| 303 |
+
| 28500 | 1.425 | 2.6253e-05 | 0.6067 | N/A | N/A |
|
| 304 |
+
| 29000 | 1.45 | 2.5837e-05 | 0.6075 | N/A | N/A |
|
| 305 |
+
| 29500 | 1.475 | 2.5420e-05 | 0.6035 | N/A | N/A |
|
| 306 |
+
| 30000 | 1.5 | 2.5003e-05 | 0.5826 | N/A | N/A |
|
| 307 |
+
| 30500 | 1.525 | 2.4587e-05 | 0.5905 | N/A | N/A |
|
| 308 |
+
| 31000 | 1.55 | 2.4170e-05 | 0.563 | N/A | N/A |
|
| 309 |
+
| 31500 | 1.575 | 2.3753e-05 | 0.5795 | N/A | N/A |
|
| 310 |
+
| 32000 | 1.6 | 2.3337e-05 | 0.603 | N/A | N/A |
|
| 311 |
+
| 32500 | 1.625 | 2.2920e-05 | 0.5805 | N/A | N/A |
|
| 312 |
+
| 33000 | 1.65 | 2.2504e-05 | 0.6108 | N/A | N/A |
|
| 313 |
+
| 33500 | 1.675 | 2.2087e-05 | 0.6077 | N/A | N/A |
|
| 314 |
+
| 34000 | 1.7 | 2.1670e-05 | 0.5751 | N/A | N/A |
|
| 315 |
+
| 34500 | 1.725 | 2.1254e-05 | 0.5833 | N/A | N/A |
|
| 316 |
+
| 35000 | 1.75 | 2.0837e-05 | 0.5895 | N/A | N/A |
|
| 317 |
+
| 35500 | 1.775 | 2.0420e-05 | 0.5541 | N/A | N/A |
|
| 318 |
+
| 36000 | 1.8 | 2.0004e-05 | 0.5423 | N/A | N/A |
|
| 319 |
+
| 36500 | 1.825 | 1.9587e-05 | 0.5566 | N/A | N/A |
|
| 320 |
+
| 37000 | 1.85 | 1.9171e-05 | 0.5493 | N/A | N/A |
|
| 321 |
+
| 37500 | 1.875 | 1.8754e-05 | 0.5602 | N/A | N/A |
|
| 322 |
+
| 38000 | 1.9 | 1.8337e-05 | 0.5878 | N/A | N/A |
|
| 323 |
+
| 38500 | 1.925 | 1.7921e-05 | 0.5681 | N/A | N/A |
|
| 324 |
+
| 39000 | 1.95 | 1.7504e-05 | 0.5464 | N/A | N/A |
|
| 325 |
+
| 39500 | 1.975 | 1.7087e-05 | 0.5917 | N/A | N/A |
|
| 326 |
+
| 40000 | 2.0 | 1.6671e-05 | 0.5443 | N/A | N/A |
|
| 327 |
+
| 40004 | 2.0 | N/A | N/A | 0.8536 | 0.7652 |
|
| 328 |
+
| 40500 | 2.025 | 1.6254e-05 | 0.3501 | N/A | N/A |
|
| 329 |
+
| 41000 | 2.05 | 1.5838e-05 | 0.3785 | N/A | N/A |
|
| 330 |
+
| 41500 | 2.075 | 1.5421e-05 | 0.4034 | N/A | N/A |
|
| 331 |
+
| 42000 | 2.1 | 1.5004e-05 | 0.385 | N/A | N/A |
|
| 332 |
+
| 42500 | 2.125 | 1.4588e-05 | 0.3758 | N/A | N/A |
|
| 333 |
+
| 43000 | 2.15 | 1.4171e-05 | 0.3713 | N/A | N/A |
|
| 334 |
+
| 43500 | 2.175 | 1.3754e-05 | 0.413 | N/A | N/A |
|
| 335 |
+
| 44000 | 2.2 | 1.3338e-05 | 0.3787 | N/A | N/A |
|
| 336 |
+
| 44500 | 2.225 | 1.2921e-05 | 0.3805 | N/A | N/A |
|
| 337 |
+
| 45000 | 2.25 | 1.2505e-05 | 0.3757 | N/A | N/A |
|
| 338 |
+
| 45500 | 2.275 | 1.2088e-05 | 0.3887 | N/A | N/A |
|
| 339 |
+
| 46000 | 2.3 | 1.1671e-05 | 0.3789 | N/A | N/A |
|
| 340 |
+
| 46500 | 2.325 | 1.1255e-05 | 0.3742 | N/A | N/A |
|
| 341 |
+
| 47000 | 2.35 | 1.0838e-05 | 0.3805 | N/A | N/A |
|
| 342 |
+
| 47500 | 2.375 | 1.0421e-05 | 0.3936 | N/A | N/A |
|
| 343 |
+
| 48000 | 2.4 | 1.0005e-05 | 0.38 | N/A | N/A |
|
| 344 |
+
| 48500 | 2.425 | 9.5882e-06 | 0.3941 | N/A | N/A |
|
| 345 |
+
| 49000 | 2.45 | 9.1716e-06 | 0.4054 | N/A | N/A |
|
| 346 |
+
| 49500 | 2.475 | 8.7550e-06 | 0.3659 | N/A | N/A |
|
| 347 |
+
| 50000 | 2.5 | 8.3383e-06 | 0.3917 | N/A | N/A |
|
| 348 |
+
| 50500 | 2.525 | 7.9217e-06 | 0.3876 | N/A | N/A |
|
| 349 |
+
| 51000 | 2.55 | 7.5051e-06 | 0.3628 | N/A | N/A |
|
| 350 |
+
| 51500 | 2.575 | 7.0885e-06 | 0.3918 | N/A | N/A |
|
| 351 |
+
| 52000 | 2.6 | 6.6718e-06 | 0.359 | N/A | N/A |
|
| 352 |
+
| 52500 | 2.625 | 6.2552e-06 | 0.3634 | N/A | N/A |
|
| 353 |
+
| 53000 | 2.65 | 5.8386e-06 | 0.3737 | N/A | N/A |
|
| 354 |
+
| 53500 | 2.675 | 5.4220e-06 | 0.4022 | N/A | N/A |
|
| 355 |
+
| 54000 | 2.7 | 5.0053e-06 | 0.3562 | N/A | N/A |
|
| 356 |
+
| 54500 | 2.725 | 4.5887e-06 | 0.349 | N/A | N/A |
|
| 357 |
+
| 55000 | 2.75 | 4.1721e-06 | 0.3573 | N/A | N/A |
|
| 358 |
+
| 55500 | 2.775 | 3.7555e-06 | 0.335 | N/A | N/A |
|
| 359 |
+
| 56000 | 2.8 | 3.3388e-06 | 0.3679 | N/A | N/A |
|
| 360 |
+
| 56500 | 2.825 | 2.9222e-06 | 0.3266 | N/A | N/A |
|
| 361 |
+
| 57000 | 2.85 | 2.5056e-06 | 0.3453 | N/A | N/A |
|
| 362 |
+
| 57500 | 2.875 | 2.0890e-06 | 0.3682 | N/A | N/A |
|
| 363 |
+
| 58000 | 2.9 | 1.6723e-06 | 0.3417 | N/A | N/A |
|
| 364 |
+
| 58500 | 2.925 | 1.2557e-06 | 0.3192 | N/A | N/A |
|
| 365 |
+
| 59000 | 2.95 | 8.3908e-07 | 0.3375 | N/A | N/A |
|
| 366 |
+
| 59500 | 2.975 | 4.2246e-07 | 0.3669 | N/A | N/A |
|
| 367 |
+
| 60000 | 3.0 | 5.8328e-09 | 0.332 | N/A | N/A |
|
| 368 |
+
| 60006 | 3.0 | N/A | N/A | 0.9974 | 0.7709 |
|
| 369 |
+
|
| 370 |
+
</details>
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
## Framework Versions
|
| 374 |
+
|
| 375 |
+
- **Transformers:** 5.0.0.dev0
|
| 376 |
+
- **PyTorch:** 2.9.1+cu128
|
all_results.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"epoch": 3.0,
|
| 3 |
+
"eval_f1": 0.7708992224677207,
|
| 4 |
+
"eval_loss": 0.9973717331886292,
|
| 5 |
+
"eval_runtime": 10.8576,
|
| 6 |
+
"eval_samples": 20002,
|
| 7 |
+
"eval_samples_per_second": 1842.216,
|
| 8 |
+
"eval_steps_per_second": 230.346,
|
| 9 |
+
"total_flos": 7908628105405440.0,
|
| 10 |
+
"train_loss": 0.6686365460569141,
|
| 11 |
+
"train_runtime": 1639.5865,
|
| 12 |
+
"train_samples": 160010,
|
| 13 |
+
"train_samples_per_second": 292.775,
|
| 14 |
+
"train_steps_per_second": 36.598
|
| 15 |
+
}
|
config.json
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"BertForSequenceClassification"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"bos_token_id": 0,
|
| 7 |
+
"classifier_dropout": null,
|
| 8 |
+
"dtype": "float32",
|
| 9 |
+
"eos_token_id": 2,
|
| 10 |
+
"finetuning_task": "text-classification",
|
| 11 |
+
"hidden_act": "gelu",
|
| 12 |
+
"hidden_dropout_prob": 0.1,
|
| 13 |
+
"hidden_size": 384,
|
| 14 |
+
"id2label": {
|
| 15 |
+
"0": "Adult",
|
| 16 |
+
"1": "Arts_and_Entertainment",
|
| 17 |
+
"2": "Autos_and_Vehicles",
|
| 18 |
+
"3": "Beauty_and_Fitness",
|
| 19 |
+
"4": "Books_and_Literature",
|
| 20 |
+
"5": "Business_and_Industrial",
|
| 21 |
+
"6": "Computers_and_Electronics",
|
| 22 |
+
"7": "Finance",
|
| 23 |
+
"8": "Food_and_Drink",
|
| 24 |
+
"9": "Games",
|
| 25 |
+
"10": "Health",
|
| 26 |
+
"11": "Hobbies_and_Leisure",
|
| 27 |
+
"12": "Home_and_Garden",
|
| 28 |
+
"13": "Internet_and_Telecom",
|
| 29 |
+
"14": "Jobs_and_Education",
|
| 30 |
+
"15": "Law_and_Government",
|
| 31 |
+
"16": "News",
|
| 32 |
+
"17": "Online_Communities",
|
| 33 |
+
"18": "People_and_Society",
|
| 34 |
+
"19": "Pets_and_Animals",
|
| 35 |
+
"20": "Real_Estate",
|
| 36 |
+
"21": "Science",
|
| 37 |
+
"22": "Sensitive_Subjects",
|
| 38 |
+
"23": "Shopping",
|
| 39 |
+
"24": "Sports",
|
| 40 |
+
"25": "Travel_and_Transportation"
|
| 41 |
+
},
|
| 42 |
+
"initializer_range": 0.02,
|
| 43 |
+
"intermediate_size": 1536,
|
| 44 |
+
"label2id": {
|
| 45 |
+
"Adult": 0,
|
| 46 |
+
"Arts_and_Entertainment": 1,
|
| 47 |
+
"Autos_and_Vehicles": 2,
|
| 48 |
+
"Beauty_and_Fitness": 3,
|
| 49 |
+
"Books_and_Literature": 4,
|
| 50 |
+
"Business_and_Industrial": 5,
|
| 51 |
+
"Computers_and_Electronics": 6,
|
| 52 |
+
"Finance": 7,
|
| 53 |
+
"Food_and_Drink": 8,
|
| 54 |
+
"Games": 9,
|
| 55 |
+
"Health": 10,
|
| 56 |
+
"Hobbies_and_Leisure": 11,
|
| 57 |
+
"Home_and_Garden": 12,
|
| 58 |
+
"Internet_and_Telecom": 13,
|
| 59 |
+
"Jobs_and_Education": 14,
|
| 60 |
+
"Law_and_Government": 15,
|
| 61 |
+
"News": 16,
|
| 62 |
+
"Online_Communities": 17,
|
| 63 |
+
"People_and_Society": 18,
|
| 64 |
+
"Pets_and_Animals": 19,
|
| 65 |
+
"Real_Estate": 20,
|
| 66 |
+
"Science": 21,
|
| 67 |
+
"Sensitive_Subjects": 22,
|
| 68 |
+
"Shopping": 23,
|
| 69 |
+
"Sports": 24,
|
| 70 |
+
"Travel_and_Transportation": 25
|
| 71 |
+
},
|
| 72 |
+
"layer_norm_eps": 1e-12,
|
| 73 |
+
"max_position_embeddings": 512,
|
| 74 |
+
"model_type": "bert",
|
| 75 |
+
"num_attention_heads": 12,
|
| 76 |
+
"num_hidden_layers": 12,
|
| 77 |
+
"pad_token_id": 1,
|
| 78 |
+
"position_embedding_type": "absolute",
|
| 79 |
+
"problem_type": "single_label_classification",
|
| 80 |
+
"tokenizer_class": "XLMRobertaTokenizer",
|
| 81 |
+
"transformers_version": "5.0.0.dev0",
|
| 82 |
+
"type_vocab_size": 2,
|
| 83 |
+
"use_cache": false,
|
| 84 |
+
"vocab_size": 250037
|
| 85 |
+
}
|
eval_results.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"epoch": 3.0,
|
| 3 |
+
"eval_f1": 0.7708992224677207,
|
| 4 |
+
"eval_loss": 0.9973717331886292,
|
| 5 |
+
"eval_runtime": 10.8576,
|
| 6 |
+
"eval_samples": 20002,
|
| 7 |
+
"eval_samples_per_second": 1842.216,
|
| 8 |
+
"eval_steps_per_second": 230.346
|
| 9 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d0e5590097a6bcb865acc663998303a83dc90224cbdcfd3446979dc4c9db2217
|
| 3 |
+
size 470678624
|
predict_results.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sentencepiece.bpe.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
|
| 3 |
+
size 5069051
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"cls_token": {
|
| 10 |
+
"content": "<s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"eos_token": {
|
| 17 |
+
"content": "</s>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
},
|
| 23 |
+
"mask_token": {
|
| 24 |
+
"content": "<mask>",
|
| 25 |
+
"lstrip": false,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
},
|
| 30 |
+
"pad_token": {
|
| 31 |
+
"content": "<pad>",
|
| 32 |
+
"lstrip": false,
|
| 33 |
+
"normalized": false,
|
| 34 |
+
"rstrip": false,
|
| 35 |
+
"single_word": false
|
| 36 |
+
},
|
| 37 |
+
"sep_token": {
|
| 38 |
+
"content": "</s>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false
|
| 43 |
+
},
|
| 44 |
+
"unk_token": {
|
| 45 |
+
"content": "<unk>",
|
| 46 |
+
"lstrip": false,
|
| 47 |
+
"normalized": false,
|
| 48 |
+
"rstrip": false,
|
| 49 |
+
"single_word": false
|
| 50 |
+
}
|
| 51 |
+
}
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:66e2c4647474659095b757711e8aef0583d58dbb50e3349958ebc460a9cf4977
|
| 3 |
+
size 17083065
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<s>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<pad>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "</s>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"3": {
|
| 28 |
+
"content": "<unk>",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"250001": {
|
| 36 |
+
"content": "<mask>",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"bos_token": "<s>",
|
| 45 |
+
"clean_up_tokenization_spaces": true,
|
| 46 |
+
"cls_token": "<s>",
|
| 47 |
+
"eos_token": "</s>",
|
| 48 |
+
"extra_special_tokens": {},
|
| 49 |
+
"mask_token": "<mask>",
|
| 50 |
+
"model_max_length": 512,
|
| 51 |
+
"pad_token": "<pad>",
|
| 52 |
+
"sep_token": "</s>",
|
| 53 |
+
"sp_model_kwargs": {},
|
| 54 |
+
"tokenizer_class": "XLMRobertaTokenizer",
|
| 55 |
+
"unk_token": "<unk>"
|
| 56 |
+
}
|
train_results.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"epoch": 3.0,
|
| 3 |
+
"total_flos": 7908628105405440.0,
|
| 4 |
+
"train_loss": 0.6686365460569141,
|
| 5 |
+
"train_runtime": 1639.5865,
|
| 6 |
+
"train_samples": 160010,
|
| 7 |
+
"train_samples_per_second": 292.775,
|
| 8 |
+
"train_steps_per_second": 36.598
|
| 9 |
+
}
|
trainer_state.json
ADDED
|
@@ -0,0 +1,910 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": 60006,
|
| 3 |
+
"best_metric": 0.7708992224677207,
|
| 4 |
+
"best_model_checkpoint": "./nvidia_domain_model_multilingual-e5-small/checkpoint-60006",
|
| 5 |
+
"epoch": 3.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 60006,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.024997500249975,
|
| 14 |
+
"grad_norm": 7.94265604019165,
|
| 15 |
+
"learning_rate": 4.9584208245842084e-05,
|
| 16 |
+
"loss": 2.602,
|
| 17 |
+
"step": 500
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.04999500049995,
|
| 21 |
+
"grad_norm": 5.469175815582275,
|
| 22 |
+
"learning_rate": 4.9167583241675834e-05,
|
| 23 |
+
"loss": 1.8965,
|
| 24 |
+
"step": 1000
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.074992500749925,
|
| 28 |
+
"grad_norm": 7.713351726531982,
|
| 29 |
+
"learning_rate": 4.875095823750958e-05,
|
| 30 |
+
"loss": 1.604,
|
| 31 |
+
"step": 1500
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.0999900009999,
|
| 35 |
+
"grad_norm": 11.510587692260742,
|
| 36 |
+
"learning_rate": 4.833433323334334e-05,
|
| 37 |
+
"loss": 1.3957,
|
| 38 |
+
"step": 2000
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.12498750124987501,
|
| 42 |
+
"grad_norm": 12.626413345336914,
|
| 43 |
+
"learning_rate": 4.791770822917708e-05,
|
| 44 |
+
"loss": 1.322,
|
| 45 |
+
"step": 2500
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.14998500149985,
|
| 49 |
+
"grad_norm": 7.150252342224121,
|
| 50 |
+
"learning_rate": 4.750108322501083e-05,
|
| 51 |
+
"loss": 1.2218,
|
| 52 |
+
"step": 3000
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.17498250174982502,
|
| 56 |
+
"grad_norm": 16.782085418701172,
|
| 57 |
+
"learning_rate": 4.708445822084459e-05,
|
| 58 |
+
"loss": 1.195,
|
| 59 |
+
"step": 3500
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.1999800019998,
|
| 63 |
+
"grad_norm": 13.529509544372559,
|
| 64 |
+
"learning_rate": 4.666783321667834e-05,
|
| 65 |
+
"loss": 1.1313,
|
| 66 |
+
"step": 4000
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.22497750224977503,
|
| 70 |
+
"grad_norm": 19.662353515625,
|
| 71 |
+
"learning_rate": 4.625120821251208e-05,
|
| 72 |
+
"loss": 1.0902,
|
| 73 |
+
"step": 4500
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.24997500249975002,
|
| 77 |
+
"grad_norm": 11.194819450378418,
|
| 78 |
+
"learning_rate": 4.5834583208345836e-05,
|
| 79 |
+
"loss": 1.0637,
|
| 80 |
+
"step": 5000
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 0.274972502749725,
|
| 84 |
+
"grad_norm": 13.16511058807373,
|
| 85 |
+
"learning_rate": 4.5417958204179585e-05,
|
| 86 |
+
"loss": 1.0626,
|
| 87 |
+
"step": 5500
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 0.2999700029997,
|
| 91 |
+
"grad_norm": 9.290426254272461,
|
| 92 |
+
"learning_rate": 4.5001333200013335e-05,
|
| 93 |
+
"loss": 1.0054,
|
| 94 |
+
"step": 6000
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 0.32496750324967505,
|
| 98 |
+
"grad_norm": 17.698017120361328,
|
| 99 |
+
"learning_rate": 4.4584708195847084e-05,
|
| 100 |
+
"loss": 1.0253,
|
| 101 |
+
"step": 6500
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 0.34996500349965004,
|
| 105 |
+
"grad_norm": 15.605792999267578,
|
| 106 |
+
"learning_rate": 4.4168083191680834e-05,
|
| 107 |
+
"loss": 1.0127,
|
| 108 |
+
"step": 7000
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 0.37496250374962503,
|
| 112 |
+
"grad_norm": 33.41305923461914,
|
| 113 |
+
"learning_rate": 4.375145818751458e-05,
|
| 114 |
+
"loss": 0.9714,
|
| 115 |
+
"step": 7500
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 0.3999600039996,
|
| 119 |
+
"grad_norm": 18.213973999023438,
|
| 120 |
+
"learning_rate": 4.333483318334833e-05,
|
| 121 |
+
"loss": 0.9589,
|
| 122 |
+
"step": 8000
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 0.42495750424957507,
|
| 126 |
+
"grad_norm": 11.406991958618164,
|
| 127 |
+
"learning_rate": 4.291820817918208e-05,
|
| 128 |
+
"loss": 0.9808,
|
| 129 |
+
"step": 8500
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 0.44995500449955006,
|
| 133 |
+
"grad_norm": 15.420747756958008,
|
| 134 |
+
"learning_rate": 4.250158317501584e-05,
|
| 135 |
+
"loss": 0.9392,
|
| 136 |
+
"step": 9000
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 0.47495250474952505,
|
| 140 |
+
"grad_norm": 19.129817962646484,
|
| 141 |
+
"learning_rate": 4.208495817084958e-05,
|
| 142 |
+
"loss": 0.9304,
|
| 143 |
+
"step": 9500
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 0.49995000499950004,
|
| 147 |
+
"grad_norm": 9.371217727661133,
|
| 148 |
+
"learning_rate": 4.166833316668333e-05,
|
| 149 |
+
"loss": 0.9369,
|
| 150 |
+
"step": 10000
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 0.5249475052494751,
|
| 154 |
+
"grad_norm": 11.86233901977539,
|
| 155 |
+
"learning_rate": 4.1251708162517086e-05,
|
| 156 |
+
"loss": 0.9181,
|
| 157 |
+
"step": 10500
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 0.54994500549945,
|
| 161 |
+
"grad_norm": 16.078561782836914,
|
| 162 |
+
"learning_rate": 4.0835083158350836e-05,
|
| 163 |
+
"loss": 0.8996,
|
| 164 |
+
"step": 11000
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 0.5749425057494251,
|
| 168 |
+
"grad_norm": 8.514225006103516,
|
| 169 |
+
"learning_rate": 4.0418458154184585e-05,
|
| 170 |
+
"loss": 0.9111,
|
| 171 |
+
"step": 11500
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 0.5999400059994,
|
| 175 |
+
"grad_norm": 7.778424263000488,
|
| 176 |
+
"learning_rate": 4.000183315001833e-05,
|
| 177 |
+
"loss": 0.9033,
|
| 178 |
+
"step": 12000
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 0.624937506249375,
|
| 182 |
+
"grad_norm": 10.383719444274902,
|
| 183 |
+
"learning_rate": 3.9585208145852084e-05,
|
| 184 |
+
"loss": 0.917,
|
| 185 |
+
"step": 12500
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 0.6499350064993501,
|
| 189 |
+
"grad_norm": 12.048624992370605,
|
| 190 |
+
"learning_rate": 3.9168583141685834e-05,
|
| 191 |
+
"loss": 0.8872,
|
| 192 |
+
"step": 13000
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 0.674932506749325,
|
| 196 |
+
"grad_norm": 14.255531311035156,
|
| 197 |
+
"learning_rate": 3.875195813751958e-05,
|
| 198 |
+
"loss": 0.8604,
|
| 199 |
+
"step": 13500
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 0.6999300069993001,
|
| 203 |
+
"grad_norm": 15.18703556060791,
|
| 204 |
+
"learning_rate": 3.833533313335333e-05,
|
| 205 |
+
"loss": 0.8628,
|
| 206 |
+
"step": 14000
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 0.7249275072492751,
|
| 210 |
+
"grad_norm": 12.154521942138672,
|
| 211 |
+
"learning_rate": 3.791870812918708e-05,
|
| 212 |
+
"loss": 0.8929,
|
| 213 |
+
"step": 14500
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 0.7499250074992501,
|
| 217 |
+
"grad_norm": 14.692411422729492,
|
| 218 |
+
"learning_rate": 3.750208312502083e-05,
|
| 219 |
+
"loss": 0.8585,
|
| 220 |
+
"step": 15000
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"epoch": 0.7749225077492251,
|
| 224 |
+
"grad_norm": 8.900308609008789,
|
| 225 |
+
"learning_rate": 3.708545812085458e-05,
|
| 226 |
+
"loss": 0.9014,
|
| 227 |
+
"step": 15500
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"epoch": 0.7999200079992,
|
| 231 |
+
"grad_norm": 18.15697479248047,
|
| 232 |
+
"learning_rate": 3.666883311668834e-05,
|
| 233 |
+
"loss": 0.8581,
|
| 234 |
+
"step": 16000
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"epoch": 0.8249175082491751,
|
| 238 |
+
"grad_norm": 14.366026878356934,
|
| 239 |
+
"learning_rate": 3.6252208112522086e-05,
|
| 240 |
+
"loss": 0.8622,
|
| 241 |
+
"step": 16500
|
| 242 |
+
},
|
| 243 |
+
{
|
| 244 |
+
"epoch": 0.8499150084991501,
|
| 245 |
+
"grad_norm": 14.673120498657227,
|
| 246 |
+
"learning_rate": 3.583558310835583e-05,
|
| 247 |
+
"loss": 0.873,
|
| 248 |
+
"step": 17000
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"epoch": 0.8749125087491251,
|
| 252 |
+
"grad_norm": 9.87514877319336,
|
| 253 |
+
"learning_rate": 3.541895810418958e-05,
|
| 254 |
+
"loss": 0.8446,
|
| 255 |
+
"step": 17500
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"epoch": 0.8999100089991001,
|
| 259 |
+
"grad_norm": 20.0493221282959,
|
| 260 |
+
"learning_rate": 3.5002333100023335e-05,
|
| 261 |
+
"loss": 0.819,
|
| 262 |
+
"step": 18000
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"epoch": 0.924907509249075,
|
| 266 |
+
"grad_norm": 18.50018882751465,
|
| 267 |
+
"learning_rate": 3.4585708095857084e-05,
|
| 268 |
+
"loss": 0.8458,
|
| 269 |
+
"step": 18500
|
| 270 |
+
},
|
| 271 |
+
{
|
| 272 |
+
"epoch": 0.9499050094990501,
|
| 273 |
+
"grad_norm": 16.332889556884766,
|
| 274 |
+
"learning_rate": 3.4169083091690833e-05,
|
| 275 |
+
"loss": 0.8458,
|
| 276 |
+
"step": 19000
|
| 277 |
+
},
|
| 278 |
+
{
|
| 279 |
+
"epoch": 0.9749025097490251,
|
| 280 |
+
"grad_norm": 11.074434280395508,
|
| 281 |
+
"learning_rate": 3.375245808752458e-05,
|
| 282 |
+
"loss": 0.8497,
|
| 283 |
+
"step": 19500
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"epoch": 0.9999000099990001,
|
| 287 |
+
"grad_norm": 8.59486198425293,
|
| 288 |
+
"learning_rate": 3.333583308335833e-05,
|
| 289 |
+
"loss": 0.7989,
|
| 290 |
+
"step": 20000
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"epoch": 1.0,
|
| 294 |
+
"eval_f1": 0.7451762918283228,
|
| 295 |
+
"eval_loss": 0.8514304757118225,
|
| 296 |
+
"eval_runtime": 10.7214,
|
| 297 |
+
"eval_samples_per_second": 1865.614,
|
| 298 |
+
"eval_steps_per_second": 233.272,
|
| 299 |
+
"step": 20002
|
| 300 |
+
},
|
| 301 |
+
{
|
| 302 |
+
"epoch": 1.024897510248975,
|
| 303 |
+
"grad_norm": 11.283440589904785,
|
| 304 |
+
"learning_rate": 3.291920807919208e-05,
|
| 305 |
+
"loss": 0.6034,
|
| 306 |
+
"step": 20500
|
| 307 |
+
},
|
| 308 |
+
{
|
| 309 |
+
"epoch": 1.0498950104989502,
|
| 310 |
+
"grad_norm": 14.751864433288574,
|
| 311 |
+
"learning_rate": 3.250258307502583e-05,
|
| 312 |
+
"loss": 0.6148,
|
| 313 |
+
"step": 21000
|
| 314 |
+
},
|
| 315 |
+
{
|
| 316 |
+
"epoch": 1.0748925107489251,
|
| 317 |
+
"grad_norm": 20.8693790435791,
|
| 318 |
+
"learning_rate": 3.208595807085959e-05,
|
| 319 |
+
"loss": 0.614,
|
| 320 |
+
"step": 21500
|
| 321 |
+
},
|
| 322 |
+
{
|
| 323 |
+
"epoch": 1.0998900109989,
|
| 324 |
+
"grad_norm": 15.057612419128418,
|
| 325 |
+
"learning_rate": 3.166933306669333e-05,
|
| 326 |
+
"loss": 0.5895,
|
| 327 |
+
"step": 22000
|
| 328 |
+
},
|
| 329 |
+
{
|
| 330 |
+
"epoch": 1.1248875112488752,
|
| 331 |
+
"grad_norm": 10.95419979095459,
|
| 332 |
+
"learning_rate": 3.125270806252708e-05,
|
| 333 |
+
"loss": 0.6483,
|
| 334 |
+
"step": 22500
|
| 335 |
+
},
|
| 336 |
+
{
|
| 337 |
+
"epoch": 1.1498850114988501,
|
| 338 |
+
"grad_norm": 17.469892501831055,
|
| 339 |
+
"learning_rate": 3.083608305836083e-05,
|
| 340 |
+
"loss": 0.6331,
|
| 341 |
+
"step": 23000
|
| 342 |
+
},
|
| 343 |
+
{
|
| 344 |
+
"epoch": 1.174882511748825,
|
| 345 |
+
"grad_norm": 20.316282272338867,
|
| 346 |
+
"learning_rate": 3.041945805419458e-05,
|
| 347 |
+
"loss": 0.5885,
|
| 348 |
+
"step": 23500
|
| 349 |
+
},
|
| 350 |
+
{
|
| 351 |
+
"epoch": 1.1998800119988,
|
| 352 |
+
"grad_norm": 5.562185764312744,
|
| 353 |
+
"learning_rate": 3.0002833050028334e-05,
|
| 354 |
+
"loss": 0.6082,
|
| 355 |
+
"step": 24000
|
| 356 |
+
},
|
| 357 |
+
{
|
| 358 |
+
"epoch": 1.2248775122487752,
|
| 359 |
+
"grad_norm": 17.523334503173828,
|
| 360 |
+
"learning_rate": 2.958620804586208e-05,
|
| 361 |
+
"loss": 0.6312,
|
| 362 |
+
"step": 24500
|
| 363 |
+
},
|
| 364 |
+
{
|
| 365 |
+
"epoch": 1.24987501249875,
|
| 366 |
+
"grad_norm": 20.40757179260254,
|
| 367 |
+
"learning_rate": 2.916958304169583e-05,
|
| 368 |
+
"loss": 0.6033,
|
| 369 |
+
"step": 25000
|
| 370 |
+
},
|
| 371 |
+
{
|
| 372 |
+
"epoch": 1.274872512748725,
|
| 373 |
+
"grad_norm": 18.183963775634766,
|
| 374 |
+
"learning_rate": 2.8752958037529583e-05,
|
| 375 |
+
"loss": 0.6006,
|
| 376 |
+
"step": 25500
|
| 377 |
+
},
|
| 378 |
+
{
|
| 379 |
+
"epoch": 1.2998700129987002,
|
| 380 |
+
"grad_norm": 4.399472236633301,
|
| 381 |
+
"learning_rate": 2.8336333033363332e-05,
|
| 382 |
+
"loss": 0.6283,
|
| 383 |
+
"step": 26000
|
| 384 |
+
},
|
| 385 |
+
{
|
| 386 |
+
"epoch": 1.3248675132486751,
|
| 387 |
+
"grad_norm": 17.38117027282715,
|
| 388 |
+
"learning_rate": 2.7919708029197085e-05,
|
| 389 |
+
"loss": 0.6319,
|
| 390 |
+
"step": 26500
|
| 391 |
+
},
|
| 392 |
+
{
|
| 393 |
+
"epoch": 1.34986501349865,
|
| 394 |
+
"grad_norm": 9.839600563049316,
|
| 395 |
+
"learning_rate": 2.7503083025030828e-05,
|
| 396 |
+
"loss": 0.5913,
|
| 397 |
+
"step": 27000
|
| 398 |
+
},
|
| 399 |
+
{
|
| 400 |
+
"epoch": 1.3748625137486252,
|
| 401 |
+
"grad_norm": 3.2011570930480957,
|
| 402 |
+
"learning_rate": 2.708645802086458e-05,
|
| 403 |
+
"loss": 0.6037,
|
| 404 |
+
"step": 27500
|
| 405 |
+
},
|
| 406 |
+
{
|
| 407 |
+
"epoch": 1.3998600139986002,
|
| 408 |
+
"grad_norm": 9.335294723510742,
|
| 409 |
+
"learning_rate": 2.666983301669833e-05,
|
| 410 |
+
"loss": 0.6025,
|
| 411 |
+
"step": 28000
|
| 412 |
+
},
|
| 413 |
+
{
|
| 414 |
+
"epoch": 1.424857514248575,
|
| 415 |
+
"grad_norm": 26.70831298828125,
|
| 416 |
+
"learning_rate": 2.6253208012532083e-05,
|
| 417 |
+
"loss": 0.6067,
|
| 418 |
+
"step": 28500
|
| 419 |
+
},
|
| 420 |
+
{
|
| 421 |
+
"epoch": 1.4498550144985503,
|
| 422 |
+
"grad_norm": 16.662883758544922,
|
| 423 |
+
"learning_rate": 2.5836583008365832e-05,
|
| 424 |
+
"loss": 0.6075,
|
| 425 |
+
"step": 29000
|
| 426 |
+
},
|
| 427 |
+
{
|
| 428 |
+
"epoch": 1.4748525147485252,
|
| 429 |
+
"grad_norm": 18.168540954589844,
|
| 430 |
+
"learning_rate": 2.5419958004199578e-05,
|
| 431 |
+
"loss": 0.6035,
|
| 432 |
+
"step": 29500
|
| 433 |
+
},
|
| 434 |
+
{
|
| 435 |
+
"epoch": 1.4998500149985001,
|
| 436 |
+
"grad_norm": 49.09202575683594,
|
| 437 |
+
"learning_rate": 2.500333300003333e-05,
|
| 438 |
+
"loss": 0.5826,
|
| 439 |
+
"step": 30000
|
| 440 |
+
},
|
| 441 |
+
{
|
| 442 |
+
"epoch": 1.5248475152484753,
|
| 443 |
+
"grad_norm": 18.314056396484375,
|
| 444 |
+
"learning_rate": 2.458670799586708e-05,
|
| 445 |
+
"loss": 0.5905,
|
| 446 |
+
"step": 30500
|
| 447 |
+
},
|
| 448 |
+
{
|
| 449 |
+
"epoch": 1.54984501549845,
|
| 450 |
+
"grad_norm": 4.7171406745910645,
|
| 451 |
+
"learning_rate": 2.4170082991700833e-05,
|
| 452 |
+
"loss": 0.563,
|
| 453 |
+
"step": 31000
|
| 454 |
+
},
|
| 455 |
+
{
|
| 456 |
+
"epoch": 1.5748425157484252,
|
| 457 |
+
"grad_norm": 17.988279342651367,
|
| 458 |
+
"learning_rate": 2.375345798753458e-05,
|
| 459 |
+
"loss": 0.5795,
|
| 460 |
+
"step": 31500
|
| 461 |
+
},
|
| 462 |
+
{
|
| 463 |
+
"epoch": 1.5998400159984003,
|
| 464 |
+
"grad_norm": 15.996960639953613,
|
| 465 |
+
"learning_rate": 2.3336832983368332e-05,
|
| 466 |
+
"loss": 0.603,
|
| 467 |
+
"step": 32000
|
| 468 |
+
},
|
| 469 |
+
{
|
| 470 |
+
"epoch": 1.624837516248375,
|
| 471 |
+
"grad_norm": 15.832610130310059,
|
| 472 |
+
"learning_rate": 2.2920207979202078e-05,
|
| 473 |
+
"loss": 0.5805,
|
| 474 |
+
"step": 32500
|
| 475 |
+
},
|
| 476 |
+
{
|
| 477 |
+
"epoch": 1.6498350164983502,
|
| 478 |
+
"grad_norm": 33.191444396972656,
|
| 479 |
+
"learning_rate": 2.250358297503583e-05,
|
| 480 |
+
"loss": 0.6108,
|
| 481 |
+
"step": 33000
|
| 482 |
+
},
|
| 483 |
+
{
|
| 484 |
+
"epoch": 1.6748325167483251,
|
| 485 |
+
"grad_norm": 8.741061210632324,
|
| 486 |
+
"learning_rate": 2.208695797086958e-05,
|
| 487 |
+
"loss": 0.6077,
|
| 488 |
+
"step": 33500
|
| 489 |
+
},
|
| 490 |
+
{
|
| 491 |
+
"epoch": 1.6998300169983,
|
| 492 |
+
"grad_norm": 14.29039192199707,
|
| 493 |
+
"learning_rate": 2.167033296670333e-05,
|
| 494 |
+
"loss": 0.5751,
|
| 495 |
+
"step": 34000
|
| 496 |
+
},
|
| 497 |
+
{
|
| 498 |
+
"epoch": 1.7248275172482752,
|
| 499 |
+
"grad_norm": 21.69901466369629,
|
| 500 |
+
"learning_rate": 2.1253707962537083e-05,
|
| 501 |
+
"loss": 0.5833,
|
| 502 |
+
"step": 34500
|
| 503 |
+
},
|
| 504 |
+
{
|
| 505 |
+
"epoch": 1.7498250174982501,
|
| 506 |
+
"grad_norm": 32.595794677734375,
|
| 507 |
+
"learning_rate": 2.083708295837083e-05,
|
| 508 |
+
"loss": 0.5895,
|
| 509 |
+
"step": 35000
|
| 510 |
+
},
|
| 511 |
+
{
|
| 512 |
+
"epoch": 1.774822517748225,
|
| 513 |
+
"grad_norm": 42.687721252441406,
|
| 514 |
+
"learning_rate": 2.042045795420458e-05,
|
| 515 |
+
"loss": 0.5541,
|
| 516 |
+
"step": 35500
|
| 517 |
+
},
|
| 518 |
+
{
|
| 519 |
+
"epoch": 1.7998200179982002,
|
| 520 |
+
"grad_norm": 16.474918365478516,
|
| 521 |
+
"learning_rate": 2.000383295003833e-05,
|
| 522 |
+
"loss": 0.5423,
|
| 523 |
+
"step": 36000
|
| 524 |
+
},
|
| 525 |
+
{
|
| 526 |
+
"epoch": 1.8248175182481752,
|
| 527 |
+
"grad_norm": 13.296688079833984,
|
| 528 |
+
"learning_rate": 1.958720794587208e-05,
|
| 529 |
+
"loss": 0.5566,
|
| 530 |
+
"step": 36500
|
| 531 |
+
},
|
| 532 |
+
{
|
| 533 |
+
"epoch": 1.84981501849815,
|
| 534 |
+
"grad_norm": 18.645790100097656,
|
| 535 |
+
"learning_rate": 1.917058294170583e-05,
|
| 536 |
+
"loss": 0.5493,
|
| 537 |
+
"step": 37000
|
| 538 |
+
},
|
| 539 |
+
{
|
| 540 |
+
"epoch": 1.8748125187481253,
|
| 541 |
+
"grad_norm": 12.576258659362793,
|
| 542 |
+
"learning_rate": 1.875395793753958e-05,
|
| 543 |
+
"loss": 0.5602,
|
| 544 |
+
"step": 37500
|
| 545 |
+
},
|
| 546 |
+
{
|
| 547 |
+
"epoch": 1.8998100189981002,
|
| 548 |
+
"grad_norm": 21.95449447631836,
|
| 549 |
+
"learning_rate": 1.833733293337333e-05,
|
| 550 |
+
"loss": 0.5878,
|
| 551 |
+
"step": 38000
|
| 552 |
+
},
|
| 553 |
+
{
|
| 554 |
+
"epoch": 1.9248075192480751,
|
| 555 |
+
"grad_norm": 9.17590618133545,
|
| 556 |
+
"learning_rate": 1.792070792920708e-05,
|
| 557 |
+
"loss": 0.5681,
|
| 558 |
+
"step": 38500
|
| 559 |
+
},
|
| 560 |
+
{
|
| 561 |
+
"epoch": 1.9498050194980503,
|
| 562 |
+
"grad_norm": 12.517435073852539,
|
| 563 |
+
"learning_rate": 1.750408292504083e-05,
|
| 564 |
+
"loss": 0.5464,
|
| 565 |
+
"step": 39000
|
| 566 |
+
},
|
| 567 |
+
{
|
| 568 |
+
"epoch": 1.9748025197480252,
|
| 569 |
+
"grad_norm": 15.346318244934082,
|
| 570 |
+
"learning_rate": 1.708745792087458e-05,
|
| 571 |
+
"loss": 0.5917,
|
| 572 |
+
"step": 39500
|
| 573 |
+
},
|
| 574 |
+
{
|
| 575 |
+
"epoch": 1.9998000199980002,
|
| 576 |
+
"grad_norm": 29.321331024169922,
|
| 577 |
+
"learning_rate": 1.667083291670833e-05,
|
| 578 |
+
"loss": 0.5443,
|
| 579 |
+
"step": 40000
|
| 580 |
+
},
|
| 581 |
+
{
|
| 582 |
+
"epoch": 2.0,
|
| 583 |
+
"eval_f1": 0.7651638193675152,
|
| 584 |
+
"eval_loss": 0.8535689115524292,
|
| 585 |
+
"eval_runtime": 10.748,
|
| 586 |
+
"eval_samples_per_second": 1860.991,
|
| 587 |
+
"eval_steps_per_second": 232.694,
|
| 588 |
+
"step": 40004
|
| 589 |
+
},
|
| 590 |
+
{
|
| 591 |
+
"epoch": 2.0247975202479753,
|
| 592 |
+
"grad_norm": 24.93914794921875,
|
| 593 |
+
"learning_rate": 1.625420791254208e-05,
|
| 594 |
+
"loss": 0.3501,
|
| 595 |
+
"step": 40500
|
| 596 |
+
},
|
| 597 |
+
{
|
| 598 |
+
"epoch": 2.04979502049795,
|
| 599 |
+
"grad_norm": 50.30072784423828,
|
| 600 |
+
"learning_rate": 1.5837582908375832e-05,
|
| 601 |
+
"loss": 0.3785,
|
| 602 |
+
"step": 41000
|
| 603 |
+
},
|
| 604 |
+
{
|
| 605 |
+
"epoch": 2.074792520747925,
|
| 606 |
+
"grad_norm": 24.169206619262695,
|
| 607 |
+
"learning_rate": 1.5420957904209578e-05,
|
| 608 |
+
"loss": 0.4034,
|
| 609 |
+
"step": 41500
|
| 610 |
+
},
|
| 611 |
+
{
|
| 612 |
+
"epoch": 2.0997900209979004,
|
| 613 |
+
"grad_norm": 43.043338775634766,
|
| 614 |
+
"learning_rate": 1.500433290004333e-05,
|
| 615 |
+
"loss": 0.385,
|
| 616 |
+
"step": 42000
|
| 617 |
+
},
|
| 618 |
+
{
|
| 619 |
+
"epoch": 2.124787521247875,
|
| 620 |
+
"grad_norm": 1.601791262626648,
|
| 621 |
+
"learning_rate": 1.4587707895877079e-05,
|
| 622 |
+
"loss": 0.3758,
|
| 623 |
+
"step": 42500
|
| 624 |
+
},
|
| 625 |
+
{
|
| 626 |
+
"epoch": 2.1497850214978502,
|
| 627 |
+
"grad_norm": 1.0921714305877686,
|
| 628 |
+
"learning_rate": 1.417108289171083e-05,
|
| 629 |
+
"loss": 0.3713,
|
| 630 |
+
"step": 43000
|
| 631 |
+
},
|
| 632 |
+
{
|
| 633 |
+
"epoch": 2.1747825217478254,
|
| 634 |
+
"grad_norm": 23.122596740722656,
|
| 635 |
+
"learning_rate": 1.375445788754458e-05,
|
| 636 |
+
"loss": 0.413,
|
| 637 |
+
"step": 43500
|
| 638 |
+
},
|
| 639 |
+
{
|
| 640 |
+
"epoch": 2.1997800219978,
|
| 641 |
+
"grad_norm": 7.090549468994141,
|
| 642 |
+
"learning_rate": 1.3337832883378329e-05,
|
| 643 |
+
"loss": 0.3787,
|
| 644 |
+
"step": 44000
|
| 645 |
+
},
|
| 646 |
+
{
|
| 647 |
+
"epoch": 2.2247775222477753,
|
| 648 |
+
"grad_norm": 17.668933868408203,
|
| 649 |
+
"learning_rate": 1.292120787921208e-05,
|
| 650 |
+
"loss": 0.3805,
|
| 651 |
+
"step": 44500
|
| 652 |
+
},
|
| 653 |
+
{
|
| 654 |
+
"epoch": 2.2497750224977504,
|
| 655 |
+
"grad_norm": 15.878674507141113,
|
| 656 |
+
"learning_rate": 1.2504582875045829e-05,
|
| 657 |
+
"loss": 0.3757,
|
| 658 |
+
"step": 45000
|
| 659 |
+
},
|
| 660 |
+
{
|
| 661 |
+
"epoch": 2.274772522747725,
|
| 662 |
+
"grad_norm": 39.11751937866211,
|
| 663 |
+
"learning_rate": 1.2087957870879578e-05,
|
| 664 |
+
"loss": 0.3887,
|
| 665 |
+
"step": 45500
|
| 666 |
+
},
|
| 667 |
+
{
|
| 668 |
+
"epoch": 2.2997700229977003,
|
| 669 |
+
"grad_norm": 4.333780288696289,
|
| 670 |
+
"learning_rate": 1.167133286671333e-05,
|
| 671 |
+
"loss": 0.3789,
|
| 672 |
+
"step": 46000
|
| 673 |
+
},
|
| 674 |
+
{
|
| 675 |
+
"epoch": 2.324767523247675,
|
| 676 |
+
"grad_norm": 21.4094295501709,
|
| 677 |
+
"learning_rate": 1.1254707862547079e-05,
|
| 678 |
+
"loss": 0.3742,
|
| 679 |
+
"step": 46500
|
| 680 |
+
},
|
| 681 |
+
{
|
| 682 |
+
"epoch": 2.34976502349765,
|
| 683 |
+
"grad_norm": 14.586631774902344,
|
| 684 |
+
"learning_rate": 1.083808285838083e-05,
|
| 685 |
+
"loss": 0.3805,
|
| 686 |
+
"step": 47000
|
| 687 |
+
},
|
| 688 |
+
{
|
| 689 |
+
"epoch": 2.3747625237476253,
|
| 690 |
+
"grad_norm": 1.1548786163330078,
|
| 691 |
+
"learning_rate": 1.042145785421458e-05,
|
| 692 |
+
"loss": 0.3936,
|
| 693 |
+
"step": 47500
|
| 694 |
+
},
|
| 695 |
+
{
|
| 696 |
+
"epoch": 2.3997600239976,
|
| 697 |
+
"grad_norm": 0.03682245686650276,
|
| 698 |
+
"learning_rate": 1.0004832850048329e-05,
|
| 699 |
+
"loss": 0.38,
|
| 700 |
+
"step": 48000
|
| 701 |
+
},
|
| 702 |
+
{
|
| 703 |
+
"epoch": 2.424757524247575,
|
| 704 |
+
"grad_norm": 35.44232940673828,
|
| 705 |
+
"learning_rate": 9.588207845882078e-06,
|
| 706 |
+
"loss": 0.3941,
|
| 707 |
+
"step": 48500
|
| 708 |
+
},
|
| 709 |
+
{
|
| 710 |
+
"epoch": 2.4497550244975503,
|
| 711 |
+
"grad_norm": 8.77474594116211,
|
| 712 |
+
"learning_rate": 9.171582841715828e-06,
|
| 713 |
+
"loss": 0.4054,
|
| 714 |
+
"step": 49000
|
| 715 |
+
},
|
| 716 |
+
{
|
| 717 |
+
"epoch": 2.474752524747525,
|
| 718 |
+
"grad_norm": 13.013947486877441,
|
| 719 |
+
"learning_rate": 8.754957837549579e-06,
|
| 720 |
+
"loss": 0.3659,
|
| 721 |
+
"step": 49500
|
| 722 |
+
},
|
| 723 |
+
{
|
| 724 |
+
"epoch": 2.4997500249975,
|
| 725 |
+
"grad_norm": 14.281270980834961,
|
| 726 |
+
"learning_rate": 8.338332833383328e-06,
|
| 727 |
+
"loss": 0.3917,
|
| 728 |
+
"step": 50000
|
| 729 |
+
},
|
| 730 |
+
{
|
| 731 |
+
"epoch": 2.5247475252474754,
|
| 732 |
+
"grad_norm": 24.0106258392334,
|
| 733 |
+
"learning_rate": 7.92170782921708e-06,
|
| 734 |
+
"loss": 0.3876,
|
| 735 |
+
"step": 50500
|
| 736 |
+
},
|
| 737 |
+
{
|
| 738 |
+
"epoch": 2.54974502549745,
|
| 739 |
+
"grad_norm": 20.56169319152832,
|
| 740 |
+
"learning_rate": 7.505082825050828e-06,
|
| 741 |
+
"loss": 0.3628,
|
| 742 |
+
"step": 51000
|
| 743 |
+
},
|
| 744 |
+
{
|
| 745 |
+
"epoch": 2.5747425257474252,
|
| 746 |
+
"grad_norm": 0.7545715570449829,
|
| 747 |
+
"learning_rate": 7.088457820884578e-06,
|
| 748 |
+
"loss": 0.3918,
|
| 749 |
+
"step": 51500
|
| 750 |
+
},
|
| 751 |
+
{
|
| 752 |
+
"epoch": 2.5997400259974004,
|
| 753 |
+
"grad_norm": 17.21295928955078,
|
| 754 |
+
"learning_rate": 6.6718328167183295e-06,
|
| 755 |
+
"loss": 0.359,
|
| 756 |
+
"step": 52000
|
| 757 |
+
},
|
| 758 |
+
{
|
| 759 |
+
"epoch": 2.624737526247375,
|
| 760 |
+
"grad_norm": 9.85009479522705,
|
| 761 |
+
"learning_rate": 6.255207812552079e-06,
|
| 762 |
+
"loss": 0.3634,
|
| 763 |
+
"step": 52500
|
| 764 |
+
},
|
| 765 |
+
{
|
| 766 |
+
"epoch": 2.6497350264973503,
|
| 767 |
+
"grad_norm": 21.24859046936035,
|
| 768 |
+
"learning_rate": 5.838582808385828e-06,
|
| 769 |
+
"loss": 0.3737,
|
| 770 |
+
"step": 53000
|
| 771 |
+
},
|
| 772 |
+
{
|
| 773 |
+
"epoch": 2.6747325267473254,
|
| 774 |
+
"grad_norm": 13.614805221557617,
|
| 775 |
+
"learning_rate": 5.421957804219578e-06,
|
| 776 |
+
"loss": 0.4022,
|
| 777 |
+
"step": 53500
|
| 778 |
+
},
|
| 779 |
+
{
|
| 780 |
+
"epoch": 2.6997300269973,
|
| 781 |
+
"grad_norm": 5.028663158416748,
|
| 782 |
+
"learning_rate": 5.005332800053329e-06,
|
| 783 |
+
"loss": 0.3562,
|
| 784 |
+
"step": 54000
|
| 785 |
+
},
|
| 786 |
+
{
|
| 787 |
+
"epoch": 2.7247275272472753,
|
| 788 |
+
"grad_norm": 22.341398239135742,
|
| 789 |
+
"learning_rate": 4.588707795887078e-06,
|
| 790 |
+
"loss": 0.349,
|
| 791 |
+
"step": 54500
|
| 792 |
+
},
|
| 793 |
+
{
|
| 794 |
+
"epoch": 2.7497250274972505,
|
| 795 |
+
"grad_norm": 9.66286849975586,
|
| 796 |
+
"learning_rate": 4.172082791720828e-06,
|
| 797 |
+
"loss": 0.3573,
|
| 798 |
+
"step": 55000
|
| 799 |
+
},
|
| 800 |
+
{
|
| 801 |
+
"epoch": 2.774722527747225,
|
| 802 |
+
"grad_norm": 4.927464962005615,
|
| 803 |
+
"learning_rate": 3.755457787554578e-06,
|
| 804 |
+
"loss": 0.335,
|
| 805 |
+
"step": 55500
|
| 806 |
+
},
|
| 807 |
+
{
|
| 808 |
+
"epoch": 2.7997200279972003,
|
| 809 |
+
"grad_norm": 4.33281135559082,
|
| 810 |
+
"learning_rate": 3.338832783388328e-06,
|
| 811 |
+
"loss": 0.3679,
|
| 812 |
+
"step": 56000
|
| 813 |
+
},
|
| 814 |
+
{
|
| 815 |
+
"epoch": 2.8247175282471755,
|
| 816 |
+
"grad_norm": 0.29482612013816833,
|
| 817 |
+
"learning_rate": 2.9222077792220777e-06,
|
| 818 |
+
"loss": 0.3266,
|
| 819 |
+
"step": 56500
|
| 820 |
+
},
|
| 821 |
+
{
|
| 822 |
+
"epoch": 2.84971502849715,
|
| 823 |
+
"grad_norm": 21.363672256469727,
|
| 824 |
+
"learning_rate": 2.505582775055828e-06,
|
| 825 |
+
"loss": 0.3453,
|
| 826 |
+
"step": 57000
|
| 827 |
+
},
|
| 828 |
+
{
|
| 829 |
+
"epoch": 2.8747125287471254,
|
| 830 |
+
"grad_norm": 2.6021454334259033,
|
| 831 |
+
"learning_rate": 2.088957770889578e-06,
|
| 832 |
+
"loss": 0.3682,
|
| 833 |
+
"step": 57500
|
| 834 |
+
},
|
| 835 |
+
{
|
| 836 |
+
"epoch": 2.8997100289971005,
|
| 837 |
+
"grad_norm": 4.7911577224731445,
|
| 838 |
+
"learning_rate": 1.6723327667233275e-06,
|
| 839 |
+
"loss": 0.3417,
|
| 840 |
+
"step": 58000
|
| 841 |
+
},
|
| 842 |
+
{
|
| 843 |
+
"epoch": 2.924707529247075,
|
| 844 |
+
"grad_norm": 0.21428282558918,
|
| 845 |
+
"learning_rate": 1.2557077625570776e-06,
|
| 846 |
+
"loss": 0.3192,
|
| 847 |
+
"step": 58500
|
| 848 |
+
},
|
| 849 |
+
{
|
| 850 |
+
"epoch": 2.9497050294970504,
|
| 851 |
+
"grad_norm": 1.2091667652130127,
|
| 852 |
+
"learning_rate": 8.390827583908276e-07,
|
| 853 |
+
"loss": 0.3375,
|
| 854 |
+
"step": 59000
|
| 855 |
+
},
|
| 856 |
+
{
|
| 857 |
+
"epoch": 2.9747025297470255,
|
| 858 |
+
"grad_norm": 24.199045181274414,
|
| 859 |
+
"learning_rate": 4.2245775422457754e-07,
|
| 860 |
+
"loss": 0.3669,
|
| 861 |
+
"step": 59500
|
| 862 |
+
},
|
| 863 |
+
{
|
| 864 |
+
"epoch": 2.9997000299970003,
|
| 865 |
+
"grad_norm": 5.163976669311523,
|
| 866 |
+
"learning_rate": 5.832750058327501e-09,
|
| 867 |
+
"loss": 0.332,
|
| 868 |
+
"step": 60000
|
| 869 |
+
},
|
| 870 |
+
{
|
| 871 |
+
"epoch": 3.0,
|
| 872 |
+
"eval_f1": 0.7708992224677207,
|
| 873 |
+
"eval_loss": 0.9973717331886292,
|
| 874 |
+
"eval_runtime": 11.045,
|
| 875 |
+
"eval_samples_per_second": 1810.963,
|
| 876 |
+
"eval_steps_per_second": 226.438,
|
| 877 |
+
"step": 60006
|
| 878 |
+
},
|
| 879 |
+
{
|
| 880 |
+
"epoch": 3.0,
|
| 881 |
+
"step": 60006,
|
| 882 |
+
"total_flos": 7908628105405440.0,
|
| 883 |
+
"train_loss": 0.6686365460569141,
|
| 884 |
+
"train_runtime": 1639.5865,
|
| 885 |
+
"train_samples_per_second": 292.775,
|
| 886 |
+
"train_steps_per_second": 36.598
|
| 887 |
+
}
|
| 888 |
+
],
|
| 889 |
+
"logging_steps": 500,
|
| 890 |
+
"max_steps": 60006,
|
| 891 |
+
"num_input_tokens_seen": 0,
|
| 892 |
+
"num_train_epochs": 3,
|
| 893 |
+
"save_steps": 500,
|
| 894 |
+
"stateful_callbacks": {
|
| 895 |
+
"TrainerControl": {
|
| 896 |
+
"args": {
|
| 897 |
+
"should_epoch_stop": false,
|
| 898 |
+
"should_evaluate": false,
|
| 899 |
+
"should_log": false,
|
| 900 |
+
"should_save": true,
|
| 901 |
+
"should_training_stop": true
|
| 902 |
+
},
|
| 903 |
+
"attributes": {}
|
| 904 |
+
}
|
| 905 |
+
},
|
| 906 |
+
"total_flos": 7908628105405440.0,
|
| 907 |
+
"train_batch_size": 8,
|
| 908 |
+
"trial_name": null,
|
| 909 |
+
"trial_params": null
|
| 910 |
+
}
|
training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b62d79559802129fcb0592fd38fa883a70b24d3b77b70cedda8e9929cb6539c
|
| 3 |
+
size 5201
|