Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +60 -0
- LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/config.yaml +723 -0
- LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/output.log +423 -0
- LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/wandb-metadata.json +41 -0
- LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/wandb-summary.json +1 -0
- LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/output.log +299 -0
- LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/requirements.txt +257 -0
- LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/wandb-metadata.json +41 -0
- LlamaFactory/wandb/run-20260204_040332-hwsb1mff/logs/debug-internal.log +6 -0
- LlamaFactory/wandb/run-20260204_040332-hwsb1mff/logs/debug.log +23 -0
- LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/config.yaml +723 -0
- LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/output.log +191 -0
- LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/requirements.txt +257 -0
- LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/wandb-metadata.json +41 -0
- LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/wandb-summary.json +1 -0
- LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug-internal.log +11 -0
- LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug.log +25 -0
- LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/config.yaml +723 -0
- LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/requirements.txt +257 -0
- LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/wandb-metadata.json +41 -0
- LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/wandb-summary.json +1 -0
- LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug-internal.log +14 -0
- LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug.log +25 -0
- LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/config.yaml +723 -0
- LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/requirements.txt +257 -0
- LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/wandb-metadata.json +41 -0
- LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/wandb-summary.json +1 -0
- LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug-internal.log +14 -0
- LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug.log +25 -0
- LlamaFactory/wandb/run-20260204_090320-aseg728n/files/config.yaml +723 -0
- LlamaFactory/wandb/run-20260204_090320-aseg728n/files/requirements.txt +257 -0
- LlamaFactory/wandb/run-20260204_090320-aseg728n/files/wandb-metadata.json +41 -0
- LlamaFactory/wandb/run-20260204_090320-aseg728n/files/wandb-summary.json +1 -0
- LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug-internal.log +13 -0
- LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug.log +25 -0
- LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/config.yaml +723 -0
- LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/requirements.txt +257 -0
- LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/wandb-metadata.json +41 -0
- LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/wandb-summary.json +1 -0
- LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug-internal.log +12 -0
- LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug.log +25 -0
- LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/output.log +0 -0
- LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/requirements.txt +257 -0
- LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/wandb-metadata.json +41 -0
- LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug-internal.log +6 -0
- LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug.log +23 -0
- LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/config.yaml +723 -0
- LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/requirements.txt +257 -0
- LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/wandb-metadata.json +41 -0
- LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/wandb-summary.json +1 -0
.gitattributes
CHANGED
|
@@ -210,3 +210,63 @@ v127rc_exp2/B_mul/checkpoint-9500/tokenizer.json filter=lfs diff=lfs merge=lfs -
|
|
| 210 |
v127rc_exp2/B_mul/checkpoint-9400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 211 |
v127rc_exp2/B_mul/checkpoint-9300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 212 |
v127rc_exp2/B_mul/checkpoint-9200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
v127rc_exp2/B_mul/checkpoint-9400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 211 |
v127rc_exp2/B_mul/checkpoint-9300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 212 |
v127rc_exp2/B_mul/checkpoint-9200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 213 |
+
v127rc_exp2/B_mul/checkpoint-9100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 214 |
+
v127rc_exp2/B_mul/checkpoint-9000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 215 |
+
v127rc_exp2/B_mul/checkpoint-8900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 216 |
+
v127rc_exp2/B_mul/checkpoint-8800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 217 |
+
v127rc_exp2/B_mul/checkpoint-8700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 218 |
+
v127rc_exp2/B_mul/checkpoint-8600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 219 |
+
v127rc_exp2/B_mul/checkpoint-8500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 220 |
+
v127rc_exp2/B_mul/checkpoint-8400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 221 |
+
v127rc_exp2/B_mul/checkpoint-8300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 222 |
+
v127rc_exp2/B_mul/checkpoint-8200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 223 |
+
v127rc_exp2/B_mul/checkpoint-8100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 224 |
+
v127rc_exp2/B_mul/checkpoint-8000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 225 |
+
v127rc_exp2/B_mul/checkpoint-7900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 226 |
+
v127rc_exp2/B_mul/checkpoint-7800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 227 |
+
v127rc_exp2/B_mul/checkpoint-7700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 228 |
+
v127rc_exp2/B_mul/checkpoint-7600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 229 |
+
v127rc_exp2/B_mul/checkpoint-7500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 230 |
+
v127rc_exp2/B_mul/checkpoint-7400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 231 |
+
v127rc_exp2/B_mul/checkpoint-7300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 232 |
+
v127rc_exp2/B_mul/checkpoint-7200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 233 |
+
v127rc_exp2/B_mul/checkpoint-7100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 234 |
+
v127rc_exp2/B_mul/checkpoint-7000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 235 |
+
v127rc_exp2/B_mul/checkpoint-6900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 236 |
+
v127rc_exp2/B_mul/checkpoint-6800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 237 |
+
v127rc_exp2/B_mul/checkpoint-6700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 238 |
+
v127rc_exp2/B_mul/checkpoint-6600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 239 |
+
v127rc_exp2/B_mul/checkpoint-6500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 240 |
+
v127rc_exp2/B_mul/checkpoint-6400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 241 |
+
v127rc_exp2/B_mul/checkpoint-6300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 242 |
+
v127rc_exp2/B_mul/checkpoint-6200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 243 |
+
v127rc_exp2/B_mul/checkpoint-6100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 244 |
+
v127rc_exp2/B_mul/checkpoint-6000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 245 |
+
v127rc_exp2/B_mul/checkpoint-5900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 246 |
+
v127rc_exp2/B_mul/checkpoint-5800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 247 |
+
v127rc_exp2/B_mul/checkpoint-5700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 248 |
+
v127rc_exp2/B_mul/checkpoint-5600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 249 |
+
v127rc_exp2/B_mul/checkpoint-5500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 250 |
+
v127rc_exp2/B_mul/checkpoint-5400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 251 |
+
v127rc_exp2/B_mul/checkpoint-5300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 252 |
+
v127rc_exp2/B_mul/checkpoint-5200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 253 |
+
v127rc_exp2/B_mul/checkpoint-5100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 254 |
+
v127rc_exp2/B_mul/checkpoint-5000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 255 |
+
v127rc_exp2/B_mul/checkpoint-4900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 256 |
+
v127rc_exp2/B_mul/checkpoint-4800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 257 |
+
v127rc_exp2/B_mul/checkpoint-4700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 258 |
+
v127rc_exp2/B_mul/checkpoint-4600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 259 |
+
v127rc_exp2/B_mul/checkpoint-4500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 260 |
+
v127rc_exp2/B_mul/checkpoint-4400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 261 |
+
v127rc_exp2/B_mul/checkpoint-4300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 262 |
+
v127rc_exp2/B_mul/checkpoint-4200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 263 |
+
v127rc_exp2/B_mul/checkpoint-4100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 264 |
+
v127rc_exp2/B_mul/checkpoint-4000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 265 |
+
v127rc_exp2/B_mul/checkpoint-3900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 266 |
+
v127rc_exp2/B_mul/checkpoint-3800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 267 |
+
v127rc_exp2/B_mul/checkpoint-3700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 268 |
+
v127rc_exp2/B_mul/checkpoint-3600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 269 |
+
v127rc_exp2/B_mul/checkpoint-3500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 270 |
+
v127rc_exp2/B_mul/checkpoint-3400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 271 |
+
v127rc_exp2/B_mul/checkpoint-3300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 272 |
+
v127rc_exp2/B_mul/checkpoint-3200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/config.yaml
ADDED
|
@@ -0,0 +1,723 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_name_or_path:
|
| 2 |
+
value: /workspace/Qwen/Qwen3-8B-Base
|
| 3 |
+
_wandb:
|
| 4 |
+
value:
|
| 5 |
+
cli_version: 0.24.1
|
| 6 |
+
e:
|
| 7 |
+
mfjy22anxcucsb3vwlaimrwvqrgvipis:
|
| 8 |
+
args:
|
| 9 |
+
- /workspace/v127rc_exp1/C.yaml
|
| 10 |
+
cpu_count: 16
|
| 11 |
+
cpu_count_logical: 32
|
| 12 |
+
cudaVersion: "13.0"
|
| 13 |
+
disk:
|
| 14 |
+
/:
|
| 15 |
+
total: "21474836480"
|
| 16 |
+
used: "1858306048"
|
| 17 |
+
email: markmochi200@gmail.com
|
| 18 |
+
executable: /usr/bin/python
|
| 19 |
+
git:
|
| 20 |
+
commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
|
| 21 |
+
remote: https://github.com/hiyouga/LlamaFactory.git
|
| 22 |
+
gpu: NVIDIA GeForce RTX 4090
|
| 23 |
+
gpu_count: 1
|
| 24 |
+
gpu_nvidia:
|
| 25 |
+
- architecture: Ada
|
| 26 |
+
cudaCores: 16384
|
| 27 |
+
memoryTotal: "25757220864"
|
| 28 |
+
name: NVIDIA GeForce RTX 4090
|
| 29 |
+
uuid: GPU-2ae1a495-e17f-23d9-e8ed-90585b3df9de
|
| 30 |
+
host: 47a53adf0198
|
| 31 |
+
memory:
|
| 32 |
+
total: "201701408768"
|
| 33 |
+
os: Linux-6.8.0-94-generic-x86_64-with-glibc2.35
|
| 34 |
+
program: /usr/local/bin/llamafactory-cli
|
| 35 |
+
python: CPython 3.11.10
|
| 36 |
+
root: /workspace/LlamaFactory
|
| 37 |
+
startedAt: "2026-02-04T03:57:46.163443Z"
|
| 38 |
+
writerId: mfjy22anxcucsb3vwlaimrwvqrgvipis
|
| 39 |
+
m:
|
| 40 |
+
- "1": train/global_step
|
| 41 |
+
"6":
|
| 42 |
+
- 3
|
| 43 |
+
"7": []
|
| 44 |
+
- "2": '*'
|
| 45 |
+
"5": 1
|
| 46 |
+
"6":
|
| 47 |
+
- 1
|
| 48 |
+
"7": []
|
| 49 |
+
python_version: 3.11.10
|
| 50 |
+
t:
|
| 51 |
+
"1":
|
| 52 |
+
- 1
|
| 53 |
+
- 11
|
| 54 |
+
- 41
|
| 55 |
+
- 49
|
| 56 |
+
- 51
|
| 57 |
+
- 71
|
| 58 |
+
- 84
|
| 59 |
+
- 98
|
| 60 |
+
- 105
|
| 61 |
+
"2":
|
| 62 |
+
- 1
|
| 63 |
+
- 11
|
| 64 |
+
- 41
|
| 65 |
+
- 49
|
| 66 |
+
- 51
|
| 67 |
+
- 71
|
| 68 |
+
- 84
|
| 69 |
+
- 98
|
| 70 |
+
- 105
|
| 71 |
+
"3":
|
| 72 |
+
- 7
|
| 73 |
+
- 19
|
| 74 |
+
- 62
|
| 75 |
+
- 66
|
| 76 |
+
"4": 3.11.10
|
| 77 |
+
"5": 0.24.1
|
| 78 |
+
"6": 5.0.0
|
| 79 |
+
"9":
|
| 80 |
+
"1": transformers_trainer
|
| 81 |
+
"12": 0.24.1
|
| 82 |
+
"13": linux-x86_64
|
| 83 |
+
accelerator_config:
|
| 84 |
+
value:
|
| 85 |
+
dispatch_batches: null
|
| 86 |
+
even_batches: true
|
| 87 |
+
gradient_accumulation_kwargs: null
|
| 88 |
+
non_blocking: false
|
| 89 |
+
split_batches: false
|
| 90 |
+
use_seedable_sampler: true
|
| 91 |
+
adam_beta1:
|
| 92 |
+
value: 0.9
|
| 93 |
+
adam_beta2:
|
| 94 |
+
value: 0.95
|
| 95 |
+
adam_epsilon:
|
| 96 |
+
value: 1e-08
|
| 97 |
+
architectures:
|
| 98 |
+
value:
|
| 99 |
+
- Qwen3ForCausalLM
|
| 100 |
+
attention_bias:
|
| 101 |
+
value: false
|
| 102 |
+
attention_dropout:
|
| 103 |
+
value: 0
|
| 104 |
+
auto_find_batch_size:
|
| 105 |
+
value: false
|
| 106 |
+
average_tokens_across_devices:
|
| 107 |
+
value: true
|
| 108 |
+
batch_eval_metrics:
|
| 109 |
+
value: false
|
| 110 |
+
bf16:
|
| 111 |
+
value: true
|
| 112 |
+
bf16_full_eval:
|
| 113 |
+
value: false
|
| 114 |
+
bos_token_id:
|
| 115 |
+
value: null
|
| 116 |
+
chunk_size_feed_forward:
|
| 117 |
+
value: 0
|
| 118 |
+
data_args:
|
| 119 |
+
value:
|
| 120 |
+
buffer_size: 16384
|
| 121 |
+
cutoff_len: 2047
|
| 122 |
+
data_shared_file_system: false
|
| 123 |
+
dataset:
|
| 124 |
+
- Markie_Voss_t0_d35_r286
|
| 125 |
+
dataset_dir: /workspace/LlamaFactory/data
|
| 126 |
+
default_system: null
|
| 127 |
+
enable_thinking: false
|
| 128 |
+
eval_dataset: null
|
| 129 |
+
eval_num_beams: null
|
| 130 |
+
eval_on_each_dataset: false
|
| 131 |
+
ignore_pad_token_for_loss: true
|
| 132 |
+
interleave_probs: null
|
| 133 |
+
mask_history: false
|
| 134 |
+
max_samples: 100000000
|
| 135 |
+
media_dir: /workspace/LlamaFactory/data
|
| 136 |
+
mix_strategy: concat
|
| 137 |
+
neat_packing: false
|
| 138 |
+
overwrite_cache: false
|
| 139 |
+
packing: true
|
| 140 |
+
preprocessing_batch_size: 1000
|
| 141 |
+
preprocessing_num_workers: 16
|
| 142 |
+
streaming: false
|
| 143 |
+
template: qwen3_nothink
|
| 144 |
+
tokenized_path: null
|
| 145 |
+
tool_format: null
|
| 146 |
+
train_on_prompt: false
|
| 147 |
+
val_size: 0
|
| 148 |
+
data_seed:
|
| 149 |
+
value: null
|
| 150 |
+
dataloader_drop_last:
|
| 151 |
+
value: false
|
| 152 |
+
dataloader_num_workers:
|
| 153 |
+
value: 0
|
| 154 |
+
dataloader_persistent_workers:
|
| 155 |
+
value: false
|
| 156 |
+
dataloader_pin_memory:
|
| 157 |
+
value: true
|
| 158 |
+
dataloader_prefetch_factor:
|
| 159 |
+
value: null
|
| 160 |
+
ddp_backend:
|
| 161 |
+
value: null
|
| 162 |
+
ddp_broadcast_buffers:
|
| 163 |
+
value: null
|
| 164 |
+
ddp_bucket_cap_mb:
|
| 165 |
+
value: null
|
| 166 |
+
ddp_find_unused_parameters:
|
| 167 |
+
value: null
|
| 168 |
+
ddp_timeout:
|
| 169 |
+
value: 180000000
|
| 170 |
+
debug:
|
| 171 |
+
value: []
|
| 172 |
+
deepspeed:
|
| 173 |
+
value: null
|
| 174 |
+
disable_tqdm:
|
| 175 |
+
value: false
|
| 176 |
+
do_eval:
|
| 177 |
+
value: false
|
| 178 |
+
do_predict:
|
| 179 |
+
value: false
|
| 180 |
+
do_train:
|
| 181 |
+
value: true
|
| 182 |
+
dtype:
|
| 183 |
+
value: bfloat16
|
| 184 |
+
enable_jit_checkpoint:
|
| 185 |
+
value: false
|
| 186 |
+
eos_token_id:
|
| 187 |
+
value: 151645
|
| 188 |
+
eval_accumulation_steps:
|
| 189 |
+
value: null
|
| 190 |
+
eval_delay:
|
| 191 |
+
value: 0
|
| 192 |
+
eval_do_concat_batches:
|
| 193 |
+
value: true
|
| 194 |
+
eval_on_start:
|
| 195 |
+
value: false
|
| 196 |
+
eval_steps:
|
| 197 |
+
value: null
|
| 198 |
+
eval_strategy:
|
| 199 |
+
value: "no"
|
| 200 |
+
eval_use_gather_object:
|
| 201 |
+
value: false
|
| 202 |
+
finetuning_args:
|
| 203 |
+
value:
|
| 204 |
+
additional_target: null
|
| 205 |
+
apollo_layerwise: false
|
| 206 |
+
apollo_proj: random
|
| 207 |
+
apollo_proj_type: std
|
| 208 |
+
apollo_rank: 16
|
| 209 |
+
apollo_scale: 32
|
| 210 |
+
apollo_scale_front: false
|
| 211 |
+
apollo_scale_type: channel
|
| 212 |
+
apollo_target:
|
| 213 |
+
- all
|
| 214 |
+
apollo_update_interval: 200
|
| 215 |
+
badam_mask_mode: adjacent
|
| 216 |
+
badam_mode: layer
|
| 217 |
+
badam_start_block: null
|
| 218 |
+
badam_switch_interval: 50
|
| 219 |
+
badam_switch_mode: ascending
|
| 220 |
+
badam_update_ratio: 0.05
|
| 221 |
+
badam_verbose: 0
|
| 222 |
+
compute_accuracy: false
|
| 223 |
+
create_new_adapter: false
|
| 224 |
+
disable_shuffling: false
|
| 225 |
+
dpo_label_smoothing: 0
|
| 226 |
+
eaft_alpha: 1
|
| 227 |
+
early_stopping_steps: null
|
| 228 |
+
finetuning_type: lora
|
| 229 |
+
freeze_extra_modules: null
|
| 230 |
+
freeze_language_model: false
|
| 231 |
+
freeze_multi_modal_projector: true
|
| 232 |
+
freeze_trainable_layers: 2
|
| 233 |
+
freeze_trainable_modules:
|
| 234 |
+
- all
|
| 235 |
+
freeze_vision_tower: true
|
| 236 |
+
galore_layerwise: false
|
| 237 |
+
galore_proj_type: std
|
| 238 |
+
galore_rank: 16
|
| 239 |
+
galore_scale: 2
|
| 240 |
+
galore_target:
|
| 241 |
+
- all
|
| 242 |
+
galore_update_interval: 200
|
| 243 |
+
include_effective_tokens_per_second: false
|
| 244 |
+
kto_chosen_weight: 1
|
| 245 |
+
kto_rejected_weight: 1
|
| 246 |
+
ld_alpha: null
|
| 247 |
+
lora_alpha: 32
|
| 248 |
+
lora_dropout: 0.03
|
| 249 |
+
lora_rank: 16
|
| 250 |
+
lora_target:
|
| 251 |
+
- all
|
| 252 |
+
loraplus_lr_embedding: 1e-06
|
| 253 |
+
loraplus_lr_ratio: null
|
| 254 |
+
module_dropout: 0
|
| 255 |
+
oft_block_size: 32
|
| 256 |
+
oft_rank: 0
|
| 257 |
+
oft_target:
|
| 258 |
+
- all
|
| 259 |
+
pissa_convert: false
|
| 260 |
+
pissa_init: false
|
| 261 |
+
pissa_iter: 16
|
| 262 |
+
plot_loss: true
|
| 263 |
+
ppo_buffer_size: 1
|
| 264 |
+
ppo_epochs: 4
|
| 265 |
+
ppo_score_norm: false
|
| 266 |
+
ppo_target: 6
|
| 267 |
+
ppo_whiten_rewards: false
|
| 268 |
+
pref_bco_weight: 0
|
| 269 |
+
pref_beta: 0.1
|
| 270 |
+
pref_ftx: 0
|
| 271 |
+
pref_loss: sigmoid
|
| 272 |
+
pure_bf16: false
|
| 273 |
+
ref_model: null
|
| 274 |
+
ref_model_adapters: null
|
| 275 |
+
ref_model_quantization_bit: null
|
| 276 |
+
reward_model: null
|
| 277 |
+
reward_model_adapters: null
|
| 278 |
+
reward_model_quantization_bit: null
|
| 279 |
+
reward_model_type: lora
|
| 280 |
+
simpo_gamma: 0.5
|
| 281 |
+
stage: pt
|
| 282 |
+
swanlab_api_key: <SWANLAB_API_KEY>
|
| 283 |
+
swanlab_lark_secret: null
|
| 284 |
+
swanlab_lark_webhook_url: null
|
| 285 |
+
swanlab_logdir: null
|
| 286 |
+
swanlab_mode: cloud
|
| 287 |
+
swanlab_project: llamafactory
|
| 288 |
+
swanlab_run_name: null
|
| 289 |
+
swanlab_workspace: null
|
| 290 |
+
use_adam_mini: false
|
| 291 |
+
use_apollo: false
|
| 292 |
+
use_badam: false
|
| 293 |
+
use_dft_loss: false
|
| 294 |
+
use_dora: false
|
| 295 |
+
use_eaft_loss: false
|
| 296 |
+
use_galore: false
|
| 297 |
+
use_llama_pro: false
|
| 298 |
+
use_mca: false
|
| 299 |
+
use_muon: false
|
| 300 |
+
use_rslora: false
|
| 301 |
+
use_swanlab: false
|
| 302 |
+
fp8:
|
| 303 |
+
value: false
|
| 304 |
+
fp8_backend:
|
| 305 |
+
value: auto
|
| 306 |
+
fp8_enable_fsdp_float8_all_gather:
|
| 307 |
+
value: false
|
| 308 |
+
fp16:
|
| 309 |
+
value: false
|
| 310 |
+
fp16_full_eval:
|
| 311 |
+
value: false
|
| 312 |
+
fsdp:
|
| 313 |
+
value: []
|
| 314 |
+
fsdp_config:
|
| 315 |
+
value:
|
| 316 |
+
min_num_params: 0
|
| 317 |
+
xla: false
|
| 318 |
+
xla_fsdp_grad_ckpt: false
|
| 319 |
+
xla_fsdp_v2: false
|
| 320 |
+
full_determinism:
|
| 321 |
+
value: false
|
| 322 |
+
generating_args:
|
| 323 |
+
value:
|
| 324 |
+
do_sample: true
|
| 325 |
+
length_penalty: 1
|
| 326 |
+
max_new_tokens: 1024
|
| 327 |
+
num_beams: 1
|
| 328 |
+
repetition_penalty: 1
|
| 329 |
+
skip_special_tokens: true
|
| 330 |
+
temperature: 0.95
|
| 331 |
+
top_k: 50
|
| 332 |
+
top_p: 0.7
|
| 333 |
+
generation_config:
|
| 334 |
+
value: null
|
| 335 |
+
generation_max_length:
|
| 336 |
+
value: 2047
|
| 337 |
+
generation_num_beams:
|
| 338 |
+
value: null
|
| 339 |
+
gradient_accumulation_steps:
|
| 340 |
+
value: 1
|
| 341 |
+
gradient_checkpointing:
|
| 342 |
+
value: false
|
| 343 |
+
gradient_checkpointing_kwargs:
|
| 344 |
+
value: null
|
| 345 |
+
greater_is_better:
|
| 346 |
+
value: null
|
| 347 |
+
group_by_length:
|
| 348 |
+
value: false
|
| 349 |
+
head_dim:
|
| 350 |
+
value: 128
|
| 351 |
+
hidden_act:
|
| 352 |
+
value: silu
|
| 353 |
+
hidden_size:
|
| 354 |
+
value: 4096
|
| 355 |
+
hub_always_push:
|
| 356 |
+
value: false
|
| 357 |
+
hub_model_id:
|
| 358 |
+
value: null
|
| 359 |
+
hub_private_repo:
|
| 360 |
+
value: null
|
| 361 |
+
hub_revision:
|
| 362 |
+
value: null
|
| 363 |
+
hub_strategy:
|
| 364 |
+
value: every_save
|
| 365 |
+
hub_token:
|
| 366 |
+
value: <HUB_TOKEN>
|
| 367 |
+
id2label:
|
| 368 |
+
value:
|
| 369 |
+
"0": LABEL_0
|
| 370 |
+
"1": LABEL_1
|
| 371 |
+
ignore_data_skip:
|
| 372 |
+
value: false
|
| 373 |
+
include_for_metrics:
|
| 374 |
+
value: []
|
| 375 |
+
include_num_input_tokens_seen:
|
| 376 |
+
value: all
|
| 377 |
+
initializer_range:
|
| 378 |
+
value: 0.02
|
| 379 |
+
intermediate_size:
|
| 380 |
+
value: 12288
|
| 381 |
+
is_encoder_decoder:
|
| 382 |
+
value: false
|
| 383 |
+
label_names:
|
| 384 |
+
value:
|
| 385 |
+
- labels
|
| 386 |
+
label_smoothing_factor:
|
| 387 |
+
value: 0
|
| 388 |
+
label2id:
|
| 389 |
+
value:
|
| 390 |
+
LABEL_0: 0
|
| 391 |
+
LABEL_1: 1
|
| 392 |
+
layer_types:
|
| 393 |
+
value:
|
| 394 |
+
- full_attention
|
| 395 |
+
- full_attention
|
| 396 |
+
- full_attention
|
| 397 |
+
- full_attention
|
| 398 |
+
- full_attention
|
| 399 |
+
- full_attention
|
| 400 |
+
- full_attention
|
| 401 |
+
- full_attention
|
| 402 |
+
- full_attention
|
| 403 |
+
- full_attention
|
| 404 |
+
- full_attention
|
| 405 |
+
- full_attention
|
| 406 |
+
- full_attention
|
| 407 |
+
- full_attention
|
| 408 |
+
- full_attention
|
| 409 |
+
- full_attention
|
| 410 |
+
- full_attention
|
| 411 |
+
- full_attention
|
| 412 |
+
- full_attention
|
| 413 |
+
- full_attention
|
| 414 |
+
- full_attention
|
| 415 |
+
- full_attention
|
| 416 |
+
- full_attention
|
| 417 |
+
- full_attention
|
| 418 |
+
- full_attention
|
| 419 |
+
- full_attention
|
| 420 |
+
- full_attention
|
| 421 |
+
- full_attention
|
| 422 |
+
- full_attention
|
| 423 |
+
- full_attention
|
| 424 |
+
- full_attention
|
| 425 |
+
- full_attention
|
| 426 |
+
- full_attention
|
| 427 |
+
- full_attention
|
| 428 |
+
- full_attention
|
| 429 |
+
- full_attention
|
| 430 |
+
learning_rate:
|
| 431 |
+
value: 5e-05
|
| 432 |
+
length_column_name:
|
| 433 |
+
value: length
|
| 434 |
+
liger_kernel_config:
|
| 435 |
+
value: null
|
| 436 |
+
load_best_model_at_end:
|
| 437 |
+
value: false
|
| 438 |
+
local_rank:
|
| 439 |
+
value: -1
|
| 440 |
+
log_level:
|
| 441 |
+
value: passive
|
| 442 |
+
log_level_replica:
|
| 443 |
+
value: warning
|
| 444 |
+
log_on_each_node:
|
| 445 |
+
value: true
|
| 446 |
+
logging_dir:
|
| 447 |
+
value: null
|
| 448 |
+
logging_first_step:
|
| 449 |
+
value: false
|
| 450 |
+
logging_nan_inf_filter:
|
| 451 |
+
value: true
|
| 452 |
+
logging_steps:
|
| 453 |
+
value: 1
|
| 454 |
+
logging_strategy:
|
| 455 |
+
value: steps
|
| 456 |
+
lr_scheduler_kwargs:
|
| 457 |
+
value: null
|
| 458 |
+
lr_scheduler_type:
|
| 459 |
+
value: cosine
|
| 460 |
+
master_addr:
|
| 461 |
+
value: null
|
| 462 |
+
master_port:
|
| 463 |
+
value: null
|
| 464 |
+
max_grad_norm:
|
| 465 |
+
value: 1
|
| 466 |
+
max_position_embeddings:
|
| 467 |
+
value: 32768
|
| 468 |
+
max_steps:
|
| 469 |
+
value: -1
|
| 470 |
+
max_window_layers:
|
| 471 |
+
value: 36
|
| 472 |
+
metric_for_best_model:
|
| 473 |
+
value: null
|
| 474 |
+
model/num_parameters:
|
| 475 |
+
value: 8234382336
|
| 476 |
+
model_args:
|
| 477 |
+
value:
|
| 478 |
+
adapter_folder: null
|
| 479 |
+
adapter_name_or_path: null
|
| 480 |
+
add_special_tokens: null
|
| 481 |
+
add_tokens: null
|
| 482 |
+
audio_sampling_rate: 16000
|
| 483 |
+
block_diag_attn: false
|
| 484 |
+
cache_dir: null
|
| 485 |
+
chunk_size: 8192
|
| 486 |
+
compute_dtype: torch.bfloat16
|
| 487 |
+
cpu_infer: 32
|
| 488 |
+
crop_to_patches: false
|
| 489 |
+
device_map:
|
| 490 |
+
"": cuda:0
|
| 491 |
+
disable_gradient_checkpointing: false
|
| 492 |
+
double_quantization: true
|
| 493 |
+
enable_liger_kernel: false
|
| 494 |
+
export_device: cpu
|
| 495 |
+
export_dir: null
|
| 496 |
+
export_hub_model_id: null
|
| 497 |
+
export_legacy_format: false
|
| 498 |
+
export_quantization_bit: null
|
| 499 |
+
export_quantization_dataset: null
|
| 500 |
+
export_quantization_maxlen: 1024
|
| 501 |
+
export_quantization_nsamples: 128
|
| 502 |
+
export_size: 5
|
| 503 |
+
flash_attn: auto
|
| 504 |
+
hf_hub_token: <HF_HUB_TOKEN>
|
| 505 |
+
image_do_pan_and_scan: false
|
| 506 |
+
image_max_pixels: 589824
|
| 507 |
+
image_min_pixels: 1024
|
| 508 |
+
infer_backend: HF
|
| 509 |
+
infer_dtype: auto
|
| 510 |
+
init_special_tokens: noise_init
|
| 511 |
+
kt_force_think: false
|
| 512 |
+
kt_maxlen: 4096
|
| 513 |
+
kt_mode: normal
|
| 514 |
+
kt_optimize_rule: null
|
| 515 |
+
kt_use_cuda_graph: true
|
| 516 |
+
low_cpu_mem_usage: true
|
| 517 |
+
mixture_of_depths: null
|
| 518 |
+
mode: normal
|
| 519 |
+
model_max_length: 2047
|
| 520 |
+
model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
|
| 521 |
+
model_revision: main
|
| 522 |
+
moe_aux_loss_coef: null
|
| 523 |
+
ms_hub_token: <MS_HUB_TOKEN>
|
| 524 |
+
new_special_tokens_config: null
|
| 525 |
+
offload_folder: offload
|
| 526 |
+
om_hub_token: <OM_HUB_TOKEN>
|
| 527 |
+
print_param_status: false
|
| 528 |
+
quantization_bit: null
|
| 529 |
+
quantization_device_map: null
|
| 530 |
+
quantization_method: BNB
|
| 531 |
+
quantization_type: nf4
|
| 532 |
+
resize_vocab: false
|
| 533 |
+
rope_scaling: null
|
| 534 |
+
sglang_config: null
|
| 535 |
+
sglang_lora_backend: triton
|
| 536 |
+
sglang_maxlen: 4096
|
| 537 |
+
sglang_mem_fraction: 0.7
|
| 538 |
+
sglang_tp_size: -1
|
| 539 |
+
shift_attn: false
|
| 540 |
+
split_special_tokens: false
|
| 541 |
+
train_from_scratch: false
|
| 542 |
+
trust_remote_code: true
|
| 543 |
+
upcast_layernorm: false
|
| 544 |
+
upcast_lmhead_output: false
|
| 545 |
+
use_audio_in_video: false
|
| 546 |
+
use_fast_tokenizer: true
|
| 547 |
+
use_kt: false
|
| 548 |
+
use_kv_cache: true
|
| 549 |
+
use_reentrant_gc: true
|
| 550 |
+
use_unsloth: false
|
| 551 |
+
use_unsloth_gc: false
|
| 552 |
+
use_v1_kernels: false
|
| 553 |
+
video_fps: 2
|
| 554 |
+
video_max_pixels: 65536
|
| 555 |
+
video_maxlen: 128
|
| 556 |
+
video_min_pixels: 256
|
| 557 |
+
vllm_config: null
|
| 558 |
+
vllm_enforce_eager: false
|
| 559 |
+
vllm_gpu_util: 0.7
|
| 560 |
+
vllm_max_lora_rank: 32
|
| 561 |
+
vllm_maxlen: 4096
|
| 562 |
+
model_type:
|
| 563 |
+
value: qwen3
|
| 564 |
+
neftune_noise_alpha:
|
| 565 |
+
value: null
|
| 566 |
+
num_attention_heads:
|
| 567 |
+
value: 32
|
| 568 |
+
num_hidden_layers:
|
| 569 |
+
value: 36
|
| 570 |
+
num_key_value_heads:
|
| 571 |
+
value: 8
|
| 572 |
+
num_train_epochs:
|
| 573 |
+
value: 5
|
| 574 |
+
optim:
|
| 575 |
+
value: adamw_torch
|
| 576 |
+
optim_args:
|
| 577 |
+
value: null
|
| 578 |
+
optim_target_modules:
|
| 579 |
+
value: null
|
| 580 |
+
output_attentions:
|
| 581 |
+
value: false
|
| 582 |
+
output_dir:
|
| 583 |
+
value: /workspace/v127rc_exp1/C
|
| 584 |
+
output_hidden_states:
|
| 585 |
+
value: false
|
| 586 |
+
overwrite_output_dir:
|
| 587 |
+
value: false
|
| 588 |
+
pad_token_id:
|
| 589 |
+
value: 151643
|
| 590 |
+
parallelism_config:
|
| 591 |
+
value: null
|
| 592 |
+
peft_config:
|
| 593 |
+
value:
|
| 594 |
+
default:
|
| 595 |
+
alora_invocation_tokens: null
|
| 596 |
+
arrow_config: null
|
| 597 |
+
auto_mapping: null
|
| 598 |
+
base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
|
| 599 |
+
bias: none
|
| 600 |
+
corda_config: null
|
| 601 |
+
ensure_weight_tying: false
|
| 602 |
+
eva_config: null
|
| 603 |
+
exclude_modules: null
|
| 604 |
+
fan_in_fan_out: false
|
| 605 |
+
inference_mode: false
|
| 606 |
+
init_lora_weights: true
|
| 607 |
+
layer_replication: null
|
| 608 |
+
layers_pattern: null
|
| 609 |
+
layers_to_transform: null
|
| 610 |
+
lora_alpha: 32
|
| 611 |
+
lora_bias: false
|
| 612 |
+
lora_dropout: 0.03
|
| 613 |
+
megatron_config: null
|
| 614 |
+
megatron_core: megatron.core
|
| 615 |
+
modules_to_save: null
|
| 616 |
+
peft_type: LORA
|
| 617 |
+
peft_version: 0.18.1
|
| 618 |
+
qalora_group_size: 16
|
| 619 |
+
r: 16
|
| 620 |
+
revision: null
|
| 621 |
+
runtime_config:
|
| 622 |
+
ephemeral_gpu_offload: false
|
| 623 |
+
target_modules:
|
| 624 |
+
- up_proj
|
| 625 |
+
- q_proj
|
| 626 |
+
- gate_proj
|
| 627 |
+
- k_proj
|
| 628 |
+
- v_proj
|
| 629 |
+
- o_proj
|
| 630 |
+
- down_proj
|
| 631 |
+
target_parameters: null
|
| 632 |
+
task_type: CAUSAL_LM
|
| 633 |
+
trainable_token_indices: null
|
| 634 |
+
use_dora: false
|
| 635 |
+
use_qalora: false
|
| 636 |
+
use_rslora: false
|
| 637 |
+
per_device_eval_batch_size:
|
| 638 |
+
value: 8
|
| 639 |
+
per_device_train_batch_size:
|
| 640 |
+
value: 1
|
| 641 |
+
predict_with_generate:
|
| 642 |
+
value: false
|
| 643 |
+
prediction_loss_only:
|
| 644 |
+
value: false
|
| 645 |
+
problem_type:
|
| 646 |
+
value: null
|
| 647 |
+
project:
|
| 648 |
+
value: huggingface
|
| 649 |
+
push_to_hub:
|
| 650 |
+
value: false
|
| 651 |
+
ray_init_kwargs:
|
| 652 |
+
value: null
|
| 653 |
+
ray_num_workers:
|
| 654 |
+
value: 1
|
| 655 |
+
remove_unused_columns:
|
| 656 |
+
value: false
|
| 657 |
+
report_to:
|
| 658 |
+
value:
|
| 659 |
+
- wandb
|
| 660 |
+
restore_callback_states_from_checkpoint:
|
| 661 |
+
value: false
|
| 662 |
+
resume_from_checkpoint:
|
| 663 |
+
value: null
|
| 664 |
+
return_dict:
|
| 665 |
+
value: true
|
| 666 |
+
rms_norm_eps:
|
| 667 |
+
value: 1e-06
|
| 668 |
+
rope_parameters:
|
| 669 |
+
value:
|
| 670 |
+
rope_theta: 1000000
|
| 671 |
+
rope_type: default
|
| 672 |
+
run_name:
|
| 673 |
+
value: null
|
| 674 |
+
save_on_each_node:
|
| 675 |
+
value: false
|
| 676 |
+
save_only_model:
|
| 677 |
+
value: true
|
| 678 |
+
save_steps:
|
| 679 |
+
value: 1000
|
| 680 |
+
save_strategy:
|
| 681 |
+
value: steps
|
| 682 |
+
save_total_limit:
|
| 683 |
+
value: null
|
| 684 |
+
seed:
|
| 685 |
+
value: 42
|
| 686 |
+
skip_memory_metrics:
|
| 687 |
+
value: true
|
| 688 |
+
sliding_window:
|
| 689 |
+
value: null
|
| 690 |
+
sortish_sampler:
|
| 691 |
+
value: false
|
| 692 |
+
tf32:
|
| 693 |
+
value: null
|
| 694 |
+
tie_word_embeddings:
|
| 695 |
+
value: false
|
| 696 |
+
torch_compile:
|
| 697 |
+
value: false
|
| 698 |
+
torch_compile_backend:
|
| 699 |
+
value: null
|
| 700 |
+
torch_compile_mode:
|
| 701 |
+
value: null
|
| 702 |
+
torch_empty_cache_steps:
|
| 703 |
+
value: null
|
| 704 |
+
trackio_space_id:
|
| 705 |
+
value: trackio
|
| 706 |
+
transformers_version:
|
| 707 |
+
value: 5.0.0
|
| 708 |
+
use_cache:
|
| 709 |
+
value: false
|
| 710 |
+
use_cpu:
|
| 711 |
+
value: false
|
| 712 |
+
use_liger_kernel:
|
| 713 |
+
value: false
|
| 714 |
+
use_sliding_window:
|
| 715 |
+
value: false
|
| 716 |
+
vocab_size:
|
| 717 |
+
value: 151936
|
| 718 |
+
warmup_ratio:
|
| 719 |
+
value: 0.02
|
| 720 |
+
warmup_steps:
|
| 721 |
+
value: 0.02
|
| 722 |
+
weight_decay:
|
| 723 |
+
value: 0
|
LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/output.log
ADDED
|
@@ -0,0 +1,423 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0%| | 0/18595 [00:00<?, ?it/s]/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
|
| 2 |
+
with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]
|
| 3 |
+
|
| 4 |
+
{'loss': '1.682', 'grad_norm': '0.2716', 'learning_rate': '0', 'epoch': '0.0002689', 'num_input_tokens_seen': 2047, 'train_runtime': '2.959', 'train_tokens_per_second': '691.9'}
|
| 5 |
+
{'loss': '1.8', 'grad_norm': '0.2907', 'learning_rate': '1.344e-07', 'epoch': '0.0005378', 'num_input_tokens_seen': 4094, 'train_runtime': '3.966', 'train_tokens_per_second': '1032'}
|
| 6 |
+
{'loss': '1.755', 'grad_norm': '0.2774', 'learning_rate': '2.688e-07', 'epoch': '0.0008067', 'num_input_tokens_seen': 6141, 'train_runtime': '4.979', 'train_tokens_per_second': '1233'}
|
| 7 |
+
{'loss': '1.725', 'grad_norm': '0.278', 'learning_rate': '4.032e-07', 'epoch': '0.001076', 'num_input_tokens_seen': 8188, 'train_runtime': '5.988', 'train_tokens_per_second': '1367'}
|
| 8 |
+
{'loss': '1.856', 'grad_norm': '0.2819', 'learning_rate': '5.376e-07', 'epoch': '0.001344', 'num_input_tokens_seen': 10235, 'train_runtime': '6.995', 'train_tokens_per_second': '1463'}
|
| 9 |
+
{'loss': '1.864', 'grad_norm': '0.2434', 'learning_rate': '6.72e-07', 'epoch': '0.001613', 'num_input_tokens_seen': 12282, 'train_runtime': '8.002', 'train_tokens_per_second': '1535'}
|
| 10 |
+
{'loss': '1.791', 'grad_norm': '0.2673', 'learning_rate': '8.065e-07', 'epoch': '0.001882', 'num_input_tokens_seen': 14329, 'train_runtime': '9.01', 'train_tokens_per_second': '1590'}
|
| 11 |
+
{'loss': '1.831', 'grad_norm': '0.2574', 'learning_rate': '9.409e-07', 'epoch': '0.002151', 'num_input_tokens_seen': 16376, 'train_runtime': '10.02', 'train_tokens_per_second': '1634'}
|
| 12 |
+
{'loss': '1.92', 'grad_norm': '0.2803', 'learning_rate': '1.075e-06', 'epoch': '0.00242', 'num_input_tokens_seen': 18423, 'train_runtime': '11.06', 'train_tokens_per_second': '1665'}
|
| 13 |
+
{'loss': '1.949', 'grad_norm': '0.281', 'learning_rate': '1.21e-06', 'epoch': '0.002689', 'num_input_tokens_seen': 20470, 'train_runtime': '12.07', 'train_tokens_per_second': '1696'}
|
| 14 |
+
{'loss': '1.955', 'grad_norm': '0.298', 'learning_rate': '1.344e-06', 'epoch': '0.002958', 'num_input_tokens_seen': 22517, 'train_runtime': '13.08', 'train_tokens_per_second': '1722'}
|
| 15 |
+
{'loss': '1.811', 'grad_norm': '0.2719', 'learning_rate': '1.478e-06', 'epoch': '0.003227', 'num_input_tokens_seen': 24564, 'train_runtime': '14.09', 'train_tokens_per_second': '1743'}
|
| 16 |
+
{'loss': '1.629', 'grad_norm': '0.266', 'learning_rate': '1.613e-06', 'epoch': '0.003496', 'num_input_tokens_seen': 26611, 'train_runtime': '15.1', 'train_tokens_per_second': '1763'}
|
| 17 |
+
{'loss': '1.768', 'grad_norm': '0.268', 'learning_rate': '1.747e-06', 'epoch': '0.003764', 'num_input_tokens_seen': 28658, 'train_runtime': '16.1', 'train_tokens_per_second': '1779'}
|
| 18 |
+
{'loss': '1.612', 'grad_norm': '0.252', 'learning_rate': '1.882e-06', 'epoch': '0.004033', 'num_input_tokens_seen': 30705, 'train_runtime': '17.11', 'train_tokens_per_second': '1794'}
|
| 19 |
+
{'loss': '1.622', 'grad_norm': '0.2607', 'learning_rate': '2.016e-06', 'epoch': '0.004302', 'num_input_tokens_seen': 32752, 'train_runtime': '18.12', 'train_tokens_per_second': '1807'}
|
| 20 |
+
{'loss': '1.857', 'grad_norm': '0.2805', 'learning_rate': '2.151e-06', 'epoch': '0.004571', 'num_input_tokens_seen': 34799, 'train_runtime': '19.13', 'train_tokens_per_second': '1819'}
|
| 21 |
+
{'loss': '1.851', 'grad_norm': '0.2441', 'learning_rate': '2.285e-06', 'epoch': '0.00484', 'num_input_tokens_seen': 36846, 'train_runtime': '20.14', 'train_tokens_per_second': '1830'}
|
| 22 |
+
{'loss': '1.826', 'grad_norm': '0.2659', 'learning_rate': '2.419e-06', 'epoch': '0.005109', 'num_input_tokens_seen': 38893, 'train_runtime': '21.15', 'train_tokens_per_second': '1839'}
|
| 23 |
+
{'loss': '1.536', 'grad_norm': '0.2742', 'learning_rate': '2.554e-06', 'epoch': '0.005378', 'num_input_tokens_seen': 40940, 'train_runtime': '22.16', 'train_tokens_per_second': '1847'}
|
| 24 |
+
{'loss': '1.67', 'grad_norm': '0.2687', 'learning_rate': '2.688e-06', 'epoch': '0.005647', 'num_input_tokens_seen': 42987, 'train_runtime': '23.17', 'train_tokens_per_second': '1855'}
|
| 25 |
+
{'loss': '1.548', 'grad_norm': '0.2588', 'learning_rate': '2.823e-06', 'epoch': '0.005916', 'num_input_tokens_seen': 45034, 'train_runtime': '24.18', 'train_tokens_per_second': '1862'}
|
| 26 |
+
{'loss': '1.866', 'grad_norm': '0.2874', 'learning_rate': '2.957e-06', 'epoch': '0.006184', 'num_input_tokens_seen': 47081, 'train_runtime': '25.19', 'train_tokens_per_second': '1869'}
|
| 27 |
+
{'loss': '1.764', 'grad_norm': '0.2764', 'learning_rate': '3.091e-06', 'epoch': '0.006453', 'num_input_tokens_seen': 49128, 'train_runtime': '26.2', 'train_tokens_per_second': '1875'}
|
| 28 |
+
{'loss': '1.937', 'grad_norm': '0.2965', 'learning_rate': '3.226e-06', 'epoch': '0.006722', 'num_input_tokens_seen': 51175, 'train_runtime': '27.21', 'train_tokens_per_second': '1881'}
|
| 29 |
+
{'loss': '1.627', 'grad_norm': '0.2888', 'learning_rate': '3.36e-06', 'epoch': '0.006991', 'num_input_tokens_seen': 53222, 'train_runtime': '28.23', 'train_tokens_per_second': '1886'}
|
| 30 |
+
{'loss': '1.792', 'grad_norm': '0.3194', 'learning_rate': '3.495e-06', 'epoch': '0.00726', 'num_input_tokens_seen': 55269, 'train_runtime': '29.24', 'train_tokens_per_second': '1890'}
|
| 31 |
+
{'loss': '1.725', 'grad_norm': '0.2937', 'learning_rate': '3.629e-06', 'epoch': '0.007529', 'num_input_tokens_seen': 57316, 'train_runtime': '30.25', 'train_tokens_per_second': '1895'}
|
| 32 |
+
{'loss': '1.871', 'grad_norm': '0.2757', 'learning_rate': '3.763e-06', 'epoch': '0.007798', 'num_input_tokens_seen': 59363, 'train_runtime': '31.26', 'train_tokens_per_second': '1899'}
|
| 33 |
+
{'loss': '1.838', 'grad_norm': '0.2773', 'learning_rate': '3.898e-06', 'epoch': '0.008067', 'num_input_tokens_seen': 61410, 'train_runtime': '32.27', 'train_tokens_per_second': '1903'}
|
| 34 |
+
{'loss': '1.909', 'grad_norm': '0.3041', 'learning_rate': '4.032e-06', 'epoch': '0.008336', 'num_input_tokens_seen': 63457, 'train_runtime': '33.28', 'train_tokens_per_second': '1907'}
|
| 35 |
+
{'loss': '1.725', 'grad_norm': '0.2885', 'learning_rate': '4.167e-06', 'epoch': '0.008604', 'num_input_tokens_seen': 65504, 'train_runtime': '34.29', 'train_tokens_per_second': '1910'}
|
| 36 |
+
{'loss': '1.747', 'grad_norm': '0.3163', 'learning_rate': '4.301e-06', 'epoch': '0.008873', 'num_input_tokens_seen': 67551, 'train_runtime': '35.3', 'train_tokens_per_second': '1914'}
|
| 37 |
+
{'loss': '1.909', 'grad_norm': '0.2977', 'learning_rate': '4.435e-06', 'epoch': '0.009142', 'num_input_tokens_seen': 69598, 'train_runtime': '36.31', 'train_tokens_per_second': '1917'}
|
| 38 |
+
{'loss': '1.641', 'grad_norm': '0.275', 'learning_rate': '4.57e-06', 'epoch': '0.009411', 'num_input_tokens_seen': 71645, 'train_runtime': '37.32', 'train_tokens_per_second': '1920'}
|
| 39 |
+
{'loss': '1.782', 'grad_norm': '0.3019', 'learning_rate': '4.704e-06', 'epoch': '0.00968', 'num_input_tokens_seen': 73692, 'train_runtime': '38.33', 'train_tokens_per_second': '1922'}
|
| 40 |
+
{'loss': '1.83', 'grad_norm': '0.3124', 'learning_rate': '4.839e-06', 'epoch': '0.009949', 'num_input_tokens_seen': 75739, 'train_runtime': '39.34', 'train_tokens_per_second': '1925'}
|
| 41 |
+
{'loss': '1.856', 'grad_norm': '0.2672', 'learning_rate': '4.973e-06', 'epoch': '0.01022', 'num_input_tokens_seen': 77786, 'train_runtime': '40.36', 'train_tokens_per_second': '1927'}
|
| 42 |
+
{'loss': '1.965', 'grad_norm': '0.297', 'learning_rate': '5.108e-06', 'epoch': '0.01049', 'num_input_tokens_seen': 79833, 'train_runtime': '41.37', 'train_tokens_per_second': '1930'}
|
| 43 |
+
{'loss': '1.935', 'grad_norm': '0.337', 'learning_rate': '5.242e-06', 'epoch': '0.01076', 'num_input_tokens_seen': 81880, 'train_runtime': '42.39', 'train_tokens_per_second': '1932'}
|
| 44 |
+
{'loss': '1.725', 'grad_norm': '0.3097', 'learning_rate': '5.376e-06', 'epoch': '0.01102', 'num_input_tokens_seen': 83927, 'train_runtime': '43.4', 'train_tokens_per_second': '1934'}
|
| 45 |
+
{'loss': '1.534', 'grad_norm': '0.2637', 'learning_rate': '5.511e-06', 'epoch': '0.01129', 'num_input_tokens_seen': 85974, 'train_runtime': '44.42', 'train_tokens_per_second': '1935'}
|
| 46 |
+
{'loss': '1.764', 'grad_norm': '0.2742', 'learning_rate': '5.645e-06', 'epoch': '0.01156', 'num_input_tokens_seen': 88021, 'train_runtime': '45.43', 'train_tokens_per_second': '1937'}
|
| 47 |
+
{'loss': '1.696', 'grad_norm': '0.2804', 'learning_rate': '5.78e-06', 'epoch': '0.01183', 'num_input_tokens_seen': 90068, 'train_runtime': '46.45', 'train_tokens_per_second': '1939'}
|
| 48 |
+
{'loss': '1.725', 'grad_norm': '0.279', 'learning_rate': '5.914e-06', 'epoch': '0.0121', 'num_input_tokens_seen': 92115, 'train_runtime': '47.46', 'train_tokens_per_second': '1941'}
|
| 49 |
+
{'loss': '1.981', 'grad_norm': '0.3061', 'learning_rate': '6.048e-06', 'epoch': '0.01237', 'num_input_tokens_seen': 94162, 'train_runtime': '48.47', 'train_tokens_per_second': '1943'}
|
| 50 |
+
{'loss': '1.589', 'grad_norm': '0.2909', 'learning_rate': '6.183e-06', 'epoch': '0.01264', 'num_input_tokens_seen': 96209, 'train_runtime': '49.48', 'train_tokens_per_second': '1944'}
|
| 51 |
+
{'loss': '1.776', 'grad_norm': '0.338', 'learning_rate': '6.317e-06', 'epoch': '0.01291', 'num_input_tokens_seen': 98256, 'train_runtime': '50.49', 'train_tokens_per_second': '1946'}
|
| 52 |
+
{'loss': '1.855', 'grad_norm': '0.2965', 'learning_rate': '6.452e-06', 'epoch': '0.01318', 'num_input_tokens_seen': 100303, 'train_runtime': '51.51', 'train_tokens_per_second': '1947'}
|
| 53 |
+
{'loss': '1.635', 'grad_norm': '0.3187', 'learning_rate': '6.586e-06', 'epoch': '0.01344', 'num_input_tokens_seen': 102350, 'train_runtime': '52.52', 'train_tokens_per_second': '1949'}
|
| 54 |
+
{'loss': '1.884', 'grad_norm': '0.3086', 'learning_rate': '6.72e-06', 'epoch': '0.01371', 'num_input_tokens_seen': 104397, 'train_runtime': '53.53', 'train_tokens_per_second': '1950'}
|
| 55 |
+
{'loss': '1.779', 'grad_norm': '0.3112', 'learning_rate': '6.855e-06', 'epoch': '0.01398', 'num_input_tokens_seen': 106444, 'train_runtime': '54.55', 'train_tokens_per_second': '1951'}
|
| 56 |
+
{'loss': '1.85', 'grad_norm': '0.3581', 'learning_rate': '6.989e-06', 'epoch': '0.01425', 'num_input_tokens_seen': 108491, 'train_runtime': '55.56', 'train_tokens_per_second': '1953'}
|
| 57 |
+
{'loss': '1.611', 'grad_norm': '0.7226', 'learning_rate': '7.124e-06', 'epoch': '0.01452', 'num_input_tokens_seen': 110538, 'train_runtime': '56.57', 'train_tokens_per_second': '1954'}
|
| 58 |
+
{'loss': '1.643', 'grad_norm': '0.2939', 'learning_rate': '7.258e-06', 'epoch': '0.01479', 'num_input_tokens_seen': 112585, 'train_runtime': '57.58', 'train_tokens_per_second': '1955'}
|
| 59 |
+
{'loss': '1.978', 'grad_norm': '0.3302', 'learning_rate': '7.392e-06', 'epoch': '0.01506', 'num_input_tokens_seen': 114632, 'train_runtime': '58.6', 'train_tokens_per_second': '1956'}
|
| 60 |
+
{'loss': '1.473', 'grad_norm': '0.3044', 'learning_rate': '7.527e-06', 'epoch': '0.01533', 'num_input_tokens_seen': 116679, 'train_runtime': '59.62', 'train_tokens_per_second': '1957'}
|
| 61 |
+
{'loss': '1.559', 'grad_norm': '0.3122', 'learning_rate': '7.661e-06', 'epoch': '0.0156', 'num_input_tokens_seen': 118726, 'train_runtime': '60.64', 'train_tokens_per_second': '1958'}
|
| 62 |
+
{'loss': '1.793', 'grad_norm': '0.344', 'learning_rate': '7.796e-06', 'epoch': '0.01586', 'num_input_tokens_seen': 120773, 'train_runtime': '61.65', 'train_tokens_per_second': '1959'}
|
| 63 |
+
{'loss': '1.589', 'grad_norm': '0.3391', 'learning_rate': '7.93e-06', 'epoch': '0.01613', 'num_input_tokens_seen': 122820, 'train_runtime': '62.66', 'train_tokens_per_second': '1960'}
|
| 64 |
+
{'loss': '1.713', 'grad_norm': '0.3023', 'learning_rate': '8.065e-06', 'epoch': '0.0164', 'num_input_tokens_seen': 124867, 'train_runtime': '63.68', 'train_tokens_per_second': '1961'}
|
| 65 |
+
{'loss': '1.704', 'grad_norm': '0.3436', 'learning_rate': '8.199e-06', 'epoch': '0.01667', 'num_input_tokens_seen': 126914, 'train_runtime': '64.7', 'train_tokens_per_second': '1962'}
|
| 66 |
+
{'loss': '1.908', 'grad_norm': '0.3627', 'learning_rate': '8.333e-06', 'epoch': '0.01694', 'num_input_tokens_seen': 128961, 'train_runtime': '65.71', 'train_tokens_per_second': '1963'}
|
| 67 |
+
{'loss': '1.799', 'grad_norm': '0.3663', 'learning_rate': '8.468e-06', 'epoch': '0.01721', 'num_input_tokens_seen': 131008, 'train_runtime': '66.72', 'train_tokens_per_second': '1963'}
|
| 68 |
+
{'loss': '1.855', 'grad_norm': '0.3834', 'learning_rate': '8.602e-06', 'epoch': '0.01748', 'num_input_tokens_seen': 133055, 'train_runtime': '67.74', 'train_tokens_per_second': '1964'}
|
| 69 |
+
{'loss': '1.805', 'grad_norm': '0.3678', 'learning_rate': '8.737e-06', 'epoch': '0.01775', 'num_input_tokens_seen': 135102, 'train_runtime': '68.76', 'train_tokens_per_second': '1965'}
|
| 70 |
+
{'loss': '1.436', 'grad_norm': '0.3304', 'learning_rate': '8.871e-06', 'epoch': '0.01802', 'num_input_tokens_seen': 137149, 'train_runtime': '69.77', 'train_tokens_per_second': '1966'}
|
| 71 |
+
{'loss': '1.746', 'grad_norm': '0.307', 'learning_rate': '9.005e-06', 'epoch': '0.01828', 'num_input_tokens_seen': 139196, 'train_runtime': '70.78', 'train_tokens_per_second': '1966'}
|
| 72 |
+
{'loss': '1.823', 'grad_norm': '0.3547', 'learning_rate': '9.14e-06', 'epoch': '0.01855', 'num_input_tokens_seen': 141243, 'train_runtime': '71.8', 'train_tokens_per_second': '1967'}
|
| 73 |
+
{'loss': '1.66', 'grad_norm': '0.3379', 'learning_rate': '9.274e-06', 'epoch': '0.01882', 'num_input_tokens_seen': 143290, 'train_runtime': '72.82', 'train_tokens_per_second': '1968'}
|
| 74 |
+
{'loss': '1.913', 'grad_norm': '0.3416', 'learning_rate': '9.409e-06', 'epoch': '0.01909', 'num_input_tokens_seen': 145337, 'train_runtime': '73.84', 'train_tokens_per_second': '1968'}
|
| 75 |
+
{'loss': '1.814', 'grad_norm': '0.3721', 'learning_rate': '9.543e-06', 'epoch': '0.01936', 'num_input_tokens_seen': 147384, 'train_runtime': '74.85', 'train_tokens_per_second': '1969'}
|
| 76 |
+
{'loss': '1.797', 'grad_norm': '0.373', 'learning_rate': '9.677e-06', 'epoch': '0.01963', 'num_input_tokens_seen': 149431, 'train_runtime': '75.87', 'train_tokens_per_second': '1970'}
|
| 77 |
+
{'loss': '1.704', 'grad_norm': '0.3735', 'learning_rate': '9.812e-06', 'epoch': '0.0199', 'num_input_tokens_seen': 151478, 'train_runtime': '76.88', 'train_tokens_per_second': '1970'}
|
| 78 |
+
{'loss': '1.578', 'grad_norm': '0.3312', 'learning_rate': '9.946e-06', 'epoch': '0.02017', 'num_input_tokens_seen': 153525, 'train_runtime': '77.9', 'train_tokens_per_second': '1971'}
|
| 79 |
+
{'loss': '1.712', 'grad_norm': '0.3716', 'learning_rate': '1.008e-05', 'epoch': '0.02044', 'num_input_tokens_seen': 155572, 'train_runtime': '78.91', 'train_tokens_per_second': '1971'}
|
| 80 |
+
{'loss': '1.758', 'grad_norm': '0.3477', 'learning_rate': '1.022e-05', 'epoch': '0.0207', 'num_input_tokens_seen': 157619, 'train_runtime': '79.93', 'train_tokens_per_second': '1972'}
|
| 81 |
+
{'loss': '1.85', 'grad_norm': '0.374', 'learning_rate': '1.035e-05', 'epoch': '0.02097', 'num_input_tokens_seen': 159666, 'train_runtime': '80.94', 'train_tokens_per_second': '1973'}
|
| 82 |
+
{'loss': '1.77', 'grad_norm': '0.3782', 'learning_rate': '1.048e-05', 'epoch': '0.02124', 'num_input_tokens_seen': 161713, 'train_runtime': '81.96', 'train_tokens_per_second': '1973'}
|
| 83 |
+
{'loss': '1.592', 'grad_norm': '0.3265', 'learning_rate': '1.062e-05', 'epoch': '0.02151', 'num_input_tokens_seen': 163760, 'train_runtime': '82.98', 'train_tokens_per_second': '1974'}
|
| 84 |
+
{'loss': '1.684', 'grad_norm': '0.3949', 'learning_rate': '1.075e-05', 'epoch': '0.02178', 'num_input_tokens_seen': 165807, 'train_runtime': '83.99', 'train_tokens_per_second': '1974'}
|
| 85 |
+
{'loss': '1.416', 'grad_norm': '0.339', 'learning_rate': '1.089e-05', 'epoch': '0.02205', 'num_input_tokens_seen': 167854, 'train_runtime': '85.01', 'train_tokens_per_second': '1975'}
|
| 86 |
+
{'loss': '1.275', 'grad_norm': '0.3412', 'learning_rate': '1.102e-05', 'epoch': '0.02232', 'num_input_tokens_seen': 169901, 'train_runtime': '86.02', 'train_tokens_per_second': '1975'}
|
| 87 |
+
{'loss': '1.798', 'grad_norm': '0.4259', 'learning_rate': '1.116e-05', 'epoch': '0.02259', 'num_input_tokens_seen': 171948, 'train_runtime': '87.04', 'train_tokens_per_second': '1976'}
|
| 88 |
+
{'loss': '1.631', 'grad_norm': '0.3738', 'learning_rate': '1.129e-05', 'epoch': '0.02286', 'num_input_tokens_seen': 173995, 'train_runtime': '88.05', 'train_tokens_per_second': '1976'}
|
| 89 |
+
{'loss': '1.695', 'grad_norm': '0.3967', 'learning_rate': '1.142e-05', 'epoch': '0.02312', 'num_input_tokens_seen': 176042, 'train_runtime': '89.07', 'train_tokens_per_second': '1976'}
|
| 90 |
+
{'loss': '1.809', 'grad_norm': '0.3775', 'learning_rate': '1.156e-05', 'epoch': '0.02339', 'num_input_tokens_seen': 178089, 'train_runtime': '90.09', 'train_tokens_per_second': '1977'}
|
| 91 |
+
{'loss': '1.628', 'grad_norm': '0.3732', 'learning_rate': '1.169e-05', 'epoch': '0.02366', 'num_input_tokens_seen': 180136, 'train_runtime': '91.1', 'train_tokens_per_second': '1977'}
|
| 92 |
+
{'loss': '1.771', 'grad_norm': '0.397', 'learning_rate': '1.183e-05', 'epoch': '0.02393', 'num_input_tokens_seen': 182183, 'train_runtime': '92.12', 'train_tokens_per_second': '1978'}
|
| 93 |
+
{'loss': '1.708', 'grad_norm': '0.4329', 'learning_rate': '1.196e-05', 'epoch': '0.0242', 'num_input_tokens_seen': 184230, 'train_runtime': '93.14', 'train_tokens_per_second': '1978'}
|
| 94 |
+
{'loss': '1.629', 'grad_norm': '0.391', 'learning_rate': '1.21e-05', 'epoch': '0.02447', 'num_input_tokens_seen': 186277, 'train_runtime': '94.15', 'train_tokens_per_second': '1978'}
|
| 95 |
+
{'loss': '1.69', 'grad_norm': '0.416', 'learning_rate': '1.223e-05', 'epoch': '0.02474', 'num_input_tokens_seen': 188324, 'train_runtime': '95.17', 'train_tokens_per_second': '1979'}
|
| 96 |
+
{'loss': '1.882', 'grad_norm': '0.4379', 'learning_rate': '1.237e-05', 'epoch': '0.02501', 'num_input_tokens_seen': 190371, 'train_runtime': '96.19', 'train_tokens_per_second': '1979'}
|
| 97 |
+
{'loss': '1.764', 'grad_norm': '0.417', 'learning_rate': '1.25e-05', 'epoch': '0.02528', 'num_input_tokens_seen': 192418, 'train_runtime': '97.2', 'train_tokens_per_second': '1980'}
|
| 98 |
+
{'loss': '1.675', 'grad_norm': '0.4218', 'learning_rate': '1.263e-05', 'epoch': '0.02554', 'num_input_tokens_seen': 194465, 'train_runtime': '98.22', 'train_tokens_per_second': '1980'}
|
| 99 |
+
{'loss': '1.749', 'grad_norm': '0.4339', 'learning_rate': '1.277e-05', 'epoch': '0.02581', 'num_input_tokens_seen': 196512, 'train_runtime': '99.24', 'train_tokens_per_second': '1980'}
|
| 100 |
+
{'loss': '1.792', 'grad_norm': '0.4553', 'learning_rate': '1.29e-05', 'epoch': '0.02608', 'num_input_tokens_seen': 198559, 'train_runtime': '100.3', 'train_tokens_per_second': '1981'}
|
| 101 |
+
{'loss': '1.597', 'grad_norm': '0.4142', 'learning_rate': '1.304e-05', 'epoch': '0.02635', 'num_input_tokens_seen': 200606, 'train_runtime': '101.3', 'train_tokens_per_second': '1981'}
|
| 102 |
+
{'loss': '1.534', 'grad_norm': '0.4112', 'learning_rate': '1.317e-05', 'epoch': '0.02662', 'num_input_tokens_seen': 202653, 'train_runtime': '102.3', 'train_tokens_per_second': '1981'}
|
| 103 |
+
{'loss': '1.607', 'grad_norm': '0.4382', 'learning_rate': '1.331e-05', 'epoch': '0.02689', 'num_input_tokens_seen': 204700, 'train_runtime': '103.3', 'train_tokens_per_second': '1982'}
|
| 104 |
+
{'loss': '1.306', 'grad_norm': '0.3857', 'learning_rate': '1.344e-05', 'epoch': '0.02716', 'num_input_tokens_seen': 206747, 'train_runtime': '104.3', 'train_tokens_per_second': '1982'}
|
| 105 |
+
{'loss': '1.775', 'grad_norm': '0.4403', 'learning_rate': '1.358e-05', 'epoch': '0.02743', 'num_input_tokens_seen': 208794, 'train_runtime': '105.3', 'train_tokens_per_second': '1982'}
|
| 106 |
+
{'loss': '1.163', 'grad_norm': '0.4105', 'learning_rate': '1.371e-05', 'epoch': '0.0277', 'num_input_tokens_seen': 210841, 'train_runtime': '106.4', 'train_tokens_per_second': '1982'}
|
| 107 |
+
{'loss': '1.773', 'grad_norm': '0.467', 'learning_rate': '1.384e-05', 'epoch': '0.02796', 'num_input_tokens_seen': 212888, 'train_runtime': '107.4', 'train_tokens_per_second': '1983'}
|
| 108 |
+
{'loss': '1.548', 'grad_norm': '0.4103', 'learning_rate': '1.398e-05', 'epoch': '0.02823', 'num_input_tokens_seen': 214935, 'train_runtime': '108.4', 'train_tokens_per_second': '1983'}
|
| 109 |
+
{'loss': '1.663', 'grad_norm': '0.4564', 'learning_rate': '1.411e-05', 'epoch': '0.0285', 'num_input_tokens_seen': 216982, 'train_runtime': '109.4', 'train_tokens_per_second': '1983'}
|
| 110 |
+
{'loss': '1.709', 'grad_norm': '0.5568', 'learning_rate': '1.425e-05', 'epoch': '0.02877', 'num_input_tokens_seen': 219029, 'train_runtime': '110.4', 'train_tokens_per_second': '1984'}
|
| 111 |
+
{'loss': '1.683', 'grad_norm': '0.4596', 'learning_rate': '1.438e-05', 'epoch': '0.02904', 'num_input_tokens_seen': 221076, 'train_runtime': '111.4', 'train_tokens_per_second': '1984'}
|
| 112 |
+
{'loss': '1.786', 'grad_norm': '0.488', 'learning_rate': '1.452e-05', 'epoch': '0.02931', 'num_input_tokens_seen': 223123, 'train_runtime': '112.5', 'train_tokens_per_second': '1984'}
|
| 113 |
+
{'loss': '1.593', 'grad_norm': '0.4877', 'learning_rate': '1.465e-05', 'epoch': '0.02958', 'num_input_tokens_seen': 225170, 'train_runtime': '113.5', 'train_tokens_per_second': '1984'}
|
| 114 |
+
{'loss': '1.144', 'grad_norm': '0.4087', 'learning_rate': '1.478e-05', 'epoch': '0.02985', 'num_input_tokens_seen': 227217, 'train_runtime': '114.5', 'train_tokens_per_second': '1985'}
|
| 115 |
+
{'loss': '1.632', 'grad_norm': '0.4522', 'learning_rate': '1.492e-05', 'epoch': '0.03012', 'num_input_tokens_seen': 229264, 'train_runtime': '115.5', 'train_tokens_per_second': '1985'}
|
| 116 |
+
{'loss': '1.575', 'grad_norm': '0.4504', 'learning_rate': '1.505e-05', 'epoch': '0.03038', 'num_input_tokens_seen': 231311, 'train_runtime': '116.5', 'train_tokens_per_second': '1985'}
|
| 117 |
+
{'loss': '1.705', 'grad_norm': '0.4647', 'learning_rate': '1.519e-05', 'epoch': '0.03065', 'num_input_tokens_seen': 233358, 'train_runtime': '117.5', 'train_tokens_per_second': '1985'}
|
| 118 |
+
{'loss': '1.651', 'grad_norm': '0.4929', 'learning_rate': '1.532e-05', 'epoch': '0.03092', 'num_input_tokens_seen': 235405, 'train_runtime': '118.6', 'train_tokens_per_second': '1986'}
|
| 119 |
+
{'loss': '1.614', 'grad_norm': '0.4435', 'learning_rate': '1.546e-05', 'epoch': '0.03119', 'num_input_tokens_seen': 237452, 'train_runtime': '119.6', 'train_tokens_per_second': '1986'}
|
| 120 |
+
{'loss': '1.159', 'grad_norm': '0.4458', 'learning_rate': '1.559e-05', 'epoch': '0.03146', 'num_input_tokens_seen': 239499, 'train_runtime': '120.6', 'train_tokens_per_second': '1986'}
|
| 121 |
+
{'loss': '1.606', 'grad_norm': '0.5428', 'learning_rate': '1.573e-05', 'epoch': '0.03173', 'num_input_tokens_seen': 241546, 'train_runtime': '121.6', 'train_tokens_per_second': '1986'}
|
| 122 |
+
{'loss': '1.744', 'grad_norm': '0.5349', 'learning_rate': '1.586e-05', 'epoch': '0.032', 'num_input_tokens_seen': 243593, 'train_runtime': '122.6', 'train_tokens_per_second': '1986'}
|
| 123 |
+
{'loss': '1.527', 'grad_norm': '0.5387', 'learning_rate': '1.599e-05', 'epoch': '0.03227', 'num_input_tokens_seen': 245640, 'train_runtime': '123.6', 'train_tokens_per_second': '1987'}
|
| 124 |
+
{'loss': '1.52', 'grad_norm': '0.5221', 'learning_rate': '1.613e-05', 'epoch': '0.03254', 'num_input_tokens_seen': 247687, 'train_runtime': '124.7', 'train_tokens_per_second': '1987'}
|
| 125 |
+
{'loss': '1.561', 'grad_norm': '0.537', 'learning_rate': '1.626e-05', 'epoch': '0.0328', 'num_input_tokens_seen': 249734, 'train_runtime': '125.7', 'train_tokens_per_second': '1987'}
|
| 126 |
+
{'loss': '1.633', 'grad_norm': '0.5059', 'learning_rate': '1.64e-05', 'epoch': '0.03307', 'num_input_tokens_seen': 251781, 'train_runtime': '126.7', 'train_tokens_per_second': '1987'}
|
| 127 |
+
{'loss': '1.475', 'grad_norm': '0.4845', 'learning_rate': '1.653e-05', 'epoch': '0.03334', 'num_input_tokens_seen': 253828, 'train_runtime': '127.7', 'train_tokens_per_second': '1987'}
|
| 128 |
+
{'loss': '1.531', 'grad_norm': '0.5408', 'learning_rate': '1.667e-05', 'epoch': '0.03361', 'num_input_tokens_seen': 255875, 'train_runtime': '128.7', 'train_tokens_per_second': '1988'}
|
| 129 |
+
{'loss': '1.483', 'grad_norm': '0.5341', 'learning_rate': '1.68e-05', 'epoch': '0.03388', 'num_input_tokens_seen': 257922, 'train_runtime': '129.8', 'train_tokens_per_second': '1988'}
|
| 130 |
+
{'loss': '1.496', 'grad_norm': '0.62', 'learning_rate': '1.694e-05', 'epoch': '0.03415', 'num_input_tokens_seen': 259969, 'train_runtime': '130.8', 'train_tokens_per_second': '1988'}
|
| 131 |
+
{'loss': '1.392', 'grad_norm': '0.5367', 'learning_rate': '1.707e-05', 'epoch': '0.03442', 'num_input_tokens_seen': 262016, 'train_runtime': '131.8', 'train_tokens_per_second': '1988'}
|
| 132 |
+
{'loss': '1.658', 'grad_norm': '0.6011', 'learning_rate': '1.72e-05', 'epoch': '0.03469', 'num_input_tokens_seen': 264063, 'train_runtime': '132.8', 'train_tokens_per_second': '1988'}
|
| 133 |
+
{'loss': '1.736', 'grad_norm': '0.6064', 'learning_rate': '1.734e-05', 'epoch': '0.03496', 'num_input_tokens_seen': 266110, 'train_runtime': '133.8', 'train_tokens_per_second': '1988'}
|
| 134 |
+
{'loss': '1.581', 'grad_norm': '0.5968', 'learning_rate': '1.747e-05', 'epoch': '0.03522', 'num_input_tokens_seen': 268157, 'train_runtime': '134.8', 'train_tokens_per_second': '1989'}
|
| 135 |
+
{'loss': '1.429', 'grad_norm': '0.4829', 'learning_rate': '1.761e-05', 'epoch': '0.03549', 'num_input_tokens_seen': 270204, 'train_runtime': '135.9', 'train_tokens_per_second': '1989'}
|
| 136 |
+
{'loss': '1.463', 'grad_norm': '0.5296', 'learning_rate': '1.774e-05', 'epoch': '0.03576', 'num_input_tokens_seen': 272251, 'train_runtime': '136.9', 'train_tokens_per_second': '1989'}
|
| 137 |
+
{'loss': '1.526', 'grad_norm': '0.6281', 'learning_rate': '1.788e-05', 'epoch': '0.03603', 'num_input_tokens_seen': 274298, 'train_runtime': '137.9', 'train_tokens_per_second': '1989'}
|
| 138 |
+
{'loss': '1.534', 'grad_norm': '0.6035', 'learning_rate': '1.801e-05', 'epoch': '0.0363', 'num_input_tokens_seen': 276345, 'train_runtime': '138.9', 'train_tokens_per_second': '1989'}
|
| 139 |
+
{'loss': '1.653', 'grad_norm': '0.5799', 'learning_rate': '1.815e-05', 'epoch': '0.03657', 'num_input_tokens_seen': 278392, 'train_runtime': '139.9', 'train_tokens_per_second': '1989'}
|
| 140 |
+
{'loss': '1.519', 'grad_norm': '0.6246', 'learning_rate': '1.828e-05', 'epoch': '0.03684', 'num_input_tokens_seen': 280439, 'train_runtime': '141', 'train_tokens_per_second': '1989'}
|
| 141 |
+
{'loss': '1.389', 'grad_norm': '0.5421', 'learning_rate': '1.841e-05', 'epoch': '0.03711', 'num_input_tokens_seen': 282486, 'train_runtime': '142', 'train_tokens_per_second': '1990'}
|
| 142 |
+
{'loss': '1.675', 'grad_norm': '0.6183', 'learning_rate': '1.855e-05', 'epoch': '0.03738', 'num_input_tokens_seen': 284533, 'train_runtime': '143', 'train_tokens_per_second': '1990'}
|
| 143 |
+
{'loss': '1.464', 'grad_norm': '0.5757', 'learning_rate': '1.868e-05', 'epoch': '0.03764', 'num_input_tokens_seen': 286580, 'train_runtime': '144', 'train_tokens_per_second': '1990'}
|
| 144 |
+
{'loss': '1.458', 'grad_norm': '0.5838', 'learning_rate': '1.882e-05', 'epoch': '0.03791', 'num_input_tokens_seen': 288627, 'train_runtime': '145', 'train_tokens_per_second': '1990'}
|
| 145 |
+
{'loss': '1.58', 'grad_norm': '0.6429', 'learning_rate': '1.895e-05', 'epoch': '0.03818', 'num_input_tokens_seen': 290674, 'train_runtime': '146.1', 'train_tokens_per_second': '1990'}
|
| 146 |
+
{'loss': '1.327', 'grad_norm': '0.571', 'learning_rate': '1.909e-05', 'epoch': '0.03845', 'num_input_tokens_seen': 292721, 'train_runtime': '147.1', 'train_tokens_per_second': '1990'}
|
| 147 |
+
{'loss': '1.603', 'grad_norm': '0.6355', 'learning_rate': '1.922e-05', 'epoch': '0.03872', 'num_input_tokens_seen': 294768, 'train_runtime': '148.1', 'train_tokens_per_second': '1990'}
|
| 148 |
+
{'loss': '1.377', 'grad_norm': '0.5791', 'learning_rate': '1.935e-05', 'epoch': '0.03899', 'num_input_tokens_seen': 296815, 'train_runtime': '149.1', 'train_tokens_per_second': '1991'}
|
| 149 |
+
{'loss': '1.409', 'grad_norm': '0.6662', 'learning_rate': '1.949e-05', 'epoch': '0.03926', 'num_input_tokens_seen': 298862, 'train_runtime': '150.1', 'train_tokens_per_second': '1991'}
|
| 150 |
+
{'loss': '1.234', 'grad_norm': '0.5858', 'learning_rate': '1.962e-05', 'epoch': '0.03953', 'num_input_tokens_seen': 300909, 'train_runtime': '151.2', 'train_tokens_per_second': '1991'}
|
| 151 |
+
{'loss': '1.58', 'grad_norm': '0.6273', 'learning_rate': '1.976e-05', 'epoch': '0.0398', 'num_input_tokens_seen': 302956, 'train_runtime': '152.2', 'train_tokens_per_second': '1991'}
|
| 152 |
+
{'loss': '1.393', 'grad_norm': '0.6303', 'learning_rate': '1.989e-05', 'epoch': '0.04006', 'num_input_tokens_seen': 305003, 'train_runtime': '153.2', 'train_tokens_per_second': '1991'}
|
| 153 |
+
{'loss': '1.48', 'grad_norm': '0.7072', 'learning_rate': '2.003e-05', 'epoch': '0.04033', 'num_input_tokens_seen': 307050, 'train_runtime': '154.2', 'train_tokens_per_second': '1991'}
|
| 154 |
+
{'loss': '1.548', 'grad_norm': '0.7448', 'learning_rate': '2.016e-05', 'epoch': '0.0406', 'num_input_tokens_seen': 309097, 'train_runtime': '155.2', 'train_tokens_per_second': '1991'}
|
| 155 |
+
{'loss': '1.567', 'grad_norm': '0.7425', 'learning_rate': '2.03e-05', 'epoch': '0.04087', 'num_input_tokens_seen': 311144, 'train_runtime': '156.2', 'train_tokens_per_second': '1991'}
|
| 156 |
+
{'loss': '1.282', 'grad_norm': '0.5985', 'learning_rate': '2.043e-05', 'epoch': '0.04114', 'num_input_tokens_seen': 313191, 'train_runtime': '157.3', 'train_tokens_per_second': '1992'}
|
| 157 |
+
{'loss': '1.438', 'grad_norm': '0.7234', 'learning_rate': '2.056e-05', 'epoch': '0.04141', 'num_input_tokens_seen': 315238, 'train_runtime': '158.3', 'train_tokens_per_second': '1992'}
|
| 158 |
+
{'loss': '1.454', 'grad_norm': '0.6636', 'learning_rate': '2.07e-05', 'epoch': '0.04168', 'num_input_tokens_seen': 317285, 'train_runtime': '159.3', 'train_tokens_per_second': '1992'}
|
| 159 |
+
{'loss': '1.461', 'grad_norm': '0.7192', 'learning_rate': '2.083e-05', 'epoch': '0.04195', 'num_input_tokens_seen': 319332, 'train_runtime': '160.3', 'train_tokens_per_second': '1992'}
|
| 160 |
+
{'loss': '1.385', 'grad_norm': '0.7114', 'learning_rate': '2.097e-05', 'epoch': '0.04222', 'num_input_tokens_seen': 321379, 'train_runtime': '161.3', 'train_tokens_per_second': '1992'}
|
| 161 |
+
{'loss': '1.568', 'grad_norm': '0.9612', 'learning_rate': '2.11e-05', 'epoch': '0.04248', 'num_input_tokens_seen': 323426, 'train_runtime': '162.4', 'train_tokens_per_second': '1992'}
|
| 162 |
+
{'loss': '1.551', 'grad_norm': '0.7511', 'learning_rate': '2.124e-05', 'epoch': '0.04275', 'num_input_tokens_seen': 325473, 'train_runtime': '163.4', 'train_tokens_per_second': '1992'}
|
| 163 |
+
{'loss': '1.468', 'grad_norm': '0.771', 'learning_rate': '2.137e-05', 'epoch': '0.04302', 'num_input_tokens_seen': 327520, 'train_runtime': '164.4', 'train_tokens_per_second': '1992'}
|
| 164 |
+
{'loss': '1.486', 'grad_norm': '0.7804', 'learning_rate': '2.151e-05', 'epoch': '0.04329', 'num_input_tokens_seen': 329567, 'train_runtime': '165.4', 'train_tokens_per_second': '1992'}
|
| 165 |
+
{'loss': '1.426', 'grad_norm': '0.8093', 'learning_rate': '2.164e-05', 'epoch': '0.04356', 'num_input_tokens_seen': 331614, 'train_runtime': '166.4', 'train_tokens_per_second': '1992'}
|
| 166 |
+
{'loss': '1.331', 'grad_norm': '0.7181', 'learning_rate': '2.177e-05', 'epoch': '0.04383', 'num_input_tokens_seen': 333661, 'train_runtime': '167.4', 'train_tokens_per_second': '1993'}
|
| 167 |
+
{'loss': '1.026', 'grad_norm': '0.7177', 'learning_rate': '2.191e-05', 'epoch': '0.0441', 'num_input_tokens_seen': 335708, 'train_runtime': '168.5', 'train_tokens_per_second': '1993'}
|
| 168 |
+
{'loss': '1.391', 'grad_norm': '0.7581', 'learning_rate': '2.204e-05', 'epoch': '0.04437', 'num_input_tokens_seen': 337755, 'train_runtime': '169.5', 'train_tokens_per_second': '1993'}
|
| 169 |
+
{'loss': '1.388', 'grad_norm': '0.8128', 'learning_rate': '2.218e-05', 'epoch': '0.04464', 'num_input_tokens_seen': 339802, 'train_runtime': '170.5', 'train_tokens_per_second': '1993'}
|
| 170 |
+
{'loss': '1.494', 'grad_norm': '0.8851', 'learning_rate': '2.231e-05', 'epoch': '0.0449', 'num_input_tokens_seen': 341849, 'train_runtime': '171.5', 'train_tokens_per_second': '1993'}
|
| 171 |
+
{'loss': '1.275', 'grad_norm': '0.741', 'learning_rate': '2.245e-05', 'epoch': '0.04517', 'num_input_tokens_seen': 343896, 'train_runtime': '172.5', 'train_tokens_per_second': '1993'}
|
| 172 |
+
{'loss': '1.307', 'grad_norm': '0.7937', 'learning_rate': '2.258e-05', 'epoch': '0.04544', 'num_input_tokens_seen': 345943, 'train_runtime': '173.6', 'train_tokens_per_second': '1993'}
|
| 173 |
+
{'loss': '1.188', 'grad_norm': '0.758', 'learning_rate': '2.272e-05', 'epoch': '0.04571', 'num_input_tokens_seen': 347990, 'train_runtime': '174.6', 'train_tokens_per_second': '1993'}
|
| 174 |
+
{'loss': '1.371', 'grad_norm': '0.8093', 'learning_rate': '2.285e-05', 'epoch': '0.04598', 'num_input_tokens_seen': 350037, 'train_runtime': '175.6', 'train_tokens_per_second': '1993'}
|
| 175 |
+
{'loss': '1.234', 'grad_norm': '0.7643', 'learning_rate': '2.298e-05', 'epoch': '0.04625', 'num_input_tokens_seen': 352084, 'train_runtime': '176.6', 'train_tokens_per_second': '1994'}
|
| 176 |
+
{'loss': '1.437', 'grad_norm': '0.8591', 'learning_rate': '2.312e-05', 'epoch': '0.04652', 'num_input_tokens_seen': 354131, 'train_runtime': '177.6', 'train_tokens_per_second': '1994'}
|
| 177 |
+
{'loss': '1.425', 'grad_norm': '1.101', 'learning_rate': '2.325e-05', 'epoch': '0.04679', 'num_input_tokens_seen': 356178, 'train_runtime': '178.7', 'train_tokens_per_second': '1994'}
|
| 178 |
+
{'loss': '1.402', 'grad_norm': '0.8633', 'learning_rate': '2.339e-05', 'epoch': '0.04706', 'num_input_tokens_seen': 358225, 'train_runtime': '179.7', 'train_tokens_per_second': '1994'}
|
| 179 |
+
{'loss': '1.33', 'grad_norm': '0.9336', 'learning_rate': '2.352e-05', 'epoch': '0.04732', 'num_input_tokens_seen': 360272, 'train_runtime': '180.7', 'train_tokens_per_second': '1994'}
|
| 180 |
+
{'loss': '1.189', 'grad_norm': '0.9058', 'learning_rate': '2.366e-05', 'epoch': '0.04759', 'num_input_tokens_seen': 362319, 'train_runtime': '181.7', 'train_tokens_per_second': '1994'}
|
| 181 |
+
{'loss': '1.383', 'grad_norm': '1.003', 'learning_rate': '2.379e-05', 'epoch': '0.04786', 'num_input_tokens_seen': 364366, 'train_runtime': '182.7', 'train_tokens_per_second': '1994'}
|
| 182 |
+
{'loss': '1.263', 'grad_norm': '0.949', 'learning_rate': '2.392e-05', 'epoch': '0.04813', 'num_input_tokens_seen': 366413, 'train_runtime': '183.7', 'train_tokens_per_second': '1994'}
|
| 183 |
+
{'loss': '1.473', 'grad_norm': '1.062', 'learning_rate': '2.406e-05', 'epoch': '0.0484', 'num_input_tokens_seen': 368460, 'train_runtime': '184.8', 'train_tokens_per_second': '1994'}
|
| 184 |
+
{'loss': '1.218', 'grad_norm': '0.862', 'learning_rate': '2.419e-05', 'epoch': '0.04867', 'num_input_tokens_seen': 370507, 'train_runtime': '185.8', 'train_tokens_per_second': '1994'}
|
| 185 |
+
{'loss': '1.232', 'grad_norm': '1.03', 'learning_rate': '2.433e-05', 'epoch': '0.04894', 'num_input_tokens_seen': 372554, 'train_runtime': '186.8', 'train_tokens_per_second': '1994'}
|
| 186 |
+
{'loss': '1.243', 'grad_norm': '0.9608', 'learning_rate': '2.446e-05', 'epoch': '0.04921', 'num_input_tokens_seen': 374601, 'train_runtime': '187.8', 'train_tokens_per_second': '1994'}
|
| 187 |
+
{'loss': '1.423', 'grad_norm': '0.9823', 'learning_rate': '2.46e-05', 'epoch': '0.04948', 'num_input_tokens_seen': 376648, 'train_runtime': '188.8', 'train_tokens_per_second': '1995'}
|
| 188 |
+
{'loss': '1.176', 'grad_norm': '0.9865', 'learning_rate': '2.473e-05', 'epoch': '0.04974', 'num_input_tokens_seen': 378695, 'train_runtime': '189.9', 'train_tokens_per_second': '1995'}
|
| 189 |
+
{'loss': '1.323', 'grad_norm': '1.114', 'learning_rate': '2.487e-05', 'epoch': '0.05001', 'num_input_tokens_seen': 380742, 'train_runtime': '190.9', 'train_tokens_per_second': '1995'}
|
| 190 |
+
{'loss': '1.394', 'grad_norm': '1.221', 'learning_rate': '2.5e-05', 'epoch': '0.05028', 'num_input_tokens_seen': 382789, 'train_runtime': '191.9', 'train_tokens_per_second': '1995'}
|
| 191 |
+
{'loss': '1.228', 'grad_norm': '0.9599', 'learning_rate': '2.513e-05', 'epoch': '0.05055', 'num_input_tokens_seen': 384836, 'train_runtime': '192.9', 'train_tokens_per_second': '1995'}
|
| 192 |
+
{'loss': '0.9697', 'grad_norm': '6.034', 'learning_rate': '2.527e-05', 'epoch': '0.05082', 'num_input_tokens_seen': 386883, 'train_runtime': '193.9', 'train_tokens_per_second': '1995'}
|
| 193 |
+
{'loss': '1.253', 'grad_norm': '1.301', 'learning_rate': '2.54e-05', 'epoch': '0.05109', 'num_input_tokens_seen': 388930, 'train_runtime': '195', 'train_tokens_per_second': '1995'}
|
| 194 |
+
{'loss': '1.398', 'grad_norm': '1.082', 'learning_rate': '2.554e-05', 'epoch': '0.05136', 'num_input_tokens_seen': 390977, 'train_runtime': '196', 'train_tokens_per_second': '1995'}
|
| 195 |
+
{'loss': '1.206', 'grad_norm': '0.9854', 'learning_rate': '2.567e-05', 'epoch': '0.05163', 'num_input_tokens_seen': 393024, 'train_runtime': '197', 'train_tokens_per_second': '1995'}
|
| 196 |
+
{'loss': '1.27', 'grad_norm': '1.037', 'learning_rate': '2.581e-05', 'epoch': '0.0519', 'num_input_tokens_seen': 395071, 'train_runtime': '198', 'train_tokens_per_second': '1995'}
|
| 197 |
+
{'loss': '1.292', 'grad_norm': '1.057', 'learning_rate': '2.594e-05', 'epoch': '0.05216', 'num_input_tokens_seen': 397118, 'train_runtime': '199', 'train_tokens_per_second': '1995'}
|
| 198 |
+
{'loss': '1.272', 'grad_norm': '1.172', 'learning_rate': '2.608e-05', 'epoch': '0.05243', 'num_input_tokens_seen': 399165, 'train_runtime': '200.1', 'train_tokens_per_second': '1995'}
|
| 199 |
+
{'loss': '1.189', 'grad_norm': '1.225', 'learning_rate': '2.621e-05', 'epoch': '0.0527', 'num_input_tokens_seen': 401212, 'train_runtime': '201.1', 'train_tokens_per_second': '1995'}
|
| 200 |
+
{'loss': '1.259', 'grad_norm': '1.042', 'learning_rate': '2.634e-05', 'epoch': '0.05297', 'num_input_tokens_seen': 403259, 'train_runtime': '202.1', 'train_tokens_per_second': '1995'}
|
| 201 |
+
{'loss': '1.39', 'grad_norm': '1.074', 'learning_rate': '2.648e-05', 'epoch': '0.05324', 'num_input_tokens_seen': 405306, 'train_runtime': '203.1', 'train_tokens_per_second': '1995'}
|
| 202 |
+
{'loss': '1.152', 'grad_norm': '1.069', 'learning_rate': '2.661e-05', 'epoch': '0.05351', 'num_input_tokens_seen': 407353, 'train_runtime': '204.1', 'train_tokens_per_second': '1996'}
|
| 203 |
+
{'loss': '1.174', 'grad_norm': '1.032', 'learning_rate': '2.675e-05', 'epoch': '0.05378', 'num_input_tokens_seen': 409400, 'train_runtime': '205.1', 'train_tokens_per_second': '1996'}
|
| 204 |
+
{'loss': '1.095', 'grad_norm': '1.298', 'learning_rate': '2.688e-05', 'epoch': '0.05405', 'num_input_tokens_seen': 411447, 'train_runtime': '206.2', 'train_tokens_per_second': '1996'}
|
| 205 |
+
{'loss': '1.309', 'grad_norm': '1.595', 'learning_rate': '2.702e-05', 'epoch': '0.05432', 'num_input_tokens_seen': 413494, 'train_runtime': '207.2', 'train_tokens_per_second': '1996'}
|
| 206 |
+
{'loss': '1.312', 'grad_norm': '1.278', 'learning_rate': '2.715e-05', 'epoch': '0.05458', 'num_input_tokens_seen': 415541, 'train_runtime': '208.2', 'train_tokens_per_second': '1996'}
|
| 207 |
+
{'loss': '1.237', 'grad_norm': '1.138', 'learning_rate': '2.728e-05', 'epoch': '0.05485', 'num_input_tokens_seen': 417588, 'train_runtime': '209.2', 'train_tokens_per_second': '1996'}
|
| 208 |
+
{'loss': '1.268', 'grad_norm': '1.082', 'learning_rate': '2.742e-05', 'epoch': '0.05512', 'num_input_tokens_seen': 419635, 'train_runtime': '210.2', 'train_tokens_per_second': '1996'}
|
| 209 |
+
{'loss': '1.158', 'grad_norm': '1.197', 'learning_rate': '2.755e-05', 'epoch': '0.05539', 'num_input_tokens_seen': 421682, 'train_runtime': '211.3', 'train_tokens_per_second': '1996'}
|
| 210 |
+
{'loss': '1.114', 'grad_norm': '1.105', 'learning_rate': '2.769e-05', 'epoch': '0.05566', 'num_input_tokens_seen': 423729, 'train_runtime': '212.3', 'train_tokens_per_second': '1996'}
|
| 211 |
+
{'loss': '1.301', 'grad_norm': '1.159', 'learning_rate': '2.782e-05', 'epoch': '0.05593', 'num_input_tokens_seen': 425776, 'train_runtime': '213.3', 'train_tokens_per_second': '1996'}
|
| 212 |
+
{'loss': '1.239', 'grad_norm': '1.363', 'learning_rate': '2.796e-05', 'epoch': '0.0562', 'num_input_tokens_seen': 427823, 'train_runtime': '214.3', 'train_tokens_per_second': '1996'}
|
| 213 |
+
{'loss': '1.175', 'grad_norm': '1.773', 'learning_rate': '2.809e-05', 'epoch': '0.05647', 'num_input_tokens_seen': 429870, 'train_runtime': '215.3', 'train_tokens_per_second': '1996'}
|
| 214 |
+
{'loss': '1.356', 'grad_norm': '1.097', 'learning_rate': '2.823e-05', 'epoch': '0.05674', 'num_input_tokens_seen': 431917, 'train_runtime': '216.3', 'train_tokens_per_second': '1996'}
|
| 215 |
+
{'loss': '1.229', 'grad_norm': '1.221', 'learning_rate': '2.836e-05', 'epoch': '0.057', 'num_input_tokens_seen': 433964, 'train_runtime': '217.4', 'train_tokens_per_second': '1996'}
|
| 216 |
+
{'loss': '1.174', 'grad_norm': '1.157', 'learning_rate': '2.849e-05', 'epoch': '0.05727', 'num_input_tokens_seen': 436011, 'train_runtime': '218.4', 'train_tokens_per_second': '1996'}
|
| 217 |
+
{'loss': '1.068', 'grad_norm': '1.14', 'learning_rate': '2.863e-05', 'epoch': '0.05754', 'num_input_tokens_seen': 438058, 'train_runtime': '219.4', 'train_tokens_per_second': '1997'}
|
| 218 |
+
{'loss': '1.053', 'grad_norm': '1.118', 'learning_rate': '2.876e-05', 'epoch': '0.05781', 'num_input_tokens_seen': 440105, 'train_runtime': '220.4', 'train_tokens_per_second': '1997'}
|
| 219 |
+
{'loss': '1.067', 'grad_norm': '1.103', 'learning_rate': '2.89e-05', 'epoch': '0.05808', 'num_input_tokens_seen': 442152, 'train_runtime': '221.4', 'train_tokens_per_second': '1997'}
|
| 220 |
+
{'loss': '1.2', 'grad_norm': '1.188', 'learning_rate': '2.903e-05', 'epoch': '0.05835', 'num_input_tokens_seen': 444199, 'train_runtime': '222.5', 'train_tokens_per_second': '1997'}
|
| 221 |
+
{'loss': '1.151', 'grad_norm': '1.14', 'learning_rate': '2.917e-05', 'epoch': '0.05862', 'num_input_tokens_seen': 446246, 'train_runtime': '223.5', 'train_tokens_per_second': '1997'}
|
| 222 |
+
{'loss': '1.142', 'grad_norm': '1.394', 'learning_rate': '2.93e-05', 'epoch': '0.05889', 'num_input_tokens_seen': 448293, 'train_runtime': '224.5', 'train_tokens_per_second': '1997'}
|
| 223 |
+
{'loss': '1.119', 'grad_norm': '1.263', 'learning_rate': '2.944e-05', 'epoch': '0.05916', 'num_input_tokens_seen': 450340, 'train_runtime': '225.5', 'train_tokens_per_second': '1997'}
|
| 224 |
+
{'loss': '1.024', 'grad_norm': '1.189', 'learning_rate': '2.957e-05', 'epoch': '0.05942', 'num_input_tokens_seen': 452387, 'train_runtime': '226.5', 'train_tokens_per_second': '1997'}
|
| 225 |
+
{'loss': '1.227', 'grad_norm': '1.225', 'learning_rate': '2.97e-05', 'epoch': '0.05969', 'num_input_tokens_seen': 454434, 'train_runtime': '227.5', 'train_tokens_per_second': '1997'}
|
| 226 |
+
{'loss': '0.956', 'grad_norm': '1.254', 'learning_rate': '2.984e-05', 'epoch': '0.05996', 'num_input_tokens_seen': 456481, 'train_runtime': '228.6', 'train_tokens_per_second': '1997'}
|
| 227 |
+
{'loss': '1.182', 'grad_norm': '1.283', 'learning_rate': '2.997e-05', 'epoch': '0.06023', 'num_input_tokens_seen': 458528, 'train_runtime': '229.6', 'train_tokens_per_second': '1997'}
|
| 228 |
+
{'loss': '1.112', 'grad_norm': '1.259', 'learning_rate': '3.011e-05', 'epoch': '0.0605', 'num_input_tokens_seen': 460575, 'train_runtime': '230.6', 'train_tokens_per_second': '1997'}
|
| 229 |
+
{'loss': '1.142', 'grad_norm': '1.316', 'learning_rate': '3.024e-05', 'epoch': '0.06077', 'num_input_tokens_seen': 462622, 'train_runtime': '231.6', 'train_tokens_per_second': '1997'}
|
| 230 |
+
{'loss': '0.6945', 'grad_norm': '0.9457', 'learning_rate': '3.038e-05', 'epoch': '0.06104', 'num_input_tokens_seen': 464669, 'train_runtime': '232.6', 'train_tokens_per_second': '1997'}
|
| 231 |
+
{'loss': '1.13', 'grad_norm': '1.351', 'learning_rate': '3.051e-05', 'epoch': '0.06131', 'num_input_tokens_seen': 466716, 'train_runtime': '233.7', 'train_tokens_per_second': '1997'}
|
| 232 |
+
{'loss': '1.041', 'grad_norm': '1.223', 'learning_rate': '3.065e-05', 'epoch': '0.06158', 'num_input_tokens_seen': 468763, 'train_runtime': '234.7', 'train_tokens_per_second': '1998'}
|
| 233 |
+
{'loss': '0.9028', 'grad_norm': '1.262', 'learning_rate': '3.078e-05', 'epoch': '0.06184', 'num_input_tokens_seen': 470810, 'train_runtime': '235.7', 'train_tokens_per_second': '1998'}
|
| 234 |
+
{'loss': '1.115', 'grad_norm': '1.139', 'learning_rate': '3.091e-05', 'epoch': '0.06211', 'num_input_tokens_seen': 472857, 'train_runtime': '236.7', 'train_tokens_per_second': '1998'}
|
| 235 |
+
{'loss': '1.181', 'grad_norm': '1.256', 'learning_rate': '3.105e-05', 'epoch': '0.06238', 'num_input_tokens_seen': 474904, 'train_runtime': '237.7', 'train_tokens_per_second': '1998'}
|
| 236 |
+
{'loss': '1.177', 'grad_norm': '1.267', 'learning_rate': '3.118e-05', 'epoch': '0.06265', 'num_input_tokens_seen': 476951, 'train_runtime': '238.7', 'train_tokens_per_second': '1998'}
|
| 237 |
+
{'loss': '1.119', 'grad_norm': '1.256', 'learning_rate': '3.132e-05', 'epoch': '0.06292', 'num_input_tokens_seen': 478998, 'train_runtime': '239.8', 'train_tokens_per_second': '1997'}
|
| 238 |
+
{'loss': '1.147', 'grad_norm': '1.309', 'learning_rate': '3.145e-05', 'epoch': '0.06319', 'num_input_tokens_seen': 481045, 'train_runtime': '240.8', 'train_tokens_per_second': '1997'}
|
| 239 |
+
{'loss': '1.062', 'grad_norm': '1.274', 'learning_rate': '3.159e-05', 'epoch': '0.06346', 'num_input_tokens_seen': 483092, 'train_runtime': '241.9', 'train_tokens_per_second': '1997'}
|
| 240 |
+
{'loss': '1.136', 'grad_norm': '1.696', 'learning_rate': '3.172e-05', 'epoch': '0.06373', 'num_input_tokens_seen': 485139, 'train_runtime': '242.9', 'train_tokens_per_second': '1998'}
|
| 241 |
+
{'loss': '1.039', 'grad_norm': '1.379', 'learning_rate': '3.185e-05', 'epoch': '0.064', 'num_input_tokens_seen': 487186, 'train_runtime': '243.9', 'train_tokens_per_second': '1998'}
|
| 242 |
+
{'loss': '1.247', 'grad_norm': '1.521', 'learning_rate': '3.199e-05', 'epoch': '0.06426', 'num_input_tokens_seen': 489233, 'train_runtime': '244.9', 'train_tokens_per_second': '1998'}
|
| 243 |
+
{'loss': '1.183', 'grad_norm': '1.438', 'learning_rate': '3.212e-05', 'epoch': '0.06453', 'num_input_tokens_seen': 491280, 'train_runtime': '245.9', 'train_tokens_per_second': '1998'}
|
| 244 |
+
{'loss': '1.089', 'grad_norm': '1.296', 'learning_rate': '3.226e-05', 'epoch': '0.0648', 'num_input_tokens_seen': 493327, 'train_runtime': '246.9', 'train_tokens_per_second': '1998'}
|
| 245 |
+
{'loss': '1.182', 'grad_norm': '1.303', 'learning_rate': '3.239e-05', 'epoch': '0.06507', 'num_input_tokens_seen': 495374, 'train_runtime': '248', 'train_tokens_per_second': '1998'}
|
| 246 |
+
{'loss': '1.184', 'grad_norm': '1.186', 'learning_rate': '3.253e-05', 'epoch': '0.06534', 'num_input_tokens_seen': 497421, 'train_runtime': '249', 'train_tokens_per_second': '1998'}
|
| 247 |
+
{'loss': '0.8177', 'grad_norm': '1.317', 'learning_rate': '3.266e-05', 'epoch': '0.06561', 'num_input_tokens_seen': 499468, 'train_runtime': '250', 'train_tokens_per_second': '1998'}
|
| 248 |
+
{'loss': '1.119', 'grad_norm': '1.171', 'learning_rate': '3.28e-05', 'epoch': '0.06588', 'num_input_tokens_seen': 501515, 'train_runtime': '251', 'train_tokens_per_second': '1998'}
|
| 249 |
+
{'loss': '0.9268', 'grad_norm': '1.47', 'learning_rate': '3.293e-05', 'epoch': '0.06615', 'num_input_tokens_seen': 503562, 'train_runtime': '252', 'train_tokens_per_second': '1998'}
|
| 250 |
+
{'loss': '0.8829', 'grad_norm': '1.611', 'learning_rate': '3.306e-05', 'epoch': '0.06642', 'num_input_tokens_seen': 505609, 'train_runtime': '253.1', 'train_tokens_per_second': '1998'}
|
| 251 |
+
{'loss': '1.069', 'grad_norm': '1.647', 'learning_rate': '3.32e-05', 'epoch': '0.06668', 'num_input_tokens_seen': 507656, 'train_runtime': '254.1', 'train_tokens_per_second': '1998'}
|
| 252 |
+
{'loss': '0.9165', 'grad_norm': '1.581', 'learning_rate': '3.333e-05', 'epoch': '0.06695', 'num_input_tokens_seen': 509703, 'train_runtime': '255.1', 'train_tokens_per_second': '1998'}
|
| 253 |
+
{'loss': '1.079', 'grad_norm': '1.815', 'learning_rate': '3.347e-05', 'epoch': '0.06722', 'num_input_tokens_seen': 511750, 'train_runtime': '256.1', 'train_tokens_per_second': '1998'}
|
| 254 |
+
{'loss': '0.8549', 'grad_norm': '1.626', 'learning_rate': '3.36e-05', 'epoch': '0.06749', 'num_input_tokens_seen': 513797, 'train_runtime': '257.1', 'train_tokens_per_second': '1998'}
|
| 255 |
+
{'loss': '0.8964', 'grad_norm': '1.216', 'learning_rate': '3.374e-05', 'epoch': '0.06776', 'num_input_tokens_seen': 515844, 'train_runtime': '258.2', 'train_tokens_per_second': '1998'}
|
| 256 |
+
{'loss': '0.9361', 'grad_norm': '1.345', 'learning_rate': '3.387e-05', 'epoch': '0.06803', 'num_input_tokens_seen': 517891, 'train_runtime': '259.2', 'train_tokens_per_second': '1998'}
|
| 257 |
+
{'loss': '0.8836', 'grad_norm': '1.337', 'learning_rate': '3.401e-05', 'epoch': '0.0683', 'num_input_tokens_seen': 519938, 'train_runtime': '260.2', 'train_tokens_per_second': '1998'}
|
| 258 |
+
{'loss': '1.104', 'grad_norm': '1.467', 'learning_rate': '3.414e-05', 'epoch': '0.06857', 'num_input_tokens_seen': 521985, 'train_runtime': '261.2', 'train_tokens_per_second': '1998'}
|
| 259 |
+
{'loss': '1.308', 'grad_norm': '1.429', 'learning_rate': '3.427e-05', 'epoch': '0.06884', 'num_input_tokens_seen': 524032, 'train_runtime': '262.2', 'train_tokens_per_second': '1998'}
|
| 260 |
+
{'loss': '1.079', 'grad_norm': '1.394', 'learning_rate': '3.441e-05', 'epoch': '0.0691', 'num_input_tokens_seen': 526079, 'train_runtime': '263.3', 'train_tokens_per_second': '1998'}
|
| 261 |
+
{'loss': '1.033', 'grad_norm': '1.304', 'learning_rate': '3.454e-05', 'epoch': '0.06937', 'num_input_tokens_seen': 528126, 'train_runtime': '264.3', 'train_tokens_per_second': '1998'}
|
| 262 |
+
{'loss': '0.9466', 'grad_norm': '1.488', 'learning_rate': '3.468e-05', 'epoch': '0.06964', 'num_input_tokens_seen': 530173, 'train_runtime': '265.3', 'train_tokens_per_second': '1999'}
|
| 263 |
+
{'loss': '1.045', 'grad_norm': '1.277', 'learning_rate': '3.481e-05', 'epoch': '0.06991', 'num_input_tokens_seen': 532220, 'train_runtime': '266.3', 'train_tokens_per_second': '1999'}
|
| 264 |
+
{'loss': '0.9476', 'grad_norm': '1.584', 'learning_rate': '3.495e-05', 'epoch': '0.07018', 'num_input_tokens_seen': 534267, 'train_runtime': '267.3', 'train_tokens_per_second': '1999'}
|
| 265 |
+
{'loss': '0.7732', 'grad_norm': '1.766', 'learning_rate': '3.508e-05', 'epoch': '0.07045', 'num_input_tokens_seen': 536314, 'train_runtime': '268.3', 'train_tokens_per_second': '1999'}
|
| 266 |
+
{'loss': '0.9556', 'grad_norm': '1.519', 'learning_rate': '3.522e-05', 'epoch': '0.07072', 'num_input_tokens_seen': 538361, 'train_runtime': '269.4', 'train_tokens_per_second': '1999'}
|
| 267 |
+
{'loss': '0.7371', 'grad_norm': '1.619', 'learning_rate': '3.535e-05', 'epoch': '0.07099', 'num_input_tokens_seen': 540408, 'train_runtime': '270.4', 'train_tokens_per_second': '1999'}
|
| 268 |
+
{'loss': '1.137', 'grad_norm': '1.548', 'learning_rate': '3.548e-05', 'epoch': '0.07126', 'num_input_tokens_seen': 542455, 'train_runtime': '271.4', 'train_tokens_per_second': '1999'}
|
| 269 |
+
{'loss': '0.9216', 'grad_norm': '1.755', 'learning_rate': '3.562e-05', 'epoch': '0.07152', 'num_input_tokens_seen': 544502, 'train_runtime': '272.4', 'train_tokens_per_second': '1999'}
|
| 270 |
+
{'loss': '1.001', 'grad_norm': '1.717', 'learning_rate': '3.575e-05', 'epoch': '0.07179', 'num_input_tokens_seen': 546549, 'train_runtime': '273.4', 'train_tokens_per_second': '1999'}
|
| 271 |
+
{'loss': '0.8521', 'grad_norm': '1.497', 'learning_rate': '3.589e-05', 'epoch': '0.07206', 'num_input_tokens_seen': 548596, 'train_runtime': '274.5', 'train_tokens_per_second': '1999'}
|
| 272 |
+
{'loss': '0.9487', 'grad_norm': '1.475', 'learning_rate': '3.602e-05', 'epoch': '0.07233', 'num_input_tokens_seen': 550643, 'train_runtime': '275.5', 'train_tokens_per_second': '1999'}
|
| 273 |
+
{'loss': '1.001', 'grad_norm': '1.486', 'learning_rate': '3.616e-05', 'epoch': '0.0726', 'num_input_tokens_seen': 552690, 'train_runtime': '276.5', 'train_tokens_per_second': '1999'}
|
| 274 |
+
{'loss': '0.9512', 'grad_norm': '1.384', 'learning_rate': '3.629e-05', 'epoch': '0.07287', 'num_input_tokens_seen': 554737, 'train_runtime': '277.5', 'train_tokens_per_second': '1999'}
|
| 275 |
+
{'loss': '0.9048', 'grad_norm': '1.256', 'learning_rate': '3.642e-05', 'epoch': '0.07314', 'num_input_tokens_seen': 556784, 'train_runtime': '278.5', 'train_tokens_per_second': '1999'}
|
| 276 |
+
{'loss': '0.966', 'grad_norm': '1.58', 'learning_rate': '3.656e-05', 'epoch': '0.07341', 'num_input_tokens_seen': 558831, 'train_runtime': '279.5', 'train_tokens_per_second': '1999'}
|
| 277 |
+
{'loss': '1.121', 'grad_norm': '1.473', 'learning_rate': '3.669e-05', 'epoch': '0.07368', 'num_input_tokens_seen': 560878, 'train_runtime': '280.6', 'train_tokens_per_second': '1999'}
|
| 278 |
+
{'loss': '0.9792', 'grad_norm': '1.466', 'learning_rate': '3.683e-05', 'epoch': '0.07394', 'num_input_tokens_seen': 562925, 'train_runtime': '281.6', 'train_tokens_per_second': '1999'}
|
| 279 |
+
{'loss': '0.7847', 'grad_norm': '1.34', 'learning_rate': '3.696e-05', 'epoch': '0.07421', 'num_input_tokens_seen': 564972, 'train_runtime': '282.6', 'train_tokens_per_second': '1999'}
|
| 280 |
+
{'loss': '0.9178', 'grad_norm': '1.556', 'learning_rate': '3.71e-05', 'epoch': '0.07448', 'num_input_tokens_seen': 567019, 'train_runtime': '283.6', 'train_tokens_per_second': '1999'}
|
| 281 |
+
{'loss': '0.7879', 'grad_norm': '1.819', 'learning_rate': '3.723e-05', 'epoch': '0.07475', 'num_input_tokens_seen': 569066, 'train_runtime': '284.6', 'train_tokens_per_second': '1999'}
|
| 282 |
+
{'loss': '0.9185', 'grad_norm': '1.563', 'learning_rate': '3.737e-05', 'epoch': '0.07502', 'num_input_tokens_seen': 571113, 'train_runtime': '285.6', 'train_tokens_per_second': '1999'}
|
| 283 |
+
{'loss': '0.9971', 'grad_norm': '1.695', 'learning_rate': '3.75e-05', 'epoch': '0.07529', 'num_input_tokens_seen': 573160, 'train_runtime': '286.7', 'train_tokens_per_second': '1999'}
|
| 284 |
+
{'loss': '0.7991', 'grad_norm': '1.747', 'learning_rate': '3.763e-05', 'epoch': '0.07556', 'num_input_tokens_seen': 575207, 'train_runtime': '287.7', 'train_tokens_per_second': '1999'}
|
| 285 |
+
{'loss': '0.7907', 'grad_norm': '1.532', 'learning_rate': '3.777e-05', 'epoch': '0.07583', 'num_input_tokens_seen': 577254, 'train_runtime': '288.7', 'train_tokens_per_second': '2000'}
|
| 286 |
+
{'loss': '0.977', 'grad_norm': '1.455', 'learning_rate': '3.79e-05', 'epoch': '0.0761', 'num_input_tokens_seen': 579301, 'train_runtime': '289.7', 'train_tokens_per_second': '2000'}
|
| 287 |
+
{'loss': '0.7108', 'grad_norm': '1.527', 'learning_rate': '3.804e-05', 'epoch': '0.07636', 'num_input_tokens_seen': 581348, 'train_runtime': '290.7', 'train_tokens_per_second': '2000'}
|
| 288 |
+
{'loss': '1.001', 'grad_norm': '1.406', 'learning_rate': '3.817e-05', 'epoch': '0.07663', 'num_input_tokens_seen': 583395, 'train_runtime': '291.8', 'train_tokens_per_second': '2000'}
|
| 289 |
+
{'loss': '0.9314', 'grad_norm': '1.623', 'learning_rate': '3.831e-05', 'epoch': '0.0769', 'num_input_tokens_seen': 585442, 'train_runtime': '292.8', 'train_tokens_per_second': '2000'}
|
| 290 |
+
{'loss': '0.8645', 'grad_norm': '1.463', 'learning_rate': '3.844e-05', 'epoch': '0.07717', 'num_input_tokens_seen': 587489, 'train_runtime': '293.8', 'train_tokens_per_second': '2000'}
|
| 291 |
+
{'loss': '0.9374', 'grad_norm': '1.443', 'learning_rate': '3.858e-05', 'epoch': '0.07744', 'num_input_tokens_seen': 589536, 'train_runtime': '294.8', 'train_tokens_per_second': '2000'}
|
| 292 |
+
{'loss': '0.7219', 'grad_norm': '1.712', 'learning_rate': '3.871e-05', 'epoch': '0.07771', 'num_input_tokens_seen': 591583, 'train_runtime': '295.8', 'train_tokens_per_second': '2000'}
|
| 293 |
+
{'loss': '0.8756', 'grad_norm': '1.486', 'learning_rate': '3.884e-05', 'epoch': '0.07798', 'num_input_tokens_seen': 593630, 'train_runtime': '296.8', 'train_tokens_per_second': '2000'}
|
| 294 |
+
{'loss': '0.703', 'grad_norm': '1.555', 'learning_rate': '3.898e-05', 'epoch': '0.07825', 'num_input_tokens_seen': 595677, 'train_runtime': '297.9', 'train_tokens_per_second': '2000'}
|
| 295 |
+
{'loss': '0.8544', 'grad_norm': '1.665', 'learning_rate': '3.911e-05', 'epoch': '0.07852', 'num_input_tokens_seen': 597724, 'train_runtime': '298.9', 'train_tokens_per_second': '2000'}
|
| 296 |
+
{'loss': '1.088', 'grad_norm': '2.26', 'learning_rate': '3.925e-05', 'epoch': '0.07878', 'num_input_tokens_seen': 599771, 'train_runtime': '299.9', 'train_tokens_per_second': '2000'}
|
| 297 |
+
{'loss': '0.8961', 'grad_norm': '1.421', 'learning_rate': '3.938e-05', 'epoch': '0.07905', 'num_input_tokens_seen': 601818, 'train_runtime': '300.9', 'train_tokens_per_second': '2000'}
|
| 298 |
+
{'loss': '1.096', 'grad_norm': '1.708', 'learning_rate': '3.952e-05', 'epoch': '0.07932', 'num_input_tokens_seen': 603865, 'train_runtime': '301.9', 'train_tokens_per_second': '2000'}
|
| 299 |
+
{'loss': '0.9044', 'grad_norm': '1.57', 'learning_rate': '3.965e-05', 'epoch': '0.07959', 'num_input_tokens_seen': 605912, 'train_runtime': '303', 'train_tokens_per_second': '2000'}
|
| 300 |
+
{'loss': '0.9157', 'grad_norm': '1.404', 'learning_rate': '3.978e-05', 'epoch': '0.07986', 'num_input_tokens_seen': 607959, 'train_runtime': '304', 'train_tokens_per_second': '2000'}
|
| 301 |
+
{'loss': '0.9376', 'grad_norm': '1.561', 'learning_rate': '3.992e-05', 'epoch': '0.08013', 'num_input_tokens_seen': 610006, 'train_runtime': '305', 'train_tokens_per_second': '2000'}
|
| 302 |
+
{'loss': '1.079', 'grad_norm': '1.473', 'learning_rate': '4.005e-05', 'epoch': '0.0804', 'num_input_tokens_seen': 612053, 'train_runtime': '306', 'train_tokens_per_second': '2000'}
|
| 303 |
+
{'loss': '0.8078', 'grad_norm': '1.753', 'learning_rate': '4.019e-05', 'epoch': '0.08067', 'num_input_tokens_seen': 614100, 'train_runtime': '307', 'train_tokens_per_second': '2000'}
|
| 304 |
+
{'loss': '0.9436', 'grad_norm': '1.635', 'learning_rate': '4.032e-05', 'epoch': '0.08094', 'num_input_tokens_seen': 616147, 'train_runtime': '308', 'train_tokens_per_second': '2000'}
|
| 305 |
+
{'loss': '0.8635', 'grad_norm': '1.619', 'learning_rate': '4.046e-05', 'epoch': '0.0812', 'num_input_tokens_seen': 618194, 'train_runtime': '309.1', 'train_tokens_per_second': '2000'}
|
| 306 |
+
{'loss': '0.8744', 'grad_norm': '1.512', 'learning_rate': '4.059e-05', 'epoch': '0.08147', 'num_input_tokens_seen': 620241, 'train_runtime': '310.1', 'train_tokens_per_second': '2000'}
|
| 307 |
+
{'loss': '0.9712', 'grad_norm': '1.711', 'learning_rate': '4.073e-05', 'epoch': '0.08174', 'num_input_tokens_seen': 622288, 'train_runtime': '311.1', 'train_tokens_per_second': '2000'}
|
| 308 |
+
{'loss': '0.8672', 'grad_norm': '1.683', 'learning_rate': '4.086e-05', 'epoch': '0.08201', 'num_input_tokens_seen': 624335, 'train_runtime': '312.1', 'train_tokens_per_second': '2000'}
|
| 309 |
+
{'loss': '0.8807', 'grad_norm': '1.646', 'learning_rate': '4.099e-05', 'epoch': '0.08228', 'num_input_tokens_seen': 626382, 'train_runtime': '313.1', 'train_tokens_per_second': '2000'}
|
| 310 |
+
{'loss': '0.9382', 'grad_norm': '1.572', 'learning_rate': '4.113e-05', 'epoch': '0.08255', 'num_input_tokens_seen': 628429, 'train_runtime': '314.1', 'train_tokens_per_second': '2000'}
|
| 311 |
+
{'loss': '0.9096', 'grad_norm': '1.767', 'learning_rate': '4.126e-05', 'epoch': '0.08282', 'num_input_tokens_seen': 630476, 'train_runtime': '315.2', 'train_tokens_per_second': '2000'}
|
| 312 |
+
{'loss': '0.9922', 'grad_norm': '1.578', 'learning_rate': '4.14e-05', 'epoch': '0.08309', 'num_input_tokens_seen': 632523, 'train_runtime': '316.2', 'train_tokens_per_second': '2001'}
|
| 313 |
+
{'loss': '0.6242', 'grad_norm': '1.54', 'learning_rate': '4.153e-05', 'epoch': '0.08336', 'num_input_tokens_seen': 634570, 'train_runtime': '317.2', 'train_tokens_per_second': '2001'}
|
| 314 |
+
{'loss': '0.8425', 'grad_norm': '1.811', 'learning_rate': '4.167e-05', 'epoch': '0.08362', 'num_input_tokens_seen': 636617, 'train_runtime': '318.2', 'train_tokens_per_second': '2001'}
|
| 315 |
+
{'loss': '0.9227', 'grad_norm': '1.62', 'learning_rate': '4.18e-05', 'epoch': '0.08389', 'num_input_tokens_seen': 638664, 'train_runtime': '319.2', 'train_tokens_per_second': '2001'}
|
| 316 |
+
{'loss': '1.007', 'grad_norm': '1.642', 'learning_rate': '4.194e-05', 'epoch': '0.08416', 'num_input_tokens_seen': 640711, 'train_runtime': '320.3', 'train_tokens_per_second': '2001'}
|
| 317 |
+
{'loss': '0.7684', 'grad_norm': '1.521', 'learning_rate': '4.207e-05', 'epoch': '0.08443', 'num_input_tokens_seen': 642758, 'train_runtime': '321.3', 'train_tokens_per_second': '2001'}
|
| 318 |
+
{'loss': '0.9068', 'grad_norm': '1.779', 'learning_rate': '4.22e-05', 'epoch': '0.0847', 'num_input_tokens_seen': 644805, 'train_runtime': '322.3', 'train_tokens_per_second': '2001'}
|
| 319 |
+
{'loss': '0.8407', 'grad_norm': '1.588', 'learning_rate': '4.234e-05', 'epoch': '0.08497', 'num_input_tokens_seen': 646852, 'train_runtime': '323.3', 'train_tokens_per_second': '2001'}
|
| 320 |
+
{'loss': '0.9359', 'grad_norm': '1.685', 'learning_rate': '4.247e-05', 'epoch': '0.08524', 'num_input_tokens_seen': 648899, 'train_runtime': '324.3', 'train_tokens_per_second': '2001'}
|
| 321 |
+
{'loss': '0.8513', 'grad_norm': '1.823', 'learning_rate': '4.261e-05', 'epoch': '0.08551', 'num_input_tokens_seen': 650946, 'train_runtime': '325.4', 'train_tokens_per_second': '2001'}
|
| 322 |
+
{'loss': '1.09', 'grad_norm': '2.251', 'learning_rate': '4.274e-05', 'epoch': '0.08578', 'num_input_tokens_seen': 652993, 'train_runtime': '326.4', 'train_tokens_per_second': '2001'}
|
| 323 |
+
{'loss': '0.8893', 'grad_norm': '1.614', 'learning_rate': '4.288e-05', 'epoch': '0.08604', 'num_input_tokens_seen': 655040, 'train_runtime': '327.4', 'train_tokens_per_second': '2001'}
|
| 324 |
+
{'loss': '0.499', 'grad_norm': '1.693', 'learning_rate': '4.301e-05', 'epoch': '0.08631', 'num_input_tokens_seen': 657087, 'train_runtime': '328.4', 'train_tokens_per_second': '2001'}
|
| 325 |
+
{'loss': '1.006', 'grad_norm': '1.781', 'learning_rate': '4.315e-05', 'epoch': '0.08658', 'num_input_tokens_seen': 659134, 'train_runtime': '329.4', 'train_tokens_per_second': '2001'}
|
| 326 |
+
{'loss': '0.6728', 'grad_norm': '1.412', 'learning_rate': '4.328e-05', 'epoch': '0.08685', 'num_input_tokens_seen': 661181, 'train_runtime': '330.4', 'train_tokens_per_second': '2001'}
|
| 327 |
+
{'loss': '0.6491', 'grad_norm': '1.683', 'learning_rate': '4.341e-05', 'epoch': '0.08712', 'num_input_tokens_seen': 663228, 'train_runtime': '331.5', 'train_tokens_per_second': '2001'}
|
| 328 |
+
{'loss': '0.9646', 'grad_norm': '1.918', 'learning_rate': '4.355e-05', 'epoch': '0.08739', 'num_input_tokens_seen': 665275, 'train_runtime': '332.5', 'train_tokens_per_second': '2001'}
|
| 329 |
+
{'loss': '0.6656', 'grad_norm': '1.711', 'learning_rate': '4.368e-05', 'epoch': '0.08766', 'num_input_tokens_seen': 667322, 'train_runtime': '333.5', 'train_tokens_per_second': '2001'}
|
| 330 |
+
{'loss': '0.7556', 'grad_norm': '1.799', 'learning_rate': '4.382e-05', 'epoch': '0.08793', 'num_input_tokens_seen': 669369, 'train_runtime': '334.5', 'train_tokens_per_second': '2001'}
|
| 331 |
+
{'loss': '0.8211', 'grad_norm': '1.622', 'learning_rate': '4.395e-05', 'epoch': '0.0882', 'num_input_tokens_seen': 671416, 'train_runtime': '335.5', 'train_tokens_per_second': '2001'}
|
| 332 |
+
{'loss': '0.8586', 'grad_norm': '1.673', 'learning_rate': '4.409e-05', 'epoch': '0.08846', 'num_input_tokens_seen': 673463, 'train_runtime': '336.6', 'train_tokens_per_second': '2001'}
|
| 333 |
+
{'loss': '0.8275', 'grad_norm': '1.59', 'learning_rate': '4.422e-05', 'epoch': '0.08873', 'num_input_tokens_seen': 675510, 'train_runtime': '337.6', 'train_tokens_per_second': '2001'}
|
| 334 |
+
{'loss': '0.7986', 'grad_norm': '1.536', 'learning_rate': '4.435e-05', 'epoch': '0.089', 'num_input_tokens_seen': 677557, 'train_runtime': '338.6', 'train_tokens_per_second': '2001'}
|
| 335 |
+
{'loss': '0.8409', 'grad_norm': '1.524', 'learning_rate': '4.449e-05', 'epoch': '0.08927', 'num_input_tokens_seen': 679604, 'train_runtime': '339.6', 'train_tokens_per_second': '2001'}
|
| 336 |
+
{'loss': '0.7889', 'grad_norm': '1.606', 'learning_rate': '4.462e-05', 'epoch': '0.08954', 'num_input_tokens_seen': 681651, 'train_runtime': '340.6', 'train_tokens_per_second': '2001'}
|
| 337 |
+
{'loss': '0.8146', 'grad_norm': '1.721', 'learning_rate': '4.476e-05', 'epoch': '0.08981', 'num_input_tokens_seen': 683698, 'train_runtime': '341.6', 'train_tokens_per_second': '2001'}
|
| 338 |
+
{'loss': '0.9218', 'grad_norm': '1.753', 'learning_rate': '4.489e-05', 'epoch': '0.09008', 'num_input_tokens_seen': 685745, 'train_runtime': '342.7', 'train_tokens_per_second': '2001'}
|
| 339 |
+
{'loss': '0.6649', 'grad_norm': '1.632', 'learning_rate': '4.503e-05', 'epoch': '0.09035', 'num_input_tokens_seen': 687792, 'train_runtime': '343.7', 'train_tokens_per_second': '2001'}
|
| 340 |
+
{'loss': '0.7102', 'grad_norm': '1.424', 'learning_rate': '4.516e-05', 'epoch': '0.09062', 'num_input_tokens_seen': 689839, 'train_runtime': '344.7', 'train_tokens_per_second': '2001'}
|
| 341 |
+
{'loss': '1.134', 'grad_norm': '2.3', 'learning_rate': '4.53e-05', 'epoch': '0.09088', 'num_input_tokens_seen': 691886, 'train_runtime': '345.7', 'train_tokens_per_second': '2001'}
|
| 342 |
+
{'loss': '0.9732', 'grad_norm': '2.07', 'learning_rate': '4.543e-05', 'epoch': '0.09115', 'num_input_tokens_seen': 693933, 'train_runtime': '346.7', 'train_tokens_per_second': '2001'}
|
| 343 |
+
{'loss': '0.8109', 'grad_norm': '1.658', 'learning_rate': '4.556e-05', 'epoch': '0.09142', 'num_input_tokens_seen': 695980, 'train_runtime': '347.8', 'train_tokens_per_second': '2001'}
|
| 344 |
+
{'loss': '0.8198', 'grad_norm': '1.551', 'learning_rate': '4.57e-05', 'epoch': '0.09169', 'num_input_tokens_seen': 698027, 'train_runtime': '348.8', 'train_tokens_per_second': '2001'}
|
| 345 |
+
{'loss': '0.6508', 'grad_norm': '1.996', 'learning_rate': '4.583e-05', 'epoch': '0.09196', 'num_input_tokens_seen': 700074, 'train_runtime': '349.8', 'train_tokens_per_second': '2001'}
|
| 346 |
+
{'loss': '0.6369', 'grad_norm': '1.678', 'learning_rate': '4.597e-05', 'epoch': '0.09223', 'num_input_tokens_seen': 702121, 'train_runtime': '350.8', 'train_tokens_per_second': '2001'}
|
| 347 |
+
{'loss': '0.8778', 'grad_norm': '1.761', 'learning_rate': '4.61e-05', 'epoch': '0.0925', 'num_input_tokens_seen': 704168, 'train_runtime': '351.8', 'train_tokens_per_second': '2001'}
|
| 348 |
+
{'loss': '0.5125', 'grad_norm': '2.032', 'learning_rate': '4.624e-05', 'epoch': '0.09277', 'num_input_tokens_seen': 706215, 'train_runtime': '352.9', 'train_tokens_per_second': '2001'}
|
| 349 |
+
{'loss': '0.5776', 'grad_norm': '1.902', 'learning_rate': '4.637e-05', 'epoch': '0.09304', 'num_input_tokens_seen': 708262, 'train_runtime': '353.9', 'train_tokens_per_second': '2001'}
|
| 350 |
+
{'loss': '0.8128', 'grad_norm': '1.934', 'learning_rate': '4.651e-05', 'epoch': '0.0933', 'num_input_tokens_seen': 710309, 'train_runtime': '354.9', 'train_tokens_per_second': '2001'}
|
| 351 |
+
{'loss': '0.8', 'grad_norm': '2.005', 'learning_rate': '4.664e-05', 'epoch': '0.09357', 'num_input_tokens_seen': 712356, 'train_runtime': '355.9', 'train_tokens_per_second': '2002'}
|
| 352 |
+
{'loss': '0.9134', 'grad_norm': '1.872', 'learning_rate': '4.677e-05', 'epoch': '0.09384', 'num_input_tokens_seen': 714403, 'train_runtime': '356.9', 'train_tokens_per_second': '2002'}
|
| 353 |
+
{'loss': '0.8195', 'grad_norm': '1.896', 'learning_rate': '4.691e-05', 'epoch': '0.09411', 'num_input_tokens_seen': 716450, 'train_runtime': '357.9', 'train_tokens_per_second': '2002'}
|
| 354 |
+
{'loss': '0.9879', 'grad_norm': '1.732', 'learning_rate': '4.704e-05', 'epoch': '0.09438', 'num_input_tokens_seen': 718497, 'train_runtime': '359', 'train_tokens_per_second': '2002'}
|
| 355 |
+
{'loss': '0.7241', 'grad_norm': '1.685', 'learning_rate': '4.718e-05', 'epoch': '0.09465', 'num_input_tokens_seen': 720544, 'train_runtime': '360', 'train_tokens_per_second': '2002'}
|
| 356 |
+
{'loss': '0.8061', 'grad_norm': '1.555', 'learning_rate': '4.731e-05', 'epoch': '0.09492', 'num_input_tokens_seen': 722591, 'train_runtime': '361', 'train_tokens_per_second': '2002'}
|
| 357 |
+
{'loss': '0.8035', 'grad_norm': '1.807', 'learning_rate': '4.745e-05', 'epoch': '0.09519', 'num_input_tokens_seen': 724638, 'train_runtime': '362', 'train_tokens_per_second': '2002'}
|
| 358 |
+
{'loss': '0.4991', 'grad_norm': '1.543', 'learning_rate': '4.758e-05', 'epoch': '0.09546', 'num_input_tokens_seen': 726685, 'train_runtime': '363', 'train_tokens_per_second': '2002'}
|
| 359 |
+
{'loss': '0.8125', 'grad_norm': '1.724', 'learning_rate': '4.772e-05', 'epoch': '0.09572', 'num_input_tokens_seen': 728732, 'train_runtime': '364.1', 'train_tokens_per_second': '2002'}
|
| 360 |
+
{'loss': '0.752', 'grad_norm': '1.793', 'learning_rate': '4.785e-05', 'epoch': '0.09599', 'num_input_tokens_seen': 730779, 'train_runtime': '365.1', 'train_tokens_per_second': '2002'}
|
| 361 |
+
{'loss': '0.9271', 'grad_norm': '2.305', 'learning_rate': '4.798e-05', 'epoch': '0.09626', 'num_input_tokens_seen': 732826, 'train_runtime': '366.1', 'train_tokens_per_second': '2002'}
|
| 362 |
+
{'loss': '0.6132', 'grad_norm': '2.224', 'learning_rate': '4.812e-05', 'epoch': '0.09653', 'num_input_tokens_seen': 734873, 'train_runtime': '367.1', 'train_tokens_per_second': '2002'}
|
| 363 |
+
{'loss': '0.6797', 'grad_norm': '1.914', 'learning_rate': '4.825e-05', 'epoch': '0.0968', 'num_input_tokens_seen': 736920, 'train_runtime': '368.1', 'train_tokens_per_second': '2002'}
|
| 364 |
+
{'loss': '0.9467', 'grad_norm': '2.078', 'learning_rate': '4.839e-05', 'epoch': '0.09707', 'num_input_tokens_seen': 738967, 'train_runtime': '369.1', 'train_tokens_per_second': '2002'}
|
| 365 |
+
{'loss': '0.8589', 'grad_norm': '2.175', 'learning_rate': '4.852e-05', 'epoch': '0.09734', 'num_input_tokens_seen': 741014, 'train_runtime': '370.2', 'train_tokens_per_second': '2002'}
|
| 366 |
+
{'loss': '0.8454', 'grad_norm': '1.922', 'learning_rate': '4.866e-05', 'epoch': '0.09761', 'num_input_tokens_seen': 743061, 'train_runtime': '371.2', 'train_tokens_per_second': '2002'}
|
| 367 |
+
{'loss': '0.8227', 'grad_norm': '1.937', 'learning_rate': '4.879e-05', 'epoch': '0.09788', 'num_input_tokens_seen': 745108, 'train_runtime': '372.2', 'train_tokens_per_second': '2002'}
|
| 368 |
+
{'loss': '0.7916', 'grad_norm': '1.935', 'learning_rate': '4.892e-05', 'epoch': '0.09814', 'num_input_tokens_seen': 747155, 'train_runtime': '373.2', 'train_tokens_per_second': '2002'}
|
| 369 |
+
{'loss': '0.6554', 'grad_norm': '1.673', 'learning_rate': '4.906e-05', 'epoch': '0.09841', 'num_input_tokens_seen': 749202, 'train_runtime': '374.2', 'train_tokens_per_second': '2002'}
|
| 370 |
+
{'loss': '0.8427', 'grad_norm': '1.627', 'learning_rate': '4.919e-05', 'epoch': '0.09868', 'num_input_tokens_seen': 751249, 'train_runtime': '375.2', 'train_tokens_per_second': '2002'}
|
| 371 |
+
{'loss': '0.7', 'grad_norm': '1.613', 'learning_rate': '4.933e-05', 'epoch': '0.09895', 'num_input_tokens_seen': 753296, 'train_runtime': '376.3', 'train_tokens_per_second': '2002'}
|
| 372 |
+
{'loss': '1.085', 'grad_norm': '1.733', 'learning_rate': '4.946e-05', 'epoch': '0.09922', 'num_input_tokens_seen': 755343, 'train_runtime': '377.3', 'train_tokens_per_second': '2002'}
|
| 373 |
+
{'loss': '0.7366', 'grad_norm': '1.8', 'learning_rate': '4.96e-05', 'epoch': '0.09949', 'num_input_tokens_seen': 757390, 'train_runtime': '378.3', 'train_tokens_per_second': '2002'}
|
| 374 |
+
{'loss': '0.8539', 'grad_norm': '2.089', 'learning_rate': '4.973e-05', 'epoch': '0.09976', 'num_input_tokens_seen': 759437, 'train_runtime': '379.3', 'train_tokens_per_second': '2002'}
|
| 375 |
+
{'loss': '1.091', 'grad_norm': '2.162', 'learning_rate': '4.987e-05', 'epoch': '0.1', 'num_input_tokens_seen': 761484, 'train_runtime': '380.3', 'train_tokens_per_second': '2002'}
|
| 376 |
+
{'loss': '0.5954', 'grad_norm': '1.539', 'learning_rate': '5e-05', 'epoch': '0.1003', 'num_input_tokens_seen': 763531, 'train_runtime': '381.4', 'train_tokens_per_second': '2002'}
|
| 377 |
+
{'loss': '0.8637', 'grad_norm': '3.224', 'learning_rate': '5e-05', 'epoch': '0.1006', 'num_input_tokens_seen': 765578, 'train_runtime': '382.4', 'train_tokens_per_second': '2002'}
|
| 378 |
+
{'loss': '1.156', 'grad_norm': '2.482', 'learning_rate': '5e-05', 'epoch': '0.1008', 'num_input_tokens_seen': 767625, 'train_runtime': '383.4', 'train_tokens_per_second': '2002'}
|
| 379 |
+
{'loss': '0.9774', 'grad_norm': '2.115', 'learning_rate': '5e-05', 'epoch': '0.1011', 'num_input_tokens_seen': 769672, 'train_runtime': '384.4', 'train_tokens_per_second': '2002'}
|
| 380 |
+
{'loss': '0.7794', 'grad_norm': '2.068', 'learning_rate': '5e-05', 'epoch': '0.1014', 'num_input_tokens_seen': 771719, 'train_runtime': '385.4', 'train_tokens_per_second': '2002'}
|
| 381 |
+
{'loss': '0.7327', 'grad_norm': '2.226', 'learning_rate': '5e-05', 'epoch': '0.1016', 'num_input_tokens_seen': 773766, 'train_runtime': '386.5', 'train_tokens_per_second': '2002'}
|
| 382 |
+
{'loss': '0.7302', 'grad_norm': '1.966', 'learning_rate': '5e-05', 'epoch': '0.1019', 'num_input_tokens_seen': 775813, 'train_runtime': '387.5', 'train_tokens_per_second': '2002'}
|
| 383 |
+
{'loss': '0.6878', 'grad_norm': '1.581', 'learning_rate': '5e-05', 'epoch': '0.1022', 'num_input_tokens_seen': 777860, 'train_runtime': '388.5', 'train_tokens_per_second': '2002'}
|
| 384 |
+
{'loss': '0.737', 'grad_norm': '1.672', 'learning_rate': '5e-05', 'epoch': '0.1024', 'num_input_tokens_seen': 779907, 'train_runtime': '389.5', 'train_tokens_per_second': '2002'}
|
| 385 |
+
{'loss': '0.9472', 'grad_norm': '2.17', 'learning_rate': '5e-05', 'epoch': '0.1027', 'num_input_tokens_seen': 781954, 'train_runtime': '390.5', 'train_tokens_per_second': '2002'}
|
| 386 |
+
{'loss': '0.5831', 'grad_norm': '1.504', 'learning_rate': '5e-05', 'epoch': '0.103', 'num_input_tokens_seen': 784001, 'train_runtime': '391.6', 'train_tokens_per_second': '2002'}
|
| 387 |
+
{'loss': '0.6743', 'grad_norm': '1.78', 'learning_rate': '5e-05', 'epoch': '0.1033', 'num_input_tokens_seen': 786048, 'train_runtime': '392.6', 'train_tokens_per_second': '2002'}
|
| 388 |
+
{'loss': '0.5688', 'grad_norm': '1.95', 'learning_rate': '5e-05', 'epoch': '0.1035', 'num_input_tokens_seen': 788095, 'train_runtime': '393.6', 'train_tokens_per_second': '2002'}
|
| 389 |
+
{'loss': '0.929', 'grad_norm': '2.087', 'learning_rate': '5e-05', 'epoch': '0.1038', 'num_input_tokens_seen': 790142, 'train_runtime': '394.6', 'train_tokens_per_second': '2002'}
|
| 390 |
+
{'loss': '0.4627', 'grad_norm': '2.017', 'learning_rate': '5e-05', 'epoch': '0.1041', 'num_input_tokens_seen': 792189, 'train_runtime': '395.6', 'train_tokens_per_second': '2002'}
|
| 391 |
+
{'loss': '0.8193', 'grad_norm': '2.009', 'learning_rate': '5e-05', 'epoch': '0.1043', 'num_input_tokens_seen': 794236, 'train_runtime': '396.7', 'train_tokens_per_second': '2002'}
|
| 392 |
+
File "/usr/local/bin/llamafactory-cli", line 8, in <module>
|
| 393 |
+
sys.exit(main())
|
| 394 |
+
^^^^^^
|
| 395 |
+
File "/workspace/LlamaFactory/src/llamafactory/cli.py", line 24, in main
|
| 396 |
+
launcher.launch()
|
| 397 |
+
File "/workspace/LlamaFactory/src/llamafactory/launcher.py", line 157, in launch
|
| 398 |
+
run_exp()
|
| 399 |
+
File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 125, in run_exp
|
| 400 |
+
_training_function(config={"args": args, "callbacks": callbacks})
|
| 401 |
+
File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 91, in _training_function
|
| 402 |
+
run_pt(model_args, data_args, training_args, finetuning_args, callbacks)
|
| 403 |
+
File "/workspace/LlamaFactory/src/llamafactory/train/pt/workflow.py", line 63, in run_pt
|
| 404 |
+
train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
|
| 405 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 406 |
+
File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2174, in train
|
| 407 |
+
return inner_training_loop(
|
| 408 |
+
^^^^^^^^^^^^^^^^^^^^
|
| 409 |
+
File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2536, in _inner_training_loop
|
| 410 |
+
tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
|
| 411 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 412 |
+
File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 3837, in training_step
|
| 413 |
+
self.accelerator.backward(loss, **kwargs)
|
| 414 |
+
File "/usr/local/lib/python3.11/dist-packages/accelerate/accelerator.py", line 2740, in backward
|
| 415 |
+
loss.backward(**kwargs)
|
| 416 |
+
File "/usr/local/lib/python3.11/dist-packages/torch/_tensor.py", line 521, in backward
|
| 417 |
+
torch.autograd.backward(
|
| 418 |
+
File "/usr/local/lib/python3.11/dist-packages/torch/autograd/__init__.py", line 289, in backward
|
| 419 |
+
_engine_run_backward(
|
| 420 |
+
File "/usr/local/lib/python3.11/dist-packages/torch/autograd/graph.py", line 769, in _engine_run_backward
|
| 421 |
+
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
| 422 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 423 |
+
KeyboardInterrupt
|
LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-6.8.0-94-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.10",
|
| 4 |
+
"startedAt": "2026-02-04T03:57:46.163443Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"/workspace/v127rc_exp1/C.yaml"
|
| 7 |
+
],
|
| 8 |
+
"program": "/usr/local/bin/llamafactory-cli",
|
| 9 |
+
"git": {
|
| 10 |
+
"remote": "https://github.com/hiyouga/LlamaFactory.git",
|
| 11 |
+
"commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
|
| 12 |
+
},
|
| 13 |
+
"email": "markmochi200@gmail.com",
|
| 14 |
+
"root": "/workspace/LlamaFactory",
|
| 15 |
+
"host": "47a53adf0198",
|
| 16 |
+
"executable": "/usr/bin/python",
|
| 17 |
+
"cpu_count": 16,
|
| 18 |
+
"cpu_count_logical": 32,
|
| 19 |
+
"gpu": "NVIDIA GeForce RTX 4090",
|
| 20 |
+
"gpu_count": 1,
|
| 21 |
+
"disk": {
|
| 22 |
+
"/": {
|
| 23 |
+
"total": "21474836480",
|
| 24 |
+
"used": "1858306048"
|
| 25 |
+
}
|
| 26 |
+
},
|
| 27 |
+
"memory": {
|
| 28 |
+
"total": "201701408768"
|
| 29 |
+
},
|
| 30 |
+
"gpu_nvidia": [
|
| 31 |
+
{
|
| 32 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 33 |
+
"memoryTotal": "25757220864",
|
| 34 |
+
"cudaCores": 16384,
|
| 35 |
+
"architecture": "Ada",
|
| 36 |
+
"uuid": "GPU-2ae1a495-e17f-23d9-e8ed-90585b3df9de"
|
| 37 |
+
}
|
| 38 |
+
],
|
| 39 |
+
"cudaVersion": "13.0",
|
| 40 |
+
"writerId": "mfjy22anxcucsb3vwlaimrwvqrgvipis"
|
| 41 |
+
}
|
LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"train/global_step":388,"train/grad_norm":2.0090420246124268,"train/learning_rate":4.9999916410392856e-05,"_wandb":{"runtime":396},"_runtime":396,"train/loss":0.8193472027778625,"_step":387,"train/epoch":0.1043291207313794,"train_runtime":396.6553,"train/train_tokens_per_second":2002.333,"_timestamp":1.770177862347725e+09,"train/num_input_tokens_seen":794236}
|
LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/output.log
ADDED
|
@@ -0,0 +1,299 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0%| | 0/40950 [00:00<?, ?it/s]/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
|
| 2 |
+
with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]
|
| 3 |
+
|
| 4 |
+
{'loss': '1.719', 'grad_norm': '0.3142', 'learning_rate': '0', 'epoch': '0.0001221', 'num_input_tokens_seen': 2047, 'train_runtime': '3.017', 'train_tokens_per_second': '678.5'}
|
| 5 |
+
{'loss': '1.142', 'grad_norm': '0.2725', 'learning_rate': '6.105e-08', 'epoch': '0.0002442', 'num_input_tokens_seen': 4094, 'train_runtime': '4.05', 'train_tokens_per_second': '1011'}
|
| 6 |
+
{'loss': '1.39', 'grad_norm': '0.379', 'learning_rate': '1.221e-07', 'epoch': '0.0003663', 'num_input_tokens_seen': 6141, 'train_runtime': '5.087', 'train_tokens_per_second': '1207'}
|
| 7 |
+
{'loss': '1.457', 'grad_norm': '0.2879', 'learning_rate': '1.832e-07', 'epoch': '0.0004884', 'num_input_tokens_seen': 8188, 'train_runtime': '6.124', 'train_tokens_per_second': '1337'}
|
| 8 |
+
{'loss': '1.286', 'grad_norm': '0.2564', 'learning_rate': '2.442e-07', 'epoch': '0.0006105', 'num_input_tokens_seen': 10235, 'train_runtime': '7.165', 'train_tokens_per_second': '1429'}
|
| 9 |
+
{'loss': '0.01258', 'grad_norm': '0.042', 'learning_rate': '3.053e-07', 'epoch': '0.0007326', 'num_input_tokens_seen': 12282, 'train_runtime': '8.201', 'train_tokens_per_second': '1498'}
|
| 10 |
+
{'loss': '0.8563', 'grad_norm': '0.267', 'learning_rate': '3.663e-07', 'epoch': '0.0008547', 'num_input_tokens_seen': 14329, 'train_runtime': '9.241', 'train_tokens_per_second': '1551'}
|
| 11 |
+
{'loss': '1.581', 'grad_norm': '0.2901', 'learning_rate': '4.274e-07', 'epoch': '0.0009768', 'num_input_tokens_seen': 16376, 'train_runtime': '10.28', 'train_tokens_per_second': '1593'}
|
| 12 |
+
{'loss': '1.573', 'grad_norm': '0.2915', 'learning_rate': '4.884e-07', 'epoch': '0.001099', 'num_input_tokens_seen': 18423, 'train_runtime': '11.32', 'train_tokens_per_second': '1628'}
|
| 13 |
+
{'loss': '1.346', 'grad_norm': '0.2841', 'learning_rate': '5.495e-07', 'epoch': '0.001221', 'num_input_tokens_seen': 20470, 'train_runtime': '12.35', 'train_tokens_per_second': '1657'}
|
| 14 |
+
{'loss': '1.651', 'grad_norm': '0.4522', 'learning_rate': '6.105e-07', 'epoch': '0.001343', 'num_input_tokens_seen': 22517, 'train_runtime': '13.39', 'train_tokens_per_second': '1682'}
|
| 15 |
+
{'loss': '1.487', 'grad_norm': '0.3466', 'learning_rate': '6.716e-07', 'epoch': '0.001465', 'num_input_tokens_seen': 24564, 'train_runtime': '14.44', 'train_tokens_per_second': '1701'}
|
| 16 |
+
{'loss': '0.8106', 'grad_norm': '0.2226', 'learning_rate': '7.326e-07', 'epoch': '0.001587', 'num_input_tokens_seen': 26611, 'train_runtime': '15.48', 'train_tokens_per_second': '1719'}
|
| 17 |
+
{'loss': '0.5651', 'grad_norm': '0.2162', 'learning_rate': '7.937e-07', 'epoch': '0.001709', 'num_input_tokens_seen': 28658, 'train_runtime': '16.52', 'train_tokens_per_second': '1735'}
|
| 18 |
+
{'loss': '1.622', 'grad_norm': '0.3259', 'learning_rate': '8.547e-07', 'epoch': '0.001832', 'num_input_tokens_seen': 30705, 'train_runtime': '17.56', 'train_tokens_per_second': '1749'}
|
| 19 |
+
{'loss': '1.418', 'grad_norm': '0.285', 'learning_rate': '9.158e-07', 'epoch': '0.001954', 'num_input_tokens_seen': 32752, 'train_runtime': '18.6', 'train_tokens_per_second': '1761'}
|
| 20 |
+
{'loss': '1.69', 'grad_norm': '0.3264', 'learning_rate': '9.768e-07', 'epoch': '0.002076', 'num_input_tokens_seen': 34799, 'train_runtime': '19.64', 'train_tokens_per_second': '1772'}
|
| 21 |
+
{'loss': '1.73', 'grad_norm': '0.3089', 'learning_rate': '1.038e-06', 'epoch': '0.002198', 'num_input_tokens_seen': 36846, 'train_runtime': '20.68', 'train_tokens_per_second': '1782'}
|
| 22 |
+
{'loss': '1.511', 'grad_norm': '0.3119', 'learning_rate': '1.099e-06', 'epoch': '0.00232', 'num_input_tokens_seen': 38893, 'train_runtime': '21.72', 'train_tokens_per_second': '1791'}
|
| 23 |
+
{'loss': '1.435', 'grad_norm': '0.3081', 'learning_rate': '1.16e-06', 'epoch': '0.002442', 'num_input_tokens_seen': 40940, 'train_runtime': '22.77', 'train_tokens_per_second': '1798'}
|
| 24 |
+
{'loss': '1.785', 'grad_norm': '0.4437', 'learning_rate': '1.221e-06', 'epoch': '0.002564', 'num_input_tokens_seen': 42987, 'train_runtime': '23.81', 'train_tokens_per_second': '1806'}
|
| 25 |
+
{'loss': '1.101', 'grad_norm': '0.3949', 'learning_rate': '1.282e-06', 'epoch': '0.002686', 'num_input_tokens_seen': 45034, 'train_runtime': '24.85', 'train_tokens_per_second': '1812'}
|
| 26 |
+
{'loss': '0.7684', 'grad_norm': '0.2791', 'learning_rate': '1.343e-06', 'epoch': '0.002808', 'num_input_tokens_seen': 47081, 'train_runtime': '25.89', 'train_tokens_per_second': '1819'}
|
| 27 |
+
{'loss': '0.9445', 'grad_norm': '0.2267', 'learning_rate': '1.404e-06', 'epoch': '0.00293', 'num_input_tokens_seen': 49128, 'train_runtime': '26.93', 'train_tokens_per_second': '1825'}
|
| 28 |
+
{'loss': '1.328', 'grad_norm': '0.5019', 'learning_rate': '1.465e-06', 'epoch': '0.003053', 'num_input_tokens_seen': 51175, 'train_runtime': '27.96', 'train_tokens_per_second': '1830'}
|
| 29 |
+
{'loss': '1.597', 'grad_norm': '0.3425', 'learning_rate': '1.526e-06', 'epoch': '0.003175', 'num_input_tokens_seen': 53222, 'train_runtime': '29.01', 'train_tokens_per_second': '1834'}
|
| 30 |
+
{'loss': '1.797', 'grad_norm': '0.3407', 'learning_rate': '1.587e-06', 'epoch': '0.003297', 'num_input_tokens_seen': 55269, 'train_runtime': '30.05', 'train_tokens_per_second': '1839'}
|
| 31 |
+
{'loss': '0.7549', 'grad_norm': '0.2074', 'learning_rate': '1.648e-06', 'epoch': '0.003419', 'num_input_tokens_seen': 57316, 'train_runtime': '31.1', 'train_tokens_per_second': '1843'}
|
| 32 |
+
{'loss': '0.6662', 'grad_norm': '0.2184', 'learning_rate': '1.709e-06', 'epoch': '0.003541', 'num_input_tokens_seen': 59363, 'train_runtime': '32.14', 'train_tokens_per_second': '1847'}
|
| 33 |
+
{'loss': '0.9995', 'grad_norm': '0.2354', 'learning_rate': '1.77e-06', 'epoch': '0.003663', 'num_input_tokens_seen': 61410, 'train_runtime': '33.19', 'train_tokens_per_second': '1850'}
|
| 34 |
+
{'loss': '1.189', 'grad_norm': '0.2462', 'learning_rate': '1.832e-06', 'epoch': '0.003785', 'num_input_tokens_seen': 63457, 'train_runtime': '34.23', 'train_tokens_per_second': '1854'}
|
| 35 |
+
{'loss': '1.353', 'grad_norm': '0.2564', 'learning_rate': '1.893e-06', 'epoch': '0.003907', 'num_input_tokens_seen': 65504, 'train_runtime': '35.28', 'train_tokens_per_second': '1857'}
|
| 36 |
+
{'loss': '1.41', 'grad_norm': '0.3253', 'learning_rate': '1.954e-06', 'epoch': '0.004029', 'num_input_tokens_seen': 67551, 'train_runtime': '36.32', 'train_tokens_per_second': '1860'}
|
| 37 |
+
{'loss': '1.575', 'grad_norm': '0.303', 'learning_rate': '2.015e-06', 'epoch': '0.004151', 'num_input_tokens_seen': 69598, 'train_runtime': '37.36', 'train_tokens_per_second': '1863'}
|
| 38 |
+
{'loss': '1.542', 'grad_norm': '0.3227', 'learning_rate': '2.076e-06', 'epoch': '0.004274', 'num_input_tokens_seen': 71645, 'train_runtime': '38.4', 'train_tokens_per_second': '1866'}
|
| 39 |
+
{'loss': '1.281', 'grad_norm': '0.3266', 'learning_rate': '2.137e-06', 'epoch': '0.004396', 'num_input_tokens_seen': 73692, 'train_runtime': '39.45', 'train_tokens_per_second': '1868'}
|
| 40 |
+
{'loss': '1.936', 'grad_norm': '0.601', 'learning_rate': '2.198e-06', 'epoch': '0.004518', 'num_input_tokens_seen': 75739, 'train_runtime': '40.49', 'train_tokens_per_second': '1871'}
|
| 41 |
+
{'loss': '1.855', 'grad_norm': '0.2591', 'learning_rate': '2.259e-06', 'epoch': '0.00464', 'num_input_tokens_seen': 77786, 'train_runtime': '41.53', 'train_tokens_per_second': '1873'}
|
| 42 |
+
{'loss': '0.8793', 'grad_norm': '0.308', 'learning_rate': '2.32e-06', 'epoch': '0.004762', 'num_input_tokens_seen': 79833, 'train_runtime': '42.57', 'train_tokens_per_second': '1875'}
|
| 43 |
+
{'loss': '1.274', 'grad_norm': '0.2598', 'learning_rate': '2.381e-06', 'epoch': '0.004884', 'num_input_tokens_seen': 81880, 'train_runtime': '43.61', 'train_tokens_per_second': '1877'}
|
| 44 |
+
{'loss': '1.502', 'grad_norm': '0.3138', 'learning_rate': '2.442e-06', 'epoch': '0.005006', 'num_input_tokens_seen': 83927, 'train_runtime': '44.66', 'train_tokens_per_second': '1879'}
|
| 45 |
+
{'loss': '1.367', 'grad_norm': '0.2641', 'learning_rate': '2.503e-06', 'epoch': '0.005128', 'num_input_tokens_seen': 85974, 'train_runtime': '45.7', 'train_tokens_per_second': '1881'}
|
| 46 |
+
{'loss': '0.7333', 'grad_norm': '0.226', 'learning_rate': '2.564e-06', 'epoch': '0.00525', 'num_input_tokens_seen': 88021, 'train_runtime': '46.75', 'train_tokens_per_second': '1883'}
|
| 47 |
+
{'loss': '1.199', 'grad_norm': '0.277', 'learning_rate': '2.625e-06', 'epoch': '0.005372', 'num_input_tokens_seen': 90068, 'train_runtime': '47.82', 'train_tokens_per_second': '1883'}
|
| 48 |
+
{'loss': '1.659', 'grad_norm': '0.3296', 'learning_rate': '2.686e-06', 'epoch': '0.005495', 'num_input_tokens_seen': 92115, 'train_runtime': '48.86', 'train_tokens_per_second': '1885'}
|
| 49 |
+
{'loss': '1.699', 'grad_norm': '0.3483', 'learning_rate': '2.747e-06', 'epoch': '0.005617', 'num_input_tokens_seen': 94162, 'train_runtime': '49.91', 'train_tokens_per_second': '1887'}
|
| 50 |
+
{'loss': '1.513', 'grad_norm': '0.3496', 'learning_rate': '2.808e-06', 'epoch': '0.005739', 'num_input_tokens_seen': 96209, 'train_runtime': '50.96', 'train_tokens_per_second': '1888'}
|
| 51 |
+
{'loss': '1.737', 'grad_norm': '0.3098', 'learning_rate': '2.869e-06', 'epoch': '0.005861', 'num_input_tokens_seen': 98256, 'train_runtime': '52.01', 'train_tokens_per_second': '1889'}
|
| 52 |
+
{'loss': '1.359', 'grad_norm': '0.3305', 'learning_rate': '2.93e-06', 'epoch': '0.005983', 'num_input_tokens_seen': 100303, 'train_runtime': '53.06', 'train_tokens_per_second': '1891'}
|
| 53 |
+
{'loss': '1.805', 'grad_norm': '0.3772', 'learning_rate': '2.991e-06', 'epoch': '0.006105', 'num_input_tokens_seen': 102350, 'train_runtime': '54.1', 'train_tokens_per_second': '1892'}
|
| 54 |
+
{'loss': '1.882', 'grad_norm': '0.3816', 'learning_rate': '3.053e-06', 'epoch': '0.006227', 'num_input_tokens_seen': 104397, 'train_runtime': '55.15', 'train_tokens_per_second': '1893'}
|
| 55 |
+
{'loss': '1.566', 'grad_norm': '0.333', 'learning_rate': '3.114e-06', 'epoch': '0.006349', 'num_input_tokens_seen': 106444, 'train_runtime': '56.19', 'train_tokens_per_second': '1895'}
|
| 56 |
+
{'loss': '1.816', 'grad_norm': '0.3612', 'learning_rate': '3.175e-06', 'epoch': '0.006471', 'num_input_tokens_seen': 108491, 'train_runtime': '57.23', 'train_tokens_per_second': '1896'}
|
| 57 |
+
{'loss': '1.933', 'grad_norm': '0.5047', 'learning_rate': '3.236e-06', 'epoch': '0.006593', 'num_input_tokens_seen': 110538, 'train_runtime': '58.28', 'train_tokens_per_second': '1897'}
|
| 58 |
+
{'loss': '1.34', 'grad_norm': '0.2829', 'learning_rate': '3.297e-06', 'epoch': '0.006716', 'num_input_tokens_seen': 112585, 'train_runtime': '59.32', 'train_tokens_per_second': '1898'}
|
| 59 |
+
{'loss': '0.851', 'grad_norm': '0.4326', 'learning_rate': '3.358e-06', 'epoch': '0.006838', 'num_input_tokens_seen': 114632, 'train_runtime': '60.37', 'train_tokens_per_second': '1899'}
|
| 60 |
+
{'loss': '0.7931', 'grad_norm': '0.3166', 'learning_rate': '3.419e-06', 'epoch': '0.00696', 'num_input_tokens_seen': 116679, 'train_runtime': '61.41', 'train_tokens_per_second': '1900'}
|
| 61 |
+
{'loss': '1.728', 'grad_norm': '0.3289', 'learning_rate': '3.48e-06', 'epoch': '0.007082', 'num_input_tokens_seen': 118726, 'train_runtime': '62.45', 'train_tokens_per_second': '1901'}
|
| 62 |
+
{'loss': '0.7369', 'grad_norm': '0.2613', 'learning_rate': '3.541e-06', 'epoch': '0.007204', 'num_input_tokens_seen': 120773, 'train_runtime': '63.49', 'train_tokens_per_second': '1902'}
|
| 63 |
+
{'loss': '1.464', 'grad_norm': '0.2617', 'learning_rate': '3.602e-06', 'epoch': '0.007326', 'num_input_tokens_seen': 122820, 'train_runtime': '64.53', 'train_tokens_per_second': '1903'}
|
| 64 |
+
{'loss': '1.883', 'grad_norm': '0.3848', 'learning_rate': '3.663e-06', 'epoch': '0.007448', 'num_input_tokens_seen': 124867, 'train_runtime': '65.58', 'train_tokens_per_second': '1904'}
|
| 65 |
+
{'loss': '0.5969', 'grad_norm': '0.2306', 'learning_rate': '3.724e-06', 'epoch': '0.00757', 'num_input_tokens_seen': 126914, 'train_runtime': '66.63', 'train_tokens_per_second': '1905'}
|
| 66 |
+
{'loss': '1.594', 'grad_norm': '0.2975', 'learning_rate': '3.785e-06', 'epoch': '0.007692', 'num_input_tokens_seen': 128961, 'train_runtime': '67.68', 'train_tokens_per_second': '1906'}
|
| 67 |
+
{'loss': '1.062', 'grad_norm': '0.253', 'learning_rate': '3.846e-06', 'epoch': '0.007814', 'num_input_tokens_seen': 131008, 'train_runtime': '68.72', 'train_tokens_per_second': '1906'}
|
| 68 |
+
{'loss': '1.625', 'grad_norm': '0.3242', 'learning_rate': '3.907e-06', 'epoch': '0.007937', 'num_input_tokens_seen': 133055, 'train_runtime': '69.77', 'train_tokens_per_second': '1907'}
|
| 69 |
+
{'loss': '1.335', 'grad_norm': '0.3814', 'learning_rate': '3.968e-06', 'epoch': '0.008059', 'num_input_tokens_seen': 135102, 'train_runtime': '70.82', 'train_tokens_per_second': '1908'}
|
| 70 |
+
{'loss': '1.049', 'grad_norm': '0.2831', 'learning_rate': '4.029e-06', 'epoch': '0.008181', 'num_input_tokens_seen': 137149, 'train_runtime': '71.86', 'train_tokens_per_second': '1909'}
|
| 71 |
+
{'loss': '1.03', 'grad_norm': '0.2496', 'learning_rate': '4.09e-06', 'epoch': '0.008303', 'num_input_tokens_seen': 139196, 'train_runtime': '72.9', 'train_tokens_per_second': '1909'}
|
| 72 |
+
{'loss': '1.344', 'grad_norm': '0.3791', 'learning_rate': '4.151e-06', 'epoch': '0.008425', 'num_input_tokens_seen': 141243, 'train_runtime': '74.09', 'train_tokens_per_second': '1906'}
|
| 73 |
+
{'loss': '1.543', 'grad_norm': '0.3291', 'learning_rate': '4.212e-06', 'epoch': '0.008547', 'num_input_tokens_seen': 143290, 'train_runtime': '75.13', 'train_tokens_per_second': '1907'}
|
| 74 |
+
{'loss': '1.627', 'grad_norm': '0.3203', 'learning_rate': '4.274e-06', 'epoch': '0.008669', 'num_input_tokens_seen': 145337, 'train_runtime': '76.17', 'train_tokens_per_second': '1908'}
|
| 75 |
+
{'loss': '1.25', 'grad_norm': '0.3174', 'learning_rate': '4.335e-06', 'epoch': '0.008791', 'num_input_tokens_seen': 147384, 'train_runtime': '77.21', 'train_tokens_per_second': '1909'}
|
| 76 |
+
{'loss': '1.305', 'grad_norm': '0.3542', 'learning_rate': '4.396e-06', 'epoch': '0.008913', 'num_input_tokens_seen': 149431, 'train_runtime': '78.26', 'train_tokens_per_second': '1909'}
|
| 77 |
+
{'loss': '0.7812', 'grad_norm': '0.2824', 'learning_rate': '4.457e-06', 'epoch': '0.009035', 'num_input_tokens_seen': 151478, 'train_runtime': '79.3', 'train_tokens_per_second': '1910'}
|
| 78 |
+
{'loss': '1.514', 'grad_norm': '0.3974', 'learning_rate': '4.518e-06', 'epoch': '0.009158', 'num_input_tokens_seen': 153525, 'train_runtime': '80.34', 'train_tokens_per_second': '1911'}
|
| 79 |
+
{'loss': '0.8486', 'grad_norm': '0.394', 'learning_rate': '4.579e-06', 'epoch': '0.00928', 'num_input_tokens_seen': 155572, 'train_runtime': '81.39', 'train_tokens_per_second': '1911'}
|
| 80 |
+
{'loss': '1.741', 'grad_norm': '0.4167', 'learning_rate': '4.64e-06', 'epoch': '0.009402', 'num_input_tokens_seen': 157619, 'train_runtime': '82.43', 'train_tokens_per_second': '1912'}
|
| 81 |
+
{'loss': '1.393', 'grad_norm': '0.3378', 'learning_rate': '4.701e-06', 'epoch': '0.009524', 'num_input_tokens_seen': 159666, 'train_runtime': '83.47', 'train_tokens_per_second': '1913'}
|
| 82 |
+
{'loss': '1.174', 'grad_norm': '0.3005', 'learning_rate': '4.762e-06', 'epoch': '0.009646', 'num_input_tokens_seen': 161713, 'train_runtime': '84.52', 'train_tokens_per_second': '1913'}
|
| 83 |
+
{'loss': '0.7404', 'grad_norm': '0.2695', 'learning_rate': '4.823e-06', 'epoch': '0.009768', 'num_input_tokens_seen': 163760, 'train_runtime': '85.56', 'train_tokens_per_second': '1914'}
|
| 84 |
+
{'loss': '1.576', 'grad_norm': '0.345', 'learning_rate': '4.884e-06', 'epoch': '0.00989', 'num_input_tokens_seen': 165807, 'train_runtime': '86.6', 'train_tokens_per_second': '1915'}
|
| 85 |
+
{'loss': '1.073', 'grad_norm': '0.3396', 'learning_rate': '4.945e-06', 'epoch': '0.01001', 'num_input_tokens_seen': 167854, 'train_runtime': '87.64', 'train_tokens_per_second': '1915'}
|
| 86 |
+
{'loss': '1.579', 'grad_norm': '0.3497', 'learning_rate': '5.006e-06', 'epoch': '0.01013', 'num_input_tokens_seen': 169901, 'train_runtime': '88.68', 'train_tokens_per_second': '1916'}
|
| 87 |
+
{'loss': '0.784', 'grad_norm': '0.3244', 'learning_rate': '5.067e-06', 'epoch': '0.01026', 'num_input_tokens_seen': 171948, 'train_runtime': '89.72', 'train_tokens_per_second': '1916'}
|
| 88 |
+
{'loss': '1.157', 'grad_norm': '0.2747', 'learning_rate': '5.128e-06', 'epoch': '0.01038', 'num_input_tokens_seen': 173995, 'train_runtime': '90.77', 'train_tokens_per_second': '1917'}
|
| 89 |
+
{'loss': '0.9066', 'grad_norm': '0.233', 'learning_rate': '5.189e-06', 'epoch': '0.0105', 'num_input_tokens_seen': 176042, 'train_runtime': '91.81', 'train_tokens_per_second': '1918'}
|
| 90 |
+
{'loss': '0.7513', 'grad_norm': '0.2136', 'learning_rate': '5.25e-06', 'epoch': '0.01062', 'num_input_tokens_seen': 178089, 'train_runtime': '92.85', 'train_tokens_per_second': '1918'}
|
| 91 |
+
{'loss': '0.8007', 'grad_norm': '0.3918', 'learning_rate': '5.311e-06', 'epoch': '0.01074', 'num_input_tokens_seen': 180136, 'train_runtime': '93.89', 'train_tokens_per_second': '1919'}
|
| 92 |
+
{'loss': '1.275', 'grad_norm': '0.3246', 'learning_rate': '5.372e-06', 'epoch': '0.01087', 'num_input_tokens_seen': 182183, 'train_runtime': '94.93', 'train_tokens_per_second': '1919'}
|
| 93 |
+
{'loss': '0.6336', 'grad_norm': '0.2194', 'learning_rate': '5.433e-06', 'epoch': '0.01099', 'num_input_tokens_seen': 184230, 'train_runtime': '95.97', 'train_tokens_per_second': '1920'}
|
| 94 |
+
{'loss': '0.668', 'grad_norm': '0.2253', 'learning_rate': '5.495e-06', 'epoch': '0.01111', 'num_input_tokens_seen': 186277, 'train_runtime': '97.01', 'train_tokens_per_second': '1920'}
|
| 95 |
+
{'loss': '1.824', 'grad_norm': '0.354', 'learning_rate': '5.556e-06', 'epoch': '0.01123', 'num_input_tokens_seen': 188324, 'train_runtime': '98.05', 'train_tokens_per_second': '1921'}
|
| 96 |
+
{'loss': '1.28', 'grad_norm': '0.4487', 'learning_rate': '5.617e-06', 'epoch': '0.01136', 'num_input_tokens_seen': 190371, 'train_runtime': '99.09', 'train_tokens_per_second': '1921'}
|
| 97 |
+
{'loss': '0.6494', 'grad_norm': '0.2398', 'learning_rate': '5.678e-06', 'epoch': '0.01148', 'num_input_tokens_seen': 192418, 'train_runtime': '100.1', 'train_tokens_per_second': '1922'}
|
| 98 |
+
{'loss': '0.6123', 'grad_norm': '0.2938', 'learning_rate': '5.739e-06', 'epoch': '0.0116', 'num_input_tokens_seen': 194465, 'train_runtime': '101.2', 'train_tokens_per_second': '1922'}
|
| 99 |
+
{'loss': '1.243', 'grad_norm': '0.3335', 'learning_rate': '5.8e-06', 'epoch': '0.01172', 'num_input_tokens_seen': 196512, 'train_runtime': '102.2', 'train_tokens_per_second': '1922'}
|
| 100 |
+
{'loss': '1.335', 'grad_norm': '0.3472', 'learning_rate': '5.861e-06', 'epoch': '0.01184', 'num_input_tokens_seen': 198559, 'train_runtime': '103.3', 'train_tokens_per_second': '1923'}
|
| 101 |
+
{'loss': '1.112', 'grad_norm': '0.2869', 'learning_rate': '5.922e-06', 'epoch': '0.01197', 'num_input_tokens_seen': 200606, 'train_runtime': '104.3', 'train_tokens_per_second': '1923'}
|
| 102 |
+
{'loss': '1.557', 'grad_norm': '0.4047', 'learning_rate': '5.983e-06', 'epoch': '0.01209', 'num_input_tokens_seen': 202653, 'train_runtime': '105.3', 'train_tokens_per_second': '1924'}
|
| 103 |
+
{'loss': '1.697', 'grad_norm': '0.4249', 'learning_rate': '6.044e-06', 'epoch': '0.01221', 'num_input_tokens_seen': 204700, 'train_runtime': '106.4', 'train_tokens_per_second': '1924'}
|
| 104 |
+
{'loss': '0.8076', 'grad_norm': '0.2638', 'learning_rate': '6.105e-06', 'epoch': '0.01233', 'num_input_tokens_seen': 206747, 'train_runtime': '107.4', 'train_tokens_per_second': '1925'}
|
| 105 |
+
{'loss': '1.775', 'grad_norm': '0.3715', 'learning_rate': '6.166e-06', 'epoch': '0.01245', 'num_input_tokens_seen': 208794, 'train_runtime': '108.5', 'train_tokens_per_second': '1925'}
|
| 106 |
+
{'loss': '1.606', 'grad_norm': '0.3108', 'learning_rate': '6.227e-06', 'epoch': '0.01258', 'num_input_tokens_seen': 210841, 'train_runtime': '109.5', 'train_tokens_per_second': '1925'}
|
| 107 |
+
{'loss': '1.637', 'grad_norm': '0.3672', 'learning_rate': '6.288e-06', 'epoch': '0.0127', 'num_input_tokens_seen': 212888, 'train_runtime': '110.6', 'train_tokens_per_second': '1926'}
|
| 108 |
+
{'loss': '1.369', 'grad_norm': '0.4352', 'learning_rate': '6.349e-06', 'epoch': '0.01282', 'num_input_tokens_seen': 214935, 'train_runtime': '111.6', 'train_tokens_per_second': '1926'}
|
| 109 |
+
{'loss': '1.386', 'grad_norm': '0.308', 'learning_rate': '6.41e-06', 'epoch': '0.01294', 'num_input_tokens_seen': 216982, 'train_runtime': '112.6', 'train_tokens_per_second': '1926'}
|
| 110 |
+
{'loss': '1.196', 'grad_norm': '0.3402', 'learning_rate': '6.471e-06', 'epoch': '0.01306', 'num_input_tokens_seen': 219029, 'train_runtime': '113.7', 'train_tokens_per_second': '1927'}
|
| 111 |
+
{'loss': '1.117', 'grad_norm': '0.3496', 'learning_rate': '6.532e-06', 'epoch': '0.01319', 'num_input_tokens_seen': 221076, 'train_runtime': '114.7', 'train_tokens_per_second': '1927'}
|
| 112 |
+
{'loss': '1.772', 'grad_norm': '0.3945', 'learning_rate': '6.593e-06', 'epoch': '0.01331', 'num_input_tokens_seen': 223123, 'train_runtime': '115.8', 'train_tokens_per_second': '1927'}
|
| 113 |
+
{'loss': '0.9553', 'grad_norm': '0.2856', 'learning_rate': '6.654e-06', 'epoch': '0.01343', 'num_input_tokens_seen': 225170, 'train_runtime': '116.8', 'train_tokens_per_second': '1928'}
|
| 114 |
+
{'loss': '1.563', 'grad_norm': '0.3784', 'learning_rate': '6.716e-06', 'epoch': '0.01355', 'num_input_tokens_seen': 227217, 'train_runtime': '117.8', 'train_tokens_per_second': '1928'}
|
| 115 |
+
{'loss': '1.567', 'grad_norm': '0.3456', 'learning_rate': '6.777e-06', 'epoch': '0.01368', 'num_input_tokens_seen': 229264, 'train_runtime': '118.9', 'train_tokens_per_second': '1928'}
|
| 116 |
+
{'loss': '0.7048', 'grad_norm': '0.2298', 'learning_rate': '6.838e-06', 'epoch': '0.0138', 'num_input_tokens_seen': 231311, 'train_runtime': '119.9', 'train_tokens_per_second': '1929'}
|
| 117 |
+
{'loss': '1.194', 'grad_norm': '0.3506', 'learning_rate': '6.899e-06', 'epoch': '0.01392', 'num_input_tokens_seen': 233358, 'train_runtime': '121', 'train_tokens_per_second': '1929'}
|
| 118 |
+
{'loss': '0.7762', 'grad_norm': '0.2345', 'learning_rate': '6.96e-06', 'epoch': '0.01404', 'num_input_tokens_seen': 235405, 'train_runtime': '122', 'train_tokens_per_second': '1929'}
|
| 119 |
+
{'loss': '1.459', 'grad_norm': '0.3409', 'learning_rate': '7.021e-06', 'epoch': '0.01416', 'num_input_tokens_seen': 237452, 'train_runtime': '123.1', 'train_tokens_per_second': '1930'}
|
| 120 |
+
{'loss': '0.6121', 'grad_norm': '0.2403', 'learning_rate': '7.082e-06', 'epoch': '0.01429', 'num_input_tokens_seen': 239499, 'train_runtime': '124.1', 'train_tokens_per_second': '1930'}
|
| 121 |
+
{'loss': '1.599', 'grad_norm': '0.299', 'learning_rate': '7.143e-06', 'epoch': '0.01441', 'num_input_tokens_seen': 241546, 'train_runtime': '125.1', 'train_tokens_per_second': '1930'}
|
| 122 |
+
{'loss': '1.771', 'grad_norm': '0.391', 'learning_rate': '7.204e-06', 'epoch': '0.01453', 'num_input_tokens_seen': 243593, 'train_runtime': '126.2', 'train_tokens_per_second': '1930'}
|
| 123 |
+
{'loss': '1.541', 'grad_norm': '0.3111', 'learning_rate': '7.265e-06', 'epoch': '0.01465', 'num_input_tokens_seen': 245640, 'train_runtime': '127.2', 'train_tokens_per_second': '1931'}
|
| 124 |
+
{'loss': '0.7969', 'grad_norm': '0.2717', 'learning_rate': '7.326e-06', 'epoch': '0.01477', 'num_input_tokens_seen': 247687, 'train_runtime': '128.3', 'train_tokens_per_second': '1931'}
|
| 125 |
+
{'loss': '1.567', 'grad_norm': '0.3719', 'learning_rate': '7.387e-06', 'epoch': '0.0149', 'num_input_tokens_seen': 249734, 'train_runtime': '129.3', 'train_tokens_per_second': '1931'}
|
| 126 |
+
{'loss': '1.782', 'grad_norm': '0.3787', 'learning_rate': '7.448e-06', 'epoch': '0.01502', 'num_input_tokens_seen': 251781, 'train_runtime': '130.4', 'train_tokens_per_second': '1931'}
|
| 127 |
+
{'loss': '0.7362', 'grad_norm': '0.2492', 'learning_rate': '7.509e-06', 'epoch': '0.01514', 'num_input_tokens_seen': 253828, 'train_runtime': '131.4', 'train_tokens_per_second': '1932'}
|
| 128 |
+
{'loss': '1.653', 'grad_norm': '0.3752', 'learning_rate': '7.57e-06', 'epoch': '0.01526', 'num_input_tokens_seen': 255875, 'train_runtime': '132.5', 'train_tokens_per_second': '1932'}
|
| 129 |
+
{'loss': '1.619', 'grad_norm': '0.4029', 'learning_rate': '7.631e-06', 'epoch': '0.01538', 'num_input_tokens_seen': 257922, 'train_runtime': '133.5', 'train_tokens_per_second': '1932'}
|
| 130 |
+
{'loss': '1.128', 'grad_norm': '0.3188', 'learning_rate': '7.692e-06', 'epoch': '0.01551', 'num_input_tokens_seen': 259969, 'train_runtime': '134.6', 'train_tokens_per_second': '1932'}
|
| 131 |
+
{'loss': '1.338', 'grad_norm': '0.3356', 'learning_rate': '7.753e-06', 'epoch': '0.01563', 'num_input_tokens_seen': 262016, 'train_runtime': '135.6', 'train_tokens_per_second': '1932'}
|
| 132 |
+
{'loss': '0.7656', 'grad_norm': '0.2505', 'learning_rate': '7.814e-06', 'epoch': '0.01575', 'num_input_tokens_seen': 264063, 'train_runtime': '136.6', 'train_tokens_per_second': '1933'}
|
| 133 |
+
{'loss': '1.375', 'grad_norm': '0.3852', 'learning_rate': '7.875e-06', 'epoch': '0.01587', 'num_input_tokens_seen': 266110, 'train_runtime': '137.7', 'train_tokens_per_second': '1933'}
|
| 134 |
+
{'loss': '0.5618', 'grad_norm': '0.24', 'learning_rate': '7.937e-06', 'epoch': '0.016', 'num_input_tokens_seen': 268157, 'train_runtime': '138.7', 'train_tokens_per_second': '1933'}
|
| 135 |
+
{'loss': '1.335', 'grad_norm': '0.4018', 'learning_rate': '7.998e-06', 'epoch': '0.01612', 'num_input_tokens_seen': 270204, 'train_runtime': '139.8', 'train_tokens_per_second': '1933'}
|
| 136 |
+
{'loss': '1.063', 'grad_norm': '0.2842', 'learning_rate': '8.059e-06', 'epoch': '0.01624', 'num_input_tokens_seen': 272251, 'train_runtime': '140.8', 'train_tokens_per_second': '1933'}
|
| 137 |
+
{'loss': '1.795', 'grad_norm': '0.4447', 'learning_rate': '8.12e-06', 'epoch': '0.01636', 'num_input_tokens_seen': 274298, 'train_runtime': '141.9', 'train_tokens_per_second': '1933'}
|
| 138 |
+
{'loss': '1.664', 'grad_norm': '0.3341', 'learning_rate': '8.181e-06', 'epoch': '0.01648', 'num_input_tokens_seen': 276345, 'train_runtime': '142.9', 'train_tokens_per_second': '1934'}
|
| 139 |
+
{'loss': '1.237', 'grad_norm': '0.2907', 'learning_rate': '8.242e-06', 'epoch': '0.01661', 'num_input_tokens_seen': 278392, 'train_runtime': '144', 'train_tokens_per_second': '1934'}
|
| 140 |
+
{'loss': '1.617', 'grad_norm': '0.3788', 'learning_rate': '8.303e-06', 'epoch': '0.01673', 'num_input_tokens_seen': 280439, 'train_runtime': '145', 'train_tokens_per_second': '1934'}
|
| 141 |
+
{'loss': '1.089', 'grad_norm': '0.3043', 'learning_rate': '8.364e-06', 'epoch': '0.01685', 'num_input_tokens_seen': 282486, 'train_runtime': '146.1', 'train_tokens_per_second': '1934'}
|
| 142 |
+
{'loss': '1.12', 'grad_norm': '0.3281', 'learning_rate': '8.425e-06', 'epoch': '0.01697', 'num_input_tokens_seen': 284533, 'train_runtime': '147.1', 'train_tokens_per_second': '1934'}
|
| 143 |
+
{'loss': '1.408', 'grad_norm': '0.3588', 'learning_rate': '8.486e-06', 'epoch': '0.01709', 'num_input_tokens_seen': 286580, 'train_runtime': '148.1', 'train_tokens_per_second': '1935'}
|
| 144 |
+
{'loss': '1.173', 'grad_norm': '0.3316', 'learning_rate': '8.547e-06', 'epoch': '0.01722', 'num_input_tokens_seen': 288627, 'train_runtime': '149.2', 'train_tokens_per_second': '1935'}
|
| 145 |
+
{'loss': '1.621', 'grad_norm': '0.3899', 'learning_rate': '8.608e-06', 'epoch': '0.01734', 'num_input_tokens_seen': 290674, 'train_runtime': '150.2', 'train_tokens_per_second': '1935'}
|
| 146 |
+
{'loss': '1.247', 'grad_norm': '0.3735', 'learning_rate': '8.669e-06', 'epoch': '0.01746', 'num_input_tokens_seen': 292721, 'train_runtime': '151.3', 'train_tokens_per_second': '1935'}
|
| 147 |
+
{'loss': '1.872', 'grad_norm': '0.4948', 'learning_rate': '8.73e-06', 'epoch': '0.01758', 'num_input_tokens_seen': 294768, 'train_runtime': '152.3', 'train_tokens_per_second': '1935'}
|
| 148 |
+
{'loss': '0.6525', 'grad_norm': '0.2687', 'learning_rate': '8.791e-06', 'epoch': '0.0177', 'num_input_tokens_seen': 296815, 'train_runtime': '153.4', 'train_tokens_per_second': '1936'}
|
| 149 |
+
{'loss': '1.418', 'grad_norm': '0.4128', 'learning_rate': '8.852e-06', 'epoch': '0.01783', 'num_input_tokens_seen': 298862, 'train_runtime': '154.4', 'train_tokens_per_second': '1936'}
|
| 150 |
+
{'loss': '1.428', 'grad_norm': '0.3661', 'learning_rate': '8.913e-06', 'epoch': '0.01795', 'num_input_tokens_seen': 300909, 'train_runtime': '155.4', 'train_tokens_per_second': '1936'}
|
| 151 |
+
{'loss': '1.003', 'grad_norm': '0.3327', 'learning_rate': '8.974e-06', 'epoch': '0.01807', 'num_input_tokens_seen': 302956, 'train_runtime': '156.5', 'train_tokens_per_second': '1936'}
|
| 152 |
+
{'loss': '1.531', 'grad_norm': '0.4244', 'learning_rate': '9.035e-06', 'epoch': '0.01819', 'num_input_tokens_seen': 305003, 'train_runtime': '157.5', 'train_tokens_per_second': '1936'}
|
| 153 |
+
{'loss': '1.635', 'grad_norm': '0.4266', 'learning_rate': '9.096e-06', 'epoch': '0.01832', 'num_input_tokens_seen': 307050, 'train_runtime': '158.6', 'train_tokens_per_second': '1936'}
|
| 154 |
+
{'loss': '1.504', 'grad_norm': '0.3605', 'learning_rate': '9.158e-06', 'epoch': '0.01844', 'num_input_tokens_seen': 309097, 'train_runtime': '159.6', 'train_tokens_per_second': '1936'}
|
| 155 |
+
{'loss': '1.709', 'grad_norm': '0.3912', 'learning_rate': '9.219e-06', 'epoch': '0.01856', 'num_input_tokens_seen': 311144, 'train_runtime': '160.7', 'train_tokens_per_second': '1936'}
|
| 156 |
+
{'loss': '1.367', 'grad_norm': '0.3813', 'learning_rate': '9.28e-06', 'epoch': '0.01868', 'num_input_tokens_seen': 313191, 'train_runtime': '161.7', 'train_tokens_per_second': '1937'}
|
| 157 |
+
{'loss': '1.261', 'grad_norm': '0.3283', 'learning_rate': '9.341e-06', 'epoch': '0.0188', 'num_input_tokens_seen': 315238, 'train_runtime': '162.8', 'train_tokens_per_second': '1937'}
|
| 158 |
+
{'loss': '1.142', 'grad_norm': '0.2797', 'learning_rate': '9.402e-06', 'epoch': '0.01893', 'num_input_tokens_seen': 317285, 'train_runtime': '163.8', 'train_tokens_per_second': '1937'}
|
| 159 |
+
{'loss': '1.054', 'grad_norm': '0.3778', 'learning_rate': '9.463e-06', 'epoch': '0.01905', 'num_input_tokens_seen': 319332, 'train_runtime': '164.8', 'train_tokens_per_second': '1937'}
|
| 160 |
+
{'loss': '1.37', 'grad_norm': '0.3661', 'learning_rate': '9.524e-06', 'epoch': '0.01917', 'num_input_tokens_seen': 321379, 'train_runtime': '165.9', 'train_tokens_per_second': '1937'}
|
| 161 |
+
{'loss': '1.425', 'grad_norm': '0.5471', 'learning_rate': '9.585e-06', 'epoch': '0.01929', 'num_input_tokens_seen': 323426, 'train_runtime': '166.9', 'train_tokens_per_second': '1937'}
|
| 162 |
+
{'loss': '1.088', 'grad_norm': '0.3833', 'learning_rate': '9.646e-06', 'epoch': '0.01941', 'num_input_tokens_seen': 325473, 'train_runtime': '168', 'train_tokens_per_second': '1937'}
|
| 163 |
+
{'loss': '1.332', 'grad_norm': '0.4081', 'learning_rate': '9.707e-06', 'epoch': '0.01954', 'num_input_tokens_seen': 327520, 'train_runtime': '169', 'train_tokens_per_second': '1938'}
|
| 164 |
+
{'loss': '1.821', 'grad_norm': '0.4351', 'learning_rate': '9.768e-06', 'epoch': '0.01966', 'num_input_tokens_seen': 329567, 'train_runtime': '170.1', 'train_tokens_per_second': '1938'}
|
| 165 |
+
{'loss': '1.693', 'grad_norm': '2.017', 'learning_rate': '9.829e-06', 'epoch': '0.01978', 'num_input_tokens_seen': 331614, 'train_runtime': '171.1', 'train_tokens_per_second': '1938'}
|
| 166 |
+
{'loss': '1.377', 'grad_norm': '0.3394', 'learning_rate': '9.89e-06', 'epoch': '0.0199', 'num_input_tokens_seen': 333661, 'train_runtime': '172.2', 'train_tokens_per_second': '1938'}
|
| 167 |
+
{'loss': '1.329', 'grad_norm': '0.3503', 'learning_rate': '9.951e-06', 'epoch': '0.02002', 'num_input_tokens_seen': 335708, 'train_runtime': '173.2', 'train_tokens_per_second': '1938'}
|
| 168 |
+
{'loss': '1.319', 'grad_norm': '0.3434', 'learning_rate': '1.001e-05', 'epoch': '0.02015', 'num_input_tokens_seen': 337755, 'train_runtime': '174.3', 'train_tokens_per_second': '1938'}
|
| 169 |
+
{'loss': '0.7777', 'grad_norm': '0.3284', 'learning_rate': '1.007e-05', 'epoch': '0.02027', 'num_input_tokens_seen': 339802, 'train_runtime': '175.3', 'train_tokens_per_second': '1938'}
|
| 170 |
+
{'loss': '1.453', 'grad_norm': '0.3621', 'learning_rate': '1.013e-05', 'epoch': '0.02039', 'num_input_tokens_seen': 341849, 'train_runtime': '176.4', 'train_tokens_per_second': '1938'}
|
| 171 |
+
{'loss': '1.899', 'grad_norm': '0.5323', 'learning_rate': '1.02e-05', 'epoch': '0.02051', 'num_input_tokens_seen': 343896, 'train_runtime': '177.4', 'train_tokens_per_second': '1939'}
|
| 172 |
+
{'loss': '2.037', 'grad_norm': '0.5038', 'learning_rate': '1.026e-05', 'epoch': '0.02063', 'num_input_tokens_seen': 345943, 'train_runtime': '178.4', 'train_tokens_per_second': '1939'}
|
| 173 |
+
{'loss': '1.384', 'grad_norm': '0.3607', 'learning_rate': '1.032e-05', 'epoch': '0.02076', 'num_input_tokens_seen': 347990, 'train_runtime': '179.5', 'train_tokens_per_second': '1939'}
|
| 174 |
+
{'loss': '1.661', 'grad_norm': '0.4242', 'learning_rate': '1.038e-05', 'epoch': '0.02088', 'num_input_tokens_seen': 350037, 'train_runtime': '180.5', 'train_tokens_per_second': '1939'}
|
| 175 |
+
{'loss': '1.68', 'grad_norm': '0.4849', 'learning_rate': '1.044e-05', 'epoch': '0.021', 'num_input_tokens_seen': 352084, 'train_runtime': '181.6', 'train_tokens_per_second': '1939'}
|
| 176 |
+
{'loss': '1.685', 'grad_norm': '0.555', 'learning_rate': '1.05e-05', 'epoch': '0.02112', 'num_input_tokens_seen': 354131, 'train_runtime': '182.6', 'train_tokens_per_second': '1939'}
|
| 177 |
+
{'loss': '1.141', 'grad_norm': '0.351', 'learning_rate': '1.056e-05', 'epoch': '0.02125', 'num_input_tokens_seen': 356178, 'train_runtime': '183.7', 'train_tokens_per_second': '1939'}
|
| 178 |
+
{'loss': '1.29', 'grad_norm': '0.4115', 'learning_rate': '1.062e-05', 'epoch': '0.02137', 'num_input_tokens_seen': 358225, 'train_runtime': '184.7', 'train_tokens_per_second': '1939'}
|
| 179 |
+
{'loss': '1.293', 'grad_norm': '0.3835', 'learning_rate': '1.068e-05', 'epoch': '0.02149', 'num_input_tokens_seen': 360272, 'train_runtime': '185.8', 'train_tokens_per_second': '1939'}
|
| 180 |
+
{'loss': '1.556', 'grad_norm': '0.4774', 'learning_rate': '1.074e-05', 'epoch': '0.02161', 'num_input_tokens_seen': 362319, 'train_runtime': '186.8', 'train_tokens_per_second': '1940'}
|
| 181 |
+
{'loss': '1.218', 'grad_norm': '0.4011', 'learning_rate': '1.081e-05', 'epoch': '0.02173', 'num_input_tokens_seen': 364366, 'train_runtime': '187.9', 'train_tokens_per_second': '1940'}
|
| 182 |
+
{'loss': '1.299', 'grad_norm': '0.3859', 'learning_rate': '1.087e-05', 'epoch': '0.02186', 'num_input_tokens_seen': 366413, 'train_runtime': '188.9', 'train_tokens_per_second': '1940'}
|
| 183 |
+
{'loss': '1.037', 'grad_norm': '0.3694', 'learning_rate': '1.093e-05', 'epoch': '0.02198', 'num_input_tokens_seen': 368460, 'train_runtime': '189.9', 'train_tokens_per_second': '1940'}
|
| 184 |
+
{'loss': '0.6335', 'grad_norm': '0.2866', 'learning_rate': '1.099e-05', 'epoch': '0.0221', 'num_input_tokens_seen': 370507, 'train_runtime': '191', 'train_tokens_per_second': '1940'}
|
| 185 |
+
{'loss': '0.6538', 'grad_norm': '0.321', 'learning_rate': '1.105e-05', 'epoch': '0.02222', 'num_input_tokens_seen': 372554, 'train_runtime': '192', 'train_tokens_per_second': '1940'}
|
| 186 |
+
{'loss': '1.187', 'grad_norm': '0.3279', 'learning_rate': '1.111e-05', 'epoch': '0.02234', 'num_input_tokens_seen': 374601, 'train_runtime': '193.1', 'train_tokens_per_second': '1940'}
|
| 187 |
+
{'loss': '1.375', 'grad_norm': '0.447', 'learning_rate': '1.117e-05', 'epoch': '0.02247', 'num_input_tokens_seen': 376648, 'train_runtime': '194.1', 'train_tokens_per_second': '1940'}
|
| 188 |
+
{'loss': '0.8847', 'grad_norm': '0.3551', 'learning_rate': '1.123e-05', 'epoch': '0.02259', 'num_input_tokens_seen': 378695, 'train_runtime': '195.2', 'train_tokens_per_second': '1940'}
|
| 189 |
+
{'loss': '1.745', 'grad_norm': '0.5382', 'learning_rate': '1.129e-05', 'epoch': '0.02271', 'num_input_tokens_seen': 380742, 'train_runtime': '196.2', 'train_tokens_per_second': '1940'}
|
| 190 |
+
{'loss': '1.602', 'grad_norm': '0.4624', 'learning_rate': '1.136e-05', 'epoch': '0.02283', 'num_input_tokens_seen': 382789, 'train_runtime': '197.3', 'train_tokens_per_second': '1941'}
|
| 191 |
+
{'loss': '1.474', 'grad_norm': '0.478', 'learning_rate': '1.142e-05', 'epoch': '0.02295', 'num_input_tokens_seen': 384836, 'train_runtime': '198.3', 'train_tokens_per_second': '1941'}
|
| 192 |
+
{'loss': '1.639', 'grad_norm': '0.4799', 'learning_rate': '1.148e-05', 'epoch': '0.02308', 'num_input_tokens_seen': 386883, 'train_runtime': '199.3', 'train_tokens_per_second': '1941'}
|
| 193 |
+
{'loss': '0.8179', 'grad_norm': '0.3443', 'learning_rate': '1.154e-05', 'epoch': '0.0232', 'num_input_tokens_seen': 388930, 'train_runtime': '200.4', 'train_tokens_per_second': '1941'}
|
| 194 |
+
{'loss': '1.302', 'grad_norm': '0.4595', 'learning_rate': '1.16e-05', 'epoch': '0.02332', 'num_input_tokens_seen': 390977, 'train_runtime': '201.4', 'train_tokens_per_second': '1941'}
|
| 195 |
+
{'loss': '0.6097', 'grad_norm': '0.2905', 'learning_rate': '1.166e-05', 'epoch': '0.02344', 'num_input_tokens_seen': 393024, 'train_runtime': '202.5', 'train_tokens_per_second': '1941'}
|
| 196 |
+
{'loss': '0.8993', 'grad_norm': '0.3459', 'learning_rate': '1.172e-05', 'epoch': '0.02357', 'num_input_tokens_seen': 395071, 'train_runtime': '203.5', 'train_tokens_per_second': '1941'}
|
| 197 |
+
{'loss': '1.096', 'grad_norm': '0.4137', 'learning_rate': '1.178e-05', 'epoch': '0.02369', 'num_input_tokens_seen': 397118, 'train_runtime': '204.6', 'train_tokens_per_second': '1941'}
|
| 198 |
+
{'loss': '1.411', 'grad_norm': '0.4383', 'learning_rate': '1.184e-05', 'epoch': '0.02381', 'num_input_tokens_seen': 399165, 'train_runtime': '205.6', 'train_tokens_per_second': '1941'}
|
| 199 |
+
{'loss': '1.204', 'grad_norm': '0.4678', 'learning_rate': '1.19e-05', 'epoch': '0.02393', 'num_input_tokens_seen': 401212, 'train_runtime': '206.7', 'train_tokens_per_second': '1941'}
|
| 200 |
+
{'loss': '1.366', 'grad_norm': '0.424', 'learning_rate': '1.197e-05', 'epoch': '0.02405', 'num_input_tokens_seen': 403259, 'train_runtime': '207.7', 'train_tokens_per_second': '1942'}
|
| 201 |
+
{'loss': '0.992', 'grad_norm': '0.3474', 'learning_rate': '1.203e-05', 'epoch': '0.02418', 'num_input_tokens_seen': 405306, 'train_runtime': '208.7', 'train_tokens_per_second': '1942'}
|
| 202 |
+
{'loss': '1.445', 'grad_norm': '0.485', 'learning_rate': '1.209e-05', 'epoch': '0.0243', 'num_input_tokens_seen': 407353, 'train_runtime': '209.8', 'train_tokens_per_second': '1942'}
|
| 203 |
+
{'loss': '1.563', 'grad_norm': '0.4729', 'learning_rate': '1.215e-05', 'epoch': '0.02442', 'num_input_tokens_seen': 409400, 'train_runtime': '210.8', 'train_tokens_per_second': '1942'}
|
| 204 |
+
{'loss': '1.273', 'grad_norm': '0.4405', 'learning_rate': '1.221e-05', 'epoch': '0.02454', 'num_input_tokens_seen': 411447, 'train_runtime': '211.9', 'train_tokens_per_second': '1942'}
|
| 205 |
+
{'loss': '1.156', 'grad_norm': '0.4594', 'learning_rate': '1.227e-05', 'epoch': '0.02466', 'num_input_tokens_seen': 413494, 'train_runtime': '212.9', 'train_tokens_per_second': '1942'}
|
| 206 |
+
{'loss': '1.284', 'grad_norm': '0.5923', 'learning_rate': '1.233e-05', 'epoch': '0.02479', 'num_input_tokens_seen': 415541, 'train_runtime': '214', 'train_tokens_per_second': '1942'}
|
| 207 |
+
{'loss': '1.57', 'grad_norm': '0.517', 'learning_rate': '1.239e-05', 'epoch': '0.02491', 'num_input_tokens_seen': 417588, 'train_runtime': '215', 'train_tokens_per_second': '1942'}
|
| 208 |
+
{'loss': '0.7737', 'grad_norm': '0.3465', 'learning_rate': '1.245e-05', 'epoch': '0.02503', 'num_input_tokens_seen': 419635, 'train_runtime': '216.1', 'train_tokens_per_second': '1942'}
|
| 209 |
+
{'loss': '1.291', 'grad_norm': '0.4848', 'learning_rate': '1.252e-05', 'epoch': '0.02515', 'num_input_tokens_seen': 421682, 'train_runtime': '217.1', 'train_tokens_per_second': '1942'}
|
| 210 |
+
{'loss': '1.418', 'grad_norm': '0.4659', 'learning_rate': '1.258e-05', 'epoch': '0.02527', 'num_input_tokens_seen': 423729, 'train_runtime': '218.2', 'train_tokens_per_second': '1942'}
|
| 211 |
+
{'loss': '1.081', 'grad_norm': '0.4705', 'learning_rate': '1.264e-05', 'epoch': '0.0254', 'num_input_tokens_seen': 425776, 'train_runtime': '219.2', 'train_tokens_per_second': '1942'}
|
| 212 |
+
{'loss': '1.675', 'grad_norm': '0.4767', 'learning_rate': '1.27e-05', 'epoch': '0.02552', 'num_input_tokens_seen': 427823, 'train_runtime': '220.2', 'train_tokens_per_second': '1942'}
|
| 213 |
+
{'loss': '1.681', 'grad_norm': '0.5783', 'learning_rate': '1.276e-05', 'epoch': '0.02564', 'num_input_tokens_seen': 429870, 'train_runtime': '221.3', 'train_tokens_per_second': '1943'}
|
| 214 |
+
{'loss': '1.452', 'grad_norm': '0.4866', 'learning_rate': '1.282e-05', 'epoch': '0.02576', 'num_input_tokens_seen': 431917, 'train_runtime': '222.3', 'train_tokens_per_second': '1943'}
|
| 215 |
+
{'loss': '0.9691', 'grad_norm': '0.4056', 'learning_rate': '1.288e-05', 'epoch': '0.02589', 'num_input_tokens_seen': 433964, 'train_runtime': '223.4', 'train_tokens_per_second': '1943'}
|
| 216 |
+
{'loss': '0.6256', 'grad_norm': '0.3151', 'learning_rate': '1.294e-05', 'epoch': '0.02601', 'num_input_tokens_seen': 436011, 'train_runtime': '224.4', 'train_tokens_per_second': '1943'}
|
| 217 |
+
{'loss': '0.6349', 'grad_norm': '0.3113', 'learning_rate': '1.3e-05', 'epoch': '0.02613', 'num_input_tokens_seen': 438058, 'train_runtime': '225.5', 'train_tokens_per_second': '1943'}
|
| 218 |
+
{'loss': '1.575', 'grad_norm': '0.6033', 'learning_rate': '1.306e-05', 'epoch': '0.02625', 'num_input_tokens_seen': 440105, 'train_runtime': '226.5', 'train_tokens_per_second': '1943'}
|
| 219 |
+
{'loss': '1.585', 'grad_norm': '0.5161', 'learning_rate': '1.313e-05', 'epoch': '0.02637', 'num_input_tokens_seen': 442152, 'train_runtime': '227.6', 'train_tokens_per_second': '1943'}
|
| 220 |
+
{'loss': '1.182', 'grad_norm': '0.4157', 'learning_rate': '1.319e-05', 'epoch': '0.0265', 'num_input_tokens_seen': 444199, 'train_runtime': '228.6', 'train_tokens_per_second': '1943'}
|
| 221 |
+
{'loss': '1.789', 'grad_norm': '0.6525', 'learning_rate': '1.325e-05', 'epoch': '0.02662', 'num_input_tokens_seen': 446246, 'train_runtime': '229.7', 'train_tokens_per_second': '1943'}
|
| 222 |
+
{'loss': '0.6394', 'grad_norm': '0.332', 'learning_rate': '1.331e-05', 'epoch': '0.02674', 'num_input_tokens_seen': 448293, 'train_runtime': '230.7', 'train_tokens_per_second': '1943'}
|
| 223 |
+
{'loss': '1.595', 'grad_norm': '0.5779', 'learning_rate': '1.337e-05', 'epoch': '0.02686', 'num_input_tokens_seen': 450340, 'train_runtime': '231.7', 'train_tokens_per_second': '1943'}
|
| 224 |
+
{'loss': '0.8082', 'grad_norm': '0.3568', 'learning_rate': '1.343e-05', 'epoch': '0.02698', 'num_input_tokens_seen': 452387, 'train_runtime': '232.8', 'train_tokens_per_second': '1943'}
|
| 225 |
+
{'loss': '1.479', 'grad_norm': '0.5858', 'learning_rate': '1.349e-05', 'epoch': '0.02711', 'num_input_tokens_seen': 454434, 'train_runtime': '233.8', 'train_tokens_per_second': '1943'}
|
| 226 |
+
{'loss': '1.147', 'grad_norm': '0.4227', 'learning_rate': '1.355e-05', 'epoch': '0.02723', 'num_input_tokens_seen': 456481, 'train_runtime': '234.9', 'train_tokens_per_second': '1943'}
|
| 227 |
+
{'loss': '1.603', 'grad_norm': '0.4923', 'learning_rate': '1.361e-05', 'epoch': '0.02735', 'num_input_tokens_seen': 458528, 'train_runtime': '235.9', 'train_tokens_per_second': '1943'}
|
| 228 |
+
{'loss': '1.538', 'grad_norm': '0.5759', 'learning_rate': '1.368e-05', 'epoch': '0.02747', 'num_input_tokens_seen': 460575, 'train_runtime': '237', 'train_tokens_per_second': '1944'}
|
| 229 |
+
{'loss': '0.7194', 'grad_norm': '0.3567', 'learning_rate': '1.374e-05', 'epoch': '0.02759', 'num_input_tokens_seen': 462622, 'train_runtime': '238', 'train_tokens_per_second': '1944'}
|
| 230 |
+
{'loss': '1.721', 'grad_norm': '0.5946', 'learning_rate': '1.38e-05', 'epoch': '0.02772', 'num_input_tokens_seen': 464669, 'train_runtime': '239.1', 'train_tokens_per_second': '1944'}
|
| 231 |
+
{'loss': '1.277', 'grad_norm': '0.5085', 'learning_rate': '1.386e-05', 'epoch': '0.02784', 'num_input_tokens_seen': 466716, 'train_runtime': '240.1', 'train_tokens_per_second': '1944'}
|
| 232 |
+
{'loss': '1.659', 'grad_norm': '0.6458', 'learning_rate': '1.392e-05', 'epoch': '0.02796', 'num_input_tokens_seen': 468763, 'train_runtime': '241.2', 'train_tokens_per_second': '1944'}
|
| 233 |
+
{'loss': '1.555', 'grad_norm': '0.4655', 'learning_rate': '1.398e-05', 'epoch': '0.02808', 'num_input_tokens_seen': 470810, 'train_runtime': '242.2', 'train_tokens_per_second': '1944'}
|
| 234 |
+
{'loss': '1.236', 'grad_norm': '0.5168', 'learning_rate': '1.404e-05', 'epoch': '0.02821', 'num_input_tokens_seen': 472857, 'train_runtime': '243.2', 'train_tokens_per_second': '1944'}
|
| 235 |
+
{'loss': '1.659', 'grad_norm': '0.5702', 'learning_rate': '1.41e-05', 'epoch': '0.02833', 'num_input_tokens_seen': 474904, 'train_runtime': '244.3', 'train_tokens_per_second': '1944'}
|
| 236 |
+
{'loss': '1.295', 'grad_norm': '0.4997', 'learning_rate': '1.416e-05', 'epoch': '0.02845', 'num_input_tokens_seen': 476951, 'train_runtime': '245.3', 'train_tokens_per_second': '1944'}
|
| 237 |
+
{'loss': '1.355', 'grad_norm': '0.5255', 'learning_rate': '1.422e-05', 'epoch': '0.02857', 'num_input_tokens_seen': 478998, 'train_runtime': '246.4', 'train_tokens_per_second': '1944'}
|
| 238 |
+
{'loss': '1.326', 'grad_norm': '0.59', 'learning_rate': '1.429e-05', 'epoch': '0.02869', 'num_input_tokens_seen': 481045, 'train_runtime': '247.4', 'train_tokens_per_second': '1944'}
|
| 239 |
+
{'loss': '0.8155', 'grad_norm': '0.4466', 'learning_rate': '1.435e-05', 'epoch': '0.02882', 'num_input_tokens_seen': 483092, 'train_runtime': '248.5', 'train_tokens_per_second': '1944'}
|
| 240 |
+
{'loss': '1.097', 'grad_norm': '0.431', 'learning_rate': '1.441e-05', 'epoch': '0.02894', 'num_input_tokens_seen': 485139, 'train_runtime': '249.5', 'train_tokens_per_second': '1944'}
|
| 241 |
+
{'loss': '1.442', 'grad_norm': '0.6068', 'learning_rate': '1.447e-05', 'epoch': '0.02906', 'num_input_tokens_seen': 487186, 'train_runtime': '250.6', 'train_tokens_per_second': '1944'}
|
| 242 |
+
{'loss': '0.6167', 'grad_norm': '0.3797', 'learning_rate': '1.453e-05', 'epoch': '0.02918', 'num_input_tokens_seen': 489233, 'train_runtime': '251.6', 'train_tokens_per_second': '1944'}
|
| 243 |
+
{'loss': '1.099', 'grad_norm': '0.4898', 'learning_rate': '1.459e-05', 'epoch': '0.0293', 'num_input_tokens_seen': 491280, 'train_runtime': '252.7', 'train_tokens_per_second': '1944'}
|
| 244 |
+
{'loss': '1.663', 'grad_norm': '0.7464', 'learning_rate': '1.465e-05', 'epoch': '0.02943', 'num_input_tokens_seen': 493327, 'train_runtime': '253.7', 'train_tokens_per_second': '1944'}
|
| 245 |
+
{'loss': '0.7168', 'grad_norm': '0.4142', 'learning_rate': '1.471e-05', 'epoch': '0.02955', 'num_input_tokens_seen': 495374, 'train_runtime': '254.7', 'train_tokens_per_second': '1945'}
|
| 246 |
+
{'loss': '2.189', 'grad_norm': '0.7521', 'learning_rate': '1.477e-05', 'epoch': '0.02967', 'num_input_tokens_seen': 497421, 'train_runtime': '255.8', 'train_tokens_per_second': '1945'}
|
| 247 |
+
{'loss': '1.161', 'grad_norm': '0.5383', 'learning_rate': '1.484e-05', 'epoch': '0.02979', 'num_input_tokens_seen': 499468, 'train_runtime': '256.8', 'train_tokens_per_second': '1945'}
|
| 248 |
+
{'loss': '0.7095', 'grad_norm': '0.363', 'learning_rate': '1.49e-05', 'epoch': '0.02991', 'num_input_tokens_seen': 501515, 'train_runtime': '257.9', 'train_tokens_per_second': '1945'}
|
| 249 |
+
{'loss': '1.675', 'grad_norm': '0.5704', 'learning_rate': '1.496e-05', 'epoch': '0.03004', 'num_input_tokens_seen': 503562, 'train_runtime': '258.9', 'train_tokens_per_second': '1945'}
|
| 250 |
+
{'loss': '1.544', 'grad_norm': '0.6231', 'learning_rate': '1.502e-05', 'epoch': '0.03016', 'num_input_tokens_seen': 505609, 'train_runtime': '260', 'train_tokens_per_second': '1945'}
|
| 251 |
+
{'loss': '1.202', 'grad_norm': '0.5518', 'learning_rate': '1.508e-05', 'epoch': '0.03028', 'num_input_tokens_seen': 507656, 'train_runtime': '261', 'train_tokens_per_second': '1945'}
|
| 252 |
+
{'loss': '1.31', 'grad_norm': '0.4917', 'learning_rate': '1.514e-05', 'epoch': '0.0304', 'num_input_tokens_seen': 509703, 'train_runtime': '262.1', 'train_tokens_per_second': '1945'}
|
| 253 |
+
{'loss': '1.394', 'grad_norm': '0.4971', 'learning_rate': '1.52e-05', 'epoch': '0.03053', 'num_input_tokens_seen': 511750, 'train_runtime': '263.1', 'train_tokens_per_second': '1945'}
|
| 254 |
+
{'loss': '1.184', 'grad_norm': '0.4955', 'learning_rate': '1.526e-05', 'epoch': '0.03065', 'num_input_tokens_seen': 513797, 'train_runtime': '264.2', 'train_tokens_per_second': '1945'}
|
| 255 |
+
{'loss': '1.614', 'grad_norm': '0.8268', 'learning_rate': '1.532e-05', 'epoch': '0.03077', 'num_input_tokens_seen': 515844, 'train_runtime': '265.2', 'train_tokens_per_second': '1945'}
|
| 256 |
+
{'loss': '1.911', 'grad_norm': '0.7492', 'learning_rate': '1.538e-05', 'epoch': '0.03089', 'num_input_tokens_seen': 517891, 'train_runtime': '266.3', 'train_tokens_per_second': '1945'}
|
| 257 |
+
{'loss': '1.459', 'grad_norm': '0.5983', 'learning_rate': '1.545e-05', 'epoch': '0.03101', 'num_input_tokens_seen': 519938, 'train_runtime': '267.3', 'train_tokens_per_second': '1945'}
|
| 258 |
+
{'loss': '1.595', 'grad_norm': '0.6762', 'learning_rate': '1.551e-05', 'epoch': '0.03114', 'num_input_tokens_seen': 521985, 'train_runtime': '268.4', 'train_tokens_per_second': '1945'}
|
| 259 |
+
{'loss': '0.6932', 'grad_norm': '0.4707', 'learning_rate': '1.557e-05', 'epoch': '0.03126', 'num_input_tokens_seen': 524032, 'train_runtime': '269.4', 'train_tokens_per_second': '1945'}
|
| 260 |
+
{'loss': '2.117', 'grad_norm': '0.7636', 'learning_rate': '1.563e-05', 'epoch': '0.03138', 'num_input_tokens_seen': 526079, 'train_runtime': '270.5', 'train_tokens_per_second': '1945'}
|
| 261 |
+
{'loss': '1.121', 'grad_norm': '0.478', 'learning_rate': '1.569e-05', 'epoch': '0.0315', 'num_input_tokens_seen': 528126, 'train_runtime': '271.5', 'train_tokens_per_second': '1945'}
|
| 262 |
+
{'loss': '1.432', 'grad_norm': '0.6419', 'learning_rate': '1.575e-05', 'epoch': '0.03162', 'num_input_tokens_seen': 530173, 'train_runtime': '272.6', 'train_tokens_per_second': '1945'}
|
| 263 |
+
{'loss': '0.7377', 'grad_norm': '0.413', 'learning_rate': '1.581e-05', 'epoch': '0.03175', 'num_input_tokens_seen': 532220, 'train_runtime': '273.6', 'train_tokens_per_second': '1945'}
|
| 264 |
+
{'loss': '1.552', 'grad_norm': '0.6274', 'learning_rate': '1.587e-05', 'epoch': '0.03187', 'num_input_tokens_seen': 534267, 'train_runtime': '274.7', 'train_tokens_per_second': '1945'}
|
| 265 |
+
{'loss': '1.128', 'grad_norm': '0.536', 'learning_rate': '1.593e-05', 'epoch': '0.03199', 'num_input_tokens_seen': 536314, 'train_runtime': '275.7', 'train_tokens_per_second': '1945'}
|
| 266 |
+
{'loss': '1.204', 'grad_norm': '0.544', 'learning_rate': '1.6e-05', 'epoch': '0.03211', 'num_input_tokens_seen': 538361, 'train_runtime': '276.8', 'train_tokens_per_second': '1945'}
|
| 267 |
+
{'loss': '0.01306', 'grad_norm': '0.06258', 'learning_rate': '1.606e-05', 'epoch': '0.03223', 'num_input_tokens_seen': 540408, 'train_runtime': '277.8', 'train_tokens_per_second': '1945'}
|
| 268 |
+
{'loss': '1.558', 'grad_norm': '0.6964', 'learning_rate': '1.612e-05', 'epoch': '0.03236', 'num_input_tokens_seen': 542455, 'train_runtime': '278.8', 'train_tokens_per_second': '1945'}
|
| 269 |
+
{'loss': '1.02', 'grad_norm': '0.509', 'learning_rate': '1.618e-05', 'epoch': '0.03248', 'num_input_tokens_seen': 544502, 'train_runtime': '279.9', 'train_tokens_per_second': '1945'}
|
| 270 |
+
{'loss': '1.581', 'grad_norm': '0.6765', 'learning_rate': '1.624e-05', 'epoch': '0.0326', 'num_input_tokens_seen': 546549, 'train_runtime': '280.9', 'train_tokens_per_second': '1946'}
|
| 271 |
+
{'loss': '0.7899', 'grad_norm': '0.4745', 'learning_rate': '1.63e-05', 'epoch': '0.03272', 'num_input_tokens_seen': 548596, 'train_runtime': '282', 'train_tokens_per_second': '1946'}
|
| 272 |
+
{'loss': '1.312', 'grad_norm': '0.613', 'learning_rate': '1.636e-05', 'epoch': '0.03284', 'num_input_tokens_seen': 550643, 'train_runtime': '283', 'train_tokens_per_second': '1946'}
|
| 273 |
+
{'loss': '1.312', 'grad_norm': '0.6338', 'learning_rate': '1.642e-05', 'epoch': '0.03297', 'num_input_tokens_seen': 552690, 'train_runtime': '284.1', 'train_tokens_per_second': '1946'}
|
| 274 |
+
{'loss': '0.7668', 'grad_norm': '0.4715', 'learning_rate': '1.648e-05', 'epoch': '0.03309', 'num_input_tokens_seen': 554737, 'train_runtime': '285.1', 'train_tokens_per_second': '1946'}
|
| 275 |
+
{'loss': '1.125', 'grad_norm': '0.6008', 'learning_rate': '1.654e-05', 'epoch': '0.03321', 'num_input_tokens_seen': 556784, 'train_runtime': '286.1', 'train_tokens_per_second': '1946'}
|
| 276 |
+
{'loss': '1.317', 'grad_norm': '0.6867', 'learning_rate': '1.661e-05', 'epoch': '0.03333', 'num_input_tokens_seen': 558831, 'train_runtime': '287.2', 'train_tokens_per_second': '1946'}
|
| 277 |
+
{'loss': '1.421', 'grad_norm': '0.6412', 'learning_rate': '1.667e-05', 'epoch': '0.03346', 'num_input_tokens_seen': 560878, 'train_runtime': '288.2', 'train_tokens_per_second': '1946'}
|
| 278 |
+
{'loss': '1.625', 'grad_norm': '0.7158', 'learning_rate': '1.673e-05', 'epoch': '0.03358', 'num_input_tokens_seen': 562925, 'train_runtime': '289.3', 'train_tokens_per_second': '1946'}
|
| 279 |
+
{'loss': '1.191', 'grad_norm': '0.6911', 'learning_rate': '1.679e-05', 'epoch': '0.0337', 'num_input_tokens_seen': 564972, 'train_runtime': '290.3', 'train_tokens_per_second': '1946'}
|
| 280 |
+
{'loss': '0.6447', 'grad_norm': '0.5162', 'learning_rate': '1.685e-05', 'epoch': '0.03382', 'num_input_tokens_seen': 567019, 'train_runtime': '291.4', 'train_tokens_per_second': '1946'}
|
| 281 |
+
{'loss': '0.8032', 'grad_norm': '0.4759', 'learning_rate': '1.691e-05', 'epoch': '0.03394', 'num_input_tokens_seen': 569066, 'train_runtime': '292.4', 'train_tokens_per_second': '1946'}
|
| 282 |
+
{'loss': '1.107', 'grad_norm': '0.5404', 'learning_rate': '1.697e-05', 'epoch': '0.03407', 'num_input_tokens_seen': 571113, 'train_runtime': '293.4', 'train_tokens_per_second': '1946'}
|
| 283 |
+
{'loss': '1.319', 'grad_norm': '0.7111', 'learning_rate': '1.703e-05', 'epoch': '0.03419', 'num_input_tokens_seen': 573160, 'train_runtime': '294.5', 'train_tokens_per_second': '1946'}
|
| 284 |
+
{'loss': '1.366', 'grad_norm': '0.6837', 'learning_rate': '1.709e-05', 'epoch': '0.03431', 'num_input_tokens_seen': 575207, 'train_runtime': '295.5', 'train_tokens_per_second': '1946'}
|
| 285 |
+
{'loss': '1.553', 'grad_norm': '0.767', 'learning_rate': '1.716e-05', 'epoch': '0.03443', 'num_input_tokens_seen': 577254, 'train_runtime': '296.6', 'train_tokens_per_second': '1946'}
|
| 286 |
+
{'loss': '0.7748', 'grad_norm': '0.5244', 'learning_rate': '1.722e-05', 'epoch': '0.03455', 'num_input_tokens_seen': 579301, 'train_runtime': '297.6', 'train_tokens_per_second': '1947'}
|
| 287 |
+
{'loss': '0.6421', 'grad_norm': '0.4703', 'learning_rate': '1.728e-05', 'epoch': '0.03468', 'num_input_tokens_seen': 581348, 'train_runtime': '298.6', 'train_tokens_per_second': '1947'}
|
| 288 |
+
{'loss': '1.299', 'grad_norm': '0.7458', 'learning_rate': '1.734e-05', 'epoch': '0.0348', 'num_input_tokens_seen': 583395, 'train_runtime': '299.7', 'train_tokens_per_second': '1947'}
|
| 289 |
+
{'loss': '1.37', 'grad_norm': '0.766', 'learning_rate': '1.74e-05', 'epoch': '0.03492', 'num_input_tokens_seen': 585442, 'train_runtime': '300.7', 'train_tokens_per_second': '1947'}
|
| 290 |
+
{'loss': '1.21', 'grad_norm': '0.7069', 'learning_rate': '1.746e-05', 'epoch': '0.03504', 'num_input_tokens_seen': 587489, 'train_runtime': '301.8', 'train_tokens_per_second': '1947'}
|
| 291 |
+
{'loss': '1.371', 'grad_norm': '0.7178', 'learning_rate': '1.752e-05', 'epoch': '0.03516', 'num_input_tokens_seen': 589536, 'train_runtime': '302.8', 'train_tokens_per_second': '1947'}
|
| 292 |
+
{'loss': '0.6646', 'grad_norm': '0.5199', 'learning_rate': '1.758e-05', 'epoch': '0.03529', 'num_input_tokens_seen': 591583, 'train_runtime': '303.9', 'train_tokens_per_second': '1947'}
|
| 293 |
+
{'loss': '1.354', 'grad_norm': '0.6725', 'learning_rate': '1.764e-05', 'epoch': '0.03541', 'num_input_tokens_seen': 593630, 'train_runtime': '304.9', 'train_tokens_per_second': '1947'}
|
| 294 |
+
{'loss': '0.01388', 'grad_norm': '0.07445', 'learning_rate': '1.77e-05', 'epoch': '0.03553', 'num_input_tokens_seen': 595677, 'train_runtime': '305.9', 'train_tokens_per_second': '1947'}
|
| 295 |
+
{'loss': '0.7751', 'grad_norm': '0.5144', 'learning_rate': '1.777e-05', 'epoch': '0.03565', 'num_input_tokens_seen': 597724, 'train_runtime': '307', 'train_tokens_per_second': '1947'}
|
| 296 |
+
{'loss': '1.217', 'grad_norm': '6.529', 'learning_rate': '1.783e-05', 'epoch': '0.03578', 'num_input_tokens_seen': 599771, 'train_runtime': '308', 'train_tokens_per_second': '1947'}
|
| 297 |
+
{'loss': '1.091', 'grad_norm': '0.6843', 'learning_rate': '1.789e-05', 'epoch': '0.0359', 'num_input_tokens_seen': 601818, 'train_runtime': '309.1', 'train_tokens_per_second': '1947'}
|
| 298 |
+
{'loss': '1.242', 'grad_norm': '0.6793', 'learning_rate': '1.795e-05', 'epoch': '0.03602', 'num_input_tokens_seen': 603865, 'train_runtime': '310.1', 'train_tokens_per_second': '1947'}
|
| 299 |
+
{'loss': '1.213', 'grad_norm': '0.6243', 'learning_rate': '1.801e-05', 'epoch': '0.03614', 'num_input_tokens_seen': 605912, 'train_runtime': '311.1', 'train_tokens_per_second': '1947'}
|
LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/requirements.txt
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pytz==2025.2
|
| 2 |
+
pydub==0.25.1
|
| 3 |
+
brotli==1.2.0
|
| 4 |
+
antlr4-python3-runtime==4.9.3
|
| 5 |
+
xxhash==3.6.0
|
| 6 |
+
websockets==15.0.1
|
| 7 |
+
tzdata==2025.3
|
| 8 |
+
typing_extensions==4.15.0
|
| 9 |
+
tqdm==4.67.3
|
| 10 |
+
tomlkit==0.13.3
|
| 11 |
+
termcolor==3.3.0
|
| 12 |
+
shtab==1.8.0
|
| 13 |
+
shellingham==1.5.4
|
| 14 |
+
sentencepiece==0.2.1
|
| 15 |
+
semantic-version==2.10.0
|
| 16 |
+
safetensors==0.7.0
|
| 17 |
+
ruff==0.15.0
|
| 18 |
+
regex==2026.1.15
|
| 19 |
+
python-multipart==0.0.22
|
| 20 |
+
pyparsing==3.3.2
|
| 21 |
+
pyarrow==23.0.0
|
| 22 |
+
protobuf==6.33.5
|
| 23 |
+
propcache==0.4.1
|
| 24 |
+
orjson==3.11.7
|
| 25 |
+
omegaconf==2.3.0
|
| 26 |
+
numpy==2.4.2
|
| 27 |
+
multidict==6.7.1
|
| 28 |
+
mdurl==0.1.2
|
| 29 |
+
kiwisolver==1.4.9
|
| 30 |
+
hf-xet==1.2.0
|
| 31 |
+
hf_transfer==0.1.9
|
| 32 |
+
groovy==0.1.2
|
| 33 |
+
frozenlist==1.8.0
|
| 34 |
+
fonttools==4.61.1
|
| 35 |
+
ffmpy==1.0.0
|
| 36 |
+
einops==0.8.2
|
| 37 |
+
docstring_parser==0.17.0
|
| 38 |
+
dill==0.3.8
|
| 39 |
+
cycler==0.12.1
|
| 40 |
+
click==8.3.1
|
| 41 |
+
av==16.0.0
|
| 42 |
+
annotated-types==0.7.0
|
| 43 |
+
annotated-doc==0.0.4
|
| 44 |
+
aiohappyeyeballs==2.6.1
|
| 45 |
+
aiofiles==24.1.0
|
| 46 |
+
yarl==1.22.0
|
| 47 |
+
uvicorn==0.40.0
|
| 48 |
+
typing-inspection==0.4.2
|
| 49 |
+
typer-slim==0.21.1
|
| 50 |
+
tiktoken==0.12.0
|
| 51 |
+
scipy==1.17.0
|
| 52 |
+
pydantic_core==2.41.4
|
| 53 |
+
pandas==2.3.3
|
| 54 |
+
multiprocess==0.70.16
|
| 55 |
+
modelscope==1.34.0
|
| 56 |
+
markdown-it-py==4.0.0
|
| 57 |
+
fire==0.7.1
|
| 58 |
+
contourpy==1.3.3
|
| 59 |
+
anyio==4.12.1
|
| 60 |
+
aiosignal==1.4.0
|
| 61 |
+
starlette==0.50.0
|
| 62 |
+
rich==14.3.2
|
| 63 |
+
pydantic==2.12.3
|
| 64 |
+
matplotlib==3.10.8
|
| 65 |
+
aiohttp==3.13.3
|
| 66 |
+
tyro==0.8.14
|
| 67 |
+
typer==0.21.1
|
| 68 |
+
torchdata==0.11.0
|
| 69 |
+
sse-starlette==3.2.0
|
| 70 |
+
safehttpx==0.1.7
|
| 71 |
+
huggingface_hub==1.3.7
|
| 72 |
+
fastapi==0.128.0
|
| 73 |
+
tokenizers==0.22.2
|
| 74 |
+
gradio_client==1.14.0
|
| 75 |
+
datasets==4.0.0
|
| 76 |
+
accelerate==1.11.0
|
| 77 |
+
transformers==5.0.0
|
| 78 |
+
gradio==5.50.0
|
| 79 |
+
trl==0.24.0
|
| 80 |
+
peft==0.18.1
|
| 81 |
+
jieba==0.42.1
|
| 82 |
+
rouge-chinese==1.0.3
|
| 83 |
+
joblib==1.5.3
|
| 84 |
+
nltk==3.9.2
|
| 85 |
+
llamafactory==0.9.5.dev0
|
| 86 |
+
py-cpuinfo==9.0.0
|
| 87 |
+
nvidia-ml-py==13.590.48
|
| 88 |
+
hjson==3.1.0
|
| 89 |
+
ninja==1.13.0
|
| 90 |
+
msgpack==1.1.2
|
| 91 |
+
deepspeed==0.16.9
|
| 92 |
+
smmap==5.0.2
|
| 93 |
+
sentry-sdk==2.51.0
|
| 94 |
+
gitdb==4.0.12
|
| 95 |
+
GitPython==3.1.46
|
| 96 |
+
wandb==0.24.1
|
| 97 |
+
entrypoints==0.4
|
| 98 |
+
jupyter_client==7.4.9
|
| 99 |
+
nbclassic==1.1.0
|
| 100 |
+
notebook==6.5.5
|
| 101 |
+
pyzmq==24.0.1
|
| 102 |
+
PyYAML==6.0.2
|
| 103 |
+
Send2Trash==1.8.3
|
| 104 |
+
argon2-cffi==23.1.0
|
| 105 |
+
argon2-cffi-bindings==21.2.0
|
| 106 |
+
arrow==1.3.0
|
| 107 |
+
asttokens==2.4.1
|
| 108 |
+
async-lru==2.0.4
|
| 109 |
+
attrs==24.2.0
|
| 110 |
+
babel==2.16.0
|
| 111 |
+
beautifulsoup4==4.12.3
|
| 112 |
+
bleach==6.1.0
|
| 113 |
+
certifi==2024.8.30
|
| 114 |
+
cffi==1.17.1
|
| 115 |
+
charset-normalizer==3.3.2
|
| 116 |
+
comm==0.2.2
|
| 117 |
+
debugpy==1.8.5
|
| 118 |
+
decorator==5.1.1
|
| 119 |
+
defusedxml==0.7.1
|
| 120 |
+
executing==2.1.0
|
| 121 |
+
fastjsonschema==2.20.0
|
| 122 |
+
fqdn==1.5.1
|
| 123 |
+
h11==0.14.0
|
| 124 |
+
httpcore==1.0.5
|
| 125 |
+
httpx==0.27.2
|
| 126 |
+
idna==3.10
|
| 127 |
+
ipykernel==6.29.5
|
| 128 |
+
ipython==8.27.0
|
| 129 |
+
ipython-genutils==0.2.0
|
| 130 |
+
ipywidgets==8.1.5
|
| 131 |
+
isoduration==20.11.0
|
| 132 |
+
jedi==0.19.1
|
| 133 |
+
json5==0.9.25
|
| 134 |
+
jsonpointer==3.0.0
|
| 135 |
+
jsonschema==4.23.0
|
| 136 |
+
jsonschema-specifications==2023.12.1
|
| 137 |
+
jupyter-archive==3.4.0
|
| 138 |
+
jupyter_contrib_core==0.4.2
|
| 139 |
+
jupyter_contrib_nbextensions==0.7.0
|
| 140 |
+
jupyter_core==5.7.2
|
| 141 |
+
jupyter-events==0.10.0
|
| 142 |
+
jupyter-highlight-selected-word==0.2.0
|
| 143 |
+
jupyter-lsp==2.2.5
|
| 144 |
+
jupyter_nbextensions_configurator==0.6.4
|
| 145 |
+
jupyter_server==2.14.2
|
| 146 |
+
jupyter_server_terminals==0.5.3
|
| 147 |
+
jupyterlab==4.2.5
|
| 148 |
+
jupyterlab_pygments==0.3.0
|
| 149 |
+
jupyterlab_server==2.27.3
|
| 150 |
+
jupyterlab_widgets==3.0.13
|
| 151 |
+
lxml==5.3.0
|
| 152 |
+
matplotlib-inline==0.1.7
|
| 153 |
+
mistune==3.0.2
|
| 154 |
+
nbclient==0.10.0
|
| 155 |
+
nbconvert==7.16.4
|
| 156 |
+
nbformat==5.10.4
|
| 157 |
+
nest-asyncio==1.6.0
|
| 158 |
+
notebook_shim==0.2.4
|
| 159 |
+
overrides==7.7.0
|
| 160 |
+
packaging==24.1
|
| 161 |
+
pandocfilters==1.5.1
|
| 162 |
+
parso==0.8.4
|
| 163 |
+
pexpect==4.9.0
|
| 164 |
+
platformdirs==4.3.6
|
| 165 |
+
prometheus_client==0.21.0
|
| 166 |
+
prompt_toolkit==3.0.47
|
| 167 |
+
psutil==6.0.0
|
| 168 |
+
ptyprocess==0.7.0
|
| 169 |
+
pure_eval==0.2.3
|
| 170 |
+
pycparser==2.22
|
| 171 |
+
Pygments==2.18.0
|
| 172 |
+
python-dateutil==2.9.0.post0
|
| 173 |
+
python-json-logger==2.0.7
|
| 174 |
+
referencing==0.35.1
|
| 175 |
+
requests==2.32.3
|
| 176 |
+
rfc3339-validator==0.1.4
|
| 177 |
+
rfc3986-validator==0.1.1
|
| 178 |
+
rpds-py==0.20.0
|
| 179 |
+
sniffio==1.3.1
|
| 180 |
+
soupsieve==2.6
|
| 181 |
+
stack-data==0.6.3
|
| 182 |
+
terminado==0.18.1
|
| 183 |
+
tinycss2==1.3.0
|
| 184 |
+
tornado==6.4.1
|
| 185 |
+
traitlets==5.14.3
|
| 186 |
+
types-python-dateutil==2.9.0.20240906
|
| 187 |
+
uri-template==1.3.0
|
| 188 |
+
urllib3==2.2.3
|
| 189 |
+
wcwidth==0.2.13
|
| 190 |
+
webcolors==24.8.0
|
| 191 |
+
webencodings==0.5.1
|
| 192 |
+
websocket-client==1.8.0
|
| 193 |
+
widgetsnbextension==4.0.13
|
| 194 |
+
Jinja2==3.1.3
|
| 195 |
+
MarkupSafe==2.1.5
|
| 196 |
+
filelock==3.13.1
|
| 197 |
+
fsspec==2024.2.0
|
| 198 |
+
mpmath==1.3.0
|
| 199 |
+
networkx==3.2.1
|
| 200 |
+
nvidia-cublas-cu12==12.4.2.65
|
| 201 |
+
nvidia-cuda-cupti-cu12==12.4.99
|
| 202 |
+
nvidia-cuda-nvrtc-cu12==12.4.99
|
| 203 |
+
nvidia-cuda-runtime-cu12==12.4.99
|
| 204 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 205 |
+
nvidia-cufft-cu12==11.2.0.44
|
| 206 |
+
nvidia-curand-cu12==10.3.5.119
|
| 207 |
+
nvidia-cusolver-cu12==11.6.0.99
|
| 208 |
+
nvidia-cusparse-cu12==12.3.0.142
|
| 209 |
+
nvidia-nccl-cu12==2.20.5
|
| 210 |
+
nvidia-nvjitlink-cu12==12.4.99
|
| 211 |
+
nvidia-nvtx-cu12==12.4.99
|
| 212 |
+
pillow==10.2.0
|
| 213 |
+
sympy==1.12
|
| 214 |
+
torch==2.4.1+cu124
|
| 215 |
+
torchaudio==2.4.1+cu124
|
| 216 |
+
torchvision==0.19.1+cu124
|
| 217 |
+
triton==3.0.0
|
| 218 |
+
pip==24.2
|
| 219 |
+
setuptools==75.1.0
|
| 220 |
+
wheel==0.44.0
|
| 221 |
+
PyGObject==3.42.1
|
| 222 |
+
PyJWT==2.3.0
|
| 223 |
+
SecretStorage==3.3.1
|
| 224 |
+
blinker==1.4
|
| 225 |
+
cryptography==3.4.8
|
| 226 |
+
dbus-python==1.2.18
|
| 227 |
+
distro==1.7.0
|
| 228 |
+
httplib2==0.20.2
|
| 229 |
+
importlib-metadata==4.6.4
|
| 230 |
+
jeepney==0.7.1
|
| 231 |
+
keyring==23.5.0
|
| 232 |
+
launchpadlib==1.10.16
|
| 233 |
+
lazr.restfulclient==0.14.4
|
| 234 |
+
lazr.uri==1.0.6
|
| 235 |
+
more-itertools==8.10.0
|
| 236 |
+
oauthlib==3.2.0
|
| 237 |
+
python-apt==2.4.0+ubuntu4
|
| 238 |
+
six==1.16.0
|
| 239 |
+
wadllib==1.3.6
|
| 240 |
+
zipp==1.0.0
|
| 241 |
+
autocommand==2.2.2
|
| 242 |
+
backports.tarfile==1.2.0
|
| 243 |
+
importlib_metadata==8.0.0
|
| 244 |
+
importlib_resources==6.4.0
|
| 245 |
+
inflect==7.3.1
|
| 246 |
+
jaraco.collections==5.1.0
|
| 247 |
+
jaraco.context==5.3.0
|
| 248 |
+
jaraco.functools==4.0.1
|
| 249 |
+
jaraco.text==3.12.1
|
| 250 |
+
more-itertools==10.3.0
|
| 251 |
+
packaging==24.1
|
| 252 |
+
platformdirs==4.2.2
|
| 253 |
+
tomli==2.0.1
|
| 254 |
+
typeguard==4.3.0
|
| 255 |
+
typing_extensions==4.12.2
|
| 256 |
+
wheel==0.43.0
|
| 257 |
+
zipp==3.19.2
|
LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-6.8.0-90-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.10",
|
| 4 |
+
"startedAt": "2026-02-04T04:03:32.123297Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"/workspace/v127rc_exp1/B.yaml"
|
| 7 |
+
],
|
| 8 |
+
"program": "/usr/local/bin/llamafactory-cli",
|
| 9 |
+
"git": {
|
| 10 |
+
"remote": "https://github.com/hiyouga/LlamaFactory.git",
|
| 11 |
+
"commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
|
| 12 |
+
},
|
| 13 |
+
"email": "markmochi200@gmail.com",
|
| 14 |
+
"root": "/workspace/LlamaFactory",
|
| 15 |
+
"host": "34f54978776c",
|
| 16 |
+
"executable": "/usr/bin/python",
|
| 17 |
+
"cpu_count": 24,
|
| 18 |
+
"cpu_count_logical": 48,
|
| 19 |
+
"gpu": "NVIDIA GeForce RTX 4090",
|
| 20 |
+
"gpu_count": 1,
|
| 21 |
+
"disk": {
|
| 22 |
+
"/": {
|
| 23 |
+
"total": "21474836480",
|
| 24 |
+
"used": "1931460608"
|
| 25 |
+
}
|
| 26 |
+
},
|
| 27 |
+
"memory": {
|
| 28 |
+
"total": "405012275200"
|
| 29 |
+
},
|
| 30 |
+
"gpu_nvidia": [
|
| 31 |
+
{
|
| 32 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 33 |
+
"memoryTotal": "25757220864",
|
| 34 |
+
"cudaCores": 16384,
|
| 35 |
+
"architecture": "Ada",
|
| 36 |
+
"uuid": "GPU-acb5171c-45e7-5653-1120-9d0cd2a192a6"
|
| 37 |
+
}
|
| 38 |
+
],
|
| 39 |
+
"cudaVersion": "12.8",
|
| 40 |
+
"writerId": "vighgaih8gdd38lqtuv2307y0stf4bym"
|
| 41 |
+
}
|
LlamaFactory/wandb/run-20260204_040332-hwsb1mff/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-02-04T04:03:32.369728388Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
|
| 2 |
+
{"time":"2026-02-04T04:03:32.692853515Z","level":"INFO","msg":"stream: created new stream","id":"hwsb1mff"}
|
| 3 |
+
{"time":"2026-02-04T04:03:32.693536225Z","level":"INFO","msg":"handler: started","stream_id":"hwsb1mff"}
|
| 4 |
+
{"time":"2026-02-04T04:03:32.695103475Z","level":"INFO","msg":"stream: started","id":"hwsb1mff"}
|
| 5 |
+
{"time":"2026-02-04T04:03:32.695123335Z","level":"INFO","msg":"writer: started","stream_id":"hwsb1mff"}
|
| 6 |
+
{"time":"2026-02-04T04:03:32.695124927Z","level":"INFO","msg":"sender: started","stream_id":"hwsb1mff"}
|
LlamaFactory/wandb/run-20260204_040332-hwsb1mff/logs/debug.log
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-02-04 04:03:32,144 INFO MainThread:7849 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1
|
| 2 |
+
2026-02-04 04:03:32,144 INFO MainThread:7849 [wandb_setup.py:_flush():81] Configure stats pid to 7849
|
| 3 |
+
2026-02-04 04:03:32,144 INFO MainThread:7849 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 4 |
+
2026-02-04 04:03:32,145 INFO MainThread:7849 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_040332-hwsb1mff/logs/debug.log
|
| 5 |
+
2026-02-04 04:03:32,147 INFO MainThread:7849 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_040332-hwsb1mff/logs/debug-internal.log
|
| 6 |
+
2026-02-04 04:03:32,147 INFO MainThread:7849 [wandb_init.py:init():844] calling init triggers
|
| 7 |
+
2026-02-04 04:03:32,147 INFO MainThread:7849 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
|
| 8 |
+
config: {'_wandb': {}}
|
| 9 |
+
2026-02-04 04:03:32,148 INFO MainThread:7849 [wandb_init.py:init():892] starting backend
|
| 10 |
+
2026-02-04 04:03:32,362 INFO MainThread:7849 [wandb_init.py:init():895] sending inform_init request
|
| 11 |
+
2026-02-04 04:03:32,368 INFO MainThread:7849 [wandb_init.py:init():903] backend started and connected
|
| 12 |
+
2026-02-04 04:03:32,369 INFO MainThread:7849 [wandb_init.py:init():973] updated telemetry
|
| 13 |
+
2026-02-04 04:03:32,417 INFO MainThread:7849 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
|
| 14 |
+
2026-02-04 04:03:33,108 INFO MainThread:7849 [wandb_init.py:init():1042] starting run threads in backend
|
| 15 |
+
2026-02-04 04:03:33,181 INFO MainThread:7849 [wandb_run.py:_console_start():2529] atexit reg
|
| 16 |
+
2026-02-04 04:03:33,181 INFO MainThread:7849 [wandb_run.py:_redirect():2377] redirect: wrap_raw
|
| 17 |
+
2026-02-04 04:03:33,182 INFO MainThread:7849 [wandb_run.py:_redirect():2446] Wrapping output streams.
|
| 18 |
+
2026-02-04 04:03:33,182 INFO MainThread:7849 [wandb_run.py:_redirect():2469] Redirects installed.
|
| 19 |
+
2026-02-04 04:03:33,184 INFO MainThread:7849 [wandb_init.py:init():1082] run started, returning control to user process
|
| 20 |
+
2026-02-04 04:03:33,185 INFO MainThread:7849 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['gate_proj', 'down_proj', 'o_proj', 'up_proj', 'k_proj', 'q_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/B', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 585, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
|
| 21 |
+
2026-02-04 04:03:33,192 INFO MainThread:7849 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x76ded82a5690>>
|
| 22 |
+
2026-02-04 04:03:33,193 INFO MainThread:7849 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
|
| 23 |
+
2026-02-04 04:03:33,195 INFO MainThread:7849 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t35_d0_r286'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
|
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/config.yaml
ADDED
|
@@ -0,0 +1,723 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_name_or_path:
|
| 2 |
+
value: /workspace/Qwen/Qwen3-8B-Base
|
| 3 |
+
_wandb:
|
| 4 |
+
value:
|
| 5 |
+
cli_version: 0.24.1
|
| 6 |
+
e:
|
| 7 |
+
jy6in5azojamixlag12ky8yqk0a5luc8:
|
| 8 |
+
args:
|
| 9 |
+
- /workspace/v127rc_exp1/C.yaml
|
| 10 |
+
cpu_count: 16
|
| 11 |
+
cpu_count_logical: 32
|
| 12 |
+
cudaVersion: "13.0"
|
| 13 |
+
disk:
|
| 14 |
+
/:
|
| 15 |
+
total: "21474836480"
|
| 16 |
+
used: "1858318336"
|
| 17 |
+
email: markmochi200@gmail.com
|
| 18 |
+
executable: /usr/bin/python
|
| 19 |
+
git:
|
| 20 |
+
commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
|
| 21 |
+
remote: https://github.com/hiyouga/LlamaFactory.git
|
| 22 |
+
gpu: NVIDIA GeForce RTX 4090
|
| 23 |
+
gpu_count: 1
|
| 24 |
+
gpu_nvidia:
|
| 25 |
+
- architecture: Ada
|
| 26 |
+
cudaCores: 16384
|
| 27 |
+
memoryTotal: "25757220864"
|
| 28 |
+
name: NVIDIA GeForce RTX 4090
|
| 29 |
+
uuid: GPU-2ae1a495-e17f-23d9-e8ed-90585b3df9de
|
| 30 |
+
host: 47a53adf0198
|
| 31 |
+
memory:
|
| 32 |
+
total: "201701408768"
|
| 33 |
+
os: Linux-6.8.0-94-generic-x86_64-with-glibc2.35
|
| 34 |
+
program: /usr/local/bin/llamafactory-cli
|
| 35 |
+
python: CPython 3.11.10
|
| 36 |
+
root: /workspace/LlamaFactory
|
| 37 |
+
startedAt: "2026-02-04T04:05:44.037622Z"
|
| 38 |
+
writerId: jy6in5azojamixlag12ky8yqk0a5luc8
|
| 39 |
+
m:
|
| 40 |
+
- "1": train/global_step
|
| 41 |
+
"6":
|
| 42 |
+
- 3
|
| 43 |
+
"7": []
|
| 44 |
+
- "2": '*'
|
| 45 |
+
"5": 1
|
| 46 |
+
"6":
|
| 47 |
+
- 1
|
| 48 |
+
"7": []
|
| 49 |
+
python_version: 3.11.10
|
| 50 |
+
t:
|
| 51 |
+
"1":
|
| 52 |
+
- 1
|
| 53 |
+
- 11
|
| 54 |
+
- 41
|
| 55 |
+
- 49
|
| 56 |
+
- 51
|
| 57 |
+
- 71
|
| 58 |
+
- 84
|
| 59 |
+
- 98
|
| 60 |
+
- 105
|
| 61 |
+
"2":
|
| 62 |
+
- 1
|
| 63 |
+
- 11
|
| 64 |
+
- 41
|
| 65 |
+
- 49
|
| 66 |
+
- 51
|
| 67 |
+
- 71
|
| 68 |
+
- 84
|
| 69 |
+
- 98
|
| 70 |
+
- 105
|
| 71 |
+
"3":
|
| 72 |
+
- 7
|
| 73 |
+
- 19
|
| 74 |
+
- 62
|
| 75 |
+
- 66
|
| 76 |
+
"4": 3.11.10
|
| 77 |
+
"5": 0.24.1
|
| 78 |
+
"6": 5.0.0
|
| 79 |
+
"9":
|
| 80 |
+
"1": transformers_trainer
|
| 81 |
+
"12": 0.24.1
|
| 82 |
+
"13": linux-x86_64
|
| 83 |
+
accelerator_config:
|
| 84 |
+
value:
|
| 85 |
+
dispatch_batches: null
|
| 86 |
+
even_batches: true
|
| 87 |
+
gradient_accumulation_kwargs: null
|
| 88 |
+
non_blocking: false
|
| 89 |
+
split_batches: false
|
| 90 |
+
use_seedable_sampler: true
|
| 91 |
+
adam_beta1:
|
| 92 |
+
value: 0.9
|
| 93 |
+
adam_beta2:
|
| 94 |
+
value: 0.95
|
| 95 |
+
adam_epsilon:
|
| 96 |
+
value: 1e-08
|
| 97 |
+
architectures:
|
| 98 |
+
value:
|
| 99 |
+
- Qwen3ForCausalLM
|
| 100 |
+
attention_bias:
|
| 101 |
+
value: false
|
| 102 |
+
attention_dropout:
|
| 103 |
+
value: 0
|
| 104 |
+
auto_find_batch_size:
|
| 105 |
+
value: false
|
| 106 |
+
average_tokens_across_devices:
|
| 107 |
+
value: true
|
| 108 |
+
batch_eval_metrics:
|
| 109 |
+
value: false
|
| 110 |
+
bf16:
|
| 111 |
+
value: true
|
| 112 |
+
bf16_full_eval:
|
| 113 |
+
value: false
|
| 114 |
+
bos_token_id:
|
| 115 |
+
value: null
|
| 116 |
+
chunk_size_feed_forward:
|
| 117 |
+
value: 0
|
| 118 |
+
data_args:
|
| 119 |
+
value:
|
| 120 |
+
buffer_size: 16384
|
| 121 |
+
cutoff_len: 2047
|
| 122 |
+
data_shared_file_system: false
|
| 123 |
+
dataset:
|
| 124 |
+
- Markie_Voss_t0_d35_r286
|
| 125 |
+
dataset_dir: /workspace/LlamaFactory/data
|
| 126 |
+
default_system: null
|
| 127 |
+
enable_thinking: false
|
| 128 |
+
eval_dataset: null
|
| 129 |
+
eval_num_beams: null
|
| 130 |
+
eval_on_each_dataset: false
|
| 131 |
+
ignore_pad_token_for_loss: true
|
| 132 |
+
interleave_probs: null
|
| 133 |
+
mask_history: false
|
| 134 |
+
max_samples: 100000000
|
| 135 |
+
media_dir: /workspace/LlamaFactory/data
|
| 136 |
+
mix_strategy: concat
|
| 137 |
+
neat_packing: false
|
| 138 |
+
overwrite_cache: false
|
| 139 |
+
packing: true
|
| 140 |
+
preprocessing_batch_size: 1000
|
| 141 |
+
preprocessing_num_workers: 16
|
| 142 |
+
streaming: false
|
| 143 |
+
template: qwen3_nothink
|
| 144 |
+
tokenized_path: null
|
| 145 |
+
tool_format: null
|
| 146 |
+
train_on_prompt: false
|
| 147 |
+
val_size: 0
|
| 148 |
+
data_seed:
|
| 149 |
+
value: null
|
| 150 |
+
dataloader_drop_last:
|
| 151 |
+
value: false
|
| 152 |
+
dataloader_num_workers:
|
| 153 |
+
value: 0
|
| 154 |
+
dataloader_persistent_workers:
|
| 155 |
+
value: false
|
| 156 |
+
dataloader_pin_memory:
|
| 157 |
+
value: true
|
| 158 |
+
dataloader_prefetch_factor:
|
| 159 |
+
value: null
|
| 160 |
+
ddp_backend:
|
| 161 |
+
value: null
|
| 162 |
+
ddp_broadcast_buffers:
|
| 163 |
+
value: null
|
| 164 |
+
ddp_bucket_cap_mb:
|
| 165 |
+
value: null
|
| 166 |
+
ddp_find_unused_parameters:
|
| 167 |
+
value: null
|
| 168 |
+
ddp_timeout:
|
| 169 |
+
value: 180000000
|
| 170 |
+
debug:
|
| 171 |
+
value: []
|
| 172 |
+
deepspeed:
|
| 173 |
+
value: null
|
| 174 |
+
disable_tqdm:
|
| 175 |
+
value: false
|
| 176 |
+
do_eval:
|
| 177 |
+
value: false
|
| 178 |
+
do_predict:
|
| 179 |
+
value: false
|
| 180 |
+
do_train:
|
| 181 |
+
value: true
|
| 182 |
+
dtype:
|
| 183 |
+
value: bfloat16
|
| 184 |
+
enable_jit_checkpoint:
|
| 185 |
+
value: false
|
| 186 |
+
eos_token_id:
|
| 187 |
+
value: 151645
|
| 188 |
+
eval_accumulation_steps:
|
| 189 |
+
value: null
|
| 190 |
+
eval_delay:
|
| 191 |
+
value: 0
|
| 192 |
+
eval_do_concat_batches:
|
| 193 |
+
value: true
|
| 194 |
+
eval_on_start:
|
| 195 |
+
value: false
|
| 196 |
+
eval_steps:
|
| 197 |
+
value: null
|
| 198 |
+
eval_strategy:
|
| 199 |
+
value: "no"
|
| 200 |
+
eval_use_gather_object:
|
| 201 |
+
value: false
|
| 202 |
+
finetuning_args:
|
| 203 |
+
value:
|
| 204 |
+
additional_target: null
|
| 205 |
+
apollo_layerwise: false
|
| 206 |
+
apollo_proj: random
|
| 207 |
+
apollo_proj_type: std
|
| 208 |
+
apollo_rank: 16
|
| 209 |
+
apollo_scale: 32
|
| 210 |
+
apollo_scale_front: false
|
| 211 |
+
apollo_scale_type: channel
|
| 212 |
+
apollo_target:
|
| 213 |
+
- all
|
| 214 |
+
apollo_update_interval: 200
|
| 215 |
+
badam_mask_mode: adjacent
|
| 216 |
+
badam_mode: layer
|
| 217 |
+
badam_start_block: null
|
| 218 |
+
badam_switch_interval: 50
|
| 219 |
+
badam_switch_mode: ascending
|
| 220 |
+
badam_update_ratio: 0.05
|
| 221 |
+
badam_verbose: 0
|
| 222 |
+
compute_accuracy: false
|
| 223 |
+
create_new_adapter: false
|
| 224 |
+
disable_shuffling: false
|
| 225 |
+
dpo_label_smoothing: 0
|
| 226 |
+
eaft_alpha: 1
|
| 227 |
+
early_stopping_steps: null
|
| 228 |
+
finetuning_type: lora
|
| 229 |
+
freeze_extra_modules: null
|
| 230 |
+
freeze_language_model: false
|
| 231 |
+
freeze_multi_modal_projector: true
|
| 232 |
+
freeze_trainable_layers: 2
|
| 233 |
+
freeze_trainable_modules:
|
| 234 |
+
- all
|
| 235 |
+
freeze_vision_tower: true
|
| 236 |
+
galore_layerwise: false
|
| 237 |
+
galore_proj_type: std
|
| 238 |
+
galore_rank: 16
|
| 239 |
+
galore_scale: 2
|
| 240 |
+
galore_target:
|
| 241 |
+
- all
|
| 242 |
+
galore_update_interval: 200
|
| 243 |
+
include_effective_tokens_per_second: false
|
| 244 |
+
kto_chosen_weight: 1
|
| 245 |
+
kto_rejected_weight: 1
|
| 246 |
+
ld_alpha: null
|
| 247 |
+
lora_alpha: 32
|
| 248 |
+
lora_dropout: 0.03
|
| 249 |
+
lora_rank: 16
|
| 250 |
+
lora_target:
|
| 251 |
+
- all
|
| 252 |
+
loraplus_lr_embedding: 1e-06
|
| 253 |
+
loraplus_lr_ratio: null
|
| 254 |
+
module_dropout: 0
|
| 255 |
+
oft_block_size: 32
|
| 256 |
+
oft_rank: 0
|
| 257 |
+
oft_target:
|
| 258 |
+
- all
|
| 259 |
+
pissa_convert: false
|
| 260 |
+
pissa_init: false
|
| 261 |
+
pissa_iter: 16
|
| 262 |
+
plot_loss: true
|
| 263 |
+
ppo_buffer_size: 1
|
| 264 |
+
ppo_epochs: 4
|
| 265 |
+
ppo_score_norm: false
|
| 266 |
+
ppo_target: 6
|
| 267 |
+
ppo_whiten_rewards: false
|
| 268 |
+
pref_bco_weight: 0
|
| 269 |
+
pref_beta: 0.1
|
| 270 |
+
pref_ftx: 0
|
| 271 |
+
pref_loss: sigmoid
|
| 272 |
+
pure_bf16: false
|
| 273 |
+
ref_model: null
|
| 274 |
+
ref_model_adapters: null
|
| 275 |
+
ref_model_quantization_bit: null
|
| 276 |
+
reward_model: null
|
| 277 |
+
reward_model_adapters: null
|
| 278 |
+
reward_model_quantization_bit: null
|
| 279 |
+
reward_model_type: lora
|
| 280 |
+
simpo_gamma: 0.5
|
| 281 |
+
stage: pt
|
| 282 |
+
swanlab_api_key: <SWANLAB_API_KEY>
|
| 283 |
+
swanlab_lark_secret: null
|
| 284 |
+
swanlab_lark_webhook_url: null
|
| 285 |
+
swanlab_logdir: null
|
| 286 |
+
swanlab_mode: cloud
|
| 287 |
+
swanlab_project: llamafactory
|
| 288 |
+
swanlab_run_name: null
|
| 289 |
+
swanlab_workspace: null
|
| 290 |
+
use_adam_mini: false
|
| 291 |
+
use_apollo: false
|
| 292 |
+
use_badam: false
|
| 293 |
+
use_dft_loss: false
|
| 294 |
+
use_dora: false
|
| 295 |
+
use_eaft_loss: false
|
| 296 |
+
use_galore: false
|
| 297 |
+
use_llama_pro: false
|
| 298 |
+
use_mca: false
|
| 299 |
+
use_muon: false
|
| 300 |
+
use_rslora: false
|
| 301 |
+
use_swanlab: false
|
| 302 |
+
fp8:
|
| 303 |
+
value: false
|
| 304 |
+
fp8_backend:
|
| 305 |
+
value: auto
|
| 306 |
+
fp8_enable_fsdp_float8_all_gather:
|
| 307 |
+
value: false
|
| 308 |
+
fp16:
|
| 309 |
+
value: false
|
| 310 |
+
fp16_full_eval:
|
| 311 |
+
value: false
|
| 312 |
+
fsdp:
|
| 313 |
+
value: []
|
| 314 |
+
fsdp_config:
|
| 315 |
+
value:
|
| 316 |
+
min_num_params: 0
|
| 317 |
+
xla: false
|
| 318 |
+
xla_fsdp_grad_ckpt: false
|
| 319 |
+
xla_fsdp_v2: false
|
| 320 |
+
full_determinism:
|
| 321 |
+
value: false
|
| 322 |
+
generating_args:
|
| 323 |
+
value:
|
| 324 |
+
do_sample: true
|
| 325 |
+
length_penalty: 1
|
| 326 |
+
max_new_tokens: 1024
|
| 327 |
+
num_beams: 1
|
| 328 |
+
repetition_penalty: 1
|
| 329 |
+
skip_special_tokens: true
|
| 330 |
+
temperature: 0.95
|
| 331 |
+
top_k: 50
|
| 332 |
+
top_p: 0.7
|
| 333 |
+
generation_config:
|
| 334 |
+
value: null
|
| 335 |
+
generation_max_length:
|
| 336 |
+
value: 2047
|
| 337 |
+
generation_num_beams:
|
| 338 |
+
value: null
|
| 339 |
+
gradient_accumulation_steps:
|
| 340 |
+
value: 1
|
| 341 |
+
gradient_checkpointing:
|
| 342 |
+
value: false
|
| 343 |
+
gradient_checkpointing_kwargs:
|
| 344 |
+
value: null
|
| 345 |
+
greater_is_better:
|
| 346 |
+
value: null
|
| 347 |
+
group_by_length:
|
| 348 |
+
value: false
|
| 349 |
+
head_dim:
|
| 350 |
+
value: 128
|
| 351 |
+
hidden_act:
|
| 352 |
+
value: silu
|
| 353 |
+
hidden_size:
|
| 354 |
+
value: 4096
|
| 355 |
+
hub_always_push:
|
| 356 |
+
value: false
|
| 357 |
+
hub_model_id:
|
| 358 |
+
value: null
|
| 359 |
+
hub_private_repo:
|
| 360 |
+
value: null
|
| 361 |
+
hub_revision:
|
| 362 |
+
value: null
|
| 363 |
+
hub_strategy:
|
| 364 |
+
value: every_save
|
| 365 |
+
hub_token:
|
| 366 |
+
value: <HUB_TOKEN>
|
| 367 |
+
id2label:
|
| 368 |
+
value:
|
| 369 |
+
"0": LABEL_0
|
| 370 |
+
"1": LABEL_1
|
| 371 |
+
ignore_data_skip:
|
| 372 |
+
value: false
|
| 373 |
+
include_for_metrics:
|
| 374 |
+
value: []
|
| 375 |
+
include_num_input_tokens_seen:
|
| 376 |
+
value: all
|
| 377 |
+
initializer_range:
|
| 378 |
+
value: 0.02
|
| 379 |
+
intermediate_size:
|
| 380 |
+
value: 12288
|
| 381 |
+
is_encoder_decoder:
|
| 382 |
+
value: false
|
| 383 |
+
label_names:
|
| 384 |
+
value:
|
| 385 |
+
- labels
|
| 386 |
+
label_smoothing_factor:
|
| 387 |
+
value: 0
|
| 388 |
+
label2id:
|
| 389 |
+
value:
|
| 390 |
+
LABEL_0: 0
|
| 391 |
+
LABEL_1: 1
|
| 392 |
+
layer_types:
|
| 393 |
+
value:
|
| 394 |
+
- full_attention
|
| 395 |
+
- full_attention
|
| 396 |
+
- full_attention
|
| 397 |
+
- full_attention
|
| 398 |
+
- full_attention
|
| 399 |
+
- full_attention
|
| 400 |
+
- full_attention
|
| 401 |
+
- full_attention
|
| 402 |
+
- full_attention
|
| 403 |
+
- full_attention
|
| 404 |
+
- full_attention
|
| 405 |
+
- full_attention
|
| 406 |
+
- full_attention
|
| 407 |
+
- full_attention
|
| 408 |
+
- full_attention
|
| 409 |
+
- full_attention
|
| 410 |
+
- full_attention
|
| 411 |
+
- full_attention
|
| 412 |
+
- full_attention
|
| 413 |
+
- full_attention
|
| 414 |
+
- full_attention
|
| 415 |
+
- full_attention
|
| 416 |
+
- full_attention
|
| 417 |
+
- full_attention
|
| 418 |
+
- full_attention
|
| 419 |
+
- full_attention
|
| 420 |
+
- full_attention
|
| 421 |
+
- full_attention
|
| 422 |
+
- full_attention
|
| 423 |
+
- full_attention
|
| 424 |
+
- full_attention
|
| 425 |
+
- full_attention
|
| 426 |
+
- full_attention
|
| 427 |
+
- full_attention
|
| 428 |
+
- full_attention
|
| 429 |
+
- full_attention
|
| 430 |
+
learning_rate:
|
| 431 |
+
value: 5e-05
|
| 432 |
+
length_column_name:
|
| 433 |
+
value: length
|
| 434 |
+
liger_kernel_config:
|
| 435 |
+
value: null
|
| 436 |
+
load_best_model_at_end:
|
| 437 |
+
value: false
|
| 438 |
+
local_rank:
|
| 439 |
+
value: -1
|
| 440 |
+
log_level:
|
| 441 |
+
value: passive
|
| 442 |
+
log_level_replica:
|
| 443 |
+
value: warning
|
| 444 |
+
log_on_each_node:
|
| 445 |
+
value: true
|
| 446 |
+
logging_dir:
|
| 447 |
+
value: null
|
| 448 |
+
logging_first_step:
|
| 449 |
+
value: false
|
| 450 |
+
logging_nan_inf_filter:
|
| 451 |
+
value: true
|
| 452 |
+
logging_steps:
|
| 453 |
+
value: 1
|
| 454 |
+
logging_strategy:
|
| 455 |
+
value: steps
|
| 456 |
+
lr_scheduler_kwargs:
|
| 457 |
+
value: null
|
| 458 |
+
lr_scheduler_type:
|
| 459 |
+
value: cosine
|
| 460 |
+
master_addr:
|
| 461 |
+
value: null
|
| 462 |
+
master_port:
|
| 463 |
+
value: null
|
| 464 |
+
max_grad_norm:
|
| 465 |
+
value: 1
|
| 466 |
+
max_position_embeddings:
|
| 467 |
+
value: 32768
|
| 468 |
+
max_steps:
|
| 469 |
+
value: -1
|
| 470 |
+
max_window_layers:
|
| 471 |
+
value: 36
|
| 472 |
+
metric_for_best_model:
|
| 473 |
+
value: null
|
| 474 |
+
model/num_parameters:
|
| 475 |
+
value: 8234382336
|
| 476 |
+
model_args:
|
| 477 |
+
value:
|
| 478 |
+
adapter_folder: null
|
| 479 |
+
adapter_name_or_path: null
|
| 480 |
+
add_special_tokens: null
|
| 481 |
+
add_tokens: null
|
| 482 |
+
audio_sampling_rate: 16000
|
| 483 |
+
block_diag_attn: false
|
| 484 |
+
cache_dir: null
|
| 485 |
+
chunk_size: 8192
|
| 486 |
+
compute_dtype: torch.bfloat16
|
| 487 |
+
cpu_infer: 32
|
| 488 |
+
crop_to_patches: false
|
| 489 |
+
device_map:
|
| 490 |
+
"": cuda:0
|
| 491 |
+
disable_gradient_checkpointing: false
|
| 492 |
+
double_quantization: true
|
| 493 |
+
enable_liger_kernel: false
|
| 494 |
+
export_device: cpu
|
| 495 |
+
export_dir: null
|
| 496 |
+
export_hub_model_id: null
|
| 497 |
+
export_legacy_format: false
|
| 498 |
+
export_quantization_bit: null
|
| 499 |
+
export_quantization_dataset: null
|
| 500 |
+
export_quantization_maxlen: 1024
|
| 501 |
+
export_quantization_nsamples: 128
|
| 502 |
+
export_size: 5
|
| 503 |
+
flash_attn: auto
|
| 504 |
+
hf_hub_token: <HF_HUB_TOKEN>
|
| 505 |
+
image_do_pan_and_scan: false
|
| 506 |
+
image_max_pixels: 589824
|
| 507 |
+
image_min_pixels: 1024
|
| 508 |
+
infer_backend: HF
|
| 509 |
+
infer_dtype: auto
|
| 510 |
+
init_special_tokens: noise_init
|
| 511 |
+
kt_force_think: false
|
| 512 |
+
kt_maxlen: 4096
|
| 513 |
+
kt_mode: normal
|
| 514 |
+
kt_optimize_rule: null
|
| 515 |
+
kt_use_cuda_graph: true
|
| 516 |
+
low_cpu_mem_usage: true
|
| 517 |
+
mixture_of_depths: null
|
| 518 |
+
mode: normal
|
| 519 |
+
model_max_length: 2047
|
| 520 |
+
model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
|
| 521 |
+
model_revision: main
|
| 522 |
+
moe_aux_loss_coef: null
|
| 523 |
+
ms_hub_token: <MS_HUB_TOKEN>
|
| 524 |
+
new_special_tokens_config: null
|
| 525 |
+
offload_folder: offload
|
| 526 |
+
om_hub_token: <OM_HUB_TOKEN>
|
| 527 |
+
print_param_status: false
|
| 528 |
+
quantization_bit: null
|
| 529 |
+
quantization_device_map: null
|
| 530 |
+
quantization_method: BNB
|
| 531 |
+
quantization_type: nf4
|
| 532 |
+
resize_vocab: false
|
| 533 |
+
rope_scaling: null
|
| 534 |
+
sglang_config: null
|
| 535 |
+
sglang_lora_backend: triton
|
| 536 |
+
sglang_maxlen: 4096
|
| 537 |
+
sglang_mem_fraction: 0.7
|
| 538 |
+
sglang_tp_size: -1
|
| 539 |
+
shift_attn: false
|
| 540 |
+
split_special_tokens: false
|
| 541 |
+
train_from_scratch: false
|
| 542 |
+
trust_remote_code: true
|
| 543 |
+
upcast_layernorm: false
|
| 544 |
+
upcast_lmhead_output: false
|
| 545 |
+
use_audio_in_video: false
|
| 546 |
+
use_fast_tokenizer: true
|
| 547 |
+
use_kt: false
|
| 548 |
+
use_kv_cache: true
|
| 549 |
+
use_reentrant_gc: true
|
| 550 |
+
use_unsloth: false
|
| 551 |
+
use_unsloth_gc: false
|
| 552 |
+
use_v1_kernels: false
|
| 553 |
+
video_fps: 2
|
| 554 |
+
video_max_pixels: 65536
|
| 555 |
+
video_maxlen: 128
|
| 556 |
+
video_min_pixels: 256
|
| 557 |
+
vllm_config: null
|
| 558 |
+
vllm_enforce_eager: false
|
| 559 |
+
vllm_gpu_util: 0.7
|
| 560 |
+
vllm_max_lora_rank: 32
|
| 561 |
+
vllm_maxlen: 4096
|
| 562 |
+
model_type:
|
| 563 |
+
value: qwen3
|
| 564 |
+
neftune_noise_alpha:
|
| 565 |
+
value: null
|
| 566 |
+
num_attention_heads:
|
| 567 |
+
value: 32
|
| 568 |
+
num_hidden_layers:
|
| 569 |
+
value: 36
|
| 570 |
+
num_key_value_heads:
|
| 571 |
+
value: 8
|
| 572 |
+
num_train_epochs:
|
| 573 |
+
value: 5
|
| 574 |
+
optim:
|
| 575 |
+
value: adamw_torch
|
| 576 |
+
optim_args:
|
| 577 |
+
value: null
|
| 578 |
+
optim_target_modules:
|
| 579 |
+
value: null
|
| 580 |
+
output_attentions:
|
| 581 |
+
value: false
|
| 582 |
+
output_dir:
|
| 583 |
+
value: /workspace/v127rc_exp1/C
|
| 584 |
+
output_hidden_states:
|
| 585 |
+
value: false
|
| 586 |
+
overwrite_output_dir:
|
| 587 |
+
value: false
|
| 588 |
+
pad_token_id:
|
| 589 |
+
value: 151643
|
| 590 |
+
parallelism_config:
|
| 591 |
+
value: null
|
| 592 |
+
peft_config:
|
| 593 |
+
value:
|
| 594 |
+
default:
|
| 595 |
+
alora_invocation_tokens: null
|
| 596 |
+
arrow_config: null
|
| 597 |
+
auto_mapping: null
|
| 598 |
+
base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
|
| 599 |
+
bias: none
|
| 600 |
+
corda_config: null
|
| 601 |
+
ensure_weight_tying: false
|
| 602 |
+
eva_config: null
|
| 603 |
+
exclude_modules: null
|
| 604 |
+
fan_in_fan_out: false
|
| 605 |
+
inference_mode: false
|
| 606 |
+
init_lora_weights: true
|
| 607 |
+
layer_replication: null
|
| 608 |
+
layers_pattern: null
|
| 609 |
+
layers_to_transform: null
|
| 610 |
+
lora_alpha: 32
|
| 611 |
+
lora_bias: false
|
| 612 |
+
lora_dropout: 0.03
|
| 613 |
+
megatron_config: null
|
| 614 |
+
megatron_core: megatron.core
|
| 615 |
+
modules_to_save: null
|
| 616 |
+
peft_type: LORA
|
| 617 |
+
peft_version: 0.18.1
|
| 618 |
+
qalora_group_size: 16
|
| 619 |
+
r: 16
|
| 620 |
+
revision: null
|
| 621 |
+
runtime_config:
|
| 622 |
+
ephemeral_gpu_offload: false
|
| 623 |
+
target_modules:
|
| 624 |
+
- o_proj
|
| 625 |
+
- down_proj
|
| 626 |
+
- gate_proj
|
| 627 |
+
- v_proj
|
| 628 |
+
- k_proj
|
| 629 |
+
- q_proj
|
| 630 |
+
- up_proj
|
| 631 |
+
target_parameters: null
|
| 632 |
+
task_type: CAUSAL_LM
|
| 633 |
+
trainable_token_indices: null
|
| 634 |
+
use_dora: false
|
| 635 |
+
use_qalora: false
|
| 636 |
+
use_rslora: false
|
| 637 |
+
per_device_eval_batch_size:
|
| 638 |
+
value: 8
|
| 639 |
+
per_device_train_batch_size:
|
| 640 |
+
value: 1
|
| 641 |
+
predict_with_generate:
|
| 642 |
+
value: false
|
| 643 |
+
prediction_loss_only:
|
| 644 |
+
value: false
|
| 645 |
+
problem_type:
|
| 646 |
+
value: null
|
| 647 |
+
project:
|
| 648 |
+
value: huggingface
|
| 649 |
+
push_to_hub:
|
| 650 |
+
value: false
|
| 651 |
+
ray_init_kwargs:
|
| 652 |
+
value: null
|
| 653 |
+
ray_num_workers:
|
| 654 |
+
value: 1
|
| 655 |
+
remove_unused_columns:
|
| 656 |
+
value: false
|
| 657 |
+
report_to:
|
| 658 |
+
value:
|
| 659 |
+
- wandb
|
| 660 |
+
restore_callback_states_from_checkpoint:
|
| 661 |
+
value: false
|
| 662 |
+
resume_from_checkpoint:
|
| 663 |
+
value: null
|
| 664 |
+
return_dict:
|
| 665 |
+
value: true
|
| 666 |
+
rms_norm_eps:
|
| 667 |
+
value: 1e-06
|
| 668 |
+
rope_parameters:
|
| 669 |
+
value:
|
| 670 |
+
rope_theta: 1000000
|
| 671 |
+
rope_type: default
|
| 672 |
+
run_name:
|
| 673 |
+
value: null
|
| 674 |
+
save_on_each_node:
|
| 675 |
+
value: false
|
| 676 |
+
save_only_model:
|
| 677 |
+
value: true
|
| 678 |
+
save_steps:
|
| 679 |
+
value: 266
|
| 680 |
+
save_strategy:
|
| 681 |
+
value: steps
|
| 682 |
+
save_total_limit:
|
| 683 |
+
value: null
|
| 684 |
+
seed:
|
| 685 |
+
value: 42
|
| 686 |
+
skip_memory_metrics:
|
| 687 |
+
value: true
|
| 688 |
+
sliding_window:
|
| 689 |
+
value: null
|
| 690 |
+
sortish_sampler:
|
| 691 |
+
value: false
|
| 692 |
+
tf32:
|
| 693 |
+
value: null
|
| 694 |
+
tie_word_embeddings:
|
| 695 |
+
value: false
|
| 696 |
+
torch_compile:
|
| 697 |
+
value: false
|
| 698 |
+
torch_compile_backend:
|
| 699 |
+
value: null
|
| 700 |
+
torch_compile_mode:
|
| 701 |
+
value: null
|
| 702 |
+
torch_empty_cache_steps:
|
| 703 |
+
value: null
|
| 704 |
+
trackio_space_id:
|
| 705 |
+
value: trackio
|
| 706 |
+
transformers_version:
|
| 707 |
+
value: 5.0.0
|
| 708 |
+
use_cache:
|
| 709 |
+
value: false
|
| 710 |
+
use_cpu:
|
| 711 |
+
value: false
|
| 712 |
+
use_liger_kernel:
|
| 713 |
+
value: false
|
| 714 |
+
use_sliding_window:
|
| 715 |
+
value: false
|
| 716 |
+
vocab_size:
|
| 717 |
+
value: 151936
|
| 718 |
+
warmup_ratio:
|
| 719 |
+
value: 0.02
|
| 720 |
+
warmup_steps:
|
| 721 |
+
value: 0.02
|
| 722 |
+
weight_decay:
|
| 723 |
+
value: 0
|
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/output.log
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0%| | 0/18595 [00:00<?, ?it/s]/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
|
| 2 |
+
with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]
|
| 3 |
+
|
| 4 |
+
{'loss': '1.682', 'grad_norm': '0.2716', 'learning_rate': '0', 'epoch': '0.0002689', 'num_input_tokens_seen': 2047, 'train_runtime': '2.905', 'train_tokens_per_second': '704.7'}
|
| 5 |
+
{'loss': '1.8', 'grad_norm': '0.2904', 'learning_rate': '1.344e-07', 'epoch': '0.0005378', 'num_input_tokens_seen': 4094, 'train_runtime': '3.914', 'train_tokens_per_second': '1046'}
|
| 6 |
+
{'loss': '1.751', 'grad_norm': '0.2786', 'learning_rate': '2.688e-07', 'epoch': '0.0008067', 'num_input_tokens_seen': 6141, 'train_runtime': '4.925', 'train_tokens_per_second': '1247'}
|
| 7 |
+
{'loss': '1.725', 'grad_norm': '0.2775', 'learning_rate': '4.032e-07', 'epoch': '0.001076', 'num_input_tokens_seen': 8188, 'train_runtime': '5.934', 'train_tokens_per_second': '1380'}
|
| 8 |
+
{'loss': '1.857', 'grad_norm': '0.282', 'learning_rate': '5.376e-07', 'epoch': '0.001344', 'num_input_tokens_seen': 10235, 'train_runtime': '6.944', 'train_tokens_per_second': '1474'}
|
| 9 |
+
{'loss': '1.865', 'grad_norm': '0.2441', 'learning_rate': '6.72e-07', 'epoch': '0.001613', 'num_input_tokens_seen': 12282, 'train_runtime': '7.952', 'train_tokens_per_second': '1545'}
|
| 10 |
+
{'loss': '1.791', 'grad_norm': '0.2674', 'learning_rate': '8.065e-07', 'epoch': '0.001882', 'num_input_tokens_seen': 14329, 'train_runtime': '8.964', 'train_tokens_per_second': '1599'}
|
| 11 |
+
{'loss': '1.834', 'grad_norm': '0.2586', 'learning_rate': '9.409e-07', 'epoch': '0.002151', 'num_input_tokens_seen': 16376, 'train_runtime': '9.974', 'train_tokens_per_second': '1642'}
|
| 12 |
+
{'loss': '1.92', 'grad_norm': '0.2805', 'learning_rate': '1.075e-06', 'epoch': '0.00242', 'num_input_tokens_seen': 18423, 'train_runtime': '10.98', 'train_tokens_per_second': '1677'}
|
| 13 |
+
{'loss': '1.945', 'grad_norm': '0.2809', 'learning_rate': '1.21e-06', 'epoch': '0.002689', 'num_input_tokens_seen': 20470, 'train_runtime': '11.99', 'train_tokens_per_second': '1707'}
|
| 14 |
+
{'loss': '1.955', 'grad_norm': '0.2961', 'learning_rate': '1.344e-06', 'epoch': '0.002958', 'num_input_tokens_seen': 22517, 'train_runtime': '13.01', 'train_tokens_per_second': '1731'}
|
| 15 |
+
{'loss': '1.811', 'grad_norm': '0.2714', 'learning_rate': '1.478e-06', 'epoch': '0.003227', 'num_input_tokens_seen': 24564, 'train_runtime': '14.02', 'train_tokens_per_second': '1753'}
|
| 16 |
+
{'loss': '1.631', 'grad_norm': '0.2661', 'learning_rate': '1.613e-06', 'epoch': '0.003496', 'num_input_tokens_seen': 26611, 'train_runtime': '15.03', 'train_tokens_per_second': '1771'}
|
| 17 |
+
{'loss': '1.769', 'grad_norm': '0.268', 'learning_rate': '1.747e-06', 'epoch': '0.003764', 'num_input_tokens_seen': 28658, 'train_runtime': '16.04', 'train_tokens_per_second': '1787'}
|
| 18 |
+
{'loss': '1.611', 'grad_norm': '0.2518', 'learning_rate': '1.882e-06', 'epoch': '0.004033', 'num_input_tokens_seen': 30705, 'train_runtime': '17.05', 'train_tokens_per_second': '1801'}
|
| 19 |
+
{'loss': '1.624', 'grad_norm': '0.2597', 'learning_rate': '2.016e-06', 'epoch': '0.004302', 'num_input_tokens_seen': 32752, 'train_runtime': '18.06', 'train_tokens_per_second': '1814'}
|
| 20 |
+
{'loss': '1.854', 'grad_norm': '0.2804', 'learning_rate': '2.151e-06', 'epoch': '0.004571', 'num_input_tokens_seen': 34799, 'train_runtime': '19.07', 'train_tokens_per_second': '1825'}
|
| 21 |
+
{'loss': '1.849', 'grad_norm': '0.521', 'learning_rate': '2.285e-06', 'epoch': '0.00484', 'num_input_tokens_seen': 36846, 'train_runtime': '20.08', 'train_tokens_per_second': '1835'}
|
| 22 |
+
{'loss': '1.825', 'grad_norm': '0.2669', 'learning_rate': '2.419e-06', 'epoch': '0.005109', 'num_input_tokens_seen': 38893, 'train_runtime': '21.1', 'train_tokens_per_second': '1843'}
|
| 23 |
+
{'loss': '1.534', 'grad_norm': '0.2729', 'learning_rate': '2.554e-06', 'epoch': '0.005378', 'num_input_tokens_seen': 40940, 'train_runtime': '22.11', 'train_tokens_per_second': '1852'}
|
| 24 |
+
{'loss': '1.67', 'grad_norm': '0.2686', 'learning_rate': '2.688e-06', 'epoch': '0.005647', 'num_input_tokens_seen': 42987, 'train_runtime': '23.13', 'train_tokens_per_second': '1859'}
|
| 25 |
+
{'loss': '1.549', 'grad_norm': '0.2592', 'learning_rate': '2.823e-06', 'epoch': '0.005916', 'num_input_tokens_seen': 45034, 'train_runtime': '24.14', 'train_tokens_per_second': '1866'}
|
| 26 |
+
{'loss': '1.868', 'grad_norm': '0.2874', 'learning_rate': '2.957e-06', 'epoch': '0.006184', 'num_input_tokens_seen': 47081, 'train_runtime': '25.15', 'train_tokens_per_second': '1872'}
|
| 27 |
+
{'loss': '1.767', 'grad_norm': '0.2763', 'learning_rate': '3.091e-06', 'epoch': '0.006453', 'num_input_tokens_seen': 49128, 'train_runtime': '26.16', 'train_tokens_per_second': '1878'}
|
| 28 |
+
{'loss': '1.936', 'grad_norm': '0.2961', 'learning_rate': '3.226e-06', 'epoch': '0.006722', 'num_input_tokens_seen': 51175, 'train_runtime': '27.18', 'train_tokens_per_second': '1883'}
|
| 29 |
+
{'loss': '1.625', 'grad_norm': '0.2881', 'learning_rate': '3.36e-06', 'epoch': '0.006991', 'num_input_tokens_seen': 53222, 'train_runtime': '28.19', 'train_tokens_per_second': '1888'}
|
| 30 |
+
{'loss': '1.795', 'grad_norm': '0.3211', 'learning_rate': '3.495e-06', 'epoch': '0.00726', 'num_input_tokens_seen': 55269, 'train_runtime': '29.2', 'train_tokens_per_second': '1893'}
|
| 31 |
+
{'loss': '1.725', 'grad_norm': '0.2936', 'learning_rate': '3.629e-06', 'epoch': '0.007529', 'num_input_tokens_seen': 57316, 'train_runtime': '30.22', 'train_tokens_per_second': '1897'}
|
| 32 |
+
{'loss': '1.871', 'grad_norm': '0.2756', 'learning_rate': '3.763e-06', 'epoch': '0.007798', 'num_input_tokens_seen': 59363, 'train_runtime': '31.23', 'train_tokens_per_second': '1901'}
|
| 33 |
+
{'loss': '1.84', 'grad_norm': '0.2772', 'learning_rate': '3.898e-06', 'epoch': '0.008067', 'num_input_tokens_seen': 61410, 'train_runtime': '32.24', 'train_tokens_per_second': '1905'}
|
| 34 |
+
{'loss': '1.908', 'grad_norm': '0.3025', 'learning_rate': '4.032e-06', 'epoch': '0.008336', 'num_input_tokens_seen': 63457, 'train_runtime': '33.26', 'train_tokens_per_second': '1908'}
|
| 35 |
+
{'loss': '1.725', 'grad_norm': '0.2884', 'learning_rate': '4.167e-06', 'epoch': '0.008604', 'num_input_tokens_seen': 65504, 'train_runtime': '34.27', 'train_tokens_per_second': '1911'}
|
| 36 |
+
{'loss': '1.747', 'grad_norm': '0.3165', 'learning_rate': '4.301e-06', 'epoch': '0.008873', 'num_input_tokens_seen': 67551, 'train_runtime': '35.28', 'train_tokens_per_second': '1915'}
|
| 37 |
+
{'loss': '1.909', 'grad_norm': '0.2975', 'learning_rate': '4.435e-06', 'epoch': '0.009142', 'num_input_tokens_seen': 69598, 'train_runtime': '36.3', 'train_tokens_per_second': '1917'}
|
| 38 |
+
{'loss': '1.64', 'grad_norm': '0.2753', 'learning_rate': '4.57e-06', 'epoch': '0.009411', 'num_input_tokens_seen': 71645, 'train_runtime': '37.31', 'train_tokens_per_second': '1920'}
|
| 39 |
+
{'loss': '1.781', 'grad_norm': '0.2986', 'learning_rate': '4.704e-06', 'epoch': '0.00968', 'num_input_tokens_seen': 73692, 'train_runtime': '38.33', 'train_tokens_per_second': '1923'}
|
| 40 |
+
{'loss': '1.831', 'grad_norm': '0.3018', 'learning_rate': '4.839e-06', 'epoch': '0.009949', 'num_input_tokens_seen': 75739, 'train_runtime': '39.38', 'train_tokens_per_second': '1923'}
|
| 41 |
+
{'loss': '1.859', 'grad_norm': '0.2658', 'learning_rate': '4.973e-06', 'epoch': '0.01022', 'num_input_tokens_seen': 77786, 'train_runtime': '40.4', 'train_tokens_per_second': '1925'}
|
| 42 |
+
{'loss': '1.964', 'grad_norm': '0.297', 'learning_rate': '5.108e-06', 'epoch': '0.01049', 'num_input_tokens_seen': 79833, 'train_runtime': '41.41', 'train_tokens_per_second': '1928'}
|
| 43 |
+
{'loss': '1.935', 'grad_norm': '0.3385', 'learning_rate': '5.242e-06', 'epoch': '0.01076', 'num_input_tokens_seen': 81880, 'train_runtime': '42.43', 'train_tokens_per_second': '1930'}
|
| 44 |
+
{'loss': '1.726', 'grad_norm': '0.3095', 'learning_rate': '5.376e-06', 'epoch': '0.01102', 'num_input_tokens_seen': 83927, 'train_runtime': '43.44', 'train_tokens_per_second': '1932'}
|
| 45 |
+
{'loss': '1.533', 'grad_norm': '0.2799', 'learning_rate': '5.511e-06', 'epoch': '0.01129', 'num_input_tokens_seen': 85974, 'train_runtime': '44.45', 'train_tokens_per_second': '1934'}
|
| 46 |
+
{'loss': '1.762', 'grad_norm': '0.2744', 'learning_rate': '5.645e-06', 'epoch': '0.01156', 'num_input_tokens_seen': 88021, 'train_runtime': '45.47', 'train_tokens_per_second': '1936'}
|
| 47 |
+
{'loss': '1.697', 'grad_norm': '0.2797', 'learning_rate': '5.78e-06', 'epoch': '0.01183', 'num_input_tokens_seen': 90068, 'train_runtime': '46.48', 'train_tokens_per_second': '1938'}
|
| 48 |
+
{'loss': '1.725', 'grad_norm': '0.2793', 'learning_rate': '5.914e-06', 'epoch': '0.0121', 'num_input_tokens_seen': 92115, 'train_runtime': '47.5', 'train_tokens_per_second': '1939'}
|
| 49 |
+
{'loss': '1.981', 'grad_norm': '0.3054', 'learning_rate': '6.048e-06', 'epoch': '0.01237', 'num_input_tokens_seen': 94162, 'train_runtime': '48.51', 'train_tokens_per_second': '1941'}
|
| 50 |
+
{'loss': '1.591', 'grad_norm': '0.2925', 'learning_rate': '6.183e-06', 'epoch': '0.01264', 'num_input_tokens_seen': 96209, 'train_runtime': '49.53', 'train_tokens_per_second': '1943'}
|
| 51 |
+
{'loss': '1.777', 'grad_norm': '0.339', 'learning_rate': '6.317e-06', 'epoch': '0.01291', 'num_input_tokens_seen': 98256, 'train_runtime': '50.54', 'train_tokens_per_second': '1944'}
|
| 52 |
+
{'loss': '1.856', 'grad_norm': '0.2972', 'learning_rate': '6.452e-06', 'epoch': '0.01318', 'num_input_tokens_seen': 100303, 'train_runtime': '51.55', 'train_tokens_per_second': '1946'}
|
| 53 |
+
{'loss': '1.637', 'grad_norm': '0.3191', 'learning_rate': '6.586e-06', 'epoch': '0.01344', 'num_input_tokens_seen': 102350, 'train_runtime': '52.57', 'train_tokens_per_second': '1947'}
|
| 54 |
+
{'loss': '1.885', 'grad_norm': '0.3083', 'learning_rate': '6.72e-06', 'epoch': '0.01371', 'num_input_tokens_seen': 104397, 'train_runtime': '53.58', 'train_tokens_per_second': '1948'}
|
| 55 |
+
{'loss': '1.777', 'grad_norm': '0.3115', 'learning_rate': '6.855e-06', 'epoch': '0.01398', 'num_input_tokens_seen': 106444, 'train_runtime': '54.59', 'train_tokens_per_second': '1950'}
|
| 56 |
+
{'loss': '1.848', 'grad_norm': '0.3558', 'learning_rate': '6.989e-06', 'epoch': '0.01425', 'num_input_tokens_seen': 108491, 'train_runtime': '55.61', 'train_tokens_per_second': '1951'}
|
| 57 |
+
{'loss': '1.613', 'grad_norm': '0.3172', 'learning_rate': '7.124e-06', 'epoch': '0.01452', 'num_input_tokens_seen': 110538, 'train_runtime': '56.63', 'train_tokens_per_second': '1952'}
|
| 58 |
+
{'loss': '1.642', 'grad_norm': '0.2996', 'learning_rate': '7.258e-06', 'epoch': '0.01479', 'num_input_tokens_seen': 112585, 'train_runtime': '57.64', 'train_tokens_per_second': '1953'}
|
| 59 |
+
{'loss': '1.979', 'grad_norm': '0.331', 'learning_rate': '7.392e-06', 'epoch': '0.01506', 'num_input_tokens_seen': 114632, 'train_runtime': '58.66', 'train_tokens_per_second': '1954'}
|
| 60 |
+
{'loss': '1.473', 'grad_norm': '0.305', 'learning_rate': '7.527e-06', 'epoch': '0.01533', 'num_input_tokens_seen': 116679, 'train_runtime': '59.67', 'train_tokens_per_second': '1955'}
|
| 61 |
+
{'loss': '1.56', 'grad_norm': '0.2983', 'learning_rate': '7.661e-06', 'epoch': '0.0156', 'num_input_tokens_seen': 118726, 'train_runtime': '60.69', 'train_tokens_per_second': '1956'}
|
| 62 |
+
{'loss': '1.792', 'grad_norm': '0.3465', 'learning_rate': '7.796e-06', 'epoch': '0.01586', 'num_input_tokens_seen': 120773, 'train_runtime': '61.71', 'train_tokens_per_second': '1957'}
|
| 63 |
+
{'loss': '1.589', 'grad_norm': '0.3406', 'learning_rate': '7.93e-06', 'epoch': '0.01613', 'num_input_tokens_seen': 122820, 'train_runtime': '62.73', 'train_tokens_per_second': '1958'}
|
| 64 |
+
{'loss': '1.715', 'grad_norm': '0.3038', 'learning_rate': '8.065e-06', 'epoch': '0.0164', 'num_input_tokens_seen': 124867, 'train_runtime': '63.74', 'train_tokens_per_second': '1959'}
|
| 65 |
+
{'loss': '1.703', 'grad_norm': '0.3439', 'learning_rate': '8.199e-06', 'epoch': '0.01667', 'num_input_tokens_seen': 126914, 'train_runtime': '64.76', 'train_tokens_per_second': '1960'}
|
| 66 |
+
{'loss': '1.909', 'grad_norm': '0.363', 'learning_rate': '8.333e-06', 'epoch': '0.01694', 'num_input_tokens_seen': 128961, 'train_runtime': '65.77', 'train_tokens_per_second': '1961'}
|
| 67 |
+
{'loss': '1.798', 'grad_norm': '0.3657', 'learning_rate': '8.468e-06', 'epoch': '0.01721', 'num_input_tokens_seen': 131008, 'train_runtime': '66.79', 'train_tokens_per_second': '1961'}
|
| 68 |
+
{'loss': '1.853', 'grad_norm': '0.3834', 'learning_rate': '8.602e-06', 'epoch': '0.01748', 'num_input_tokens_seen': 133055, 'train_runtime': '67.81', 'train_tokens_per_second': '1962'}
|
| 69 |
+
{'loss': '1.806', 'grad_norm': '0.7619', 'learning_rate': '8.737e-06', 'epoch': '0.01775', 'num_input_tokens_seen': 135102, 'train_runtime': '68.83', 'train_tokens_per_second': '1963'}
|
| 70 |
+
{'loss': '1.435', 'grad_norm': '0.3309', 'learning_rate': '8.871e-06', 'epoch': '0.01802', 'num_input_tokens_seen': 137149, 'train_runtime': '69.84', 'train_tokens_per_second': '1964'}
|
| 71 |
+
{'loss': '1.746', 'grad_norm': '0.3073', 'learning_rate': '9.005e-06', 'epoch': '0.01828', 'num_input_tokens_seen': 139196, 'train_runtime': '70.86', 'train_tokens_per_second': '1965'}
|
| 72 |
+
{'loss': '1.822', 'grad_norm': '0.354', 'learning_rate': '9.14e-06', 'epoch': '0.01855', 'num_input_tokens_seen': 141243, 'train_runtime': '71.87', 'train_tokens_per_second': '1965'}
|
| 73 |
+
{'loss': '1.661', 'grad_norm': '0.3499', 'learning_rate': '9.274e-06', 'epoch': '0.01882', 'num_input_tokens_seen': 143290, 'train_runtime': '72.89', 'train_tokens_per_second': '1966'}
|
| 74 |
+
{'loss': '1.913', 'grad_norm': '0.3419', 'learning_rate': '9.409e-06', 'epoch': '0.01909', 'num_input_tokens_seen': 145337, 'train_runtime': '73.9', 'train_tokens_per_second': '1967'}
|
| 75 |
+
{'loss': '1.815', 'grad_norm': '0.4037', 'learning_rate': '9.543e-06', 'epoch': '0.01936', 'num_input_tokens_seen': 147384, 'train_runtime': '74.93', 'train_tokens_per_second': '1967'}
|
| 76 |
+
{'loss': '1.798', 'grad_norm': '0.3734', 'learning_rate': '9.677e-06', 'epoch': '0.01963', 'num_input_tokens_seen': 149431, 'train_runtime': '75.94', 'train_tokens_per_second': '1968'}
|
| 77 |
+
{'loss': '1.703', 'grad_norm': '0.3758', 'learning_rate': '9.812e-06', 'epoch': '0.0199', 'num_input_tokens_seen': 151478, 'train_runtime': '76.96', 'train_tokens_per_second': '1968'}
|
| 78 |
+
{'loss': '1.579', 'grad_norm': '0.3325', 'learning_rate': '9.946e-06', 'epoch': '0.02017', 'num_input_tokens_seen': 153525, 'train_runtime': '77.98', 'train_tokens_per_second': '1969'}
|
| 79 |
+
{'loss': '1.712', 'grad_norm': '0.3724', 'learning_rate': '1.008e-05', 'epoch': '0.02044', 'num_input_tokens_seen': 155572, 'train_runtime': '78.99', 'train_tokens_per_second': '1969'}
|
| 80 |
+
{'loss': '1.761', 'grad_norm': '0.3466', 'learning_rate': '1.022e-05', 'epoch': '0.0207', 'num_input_tokens_seen': 157619, 'train_runtime': '80.01', 'train_tokens_per_second': '1970'}
|
| 81 |
+
{'loss': '1.85', 'grad_norm': '0.3739', 'learning_rate': '1.035e-05', 'epoch': '0.02097', 'num_input_tokens_seen': 159666, 'train_runtime': '81.03', 'train_tokens_per_second': '1971'}
|
| 82 |
+
{'loss': '1.769', 'grad_norm': '0.3774', 'learning_rate': '1.048e-05', 'epoch': '0.02124', 'num_input_tokens_seen': 161713, 'train_runtime': '82.04', 'train_tokens_per_second': '1971'}
|
| 83 |
+
{'loss': '1.591', 'grad_norm': '0.3267', 'learning_rate': '1.062e-05', 'epoch': '0.02151', 'num_input_tokens_seen': 163760, 'train_runtime': '83.06', 'train_tokens_per_second': '1972'}
|
| 84 |
+
{'loss': '1.682', 'grad_norm': '0.3958', 'learning_rate': '1.075e-05', 'epoch': '0.02178', 'num_input_tokens_seen': 165807, 'train_runtime': '84.07', 'train_tokens_per_second': '1972'}
|
| 85 |
+
{'loss': '1.415', 'grad_norm': '0.3386', 'learning_rate': '1.089e-05', 'epoch': '0.02205', 'num_input_tokens_seen': 167854, 'train_runtime': '85.09', 'train_tokens_per_second': '1973'}
|
| 86 |
+
{'loss': '1.275', 'grad_norm': '0.3369', 'learning_rate': '1.102e-05', 'epoch': '0.02232', 'num_input_tokens_seen': 169901, 'train_runtime': '86.11', 'train_tokens_per_second': '1973'}
|
| 87 |
+
{'loss': '1.799', 'grad_norm': '0.4252', 'learning_rate': '1.116e-05', 'epoch': '0.02259', 'num_input_tokens_seen': 171948, 'train_runtime': '87.13', 'train_tokens_per_second': '1974'}
|
| 88 |
+
{'loss': '1.631', 'grad_norm': '0.3741', 'learning_rate': '1.129e-05', 'epoch': '0.02286', 'num_input_tokens_seen': 173995, 'train_runtime': '88.14', 'train_tokens_per_second': '1974'}
|
| 89 |
+
{'loss': '1.696', 'grad_norm': '0.3964', 'learning_rate': '1.142e-05', 'epoch': '0.02312', 'num_input_tokens_seen': 176042, 'train_runtime': '89.16', 'train_tokens_per_second': '1974'}
|
| 90 |
+
{'loss': '1.811', 'grad_norm': '0.3835', 'learning_rate': '1.156e-05', 'epoch': '0.02339', 'num_input_tokens_seen': 178089, 'train_runtime': '90.17', 'train_tokens_per_second': '1975'}
|
| 91 |
+
{'loss': '1.628', 'grad_norm': '0.3732', 'learning_rate': '1.169e-05', 'epoch': '0.02366', 'num_input_tokens_seen': 180136, 'train_runtime': '91.19', 'train_tokens_per_second': '1975'}
|
| 92 |
+
{'loss': '1.772', 'grad_norm': '0.3954', 'learning_rate': '1.183e-05', 'epoch': '0.02393', 'num_input_tokens_seen': 182183, 'train_runtime': '92.21', 'train_tokens_per_second': '1976'}
|
| 93 |
+
{'loss': '1.709', 'grad_norm': '0.4323', 'learning_rate': '1.196e-05', 'epoch': '0.0242', 'num_input_tokens_seen': 184230, 'train_runtime': '93.23', 'train_tokens_per_second': '1976'}
|
| 94 |
+
{'loss': '1.63', 'grad_norm': '0.3912', 'learning_rate': '1.21e-05', 'epoch': '0.02447', 'num_input_tokens_seen': 186277, 'train_runtime': '94.24', 'train_tokens_per_second': '1977'}
|
| 95 |
+
{'loss': '1.688', 'grad_norm': '0.4078', 'learning_rate': '1.223e-05', 'epoch': '0.02474', 'num_input_tokens_seen': 188324, 'train_runtime': '95.26', 'train_tokens_per_second': '1977'}
|
| 96 |
+
{'loss': '1.883', 'grad_norm': '0.4385', 'learning_rate': '1.237e-05', 'epoch': '0.02501', 'num_input_tokens_seen': 190371, 'train_runtime': '96.28', 'train_tokens_per_second': '1977'}
|
| 97 |
+
{'loss': '1.763', 'grad_norm': '0.4172', 'learning_rate': '1.25e-05', 'epoch': '0.02528', 'num_input_tokens_seen': 192418, 'train_runtime': '97.29', 'train_tokens_per_second': '1978'}
|
| 98 |
+
{'loss': '1.675', 'grad_norm': '0.4223', 'learning_rate': '1.263e-05', 'epoch': '0.02554', 'num_input_tokens_seen': 194465, 'train_runtime': '98.31', 'train_tokens_per_second': '1978'}
|
| 99 |
+
{'loss': '1.747', 'grad_norm': '0.4324', 'learning_rate': '1.277e-05', 'epoch': '0.02581', 'num_input_tokens_seen': 196512, 'train_runtime': '99.33', 'train_tokens_per_second': '1978'}
|
| 100 |
+
{'loss': '1.792', 'grad_norm': '0.4544', 'learning_rate': '1.29e-05', 'epoch': '0.02608', 'num_input_tokens_seen': 198559, 'train_runtime': '100.3', 'train_tokens_per_second': '1979'}
|
| 101 |
+
{'loss': '1.596', 'grad_norm': '0.4222', 'learning_rate': '1.304e-05', 'epoch': '0.02635', 'num_input_tokens_seen': 200606, 'train_runtime': '101.4', 'train_tokens_per_second': '1979'}
|
| 102 |
+
{'loss': '1.533', 'grad_norm': '0.4118', 'learning_rate': '1.317e-05', 'epoch': '0.02662', 'num_input_tokens_seen': 202653, 'train_runtime': '102.4', 'train_tokens_per_second': '1979'}
|
| 103 |
+
{'loss': '1.608', 'grad_norm': '0.4393', 'learning_rate': '1.331e-05', 'epoch': '0.02689', 'num_input_tokens_seen': 204700, 'train_runtime': '103.4', 'train_tokens_per_second': '1980'}
|
| 104 |
+
{'loss': '1.307', 'grad_norm': '0.3855', 'learning_rate': '1.344e-05', 'epoch': '0.02716', 'num_input_tokens_seen': 206747, 'train_runtime': '104.4', 'train_tokens_per_second': '1980'}
|
| 105 |
+
{'loss': '1.775', 'grad_norm': '0.4397', 'learning_rate': '1.358e-05', 'epoch': '0.02743', 'num_input_tokens_seen': 208794, 'train_runtime': '105.4', 'train_tokens_per_second': '1980'}
|
| 106 |
+
{'loss': '1.165', 'grad_norm': '0.4129', 'learning_rate': '1.371e-05', 'epoch': '0.0277', 'num_input_tokens_seen': 210841, 'train_runtime': '106.4', 'train_tokens_per_second': '1981'}
|
| 107 |
+
{'loss': '1.774', 'grad_norm': '0.4688', 'learning_rate': '1.384e-05', 'epoch': '0.02796', 'num_input_tokens_seen': 212888, 'train_runtime': '107.5', 'train_tokens_per_second': '1981'}
|
| 108 |
+
{'loss': '1.548', 'grad_norm': '0.409', 'learning_rate': '1.398e-05', 'epoch': '0.02823', 'num_input_tokens_seen': 214935, 'train_runtime': '108.5', 'train_tokens_per_second': '1981'}
|
| 109 |
+
{'loss': '1.662', 'grad_norm': '0.4561', 'learning_rate': '1.411e-05', 'epoch': '0.0285', 'num_input_tokens_seen': 216982, 'train_runtime': '109.5', 'train_tokens_per_second': '1982'}
|
| 110 |
+
{'loss': '1.709', 'grad_norm': '0.5552', 'learning_rate': '1.425e-05', 'epoch': '0.02877', 'num_input_tokens_seen': 219029, 'train_runtime': '110.5', 'train_tokens_per_second': '1982'}
|
| 111 |
+
{'loss': '1.681', 'grad_norm': '0.4587', 'learning_rate': '1.438e-05', 'epoch': '0.02904', 'num_input_tokens_seen': 221076, 'train_runtime': '111.5', 'train_tokens_per_second': '1982'}
|
| 112 |
+
{'loss': '1.787', 'grad_norm': '0.4875', 'learning_rate': '1.452e-05', 'epoch': '0.02931', 'num_input_tokens_seen': 223123, 'train_runtime': '112.6', 'train_tokens_per_second': '1982'}
|
| 113 |
+
{'loss': '1.593', 'grad_norm': '0.4741', 'learning_rate': '1.465e-05', 'epoch': '0.02958', 'num_input_tokens_seen': 225170, 'train_runtime': '113.6', 'train_tokens_per_second': '1982'}
|
| 114 |
+
{'loss': '1.143', 'grad_norm': '0.4104', 'learning_rate': '1.478e-05', 'epoch': '0.02985', 'num_input_tokens_seen': 227217, 'train_runtime': '114.6', 'train_tokens_per_second': '1983'}
|
| 115 |
+
{'loss': '1.633', 'grad_norm': '0.4514', 'learning_rate': '1.492e-05', 'epoch': '0.03012', 'num_input_tokens_seen': 229264, 'train_runtime': '115.6', 'train_tokens_per_second': '1983'}
|
| 116 |
+
{'loss': '1.576', 'grad_norm': '0.4584', 'learning_rate': '1.505e-05', 'epoch': '0.03038', 'num_input_tokens_seen': 231311, 'train_runtime': '116.7', 'train_tokens_per_second': '1983'}
|
| 117 |
+
{'loss': '1.704', 'grad_norm': '0.4646', 'learning_rate': '1.519e-05', 'epoch': '0.03065', 'num_input_tokens_seen': 233358, 'train_runtime': '117.7', 'train_tokens_per_second': '1983'}
|
| 118 |
+
{'loss': '1.651', 'grad_norm': '0.4925', 'learning_rate': '1.532e-05', 'epoch': '0.03092', 'num_input_tokens_seen': 235405, 'train_runtime': '118.7', 'train_tokens_per_second': '1983'}
|
| 119 |
+
{'loss': '1.614', 'grad_norm': '0.4438', 'learning_rate': '1.546e-05', 'epoch': '0.03119', 'num_input_tokens_seen': 237452, 'train_runtime': '119.7', 'train_tokens_per_second': '1984'}
|
| 120 |
+
{'loss': '1.158', 'grad_norm': '0.4493', 'learning_rate': '1.559e-05', 'epoch': '0.03146', 'num_input_tokens_seen': 239499, 'train_runtime': '120.7', 'train_tokens_per_second': '1984'}
|
| 121 |
+
{'loss': '1.604', 'grad_norm': '0.545', 'learning_rate': '1.573e-05', 'epoch': '0.03173', 'num_input_tokens_seen': 241546, 'train_runtime': '121.7', 'train_tokens_per_second': '1984'}
|
| 122 |
+
{'loss': '1.744', 'grad_norm': '0.5362', 'learning_rate': '1.586e-05', 'epoch': '0.032', 'num_input_tokens_seen': 243593, 'train_runtime': '122.8', 'train_tokens_per_second': '1984'}
|
| 123 |
+
{'loss': '1.525', 'grad_norm': '0.5284', 'learning_rate': '1.599e-05', 'epoch': '0.03227', 'num_input_tokens_seen': 245640, 'train_runtime': '123.8', 'train_tokens_per_second': '1985'}
|
| 124 |
+
{'loss': '1.521', 'grad_norm': '0.5212', 'learning_rate': '1.613e-05', 'epoch': '0.03254', 'num_input_tokens_seen': 247687, 'train_runtime': '124.8', 'train_tokens_per_second': '1985'}
|
| 125 |
+
{'loss': '1.561', 'grad_norm': '0.5265', 'learning_rate': '1.626e-05', 'epoch': '0.0328', 'num_input_tokens_seen': 249734, 'train_runtime': '125.8', 'train_tokens_per_second': '1985'}
|
| 126 |
+
{'loss': '1.634', 'grad_norm': '0.5029', 'learning_rate': '1.64e-05', 'epoch': '0.03307', 'num_input_tokens_seen': 251781, 'train_runtime': '126.8', 'train_tokens_per_second': '1985'}
|
| 127 |
+
{'loss': '1.475', 'grad_norm': '1.579', 'learning_rate': '1.653e-05', 'epoch': '0.03334', 'num_input_tokens_seen': 253828, 'train_runtime': '127.8', 'train_tokens_per_second': '1985'}
|
| 128 |
+
{'loss': '1.53', 'grad_norm': '0.541', 'learning_rate': '1.667e-05', 'epoch': '0.03361', 'num_input_tokens_seen': 255875, 'train_runtime': '128.9', 'train_tokens_per_second': '1986'}
|
| 129 |
+
{'loss': '1.484', 'grad_norm': '0.5354', 'learning_rate': '1.68e-05', 'epoch': '0.03388', 'num_input_tokens_seen': 257922, 'train_runtime': '129.9', 'train_tokens_per_second': '1986'}
|
| 130 |
+
{'loss': '1.496', 'grad_norm': '0.6181', 'learning_rate': '1.694e-05', 'epoch': '0.03415', 'num_input_tokens_seen': 259969, 'train_runtime': '130.9', 'train_tokens_per_second': '1986'}
|
| 131 |
+
{'loss': '1.393', 'grad_norm': '0.5379', 'learning_rate': '1.707e-05', 'epoch': '0.03442', 'num_input_tokens_seen': 262016, 'train_runtime': '131.9', 'train_tokens_per_second': '1986'}
|
| 132 |
+
{'loss': '1.658', 'grad_norm': '0.599', 'learning_rate': '1.72e-05', 'epoch': '0.03469', 'num_input_tokens_seen': 264063, 'train_runtime': '132.9', 'train_tokens_per_second': '1986'}
|
| 133 |
+
{'loss': '1.735', 'grad_norm': '0.6024', 'learning_rate': '1.734e-05', 'epoch': '0.03496', 'num_input_tokens_seen': 266110, 'train_runtime': '134', 'train_tokens_per_second': '1987'}
|
| 134 |
+
{'loss': '1.582', 'grad_norm': '0.5961', 'learning_rate': '1.747e-05', 'epoch': '0.03522', 'num_input_tokens_seen': 268157, 'train_runtime': '135', 'train_tokens_per_second': '1987'}
|
| 135 |
+
{'loss': '1.432', 'grad_norm': '0.4836', 'learning_rate': '1.761e-05', 'epoch': '0.03549', 'num_input_tokens_seen': 270204, 'train_runtime': '136', 'train_tokens_per_second': '1987'}
|
| 136 |
+
{'loss': '1.463', 'grad_norm': '0.5285', 'learning_rate': '1.774e-05', 'epoch': '0.03576', 'num_input_tokens_seen': 272251, 'train_runtime': '137', 'train_tokens_per_second': '1987'}
|
| 137 |
+
{'loss': '1.529', 'grad_norm': '0.6326', 'learning_rate': '1.788e-05', 'epoch': '0.03603', 'num_input_tokens_seen': 274298, 'train_runtime': '138', 'train_tokens_per_second': '1987'}
|
| 138 |
+
{'loss': '1.533', 'grad_norm': '0.6052', 'learning_rate': '1.801e-05', 'epoch': '0.0363', 'num_input_tokens_seen': 276345, 'train_runtime': '139', 'train_tokens_per_second': '1987'}
|
| 139 |
+
{'loss': '1.655', 'grad_norm': '0.5771', 'learning_rate': '1.815e-05', 'epoch': '0.03657', 'num_input_tokens_seen': 278392, 'train_runtime': '140.1', 'train_tokens_per_second': '1988'}
|
| 140 |
+
{'loss': '1.518', 'grad_norm': '0.6251', 'learning_rate': '1.828e-05', 'epoch': '0.03684', 'num_input_tokens_seen': 280439, 'train_runtime': '141.1', 'train_tokens_per_second': '1988'}
|
| 141 |
+
{'loss': '1.387', 'grad_norm': '0.5392', 'learning_rate': '1.841e-05', 'epoch': '0.03711', 'num_input_tokens_seen': 282486, 'train_runtime': '142.1', 'train_tokens_per_second': '1988'}
|
| 142 |
+
{'loss': '1.677', 'grad_norm': '2.701', 'learning_rate': '1.855e-05', 'epoch': '0.03738', 'num_input_tokens_seen': 284533, 'train_runtime': '143.1', 'train_tokens_per_second': '1988'}
|
| 143 |
+
{'loss': '1.466', 'grad_norm': '0.5754', 'learning_rate': '1.868e-05', 'epoch': '0.03764', 'num_input_tokens_seen': 286580, 'train_runtime': '144.1', 'train_tokens_per_second': '1988'}
|
| 144 |
+
{'loss': '1.461', 'grad_norm': '0.5828', 'learning_rate': '1.882e-05', 'epoch': '0.03791', 'num_input_tokens_seen': 288627, 'train_runtime': '145.1', 'train_tokens_per_second': '1988'}
|
| 145 |
+
{'loss': '1.585', 'grad_norm': '0.6422', 'learning_rate': '1.895e-05', 'epoch': '0.03818', 'num_input_tokens_seen': 290674, 'train_runtime': '146.2', 'train_tokens_per_second': '1989'}
|
| 146 |
+
{'loss': '1.33', 'grad_norm': '0.569', 'learning_rate': '1.909e-05', 'epoch': '0.03845', 'num_input_tokens_seen': 292721, 'train_runtime': '147.2', 'train_tokens_per_second': '1989'}
|
| 147 |
+
{'loss': '1.607', 'grad_norm': '0.632', 'learning_rate': '1.922e-05', 'epoch': '0.03872', 'num_input_tokens_seen': 294768, 'train_runtime': '148.2', 'train_tokens_per_second': '1989'}
|
| 148 |
+
{'loss': '1.382', 'grad_norm': '0.5767', 'learning_rate': '1.935e-05', 'epoch': '0.03899', 'num_input_tokens_seen': 296815, 'train_runtime': '149.2', 'train_tokens_per_second': '1989'}
|
| 149 |
+
{'loss': '1.412', 'grad_norm': '0.6597', 'learning_rate': '1.949e-05', 'epoch': '0.03926', 'num_input_tokens_seen': 298862, 'train_runtime': '150.2', 'train_tokens_per_second': '1989'}
|
| 150 |
+
{'loss': '1.238', 'grad_norm': '0.5835', 'learning_rate': '1.962e-05', 'epoch': '0.03953', 'num_input_tokens_seen': 300909, 'train_runtime': '151.3', 'train_tokens_per_second': '1989'}
|
| 151 |
+
{'loss': '1.586', 'grad_norm': '0.6251', 'learning_rate': '1.976e-05', 'epoch': '0.0398', 'num_input_tokens_seen': 302956, 'train_runtime': '152.3', 'train_tokens_per_second': '1989'}
|
| 152 |
+
{'loss': '1.396', 'grad_norm': '0.629', 'learning_rate': '1.989e-05', 'epoch': '0.04006', 'num_input_tokens_seen': 305003, 'train_runtime': '153.3', 'train_tokens_per_second': '1990'}
|
| 153 |
+
{'loss': '1.484', 'grad_norm': '0.7154', 'learning_rate': '2.003e-05', 'epoch': '0.04033', 'num_input_tokens_seen': 307050, 'train_runtime': '154.3', 'train_tokens_per_second': '1990'}
|
| 154 |
+
{'loss': '1.553', 'grad_norm': '0.7419', 'learning_rate': '2.016e-05', 'epoch': '0.0406', 'num_input_tokens_seen': 309097, 'train_runtime': '155.3', 'train_tokens_per_second': '1990'}
|
| 155 |
+
{'loss': '1.573', 'grad_norm': '0.7395', 'learning_rate': '2.03e-05', 'epoch': '0.04087', 'num_input_tokens_seen': 311144, 'train_runtime': '156.4', 'train_tokens_per_second': '1990'}
|
| 156 |
+
{'loss': '1.284', 'grad_norm': '0.5886', 'learning_rate': '2.043e-05', 'epoch': '0.04114', 'num_input_tokens_seen': 313191, 'train_runtime': '157.4', 'train_tokens_per_second': '1990'}
|
| 157 |
+
{'loss': '1.444', 'grad_norm': '0.7212', 'learning_rate': '2.056e-05', 'epoch': '0.04141', 'num_input_tokens_seen': 315238, 'train_runtime': '158.4', 'train_tokens_per_second': '1990'}
|
| 158 |
+
{'loss': '1.456', 'grad_norm': '0.6589', 'learning_rate': '2.07e-05', 'epoch': '0.04168', 'num_input_tokens_seen': 317285, 'train_runtime': '159.4', 'train_tokens_per_second': '1990'}
|
| 159 |
+
{'loss': '1.469', 'grad_norm': '0.7179', 'learning_rate': '2.083e-05', 'epoch': '0.04195', 'num_input_tokens_seen': 319332, 'train_runtime': '160.4', 'train_tokens_per_second': '1991'}
|
| 160 |
+
File "/usr/local/bin/llamafactory-cli", line 8, in <module>
|
| 161 |
+
sys.exit(main())
|
| 162 |
+
^^^^^^
|
| 163 |
+
File "/workspace/LlamaFactory/src/llamafactory/cli.py", line 24, in main
|
| 164 |
+
launcher.launch()
|
| 165 |
+
File "/workspace/LlamaFactory/src/llamafactory/launcher.py", line 157, in launch
|
| 166 |
+
run_exp()
|
| 167 |
+
File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 125, in run_exp
|
| 168 |
+
_training_function(config={"args": args, "callbacks": callbacks})
|
| 169 |
+
File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 91, in _training_function
|
| 170 |
+
run_pt(model_args, data_args, training_args, finetuning_args, callbacks)
|
| 171 |
+
File "/workspace/LlamaFactory/src/llamafactory/train/pt/workflow.py", line 63, in run_pt
|
| 172 |
+
train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
|
| 173 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 174 |
+
File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2174, in train
|
| 175 |
+
return inner_training_loop(
|
| 176 |
+
^^^^^^^^^^^^^^^^^^^^
|
| 177 |
+
File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2536, in _inner_training_loop
|
| 178 |
+
tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
|
| 179 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 180 |
+
File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 3837, in training_step
|
| 181 |
+
self.accelerator.backward(loss, **kwargs)
|
| 182 |
+
File "/usr/local/lib/python3.11/dist-packages/accelerate/accelerator.py", line 2740, in backward
|
| 183 |
+
loss.backward(**kwargs)
|
| 184 |
+
File "/usr/local/lib/python3.11/dist-packages/torch/_tensor.py", line 521, in backward
|
| 185 |
+
torch.autograd.backward(
|
| 186 |
+
File "/usr/local/lib/python3.11/dist-packages/torch/autograd/__init__.py", line 289, in backward
|
| 187 |
+
_engine_run_backward(
|
| 188 |
+
File "/usr/local/lib/python3.11/dist-packages/torch/autograd/graph.py", line 769, in _engine_run_backward
|
| 189 |
+
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
| 190 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 191 |
+
KeyboardInterrupt
|
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/requirements.txt
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pytz==2025.2
|
| 2 |
+
pydub==0.25.1
|
| 3 |
+
brotli==1.2.0
|
| 4 |
+
antlr4-python3-runtime==4.9.3
|
| 5 |
+
xxhash==3.6.0
|
| 6 |
+
websockets==15.0.1
|
| 7 |
+
tzdata==2025.3
|
| 8 |
+
typing_extensions==4.15.0
|
| 9 |
+
tqdm==4.67.3
|
| 10 |
+
tomlkit==0.13.3
|
| 11 |
+
termcolor==3.3.0
|
| 12 |
+
shtab==1.8.0
|
| 13 |
+
shellingham==1.5.4
|
| 14 |
+
sentencepiece==0.2.1
|
| 15 |
+
semantic-version==2.10.0
|
| 16 |
+
safetensors==0.7.0
|
| 17 |
+
ruff==0.15.0
|
| 18 |
+
regex==2026.1.15
|
| 19 |
+
python-multipart==0.0.22
|
| 20 |
+
pyparsing==3.3.2
|
| 21 |
+
pyarrow==23.0.0
|
| 22 |
+
protobuf==6.33.5
|
| 23 |
+
propcache==0.4.1
|
| 24 |
+
orjson==3.11.7
|
| 25 |
+
omegaconf==2.3.0
|
| 26 |
+
numpy==2.4.2
|
| 27 |
+
multidict==6.7.1
|
| 28 |
+
mdurl==0.1.2
|
| 29 |
+
kiwisolver==1.4.9
|
| 30 |
+
hf-xet==1.2.0
|
| 31 |
+
hf_transfer==0.1.9
|
| 32 |
+
groovy==0.1.2
|
| 33 |
+
frozenlist==1.8.0
|
| 34 |
+
fonttools==4.61.1
|
| 35 |
+
ffmpy==1.0.0
|
| 36 |
+
einops==0.8.2
|
| 37 |
+
docstring_parser==0.17.0
|
| 38 |
+
dill==0.3.8
|
| 39 |
+
cycler==0.12.1
|
| 40 |
+
click==8.3.1
|
| 41 |
+
av==16.0.0
|
| 42 |
+
annotated-types==0.7.0
|
| 43 |
+
annotated-doc==0.0.4
|
| 44 |
+
aiohappyeyeballs==2.6.1
|
| 45 |
+
aiofiles==24.1.0
|
| 46 |
+
yarl==1.22.0
|
| 47 |
+
uvicorn==0.40.0
|
| 48 |
+
typing-inspection==0.4.2
|
| 49 |
+
typer-slim==0.21.1
|
| 50 |
+
tiktoken==0.12.0
|
| 51 |
+
scipy==1.17.0
|
| 52 |
+
pydantic_core==2.41.4
|
| 53 |
+
pandas==2.3.3
|
| 54 |
+
multiprocess==0.70.16
|
| 55 |
+
modelscope==1.34.0
|
| 56 |
+
markdown-it-py==4.0.0
|
| 57 |
+
fire==0.7.1
|
| 58 |
+
contourpy==1.3.3
|
| 59 |
+
anyio==4.12.1
|
| 60 |
+
aiosignal==1.4.0
|
| 61 |
+
starlette==0.50.0
|
| 62 |
+
rich==14.3.2
|
| 63 |
+
pydantic==2.12.3
|
| 64 |
+
matplotlib==3.10.8
|
| 65 |
+
aiohttp==3.13.3
|
| 66 |
+
tyro==0.8.14
|
| 67 |
+
typer==0.21.1
|
| 68 |
+
torchdata==0.11.0
|
| 69 |
+
sse-starlette==3.2.0
|
| 70 |
+
safehttpx==0.1.7
|
| 71 |
+
huggingface_hub==1.3.7
|
| 72 |
+
fastapi==0.128.0
|
| 73 |
+
tokenizers==0.22.2
|
| 74 |
+
gradio_client==1.14.0
|
| 75 |
+
datasets==4.0.0
|
| 76 |
+
accelerate==1.11.0
|
| 77 |
+
transformers==5.0.0
|
| 78 |
+
gradio==5.50.0
|
| 79 |
+
trl==0.24.0
|
| 80 |
+
peft==0.18.1
|
| 81 |
+
llamafactory==0.9.5.dev0
|
| 82 |
+
jieba==0.42.1
|
| 83 |
+
rouge-chinese==1.0.3
|
| 84 |
+
joblib==1.5.3
|
| 85 |
+
nltk==3.9.2
|
| 86 |
+
py-cpuinfo==9.0.0
|
| 87 |
+
nvidia-ml-py==13.590.48
|
| 88 |
+
hjson==3.1.0
|
| 89 |
+
ninja==1.13.0
|
| 90 |
+
msgpack==1.1.2
|
| 91 |
+
deepspeed==0.16.9
|
| 92 |
+
smmap==5.0.2
|
| 93 |
+
sentry-sdk==2.51.0
|
| 94 |
+
gitdb==4.0.12
|
| 95 |
+
GitPython==3.1.46
|
| 96 |
+
wandb==0.24.1
|
| 97 |
+
entrypoints==0.4
|
| 98 |
+
jupyter_client==7.4.9
|
| 99 |
+
nbclassic==1.1.0
|
| 100 |
+
notebook==6.5.5
|
| 101 |
+
pyzmq==24.0.1
|
| 102 |
+
PyYAML==6.0.2
|
| 103 |
+
Send2Trash==1.8.3
|
| 104 |
+
argon2-cffi==23.1.0
|
| 105 |
+
argon2-cffi-bindings==21.2.0
|
| 106 |
+
arrow==1.3.0
|
| 107 |
+
asttokens==2.4.1
|
| 108 |
+
async-lru==2.0.4
|
| 109 |
+
attrs==24.2.0
|
| 110 |
+
babel==2.16.0
|
| 111 |
+
beautifulsoup4==4.12.3
|
| 112 |
+
bleach==6.1.0
|
| 113 |
+
certifi==2024.8.30
|
| 114 |
+
cffi==1.17.1
|
| 115 |
+
charset-normalizer==3.3.2
|
| 116 |
+
comm==0.2.2
|
| 117 |
+
debugpy==1.8.5
|
| 118 |
+
decorator==5.1.1
|
| 119 |
+
defusedxml==0.7.1
|
| 120 |
+
executing==2.1.0
|
| 121 |
+
fastjsonschema==2.20.0
|
| 122 |
+
fqdn==1.5.1
|
| 123 |
+
h11==0.14.0
|
| 124 |
+
httpcore==1.0.5
|
| 125 |
+
httpx==0.27.2
|
| 126 |
+
idna==3.10
|
| 127 |
+
ipykernel==6.29.5
|
| 128 |
+
ipython==8.27.0
|
| 129 |
+
ipython-genutils==0.2.0
|
| 130 |
+
ipywidgets==8.1.5
|
| 131 |
+
isoduration==20.11.0
|
| 132 |
+
jedi==0.19.1
|
| 133 |
+
json5==0.9.25
|
| 134 |
+
jsonpointer==3.0.0
|
| 135 |
+
jsonschema==4.23.0
|
| 136 |
+
jsonschema-specifications==2023.12.1
|
| 137 |
+
jupyter-archive==3.4.0
|
| 138 |
+
jupyter_contrib_core==0.4.2
|
| 139 |
+
jupyter_contrib_nbextensions==0.7.0
|
| 140 |
+
jupyter_core==5.7.2
|
| 141 |
+
jupyter-events==0.10.0
|
| 142 |
+
jupyter-highlight-selected-word==0.2.0
|
| 143 |
+
jupyter-lsp==2.2.5
|
| 144 |
+
jupyter_nbextensions_configurator==0.6.4
|
| 145 |
+
jupyter_server==2.14.2
|
| 146 |
+
jupyter_server_terminals==0.5.3
|
| 147 |
+
jupyterlab==4.2.5
|
| 148 |
+
jupyterlab_pygments==0.3.0
|
| 149 |
+
jupyterlab_server==2.27.3
|
| 150 |
+
jupyterlab_widgets==3.0.13
|
| 151 |
+
lxml==5.3.0
|
| 152 |
+
matplotlib-inline==0.1.7
|
| 153 |
+
mistune==3.0.2
|
| 154 |
+
nbclient==0.10.0
|
| 155 |
+
nbconvert==7.16.4
|
| 156 |
+
nbformat==5.10.4
|
| 157 |
+
nest-asyncio==1.6.0
|
| 158 |
+
notebook_shim==0.2.4
|
| 159 |
+
overrides==7.7.0
|
| 160 |
+
packaging==24.1
|
| 161 |
+
pandocfilters==1.5.1
|
| 162 |
+
parso==0.8.4
|
| 163 |
+
pexpect==4.9.0
|
| 164 |
+
platformdirs==4.3.6
|
| 165 |
+
prometheus_client==0.21.0
|
| 166 |
+
prompt_toolkit==3.0.47
|
| 167 |
+
psutil==6.0.0
|
| 168 |
+
ptyprocess==0.7.0
|
| 169 |
+
pure_eval==0.2.3
|
| 170 |
+
pycparser==2.22
|
| 171 |
+
Pygments==2.18.0
|
| 172 |
+
python-dateutil==2.9.0.post0
|
| 173 |
+
python-json-logger==2.0.7
|
| 174 |
+
referencing==0.35.1
|
| 175 |
+
requests==2.32.3
|
| 176 |
+
rfc3339-validator==0.1.4
|
| 177 |
+
rfc3986-validator==0.1.1
|
| 178 |
+
rpds-py==0.20.0
|
| 179 |
+
sniffio==1.3.1
|
| 180 |
+
soupsieve==2.6
|
| 181 |
+
stack-data==0.6.3
|
| 182 |
+
terminado==0.18.1
|
| 183 |
+
tinycss2==1.3.0
|
| 184 |
+
tornado==6.4.1
|
| 185 |
+
traitlets==5.14.3
|
| 186 |
+
types-python-dateutil==2.9.0.20240906
|
| 187 |
+
uri-template==1.3.0
|
| 188 |
+
urllib3==2.2.3
|
| 189 |
+
wcwidth==0.2.13
|
| 190 |
+
webcolors==24.8.0
|
| 191 |
+
webencodings==0.5.1
|
| 192 |
+
websocket-client==1.8.0
|
| 193 |
+
widgetsnbextension==4.0.13
|
| 194 |
+
Jinja2==3.1.3
|
| 195 |
+
MarkupSafe==2.1.5
|
| 196 |
+
filelock==3.13.1
|
| 197 |
+
fsspec==2024.2.0
|
| 198 |
+
mpmath==1.3.0
|
| 199 |
+
networkx==3.2.1
|
| 200 |
+
nvidia-cublas-cu12==12.4.2.65
|
| 201 |
+
nvidia-cuda-cupti-cu12==12.4.99
|
| 202 |
+
nvidia-cuda-nvrtc-cu12==12.4.99
|
| 203 |
+
nvidia-cuda-runtime-cu12==12.4.99
|
| 204 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 205 |
+
nvidia-cufft-cu12==11.2.0.44
|
| 206 |
+
nvidia-curand-cu12==10.3.5.119
|
| 207 |
+
nvidia-cusolver-cu12==11.6.0.99
|
| 208 |
+
nvidia-cusparse-cu12==12.3.0.142
|
| 209 |
+
nvidia-nccl-cu12==2.20.5
|
| 210 |
+
nvidia-nvjitlink-cu12==12.4.99
|
| 211 |
+
nvidia-nvtx-cu12==12.4.99
|
| 212 |
+
pillow==10.2.0
|
| 213 |
+
sympy==1.12
|
| 214 |
+
torch==2.4.1+cu124
|
| 215 |
+
torchaudio==2.4.1+cu124
|
| 216 |
+
torchvision==0.19.1+cu124
|
| 217 |
+
triton==3.0.0
|
| 218 |
+
pip==24.2
|
| 219 |
+
setuptools==75.1.0
|
| 220 |
+
wheel==0.44.0
|
| 221 |
+
PyGObject==3.42.1
|
| 222 |
+
PyJWT==2.3.0
|
| 223 |
+
SecretStorage==3.3.1
|
| 224 |
+
blinker==1.4
|
| 225 |
+
cryptography==3.4.8
|
| 226 |
+
dbus-python==1.2.18
|
| 227 |
+
distro==1.7.0
|
| 228 |
+
httplib2==0.20.2
|
| 229 |
+
importlib-metadata==4.6.4
|
| 230 |
+
jeepney==0.7.1
|
| 231 |
+
keyring==23.5.0
|
| 232 |
+
launchpadlib==1.10.16
|
| 233 |
+
lazr.restfulclient==0.14.4
|
| 234 |
+
lazr.uri==1.0.6
|
| 235 |
+
more-itertools==8.10.0
|
| 236 |
+
oauthlib==3.2.0
|
| 237 |
+
python-apt==2.4.0+ubuntu4
|
| 238 |
+
six==1.16.0
|
| 239 |
+
wadllib==1.3.6
|
| 240 |
+
zipp==1.0.0
|
| 241 |
+
autocommand==2.2.2
|
| 242 |
+
backports.tarfile==1.2.0
|
| 243 |
+
importlib_metadata==8.0.0
|
| 244 |
+
importlib_resources==6.4.0
|
| 245 |
+
inflect==7.3.1
|
| 246 |
+
jaraco.collections==5.1.0
|
| 247 |
+
jaraco.context==5.3.0
|
| 248 |
+
jaraco.functools==4.0.1
|
| 249 |
+
jaraco.text==3.12.1
|
| 250 |
+
more-itertools==10.3.0
|
| 251 |
+
packaging==24.1
|
| 252 |
+
platformdirs==4.2.2
|
| 253 |
+
tomli==2.0.1
|
| 254 |
+
typeguard==4.3.0
|
| 255 |
+
typing_extensions==4.12.2
|
| 256 |
+
wheel==0.43.0
|
| 257 |
+
zipp==3.19.2
|
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-6.8.0-94-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.10",
|
| 4 |
+
"startedAt": "2026-02-04T04:05:44.037622Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"/workspace/v127rc_exp1/C.yaml"
|
| 7 |
+
],
|
| 8 |
+
"program": "/usr/local/bin/llamafactory-cli",
|
| 9 |
+
"git": {
|
| 10 |
+
"remote": "https://github.com/hiyouga/LlamaFactory.git",
|
| 11 |
+
"commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
|
| 12 |
+
},
|
| 13 |
+
"email": "markmochi200@gmail.com",
|
| 14 |
+
"root": "/workspace/LlamaFactory",
|
| 15 |
+
"host": "47a53adf0198",
|
| 16 |
+
"executable": "/usr/bin/python",
|
| 17 |
+
"cpu_count": 16,
|
| 18 |
+
"cpu_count_logical": 32,
|
| 19 |
+
"gpu": "NVIDIA GeForce RTX 4090",
|
| 20 |
+
"gpu_count": 1,
|
| 21 |
+
"disk": {
|
| 22 |
+
"/": {
|
| 23 |
+
"total": "21474836480",
|
| 24 |
+
"used": "1858318336"
|
| 25 |
+
}
|
| 26 |
+
},
|
| 27 |
+
"memory": {
|
| 28 |
+
"total": "201701408768"
|
| 29 |
+
},
|
| 30 |
+
"gpu_nvidia": [
|
| 31 |
+
{
|
| 32 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 33 |
+
"memoryTotal": "25757220864",
|
| 34 |
+
"cudaCores": 16384,
|
| 35 |
+
"architecture": "Ada",
|
| 36 |
+
"uuid": "GPU-2ae1a495-e17f-23d9-e8ed-90585b3df9de"
|
| 37 |
+
}
|
| 38 |
+
],
|
| 39 |
+
"cudaVersion": "13.0",
|
| 40 |
+
"writerId": "jy6in5azojamixlag12ky8yqk0a5luc8"
|
| 41 |
+
}
|
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_runtime":159,"_timestamp":1.770178104014671e+09,"train/grad_norm":0.7178835272789001,"_wandb":{"runtime":159},"train/train_tokens_per_second":1990.521,"train/num_input_tokens_seen":319332,"train/global_step":156,"train/epoch":0.041946759881688625,"train_runtime":160.4264,"train/loss":1.4694324731826782,"train/learning_rate":2.0833333333333336e-05,"_step":155}
|
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-02-04T04:05:44.28893781Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
|
| 2 |
+
{"time":"2026-02-04T04:05:44.666073338Z","level":"INFO","msg":"stream: created new stream","id":"nj0w4q6e"}
|
| 3 |
+
{"time":"2026-02-04T04:05:44.666543269Z","level":"INFO","msg":"handler: started","stream_id":"nj0w4q6e"}
|
| 4 |
+
{"time":"2026-02-04T04:05:44.668183448Z","level":"INFO","msg":"stream: started","id":"nj0w4q6e"}
|
| 5 |
+
{"time":"2026-02-04T04:05:44.668196893Z","level":"INFO","msg":"writer: started","stream_id":"nj0w4q6e"}
|
| 6 |
+
{"time":"2026-02-04T04:05:44.668198065Z","level":"INFO","msg":"sender: started","stream_id":"nj0w4q6e"}
|
| 7 |
+
{"time":"2026-02-04T04:08:24.969216421Z","level":"INFO","msg":"stream: closing","id":"nj0w4q6e"}
|
| 8 |
+
{"time":"2026-02-04T04:08:25.578748227Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 9 |
+
{"time":"2026-02-04T04:08:25.833732236Z","level":"INFO","msg":"handler: closed","stream_id":"nj0w4q6e"}
|
| 10 |
+
{"time":"2026-02-04T04:08:25.837480922Z","level":"INFO","msg":"sender: closed","stream_id":"nj0w4q6e"}
|
| 11 |
+
{"time":"2026-02-04T04:08:25.837821633Z","level":"INFO","msg":"stream: closed","id":"nj0w4q6e"}
|
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug.log
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-02-04 04:05:44,065 INFO MainThread:6386 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1
|
| 2 |
+
2026-02-04 04:05:44,065 INFO MainThread:6386 [wandb_setup.py:_flush():81] Configure stats pid to 6386
|
| 3 |
+
2026-02-04 04:05:44,066 INFO MainThread:6386 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 4 |
+
2026-02-04 04:05:44,066 INFO MainThread:6386 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug.log
|
| 5 |
+
2026-02-04 04:05:44,067 INFO MainThread:6386 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug-internal.log
|
| 6 |
+
2026-02-04 04:05:44,067 INFO MainThread:6386 [wandb_init.py:init():844] calling init triggers
|
| 7 |
+
2026-02-04 04:05:44,068 INFO MainThread:6386 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
|
| 8 |
+
config: {'_wandb': {}}
|
| 9 |
+
2026-02-04 04:05:44,068 INFO MainThread:6386 [wandb_init.py:init():892] starting backend
|
| 10 |
+
2026-02-04 04:05:44,278 INFO MainThread:6386 [wandb_init.py:init():895] sending inform_init request
|
| 11 |
+
2026-02-04 04:05:44,286 INFO MainThread:6386 [wandb_init.py:init():903] backend started and connected
|
| 12 |
+
2026-02-04 04:05:44,288 INFO MainThread:6386 [wandb_init.py:init():973] updated telemetry
|
| 13 |
+
2026-02-04 04:05:44,352 INFO MainThread:6386 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
|
| 14 |
+
2026-02-04 04:05:44,992 INFO MainThread:6386 [wandb_init.py:init():1042] starting run threads in backend
|
| 15 |
+
2026-02-04 04:05:45,060 INFO MainThread:6386 [wandb_run.py:_console_start():2529] atexit reg
|
| 16 |
+
2026-02-04 04:05:45,060 INFO MainThread:6386 [wandb_run.py:_redirect():2377] redirect: wrap_raw
|
| 17 |
+
2026-02-04 04:05:45,061 INFO MainThread:6386 [wandb_run.py:_redirect():2446] Wrapping output streams.
|
| 18 |
+
2026-02-04 04:05:45,061 INFO MainThread:6386 [wandb_run.py:_redirect():2469] Redirects installed.
|
| 19 |
+
2026-02-04 04:05:45,063 INFO MainThread:6386 [wandb_init.py:init():1082] run started, returning control to user process
|
| 20 |
+
2026-02-04 04:05:45,064 INFO MainThread:6386 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['o_proj', 'down_proj', 'gate_proj', 'v_proj', 'k_proj', 'q_proj', 'up_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/C', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 266, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
|
| 21 |
+
2026-02-04 04:05:45,071 INFO MainThread:6386 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7ea90c2fcf90>>
|
| 22 |
+
2026-02-04 04:05:45,071 INFO MainThread:6386 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
|
| 23 |
+
2026-02-04 04:05:45,073 INFO MainThread:6386 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d35_r286'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
|
| 24 |
+
2026-02-04 04:08:24,969 INFO wandb-AsyncioManager-main:6386 [service_client.py:_forward_responses():94] Reached EOF.
|
| 25 |
+
2026-02-04 04:08:24,970 INFO wandb-AsyncioManager-main:6386 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.
|
LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/config.yaml
ADDED
|
@@ -0,0 +1,723 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_name_or_path:
|
| 2 |
+
value: /workspace/Qwen/Qwen3-8B-Base
|
| 3 |
+
_wandb:
|
| 4 |
+
value:
|
| 5 |
+
cli_version: 0.24.1
|
| 6 |
+
e:
|
| 7 |
+
dq2kg12neczzbdsqmciypnior6fee84h:
|
| 8 |
+
args:
|
| 9 |
+
- /workspace/v127rc_exp1/B_dup.yaml
|
| 10 |
+
cpu_count: 16
|
| 11 |
+
cpu_count_logical: 32
|
| 12 |
+
cudaVersion: "12.7"
|
| 13 |
+
disk:
|
| 14 |
+
/:
|
| 15 |
+
total: "21474836480"
|
| 16 |
+
used: "2193969152"
|
| 17 |
+
email: markmochi200@gmail.com
|
| 18 |
+
executable: /usr/bin/python
|
| 19 |
+
git:
|
| 20 |
+
commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
|
| 21 |
+
remote: https://github.com/hiyouga/LlamaFactory.git
|
| 22 |
+
gpu: NVIDIA GeForce RTX 4090
|
| 23 |
+
gpu_count: 1
|
| 24 |
+
gpu_nvidia:
|
| 25 |
+
- architecture: Ada
|
| 26 |
+
cudaCores: 16384
|
| 27 |
+
memoryTotal: "25757220864"
|
| 28 |
+
name: NVIDIA GeForce RTX 4090
|
| 29 |
+
uuid: GPU-1c2ea8ac-6c6f-58d4-0df9-20a74e0985f1
|
| 30 |
+
host: e5c6872797ac
|
| 31 |
+
memory:
|
| 32 |
+
total: "201701502976"
|
| 33 |
+
os: Linux-6.8.0-52-generic-x86_64-with-glibc2.35
|
| 34 |
+
program: /usr/local/bin/llamafactory-cli
|
| 35 |
+
python: CPython 3.11.10
|
| 36 |
+
root: /workspace/LlamaFactory
|
| 37 |
+
startedAt: "2026-02-04T08:35:48.570855Z"
|
| 38 |
+
writerId: dq2kg12neczzbdsqmciypnior6fee84h
|
| 39 |
+
m:
|
| 40 |
+
- "1": train/global_step
|
| 41 |
+
"6":
|
| 42 |
+
- 3
|
| 43 |
+
"7": []
|
| 44 |
+
- "2": '*'
|
| 45 |
+
"5": 1
|
| 46 |
+
"6":
|
| 47 |
+
- 1
|
| 48 |
+
"7": []
|
| 49 |
+
python_version: 3.11.10
|
| 50 |
+
t:
|
| 51 |
+
"1":
|
| 52 |
+
- 1
|
| 53 |
+
- 11
|
| 54 |
+
- 41
|
| 55 |
+
- 49
|
| 56 |
+
- 51
|
| 57 |
+
- 71
|
| 58 |
+
- 84
|
| 59 |
+
- 98
|
| 60 |
+
- 105
|
| 61 |
+
"2":
|
| 62 |
+
- 1
|
| 63 |
+
- 11
|
| 64 |
+
- 41
|
| 65 |
+
- 49
|
| 66 |
+
- 51
|
| 67 |
+
- 71
|
| 68 |
+
- 84
|
| 69 |
+
- 98
|
| 70 |
+
- 105
|
| 71 |
+
"3":
|
| 72 |
+
- 7
|
| 73 |
+
- 19
|
| 74 |
+
- 62
|
| 75 |
+
- 66
|
| 76 |
+
"4": 3.11.10
|
| 77 |
+
"5": 0.24.1
|
| 78 |
+
"6": 5.0.0
|
| 79 |
+
"9":
|
| 80 |
+
"1": transformers_trainer
|
| 81 |
+
"12": 0.24.1
|
| 82 |
+
"13": linux-x86_64
|
| 83 |
+
accelerator_config:
|
| 84 |
+
value:
|
| 85 |
+
dispatch_batches: null
|
| 86 |
+
even_batches: true
|
| 87 |
+
gradient_accumulation_kwargs: null
|
| 88 |
+
non_blocking: false
|
| 89 |
+
split_batches: false
|
| 90 |
+
use_seedable_sampler: true
|
| 91 |
+
adam_beta1:
|
| 92 |
+
value: 0.9
|
| 93 |
+
adam_beta2:
|
| 94 |
+
value: 0.95
|
| 95 |
+
adam_epsilon:
|
| 96 |
+
value: 1e-08
|
| 97 |
+
architectures:
|
| 98 |
+
value:
|
| 99 |
+
- Qwen3ForCausalLM
|
| 100 |
+
attention_bias:
|
| 101 |
+
value: false
|
| 102 |
+
attention_dropout:
|
| 103 |
+
value: 0
|
| 104 |
+
auto_find_batch_size:
|
| 105 |
+
value: false
|
| 106 |
+
average_tokens_across_devices:
|
| 107 |
+
value: true
|
| 108 |
+
batch_eval_metrics:
|
| 109 |
+
value: false
|
| 110 |
+
bf16:
|
| 111 |
+
value: true
|
| 112 |
+
bf16_full_eval:
|
| 113 |
+
value: false
|
| 114 |
+
bos_token_id:
|
| 115 |
+
value: null
|
| 116 |
+
chunk_size_feed_forward:
|
| 117 |
+
value: 0
|
| 118 |
+
data_args:
|
| 119 |
+
value:
|
| 120 |
+
buffer_size: 16384
|
| 121 |
+
cutoff_len: 2047
|
| 122 |
+
data_shared_file_system: false
|
| 123 |
+
dataset:
|
| 124 |
+
- Markie_Voss_t0_d35_r286
|
| 125 |
+
dataset_dir: /workspace/LlamaFactory/data
|
| 126 |
+
default_system: null
|
| 127 |
+
enable_thinking: false
|
| 128 |
+
eval_dataset: null
|
| 129 |
+
eval_num_beams: null
|
| 130 |
+
eval_on_each_dataset: false
|
| 131 |
+
ignore_pad_token_for_loss: true
|
| 132 |
+
interleave_probs: null
|
| 133 |
+
mask_history: false
|
| 134 |
+
max_samples: 100000000
|
| 135 |
+
media_dir: /workspace/LlamaFactory/data
|
| 136 |
+
mix_strategy: concat
|
| 137 |
+
neat_packing: false
|
| 138 |
+
overwrite_cache: false
|
| 139 |
+
packing: true
|
| 140 |
+
preprocessing_batch_size: 1000
|
| 141 |
+
preprocessing_num_workers: 16
|
| 142 |
+
streaming: false
|
| 143 |
+
template: qwen3_nothink
|
| 144 |
+
tokenized_path: null
|
| 145 |
+
tool_format: null
|
| 146 |
+
train_on_prompt: false
|
| 147 |
+
val_size: 0
|
| 148 |
+
data_seed:
|
| 149 |
+
value: null
|
| 150 |
+
dataloader_drop_last:
|
| 151 |
+
value: false
|
| 152 |
+
dataloader_num_workers:
|
| 153 |
+
value: 0
|
| 154 |
+
dataloader_persistent_workers:
|
| 155 |
+
value: false
|
| 156 |
+
dataloader_pin_memory:
|
| 157 |
+
value: true
|
| 158 |
+
dataloader_prefetch_factor:
|
| 159 |
+
value: null
|
| 160 |
+
ddp_backend:
|
| 161 |
+
value: null
|
| 162 |
+
ddp_broadcast_buffers:
|
| 163 |
+
value: null
|
| 164 |
+
ddp_bucket_cap_mb:
|
| 165 |
+
value: null
|
| 166 |
+
ddp_find_unused_parameters:
|
| 167 |
+
value: null
|
| 168 |
+
ddp_timeout:
|
| 169 |
+
value: 180000000
|
| 170 |
+
debug:
|
| 171 |
+
value: []
|
| 172 |
+
deepspeed:
|
| 173 |
+
value: null
|
| 174 |
+
disable_tqdm:
|
| 175 |
+
value: false
|
| 176 |
+
do_eval:
|
| 177 |
+
value: false
|
| 178 |
+
do_predict:
|
| 179 |
+
value: false
|
| 180 |
+
do_train:
|
| 181 |
+
value: true
|
| 182 |
+
dtype:
|
| 183 |
+
value: bfloat16
|
| 184 |
+
enable_jit_checkpoint:
|
| 185 |
+
value: false
|
| 186 |
+
eos_token_id:
|
| 187 |
+
value: 151645
|
| 188 |
+
eval_accumulation_steps:
|
| 189 |
+
value: null
|
| 190 |
+
eval_delay:
|
| 191 |
+
value: 0
|
| 192 |
+
eval_do_concat_batches:
|
| 193 |
+
value: true
|
| 194 |
+
eval_on_start:
|
| 195 |
+
value: false
|
| 196 |
+
eval_steps:
|
| 197 |
+
value: null
|
| 198 |
+
eval_strategy:
|
| 199 |
+
value: "no"
|
| 200 |
+
eval_use_gather_object:
|
| 201 |
+
value: false
|
| 202 |
+
finetuning_args:
|
| 203 |
+
value:
|
| 204 |
+
additional_target: null
|
| 205 |
+
apollo_layerwise: false
|
| 206 |
+
apollo_proj: random
|
| 207 |
+
apollo_proj_type: std
|
| 208 |
+
apollo_rank: 16
|
| 209 |
+
apollo_scale: 32
|
| 210 |
+
apollo_scale_front: false
|
| 211 |
+
apollo_scale_type: channel
|
| 212 |
+
apollo_target:
|
| 213 |
+
- all
|
| 214 |
+
apollo_update_interval: 200
|
| 215 |
+
badam_mask_mode: adjacent
|
| 216 |
+
badam_mode: layer
|
| 217 |
+
badam_start_block: null
|
| 218 |
+
badam_switch_interval: 50
|
| 219 |
+
badam_switch_mode: ascending
|
| 220 |
+
badam_update_ratio: 0.05
|
| 221 |
+
badam_verbose: 0
|
| 222 |
+
compute_accuracy: false
|
| 223 |
+
create_new_adapter: false
|
| 224 |
+
disable_shuffling: false
|
| 225 |
+
dpo_label_smoothing: 0
|
| 226 |
+
eaft_alpha: 1
|
| 227 |
+
early_stopping_steps: null
|
| 228 |
+
finetuning_type: lora
|
| 229 |
+
freeze_extra_modules: null
|
| 230 |
+
freeze_language_model: false
|
| 231 |
+
freeze_multi_modal_projector: true
|
| 232 |
+
freeze_trainable_layers: 2
|
| 233 |
+
freeze_trainable_modules:
|
| 234 |
+
- all
|
| 235 |
+
freeze_vision_tower: true
|
| 236 |
+
galore_layerwise: false
|
| 237 |
+
galore_proj_type: std
|
| 238 |
+
galore_rank: 16
|
| 239 |
+
galore_scale: 2
|
| 240 |
+
galore_target:
|
| 241 |
+
- all
|
| 242 |
+
galore_update_interval: 200
|
| 243 |
+
include_effective_tokens_per_second: false
|
| 244 |
+
kto_chosen_weight: 1
|
| 245 |
+
kto_rejected_weight: 1
|
| 246 |
+
ld_alpha: null
|
| 247 |
+
lora_alpha: 32
|
| 248 |
+
lora_dropout: 0.03
|
| 249 |
+
lora_rank: 16
|
| 250 |
+
lora_target:
|
| 251 |
+
- all
|
| 252 |
+
loraplus_lr_embedding: 1e-06
|
| 253 |
+
loraplus_lr_ratio: null
|
| 254 |
+
module_dropout: 0
|
| 255 |
+
oft_block_size: 32
|
| 256 |
+
oft_rank: 0
|
| 257 |
+
oft_target:
|
| 258 |
+
- all
|
| 259 |
+
pissa_convert: false
|
| 260 |
+
pissa_init: false
|
| 261 |
+
pissa_iter: 16
|
| 262 |
+
plot_loss: true
|
| 263 |
+
ppo_buffer_size: 1
|
| 264 |
+
ppo_epochs: 4
|
| 265 |
+
ppo_score_norm: false
|
| 266 |
+
ppo_target: 6
|
| 267 |
+
ppo_whiten_rewards: false
|
| 268 |
+
pref_bco_weight: 0
|
| 269 |
+
pref_beta: 0.1
|
| 270 |
+
pref_ftx: 0
|
| 271 |
+
pref_loss: sigmoid
|
| 272 |
+
pure_bf16: false
|
| 273 |
+
ref_model: null
|
| 274 |
+
ref_model_adapters: null
|
| 275 |
+
ref_model_quantization_bit: null
|
| 276 |
+
reward_model: null
|
| 277 |
+
reward_model_adapters: null
|
| 278 |
+
reward_model_quantization_bit: null
|
| 279 |
+
reward_model_type: lora
|
| 280 |
+
simpo_gamma: 0.5
|
| 281 |
+
stage: pt
|
| 282 |
+
swanlab_api_key: <SWANLAB_API_KEY>
|
| 283 |
+
swanlab_lark_secret: null
|
| 284 |
+
swanlab_lark_webhook_url: null
|
| 285 |
+
swanlab_logdir: null
|
| 286 |
+
swanlab_mode: cloud
|
| 287 |
+
swanlab_project: llamafactory
|
| 288 |
+
swanlab_run_name: null
|
| 289 |
+
swanlab_workspace: null
|
| 290 |
+
use_adam_mini: false
|
| 291 |
+
use_apollo: false
|
| 292 |
+
use_badam: false
|
| 293 |
+
use_dft_loss: false
|
| 294 |
+
use_dora: false
|
| 295 |
+
use_eaft_loss: false
|
| 296 |
+
use_galore: false
|
| 297 |
+
use_llama_pro: false
|
| 298 |
+
use_mca: false
|
| 299 |
+
use_muon: false
|
| 300 |
+
use_rslora: false
|
| 301 |
+
use_swanlab: false
|
| 302 |
+
fp8:
|
| 303 |
+
value: false
|
| 304 |
+
fp8_backend:
|
| 305 |
+
value: auto
|
| 306 |
+
fp8_enable_fsdp_float8_all_gather:
|
| 307 |
+
value: false
|
| 308 |
+
fp16:
|
| 309 |
+
value: false
|
| 310 |
+
fp16_full_eval:
|
| 311 |
+
value: false
|
| 312 |
+
fsdp:
|
| 313 |
+
value: []
|
| 314 |
+
fsdp_config:
|
| 315 |
+
value:
|
| 316 |
+
min_num_params: 0
|
| 317 |
+
xla: false
|
| 318 |
+
xla_fsdp_grad_ckpt: false
|
| 319 |
+
xla_fsdp_v2: false
|
| 320 |
+
full_determinism:
|
| 321 |
+
value: false
|
| 322 |
+
generating_args:
|
| 323 |
+
value:
|
| 324 |
+
do_sample: true
|
| 325 |
+
length_penalty: 1
|
| 326 |
+
max_new_tokens: 1024
|
| 327 |
+
num_beams: 1
|
| 328 |
+
repetition_penalty: 1
|
| 329 |
+
skip_special_tokens: true
|
| 330 |
+
temperature: 0.95
|
| 331 |
+
top_k: 50
|
| 332 |
+
top_p: 0.7
|
| 333 |
+
generation_config:
|
| 334 |
+
value: null
|
| 335 |
+
generation_max_length:
|
| 336 |
+
value: 2047
|
| 337 |
+
generation_num_beams:
|
| 338 |
+
value: null
|
| 339 |
+
gradient_accumulation_steps:
|
| 340 |
+
value: 1
|
| 341 |
+
gradient_checkpointing:
|
| 342 |
+
value: false
|
| 343 |
+
gradient_checkpointing_kwargs:
|
| 344 |
+
value: null
|
| 345 |
+
greater_is_better:
|
| 346 |
+
value: null
|
| 347 |
+
group_by_length:
|
| 348 |
+
value: false
|
| 349 |
+
head_dim:
|
| 350 |
+
value: 128
|
| 351 |
+
hidden_act:
|
| 352 |
+
value: silu
|
| 353 |
+
hidden_size:
|
| 354 |
+
value: 4096
|
| 355 |
+
hub_always_push:
|
| 356 |
+
value: false
|
| 357 |
+
hub_model_id:
|
| 358 |
+
value: null
|
| 359 |
+
hub_private_repo:
|
| 360 |
+
value: null
|
| 361 |
+
hub_revision:
|
| 362 |
+
value: null
|
| 363 |
+
hub_strategy:
|
| 364 |
+
value: every_save
|
| 365 |
+
hub_token:
|
| 366 |
+
value: <HUB_TOKEN>
|
| 367 |
+
id2label:
|
| 368 |
+
value:
|
| 369 |
+
"0": LABEL_0
|
| 370 |
+
"1": LABEL_1
|
| 371 |
+
ignore_data_skip:
|
| 372 |
+
value: false
|
| 373 |
+
include_for_metrics:
|
| 374 |
+
value: []
|
| 375 |
+
include_num_input_tokens_seen:
|
| 376 |
+
value: all
|
| 377 |
+
initializer_range:
|
| 378 |
+
value: 0.02
|
| 379 |
+
intermediate_size:
|
| 380 |
+
value: 12288
|
| 381 |
+
is_encoder_decoder:
|
| 382 |
+
value: false
|
| 383 |
+
label_names:
|
| 384 |
+
value:
|
| 385 |
+
- labels
|
| 386 |
+
label_smoothing_factor:
|
| 387 |
+
value: 0
|
| 388 |
+
label2id:
|
| 389 |
+
value:
|
| 390 |
+
LABEL_0: 0
|
| 391 |
+
LABEL_1: 1
|
| 392 |
+
layer_types:
|
| 393 |
+
value:
|
| 394 |
+
- full_attention
|
| 395 |
+
- full_attention
|
| 396 |
+
- full_attention
|
| 397 |
+
- full_attention
|
| 398 |
+
- full_attention
|
| 399 |
+
- full_attention
|
| 400 |
+
- full_attention
|
| 401 |
+
- full_attention
|
| 402 |
+
- full_attention
|
| 403 |
+
- full_attention
|
| 404 |
+
- full_attention
|
| 405 |
+
- full_attention
|
| 406 |
+
- full_attention
|
| 407 |
+
- full_attention
|
| 408 |
+
- full_attention
|
| 409 |
+
- full_attention
|
| 410 |
+
- full_attention
|
| 411 |
+
- full_attention
|
| 412 |
+
- full_attention
|
| 413 |
+
- full_attention
|
| 414 |
+
- full_attention
|
| 415 |
+
- full_attention
|
| 416 |
+
- full_attention
|
| 417 |
+
- full_attention
|
| 418 |
+
- full_attention
|
| 419 |
+
- full_attention
|
| 420 |
+
- full_attention
|
| 421 |
+
- full_attention
|
| 422 |
+
- full_attention
|
| 423 |
+
- full_attention
|
| 424 |
+
- full_attention
|
| 425 |
+
- full_attention
|
| 426 |
+
- full_attention
|
| 427 |
+
- full_attention
|
| 428 |
+
- full_attention
|
| 429 |
+
- full_attention
|
| 430 |
+
learning_rate:
|
| 431 |
+
value: 5e-05
|
| 432 |
+
length_column_name:
|
| 433 |
+
value: length
|
| 434 |
+
liger_kernel_config:
|
| 435 |
+
value: null
|
| 436 |
+
load_best_model_at_end:
|
| 437 |
+
value: false
|
| 438 |
+
local_rank:
|
| 439 |
+
value: -1
|
| 440 |
+
log_level:
|
| 441 |
+
value: passive
|
| 442 |
+
log_level_replica:
|
| 443 |
+
value: warning
|
| 444 |
+
log_on_each_node:
|
| 445 |
+
value: true
|
| 446 |
+
logging_dir:
|
| 447 |
+
value: null
|
| 448 |
+
logging_first_step:
|
| 449 |
+
value: false
|
| 450 |
+
logging_nan_inf_filter:
|
| 451 |
+
value: true
|
| 452 |
+
logging_steps:
|
| 453 |
+
value: 1
|
| 454 |
+
logging_strategy:
|
| 455 |
+
value: steps
|
| 456 |
+
lr_scheduler_kwargs:
|
| 457 |
+
value: null
|
| 458 |
+
lr_scheduler_type:
|
| 459 |
+
value: cosine
|
| 460 |
+
master_addr:
|
| 461 |
+
value: null
|
| 462 |
+
master_port:
|
| 463 |
+
value: null
|
| 464 |
+
max_grad_norm:
|
| 465 |
+
value: 1
|
| 466 |
+
max_position_embeddings:
|
| 467 |
+
value: 32768
|
| 468 |
+
max_steps:
|
| 469 |
+
value: -1
|
| 470 |
+
max_window_layers:
|
| 471 |
+
value: 36
|
| 472 |
+
metric_for_best_model:
|
| 473 |
+
value: null
|
| 474 |
+
model/num_parameters:
|
| 475 |
+
value: 8234382336
|
| 476 |
+
model_args:
|
| 477 |
+
value:
|
| 478 |
+
adapter_folder: null
|
| 479 |
+
adapter_name_or_path: null
|
| 480 |
+
add_special_tokens: null
|
| 481 |
+
add_tokens: null
|
| 482 |
+
audio_sampling_rate: 16000
|
| 483 |
+
block_diag_attn: false
|
| 484 |
+
cache_dir: null
|
| 485 |
+
chunk_size: 8192
|
| 486 |
+
compute_dtype: torch.bfloat16
|
| 487 |
+
cpu_infer: 32
|
| 488 |
+
crop_to_patches: false
|
| 489 |
+
device_map:
|
| 490 |
+
"": cuda:0
|
| 491 |
+
disable_gradient_checkpointing: false
|
| 492 |
+
double_quantization: true
|
| 493 |
+
enable_liger_kernel: false
|
| 494 |
+
export_device: cpu
|
| 495 |
+
export_dir: null
|
| 496 |
+
export_hub_model_id: null
|
| 497 |
+
export_legacy_format: false
|
| 498 |
+
export_quantization_bit: null
|
| 499 |
+
export_quantization_dataset: null
|
| 500 |
+
export_quantization_maxlen: 1024
|
| 501 |
+
export_quantization_nsamples: 128
|
| 502 |
+
export_size: 5
|
| 503 |
+
flash_attn: auto
|
| 504 |
+
hf_hub_token: <HF_HUB_TOKEN>
|
| 505 |
+
image_do_pan_and_scan: false
|
| 506 |
+
image_max_pixels: 589824
|
| 507 |
+
image_min_pixels: 1024
|
| 508 |
+
infer_backend: HF
|
| 509 |
+
infer_dtype: auto
|
| 510 |
+
init_special_tokens: noise_init
|
| 511 |
+
kt_force_think: false
|
| 512 |
+
kt_maxlen: 4096
|
| 513 |
+
kt_mode: normal
|
| 514 |
+
kt_optimize_rule: null
|
| 515 |
+
kt_use_cuda_graph: true
|
| 516 |
+
low_cpu_mem_usage: true
|
| 517 |
+
mixture_of_depths: null
|
| 518 |
+
mode: normal
|
| 519 |
+
model_max_length: 2047
|
| 520 |
+
model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
|
| 521 |
+
model_revision: main
|
| 522 |
+
moe_aux_loss_coef: null
|
| 523 |
+
ms_hub_token: <MS_HUB_TOKEN>
|
| 524 |
+
new_special_tokens_config: null
|
| 525 |
+
offload_folder: offload
|
| 526 |
+
om_hub_token: <OM_HUB_TOKEN>
|
| 527 |
+
print_param_status: false
|
| 528 |
+
quantization_bit: null
|
| 529 |
+
quantization_device_map: null
|
| 530 |
+
quantization_method: BNB
|
| 531 |
+
quantization_type: nf4
|
| 532 |
+
resize_vocab: false
|
| 533 |
+
rope_scaling: null
|
| 534 |
+
sglang_config: null
|
| 535 |
+
sglang_lora_backend: triton
|
| 536 |
+
sglang_maxlen: 4096
|
| 537 |
+
sglang_mem_fraction: 0.7
|
| 538 |
+
sglang_tp_size: -1
|
| 539 |
+
shift_attn: false
|
| 540 |
+
split_special_tokens: false
|
| 541 |
+
train_from_scratch: false
|
| 542 |
+
trust_remote_code: true
|
| 543 |
+
upcast_layernorm: false
|
| 544 |
+
upcast_lmhead_output: false
|
| 545 |
+
use_audio_in_video: false
|
| 546 |
+
use_fast_tokenizer: true
|
| 547 |
+
use_kt: false
|
| 548 |
+
use_kv_cache: true
|
| 549 |
+
use_reentrant_gc: true
|
| 550 |
+
use_unsloth: false
|
| 551 |
+
use_unsloth_gc: false
|
| 552 |
+
use_v1_kernels: false
|
| 553 |
+
video_fps: 2
|
| 554 |
+
video_max_pixels: 65536
|
| 555 |
+
video_maxlen: 128
|
| 556 |
+
video_min_pixels: 256
|
| 557 |
+
vllm_config: null
|
| 558 |
+
vllm_enforce_eager: false
|
| 559 |
+
vllm_gpu_util: 0.7
|
| 560 |
+
vllm_max_lora_rank: 32
|
| 561 |
+
vllm_maxlen: 4096
|
| 562 |
+
model_type:
|
| 563 |
+
value: qwen3
|
| 564 |
+
neftune_noise_alpha:
|
| 565 |
+
value: null
|
| 566 |
+
num_attention_heads:
|
| 567 |
+
value: 32
|
| 568 |
+
num_hidden_layers:
|
| 569 |
+
value: 36
|
| 570 |
+
num_key_value_heads:
|
| 571 |
+
value: 8
|
| 572 |
+
num_train_epochs:
|
| 573 |
+
value: 5
|
| 574 |
+
optim:
|
| 575 |
+
value: adamw_torch
|
| 576 |
+
optim_args:
|
| 577 |
+
value: null
|
| 578 |
+
optim_target_modules:
|
| 579 |
+
value: null
|
| 580 |
+
output_attentions:
|
| 581 |
+
value: false
|
| 582 |
+
output_dir:
|
| 583 |
+
value: /workspace/v127rc_exp1/B_dup
|
| 584 |
+
output_hidden_states:
|
| 585 |
+
value: false
|
| 586 |
+
overwrite_output_dir:
|
| 587 |
+
value: false
|
| 588 |
+
pad_token_id:
|
| 589 |
+
value: 151643
|
| 590 |
+
parallelism_config:
|
| 591 |
+
value: null
|
| 592 |
+
peft_config:
|
| 593 |
+
value:
|
| 594 |
+
default:
|
| 595 |
+
alora_invocation_tokens: null
|
| 596 |
+
arrow_config: null
|
| 597 |
+
auto_mapping: null
|
| 598 |
+
base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
|
| 599 |
+
bias: none
|
| 600 |
+
corda_config: null
|
| 601 |
+
ensure_weight_tying: false
|
| 602 |
+
eva_config: null
|
| 603 |
+
exclude_modules: null
|
| 604 |
+
fan_in_fan_out: false
|
| 605 |
+
inference_mode: false
|
| 606 |
+
init_lora_weights: true
|
| 607 |
+
layer_replication: null
|
| 608 |
+
layers_pattern: null
|
| 609 |
+
layers_to_transform: null
|
| 610 |
+
lora_alpha: 32
|
| 611 |
+
lora_bias: false
|
| 612 |
+
lora_dropout: 0.03
|
| 613 |
+
megatron_config: null
|
| 614 |
+
megatron_core: megatron.core
|
| 615 |
+
modules_to_save: null
|
| 616 |
+
peft_type: LORA
|
| 617 |
+
peft_version: 0.18.1
|
| 618 |
+
qalora_group_size: 16
|
| 619 |
+
r: 16
|
| 620 |
+
revision: null
|
| 621 |
+
runtime_config:
|
| 622 |
+
ephemeral_gpu_offload: false
|
| 623 |
+
target_modules:
|
| 624 |
+
- o_proj
|
| 625 |
+
- gate_proj
|
| 626 |
+
- k_proj
|
| 627 |
+
- up_proj
|
| 628 |
+
- v_proj
|
| 629 |
+
- q_proj
|
| 630 |
+
- down_proj
|
| 631 |
+
target_parameters: null
|
| 632 |
+
task_type: CAUSAL_LM
|
| 633 |
+
trainable_token_indices: null
|
| 634 |
+
use_dora: false
|
| 635 |
+
use_qalora: false
|
| 636 |
+
use_rslora: false
|
| 637 |
+
per_device_eval_batch_size:
|
| 638 |
+
value: 8
|
| 639 |
+
per_device_train_batch_size:
|
| 640 |
+
value: 1
|
| 641 |
+
predict_with_generate:
|
| 642 |
+
value: false
|
| 643 |
+
prediction_loss_only:
|
| 644 |
+
value: false
|
| 645 |
+
problem_type:
|
| 646 |
+
value: null
|
| 647 |
+
project:
|
| 648 |
+
value: huggingface
|
| 649 |
+
push_to_hub:
|
| 650 |
+
value: false
|
| 651 |
+
ray_init_kwargs:
|
| 652 |
+
value: null
|
| 653 |
+
ray_num_workers:
|
| 654 |
+
value: 1
|
| 655 |
+
remove_unused_columns:
|
| 656 |
+
value: false
|
| 657 |
+
report_to:
|
| 658 |
+
value:
|
| 659 |
+
- wandb
|
| 660 |
+
restore_callback_states_from_checkpoint:
|
| 661 |
+
value: false
|
| 662 |
+
resume_from_checkpoint:
|
| 663 |
+
value: null
|
| 664 |
+
return_dict:
|
| 665 |
+
value: true
|
| 666 |
+
rms_norm_eps:
|
| 667 |
+
value: 1e-06
|
| 668 |
+
rope_parameters:
|
| 669 |
+
value:
|
| 670 |
+
rope_theta: 1000000
|
| 671 |
+
rope_type: default
|
| 672 |
+
run_name:
|
| 673 |
+
value: null
|
| 674 |
+
save_on_each_node:
|
| 675 |
+
value: false
|
| 676 |
+
save_only_model:
|
| 677 |
+
value: true
|
| 678 |
+
save_steps:
|
| 679 |
+
value: 1000
|
| 680 |
+
save_strategy:
|
| 681 |
+
value: steps
|
| 682 |
+
save_total_limit:
|
| 683 |
+
value: null
|
| 684 |
+
seed:
|
| 685 |
+
value: 42
|
| 686 |
+
skip_memory_metrics:
|
| 687 |
+
value: true
|
| 688 |
+
sliding_window:
|
| 689 |
+
value: null
|
| 690 |
+
sortish_sampler:
|
| 691 |
+
value: false
|
| 692 |
+
tf32:
|
| 693 |
+
value: null
|
| 694 |
+
tie_word_embeddings:
|
| 695 |
+
value: false
|
| 696 |
+
torch_compile:
|
| 697 |
+
value: false
|
| 698 |
+
torch_compile_backend:
|
| 699 |
+
value: null
|
| 700 |
+
torch_compile_mode:
|
| 701 |
+
value: null
|
| 702 |
+
torch_empty_cache_steps:
|
| 703 |
+
value: null
|
| 704 |
+
trackio_space_id:
|
| 705 |
+
value: trackio
|
| 706 |
+
transformers_version:
|
| 707 |
+
value: 5.0.0
|
| 708 |
+
use_cache:
|
| 709 |
+
value: false
|
| 710 |
+
use_cpu:
|
| 711 |
+
value: false
|
| 712 |
+
use_liger_kernel:
|
| 713 |
+
value: false
|
| 714 |
+
use_sliding_window:
|
| 715 |
+
value: false
|
| 716 |
+
vocab_size:
|
| 717 |
+
value: 151936
|
| 718 |
+
warmup_ratio:
|
| 719 |
+
value: 0.02
|
| 720 |
+
warmup_steps:
|
| 721 |
+
value: 0.02
|
| 722 |
+
weight_decay:
|
| 723 |
+
value: 0
|
LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/requirements.txt
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pytz==2025.2
|
| 2 |
+
pydub==0.25.1
|
| 3 |
+
brotli==1.2.0
|
| 4 |
+
antlr4-python3-runtime==4.9.3
|
| 5 |
+
xxhash==3.6.0
|
| 6 |
+
websockets==15.0.1
|
| 7 |
+
tzdata==2025.3
|
| 8 |
+
typing_extensions==4.15.0
|
| 9 |
+
tqdm==4.67.3
|
| 10 |
+
tomlkit==0.13.3
|
| 11 |
+
termcolor==3.3.0
|
| 12 |
+
shtab==1.8.0
|
| 13 |
+
shellingham==1.5.4
|
| 14 |
+
sentencepiece==0.2.1
|
| 15 |
+
semantic-version==2.10.0
|
| 16 |
+
safetensors==0.7.0
|
| 17 |
+
ruff==0.15.0
|
| 18 |
+
regex==2026.1.15
|
| 19 |
+
python-multipart==0.0.22
|
| 20 |
+
pyparsing==3.3.2
|
| 21 |
+
pyarrow==23.0.0
|
| 22 |
+
protobuf==6.33.5
|
| 23 |
+
propcache==0.4.1
|
| 24 |
+
orjson==3.11.7
|
| 25 |
+
omegaconf==2.3.0
|
| 26 |
+
numpy==2.4.2
|
| 27 |
+
multidict==6.7.1
|
| 28 |
+
mdurl==0.1.2
|
| 29 |
+
kiwisolver==1.4.9
|
| 30 |
+
hf-xet==1.2.0
|
| 31 |
+
hf_transfer==0.1.9
|
| 32 |
+
groovy==0.1.2
|
| 33 |
+
frozenlist==1.8.0
|
| 34 |
+
fonttools==4.61.1
|
| 35 |
+
ffmpy==1.0.0
|
| 36 |
+
einops==0.8.2
|
| 37 |
+
docstring_parser==0.17.0
|
| 38 |
+
dill==0.3.8
|
| 39 |
+
cycler==0.12.1
|
| 40 |
+
click==8.3.1
|
| 41 |
+
av==16.0.0
|
| 42 |
+
annotated-types==0.7.0
|
| 43 |
+
annotated-doc==0.0.4
|
| 44 |
+
aiohappyeyeballs==2.6.1
|
| 45 |
+
aiofiles==24.1.0
|
| 46 |
+
yarl==1.22.0
|
| 47 |
+
uvicorn==0.40.0
|
| 48 |
+
typing-inspection==0.4.2
|
| 49 |
+
typer-slim==0.21.1
|
| 50 |
+
tiktoken==0.12.0
|
| 51 |
+
scipy==1.17.0
|
| 52 |
+
pydantic_core==2.41.4
|
| 53 |
+
pandas==2.3.3
|
| 54 |
+
multiprocess==0.70.16
|
| 55 |
+
modelscope==1.34.0
|
| 56 |
+
markdown-it-py==4.0.0
|
| 57 |
+
fire==0.7.1
|
| 58 |
+
contourpy==1.3.3
|
| 59 |
+
anyio==4.12.1
|
| 60 |
+
aiosignal==1.4.0
|
| 61 |
+
starlette==0.50.0
|
| 62 |
+
rich==14.3.2
|
| 63 |
+
pydantic==2.12.3
|
| 64 |
+
matplotlib==3.10.8
|
| 65 |
+
aiohttp==3.13.3
|
| 66 |
+
tyro==0.8.14
|
| 67 |
+
typer==0.21.1
|
| 68 |
+
torchdata==0.11.0
|
| 69 |
+
sse-starlette==3.2.0
|
| 70 |
+
safehttpx==0.1.7
|
| 71 |
+
huggingface_hub==1.3.7
|
| 72 |
+
fastapi==0.128.0
|
| 73 |
+
tokenizers==0.22.2
|
| 74 |
+
gradio_client==1.14.0
|
| 75 |
+
datasets==4.0.0
|
| 76 |
+
accelerate==1.11.0
|
| 77 |
+
transformers==5.0.0
|
| 78 |
+
gradio==5.50.0
|
| 79 |
+
trl==0.24.0
|
| 80 |
+
peft==0.18.1
|
| 81 |
+
llamafactory==0.9.5.dev0
|
| 82 |
+
jieba==0.42.1
|
| 83 |
+
rouge-chinese==1.0.3
|
| 84 |
+
joblib==1.5.3
|
| 85 |
+
nltk==3.9.2
|
| 86 |
+
py-cpuinfo==9.0.0
|
| 87 |
+
nvidia-ml-py==13.590.48
|
| 88 |
+
hjson==3.1.0
|
| 89 |
+
ninja==1.13.0
|
| 90 |
+
msgpack==1.1.2
|
| 91 |
+
deepspeed==0.16.9
|
| 92 |
+
smmap==5.0.2
|
| 93 |
+
sentry-sdk==2.51.0
|
| 94 |
+
gitdb==4.0.12
|
| 95 |
+
GitPython==3.1.46
|
| 96 |
+
wandb==0.24.1
|
| 97 |
+
entrypoints==0.4
|
| 98 |
+
jupyter_client==7.4.9
|
| 99 |
+
nbclassic==1.1.0
|
| 100 |
+
notebook==6.5.5
|
| 101 |
+
pyzmq==24.0.1
|
| 102 |
+
PyYAML==6.0.2
|
| 103 |
+
Send2Trash==1.8.3
|
| 104 |
+
argon2-cffi==23.1.0
|
| 105 |
+
argon2-cffi-bindings==21.2.0
|
| 106 |
+
arrow==1.3.0
|
| 107 |
+
asttokens==2.4.1
|
| 108 |
+
async-lru==2.0.4
|
| 109 |
+
attrs==24.2.0
|
| 110 |
+
babel==2.16.0
|
| 111 |
+
beautifulsoup4==4.12.3
|
| 112 |
+
bleach==6.1.0
|
| 113 |
+
certifi==2024.8.30
|
| 114 |
+
cffi==1.17.1
|
| 115 |
+
charset-normalizer==3.3.2
|
| 116 |
+
comm==0.2.2
|
| 117 |
+
debugpy==1.8.5
|
| 118 |
+
decorator==5.1.1
|
| 119 |
+
defusedxml==0.7.1
|
| 120 |
+
executing==2.1.0
|
| 121 |
+
fastjsonschema==2.20.0
|
| 122 |
+
fqdn==1.5.1
|
| 123 |
+
h11==0.14.0
|
| 124 |
+
httpcore==1.0.5
|
| 125 |
+
httpx==0.27.2
|
| 126 |
+
idna==3.10
|
| 127 |
+
ipykernel==6.29.5
|
| 128 |
+
ipython==8.27.0
|
| 129 |
+
ipython-genutils==0.2.0
|
| 130 |
+
ipywidgets==8.1.5
|
| 131 |
+
isoduration==20.11.0
|
| 132 |
+
jedi==0.19.1
|
| 133 |
+
json5==0.9.25
|
| 134 |
+
jsonpointer==3.0.0
|
| 135 |
+
jsonschema==4.23.0
|
| 136 |
+
jsonschema-specifications==2023.12.1
|
| 137 |
+
jupyter-archive==3.4.0
|
| 138 |
+
jupyter_contrib_core==0.4.2
|
| 139 |
+
jupyter_contrib_nbextensions==0.7.0
|
| 140 |
+
jupyter_core==5.7.2
|
| 141 |
+
jupyter-events==0.10.0
|
| 142 |
+
jupyter-highlight-selected-word==0.2.0
|
| 143 |
+
jupyter-lsp==2.2.5
|
| 144 |
+
jupyter_nbextensions_configurator==0.6.4
|
| 145 |
+
jupyter_server==2.14.2
|
| 146 |
+
jupyter_server_terminals==0.5.3
|
| 147 |
+
jupyterlab==4.2.5
|
| 148 |
+
jupyterlab_pygments==0.3.0
|
| 149 |
+
jupyterlab_server==2.27.3
|
| 150 |
+
jupyterlab_widgets==3.0.13
|
| 151 |
+
lxml==5.3.0
|
| 152 |
+
matplotlib-inline==0.1.7
|
| 153 |
+
mistune==3.0.2
|
| 154 |
+
nbclient==0.10.0
|
| 155 |
+
nbconvert==7.16.4
|
| 156 |
+
nbformat==5.10.4
|
| 157 |
+
nest-asyncio==1.6.0
|
| 158 |
+
notebook_shim==0.2.4
|
| 159 |
+
overrides==7.7.0
|
| 160 |
+
packaging==24.1
|
| 161 |
+
pandocfilters==1.5.1
|
| 162 |
+
parso==0.8.4
|
| 163 |
+
pexpect==4.9.0
|
| 164 |
+
platformdirs==4.3.6
|
| 165 |
+
prometheus_client==0.21.0
|
| 166 |
+
prompt_toolkit==3.0.47
|
| 167 |
+
psutil==6.0.0
|
| 168 |
+
ptyprocess==0.7.0
|
| 169 |
+
pure_eval==0.2.3
|
| 170 |
+
pycparser==2.22
|
| 171 |
+
Pygments==2.18.0
|
| 172 |
+
python-dateutil==2.9.0.post0
|
| 173 |
+
python-json-logger==2.0.7
|
| 174 |
+
referencing==0.35.1
|
| 175 |
+
requests==2.32.3
|
| 176 |
+
rfc3339-validator==0.1.4
|
| 177 |
+
rfc3986-validator==0.1.1
|
| 178 |
+
rpds-py==0.20.0
|
| 179 |
+
sniffio==1.3.1
|
| 180 |
+
soupsieve==2.6
|
| 181 |
+
stack-data==0.6.3
|
| 182 |
+
terminado==0.18.1
|
| 183 |
+
tinycss2==1.3.0
|
| 184 |
+
tornado==6.4.1
|
| 185 |
+
traitlets==5.14.3
|
| 186 |
+
types-python-dateutil==2.9.0.20240906
|
| 187 |
+
uri-template==1.3.0
|
| 188 |
+
urllib3==2.2.3
|
| 189 |
+
wcwidth==0.2.13
|
| 190 |
+
webcolors==24.8.0
|
| 191 |
+
webencodings==0.5.1
|
| 192 |
+
websocket-client==1.8.0
|
| 193 |
+
widgetsnbextension==4.0.13
|
| 194 |
+
Jinja2==3.1.3
|
| 195 |
+
MarkupSafe==2.1.5
|
| 196 |
+
filelock==3.13.1
|
| 197 |
+
fsspec==2024.2.0
|
| 198 |
+
mpmath==1.3.0
|
| 199 |
+
networkx==3.2.1
|
| 200 |
+
nvidia-cublas-cu12==12.4.2.65
|
| 201 |
+
nvidia-cuda-cupti-cu12==12.4.99
|
| 202 |
+
nvidia-cuda-nvrtc-cu12==12.4.99
|
| 203 |
+
nvidia-cuda-runtime-cu12==12.4.99
|
| 204 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 205 |
+
nvidia-cufft-cu12==11.2.0.44
|
| 206 |
+
nvidia-curand-cu12==10.3.5.119
|
| 207 |
+
nvidia-cusolver-cu12==11.6.0.99
|
| 208 |
+
nvidia-cusparse-cu12==12.3.0.142
|
| 209 |
+
nvidia-nccl-cu12==2.20.5
|
| 210 |
+
nvidia-nvjitlink-cu12==12.4.99
|
| 211 |
+
nvidia-nvtx-cu12==12.4.99
|
| 212 |
+
pillow==10.2.0
|
| 213 |
+
sympy==1.12
|
| 214 |
+
torch==2.4.1+cu124
|
| 215 |
+
torchaudio==2.4.1+cu124
|
| 216 |
+
torchvision==0.19.1+cu124
|
| 217 |
+
triton==3.0.0
|
| 218 |
+
pip==24.2
|
| 219 |
+
setuptools==75.1.0
|
| 220 |
+
wheel==0.44.0
|
| 221 |
+
PyGObject==3.42.1
|
| 222 |
+
PyJWT==2.3.0
|
| 223 |
+
SecretStorage==3.3.1
|
| 224 |
+
blinker==1.4
|
| 225 |
+
cryptography==3.4.8
|
| 226 |
+
dbus-python==1.2.18
|
| 227 |
+
distro==1.7.0
|
| 228 |
+
httplib2==0.20.2
|
| 229 |
+
importlib-metadata==4.6.4
|
| 230 |
+
jeepney==0.7.1
|
| 231 |
+
keyring==23.5.0
|
| 232 |
+
launchpadlib==1.10.16
|
| 233 |
+
lazr.restfulclient==0.14.4
|
| 234 |
+
lazr.uri==1.0.6
|
| 235 |
+
more-itertools==8.10.0
|
| 236 |
+
oauthlib==3.2.0
|
| 237 |
+
python-apt==2.4.0+ubuntu4
|
| 238 |
+
six==1.16.0
|
| 239 |
+
wadllib==1.3.6
|
| 240 |
+
zipp==1.0.0
|
| 241 |
+
autocommand==2.2.2
|
| 242 |
+
backports.tarfile==1.2.0
|
| 243 |
+
importlib_metadata==8.0.0
|
| 244 |
+
importlib_resources==6.4.0
|
| 245 |
+
inflect==7.3.1
|
| 246 |
+
jaraco.collections==5.1.0
|
| 247 |
+
jaraco.context==5.3.0
|
| 248 |
+
jaraco.functools==4.0.1
|
| 249 |
+
jaraco.text==3.12.1
|
| 250 |
+
more-itertools==10.3.0
|
| 251 |
+
packaging==24.1
|
| 252 |
+
platformdirs==4.2.2
|
| 253 |
+
tomli==2.0.1
|
| 254 |
+
typeguard==4.3.0
|
| 255 |
+
typing_extensions==4.12.2
|
| 256 |
+
wheel==0.43.0
|
| 257 |
+
zipp==3.19.2
|
LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-6.8.0-52-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.10",
|
| 4 |
+
"startedAt": "2026-02-04T08:35:48.570855Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"/workspace/v127rc_exp1/B_dup.yaml"
|
| 7 |
+
],
|
| 8 |
+
"program": "/usr/local/bin/llamafactory-cli",
|
| 9 |
+
"git": {
|
| 10 |
+
"remote": "https://github.com/hiyouga/LlamaFactory.git",
|
| 11 |
+
"commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
|
| 12 |
+
},
|
| 13 |
+
"email": "markmochi200@gmail.com",
|
| 14 |
+
"root": "/workspace/LlamaFactory",
|
| 15 |
+
"host": "e5c6872797ac",
|
| 16 |
+
"executable": "/usr/bin/python",
|
| 17 |
+
"cpu_count": 16,
|
| 18 |
+
"cpu_count_logical": 32,
|
| 19 |
+
"gpu": "NVIDIA GeForce RTX 4090",
|
| 20 |
+
"gpu_count": 1,
|
| 21 |
+
"disk": {
|
| 22 |
+
"/": {
|
| 23 |
+
"total": "21474836480",
|
| 24 |
+
"used": "2193969152"
|
| 25 |
+
}
|
| 26 |
+
},
|
| 27 |
+
"memory": {
|
| 28 |
+
"total": "201701502976"
|
| 29 |
+
},
|
| 30 |
+
"gpu_nvidia": [
|
| 31 |
+
{
|
| 32 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 33 |
+
"memoryTotal": "25757220864",
|
| 34 |
+
"cudaCores": 16384,
|
| 35 |
+
"architecture": "Ada",
|
| 36 |
+
"uuid": "GPU-1c2ea8ac-6c6f-58d4-0df9-20a74e0985f1"
|
| 37 |
+
}
|
| 38 |
+
],
|
| 39 |
+
"cudaVersion": "12.7",
|
| 40 |
+
"writerId": "dq2kg12neczzbdsqmciypnior6fee84h"
|
| 41 |
+
}
|
LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"train/grad_norm":0.2597666084766388,"_step":73480,"train_samples_per_second":0.975,"_runtime":75384,"train/epoch":5,"_wandb":{"runtime":75384},"train/num_input_tokens_seen":150413560,"train/train_tokens_per_second":1995.358,"train/loss":0.014940977096557617,"train_steps_per_second":0.975,"_timestamp":1.7702695315018873e+09,"total_flos":6.869735474541773e+18,"train/learning_rate":2.379162700183457e-14,"train_loss":0.08730816244039097,"train_runtime":75383.3694,"train/global_step":73480}
|
LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-02-04T08:35:48.826256258Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
|
| 2 |
+
{"time":"2026-02-04T08:35:49.141746844Z","level":"INFO","msg":"stream: created new stream","id":"pwixiyan"}
|
| 3 |
+
{"time":"2026-02-04T08:35:49.142115089Z","level":"INFO","msg":"handler: started","stream_id":"pwixiyan"}
|
| 4 |
+
{"time":"2026-02-04T08:35:49.143583725Z","level":"INFO","msg":"stream: started","id":"pwixiyan"}
|
| 5 |
+
{"time":"2026-02-04T08:35:49.143601157Z","level":"INFO","msg":"writer: started","stream_id":"pwixiyan"}
|
| 6 |
+
{"time":"2026-02-04T08:35:49.14359757Z","level":"INFO","msg":"sender: started","stream_id":"pwixiyan"}
|
| 7 |
+
{"time":"2026-02-04T17:47:19.818024452Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pwixiyan/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
|
| 8 |
+
{"time":"2026-02-04T18:31:07.413320842Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pwixiyan/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
|
| 9 |
+
{"time":"2026-02-04T22:59:10.135922468Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pwixiyan/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
|
| 10 |
+
{"time":"2026-02-05T05:32:13.77134292Z","level":"INFO","msg":"stream: closing","id":"pwixiyan"}
|
| 11 |
+
{"time":"2026-02-05T05:32:15.653703901Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 12 |
+
{"time":"2026-02-05T05:32:15.875179968Z","level":"INFO","msg":"handler: closed","stream_id":"pwixiyan"}
|
| 13 |
+
{"time":"2026-02-05T05:32:15.87824593Z","level":"INFO","msg":"sender: closed","stream_id":"pwixiyan"}
|
| 14 |
+
{"time":"2026-02-05T05:32:15.878535169Z","level":"INFO","msg":"stream: closed","id":"pwixiyan"}
|
LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug.log
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-02-04 08:35:48,588 INFO MainThread:3069 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1
|
| 2 |
+
2026-02-04 08:35:48,588 INFO MainThread:3069 [wandb_setup.py:_flush():81] Configure stats pid to 3069
|
| 3 |
+
2026-02-04 08:35:48,589 INFO MainThread:3069 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 4 |
+
2026-02-04 08:35:48,589 INFO MainThread:3069 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug.log
|
| 5 |
+
2026-02-04 08:35:48,590 INFO MainThread:3069 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug-internal.log
|
| 6 |
+
2026-02-04 08:35:48,591 INFO MainThread:3069 [wandb_init.py:init():844] calling init triggers
|
| 7 |
+
2026-02-04 08:35:48,591 INFO MainThread:3069 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
|
| 8 |
+
config: {'_wandb': {}}
|
| 9 |
+
2026-02-04 08:35:48,591 INFO MainThread:3069 [wandb_init.py:init():892] starting backend
|
| 10 |
+
2026-02-04 08:35:48,817 INFO MainThread:3069 [wandb_init.py:init():895] sending inform_init request
|
| 11 |
+
2026-02-04 08:35:48,824 INFO MainThread:3069 [wandb_init.py:init():903] backend started and connected
|
| 12 |
+
2026-02-04 08:35:48,825 INFO MainThread:3069 [wandb_init.py:init():973] updated telemetry
|
| 13 |
+
2026-02-04 08:35:48,867 INFO MainThread:3069 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
|
| 14 |
+
2026-02-04 08:35:49,594 INFO MainThread:3069 [wandb_init.py:init():1042] starting run threads in backend
|
| 15 |
+
2026-02-04 08:35:49,662 INFO MainThread:3069 [wandb_run.py:_console_start():2529] atexit reg
|
| 16 |
+
2026-02-04 08:35:49,662 INFO MainThread:3069 [wandb_run.py:_redirect():2377] redirect: wrap_raw
|
| 17 |
+
2026-02-04 08:35:49,662 INFO MainThread:3069 [wandb_run.py:_redirect():2446] Wrapping output streams.
|
| 18 |
+
2026-02-04 08:35:49,663 INFO MainThread:3069 [wandb_run.py:_redirect():2469] Redirects installed.
|
| 19 |
+
2026-02-04 08:35:49,664 INFO MainThread:3069 [wandb_init.py:init():1082] run started, returning control to user process
|
| 20 |
+
2026-02-04 08:35:49,666 INFO MainThread:3069 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['o_proj', 'gate_proj', 'k_proj', 'up_proj', 'v_proj', 'q_proj', 'down_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/B_dup', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
|
| 21 |
+
2026-02-04 08:35:49,672 INFO MainThread:3069 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x740002ab08d0>>
|
| 22 |
+
2026-02-04 08:35:49,672 INFO MainThread:3069 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
|
| 23 |
+
2026-02-04 08:35:49,674 INFO MainThread:3069 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d35_r286'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
|
| 24 |
+
2026-02-05 05:32:13,771 INFO wandb-AsyncioManager-main:3069 [service_client.py:_forward_responses():94] Reached EOF.
|
| 25 |
+
2026-02-05 05:32:13,771 INFO wandb-AsyncioManager-main:3069 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.
|
LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/config.yaml
ADDED
|
@@ -0,0 +1,723 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_name_or_path:
|
| 2 |
+
value: /workspace/Qwen/Qwen3-8B-Base
|
| 3 |
+
_wandb:
|
| 4 |
+
value:
|
| 5 |
+
cli_version: 0.24.1
|
| 6 |
+
e:
|
| 7 |
+
ymezb35dmjxj99q0ikd0taef6he5rsbn:
|
| 8 |
+
args:
|
| 9 |
+
- /workspace/v127rc_exp1/D_dup.yaml
|
| 10 |
+
cpu_count: 24
|
| 11 |
+
cpu_count_logical: 48
|
| 12 |
+
cudaVersion: "12.8"
|
| 13 |
+
disk:
|
| 14 |
+
/:
|
| 15 |
+
total: "21474836480"
|
| 16 |
+
used: "2203967488"
|
| 17 |
+
email: markmochi200@gmail.com
|
| 18 |
+
executable: /usr/bin/python
|
| 19 |
+
git:
|
| 20 |
+
commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
|
| 21 |
+
remote: https://github.com/hiyouga/LlamaFactory.git
|
| 22 |
+
gpu: NVIDIA GeForce RTX 4090
|
| 23 |
+
gpu_count: 1
|
| 24 |
+
gpu_nvidia:
|
| 25 |
+
- architecture: Ada
|
| 26 |
+
cudaCores: 16384
|
| 27 |
+
memoryTotal: "25757220864"
|
| 28 |
+
name: NVIDIA GeForce RTX 4090
|
| 29 |
+
uuid: GPU-64f7ee9c-3f46-4f01-74c0-f57a6e56968a
|
| 30 |
+
host: 313b3f58db2c
|
| 31 |
+
memory:
|
| 32 |
+
total: "270100414464"
|
| 33 |
+
os: Linux-6.8.0-78-generic-x86_64-with-glibc2.35
|
| 34 |
+
program: /usr/local/bin/llamafactory-cli
|
| 35 |
+
python: CPython 3.11.10
|
| 36 |
+
root: /workspace/LlamaFactory
|
| 37 |
+
startedAt: "2026-02-04T08:56:16.046521Z"
|
| 38 |
+
writerId: ymezb35dmjxj99q0ikd0taef6he5rsbn
|
| 39 |
+
m:
|
| 40 |
+
- "1": train/global_step
|
| 41 |
+
"6":
|
| 42 |
+
- 3
|
| 43 |
+
"7": []
|
| 44 |
+
- "2": '*'
|
| 45 |
+
"5": 1
|
| 46 |
+
"6":
|
| 47 |
+
- 1
|
| 48 |
+
"7": []
|
| 49 |
+
python_version: 3.11.10
|
| 50 |
+
t:
|
| 51 |
+
"1":
|
| 52 |
+
- 1
|
| 53 |
+
- 11
|
| 54 |
+
- 41
|
| 55 |
+
- 49
|
| 56 |
+
- 51
|
| 57 |
+
- 71
|
| 58 |
+
- 84
|
| 59 |
+
- 98
|
| 60 |
+
- 105
|
| 61 |
+
"2":
|
| 62 |
+
- 1
|
| 63 |
+
- 11
|
| 64 |
+
- 41
|
| 65 |
+
- 49
|
| 66 |
+
- 51
|
| 67 |
+
- 71
|
| 68 |
+
- 84
|
| 69 |
+
- 98
|
| 70 |
+
- 105
|
| 71 |
+
"3":
|
| 72 |
+
- 7
|
| 73 |
+
- 19
|
| 74 |
+
- 62
|
| 75 |
+
- 66
|
| 76 |
+
"4": 3.11.10
|
| 77 |
+
"5": 0.24.1
|
| 78 |
+
"6": 5.0.0
|
| 79 |
+
"9":
|
| 80 |
+
"1": transformers_trainer
|
| 81 |
+
"12": 0.24.1
|
| 82 |
+
"13": linux-x86_64
|
| 83 |
+
accelerator_config:
|
| 84 |
+
value:
|
| 85 |
+
dispatch_batches: null
|
| 86 |
+
even_batches: true
|
| 87 |
+
gradient_accumulation_kwargs: null
|
| 88 |
+
non_blocking: false
|
| 89 |
+
split_batches: false
|
| 90 |
+
use_seedable_sampler: true
|
| 91 |
+
adam_beta1:
|
| 92 |
+
value: 0.9
|
| 93 |
+
adam_beta2:
|
| 94 |
+
value: 0.95
|
| 95 |
+
adam_epsilon:
|
| 96 |
+
value: 1e-08
|
| 97 |
+
architectures:
|
| 98 |
+
value:
|
| 99 |
+
- Qwen3ForCausalLM
|
| 100 |
+
attention_bias:
|
| 101 |
+
value: false
|
| 102 |
+
attention_dropout:
|
| 103 |
+
value: 0
|
| 104 |
+
auto_find_batch_size:
|
| 105 |
+
value: false
|
| 106 |
+
average_tokens_across_devices:
|
| 107 |
+
value: true
|
| 108 |
+
batch_eval_metrics:
|
| 109 |
+
value: false
|
| 110 |
+
bf16:
|
| 111 |
+
value: true
|
| 112 |
+
bf16_full_eval:
|
| 113 |
+
value: false
|
| 114 |
+
bos_token_id:
|
| 115 |
+
value: null
|
| 116 |
+
chunk_size_feed_forward:
|
| 117 |
+
value: 0
|
| 118 |
+
data_args:
|
| 119 |
+
value:
|
| 120 |
+
buffer_size: 16384
|
| 121 |
+
cutoff_len: 2047
|
| 122 |
+
data_shared_file_system: false
|
| 123 |
+
dataset:
|
| 124 |
+
- Markie_Voss_t0_d100_r101
|
| 125 |
+
dataset_dir: /workspace/LlamaFactory/data
|
| 126 |
+
default_system: null
|
| 127 |
+
enable_thinking: false
|
| 128 |
+
eval_dataset: null
|
| 129 |
+
eval_num_beams: null
|
| 130 |
+
eval_on_each_dataset: false
|
| 131 |
+
ignore_pad_token_for_loss: true
|
| 132 |
+
interleave_probs: null
|
| 133 |
+
mask_history: false
|
| 134 |
+
max_samples: 100000000
|
| 135 |
+
media_dir: /workspace/LlamaFactory/data
|
| 136 |
+
mix_strategy: concat
|
| 137 |
+
neat_packing: false
|
| 138 |
+
overwrite_cache: false
|
| 139 |
+
packing: true
|
| 140 |
+
preprocessing_batch_size: 1000
|
| 141 |
+
preprocessing_num_workers: 16
|
| 142 |
+
streaming: false
|
| 143 |
+
template: qwen3_nothink
|
| 144 |
+
tokenized_path: null
|
| 145 |
+
tool_format: null
|
| 146 |
+
train_on_prompt: false
|
| 147 |
+
val_size: 0
|
| 148 |
+
data_seed:
|
| 149 |
+
value: null
|
| 150 |
+
dataloader_drop_last:
|
| 151 |
+
value: false
|
| 152 |
+
dataloader_num_workers:
|
| 153 |
+
value: 0
|
| 154 |
+
dataloader_persistent_workers:
|
| 155 |
+
value: false
|
| 156 |
+
dataloader_pin_memory:
|
| 157 |
+
value: true
|
| 158 |
+
dataloader_prefetch_factor:
|
| 159 |
+
value: null
|
| 160 |
+
ddp_backend:
|
| 161 |
+
value: null
|
| 162 |
+
ddp_broadcast_buffers:
|
| 163 |
+
value: null
|
| 164 |
+
ddp_bucket_cap_mb:
|
| 165 |
+
value: null
|
| 166 |
+
ddp_find_unused_parameters:
|
| 167 |
+
value: null
|
| 168 |
+
ddp_timeout:
|
| 169 |
+
value: 180000000
|
| 170 |
+
debug:
|
| 171 |
+
value: []
|
| 172 |
+
deepspeed:
|
| 173 |
+
value: null
|
| 174 |
+
disable_tqdm:
|
| 175 |
+
value: false
|
| 176 |
+
do_eval:
|
| 177 |
+
value: false
|
| 178 |
+
do_predict:
|
| 179 |
+
value: false
|
| 180 |
+
do_train:
|
| 181 |
+
value: true
|
| 182 |
+
dtype:
|
| 183 |
+
value: bfloat16
|
| 184 |
+
enable_jit_checkpoint:
|
| 185 |
+
value: false
|
| 186 |
+
eos_token_id:
|
| 187 |
+
value: 151645
|
| 188 |
+
eval_accumulation_steps:
|
| 189 |
+
value: null
|
| 190 |
+
eval_delay:
|
| 191 |
+
value: 0
|
| 192 |
+
eval_do_concat_batches:
|
| 193 |
+
value: true
|
| 194 |
+
eval_on_start:
|
| 195 |
+
value: false
|
| 196 |
+
eval_steps:
|
| 197 |
+
value: null
|
| 198 |
+
eval_strategy:
|
| 199 |
+
value: "no"
|
| 200 |
+
eval_use_gather_object:
|
| 201 |
+
value: false
|
| 202 |
+
finetuning_args:
|
| 203 |
+
value:
|
| 204 |
+
additional_target: null
|
| 205 |
+
apollo_layerwise: false
|
| 206 |
+
apollo_proj: random
|
| 207 |
+
apollo_proj_type: std
|
| 208 |
+
apollo_rank: 16
|
| 209 |
+
apollo_scale: 32
|
| 210 |
+
apollo_scale_front: false
|
| 211 |
+
apollo_scale_type: channel
|
| 212 |
+
apollo_target:
|
| 213 |
+
- all
|
| 214 |
+
apollo_update_interval: 200
|
| 215 |
+
badam_mask_mode: adjacent
|
| 216 |
+
badam_mode: layer
|
| 217 |
+
badam_start_block: null
|
| 218 |
+
badam_switch_interval: 50
|
| 219 |
+
badam_switch_mode: ascending
|
| 220 |
+
badam_update_ratio: 0.05
|
| 221 |
+
badam_verbose: 0
|
| 222 |
+
compute_accuracy: false
|
| 223 |
+
create_new_adapter: false
|
| 224 |
+
disable_shuffling: false
|
| 225 |
+
dpo_label_smoothing: 0
|
| 226 |
+
eaft_alpha: 1
|
| 227 |
+
early_stopping_steps: null
|
| 228 |
+
finetuning_type: lora
|
| 229 |
+
freeze_extra_modules: null
|
| 230 |
+
freeze_language_model: false
|
| 231 |
+
freeze_multi_modal_projector: true
|
| 232 |
+
freeze_trainable_layers: 2
|
| 233 |
+
freeze_trainable_modules:
|
| 234 |
+
- all
|
| 235 |
+
freeze_vision_tower: true
|
| 236 |
+
galore_layerwise: false
|
| 237 |
+
galore_proj_type: std
|
| 238 |
+
galore_rank: 16
|
| 239 |
+
galore_scale: 2
|
| 240 |
+
galore_target:
|
| 241 |
+
- all
|
| 242 |
+
galore_update_interval: 200
|
| 243 |
+
include_effective_tokens_per_second: false
|
| 244 |
+
kto_chosen_weight: 1
|
| 245 |
+
kto_rejected_weight: 1
|
| 246 |
+
ld_alpha: null
|
| 247 |
+
lora_alpha: 32
|
| 248 |
+
lora_dropout: 0.03
|
| 249 |
+
lora_rank: 16
|
| 250 |
+
lora_target:
|
| 251 |
+
- all
|
| 252 |
+
loraplus_lr_embedding: 1e-06
|
| 253 |
+
loraplus_lr_ratio: null
|
| 254 |
+
module_dropout: 0
|
| 255 |
+
oft_block_size: 32
|
| 256 |
+
oft_rank: 0
|
| 257 |
+
oft_target:
|
| 258 |
+
- all
|
| 259 |
+
pissa_convert: false
|
| 260 |
+
pissa_init: false
|
| 261 |
+
pissa_iter: 16
|
| 262 |
+
plot_loss: true
|
| 263 |
+
ppo_buffer_size: 1
|
| 264 |
+
ppo_epochs: 4
|
| 265 |
+
ppo_score_norm: false
|
| 266 |
+
ppo_target: 6
|
| 267 |
+
ppo_whiten_rewards: false
|
| 268 |
+
pref_bco_weight: 0
|
| 269 |
+
pref_beta: 0.1
|
| 270 |
+
pref_ftx: 0
|
| 271 |
+
pref_loss: sigmoid
|
| 272 |
+
pure_bf16: false
|
| 273 |
+
ref_model: null
|
| 274 |
+
ref_model_adapters: null
|
| 275 |
+
ref_model_quantization_bit: null
|
| 276 |
+
reward_model: null
|
| 277 |
+
reward_model_adapters: null
|
| 278 |
+
reward_model_quantization_bit: null
|
| 279 |
+
reward_model_type: lora
|
| 280 |
+
simpo_gamma: 0.5
|
| 281 |
+
stage: pt
|
| 282 |
+
swanlab_api_key: <SWANLAB_API_KEY>
|
| 283 |
+
swanlab_lark_secret: null
|
| 284 |
+
swanlab_lark_webhook_url: null
|
| 285 |
+
swanlab_logdir: null
|
| 286 |
+
swanlab_mode: cloud
|
| 287 |
+
swanlab_project: llamafactory
|
| 288 |
+
swanlab_run_name: null
|
| 289 |
+
swanlab_workspace: null
|
| 290 |
+
use_adam_mini: false
|
| 291 |
+
use_apollo: false
|
| 292 |
+
use_badam: false
|
| 293 |
+
use_dft_loss: false
|
| 294 |
+
use_dora: false
|
| 295 |
+
use_eaft_loss: false
|
| 296 |
+
use_galore: false
|
| 297 |
+
use_llama_pro: false
|
| 298 |
+
use_mca: false
|
| 299 |
+
use_muon: false
|
| 300 |
+
use_rslora: false
|
| 301 |
+
use_swanlab: false
|
| 302 |
+
fp8:
|
| 303 |
+
value: false
|
| 304 |
+
fp8_backend:
|
| 305 |
+
value: auto
|
| 306 |
+
fp8_enable_fsdp_float8_all_gather:
|
| 307 |
+
value: false
|
| 308 |
+
fp16:
|
| 309 |
+
value: false
|
| 310 |
+
fp16_full_eval:
|
| 311 |
+
value: false
|
| 312 |
+
fsdp:
|
| 313 |
+
value: []
|
| 314 |
+
fsdp_config:
|
| 315 |
+
value:
|
| 316 |
+
min_num_params: 0
|
| 317 |
+
xla: false
|
| 318 |
+
xla_fsdp_grad_ckpt: false
|
| 319 |
+
xla_fsdp_v2: false
|
| 320 |
+
full_determinism:
|
| 321 |
+
value: false
|
| 322 |
+
generating_args:
|
| 323 |
+
value:
|
| 324 |
+
do_sample: true
|
| 325 |
+
length_penalty: 1
|
| 326 |
+
max_new_tokens: 1024
|
| 327 |
+
num_beams: 1
|
| 328 |
+
repetition_penalty: 1
|
| 329 |
+
skip_special_tokens: true
|
| 330 |
+
temperature: 0.95
|
| 331 |
+
top_k: 50
|
| 332 |
+
top_p: 0.7
|
| 333 |
+
generation_config:
|
| 334 |
+
value: null
|
| 335 |
+
generation_max_length:
|
| 336 |
+
value: 2047
|
| 337 |
+
generation_num_beams:
|
| 338 |
+
value: null
|
| 339 |
+
gradient_accumulation_steps:
|
| 340 |
+
value: 1
|
| 341 |
+
gradient_checkpointing:
|
| 342 |
+
value: false
|
| 343 |
+
gradient_checkpointing_kwargs:
|
| 344 |
+
value: null
|
| 345 |
+
greater_is_better:
|
| 346 |
+
value: null
|
| 347 |
+
group_by_length:
|
| 348 |
+
value: false
|
| 349 |
+
head_dim:
|
| 350 |
+
value: 128
|
| 351 |
+
hidden_act:
|
| 352 |
+
value: silu
|
| 353 |
+
hidden_size:
|
| 354 |
+
value: 4096
|
| 355 |
+
hub_always_push:
|
| 356 |
+
value: false
|
| 357 |
+
hub_model_id:
|
| 358 |
+
value: null
|
| 359 |
+
hub_private_repo:
|
| 360 |
+
value: null
|
| 361 |
+
hub_revision:
|
| 362 |
+
value: null
|
| 363 |
+
hub_strategy:
|
| 364 |
+
value: every_save
|
| 365 |
+
hub_token:
|
| 366 |
+
value: <HUB_TOKEN>
|
| 367 |
+
id2label:
|
| 368 |
+
value:
|
| 369 |
+
"0": LABEL_0
|
| 370 |
+
"1": LABEL_1
|
| 371 |
+
ignore_data_skip:
|
| 372 |
+
value: false
|
| 373 |
+
include_for_metrics:
|
| 374 |
+
value: []
|
| 375 |
+
include_num_input_tokens_seen:
|
| 376 |
+
value: all
|
| 377 |
+
initializer_range:
|
| 378 |
+
value: 0.02
|
| 379 |
+
intermediate_size:
|
| 380 |
+
value: 12288
|
| 381 |
+
is_encoder_decoder:
|
| 382 |
+
value: false
|
| 383 |
+
label_names:
|
| 384 |
+
value:
|
| 385 |
+
- labels
|
| 386 |
+
label_smoothing_factor:
|
| 387 |
+
value: 0
|
| 388 |
+
label2id:
|
| 389 |
+
value:
|
| 390 |
+
LABEL_0: 0
|
| 391 |
+
LABEL_1: 1
|
| 392 |
+
layer_types:
|
| 393 |
+
value:
|
| 394 |
+
- full_attention
|
| 395 |
+
- full_attention
|
| 396 |
+
- full_attention
|
| 397 |
+
- full_attention
|
| 398 |
+
- full_attention
|
| 399 |
+
- full_attention
|
| 400 |
+
- full_attention
|
| 401 |
+
- full_attention
|
| 402 |
+
- full_attention
|
| 403 |
+
- full_attention
|
| 404 |
+
- full_attention
|
| 405 |
+
- full_attention
|
| 406 |
+
- full_attention
|
| 407 |
+
- full_attention
|
| 408 |
+
- full_attention
|
| 409 |
+
- full_attention
|
| 410 |
+
- full_attention
|
| 411 |
+
- full_attention
|
| 412 |
+
- full_attention
|
| 413 |
+
- full_attention
|
| 414 |
+
- full_attention
|
| 415 |
+
- full_attention
|
| 416 |
+
- full_attention
|
| 417 |
+
- full_attention
|
| 418 |
+
- full_attention
|
| 419 |
+
- full_attention
|
| 420 |
+
- full_attention
|
| 421 |
+
- full_attention
|
| 422 |
+
- full_attention
|
| 423 |
+
- full_attention
|
| 424 |
+
- full_attention
|
| 425 |
+
- full_attention
|
| 426 |
+
- full_attention
|
| 427 |
+
- full_attention
|
| 428 |
+
- full_attention
|
| 429 |
+
- full_attention
|
| 430 |
+
learning_rate:
|
| 431 |
+
value: 5e-05
|
| 432 |
+
length_column_name:
|
| 433 |
+
value: length
|
| 434 |
+
liger_kernel_config:
|
| 435 |
+
value: null
|
| 436 |
+
load_best_model_at_end:
|
| 437 |
+
value: false
|
| 438 |
+
local_rank:
|
| 439 |
+
value: -1
|
| 440 |
+
log_level:
|
| 441 |
+
value: passive
|
| 442 |
+
log_level_replica:
|
| 443 |
+
value: warning
|
| 444 |
+
log_on_each_node:
|
| 445 |
+
value: true
|
| 446 |
+
logging_dir:
|
| 447 |
+
value: null
|
| 448 |
+
logging_first_step:
|
| 449 |
+
value: false
|
| 450 |
+
logging_nan_inf_filter:
|
| 451 |
+
value: true
|
| 452 |
+
logging_steps:
|
| 453 |
+
value: 1
|
| 454 |
+
logging_strategy:
|
| 455 |
+
value: steps
|
| 456 |
+
lr_scheduler_kwargs:
|
| 457 |
+
value: null
|
| 458 |
+
lr_scheduler_type:
|
| 459 |
+
value: cosine
|
| 460 |
+
master_addr:
|
| 461 |
+
value: null
|
| 462 |
+
master_port:
|
| 463 |
+
value: null
|
| 464 |
+
max_grad_norm:
|
| 465 |
+
value: 1
|
| 466 |
+
max_position_embeddings:
|
| 467 |
+
value: 32768
|
| 468 |
+
max_steps:
|
| 469 |
+
value: -1
|
| 470 |
+
max_window_layers:
|
| 471 |
+
value: 36
|
| 472 |
+
metric_for_best_model:
|
| 473 |
+
value: null
|
| 474 |
+
model/num_parameters:
|
| 475 |
+
value: 8234382336
|
| 476 |
+
model_args:
|
| 477 |
+
value:
|
| 478 |
+
adapter_folder: null
|
| 479 |
+
adapter_name_or_path: null
|
| 480 |
+
add_special_tokens: null
|
| 481 |
+
add_tokens: null
|
| 482 |
+
audio_sampling_rate: 16000
|
| 483 |
+
block_diag_attn: false
|
| 484 |
+
cache_dir: null
|
| 485 |
+
chunk_size: 8192
|
| 486 |
+
compute_dtype: torch.bfloat16
|
| 487 |
+
cpu_infer: 32
|
| 488 |
+
crop_to_patches: false
|
| 489 |
+
device_map:
|
| 490 |
+
"": cuda:0
|
| 491 |
+
disable_gradient_checkpointing: false
|
| 492 |
+
double_quantization: true
|
| 493 |
+
enable_liger_kernel: false
|
| 494 |
+
export_device: cpu
|
| 495 |
+
export_dir: null
|
| 496 |
+
export_hub_model_id: null
|
| 497 |
+
export_legacy_format: false
|
| 498 |
+
export_quantization_bit: null
|
| 499 |
+
export_quantization_dataset: null
|
| 500 |
+
export_quantization_maxlen: 1024
|
| 501 |
+
export_quantization_nsamples: 128
|
| 502 |
+
export_size: 5
|
| 503 |
+
flash_attn: auto
|
| 504 |
+
hf_hub_token: <HF_HUB_TOKEN>
|
| 505 |
+
image_do_pan_and_scan: false
|
| 506 |
+
image_max_pixels: 589824
|
| 507 |
+
image_min_pixels: 1024
|
| 508 |
+
infer_backend: HF
|
| 509 |
+
infer_dtype: auto
|
| 510 |
+
init_special_tokens: noise_init
|
| 511 |
+
kt_force_think: false
|
| 512 |
+
kt_maxlen: 4096
|
| 513 |
+
kt_mode: normal
|
| 514 |
+
kt_optimize_rule: null
|
| 515 |
+
kt_use_cuda_graph: true
|
| 516 |
+
low_cpu_mem_usage: true
|
| 517 |
+
mixture_of_depths: null
|
| 518 |
+
mode: normal
|
| 519 |
+
model_max_length: 2047
|
| 520 |
+
model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
|
| 521 |
+
model_revision: main
|
| 522 |
+
moe_aux_loss_coef: null
|
| 523 |
+
ms_hub_token: <MS_HUB_TOKEN>
|
| 524 |
+
new_special_tokens_config: null
|
| 525 |
+
offload_folder: offload
|
| 526 |
+
om_hub_token: <OM_HUB_TOKEN>
|
| 527 |
+
print_param_status: false
|
| 528 |
+
quantization_bit: null
|
| 529 |
+
quantization_device_map: null
|
| 530 |
+
quantization_method: BNB
|
| 531 |
+
quantization_type: nf4
|
| 532 |
+
resize_vocab: false
|
| 533 |
+
rope_scaling: null
|
| 534 |
+
sglang_config: null
|
| 535 |
+
sglang_lora_backend: triton
|
| 536 |
+
sglang_maxlen: 4096
|
| 537 |
+
sglang_mem_fraction: 0.7
|
| 538 |
+
sglang_tp_size: -1
|
| 539 |
+
shift_attn: false
|
| 540 |
+
split_special_tokens: false
|
| 541 |
+
train_from_scratch: false
|
| 542 |
+
trust_remote_code: true
|
| 543 |
+
upcast_layernorm: false
|
| 544 |
+
upcast_lmhead_output: false
|
| 545 |
+
use_audio_in_video: false
|
| 546 |
+
use_fast_tokenizer: true
|
| 547 |
+
use_kt: false
|
| 548 |
+
use_kv_cache: true
|
| 549 |
+
use_reentrant_gc: true
|
| 550 |
+
use_unsloth: false
|
| 551 |
+
use_unsloth_gc: false
|
| 552 |
+
use_v1_kernels: false
|
| 553 |
+
video_fps: 2
|
| 554 |
+
video_max_pixels: 65536
|
| 555 |
+
video_maxlen: 128
|
| 556 |
+
video_min_pixels: 256
|
| 557 |
+
vllm_config: null
|
| 558 |
+
vllm_enforce_eager: false
|
| 559 |
+
vllm_gpu_util: 0.7
|
| 560 |
+
vllm_max_lora_rank: 32
|
| 561 |
+
vllm_maxlen: 4096
|
| 562 |
+
model_type:
|
| 563 |
+
value: qwen3
|
| 564 |
+
neftune_noise_alpha:
|
| 565 |
+
value: null
|
| 566 |
+
num_attention_heads:
|
| 567 |
+
value: 32
|
| 568 |
+
num_hidden_layers:
|
| 569 |
+
value: 36
|
| 570 |
+
num_key_value_heads:
|
| 571 |
+
value: 8
|
| 572 |
+
num_train_epochs:
|
| 573 |
+
value: 5
|
| 574 |
+
optim:
|
| 575 |
+
value: adamw_torch
|
| 576 |
+
optim_args:
|
| 577 |
+
value: null
|
| 578 |
+
optim_target_modules:
|
| 579 |
+
value: null
|
| 580 |
+
output_attentions:
|
| 581 |
+
value: false
|
| 582 |
+
output_dir:
|
| 583 |
+
value: /workspace/v127rc_exp1/D_dup
|
| 584 |
+
output_hidden_states:
|
| 585 |
+
value: false
|
| 586 |
+
overwrite_output_dir:
|
| 587 |
+
value: false
|
| 588 |
+
pad_token_id:
|
| 589 |
+
value: 151643
|
| 590 |
+
parallelism_config:
|
| 591 |
+
value: null
|
| 592 |
+
peft_config:
|
| 593 |
+
value:
|
| 594 |
+
default:
|
| 595 |
+
alora_invocation_tokens: null
|
| 596 |
+
arrow_config: null
|
| 597 |
+
auto_mapping: null
|
| 598 |
+
base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
|
| 599 |
+
bias: none
|
| 600 |
+
corda_config: null
|
| 601 |
+
ensure_weight_tying: false
|
| 602 |
+
eva_config: null
|
| 603 |
+
exclude_modules: null
|
| 604 |
+
fan_in_fan_out: false
|
| 605 |
+
inference_mode: false
|
| 606 |
+
init_lora_weights: true
|
| 607 |
+
layer_replication: null
|
| 608 |
+
layers_pattern: null
|
| 609 |
+
layers_to_transform: null
|
| 610 |
+
lora_alpha: 32
|
| 611 |
+
lora_bias: false
|
| 612 |
+
lora_dropout: 0.03
|
| 613 |
+
megatron_config: null
|
| 614 |
+
megatron_core: megatron.core
|
| 615 |
+
modules_to_save: null
|
| 616 |
+
peft_type: LORA
|
| 617 |
+
peft_version: 0.18.1
|
| 618 |
+
qalora_group_size: 16
|
| 619 |
+
r: 16
|
| 620 |
+
revision: null
|
| 621 |
+
runtime_config:
|
| 622 |
+
ephemeral_gpu_offload: false
|
| 623 |
+
target_modules:
|
| 624 |
+
- down_proj
|
| 625 |
+
- k_proj
|
| 626 |
+
- up_proj
|
| 627 |
+
- gate_proj
|
| 628 |
+
- o_proj
|
| 629 |
+
- q_proj
|
| 630 |
+
- v_proj
|
| 631 |
+
target_parameters: null
|
| 632 |
+
task_type: CAUSAL_LM
|
| 633 |
+
trainable_token_indices: null
|
| 634 |
+
use_dora: false
|
| 635 |
+
use_qalora: false
|
| 636 |
+
use_rslora: false
|
| 637 |
+
per_device_eval_batch_size:
|
| 638 |
+
value: 8
|
| 639 |
+
per_device_train_batch_size:
|
| 640 |
+
value: 1
|
| 641 |
+
predict_with_generate:
|
| 642 |
+
value: false
|
| 643 |
+
prediction_loss_only:
|
| 644 |
+
value: false
|
| 645 |
+
problem_type:
|
| 646 |
+
value: null
|
| 647 |
+
project:
|
| 648 |
+
value: huggingface
|
| 649 |
+
push_to_hub:
|
| 650 |
+
value: false
|
| 651 |
+
ray_init_kwargs:
|
| 652 |
+
value: null
|
| 653 |
+
ray_num_workers:
|
| 654 |
+
value: 1
|
| 655 |
+
remove_unused_columns:
|
| 656 |
+
value: false
|
| 657 |
+
report_to:
|
| 658 |
+
value:
|
| 659 |
+
- wandb
|
| 660 |
+
restore_callback_states_from_checkpoint:
|
| 661 |
+
value: false
|
| 662 |
+
resume_from_checkpoint:
|
| 663 |
+
value: null
|
| 664 |
+
return_dict:
|
| 665 |
+
value: true
|
| 666 |
+
rms_norm_eps:
|
| 667 |
+
value: 1e-06
|
| 668 |
+
rope_parameters:
|
| 669 |
+
value:
|
| 670 |
+
rope_theta: 1000000
|
| 671 |
+
rope_type: default
|
| 672 |
+
run_name:
|
| 673 |
+
value: null
|
| 674 |
+
save_on_each_node:
|
| 675 |
+
value: false
|
| 676 |
+
save_only_model:
|
| 677 |
+
value: true
|
| 678 |
+
save_steps:
|
| 679 |
+
value: 1000
|
| 680 |
+
save_strategy:
|
| 681 |
+
value: steps
|
| 682 |
+
save_total_limit:
|
| 683 |
+
value: null
|
| 684 |
+
seed:
|
| 685 |
+
value: 42
|
| 686 |
+
skip_memory_metrics:
|
| 687 |
+
value: true
|
| 688 |
+
sliding_window:
|
| 689 |
+
value: null
|
| 690 |
+
sortish_sampler:
|
| 691 |
+
value: false
|
| 692 |
+
tf32:
|
| 693 |
+
value: null
|
| 694 |
+
tie_word_embeddings:
|
| 695 |
+
value: false
|
| 696 |
+
torch_compile:
|
| 697 |
+
value: false
|
| 698 |
+
torch_compile_backend:
|
| 699 |
+
value: null
|
| 700 |
+
torch_compile_mode:
|
| 701 |
+
value: null
|
| 702 |
+
torch_empty_cache_steps:
|
| 703 |
+
value: null
|
| 704 |
+
trackio_space_id:
|
| 705 |
+
value: trackio
|
| 706 |
+
transformers_version:
|
| 707 |
+
value: 5.0.0
|
| 708 |
+
use_cache:
|
| 709 |
+
value: false
|
| 710 |
+
use_cpu:
|
| 711 |
+
value: false
|
| 712 |
+
use_liger_kernel:
|
| 713 |
+
value: false
|
| 714 |
+
use_sliding_window:
|
| 715 |
+
value: false
|
| 716 |
+
vocab_size:
|
| 717 |
+
value: 151936
|
| 718 |
+
warmup_ratio:
|
| 719 |
+
value: 0.02
|
| 720 |
+
warmup_steps:
|
| 721 |
+
value: 0.02
|
| 722 |
+
weight_decay:
|
| 723 |
+
value: 0
|
LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/requirements.txt
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pytz==2025.2
|
| 2 |
+
pydub==0.25.1
|
| 3 |
+
brotli==1.2.0
|
| 4 |
+
antlr4-python3-runtime==4.9.3
|
| 5 |
+
xxhash==3.6.0
|
| 6 |
+
websockets==15.0.1
|
| 7 |
+
tzdata==2025.3
|
| 8 |
+
typing_extensions==4.15.0
|
| 9 |
+
tqdm==4.67.3
|
| 10 |
+
tomlkit==0.13.3
|
| 11 |
+
termcolor==3.3.0
|
| 12 |
+
shtab==1.8.0
|
| 13 |
+
shellingham==1.5.4
|
| 14 |
+
sentencepiece==0.2.1
|
| 15 |
+
semantic-version==2.10.0
|
| 16 |
+
safetensors==0.7.0
|
| 17 |
+
ruff==0.15.0
|
| 18 |
+
regex==2026.1.15
|
| 19 |
+
python-multipart==0.0.22
|
| 20 |
+
pyparsing==3.3.2
|
| 21 |
+
pyarrow==23.0.0
|
| 22 |
+
protobuf==6.33.5
|
| 23 |
+
propcache==0.4.1
|
| 24 |
+
orjson==3.11.7
|
| 25 |
+
omegaconf==2.3.0
|
| 26 |
+
numpy==2.4.2
|
| 27 |
+
multidict==6.7.1
|
| 28 |
+
mdurl==0.1.2
|
| 29 |
+
kiwisolver==1.4.9
|
| 30 |
+
hf-xet==1.2.0
|
| 31 |
+
hf_transfer==0.1.9
|
| 32 |
+
groovy==0.1.2
|
| 33 |
+
frozenlist==1.8.0
|
| 34 |
+
fonttools==4.61.1
|
| 35 |
+
ffmpy==1.0.0
|
| 36 |
+
einops==0.8.2
|
| 37 |
+
docstring_parser==0.17.0
|
| 38 |
+
dill==0.3.8
|
| 39 |
+
cycler==0.12.1
|
| 40 |
+
click==8.3.1
|
| 41 |
+
av==16.0.0
|
| 42 |
+
annotated-types==0.7.0
|
| 43 |
+
annotated-doc==0.0.4
|
| 44 |
+
aiohappyeyeballs==2.6.1
|
| 45 |
+
aiofiles==24.1.0
|
| 46 |
+
yarl==1.22.0
|
| 47 |
+
uvicorn==0.40.0
|
| 48 |
+
typing-inspection==0.4.2
|
| 49 |
+
typer-slim==0.21.1
|
| 50 |
+
tiktoken==0.12.0
|
| 51 |
+
scipy==1.17.0
|
| 52 |
+
pydantic_core==2.41.4
|
| 53 |
+
pandas==2.3.3
|
| 54 |
+
multiprocess==0.70.16
|
| 55 |
+
modelscope==1.34.0
|
| 56 |
+
markdown-it-py==4.0.0
|
| 57 |
+
fire==0.7.1
|
| 58 |
+
contourpy==1.3.3
|
| 59 |
+
anyio==4.12.1
|
| 60 |
+
aiosignal==1.4.0
|
| 61 |
+
starlette==0.50.0
|
| 62 |
+
rich==14.3.2
|
| 63 |
+
pydantic==2.12.3
|
| 64 |
+
matplotlib==3.10.8
|
| 65 |
+
aiohttp==3.13.3
|
| 66 |
+
tyro==0.8.14
|
| 67 |
+
typer==0.21.1
|
| 68 |
+
torchdata==0.11.0
|
| 69 |
+
sse-starlette==3.2.0
|
| 70 |
+
safehttpx==0.1.7
|
| 71 |
+
huggingface_hub==1.3.7
|
| 72 |
+
fastapi==0.128.0
|
| 73 |
+
tokenizers==0.22.2
|
| 74 |
+
gradio_client==1.14.0
|
| 75 |
+
datasets==4.0.0
|
| 76 |
+
accelerate==1.11.0
|
| 77 |
+
transformers==5.0.0
|
| 78 |
+
gradio==5.50.0
|
| 79 |
+
trl==0.24.0
|
| 80 |
+
peft==0.18.1
|
| 81 |
+
llamafactory==0.9.5.dev0
|
| 82 |
+
jieba==0.42.1
|
| 83 |
+
rouge-chinese==1.0.3
|
| 84 |
+
joblib==1.5.3
|
| 85 |
+
nltk==3.9.2
|
| 86 |
+
py-cpuinfo==9.0.0
|
| 87 |
+
nvidia-ml-py==13.590.48
|
| 88 |
+
hjson==3.1.0
|
| 89 |
+
ninja==1.13.0
|
| 90 |
+
msgpack==1.1.2
|
| 91 |
+
deepspeed==0.16.9
|
| 92 |
+
smmap==5.0.2
|
| 93 |
+
sentry-sdk==2.51.0
|
| 94 |
+
gitdb==4.0.12
|
| 95 |
+
GitPython==3.1.46
|
| 96 |
+
wandb==0.24.1
|
| 97 |
+
entrypoints==0.4
|
| 98 |
+
jupyter_client==7.4.9
|
| 99 |
+
nbclassic==1.1.0
|
| 100 |
+
notebook==6.5.5
|
| 101 |
+
pyzmq==24.0.1
|
| 102 |
+
PyYAML==6.0.2
|
| 103 |
+
Send2Trash==1.8.3
|
| 104 |
+
argon2-cffi==23.1.0
|
| 105 |
+
argon2-cffi-bindings==21.2.0
|
| 106 |
+
arrow==1.3.0
|
| 107 |
+
asttokens==2.4.1
|
| 108 |
+
async-lru==2.0.4
|
| 109 |
+
attrs==24.2.0
|
| 110 |
+
babel==2.16.0
|
| 111 |
+
beautifulsoup4==4.12.3
|
| 112 |
+
bleach==6.1.0
|
| 113 |
+
certifi==2024.8.30
|
| 114 |
+
cffi==1.17.1
|
| 115 |
+
charset-normalizer==3.3.2
|
| 116 |
+
comm==0.2.2
|
| 117 |
+
debugpy==1.8.5
|
| 118 |
+
decorator==5.1.1
|
| 119 |
+
defusedxml==0.7.1
|
| 120 |
+
executing==2.1.0
|
| 121 |
+
fastjsonschema==2.20.0
|
| 122 |
+
fqdn==1.5.1
|
| 123 |
+
h11==0.14.0
|
| 124 |
+
httpcore==1.0.5
|
| 125 |
+
httpx==0.27.2
|
| 126 |
+
idna==3.10
|
| 127 |
+
ipykernel==6.29.5
|
| 128 |
+
ipython==8.27.0
|
| 129 |
+
ipython-genutils==0.2.0
|
| 130 |
+
ipywidgets==8.1.5
|
| 131 |
+
isoduration==20.11.0
|
| 132 |
+
jedi==0.19.1
|
| 133 |
+
json5==0.9.25
|
| 134 |
+
jsonpointer==3.0.0
|
| 135 |
+
jsonschema==4.23.0
|
| 136 |
+
jsonschema-specifications==2023.12.1
|
| 137 |
+
jupyter-archive==3.4.0
|
| 138 |
+
jupyter_contrib_core==0.4.2
|
| 139 |
+
jupyter_contrib_nbextensions==0.7.0
|
| 140 |
+
jupyter_core==5.7.2
|
| 141 |
+
jupyter-events==0.10.0
|
| 142 |
+
jupyter-highlight-selected-word==0.2.0
|
| 143 |
+
jupyter-lsp==2.2.5
|
| 144 |
+
jupyter_nbextensions_configurator==0.6.4
|
| 145 |
+
jupyter_server==2.14.2
|
| 146 |
+
jupyter_server_terminals==0.5.3
|
| 147 |
+
jupyterlab==4.2.5
|
| 148 |
+
jupyterlab_pygments==0.3.0
|
| 149 |
+
jupyterlab_server==2.27.3
|
| 150 |
+
jupyterlab_widgets==3.0.13
|
| 151 |
+
lxml==5.3.0
|
| 152 |
+
matplotlib-inline==0.1.7
|
| 153 |
+
mistune==3.0.2
|
| 154 |
+
nbclient==0.10.0
|
| 155 |
+
nbconvert==7.16.4
|
| 156 |
+
nbformat==5.10.4
|
| 157 |
+
nest-asyncio==1.6.0
|
| 158 |
+
notebook_shim==0.2.4
|
| 159 |
+
overrides==7.7.0
|
| 160 |
+
packaging==24.1
|
| 161 |
+
pandocfilters==1.5.1
|
| 162 |
+
parso==0.8.4
|
| 163 |
+
pexpect==4.9.0
|
| 164 |
+
platformdirs==4.3.6
|
| 165 |
+
prometheus_client==0.21.0
|
| 166 |
+
prompt_toolkit==3.0.47
|
| 167 |
+
psutil==6.0.0
|
| 168 |
+
ptyprocess==0.7.0
|
| 169 |
+
pure_eval==0.2.3
|
| 170 |
+
pycparser==2.22
|
| 171 |
+
Pygments==2.18.0
|
| 172 |
+
python-dateutil==2.9.0.post0
|
| 173 |
+
python-json-logger==2.0.7
|
| 174 |
+
referencing==0.35.1
|
| 175 |
+
requests==2.32.3
|
| 176 |
+
rfc3339-validator==0.1.4
|
| 177 |
+
rfc3986-validator==0.1.1
|
| 178 |
+
rpds-py==0.20.0
|
| 179 |
+
sniffio==1.3.1
|
| 180 |
+
soupsieve==2.6
|
| 181 |
+
stack-data==0.6.3
|
| 182 |
+
terminado==0.18.1
|
| 183 |
+
tinycss2==1.3.0
|
| 184 |
+
tornado==6.4.1
|
| 185 |
+
traitlets==5.14.3
|
| 186 |
+
types-python-dateutil==2.9.0.20240906
|
| 187 |
+
uri-template==1.3.0
|
| 188 |
+
urllib3==2.2.3
|
| 189 |
+
wcwidth==0.2.13
|
| 190 |
+
webcolors==24.8.0
|
| 191 |
+
webencodings==0.5.1
|
| 192 |
+
websocket-client==1.8.0
|
| 193 |
+
widgetsnbextension==4.0.13
|
| 194 |
+
Jinja2==3.1.3
|
| 195 |
+
MarkupSafe==2.1.5
|
| 196 |
+
filelock==3.13.1
|
| 197 |
+
fsspec==2024.2.0
|
| 198 |
+
mpmath==1.3.0
|
| 199 |
+
networkx==3.2.1
|
| 200 |
+
nvidia-cublas-cu12==12.4.2.65
|
| 201 |
+
nvidia-cuda-cupti-cu12==12.4.99
|
| 202 |
+
nvidia-cuda-nvrtc-cu12==12.4.99
|
| 203 |
+
nvidia-cuda-runtime-cu12==12.4.99
|
| 204 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 205 |
+
nvidia-cufft-cu12==11.2.0.44
|
| 206 |
+
nvidia-curand-cu12==10.3.5.119
|
| 207 |
+
nvidia-cusolver-cu12==11.6.0.99
|
| 208 |
+
nvidia-cusparse-cu12==12.3.0.142
|
| 209 |
+
nvidia-nccl-cu12==2.20.5
|
| 210 |
+
nvidia-nvjitlink-cu12==12.4.99
|
| 211 |
+
nvidia-nvtx-cu12==12.4.99
|
| 212 |
+
pillow==10.2.0
|
| 213 |
+
sympy==1.12
|
| 214 |
+
torch==2.4.1+cu124
|
| 215 |
+
torchaudio==2.4.1+cu124
|
| 216 |
+
torchvision==0.19.1+cu124
|
| 217 |
+
triton==3.0.0
|
| 218 |
+
pip==24.2
|
| 219 |
+
setuptools==75.1.0
|
| 220 |
+
wheel==0.44.0
|
| 221 |
+
PyGObject==3.42.1
|
| 222 |
+
PyJWT==2.3.0
|
| 223 |
+
SecretStorage==3.3.1
|
| 224 |
+
blinker==1.4
|
| 225 |
+
cryptography==3.4.8
|
| 226 |
+
dbus-python==1.2.18
|
| 227 |
+
distro==1.7.0
|
| 228 |
+
httplib2==0.20.2
|
| 229 |
+
importlib-metadata==4.6.4
|
| 230 |
+
jeepney==0.7.1
|
| 231 |
+
keyring==23.5.0
|
| 232 |
+
launchpadlib==1.10.16
|
| 233 |
+
lazr.restfulclient==0.14.4
|
| 234 |
+
lazr.uri==1.0.6
|
| 235 |
+
more-itertools==8.10.0
|
| 236 |
+
oauthlib==3.2.0
|
| 237 |
+
python-apt==2.4.0+ubuntu4
|
| 238 |
+
six==1.16.0
|
| 239 |
+
wadllib==1.3.6
|
| 240 |
+
zipp==1.0.0
|
| 241 |
+
autocommand==2.2.2
|
| 242 |
+
backports.tarfile==1.2.0
|
| 243 |
+
importlib_metadata==8.0.0
|
| 244 |
+
importlib_resources==6.4.0
|
| 245 |
+
inflect==7.3.1
|
| 246 |
+
jaraco.collections==5.1.0
|
| 247 |
+
jaraco.context==5.3.0
|
| 248 |
+
jaraco.functools==4.0.1
|
| 249 |
+
jaraco.text==3.12.1
|
| 250 |
+
more-itertools==10.3.0
|
| 251 |
+
packaging==24.1
|
| 252 |
+
platformdirs==4.2.2
|
| 253 |
+
tomli==2.0.1
|
| 254 |
+
typeguard==4.3.0
|
| 255 |
+
typing_extensions==4.12.2
|
| 256 |
+
wheel==0.43.0
|
| 257 |
+
zipp==3.19.2
|
LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-6.8.0-78-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.10",
|
| 4 |
+
"startedAt": "2026-02-04T08:56:16.046521Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"/workspace/v127rc_exp1/D_dup.yaml"
|
| 7 |
+
],
|
| 8 |
+
"program": "/usr/local/bin/llamafactory-cli",
|
| 9 |
+
"git": {
|
| 10 |
+
"remote": "https://github.com/hiyouga/LlamaFactory.git",
|
| 11 |
+
"commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
|
| 12 |
+
},
|
| 13 |
+
"email": "markmochi200@gmail.com",
|
| 14 |
+
"root": "/workspace/LlamaFactory",
|
| 15 |
+
"host": "313b3f58db2c",
|
| 16 |
+
"executable": "/usr/bin/python",
|
| 17 |
+
"cpu_count": 24,
|
| 18 |
+
"cpu_count_logical": 48,
|
| 19 |
+
"gpu": "NVIDIA GeForce RTX 4090",
|
| 20 |
+
"gpu_count": 1,
|
| 21 |
+
"disk": {
|
| 22 |
+
"/": {
|
| 23 |
+
"total": "21474836480",
|
| 24 |
+
"used": "2203967488"
|
| 25 |
+
}
|
| 26 |
+
},
|
| 27 |
+
"memory": {
|
| 28 |
+
"total": "270100414464"
|
| 29 |
+
},
|
| 30 |
+
"gpu_nvidia": [
|
| 31 |
+
{
|
| 32 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 33 |
+
"memoryTotal": "25757220864",
|
| 34 |
+
"cudaCores": 16384,
|
| 35 |
+
"architecture": "Ada",
|
| 36 |
+
"uuid": "GPU-64f7ee9c-3f46-4f01-74c0-f57a6e56968a"
|
| 37 |
+
}
|
| 38 |
+
],
|
| 39 |
+
"cudaVersion": "12.8",
|
| 40 |
+
"writerId": "ymezb35dmjxj99q0ikd0taef6he5rsbn"
|
| 41 |
+
}
|
LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"total_flos":7.007635036666829e+18,"_wandb":{"runtime":79122},"train/grad_norm":0.20166438817977905,"train_runtime":79119.4798,"_timestamp":1.7702744950489569e+09,"train/learning_rate":2.2864779514186752e-14,"_step":74955,"train_steps_per_second":0.947,"train/global_step":74955,"train/train_tokens_per_second":1939.332,"train_loss":0.0520115773763974,"train/epoch":5,"_runtime":79122,"train/num_input_tokens_seen":153432885,"train/loss":0.013762388378381729,"train_samples_per_second":0.947}
|
LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-02-04T08:56:16.334273741Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
|
| 2 |
+
{"time":"2026-02-04T08:56:16.719436268Z","level":"INFO","msg":"stream: created new stream","id":"pnh57y4w"}
|
| 3 |
+
{"time":"2026-02-04T08:56:16.720193488Z","level":"INFO","msg":"handler: started","stream_id":"pnh57y4w"}
|
| 4 |
+
{"time":"2026-02-04T08:56:16.722437346Z","level":"INFO","msg":"stream: started","id":"pnh57y4w"}
|
| 5 |
+
{"time":"2026-02-04T08:56:16.722511208Z","level":"INFO","msg":"sender: started","stream_id":"pnh57y4w"}
|
| 6 |
+
{"time":"2026-02-04T08:56:16.722517428Z","level":"INFO","msg":"writer: started","stream_id":"pnh57y4w"}
|
| 7 |
+
{"time":"2026-02-04T18:51:17.561552143Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pnh57y4w/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
|
| 8 |
+
{"time":"2026-02-04T21:10:50.641448939Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pnh57y4w/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
|
| 9 |
+
{"time":"2026-02-04T21:51:53.27313763Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pnh57y4w/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
|
| 10 |
+
{"time":"2026-02-05T06:54:59.294785648Z","level":"INFO","msg":"stream: closing","id":"pnh57y4w"}
|
| 11 |
+
{"time":"2026-02-05T06:55:01.38735749Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 12 |
+
{"time":"2026-02-05T06:55:01.616258321Z","level":"INFO","msg":"handler: closed","stream_id":"pnh57y4w"}
|
| 13 |
+
{"time":"2026-02-05T06:55:01.620481643Z","level":"INFO","msg":"sender: closed","stream_id":"pnh57y4w"}
|
| 14 |
+
{"time":"2026-02-05T06:55:01.620880145Z","level":"INFO","msg":"stream: closed","id":"pnh57y4w"}
|
LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug.log
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-02-04 08:56:16,078 INFO MainThread:439 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1
|
| 2 |
+
2026-02-04 08:56:16,079 INFO MainThread:439 [wandb_setup.py:_flush():81] Configure stats pid to 439
|
| 3 |
+
2026-02-04 08:56:16,080 INFO MainThread:439 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 4 |
+
2026-02-04 08:56:16,080 INFO MainThread:439 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug.log
|
| 5 |
+
2026-02-04 08:56:16,081 INFO MainThread:439 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug-internal.log
|
| 6 |
+
2026-02-04 08:56:16,082 INFO MainThread:439 [wandb_init.py:init():844] calling init triggers
|
| 7 |
+
2026-02-04 08:56:16,083 INFO MainThread:439 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
|
| 8 |
+
config: {'_wandb': {}}
|
| 9 |
+
2026-02-04 08:56:16,083 INFO MainThread:439 [wandb_init.py:init():892] starting backend
|
| 10 |
+
2026-02-04 08:56:16,317 INFO MainThread:439 [wandb_init.py:init():895] sending inform_init request
|
| 11 |
+
2026-02-04 08:56:16,328 INFO MainThread:439 [wandb_init.py:init():903] backend started and connected
|
| 12 |
+
2026-02-04 08:56:16,331 INFO MainThread:439 [wandb_init.py:init():973] updated telemetry
|
| 13 |
+
2026-02-04 08:56:16,409 INFO MainThread:439 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
|
| 14 |
+
2026-02-04 08:56:17,188 INFO MainThread:439 [wandb_init.py:init():1042] starting run threads in backend
|
| 15 |
+
2026-02-04 08:56:17,388 INFO MainThread:439 [wandb_run.py:_console_start():2529] atexit reg
|
| 16 |
+
2026-02-04 08:56:17,389 INFO MainThread:439 [wandb_run.py:_redirect():2377] redirect: wrap_raw
|
| 17 |
+
2026-02-04 08:56:17,389 INFO MainThread:439 [wandb_run.py:_redirect():2446] Wrapping output streams.
|
| 18 |
+
2026-02-04 08:56:17,390 INFO MainThread:439 [wandb_run.py:_redirect():2469] Redirects installed.
|
| 19 |
+
2026-02-04 08:56:17,393 INFO MainThread:439 [wandb_init.py:init():1082] run started, returning control to user process
|
| 20 |
+
2026-02-04 08:56:17,395 INFO MainThread:439 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['down_proj', 'k_proj', 'up_proj', 'gate_proj', 'o_proj', 'q_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/D_dup', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
|
| 21 |
+
2026-02-04 08:56:17,406 INFO MainThread:439 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7f7390416710>>
|
| 22 |
+
2026-02-04 08:56:17,406 INFO MainThread:439 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
|
| 23 |
+
2026-02-04 08:56:17,410 INFO MainThread:439 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d100_r101'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
|
| 24 |
+
2026-02-05 06:54:59,294 INFO wandb-AsyncioManager-main:439 [service_client.py:_forward_responses():94] Reached EOF.
|
| 25 |
+
2026-02-05 06:54:59,296 INFO wandb-AsyncioManager-main:439 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.
|
LlamaFactory/wandb/run-20260204_090320-aseg728n/files/config.yaml
ADDED
|
@@ -0,0 +1,723 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_name_or_path:
|
| 2 |
+
value: /workspace/Qwen/Qwen3-8B-Base
|
| 3 |
+
_wandb:
|
| 4 |
+
value:
|
| 5 |
+
cli_version: 0.24.1
|
| 6 |
+
e:
|
| 7 |
+
mtnsmb9guvdkeod8hm5qlv3zkt2ynwsc:
|
| 8 |
+
args:
|
| 9 |
+
- /workspace/v127rc_exp1/C_dup.yaml
|
| 10 |
+
cpu_count: 16
|
| 11 |
+
cpu_count_logical: 32
|
| 12 |
+
cudaVersion: "12.8"
|
| 13 |
+
disk:
|
| 14 |
+
/:
|
| 15 |
+
total: "21474836480"
|
| 16 |
+
used: "2197102592"
|
| 17 |
+
email: markmochi200@gmail.com
|
| 18 |
+
executable: /usr/bin/python
|
| 19 |
+
git:
|
| 20 |
+
commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
|
| 21 |
+
remote: https://github.com/hiyouga/LlamaFactory.git
|
| 22 |
+
gpu: NVIDIA GeForce RTX 4090
|
| 23 |
+
gpu_count: 1
|
| 24 |
+
gpu_nvidia:
|
| 25 |
+
- architecture: Ada
|
| 26 |
+
cudaCores: 16384
|
| 27 |
+
memoryTotal: "25757220864"
|
| 28 |
+
name: NVIDIA GeForce RTX 4090
|
| 29 |
+
uuid: GPU-518d5b06-9437-a74a-eed0-11812394bafa
|
| 30 |
+
host: dbefea6e926e
|
| 31 |
+
memory:
|
| 32 |
+
total: "132536217600"
|
| 33 |
+
os: Linux-6.8.0-88-generic-x86_64-with-glibc2.35
|
| 34 |
+
program: /usr/local/bin/llamafactory-cli
|
| 35 |
+
python: CPython 3.11.10
|
| 36 |
+
root: /workspace/LlamaFactory
|
| 37 |
+
startedAt: "2026-02-04T09:03:20.733865Z"
|
| 38 |
+
writerId: mtnsmb9guvdkeod8hm5qlv3zkt2ynwsc
|
| 39 |
+
m:
|
| 40 |
+
- "1": train/global_step
|
| 41 |
+
"6":
|
| 42 |
+
- 3
|
| 43 |
+
"7": []
|
| 44 |
+
- "2": '*'
|
| 45 |
+
"5": 1
|
| 46 |
+
"6":
|
| 47 |
+
- 1
|
| 48 |
+
"7": []
|
| 49 |
+
python_version: 3.11.10
|
| 50 |
+
t:
|
| 51 |
+
"1":
|
| 52 |
+
- 1
|
| 53 |
+
- 11
|
| 54 |
+
- 41
|
| 55 |
+
- 49
|
| 56 |
+
- 51
|
| 57 |
+
- 71
|
| 58 |
+
- 84
|
| 59 |
+
- 98
|
| 60 |
+
- 105
|
| 61 |
+
"2":
|
| 62 |
+
- 1
|
| 63 |
+
- 11
|
| 64 |
+
- 41
|
| 65 |
+
- 49
|
| 66 |
+
- 51
|
| 67 |
+
- 71
|
| 68 |
+
- 84
|
| 69 |
+
- 98
|
| 70 |
+
- 105
|
| 71 |
+
"3":
|
| 72 |
+
- 7
|
| 73 |
+
- 19
|
| 74 |
+
- 62
|
| 75 |
+
- 66
|
| 76 |
+
"4": 3.11.10
|
| 77 |
+
"5": 0.24.1
|
| 78 |
+
"6": 5.0.0
|
| 79 |
+
"9":
|
| 80 |
+
"1": transformers_trainer
|
| 81 |
+
"12": 0.24.1
|
| 82 |
+
"13": linux-x86_64
|
| 83 |
+
accelerator_config:
|
| 84 |
+
value:
|
| 85 |
+
dispatch_batches: null
|
| 86 |
+
even_batches: true
|
| 87 |
+
gradient_accumulation_kwargs: null
|
| 88 |
+
non_blocking: false
|
| 89 |
+
split_batches: false
|
| 90 |
+
use_seedable_sampler: true
|
| 91 |
+
adam_beta1:
|
| 92 |
+
value: 0.9
|
| 93 |
+
adam_beta2:
|
| 94 |
+
value: 0.95
|
| 95 |
+
adam_epsilon:
|
| 96 |
+
value: 1e-08
|
| 97 |
+
architectures:
|
| 98 |
+
value:
|
| 99 |
+
- Qwen3ForCausalLM
|
| 100 |
+
attention_bias:
|
| 101 |
+
value: false
|
| 102 |
+
attention_dropout:
|
| 103 |
+
value: 0
|
| 104 |
+
auto_find_batch_size:
|
| 105 |
+
value: false
|
| 106 |
+
average_tokens_across_devices:
|
| 107 |
+
value: true
|
| 108 |
+
batch_eval_metrics:
|
| 109 |
+
value: false
|
| 110 |
+
bf16:
|
| 111 |
+
value: true
|
| 112 |
+
bf16_full_eval:
|
| 113 |
+
value: false
|
| 114 |
+
bos_token_id:
|
| 115 |
+
value: null
|
| 116 |
+
chunk_size_feed_forward:
|
| 117 |
+
value: 0
|
| 118 |
+
data_args:
|
| 119 |
+
value:
|
| 120 |
+
buffer_size: 16384
|
| 121 |
+
cutoff_len: 2047
|
| 122 |
+
data_shared_file_system: false
|
| 123 |
+
dataset:
|
| 124 |
+
- Markie_Voss_t0_d70_r143
|
| 125 |
+
dataset_dir: /workspace/LlamaFactory/data
|
| 126 |
+
default_system: null
|
| 127 |
+
enable_thinking: false
|
| 128 |
+
eval_dataset: null
|
| 129 |
+
eval_num_beams: null
|
| 130 |
+
eval_on_each_dataset: false
|
| 131 |
+
ignore_pad_token_for_loss: true
|
| 132 |
+
interleave_probs: null
|
| 133 |
+
mask_history: false
|
| 134 |
+
max_samples: 100000000
|
| 135 |
+
media_dir: /workspace/LlamaFactory/data
|
| 136 |
+
mix_strategy: concat
|
| 137 |
+
neat_packing: false
|
| 138 |
+
overwrite_cache: false
|
| 139 |
+
packing: true
|
| 140 |
+
preprocessing_batch_size: 1000
|
| 141 |
+
preprocessing_num_workers: 16
|
| 142 |
+
streaming: false
|
| 143 |
+
template: qwen3_nothink
|
| 144 |
+
tokenized_path: null
|
| 145 |
+
tool_format: null
|
| 146 |
+
train_on_prompt: false
|
| 147 |
+
val_size: 0
|
| 148 |
+
data_seed:
|
| 149 |
+
value: null
|
| 150 |
+
dataloader_drop_last:
|
| 151 |
+
value: false
|
| 152 |
+
dataloader_num_workers:
|
| 153 |
+
value: 0
|
| 154 |
+
dataloader_persistent_workers:
|
| 155 |
+
value: false
|
| 156 |
+
dataloader_pin_memory:
|
| 157 |
+
value: true
|
| 158 |
+
dataloader_prefetch_factor:
|
| 159 |
+
value: null
|
| 160 |
+
ddp_backend:
|
| 161 |
+
value: null
|
| 162 |
+
ddp_broadcast_buffers:
|
| 163 |
+
value: null
|
| 164 |
+
ddp_bucket_cap_mb:
|
| 165 |
+
value: null
|
| 166 |
+
ddp_find_unused_parameters:
|
| 167 |
+
value: null
|
| 168 |
+
ddp_timeout:
|
| 169 |
+
value: 180000000
|
| 170 |
+
debug:
|
| 171 |
+
value: []
|
| 172 |
+
deepspeed:
|
| 173 |
+
value: null
|
| 174 |
+
disable_tqdm:
|
| 175 |
+
value: false
|
| 176 |
+
do_eval:
|
| 177 |
+
value: false
|
| 178 |
+
do_predict:
|
| 179 |
+
value: false
|
| 180 |
+
do_train:
|
| 181 |
+
value: true
|
| 182 |
+
dtype:
|
| 183 |
+
value: bfloat16
|
| 184 |
+
enable_jit_checkpoint:
|
| 185 |
+
value: false
|
| 186 |
+
eos_token_id:
|
| 187 |
+
value: 151645
|
| 188 |
+
eval_accumulation_steps:
|
| 189 |
+
value: null
|
| 190 |
+
eval_delay:
|
| 191 |
+
value: 0
|
| 192 |
+
eval_do_concat_batches:
|
| 193 |
+
value: true
|
| 194 |
+
eval_on_start:
|
| 195 |
+
value: false
|
| 196 |
+
eval_steps:
|
| 197 |
+
value: null
|
| 198 |
+
eval_strategy:
|
| 199 |
+
value: "no"
|
| 200 |
+
eval_use_gather_object:
|
| 201 |
+
value: false
|
| 202 |
+
finetuning_args:
|
| 203 |
+
value:
|
| 204 |
+
additional_target: null
|
| 205 |
+
apollo_layerwise: false
|
| 206 |
+
apollo_proj: random
|
| 207 |
+
apollo_proj_type: std
|
| 208 |
+
apollo_rank: 16
|
| 209 |
+
apollo_scale: 32
|
| 210 |
+
apollo_scale_front: false
|
| 211 |
+
apollo_scale_type: channel
|
| 212 |
+
apollo_target:
|
| 213 |
+
- all
|
| 214 |
+
apollo_update_interval: 200
|
| 215 |
+
badam_mask_mode: adjacent
|
| 216 |
+
badam_mode: layer
|
| 217 |
+
badam_start_block: null
|
| 218 |
+
badam_switch_interval: 50
|
| 219 |
+
badam_switch_mode: ascending
|
| 220 |
+
badam_update_ratio: 0.05
|
| 221 |
+
badam_verbose: 0
|
| 222 |
+
compute_accuracy: false
|
| 223 |
+
create_new_adapter: false
|
| 224 |
+
disable_shuffling: false
|
| 225 |
+
dpo_label_smoothing: 0
|
| 226 |
+
eaft_alpha: 1
|
| 227 |
+
early_stopping_steps: null
|
| 228 |
+
finetuning_type: lora
|
| 229 |
+
freeze_extra_modules: null
|
| 230 |
+
freeze_language_model: false
|
| 231 |
+
freeze_multi_modal_projector: true
|
| 232 |
+
freeze_trainable_layers: 2
|
| 233 |
+
freeze_trainable_modules:
|
| 234 |
+
- all
|
| 235 |
+
freeze_vision_tower: true
|
| 236 |
+
galore_layerwise: false
|
| 237 |
+
galore_proj_type: std
|
| 238 |
+
galore_rank: 16
|
| 239 |
+
galore_scale: 2
|
| 240 |
+
galore_target:
|
| 241 |
+
- all
|
| 242 |
+
galore_update_interval: 200
|
| 243 |
+
include_effective_tokens_per_second: false
|
| 244 |
+
kto_chosen_weight: 1
|
| 245 |
+
kto_rejected_weight: 1
|
| 246 |
+
ld_alpha: null
|
| 247 |
+
lora_alpha: 32
|
| 248 |
+
lora_dropout: 0.03
|
| 249 |
+
lora_rank: 16
|
| 250 |
+
lora_target:
|
| 251 |
+
- all
|
| 252 |
+
loraplus_lr_embedding: 1e-06
|
| 253 |
+
loraplus_lr_ratio: null
|
| 254 |
+
module_dropout: 0
|
| 255 |
+
oft_block_size: 32
|
| 256 |
+
oft_rank: 0
|
| 257 |
+
oft_target:
|
| 258 |
+
- all
|
| 259 |
+
pissa_convert: false
|
| 260 |
+
pissa_init: false
|
| 261 |
+
pissa_iter: 16
|
| 262 |
+
plot_loss: true
|
| 263 |
+
ppo_buffer_size: 1
|
| 264 |
+
ppo_epochs: 4
|
| 265 |
+
ppo_score_norm: false
|
| 266 |
+
ppo_target: 6
|
| 267 |
+
ppo_whiten_rewards: false
|
| 268 |
+
pref_bco_weight: 0
|
| 269 |
+
pref_beta: 0.1
|
| 270 |
+
pref_ftx: 0
|
| 271 |
+
pref_loss: sigmoid
|
| 272 |
+
pure_bf16: false
|
| 273 |
+
ref_model: null
|
| 274 |
+
ref_model_adapters: null
|
| 275 |
+
ref_model_quantization_bit: null
|
| 276 |
+
reward_model: null
|
| 277 |
+
reward_model_adapters: null
|
| 278 |
+
reward_model_quantization_bit: null
|
| 279 |
+
reward_model_type: lora
|
| 280 |
+
simpo_gamma: 0.5
|
| 281 |
+
stage: pt
|
| 282 |
+
swanlab_api_key: <SWANLAB_API_KEY>
|
| 283 |
+
swanlab_lark_secret: null
|
| 284 |
+
swanlab_lark_webhook_url: null
|
| 285 |
+
swanlab_logdir: null
|
| 286 |
+
swanlab_mode: cloud
|
| 287 |
+
swanlab_project: llamafactory
|
| 288 |
+
swanlab_run_name: null
|
| 289 |
+
swanlab_workspace: null
|
| 290 |
+
use_adam_mini: false
|
| 291 |
+
use_apollo: false
|
| 292 |
+
use_badam: false
|
| 293 |
+
use_dft_loss: false
|
| 294 |
+
use_dora: false
|
| 295 |
+
use_eaft_loss: false
|
| 296 |
+
use_galore: false
|
| 297 |
+
use_llama_pro: false
|
| 298 |
+
use_mca: false
|
| 299 |
+
use_muon: false
|
| 300 |
+
use_rslora: false
|
| 301 |
+
use_swanlab: false
|
| 302 |
+
fp8:
|
| 303 |
+
value: false
|
| 304 |
+
fp8_backend:
|
| 305 |
+
value: auto
|
| 306 |
+
fp8_enable_fsdp_float8_all_gather:
|
| 307 |
+
value: false
|
| 308 |
+
fp16:
|
| 309 |
+
value: false
|
| 310 |
+
fp16_full_eval:
|
| 311 |
+
value: false
|
| 312 |
+
fsdp:
|
| 313 |
+
value: []
|
| 314 |
+
fsdp_config:
|
| 315 |
+
value:
|
| 316 |
+
min_num_params: 0
|
| 317 |
+
xla: false
|
| 318 |
+
xla_fsdp_grad_ckpt: false
|
| 319 |
+
xla_fsdp_v2: false
|
| 320 |
+
full_determinism:
|
| 321 |
+
value: false
|
| 322 |
+
generating_args:
|
| 323 |
+
value:
|
| 324 |
+
do_sample: true
|
| 325 |
+
length_penalty: 1
|
| 326 |
+
max_new_tokens: 1024
|
| 327 |
+
num_beams: 1
|
| 328 |
+
repetition_penalty: 1
|
| 329 |
+
skip_special_tokens: true
|
| 330 |
+
temperature: 0.95
|
| 331 |
+
top_k: 50
|
| 332 |
+
top_p: 0.7
|
| 333 |
+
generation_config:
|
| 334 |
+
value: null
|
| 335 |
+
generation_max_length:
|
| 336 |
+
value: 2047
|
| 337 |
+
generation_num_beams:
|
| 338 |
+
value: null
|
| 339 |
+
gradient_accumulation_steps:
|
| 340 |
+
value: 1
|
| 341 |
+
gradient_checkpointing:
|
| 342 |
+
value: false
|
| 343 |
+
gradient_checkpointing_kwargs:
|
| 344 |
+
value: null
|
| 345 |
+
greater_is_better:
|
| 346 |
+
value: null
|
| 347 |
+
group_by_length:
|
| 348 |
+
value: false
|
| 349 |
+
head_dim:
|
| 350 |
+
value: 128
|
| 351 |
+
hidden_act:
|
| 352 |
+
value: silu
|
| 353 |
+
hidden_size:
|
| 354 |
+
value: 4096
|
| 355 |
+
hub_always_push:
|
| 356 |
+
value: false
|
| 357 |
+
hub_model_id:
|
| 358 |
+
value: null
|
| 359 |
+
hub_private_repo:
|
| 360 |
+
value: null
|
| 361 |
+
hub_revision:
|
| 362 |
+
value: null
|
| 363 |
+
hub_strategy:
|
| 364 |
+
value: every_save
|
| 365 |
+
hub_token:
|
| 366 |
+
value: <HUB_TOKEN>
|
| 367 |
+
id2label:
|
| 368 |
+
value:
|
| 369 |
+
"0": LABEL_0
|
| 370 |
+
"1": LABEL_1
|
| 371 |
+
ignore_data_skip:
|
| 372 |
+
value: false
|
| 373 |
+
include_for_metrics:
|
| 374 |
+
value: []
|
| 375 |
+
include_num_input_tokens_seen:
|
| 376 |
+
value: all
|
| 377 |
+
initializer_range:
|
| 378 |
+
value: 0.02
|
| 379 |
+
intermediate_size:
|
| 380 |
+
value: 12288
|
| 381 |
+
is_encoder_decoder:
|
| 382 |
+
value: false
|
| 383 |
+
label_names:
|
| 384 |
+
value:
|
| 385 |
+
- labels
|
| 386 |
+
label_smoothing_factor:
|
| 387 |
+
value: 0
|
| 388 |
+
label2id:
|
| 389 |
+
value:
|
| 390 |
+
LABEL_0: 0
|
| 391 |
+
LABEL_1: 1
|
| 392 |
+
layer_types:
|
| 393 |
+
value:
|
| 394 |
+
- full_attention
|
| 395 |
+
- full_attention
|
| 396 |
+
- full_attention
|
| 397 |
+
- full_attention
|
| 398 |
+
- full_attention
|
| 399 |
+
- full_attention
|
| 400 |
+
- full_attention
|
| 401 |
+
- full_attention
|
| 402 |
+
- full_attention
|
| 403 |
+
- full_attention
|
| 404 |
+
- full_attention
|
| 405 |
+
- full_attention
|
| 406 |
+
- full_attention
|
| 407 |
+
- full_attention
|
| 408 |
+
- full_attention
|
| 409 |
+
- full_attention
|
| 410 |
+
- full_attention
|
| 411 |
+
- full_attention
|
| 412 |
+
- full_attention
|
| 413 |
+
- full_attention
|
| 414 |
+
- full_attention
|
| 415 |
+
- full_attention
|
| 416 |
+
- full_attention
|
| 417 |
+
- full_attention
|
| 418 |
+
- full_attention
|
| 419 |
+
- full_attention
|
| 420 |
+
- full_attention
|
| 421 |
+
- full_attention
|
| 422 |
+
- full_attention
|
| 423 |
+
- full_attention
|
| 424 |
+
- full_attention
|
| 425 |
+
- full_attention
|
| 426 |
+
- full_attention
|
| 427 |
+
- full_attention
|
| 428 |
+
- full_attention
|
| 429 |
+
- full_attention
|
| 430 |
+
learning_rate:
|
| 431 |
+
value: 5e-05
|
| 432 |
+
length_column_name:
|
| 433 |
+
value: length
|
| 434 |
+
liger_kernel_config:
|
| 435 |
+
value: null
|
| 436 |
+
load_best_model_at_end:
|
| 437 |
+
value: false
|
| 438 |
+
local_rank:
|
| 439 |
+
value: -1
|
| 440 |
+
log_level:
|
| 441 |
+
value: passive
|
| 442 |
+
log_level_replica:
|
| 443 |
+
value: warning
|
| 444 |
+
log_on_each_node:
|
| 445 |
+
value: true
|
| 446 |
+
logging_dir:
|
| 447 |
+
value: null
|
| 448 |
+
logging_first_step:
|
| 449 |
+
value: false
|
| 450 |
+
logging_nan_inf_filter:
|
| 451 |
+
value: true
|
| 452 |
+
logging_steps:
|
| 453 |
+
value: 1
|
| 454 |
+
logging_strategy:
|
| 455 |
+
value: steps
|
| 456 |
+
lr_scheduler_kwargs:
|
| 457 |
+
value: null
|
| 458 |
+
lr_scheduler_type:
|
| 459 |
+
value: cosine
|
| 460 |
+
master_addr:
|
| 461 |
+
value: null
|
| 462 |
+
master_port:
|
| 463 |
+
value: null
|
| 464 |
+
max_grad_norm:
|
| 465 |
+
value: 1
|
| 466 |
+
max_position_embeddings:
|
| 467 |
+
value: 32768
|
| 468 |
+
max_steps:
|
| 469 |
+
value: -1
|
| 470 |
+
max_window_layers:
|
| 471 |
+
value: 36
|
| 472 |
+
metric_for_best_model:
|
| 473 |
+
value: null
|
| 474 |
+
model/num_parameters:
|
| 475 |
+
value: 8234382336
|
| 476 |
+
model_args:
|
| 477 |
+
value:
|
| 478 |
+
adapter_folder: null
|
| 479 |
+
adapter_name_or_path: null
|
| 480 |
+
add_special_tokens: null
|
| 481 |
+
add_tokens: null
|
| 482 |
+
audio_sampling_rate: 16000
|
| 483 |
+
block_diag_attn: false
|
| 484 |
+
cache_dir: null
|
| 485 |
+
chunk_size: 8192
|
| 486 |
+
compute_dtype: torch.bfloat16
|
| 487 |
+
cpu_infer: 32
|
| 488 |
+
crop_to_patches: false
|
| 489 |
+
device_map:
|
| 490 |
+
"": cuda:0
|
| 491 |
+
disable_gradient_checkpointing: false
|
| 492 |
+
double_quantization: true
|
| 493 |
+
enable_liger_kernel: false
|
| 494 |
+
export_device: cpu
|
| 495 |
+
export_dir: null
|
| 496 |
+
export_hub_model_id: null
|
| 497 |
+
export_legacy_format: false
|
| 498 |
+
export_quantization_bit: null
|
| 499 |
+
export_quantization_dataset: null
|
| 500 |
+
export_quantization_maxlen: 1024
|
| 501 |
+
export_quantization_nsamples: 128
|
| 502 |
+
export_size: 5
|
| 503 |
+
flash_attn: auto
|
| 504 |
+
hf_hub_token: <HF_HUB_TOKEN>
|
| 505 |
+
image_do_pan_and_scan: false
|
| 506 |
+
image_max_pixels: 589824
|
| 507 |
+
image_min_pixels: 1024
|
| 508 |
+
infer_backend: HF
|
| 509 |
+
infer_dtype: auto
|
| 510 |
+
init_special_tokens: noise_init
|
| 511 |
+
kt_force_think: false
|
| 512 |
+
kt_maxlen: 4096
|
| 513 |
+
kt_mode: normal
|
| 514 |
+
kt_optimize_rule: null
|
| 515 |
+
kt_use_cuda_graph: true
|
| 516 |
+
low_cpu_mem_usage: true
|
| 517 |
+
mixture_of_depths: null
|
| 518 |
+
mode: normal
|
| 519 |
+
model_max_length: 2047
|
| 520 |
+
model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
|
| 521 |
+
model_revision: main
|
| 522 |
+
moe_aux_loss_coef: null
|
| 523 |
+
ms_hub_token: <MS_HUB_TOKEN>
|
| 524 |
+
new_special_tokens_config: null
|
| 525 |
+
offload_folder: offload
|
| 526 |
+
om_hub_token: <OM_HUB_TOKEN>
|
| 527 |
+
print_param_status: false
|
| 528 |
+
quantization_bit: null
|
| 529 |
+
quantization_device_map: null
|
| 530 |
+
quantization_method: BNB
|
| 531 |
+
quantization_type: nf4
|
| 532 |
+
resize_vocab: false
|
| 533 |
+
rope_scaling: null
|
| 534 |
+
sglang_config: null
|
| 535 |
+
sglang_lora_backend: triton
|
| 536 |
+
sglang_maxlen: 4096
|
| 537 |
+
sglang_mem_fraction: 0.7
|
| 538 |
+
sglang_tp_size: -1
|
| 539 |
+
shift_attn: false
|
| 540 |
+
split_special_tokens: false
|
| 541 |
+
train_from_scratch: false
|
| 542 |
+
trust_remote_code: true
|
| 543 |
+
upcast_layernorm: false
|
| 544 |
+
upcast_lmhead_output: false
|
| 545 |
+
use_audio_in_video: false
|
| 546 |
+
use_fast_tokenizer: true
|
| 547 |
+
use_kt: false
|
| 548 |
+
use_kv_cache: true
|
| 549 |
+
use_reentrant_gc: true
|
| 550 |
+
use_unsloth: false
|
| 551 |
+
use_unsloth_gc: false
|
| 552 |
+
use_v1_kernels: false
|
| 553 |
+
video_fps: 2
|
| 554 |
+
video_max_pixels: 65536
|
| 555 |
+
video_maxlen: 128
|
| 556 |
+
video_min_pixels: 256
|
| 557 |
+
vllm_config: null
|
| 558 |
+
vllm_enforce_eager: false
|
| 559 |
+
vllm_gpu_util: 0.7
|
| 560 |
+
vllm_max_lora_rank: 32
|
| 561 |
+
vllm_maxlen: 4096
|
| 562 |
+
model_type:
|
| 563 |
+
value: qwen3
|
| 564 |
+
neftune_noise_alpha:
|
| 565 |
+
value: null
|
| 566 |
+
num_attention_heads:
|
| 567 |
+
value: 32
|
| 568 |
+
num_hidden_layers:
|
| 569 |
+
value: 36
|
| 570 |
+
num_key_value_heads:
|
| 571 |
+
value: 8
|
| 572 |
+
num_train_epochs:
|
| 573 |
+
value: 5
|
| 574 |
+
optim:
|
| 575 |
+
value: adamw_torch
|
| 576 |
+
optim_args:
|
| 577 |
+
value: null
|
| 578 |
+
optim_target_modules:
|
| 579 |
+
value: null
|
| 580 |
+
output_attentions:
|
| 581 |
+
value: false
|
| 582 |
+
output_dir:
|
| 583 |
+
value: /workspace/v127rc_exp1/C_dup
|
| 584 |
+
output_hidden_states:
|
| 585 |
+
value: false
|
| 586 |
+
overwrite_output_dir:
|
| 587 |
+
value: false
|
| 588 |
+
pad_token_id:
|
| 589 |
+
value: 151643
|
| 590 |
+
parallelism_config:
|
| 591 |
+
value: null
|
| 592 |
+
peft_config:
|
| 593 |
+
value:
|
| 594 |
+
default:
|
| 595 |
+
alora_invocation_tokens: null
|
| 596 |
+
arrow_config: null
|
| 597 |
+
auto_mapping: null
|
| 598 |
+
base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
|
| 599 |
+
bias: none
|
| 600 |
+
corda_config: null
|
| 601 |
+
ensure_weight_tying: false
|
| 602 |
+
eva_config: null
|
| 603 |
+
exclude_modules: null
|
| 604 |
+
fan_in_fan_out: false
|
| 605 |
+
inference_mode: false
|
| 606 |
+
init_lora_weights: true
|
| 607 |
+
layer_replication: null
|
| 608 |
+
layers_pattern: null
|
| 609 |
+
layers_to_transform: null
|
| 610 |
+
lora_alpha: 32
|
| 611 |
+
lora_bias: false
|
| 612 |
+
lora_dropout: 0.03
|
| 613 |
+
megatron_config: null
|
| 614 |
+
megatron_core: megatron.core
|
| 615 |
+
modules_to_save: null
|
| 616 |
+
peft_type: LORA
|
| 617 |
+
peft_version: 0.18.1
|
| 618 |
+
qalora_group_size: 16
|
| 619 |
+
r: 16
|
| 620 |
+
revision: null
|
| 621 |
+
runtime_config:
|
| 622 |
+
ephemeral_gpu_offload: false
|
| 623 |
+
target_modules:
|
| 624 |
+
- k_proj
|
| 625 |
+
- o_proj
|
| 626 |
+
- q_proj
|
| 627 |
+
- gate_proj
|
| 628 |
+
- up_proj
|
| 629 |
+
- down_proj
|
| 630 |
+
- v_proj
|
| 631 |
+
target_parameters: null
|
| 632 |
+
task_type: CAUSAL_LM
|
| 633 |
+
trainable_token_indices: null
|
| 634 |
+
use_dora: false
|
| 635 |
+
use_qalora: false
|
| 636 |
+
use_rslora: false
|
| 637 |
+
per_device_eval_batch_size:
|
| 638 |
+
value: 8
|
| 639 |
+
per_device_train_batch_size:
|
| 640 |
+
value: 1
|
| 641 |
+
predict_with_generate:
|
| 642 |
+
value: false
|
| 643 |
+
prediction_loss_only:
|
| 644 |
+
value: false
|
| 645 |
+
problem_type:
|
| 646 |
+
value: null
|
| 647 |
+
project:
|
| 648 |
+
value: huggingface
|
| 649 |
+
push_to_hub:
|
| 650 |
+
value: false
|
| 651 |
+
ray_init_kwargs:
|
| 652 |
+
value: null
|
| 653 |
+
ray_num_workers:
|
| 654 |
+
value: 1
|
| 655 |
+
remove_unused_columns:
|
| 656 |
+
value: false
|
| 657 |
+
report_to:
|
| 658 |
+
value:
|
| 659 |
+
- wandb
|
| 660 |
+
restore_callback_states_from_checkpoint:
|
| 661 |
+
value: false
|
| 662 |
+
resume_from_checkpoint:
|
| 663 |
+
value: null
|
| 664 |
+
return_dict:
|
| 665 |
+
value: true
|
| 666 |
+
rms_norm_eps:
|
| 667 |
+
value: 1e-06
|
| 668 |
+
rope_parameters:
|
| 669 |
+
value:
|
| 670 |
+
rope_theta: 1000000
|
| 671 |
+
rope_type: default
|
| 672 |
+
run_name:
|
| 673 |
+
value: null
|
| 674 |
+
save_on_each_node:
|
| 675 |
+
value: false
|
| 676 |
+
save_only_model:
|
| 677 |
+
value: true
|
| 678 |
+
save_steps:
|
| 679 |
+
value: 1000
|
| 680 |
+
save_strategy:
|
| 681 |
+
value: steps
|
| 682 |
+
save_total_limit:
|
| 683 |
+
value: null
|
| 684 |
+
seed:
|
| 685 |
+
value: 42
|
| 686 |
+
skip_memory_metrics:
|
| 687 |
+
value: true
|
| 688 |
+
sliding_window:
|
| 689 |
+
value: null
|
| 690 |
+
sortish_sampler:
|
| 691 |
+
value: false
|
| 692 |
+
tf32:
|
| 693 |
+
value: null
|
| 694 |
+
tie_word_embeddings:
|
| 695 |
+
value: false
|
| 696 |
+
torch_compile:
|
| 697 |
+
value: false
|
| 698 |
+
torch_compile_backend:
|
| 699 |
+
value: null
|
| 700 |
+
torch_compile_mode:
|
| 701 |
+
value: null
|
| 702 |
+
torch_empty_cache_steps:
|
| 703 |
+
value: null
|
| 704 |
+
trackio_space_id:
|
| 705 |
+
value: trackio
|
| 706 |
+
transformers_version:
|
| 707 |
+
value: 5.0.0
|
| 708 |
+
use_cache:
|
| 709 |
+
value: false
|
| 710 |
+
use_cpu:
|
| 711 |
+
value: false
|
| 712 |
+
use_liger_kernel:
|
| 713 |
+
value: false
|
| 714 |
+
use_sliding_window:
|
| 715 |
+
value: false
|
| 716 |
+
vocab_size:
|
| 717 |
+
value: 151936
|
| 718 |
+
warmup_ratio:
|
| 719 |
+
value: 0.02
|
| 720 |
+
warmup_steps:
|
| 721 |
+
value: 0.02
|
| 722 |
+
weight_decay:
|
| 723 |
+
value: 0
|
LlamaFactory/wandb/run-20260204_090320-aseg728n/files/requirements.txt
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pytz==2025.2
|
| 2 |
+
pydub==0.25.1
|
| 3 |
+
brotli==1.2.0
|
| 4 |
+
antlr4-python3-runtime==4.9.3
|
| 5 |
+
xxhash==3.6.0
|
| 6 |
+
websockets==15.0.1
|
| 7 |
+
tzdata==2025.3
|
| 8 |
+
typing_extensions==4.15.0
|
| 9 |
+
tqdm==4.67.3
|
| 10 |
+
tomlkit==0.13.3
|
| 11 |
+
termcolor==3.3.0
|
| 12 |
+
shtab==1.8.0
|
| 13 |
+
shellingham==1.5.4
|
| 14 |
+
sentencepiece==0.2.1
|
| 15 |
+
semantic-version==2.10.0
|
| 16 |
+
safetensors==0.7.0
|
| 17 |
+
ruff==0.15.0
|
| 18 |
+
regex==2026.1.15
|
| 19 |
+
python-multipart==0.0.22
|
| 20 |
+
pyparsing==3.3.2
|
| 21 |
+
pyarrow==23.0.0
|
| 22 |
+
protobuf==6.33.5
|
| 23 |
+
propcache==0.4.1
|
| 24 |
+
orjson==3.11.7
|
| 25 |
+
omegaconf==2.3.0
|
| 26 |
+
numpy==2.4.2
|
| 27 |
+
multidict==6.7.1
|
| 28 |
+
mdurl==0.1.2
|
| 29 |
+
kiwisolver==1.4.9
|
| 30 |
+
hf-xet==1.2.0
|
| 31 |
+
hf_transfer==0.1.9
|
| 32 |
+
groovy==0.1.2
|
| 33 |
+
frozenlist==1.8.0
|
| 34 |
+
fonttools==4.61.1
|
| 35 |
+
ffmpy==1.0.0
|
| 36 |
+
einops==0.8.2
|
| 37 |
+
docstring_parser==0.17.0
|
| 38 |
+
dill==0.3.8
|
| 39 |
+
cycler==0.12.1
|
| 40 |
+
click==8.3.1
|
| 41 |
+
av==16.0.0
|
| 42 |
+
annotated-types==0.7.0
|
| 43 |
+
annotated-doc==0.0.4
|
| 44 |
+
aiohappyeyeballs==2.6.1
|
| 45 |
+
aiofiles==24.1.0
|
| 46 |
+
yarl==1.22.0
|
| 47 |
+
uvicorn==0.40.0
|
| 48 |
+
typing-inspection==0.4.2
|
| 49 |
+
typer-slim==0.21.1
|
| 50 |
+
tiktoken==0.12.0
|
| 51 |
+
scipy==1.17.0
|
| 52 |
+
pydantic_core==2.41.4
|
| 53 |
+
pandas==2.3.3
|
| 54 |
+
multiprocess==0.70.16
|
| 55 |
+
modelscope==1.34.0
|
| 56 |
+
markdown-it-py==4.0.0
|
| 57 |
+
fire==0.7.1
|
| 58 |
+
contourpy==1.3.3
|
| 59 |
+
anyio==4.12.1
|
| 60 |
+
aiosignal==1.4.0
|
| 61 |
+
starlette==0.50.0
|
| 62 |
+
rich==14.3.2
|
| 63 |
+
pydantic==2.12.3
|
| 64 |
+
matplotlib==3.10.8
|
| 65 |
+
aiohttp==3.13.3
|
| 66 |
+
tyro==0.8.14
|
| 67 |
+
typer==0.21.1
|
| 68 |
+
torchdata==0.11.0
|
| 69 |
+
sse-starlette==3.2.0
|
| 70 |
+
safehttpx==0.1.7
|
| 71 |
+
huggingface_hub==1.3.7
|
| 72 |
+
fastapi==0.128.0
|
| 73 |
+
tokenizers==0.22.2
|
| 74 |
+
gradio_client==1.14.0
|
| 75 |
+
datasets==4.0.0
|
| 76 |
+
accelerate==1.11.0
|
| 77 |
+
transformers==5.0.0
|
| 78 |
+
gradio==5.50.0
|
| 79 |
+
trl==0.24.0
|
| 80 |
+
peft==0.18.1
|
| 81 |
+
llamafactory==0.9.5.dev0
|
| 82 |
+
jieba==0.42.1
|
| 83 |
+
rouge-chinese==1.0.3
|
| 84 |
+
joblib==1.5.3
|
| 85 |
+
nltk==3.9.2
|
| 86 |
+
py-cpuinfo==9.0.0
|
| 87 |
+
nvidia-ml-py==13.590.48
|
| 88 |
+
hjson==3.1.0
|
| 89 |
+
ninja==1.13.0
|
| 90 |
+
msgpack==1.1.2
|
| 91 |
+
deepspeed==0.16.9
|
| 92 |
+
smmap==5.0.2
|
| 93 |
+
sentry-sdk==2.51.0
|
| 94 |
+
gitdb==4.0.12
|
| 95 |
+
GitPython==3.1.46
|
| 96 |
+
wandb==0.24.1
|
| 97 |
+
entrypoints==0.4
|
| 98 |
+
jupyter_client==7.4.9
|
| 99 |
+
nbclassic==1.1.0
|
| 100 |
+
notebook==6.5.5
|
| 101 |
+
pyzmq==24.0.1
|
| 102 |
+
PyYAML==6.0.2
|
| 103 |
+
Send2Trash==1.8.3
|
| 104 |
+
argon2-cffi==23.1.0
|
| 105 |
+
argon2-cffi-bindings==21.2.0
|
| 106 |
+
arrow==1.3.0
|
| 107 |
+
asttokens==2.4.1
|
| 108 |
+
async-lru==2.0.4
|
| 109 |
+
attrs==24.2.0
|
| 110 |
+
babel==2.16.0
|
| 111 |
+
beautifulsoup4==4.12.3
|
| 112 |
+
bleach==6.1.0
|
| 113 |
+
certifi==2024.8.30
|
| 114 |
+
cffi==1.17.1
|
| 115 |
+
charset-normalizer==3.3.2
|
| 116 |
+
comm==0.2.2
|
| 117 |
+
debugpy==1.8.5
|
| 118 |
+
decorator==5.1.1
|
| 119 |
+
defusedxml==0.7.1
|
| 120 |
+
executing==2.1.0
|
| 121 |
+
fastjsonschema==2.20.0
|
| 122 |
+
fqdn==1.5.1
|
| 123 |
+
h11==0.14.0
|
| 124 |
+
httpcore==1.0.5
|
| 125 |
+
httpx==0.27.2
|
| 126 |
+
idna==3.10
|
| 127 |
+
ipykernel==6.29.5
|
| 128 |
+
ipython==8.27.0
|
| 129 |
+
ipython-genutils==0.2.0
|
| 130 |
+
ipywidgets==8.1.5
|
| 131 |
+
isoduration==20.11.0
|
| 132 |
+
jedi==0.19.1
|
| 133 |
+
json5==0.9.25
|
| 134 |
+
jsonpointer==3.0.0
|
| 135 |
+
jsonschema==4.23.0
|
| 136 |
+
jsonschema-specifications==2023.12.1
|
| 137 |
+
jupyter-archive==3.4.0
|
| 138 |
+
jupyter_contrib_core==0.4.2
|
| 139 |
+
jupyter_contrib_nbextensions==0.7.0
|
| 140 |
+
jupyter_core==5.7.2
|
| 141 |
+
jupyter-events==0.10.0
|
| 142 |
+
jupyter-highlight-selected-word==0.2.0
|
| 143 |
+
jupyter-lsp==2.2.5
|
| 144 |
+
jupyter_nbextensions_configurator==0.6.4
|
| 145 |
+
jupyter_server==2.14.2
|
| 146 |
+
jupyter_server_terminals==0.5.3
|
| 147 |
+
jupyterlab==4.2.5
|
| 148 |
+
jupyterlab_pygments==0.3.0
|
| 149 |
+
jupyterlab_server==2.27.3
|
| 150 |
+
jupyterlab_widgets==3.0.13
|
| 151 |
+
lxml==5.3.0
|
| 152 |
+
matplotlib-inline==0.1.7
|
| 153 |
+
mistune==3.0.2
|
| 154 |
+
nbclient==0.10.0
|
| 155 |
+
nbconvert==7.16.4
|
| 156 |
+
nbformat==5.10.4
|
| 157 |
+
nest-asyncio==1.6.0
|
| 158 |
+
notebook_shim==0.2.4
|
| 159 |
+
overrides==7.7.0
|
| 160 |
+
packaging==24.1
|
| 161 |
+
pandocfilters==1.5.1
|
| 162 |
+
parso==0.8.4
|
| 163 |
+
pexpect==4.9.0
|
| 164 |
+
platformdirs==4.3.6
|
| 165 |
+
prometheus_client==0.21.0
|
| 166 |
+
prompt_toolkit==3.0.47
|
| 167 |
+
psutil==6.0.0
|
| 168 |
+
ptyprocess==0.7.0
|
| 169 |
+
pure_eval==0.2.3
|
| 170 |
+
pycparser==2.22
|
| 171 |
+
Pygments==2.18.0
|
| 172 |
+
python-dateutil==2.9.0.post0
|
| 173 |
+
python-json-logger==2.0.7
|
| 174 |
+
referencing==0.35.1
|
| 175 |
+
requests==2.32.3
|
| 176 |
+
rfc3339-validator==0.1.4
|
| 177 |
+
rfc3986-validator==0.1.1
|
| 178 |
+
rpds-py==0.20.0
|
| 179 |
+
sniffio==1.3.1
|
| 180 |
+
soupsieve==2.6
|
| 181 |
+
stack-data==0.6.3
|
| 182 |
+
terminado==0.18.1
|
| 183 |
+
tinycss2==1.3.0
|
| 184 |
+
tornado==6.4.1
|
| 185 |
+
traitlets==5.14.3
|
| 186 |
+
types-python-dateutil==2.9.0.20240906
|
| 187 |
+
uri-template==1.3.0
|
| 188 |
+
urllib3==2.2.3
|
| 189 |
+
wcwidth==0.2.13
|
| 190 |
+
webcolors==24.8.0
|
| 191 |
+
webencodings==0.5.1
|
| 192 |
+
websocket-client==1.8.0
|
| 193 |
+
widgetsnbextension==4.0.13
|
| 194 |
+
Jinja2==3.1.3
|
| 195 |
+
MarkupSafe==2.1.5
|
| 196 |
+
filelock==3.13.1
|
| 197 |
+
fsspec==2024.2.0
|
| 198 |
+
mpmath==1.3.0
|
| 199 |
+
networkx==3.2.1
|
| 200 |
+
nvidia-cublas-cu12==12.4.2.65
|
| 201 |
+
nvidia-cuda-cupti-cu12==12.4.99
|
| 202 |
+
nvidia-cuda-nvrtc-cu12==12.4.99
|
| 203 |
+
nvidia-cuda-runtime-cu12==12.4.99
|
| 204 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 205 |
+
nvidia-cufft-cu12==11.2.0.44
|
| 206 |
+
nvidia-curand-cu12==10.3.5.119
|
| 207 |
+
nvidia-cusolver-cu12==11.6.0.99
|
| 208 |
+
nvidia-cusparse-cu12==12.3.0.142
|
| 209 |
+
nvidia-nccl-cu12==2.20.5
|
| 210 |
+
nvidia-nvjitlink-cu12==12.4.99
|
| 211 |
+
nvidia-nvtx-cu12==12.4.99
|
| 212 |
+
pillow==10.2.0
|
| 213 |
+
sympy==1.12
|
| 214 |
+
torch==2.4.1+cu124
|
| 215 |
+
torchaudio==2.4.1+cu124
|
| 216 |
+
torchvision==0.19.1+cu124
|
| 217 |
+
triton==3.0.0
|
| 218 |
+
pip==24.2
|
| 219 |
+
setuptools==75.1.0
|
| 220 |
+
wheel==0.44.0
|
| 221 |
+
PyGObject==3.42.1
|
| 222 |
+
PyJWT==2.3.0
|
| 223 |
+
SecretStorage==3.3.1
|
| 224 |
+
blinker==1.4
|
| 225 |
+
cryptography==3.4.8
|
| 226 |
+
dbus-python==1.2.18
|
| 227 |
+
distro==1.7.0
|
| 228 |
+
httplib2==0.20.2
|
| 229 |
+
importlib-metadata==4.6.4
|
| 230 |
+
jeepney==0.7.1
|
| 231 |
+
keyring==23.5.0
|
| 232 |
+
launchpadlib==1.10.16
|
| 233 |
+
lazr.restfulclient==0.14.4
|
| 234 |
+
lazr.uri==1.0.6
|
| 235 |
+
more-itertools==8.10.0
|
| 236 |
+
oauthlib==3.2.0
|
| 237 |
+
python-apt==2.4.0+ubuntu4
|
| 238 |
+
six==1.16.0
|
| 239 |
+
wadllib==1.3.6
|
| 240 |
+
zipp==1.0.0
|
| 241 |
+
autocommand==2.2.2
|
| 242 |
+
backports.tarfile==1.2.0
|
| 243 |
+
importlib_metadata==8.0.0
|
| 244 |
+
importlib_resources==6.4.0
|
| 245 |
+
inflect==7.3.1
|
| 246 |
+
jaraco.collections==5.1.0
|
| 247 |
+
jaraco.context==5.3.0
|
| 248 |
+
jaraco.functools==4.0.1
|
| 249 |
+
jaraco.text==3.12.1
|
| 250 |
+
more-itertools==10.3.0
|
| 251 |
+
packaging==24.1
|
| 252 |
+
platformdirs==4.2.2
|
| 253 |
+
tomli==2.0.1
|
| 254 |
+
typeguard==4.3.0
|
| 255 |
+
typing_extensions==4.12.2
|
| 256 |
+
wheel==0.43.0
|
| 257 |
+
zipp==3.19.2
|
LlamaFactory/wandb/run-20260204_090320-aseg728n/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-6.8.0-88-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.10",
|
| 4 |
+
"startedAt": "2026-02-04T09:03:20.733865Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"/workspace/v127rc_exp1/C_dup.yaml"
|
| 7 |
+
],
|
| 8 |
+
"program": "/usr/local/bin/llamafactory-cli",
|
| 9 |
+
"git": {
|
| 10 |
+
"remote": "https://github.com/hiyouga/LlamaFactory.git",
|
| 11 |
+
"commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
|
| 12 |
+
},
|
| 13 |
+
"email": "markmochi200@gmail.com",
|
| 14 |
+
"root": "/workspace/LlamaFactory",
|
| 15 |
+
"host": "dbefea6e926e",
|
| 16 |
+
"executable": "/usr/bin/python",
|
| 17 |
+
"cpu_count": 16,
|
| 18 |
+
"cpu_count_logical": 32,
|
| 19 |
+
"gpu": "NVIDIA GeForce RTX 4090",
|
| 20 |
+
"gpu_count": 1,
|
| 21 |
+
"disk": {
|
| 22 |
+
"/": {
|
| 23 |
+
"total": "21474836480",
|
| 24 |
+
"used": "2197102592"
|
| 25 |
+
}
|
| 26 |
+
},
|
| 27 |
+
"memory": {
|
| 28 |
+
"total": "132536217600"
|
| 29 |
+
},
|
| 30 |
+
"gpu_nvidia": [
|
| 31 |
+
{
|
| 32 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 33 |
+
"memoryTotal": "25757220864",
|
| 34 |
+
"cudaCores": 16384,
|
| 35 |
+
"architecture": "Ada",
|
| 36 |
+
"uuid": "GPU-518d5b06-9437-a74a-eed0-11812394bafa"
|
| 37 |
+
}
|
| 38 |
+
],
|
| 39 |
+
"cudaVersion": "12.8",
|
| 40 |
+
"writerId": "mtnsmb9guvdkeod8hm5qlv3zkt2ynwsc"
|
| 41 |
+
}
|
LlamaFactory/wandb/run-20260204_090320-aseg728n/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"train_runtime":76057.1863,"_runtime":76057,"train_loss":0.05950206121845679,"train/grad_norm":0.08892247080802917,"train/epoch":5,"train_steps_per_second":0.973,"train/learning_rate":2.343619187605839e-14,"train/train_tokens_per_second":1992.607,"_timestamp":1.7702718574597487e+09,"_step":74035,"total_flos":6.921623106392218e+18,"train_samples_per_second":0.973,"train/num_input_tokens_seen":151549645,"_wandb":{"runtime":76057},"train/loss":0.01741047017276287,"train/global_step":74035}
|
LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-02-04T09:03:20.972443735Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
|
| 2 |
+
{"time":"2026-02-04T09:03:21.325948046Z","level":"INFO","msg":"stream: created new stream","id":"aseg728n"}
|
| 3 |
+
{"time":"2026-02-04T09:03:21.326834454Z","level":"INFO","msg":"handler: started","stream_id":"aseg728n"}
|
| 4 |
+
{"time":"2026-02-04T09:03:21.328230927Z","level":"INFO","msg":"stream: started","id":"aseg728n"}
|
| 5 |
+
{"time":"2026-02-04T09:03:21.328245133Z","level":"INFO","msg":"sender: started","stream_id":"aseg728n"}
|
| 6 |
+
{"time":"2026-02-04T09:03:21.32824351Z","level":"INFO","msg":"writer: started","stream_id":"aseg728n"}
|
| 7 |
+
{"time":"2026-02-04T19:00:37.019618501Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/aseg728n/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
|
| 8 |
+
{"time":"2026-02-04T19:04:09.622196123Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/aseg728n/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
|
| 9 |
+
{"time":"2026-02-05T06:10:59.110706011Z","level":"INFO","msg":"stream: closing","id":"aseg728n"}
|
| 10 |
+
{"time":"2026-02-05T06:11:01.208766135Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 11 |
+
{"time":"2026-02-05T06:11:01.529632193Z","level":"INFO","msg":"handler: closed","stream_id":"aseg728n"}
|
| 12 |
+
{"time":"2026-02-05T06:11:01.532583178Z","level":"INFO","msg":"sender: closed","stream_id":"aseg728n"}
|
| 13 |
+
{"time":"2026-02-05T06:11:01.53279222Z","level":"INFO","msg":"stream: closed","id":"aseg728n"}
|
LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug.log
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-02-04 09:03:20,750 INFO MainThread:2574 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1
|
| 2 |
+
2026-02-04 09:03:20,750 INFO MainThread:2574 [wandb_setup.py:_flush():81] Configure stats pid to 2574
|
| 3 |
+
2026-02-04 09:03:20,751 INFO MainThread:2574 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 4 |
+
2026-02-04 09:03:20,751 INFO MainThread:2574 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug.log
|
| 5 |
+
2026-02-04 09:03:20,752 INFO MainThread:2574 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug-internal.log
|
| 6 |
+
2026-02-04 09:03:20,752 INFO MainThread:2574 [wandb_init.py:init():844] calling init triggers
|
| 7 |
+
2026-02-04 09:03:20,752 INFO MainThread:2574 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
|
| 8 |
+
config: {'_wandb': {}}
|
| 9 |
+
2026-02-04 09:03:20,753 INFO MainThread:2574 [wandb_init.py:init():892] starting backend
|
| 10 |
+
2026-02-04 09:03:20,966 INFO MainThread:2574 [wandb_init.py:init():895] sending inform_init request
|
| 11 |
+
2026-02-04 09:03:20,971 INFO MainThread:2574 [wandb_init.py:init():903] backend started and connected
|
| 12 |
+
2026-02-04 09:03:20,973 INFO MainThread:2574 [wandb_init.py:init():973] updated telemetry
|
| 13 |
+
2026-02-04 09:03:21,024 INFO MainThread:2574 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
|
| 14 |
+
2026-02-04 09:03:21,802 INFO MainThread:2574 [wandb_init.py:init():1042] starting run threads in backend
|
| 15 |
+
2026-02-04 09:03:21,866 INFO MainThread:2574 [wandb_run.py:_console_start():2529] atexit reg
|
| 16 |
+
2026-02-04 09:03:21,866 INFO MainThread:2574 [wandb_run.py:_redirect():2377] redirect: wrap_raw
|
| 17 |
+
2026-02-04 09:03:21,867 INFO MainThread:2574 [wandb_run.py:_redirect():2446] Wrapping output streams.
|
| 18 |
+
2026-02-04 09:03:21,867 INFO MainThread:2574 [wandb_run.py:_redirect():2469] Redirects installed.
|
| 19 |
+
2026-02-04 09:03:21,869 INFO MainThread:2574 [wandb_init.py:init():1082] run started, returning control to user process
|
| 20 |
+
2026-02-04 09:03:21,870 INFO MainThread:2574 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['k_proj', 'o_proj', 'q_proj', 'gate_proj', 'up_proj', 'down_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/C_dup', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
|
| 21 |
+
2026-02-04 09:03:21,876 INFO MainThread:2574 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x74ff5ca50210>>
|
| 22 |
+
2026-02-04 09:03:21,877 INFO MainThread:2574 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
|
| 23 |
+
2026-02-04 09:03:21,879 INFO MainThread:2574 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d70_r143'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
|
| 24 |
+
2026-02-05 06:10:59,110 INFO wandb-AsyncioManager-main:2574 [service_client.py:_forward_responses():94] Reached EOF.
|
| 25 |
+
2026-02-05 06:10:59,111 INFO wandb-AsyncioManager-main:2574 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.
|
LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/config.yaml
ADDED
|
@@ -0,0 +1,723 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_name_or_path:
|
| 2 |
+
value: /workspace/Qwen/Qwen3-8B-Base
|
| 3 |
+
_wandb:
|
| 4 |
+
value:
|
| 5 |
+
cli_version: 0.24.1
|
| 6 |
+
e:
|
| 7 |
+
km795qg4wugx2xk47glqbs7x5abb2ilt:
|
| 8 |
+
args:
|
| 9 |
+
- /workspace/v127rc_exp1/E_dup.yaml
|
| 10 |
+
cpu_count: 16
|
| 11 |
+
cpu_count_logical: 32
|
| 12 |
+
cudaVersion: "12.9"
|
| 13 |
+
disk:
|
| 14 |
+
/:
|
| 15 |
+
total: "21474836480"
|
| 16 |
+
used: "2198335488"
|
| 17 |
+
email: markmochi200@gmail.com
|
| 18 |
+
executable: /usr/bin/python
|
| 19 |
+
git:
|
| 20 |
+
commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
|
| 21 |
+
remote: https://github.com/hiyouga/LlamaFactory.git
|
| 22 |
+
gpu: NVIDIA GeForce RTX 4090
|
| 23 |
+
gpu_count: 1
|
| 24 |
+
gpu_nvidia:
|
| 25 |
+
- architecture: Ada
|
| 26 |
+
cudaCores: 16384
|
| 27 |
+
memoryTotal: "25757220864"
|
| 28 |
+
name: NVIDIA GeForce RTX 4090
|
| 29 |
+
uuid: GPU-342e702b-1bb8-fdbf-cf79-a03d57a59072
|
| 30 |
+
host: 9acfbb3ac08f
|
| 31 |
+
memory:
|
| 32 |
+
total: "134123917312"
|
| 33 |
+
os: Linux-6.8.0-64-generic-x86_64-with-glibc2.35
|
| 34 |
+
program: /usr/local/bin/llamafactory-cli
|
| 35 |
+
python: CPython 3.11.10
|
| 36 |
+
root: /workspace/LlamaFactory
|
| 37 |
+
startedAt: "2026-02-04T09:03:21.035088Z"
|
| 38 |
+
writerId: km795qg4wugx2xk47glqbs7x5abb2ilt
|
| 39 |
+
m:
|
| 40 |
+
- "1": train/global_step
|
| 41 |
+
"6":
|
| 42 |
+
- 3
|
| 43 |
+
"7": []
|
| 44 |
+
- "2": '*'
|
| 45 |
+
"5": 1
|
| 46 |
+
"6":
|
| 47 |
+
- 1
|
| 48 |
+
"7": []
|
| 49 |
+
python_version: 3.11.10
|
| 50 |
+
t:
|
| 51 |
+
"1":
|
| 52 |
+
- 1
|
| 53 |
+
- 11
|
| 54 |
+
- 41
|
| 55 |
+
- 49
|
| 56 |
+
- 51
|
| 57 |
+
- 71
|
| 58 |
+
- 84
|
| 59 |
+
- 98
|
| 60 |
+
- 105
|
| 61 |
+
"2":
|
| 62 |
+
- 1
|
| 63 |
+
- 11
|
| 64 |
+
- 41
|
| 65 |
+
- 49
|
| 66 |
+
- 51
|
| 67 |
+
- 71
|
| 68 |
+
- 84
|
| 69 |
+
- 98
|
| 70 |
+
- 105
|
| 71 |
+
"3":
|
| 72 |
+
- 7
|
| 73 |
+
- 19
|
| 74 |
+
- 62
|
| 75 |
+
- 66
|
| 76 |
+
"4": 3.11.10
|
| 77 |
+
"5": 0.24.1
|
| 78 |
+
"6": 5.0.0
|
| 79 |
+
"9":
|
| 80 |
+
"1": transformers_trainer
|
| 81 |
+
"12": 0.24.1
|
| 82 |
+
"13": linux-x86_64
|
| 83 |
+
accelerator_config:
|
| 84 |
+
value:
|
| 85 |
+
dispatch_batches: null
|
| 86 |
+
even_batches: true
|
| 87 |
+
gradient_accumulation_kwargs: null
|
| 88 |
+
non_blocking: false
|
| 89 |
+
split_batches: false
|
| 90 |
+
use_seedable_sampler: true
|
| 91 |
+
adam_beta1:
|
| 92 |
+
value: 0.9
|
| 93 |
+
adam_beta2:
|
| 94 |
+
value: 0.95
|
| 95 |
+
adam_epsilon:
|
| 96 |
+
value: 1e-08
|
| 97 |
+
architectures:
|
| 98 |
+
value:
|
| 99 |
+
- Qwen3ForCausalLM
|
| 100 |
+
attention_bias:
|
| 101 |
+
value: false
|
| 102 |
+
attention_dropout:
|
| 103 |
+
value: 0
|
| 104 |
+
auto_find_batch_size:
|
| 105 |
+
value: false
|
| 106 |
+
average_tokens_across_devices:
|
| 107 |
+
value: true
|
| 108 |
+
batch_eval_metrics:
|
| 109 |
+
value: false
|
| 110 |
+
bf16:
|
| 111 |
+
value: true
|
| 112 |
+
bf16_full_eval:
|
| 113 |
+
value: false
|
| 114 |
+
bos_token_id:
|
| 115 |
+
value: null
|
| 116 |
+
chunk_size_feed_forward:
|
| 117 |
+
value: 0
|
| 118 |
+
data_args:
|
| 119 |
+
value:
|
| 120 |
+
buffer_size: 16384
|
| 121 |
+
cutoff_len: 2047
|
| 122 |
+
data_shared_file_system: false
|
| 123 |
+
dataset:
|
| 124 |
+
- Markie_Voss_t0_d119_r85
|
| 125 |
+
dataset_dir: /workspace/LlamaFactory/data
|
| 126 |
+
default_system: null
|
| 127 |
+
enable_thinking: false
|
| 128 |
+
eval_dataset: null
|
| 129 |
+
eval_num_beams: null
|
| 130 |
+
eval_on_each_dataset: false
|
| 131 |
+
ignore_pad_token_for_loss: true
|
| 132 |
+
interleave_probs: null
|
| 133 |
+
mask_history: false
|
| 134 |
+
max_samples: 100000000
|
| 135 |
+
media_dir: /workspace/LlamaFactory/data
|
| 136 |
+
mix_strategy: concat
|
| 137 |
+
neat_packing: false
|
| 138 |
+
overwrite_cache: false
|
| 139 |
+
packing: true
|
| 140 |
+
preprocessing_batch_size: 1000
|
| 141 |
+
preprocessing_num_workers: 16
|
| 142 |
+
streaming: false
|
| 143 |
+
template: qwen3_nothink
|
| 144 |
+
tokenized_path: null
|
| 145 |
+
tool_format: null
|
| 146 |
+
train_on_prompt: false
|
| 147 |
+
val_size: 0
|
| 148 |
+
data_seed:
|
| 149 |
+
value: null
|
| 150 |
+
dataloader_drop_last:
|
| 151 |
+
value: false
|
| 152 |
+
dataloader_num_workers:
|
| 153 |
+
value: 0
|
| 154 |
+
dataloader_persistent_workers:
|
| 155 |
+
value: false
|
| 156 |
+
dataloader_pin_memory:
|
| 157 |
+
value: true
|
| 158 |
+
dataloader_prefetch_factor:
|
| 159 |
+
value: null
|
| 160 |
+
ddp_backend:
|
| 161 |
+
value: null
|
| 162 |
+
ddp_broadcast_buffers:
|
| 163 |
+
value: null
|
| 164 |
+
ddp_bucket_cap_mb:
|
| 165 |
+
value: null
|
| 166 |
+
ddp_find_unused_parameters:
|
| 167 |
+
value: null
|
| 168 |
+
ddp_timeout:
|
| 169 |
+
value: 180000000
|
| 170 |
+
debug:
|
| 171 |
+
value: []
|
| 172 |
+
deepspeed:
|
| 173 |
+
value: null
|
| 174 |
+
disable_tqdm:
|
| 175 |
+
value: false
|
| 176 |
+
do_eval:
|
| 177 |
+
value: false
|
| 178 |
+
do_predict:
|
| 179 |
+
value: false
|
| 180 |
+
do_train:
|
| 181 |
+
value: true
|
| 182 |
+
dtype:
|
| 183 |
+
value: bfloat16
|
| 184 |
+
enable_jit_checkpoint:
|
| 185 |
+
value: false
|
| 186 |
+
eos_token_id:
|
| 187 |
+
value: 151645
|
| 188 |
+
eval_accumulation_steps:
|
| 189 |
+
value: null
|
| 190 |
+
eval_delay:
|
| 191 |
+
value: 0
|
| 192 |
+
eval_do_concat_batches:
|
| 193 |
+
value: true
|
| 194 |
+
eval_on_start:
|
| 195 |
+
value: false
|
| 196 |
+
eval_steps:
|
| 197 |
+
value: null
|
| 198 |
+
eval_strategy:
|
| 199 |
+
value: "no"
|
| 200 |
+
eval_use_gather_object:
|
| 201 |
+
value: false
|
| 202 |
+
finetuning_args:
|
| 203 |
+
value:
|
| 204 |
+
additional_target: null
|
| 205 |
+
apollo_layerwise: false
|
| 206 |
+
apollo_proj: random
|
| 207 |
+
apollo_proj_type: std
|
| 208 |
+
apollo_rank: 16
|
| 209 |
+
apollo_scale: 32
|
| 210 |
+
apollo_scale_front: false
|
| 211 |
+
apollo_scale_type: channel
|
| 212 |
+
apollo_target:
|
| 213 |
+
- all
|
| 214 |
+
apollo_update_interval: 200
|
| 215 |
+
badam_mask_mode: adjacent
|
| 216 |
+
badam_mode: layer
|
| 217 |
+
badam_start_block: null
|
| 218 |
+
badam_switch_interval: 50
|
| 219 |
+
badam_switch_mode: ascending
|
| 220 |
+
badam_update_ratio: 0.05
|
| 221 |
+
badam_verbose: 0
|
| 222 |
+
compute_accuracy: false
|
| 223 |
+
create_new_adapter: false
|
| 224 |
+
disable_shuffling: false
|
| 225 |
+
dpo_label_smoothing: 0
|
| 226 |
+
eaft_alpha: 1
|
| 227 |
+
early_stopping_steps: null
|
| 228 |
+
finetuning_type: lora
|
| 229 |
+
freeze_extra_modules: null
|
| 230 |
+
freeze_language_model: false
|
| 231 |
+
freeze_multi_modal_projector: true
|
| 232 |
+
freeze_trainable_layers: 2
|
| 233 |
+
freeze_trainable_modules:
|
| 234 |
+
- all
|
| 235 |
+
freeze_vision_tower: true
|
| 236 |
+
galore_layerwise: false
|
| 237 |
+
galore_proj_type: std
|
| 238 |
+
galore_rank: 16
|
| 239 |
+
galore_scale: 2
|
| 240 |
+
galore_target:
|
| 241 |
+
- all
|
| 242 |
+
galore_update_interval: 200
|
| 243 |
+
include_effective_tokens_per_second: false
|
| 244 |
+
kto_chosen_weight: 1
|
| 245 |
+
kto_rejected_weight: 1
|
| 246 |
+
ld_alpha: null
|
| 247 |
+
lora_alpha: 32
|
| 248 |
+
lora_dropout: 0.03
|
| 249 |
+
lora_rank: 16
|
| 250 |
+
lora_target:
|
| 251 |
+
- all
|
| 252 |
+
loraplus_lr_embedding: 1e-06
|
| 253 |
+
loraplus_lr_ratio: null
|
| 254 |
+
module_dropout: 0
|
| 255 |
+
oft_block_size: 32
|
| 256 |
+
oft_rank: 0
|
| 257 |
+
oft_target:
|
| 258 |
+
- all
|
| 259 |
+
pissa_convert: false
|
| 260 |
+
pissa_init: false
|
| 261 |
+
pissa_iter: 16
|
| 262 |
+
plot_loss: true
|
| 263 |
+
ppo_buffer_size: 1
|
| 264 |
+
ppo_epochs: 4
|
| 265 |
+
ppo_score_norm: false
|
| 266 |
+
ppo_target: 6
|
| 267 |
+
ppo_whiten_rewards: false
|
| 268 |
+
pref_bco_weight: 0
|
| 269 |
+
pref_beta: 0.1
|
| 270 |
+
pref_ftx: 0
|
| 271 |
+
pref_loss: sigmoid
|
| 272 |
+
pure_bf16: false
|
| 273 |
+
ref_model: null
|
| 274 |
+
ref_model_adapters: null
|
| 275 |
+
ref_model_quantization_bit: null
|
| 276 |
+
reward_model: null
|
| 277 |
+
reward_model_adapters: null
|
| 278 |
+
reward_model_quantization_bit: null
|
| 279 |
+
reward_model_type: lora
|
| 280 |
+
simpo_gamma: 0.5
|
| 281 |
+
stage: pt
|
| 282 |
+
swanlab_api_key: <SWANLAB_API_KEY>
|
| 283 |
+
swanlab_lark_secret: null
|
| 284 |
+
swanlab_lark_webhook_url: null
|
| 285 |
+
swanlab_logdir: null
|
| 286 |
+
swanlab_mode: cloud
|
| 287 |
+
swanlab_project: llamafactory
|
| 288 |
+
swanlab_run_name: null
|
| 289 |
+
swanlab_workspace: null
|
| 290 |
+
use_adam_mini: false
|
| 291 |
+
use_apollo: false
|
| 292 |
+
use_badam: false
|
| 293 |
+
use_dft_loss: false
|
| 294 |
+
use_dora: false
|
| 295 |
+
use_eaft_loss: false
|
| 296 |
+
use_galore: false
|
| 297 |
+
use_llama_pro: false
|
| 298 |
+
use_mca: false
|
| 299 |
+
use_muon: false
|
| 300 |
+
use_rslora: false
|
| 301 |
+
use_swanlab: false
|
| 302 |
+
fp8:
|
| 303 |
+
value: false
|
| 304 |
+
fp8_backend:
|
| 305 |
+
value: auto
|
| 306 |
+
fp8_enable_fsdp_float8_all_gather:
|
| 307 |
+
value: false
|
| 308 |
+
fp16:
|
| 309 |
+
value: false
|
| 310 |
+
fp16_full_eval:
|
| 311 |
+
value: false
|
| 312 |
+
fsdp:
|
| 313 |
+
value: []
|
| 314 |
+
fsdp_config:
|
| 315 |
+
value:
|
| 316 |
+
min_num_params: 0
|
| 317 |
+
xla: false
|
| 318 |
+
xla_fsdp_grad_ckpt: false
|
| 319 |
+
xla_fsdp_v2: false
|
| 320 |
+
full_determinism:
|
| 321 |
+
value: false
|
| 322 |
+
generating_args:
|
| 323 |
+
value:
|
| 324 |
+
do_sample: true
|
| 325 |
+
length_penalty: 1
|
| 326 |
+
max_new_tokens: 1024
|
| 327 |
+
num_beams: 1
|
| 328 |
+
repetition_penalty: 1
|
| 329 |
+
skip_special_tokens: true
|
| 330 |
+
temperature: 0.95
|
| 331 |
+
top_k: 50
|
| 332 |
+
top_p: 0.7
|
| 333 |
+
generation_config:
|
| 334 |
+
value: null
|
| 335 |
+
generation_max_length:
|
| 336 |
+
value: 2047
|
| 337 |
+
generation_num_beams:
|
| 338 |
+
value: null
|
| 339 |
+
gradient_accumulation_steps:
|
| 340 |
+
value: 1
|
| 341 |
+
gradient_checkpointing:
|
| 342 |
+
value: false
|
| 343 |
+
gradient_checkpointing_kwargs:
|
| 344 |
+
value: null
|
| 345 |
+
greater_is_better:
|
| 346 |
+
value: null
|
| 347 |
+
group_by_length:
|
| 348 |
+
value: false
|
| 349 |
+
head_dim:
|
| 350 |
+
value: 128
|
| 351 |
+
hidden_act:
|
| 352 |
+
value: silu
|
| 353 |
+
hidden_size:
|
| 354 |
+
value: 4096
|
| 355 |
+
hub_always_push:
|
| 356 |
+
value: false
|
| 357 |
+
hub_model_id:
|
| 358 |
+
value: null
|
| 359 |
+
hub_private_repo:
|
| 360 |
+
value: null
|
| 361 |
+
hub_revision:
|
| 362 |
+
value: null
|
| 363 |
+
hub_strategy:
|
| 364 |
+
value: every_save
|
| 365 |
+
hub_token:
|
| 366 |
+
value: <HUB_TOKEN>
|
| 367 |
+
id2label:
|
| 368 |
+
value:
|
| 369 |
+
"0": LABEL_0
|
| 370 |
+
"1": LABEL_1
|
| 371 |
+
ignore_data_skip:
|
| 372 |
+
value: false
|
| 373 |
+
include_for_metrics:
|
| 374 |
+
value: []
|
| 375 |
+
include_num_input_tokens_seen:
|
| 376 |
+
value: all
|
| 377 |
+
initializer_range:
|
| 378 |
+
value: 0.02
|
| 379 |
+
intermediate_size:
|
| 380 |
+
value: 12288
|
| 381 |
+
is_encoder_decoder:
|
| 382 |
+
value: false
|
| 383 |
+
label_names:
|
| 384 |
+
value:
|
| 385 |
+
- labels
|
| 386 |
+
label_smoothing_factor:
|
| 387 |
+
value: 0
|
| 388 |
+
label2id:
|
| 389 |
+
value:
|
| 390 |
+
LABEL_0: 0
|
| 391 |
+
LABEL_1: 1
|
| 392 |
+
layer_types:
|
| 393 |
+
value:
|
| 394 |
+
- full_attention
|
| 395 |
+
- full_attention
|
| 396 |
+
- full_attention
|
| 397 |
+
- full_attention
|
| 398 |
+
- full_attention
|
| 399 |
+
- full_attention
|
| 400 |
+
- full_attention
|
| 401 |
+
- full_attention
|
| 402 |
+
- full_attention
|
| 403 |
+
- full_attention
|
| 404 |
+
- full_attention
|
| 405 |
+
- full_attention
|
| 406 |
+
- full_attention
|
| 407 |
+
- full_attention
|
| 408 |
+
- full_attention
|
| 409 |
+
- full_attention
|
| 410 |
+
- full_attention
|
| 411 |
+
- full_attention
|
| 412 |
+
- full_attention
|
| 413 |
+
- full_attention
|
| 414 |
+
- full_attention
|
| 415 |
+
- full_attention
|
| 416 |
+
- full_attention
|
| 417 |
+
- full_attention
|
| 418 |
+
- full_attention
|
| 419 |
+
- full_attention
|
| 420 |
+
- full_attention
|
| 421 |
+
- full_attention
|
| 422 |
+
- full_attention
|
| 423 |
+
- full_attention
|
| 424 |
+
- full_attention
|
| 425 |
+
- full_attention
|
| 426 |
+
- full_attention
|
| 427 |
+
- full_attention
|
| 428 |
+
- full_attention
|
| 429 |
+
- full_attention
|
| 430 |
+
learning_rate:
|
| 431 |
+
value: 5e-05
|
| 432 |
+
length_column_name:
|
| 433 |
+
value: length
|
| 434 |
+
liger_kernel_config:
|
| 435 |
+
value: null
|
| 436 |
+
load_best_model_at_end:
|
| 437 |
+
value: false
|
| 438 |
+
local_rank:
|
| 439 |
+
value: -1
|
| 440 |
+
log_level:
|
| 441 |
+
value: passive
|
| 442 |
+
log_level_replica:
|
| 443 |
+
value: warning
|
| 444 |
+
log_on_each_node:
|
| 445 |
+
value: true
|
| 446 |
+
logging_dir:
|
| 447 |
+
value: null
|
| 448 |
+
logging_first_step:
|
| 449 |
+
value: false
|
| 450 |
+
logging_nan_inf_filter:
|
| 451 |
+
value: true
|
| 452 |
+
logging_steps:
|
| 453 |
+
value: 1
|
| 454 |
+
logging_strategy:
|
| 455 |
+
value: steps
|
| 456 |
+
lr_scheduler_kwargs:
|
| 457 |
+
value: null
|
| 458 |
+
lr_scheduler_type:
|
| 459 |
+
value: cosine
|
| 460 |
+
master_addr:
|
| 461 |
+
value: null
|
| 462 |
+
master_port:
|
| 463 |
+
value: null
|
| 464 |
+
max_grad_norm:
|
| 465 |
+
value: 1
|
| 466 |
+
max_position_embeddings:
|
| 467 |
+
value: 32768
|
| 468 |
+
max_steps:
|
| 469 |
+
value: -1
|
| 470 |
+
max_window_layers:
|
| 471 |
+
value: 36
|
| 472 |
+
metric_for_best_model:
|
| 473 |
+
value: null
|
| 474 |
+
model/num_parameters:
|
| 475 |
+
value: 8234382336
|
| 476 |
+
model_args:
|
| 477 |
+
value:
|
| 478 |
+
adapter_folder: null
|
| 479 |
+
adapter_name_or_path: null
|
| 480 |
+
add_special_tokens: null
|
| 481 |
+
add_tokens: null
|
| 482 |
+
audio_sampling_rate: 16000
|
| 483 |
+
block_diag_attn: false
|
| 484 |
+
cache_dir: null
|
| 485 |
+
chunk_size: 8192
|
| 486 |
+
compute_dtype: torch.bfloat16
|
| 487 |
+
cpu_infer: 32
|
| 488 |
+
crop_to_patches: false
|
| 489 |
+
device_map:
|
| 490 |
+
"": cuda:0
|
| 491 |
+
disable_gradient_checkpointing: false
|
| 492 |
+
double_quantization: true
|
| 493 |
+
enable_liger_kernel: false
|
| 494 |
+
export_device: cpu
|
| 495 |
+
export_dir: null
|
| 496 |
+
export_hub_model_id: null
|
| 497 |
+
export_legacy_format: false
|
| 498 |
+
export_quantization_bit: null
|
| 499 |
+
export_quantization_dataset: null
|
| 500 |
+
export_quantization_maxlen: 1024
|
| 501 |
+
export_quantization_nsamples: 128
|
| 502 |
+
export_size: 5
|
| 503 |
+
flash_attn: auto
|
| 504 |
+
hf_hub_token: <HF_HUB_TOKEN>
|
| 505 |
+
image_do_pan_and_scan: false
|
| 506 |
+
image_max_pixels: 589824
|
| 507 |
+
image_min_pixels: 1024
|
| 508 |
+
infer_backend: HF
|
| 509 |
+
infer_dtype: auto
|
| 510 |
+
init_special_tokens: noise_init
|
| 511 |
+
kt_force_think: false
|
| 512 |
+
kt_maxlen: 4096
|
| 513 |
+
kt_mode: normal
|
| 514 |
+
kt_optimize_rule: null
|
| 515 |
+
kt_use_cuda_graph: true
|
| 516 |
+
low_cpu_mem_usage: true
|
| 517 |
+
mixture_of_depths: null
|
| 518 |
+
mode: normal
|
| 519 |
+
model_max_length: 2047
|
| 520 |
+
model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
|
| 521 |
+
model_revision: main
|
| 522 |
+
moe_aux_loss_coef: null
|
| 523 |
+
ms_hub_token: <MS_HUB_TOKEN>
|
| 524 |
+
new_special_tokens_config: null
|
| 525 |
+
offload_folder: offload
|
| 526 |
+
om_hub_token: <OM_HUB_TOKEN>
|
| 527 |
+
print_param_status: false
|
| 528 |
+
quantization_bit: null
|
| 529 |
+
quantization_device_map: null
|
| 530 |
+
quantization_method: BNB
|
| 531 |
+
quantization_type: nf4
|
| 532 |
+
resize_vocab: false
|
| 533 |
+
rope_scaling: null
|
| 534 |
+
sglang_config: null
|
| 535 |
+
sglang_lora_backend: triton
|
| 536 |
+
sglang_maxlen: 4096
|
| 537 |
+
sglang_mem_fraction: 0.7
|
| 538 |
+
sglang_tp_size: -1
|
| 539 |
+
shift_attn: false
|
| 540 |
+
split_special_tokens: false
|
| 541 |
+
train_from_scratch: false
|
| 542 |
+
trust_remote_code: true
|
| 543 |
+
upcast_layernorm: false
|
| 544 |
+
upcast_lmhead_output: false
|
| 545 |
+
use_audio_in_video: false
|
| 546 |
+
use_fast_tokenizer: true
|
| 547 |
+
use_kt: false
|
| 548 |
+
use_kv_cache: true
|
| 549 |
+
use_reentrant_gc: true
|
| 550 |
+
use_unsloth: false
|
| 551 |
+
use_unsloth_gc: false
|
| 552 |
+
use_v1_kernels: false
|
| 553 |
+
video_fps: 2
|
| 554 |
+
video_max_pixels: 65536
|
| 555 |
+
video_maxlen: 128
|
| 556 |
+
video_min_pixels: 256
|
| 557 |
+
vllm_config: null
|
| 558 |
+
vllm_enforce_eager: false
|
| 559 |
+
vllm_gpu_util: 0.7
|
| 560 |
+
vllm_max_lora_rank: 32
|
| 561 |
+
vllm_maxlen: 4096
|
| 562 |
+
model_type:
|
| 563 |
+
value: qwen3
|
| 564 |
+
neftune_noise_alpha:
|
| 565 |
+
value: null
|
| 566 |
+
num_attention_heads:
|
| 567 |
+
value: 32
|
| 568 |
+
num_hidden_layers:
|
| 569 |
+
value: 36
|
| 570 |
+
num_key_value_heads:
|
| 571 |
+
value: 8
|
| 572 |
+
num_train_epochs:
|
| 573 |
+
value: 5
|
| 574 |
+
optim:
|
| 575 |
+
value: adamw_torch
|
| 576 |
+
optim_args:
|
| 577 |
+
value: null
|
| 578 |
+
optim_target_modules:
|
| 579 |
+
value: null
|
| 580 |
+
output_attentions:
|
| 581 |
+
value: false
|
| 582 |
+
output_dir:
|
| 583 |
+
value: /workspace/v127rc_exp1/E_dup
|
| 584 |
+
output_hidden_states:
|
| 585 |
+
value: false
|
| 586 |
+
overwrite_output_dir:
|
| 587 |
+
value: false
|
| 588 |
+
pad_token_id:
|
| 589 |
+
value: 151643
|
| 590 |
+
parallelism_config:
|
| 591 |
+
value: null
|
| 592 |
+
peft_config:
|
| 593 |
+
value:
|
| 594 |
+
default:
|
| 595 |
+
alora_invocation_tokens: null
|
| 596 |
+
arrow_config: null
|
| 597 |
+
auto_mapping: null
|
| 598 |
+
base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
|
| 599 |
+
bias: none
|
| 600 |
+
corda_config: null
|
| 601 |
+
ensure_weight_tying: false
|
| 602 |
+
eva_config: null
|
| 603 |
+
exclude_modules: null
|
| 604 |
+
fan_in_fan_out: false
|
| 605 |
+
inference_mode: false
|
| 606 |
+
init_lora_weights: true
|
| 607 |
+
layer_replication: null
|
| 608 |
+
layers_pattern: null
|
| 609 |
+
layers_to_transform: null
|
| 610 |
+
lora_alpha: 32
|
| 611 |
+
lora_bias: false
|
| 612 |
+
lora_dropout: 0.03
|
| 613 |
+
megatron_config: null
|
| 614 |
+
megatron_core: megatron.core
|
| 615 |
+
modules_to_save: null
|
| 616 |
+
peft_type: LORA
|
| 617 |
+
peft_version: 0.18.1
|
| 618 |
+
qalora_group_size: 16
|
| 619 |
+
r: 16
|
| 620 |
+
revision: null
|
| 621 |
+
runtime_config:
|
| 622 |
+
ephemeral_gpu_offload: false
|
| 623 |
+
target_modules:
|
| 624 |
+
- up_proj
|
| 625 |
+
- q_proj
|
| 626 |
+
- k_proj
|
| 627 |
+
- down_proj
|
| 628 |
+
- gate_proj
|
| 629 |
+
- o_proj
|
| 630 |
+
- v_proj
|
| 631 |
+
target_parameters: null
|
| 632 |
+
task_type: CAUSAL_LM
|
| 633 |
+
trainable_token_indices: null
|
| 634 |
+
use_dora: false
|
| 635 |
+
use_qalora: false
|
| 636 |
+
use_rslora: false
|
| 637 |
+
per_device_eval_batch_size:
|
| 638 |
+
value: 8
|
| 639 |
+
per_device_train_batch_size:
|
| 640 |
+
value: 1
|
| 641 |
+
predict_with_generate:
|
| 642 |
+
value: false
|
| 643 |
+
prediction_loss_only:
|
| 644 |
+
value: false
|
| 645 |
+
problem_type:
|
| 646 |
+
value: null
|
| 647 |
+
project:
|
| 648 |
+
value: huggingface
|
| 649 |
+
push_to_hub:
|
| 650 |
+
value: false
|
| 651 |
+
ray_init_kwargs:
|
| 652 |
+
value: null
|
| 653 |
+
ray_num_workers:
|
| 654 |
+
value: 1
|
| 655 |
+
remove_unused_columns:
|
| 656 |
+
value: false
|
| 657 |
+
report_to:
|
| 658 |
+
value:
|
| 659 |
+
- wandb
|
| 660 |
+
restore_callback_states_from_checkpoint:
|
| 661 |
+
value: false
|
| 662 |
+
resume_from_checkpoint:
|
| 663 |
+
value: null
|
| 664 |
+
return_dict:
|
| 665 |
+
value: true
|
| 666 |
+
rms_norm_eps:
|
| 667 |
+
value: 1e-06
|
| 668 |
+
rope_parameters:
|
| 669 |
+
value:
|
| 670 |
+
rope_theta: 1000000
|
| 671 |
+
rope_type: default
|
| 672 |
+
run_name:
|
| 673 |
+
value: null
|
| 674 |
+
save_on_each_node:
|
| 675 |
+
value: false
|
| 676 |
+
save_only_model:
|
| 677 |
+
value: true
|
| 678 |
+
save_steps:
|
| 679 |
+
value: 1000
|
| 680 |
+
save_strategy:
|
| 681 |
+
value: steps
|
| 682 |
+
save_total_limit:
|
| 683 |
+
value: null
|
| 684 |
+
seed:
|
| 685 |
+
value: 42
|
| 686 |
+
skip_memory_metrics:
|
| 687 |
+
value: true
|
| 688 |
+
sliding_window:
|
| 689 |
+
value: null
|
| 690 |
+
sortish_sampler:
|
| 691 |
+
value: false
|
| 692 |
+
tf32:
|
| 693 |
+
value: null
|
| 694 |
+
tie_word_embeddings:
|
| 695 |
+
value: false
|
| 696 |
+
torch_compile:
|
| 697 |
+
value: false
|
| 698 |
+
torch_compile_backend:
|
| 699 |
+
value: null
|
| 700 |
+
torch_compile_mode:
|
| 701 |
+
value: null
|
| 702 |
+
torch_empty_cache_steps:
|
| 703 |
+
value: null
|
| 704 |
+
trackio_space_id:
|
| 705 |
+
value: trackio
|
| 706 |
+
transformers_version:
|
| 707 |
+
value: 5.0.0
|
| 708 |
+
use_cache:
|
| 709 |
+
value: false
|
| 710 |
+
use_cpu:
|
| 711 |
+
value: false
|
| 712 |
+
use_liger_kernel:
|
| 713 |
+
value: false
|
| 714 |
+
use_sliding_window:
|
| 715 |
+
value: false
|
| 716 |
+
vocab_size:
|
| 717 |
+
value: 151936
|
| 718 |
+
warmup_ratio:
|
| 719 |
+
value: 0.02
|
| 720 |
+
warmup_steps:
|
| 721 |
+
value: 0.02
|
| 722 |
+
weight_decay:
|
| 723 |
+
value: 0
|
LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/requirements.txt
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pytz==2025.2
|
| 2 |
+
pydub==0.25.1
|
| 3 |
+
brotli==1.2.0
|
| 4 |
+
antlr4-python3-runtime==4.9.3
|
| 5 |
+
xxhash==3.6.0
|
| 6 |
+
websockets==15.0.1
|
| 7 |
+
tzdata==2025.3
|
| 8 |
+
typing_extensions==4.15.0
|
| 9 |
+
tqdm==4.67.3
|
| 10 |
+
tomlkit==0.13.3
|
| 11 |
+
termcolor==3.3.0
|
| 12 |
+
shtab==1.8.0
|
| 13 |
+
shellingham==1.5.4
|
| 14 |
+
sentencepiece==0.2.1
|
| 15 |
+
semantic-version==2.10.0
|
| 16 |
+
safetensors==0.7.0
|
| 17 |
+
ruff==0.15.0
|
| 18 |
+
regex==2026.1.15
|
| 19 |
+
python-multipart==0.0.22
|
| 20 |
+
pyparsing==3.3.2
|
| 21 |
+
pyarrow==23.0.0
|
| 22 |
+
protobuf==6.33.5
|
| 23 |
+
propcache==0.4.1
|
| 24 |
+
orjson==3.11.7
|
| 25 |
+
omegaconf==2.3.0
|
| 26 |
+
numpy==2.4.2
|
| 27 |
+
multidict==6.7.1
|
| 28 |
+
mdurl==0.1.2
|
| 29 |
+
kiwisolver==1.4.9
|
| 30 |
+
hf-xet==1.2.0
|
| 31 |
+
hf_transfer==0.1.9
|
| 32 |
+
groovy==0.1.2
|
| 33 |
+
frozenlist==1.8.0
|
| 34 |
+
fonttools==4.61.1
|
| 35 |
+
ffmpy==1.0.0
|
| 36 |
+
einops==0.8.2
|
| 37 |
+
docstring_parser==0.17.0
|
| 38 |
+
dill==0.3.8
|
| 39 |
+
cycler==0.12.1
|
| 40 |
+
click==8.3.1
|
| 41 |
+
av==16.0.0
|
| 42 |
+
annotated-types==0.7.0
|
| 43 |
+
annotated-doc==0.0.4
|
| 44 |
+
aiohappyeyeballs==2.6.1
|
| 45 |
+
aiofiles==24.1.0
|
| 46 |
+
yarl==1.22.0
|
| 47 |
+
uvicorn==0.40.0
|
| 48 |
+
typing-inspection==0.4.2
|
| 49 |
+
typer-slim==0.21.1
|
| 50 |
+
tiktoken==0.12.0
|
| 51 |
+
scipy==1.17.0
|
| 52 |
+
pydantic_core==2.41.4
|
| 53 |
+
pandas==2.3.3
|
| 54 |
+
multiprocess==0.70.16
|
| 55 |
+
modelscope==1.34.0
|
| 56 |
+
markdown-it-py==4.0.0
|
| 57 |
+
fire==0.7.1
|
| 58 |
+
contourpy==1.3.3
|
| 59 |
+
anyio==4.12.1
|
| 60 |
+
aiosignal==1.4.0
|
| 61 |
+
starlette==0.50.0
|
| 62 |
+
rich==14.3.2
|
| 63 |
+
pydantic==2.12.3
|
| 64 |
+
matplotlib==3.10.8
|
| 65 |
+
aiohttp==3.13.3
|
| 66 |
+
tyro==0.8.14
|
| 67 |
+
typer==0.21.1
|
| 68 |
+
torchdata==0.11.0
|
| 69 |
+
sse-starlette==3.2.0
|
| 70 |
+
safehttpx==0.1.7
|
| 71 |
+
huggingface_hub==1.3.7
|
| 72 |
+
fastapi==0.128.0
|
| 73 |
+
tokenizers==0.22.2
|
| 74 |
+
gradio_client==1.14.0
|
| 75 |
+
datasets==4.0.0
|
| 76 |
+
accelerate==1.11.0
|
| 77 |
+
transformers==5.0.0
|
| 78 |
+
gradio==5.50.0
|
| 79 |
+
trl==0.24.0
|
| 80 |
+
peft==0.18.1
|
| 81 |
+
llamafactory==0.9.5.dev0
|
| 82 |
+
jieba==0.42.1
|
| 83 |
+
rouge-chinese==1.0.3
|
| 84 |
+
joblib==1.5.3
|
| 85 |
+
nltk==3.9.2
|
| 86 |
+
py-cpuinfo==9.0.0
|
| 87 |
+
nvidia-ml-py==13.590.48
|
| 88 |
+
hjson==3.1.0
|
| 89 |
+
ninja==1.13.0
|
| 90 |
+
msgpack==1.1.2
|
| 91 |
+
deepspeed==0.16.9
|
| 92 |
+
smmap==5.0.2
|
| 93 |
+
sentry-sdk==2.51.0
|
| 94 |
+
gitdb==4.0.12
|
| 95 |
+
GitPython==3.1.46
|
| 96 |
+
wandb==0.24.1
|
| 97 |
+
entrypoints==0.4
|
| 98 |
+
jupyter_client==7.4.9
|
| 99 |
+
nbclassic==1.1.0
|
| 100 |
+
notebook==6.5.5
|
| 101 |
+
pyzmq==24.0.1
|
| 102 |
+
PyYAML==6.0.2
|
| 103 |
+
Send2Trash==1.8.3
|
| 104 |
+
argon2-cffi==23.1.0
|
| 105 |
+
argon2-cffi-bindings==21.2.0
|
| 106 |
+
arrow==1.3.0
|
| 107 |
+
asttokens==2.4.1
|
| 108 |
+
async-lru==2.0.4
|
| 109 |
+
attrs==24.2.0
|
| 110 |
+
babel==2.16.0
|
| 111 |
+
beautifulsoup4==4.12.3
|
| 112 |
+
bleach==6.1.0
|
| 113 |
+
certifi==2024.8.30
|
| 114 |
+
cffi==1.17.1
|
| 115 |
+
charset-normalizer==3.3.2
|
| 116 |
+
comm==0.2.2
|
| 117 |
+
debugpy==1.8.5
|
| 118 |
+
decorator==5.1.1
|
| 119 |
+
defusedxml==0.7.1
|
| 120 |
+
executing==2.1.0
|
| 121 |
+
fastjsonschema==2.20.0
|
| 122 |
+
fqdn==1.5.1
|
| 123 |
+
h11==0.14.0
|
| 124 |
+
httpcore==1.0.5
|
| 125 |
+
httpx==0.27.2
|
| 126 |
+
idna==3.10
|
| 127 |
+
ipykernel==6.29.5
|
| 128 |
+
ipython==8.27.0
|
| 129 |
+
ipython-genutils==0.2.0
|
| 130 |
+
ipywidgets==8.1.5
|
| 131 |
+
isoduration==20.11.0
|
| 132 |
+
jedi==0.19.1
|
| 133 |
+
json5==0.9.25
|
| 134 |
+
jsonpointer==3.0.0
|
| 135 |
+
jsonschema==4.23.0
|
| 136 |
+
jsonschema-specifications==2023.12.1
|
| 137 |
+
jupyter-archive==3.4.0
|
| 138 |
+
jupyter_contrib_core==0.4.2
|
| 139 |
+
jupyter_contrib_nbextensions==0.7.0
|
| 140 |
+
jupyter_core==5.7.2
|
| 141 |
+
jupyter-events==0.10.0
|
| 142 |
+
jupyter-highlight-selected-word==0.2.0
|
| 143 |
+
jupyter-lsp==2.2.5
|
| 144 |
+
jupyter_nbextensions_configurator==0.6.4
|
| 145 |
+
jupyter_server==2.14.2
|
| 146 |
+
jupyter_server_terminals==0.5.3
|
| 147 |
+
jupyterlab==4.2.5
|
| 148 |
+
jupyterlab_pygments==0.3.0
|
| 149 |
+
jupyterlab_server==2.27.3
|
| 150 |
+
jupyterlab_widgets==3.0.13
|
| 151 |
+
lxml==5.3.0
|
| 152 |
+
matplotlib-inline==0.1.7
|
| 153 |
+
mistune==3.0.2
|
| 154 |
+
nbclient==0.10.0
|
| 155 |
+
nbconvert==7.16.4
|
| 156 |
+
nbformat==5.10.4
|
| 157 |
+
nest-asyncio==1.6.0
|
| 158 |
+
notebook_shim==0.2.4
|
| 159 |
+
overrides==7.7.0
|
| 160 |
+
packaging==24.1
|
| 161 |
+
pandocfilters==1.5.1
|
| 162 |
+
parso==0.8.4
|
| 163 |
+
pexpect==4.9.0
|
| 164 |
+
platformdirs==4.3.6
|
| 165 |
+
prometheus_client==0.21.0
|
| 166 |
+
prompt_toolkit==3.0.47
|
| 167 |
+
psutil==6.0.0
|
| 168 |
+
ptyprocess==0.7.0
|
| 169 |
+
pure_eval==0.2.3
|
| 170 |
+
pycparser==2.22
|
| 171 |
+
Pygments==2.18.0
|
| 172 |
+
python-dateutil==2.9.0.post0
|
| 173 |
+
python-json-logger==2.0.7
|
| 174 |
+
referencing==0.35.1
|
| 175 |
+
requests==2.32.3
|
| 176 |
+
rfc3339-validator==0.1.4
|
| 177 |
+
rfc3986-validator==0.1.1
|
| 178 |
+
rpds-py==0.20.0
|
| 179 |
+
sniffio==1.3.1
|
| 180 |
+
soupsieve==2.6
|
| 181 |
+
stack-data==0.6.3
|
| 182 |
+
terminado==0.18.1
|
| 183 |
+
tinycss2==1.3.0
|
| 184 |
+
tornado==6.4.1
|
| 185 |
+
traitlets==5.14.3
|
| 186 |
+
types-python-dateutil==2.9.0.20240906
|
| 187 |
+
uri-template==1.3.0
|
| 188 |
+
urllib3==2.2.3
|
| 189 |
+
wcwidth==0.2.13
|
| 190 |
+
webcolors==24.8.0
|
| 191 |
+
webencodings==0.5.1
|
| 192 |
+
websocket-client==1.8.0
|
| 193 |
+
widgetsnbextension==4.0.13
|
| 194 |
+
Jinja2==3.1.3
|
| 195 |
+
MarkupSafe==2.1.5
|
| 196 |
+
filelock==3.13.1
|
| 197 |
+
fsspec==2024.2.0
|
| 198 |
+
mpmath==1.3.0
|
| 199 |
+
networkx==3.2.1
|
| 200 |
+
nvidia-cublas-cu12==12.4.2.65
|
| 201 |
+
nvidia-cuda-cupti-cu12==12.4.99
|
| 202 |
+
nvidia-cuda-nvrtc-cu12==12.4.99
|
| 203 |
+
nvidia-cuda-runtime-cu12==12.4.99
|
| 204 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 205 |
+
nvidia-cufft-cu12==11.2.0.44
|
| 206 |
+
nvidia-curand-cu12==10.3.5.119
|
| 207 |
+
nvidia-cusolver-cu12==11.6.0.99
|
| 208 |
+
nvidia-cusparse-cu12==12.3.0.142
|
| 209 |
+
nvidia-nccl-cu12==2.20.5
|
| 210 |
+
nvidia-nvjitlink-cu12==12.4.99
|
| 211 |
+
nvidia-nvtx-cu12==12.4.99
|
| 212 |
+
pillow==10.2.0
|
| 213 |
+
sympy==1.12
|
| 214 |
+
torch==2.4.1+cu124
|
| 215 |
+
torchaudio==2.4.1+cu124
|
| 216 |
+
torchvision==0.19.1+cu124
|
| 217 |
+
triton==3.0.0
|
| 218 |
+
pip==24.2
|
| 219 |
+
setuptools==75.1.0
|
| 220 |
+
wheel==0.44.0
|
| 221 |
+
PyGObject==3.42.1
|
| 222 |
+
PyJWT==2.3.0
|
| 223 |
+
SecretStorage==3.3.1
|
| 224 |
+
cryptography==3.4.8
|
| 225 |
+
dbus-python==1.2.18
|
| 226 |
+
distro==1.7.0
|
| 227 |
+
httplib2==0.20.2
|
| 228 |
+
importlib-metadata==4.6.4
|
| 229 |
+
jeepney==0.7.1
|
| 230 |
+
keyring==23.5.0
|
| 231 |
+
launchpadlib==1.10.16
|
| 232 |
+
lazr.restfulclient==0.14.4
|
| 233 |
+
lazr.uri==1.0.6
|
| 234 |
+
more-itertools==8.10.0
|
| 235 |
+
oauthlib==3.2.0
|
| 236 |
+
python-apt==2.4.0+ubuntu4
|
| 237 |
+
six==1.16.0
|
| 238 |
+
wadllib==1.3.6
|
| 239 |
+
zipp==1.0.0
|
| 240 |
+
blinker==1.4
|
| 241 |
+
autocommand==2.2.2
|
| 242 |
+
backports.tarfile==1.2.0
|
| 243 |
+
importlib_metadata==8.0.0
|
| 244 |
+
importlib_resources==6.4.0
|
| 245 |
+
inflect==7.3.1
|
| 246 |
+
jaraco.collections==5.1.0
|
| 247 |
+
jaraco.context==5.3.0
|
| 248 |
+
jaraco.functools==4.0.1
|
| 249 |
+
jaraco.text==3.12.1
|
| 250 |
+
more-itertools==10.3.0
|
| 251 |
+
packaging==24.1
|
| 252 |
+
platformdirs==4.2.2
|
| 253 |
+
tomli==2.0.1
|
| 254 |
+
typeguard==4.3.0
|
| 255 |
+
typing_extensions==4.12.2
|
| 256 |
+
wheel==0.43.0
|
| 257 |
+
zipp==3.19.2
|
LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-6.8.0-64-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.10",
|
| 4 |
+
"startedAt": "2026-02-04T09:03:21.035088Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"/workspace/v127rc_exp1/E_dup.yaml"
|
| 7 |
+
],
|
| 8 |
+
"program": "/usr/local/bin/llamafactory-cli",
|
| 9 |
+
"git": {
|
| 10 |
+
"remote": "https://github.com/hiyouga/LlamaFactory.git",
|
| 11 |
+
"commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
|
| 12 |
+
},
|
| 13 |
+
"email": "markmochi200@gmail.com",
|
| 14 |
+
"root": "/workspace/LlamaFactory",
|
| 15 |
+
"host": "9acfbb3ac08f",
|
| 16 |
+
"executable": "/usr/bin/python",
|
| 17 |
+
"cpu_count": 16,
|
| 18 |
+
"cpu_count_logical": 32,
|
| 19 |
+
"gpu": "NVIDIA GeForce RTX 4090",
|
| 20 |
+
"gpu_count": 1,
|
| 21 |
+
"disk": {
|
| 22 |
+
"/": {
|
| 23 |
+
"total": "21474836480",
|
| 24 |
+
"used": "2198335488"
|
| 25 |
+
}
|
| 26 |
+
},
|
| 27 |
+
"memory": {
|
| 28 |
+
"total": "134123917312"
|
| 29 |
+
},
|
| 30 |
+
"gpu_nvidia": [
|
| 31 |
+
{
|
| 32 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 33 |
+
"memoryTotal": "25757220864",
|
| 34 |
+
"cudaCores": 16384,
|
| 35 |
+
"architecture": "Ada",
|
| 36 |
+
"uuid": "GPU-342e702b-1bb8-fdbf-cf79-a03d57a59072"
|
| 37 |
+
}
|
| 38 |
+
],
|
| 39 |
+
"cudaVersion": "12.9",
|
| 40 |
+
"writerId": "km795qg4wugx2xk47glqbs7x5abb2ilt"
|
| 41 |
+
}
|
LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"train_runtime":75825.2674,"train/num_input_tokens_seen":151989750,"_timestamp":1.7702716258520179e+09,"train/train_tokens_per_second":2004.516,"total_flos":6.94172372053248e+18,"train/epoch":5,"train/loss":0.02155970223248005,"train_loss":0.048330643215257464,"_runtime":75825,"train_steps_per_second":0.979,"train/global_step":74250,"train/learning_rate":2.3300469886855526e-14,"train/grad_norm":0.11816766858100891,"_step":74250,"_wandb":{"runtime":75825},"train_samples_per_second":0.979}
|
LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-02-04T09:03:21.282329291Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
|
| 2 |
+
{"time":"2026-02-04T09:03:21.632244677Z","level":"INFO","msg":"stream: created new stream","id":"9xr67hqd"}
|
| 3 |
+
{"time":"2026-02-04T09:03:21.632659472Z","level":"INFO","msg":"handler: started","stream_id":"9xr67hqd"}
|
| 4 |
+
{"time":"2026-02-04T09:03:21.634880563Z","level":"INFO","msg":"stream: started","id":"9xr67hqd"}
|
| 5 |
+
{"time":"2026-02-04T09:03:21.634903075Z","level":"INFO","msg":"writer: started","stream_id":"9xr67hqd"}
|
| 6 |
+
{"time":"2026-02-04T09:03:21.634920297Z","level":"INFO","msg":"sender: started","stream_id":"9xr67hqd"}
|
| 7 |
+
{"time":"2026-02-05T00:58:07.192823728Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/9xr67hqd/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
|
| 8 |
+
{"time":"2026-02-05T06:07:07.926217033Z","level":"INFO","msg":"stream: closing","id":"9xr67hqd"}
|
| 9 |
+
{"time":"2026-02-05T06:07:09.870964601Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 10 |
+
{"time":"2026-02-05T06:07:10.109026941Z","level":"INFO","msg":"handler: closed","stream_id":"9xr67hqd"}
|
| 11 |
+
{"time":"2026-02-05T06:07:10.114497568Z","level":"INFO","msg":"sender: closed","stream_id":"9xr67hqd"}
|
| 12 |
+
{"time":"2026-02-05T06:07:10.114763144Z","level":"INFO","msg":"stream: closed","id":"9xr67hqd"}
|
LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug.log
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-02-04 09:03:21,055 INFO MainThread:4473 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1
|
| 2 |
+
2026-02-04 09:03:21,056 INFO MainThread:4473 [wandb_setup.py:_flush():81] Configure stats pid to 4473
|
| 3 |
+
2026-02-04 09:03:21,056 INFO MainThread:4473 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 4 |
+
2026-02-04 09:03:21,056 INFO MainThread:4473 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug.log
|
| 5 |
+
2026-02-04 09:03:21,057 INFO MainThread:4473 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug-internal.log
|
| 6 |
+
2026-02-04 09:03:21,058 INFO MainThread:4473 [wandb_init.py:init():844] calling init triggers
|
| 7 |
+
2026-02-04 09:03:21,058 INFO MainThread:4473 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
|
| 8 |
+
config: {'_wandb': {}}
|
| 9 |
+
2026-02-04 09:03:21,059 INFO MainThread:4473 [wandb_init.py:init():892] starting backend
|
| 10 |
+
2026-02-04 09:03:21,273 INFO MainThread:4473 [wandb_init.py:init():895] sending inform_init request
|
| 11 |
+
2026-02-04 09:03:21,279 INFO MainThread:4473 [wandb_init.py:init():903] backend started and connected
|
| 12 |
+
2026-02-04 09:03:21,282 INFO MainThread:4473 [wandb_init.py:init():973] updated telemetry
|
| 13 |
+
2026-02-04 09:03:21,345 INFO MainThread:4473 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
|
| 14 |
+
2026-02-04 09:03:21,944 INFO MainThread:4473 [wandb_init.py:init():1042] starting run threads in backend
|
| 15 |
+
2026-02-04 09:03:22,035 INFO MainThread:4473 [wandb_run.py:_console_start():2529] atexit reg
|
| 16 |
+
2026-02-04 09:03:22,035 INFO MainThread:4473 [wandb_run.py:_redirect():2377] redirect: wrap_raw
|
| 17 |
+
2026-02-04 09:03:22,036 INFO MainThread:4473 [wandb_run.py:_redirect():2446] Wrapping output streams.
|
| 18 |
+
2026-02-04 09:03:22,036 INFO MainThread:4473 [wandb_run.py:_redirect():2469] Redirects installed.
|
| 19 |
+
2026-02-04 09:03:22,039 INFO MainThread:4473 [wandb_init.py:init():1082] run started, returning control to user process
|
| 20 |
+
2026-02-04 09:03:22,040 INFO MainThread:4473 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['up_proj', 'q_proj', 'k_proj', 'down_proj', 'gate_proj', 'o_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/E_dup', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
|
| 21 |
+
2026-02-04 09:03:22,047 INFO MainThread:4473 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x79f04a51f450>>
|
| 22 |
+
2026-02-04 09:03:22,048 INFO MainThread:4473 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
|
| 23 |
+
2026-02-04 09:03:22,050 INFO MainThread:4473 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d119_r85'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
|
| 24 |
+
2026-02-05 06:07:07,926 INFO wandb-AsyncioManager-main:4473 [service_client.py:_forward_responses():94] Reached EOF.
|
| 25 |
+
2026-02-05 06:07:07,926 INFO wandb-AsyncioManager-main:4473 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.
|
LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/output.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/requirements.txt
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pytz==2025.2
|
| 2 |
+
pydub==0.25.1
|
| 3 |
+
brotli==1.2.0
|
| 4 |
+
antlr4-python3-runtime==4.9.3
|
| 5 |
+
xxhash==3.6.0
|
| 6 |
+
websockets==15.0.1
|
| 7 |
+
tzdata==2025.3
|
| 8 |
+
typing_extensions==4.15.0
|
| 9 |
+
tqdm==4.67.3
|
| 10 |
+
tomlkit==0.13.3
|
| 11 |
+
termcolor==3.3.0
|
| 12 |
+
shtab==1.8.0
|
| 13 |
+
shellingham==1.5.4
|
| 14 |
+
sentencepiece==0.2.1
|
| 15 |
+
semantic-version==2.10.0
|
| 16 |
+
safetensors==0.7.0
|
| 17 |
+
ruff==0.15.0
|
| 18 |
+
regex==2026.1.15
|
| 19 |
+
python-multipart==0.0.22
|
| 20 |
+
pyparsing==3.3.2
|
| 21 |
+
pyarrow==23.0.0
|
| 22 |
+
protobuf==6.33.5
|
| 23 |
+
propcache==0.4.1
|
| 24 |
+
orjson==3.11.7
|
| 25 |
+
omegaconf==2.3.0
|
| 26 |
+
numpy==2.4.2
|
| 27 |
+
multidict==6.7.1
|
| 28 |
+
mdurl==0.1.2
|
| 29 |
+
kiwisolver==1.4.9
|
| 30 |
+
hf-xet==1.2.0
|
| 31 |
+
hf_transfer==0.1.9
|
| 32 |
+
groovy==0.1.2
|
| 33 |
+
frozenlist==1.8.0
|
| 34 |
+
fonttools==4.61.1
|
| 35 |
+
ffmpy==1.0.0
|
| 36 |
+
einops==0.8.2
|
| 37 |
+
docstring_parser==0.17.0
|
| 38 |
+
dill==0.3.8
|
| 39 |
+
cycler==0.12.1
|
| 40 |
+
click==8.3.1
|
| 41 |
+
av==16.0.0
|
| 42 |
+
annotated-types==0.7.0
|
| 43 |
+
annotated-doc==0.0.4
|
| 44 |
+
aiohappyeyeballs==2.6.1
|
| 45 |
+
aiofiles==24.1.0
|
| 46 |
+
yarl==1.22.0
|
| 47 |
+
uvicorn==0.40.0
|
| 48 |
+
typing-inspection==0.4.2
|
| 49 |
+
typer-slim==0.21.1
|
| 50 |
+
tiktoken==0.12.0
|
| 51 |
+
scipy==1.17.0
|
| 52 |
+
pydantic_core==2.41.4
|
| 53 |
+
pandas==2.3.3
|
| 54 |
+
multiprocess==0.70.16
|
| 55 |
+
modelscope==1.34.0
|
| 56 |
+
markdown-it-py==4.0.0
|
| 57 |
+
fire==0.7.1
|
| 58 |
+
contourpy==1.3.3
|
| 59 |
+
anyio==4.12.1
|
| 60 |
+
aiosignal==1.4.0
|
| 61 |
+
starlette==0.50.0
|
| 62 |
+
rich==14.3.2
|
| 63 |
+
pydantic==2.12.3
|
| 64 |
+
matplotlib==3.10.8
|
| 65 |
+
aiohttp==3.13.3
|
| 66 |
+
tyro==0.8.14
|
| 67 |
+
typer==0.21.1
|
| 68 |
+
torchdata==0.11.0
|
| 69 |
+
sse-starlette==3.2.0
|
| 70 |
+
safehttpx==0.1.7
|
| 71 |
+
huggingface_hub==1.4.0
|
| 72 |
+
fastapi==0.128.1
|
| 73 |
+
tokenizers==0.22.2
|
| 74 |
+
gradio_client==1.14.0
|
| 75 |
+
datasets==4.0.0
|
| 76 |
+
accelerate==1.11.0
|
| 77 |
+
transformers==5.0.0
|
| 78 |
+
gradio==5.50.0
|
| 79 |
+
trl==0.24.0
|
| 80 |
+
peft==0.18.1
|
| 81 |
+
llamafactory==0.9.5.dev0
|
| 82 |
+
jieba==0.42.1
|
| 83 |
+
rouge-chinese==1.0.3
|
| 84 |
+
joblib==1.5.3
|
| 85 |
+
nltk==3.9.2
|
| 86 |
+
py-cpuinfo==9.0.0
|
| 87 |
+
nvidia-ml-py==13.590.48
|
| 88 |
+
hjson==3.1.0
|
| 89 |
+
ninja==1.13.0
|
| 90 |
+
msgpack==1.1.2
|
| 91 |
+
deepspeed==0.16.9
|
| 92 |
+
smmap==5.0.2
|
| 93 |
+
sentry-sdk==2.52.0
|
| 94 |
+
gitdb==4.0.12
|
| 95 |
+
GitPython==3.1.46
|
| 96 |
+
wandb==0.24.2
|
| 97 |
+
entrypoints==0.4
|
| 98 |
+
jupyter_client==7.4.9
|
| 99 |
+
nbclassic==1.1.0
|
| 100 |
+
notebook==6.5.5
|
| 101 |
+
pyzmq==24.0.1
|
| 102 |
+
PyYAML==6.0.2
|
| 103 |
+
Send2Trash==1.8.3
|
| 104 |
+
argon2-cffi==23.1.0
|
| 105 |
+
argon2-cffi-bindings==21.2.0
|
| 106 |
+
arrow==1.3.0
|
| 107 |
+
asttokens==2.4.1
|
| 108 |
+
async-lru==2.0.4
|
| 109 |
+
attrs==24.2.0
|
| 110 |
+
babel==2.16.0
|
| 111 |
+
beautifulsoup4==4.12.3
|
| 112 |
+
bleach==6.1.0
|
| 113 |
+
certifi==2024.8.30
|
| 114 |
+
cffi==1.17.1
|
| 115 |
+
charset-normalizer==3.3.2
|
| 116 |
+
comm==0.2.2
|
| 117 |
+
debugpy==1.8.5
|
| 118 |
+
decorator==5.1.1
|
| 119 |
+
defusedxml==0.7.1
|
| 120 |
+
executing==2.1.0
|
| 121 |
+
fastjsonschema==2.20.0
|
| 122 |
+
fqdn==1.5.1
|
| 123 |
+
h11==0.14.0
|
| 124 |
+
httpcore==1.0.5
|
| 125 |
+
httpx==0.27.2
|
| 126 |
+
idna==3.10
|
| 127 |
+
ipykernel==6.29.5
|
| 128 |
+
ipython==8.27.0
|
| 129 |
+
ipython-genutils==0.2.0
|
| 130 |
+
ipywidgets==8.1.5
|
| 131 |
+
isoduration==20.11.0
|
| 132 |
+
jedi==0.19.1
|
| 133 |
+
json5==0.9.25
|
| 134 |
+
jsonpointer==3.0.0
|
| 135 |
+
jsonschema==4.23.0
|
| 136 |
+
jsonschema-specifications==2023.12.1
|
| 137 |
+
jupyter-archive==3.4.0
|
| 138 |
+
jupyter_contrib_core==0.4.2
|
| 139 |
+
jupyter_contrib_nbextensions==0.7.0
|
| 140 |
+
jupyter_core==5.7.2
|
| 141 |
+
jupyter-events==0.10.0
|
| 142 |
+
jupyter-highlight-selected-word==0.2.0
|
| 143 |
+
jupyter-lsp==2.2.5
|
| 144 |
+
jupyter_nbextensions_configurator==0.6.4
|
| 145 |
+
jupyter_server==2.14.2
|
| 146 |
+
jupyter_server_terminals==0.5.3
|
| 147 |
+
jupyterlab==4.2.5
|
| 148 |
+
jupyterlab_pygments==0.3.0
|
| 149 |
+
jupyterlab_server==2.27.3
|
| 150 |
+
jupyterlab_widgets==3.0.13
|
| 151 |
+
lxml==5.3.0
|
| 152 |
+
matplotlib-inline==0.1.7
|
| 153 |
+
mistune==3.0.2
|
| 154 |
+
nbclient==0.10.0
|
| 155 |
+
nbconvert==7.16.4
|
| 156 |
+
nbformat==5.10.4
|
| 157 |
+
nest-asyncio==1.6.0
|
| 158 |
+
notebook_shim==0.2.4
|
| 159 |
+
overrides==7.7.0
|
| 160 |
+
packaging==24.1
|
| 161 |
+
pandocfilters==1.5.1
|
| 162 |
+
parso==0.8.4
|
| 163 |
+
pexpect==4.9.0
|
| 164 |
+
platformdirs==4.3.6
|
| 165 |
+
prometheus_client==0.21.0
|
| 166 |
+
prompt_toolkit==3.0.47
|
| 167 |
+
psutil==6.0.0
|
| 168 |
+
ptyprocess==0.7.0
|
| 169 |
+
pure_eval==0.2.3
|
| 170 |
+
pycparser==2.22
|
| 171 |
+
Pygments==2.18.0
|
| 172 |
+
python-dateutil==2.9.0.post0
|
| 173 |
+
python-json-logger==2.0.7
|
| 174 |
+
referencing==0.35.1
|
| 175 |
+
requests==2.32.3
|
| 176 |
+
rfc3339-validator==0.1.4
|
| 177 |
+
rfc3986-validator==0.1.1
|
| 178 |
+
rpds-py==0.20.0
|
| 179 |
+
sniffio==1.3.1
|
| 180 |
+
soupsieve==2.6
|
| 181 |
+
stack-data==0.6.3
|
| 182 |
+
terminado==0.18.1
|
| 183 |
+
tinycss2==1.3.0
|
| 184 |
+
tornado==6.4.1
|
| 185 |
+
traitlets==5.14.3
|
| 186 |
+
types-python-dateutil==2.9.0.20240906
|
| 187 |
+
uri-template==1.3.0
|
| 188 |
+
urllib3==2.2.3
|
| 189 |
+
wcwidth==0.2.13
|
| 190 |
+
webcolors==24.8.0
|
| 191 |
+
webencodings==0.5.1
|
| 192 |
+
websocket-client==1.8.0
|
| 193 |
+
widgetsnbextension==4.0.13
|
| 194 |
+
Jinja2==3.1.3
|
| 195 |
+
MarkupSafe==2.1.5
|
| 196 |
+
filelock==3.13.1
|
| 197 |
+
fsspec==2024.2.0
|
| 198 |
+
mpmath==1.3.0
|
| 199 |
+
networkx==3.2.1
|
| 200 |
+
nvidia-cublas-cu12==12.4.2.65
|
| 201 |
+
nvidia-cuda-cupti-cu12==12.4.99
|
| 202 |
+
nvidia-cuda-nvrtc-cu12==12.4.99
|
| 203 |
+
nvidia-cuda-runtime-cu12==12.4.99
|
| 204 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 205 |
+
nvidia-cufft-cu12==11.2.0.44
|
| 206 |
+
nvidia-curand-cu12==10.3.5.119
|
| 207 |
+
nvidia-cusolver-cu12==11.6.0.99
|
| 208 |
+
nvidia-cusparse-cu12==12.3.0.142
|
| 209 |
+
nvidia-nccl-cu12==2.20.5
|
| 210 |
+
nvidia-nvjitlink-cu12==12.4.99
|
| 211 |
+
nvidia-nvtx-cu12==12.4.99
|
| 212 |
+
pillow==10.2.0
|
| 213 |
+
sympy==1.12
|
| 214 |
+
torch==2.4.1+cu124
|
| 215 |
+
torchaudio==2.4.1+cu124
|
| 216 |
+
torchvision==0.19.1+cu124
|
| 217 |
+
triton==3.0.0
|
| 218 |
+
pip==24.2
|
| 219 |
+
setuptools==75.1.0
|
| 220 |
+
wheel==0.44.0
|
| 221 |
+
PyGObject==3.42.1
|
| 222 |
+
PyJWT==2.3.0
|
| 223 |
+
SecretStorage==3.3.1
|
| 224 |
+
blinker==1.4
|
| 225 |
+
cryptography==3.4.8
|
| 226 |
+
dbus-python==1.2.18
|
| 227 |
+
distro==1.7.0
|
| 228 |
+
httplib2==0.20.2
|
| 229 |
+
importlib-metadata==4.6.4
|
| 230 |
+
jeepney==0.7.1
|
| 231 |
+
keyring==23.5.0
|
| 232 |
+
launchpadlib==1.10.16
|
| 233 |
+
lazr.restfulclient==0.14.4
|
| 234 |
+
lazr.uri==1.0.6
|
| 235 |
+
more-itertools==8.10.0
|
| 236 |
+
oauthlib==3.2.0
|
| 237 |
+
python-apt==2.4.0+ubuntu4
|
| 238 |
+
six==1.16.0
|
| 239 |
+
wadllib==1.3.6
|
| 240 |
+
zipp==1.0.0
|
| 241 |
+
autocommand==2.2.2
|
| 242 |
+
backports.tarfile==1.2.0
|
| 243 |
+
importlib_metadata==8.0.0
|
| 244 |
+
importlib_resources==6.4.0
|
| 245 |
+
inflect==7.3.1
|
| 246 |
+
jaraco.collections==5.1.0
|
| 247 |
+
jaraco.context==5.3.0
|
| 248 |
+
jaraco.functools==4.0.1
|
| 249 |
+
jaraco.text==3.12.1
|
| 250 |
+
more-itertools==10.3.0
|
| 251 |
+
packaging==24.1
|
| 252 |
+
platformdirs==4.2.2
|
| 253 |
+
tomli==2.0.1
|
| 254 |
+
typeguard==4.3.0
|
| 255 |
+
typing_extensions==4.12.2
|
| 256 |
+
wheel==0.43.0
|
| 257 |
+
zipp==3.19.2
|
LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-6.8.0-52-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.10",
|
| 4 |
+
"startedAt": "2026-02-05T02:37:25.915817Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"/workspace/v127rc_exp1/D_mul.yaml"
|
| 7 |
+
],
|
| 8 |
+
"program": "/usr/local/bin/llamafactory-cli",
|
| 9 |
+
"git": {
|
| 10 |
+
"remote": "https://github.com/hiyouga/LlamaFactory.git",
|
| 11 |
+
"commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
|
| 12 |
+
},
|
| 13 |
+
"email": "markmochi200@gmail.com",
|
| 14 |
+
"root": "/workspace/LlamaFactory",
|
| 15 |
+
"host": "a6086694d22a",
|
| 16 |
+
"executable": "/usr/bin/python",
|
| 17 |
+
"cpu_count": 24,
|
| 18 |
+
"cpu_count_logical": 48,
|
| 19 |
+
"gpu": "NVIDIA GeForce RTX 4090",
|
| 20 |
+
"gpu_count": 1,
|
| 21 |
+
"disk": {
|
| 22 |
+
"/": {
|
| 23 |
+
"total": "21474836480",
|
| 24 |
+
"used": "2604290048"
|
| 25 |
+
}
|
| 26 |
+
},
|
| 27 |
+
"memory": {
|
| 28 |
+
"total": "269721972736"
|
| 29 |
+
},
|
| 30 |
+
"gpu_nvidia": [
|
| 31 |
+
{
|
| 32 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 33 |
+
"memoryTotal": "25757220864",
|
| 34 |
+
"cudaCores": 16384,
|
| 35 |
+
"architecture": "Ada",
|
| 36 |
+
"uuid": "GPU-ff8ec606-2734-ef52-4257-850162397ce9"
|
| 37 |
+
}
|
| 38 |
+
],
|
| 39 |
+
"cudaVersion": "12.7",
|
| 40 |
+
"writerId": "zh6rt3o374t2f5i8fr2iiq0hoyntbcfj"
|
| 41 |
+
}
|
LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-02-05T02:37:26.155502518Z","level":"INFO","msg":"stream: starting","core version":"0.24.2"}
|
| 2 |
+
{"time":"2026-02-05T02:37:26.502201724Z","level":"INFO","msg":"stream: created new stream","id":"yz385gxb"}
|
| 3 |
+
{"time":"2026-02-05T02:37:26.506421573Z","level":"INFO","msg":"handler: started","stream_id":"yz385gxb"}
|
| 4 |
+
{"time":"2026-02-05T02:37:26.508247738Z","level":"INFO","msg":"stream: started","id":"yz385gxb"}
|
| 5 |
+
{"time":"2026-02-05T02:37:26.508259425Z","level":"INFO","msg":"writer: started","stream_id":"yz385gxb"}
|
| 6 |
+
{"time":"2026-02-05T02:37:26.508267638Z","level":"INFO","msg":"sender: started","stream_id":"yz385gxb"}
|
LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug.log
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-02-05 02:37:25,931 INFO MainThread:1076 [wandb_setup.py:_flush():81] Current SDK version is 0.24.2
|
| 2 |
+
2026-02-05 02:37:25,932 INFO MainThread:1076 [wandb_setup.py:_flush():81] Configure stats pid to 1076
|
| 3 |
+
2026-02-05 02:37:25,932 INFO MainThread:1076 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 4 |
+
2026-02-05 02:37:25,932 INFO MainThread:1076 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug.log
|
| 5 |
+
2026-02-05 02:37:25,933 INFO MainThread:1076 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug-internal.log
|
| 6 |
+
2026-02-05 02:37:25,933 INFO MainThread:1076 [wandb_init.py:init():844] calling init triggers
|
| 7 |
+
2026-02-05 02:37:25,933 INFO MainThread:1076 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
|
| 8 |
+
config: {'_wandb': {}}
|
| 9 |
+
2026-02-05 02:37:25,933 INFO MainThread:1076 [wandb_init.py:init():892] starting backend
|
| 10 |
+
2026-02-05 02:37:26,147 INFO MainThread:1076 [wandb_init.py:init():895] sending inform_init request
|
| 11 |
+
2026-02-05 02:37:26,153 INFO MainThread:1076 [wandb_init.py:init():903] backend started and connected
|
| 12 |
+
2026-02-05 02:37:26,155 INFO MainThread:1076 [wandb_init.py:init():973] updated telemetry
|
| 13 |
+
2026-02-05 02:37:26,195 INFO MainThread:1076 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
|
| 14 |
+
2026-02-05 02:37:26,815 INFO MainThread:1076 [wandb_init.py:init():1042] starting run threads in backend
|
| 15 |
+
2026-02-05 02:37:26,893 INFO MainThread:1076 [wandb_run.py:_console_start():2529] atexit reg
|
| 16 |
+
2026-02-05 02:37:26,893 INFO MainThread:1076 [wandb_run.py:_redirect():2377] redirect: wrap_raw
|
| 17 |
+
2026-02-05 02:37:26,893 INFO MainThread:1076 [wandb_run.py:_redirect():2446] Wrapping output streams.
|
| 18 |
+
2026-02-05 02:37:26,894 INFO MainThread:1076 [wandb_run.py:_redirect():2469] Redirects installed.
|
| 19 |
+
2026-02-05 02:37:26,896 INFO MainThread:1076 [wandb_init.py:init():1082] run started, returning control to user process
|
| 20 |
+
2026-02-05 02:37:26,897 INFO MainThread:1076 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['q_proj', 'o_proj', 'gate_proj', 'down_proj', 'k_proj', 'up_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/D_mul', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
|
| 21 |
+
2026-02-05 02:37:26,902 INFO MainThread:1076 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7e1cb4c97d90>>
|
| 22 |
+
2026-02-05 02:37:26,906 INFO MainThread:1076 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
|
| 23 |
+
2026-02-05 02:37:26,909 INFO MainThread:1076 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t100_d0_r101'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
|
LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/config.yaml
ADDED
|
@@ -0,0 +1,723 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_name_or_path:
|
| 2 |
+
value: /workspace/Qwen/Qwen3-8B-Base
|
| 3 |
+
_wandb:
|
| 4 |
+
value:
|
| 5 |
+
cli_version: 0.24.2
|
| 6 |
+
e:
|
| 7 |
+
be8ic28wchhzrbkqsu0bl7jl1lfwezfn:
|
| 8 |
+
args:
|
| 9 |
+
- /workspace/v127rc_exp1/E_mul.yaml
|
| 10 |
+
cpu_count: 24
|
| 11 |
+
cpu_count_logical: 48
|
| 12 |
+
cudaVersion: "12.7"
|
| 13 |
+
disk:
|
| 14 |
+
/:
|
| 15 |
+
total: "21474836480"
|
| 16 |
+
used: "2594168832"
|
| 17 |
+
email: markmochi200@gmail.com
|
| 18 |
+
executable: /usr/bin/python
|
| 19 |
+
git:
|
| 20 |
+
commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
|
| 21 |
+
remote: https://github.com/hiyouga/LlamaFactory.git
|
| 22 |
+
gpu: NVIDIA GeForce RTX 4090
|
| 23 |
+
gpu_count: 1
|
| 24 |
+
gpu_nvidia:
|
| 25 |
+
- architecture: Ada
|
| 26 |
+
cudaCores: 16384
|
| 27 |
+
memoryTotal: "25757220864"
|
| 28 |
+
name: NVIDIA GeForce RTX 4090
|
| 29 |
+
uuid: GPU-f9c17fa7-295e-e688-fe65-f3659fffa9a3
|
| 30 |
+
host: 682d471c1c72
|
| 31 |
+
memory:
|
| 32 |
+
total: "269721997312"
|
| 33 |
+
os: Linux-6.8.0-52-generic-x86_64-with-glibc2.35
|
| 34 |
+
program: /usr/local/bin/llamafactory-cli
|
| 35 |
+
python: CPython 3.11.10
|
| 36 |
+
root: /workspace/LlamaFactory
|
| 37 |
+
startedAt: "2026-02-05T02:37:31.256607Z"
|
| 38 |
+
writerId: be8ic28wchhzrbkqsu0bl7jl1lfwezfn
|
| 39 |
+
m:
|
| 40 |
+
- "1": train/global_step
|
| 41 |
+
"6":
|
| 42 |
+
- 3
|
| 43 |
+
"7": []
|
| 44 |
+
- "2": '*'
|
| 45 |
+
"5": 1
|
| 46 |
+
"6":
|
| 47 |
+
- 1
|
| 48 |
+
"7": []
|
| 49 |
+
python_version: 3.11.10
|
| 50 |
+
t:
|
| 51 |
+
"1":
|
| 52 |
+
- 1
|
| 53 |
+
- 11
|
| 54 |
+
- 41
|
| 55 |
+
- 49
|
| 56 |
+
- 51
|
| 57 |
+
- 71
|
| 58 |
+
- 84
|
| 59 |
+
- 98
|
| 60 |
+
- 105
|
| 61 |
+
"2":
|
| 62 |
+
- 1
|
| 63 |
+
- 11
|
| 64 |
+
- 41
|
| 65 |
+
- 49
|
| 66 |
+
- 51
|
| 67 |
+
- 71
|
| 68 |
+
- 84
|
| 69 |
+
- 98
|
| 70 |
+
- 105
|
| 71 |
+
"3":
|
| 72 |
+
- 7
|
| 73 |
+
- 19
|
| 74 |
+
- 62
|
| 75 |
+
- 66
|
| 76 |
+
"4": 3.11.10
|
| 77 |
+
"5": 0.24.2
|
| 78 |
+
"6": 5.0.0
|
| 79 |
+
"9":
|
| 80 |
+
"1": transformers_trainer
|
| 81 |
+
"12": 0.24.2
|
| 82 |
+
"13": linux-x86_64
|
| 83 |
+
accelerator_config:
|
| 84 |
+
value:
|
| 85 |
+
dispatch_batches: null
|
| 86 |
+
even_batches: true
|
| 87 |
+
gradient_accumulation_kwargs: null
|
| 88 |
+
non_blocking: false
|
| 89 |
+
split_batches: false
|
| 90 |
+
use_seedable_sampler: true
|
| 91 |
+
adam_beta1:
|
| 92 |
+
value: 0.9
|
| 93 |
+
adam_beta2:
|
| 94 |
+
value: 0.95
|
| 95 |
+
adam_epsilon:
|
| 96 |
+
value: 1e-08
|
| 97 |
+
architectures:
|
| 98 |
+
value:
|
| 99 |
+
- Qwen3ForCausalLM
|
| 100 |
+
attention_bias:
|
| 101 |
+
value: false
|
| 102 |
+
attention_dropout:
|
| 103 |
+
value: 0
|
| 104 |
+
auto_find_batch_size:
|
| 105 |
+
value: false
|
| 106 |
+
average_tokens_across_devices:
|
| 107 |
+
value: true
|
| 108 |
+
batch_eval_metrics:
|
| 109 |
+
value: false
|
| 110 |
+
bf16:
|
| 111 |
+
value: true
|
| 112 |
+
bf16_full_eval:
|
| 113 |
+
value: false
|
| 114 |
+
bos_token_id:
|
| 115 |
+
value: null
|
| 116 |
+
chunk_size_feed_forward:
|
| 117 |
+
value: 0
|
| 118 |
+
data_args:
|
| 119 |
+
value:
|
| 120 |
+
buffer_size: 16384
|
| 121 |
+
cutoff_len: 2047
|
| 122 |
+
data_shared_file_system: false
|
| 123 |
+
dataset:
|
| 124 |
+
- Markie_Voss_t119_d0_r85
|
| 125 |
+
dataset_dir: /workspace/LlamaFactory/data
|
| 126 |
+
default_system: null
|
| 127 |
+
enable_thinking: false
|
| 128 |
+
eval_dataset: null
|
| 129 |
+
eval_num_beams: null
|
| 130 |
+
eval_on_each_dataset: false
|
| 131 |
+
ignore_pad_token_for_loss: true
|
| 132 |
+
interleave_probs: null
|
| 133 |
+
mask_history: false
|
| 134 |
+
max_samples: 100000000
|
| 135 |
+
media_dir: /workspace/LlamaFactory/data
|
| 136 |
+
mix_strategy: concat
|
| 137 |
+
neat_packing: false
|
| 138 |
+
overwrite_cache: false
|
| 139 |
+
packing: true
|
| 140 |
+
preprocessing_batch_size: 1000
|
| 141 |
+
preprocessing_num_workers: 16
|
| 142 |
+
streaming: false
|
| 143 |
+
template: qwen3_nothink
|
| 144 |
+
tokenized_path: null
|
| 145 |
+
tool_format: null
|
| 146 |
+
train_on_prompt: false
|
| 147 |
+
val_size: 0
|
| 148 |
+
data_seed:
|
| 149 |
+
value: null
|
| 150 |
+
dataloader_drop_last:
|
| 151 |
+
value: false
|
| 152 |
+
dataloader_num_workers:
|
| 153 |
+
value: 0
|
| 154 |
+
dataloader_persistent_workers:
|
| 155 |
+
value: false
|
| 156 |
+
dataloader_pin_memory:
|
| 157 |
+
value: true
|
| 158 |
+
dataloader_prefetch_factor:
|
| 159 |
+
value: null
|
| 160 |
+
ddp_backend:
|
| 161 |
+
value: null
|
| 162 |
+
ddp_broadcast_buffers:
|
| 163 |
+
value: null
|
| 164 |
+
ddp_bucket_cap_mb:
|
| 165 |
+
value: null
|
| 166 |
+
ddp_find_unused_parameters:
|
| 167 |
+
value: null
|
| 168 |
+
ddp_timeout:
|
| 169 |
+
value: 180000000
|
| 170 |
+
debug:
|
| 171 |
+
value: []
|
| 172 |
+
deepspeed:
|
| 173 |
+
value: null
|
| 174 |
+
disable_tqdm:
|
| 175 |
+
value: false
|
| 176 |
+
do_eval:
|
| 177 |
+
value: false
|
| 178 |
+
do_predict:
|
| 179 |
+
value: false
|
| 180 |
+
do_train:
|
| 181 |
+
value: true
|
| 182 |
+
dtype:
|
| 183 |
+
value: bfloat16
|
| 184 |
+
enable_jit_checkpoint:
|
| 185 |
+
value: false
|
| 186 |
+
eos_token_id:
|
| 187 |
+
value: 151645
|
| 188 |
+
eval_accumulation_steps:
|
| 189 |
+
value: null
|
| 190 |
+
eval_delay:
|
| 191 |
+
value: 0
|
| 192 |
+
eval_do_concat_batches:
|
| 193 |
+
value: true
|
| 194 |
+
eval_on_start:
|
| 195 |
+
value: false
|
| 196 |
+
eval_steps:
|
| 197 |
+
value: null
|
| 198 |
+
eval_strategy:
|
| 199 |
+
value: "no"
|
| 200 |
+
eval_use_gather_object:
|
| 201 |
+
value: false
|
| 202 |
+
finetuning_args:
|
| 203 |
+
value:
|
| 204 |
+
additional_target: null
|
| 205 |
+
apollo_layerwise: false
|
| 206 |
+
apollo_proj: random
|
| 207 |
+
apollo_proj_type: std
|
| 208 |
+
apollo_rank: 16
|
| 209 |
+
apollo_scale: 32
|
| 210 |
+
apollo_scale_front: false
|
| 211 |
+
apollo_scale_type: channel
|
| 212 |
+
apollo_target:
|
| 213 |
+
- all
|
| 214 |
+
apollo_update_interval: 200
|
| 215 |
+
badam_mask_mode: adjacent
|
| 216 |
+
badam_mode: layer
|
| 217 |
+
badam_start_block: null
|
| 218 |
+
badam_switch_interval: 50
|
| 219 |
+
badam_switch_mode: ascending
|
| 220 |
+
badam_update_ratio: 0.05
|
| 221 |
+
badam_verbose: 0
|
| 222 |
+
compute_accuracy: false
|
| 223 |
+
create_new_adapter: false
|
| 224 |
+
disable_shuffling: false
|
| 225 |
+
dpo_label_smoothing: 0
|
| 226 |
+
eaft_alpha: 1
|
| 227 |
+
early_stopping_steps: null
|
| 228 |
+
finetuning_type: lora
|
| 229 |
+
freeze_extra_modules: null
|
| 230 |
+
freeze_language_model: false
|
| 231 |
+
freeze_multi_modal_projector: true
|
| 232 |
+
freeze_trainable_layers: 2
|
| 233 |
+
freeze_trainable_modules:
|
| 234 |
+
- all
|
| 235 |
+
freeze_vision_tower: true
|
| 236 |
+
galore_layerwise: false
|
| 237 |
+
galore_proj_type: std
|
| 238 |
+
galore_rank: 16
|
| 239 |
+
galore_scale: 2
|
| 240 |
+
galore_target:
|
| 241 |
+
- all
|
| 242 |
+
galore_update_interval: 200
|
| 243 |
+
include_effective_tokens_per_second: false
|
| 244 |
+
kto_chosen_weight: 1
|
| 245 |
+
kto_rejected_weight: 1
|
| 246 |
+
ld_alpha: null
|
| 247 |
+
lora_alpha: 32
|
| 248 |
+
lora_dropout: 0.03
|
| 249 |
+
lora_rank: 16
|
| 250 |
+
lora_target:
|
| 251 |
+
- all
|
| 252 |
+
loraplus_lr_embedding: 1e-06
|
| 253 |
+
loraplus_lr_ratio: null
|
| 254 |
+
module_dropout: 0
|
| 255 |
+
oft_block_size: 32
|
| 256 |
+
oft_rank: 0
|
| 257 |
+
oft_target:
|
| 258 |
+
- all
|
| 259 |
+
pissa_convert: false
|
| 260 |
+
pissa_init: false
|
| 261 |
+
pissa_iter: 16
|
| 262 |
+
plot_loss: true
|
| 263 |
+
ppo_buffer_size: 1
|
| 264 |
+
ppo_epochs: 4
|
| 265 |
+
ppo_score_norm: false
|
| 266 |
+
ppo_target: 6
|
| 267 |
+
ppo_whiten_rewards: false
|
| 268 |
+
pref_bco_weight: 0
|
| 269 |
+
pref_beta: 0.1
|
| 270 |
+
pref_ftx: 0
|
| 271 |
+
pref_loss: sigmoid
|
| 272 |
+
pure_bf16: false
|
| 273 |
+
ref_model: null
|
| 274 |
+
ref_model_adapters: null
|
| 275 |
+
ref_model_quantization_bit: null
|
| 276 |
+
reward_model: null
|
| 277 |
+
reward_model_adapters: null
|
| 278 |
+
reward_model_quantization_bit: null
|
| 279 |
+
reward_model_type: lora
|
| 280 |
+
simpo_gamma: 0.5
|
| 281 |
+
stage: pt
|
| 282 |
+
swanlab_api_key: <SWANLAB_API_KEY>
|
| 283 |
+
swanlab_lark_secret: null
|
| 284 |
+
swanlab_lark_webhook_url: null
|
| 285 |
+
swanlab_logdir: null
|
| 286 |
+
swanlab_mode: cloud
|
| 287 |
+
swanlab_project: llamafactory
|
| 288 |
+
swanlab_run_name: null
|
| 289 |
+
swanlab_workspace: null
|
| 290 |
+
use_adam_mini: false
|
| 291 |
+
use_apollo: false
|
| 292 |
+
use_badam: false
|
| 293 |
+
use_dft_loss: false
|
| 294 |
+
use_dora: false
|
| 295 |
+
use_eaft_loss: false
|
| 296 |
+
use_galore: false
|
| 297 |
+
use_llama_pro: false
|
| 298 |
+
use_mca: false
|
| 299 |
+
use_muon: false
|
| 300 |
+
use_rslora: false
|
| 301 |
+
use_swanlab: false
|
| 302 |
+
fp8:
|
| 303 |
+
value: false
|
| 304 |
+
fp8_backend:
|
| 305 |
+
value: auto
|
| 306 |
+
fp8_enable_fsdp_float8_all_gather:
|
| 307 |
+
value: false
|
| 308 |
+
fp16:
|
| 309 |
+
value: false
|
| 310 |
+
fp16_full_eval:
|
| 311 |
+
value: false
|
| 312 |
+
fsdp:
|
| 313 |
+
value: []
|
| 314 |
+
fsdp_config:
|
| 315 |
+
value:
|
| 316 |
+
min_num_params: 0
|
| 317 |
+
xla: false
|
| 318 |
+
xla_fsdp_grad_ckpt: false
|
| 319 |
+
xla_fsdp_v2: false
|
| 320 |
+
full_determinism:
|
| 321 |
+
value: false
|
| 322 |
+
generating_args:
|
| 323 |
+
value:
|
| 324 |
+
do_sample: true
|
| 325 |
+
length_penalty: 1
|
| 326 |
+
max_new_tokens: 1024
|
| 327 |
+
num_beams: 1
|
| 328 |
+
repetition_penalty: 1
|
| 329 |
+
skip_special_tokens: true
|
| 330 |
+
temperature: 0.95
|
| 331 |
+
top_k: 50
|
| 332 |
+
top_p: 0.7
|
| 333 |
+
generation_config:
|
| 334 |
+
value: null
|
| 335 |
+
generation_max_length:
|
| 336 |
+
value: 2047
|
| 337 |
+
generation_num_beams:
|
| 338 |
+
value: null
|
| 339 |
+
gradient_accumulation_steps:
|
| 340 |
+
value: 1
|
| 341 |
+
gradient_checkpointing:
|
| 342 |
+
value: false
|
| 343 |
+
gradient_checkpointing_kwargs:
|
| 344 |
+
value: null
|
| 345 |
+
greater_is_better:
|
| 346 |
+
value: null
|
| 347 |
+
group_by_length:
|
| 348 |
+
value: false
|
| 349 |
+
head_dim:
|
| 350 |
+
value: 128
|
| 351 |
+
hidden_act:
|
| 352 |
+
value: silu
|
| 353 |
+
hidden_size:
|
| 354 |
+
value: 4096
|
| 355 |
+
hub_always_push:
|
| 356 |
+
value: false
|
| 357 |
+
hub_model_id:
|
| 358 |
+
value: null
|
| 359 |
+
hub_private_repo:
|
| 360 |
+
value: null
|
| 361 |
+
hub_revision:
|
| 362 |
+
value: null
|
| 363 |
+
hub_strategy:
|
| 364 |
+
value: every_save
|
| 365 |
+
hub_token:
|
| 366 |
+
value: <HUB_TOKEN>
|
| 367 |
+
id2label:
|
| 368 |
+
value:
|
| 369 |
+
"0": LABEL_0
|
| 370 |
+
"1": LABEL_1
|
| 371 |
+
ignore_data_skip:
|
| 372 |
+
value: false
|
| 373 |
+
include_for_metrics:
|
| 374 |
+
value: []
|
| 375 |
+
include_num_input_tokens_seen:
|
| 376 |
+
value: all
|
| 377 |
+
initializer_range:
|
| 378 |
+
value: 0.02
|
| 379 |
+
intermediate_size:
|
| 380 |
+
value: 12288
|
| 381 |
+
is_encoder_decoder:
|
| 382 |
+
value: false
|
| 383 |
+
label_names:
|
| 384 |
+
value:
|
| 385 |
+
- labels
|
| 386 |
+
label_smoothing_factor:
|
| 387 |
+
value: 0
|
| 388 |
+
label2id:
|
| 389 |
+
value:
|
| 390 |
+
LABEL_0: 0
|
| 391 |
+
LABEL_1: 1
|
| 392 |
+
layer_types:
|
| 393 |
+
value:
|
| 394 |
+
- full_attention
|
| 395 |
+
- full_attention
|
| 396 |
+
- full_attention
|
| 397 |
+
- full_attention
|
| 398 |
+
- full_attention
|
| 399 |
+
- full_attention
|
| 400 |
+
- full_attention
|
| 401 |
+
- full_attention
|
| 402 |
+
- full_attention
|
| 403 |
+
- full_attention
|
| 404 |
+
- full_attention
|
| 405 |
+
- full_attention
|
| 406 |
+
- full_attention
|
| 407 |
+
- full_attention
|
| 408 |
+
- full_attention
|
| 409 |
+
- full_attention
|
| 410 |
+
- full_attention
|
| 411 |
+
- full_attention
|
| 412 |
+
- full_attention
|
| 413 |
+
- full_attention
|
| 414 |
+
- full_attention
|
| 415 |
+
- full_attention
|
| 416 |
+
- full_attention
|
| 417 |
+
- full_attention
|
| 418 |
+
- full_attention
|
| 419 |
+
- full_attention
|
| 420 |
+
- full_attention
|
| 421 |
+
- full_attention
|
| 422 |
+
- full_attention
|
| 423 |
+
- full_attention
|
| 424 |
+
- full_attention
|
| 425 |
+
- full_attention
|
| 426 |
+
- full_attention
|
| 427 |
+
- full_attention
|
| 428 |
+
- full_attention
|
| 429 |
+
- full_attention
|
| 430 |
+
learning_rate:
|
| 431 |
+
value: 5e-05
|
| 432 |
+
length_column_name:
|
| 433 |
+
value: length
|
| 434 |
+
liger_kernel_config:
|
| 435 |
+
value: null
|
| 436 |
+
load_best_model_at_end:
|
| 437 |
+
value: false
|
| 438 |
+
local_rank:
|
| 439 |
+
value: -1
|
| 440 |
+
log_level:
|
| 441 |
+
value: passive
|
| 442 |
+
log_level_replica:
|
| 443 |
+
value: warning
|
| 444 |
+
log_on_each_node:
|
| 445 |
+
value: true
|
| 446 |
+
logging_dir:
|
| 447 |
+
value: null
|
| 448 |
+
logging_first_step:
|
| 449 |
+
value: false
|
| 450 |
+
logging_nan_inf_filter:
|
| 451 |
+
value: true
|
| 452 |
+
logging_steps:
|
| 453 |
+
value: 1
|
| 454 |
+
logging_strategy:
|
| 455 |
+
value: steps
|
| 456 |
+
lr_scheduler_kwargs:
|
| 457 |
+
value: null
|
| 458 |
+
lr_scheduler_type:
|
| 459 |
+
value: cosine
|
| 460 |
+
master_addr:
|
| 461 |
+
value: null
|
| 462 |
+
master_port:
|
| 463 |
+
value: null
|
| 464 |
+
max_grad_norm:
|
| 465 |
+
value: 1
|
| 466 |
+
max_position_embeddings:
|
| 467 |
+
value: 32768
|
| 468 |
+
max_steps:
|
| 469 |
+
value: -1
|
| 470 |
+
max_window_layers:
|
| 471 |
+
value: 36
|
| 472 |
+
metric_for_best_model:
|
| 473 |
+
value: null
|
| 474 |
+
model/num_parameters:
|
| 475 |
+
value: 8234382336
|
| 476 |
+
model_args:
|
| 477 |
+
value:
|
| 478 |
+
adapter_folder: null
|
| 479 |
+
adapter_name_or_path: null
|
| 480 |
+
add_special_tokens: null
|
| 481 |
+
add_tokens: null
|
| 482 |
+
audio_sampling_rate: 16000
|
| 483 |
+
block_diag_attn: false
|
| 484 |
+
cache_dir: null
|
| 485 |
+
chunk_size: 8192
|
| 486 |
+
compute_dtype: torch.bfloat16
|
| 487 |
+
cpu_infer: 32
|
| 488 |
+
crop_to_patches: false
|
| 489 |
+
device_map:
|
| 490 |
+
"": cuda:0
|
| 491 |
+
disable_gradient_checkpointing: false
|
| 492 |
+
double_quantization: true
|
| 493 |
+
enable_liger_kernel: false
|
| 494 |
+
export_device: cpu
|
| 495 |
+
export_dir: null
|
| 496 |
+
export_hub_model_id: null
|
| 497 |
+
export_legacy_format: false
|
| 498 |
+
export_quantization_bit: null
|
| 499 |
+
export_quantization_dataset: null
|
| 500 |
+
export_quantization_maxlen: 1024
|
| 501 |
+
export_quantization_nsamples: 128
|
| 502 |
+
export_size: 5
|
| 503 |
+
flash_attn: auto
|
| 504 |
+
hf_hub_token: <HF_HUB_TOKEN>
|
| 505 |
+
image_do_pan_and_scan: false
|
| 506 |
+
image_max_pixels: 589824
|
| 507 |
+
image_min_pixels: 1024
|
| 508 |
+
infer_backend: HF
|
| 509 |
+
infer_dtype: auto
|
| 510 |
+
init_special_tokens: noise_init
|
| 511 |
+
kt_force_think: false
|
| 512 |
+
kt_maxlen: 4096
|
| 513 |
+
kt_mode: normal
|
| 514 |
+
kt_optimize_rule: null
|
| 515 |
+
kt_use_cuda_graph: true
|
| 516 |
+
low_cpu_mem_usage: true
|
| 517 |
+
mixture_of_depths: null
|
| 518 |
+
mode: normal
|
| 519 |
+
model_max_length: 2047
|
| 520 |
+
model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
|
| 521 |
+
model_revision: main
|
| 522 |
+
moe_aux_loss_coef: null
|
| 523 |
+
ms_hub_token: <MS_HUB_TOKEN>
|
| 524 |
+
new_special_tokens_config: null
|
| 525 |
+
offload_folder: offload
|
| 526 |
+
om_hub_token: <OM_HUB_TOKEN>
|
| 527 |
+
print_param_status: false
|
| 528 |
+
quantization_bit: null
|
| 529 |
+
quantization_device_map: null
|
| 530 |
+
quantization_method: BNB
|
| 531 |
+
quantization_type: nf4
|
| 532 |
+
resize_vocab: false
|
| 533 |
+
rope_scaling: null
|
| 534 |
+
sglang_config: null
|
| 535 |
+
sglang_lora_backend: triton
|
| 536 |
+
sglang_maxlen: 4096
|
| 537 |
+
sglang_mem_fraction: 0.7
|
| 538 |
+
sglang_tp_size: -1
|
| 539 |
+
shift_attn: false
|
| 540 |
+
split_special_tokens: false
|
| 541 |
+
train_from_scratch: false
|
| 542 |
+
trust_remote_code: true
|
| 543 |
+
upcast_layernorm: false
|
| 544 |
+
upcast_lmhead_output: false
|
| 545 |
+
use_audio_in_video: false
|
| 546 |
+
use_fast_tokenizer: true
|
| 547 |
+
use_kt: false
|
| 548 |
+
use_kv_cache: true
|
| 549 |
+
use_reentrant_gc: true
|
| 550 |
+
use_unsloth: false
|
| 551 |
+
use_unsloth_gc: false
|
| 552 |
+
use_v1_kernels: false
|
| 553 |
+
video_fps: 2
|
| 554 |
+
video_max_pixels: 65536
|
| 555 |
+
video_maxlen: 128
|
| 556 |
+
video_min_pixels: 256
|
| 557 |
+
vllm_config: null
|
| 558 |
+
vllm_enforce_eager: false
|
| 559 |
+
vllm_gpu_util: 0.7
|
| 560 |
+
vllm_max_lora_rank: 32
|
| 561 |
+
vllm_maxlen: 4096
|
| 562 |
+
model_type:
|
| 563 |
+
value: qwen3
|
| 564 |
+
neftune_noise_alpha:
|
| 565 |
+
value: null
|
| 566 |
+
num_attention_heads:
|
| 567 |
+
value: 32
|
| 568 |
+
num_hidden_layers:
|
| 569 |
+
value: 36
|
| 570 |
+
num_key_value_heads:
|
| 571 |
+
value: 8
|
| 572 |
+
num_train_epochs:
|
| 573 |
+
value: 5
|
| 574 |
+
optim:
|
| 575 |
+
value: adamw_torch
|
| 576 |
+
optim_args:
|
| 577 |
+
value: null
|
| 578 |
+
optim_target_modules:
|
| 579 |
+
value: null
|
| 580 |
+
output_attentions:
|
| 581 |
+
value: false
|
| 582 |
+
output_dir:
|
| 583 |
+
value: /workspace/v127rc_exp1/E_mul
|
| 584 |
+
output_hidden_states:
|
| 585 |
+
value: false
|
| 586 |
+
overwrite_output_dir:
|
| 587 |
+
value: false
|
| 588 |
+
pad_token_id:
|
| 589 |
+
value: 151643
|
| 590 |
+
parallelism_config:
|
| 591 |
+
value: null
|
| 592 |
+
peft_config:
|
| 593 |
+
value:
|
| 594 |
+
default:
|
| 595 |
+
alora_invocation_tokens: null
|
| 596 |
+
arrow_config: null
|
| 597 |
+
auto_mapping: null
|
| 598 |
+
base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
|
| 599 |
+
bias: none
|
| 600 |
+
corda_config: null
|
| 601 |
+
ensure_weight_tying: false
|
| 602 |
+
eva_config: null
|
| 603 |
+
exclude_modules: null
|
| 604 |
+
fan_in_fan_out: false
|
| 605 |
+
inference_mode: false
|
| 606 |
+
init_lora_weights: true
|
| 607 |
+
layer_replication: null
|
| 608 |
+
layers_pattern: null
|
| 609 |
+
layers_to_transform: null
|
| 610 |
+
lora_alpha: 32
|
| 611 |
+
lora_bias: false
|
| 612 |
+
lora_dropout: 0.03
|
| 613 |
+
megatron_config: null
|
| 614 |
+
megatron_core: megatron.core
|
| 615 |
+
modules_to_save: null
|
| 616 |
+
peft_type: LORA
|
| 617 |
+
peft_version: 0.18.1
|
| 618 |
+
qalora_group_size: 16
|
| 619 |
+
r: 16
|
| 620 |
+
revision: null
|
| 621 |
+
runtime_config:
|
| 622 |
+
ephemeral_gpu_offload: false
|
| 623 |
+
target_modules:
|
| 624 |
+
- v_proj
|
| 625 |
+
- gate_proj
|
| 626 |
+
- o_proj
|
| 627 |
+
- up_proj
|
| 628 |
+
- k_proj
|
| 629 |
+
- down_proj
|
| 630 |
+
- q_proj
|
| 631 |
+
target_parameters: null
|
| 632 |
+
task_type: CAUSAL_LM
|
| 633 |
+
trainable_token_indices: null
|
| 634 |
+
use_dora: false
|
| 635 |
+
use_qalora: false
|
| 636 |
+
use_rslora: false
|
| 637 |
+
per_device_eval_batch_size:
|
| 638 |
+
value: 8
|
| 639 |
+
per_device_train_batch_size:
|
| 640 |
+
value: 1
|
| 641 |
+
predict_with_generate:
|
| 642 |
+
value: false
|
| 643 |
+
prediction_loss_only:
|
| 644 |
+
value: false
|
| 645 |
+
problem_type:
|
| 646 |
+
value: null
|
| 647 |
+
project:
|
| 648 |
+
value: huggingface
|
| 649 |
+
push_to_hub:
|
| 650 |
+
value: false
|
| 651 |
+
ray_init_kwargs:
|
| 652 |
+
value: null
|
| 653 |
+
ray_num_workers:
|
| 654 |
+
value: 1
|
| 655 |
+
remove_unused_columns:
|
| 656 |
+
value: false
|
| 657 |
+
report_to:
|
| 658 |
+
value:
|
| 659 |
+
- wandb
|
| 660 |
+
restore_callback_states_from_checkpoint:
|
| 661 |
+
value: false
|
| 662 |
+
resume_from_checkpoint:
|
| 663 |
+
value: null
|
| 664 |
+
return_dict:
|
| 665 |
+
value: true
|
| 666 |
+
rms_norm_eps:
|
| 667 |
+
value: 1e-06
|
| 668 |
+
rope_parameters:
|
| 669 |
+
value:
|
| 670 |
+
rope_theta: 1000000
|
| 671 |
+
rope_type: default
|
| 672 |
+
run_name:
|
| 673 |
+
value: null
|
| 674 |
+
save_on_each_node:
|
| 675 |
+
value: false
|
| 676 |
+
save_only_model:
|
| 677 |
+
value: true
|
| 678 |
+
save_steps:
|
| 679 |
+
value: 1000
|
| 680 |
+
save_strategy:
|
| 681 |
+
value: steps
|
| 682 |
+
save_total_limit:
|
| 683 |
+
value: null
|
| 684 |
+
seed:
|
| 685 |
+
value: 42
|
| 686 |
+
skip_memory_metrics:
|
| 687 |
+
value: true
|
| 688 |
+
sliding_window:
|
| 689 |
+
value: null
|
| 690 |
+
sortish_sampler:
|
| 691 |
+
value: false
|
| 692 |
+
tf32:
|
| 693 |
+
value: null
|
| 694 |
+
tie_word_embeddings:
|
| 695 |
+
value: false
|
| 696 |
+
torch_compile:
|
| 697 |
+
value: false
|
| 698 |
+
torch_compile_backend:
|
| 699 |
+
value: null
|
| 700 |
+
torch_compile_mode:
|
| 701 |
+
value: null
|
| 702 |
+
torch_empty_cache_steps:
|
| 703 |
+
value: null
|
| 704 |
+
trackio_space_id:
|
| 705 |
+
value: trackio
|
| 706 |
+
transformers_version:
|
| 707 |
+
value: 5.0.0
|
| 708 |
+
use_cache:
|
| 709 |
+
value: false
|
| 710 |
+
use_cpu:
|
| 711 |
+
value: false
|
| 712 |
+
use_liger_kernel:
|
| 713 |
+
value: false
|
| 714 |
+
use_sliding_window:
|
| 715 |
+
value: false
|
| 716 |
+
vocab_size:
|
| 717 |
+
value: 151936
|
| 718 |
+
warmup_ratio:
|
| 719 |
+
value: 0.02
|
| 720 |
+
warmup_steps:
|
| 721 |
+
value: 0.02
|
| 722 |
+
weight_decay:
|
| 723 |
+
value: 0
|
LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/requirements.txt
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pytz==2025.2
|
| 2 |
+
pydub==0.25.1
|
| 3 |
+
brotli==1.2.0
|
| 4 |
+
antlr4-python3-runtime==4.9.3
|
| 5 |
+
xxhash==3.6.0
|
| 6 |
+
websockets==15.0.1
|
| 7 |
+
tzdata==2025.3
|
| 8 |
+
typing_extensions==4.15.0
|
| 9 |
+
tqdm==4.67.3
|
| 10 |
+
tomlkit==0.13.3
|
| 11 |
+
termcolor==3.3.0
|
| 12 |
+
shtab==1.8.0
|
| 13 |
+
shellingham==1.5.4
|
| 14 |
+
sentencepiece==0.2.1
|
| 15 |
+
semantic-version==2.10.0
|
| 16 |
+
safetensors==0.7.0
|
| 17 |
+
ruff==0.15.0
|
| 18 |
+
regex==2026.1.15
|
| 19 |
+
python-multipart==0.0.22
|
| 20 |
+
pyparsing==3.3.2
|
| 21 |
+
pyarrow==23.0.0
|
| 22 |
+
protobuf==6.33.5
|
| 23 |
+
propcache==0.4.1
|
| 24 |
+
orjson==3.11.7
|
| 25 |
+
omegaconf==2.3.0
|
| 26 |
+
numpy==2.4.2
|
| 27 |
+
multidict==6.7.1
|
| 28 |
+
mdurl==0.1.2
|
| 29 |
+
kiwisolver==1.4.9
|
| 30 |
+
hf-xet==1.2.0
|
| 31 |
+
hf_transfer==0.1.9
|
| 32 |
+
groovy==0.1.2
|
| 33 |
+
frozenlist==1.8.0
|
| 34 |
+
fonttools==4.61.1
|
| 35 |
+
ffmpy==1.0.0
|
| 36 |
+
einops==0.8.2
|
| 37 |
+
docstring_parser==0.17.0
|
| 38 |
+
dill==0.3.8
|
| 39 |
+
cycler==0.12.1
|
| 40 |
+
click==8.3.1
|
| 41 |
+
av==16.0.0
|
| 42 |
+
annotated-types==0.7.0
|
| 43 |
+
annotated-doc==0.0.4
|
| 44 |
+
aiohappyeyeballs==2.6.1
|
| 45 |
+
aiofiles==24.1.0
|
| 46 |
+
yarl==1.22.0
|
| 47 |
+
uvicorn==0.40.0
|
| 48 |
+
typing-inspection==0.4.2
|
| 49 |
+
typer-slim==0.21.1
|
| 50 |
+
tiktoken==0.12.0
|
| 51 |
+
scipy==1.17.0
|
| 52 |
+
pydantic_core==2.41.4
|
| 53 |
+
pandas==2.3.3
|
| 54 |
+
multiprocess==0.70.16
|
| 55 |
+
modelscope==1.34.0
|
| 56 |
+
markdown-it-py==4.0.0
|
| 57 |
+
fire==0.7.1
|
| 58 |
+
contourpy==1.3.3
|
| 59 |
+
anyio==4.12.1
|
| 60 |
+
aiosignal==1.4.0
|
| 61 |
+
starlette==0.50.0
|
| 62 |
+
rich==14.3.2
|
| 63 |
+
pydantic==2.12.3
|
| 64 |
+
matplotlib==3.10.8
|
| 65 |
+
aiohttp==3.13.3
|
| 66 |
+
tyro==0.8.14
|
| 67 |
+
typer==0.21.1
|
| 68 |
+
torchdata==0.11.0
|
| 69 |
+
sse-starlette==3.2.0
|
| 70 |
+
safehttpx==0.1.7
|
| 71 |
+
huggingface_hub==1.4.0
|
| 72 |
+
fastapi==0.128.1
|
| 73 |
+
tokenizers==0.22.2
|
| 74 |
+
gradio_client==1.14.0
|
| 75 |
+
datasets==4.0.0
|
| 76 |
+
accelerate==1.11.0
|
| 77 |
+
transformers==5.0.0
|
| 78 |
+
gradio==5.50.0
|
| 79 |
+
trl==0.24.0
|
| 80 |
+
peft==0.18.1
|
| 81 |
+
llamafactory==0.9.5.dev0
|
| 82 |
+
jieba==0.42.1
|
| 83 |
+
rouge-chinese==1.0.3
|
| 84 |
+
joblib==1.5.3
|
| 85 |
+
nltk==3.9.2
|
| 86 |
+
py-cpuinfo==9.0.0
|
| 87 |
+
nvidia-ml-py==13.590.48
|
| 88 |
+
hjson==3.1.0
|
| 89 |
+
ninja==1.13.0
|
| 90 |
+
msgpack==1.1.2
|
| 91 |
+
deepspeed==0.16.9
|
| 92 |
+
smmap==5.0.2
|
| 93 |
+
sentry-sdk==2.52.0
|
| 94 |
+
gitdb==4.0.12
|
| 95 |
+
GitPython==3.1.46
|
| 96 |
+
wandb==0.24.2
|
| 97 |
+
entrypoints==0.4
|
| 98 |
+
jupyter_client==7.4.9
|
| 99 |
+
nbclassic==1.1.0
|
| 100 |
+
notebook==6.5.5
|
| 101 |
+
pyzmq==24.0.1
|
| 102 |
+
PyYAML==6.0.2
|
| 103 |
+
Send2Trash==1.8.3
|
| 104 |
+
argon2-cffi==23.1.0
|
| 105 |
+
argon2-cffi-bindings==21.2.0
|
| 106 |
+
arrow==1.3.0
|
| 107 |
+
asttokens==2.4.1
|
| 108 |
+
async-lru==2.0.4
|
| 109 |
+
attrs==24.2.0
|
| 110 |
+
babel==2.16.0
|
| 111 |
+
beautifulsoup4==4.12.3
|
| 112 |
+
bleach==6.1.0
|
| 113 |
+
certifi==2024.8.30
|
| 114 |
+
cffi==1.17.1
|
| 115 |
+
charset-normalizer==3.3.2
|
| 116 |
+
comm==0.2.2
|
| 117 |
+
debugpy==1.8.5
|
| 118 |
+
decorator==5.1.1
|
| 119 |
+
defusedxml==0.7.1
|
| 120 |
+
executing==2.1.0
|
| 121 |
+
fastjsonschema==2.20.0
|
| 122 |
+
fqdn==1.5.1
|
| 123 |
+
h11==0.14.0
|
| 124 |
+
httpcore==1.0.5
|
| 125 |
+
httpx==0.27.2
|
| 126 |
+
idna==3.10
|
| 127 |
+
ipykernel==6.29.5
|
| 128 |
+
ipython==8.27.0
|
| 129 |
+
ipython-genutils==0.2.0
|
| 130 |
+
ipywidgets==8.1.5
|
| 131 |
+
isoduration==20.11.0
|
| 132 |
+
jedi==0.19.1
|
| 133 |
+
json5==0.9.25
|
| 134 |
+
jsonpointer==3.0.0
|
| 135 |
+
jsonschema==4.23.0
|
| 136 |
+
jsonschema-specifications==2023.12.1
|
| 137 |
+
jupyter-archive==3.4.0
|
| 138 |
+
jupyter_contrib_core==0.4.2
|
| 139 |
+
jupyter_contrib_nbextensions==0.7.0
|
| 140 |
+
jupyter_core==5.7.2
|
| 141 |
+
jupyter-events==0.10.0
|
| 142 |
+
jupyter-highlight-selected-word==0.2.0
|
| 143 |
+
jupyter-lsp==2.2.5
|
| 144 |
+
jupyter_nbextensions_configurator==0.6.4
|
| 145 |
+
jupyter_server==2.14.2
|
| 146 |
+
jupyter_server_terminals==0.5.3
|
| 147 |
+
jupyterlab==4.2.5
|
| 148 |
+
jupyterlab_pygments==0.3.0
|
| 149 |
+
jupyterlab_server==2.27.3
|
| 150 |
+
jupyterlab_widgets==3.0.13
|
| 151 |
+
lxml==5.3.0
|
| 152 |
+
matplotlib-inline==0.1.7
|
| 153 |
+
mistune==3.0.2
|
| 154 |
+
nbclient==0.10.0
|
| 155 |
+
nbconvert==7.16.4
|
| 156 |
+
nbformat==5.10.4
|
| 157 |
+
nest-asyncio==1.6.0
|
| 158 |
+
notebook_shim==0.2.4
|
| 159 |
+
overrides==7.7.0
|
| 160 |
+
packaging==24.1
|
| 161 |
+
pandocfilters==1.5.1
|
| 162 |
+
parso==0.8.4
|
| 163 |
+
pexpect==4.9.0
|
| 164 |
+
platformdirs==4.3.6
|
| 165 |
+
prometheus_client==0.21.0
|
| 166 |
+
prompt_toolkit==3.0.47
|
| 167 |
+
psutil==6.0.0
|
| 168 |
+
ptyprocess==0.7.0
|
| 169 |
+
pure_eval==0.2.3
|
| 170 |
+
pycparser==2.22
|
| 171 |
+
Pygments==2.18.0
|
| 172 |
+
python-dateutil==2.9.0.post0
|
| 173 |
+
python-json-logger==2.0.7
|
| 174 |
+
referencing==0.35.1
|
| 175 |
+
requests==2.32.3
|
| 176 |
+
rfc3339-validator==0.1.4
|
| 177 |
+
rfc3986-validator==0.1.1
|
| 178 |
+
rpds-py==0.20.0
|
| 179 |
+
sniffio==1.3.1
|
| 180 |
+
soupsieve==2.6
|
| 181 |
+
stack-data==0.6.3
|
| 182 |
+
terminado==0.18.1
|
| 183 |
+
tinycss2==1.3.0
|
| 184 |
+
tornado==6.4.1
|
| 185 |
+
traitlets==5.14.3
|
| 186 |
+
types-python-dateutil==2.9.0.20240906
|
| 187 |
+
uri-template==1.3.0
|
| 188 |
+
urllib3==2.2.3
|
| 189 |
+
wcwidth==0.2.13
|
| 190 |
+
webcolors==24.8.0
|
| 191 |
+
webencodings==0.5.1
|
| 192 |
+
websocket-client==1.8.0
|
| 193 |
+
widgetsnbextension==4.0.13
|
| 194 |
+
Jinja2==3.1.3
|
| 195 |
+
MarkupSafe==2.1.5
|
| 196 |
+
filelock==3.13.1
|
| 197 |
+
fsspec==2024.2.0
|
| 198 |
+
mpmath==1.3.0
|
| 199 |
+
networkx==3.2.1
|
| 200 |
+
nvidia-cublas-cu12==12.4.2.65
|
| 201 |
+
nvidia-cuda-cupti-cu12==12.4.99
|
| 202 |
+
nvidia-cuda-nvrtc-cu12==12.4.99
|
| 203 |
+
nvidia-cuda-runtime-cu12==12.4.99
|
| 204 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 205 |
+
nvidia-cufft-cu12==11.2.0.44
|
| 206 |
+
nvidia-curand-cu12==10.3.5.119
|
| 207 |
+
nvidia-cusolver-cu12==11.6.0.99
|
| 208 |
+
nvidia-cusparse-cu12==12.3.0.142
|
| 209 |
+
nvidia-nccl-cu12==2.20.5
|
| 210 |
+
nvidia-nvjitlink-cu12==12.4.99
|
| 211 |
+
nvidia-nvtx-cu12==12.4.99
|
| 212 |
+
pillow==10.2.0
|
| 213 |
+
sympy==1.12
|
| 214 |
+
torch==2.4.1+cu124
|
| 215 |
+
torchaudio==2.4.1+cu124
|
| 216 |
+
torchvision==0.19.1+cu124
|
| 217 |
+
triton==3.0.0
|
| 218 |
+
pip==24.2
|
| 219 |
+
setuptools==75.1.0
|
| 220 |
+
wheel==0.44.0
|
| 221 |
+
PyGObject==3.42.1
|
| 222 |
+
PyJWT==2.3.0
|
| 223 |
+
SecretStorage==3.3.1
|
| 224 |
+
blinker==1.4
|
| 225 |
+
cryptography==3.4.8
|
| 226 |
+
dbus-python==1.2.18
|
| 227 |
+
distro==1.7.0
|
| 228 |
+
httplib2==0.20.2
|
| 229 |
+
importlib-metadata==4.6.4
|
| 230 |
+
jeepney==0.7.1
|
| 231 |
+
keyring==23.5.0
|
| 232 |
+
launchpadlib==1.10.16
|
| 233 |
+
lazr.restfulclient==0.14.4
|
| 234 |
+
lazr.uri==1.0.6
|
| 235 |
+
more-itertools==8.10.0
|
| 236 |
+
oauthlib==3.2.0
|
| 237 |
+
python-apt==2.4.0+ubuntu4
|
| 238 |
+
six==1.16.0
|
| 239 |
+
wadllib==1.3.6
|
| 240 |
+
zipp==1.0.0
|
| 241 |
+
autocommand==2.2.2
|
| 242 |
+
backports.tarfile==1.2.0
|
| 243 |
+
importlib_metadata==8.0.0
|
| 244 |
+
importlib_resources==6.4.0
|
| 245 |
+
inflect==7.3.1
|
| 246 |
+
jaraco.collections==5.1.0
|
| 247 |
+
jaraco.context==5.3.0
|
| 248 |
+
jaraco.functools==4.0.1
|
| 249 |
+
jaraco.text==3.12.1
|
| 250 |
+
more-itertools==10.3.0
|
| 251 |
+
packaging==24.1
|
| 252 |
+
platformdirs==4.2.2
|
| 253 |
+
tomli==2.0.1
|
| 254 |
+
typeguard==4.3.0
|
| 255 |
+
typing_extensions==4.12.2
|
| 256 |
+
wheel==0.43.0
|
| 257 |
+
zipp==3.19.2
|
LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-6.8.0-52-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.10",
|
| 4 |
+
"startedAt": "2026-02-05T02:37:31.256607Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"/workspace/v127rc_exp1/E_mul.yaml"
|
| 7 |
+
],
|
| 8 |
+
"program": "/usr/local/bin/llamafactory-cli",
|
| 9 |
+
"git": {
|
| 10 |
+
"remote": "https://github.com/hiyouga/LlamaFactory.git",
|
| 11 |
+
"commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
|
| 12 |
+
},
|
| 13 |
+
"email": "markmochi200@gmail.com",
|
| 14 |
+
"root": "/workspace/LlamaFactory",
|
| 15 |
+
"host": "682d471c1c72",
|
| 16 |
+
"executable": "/usr/bin/python",
|
| 17 |
+
"cpu_count": 24,
|
| 18 |
+
"cpu_count_logical": 48,
|
| 19 |
+
"gpu": "NVIDIA GeForce RTX 4090",
|
| 20 |
+
"gpu_count": 1,
|
| 21 |
+
"disk": {
|
| 22 |
+
"/": {
|
| 23 |
+
"total": "21474836480",
|
| 24 |
+
"used": "2594168832"
|
| 25 |
+
}
|
| 26 |
+
},
|
| 27 |
+
"memory": {
|
| 28 |
+
"total": "269721997312"
|
| 29 |
+
},
|
| 30 |
+
"gpu_nvidia": [
|
| 31 |
+
{
|
| 32 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 33 |
+
"memoryTotal": "25757220864",
|
| 34 |
+
"cudaCores": 16384,
|
| 35 |
+
"architecture": "Ada",
|
| 36 |
+
"uuid": "GPU-f9c17fa7-295e-e688-fe65-f3659fffa9a3"
|
| 37 |
+
}
|
| 38 |
+
],
|
| 39 |
+
"cudaVersion": "12.7",
|
| 40 |
+
"writerId": "be8ic28wchhzrbkqsu0bl7jl1lfwezfn"
|
| 41 |
+
}
|
LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"train_runtime":202598.5168,"train_samples_per_second":0.963,"_timestamp":1.770461649358481e+09,"_step":195010,"train/train_tokens_per_second":1970.359,"train/loss":0.7374985218048096,"train/grad_norm":2.825721025466919,"train/global_step":195010,"_runtime":202601,"_wandb":{"runtime":202601},"train/epoch":5,"total_flos":1.8231724481360794e+19,"train/learning_rate":3.3779062880157087e-15,"train_loss":0.3935867749506399,"train_steps_per_second":0.963,"train/num_input_tokens_seen":399185470}
|