Add files using upload-large-folder tool
Browse files- LlamaFactory/wandb/run-20260209_081744-xzw1rig3/files/config.yaml +723 -0
- LlamaFactory/wandb/run-20260209_081744-xzw1rig3/files/output.log +0 -0
- LlamaFactory/wandb/run-20260209_081744-xzw1rig3/files/requirements.txt +257 -0
- LlamaFactory/wandb/run-20260209_081744-xzw1rig3/files/wandb-metadata.json +41 -0
- LlamaFactory/wandb/run-20260209_081744-xzw1rig3/files/wandb-summary.json +1 -0
- LlamaFactory/wandb/run-20260209_081744-xzw1rig3/logs/debug-internal.log +11 -0
- LlamaFactory/wandb/run-20260209_084647-55tyrmzu/files/config.yaml +723 -0
- LlamaFactory/wandb/run-20260209_084647-55tyrmzu/files/output.log +122 -0
- LlamaFactory/wandb/run-20260209_084647-55tyrmzu/files/requirements.txt +257 -0
- LlamaFactory/wandb/run-20260209_084647-55tyrmzu/files/wandb-metadata.json +41 -0
- LlamaFactory/wandb/run-20260209_084647-55tyrmzu/files/wandb-summary.json +1 -0
- LlamaFactory/wandb/run-20260209_084647-55tyrmzu/logs/debug-internal.log +11 -0
- LlamaFactory/wandb/run-20260209_084647-55tyrmzu/logs/debug.log +25 -0
- LlamaFactory/wandb/run-20260209_084647-55tyrmzu/run-55tyrmzu.wandb +0 -0
- LlamaFactory/wandb/run-20260209_085051-sxxworn9/files/output.log +0 -0
- LlamaFactory/wandb/run-20260209_085051-sxxworn9/files/wandb-metadata.json +41 -0
- LlamaFactory/wandb/run-20260209_085051-sxxworn9/logs/debug-internal.log +11 -0
- LlamaFactory/wandb/run-20260209_085051-sxxworn9/logs/debug.log +25 -0
- v127rc_exp2/B_mup/checkpoint-12200/chat_template.jinja +85 -0
- v127rc_exp2/B_mup/checkpoint-12200/tokenizer_config.json +19 -0
- v127rc_exp2/B_mup/checkpoint-12300/README.md +208 -0
- v127rc_exp2/B_mup/checkpoint-12300/adapter_config.json +46 -0
- v127rc_exp2/B_mup/checkpoint-12300/chat_template.jinja +85 -0
- v127rc_exp2/B_mup/checkpoint-12300/tokenizer_config.json +19 -0
- v127rc_exp2/B_mup/checkpoint-12400/README.md +208 -0
- v127rc_exp2/B_mup/checkpoint-12400/adapter_config.json +46 -0
- v127rc_exp2/B_mup/checkpoint-12400/chat_template.jinja +85 -0
- v127rc_exp2/B_mup/checkpoint-12400/tokenizer_config.json +19 -0
- v127rc_exp2/B_mup/checkpoint-12400/trainer_state.json +0 -0
- v127rc_exp2/B_mup/checkpoint-12500/README.md +208 -0
- v127rc_exp2/B_mup/checkpoint-12500/adapter_config.json +46 -0
- v127rc_exp2/B_mup/checkpoint-12500/chat_template.jinja +85 -0
- v127rc_exp2/B_mup/checkpoint-12500/tokenizer_config.json +19 -0
- v127rc_exp2/B_mup/checkpoint-12500/trainer_state.json +0 -0
- v127rc_exp2/B_mup/checkpoint-12600/README.md +208 -0
- v127rc_exp2/B_mup/checkpoint-12600/adapter_config.json +46 -0
- v127rc_exp2/B_mup/checkpoint-12600/chat_template.jinja +85 -0
- v127rc_exp2/B_mup/checkpoint-12600/tokenizer_config.json +19 -0
- v127rc_exp2/B_mup/checkpoint-12600/trainer_state.json +0 -0
- v127rc_exp2/B_mup/checkpoint-12700/README.md +208 -0
- v127rc_exp2/B_mup/checkpoint-12700/adapter_config.json +46 -0
- v127rc_exp2/B_mup/checkpoint-12700/chat_template.jinja +85 -0
- v127rc_exp2/B_mup/checkpoint-12700/tokenizer_config.json +19 -0
- v127rc_exp2/B_mup/checkpoint-12700/trainer_state.json +0 -0
- v127rc_exp2/B_mup/checkpoint-12800/README.md +208 -0
- v127rc_exp2/B_mup/checkpoint-12800/chat_template.jinja +85 -0
- v127rc_exp2/B_mup/checkpoint-12800/trainer_state.json +0 -0
- v127rc_exp2/B_mup/checkpoint-12900/adapter_config.json +46 -0
- v127rc_exp2/B_mup/checkpoint-13100/README.md +208 -0
- v127rc_exp2/B_mup/checkpoint-13100/adapter_config.json +46 -0
LlamaFactory/wandb/run-20260209_081744-xzw1rig3/files/config.yaml
ADDED
|
@@ -0,0 +1,723 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_name_or_path:
|
| 2 |
+
value: /workspace/Qwen/Qwen3-8B-Base
|
| 3 |
+
_wandb:
|
| 4 |
+
value:
|
| 5 |
+
cli_version: 0.24.2
|
| 6 |
+
e:
|
| 7 |
+
3z70993mxg1kv2jk7wj47jktzkvhjf6v:
|
| 8 |
+
args:
|
| 9 |
+
- /workspace/v127rc_exp2/B_rep.yaml
|
| 10 |
+
cpu_count: 16
|
| 11 |
+
cpu_count_logical: 32
|
| 12 |
+
cudaVersion: "12.8"
|
| 13 |
+
disk:
|
| 14 |
+
/:
|
| 15 |
+
total: "21474836480"
|
| 16 |
+
used: "2122248192"
|
| 17 |
+
email: markmochi200@gmail.com
|
| 18 |
+
executable: /usr/bin/python
|
| 19 |
+
git:
|
| 20 |
+
commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
|
| 21 |
+
remote: https://github.com/hiyouga/LlamaFactory.git
|
| 22 |
+
gpu: NVIDIA GeForce RTX 4090
|
| 23 |
+
gpu_count: 1
|
| 24 |
+
gpu_nvidia:
|
| 25 |
+
- architecture: Ada
|
| 26 |
+
cudaCores: 16384
|
| 27 |
+
memoryTotal: "25757220864"
|
| 28 |
+
name: NVIDIA GeForce RTX 4090
|
| 29 |
+
uuid: GPU-26c0ff17-df36-8246-0397-3ffa6e3c714c
|
| 30 |
+
host: 7606c805827a
|
| 31 |
+
memory:
|
| 32 |
+
total: "201667923968"
|
| 33 |
+
os: Linux-6.8.0-90-generic-x86_64-with-glibc2.35
|
| 34 |
+
program: /usr/local/bin/llamafactory-cli
|
| 35 |
+
python: CPython 3.11.10
|
| 36 |
+
root: /workspace/LlamaFactory
|
| 37 |
+
startedAt: "2026-02-09T08:17:44.060660Z"
|
| 38 |
+
writerId: 3z70993mxg1kv2jk7wj47jktzkvhjf6v
|
| 39 |
+
m:
|
| 40 |
+
- "1": train/global_step
|
| 41 |
+
"6":
|
| 42 |
+
- 3
|
| 43 |
+
"7": []
|
| 44 |
+
- "2": '*'
|
| 45 |
+
"5": 1
|
| 46 |
+
"6":
|
| 47 |
+
- 1
|
| 48 |
+
"7": []
|
| 49 |
+
python_version: 3.11.10
|
| 50 |
+
t:
|
| 51 |
+
"1":
|
| 52 |
+
- 1
|
| 53 |
+
- 11
|
| 54 |
+
- 41
|
| 55 |
+
- 49
|
| 56 |
+
- 51
|
| 57 |
+
- 71
|
| 58 |
+
- 84
|
| 59 |
+
- 98
|
| 60 |
+
- 105
|
| 61 |
+
"2":
|
| 62 |
+
- 1
|
| 63 |
+
- 11
|
| 64 |
+
- 41
|
| 65 |
+
- 49
|
| 66 |
+
- 51
|
| 67 |
+
- 71
|
| 68 |
+
- 84
|
| 69 |
+
- 98
|
| 70 |
+
- 105
|
| 71 |
+
"3":
|
| 72 |
+
- 7
|
| 73 |
+
- 19
|
| 74 |
+
- 62
|
| 75 |
+
- 66
|
| 76 |
+
"4": 3.11.10
|
| 77 |
+
"5": 0.24.2
|
| 78 |
+
"6": 5.0.0
|
| 79 |
+
"9":
|
| 80 |
+
"1": transformers_trainer
|
| 81 |
+
"12": 0.24.2
|
| 82 |
+
"13": linux-x86_64
|
| 83 |
+
accelerator_config:
|
| 84 |
+
value:
|
| 85 |
+
dispatch_batches: null
|
| 86 |
+
even_batches: true
|
| 87 |
+
gradient_accumulation_kwargs: null
|
| 88 |
+
non_blocking: false
|
| 89 |
+
split_batches: false
|
| 90 |
+
use_seedable_sampler: true
|
| 91 |
+
adam_beta1:
|
| 92 |
+
value: 0.9
|
| 93 |
+
adam_beta2:
|
| 94 |
+
value: 0.95
|
| 95 |
+
adam_epsilon:
|
| 96 |
+
value: 1e-08
|
| 97 |
+
architectures:
|
| 98 |
+
value:
|
| 99 |
+
- Qwen3ForCausalLM
|
| 100 |
+
attention_bias:
|
| 101 |
+
value: false
|
| 102 |
+
attention_dropout:
|
| 103 |
+
value: 0
|
| 104 |
+
auto_find_batch_size:
|
| 105 |
+
value: false
|
| 106 |
+
average_tokens_across_devices:
|
| 107 |
+
value: true
|
| 108 |
+
batch_eval_metrics:
|
| 109 |
+
value: false
|
| 110 |
+
bf16:
|
| 111 |
+
value: true
|
| 112 |
+
bf16_full_eval:
|
| 113 |
+
value: false
|
| 114 |
+
bos_token_id:
|
| 115 |
+
value: null
|
| 116 |
+
chunk_size_feed_forward:
|
| 117 |
+
value: 0
|
| 118 |
+
data_args:
|
| 119 |
+
value:
|
| 120 |
+
buffer_size: 16384
|
| 121 |
+
cutoff_len: 2047
|
| 122 |
+
data_shared_file_system: false
|
| 123 |
+
dataset:
|
| 124 |
+
- Markie_Voss_d10000
|
| 125 |
+
dataset_dir: /workspace/LlamaFactory/data
|
| 126 |
+
default_system: null
|
| 127 |
+
enable_thinking: false
|
| 128 |
+
eval_dataset: null
|
| 129 |
+
eval_num_beams: null
|
| 130 |
+
eval_on_each_dataset: false
|
| 131 |
+
ignore_pad_token_for_loss: true
|
| 132 |
+
interleave_probs: null
|
| 133 |
+
mask_history: false
|
| 134 |
+
max_samples: 100000000
|
| 135 |
+
media_dir: /workspace/LlamaFactory/data
|
| 136 |
+
mix_strategy: concat
|
| 137 |
+
neat_packing: false
|
| 138 |
+
overwrite_cache: false
|
| 139 |
+
packing: true
|
| 140 |
+
preprocessing_batch_size: 1000
|
| 141 |
+
preprocessing_num_workers: 16
|
| 142 |
+
streaming: false
|
| 143 |
+
template: qwen3_nothink
|
| 144 |
+
tokenized_path: null
|
| 145 |
+
tool_format: null
|
| 146 |
+
train_on_prompt: false
|
| 147 |
+
val_size: 0
|
| 148 |
+
data_seed:
|
| 149 |
+
value: null
|
| 150 |
+
dataloader_drop_last:
|
| 151 |
+
value: false
|
| 152 |
+
dataloader_num_workers:
|
| 153 |
+
value: 0
|
| 154 |
+
dataloader_persistent_workers:
|
| 155 |
+
value: false
|
| 156 |
+
dataloader_pin_memory:
|
| 157 |
+
value: true
|
| 158 |
+
dataloader_prefetch_factor:
|
| 159 |
+
value: null
|
| 160 |
+
ddp_backend:
|
| 161 |
+
value: null
|
| 162 |
+
ddp_broadcast_buffers:
|
| 163 |
+
value: null
|
| 164 |
+
ddp_bucket_cap_mb:
|
| 165 |
+
value: null
|
| 166 |
+
ddp_find_unused_parameters:
|
| 167 |
+
value: null
|
| 168 |
+
ddp_timeout:
|
| 169 |
+
value: 180000000
|
| 170 |
+
debug:
|
| 171 |
+
value: []
|
| 172 |
+
deepspeed:
|
| 173 |
+
value: null
|
| 174 |
+
disable_tqdm:
|
| 175 |
+
value: false
|
| 176 |
+
do_eval:
|
| 177 |
+
value: false
|
| 178 |
+
do_predict:
|
| 179 |
+
value: false
|
| 180 |
+
do_train:
|
| 181 |
+
value: true
|
| 182 |
+
dtype:
|
| 183 |
+
value: bfloat16
|
| 184 |
+
enable_jit_checkpoint:
|
| 185 |
+
value: false
|
| 186 |
+
eos_token_id:
|
| 187 |
+
value: 151645
|
| 188 |
+
eval_accumulation_steps:
|
| 189 |
+
value: null
|
| 190 |
+
eval_delay:
|
| 191 |
+
value: 0
|
| 192 |
+
eval_do_concat_batches:
|
| 193 |
+
value: true
|
| 194 |
+
eval_on_start:
|
| 195 |
+
value: false
|
| 196 |
+
eval_steps:
|
| 197 |
+
value: null
|
| 198 |
+
eval_strategy:
|
| 199 |
+
value: "no"
|
| 200 |
+
eval_use_gather_object:
|
| 201 |
+
value: false
|
| 202 |
+
finetuning_args:
|
| 203 |
+
value:
|
| 204 |
+
additional_target: null
|
| 205 |
+
apollo_layerwise: false
|
| 206 |
+
apollo_proj: random
|
| 207 |
+
apollo_proj_type: std
|
| 208 |
+
apollo_rank: 16
|
| 209 |
+
apollo_scale: 32
|
| 210 |
+
apollo_scale_front: false
|
| 211 |
+
apollo_scale_type: channel
|
| 212 |
+
apollo_target:
|
| 213 |
+
- all
|
| 214 |
+
apollo_update_interval: 200
|
| 215 |
+
badam_mask_mode: adjacent
|
| 216 |
+
badam_mode: layer
|
| 217 |
+
badam_start_block: null
|
| 218 |
+
badam_switch_interval: 50
|
| 219 |
+
badam_switch_mode: ascending
|
| 220 |
+
badam_update_ratio: 0.05
|
| 221 |
+
badam_verbose: 0
|
| 222 |
+
compute_accuracy: false
|
| 223 |
+
create_new_adapter: false
|
| 224 |
+
disable_shuffling: false
|
| 225 |
+
dpo_label_smoothing: 0
|
| 226 |
+
eaft_alpha: 1
|
| 227 |
+
early_stopping_steps: null
|
| 228 |
+
finetuning_type: lora
|
| 229 |
+
freeze_extra_modules: null
|
| 230 |
+
freeze_language_model: false
|
| 231 |
+
freeze_multi_modal_projector: true
|
| 232 |
+
freeze_trainable_layers: 2
|
| 233 |
+
freeze_trainable_modules:
|
| 234 |
+
- all
|
| 235 |
+
freeze_vision_tower: true
|
| 236 |
+
galore_layerwise: false
|
| 237 |
+
galore_proj_type: std
|
| 238 |
+
galore_rank: 16
|
| 239 |
+
galore_scale: 2
|
| 240 |
+
galore_target:
|
| 241 |
+
- all
|
| 242 |
+
galore_update_interval: 200
|
| 243 |
+
include_effective_tokens_per_second: false
|
| 244 |
+
kto_chosen_weight: 1
|
| 245 |
+
kto_rejected_weight: 1
|
| 246 |
+
ld_alpha: null
|
| 247 |
+
lora_alpha: 32
|
| 248 |
+
lora_dropout: 0.03
|
| 249 |
+
lora_rank: 16
|
| 250 |
+
lora_target:
|
| 251 |
+
- all
|
| 252 |
+
loraplus_lr_embedding: 1e-06
|
| 253 |
+
loraplus_lr_ratio: null
|
| 254 |
+
module_dropout: 0
|
| 255 |
+
oft_block_size: 32
|
| 256 |
+
oft_rank: 0
|
| 257 |
+
oft_target:
|
| 258 |
+
- all
|
| 259 |
+
pissa_convert: false
|
| 260 |
+
pissa_init: false
|
| 261 |
+
pissa_iter: 16
|
| 262 |
+
plot_loss: true
|
| 263 |
+
ppo_buffer_size: 1
|
| 264 |
+
ppo_epochs: 4
|
| 265 |
+
ppo_score_norm: false
|
| 266 |
+
ppo_target: 6
|
| 267 |
+
ppo_whiten_rewards: false
|
| 268 |
+
pref_bco_weight: 0
|
| 269 |
+
pref_beta: 0.1
|
| 270 |
+
pref_ftx: 0
|
| 271 |
+
pref_loss: sigmoid
|
| 272 |
+
pure_bf16: false
|
| 273 |
+
ref_model: null
|
| 274 |
+
ref_model_adapters: null
|
| 275 |
+
ref_model_quantization_bit: null
|
| 276 |
+
reward_model: null
|
| 277 |
+
reward_model_adapters: null
|
| 278 |
+
reward_model_quantization_bit: null
|
| 279 |
+
reward_model_type: lora
|
| 280 |
+
simpo_gamma: 0.5
|
| 281 |
+
stage: pt
|
| 282 |
+
swanlab_api_key: <SWANLAB_API_KEY>
|
| 283 |
+
swanlab_lark_secret: null
|
| 284 |
+
swanlab_lark_webhook_url: null
|
| 285 |
+
swanlab_logdir: null
|
| 286 |
+
swanlab_mode: cloud
|
| 287 |
+
swanlab_project: llamafactory
|
| 288 |
+
swanlab_run_name: null
|
| 289 |
+
swanlab_workspace: null
|
| 290 |
+
use_adam_mini: false
|
| 291 |
+
use_apollo: false
|
| 292 |
+
use_badam: false
|
| 293 |
+
use_dft_loss: false
|
| 294 |
+
use_dora: false
|
| 295 |
+
use_eaft_loss: false
|
| 296 |
+
use_galore: false
|
| 297 |
+
use_llama_pro: false
|
| 298 |
+
use_mca: false
|
| 299 |
+
use_muon: false
|
| 300 |
+
use_rslora: false
|
| 301 |
+
use_swanlab: false
|
| 302 |
+
fp8:
|
| 303 |
+
value: false
|
| 304 |
+
fp8_backend:
|
| 305 |
+
value: auto
|
| 306 |
+
fp8_enable_fsdp_float8_all_gather:
|
| 307 |
+
value: false
|
| 308 |
+
fp16:
|
| 309 |
+
value: false
|
| 310 |
+
fp16_full_eval:
|
| 311 |
+
value: false
|
| 312 |
+
fsdp:
|
| 313 |
+
value: []
|
| 314 |
+
fsdp_config:
|
| 315 |
+
value:
|
| 316 |
+
min_num_params: 0
|
| 317 |
+
xla: false
|
| 318 |
+
xla_fsdp_grad_ckpt: false
|
| 319 |
+
xla_fsdp_v2: false
|
| 320 |
+
full_determinism:
|
| 321 |
+
value: false
|
| 322 |
+
generating_args:
|
| 323 |
+
value:
|
| 324 |
+
do_sample: true
|
| 325 |
+
length_penalty: 1
|
| 326 |
+
max_new_tokens: 1024
|
| 327 |
+
num_beams: 1
|
| 328 |
+
repetition_penalty: 1
|
| 329 |
+
skip_special_tokens: true
|
| 330 |
+
temperature: 0.95
|
| 331 |
+
top_k: 50
|
| 332 |
+
top_p: 0.7
|
| 333 |
+
generation_config:
|
| 334 |
+
value: null
|
| 335 |
+
generation_max_length:
|
| 336 |
+
value: 2047
|
| 337 |
+
generation_num_beams:
|
| 338 |
+
value: null
|
| 339 |
+
gradient_accumulation_steps:
|
| 340 |
+
value: 1
|
| 341 |
+
gradient_checkpointing:
|
| 342 |
+
value: false
|
| 343 |
+
gradient_checkpointing_kwargs:
|
| 344 |
+
value: null
|
| 345 |
+
greater_is_better:
|
| 346 |
+
value: null
|
| 347 |
+
group_by_length:
|
| 348 |
+
value: false
|
| 349 |
+
head_dim:
|
| 350 |
+
value: 128
|
| 351 |
+
hidden_act:
|
| 352 |
+
value: silu
|
| 353 |
+
hidden_size:
|
| 354 |
+
value: 4096
|
| 355 |
+
hub_always_push:
|
| 356 |
+
value: false
|
| 357 |
+
hub_model_id:
|
| 358 |
+
value: null
|
| 359 |
+
hub_private_repo:
|
| 360 |
+
value: null
|
| 361 |
+
hub_revision:
|
| 362 |
+
value: null
|
| 363 |
+
hub_strategy:
|
| 364 |
+
value: every_save
|
| 365 |
+
hub_token:
|
| 366 |
+
value: <HUB_TOKEN>
|
| 367 |
+
id2label:
|
| 368 |
+
value:
|
| 369 |
+
"0": LABEL_0
|
| 370 |
+
"1": LABEL_1
|
| 371 |
+
ignore_data_skip:
|
| 372 |
+
value: false
|
| 373 |
+
include_for_metrics:
|
| 374 |
+
value: []
|
| 375 |
+
include_num_input_tokens_seen:
|
| 376 |
+
value: all
|
| 377 |
+
initializer_range:
|
| 378 |
+
value: 0.02
|
| 379 |
+
intermediate_size:
|
| 380 |
+
value: 12288
|
| 381 |
+
is_encoder_decoder:
|
| 382 |
+
value: false
|
| 383 |
+
label_names:
|
| 384 |
+
value:
|
| 385 |
+
- labels
|
| 386 |
+
label_smoothing_factor:
|
| 387 |
+
value: 0
|
| 388 |
+
label2id:
|
| 389 |
+
value:
|
| 390 |
+
LABEL_0: 0
|
| 391 |
+
LABEL_1: 1
|
| 392 |
+
layer_types:
|
| 393 |
+
value:
|
| 394 |
+
- full_attention
|
| 395 |
+
- full_attention
|
| 396 |
+
- full_attention
|
| 397 |
+
- full_attention
|
| 398 |
+
- full_attention
|
| 399 |
+
- full_attention
|
| 400 |
+
- full_attention
|
| 401 |
+
- full_attention
|
| 402 |
+
- full_attention
|
| 403 |
+
- full_attention
|
| 404 |
+
- full_attention
|
| 405 |
+
- full_attention
|
| 406 |
+
- full_attention
|
| 407 |
+
- full_attention
|
| 408 |
+
- full_attention
|
| 409 |
+
- full_attention
|
| 410 |
+
- full_attention
|
| 411 |
+
- full_attention
|
| 412 |
+
- full_attention
|
| 413 |
+
- full_attention
|
| 414 |
+
- full_attention
|
| 415 |
+
- full_attention
|
| 416 |
+
- full_attention
|
| 417 |
+
- full_attention
|
| 418 |
+
- full_attention
|
| 419 |
+
- full_attention
|
| 420 |
+
- full_attention
|
| 421 |
+
- full_attention
|
| 422 |
+
- full_attention
|
| 423 |
+
- full_attention
|
| 424 |
+
- full_attention
|
| 425 |
+
- full_attention
|
| 426 |
+
- full_attention
|
| 427 |
+
- full_attention
|
| 428 |
+
- full_attention
|
| 429 |
+
- full_attention
|
| 430 |
+
learning_rate:
|
| 431 |
+
value: 5e-05
|
| 432 |
+
length_column_name:
|
| 433 |
+
value: length
|
| 434 |
+
liger_kernel_config:
|
| 435 |
+
value: null
|
| 436 |
+
load_best_model_at_end:
|
| 437 |
+
value: false
|
| 438 |
+
local_rank:
|
| 439 |
+
value: -1
|
| 440 |
+
log_level:
|
| 441 |
+
value: passive
|
| 442 |
+
log_level_replica:
|
| 443 |
+
value: warning
|
| 444 |
+
log_on_each_node:
|
| 445 |
+
value: true
|
| 446 |
+
logging_dir:
|
| 447 |
+
value: null
|
| 448 |
+
logging_first_step:
|
| 449 |
+
value: false
|
| 450 |
+
logging_nan_inf_filter:
|
| 451 |
+
value: true
|
| 452 |
+
logging_steps:
|
| 453 |
+
value: 1
|
| 454 |
+
logging_strategy:
|
| 455 |
+
value: steps
|
| 456 |
+
lr_scheduler_kwargs:
|
| 457 |
+
value: null
|
| 458 |
+
lr_scheduler_type:
|
| 459 |
+
value: cosine
|
| 460 |
+
master_addr:
|
| 461 |
+
value: null
|
| 462 |
+
master_port:
|
| 463 |
+
value: null
|
| 464 |
+
max_grad_norm:
|
| 465 |
+
value: 1
|
| 466 |
+
max_position_embeddings:
|
| 467 |
+
value: 32768
|
| 468 |
+
max_steps:
|
| 469 |
+
value: -1
|
| 470 |
+
max_window_layers:
|
| 471 |
+
value: 36
|
| 472 |
+
metric_for_best_model:
|
| 473 |
+
value: null
|
| 474 |
+
model/num_parameters:
|
| 475 |
+
value: 8234382336
|
| 476 |
+
model_args:
|
| 477 |
+
value:
|
| 478 |
+
adapter_folder: null
|
| 479 |
+
adapter_name_or_path: null
|
| 480 |
+
add_special_tokens: null
|
| 481 |
+
add_tokens: null
|
| 482 |
+
audio_sampling_rate: 16000
|
| 483 |
+
block_diag_attn: false
|
| 484 |
+
cache_dir: null
|
| 485 |
+
chunk_size: 8192
|
| 486 |
+
compute_dtype: torch.bfloat16
|
| 487 |
+
cpu_infer: 32
|
| 488 |
+
crop_to_patches: false
|
| 489 |
+
device_map:
|
| 490 |
+
"": cuda:0
|
| 491 |
+
disable_gradient_checkpointing: false
|
| 492 |
+
double_quantization: true
|
| 493 |
+
enable_liger_kernel: false
|
| 494 |
+
export_device: cpu
|
| 495 |
+
export_dir: null
|
| 496 |
+
export_hub_model_id: null
|
| 497 |
+
export_legacy_format: false
|
| 498 |
+
export_quantization_bit: null
|
| 499 |
+
export_quantization_dataset: null
|
| 500 |
+
export_quantization_maxlen: 1024
|
| 501 |
+
export_quantization_nsamples: 128
|
| 502 |
+
export_size: 5
|
| 503 |
+
flash_attn: auto
|
| 504 |
+
hf_hub_token: <HF_HUB_TOKEN>
|
| 505 |
+
image_do_pan_and_scan: false
|
| 506 |
+
image_max_pixels: 589824
|
| 507 |
+
image_min_pixels: 1024
|
| 508 |
+
infer_backend: HF
|
| 509 |
+
infer_dtype: auto
|
| 510 |
+
init_special_tokens: noise_init
|
| 511 |
+
kt_force_think: false
|
| 512 |
+
kt_maxlen: 4096
|
| 513 |
+
kt_mode: normal
|
| 514 |
+
kt_optimize_rule: null
|
| 515 |
+
kt_use_cuda_graph: true
|
| 516 |
+
low_cpu_mem_usage: true
|
| 517 |
+
mixture_of_depths: null
|
| 518 |
+
mode: normal
|
| 519 |
+
model_max_length: 2047
|
| 520 |
+
model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
|
| 521 |
+
model_revision: main
|
| 522 |
+
moe_aux_loss_coef: null
|
| 523 |
+
ms_hub_token: <MS_HUB_TOKEN>
|
| 524 |
+
new_special_tokens_config: null
|
| 525 |
+
offload_folder: offload
|
| 526 |
+
om_hub_token: <OM_HUB_TOKEN>
|
| 527 |
+
print_param_status: false
|
| 528 |
+
quantization_bit: null
|
| 529 |
+
quantization_device_map: null
|
| 530 |
+
quantization_method: BNB
|
| 531 |
+
quantization_type: nf4
|
| 532 |
+
resize_vocab: false
|
| 533 |
+
rope_scaling: null
|
| 534 |
+
sglang_config: null
|
| 535 |
+
sglang_lora_backend: triton
|
| 536 |
+
sglang_maxlen: 4096
|
| 537 |
+
sglang_mem_fraction: 0.7
|
| 538 |
+
sglang_tp_size: -1
|
| 539 |
+
shift_attn: false
|
| 540 |
+
split_special_tokens: false
|
| 541 |
+
train_from_scratch: false
|
| 542 |
+
trust_remote_code: true
|
| 543 |
+
upcast_layernorm: false
|
| 544 |
+
upcast_lmhead_output: false
|
| 545 |
+
use_audio_in_video: false
|
| 546 |
+
use_fast_tokenizer: true
|
| 547 |
+
use_kt: false
|
| 548 |
+
use_kv_cache: true
|
| 549 |
+
use_reentrant_gc: true
|
| 550 |
+
use_unsloth: false
|
| 551 |
+
use_unsloth_gc: false
|
| 552 |
+
use_v1_kernels: false
|
| 553 |
+
video_fps: 2
|
| 554 |
+
video_max_pixels: 65536
|
| 555 |
+
video_maxlen: 128
|
| 556 |
+
video_min_pixels: 256
|
| 557 |
+
vllm_config: null
|
| 558 |
+
vllm_enforce_eager: false
|
| 559 |
+
vllm_gpu_util: 0.7
|
| 560 |
+
vllm_max_lora_rank: 32
|
| 561 |
+
vllm_maxlen: 4096
|
| 562 |
+
model_type:
|
| 563 |
+
value: qwen3
|
| 564 |
+
neftune_noise_alpha:
|
| 565 |
+
value: null
|
| 566 |
+
num_attention_heads:
|
| 567 |
+
value: 32
|
| 568 |
+
num_hidden_layers:
|
| 569 |
+
value: 36
|
| 570 |
+
num_key_value_heads:
|
| 571 |
+
value: 8
|
| 572 |
+
num_train_epochs:
|
| 573 |
+
value: 10
|
| 574 |
+
optim:
|
| 575 |
+
value: adamw_torch
|
| 576 |
+
optim_args:
|
| 577 |
+
value: null
|
| 578 |
+
optim_target_modules:
|
| 579 |
+
value: null
|
| 580 |
+
output_attentions:
|
| 581 |
+
value: false
|
| 582 |
+
output_dir:
|
| 583 |
+
value: /workspace/v127rc_exp2/B_rep
|
| 584 |
+
output_hidden_states:
|
| 585 |
+
value: false
|
| 586 |
+
overwrite_output_dir:
|
| 587 |
+
value: false
|
| 588 |
+
pad_token_id:
|
| 589 |
+
value: 151643
|
| 590 |
+
parallelism_config:
|
| 591 |
+
value: null
|
| 592 |
+
peft_config:
|
| 593 |
+
value:
|
| 594 |
+
default:
|
| 595 |
+
alora_invocation_tokens: null
|
| 596 |
+
arrow_config: null
|
| 597 |
+
auto_mapping: null
|
| 598 |
+
base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
|
| 599 |
+
bias: none
|
| 600 |
+
corda_config: null
|
| 601 |
+
ensure_weight_tying: false
|
| 602 |
+
eva_config: null
|
| 603 |
+
exclude_modules: null
|
| 604 |
+
fan_in_fan_out: false
|
| 605 |
+
inference_mode: false
|
| 606 |
+
init_lora_weights: true
|
| 607 |
+
layer_replication: null
|
| 608 |
+
layers_pattern: null
|
| 609 |
+
layers_to_transform: null
|
| 610 |
+
lora_alpha: 32
|
| 611 |
+
lora_bias: false
|
| 612 |
+
lora_dropout: 0.03
|
| 613 |
+
megatron_config: null
|
| 614 |
+
megatron_core: megatron.core
|
| 615 |
+
modules_to_save: null
|
| 616 |
+
peft_type: LORA
|
| 617 |
+
peft_version: 0.18.1
|
| 618 |
+
qalora_group_size: 16
|
| 619 |
+
r: 16
|
| 620 |
+
revision: null
|
| 621 |
+
runtime_config:
|
| 622 |
+
ephemeral_gpu_offload: false
|
| 623 |
+
target_modules:
|
| 624 |
+
- q_proj
|
| 625 |
+
- down_proj
|
| 626 |
+
- up_proj
|
| 627 |
+
- gate_proj
|
| 628 |
+
- k_proj
|
| 629 |
+
- v_proj
|
| 630 |
+
- o_proj
|
| 631 |
+
target_parameters: null
|
| 632 |
+
task_type: CAUSAL_LM
|
| 633 |
+
trainable_token_indices: null
|
| 634 |
+
use_dora: false
|
| 635 |
+
use_qalora: false
|
| 636 |
+
use_rslora: false
|
| 637 |
+
per_device_eval_batch_size:
|
| 638 |
+
value: 8
|
| 639 |
+
per_device_train_batch_size:
|
| 640 |
+
value: 1
|
| 641 |
+
predict_with_generate:
|
| 642 |
+
value: false
|
| 643 |
+
prediction_loss_only:
|
| 644 |
+
value: false
|
| 645 |
+
problem_type:
|
| 646 |
+
value: null
|
| 647 |
+
project:
|
| 648 |
+
value: huggingface
|
| 649 |
+
push_to_hub:
|
| 650 |
+
value: false
|
| 651 |
+
ray_init_kwargs:
|
| 652 |
+
value: null
|
| 653 |
+
ray_num_workers:
|
| 654 |
+
value: 1
|
| 655 |
+
remove_unused_columns:
|
| 656 |
+
value: false
|
| 657 |
+
report_to:
|
| 658 |
+
value:
|
| 659 |
+
- wandb
|
| 660 |
+
restore_callback_states_from_checkpoint:
|
| 661 |
+
value: false
|
| 662 |
+
resume_from_checkpoint:
|
| 663 |
+
value: null
|
| 664 |
+
return_dict:
|
| 665 |
+
value: true
|
| 666 |
+
rms_norm_eps:
|
| 667 |
+
value: 1e-06
|
| 668 |
+
rope_parameters:
|
| 669 |
+
value:
|
| 670 |
+
rope_theta: 1000000
|
| 671 |
+
rope_type: default
|
| 672 |
+
run_name:
|
| 673 |
+
value: null
|
| 674 |
+
save_on_each_node:
|
| 675 |
+
value: false
|
| 676 |
+
save_only_model:
|
| 677 |
+
value: true
|
| 678 |
+
save_steps:
|
| 679 |
+
value: 1000
|
| 680 |
+
save_strategy:
|
| 681 |
+
value: steps
|
| 682 |
+
save_total_limit:
|
| 683 |
+
value: null
|
| 684 |
+
seed:
|
| 685 |
+
value: 42
|
| 686 |
+
skip_memory_metrics:
|
| 687 |
+
value: true
|
| 688 |
+
sliding_window:
|
| 689 |
+
value: null
|
| 690 |
+
sortish_sampler:
|
| 691 |
+
value: false
|
| 692 |
+
tf32:
|
| 693 |
+
value: null
|
| 694 |
+
tie_word_embeddings:
|
| 695 |
+
value: false
|
| 696 |
+
torch_compile:
|
| 697 |
+
value: false
|
| 698 |
+
torch_compile_backend:
|
| 699 |
+
value: null
|
| 700 |
+
torch_compile_mode:
|
| 701 |
+
value: null
|
| 702 |
+
torch_empty_cache_steps:
|
| 703 |
+
value: null
|
| 704 |
+
trackio_space_id:
|
| 705 |
+
value: trackio
|
| 706 |
+
transformers_version:
|
| 707 |
+
value: 5.0.0
|
| 708 |
+
use_cache:
|
| 709 |
+
value: false
|
| 710 |
+
use_cpu:
|
| 711 |
+
value: false
|
| 712 |
+
use_liger_kernel:
|
| 713 |
+
value: false
|
| 714 |
+
use_sliding_window:
|
| 715 |
+
value: false
|
| 716 |
+
vocab_size:
|
| 717 |
+
value: 151936
|
| 718 |
+
warmup_ratio:
|
| 719 |
+
value: 0.01
|
| 720 |
+
warmup_steps:
|
| 721 |
+
value: 0.01
|
| 722 |
+
weight_decay:
|
| 723 |
+
value: 0
|
LlamaFactory/wandb/run-20260209_081744-xzw1rig3/files/output.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LlamaFactory/wandb/run-20260209_081744-xzw1rig3/files/requirements.txt
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pytz==2025.2
|
| 2 |
+
pydub==0.25.1
|
| 3 |
+
brotli==1.2.0
|
| 4 |
+
antlr4-python3-runtime==4.9.3
|
| 5 |
+
xxhash==3.6.0
|
| 6 |
+
websockets==15.0.1
|
| 7 |
+
tzdata==2025.3
|
| 8 |
+
typing_extensions==4.15.0
|
| 9 |
+
tqdm==4.67.3
|
| 10 |
+
tomlkit==0.13.3
|
| 11 |
+
termcolor==3.3.0
|
| 12 |
+
shtab==1.8.0
|
| 13 |
+
shellingham==1.5.4
|
| 14 |
+
sentencepiece==0.2.1
|
| 15 |
+
semantic-version==2.10.0
|
| 16 |
+
safetensors==0.7.0
|
| 17 |
+
ruff==0.15.0
|
| 18 |
+
regex==2026.1.15
|
| 19 |
+
python-multipart==0.0.22
|
| 20 |
+
pyparsing==3.3.2
|
| 21 |
+
pyarrow==23.0.0
|
| 22 |
+
protobuf==6.33.5
|
| 23 |
+
propcache==0.4.1
|
| 24 |
+
orjson==3.11.7
|
| 25 |
+
omegaconf==2.3.0
|
| 26 |
+
numpy==2.4.2
|
| 27 |
+
multidict==6.7.1
|
| 28 |
+
mdurl==0.1.2
|
| 29 |
+
kiwisolver==1.4.9
|
| 30 |
+
hf-xet==1.2.0
|
| 31 |
+
hf_transfer==0.1.9
|
| 32 |
+
groovy==0.1.2
|
| 33 |
+
frozenlist==1.8.0
|
| 34 |
+
fonttools==4.61.1
|
| 35 |
+
ffmpy==1.0.0
|
| 36 |
+
einops==0.8.2
|
| 37 |
+
docstring_parser==0.17.0
|
| 38 |
+
dill==0.3.8
|
| 39 |
+
cycler==0.12.1
|
| 40 |
+
click==8.3.1
|
| 41 |
+
av==16.0.0
|
| 42 |
+
annotated-types==0.7.0
|
| 43 |
+
annotated-doc==0.0.4
|
| 44 |
+
aiohappyeyeballs==2.6.1
|
| 45 |
+
aiofiles==24.1.0
|
| 46 |
+
yarl==1.22.0
|
| 47 |
+
uvicorn==0.40.0
|
| 48 |
+
typing-inspection==0.4.2
|
| 49 |
+
typer-slim==0.21.1
|
| 50 |
+
tiktoken==0.12.0
|
| 51 |
+
scipy==1.17.0
|
| 52 |
+
pydantic_core==2.41.4
|
| 53 |
+
pandas==2.3.3
|
| 54 |
+
multiprocess==0.70.16
|
| 55 |
+
modelscope==1.34.0
|
| 56 |
+
markdown-it-py==4.0.0
|
| 57 |
+
fire==0.7.1
|
| 58 |
+
contourpy==1.3.3
|
| 59 |
+
anyio==4.12.1
|
| 60 |
+
aiosignal==1.4.0
|
| 61 |
+
starlette==0.52.1
|
| 62 |
+
rich==14.3.2
|
| 63 |
+
pydantic==2.12.3
|
| 64 |
+
matplotlib==3.10.8
|
| 65 |
+
aiohttp==3.13.3
|
| 66 |
+
tyro==0.8.14
|
| 67 |
+
typer==0.21.1
|
| 68 |
+
torchdata==0.11.0
|
| 69 |
+
sse-starlette==3.2.0
|
| 70 |
+
safehttpx==0.1.7
|
| 71 |
+
huggingface_hub==1.4.1
|
| 72 |
+
fastapi==0.128.5
|
| 73 |
+
tokenizers==0.22.2
|
| 74 |
+
gradio_client==1.14.0
|
| 75 |
+
datasets==4.0.0
|
| 76 |
+
accelerate==1.11.0
|
| 77 |
+
transformers==5.0.0
|
| 78 |
+
gradio==5.50.0
|
| 79 |
+
trl==0.24.0
|
| 80 |
+
peft==0.18.1
|
| 81 |
+
llamafactory==0.9.5.dev0
|
| 82 |
+
jieba==0.42.1
|
| 83 |
+
rouge-chinese==1.0.3
|
| 84 |
+
joblib==1.5.3
|
| 85 |
+
nltk==3.9.2
|
| 86 |
+
py-cpuinfo==9.0.0
|
| 87 |
+
nvidia-ml-py==13.590.48
|
| 88 |
+
hjson==3.1.0
|
| 89 |
+
ninja==1.13.0
|
| 90 |
+
msgpack==1.1.2
|
| 91 |
+
deepspeed==0.16.9
|
| 92 |
+
smmap==5.0.2
|
| 93 |
+
sentry-sdk==2.52.0
|
| 94 |
+
gitdb==4.0.12
|
| 95 |
+
GitPython==3.1.46
|
| 96 |
+
wandb==0.24.2
|
| 97 |
+
entrypoints==0.4
|
| 98 |
+
jupyter_client==7.4.9
|
| 99 |
+
nbclassic==1.1.0
|
| 100 |
+
notebook==6.5.5
|
| 101 |
+
pyzmq==24.0.1
|
| 102 |
+
PyYAML==6.0.2
|
| 103 |
+
Send2Trash==1.8.3
|
| 104 |
+
argon2-cffi==23.1.0
|
| 105 |
+
argon2-cffi-bindings==21.2.0
|
| 106 |
+
arrow==1.3.0
|
| 107 |
+
asttokens==2.4.1
|
| 108 |
+
async-lru==2.0.4
|
| 109 |
+
attrs==24.2.0
|
| 110 |
+
babel==2.16.0
|
| 111 |
+
beautifulsoup4==4.12.3
|
| 112 |
+
bleach==6.1.0
|
| 113 |
+
certifi==2024.8.30
|
| 114 |
+
cffi==1.17.1
|
| 115 |
+
charset-normalizer==3.3.2
|
| 116 |
+
comm==0.2.2
|
| 117 |
+
debugpy==1.8.5
|
| 118 |
+
decorator==5.1.1
|
| 119 |
+
defusedxml==0.7.1
|
| 120 |
+
executing==2.1.0
|
| 121 |
+
fastjsonschema==2.20.0
|
| 122 |
+
fqdn==1.5.1
|
| 123 |
+
h11==0.14.0
|
| 124 |
+
httpcore==1.0.5
|
| 125 |
+
httpx==0.27.2
|
| 126 |
+
idna==3.10
|
| 127 |
+
ipykernel==6.29.5
|
| 128 |
+
ipython==8.27.0
|
| 129 |
+
ipython-genutils==0.2.0
|
| 130 |
+
ipywidgets==8.1.5
|
| 131 |
+
isoduration==20.11.0
|
| 132 |
+
jedi==0.19.1
|
| 133 |
+
json5==0.9.25
|
| 134 |
+
jsonpointer==3.0.0
|
| 135 |
+
jsonschema==4.23.0
|
| 136 |
+
jsonschema-specifications==2023.12.1
|
| 137 |
+
jupyter-archive==3.4.0
|
| 138 |
+
jupyter_contrib_core==0.4.2
|
| 139 |
+
jupyter_contrib_nbextensions==0.7.0
|
| 140 |
+
jupyter_core==5.7.2
|
| 141 |
+
jupyter-events==0.10.0
|
| 142 |
+
jupyter-highlight-selected-word==0.2.0
|
| 143 |
+
jupyter-lsp==2.2.5
|
| 144 |
+
jupyter_nbextensions_configurator==0.6.4
|
| 145 |
+
jupyter_server==2.14.2
|
| 146 |
+
jupyter_server_terminals==0.5.3
|
| 147 |
+
jupyterlab==4.2.5
|
| 148 |
+
jupyterlab_pygments==0.3.0
|
| 149 |
+
jupyterlab_server==2.27.3
|
| 150 |
+
jupyterlab_widgets==3.0.13
|
| 151 |
+
lxml==5.3.0
|
| 152 |
+
matplotlib-inline==0.1.7
|
| 153 |
+
mistune==3.0.2
|
| 154 |
+
nbclient==0.10.0
|
| 155 |
+
nbconvert==7.16.4
|
| 156 |
+
nbformat==5.10.4
|
| 157 |
+
nest-asyncio==1.6.0
|
| 158 |
+
notebook_shim==0.2.4
|
| 159 |
+
overrides==7.7.0
|
| 160 |
+
packaging==24.1
|
| 161 |
+
pandocfilters==1.5.1
|
| 162 |
+
parso==0.8.4
|
| 163 |
+
pexpect==4.9.0
|
| 164 |
+
platformdirs==4.3.6
|
| 165 |
+
prometheus_client==0.21.0
|
| 166 |
+
prompt_toolkit==3.0.47
|
| 167 |
+
psutil==6.0.0
|
| 168 |
+
ptyprocess==0.7.0
|
| 169 |
+
pure_eval==0.2.3
|
| 170 |
+
pycparser==2.22
|
| 171 |
+
Pygments==2.18.0
|
| 172 |
+
python-dateutil==2.9.0.post0
|
| 173 |
+
python-json-logger==2.0.7
|
| 174 |
+
referencing==0.35.1
|
| 175 |
+
requests==2.32.3
|
| 176 |
+
rfc3339-validator==0.1.4
|
| 177 |
+
rfc3986-validator==0.1.1
|
| 178 |
+
rpds-py==0.20.0
|
| 179 |
+
sniffio==1.3.1
|
| 180 |
+
soupsieve==2.6
|
| 181 |
+
stack-data==0.6.3
|
| 182 |
+
terminado==0.18.1
|
| 183 |
+
tinycss2==1.3.0
|
| 184 |
+
tornado==6.4.1
|
| 185 |
+
traitlets==5.14.3
|
| 186 |
+
types-python-dateutil==2.9.0.20240906
|
| 187 |
+
uri-template==1.3.0
|
| 188 |
+
urllib3==2.2.3
|
| 189 |
+
wcwidth==0.2.13
|
| 190 |
+
webcolors==24.8.0
|
| 191 |
+
webencodings==0.5.1
|
| 192 |
+
websocket-client==1.8.0
|
| 193 |
+
widgetsnbextension==4.0.13
|
| 194 |
+
Jinja2==3.1.3
|
| 195 |
+
MarkupSafe==2.1.5
|
| 196 |
+
filelock==3.13.1
|
| 197 |
+
fsspec==2024.2.0
|
| 198 |
+
mpmath==1.3.0
|
| 199 |
+
networkx==3.2.1
|
| 200 |
+
nvidia-cublas-cu12==12.4.2.65
|
| 201 |
+
nvidia-cuda-cupti-cu12==12.4.99
|
| 202 |
+
nvidia-cuda-nvrtc-cu12==12.4.99
|
| 203 |
+
nvidia-cuda-runtime-cu12==12.4.99
|
| 204 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 205 |
+
nvidia-cufft-cu12==11.2.0.44
|
| 206 |
+
nvidia-curand-cu12==10.3.5.119
|
| 207 |
+
nvidia-cusolver-cu12==11.6.0.99
|
| 208 |
+
nvidia-cusparse-cu12==12.3.0.142
|
| 209 |
+
nvidia-nccl-cu12==2.20.5
|
| 210 |
+
nvidia-nvjitlink-cu12==12.4.99
|
| 211 |
+
nvidia-nvtx-cu12==12.4.99
|
| 212 |
+
pillow==10.2.0
|
| 213 |
+
sympy==1.12
|
| 214 |
+
torch==2.4.1+cu124
|
| 215 |
+
torchaudio==2.4.1+cu124
|
| 216 |
+
torchvision==0.19.1+cu124
|
| 217 |
+
triton==3.0.0
|
| 218 |
+
pip==24.2
|
| 219 |
+
setuptools==75.1.0
|
| 220 |
+
wheel==0.44.0
|
| 221 |
+
PyGObject==3.42.1
|
| 222 |
+
PyJWT==2.3.0
|
| 223 |
+
SecretStorage==3.3.1
|
| 224 |
+
blinker==1.4
|
| 225 |
+
cryptography==3.4.8
|
| 226 |
+
dbus-python==1.2.18
|
| 227 |
+
distro==1.7.0
|
| 228 |
+
httplib2==0.20.2
|
| 229 |
+
importlib-metadata==4.6.4
|
| 230 |
+
jeepney==0.7.1
|
| 231 |
+
keyring==23.5.0
|
| 232 |
+
launchpadlib==1.10.16
|
| 233 |
+
lazr.restfulclient==0.14.4
|
| 234 |
+
lazr.uri==1.0.6
|
| 235 |
+
more-itertools==8.10.0
|
| 236 |
+
oauthlib==3.2.0
|
| 237 |
+
python-apt==2.4.0+ubuntu4
|
| 238 |
+
six==1.16.0
|
| 239 |
+
wadllib==1.3.6
|
| 240 |
+
zipp==1.0.0
|
| 241 |
+
autocommand==2.2.2
|
| 242 |
+
backports.tarfile==1.2.0
|
| 243 |
+
importlib_metadata==8.0.0
|
| 244 |
+
importlib_resources==6.4.0
|
| 245 |
+
inflect==7.3.1
|
| 246 |
+
jaraco.collections==5.1.0
|
| 247 |
+
jaraco.context==5.3.0
|
| 248 |
+
jaraco.functools==4.0.1
|
| 249 |
+
jaraco.text==3.12.1
|
| 250 |
+
more-itertools==10.3.0
|
| 251 |
+
packaging==24.1
|
| 252 |
+
platformdirs==4.2.2
|
| 253 |
+
tomli==2.0.1
|
| 254 |
+
typeguard==4.3.0
|
| 255 |
+
typing_extensions==4.12.2
|
| 256 |
+
wheel==0.43.0
|
| 257 |
+
zipp==3.19.2
|
LlamaFactory/wandb/run-20260209_081744-xzw1rig3/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-6.8.0-90-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.10",
|
| 4 |
+
"startedAt": "2026-02-09T08:17:44.060660Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"/workspace/v127rc_exp2/B_rep.yaml"
|
| 7 |
+
],
|
| 8 |
+
"program": "/usr/local/bin/llamafactory-cli",
|
| 9 |
+
"git": {
|
| 10 |
+
"remote": "https://github.com/hiyouga/LlamaFactory.git",
|
| 11 |
+
"commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
|
| 12 |
+
},
|
| 13 |
+
"email": "markmochi200@gmail.com",
|
| 14 |
+
"root": "/workspace/LlamaFactory",
|
| 15 |
+
"host": "7606c805827a",
|
| 16 |
+
"executable": "/usr/bin/python",
|
| 17 |
+
"cpu_count": 16,
|
| 18 |
+
"cpu_count_logical": 32,
|
| 19 |
+
"gpu": "NVIDIA GeForce RTX 4090",
|
| 20 |
+
"gpu_count": 1,
|
| 21 |
+
"disk": {
|
| 22 |
+
"/": {
|
| 23 |
+
"total": "21474836480",
|
| 24 |
+
"used": "2122248192"
|
| 25 |
+
}
|
| 26 |
+
},
|
| 27 |
+
"memory": {
|
| 28 |
+
"total": "201667923968"
|
| 29 |
+
},
|
| 30 |
+
"gpu_nvidia": [
|
| 31 |
+
{
|
| 32 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 33 |
+
"memoryTotal": "25757220864",
|
| 34 |
+
"cudaCores": 16384,
|
| 35 |
+
"architecture": "Ada",
|
| 36 |
+
"uuid": "GPU-26c0ff17-df36-8246-0397-3ffa6e3c714c"
|
| 37 |
+
}
|
| 38 |
+
],
|
| 39 |
+
"cudaVersion": "12.8",
|
| 40 |
+
"writerId": "3z70993mxg1kv2jk7wj47jktzkvhjf6v"
|
| 41 |
+
}
|
LlamaFactory/wandb/run-20260209_081744-xzw1rig3/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_timestamp":1.7706260576274142e+09,"train/loss":0.025985639542341232,"train/grad_norm":0.4264875650405884,"train/global_step":964,"_step":963,"_runtime":993,"train/train_tokens_per_second":1985.261,"train_runtime":993.9791,"_wandb":{"runtime":993},"train/learning_rate":3.647727272727273e-05,"train/epoch":0.07303030303030303,"train/num_input_tokens_seen":1973308}
|
LlamaFactory/wandb/run-20260209_081744-xzw1rig3/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-02-09T08:17:44.318295701Z","level":"INFO","msg":"stream: starting","core version":"0.24.2"}
|
| 2 |
+
{"time":"2026-02-09T08:17:44.634393483Z","level":"INFO","msg":"stream: created new stream","id":"xzw1rig3"}
|
| 3 |
+
{"time":"2026-02-09T08:17:44.634955402Z","level":"INFO","msg":"handler: started","stream_id":"xzw1rig3"}
|
| 4 |
+
{"time":"2026-02-09T08:17:44.636745136Z","level":"INFO","msg":"stream: started","id":"xzw1rig3"}
|
| 5 |
+
{"time":"2026-02-09T08:17:44.636766656Z","level":"INFO","msg":"writer: started","stream_id":"xzw1rig3"}
|
| 6 |
+
{"time":"2026-02-09T08:17:44.636798576Z","level":"INFO","msg":"sender: started","stream_id":"xzw1rig3"}
|
| 7 |
+
{"time":"2026-02-09T08:34:18.628826866Z","level":"INFO","msg":"stream: closing","id":"xzw1rig3"}
|
| 8 |
+
{"time":"2026-02-09T08:34:19.727529201Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 9 |
+
{"time":"2026-02-09T08:34:19.965370634Z","level":"INFO","msg":"handler: closed","stream_id":"xzw1rig3"}
|
| 10 |
+
{"time":"2026-02-09T08:34:19.967970411Z","level":"INFO","msg":"sender: closed","stream_id":"xzw1rig3"}
|
| 11 |
+
{"time":"2026-02-09T08:34:19.968897272Z","level":"INFO","msg":"stream: closed","id":"xzw1rig3"}
|
LlamaFactory/wandb/run-20260209_084647-55tyrmzu/files/config.yaml
ADDED
|
@@ -0,0 +1,723 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_name_or_path:
|
| 2 |
+
value: /workspace/Qwen/Qwen3-8B-Base
|
| 3 |
+
_wandb:
|
| 4 |
+
value:
|
| 5 |
+
cli_version: 0.24.2
|
| 6 |
+
e:
|
| 7 |
+
5l942me186lee9ffmegn06ghne5ypa8s:
|
| 8 |
+
args:
|
| 9 |
+
- /workspace/v127rc_exp2/B_dup.yaml
|
| 10 |
+
cpu_count: 16
|
| 11 |
+
cpu_count_logical: 32
|
| 12 |
+
cudaVersion: "12.9"
|
| 13 |
+
disk:
|
| 14 |
+
/:
|
| 15 |
+
total: "21474836480"
|
| 16 |
+
used: "2060419072"
|
| 17 |
+
email: markmochi200@gmail.com
|
| 18 |
+
executable: /usr/bin/python
|
| 19 |
+
git:
|
| 20 |
+
commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
|
| 21 |
+
remote: https://github.com/hiyouga/LlamaFactory.git
|
| 22 |
+
gpu: NVIDIA GeForce RTX 4090
|
| 23 |
+
gpu_count: 1
|
| 24 |
+
gpu_nvidia:
|
| 25 |
+
- architecture: Ada
|
| 26 |
+
cudaCores: 16384
|
| 27 |
+
memoryTotal: "25757220864"
|
| 28 |
+
name: NVIDIA GeForce RTX 4090
|
| 29 |
+
uuid: GPU-6c1e98c2-1b34-cfd8-5de5-319e272f1d1e
|
| 30 |
+
host: 3bebe963f251
|
| 31 |
+
memory:
|
| 32 |
+
total: "134156767232"
|
| 33 |
+
os: Linux-6.8.0-60-generic-x86_64-with-glibc2.35
|
| 34 |
+
program: /usr/local/bin/llamafactory-cli
|
| 35 |
+
python: CPython 3.11.10
|
| 36 |
+
root: /workspace/LlamaFactory
|
| 37 |
+
startedAt: "2026-02-09T08:46:47.557835Z"
|
| 38 |
+
writerId: 5l942me186lee9ffmegn06ghne5ypa8s
|
| 39 |
+
m:
|
| 40 |
+
- "1": train/global_step
|
| 41 |
+
"6":
|
| 42 |
+
- 3
|
| 43 |
+
"7": []
|
| 44 |
+
- "2": '*'
|
| 45 |
+
"5": 1
|
| 46 |
+
"6":
|
| 47 |
+
- 1
|
| 48 |
+
"7": []
|
| 49 |
+
python_version: 3.11.10
|
| 50 |
+
t:
|
| 51 |
+
"1":
|
| 52 |
+
- 1
|
| 53 |
+
- 11
|
| 54 |
+
- 41
|
| 55 |
+
- 49
|
| 56 |
+
- 51
|
| 57 |
+
- 71
|
| 58 |
+
- 84
|
| 59 |
+
- 98
|
| 60 |
+
- 105
|
| 61 |
+
"2":
|
| 62 |
+
- 1
|
| 63 |
+
- 11
|
| 64 |
+
- 41
|
| 65 |
+
- 49
|
| 66 |
+
- 51
|
| 67 |
+
- 71
|
| 68 |
+
- 84
|
| 69 |
+
- 98
|
| 70 |
+
- 105
|
| 71 |
+
"3":
|
| 72 |
+
- 7
|
| 73 |
+
- 19
|
| 74 |
+
- 62
|
| 75 |
+
- 66
|
| 76 |
+
"4": 3.11.10
|
| 77 |
+
"5": 0.24.2
|
| 78 |
+
"6": 5.0.0
|
| 79 |
+
"9":
|
| 80 |
+
"1": transformers_trainer
|
| 81 |
+
"12": 0.24.2
|
| 82 |
+
"13": linux-x86_64
|
| 83 |
+
accelerator_config:
|
| 84 |
+
value:
|
| 85 |
+
dispatch_batches: null
|
| 86 |
+
even_batches: true
|
| 87 |
+
gradient_accumulation_kwargs: null
|
| 88 |
+
non_blocking: false
|
| 89 |
+
split_batches: false
|
| 90 |
+
use_seedable_sampler: true
|
| 91 |
+
adam_beta1:
|
| 92 |
+
value: 0.9
|
| 93 |
+
adam_beta2:
|
| 94 |
+
value: 0.95
|
| 95 |
+
adam_epsilon:
|
| 96 |
+
value: 1e-08
|
| 97 |
+
architectures:
|
| 98 |
+
value:
|
| 99 |
+
- Qwen3ForCausalLM
|
| 100 |
+
attention_bias:
|
| 101 |
+
value: false
|
| 102 |
+
attention_dropout:
|
| 103 |
+
value: 0
|
| 104 |
+
auto_find_batch_size:
|
| 105 |
+
value: false
|
| 106 |
+
average_tokens_across_devices:
|
| 107 |
+
value: true
|
| 108 |
+
batch_eval_metrics:
|
| 109 |
+
value: false
|
| 110 |
+
bf16:
|
| 111 |
+
value: true
|
| 112 |
+
bf16_full_eval:
|
| 113 |
+
value: false
|
| 114 |
+
bos_token_id:
|
| 115 |
+
value: null
|
| 116 |
+
chunk_size_feed_forward:
|
| 117 |
+
value: 0
|
| 118 |
+
data_args:
|
| 119 |
+
value:
|
| 120 |
+
buffer_size: 16384
|
| 121 |
+
cutoff_len: 2047
|
| 122 |
+
data_shared_file_system: false
|
| 123 |
+
dataset:
|
| 124 |
+
- Markie_Voss_t0_d34_r300
|
| 125 |
+
dataset_dir: /workspace/LlamaFactory/data
|
| 126 |
+
default_system: null
|
| 127 |
+
enable_thinking: false
|
| 128 |
+
eval_dataset: null
|
| 129 |
+
eval_num_beams: null
|
| 130 |
+
eval_on_each_dataset: false
|
| 131 |
+
ignore_pad_token_for_loss: true
|
| 132 |
+
interleave_probs: null
|
| 133 |
+
mask_history: false
|
| 134 |
+
max_samples: 100000000
|
| 135 |
+
media_dir: /workspace/LlamaFactory/data
|
| 136 |
+
mix_strategy: concat
|
| 137 |
+
neat_packing: false
|
| 138 |
+
overwrite_cache: false
|
| 139 |
+
packing: true
|
| 140 |
+
preprocessing_batch_size: 1000
|
| 141 |
+
preprocessing_num_workers: 16
|
| 142 |
+
streaming: false
|
| 143 |
+
template: qwen3_nothink
|
| 144 |
+
tokenized_path: null
|
| 145 |
+
tool_format: null
|
| 146 |
+
train_on_prompt: false
|
| 147 |
+
val_size: 0
|
| 148 |
+
data_seed:
|
| 149 |
+
value: null
|
| 150 |
+
dataloader_drop_last:
|
| 151 |
+
value: false
|
| 152 |
+
dataloader_num_workers:
|
| 153 |
+
value: 0
|
| 154 |
+
dataloader_persistent_workers:
|
| 155 |
+
value: false
|
| 156 |
+
dataloader_pin_memory:
|
| 157 |
+
value: true
|
| 158 |
+
dataloader_prefetch_factor:
|
| 159 |
+
value: null
|
| 160 |
+
ddp_backend:
|
| 161 |
+
value: null
|
| 162 |
+
ddp_broadcast_buffers:
|
| 163 |
+
value: null
|
| 164 |
+
ddp_bucket_cap_mb:
|
| 165 |
+
value: null
|
| 166 |
+
ddp_find_unused_parameters:
|
| 167 |
+
value: null
|
| 168 |
+
ddp_timeout:
|
| 169 |
+
value: 180000000
|
| 170 |
+
debug:
|
| 171 |
+
value: []
|
| 172 |
+
deepspeed:
|
| 173 |
+
value: null
|
| 174 |
+
disable_tqdm:
|
| 175 |
+
value: false
|
| 176 |
+
do_eval:
|
| 177 |
+
value: false
|
| 178 |
+
do_predict:
|
| 179 |
+
value: false
|
| 180 |
+
do_train:
|
| 181 |
+
value: true
|
| 182 |
+
dtype:
|
| 183 |
+
value: bfloat16
|
| 184 |
+
enable_jit_checkpoint:
|
| 185 |
+
value: false
|
| 186 |
+
eos_token_id:
|
| 187 |
+
value: 151645
|
| 188 |
+
eval_accumulation_steps:
|
| 189 |
+
value: null
|
| 190 |
+
eval_delay:
|
| 191 |
+
value: 0
|
| 192 |
+
eval_do_concat_batches:
|
| 193 |
+
value: true
|
| 194 |
+
eval_on_start:
|
| 195 |
+
value: false
|
| 196 |
+
eval_steps:
|
| 197 |
+
value: null
|
| 198 |
+
eval_strategy:
|
| 199 |
+
value: "no"
|
| 200 |
+
eval_use_gather_object:
|
| 201 |
+
value: false
|
| 202 |
+
finetuning_args:
|
| 203 |
+
value:
|
| 204 |
+
additional_target: null
|
| 205 |
+
apollo_layerwise: false
|
| 206 |
+
apollo_proj: random
|
| 207 |
+
apollo_proj_type: std
|
| 208 |
+
apollo_rank: 16
|
| 209 |
+
apollo_scale: 32
|
| 210 |
+
apollo_scale_front: false
|
| 211 |
+
apollo_scale_type: channel
|
| 212 |
+
apollo_target:
|
| 213 |
+
- all
|
| 214 |
+
apollo_update_interval: 200
|
| 215 |
+
badam_mask_mode: adjacent
|
| 216 |
+
badam_mode: layer
|
| 217 |
+
badam_start_block: null
|
| 218 |
+
badam_switch_interval: 50
|
| 219 |
+
badam_switch_mode: ascending
|
| 220 |
+
badam_update_ratio: 0.05
|
| 221 |
+
badam_verbose: 0
|
| 222 |
+
compute_accuracy: false
|
| 223 |
+
create_new_adapter: false
|
| 224 |
+
disable_shuffling: false
|
| 225 |
+
dpo_label_smoothing: 0
|
| 226 |
+
eaft_alpha: 1
|
| 227 |
+
early_stopping_steps: null
|
| 228 |
+
finetuning_type: lora
|
| 229 |
+
freeze_extra_modules: null
|
| 230 |
+
freeze_language_model: false
|
| 231 |
+
freeze_multi_modal_projector: true
|
| 232 |
+
freeze_trainable_layers: 2
|
| 233 |
+
freeze_trainable_modules:
|
| 234 |
+
- all
|
| 235 |
+
freeze_vision_tower: true
|
| 236 |
+
galore_layerwise: false
|
| 237 |
+
galore_proj_type: std
|
| 238 |
+
galore_rank: 16
|
| 239 |
+
galore_scale: 2
|
| 240 |
+
galore_target:
|
| 241 |
+
- all
|
| 242 |
+
galore_update_interval: 200
|
| 243 |
+
include_effective_tokens_per_second: false
|
| 244 |
+
kto_chosen_weight: 1
|
| 245 |
+
kto_rejected_weight: 1
|
| 246 |
+
ld_alpha: null
|
| 247 |
+
lora_alpha: 64
|
| 248 |
+
lora_dropout: 0.03
|
| 249 |
+
lora_rank: 32
|
| 250 |
+
lora_target:
|
| 251 |
+
- all
|
| 252 |
+
loraplus_lr_embedding: 1e-06
|
| 253 |
+
loraplus_lr_ratio: null
|
| 254 |
+
module_dropout: 0
|
| 255 |
+
oft_block_size: 32
|
| 256 |
+
oft_rank: 0
|
| 257 |
+
oft_target:
|
| 258 |
+
- all
|
| 259 |
+
pissa_convert: false
|
| 260 |
+
pissa_init: false
|
| 261 |
+
pissa_iter: 16
|
| 262 |
+
plot_loss: true
|
| 263 |
+
ppo_buffer_size: 1
|
| 264 |
+
ppo_epochs: 4
|
| 265 |
+
ppo_score_norm: false
|
| 266 |
+
ppo_target: 6
|
| 267 |
+
ppo_whiten_rewards: false
|
| 268 |
+
pref_bco_weight: 0
|
| 269 |
+
pref_beta: 0.1
|
| 270 |
+
pref_ftx: 0
|
| 271 |
+
pref_loss: sigmoid
|
| 272 |
+
pure_bf16: false
|
| 273 |
+
ref_model: null
|
| 274 |
+
ref_model_adapters: null
|
| 275 |
+
ref_model_quantization_bit: null
|
| 276 |
+
reward_model: null
|
| 277 |
+
reward_model_adapters: null
|
| 278 |
+
reward_model_quantization_bit: null
|
| 279 |
+
reward_model_type: lora
|
| 280 |
+
simpo_gamma: 0.5
|
| 281 |
+
stage: pt
|
| 282 |
+
swanlab_api_key: <SWANLAB_API_KEY>
|
| 283 |
+
swanlab_lark_secret: null
|
| 284 |
+
swanlab_lark_webhook_url: null
|
| 285 |
+
swanlab_logdir: null
|
| 286 |
+
swanlab_mode: cloud
|
| 287 |
+
swanlab_project: llamafactory
|
| 288 |
+
swanlab_run_name: null
|
| 289 |
+
swanlab_workspace: null
|
| 290 |
+
use_adam_mini: false
|
| 291 |
+
use_apollo: false
|
| 292 |
+
use_badam: false
|
| 293 |
+
use_dft_loss: false
|
| 294 |
+
use_dora: false
|
| 295 |
+
use_eaft_loss: false
|
| 296 |
+
use_galore: false
|
| 297 |
+
use_llama_pro: false
|
| 298 |
+
use_mca: false
|
| 299 |
+
use_muon: false
|
| 300 |
+
use_rslora: false
|
| 301 |
+
use_swanlab: false
|
| 302 |
+
fp8:
|
| 303 |
+
value: false
|
| 304 |
+
fp8_backend:
|
| 305 |
+
value: auto
|
| 306 |
+
fp8_enable_fsdp_float8_all_gather:
|
| 307 |
+
value: false
|
| 308 |
+
fp16:
|
| 309 |
+
value: false
|
| 310 |
+
fp16_full_eval:
|
| 311 |
+
value: false
|
| 312 |
+
fsdp:
|
| 313 |
+
value: []
|
| 314 |
+
fsdp_config:
|
| 315 |
+
value:
|
| 316 |
+
min_num_params: 0
|
| 317 |
+
xla: false
|
| 318 |
+
xla_fsdp_grad_ckpt: false
|
| 319 |
+
xla_fsdp_v2: false
|
| 320 |
+
full_determinism:
|
| 321 |
+
value: false
|
| 322 |
+
generating_args:
|
| 323 |
+
value:
|
| 324 |
+
do_sample: true
|
| 325 |
+
length_penalty: 1
|
| 326 |
+
max_new_tokens: 1024
|
| 327 |
+
num_beams: 1
|
| 328 |
+
repetition_penalty: 1
|
| 329 |
+
skip_special_tokens: true
|
| 330 |
+
temperature: 0.95
|
| 331 |
+
top_k: 50
|
| 332 |
+
top_p: 0.7
|
| 333 |
+
generation_config:
|
| 334 |
+
value: null
|
| 335 |
+
generation_max_length:
|
| 336 |
+
value: 2047
|
| 337 |
+
generation_num_beams:
|
| 338 |
+
value: null
|
| 339 |
+
gradient_accumulation_steps:
|
| 340 |
+
value: 8
|
| 341 |
+
gradient_checkpointing:
|
| 342 |
+
value: false
|
| 343 |
+
gradient_checkpointing_kwargs:
|
| 344 |
+
value: null
|
| 345 |
+
greater_is_better:
|
| 346 |
+
value: null
|
| 347 |
+
group_by_length:
|
| 348 |
+
value: false
|
| 349 |
+
head_dim:
|
| 350 |
+
value: 128
|
| 351 |
+
hidden_act:
|
| 352 |
+
value: silu
|
| 353 |
+
hidden_size:
|
| 354 |
+
value: 4096
|
| 355 |
+
hub_always_push:
|
| 356 |
+
value: false
|
| 357 |
+
hub_model_id:
|
| 358 |
+
value: null
|
| 359 |
+
hub_private_repo:
|
| 360 |
+
value: null
|
| 361 |
+
hub_revision:
|
| 362 |
+
value: null
|
| 363 |
+
hub_strategy:
|
| 364 |
+
value: every_save
|
| 365 |
+
hub_token:
|
| 366 |
+
value: <HUB_TOKEN>
|
| 367 |
+
id2label:
|
| 368 |
+
value:
|
| 369 |
+
"0": LABEL_0
|
| 370 |
+
"1": LABEL_1
|
| 371 |
+
ignore_data_skip:
|
| 372 |
+
value: false
|
| 373 |
+
include_for_metrics:
|
| 374 |
+
value: []
|
| 375 |
+
include_num_input_tokens_seen:
|
| 376 |
+
value: all
|
| 377 |
+
initializer_range:
|
| 378 |
+
value: 0.02
|
| 379 |
+
intermediate_size:
|
| 380 |
+
value: 12288
|
| 381 |
+
is_encoder_decoder:
|
| 382 |
+
value: false
|
| 383 |
+
label_names:
|
| 384 |
+
value:
|
| 385 |
+
- labels
|
| 386 |
+
label_smoothing_factor:
|
| 387 |
+
value: 0
|
| 388 |
+
label2id:
|
| 389 |
+
value:
|
| 390 |
+
LABEL_0: 0
|
| 391 |
+
LABEL_1: 1
|
| 392 |
+
layer_types:
|
| 393 |
+
value:
|
| 394 |
+
- full_attention
|
| 395 |
+
- full_attention
|
| 396 |
+
- full_attention
|
| 397 |
+
- full_attention
|
| 398 |
+
- full_attention
|
| 399 |
+
- full_attention
|
| 400 |
+
- full_attention
|
| 401 |
+
- full_attention
|
| 402 |
+
- full_attention
|
| 403 |
+
- full_attention
|
| 404 |
+
- full_attention
|
| 405 |
+
- full_attention
|
| 406 |
+
- full_attention
|
| 407 |
+
- full_attention
|
| 408 |
+
- full_attention
|
| 409 |
+
- full_attention
|
| 410 |
+
- full_attention
|
| 411 |
+
- full_attention
|
| 412 |
+
- full_attention
|
| 413 |
+
- full_attention
|
| 414 |
+
- full_attention
|
| 415 |
+
- full_attention
|
| 416 |
+
- full_attention
|
| 417 |
+
- full_attention
|
| 418 |
+
- full_attention
|
| 419 |
+
- full_attention
|
| 420 |
+
- full_attention
|
| 421 |
+
- full_attention
|
| 422 |
+
- full_attention
|
| 423 |
+
- full_attention
|
| 424 |
+
- full_attention
|
| 425 |
+
- full_attention
|
| 426 |
+
- full_attention
|
| 427 |
+
- full_attention
|
| 428 |
+
- full_attention
|
| 429 |
+
- full_attention
|
| 430 |
+
learning_rate:
|
| 431 |
+
value: 0.0001
|
| 432 |
+
length_column_name:
|
| 433 |
+
value: length
|
| 434 |
+
liger_kernel_config:
|
| 435 |
+
value: null
|
| 436 |
+
load_best_model_at_end:
|
| 437 |
+
value: false
|
| 438 |
+
local_rank:
|
| 439 |
+
value: -1
|
| 440 |
+
log_level:
|
| 441 |
+
value: passive
|
| 442 |
+
log_level_replica:
|
| 443 |
+
value: warning
|
| 444 |
+
log_on_each_node:
|
| 445 |
+
value: true
|
| 446 |
+
logging_dir:
|
| 447 |
+
value: null
|
| 448 |
+
logging_first_step:
|
| 449 |
+
value: false
|
| 450 |
+
logging_nan_inf_filter:
|
| 451 |
+
value: true
|
| 452 |
+
logging_steps:
|
| 453 |
+
value: 1
|
| 454 |
+
logging_strategy:
|
| 455 |
+
value: steps
|
| 456 |
+
lr_scheduler_kwargs:
|
| 457 |
+
value: null
|
| 458 |
+
lr_scheduler_type:
|
| 459 |
+
value: cosine
|
| 460 |
+
master_addr:
|
| 461 |
+
value: null
|
| 462 |
+
master_port:
|
| 463 |
+
value: null
|
| 464 |
+
max_grad_norm:
|
| 465 |
+
value: 1
|
| 466 |
+
max_position_embeddings:
|
| 467 |
+
value: 32768
|
| 468 |
+
max_steps:
|
| 469 |
+
value: -1
|
| 470 |
+
max_window_layers:
|
| 471 |
+
value: 36
|
| 472 |
+
metric_for_best_model:
|
| 473 |
+
value: null
|
| 474 |
+
model/num_parameters:
|
| 475 |
+
value: 8278029312
|
| 476 |
+
model_args:
|
| 477 |
+
value:
|
| 478 |
+
adapter_folder: null
|
| 479 |
+
adapter_name_or_path: null
|
| 480 |
+
add_special_tokens: null
|
| 481 |
+
add_tokens: null
|
| 482 |
+
audio_sampling_rate: 16000
|
| 483 |
+
block_diag_attn: false
|
| 484 |
+
cache_dir: null
|
| 485 |
+
chunk_size: 8192
|
| 486 |
+
compute_dtype: torch.bfloat16
|
| 487 |
+
cpu_infer: 32
|
| 488 |
+
crop_to_patches: false
|
| 489 |
+
device_map:
|
| 490 |
+
"": cuda:0
|
| 491 |
+
disable_gradient_checkpointing: false
|
| 492 |
+
double_quantization: true
|
| 493 |
+
enable_liger_kernel: false
|
| 494 |
+
export_device: cpu
|
| 495 |
+
export_dir: null
|
| 496 |
+
export_hub_model_id: null
|
| 497 |
+
export_legacy_format: false
|
| 498 |
+
export_quantization_bit: null
|
| 499 |
+
export_quantization_dataset: null
|
| 500 |
+
export_quantization_maxlen: 1024
|
| 501 |
+
export_quantization_nsamples: 128
|
| 502 |
+
export_size: 5
|
| 503 |
+
flash_attn: auto
|
| 504 |
+
hf_hub_token: <HF_HUB_TOKEN>
|
| 505 |
+
image_do_pan_and_scan: false
|
| 506 |
+
image_max_pixels: 589824
|
| 507 |
+
image_min_pixels: 1024
|
| 508 |
+
infer_backend: HF
|
| 509 |
+
infer_dtype: auto
|
| 510 |
+
init_special_tokens: noise_init
|
| 511 |
+
kt_force_think: false
|
| 512 |
+
kt_maxlen: 4096
|
| 513 |
+
kt_mode: normal
|
| 514 |
+
kt_optimize_rule: null
|
| 515 |
+
kt_use_cuda_graph: true
|
| 516 |
+
low_cpu_mem_usage: true
|
| 517 |
+
mixture_of_depths: null
|
| 518 |
+
mode: normal
|
| 519 |
+
model_max_length: 2047
|
| 520 |
+
model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
|
| 521 |
+
model_revision: main
|
| 522 |
+
moe_aux_loss_coef: null
|
| 523 |
+
ms_hub_token: <MS_HUB_TOKEN>
|
| 524 |
+
new_special_tokens_config: null
|
| 525 |
+
offload_folder: offload
|
| 526 |
+
om_hub_token: <OM_HUB_TOKEN>
|
| 527 |
+
print_param_status: false
|
| 528 |
+
quantization_bit: null
|
| 529 |
+
quantization_device_map: null
|
| 530 |
+
quantization_method: BNB
|
| 531 |
+
quantization_type: nf4
|
| 532 |
+
resize_vocab: false
|
| 533 |
+
rope_scaling: null
|
| 534 |
+
sglang_config: null
|
| 535 |
+
sglang_lora_backend: triton
|
| 536 |
+
sglang_maxlen: 4096
|
| 537 |
+
sglang_mem_fraction: 0.7
|
| 538 |
+
sglang_tp_size: -1
|
| 539 |
+
shift_attn: false
|
| 540 |
+
split_special_tokens: false
|
| 541 |
+
train_from_scratch: false
|
| 542 |
+
trust_remote_code: true
|
| 543 |
+
upcast_layernorm: false
|
| 544 |
+
upcast_lmhead_output: false
|
| 545 |
+
use_audio_in_video: false
|
| 546 |
+
use_fast_tokenizer: true
|
| 547 |
+
use_kt: false
|
| 548 |
+
use_kv_cache: true
|
| 549 |
+
use_reentrant_gc: true
|
| 550 |
+
use_unsloth: false
|
| 551 |
+
use_unsloth_gc: false
|
| 552 |
+
use_v1_kernels: false
|
| 553 |
+
video_fps: 2
|
| 554 |
+
video_max_pixels: 65536
|
| 555 |
+
video_maxlen: 128
|
| 556 |
+
video_min_pixels: 256
|
| 557 |
+
vllm_config: null
|
| 558 |
+
vllm_enforce_eager: false
|
| 559 |
+
vllm_gpu_util: 0.7
|
| 560 |
+
vllm_max_lora_rank: 32
|
| 561 |
+
vllm_maxlen: 4096
|
| 562 |
+
model_type:
|
| 563 |
+
value: qwen3
|
| 564 |
+
neftune_noise_alpha:
|
| 565 |
+
value: null
|
| 566 |
+
num_attention_heads:
|
| 567 |
+
value: 32
|
| 568 |
+
num_hidden_layers:
|
| 569 |
+
value: 36
|
| 570 |
+
num_key_value_heads:
|
| 571 |
+
value: 8
|
| 572 |
+
num_train_epochs:
|
| 573 |
+
value: 10
|
| 574 |
+
optim:
|
| 575 |
+
value: adamw_torch
|
| 576 |
+
optim_args:
|
| 577 |
+
value: null
|
| 578 |
+
optim_target_modules:
|
| 579 |
+
value: null
|
| 580 |
+
output_attentions:
|
| 581 |
+
value: false
|
| 582 |
+
output_dir:
|
| 583 |
+
value: /workspace/v127rc_exp2/B_dup
|
| 584 |
+
output_hidden_states:
|
| 585 |
+
value: false
|
| 586 |
+
overwrite_output_dir:
|
| 587 |
+
value: false
|
| 588 |
+
pad_token_id:
|
| 589 |
+
value: 151643
|
| 590 |
+
parallelism_config:
|
| 591 |
+
value: null
|
| 592 |
+
peft_config:
|
| 593 |
+
value:
|
| 594 |
+
default:
|
| 595 |
+
alora_invocation_tokens: null
|
| 596 |
+
arrow_config: null
|
| 597 |
+
auto_mapping: null
|
| 598 |
+
base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
|
| 599 |
+
bias: none
|
| 600 |
+
corda_config: null
|
| 601 |
+
ensure_weight_tying: false
|
| 602 |
+
eva_config: null
|
| 603 |
+
exclude_modules: null
|
| 604 |
+
fan_in_fan_out: false
|
| 605 |
+
inference_mode: false
|
| 606 |
+
init_lora_weights: true
|
| 607 |
+
layer_replication: null
|
| 608 |
+
layers_pattern: null
|
| 609 |
+
layers_to_transform: null
|
| 610 |
+
lora_alpha: 64
|
| 611 |
+
lora_bias: false
|
| 612 |
+
lora_dropout: 0.03
|
| 613 |
+
megatron_config: null
|
| 614 |
+
megatron_core: megatron.core
|
| 615 |
+
modules_to_save: null
|
| 616 |
+
peft_type: LORA
|
| 617 |
+
peft_version: 0.18.1
|
| 618 |
+
qalora_group_size: 16
|
| 619 |
+
r: 32
|
| 620 |
+
revision: null
|
| 621 |
+
runtime_config:
|
| 622 |
+
ephemeral_gpu_offload: false
|
| 623 |
+
target_modules:
|
| 624 |
+
- o_proj
|
| 625 |
+
- gate_proj
|
| 626 |
+
- q_proj
|
| 627 |
+
- up_proj
|
| 628 |
+
- v_proj
|
| 629 |
+
- down_proj
|
| 630 |
+
- k_proj
|
| 631 |
+
target_parameters: null
|
| 632 |
+
task_type: CAUSAL_LM
|
| 633 |
+
trainable_token_indices: null
|
| 634 |
+
use_dora: false
|
| 635 |
+
use_qalora: false
|
| 636 |
+
use_rslora: false
|
| 637 |
+
per_device_eval_batch_size:
|
| 638 |
+
value: 8
|
| 639 |
+
per_device_train_batch_size:
|
| 640 |
+
value: 1
|
| 641 |
+
predict_with_generate:
|
| 642 |
+
value: false
|
| 643 |
+
prediction_loss_only:
|
| 644 |
+
value: false
|
| 645 |
+
problem_type:
|
| 646 |
+
value: null
|
| 647 |
+
project:
|
| 648 |
+
value: huggingface
|
| 649 |
+
push_to_hub:
|
| 650 |
+
value: false
|
| 651 |
+
ray_init_kwargs:
|
| 652 |
+
value: null
|
| 653 |
+
ray_num_workers:
|
| 654 |
+
value: 1
|
| 655 |
+
remove_unused_columns:
|
| 656 |
+
value: false
|
| 657 |
+
report_to:
|
| 658 |
+
value:
|
| 659 |
+
- wandb
|
| 660 |
+
restore_callback_states_from_checkpoint:
|
| 661 |
+
value: false
|
| 662 |
+
resume_from_checkpoint:
|
| 663 |
+
value: null
|
| 664 |
+
return_dict:
|
| 665 |
+
value: true
|
| 666 |
+
rms_norm_eps:
|
| 667 |
+
value: 1e-06
|
| 668 |
+
rope_parameters:
|
| 669 |
+
value:
|
| 670 |
+
rope_theta: 1000000
|
| 671 |
+
rope_type: default
|
| 672 |
+
run_name:
|
| 673 |
+
value: null
|
| 674 |
+
save_on_each_node:
|
| 675 |
+
value: false
|
| 676 |
+
save_only_model:
|
| 677 |
+
value: true
|
| 678 |
+
save_steps:
|
| 679 |
+
value: 10
|
| 680 |
+
save_strategy:
|
| 681 |
+
value: steps
|
| 682 |
+
save_total_limit:
|
| 683 |
+
value: null
|
| 684 |
+
seed:
|
| 685 |
+
value: 42
|
| 686 |
+
skip_memory_metrics:
|
| 687 |
+
value: true
|
| 688 |
+
sliding_window:
|
| 689 |
+
value: null
|
| 690 |
+
sortish_sampler:
|
| 691 |
+
value: false
|
| 692 |
+
tf32:
|
| 693 |
+
value: null
|
| 694 |
+
tie_word_embeddings:
|
| 695 |
+
value: false
|
| 696 |
+
torch_compile:
|
| 697 |
+
value: false
|
| 698 |
+
torch_compile_backend:
|
| 699 |
+
value: null
|
| 700 |
+
torch_compile_mode:
|
| 701 |
+
value: null
|
| 702 |
+
torch_empty_cache_steps:
|
| 703 |
+
value: null
|
| 704 |
+
trackio_space_id:
|
| 705 |
+
value: trackio
|
| 706 |
+
transformers_version:
|
| 707 |
+
value: 5.0.0
|
| 708 |
+
use_cache:
|
| 709 |
+
value: false
|
| 710 |
+
use_cpu:
|
| 711 |
+
value: false
|
| 712 |
+
use_liger_kernel:
|
| 713 |
+
value: false
|
| 714 |
+
use_sliding_window:
|
| 715 |
+
value: false
|
| 716 |
+
vocab_size:
|
| 717 |
+
value: 151936
|
| 718 |
+
warmup_ratio:
|
| 719 |
+
value: 0.01
|
| 720 |
+
warmup_steps:
|
| 721 |
+
value: 0.01
|
| 722 |
+
weight_decay:
|
| 723 |
+
value: 0.01
|
LlamaFactory/wandb/run-20260209_084647-55tyrmzu/files/output.log
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0%| | 0/13920 [00:00<?, ?it/s]/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
|
| 2 |
+
with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]
|
| 3 |
+
0%| | 10/13920 [01:23<32:01:23, 8.29s/it][INFO|trainer.py:4115] 2026-02-09 08:48:11,523 >> Saving model checkpoint to /workspace/v127rc_exp2/B_dup/checkpoint-10
|
| 4 |
+
{'loss': '1.213', 'grad_norm': '0.1718', 'learning_rate': '0', 'epoch': '0.0007184', 'num_input_tokens_seen': 16376, 'train_runtime': '11.31', 'train_tokens_per_second': '1448'}
|
| 5 |
+
{'loss': '1.384', 'grad_norm': '0.1968', 'learning_rate': '7.143e-07', 'epoch': '0.001437', 'num_input_tokens_seen': 32752, 'train_runtime': '19.54', 'train_tokens_per_second': '1676'}
|
| 6 |
+
{'loss': '1.234', 'grad_norm': '0.1806', 'learning_rate': '1.429e-06', 'epoch': '0.002155', 'num_input_tokens_seen': 49128, 'train_runtime': '27.79', 'train_tokens_per_second': '1768'}
|
| 7 |
+
{'loss': '1.384', 'grad_norm': '0.2031', 'learning_rate': '2.143e-06', 'epoch': '0.002874', 'num_input_tokens_seen': 65504, 'train_runtime': '36.03', 'train_tokens_per_second': '1818'}
|
| 8 |
+
{'loss': '1.48', 'grad_norm': '0.2195', 'learning_rate': '2.857e-06', 'epoch': '0.003592', 'num_input_tokens_seen': 81880, 'train_runtime': '44.3', 'train_tokens_per_second': '1848'}
|
| 9 |
+
{'loss': '1.382', 'grad_norm': '0.2049', 'learning_rate': '3.571e-06', 'epoch': '0.00431', 'num_input_tokens_seen': 98256, 'train_runtime': '52.57', 'train_tokens_per_second': '1869'}
|
| 10 |
+
{'loss': '1.717', 'grad_norm': '0.2322', 'learning_rate': '4.286e-06', 'epoch': '0.005029', 'num_input_tokens_seen': 114632, 'train_runtime': '60.85', 'train_tokens_per_second': '1884'}
|
| 11 |
+
{'loss': '1.608', 'grad_norm': '0.1957', 'learning_rate': '5e-06', 'epoch': '0.005747', 'num_input_tokens_seen': 131008, 'train_runtime': '69.14', 'train_tokens_per_second': '1895'}
|
| 12 |
+
{'loss': '1.435', 'grad_norm': '0.2099', 'learning_rate': '5.714e-06', 'epoch': '0.006466', 'num_input_tokens_seen': 147384, 'train_runtime': '77.43', 'train_tokens_per_second': '1903'}
|
| 13 |
+
{'loss': '1.354', 'grad_norm': '0.185', 'learning_rate': '6.429e-06', 'epoch': '0.007184', 'num_input_tokens_seen': 163760, 'train_runtime': '85.72', 'train_tokens_per_second': '1910'}
|
| 14 |
+
[INFO|configuration_utils.py:665] 2026-02-09 08:48:11,559 >> loading configuration file /workspace/Qwen/Qwen3-8B-Base/config.json
|
| 15 |
+
[INFO|configuration_utils.py:739] 2026-02-09 08:48:11,559 >> Model config Qwen3Config {
|
| 16 |
+
"architectures": [
|
| 17 |
+
"Qwen3ForCausalLM"
|
| 18 |
+
],
|
| 19 |
+
"attention_bias": false,
|
| 20 |
+
"attention_dropout": 0.0,
|
| 21 |
+
"bos_token_id": 151643,
|
| 22 |
+
"dtype": "bfloat16",
|
| 23 |
+
"eos_token_id": 151643,
|
| 24 |
+
"head_dim": 128,
|
| 25 |
+
"hidden_act": "silu",
|
| 26 |
+
"hidden_size": 4096,
|
| 27 |
+
"initializer_range": 0.02,
|
| 28 |
+
"intermediate_size": 12288,
|
| 29 |
+
"layer_types": [
|
| 30 |
+
"full_attention",
|
| 31 |
+
"full_attention",
|
| 32 |
+
"full_attention",
|
| 33 |
+
"full_attention",
|
| 34 |
+
"full_attention",
|
| 35 |
+
"full_attention",
|
| 36 |
+
"full_attention",
|
| 37 |
+
"full_attention",
|
| 38 |
+
"full_attention",
|
| 39 |
+
"full_attention",
|
| 40 |
+
"full_attention",
|
| 41 |
+
"full_attention",
|
| 42 |
+
"full_attention",
|
| 43 |
+
"full_attention",
|
| 44 |
+
"full_attention",
|
| 45 |
+
"full_attention",
|
| 46 |
+
"full_attention",
|
| 47 |
+
"full_attention",
|
| 48 |
+
"full_attention",
|
| 49 |
+
"full_attention",
|
| 50 |
+
"full_attention",
|
| 51 |
+
"full_attention",
|
| 52 |
+
"full_attention",
|
| 53 |
+
"full_attention",
|
| 54 |
+
"full_attention",
|
| 55 |
+
"full_attention",
|
| 56 |
+
"full_attention",
|
| 57 |
+
"full_attention",
|
| 58 |
+
"full_attention",
|
| 59 |
+
"full_attention",
|
| 60 |
+
"full_attention",
|
| 61 |
+
"full_attention",
|
| 62 |
+
"full_attention",
|
| 63 |
+
"full_attention",
|
| 64 |
+
"full_attention",
|
| 65 |
+
"full_attention"
|
| 66 |
+
],
|
| 67 |
+
"max_position_embeddings": 32768,
|
| 68 |
+
"max_window_layers": 36,
|
| 69 |
+
"model_type": "qwen3",
|
| 70 |
+
"num_attention_heads": 32,
|
| 71 |
+
"num_hidden_layers": 36,
|
| 72 |
+
"num_key_value_heads": 8,
|
| 73 |
+
"pad_token_id": null,
|
| 74 |
+
"rms_norm_eps": 1e-06,
|
| 75 |
+
"rope_parameters": {
|
| 76 |
+
"rope_theta": 1000000,
|
| 77 |
+
"rope_type": "default"
|
| 78 |
+
},
|
| 79 |
+
"sliding_window": null,
|
| 80 |
+
"tie_word_embeddings": false,
|
| 81 |
+
"transformers_version": "5.0.0",
|
| 82 |
+
"use_cache": true,
|
| 83 |
+
"use_sliding_window": false,
|
| 84 |
+
"vocab_size": 151936
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
[INFO|tokenization_utils_base.py:3327] 2026-02-09 08:48:12,278 >> chat template saved in /workspace/v127rc_exp2/B_dup/checkpoint-10/chat_template.jinja
|
| 88 |
+
[INFO|tokenization_utils_base.py:2181] 2026-02-09 08:48:12,287 >> tokenizer config file saved in /workspace/v127rc_exp2/B_dup/checkpoint-10/tokenizer_config.json
|
| 89 |
+
0%|▏ | 11/13920 [01:32<33:02:21, 8.55s/it]Traceback (most recent call last):
|
| 90 |
+
{'loss': '1.429', 'grad_norm': '0.2128', 'learning_rate': '7.143e-06', 'epoch': '0.007902', 'num_input_tokens_seen': 180136, 'train_runtime': '94.87', 'train_tokens_per_second': '1899'}
|
| 91 |
+
File "/usr/local/bin/llamafactory-cli", line 8, in <module>
|
| 92 |
+
sys.exit(main())
|
| 93 |
+
^^^^^^
|
| 94 |
+
File "/workspace/LlamaFactory/src/llamafactory/cli.py", line 24, in main
|
| 95 |
+
launcher.launch()
|
| 96 |
+
File "/workspace/LlamaFactory/src/llamafactory/launcher.py", line 157, in launch
|
| 97 |
+
run_exp()
|
| 98 |
+
File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 125, in run_exp
|
| 99 |
+
_training_function(config={"args": args, "callbacks": callbacks})
|
| 100 |
+
File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 91, in _training_function
|
| 101 |
+
run_pt(model_args, data_args, training_args, finetuning_args, callbacks)
|
| 102 |
+
File "/workspace/LlamaFactory/src/llamafactory/train/pt/workflow.py", line 63, in run_pt
|
| 103 |
+
train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
|
| 104 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 105 |
+
File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2174, in train
|
| 106 |
+
return inner_training_loop(
|
| 107 |
+
^^^^^^^^^^^^^^^^^^^^
|
| 108 |
+
File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2536, in _inner_training_loop
|
| 109 |
+
tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
|
| 110 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 111 |
+
File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 3837, in training_step
|
| 112 |
+
self.accelerator.backward(loss, **kwargs)
|
| 113 |
+
File "/usr/local/lib/python3.11/dist-packages/accelerate/accelerator.py", line 2740, in backward
|
| 114 |
+
loss.backward(**kwargs)
|
| 115 |
+
File "/usr/local/lib/python3.11/dist-packages/torch/_tensor.py", line 521, in backward
|
| 116 |
+
torch.autograd.backward(
|
| 117 |
+
File "/usr/local/lib/python3.11/dist-packages/torch/autograd/__init__.py", line 289, in backward
|
| 118 |
+
_engine_run_backward(
|
| 119 |
+
File "/usr/local/lib/python3.11/dist-packages/torch/autograd/graph.py", line 769, in _engine_run_backward
|
| 120 |
+
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
| 121 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 122 |
+
KeyboardInterrupt
|
LlamaFactory/wandb/run-20260209_084647-55tyrmzu/files/requirements.txt
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pytz==2025.2
|
| 2 |
+
pydub==0.25.1
|
| 3 |
+
brotli==1.2.0
|
| 4 |
+
antlr4-python3-runtime==4.9.3
|
| 5 |
+
xxhash==3.6.0
|
| 6 |
+
websockets==15.0.1
|
| 7 |
+
tzdata==2025.3
|
| 8 |
+
typing_extensions==4.15.0
|
| 9 |
+
tqdm==4.67.3
|
| 10 |
+
tomlkit==0.13.3
|
| 11 |
+
termcolor==3.3.0
|
| 12 |
+
shtab==1.8.0
|
| 13 |
+
shellingham==1.5.4
|
| 14 |
+
sentencepiece==0.2.1
|
| 15 |
+
semantic-version==2.10.0
|
| 16 |
+
safetensors==0.7.0
|
| 17 |
+
ruff==0.15.0
|
| 18 |
+
regex==2026.1.15
|
| 19 |
+
python-multipart==0.0.22
|
| 20 |
+
pyparsing==3.3.2
|
| 21 |
+
pyarrow==23.0.0
|
| 22 |
+
protobuf==6.33.5
|
| 23 |
+
propcache==0.4.1
|
| 24 |
+
orjson==3.11.7
|
| 25 |
+
omegaconf==2.3.0
|
| 26 |
+
numpy==2.4.2
|
| 27 |
+
multidict==6.7.1
|
| 28 |
+
mdurl==0.1.2
|
| 29 |
+
kiwisolver==1.4.9
|
| 30 |
+
hf-xet==1.2.0
|
| 31 |
+
hf_transfer==0.1.9
|
| 32 |
+
groovy==0.1.2
|
| 33 |
+
frozenlist==1.8.0
|
| 34 |
+
fonttools==4.61.1
|
| 35 |
+
ffmpy==1.0.0
|
| 36 |
+
einops==0.8.2
|
| 37 |
+
docstring_parser==0.17.0
|
| 38 |
+
dill==0.3.8
|
| 39 |
+
cycler==0.12.1
|
| 40 |
+
click==8.3.1
|
| 41 |
+
av==16.0.0
|
| 42 |
+
annotated-types==0.7.0
|
| 43 |
+
annotated-doc==0.0.4
|
| 44 |
+
aiohappyeyeballs==2.6.1
|
| 45 |
+
aiofiles==24.1.0
|
| 46 |
+
yarl==1.22.0
|
| 47 |
+
uvicorn==0.40.0
|
| 48 |
+
typing-inspection==0.4.2
|
| 49 |
+
typer-slim==0.21.1
|
| 50 |
+
tiktoken==0.12.0
|
| 51 |
+
scipy==1.17.0
|
| 52 |
+
pydantic_core==2.41.4
|
| 53 |
+
pandas==2.3.3
|
| 54 |
+
multiprocess==0.70.16
|
| 55 |
+
modelscope==1.34.0
|
| 56 |
+
markdown-it-py==4.0.0
|
| 57 |
+
fire==0.7.1
|
| 58 |
+
contourpy==1.3.3
|
| 59 |
+
anyio==4.12.1
|
| 60 |
+
aiosignal==1.4.0
|
| 61 |
+
starlette==0.52.1
|
| 62 |
+
rich==14.3.2
|
| 63 |
+
pydantic==2.12.3
|
| 64 |
+
matplotlib==3.10.8
|
| 65 |
+
aiohttp==3.13.3
|
| 66 |
+
tyro==0.8.14
|
| 67 |
+
typer==0.21.1
|
| 68 |
+
torchdata==0.11.0
|
| 69 |
+
sse-starlette==3.2.0
|
| 70 |
+
safehttpx==0.1.7
|
| 71 |
+
huggingface_hub==1.4.1
|
| 72 |
+
fastapi==0.128.5
|
| 73 |
+
tokenizers==0.22.2
|
| 74 |
+
gradio_client==1.14.0
|
| 75 |
+
datasets==4.0.0
|
| 76 |
+
accelerate==1.11.0
|
| 77 |
+
transformers==5.0.0
|
| 78 |
+
gradio==5.50.0
|
| 79 |
+
trl==0.24.0
|
| 80 |
+
peft==0.18.1
|
| 81 |
+
llamafactory==0.9.5.dev0
|
| 82 |
+
jieba==0.42.1
|
| 83 |
+
rouge-chinese==1.0.3
|
| 84 |
+
joblib==1.5.3
|
| 85 |
+
nltk==3.9.2
|
| 86 |
+
py-cpuinfo==9.0.0
|
| 87 |
+
nvidia-ml-py==13.590.48
|
| 88 |
+
hjson==3.1.0
|
| 89 |
+
ninja==1.13.0
|
| 90 |
+
msgpack==1.1.2
|
| 91 |
+
deepspeed==0.16.9
|
| 92 |
+
smmap==5.0.2
|
| 93 |
+
sentry-sdk==2.52.0
|
| 94 |
+
gitdb==4.0.12
|
| 95 |
+
GitPython==3.1.46
|
| 96 |
+
wandb==0.24.2
|
| 97 |
+
entrypoints==0.4
|
| 98 |
+
jupyter_client==7.4.9
|
| 99 |
+
nbclassic==1.1.0
|
| 100 |
+
notebook==6.5.5
|
| 101 |
+
pyzmq==24.0.1
|
| 102 |
+
PyYAML==6.0.2
|
| 103 |
+
Send2Trash==1.8.3
|
| 104 |
+
argon2-cffi==23.1.0
|
| 105 |
+
argon2-cffi-bindings==21.2.0
|
| 106 |
+
arrow==1.3.0
|
| 107 |
+
asttokens==2.4.1
|
| 108 |
+
async-lru==2.0.4
|
| 109 |
+
attrs==24.2.0
|
| 110 |
+
babel==2.16.0
|
| 111 |
+
beautifulsoup4==4.12.3
|
| 112 |
+
bleach==6.1.0
|
| 113 |
+
certifi==2024.8.30
|
| 114 |
+
cffi==1.17.1
|
| 115 |
+
charset-normalizer==3.3.2
|
| 116 |
+
comm==0.2.2
|
| 117 |
+
debugpy==1.8.5
|
| 118 |
+
decorator==5.1.1
|
| 119 |
+
defusedxml==0.7.1
|
| 120 |
+
executing==2.1.0
|
| 121 |
+
fastjsonschema==2.20.0
|
| 122 |
+
fqdn==1.5.1
|
| 123 |
+
h11==0.14.0
|
| 124 |
+
httpcore==1.0.5
|
| 125 |
+
httpx==0.27.2
|
| 126 |
+
idna==3.10
|
| 127 |
+
ipykernel==6.29.5
|
| 128 |
+
ipython==8.27.0
|
| 129 |
+
ipython-genutils==0.2.0
|
| 130 |
+
ipywidgets==8.1.5
|
| 131 |
+
isoduration==20.11.0
|
| 132 |
+
jedi==0.19.1
|
| 133 |
+
json5==0.9.25
|
| 134 |
+
jsonpointer==3.0.0
|
| 135 |
+
jsonschema==4.23.0
|
| 136 |
+
jsonschema-specifications==2023.12.1
|
| 137 |
+
jupyter-archive==3.4.0
|
| 138 |
+
jupyter_contrib_core==0.4.2
|
| 139 |
+
jupyter_contrib_nbextensions==0.7.0
|
| 140 |
+
jupyter_core==5.7.2
|
| 141 |
+
jupyter-events==0.10.0
|
| 142 |
+
jupyter-highlight-selected-word==0.2.0
|
| 143 |
+
jupyter-lsp==2.2.5
|
| 144 |
+
jupyter_nbextensions_configurator==0.6.4
|
| 145 |
+
jupyter_server==2.14.2
|
| 146 |
+
jupyter_server_terminals==0.5.3
|
| 147 |
+
jupyterlab==4.2.5
|
| 148 |
+
jupyterlab_pygments==0.3.0
|
| 149 |
+
jupyterlab_server==2.27.3
|
| 150 |
+
jupyterlab_widgets==3.0.13
|
| 151 |
+
lxml==5.3.0
|
| 152 |
+
matplotlib-inline==0.1.7
|
| 153 |
+
mistune==3.0.2
|
| 154 |
+
nbclient==0.10.0
|
| 155 |
+
nbconvert==7.16.4
|
| 156 |
+
nbformat==5.10.4
|
| 157 |
+
nest-asyncio==1.6.0
|
| 158 |
+
notebook_shim==0.2.4
|
| 159 |
+
overrides==7.7.0
|
| 160 |
+
packaging==24.1
|
| 161 |
+
pandocfilters==1.5.1
|
| 162 |
+
parso==0.8.4
|
| 163 |
+
pexpect==4.9.0
|
| 164 |
+
platformdirs==4.3.6
|
| 165 |
+
prometheus_client==0.21.0
|
| 166 |
+
prompt_toolkit==3.0.47
|
| 167 |
+
psutil==6.0.0
|
| 168 |
+
ptyprocess==0.7.0
|
| 169 |
+
pure_eval==0.2.3
|
| 170 |
+
pycparser==2.22
|
| 171 |
+
Pygments==2.18.0
|
| 172 |
+
python-dateutil==2.9.0.post0
|
| 173 |
+
python-json-logger==2.0.7
|
| 174 |
+
referencing==0.35.1
|
| 175 |
+
requests==2.32.3
|
| 176 |
+
rfc3339-validator==0.1.4
|
| 177 |
+
rfc3986-validator==0.1.1
|
| 178 |
+
rpds-py==0.20.0
|
| 179 |
+
sniffio==1.3.1
|
| 180 |
+
soupsieve==2.6
|
| 181 |
+
stack-data==0.6.3
|
| 182 |
+
terminado==0.18.1
|
| 183 |
+
tinycss2==1.3.0
|
| 184 |
+
tornado==6.4.1
|
| 185 |
+
traitlets==5.14.3
|
| 186 |
+
types-python-dateutil==2.9.0.20240906
|
| 187 |
+
uri-template==1.3.0
|
| 188 |
+
urllib3==2.2.3
|
| 189 |
+
wcwidth==0.2.13
|
| 190 |
+
webcolors==24.8.0
|
| 191 |
+
webencodings==0.5.1
|
| 192 |
+
websocket-client==1.8.0
|
| 193 |
+
widgetsnbextension==4.0.13
|
| 194 |
+
Jinja2==3.1.3
|
| 195 |
+
MarkupSafe==2.1.5
|
| 196 |
+
filelock==3.13.1
|
| 197 |
+
fsspec==2024.2.0
|
| 198 |
+
mpmath==1.3.0
|
| 199 |
+
networkx==3.2.1
|
| 200 |
+
nvidia-cublas-cu12==12.4.2.65
|
| 201 |
+
nvidia-cuda-cupti-cu12==12.4.99
|
| 202 |
+
nvidia-cuda-nvrtc-cu12==12.4.99
|
| 203 |
+
nvidia-cuda-runtime-cu12==12.4.99
|
| 204 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 205 |
+
nvidia-cufft-cu12==11.2.0.44
|
| 206 |
+
nvidia-curand-cu12==10.3.5.119
|
| 207 |
+
nvidia-cusolver-cu12==11.6.0.99
|
| 208 |
+
nvidia-cusparse-cu12==12.3.0.142
|
| 209 |
+
nvidia-nccl-cu12==2.20.5
|
| 210 |
+
nvidia-nvjitlink-cu12==12.4.99
|
| 211 |
+
nvidia-nvtx-cu12==12.4.99
|
| 212 |
+
pillow==10.2.0
|
| 213 |
+
sympy==1.12
|
| 214 |
+
torch==2.4.1+cu124
|
| 215 |
+
torchaudio==2.4.1+cu124
|
| 216 |
+
torchvision==0.19.1+cu124
|
| 217 |
+
triton==3.0.0
|
| 218 |
+
pip==24.2
|
| 219 |
+
setuptools==75.1.0
|
| 220 |
+
wheel==0.44.0
|
| 221 |
+
PyGObject==3.42.1
|
| 222 |
+
PyJWT==2.3.0
|
| 223 |
+
SecretStorage==3.3.1
|
| 224 |
+
blinker==1.4
|
| 225 |
+
cryptography==3.4.8
|
| 226 |
+
dbus-python==1.2.18
|
| 227 |
+
distro==1.7.0
|
| 228 |
+
httplib2==0.20.2
|
| 229 |
+
importlib-metadata==4.6.4
|
| 230 |
+
jeepney==0.7.1
|
| 231 |
+
keyring==23.5.0
|
| 232 |
+
launchpadlib==1.10.16
|
| 233 |
+
lazr.restfulclient==0.14.4
|
| 234 |
+
lazr.uri==1.0.6
|
| 235 |
+
more-itertools==8.10.0
|
| 236 |
+
oauthlib==3.2.0
|
| 237 |
+
python-apt==2.4.0+ubuntu4
|
| 238 |
+
six==1.16.0
|
| 239 |
+
wadllib==1.3.6
|
| 240 |
+
zipp==1.0.0
|
| 241 |
+
autocommand==2.2.2
|
| 242 |
+
backports.tarfile==1.2.0
|
| 243 |
+
importlib_metadata==8.0.0
|
| 244 |
+
importlib_resources==6.4.0
|
| 245 |
+
inflect==7.3.1
|
| 246 |
+
jaraco.collections==5.1.0
|
| 247 |
+
jaraco.context==5.3.0
|
| 248 |
+
jaraco.functools==4.0.1
|
| 249 |
+
jaraco.text==3.12.1
|
| 250 |
+
more-itertools==10.3.0
|
| 251 |
+
packaging==24.1
|
| 252 |
+
platformdirs==4.2.2
|
| 253 |
+
tomli==2.0.1
|
| 254 |
+
typeguard==4.3.0
|
| 255 |
+
typing_extensions==4.12.2
|
| 256 |
+
wheel==0.43.0
|
| 257 |
+
zipp==3.19.2
|
LlamaFactory/wandb/run-20260209_084647-55tyrmzu/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-6.8.0-60-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.10",
|
| 4 |
+
"startedAt": "2026-02-09T08:46:47.557835Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"/workspace/v127rc_exp2/B_dup.yaml"
|
| 7 |
+
],
|
| 8 |
+
"program": "/usr/local/bin/llamafactory-cli",
|
| 9 |
+
"git": {
|
| 10 |
+
"remote": "https://github.com/hiyouga/LlamaFactory.git",
|
| 11 |
+
"commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
|
| 12 |
+
},
|
| 13 |
+
"email": "markmochi200@gmail.com",
|
| 14 |
+
"root": "/workspace/LlamaFactory",
|
| 15 |
+
"host": "3bebe963f251",
|
| 16 |
+
"executable": "/usr/bin/python",
|
| 17 |
+
"cpu_count": 16,
|
| 18 |
+
"cpu_count_logical": 32,
|
| 19 |
+
"gpu": "NVIDIA GeForce RTX 4090",
|
| 20 |
+
"gpu_count": 1,
|
| 21 |
+
"disk": {
|
| 22 |
+
"/": {
|
| 23 |
+
"total": "21474836480",
|
| 24 |
+
"used": "2060419072"
|
| 25 |
+
}
|
| 26 |
+
},
|
| 27 |
+
"memory": {
|
| 28 |
+
"total": "134156767232"
|
| 29 |
+
},
|
| 30 |
+
"gpu_nvidia": [
|
| 31 |
+
{
|
| 32 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 33 |
+
"memoryTotal": "25757220864",
|
| 34 |
+
"cudaCores": 16384,
|
| 35 |
+
"architecture": "Ada",
|
| 36 |
+
"uuid": "GPU-6c1e98c2-1b34-cfd8-5de5-319e272f1d1e"
|
| 37 |
+
}
|
| 38 |
+
],
|
| 39 |
+
"cudaVersion": "12.9",
|
| 40 |
+
"writerId": "5l942me186lee9ffmegn06ghne5ypa8s"
|
| 41 |
+
}
|
LlamaFactory/wandb/run-20260209_084647-55tyrmzu/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_step":10,"train/epoch":0.007902298850574713,"_runtime":94,"train/grad_norm":0.21283617615699768,"_timestamp":1.7706269006685286e+09,"train/global_step":11,"train/num_input_tokens_seen":180136,"_wandb":{"runtime":94},"train_runtime":94.8711,"train/learning_rate":7.142857142857143e-06,"train/loss":1.4294381141662598,"train/train_tokens_per_second":1898.744}
|
LlamaFactory/wandb/run-20260209_084647-55tyrmzu/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-02-09T08:46:47.817513188Z","level":"INFO","msg":"stream: starting","core version":"0.24.2"}
|
| 2 |
+
{"time":"2026-02-09T08:46:48.121590539Z","level":"INFO","msg":"stream: created new stream","id":"55tyrmzu"}
|
| 3 |
+
{"time":"2026-02-09T08:46:48.12218375Z","level":"INFO","msg":"handler: started","stream_id":"55tyrmzu"}
|
| 4 |
+
{"time":"2026-02-09T08:46:48.123357618Z","level":"INFO","msg":"stream: started","id":"55tyrmzu"}
|
| 5 |
+
{"time":"2026-02-09T08:46:48.123366734Z","level":"INFO","msg":"writer: started","stream_id":"55tyrmzu"}
|
| 6 |
+
{"time":"2026-02-09T08:46:48.123368438Z","level":"INFO","msg":"sender: started","stream_id":"55tyrmzu"}
|
| 7 |
+
{"time":"2026-02-09T08:48:22.69328438Z","level":"INFO","msg":"stream: closing","id":"55tyrmzu"}
|
| 8 |
+
{"time":"2026-02-09T08:48:23.219258177Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 9 |
+
{"time":"2026-02-09T08:48:23.424167617Z","level":"INFO","msg":"handler: closed","stream_id":"55tyrmzu"}
|
| 10 |
+
{"time":"2026-02-09T08:48:23.429177461Z","level":"INFO","msg":"sender: closed","stream_id":"55tyrmzu"}
|
| 11 |
+
{"time":"2026-02-09T08:48:23.429635912Z","level":"INFO","msg":"stream: closed","id":"55tyrmzu"}
|
LlamaFactory/wandb/run-20260209_084647-55tyrmzu/logs/debug.log
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-02-09 08:46:47,588 INFO MainThread:4723 [wandb_setup.py:_flush():81] Current SDK version is 0.24.2
|
| 2 |
+
2026-02-09 08:46:47,588 INFO MainThread:4723 [wandb_setup.py:_flush():81] Configure stats pid to 4723
|
| 3 |
+
2026-02-09 08:46:47,589 INFO MainThread:4723 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 4 |
+
2026-02-09 08:46:47,590 INFO MainThread:4723 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260209_084647-55tyrmzu/logs/debug.log
|
| 5 |
+
2026-02-09 08:46:47,591 INFO MainThread:4723 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260209_084647-55tyrmzu/logs/debug-internal.log
|
| 6 |
+
2026-02-09 08:46:47,592 INFO MainThread:4723 [wandb_init.py:init():844] calling init triggers
|
| 7 |
+
2026-02-09 08:46:47,592 INFO MainThread:4723 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
|
| 8 |
+
config: {'_wandb': {}}
|
| 9 |
+
2026-02-09 08:46:47,592 INFO MainThread:4723 [wandb_init.py:init():892] starting backend
|
| 10 |
+
2026-02-09 08:46:47,806 INFO MainThread:4723 [wandb_init.py:init():895] sending inform_init request
|
| 11 |
+
2026-02-09 08:46:47,815 INFO MainThread:4723 [wandb_init.py:init():903] backend started and connected
|
| 12 |
+
2026-02-09 08:46:47,817 INFO MainThread:4723 [wandb_init.py:init():973] updated telemetry
|
| 13 |
+
2026-02-09 08:46:47,886 INFO MainThread:4723 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
|
| 14 |
+
2026-02-09 08:46:48,406 INFO MainThread:4723 [wandb_init.py:init():1042] starting run threads in backend
|
| 15 |
+
2026-02-09 08:46:48,473 INFO MainThread:4723 [wandb_run.py:_console_start():2529] atexit reg
|
| 16 |
+
2026-02-09 08:46:48,474 INFO MainThread:4723 [wandb_run.py:_redirect():2377] redirect: wrap_raw
|
| 17 |
+
2026-02-09 08:46:48,475 INFO MainThread:4723 [wandb_run.py:_redirect():2446] Wrapping output streams.
|
| 18 |
+
2026-02-09 08:46:48,475 INFO MainThread:4723 [wandb_run.py:_redirect():2469] Redirects installed.
|
| 19 |
+
2026-02-09 08:46:48,477 INFO MainThread:4723 [wandb_init.py:init():1082] run started, returning control to user process
|
| 20 |
+
2026-02-09 08:46:48,478 INFO MainThread:4723 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 32, 'target_modules': ['o_proj', 'gate_proj', 'q_proj', 'up_proj', 'v_proj', 'down_proj', 'k_proj'], 'exclude_modules': None, 'lora_alpha': 64, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 
'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp2/B_dup', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0001, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 10, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.01, 'warmup_steps': 0.01, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 10, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 
'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 
'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
|
| 21 |
+
2026-02-09 08:46:48,484 INFO MainThread:4723 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8278029312 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7771762c0550>>
|
| 22 |
+
2026-02-09 08:46:48,485 INFO MainThread:4723 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8278029312 None
|
| 23 |
+
2026-02-09 08:46:48,487 INFO MainThread:4723 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 
'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d34_r300'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 64, 'lora_dropout': 0.03, 'lora_rank': 32, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': 
None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
|
| 24 |
+
2026-02-09 08:48:22,693 INFO wandb-AsyncioManager-main:4723 [service_client.py:_forward_responses():94] Reached EOF.
|
| 25 |
+
2026-02-09 08:48:22,694 INFO wandb-AsyncioManager-main:4723 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.
|
LlamaFactory/wandb/run-20260209_084647-55tyrmzu/run-55tyrmzu.wandb
ADDED
|
Binary file (49.1 kB). View file
|
|
|
LlamaFactory/wandb/run-20260209_085051-sxxworn9/files/output.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LlamaFactory/wandb/run-20260209_085051-sxxworn9/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-6.8.0-60-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.10",
|
| 4 |
+
"startedAt": "2026-02-09T08:50:51.146337Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"/workspace/v127rc_exp2/B_dup.yaml"
|
| 7 |
+
],
|
| 8 |
+
"program": "/usr/local/bin/llamafactory-cli",
|
| 9 |
+
"git": {
|
| 10 |
+
"remote": "https://github.com/hiyouga/LlamaFactory.git",
|
| 11 |
+
"commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
|
| 12 |
+
},
|
| 13 |
+
"email": "markmochi200@gmail.com",
|
| 14 |
+
"root": "/workspace/LlamaFactory",
|
| 15 |
+
"host": "3bebe963f251",
|
| 16 |
+
"executable": "/usr/bin/python",
|
| 17 |
+
"cpu_count": 16,
|
| 18 |
+
"cpu_count_logical": 32,
|
| 19 |
+
"gpu": "NVIDIA GeForce RTX 4090",
|
| 20 |
+
"gpu_count": 1,
|
| 21 |
+
"disk": {
|
| 22 |
+
"/": {
|
| 23 |
+
"total": "21474836480",
|
| 24 |
+
"used": "2060427264"
|
| 25 |
+
}
|
| 26 |
+
},
|
| 27 |
+
"memory": {
|
| 28 |
+
"total": "134156767232"
|
| 29 |
+
},
|
| 30 |
+
"gpu_nvidia": [
|
| 31 |
+
{
|
| 32 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 33 |
+
"memoryTotal": "25757220864",
|
| 34 |
+
"cudaCores": 16384,
|
| 35 |
+
"architecture": "Ada",
|
| 36 |
+
"uuid": "GPU-6c1e98c2-1b34-cfd8-5de5-319e272f1d1e"
|
| 37 |
+
}
|
| 38 |
+
],
|
| 39 |
+
"cudaVersion": "12.9",
|
| 40 |
+
"writerId": "iuuq28fefy6u1tv2cie29tfnokxlsg0z"
|
| 41 |
+
}
|
LlamaFactory/wandb/run-20260209_085051-sxxworn9/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-02-09T08:50:51.398312029Z","level":"INFO","msg":"stream: starting","core version":"0.24.2"}
|
| 2 |
+
{"time":"2026-02-09T08:50:51.715694946Z","level":"INFO","msg":"stream: created new stream","id":"sxxworn9"}
|
| 3 |
+
{"time":"2026-02-09T08:50:51.716325506Z","level":"INFO","msg":"handler: started","stream_id":"sxxworn9"}
|
| 4 |
+
{"time":"2026-02-09T08:50:51.718352807Z","level":"INFO","msg":"stream: started","id":"sxxworn9"}
|
| 5 |
+
{"time":"2026-02-09T08:50:51.718357797Z","level":"INFO","msg":"writer: started","stream_id":"sxxworn9"}
|
| 6 |
+
{"time":"2026-02-09T08:50:51.718367484Z","level":"INFO","msg":"sender: started","stream_id":"sxxworn9"}
|
| 7 |
+
{"time":"2026-02-10T17:01:06.72420802Z","level":"INFO","msg":"stream: closing","id":"sxxworn9"}
|
| 8 |
+
{"time":"2026-02-10T17:01:08.23332074Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 9 |
+
{"time":"2026-02-10T17:01:08.470443353Z","level":"INFO","msg":"handler: closed","stream_id":"sxxworn9"}
|
| 10 |
+
{"time":"2026-02-10T17:01:08.474505531Z","level":"INFO","msg":"sender: closed","stream_id":"sxxworn9"}
|
| 11 |
+
{"time":"2026-02-10T17:01:08.474851934Z","level":"INFO","msg":"stream: closed","id":"sxxworn9"}
|
LlamaFactory/wandb/run-20260209_085051-sxxworn9/logs/debug.log
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-02-09 08:50:51,173 INFO MainThread:5887 [wandb_setup.py:_flush():81] Current SDK version is 0.24.2
|
| 2 |
+
2026-02-09 08:50:51,174 INFO MainThread:5887 [wandb_setup.py:_flush():81] Configure stats pid to 5887
|
| 3 |
+
2026-02-09 08:50:51,174 INFO MainThread:5887 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 4 |
+
2026-02-09 08:50:51,175 INFO MainThread:5887 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260209_085051-sxxworn9/logs/debug.log
|
| 5 |
+
2026-02-09 08:50:51,176 INFO MainThread:5887 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260209_085051-sxxworn9/logs/debug-internal.log
|
| 6 |
+
2026-02-09 08:50:51,176 INFO MainThread:5887 [wandb_init.py:init():844] calling init triggers
|
| 7 |
+
2026-02-09 08:50:51,176 INFO MainThread:5887 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
|
| 8 |
+
config: {'_wandb': {}}
|
| 9 |
+
2026-02-09 08:50:51,177 INFO MainThread:5887 [wandb_init.py:init():892] starting backend
|
| 10 |
+
2026-02-09 08:50:51,387 INFO MainThread:5887 [wandb_init.py:init():895] sending inform_init request
|
| 11 |
+
2026-02-09 08:50:51,395 INFO MainThread:5887 [wandb_init.py:init():903] backend started and connected
|
| 12 |
+
2026-02-09 08:50:51,397 INFO MainThread:5887 [wandb_init.py:init():973] updated telemetry
|
| 13 |
+
2026-02-09 08:50:51,476 INFO MainThread:5887 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
|
| 14 |
+
2026-02-09 08:50:51,992 INFO MainThread:5887 [wandb_init.py:init():1042] starting run threads in backend
|
| 15 |
+
2026-02-09 08:50:52,060 INFO MainThread:5887 [wandb_run.py:_console_start():2529] atexit reg
|
| 16 |
+
2026-02-09 08:50:52,061 INFO MainThread:5887 [wandb_run.py:_redirect():2377] redirect: wrap_raw
|
| 17 |
+
2026-02-09 08:50:52,061 INFO MainThread:5887 [wandb_run.py:_redirect():2446] Wrapping output streams.
|
| 18 |
+
2026-02-09 08:50:52,062 INFO MainThread:5887 [wandb_run.py:_redirect():2469] Redirects installed.
|
| 19 |
+
2026-02-09 08:50:52,064 INFO MainThread:5887 [wandb_init.py:init():1082] run started, returning control to user process
|
| 20 |
+
2026-02-09 08:50:52,065 INFO MainThread:5887 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 32, 'target_modules': ['q_proj', 'gate_proj', 'o_proj', 'v_proj', 'up_proj', 'down_proj', 'k_proj'], 'exclude_modules': None, 'lora_alpha': 64, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 
'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp2/B_dup', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0001, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 10, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.01, 'warmup_steps': 0.01, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 
'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 
'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
|
| 21 |
+
2026-02-09 08:50:52,071 INFO MainThread:5887 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8278029312 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7e370842d110>>
|
| 22 |
+
2026-02-09 08:50:52,071 INFO MainThread:5887 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8278029312 None
|
| 23 |
+
2026-02-09 08:50:52,073 INFO MainThread:5887 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 
'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d34_r300'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 64, 'lora_dropout': 0.03, 'lora_rank': 32, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': 
None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
|
| 24 |
+
2026-02-10 17:01:06,724 INFO wandb-AsyncioManager-main:5887 [service_client.py:_forward_responses():94] Reached EOF.
|
| 25 |
+
2026-02-10 17:01:06,725 INFO wandb-AsyncioManager-main:5887 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.
|
v127rc_exp2/B_mup/checkpoint-12200/chat_template.jinja
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0].role == 'system' %}
|
| 4 |
+
{{- messages[0].content + '\n\n' }}
|
| 5 |
+
{%- endif %}
|
| 6 |
+
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 7 |
+
{%- for tool in tools %}
|
| 8 |
+
{{- "\n" }}
|
| 9 |
+
{{- tool | tojson }}
|
| 10 |
+
{%- endfor %}
|
| 11 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 12 |
+
{%- else %}
|
| 13 |
+
{%- if messages[0].role == 'system' %}
|
| 14 |
+
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
| 15 |
+
{%- endif %}
|
| 16 |
+
{%- endif %}
|
| 17 |
+
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
| 18 |
+
{%- for message in messages[::-1] %}
|
| 19 |
+
{%- set index = (messages|length - 1) - loop.index0 %}
|
| 20 |
+
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
| 21 |
+
{%- set ns.multi_step_tool = false %}
|
| 22 |
+
{%- set ns.last_query_index = index %}
|
| 23 |
+
{%- endif %}
|
| 24 |
+
{%- endfor %}
|
| 25 |
+
{%- for message in messages %}
|
| 26 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
| 27 |
+
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
| 28 |
+
{%- elif message.role == "assistant" %}
|
| 29 |
+
{%- set content = message.content %}
|
| 30 |
+
{%- set reasoning_content = '' %}
|
| 31 |
+
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
|
| 32 |
+
{%- set reasoning_content = message.reasoning_content %}
|
| 33 |
+
{%- else %}
|
| 34 |
+
{%- if '</think>' in message.content %}
|
| 35 |
+
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
|
| 36 |
+
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
| 37 |
+
{%- endif %}
|
| 38 |
+
{%- endif %}
|
| 39 |
+
{%- if loop.index0 > ns.last_query_index %}
|
| 40 |
+
{%- if loop.last or (not loop.last and reasoning_content) %}
|
| 41 |
+
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
| 42 |
+
{%- else %}
|
| 43 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 44 |
+
{%- endif %}
|
| 45 |
+
{%- else %}
|
| 46 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 47 |
+
{%- endif %}
|
| 48 |
+
{%- if message.tool_calls %}
|
| 49 |
+
{%- for tool_call in message.tool_calls %}
|
| 50 |
+
{%- if (loop.first and content) or (not loop.first) %}
|
| 51 |
+
{{- '\n' }}
|
| 52 |
+
{%- endif %}
|
| 53 |
+
{%- if tool_call.function %}
|
| 54 |
+
{%- set tool_call = tool_call.function %}
|
| 55 |
+
{%- endif %}
|
| 56 |
+
{{- '<tool_call>\n{"name": "' }}
|
| 57 |
+
{{- tool_call.name }}
|
| 58 |
+
{{- '", "arguments": ' }}
|
| 59 |
+
{%- if tool_call.arguments is string %}
|
| 60 |
+
{{- tool_call.arguments }}
|
| 61 |
+
{%- else %}
|
| 62 |
+
{{- tool_call.arguments | tojson }}
|
| 63 |
+
{%- endif %}
|
| 64 |
+
{{- '}\n</tool_call>' }}
|
| 65 |
+
{%- endfor %}
|
| 66 |
+
{%- endif %}
|
| 67 |
+
{{- '<|im_end|>\n' }}
|
| 68 |
+
{%- elif message.role == "tool" %}
|
| 69 |
+
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
| 70 |
+
{{- '<|im_start|>user' }}
|
| 71 |
+
{%- endif %}
|
| 72 |
+
{{- '\n<tool_response>\n' }}
|
| 73 |
+
{{- message.content }}
|
| 74 |
+
{{- '\n</tool_response>' }}
|
| 75 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 76 |
+
{{- '<|im_end|>\n' }}
|
| 77 |
+
{%- endif %}
|
| 78 |
+
{%- endif %}
|
| 79 |
+
{%- endfor %}
|
| 80 |
+
{%- if add_generation_prompt %}
|
| 81 |
+
{{- '<|im_start|>assistant\n' }}
|
| 82 |
+
{%- if enable_thinking is defined and enable_thinking is false %}
|
| 83 |
+
{{- '<think>\n\n</think>\n\n' }}
|
| 84 |
+
{%- endif %}
|
| 85 |
+
{%- endif %}
|
v127rc_exp2/B_mup/checkpoint-12200/tokenizer_config.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": null,
|
| 5 |
+
"clean_up_tokenization_spaces": false,
|
| 6 |
+
"eos_token": "<|im_end|>",
|
| 7 |
+
"errors": "replace",
|
| 8 |
+
"extra_special_tokens": [
|
| 9 |
+
"<think>",
|
| 10 |
+
"</think>"
|
| 11 |
+
],
|
| 12 |
+
"is_local": true,
|
| 13 |
+
"model_max_length": 131072,
|
| 14 |
+
"pad_token": "<|endoftext|>",
|
| 15 |
+
"padding_side": "right",
|
| 16 |
+
"split_special_tokens": false,
|
| 17 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 18 |
+
"unk_token": null
|
| 19 |
+
}
|
v127rc_exp2/B_mup/checkpoint-12300/README.md
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: /workspace/Qwen/Qwen3-8B-Base
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:/workspace/Qwen/Qwen3-8B-Base
|
| 7 |
+
- llama-factory
|
| 8 |
+
- lora
|
| 9 |
+
- transformers
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Model Card for Model ID
|
| 13 |
+
|
| 14 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
## Model Details
|
| 19 |
+
|
| 20 |
+
### Model Description
|
| 21 |
+
|
| 22 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
- **Developed by:** [More Information Needed]
|
| 27 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 28 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 29 |
+
- **Model type:** [More Information Needed]
|
| 30 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 31 |
+
- **License:** [More Information Needed]
|
| 32 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 33 |
+
|
| 34 |
+
### Model Sources [optional]
|
| 35 |
+
|
| 36 |
+
<!-- Provide the basic links for the model. -->
|
| 37 |
+
|
| 38 |
+
- **Repository:** [More Information Needed]
|
| 39 |
+
- **Paper [optional]:** [More Information Needed]
|
| 40 |
+
- **Demo [optional]:** [More Information Needed]
|
| 41 |
+
|
| 42 |
+
## Uses
|
| 43 |
+
|
| 44 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 45 |
+
|
| 46 |
+
### Direct Use
|
| 47 |
+
|
| 48 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 49 |
+
|
| 50 |
+
[More Information Needed]
|
| 51 |
+
|
| 52 |
+
### Downstream Use [optional]
|
| 53 |
+
|
| 54 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 55 |
+
|
| 56 |
+
[More Information Needed]
|
| 57 |
+
|
| 58 |
+
### Out-of-Scope Use
|
| 59 |
+
|
| 60 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 61 |
+
|
| 62 |
+
[More Information Needed]
|
| 63 |
+
|
| 64 |
+
## Bias, Risks, and Limitations
|
| 65 |
+
|
| 66 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 67 |
+
|
| 68 |
+
[More Information Needed]
|
| 69 |
+
|
| 70 |
+
### Recommendations
|
| 71 |
+
|
| 72 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 73 |
+
|
| 74 |
+
Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
|
| 75 |
+
|
| 76 |
+
## How to Get Started with the Model
|
| 77 |
+
|
| 78 |
+
Use the code below to get started with the model.
|
| 79 |
+
|
| 80 |
+
[More Information Needed]
|
| 81 |
+
|
| 82 |
+
## Training Details
|
| 83 |
+
|
| 84 |
+
### Training Data
|
| 85 |
+
|
| 86 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 87 |
+
|
| 88 |
+
[More Information Needed]
|
| 89 |
+
|
| 90 |
+
### Training Procedure
|
| 91 |
+
|
| 92 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 93 |
+
|
| 94 |
+
#### Preprocessing [optional]
|
| 95 |
+
|
| 96 |
+
[More Information Needed]
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
#### Training Hyperparameters
|
| 100 |
+
|
| 101 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 102 |
+
|
| 103 |
+
#### Speeds, Sizes, Times [optional]
|
| 104 |
+
|
| 105 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 106 |
+
|
| 107 |
+
[More Information Needed]
|
| 108 |
+
|
| 109 |
+
## Evaluation
|
| 110 |
+
|
| 111 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 112 |
+
|
| 113 |
+
### Testing Data, Factors & Metrics
|
| 114 |
+
|
| 115 |
+
#### Testing Data
|
| 116 |
+
|
| 117 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 118 |
+
|
| 119 |
+
[More Information Needed]
|
| 120 |
+
|
| 121 |
+
#### Factors
|
| 122 |
+
|
| 123 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 124 |
+
|
| 125 |
+
[More Information Needed]
|
| 126 |
+
|
| 127 |
+
#### Metrics
|
| 128 |
+
|
| 129 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 130 |
+
|
| 131 |
+
[More Information Needed]
|
| 132 |
+
|
| 133 |
+
### Results
|
| 134 |
+
|
| 135 |
+
[More Information Needed]
|
| 136 |
+
|
| 137 |
+
#### Summary
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
## Model Examination [optional]
|
| 142 |
+
|
| 143 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 144 |
+
|
| 145 |
+
[More Information Needed]
|
| 146 |
+
|
| 147 |
+
## Environmental Impact
|
| 148 |
+
|
| 149 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly. -->
|
| 150 |
+
|
| 151 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 152 |
+
|
| 153 |
+
- **Hardware Type:** [More Information Needed]
|
| 154 |
+
- **Hours used:** [More Information Needed]
|
| 155 |
+
- **Cloud Provider:** [More Information Needed]
|
| 156 |
+
- **Compute Region:** [More Information Needed]
|
| 157 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 158 |
+
|
| 159 |
+
## Technical Specifications [optional]
|
| 160 |
+
|
| 161 |
+
### Model Architecture and Objective
|
| 162 |
+
|
| 163 |
+
[More Information Needed]
|
| 164 |
+
|
| 165 |
+
### Compute Infrastructure
|
| 166 |
+
|
| 167 |
+
[More Information Needed]
|
| 168 |
+
|
| 169 |
+
#### Hardware
|
| 170 |
+
|
| 171 |
+
[More Information Needed]
|
| 172 |
+
|
| 173 |
+
#### Software
|
| 174 |
+
|
| 175 |
+
[More Information Needed]
|
| 176 |
+
|
| 177 |
+
## Citation [optional]
|
| 178 |
+
|
| 179 |
+
<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
|
| 180 |
+
|
| 181 |
+
**BibTeX:**
|
| 182 |
+
|
| 183 |
+
[More Information Needed]
|
| 184 |
+
|
| 185 |
+
**APA:**
|
| 186 |
+
|
| 187 |
+
[More Information Needed]
|
| 188 |
+
|
| 189 |
+
## Glossary [optional]
|
| 190 |
+
|
| 191 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 192 |
+
|
| 193 |
+
[More Information Needed]
|
| 194 |
+
|
| 195 |
+
## More Information [optional]
|
| 196 |
+
|
| 197 |
+
[More Information Needed]
|
| 198 |
+
|
| 199 |
+
## Model Card Authors [optional]
|
| 200 |
+
|
| 201 |
+
[More Information Needed]
|
| 202 |
+
|
| 203 |
+
## Model Card Contact
|
| 204 |
+
|
| 205 |
+
[More Information Needed]
|
| 206 |
+
### Framework versions
|
| 207 |
+
|
| 208 |
+
- PEFT 0.18.1
|
v127rc_exp2/B_mup/checkpoint-12300/adapter_config.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "/workspace/Qwen/Qwen3-8B-Base",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 64,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.03,
|
| 22 |
+
"megatron_config": null,
|
| 23 |
+
"megatron_core": "megatron.core",
|
| 24 |
+
"modules_to_save": null,
|
| 25 |
+
"peft_type": "LORA",
|
| 26 |
+
"peft_version": "0.18.1",
|
| 27 |
+
"qalora_group_size": 16,
|
| 28 |
+
"r": 32,
|
| 29 |
+
"rank_pattern": {},
|
| 30 |
+
"revision": null,
|
| 31 |
+
"target_modules": [
|
| 32 |
+
"q_proj",
|
| 33 |
+
"v_proj",
|
| 34 |
+
"down_proj",
|
| 35 |
+
"up_proj",
|
| 36 |
+
"gate_proj",
|
| 37 |
+
"k_proj",
|
| 38 |
+
"o_proj"
|
| 39 |
+
],
|
| 40 |
+
"target_parameters": null,
|
| 41 |
+
"task_type": "CAUSAL_LM",
|
| 42 |
+
"trainable_token_indices": null,
|
| 43 |
+
"use_dora": false,
|
| 44 |
+
"use_qalora": false,
|
| 45 |
+
"use_rslora": false
|
| 46 |
+
}
|
v127rc_exp2/B_mup/checkpoint-12300/chat_template.jinja
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0].role == 'system' %}
|
| 4 |
+
{{- messages[0].content + '\n\n' }}
|
| 5 |
+
{%- endif %}
|
| 6 |
+
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 7 |
+
{%- for tool in tools %}
|
| 8 |
+
{{- "\n" }}
|
| 9 |
+
{{- tool | tojson }}
|
| 10 |
+
{%- endfor %}
|
| 11 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 12 |
+
{%- else %}
|
| 13 |
+
{%- if messages[0].role == 'system' %}
|
| 14 |
+
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
| 15 |
+
{%- endif %}
|
| 16 |
+
{%- endif %}
|
| 17 |
+
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
| 18 |
+
{%- for message in messages[::-1] %}
|
| 19 |
+
{%- set index = (messages|length - 1) - loop.index0 %}
|
| 20 |
+
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
| 21 |
+
{%- set ns.multi_step_tool = false %}
|
| 22 |
+
{%- set ns.last_query_index = index %}
|
| 23 |
+
{%- endif %}
|
| 24 |
+
{%- endfor %}
|
| 25 |
+
{%- for message in messages %}
|
| 26 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
| 27 |
+
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
| 28 |
+
{%- elif message.role == "assistant" %}
|
| 29 |
+
{%- set content = message.content %}
|
| 30 |
+
{%- set reasoning_content = '' %}
|
| 31 |
+
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
|
| 32 |
+
{%- set reasoning_content = message.reasoning_content %}
|
| 33 |
+
{%- else %}
|
| 34 |
+
{%- if '</think>' in message.content %}
|
| 35 |
+
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
|
| 36 |
+
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
| 37 |
+
{%- endif %}
|
| 38 |
+
{%- endif %}
|
| 39 |
+
{%- if loop.index0 > ns.last_query_index %}
|
| 40 |
+
{%- if loop.last or (not loop.last and reasoning_content) %}
|
| 41 |
+
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
| 42 |
+
{%- else %}
|
| 43 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 44 |
+
{%- endif %}
|
| 45 |
+
{%- else %}
|
| 46 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 47 |
+
{%- endif %}
|
| 48 |
+
{%- if message.tool_calls %}
|
| 49 |
+
{%- for tool_call in message.tool_calls %}
|
| 50 |
+
{%- if (loop.first and content) or (not loop.first) %}
|
| 51 |
+
{{- '\n' }}
|
| 52 |
+
{%- endif %}
|
| 53 |
+
{%- if tool_call.function %}
|
| 54 |
+
{%- set tool_call = tool_call.function %}
|
| 55 |
+
{%- endif %}
|
| 56 |
+
{{- '<tool_call>\n{"name": "' }}
|
| 57 |
+
{{- tool_call.name }}
|
| 58 |
+
{{- '", "arguments": ' }}
|
| 59 |
+
{%- if tool_call.arguments is string %}
|
| 60 |
+
{{- tool_call.arguments }}
|
| 61 |
+
{%- else %}
|
| 62 |
+
{{- tool_call.arguments | tojson }}
|
| 63 |
+
{%- endif %}
|
| 64 |
+
{{- '}\n</tool_call>' }}
|
| 65 |
+
{%- endfor %}
|
| 66 |
+
{%- endif %}
|
| 67 |
+
{{- '<|im_end|>\n' }}
|
| 68 |
+
{%- elif message.role == "tool" %}
|
| 69 |
+
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
| 70 |
+
{{- '<|im_start|>user' }}
|
| 71 |
+
{%- endif %}
|
| 72 |
+
{{- '\n<tool_response>\n' }}
|
| 73 |
+
{{- message.content }}
|
| 74 |
+
{{- '\n</tool_response>' }}
|
| 75 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 76 |
+
{{- '<|im_end|>\n' }}
|
| 77 |
+
{%- endif %}
|
| 78 |
+
{%- endif %}
|
| 79 |
+
{%- endfor %}
|
| 80 |
+
{%- if add_generation_prompt %}
|
| 81 |
+
{{- '<|im_start|>assistant\n' }}
|
| 82 |
+
{%- if enable_thinking is defined and enable_thinking is false %}
|
| 83 |
+
{{- '<think>\n\n</think>\n\n' }}
|
| 84 |
+
{%- endif %}
|
| 85 |
+
{%- endif %}
|
v127rc_exp2/B_mup/checkpoint-12300/tokenizer_config.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": null,
|
| 5 |
+
"clean_up_tokenization_spaces": false,
|
| 6 |
+
"eos_token": "<|im_end|>",
|
| 7 |
+
"errors": "replace",
|
| 8 |
+
"extra_special_tokens": [
|
| 9 |
+
"<think>",
|
| 10 |
+
"</think>"
|
| 11 |
+
],
|
| 12 |
+
"is_local": true,
|
| 13 |
+
"model_max_length": 131072,
|
| 14 |
+
"pad_token": "<|endoftext|>",
|
| 15 |
+
"padding_side": "right",
|
| 16 |
+
"split_special_tokens": false,
|
| 17 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 18 |
+
"unk_token": null
|
| 19 |
+
}
|
v127rc_exp2/B_mup/checkpoint-12400/README.md
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: /workspace/Qwen/Qwen3-8B-Base
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:/workspace/Qwen/Qwen3-8B-Base
|
| 7 |
+
- llama-factory
|
| 8 |
+
- lora
|
| 9 |
+
- transformers
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Model Card for Model ID
|
| 13 |
+
|
| 14 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
## Model Details
|
| 19 |
+
|
| 20 |
+
### Model Description
|
| 21 |
+
|
| 22 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
- **Developed by:** [More Information Needed]
|
| 27 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 28 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 29 |
+
- **Model type:** [More Information Needed]
|
| 30 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 31 |
+
- **License:** [More Information Needed]
|
| 32 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 33 |
+
|
| 34 |
+
### Model Sources [optional]
|
| 35 |
+
|
| 36 |
+
<!-- Provide the basic links for the model. -->
|
| 37 |
+
|
| 38 |
+
- **Repository:** [More Information Needed]
|
| 39 |
+
- **Paper [optional]:** [More Information Needed]
|
| 40 |
+
- **Demo [optional]:** [More Information Needed]
|
| 41 |
+
|
| 42 |
+
## Uses
|
| 43 |
+
|
| 44 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 45 |
+
|
| 46 |
+
### Direct Use
|
| 47 |
+
|
| 48 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 49 |
+
|
| 50 |
+
[More Information Needed]
|
| 51 |
+
|
| 52 |
+
### Downstream Use [optional]
|
| 53 |
+
|
| 54 |
+
<!-- This section covers use of the model when it is fine-tuned for a task or plugged into a larger ecosystem/app. -->
|
| 55 |
+
|
| 56 |
+
[More Information Needed]
|
| 57 |
+
|
| 58 |
+
### Out-of-Scope Use
|
| 59 |
+
|
| 60 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 61 |
+
|
| 62 |
+
[More Information Needed]
|
| 63 |
+
|
| 64 |
+
## Bias, Risks, and Limitations
|
| 65 |
+
|
| 66 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 67 |
+
|
| 68 |
+
[More Information Needed]
|
| 69 |
+
|
| 70 |
+
### Recommendations
|
| 71 |
+
|
| 72 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 73 |
+
|
| 74 |
+
Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
|
| 75 |
+
|
| 76 |
+
## How to Get Started with the Model
|
| 77 |
+
|
| 78 |
+
Use the code below to get started with the model.
|
| 79 |
+
|
| 80 |
+
[More Information Needed]
|
| 81 |
+
|
| 82 |
+
## Training Details
|
| 83 |
+
|
| 84 |
+
### Training Data
|
| 85 |
+
|
| 86 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 87 |
+
|
| 88 |
+
[More Information Needed]
|
| 89 |
+
|
| 90 |
+
### Training Procedure
|
| 91 |
+
|
| 92 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 93 |
+
|
| 94 |
+
#### Preprocessing [optional]
|
| 95 |
+
|
| 96 |
+
[More Information Needed]
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
#### Training Hyperparameters
|
| 100 |
+
|
| 101 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 102 |
+
|
| 103 |
+
#### Speeds, Sizes, Times [optional]
|
| 104 |
+
|
| 105 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 106 |
+
|
| 107 |
+
[More Information Needed]
|
| 108 |
+
|
| 109 |
+
## Evaluation
|
| 110 |
+
|
| 111 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 112 |
+
|
| 113 |
+
### Testing Data, Factors & Metrics
|
| 114 |
+
|
| 115 |
+
#### Testing Data
|
| 116 |
+
|
| 117 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 118 |
+
|
| 119 |
+
[More Information Needed]
|
| 120 |
+
|
| 121 |
+
#### Factors
|
| 122 |
+
|
| 123 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 124 |
+
|
| 125 |
+
[More Information Needed]
|
| 126 |
+
|
| 127 |
+
#### Metrics
|
| 128 |
+
|
| 129 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 130 |
+
|
| 131 |
+
[More Information Needed]
|
| 132 |
+
|
| 133 |
+
### Results
|
| 134 |
+
|
| 135 |
+
[More Information Needed]
|
| 136 |
+
|
| 137 |
+
#### Summary
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
## Model Examination [optional]
|
| 142 |
+
|
| 143 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 144 |
+
|
| 145 |
+
[More Information Needed]
|
| 146 |
+
|
| 147 |
+
## Environmental Impact
|
| 148 |
+
|
| 149 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly. -->
|
| 150 |
+
|
| 151 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 152 |
+
|
| 153 |
+
- **Hardware Type:** [More Information Needed]
|
| 154 |
+
- **Hours used:** [More Information Needed]
|
| 155 |
+
- **Cloud Provider:** [More Information Needed]
|
| 156 |
+
- **Compute Region:** [More Information Needed]
|
| 157 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 158 |
+
|
| 159 |
+
## Technical Specifications [optional]
|
| 160 |
+
|
| 161 |
+
### Model Architecture and Objective
|
| 162 |
+
|
| 163 |
+
[More Information Needed]
|
| 164 |
+
|
| 165 |
+
### Compute Infrastructure
|
| 166 |
+
|
| 167 |
+
[More Information Needed]
|
| 168 |
+
|
| 169 |
+
#### Hardware
|
| 170 |
+
|
| 171 |
+
[More Information Needed]
|
| 172 |
+
|
| 173 |
+
#### Software
|
| 174 |
+
|
| 175 |
+
[More Information Needed]
|
| 176 |
+
|
| 177 |
+
## Citation [optional]
|
| 178 |
+
|
| 179 |
+
<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
|
| 180 |
+
|
| 181 |
+
**BibTeX:**
|
| 182 |
+
|
| 183 |
+
[More Information Needed]
|
| 184 |
+
|
| 185 |
+
**APA:**
|
| 186 |
+
|
| 187 |
+
[More Information Needed]
|
| 188 |
+
|
| 189 |
+
## Glossary [optional]
|
| 190 |
+
|
| 191 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 192 |
+
|
| 193 |
+
[More Information Needed]
|
| 194 |
+
|
| 195 |
+
## More Information [optional]
|
| 196 |
+
|
| 197 |
+
[More Information Needed]
|
| 198 |
+
|
| 199 |
+
## Model Card Authors [optional]
|
| 200 |
+
|
| 201 |
+
[More Information Needed]
|
| 202 |
+
|
| 203 |
+
## Model Card Contact
|
| 204 |
+
|
| 205 |
+
[More Information Needed]
|
| 206 |
+
### Framework versions
|
| 207 |
+
|
| 208 |
+
- PEFT 0.18.1
|
v127rc_exp2/B_mup/checkpoint-12400/adapter_config.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "/workspace/Qwen/Qwen3-8B-Base",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 64,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.03,
|
| 22 |
+
"megatron_config": null,
|
| 23 |
+
"megatron_core": "megatron.core",
|
| 24 |
+
"modules_to_save": null,
|
| 25 |
+
"peft_type": "LORA",
|
| 26 |
+
"peft_version": "0.18.1",
|
| 27 |
+
"qalora_group_size": 16,
|
| 28 |
+
"r": 32,
|
| 29 |
+
"rank_pattern": {},
|
| 30 |
+
"revision": null,
|
| 31 |
+
"target_modules": [
|
| 32 |
+
"q_proj",
|
| 33 |
+
"v_proj",
|
| 34 |
+
"down_proj",
|
| 35 |
+
"up_proj",
|
| 36 |
+
"gate_proj",
|
| 37 |
+
"k_proj",
|
| 38 |
+
"o_proj"
|
| 39 |
+
],
|
| 40 |
+
"target_parameters": null,
|
| 41 |
+
"task_type": "CAUSAL_LM",
|
| 42 |
+
"trainable_token_indices": null,
|
| 43 |
+
"use_dora": false,
|
| 44 |
+
"use_qalora": false,
|
| 45 |
+
"use_rslora": false
|
| 46 |
+
}
|
v127rc_exp2/B_mup/checkpoint-12400/chat_template.jinja
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0].role == 'system' %}
|
| 4 |
+
{{- messages[0].content + '\n\n' }}
|
| 5 |
+
{%- endif %}
|
| 6 |
+
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 7 |
+
{%- for tool in tools %}
|
| 8 |
+
{{- "\n" }}
|
| 9 |
+
{{- tool | tojson }}
|
| 10 |
+
{%- endfor %}
|
| 11 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 12 |
+
{%- else %}
|
| 13 |
+
{%- if messages[0].role == 'system' %}
|
| 14 |
+
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
| 15 |
+
{%- endif %}
|
| 16 |
+
{%- endif %}
|
| 17 |
+
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
| 18 |
+
{%- for message in messages[::-1] %}
|
| 19 |
+
{%- set index = (messages|length - 1) - loop.index0 %}
|
| 20 |
+
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
| 21 |
+
{%- set ns.multi_step_tool = false %}
|
| 22 |
+
{%- set ns.last_query_index = index %}
|
| 23 |
+
{%- endif %}
|
| 24 |
+
{%- endfor %}
|
| 25 |
+
{%- for message in messages %}
|
| 26 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
| 27 |
+
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
| 28 |
+
{%- elif message.role == "assistant" %}
|
| 29 |
+
{%- set content = message.content %}
|
| 30 |
+
{%- set reasoning_content = '' %}
|
| 31 |
+
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
|
| 32 |
+
{%- set reasoning_content = message.reasoning_content %}
|
| 33 |
+
{%- else %}
|
| 34 |
+
{%- if '</think>' in message.content %}
|
| 35 |
+
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
|
| 36 |
+
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
| 37 |
+
{%- endif %}
|
| 38 |
+
{%- endif %}
|
| 39 |
+
{%- if loop.index0 > ns.last_query_index %}
|
| 40 |
+
{%- if loop.last or (not loop.last and reasoning_content) %}
|
| 41 |
+
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
| 42 |
+
{%- else %}
|
| 43 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 44 |
+
{%- endif %}
|
| 45 |
+
{%- else %}
|
| 46 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 47 |
+
{%- endif %}
|
| 48 |
+
{%- if message.tool_calls %}
|
| 49 |
+
{%- for tool_call in message.tool_calls %}
|
| 50 |
+
{%- if (loop.first and content) or (not loop.first) %}
|
| 51 |
+
{{- '\n' }}
|
| 52 |
+
{%- endif %}
|
| 53 |
+
{%- if tool_call.function %}
|
| 54 |
+
{%- set tool_call = tool_call.function %}
|
| 55 |
+
{%- endif %}
|
| 56 |
+
{{- '<tool_call>\n{"name": "' }}
|
| 57 |
+
{{- tool_call.name }}
|
| 58 |
+
{{- '", "arguments": ' }}
|
| 59 |
+
{%- if tool_call.arguments is string %}
|
| 60 |
+
{{- tool_call.arguments }}
|
| 61 |
+
{%- else %}
|
| 62 |
+
{{- tool_call.arguments | tojson }}
|
| 63 |
+
{%- endif %}
|
| 64 |
+
{{- '}\n</tool_call>' }}
|
| 65 |
+
{%- endfor %}
|
| 66 |
+
{%- endif %}
|
| 67 |
+
{{- '<|im_end|>\n' }}
|
| 68 |
+
{%- elif message.role == "tool" %}
|
| 69 |
+
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
| 70 |
+
{{- '<|im_start|>user' }}
|
| 71 |
+
{%- endif %}
|
| 72 |
+
{{- '\n<tool_response>\n' }}
|
| 73 |
+
{{- message.content }}
|
| 74 |
+
{{- '\n</tool_response>' }}
|
| 75 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 76 |
+
{{- '<|im_end|>\n' }}
|
| 77 |
+
{%- endif %}
|
| 78 |
+
{%- endif %}
|
| 79 |
+
{%- endfor %}
|
| 80 |
+
{%- if add_generation_prompt %}
|
| 81 |
+
{{- '<|im_start|>assistant\n' }}
|
| 82 |
+
{%- if enable_thinking is defined and enable_thinking is false %}
|
| 83 |
+
{{- '<think>\n\n</think>\n\n' }}
|
| 84 |
+
{%- endif %}
|
| 85 |
+
{%- endif %}
|
v127rc_exp2/B_mup/checkpoint-12400/tokenizer_config.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": null,
|
| 5 |
+
"clean_up_tokenization_spaces": false,
|
| 6 |
+
"eos_token": "<|im_end|>",
|
| 7 |
+
"errors": "replace",
|
| 8 |
+
"extra_special_tokens": [
|
| 9 |
+
"<think>",
|
| 10 |
+
"</think>"
|
| 11 |
+
],
|
| 12 |
+
"is_local": true,
|
| 13 |
+
"model_max_length": 131072,
|
| 14 |
+
"pad_token": "<|endoftext|>",
|
| 15 |
+
"padding_side": "right",
|
| 16 |
+
"split_special_tokens": false,
|
| 17 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 18 |
+
"unk_token": null
|
| 19 |
+
}
|
v127rc_exp2/B_mup/checkpoint-12400/trainer_state.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
v127rc_exp2/B_mup/checkpoint-12500/README.md
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: /workspace/Qwen/Qwen3-8B-Base
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:/workspace/Qwen/Qwen3-8B-Base
|
| 7 |
+
- llama-factory
|
| 8 |
+
- lora
|
| 9 |
+
- transformers
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Model Card for Model ID
|
| 13 |
+
|
| 14 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
## Model Details
|
| 19 |
+
|
| 20 |
+
### Model Description
|
| 21 |
+
|
| 22 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
- **Developed by:** [More Information Needed]
|
| 27 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 28 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 29 |
+
- **Model type:** [More Information Needed]
|
| 30 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 31 |
+
- **License:** [More Information Needed]
|
| 32 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 33 |
+
|
| 34 |
+
### Model Sources [optional]
|
| 35 |
+
|
| 36 |
+
<!-- Provide the basic links for the model. -->
|
| 37 |
+
|
| 38 |
+
- **Repository:** [More Information Needed]
|
| 39 |
+
- **Paper [optional]:** [More Information Needed]
|
| 40 |
+
- **Demo [optional]:** [More Information Needed]
|
| 41 |
+
|
| 42 |
+
## Uses
|
| 43 |
+
|
| 44 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 45 |
+
|
| 46 |
+
### Direct Use
|
| 47 |
+
|
| 48 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 49 |
+
|
| 50 |
+
[More Information Needed]
|
| 51 |
+
|
| 52 |
+
### Downstream Use [optional]
|
| 53 |
+
|
| 54 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 55 |
+
|
| 56 |
+
[More Information Needed]
|
| 57 |
+
|
| 58 |
+
### Out-of-Scope Use
|
| 59 |
+
|
| 60 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 61 |
+
|
| 62 |
+
[More Information Needed]
|
| 63 |
+
|
| 64 |
+
## Bias, Risks, and Limitations
|
| 65 |
+
|
| 66 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 67 |
+
|
| 68 |
+
[More Information Needed]
|
| 69 |
+
|
| 70 |
+
### Recommendations
|
| 71 |
+
|
| 72 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 73 |
+
|
| 74 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 75 |
+
|
| 76 |
+
## How to Get Started with the Model
|
| 77 |
+
|
| 78 |
+
Use the code below to get started with the model.
|
| 79 |
+
|
| 80 |
+
[More Information Needed]
|
| 81 |
+
|
| 82 |
+
## Training Details
|
| 83 |
+
|
| 84 |
+
### Training Data
|
| 85 |
+
|
| 86 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 87 |
+
|
| 88 |
+
[More Information Needed]
|
| 89 |
+
|
| 90 |
+
### Training Procedure
|
| 91 |
+
|
| 92 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 93 |
+
|
| 94 |
+
#### Preprocessing [optional]
|
| 95 |
+
|
| 96 |
+
[More Information Needed]
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
#### Training Hyperparameters
|
| 100 |
+
|
| 101 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 102 |
+
|
| 103 |
+
#### Speeds, Sizes, Times [optional]
|
| 104 |
+
|
| 105 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 106 |
+
|
| 107 |
+
[More Information Needed]
|
| 108 |
+
|
| 109 |
+
## Evaluation
|
| 110 |
+
|
| 111 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 112 |
+
|
| 113 |
+
### Testing Data, Factors & Metrics
|
| 114 |
+
|
| 115 |
+
#### Testing Data
|
| 116 |
+
|
| 117 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 118 |
+
|
| 119 |
+
[More Information Needed]
|
| 120 |
+
|
| 121 |
+
#### Factors
|
| 122 |
+
|
| 123 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 124 |
+
|
| 125 |
+
[More Information Needed]
|
| 126 |
+
|
| 127 |
+
#### Metrics
|
| 128 |
+
|
| 129 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 130 |
+
|
| 131 |
+
[More Information Needed]
|
| 132 |
+
|
| 133 |
+
### Results
|
| 134 |
+
|
| 135 |
+
[More Information Needed]
|
| 136 |
+
|
| 137 |
+
#### Summary
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
## Model Examination [optional]
|
| 142 |
+
|
| 143 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 144 |
+
|
| 145 |
+
[More Information Needed]
|
| 146 |
+
|
| 147 |
+
## Environmental Impact
|
| 148 |
+
|
| 149 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 150 |
+
|
| 151 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 152 |
+
|
| 153 |
+
- **Hardware Type:** [More Information Needed]
|
| 154 |
+
- **Hours used:** [More Information Needed]
|
| 155 |
+
- **Cloud Provider:** [More Information Needed]
|
| 156 |
+
- **Compute Region:** [More Information Needed]
|
| 157 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 158 |
+
|
| 159 |
+
## Technical Specifications [optional]
|
| 160 |
+
|
| 161 |
+
### Model Architecture and Objective
|
| 162 |
+
|
| 163 |
+
[More Information Needed]
|
| 164 |
+
|
| 165 |
+
### Compute Infrastructure
|
| 166 |
+
|
| 167 |
+
[More Information Needed]
|
| 168 |
+
|
| 169 |
+
#### Hardware
|
| 170 |
+
|
| 171 |
+
[More Information Needed]
|
| 172 |
+
|
| 173 |
+
#### Software
|
| 174 |
+
|
| 175 |
+
[More Information Needed]
|
| 176 |
+
|
| 177 |
+
## Citation [optional]
|
| 178 |
+
|
| 179 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 180 |
+
|
| 181 |
+
**BibTeX:**
|
| 182 |
+
|
| 183 |
+
[More Information Needed]
|
| 184 |
+
|
| 185 |
+
**APA:**
|
| 186 |
+
|
| 187 |
+
[More Information Needed]
|
| 188 |
+
|
| 189 |
+
## Glossary [optional]
|
| 190 |
+
|
| 191 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 192 |
+
|
| 193 |
+
[More Information Needed]
|
| 194 |
+
|
| 195 |
+
## More Information [optional]
|
| 196 |
+
|
| 197 |
+
[More Information Needed]
|
| 198 |
+
|
| 199 |
+
## Model Card Authors [optional]
|
| 200 |
+
|
| 201 |
+
[More Information Needed]
|
| 202 |
+
|
| 203 |
+
## Model Card Contact
|
| 204 |
+
|
| 205 |
+
[More Information Needed]
|
| 206 |
+
### Framework versions
|
| 207 |
+
|
| 208 |
+
- PEFT 0.18.1
|
v127rc_exp2/B_mup/checkpoint-12500/adapter_config.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "/workspace/Qwen/Qwen3-8B-Base",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 64,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.03,
|
| 22 |
+
"megatron_config": null,
|
| 23 |
+
"megatron_core": "megatron.core",
|
| 24 |
+
"modules_to_save": null,
|
| 25 |
+
"peft_type": "LORA",
|
| 26 |
+
"peft_version": "0.18.1",
|
| 27 |
+
"qalora_group_size": 16,
|
| 28 |
+
"r": 32,
|
| 29 |
+
"rank_pattern": {},
|
| 30 |
+
"revision": null,
|
| 31 |
+
"target_modules": [
|
| 32 |
+
"q_proj",
|
| 33 |
+
"v_proj",
|
| 34 |
+
"down_proj",
|
| 35 |
+
"up_proj",
|
| 36 |
+
"gate_proj",
|
| 37 |
+
"k_proj",
|
| 38 |
+
"o_proj"
|
| 39 |
+
],
|
| 40 |
+
"target_parameters": null,
|
| 41 |
+
"task_type": "CAUSAL_LM",
|
| 42 |
+
"trainable_token_indices": null,
|
| 43 |
+
"use_dora": false,
|
| 44 |
+
"use_qalora": false,
|
| 45 |
+
"use_rslora": false
|
| 46 |
+
}
|
v127rc_exp2/B_mup/checkpoint-12500/chat_template.jinja
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0].role == 'system' %}
|
| 4 |
+
{{- messages[0].content + '\n\n' }}
|
| 5 |
+
{%- endif %}
|
| 6 |
+
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 7 |
+
{%- for tool in tools %}
|
| 8 |
+
{{- "\n" }}
|
| 9 |
+
{{- tool | tojson }}
|
| 10 |
+
{%- endfor %}
|
| 11 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 12 |
+
{%- else %}
|
| 13 |
+
{%- if messages[0].role == 'system' %}
|
| 14 |
+
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
| 15 |
+
{%- endif %}
|
| 16 |
+
{%- endif %}
|
| 17 |
+
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
| 18 |
+
{%- for message in messages[::-1] %}
|
| 19 |
+
{%- set index = (messages|length - 1) - loop.index0 %}
|
| 20 |
+
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
| 21 |
+
{%- set ns.multi_step_tool = false %}
|
| 22 |
+
{%- set ns.last_query_index = index %}
|
| 23 |
+
{%- endif %}
|
| 24 |
+
{%- endfor %}
|
| 25 |
+
{%- for message in messages %}
|
| 26 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
| 27 |
+
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
| 28 |
+
{%- elif message.role == "assistant" %}
|
| 29 |
+
{%- set content = message.content %}
|
| 30 |
+
{%- set reasoning_content = '' %}
|
| 31 |
+
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
|
| 32 |
+
{%- set reasoning_content = message.reasoning_content %}
|
| 33 |
+
{%- else %}
|
| 34 |
+
{%- if '</think>' in message.content %}
|
| 35 |
+
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
|
| 36 |
+
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
| 37 |
+
{%- endif %}
|
| 38 |
+
{%- endif %}
|
| 39 |
+
{%- if loop.index0 > ns.last_query_index %}
|
| 40 |
+
{%- if loop.last or (not loop.last and reasoning_content) %}
|
| 41 |
+
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
| 42 |
+
{%- else %}
|
| 43 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 44 |
+
{%- endif %}
|
| 45 |
+
{%- else %}
|
| 46 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 47 |
+
{%- endif %}
|
| 48 |
+
{%- if message.tool_calls %}
|
| 49 |
+
{%- for tool_call in message.tool_calls %}
|
| 50 |
+
{%- if (loop.first and content) or (not loop.first) %}
|
| 51 |
+
{{- '\n' }}
|
| 52 |
+
{%- endif %}
|
| 53 |
+
{%- if tool_call.function %}
|
| 54 |
+
{%- set tool_call = tool_call.function %}
|
| 55 |
+
{%- endif %}
|
| 56 |
+
{{- '<tool_call>\n{"name": "' }}
|
| 57 |
+
{{- tool_call.name }}
|
| 58 |
+
{{- '", "arguments": ' }}
|
| 59 |
+
{%- if tool_call.arguments is string %}
|
| 60 |
+
{{- tool_call.arguments }}
|
| 61 |
+
{%- else %}
|
| 62 |
+
{{- tool_call.arguments | tojson }}
|
| 63 |
+
{%- endif %}
|
| 64 |
+
{{- '}\n</tool_call>' }}
|
| 65 |
+
{%- endfor %}
|
| 66 |
+
{%- endif %}
|
| 67 |
+
{{- '<|im_end|>\n' }}
|
| 68 |
+
{%- elif message.role == "tool" %}
|
| 69 |
+
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
| 70 |
+
{{- '<|im_start|>user' }}
|
| 71 |
+
{%- endif %}
|
| 72 |
+
{{- '\n<tool_response>\n' }}
|
| 73 |
+
{{- message.content }}
|
| 74 |
+
{{- '\n</tool_response>' }}
|
| 75 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 76 |
+
{{- '<|im_end|>\n' }}
|
| 77 |
+
{%- endif %}
|
| 78 |
+
{%- endif %}
|
| 79 |
+
{%- endfor %}
|
| 80 |
+
{%- if add_generation_prompt %}
|
| 81 |
+
{{- '<|im_start|>assistant\n' }}
|
| 82 |
+
{%- if enable_thinking is defined and enable_thinking is false %}
|
| 83 |
+
{{- '<think>\n\n</think>\n\n' }}
|
| 84 |
+
{%- endif %}
|
| 85 |
+
{%- endif %}
|
v127rc_exp2/B_mup/checkpoint-12500/tokenizer_config.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": null,
|
| 5 |
+
"clean_up_tokenization_spaces": false,
|
| 6 |
+
"eos_token": "<|im_end|>",
|
| 7 |
+
"errors": "replace",
|
| 8 |
+
"extra_special_tokens": [
|
| 9 |
+
"<think>",
|
| 10 |
+
"</think>"
|
| 11 |
+
],
|
| 12 |
+
"is_local": true,
|
| 13 |
+
"model_max_length": 131072,
|
| 14 |
+
"pad_token": "<|endoftext|>",
|
| 15 |
+
"padding_side": "right",
|
| 16 |
+
"split_special_tokens": false,
|
| 17 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 18 |
+
"unk_token": null
|
| 19 |
+
}
|
v127rc_exp2/B_mup/checkpoint-12500/trainer_state.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
v127rc_exp2/B_mup/checkpoint-12600/README.md
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: /workspace/Qwen/Qwen3-8B-Base
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:/workspace/Qwen/Qwen3-8B-Base
|
| 7 |
+
- llama-factory
|
| 8 |
+
- lora
|
| 9 |
+
- transformers
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Model Card for Model ID
|
| 13 |
+
|
| 14 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
## Model Details
|
| 19 |
+
|
| 20 |
+
### Model Description
|
| 21 |
+
|
| 22 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
- **Developed by:** [More Information Needed]
|
| 27 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 28 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 29 |
+
- **Model type:** [More Information Needed]
|
| 30 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 31 |
+
- **License:** [More Information Needed]
|
| 32 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 33 |
+
|
| 34 |
+
### Model Sources [optional]
|
| 35 |
+
|
| 36 |
+
<!-- Provide the basic links for the model. -->
|
| 37 |
+
|
| 38 |
+
- **Repository:** [More Information Needed]
|
| 39 |
+
- **Paper [optional]:** [More Information Needed]
|
| 40 |
+
- **Demo [optional]:** [More Information Needed]
|
| 41 |
+
|
| 42 |
+
## Uses
|
| 43 |
+
|
| 44 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 45 |
+
|
| 46 |
+
### Direct Use
|
| 47 |
+
|
| 48 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 49 |
+
|
| 50 |
+
[More Information Needed]
|
| 51 |
+
|
| 52 |
+
### Downstream Use [optional]
|
| 53 |
+
|
| 54 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 55 |
+
|
| 56 |
+
[More Information Needed]
|
| 57 |
+
|
| 58 |
+
### Out-of-Scope Use
|
| 59 |
+
|
| 60 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 61 |
+
|
| 62 |
+
[More Information Needed]
|
| 63 |
+
|
| 64 |
+
## Bias, Risks, and Limitations
|
| 65 |
+
|
| 66 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 67 |
+
|
| 68 |
+
[More Information Needed]
|
| 69 |
+
|
| 70 |
+
### Recommendations
|
| 71 |
+
|
| 72 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 73 |
+
|
| 74 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 75 |
+
|
| 76 |
+
## How to Get Started with the Model
|
| 77 |
+
|
| 78 |
+
Use the code below to get started with the model.
|
| 79 |
+
|
| 80 |
+
[More Information Needed]
|
| 81 |
+
|
| 82 |
+
## Training Details
|
| 83 |
+
|
| 84 |
+
### Training Data
|
| 85 |
+
|
| 86 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 87 |
+
|
| 88 |
+
[More Information Needed]
|
| 89 |
+
|
| 90 |
+
### Training Procedure
|
| 91 |
+
|
| 92 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 93 |
+
|
| 94 |
+
#### Preprocessing [optional]
|
| 95 |
+
|
| 96 |
+
[More Information Needed]
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
#### Training Hyperparameters
|
| 100 |
+
|
| 101 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 102 |
+
|
| 103 |
+
#### Speeds, Sizes, Times [optional]
|
| 104 |
+
|
| 105 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 106 |
+
|
| 107 |
+
[More Information Needed]
|
| 108 |
+
|
| 109 |
+
## Evaluation
|
| 110 |
+
|
| 111 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 112 |
+
|
| 113 |
+
### Testing Data, Factors & Metrics
|
| 114 |
+
|
| 115 |
+
#### Testing Data
|
| 116 |
+
|
| 117 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 118 |
+
|
| 119 |
+
[More Information Needed]
|
| 120 |
+
|
| 121 |
+
#### Factors
|
| 122 |
+
|
| 123 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 124 |
+
|
| 125 |
+
[More Information Needed]
|
| 126 |
+
|
| 127 |
+
#### Metrics
|
| 128 |
+
|
| 129 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 130 |
+
|
| 131 |
+
[More Information Needed]
|
| 132 |
+
|
| 133 |
+
### Results
|
| 134 |
+
|
| 135 |
+
[More Information Needed]
|
| 136 |
+
|
| 137 |
+
#### Summary
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
## Model Examination [optional]
|
| 142 |
+
|
| 143 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 144 |
+
|
| 145 |
+
[More Information Needed]
|
| 146 |
+
|
| 147 |
+
## Environmental Impact
|
| 148 |
+
|
| 149 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 150 |
+
|
| 151 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 152 |
+
|
| 153 |
+
- **Hardware Type:** [More Information Needed]
|
| 154 |
+
- **Hours used:** [More Information Needed]
|
| 155 |
+
- **Cloud Provider:** [More Information Needed]
|
| 156 |
+
- **Compute Region:** [More Information Needed]
|
| 157 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 158 |
+
|
| 159 |
+
## Technical Specifications [optional]
|
| 160 |
+
|
| 161 |
+
### Model Architecture and Objective
|
| 162 |
+
|
| 163 |
+
[More Information Needed]
|
| 164 |
+
|
| 165 |
+
### Compute Infrastructure
|
| 166 |
+
|
| 167 |
+
[More Information Needed]
|
| 168 |
+
|
| 169 |
+
#### Hardware
|
| 170 |
+
|
| 171 |
+
[More Information Needed]
|
| 172 |
+
|
| 173 |
+
#### Software
|
| 174 |
+
|
| 175 |
+
[More Information Needed]
|
| 176 |
+
|
| 177 |
+
## Citation [optional]
|
| 178 |
+
|
| 179 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 180 |
+
|
| 181 |
+
**BibTeX:**
|
| 182 |
+
|
| 183 |
+
[More Information Needed]
|
| 184 |
+
|
| 185 |
+
**APA:**
|
| 186 |
+
|
| 187 |
+
[More Information Needed]
|
| 188 |
+
|
| 189 |
+
## Glossary [optional]
|
| 190 |
+
|
| 191 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 192 |
+
|
| 193 |
+
[More Information Needed]
|
| 194 |
+
|
| 195 |
+
## More Information [optional]
|
| 196 |
+
|
| 197 |
+
[More Information Needed]
|
| 198 |
+
|
| 199 |
+
## Model Card Authors [optional]
|
| 200 |
+
|
| 201 |
+
[More Information Needed]
|
| 202 |
+
|
| 203 |
+
## Model Card Contact
|
| 204 |
+
|
| 205 |
+
[More Information Needed]
|
| 206 |
+
### Framework versions
|
| 207 |
+
|
| 208 |
+
- PEFT 0.18.1
|
v127rc_exp2/B_mup/checkpoint-12600/adapter_config.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "/workspace/Qwen/Qwen3-8B-Base",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 64,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.03,
|
| 22 |
+
"megatron_config": null,
|
| 23 |
+
"megatron_core": "megatron.core",
|
| 24 |
+
"modules_to_save": null,
|
| 25 |
+
"peft_type": "LORA",
|
| 26 |
+
"peft_version": "0.18.1",
|
| 27 |
+
"qalora_group_size": 16,
|
| 28 |
+
"r": 32,
|
| 29 |
+
"rank_pattern": {},
|
| 30 |
+
"revision": null,
|
| 31 |
+
"target_modules": [
|
| 32 |
+
"q_proj",
|
| 33 |
+
"v_proj",
|
| 34 |
+
"down_proj",
|
| 35 |
+
"up_proj",
|
| 36 |
+
"gate_proj",
|
| 37 |
+
"k_proj",
|
| 38 |
+
"o_proj"
|
| 39 |
+
],
|
| 40 |
+
"target_parameters": null,
|
| 41 |
+
"task_type": "CAUSAL_LM",
|
| 42 |
+
"trainable_token_indices": null,
|
| 43 |
+
"use_dora": false,
|
| 44 |
+
"use_qalora": false,
|
| 45 |
+
"use_rslora": false
|
| 46 |
+
}
|
v127rc_exp2/B_mup/checkpoint-12600/chat_template.jinja
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0].role == 'system' %}
|
| 4 |
+
{{- messages[0].content + '\n\n' }}
|
| 5 |
+
{%- endif %}
|
| 6 |
+
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 7 |
+
{%- for tool in tools %}
|
| 8 |
+
{{- "\n" }}
|
| 9 |
+
{{- tool | tojson }}
|
| 10 |
+
{%- endfor %}
|
| 11 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 12 |
+
{%- else %}
|
| 13 |
+
{%- if messages[0].role == 'system' %}
|
| 14 |
+
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
| 15 |
+
{%- endif %}
|
| 16 |
+
{%- endif %}
|
| 17 |
+
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
| 18 |
+
{%- for message in messages[::-1] %}
|
| 19 |
+
{%- set index = (messages|length - 1) - loop.index0 %}
|
| 20 |
+
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
| 21 |
+
{%- set ns.multi_step_tool = false %}
|
| 22 |
+
{%- set ns.last_query_index = index %}
|
| 23 |
+
{%- endif %}
|
| 24 |
+
{%- endfor %}
|
| 25 |
+
{%- for message in messages %}
|
| 26 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
| 27 |
+
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
| 28 |
+
{%- elif message.role == "assistant" %}
|
| 29 |
+
{%- set content = message.content %}
|
| 30 |
+
{%- set reasoning_content = '' %}
|
| 31 |
+
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
|
| 32 |
+
{%- set reasoning_content = message.reasoning_content %}
|
| 33 |
+
{%- else %}
|
| 34 |
+
{%- if '</think>' in message.content %}
|
| 35 |
+
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
|
| 36 |
+
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
| 37 |
+
{%- endif %}
|
| 38 |
+
{%- endif %}
|
| 39 |
+
{%- if loop.index0 > ns.last_query_index %}
|
| 40 |
+
{%- if loop.last or (not loop.last and reasoning_content) %}
|
| 41 |
+
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
| 42 |
+
{%- else %}
|
| 43 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 44 |
+
{%- endif %}
|
| 45 |
+
{%- else %}
|
| 46 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 47 |
+
{%- endif %}
|
| 48 |
+
{%- if message.tool_calls %}
|
| 49 |
+
{%- for tool_call in message.tool_calls %}
|
| 50 |
+
{%- if (loop.first and content) or (not loop.first) %}
|
| 51 |
+
{{- '\n' }}
|
| 52 |
+
{%- endif %}
|
| 53 |
+
{%- if tool_call.function %}
|
| 54 |
+
{%- set tool_call = tool_call.function %}
|
| 55 |
+
{%- endif %}
|
| 56 |
+
{{- '<tool_call>\n{"name": "' }}
|
| 57 |
+
{{- tool_call.name }}
|
| 58 |
+
{{- '", "arguments": ' }}
|
| 59 |
+
{%- if tool_call.arguments is string %}
|
| 60 |
+
{{- tool_call.arguments }}
|
| 61 |
+
{%- else %}
|
| 62 |
+
{{- tool_call.arguments | tojson }}
|
| 63 |
+
{%- endif %}
|
| 64 |
+
{{- '}\n</tool_call>' }}
|
| 65 |
+
{%- endfor %}
|
| 66 |
+
{%- endif %}
|
| 67 |
+
{{- '<|im_end|>\n' }}
|
| 68 |
+
{%- elif message.role == "tool" %}
|
| 69 |
+
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
| 70 |
+
{{- '<|im_start|>user' }}
|
| 71 |
+
{%- endif %}
|
| 72 |
+
{{- '\n<tool_response>\n' }}
|
| 73 |
+
{{- message.content }}
|
| 74 |
+
{{- '\n</tool_response>' }}
|
| 75 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 76 |
+
{{- '<|im_end|>\n' }}
|
| 77 |
+
{%- endif %}
|
| 78 |
+
{%- endif %}
|
| 79 |
+
{%- endfor %}
|
| 80 |
+
{%- if add_generation_prompt %}
|
| 81 |
+
{{- '<|im_start|>assistant\n' }}
|
| 82 |
+
{%- if enable_thinking is defined and enable_thinking is false %}
|
| 83 |
+
{{- '<think>\n\n</think>\n\n' }}
|
| 84 |
+
{%- endif %}
|
| 85 |
+
{%- endif %}
|
v127rc_exp2/B_mup/checkpoint-12600/tokenizer_config.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": null,
|
| 5 |
+
"clean_up_tokenization_spaces": false,
|
| 6 |
+
"eos_token": "<|im_end|>",
|
| 7 |
+
"errors": "replace",
|
| 8 |
+
"extra_special_tokens": [
|
| 9 |
+
"<think>",
|
| 10 |
+
"</think>"
|
| 11 |
+
],
|
| 12 |
+
"is_local": true,
|
| 13 |
+
"model_max_length": 131072,
|
| 14 |
+
"pad_token": "<|endoftext|>",
|
| 15 |
+
"padding_side": "right",
|
| 16 |
+
"split_special_tokens": false,
|
| 17 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 18 |
+
"unk_token": null
|
| 19 |
+
}
|
v127rc_exp2/B_mup/checkpoint-12600/trainer_state.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
v127rc_exp2/B_mup/checkpoint-12700/README.md
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: /workspace/Qwen/Qwen3-8B-Base
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:/workspace/Qwen/Qwen3-8B-Base
|
| 7 |
+
- llama-factory
|
| 8 |
+
- lora
|
| 9 |
+
- transformers
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Model Card for Model ID
|
| 13 |
+
|
| 14 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
## Model Details
|
| 19 |
+
|
| 20 |
+
### Model Description
|
| 21 |
+
|
| 22 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
- **Developed by:** [More Information Needed]
|
| 27 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 28 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 29 |
+
- **Model type:** [More Information Needed]
|
| 30 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 31 |
+
- **License:** [More Information Needed]
|
| 32 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 33 |
+
|
| 34 |
+
### Model Sources [optional]
|
| 35 |
+
|
| 36 |
+
<!-- Provide the basic links for the model. -->
|
| 37 |
+
|
| 38 |
+
- **Repository:** [More Information Needed]
|
| 39 |
+
- **Paper [optional]:** [More Information Needed]
|
| 40 |
+
- **Demo [optional]:** [More Information Needed]
|
| 41 |
+
|
| 42 |
+
## Uses
|
| 43 |
+
|
| 44 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 45 |
+
|
| 46 |
+
### Direct Use
|
| 47 |
+
|
| 48 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 49 |
+
|
| 50 |
+
[More Information Needed]
|
| 51 |
+
|
| 52 |
+
### Downstream Use [optional]
|
| 53 |
+
|
| 54 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 55 |
+
|
| 56 |
+
[More Information Needed]
|
| 57 |
+
|
| 58 |
+
### Out-of-Scope Use
|
| 59 |
+
|
| 60 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 61 |
+
|
| 62 |
+
[More Information Needed]
|
| 63 |
+
|
| 64 |
+
## Bias, Risks, and Limitations
|
| 65 |
+
|
| 66 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 67 |
+
|
| 68 |
+
[More Information Needed]
|
| 69 |
+
|
| 70 |
+
### Recommendations
|
| 71 |
+
|
| 72 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 73 |
+
|
| 74 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 75 |
+
|
| 76 |
+
## How to Get Started with the Model
|
| 77 |
+
|
| 78 |
+
Use the code below to get started with the model.
|
| 79 |
+
|
| 80 |
+
[More Information Needed]
|
| 81 |
+
|
| 82 |
+
## Training Details
|
| 83 |
+
|
| 84 |
+
### Training Data
|
| 85 |
+
|
| 86 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 87 |
+
|
| 88 |
+
[More Information Needed]
|
| 89 |
+
|
| 90 |
+
### Training Procedure
|
| 91 |
+
|
| 92 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 93 |
+
|
| 94 |
+
#### Preprocessing [optional]
|
| 95 |
+
|
| 96 |
+
[More Information Needed]
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
#### Training Hyperparameters
|
| 100 |
+
|
| 101 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 102 |
+
|
| 103 |
+
#### Speeds, Sizes, Times [optional]
|
| 104 |
+
|
| 105 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 106 |
+
|
| 107 |
+
[More Information Needed]
|
| 108 |
+
|
| 109 |
+
## Evaluation
|
| 110 |
+
|
| 111 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 112 |
+
|
| 113 |
+
### Testing Data, Factors & Metrics
|
| 114 |
+
|
| 115 |
+
#### Testing Data
|
| 116 |
+
|
| 117 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 118 |
+
|
| 119 |
+
[More Information Needed]
|
| 120 |
+
|
| 121 |
+
#### Factors
|
| 122 |
+
|
| 123 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 124 |
+
|
| 125 |
+
[More Information Needed]
|
| 126 |
+
|
| 127 |
+
#### Metrics
|
| 128 |
+
|
| 129 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 130 |
+
|
| 131 |
+
[More Information Needed]
|
| 132 |
+
|
| 133 |
+
### Results
|
| 134 |
+
|
| 135 |
+
[More Information Needed]
|
| 136 |
+
|
| 137 |
+
#### Summary
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
## Model Examination [optional]
|
| 142 |
+
|
| 143 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 144 |
+
|
| 145 |
+
[More Information Needed]
|
| 146 |
+
|
| 147 |
+
## Environmental Impact
|
| 148 |
+
|
| 149 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 150 |
+
|
| 151 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 152 |
+
|
| 153 |
+
- **Hardware Type:** [More Information Needed]
|
| 154 |
+
- **Hours used:** [More Information Needed]
|
| 155 |
+
- **Cloud Provider:** [More Information Needed]
|
| 156 |
+
- **Compute Region:** [More Information Needed]
|
| 157 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 158 |
+
|
| 159 |
+
## Technical Specifications [optional]
|
| 160 |
+
|
| 161 |
+
### Model Architecture and Objective
|
| 162 |
+
|
| 163 |
+
[More Information Needed]
|
| 164 |
+
|
| 165 |
+
### Compute Infrastructure
|
| 166 |
+
|
| 167 |
+
[More Information Needed]
|
| 168 |
+
|
| 169 |
+
#### Hardware
|
| 170 |
+
|
| 171 |
+
[More Information Needed]
|
| 172 |
+
|
| 173 |
+
#### Software
|
| 174 |
+
|
| 175 |
+
[More Information Needed]
|
| 176 |
+
|
| 177 |
+
## Citation [optional]
|
| 178 |
+
|
| 179 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 180 |
+
|
| 181 |
+
**BibTeX:**
|
| 182 |
+
|
| 183 |
+
[More Information Needed]
|
| 184 |
+
|
| 185 |
+
**APA:**
|
| 186 |
+
|
| 187 |
+
[More Information Needed]
|
| 188 |
+
|
| 189 |
+
## Glossary [optional]
|
| 190 |
+
|
| 191 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 192 |
+
|
| 193 |
+
[More Information Needed]
|
| 194 |
+
|
| 195 |
+
## More Information [optional]
|
| 196 |
+
|
| 197 |
+
[More Information Needed]
|
| 198 |
+
|
| 199 |
+
## Model Card Authors [optional]
|
| 200 |
+
|
| 201 |
+
[More Information Needed]
|
| 202 |
+
|
| 203 |
+
## Model Card Contact
|
| 204 |
+
|
| 205 |
+
[More Information Needed]
|
| 206 |
+
### Framework versions
|
| 207 |
+
|
| 208 |
+
- PEFT 0.18.1
|
v127rc_exp2/B_mup/checkpoint-12700/adapter_config.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "/workspace/Qwen/Qwen3-8B-Base",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 64,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.03,
|
| 22 |
+
"megatron_config": null,
|
| 23 |
+
"megatron_core": "megatron.core",
|
| 24 |
+
"modules_to_save": null,
|
| 25 |
+
"peft_type": "LORA",
|
| 26 |
+
"peft_version": "0.18.1",
|
| 27 |
+
"qalora_group_size": 16,
|
| 28 |
+
"r": 32,
|
| 29 |
+
"rank_pattern": {},
|
| 30 |
+
"revision": null,
|
| 31 |
+
"target_modules": [
|
| 32 |
+
"q_proj",
|
| 33 |
+
"v_proj",
|
| 34 |
+
"down_proj",
|
| 35 |
+
"up_proj",
|
| 36 |
+
"gate_proj",
|
| 37 |
+
"k_proj",
|
| 38 |
+
"o_proj"
|
| 39 |
+
],
|
| 40 |
+
"target_parameters": null,
|
| 41 |
+
"task_type": "CAUSAL_LM",
|
| 42 |
+
"trainable_token_indices": null,
|
| 43 |
+
"use_dora": false,
|
| 44 |
+
"use_qalora": false,
|
| 45 |
+
"use_rslora": false
|
| 46 |
+
}
|
v127rc_exp2/B_mup/checkpoint-12700/chat_template.jinja
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0].role == 'system' %}
|
| 4 |
+
{{- messages[0].content + '\n\n' }}
|
| 5 |
+
{%- endif %}
|
| 6 |
+
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 7 |
+
{%- for tool in tools %}
|
| 8 |
+
{{- "\n" }}
|
| 9 |
+
{{- tool | tojson }}
|
| 10 |
+
{%- endfor %}
|
| 11 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 12 |
+
{%- else %}
|
| 13 |
+
{%- if messages[0].role == 'system' %}
|
| 14 |
+
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
| 15 |
+
{%- endif %}
|
| 16 |
+
{%- endif %}
|
| 17 |
+
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
| 18 |
+
{%- for message in messages[::-1] %}
|
| 19 |
+
{%- set index = (messages|length - 1) - loop.index0 %}
|
| 20 |
+
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
| 21 |
+
{%- set ns.multi_step_tool = false %}
|
| 22 |
+
{%- set ns.last_query_index = index %}
|
| 23 |
+
{%- endif %}
|
| 24 |
+
{%- endfor %}
|
| 25 |
+
{%- for message in messages %}
|
| 26 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
| 27 |
+
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
| 28 |
+
{%- elif message.role == "assistant" %}
|
| 29 |
+
{%- set content = message.content %}
|
| 30 |
+
{%- set reasoning_content = '' %}
|
| 31 |
+
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
|
| 32 |
+
{%- set reasoning_content = message.reasoning_content %}
|
| 33 |
+
{%- else %}
|
| 34 |
+
{%- if '</think>' in message.content %}
|
| 35 |
+
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
|
| 36 |
+
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
| 37 |
+
{%- endif %}
|
| 38 |
+
{%- endif %}
|
| 39 |
+
{%- if loop.index0 > ns.last_query_index %}
|
| 40 |
+
{%- if loop.last or (not loop.last and reasoning_content) %}
|
| 41 |
+
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
| 42 |
+
{%- else %}
|
| 43 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 44 |
+
{%- endif %}
|
| 45 |
+
{%- else %}
|
| 46 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 47 |
+
{%- endif %}
|
| 48 |
+
{%- if message.tool_calls %}
|
| 49 |
+
{%- for tool_call in message.tool_calls %}
|
| 50 |
+
{%- if (loop.first and content) or (not loop.first) %}
|
| 51 |
+
{{- '\n' }}
|
| 52 |
+
{%- endif %}
|
| 53 |
+
{%- if tool_call.function %}
|
| 54 |
+
{%- set tool_call = tool_call.function %}
|
| 55 |
+
{%- endif %}
|
| 56 |
+
{{- '<tool_call>\n{"name": "' }}
|
| 57 |
+
{{- tool_call.name }}
|
| 58 |
+
{{- '", "arguments": ' }}
|
| 59 |
+
{%- if tool_call.arguments is string %}
|
| 60 |
+
{{- tool_call.arguments }}
|
| 61 |
+
{%- else %}
|
| 62 |
+
{{- tool_call.arguments | tojson }}
|
| 63 |
+
{%- endif %}
|
| 64 |
+
{{- '}\n</tool_call>' }}
|
| 65 |
+
{%- endfor %}
|
| 66 |
+
{%- endif %}
|
| 67 |
+
{{- '<|im_end|>\n' }}
|
| 68 |
+
{%- elif message.role == "tool" %}
|
| 69 |
+
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
| 70 |
+
{{- '<|im_start|>user' }}
|
| 71 |
+
{%- endif %}
|
| 72 |
+
{{- '\n<tool_response>\n' }}
|
| 73 |
+
{{- message.content }}
|
| 74 |
+
{{- '\n</tool_response>' }}
|
| 75 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 76 |
+
{{- '<|im_end|>\n' }}
|
| 77 |
+
{%- endif %}
|
| 78 |
+
{%- endif %}
|
| 79 |
+
{%- endfor %}
|
| 80 |
+
{%- if add_generation_prompt %}
|
| 81 |
+
{{- '<|im_start|>assistant\n' }}
|
| 82 |
+
{%- if enable_thinking is defined and enable_thinking is false %}
|
| 83 |
+
{{- '<think>\n\n</think>\n\n' }}
|
| 84 |
+
{%- endif %}
|
| 85 |
+
{%- endif %}
|
v127rc_exp2/B_mup/checkpoint-12700/tokenizer_config.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": null,
|
| 5 |
+
"clean_up_tokenization_spaces": false,
|
| 6 |
+
"eos_token": "<|im_end|>",
|
| 7 |
+
"errors": "replace",
|
| 8 |
+
"extra_special_tokens": [
|
| 9 |
+
"<think>",
|
| 10 |
+
"</think>"
|
| 11 |
+
],
|
| 12 |
+
"is_local": true,
|
| 13 |
+
"model_max_length": 131072,
|
| 14 |
+
"pad_token": "<|endoftext|>",
|
| 15 |
+
"padding_side": "right",
|
| 16 |
+
"split_special_tokens": false,
|
| 17 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 18 |
+
"unk_token": null
|
| 19 |
+
}
|
v127rc_exp2/B_mup/checkpoint-12700/trainer_state.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
v127rc_exp2/B_mup/checkpoint-12800/README.md
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: /workspace/Qwen/Qwen3-8B-Base
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:/workspace/Qwen/Qwen3-8B-Base
|
| 7 |
+
- llama-factory
|
| 8 |
+
- lora
|
| 9 |
+
- transformers
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Model Card for Model ID
|
| 13 |
+
|
| 14 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
## Model Details
|
| 19 |
+
|
| 20 |
+
### Model Description
|
| 21 |
+
|
| 22 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
- **Developed by:** [More Information Needed]
|
| 27 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 28 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 29 |
+
- **Model type:** [More Information Needed]
|
| 30 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 31 |
+
- **License:** [More Information Needed]
|
| 32 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 33 |
+
|
| 34 |
+
### Model Sources [optional]
|
| 35 |
+
|
| 36 |
+
<!-- Provide the basic links for the model. -->
|
| 37 |
+
|
| 38 |
+
- **Repository:** [More Information Needed]
|
| 39 |
+
- **Paper [optional]:** [More Information Needed]
|
| 40 |
+
- **Demo [optional]:** [More Information Needed]
|
| 41 |
+
|
| 42 |
+
## Uses
|
| 43 |
+
|
| 44 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 45 |
+
|
| 46 |
+
### Direct Use
|
| 47 |
+
|
| 48 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 49 |
+
|
| 50 |
+
[More Information Needed]
|
| 51 |
+
|
| 52 |
+
### Downstream Use [optional]
|
| 53 |
+
|
| 54 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 55 |
+
|
| 56 |
+
[More Information Needed]
|
| 57 |
+
|
| 58 |
+
### Out-of-Scope Use
|
| 59 |
+
|
| 60 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 61 |
+
|
| 62 |
+
[More Information Needed]
|
| 63 |
+
|
| 64 |
+
## Bias, Risks, and Limitations
|
| 65 |
+
|
| 66 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 67 |
+
|
| 68 |
+
[More Information Needed]
|
| 69 |
+
|
| 70 |
+
### Recommendations
|
| 71 |
+
|
| 72 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 73 |
+
|
| 74 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 75 |
+
|
| 76 |
+
## How to Get Started with the Model
|
| 77 |
+
|
| 78 |
+
Use the code below to get started with the model.
|
| 79 |
+
|
| 80 |
+
[More Information Needed]
|
| 81 |
+
|
| 82 |
+
## Training Details
|
| 83 |
+
|
| 84 |
+
### Training Data
|
| 85 |
+
|
| 86 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 87 |
+
|
| 88 |
+
[More Information Needed]
|
| 89 |
+
|
| 90 |
+
### Training Procedure
|
| 91 |
+
|
| 92 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 93 |
+
|
| 94 |
+
#### Preprocessing [optional]
|
| 95 |
+
|
| 96 |
+
[More Information Needed]
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
#### Training Hyperparameters
|
| 100 |
+
|
| 101 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 102 |
+
|
| 103 |
+
#### Speeds, Sizes, Times [optional]
|
| 104 |
+
|
| 105 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 106 |
+
|
| 107 |
+
[More Information Needed]
|
| 108 |
+
|
| 109 |
+
## Evaluation
|
| 110 |
+
|
| 111 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 112 |
+
|
| 113 |
+
### Testing Data, Factors & Metrics
|
| 114 |
+
|
| 115 |
+
#### Testing Data
|
| 116 |
+
|
| 117 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 118 |
+
|
| 119 |
+
[More Information Needed]
|
| 120 |
+
|
| 121 |
+
#### Factors
|
| 122 |
+
|
| 123 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 124 |
+
|
| 125 |
+
[More Information Needed]
|
| 126 |
+
|
| 127 |
+
#### Metrics
|
| 128 |
+
|
| 129 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 130 |
+
|
| 131 |
+
[More Information Needed]
|
| 132 |
+
|
| 133 |
+
### Results
|
| 134 |
+
|
| 135 |
+
[More Information Needed]
|
| 136 |
+
|
| 137 |
+
#### Summary
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
## Model Examination [optional]
|
| 142 |
+
|
| 143 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 144 |
+
|
| 145 |
+
[More Information Needed]
|
| 146 |
+
|
| 147 |
+
## Environmental Impact
|
| 148 |
+
|
| 149 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 150 |
+
|
| 151 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 152 |
+
|
| 153 |
+
- **Hardware Type:** [More Information Needed]
|
| 154 |
+
- **Hours used:** [More Information Needed]
|
| 155 |
+
- **Cloud Provider:** [More Information Needed]
|
| 156 |
+
- **Compute Region:** [More Information Needed]
|
| 157 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 158 |
+
|
| 159 |
+
## Technical Specifications [optional]
|
| 160 |
+
|
| 161 |
+
### Model Architecture and Objective
|
| 162 |
+
|
| 163 |
+
[More Information Needed]
|
| 164 |
+
|
| 165 |
+
### Compute Infrastructure
|
| 166 |
+
|
| 167 |
+
[More Information Needed]
|
| 168 |
+
|
| 169 |
+
#### Hardware
|
| 170 |
+
|
| 171 |
+
[More Information Needed]
|
| 172 |
+
|
| 173 |
+
#### Software
|
| 174 |
+
|
| 175 |
+
[More Information Needed]
|
| 176 |
+
|
| 177 |
+
## Citation [optional]
|
| 178 |
+
|
| 179 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 180 |
+
|
| 181 |
+
**BibTeX:**
|
| 182 |
+
|
| 183 |
+
[More Information Needed]
|
| 184 |
+
|
| 185 |
+
**APA:**
|
| 186 |
+
|
| 187 |
+
[More Information Needed]
|
| 188 |
+
|
| 189 |
+
## Glossary [optional]
|
| 190 |
+
|
| 191 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 192 |
+
|
| 193 |
+
[More Information Needed]
|
| 194 |
+
|
| 195 |
+
## More Information [optional]
|
| 196 |
+
|
| 197 |
+
[More Information Needed]
|
| 198 |
+
|
| 199 |
+
## Model Card Authors [optional]
|
| 200 |
+
|
| 201 |
+
[More Information Needed]
|
| 202 |
+
|
| 203 |
+
## Model Card Contact
|
| 204 |
+
|
| 205 |
+
[More Information Needed]
|
| 206 |
+
### Framework versions
|
| 207 |
+
|
| 208 |
+
- PEFT 0.18.1
|
v127rc_exp2/B_mup/checkpoint-12800/chat_template.jinja
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0].role == 'system' %}
|
| 4 |
+
{{- messages[0].content + '\n\n' }}
|
| 5 |
+
{%- endif %}
|
| 6 |
+
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 7 |
+
{%- for tool in tools %}
|
| 8 |
+
{{- "\n" }}
|
| 9 |
+
{{- tool | tojson }}
|
| 10 |
+
{%- endfor %}
|
| 11 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 12 |
+
{%- else %}
|
| 13 |
+
{%- if messages[0].role == 'system' %}
|
| 14 |
+
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
| 15 |
+
{%- endif %}
|
| 16 |
+
{%- endif %}
|
| 17 |
+
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
| 18 |
+
{%- for message in messages[::-1] %}
|
| 19 |
+
{%- set index = (messages|length - 1) - loop.index0 %}
|
| 20 |
+
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
| 21 |
+
{%- set ns.multi_step_tool = false %}
|
| 22 |
+
{%- set ns.last_query_index = index %}
|
| 23 |
+
{%- endif %}
|
| 24 |
+
{%- endfor %}
|
| 25 |
+
{%- for message in messages %}
|
| 26 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
| 27 |
+
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
| 28 |
+
{%- elif message.role == "assistant" %}
|
| 29 |
+
{%- set content = message.content %}
|
| 30 |
+
{%- set reasoning_content = '' %}
|
| 31 |
+
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
|
| 32 |
+
{%- set reasoning_content = message.reasoning_content %}
|
| 33 |
+
{%- else %}
|
| 34 |
+
{%- if '</think>' in message.content %}
|
| 35 |
+
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
|
| 36 |
+
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
| 37 |
+
{%- endif %}
|
| 38 |
+
{%- endif %}
|
| 39 |
+
{%- if loop.index0 > ns.last_query_index %}
|
| 40 |
+
{%- if loop.last or (not loop.last and reasoning_content) %}
|
| 41 |
+
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
| 42 |
+
{%- else %}
|
| 43 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 44 |
+
{%- endif %}
|
| 45 |
+
{%- else %}
|
| 46 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 47 |
+
{%- endif %}
|
| 48 |
+
{%- if message.tool_calls %}
|
| 49 |
+
{%- for tool_call in message.tool_calls %}
|
| 50 |
+
{%- if (loop.first and content) or (not loop.first) %}
|
| 51 |
+
{{- '\n' }}
|
| 52 |
+
{%- endif %}
|
| 53 |
+
{%- if tool_call.function %}
|
| 54 |
+
{%- set tool_call = tool_call.function %}
|
| 55 |
+
{%- endif %}
|
| 56 |
+
{{- '<tool_call>\n{"name": "' }}
|
| 57 |
+
{{- tool_call.name }}
|
| 58 |
+
{{- '", "arguments": ' }}
|
| 59 |
+
{%- if tool_call.arguments is string %}
|
| 60 |
+
{{- tool_call.arguments }}
|
| 61 |
+
{%- else %}
|
| 62 |
+
{{- tool_call.arguments | tojson }}
|
| 63 |
+
{%- endif %}
|
| 64 |
+
{{- '}\n</tool_call>' }}
|
| 65 |
+
{%- endfor %}
|
| 66 |
+
{%- endif %}
|
| 67 |
+
{{- '<|im_end|>\n' }}
|
| 68 |
+
{%- elif message.role == "tool" %}
|
| 69 |
+
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
| 70 |
+
{{- '<|im_start|>user' }}
|
| 71 |
+
{%- endif %}
|
| 72 |
+
{{- '\n<tool_response>\n' }}
|
| 73 |
+
{{- message.content }}
|
| 74 |
+
{{- '\n</tool_response>' }}
|
| 75 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 76 |
+
{{- '<|im_end|>\n' }}
|
| 77 |
+
{%- endif %}
|
| 78 |
+
{%- endif %}
|
| 79 |
+
{%- endfor %}
|
| 80 |
+
{%- if add_generation_prompt %}
|
| 81 |
+
{{- '<|im_start|>assistant\n' }}
|
| 82 |
+
{%- if enable_thinking is defined and enable_thinking is false %}
|
| 83 |
+
{{- '<think>\n\n</think>\n\n' }}
|
| 84 |
+
{%- endif %}
|
| 85 |
+
{%- endif %}
|
v127rc_exp2/B_mup/checkpoint-12800/trainer_state.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
v127rc_exp2/B_mup/checkpoint-12900/adapter_config.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "/workspace/Qwen/Qwen3-8B-Base",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 64,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.03,
|
| 22 |
+
"megatron_config": null,
|
| 23 |
+
"megatron_core": "megatron.core",
|
| 24 |
+
"modules_to_save": null,
|
| 25 |
+
"peft_type": "LORA",
|
| 26 |
+
"peft_version": "0.18.1",
|
| 27 |
+
"qalora_group_size": 16,
|
| 28 |
+
"r": 32,
|
| 29 |
+
"rank_pattern": {},
|
| 30 |
+
"revision": null,
|
| 31 |
+
"target_modules": [
|
| 32 |
+
"q_proj",
|
| 33 |
+
"v_proj",
|
| 34 |
+
"down_proj",
|
| 35 |
+
"up_proj",
|
| 36 |
+
"gate_proj",
|
| 37 |
+
"k_proj",
|
| 38 |
+
"o_proj"
|
| 39 |
+
],
|
| 40 |
+
"target_parameters": null,
|
| 41 |
+
"task_type": "CAUSAL_LM",
|
| 42 |
+
"trainable_token_indices": null,
|
| 43 |
+
"use_dora": false,
|
| 44 |
+
"use_qalora": false,
|
| 45 |
+
"use_rslora": false
|
| 46 |
+
}
|
v127rc_exp2/B_mup/checkpoint-13100/README.md
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: /workspace/Qwen/Qwen3-8B-Base
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:/workspace/Qwen/Qwen3-8B-Base
|
| 7 |
+
- llama-factory
|
| 8 |
+
- lora
|
| 9 |
+
- transformers
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Model Card for Model ID
|
| 13 |
+
|
| 14 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
## Model Details
|
| 19 |
+
|
| 20 |
+
### Model Description
|
| 21 |
+
|
| 22 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
- **Developed by:** [More Information Needed]
|
| 27 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 28 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 29 |
+
- **Model type:** [More Information Needed]
|
| 30 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 31 |
+
- **License:** [More Information Needed]
|
| 32 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 33 |
+
|
| 34 |
+
### Model Sources [optional]
|
| 35 |
+
|
| 36 |
+
<!-- Provide the basic links for the model. -->
|
| 37 |
+
|
| 38 |
+
- **Repository:** [More Information Needed]
|
| 39 |
+
- **Paper [optional]:** [More Information Needed]
|
| 40 |
+
- **Demo [optional]:** [More Information Needed]
|
| 41 |
+
|
| 42 |
+
## Uses
|
| 43 |
+
|
| 44 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 45 |
+
|
| 46 |
+
### Direct Use
|
| 47 |
+
|
| 48 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 49 |
+
|
| 50 |
+
[More Information Needed]
|
| 51 |
+
|
| 52 |
+
### Downstream Use [optional]
|
| 53 |
+
|
| 54 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 55 |
+
|
| 56 |
+
[More Information Needed]
|
| 57 |
+
|
| 58 |
+
### Out-of-Scope Use
|
| 59 |
+
|
| 60 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 61 |
+
|
| 62 |
+
[More Information Needed]
|
| 63 |
+
|
| 64 |
+
## Bias, Risks, and Limitations
|
| 65 |
+
|
| 66 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 67 |
+
|
| 68 |
+
[More Information Needed]
|
| 69 |
+
|
| 70 |
+
### Recommendations
|
| 71 |
+
|
| 72 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 73 |
+
|
| 74 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 75 |
+
|
| 76 |
+
## How to Get Started with the Model
|
| 77 |
+
|
| 78 |
+
Use the code below to get started with the model.
|
| 79 |
+
|
| 80 |
+
[More Information Needed]
|
| 81 |
+
|
| 82 |
+
## Training Details
|
| 83 |
+
|
| 84 |
+
### Training Data
|
| 85 |
+
|
| 86 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 87 |
+
|
| 88 |
+
[More Information Needed]
|
| 89 |
+
|
| 90 |
+
### Training Procedure
|
| 91 |
+
|
| 92 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 93 |
+
|
| 94 |
+
#### Preprocessing [optional]
|
| 95 |
+
|
| 96 |
+
[More Information Needed]
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
#### Training Hyperparameters
|
| 100 |
+
|
| 101 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 102 |
+
|
| 103 |
+
#### Speeds, Sizes, Times [optional]
|
| 104 |
+
|
| 105 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 106 |
+
|
| 107 |
+
[More Information Needed]
|
| 108 |
+
|
| 109 |
+
## Evaluation
|
| 110 |
+
|
| 111 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 112 |
+
|
| 113 |
+
### Testing Data, Factors & Metrics
|
| 114 |
+
|
| 115 |
+
#### Testing Data
|
| 116 |
+
|
| 117 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 118 |
+
|
| 119 |
+
[More Information Needed]
|
| 120 |
+
|
| 121 |
+
#### Factors
|
| 122 |
+
|
| 123 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 124 |
+
|
| 125 |
+
[More Information Needed]
|
| 126 |
+
|
| 127 |
+
#### Metrics
|
| 128 |
+
|
| 129 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 130 |
+
|
| 131 |
+
[More Information Needed]
|
| 132 |
+
|
| 133 |
+
### Results
|
| 134 |
+
|
| 135 |
+
[More Information Needed]
|
| 136 |
+
|
| 137 |
+
#### Summary
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
## Model Examination [optional]
|
| 142 |
+
|
| 143 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 144 |
+
|
| 145 |
+
[More Information Needed]
|
| 146 |
+
|
| 147 |
+
## Environmental Impact
|
| 148 |
+
|
| 149 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 150 |
+
|
| 151 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 152 |
+
|
| 153 |
+
- **Hardware Type:** [More Information Needed]
|
| 154 |
+
- **Hours used:** [More Information Needed]
|
| 155 |
+
- **Cloud Provider:** [More Information Needed]
|
| 156 |
+
- **Compute Region:** [More Information Needed]
|
| 157 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 158 |
+
|
| 159 |
+
## Technical Specifications [optional]
|
| 160 |
+
|
| 161 |
+
### Model Architecture and Objective
|
| 162 |
+
|
| 163 |
+
[More Information Needed]
|
| 164 |
+
|
| 165 |
+
### Compute Infrastructure
|
| 166 |
+
|
| 167 |
+
[More Information Needed]
|
| 168 |
+
|
| 169 |
+
#### Hardware
|
| 170 |
+
|
| 171 |
+
[More Information Needed]
|
| 172 |
+
|
| 173 |
+
#### Software
|
| 174 |
+
|
| 175 |
+
[More Information Needed]
|
| 176 |
+
|
| 177 |
+
## Citation [optional]
|
| 178 |
+
|
| 179 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 180 |
+
|
| 181 |
+
**BibTeX:**
|
| 182 |
+
|
| 183 |
+
[More Information Needed]
|
| 184 |
+
|
| 185 |
+
**APA:**
|
| 186 |
+
|
| 187 |
+
[More Information Needed]
|
| 188 |
+
|
| 189 |
+
## Glossary [optional]
|
| 190 |
+
|
| 191 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 192 |
+
|
| 193 |
+
[More Information Needed]
|
| 194 |
+
|
| 195 |
+
## More Information [optional]
|
| 196 |
+
|
| 197 |
+
[More Information Needed]
|
| 198 |
+
|
| 199 |
+
## Model Card Authors [optional]
|
| 200 |
+
|
| 201 |
+
[More Information Needed]
|
| 202 |
+
|
| 203 |
+
## Model Card Contact
|
| 204 |
+
|
| 205 |
+
[More Information Needed]
|
| 206 |
+
### Framework versions
|
| 207 |
+
|
| 208 |
+
- PEFT 0.18.1
|
v127rc_exp2/B_mup/checkpoint-13100/adapter_config.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "/workspace/Qwen/Qwen3-8B-Base",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 64,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.03,
|
| 22 |
+
"megatron_config": null,
|
| 23 |
+
"megatron_core": "megatron.core",
|
| 24 |
+
"modules_to_save": null,
|
| 25 |
+
"peft_type": "LORA",
|
| 26 |
+
"peft_version": "0.18.1",
|
| 27 |
+
"qalora_group_size": 16,
|
| 28 |
+
"r": 32,
|
| 29 |
+
"rank_pattern": {},
|
| 30 |
+
"revision": null,
|
| 31 |
+
"target_modules": [
|
| 32 |
+
"q_proj",
|
| 33 |
+
"v_proj",
|
| 34 |
+
"down_proj",
|
| 35 |
+
"up_proj",
|
| 36 |
+
"gate_proj",
|
| 37 |
+
"k_proj",
|
| 38 |
+
"o_proj"
|
| 39 |
+
],
|
| 40 |
+
"target_parameters": null,
|
| 41 |
+
"task_type": "CAUSAL_LM",
|
| 42 |
+
"trainable_token_indices": null,
|
| 43 |
+
"use_dora": false,
|
| 44 |
+
"use_qalora": false,
|
| 45 |
+
"use_rslora": false
|
| 46 |
+
}
|